From 46606cb2d6089dc473025d681a45757343539c6b Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 26 Sep 2022 23:18:48 +0200 Subject: [PATCH 001/833] sparse: Add a guard for netinet/ip6.h header on FreeBSD. Same as arpa/inet.h, the netinet/ip6.h on FreeBSD requires netinet/in.h to be included first. So, adding a similar guard. Also fixing one instance where this is not respected at the moment. We do have FreeBSD CI these days, but it is still nice to have a more clear error message. Fixes: b2befd5bb2db ("sparse: Add guards to prevent FreeBSD-incompatible #include order.") Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- include/sparse/netinet/ip6.h | 4 ++++ lib/netdev-offload-dpdk.c | 1 + 2 files changed, 5 insertions(+) diff --git a/include/sparse/netinet/ip6.h b/include/sparse/netinet/ip6.h index bfa637a4604..b2b6f47d9e2 100644 --- a/include/sparse/netinet/ip6.h +++ b/include/sparse/netinet/ip6.h @@ -18,6 +18,10 @@ #error "Use this header only with sparse. It is not a correct implementation." #endif +#ifndef NETINET_IN_H_INCLUDED +#error "Must include <netinet/in.h> before <netinet/ip6.h> for FreeBSD support" +#endif + #ifndef __NETINET_IP6_SPARSE #define __NETINET_IP6_SPARSE 1 diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c index cceefbc5075..80a64a6cc06 100644 --- a/lib/netdev-offload-dpdk.c +++ b/lib/netdev-offload-dpdk.c @@ -17,6 +17,7 @@ #include #include +#include <netinet/in.h> #include #include #include From b8932f5b339c731ed8962316330fa770ec1b4f5b Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Tue, 27 Sep 2022 12:04:53 -0400 Subject: [PATCH 002/833] vconn: Allow ECONNREFUSED in refuse connection test. The "tcp vconn - refuse connection" test may fail due to a Connection Refused error. The network stack returns ECONNREFUSED on a reset connection in SYN_SENT state and EPIPE or ECONNRESET in all other cases.
2022-09-19T17:45:48Z|00001|socket_util|INFO|0:127.0.0.1: listening on port 34189 2022-09-19T17:45:48Z|00002|poll_loop|DBG|wakeup due to [POLLOUT][POLLERR][POLLHUP] on fd 4 (127.0.0.1:47140<->) at ../lib/stream-fd.c:153 test-vconn: unexpected vconn_connect() return value 111 (Connection refused) ../../tests/vconn.at:21: exit code was 1, expected 0 530. vconn.at:21: 530. tcp vconn - refuse connection (vconn.at:21): FAILED (vconn.at:21) This was observed from a CI system, and isn't a common case. Acked-by: Eelco Chaudron Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- tests/test-vconn.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test-vconn.c b/tests/test-vconn.c index fc8ce4a2c0e..96c89bd4e68 100644 --- a/tests/test-vconn.c +++ b/tests/test-vconn.c @@ -157,6 +157,7 @@ test_refuse_connection(struct ovs_cmdl_context *ctx) error = vconn_connect_block(vconn, (TIMEOUT - 2) * 1000); if (!strcmp(type, "tcp")) { if (error != ECONNRESET && error != EPIPE && error != ETIMEDOUT + && error != ECONNREFUSED #ifdef _WIN32 && error != WSAECONNRESET #endif From 691c5a5defc4f67b0932c71d80a517c46c711859 Mon Sep 17 00:00:00 2001 From: Fengqi Li Date: Fri, 30 Sep 2022 09:09:28 +0800 Subject: [PATCH 003/833] daemon-unix: Fix file descriptor leak when monitor restarts child. When a segmentation fault occurs in ovn-northd, the monitor will try to restart the ovn-northd daemon process every 10s. Assume the following scenario: There is a segmentation fault and the ovn-northd daemon process does not restart properly every time. New fds are created each time the ovn-northd daemon process is restarted by the monitor process, but the old fd (fd[0]) owned by the monitor process was not closed properly. One pipe fd leaks on each restart of the ovn-northd daemon process. After a long time, file descriptors are exhausted.
Fixes: e2ed6fbeb18c ("fatal-signal: Catch SIGSEGV and print backtrace.") Signed-off-by: Fengqi Li Signed-off-by: Ilya Maximets --- lib/daemon-unix.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/daemon-unix.c b/lib/daemon-unix.c index 52f3d4bc635..1a7ba427d7a 100644 --- a/lib/daemon-unix.c +++ b/lib/daemon-unix.c @@ -396,6 +396,8 @@ monitor_daemon(pid_t daemon_pid) } log_received_backtrace(daemonize_fd); + close(daemonize_fd); + daemonize_fd = -1; /* Throttle restarts to no more than once every 10 seconds. */ if (time(NULL) < last_restart + 10) { From 6c47354069ef26a4e89fd3832e148ae86a57d44d Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 6 Oct 2022 22:06:18 +0200 Subject: [PATCH 004/833] AUTHORS: Add Fengqi Li. Signed-off-by: Ilya Maximets --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index f4184be8fc4..c13cf60c5e8 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -162,6 +162,7 @@ Ethan J. Jackson ejj@eecs.berkeley.edu Ethan Rahn erahn@arista.com Eziz Durdyyev ezizdurdy@gmail.com Fabrizio D'Angelo fdangelo@redhat.com +Fengqi Li lifengqi@inspur.com Flavio Fernandes flavio@flaviof.com Flavio Leitner fbl@redhat.com Francesco Fusco ffusco@redhat.com From 1a9482d53347de04be5ef1ac557cc0e33b5be1fb Mon Sep 17 00:00:00 2001 From: Timothy Redaelli Date: Thu, 22 Sep 2022 15:40:32 +0200 Subject: [PATCH 005/833] dhparams: Fix .c file generation with OpenSSL >= 3.0. Since OpenSSL upstream commit 1696b8909bbe ("Remove -C from dhparam,dsaparam,ecparam") "openssl dhparam" doesn't support -C anymore. This commit changes generate-dhparams-c to generate dhparams.c by parsing "openssl dhparam -in "$1" -text -noout" output directly. The generated file won't be used on OpenSSL >= 3.0, but it's still needed to be generated if OVS is built on OpenSSL < 3.0. 
Signed-off-by: Timothy Redaelli Signed-off-by: Ilya Maximets --- build-aux/generate-dhparams-c | 79 +++++++++++++++++++++++++++++++---- 1 file changed, 71 insertions(+), 8 deletions(-) diff --git a/build-aux/generate-dhparams-c b/build-aux/generate-dhparams-c index 1884c99e1f0..a80db6207c4 100755 --- a/build-aux/generate-dhparams-c +++ b/build-aux/generate-dhparams-c @@ -1,5 +1,74 @@ #! /bin/sh -e +dhparam_to_c() { + local bits + local get_p=0 + local line + local nl=" +" + local p + local i=0 + while read -r line; do + case "$line" in + *"DH Parameters: "*) + bits=${line#*DH Parameters: (} + bits=${bits% bit)} + continue + ;; + "P:"|"prime:") + get_p=1 + continue + ;; + "G: "*|"generator: "*) + g=${line#*(} + g=${g%)} + g=$(printf "0x%.2X" "$g") + continue + ;; + esac + if [ "$get_p" = 1 ]; then + IFS=":" + for x in $line; do + [ -z "$p" ] && [ "$x" = "00" ] && continue + [ $i -ge 10 ] && i=0 + [ $i -eq 0 ] && p="$p$nl " + x=0x$x + p=$(printf "%s 0x%.2X," "$p" "$x") + i=$((i + 1)) + done + unset IFS + fi + done < Date: Thu, 22 Sep 2022 15:40:33 +0200 Subject: [PATCH 006/833] Add support for OpenSSL 3.0 functions. In OpenSSL 3.0 some functions were deprecated and replaced. This commit adds some #ifdef to build without warning on both OpenSSL 1.x and OpenSSL 3.x. For OpenSSL 3.x, the default built-in DH parameters are used (as suggested by SSL_CTX_set_dh_auto manpage). 
Signed-off-by: Timothy Redaelli Signed-off-by: Ilya Maximets --- build-aux/generate-dhparams-c | 2 ++ lib/dhparams.c | 2 ++ lib/stream-ssl.c | 12 ++++++++++++ 3 files changed, 16 insertions(+) diff --git a/build-aux/generate-dhparams-c b/build-aux/generate-dhparams-c index a80db6207c4..aca1dbca910 100755 --- a/build-aux/generate-dhparams-c +++ b/build-aux/generate-dhparams-c @@ -78,6 +78,7 @@ cat <<'EOF' #include "lib/dhparams.h" #include "openvswitch/util.h" +#if OPENSSL_VERSION_NUMBER < 0x3000000fL static int my_DH_set0_pqg(DH *dh, BIGNUM *p, const BIGNUM **q OVS_UNUSED, BIGNUM *g) { @@ -93,3 +94,4 @@ my_DH_set0_pqg(DH *dh, BIGNUM *p, const BIGNUM **q OVS_UNUSED, BIGNUM *g) EOF dhparam_to_c lib/dh2048.pem dhparam_to_c lib/dh4096.pem +echo "#endif" diff --git a/lib/dhparams.c b/lib/dhparams.c index 85123863fc5..50209d5d813 100644 --- a/lib/dhparams.c +++ b/lib/dhparams.c @@ -6,6 +6,7 @@ #include "lib/dhparams.h" #include "openvswitch/util.h" +#if OPENSSL_VERSION_NUMBER < 0x3000000fL static int my_DH_set0_pqg(DH *dh, BIGNUM *p, const BIGNUM **q OVS_UNUSED, BIGNUM *g) { @@ -142,3 +143,4 @@ DH *get_dh4096(void) } return dh; } +#endif diff --git a/lib/stream-ssl.c b/lib/stream-ssl.c index f4fe3432e77..62da9febb66 100644 --- a/lib/stream-ssl.c +++ b/lib/stream-ssl.c @@ -193,7 +193,9 @@ static void ssl_clear_txbuf(struct ssl_stream *); static void interpret_queued_ssl_error(const char *function); static int interpret_ssl_error(const char *function, int ret, int error, int *want); +#if OPENSSL_VERSION_NUMBER < 0x3000000fL static DH *tmp_dh_callback(SSL *ssl, int is_export OVS_UNUSED, int keylength); +#endif static void log_ca_cert(const char *file_name, X509 *cert); static void stream_ssl_set_ca_cert_file__(const char *file_name, bool bootstrap, bool force); @@ -471,7 +473,11 @@ static char * get_peer_common_name(const struct ssl_stream *sslv) { char *peer_name = NULL; +#if OPENSSL_VERSION_NUMBER < 0x3000000fL X509 *peer_cert = SSL_get_peer_certificate(sslv->ssl); +#else 
+ X509 *peer_cert = SSL_get1_peer_certificate(sslv->ssl); +#endif if (!peer_cert) { return NULL; } @@ -1070,7 +1076,11 @@ do_ssl_init(void) return ENOPROTOOPT; } SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3); +#if OPENSSL_VERSION_NUMBER < 0x3000000fL SSL_CTX_set_tmp_dh_callback(ctx, tmp_dh_callback); +#else + SSL_CTX_set_dh_auto(ctx, 1); +#endif SSL_CTX_set_mode(ctx, SSL_MODE_ENABLE_PARTIAL_WRITE); SSL_CTX_set_mode(ctx, SSL_MODE_ACCEPT_MOVING_WRITE_BUFFER); SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER | SSL_VERIFY_FAIL_IF_NO_PEER_CERT, @@ -1081,6 +1091,7 @@ do_ssl_init(void) return 0; } +#if OPENSSL_VERSION_NUMBER < 0x3000000fL static DH * tmp_dh_callback(SSL *ssl OVS_UNUSED, int is_export OVS_UNUSED, int keylength) { @@ -1112,6 +1123,7 @@ tmp_dh_callback(SSL *ssl OVS_UNUSED, int is_export OVS_UNUSED, int keylength) keylength); return NULL; } +#endif /* Returns true if SSL is at least partially configured. */ bool From 0b21e234312ee25d52051375f2ca386212d4e609 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 1 Jul 2022 13:11:16 +0200 Subject: [PATCH 007/833] json: Fix deep copy of objects and arrays. When reference counting for json objects was introduced the old json_clone() function became json_deep_clone(), but it still calls shallow json_clone() while cloning objects and arrays not really producing a deep copy. Fixing that by making other functions to perform a deep copy as well. There are no users for this functionality inside OVS right now, but OVS exports this functionality externally. 'ovstest test-json' extended to test both versions of a clone on provided inputs. 
Fixes: 9854d473adea ("json: Use reference counting in JSON objects") Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- lib/json.c | 16 +++--- tests/test-json.c | 124 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 128 insertions(+), 12 deletions(-) diff --git a/lib/json.c b/lib/json.c index 3267a619633..aded8bb0159 100644 --- a/lib/json.c +++ b/lib/json.c @@ -420,8 +420,8 @@ json_destroy_array(struct json_array *array) free(array->elems); } -static struct json *json_clone_object(const struct shash *object); -static struct json *json_clone_array(const struct json_array *array); +static struct json *json_deep_clone_object(const struct shash *object); +static struct json *json_deep_clone_array(const struct json_array *array); /* Returns a deep copy of 'json'. */ struct json * @@ -429,10 +429,10 @@ json_deep_clone(const struct json *json) { switch (json->type) { case JSON_OBJECT: - return json_clone_object(json->object); + return json_deep_clone_object(json->object); case JSON_ARRAY: - return json_clone_array(&json->array); + return json_deep_clone_array(&json->array); case JSON_STRING: return json_string_create(json->string); @@ -464,7 +464,7 @@ json_nullable_clone(const struct json *json) } static struct json * -json_clone_object(const struct shash *object) +json_deep_clone_object(const struct shash *object) { struct shash_node *node; struct json *json; @@ -472,20 +472,20 @@ json_clone_object(const struct shash *object) json = json_object_create(); SHASH_FOR_EACH (node, object) { struct json *value = node->data; - json_object_put(json, node->name, json_clone(value)); + json_object_put(json, node->name, json_deep_clone(value)); } return json; } static struct json * -json_clone_array(const struct json_array *array) +json_deep_clone_array(const struct json_array *array) { struct json **elems; size_t i; elems = xmalloc(array->n * sizeof *elems); for (i = 0; i < array->n; i++) { - elems[i] = json_clone(array->elems[i]); + elems[i] = 
json_deep_clone(array->elems[i]); } return json_array_create(elems, array->n); } diff --git a/tests/test-json.c b/tests/test-json.c index a2f4332e77b..6cf5eb75def 100644 --- a/tests/test-json.c +++ b/tests/test-json.c @@ -34,8 +34,123 @@ static int pretty = 0; * instead of exactly one object or array. */ static int multiple = 0; +static void test_json_equal(const struct json *a, const struct json *b, + bool allow_the_same); + +static void +test_json_equal_object(const struct shash *a, const struct shash *b, + bool allow_the_same) +{ + struct shash_node *a_node; + + ovs_assert(allow_the_same || a != b); + + if (a == b) { + return; + } + + ovs_assert(shash_count(a) == shash_count(b)); + + SHASH_FOR_EACH (a_node, a) { + struct shash_node *b_node = shash_find(b, a_node->name); + + ovs_assert(b_node); + test_json_equal(a_node->data, b_node->data, allow_the_same); + } +} + +static void +test_json_equal_array(const struct json_array *a, const struct json_array *b, + bool allow_the_same) +{ + ovs_assert(allow_the_same || a != b); + + if (a == b) { + return; + } + + ovs_assert(a->n == b->n); + + for (size_t i = 0; i < a->n; i++) { + test_json_equal(a->elems[i], b->elems[i], allow_the_same); + } +} + +static void +test_json_equal(const struct json *a, const struct json *b, + bool allow_the_same) +{ + ovs_assert(allow_the_same || a != b); + ovs_assert(a && b); + + if (a == b) { + ovs_assert(a->count > 1); + return; + } + + ovs_assert(a->type == b->type); + + switch (a->type) { + case JSON_OBJECT: + test_json_equal_object(a->object, b->object, allow_the_same); + return; + + case JSON_ARRAY: + test_json_equal_array(&a->array, &b->array, allow_the_same); + return; + + case JSON_STRING: + case JSON_SERIALIZED_OBJECT: + ovs_assert(a->string != b->string); + ovs_assert(!strcmp(a->string, b->string)); + return; + + case JSON_NULL: + case JSON_FALSE: + case JSON_TRUE: + return; + + case JSON_INTEGER: + ovs_assert(a->integer == b->integer); + return; + + case JSON_REAL: + 
ovs_assert(a->real == b->real); + return; + + case JSON_N_TYPES: + default: + OVS_NOT_REACHED(); + } +} + +static void +test_json_clone(struct json *json) +{ + struct json *copy, *deep_copy; + + copy = json_clone(json); + + ovs_assert(json_equal(json, copy)); + test_json_equal(json, copy, true); + ovs_assert(json->count == 2); + + json_destroy(copy); + ovs_assert(json->count == 1); + + deep_copy = json_deep_clone(json); + + ovs_assert(json_equal(json, deep_copy)); + test_json_equal(json, deep_copy, false); + ovs_assert(json->count == 1); + ovs_assert(deep_copy->count == 1); + + json_destroy(deep_copy); + ovs_assert(json->count == 1); +} + static bool -print_and_free_json(struct json *json) +print_test_and_free_json(struct json *json) { bool ok; if (json->type == JSON_STRING) { @@ -47,6 +162,7 @@ print_and_free_json(struct json *json) free(s); ok = true; } + test_json_clone(json); json_destroy(json); return ok; } @@ -89,7 +205,7 @@ parse_multiple(FILE *stream) used += json_parser_feed(parser, &buffer[used], n - used); if (used < n) { - if (!print_and_free_json(json_parser_finish(parser))) { + if (!print_test_and_free_json(json_parser_finish(parser))) { ok = false; } parser = NULL; @@ -97,7 +213,7 @@ parse_multiple(FILE *stream) } } if (parser) { - if (!print_and_free_json(json_parser_finish(parser))) { + if (!print_test_and_free_json(json_parser_finish(parser))) { ok = false; } } @@ -150,7 +266,7 @@ test_json_main(int argc, char *argv[]) if (multiple) { ok = parse_multiple(stream); } else { - ok = print_and_free_json(json_from_stream(stream)); + ok = print_test_and_free_json(json_from_stream(stream)); } fclose(stream); From 96b26dce1da18f00dcad2e14bc058158fffa313f Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 1 Sep 2022 17:42:49 +0200 Subject: [PATCH 008/833] ofproto-dpif-upcall: Print more data on unassociated datapath ports. 
When OVS fails to find an OpenFlow port for a packet received from the upcall it just prints the warning like this: |INFO|received packet on unassociated datapath port N However, during the flow translation more information is available as if the recirculation id wasn't found or it was a packet from unknown tunnel port. Printing that information might be useful to understand the origin of the problem. Port translation functions already support extended error strings, we just need to pass a variable where to store them. With the change the output may be: |INFO|received packet on unassociated datapath port N (no OpenFlow port for datapath port N) or |INFO|received packet on unassociated datapath port N (no OpenFlow tunnel port for this packet) or |INFO|received packet on unassociated datapath port N (no recirculation data for recirc_id M) Unfortunately, there is no good way to trigger this code from current unit tests. Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif-upcall.c | 27 +++++++++++++++++++-------- ofproto/ofproto-dpif-xlate.c | 6 ++++-- ofproto/ofproto-dpif-xlate.h | 2 +- 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c index 7ad728adffd..ad96354966f 100644 --- a/ofproto/ofproto-dpif-upcall.c +++ b/ofproto/ofproto-dpif-upcall.c @@ -402,7 +402,8 @@ static int upcall_receive(struct upcall *, const struct dpif_backer *, const struct dp_packet *packet, enum dpif_upcall_type, const struct nlattr *userdata, const struct flow *, const unsigned int mru, - const ovs_u128 *ufid, const unsigned pmd_id); + const ovs_u128 *ufid, const unsigned pmd_id, + char **errorp); static void upcall_uninit(struct upcall *); static void udpif_flow_rebalance(struct udpif *udpif); @@ -827,6 +828,7 @@ recv_upcalls(struct handler *handler) struct upcall *upcall = &upcalls[n_upcalls]; struct flow *flow = &flows[n_upcalls]; unsigned int mru = 0; + char *errorp = NULL; uint64_t hash = 
0; int error; @@ -853,7 +855,7 @@ recv_upcalls(struct handler *handler) error = upcall_receive(upcall, udpif->backer, &dupcall->packet, dupcall->type, dupcall->userdata, flow, mru, - &dupcall->ufid, PMD_ID_NULL); + &dupcall->ufid, PMD_ID_NULL, &errorp); if (error) { if (error == ENODEV) { /* Received packet on datapath port for which we couldn't @@ -864,8 +866,11 @@ recv_upcalls(struct handler *handler) dupcall->key_len, NULL, 0, NULL, 0, &dupcall->ufid, PMD_ID_NULL, NULL); VLOG_INFO_RL(&rl, "received packet on unassociated datapath " - "port %"PRIu32, flow->in_port.odp_port); + "port %"PRIu32"%s%s%s", flow->in_port.odp_port, + errorp ? " (" : "", errorp ? errorp : "", + errorp ? ")" : ""); } + free(errorp); goto free_dupcall; } @@ -1151,7 +1156,8 @@ upcall_receive(struct upcall *upcall, const struct dpif_backer *backer, const struct dp_packet *packet, enum dpif_upcall_type type, const struct nlattr *userdata, const struct flow *flow, const unsigned int mru, - const ovs_u128 *ufid, const unsigned pmd_id) + const ovs_u128 *ufid, const unsigned pmd_id, + char **errorp) { int error; @@ -1160,7 +1166,8 @@ upcall_receive(struct upcall *upcall, const struct dpif_backer *backer, return EAGAIN; } else if (upcall->type == MISS_UPCALL) { error = xlate_lookup(backer, flow, &upcall->ofproto, &upcall->ipfix, - &upcall->sflow, NULL, &upcall->ofp_in_port); + &upcall->sflow, NULL, &upcall->ofp_in_port, + errorp); if (error) { return error; } @@ -1168,7 +1175,11 @@ upcall_receive(struct upcall *upcall, const struct dpif_backer *backer, struct ofproto_dpif *ofproto = ofproto_dpif_lookup_by_uuid(&upcall->cookie.ofproto_uuid); if (!ofproto) { - VLOG_INFO_RL(&rl, "upcall could not find ofproto"); + if (errorp) { + *errorp = xstrdup("upcall could not find ofproto"); + } else { + VLOG_INFO_RL(&rl, "upcall could not find ofproto"); + } return ENODEV; } upcall->ofproto = ofproto; @@ -1358,7 +1369,7 @@ upcall_cb(const struct dp_packet *packet, const struct flow *flow, ovs_u128 *ufi 
atomic_read_relaxed(&enable_megaflows, &megaflow); error = upcall_receive(&upcall, udpif->backer, packet, type, userdata, - flow, 0, ufid, pmd_id); + flow, 0, ufid, pmd_id, NULL); if (error) { return error; } @@ -2154,7 +2165,7 @@ xlate_key(struct udpif *udpif, const struct nlattr *key, unsigned int len, } error = xlate_lookup(udpif->backer, &ctx->flow, &ofproto, NULL, NULL, - ctx->netflow, &ofp_in_port); + ctx->netflow, &ofp_in_port, NULL); if (error) { return error; } diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index ab6f39bb264..3b9b26da171 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -1603,17 +1603,19 @@ xlate_lookup_ofproto(const struct dpif_backer *backer, const struct flow *flow, * be taken. * * Returns 0 if successful, ENODEV if the parsed flow has no associated ofproto. + * Sets an extended error string to 'errorp'. Callers are responsible for + * freeing that string. */ int xlate_lookup(const struct dpif_backer *backer, const struct flow *flow, struct ofproto_dpif **ofprotop, struct dpif_ipfix **ipfix, struct dpif_sflow **sflow, struct netflow **netflow, - ofp_port_t *ofp_in_port) + ofp_port_t *ofp_in_port, char **errorp) { struct ofproto_dpif *ofproto; const struct xport *xport; - ofproto = xlate_lookup_ofproto_(backer, flow, ofp_in_port, &xport, NULL); + ofproto = xlate_lookup_ofproto_(backer, flow, ofp_in_port, &xport, errorp); if (!ofproto) { return ENODEV; diff --git a/ofproto/ofproto-dpif-xlate.h b/ofproto/ofproto-dpif-xlate.h index c1af477c496..05b46fb26b1 100644 --- a/ofproto/ofproto-dpif-xlate.h +++ b/ofproto/ofproto-dpif-xlate.h @@ -209,7 +209,7 @@ struct ofproto_dpif * xlate_lookup_ofproto(const struct dpif_backer *, int xlate_lookup(const struct dpif_backer *, const struct flow *, struct ofproto_dpif **, struct dpif_ipfix **, struct dpif_sflow **, struct netflow **, - ofp_port_t *ofp_in_port); + ofp_port_t *ofp_in_port, char **errorp); const char *xlate_strerror(enum xlate_error 
error); From ccd26e79e5d24dd19e59d53337b51ce167966530 Mon Sep 17 00:00:00 2001 From: Lin Huang Date: Thu, 6 Oct 2022 15:11:08 +0800 Subject: [PATCH 009/833] ovs-tcpdump: Fix bond port unable to capture jumbo frames. Currently the ovs-tcpdump utility creates a tap port to capture the frames of a bond port. A user may want to capture packets from a bond port whose member interfaces have an MTU larger than 1500. By default the utility creates a tap port whose MTU is 1500, regardless of the member interfaces' MTU configuration, so the user can't capture bond port frames that are larger than 1500 bytes. This patch fixes the issue by checking the member interfaces' MTU and setting the maximal MTU value on the tap port. Acked-by: Aaron Conole Signed-off-by: Lin Huang Signed-off-by: Ilya Maximets --- utilities/ovs-tcpdump.in | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/utilities/ovs-tcpdump.in b/utilities/ovs-tcpdump.in index 7fd26e40557..e12bab88956 100755 --- a/utilities/ovs-tcpdump.in +++ b/utilities/ovs-tcpdump.in @@ -225,6 +225,13 @@ class OVSDB(object): def interface_mtu(self, intf_name): try: intf = self._find_row_by_name('Interface', intf_name) + if intf is None: + mtu = 1500 + port = self._find_row_by_name('Port', intf_name) + for intf in port.interfaces: + if mtu < intf.mtu[0]: + mtu = intf.mtu[0] + return mtu return intf.mtu[0] except Exception: return None From dc54104526030123fc8390e6106782c6a3aca2f3 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 10 Oct 2022 15:11:57 +0200 Subject: [PATCH 010/833] ovsdb: Fix race for datum JSON string reference counter. The compaction thread is supposed to not change anything in the database it is working on, since the same data can be accessed by the main thread at the same time. However, while converting database rows to JSON objects, strings in the datum will be cloned using json_clone(), which is a shallow copy, and that will change the reference counter for the JSON string object.
If both the main thread and the compaction thread will clone/destroy the same object at the same time we may end up with a broken reference counter leading to a memory leak or use-after free. Adding a new argument to the database to JSON conversion to prevent use of shallow copies from the compaction thread. This way all the database operations will be truly read-only avoiding the race. 'ovsdb_atom_to_json' and 'ovsdb_datum_to_json' are more widely used, so creating separate variant for these functions instead of adding a new argument, to avoid changing a lot of existing code. Other solution might be to use atomic reference counters, but that will require API/ABI break, because counter is exposed in public headers. Also, we can not easily expose atomic functions, so we'll need to un-inline reference counting with the associated performance cost. Fixes: 3cd2cbd684e0 ("ovsdb: Prepare snapshot JSON in a separate thread.") Reported-at: https://bugzilla.redhat.com/2133431 Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- lib/ovsdb-data.c | 56 ++++++++++++++++++++++++++++++++++++---------- lib/ovsdb-data.h | 2 ++ ovsdb/file.c | 34 ++++++++++++++++++++++------ ovsdb/file.h | 3 ++- ovsdb/ovsdb-tool.c | 5 +++-- ovsdb/ovsdb.c | 7 ++++-- ovsdb/trigger.c | 2 +- 7 files changed, 84 insertions(+), 25 deletions(-) diff --git a/lib/ovsdb-data.c b/lib/ovsdb-data.c index 183e752583a..f18f74298f9 100644 --- a/lib/ovsdb-data.c +++ b/lib/ovsdb-data.c @@ -455,9 +455,15 @@ ovsdb_atom_from_json(union ovsdb_atom *atom, /* Converts 'atom', of the specified 'type', to JSON format, and returns the * JSON. The caller is responsible for freeing the returned JSON. * + * If 'allow_shallow_copies' is false, deep copy of the string JSON object + * will be used. Useful when the same string object is accessed by multiple + * threads as deep copy will not change the reference counter of the original + * JSON string. + * * Refer to RFC 7047 for the format of the JSON that this function produces. 
*/ -struct json * -ovsdb_atom_to_json(const union ovsdb_atom *atom, enum ovsdb_atomic_type type) +static struct json * +ovsdb_atom_to_json__(const union ovsdb_atom *atom, enum ovsdb_atomic_type type, + bool allow_shallow_copies) { switch (type) { case OVSDB_TYPE_VOID: @@ -473,7 +479,8 @@ ovsdb_atom_to_json(const union ovsdb_atom *atom, enum ovsdb_atomic_type type) return json_boolean_create(atom->boolean); case OVSDB_TYPE_STRING: - return json_clone(atom->s); + return allow_shallow_copies ? json_clone(atom->s) + : json_deep_clone(atom->s); case OVSDB_TYPE_UUID: return wrap_json("uuid", json_string_create_nocopy( @@ -485,6 +492,19 @@ ovsdb_atom_to_json(const union ovsdb_atom *atom, enum ovsdb_atomic_type type) } } +struct json * +ovsdb_atom_to_json(const union ovsdb_atom *atom, enum ovsdb_atomic_type type) +{ + return ovsdb_atom_to_json__(atom, type, true); +} + +static struct json * +ovsdb_atom_to_json_deep(const union ovsdb_atom *atom, + enum ovsdb_atomic_type type) +{ + return ovsdb_atom_to_json__(atom, type, false); +} + static char * ovsdb_atom_from_string__(union ovsdb_atom *atom, union ovsdb_atom **range_end_atom, @@ -1409,12 +1429,15 @@ ovsdb_unconstrained_datum_from_json(struct ovsdb_datum *datum, static struct json * ovsdb_base_to_json(const union ovsdb_atom *atom, const struct ovsdb_base_type *base, - bool use_row_names) + bool use_row_names, + bool allow_shallow_copies) { if (!use_row_names || base->type != OVSDB_TYPE_UUID || !base->uuid.refTableName) { - return ovsdb_atom_to_json(atom, base->type); + return allow_shallow_copies + ? 
ovsdb_atom_to_json(atom, base->type) + : ovsdb_atom_to_json_deep(atom, base->type); } else { return json_array_create_2( json_string_create("named-uuid"), @@ -1425,7 +1448,8 @@ ovsdb_base_to_json(const union ovsdb_atom *atom, static struct json * ovsdb_datum_to_json__(const struct ovsdb_datum *datum, const struct ovsdb_type *type, - bool use_row_names) + bool use_row_names, + bool allow_shallow_copies) { if (ovsdb_type_is_map(type)) { struct json **elems; @@ -1435,14 +1459,15 @@ ovsdb_datum_to_json__(const struct ovsdb_datum *datum, for (i = 0; i < datum->n; i++) { elems[i] = json_array_create_2( ovsdb_base_to_json(&datum->keys[i], &type->key, - use_row_names), + use_row_names, allow_shallow_copies), ovsdb_base_to_json(&datum->values[i], &type->value, - use_row_names)); + use_row_names, allow_shallow_copies)); } return wrap_json("map", json_array_create(elems, datum->n)); } else if (datum->n == 1) { - return ovsdb_base_to_json(&datum->keys[0], &type->key, use_row_names); + return ovsdb_base_to_json(&datum->keys[0], &type->key, + use_row_names, allow_shallow_copies); } else { struct json **elems; size_t i; @@ -1450,7 +1475,7 @@ ovsdb_datum_to_json__(const struct ovsdb_datum *datum, elems = xmalloc(datum->n * sizeof *elems); for (i = 0; i < datum->n; i++) { elems[i] = ovsdb_base_to_json(&datum->keys[i], &type->key, - use_row_names); + use_row_names, allow_shallow_copies); } return wrap_json("set", json_array_create(elems, datum->n)); @@ -1467,14 +1492,21 @@ struct json * ovsdb_datum_to_json(const struct ovsdb_datum *datum, const struct ovsdb_type *type) { - return ovsdb_datum_to_json__(datum, type, false); + return ovsdb_datum_to_json__(datum, type, false, true); +} + +struct json * +ovsdb_datum_to_json_deep(const struct ovsdb_datum *datum, + const struct ovsdb_type *type) +{ + return ovsdb_datum_to_json__(datum, type, false, false); } struct json * ovsdb_datum_to_json_with_row_names(const struct ovsdb_datum *datum, const struct ovsdb_type *type) { - return 
ovsdb_datum_to_json__(datum, type, true); + return ovsdb_datum_to_json__(datum, type, true, true); } static const char * diff --git a/lib/ovsdb-data.h b/lib/ovsdb-data.h index dcb62051358..f048a8cb03d 100644 --- a/lib/ovsdb-data.h +++ b/lib/ovsdb-data.h @@ -195,6 +195,8 @@ ovsdb_unconstrained_datum_from_json(struct ovsdb_datum *, OVS_WARN_UNUSED_RESULT; struct json *ovsdb_datum_to_json(const struct ovsdb_datum *, const struct ovsdb_type *); +struct json *ovsdb_datum_to_json_deep(const struct ovsdb_datum *, + const struct ovsdb_type *); char *ovsdb_datum_from_string(struct ovsdb_datum *, const struct ovsdb_type *, const char *, diff --git a/ovsdb/file.c b/ovsdb/file.c index ca80c282356..fdc289ad1b7 100644 --- a/ovsdb/file.c +++ b/ovsdb/file.c @@ -52,7 +52,8 @@ static void ovsdb_file_txn_init(struct ovsdb_file_txn *); static void ovsdb_file_txn_add_row(struct ovsdb_file_txn *, const struct ovsdb_row *old, const struct ovsdb_row *new, - const unsigned long int *changed); + const unsigned long int *changed, + bool allow_shallow_copies); /* If set to 'true', file transactions will contain difference between * datums of old and new rows and not the whole new datum for the column. */ @@ -361,12 +362,19 @@ ovsdb_file_change_cb(const struct ovsdb_row *old, void *ftxn_) { struct ovsdb_file_txn *ftxn = ftxn_; - ovsdb_file_txn_add_row(ftxn, old, new, changed); + ovsdb_file_txn_add_row(ftxn, old, new, changed, true); return true; } +/* Converts the database into transaction JSON representation. + * If 'allow_shallow_copies' is false, makes sure that all the JSON + * objects in the resulted transaction JSON are separately allocated + * objects and not shallow clones of JSON objects already existing + * in the database. Useful when multiple threads are working on the + * same database object. 
*/ struct json * -ovsdb_to_txn_json(const struct ovsdb *db, const char *comment) +ovsdb_to_txn_json(const struct ovsdb *db, const char *comment, + bool allow_shallow_copies) { struct ovsdb_file_txn ftxn; @@ -378,7 +386,8 @@ ovsdb_to_txn_json(const struct ovsdb *db, const char *comment) const struct ovsdb_row *row; HMAP_FOR_EACH (row, hmap_node, &table->rows) { - ovsdb_file_txn_add_row(&ftxn, NULL, row, NULL); + ovsdb_file_txn_add_row(&ftxn, NULL, row, NULL, + allow_shallow_copies); } } @@ -426,7 +435,8 @@ static void ovsdb_file_txn_add_row(struct ovsdb_file_txn *ftxn, const struct ovsdb_row *old, const struct ovsdb_row *new, - const unsigned long int *changed) + const unsigned long int *changed, + bool allow_shallow_copies) { struct json *row; @@ -451,10 +461,20 @@ ovsdb_file_txn_add_row(struct ovsdb_file_txn *ftxn, if (old && use_column_diff) { ovsdb_datum_diff(&datum, &old->fields[idx], &new->fields[idx], type); - column_json = ovsdb_datum_to_json(&datum, type); + if (allow_shallow_copies) { + column_json = ovsdb_datum_to_json(&datum, type); + } else { + column_json = ovsdb_datum_to_json_deep(&datum, type); + } ovsdb_datum_destroy(&datum, type); } else { - column_json = ovsdb_datum_to_json(&new->fields[idx], type); + if (allow_shallow_copies) { + column_json = ovsdb_datum_to_json( + &new->fields[idx], type); + } else { + column_json = ovsdb_datum_to_json_deep( + &new->fields[idx], type); + } } if (!row) { row = json_object_create(); diff --git a/ovsdb/file.h b/ovsdb/file.h index be4f6ad27ca..ae90d4fe130 100644 --- a/ovsdb/file.h +++ b/ovsdb/file.h @@ -25,7 +25,8 @@ struct ovsdb_txn; void ovsdb_file_column_diff_disable(void); -struct json *ovsdb_to_txn_json(const struct ovsdb *, const char *comment); +struct json *ovsdb_to_txn_json(const struct ovsdb *, const char *comment, + bool allow_shallow_copies); struct json *ovsdb_file_txn_to_json(const struct ovsdb_txn *); struct json *ovsdb_file_txn_annotate(struct json *, const char *comment); struct ovsdb_error 
*ovsdb_file_txn_from_json(struct ovsdb *, diff --git a/ovsdb/ovsdb-tool.c b/ovsdb/ovsdb-tool.c index df2e373c3cd..60f353197bf 100644 --- a/ovsdb/ovsdb-tool.c +++ b/ovsdb/ovsdb-tool.c @@ -304,7 +304,7 @@ do_create_cluster(struct ovs_cmdl_context *ctx) struct ovsdb *ovsdb = ovsdb_file_read(src_file_name, false); char *comment = xasprintf("created from %s", src_file_name); - data = ovsdb_to_txn_json(ovsdb, comment); + data = ovsdb_to_txn_json(ovsdb, comment, true); free(comment); schema = ovsdb_schema_clone(ovsdb->schema); ovsdb_destroy(ovsdb); @@ -359,7 +359,8 @@ write_standalone_db(const char *file_name, const char *comment, error = ovsdb_log_write_and_free(log, ovsdb_schema_to_json(db->schema)); if (!error) { - error = ovsdb_log_write_and_free(log, ovsdb_to_txn_json(db, comment)); + error = ovsdb_log_write_and_free(log, + ovsdb_to_txn_json(db, comment, true)); } ovsdb_log_close(log); diff --git a/ovsdb/ovsdb.c b/ovsdb/ovsdb.c index 8cbefbe3d21..1c011fab00d 100644 --- a/ovsdb/ovsdb.c +++ b/ovsdb/ovsdb.c @@ -585,7 +585,9 @@ compaction_thread(void *aux) struct json *data; VLOG_DBG("%s: Compaction thread started.", state->db->name); - data = ovsdb_to_txn_json(state->db, "compacting database online"); + data = ovsdb_to_txn_json(state->db, "compacting database online", + /* Do not allow shallow copies to avoid races. */ + false); state->data = json_serialized_object_create(data); json_destroy(data); @@ -633,7 +635,8 @@ ovsdb_snapshot(struct ovsdb *db, bool trim_memory OVS_UNUSED) if (!applied_index) { /* Parallel compaction is not supported for standalone databases. 
*/ state = xzalloc(sizeof *state); - state->data = ovsdb_to_txn_json(db, "compacting database online"); + state->data = ovsdb_to_txn_json(db, + "compacting database online", true); state->schema = ovsdb_schema_to_json(db->schema); } else if (ovsdb_snapshot_ready(db)) { xpthread_join(db->snap_state->thread, NULL); diff --git a/ovsdb/trigger.c b/ovsdb/trigger.c index 7d3003bca32..01bb80e282b 100644 --- a/ovsdb/trigger.c +++ b/ovsdb/trigger.c @@ -282,7 +282,7 @@ ovsdb_trigger_try(struct ovsdb_trigger *t, long long int now) /* Make the new copy into a transaction log record. */ struct json *txn_json = ovsdb_to_txn_json( - newdb, "converted by ovsdb-server"); + newdb, "converted by ovsdb-server", true); /* Propose the change. */ t->progress = ovsdb_txn_propose_schema_change( From edeefe762331095574be64b238320f4e7cd4f637 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 12 Oct 2022 11:19:41 +0200 Subject: [PATCH 011/833] github: Update versions of action dependencies. checkout@v2, cache@v2 and setup-python@v2 are using outdated Node.js 12 which is now deprecated in GHA [1], so these actions will stop working soon. Updating to most recent major versions with Node.js 16. This stops GHA from throwing warnings in every build. While at it, also updating upload-artifacts to more recent version. 
[1] https://github.blog/changelog/2022-09-22-github-actions-all-actions-will-begin-running-on-node16-instead-of-node12/ Acked-by: David Marchand Signed-off-by: Ilya Maximets --- .github/workflows/build-and-test.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 58ab85e5d7e..7baa914034a 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -96,7 +96,7 @@ jobs: steps: - name: checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: update PATH run: | @@ -104,7 +104,7 @@ jobs: echo "$HOME/.local/bin" >> $GITHUB_PATH - name: set up python - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: '3.9' @@ -120,7 +120,7 @@ jobs: - name: cache if: matrix.dpdk != '' || matrix.dpdk_shared != '' - uses: actions/cache@v2 + uses: actions/cache@v3 env: matrix_key: ${{ matrix.dpdk }}${{ matrix.dpdk_shared }} ci_key: ${{ hashFiles('dpdk-ci-signature') }} @@ -156,7 +156,7 @@ jobs: - name: upload logs on failure if: failure() || cancelled() - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: logs-linux-${{ join(matrix.*, '-') }} path: logs.tgz @@ -175,13 +175,13 @@ jobs: steps: - name: checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: update PATH run: | echo "$HOME/bin" >> $GITHUB_PATH echo "$HOME/.local/bin" >> $GITHUB_PATH - name: set up python - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: '3.9' - name: install dependencies @@ -192,7 +192,7 @@ jobs: run: ./.ci/osx-build.sh - name: upload logs on failure if: failure() - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: logs-osx-clang---disable-ssl path: config.log @@ -217,7 +217,7 @@ jobs: steps: - name: checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: update PATH run: | @@ -239,7 
+239,7 @@ jobs: run: ./.ci/linux-build.sh - name: upload deb packages - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: deb-packages-${{ matrix.dpdk }}-dpdk path: '/home/runner/work/ovs/*.deb' From 6f535383948664794ceccf5471e6d77000478877 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Fri, 7 Jun 2019 16:28:24 -0700 Subject: [PATCH 012/833] ofproto-dpif-xlate: Do not use zero-weight buckets in select groups. The OpenFlow specification says that buckets in select groups with a weight of zero should not be selected, but the ofproto-dpif implementation could select them in corner cases. This fixes the problem. Reported-by: ychen Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2019-May/359349.html Signed-off-by: Ben Pfaff Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif-xlate.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 3b9b26da171..81deb72d91c 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -1924,8 +1924,8 @@ group_is_alive(const struct xlate_ctx *ctx, uint32_t group_id, int depth) #define MAX_LIVENESS_RECURSION 128 /* Arbitrary limit */ static bool -bucket_is_alive(const struct xlate_ctx *ctx, - struct ofputil_bucket *bucket, int depth) +bucket_is_alive(const struct xlate_ctx *ctx, const struct group_dpif *group, + const struct ofputil_bucket *bucket, int depth) { if (depth >= MAX_LIVENESS_RECURSION) { xlate_report_error(ctx, "bucket chaining exceeded %d links", @@ -1933,6 +1933,12 @@ bucket_is_alive(const struct xlate_ctx *ctx, return false; } + /* In "select" groups, buckets with weight 0 are not used. + * In other kinds of groups, weight does not matter. 
*/ + if (group->up.type == OFPGT11_SELECT && bucket->weight == 0) { + return false; + } + return (!ofputil_bucket_has_liveness(bucket) || (bucket->watch_port != OFPP_ANY && bucket->watch_port != OFPP_CONTROLLER @@ -1973,7 +1979,7 @@ group_first_live_bucket(const struct xlate_ctx *ctx, { struct ofputil_bucket *bucket; LIST_FOR_EACH (bucket, list_node, &group->up.buckets) { - if (bucket_is_alive(ctx, bucket, depth)) { + if (bucket_is_alive(ctx, group, bucket, depth)) { return bucket; } xlate_report_bucket_not_live(ctx, bucket); @@ -1992,7 +1998,7 @@ group_best_live_bucket(const struct xlate_ctx *ctx, struct ofputil_bucket *bucket; LIST_FOR_EACH (bucket, list_node, &group->up.buckets) { - if (bucket_is_alive(ctx, bucket, 0)) { + if (bucket_is_alive(ctx, group, bucket, 0)) { uint32_t score = (hash_int(bucket->bucket_id, basis) & 0xffff) * bucket->weight; if (score >= best_score) { @@ -4755,7 +4761,7 @@ pick_dp_hash_select_group(struct xlate_ctx *ctx, struct group_dpif *group) for (int i = 0; i <= hash_mask; i++) { struct ofputil_bucket *b = group->hash_map[(dp_hash + i) & hash_mask]; - if (bucket_is_alive(ctx, b, 0)) { + if (bucket_is_alive(ctx, group, b, 0)) { return b; } } From 31db0e043119cf597d720d94f70ec19cf5b8b7d4 Mon Sep 17 00:00:00 2001 From: Yanqin Wei Date: Mon, 18 Nov 2019 10:45:18 +0800 Subject: [PATCH 013/833] cmap: Add thread fence for slot update. Bucket update in the cmap lib is protected by a counter. But hash setting is possible to be moved before counter update. This patch fix this issue. 
Reviewed-by: Ola Liljedahl Reviewed-by: Gavin Hu Signed-off-by: Yanqin Wei Signed-off-by: Ilya Maximets --- lib/cmap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/cmap.c b/lib/cmap.c index c9eef3f4aea..8ca893b0b25 100644 --- a/lib/cmap.c +++ b/lib/cmap.c @@ -598,7 +598,9 @@ cmap_set_bucket(struct cmap_bucket *b, int i, uint32_t c; atomic_read_explicit(&b->counter, &c, memory_order_acquire); - atomic_store_explicit(&b->counter, c + 1, memory_order_release); + atomic_store_explicit(&b->counter, c + 1, memory_order_relaxed); + /* Need to make sure setting hash is not moved up before counter update. */ + atomic_thread_fence(memory_order_release); ovsrcu_set(&b->nodes[i].next, node); /* Also atomic. */ b->hashes[i] = hash; atomic_store_explicit(&b->counter, c + 2, memory_order_release); From 76ab364ea8facd73366411916d7d0f5ff611daed Mon Sep 17 00:00:00 2001 From: Eli Britstein Date: Wed, 31 Aug 2022 12:59:55 +0300 Subject: [PATCH 014/833] netdev-offload: Set 'miss_api_supported' to be under netdev. Cited commit introduced a flag in dpif-netdev level, to optimize performance and avoid hw_miss_packet_recover() for devices with no such support. However, there is a race condition between traffic processing and assigning a 'flow_api' object to the netdev. In such case, EOPNOTSUPP is returned by netdev_hw_miss_packet_recover() in netdev-offload.c layer because 'flow_api' is not yet initialized. As a result, the flag is falsely disabled, and subsequent packets won't be recovered, though they should. In order to fix it, move the flag to be in netdev-offload layer, to avoid that race. 
Fixes: 6e50c1651869 ("dpif-netdev: Avoid hw_miss_packet_recover() for devices with no support.") Signed-off-by: Eli Britstein Signed-off-by: Ilya Maximets --- lib/dpif-netdev.c | 18 +++++++----------- lib/netdev-offload.c | 28 +++++++++++++++++++++++----- lib/netdev-offload.h | 2 ++ lib/netdev.c | 1 + 4 files changed, 33 insertions(+), 16 deletions(-) diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index a45b460145c..2c08a71c8db 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -431,7 +431,6 @@ struct dp_netdev_rxq { unsigned intrvl_idx; /* Write index for 'cycles_intrvl'. */ struct dp_netdev_pmd_thread *pmd; /* pmd thread that polls this queue. */ bool is_vhost; /* Is rxq of a vhost port. */ - bool hw_miss_api_supported; /* hw_miss_packet_recover() supported.*/ /* Counters of cycles spent successfully polling and processing pkts. */ atomic_ullong cycles[RXQ_N_CYCLES]; @@ -5416,7 +5415,6 @@ port_reconfigure(struct dp_netdev_port *port) port->rxqs[i].port = port; port->rxqs[i].is_vhost = !strncmp(port->type, "dpdkvhost", 9); - port->rxqs[i].hw_miss_api_supported = true; err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i); if (err) { @@ -8034,17 +8032,15 @@ dp_netdev_hw_flow(const struct dp_netdev_pmd_thread *pmd, #ifdef ALLOW_EXPERIMENTAL_API /* Packet restoration API required. */ /* Restore the packet if HW processing was terminated before completion. */ struct dp_netdev_rxq *rxq = pmd->ctx.last_rxq; + bool miss_api_supported; - if (rxq->hw_miss_api_supported) { + atomic_read_relaxed(&rxq->port->netdev->hw_info.miss_api_supported, + &miss_api_supported); + if (miss_api_supported) { int err = netdev_hw_miss_packet_recover(rxq->port->netdev, packet); - if (err) { - if (err != EOPNOTSUPP) { - COVERAGE_INC(datapath_drop_hw_miss_recover); - return -1; - } else { - /* API unsupported by the port; avoid subsequent calls. 
*/ - rxq->hw_miss_api_supported = false; - } + if (err && err != EOPNOTSUPP) { + COVERAGE_INC(datapath_drop_hw_miss_recover); + return -1; } } #endif diff --git a/lib/netdev-offload.c b/lib/netdev-offload.c index 9fde5f7a95f..4592262bd34 100644 --- a/lib/netdev-offload.c +++ b/lib/netdev-offload.c @@ -183,6 +183,7 @@ netdev_assign_flow_api(struct netdev *netdev) CMAP_FOR_EACH (rfa, cmap_node, &netdev_flow_apis) { if (!rfa->flow_api->init_flow_api(netdev)) { ovs_refcount_ref(&rfa->refcnt); + atomic_store_relaxed(&netdev->hw_info.miss_api_supported, true); ovsrcu_set(&netdev->flow_api, rfa->flow_api); VLOG_INFO("%s: Assigned flow API '%s'.", netdev_get_name(netdev), rfa->flow_api->type); @@ -191,6 +192,7 @@ netdev_assign_flow_api(struct netdev *netdev) VLOG_DBG("%s: flow API '%s' is not suitable.", netdev_get_name(netdev), rfa->flow_api->type); } + atomic_store_relaxed(&netdev->hw_info.miss_api_supported, false); VLOG_INFO("%s: No suitable flow API found.", netdev_get_name(netdev)); return -1; @@ -322,12 +324,28 @@ int netdev_hw_miss_packet_recover(struct netdev *netdev, struct dp_packet *packet) { - const struct netdev_flow_api *flow_api = - ovsrcu_get(const struct netdev_flow_api *, &netdev->flow_api); + const struct netdev_flow_api *flow_api; + bool miss_api_supported; + int rv; + + atomic_read_relaxed(&netdev->hw_info.miss_api_supported, + &miss_api_supported); + if (!miss_api_supported) { + return EOPNOTSUPP; + } + + flow_api = ovsrcu_get(const struct netdev_flow_api *, &netdev->flow_api); + if (!flow_api || !flow_api->hw_miss_packet_recover) { + return EOPNOTSUPP; + } + + rv = flow_api->hw_miss_packet_recover(netdev, packet); + if (rv == EOPNOTSUPP) { + /* API unsupported by the port; avoid subsequent calls. */ + atomic_store_relaxed(&netdev->hw_info.miss_api_supported, false); + } - return (flow_api && flow_api->hw_miss_packet_recover) - ? 
flow_api->hw_miss_packet_recover(netdev, packet) - : EOPNOTSUPP; + return rv; } int diff --git a/lib/netdev-offload.h b/lib/netdev-offload.h index 180d3f95f06..edc843cd99a 100644 --- a/lib/netdev-offload.h +++ b/lib/netdev-offload.h @@ -20,6 +20,7 @@ #include "openvswitch/netdev.h" #include "openvswitch/types.h" +#include "ovs-atomic.h" #include "ovs-rcu.h" #include "ovs-thread.h" #include "openvswitch/ofp-meter.h" @@ -46,6 +47,7 @@ struct ovs_action_push_tnl; /* Offload-capable (HW) netdev information */ struct netdev_hw_info { bool oor; /* Out of Offload Resources ? */ + atomic_bool miss_api_supported; /* hw_miss_packet_recover() supported.*/ int offload_count; /* Pending (non-offloaded) flow count */ int pending_count; /* Offloaded flow count */ OVSRCU_TYPE(void *) offload_data; /* Offload metadata. */ diff --git a/lib/netdev.c b/lib/netdev.c index ce0d4117ac0..c797783782f 100644 --- a/lib/netdev.c +++ b/lib/netdev.c @@ -431,6 +431,7 @@ netdev_open(const char *name, const char *type, struct netdev **netdevp) seq_read(netdev->reconfigure_seq); ovsrcu_set(&netdev->flow_api, NULL); netdev->hw_info.oor = false; + atomic_init(&netdev->hw_info.miss_api_supported, false); netdev->node = shash_add(&netdev_shash, name, netdev); /* By default enable one tx and rx queue per netdev. */ From 77f739914d406665dc17733a6cdd4fff9a80f7a3 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Tue, 27 Sep 2022 17:32:55 +0200 Subject: [PATCH 015/833] ofproto-dpif-xlate: Allow sample when no in_port. OVN can (and indeed does) set in_port to OFPP_NONE during the pipeline evaluation. If a sample action follows, it will be incorrectly skipped. Per-flow sampling version of: f0a9000ca ofproto: Fix ipfix not always sampling on egress. 
Signed-off-by: Adrian Moreno Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif-xlate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 81deb72d91c..5d2af93fa26 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -5699,7 +5699,7 @@ xlate_sample_action(struct xlate_ctx *ctx, struct dpif_ipfix *ipfix = ctx->xbridge->ipfix; bool emit_set_tunnel = false; - if (!ipfix || ctx->xin->flow.in_port.ofp_port == OFPP_NONE) { + if (!ipfix) { return; } From f7ae3f93c8511962c0198984004b7c10eb574c9c Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 6 Oct 2022 21:37:24 +0200 Subject: [PATCH 016/833] tests: Fix filtering of whole-second durations. Current macros are unable to filter whole seconds, e.g. 'duration:6s'. This is causing random test failures, most frequently in CirrusCI: ./dpif-netdev.at:370: ovs-ofctl -O OpenFlow13 meter-stats br0 | strip_timers --- - +++ /tmp/cirrus-ci-build/tests/testsuite.dir/at-groups/990/stdout @@ -1,5 +1,5 @@ OFPST_METER reply (OF1.3) (xid=0x2): -meter:1 flow_count:1 packet_in_count:10 byte_in_count:600 duration:0.0s bands: +meter:1 flow_count:1 packet_in_count:10 byte_in_count:600 duration:6s bands: Fix sed matches to correctly handle that scenario. Repeating the [0-9\.] twice because it is hard to write a shorter portable version with sed. Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- tests/dpif-netdev.at | 10 +++++----- tests/stp.at | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/dpif-netdev.at b/tests/dpif-netdev.at index 3179e1645d8..6aff1eda7b0 100644 --- a/tests/dpif-netdev.at +++ b/tests/dpif-netdev.at @@ -6,8 +6,8 @@ m4_divert_push([PREPARE_TESTS]) # that vary from one run to another (e.g., timing and bond actions). 
strip_timers () { sed ' - s/duration:[0-9]*\.[0-9]*/duration:0.0/ - s/used:[0-9]*\.[0-9]*/used:0.0/ + s/duration:[0-9\.][0-9\.]*/duration:0.0/ + s/used:[0-9\.][0-9\.]*/used:0.0/ ' } @@ -15,7 +15,7 @@ strip_xout () { sed ' s/mega_ufid:[-0-9a-f]* // s/ufid:[-0-9a-f]* // - s/used:[0-9]*\.[0-9]*/used:0.0/ + s/used:[0-9\.][0-9\.]*/used:0.0/ s/actions:.*/actions: / s/packets:[0-9]*/packets:0/ s/bytes:[0-9]*/bytes:0/ @@ -26,7 +26,7 @@ strip_xout_keep_actions () { sed ' s/mega_ufid:[-0-9a-f]* // s/ufid:[-0-9a-f]* // - s/used:[0-9]*\.[0-9]*/used:0.0/ + s/used:[0-9\.][0-9\.]*/used:0.0/ s/packets:[0-9]*/packets:0/ s/bytes:[0-9]*/bytes:0/ ' | sort @@ -51,7 +51,7 @@ filter_hw_packet_netdev_dummy () { filter_flow_dump () { grep 'flow_dump ' | sed ' s/.*flow_dump // - s/used:[0-9]*\.[0-9]*/used:0.0/ + s/used:[0-9\.][0-9\.]*/used:0.0/ ' | sort | uniq } diff --git a/tests/stp.at b/tests/stp.at index 7ddacfc3a0e..69475843e55 100644 --- a/tests/stp.at +++ b/tests/stp.at @@ -368,7 +368,7 @@ AT_CLEANUP # Strips out uninteresting parts of flow output, as well as parts # that vary from one run to another (e.g., timing and bond actions). m4_define([STRIP_USED], [[sed ' - s/used:[0-9]*\.[0-9]*/used:0.0/ + s/used:[0-9\.][0-9\.]*/used:0.0/ s/duration=[0-9.]*s*/duration=Xs/ s/idle_age=[0-9]*,/idle_age=X,/ ']]) From 9c27bd230f7f108974157d858e71b3eda2139d08 Mon Sep 17 00:00:00 2001 From: Paolo Valerio Date: Wed, 12 Oct 2022 16:36:13 +0200 Subject: [PATCH 017/833] ct-dpif: Replace ct_dpif_format_flags() with format_flags_masked(). This patch removes ct_dpif_format_flags() in favor of the existing format_flags_masked(). This has the extra bonus of showing keys with empty values as "key=0", instead of showing "key=". 
E.g., the following: NEW tcp,orig=([...]),reply=([...]),id=1800618864, status=CONFIRMED|SRC_NAT_DONE|DST_NAT_DONE,timeout=120, protoinfo=(state_orig=SYN_SENT,state_reply=SYN_SENT,wscale_orig=7, wscale_reply=0,flags_orig=WINDOW_SCALE|SACK_PERM,flags_reply=) becomes: NEW tcp,orig=([...]),reply=([...]),id=1800618864, status=CONFIRMED|SRC_NAT_DONE|DST_NAT_DONE,timeout=120, protoinfo=(state_orig=SYN_SENT,state_reply=SYN_SENT,wscale_orig=7, wscale_reply=0,flags_orig=WINDOW_SCALE|SACK_PERM,flags_reply=0) Signed-off-by: Paolo Valerio Signed-off-by: Ilya Maximets --- lib/ct-dpif.c | 76 ++++++++++++++++++++++++++------------------------- lib/ct-dpif.h | 4 +++ 2 files changed, 43 insertions(+), 37 deletions(-) diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c index cfc2315e3dc..6f17a26b5f4 100644 --- a/lib/ct-dpif.c +++ b/lib/ct-dpif.c @@ -35,20 +35,11 @@ static void ct_dpif_format_counters(struct ds *, const struct ct_dpif_counters *); static void ct_dpif_format_timestamp(struct ds *, const struct ct_dpif_timestamp *); -static void ct_dpif_format_flags(struct ds *, const char *title, - uint32_t flags, const struct flags *); static void ct_dpif_format_protoinfo(struct ds *, const char *title, const struct ct_dpif_protoinfo *, bool verbose); static void ct_dpif_format_helper(struct ds *, const char *title, const struct ct_dpif_helper *); - -static const struct flags ct_dpif_status_flags[] = { -#define CT_DPIF_STATUS_FLAG(FLAG) { CT_DPIF_STATUS_##FLAG, #FLAG }, - CT_DPIF_STATUS_FLAGS -#undef CT_DPIF_STATUS_FLAG - { 0, NULL } /* End marker. 
*/ -}; /* Dumping */ @@ -275,6 +266,20 @@ ct_dpif_entry_uninit(struct ct_dpif_entry *entry) } } +static const char * +ct_dpif_status_flags(uint32_t flags) +{ + switch (flags) { +#define CT_DPIF_STATUS_FLAG(FLAG) \ + case CT_DPIF_STATUS_##FLAG: \ + return #FLAG; + CT_DPIF_STATUS_FLAGS +#undef CT_DPIF_TCP_FLAG + default: + return NULL; + } +} + void ct_dpif_format_entry(const struct ct_dpif_entry *entry, struct ds *ds, bool verbose, bool print_stats) @@ -305,8 +310,9 @@ ct_dpif_format_entry(const struct ct_dpif_entry *entry, struct ds *ds, ds_put_format(ds, ",zone=%"PRIu16, entry->zone); } if (verbose) { - ct_dpif_format_flags(ds, ",status=", entry->status, - ct_dpif_status_flags); + format_flags_masked(ds, ",status", ct_dpif_status_flags, + entry->status, CT_DPIF_STATUS_MASK, + CT_DPIF_STATUS_MASK); } if (print_stats) { ds_put_format(ds, ",timeout=%"PRIu32, entry->timeout); @@ -415,28 +421,6 @@ ct_dpif_format_tuple(struct ds *ds, const struct ct_dpif_tuple *tuple) } } -static void -ct_dpif_format_flags(struct ds *ds, const char *title, uint32_t flags, - const struct flags *table) -{ - if (title) { - ds_put_cstr(ds, title); - } - for (; table->name; table++) { - if (flags & table->flag) { - ds_put_format(ds, "%s|", table->name); - } - } - ds_chomp(ds, '|'); -} - -static const struct flags tcp_flags[] = { -#define CT_DPIF_TCP_FLAG(FLAG) { CT_DPIF_TCPF_##FLAG, #FLAG }, - CT_DPIF_TCP_FLAGS -#undef CT_DPIF_TCP_FLAG - { 0, NULL } /* End marker. 
*/ -}; - const char *ct_dpif_tcp_state_string[] = { #define CT_DPIF_TCP_STATE(STATE) [CT_DPIF_TCPS_##STATE] = #STATE, CT_DPIF_TCP_STATES @@ -498,6 +482,20 @@ ct_dpif_format_protoinfo_tcp(struct ds *ds, ct_dpif_format_enum(ds, "state=", tcp_state, ct_dpif_tcp_state_string); } +static const char * +ct_dpif_tcp_flags(uint32_t flags) +{ + switch (flags) { +#define CT_DPIF_TCP_FLAG(FLAG) \ + case CT_DPIF_TCPF_##FLAG: \ + return #FLAG; + CT_DPIF_TCP_FLAGS +#undef CT_DPIF_TCP_FLAG + default: + return NULL; + } +} + static void ct_dpif_format_protoinfo_tcp_verbose(struct ds *ds, const struct ct_dpif_protoinfo *protoinfo) @@ -512,10 +510,14 @@ ct_dpif_format_protoinfo_tcp_verbose(struct ds *ds, protoinfo->tcp.wscale_orig, protoinfo->tcp.wscale_reply); } - ct_dpif_format_flags(ds, ",flags_orig=", protoinfo->tcp.flags_orig, - tcp_flags); - ct_dpif_format_flags(ds, ",flags_reply=", protoinfo->tcp.flags_reply, - tcp_flags); + + format_flags_masked(ds, ",flags_orig", ct_dpif_tcp_flags, + protoinfo->tcp.flags_orig, CT_DPIF_TCPF_MASK, + CT_DPIF_TCPF_MASK); + + format_flags_masked(ds, ",flags_reply", ct_dpif_tcp_flags, + protoinfo->tcp.flags_reply, CT_DPIF_TCPF_MASK, + CT_DPIF_TCPF_MASK); } static void diff --git a/lib/ct-dpif.h b/lib/ct-dpif.h index b59cba962a7..2848549b0ba 100644 --- a/lib/ct-dpif.h +++ b/lib/ct-dpif.h @@ -103,6 +103,8 @@ enum ct_dpif_tcp_flags { #undef CT_DPIF_TCP_FLAG }; +#define CT_DPIF_TCPF_MASK ((CT_DPIF_TCPF_MAXACK_SET << 1) - 1) + extern const char *ct_dpif_sctp_state_string[]; #define CT_DPIF_SCTP_STATES \ @@ -173,6 +175,8 @@ enum ct_dpif_status_flags { #undef CT_DPIF_STATUS_FLAG }; +#define CT_DPIF_STATUS_MASK ((CT_DPIF_STATUS_UNTRACKED << 1) - 1) + struct ct_dpif_entry { /* Const members. */ struct ct_dpif_tuple tuple_orig; From ba9e387dc4f4acd8dd7ff9188296a4442d16576c Mon Sep 17 00:00:00 2001 From: Wilson Peng Date: Tue, 25 Oct 2022 15:37:48 +0800 Subject: [PATCH 018/833] unaligned: Correct the stats of packet_count and byte_count on Windows. 
The stats(byte_count) is got via function call ofputil_decode_flow_stats_reply() and for OpenFlow15 it will also call oxs_pull_entry__(). Currently we found on Windows the byte_count counter is incorrect. It will get the byte_count on OpenFlow15 handling via ntohll(get_unaligned_be64(payload)) Quote the comments below from Ilya Maximets (thanks for the given solution and explanation): static inline uint64_t get_unaligned_u64__(const uint64_t *p_) ... return ntohll(((uint64_t) p[0] << 56) | ((uint64_t) p[1] << 48) | ((uint64_t) p[2] << 40) | ((uint64_t) p[3] << 32) | (p[4] << 24) | (p[5] << 16) | (p[6] << 8) | p[7]); And indeed the expression above has an issue with data types. The problem is the (p[4] << 24) part. The p[4] itself has a type 'uint8_t' which is unsigned 8bit value. It is not enough to hold the result of a left shift, so compiler automatically promotes it to the 'int' by default. But it is *signed* 32bit value. In your original report p[4] was equal to 0x81. After the left shift it became 0x81000000. Looks correct, but the type is 'int'. The next operation that we do is '|' with the previous shifted bytes that were explicitly converted to uint64_t before the left shift. So we have uint64_t | int. In this case compiler needs to extend the 'int' to 'uint64_t' before performing the operation. And since the 'int' is signed and the sign bit happens to be set in the 0x81000000, the sign extension is performed in order to preserve the value. The result is 0xffffffff81000000. And that is breaking everything else. From the new test below, it is incorrect for the n_bytes counter via OpenFlow15 on CMD: ovs-ofctl dump-flows. With the patch, get_unaligned_u64__() will return correct value to caller on Windows. In the output (Got via original CMD without fix) below n_bytes 2177130813 will be incorrectly changed to 18446744071591715133 when processing OpenFlow15 which is equal to 0xFFFFFFFF81C4613D and here the p[4] on Windows is 0x81.
With the fix, new compiled ovs-ofctl1025.exe could dump the correct n_bytes counter Via OpenFlow15. ovs-ofctl.exe -O OpenFlow15 dump-flows nsx-managed | findstr 1516011 cookie=<>, duration=<>s, table=4, n_packets=1516011, n_bytes=18446744071591715133, cookie=<>, duration=<>s, table=4, n_packets=1516011, n_bytes=18446744071591715133, ovs-ofctl.exe -O OpenFlow10 dump-flows nsx-managed | findstr 1516011 cookie=<>, duration=<>s, table=4, n_packets=1516011, n_bytes=2177130813, cookie=<>, duration=<>s, table=4, n_packets=1516011, n_bytes=2177130813, ovs-ofctl.exe dump-flows nsx-managed | findstr 1516011 cookie=<>, duration=<>s, table=4, n_packets=1516011, n_bytes=2177130813, cookie=<>, duration=<>s, table=4, n_packets=1516011, n_bytes=2177130813, With the fix, new compiled ovs-ofctl1025.exe could dump the correct n_bytes counter Via OpenFlow15. ovs-ofctl1025.exe -O OpenFlow15 dump-flows nsx-managed | findstr 1516011 cookie=<>, duration=<>s, table=4, n_packets=1516011, n_bytes=2177130813, cookie=<>, duration=<>s, table=4, n_packets=1516011, n_bytes=2177130813, Fixes: afa3a93165f1 ("Add header for access to potentially unaligned data.") Signed-off-by: Wilson Peng Signed-off-by: Ilya Maximets --- lib/unaligned.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/lib/unaligned.h b/lib/unaligned.h index f40e4e10df6..15334e3c764 100644 --- a/lib/unaligned.h +++ b/lib/unaligned.h @@ -95,7 +95,7 @@ GCC_UNALIGNED_ACCESSORS(ovs_be64, be64); static inline uint16_t get_unaligned_u16(const uint16_t *p_) { const uint8_t *p = (const uint8_t *) p_; - return ntohs((p[0] << 8) | p[1]); + return ntohs(((uint16_t) p[0] << 8) | (uint16_t) p[1]); } static inline void put_unaligned_u16(uint16_t *p_, uint16_t x_) @@ -110,7 +110,8 @@ static inline void put_unaligned_u16(uint16_t *p_, uint16_t x_) static inline uint32_t get_unaligned_u32(const uint32_t *p_) { const uint8_t *p = (const uint8_t *) p_; - return ntohl((p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]); + 
return ntohl(((uint32_t) p[0] << 24) | ((uint32_t) p[1] << 16) | + ((uint32_t) p[2] << 8) | (uint32_t) p[3]); } static inline void put_unaligned_u32(uint32_t *p_, uint32_t x_) @@ -131,10 +132,10 @@ static inline uint64_t get_unaligned_u64__(const uint64_t *p_) | ((uint64_t) p[1] << 48) | ((uint64_t) p[2] << 40) | ((uint64_t) p[3] << 32) - | (p[4] << 24) - | (p[5] << 16) - | (p[6] << 8) - | p[7]); + | ((uint64_t) p[4] << 24) + | ((uint64_t) p[5] << 16) + | ((uint64_t) p[6] << 8) + | (uint64_t) p[7]); } static inline void put_unaligned_u64__(uint64_t *p_, uint64_t x_) From 850e639021125c3646effa0eae9e422082ade2ca Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 25 Oct 2022 23:57:40 +0200 Subject: [PATCH 019/833] AUTHORS: Add Wilson Peng. Signed-off-by: Ilya Maximets --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index c13cf60c5e8..145387ce94f 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -460,6 +460,7 @@ Wei Yongjun yjwei@cn.fujitsu.com Wenyu Zhang wenyuz@vmware.com William Fulton William Tu u9012063@gmail.com +Wilson Peng pweisong@vmware.com Xavier Simonart xsimonar@redhat.com Xiao Liang shaw.leon@gmail.com xu rong xu.rong@zte.com.cn From 7a5ee32518dfd55dc207abaf92e1ae1c25b857cc Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Sun, 23 Oct 2022 09:27:10 +0300 Subject: [PATCH 020/833] tc: On last action use drop action attribute instead of pipe OVN is setting ct drop rule with a ct clear action. OVS datapath behavior is if there is no forward action the default is drop. TC behavior is to continue with next match. Fix to match tc to ovs behavior by setting last action attribute as drop instead of pipe. Also update lastused when parsing ct action. 
example rule recirc_id(0x1),in_port(2),ct_state(+trk),eth(),eth_type(0x0800),ipv4(frag=no), packets:82, bytes:8036, used:2.108s, actions:ct_clear Reviewed-by: Maor Dickman Signed-off-by: Roi Dayan Signed-off-by: Simon Horman --- lib/tc.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/lib/tc.c b/lib/tc.c index 94044cde606..f8419e637b9 100644 --- a/lib/tc.c +++ b/lib/tc.c @@ -1541,6 +1541,9 @@ static const struct nl_policy ct_policy[] = { .optional = true, }, [TCA_CT_NAT_PORT_MAX] = { .type = NL_A_U16, .optional = true, }, + [TCA_CT_TM] = { .type = NL_A_UNSPEC, + .min_len = sizeof(struct tcf_t), + .optional = true, }, }; static int @@ -1551,6 +1554,7 @@ nl_parse_act_ct(struct nlattr *options, struct tc_flower *flower) struct tc_action *action; const struct tc_ct *ct; uint16_t ct_action = 0; + struct tcf_t tm; if (!nl_parse_nested(options, ct_policy, ct_attrs, ARRAY_SIZE(ct_policy))) { @@ -1636,6 +1640,11 @@ nl_parse_act_ct(struct nlattr *options, struct tc_flower *flower) } action->type = TC_ACT_CT; + if (ct_attrs[TCA_CT_TM]) { + memcpy(&tm, nl_attr_get_unspec(ct_attrs[TCA_CT_TM], sizeof tm), + sizeof tm); + nl_parse_tcf(&tm, flower); + } nl_parse_action_pc(ct->action, action); return 0; } @@ -3126,7 +3135,11 @@ nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower) uint32_t action_pc; /* Programmatic Control */ if (!action->jump_action) { - action_pc = TC_ACT_PIPE; + if (i == flower->action_count - 1) { + action_pc = TC_ACT_SHOT; + } else { + action_pc = TC_ACT_PIPE; + } } else if (action->jump_action == JUMP_ACTION_STOP) { action_pc = TC_ACT_STOLEN; } else { From 743499607bdd0dcb3541a179ba2bb41ea10c4b3b Mon Sep 17 00:00:00 2001 From: Tianyu Yuan Date: Wed, 12 Oct 2022 08:42:28 +0800 Subject: [PATCH 021/833] Revert "tc: Fix stats dump when using same meter table" This reverts commit dd9881ed55e6 ('tc: Fix stats dump when using same meter table') This patch doesn't solve the tc flow stats update issue and will lead 
to failure of system-offloads-traffic testsuite, it only counts packets surviving after the tc filter, rather than hitting the filter A following patch will come up to solve this flow stats update issue Signed-off-by: Tianyu Yuan Acked-by: Ilya Maximets Signed-off-by: Simon Horman --- lib/tc.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/lib/tc.c b/lib/tc.c index f8419e637b9..3b591975b12 100644 --- a/lib/tc.c +++ b/lib/tc.c @@ -1913,8 +1913,6 @@ nl_parse_single_action(struct nlattr *action, struct tc_flower *flower, struct nlattr *act_cookie; const char *act_kind; struct nlattr *action_attrs[ARRAY_SIZE(act_policy)]; - int act_index = flower->action_count; - bool is_meter = false; int err = 0; if (!nl_parse_nested(action, act_policy, action_attrs, @@ -1952,7 +1950,6 @@ nl_parse_single_action(struct nlattr *action, struct tc_flower *flower, nl_parse_act_ct(act_options, flower); } else if (!strcmp(act_kind, "police")) { nl_parse_act_police(act_options, flower); - is_meter = tc_is_meter_index(flower->actions[act_index].police.index); } else { VLOG_ERR_RL(&error_rl, "unknown tc action kind: %s", act_kind); err = EINVAL; @@ -1967,14 +1964,6 @@ nl_parse_single_action(struct nlattr *action, struct tc_flower *flower, flower->act_cookie.len = nl_attr_get_size(act_cookie); } - /* Skip the stats update when act_police is meter since there are always - * some other actions following meter. For other potential kinds of - * act_police actions, whose stats could not be skipped (e.g. filter has - * only one police action), update the action stats to the flow rule. 
*/ - if (is_meter) { - return 0; - } - return nl_parse_action_stats(action_attrs[TCA_ACT_STATS], &flower->stats_sw, &flower->stats_hw, NULL); } From ffcb6f115fe5e00be3ca8fb9a940a3224e687e23 Mon Sep 17 00:00:00 2001 From: Baowen Zheng Date: Fri, 30 Sep 2022 14:07:56 +0800 Subject: [PATCH 022/833] netdev-linux: Allow meter to work in tc software datapath when tc-policy is specified Add tc action flags when adding police action to offload meter table. There is a restriction that the flag of skip_sw/skip_hw should be same for filter rule and the independent created tc actions the rule uses. In this case, if we configure the tc-policy as skip_hw, filter rule will be created with skip_hw flag and the police action according to meter table will have no action flag, then flower rule will fail to add to tc kernel system. To fix this issue, we will add tc action flag when adding police action to offload a meter table, so it will allow meter table to work in tc software datapath. Fixes: 5c039ddc64ff ("netdev-linux: Add functions to manipulate tc police action") Signed-off-by: Baowen Zheng Acked-by: Ilya Maximets Signed-off-by: Simon Horman --- acinclude.m4 | 6 +++--- include/linux/pkt_cls.h | 11 +++++++---- lib/netdev-linux.c | 20 ++++++++++++++------ lib/tc.c | 21 +++++++++++++++++++++ lib/tc.h | 2 ++ 5 files changed, 47 insertions(+), 13 deletions(-) diff --git a/acinclude.m4 b/acinclude.m4 index ad07989ac29..aa9af55062f 100644 --- a/acinclude.m4 +++ b/acinclude.m4 @@ -163,10 +163,10 @@ dnl Configure Linux tc compat. 
AC_DEFUN([OVS_CHECK_LINUX_TC], [ AC_COMPILE_IFELSE([ AC_LANG_PROGRAM([#include ], [ - int x = TCA_POLICE_PKTRATE64; + int x = TCA_ACT_FLAGS_SKIP_HW; ])], - [AC_DEFINE([HAVE_TCA_POLICE_PKTRATE64], [1], - [Define to 1 if TCA_POLICE_PKTRATE64 is available.])]) + [AC_DEFINE([HAVE_TCA_ACT_FLAGS_SKIP_HW], [1], + [Define to 1 if TCA_ACT_FLAGS_SKIP_HW is available.])]) AC_CHECK_MEMBERS([struct tcf_t.firstuse], [], [], [#include ]) diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h index ba82e690eba..a8cd8db5bf8 100644 --- a/include/linux/pkt_cls.h +++ b/include/linux/pkt_cls.h @@ -1,7 +1,7 @@ #ifndef __LINUX_PKT_CLS_WRAPPER_H #define __LINUX_PKT_CLS_WRAPPER_H 1 -#if defined(__KERNEL__) || defined(HAVE_TCA_POLICE_PKTRATE64) +#if defined(__KERNEL__) || defined(HAVE_TCA_ACT_FLAGS_SKIP_HW) #include_next #else @@ -21,9 +21,12 @@ enum { __TCA_ACT_MAX }; -#define TCA_ACT_FLAGS_NO_PERCPU_STATS 1 /* Don't use percpu allocator for - * actions stats. - */ +/* See other TCA_ACT_FLAGS_ * flags in include/net/act_api.h. */ +#define TCA_ACT_FLAGS_NO_PERCPU_STATS (1 << 0) /* Don't use percpu allocator for + * actions stats. + */ +#define TCA_ACT_FLAGS_SKIP_HW (1 << 1) /* don't offload action to HW */ +#define TCA_ACT_FLAGS_SKIP_SW (1 << 2) /* don't use action in SW */ #define TCA_ACT_MAX __TCA_ACT_MAX #define TCA_OLD_COMPAT (TCA_ACT_MAX+1) diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index cdc66246ced..7ea4070c23a 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -2623,10 +2623,17 @@ tc_matchall_fill_police(uint32_t kbits_rate, uint32_t kbits_burst) static void nl_msg_act_police_start_nest(struct ofpbuf *request, uint32_t prio, - size_t *offset, size_t *act_offset) + size_t *offset, size_t *act_offset, + bool single_action) { *act_offset = nl_msg_start_nested(request, prio); nl_msg_put_string(request, TCA_ACT_KIND, "police"); + + /* If police action is added independently from filter, we need to + * add action flag according to tc-policy. 
*/ + if (single_action) { + nl_msg_put_act_tc_policy_flag(request); + } *offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS); } @@ -2642,7 +2649,7 @@ nl_msg_act_police_end_nest(struct ofpbuf *request, size_t offset, static void nl_msg_put_act_police(struct ofpbuf *request, struct tc_police *police, uint64_t pkts_rate, uint64_t pkts_burst, - uint32_t notexceed_act) + uint32_t notexceed_act, bool single_action) { size_t offset, act_offset; uint32_t prio = 0; @@ -2651,7 +2658,8 @@ nl_msg_put_act_police(struct ofpbuf *request, struct tc_police *police, return; } - nl_msg_act_police_start_nest(request, ++prio, &offset, &act_offset); + nl_msg_act_police_start_nest(request, ++prio, &offset, &act_offset, + single_action); if (police->rate.rate) { tc_put_rtab(request, TCA_POLICE_RATE, &police->rate); } @@ -2698,7 +2706,7 @@ tc_add_matchall_policer(struct netdev *netdev, uint32_t kbits_rate, basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS); action_offset = nl_msg_start_nested(&request, TCA_MATCHALL_ACT); nl_msg_put_act_police(&request, &pol_act, kpkts_rate * 1000, - kpkts_burst * 1000, TC_ACT_UNSPEC); + kpkts_burst * 1000, TC_ACT_UNSPEC, false); nl_msg_end_nested(&request, action_offset); nl_msg_end_nested(&request, basic_offset); @@ -5667,7 +5675,7 @@ tc_add_policer(struct netdev *netdev, uint32_t kbits_rate, police_offset = nl_msg_start_nested(&request, TCA_BASIC_ACT); tc_policer_init(&tc_police, kbits_rate, kbits_burst); nl_msg_put_act_police(&request, &tc_police, kpkts_rate * 1000ULL, - kpkts_burst * 1000ULL, TC_ACT_UNSPEC); + kpkts_burst * 1000ULL, TC_ACT_UNSPEC, false); nl_msg_end_nested(&request, police_offset); nl_msg_end_nested(&request, basic_offset); @@ -5702,7 +5710,7 @@ tc_add_policer_action(uint32_t index, uint32_t kbits_rate, offset = nl_msg_start_nested(&request, TCA_ACT_TAB); nl_msg_put_act_police(&request, &tc_police, pkts_rate, pkts_burst, - TC_ACT_PIPE); + TC_ACT_PIPE, true); nl_msg_end_nested(&request, offset); error = tc_transact(&request, 
NULL); diff --git a/lib/tc.c b/lib/tc.c index 3b591975b12..4d7de8adde4 100644 --- a/lib/tc.c +++ b/lib/tc.c @@ -3810,3 +3810,24 @@ tc_set_policy(const char *policy) VLOG_INFO("tc: Using policy '%s'", policy); } + +void +nl_msg_put_act_tc_policy_flag(struct ofpbuf *request) +{ + int flag = 0; + + if (!request) { + return; + } + + if (tc_policy == TC_POLICY_SKIP_HW) { + flag = TCA_ACT_FLAGS_SKIP_HW; + } else if (tc_policy == TC_POLICY_SKIP_SW) { + flag = TCA_ACT_FLAGS_SKIP_SW; + } + + if (flag) { + struct nla_bitfield32 flags = { flag, flag }; + nl_msg_put_unspec(request, TCA_ACT_FLAGS, &flags, sizeof flags); + } +} diff --git a/lib/tc.h b/lib/tc.h index 2e64ad37259..161f438124b 100644 --- a/lib/tc.h +++ b/lib/tc.h @@ -399,4 +399,6 @@ int tc_parse_action_stats(struct nlattr *action, int tc_dump_tc_action_start(char *name, struct nl_dump *dump); int parse_netlink_to_tc_policer(struct ofpbuf *reply, uint32_t police_idx[]); +void nl_msg_put_act_tc_policy_flag(struct ofpbuf *request); + #endif /* tc.h */ From 97873af3734a9300f5eb29f664513edc839cf88a Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Wed, 27 Apr 2022 10:15:25 +0200 Subject: [PATCH 023/833] Documentation: Use new syntax for dpdk port representors. Since DPDK 21.05, the representor identifier now handles a relative VF offset. The legacy representor ID seems only valid in certain cases (first dpdk port). Link: https://github.com/DPDK/dpdk/commit/cebf7f17159a8 Signed-off-by: Robin Jarry Signed-off-by: Ilya Maximets --- Documentation/topics/dpdk/phy.rst | 12 ++++++------ lib/netdev-dpdk.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Documentation/topics/dpdk/phy.rst b/Documentation/topics/dpdk/phy.rst index 937f4c40e5a..8fc34a378cb 100644 --- a/Documentation/topics/dpdk/phy.rst +++ b/Documentation/topics/dpdk/phy.rst @@ -267,7 +267,7 @@ Representors are multi devices created on top of one PF. For more information, refer to the `DPDK documentation`__. 
-__ https://doc.dpdk.org/guides-21.11/prog_guide/switch_representation.html +__ https://doc.dpdk.org/guides-21.11/prog_guide/switch_representation.html#port-representors Prior to port representors there was a one-to-one relationship between the PF and the eth device. With port representors the relationship becomes one PF to @@ -287,18 +287,18 @@ address in devargs. For an existing bridge called ``br0`` and PCI address When configuring a VF-based port, DPDK uses an extended devargs syntax which has the following format:: - BDBF,representor=[] + BDBF,representor= This syntax shows that a representor is an enumerated eth device (with -a representor ID) which uses the PF PCI address. -The following commands add representors 3 and 5 using PCI device address +a representor identifier) which uses the PF PCI address. +The following commands add representors of VF 3 and 5 using PCI device address ``0000:08:00.0``:: $ ovs-vsctl add-port br0 dpdk-rep3 -- set Interface dpdk-rep3 type=dpdk \ - options:dpdk-devargs=0000:08:00.0,representor=[3] + options:dpdk-devargs=0000:08:00.0,representor=vf3 $ ovs-vsctl add-port br0 dpdk-rep5 -- set Interface dpdk-rep5 type=dpdk \ - options:dpdk-devargs=0000:08:00.0,representor=[5] + options:dpdk-devargs=0000:08:00.0,representor=vf5 .. important:: diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 0dd655507b5..d2eeb22ae37 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -1823,7 +1823,7 @@ static dpdk_port_t netdev_dpdk_get_port_by_devargs(const char *devargs) } /* - * Normally, a PCI id (optionally followed by a representor number) + * Normally, a PCI id (optionally followed by a representor identifier) * is enough for identifying a specific DPDK port. * However, for some NICs having multiple ports sharing the same PCI * id, using PCI id won't work then. 
From 2db297ea37f4d6ec4bddb2a2db540339fa5af1df Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 2 Nov 2022 16:47:47 +0100 Subject: [PATCH 024/833] AUTHORS: Add Robin Jarry. Signed-off-by: Ilya Maximets --- AUTHORS.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index 145387ce94f..f62840b1b36 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -365,9 +365,10 @@ Rich Lane rlane@bigswitch.com Richard Oliver richard@richard-oliver.co.uk Rishi Bamba rishi.bamba@tcs.com Rob Adams readams@readams.net -Robert Åkerblom-Andersson Robert.nr1@gmail.com -Robert Wojciechowicz robertx.wojciechowicz@intel.com Rob Hoes rob.hoes@citrix.com +Robert Wojciechowicz robertx.wojciechowicz@intel.com +Robert Åkerblom-Andersson Robert.nr1@gmail.com +Robin Jarry rjarry@redhat.com Rohith Basavaraja rohith.basavaraja@gmail.com Roi Dayan roid@nvidia.com Róbert Mulik robert.mulik@ericsson.com From c98762d91b578b5d8290077af4de0b6e3d95c3ce Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Thu, 1 Sep 2022 12:16:02 +0200 Subject: [PATCH 025/833] netdev-dpdk: Fix tx_dropped counters value. Packets that could not be transmitted because the TXQ are full should be taken into account in the global ovs_tx_failure_drops as it was the case before commit 29b94e12d57d ("netdev-dpdk: Refactor the DPDK transmit path."). netdev_dpdk_eth_tx_burst() returns the number of packets that were *not* transmitted. Add that number to stats.tx_failure_drops and only include the packets that were dropped in previous steps afterwards. 
Fixes: 29b94e12d57d ("netdev-dpdk: Refactor the DPDK transmit path.") Acked-by: Mike Pattrick Signed-off-by: Robin Jarry Signed-off-by: Ilya Maximets --- lib/netdev-dpdk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index d2eeb22ae37..e4b3465e09b 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -2882,9 +2882,9 @@ netdev_dpdk_eth_send(struct netdev *netdev, int qid, cnt = netdev_dpdk_common_send(netdev, batch, &stats); - dropped = batch_cnt - cnt; - - dropped += netdev_dpdk_eth_tx_burst(dev, qid, pkts, cnt); + dropped = netdev_dpdk_eth_tx_burst(dev, qid, pkts, cnt); + stats.tx_failure_drops += dropped; + dropped += batch_cnt - cnt; if (OVS_UNLIKELY(dropped)) { struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats; From eb86c28ddcdb7922974def08749076c8bf2c5635 Mon Sep 17 00:00:00 2001 From: Daniel Ding Date: Tue, 13 Sep 2022 23:36:11 +0800 Subject: [PATCH 026/833] ovs-tcpdump: Cleanup mirror port on SIGHUP/SIGTERM. If ovs-tcpdump received HUP or TERM signal, mirror and mirror interface should be destroyed. This often happens, when controlling terminal is closed, like ssh session closed, and other users use kill to terminate it. 
Acked-by: Mike Pattrick Signed-off-by: Daniel Ding Signed-off-by: Ilya Maximets --- utilities/ovs-tcpdump.in | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/utilities/ovs-tcpdump.in b/utilities/ovs-tcpdump.in index e12bab88956..a49ec9f9426 100755 --- a/utilities/ovs-tcpdump.in +++ b/utilities/ovs-tcpdump.in @@ -44,6 +44,7 @@ try: from ovs import jsonrpc from ovs.poller import Poller from ovs.stream import Stream + from ovs.fatal_signal import add_hook except Exception: print("ERROR: Please install the correct Open vSwitch python support") print(" libraries (version @VERSION@).") @@ -412,6 +413,24 @@ def py_which(executable): for path in os.environ["PATH"].split(os.pathsep)) +def teardown(db_sock, interface, mirror_interface, tap_created): + def cleanup_mirror(): + try: + ovsdb = OVSDB(db_sock) + ovsdb.destroy_mirror(interface, ovsdb.port_bridge(interface)) + ovsdb.destroy_port(mirror_interface, ovsdb.port_bridge(interface)) + if tap_created is True: + _del_taps[sys.platform](mirror_interface) + except Exception: + print("Unable to tear down the ports and mirrors.") + print("Please use ovs-vsctl to remove the ports and mirrors" + " created.") + print(" ex: ovs-vsctl --db=%s del-port %s" % (db_sock, + mirror_interface)) + + add_hook(cleanup_mirror, None, True) + + def main(): rundir = os.environ.get('OVS_RUNDIR', '@RUNDIR@') db_sock = 'unix:%s' % os.path.join(rundir, "db.sock") @@ -496,6 +515,9 @@ def main(): print("ERROR: Mirror port (%s) exists for port %s." % (mirror_interface, interface)) sys.exit(1) + + teardown(db_sock, interface, mirror_interface, tap_created) + try: ovsdb.make_port(mirror_interface, ovsdb.port_bridge(interface)) ovsdb.bridge_mirror(interface, mirror_interface, @@ -503,12 +525,6 @@ def main(): mirror_select_all) except OVSDBException as oe: print("ERROR: Unable to properly setup the mirror: %s." 
% str(oe)) - try: - ovsdb.destroy_port(mirror_interface, ovsdb.port_bridge(interface)) - if tap_created is True: - _del_taps[sys.platform](mirror_interface) - except Exception: - pass sys.exit(1) ovsdb.close_idl() @@ -525,18 +541,6 @@ def main(): if pipes.poll() is None: pipes.terminate() - ovsdb = OVSDB(db_sock) - ovsdb.destroy_mirror(interface, ovsdb.port_bridge(interface)) - ovsdb.destroy_port(mirror_interface, ovsdb.port_bridge(interface)) - if tap_created is True: - _del_taps[sys.platform](mirror_interface) - except Exception: - print("Unable to tear down the ports and mirrors.") - print("Please use ovs-vsctl to remove the ports and mirrors created.") - print(" ex: ovs-vsctl --db=%s del-port %s" % (db_sock, - mirror_interface)) - sys.exit(1) - sys.exit(0) From 46ab9d80c2ab8f13dfe2ba2a9700887cd4f7fc36 Mon Sep 17 00:00:00 2001 From: yangchang Date: Fri, 14 Oct 2022 15:29:36 +0800 Subject: [PATCH 027/833] bond: Fix crash while logging not yet enabled member. The log should be printed with the member name, not the active member name, and the active member does not judge whether it is NULL. 
If null, OVS will crash with the following backtrace: (gdb) bt 0 bond_check_admissibility (ofproto/bond.c:877) 1 is_admissible (ofproto/ofproto-dpif-xlate.c:2574) 2 xlate_normal (ofproto/ofproto-dpif-xlate.c:3027) 3 xlate_output_action (ofproto/ofproto-dpif-xlate.c:5284) 4 do_xlate_actions (ofproto/ofproto-dpif-xlate.c:6960) 5 xlate_actions (ofproto/ofproto-dpif-xlate.c:7924) 6 upcall_xlate (ofproto/ofproto-dpif-upcall.c:1237) 7 process_upcall (ofproto/ofproto-dpif-upcall.c:1456) 8 upcall_cb (ofproto/ofproto-dpif-upcall.c:1358) 9 dp_netdev_upcall (lib/dpif-netdev.c:7793) 10 handle_packet_upcall (lib/dpif-netdev.c:8255) 11 fast_path_processing (lib/dpif-netdev.c:8374) 12 dp_netdev_input__ (lib/dpif-netdev.c:8463) 13 dp_netdev_input (lib/dpif-netdev.c:8501) 14 dp_netdev_process_rxq_port (lib/dpif-netdev.c:5337) 15 pmd_thread_main (lib/dpif-netdev.c:6944) 16 ovsthread_wrapper (lib/ovs-thread.c:422) 17 ?? (/lib64/libpthread.so.0) 18 clone (/lib64/libc.so.6) Fixes: 423416f58749 ("lacp: report desync in ovs threads enabling slave") Signed-off-by: yangchang Signed-off-by: Ilya Maximets --- ofproto/bond.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ofproto/bond.c b/ofproto/bond.c index 47630a6b06a..cfdf44f8542 100644 --- a/ofproto/bond.c +++ b/ofproto/bond.c @@ -897,7 +897,7 @@ bond_check_admissibility(struct bond *bond, const void *member_, if (!member->enabled && member->may_enable) { VLOG_DBG_RL(&rl, "bond %s: member %s: " "main thread has not yet enabled member", - bond->name, bond->active_member->name); + bond->name, member->name); } goto out; case LACP_CONFIGURED: From 2158254fcbd97620151525a8aa91b0a040927690 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 18 Oct 2022 15:27:52 +0200 Subject: [PATCH 028/833] utilities: Add a GDB macro to dump any cmap structure. Add a new GDB macro called ovs_dump_cmap, which can be used to dump any cmap structure. 
Some examples: (gdb) ovs_dump_cmap &subtable->rules (struct cmap *) 0x3e02758 (gdb) ovs_dump_cmap &subtable->rules "struct dpcls_rule" cmap_node (struct dpcls_rule *) 0x3e02758 (gdb) ovs_dump_cmap &subtable->rules "struct dpcls_rule" cmap_node dump (struct dpcls_rule *) 0x3e02758 = {cmap_node = {next = {p = 0x0}}, mask = 0x3dfe100, flow = {hash = ... Signed-off-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- utilities/gdb/ovs_gdb.py | 66 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/utilities/gdb/ovs_gdb.py b/utilities/gdb/ovs_gdb.py index 763ece2a78d..7f63dd0d592 100644 --- a/utilities/gdb/ovs_gdb.py +++ b/utilities/gdb/ovs_gdb.py @@ -849,6 +849,71 @@ def invoke(self, arg, from_tty): member).dereference())) +# +# Implements the GDB "ovs_dump_cmap" command +# +class CmdDumpCmap(gdb.Command): + """Dump all nodes of a given cmap + Usage: + ovs_dump_cmap <struct cmap *> {[<structure>] [<member>] {dump}]} + + For example dump all the rules in a dpcls_subtable: + + (gdb) ovs_dump_cmap &subtable->rules + (struct cmap *) 0x3e02758 + + This is not very useful, so please use this with the container_of mode: + + (gdb) ovs_dump_cmap &subtable->rules "struct dpcls_rule" cmap_node + (struct dpcls_rule *) 0x3e02758 + + Now you can manually use the print command to show the content, or use the + dump option to dump the structure for all nodes: + + (gdb) ovs_dump_cmap &subtable->rules "struct dpcls_rule" cmap_node dump + (struct dpcls_rule *) 0x3e02758 = + {cmap_node = {next = {p = 0x0}}, mask = 0x3dfe100, flow = {hash = ...
+ """ + def __init__(self): + super(CmdDumpCmap, self).__init__("ovs_dump_cmap", + gdb.COMMAND_DATA) + + def invoke(self, arg, from_tty): + arg_list = gdb.string_to_argv(arg) + typeobj = None + member = None + dump = False + + if len(arg_list) != 1 and len(arg_list) != 3 and len(arg_list) != 4: + print("usage: ovs_dump_cmap " + "{[] [] {dump}]}") + return + + cmap = gdb.parse_and_eval(arg_list[0]).cast( + gdb.lookup_type('struct cmap').pointer()) + + if len(arg_list) >= 3: + typeobj = arg_list[1] + member = arg_list[2] + if len(arg_list) == 4 and arg_list[3] == "dump": + dump = True + + for node in ForEachCMAP(cmap.dereference()): + if typeobj is None or member is None: + print("(struct cmap *) {}".format(node)) + else: + print("({} *) {} {}".format( + typeobj, + container_of(node, + gdb.lookup_type(typeobj).pointer(), member), + "=" if dump else "")) + if dump: + print(" {}\n".format(container_of( + node, + gdb.lookup_type(typeobj).pointer(), + member).dereference())) + + # # Implements the GDB "ovs_dump_simap" command # @@ -1449,6 +1514,7 @@ def extract_pkt(self, pkt): CmdDumpOfpacts() CmdDumpOvsList() CmdDumpPackets() +CmdDumpCmap() CmdDumpSimap() CmdDumpSmap() CmdDumpUdpifKeys() From a1de888ab1a4a74dfa6a46b153184fc7dddce6eb Mon Sep 17 00:00:00 2001 From: Han Ding Date: Wed, 19 Oct 2022 23:06:54 +0800 Subject: [PATCH 029/833] ofproto-dpif-xlate: Update tunnel neighbor when receive gratuitous ARP. OVS now just allow the ARP Reply which the destination address is matched against the known xbridge addresses to update tunnel neighbor. So when OVS receive the gratuitous ARP from underlay gateway which the source address and destination address are all gateway IP, tunnel neighbor will not be updated. 
Fixes: ba07cf222a0c ("Handle gratuitous ARP requests and replies in tnl_arp_snoop()") Fixes: 83c2757bd16e ("xlate: Move tnl_neigh_snoop() to terminate_native_tunnel()") Acked-by: Paolo Valerio Signed-off-by: Han Ding Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif-xlate.c | 14 +++++++++++--- tests/tunnel-push-pop.at | 20 ++++++++++++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 5d2af93fa26..a9cf3cbee0b 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -4178,6 +4178,16 @@ xport_has_ip(const struct xport *xport) return n_in6 ? true : false; } +static bool check_neighbor_reply(struct xlate_ctx *ctx, struct flow *flow) +{ + if (flow->dl_type == htons(ETH_TYPE_ARP) || + flow->nw_proto == IPPROTO_ICMPV6) { + return is_neighbor_reply_correct(ctx, flow); + } + + return false; +} + static bool terminate_native_tunnel(struct xlate_ctx *ctx, const struct xport *xport, struct flow *flow, struct flow_wildcards *wc, @@ -4198,9 +4208,7 @@ terminate_native_tunnel(struct xlate_ctx *ctx, const struct xport *xport, /* If no tunnel port was found and it's about an ARP or ICMPv6 packet, * do tunnel neighbor snooping. 
*/ if (*tnl_port == ODPP_NONE && - (flow->dl_type == htons(ETH_TYPE_ARP) || - flow->nw_proto == IPPROTO_ICMPV6) && - is_neighbor_reply_correct(ctx, flow)) { + (check_neighbor_reply(ctx, flow) || is_garp(flow, wc))) { tnl_neigh_snoop(flow, wc, ctx->xbridge->name, ctx->xin->allow_side_effects); } else if (*tnl_port != ODPP_NONE && diff --git a/tests/tunnel-push-pop.at b/tests/tunnel-push-pop.at index 92eebba2eaa..013ecbcaa80 100644 --- a/tests/tunnel-push-pop.at +++ b/tests/tunnel-push-pop.at @@ -369,6 +369,26 @@ AT_CHECK([ovs-appctl tnl/neigh/show | grep br | sort], [0], [dnl 1.1.2.92 f8:bc:12:44:34:b6 br0 ]) +dnl Receiving Gratuitous ARP request with correct VLAN id should alter tunnel neighbor cache +AT_CHECK([ovs-appctl netdev-dummy/receive p0 'recirc_id(0),in_port(1),eth(src=f8:bc:12:44:34:c8,dst=ff:ff:ff:ff:ff:ff),eth_type(0x8100),vlan(vid=10,pcp=7),encap(eth_type(0x0806),arp(sip=1.1.2.92,tip=1.1.2.92,op=1,sha=f8:bc:12:44:34:c8,tha=00:00:00:00:00:00))']) + +ovs-appctl time/warp 1000 +ovs-appctl time/warp 1000 + +AT_CHECK([ovs-appctl tnl/neigh/show | grep br | sort], [0], [dnl +1.1.2.92 f8:bc:12:44:34:c8 br0 +]) + +dnl Receiving Gratuitous ARP reply with correct VLAN id should alter tunnel neighbor cache +AT_CHECK([ovs-appctl netdev-dummy/receive p0 'recirc_id(0),in_port(1),eth(src=f8:bc:12:44:34:b2,dst=ff:ff:ff:ff:ff:ff),eth_type(0x8100),vlan(vid=10,pcp=7),encap(eth_type(0x0806),arp(sip=1.1.2.92,tip=1.1.2.92,op=2,sha=f8:bc:12:44:34:b2,tha=f8:bc:12:44:34:b2))']) + +ovs-appctl time/warp 1000 +ovs-appctl time/warp 1000 + +AT_CHECK([ovs-appctl tnl/neigh/show | grep br | sort], [0], [dnl +1.1.2.92 f8:bc:12:44:34:b2 br0 +]) + dnl Receive ARP reply without VLAN header AT_CHECK([ovs-vsctl set port br0 tag=0]) AT_CHECK([ovs-appctl tnl/neigh/flush], [0], [OK From f1eb850aea833c5fd0cc106020184b0db63d7a30 Mon Sep 17 00:00:00 2001 From: Lin Huang Date: Sun, 23 Oct 2022 12:58:55 +0800 Subject: [PATCH 030/833] mac-learning: Fix learned fdb entries not age out issue. 
After a user adds a static fdb entry, the get_lru() function will always return the static fdb entry. As a result, normal fdb entries will not age out through mac_learning_run(). Fix the issue by modifying the get_lru() function to check the entry->expires field and not return an entry whose entry->expires is MAC_ENTRY_AGE_STATIC_ENTRY. Add a unit test for this. Fixes: ccc24fc88d59 ("ofproto-dpif: APIs and CLI option to add/delete static fdb entry.") Acked-by: Eelco Chaudron Tested-by: Zhang Yuhuang Signed-off-by: Lin Huang Signed-off-by: Ilya Maximets --- lib/mac-learning.c | 37 ++++++++++++++----------------------- tests/ofproto-dpif.at | 23 +++++++++++++++++++++++ 2 files changed, 37 insertions(+), 23 deletions(-) diff --git a/lib/mac-learning.c b/lib/mac-learning.c index a60794fb26e..5932e2709d0 100644 --- a/lib/mac-learning.c +++ b/lib/mac-learning.c @@ -176,12 +176,18 @@ get_lru(struct mac_learning *ml, struct mac_entry **e) OVS_REQ_RDLOCK(ml->rwlock) { if (!ovs_list_is_empty(&ml->lrus)) { - *e = mac_entry_from_lru_node(ml->lrus.next); - return true; - } else { - *e = NULL; - return false; + struct mac_entry *entry; + + LIST_FOR_EACH (entry, lru_node, &ml->lrus) { + if (entry->expires != MAC_ENTRY_AGE_STATIC_ENTRY) { + *e = entry; + return true; + } + } } + + *e = NULL; + return false; } static unsigned int @@ -618,25 +624,10 @@ mac_learning_expire(struct mac_learning *ml, struct mac_entry *e) void mac_learning_flush(struct mac_learning *ml) { - struct mac_entry *e, *first_static_mac = NULL; - - while (get_lru(ml, &e) && (e != first_static_mac)) { - - /* Static mac should not be evicted. */ - if (MAC_ENTRY_AGE_STATIC_ENTRY == e->expires) { - - /* Make note of first static-mac encountered, so that this while - * loop will break on visting this mac again via get_lru(). */ - if (!first_static_mac) { - first_static_mac = e; - } + struct mac_entry *e; - /* Remove from lru head and append it to tail.
*/ - ovs_list_remove(&e->lru_node); - ovs_list_push_back(&ml->lrus, &e->lru_node); - } else { - mac_learning_expire(ml, e); - } + while (get_lru(ml, &e)) { + mac_learning_expire(ml, e); } hmap_shrink(&ml->table); } diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at index 8e993c585ff..eb4cd189609 100644 --- a/tests/ofproto-dpif.at +++ b/tests/ofproto-dpif.at @@ -7287,6 +7287,29 @@ AT_CHECK([ovs-appctl coverage/read-counter mac_learning_static_none_move], [0], OVS_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([ofproto-dpif - static-mac learned mac age out]) +OVS_VSWITCHD_START([set bridge br0 fail-mode=standalone -- set bridge br0 other_config:mac-aging-time=5]) +add_of_ports br0 1 2 + +dnl Add some static mac entries. +AT_CHECK([ovs-appctl fdb/add br0 p1 0 50:54:00:00:01:01]) +AT_CHECK([ovs-appctl fdb/add br0 p2 0 50:54:00:00:02:02]) + +dnl Generate some dynamic fdb entries on some ports. +OFPROTO_TRACE([ovs-dummy], [in_port(1),eth(src=60:54:00:00:00:01)], [-generate], [100,2]) +OFPROTO_TRACE([ovs-dummy], [in_port(2),eth(src=60:54:00:00:00:02)], [-generate], [100,1]) + +dnl Waiting for aging out. +ovs-appctl time/warp 20000 + +dnl Count number of static entries remaining. +AT_CHECK_UNQUOTED([ovs-appctl fdb/stats-show br0 | grep expired], [0], [dnl + Total number of expired MAC entries : 2 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([ofproto-dpif - basic truncate action]) OVS_VSWITCHD_START add_of_ports br0 1 2 3 4 5 From 0d0f282c19e1d83fd18529e225845560c6e830e4 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 25 Oct 2022 18:33:53 +0200 Subject: [PATCH 031/833] vswitch.xml: Fix the name of rstp-path-cost option. For some reason it is documented as 'rstp-port-path-cost', while the code and some other bits of documentation use 'rstp-path-cost'. 
Fixes: 9efd308e957c ("Rapid Spanning Tree Protocol (IEEE 802.1D).") Reviewed-by: David Marchand Signed-off-by: Ilya Maximets --- vswitchd/vswitch.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 36388e3c42d..928821a8239 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -2350,7 +2350,7 @@ lowest port-id is elected as the root. - The port path cost. The Port's contribution, when it is the Root Port, to the Root Path Cost for the Bridge. By default the From 0bd4155f560fe5fb790b2f714d3682008b6ce736 Mon Sep 17 00:00:00 2001 From: Paolo Valerio Date: Wed, 26 Oct 2022 10:44:09 +0200 Subject: [PATCH 032/833] odp-util: Add missing separator in format_odp_conntrack_action(). If OVS_CT_ATTR_TIMEOUT is included, the resulting output is the following: actions:ct(commit,timeout=1nat(src=10.1.1.240)) Fix it by trivially adding a trailing ',' to timeout as well. Signed-off-by: Paolo Valerio Signed-off-by: Ilya Maximets --- lib/odp-util.c | 2 +- tests/odp.at | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/odp-util.c b/lib/odp-util.c index ba5be4bb355..72e076e1c5b 100644 --- a/lib/odp-util.c +++ b/lib/odp-util.c @@ -1004,7 +1004,7 @@ format_odp_conntrack_action(struct ds *ds, const struct nlattr *attr) ds_put_format(ds, "helper=%s,", helper); } if (timeout) { - ds_put_format(ds, "timeout=%s", timeout); + ds_put_format(ds, "timeout=%s,", timeout); } if (nat) { format_odp_ct_nat(ds, nat); diff --git a/tests/odp.at b/tests/odp.at index 7a1cf3b2ceb..88b7cfd917f 100644 --- a/tests/odp.at +++ b/tests/odp.at @@ -348,7 +348,9 @@ ct(commit,helper=tftp) ct(commit,timeout=ovs_tp_1_tcp4) ct(nat) ct(commit,nat(src)) +ct(commit,timeout=ovs_tp_1_tcp4,nat(src)) ct(commit,nat(dst)) +ct(commit,timeout=ovs_tp_1_tcp4,nat(dst)) ct(commit,nat(src=10.0.0.240,random)) ct(commit,nat(src=10.0.0.240:32768-65535,random)) ct(commit,nat(dst=10.0.0.128-10.0.0.254,hash)) From 
fec5424aedc9a104013d85cdd4e7399e10777a8a Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 26 Oct 2022 15:40:28 +0200 Subject: [PATCH 033/833] tc: Fix misaligned writes while parsing pedit. Offsets within 'rewrite' action are not 4-byte aligned, so has to be accessed carefully. SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior lib/tc.c:1132:17 in lib/tc.c:1132:17: runtime error: store to misaligned address 0x7fba215b2025 for type 'ovs_be32' (aka 'unsigned int'), which requires 4 byte alignment 0 0xd78857 in nl_parse_act_pedit lib/tc.c:1132:24 1 0xd68103 in nl_parse_single_action lib/tc.c:1936:15 2 0xd624ee in nl_parse_flower_actions lib/tc.c:2024:19 3 0xd624ee in nl_parse_flower_options lib/tc.c:2139:12 4 0xd5f082 in parse_netlink_to_tc_flower lib/tc.c:2187:12 5 0xd6a2a1 in tc_replace_flower lib/tc.c:3776:19 6 0xd2ae8f in netdev_tc_flow_put lib/netdev-offload-tc.c:2350:11 7 0x951d07 in netdev_flow_put lib/netdev-offload.c:318:14 8 0xcbb81a in parse_flow_put lib/dpif-netlink.c:2297:11 9 0xcbb81a in try_send_to_netdev lib/dpif-netlink.c:2384:15 10 0xcbb81a in dpif_netlink_operate lib/dpif-netlink.c:2455:23 11 0x8678ae in dpif_operate lib/dpif.c:1372:13 12 0x6bcc89 in handle_upcalls ofproto/ofproto-dpif-upcall.c:1674:5 13 0x6bcc89 in recv_upcalls ofproto/ofproto-dpif-upcall.c:905:9 14 0x6b7f9a in udpif_upcall_handler ofproto/ofproto-dpif-upcall.c:801:13 15 0xb54c5a in ovsthread_wrapper lib/ovs-thread.c:422:12 16 0x7fba2f2081ce in start_thread (/lib64/libpthread.so.0+0x81ce) 17 0x7fba2de39dd2 in clone (/lib64/libc.so.6+0x39dd2) Fixes: 8ada482bbe19 ("tc: Add header rewrite using tc pedit action") Reviewed-by: Simon Horman Signed-off-by: Ilya Maximets --- lib/tc.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/lib/tc.c b/lib/tc.c index 4d7de8adde4..dce66ab0bd3 100644 --- a/lib/tc.c +++ b/lib/tc.c @@ -1114,7 +1114,7 @@ nl_parse_act_pedit(struct nlattr *options, struct tc_flower *flower) int diff = flower_off + (keys->off - mf); 
ovs_be32 *dst = (void *) (rewrite_key + diff); ovs_be32 *dst_m = (void *) (rewrite_mask + diff); - ovs_be32 mask, mask_word, data_word; + ovs_be32 mask, mask_word, data_word, val; uint32_t zero_bits; mask_word = htonl(ntohl(keys->mask) << m->boundary_shift); @@ -1129,8 +1129,13 @@ nl_parse_act_pedit(struct nlattr *options, struct tc_flower *flower) mask &= htonl(UINT32_MAX << zero_bits); } - *dst_m |= mask; - *dst |= data_word & mask; + val = get_unaligned_be32(dst_m); + val |= mask; + put_unaligned_be32(dst_m, val); + + val = get_unaligned_be32(dst); + val |= data_word & mask; + put_unaligned_be32(dst, val); } } From a3848d98e19479cf87cd2216fa606f51fdb32b52 Mon Sep 17 00:00:00 2001 From: Paolo Valerio Date: Mon, 31 Oct 2022 16:57:33 +0100 Subject: [PATCH 034/833] conntrack: Show parent key if present. Similarly to what happens when CTA_TUPLE_MASTER is present in a ct netlink dump, add the ability to print out the parent key to the userspace implementation as well. Signed-off-by: Paolo Valerio Signed-off-by: Ilya Maximets --- lib/conntrack.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/conntrack.c b/lib/conntrack.c index 13c5ab6283d..550b2be9b91 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -2647,6 +2647,10 @@ conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry, conn_key_to_tuple(&conn->key, &entry->tuple_orig); conn_key_to_tuple(&conn->rev_key, &entry->tuple_reply); + if (conn->alg_related) { + conn_key_to_tuple(&conn->parent_key, &entry->tuple_parent); + } + entry->zone = conn->key.zone; ovs_mutex_lock(&conn->lock); From 02be2c318c8ef3255b62541ec3de53bd6f325c7a Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 31 Oct 2022 17:17:59 +0100 Subject: [PATCH 035/833] netdev-linux: Fix inability to apply QoS on ports with custom qdiscs. tc_del_qdisc() function only removes qdiscs with handle '1:0'. 
If for some reason the interface has a qdisc with non-zero handle attached, tc_del_qdisc() will not delete it and subsequent tc_install() will fail to install a new qdisc. The problem is that Libvirt by default is setting noqueue qdisc for all tap interfaces it creates. This is done for performance reasons to ensure lockless xmit. The issue is causing non-working QoS in OpenStack setups since new versions of Libvirt started to use OVS to configure it. In the past, Libvirt configured TC on its own, bypassing OVS. Removing the handle value from the deletion request, so any qdisc can be removed. Changing the error checking to also pass ENOENT, since that is the error reported if only default qdisc is present. Alternative solution might be to use NLM_F_REPLACE, but that will be a larger change with a potential need of refactoring. Potential side effect of the change is that OVS may start removing qdiscs that it didn't remove before. Though it's not a new issue and 'linux-noop' QoS type should be used for ports that OVS should not touch. Otherwise, OVS owns qdiscs on all interfaces attached to it. While at it, adding more logs as errors are not logged in any way at the moment making the issue hard to debug. Reported-at: https://bugzilla.redhat.com/2138339 Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2022-October/052088.html Reported-at: https://github.com/openvswitch/ovs-issues/issues/268 Suggested-by: Slawek Kaplonski Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- lib/netdev-linux.c | 13 +++++++++---- tests/system-traffic.at | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 7ea4070c23a..59e8dc0ae6c 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -2984,12 +2984,18 @@ netdev_linux_set_qos(struct netdev *netdev_, /* Delete existing qdisc. 
*/ error = tc_del_qdisc(netdev_); if (error) { + VLOG_WARN_RL(&rl, "%s: Failed to delete existing qdisc: %s", + netdev_get_name(netdev_), ovs_strerror(error)); goto exit; } ovs_assert(netdev->tc == NULL); /* Install new qdisc. */ error = new_ops->tc_install(netdev_, details); + if (error) { + VLOG_WARN_RL(&rl, "%s: Failed to install new qdisc: %s", + netdev_get_name(netdev_), ovs_strerror(error)); + } ovs_assert((error == 0) == (netdev->tc != NULL)); } @@ -6143,13 +6149,12 @@ tc_del_qdisc(struct netdev *netdev_) if (!tcmsg) { return ENODEV; } - tcmsg->tcm_handle = tc_make_handle(1, 0); tcmsg->tcm_parent = TC_H_ROOT; error = tc_transact(&request, NULL); - if (error == EINVAL) { - /* EINVAL probably means that the default qdisc was in use, in which - * case we've accomplished our purpose. */ + if (error == EINVAL || error == ENOENT) { + /* EINVAL or ENOENT probably means that the default qdisc was in use, + * in which case we've accomplished our purpose. */ error = 0; } if (!error && netdev->tc) { diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 731de439c7a..e5403519f2a 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -2080,6 +2080,42 @@ OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0050: *2627 *2829 *2a2b *2c2d *2e2f *3 OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0060: *3637" 2>&1 1>/dev/null]) +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_BANNER([QoS]) + +AT_SETUP([QoS - basic configuration]) +AT_SKIP_IF([test $HAVE_TC = no]) +OVS_TRAFFIC_VSWITCHD_START() + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +dnl Adding a custom qdisc to ovs-p1, ovs-p0 will have the default qdisc. +AT_CHECK([tc qdisc add dev ovs-p1 root noqueue]) +AT_CHECK([tc qdisc show dev ovs-p1 | grep -q noqueue]) + +dnl Configure the same QoS for both ports. 
+AT_CHECK([ovs-vsctl set port ovs-p0 qos=@qos -- set port ovs-p1 qos=@qos dnl + -- --id=@qos create qos dnl + type=linux-htb other-config:max-rate=3000000 queues:0=@queue dnl + -- --id=@queue create queue dnl + other_config:min-rate=2000000 other_config:max-rate=3000000 dnl + other_config:burst=3000000], + [ignore], [ignore]) + +dnl Wait for qdiscs to be applied. +OVS_WAIT_UNTIL([tc qdisc show dev ovs-p0 | grep -q htb]) +OVS_WAIT_UNTIL([tc qdisc show dev ovs-p1 | grep -q htb]) + +dnl Check the configuration. +m4_define([HTB_CONF], [rate 2Mbit ceil 3Mbit burst 375000b cburst 375000b]) +AT_CHECK([tc class show dev ovs-p0 | grep -q 'class htb .* HTB_CONF']) +AT_CHECK([tc class show dev ovs-p1 | grep -q 'class htb .* HTB_CONF']) + OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP From 235fc6f4c416f07ed3cc559c271641542eaf2e04 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 2 Nov 2022 23:45:04 +0100 Subject: [PATCH 036/833] AUTHORS: Add Daniel Ding. Signed-off-by: Ilya Maximets --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index f62840b1b36..7bb4e41a05d 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -117,6 +117,7 @@ Dan Wendlandt Dan Williams dcbw@redhat.com Daniel Alvarez dalvarez@redhat.com Daniel Borkmann dborkman@redhat.com +Daniel Ding zhihui.ding@easystack.cn Daniel Hiltgen daniel@netkine.com Daniel Roman Daniele Di Proietto daniele.di.proietto@gmail.com From 9a638044ecf26ef4fc3309b75be5aaf1280496bb Mon Sep 17 00:00:00 2001 From: Han Zhou Date: Tue, 1 Nov 2022 21:09:07 -0700 Subject: [PATCH 037/833] ovsdb: transaction: Refactor assess_weak_refs. The loops for adding weak refs are quite similar. Abstract to a function, which will be used by one more cases later. The patch also changes the txn_row arg to the source row. 
Signed-off-by: Han Zhou Signed-off-by: Ilya Maximets --- ovsdb/transaction.c | 78 +++++++++++++++++++++------------------------ 1 file changed, 36 insertions(+), 42 deletions(-) diff --git a/ovsdb/transaction.c b/ovsdb/transaction.c index bb997b45b5d..6796880561e 100644 --- a/ovsdb/transaction.c +++ b/ovsdb/transaction.c @@ -587,7 +587,7 @@ ovsdb_txn_update_weak_refs(struct ovsdb_txn *txn OVS_UNUSED, } static void -add_weak_ref(struct ovsdb_txn_row *txn_row, const struct ovsdb_row *dst_, +add_weak_ref(const struct ovsdb_row *src, const struct ovsdb_row *dst_, struct ovs_list *ref_list, const union ovsdb_atom *key, const union ovsdb_atom *value, bool by_key, const struct ovsdb_column *column) @@ -595,13 +595,13 @@ add_weak_ref(struct ovsdb_txn_row *txn_row, const struct ovsdb_row *dst_, struct ovsdb_row *dst = CONST_CAST(struct ovsdb_row *, dst_); struct ovsdb_weak_ref *weak; - if (txn_row->new == dst) { + if (src == dst) { return; } weak = xzalloc(sizeof *weak); - weak->src_table = txn_row->new->table; - weak->src = *ovsdb_row_get_uuid(txn_row->new); + weak->src_table = src->table; + weak->src = *ovsdb_row_get_uuid(src); weak->dst_table = dst->table; weak->dst = *ovsdb_row_get_uuid(dst); ovsdb_type_clone(&weak->type, &column->type); @@ -616,7 +616,7 @@ add_weak_ref(struct ovsdb_txn_row *txn_row, const struct ovsdb_row *dst_, } static void -find_and_add_weak_ref(struct ovsdb_txn_row *txn_row, +find_and_add_weak_ref(const struct ovsdb_row *src, const union ovsdb_atom *key, const union ovsdb_atom *value, const struct ovsdb_column *column, @@ -628,7 +628,7 @@ find_and_add_weak_ref(struct ovsdb_txn_row *txn_row, : ovsdb_table_get_row(column->type.value.uuid.refTable, &value->uuid); if (row) { - add_weak_ref(txn_row, row, ref_list, key, value, by_key, column); + add_weak_ref(src, row, ref_list, key, value, by_key, column); } else if (not_found) { if (uuid_is_zero(by_key ? 
&key->uuid : &value->uuid)) { *zero = true; @@ -637,6 +637,31 @@ find_and_add_weak_ref(struct ovsdb_txn_row *txn_row, } } +static void +find_and_add_weak_refs(const struct ovsdb_row *src, + const struct ovsdb_datum *datum, + const struct ovsdb_column *column, + struct ovs_list *ref_list, + struct ovsdb_datum *not_found, bool *zero) +{ + unsigned int i; + + if (ovsdb_base_type_is_weak_ref(&column->type.key)) { + for (i = 0; i < datum->n; i++) { + find_and_add_weak_ref(src, &datum->keys[i], + datum->values ? &datum->values[i] : NULL, + column, true, ref_list, not_found, zero); + } + } + + if (ovsdb_base_type_is_weak_ref(&column->type.value)) { + for (i = 0; i < datum->n; i++) { + find_and_add_weak_ref(src, &datum->keys[i], &datum->values[i], + column, false, ref_list, not_found, zero); + } + } +} + static struct ovsdb_error * OVS_WARN_UNUSED_RESULT assess_weak_refs(struct ovsdb_txn *txn, struct ovsdb_txn_row *txn_row) { @@ -678,7 +703,7 @@ assess_weak_refs(struct ovsdb_txn *txn, struct ovsdb_txn_row *txn_row) const struct ovsdb_column *column = node->data; struct ovsdb_datum *datum = &txn_row->new->fields[column->index]; struct ovsdb_datum added, removed, deleted_refs; - unsigned int orig_n, i; + unsigned int orig_n; bool zero = false; orig_n = datum->n; @@ -712,23 +737,8 @@ assess_weak_refs(struct ovsdb_txn *txn, struct ovsdb_txn_row *txn_row) /* Checking added data and creating new references. */ ovsdb_datum_init_empty(&deleted_refs); - if (ovsdb_base_type_is_weak_ref(&column->type.key)) { - for (i = 0; i < added.n; i++) { - find_and_add_weak_ref(txn_row, &added.keys[i], - added.values ? 
&added.values[i] : NULL, - column, true, &txn_row->added_refs, - &deleted_refs, &zero); - } - } - - if (ovsdb_base_type_is_weak_ref(&column->type.value)) { - for (i = 0; i < added.n; i++) { - find_and_add_weak_ref(txn_row, &added.keys[i], - &added.values[i], - column, false, &txn_row->added_refs, - &deleted_refs, &zero); - } - } + find_and_add_weak_refs(txn_row->new, &added, column, + &txn_row->added_refs, &deleted_refs, &zero); if (deleted_refs.n) { /* Removing all the references that doesn't point to valid rows. */ ovsdb_datum_sort_unique(&deleted_refs, &column->type); @@ -741,24 +751,8 @@ assess_weak_refs(struct ovsdb_txn *txn, struct ovsdb_txn_row *txn_row) /* Creating refs that needs to be removed on commit. This includes * both: the references that got directly removed from the datum and * references removed due to deletion of a referenced row. */ - if (ovsdb_base_type_is_weak_ref(&column->type.key)) { - for (i = 0; i < removed.n; i++) { - find_and_add_weak_ref(txn_row, &removed.keys[i], - removed.values - ? &removed.values[i] : NULL, - column, true, &txn_row->deleted_refs, - NULL, NULL); - } - } - - if (ovsdb_base_type_is_weak_ref(&column->type.value)) { - for (i = 0; i < removed.n; i++) { - find_and_add_weak_ref(txn_row, &removed.keys[i], - &removed.values[i], - column, false, &txn_row->deleted_refs, - NULL, NULL); - } - } + find_and_add_weak_refs(txn_row->new, &removed, column, + &txn_row->deleted_refs, NULL, NULL); ovsdb_datum_destroy(&removed, &column->type); if (datum->n != orig_n) { From c8a08db101237b985c44f81b9a2dd09130c9c3cf Mon Sep 17 00:00:00 2001 From: Han Zhou Date: Tue, 1 Nov 2022 21:09:08 -0700 Subject: [PATCH 038/833] ovsdb: transaction: Fix weak reference leak. When a row is deleted, if the row has weak references to other rows, the weak reference nodes attached to the destination rows (through weak->dst_node hmap) are not destroyed. Deleting weak references is properly handled when a row is modified. 
The removed references are taken care of by: 1. assess_weak_refs() figures out the deleted references from the row and adds them to txn_row->deleted_refs. 2. before commit, in ovsdb_txn_update_weak_refs() it finds the destination row for each item in txn_row->deleted_refs (from step 1), and destroys the corresponding weak references of the destination row. However, when the row is deleted, step 1 in assess_weak_refs() is missing. It directly returns without adding the deleted references to txn_row->deleted_refs. So, the destination nodes will keep those weak references although the source side of the references is already deleted. When rows that originate weak references are created and deleted, more and more such useless weak reference structures accumulate in memory, and can stay there until the destination rows are deleted. It is possible that the destination row is never deleted, and in such a case the ovsdb-server memory keeps growing (although it is not strictly a memory leak, because the structures are still referenced). This problem has an impact on applications like OVN SB DB - the memory grows very fast in long-running deployments and finally causes OOM. This patch fixes it by generating deleted_refs for deleted rows in assess_weak_refs().
Fixes: 4dbff9f0a685 ("ovsdb: transaction: Incremental reassessment of weak refs.") Signed-off-by: Han Zhou Signed-off-by: Ilya Maximets --- ovsdb/transaction.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/ovsdb/transaction.c b/ovsdb/transaction.c index 6796880561e..5d7c70a51c0 100644 --- a/ovsdb/transaction.c +++ b/ovsdb/transaction.c @@ -666,7 +666,7 @@ static struct ovsdb_error * OVS_WARN_UNUSED_RESULT assess_weak_refs(struct ovsdb_txn *txn, struct ovsdb_txn_row *txn_row) { struct ovsdb_weak_ref *weak; - struct ovsdb_table *table; + struct ovsdb_table *table = txn_row->table; struct shash_node *node; if (txn_row->old && !txn_row->new) { @@ -688,6 +688,15 @@ assess_weak_refs(struct ovsdb_txn *txn, struct ovsdb_txn_row *txn_row) ovs_assert(ovs_list_is_empty(&weak->src_node)); ovs_list_insert(&src_txn_row->deleted_refs, &weak->src_node); } + + /* Creating refs that needs to be removed on commit. */ + SHASH_FOR_EACH (node, &table->schema->columns) { + const struct ovsdb_column *column = node->data; + struct ovsdb_datum *datum = &txn_row->old->fields[column->index]; + + find_and_add_weak_refs(txn_row->old, datum, column, + &txn_row->deleted_refs, NULL, NULL); + } } if (!txn_row->new) { @@ -698,7 +707,6 @@ assess_weak_refs(struct ovsdb_txn *txn, struct ovsdb_txn_row *txn_row) return NULL; } - table = txn_row->table; SHASH_FOR_EACH (node, &table->schema->columns) { const struct ovsdb_column *column = node->data; struct ovsdb_datum *datum = &txn_row->new->fields[column->index]; From 165edb9ae2f85f4904aac6bba8370a4a891a867b Mon Sep 17 00:00:00 2001 From: Ian Stokes Date: Wed, 2 Nov 2022 18:47:02 +0000 Subject: [PATCH 039/833] ci: Update meson requirement for DPDK. The current version of meson used for building DPDK is 0.49.2. This has the restriction of holding the required python version to 3.9. A recent change [1] in DPDK bumped requirements on meson to 0.53.2. 
Update the version of meson used to build DPDK to 0.53.2 to remove the restriction. [1] https://git.dpdk.org/dpdk/commit/?id=909ad7b80e5e Signed-off-by: Ian Stokes Reviewed-by: David Marchand --- .ci/linux-prepare.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/linux-prepare.sh b/.ci/linux-prepare.sh index 16a7aec0b5b..11d75a6d598 100755 --- a/.ci/linux-prepare.sh +++ b/.ci/linux-prepare.sh @@ -27,7 +27,7 @@ cd .. pip3 install --disable-pip-version-check --user wheel pip3 install --disable-pip-version-check --user \ flake8 'hacking>=3.0' netaddr pyparsing sphinx setuptools pyelftools -pip3 install --user 'meson==0.49.2' +pip3 install --user 'meson==0.53.2' if [ "$M32" ]; then # Installing 32-bit libraries. From d77f93f363b7bb68186b432f579855b8a837d64e Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Fri, 4 Nov 2022 15:06:03 +0200 Subject: [PATCH 040/833] tc: Pass tun_metadata by reference Fix coverity big parameter passed by value CID 549858 (#1 of 1): Big parameter passed by value (PASS_BY_VALUE) pass_by_value: Passing parameter metadata of type struct tun_metadata (size 272 bytes) by value, which exceeds the medium threshold of 256 bytes Signed-off-by: Roi Dayan Signed-off-by: Simon Horman --- lib/tc.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/lib/tc.c b/lib/tc.c index dce66ab0bd3..b9a0138459e 100644 --- a/lib/tc.c +++ b/lib/tc.c @@ -2501,13 +2501,13 @@ nl_msg_put_act_tunnel_key_release(struct ofpbuf *request) static void nl_msg_put_act_tunnel_geneve_option(struct ofpbuf *request, - struct tun_metadata tun_metadata) + struct tun_metadata *tun_metadata) { const struct geneve_opt *opt; size_t outer, inner; int len, cnt = 0; - len = tun_metadata.present.len; + len = tun_metadata->present.len; if (!len) { return; } @@ -2515,7 +2515,7 @@ nl_msg_put_act_tunnel_geneve_option(struct ofpbuf *request, outer = nl_msg_start_nested(request, TCA_TUNNEL_KEY_ENC_OPTS); while (len) { - opt = 
&tun_metadata.opts.gnv[cnt]; + opt = &tun_metadata->opts.gnv[cnt]; inner = nl_msg_start_nested(request, TCA_TUNNEL_KEY_ENC_OPTS_GENEVE); nl_msg_put_be16(request, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS, @@ -2539,7 +2539,7 @@ nl_msg_put_act_tunnel_key_set(struct ofpbuf *request, bool id_present, ovs_be32 ipv4_dst, struct in6_addr *ipv6_src, struct in6_addr *ipv6_dst, ovs_be16 tp_dst, uint8_t tos, uint8_t ttl, - struct tun_metadata tun_metadata, + struct tun_metadata *tun_metadata, uint8_t no_csum, uint32_t action_pc) { size_t offset; @@ -3207,7 +3207,7 @@ nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower) action->encap.tp_dst, action->encap.tos, action->encap.ttl, - action->encap.data, + &action->encap.data, action->encap.no_csum, action_pc); nl_msg_put_act_flags(request); @@ -3379,20 +3379,20 @@ nl_msg_put_masked_value(struct ofpbuf *request, uint16_t type, static void nl_msg_put_flower_tunnel_opts(struct ofpbuf *request, uint16_t type, - struct tun_metadata metadata) + struct tun_metadata *metadata) { struct geneve_opt *opt; size_t outer, inner; int len, cnt = 0; - len = metadata.present.len; + len = metadata->present.len; if (!len) { return; } outer = nl_msg_start_nested(request, type); while (len) { - opt = &metadata.opts.gnv[cnt]; + opt = &metadata->opts.gnv[cnt]; inner = nl_msg_start_nested(request, TCA_FLOWER_KEY_ENC_OPTS_GENEVE); nl_msg_put_be16(request, TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS, @@ -3469,9 +3469,9 @@ nl_msg_put_flower_tunnel(struct ofpbuf *request, struct tc_flower *flower) nl_msg_put_be32(request, TCA_FLOWER_KEY_ENC_KEY_ID, id); } nl_msg_put_flower_tunnel_opts(request, TCA_FLOWER_KEY_ENC_OPTS, - flower->key.tunnel.metadata); + &flower->key.tunnel.metadata); nl_msg_put_flower_tunnel_opts(request, TCA_FLOWER_KEY_ENC_OPTS_MASK, - flower->mask.tunnel.metadata); + &flower->mask.tunnel.metadata); } #define FLOWER_PUT_MASKED_VALUE(member, type) \ From 6ccf8efffccbacd1d7caacbde37f6999a66b3867 Mon Sep 17 00:00:00 2001 From: Roi Dayan 
Date: Fri, 4 Nov 2022 15:06:04 +0200 Subject: [PATCH 041/833] tc: Fix coverity dereference null return value CID 550702 (#1 of 1): Dereference null return value (NULL_RETURNS) 7. dereference: Dereferencing a pointer that might be NULL ex_type when calling nl_attr_get_u16. Signed-off-by: Roi Dayan Signed-off-by: Simon Horman --- lib/tc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/tc.c b/lib/tc.c index b9a0138459e..a66dc432f98 100644 --- a/lib/tc.c +++ b/lib/tc.c @@ -1087,6 +1087,10 @@ nl_parse_act_pedit(struct nlattr *options, struct tc_flower *flower) } ex_type = nl_attr_find_nested(nla, TCA_PEDIT_KEY_EX_HTYPE); + if (!ex_type) { + return EOPNOTSUPP; + } + type = nl_attr_get_u16(ex_type); err = csum_update_flag(flower, type); From 48a0adefae0a06a80be85dfe9adeb2ee2e51704a Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Fri, 4 Nov 2022 15:06:05 +0200 Subject: [PATCH 042/833] dpif-netlink: Remove redundant null assignment The assignment of the features pointer is not doing anything and can be removed. CC: Justin Pettit Signed-off-by: Roi Dayan Acked-by: Justin Pettit Signed-off-by: Simon Horman --- lib/dpif-netlink.c | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c index a620a6ec52d..026b0daa8d8 100644 --- a/lib/dpif-netlink.c +++ b/lib/dpif-netlink.c @@ -4105,7 +4105,6 @@ dpif_netlink_meter_get_features(const struct dpif *dpif_, struct ofputil_meter_features *features) { if (probe_broken_meters(CONST_CAST(struct dpif *, dpif_))) { - features = NULL; return; } From c230c7579c14cbe5119df627f550a3db26391a39 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Wed, 2 Nov 2022 14:46:00 +0200 Subject: [PATCH 043/833] netdev-offload-tc: Reserve lower tc prios for ip ethertypes Currently ethertype to prio hmap is static and the first ethertype being used gets a lower priority. Usually there is an arp request before the ip traffic and the arp ethertype gets a lower tc priority while the ip traffic proto gets a higher priority. 
In this case ip traffic will go through more hops in tc and HW. Instead, reserve lower priorities for ip ethertypes. Signed-off-by: Paul Blakey Reviewed-by: Roi Dayan Acked-by: Eelco Chaudron Signed-off-by: Simon Horman --- lib/netdev-offload-tc.c | 35 ++++++++++++++++++++++++++++------- lib/tc.h | 2 ++ 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c index f6f90a741fd..ce7f8ad9730 100644 --- a/lib/netdev-offload-tc.c +++ b/lib/netdev-offload-tc.c @@ -325,6 +325,28 @@ struct prio_map_data { uint16_t prio; }; +static uint16_t +get_next_available_prio(ovs_be16 protocol) +{ + static uint16_t last_prio = TC_RESERVED_PRIORITY_MAX; + + if (multi_mask_per_prio) { + if (protocol == htons(ETH_P_IP)) { + return TC_RESERVED_PRIORITY_IPV4; + } else if (protocol == htons(ETH_P_IPV6)) { + return TC_RESERVED_PRIORITY_IPV6; + } + } + + /* last_prio can overflow if there will be many different kinds of + * flows which shouldn't happen organically. */ + if (last_prio == UINT16_MAX) { + return TC_RESERVED_PRIORITY_NONE; + } + + return ++last_prio; +} + /* Get free prio for tc flower * If prio is already allocated for mask/eth_type combination then return it. * If not assign new prio. @@ -336,11 +358,11 @@ get_prio_for_tc_flower(struct tc_flower *flower) { static struct hmap prios = HMAP_INITIALIZER(&prios); static struct ovs_mutex prios_lock = OVS_MUTEX_INITIALIZER; - static uint16_t last_prio = TC_RESERVED_PRIORITY_MAX; size_t key_len = sizeof(struct tc_flower_key); size_t hash = hash_int((OVS_FORCE uint32_t) flower->key.eth_type, 0); struct prio_map_data *data; struct prio_map_data *new_data; + uint16_t prio; if (!multi_mask_per_prio) { hash = hash_bytes(&flower->mask, key_len, hash); @@ -359,21 +381,20 @@ get_prio_for_tc_flower(struct tc_flower *flower) } } - if (last_prio == UINT16_MAX) { - /* last_prio can overflow if there will be many different kinds of - * flows which shouldn't happen organically. 
*/ + prio = get_next_available_prio(flower->key.eth_type); + if (prio == TC_RESERVED_PRIORITY_NONE) { ovs_mutex_unlock(&prios_lock); - return 0; + return prio; } new_data = xzalloc(sizeof *new_data); memcpy(&new_data->mask, &flower->mask, key_len); - new_data->prio = ++last_prio; + new_data->prio = prio; new_data->protocol = flower->key.eth_type; hmap_insert(&prios, &new_data->node, hash); ovs_mutex_unlock(&prios_lock); - return new_data->prio; + return prio; } static uint32_t diff --git a/lib/tc.h b/lib/tc.h index 161f438124b..a828fd3e3f1 100644 --- a/lib/tc.h +++ b/lib/tc.h @@ -49,6 +49,8 @@ enum tc_flower_reserved_prio { TC_RESERVED_PRIORITY_NONE, TC_RESERVED_PRIORITY_POLICE, + TC_RESERVED_PRIORITY_IPV4, + TC_RESERVED_PRIORITY_IPV6, __TC_RESERVED_PRIORITY_MAX }; #define TC_RESERVED_PRIORITY_MAX (__TC_RESERVED_PRIORITY_MAX -1) From bb9fedb79af8df5f14922ae588866314a0e31bf5 Mon Sep 17 00:00:00 2001 From: Chaoyong He Date: Wed, 20 Jul 2022 16:42:00 +0800 Subject: [PATCH 044/833] netdev-offload-dpdk: Enhance the support of tunnel pop action Populate the 'is_ipv6' field of 'struct rte_flow_tunnel', which can be used in the implementation of tunnel pop action for DPDK PMD. 
Fixes: be56e063d028 ("netdev-offload-dpdk: Support tunnel pop action.") Signed-off-by: Chaoyong He Reviewed-by: Louis Peens Acked-by: Eli Britstein Signed-off-by: Simon Horman --- lib/netdev-offload-dpdk.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c index 80a64a6cc06..38f00fd309e 100644 --- a/lib/netdev-offload-dpdk.c +++ b/lib/netdev-offload-dpdk.c @@ -1099,12 +1099,18 @@ vport_to_rte_tunnel(struct netdev *vport, const struct netdev_tunnel_config *tnl_cfg; memset(tunnel, 0, sizeof *tunnel); + + tnl_cfg = netdev_get_tunnel_config(vport); + if (!tnl_cfg) { + return -1; + } + + if (!IN6_IS_ADDR_V4MAPPED(&tnl_cfg->ipv6_dst)) { + tunnel->is_ipv6 = true; + } + if (!strcmp(netdev_get_type(vport), "vxlan")) { tunnel->type = RTE_FLOW_ITEM_TYPE_VXLAN; - tnl_cfg = netdev_get_tunnel_config(vport); - if (!tnl_cfg) { - return -1; - } tunnel->tp_dst = tnl_cfg->dst_port; if (!VLOG_DROP_DBG(&rl)) { ds_put_format(s_tnl, "flow tunnel create %d type vxlan; ", From 62ac7b8a53506d910b787d2909fe8bbe9fd99855 Mon Sep 17 00:00:00 2001 From: Wilson Peng Date: Wed, 9 Nov 2022 09:35:06 +0800 Subject: [PATCH 045/833] datapath-windows: Check the condition to reset pseudo header checksum on Rx side If ovs node running on Windows is processing NAT action on the RX side, it will reset pseudo header checksum only if the L4 checksum is same as the calculated pseudo header checksum before NAT action. Without the fix, if the L4 header checksum is filled with a pseudo header checksum (sourceip, dstip, protocol, tcppayloadlen+tcpheaderlen) OVS will still do the checksum update(replace some IP and port and recalculate the checksum). It will lead to incorrect L4 header checksum. 
Reported-at:https://github.com/openvswitch/ovs-issues/issues/265 Signed-off-by: Wilson Peng Signed-off-by: Alin-Gabriel Serdean --- datapath-windows/ovsext/Actions.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/datapath-windows/ovsext/Actions.c b/datapath-windows/ovsext/Actions.c index 2f44086b469..97029b0f4e1 100644 --- a/datapath-windows/ovsext/Actions.c +++ b/datapath-windows/ovsext/Actions.c @@ -1514,6 +1514,8 @@ OvsUpdateAddressAndPort(OvsForwardingContext *ovsFwdCtx, UINT16 *checkField = NULL; BOOLEAN l4Offload = FALSE; NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo; + UINT16 preNatPseudoChecksum = 0; + BOOLEAN preservePseudoChecksum = FALSE; ASSERT(layers->value != 0); @@ -1549,6 +1551,11 @@ OvsUpdateAddressAndPort(OvsForwardingContext *ovsFwdCtx, * case, we only update the TTL. */ /*Only tx direction the checksum value will be reset to be PseudoChecksum*/ + if (!isTx) { + preNatPseudoChecksum = IPPseudoChecksum(&ipHdr->saddr, &ipHdr->daddr, + tcpHdr ? IPPROTO_TCP : IPPROTO_UDP, + ntohs(ipHdr->tot_len) - ipHdr->ihl * 4); + } if (isSource) { addrField = &ipHdr->saddr; @@ -1565,7 +1572,12 @@ OvsUpdateAddressAndPort(OvsForwardingContext *ovsFwdCtx, ((BOOLEAN)csumInfo.Receive.UdpChecksumSucceeded || (BOOLEAN)csumInfo.Receive.UdpChecksumFailed); } - if (isTx && l4Offload) { + if (!isTx && l4Offload) { + if (*checkField == preNatPseudoChecksum) { + preservePseudoChecksum = TRUE; + } + } + if (isTx && l4Offload || preservePseudoChecksum) { *checkField = IPPseudoChecksum(&newAddr, &ipHdr->daddr, tcpHdr ? 
IPPROTO_TCP : IPPROTO_UDP, ntohs(ipHdr->tot_len) - ipHdr->ihl * 4); @@ -1585,8 +1597,13 @@ OvsUpdateAddressAndPort(OvsForwardingContext *ovsFwdCtx, ((BOOLEAN)csumInfo.Receive.UdpChecksumSucceeded || (BOOLEAN)csumInfo.Receive.UdpChecksumFailed); } + if (!isTx && l4Offload) { + if (*checkField == preNatPseudoChecksum) { + preservePseudoChecksum = TRUE; + } + } - if (isTx && l4Offload) { + if (isTx && l4Offload || preservePseudoChecksum) { *checkField = IPPseudoChecksum(&ipHdr->saddr, &newAddr, tcpHdr ? IPPROTO_TCP : IPPROTO_UDP, ntohs(ipHdr->tot_len) - ipHdr->ihl * 4); @@ -1595,7 +1612,8 @@ OvsUpdateAddressAndPort(OvsForwardingContext *ovsFwdCtx, if (*addrField != newAddr) { UINT32 oldAddr = *addrField; - if ((checkField && *checkField != 0) && (!l4Offload || !isTx)) { + if ((checkField && *checkField != 0) && + (!l4Offload || (!isTx && !preservePseudoChecksum))) { /* Recompute total checksum. */ *checkField = ChecksumUpdate32(*checkField, oldAddr, newAddr); @@ -1609,7 +1627,8 @@ OvsUpdateAddressAndPort(OvsForwardingContext *ovsFwdCtx, } if (portField && *portField != newPort) { - if ((checkField) && (!l4Offload || !isTx)) { + if ((checkField) && + (!l4Offload || (!isTx && !preservePseudoChecksum))) { /* Recompute total checksum. */ *checkField = ChecksumUpdate16(*checkField, *portField, newPort); From 8b3c86897d6a114a099255997bb74f12a735d9fb Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 23 Nov 2022 22:23:37 +0100 Subject: [PATCH 046/833] learn: Fix parsing immediate value for a field match. The value is right-justified after the string parsing with parse_int_string(), i.e. it is in BE byte order and aligned to the right side of the array. For example, the 0x10011 value in a 4-byte field will look like 0x00 0x01 0x00 0x11. However, value copy to the resulted ofpact is performed from the start of the memory. So, in case the destination size is smaller than the original field size, incorrect part of the value will be copied. 
In the 0x00 0x01 0x00 0x11 example above, if the copy is performed to a 3-byte field, the first 3 bytes will be copied, which are 0x00 0x01 0x00 instead of 0x01 0x00 0x11. This leads to a problem where NXM_NX_REG3[0..16]=0x10011 turns into NXM_NX_REG3[0..16]=0x100 after the parsing. Fix that by offsetting the starting position to the size difference in bytes similarly to how it is done in learn_parse_load_immediate(). While at it, changing &imm to imm.b in function calls that expect byte arrays as an argument. The old way is technically correct, but more error prone. The mf_write_subfield_value() call was also incorrect. However, the 'match' variable is actually not used for anything since checking removal in commit: dd43a558597b ("Do not perform validation in learn_parse();") So, just removing the call and the 'match' variable entirely instead of fixing it. Fixes: 21b2fa617126 ("ofp-parse: Allow match field names in actions and brackets in matches.") Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2022-November/052100.html Reported-by: Thomas Lee Reviewed-by: Simon Horman Signed-off-by: Ilya Maximets --- lib/learn.c | 18 +++++++----------- tests/learn.at | 4 ++-- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/lib/learn.c b/lib/learn.c index a40209ec0b8..a62add2fda0 100644 --- a/lib/learn.c +++ b/lib/learn.c @@ -241,7 +241,7 @@ static char * OVS_WARN_UNUSED_RESULT learn_parse_spec(const char *orig, char *name, char *value, const struct ofputil_port_map *port_map, struct ofpact_learn_spec *spec, - struct ofpbuf *ofpacts, struct match *match) + struct ofpbuf *ofpacts) { /* Parse destination and check prerequisites. */ struct mf_subfield dst; @@ -275,14 +275,14 @@ learn_parse_spec(const char *orig, char *name, char *value, } else { char *tail; /* Partial field value. 
*/ - if (parse_int_string(value, (uint8_t *)&imm, + if (parse_int_string(value, imm.b, dst.field->n_bytes, &tail) || *tail != 0) { imm_error = xasprintf("%s: cannot parse integer value", orig); } if (!imm_error && - !bitwise_is_all_zeros(&imm, dst.field->n_bytes, + !bitwise_is_all_zeros(imm.b, dst.field->n_bytes, dst.n_bits, dst.field->n_bytes * 8 - dst.n_bits)) { struct ds ds; @@ -304,15 +304,13 @@ learn_parse_spec(const char *orig, char *name, char *value, spec->src_type = NX_LEARN_SRC_IMMEDIATE; - /* Update 'match' to allow for satisfying destination - * prerequisites. */ - mf_write_subfield_value(&dst, &imm, match); - /* Push value last, as this may reallocate 'spec'! */ unsigned int imm_bytes = DIV_ROUND_UP(dst.n_bits, 8); uint8_t *src_imm = ofpbuf_put_zeros(ofpacts, OFPACT_ALIGN(imm_bytes)); - memcpy(src_imm, &imm, imm_bytes); + + memcpy(src_imm, &imm.b[dst.field->n_bytes - imm_bytes], + imm_bytes); free(error); return NULL; @@ -391,7 +389,6 @@ learn_parse__(char *orig, char *arg, const struct ofputil_port_map *port_map, struct ofpbuf *ofpacts) { struct ofpact_learn *learn; - struct match match; char *name, *value; learn = ofpact_put_LEARN(ofpacts); @@ -400,7 +397,6 @@ learn_parse__(char *orig, char *arg, const struct ofputil_port_map *port_map, learn->priority = OFP_DEFAULT_PRIORITY; learn->table_id = 1; - match_init_catchall(&match); while (ofputil_parse_key_value(&arg, &name, &value)) { if (!strcmp(name, "table")) { if (!ofputil_table_from_string(value, table_map, @@ -448,7 +444,7 @@ learn_parse__(char *orig, char *arg, const struct ofputil_port_map *port_map, spec = ofpbuf_put_zeros(ofpacts, sizeof *spec); error = learn_parse_spec(orig, name, value, port_map, - spec, ofpacts, &match); + spec, ofpacts); if (error) { return error; } diff --git a/tests/learn.at b/tests/learn.at index 5f1d6df9de4..d127fed3481 100644 --- a/tests/learn.at +++ b/tests/learn.at @@ -6,7 +6,7 @@ actions=learn() actions=learn(send_flow_rem) actions=learn(delete_learned) 
actions=learn(send_flow_rem,delete_learned) -actions=learn(NXM_OF_VLAN_TCI[0..11], NXM_OF_ETH_DST[]=NXM_OF_ETH_SRC[], output:NXM_OF_IN_PORT[], load:10->NXM_NX_REG0[5..10]) +actions=learn(NXM_OF_VLAN_TCI[0..11], NXM_OF_ETH_DST[]=NXM_OF_ETH_SRC[], NXM_NX_REG3[3..19]=0x10011, output:NXM_OF_IN_PORT[], load:10->NXM_NX_REG0[5..10]) actions=learn(table=1,idle_timeout=10, hard_timeout=20, fin_idle_timeout=5, fin_hard_timeout=10, priority=10, cookie=0xfedcba9876543210, in_port=99,eth_dst=eth_src,load:in_port->reg1[16..31]) actions=learn(limit=4096) actions=learn(limit=4096,result_dst=reg0[0]) @@ -18,7 +18,7 @@ OFPT_FLOW_MOD (xid=0x1): ADD actions=learn(table=1) OFPT_FLOW_MOD (xid=0x2): ADD actions=learn(table=1,send_flow_rem) OFPT_FLOW_MOD (xid=0x3): ADD actions=learn(table=1,delete_learned) OFPT_FLOW_MOD (xid=0x4): ADD actions=learn(table=1,send_flow_rem,delete_learned) -OFPT_FLOW_MOD (xid=0x5): ADD actions=learn(table=1,NXM_OF_VLAN_TCI[0..11],NXM_OF_ETH_DST[]=NXM_OF_ETH_SRC[],output:NXM_OF_IN_PORT[],load:0xa->NXM_NX_REG0[5..10]) +OFPT_FLOW_MOD (xid=0x5): ADD actions=learn(table=1,NXM_OF_VLAN_TCI[0..11],NXM_OF_ETH_DST[]=NXM_OF_ETH_SRC[],NXM_NX_REG3[3..19]=0x10011,output:NXM_OF_IN_PORT[],load:0xa->NXM_NX_REG0[5..10]) OFPT_FLOW_MOD (xid=0x6): ADD actions=learn(table=1,idle_timeout=10,hard_timeout=20,fin_idle_timeout=5,fin_hard_timeout=10,priority=10,cookie=0xfedcba9876543210,in_port=99,NXM_OF_ETH_DST[]=NXM_OF_ETH_SRC[],load:NXM_OF_IN_PORT[]->NXM_NX_REG1[16..31]) OFPT_FLOW_MOD (xid=0x7): ADD actions=learn(table=1,limit=4096) OFPT_FLOW_MOD (xid=0x8): ADD actions=learn(table=1,limit=4096,result_dst=NXM_NX_REG0[0]) From c6062d107716e6bf84f8106b0806ee73ba7207a3 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Wed, 9 Nov 2022 21:31:50 +0100 Subject: [PATCH 047/833] vswitchd: Publish per iface received multicast packets. The count of received multicast packets has been computed internally, but not exposed to ovsdb. Fix this. 
Signed-off-by: David Marchand Acked-by: Mike Pattrick Acked-by: Michael Santana Signed-off-by: Ilya Maximets --- vswitchd/bridge.c | 1 + 1 file changed, 1 insertion(+) diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index 25ce45e3dc1..d0667f229da 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -2619,6 +2619,7 @@ iface_refresh_stats(struct iface *iface) IFACE_STAT(tx_512_to_1023_packets, "tx_512_to_1023_packets") \ IFACE_STAT(tx_1024_to_1522_packets, "tx_1024_to_1522_packets") \ IFACE_STAT(tx_1523_to_max_packets, "tx_1523_to_max_packets") \ + IFACE_STAT(multicast, "rx_multicast_packets") \ IFACE_STAT(tx_multicast_packets, "tx_multicast_packets") \ IFACE_STAT(rx_broadcast_packets, "rx_broadcast_packets") \ IFACE_STAT(tx_broadcast_packets, "tx_broadcast_packets") \ From 2496d854326577a2d7ae94a86a085e8ae336302e Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 4 Nov 2022 15:25:42 +0100 Subject: [PATCH 048/833] rculist: Fix iteration macros. Some macros for rculist have no users and there are no unit tests specific to that library as well, so broken code wasn't spotted while updating to multi-variable iterators. Fixing multiple problems like missing commas, parenthesis, incorrect variable and macro names. 
Fixes: d293965d7b06 ("rculist: use multi-variable helpers for loop macros.") Reported-by: Subrata Nath Co-authored-by: Dumitru Ceara Signed-off-by: Dumitru Ceara Acked-by: Alin-Gabriel Serdean Signed-off-by: Ilya Maximets --- lib/rculist.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/lib/rculist.h b/lib/rculist.h index c0d77acf943..9bb8cbf3eb2 100644 --- a/lib/rculist.h +++ b/lib/rculist.h @@ -380,18 +380,18 @@ rculist_is_singleton_protected(const struct rculist *list) #define RCULIST_FOR_EACH_REVERSE_PROTECTED(ITER, MEMBER, RCULIST) \ for (INIT_MULTIVAR(ITER, MEMBER, (RCULIST)->prev, struct rculist); \ CONDITION_MULTIVAR(ITER, MEMBER, ITER_VAR(ITER) != (RCULIST)); \ - UPDATE_MULTIVAR(ITER, ITER_VAR(VAR).prev)) + UPDATE_MULTIVAR(ITER, ITER_VAR(ITER)->prev)) #define RCULIST_FOR_EACH_REVERSE_PROTECTED_CONTINUE(ITER, MEMBER, RCULIST) \ for (INIT_MULTIVAR(ITER, MEMBER, (ITER)->MEMBER.prev, struct rculist); \ CONDITION_MULTIVAR(ITER, MEMBER, ITER_VAR(ITER) != (RCULIST)); \ - UPDATE_MULTIVAR(ITER, ITER_VAR(VAR).prev)) + UPDATE_MULTIVAR(ITER, ITER_VAR(ITER)->prev)) #define RCULIST_FOR_EACH_PROTECTED(ITER, MEMBER, RCULIST) \ for (INIT_MULTIVAR(ITER, MEMBER, rculist_next_protected(RCULIST), \ struct rculist); \ CONDITION_MULTIVAR(ITER, MEMBER, ITER_VAR(ITER) != (RCULIST)); \ - UPDATE_MULTIVAR(ITER, rculist_next_protected(ITER_VAR(ITER))) \ + UPDATE_MULTIVAR(ITER, rculist_next_protected(ITER_VAR(ITER)))) \ #define RCULIST_FOR_EACH_SAFE_SHORT_PROTECTED(ITER, MEMBER, RCULIST) \ for (INIT_MULTIVAR_SAFE_SHORT(ITER, MEMBER, \ @@ -399,18 +399,18 @@ rculist_is_singleton_protected(const struct rculist *list) struct rculist); \ CONDITION_MULTIVAR_SAFE_SHORT(ITER, MEMBER, \ ITER_VAR(ITER) != (RCULIST), \ - ITER_NEXT_VAR(ITER) = rculist_next_protected(ITER_VAR(VAR))); \ - UPDATE_MULTIVAR_SHORT(ITER)) + ITER_NEXT_VAR(ITER) = rculist_next_protected(ITER_VAR(ITER))); \ + UPDATE_MULTIVAR_SAFE_SHORT(ITER)) #define 
RCULIST_FOR_EACH_SAFE_LONG_PROTECTED(ITER, NEXT, MEMBER, RCULIST) \ for (INIT_MULTIVAR_SAFE_LONG(ITER, NEXT, MEMBER, \ - rculist_next_protected(RCULIST) \ + rculist_next_protected(RCULIST), \ struct rculist); \ - CONDITION_MULTIVAR_SAFE_LONG(VAR, NEXT, MEMBER \ + CONDITION_MULTIVAR_SAFE_LONG(ITER, NEXT, MEMBER, \ ITER_VAR(ITER) != (RCULIST), \ - ITER_VAR(NEXT) = rculist_next_protected(ITER_VAR(VAR)), \ + ITER_VAR(NEXT) = rculist_next_protected(ITER_VAR(ITER)), \ ITER_VAR(NEXT) != (RCULIST)); \ - UPDATE_MULTIVAR_LONG(ITER)) + UPDATE_MULTIVAR_SAFE_LONG(ITER, NEXT)) #define RCULIST_FOR_EACH_SAFE_PROTECTED(...) \ OVERLOAD_SAFE_MACRO(RCULIST_FOR_EACH_SAFE_LONG_PROTECTED, \ From 5b06970e8eedd074dfa5a5405b8ada7435689fc8 Mon Sep 17 00:00:00 2001 From: Lin Huang Date: Tue, 24 May 2022 21:04:32 +0800 Subject: [PATCH 049/833] ofp-msgs: Fix comment typo. Fix comment typo. Signed-off-by: Lin Huang Acked-by: Adrian Moreno Signed-off-by: Ilya Maximets --- lib/ofp-msgs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/ofp-msgs.c b/lib/ofp-msgs.c index 93aa812978e..fdb89806480 100644 --- a/lib/ofp-msgs.c +++ b/lib/ofp-msgs.c @@ -148,7 +148,7 @@ struct raw_instance { /* Information about a particular 'enum ofpraw'. */ struct raw_info { /* All possible instantiations of this OFPRAW_* into OpenFlow headers. */ - struct raw_instance *instances; /* min_version - max_version + 1 elems. */ + struct raw_instance *instances; /* max_version - min_version + 1 elems. */ uint8_t min_version; uint8_t max_version; From 22413fe8a83cc4e153fc35defc6f01f7dc5a21b5 Mon Sep 17 00:00:00 2001 From: yangchang Date: Thu, 23 Jun 2022 18:32:06 +0800 Subject: [PATCH 050/833] lacp: Modify the comment misspelling. Change 'negotations' to 'negotiations'. 
Signed-off-by: yangchang Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/lacp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/lacp.h b/lib/lacp.h index 1ca06f762ba..5ba17c36a5c 100644 --- a/lib/lacp.h +++ b/lib/lacp.h @@ -24,7 +24,7 @@ /* LACP Protocol Implementation. */ enum lacp_status { - LACP_NEGOTIATED, /* Successful LACP negotations. */ + LACP_NEGOTIATED, /* Successful LACP negotiations. */ LACP_CONFIGURED, /* LACP is enabled but not negotiated. */ LACP_DISABLED /* LACP is not enabled. */ }; From 0937209fc7aca1107bb3f77cf1585799a086d065 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Thu, 25 Aug 2022 12:25:24 +0200 Subject: [PATCH 051/833] netdev-dpdk: Cleanup code when DPDK is disabled. Remove one unused stub: netdev_dpdk_register() can't be called if DPDK is disabled at build time. Remove unneeded #ifdef in call to free_dpdk_buf. Drop unneeded cast when calling free_dpdk_buf. Acked-by: Sunil Pai G Signed-off-by: David Marchand Signed-off-by: Ilya Maximets --- lib/dp-packet.c | 6 +----- lib/dp-packet.h | 4 +--- lib/netdev-dpdk.h | 5 ----- 3 files changed, 2 insertions(+), 13 deletions(-) diff --git a/lib/dp-packet.c b/lib/dp-packet.c index 4538d2a6148..61e405460a2 100644 --- a/lib/dp-packet.c +++ b/lib/dp-packet.c @@ -134,11 +134,7 @@ dp_packet_uninit(struct dp_packet *b) if (b->source == DPBUF_MALLOC) { free(dp_packet_base(b)); } else if (b->source == DPBUF_DPDK) { -#ifdef DPDK_NETDEV - /* If this dp_packet was allocated by DPDK it must have been - * created as a dp_packet */ - free_dpdk_buf((struct dp_packet*) b); -#endif + free_dpdk_buf(b); } else if (b->source == DPBUF_AFXDP) { free_afxdp_buf(b); } diff --git a/lib/dp-packet.h b/lib/dp-packet.h index 55eeaab2ce8..a8ea5b40f71 100644 --- a/lib/dp-packet.h +++ b/lib/dp-packet.h @@ -247,9 +247,7 @@ dp_packet_delete(struct dp_packet *b) { if (b) { if (b->source == DPBUF_DPDK) { - /* If this dp_packet was allocated by DPDK it must have been - * created as a dp_packet */ 
- free_dpdk_buf((struct dp_packet*) b); + free_dpdk_buf(b); return; } diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h index 7d2f64af23e..5cd95d00f5a 100644 --- a/lib/netdev-dpdk.h +++ b/lib/netdev-dpdk.h @@ -150,11 +150,6 @@ netdev_dpdk_rte_flow_tunnel_item_release( #else -static inline void -netdev_dpdk_register(const struct smap *ovs_other_config OVS_UNUSED) -{ - /* Nothing */ -} static inline void free_dpdk_buf(struct dp_packet *buf OVS_UNUSED) { From 126e6046eb9592200bfca2218002b8256f92d617 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Thu, 25 Aug 2022 12:25:25 +0200 Subject: [PATCH 052/833] netdev-dpdk: Move DPDK netdev related configuration. vhost related configuration and per port memory are netdev-dpdk configuration items. dpdk-stub.c and netdev-dpdk.c are never linked together, so we can move those bits out of the generic dpdk code. The dpdk_* accessors for those configuration items are then not needed anymore and we can simply reference local variables. Acked-by: Sunil Pai G Signed-off-by: David Marchand Signed-off-by: Ilya Maximets --- lib/dpdk-stub.c | 24 ----------- lib/dpdk.c | 101 --------------------------------------------- lib/dpdk.h | 4 -- lib/netdev-dpdk.c | 102 ++++++++++++++++++++++++++++++++++++++++++---- 4 files changed, 94 insertions(+), 137 deletions(-) diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c index 3eee1f485c0..58ebf6cb62c 100644 --- a/lib/dpdk-stub.c +++ b/lib/dpdk-stub.c @@ -49,30 +49,6 @@ dpdk_detach_thread(void) { } -const char * -dpdk_get_vhost_sock_dir(void) -{ - return NULL; -} - -bool -dpdk_vhost_iommu_enabled(void) -{ - return false; -} - -bool -dpdk_vhost_postcopy_enabled(void) -{ - return false; -} - -bool -dpdk_per_port_memory(void) -{ - return false; -} - bool dpdk_available(void) { diff --git a/lib/dpdk.c b/lib/dpdk.c index d909974f91b..240babc03e6 100644 --- a/lib/dpdk.c +++ b/lib/dpdk.c @@ -19,7 +19,6 @@ #include #include -#include #include #include @@ -47,40 +46,9 @@ VLOG_DEFINE_THIS_MODULE(dpdk); static 
FILE *log_stream = NULL; /* Stream for DPDK log redirection */ -static char *vhost_sock_dir = NULL; /* Location of vhost-user sockets */ -static bool vhost_iommu_enabled = false; /* Status of vHost IOMMU support */ -static bool vhost_postcopy_enabled = false; /* Status of vHost POSTCOPY - * support. */ -static bool per_port_memory = false; /* Status of per port memory support */ - /* Indicates successful initialization of DPDK. */ static atomic_bool dpdk_initialized = ATOMIC_VAR_INIT(false); -static int -process_vhost_flags(char *flag, const char *default_val, int size, - const struct smap *ovs_other_config, - char **new_val) -{ - const char *val; - int changed = 0; - - val = smap_get(ovs_other_config, flag); - - /* Process the vhost-sock-dir flag if it is provided, otherwise resort to - * default value. - */ - if (val && (strlen(val) <= size)) { - changed = 1; - *new_val = xstrdup(val); - VLOG_INFO("User-provided %s in use: %s", flag, *new_val); - } else { - VLOG_INFO("No %s provided - defaulting to %s", flag, default_val); - *new_val = xstrdup(default_val); - } - - return changed; -} - static bool args_contains(const struct svec *args, const char *value) { @@ -345,11 +313,9 @@ malloc_dump_stats_wrapper(FILE *stream) static bool dpdk_init__(const struct smap *ovs_other_config) { - char *sock_dir_subcomponent; char **argv = NULL; int result; bool auto_determine = true; - int err = 0; struct ovs_numa_dump *affinity = NULL; struct svec args = SVEC_EMPTY_INITIALIZER; @@ -361,49 +327,6 @@ dpdk_init__(const struct smap *ovs_other_config) rte_openlog_stream(log_stream); } - if (process_vhost_flags("vhost-sock-dir", ovs_rundir(), - NAME_MAX, ovs_other_config, - &sock_dir_subcomponent)) { - struct stat s; - if (!strstr(sock_dir_subcomponent, "..")) { - vhost_sock_dir = xasprintf("%s/%s", ovs_rundir(), - sock_dir_subcomponent); - - err = stat(vhost_sock_dir, &s); - if (err) { - VLOG_ERR("vhost-user sock directory '%s' does not exist.", - vhost_sock_dir); - } - } else { - 
vhost_sock_dir = xstrdup(ovs_rundir()); - VLOG_ERR("vhost-user sock directory request '%s/%s' has invalid" - "characters '..' - using %s instead.", - ovs_rundir(), sock_dir_subcomponent, ovs_rundir()); - } - free(sock_dir_subcomponent); - } else { - vhost_sock_dir = sock_dir_subcomponent; - } - - vhost_iommu_enabled = smap_get_bool(ovs_other_config, - "vhost-iommu-support", false); - VLOG_INFO("IOMMU support for vhost-user-client %s.", - vhost_iommu_enabled ? "enabled" : "disabled"); - - vhost_postcopy_enabled = smap_get_bool(ovs_other_config, - "vhost-postcopy-support", false); - if (vhost_postcopy_enabled && memory_locked()) { - VLOG_WARN("vhost-postcopy-support and mlockall are not compatible."); - vhost_postcopy_enabled = false; - } - VLOG_INFO("POSTCOPY support for vhost-user-client %s.", - vhost_postcopy_enabled ? "enabled" : "disabled"); - - per_port_memory = smap_get_bool(ovs_other_config, - "per-port-memory", false); - VLOG_INFO("Per port memory for DPDK devices %s.", - per_port_memory ? 
"enabled" : "disabled"); - svec_add(&args, ovs_get_program_name()); construct_dpdk_args(ovs_other_config, &args); @@ -558,30 +481,6 @@ dpdk_init(const struct smap *ovs_other_config) atomic_store_relaxed(&dpdk_initialized, enabled); } -const char * -dpdk_get_vhost_sock_dir(void) -{ - return vhost_sock_dir; -} - -bool -dpdk_vhost_iommu_enabled(void) -{ - return vhost_iommu_enabled; -} - -bool -dpdk_vhost_postcopy_enabled(void) -{ - return vhost_postcopy_enabled; -} - -bool -dpdk_per_port_memory(void) -{ - return per_port_memory; -} - bool dpdk_available(void) { diff --git a/lib/dpdk.h b/lib/dpdk.h index 64ebca47d6d..1b790e682e4 100644 --- a/lib/dpdk.h +++ b/lib/dpdk.h @@ -38,10 +38,6 @@ struct ovsrec_open_vswitch; void dpdk_init(const struct smap *ovs_other_config); bool dpdk_attach_thread(unsigned cpu); void dpdk_detach_thread(void); -const char *dpdk_get_vhost_sock_dir(void); -bool dpdk_vhost_iommu_enabled(void); -bool dpdk_vhost_postcopy_enabled(void); -bool dpdk_per_port_memory(void); bool dpdk_available(void); void print_dpdk_version(void); void dpdk_status(const struct ovsrec_open_vswitch *); diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index e4b3465e09b..339936b6e29 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -78,6 +79,12 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20); COVERAGE_DEFINE(vhost_tx_contention); COVERAGE_DEFINE(vhost_notification); +static char *vhost_sock_dir = NULL; /* Location of vhost-user sockets */ +static bool vhost_iommu_enabled = false; /* Status of vHost IOMMU support */ +static bool vhost_postcopy_enabled = false; /* Status of vHost POSTCOPY + * support. 
*/ +static bool per_port_memory = false; /* Status of per port memory support */ + #define DPDK_PORT_WATCHDOG_INTERVAL 5 #define OVS_CACHE_LINE_SIZE CACHE_LINE_SIZE @@ -915,7 +922,7 @@ netdev_dpdk_mempool_configure(struct netdev_dpdk *dev) uint32_t buf_size = dpdk_buf_size(dev->requested_mtu); struct dpdk_mp *dmp; int ret = 0; - bool per_port_mp = dpdk_per_port_memory(); + bool per_port_mp = per_port_memory; /* With shared memory we do not need to configure a mempool if the MTU * and socket ID have not changed, the previous configuration is still @@ -1379,7 +1386,7 @@ netdev_dpdk_vhost_construct(struct netdev *netdev) /* Take the name of the vhost-user port and append it to the location where * the socket is to be created, then register the socket. */ - dev->vhost_id = xasprintf("%s/%s", dpdk_get_vhost_sock_dir(), name); + dev->vhost_id = xasprintf("%s/%s", vhost_sock_dir, name); dev->vhost_driver_flags &= ~RTE_VHOST_USER_CLIENT; @@ -5102,12 +5109,12 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev) vhost_flags |= RTE_VHOST_USER_LINEARBUF_SUPPORT; /* Enable IOMMU support, if explicitly requested. */ - if (dpdk_vhost_iommu_enabled()) { + if (vhost_iommu_enabled) { vhost_flags |= RTE_VHOST_USER_IOMMU_SUPPORT; } /* Enable POSTCOPY support, if explicitly requested. */ - if (dpdk_vhost_postcopy_enabled()) { + if (vhost_postcopy_enabled) { vhost_flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT; } @@ -5389,8 +5396,18 @@ netdev_dpdk_rte_flow_tunnel_item_release(struct netdev *netdev, #endif /* ALLOW_EXPERIMENTAL_API */ static void -parse_user_mempools_list(const char *mtus) +parse_mempool_config(const struct smap *ovs_other_config) +{ + per_port_memory = smap_get_bool(ovs_other_config, + "per-port-memory", false); + VLOG_INFO("Per port memory for DPDK devices %s.", + per_port_memory ? 
"enabled" : "disabled"); +} + +static void +parse_user_mempools_list(const struct smap *ovs_other_config) { + const char *mtus = smap_get(ovs_other_config, "shared-mempool-config"); char *list, *copy, *key, *value; int error = 0; @@ -5438,6 +5455,75 @@ parse_user_mempools_list(const char *mtus) free(copy); } +static int +process_vhost_flags(char *flag, const char *default_val, int size, + const struct smap *ovs_other_config, + char **new_val) +{ + const char *val; + int changed = 0; + + val = smap_get(ovs_other_config, flag); + + /* Process the vhost-sock-dir flag if it is provided, otherwise resort to + * default value. + */ + if (val && (strlen(val) <= size)) { + changed = 1; + *new_val = xstrdup(val); + VLOG_INFO("User-provided %s in use: %s", flag, *new_val); + } else { + VLOG_INFO("No %s provided - defaulting to %s", flag, default_val); + *new_val = xstrdup(default_val); + } + + return changed; +} + +static void +parse_vhost_config(const struct smap *ovs_other_config) +{ + char *sock_dir_subcomponent; + + if (process_vhost_flags("vhost-sock-dir", ovs_rundir(), + NAME_MAX, ovs_other_config, + &sock_dir_subcomponent)) { + struct stat s; + + if (!strstr(sock_dir_subcomponent, "..")) { + vhost_sock_dir = xasprintf("%s/%s", ovs_rundir(), + sock_dir_subcomponent); + + if (stat(vhost_sock_dir, &s)) { + VLOG_ERR("vhost-user sock directory '%s' does not exist.", + vhost_sock_dir); + } + } else { + vhost_sock_dir = xstrdup(ovs_rundir()); + VLOG_ERR("vhost-user sock directory request '%s/%s' has invalid" + "characters '..' - using %s instead.", + ovs_rundir(), sock_dir_subcomponent, ovs_rundir()); + } + free(sock_dir_subcomponent); + } else { + vhost_sock_dir = sock_dir_subcomponent; + } + + vhost_iommu_enabled = smap_get_bool(ovs_other_config, + "vhost-iommu-support", false); + VLOG_INFO("IOMMU support for vhost-user-client %s.", + vhost_iommu_enabled ? 
"enabled" : "disabled"); + + vhost_postcopy_enabled = smap_get_bool(ovs_other_config, + "vhost-postcopy-support", false); + if (vhost_postcopy_enabled && memory_locked()) { + VLOG_WARN("vhost-postcopy-support and mlockall are not compatible."); + vhost_postcopy_enabled = false; + } + VLOG_INFO("POSTCOPY support for vhost-user-client %s.", + vhost_postcopy_enabled ? "enabled" : "disabled"); +} + #define NETDEV_DPDK_CLASS_COMMON \ .is_pmd = true, \ .alloc = netdev_dpdk_alloc, \ @@ -5523,10 +5609,10 @@ static const struct netdev_class dpdk_vhost_client_class = { void netdev_dpdk_register(const struct smap *ovs_other_config) { - const char *mempoolcfg = smap_get(ovs_other_config, - "shared-mempool-config"); + parse_mempool_config(ovs_other_config); + parse_user_mempools_list(ovs_other_config); + parse_vhost_config(ovs_other_config); - parse_user_mempools_list(mempoolcfg); netdev_register_provider(&dpdk_class); netdev_register_provider(&dpdk_vhost_class); netdev_register_provider(&dpdk_vhost_client_class); From d240f72ad2adb9932b59b8e01f47a93f76c5c93c Mon Sep 17 00:00:00 2001 From: David Marchand Date: Thu, 25 Aug 2022 12:25:26 +0200 Subject: [PATCH 053/833] netdev-dpdk: Cleanup mempool selection code. Propagating per_port_memory value through a DPDK netdev creation gives the false impression its value is somehow contextual to the creation. On the contrary, this parameter value is set once and for all at OVS initialization time. Simplify the code and directly access the local boolean. Acked-by: Sunil Pai G Signed-off-by: David Marchand Signed-off-by: Ilya Maximets --- lib/netdev-dpdk.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 339936b6e29..72e7a32688f 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -694,11 +694,11 @@ dpdk_mp_sweep(void) OVS_REQUIRES(dpdk_mp_mutex) * calculating. 
*/ static uint32_t -dpdk_calculate_mbufs(struct netdev_dpdk *dev, int mtu, bool per_port_mp) +dpdk_calculate_mbufs(struct netdev_dpdk *dev, int mtu) { uint32_t n_mbufs; - if (!per_port_mp) { + if (!per_port_memory) { /* Shared memory are being used. * XXX: this is a really rough method of provisioning memory. * It's impossible to determine what the exact memory requirements are @@ -729,7 +729,7 @@ dpdk_calculate_mbufs(struct netdev_dpdk *dev, int mtu, bool per_port_mp) } static struct dpdk_mp * -dpdk_mp_create(struct netdev_dpdk *dev, int mtu, bool per_port_mp) +dpdk_mp_create(struct netdev_dpdk *dev, int mtu) { char mp_name[RTE_MEMPOOL_NAMESIZE]; const char *netdev_name = netdev_get_name(&dev->up); @@ -754,7 +754,7 @@ dpdk_mp_create(struct netdev_dpdk *dev, int mtu, bool per_port_mp) /* Get the size of each mbuf, based on the MTU */ mbuf_size = MTU_TO_FRAME_LEN(mtu); - n_mbufs = dpdk_calculate_mbufs(dev, mtu, per_port_mp); + n_mbufs = dpdk_calculate_mbufs(dev, mtu); do { /* Full DPDK memory pool name must be unique and cannot be @@ -840,7 +840,7 @@ dpdk_mp_create(struct netdev_dpdk *dev, int mtu, bool per_port_mp) } static struct dpdk_mp * -dpdk_mp_get(struct netdev_dpdk *dev, int mtu, bool per_port_mp) +dpdk_mp_get(struct netdev_dpdk *dev, int mtu) { struct dpdk_mp *dmp, *next; bool reuse = false; @@ -848,7 +848,7 @@ dpdk_mp_get(struct netdev_dpdk *dev, int mtu, bool per_port_mp) ovs_mutex_lock(&dpdk_mp_mutex); /* Check if shared memory is being used, if so check existing mempools * to see if reuse is possible. 
*/ - if (!per_port_mp) { + if (!per_port_memory) { /* If user has provided defined mempools, check if one is suitable * and get new buffer size.*/ mtu = dpdk_get_user_adjusted_mtu(mtu, dev->requested_mtu, @@ -867,7 +867,7 @@ dpdk_mp_get(struct netdev_dpdk *dev, int mtu, bool per_port_mp) dpdk_mp_sweep(); if (!reuse) { - dmp = dpdk_mp_create(dev, mtu, per_port_mp); + dmp = dpdk_mp_create(dev, mtu); if (dmp) { /* Shared memory will hit the reuse case above so will not * request a mempool that already exists but we need to check @@ -877,7 +877,7 @@ dpdk_mp_get(struct netdev_dpdk *dev, int mtu, bool per_port_mp) * dmp to point to the existing entry and increment the refcount * to avoid being freed at a later stage. */ - if (per_port_mp && rte_errno == EEXIST) { + if (per_port_memory && rte_errno == EEXIST) { LIST_FOR_EACH (next, list_node, &dpdk_mp_list) { if (dmp->mp == next->mp) { rte_free(dmp); @@ -922,17 +922,16 @@ netdev_dpdk_mempool_configure(struct netdev_dpdk *dev) uint32_t buf_size = dpdk_buf_size(dev->requested_mtu); struct dpdk_mp *dmp; int ret = 0; - bool per_port_mp = per_port_memory; /* With shared memory we do not need to configure a mempool if the MTU * and socket ID have not changed, the previous configuration is still * valid so return 0 */ - if (!per_port_mp && dev->mtu == dev->requested_mtu + if (!per_port_memory && dev->mtu == dev->requested_mtu && dev->socket_id == dev->requested_socket_id) { return ret; } - dmp = dpdk_mp_get(dev, FRAME_LEN_TO_MTU(buf_size), per_port_mp); + dmp = dpdk_mp_get(dev, FRAME_LEN_TO_MTU(buf_size)); if (!dmp) { VLOG_ERR("Failed to create memory pool for netdev " "%s, with MTU %d on socket %d: %s\n", From b22c4d84038c3eceab9486984e601b2f979ebe6d Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 25 Oct 2022 18:37:41 +0200 Subject: [PATCH 054/833] netdev: Assume default link speed to be 10 Gbps instead of 100 Mbps. 100 Mbps was a fair assumption 13 years ago. 
These days 10 Gbps seems like a good value in case no information is available otherwise. The change mainly affects QoS which is currently limited to 100 Mbps if the user didn't specify 'max-rate' and the card doesn't report the speed or OVS doesn't have a predefined enumeration for the speed reported by the NIC. Calculation of the path cost for STP/RSTP is also affected if OVS is unable to determine the link speed. Lower link speed adapters are typically good at reporting their speed, so chances for overshoot should be low. But newer high-speed adapters, for which there is no speed enumeration or if there are some other issues, will not suffer that much. Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- NEWS | 4 ++++ include/openvswitch/netdev.h | 2 ++ lib/netdev-linux.c | 4 ++-- lib/rstp.c | 2 +- lib/rstp.h | 2 +- lib/stp.c | 4 ++-- tests/stp.at | 14 +++++++------- tests/test-rstp.c | 7 +++++-- vswitchd/bridge.c | 4 ++-- vswitchd/vswitch.xml | 2 +- 10 files changed, 27 insertions(+), 18 deletions(-) diff --git a/NEWS b/NEWS index ff77ee404f3..3ae6882d551 100644 --- a/NEWS +++ b/NEWS @@ -23,6 +23,10 @@ Post-v3.0.0 bug and CVE fixes addressed since its release. If a user wishes to benefit from these fixes it is recommended to use DPDK 21.11.2. + - For the QoS max-rate and STP/RSTP path-cost configuration OVS now assumes + 10 Gbps link speed by default in case the actual link speed cannot be + determined. Previously it was 100 Mbps. Values can still be overridden + by specifying 'max-rate' or '[r]stp-path-cost' accordingly. v3.0.0 - 15 Aug 2022 diff --git a/include/openvswitch/netdev.h b/include/openvswitch/netdev.h index 0c10f7b487c..cf48f86915f 100644 --- a/include/openvswitch/netdev.h +++ b/include/openvswitch/netdev.h @@ -121,6 +121,8 @@ enum netdev_features { NETDEV_F_PAUSE_ASYM = 1 << 15, /* Asymmetric pause.
*/ }; +#define NETDEV_DEFAULT_BPS UINT64_C(10 * 1000 * 1000 * 1000) + int netdev_get_features(const struct netdev *, enum netdev_features *current, enum netdev_features *advertised, diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 59e8dc0ae6c..f6d7a1b9743 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -4710,7 +4710,7 @@ htb_parse_qdisc_details__(struct netdev *netdev_, netdev_linux_read_features(netdev); current = !netdev->get_features_error ? netdev->current : 0; - hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8; + hc->max_rate = netdev_features_to_bps(current, NETDEV_DEFAULT_BPS) / 8; } hc->min_rate = hc->max_rate; hc->burst = 0; @@ -5182,7 +5182,7 @@ hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details, netdev_linux_read_features(netdev); current = !netdev->get_features_error ? netdev->current : 0; - max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8; + max_rate = netdev_features_to_bps(current, NETDEV_DEFAULT_BPS) / 8; } class->min_rate = max_rate; diff --git a/lib/rstp.c b/lib/rstp.c index 7e351bf32ff..2f01966f796 100644 --- a/lib/rstp.c +++ b/lib/rstp.c @@ -784,7 +784,7 @@ rstp_convert_speed_to_cost(unsigned int speed) : speed >= 100 ? 200000 /* 100 Mb/s. */ : speed >= 10 ? 2000000 /* 10 Mb/s. */ : speed >= 1 ? 20000000 /* 1 Mb/s. */ - : RSTP_DEFAULT_PORT_PATH_COST; /* 100 Mb/s. */ + : RSTP_DEFAULT_PORT_PATH_COST; /* 10 Gb/s. */ return value; } diff --git a/lib/rstp.h b/lib/rstp.h index 39a13b58c1f..13af2019516 100644 --- a/lib/rstp.h +++ b/lib/rstp.h @@ -84,7 +84,7 @@ struct dp_packet; /* Port path cost [Table 17-3] */ #define RSTP_MIN_PORT_PATH_COST 1 #define RSTP_MAX_PORT_PATH_COST 200000000 -#define RSTP_DEFAULT_PORT_PATH_COST 200000 +#define RSTP_DEFAULT_PORT_PATH_COST 2000 /* RSTP Bridge identifier [9.2.5]. Top four most significant bits are a * priority value. 
The next most significant twelve bits are a locally diff --git a/lib/stp.c b/lib/stp.c index a869b5f390c..f37337992a3 100644 --- a/lib/stp.c +++ b/lib/stp.c @@ -313,7 +313,7 @@ stp_create(const char *name, stp_identifier bridge_id, for (p = stp->ports; p < &stp->ports[ARRAY_SIZE(stp->ports)]; p++) { p->stp = stp; p->port_id = (stp_port_no(p) + 1) | (STP_DEFAULT_PORT_PRIORITY << 8); - p->path_cost = 19; /* Recommended default for 100 Mb/s link. */ + p->path_cost = 2; /* Recommended default for 10 Gb/s link. */ stp_initialize_port(p, STP_DISABLED); } ovs_refcount_init(&stp->ref_cnt); @@ -989,7 +989,7 @@ stp_convert_speed_to_cost(unsigned int speed) : speed >= 16 ? 62 /* 16 Mb/s. */ : speed >= 10 ? 100 /* 10 Mb/s. */ : speed >= 4 ? 250 /* 4 Mb/s. */ - : 19; /* 100 Mb/s (guess). */ + : 2; /* 10 Gb/s (guess). */ ovs_mutex_unlock(&mutex); return ret; } diff --git a/tests/stp.at b/tests/stp.at index 69475843e55..a6b6465d12a 100644 --- a/tests/stp.at +++ b/tests/stp.at @@ -620,10 +620,10 @@ ovs-appctl time/stop ovs-appctl time/warp 31000 1000 AT_CHECK([ovs-appctl stp/show br0 | grep p1], [0], [dnl - p1 designated forwarding 19 128.1 + p1 designated forwarding 2 128.1 ]) AT_CHECK([ovs-appctl stp/show br0 | grep p2], [0], [dnl - p2 designated forwarding 19 128.2 + p2 designated forwarding 2 128.2 ]) # add a stp port @@ -637,10 +637,10 @@ ovs-appctl netdev-dummy/set-admin-state p3 down # We should not show the p3 because its link-state is down AT_CHECK([ovs-appctl stp/show br0 | grep p1], [0], [dnl - p1 designated forwarding 19 128.1 + p1 designated forwarding 2 128.1 ]) AT_CHECK([ovs-appctl stp/show br0 | grep p2], [0], [dnl - p2 designated forwarding 19 128.2 + p2 designated forwarding 2 128.2 ]) AT_CHECK([ovs-appctl stp/show br0 | grep p3], [1], [dnl ]) @@ -648,13 +648,13 @@ AT_CHECK([ovs-appctl stp/show br0 | grep p3], [1], [dnl ovs-appctl netdev-dummy/set-admin-state p3 up AT_CHECK([ovs-appctl stp/show br0 | grep p1], [0], [dnl - p1 designated forwarding 19 128.1 + p1 
designated forwarding 2 128.1 ]) AT_CHECK([ovs-appctl stp/show br0 | grep p2], [0], [dnl - p2 designated forwarding 19 128.2 + p2 designated forwarding 2 128.2 ]) AT_CHECK([ovs-appctl stp/show br0 | grep p3], [0], [dnl - p3 designated listening 19 128.3 + p3 designated listening 2 128.3 ]) diff --git a/tests/test-rstp.c b/tests/test-rstp.c index 01aeaf84783..9c1026ec1a8 100644 --- a/tests/test-rstp.c +++ b/tests/test-rstp.c @@ -107,6 +107,8 @@ send_bpdu(struct dp_packet *pkt, void *port_, void *b_) dp_packet_delete(pkt); } +#define RSTP_PORT_PATH_COST_100M 200000 + static struct bridge * new_bridge(struct test_case *tc, int id) { @@ -122,6 +124,7 @@ new_bridge(struct test_case *tc, int id) for (i = 1; i < MAX_PORTS; i++) { p = rstp_add_port(b->rstp); rstp_port_set_aux(p, p); + rstp_port_set_path_cost(p, RSTP_PORT_PATH_COST_100M); rstp_port_set_state(p, RSTP_DISABLED); rstp_port_set_mac_operational(p, true); } @@ -544,8 +547,8 @@ test_rstp_main(int argc, char *argv[]) } get_token(); - path_cost = match(":") ? must_get_int() : - RSTP_DEFAULT_PORT_PATH_COST; + path_cost = match(":") ? must_get_int() + : RSTP_PORT_PATH_COST_100M; if (port_no < bridge->n_ports) { /* Enable port. 
*/ reinitialize_port(p); diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index d0667f229da..bfb2adef1dd 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -1678,7 +1678,7 @@ port_configure_stp(const struct ofproto *ofproto, struct port *port, unsigned int mbps; netdev_get_features(iface->netdev, ¤t, NULL, NULL, NULL); - mbps = netdev_features_to_bps(current, 100 * 1000 * 1000) / 1000000; + mbps = netdev_features_to_bps(current, NETDEV_DEFAULT_BPS) / 1000000; port_s->path_cost = stp_convert_speed_to_cost(mbps); } @@ -1761,7 +1761,7 @@ port_configure_rstp(const struct ofproto *ofproto, struct port *port, unsigned int mbps; netdev_get_features(iface->netdev, ¤t, NULL, NULL, NULL); - mbps = netdev_features_to_bps(current, 100 * 1000 * 1000) / 1000000; + mbps = netdev_features_to_bps(current, NETDEV_DEFAULT_BPS) / 1000000; port_s->path_cost = rstp_convert_speed_to_cost(mbps); } diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 928821a8239..f9bdb2d92be 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -4776,7 +4776,7 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \ Maximum rate shared by all queued traffic, in bit/s. Optional. If not specified, for physical interfaces, the default is the link rate. For other interfaces or if the link rate cannot be determined, the default - is currently 100 Mbps. + is currently 10 Gbps. From 59e8cb8a053d50f49629be8b6fd614562d066404 Mon Sep 17 00:00:00 2001 From: Timothy Redaelli Date: Mon, 14 Nov 2022 20:41:53 +0100 Subject: [PATCH 055/833] rhel: Move conf.db to /var/lib/openvswitch, using symlinks. conf.db is by default at /etc/openvswitch, but it should be at /var/lib/openvswitch like on Debian or like ovnnb_db.db and ovnsb_db.db. If conf.db already exists in /etc/openvswitch then it's moved to /var/lib/openvswitch. Symlinks are created for conf.db and .conf.db.~lock~ into /etc/openvswitch for backward compatibility. 
Reported-at: https://bugzilla.redhat.com/1830857 Reported-by: Yedidyah Bar David Signed-off-by: Timothy Redaelli Signed-off-by: Ilya Maximets --- rhel/openvswitch-fedora.spec.in | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/rhel/openvswitch-fedora.spec.in b/rhel/openvswitch-fedora.spec.in index 67268cb7833..c21592e47cb 100644 --- a/rhel/openvswitch-fedora.spec.in +++ b/rhel/openvswitch-fedora.spec.in @@ -238,8 +238,6 @@ rm -rf $RPM_BUILD_ROOT/%{_datadir}/openvswitch/python/ install -d -m 0755 $RPM_BUILD_ROOT/%{_sharedstatedir}/openvswitch -touch $RPM_BUILD_ROOT%{_sysconfdir}/openvswitch/conf.db -touch $RPM_BUILD_ROOT%{_sysconfdir}/openvswitch/.conf.db.~lock~ touch $RPM_BUILD_ROOT%{_sysconfdir}/openvswitch/system-id.conf install -p -m 644 -D selinux/openvswitch-custom.pp \ @@ -328,6 +326,27 @@ if [ $1 -eq 1 ]; then fi %endif +# Ensure that /etc/openvswitch/conf.db links to /var/lib/openvswitch, +# moving an existing file if there is one. +# +# Ditto for .conf.db.~lock~. +for base in conf.db .conf.db.~lock~; do + new=/var/lib/openvswitch/$base + old=/etc/openvswitch/$base + if test -f $old && test ! -e $new; then + mv $old $new + fi + if test ! -e $old && test ! -h $old; then + ln -s $new $old + fi + touch $new +%if %{with dpdk} + chown openvswitch:hugetlbfs $new +%else + chown openvswitch:openvswitch $new +%endif +done + %if 0%{?systemd_post:1} # This may not enable openvswitch service or do daemon-reload. 
%systemd_post %{name}.service @@ -413,8 +432,8 @@ fi %endif %dir %{_sysconfdir}/openvswitch %{_sysconfdir}/openvswitch/default.conf -%config %ghost %{_sysconfdir}/openvswitch/conf.db -%ghost %{_sysconfdir}/openvswitch/.conf.db.~lock~ +%config %ghost %{_sharedstatedir}/openvswitch/conf.db +%ghost %{_sharedstatedir}/openvswitch/.conf.db.~lock~ %config %ghost %{_sysconfdir}/openvswitch/system-id.conf %config(noreplace) %{_sysconfdir}/sysconfig/openvswitch %defattr(-,root,root) From cd475f976512bd1ce3abaf325c835780c37d6386 Mon Sep 17 00:00:00 2001 From: Timothy Redaelli Date: Wed, 12 May 2021 19:44:33 +0200 Subject: [PATCH 056/833] ovs-dpctl-top: Fix ovs-dpctl-top via pipe. Currently it's not possible to use ovs-dpctl-top via pipe (eg: ovs-dpctl dump-flows | ovs-dpctl-top --script --verbose) since Python3 doesn't allow to open a file (stdin in our case) in binary mode without buffering enabled. This commit changes the behaviour in order to directly pass stdin to flows_read instead of re-opening it without buffering. Signed-off-by: Timothy Redaelli Signed-off-by: Ilya Maximets --- utilities/ovs-dpctl-top.in | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/utilities/ovs-dpctl-top.in b/utilities/ovs-dpctl-top.in index fbe6e4f560a..2c1766eff5e 100755 --- a/utilities/ovs-dpctl-top.in +++ b/utilities/ovs-dpctl-top.in @@ -1236,11 +1236,7 @@ def flows_script(args): if (args.flowFiles is None): logging.info("reading flows from stdin") - ihdl = os.fdopen(sys.stdin.fileno(), 'r', 0) - try: - flow_db = flows_read(ihdl, flow_db) - finally: - ihdl.close() + flow_db = flows_read(sys.stdin, flow_db) else: for flowFile in args.flowFiles: logging.info("reading flows from %s", flowFile) From 954ae38a12f0c0d7bab1334c9ba353da94de887c Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 24 Nov 2022 15:15:15 +0100 Subject: [PATCH 057/833] odp-util: Fix reporting unknown keys as keys with bad length. 
check_attr_len() currently reports all unknown keys as keys with bad length. For example, IPv6 extension headers are printed out like this in flow dumps: eth_type(0x86dd),ipv6(...) (bad key length 2, expected -1)(00 00/(bad mask length 2, expected -1)(00 00), icmpv6(type=0/0,code=0/0) However, since the key is unknown, the length check on it makes no sense and should be ignored. This will allow the unknown key to be caught later by the format_unknown_key() function and printed in a more user-friendly way: eth_type(0x86dd),ipv6(...),key32(00 00/00 00),icmpv6(type=0/0,code=0/0) '32' here is the actual index of the key attribute, so we know that it is unknown attribute #32 with the value/mask pair printed out inside the parenthesis. Acked-by: Aaron Conole Signed-off-by: Ilya Maximets --- lib/odp-util.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/lib/odp-util.c b/lib/odp-util.c index 72e076e1c5b..5fc312f8c00 100644 --- a/lib/odp-util.c +++ b/lib/odp-util.c @@ -3594,9 +3594,16 @@ static bool check_attr_len(struct ds *ds, const struct nlattr *a, const struct nlattr *ma, const struct attr_len_tbl tbl[], int max_type, bool need_key) { + uint16_t type = nl_attr_type(a); int expected_len; - expected_len = odp_key_attr_len(tbl, max_type, nl_attr_type(a)); + if (type > max_type) { + /* Unknown attribute, can't check the length. 
*/ + return true; + } + + expected_len = odp_key_attr_len(tbl, max_type, type); + if (expected_len != ATTR_LEN_VARIABLE && expected_len != ATTR_LEN_NESTED) { @@ -3605,7 +3612,7 @@ check_attr_len(struct ds *ds, const struct nlattr *a, const struct nlattr *ma, if (bad_key_len || bad_mask_len) { if (need_key) { - ds_put_format(ds, "key%u", nl_attr_type(a)); + ds_put_format(ds, "key%u", type); } if (bad_key_len) { ds_put_format(ds, "(bad key length %"PRIuSIZE", expected %d)(", From 55b9507e6824b935ffa0205fc7c7bebfe4e54279 Mon Sep 17 00:00:00 2001 From: Numan Siddique Date: Sun, 27 Nov 2022 22:56:13 -0500 Subject: [PATCH 058/833] ovsdb-idl: Add the support to specify the uuid for row insert. ovsdb-server allows the OVSDB clients to specify the uuid for the row inserts [1]. Both the C IDL client library and Python IDL are missing this feature. This patch adds this support. In C IDL, for each schema table, a new function is generated - insert_persistent_uuid(txn, uuid) which can be used the clients to persist the uuid. ovs-vsctl and other derivatives of ctl now supports the same in the generic 'create' command with the option "--id=". In Python IDL, the uuid to persist can be specified in the Transaction.insert() function. 
[1] - a529e3cd1f("ovsdb-server: Allow OVSDB clients to specify the UUID for inserted rows.") Acked-by: Adrian Moreno Acked-by: Han Zhou Acked-by: Terry Wilson Signed-off-by: Numan Siddique Signed-off-by: Ilya Maximets --- NEWS | 3 ++ lib/db-ctl-base.c | 38 ++++++++++++------ lib/db-ctl-base.man | 5 ++- lib/db-ctl-base.xml | 6 ++- lib/ovsdb-idl-provider.h | 1 + lib/ovsdb-idl.c | 85 +++++++++++++++++++++++++++++----------- lib/ovsdb-idl.h | 3 ++ ovsdb/ovsdb-idlc.in | 15 +++++++ python/ovs/db/idl.py | 26 ++++++++---- tests/ovs-vsctl.at | 25 ++++++++++++ tests/ovsdb-idl.at | 58 +++++++++++++++++++++++++++ tests/test-ovsdb.c | 28 +++++++++++-- tests/test-ovsdb.py | 20 +++++++++- 13 files changed, 263 insertions(+), 50 deletions(-) diff --git a/NEWS b/NEWS index 3ae6882d551..f6caf1ca7f0 100644 --- a/NEWS +++ b/NEWS @@ -3,6 +3,9 @@ Post-v3.0.0 - ovs-appctl: * "ovs-appctl ofproto/trace" command can now display port names with the "--names" option. + - OVSDB-IDL: + * Add the support to specify the persistent uuid for row insert in both + C and Python IDLs. - Windows: * Conntrack IPv6 fragment support.
- DPDK: diff --git a/lib/db-ctl-base.c b/lib/db-ctl-base.c index bc85e992173..856832a04d2 100644 --- a/lib/db-ctl-base.c +++ b/lib/db-ctl-base.c @@ -1731,29 +1731,43 @@ cmd_create(struct ctl_context *ctx) const struct ovsdb_idl_table_class *table; const struct ovsdb_idl_row *row; const struct uuid *uuid = NULL; + bool persist_uuid = false; + struct uuid uuid_; int i; ctx->error = get_table(table_name, &table); if (ctx->error) { return; } + if (id) { - struct ovsdb_symbol *symbol = NULL; + if (uuid_from_string(&uuid_, id)) { + uuid = &uuid_; + persist_uuid = true; + } else { + struct ovsdb_symbol *symbol = NULL; - ctx->error = create_symbol(ctx->symtab, id, &symbol, NULL); - if (ctx->error) { - return; - } - if (table->is_root) { - /* This table is in the root set, meaning that rows created in it - * won't disappear even if they are unreferenced, so disable - * warnings about that by pretending that there is a reference. */ - symbol->strong_ref = true; + ctx->error = create_symbol(ctx->symtab, id, &symbol, NULL); + if (ctx->error) { + return; + } + if (table->is_root) { + /* This table is in the root set, meaning that rows created in + * it won't disappear even if they are unreferenced, so disable + * warnings about that by pretending that there is a + * reference. */ + symbol->strong_ref = true; + } + uuid = &symbol->uuid; } - uuid = &symbol->uuid; } - row = ovsdb_idl_txn_insert(ctx->txn, table, uuid); + if (persist_uuid) { + row = ovsdb_idl_txn_insert_persist_uuid(ctx->txn, table, uuid); + } else { + row = ovsdb_idl_txn_insert(ctx->txn, table, uuid); + } + for (i = 2; i < ctx->argc; i++) { ctx->error = set_column(table, row, ctx->argv[i], ctx->symtab); if (ctx->error) { diff --git a/lib/db-ctl-base.man b/lib/db-ctl-base.man index a529d8b4d3f..c8111c9efbe 100644 --- a/lib/db-ctl-base.man +++ b/lib/db-ctl-base.man @@ -203,7 +203,7 @@ Without \fB\-\-if-exists\fR, it is an error if \fIrecord\fR does not exist. 
With \fB\-\-if-exists\fR, this command does nothing if \fIrecord\fR does not exist. . -.IP "[\fB\-\-id=@\fIname\fR] \fBcreate\fR \fItable column\fR[\fB:\fIkey\fR]\fB=\fIvalue\fR..." +.IP "[\fB\-\-id=(@\fIname\fR | \fIuuid\fR)] \fBcreate\fR \fItable column\fR[\fB:\fIkey\fR]\fB=\fIvalue\fR..." Creates a new record in \fItable\fR and sets the initial values of each \fIcolumn\fR. Columns not explicitly set will receive their default values. Outputs the UUID of the new row. @@ -212,6 +212,9 @@ If \fB@\fIname\fR is specified, then the UUID for the new row may be referred to by that name elsewhere in the same \fB\*(PN\fR invocation in contexts where a UUID is expected. Such references may precede or follow the \fBcreate\fR command. +.IP +If a valid \fIuuid\fR is specified, then it is used as the UUID +of the new row. . .RS .IP "Caution (ovs-vsctl as example)" diff --git a/lib/db-ctl-base.xml b/lib/db-ctl-base.xml index f6efe98eaf0..27c999fe71f 100644 --- a/lib/db-ctl-base.xml +++ b/lib/db-ctl-base.xml @@ -310,7 +310,7 @@

-
[--id=@name] create table column[:key]=value...
+
[--id=(@name|uuid)] create table column[:key]=value...

Creates a new record in table and sets the initial values of @@ -323,6 +323,10 @@ invocation in contexts where a UUID is expected. Such references may precede or follow the create command.

+

+ If a valid uuid is specified, then it is used as the + UUID of the new row. +

Caution (ovs-vsctl as example)
diff --git a/lib/ovsdb-idl-provider.h b/lib/ovsdb-idl-provider.h index 8797686f900..8d2b7d6b914 100644 --- a/lib/ovsdb-idl-provider.h +++ b/lib/ovsdb-idl-provider.h @@ -74,6 +74,7 @@ struct ovsdb_idl_row { struct ovs_list dst_arcs; /* Backward arcs (ovsdb_idl_arc.dst_node). */ struct ovsdb_idl_table *table; /* Containing table. */ struct ovsdb_datum *old_datum; /* Committed data (null if orphaned). */ + bool persist_uuid; /* Persist 'uuid' during insert txn if set. */ bool parsed; /* Whether the row is parsed. */ struct ovs_list reparse_node; /* Rows that needs to be re-parsed due to * insertion of a referenced row. */ diff --git a/lib/ovsdb-idl.c b/lib/ovsdb-idl.c index 99b58422eca..dbdfe45d87e 100644 --- a/lib/ovsdb-idl.c +++ b/lib/ovsdb-idl.c @@ -2855,11 +2855,14 @@ substitute_uuids(struct json *json, const struct ovsdb_idl_txn *txn) row = ovsdb_idl_txn_get_row(txn, &uuid); if (row && !row->old_datum && row->new_datum) { - json_destroy(json); - - return json_array_create_2( - json_string_create("named-uuid"), - json_string_create_nocopy(ovsdb_data_row_name(&uuid))); + if (row->persist_uuid) { + return json; + } else { + json_destroy(json); + return json_array_create_2( + json_string_create("named-uuid"), + json_string_create_nocopy(ovsdb_data_row_name(&uuid))); + } } } @@ -3284,9 +3287,19 @@ ovsdb_idl_txn_commit(struct ovsdb_idl_txn *txn) any_updates = true; - json_object_put(op, "uuid-name", - json_string_create_nocopy( - ovsdb_data_row_name(&row->uuid))); + char *uuid_json; + struct json *value; + if (row->persist_uuid) { + uuid_json = "uuid"; + value = json_string_create_nocopy( + xasprintf(UUID_FMT, UUID_ARGS(&row->uuid))); + } else { + uuid_json = "uuid-name"; + value = json_string_create_nocopy( + ovsdb_data_row_name(&row->uuid)); + } + + json_object_put(op, uuid_json, value); insert = xmalloc(sizeof *insert); insert->dummy = row->uuid; @@ -3770,6 +3783,31 @@ ovsdb_idl_txn_delete(const struct ovsdb_idl_row *row_) row->new_datum = NULL; } +static const 
struct ovsdb_idl_row * +ovsdb_idl_txn_insert__(struct ovsdb_idl_txn *txn, + const struct ovsdb_idl_table_class *class, + const struct uuid *uuid, + bool persist_uuid) +{ + struct ovsdb_idl_row *row = ovsdb_idl_row_create__(class); + + ovs_assert(uuid || !persist_uuid); + if (uuid) { + ovs_assert(!ovsdb_idl_txn_get_row(txn, uuid)); + row->uuid = *uuid; + } else { + uuid_generate(&row->uuid); + } + row->persist_uuid = persist_uuid; + row->table = ovsdb_idl_table_from_class(txn->idl, class); + row->new_datum = xmalloc(class->n_columns * sizeof *row->new_datum); + hmap_insert(&row->table->rows, &row->hmap_node, uuid_hash(&row->uuid)); + hmap_insert(&txn->txn_rows, &row->txn_node, uuid_hash(&row->uuid)); + ovsdb_idl_add_to_indexes(row); + + return row; +} + /* Inserts and returns a new row in the table with the specified 'class' in the * database with open transaction 'txn'. * @@ -3787,22 +3825,23 @@ ovsdb_idl_txn_insert(struct ovsdb_idl_txn *txn, const struct ovsdb_idl_table_class *class, const struct uuid *uuid) { - struct ovsdb_idl_row *row = ovsdb_idl_row_create__(class); - - if (uuid) { - ovs_assert(!ovsdb_idl_txn_get_row(txn, uuid)); - row->uuid = *uuid; - } else { - uuid_generate(&row->uuid); - } - - row->table = ovsdb_idl_table_from_class(txn->idl, class); - row->new_datum = xmalloc(class->n_columns * sizeof *row->new_datum); - hmap_insert(&row->table->rows, &row->hmap_node, uuid_hash(&row->uuid)); - hmap_insert(&txn->txn_rows, &row->txn_node, uuid_hash(&row->uuid)); - ovsdb_idl_add_to_indexes(row); + return ovsdb_idl_txn_insert__(txn, class, uuid, false); +} - return row; +/* Inserts and returns a new row in the table with the specified 'class' in the + * database with open transaction 'txn'. + * + * The new row is assigned the specified UUID (which cannot be null). + * + * Usually this function is used indirectly through one of the + * "insert_persist_uuid" functions generated by ovsdb-idlc. 
*/ +const struct ovsdb_idl_row * +ovsdb_idl_txn_insert_persist_uuid(struct ovsdb_idl_txn *txn, + const struct ovsdb_idl_table_class *class, + const struct uuid *uuid) +{ + ovs_assert(uuid); + return ovsdb_idl_txn_insert__(txn, class, uuid, true); } static void diff --git a/lib/ovsdb-idl.h b/lib/ovsdb-idl.h index fbd9f671a20..9a3e19f2055 100644 --- a/lib/ovsdb-idl.h +++ b/lib/ovsdb-idl.h @@ -375,6 +375,9 @@ void ovsdb_idl_txn_delete(const struct ovsdb_idl_row *); const struct ovsdb_idl_row *ovsdb_idl_txn_insert( struct ovsdb_idl_txn *, const struct ovsdb_idl_table_class *, const struct uuid *); +const struct ovsdb_idl_row *ovsdb_idl_txn_insert_persist_uuid( + struct ovsdb_idl_txn *txn, const struct ovsdb_idl_table_class *class, + const struct uuid *uuid); struct ovsdb_idl *ovsdb_idl_txn_get_idl (struct ovsdb_idl_txn *); void ovsdb_idl_get_initial_snapshot(struct ovsdb_idl *); diff --git a/ovsdb/ovsdb-idlc.in b/ovsdb/ovsdb-idlc.in index 5a97a8ea3e1..9a54f06a191 100755 --- a/ovsdb/ovsdb-idlc.in +++ b/ovsdb/ovsdb-idlc.in @@ -362,6 +362,8 @@ struct %(s)s *%(s)s_cursor_data(struct ovsdb_idl_cursor *); void %(s)s_init(struct %(s)s *); void %(s)s_delete(const struct %(s)s *); struct %(s)s *%(s)s_insert(struct ovsdb_idl_txn *); +struct %(s)s *%(s)s_insert_persist_uuid( + struct ovsdb_idl_txn *txn, const struct uuid *uuid); /* Returns true if the tracked column referenced by 'enum %(s)s_column_id' of * the row referenced by 'struct %(s)s *' was updated since the last change @@ -809,6 +811,19 @@ struct %(s)s * return %(s)s_cast(ovsdb_idl_txn_insert(txn, &%(p)stable_%(tl)s, NULL)); } +/* Inserts and returns a new row in the table "%(t)s" in the database + * with open transaction 'txn'. + * + * The new row is assigned the UUID specified in the 'uuid' parameter + * (which cannot be null). ovsdb-server will try to assign the same + * UUID when 'txn' is committed. 
*/ +struct %(s)s * +%(s)s_insert_persist_uuid(struct ovsdb_idl_txn *txn, const struct uuid *uuid) +{ + return %(s)s_cast(ovsdb_idl_txn_insert_persist_uuid( + txn, &%(p)stable_%(tl)s, uuid)); +} + bool %(s)s_is_updated(const struct %(s)s *row, enum %(s)s_column_id column) { diff --git a/python/ovs/db/idl.py b/python/ovs/db/idl.py index 8e31e02d791..fe66402cff4 100644 --- a/python/ovs/db/idl.py +++ b/python/ovs/db/idl.py @@ -1223,7 +1223,7 @@ class Row(object): d["a"] = "b" row.mycolumn = d """ - def __init__(self, idl, table, uuid, data): + def __init__(self, idl, table, uuid, data, persist_uuid=False): # All of the explicit references to self.__dict__ below are required # to set real attributes with invoking self.__getattr__(). self.__dict__["uuid"] = uuid @@ -1278,6 +1278,10 @@ def __init__(self, idl, table, uuid, data): # in the dictionary are all None. self.__dict__["_prereqs"] = {} + # Indicates if the specified 'uuid' should be used as the row uuid + # or let the server generate it. + self.__dict__["_persist_uuid"] = persist_uuid + def __lt__(self, other): if not isinstance(other, Row): return NotImplemented @@ -1816,7 +1820,11 @@ def commit(self): op = {"table": row._table.name} if row._data is None: op["op"] = "insert" - op["uuid-name"] = _uuid_name_from_uuid(row.uuid) + if row._persist_uuid: + op["uuid"] = row.uuid + else: + op["uuid-name"] = _uuid_name_from_uuid(row.uuid) + any_updates = True op_index = len(operations) - 1 @@ -2056,20 +2064,22 @@ def _write(self, row, column, datum): row._mutations['_removes'].pop(column.name, None) row._changes[column.name] = datum.copy() - def insert(self, table, new_uuid=None): + def insert(self, table, new_uuid=None, persist_uuid=False): """Inserts and returns a new row in 'table', which must be one of the ovs.db.schema.TableSchema objects in the Idl's 'tables' dict. The new row is assigned a provisional UUID. 
If 'uuid' is None then one is randomly generated; otherwise 'uuid' should specify a randomly - generated uuid.UUID not otherwise in use. ovsdb-server will assign a - different UUID when 'txn' is committed, but the IDL will replace any - uses of the provisional UUID in the data to be to be committed by the - UUID assigned by ovsdb-server.""" + generated uuid.UUID not otherwise in use. If 'persist_uuid' is true + and 'new_uuid' is specified, IDL requests the ovsdb-server to assign + the same UUID, otherwise ovsdb-server will assign a different UUID when + 'txn' is committed and the IDL will replace any uses of the provisional + UUID in the data to be committed by the UUID assigned by + ovsdb-server.""" assert self._status == Transaction.UNCOMMITTED if new_uuid is None: new_uuid = uuid.uuid4() - row = Row(self.idl, table, new_uuid, None) + row = Row(self.idl, table, new_uuid, None, persist_uuid=persist_uuid) table.rows[row.uuid] = row self._txn_rows[row.uuid] = row return row diff --git a/tests/ovs-vsctl.at b/tests/ovs-vsctl.at index d6cd2c0849a..abf4fb9cf4e 100644 --- a/tests/ovs-vsctl.at +++ b/tests/ovs-vsctl.at @@ -1710,3 +1710,28 @@ ingress_policing_kpkts_rate: 100 ]) OVS_VSCTL_CLEANUP AT_CLEANUP + +AT_SETUP([ovs-vsctl create bridge with uuid]) +AT_KEYWORDS([create bridge with uuid]) +OVS_VSCTL_SETUP + +AT_CHECK([ovs-vsctl --no-wait --id=c5cc12f8-eaa1-43a7-8a73-bccd18df1111 create bridge \ +name=tst0 -- add open . bridges c5cc12f8-eaa1-43a7-8a73-bccd18df1111], [0],[dnl +c5cc12f8-eaa1-43a7-8a73-bccd18df1111 +]) + +AT_CHECK([ovs-vsctl --no-wait --id=c5cc12f8-eaa1-43a7-8a73-bccd18df1111 create bridge \ +name=tst1 -- add open . bridges c5cc12f8-eaa1-43a7-8a73-bccd18df1111], [1], [ignore], [ignore]) + +AT_CHECK([ovs-vsctl --no-wait --bare --columns _uuid,name list bridge], [0], [dnl +c5cc12f8-eaa1-43a7-8a73-bccd18df1111 +tst0 +]) + +ovs-vsctl --no-wait --id=@a create bridge \ +name=tst1 -- add open . 
bridges @a + +AT_CHECK([ovs-vsctl --no-wait --bare --columns _uuid,name list bridge tst1], [0], [ignore]) + +OVS_VSCTL_CLEANUP +AT_CLEANUP diff --git a/tests/ovsdb-idl.at b/tests/ovsdb-idl.at index 8e75d00d7cc..c2970984bae 100644 --- a/tests/ovsdb-idl.at +++ b/tests/ovsdb-idl.at @@ -2555,3 +2555,61 @@ OVSDB_CHECK_IDL_TRACK([track, insert and delete, refs to link2], 005: table link2: i=1 l1= uuid=<1> 006: done ]]) + +m4_define([OVSDB_CHECK_IDL_PERS_UUID_INSERT_C], + [AT_SETUP([$1 - C]) + AT_KEYWORDS([idl persistent uuid insert]) + AT_CHECK([ovsdb_start_idltest "" "$abs_srcdir/idltest.ovsschema"]) + AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 idl unix:socket $2], + [0], [stdout], [stderr]) + AT_CHECK([sort stdout], + [0], [$3]) + AT_CHECK([grep $4 stderr], [0], [ignore]) + OVSDB_SERVER_SHUTDOWN + AT_CLEANUP]) + +m4_define([OVSDB_CHECK_IDL_PERS_UUID_INSERT_PY], + [AT_SETUP([$1 - Python3]) + AT_KEYWORDS([idl persistent uuid insert]) + AT_CHECK([ovsdb_start_idltest "" "$abs_srcdir/idltest.ovsschema"]) + AT_CHECK([$PYTHON3 $srcdir/test-ovsdb.py -t10 idl $srcdir/idltest.ovsschema unix:socket $2], + [0], [stdout], [stderr]) + AT_CHECK([sort stdout], + [0], [$3]) + AT_CHECK([grep $4 stderr], [0], [ignore]) + OVSDB_SERVER_SHUTDOWN + AT_CLEANUP]) + + +m4_define([OVSDB_CHECK_IDL_PERS_UUID_INSERT], + [OVSDB_CHECK_IDL_PERS_UUID_INSERT_C($@) + OVSDB_CHECK_IDL_PERS_UUID_INSERT_PY($@)]) + +OVSDB_CHECK_IDL_PERS_UUID_INSERT([simple idl, persistent uuid insert], + [['insert_uuid c5cc12f8-eaa1-43a7-8a73-bccd18df2222 2, insert_uuid c5cc12f8-eaa1-43a7-8a73-bccd18df3333 3' \ + 'insert_uuid c5cc12f8-eaa1-43a7-8a73-bccd18df4444 4, insert_uuid c5cc12f8-eaa1-43a7-8a73-bccd18df2222 5' \ + 'insert_uuid c5cc12f8-eaa1-43a7-8a73-bccd18df4444 4' \ + 'delete 2' \ + 'insert_uuid c5cc12f8-eaa1-43a7-8a73-bccd18df2222 5' + ]], + [[000: empty +001: commit, status=success +002: table simple: i=2 r=0 b=false s= u=00000000-0000-0000-0000-000000000000 ia=[] ra=[] ba=[] sa=[] 
ua=[] uuid=c5cc12f8-eaa1-43a7-8a73-bccd18df2222 +002: table simple: i=3 r=0 b=false s= u=00000000-0000-0000-0000-000000000000 ia=[] ra=[] ba=[] sa=[] ua=[] uuid=c5cc12f8-eaa1-43a7-8a73-bccd18df3333 +003: commit, status=error +004: table simple: i=2 r=0 b=false s= u=00000000-0000-0000-0000-000000000000 ia=[] ra=[] ba=[] sa=[] ua=[] uuid=c5cc12f8-eaa1-43a7-8a73-bccd18df2222 +004: table simple: i=3 r=0 b=false s= u=00000000-0000-0000-0000-000000000000 ia=[] ra=[] ba=[] sa=[] ua=[] uuid=c5cc12f8-eaa1-43a7-8a73-bccd18df3333 +005: commit, status=success +006: table simple: i=2 r=0 b=false s= u=00000000-0000-0000-0000-000000000000 ia=[] ra=[] ba=[] sa=[] ua=[] uuid=c5cc12f8-eaa1-43a7-8a73-bccd18df2222 +006: table simple: i=3 r=0 b=false s= u=00000000-0000-0000-0000-000000000000 ia=[] ra=[] ba=[] sa=[] ua=[] uuid=c5cc12f8-eaa1-43a7-8a73-bccd18df3333 +006: table simple: i=4 r=0 b=false s= u=00000000-0000-0000-0000-000000000000 ia=[] ra=[] ba=[] sa=[] ua=[] uuid=c5cc12f8-eaa1-43a7-8a73-bccd18df4444 +007: commit, status=success +008: table simple: i=3 r=0 b=false s= u=00000000-0000-0000-0000-000000000000 ia=[] ra=[] ba=[] sa=[] ua=[] uuid=c5cc12f8-eaa1-43a7-8a73-bccd18df3333 +008: table simple: i=4 r=0 b=false s= u=00000000-0000-0000-0000-000000000000 ia=[] ra=[] ba=[] sa=[] ua=[] uuid=c5cc12f8-eaa1-43a7-8a73-bccd18df4444 +009: commit, status=success +010: table simple: i=3 r=0 b=false s= u=00000000-0000-0000-0000-000000000000 ia=[] ra=[] ba=[] sa=[] ua=[] uuid=c5cc12f8-eaa1-43a7-8a73-bccd18df3333 +010: table simple: i=4 r=0 b=false s= u=00000000-0000-0000-0000-000000000000 ia=[] ra=[] ba=[] sa=[] ua=[] uuid=c5cc12f8-eaa1-43a7-8a73-bccd18df4444 +010: table simple: i=5 r=0 b=false s= u=00000000-0000-0000-0000-000000000000 ia=[] ra=[] ba=[] sa=[] ua=[] uuid=c5cc12f8-eaa1-43a7-8a73-bccd18df2222 +011: done +]], + [['This UUID would duplicate a UUID already present within the table or deleted within the same transaction']]) diff --git a/tests/test-ovsdb.c b/tests/test-ovsdb.c 
index 5f7110f415f..84fe232765a 100644 --- a/tests/test-ovsdb.c +++ b/tests/test-ovsdb.c @@ -2400,7 +2400,7 @@ idltest_find_simple(struct ovsdb_idl *idl, int i) return NULL; } -static void +static bool idl_set(struct ovsdb_idl *idl, char *commands, int step) { char *cmd, *save_ptr1 = NULL; @@ -2458,6 +2458,19 @@ idl_set(struct ovsdb_idl *idl, char *commands, int step) s = idltest_simple_insert(txn); idltest_simple_set_i(s, atoi(arg1)); + } else if (!strcmp(name, "insert_uuid")) { + struct idltest_simple *s; + + if (!arg1 || !arg2) { + ovs_fatal(0, "\"insert\" command requires 2 arguments"); + } + + struct uuid s_uuid; + if (!uuid_from_string(&s_uuid, arg1)) { + ovs_fatal(0, "\"insert_uuid\" command requires valid uuid"); + } + s = idltest_simple_insert_persist_uuid(txn, &s_uuid); + idltest_simple_set_i(s, atoi(arg2)); } else if (!strcmp(name, "delete")) { const struct idltest_simple *s; @@ -2522,7 +2535,7 @@ idl_set(struct ovsdb_idl *idl, char *commands, int step) print_and_log("%03d: destroy", step); ovsdb_idl_txn_destroy(txn); ovsdb_idl_check_consistency(idl); - return; + return true; } else { ovs_fatal(0, "unknown command %s", name); } @@ -2543,6 +2556,8 @@ idl_set(struct ovsdb_idl *idl, char *commands, int step) ovsdb_idl_txn_destroy(txn); ovsdb_idl_check_consistency(idl); + + return (status != TXN_ERROR); } static const struct ovsdb_idl_table_class * @@ -2777,7 +2792,14 @@ do_idl(struct ovs_cmdl_context *ctx) update_conditions(idl, arg + strlen(cond_s)); print_and_log("%03d: change conditions", step++); } else if (arg[0] != '[') { - idl_set(idl, arg, step++); + if (!idl_set(idl, arg, step++)) { + /* If idl_set() returns false, then no transaction + * was sent to the server and most likely 'seqno' + * would remain the same. And the above 'Wait for update' + * for loop poll_block() would never return. + * So set seqno to 0. 
*/ + seqno = 0; + } } else { struct json *json = parse_json(arg); substitute_uuids(json, symtab); diff --git a/tests/test-ovsdb.py b/tests/test-ovsdb.py index 402cacbe9d7..cca1818ea3a 100644 --- a/tests/test-ovsdb.py +++ b/tests/test-ovsdb.py @@ -429,6 +429,14 @@ def notify(event, row, updates=None): s = txn.insert(idl.tables["simple"]) s.i = int(args[0]) + elif name == "insert_uuid": + if len(args) != 2: + sys.stderr.write('"set" command requires 2 argument\n') + sys.exit(1) + + s = txn.insert(idl.tables["simple"], new_uuid=args[0], + persist_uuid=True) + s.i = int(args[1]) elif name == "delete": if len(args) != 1: sys.stderr.write('"delete" command requires 1 argument\n') @@ -491,7 +499,7 @@ def notify(event, row, updates=None): print("%03d: destroy" % step) sys.stdout.flush() txn.abort() - return + return True elif name == "linktest": l1_0 = txn.insert(idl.tables["link1"]) l1_0.i = 1 @@ -615,6 +623,8 @@ def notify(event, row, updates=None): sys.stdout.write("\n") sys.stdout.flush() + return status != ovs.db.idl.Transaction.ERROR + def update_condition(idl, commands): commands = commands[len("condition "):].split(";") @@ -748,7 +758,13 @@ def mock_notify(event, row, updates=None): sys.stdout.flush() step += 1 elif not command.startswith("["): - idl_set(idl, command, step) + if not idl_set(idl, command, step): + # If idl_set() returns false, then no transaction + # was sent to the server and most likely seqno + # would remain the same. And the above 'Wait for update' + # for loop poller.block() would never return. + # So set seqno to 0. + seqno = 0 step += 1 else: json = ovs.json.from_string(command) From a77c7796f23a76190b61e2109a009df980253b0f Mon Sep 17 00:00:00 2001 From: Ian Stokes Date: Mon, 5 Dec 2022 21:31:10 +0000 Subject: [PATCH 059/833] dpdk: Update to use v22.11.1. This commit adds support for DPDK v22.11.1. It includes the following changes. 1. ci: Reduce DPDK compilation time. 2. system-dpdk: Update vhost tests to be compatible with DPDK 22.07.
http://patchwork.ozlabs.org/project/openvswitch/list/?series=316528 3. system-dpdk: Update vhost tests to be compatible with DPDK 22.07. http://patchwork.ozlabs.org/project/openvswitch/list/?series=311332 4. netdev-dpdk: Report device bus specific information. 5. netdev-dpdk: Drop reference to Rx header split. http://patchwork.ozlabs.org/project/openvswitch/list/?series=321808 In addition documentation was also updated in this commit for use with DPDK v22.11.1. The Debian shared DPDK compilation test is removed as part of this patch due to a packaging requirement. Once DPDK v22.11.1 is available in Debian repositories it should be re-enabled in OVS. For credit all authors of the original commits to 'dpdk-latest' with the above changes have been added as co-authors for this commit Signed-off-by: David Marchand Co-authored-by: David Marchand Signed-off-by: Sunil Pai G Co-authored-by: Sunil Pai G Tested-by: Michael Phelan Tested-by: Emma Finn Signed-off-by: Ian Stokes --- .ci/linux-build.sh | 7 ++- .github/workflows/build-and-test.yml | 1 - Documentation/faq/releases.rst | 2 +- Documentation/intro/install/dpdk.rst | 16 ++--- Documentation/topics/dpdk/phy.rst | 8 +-- Documentation/topics/dpdk/vdev.rst | 2 +- Documentation/topics/dpdk/vhost-user.rst | 2 +- Documentation/topics/testing.rst | 2 +- Documentation/topics/userspace-tso.rst | 2 +- NEWS | 18 +----- debian/control.in | 2 +- lib/netdev-dpdk.c | 24 +++----- rhel/openvswitch-fedora.spec.in | 2 +- tests/system-dpdk.at | 78 ++++++++++++------------ 14 files changed, 73 insertions(+), 93 deletions(-) diff --git a/.ci/linux-build.sh b/.ci/linux-build.sh index 23c8bbb7aed..48510967238 100755 --- a/.ci/linux-build.sh +++ b/.ci/linux-build.sh @@ -160,6 +160,11 @@ function install_dpdk() # meson verbose outputs. DPDK_OPTS="$DPDK_OPTS -Ddeveloper_mode=disabled" + # OVS compilation and "normal" unit tests (run in the CI) do not depend on + # any DPDK driver being present. 
+ # We can disable all drivers to save compilation time. + DPDK_OPTS="$DPDK_OPTS -Ddisable_drivers=*/*" + # Install DPDK using prefix. DPDK_OPTS="$DPDK_OPTS --prefix=$(pwd)/build" @@ -228,7 +233,7 @@ fi if [ "$DPDK" ] || [ "$DPDK_SHARED" ]; then if [ -z "$DPDK_VER" ]; then - DPDK_VER="21.11.2" + DPDK_VER="22.11.1" fi install_dpdk $DPDK_VER fi diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 7baa914034a..e08d7b1bac1 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -213,7 +213,6 @@ jobs: matrix: include: - dpdk: no - - dpdk: shared steps: - name: checkout diff --git a/Documentation/faq/releases.rst b/Documentation/faq/releases.rst index ac0001cd576..e19f54c8f01 100644 --- a/Documentation/faq/releases.rst +++ b/Documentation/faq/releases.rst @@ -233,7 +233,7 @@ Q: Are all the DPDK releases that OVS versions work with maintained? The latest information about DPDK stable and LTS releases can be found at `DPDK stable`_. -.. _DPDK stable: http://doc.dpdk.org/guides-21.11/contributing/stable.html +.. _DPDK stable: http://doc.dpdk.org/guides-22.11/contributing/stable.html Q: I get an error like this when I configure Open vSwitch: diff --git a/Documentation/intro/install/dpdk.rst b/Documentation/intro/install/dpdk.rst index a284e68514c..e360ee83ddc 100644 --- a/Documentation/intro/install/dpdk.rst +++ b/Documentation/intro/install/dpdk.rst @@ -42,7 +42,7 @@ Build requirements In addition to the requirements described in :doc:`general`, building Open vSwitch with DPDK will require the following: -- DPDK 21.11.2 +- DPDK 22.11.1 - A `DPDK supported NIC`_ @@ -59,8 +59,8 @@ vSwitch with DPDK will require the following: Detailed system requirements can be found at `DPDK requirements`_. -.. _DPDK supported NIC: https://doc.dpdk.org/guides-21.11/nics/index.html -.. _DPDK requirements: https://doc.dpdk.org/guides-21.11/linux_gsg/sys_reqs.html +.. 
_DPDK supported NIC: https://doc.dpdk.org/guides-22.11/nics/index.html +.. _DPDK requirements: https://doc.dpdk.org/guides-22.11/linux_gsg/sys_reqs.html .. _dpdk-install: @@ -73,9 +73,9 @@ Install DPDK #. Download the `DPDK sources`_, extract the file and set ``DPDK_DIR``:: $ cd /usr/src/ - $ wget https://fast.dpdk.org/rel/dpdk-21.11.2.tar.xz - $ tar xf dpdk-21.11.2.tar.xz - $ export DPDK_DIR=/usr/src/dpdk-stable-21.11.2 + $ wget https://fast.dpdk.org/rel/dpdk-22.11.1.tar.xz + $ tar xf dpdk-22.11.1.tar.xz + $ export DPDK_DIR=/usr/src/dpdk-stable-22.11.1 $ cd $DPDK_DIR #. Configure and install DPDK using Meson @@ -121,7 +121,7 @@ Install DPDK .. _DPDK sources: http://dpdk.org/rel .. _DPDK documentation: - https://doc.dpdk.org/guides-21.11/linux_gsg/build_dpdk.html + https://doc.dpdk.org/guides-22.11/linux_gsg/build_dpdk.html Install OVS ~~~~~~~~~~~ @@ -722,7 +722,7 @@ Limitations release notes`_. .. _DPDK release notes: - https://doc.dpdk.org/guides-21.11/rel_notes/release_21_11.html + https://doc.dpdk.org/guides-22.11/rel_notes/release_22_11.html - Upper bound MTU: DPDK device drivers differ in how the L2 frame for a given MTU value is calculated e.g. i40e driver includes 2 x vlan headers in diff --git a/Documentation/topics/dpdk/phy.rst b/Documentation/topics/dpdk/phy.rst index 8fc34a378cb..cb2d5bcb7b3 100644 --- a/Documentation/topics/dpdk/phy.rst +++ b/Documentation/topics/dpdk/phy.rst @@ -117,7 +117,7 @@ tool:: For more information, refer to the `DPDK documentation `__. -.. _dpdk-drivers: https://doc.dpdk.org/guides-21.11/linux_gsg/linux_drivers.html +.. _dpdk-drivers: https://doc.dpdk.org/guides-22.11/linux_gsg/linux_drivers.html .. _dpdk-phy-multiqueue: @@ -235,7 +235,7 @@ To hotplug a port with igb_uio in this case, DPDK must be configured to use physical addressing for IOVA mode. For more information regarding IOVA modes in DPDK please refer to the `DPDK IOVA Mode Detection`__.
-__ https://doc.dpdk.org/guides-21.11/prog_guide/env_abstraction_layer.html#iova-mode-detection +__ https://doc.dpdk.org/guides-22.11/prog_guide/env_abstraction_layer.html#iova-mode-detection To configure OVS DPDK to use physical addressing for IOVA:: @@ -267,7 +267,7 @@ Representors are multi devices created on top of one PF. For more information, refer to the `DPDK documentation`__. -__ https://doc.dpdk.org/guides-21.11/prog_guide/switch_representation.html#port-representors +__ https://doc.dpdk.org/guides-22.11/prog_guide/switch_representation.html#port-representors Prior to port representors there was a one-to-one relationship between the PF and the eth device. With port representors the relationship becomes one PF to @@ -401,7 +401,7 @@ in the ``options`` column of the ``Interface`` table. kernel netdevice, and be inherited from it when Open vSwitch is restarted, even if the options described in this section are unset from Open vSwitch. -.. _bifurcated-drivers: https://doc.dpdk.org/guides-21.11/linux_gsg/linux_drivers.html#bifurcated-driver +.. _bifurcated-drivers: https://doc.dpdk.org/guides-22.11/linux_gsg/linux_drivers.html#bifurcated-driver - Configure the VF MAC address:: diff --git a/Documentation/topics/dpdk/vdev.rst b/Documentation/topics/dpdk/vdev.rst index 97ac6d9a52a..3383afce562 100644 --- a/Documentation/topics/dpdk/vdev.rst +++ b/Documentation/topics/dpdk/vdev.rst @@ -63,4 +63,4 @@ run:: More information on the different types of virtual DPDK PMDs can be found in the `DPDK documentation`__. 
-__ https://doc.dpdk.org/guides-21.11/nics/overview.html +__ https://doc.dpdk.org/guides-22.11/nics/overview.html diff --git a/Documentation/topics/dpdk/vhost-user.rst b/Documentation/topics/dpdk/vhost-user.rst index 8c233c1d305..3a5f5be9887 100644 --- a/Documentation/topics/dpdk/vhost-user.rst +++ b/Documentation/topics/dpdk/vhost-user.rst @@ -539,4 +539,4 @@ shown with:: Further information can be found in the `DPDK documentation -`__ +`__ diff --git a/Documentation/topics/testing.rst b/Documentation/topics/testing.rst index 871ce5637d1..abccce1ee60 100644 --- a/Documentation/topics/testing.rst +++ b/Documentation/topics/testing.rst @@ -353,7 +353,7 @@ All tests are skipped if no hugepages are configured. User must look into the DP manual to figure out how to `Configure hugepages`_. The phy test will skip if no compatible physical device is available. -.. _Configure hugepages: https://doc.dpdk.org/guides-21.11/linux_gsg/sys_reqs.html +.. _Configure hugepages: https://doc.dpdk.org/guides-22.11/linux_gsg/sys_reqs.html All the features documented under `Unit Tests`_ are available for the DPDK testsuite. diff --git a/Documentation/topics/userspace-tso.rst b/Documentation/topics/userspace-tso.rst index 33a85965c19..5a43c2e86b8 100644 --- a/Documentation/topics/userspace-tso.rst +++ b/Documentation/topics/userspace-tso.rst @@ -46,7 +46,7 @@ datasheet for compatibility. Secondly, the NIC must have an associated DPDK Poll Mode Driver (PMD) which supports `TSO`. For a list of features per PMD, refer to the `DPDK documentation`__. -__ https://doc.dpdk.org/guides-21.11/nics/overview.html +__ https://doc.dpdk.org/guides-22.11/nics/overview.html Enabling TSO ~~~~~~~~~~~~ diff --git a/NEWS b/NEWS index f6caf1ca7f0..265375e1cb8 100644 --- a/NEWS +++ b/NEWS @@ -9,23 +9,7 @@ Post-v3.0.0 - Windows: * Conntrack IPv6 fragment support. - DPDK: - * OVS validated with DPDK 21.11.2. 
- DPDK 21.11.2 contains fixes for the following CVEs: - CVE-2022-28199 cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-28199 - CVE-2022-2132 cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-2132 - A bug was introduced in DPDK 21.11.1 by the commit - 01e3dee29c02 ("vhost: fix unsafe vring addresses modifications"). - This bug can cause a deadlock when vIOMMU is enabled and NUMA - reallocation of the virtqueues happen. - A fix has been posted and pushed to the DPDK 21.11 branch. - It can be found here: - https://patches.dpdk.org/project/dpdk/patch/20220725203206.427083-2-david.marchand@redhat.com/. - If a user wishes to avoid the issue then it is recommended to use - DPDK 21.11.0 until the release of DPDK 21.11.3. - It should be noted that DPDK 21.11.0 does not benefit from the numerous - bug and CVE fixes addressed since its release. - If a user wishes to benefit from these fixes it is recommended to use - DPDK 21.11.2. + * Add support for DPDK 22.11.1. - For the QoS max-rate and STP/RSTP path-cost configuration OVS now assumes 10 Gbps link speed by default in case the actual link speed cannot be determined. Previously it was 10 Mbps. 
Values can still be overridden diff --git a/debian/control.in b/debian/control.in index db52c8a99f0..19f590d0645 100644 --- a/debian/control.in +++ b/debian/control.in @@ -21,7 +21,7 @@ Build-Depends: iproute2, libcap-ng-dev, libdbus-1-dev [amd64 i386 ppc64el arm64], -# DPDK_NETDEV libdpdk-dev (>= 21.11) [amd64 i386 ppc64el arm64], +# DPDK_NETDEV libdpdk-dev (>= 22.11) [amd64 i386 ppc64el arm64], libnuma-dev [amd64 i386 ppc64el arm64], libpcap-dev [amd64 i386 ppc64el arm64], libssl-dev, diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 72e7a32688f..fff57f78279 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -27,9 +27,10 @@ #include #include -#include +#include #include #include +#include #include #include #include @@ -166,7 +167,6 @@ typedef uint16_t dpdk_port_t; static const struct rte_eth_conf port_conf = { .rxmode = { - .split_hdr_size = 0, .offloads = 0, }, .rx_adv_conf = { @@ -3645,6 +3645,7 @@ netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args) { struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); struct rte_eth_dev_info dev_info; + const char *bus_info; uint32_t link_speed; uint32_t dev_flags; @@ -3657,19 +3658,8 @@ netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args) rte_eth_dev_info_get(dev->port_id, &dev_info); link_speed = dev->link.link_speed; dev_flags = *dev_info.dev_flags; + bus_info = rte_dev_bus_info(dev_info.device); ovs_mutex_unlock(&dev->mutex); - const struct rte_bus *bus; - const struct rte_pci_device *pci_dev; - uint16_t vendor_id = RTE_PCI_ANY_ID; - uint16_t device_id = RTE_PCI_ANY_ID; - bus = rte_bus_find_by_device(dev_info.device); - if (bus && !strcmp(bus->name, "pci")) { - pci_dev = RTE_DEV_TO_PCI(dev_info.device); - if (pci_dev) { - vendor_id = pci_dev->id.vendor_id; - device_id = pci_dev->id.device_id; - } - } ovs_mutex_unlock(&dpdk_mutex); smap_add_format(args, "port_no", DPDK_PORT_ID_FMT, dev->port_id); @@ -3693,8 +3683,10 @@ netdev_dpdk_get_status(const struct netdev *netdev, 
struct smap *args) smap_add_format(args, "if_type", "%"PRIu32, IF_TYPE_ETHERNETCSMACD); smap_add_format(args, "if_descr", "%s %s", rte_version(), dev_info.driver_name); - smap_add_format(args, "pci-vendor_id", "0x%x", vendor_id); - smap_add_format(args, "pci-device_id", "0x%x", device_id); + smap_add_format(args, "bus_info", "bus_name=%s%s%s", + rte_bus_name(rte_dev_bus(dev_info.device)), + bus_info != NULL ? ", " : "", + bus_info != NULL ? bus_info : ""); /* Not all link speeds are defined in the OpenFlow specs e.g. 25 Gbps. * In that case the speed will not be reported as part of the usual diff --git a/rhel/openvswitch-fedora.spec.in b/rhel/openvswitch-fedora.spec.in index c21592e47cb..4a3e6294bfb 100644 --- a/rhel/openvswitch-fedora.spec.in +++ b/rhel/openvswitch-fedora.spec.in @@ -71,7 +71,7 @@ BuildRequires: libcap-ng libcap-ng-devel %endif %if %{with dpdk} BuildRequires: libpcap-devel numactl-devel -BuildRequires: dpdk-devel >= 21.11 +BuildRequires: dpdk-devel >= 22.11 Provides: %{name}-dpdk = %{version}-%{release} %endif %if %{with afxdp} diff --git a/tests/system-dpdk.at b/tests/system-dpdk.at index fd7884e0f8c..8dc187a61d4 100644 --- a/tests/system-dpdk.at +++ b/tests/system-dpdk.at @@ -78,14 +78,14 @@ AT_CHECK([ovs-vsctl show], [], [stdout]) sleep 2 dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) +AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: $OVS_RUNDIR/dpdkvhostclient0: reconnecting..." ovs-vswitchd.log], [], [stdout]) +AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." 
ovs-vswitchd.log], [], [stdout]) dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: failed to connect to $OVS_RUNDIR/dpdkvhostclient0: No such file or directory@d +\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) failed to connect: No such file or directory@d ])") AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -112,11 +112,11 @@ AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuser0 -- set Interface dpdkvhostuser0 AT_CHECK([ovs-vsctl show], [], [stdout]) dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: vhost-user server: socket created" \ +AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostuser0) vhost-user server: socket created" \ ovs-vswitchd.log], [], [stdout]) AT_CHECK([grep "Socket $OVS_RUNDIR/dpdkvhostuser0 created for vhost-user port dpdkvhostuser0" \ ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: bind to $OVS_RUNDIR/dpdkvhostuser0" ovs-vswitchd.log], [], +AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostuser0) binding succeeded" ovs-vswitchd.log], [], [stdout]) dnl Set up namespaces @@ -157,8 +157,8 @@ pkill -f -x -9 'tail -f /dev/null' dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuser0], [], [stdout], [stderr]) OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: recvmsg failed@d -\@VHOST_CONFIG: failed to connect to $OVS_RUNDIR/dpdkvhostuser0: No such file or directory@d +\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostuser0) recvmsg failed@d +\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostuser0) failed to connect: No such file or directory@d \@dpdkvhostuser ports are considered deprecated; please migrate to dpdkvhostuserclient ports.@d \@failed to enumerate system datapaths: No such file or directory@d ])") @@ -187,9 +187,9 @@ AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface \ AT_CHECK([ovs-vsctl show], [], [stdout]) dnl Parse log file 
-AT_CHECK([grep "VHOST_CONFIG: vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) +AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: $OVS_RUNDIR/dpdkvhostclient0: reconnecting..." ovs-vswitchd.log], [], [stdout]) +AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." ovs-vswitchd.log], [], [stdout]) dnl Set up namespaces ADD_NAMESPACES(ns1, ns2) @@ -229,8 +229,8 @@ pkill -f -x -9 'tail -f /dev/null' dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: recvmsg failed@d -\@VHOST_CONFIG: failed to connect to $OVS_RUNDIR/dpdkvhostclient0: No such file or directory@d +\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) recvmsg failed@d +\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) failed to connect: No such file or directory@d \@dpdkvhostuser ports are considered deprecated; please migrate to dpdkvhostuserclient ports.@d \@failed to enumerate system datapaths: No such file or directory@d ])") @@ -304,14 +304,14 @@ AT_CHECK([ovs-vsctl list interface dpdkvhostuserclient0], [], [stdout]) AT_CHECK([grep -E 'ingress_policing_rate: 0' stdout], [], [stdout]) dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) +AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: $OVS_RUNDIR/dpdkvhostclient0: reconnecting..." 
ovs-vswitchd.log], [], [stdout]) +AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." ovs-vswitchd.log], [], [stdout]) dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: failed to connect to $OVS_RUNDIR/dpdkvhostclient0: No such file or directory@d +\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) failed to connect: No such file or directory@d ])") AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -345,14 +345,14 @@ AT_CHECK([grep -E 'ingress_policing_rate: 0' stdout], [], [stdout]) dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) +AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: $OVS_RUNDIR/dpdkvhostclient0: reconnecting..." ovs-vswitchd.log], [], [stdout]) +AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." 
ovs-vswitchd.log], [], [stdout]) dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: failed to connect to $OVS_RUNDIR/dpdkvhostclient0: No such file or directory@d +\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) failed to connect: No such file or directory@d ])") AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -385,14 +385,14 @@ AT_CHECK([ovs-vsctl list interface dpdkvhostuserclient0], [], [stdout]) AT_CHECK([grep -E 'ingress_policing_rate: 10000' stdout], [], [stdout]) dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) +AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: $OVS_RUNDIR/dpdkvhostclient0: reconnecting..." ovs-vswitchd.log], [], [stdout]) +AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." 
ovs-vswitchd.log], [], [stdout]) dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: failed to connect to $OVS_RUNDIR/dpdkvhostclient0: No such file or directory@d +\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) failed to connect: No such file or directory@d ])") AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -448,9 +448,9 @@ AT_CHECK([ovs-appctl -t ovs-vswitchd qos/show dpdkvhostuserclient0], [], [stdout sleep 2 dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) +AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: $OVS_RUNDIR/dpdkvhostclient0: reconnecting..." ovs-vswitchd.log], [], [stdout]) +AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." 
ovs-vswitchd.log], [], [stdout]) dnl Fail if egress policer could not be created AT_FAIL_IF([grep "Could not create rte meter for egress policer" ovs-vswitchd.log], [], [stdout]) @@ -465,7 +465,7 @@ AT_CHECK([grep -E 'QoS not configured on dpdkvhostuserclient0' stdout], [], [std dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: failed to connect to $OVS_RUNDIR/dpdkvhostclient0: No such file or directory@d +\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) failed to connect: No such file or directory@d ])") AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -487,9 +487,9 @@ OVS_WAIT_UNTIL([ovs-vsctl set port dpdkvhostuserclient0 qos=@newqos -- --id=@new sleep 2 dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) +AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: $OVS_RUNDIR/dpdkvhostclient0: reconnecting..." ovs-vswitchd.log], [], [stdout]) +AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." 
ovs-vswitchd.log], [], [stdout]) dnl Check egress policer was not created AT_CHECK([ovs-appctl -t ovs-vswitchd qos/show dpdkvhostuserclient0], [], [stdout]) @@ -498,7 +498,7 @@ AT_CHECK([grep -E 'QoS not configured on dpdkvhostuserclient0' stdout], [], [std dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: failed to connect to $OVS_RUNDIR/dpdkvhostclient0: No such file or directory@d +\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) failed to connect: No such file or directory@d \@Could not create rte meter for egress policer@d \@Failed to set QoS type egress-policer on port dpdkvhostuserclient0: Invalid argument@d ])") @@ -522,9 +522,9 @@ OVS_WAIT_UNTIL([ovs-vsctl set port dpdkvhostuserclient0 qos=@newqos -- --id=@new sleep 2 dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) +AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: $OVS_RUNDIR/dpdkvhostclient0: reconnecting..." ovs-vswitchd.log], [], [stdout]) +AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." 
ovs-vswitchd.log], [], [stdout]) dnl Check egress policer was not created AT_CHECK([ovs-appctl -t ovs-vswitchd qos/show dpdkvhostuserclient0], [], [stdout]) @@ -533,7 +533,7 @@ AT_CHECK([grep -E 'QoS not configured on dpdkvhostuserclient0' stdout], [], [std dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: failed to connect to $OVS_RUNDIR/dpdkvhostclient0: No such file or directory@d +\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) failed to connect: No such file or directory@d \@Could not create rte meter for egress policer@d \@Failed to set QoS type egress-policer on port dpdkvhostuserclient0: Invalid argument@d ])") @@ -646,9 +646,9 @@ AT_CHECK([ovs-vsctl show], [], [stdout]) sleep 2 dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) +AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: $OVS_RUNDIR/dpdkvhostclient0: reconnecting..." ovs-vswitchd.log], [], [stdout]) +AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." 
ovs-vswitchd.log], [], [stdout]) dnl Execute testpmd in background on_exit "pkill -f -x -9 'tail -f /dev/null'" @@ -675,7 +675,7 @@ pkill -f -x -9 'tail -f /dev/null' dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: failed to connect to $OVS_RUNDIR/dpdkvhostclient0: No such file or directory@d +\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) failed to connect: No such file or directory@d ])") AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -703,9 +703,9 @@ AT_CHECK([ovs-vsctl show], [], [stdout]) sleep 2 dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) +AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: $OVS_RUNDIR/dpdkvhostclient0: reconnecting..." ovs-vswitchd.log], [], [stdout]) +AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." 
ovs-vswitchd.log], [], [stdout]) dnl Execute testpmd in background on_exit "pkill -f -x -9 'tail -f /dev/null'" @@ -732,7 +732,7 @@ pkill -f -x -9 'tail -f /dev/null' dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: failed to connect to $OVS_RUNDIR/dpdkvhostclient0: No such file or directory@d +\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) failed to connect: No such file or directory@d ])") AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -864,7 +864,7 @@ pkill -f -x -9 'tail -f /dev/null' dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: failed to connect to $OVS_RUNDIR/dpdkvhostclient0: No such file or directory@d +\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) failed to connect: No such file or directory@d \@dpdkvhostuserclient0: unsupported MTU 9711@d \@failed to set MTU for network device dpdkvhostuserclient0: Invalid argument@d ])") @@ -894,9 +894,9 @@ AT_CHECK([ovs-vsctl show], [], [stdout]) sleep 2 dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) +AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: $OVS_RUNDIR/dpdkvhostclient0: reconnecting..." ovs-vswitchd.log], [], [stdout]) +AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." 
ovs-vswitchd.log], [], [stdout]) dnl Execute testpmd in background on_exit "pkill -f -x -9 'tail -f /dev/null'" @@ -921,7 +921,7 @@ pkill -f -x -9 'tail -f /dev/null' dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: failed to connect to $OVS_RUNDIR/dpdkvhostclient0: No such file or directory@d +\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) failed to connect: No such file or directory@d \@dpdkvhostuserclient0: unsupported MTU 67@d \@failed to set MTU for network device dpdkvhostuserclient0: Invalid argument@d ])") From b8bf410a5c94173da02279b369d75875c4035959 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 21 Sep 2022 22:50:49 +0200 Subject: [PATCH 060/833] db-ctl-base: Use partial map/set updates for last add/set commands. Currently, command to add one item into a large set generates the transaction with the full new content of that set plus 'wait' operation for the full old content of that set. So, if we're adding one new load-balancer into a load-balancer group in OVN using ovn-nbctl, transaction will include all the existing load-balancers from that groups twice. IDL supports partial updates for sets and maps. The problem with that is changes are not visible to the IDL user until the transaction is committed. That will cause problems for chained ctl commands. However, we still can optimize the very last command in the list. It makes sense to do, since it's a common case for manual invocations. Updating the 'add' command as well as 'set' for a case where we're actually adding one new element to the map. One downside is that we can't check the set size without examining it and checking for duplicates, so allowing the transaction to be sent and constraints to be checked on the server side in that case. Not touching 'remove' operation for now, since removals may have different type, e.g. if elements from the map are removed by the key. 
The function will likely need to be fully re-written to accommodate all the corner cases. Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- lib/db-ctl-base.c | 87 +++++++++++++++++++++++++++++++------------ lib/db-ctl-base.h | 8 +++- tests/ovs-vsctl.at | 6 ++- utilities/ovs-vsctl.c | 7 ++-- vtep/vtep-ctl.c | 7 ++-- 5 files changed, 83 insertions(+), 32 deletions(-) diff --git a/lib/db-ctl-base.c b/lib/db-ctl-base.c index 856832a04d2..134496ef3f6 100644 --- a/lib/db-ctl-base.c +++ b/lib/db-ctl-base.c @@ -75,7 +75,7 @@ static struct shash all_commands = SHASH_INITIALIZER(&all_commands); static char *get_table(const char *, const struct ovsdb_idl_table_class **); static char *set_column(const struct ovsdb_idl_table_class *, const struct ovsdb_idl_row *, const char *, - struct ovsdb_symbol_table *); + struct ovsdb_symbol_table *, bool use_partial_update); static struct option * @@ -1325,11 +1325,17 @@ cmd_find(struct ctl_context *ctx) } /* Sets the column of 'row' in 'table'. Returns NULL on success or a - * malloc()'ed error message on failure. */ + * malloc()'ed error message on failure. + * + * If 'use_partial_update' is true, then this function will try to use + * partial set/map updates, if possible. As a side effect, result will + * not be reflected in the IDL until the transaction is committed. + * The last access to a particular column is a good candidate to use + * this option. 
*/ static char * OVS_WARN_UNUSED_RESULT set_column(const struct ovsdb_idl_table_class *table, const struct ovsdb_idl_row *row, const char *arg, - struct ovsdb_symbol_table *symtab) + struct ovsdb_symbol_table *symtab, bool use_partial_update) { const struct ovsdb_idl_column *column; char *key_string = NULL; @@ -1352,7 +1358,7 @@ set_column(const struct ovsdb_idl_table_class *table, if (key_string) { union ovsdb_atom key, value; - struct ovsdb_datum datum; + struct ovsdb_datum *datum; if (column->type.value.type == OVSDB_TYPE_VOID) { error = xasprintf("cannot specify key to set for non-map column " @@ -1371,16 +1377,22 @@ set_column(const struct ovsdb_idl_table_class *table, goto out; } - ovsdb_datum_init_empty(&datum); - ovsdb_datum_add_unsafe(&datum, &key, &value, &column->type, NULL); + datum = xmalloc(sizeof *datum); + ovsdb_datum_init_empty(datum); + ovsdb_datum_add_unsafe(datum, &key, &value, &column->type, NULL); ovsdb_atom_destroy(&key, column->type.key.type); ovsdb_atom_destroy(&value, column->type.value.type); - ovsdb_datum_union(&datum, ovsdb_idl_read(row, column), - &column->type); - ovsdb_idl_txn_verify(row, column); - ovsdb_idl_txn_write(row, column, &datum); + if (use_partial_update) { + ovsdb_idl_txn_write_partial_map(row, column, datum); + } else { + ovsdb_datum_union(datum, ovsdb_idl_read(row, column), + &column->type); + ovsdb_idl_txn_verify(row, column); + ovsdb_idl_txn_write(row, column, datum); + free(datum); + } } else { struct ovsdb_datum datum; @@ -1441,7 +1453,8 @@ cmd_set(struct ctl_context *ctx) } for (i = 3; i < ctx->argc; i++) { - ctx->error = set_column(table, row, ctx->argv[i], ctx->symtab); + ctx->error = set_column(table, row, ctx->argv[i], ctx->symtab, + ctx->last_command); if (ctx->error) { return; } @@ -1479,7 +1492,7 @@ cmd_add(struct ctl_context *ctx) const struct ovsdb_idl_column *column; const struct ovsdb_idl_row *row; const struct ovsdb_type *type; - struct ovsdb_datum old; + struct ovsdb_datum new; int i; ctx->error = 
get_table(table_name, &table); @@ -1503,7 +1516,13 @@ cmd_add(struct ctl_context *ctx) } type = &column->type; - ovsdb_datum_clone(&old, ovsdb_idl_read(row, column)); + + if (ctx->last_command) { + ovsdb_datum_init_empty(&new); + } else { + ovsdb_datum_clone(&new, ovsdb_idl_read(row, column)); + } + for (i = 4; i < ctx->argc; i++) { struct ovsdb_type add_type; struct ovsdb_datum add; @@ -1514,23 +1533,41 @@ cmd_add(struct ctl_context *ctx) ctx->error = ovsdb_datum_from_string(&add, &add_type, ctx->argv[i], ctx->symtab); if (ctx->error) { - ovsdb_datum_destroy(&old, &column->type); + ovsdb_datum_destroy(&new, &column->type); return; } - ovsdb_datum_union(&old, &add, type); + ovsdb_datum_union(&new, &add, type); ovsdb_datum_destroy(&add, type); } - if (old.n > type->n_max) { + + if (!ctx->last_command && new.n > type->n_max) { ctl_error(ctx, "\"add\" operation would put %u %s in column %s of " "table %s but the maximum number is %u", - old.n, + new.n, type->value.type == OVSDB_TYPE_VOID ? "values" : "pairs", column->name, table->name, type->n_max); - ovsdb_datum_destroy(&old, &column->type); + ovsdb_datum_destroy(&new, &column->type); return; } - ovsdb_idl_txn_verify(row, column); - ovsdb_idl_txn_write(row, column, &old); + + if (ctx->last_command) { + /* Partial updates can only be made one by one. 
*/ + for (i = 0; i < new.n; i++) { + struct ovsdb_datum *datum = xmalloc(sizeof *datum); + + ovsdb_datum_init_empty(datum); + ovsdb_datum_add_from_index_unsafe(datum, &new, i, type); + if (ovsdb_type_is_map(type)) { + ovsdb_idl_txn_write_partial_map(row, column, datum); + } else { + ovsdb_idl_txn_write_partial_set(row, column, datum); + } + } + ovsdb_datum_destroy(&new, &column->type); + } else { + ovsdb_idl_txn_verify(row, column); + ovsdb_idl_txn_write(row, column, &new); + } invalidate_cache(ctx); } @@ -1769,7 +1806,7 @@ cmd_create(struct ctl_context *ctx) } for (i = 2; i < ctx->argc; i++) { - ctx->error = set_column(table, row, ctx->argv[i], ctx->symtab); + ctx->error = set_column(table, row, ctx->argv[i], ctx->symtab, false); if (ctx->error) { return; } @@ -2620,7 +2657,8 @@ ctl_list_db_tables_usage(void) /* Initializes 'ctx' from 'command'. */ void ctl_context_init_command(struct ctl_context *ctx, - struct ctl_command *command) + struct ctl_command *command, + bool last) { ctx->argc = command->argc; ctx->argv = command->argv; @@ -2629,6 +2667,7 @@ ctl_context_init_command(struct ctl_context *ctx, ds_swap(&ctx->output, &command->output); ctx->table = command->table; ctx->try_again = false; + ctx->last_command = last; ctx->error = NULL; } @@ -2640,7 +2679,7 @@ ctl_context_init(struct ctl_context *ctx, struct ctl_command *command, void (*invalidate_cache_cb)(struct ctl_context *)) { if (command) { - ctl_context_init_command(ctx, command); + ctl_context_init_command(ctx, command, false); } ctx->idl = idl; ctx->txn = txn; @@ -2684,7 +2723,7 @@ ctl_set_column(const char *table_name, const struct ovsdb_idl_row *row, if (error) { return error; } - error = set_column(table, row, arg, symtab); + error = set_column(table, row, arg, symtab, false); if (error) { return error; } diff --git a/lib/db-ctl-base.h b/lib/db-ctl-base.h index 284b573d0bc..ea7e97b7844 100644 --- a/lib/db-ctl-base.h +++ b/lib/db-ctl-base.h @@ -239,9 +239,15 @@ struct ctl_context { /* A command may 
set this member to true if some prerequisite is not met * and the caller should wait for something to change and then retry. */ bool try_again; + + /* If set during the context initialization, command implementation + * may use optimizations that will leave database changes invisible + * to IDL, e.g. use partial set updates. */ + bool last_command; }; -void ctl_context_init_command(struct ctl_context *, struct ctl_command *); +void ctl_context_init_command(struct ctl_context *, struct ctl_command *, + bool last); void ctl_context_init(struct ctl_context *, struct ctl_command *, struct ovsdb_idl *, struct ovsdb_idl_txn *, struct ovsdb_symbol_table *, diff --git a/tests/ovs-vsctl.at b/tests/ovs-vsctl.at index abf4fb9cf4e..a92156f001c 100644 --- a/tests/ovs-vsctl.at +++ b/tests/ovs-vsctl.at @@ -1071,9 +1071,13 @@ AT_CHECK([RUN_OVS_VSCTL([set controller br1 'connection-mode=xyz'])], AT_CHECK([RUN_OVS_VSCTL([set controller br1 connection-mode:x=y])], [1], [], [ovs-vsctl: cannot specify key to set for non-map column connection_mode ]) -AT_CHECK([RUN_OVS_VSCTL([add bridge br1 datapath_id x y])], +AT_CHECK([RUN_OVS_VSCTL([add bridge br1 datapath_id x y -- show])], [1], [], [ovs-vsctl: "add" operation would put 2 values in column datapath_id of table Bridge but the maximum number is 1 ]) +AT_CHECK([RUN_OVS_VSCTL([add bridge br1 datapath_id x y])], [1], [], [stderr]) +AT_CHECK([sed "/^.*|WARN|.*/d" < stderr], [0], [dnl +ovs-vsctl: transaction error: {"details":"set must have 0 to 1 members but 2 are present","error":"syntax error","syntax":"[[\"set\",[\"x\",\"y\"]]]"} +]) AT_CHECK([RUN_OVS_VSCTL([remove netflow `cat netflow-uuid` targets '"1.2.3.4:567"'])], [1], [], [ovs-vsctl: "remove" operation would put 0 values in column targets of table NetFlow but the minimum number is 1 ]) diff --git a/utilities/ovs-vsctl.c b/utilities/ovs-vsctl.c index 1032089fc26..c1d47000616 100644 --- a/utilities/ovs-vsctl.c +++ b/utilities/ovs-vsctl.c @@ -2711,9 +2711,9 @@ 
post_db_reload_do_checks(const struct vsctl_context *vsctl_ctx) static void vsctl_context_init_command(struct vsctl_context *vsctl_ctx, - struct ctl_command *command) + struct ctl_command *command, bool last_command) { - ctl_context_init_command(&vsctl_ctx->base, command); + ctl_context_init_command(&vsctl_ctx->base, command, last_command); vsctl_ctx->verified_ports = false; } @@ -2859,7 +2859,8 @@ do_vsctl(const char *args, struct ctl_command *commands, size_t n_commands, } vsctl_context_init(&vsctl_ctx, NULL, idl, txn, ovs, symtab); for (c = commands; c < &commands[n_commands]; c++) { - vsctl_context_init_command(&vsctl_ctx, c); + vsctl_context_init_command(&vsctl_ctx, c, + c == &commands[n_commands - 1]); if (c->syntax->run) { (c->syntax->run)(&vsctl_ctx.base); } diff --git a/vtep/vtep-ctl.c b/vtep/vtep-ctl.c index 99c4adcd53d..e5d99714dee 100644 --- a/vtep/vtep-ctl.c +++ b/vtep/vtep-ctl.c @@ -2207,9 +2207,9 @@ static const struct ctl_table_class tables[VTEPREC_N_TABLES] = { static void vtep_ctl_context_init_command(struct vtep_ctl_context *vtepctl_ctx, - struct ctl_command *command) + struct ctl_command *command, bool last_command) { - ctl_context_init_command(&vtepctl_ctx->base, command); + ctl_context_init_command(&vtepctl_ctx->base, command, last_command); vtepctl_ctx->verified_ports = false; } @@ -2304,7 +2304,8 @@ do_vtep_ctl(const char *args, struct ctl_command *commands, } vtep_ctl_context_init(&vtepctl_ctx, NULL, idl, txn, vtep_global, symtab); for (c = commands; c < &commands[n_commands]; c++) { - vtep_ctl_context_init_command(&vtepctl_ctx, c); + vtep_ctl_context_init_command(&vtepctl_ctx, c, + c == &commands[n_commands - 1]); if (c->syntax->run) { (c->syntax->run)(&vtepctl_ctx.base); } From 093915e04a978c3c37005968f2a4358ef24a2745 Mon Sep 17 00:00:00 2001 From: Daniel Ding Date: Thu, 27 Oct 2022 14:01:13 +0800 Subject: [PATCH 061/833] vswitch.ovsschema: Set bfd_status to ephemeral. 
When openvswitch is restarted, the old bfd status is kept until ovs-vswitchd is running. And if ovs-vswitchd is under high workload, updating the bfd status will be deferred, which is not what we expected. Signed-off-by: Daniel Ding Signed-off-by: Ilya Maximets --- vswitchd/vswitch.ovsschema | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vswitchd/vswitch.ovsschema b/vswitchd/vswitch.ovsschema index 4873cfde72d..1a49cdffea7 100644 --- a/vswitchd/vswitch.ovsschema +++ b/vswitchd/vswitch.ovsschema @@ -1,6 +1,6 @@ {"name": "Open_vSwitch", - "version": "8.3.0", - "cksum": "3781850481 26690", + "version": "8.3.1", + "cksum": "3012963480 26720", "tables": { "Open_vSwitch": { "columns": { @@ -280,7 +280,8 @@ "min": 0, "max": "unlimited"}}, "bfd_status": { "type": {"key": "string", "value": "string", - "min": 0, "max": "unlimited"}}, + "min": 0, "max": "unlimited"}, + "ephemeral": true}, "cfm_mpid": { "type": { "key": {"type": "integer"}, From e83dad6e53f3fe04ca9c4d6972fcaa7995de2ba2 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 25 Nov 2022 13:37:04 +0100 Subject: [PATCH 062/833] ovsdb: Count weak reference objects. OVSDB creates a separate object for each weak reference in order to track them and there could be a significant amount of these objects in the database. We also had problems with number of these objects growing out of bounds recently. So, adding them to a memory report seems to be a good thing. Counting them globally to cover all the copied instances in transactions and the transaction history (even though there should be none). It's also hard to count them per-database, because weak references are stored on destination rows and can be destroyed either while destroying the destination row or while removing the reference from the source row. Also, not all the involved functions have direct access to the database object. So, there is no single clear place where counters should be updated.
Acked-by: Dumitru Ceara Acked-by: Han Zhou Signed-off-by: Ilya Maximets --- ovsdb/ovsdb.c | 4 ++++ ovsdb/ovsdb.h | 4 ++++ ovsdb/row.c | 5 ++++- ovsdb/transaction.c | 2 ++ 4 files changed, 14 insertions(+), 1 deletion(-) diff --git a/ovsdb/ovsdb.c b/ovsdb/ovsdb.c index 1c011fab00d..11786f37660 100644 --- a/ovsdb/ovsdb.c +++ b/ovsdb/ovsdb.c @@ -43,6 +43,8 @@ #include "openvswitch/vlog.h" VLOG_DEFINE_THIS_MODULE(ovsdb); +size_t n_weak_refs = 0; + struct ovsdb_schema * ovsdb_schema_create(const char *name, const char *version, const char *cksum) { @@ -546,6 +548,8 @@ ovsdb_get_memory_usage(const struct ovsdb *db, struct simap *usage) if (db->storage) { ovsdb_storage_get_memory_usage(db->storage, usage); } + + simap_put(usage, "n-weak-refs", n_weak_refs); } struct ovsdb_table * diff --git a/ovsdb/ovsdb.h b/ovsdb/ovsdb.h index d05e7c64a69..13d8bf407be 100644 --- a/ovsdb/ovsdb.h +++ b/ovsdb/ovsdb.h @@ -125,6 +125,10 @@ struct ovsdb { struct ovsdb_compaction_state *snap_state; }; +/* Total number of 'weak reference' objects in all databases + * and transactions. 
*/ +extern size_t n_weak_refs; + struct ovsdb *ovsdb_create(struct ovsdb_schema *, struct ovsdb_storage *); void ovsdb_destroy(struct ovsdb *); diff --git a/ovsdb/row.c b/ovsdb/row.c index 3f0bb8acf12..d7bfbdd365e 100644 --- a/ovsdb/row.c +++ b/ovsdb/row.c @@ -21,8 +21,9 @@ #include "openvswitch/dynamic-string.h" #include "openvswitch/json.h" -#include "ovsdb-error.h" #include "openvswitch/shash.h" +#include "ovsdb-error.h" +#include "ovsdb.h" #include "sort.h" #include "table.h" #include "util.h" @@ -78,6 +79,7 @@ ovsdb_weak_ref_clone(struct ovsdb_weak_ref *src) ovsdb_type_clone(&weak->type, &src->type); weak->column_idx = src->column_idx; weak->by_key = src->by_key; + n_weak_refs++; return weak; } @@ -130,6 +132,7 @@ ovsdb_weak_ref_destroy(struct ovsdb_weak_ref *weak) } ovsdb_type_destroy(&weak->type); free(weak); + n_weak_refs--; } struct ovsdb_row * diff --git a/ovsdb/transaction.c b/ovsdb/transaction.c index 5d7c70a51c0..03541af85d7 100644 --- a/ovsdb/transaction.c +++ b/ovsdb/transaction.c @@ -613,6 +613,8 @@ add_weak_ref(const struct ovsdb_row *src, const struct ovsdb_row *dst_, weak->column_idx = column->index; hmap_node_nullify(&weak->dst_node); ovs_list_push_back(ref_list, &weak->src_node); + + n_weak_refs++; } static void From 6bc92db366631f0996c8cbae2d6e1263d437ce21 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Mon, 5 Dec 2022 09:41:21 +0100 Subject: [PATCH 063/833] rculist: Use rculist_back_protected to access prev. The .prev member of a rculist should not be used directly by users because it's not rcu-safe. A convenient fake mutex (rculist_fake_mutex) helps ensuring that in conjunction with clang's thread safety extensions. Only writers with exclusive access to the rculist should access .prev via some of the provided *_protected() accessors. Use rculist_back_protected() in REVERSE_PROTECTED iterators to avoid clang's compilation warning. 
Acked-by: Mike Pattrick Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- lib/rculist.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lib/rculist.h b/lib/rculist.h index 9bb8cbf3eb2..6df963eb2b8 100644 --- a/lib/rculist.h +++ b/lib/rculist.h @@ -378,12 +378,14 @@ rculist_is_singleton_protected(const struct rculist *list) UPDATE_MULTIVAR(ITER, rculist_next(ITER_VAR(ITER)))) #define RCULIST_FOR_EACH_REVERSE_PROTECTED(ITER, MEMBER, RCULIST) \ - for (INIT_MULTIVAR(ITER, MEMBER, (RCULIST)->prev, struct rculist); \ + for (INIT_MULTIVAR(ITER, MEMBER, rculist_back_protected(RCULIST), \ + struct rculist); \ CONDITION_MULTIVAR(ITER, MEMBER, ITER_VAR(ITER) != (RCULIST)); \ - UPDATE_MULTIVAR(ITER, ITER_VAR(ITER)->prev)) + UPDATE_MULTIVAR(ITER, rculist_back_protected(ITER_VAR(ITER)))) #define RCULIST_FOR_EACH_REVERSE_PROTECTED_CONTINUE(ITER, MEMBER, RCULIST) \ - for (INIT_MULTIVAR(ITER, MEMBER, (ITER)->MEMBER.prev, struct rculist); \ + for (INIT_MULTIVAR(ITER, MEMBER, rculist_back_protected(ITER->MEMBER), \ + struct rculist); \ CONDITION_MULTIVAR(ITER, MEMBER, ITER_VAR(ITER) != (RCULIST)); \ UPDATE_MULTIVAR(ITER, ITER_VAR(ITER)->prev)) From 481555587f753d035d011712b4877a4300dbc9d9 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 6 Dec 2022 14:25:34 +0100 Subject: [PATCH 064/833] faq: Update some wording since kernel module is already removed. The kernel module was removed in 3.0 release, but the faq page still talks about that in a future tense. 
Fixes: 3476bd3932b0 ("Documentation: Remove kernel module documentation.") Reviewed-by: David Marchand Signed-off-by: Ilya Maximets --- Documentation/faq/releases.rst | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Documentation/faq/releases.rst b/Documentation/faq/releases.rst index e19f54c8f01..53ce230047c 100644 --- a/Documentation/faq/releases.rst +++ b/Documentation/faq/releases.rst @@ -88,11 +88,10 @@ Q: What Linux kernel versions does each Open vSwitch release work with? RHEL and CentOS 7 3.10 based kernels since they have diverged from the Linux kernel.org 3.10 kernels. - Starting with Open vSwitch 2.15, building the Linux kernel module from - the Open vSwitch source tree is deprecated. It will not be updated to - support Linux versions later than 5.8. We will remove the kernel module - source code from the Open vSwitch source tree for the Open vSwitch 3.0 - release. + Building the Linux kernel module from the Open vSwitch source tree was + deprecated starting with Open vSwitch 2.15. And the kernel module + source code was completely removed from the Open vSwitch source tree in + 3.0 release. Q: Are all features available with all datapaths? From 739bcf2263b3dfbc8a855c6e5b4a2b77742dd8db Mon Sep 17 00:00:00 2001 From: Emma Finn Date: Tue, 6 Dec 2022 14:18:00 +0000 Subject: [PATCH 065/833] odp-execute: Fix ipv4 missing clearing of connection tracking fields. This patch adds clearing of connection tracking fields to the avx512 implementation of the ipv4 action. This patch also extends the actions autovalidator to include a compare for packet metadata.
Fixes: 92eb03f7b03a ("odp-execute: Add ISA implementation of set_masked IPv4 action") Signed-off-by: Emma Finn Signed-off-by: Ilya Maximets --- lib/odp-execute-avx512.c | 2 ++ lib/odp-execute-private.c | 12 ++++++++++++ 2 files changed, 14 insertions(+) diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c index 6c77132516a..66b3998dabd 100644 --- a/lib/odp-execute-avx512.c +++ b/lib/odp-execute-avx512.c @@ -477,6 +477,8 @@ action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch, th->tcp_csum = tcp_checksum; } + + pkt_metadata_init_conn(&packet->md); } /* Write back the modified IPv4 addresses. */ _mm256_mask_storeu_epi32((void *) nh, 0x1F, v_new_hdr); diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c index f80ae5a239c..57be5cfe75a 100644 --- a/lib/odp-execute-private.c +++ b/lib/odp-execute-private.c @@ -229,6 +229,18 @@ action_autoval_generic(struct dp_packet_batch *batch, const struct nlattr *a) } } + /* Compare packet metadata. */ + if (memcmp(&good_pkt->md, &test_pkt->md, sizeof good_pkt->md)) { + ds_put_format(&log_msg, "Autovalidation metadata failed\n"); + ds_put_format(&log_msg, "Good packet metadata:\n"); + ds_put_sparse_hex_dump(&log_msg, &good_pkt->md, + sizeof good_pkt->md, 0, false); + ds_put_format(&log_msg, "Test packet metadata:\n"); + ds_put_sparse_hex_dump(&log_msg, &test_pkt->md, + sizeof test_pkt->md, 0, false); + failed = true; + } + if (failed) { VLOG_ERR("Autovalidation of %s failed. Details:\n%s", action_impls[impl].name, ds_cstr(&log_msg)); From a787fbbf9dd6a108a53053afb45fb59a0b58b514 Mon Sep 17 00:00:00 2001 From: Dumitru Ceara Date: Tue, 13 Dec 2022 18:11:18 +0100 Subject: [PATCH 066/833] ovsdb-cs: Consider default conditions implicitly acked. When initializing a monitor table the default monitor condition is [True] which matches the behavior of the server (to send all rows of that table). 
There's no need to include this default condition in the initial monitor request so we can consider it implicitly acked by the server. This fixes the incorrect (one too large) expected condition sequence number reported by ovsdb_idl_set_condition() when application is trying to set a [True] condition for a new table. Reported-by: Numan Siddique Suggested-by: Ilya Maximets Signed-off-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- lib/ovsdb-cs.c | 2 +- python/ovs/db/idl.py | 4 +- tests/ovsdb-idl.at | 105 +++++++++++++++++++++++++++++-------------- tests/test-ovsdb.c | 38 ++++++++++++---- tests/test-ovsdb.py | 37 +++++++++++---- 5 files changed, 133 insertions(+), 53 deletions(-) diff --git a/lib/ovsdb-cs.c b/lib/ovsdb-cs.c index a6fbd290c87..0fca03d7231 100644 --- a/lib/ovsdb-cs.c +++ b/lib/ovsdb-cs.c @@ -892,7 +892,7 @@ ovsdb_cs_db_get_table(struct ovsdb_cs_db *db, const char *table) t = xzalloc(sizeof *t); t->name = xstrdup(table); - t->new_cond = json_array_create_1(json_boolean_create(true)); + t->ack_cond = json_array_create_1(json_boolean_create(true)); hmap_insert(&db->tables, &t->hmap_node, hash); return t; } diff --git a/python/ovs/db/idl.py b/python/ovs/db/idl.py index fe66402cff4..9fc2159b04a 100644 --- a/python/ovs/db/idl.py +++ b/python/ovs/db/idl.py @@ -85,9 +85,9 @@ class Monitor(enum.IntEnum): class ConditionState(object): def __init__(self): - self._ack_cond = None + self._ack_cond = [True] self._req_cond = None - self._new_cond = [True] + self._new_cond = None def __iter__(self): return iter([self._new_cond, self._req_cond, self._ack_cond]) diff --git a/tests/ovsdb-idl.at b/tests/ovsdb-idl.at index c2970984bae..5a7e76eaa95 100644 --- a/tests/ovsdb-idl.at +++ b/tests/ovsdb-idl.at @@ -576,9 +576,9 @@ OVSDB_CHECK_IDL([simple idl, conditional, false condition], "b": true}}]']], [['condition simple []' \ 'condition simple [true]']], - [[000: change conditions + [[000: simple: change conditions 001: empty -002: change conditions +002: simple: change 
conditions 003: table simple: i=1 r=2 b=true s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> 004: done ]]) @@ -592,13 +592,40 @@ OVSDB_CHECK_IDL([simple idl, conditional, true condition], "b": true}}]']], [['condition simple []' \ 'condition simple [true]']], - [[000: change conditions + [[000: simple: change conditions 001: empty -002: change conditions +002: simple: change conditions 003: table simple: i=1 r=2 b=true s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> 004: done ]]) +dnl This test ensures that the first explicitly set monitor condition +dnl is sent to the server. +OVSDB_CHECK_IDL([simple idl, conditional, wait for condition], + [], + [['["idltest", + {"op": "insert", + "table": "simple", + "row": {"i": 1, + "r": 2.0, + "b": true}}]' \ + 'condition simple [true]' \ + '^["idltest", + {"op": "insert", + "table": "simple", + "row": {"i": 2, + "r": 4.0, + "b": true}}]']], + [[000: empty +001: {"error":null,"result":[{"uuid":["uuid","<0>"]}]} +002: table simple: i=1 r=2 b=true s= u=<1> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<0> +003: simple: conditions unchanged +004: {"error":null,"result":[{"uuid":["uuid","<2>"]}]} +005: table simple: i=1 r=2 b=true s= u=<1> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<0> +005: table simple: i=2 r=4 b=true s= u=<1> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<2> +006: done +]]) + OVSDB_CHECK_IDL([simple idl, conditional, multiple clauses in condition], [['["idltest", {"op": "insert", @@ -613,9 +640,9 @@ OVSDB_CHECK_IDL([simple idl, conditional, multiple clauses in condition], "b": true}}]']], [['condition simple []' \ 'condition simple [["i","==",1],["i","==",2]]']], - [[000: change conditions + [[000: simple: change conditions 001: empty -002: change conditions +002: simple: change conditions 003: table simple: i=1 r=2 b=true s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> 003: table simple: i=2 r=3 b=true s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<2> 004: done @@ -630,9 +657,9 @@ OVSDB_CHECK_IDL([simple idl, conditional, modify as insert 
due to condition], "b": true}}]']], [['condition simple []' \ 'condition simple [["i","==",1]]']], - [[000: change conditions + [[000: simple: change conditions 001: empty -002: change conditions +002: simple: change conditions 003: table simple: i=1 r=2 b=true s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> 004: done ]]) @@ -653,11 +680,11 @@ OVSDB_CHECK_IDL([simple idl, conditional, modify as delete due to condition], "row": {"i": 2, "r": 3.0, "b": true}}]']], - [[000: change conditions + [[000: simple: change conditions 001: empty -002: change conditions +002: simple: change conditions 003: table simple: i=1 r=2 b=true s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> -004: change conditions +004: simple: change conditions 005: empty 006: {"error":null,"result":[{"uuid":["uuid","<2>"]}]} 007: table simple: i=2 r=3 b=true s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<2> @@ -688,14 +715,16 @@ OVSDB_CHECK_IDL([simple idl, conditional, multiple tables], "table": "link2", "row": {"i": 3}, "uuid-name": "row0"}]']], - [[000: change conditions + [[000: link1: change conditions +000: link2: change conditions +000: simple: change conditions 001: empty -002: change conditions +002: simple: change conditions 003: table simple: i=1 r=2 b=true s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> -004: change conditions +004: link1: change conditions 005: table link1: i=0 k=0 ka=[] l2= uuid=<2> 005: table simple: i=1 r=2 b=true s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> -006: change conditions +006: link2: change conditions 007: {"error":null,"result":[{"uuid":["uuid","<3>"]}]} 008: table link1: i=0 k=0 ka=[] l2= uuid=<2> 008: table link2: i=3 l1= uuid=<3> @@ -1266,10 +1295,10 @@ OVSDB_CHECK_IDL_TRACK([track, simple idl, initially populated, orphan weak refer {"op": "delete", "table": "simple6", "where": []}]']], - [[000: change conditions + [[000: simple: change conditions 001: table simple6: inserted row: name=first_row weak_ref=[] uuid=<0> 001: table simple6: updated columns: 
name weak_ref -002: change conditions +002: simple: change conditions 003: table simple6: name=first_row weak_ref=[<1>] uuid=<0> 003: table simple: inserted row: i=0 r=0 b=false s=row1_s u=<2> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> 003: table simple: updated columns: s @@ -1308,19 +1337,19 @@ OVSDB_CHECK_IDL_TRACK([track, simple idl, initially populated, orphan rows, cond {"op": "delete", "table": "simple6", "where": []}]']], - [[000: change conditions + [[000: simple: change conditions 001: table simple6: inserted row: name=first_row weak_ref=[] uuid=<0> 001: table simple6: updated columns: name weak_ref -002: change conditions +002: simple: change conditions 003: table simple6: name=first_row weak_ref=[<1>] uuid=<0> 003: table simple: inserted row: i=0 r=0 b=false s=row0_s u=<2> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> 003: table simple: updated columns: s -004: change conditions +004: simple: change conditions 005: table simple6: name=first_row weak_ref=[] uuid=<0> 005: table simple: deleted row: i=0 r=0 b=false s=row0_s u=<2> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> 005: table simple: inserted row: i=0 r=0 b=false s=row1_s u=<2> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<3> 005: table simple: updated columns: s -006: change conditions +006: simple: change conditions 007: table simple6: name=first_row weak_ref=[<1>] uuid=<0> 007: table simple: deleted row: i=0 r=0 b=false s=row1_s u=<2> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<3> 007: table simple: inserted row: i=0 r=0 b=false s=row0_s u=<2> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> @@ -1362,14 +1391,14 @@ OVSDB_CHECK_IDL_TRACK([track, simple idl, initially populated, references, condi {"op": "delete", "table": "simple6", "where": []}]']], - [[000: change conditions + [[000: simple: change conditions 001: table simple6: inserted row: name=first_row weak_ref=[] uuid=<0> 001: table simple6: updated columns: name weak_ref -002: change conditions +002: simple: change conditions 003: table simple6: name=first_row weak_ref=[<1>] 
uuid=<0> 003: table simple: inserted row: i=0 r=0 b=false s=row0_s u=<2> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> 003: table simple: updated columns: s -004: change conditions +004: simple: change conditions 005: table simple6: name=first_row weak_ref=[<3>] uuid=<0> 005: table simple: deleted row: i=0 r=0 b=false s=row0_s u=<2> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> 005: table simple: inserted row: i=1 r=0 b=false s=row1_s u=<2> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<3> @@ -1405,7 +1434,8 @@ OVSDB_CHECK_IDL_TRACK([track, simple idl, initially populated, references, singl {"op": "insert", "table": "simple", "row": {"s": "row0_s"}}]']], - [[000: change conditions + [[000: simple6: conditions unchanged +000: simple: conditions unchanged 001: table simple6: inserted row: name=row0_s6 weak_ref=[<0>] uuid=<1> 001: table simple6: updated columns: name weak_ref 001: table simple: inserted row: i=0 r=0 b=false s=row0_s u=<2> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<0> @@ -1447,7 +1477,8 @@ OVSDB_CHECK_IDL_TRACK([track, simple idl, initially populated, weak references, {"op": "insert", "table": "simple", "row": {"s": "row0_s"}}]']], - [[000: change conditions + [[000: simple6: conditions unchanged +000: simple: conditions unchanged 001: table simple6: inserted row: name=row0_s6 weak_ref=[<0>] uuid=<1> 001: table simple6: updated columns: name weak_ref 001: table simple: inserted row: i=0 r=0 b=false s=row0_s u=<2> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<0> @@ -1487,7 +1518,9 @@ OVSDB_CHECK_IDL_TRACK([track, simple idl, initially populated, strong references {"op": "insert", "table": "simple", "row": {"s": "row0_s"}}]']], - [[000: change conditions + [[000: simple3: conditions unchanged +000: simple4: conditions unchanged +000: simple: conditions unchanged 001: table simple3: inserted row: name=row0_s3 uset=[] uref=[<0>] uuid=<1> 001: table simple3: updated columns: name uref 001: table simple4: inserted row: name=row0_s4 uuid=<0> @@ -1522,12 +1555,14 @@ OVSDB_CHECK_IDL_TRACK([track, 
simple idl, initially populated, strong references {"op": "insert", "table": "simple", "row": {"s": "row0_s"}}]']], - [[000: change conditions + [[000: simple3: conditions unchanged +000: simple4: conditions unchanged +000: simple: conditions unchanged 001: table simple3: inserted row: name=row0_s3 uset=[] uref=[<0>] uuid=<1> 001: table simple3: updated columns: name uref 001: table simple4: inserted row: name=row0_s4 uuid=<0> 001: table simple4: updated columns: name -002: change conditions +002: simple4: change conditions 003: table simple3: name=row0_s3 uset=[] uref=[] uuid=<1> 003: table simple4: deleted row: name=row0_s4 uuid=<0> 004: {"error":null,"result":[{"uuid":["uuid","<2>"]}]} @@ -1558,10 +1593,12 @@ OVSDB_CHECK_IDL([simple idl, initially populated, strong references, conditional {"op": "insert", "table": "simple", "row": {"s": "row0_s"}}]']], - [[000: change conditions + [[000: simple3: conditions unchanged +000: simple4: conditions unchanged +000: simple: conditions unchanged 001: table simple3: name=row0_s3 uset=[] uref=[<0>] uuid=<1> 001: table simple4: name=row0_s4 uuid=<0> -002: change conditions +002: simple4: change conditions 003: table simple3: name=row0_s3 uset=[] uref=[] uuid=<1> 004: {"error":null,"result":[{"uuid":["uuid","<2>"]}]} 005: table simple3: name=row0_s3 uset=[] uref=[] uuid=<1> @@ -2370,11 +2407,11 @@ OVSDB_CHECK_CLUSTER_IDL([simple idl, monitor_cond_since, cluster disconnect], "table": "simple", "where": [["i", "==", 1]], "row": {"r": 2.0 }}]']], - [[000: change conditions + [[000: simple: change conditions 001: empty -002: change conditions +002: simple: change conditions 003: table simple: i=2 r=1 b=true s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> -004: change conditions +004: simple: change conditions 005: reconnect 006: table simple 007: {"error":null,"result":[{"count":1}]} diff --git a/tests/test-ovsdb.c b/tests/test-ovsdb.c index 84fe232765a..1bc5ac17a01 100644 --- a/tests/test-ovsdb.c +++ b/tests/test-ovsdb.c @@ 
-2627,11 +2627,12 @@ parse_link2_json_clause(struct ovsdb_idl_condition *cond, } } -static void -update_conditions(struct ovsdb_idl *idl, char *commands) +static unsigned int +update_conditions(struct ovsdb_idl *idl, char *commands, int step) { - char *cmd, *save_ptr1 = NULL; const struct ovsdb_idl_table_class *tc; + unsigned int next_cond_seqno = 0; + char *cmd, *save_ptr1 = NULL; for (cmd = strtok_r(commands, ";", &save_ptr1); cmd; cmd = strtok_r(NULL, ";", &save_ptr1)) { @@ -2682,15 +2683,20 @@ update_conditions(struct ovsdb_idl *idl, char *commands) unsigned int seqno = ovsdb_idl_get_condition_seqno(idl); unsigned int next_seqno = ovsdb_idl_set_condition(idl, tc, &cond); if (seqno == next_seqno ) { - ovs_fatal(0, "condition unchanged"); + print_and_log("%03d: %s: conditions unchanged", + step, table_name); + } else { + print_and_log("%03d: %s: change conditions", step, table_name); } unsigned int new_next_seqno = ovsdb_idl_set_condition(idl, tc, &cond); if (next_seqno != new_next_seqno) { ovs_fatal(0, "condition expected seqno changed"); } + next_cond_seqno = MAX(next_cond_seqno, next_seqno); ovsdb_idl_condition_destroy(&cond); json_destroy(json); } + return next_cond_seqno; } static void @@ -2699,6 +2705,7 @@ do_idl(struct ovs_cmdl_context *ctx) struct test_ovsdb_pvt_context *pvt = ctx->pvt; struct jsonrpc *rpc; struct ovsdb_idl *idl; + unsigned int next_cond_seqno = 0; unsigned int seqno = 0; struct ovsdb_symbol_table *symtab; size_t n_uuids = 0; @@ -2735,8 +2742,8 @@ do_idl(struct ovs_cmdl_context *ctx) const char remote_s[] = "set-remote "; const char cond_s[] = "condition "; if (ctx->argc > 2 && strstr(ctx->argv[2], cond_s)) { - update_conditions(idl, ctx->argv[2] + strlen(cond_s)); - print_and_log("%03d: change conditions", step++); + next_cond_seqno = + update_conditions(idl, ctx->argv[2] + strlen(cond_s), step++); i = 3; } else { i = 2; @@ -2755,6 +2762,21 @@ do_idl(struct ovs_cmdl_context *ctx) if (*arg == '+') { /* The previous transaction didn't 
change anything. */ arg++; + } else if (*arg == '^') { + /* Wait for condition change to be acked by the server. */ + arg++; + for (;;) { + ovsdb_idl_run(idl); + ovsdb_idl_check_consistency(idl); + if (ovsdb_idl_get_condition_seqno(idl) == next_cond_seqno) { + break; + } + jsonrpc_run(rpc); + + ovsdb_idl_wait(idl); + jsonrpc_wait(rpc); + poll_block(); + } } else { /* Wait for update. */ for (;;) { @@ -2789,8 +2811,8 @@ do_idl(struct ovs_cmdl_context *ctx) arg + strlen(remote_s), ovsdb_idl_is_connected(idl) ? "true" : "false"); } else if (!strncmp(arg, cond_s, strlen(cond_s))) { - update_conditions(idl, arg + strlen(cond_s)); - print_and_log("%03d: change conditions", step++); + next_cond_seqno = update_conditions(idl, arg + strlen(cond_s), + step++); } else if (arg[0] != '[') { if (!idl_set(idl, arg, step++)) { /* If idl_set() returns false, then no transaction diff --git a/tests/test-ovsdb.py b/tests/test-ovsdb.py index cca1818ea3a..a841adba4e1 100644 --- a/tests/test-ovsdb.py +++ b/tests/test-ovsdb.py @@ -626,7 +626,8 @@ def notify(event, row, updates=None): return status != ovs.db.idl.Transaction.ERROR -def update_condition(idl, commands): +def update_condition(idl, commands, step): + next_cond_seqno = 0 commands = commands[len("condition "):].split(";") for command in commands: command = command.split(" ") @@ -637,7 +638,20 @@ def update_condition(idl, commands): table = command[0] cond = ovs.json.from_string(command[1]) - idl.cond_change(table, cond) + next_seqno = idl.cond_change(table, cond) + if idl.cond_seqno == next_seqno: + sys.stdout.write("%03d: %s: conditions unchanged\n" % + (step, table)) + else: + sys.stdout.write("%03d: %s: change conditions\n" % + (step, table)) + sys.stdout.flush() + + assert next_seqno == idl.cond_change(table, cond), \ + "condition expected seqno changed" + next_cond_seqno = max(next_cond_seqno, next_seqno) + + return next_cond_seqno def do_idl(schema_file, remote, *commands): @@ -694,6 +708,7 @@ def do_idl(schema_file, 
remote, *commands): else: rpc = None + next_cond_seqno = 0 symtab = {} seqno = 0 step = 0 @@ -717,9 +732,7 @@ def mock_notify(event, row, updates=None): commands = list(commands) if len(commands) >= 1 and "condition" in commands[0]: - update_condition(idl, commands.pop(0)) - sys.stdout.write("%03d: change conditions\n" % step) - sys.stdout.flush() + next_cond_seqno = update_condition(idl, commands.pop(0), step) step += 1 for command in commands: @@ -732,6 +745,16 @@ def mock_notify(event, row, updates=None): if command.startswith("+"): # The previous transaction didn't change anything. command = command[1:] + elif command.startswith("^"): + # Wait for condition change to be acked by the server. + command = command[1:] + while idl.cond_seqno != next_cond_seqno and not idl.run(): + rpc.run() + + poller = ovs.poller.Poller() + idl.wait(poller) + rpc.wait(poller) + poller.block() else: # Wait for update. while idl.change_seqno == seqno and not idl.run(): @@ -753,9 +776,7 @@ def mock_notify(event, row, updates=None): step += 1 idl.force_reconnect() elif "condition" in command: - update_condition(idl, command) - sys.stdout.write("%03d: change conditions\n" % step) - sys.stdout.flush() + next_cond_seqno = update_condition(idl, command, step) step += 1 elif not command.startswith("["): if not idl_set(idl, command, step): From 69e71bf791c89690e38afe3b7012066e5d64a129 Mon Sep 17 00:00:00 2001 From: Emma Finn Date: Thu, 8 Dec 2022 15:59:35 +0000 Subject: [PATCH 067/833] odp-execute: Add check for L4 header size. This patch adds check for L4 header size for avx512 implementation of the ipv4 action. 
Fixes: 92eb03f7b03a ("odp-execute: Add ISA implementation of set_masked IPv4 action") Signed-off-by: Emma Finn Signed-off-by: Ilya Maximets --- lib/odp-execute-avx512.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c index 66b3998dabd..5207ece15d9 100644 --- a/lib/odp-execute-avx512.c +++ b/lib/odp-execute-avx512.c @@ -453,8 +453,9 @@ action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch, uint16_t delta_checksum = avx512_ipv4_addr_csum_delta(v_packet, v_new_hdr); + size_t l4_size = dp_packet_l4_size(packet); - if (nh->ip_proto == IPPROTO_UDP) { + if (nh->ip_proto == IPPROTO_UDP && l4_size >= UDP_HEADER_LEN) { /* New UDP checksum. */ struct udp_header *uh = dp_packet_l4(packet); if (uh->udp_csum) { @@ -468,7 +469,8 @@ action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch, /* Insert new udp checksum. */ uh->udp_csum = udp_checksum; } - } else if (nh->ip_proto == IPPROTO_TCP) { + } else if (nh->ip_proto == IPPROTO_TCP && + l4_size >= TCP_HEADER_LEN) { /* New TCP checksum. */ struct tcp_header *th = dp_packet_l4(packet); uint16_t old_tcp_checksum = ~th->tcp_csum; From 1ea0fa4ad7dc2dbfdb1f221eff97efbf3e1af894 Mon Sep 17 00:00:00 2001 From: Timothy Redaelli Date: Fri, 16 Dec 2022 16:29:46 +0100 Subject: [PATCH 068/833] rhel: Avoid creating an empty database file. In 59e8cb8a053d ("rhel: Move conf.db to /var/lib/openvswitch, using symlinks.") conf.db is created as an empty file in /var/lib/openvswitch, if it doesn't exist, but this prevents ovsdb-server from starting. This commit changes the previous behaviour to set /var/lib/openvswitch owner to openvswitch:hugetlbfs, if built with dpdk, or openvswitch:openvswitch.
Fixes: 59e8cb8a053d ("rhel: Move conf.db to /var/lib/openvswitch, using symlinks.") Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2022-December/400045.html Reported-by: Roi Dayan Signed-off-by: Timothy Redaelli Signed-off-by: Ilya Maximets --- rhel/openvswitch-fedora.spec.in | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/rhel/openvswitch-fedora.spec.in b/rhel/openvswitch-fedora.spec.in index 4a3e6294bfb..17aab796fca 100644 --- a/rhel/openvswitch-fedora.spec.in +++ b/rhel/openvswitch-fedora.spec.in @@ -339,12 +339,6 @@ for base in conf.db .conf.db.~lock~; do if test ! -e $old && test ! -h $old; then ln -s $new $old fi - touch $new -%if %{with dpdk} - chown openvswitch:hugetlbfs $new -%else - chown openvswitch:openvswitch $new -%endif done %if 0%{?systemd_post:1} @@ -505,7 +499,11 @@ fi %{_prefix}/lib/udev/rules.d/91-vfio.rules %endif %doc NOTICE README.rst NEWS rhel/README.RHEL.rst -/var/lib/openvswitch +%if %{with dpdk} +%attr(750,openvswitch,hugetlbfs) /var/lib/openvswitch +%else +%attr(750,openvswitch,openvswitch) /var/lib/openvswitch +%endif %attr(750,root,root) /var/log/openvswitch %ghost %attr(755,root,root) %{_rundir}/openvswitch %ghost %attr(644,root,root) %{_rundir}/openvswitch.useropts From bf8fa1fe414e92f8386ca2b7745822ced63385ee Mon Sep 17 00:00:00 2001 From: David Marchand Date: Thu, 8 Dec 2022 09:06:58 +0100 Subject: [PATCH 069/833] dpdk: Fix typo in v22.11.1 tarball extract example. There was a small typo that slipped in when updating to v22.11.1 tag. 
Fixes: a77c7796f23a ("dpdk: Update to use v22.11.1.") Signed-off-by: David Marchand Signed-off-by: Ilya Maximets --- Documentation/intro/install/dpdk.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/intro/install/dpdk.rst b/Documentation/intro/install/dpdk.rst index e360ee83ddc..63a0ebb23bb 100644 --- a/Documentation/intro/install/dpdk.rst +++ b/Documentation/intro/install/dpdk.rst @@ -74,7 +74,7 @@ Install DPDK $ cd /usr/src/ $ wget https://fast.dpdk.org/rel/dpdk-22.11.1.tar.xz - $ tar xf dpdk-22.11.tar.xz + $ tar xf dpdk-22.11.1.tar.xz $ export DPDK_DIR=/usr/src/dpdk-stable-22.11.1 $ cd $DPDK_DIR From 79e7756a5d9e10c18343096187744f95a793ccf8 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Wed, 7 Dec 2022 17:26:39 +0100 Subject: [PATCH 070/833] utilities: Add a GDB macro to dump hmap structures. Add a new GDB macro called ovs_dump_hmap, which can be used to dump any hmap structure. For example (gdb) ovs_dump_hmap "&'all_bridges.lto_priv.0'" "struct bridge" "node" (struct bridge *) 0x55ec43069c70 (struct bridge *) 0x55ec430428a0 (struct bridge *) 0x55ec430a55f0 Signed-off-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- utilities/gdb/ovs_gdb.py | 53 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/utilities/gdb/ovs_gdb.py b/utilities/gdb/ovs_gdb.py index 7f63dd0d592..982395dd1d2 100644 --- a/utilities/gdb/ovs_gdb.py +++ b/utilities/gdb/ovs_gdb.py @@ -30,6 +30,8 @@ # - ovs_dump_netdev_provider # - ovs_dump_ovs_list {[] [] {dump}]} # - ovs_dump_packets [tcpdump options] +# - ovs_dump_cmap {[] [] {dump}]} +# - ovs_dump_hmap {dump} # - ovs_dump_simap # - ovs_dump_smap # - ovs_dump_udpif_keys {|} {short} @@ -876,7 +878,7 @@ class CmdDumpCmap(gdb.Command): """ def __init__(self): super(CmdDumpCmap, self).__init__("ovs_dump_cmap", - gdb.COMMAND_DATA) + gdb.COMMAND_DATA) def invoke(self, arg, from_tty): arg_list = gdb.string_to_argv(arg) @@ -914,6 +916,54 @@ def invoke(self, arg, 
from_tty): member).dereference())) +# +# Implements the GDB "ovs_dump_hmap" command +# +class CmdDumpHmap(gdb.Command): + """Dump all nodes of a given hmap + Usage: + ovs_dump_hmap {dump} + + For example dump all the bridges when the all_bridges variable is + optimized out due to LTO: + + (gdb) ovs_dump_hmap "&'all_bridges.lto_priv.0'" "struct bridge" "node" + (struct bridge *) 0x55ec43069c70 + (struct bridge *) 0x55ec430428a0 + (struct bridge *) 0x55ec430a55f0 + + The 'dump' option will also include the full structure content in the + output. + """ + def __init__(self): + super(CmdDumpHmap, self).__init__("ovs_dump_hmap", + gdb.COMMAND_DATA) + + def invoke(self, arg, from_tty): + arg_list = gdb.string_to_argv(arg) + typeobj = None + member = None + dump = False + + if len(arg_list) != 3 and len(arg_list) != 4: + print("usage: ovs_dump_hmap " + " {dump}") + return + + hmap = gdb.parse_and_eval(arg_list[0]).cast( + gdb.lookup_type('struct hmap').pointer()) + + typeobj = arg_list[1] + member = arg_list[2] + if len(arg_list) == 4 and arg_list[3] == "dump": + dump = True + + for node in ForEachHMAP(hmap.dereference(), typeobj, member): + print("({} *) {} {}".format(typeobj, node, "=" if dump else "")) + if dump: + print(" {}\n".format(node.dereference())) + + # # Implements the GDB "ovs_dump_simap" command # @@ -1515,6 +1565,7 @@ def extract_pkt(self, pkt): CmdDumpOvsList() CmdDumpPackets() CmdDumpCmap() +CmdDumpHmap() CmdDumpSimap() CmdDumpSmap() CmdDumpUdpifKeys() From c82f496c3b69a036432af7c79adbc00545621ed1 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Mon, 28 Nov 2022 09:53:30 +0100 Subject: [PATCH 071/833] dpif-netdev: Use unmasked key when adding datapath flows. The datapath supports installing wider flows, and OVS relies on this behavior. For example if ipv4(src=1.1.1.1/192.0.0.0, dst=1.1.1.2/192.0.0.0) exists, a wider flow (smaller mask) of ipv4(src=192.1.1.1/128.0.0.0,dst=192.1.1.2/128.0.0.0) is allowed to be added. 
However, if we try to add a wildcard rule, the installation fails: # ovs-appctl dpctl/add-flow system@myDP "in_port(1),eth_type(0x0800), \ ipv4(src=1.1.1.1/192.0.0.0,dst=1.1.1.2/192.0.0.0,frag=no)" 2 # ovs-appctl dpctl/add-flow system@myDP "in_port(1),eth_type(0x0800), \ ipv4(src=192.1.1.1/0.0.0.0,dst=49.1.1.2/0.0.0.0,frag=no)" 2 ovs-vswitchd: updating flow table (File exists) The reason is that the key used to determine if the flow is already present in the system uses the original key ANDed with the mask. This results in the IP address not being part of the (miniflow) key, i.e., being substituted with an all-zero value. When doing the actual lookup, this results in the key wrongfully matching the first flow, and therefore the flow does not get installed. The solution is to use the unmasked key for the existence check, the same way this is handled in the "slow" dpif_flow_put() case. OVS relies on the fact that overlapping flows can exist if one is a superset of the other. Note that this is only true when the same set of actions is applied. This is due to how the revalidator process works. During revalidation, OVS removes too generic flows from the datapath to avoid incorrect matches but allows too narrow flows to stay in the datapath to avoid the data plane disruption and also to avoid constant flow deletions if the datapath ignores wildcards on certain fields/bits. See flow_wildcards_has_extra() check in the revalidate_ukey__() function. The problem here is that we have a too narrow flow installed, and now OpenFlow rules got changed, so the actual flow should be more generic. Revalidators will not remove the narrow flow, and we will eventually get an upcall on the packet that doesn't match the narrow flow, but we will not be able to install a more generic flow because after masking with the new wider mask, the key matches on the narrow flow, so we get EEXIST. 
Fixes: beb75a40fdc2 ("userspace: Switching of L3 packets in L2 pipeline") Signed-off-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- lib/dpif-netdev.c | 33 +++++++++++++++++++++++++++++---- tests/dpif-netdev.at | 14 ++++++++++++++ 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 2c08a71c8db..9331f2cbac6 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -3320,6 +3320,28 @@ netdev_flow_key_init_masked(struct netdev_flow_key *dst, (dst_u64 - miniflow_get_values(&dst->mf)) * 8); } +/* Initializes 'key' as a copy of 'flow'. */ +static inline void +netdev_flow_key_init(struct netdev_flow_key *key, + const struct flow *flow) +{ + uint64_t *dst = miniflow_values(&key->mf); + uint32_t hash = 0; + uint64_t value; + + miniflow_map_init(&key->mf, flow); + miniflow_init(&key->mf, flow); + + size_t n = dst - miniflow_get_values(&key->mf); + + FLOW_FOR_EACH_IN_MAPS (value, flow, key->mf.map) { + hash = hash_add64(hash, value); + } + + key->hash = hash_finish(hash, n * 8); + key->len = netdev_flow_key_size(n); +} + static inline void emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow, const struct netdev_flow_key *key) @@ -4194,7 +4216,7 @@ static int dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put) { struct dp_netdev *dp = get_dp_netdev(dpif); - struct netdev_flow_key key, mask; + struct netdev_flow_key key; struct dp_netdev_pmd_thread *pmd; struct match match; ovs_u128 ufid; @@ -4243,9 +4265,12 @@ dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put) /* Must produce a netdev_flow_key for lookup. * Use the same method as employed to create the key when adding - * the flow to the dplcs to make sure they match. */ - netdev_flow_mask_init(&mask, &match); - netdev_flow_key_init_masked(&key, &match.flow, &mask); + * the flow to the dplcs to make sure they match. 
+ * We need to put in the unmasked key as flow_put_on_pmd() will first try + * to see if an entry exists doing a packet type lookup. As masked-out + * fields are interpreted as zeros, they could falsely match a wider IP + * address mask. Installation of the flow will use the match variable. */ + netdev_flow_key_init(&key, &match.flow); if (put->pmd_id == PMD_ID_NULL) { if (cmap_count(&dp->poll_threads) == 0) { diff --git a/tests/dpif-netdev.at b/tests/dpif-netdev.at index 6aff1eda7b0..9af70a68d75 100644 --- a/tests/dpif-netdev.at +++ b/tests/dpif-netdev.at @@ -636,6 +636,20 @@ OVS_VSWITCHD_STOP(["/flow: in_port is not an exact match/d /failed to put/d"]) AT_CLEANUP +AT_SETUP([dpif-netdev - check dpctl/add-flow wider ip match]) +OVS_VSWITCHD_START( + [add-port br0 p1 \ + -- set interface p1 type=dummy options:pstream=punix:$OVS_RUNDIR/p0.sock \ + -- set bridge br0 datapath-type=dummy]) + +AT_CHECK([ovs-appctl revalidator/pause]) +AT_CHECK([ovs-appctl dpctl/add-flow "in_port(1),eth_type(0x0800),ipv4(src=0.0.0.0/192.0.0.0,dst=0.0.0.0/192.0.0.0,frag=no)" "3"]) +AT_CHECK([ovs-appctl dpctl/add-flow "in_port(1),eth_type(0x0800),ipv4(src=192.1.1.1/0.0.0.0,dst=49.1.1.1/0.0.0.0,frag=no)" "3"]) +AT_CHECK([ovs-appctl revalidator/resume]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + # SEND_UDP_PKTS([p_name], [p_ofport]) # # Sends 128 packets to port 'p_name' with different UDP destination ports. From d34245ea150a7ae4dbae9e7fc37e3adfcbbf0bc6 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Mon, 19 Dec 2022 08:38:38 -0500 Subject: [PATCH 072/833] ovs-ctl: Allow inclusion of hugepages in coredumps. Add new option --dump-hugepages option in ovs-ctl to enable the addition of hugepages in the core dump filter. 
Reviewed-by: David Marchand Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- NEWS | 4 ++++ utilities/ovs-ctl.in | 15 +++++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/NEWS b/NEWS index 265375e1cb8..95d82632f25 100644 --- a/NEWS +++ b/NEWS @@ -14,6 +14,10 @@ Post-v3.0.0 10 Gbps link speed by default in case the actual link speed cannot be determined. Previously it was 10 Mbps. Values can still be overridden by specifying 'max-rate' or '[r]stp-path-cost' accordingly. + - ovs-ctl: + * New option '--dump-hugepages' to include hugepages in core dumps. This + can assist with postmortem analysis involving DPDK, but may also produce + significantly larger core dump files. v3.0.0 - 15 Aug 2022 diff --git a/utilities/ovs-ctl.in b/utilities/ovs-ctl.in index eba9512fe8b..d9155258868 100644 --- a/utilities/ovs-ctl.in +++ b/utilities/ovs-ctl.in @@ -103,8 +103,13 @@ set_system_ids () { action "Configuring Open vSwitch system IDs" "$@" $extra_ids } -check_force_cores () { - if test X"$FORCE_COREFILES" = Xyes; then +check_core_config () { + if test X"$DUMP_HUGEPAGES" = Xyes; then + echo 0x7f > /proc/self/coredump_filter + if test X"$FORCE_COREFILES" = Xyes; then + ulimit -c unlimited + fi + elif test X"$FORCE_COREFILES" = Xyes; then ulimit -c 67108864 fi } @@ -116,7 +121,7 @@ del_transient_ports () { } do_start_ovsdb () { - check_force_cores + check_core_config if daemon_is_running ovsdb-server; then log_success_msg "ovsdb-server is already running" @@ -193,7 +198,7 @@ add_managers () { } do_start_forwarding () { - check_force_cores + check_core_config insert_mod_if_required || return 1 @@ -330,6 +335,7 @@ set_defaults () { DAEMON_CWD=/ FORCE_COREFILES=yes + DUMP_HUGEPAGES=no MLOCKALL=yes SELF_CONFINEMENT=yes MONITOR=yes @@ -419,6 +425,7 @@ Other important options for "start", "restart" and "force-reload-kmod": Less important options for "start", "restart" and "force-reload-kmod": --daemon-cwd=DIR set working dir for OVS daemons (default: 
$DAEMON_CWD) --no-force-corefiles do not force on core dumps for OVS daemons + --dump-hugepages include hugepages in core dumps --no-mlockall do not lock all of ovs-vswitchd into memory --ovsdb-server-priority=NICE set ovsdb-server's niceness (default: $OVSDB_SERVER_PRIORITY) --ovsdb-server-options=OPTIONS additional options for ovsdb-server (example: '-vconsole:dbg -vfile:dbg') From 0d23948a598ac609e9865174e0874e782a48d6a8 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Mon, 19 Dec 2022 19:29:06 +0100 Subject: [PATCH 073/833] ovs-thread: Detect changes in number of CPUs. Currently, things like the number of handler and revalidator threads are calculated based on the number of available CPUs. However, this number is considered static and only calculated once, hence ignoring events such as cpus being hotplugged, switched on/off or affinity mask changing. On the other hand, checking the number of available CPUs multiple times per second seems like an overkill. Affinity should not change that often and, even if it does, the impact of destroying and recreating all the threads so often is probably a price too expensive to pay. I tested the impact of updating the threads every 5 seconds and saw an impact in the main loop duration of <1% and a worst-case scenario impact in throughput of < 5% [1]. This patch sets the default period to 10 seconds just to be safer. [1] Tested in the worst-case scenario of disabling the kernel cache (other_config:flow-size=0), modifying ovs-vswitchd's affinity so the number of handlers goes up and down every 5 seconds and calculated the difference in netperf's ops/sec. 
Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- NEWS | 2 ++ lib/ovs-thread.c | 67 +++++++++++++++++++++++++++++++----------------- 2 files changed, 45 insertions(+), 24 deletions(-) diff --git a/NEWS b/NEWS index 95d82632f25..c79d9f97dc4 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,7 @@ Post-v3.0.0 -------------------- + - ovs-vswitchd now detects changes in CPU affinity and adjusts the number + of handler and revalidator threads if necessary. - ovs-appctl: * "ovs-appctl ofproto/trace" command can now display port names with the "--names" option. diff --git a/lib/ovs-thread.c b/lib/ovs-thread.c index 78ed3e9707e..2d382f1e8bc 100644 --- a/lib/ovs-thread.c +++ b/lib/ovs-thread.c @@ -31,6 +31,7 @@ #include "openvswitch/poll-loop.h" #include "seq.h" #include "socket-util.h" +#include "timeval.h" #include "util.h" #ifdef __CHECKER__ @@ -627,42 +628,60 @@ ovs_thread_stats_next_bucket(const struct ovsthread_stats *stats, size_t i) } -/* Returns the total number of cores available to this process, or 0 if the - * number cannot be determined. 
*/ -int -count_cpu_cores(void) +static int +count_cpu_cores__(void) { - static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; - static long int n_cores; + long int n_cores; - if (ovsthread_once_start(&once)) { #ifndef _WIN32 - n_cores = sysconf(_SC_NPROCESSORS_ONLN); + n_cores = sysconf(_SC_NPROCESSORS_ONLN); +#else + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + n_cores = sysinfo.dwNumberOfProcessors; +#endif #ifdef __linux__ - if (n_cores > 0) { - cpu_set_t *set = CPU_ALLOC(n_cores); + if (n_cores > 0) { + cpu_set_t *set = CPU_ALLOC(n_cores); - if (set) { - size_t size = CPU_ALLOC_SIZE(n_cores); + if (set) { + size_t size = CPU_ALLOC_SIZE(n_cores); - if (!sched_getaffinity(0, size, set)) { - n_cores = CPU_COUNT_S(size, set); - } - CPU_FREE(set); + if (!sched_getaffinity(0, size, set)) { + n_cores = CPU_COUNT_S(size, set); } + CPU_FREE(set); } -#endif -#else - SYSTEM_INFO sysinfo; - GetSystemInfo(&sysinfo); - n_cores = sysinfo.dwNumberOfProcessors; -#endif - ovsthread_once_done(&once); } - +#endif return n_cores > 0 ? n_cores : 0; } +/* It's unlikely that the available cpus change several times per second and + * even if it does, it's not needed (or desired) to react to such changes so + * quickly. */ +#define COUNT_CPU_UPDATE_TIME_MS 10000 + +static struct ovs_mutex cpu_cores_mutex = OVS_MUTEX_INITIALIZER; + +/* Returns the current total number of cores available to this process, or 0 + * if the number cannot be determined. */ +int +count_cpu_cores(void) +{ + static long long int last_updated = 0; + long long int now = time_msec(); + static int cpu_cores; + + ovs_mutex_lock(&cpu_cores_mutex); + if (now - last_updated >= COUNT_CPU_UPDATE_TIME_MS) { + last_updated = now; + cpu_cores = count_cpu_cores__(); + } + ovs_mutex_unlock(&cpu_cores_mutex); + return cpu_cores; +} + /* Returns the total number of cores on the system, or 0 if the * number cannot be determined. 
*/ int From 7490f281f09a8455c48e19b0cf1b99ab758ee4f4 Mon Sep 17 00:00:00 2001 From: Qian Chen Date: Tue, 20 Dec 2022 09:36:08 -0500 Subject: [PATCH 074/833] lldp: Fix bugs when parsing malformed AutoAttach. The OVS LLDP implementation includes support for AutoAttach standard, which the 'upstream' lldpd project does not include. As part of adding this support, the message parsing for these TLVs did not include proper length checks for the LLDP_TLV_AA_ELEMENT_SUBTYPE and the LLDP_TLV_AA_ISID_VLAN_ASGNS_SUBTYPE elements. The result is that a message without a proper boundary will cause an overread of memory, and lead to undefined results, including crashes or other unidentified behavior. The fix is to introduce proper bounds checking for these elements. Introduce a unit test to ensure that we have some proper rejection in this code base in the future. Fixes: be53a5c447c3 ("auto-attach: Initial support for Auto-Attach standard") Signed-off-by: Qian Chen Co-authored-by: Aaron Conole Signed-off-by: Aaron Conole Signed-off-by: Ilya Maximets --- lib/lldp/lldp.c | 2 ++ tests/ofproto-dpif.at | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/lib/lldp/lldp.c b/lib/lldp/lldp.c index dfeb2a80024..6fdcfef5694 100644 --- a/lib/lldp/lldp.c +++ b/lib/lldp/lldp.c @@ -583,6 +583,7 @@ lldp_decode(struct lldpd *cfg OVS_UNUSED, char *frame, int s, switch(tlv_subtype) { case LLDP_TLV_AA_ELEMENT_SUBTYPE: + CHECK_TLV_SIZE(50, "ELEMENT"); PEEK_BYTES(&msg_auth_digest, sizeof msg_auth_digest); aa_element_dword = PEEK_UINT32; @@ -629,6 +630,7 @@ lldp_decode(struct lldpd *cfg OVS_UNUSED, char *frame, int s, break; case LLDP_TLV_AA_ISID_VLAN_ASGNS_SUBTYPE: + CHECK_TLV_SIZE(36, "ISID_VLAN_ASGNS"); PEEK_BYTES(&msg_auth_digest, sizeof msg_auth_digest); /* Subtract off tlv type and length (2Bytes) + OUI (3B) + diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at index eb4cd189609..fa6111c1ed2 100644 --- a/tests/ofproto-dpif.at +++ b/tests/ofproto-dpif.at @@ -62,6 +62,25 @@ 
AT_CHECK([ovs-appctl coverage/read-counter rev_reconfigure], [0], [dnl OVS_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([ofproto-dpif - malformed lldp autoattach tlv]) +OVS_VSWITCHD_START() +add_of_ports br0 1 + +dnl Enable lldp +AT_CHECK([ovs-vsctl set interface p1 lldp:enable=true]) + +dnl Send a malformed lldp packet +packet="0180c200000ef6b426aa5f0088cc020704f6b426aa5f000403057632060200780c"dnl +"5044454144424545464445414442454546444541444245454644454144424545464445414"dnl +"4424545464445414442454546444541444245454644454144424545464445414442454546"dnl +"4445414442454546fe0500040d0c010000" +AT_CHECK([ovs-appctl netdev-dummy/receive p1 "$packet"], [0], [stdout]) + +OVS_WAIT_UNTIL([grep -q "ISID_VLAN_ASGNS TLV too short" ovs-vswitchd.log]) + +OVS_VSWITCHD_STOP(["/|WARN|ISID_VLAN_ASGNS TLV too short received on/d"]) +AT_CLEANUP + AT_SETUP([ofproto-dpif - active-backup bonding (with primary)]) dnl Create br0 with members p1, p2 and p7, creating bond0 with p1 and From c1daeb4b41c48635032039cc556412c836d47c5d Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 20 Dec 2022 18:02:01 +0100 Subject: [PATCH 075/833] AUTHORS: Add Qian Chen. 
Signed-off-by: Ilya Maximets --- AUTHORS.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index 7bb4e41a05d..2df76c56f11 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -350,8 +350,9 @@ Pim van den Berg pim@nethuis.nl pritesh pritesh.kothari@cisco.com Pravin B Shelar pshelar@ovn.org Przemyslaw Szczerbik przemyslawx.szczerbik@intel.com -Quentin Monnet quentin.monnet@6wind.com +Qian Chen cq674350529@163.com Qiuyu Xiao qiuyu.xiao.qyx@gmail.com +Quentin Monnet quentin.monnet@6wind.com Raju Subramanian Rami Rosen ramirose@gmail.com Ramu Ramamurthy ramu.ramamurthy@us.ibm.com From a879beb4dbeed0376f12627cb7c6f71ba81bdb9e Mon Sep 17 00:00:00 2001 From: Emma Finn Date: Thu, 8 Dec 2022 16:01:23 +0000 Subject: [PATCH 076/833] odp-execute: Add ISA implementation of set_masked IPv6 action This commit adds support for the AVX512 implementation of the ipv6_set_addrs action as well as an AVX512 implementation of updating the L4 checksums. Here are some relative performance numbers for this patch: +-----------------------------+----------------+ | Actions | AVX with patch | +-----------------------------+----------------+ | ipv6_src | 1.14x | +-----------------------------+----------------+ | ipv6_src + ipv6_dst | 1.40x | +-----------------------------+----------------+ | ipv6_label | 1.14x | +-----------------------------+----------------+ | mod_ipv6 4 x field | 1.43x | +-----------------------------+----------------+ Signed-off-by: Emma Finn Acked-by: Eelco Chaudron Signed-off-by: Ian Stokes --- lib/odp-execute-avx512.c | 222 ++++++++++++++++++++++++++++++++++++++ lib/odp-execute-private.c | 14 +++ lib/odp-execute-private.h | 1 + lib/packets.c | 2 +- lib/packets.h | 2 + 5 files changed, 240 insertions(+), 1 deletion(-) diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c index 5207ece15d9..c28461ec1a0 100644 --- a/lib/odp-execute-avx512.c +++ b/lib/odp-execute-avx512.c @@ -20,6 +20,9 @@ #include #include +#include 
+#include +#include #include "csum.h" #include "dp-packet.h" @@ -28,6 +31,7 @@ #include "odp-execute-private.h" #include "odp-netlink.h" #include "openvswitch/vlog.h" +#include "packets.h" VLOG_DEFINE_THIS_MODULE(odp_execute_avx512); @@ -75,6 +79,26 @@ BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_tos) + MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_tos) == offsetof(struct ovs_key_ipv4, ipv4_ttl)); +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_src) + + MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_src) == + offsetof(struct ovs_key_ipv6, ipv6_dst)); + +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_dst) + + MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_dst) == + offsetof(struct ovs_key_ipv6, ipv6_label)); + +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_label) + + MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_label) == + offsetof(struct ovs_key_ipv6, ipv6_proto)); + +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_proto) + + MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_proto) == + offsetof(struct ovs_key_ipv6, ipv6_tclass)); + +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_tclass) + + MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_tclass) == + offsetof(struct ovs_key_ipv6, ipv6_hlimit)); + /* Array of callback functions, one for each masked operation. */ odp_execute_action_cb impl_set_masked_funcs[__OVS_KEY_ATTR_MAX]; @@ -487,6 +511,198 @@ action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch, } } +#if HAVE_AVX512VBMI +static inline uint16_t ALWAYS_INLINE +__attribute__((__target__("avx512vbmi"))) +avx512_ipv6_sum_header(__m512i ip6_header) +{ + __m256i v_zeros = _mm256_setzero_si256(); + __m512i v_shuf_src_dst = _mm512_setr_epi64(0x01, 0x02, 0x03, 0x04, + 0xFF, 0xFF, 0xFF, 0xFF); + + /* Shuffle ip6 src and dst to beginning of register. */ + __m512i v_ip6_hdr_shuf = _mm512_permutexvar_epi64(v_shuf_src_dst, + ip6_header); + + /* Extract ip6 src and dst into smaller 256-bit wide register. 
*/ + __m256i v_ip6_src_dst = _mm512_extracti64x4_epi64(v_ip6_hdr_shuf, 0); + + /* These two shuffle masks, v_swap16a and v_swap16b, are to shuffle the + * src and dst fields and add padding after each 16-bit value for the + * following carry over addition. */ + __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xFFFF, 0x0302, 0xFFFF, + 0x0504, 0xFFFF, 0x0706, 0xFFFF, + 0x0100, 0xFFFF, 0x0302, 0xFFFF, + 0x0504, 0xFFFF, 0x0706, 0xFFFF); + __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xFFFF, 0x0B0A, 0xFFFF, + 0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF, + 0x0908, 0xFFFF, 0x0B0A, 0xFFFF, + 0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF); + __m256i v_shuf_old1 = _mm256_shuffle_epi8(v_ip6_src_dst, v_swap16a); + __m256i v_shuf_old2 = _mm256_shuffle_epi8(v_ip6_src_dst, v_swap16b); + + /* Add each part of the old and new headers together. */ + __m256i v_delta = _mm256_add_epi32(v_shuf_old1, v_shuf_old2); + + /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */ + v_delta = _mm256_hadd_epi32(v_delta, v_zeros); + v_delta = _mm256_hadd_epi32(v_delta, v_zeros); + + /* Shuffle 32-bit value from 3rd lane into first lane for final + * horizontal add. */ + __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF, + 0xF, 0xF, 0xF, 0xF); + + v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta); + v_delta = _mm256_hadd_epi32(v_delta, v_zeros); + v_delta = _mm256_hadd_epi16(v_delta, v_zeros); + + /* Extract delta value. */ + return _mm256_extract_epi16(v_delta, 0); +} + +static inline uint16_t ALWAYS_INLINE +__attribute__((__target__("avx512vbmi"))) +avx512_ipv6_addr_csum_delta(__m512i old_header, __m512i new_header) +{ + uint16_t old_delta = avx512_ipv6_sum_header(old_header); + uint16_t new_delta = avx512_ipv6_sum_header(new_header); + uint32_t csum_delta = ((uint16_t) ~old_delta) + new_delta; + + return ~csum_finish(csum_delta); +} + +/* This function performs the same operation on each packet in the batch as + * the scalar odp_set_ipv6() function. 
*/ +static void +__attribute__((__target__("avx512vbmi"))) +action_avx512_set_ipv6(struct dp_packet_batch *batch, const struct nlattr *a) +{ + const struct ovs_key_ipv6 *key, *mask; + struct dp_packet *packet; + + a = nl_attr_get(a); + key = nl_attr_get(a); + mask = odp_get_key_mask(a, struct ovs_key_ipv6); + + /* Read the content of the key and mask in the respective registers. We + * only load the size of the actual structure, which is only 40 bytes. */ + __m512i v_key = _mm512_maskz_loadu_epi64(0x1F, (void *) key); + __m512i v_mask = _mm512_maskz_loadu_epi64(0x1F, (void *) mask); + + /* This shuffle mask v_shuffle, is to shuffle key and mask to match the + * ip6_hdr structure layout. */ + static const uint8_t ip_shuffle_mask[64] = { + 0x20, 0x21, 0x22, 0x23, 0xFF, 0xFF, 0x24, 0x26, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF + }; + + __m512i v_shuffle = _mm512_loadu_si512((void *) ip_shuffle_mask); + + /* This shuffle is required for key and mask to match the layout of the + * ip6_hdr struct. */ + __m512i v_key_shuf = _mm512_permutexvar_epi8(v_shuffle, v_key); + __m512i v_mask_shuf = _mm512_permutexvar_epi8(v_shuffle, v_mask); + + /* Set the v_zero register to all zero's. */ + const __m128i v_zeros = _mm_setzero_si128(); + + /* Set the v_all_ones register to all one's. */ + const __m128i v_all_ones = _mm_cmpeq_epi16(v_zeros, v_zeros); + + /* Load ip6 src and dst masks respectively into 128-bit wide registers. */ + __m128i v_src = _mm_loadu_si128((void *) &mask->ipv6_src); + __m128i v_dst = _mm_loadu_si128((void *) &mask->ipv6_dst); + + /* Perform a bitwise OR between src and dst registers. 
*/ + __m128i v_or = _mm_or_si128(v_src, v_dst); + + /* Will return true if any bit has been set in v_or, else it will return + * false. */ + bool do_checksum = !_mm_test_all_zeros(v_or, v_all_ones); + + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { + struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(packet); + + /* Load the 40 bytes of the IPv6 header. */ + __m512i v_packet = _mm512_maskz_loadu_epi64(0x1F, (void *) nh); + + /* AND the v_pkt_mask to the packet data (v_packet). */ + __m512i v_pkt_masked = _mm512_andnot_si512(v_mask_shuf, v_packet); + + /* OR the new addresses (v_key_shuf) with the masked packet addresses + * (v_pkt_masked). */ + __m512i v_new_hdr = _mm512_or_si512(v_key_shuf, v_pkt_masked); + + /* If ip6_src or ip6_dst has been modified, L4 checksum needs to be + * updated. */ + uint8_t proto = 0; + bool rh_present; + bool do_csum = do_checksum; + + rh_present = packet_rh_present(packet, &proto, &do_csum); + + if (do_csum) { + size_t l4_size = dp_packet_l4_size(packet); + __m512i v_new_hdr_for_cksum = v_new_hdr; + uint16_t delta_checksum; + + /* In case of routing header being present, checksum should not be + * updated for the destination address. 
*/ + if (rh_present) { + v_new_hdr_for_cksum = _mm512_mask_blend_epi64(0x18, v_new_hdr, + v_packet); + } + + delta_checksum = avx512_ipv6_addr_csum_delta(v_packet, + v_new_hdr_for_cksum); + + if (proto == IPPROTO_UDP && l4_size >= UDP_HEADER_LEN) { + struct udp_header *uh = dp_packet_l4(packet); + + if (uh->udp_csum) { + uint16_t old_udp_checksum = ~uh->udp_csum; + uint32_t udp_checksum = old_udp_checksum + delta_checksum; + + udp_checksum = csum_finish(udp_checksum); + + if (!udp_checksum) { + udp_checksum = htons(0xffff); + } + + uh->udp_csum = udp_checksum; + } + } else if (proto == IPPROTO_TCP && l4_size >= TCP_HEADER_LEN) { + struct tcp_header *th = dp_packet_l4(packet); + uint16_t old_tcp_checksum = ~th->tcp_csum; + uint32_t tcp_checksum = old_tcp_checksum + delta_checksum; + + tcp_checksum = csum_finish(tcp_checksum); + th->tcp_csum = tcp_checksum; + } else if (proto == IPPROTO_ICMPV6 && + l4_size >= sizeof(struct icmp6_header)) { + struct icmp6_header *icmp = dp_packet_l4(packet); + uint16_t old_icmp6_checksum = ~icmp->icmp6_cksum; + uint32_t icmp6_checksum = old_icmp6_checksum + delta_checksum; + + icmp6_checksum = csum_finish(icmp6_checksum); + icmp->icmp6_cksum = icmp6_checksum; + } + + pkt_metadata_init_conn(&packet->md); + } + /* Write back the modified IPv6 addresses. 
*/ + _mm512_mask_storeu_epi64((void *) nh, 0x1F, v_new_hdr); + } +} +#endif /* HAVE_AVX512VBMI */ + static void action_avx512_set_masked(struct dp_packet_batch *batch, const struct nlattr *a) { @@ -518,6 +734,12 @@ action_avx512_init(struct odp_execute_action_impl *self OVS_UNUSED) impl_set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = action_avx512_eth_set_addrs; impl_set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_avx512_ipv4_set_addrs; +#if HAVE_AVX512VBMI + if (action_avx512vbmi_isa_probe()) { + impl_set_masked_funcs[OVS_KEY_ATTR_IPV6] = action_avx512_set_ipv6; + } +#endif + return 0; } diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c index 57be5cfe75a..8b7a6b4ab0e 100644 --- a/lib/odp-execute-private.c +++ b/lib/odp-execute-private.c @@ -60,6 +60,20 @@ action_avx512_isa_probe(void) #endif +#if ACTION_IMPL_AVX512_CHECK && HAVE_AVX512VBMI +bool +action_avx512vbmi_isa_probe(void) +{ + return cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI); +} +#else +bool +action_avx512vbmi_isa_probe(void) +{ + return false; +} +#endif + static struct odp_execute_action_impl action_impls[] = { [ACTION_IMPL_AUTOVALIDATOR] = { .available = false, diff --git a/lib/odp-execute-private.h b/lib/odp-execute-private.h index 940180c99f9..643f41c2a61 100644 --- a/lib/odp-execute-private.h +++ b/lib/odp-execute-private.h @@ -78,6 +78,7 @@ BUILD_ASSERT_DECL(ACTION_IMPL_AUTOVALIDATOR == 1); #define ACTION_IMPL_BEGIN (ACTION_IMPL_AUTOVALIDATOR + 1) bool action_avx512_isa_probe(void); +bool action_avx512vbmi_isa_probe(void); /* Odp execute init handles setting up the state of the actions functions at * initialization time. It cannot return errors, as it must always succeed in diff --git a/lib/packets.c b/lib/packets.c index 1dcd4a6fcd2..06f516cb1af 100644 --- a/lib/packets.c +++ b/lib/packets.c @@ -1152,7 +1152,7 @@ packet_set_ipv4_addr(struct dp_packet *packet, * segements_left > 0. * * This function assumes that L3 and L4 offsets are set in the packet. 
*/ -static bool +bool packet_rh_present(struct dp_packet *packet, uint8_t *nexthdr, bool *first_frag) { const struct ovs_16aligned_ip6_hdr *nh; diff --git a/lib/packets.h b/lib/packets.h index 5bdf6e4bbd9..8626aac8d53 100644 --- a/lib/packets.h +++ b/lib/packets.h @@ -1642,6 +1642,8 @@ void packet_put_ra_prefix_opt(struct dp_packet *, ovs_be32 preferred_lifetime, const ovs_be128 router_prefix); uint32_t packet_csum_pseudoheader(const struct ip_header *); +bool packet_rh_present(struct dp_packet *packet, uint8_t *nexthdr, + bool *first_frag); void IP_ECN_set_ce(struct dp_packet *pkt, bool is_ipv6); #define DNS_HEADER_LEN 12 From 363cc26839ed4587640620e80a226bf739f2257f Mon Sep 17 00:00:00 2001 From: Cian Ferriter Date: Fri, 16 Sep 2022 10:12:04 +0000 Subject: [PATCH 077/833] dpif-netdev/dpcls: Specialize 8, 1 and 5, 2 signatures. The subtable signatures being specialized here were found in an NVGRE tunnel scenario. Signed-off-by: Cian Ferriter Acked-by: Sunil Pai G Acked-by: Eelco Chaudron Signed-off-by: Ian Stokes --- lib/dpif-netdev-lookup-avx512-gather.c | 4 ++++ lib/dpif-netdev-lookup-generic.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/lib/dpif-netdev-lookup-avx512-gather.c b/lib/dpif-netdev-lookup-avx512-gather.c index 7d3d81151f1..b916b24875e 100644 --- a/lib/dpif-netdev-lookup-avx512-gather.c +++ b/lib/dpif-netdev-lookup-avx512-gather.c @@ -380,7 +380,9 @@ avx512_lookup_impl(struct dpcls_subtable *subtable, DECLARE_OPTIMIZED_LOOKUP_FUNCTION(9, 4) DECLARE_OPTIMIZED_LOOKUP_FUNCTION(9, 1) +DECLARE_OPTIMIZED_LOOKUP_FUNCTION(8, 1) DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 3) +DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 2) DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 1) DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 1) DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 0) @@ -419,7 +421,9 @@ dpcls_subtable_avx512_gather_probe__(uint32_t u0_bits, uint32_t u1_bits, CHECK_LOOKUP_FUNCTION(9, 4, use_vpop); CHECK_LOOKUP_FUNCTION(9, 1, use_vpop); + CHECK_LOOKUP_FUNCTION(8, 1, use_vpop); 
CHECK_LOOKUP_FUNCTION(5, 3, use_vpop); + CHECK_LOOKUP_FUNCTION(5, 2, use_vpop); CHECK_LOOKUP_FUNCTION(5, 1, use_vpop); CHECK_LOOKUP_FUNCTION(4, 1, use_vpop); CHECK_LOOKUP_FUNCTION(4, 0, use_vpop); diff --git a/lib/dpif-netdev-lookup-generic.c b/lib/dpif-netdev-lookup-generic.c index 6c74ac3a1b7..76f92dd5e69 100644 --- a/lib/dpif-netdev-lookup-generic.c +++ b/lib/dpif-netdev-lookup-generic.c @@ -284,7 +284,9 @@ dpcls_subtable_lookup_generic(struct dpcls_subtable *subtable, DECLARE_OPTIMIZED_LOOKUP_FUNCTION(9, 4) DECLARE_OPTIMIZED_LOOKUP_FUNCTION(9, 1) +DECLARE_OPTIMIZED_LOOKUP_FUNCTION(8, 1) DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 3) +DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 2) DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 1) DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 1) DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 0) @@ -308,7 +310,9 @@ dpcls_subtable_generic_probe(uint32_t u0_bits, uint32_t u1_bits) CHECK_LOOKUP_FUNCTION(9, 4); CHECK_LOOKUP_FUNCTION(9, 1); + CHECK_LOOKUP_FUNCTION(8, 1); CHECK_LOOKUP_FUNCTION(5, 3); + CHECK_LOOKUP_FUNCTION(5, 2); CHECK_LOOKUP_FUNCTION(5, 1); CHECK_LOOKUP_FUNCTION(4, 1); CHECK_LOOKUP_FUNCTION(4, 0); From 9855f35dd219f48ea274500a83bf27d63f679cc5 Mon Sep 17 00:00:00 2001 From: Cian Ferriter Date: Fri, 16 Sep 2022 10:12:05 +0000 Subject: [PATCH 078/833] dpif-netdev/mfex: Add AVX512 NVGRE traffic profiles. A typical NVGRE encapsulated packet starts with the ETH/IP/GRE protocols. Miniflow extract will parse just the ETH and IP headers. The GRE header will be processed later as part of the pop action. Add support for parsing the ETH/IP headers in this scenario. 
Signed-off-by: Cian Ferriter Acked-by: Sunil Pai G Acked-by: Eelco Chaudron Signed-off-by: Ian Stokes --- lib/dp-packet.h | 59 +++++++++++++++++++++++-------- lib/dpif-netdev-extract-avx512.c | 43 ++++++++++++++++++++-- lib/dpif-netdev-private-extract.c | 10 ++++++ lib/dpif-netdev-private-extract.h | 5 +++ 4 files changed, 101 insertions(+), 16 deletions(-) diff --git a/lib/dp-packet.h b/lib/dp-packet.h index a8ea5b40f71..ed1e5b3f6d1 100644 --- a/lib/dp-packet.h +++ b/lib/dp-packet.h @@ -1087,8 +1087,29 @@ dp_packet_l4_checksum_bad(const struct dp_packet *p) DP_PACKET_OL_RX_L4_CKSUM_BAD; } +static inline uint32_t ALWAYS_INLINE +dp_packet_calc_hash_ipv4(const uint8_t *pkt, const uint16_t l3_ofs, + uint32_t hash) +{ + const void *ipv4_src = &pkt[l3_ofs + offsetof(struct ip_header, ip_src)]; + const void *ipv4_dst = &pkt[l3_ofs + offsetof(struct ip_header, ip_dst)]; + uint32_t ip_src, ip_dst; + + memcpy(&ip_src, ipv4_src, sizeof ip_src); + memcpy(&ip_dst, ipv4_dst, sizeof ip_dst); + + /* IPv4 Src and Dst. */ + hash = hash_add(hash, ip_src); + hash = hash_add(hash, ip_dst); + + /* IPv4 proto. */ + hash = hash_add(hash, pkt[l3_ofs + offsetof(struct ip_header, ip_proto)]); + + return hash; +} + static inline void ALWAYS_INLINE -dp_packet_update_rss_hash_ipv4_tcp_udp(struct dp_packet *packet) +dp_packet_update_rss_hash_ipv4(struct dp_packet *packet) { if (dp_packet_rss_valid(packet)) { return; @@ -1096,26 +1117,36 @@ dp_packet_update_rss_hash_ipv4_tcp_udp(struct dp_packet *packet) const uint8_t *pkt = dp_packet_data(packet); const uint16_t l3_ofs = packet->l3_ofs; - const void *ipv4_src = &pkt[l3_ofs + offsetof(struct ip_header, ip_src)]; - const void *ipv4_dst = &pkt[l3_ofs + offsetof(struct ip_header, ip_dst)]; + uint32_t hash = 0; + + /* IPv4 Src, Dst and proto. 
*/ + hash = dp_packet_calc_hash_ipv4(pkt, l3_ofs, hash); + + hash = hash_finish(hash, 42); + dp_packet_set_rss_hash(packet, hash); +} + +static inline void ALWAYS_INLINE +dp_packet_update_rss_hash_ipv4_tcp_udp(struct dp_packet *packet) +{ + if (dp_packet_rss_valid(packet)) { + return; + } + + const uint8_t *pkt = dp_packet_data(packet); const void *l4_ports = &pkt[packet->l4_ofs]; - uint32_t ip_src, ip_dst, ports; + const uint16_t l3_ofs = packet->l3_ofs; uint32_t hash = 0; + uint32_t ports; - memcpy(&ip_src, ipv4_src, sizeof ip_src); - memcpy(&ip_dst, ipv4_dst, sizeof ip_dst); - memcpy(&ports, l4_ports, sizeof ports); + /* IPv4 Src, Dst and proto. */ + hash = dp_packet_calc_hash_ipv4(pkt, l3_ofs, hash); - /* IPv4 Src and Dst. */ - hash = hash_add(hash, ip_src); - hash = hash_add(hash, ip_dst); - /* IPv4 proto. */ - hash = hash_add(hash, - pkt[l3_ofs + offsetof(struct ip_header, ip_proto)]); /* L4 ports. */ + memcpy(&ports, l4_ports, sizeof ports); hash = hash_add(hash, ports); - hash = hash_finish(hash, 42); + hash = hash_finish(hash, 42); dp_packet_set_rss_hash(packet, hash); } diff --git a/lib/dpif-netdev-extract-avx512.c b/lib/dpif-netdev-extract-avx512.c index 4afbed97eac..968845f2d3b 100644 --- a/lib/dpif-netdev-extract-avx512.c +++ b/lib/dpif-netdev-extract-avx512.c @@ -194,6 +194,7 @@ _mm512_maskz_permutexvar_epi8_selector(__mmask64 k_shuf, __m512i v_shuf, #define PATTERN_IPV4_MASK PATTERN_IPV4_GEN(0xFF, 0xBF, 0xFF, 0xFF) #define PATTERN_IPV4_UDP PATTERN_IPV4_GEN(0x45, 0, 0, 0x11) #define PATTERN_IPV4_TCP PATTERN_IPV4_GEN(0x45, 0, 0, 0x06) +#define PATTERN_IPV4_NVGRE PATTERN_IPV4_GEN(0x45, 0, 0, 0x2f) #define PATTERN_TCP_GEN(data_offset) \ 0, 0, 0, 0, /* sport, dport */ \ @@ -218,6 +219,12 @@ _mm512_maskz_permutexvar_epi8_selector(__mmask64 k_shuf, __m512i v_shuf, NU, NU, NU, NU, NU, NU, NU, NU, 34, 35, 36, 37, NU, NU, NU, NU, /* TCP */ \ NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, /* Unused. 
*/ +#define PATTERN_IPV4_NVGRE_SHUFFLE \ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, NU, NU, /* Ether */ \ + 26, 27, 28, 29, 30, 31, 32, 33, NU, NU, NU, NU, 20, 15, 22, 23, /* IPv4 */ \ + NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, /* Unused */\ + NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, /* Unused */ + #define PATTERN_DT1Q_IPV4_UDP_SHUFFLE \ /* Ether (2 blocks): Note that *VLAN* type is written here. */ \ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 0, 0, \ @@ -286,6 +293,9 @@ _mm512_maskz_permutexvar_epi8_selector(__mmask64 k_shuf, __m512i v_shuf, #define KMASK_DT1Q_IPV6 0xFF0FULL #define KMASK_IPV6_NOHDR 0x00FFULL +#define PATTERN_IPV4_KMASK \ + (KMASK_ETHER | (KMASK_IPV4 << 16)) + #define PATTERN_IPV4_UDP_KMASK \ (KMASK_ETHER | (KMASK_IPV4 << 16) | (KMASK_UDP << 32)) @@ -332,6 +342,7 @@ _mm512_maskz_permutexvar_epi8_selector(__mmask64 k_shuf, __m512i v_shuf, #define PKT_OFFSET_VLAN_IPV6_L4 (PKT_OFFSET_VLAN_L3 + IPV6_HEADER_LEN) #define PKT_OFFSET_IPV6_L4 (PKT_OFFSET_L3 + IPV6_HEADER_LEN) +#define PKT_MIN_ETH_IPV4 (ETH_HEADER_LEN + IP_HEADER_LEN) #define PKT_MIN_ETH_IPV4_UDP (PKT_OFFSET_IPV4_L4 + UDP_HEADER_LEN) #define PKT_MIN_ETH_VLAN_IPV4_UDP (PKT_OFFSET_VLAN_IPV4_L4 + UDP_HEADER_LEN) #define PKT_MIN_ETH_IPV4_TCP (PKT_OFFSET_IPV4_L4 + TCP_HEADER_LEN) @@ -352,8 +363,8 @@ _mm512_maskz_permutexvar_epi8_selector(__mmask64 k_shuf, __m512i v_shuf, | MF_BIT(dl_dst) | MF_BIT(dl_src)| MF_BIT(dl_type)) #define MF_ETH_VLAN (MF_ETH | MF_BIT(vlans)) -#define MF_IPV4_UDP (MF_BIT(nw_src) | MF_BIT(ipv6_label) | MF_BIT(tp_src) | \ - MF_BIT(tp_dst)) +#define MF_IPV4 (MF_BIT(nw_src) | MF_BIT(ipv6_label)) +#define MF_IPV4_UDP (MF_IPV4 | MF_BIT(tp_src) | MF_BIT(tp_dst)) #define MF_IPV4_TCP (MF_IPV4_UDP | MF_BIT(tcp_flags) | MF_BIT(arp_tha.ea[2])) #define MF_IPV6_UDP (MF_BIT(ipv6_label) | MF_WORD(ipv6_src, 2) | \ @@ -449,6 +460,7 @@ enum MFEX_PROFILES { PROFILE_ETH_IPV6_TCP, PROFILE_ETH_VLAN_IPV6_TCP, PROFILE_ETH_VLAN_IPV6_UDP, + 
PROFILE_ETH_IPV4_NVGRE, PROFILE_COUNT, }; @@ -608,6 +620,21 @@ static const struct mfex_profile mfex_profiles[PROFILE_COUNT] = }, .dp_pkt_min_size = PKT_MIN_ETH_VLAN_IPV6_UDP, }, + + [PROFILE_ETH_IPV4_NVGRE] = { + .probe_mask.u8_data = { PATTERN_ETHERTYPE_MASK PATTERN_IPV4_MASK }, + .probe_data.u8_data = { PATTERN_ETHERTYPE_IPV4 PATTERN_IPV4_NVGRE}, + + .store_shuf.u8_data = { PATTERN_IPV4_NVGRE_SHUFFLE }, + .strip_mask.u8_data = { PATTERN_STRIP_IPV4_MASK }, + .store_kmsk = PATTERN_IPV4_KMASK, + + .mf_bits = { MF_ETH, MF_IPV4}, + .dp_pkt_offs = { + 0, UINT16_MAX, PKT_OFFSET_L3, PKT_OFFSET_IPV4_L4, + }, + .dp_pkt_min_size = PKT_MIN_ETH_IPV4, + }, }; /* IPv6 header helper function to fix TC, flow label and next header. */ @@ -959,6 +986,17 @@ mfex_avx512_process(struct dp_packet_batch *packets, mfex_handle_ipv6_l4((void *)&pkt[58], &blocks[10]); dp_packet_update_rss_hash_ipv6_tcp_udp(packet); } break; + + case PROFILE_ETH_IPV4_NVGRE: { + /* Handle dynamic l2_pad_size. */ + uint32_t size_from_ipv4 = size - sizeof(struct eth_header); + struct ip_header *nh = (void *)&pkt[sizeof(struct eth_header)]; + if (mfex_ipv4_set_l2_pad_size(packet, nh, size_from_ipv4, 0)) { + continue; + } + dp_packet_update_rss_hash_ipv4(packet); + } break; + default: break; }; @@ -1013,6 +1051,7 @@ DECLARE_MFEX_FUNC(ipv6_udp, PROFILE_ETH_IPV6_UDP) DECLARE_MFEX_FUNC(ipv6_tcp, PROFILE_ETH_IPV6_TCP) DECLARE_MFEX_FUNC(dot1q_ipv6_tcp, PROFILE_ETH_VLAN_IPV6_TCP) DECLARE_MFEX_FUNC(dot1q_ipv6_udp, PROFILE_ETH_VLAN_IPV6_UDP) +DECLARE_MFEX_FUNC(ip_nvgre, PROFILE_ETH_IPV4_NVGRE) #endif /* __CHECKER__ */ #endif /* __x86_64__ */ diff --git a/lib/dpif-netdev-private-extract.c b/lib/dpif-netdev-private-extract.c index 1a9b354201a..ded08fd3ef2 100644 --- a/lib/dpif-netdev-private-extract.c +++ b/lib/dpif-netdev-private-extract.c @@ -184,6 +184,16 @@ static struct dpif_miniflow_extract_impl mfex_impls[] = { .extract_func = mfex_avx512_dot1q_ipv6_udp, .name = "avx512_dot1q_ipv6_udp", }, +#if HAVE_AVX512VBMI + 
[MFEX_IMPL_VBMI_IPv4_NVGRE] = { + .probe = mfex_avx512_vbmi_probe, + .extract_func = mfex_avx512_vbmi_ip_nvgre, + .name = "avx512_vbmi_ipv4_nvgre", }, +#endif + [MFEX_IMPL_IPv4_NVGRE] = { + .probe = mfex_avx512_probe, + .extract_func = mfex_avx512_ip_nvgre, + .name = "avx512_ipv4_nvgre", }, #endif }; diff --git a/lib/dpif-netdev-private-extract.h b/lib/dpif-netdev-private-extract.h index 8a7f9b01aff..48549beaa0e 100644 --- a/lib/dpif-netdev-private-extract.h +++ b/lib/dpif-netdev-private-extract.h @@ -117,6 +117,10 @@ enum dpif_miniflow_extract_impl_idx { MFEX_IMPL_VBMI_DOT1Q_IPv6_UDP, #endif MFEX_IMPL_DOT1Q_IPv6_UDP, +#if HAVE_AVX512VBMI + MFEX_IMPL_VBMI_IPv4_NVGRE, +#endif + MFEX_IMPL_IPv4_NVGRE, #endif MFEX_IMPL_MAX }; @@ -230,6 +234,7 @@ DECLARE_AVX512_MFEX_PROTOTYPE(ipv6_udp); DECLARE_AVX512_MFEX_PROTOTYPE(ipv6_tcp); DECLARE_AVX512_MFEX_PROTOTYPE(dot1q_ipv6_tcp); DECLARE_AVX512_MFEX_PROTOTYPE(dot1q_ipv6_udp); +DECLARE_AVX512_MFEX_PROTOTYPE(ip_nvgre); #endif /* __x86_64__ */ From c627cfd9cb630c052285a540cd65dd809be0ea95 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Mon, 19 Dec 2022 17:13:42 +0100 Subject: [PATCH 079/833] python: Fix datapath flow decoders. 
Fix the following errors in odp decoding: - Missing push_mpls action - Typos in collector_set_id, tp_src/tp_dst and csum - Missing two fields in vxlan match Signed-off-by: Adrian Moreno Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- python/ovs/flow/odp.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/python/ovs/flow/odp.py b/python/ovs/flow/odp.py index 87a3bae2f9a..3bc3aec8e00 100644 --- a/python/ovs/flow/odp.py +++ b/python/ovs/flow/odp.py @@ -225,7 +225,7 @@ def _action_decoders_args(): KVDecoders( { "probability": decode_int, - "collector_sed_id": decode_int, + "collector_set_id": decode_int, "obs_domain_id": decode_int, "obs_point_id": decode_int, "output_port": decode_default, @@ -303,6 +303,21 @@ def _action_decoders_args(): ), "pop_nsh": decode_flag, "tnl_pop": decode_int, + "pop_mpls": KVDecoders({"eth_type": decode_int}), + **dict.fromkeys( + ["push_mpls", "add_mpls"], + nested_kv_decoder( + KVDecoders( + { + "label": decode_int, + "tc": decode_int, + "ttl": decode_int, + "bos": decode_int, + "eth_type": decode_int, + } + ) + ), + ), "ct_clear": decode_flag, "ct": nested_kv_decoder( KVDecoders( @@ -412,7 +427,7 @@ def _tnl_action_decoder_args(): { "src": decode_int, "dst": decode_int, - "dsum": Mask16, + "csum": Mask16, } ) ), @@ -499,8 +514,8 @@ def _field_decoders_args(): "src": IPMask, "dst": IPMask, "proto": Mask8, - "tcp_src": Mask16, - "tcp_dst": Mask16, + "tp_src": Mask16, + "tp_dst": Mask16, } ) ), @@ -541,6 +556,8 @@ def _field_decoders_args(): "vxlan": nested_kv_decoder( KVDecoders( { + "flags": decode_int, + "vni": decode_int, "gbp": nested_kv_decoder( KVDecoders( { @@ -548,7 +565,7 @@ def _field_decoders_args(): "flags": Mask8, } ) - ) + ), } ) ), From 3648fec08f15b3f2cc37cd4b85eaccb773d1f444 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Mon, 19 Dec 2022 17:13:43 +0100 Subject: [PATCH 080/833] python: Include aliases in ofp_fields.py.
We currently auto-generate a dictionary of field names and decoders. However, sometimes fields can be specified by their canonical NXM or OXM names. Modify gen_ofp_field_decoders to also generate a dictionary of aliases so it's easy to map OXM/NXM names to their fields and decoding information. Signed-off-by: Adrian Moreno Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- build-aux/gen_ofp_field_decoders | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/build-aux/gen_ofp_field_decoders b/build-aux/gen_ofp_field_decoders index 96f99e860f7..0b797ee8c8c 100755 --- a/build-aux/gen_ofp_field_decoders +++ b/build-aux/gen_ofp_field_decoders @@ -22,12 +22,16 @@ def main(): fields = extract_fields.extract_ofp_fields(args.metaflow) field_decoders = {} + aliases = {} for field in fields: decoder = get_decoder(field) field_decoders[field.get("name")] = decoder if field.get("extra_name"): field_decoders[field.get("extra_name")] = decoder + for nxm in field.get("OXM", []): + aliases[nxm[1]] = field.get("name") + code = """ # This file is auto-generated. Do not edit! @@ -35,14 +39,25 @@ from ovs.flow import decoders field_decoders = {{ {decoders} +}} + +field_aliases = {{ +{aliases} }}""".format( decoders="\n".join( [ " '{name}': {decoder},".format(name=name, decoder=decoder) for name, decoder in field_decoders.items() ] + ), + aliases="\n".join( + [ + " '{alias}': '{name}',".format(name=name, alias=alias) + for alias, name in aliases.items() + ] ) ) + print(code) From fe204743cbc609dc5dfefd1437fc058b7ad3ca52 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Mon, 19 Dec 2022 17:13:44 +0100 Subject: [PATCH 081/833] python: Add explicit decoders for all ofp actions. We were silently relying on some ofp actions to be decoded by the default decoder which would yield decent string values. In order to be more safe and robust, add an explicit decoder for all missing actions.
This patch also reworks the learn action decoding to make it more explicit and verify all the fields specified in the learn action are actually valid fields. Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- python/ovs/flow/kv.py | 13 +++--- python/ovs/flow/ofp.py | 50 ++++++++++++++------- python/ovs/flow/ofp_act.py | 85 +++++++++++++++++++++++++----------- python/ovs/tests/test_ofp.py | 6 +-- 4 files changed, 105 insertions(+), 49 deletions(-) diff --git a/python/ovs/flow/kv.py b/python/ovs/flow/kv.py index cceb95e4387..383d7ee7878 100644 --- a/python/ovs/flow/kv.py +++ b/python/ovs/flow/kv.py @@ -87,10 +87,11 @@ class KVDecoders(object): Args: decoders (dict): Optional; A dictionary of decoders indexed by keyword. - default (callable): Optional; A decoder used if a match is not found in - configured decoders. If not provided, the default behavior is to - try to decode the value into an integer and, if that fails, - just return the string as-is. + default (callable): Optional; A function to use if a match is not + found in configured decoders. If not provided, the default behavior + is to try to decode the value into an integer and, if that fails, + just return the string as-is. The function must accept a the key + and the value and return the decoded (key, value) tuple back. 
default_free (callable): Optional; The decoder used if a match is not found in configured decoders and it's a free value (e.g: a value without a key) Defaults to returning the free value as @@ -100,7 +101,7 @@ class KVDecoders(object): def __init__(self, decoders=None, default=None, default_free=None): self._decoders = decoders or dict() - self._default = default or decode_default + self._default = default or (lambda k, v: (k, decode_default(v))) self._default_free = default_free or self._default_free_decoder def decode(self, keyword, value_str): @@ -126,7 +127,7 @@ def decode(self, keyword, value_str): return keyword, value else: if value_str: - return keyword, self._default(value_str) + return self._default(keyword, value_str) else: return self._default_free(keyword) diff --git a/python/ovs/flow/ofp.py b/python/ovs/flow/ofp.py index 0bc110c576e..3d79ed6ad77 100644 --- a/python/ovs/flow/ofp.py +++ b/python/ovs/flow/ofp.py @@ -243,6 +243,7 @@ def _gen_action_decoders(): **OFPFlow._fw_action_decoders_args(), **OFPFlow._control_action_decoders_args(), **OFPFlow._other_action_decoders_args(), + **OFPFlow._instruction_action_decoders_args(), } clone_actions = OFPFlow._clone_actions_decoders_args(actions) actions.update(clone_actions) @@ -272,6 +273,8 @@ def _encap_actions_decoders_args(): "pop_vlan": decode_flag, "strip_vlan": decode_flag, "push_vlan": decode_default, + "pop_mpls": decode_int, + "push_mpls": decode_int, "decap": decode_flag, "encap": decode_encap, } @@ -286,8 +289,8 @@ def _field_action_decoders_args(): "set_mpls_ttl", "mod_nw_tos", "mod_nw_ecn", - "mod_tcp_src", - "mod_tcp_dst", + "mod_tp_src", + "mod_tp_dst", ] return { "load": decode_load_field, @@ -299,9 +302,15 @@ def _field_action_decoders_args(): "mod_dl_src": EthMask, "mod_nw_dst": IPMask, "mod_nw_src": IPMask, + "mod_nw_ttl": decode_int, + "mod_vlan_vid": decode_int, + "set_vlan_vid": decode_int, + "mod_vlan_pcp": decode_int, + "set_vlan_pcp": decode_int, "dec_ttl": decode_dec_ttl, 
"dec_mpls_ttl": decode_flag, "dec_nsh_ttl": decode_flag, + "delete_field": decode_field, "check_pkt_larger": decode_chk_pkt_larger, **{field: decode_default for field in field_default_decoders}, } @@ -342,6 +351,14 @@ def _fw_action_decoders_args(): ) ), "ct_clear": decode_flag, + "fin_timeout": nested_kv_decoder( + KVDecoders( + { + "idle_timeout": decode_time, + "hard_timeout": decode_time, + } + ) + ), } @staticmethod @@ -382,22 +399,13 @@ def _clone_actions_decoders_args(action_decoders): actions. """ return { - "learn": decode_learn( - { - **action_decoders, - "fin_timeout": nested_kv_decoder( - KVDecoders( - { - "idle_timeout": decode_time, - "hard_timeout": decode_time, - } - ) - ), - } - ), + "learn": decode_learn(action_decoders), "clone": functools.partial( decode_exec, KVDecoders(action_decoders) ), + "write_actions": functools.partial( + decode_exec, KVDecoders(action_decoders) + ), } @staticmethod @@ -426,3 +434,15 @@ def _other_action_decoders_args(): ) ), } + + @staticmethod + def _instruction_action_decoders_args(): + """Generate the decoder arguments for instruction actions + (see man(7) ovs-actions).""" + return { + "meter": decode_int, + "clear_actions": decode_flag, + # write_actions moved to _clone actions + "write_metadata": decode_mask(64), + "goto_table": decode_int, + } diff --git a/python/ovs/flow/ofp_act.py b/python/ovs/flow/ofp_act.py index acb16cd9a62..c481d6fc721 100644 --- a/python/ovs/flow/ofp_act.py +++ b/python/ovs/flow/ofp_act.py @@ -9,9 +9,15 @@ decode_flag, decode_int, ) -from ovs.flow.kv import nested_kv_decoder, KVDecoders, KeyValue, KVParser +from ovs.flow.kv import ( + nested_kv_decoder, + KVDecoders, + KeyValue, + KVParser, + ParseError, +) from ovs.flow.list import nested_list_decoder, ListDecoders -from ovs.flow.ofp_fields import field_decoders +from ovs.flow.ofp_fields import field_decoders, field_aliases def decode_output(value): @@ -20,7 +26,9 @@ def decode_output(value): Does not support field specification. 
""" if len(value.split(",")) > 1: - return nested_kv_decoder()(value) + return nested_kv_decoder( + KVDecoders({"port": decode_default, "max_len": decode_int}) + )(value) try: return {"port": int(value)} except ValueError: @@ -41,7 +49,17 @@ def decode_controller(value): except ValueError: pass # controller(key[=val], ...) - return nested_kv_decoder()(value) + return nested_kv_decoder( + KVDecoders( + { + "max_len": decode_int, + "reason": decode_default, + "id": decode_int, + "userdata": decode_default, + "pause": decode_flag, + } + ) + )(value) def decode_bundle_load(value): @@ -141,6 +159,12 @@ def decode_field(value): man page: http://www.openvswitch.org/support/dist-docs/ovs-actions.7.txt.""" parts = value.strip("]\n\r").split("[") + if ( + parts[0] not in field_decoders.keys() + and parts[0] not in field_aliases.keys() + ): + raise ParseError("Field not supported: {}".format(parts[0])) + result = { "field": parts[0], } @@ -269,31 +293,36 @@ def decode_learn(action_decoders): action decoding. """ - def decode_learn_field(decoder, value): - """Generates a decoder to be used for the 'field' argument of the - 'learn' action. - - The field can hold a value that should be decoded, either as a field, - or as a the value (see man(7) ovs-actions). - - Args: - decoder (callable): The decoder. + def learn_field_decoding_kv(key, value): + """Decodes a key, value pair from the learn action. + The key must be a decodable field. The value can be either a value + in the format defined for the field or another field. 
""" - if value in field_decoders.keys(): - # It's a field - return value - else: - return decoder(value) - - learn_field_decoders = { - field: functools.partial(decode_learn_field, decoder) - for field, decoder in field_decoders.items() - } + key_field = decode_field(key) + try: + return key, decode_field(value) + except ParseError: + return key, field_decoders.get(key_field.get("field"))(value) + + def learn_field_decoding_free(key): + """Decodes the free fields found in the learn action. + Free fields indicate that the filed is to be copied from the original. + In order to express that in a dictionary, return the fieldspec as + value. So, the free fild NXM_OF_IP_SRC[], is encoded as: + "NXM_OF_IP_SRC[]": { + "field": "NXM_OF_IP_SRC" + } + That way we also ensure the actual free key is correct. + """ + key_field = decode_field(key) + return key, key_field + learn_decoders = { **action_decoders, - **learn_field_decoders, "idle_timeout": decode_time, "hard_timeout": decode_time, + "fin_idle_timeout": decode_time, + "fin_hard_timeout": decode_time, "priority": decode_int, "cookie": decode_int, "send_flow_rem": decode_flag, @@ -303,4 +332,10 @@ def decode_learn_field(decoder, value): "result_dst": decode_field, } - return functools.partial(decode_exec, KVDecoders(learn_decoders)) + learn_decoder = KVDecoders( + learn_decoders, + default=learn_field_decoding_kv, + default_free=learn_field_decoding_free, + ) + + return functools.partial(decode_exec, learn_decoder) diff --git a/python/ovs/tests/test_ofp.py b/python/ovs/tests/test_ofp.py index 7a93b2fd453..389c4544a2e 100644 --- a/python/ovs/tests/test_ofp.py +++ b/python/ovs/tests/test_ofp.py @@ -331,12 +331,12 @@ {"table": 69}, {"delete_learned": True}, {"cookie": 3664728752}, - {"OXM_OF_METADATA[]": True}, + {"OXM_OF_METADATA[]": {"field": "OXM_OF_METADATA"}}, {"eth_type": 2048}, - {"NXM_OF_IP_SRC[]": True}, + {"NXM_OF_IP_SRC[]": {"field": "NXM_OF_IP_SRC"}}, {"ip_dst": IPMask("172.30.204.105/32")}, {"nw_proto": 6}, - 
{"NXM_OF_TCP_SRC[]": "NXM_OF_TCP_DST[]"}, + {"NXM_OF_TCP_SRC[]": {"field": "NXM_OF_TCP_DST"}}, { "load": { "value": 1, From d33e548fc7d7ae03cfeba8b70ba84b5b998beca8 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Mon, 19 Dec 2022 17:13:45 +0100 Subject: [PATCH 082/833] python: Make key-value matching strict by default. Currently, if a key is not found in the decoder information, we use the default decoder which typically returns a string. This not only means we can go out of sync with the C code without noticing but it's also error prone as malformed flows could be parsed without warning. Make KeyValue parsing strict, raising an error if a decoder is not found for a key. This behaviour can be turned off globally by running 'KVDecoders.strict = False' but it's generally not recommended. Also, if a KVDecoder does need this default behavior, it can be explicitly configured specifying it's default decoder. Signed-off-by: Adrian Moreno Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- python/ovs/flow/kv.py | 25 ++++++++++++++++++------- python/ovs/flow/list.py | 7 ++++++- python/ovs/tests/test_kv.py | 20 ++++++++++---------- python/ovs/tests/test_ofp.py | 28 +++++++++++++++++++++++++++- 4 files changed, 61 insertions(+), 19 deletions(-) diff --git a/python/ovs/flow/kv.py b/python/ovs/flow/kv.py index 383d7ee7878..32463254b07 100644 --- a/python/ovs/flow/kv.py +++ b/python/ovs/flow/kv.py @@ -85,13 +85,17 @@ class KVDecoders(object): reason, the default_free decoder, must return both the key and value to be stored. + Globally defined "strict" variable controls what to do when decoders do not + contain a valid decoder for a key and a default function is not provided. + If set to True (default), a ParseError is raised. + If set to False, the value will be decoded as a string. + Args: decoders (dict): Optional; A dictionary of decoders indexed by keyword. default (callable): Optional; A function to use if a match is not found in configured decoders. 
If not provided, the default behavior - is to try to decode the value into an integer and, if that fails, - just return the string as-is. The function must accept a the key - and the value and return the decoded (key, value) tuple back. + depends on "strict". The function must accept a the key and a value + and return the decoded (key, value) tuple back. default_free (callable): Optional; The decoder used if a match is not found in configured decoders and it's a free value (e.g: a value without a key) Defaults to returning the free value as @@ -99,9 +103,11 @@ class KVDecoders(object): The callable must accept a string and return a key-value pair. """ + strict = True + def __init__(self, decoders=None, default=None, default_free=None): self._decoders = decoders or dict() - self._default = default or (lambda k, v: (k, decode_default(v))) + self._default = default self._default_free = default_free or self._default_free_decoder def decode(self, keyword, value_str): @@ -127,9 +133,14 @@ def decode(self, keyword, value_str): return keyword, value else: if value_str: - return self._default(keyword, value_str) - else: - return self._default_free(keyword) + if self._default: + return self._default(keyword, value_str) + if self.strict: + raise ParseError( + "Cannot parse key {}: No decoder found".format(keyword) + ) + return keyword, decode_default(value_str) + return self._default_free(keyword) @staticmethod def _default_free_decoder(key): diff --git a/python/ovs/flow/list.py b/python/ovs/flow/list.py index b1e9e3fcaa6..bc466ef89f0 100644 --- a/python/ovs/flow/list.py +++ b/python/ovs/flow/list.py @@ -31,7 +31,12 @@ def decode(self, index, value_str): value_str (str): The value string to decode. 
""" if index < 0 or index >= len(self._decoders): - return self._default_decoder(index, value_str) + if self._default_decoder: + return self._default_decoder(index, value_str) + else: + raise ParseError( + f"Cannot decode element {index} in list: {value_str}" + ) try: key = self._decoders[index][0] diff --git a/python/ovs/tests/test_kv.py b/python/ovs/tests/test_kv.py index c5b66de887b..76887498a57 100644 --- a/python/ovs/tests/test_kv.py +++ b/python/ovs/tests/test_kv.py @@ -1,6 +1,9 @@ import pytest -from ovs.flow.kv import KVParser, KeyValue +from ovs.flow.kv import KVParser, KVDecoders, KeyValue +from ovs.flow.decoders import decode_default + +decoders = KVDecoders(default=lambda k, v: (k, decode_default(v))) @pytest.mark.parametrize( @@ -9,7 +12,7 @@ ( ( "cookie=0x0, duration=147566.365s, table=0, n_packets=39, n_bytes=2574, idle_age=65534, hard_age=65534", # noqa: E501 - None, + decoders, ), [ KeyValue("cookie", 0), @@ -24,7 +27,7 @@ ( ( "load:0x4->NXM_NX_REG13[],load:0x9->NXM_NX_REG11[],load:0x8->NXM_NX_REG12[],load:0x1->OXM_OF_METADATA[],load:0x1->NXM_NX_REG14[],mod_dl_src:0a:58:a9:fe:00:02,resubmit(,8)", # noqa: E501 - None, + decoders, ), [ KeyValue("load", "0x4->NXM_NX_REG13[]"), @@ -36,20 +39,17 @@ KeyValue("resubmit", ",8"), ], ), + (("l1(l2(l3(l4())))", decoders), [KeyValue("l1", "l2(l3(l4()))")]), ( - ("l1(l2(l3(l4())))", None), - [KeyValue("l1", "l2(l3(l4()))")] - ), - ( - ("l1(l2(l3(l4()))),foo:bar", None), + ("l1(l2(l3(l4()))),foo:bar", decoders), [KeyValue("l1", "l2(l3(l4()))"), KeyValue("foo", "bar")], ), ( - ("enqueue:1:2,output=2", None), + ("enqueue:1:2,output=2", decoders), [KeyValue("enqueue", "1:2"), KeyValue("output", 2)], ), ( - ("value_to_reg(100)->someReg[10],foo:bar", None), + ("value_to_reg(100)->someReg[10],foo:bar", decoders), [ KeyValue("value_to_reg", "(100)->someReg[10]"), KeyValue("foo", "bar"), diff --git a/python/ovs/tests/test_ofp.py b/python/ovs/tests/test_ofp.py index 389c4544a2e..328ab7285ea 100644 --- 
a/python/ovs/tests/test_ofp.py +++ b/python/ovs/tests/test_ofp.py @@ -2,7 +2,7 @@ import pytest from ovs.flow.ofp import OFPFlow -from ovs.flow.kv import KeyValue +from ovs.flow.kv import KeyValue, ParseError from ovs.flow.decoders import EthMask, IPMask, decode_mask @@ -509,11 +509,37 @@ ), ], ), + ( + "actions=doesnotexist(1234)", + ParseError, + ), + ( + "actions=learn(eth_type=nofield)", + ParseError, + ), + ( + "actions=learn(nofield=eth_type)", + ParseError, + ), + ( + "nofield=0x123 actions=drop", + ParseError, + ), + ( + "actions=load:0x12334->NOFILED", + ParseError, + ), ], ) def test_act(input_string, expected): + if isinstance(expected, type): + with pytest.raises(expected): + ofp = OFPFlow(input_string) + return + ofp = OFPFlow(input_string) actions = ofp.actions_kv + for i in range(len(expected)): assert expected[i].key == actions[i].key assert expected[i].value == actions[i].value From 75a6e8db9c5f9dc2887cae1555d977f0fdf08471 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Mon, 19 Dec 2022 17:13:46 +0100 Subject: [PATCH 083/833] python: Return list of actions for odp action clone. Sometimes we don't want to return the result of a nested key-value decoding as a dictionary but as a list of dictionaries. This happens when we parse actions where keys can be repeated. Refactor code that already takes that into account from ofp_act.py to kv.py and use it for datapath action "clone". 
Signed-off-by: Adrian Moreno Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- python/ovs/flow/kv.py | 21 +++++++++++++++++++- python/ovs/flow/odp.py | 6 ++++-- python/ovs/flow/ofp.py | 14 ++++++------- python/ovs/flow/ofp_act.py | 18 +---------------- python/ovs/tests/test_odp.py | 38 +++++++++++++++++++++++++----------- 5 files changed, 59 insertions(+), 38 deletions(-) diff --git a/python/ovs/flow/kv.py b/python/ovs/flow/kv.py index 32463254b07..3138db00880 100644 --- a/python/ovs/flow/kv.py +++ b/python/ovs/flow/kv.py @@ -320,7 +320,26 @@ def decode_nested_kv(decoders, value): return {kv.key: kv.value for kv in parser.kv()} -def nested_kv_decoder(decoders=None): +def decode_nested_kv_list(decoders, value): + """A key-value decoder that extracts nested key-value pairs and returns + them in a list of dictionary. + + Args: + decoders (KVDecoders): The KVDecoders to use. + value (str): The value string to decode. + """ + if not value: + # Mark as flag + return True + + parser = KVParser(value, decoders) + parser.parse() + return [{kv.key: kv.value} for kv in parser.kv()] + + +def nested_kv_decoder(decoders=None, is_list=False): """Helper function that creates a nested kv decoder with given KVDecoders.""" + if is_list: + return functools.partial(decode_nested_kv_list, decoders) return functools.partial(decode_nested_kv, decoders) diff --git a/python/ovs/flow/odp.py b/python/ovs/flow/odp.py index 3bc3aec8e00..db63afc8d64 100644 --- a/python/ovs/flow/odp.py +++ b/python/ovs/flow/odp.py @@ -337,7 +337,8 @@ def _action_decoders_args(): } _decoders["clone"] = nested_kv_decoder( - KVDecoders(decoders=_decoders, default_free=decode_free_output) + KVDecoders(decoders=_decoders, default_free=decode_free_output), + is_list=True, ) return { @@ -350,7 +351,8 @@ def _action_decoders_args(): KVDecoders( decoders=_decoders, default_free=decode_free_output, - ) + ), + is_list=True, ), } ) diff --git a/python/ovs/flow/ofp.py b/python/ovs/flow/ofp.py index 
3d79ed6ad77..8f272736173 100644 --- a/python/ovs/flow/ofp.py +++ b/python/ovs/flow/ofp.py @@ -31,7 +31,6 @@ decode_dec_ttl, decode_chk_pkt_larger, decode_zone, - decode_exec, decode_learn, ) @@ -336,8 +335,7 @@ def _fw_action_decoders_args(): "table": decode_int, "nat": decode_nat, "force": decode_flag, - "exec": functools.partial( - decode_exec, + "exec": nested_kv_decoder( KVDecoders( { **OFPFlow._encap_actions_decoders_args(), @@ -345,6 +343,7 @@ def _fw_action_decoders_args(): **OFPFlow._meta_action_decoders_args(), } ), + is_list=True, ), "alg": decode_default, } @@ -359,6 +358,7 @@ def _fw_action_decoders_args(): } ) ), + # learn moved to _clone actions. } @staticmethod @@ -400,11 +400,11 @@ def _clone_actions_decoders_args(action_decoders): """ return { "learn": decode_learn(action_decoders), - "clone": functools.partial( - decode_exec, KVDecoders(action_decoders) + "clone": nested_kv_decoder( + KVDecoders(action_decoders), is_list=True ), - "write_actions": functools.partial( - decode_exec, KVDecoders(action_decoders) + "write_actions": nested_kv_decoder( + KVDecoders(action_decoders), is_list=True ), } diff --git a/python/ovs/flow/ofp_act.py b/python/ovs/flow/ofp_act.py index c481d6fc721..5eaf0b2185a 100644 --- a/python/ovs/flow/ofp_act.py +++ b/python/ovs/flow/ofp_act.py @@ -1,8 +1,5 @@ """Defines decoders for OpenFlow actions. """ - -import functools - from ovs.flow.decoders import ( decode_default, decode_time, @@ -258,19 +255,6 @@ def decode_zone(value): return decode_field(value) -def decode_exec(action_decoders, value): - """Decodes the value of the 'exec' keyword (part of the ct action). - - Args: - decode_actions (KVDecoders): The decoders to be used to decode the - nested exec. - value (string): The string to be decoded. 
- """ - exec_parser = KVParser(value, action_decoders) - exec_parser.parse() - return [{kv.key: kv.value} for kv in exec_parser.kv()] - - def decode_learn(action_decoders): """Create the decoder to be used to decode the 'learn' action. @@ -338,4 +322,4 @@ def learn_field_decoding_free(key): default_free=learn_field_decoding_free, ) - return functools.partial(decode_exec, learn_decoder) + return nested_kv_decoder(learn_decoder, is_list=True) diff --git a/python/ovs/tests/test_odp.py b/python/ovs/tests/test_odp.py index 715be386940..f8017ca8a16 100644 --- a/python/ovs/tests/test_odp.py +++ b/python/ovs/tests/test_odp.py @@ -453,21 +453,37 @@ def test_odp_fields(input_string, expected): ], ), ( - "actions:clone(1)" ",clone(clone(push_vlan(vid=12,pcp=0),2),1)", + "actions:clone(1),clone(clone(push_vlan(vid=12,pcp=0),2),1)", [ - KeyValue("clone", {"output": {"port": 1}}), + KeyValue("clone", [{"output": {"port": 1}}]), KeyValue( "clone", - { - "output": {"port": 1}, - "clone": { - "push_vlan": { - "vid": 12, - "pcp": 0, - }, - "output": {"port": 2}, + [ + { + "clone": [ + { + "push_vlan": { + "vid": 12, + "pcp": 0, + }, + }, + {"output": {"port": 2}}, + ] }, - }, + {"output": {"port": 1}}, + ], + ), + ], + ), + ( + "actions:clone(recirc(0x1),recirc(0x2))", + [ + KeyValue( + "clone", + [ + {"recirc": 1}, + {"recirc": 2}, + ], ), ], ), From 1850e5e6891282d84bdeb7b7100166cfd8deed28 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Mon, 19 Dec 2022 17:13:47 +0100 Subject: [PATCH 084/833] python: Support case-insensitive OpenFlow actions. OpenFlow actions names can be capitalized so in order to support this, support case-insensitive KVDecoders and use it in Openflow actions. 
Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- python/ovs/flow/kv.py | 17 ++++++++++++++--- python/ovs/flow/ofp.py | 7 ++++--- python/ovs/tests/test_ofp.py | 15 +++++++++++++++ 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/python/ovs/flow/kv.py b/python/ovs/flow/kv.py index 3138db00880..f7d7be0cf1e 100644 --- a/python/ovs/flow/kv.py +++ b/python/ovs/flow/kv.py @@ -105,10 +105,17 @@ class KVDecoders(object): strict = True - def __init__(self, decoders=None, default=None, default_free=None): - self._decoders = decoders or dict() + def __init__(self, decoders=None, default=None, default_free=None, + ignore_case=False): + if not decoders: + self._decoders = dict() + elif ignore_case: + self._decoders = {k.lower(): v for k, v in decoders.items()} + else: + self._decoders = decoders self._default = default self._default_free = default_free or self._default_free_decoder + self._ignore_case = ignore_case def decode(self, keyword, value_str): """Decode a keyword and value. @@ -121,7 +128,11 @@ def decode(self, keyword, value_str): The key (str) and value(any) to be stored. 
""" - decoder = self._decoders.get(keyword) + decoder = None + if self._ignore_case: + decoder = self._decoders.get(keyword.lower()) + else: + decoder = self._decoders.get(keyword) if decoder: result = decoder(value_str) if isinstance(result, KeyValue): diff --git a/python/ovs/flow/ofp.py b/python/ovs/flow/ofp.py index 8f272736173..bf832f71b98 100644 --- a/python/ovs/flow/ofp.py +++ b/python/ovs/flow/ofp.py @@ -246,7 +246,8 @@ def _gen_action_decoders(): } clone_actions = OFPFlow._clone_actions_decoders_args(actions) actions.update(clone_actions) - return KVDecoders(actions, default_free=decode_free_output) + return KVDecoders(actions, default_free=decode_free_output, + ignore_case=True) @staticmethod def _output_actions_decoders_args(): @@ -401,10 +402,10 @@ def _clone_actions_decoders_args(action_decoders): return { "learn": decode_learn(action_decoders), "clone": nested_kv_decoder( - KVDecoders(action_decoders), is_list=True + KVDecoders(action_decoders, ignore_case=True), is_list=True ), "write_actions": nested_kv_decoder( - KVDecoders(action_decoders), is_list=True + KVDecoders(action_decoders, ignore_case=True), is_list=True ), } diff --git a/python/ovs/tests/test_ofp.py b/python/ovs/tests/test_ofp.py index 328ab7285ea..5aa8d591bf6 100644 --- a/python/ovs/tests/test_ofp.py +++ b/python/ovs/tests/test_ofp.py @@ -509,6 +509,21 @@ ), ], ), + ( + "actions=POP_VLAN,push_vlan:0x8100,NORMAL,clone(MOD_NW_SRC:192.168.1.1,resubmit(,10))", # noqa: E501 + [ + KeyValue("POP_VLAN", True), + KeyValue("push_vlan", 0x8100), + KeyValue("output", {"port": "NORMAL"}), + KeyValue( + "clone", + [ + {"MOD_NW_SRC": netaddr.IPAddress("192.168.1.1")}, + {"resubmit": {"port": "", "table": 10}}, + ] + ), + ], + ), ( "actions=doesnotexist(1234)", ParseError, From 542fdad701403c11cfe8356957f934fa657c1742 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Mon, 19 Dec 2022 17:13:48 +0100 Subject: [PATCH 085/833] python: Fix output=CONTROLLER action. 
When CONTROLLER is used as a free key, it means output=CONTROLLER which is handled by decode_controller. However, it must output the KV in the right format: "output": {"port": "CONTROLLER"}. Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- python/ovs/flow/ofp_act.py | 2 +- python/ovs/tests/test_ofp.py | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/python/ovs/flow/ofp_act.py b/python/ovs/flow/ofp_act.py index 5eaf0b2185a..c540443eaea 100644 --- a/python/ovs/flow/ofp_act.py +++ b/python/ovs/flow/ofp_act.py @@ -35,7 +35,7 @@ def decode_output(value): def decode_controller(value): """Decodes the controller action.""" if not value: - return KeyValue("output", "controller") + return KeyValue("output", {"port": "CONTROLLER"}) else: # Try controller:max_len try: diff --git a/python/ovs/tests/test_ofp.py b/python/ovs/tests/test_ofp.py index 5aa8d591bf6..e17188e2b44 100644 --- a/python/ovs/tests/test_ofp.py +++ b/python/ovs/tests/test_ofp.py @@ -22,7 +22,7 @@ ( "actions=controller,controller:200", [ - KeyValue("output", "controller"), + KeyValue("output", {"port": "CONTROLLER"}), KeyValue("controller", {"max_len": 200}), ], ), @@ -524,6 +524,14 @@ ), ], ), + ( + "actions=MOD_NW_SRC:192.168.1.1,CONTROLLER,CONTROLLER:123", + [ + KeyValue("MOD_NW_SRC", netaddr.IPAddress("192.168.1.1")), + KeyValue("output", {"port": "CONTROLLER"}), + KeyValue("CONTROLLER", {"max_len": 123}), + ], + ), ( "actions=doesnotexist(1234)", ParseError, From c395e9810e07ab957676b4f75e9cacd39dca6839 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Mon, 19 Dec 2022 17:13:49 +0100 Subject: [PATCH 086/833] python: Interpret free keys as output in clone. clone-like actions can also output to ports by specifying the port name.
Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- python/ovs/flow/ofp.py | 6 ++++-- python/ovs/tests/test_ofp.py | 13 +++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/python/ovs/flow/ofp.py b/python/ovs/flow/ofp.py index bf832f71b98..eac8d08513f 100644 --- a/python/ovs/flow/ofp.py +++ b/python/ovs/flow/ofp.py @@ -402,10 +402,12 @@ def _clone_actions_decoders_args(action_decoders): return { "learn": decode_learn(action_decoders), "clone": nested_kv_decoder( - KVDecoders(action_decoders, ignore_case=True), is_list=True + KVDecoders(action_decoders, default_free=decode_free_output, + ignore_case=True), is_list=True ), "write_actions": nested_kv_decoder( - KVDecoders(action_decoders, ignore_case=True), is_list=True + KVDecoders(action_decoders, default_free=decode_free_output, + ignore_case=True), is_list=True ), } diff --git a/python/ovs/tests/test_ofp.py b/python/ovs/tests/test_ofp.py index e17188e2b44..27bcf0c47cb 100644 --- a/python/ovs/tests/test_ofp.py +++ b/python/ovs/tests/test_ofp.py @@ -532,6 +532,19 @@ KeyValue("CONTROLLER", {"max_len": 123}), ], ), + ( + "actions=LOCAL,clone(myport,CONTROLLER)", + [ + KeyValue("output", {"port": "LOCAL"}), + KeyValue( + "clone", + [ + {"output": {"port": "myport"}}, + {"output": {"port": "CONTROLLER"}}, + ] + ), + ], + ), ( "actions=doesnotexist(1234)", ParseError, From fc3f918cb56110884092106af8723ff24e63a9c2 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Mon, 19 Dec 2022 17:13:50 +0100 Subject: [PATCH 087/833] tests: Verify flows in ofp-actions are parseable. Create a small helper script and check that flows used in ofp-actions.at are parseable. 
Signed-off-by: Adrian Moreno Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- tests/automake.mk | 2 ++ tests/ofp-actions.at | 18 +++++++++++++++++ tests/test-ofparse.py | 45 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+) create mode 100755 tests/test-ofparse.py diff --git a/tests/automake.mk b/tests/automake.mk index d509cf93504..63a0490adfb 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -19,6 +19,7 @@ EXTRA_DIST += \ $(OVSDB_CLUSTER_TESTSUITE) \ tests/atlocal.in \ $(srcdir)/package.m4 \ + $(srcdir)/tests/test-ofparse.py \ $(srcdir)/tests/testsuite \ $(srcdir)/tests/testsuite.patch @@ -522,6 +523,7 @@ CHECK_PYFILES = \ tests/test-json.py \ tests/test-jsonrpc.py \ tests/test-l7.py \ + tests/test-ofparse.py \ tests/test-ovsdb.py \ tests/test-reconnect.py \ tests/test-stream.py \ diff --git a/tests/ofp-actions.at b/tests/ofp-actions.at index 9d820eba6d4..40a23bb15dc 100644 --- a/tests/ofp-actions.at +++ b/tests/ofp-actions.at @@ -329,6 +329,7 @@ AT_CAPTURE_FILE([experr]) AT_CHECK( [ovs-ofctl '-vPATTERN:console:%c|%p|%m' parse-actions OpenFlow10 < input.txt], [0], [expout], [experr]) +AT_CHECK([cat expout | grep 'actions=' | test-ofparse.py]) AT_CLEANUP AT_SETUP([OpenFlow 1.0 "instruction" translations]) @@ -359,6 +360,7 @@ AT_CAPTURE_FILE([experr]) AT_CHECK( [ovs-ofctl '-vPATTERN:console:%c|%p|%m' parse-instructions OpenFlow10 < input.txt], [0], [expout], [experr]) +AT_CHECK([cat expout | grep 'actions=' | test-ofparse.py]) AT_CLEANUP AT_SETUP([OpenFlow 1.1 action translation]) @@ -502,6 +504,7 @@ AT_CAPTURE_FILE([experr]) AT_CHECK( [ovs-ofctl '-vPATTERN:console:%c|%p|%m' parse-actions OpenFlow11 < input.txt], [0], [expout], [experr]) +AT_CHECK([cat expout | grep 'actions=' | test-ofparse.py]) AT_CLEANUP AT_SETUP([OpenFlow 1.1 instruction translation]) @@ -737,6 +740,7 @@ AT_CAPTURE_FILE([experr]) AT_CHECK( [ovs-ofctl '-vPATTERN:console:%c|%p|%m' parse-actions OpenFlow12 < input.txt], [0], [expout], [experr]) 
+AT_CHECK([cat expout | grep 'actions=' | test-ofparse.py]) AT_CLEANUP dnl Our primary goal here is to verify OpenFlow 1.3-specific changes, @@ -798,6 +802,7 @@ AT_CAPTURE_FILE([experr]) AT_CHECK( [ovs-ofctl '-vPATTERN:console:%c|%p|%m' parse-actions OpenFlow13 < input.txt], [0], [expout], [experr]) +AT_CHECK([cat expout | grep 'actions=' | test-ofparse.py]) AT_CLEANUP dnl Our primary goal here is to verify that OpenFlow 1.5-specific changes, @@ -827,17 +832,20 @@ AT_CAPTURE_FILE([experr]) AT_CHECK( [ovs-ofctl '-vPATTERN:console:%c|%p|%m' parse-actions OpenFlow15 < input.txt], [0], [expout], [experr]) +AT_CHECK([cat expout | grep 'actions=' | test-ofparse.py]) AT_CLEANUP AT_SETUP([ofp-actions - inconsistent MPLS actions]) OVS_VSWITCHD_START dnl OK: Use fin_timeout action on TCP flow AT_CHECK([ovs-ofctl -O OpenFlow11 -vwarn add-flow br0 'tcp actions=fin_timeout(idle_timeout=1)']) +AT_CHECK([echo 'tcp actions=fin_timeout(idle_timeout=1)' | test-ofparse.py]) dnl Bad: Use fin_timeout action on TCP flow that has been converted to MPLS AT_CHECK([ovs-ofctl -O OpenFlow11 -vwarn add-flow br0 'tcp actions=push_mpls:0x8847,fin_timeout(idle_timeout=1)'], [1], [], [dnl ovs-ofctl: none of the usable flow formats (OpenFlow10,NXM) is among the allowed flow formats (OpenFlow11) ]) +AT_CHECK([echo 'tcp actions=push_mpls:0x8847,fin_timeout(idle_timeout=1)' | test-ofparse.py]) OVS_VSWITCHD_STOP AT_CLEANUP @@ -853,6 +861,8 @@ AT_CHECK([ovs-ofctl -O OpenFlow10 dump-flows br0 | ofctl_strip], [0], [dnl NXST_FLOW reply: mpls actions=load:0xa->OXM_OF_MPLS_LABEL[[]] ]) +AT_CHECK([echo 'mpls actions=set_field:10->mpls_label' | test-ofparse.py]) +AT_CHECK([echo 'mpls actions=load:0xa->OXM_OF_MPLS_LABEL[[]]'| test-ofparse.py]) OVS_VSWITCHD_STOP AT_CLEANUP @@ -862,14 +872,17 @@ OVS_VSWITCHD_START dnl OpenFlow 1.0 has an "enqueue" action. For OpenFlow 1.1+, we translate dnl it to a series of actions that accomplish the same thing. 
AT_CHECK([ovs-ofctl -O OpenFlow10 add-flow br0 'actions=enqueue(123,456)']) +AT_CHECK([echo 'actions=enqueue(123,456)' | test-ofparse.py]) AT_CHECK([ovs-ofctl -O OpenFlow10 dump-flows br0 | ofctl_strip], [0], [dnl NXST_FLOW reply: actions=enqueue:123:456 ]) +AT_CHECK([echo 'actions=enqueue:123:456' | test-ofparse.py]) AT_CHECK([ovs-ofctl -O OpenFlow13 dump-flows br0 | ofctl_strip], [0], [dnl OFPST_FLOW reply (OF1.3): reset_counts actions=set_queue:456,output:123,pop_queue ]) +AT_CHECK([echo 'actions=set_queue:456,output:123,pop_queue' | test-ofparse.py]) OVS_VSWITCHD_STOP AT_CLEANUP @@ -887,6 +900,8 @@ AT_CHECK([ovs-ofctl -O OpenFlow11 dump-flows br0 | ofctl_strip], [0], [dnl OFPST_FLOW reply (OF1.1): ip actions=mod_nw_ttl:123 ]) +AT_CHECK([echo 'ip,actions=mod_nw_ttl:123' | test-ofparse.py]) +AT_CHECK([echo 'ip actions=load:0x7b->NXM_NX_IP_TTL[[]]' | test-ofparse.py]) OVS_VSWITCHD_STOP AT_CLEANUP @@ -898,10 +913,12 @@ dnl OpenFlow 1.1, but no other version, has a "mod_nw_ecn" action. dnl Check that we translate it properly for OF1.0 and OF1.2. dnl (OF1.3+ should be the same as OF1.2.) AT_CHECK([ovs-ofctl -O OpenFlow11 add-flow br0 'ip,actions=mod_nw_ecn:2']) +AT_CHECK([echo 'ip,actions=mod_nw_ecn:2' | test-ofparse.py]) AT_CHECK([ovs-ofctl -O OpenFlow10 dump-flows br0 | ofctl_strip], [0], [dnl NXST_FLOW reply: ip actions=load:0x2->NXM_NX_IP_ECN[[]] ]) +AT_CHECK([echo 'ip actions=load:0x2->NXM_NX_IP_ECN[[]]' | test-ofparse.py]) AT_CHECK([ovs-ofctl -O OpenFlow11 dump-flows br0 | ofctl_strip], [0], [dnl OFPST_FLOW reply (OF1.1): ip actions=mod_nw_ecn:2 @@ -910,6 +927,7 @@ AT_CHECK([ovs-ofctl -O OpenFlow12 dump-flows br0 | ofctl_strip], [0], [dnl OFPST_FLOW reply (OF1.2): ip actions=set_field:2->nw_ecn ]) +AT_CHECK([echo 'ip actions=set_field:2->nw_ecn' | test-ofparse.py]) dnl Check that OF1.2+ set_field to set ECN is translated into the OF1.1 dnl mod_nw_ecn action. 
diff --git a/tests/test-ofparse.py b/tests/test-ofparse.py new file mode 100755 index 00000000000..ba96e8344c2 --- /dev/null +++ b/tests/test-ofparse.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python3 +# Copyright (c) 2022 Red Hat, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""test-ofparse reads flows from stdin and tries to parse them using +the python flow parsing library. +""" + +import fileinput +import sys + +try: + from ovs.flow.ofp import OFPFlow +except ImportError: + sys.exit(0) + + +def main(): + for flow in fileinput.input(): + try: + result_flow = OFPFlow(flow) + if flow != str(result_flow): + print("in: {}".format(flow)) + print("out: {}".format(str(result_flow))) + raise ValueError("Flow conversion back to string failed") + except Exception as e: + print("Error parsing flow {}: {}".format(flow, e)) + return 1 + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 22eb2243864d42580dd1447cf09906d4d34fbb68 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Mon, 19 Dec 2022 17:13:51 +0100 Subject: [PATCH 088/833] tests: Verify flows in odp.at are parseable. Create a small helper script and check that flows tested in odp.at are parseable. 
Signed-off-by: Adrian Moreno Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- tests/automake.mk | 2 ++ tests/odp.at | 12 +++++++++++- tests/test-dpparse.py | 45 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 1 deletion(-) create mode 100755 tests/test-dpparse.py diff --git a/tests/automake.mk b/tests/automake.mk index 63a0490adfb..4091a2796d8 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -19,6 +19,7 @@ EXTRA_DIST += \ $(OVSDB_CLUSTER_TESTSUITE) \ tests/atlocal.in \ $(srcdir)/package.m4 \ + $(srcdir)/tests/test-dpparse.py \ $(srcdir)/tests/test-ofparse.py \ $(srcdir)/tests/testsuite \ $(srcdir)/tests/testsuite.patch @@ -520,6 +521,7 @@ CHECK_PYFILES = \ tests/mfex_fuzzy.py \ tests/ovsdb-monitor-sort.py \ tests/test-daemon.py \ + tests/test-dpparse.py \ tests/test-json.py \ tests/test-jsonrpc.py \ tests/test-l7.py \ diff --git a/tests/odp.at b/tests/odp.at index 88b7cfd917f..41eb726e922 100644 --- a/tests/odp.at +++ b/tests/odp.at @@ -104,9 +104,9 @@ dnl specified. We can skip these. 
sed -i'back' 's/\(skb_mark(0)\),\(ct\)/\1,ct_state(0),ct_zone(0),\2/' odp-out.txt sed -i'back' 's/\(skb_mark([[^)]]*)\),\(recirc\)/\1,ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),\2/' odp-out.txt sed -i'back' 's/\(in_port(1)\),\(eth\)/\1,packet_type(ns=0,id=0),\2/' odp-out.txt - AT_CHECK_UNQUOTED([ovstest test-odp parse-keys < odp-in.txt], [0], [`cat odp-out.txt` ]) +AT_CHECK_UNQUOTED([cat odp-in.txt | sed 's/^#.*//' | sed 's/$/ actions:drop/' | test-dpparse.py]) AT_CLEANUP AT_SETUP([OVS datapath wildcarded key parsing and formatting - valid forms]) @@ -194,6 +194,7 @@ sed -n 's/,frag=no),.*/,frag=later)/p' odp-base.txt AT_CAPTURE_FILE([odp.txt]) AT_CHECK_UNQUOTED([ovstest test-odp parse-wc-keys < odp.txt], [0], [`cat odp.txt` ]) +AT_CHECK_UNQUOTED([cat odp.txt | sed 's/^#.*//' | sed 's/$/ actions:drop/' | test-dpparse.py]) AT_CLEANUP AT_SETUP([OVS datapath wildcarded key filtering.]) @@ -241,24 +242,31 @@ in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv ]) AT_CHECK_UNQUOTED([ovstest test-odp parse-filter filter='dl_type=0x1235' < odp-base.txt], [0], [`cat odp-eth-type.txt` ]) +AT_CHECK_UNQUOTED([cat odp-eth-type.txt | sed 's/^#.*//' | sed 's/$/ actions:drop/' | test-dpparse.py]) AT_CHECK_UNQUOTED([ovstest test-odp parse-filter filter='dl_vlan=99' < odp-vlan-base.txt], [0], [`cat odp-vlan.txt` ]) +AT_CHECK_UNQUOTED([cat odp-vlan.txt | sed 's/^#.*//' | sed 's/$/ actions:drop/' | test-dpparse.py]) AT_CHECK_UNQUOTED([ovstest test-odp parse-filter filter='dl_vlan=99,ip' < odp-vlan-base.txt], [0], [`cat odp-vlan.txt` ]) AT_CHECK_UNQUOTED([ovstest test-odp parse-filter filter='ip,nw_src=35.8.2.199' < odp-base.txt], [0], [`cat odp-ipv4.txt` ]) AT_CHECK_UNQUOTED([ovstest test-odp parse-filter filter='ip,nw_dst=172.16.0.199' < odp-base.txt], [0], [`cat odp-ipv4.txt` ]) +AT_CHECK_UNQUOTED([cat odp-ipv4.txt | sed 's/^#.*//' | sed 's/$/ actions:drop/' | test-dpparse.py]) AT_CHECK_UNQUOTED([ovstest test-odp parse-filter 
filter='dl_type=0x0800,nw_src=35.8.2.199,nw_dst=172.16.0.199' < odp-base.txt], [0], [`cat odp-ipv4.txt` ]) AT_CHECK_UNQUOTED([ovstest test-odp parse-filter filter='icmp,nw_src=35.8.2.199' < odp-base.txt], [0], [`cat odp-icmp.txt` ]) +AT_CHECK_UNQUOTED([cat odp-icmp.txt | sed 's/^#.*//' | sed 's/$/ actions:drop/' | test-dpparse.py]) AT_CHECK_UNQUOTED([ovstest test-odp parse-filter filter='arp,arp_spa=1.2.3.5' < odp-base.txt], [0], [`cat odp-arp.txt` ]) +AT_CHECK_UNQUOTED([cat odp-arp.txt | sed 's/^#.*//' | sed 's/$/ actions:drop/' | test-dpparse.py]) AT_CHECK_UNQUOTED([ovstest test-odp parse-filter filter='tcp,tp_src=90' < odp-base.txt], [0], [`cat odp-tcp.txt` ]) +AT_CHECK_UNQUOTED([cat odp-tcp.txt | sed 's/^#.*//' | sed 's/$/ actions:drop/' | test-dpparse.py]) AT_CHECK_UNQUOTED([ovstest test-odp parse-filter filter='tcp6,tp_src=90' < odp-base.txt], [0], [`cat odp-tcp6.txt` ]) +AT_CHECK_UNQUOTED([cat odp-tcp6.txt | sed 's/^#.*//' | sed 's/$/ actions:drop/' | test-dpparse.py]) AT_CLEANUP AT_SETUP([OVS datapath actions parsing and formatting - valid forms]) @@ -391,6 +399,7 @@ add_mpls(label=200,tc=7,ttl=64,bos=1,eth_type=0x8847) AT_CHECK_UNQUOTED([ovstest test-odp parse-actions < actions.txt], [0], [`cat actions.txt` ]) +AT_CHECK_UNQUOTED([cat actions.txt | sed 's/^/actions:/' | test-dpparse.py]) AT_CLEANUP AT_SETUP([OVS datapath actions parsing and formatting - invalid forms]) @@ -436,6 +445,7 @@ odp_actions_from_string: error `cat actions.txt | head -3 | tail -1` odp_actions_from_string: error ]) +AT_CHECK_UNQUOTED([cat actions.txt | sed 's/^/actions:/' | test-dpparse.py]) AT_CLEANUP AT_SETUP([OVS datapath actions parsing and formatting - actions too long]) diff --git a/tests/test-dpparse.py b/tests/test-dpparse.py new file mode 100755 index 00000000000..7762e5e8a90 --- /dev/null +++ b/tests/test-dpparse.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python3 +# Copyright (c) 2022 Red Hat, Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""test-dpparse reads flows from stdin and tries to parse them using +the python flow parsing library. +""" + +import fileinput +import sys + +try: + from ovs.flow.odp import ODPFlow +except ImportError: + sys.exit(0) + + +def main(): + for flow in fileinput.input(): + try: + result_flow = ODPFlow(flow) + if flow != str(result_flow): + print("in: {}".format(flow)) + print("out: {}".format(str(result_flow))) + raise ValueError("Flow conversion back to string failed") + except Exception as e: + print("Error parsing flow {}: {}".format(flow, e)) + return 1 + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 863d2e1a8c2a6ced49a49024c094ef6a9aa7e55a Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Mon, 19 Dec 2022 17:13:52 +0100 Subject: [PATCH 089/833] python: Don't exit OFPFlow constructor. Returning None in a constructor does not make sense and is just error prone. Removing what was a leftover from an attempt to handle a common error case of trying to parse what is commonly outputted by ovs-ofctl. This should be done by the caller anyway. 
Signed-off-by: Adrian Moreno Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- python/ovs/flow/ofp.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/ovs/flow/ofp.py b/python/ovs/flow/ofp.py index eac8d08513f..20231fd9f38 100644 --- a/python/ovs/flow/ofp.py +++ b/python/ovs/flow/ofp.py @@ -104,9 +104,6 @@ def __init__(self, ofp_string, id=None): ValueError if the string is malformed. ParseError if an error in parsing occurs. """ - if " reply " in ofp_string: - return None - sections = list() parts = ofp_string.split("actions=") if len(parts) != 2: From 685973a9f1cb2c9a49ea517a8feab7012a35a1fd Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 14 Dec 2022 10:29:16 -0600 Subject: [PATCH 090/833] ovsdb-server: Don't log when memory-trim-on-compaction doesn't change. But log at least once even if the value hasn't changed, for informational purposes. Signed-off-by: Dan Williams Signed-off-by: Ilya Maximets --- ovsdb/ovsdb-server.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c index 7a6bfe0a03c..33ca4910d70 100644 --- a/ovsdb/ovsdb-server.c +++ b/ovsdb/ovsdb-server.c @@ -1600,6 +1600,8 @@ ovsdb_server_memory_trim_on_compaction(struct unixctl_conn *conn, const char *argv[], void *arg OVS_UNUSED) { + bool old_trim_memory = trim_memory; + static bool have_logged = false; const char *command = argv[1]; #if !HAVE_DECL_MALLOC_TRIM @@ -1615,8 +1617,11 @@ ovsdb_server_memory_trim_on_compaction(struct unixctl_conn *conn, unixctl_command_reply_error(conn, "invalid argument"); return; } - VLOG_INFO("memory trimming after compaction %s.", - trim_memory ? "enabled" : "disabled"); + if (!have_logged || (trim_memory != old_trim_memory)) { + have_logged = true; + VLOG_INFO("memory trimming after compaction %s.", + trim_memory ? 
"enabled" : "disabled"); + } unixctl_command_reply(conn, NULL); } From d5469cb743c284461739cb99c686dfbe92ded70c Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Thu, 8 Dec 2022 10:48:06 +0100 Subject: [PATCH 091/833] Makefile: Add USDT scripts to make install and fedora/debian test rpm. This change will install all the USDT scripts to the {_datadir}/openvswitch/scripts/usdt directory with the make install command. In addition it will also add them to the Fedora and Debian openvswitch-test rpm. Signed-off-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- Makefile.am | 2 ++ debian/openvswitch-test.install | 1 + rhel/openvswitch-fedora.spec.in | 1 + utilities/automake.mk | 4 ++++ 4 files changed, 8 insertions(+) diff --git a/Makefile.am b/Makefile.am index d4385386743..606bcc22e12 100644 --- a/Makefile.am +++ b/Makefile.am @@ -120,6 +120,7 @@ OVSIDL_BUILT = pkgdata_DATA = sbin_SCRIPTS = scripts_SCRIPTS = +usdt_SCRIPTS = completion_SCRIPTS = scripts_DATA = SUFFIXES = @@ -133,6 +134,7 @@ C ?= 1 endif scriptsdir = $(pkgdatadir)/scripts +usdtdir = $(pkgdatadir)/scripts/usdt completiondir = $(sysconfdir)/bash_completion.d pkgconfigdir = $(libdir)/pkgconfig diff --git a/debian/openvswitch-test.install b/debian/openvswitch-test.install index b3a80d86ae2..88c82528054 100644 --- a/debian/openvswitch-test.install +++ b/debian/openvswitch-test.install @@ -2,3 +2,4 @@ usr/bin/ovs-l3ping usr/bin/ovs-test usr/share/man/man8/ovs-l3ping.8 usr/share/man/man8/ovs-test.8 +usr/share/openvswitch/scripts/usdt/* diff --git a/rhel/openvswitch-fedora.spec.in b/rhel/openvswitch-fedora.spec.in index 17aab796fca..8fc6e8ab233 100644 --- a/rhel/openvswitch-fedora.spec.in +++ b/rhel/openvswitch-fedora.spec.in @@ -396,6 +396,7 @@ fi %{_bindir}/ovs-pcap %{_bindir}/ovs-tcpdump %{_bindir}/ovs-tcpundump +%{_datadir}/openvswitch/scripts/usdt/* %{_mandir}/man8/ovs-test.8* %{_mandir}/man8/ovs-vlan-test.8* %{_mandir}/man8/ovs-l3ping.8* diff --git a/utilities/automake.mk b/utilities/automake.mk index 
eb57653a1cd..132a16942e8 100644 --- a/utilities/automake.mk +++ b/utilities/automake.mk @@ -20,6 +20,10 @@ scripts_SCRIPTS += \ utilities/ovs-kmod-ctl \ utilities/ovs-save scripts_DATA += utilities/ovs-lib +usdt_SCRIPTS += \ + utilities/usdt-scripts/bridge_loop.bt \ + utilities/usdt-scripts/upcall_cost.py \ + utilities/usdt-scripts/upcall_monitor.py completion_SCRIPTS += \ utilities/ovs-appctl-bashcomp.bash \ From 9a86a3dd68f054d47e1a93b8dec03d51479554f4 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Wed, 21 Dec 2022 18:51:20 +0100 Subject: [PATCH 092/833] travis: Drop support. Following a change in the terms of use, free Travis credits are really too low for realistic usage by OVS contributors. As a consequence, testing OVS with Travis has been abandoned by most (if not all) contributors to the project. Drop the Travis configuration from our repository, clean references in the documentation and move GHA specifics to the associated yml. Acked-by: Aaron Conole Signed-off-by: David Marchand Signed-off-by: Ilya Maximets --- .ci/linux-build.sh | 31 +--------- .ci/linux-prepare.sh | 22 +------ .ci/osx-build.sh | 15 ----- .github/workflows/build-and-test.yml | 4 ++ .travis.yml | 57 ------------------- .../contributing/submitting-patches.rst | 7 +-- Documentation/topics/testing.rst | 40 ------------- Makefile.am | 1 - NEWS | 2 + README.rst | 2 - 10 files changed, 14 insertions(+), 167 deletions(-) delete mode 100644 .travis.yml diff --git a/.ci/linux-build.sh b/.ci/linux-build.sh index 48510967238..c06186ce1cf 100755 --- a/.ci/linux-build.sh +++ b/.ci/linux-build.sh @@ -7,21 +7,6 @@ CFLAGS_FOR_OVS="-g -O2" SPARSE_FLAGS="" EXTRA_OPTS="--enable-Werror" -on_exit() { - if [ $?
= 0 ]; then - exit - fi - FILES_TO_PRINT="config.log" - FILES_TO_PRINT="$FILES_TO_PRINT */_build/sub/tests/testsuite.log" - - for pr_file in $FILES_TO_PRINT; do - cat "$pr_file" 2>/dev/null - done -} -# We capture the error logs as artifacts in Github Actions, no need to dump -# them via a EXIT handler. -[ -n "$GITHUB_WORKFLOW" ] || trap on_exit EXIT - function install_kernel() { if [[ "$1" =~ ^5.* ]]; then @@ -98,19 +83,9 @@ function install_kernel() function install_dpdk() { local DPDK_VER=$1 - local VERSION_FILE="dpdk-dir/travis-dpdk-cache-version" + local VERSION_FILE="dpdk-dir/cached-version" local DPDK_OPTS="" - local DPDK_LIB="" - - if [ -z "$TRAVIS_ARCH" ] || - [ "$TRAVIS_ARCH" == "amd64" ]; then - DPDK_LIB=$(pwd)/dpdk-dir/build/lib/x86_64-linux-gnu - elif [ "$TRAVIS_ARCH" == "aarch64" ]; then - DPDK_LIB=$(pwd)/dpdk-dir/build/lib/aarch64-linux-gnu - else - echo "Target is unknown" - exit 1 - fi + local DPDK_LIB=$(pwd)/dpdk-dir/build/lib/x86_64-linux-gnu if [ "$DPDK_SHARED" ]; then EXTRA_OPTS="$EXTRA_OPTS --with-dpdk=shared" @@ -245,7 +220,7 @@ elif [ "$M32" ]; then # Adding m32 flag directly to CC to avoid any posiible issues with API/ABI # difference on 'configure' and 'make' stages. export CC="$CC -m32" -elif [ "$TRAVIS_ARCH" != "aarch64" ]; then +else OPTS="--enable-sparse" if [ "$AFXDP" ]; then # netdev-afxdp uses memset for 64M for umem initialization. diff --git a/.ci/linux-prepare.sh b/.ci/linux-prepare.sh index 11d75a6d598..f414a879c70 100755 --- a/.ci/linux-prepare.sh +++ b/.ci/linux-prepare.sh @@ -10,14 +10,11 @@ fi # Build and install sparse. # -# Explicitly disable sparse support for llvm because some travis -# environments claim to have LLVM (llvm-config exists and works) but -# linking against it fails. # Disabling sqlite support because sindex build fails and we don't # really need this utility being installed. 
git clone git://git.kernel.org/pub/scm/devel/sparse/sparse.git cd sparse -make -j4 HAVE_LLVM= HAVE_SQLITE= install +make -j4 HAVE_SQLITE= install cd .. # Installing wheel separately because it may be needed to build some @@ -29,23 +26,8 @@ pip3 install --disable-pip-version-check --user \ flake8 'hacking>=3.0' netaddr pyparsing sphinx setuptools pyelftools pip3 install --user 'meson==0.53.2' -if [ "$M32" ]; then - # Installing 32-bit libraries. - pkgs="gcc-multilib" - if [ -z "$GITHUB_WORKFLOW" ]; then - # 32-bit and 64-bit libunwind can not be installed at the same time. - # This will remove the 64-bit libunwind and install 32-bit version. - # GitHub Actions doesn't have 32-bit versions of these libs. - pkgs=$pkgs" libunwind-dev:i386 libunbound-dev:i386" - fi - - sudo apt-get install -y $pkgs -fi - # Install python test dependencies pip3 install -r python/test_requirements.txt -# IPv6 is supported by kernel but disabled in TravisCI images: -# https://github.com/travis-ci/travis-ci/issues/8891 -# Enable it to avoid skipping of IPv6 related tests. +# Make sure IPv6 is enabled to avoid skipping of IPv6 related tests. sudo sysctl -w net.ipv6.conf.all.disable_ipv6=0 diff --git a/.ci/osx-build.sh b/.ci/osx-build.sh index f8facebeb02..09df61826f1 100755 --- a/.ci/osx-build.sh +++ b/.ci/osx-build.sh @@ -5,21 +5,6 @@ set -o errexit CFLAGS="-Werror $CFLAGS" EXTRA_OPTS="" -on_exit() { - if [ $? = 0 ]; then - exit - fi - FILES_TO_PRINT="config.log" - FILES_TO_PRINT="$FILES_TO_PRINT */_build/sub/tests/testsuite.log" - - for pr_file in $FILES_TO_PRINT; do - cat "$pr_file" 2>/dev/null - done -} -# We capture the error logs as artifacts in Github Actions, no need to dump -# them via a EXIT handler. 
-[ -n "$GITHUB_WORKFLOW" ] || trap on_exit EXIT - function configure_ovs() { ./boot.sh && ./configure $* diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index e08d7b1bac1..1949d12001b 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -133,8 +133,12 @@ jobs: - name: install common dependencies run: sudo apt install -y ${{ env.dependencies }} - name: install libunbound libunwind + # GitHub Actions doesn't have 32-bit versions of these libraries. if: matrix.m32 == '' run: sudo apt install -y libunbound-dev libunwind-dev + - name: install 32-bit libraries + if: matrix.m32 != '' + run: sudo apt install -y gcc-multilib - name: prepare run: ./.ci/linux-prepare.sh diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index c7aeede06e6..00000000000 --- a/.travis.yml +++ /dev/null @@ -1,57 +0,0 @@ -language: c - -os: - - linux - -cache: - directories: - - dpdk-dir - -addons: - apt: - packages: - - bc - - libssl-dev - - llvm-dev - - libjemalloc1 - - libjemalloc-dev - - libnuma-dev - - libpcap-dev - - python3-pip - - python3-sphinx - - libelf-dev - - selinux-policy-dev - - libunbound-dev - - libunwind-dev - - python3-setuptools - - python3-wheel - - ninja-build - -before_install: ./.ci/${TRAVIS_OS_NAME}-prepare.sh - -before_script: export PATH=$PATH:$HOME/bin - -matrix: - include: - - arch: arm64 - compiler: gcc - env: TESTSUITE=1 DPDK=1 - - arch: arm64 - compiler: gcc - env: KERNEL_LIST="5.5 4.19" - - arch: arm64 - compiler: gcc - env: KERNEL_LIST="4.9 3.16" - - arch: arm64 - compiler: gcc - env: DPDK_SHARED=1 - - arch: arm64 - compiler: clang - env: OPTS="--disable-ssl" - -script: ./.ci/${TRAVIS_OS_NAME}-build.sh $OPTS - -notifications: - email: - recipients: - - ovs-build@openvswitch.org diff --git a/Documentation/internals/contributing/submitting-patches.rst b/Documentation/internals/contributing/submitting-patches.rst index 9d718982712..8a8bc11b0a9 100644 --- 
a/Documentation/internals/contributing/submitting-patches.rst +++ b/Documentation/internals/contributing/submitting-patches.rst @@ -68,10 +68,9 @@ Testing is also important: feature. A bug fix patch should preferably add a test that would fail if the bug recurs. -If you are using GitHub, then you may utilize the travis-ci.org and the GitHub -Actions CI build systems. They will run some of the above tests automatically -when you push changes to your repository. See the "Continuous Integration with -Travis-CI" in :doc:`/topics/testing` for details on how to set it up. +If you are using GitHub, then you may utilize the GitHub Actions CI build +systems. They will run some of the above tests automatically +when you push changes to your repository. Email Subject ------------- diff --git a/Documentation/topics/testing.rst b/Documentation/topics/testing.rst index abccce1ee60..bc41b217a5c 100644 --- a/Documentation/topics/testing.rst +++ b/Documentation/topics/testing.rst @@ -474,46 +474,6 @@ You should invoke scan-view to view analysis results. The last line of output from ``clang-analyze`` will list the command (containing results directory) that you should invoke to view the results on a browser. -Continuous Integration with Travis CI -------------------------------------- - -A .travis.yml file is provided to automatically build Open vSwitch with various -build configurations and run the testsuite using Travis CI. Builds will be -performed with gcc, sparse and clang with the -Werror compiler flag included, -therefore the build will fail if a new warning has been introduced. - -The CI build is triggered via git push (regardless of the specific branch) or -pull request against any Open vSwitch GitHub repository that is linked to -travis-ci. - -Instructions to setup travis-ci for your GitHub repository: - -1. Go to https://travis-ci.org/ and sign in using your GitHub ID. -2. Go to the "Repositories" tab and enable the ovs repository. 
You may disable - builds for pushes or pull requests. -3. In order to avoid forks sending build failures to the upstream mailing list, - the notification email recipient is encrypted. If you want to receive email - notification for build failures, replace the encrypted string: - - 1. Install the travis-ci CLI (Requires ruby >=2.0): gem install travis - 2. In your Open vSwitch repository: travis encrypt mylist@mydomain.org - 3. Add/replace the notifications section in .travis.yml and fill in the - secure string as returned by travis encrypt:: - - notifications: - email: - recipients: - - secure: "....." - - .. note:: - You may remove/omit the notifications section to fall back to default - notification behaviour which is to send an email directly to the author and - committer of the failing commit. Note that the email is only sent if the - author/committer have commit rights for the particular GitHub repository. - -4. Pushing a commit to the repository which breaks the build or the - testsuite will now trigger a email sent to mylist@mydomain.org - vsperf ------ diff --git a/Makefile.am b/Makefile.am index 606bcc22e12..e605187b813 100644 --- a/Makefile.am +++ b/Makefile.am @@ -81,7 +81,6 @@ EXTRA_DIST = \ .ci/osx-prepare.sh \ .cirrus.yml \ .github/workflows/build-and-test.yml \ - .travis.yml \ appveyor.yml \ boot.sh \ poc/builders/Vagrantfile \ diff --git a/NEWS b/NEWS index c79d9f97dc4..c0095c345d1 100644 --- a/NEWS +++ b/NEWS @@ -20,6 +20,8 @@ Post-v3.0.0 * New option '--dump-hugepages' to include hugepages in core dumps. This can assist with postmortem analysis involving DPDK, but may also produce significantly larger core dump files. + - Support for travis-ci.org based continuous integration builds has been + dropped. v3.0.0 - 15 Aug 2022 diff --git a/README.rst b/README.rst index 8fe01f4cf23..a60a314feb3 100644 --- a/README.rst +++ b/README.rst @@ -8,8 +8,6 @@ Open vSwitch .. 
image:: https://github.com/openvswitch/ovs/workflows/Build%20and%20Test/badge.svg :target: https://github.com/openvswitch/ovs/actions -.. image:: https://travis-ci.org/openvswitch/ovs.png - :target: https://travis-ci.org/openvswitch/ovs .. image:: https://ci.appveyor.com/api/projects/status/github/openvswitch/ovs?branch=master&svg=true&retina=true :target: https://ci.appveyor.com/project/blp/ovs/history .. image:: https://api.cirrus-ci.com/github/openvswitch/ovs.svg From 526230bfab09095cf0214c7033382463b9d506cf Mon Sep 17 00:00:00 2001 From: Kevin Traynor Date: Wed, 30 Nov 2022 17:39:52 +0000 Subject: [PATCH 093/833] dpif-netdev: Make pmd-rxq-show time configurable. pmd-rxq-show shows the Rx queue to pmd assignments as well as the pmd usage of each Rx queue. Up until now a tail length of 60 seconds of pmd usage was shown for each Rx queue, as this is the value used during rebalance to avoid any spike effects. When debugging or tuning, it is also convenient to display the pmd usage of an Rx queue over a shorter time frame, so any changes in config or traffic that impact pmd usage can be evaluated more quickly. A parameter is added that allows pmd-rxq-show stats pmd usage to be shown for a shorter time frame. Values are rounded up to the nearest 5 seconds as that is the measurement granularity and the value used is displayed. e.g. $ ovs-appctl dpif-netdev/pmd-rxq-show -secs 5 Displaying last 5 seconds pmd usage % pmd thread numa_id 0 core_id 4: isolated : false port: dpdk0 queue-id: 0 (enabled) pmd usage: 95 % overhead: 4 % The default time frame has not changed and the maximum value is limited to the maximum stored tail length (60 seconds). 
Reviewed-by: David Marchand Signed-off-by: Kevin Traynor Signed-off-by: Ilya Maximets --- lib/dpif-netdev-private-thread.h | 2 +- lib/dpif-netdev.c | 98 ++++++++++++++++++++++++-------- tests/pmd.at | 62 ++++++++++++++++++++ 3 files changed, 138 insertions(+), 24 deletions(-) diff --git a/lib/dpif-netdev-private-thread.h b/lib/dpif-netdev-private-thread.h index 4472b199d5c..1ec3cd79470 100644 --- a/lib/dpif-netdev-private-thread.h +++ b/lib/dpif-netdev-private-thread.h @@ -114,7 +114,7 @@ struct dp_netdev_pmd_thread { atomic_ullong intrvl_cycles; /* Write index for 'busy_cycles_intrvl'. */ - unsigned int intrvl_idx; + atomic_count intrvl_idx; /* Busy cycles in last PMD_INTERVAL_MAX intervals. */ atomic_ullong *busy_cycles_intrvl; diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 9331f2cbac6..af99a91d1cc 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -160,11 +160,13 @@ static struct odp_support dp_netdev_support = { /* Time in microseconds of the interval in which rxq processing cycles used * in rxq to pmd assignments is measured and stored. */ -#define PMD_INTERVAL_LEN 10000000LL +#define PMD_INTERVAL_LEN 5000000LL +/* For converting PMD_INTERVAL_LEN to secs. */ +#define INTERVAL_USEC_TO_SEC 1000000LL /* Number of intervals for which cycles are stored * and used during rxq to pmd assignment. */ -#define PMD_INTERVAL_MAX 6 +#define PMD_INTERVAL_MAX 12 /* Time in microseconds to try RCU quiescing. */ #define PMD_RCU_QUIESCE_INTERVAL 10000LL @@ -428,7 +430,7 @@ struct dp_netdev_rxq { pinned. OVS_CORE_UNSPEC if the queue doesn't need to be pinned to a particular core. */ - unsigned intrvl_idx; /* Write index for 'cycles_intrvl'. */ + atomic_count intrvl_idx; /* Write index for 'cycles_intrvl'. */ struct dp_netdev_pmd_thread *pmd; /* pmd thread that polls this queue. */ bool is_vhost; /* Is rxq of a vhost port. 
*/ @@ -615,6 +617,9 @@ dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned long long cycles); static uint64_t dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx); +static uint64_t +get_interval_values(atomic_ullong *source, atomic_count *cur_idx, + int num_to_read); static void dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd, bool purge); @@ -869,7 +874,8 @@ sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list, } static void -pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd) +pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd, + int secs) { if (pmd->core_id != NON_PMD_CORE_ID) { struct rxq_poll *list; @@ -877,6 +883,7 @@ pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd) uint64_t total_cycles = 0; uint64_t busy_cycles = 0; uint64_t total_rxq_proc_cycles = 0; + unsigned int intervals; ds_put_format(reply, "pmd thread numa_id %d core_id %u:\n isolated : %s\n", @@ -888,15 +895,14 @@ pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd) /* Get the total pmd cycles for an interval. */ atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles); + /* Calculate how many intervals are to be used. */ + intervals = DIV_ROUND_UP(secs, + PMD_INTERVAL_LEN / INTERVAL_USEC_TO_SEC); /* Estimate the cycles to cover all intervals. 
*/ - total_cycles *= PMD_INTERVAL_MAX; - - for (int j = 0; j < PMD_INTERVAL_MAX; j++) { - uint64_t cycles; - - atomic_read_relaxed(&pmd->busy_cycles_intrvl[j], &cycles); - busy_cycles += cycles; - } + total_cycles *= intervals; + busy_cycles = get_interval_values(pmd->busy_cycles_intrvl, + &pmd->intrvl_idx, + intervals); if (busy_cycles > total_cycles) { busy_cycles = total_cycles; } @@ -906,9 +912,9 @@ pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd) const char *name = netdev_rxq_get_name(rxq->rx); uint64_t rxq_proc_cycles = 0; - for (int j = 0; j < PMD_INTERVAL_MAX; j++) { - rxq_proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, j); - } + rxq_proc_cycles = get_interval_values(rxq->cycles_intrvl, + &rxq->intrvl_idx, + intervals); total_rxq_proc_cycles += rxq_proc_cycles; ds_put_format(reply, " port: %-16s queue-id: %2d", name, netdev_rxq_get_queue_id(list[i].rxq->rx)); @@ -1422,6 +1428,10 @@ dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[], unsigned int core_id; bool filter_on_pmd = false; size_t n; + unsigned int secs = 0; + unsigned long long max_secs = (PMD_INTERVAL_LEN * PMD_INTERVAL_MAX) + / INTERVAL_USEC_TO_SEC; + bool first_show_rxq = true; ovs_mutex_lock(&dp_netdev_mutex); @@ -1432,6 +1442,14 @@ dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[], } argc -= 2; argv += 2; + } else if (type == PMD_INFO_SHOW_RXQ && + !strcmp(argv[1], "-secs") && + argc > 2) { + if (!str_to_uint(argv[2], 10, &secs)) { + secs = max_secs; + } + argc -= 2; + argv += 2; } else { dp = shash_find_data(&dp_netdevs, argv[1]); argc -= 1; @@ -1461,7 +1479,18 @@ dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[], continue; } if (type == PMD_INFO_SHOW_RXQ) { - pmd_info_show_rxq(&reply, pmd); + if (first_show_rxq) { + if (!secs || secs > max_secs) { + secs = max_secs; + } else { + secs = ROUND_UP(secs, + PMD_INTERVAL_LEN / INTERVAL_USEC_TO_SEC); + } + ds_put_format(&reply, "Displaying last 
%u seconds " + "pmd usage %%\n", secs); + first_show_rxq = false; + } + pmd_info_show_rxq(&reply, pmd, secs); } else if (type == PMD_INFO_CLEAR_STATS) { pmd_perf_stats_clear(&pmd->perf_stats); } else if (type == PMD_INFO_SHOW_STATS) { @@ -1576,8 +1605,9 @@ dpif_netdev_init(void) unixctl_command_register("dpif-netdev/pmd-stats-clear", "[-pmd core] [dp]", 0, 3, dpif_netdev_pmd_info, (void *)&clear_aux); - unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] [dp]", - 0, 3, dpif_netdev_pmd_info, + unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] " + "[-secs secs] [dp]", + 0, 5, dpif_netdev_pmd_info, (void *)&poll_aux); unixctl_command_register("dpif-netdev/pmd-perf-show", "[-nh] [-it iter-history-len]" @@ -5174,7 +5204,7 @@ static void dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned long long cycles) { - unsigned int idx = rx->intrvl_idx++ % PMD_INTERVAL_MAX; + unsigned int idx = atomic_count_inc(&rx->intrvl_idx) % PMD_INTERVAL_MAX; atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles); } @@ -6914,6 +6944,9 @@ pmd_thread_main(void *f_) reload: atomic_count_init(&pmd->pmd_overloaded, 0); + pmd->intrvl_tsc_prev = 0; + atomic_store_relaxed(&pmd->intrvl_cycles, 0); + if (!dpdk_attached) { dpdk_attached = dpdk_attach_thread(pmd->core_id); } @@ -6945,12 +6978,10 @@ pmd_thread_main(void *f_) } } - pmd->intrvl_tsc_prev = 0; - atomic_store_relaxed(&pmd->intrvl_cycles, 0); for (i = 0; i < PMD_INTERVAL_MAX; i++) { atomic_store_relaxed(&pmd->busy_cycles_intrvl[i], 0); } - pmd->intrvl_idx = 0; + atomic_count_set(&pmd->intrvl_idx, 0); cycles_counter_update(s); pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL; @@ -9931,7 +9962,7 @@ dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd, atomic_store_relaxed(&pmd->intrvl_cycles, curr_tsc - pmd->intrvl_tsc_prev); } - idx = pmd->intrvl_idx++ % PMD_INTERVAL_MAX; + idx = atomic_count_inc(&pmd->intrvl_idx) % PMD_INTERVAL_MAX; 
atomic_store_relaxed(&pmd->busy_cycles_intrvl[idx], tot_proc); pmd->intrvl_tsc_prev = curr_tsc; /* Start new measuring interval */ @@ -9954,6 +9985,27 @@ dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd, } } +/* Returns the sum of a specified number of newest to + * oldest interval values. 'cur_idx' is where the next + * write will be and wrap around needs to be handled. + */ +static uint64_t +get_interval_values(atomic_ullong *source, atomic_count *cur_idx, + int num_to_read) { + unsigned int i; + uint64_t total = 0; + + i = atomic_count_get(cur_idx) % PMD_INTERVAL_MAX; + for (int read = 0; read < num_to_read; read++) { + uint64_t interval_value; + + i = i ? i - 1 : PMD_INTERVAL_MAX - 1; + atomic_read_relaxed(&source[i], &interval_value); + total += interval_value; + } + return total; +} + /* Insert 'rule' into 'cls'. */ static void dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule, diff --git a/tests/pmd.at b/tests/pmd.at index 10879a349b9..ed90f88c4cb 100644 --- a/tests/pmd.at +++ b/tests/pmd.at @@ -70,6 +70,7 @@ CHECK_CPU_DISCOVERED() CHECK_PMD_THREADS_CREATED() AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed SED_NUMA_CORE_PATTERN], [0], [dnl +Displaying last 60 seconds pmd usage % pmd thread numa_id core_id : isolated : false port: p0 queue-id: 0 (enabled) pmd usage: NOT AVAIL @@ -102,6 +103,7 @@ dummy@ovs-dummy: hit:0 missed:0 ]) AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed SED_NUMA_CORE_PATTERN], [0], [dnl +Displaying last 60 seconds pmd usage % pmd thread numa_id core_id : isolated : false port: p0 queue-id: 0 (enabled) pmd usage: NOT AVAIL @@ -134,6 +136,7 @@ dummy@ovs-dummy: hit:0 missed:0 ]) AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed SED_NUMA_CORE_PATTERN], [0], [dnl +Displaying last 60 seconds pmd usage % pmd thread numa_id core_id : isolated : false port: p0 queue-id: 0 (enabled) pmd usage: NOT AVAIL @@ -183,6 +186,7 @@ AT_CHECK([ovs-vsctl set Open_vSwitch . 
other_config:pmd-cpu-mask=0x1]) CHECK_PMD_THREADS_CREATED([1], [], [+$TMP]) AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed SED_NUMA_CORE_PATTERN], [0], [dnl +Displaying last 60 seconds pmd usage % pmd thread numa_id core_id : isolated : false port: p0 queue-id: 0 (enabled) pmd usage: NOT AVAIL @@ -215,6 +219,7 @@ dummy@ovs-dummy: hit:0 missed:0 ]) AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed SED_NUMA_CORE_PATTERN], [0], [dnl +Displaying last 60 seconds pmd usage % pmd thread numa_id core_id : isolated : false port: p0 queue-id: 0 (enabled) pmd usage: NOT AVAIL @@ -280,6 +285,7 @@ CHECK_PMD_THREADS_CREATED([1], [1], [+$TMP]) OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "Performing pmd to rx queue assignment using group algorithm"]) AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show], [0], [dnl +Displaying last 60 seconds pmd usage % pmd thread numa_id 1 core_id 1: isolated : false port: p0 queue-id: 0 (enabled) pmd usage: NOT AVAIL @@ -302,6 +308,7 @@ AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-assign=roundrobin]) OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "Performing pmd to rx queue assignment using roundrobin algorithm"]) AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show], [0], [dnl +Displaying last 60 seconds pmd usage % pmd thread numa_id 1 core_id 1: isolated : false port: p0 queue-id: 0 (enabled) pmd usage: NOT AVAIL @@ -322,6 +329,7 @@ AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-assign=cycles]) OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "Performing pmd to rx queue assignment using cycles algorithm"]) AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show], [0], [dnl +Displaying last 60 seconds pmd usage % pmd thread numa_id 1 core_id 1: isolated : false port: p0 queue-id: 0 (enabled) pmd usage: NOT AVAIL @@ -343,6 +351,7 @@ AT_CHECK([ovs-vsctl set Open_vSwitch . 
other_config:pmd-cpu-mask=0x1]) CHECK_PMD_THREADS_CREATED([1], [1], [+$TMP]) AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show], [0], [dnl +Displaying last 60 seconds pmd usage % pmd thread numa_id 1 core_id 0: isolated : false port: p0 queue-id: 0 (enabled) pmd usage: NOT AVAIL @@ -471,6 +480,59 @@ pmd thread numa_id core_id : OVS_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([PMD - pmd-rxq-show pmd usage time]) +OVS_VSWITCHD_START([add-port br0 p0 -- set Interface p0 type=dummy-pmd], [], [], [DUMMY_NUMA]) + +#CHECK_CPU_DISCOVERED() +#CHECK_PMD_THREADS_CREATED() + +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | grep Displaying], [0], [dnl +Displaying last 60 seconds pmd usage % +]) + +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs -1 | grep Displaying], [0], [dnl +Displaying last 60 seconds pmd usage % +]) + +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs 0 | grep Displaying], [0], [dnl +Displaying last 60 seconds pmd usage % +]) + +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs 1 | grep Displaying], [0], [dnl +Displaying last 5 seconds pmd usage % +]) + +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs 5 | grep Displaying], [0], [dnl +Displaying last 5 seconds pmd usage % +]) + +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs 6 | grep Displaying], [0], [dnl +Displaying last 10 seconds pmd usage % +]) + +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs 51 | grep Displaying], [0], [dnl +Displaying last 55 seconds pmd usage % +]) + +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs 55 | grep Displaying], [0], [dnl +Displaying last 55 seconds pmd usage % +]) + +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs 56 | grep Displaying], [0], [dnl +Displaying last 60 seconds pmd usage % +]) + +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs 60 | grep Displaying], [0], [dnl +Displaying last 60 seconds pmd usage % +]) + +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs 61 | grep Displaying], [0], [dnl +Displaying last 60 seconds pmd usage % +]) + 
+OVS_VSWITCHD_STOP +AT_CLEANUP + dnl Reconfigure the number of rx queues of a port, make sure that all the dnl queues are polled by the datapath and try to send a couple of packets. AT_SETUP([PMD - reconfigure n_rxq]) From e9ab15f4f82330e0d7bc33e57d3357fa52f76749 Mon Sep 17 00:00:00 2001 From: Kevin Traynor Date: Wed, 30 Nov 2022 17:39:53 +0000 Subject: [PATCH 094/833] docs: Add documentation for pmd-rxq-show secs parameter. Add description of new '-secs' parameter in docs. Also, add to NEWS as it is a user facing change. Reviewed-by: David Marchand Signed-off-by: Kevin Traynor Signed-off-by: Ilya Maximets --- Documentation/topics/dpdk/pmd.rst | 23 ++++++++++++++++++----- NEWS | 3 +++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/Documentation/topics/dpdk/pmd.rst b/Documentation/topics/dpdk/pmd.rst index b259cc8b32d..88457f36694 100644 --- a/Documentation/topics/dpdk/pmd.rst +++ b/Documentation/topics/dpdk/pmd.rst @@ -101,12 +101,20 @@ core cycles for each Rx queue:: .. note:: - A history of one minute is recorded and shown for each Rx queue to allow for - traffic pattern spikes. Any changes in the Rx queue's PMD core cycles usage, - due to traffic pattern or reconfig changes, will take one minute to be fully - reflected in the stats. + By default a history of one minute is recorded and shown for each Rx queue + to allow for traffic pattern spikes. Any changes in the Rx queue's PMD core + cycles usage, due to traffic pattern or reconfig changes, will take one + minute to be fully reflected in the stats by default. - .. versionchanged:: 2.6.0 +PMD thread usage of an Rx queue can be displayed for a shorter period of time, +from the last 5 seconds up to the default 60 seconds in 5 second steps. + +To see the port/Rx queue assignment and the last 5 secs of measured usage +history of PMD core cycles for each Rx queue:: + + $ ovs-appctl dpif-netdev/pmd-rxq-show -secs 5 + +.. versionchanged:: 2.6.0 The ``pmd-rxq-show`` command was added in OVS 2.6.0. 
@@ -115,6 +123,11 @@ core cycles for each Rx queue:: A ``overhead`` statistics is shown per PMD: it represents the number of cycles inherently consumed by the OVS PMD processing loop. +.. versionchanged:: 3.1.0 + + The ``-secs`` parameter was added to the dpif-netdev/pmd-rxq-show + command. + Rx queue to PMD assignment takes place whenever there are configuration changes or can be triggered by using:: diff --git a/NEWS b/NEWS index c0095c345d1..92d33c2912a 100644 --- a/NEWS +++ b/NEWS @@ -22,6 +22,9 @@ Post-v3.0.0 significantly larger core dump files. - Support for travis-ci.org based continuous integration builds has been dropped. + - Userspace datapath: + * Add '-secs' argument to appctl 'dpif-netdev/pmd-rxq-show' to show + the pmd usage of an Rx queue over a configurable time period. v3.0.0 - 15 Aug 2022 From ad6e506fcb63e34f3398c5284cb2bd1858ac3a49 Mon Sep 17 00:00:00 2001 From: Kevin Traynor Date: Wed, 30 Nov 2022 17:39:54 +0000 Subject: [PATCH 095/833] dpif-netdev: Rename pmd_info_show_rxq variables. There are some similar readings taken for pmds and Rx queues in this function and a few of the variable names are ambiguous. Improve the readability of the code by updating some variables names to indicate that they are readings related to the pmd. 
Reviewed-by: David Marchand Signed-off-by: Kevin Traynor Signed-off-by: Ilya Maximets --- lib/dpif-netdev.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index af99a91d1cc..c015fb6ddc9 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -880,8 +880,8 @@ pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd, if (pmd->core_id != NON_PMD_CORE_ID) { struct rxq_poll *list; size_t n_rxq; - uint64_t total_cycles = 0; - uint64_t busy_cycles = 0; + uint64_t total_pmd_cycles = 0; + uint64_t busy_pmd_cycles = 0; uint64_t total_rxq_proc_cycles = 0; unsigned int intervals; @@ -894,17 +894,17 @@ pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd, sorted_poll_list(pmd, &list, &n_rxq); /* Get the total pmd cycles for an interval. */ - atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles); + atomic_read_relaxed(&pmd->intrvl_cycles, &total_pmd_cycles); /* Calculate how many intervals are to be used. */ intervals = DIV_ROUND_UP(secs, PMD_INTERVAL_LEN / INTERVAL_USEC_TO_SEC); /* Estimate the cycles to cover all intervals. */ - total_cycles *= intervals; - busy_cycles = get_interval_values(pmd->busy_cycles_intrvl, - &pmd->intrvl_idx, - intervals); - if (busy_cycles > total_cycles) { - busy_cycles = total_cycles; + total_pmd_cycles *= intervals; + busy_pmd_cycles = get_interval_values(pmd->busy_cycles_intrvl, + &pmd->intrvl_idx, + intervals); + if (busy_pmd_cycles > total_pmd_cycles) { + busy_pmd_cycles = total_pmd_cycles; } for (int i = 0; i < n_rxq; i++) { @@ -921,9 +921,9 @@ pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd, ds_put_format(reply, " %s", netdev_rxq_enabled(list[i].rxq->rx) ? 
"(enabled) " : "(disabled)"); ds_put_format(reply, " pmd usage: "); - if (total_cycles) { + if (total_pmd_cycles) { ds_put_format(reply, "%2"PRIu64"", - rxq_proc_cycles * 100 / total_cycles); + rxq_proc_cycles * 100 / total_pmd_cycles); ds_put_cstr(reply, " %"); } else { ds_put_format(reply, "%s", "NOT AVAIL"); @@ -933,14 +933,14 @@ pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd, if (n_rxq > 0) { ds_put_cstr(reply, " overhead: "); - if (total_cycles) { + if (total_pmd_cycles) { uint64_t overhead_cycles = 0; - if (total_rxq_proc_cycles < busy_cycles) { - overhead_cycles = busy_cycles - total_rxq_proc_cycles; + if (total_rxq_proc_cycles < busy_pmd_cycles) { + overhead_cycles = busy_pmd_cycles - total_rxq_proc_cycles; } ds_put_format(reply, "%2"PRIu64" %%", - overhead_cycles * 100 / total_cycles); + overhead_cycles * 100 / total_pmd_cycles); } else { ds_put_cstr(reply, "NOT AVAIL"); } From 46e04ec31bb2b889bd5715d436be2bdc0268f08b Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Sat, 17 Dec 2022 13:15:36 +0000 Subject: [PATCH 096/833] dpif-netdev: Calculate per numa variance. Currently, pmd_rebalance_dry_run() calculates the overall variance of all pmds regardless of their numa location. The overall result may hide an imbalance in an individual numa. Consider the following case. Numa0 is free because VMs on numa0 are not sending pkts, while numa1 is busy. Within numa1, pmd workloads are not balanced. Obviously, moving 500 kpps of workload from pmd 126 to pmd 62 will make numa1 much more balanced. For numa1 the variance improvement will be almost 100%, because after rebalance each pmd in numa1 holds the same workload (variance ~= 0). But the overall variance improvement is only about 20%, which may not trigger auto_lb. ``` numa_id core_id kpps 0 30 0 0 31 0 0 94 0 0 95 0 1 126 1500 1 127 1000 1 63 1000 1 62 500 ``` As auto_lb doesn't balance workload across numa nodes, it makes more sense to calculate variance improvement per numa node. 
Signed-off-by: Cheng Li Signed-off-by: Kevin Traynor Co-authored-by: Kevin Traynor Acked-by: Kevin Traynor Signed-off-by: Ilya Maximets --- Documentation/topics/dpdk/pmd.rst | 8 +-- lib/dpif-netdev.c | 87 +++++++++++++++---------------- 2 files changed, 47 insertions(+), 48 deletions(-) diff --git a/Documentation/topics/dpdk/pmd.rst b/Documentation/topics/dpdk/pmd.rst index 88457f36694..9006fd40f07 100644 --- a/Documentation/topics/dpdk/pmd.rst +++ b/Documentation/topics/dpdk/pmd.rst @@ -291,10 +291,10 @@ If a PMD core is detected to be above the load threshold and the minimum pre-requisites are met, a dry-run using the current PMD assignment algorithm is performed. -The current variance of load between the PMD cores and estimated variance from -the dry-run are both calculated. If the estimated dry-run variance is improved -from the current one by the variance threshold, a new Rx queue to PMD -assignment will be performed. +For each numa node, the current variance of load between the PMD cores and +estimated variance from the dry-run are both calculated. If any numa's +estimated dry-run variance is improved from the current one by the variance +threshold, a new Rx queue to PMD assignment will be performed. 
For example, to set the variance improvement threshold to 40%:: diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index c015fb6ddc9..7127068fe0e 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -6131,39 +6131,33 @@ rxq_scheduling(struct dp_netdev *dp) static uint64_t variance(uint64_t a[], int n); static uint64_t -sched_numa_list_variance(struct sched_numa_list *numa_list) +sched_numa_variance(struct sched_numa *numa) { - struct sched_numa *numa; uint64_t *percent_busy = NULL; - unsigned total_pmds = 0; int n_proc = 0; uint64_t var; - HMAP_FOR_EACH (numa, node, &numa_list->numas) { - total_pmds += numa->n_pmds; - percent_busy = xrealloc(percent_busy, - total_pmds * sizeof *percent_busy); + percent_busy = xmalloc(numa->n_pmds * sizeof *percent_busy); - for (unsigned i = 0; i < numa->n_pmds; i++) { - struct sched_pmd *sched_pmd; - uint64_t total_cycles = 0; + for (unsigned i = 0; i < numa->n_pmds; i++) { + struct sched_pmd *sched_pmd; + uint64_t total_cycles = 0; - sched_pmd = &numa->pmds[i]; - /* Exclude isolated PMDs from variance calculations. */ - if (sched_pmd->isolated == true) { - continue; - } - /* Get the total pmd cycles for an interval. */ - atomic_read_relaxed(&sched_pmd->pmd->intrvl_cycles, &total_cycles); - - if (total_cycles) { - /* Estimate the cycles to cover all intervals. */ - total_cycles *= PMD_INTERVAL_MAX; - percent_busy[n_proc++] = (sched_pmd->pmd_proc_cycles * 100) - / total_cycles; - } else { - percent_busy[n_proc++] = 0; - } + sched_pmd = &numa->pmds[i]; + /* Exclude isolated PMDs from variance calculations. */ + if (sched_pmd->isolated == true) { + continue; + } + /* Get the total pmd cycles for an interval. */ + atomic_read_relaxed(&sched_pmd->pmd->intrvl_cycles, &total_cycles); + + if (total_cycles) { + /* Estimate the cycles to cover all intervals. 
*/ + total_cycles *= PMD_INTERVAL_MAX; + percent_busy[n_proc++] = (sched_pmd->pmd_proc_cycles * 100) + / total_cycles; + } else { + percent_busy[n_proc++] = 0; } } var = variance(percent_busy, n_proc); @@ -6237,6 +6231,7 @@ pmd_rebalance_dry_run(struct dp_netdev *dp) struct sched_numa_list numa_list_est; bool thresh_met = false; uint64_t current_var, estimate_var; + struct sched_numa *numa_cur, *numa_est; uint64_t improvement = 0; VLOG_DBG("PMD auto load balance performing dry run."); @@ -6255,25 +6250,29 @@ pmd_rebalance_dry_run(struct dp_netdev *dp) sched_numa_list_count(&numa_list_est) == 1) { /* Calculate variances. */ - current_var = sched_numa_list_variance(&numa_list_cur); - estimate_var = sched_numa_list_variance(&numa_list_est); - - if (estimate_var < current_var) { - improvement = ((current_var - estimate_var) * 100) / current_var; - } - VLOG_DBG("Current variance %"PRIu64" Estimated variance %"PRIu64".", - current_var, estimate_var); - VLOG_DBG("Variance improvement %"PRIu64"%%.", improvement); - - if (improvement >= dp->pmd_alb.rebalance_improve_thresh) { - thresh_met = true; - VLOG_DBG("PMD load variance improvement threshold %u%% " - "is met.", dp->pmd_alb.rebalance_improve_thresh); - } else { - VLOG_DBG("PMD load variance improvement threshold " - "%u%% is not met.", - dp->pmd_alb.rebalance_improve_thresh); + HMAP_FOR_EACH (numa_cur, node, &numa_list_cur.numas) { + numa_est = sched_numa_list_lookup(&numa_list_est, + numa_cur->numa_id); + if (!numa_est) { + continue; + } + current_var = sched_numa_variance(numa_cur); + estimate_var = sched_numa_variance(numa_est); + if (estimate_var < current_var) { + improvement = ((current_var - estimate_var) * 100) + / current_var; + } + VLOG_DBG("Numa node %d. Current variance %"PRIu64" Estimated " + "variance %"PRIu64". 
Variance improvement %"PRIu64"%%.", + numa_cur->numa_id, current_var, + estimate_var, improvement); + if (improvement >= dp->pmd_alb.rebalance_improve_thresh) { + thresh_met = true; + } } + VLOG_DBG("PMD load variance improvement threshold %u%% is %s.", + dp->pmd_alb.rebalance_improve_thresh, + thresh_met ? "met" : "not met"); } else { VLOG_DBG("PMD auto load balance detected cross-numa polling with " "multiple numa nodes. Unable to accurately estimate."); From d83d7c4915f1fc538f52fd05076532b744e389dd Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 22 Dec 2022 01:06:18 +0100 Subject: [PATCH 097/833] ci: Fix overriding OPTS provided from the yml. For GCC builds we're overriding --disable-ssl or --enable-shared options set up in the GHA yml file. Fix that by adding to EXTRA_OPTS instead. Fixes: 2581b0ad1159 ("travis: Combine kernel builds.") Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- .ci/linux-build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/linux-build.sh b/.ci/linux-build.sh index c06186ce1cf..a944cf14962 100755 --- a/.ci/linux-build.sh +++ b/.ci/linux-build.sh @@ -221,7 +221,7 @@ elif [ "$M32" ]; then # difference on 'configure' and 'make' stages. export CC="$CC -m32" else - OPTS="--enable-sparse" + EXTRA_OPTS="$EXTRA_OPTS --enable-sparse" if [ "$AFXDP" ]; then # netdev-afxdp uses memset for 64M for umem initialization. SPARSE_FLAGS="${SPARSE_FLAGS} -Wno-memcpy-max-count" From 0d8318db633fb24936a0f55e869331f0c27f243f Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 22 Dec 2022 01:06:19 +0100 Subject: [PATCH 098/833] netdev-afxdp: Disable -Wfree-nonheap-object on receive. 
GCC 11+ generates a warning: In file included from lib/netdev-linux-private.h:30, from lib/netdev-afxdp.c:19: In function 'dp_packet_delete', inlined from 'dp_packet_delete' at lib/dp-packet.h:246:1, inlined from 'dp_packet_batch_add__' at lib/dp-packet.h:775:9, inlined from 'dp_packet_batch_add' at lib/dp-packet.h:783:5, inlined from 'netdev_afxdp_rxq_recv' at lib/netdev-afxdp.c:898:9: lib/dp-packet.h:260:9: warning: 'free' called on pointer '*umem.xpool.array' with nonzero offset [8, 2558044588346441168] [-Wfree-nonheap-object] 260 | free(b); | ^~~~~~~ But it is a false positive since the code path is not possible. In this call chain the packet will always have source DPBUF_AFXDP and the free() will never be called. GCC doesn't see that, because initialization function dp_packet_use_afxdp() is part of a different translation unit. Disabling a warning in this particular place to avoid build failures. Older versions of clang do not have the -Wfree-nonheap-object, so we need to additionally guard the pragmas. Clang is using GCC pragmas and complains about unknown ones. Reported-at: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108187 Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- lib/netdev-afxdp.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c index ca3f2431eac..4d57efa5ce9 100644 --- a/lib/netdev-afxdp.c +++ b/lib/netdev-afxdp.c @@ -868,9 +868,22 @@ netdev_afxdp_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch, OVS_XDP_HEADROOM); dp_packet_set_size(packet, len); +#if __GNUC__ >= 11 && !__clang__ + /* GCC 11+ generates a false-positive warning about free() being + * called on DPBUF_AFXDP packet, but it is an imposisible code path. + * Disabling a warning to avoid build failures. + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108187 */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wfree-nonheap-object" +#endif + /* Add packet into batch, increase batch->count. 
*/ dp_packet_batch_add(batch, packet); +#if __GNUC__ && !__clang__ +#pragma GCC diagnostic pop +#endif + idx_rx++; } /* Release the RX queue. */ From 1dcc490d44879f33392337dfd9175645fcc4118e Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 22 Dec 2022 01:06:20 +0100 Subject: [PATCH 099/833] netdev-afxdp: Allow building with libxdp and newer libbpf. AF_XDP functions were deprecated in libbpf 0.7 and moved to libxdp. Functions bpf_get/set_link_xdp_id() were deprecated in libbpf 0.8 and replaced with bpf_xdp_query_id() and bpf_xdp_attach/detach(). Updating configuration and source code to accommodate the above changes and allow building OVS with AF_XDP support on newer systems: - Checking the version of libbpf by detecting availability of bpf_xdp_detach. - Checking availability of the libxdp in a system by looking for a library providing libxdp_strerror(), if libbpf is newer than 0.6. And checking for xsk.h header provided by libxdp-dev[el]. - Use xsk.h from libbpf if it is older than 0.7 and not linking with libxdp in this case as there are known incompatible versions of libxdp in distributions. - Check for the NEED_WAKEUP feature replaced with direct checking in the source code if XDP_USE_NEED_WAKEUP is defined. - Checking availability of bpf_xdp_query_id and bpf_xdp_detach and using them instead of deprecated APIs. Fall back to old functions if not found. - Dropped LIBBPF_LDADD variable as it makes library and function detection much harder without providing any actual benefits. AC_SEARCH_LIBS is used instead and it allows use of AC_CHECK_FUNCS. - Header includes moved around to files where they are actually used. - Removed libelf dependency as it is not really used. With these changes it should be possible to build OVS with either: - libbpf built from the kernel sources (5.19 or older). - libbpf < 0.7 provided in distributions. - libxdp and libbpf >= 0.7 provided in newer distributions. 
While it is technically possible to build with libbpf 0.7+ without libxdp at the moment we're not allowing that for a few reasons. First, required functions in libbpf are deprecated and can be removed in future releases. Second, support for all these combinations makes the detection code fairly complex. AFAIK, most of the distributions packaging libbpf 0.7+ do package libxdp as well. libxdp added as a build dependency for Fedora build since all supported versions of Fedora are packaging this library. Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- NEWS | 2 ++ acinclude.m4 | 28 ++++++++++++++---------- lib/automake.mk | 1 - lib/libopenvswitch.pc.in | 2 +- lib/netdev-afxdp-pool.c | 2 ++ lib/netdev-afxdp-pool.h | 5 ----- lib/netdev-afxdp.c | 38 ++++++++++++++++++++++++++------- rhel/openvswitch-fedora.spec.in | 2 +- 8 files changed, 53 insertions(+), 27 deletions(-) diff --git a/NEWS b/NEWS index 92d33c2912a..ce5d11d73a9 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,8 @@ Post-v3.0.0 -------------------- - ovs-vswitchd now detects changes in CPU affinity and adjusts the number of handler and revalidator threads if necessary. + - AF_XDP: + * Added support for building with libxdp and libbpf >= 0.7. - ovs-appctl: * "ovs-appctl ofproto/trace" command can now display port names with the "--names" option. 
diff --git a/acinclude.m4 b/acinclude.m4 index aa9af55062f..e47e925b376 100644 --- a/acinclude.m4 +++ b/acinclude.m4 @@ -251,7 +251,7 @@ AC_DEFUN([OVS_FIND_DEPENDENCY], [ dnl OVS_CHECK_LINUX_AF_XDP dnl -dnl Check both Linux kernel AF_XDP and libbpf support +dnl Check both Linux kernel AF_XDP and libbpf/libxdp support AC_DEFUN([OVS_CHECK_LINUX_AF_XDP], [ AC_ARG_ENABLE([afxdp], [AS_HELP_STRING([--enable-afxdp], [Enable AF-XDP support])], @@ -270,8 +270,21 @@ AC_DEFUN([OVS_CHECK_LINUX_AF_XDP], [ AC_CHECK_HEADER([linux/if_xdp.h], [], [AC_MSG_ERROR([unable to find linux/if_xdp.h for AF_XDP support])]) - AC_CHECK_HEADER([bpf/xsk.h], [], - [AC_MSG_ERROR([unable to find bpf/xsk.h for AF_XDP support])]) + OVS_FIND_DEPENDENCY([libbpf_strerror], [bpf], [libbpf]) + AC_CHECK_FUNCS([bpf_xdp_query_id bpf_xdp_detach]) + + if test "x$ac_cv_func_bpf_xdp_detach" = xyes; then + dnl We have libbpf >= 0.7. Look for libxdp as xsk functions + dnl were moved into this library. + OVS_FIND_DEPENDENCY([libxdp_strerror], [xdp], [libxdp]) + AC_CHECK_HEADER([xdp/xsk.h], + AC_DEFINE([HAVE_LIBXDP], [1], [xsk.h is supplied with libxdp]), + AC_MSG_ERROR([unable to find xdp/xsk.h for AF_XDP support])) + else + dnl libbpf < 0.7 contains all the necessary functionality. 
+ AC_CHECK_HEADER([bpf/xsk.h], [], + [AC_MSG_ERROR([unable to find bpf/xsk.h for AF_XDP support])]) + fi AC_CHECK_FUNCS([pthread_spin_lock], [], [AC_MSG_ERROR([unable to find pthread_spin_lock for AF_XDP support])]) @@ -280,13 +293,6 @@ AC_DEFUN([OVS_CHECK_LINUX_AF_XDP], [ AC_DEFINE([HAVE_AF_XDP], [1], [Define to 1 if AF_XDP support is available and enabled.]) - LIBBPF_LDADD=" -lbpf -lelf" - AC_SUBST([LIBBPF_LDADD]) - - AC_CHECK_DECL([xsk_ring_prod__needs_wakeup], [ - AC_DEFINE([HAVE_XDP_NEED_WAKEUP], [1], - [XDP need wakeup support detected in xsk.h.]) - ], [], [[#include ]]) fi AM_CONDITIONAL([HAVE_AF_XDP], test "$AF_XDP_ENABLE" = true) ]) @@ -357,7 +363,7 @@ AC_DEFUN([OVS_CHECK_DPDK], [ ], [], [[#include ]]) AC_CHECK_DECL([RTE_NET_AF_XDP], [ - LIBBPF_LDADD="-lbpf" + OVS_FIND_DEPENDENCY([libbpf_strerror], [bpf], [libbpf]) ], [], [[#include ]]) AC_CHECK_DECL([RTE_LIBRTE_VHOST_NUMA], [ diff --git a/lib/automake.mk b/lib/automake.mk index a0fabe38f36..61bdc308f07 100644 --- a/lib/automake.mk +++ b/lib/automake.mk @@ -9,7 +9,6 @@ lib_LTLIBRARIES += lib/libopenvswitch.la lib_libopenvswitch_la_LIBADD = $(SSL_LIBS) lib_libopenvswitch_la_LIBADD += $(CAPNG_LDADD) -lib_libopenvswitch_la_LIBADD += $(LIBBPF_LDADD) if WIN32 diff --git a/lib/libopenvswitch.pc.in b/lib/libopenvswitch.pc.in index 44fbb1f9fd2..a5f4d39479a 100644 --- a/lib/libopenvswitch.pc.in +++ b/lib/libopenvswitch.pc.in @@ -7,5 +7,5 @@ Name: libopenvswitch Description: Open vSwitch library Version: @VERSION@ Libs: -L${libdir} -lopenvswitch -Libs.private: @LIBS@ @SSL_LIBS@ @CAPNG_LDADD@ @LIBBPF_LDADD@ +Libs.private: @LIBS@ @SSL_LIBS@ @CAPNG_LDADD@ Cflags: -I${includedir} diff --git a/lib/netdev-afxdp-pool.c b/lib/netdev-afxdp-pool.c index 3386d2dcf78..f56a7b29ece 100644 --- a/lib/netdev-afxdp-pool.c +++ b/lib/netdev-afxdp-pool.c @@ -15,6 +15,8 @@ */ #include +#include + #include "dp-packet.h" #include "netdev-afxdp-pool.h" #include "openvswitch/util.h" diff --git a/lib/netdev-afxdp-pool.h 
b/lib/netdev-afxdp-pool.h index f929b9489c7..6681cf539e9 100644 --- a/lib/netdev-afxdp-pool.h +++ b/lib/netdev-afxdp-pool.h @@ -19,12 +19,7 @@ #ifdef HAVE_AF_XDP -#include -#include -#include - #include "openvswitch/thread.h" -#include "ovs-atomic.h" /* LIFO ptr_array. */ struct umem_pool { diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c index 4d57efa5ce9..f8995da1fda 100644 --- a/lib/netdev-afxdp.c +++ b/lib/netdev-afxdp.c @@ -21,6 +21,11 @@ #include "netdev-afxdp.h" #include "netdev-afxdp-pool.h" +#ifdef HAVE_LIBXDP +#include +#else +#include +#endif #include #include #include @@ -29,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -44,6 +50,7 @@ #include "openvswitch/list.h" #include "openvswitch/thread.h" #include "openvswitch/vlog.h" +#include "ovs-atomic.h" #include "ovs-numa.h" #include "packets.h" #include "socket-util.h" @@ -72,7 +79,7 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20); #define PROD_NUM_DESCS XSK_RING_PROD__DEFAULT_NUM_DESCS #define CONS_NUM_DESCS XSK_RING_CONS__DEFAULT_NUM_DESCS -#ifdef HAVE_XDP_NEED_WAKEUP +#ifdef XDP_USE_NEED_WAKEUP #define NEED_WAKEUP_DEFAULT true #else #define NEED_WAKEUP_DEFAULT false @@ -169,7 +176,7 @@ struct netdev_afxdp_tx_lock { ); }; -#ifdef HAVE_XDP_NEED_WAKEUP +#ifdef XDP_USE_NEED_WAKEUP static inline void xsk_rx_wakeup_if_needed(struct xsk_umem_info *umem, struct netdev *netdev, int fd) @@ -201,7 +208,7 @@ xsk_tx_need_wakeup(struct xsk_socket_info *xsk_info) return xsk_ring_prod__needs_wakeup(&xsk_info->tx); } -#else /* !HAVE_XDP_NEED_WAKEUP */ +#else /* !XDP_USE_NEED_WAKEUP */ static inline void xsk_rx_wakeup_if_needed(struct xsk_umem_info *umem OVS_UNUSED, struct netdev *netdev OVS_UNUSED, @@ -215,7 +222,7 @@ xsk_tx_need_wakeup(struct xsk_socket_info *xsk_info OVS_UNUSED) { return true; } -#endif /* HAVE_XDP_NEED_WAKEUP */ +#endif /* XDP_USE_NEED_WAKEUP */ static void netdev_afxdp_cleanup_unused_pool(struct unused_pool *pool) @@ -351,7 +358,7 @@ 
xsk_configure_socket(struct xsk_umem_info *umem, uint32_t ifindex, cfg.bind_flags = xdp_modes[mode].bind_flags; cfg.xdp_flags = xdp_modes[mode].xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST; -#ifdef HAVE_XDP_NEED_WAKEUP +#ifdef XDP_USE_NEED_WAKEUP if (use_need_wakeup) { cfg.bind_flags |= XDP_USE_NEED_WAKEUP; } @@ -377,7 +384,11 @@ xsk_configure_socket(struct xsk_umem_info *umem, uint32_t ifindex, } /* Make sure the built-in AF_XDP program is loaded. */ +#ifdef HAVE_BPF_XDP_QUERY_ID + ret = bpf_xdp_query_id(ifindex, cfg.xdp_flags, &prog_id); +#else ret = bpf_get_link_xdp_id(ifindex, &prog_id, cfg.xdp_flags); +#endif if (ret || !prog_id) { if (ret) { VLOG_ERR("Get XDP prog ID failed (%s)", ovs_strerror(errno)); @@ -630,9 +641,9 @@ netdev_afxdp_set_config(struct netdev *netdev, const struct smap *args, } need_wakeup = smap_get_bool(args, "use-need-wakeup", NEED_WAKEUP_DEFAULT); -#ifndef HAVE_XDP_NEED_WAKEUP +#ifndef XDP_USE_NEED_WAKEUP if (need_wakeup) { - VLOG_WARN("XDP need_wakeup is not supported in libbpf."); + VLOG_WARN("XDP need_wakeup is not supported in libbpf/libxdp."); need_wakeup = false; } #endif @@ -742,7 +753,11 @@ xsk_remove_xdp_program(uint32_t ifindex, enum afxdp_mode mode) uint32_t ret, prog_id = 0; /* Check whether XDP program is loaded. 
*/ +#ifdef HAVE_BPF_XDP_QUERY_ID + ret = bpf_xdp_query_id(ifindex, flags, &prog_id); +#else ret = bpf_get_link_xdp_id(ifindex, &prog_id, flags); +#endif if (ret) { VLOG_ERR("Failed to get XDP prog id (%s)", ovs_strerror(errno)); return; @@ -753,7 +768,14 @@ xsk_remove_xdp_program(uint32_t ifindex, enum afxdp_mode mode) return; } - bpf_set_link_xdp_fd(ifindex, -1, flags); +#ifdef HAVE_BPF_XDP_DETACH + if (bpf_xdp_detach(ifindex, flags, NULL) != 0) { +#else + if (bpf_set_link_xdp_fd(ifindex, -1, flags) != 0) { +#endif + VLOG_ERR("Failed to detach XDP program (%s) at ifindex %d", + ovs_strerror(errno), ifindex); + } } void diff --git a/rhel/openvswitch-fedora.spec.in b/rhel/openvswitch-fedora.spec.in index 8fc6e8ab233..eb5077a215f 100644 --- a/rhel/openvswitch-fedora.spec.in +++ b/rhel/openvswitch-fedora.spec.in @@ -75,7 +75,7 @@ BuildRequires: dpdk-devel >= 22.11 Provides: %{name}-dpdk = %{version}-%{release} %endif %if %{with afxdp} -BuildRequires: libbpf-devel numactl-devel +BuildRequires: libxdp-devel libbpf-devel numactl-devel %endif BuildRequires: unbound unbound-devel From b17cadff1d3d060eb8b19aac8787b894d2e1c89a Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 22 Dec 2022 01:06:21 +0100 Subject: [PATCH 100/833] netdev-afxdp: Hide too large memset from sparse. Sparse complains about 64M umem initialization. Hide it from the checker instead of disabling a warning globally. SPARSE_FLAGS are kept in the CI script even though they are empty at the moment. Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- .ci/linux-build.sh | 4 ---- lib/netdev-afxdp.c | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.ci/linux-build.sh b/.ci/linux-build.sh index a944cf14962..e6e4f6a60e6 100755 --- a/.ci/linux-build.sh +++ b/.ci/linux-build.sh @@ -222,10 +222,6 @@ elif [ "$M32" ]; then export CC="$CC -m32" else EXTRA_OPTS="$EXTRA_OPTS --enable-sparse" - if [ "$AFXDP" ]; then - # netdev-afxdp uses memset for 64M for umem initialization. 
- SPARSE_FLAGS="${SPARSE_FLAGS} -Wno-memcpy-max-count" - fi CFLAGS_FOR_OVS="${CFLAGS_FOR_OVS} ${SPARSE_FLAGS}" fi diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c index f8995da1fda..16f26bc3065 100644 --- a/lib/netdev-afxdp.c +++ b/lib/netdev-afxdp.c @@ -434,7 +434,11 @@ xsk_configure(int ifindex, int xdp_queue_id, enum afxdp_mode mode, /* Umem memory region. */ bufs = xmalloc_pagealign(NUM_FRAMES * FRAME_SIZE); +#ifndef __CHECKER__ + /* Sparse complains about a very large memset, but it is OK in this case. + * So, hiding it from the checker. */ memset(bufs, 0, NUM_FRAMES * FRAME_SIZE); +#endif /* Create AF_XDP socket. */ umem = xsk_configure_umem(bufs, NUM_FRAMES * FRAME_SIZE); From 649dbc19ffc0acd050ad729b9052aba8c7fce090 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 22 Dec 2022 01:06:22 +0100 Subject: [PATCH 101/833] github: Test AF_XDP build using libbpf instead of kernel sources. AF_XDP bits were removed from kernel's libbpf in 6.0. libbpf and libxdp are now the primary way to build AF_XDP applications. Most modern distributions are already packaging some version of libbpf, so it's better to test building with it instead of building an old unsupported kernel tree. Ubuntu started packaging libxdp only in 22.10, so not using it for now. Kernel build infrastructure in CI scripts is not needed anymore. Removed. 
Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- .ci/linux-build.sh | 77 ---------------------------- .github/workflows/build-and-test.yml | 10 ++-- 2 files changed, 3 insertions(+), 84 deletions(-) diff --git a/.ci/linux-build.sh b/.ci/linux-build.sh index e6e4f6a60e6..10021fddb25 100755 --- a/.ci/linux-build.sh +++ b/.ci/linux-build.sh @@ -7,79 +7,6 @@ CFLAGS_FOR_OVS="-g -O2" SPARSE_FLAGS="" EXTRA_OPTS="--enable-Werror" -function install_kernel() -{ - if [[ "$1" =~ ^5.* ]]; then - PREFIX="v5.x" - elif [[ "$1" =~ ^4.* ]]; then - PREFIX="v4.x" - elif [[ "$1" =~ ^3.* ]]; then - PREFIX="v3.x" - else - PREFIX="v2.6/longterm/v2.6.32" - fi - - base_url="https://cdn.kernel.org/pub/linux/kernel/${PREFIX}" - # Download page with list of all available kernel versions. - wget ${base_url}/ - # Uncompress in case server returned gzipped page. - (file index* | grep ASCII) || (mv index* index.new.gz && gunzip index*) - # Get version of the latest stable release. - hi_ver=$(echo ${1} | sed 's/\./\\\./') - lo_ver=$(cat ./index* | grep -P -o "${hi_ver}\.[0-9]+" | \ - sed 's/.*\..*\.\(.*\)/\1/' | sort -h | tail -1) - version="${1}.${lo_ver}" - - rm -rf index* linux-* - - url="${base_url}/linux-${version}.tar.xz" - # Download kernel sources. Try direct link on CDN failure. - wget ${url} || - (rm -f linux-${version}.tar.xz && wget ${url}) || - (rm -f linux-${version}.tar.xz && wget ${url/cdn/www}) - - tar xvf linux-${version}.tar.xz > /dev/null - pushd linux-${version} - make allmodconfig - - # Cannot use CONFIG_KCOV: -fsanitize-coverage=trace-pc is not supported by compiler - sed -i 's/CONFIG_KCOV=y/CONFIG_KCOV=n/' .config - - # stack validation depends on tools/objtool, but objtool does not compile on travis. - # It is giving following error. - # >>> GEN arch/x86/insn/inat-tables.c - # >>> Semantic error at 40: Unknown imm opnd: AL - # So for now disable stack-validation for the build. 
- - sed -i 's/CONFIG_STACK_VALIDATION=y/CONFIG_STACK_VALIDATION=n/' .config - make oldconfig - - # Older kernels do not include openvswitch - if [ -d "net/openvswitch" ]; then - make net/openvswitch/ - else - make net/bridge/ - fi - - if [ "$AFXDP" ]; then - sudo make headers_install INSTALL_HDR_PATH=/usr - pushd tools/lib/bpf/ - # Bulding with gcc because there are some issues in make files - # that breaks building libbpf with clang on Travis. - CC=gcc sudo make install - CC=gcc sudo make install_headers - sudo ldconfig - popd - # The Linux kernel defines __always_inline in stddef.h (283d7573), and - # sys/cdefs.h tries to re-define it. Older libc-dev package in xenial - # doesn't have a fix for this issue. Applying it manually. - sudo sed -i '/^# define __always_inline .*/i # undef __always_inline' \ - /usr/include/x86_64-linux-gnu/sys/cdefs.h || true - EXTRA_OPTS="${EXTRA_OPTS} --enable-afxdp" - fi - popd -} - function install_dpdk() { local DPDK_VER=$1 @@ -202,10 +129,6 @@ assert ovs.json.from_string('{\"a\": 42}') == {'a': 42}" exit 0 fi -if [ "$KERNEL" ]; then - install_kernel $KERNEL -fi - if [ "$DPDK" ] || [ "$DPDK_SHARED" ]; then if [ -z "$DPDK_VER" ]; then DPDK_VER="22.11.1" diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 1949d12001b..82675b9734d 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -8,14 +8,12 @@ jobs: dependencies: | automake libtool gcc bc libjemalloc2 libjemalloc-dev \ libssl-dev llvm-dev libelf-dev libnuma-dev libpcap-dev \ - ninja-build selinux-policy-dev - AFXDP: ${{ matrix.afxdp }} + ninja-build selinux-policy-dev libbpf-dev ASAN: ${{ matrix.asan }} UBSAN: ${{ matrix.ubsan }} CC: ${{ matrix.compiler }} DPDK: ${{ matrix.dpdk }} DPDK_SHARED: ${{ matrix.dpdk_shared }} - KERNEL: ${{ matrix.kernel }} LIBS: ${{ matrix.libs }} M32: ${{ matrix.m32 }} OPTS: ${{ matrix.opts }} @@ -65,11 +63,9 @@ jobs: libs: -ljemalloc - compiler: gcc - afxdp: afxdp - 
kernel: 5.3 + opts: --enable-afxdp - compiler: clang - afxdp: afxdp - kernel: 5.3 + opts: --enable-afxdp - compiler: gcc dpdk: dpdk From 771a55825f4a1d84c18439ae5a7485807169b0f9 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 22 Dec 2022 01:06:23 +0100 Subject: [PATCH 102/833] Documentation/afxdp: Use packaged libbpf/libxdp for the build. Necessary bits was removed from the kernel's libbpf in 6.0 release, so the instructions on how to build libbpf from kernel sources are now incorrect. Suggest to use libbpf and libxdp packaged by distributions instead. Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- Documentation/intro/install/afxdp.rst | 39 ++++++--------------------- 1 file changed, 8 insertions(+), 31 deletions(-) diff --git a/Documentation/intro/install/afxdp.rst b/Documentation/intro/install/afxdp.rst index bfef4986015..a4f0b87fe2c 100644 --- a/Documentation/intro/install/afxdp.rst +++ b/Documentation/intro/install/afxdp.rst @@ -88,7 +88,7 @@ Build requirements In addition to the requirements described in :doc:`general`, building Open vSwitch with AF_XDP will require the following: -- libbpf from kernel source tree (kernel 5.0.0 or later) +- ``libbpf`` and ``libxdp`` (if version of ``libbpf`` if higher than ``0.6``). - Linux kernel XDP support, with the following options (required) @@ -125,41 +125,18 @@ vSwitch with AF_XDP will require the following: Installing ---------- For OVS to use AF_XDP netdev, it has to be configured with LIBBPF support. -First, clone a recent version of Linux bpf-next tree:: - git clone git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git +First, install ``libbpf`` and ``libxdp``. For example, on Fedora these +libraries along with development headers can be obtained by installing +``libbpf-devel`` and ``libxdp-devel`` packages. For Ubuntu that will be +``libbpf-dev`` package with additional ``libxdp-dev`` on Ubuntu 22.10 +or later. 
-Second, go into the Linux source directory and build libbpf in the tools -directory:: - - cd bpf-next/ - cd tools/lib/bpf/ - make && make install - make install_headers - -.. note:: - Make sure xsk.h and bpf.h are installed in system's library path, - e.g. /usr/local/include/bpf/ or /usr/include/bpf/ - -Make sure the libbpf.so is installed correctly:: - - ldconfig - ldconfig -p | grep libbpf - -.. note:: - Check /etc/ld.so.conf if libbpf is installed but can not be found by - ldconfig. - -Third, ensure the standard OVS requirements are installed and +Next, ensure the standard OVS requirements are installed and bootstrap/configure the package:: ./boot.sh && ./configure --enable-afxdp -.. note:: - If you encounter "WARNING: bpf/libbpf.h: present but cannot be compiled", - check the Linux headers are in line with libbpf. For example, in Ubuntu, - check the installed linux-headers* and linux-libc-dev* dpkg. - Finally, build and install OVS:: make && make install @@ -182,7 +159,7 @@ If a test case fails, check the log at:: Setup AF_XDP netdev ------------------- -Before running OVS with AF_XDP, make sure the libbpf, libelf, and libnuma are +Before running OVS with AF_XDP, make sure the libbpf and libnuma are set-up right:: ldd vswitchd/ovs-vswitchd From e44e80343189fcb7ec10d776f1b62747d7095c18 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 22 Dec 2022 01:06:24 +0100 Subject: [PATCH 103/833] acinclude.m4: Build with AF_XDP support by default if possible. With this change we will try to detect all the netdev-afxdp dependencies and enable AF_XDP support by default if they are present at the build time. Configuration script behaves in a following way: - ./configure --enable-afxdp Will check for AF_XDP dependencies and fail if they are not available. - ./configure --disable-afxdp Disables checking for AF_XDP. Build will not support AF_XDP even if all dependencies are installed. 
- Just ./configure or ./configure --enable-afxdp=auto Will check for AF_XDP dependencies. Will print a warning if they are not available, but will continue without AF_XDP support. If dependencies are available in a system, this option is equal to --enable-afxdp. '--disable-afxdp' added to the debian and fedora package builds to keep predictable behavior. Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- Documentation/intro/install/afxdp.rst | 6 ++- NEWS | 3 ++ acinclude.m4 | 72 ++++++++++++++++++--------- debian/rules | 25 ++++++---- rhel/openvswitch-fedora.spec.in | 2 + 5 files changed, 72 insertions(+), 36 deletions(-) diff --git a/Documentation/intro/install/afxdp.rst b/Documentation/intro/install/afxdp.rst index a4f0b87fe2c..51c24bf5b1e 100644 --- a/Documentation/intro/install/afxdp.rst +++ b/Documentation/intro/install/afxdp.rst @@ -30,8 +30,7 @@ This document describes how to build and install Open vSwitch using AF_XDP netdev. .. warning:: - The AF_XDP support of Open vSwitch is considered 'experimental', - and it is not compiled in by default. + The AF_XDP support of Open vSwitch is considered 'experimental'. Introduction @@ -137,6 +136,9 @@ bootstrap/configure the package:: ./boot.sh && ./configure --enable-afxdp +``--enable-afxdp`` here is optional, but it will ensure that all dependencies +are available at the build time. + Finally, build and install OVS:: make && make install diff --git a/NEWS b/NEWS index ce5d11d73a9..2f6ededfe47 100644 --- a/NEWS +++ b/NEWS @@ -4,6 +4,9 @@ Post-v3.0.0 of handler and revalidator threads if necessary. - AF_XDP: * Added support for building with libxdp and libbpf >= 0.7. + * Support for AF_XDP is now enabled by default if all dependencies are + available at the build time. Use --disable-afxdp to disable. + Use --enable-afxdp to fail the build if dependencies are not present. - ovs-appctl: * "ovs-appctl ofproto/trace" command can now display port names with the "--names" option. 
diff --git a/acinclude.m4 b/acinclude.m4 index e47e925b376..8aecfb63d2a 100644 --- a/acinclude.m4 +++ b/acinclude.m4 @@ -253,46 +253,70 @@ dnl OVS_CHECK_LINUX_AF_XDP dnl dnl Check both Linux kernel AF_XDP and libbpf/libxdp support AC_DEFUN([OVS_CHECK_LINUX_AF_XDP], [ - AC_ARG_ENABLE([afxdp], - [AS_HELP_STRING([--enable-afxdp], [Enable AF-XDP support])], - [], [enable_afxdp=no]) + AC_ARG_ENABLE( + [afxdp], + [AS_HELP_STRING([--disable-afxdp], [Disable AF-XDP support])], + [case "${enableval}" in + (yes | no | auto) ;; + (*) AC_MSG_ERROR([bad value ${enableval} for --enable-afxdp]) ;; + esac], + [enable_afxdp=auto]) + AC_MSG_CHECKING([whether AF_XDP is enabled]) - if test "$enable_afxdp" != yes; then + if test "$enable_afxdp" == no; then AC_MSG_RESULT([no]) AF_XDP_ENABLE=false else - AC_MSG_RESULT([yes]) + AC_MSG_RESULT([$enable_afxdp]) AF_XDP_ENABLE=true + failed_dep=none + dnl Saving libs to restore in case we will end up not building with AF_XDP. + save_LIBS=$LIBS - AC_CHECK_HEADER([bpf/libbpf.h], [], - [AC_MSG_ERROR([unable to find bpf/libbpf.h for AF_XDP support])]) + AC_CHECK_HEADER([bpf/libbpf.h], [], [failed_dep="bpf/libbpf.h"]) - AC_CHECK_HEADER([linux/if_xdp.h], [], - [AC_MSG_ERROR([unable to find linux/if_xdp.h for AF_XDP support])]) + if test "$failed_dep" = none; then + AC_CHECK_HEADER([linux/if_xdp.h], [], [failed_dep="linux/if_xdp.h"]) + fi - OVS_FIND_DEPENDENCY([libbpf_strerror], [bpf], [libbpf]) - AC_CHECK_FUNCS([bpf_xdp_query_id bpf_xdp_detach]) + if test "$failed_dep" = none; then + AC_SEARCH_LIBS([libbpf_strerror], [bpf], [], [failed_dep="libbpf"]) + AC_CHECK_FUNCS([bpf_xdp_query_id bpf_xdp_detach]) + fi - if test "x$ac_cv_func_bpf_xdp_detach" = xyes; then + if test "$failed_dep" = none -a "x$ac_cv_func_bpf_xdp_detach" = xyes; then dnl We have libbpf >= 0.7. Look for libxdp as xsk functions dnl were moved into this library. 
- OVS_FIND_DEPENDENCY([libxdp_strerror], [xdp], [libxdp]) - AC_CHECK_HEADER([xdp/xsk.h], - AC_DEFINE([HAVE_LIBXDP], [1], [xsk.h is supplied with libxdp]), - AC_MSG_ERROR([unable to find xdp/xsk.h for AF_XDP support])) - else + AC_SEARCH_LIBS([libxdp_strerror], [xdp], + AC_CHECK_HEADER([xdp/xsk.h], + AC_DEFINE([HAVE_LIBXDP], [1], [xsk.h is supplied with libxdp]), + [failed_dep="xdp/xsk.h"]), + [failed_dep="libxdp"]) + elif test "$failed_dep" = none; then dnl libbpf < 0.7 contains all the necessary functionality. - AC_CHECK_HEADER([bpf/xsk.h], [], - [AC_MSG_ERROR([unable to find bpf/xsk.h for AF_XDP support])]) + AC_CHECK_HEADER([bpf/xsk.h], [], [failed_dep="bpf/xsk.h"]) fi - AC_CHECK_FUNCS([pthread_spin_lock], [], - [AC_MSG_ERROR([unable to find pthread_spin_lock for AF_XDP support])]) + if test "$failed_dep" = none; then + AC_CHECK_FUNCS([pthread_spin_lock], [], [failed_dep="pthread_spin_lock"]) + fi - OVS_FIND_DEPENDENCY([numa_alloc_onnode], [numa], [libnuma]) + if test "$failed_dep" = none; then + AC_SEARCH_LIBS([numa_alloc_onnode], [numa], [], [failed_dep="libnuma"]) + fi - AC_DEFINE([HAVE_AF_XDP], [1], - [Define to 1 if AF_XDP support is available and enabled.]) + if test "$failed_dep" = none; then + AC_DEFINE([HAVE_AF_XDP], [1], + [Define to 1 if AF_XDP support is available and enabled.]) + elif test "$enable_afxdp" = yes; then + AC_MSG_ERROR([Missing $failed_dep dependency for AF_XDP support]) + else + AC_MSG_WARN(m4_normalize( + [Cannot find $failed_dep, netdev-afxdp will not be supported + (use --disable-afxdp to suppress this warning).])) + AF_XDP_ENABLE=false + LIBS=$save_LIBS + fi fi AM_CONDITIONAL([HAVE_AF_XDP], test "$AF_XDP_ENABLE" = true) ]) diff --git a/debian/rules b/debian/rules index 971bc1775ee..ddbd4dc5c15 100755 --- a/debian/rules +++ b/debian/rules @@ -23,21 +23,26 @@ override_dh_auto_configure: test -d _debian || mkdir _debian cd _debian && ( \ test -e Makefile || \ - ../configure --prefix=/usr --localstatedir=/var --enable-ssl \ - 
--sysconfdir=/etc \ - $(DATAPATH_CONFIGURE_OPTS) \ - $(EXTRA_CONFIGURE_OPTS) \ - ) + ../configure --prefix=/usr --localstatedir=/var \ + --enable-ssl \ + --disable-afxdp \ + --sysconfdir=/etc \ + $(DATAPATH_CONFIGURE_OPTS) \ + $(EXTRA_CONFIGURE_OPTS) \ + ) ifneq (,$(filter i386 amd64 ppc64el arm64, $(DEB_HOST_ARCH))) ifeq (,$(filter nodpdk, $(DEB_BUILD_OPTIONS))) test -d _dpdk || mkdir _dpdk cd _dpdk && ( \ test -e Makefile || \ - ../configure --prefix=/usr --localstatedir=/var --enable-ssl \ - --with-dpdk=shared --sysconfdir=/etc \ - $(DATAPATH_CONFIGURE_OPTS) \ - $(EXTRA_CONFIGURE_OPTS) \ - ) + ../configure --prefix=/usr --localstatedir=/var \ + --enable-ssl \ + --disable-afxdp \ + --with-dpdk=shared \ + --sysconfdir=/etc \ + $(DATAPATH_CONFIGURE_OPTS) \ + $(EXTRA_CONFIGURE_OPTS) \ + ) endif endif diff --git a/rhel/openvswitch-fedora.spec.in b/rhel/openvswitch-fedora.spec.in index eb5077a215f..3091e204e15 100644 --- a/rhel/openvswitch-fedora.spec.in +++ b/rhel/openvswitch-fedora.spec.in @@ -171,6 +171,8 @@ This package provides IPsec tunneling support for OVS tunnels. %endif %if %{with afxdp} --enable-afxdp \ +%else + --disable-afxdp \ %endif --enable-ssl \ --disable-static \ From 9736b971b519b725507116578d780d822755b2a6 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 22 Dec 2022 01:06:25 +0100 Subject: [PATCH 104/833] rhel: Enable AF_XDP by default in Fedora builds. All supported versions of Fedora do package libxdp and libbpf, so it makes sense to enable AF_XDP support. Control files for debian packaging are much less flexible, so its hard to enable AF_XDP builds while not breaking builds for version of Ubuntu and Debian that do not package libbpf or libxdp. 
Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- rhel/openvswitch-fedora.spec.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rhel/openvswitch-fedora.spec.in b/rhel/openvswitch-fedora.spec.in index 3091e204e15..44899c1ca74 100644 --- a/rhel/openvswitch-fedora.spec.in +++ b/rhel/openvswitch-fedora.spec.in @@ -26,8 +26,8 @@ %bcond_without libcapng # To enable DPDK support, specify '--with dpdk' when building %bcond_with dpdk -# To enable AF_XDP support, specify '--with afxdp' when building -%bcond_with afxdp +# To disable AF_XDP support, specify '--without afxdp' when building +%bcond_without afxdp # If there is a need to automatically enable the package after installation, # specify the "--with autoenable" From 62e85106b4439faf28261ee0776d3d9f9736994e Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Thu, 22 Dec 2022 10:12:12 +0100 Subject: [PATCH 105/833] utilities: Add USDT script to monitor dpif netlink execute message queuing. This patch adds the dpif_nl_exec_monitor.py script that will use the existing dpif_netlink_operate__:op_flow_execute USDT probe to show all DPIF_OP_EXECUTE operations being queued for transmission over the netlink interface. Here is an example, truncated output: Display DPIF_OP_EXECUTE operations being queued for transmission... TIME CPU COMM PID NL_SIZE 3124.516679897 1 ovs-vswitchd 8219 180 nlmsghdr : len = 0, type = 36, flags = 1, seq = 0, pid = 0 genlmsghdr: cmd = 3, version = 1, reserver = 0 ovs_header: dp_ifindex = 21 > Decode OVS_PACKET_ATTR_* TLVs: nla_len 46, nla_type OVS_PACKET_ATTR_PACKET[1], data: 00 00 00... nla_len 20, nla_type OVS_PACKET_ATTR_KEY[2], data: 08 00 02 00... > Decode OVS_KEY_ATTR_* TLVs: nla_len 8, nla_type OVS_KEY_ATTR_PRIORITY[2], data: 00 00... nla_len 8, nla_type OVS_KEY_ATTR_SKB_MARK[15], data: 00 00... nla_len 88, nla_type OVS_PACKET_ATTR_ACTIONS[3], data: 4c 00 03... > Decode OVS_ACTION_ATTR_* TLVs: nla_len 76, nla_type OVS_ACTION_ATTR_SET[3], data: 48 00... 
> Decode OVS_TUNNEL_KEY_ATTR_* TLVs: nla_len 12, nla_type OVS_TUNNEL_KEY_ATTR_ID[0], data:... nla_len 20, nla_type OVS_TUNNEL_KEY_ATTR_IPV6_DST[13], ... nla_len 5, nla_type OVS_TUNNEL_KEY_ATTR_TTL[4], data: 40 nla_len 4, nla_type OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT[5]... nla_len 4, nla_type OVS_TUNNEL_KEY_ATTR_CSUM[6], data: nla_len 6, nla_type OVS_TUNNEL_KEY_ATTR_TP_DST[10],... nla_len 12, nla_type OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS[8],... nla_len 8, nla_type OVS_ACTION_ATTR_OUTPUT[1], data: 02 00 00 00 - Dumping OVS_PACKET_ATR_PACKET data: ###[ Ethernet ]### dst = 00:00:00:00:ec:01 src = 04:f4:bc:28:57:00 type = IPv4 ###[ IP ]### version = 4 ihl = 5 tos = 0x0 len = 50 id = 0 flags = frag = 0 ttl = 127 proto = icmp chksum = 0x2767 src = 10.0.0.1 dst = 10.0.0.100 \options \ ###[ ICMP ]### type = echo-request code = 0 chksum = 0xf7f3 id = 0x0 seq = 0xc Acked-by: Adrian Moreno Signed-off-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- Documentation/topics/usdt-probes.rst | 1 + utilities/automake.mk | 3 + .../usdt-scripts/dpif_nl_exec_monitor.py | 662 ++++++++++++++++++ 3 files changed, 666 insertions(+) create mode 100755 utilities/usdt-scripts/dpif_nl_exec_monitor.py diff --git a/Documentation/topics/usdt-probes.rst b/Documentation/topics/usdt-probes.rst index 7ce19aaedea..004817b1c54 100644 --- a/Documentation/topics/usdt-probes.rst +++ b/Documentation/topics/usdt-probes.rst @@ -254,6 +254,7 @@ DPIF_OP_FLOW_EXECUTE operation as part of the dpif ``operate()`` callback. 
**Script references**: +- ``utilities/usdt-scripts/dpif_nl_exec_monitor.py`` - ``utilities/usdt-scripts/upcall_cost.py`` diff --git a/utilities/automake.mk b/utilities/automake.mk index 132a16942e8..b020511c61c 100644 --- a/utilities/automake.mk +++ b/utilities/automake.mk @@ -22,6 +22,7 @@ scripts_SCRIPTS += \ scripts_DATA += utilities/ovs-lib usdt_SCRIPTS += \ utilities/usdt-scripts/bridge_loop.bt \ + utilities/usdt-scripts/dpif_nl_exec_monitor.py \ utilities/usdt-scripts/upcall_cost.py \ utilities/usdt-scripts/upcall_monitor.py @@ -67,6 +68,7 @@ EXTRA_DIST += \ utilities/docker/debian/Dockerfile \ utilities/docker/debian/build-kernel-modules.sh \ utilities/usdt-scripts/bridge_loop.bt \ + utilities/usdt-scripts/dpif_nl_exec_monitor.py \ utilities/usdt-scripts/upcall_cost.py \ utilities/usdt-scripts/upcall_monitor.py MAN_ROOTS += \ @@ -137,6 +139,7 @@ FLAKE8_PYFILES += utilities/ovs-pcap.in \ utilities/ovs-check-dead-ifs.in \ utilities/ovs-tcpdump.in \ utilities/ovs-pipegen.py \ + utilities/usdt-scripts/dpif_nl_exec_monitor.py \ utilities/usdt-scripts/upcall_monitor.py \ utilities/usdt-scripts/upcall_cost.py diff --git a/utilities/usdt-scripts/dpif_nl_exec_monitor.py b/utilities/usdt-scripts/dpif_nl_exec_monitor.py new file mode 100755 index 00000000000..0a9ff812334 --- /dev/null +++ b/utilities/usdt-scripts/dpif_nl_exec_monitor.py @@ -0,0 +1,662 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2022 Red Hat, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Script information: +# ------------------- +# dpif_nl_exec_monitor.py uses the dpif_netlink_operate__:op_flow_execute USDT +# probe to receive all DPIF_OP_EXECUTE operations that are queued for +# transmission over the netlink socket. It will do some basic decoding, and if +# requested a packet dump. +# +# Here is an example: +# +# # ./dpif_nl_exec_monitor.py --packet-decode decode +# Display DPIF_OP_EXECUTE operations being queued for transmission... +# TIME CPU COMM PID NL_SIZE +# 3124.516679897 1 ovs-vswitchd 8219 180 +# nlmsghdr : len = 0, type = 36, flags = 1, seq = 0, pid = 0 +# genlmsghdr: cmd = 3, version = 1, reserver = 0 +# ovs_header: dp_ifindex = 21 +# > Decode OVS_PACKET_ATTR_* TLVs: +# nla_len 46, nla_type OVS_PACKET_ATTR_PACKET[1], data: 00 00 00... +# nla_len 20, nla_type OVS_PACKET_ATTR_KEY[2], data: 08 00 02 00... +# > Decode OVS_KEY_ATTR_* TLVs: +# nla_len 8, nla_type OVS_KEY_ATTR_PRIORITY[2], data: 00 00... +# nla_len 8, nla_type OVS_KEY_ATTR_SKB_MARK[15], data: 00 00... +# nla_len 88, nla_type OVS_PACKET_ATTR_ACTIONS[3], data: 4c 00 03... +# > Decode OVS_ACTION_ATTR_* TLVs: +# nla_len 76, nla_type OVS_ACTION_ATTR_SET[3], data: 48 00... +# > Decode OVS_TUNNEL_KEY_ATTR_* TLVs: +# nla_len 12, nla_type OVS_TUNNEL_KEY_ATTR_ID[0], data:... +# nla_len 20, nla_type OVS_TUNNEL_KEY_ATTR_IPV6_DST[13], ... +# nla_len 5, nla_type OVS_TUNNEL_KEY_ATTR_TTL[4], data: 40 +# nla_len 4, nla_type OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT... +# nla_len 4, nla_type OVS_TUNNEL_KEY_ATTR_CSUM[6], data: +# nla_len 6, nla_type OVS_TUNNEL_KEY_ATTR_TP_DST[10],... +# nla_len 12, nla_type OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS... 
+# nla_len 8, nla_type OVS_ACTION_ATTR_OUTPUT[1], data: 02 00 00 00 +# - Dumping OVS_PACKET_ATR_PACKET data: +# ###[ Ethernet ]### +# dst = 00:00:00:00:ec:01 +# src = 04:f4:bc:28:57:00 +# type = IPv4 +# ###[ IP ]### +# version = 4 +# ihl = 5 +# tos = 0x0 +# len = 50 +# id = 0 +# flags = +# frag = 0 +# ttl = 127 +# proto = icmp +# chksum = 0x2767 +# src = 10.0.0.1 +# dst = 10.0.0.100 +# \options \ +# ###[ ICMP ]### +# type = echo-request +# code = 0 +# chksum = 0xf7f3 +# id = 0x0 +# seq = 0xc +# +# The example above dumps the full netlink and packet decode. However options +# exist to disable this. Here is the full list of supported options: +# +# usage: dpif_nl_exec_monitor.py [-h] [--buffer-page-count NUMBER] [-D [DEBUG]] +# [-d {none,hex,decode}] [-n {none,hex,nlraw}] +# [-p VSWITCHD_PID] [-s [64-2048]] +# [-w PCAP_FILE] +# +# optional arguments: +# -h, --help show this help message and exit +# --buffer-page-count NUMBER +# Number of BPF ring buffer pages, default 1024 +# -D [DEBUG], --debug [DEBUG] +# Enable eBPF debugging +# -d {none,hex,decode}, --packet-decode {none,hex,decode} +# Display packet content in selected mode, default none +# -n {none,hex,nlraw}, --nlmsg-decode {none,hex,nlraw} +# Display netlink message content in selected mode, +# default nlraw +# -p VSWITCHD_PID, --pid VSWITCHD_PID +# ovs-vswitch's PID +# -s [64-2048], --nlmsg-size [64-2048] +# Set maximum netlink message size to capture, default +# 512 +# -w PCAP_FILE, --pcap PCAP_FILE +# Write upcall packets to specified pcap file + +from bcc import BPF, USDT, USDTException +from os.path import exists +from scapy.all import hexdump, wrpcap +from scapy.layers.l2 import Ether + +import argparse +import psutil +import re +import struct +import sys +import time + +# +# Actual eBPF source code +# +ebpf_source = """ +#include + +#define MAX_NLMSG + +struct event_t { + u32 cpu; + u32 pid; + u64 ts; + u32 nl_size; + char comm[TASK_COMM_LEN]; + u8 nl_msg[MAX_NLMSG]; +}; + +struct ofpbuf { + void *base; 
+ void *data; + uint32_t size; + + /* The actual structure is longer, but we are only interested in the + * first couple of entries. */ +}; + +BPF_RINGBUF_OUTPUT(events, ); +BPF_TABLE("percpu_array", uint32_t, uint64_t, dropcnt, 1); + +int trace__op_flow_execute(struct pt_regs *ctx) { + struct ofpbuf nlbuf; + uint32_t size; + + bpf_usdt_readarg_p(5, ctx, &nlbuf, sizeof(nlbuf)); + + struct event_t *event = events.ringbuf_reserve(sizeof(struct event_t)); + if (!event) { + uint32_t type = 0; + uint64_t *value = dropcnt.lookup(&type); + if (value) + __sync_fetch_and_add(value, 1); + + return 1; + } + + event->ts = bpf_ktime_get_ns(); + event->cpu = bpf_get_smp_processor_id(); + event->pid = bpf_get_current_pid_tgid(); + bpf_get_current_comm(&event->comm, sizeof(event->comm)); + + event->nl_size = nlbuf.size; + if (event->nl_size > MAX_NLMSG) + size = MAX_NLMSG; + else + size = event->nl_size; + + bpf_probe_read(&event->nl_msg, size, nlbuf.data); + + events.ringbuf_submit(event, 0); + return 0; +}; +""" + + +# +# print_event() +# +def print_event(ctx, data, size): + event = b["events"].event(data) + print("{:<18.9f} {:<4} {:<16} {:<10} {:<10}". + format(event.ts / 1000000000, + event.cpu, + event.comm.decode("utf-8"), + event.pid, + event.nl_size)) + + # + # Dumping the netlink message data if requested. 
+ # + if event.nl_size < options.nlmsg_size: + nl_size = event.nl_size + else: + nl_size = options.nlmsg_size + + if options.nlmsg_decode == "hex": + # + # Abuse scapy's hex dump to dump flow key + # + print(re.sub("^", " " * 4, + hexdump(Ether(bytes(event.nl_msg)[:nl_size]), dump=True), + flags=re.MULTILINE)) + + if options.nlmsg_decode == "nlraw": + decode_result = decode_nlm(bytes(event.nl_msg)[:nl_size], dump=True) + else: + decode_result = decode_nlm(bytes(event.nl_msg)[:nl_size], dump=False) + + # + # Decode packet only if there is data + # + if "OVS_PACKET_ATTR_PACKET" not in decode_result: + return + + pkt_data = decode_result["OVS_PACKET_ATTR_PACKET"] + indent = 4 if options.nlmsg_decode != "nlraw" else 6 + + if options.packet_decode != "none": + print("{}- Dumping OVS_PACKET_ATR_PACKET data:".format(" " * indent)) + + if options.packet_decode == "hex": + print(re.sub("^", " " * indent, hexdump(pkt_data, dump=True), + flags=re.MULTILINE)) + + packet = Ether(pkt_data) + if options.packet_decode == "decode": + print(re.sub("^", " " * indent, packet.show(dump=True), + flags=re.MULTILINE)) + + if options.pcap is not None: + wrpcap(options.pcap, packet, append=True) + + +# +# decode_nlm_tlvs() +# +def decode_nlm_tlvs(tlvs, header=None, indent=4, dump=True, + attr_to_str_func=None, decode_tree=None): + bytes_left = len(tlvs) + result = {} + + if dump: + print("{}{}".format(" " * indent, header)) + + while bytes_left: + if bytes_left < 4: + if dump: + print("{}WARN: decode truncated; can't read header".format( + " " * indent)) + break + + nla_len, nla_type = struct.unpack("=HH", tlvs[:4]) + + if nla_len < 4: + if dump: + print("{}WARN: decode truncated; nla_len < 4".format( + " " * indent)) + break + + nla_data = tlvs[4:nla_len] + trunc = "" + + if attr_to_str_func is None: + nla_type_name = "type_{}".format(nla_type) + else: + nla_type_name = attr_to_str_func(nla_type) + + if nla_len > bytes_left: + trunc = "..." 
+ nla_data = nla_data[:(bytes_left - 4)] + else: + result[nla_type_name] = nla_data + + if dump: + print("{}nla_len {}, nla_type {}[{}], data: {}{}".format( + " " * indent, nla_len, nla_type_name, nla_type, + "".join("{:02x} ".format(b) for b in nla_data), trunc)) + + # + # If we have the full data, try to decode further + # + if trunc == "" and decode_tree is not None \ + and nla_type_name in decode_tree: + node = decode_tree[nla_type_name] + decode_nlm_tlvs(nla_data, + header=node["header"], + indent=indent + node["indent"], dump=True, + attr_to_str_func=node["attr_str_func"], + decode_tree=node["decode_tree"]) + + if trunc != "": + if dump: + print("{}WARN: decode truncated; nla_len > msg_len[{}] ". + format(" " * indent, bytes_left)) + break + + # update next offset, but make sure it's aligned correctly + next_offset = (nla_len + 3) & ~(3) + tlvs = tlvs[next_offset:] + bytes_left -= next_offset + + return result + + +# +# decode_nlm() +# +def decode_nlm(msg, indent=4, dump=True): + result = {} + + # + # Decode 'struct nlmsghdr' + # + if dump: + print("{}nlmsghdr : len = {}, type = {}, flags = {}, seq = {}, " + "pid = {}".format(" " * indent, + *struct.unpack("=IHHII", msg[:16]))) + + msg = msg[16:] + + # + # Decode 'struct genlmsghdr' + # + if dump: + print("{}genlmsghdr: cmd = {}, version = {}, reserver = {}".format( + " " * indent, *struct.unpack("=BBH", msg[:4]))) + + msg = msg[4:] + + # + # Decode 'struct ovs_header' + # + if dump: + print("{}ovs_header: dp_ifindex = {}".format( + " " * indent, *struct.unpack("=I", msg[:4]))) + + msg = msg[4:] + + # + # Decode TLVs + # + nl_attr_tree = { + "OVS_PACKET_ATTR_KEY": { + "header": "> Decode OVS_KEY_ATTR_* TLVs:", + "indent": 4, + "attr_str_func": get_ovs_key_attr_str, + "decode_tree": None, + }, + "OVS_PACKET_ATTR_ACTIONS": { + "header": "> Decode OVS_ACTION_ATTR_* TLVs:", + "indent": 4, + "attr_str_func": get_ovs_action_attr_str, + "decode_tree": { + "OVS_ACTION_ATTR_SET": { + "header": "> Decode OVS_KEY_ATTR_* 
TLVs:", + "indent": 4, + "attr_str_func": get_ovs_key_attr_str, + "decode_tree": { + "OVS_KEY_ATTR_TUNNEL": { + "header": "> Decode OVS_TUNNEL_KEY_ATTR_* TLVs:", + "indent": 4, + "attr_str_func": get_ovs_tunnel_key_attr_str, + "decode_tree": None, + }, + }, + }, + }, + }, + } + + result = decode_nlm_tlvs(msg, indent=indent + 2, dump=dump, + header="> Decode OVS_PACKET_ATTR_* TLVs:", + attr_to_str_func=get_ovs_pkt_attr_str, + decode_tree=nl_attr_tree) + return result + + +# +# get_ovs_pkt_attr_str() +# +def get_ovs_pkt_attr_str(attr): + ovs_pkt_attr = ["OVS_PACKET_ATTR_UNSPEC", + "OVS_PACKET_ATTR_PACKET", + "OVS_PACKET_ATTR_KEY", + "OVS_PACKET_ATTR_ACTIONS", + "OVS_PACKET_ATTR_USERDATA", + "OVS_PACKET_ATTR_EGRESS_TUN_KEY", + "OVS_PACKET_ATTR_UNUSED1", + "OVS_PACKET_ATTR_UNUSED2", + "OVS_PACKET_ATTR_PROBE", + "OVS_PACKET_ATTR_MRU", + "OVS_PACKET_ATTR_LEN", + "OVS_PACKET_ATTR_HASH"] + if attr < 0 or attr >= len(ovs_pkt_attr): + return "".format(attr) + + return ovs_pkt_attr[attr] + + +# +# get_ovs_key_attr_str() +# +def get_ovs_key_attr_str(attr): + ovs_key_attr = ["OVS_KEY_ATTR_UNSPEC", + "OVS_KEY_ATTR_ENCAP", + "OVS_KEY_ATTR_PRIORITY", + "OVS_KEY_ATTR_IN_PORT", + "OVS_KEY_ATTR_ETHERNET", + "OVS_KEY_ATTR_VLAN", + "OVS_KEY_ATTR_ETHERTYPE", + "OVS_KEY_ATTR_IPV4", + "OVS_KEY_ATTR_IPV6", + "OVS_KEY_ATTR_TCP", + "OVS_KEY_ATTR_UDP", + "OVS_KEY_ATTR_ICMP", + "OVS_KEY_ATTR_ICMPV6", + "OVS_KEY_ATTR_ARP", + "OVS_KEY_ATTR_ND", + "OVS_KEY_ATTR_SKB_MARK", + "OVS_KEY_ATTR_TUNNEL", + "OVS_KEY_ATTR_SCTP", + "OVS_KEY_ATTR_TCP_FLAGS", + "OVS_KEY_ATTR_DP_HASH", + "OVS_KEY_ATTR_RECIRC_ID", + "OVS_KEY_ATTR_MPLS", + "OVS_KEY_ATTR_CT_STATE", + "OVS_KEY_ATTR_CT_ZONE", + "OVS_KEY_ATTR_CT_MARK", + "OVS_KEY_ATTR_CT_LABELS", + "OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4", + "OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6", + "OVS_KEY_ATTR_NSH"] + + if attr < 0 or attr >= len(ovs_key_attr): + return "".format(attr) + + return ovs_key_attr[attr] + + +# +# get_ovs_action_attr_str() +# +def get_ovs_action_attr_str(attr): + 
ovs_action_attr = ["OVS_ACTION_ATTR_UNSPEC", + "OVS_ACTION_ATTR_OUTPUT", + "OVS_ACTION_ATTR_USERSPACE", + "OVS_ACTION_ATTR_SET", + "OVS_ACTION_ATTR_PUSH_VLAN", + "OVS_ACTION_ATTR_POP_VLAN", + "OVS_ACTION_ATTR_SAMPLE", + "OVS_ACTION_ATTR_RECIRC", + "OVS_ACTION_ATTR_HASH", + "OVS_ACTION_ATTR_PUSH_MPLS", + "OVS_ACTION_ATTR_POP_MPLS", + "OVS_ACTION_ATTR_SET_MASKED", + "OVS_ACTION_ATTR_CT", + "OVS_ACTION_ATTR_TRUNC", + "OVS_ACTION_ATTR_PUSH_ETH", + "OVS_ACTION_ATTR_POP_ETH", + "OVS_ACTION_ATTR_CT_CLEAR", + "OVS_ACTION_ATTR_PUSH_NSH", + "OVS_ACTION_ATTR_POP_NSH", + "OVS_ACTION_ATTR_METER", + "OVS_ACTION_ATTR_CLONE", + "OVS_ACTION_ATTR_CHECK_PKT_LEN", + "OVS_ACTION_ATTR_ADD_MPLS", + "OVS_ACTION_ATTR_TUNNEL_PUSH", + "OVS_ACTION_ATTR_TUNNEL_POP", + "OVS_ACTION_ATTR_DROP", + "OVS_ACTION_ATTR_LB_OUTPUT"] + if attr < 0 or attr >= len(ovs_action_attr): + return "".format(attr) + + return ovs_action_attr[attr] + + +# +# get_ovs_tunnel_key_attr_str() +# +def get_ovs_tunnel_key_attr_str(attr): + ovs_tunnel_key_attr = ["OVS_TUNNEL_KEY_ATTR_ID", + "OVS_TUNNEL_KEY_ATTR_IPV4_SRC", + "OVS_TUNNEL_KEY_ATTR_IPV4_DST", + "OVS_TUNNEL_KEY_ATTR_TOS", + "OVS_TUNNEL_KEY_ATTR_TTL", + "OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT", + "OVS_TUNNEL_KEY_ATTR_CSUM", + "OVS_TUNNEL_KEY_ATTR_OAM", + "OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS", + "OVS_TUNNEL_KEY_ATTR_TP_SRC", + "OVS_TUNNEL_KEY_ATTR_TP_DST", + "OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS", + "OVS_TUNNEL_KEY_ATTR_IPV6_SRC", + "OVS_TUNNEL_KEY_ATTR_IPV6_DST", + "OVS_TUNNEL_KEY_ATTR_PAD", + "OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS", + "OVS_TUNNEL_KEY_ATTR_GTPU_OPTS"] + if attr < 0 or attr >= len(ovs_tunnel_key_attr): + return "".format(attr) + + return ovs_tunnel_key_attr[attr] + + +# +# buffer_size_type() +# +def buffer_size_type(astr, min=64, max=2048): + value = int(astr) + if min <= value <= max: + return value + else: + raise argparse.ArgumentTypeError( + "value not in range {}-{}".format(min, max)) + + +# +# next_power_of_two() +# +def next_power_of_two(val): + np = 1 + while np 
< val: + np *= 2 + return np + + +# +# main() +# +def main(): + # + # Don't like these globals, but ctx passing does not seem to work with the + # existing open_ring_buffer() API :( + # + global b + global options + + # + # Argument parsing + # + parser = argparse.ArgumentParser() + + parser.add_argument("--buffer-page-count", + help="Number of BPF ring buffer pages, default 1024", + type=int, default=1024, metavar="NUMBER") + parser.add_argument("-D", "--debug", + help="Enable eBPF debugging", + type=int, const=0x3f, default=0, nargs="?") + parser.add_argument("-d", "--packet-decode", + help="Display packet content in selected mode, " + "default none", + choices=["none", "hex", "decode"], default="none") + parser.add_argument("-n", "--nlmsg-decode", + help="Display netlink message content in selected mode" + ", default nlraw", + choices=["none", "hex", "nlraw"], default="nlraw") + parser.add_argument("-p", "--pid", metavar="VSWITCHD_PID", + help="ovs-vswitch's PID", + type=int, default=None) + parser.add_argument("-s", "--nlmsg-size", + help="Set maximum netlink message size to capture, " + "default 512", type=buffer_size_type, default=512, + metavar="[64-2048]") + parser.add_argument("-w", "--pcap", metavar="PCAP_FILE", + help="Write upcall packets to specified pcap file", + type=str, default=None) + + options = parser.parse_args() + + # + # Find the PID of the ovs-vswitchd daemon if not specified. + # + if options.pid is None: + for proc in psutil.process_iter(): + if "ovs-vswitchd" in proc.name(): + if options.pid is not None: + print("ERROR: Multiple ovs-vswitchd daemons running, " + "use the -p option!") + sys.exit(-1) + + options.pid = proc.pid + + # + # Error checking on input parameters + # + if options.pid is None: + print("ERROR: Failed to find ovs-vswitchd's PID!") + sys.exit(-1) + + if options.pcap is not None: + if exists(options.pcap): + print("ERROR: Destination capture file \"{}\" already exists!". 
+ format(options.pcap)) + sys.exit(-1) + + options.buffer_page_count = next_power_of_two(options.buffer_page_count) + + # + # Attach the usdt probe + # + u = USDT(pid=int(options.pid)) + try: + u.enable_probe(probe="dpif_netlink_operate__:op_flow_execute", + fn_name="trace__op_flow_execute") + except USDTException as e: + print("ERROR: {}" + "ovs-vswitchd!".format( + (re.sub("^", " " * 7, str(e), flags=re.MULTILINE)).strip(). + replace("--with-dtrace or --enable-dtrace", + "--enable-usdt-probes"))) + sys.exit(-1) + + # + # Uncomment to see how arguments are decoded. + # print(u.get_text()) + # + + # + # Attach probe to running process + # + source = ebpf_source.replace("", str(options.nlmsg_size)) + source = source.replace("", + str(options.buffer_page_count)) + + b = BPF(text=source, usdt_contexts=[u], debug=options.debug) + + # + # Print header + # + print("Display DPIF_OP_EXECUTE operations being queued for transmission " + "onto the netlink socket.") + print("{:<18} {:<4} {:<16} {:<10} {:<10}".format( + "TIME", "CPU", "COMM", "PID", "NL_SIZE")) + + # + # Dump out all events + # + b["events"].open_ring_buffer(print_event) + while 1: + try: + b.ring_buffer_poll() + time.sleep(0.5) + except KeyboardInterrupt: + break + + dropcnt = b.get_table("dropcnt") + for k in dropcnt.keys(): + count = dropcnt.sum(k).value + if k.value == 0 and count > 0: + print("\nWARNING: Not all upcalls were captured, {} were dropped!" + "\n Increase the BPF ring buffer size with the " + "--buffer-page-count option.".format(count)) + + +# +# Start main() as the default entry point... +# +if __name__ == "__main__": + main() From 182b9cb3524c5e7be20be7ebf061c6730a7f7e26 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Thu, 5 Jan 2023 13:07:59 +0100 Subject: [PATCH 106/833] dpif: Fix tunnel key set for IPv6 tunnels with SLOW_ACTION. 
The dpif_execute_helper_cb() function is supposed to add the OVS_ACTION_ATTR_SET(OVS_KEY_ATTR_TUNNEL()) action to the list of actions when passing it down to the kernel. This function was only checking if the IPv4 destination address was set, not both. This patch fixes this, including a datapath testcase. Fixes: 076caa2fb077 ("ofproto: Meter translation.") Signed-off-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- lib/dpif.c | 2 +- tests/system-traffic.at | 48 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/lib/dpif.c b/lib/dpif.c index 40f5fe44606..fe4db83fbfe 100644 --- a/lib/dpif.c +++ b/lib/dpif.c @@ -1213,7 +1213,7 @@ dpif_execute_helper_cb(void *aux_, struct dp_packet_batch *packets_, /* The Linux kernel datapath throws away the tunnel information * that we supply as metadata. We have to use a "set" action to * supply it. */ - if (md->tunnel.ip_dst) { + if (flow_tnl_dst_is_set(&md->tunnel)) { odp_put_tunnel_action(&md->tunnel, &execute_actions, NULL); } ofpbuf_put(&execute_actions, action, NLA_ALIGN(action->nla_len)); diff --git a/tests/system-traffic.at b/tests/system-traffic.at index e5403519f2a..08c78ff57e1 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -855,6 +855,54 @@ NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PI OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([datapath - slow_action on geneve6 tunnel]) +AT_SKIP_IF([test $HAVE_TCPDUMP = no]) +OVS_CHECK_TUNNEL_TSO() +OVS_CHECK_GENEVE_UDP6ZEROCSUM() + +OVS_TRAFFIC_VSWITCHD_START() +ADD_BR([br-underlay]) + +AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"]) +AT_CHECK([ovs-ofctl add-flow br-underlay "actions=normal"]) + +ADD_NAMESPACES(at_ns0) + +dnl Set up underlay link from host into the namespace using veth pair. 
+ADD_VETH(p0, at_ns0, br-underlay, "fc00::1/64", [], [], "nodad") +AT_CHECK([ip addr add dev br-underlay "fc00::100/64" nodad]) +AT_CHECK([ip link set dev br-underlay up]) + +dnl Set up tunnel endpoints on OVS outside the namespace and with a native +dnl linux device inside the namespace. +ADD_OVS_TUNNEL6([geneve], [br0], [at_gnv0], [fc00::1], [10.1.1.100/24]) +ADD_NATIVE_TUNNEL6([geneve], [ns_gnv0], [at_ns0], [fc00::100], [10.1.1.1/24], + [vni 0 udp6zerocsumtx udp6zerocsumrx]) +AT_CHECK([ovs-ofctl add-flow br0 "table=37,actions=at_gnv0"]) + +OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::100]) + +dnl First, check the underlay. +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00::100 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Start tcpdump to capture the encapsulated packets. +NETNS_DAEMONIZE([at_ns0], [tcpdump -U -i p0 -w p0.pcap], [tcpdump.pid]) +sleep 1 + +dnl Generate a single packet trough the controler that needs an ARP modification +AT_CHECK([ovs-ofctl -O OpenFlow15 packet-out br0 "in_port=controller packet=fffffffffffffa163e949d8008060001080006040001fa163e949d80c0a820300000000000000a0000fe actions=set_field:0xa0000f4->reg1,move:NXM_NX_XXREG0[[64..95]]->NXM_OF_ARP_SPA[[]],resubmit(,37)"]) +sleep 1 + +dnl Stop OVS and tcpdump and verify the results. 
+OVS_TRAFFIC_VSWITCHD_STOP + +ovs-pcap p0.pcap + +AT_CHECK([ovs-pcap p0.pcap | grep -Eq "^[[[:xdigit:]]]{24}86dd60000000003a1140fc000000000000000000000000000100fc000000000000000000000000000001[[[:xdigit:]]]{4}17c1003a[[[:xdigit:]]]{4}0000655800000000fffffffffffffa163e949d8008060001080006040001[[[:xdigit:]]]{12}0a0000f40000000000000a0000fe$"]) +AT_CLEANUP + AT_SETUP([datapath - ping over gre tunnel by simulated packets]) OVS_CHECK_TUNNEL_TSO() OVS_CHECK_MIN_KERNEL(3, 10) From 461ab419ead100ef38a3f596151e826ca3dd131d Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 5 Jan 2023 15:12:29 +0100 Subject: [PATCH 107/833] treewide: Don't use non-portable '==' with test command. '==' is not defined by POSIX and not supported by some shells. This is causing test failures and potential other issues: ./tests/testsuite: 54: test: X2: unexpected operator ./tests/testsuite: 54: test: X157: unexpected operator ./tests/testsuite: 54: test: X116: unexpected operator Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2022-December/052157.html Reviewed-by: David Marchand Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- acinclude.m4 | 2 +- tests/ovsdb-cluster.at | 14 +++++++------- tests/ovsdb-server.at | 2 +- utilities/ovs-sim.in | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/acinclude.m4 b/acinclude.m4 index 8aecfb63d2a..ac1eab79004 100644 --- a/acinclude.m4 +++ b/acinclude.m4 @@ -263,7 +263,7 @@ AC_DEFUN([OVS_CHECK_LINUX_AF_XDP], [ [enable_afxdp=auto]) AC_MSG_CHECKING([whether AF_XDP is enabled]) - if test "$enable_afxdp" == no; then + if test "$enable_afxdp" = no; then AC_MSG_RESULT([no]) AF_XDP_ENABLE=false else diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at index 920b833b721..9fbf5dc897f 100644 --- a/tests/ovsdb-cluster.at +++ b/tests/ovsdb-cluster.at @@ -8,7 +8,7 @@ ovsdb_check_cluster () { $schema_func > schema schema=`ovsdb-tool schema-name schema` AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' 
create-cluster s1.db schema unix:s1.raft], [0], [], [stderr]) - if test X$local_config == X"yes"; then + if test X$local_config = X"yes"; then for i in `seq $n`; do AT_CHECK([ovsdb-tool create c$i.db $top_srcdir/ovsdb/local-config.ovsschema], [0], [], [stderr]) local ctxn="[[\"Local_Config\", @@ -30,7 +30,7 @@ ovsdb_check_cluster () { for i in `seq $n`; do local remote=punix:s$i.ovsdb local config_db= - if test X$local_config == X"yes"; then + if test X$local_config = X"yes"; then remote=db:Local_Config,Config,connections config_db=c$i.db fi @@ -129,7 +129,7 @@ ovsdb_test_cluster_disconnect () { # When a node is disconnected from the cluster, the IDL should disconnect # and retry even if it uses a single remote, because the remote IP can be # a VIP on a load-balance. So we use single remote to test here. - if test $leader_or_follower == "leader"; then + if test $leader_or_follower = "leader"; then target=1 shutdown=`seq $(($n/2 + 1)) $n` cleanup=`seq $(($n/2))` @@ -188,13 +188,13 @@ ovsdb_test_cluster_disconnect () { count_old=`grep "raft_is_connected: true" raft_is_connected.log | wc -l` echo count_old $count_old - if test X$check_flapping == X"yes"; then + if test X$check_flapping = X"yes"; then sleep 10 fi # Make sure raft_is_connected didn't flap from false to true. count_new=`grep "raft_is_connected: true" raft_is_connected.log | wc -l` echo count_new $count_new - AT_CHECK([test $count_new == $count_old]) + AT_CHECK([test $count_new = $count_old]) for i in $cleanup; do OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid]) @@ -493,7 +493,7 @@ ovsdb_cluster_failure_test () { remote_2=$2 crash_node=$3 crash_command=$4 - if test "$crash_node" == "1"; then + if test "$crash_node" = "1"; then new_leader=$5 fi log_grep=$6 @@ -536,7 +536,7 @@ ovsdb_cluster_failure_test () { # To ensure $new_leader node the new leader, we delay election timer for # the other follower. 
if test -n "$new_leader"; then - if test "$new_leader" == "2"; then + if test "$new_leader" = "2"; then delay_election_node=3 else delay_election_node=2 diff --git a/tests/ovsdb-server.at b/tests/ovsdb-server.at index c7b2fe3ae6e..0828e6d04c1 100644 --- a/tests/ovsdb-server.at +++ b/tests/ovsdb-server.at @@ -1270,7 +1270,7 @@ dnl a case where there is only one transaction in a history. get_memory_value () { n=$(ovs-appctl -t ovsdb-server memory/show dnl | tr ' ' '\n' | grep "^$1:" | cut -d ':' -f 2) - if test X"$n" == "X"; then + if test X"$n" = "X"; then n=0 fi echo $n diff --git a/utilities/ovs-sim.in b/utilities/ovs-sim.in index 08957bdf46f..779ea60aee1 100755 --- a/utilities/ovs-sim.in +++ b/utilities/ovs-sim.in @@ -131,7 +131,7 @@ EOF export -f as sim_add() { - if test "$1" == --help; then + if test "$1" = --help; then cat < Date: Thu, 5 Jan 2023 20:33:45 +0100 Subject: [PATCH 108/833] Documentation: Fix links in the DPDK guide on physical ports. The text enclosed in '<...>' supposed to be an actual link and not the name of the link. This generates incorrect links that lead nowhere. Also, a single underscore supposed to be used for external links. Reviewed-by: David Marchand Signed-off-by: Ilya Maximets --- Documentation/topics/dpdk/phy.rst | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Documentation/topics/dpdk/phy.rst b/Documentation/topics/dpdk/phy.rst index cb2d5bcb7b3..4b0fe8dded3 100644 --- a/Documentation/topics/dpdk/phy.rst +++ b/Documentation/topics/dpdk/phy.rst @@ -76,8 +76,8 @@ persist across reboots. In addition, there are two options available for this kernel space driver - VFIO (Virtual Function I/O) and UIO (Userspace I/O) - along with a number of drivers for each option. We will demonstrate examples of both tools and will use the ``vfio-pci`` driver, which is the more secure, -robust driver of those available. More information can be found in the `DPDK -documentation `__. +robust driver of those available. 
More information can be found in the +`DPDK drivers documentation`_. To list devices using :command:`driverctl`, run:: @@ -115,9 +115,9 @@ tool:: Open vSwitch 2.6.0 added support for DPDK 16.07, which in turn renamed the former ``dpdk_nic_bind`` tool to ``dpdk-devbind``. -For more information, refer to the `DPDK documentation `__. +For more information, refer to the `DPDK drivers documentation`_. -.. _dpdk-drivers: https://doc.dpdk.org/guides-22.11/linux_gsg/linux_drivers.html +.. _DPDK drivers documentation: https://doc.dpdk.org/guides-22.11/linux_gsg/linux_drivers.html .. _dpdk-phy-multiqueue: @@ -394,14 +394,14 @@ in the ``options`` column of the ``Interface`` table. .. important:: - Some DPDK port use `bifurcated drivers `__, - which means that a kernel netdevice remains when Open vSwitch is stopped. + Some DPDK port use `bifurcated drivers`_, which means that a kernel + netdevice remains when Open vSwitch is stopped. In such case, any configuration applied to a VF would remain set on the kernel netdevice, and be inherited from it when Open vSwitch is restarted, even if the options described in this section are unset from Open vSwitch. -.. _bifurcated-drivers: https://doc.dpdk.org/guides-22.11/linux_gsg/linux_drivers.html#bifurcated-driver +.. _bifurcated drivers: https://doc.dpdk.org/guides-22.11/linux_gsg/linux_drivers.html#bifurcated-driver - Configure the VF MAC address:: From a7826d05b8ce8af2fc4042261edd2b0a196d7582 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 5 Jan 2023 20:42:43 +0100 Subject: [PATCH 109/833] Documentation: Fix links in maintainers.rst. GitHub and Sphinx are parsing links differently. Sphinx knows about the overall documentation structure and all the sections defined in other docs, while GitHub is using direct rst 2 html conversion and doesn't know any of that. Sphinx wants links to sections in other docs to be defined with a :doc: field, but GitHub can't parse that and requires having a direct link to the other rST document. 
The problem is that we have a top level MAINTAINERS.rst, that should be parseable by GitHub, included in the maintainers.rst in the main documentation section that is used by Sphinx to generate html, pdf and other docs. So, it's hard to make links work in both. Working around that limitation by using rST substitutions for the links. Cutting off the substitutions for actual links and adding :doc: links instead during the file inclusion for Sphinx. Reported-by: Igor Zhukov Acked-by: Han Zhou Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- Documentation/internals/maintainers.rst | 5 +++++ MAINTAINERS.rst | 21 +++++++++++++++------ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/Documentation/internals/maintainers.rst b/Documentation/internals/maintainers.rst index 172d684df97..0203bbe9554 100644 --- a/Documentation/internals/maintainers.rst +++ b/Documentation/internals/maintainers.rst @@ -22,3 +22,8 @@ Avoid deeper levels because they do not render well. .. include:: ../../MAINTAINERS.rst + :end-before: Cut here for the Documentation/internals/maintainers.rst + +.. |responsibilities| replace:: :doc:`committer-responsibilities` +.. |grant-revocation| replace:: :doc:`committer-grant-revocation` +.. |emeritus-status| replace:: :doc:`committer-emeritus-status` diff --git a/MAINTAINERS.rst b/MAINTAINERS.rst index 27be4aa4129..1dc406170f2 100644 --- a/MAINTAINERS.rst +++ b/MAINTAINERS.rst @@ -28,11 +28,11 @@ Committers Open vSwitch committers are the people who have been granted access to push changes to the Open vSwitch git repository. -The responsibilities of an Open vSwitch committer are documented -`here `__. +The responsibilities of an Open vSwitch committer are documented here: +|responsibilities|. -The process for adding or removing committers is documented -`here `__. +The process for adding or removing committers is documented here: +|grant-revocation|. 
This is the current list of active Open vSwitch committers: @@ -77,8 +77,8 @@ This is the current list of active Open vSwitch committers: - yamamoto@midokura.com The project also maintains a list of Emeritus Committers (or Maintainers). -More information about Emeritus Committers can be found -`here `__. +More information about Emeritus Committers can be found here: +|emeritus-status|. .. list-table:: OVS Emeritus Maintainers :header-rows: 1 @@ -91,3 +91,12 @@ More information about Emeritus Committers can be found - ejj@eecs.berkeley.edu * - Joe Stringer - joe@ovn.org + +.. Cut here for the Documentation/internals/maintainers.rst + +.. |responsibilities| replace:: `Expectations for Developers with Open vSwitch + Repo Access `__ +.. |grant-revocation| replace:: `OVS Committer Grant/Revocation Policy + `__ +.. |emeritus-status| replace:: `Emeritus Status for OVS Committers + `__ From 006e1c6dbfbadf474c17c8fa1ea358918d371588 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Thu, 5 Jan 2023 15:24:53 -0500 Subject: [PATCH 110/833] tc: Add support for TCA_STATS_PKT64. Currently tc offload flow packet counters will roll over every ~4 billion packets. This is because the packet counter in struct tc_stats provided by TCA_STATS_BASIC is a 32bit integer. Now we check for the optional TCA_STATS_PKT64 attribute which provides the full 64bit packet counter if the 32bit one has rolled over. Because the TCA_STATS_PKT64 attribute may appear multiple times in a netlink message, the method of parsing attributes was changed. 
Fixes: f98e418fbdb6 ("tc: Add tc flower functions") Reported-at: https://bugzilla.redhat.com/1776816 Acked-by: Eelco Chaudron Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/tc.c | 110 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 69 insertions(+), 41 deletions(-) diff --git a/lib/tc.c b/lib/tc.c index a66dc432f98..447ab376ee0 100644 --- a/lib/tc.c +++ b/lib/tc.c @@ -84,6 +84,11 @@ struct flower_key_to_pedit { int boundary_shift; }; +struct tc_flow_stats { + uint64_t n_packets; + uint64_t n_bytes; +}; + static struct flower_key_to_pedit flower_pedit_map[] = { { TCA_PEDIT_KEY_EX_HDR_TYPE_IP4, @@ -1852,66 +1857,89 @@ static const struct nl_policy act_policy[] = { [TCA_ACT_STATS] = { .type = NL_A_NESTED, .optional = false, }, }; -static const struct nl_policy stats_policy[] = { - [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, - .min_len = sizeof(struct gnet_stats_basic), - .optional = false, }, - [TCA_STATS_BASIC_HW] = { .type = NL_A_UNSPEC, - .min_len = sizeof(struct gnet_stats_basic), - .optional = true, }, - [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, - .min_len = sizeof(struct gnet_stats_queue), - .optional = true, }, -}; - static int nl_parse_action_stats(struct nlattr *act_stats, struct ovs_flow_stats *stats_sw, struct ovs_flow_stats *stats_hw, struct ovs_flow_stats *stats_dropped) { - struct nlattr *stats_attrs[ARRAY_SIZE(stats_policy)]; - struct gnet_stats_basic bs_all, bs_sw, bs_hw; - const struct gnet_stats_queue *qs; + struct tc_flow_stats s_sw = {0}, s_hw = {0}; + const struct gnet_stats_queue *qs = NULL; + uint16_t prev_type = __TCA_STATS_MAX; + const struct nlattr *nla; + unsigned int seen = 0; + size_t left; - if (!nl_parse_nested(act_stats, stats_policy, stats_attrs, - ARRAY_SIZE(stats_policy))) { - VLOG_ERR_RL(&error_rl, "Failed to parse action stats policy"); - return EPROTO; - } + /* Cannot use nl_parse_nested due to duplicate attributes. 
*/ + NL_NESTED_FOR_EACH (nla, left, act_stats) { + struct gnet_stats_basic stats_basic; + uint16_t type = nl_attr_type(nla); - memcpy(&bs_all, - nl_attr_get_unspec(stats_attrs[TCA_STATS_BASIC], sizeof bs_all), - sizeof bs_all); - if (stats_attrs[TCA_STATS_BASIC_HW]) { - memcpy(&bs_hw, nl_attr_get_unspec(stats_attrs[TCA_STATS_BASIC_HW], - sizeof bs_hw), - sizeof bs_hw); + seen |= 1 << type; - bs_sw.packets = bs_all.packets - bs_hw.packets; - bs_sw.bytes = bs_all.bytes - bs_hw.bytes; - } else { - bs_sw.packets = bs_all.packets; - bs_sw.bytes = bs_all.bytes; + switch (type) { + case TCA_STATS_BASIC: + memcpy(&stats_basic, nl_attr_get_unspec(nla, sizeof stats_basic), + sizeof stats_basic); + s_sw.n_packets = stats_basic.packets; + s_sw.n_bytes = stats_basic.bytes; + break; + + case TCA_STATS_BASIC_HW: + memcpy(&stats_basic, nl_attr_get_unspec(nla, sizeof stats_basic), + sizeof stats_basic); + s_hw.n_packets = stats_basic.packets; + s_hw.n_bytes = stats_basic.bytes; + break; + + case TCA_STATS_QUEUE: + qs = nl_attr_get_unspec(nla, sizeof *qs); + break; + + case TCA_STATS_PKT64: + if (prev_type == TCA_STATS_BASIC) { + s_sw.n_packets = nl_attr_get_u64(nla); + } else if (prev_type == TCA_STATS_BASIC_HW) { + s_hw.n_packets = nl_attr_get_u64(nla); + } else { + goto err; + } + break; + + default: + break; + } + prev_type = type; } - if (bs_sw.packets > get_32aligned_u64(&stats_sw->n_packets)) { - put_32aligned_u64(&stats_sw->n_packets, bs_sw.packets); - put_32aligned_u64(&stats_sw->n_bytes, bs_sw.bytes); + if (!(seen & (1 << TCA_STATS_BASIC))) { + goto err; } - if (stats_attrs[TCA_STATS_BASIC_HW] - && bs_hw.packets > get_32aligned_u64(&stats_hw->n_packets)) { - put_32aligned_u64(&stats_hw->n_packets, bs_hw.packets); - put_32aligned_u64(&stats_hw->n_bytes, bs_hw.bytes); + if (seen & (1 << TCA_STATS_BASIC_HW)) { + s_sw.n_packets = s_sw.n_packets - s_hw.n_packets; + s_sw.n_bytes = s_sw.n_bytes - s_hw.n_bytes; + + if (s_hw.n_packets > get_32aligned_u64(&stats_hw->n_packets)) { + 
put_32aligned_u64(&stats_hw->n_packets, s_hw.n_packets); + put_32aligned_u64(&stats_hw->n_bytes, s_hw.n_bytes); + } } - if (stats_dropped && stats_attrs[TCA_STATS_QUEUE]) { - qs = nl_attr_get_unspec(stats_attrs[TCA_STATS_QUEUE], sizeof *qs); + if (s_sw.n_packets > get_32aligned_u64(&stats_sw->n_packets)) { + put_32aligned_u64(&stats_sw->n_packets, s_sw.n_packets); + put_32aligned_u64(&stats_sw->n_bytes, s_sw.n_bytes); + } + + if (stats_dropped && qs) { put_32aligned_u64(&stats_dropped->n_packets, qs->drops); } return 0; + +err: + VLOG_ERR_RL(&error_rl, "Failed to parse action stats policy"); + return EPROTO; } static int From 3b29286db1c5908aab25f613e5fab0a4e731e5a9 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Fri, 6 Jan 2023 16:58:55 +0100 Subject: [PATCH 111/833] netdev-dpdk: Add per virtqueue statistics. The DPDK vhost-user library maintains more granular per queue stats which can replace what OVS was providing for vhost-user ports. The benefits for OVS: - OVS can skip parsing packet sizes on the rx side, - dev->stats_lock won't be taken in rx/tx code unless some packet is dropped, - vhost-user is aware of which packets are transmitted to the guest, so per *transmitted* packet size stats can be reported, - more internal stats from vhost-user may be exposed, without OVS needing to understand them, Note: the vhost-user library does not provide global stats for a port. The proposed implementation is to have the global stats (exposed via netdev_get_stats()) computed by querying and aggregating all per queue stats. Since per queue stats are exposed via another netdev ops (netdev_get_custom_stats()), this may lead to some race and small discrepancies. This issue might already affect other netdev classes. 
Example: $ ovs-vsctl get interface vhost4 statistics | sed -e 's#[{}]##g' -e 's#, #\n#g' | grep -v =0$ rx_1_to_64_packets=12 rx_256_to_511_packets=15 rx_65_to_127_packets=21 rx_broadcast_packets=15 rx_bytes=7497 rx_multicast_packets=33 rx_packets=48 rx_q0_good_bytes=242 rx_q0_good_packets=3 rx_q0_guest_notifications=3 rx_q0_multicast_packets=3 rx_q0_size_65_127_packets=2 rx_q0_undersize_packets=1 rx_q1_broadcast_packets=15 rx_q1_good_bytes=7255 rx_q1_good_packets=45 rx_q1_guest_notifications=45 rx_q1_multicast_packets=30 rx_q1_size_256_511_packets=15 rx_q1_size_65_127_packets=19 rx_q1_undersize_packets=11 tx_1_to_64_packets=36 tx_256_to_511_packets=45 tx_65_to_127_packets=63 tx_broadcast_packets=45 tx_bytes=22491 tx_multicast_packets=99 tx_packets=144 tx_q0_broadcast_packets=30 tx_q0_good_bytes=14994 tx_q0_good_packets=96 tx_q0_guest_notifications=96 tx_q0_multicast_packets=66 tx_q0_size_256_511_packets=30 tx_q0_size_65_127_packets=42 tx_q0_undersize_packets=24 tx_q1_broadcast_packets=15 tx_q1_good_bytes=7497 tx_q1_good_packets=48 tx_q1_guest_notifications=48 tx_q1_multicast_packets=33 tx_q1_size_256_511_packets=15 tx_q1_size_65_127_packets=21 tx_q1_undersize_packets=12 Reviewed-by: Maxime Coquelin Signed-off-by: David Marchand Signed-off-by: Ilya Maximets --- lib/netdev-dpdk.c | 447 ++++++++++++++++++++++++++++++------------- tests/system-dpdk.at | 33 +++- 2 files changed, 348 insertions(+), 132 deletions(-) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index fff57f78279..61a35985ece 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -2363,72 +2363,6 @@ is_vhost_running(struct netdev_dpdk *dev) return (netdev_dpdk_get_vid(dev) >= 0 && dev->vhost_reconfigured); } -static inline void -netdev_dpdk_vhost_update_rx_size_counters(struct netdev_stats *stats, - unsigned int packet_size) -{ - /* Hard-coded search for the size bucket. 
*/ - if (packet_size < 256) { - if (packet_size >= 128) { - stats->rx_128_to_255_packets++; - } else if (packet_size <= 64) { - stats->rx_1_to_64_packets++; - } else { - stats->rx_65_to_127_packets++; - } - } else { - if (packet_size >= 1523) { - stats->rx_1523_to_max_packets++; - } else if (packet_size >= 1024) { - stats->rx_1024_to_1522_packets++; - } else if (packet_size < 512) { - stats->rx_256_to_511_packets++; - } else { - stats->rx_512_to_1023_packets++; - } - } -} - -static inline void -netdev_dpdk_vhost_update_rx_counters(struct netdev_dpdk *dev, - struct dp_packet **packets, int count, - int qos_drops) -{ - struct netdev_stats *stats = &dev->stats; - struct dp_packet *packet; - unsigned int packet_size; - int i; - - stats->rx_packets += count; - stats->rx_dropped += qos_drops; - for (i = 0; i < count; i++) { - packet = packets[i]; - packet_size = dp_packet_size(packet); - - if (OVS_UNLIKELY(packet_size < ETH_HEADER_LEN)) { - /* This only protects the following multicast counting from - * too short packets, but it does not stop the packet from - * further processing. */ - stats->rx_errors++; - stats->rx_length_errors++; - continue; - } - - netdev_dpdk_vhost_update_rx_size_counters(stats, packet_size); - - struct eth_header *eh = (struct eth_header *) dp_packet_data(packet); - if (OVS_UNLIKELY(eth_addr_is_multicast(eh->eth_dst))) { - stats->multicast++; - } - - stats->rx_bytes += packet_size; - } - - if (OVS_UNLIKELY(qos_drops)) { - dev->sw_stats->rx_qos_drops += qos_drops; - } -} - /* * The receive path for the vhost port is the TX path out from guest. 
*/ @@ -2473,10 +2407,12 @@ netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq, qos_drops -= nb_rx; } - rte_spinlock_lock(&dev->stats_lock); - netdev_dpdk_vhost_update_rx_counters(dev, batch->packets, - nb_rx, qos_drops); - rte_spinlock_unlock(&dev->stats_lock); + if (OVS_UNLIKELY(qos_drops)) { + rte_spinlock_lock(&dev->stats_lock); + dev->stats.rx_dropped += qos_drops; + dev->sw_stats->rx_qos_drops += qos_drops; + rte_spinlock_unlock(&dev->stats_lock); + } batch->count = nb_rx; dp_packet_batch_init_packet_fields(batch); @@ -2587,38 +2523,6 @@ netdev_dpdk_filter_packet_len(struct netdev_dpdk *dev, struct rte_mbuf **pkts, return cnt; } -static inline void -netdev_dpdk_vhost_update_tx_counters(struct netdev_dpdk *dev, - struct dp_packet **packets, - int attempted, - struct netdev_dpdk_sw_stats *sw_stats_add) -{ - int dropped = sw_stats_add->tx_mtu_exceeded_drops + - sw_stats_add->tx_qos_drops + - sw_stats_add->tx_failure_drops + - sw_stats_add->tx_invalid_hwol_drops; - struct netdev_stats *stats = &dev->stats; - int sent = attempted - dropped; - int i; - - stats->tx_packets += sent; - stats->tx_dropped += dropped; - - for (i = 0; i < sent; i++) { - stats->tx_bytes += dp_packet_size(packets[i]); - } - - if (OVS_UNLIKELY(dropped || sw_stats_add->tx_retries)) { - struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats; - - sw_stats->tx_retries += sw_stats_add->tx_retries; - sw_stats->tx_failure_drops += sw_stats_add->tx_failure_drops; - sw_stats->tx_mtu_exceeded_drops += sw_stats_add->tx_mtu_exceeded_drops; - sw_stats->tx_qos_drops += sw_stats_add->tx_qos_drops; - sw_stats->tx_invalid_hwol_drops += sw_stats_add->tx_invalid_hwol_drops; - } -} - static void netdev_dpdk_extbuf_free(void *addr OVS_UNUSED, void *opaque) { @@ -2799,6 +2703,7 @@ netdev_dpdk_vhost_send(struct netdev *netdev, int qid, int vid = netdev_dpdk_get_vid(dev); struct netdev_dpdk_sw_stats stats; struct rte_mbuf **pkts; + int dropped; int retries; batch_cnt = cnt = dp_packet_batch_size(batch); @@ -2818,6 
+2723,7 @@ netdev_dpdk_vhost_send(struct netdev *netdev, int qid, } cnt = netdev_dpdk_common_send(netdev, batch, &stats); + dropped = batch_cnt - cnt; pkts = (struct rte_mbuf **) batch->packets; vhost_batch_cnt = cnt; @@ -2848,12 +2754,21 @@ netdev_dpdk_vhost_send(struct netdev *netdev, int qid, rte_spinlock_unlock(&dev->tx_q[qid].tx_lock); stats.tx_failure_drops += cnt; + dropped += cnt; stats.tx_retries = MIN(retries, max_retries); - rte_spinlock_lock(&dev->stats_lock); - netdev_dpdk_vhost_update_tx_counters(dev, batch->packets, batch_cnt, - &stats); - rte_spinlock_unlock(&dev->stats_lock); + if (OVS_UNLIKELY(dropped || stats.tx_retries)) { + struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats; + + rte_spinlock_lock(&dev->stats_lock); + dev->stats.tx_dropped += dropped; + sw_stats->tx_retries += stats.tx_retries; + sw_stats->tx_failure_drops += stats.tx_failure_drops; + sw_stats->tx_mtu_exceeded_drops += stats.tx_mtu_exceeded_drops; + sw_stats->tx_qos_drops += stats.tx_qos_drops; + sw_stats->tx_invalid_hwol_drops += stats.tx_invalid_hwol_drops; + rte_spinlock_unlock(&dev->stats_lock); + } pkts = (struct rte_mbuf **) batch->packets; for (int i = 0; i < vhost_batch_cnt; i++) { @@ -3007,41 +2922,305 @@ netdev_dpdk_set_mtu(struct netdev *netdev, int mtu) return 0; } -static int -netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier); - static int netdev_dpdk_vhost_get_stats(const struct netdev *netdev, struct netdev_stats *stats) { + struct rte_vhost_stat_name *vhost_stats_names = NULL; struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); + struct rte_vhost_stat *vhost_stats = NULL; + int vhost_stats_count; + int err; + int qid; + int vid; ovs_mutex_lock(&dev->mutex); + if (!is_vhost_running(dev)) { + err = EPROTO; + goto out; + } + + vid = netdev_dpdk_get_vid(dev); + + /* We expect all rxqs have the same number of stats, only query rxq0. 
*/ + qid = 0 * VIRTIO_QNUM + VIRTIO_TXQ; + err = rte_vhost_vring_stats_get_names(vid, qid, NULL, 0); + if (err < 0) { + err = EPROTO; + goto out; + } + + vhost_stats_count = err; + vhost_stats_names = xcalloc(vhost_stats_count, sizeof *vhost_stats_names); + vhost_stats = xcalloc(vhost_stats_count, sizeof *vhost_stats); + + err = rte_vhost_vring_stats_get_names(vid, qid, vhost_stats_names, + vhost_stats_count); + if (err != vhost_stats_count) { + err = EPROTO; + goto out; + } + +#define VHOST_RXQ_STATS \ + VHOST_RXQ_STAT(rx_packets, "good_packets") \ + VHOST_RXQ_STAT(rx_bytes, "good_bytes") \ + VHOST_RXQ_STAT(rx_broadcast_packets, "broadcast_packets") \ + VHOST_RXQ_STAT(multicast, "multicast_packets") \ + VHOST_RXQ_STAT(rx_undersized_errors, "undersize_packets") \ + VHOST_RXQ_STAT(rx_1_to_64_packets, "size_64_packets") \ + VHOST_RXQ_STAT(rx_65_to_127_packets, "size_65_127_packets") \ + VHOST_RXQ_STAT(rx_128_to_255_packets, "size_128_255_packets") \ + VHOST_RXQ_STAT(rx_256_to_511_packets, "size_256_511_packets") \ + VHOST_RXQ_STAT(rx_512_to_1023_packets, "size_512_1023_packets") \ + VHOST_RXQ_STAT(rx_1024_to_1522_packets, "size_1024_1518_packets") \ + VHOST_RXQ_STAT(rx_1523_to_max_packets, "size_1519_max_packets") + +#define VHOST_RXQ_STAT(MEMBER, NAME) dev->stats.MEMBER = 0; + VHOST_RXQ_STATS; +#undef VHOST_RXQ_STAT + + for (int q = 0; q < dev->up.n_rxq; q++) { + qid = q * VIRTIO_QNUM + VIRTIO_TXQ; + + err = rte_vhost_vring_stats_get(vid, qid, vhost_stats, + vhost_stats_count); + if (err != vhost_stats_count) { + err = EPROTO; + goto out; + } + + for (int i = 0; i < vhost_stats_count; i++) { +#define VHOST_RXQ_STAT(MEMBER, NAME) \ + if (string_ends_with(vhost_stats_names[i].name, NAME)) { \ + dev->stats.MEMBER += vhost_stats[i].value; \ + continue; \ + } + VHOST_RXQ_STATS; +#undef VHOST_RXQ_STAT + } + } + + /* OVS reports 64 bytes and smaller packets into "rx_1_to_64_packets". 
+ * Since vhost only reports good packets and has no error counter, + * rx_undersized_errors is highjacked (see above) to retrieve + * "undersize_packets". */ + dev->stats.rx_1_to_64_packets += dev->stats.rx_undersized_errors; + memset(&dev->stats.rx_undersized_errors, 0xff, + sizeof dev->stats.rx_undersized_errors); + +#define VHOST_RXQ_STAT(MEMBER, NAME) stats->MEMBER = dev->stats.MEMBER; + VHOST_RXQ_STATS; +#undef VHOST_RXQ_STAT + + free(vhost_stats_names); + vhost_stats_names = NULL; + free(vhost_stats); + vhost_stats = NULL; + + /* We expect all txqs have the same number of stats, only query txq0. */ + qid = 0 * VIRTIO_QNUM; + err = rte_vhost_vring_stats_get_names(vid, qid, NULL, 0); + if (err < 0) { + err = EPROTO; + goto out; + } + + vhost_stats_count = err; + vhost_stats_names = xcalloc(vhost_stats_count, sizeof *vhost_stats_names); + vhost_stats = xcalloc(vhost_stats_count, sizeof *vhost_stats); + + err = rte_vhost_vring_stats_get_names(vid, qid, vhost_stats_names, + vhost_stats_count); + if (err != vhost_stats_count) { + err = EPROTO; + goto out; + } + +#define VHOST_TXQ_STATS \ + VHOST_TXQ_STAT(tx_packets, "good_packets") \ + VHOST_TXQ_STAT(tx_bytes, "good_bytes") \ + VHOST_TXQ_STAT(tx_broadcast_packets, "broadcast_packets") \ + VHOST_TXQ_STAT(tx_multicast_packets, "multicast_packets") \ + VHOST_TXQ_STAT(rx_undersized_errors, "undersize_packets") \ + VHOST_TXQ_STAT(tx_1_to_64_packets, "size_64_packets") \ + VHOST_TXQ_STAT(tx_65_to_127_packets, "size_65_127_packets") \ + VHOST_TXQ_STAT(tx_128_to_255_packets, "size_128_255_packets") \ + VHOST_TXQ_STAT(tx_256_to_511_packets, "size_256_511_packets") \ + VHOST_TXQ_STAT(tx_512_to_1023_packets, "size_512_1023_packets") \ + VHOST_TXQ_STAT(tx_1024_to_1522_packets, "size_1024_1518_packets") \ + VHOST_TXQ_STAT(tx_1523_to_max_packets, "size_1519_max_packets") + +#define VHOST_TXQ_STAT(MEMBER, NAME) dev->stats.MEMBER = 0; + VHOST_TXQ_STATS; +#undef VHOST_TXQ_STAT + + for (int q = 0; q < dev->up.n_txq; q++) { + qid = 
q * VIRTIO_QNUM; + + err = rte_vhost_vring_stats_get(vid, qid, vhost_stats, + vhost_stats_count); + if (err != vhost_stats_count) { + err = EPROTO; + goto out; + } + + for (int i = 0; i < vhost_stats_count; i++) { +#define VHOST_TXQ_STAT(MEMBER, NAME) \ + if (string_ends_with(vhost_stats_names[i].name, NAME)) { \ + dev->stats.MEMBER += vhost_stats[i].value; \ + continue; \ + } + VHOST_TXQ_STATS; +#undef VHOST_TXQ_STAT + } + } + + /* OVS reports 64 bytes and smaller packets into "tx_1_to_64_packets". + * Same as for rx, rx_undersized_errors is highjacked. */ + dev->stats.tx_1_to_64_packets += dev->stats.rx_undersized_errors; + memset(&dev->stats.rx_undersized_errors, 0xff, + sizeof dev->stats.rx_undersized_errors); + +#define VHOST_TXQ_STAT(MEMBER, NAME) stats->MEMBER = dev->stats.MEMBER; + VHOST_TXQ_STATS; +#undef VHOST_TXQ_STAT + rte_spinlock_lock(&dev->stats_lock); - /* Supported Stats */ - stats->rx_packets = dev->stats.rx_packets; - stats->tx_packets = dev->stats.tx_packets; stats->rx_dropped = dev->stats.rx_dropped; stats->tx_dropped = dev->stats.tx_dropped; - stats->multicast = dev->stats.multicast; - stats->rx_bytes = dev->stats.rx_bytes; - stats->tx_bytes = dev->stats.tx_bytes; - stats->rx_errors = dev->stats.rx_errors; - stats->rx_length_errors = dev->stats.rx_length_errors; - - stats->rx_1_to_64_packets = dev->stats.rx_1_to_64_packets; - stats->rx_65_to_127_packets = dev->stats.rx_65_to_127_packets; - stats->rx_128_to_255_packets = dev->stats.rx_128_to_255_packets; - stats->rx_256_to_511_packets = dev->stats.rx_256_to_511_packets; - stats->rx_512_to_1023_packets = dev->stats.rx_512_to_1023_packets; - stats->rx_1024_to_1522_packets = dev->stats.rx_1024_to_1522_packets; - stats->rx_1523_to_max_packets = dev->stats.rx_1523_to_max_packets; - rte_spinlock_unlock(&dev->stats_lock); + err = 0; +out: + ovs_mutex_unlock(&dev->mutex); + free(vhost_stats); + free(vhost_stats_names); + + return err; +} + +static int +netdev_dpdk_vhost_get_custom_stats(const struct 
netdev *netdev, + struct netdev_custom_stats *custom_stats) +{ + struct rte_vhost_stat_name *vhost_stats_names = NULL; + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); + struct rte_vhost_stat *vhost_stats = NULL; + int vhost_rxq_stats_count; + int vhost_txq_stats_count; + int stat_offset; + int err; + int qid; + int vid; + + netdev_dpdk_get_sw_custom_stats(netdev, custom_stats); + stat_offset = custom_stats->size; + + ovs_mutex_lock(&dev->mutex); + + if (!is_vhost_running(dev)) { + goto out; + } + + vid = netdev_dpdk_get_vid(dev); + + qid = 0 * VIRTIO_QNUM + VIRTIO_TXQ; + err = rte_vhost_vring_stats_get_names(vid, qid, NULL, 0); + if (err < 0) { + goto out; + } + vhost_rxq_stats_count = err; + + qid = 0 * VIRTIO_QNUM; + err = rte_vhost_vring_stats_get_names(vid, qid, NULL, 0); + if (err < 0) { + goto out; + } + vhost_txq_stats_count = err; + + stat_offset += dev->up.n_rxq * vhost_rxq_stats_count; + stat_offset += dev->up.n_txq * vhost_txq_stats_count; + custom_stats->counters = xrealloc(custom_stats->counters, + stat_offset * + sizeof *custom_stats->counters); + stat_offset = custom_stats->size; + + vhost_stats_names = xcalloc(vhost_rxq_stats_count, + sizeof *vhost_stats_names); + vhost_stats = xcalloc(vhost_rxq_stats_count, sizeof *vhost_stats); + + for (int q = 0; q < dev->up.n_rxq; q++) { + qid = q * VIRTIO_QNUM + VIRTIO_TXQ; + + err = rte_vhost_vring_stats_get_names(vid, qid, vhost_stats_names, + vhost_rxq_stats_count); + if (err != vhost_rxq_stats_count) { + goto out; + } + + err = rte_vhost_vring_stats_get(vid, qid, vhost_stats, + vhost_rxq_stats_count); + if (err != vhost_rxq_stats_count) { + goto out; + } + + for (int i = 0; i < vhost_rxq_stats_count; i++) { + ovs_strlcpy(custom_stats->counters[stat_offset + i].name, + vhost_stats_names[i].name, + NETDEV_CUSTOM_STATS_NAME_SIZE); + custom_stats->counters[stat_offset + i].value = + vhost_stats[i].value; + } + stat_offset += vhost_rxq_stats_count; + } + + free(vhost_stats_names); + vhost_stats_names = 
NULL; + free(vhost_stats); + vhost_stats = NULL; + + vhost_stats_names = xcalloc(vhost_txq_stats_count, + sizeof *vhost_stats_names); + vhost_stats = xcalloc(vhost_txq_stats_count, sizeof *vhost_stats); + + for (int q = 0; q < dev->up.n_txq; q++) { + qid = q * VIRTIO_QNUM; + + err = rte_vhost_vring_stats_get_names(vid, qid, vhost_stats_names, + vhost_txq_stats_count); + if (err != vhost_txq_stats_count) { + goto out; + } + + err = rte_vhost_vring_stats_get(vid, qid, vhost_stats, + vhost_txq_stats_count); + if (err != vhost_txq_stats_count) { + goto out; + } + + for (int i = 0; i < vhost_txq_stats_count; i++) { + ovs_strlcpy(custom_stats->counters[stat_offset + i].name, + vhost_stats_names[i].name, + NETDEV_CUSTOM_STATS_NAME_SIZE); + custom_stats->counters[stat_offset + i].value = + vhost_stats[i].value; + } + stat_offset += vhost_txq_stats_count; + } + + free(vhost_stats_names); + vhost_stats_names = NULL; + free(vhost_stats); + vhost_stats = NULL; + +out: + ovs_mutex_unlock(&dev->mutex); + + custom_stats->size = stat_offset; + return 0; } @@ -3088,6 +3267,9 @@ netdev_dpdk_convert_xstats(struct netdev_stats *stats, #undef DPDK_XSTATS } +static int +netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier); + static int netdev_dpdk_get_stats(const struct netdev *netdev, struct netdev_stats *stats) { @@ -3536,6 +3718,7 @@ netdev_dpdk_update_flags__(struct netdev_dpdk *dev, if (NETDEV_UP & on) { rte_spinlock_lock(&dev->stats_lock); memset(&dev->stats, 0, sizeof dev->stats); + memset(dev->sw_stats, 0, sizeof *dev->sw_stats); rte_spinlock_unlock(&dev->stats_lock); } } @@ -5036,6 +5219,11 @@ dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev) dev->tx_q[0].map = 0; } + rte_spinlock_lock(&dev->stats_lock); + memset(&dev->stats, 0, sizeof dev->stats); + memset(dev->sw_stats, 0, sizeof *dev->sw_stats); + rte_spinlock_unlock(&dev->stats_lock); + if (userspace_tso_enabled()) { dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD; VLOG_DBG("%s: TSO enabled on vhost port", 
netdev_get_name(&dev->up)); @@ -5096,6 +5284,9 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev) /* Register client-mode device. */ vhost_flags |= RTE_VHOST_USER_CLIENT; + /* Extended per vq statistics. */ + vhost_flags |= RTE_VHOST_USER_NET_STATS_ENABLE; + /* There is no support for multi-segments buffers. */ vhost_flags |= RTE_VHOST_USER_LINEARBUF_SUPPORT; @@ -5574,7 +5765,7 @@ static const struct netdev_class dpdk_vhost_class = { .send = netdev_dpdk_vhost_send, .get_carrier = netdev_dpdk_vhost_get_carrier, .get_stats = netdev_dpdk_vhost_get_stats, - .get_custom_stats = netdev_dpdk_get_sw_custom_stats, + .get_custom_stats = netdev_dpdk_vhost_get_custom_stats, .get_status = netdev_dpdk_vhost_user_get_status, .reconfigure = netdev_dpdk_vhost_reconfigure, .rxq_recv = netdev_dpdk_vhost_rxq_recv, @@ -5590,7 +5781,7 @@ static const struct netdev_class dpdk_vhost_client_class = { .send = netdev_dpdk_vhost_send, .get_carrier = netdev_dpdk_vhost_get_carrier, .get_stats = netdev_dpdk_vhost_get_stats, - .get_custom_stats = netdev_dpdk_get_sw_custom_stats, + .get_custom_stats = netdev_dpdk_vhost_get_custom_stats, .get_status = netdev_dpdk_vhost_user_get_status, .reconfigure = netdev_dpdk_vhost_client_reconfigure, .rxq_recv = netdev_dpdk_vhost_rxq_recv, diff --git a/tests/system-dpdk.at b/tests/system-dpdk.at index 8dc187a61d4..5ef7f8ccdc0 100644 --- a/tests/system-dpdk.at +++ b/tests/system-dpdk.at @@ -200,9 +200,10 @@ ADD_VETH(tap1, ns2, br10, "172.31.110.12/24") dnl Execute testpmd in background on_exit "pkill -f -x -9 'tail -f /dev/null'" tail -f /dev/null | dpdk-testpmd --socket-mem="$(cat NUMA_NODE)" --no-pci\ - --vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1" \ - --vdev="net_tap0,iface=tap0" --file-prefix page0 \ - --single-file-segments -- -a >$OVS_RUNDIR/testpmd-dpdkvhostuserclient0.log 2>&1 & + --vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,queues=2,server=1" \ + --vdev="net_tap0,iface=tap0" --file-prefix page0 \ + 
--single-file-segments -- -a --nb-cores 2 --rxq 2 --txq 2 \ + >$OVS_RUNDIR/testpmd-dpdkvhostuserclient0.log 2>&1 & OVS_WAIT_UNTIL([grep "virtio is now ready for processing" ovs-vswitchd.log]) OVS_WAIT_UNTIL([ip link show dev tap0 | grep -qw LOWER_UP]) @@ -220,9 +221,33 @@ AT_CHECK([ip netns exec ns1 ip addr add 172.31.110.11/24 dev tap0], [], AT_CHECK([ip netns exec ns1 ip link show], [], [stdout], [stderr]) AT_CHECK([ip netns exec ns2 ip link show], [], [stdout], [stderr]) -AT_CHECK([ip netns exec ns1 ping -c 4 -I tap0 172.31.110.12], [], [stdout], +AT_CHECK([ip netns exec ns1 ping -i 0.1 -c 10 -I tap0 172.31.110.12], [], [stdout], [stderr]) +AT_CHECK([ip netns exec ns1 ip link set tap0 down], [], [stdout], [stderr]) + +# Wait for stats to be queried ("stats-update-interval") +sleep 5 +AT_CHECK([ovs-vsctl get interface dpdkvhostuserclient0 statistics], [], [stdout], [stderr]) + +AT_CHECK([test `ovs-vsctl get interface dpdkvhostuserclient0 statistics:rx_packets` -gt 0 -a dnl + `ovs-vsctl get interface dpdkvhostuserclient0 statistics:rx_bytes` -gt 0]) +AT_CHECK([test `ovs-vsctl get interface dpdkvhostuserclient0 statistics:rx_packets` -eq dnl + $((`ovs-vsctl get interface dpdkvhostuserclient0 statistics:rx_q0_good_packets` + dnl + `ovs-vsctl get interface dpdkvhostuserclient0 statistics:rx_q1_good_packets`))]) +AT_CHECK([test `ovs-vsctl get interface dpdkvhostuserclient0 statistics:rx_bytes` -eq dnl + $((`ovs-vsctl get interface dpdkvhostuserclient0 statistics:rx_q0_good_bytes` + dnl + `ovs-vsctl get interface dpdkvhostuserclient0 statistics:rx_q1_good_bytes`))]) + +AT_CHECK([test `ovs-vsctl get interface dpdkvhostuserclient0 statistics:tx_packets` -gt 0 -a dnl + `ovs-vsctl get interface dpdkvhostuserclient0 statistics:tx_bytes` -gt 0]) +AT_CHECK([test `ovs-vsctl get interface dpdkvhostuserclient0 statistics:tx_packets` -eq dnl + $((`ovs-vsctl get interface dpdkvhostuserclient0 statistics:tx_q0_good_packets` + dnl + `ovs-vsctl get interface dpdkvhostuserclient0 
statistics:tx_q1_good_packets`))]) +AT_CHECK([test `ovs-vsctl get interface dpdkvhostuserclient0 statistics:tx_bytes` -eq dnl + $((`ovs-vsctl get interface dpdkvhostuserclient0 statistics:tx_q0_good_bytes` + dnl + `ovs-vsctl get interface dpdkvhostuserclient0 statistics:tx_q1_good_bytes`))]) + dnl Clean up the testpmd now pkill -f -x -9 'tail -f /dev/null' From c9e10ac57fb84b783f762d52a25ef3aa78a185c8 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Fri, 6 Jan 2023 16:58:56 +0100 Subject: [PATCH 112/833] netdev-dpdk: Drop coverage counter for vhost IRQs. The vhost library now provides finegrained statistics for guest notifications: - notifications for buffer reclaim by the guest, - notifications for buffer availability to the guest, Example before this patch: $ ovs-appctl coverage/show | grep vhost_notification vhost_notification 0.0/sec 0.000/sec 2.0283/sec total: 7302 $ ovs-vsctl get interface vhost4 statistics | sed -e 's#[{}]##g' -e 's#, #\n#g' | grep guest_notifications rx_q0_guest_notifications=66 tx_q0_guest_notifications=7236 Reviewed-by: Maxime Coquelin Signed-off-by: David Marchand Signed-off-by: Ilya Maximets --- lib/netdev-dpdk.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 61a35985ece..5e2d64651db 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -78,7 +78,6 @@ VLOG_DEFINE_THIS_MODULE(netdev_dpdk); static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20); COVERAGE_DEFINE(vhost_tx_contention); -COVERAGE_DEFINE(vhost_notification); static char *vhost_sock_dir = NULL; /* Location of vhost-user sockets */ static bool vhost_iommu_enabled = false; /* Status of vHost IOMMU support */ @@ -188,7 +187,6 @@ static int new_device(int vid); static void destroy_device(int vid); static int vring_state_changed(int vid, uint16_t queue_id, int enable); static void destroy_connection(int vid); -static void vhost_guest_notified(int vid); static const struct rte_vhost_device_ops virtio_net_device_ops 
= { @@ -198,7 +198,6 @@ static const struct rte_vhost_device_ops virtio_net_device_ops = .features_changed = NULL, .new_connection = NULL, .destroy_connection = destroy_connection, - .guest_notified = vhost_guest_notified, }; /* Custom software stats for dpdk ports */ @@ -4350,12 +4347,6 @@ destroy_connection(int vid) } } -static -void vhost_guest_notified(int vid OVS_UNUSED) -{ - COVERAGE_INC(vhost_notification); -} - /* * Retrieve the DPDK virtio device ID (vid) associated with a vhostuser * or vhostuserclient netdev. From 264ae342dc0f338734e9f67ba62f691f5ce8b272 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Fri, 6 Jan 2023 11:57:00 +0100 Subject: [PATCH 113/833] system-dpdk: Fix error message in ping vhost-user ports. In some environments, ovs-vswitchd gets shut down before the pkill of testpmd has been completed, which results in the following error messages: Removing port 'dpdkvhostuser0' while vhost device still attached. To restore connectivity after re-adding of port, VM on socket '' must be restarted. This patch will wait for the socket disconnect to be handled by the vhost-user before shutting down OVS. Signed-off-by: Eelco Chaudron Signed-off-by: David Marchand Co-authored-by: David Marchand Signed-off-by: Ilya Maximets --- tests/system-dpdk.at | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/system-dpdk.at b/tests/system-dpdk.at index 5ef7f8ccdc0..cb6c6d59075 100644 --- a/tests/system-dpdk.at +++ b/tests/system-dpdk.at @@ -154,6 +154,9 @@ AT_CHECK([ip netns exec ns1 ping -c 4 -I tap0 172.31.110.12], [], [stdout], dnl Clean up the testpmd now pkill -f -x -9 'tail -f /dev/null' +dnl Wait for vhost-user handling the socket disconnect.
+OVS_WAIT_UNTIL([grep "vHost Device '$OVS_RUNDIR/dpdkvhostuser0' has been removed" ovs-vswitchd.log]) + dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuser0], [], [stdout], [stderr]) OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ From e5d92c1a54852e9b5912aa53417d1f64bfee9af2 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 5 Jan 2023 13:40:40 +0100 Subject: [PATCH 114/833] cirrus: Update to use FreeBSD 12.4. 12.4 was released in December. That means that 12.3 will become unavailable in a near future. Updating. Acked-by: Aaron Conole Signed-off-by: Ilya Maximets --- .cirrus.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cirrus.yml b/.cirrus.yml index e3c1cd5811d..952d964315c 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -2,7 +2,7 @@ freebsd_build_task: freebsd_instance: matrix: - image_family: freebsd-12-3-snap + image_family: freebsd-12-4-snap image_family: freebsd-13-1-snap cpu: 4 memory: 4G From bd14aa31e39b55d12a366c2c7a627358112f3fca Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Tue, 10 Jan 2023 09:32:01 +0100 Subject: [PATCH 115/833] tests: Add unit tests to rculist. Low test coverage on this area caused some errors to remain unnoticed. Add basic functional test of rculist. 
Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- tests/automake.mk | 1 + tests/library.at | 5 + tests/test-rculist.c | 235 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 241 insertions(+) create mode 100644 tests/test-rculist.c diff --git a/tests/automake.mk b/tests/automake.mk index 4091a2796d8..c8de3fe28d2 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -476,6 +476,7 @@ tests_ovstest_SOURCES = \ tests/test-packets.c \ tests/test-random.c \ tests/test-rcu.c \ + tests/test-rculist.c \ tests/test-reconnect.c \ tests/test-rstp.c \ tests/test-sflow.c \ diff --git a/tests/library.at b/tests/library.at index bafb28277e8..164ae789dde 100644 --- a/tests/library.at +++ b/tests/library.at @@ -27,6 +27,11 @@ AT_CHECK([ovstest test-hindex], [0], [..................... ]) AT_CLEANUP +AT_SETUP([test rcu linked lists]) +AT_CHECK([ovstest test-rculist], [0], [..... +]) +AT_CLEANUP + AT_SETUP([cuckoo hash]) AT_KEYWORDS([cmap]) AT_CHECK([ovstest test-cmap check 1], [0], [... diff --git a/tests/test-rculist.c b/tests/test-rculist.c new file mode 100644 index 00000000000..07a6338b862 --- /dev/null +++ b/tests/test-rculist.c @@ -0,0 +1,235 @@ +/* + * Copyright (c) 2023 Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#undef NDEBUG +#include + +#include "openvswitch/list.h" +#include "ovstest.h" +#include "ovs-thread.h" +#include "random.h" +#include "rculist.h" +#include "util.h" + +enum { MAX_ELEMS = 10, MAX_CHECKS = 200 }; + +/* Sample list element. */ +struct element { + int value; + struct rculist node; +}; + +static void +do_usleep(unsigned int usecs) +{ +#ifdef _WIN32 + Sleep(MAX(usecs / 1000, 1)); +#else + usleep(usecs); +#endif +} + +/* Continuously check the integrity of the list until it's empty. */ +static void * +checker_main(void *aux) +{ + struct rculist *list = (struct rculist *) aux; + struct element *elem; + bool checked = false; + + for (int i = 0; i < MAX_CHECKS; i++) { + int value = -1; + + RCULIST_FOR_EACH (elem, node, list) { + ovs_assert(value <= elem->value); + ovs_assert(elem->value < MAX_ELEMS); + value = elem->value; + if (!checked) { + checked = true; + } + do_usleep(10); + } + + ovsrcu_quiesce(); + + if (checked && rculist_is_empty(list)) { + break; + } + } + return NULL; +} + +/* Run test while a thread checks the integrity of the list. + * Tests must end up emptying the list. */ +static void +run_test_while_checking(void (*function)(struct rculist *list)) +{ + struct rculist list; + pthread_t checker; + + rculist_init(&list); + + checker = ovs_thread_create("checker", checker_main, &list); + function(&list); + + ovs_assert(rculist_is_empty(&list)); + ovsrcu_quiesce(); + xpthread_join(checker, NULL); + printf("."); +} + +static void +test_rculist_insert_delete__(struct rculist *list, bool long_version) +{ + struct element *elem; + int value; + + for (int i = 1; i < MAX_ELEMS; i++) { + elem = xmalloc(sizeof *elem); + elem->value = i; + rculist_insert(list, &elem->node); + /* Leave some time for checkers to iterate through. 
*/ + do_usleep(random_range(1000)); + } + + ovsrcu_quiesce(); + + value = MAX_ELEMS; + RCULIST_FOR_EACH_REVERSE_PROTECTED (elem, node, list) { + ovs_assert (elem->value <= value); + value = elem->value; + } + + if (long_version) { + struct element *next; + RCULIST_FOR_EACH_SAFE_PROTECTED (elem, next, node, list) { + rculist_remove(&elem->node); + ovsrcu_postpone(free, elem); + /* Leave some time for checkers to iterate through. */ + do_usleep(random_range(1000)); + } + } else { + RCULIST_FOR_EACH_SAFE_PROTECTED (elem, node, list) { + rculist_remove(&elem->node); + ovsrcu_postpone(free, elem); + /* Leave some time for checkers to iterate through. */ + do_usleep(random_range(1000)); + } + } +} + +static void +test_rculist_insert_delete(struct rculist *list) +{ + test_rculist_insert_delete__(list, false); +} + +static void +test_rculist_insert_delete_long(struct rculist *list) +{ + test_rculist_insert_delete__(list, true); +} + +static void +test_rculist_push_front_pop_back(struct rculist *list) +{ + struct element *elem; + + for (int i = MAX_ELEMS - 1; i > 0; i--) { + elem = xmalloc(sizeof *elem); + elem->value = i; + rculist_push_front(list, &elem->node); + /* Leave some time for checkers to iterate through. */ + do_usleep(random_range(1000)); + } + + ovsrcu_quiesce(); + + while (!rculist_is_empty(list)) { + elem = CONTAINER_OF(rculist_pop_back(list), struct element, node); + ovsrcu_postpone(free, elem); + /* Leave some time for checkers to iterate through. */ + do_usleep(random_range(1000)); + } +} + +static void +test_rculist_push_back_pop_front(struct rculist *list) +{ + struct element *elem; + + for (int i = 0; i < MAX_ELEMS; i++) { + elem = xmalloc(sizeof *elem); + elem->value = i; + rculist_push_back(list, &elem->node); + /* Leave some time for checkers to iterate through. 
*/ + do_usleep(random_range(1000)); + } + + ovsrcu_quiesce(); + + while (!rculist_is_empty(list)) { + elem = CONTAINER_OF(rculist_pop_front(list), struct element, node); + ovsrcu_postpone(free, elem); + /* Leave some time for checkers to iterate through. */ + do_usleep(random_range(1000)); + } +} + +static void +test_rculist_splice(struct rculist *list) +{ + struct element *elem; + struct rculist other; + + rculist_init(&other); + + /* Insert elements in list by splicing an intermediate rculist. */ + for (int i = 0; i < MAX_ELEMS; i++) { + elem = xmalloc(sizeof *elem); + elem->value = i; + rculist_insert(&other, &elem->node); + rculist_splice_hidden(list, rculist_next_protected(&other), &other); + rculist_init(&other); + /* Leave some time for checkers to iterate through. */ + do_usleep(random_range(1000)); + } + + ovsrcu_quiesce(); + + ovs_assert(rculist_size(list) == MAX_ELEMS); + ovs_assert(rculist_is_empty(&other)); + while (!rculist_is_empty(list)) { + elem = CONTAINER_OF(rculist_pop_front(list), struct element, node); + ovsrcu_postpone(free, elem); + /* Leave some time for checkers to iterate through. */ + do_usleep(random_range(1000)); + } +} + +static void +test_rculist_main(int argc OVS_UNUSED, char *argv[] OVS_UNUSED) +{ + run_test_while_checking(test_rculist_insert_delete); + run_test_while_checking(test_rculist_insert_delete_long); + run_test_while_checking(test_rculist_push_back_pop_front); + run_test_while_checking(test_rculist_push_front_pop_back); + run_test_while_checking(test_rculist_splice); + printf("\n"); +} + +OVSTEST_REGISTER("test-rculist", test_rculist_main); From c7da49bc64157f6dec7cfbf2d90a0217744310ee Mon Sep 17 00:00:00 2001 From: David Marchand Date: Wed, 11 Jan 2023 09:53:26 +0100 Subject: [PATCH 116/833] netdev-offload-dpdk: Fix transfer flows. Following DPDK commit bd2a4d4b2e3a ("ethdev: forbid direction attribute in transfer flow rules"), the ingress attribute presence is rejected for transfer flows. 
Fixes: a77c7796f23a ("dpdk: Update to use v22.11.1.") Acked-by: Eli Britstein Signed-off-by: David Marchand Signed-off-by: Ilya Maximets --- lib/netdev-offload-dpdk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c index 38f00fd309e..b3421c0996e 100644 --- a/lib/netdev-offload-dpdk.c +++ b/lib/netdev-offload-dpdk.c @@ -2242,7 +2242,7 @@ netdev_offload_dpdk_actions(struct netdev *netdev, struct nlattr *nl_actions, size_t actions_len) { - const struct rte_flow_attr flow_attr = { .ingress = 1, .transfer = 1 }; + const struct rte_flow_attr flow_attr = { .transfer = 1, }; struct flow_actions actions = { .actions = NULL, .cnt = 0, From 7e18ae63a6dd723b425e4479acfd83fa9216f326 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Mon, 9 Jan 2023 17:55:02 +0100 Subject: [PATCH 117/833] Documentation: Fix link to iproute2 git repository. iproute2 git repositories were split and moved around v4.15 [1]. It is time to fix the link in OVS documentation. 1: https://lore.kernel.org/netdev/20180129082052.0eb85e9b@xeon-e3/ Signed-off-by: David Marchand Signed-off-by: Ilya Maximets --- Documentation/topics/testing.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/topics/testing.rst b/Documentation/topics/testing.rst index bc41b217a5c..5f6940b84d9 100644 --- a/Documentation/topics/testing.rst +++ b/Documentation/topics/testing.rst @@ -448,7 +448,7 @@ datapath testsuite. an updated iproute2 utilities package. The package is available from the Linux kernel organization open source git repositories. - https://git.kernel.org/pub/scm/linux/kernel/git/shemminger/iproute2.git + https://git.kernel.org/pub/scm/network/iproute2/iproute2.git .. _testing-static-analysis: From 61e2259cf4d663811e942928e7d0102f9fca31c6 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Mon, 9 Jan 2023 17:55:03 +0100 Subject: [PATCH 118/833] Documentation: Fix link to AppVeyor. 
Sphinx linkcheck complains with: Warning, treated as error: .../Documentation/intro/install/windows.rst:1093:broken link: www.appveyor.com () Add a https scheme in link to AppVeyor website. Signed-off-by: David Marchand Signed-off-by: Ilya Maximets --- Documentation/intro/install/windows.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/intro/install/windows.rst b/Documentation/intro/install/windows.rst index 44fc6ae3795..78f60f35acf 100644 --- a/Documentation/intro/install/windows.rst +++ b/Documentation/intro/install/windows.rst @@ -1090,9 +1090,9 @@ To stop and delete the services, run: Windows CI Service ------------------ -`AppVeyor `__ provides a free Windows autobuild service for -open source projects. Open vSwitch has integration with AppVeyor for -continuous build. A developer can build test his changes for Windows by +`AppVeyor `__ provides a free Windows autobuild +service for open source projects. Open vSwitch has integration with AppVeyor +for continuous build. A developer can build test his changes for Windows by logging into appveyor.com using a github account, creating a new project by linking it to his development repository in github and triggering a new build. From 8ef198425b1a03d95a544a86ba8e565459408c54 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Mon, 9 Jan 2023 17:55:04 +0100 Subject: [PATCH 119/833] Documentation: Fix link to Netperf. netperf.org was shut down in favor of some HP related resources. Signed-off-by: David Marchand Signed-off-by: Ilya Maximets --- Documentation/howto/qos.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/howto/qos.rst b/Documentation/howto/qos.rst index 376ec2514bd..7d625e00197 100644 --- a/Documentation/howto/qos.rst +++ b/Documentation/howto/qos.rst @@ -59,10 +59,10 @@ is participating in an OVS bridge, no IP address can be assigned on `eth0`. 
The second host, named Measurement Host, can be any host capable of measuring throughput from a VM. For this guide, we use `netperf -`__, a free tool for testing the rate at which one host -can send to another. The Measurement Host has only a single NIC, `eth0`, which -is connected to the Data Network. `eth0` has an IP address that can reach any -VM on `host1`. +`__, a free tool for testing the rate +at which one host can send to another. The Measurement Host has only a single +NIC, `eth0`, which is connected to the Data Network. `eth0` has an IP address +that can reach any VM on `host1`. Two VMs ~~~~~~~ From 68ff5e9811e597d406aa313f538ee63644da4bb6 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Mon, 9 Jan 2023 17:55:06 +0100 Subject: [PATCH 120/833] Documentation: Remove reference to RST online editor. rst.ninjs.org is not available anymore, but there are alternatives listed in this doc. Signed-off-by: David Marchand Signed-off-by: Ilya Maximets --- Documentation/internals/contributing/documentation-style.rst | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Documentation/internals/contributing/documentation-style.rst b/Documentation/internals/contributing/documentation-style.rst index 045cdf69672..2eec4c4d269 100644 --- a/Documentation/internals/contributing/documentation-style.rst +++ b/Documentation/internals/contributing/documentation-style.rst @@ -423,10 +423,6 @@ Helpful Tools There are a number of tools, online and offline, which can be used to preview documents are you edit them: -- `rst.ninjs.org `__ - - An online rST editor/previewer - - `ReText `__ A simple but powerful editor for Markdown and reStructuredText. ReText is From 4de6b009cfec1e4aac57283f5cab129718939292 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Wed, 11 Jan 2023 21:41:12 +0100 Subject: [PATCH 121/833] Documentation: Remove link to obsolete sources. This archive website disappeared. 
On the other hand, the link to an obsolete dpif-provider man page probably did not provide much info and we can simply mention the current file. Signed-off-by: David Marchand Signed-off-by: Ilya Maximets --- Documentation/topics/windows.rst | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/Documentation/topics/windows.rst b/Documentation/topics/windows.rst index c5b34c85fb8..1f1b513e4a9 100644 --- a/Documentation/topics/windows.rst +++ b/Documentation/topics/windows.rst @@ -66,14 +66,14 @@ ingress path. In the egress path, it is the other way round. In addition, there is a object identifier (OID) interface for control operations Eg. addition of a port. The workflow for the calls is similar in nature to the packets, where higher level layers call into the lower level layers. A good representational -diagram of this architecture is in [4]_. +diagram of this architecture is in [3]_. -Windows Filtering Platform (WFP) [5]_ is a platform implemented on Hyper-V that +Windows Filtering Platform (WFP) [4]_ is a platform implemented on Hyper-V that provides APIs and services for filtering packets. WFP has been utilized to filter on some of the packets that OVS is not equipped to handle directly. More details in later sections. -IP Helper [6]_ is a set of API available on Hyper-V to retrieve information +IP Helper [5]_ is a set of API available on Hyper-V to retrieve information related to the network configuration information on the host machine. IP Helper has been used to retrieve some of the configuration information that OVS needs. @@ -188,10 +188,10 @@ The userspace portion of the OVS solution is mostly POSIX code, and not very Linux specific. Majority of the userspace code does not interface directly with the kernel datapath and was ported independently of the kernel datapath effort. 
-As explained in the OVS porting design document [7]_, DPIF is the portion of +As explained in the OVS porting design document [6]_, DPIF is the portion of userspace that interfaces with the kernel portion of the OVS. The interface -that each DPIF provider has to implement is defined in ``dpif-provider.h`` -[3]_. Though each platform is allowed to have its own implementation of the +that each DPIF provider has to implement is defined in ``dpif-provider.h``. +Though each platform is allowed to have its own implementation of the DPIF provider, it was found, via community feedback, that it is desired to share code whenever possible. Thus, the DPIF provider for OVS on Hyper-V shares code with the DPIF provider on Linux. This interface is implemented in @@ -253,7 +253,7 @@ Netlink Message Parser ~~~~~~~~~~~~~~~~~~~~~~ The communication between OVS userspace and OVS kernel datapath is in the form -of Netlink messages [1]_, [8]_. More details about this are provided below. In +of Netlink messages [1]_, [7]_. More details about this are provided below. In the kernel, a full fledged netlink message parser has been implemented along the lines of the netlink message parser in OVS userspace. In fact, a lot of the code is ported code. @@ -407,7 +407,7 @@ As has been mentioned in earlier sections, the netlink socket and netlink message based DPIF provider on Linux has been ported to Windows. Most of the code is common. Some divergence is in the code to receive packets. -The Linux implementation uses epoll() [9]_ which is not natively supported on +The Linux implementation uses epoll() [8]_ which is not natively supported on Windows. netdev-windows @@ -501,10 +501,9 @@ References .. [1] Hyper-V Extensible Switch https://msdn.microsoft.com/windows/hardware/drivers/network/hyper-v-extensible-switch .. [2] Hyper-V Extensible Switch Extensions https://msdn.microsoft.com/windows/hardware/drivers/network/hyper-v-extensible-switch-extensions -.. 
[3] DPIF Provider http://openvswitch.sourcearchive.com/documentation/1.1.0-1/dpif-provider_8h_source.html -.. [4] Hyper-V Extensible Switch Components https://msdn.microsoft.com/windows/hardware/drivers/network/hyper-v-extensible-switch-components -.. [5] Windows Filtering Platform https://msdn.microsoft.com/en-us/library/windows/desktop/aa366510(v=vs.85).aspx -.. [6] IP Helper https://msdn.microsoft.com/windows/hardware/drivers/network/ip-helper -.. [7] How to Port Open vSwitch to New Software or Hardware :doc:`porting` -.. [8] Netlink https://en.wikipedia.org/wiki/Netlink -.. [9] epoll https://en.wikipedia.org/wiki/Epoll +.. [3] Hyper-V Extensible Switch Components https://msdn.microsoft.com/windows/hardware/drivers/network/hyper-v-extensible-switch-components +.. [4] Windows Filtering Platform https://msdn.microsoft.com/en-us/library/windows/desktop/aa366510(v=vs.85).aspx +.. [5] IP Helper https://msdn.microsoft.com/windows/hardware/drivers/network/ip-helper +.. [6] How to Port Open vSwitch to New Software or Hardware :doc:`porting` +.. [7] Netlink https://en.wikipedia.org/wiki/Netlink +.. [8] epoll https://en.wikipedia.org/wiki/Epoll From f4c884135139f0d9e309bcd58244191145c5abba Mon Sep 17 00:00:00 2001 From: Kevin Traynor Date: Wed, 11 Jan 2023 09:35:00 +0000 Subject: [PATCH 122/833] util: Add non quiesce xnanosleep. xnanosleep forces the thread into quiesce state in anticipation that it will be sleeping for a considerable time and that the thread may need to quiesce before the sleep is finished. In some cases, a very short sleep may be requested and in that case the overhead of going into quiesce state may be unnecessary. To allow for those cases add a xnanosleep_no_quiesce() variant. 
Suggested-by: Ilya Maximets Reviewed-by: David Marchand Signed-off-by: Kevin Traynor Signed-off-by: Ilya Maximets --- lib/util.c | 21 +++++++++++++++++---- lib/util.h | 1 + 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/lib/util.c b/lib/util.c index 1195c798211..7576eb06eb3 100644 --- a/lib/util.c +++ b/lib/util.c @@ -2371,11 +2371,9 @@ xsleep(unsigned int seconds) ovsrcu_quiesce_end(); } -/* High resolution sleep. */ -void -xnanosleep(uint64_t nanoseconds) +static void +xnanosleep__(uint64_t nanoseconds) { - ovsrcu_quiesce_start(); #ifndef _WIN32 int retval; struct timespec ts_sleep; @@ -2403,9 +2401,24 @@ xnanosleep(uint64_t nanoseconds) ovs_lasterror_to_string()); } #endif +} + +/* High resolution sleep with thread quiesce. */ +void +xnanosleep(uint64_t nanoseconds) +{ + ovsrcu_quiesce_start(); + xnanosleep__(nanoseconds); ovsrcu_quiesce_end(); } +/* High resolution sleep without thread quiesce. */ +void +xnanosleep_no_quiesce(uint64_t nanoseconds) +{ + xnanosleep__(nanoseconds); +} + /* Determine whether standard output is a tty or not. This is useful to decide * whether to use color output or not when --color option for utilities is set * to `auto`. diff --git a/lib/util.h b/lib/util.h index 9ff84b3dccb..f35f330217c 100644 --- a/lib/util.h +++ b/lib/util.h @@ -593,6 +593,7 @@ ovs_u128_is_superset(ovs_u128 super, ovs_u128 sub) void xsleep(unsigned int seconds); void xnanosleep(uint64_t nanoseconds); +void xnanosleep_no_quiesce(uint64_t nanoseconds); bool is_stdout_a_tty(void); From de3bbdc479a9a78135e1922e4e6011732515e7ef Mon Sep 17 00:00:00 2001 From: Kevin Traynor Date: Wed, 11 Jan 2023 09:35:01 +0000 Subject: [PATCH 123/833] dpif-netdev: Add PMD load based sleeping. Sleep for an incremental amount of time if none of the Rx queues assigned to a PMD have at least half a batch of packets (i.e. 16 pkts) on an polling iteration of the PMD. Upon detecting the threshold of >= 16 pkts on an Rxq, reset the sleep time to zero (i.e. no sleep). 
Sleep time will be increased on each iteration where the low load conditions remain up to a total of the max sleep time which is set by the user e.g: ovs-vsctl set Open_vSwitch . other_config:pmd-maxsleep=500 The default pmd-maxsleep value is 0, which means that no sleeps will occur and the default behaviour is unchanged from previously. Also add new stats to pmd-perf-show to get visibility of operation e.g. ... - sleep iterations: 153994 ( 76.8 % of iterations) Sleep time (us): 9159399 ( 59 us/iteration avg.) ... Reviewed-by: Robin Jarry Reviewed-by: David Marchand Signed-off-by: Kevin Traynor Signed-off-by: Ilya Maximets --- Documentation/topics/dpdk/pmd.rst | 54 +++++++++++++++++++++++++ NEWS | 3 ++ lib/dpif-netdev-perf.c | 24 +++++++++--- lib/dpif-netdev-perf.h | 5 ++- lib/dpif-netdev.c | 65 +++++++++++++++++++++++++++++-- tests/pmd.at | 46 ++++++++++++++++++++++ vswitchd/vswitch.xml | 26 +++++++++++++ 7 files changed, 213 insertions(+), 10 deletions(-) diff --git a/Documentation/topics/dpdk/pmd.rst b/Documentation/topics/dpdk/pmd.rst index 9006fd40f07..604ac3f6b1d 100644 --- a/Documentation/topics/dpdk/pmd.rst +++ b/Documentation/topics/dpdk/pmd.rst @@ -324,5 +324,59 @@ A user can use this option to set a minimum frequency of Rx queue to PMD reassignment due to PMD Auto Load Balance. For example, this could be set (in min) such that a reassignment is triggered at most every few hours. +PMD load based sleeping (Experimental) +-------------------------------------- + +PMD threads constantly poll Rx queues which are assigned to them. In order to +reduce the CPU cycles they use, they can sleep for small periods of time +when there is no load or very-low load on all the Rx queues they poll. + +This can be enabled by setting the max requested sleep time (in microseconds) +for a PMD thread:: + + $ ovs-vsctl set open_vswitch . other_config:pmd-maxsleep=500 + +Non-zero values will be rounded up to the nearest 10 microseconds to avoid +requesting very small sleep times. 
+ +With a non-zero max value a PMD may request to sleep by an incrementing amount +of time up to the maximum time. If at any point the threshold of at least half +a batch of packets (i.e. 16) is received from an Rx queue that the PMD is +polling is met, the requested sleep time will be reset to 0. At that point no +sleeps will occur until the no/low load conditions return. + +Sleeping in a PMD thread will mean there is a period of time when the PMD +thread will not process packets. Sleep times requested are not guaranteed +and can differ significantly depending on system configuration. The actual +time not processing packets will be determined by the sleep and processor +wake-up times and should be tested with each system configuration. + +Sleep time statistics for 10 secs can be seen with:: + + $ ovs-appctl dpif-netdev/pmd-stats-clear \ + && sleep 10 && ovs-appctl dpif-netdev/pmd-perf-show + +Example output, showing that during the last 10 seconds, 76.8% of iterations +had a sleep of some length. The total amount of sleep time was 9.15 seconds and +the average sleep time per iteration was 59 microseconds:: + + - sleep iterations: 153994 ( 76.8 % of iterations) + Sleep time (us): 9159399 ( 59 us/iteration avg.) + +Any potential power saving from PMD load based sleeping is dependent on the +system configuration (e.g. enabling processor C-states) and workloads. + +.. note:: + + If there is a sudden spike of packets while the PMD thread is sleeping and + the processor is in a low-power state it may result in some lost packets or + extra latency before the PMD thread returns to processing packets at full + rate. + +.. note:: + + By default Linux kernel groups timer expirations and this can add an + overhead of up to 50 microseconds to a requested timer expiration. + .. 
_ovs-vswitchd(8): http://openvswitch.org/support/dist-docs/ovs-vswitchd.8.html diff --git a/NEWS b/NEWS index 2f6ededfe47..4f9291bf13c 100644 --- a/NEWS +++ b/NEWS @@ -30,6 +30,9 @@ Post-v3.0.0 - Userspace datapath: * Add '-secs' argument to appctl 'dpif-netdev/pmd-rxq-show' to show the pmd usage of an Rx queue over a configurable time period. + * Add new experimental PMD load based sleeping feature. PMD threads can + request to sleep up to a user configured 'pmd-maxsleep' value under + low load conditions. v3.0.0 - 15 Aug 2022 diff --git a/lib/dpif-netdev-perf.c b/lib/dpif-netdev-perf.c index a2a7d8f0b88..1a7bab04c0c 100644 --- a/lib/dpif-netdev-perf.c +++ b/lib/dpif-netdev-perf.c @@ -230,18 +230,26 @@ pmd_perf_format_overall_stats(struct ds *str, struct pmd_perf_stats *s, uint64_t tot_iter = histogram_samples(&s->pkts); uint64_t idle_iter = s->pkts.bin[0]; uint64_t busy_iter = tot_iter >= idle_iter ? tot_iter - idle_iter : 0; + uint64_t sleep_iter = stats[PMD_SLEEP_ITER]; + uint64_t tot_sleep_cycles = stats[PMD_CYCLES_SLEEP]; ds_put_format(str, " Iterations: %12"PRIu64" (%.2f us/it)\n" " - Used TSC cycles: %12"PRIu64" (%5.1f %% of total cycles)\n" " - idle iterations: %12"PRIu64" (%5.1f %% of used cycles)\n" - " - busy iterations: %12"PRIu64" (%5.1f %% of used cycles)\n", - tot_iter, tot_cycles * us_per_cycle / tot_iter, + " - busy iterations: %12"PRIu64" (%5.1f %% of used cycles)\n" + " - sleep iterations: %12"PRIu64" (%5.1f %% of iterations)\n" + " Sleep time (us): %12.0f (%3.0f us/iteration avg.)\n", + tot_iter, + (tot_cycles + tot_sleep_cycles) * us_per_cycle / tot_iter, tot_cycles, 100.0 * (tot_cycles / duration) / tsc_hz, idle_iter, 100.0 * stats[PMD_CYCLES_ITER_IDLE] / tot_cycles, busy_iter, - 100.0 * stats[PMD_CYCLES_ITER_BUSY] / tot_cycles); + 100.0 * stats[PMD_CYCLES_ITER_BUSY] / tot_cycles, + sleep_iter, tot_iter ? 100.0 * sleep_iter / tot_iter : 0, + tot_sleep_cycles * us_per_cycle, + sleep_iter ? 
(tot_sleep_cycles * us_per_cycle) / sleep_iter : 0); if (rx_packets > 0) { ds_put_format(str, " Rx packets: %12"PRIu64" (%.0f Kpps, %.0f cycles/pkt)\n" @@ -518,14 +526,15 @@ OVS_REQUIRES(s->stats_mutex) void pmd_perf_end_iteration(struct pmd_perf_stats *s, int rx_packets, - int tx_packets, bool full_metrics) + int tx_packets, uint64_t sleep_cycles, + bool full_metrics) { uint64_t now_tsc = cycles_counter_update(s); struct iter_stats *cum_ms; uint64_t cycles, cycles_per_pkt = 0; char *reason = NULL; - cycles = now_tsc - s->start_tsc; + cycles = now_tsc - s->start_tsc - sleep_cycles; s->current.timestamp = s->iteration_cnt; s->current.cycles = cycles; s->current.pkts = rx_packets; @@ -539,6 +548,11 @@ pmd_perf_end_iteration(struct pmd_perf_stats *s, int rx_packets, histogram_add_sample(&s->cycles, cycles); histogram_add_sample(&s->pkts, rx_packets); + if (sleep_cycles) { + pmd_perf_update_counter(s, PMD_SLEEP_ITER, 1); + pmd_perf_update_counter(s, PMD_CYCLES_SLEEP, sleep_cycles); + } + if (!full_metrics) { return; } diff --git a/lib/dpif-netdev-perf.h b/lib/dpif-netdev-perf.h index 9673dddd835..84beced1519 100644 --- a/lib/dpif-netdev-perf.h +++ b/lib/dpif-netdev-perf.h @@ -80,6 +80,8 @@ enum pmd_stat_type { PMD_CYCLES_ITER_IDLE, /* Cycles spent in idle iterations. */ PMD_CYCLES_ITER_BUSY, /* Cycles spent in busy iterations. */ PMD_CYCLES_UPCALL, /* Cycles spent processing upcalls. */ + PMD_SLEEP_ITER, /* Iterations where a sleep has taken place. */ + PMD_CYCLES_SLEEP, /* Total cycles slept to save power. */ PMD_N_STATS }; @@ -408,7 +410,8 @@ void pmd_perf_start_iteration(struct pmd_perf_stats *s); void pmd_perf_end_iteration(struct pmd_perf_stats *s, int rx_packets, - int tx_packets, bool full_metrics); + int tx_packets, uint64_t sleep_cycles, + bool full_metrics); /* Formatting the output of commands. 
*/ diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 7127068fe0e..a47d54c6fde 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -171,6 +171,11 @@ static struct odp_support dp_netdev_support = { /* Time in microseconds to try RCU quiescing. */ #define PMD_RCU_QUIESCE_INTERVAL 10000LL +/* Number of pkts Rx on an interface that will stop pmd thread sleeping. */ +#define PMD_SLEEP_THRESH (NETDEV_MAX_BURST / 2) +/* Time in uS to increment a pmd thread sleep time. */ +#define PMD_SLEEP_INC_US 10 + struct dpcls { struct cmap_node node; /* Within dp_netdev_pmd_thread.classifiers */ odp_port_t in_port; @@ -279,6 +284,8 @@ struct dp_netdev { atomic_uint32_t emc_insert_min; /* Enable collection of PMD performance metrics. */ atomic_bool pmd_perf_metrics; + /* Max load based sleep request. */ + atomic_uint64_t pmd_max_sleep; /* Enable the SMC cache from ovsdb config */ atomic_bool smc_enable_db; @@ -4821,8 +4828,10 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config) uint64_t rebalance_intvl; uint8_t cur_rebalance_load; uint32_t rebalance_load, rebalance_improve; + uint64_t pmd_max_sleep, cur_pmd_max_sleep; bool log_autolb = false; enum sched_assignment_type pmd_rxq_assign_type; + static bool first_set_config = true; tx_flush_interval = smap_get_int(other_config, "tx-flush-interval", DEFAULT_TX_FLUSH_INTERVAL); @@ -4969,6 +4978,19 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config) bool autolb_state = smap_get_bool(other_config, "pmd-auto-lb", false); set_pmd_auto_lb(dp, autolb_state, log_autolb); + + pmd_max_sleep = smap_get_ullong(other_config, "pmd-maxsleep", 0); + pmd_max_sleep = ROUND_UP(pmd_max_sleep, 10); + pmd_max_sleep = MIN(PMD_RCU_QUIESCE_INTERVAL, pmd_max_sleep); + atomic_read_relaxed(&dp->pmd_max_sleep, &cur_pmd_max_sleep); + if (first_set_config || pmd_max_sleep != cur_pmd_max_sleep) { + atomic_store_relaxed(&dp->pmd_max_sleep, pmd_max_sleep); + VLOG_INFO("PMD max sleep request is %"PRIu64" 
usecs.", pmd_max_sleep); + VLOG_INFO("PMD load based sleeps are %s.", + pmd_max_sleep ? "enabled" : "disabled" ); + } + + first_set_config = false; return 0; } @@ -6929,6 +6951,7 @@ pmd_thread_main(void *f_) int poll_cnt; int i; int process_packets = 0; + uint64_t sleep_time = 0; poll_list = NULL; @@ -6989,10 +7012,13 @@ pmd_thread_main(void *f_) ovs_mutex_lock(&pmd->perf_stats.stats_mutex); for (;;) { uint64_t rx_packets = 0, tx_packets = 0; + uint64_t time_slept = 0; + uint64_t max_sleep; pmd_perf_start_iteration(s); atomic_read_relaxed(&pmd->dp->smc_enable_db, &pmd->ctx.smc_enable_db); + atomic_read_relaxed(&pmd->dp->pmd_max_sleep, &max_sleep); for (i = 0; i < poll_cnt; i++) { @@ -7011,6 +7037,9 @@ pmd_thread_main(void *f_) dp_netdev_process_rxq_port(pmd, poll_list[i].rxq, poll_list[i].port_no); rx_packets += process_packets; + if (process_packets >= PMD_SLEEP_THRESH) { + sleep_time = 0; + } } if (!rx_packets) { @@ -7018,7 +7047,30 @@ pmd_thread_main(void *f_) * Check if we need to send something. * There was no time updates on current iteration. */ pmd_thread_ctx_time_update(pmd); - tx_packets = dp_netdev_pmd_flush_output_packets(pmd, false); + tx_packets = dp_netdev_pmd_flush_output_packets(pmd, + max_sleep && sleep_time + ? true : false); + } + + if (max_sleep) { + /* Check if a sleep should happen on this iteration. */ + if (sleep_time) { + struct cycle_timer sleep_timer; + + cycle_timer_start(&pmd->perf_stats, &sleep_timer); + xnanosleep_no_quiesce(sleep_time * 1000); + time_slept = cycle_timer_stop(&pmd->perf_stats, &sleep_timer); + pmd_thread_ctx_time_update(pmd); + } + if (sleep_time < max_sleep) { + /* Increase sleep time for next iteration. */ + sleep_time += PMD_SLEEP_INC_US; + } else { + sleep_time = max_sleep; + } + } else { + /* Reset sleep time as max sleep policy may have been changed. */ + sleep_time = 0; } /* Do RCU synchronization at fixed interval. 
This ensures that @@ -7058,7 +7110,7 @@ pmd_thread_main(void *f_) break; } - pmd_perf_end_iteration(s, rx_packets, tx_packets, + pmd_perf_end_iteration(s, rx_packets, tx_packets, time_slept, pmd_perf_metrics_enabled(pmd)); } ovs_mutex_unlock(&pmd->perf_stats.stats_mutex); @@ -9909,7 +9961,7 @@ dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd, struct polled_queue *poll_list, int poll_cnt) { struct dpcls *cls; - uint64_t tot_idle = 0, tot_proc = 0; + uint64_t tot_idle = 0, tot_proc = 0, tot_sleep = 0; unsigned int pmd_load = 0; if (pmd->ctx.now > pmd->next_cycle_store) { @@ -9926,10 +9978,13 @@ dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd, pmd->prev_stats[PMD_CYCLES_ITER_IDLE]; tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] - pmd->prev_stats[PMD_CYCLES_ITER_BUSY]; + tot_sleep = pmd->perf_stats.counters.n[PMD_CYCLES_SLEEP] - + pmd->prev_stats[PMD_CYCLES_SLEEP]; if (pmd_alb->is_enabled && !pmd->isolated) { if (tot_proc) { - pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc)); + pmd_load = ((tot_proc * 100) / + (tot_idle + tot_proc + tot_sleep)); } atomic_read_relaxed(&pmd_alb->rebalance_load_thresh, @@ -9946,6 +10001,8 @@ dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd, pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE]; pmd->prev_stats[PMD_CYCLES_ITER_BUSY] = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY]; + pmd->prev_stats[PMD_CYCLES_SLEEP] = + pmd->perf_stats.counters.n[PMD_CYCLES_SLEEP]; /* Get the cycles that were used to process each queue and store. 
*/ for (unsigned i = 0; i < poll_cnt; i++) { diff --git a/tests/pmd.at b/tests/pmd.at index ed90f88c4cb..e0f58f7a606 100644 --- a/tests/pmd.at +++ b/tests/pmd.at @@ -1254,3 +1254,49 @@ ovs-appctl: ovs-vswitchd: server returned an error OVS_VSWITCHD_STOP AT_CLEANUP + +dnl Check default state +AT_SETUP([PMD - pmd sleep]) +OVS_VSWITCHD_START + +dnl Check default +OVS_WAIT_UNTIL([tail ovs-vswitchd.log | grep "PMD max sleep request is 0 usecs."]) +OVS_WAIT_UNTIL([tail ovs-vswitchd.log | grep "PMD load based sleeps are disabled."]) + +dnl Check low value max sleep +get_log_next_line_num +AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-maxsleep="1"]) +OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD max sleep request is 10 usecs."]) +OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD load based sleeps are enabled."]) + +dnl Check high value max sleep +get_log_next_line_num +AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-maxsleep="10000"]) +OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD max sleep request is 10000 usecs."]) +OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD load based sleeps are enabled."]) + +dnl Check setting max sleep to zero +get_log_next_line_num +AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-maxsleep="0"]) +OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD max sleep request is 0 usecs."]) +OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD load based sleeps are disabled."]) + +dnl Check above high value max sleep +get_log_next_line_num +AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-maxsleep="10001"]) +OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD max sleep request is 10000 usecs."]) +OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD load based sleeps are enabled."]) + +dnl Check rounding +get_log_next_line_num +AT_CHECK([ovs-vsctl set open_vswitch . 
other_config:pmd-maxsleep="490"]) +OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD max sleep request is 490 usecs."]) +OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD load based sleeps are enabled."]) +dnl Check rounding +get_log_next_line_num +AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-maxsleep="491"]) +OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD max sleep request is 500 usecs."]) +OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD load based sleeps are enabled."]) + +OVS_VSWITCHD_STOP +AT_CLEANUP diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index f9bdb2d92be..8c4acfb1817 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -788,6 +788,32 @@ The default value is 25%.

+ +

+ Specifies the maximum sleep time that will be requested in + microseconds per iteration for a PMD thread which has received zero + or a small number of packets from the Rx queues it is polling.

+

+ The actual sleep time requested is based on the load + of the Rx queues that the PMD polls and may be less than + the maximum value. +

+

+ The default value is 0 microseconds, which means + that the PMD will not sleep regardless of the load from the + Rx queues that it polls. +

+

+ To avoid requesting very small sleeps (e.g. less than 10 us), the + value will be rounded up to the next multiple of 10 us.

+

+ The maximum value is 10000 microseconds. +

+

From a9ae73b916bad528dcac2b8bb302fee6935fc163 Mon Sep 17 00:00:00 2001 From: Ales Musil Date: Mon, 16 Jan 2023 12:45:07 +0100 Subject: [PATCH 124/833] ofp, dpif: Allow CT flush based on partial match. Currently, the CT can be flushed by dpctl only by specifying the whole 5-tuple. This is not very convenient when there are only some fields known to the user of CT flush. Add new struct ofp_ct_match which represents the generic filtering that can be done for CT flush. The match is done only on fields that are non-zero with exception to the icmp fields. This allows the filtering just within dpctl, however it is a preparation for OpenFlow extension. Reported-at: https://bugzilla.redhat.com/2120546 Signed-off-by: Ales Musil Signed-off-by: Ilya Maximets --- NEWS | 3 + include/openvswitch/automake.mk | 1 + include/openvswitch/ofp-ct.h | 66 +++++++ lib/automake.mk | 1 + lib/ct-dpif.c | 298 +++++++++++++++++++------------- lib/ct-dpif.h | 5 +- lib/dpctl.c | 43 +++-- lib/dpctl.man | 24 ++- lib/ofp-ct.c | 213 +++++++++++++++++++++++ tests/system-traffic.at | 102 ++++++++++- 10 files changed, 615 insertions(+), 141 deletions(-) create mode 100644 include/openvswitch/ofp-ct.h create mode 100644 lib/ofp-ct.c diff --git a/NEWS b/NEWS index 4f9291bf13c..3685d7c159d 100644 --- a/NEWS +++ b/NEWS @@ -25,6 +25,9 @@ Post-v3.0.0 * New option '--dump-hugepages' to include hugepages in core dumps. This can assist with postmortem analysis involving DPDK, but may also produce significantly larger core dump files. + - ovs-dpctl and 'ovs-appctl dpctl/' commands: + * 'flush-conntrack' is now capable of handling partial 5-tuple, + with additional optional parameter to specify the reply direction. - Support for travis-ci.org based continuous integration builds has been dropped. 
- Userspace datapath: diff --git a/include/openvswitch/automake.mk b/include/openvswitch/automake.mk index 84670d80aae..0cc1f569e0a 100644 --- a/include/openvswitch/automake.mk +++ b/include/openvswitch/automake.mk @@ -15,6 +15,7 @@ openvswitchinclude_HEADERS = \ include/openvswitch/ofp-actions.h \ include/openvswitch/ofp-bundle.h \ include/openvswitch/ofp-connection.h \ + include/openvswitch/ofp-ct.h \ include/openvswitch/ofp-ed-props.h \ include/openvswitch/ofp-errors.h \ include/openvswitch/ofp-flow.h \ diff --git a/include/openvswitch/ofp-ct.h b/include/openvswitch/ofp-ct.h new file mode 100644 index 00000000000..3d919ddf974 --- /dev/null +++ b/include/openvswitch/ofp-ct.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2023, Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef OPENVSWITCH_OFP_CT_H +#define OPENVSWITCH_OFP_CT_H 1 + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct ofp_ct_tuple { + struct in6_addr src; + struct in6_addr dst; + + union { + ovs_be16 src_port; + ovs_be16 icmp_id; + }; + union { + ovs_be16 dst_port; + struct { + uint8_t icmp_code; + uint8_t icmp_type; + }; + }; +}; + +struct ofp_ct_match { + uint8_t ip_proto; + uint16_t l3_type; + + struct ofp_ct_tuple tuple_orig; + struct ofp_ct_tuple tuple_reply; +}; + +bool ofp_ct_match_is_zero(const struct ofp_ct_match *); +bool ofp_ct_tuple_is_zero(const struct ofp_ct_tuple *, uint8_t ip_proto); +bool ofp_ct_tuple_is_five_tuple(const struct ofp_ct_tuple *, uint8_t ip_proto); + +void ofp_ct_match_format(struct ds *, const struct ofp_ct_match *); +bool ofp_ct_tuple_parse(struct ofp_ct_tuple *, const char *, + struct ds *, uint8_t *ip_proto, uint16_t *l3_type); + +#ifdef __cplusplus +} +#endif + +#endif /* ofp-ct.h */ diff --git a/lib/automake.mk b/lib/automake.mk index 61bdc308f07..e64ee76ce79 100644 --- a/lib/automake.mk +++ b/lib/automake.mk @@ -226,6 +226,7 @@ lib_libopenvswitch_la_SOURCES = \ lib/ofp-actions.c \ lib/ofp-bundle.c \ lib/ofp-connection.c \ + lib/ofp-ct.c \ lib/ofp-ed-props.c \ lib/ofp-errors.c \ lib/ofp-flow.c \ diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c index 6f17a26b5f4..d3b2783ce49 100644 --- a/lib/ct-dpif.c +++ b/lib/ct-dpif.c @@ -20,6 +20,7 @@ #include #include "ct-dpif.h" +#include "openvswitch/ofp-ct.h" #include "openvswitch/ofp-parse.h" #include "openvswitch/vlog.h" @@ -101,24 +102,191 @@ ct_dpif_dump_done(struct ct_dpif_dump_state *dump) : EOPNOTSUPP); } +/* Flushing. 
*/ + +static void +ct_dpif_tuple_from_ofp_ct_tuple(const struct ofp_ct_tuple *ofp_tuple, + struct ct_dpif_tuple *tuple, + uint16_t l3_type, uint8_t ip_proto) +{ + if (l3_type == AF_INET) { + tuple->src.ip = in6_addr_get_mapped_ipv4(&ofp_tuple->src); + tuple->dst.ip = in6_addr_get_mapped_ipv4(&ofp_tuple->dst); + } else { + tuple->src.in6 = ofp_tuple->src; + tuple->dst.in6 = ofp_tuple->dst; + } + + tuple->l3_type = l3_type; + tuple->ip_proto = ip_proto; + tuple->src_port = ofp_tuple->src_port; + + if (ip_proto == IPPROTO_ICMP || ip_proto == IPPROTO_ICMPV6) { + tuple->icmp_code = ofp_tuple->icmp_code; + tuple->icmp_type = ofp_tuple->icmp_type; + } else { + tuple->dst_port = ofp_tuple->dst_port; + } +} + +static inline bool +ct_dpif_inet_addr_cmp_partial(const union ct_dpif_inet_addr *addr, + const struct in6_addr *partial, uint16_t l3_type) +{ + if (ipv6_is_zero(partial)) { + return true; + } + + if (l3_type == AF_INET && in6_addr_get_mapped_ipv4(partial) != addr->ip) { + return false; + } + + if (l3_type == AF_INET6 && !ipv6_addr_equals(partial, &addr->in6)) { + return false; + } + + return true; +} + +static inline bool +ct_dpif_tuple_ip_cmp_partial(const struct ct_dpif_tuple *tuple, + const struct ofp_ct_tuple *partial, + uint16_t l3_type, uint8_t ip_proto) +{ + if (!ct_dpif_inet_addr_cmp_partial(&tuple->src, &partial->src, l3_type)) { + return false; + } + + if (!ct_dpif_inet_addr_cmp_partial(&tuple->dst, &partial->dst, l3_type)) { + return false; + } + + if (ip_proto == IPPROTO_ICMP || ip_proto == IPPROTO_ICMPV6) { + if (partial->icmp_id != tuple->icmp_id) { + return false; + } + + if (partial->icmp_type != tuple->icmp_type) { + return false; + } + + if (partial->icmp_code != tuple->icmp_code) { + return false; + } + } else { + if (partial->src_port && partial->src_port != tuple->src_port) { + return false; + } + + if (partial->dst_port && partial->dst_port != tuple->dst_port) { + return false; + } + } + + return true; +} + +/* Returns 'true' if all non-zero 
members of 'match' equal to corresponding + * members of 'entry'. */ +static bool +ct_dpif_entry_cmp(const struct ct_dpif_entry *entry, + const struct ofp_ct_match *match) +{ + if (match->l3_type && match->l3_type != entry->tuple_orig.l3_type) { + return false; + } + + if (match->ip_proto && match->ip_proto != entry->tuple_orig.ip_proto) { + return false; + } + + if (!ct_dpif_tuple_ip_cmp_partial(&entry->tuple_orig, &match->tuple_orig, + match->l3_type, match->ip_proto)) { + return false; + } + + if (!ct_dpif_tuple_ip_cmp_partial(&entry->tuple_reply, &match->tuple_reply, + match->l3_type, match->ip_proto)) { + return false; + } + + return true; +} + +static int +ct_dpif_flush_tuple(struct dpif *dpif, const uint16_t *zone, + const struct ofp_ct_match *match) +{ + struct ct_dpif_dump_state *dump; + struct ct_dpif_entry cte; + int error; + int tot_bkts; + + if (!dpif->dpif_class->ct_flush) { + return EOPNOTSUPP; + } + + if (VLOG_IS_DBG_ENABLED()) { + struct ds ds = DS_EMPTY_INITIALIZER; + ofp_ct_match_format(&ds, match); + VLOG_DBG("%s: ct_flush: zone=%d %s", dpif_name(dpif), zone ? *zone : 0, + ds_cstr(&ds)); + ds_destroy(&ds); + } + + /* If we have full five tuple in original and empty reply tuple just + * do the flush over original tuple directly. 
*/ + if (ofp_ct_tuple_is_five_tuple(&match->tuple_orig, match->ip_proto) && + ofp_ct_tuple_is_zero(&match->tuple_reply, match->ip_proto)) { + struct ct_dpif_tuple tuple; + + ct_dpif_tuple_from_ofp_ct_tuple(&match->tuple_orig, &tuple, + match->l3_type, match->ip_proto); + return dpif->dpif_class->ct_flush(dpif, zone, &tuple); + } + + error = ct_dpif_dump_start(dpif, &dump, zone, &tot_bkts); + if (error) { + return error; + } + + while (!(error = ct_dpif_dump_next(dump, &cte))) { + if (zone && *zone != cte.zone) { + continue; + } + + if (ct_dpif_entry_cmp(&cte, match)) { + error = dpif->dpif_class->ct_flush(dpif, &cte.zone, + &cte.tuple_orig); + if (error) { + break; + } + } + } + if (error == EOF) { + error = 0; + } + + ct_dpif_dump_done(dump); + return error; +} + /* Flush the entries in the connection tracker used by 'dpif'. The * arguments have the following behavior: * - * - If both 'zone' and 'tuple' are NULL, flush all the conntrack entries. - * - If 'zone' is not NULL, and 'tuple' is NULL, flush all the conntrack + * - If both 'zone' is NULL and 'match' is NULL or zero, flush all the + * conntrack entries. + * - If 'zone' is not NULL, and 'match' is NULL, flush all the conntrack * entries in '*zone'. - * - If 'tuple' is not NULL, flush the conntrack entry specified by 'tuple' - * in '*zone'. If 'zone' is NULL, use the default zone (zone 0). */ + * - If 'match' is not NULL or zero, flush the conntrack entry specified + * by 'match' in '*zone'. If 'zone' is NULL, use the default zone + * (zone 0). */ int ct_dpif_flush(struct dpif *dpif, const uint16_t *zone, - const struct ct_dpif_tuple *tuple) + const struct ofp_ct_match *match) { - if (tuple) { - struct ds ds = DS_EMPTY_INITIALIZER; - ct_dpif_format_tuple(&ds, tuple); - VLOG_DBG("%s: ct_flush: %s in zone %d", dpif_name(dpif), ds_cstr(&ds), - zone ? 
*zone : 0); - ds_destroy(&ds); + if (match && !ofp_ct_match_is_zero(match)) { + return ct_dpif_flush_tuple(dpif, zone, match); } else if (zone) { VLOG_DBG("%s: ct_flush: zone %"PRIu16, dpif_name(dpif), *zone); } else { @@ -126,7 +294,7 @@ ct_dpif_flush(struct dpif *dpif, const uint16_t *zone, } return (dpif->dpif_class->ct_flush - ? dpif->dpif_class->ct_flush(dpif, zone, tuple) + ? dpif->dpif_class->ct_flush(dpif, zone, NULL) : EOPNOTSUPP); } @@ -583,112 +751,6 @@ ct_dpif_format_tcp_stat(struct ds * ds, int tcp_state, int conn_per_state) ds_put_format(ds, "=%u", conn_per_state); } -/* Parses a specification of a conntrack 5-tuple from 's' into 'tuple'. - * Returns true on success. Otherwise, returns false and puts the error - * message in 'ds'. */ -bool -ct_dpif_parse_tuple(struct ct_dpif_tuple *tuple, const char *s, struct ds *ds) -{ - char *pos, *key, *value, *copy; - memset(tuple, 0, sizeof *tuple); - - pos = copy = xstrdup(s); - while (ofputil_parse_key_value(&pos, &key, &value)) { - if (!*value) { - ds_put_format(ds, "field %s missing value", key); - goto error; - } - - if (!strcmp(key, "ct_nw_src") || !strcmp(key, "ct_nw_dst")) { - if (tuple->l3_type && tuple->l3_type != AF_INET) { - ds_put_cstr(ds, "L3 type set multiple times"); - goto error; - } else { - tuple->l3_type = AF_INET; - } - if (!ip_parse(value, key[6] == 's' ? &tuple->src.ip : - &tuple->dst.ip)) { - goto error_with_msg; - } - } else if (!strcmp(key, "ct_ipv6_src") || - !strcmp(key, "ct_ipv6_dst")) { - if (tuple->l3_type && tuple->l3_type != AF_INET6) { - ds_put_cstr(ds, "L3 type set multiple times"); - goto error; - } else { - tuple->l3_type = AF_INET6; - } - if (!ipv6_parse(value, key[8] == 's' ? 
&tuple->src.in6 : - &tuple->dst.in6)) { - goto error_with_msg; - } - } else if (!strcmp(key, "ct_nw_proto")) { - char *err = str_to_u8(value, key, &tuple->ip_proto); - if (err) { - free(err); - goto error_with_msg; - } - } else if (!strcmp(key, "ct_tp_src") || !strcmp(key,"ct_tp_dst")) { - uint16_t port; - char *err = str_to_u16(value, key, &port); - if (err) { - free(err); - goto error_with_msg; - } - if (key[6] == 's') { - tuple->src_port = htons(port); - } else { - tuple->dst_port = htons(port); - } - } else if (!strcmp(key, "icmp_type") || !strcmp(key, "icmp_code") || - !strcmp(key, "icmp_id") ) { - if (tuple->ip_proto != IPPROTO_ICMP && - tuple->ip_proto != IPPROTO_ICMPV6) { - ds_put_cstr(ds, "invalid L4 fields"); - goto error; - } - uint16_t icmp_id; - char *err; - if (key[5] == 't') { - err = str_to_u8(value, key, &tuple->icmp_type); - } else if (key[5] == 'c') { - err = str_to_u8(value, key, &tuple->icmp_code); - } else { - err = str_to_u16(value, key, &icmp_id); - tuple->icmp_id = htons(icmp_id); - } - if (err) { - free(err); - goto error_with_msg; - } - } else { - ds_put_format(ds, "invalid conntrack tuple field: %s", key); - goto error; - } - } - - if (ipv6_is_zero(&tuple->src.in6) || ipv6_is_zero(&tuple->dst.in6) || - !tuple->ip_proto) { - /* icmp_type, icmp_code, and icmp_id can be 0. 
*/ - if (tuple->ip_proto != IPPROTO_ICMP && - tuple->ip_proto != IPPROTO_ICMPV6) { - if (!tuple->src_port || !tuple->dst_port) { - ds_put_cstr(ds, "at least one of the conntrack 5-tuple fields " - "is missing."); - goto error; - } - } - } - - free(copy); - return true; - -error_with_msg: - ds_put_format(ds, "failed to parse field %s", key); -error: - free(copy); - return false; -} void ct_dpif_push_zone_limit(struct ovs_list *zone_limits, uint16_t zone, diff --git a/lib/ct-dpif.h b/lib/ct-dpif.h index 2848549b0ba..5edbbfd3bdc 100644 --- a/lib/ct-dpif.h +++ b/lib/ct-dpif.h @@ -20,6 +20,8 @@ #include "openvswitch/types.h" #include "packets.h" +struct ofp_ct_match; + union ct_dpif_inet_addr { ovs_be32 ip; ovs_be32 ip6[4]; @@ -285,7 +287,7 @@ int ct_dpif_dump_start(struct dpif *, struct ct_dpif_dump_state **, int ct_dpif_dump_next(struct ct_dpif_dump_state *, struct ct_dpif_entry *); int ct_dpif_dump_done(struct ct_dpif_dump_state *); int ct_dpif_flush(struct dpif *, const uint16_t *zone, - const struct ct_dpif_tuple *); + const struct ofp_ct_match *); int ct_dpif_set_maxconns(struct dpif *dpif, uint32_t maxconns); int ct_dpif_get_maxconns(struct dpif *dpif, uint32_t *maxconns); int ct_dpif_get_nconns(struct dpif *dpif, uint32_t *nconns); @@ -311,7 +313,6 @@ void ct_dpif_format_ipproto(struct ds *ds, uint16_t ipproto); void ct_dpif_format_tuple(struct ds *, const struct ct_dpif_tuple *); uint8_t ct_dpif_coalesce_tcp_state(uint8_t state); void ct_dpif_format_tcp_stat(struct ds *, int, int); -bool ct_dpif_parse_tuple(struct ct_dpif_tuple *, const char *s, struct ds *); void ct_dpif_push_zone_limit(struct ovs_list *, uint16_t zone, uint32_t limit, uint32_t count); void ct_dpif_free_zone_limits(struct ovs_list *); diff --git a/lib/dpctl.c b/lib/dpctl.c index 29041fa3e30..d12d9b8a5e8 100644 --- a/lib/dpctl.c +++ b/lib/dpctl.c @@ -41,6 +41,7 @@ #include "netlink.h" #include "odp-util.h" #include "openvswitch/ofpbuf.h" +#include "openvswitch/ofp-ct.h" #include "packets.h" 
#include "openvswitch/shash.h" #include "simap.h" @@ -1707,37 +1708,55 @@ dpctl_flush_conntrack(int argc, const char *argv[], struct dpctl_params *dpctl_p) { struct dpif *dpif = NULL; - struct ct_dpif_tuple tuple, *ptuple = NULL; + struct ofp_ct_match match = {0}; struct ds ds = DS_EMPTY_INITIALIZER; uint16_t zone, *pzone = NULL; int error; int args = argc - 1; - /* Parse ct tuple */ - if (args && ct_dpif_parse_tuple(&tuple, argv[args], &ds)) { - ptuple = &tuple; + /* Parse zone. */ + if (args && !strncmp(argv[1], "zone=", 5)) { + if (!ovs_scan(argv[1], "zone=%"SCNu16, &zone)) { + ds_put_cstr(&ds, "failed to parse zone"); + error = EINVAL; + goto error; + } + pzone = &zone; args--; } - /* Parse zone */ - if (args && ovs_scan(argv[args], "zone=%"SCNu16, &zone)) { - pzone = &zone; + /* Parse ct tuples. */ + for (int i = 0; i < 2; i++) { + if (!args) { + break; + } + + struct ofp_ct_tuple *tuple = + i ? &match.tuple_reply : &match.tuple_orig; + const char *arg = argv[argc - args]; + + if (arg[0] && !ofp_ct_tuple_parse(tuple, arg, &ds, &match.ip_proto, + &match.l3_type)) { + error = EINVAL; + goto error; + } args--; } - /* Report error if there are more than one unparsed argument. */ + /* Report error if there is more than one unparsed argument. 
*/ if (args > 1) { ds_put_cstr(&ds, "invalid arguments"); error = EINVAL; goto error; } - error = opt_dpif_open(argc, argv, dpctl_p, 4, &dpif); + error = opt_dpif_open(argc, argv, dpctl_p, 5, &dpif); if (error) { + dpctl_error(dpctl_p, error, "Cannot open dpif"); return error; } - error = ct_dpif_flush(dpif, pzone, ptuple); + error = ct_dpif_flush(dpif, pzone, &match); if (!error) { dpif_close(dpif); return 0; @@ -2862,8 +2881,8 @@ static const struct dpctl_command all_commands[] = { 0, 1, dpctl_offload_stats_show, DP_RO }, { "dump-conntrack", "[-m] [-s] [dp] [zone=N]", 0, 4, dpctl_dump_conntrack, DP_RO }, - { "flush-conntrack", "[dp] [zone=N] [ct-tuple]", 0, 3, - dpctl_flush_conntrack, DP_RW }, + { "flush-conntrack", "[dp] [zone=N] [ct-orig-tuple] [ct-reply-tuple]", + 0, 4, dpctl_flush_conntrack, DP_RW }, { "cache-get-size", "[dp]", 0, 1, dpctl_cache_get_size, DP_RO }, { "cache-set-size", "dp cache ", 3, 3, dpctl_cache_set_size, DP_RW }, { "ct-stats-show", "[dp] [zone=N]", diff --git a/lib/dpctl.man b/lib/dpctl.man index 87ea8087bb8..920446e8cb6 100644 --- a/lib/dpctl.man +++ b/lib/dpctl.man @@ -302,22 +302,30 @@ are included. With \fB\-\-statistics\fR timeouts and timestamps are added to the output. . .TP -\*(DX\fBflush\-conntrack\fR [\fIdp\fR] [\fBzone=\fIzone\fR] [\fIct-tuple\fR] +\*(DX\fBflush\-conntrack\fR [\fIdp\fR] [\fBzone=\fIzone\fR] [\fIct-origin-tuple\fR [\fIct-reply-tuple\fR]] Flushes the connection entries in the tracker used by \fIdp\fR based on -\fIzone\fR and connection tracking tuple \fIct-tuple\fR. +\fIzone\fR and connection tracking tuple \fIct-origin-tuple\fR. If \fIct-tuple\fR is not provided, flushes all the connection entries. If \fBzone\fR=\fIzone\fR is specified, only flushes the connections in \fIzone\fR. .IP -If \fIct-tuple\fR is provided, flushes the connection entry specified by -\fIct-tuple\fR in \fIzone\fR. The zone defaults to 0 if it is not provided. 
-The userspace connection tracker requires flushing with the original pre-NATed -tuple and a warning log will be otherwise generated. -An example of an IPv4 ICMP \fIct-tuple\fR: +If \fIct-[orig|reply]-tuple\fR is provided, flushes the connection entry +specified by \fIct-[orig|reply]-tuple\fR in \fIzone\fR. The zone defaults +to 0 if it is not provided. The userspace connection tracker requires flushing +with the original pre-NATed tuple and a warning log will be otherwise +generated. The tuple can be partial and will remove all connections that are +matching on the specified fields. In order to specify only +\fIct-reply-tuple\fR, provide empty string as \fIct-origin-tuple\fR. +.IP +Note: Currently there is a limitation for matching on ICMP, in order to +partially match on ICMP parameters the \fIct-[orig|reply]-tuple\fR has +to include either source or destination IP. +.IP +An example of an IPv4 ICMP \fIct-[orig|reply]-tuple\fR: .IP "ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2,ct_nw_proto=1,icmp_type=8,icmp_code=0,icmp_id=10" .IP -An example of an IPv6 TCP \fIct-tuple\fR: +An example of an IPv6 TCP \fIct-[orig|reply]-tuple\fR: .IP "ct_ipv6_src=fc00::1,ct_ipv6_dst=fc00::2,ct_nw_proto=6,ct_tp_src=1,ct_tp_dst=2" . diff --git a/lib/ofp-ct.c b/lib/ofp-ct.c new file mode 100644 index 00000000000..150caa9b3d5 --- /dev/null +++ b/lib/ofp-ct.c @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2023, Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "ct-dpif.h" +#include "openvswitch/ofp-ct.h" +#include "openvswitch/dynamic-string.h" +#include "openvswitch/ofp-parse.h" +#include "openvswitch/ofp-util.h" +#include "openvswitch/packets.h" + +static void +ofp_ct_tuple_format(struct ds *ds, const struct ofp_ct_tuple *tuple, + uint8_t ip_proto, uint16_t l3_type) +{ + ds_put_cstr(ds, l3_type == AF_INET ? "ct_nw_src=": "ct_ipv6_src="); + ipv6_format_mapped(&tuple->src, ds); + ds_put_cstr(ds, l3_type == AF_INET ? ",ct_nw_dst=": ",ct_ipv6_dst="); + ipv6_format_mapped(&tuple->dst, ds); + if (ip_proto == IPPROTO_ICMP || ip_proto == IPPROTO_ICMPV6) { + ds_put_format(ds, ",icmp_id=%u,icmp_type=%u,icmp_code=%u", + ntohs(tuple->icmp_id), tuple->icmp_type, + tuple->icmp_code); + } else { + ds_put_format(ds, ",ct_tp_src=%u,ct_tp_dst=%u", ntohs(tuple->src_port), + ntohs(tuple->dst_port)); + } +} + +bool +ofp_ct_tuple_is_zero(const struct ofp_ct_tuple *tuple, uint8_t ip_proto) +{ + bool is_zero = ipv6_is_zero(&tuple->src) && ipv6_is_zero(&tuple->dst); + + if (!(ip_proto == IPPROTO_ICMP || ip_proto == IPPROTO_ICMPV6)) { + is_zero = is_zero && !tuple->src_port && !tuple->dst_port; + } + + return is_zero; +} + +bool +ofp_ct_tuple_is_five_tuple(const struct ofp_ct_tuple *tuple, uint8_t ip_proto) +{ + /* First check if we have address. 
*/ + bool five_tuple = !ipv6_is_zero(&tuple->src) && !ipv6_is_zero(&tuple->dst); + + if (!(ip_proto == IPPROTO_ICMP || ip_proto == IPPROTO_ICMPV6)) { + five_tuple = five_tuple && tuple->src_port && tuple->dst_port; + } + + return five_tuple; +} + +bool +ofp_ct_match_is_zero(const struct ofp_ct_match *match) +{ + return !match->ip_proto && !match->l3_type && + ofp_ct_tuple_is_zero(&match->tuple_orig, match->ip_proto) && + ofp_ct_tuple_is_zero(&match->tuple_reply, match->ip_proto); +} + +void +ofp_ct_match_format(struct ds *ds, const struct ofp_ct_match *match) +{ + ds_put_cstr(ds, "'"); + ofp_ct_tuple_format(ds, &match->tuple_orig, match->ip_proto, + match->l3_type); + ds_put_format(ds, ",ct_nw_proto=%u' '", match->ip_proto); + ofp_ct_tuple_format(ds, &match->tuple_reply, match->ip_proto, + match->l3_type); + ds_put_cstr(ds, "'"); +} + +/* Parses a specification of a conntrack 5-tuple from 's' into 'tuple'. + * Returns true on success. Otherwise, returns false and puts the error + * message in 'ds'. */ +bool +ofp_ct_tuple_parse(struct ofp_ct_tuple *tuple, const char *s, + struct ds *ds, uint8_t *ip_proto, uint16_t *l3_type) +{ + char *pos, *key, *value, *copy; + + pos = copy = xstrdup(s); + while (ofputil_parse_key_value(&pos, &key, &value)) { + if (!*value) { + ds_put_format(ds, "field %s missing value", key); + goto error; + } + + if (!strcmp(key, "ct_nw_src") || !strcmp(key, "ct_nw_dst")) { + struct in6_addr *addr = key[6] == 's' ? &tuple->src : &tuple->dst; + + if (*l3_type && *l3_type != AF_INET) { + ds_put_format(ds ,"the L3 protocol does not match %s", value); + goto error; + } + + if (!ipv6_is_zero(addr)) { + ds_put_format(ds, "%s is set multiple times", key); + goto error; + } + + ovs_be32 ip = 0; + if (!ip_parse(value, &ip)) { + goto error_with_msg; + } + + *l3_type = AF_INET; + *addr = in6_addr_mapped_ipv4(ip); + } else if (!strcmp(key, "ct_ipv6_src") || + !strcmp(key, "ct_ipv6_dst")) { + struct in6_addr *addr = key[8] == 's' ? 
&tuple->src : &tuple->dst; + + if (*l3_type && *l3_type != AF_INET6) { + ds_put_format(ds, "the L3 protocol does not match %s", value); + goto error; + } + + if (!ipv6_is_zero(addr)) { + ds_put_format(ds, "%s is set multiple times", key); + goto error; + } + + + if (!ipv6_parse(value, addr)) { + goto error_with_msg; + } + + *l3_type = AF_INET6; + } else if (!strcmp(key, "ct_nw_proto")) { + if (*ip_proto) { + ds_put_format(ds, "%s is set multiple times", key); + } + char *err = str_to_u8(value, key, ip_proto); + + if (err) { + free(err); + goto error_with_msg; + } + } else if (!strcmp(key, "ct_tp_src") || !strcmp(key, "ct_tp_dst")) { + uint16_t port; + char *err = str_to_u16(value, key, &port); + + if (err) { + free(err); + goto error_with_msg; + } + if (key[6] == 's') { + tuple->src_port = htons(port); + } else { + tuple->dst_port = htons(port); + } + } else if (!strcmp(key, "icmp_type") || !strcmp(key, "icmp_code") || + !strcmp(key, "icmp_id")) { + if (*ip_proto != IPPROTO_ICMP && *ip_proto != IPPROTO_ICMPV6) { + ds_put_cstr(ds, "invalid L4 fields"); + goto error; + } + uint16_t icmp_id; + char *err; + + if (key[5] == 't') { + err = str_to_u8(value, key, &tuple->icmp_type); + } else if (key[5] == 'c') { + err = str_to_u8(value, key, &tuple->icmp_code); + } else { + err = str_to_u16(value, key, &icmp_id); + tuple->icmp_id = htons(icmp_id); + } + if (err) { + free(err); + goto error_with_msg; + } + } else { + ds_put_format(ds, "invalid conntrack tuple field: %s", key); + goto error; + } + } + + if (!*ip_proto && (tuple->src_port || tuple->dst_port)) { + ds_put_cstr(ds, "port is set without protocol"); + goto error; + } + + free(copy); + return true; + +error_with_msg: + ds_put_format(ds, "failed to parse field %s", key); +error: + free(copy); + return false; +} diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 08c78ff57e1..e7ec1d96b53 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -2278,7 +2278,7 @@ 
udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10. OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP -AT_SETUP([conntrack - ct flush by 5-tuple]) +AT_SETUP([conntrack - ct flush]) CHECK_CONNTRACK() OVS_TRAFFIC_VSWITCHD_START() @@ -2339,6 +2339,106 @@ AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=5 $ICMP_TUPLE]) AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=10\.1\.1\.2,"], [1], [dnl ]) +dnl Test UDP from port 1 and 2, partial flush by src port +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) + + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1" | sort], [0], [dnl +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1) +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 +]) + +AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_nw_proto=17,ct_tp_src=1']) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 +]) + +AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_nw_proto=17,ct_tp_src=2']) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [1]) + +dnl Test UDP from port 1 and 2, partial flush by dst port +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) + + 
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1" | sort], [0], [dnl +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1) +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 +]) + +AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_nw_proto=17,ct_tp_dst=2']) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 +]) + +AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_nw_proto=17,ct_tp_dst=1']) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [1]) + +dnl Test UDP from port 1 and 2, partial flush by src address +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) + + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1" | sort], [0], [dnl +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1) +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 +]) + +AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_nw_src=10.1.1.1']) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 +]) + +AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_nw_src=10.1.1.2']) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [1]) + +dnl Test UDP from port 1 and 2, partial flush by dst address +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 
packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) + + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1" | sort], [0], [dnl +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1) +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 +]) + +AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_nw_dst=10.1.1.2']) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 +]) + +AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_nw_dst=10.1.1.1']) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [1]) + +dnl Test UDP from port 1 and 2, partial flush by src address in reply direction +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) + + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1" | sort], [0], [dnl +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1) +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 +]) + +AT_CHECK([ovs-appctl dpctl/flush-conntrack '' 'ct_nw_src=10.1.1.2']) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl 
+udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 +]) + +AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=5 '' 'ct_nw_src=10.1.1.1']) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [1]) + OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP From 08146bf7d9b4ad635312901ae017370b0108c62f Mon Sep 17 00:00:00 2001 From: Ales Musil Date: Mon, 16 Jan 2023 12:45:08 +0100 Subject: [PATCH 125/833] openflow: Add extension to flush CT by generic match. Add an extension that allows flushing connections from CT by specifying fields that the connections should be matched against. This allows matching on only some fields of the connection, e.g. the source address for the orig direction. Reported-at: https://bugzilla.redhat.com/2120546 Signed-off-by: Ales Musil Signed-off-by: Ilya Maximets --- NEWS | 6 + include/openflow/nicira-ext.h | 37 +++++++ include/openvswitch/ofp-ct.h | 8 ++ include/openvswitch/ofp-msgs.h | 4 + lib/ofp-bundle.c | 1 + lib/ofp-ct.c | 196 +++++++++++++++++++++++++++++++++ lib/ofp-print.c | 20 ++++ lib/rconn.c | 1 + ofproto/ofproto-dpif.c | 9 +- ofproto/ofproto-provider.h | 7 +- ofproto/ofproto.c | 29 ++++- tests/ofp-print.at | 108 ++++++++++++++++++ tests/ovs-ofctl.at | 38 +++++++ tests/system-traffic.at | 38 ++++--- utilities/ovs-ofctl.8.in | 31 ++++++ utilities/ovs-ofctl.c | 51 +++++++++ 16 files changed, 562 insertions(+), 22 deletions(-) diff --git a/NEWS b/NEWS index 3685d7c159d..e31b092be8a 100644 --- a/NEWS +++ b/NEWS @@ -21,6 +21,9 @@ Post-v3.0.0 10 Gbps link speed by default in case the actual link speed cannot be determined. Previously it was 10 Mbps. Values can still be overridden by specifying 'max-rate' or '[r]stp-path-cost' accordingly. + - OpenFlow: + * New OpenFlow extension NXT_CT_FLUSH to flush connections matching + the specified fields. - ovs-ctl: * New option '--dump-hugepages' to include hugepages in core dumps.
This can assist with postmortem analysis involving DPDK, but may also produce @@ -28,6 +31,9 @@ Post-v3.0.0 - ovs-dpctl and 'ovs-appctl dpctl/' commands: * 'flush-conntrack' is now capable of handling partial 5-tuple, with additional optional parameter to specify the reply direction. + - ovs-ofctl: + * New command 'ct-flush' that accepts zone and 5-tuple (or partial + 5-tuple) for both directions. - Support for travis-ci.org based continuous integration builds has been dropped. - Userspace datapath: diff --git a/include/openflow/nicira-ext.h b/include/openflow/nicira-ext.h index b68804991aa..7687758985a 100644 --- a/include/openflow/nicira-ext.h +++ b/include/openflow/nicira-ext.h @@ -1064,4 +1064,41 @@ struct nx_zone_id { }; OFP_ASSERT(sizeof(struct nx_zone_id) == 8); +/* CT flush available TLVs. */ +enum nx_ct_flush_tlv_type { + /* Outer types. */ + NXT_CT_ORIG_TUPLE = 0, /* Outer type for original tuple TLV. + * Nested TLVs are specified + * by 'enum nx_ct_flush_tuple_tlv_type'. */ + NXT_CT_REPLY_TUPLE = 1, /* Outer type for reply tuple TLV. + * Nested TLVs are specified + * by 'enum nx_ct_flush_tuple_tlv_type'. */ + /* Primitive types. */ + NXT_CT_ZONE_ID = 2, /* be16 zone id. */ +}; + +/* CT flush nested TLVs. */ +enum nx_ct_flush_tuple_tlv_type { + NXT_CT_TUPLE_SRC = 0, /* IPv6 or mapped IPv4 address. */ + NXT_CT_TUPLE_DST = 1, /* IPv6 or mapped IPv4 address. */ + NXT_CT_TUPLE_SRC_PORT = 2, /* be16 source port. */ + NXT_CT_TUPLE_DST_PORT = 3, /* be16 destination port. */ + NXT_CT_TUPLE_ICMP_ID = 4, /* be16 ICMP id. */ + NXT_CT_TUPLE_ICMP_TYPE = 5, /* u8 ICMP type. */ + NXT_CT_TUPLE_ICMP_CODE = 6, /* u8 ICMP code. */ +}; + +/* NXT_CT_FLUSH. + * + * Flushes the connection tracking entries specified by 5-tuple. + * The struct should be followed by TLVs specifying the matching parameters. + * Currently there is a limitation for ICMP: in order to partially match on + * ICMP parameters the tuple should include at least SRC/DST.
*/ +struct nx_ct_flush { + uint8_t ip_proto; /* IP protocol. */ + uint8_t pad[7]; /* Align to 64 bits (must be zero). */ + /* Followed by optional TLVs of type 'enum nx_ct_flush_tlv_type'. */ +}; +OFP_ASSERT(sizeof(struct nx_ct_flush) == 8); + #endif /* openflow/nicira-ext.h */ diff --git a/include/openvswitch/ofp-ct.h b/include/openvswitch/ofp-ct.h index 3d919ddf974..c8023c3097e 100644 --- a/include/openvswitch/ofp-ct.h +++ b/include/openvswitch/ofp-ct.h @@ -22,6 +22,8 @@ #include #include +#include "openflow/nicira-ext.h" + #ifdef __cplusplus extern "C" { #endif @@ -59,6 +61,12 @@ void ofp_ct_match_format(struct ds *, const struct ofp_ct_match *); bool ofp_ct_tuple_parse(struct ofp_ct_tuple *, const char *, struct ds *, uint8_t *ip_proto, uint16_t *l3_type); +enum ofperr ofp_ct_match_decode(struct ofp_ct_match *, bool *with_zone, + uint16_t *zone_id, const struct ofp_header *); +struct ofpbuf *ofp_ct_match_encode(const struct ofp_ct_match *, + uint16_t *zone_id, + enum ofp_version version); + #ifdef __cplusplus } #endif diff --git a/include/openvswitch/ofp-msgs.h b/include/openvswitch/ofp-msgs.h index 921a937e5e3..708427fc041 100644 --- a/include/openvswitch/ofp-msgs.h +++ b/include/openvswitch/ofp-msgs.h @@ -515,6 +515,9 @@ enum ofpraw { /* NXT 1.0+ (29): struct nx_zone_id. */ OFPRAW_NXT_CT_FLUSH_ZONE, + /* NXT 1.0+ (32): struct nx_ct_flush, uint8_t[8][]. */ + OFPRAW_NXT_CT_FLUSH, + /* NXST 1.0+ (3): void. */ OFPRAW_NXST_IPFIX_BRIDGE_REQUEST, @@ -772,6 +775,7 @@ enum ofptype { OFPTYPE_IPFIX_FLOW_STATS_REQUEST, /* OFPRAW_NXST_IPFIX_FLOW_REQUEST */ OFPTYPE_IPFIX_FLOW_STATS_REPLY, /* OFPRAW_NXST_IPFIX_FLOW_REPLY */ OFPTYPE_CT_FLUSH_ZONE, /* OFPRAW_NXT_CT_FLUSH_ZONE. */ + OFPTYPE_CT_FLUSH, /* OFPRAW_NXT_CT_FLUSH. */ /* Flow monitor extension. */ OFPTYPE_FLOW_MONITOR_CANCEL, /* OFPRAW_NXT_FLOW_MONITOR_CANCEL. 
diff --git a/lib/ofp-bundle.c b/lib/ofp-bundle.c index 0161c2bc615..941a8370e08 100644 --- a/lib/ofp-bundle.c +++ b/lib/ofp-bundle.c @@ -292,6 +292,7 @@ ofputil_is_bundlable(enum ofptype type) case OFPTYPE_IPFIX_FLOW_STATS_REQUEST: case OFPTYPE_IPFIX_FLOW_STATS_REPLY: case OFPTYPE_CT_FLUSH_ZONE: + case OFPTYPE_CT_FLUSH: break; } diff --git a/lib/ofp-ct.c b/lib/ofp-ct.c index 150caa9b3d5..85a9d8beca7 100644 --- a/lib/ofp-ct.c +++ b/lib/ofp-ct.c @@ -23,8 +23,12 @@ #include "ct-dpif.h" #include "openvswitch/ofp-ct.h" +#include "openflow/nicira-ext.h" #include "openvswitch/dynamic-string.h" +#include "openvswitch/ofp-msgs.h" #include "openvswitch/ofp-parse.h" +#include "openvswitch/ofp-errors.h" +#include "openvswitch/ofp-prop.h" #include "openvswitch/ofp-util.h" #include "openvswitch/packets.h" @@ -211,3 +215,195 @@ ofp_ct_tuple_parse(struct ofp_ct_tuple *tuple, const char *s, free(copy); return false; } + +static enum ofperr +ofpprop_pull_ipv6(struct ofpbuf *property, struct in6_addr *addr, + uint16_t *l3_type) +{ + if (ofpbuf_msgsize(property) < sizeof *addr) { + return OFPERR_OFPBPC_BAD_LEN; + } + + memcpy(addr, property->msg, sizeof *addr); + + uint16_t l3 = 0; + if (!ipv6_is_zero(addr)) { + l3 = IN6_IS_ADDR_V4MAPPED(addr) ? 
AF_INET : AF_INET6; + } + + if (*l3_type && l3 && *l3_type != l3) { + return OFPERR_OFPBPC_BAD_VALUE; + } + + *l3_type = l3; + + return 0; +} + +static enum ofperr +ofp_ct_tuple_decode_nested(struct ofpbuf *property, struct ofp_ct_tuple *tuple, + uint16_t *l3_type) +{ + struct ofpbuf nested; + enum ofperr error = ofpprop_parse_nested(property, &nested); + if (error) { + return error; + } + + while (nested.size) { + struct ofpbuf inner; + uint64_t type; + + error = ofpprop_pull(&nested, &inner, &type); + if (error) { + return error; + } + switch (type) { + case NXT_CT_TUPLE_SRC: + error = ofpprop_pull_ipv6(&inner, &tuple->src, l3_type); + break; + + case NXT_CT_TUPLE_DST: + error = ofpprop_pull_ipv6(&inner, &tuple->dst, l3_type); + break; + + case NXT_CT_TUPLE_SRC_PORT: + error = ofpprop_parse_be16(&inner, &tuple->src_port); + break; + + case NXT_CT_TUPLE_DST_PORT: + error = ofpprop_parse_be16(&inner, &tuple->dst_port); + break; + + case NXT_CT_TUPLE_ICMP_ID: + error = ofpprop_parse_be16(&inner, &tuple->icmp_id); + break; + + case NXT_CT_TUPLE_ICMP_TYPE: + error = ofpprop_parse_u8(&inner, &tuple->icmp_type); + break; + + case NXT_CT_TUPLE_ICMP_CODE: + error = ofpprop_parse_u8(&inner, &tuple->icmp_code); + break; + } + + if (error) { + return error; + } + } + + return 0; +} + +static void +ofp_ct_tuple_encode(const struct ofp_ct_tuple *tuple, struct ofpbuf *buf, + enum nx_ct_flush_tlv_type type, uint8_t ip_proto) +{ + /* 128 B is enough to hold the whole tuple. 
*/ + uint8_t stub[128]; + struct ofpbuf nested = OFPBUF_STUB_INITIALIZER(stub); + + if (!ipv6_is_zero(&tuple->src)) { + ofpprop_put(&nested, NXT_CT_TUPLE_SRC, &tuple->src, sizeof tuple->src); + } + + if (!ipv6_is_zero(&tuple->dst)) { + ofpprop_put(&nested, NXT_CT_TUPLE_DST, &tuple->dst, sizeof tuple->dst); + } + + if (ip_proto == IPPROTO_ICMP || ip_proto == IPPROTO_ICMPV6) { + ofpprop_put_be16(&nested, NXT_CT_TUPLE_ICMP_ID, tuple->icmp_id); + ofpprop_put_u8(&nested, NXT_CT_TUPLE_ICMP_TYPE, tuple->icmp_type); + ofpprop_put_u8(&nested, NXT_CT_TUPLE_ICMP_CODE, tuple->icmp_code); + } else { + if (tuple->src_port) { + ofpprop_put_be16(&nested, NXT_CT_TUPLE_SRC_PORT, tuple->src_port); + } + + if (tuple->dst_port) { + ofpprop_put_be16(&nested, NXT_CT_TUPLE_DST_PORT, tuple->dst_port); + } + } + + if (nested.size) { + ofpprop_put_nested(buf, type, &nested); + } + + ofpbuf_uninit(&nested); +} + +enum ofperr +ofp_ct_match_decode(struct ofp_ct_match *match, bool *with_zone, + uint16_t *zone_id, const struct ofp_header *oh) +{ + struct ofpbuf msg = ofpbuf_const_initializer(oh, ntohs(oh->length)); + ofpraw_pull_assert(&msg); + + const struct nx_ct_flush *nx_flush = ofpbuf_pull(&msg, sizeof *nx_flush); + + if (!is_all_zeros(nx_flush->pad, sizeof nx_flush->pad)) { + return OFPERR_NXBRC_MUST_BE_ZERO; + } + + match->ip_proto = nx_flush->ip_proto; + + struct ofp_ct_tuple *orig = &match->tuple_orig; + struct ofp_ct_tuple *reply = &match->tuple_reply; + + while (msg.size) { + struct ofpbuf property; + uint64_t type; + + enum ofperr error = ofpprop_pull(&msg, &property, &type); + if (error) { + return error; + } + + switch (type) { + case NXT_CT_ORIG_TUPLE: + error = ofp_ct_tuple_decode_nested(&property, orig, + &match->l3_type); + break; + + case NXT_CT_REPLY_TUPLE: + error = ofp_ct_tuple_decode_nested(&property, reply, + &match->l3_type); + break; + + case NXT_CT_ZONE_ID: + if (with_zone) { + *with_zone = true; + } + error = ofpprop_parse_u16(&property, zone_id); + break; + } + + if 
(error) { + return error; + } + } + + return 0; +} + +struct ofpbuf * +ofp_ct_match_encode(const struct ofp_ct_match *match, uint16_t *zone_id, + enum ofp_version version) +{ + struct ofpbuf *msg = ofpraw_alloc(OFPRAW_NXT_CT_FLUSH, version, 0); + struct nx_ct_flush *nx_flush = ofpbuf_put_zeros(msg, sizeof *nx_flush); + const struct ofp_ct_tuple *orig = &match->tuple_orig; + const struct ofp_ct_tuple *reply = &match->tuple_reply; + + nx_flush->ip_proto = match->ip_proto; + + ofp_ct_tuple_encode(orig, msg, NXT_CT_ORIG_TUPLE,match->ip_proto); + ofp_ct_tuple_encode(reply, msg, NXT_CT_REPLY_TUPLE, match->ip_proto); + + if (zone_id) { + ofpprop_put_u16(msg, NXT_CT_ZONE_ID, *zone_id); + } + + return msg; +} diff --git a/lib/ofp-print.c b/lib/ofp-print.c index bd37fa17a59..874079b84b4 100644 --- a/lib/ofp-print.c +++ b/lib/ofp-print.c @@ -45,6 +45,7 @@ #include "openvswitch/ofp-actions.h" #include "openvswitch/ofp-bundle.h" #include "openvswitch/ofp-connection.h" +#include "openvswitch/ofp-ct.h" #include "openvswitch/ofp-errors.h" #include "openvswitch/ofp-group.h" #include "openvswitch/ofp-ipfix.h" @@ -949,6 +950,23 @@ ofp_print_nxt_ct_flush_zone(struct ds *string, const struct nx_zone_id *nzi) return 0; } +static enum ofperr +ofp_print_nxt_ct_flush(struct ds *string, const struct ofp_header *oh) +{ + uint16_t zone_id = 0; + struct ofp_ct_match match = {0}; + + enum ofperr error = ofp_ct_match_decode(&match, NULL, &zone_id, oh); + if (error) { + return error; + } + + ds_put_format(string, " zone=%"PRIu16" ", zone_id); + ofp_ct_match_format(string, &match); + + return 0; +} + static enum ofperr ofp_to_string__(const struct ofp_header *oh, const struct ofputil_port_map *port_map, @@ -1184,6 +1202,8 @@ ofp_to_string__(const struct ofp_header *oh, case OFPTYPE_CT_FLUSH_ZONE: return ofp_print_nxt_ct_flush_zone(string, ofpmsg_body(oh)); + case OFPTYPE_CT_FLUSH: + return ofp_print_nxt_ct_flush(string, oh); } return 0; diff --git a/lib/rconn.c b/lib/rconn.c index 
a96b2eb8bf4..4afa2151540 100644 --- a/lib/rconn.c +++ b/lib/rconn.c @@ -1426,6 +1426,7 @@ is_admitted_msg(const struct ofpbuf *b) case OFPTYPE_IPFIX_FLOW_STATS_REQUEST: case OFPTYPE_IPFIX_FLOW_STATS_REPLY: case OFPTYPE_CT_FLUSH_ZONE: + case OFPTYPE_CT_FLUSH: default: return true; } diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index f9562dee877..f87e27a8cd7 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -5358,11 +5358,12 @@ type_set_config(const char *type, const struct smap *other_config) } static void -ct_flush(const struct ofproto *ofproto_, const uint16_t *zone) +ct_flush(const struct ofproto *ofproto_, const uint16_t *zone, + const struct ofp_ct_match *match) { struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofproto_); - ct_dpif_flush(ofproto->backer->dpif, zone, NULL); + ct_dpif_flush(ofproto->backer->dpif, zone, match); } static struct ct_timeout_policy * @@ -5674,6 +5675,10 @@ get_datapath_cap(const char *datapath_type, struct smap *cap) smap_add(cap, "lb_output_action", s.lb_output_action ? "true" : "false"); smap_add(cap, "ct_zero_snat", s.ct_zero_snat ? "true" : "false"); smap_add(cap, "add_mpls", s.add_mpls ? "true" : "false"); + + /* The ct_tuple_flush is implemented on dpif level, so it is supported + * for all backers. */ + smap_add(cap, "ct_flush", "true"); } /* Gets timeout policy name in 'backer' based on 'zone', 'dl_type' and diff --git a/ofproto/ofproto-provider.h b/ofproto/ofproto-provider.h index 7e3fb669852..a84ddc1d06a 100644 --- a/ofproto/ofproto-provider.h +++ b/ofproto/ofproto-provider.h @@ -42,6 +42,7 @@ #include "ofproto/ofproto.h" #include "openvswitch/list.h" #include "openvswitch/ofp-actions.h" +#include "openvswitch/ofp-ct.h" #include "openvswitch/ofp-errors.h" #include "openvswitch/ofp-flow.h" #include "openvswitch/ofp-group.h" @@ -1902,8 +1903,10 @@ struct ofproto_class { /* ## Connection tracking ## */ /* ## ------------------- ## */ /* Flushes the connection tracking tables. 
If 'zone' is not NULL, - * only deletes connections in '*zone'. */ - void (*ct_flush)(const struct ofproto *, const uint16_t *zone); + * only deletes connections in '*zone'. If 'match' is not NULL, + * deletes connections specified by the match. */ + void (*ct_flush)(const struct ofproto *, const uint16_t *zone, + const struct ofp_ct_match *match); /* Sets conntrack timeout policy specified by 'timeout_policy' to 'zone' * in datapath type 'dp_type'. */ diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c index 3a527683cb3..17f636ed9dc 100644 --- a/ofproto/ofproto.c +++ b/ofproto/ofproto.c @@ -42,6 +42,7 @@ #include "openvswitch/meta-flow.h" #include "openvswitch/ofp-actions.h" #include "openvswitch/ofp-bundle.h" +#include "openvswitch/ofp-ct.h" #include "openvswitch/ofp-errors.h" #include "openvswitch/ofp-match.h" #include "openvswitch/ofp-msgs.h" @@ -934,7 +935,30 @@ handle_nxt_ct_flush_zone(struct ofconn *ofconn, const struct ofp_header *oh) uint16_t zone = ntohs(nzi->zone_id); if (ofproto->ofproto_class->ct_flush) { - ofproto->ofproto_class->ct_flush(ofproto, &zone); + ofproto->ofproto_class->ct_flush(ofproto, &zone, NULL); + } else { + return EOPNOTSUPP; + } + + return 0; +} + +static enum ofperr +handle_nxt_ct_flush(struct ofconn *ofconn, const struct ofp_header *oh) +{ + struct ofproto *ofproto = ofconn_get_ofproto(ofconn); + struct ofp_ct_match match = {0}; + bool with_zone = false; + uint16_t zone_id = 0; + + enum ofperr error = ofp_ct_match_decode(&match, &with_zone, &zone_id, oh); + if (error) { + return error; + } + + if (ofproto->ofproto_class->ct_flush) { + ofproto->ofproto_class->ct_flush(ofproto, with_zone ? 
&zone_id : NULL, + &match); } else { return EOPNOTSUPP; } @@ -8787,6 +8811,9 @@ handle_single_part_openflow(struct ofconn *ofconn, const struct ofp_header *oh, case OFPTYPE_CT_FLUSH_ZONE: return handle_nxt_ct_flush_zone(ofconn, oh); + case OFPTYPE_CT_FLUSH: + return handle_nxt_ct_flush(ofconn, oh); + case OFPTYPE_HELLO: case OFPTYPE_ERROR: case OFPTYPE_FEATURES_REPLY: diff --git a/tests/ofp-print.at b/tests/ofp-print.at index fe41cc42c7f..14aa5541694 100644 --- a/tests/ofp-print.at +++ b/tests/ofp-print.at @@ -4073,3 +4073,111 @@ AT_CHECK([ovs-ofctl ofp-print "\ NXT_CT_FLUSH_ZONE (xid=0x3): zone_id=13 ]) AT_CLEANUP + +AT_SETUP([NXT_CT_FLUSH]) +AT_KEYWORDS([ofp-print]) +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 18 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +"], [0], [dnl +NXT_CT_FLUSH (xid=0x3): zone=0 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0,ct_nw_proto=6' 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0' +]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 20 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 02 00 08 00 0d 00 00 \ +"], [0], [dnl +NXT_CT_FLUSH (xid=0x3): zone=13 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0,ct_nw_proto=6' 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0' +]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 68 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 02 00 08 00 0d 00 00 \ +00 00 00 48 00 00 00 00 \ +00 00 00 14 00 00 00 00 00 00 00 00 00 00 ff ff 0a 0a 00 01 00 00 00 00 \ +00 01 00 14 00 00 00 00 00 00 00 00 00 00 ff ff 0a 0a 00 02 00 00 00 00 \ +00 02 00 08 00 50 00 00 \ +00 03 00 08 1f 90 00 00 \ +"], [0], [dnl +NXT_CT_FLUSH (xid=0x3): zone=13 'ct_nw_src=10.10.0.1,ct_nw_dst=10.10.0.2,ct_tp_src=80,ct_tp_dst=8080,ct_nw_proto=6' 'ct_nw_src=::,ct_nw_dst=::,ct_tp_src=0,ct_tp_dst=0' +]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 68 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 02 00 08 00 0d 00 00 \ 
+00 01 00 48 00 00 00 00 \ +00 01 00 14 00 00 00 00 00 00 00 00 00 00 ff ff 0a 0a 00 01 00 00 00 00 \ +00 00 00 14 00 00 00 00 00 00 00 00 00 00 ff ff 0a 0a 00 02 00 00 00 00 \ +00 03 00 08 00 50 00 00 \ +00 02 00 08 1f 90 00 00 \ +"], [0], [dnl +NXT_CT_FLUSH (xid=0x3): zone=13 'ct_nw_src=::,ct_nw_dst=::,ct_tp_src=0,ct_tp_dst=0,ct_nw_proto=6' 'ct_nw_src=10.10.0.2,ct_nw_dst=10.10.0.1,ct_tp_src=8080,ct_tp_dst=80' +]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 b0 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 02 00 08 00 0d 00 00 \ +00 00 00 48 00 00 00 00 \ +00 00 00 14 00 00 00 00 00 00 00 00 00 00 ff ff 0a 0a 00 01 00 00 00 00 \ +00 01 00 14 00 00 00 00 00 00 00 00 00 00 ff ff 0a 0a 00 02 00 00 00 00 \ +00 02 00 08 00 50 00 00 \ +00 03 00 08 1f 90 00 00 \ +00 01 00 48 00 00 00 00 \ +00 01 00 14 00 00 00 00 00 00 00 00 00 00 ff ff 0a 0a 00 01 00 00 00 00 \ +00 00 00 14 00 00 00 00 00 00 00 00 00 00 ff ff 0a 0a 00 02 00 00 00 00 \ +00 03 00 08 00 50 00 00 \ +00 02 00 08 1f 90 00 00 \ +"], [0], [dnl +NXT_CT_FLUSH (xid=0x3): zone=13 'ct_nw_src=10.10.0.1,ct_nw_dst=10.10.0.2,ct_tp_src=80,ct_tp_dst=8080,ct_nw_proto=6' 'ct_nw_src=10.10.0.2,ct_nw_dst=10.10.0.1,ct_tp_src=8080,ct_tp_dst=80' +]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 b8 00 00 00 03 00 00 23 20 00 00 00 20 \ +01 \ +00 00 00 00 00 00 00 \ +00 00 00 50 00 00 00 00 \ +00 00 00 14 fd 18 00 00 00 00 00 00 00 00 ff ff ab cd 00 01 00 00 00 00 \ +00 01 00 14 fd 18 00 00 00 00 00 00 00 00 ff ff ab cd 00 02 00 00 00 00 \ +00 04 00 08 00 0a 00 00 \ +00 05 00 05 01 00 00 00 \ +00 06 00 05 02 00 00 00 \ +00 01 00 50 00 00 00 00 \ +00 01 00 14 fd 18 00 00 00 00 00 00 00 00 ff ff ab cd 00 02 00 00 00 00 \ +00 00 00 14 fd 18 00 00 00 00 00 00 00 00 ff ff ab cd 00 01 00 00 00 00 \ +00 04 00 08 00 0a 00 00 \ +00 05 00 05 03 00 00 00 \ +00 06 00 05 04 00 00 00 \ +"], [0], [dnl +NXT_CT_FLUSH (xid=0x3): zone=0 
'ct_ipv6_src=fd18::ffff:abcd:1,ct_ipv6_dst=fd18::ffff:abcd:2,icmp_id=10,icmp_type=1,icmp_code=2,ct_nw_proto=1' 'ct_ipv6_src=fd18::ffff:abcd:1,ct_ipv6_dst=fd18::ffff:abcd:2,icmp_id=10,icmp_type=3,icmp_code=4' +]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 58 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 02 00 08 00 0d 00 00 \ +00 00 00 38 00 00 00 00 \ +00 00 00 14 00 0a 00 00 00 00 00 00 00 00 ff ff 0a 0a 00 01 00 00 00 00 \ +00 01 00 14 00 00 00 00 00 00 00 00 00 00 ff ff 0a 0a 00 02 00 00 00 00 \ +" | grep -q OFPBPC_BAD_VALUE], [0]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 60 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 02 00 08 00 0d 00 00 \ +00 00 00 20 00 00 00 00 \ +00 00 00 14 00 0a 00 00 00 00 00 00 00 00 ff ff 0a 0a 00 01 00 00 00 00 \ +00 01 00 20 00 00 00 00 \ +00 00 00 14 00 00 00 00 00 00 00 00 00 00 ff ff 0a 0a 00 02 00 00 00 00 \ +" | grep -q OFPBPC_BAD_VALUE], [0]) +AT_CLEANUP diff --git a/tests/ovs-ofctl.at b/tests/ovs-ofctl.at index a8934051ef1..8531b2e2eb6 100644 --- a/tests/ovs-ofctl.at +++ b/tests/ovs-ofctl.at @@ -3271,3 +3271,41 @@ AT_CHECK([ovs-ofctl -O OpenFlow15 dump-flows br0 | ofctl_strip | sed '/OFPST_FLO OVS_VSWITCHD_STOP(["/Flow exceeded the maximum flow statistics reply size and was excluded from the response set/d"]) AT_CLEANUP + +AT_SETUP([ovs-ofctl ct-flush]) +OVS_VSWITCHD_START + +AT_CHECK([ovs-appctl vlog/set ct_dpif:dbg]) + +# Check flush conntrack with both zone and tuple +AT_CHECK([ovs-ofctl ct-flush br0 zone=5 'ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2,ct_nw_proto=17,ct_tp_src=1']) + +OVS_WAIT_UNTIL([test $(grep -c "|ct_dpif|DBG|.*ct_flush" ovs-vswitchd.log) -eq 1]) +AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: zone=5 'ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2,ct_tp_src=1,ct_tp_dst=0,ct_nw_proto=17' 'ct_nw_src=::,ct_nw_dst=::,ct_tp_src=0,ct_tp_dst=0'" ovs-vswitchd.log]) + +# Check flush-conntrack just with tuple +AT_CHECK([ovs-ofctl ct-flush br0 
'ct_nw_src=10.1.1.3,ct_nw_dst=10.1.1.4,ct_nw_proto=17,ct_tp_src=1']) + +OVS_WAIT_UNTIL([test $(grep -c "|ct_dpif|DBG|.*ct_flush" ovs-vswitchd.log) -eq 2]) +AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: zone=0 'ct_nw_src=10.1.1.3,ct_nw_dst=10.1.1.4,ct_tp_src=1,ct_tp_dst=0,ct_nw_proto=17' 'ct_nw_src=::,ct_nw_dst=::,ct_tp_src=0,ct_tp_dst=0'" ovs-vswitchd.log]) + +# Check flush-conntrack with reply tuple +AT_CHECK([ovs-ofctl ct-flush br0 '' 'ct_nw_src=10.1.1.3,ct_nw_dst=10.1.1.4,ct_nw_proto=17,ct_tp_src=1']) + +OVS_WAIT_UNTIL([test $(grep -c "|ct_dpif|DBG|.*ct_flush" ovs-vswitchd.log) -eq 3]) +AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: zone=0 'ct_nw_src=::,ct_nw_dst=::,ct_tp_src=0,ct_tp_dst=0,ct_nw_proto=17' 'ct_nw_src=10.1.1.3,ct_nw_dst=10.1.1.4,ct_tp_src=1,ct_tp_dst=0'" ovs-vswitchd.log]) + +# Check flush-conntrack with zone and reply tuple +AT_CHECK([ovs-ofctl ct-flush br0 zone=5 '' 'ct_nw_src=10.1.1.3,ct_nw_dst=10.1.1.4,ct_nw_proto=17,ct_tp_src=1']) + +OVS_WAIT_UNTIL([test $(grep -c "|ct_dpif|DBG|.*ct_flush" ovs-vswitchd.log) -eq 4]) +AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: zone=5 'ct_nw_src=::,ct_nw_dst=::,ct_tp_src=0,ct_tp_dst=0,ct_nw_proto=17' 'ct_nw_src=10.1.1.3,ct_nw_dst=10.1.1.4,ct_tp_src=1,ct_tp_dst=0'" ovs-vswitchd.log]) + +# Check flush-conntrack without any tuple and zone +AT_CHECK([ovs-ofctl ct-flush br0]) + +OVS_WAIT_UNTIL([test $(grep -c "|ct_dpif|DBG|.*ct_flush" ovs-vswitchd.log) -eq 5]) +AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: " ovs-vswitchd.log]) + +OVS_VSWITCHD_STOP +AT_CLEANUP diff --git a/tests/system-traffic.at b/tests/system-traffic.at index e7ec1d96b53..503455cc635 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -2298,6 +2298,10 @@ priority=100,in_port=2,icmp,action=ct(zone=5,commit),1 AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) +m4_foreach([FLUSH_CMD], [[ovs-appctl dpctl/flush-conntrack], + [ovs-ofctl ct-flush br0]], [ +AS_BOX([Testing with FLUSH_CMD]) + dnl Test UDP from port 1 AT_CHECK([ovs-ofctl -O 
OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) @@ -2305,10 +2309,10 @@ AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=10\.1\.1\.1,"], [], udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1) ]) -AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_nw_src=10.1.1.2,ct_nw_dst=10.1.1.1,ct_nw_proto=17,ct_tp_src=2,ct_tp_dst=1']) +AT_CHECK([FLUSH_CMD 'ct_nw_src=10.1.1.2,ct_nw_dst=10.1.1.1,ct_nw_proto=17,ct_tp_src=2,ct_tp_dst=1']) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=10\.1\.1\.1,"], [1]) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=10\.1\.1\.1,"], [1], [dnl -]) dnl Test UDP from port 2 AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) @@ -2317,10 +2321,9 @@ AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=10\.1\.1\.2,"], [0], udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 ]) -AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=5 'ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2,ct_nw_proto=17,ct_tp_src=1,ct_tp_dst=2']) +AT_CHECK([FLUSH_CMD zone=5 'ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2,ct_nw_proto=17,ct_tp_src=1,ct_tp_dst=2']) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0], [dnl -]) +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0]) dnl Test ICMP traffic NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -w 2 10.1.1.1 | FORMAT_PING], [0], [dnl @@ -2334,7 +2337,7 @@ icmp,orig=(src=10.1.1.2,dst=10.1.1.1,id=,type=8,code=0),reply=(src=10.1 ICMP_ID=`cat stdout | cut -d ',' -f4 | cut -d '=' -f2` ICMP_TUPLE=ct_nw_src=10.1.1.2,ct_nw_dst=10.1.1.1,ct_nw_proto=1,icmp_id=$ICMP_ID,icmp_type=8,icmp_code=0 -AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=5 
$ICMP_TUPLE]) +AT_CHECK([FLUSH_CMD zone=5 $ICMP_TUPLE]) AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=10\.1\.1\.2,"], [1], [dnl ]) @@ -2349,13 +2352,13 @@ udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10. udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 ]) -AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_nw_proto=17,ct_tp_src=1']) +AT_CHECK([FLUSH_CMD 'ct_nw_proto=17,ct_tp_src=1']) AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 ]) -AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_nw_proto=17,ct_tp_src=2']) +AT_CHECK([FLUSH_CMD 'ct_nw_proto=17,ct_tp_src=2']) AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [1]) @@ -2369,13 +2372,13 @@ udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10. udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 ]) -AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_nw_proto=17,ct_tp_dst=2']) +AT_CHECK([FLUSH_CMD 'ct_nw_proto=17,ct_tp_dst=2']) AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 ]) -AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_nw_proto=17,ct_tp_dst=1']) +AT_CHECK([FLUSH_CMD 'ct_nw_proto=17,ct_tp_dst=1']) AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [1]) @@ -2389,13 +2392,13 @@ udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10. 
udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 ]) -AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_nw_src=10.1.1.1']) +AT_CHECK([FLUSH_CMD 'ct_nw_src=10.1.1.1']) AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 ]) -AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_nw_src=10.1.1.2']) +AT_CHECK([FLUSH_CMD 'ct_nw_src=10.1.1.2']) AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [1]) @@ -2409,13 +2412,13 @@ udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10. udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 ]) -AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_nw_dst=10.1.1.2']) +AT_CHECK([FLUSH_CMD 'ct_nw_dst=10.1.1.2']) AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 ]) -AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_nw_dst=10.1.1.1']) +AT_CHECK([FLUSH_CMD 'ct_nw_dst=10.1.1.1']) AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [1]) @@ -2429,15 +2432,16 @@ udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10. 
udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 ]) -AT_CHECK([ovs-appctl dpctl/flush-conntrack '' 'ct_nw_src=10.1.1.2']) +AT_CHECK([FLUSH_CMD '' 'ct_nw_src=10.1.1.2']) AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 ]) -AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=5 '' 'ct_nw_src=10.1.1.1']) +AT_CHECK([FLUSH_CMD zone=5 '' 'ct_nw_src=10.1.1.1']) AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [1]) +]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP diff --git a/utilities/ovs-ofctl.8.in b/utilities/ovs-ofctl.8.in index 10a6a64de90..0a611b2ee23 100644 --- a/utilities/ovs-ofctl.8.in +++ b/utilities/ovs-ofctl.8.in @@ -296,6 +296,37 @@ Flushes the connection tracking entries in \fIzone\fR on \fIswitch\fR. This command uses an Open vSwitch extension that is only in Open vSwitch 2.6 and later. . +.IP "\fBct\-flush \fIswitch [zone=N] [ct-orig-tuple [ct-reply-tuple]]\fR +Flushes the connection entries on \fIswitch\fR based on \fIzone\fR and +connection tracking tuples \fIct-[orig|reply]-tuple\fR. +.IP +If \fIct-[orig|reply]-tuple\fR is not provided, flushes all the connection +entries. If \fIzone\fR is specified, only flushes the connections in +\fIzone\fR. +.IP +If \fIct-[orig|reply]-tuple\fR is provided, flushes the connection entry +specified by \fIct-[orig|reply]-tuple\fR in \fIzone\fR. The zone defaults +to 0 if it is not provided. The userspace connection tracker requires flushing +with the original pre-NATed tuple and a warning log will be otherwise +generated. The tuple can be partial and will remove all connections that are +matching on the specified fields. In order to specify only +\fIct-reply-tuple\fR, provide empty string as \fIct-orig-tuple\fR. 
+.IP +Note: Currently there is limitation for matching on ICMP, in order to partially +match on ICMP parameters the \fIct-[orig|reply]-tuple\fR has to include +either source or destination IP. +.IP +An example of an IPv4 ICMP \fIct-[orig|reply]-tuple\fR: +.IP +"ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2,ct_nw_proto=1,icmp_type=8,icmp_code=0,icmp_id=10" +.IP +An example of an IPv6 TCP \fIct-[orig|reply]-tuple\fR: +.IP +"ct_ipv6_src=fc00::1,ct_ipv6_dst=fc00::2,ct_nw_proto=6,ct_tp_src=1,ct_tp_dst=2" +.IP +This command uses an Open vSwitch extension that is only in Open vSwitch 3.1 +and later. +. .SS "OpenFlow Switch Flow Table Commands" . These commands manage the flow table in an OpenFlow switch. In each diff --git a/utilities/ovs-ofctl.c b/utilities/ovs-ofctl.c index fe911458027..eabec18a367 100644 --- a/utilities/ovs-ofctl.c +++ b/utilities/ovs-ofctl.c @@ -48,6 +48,7 @@ #include "openvswitch/meta-flow.h" #include "openvswitch/ofp-actions.h" #include "openvswitch/ofp-bundle.h" +#include "openvswitch/ofp-ct.h" #include "openvswitch/ofp-errors.h" #include "openvswitch/ofp-group.h" #include "openvswitch/ofp-match.h" @@ -485,6 +486,9 @@ usage(void) " dump-ipfix-bridge SWITCH print ipfix stats of bridge\n" " dump-ipfix-flow SWITCH print flow ipfix of a bridge\n" " ct-flush-zone SWITCH ZONE flush conntrack entries in ZONE\n" + " ct-flush SWITCH [ZONE] [CT_ORIG_TUPLE [CT_REPLY_TUPLE]]\n" + " flush conntrack entries specified\n" + " by CT_ORIG/REPLY_TUPLE and ZONE\n" "\nFor OpenFlow switches and controllers:\n" " probe TARGET probe whether TARGET is up\n" " ping TARGET [N] latency of N-byte echos\n" @@ -3050,6 +3054,50 @@ ofctl_ct_flush_zone(struct ovs_cmdl_context *ctx) vconn_close(vconn); } +static void +ofctl_ct_flush(struct ovs_cmdl_context *ctx) +{ + struct vconn *vconn; + struct ofp_ct_match match = {0}; + struct ds ds = DS_EMPTY_INITIALIZER; + uint16_t zone, *pzone = NULL; + int args = ctx->argc - 2; + + /* Parse zone. 
*/ + if (args && !strncmp(ctx->argv[2], "zone=", 5)) { + if (!ovs_scan(ctx->argv[2], "zone=%"SCNu16, &zone)) { + ovs_fatal(0, "Failed to parse zone"); + } + pzone = &zone; + args--; + } + + /* Parse ct tuples. */ + for (int i = 0; i < 2; i++) { + if (!args) { + break; + } + + struct ofp_ct_tuple *tuple = + i ? &match.tuple_reply : &match.tuple_orig; + const char *arg = ctx->argv[ctx->argc - args]; + + if (arg[0] && !ofp_ct_tuple_parse(tuple, arg, &ds, &match.ip_proto, + &match.l3_type)) { + ovs_fatal(0, "Failed to parse ct-tuple: %s", ds_cstr(&ds)); + } + args--; + } + + open_vconn(ctx->argv[1], &vconn); + enum ofp_version version = vconn_get_version(vconn); + struct ofpbuf *msg = ofp_ct_match_encode(&match, pzone, version); + + ds_destroy(&ds); + transact_noreply(vconn, msg); + vconn_close(vconn); +} + static void ofctl_dump_ipfix_flow(struct ovs_cmdl_context *ctx) { @@ -5063,6 +5111,9 @@ static const struct ovs_cmdl_command all_commands[] = { { "ct-flush-zone", "switch zone", 2, 2, ofctl_ct_flush_zone, OVS_RO }, + { "ct-flush", "switch [zone=N] [ct-orig-tuple [ct-reply-tuple]]", + 1, 4, ofctl_ct_flush, OVS_RO }, + { "ofp-parse", "file", 1, 1, ofctl_ofp_parse, OVS_RW }, { "ofp-parse-pcap", "pcap", From 8833e7c8edb0c19d4f8b5ba8f15ba0405ebb0cb1 Mon Sep 17 00:00:00 2001 From: Han Zhou Date: Thu, 12 Jan 2023 09:08:53 -0800 Subject: [PATCH 126/833] ovsdb-idl: Provide API to disable set_db_change_aware request. For ovsdb clients that are short-lived, e.g. when using ovn-nbctl/ovn-sbctl to read some metrics from the OVN NB/SB server, they don't really need to be aware of db changes, because they exit immediately after getting the initial response for the requested data. 
In such use cases, however, the clients still send 'set_db_change_aware' request, which results in server side error logs when the server tries to send out the response for the 'set_db_change_aware' request, because at the moment the client that is supposed to receive the request has already closed the connection and exited. E.g.: 2023-01-10T18:23:29.431Z|00007|jsonrpc|WARN|unix#3: receive error: Connection reset by peer 2023-01-10T18:23:29.431Z|00008|reconnect|WARN|unix#3: connection dropped (Connection reset by peer) To avoid such problems, this patch provides an API to allow a client to choose to not send the 'set_db_change_aware' request. There was an earlier attempt to fix this [0], but it was not accepted back then as discussed in the email [1]. It was also discussed in the emails that an alternative approach is to use notification instead of request, but that would require protocol changes and taking backward compatibility into consideration. So this patch takes a different approach and tries to keep the change small. [0] http://patchwork.ozlabs.org/project/openvswitch/patch/1594380801-32134-1-git-send-email-dceara@redhat.com/ [1] https://mail.openvswitch.org/pipermail/ovs-discuss/2021-February/050919.html Reported-by: Girish Moodalbail Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2020-July/050343.html Reported-by: Tobias Hofmann Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2021-February/050914.html Acked-by: Dumitru Ceara Signed-off-by: Han Zhou Signed-off-by: Ilya Maximets --- lib/ovsdb-cs.c | 22 +++++++++++++++++++++- lib/ovsdb-cs.h | 3 +++ lib/ovsdb-idl.c | 8 ++++++++ lib/ovsdb-idl.h | 2 ++ 4 files changed, 34 insertions(+), 1 deletion(-) diff --git a/lib/ovsdb-cs.c b/lib/ovsdb-cs.c index 0fca03d7231..c7c147cc02b 100644 --- a/lib/ovsdb-cs.c +++ b/lib/ovsdb-cs.c @@ -219,6 +219,9 @@ struct ovsdb_cs { struct uuid cid; struct hmap server_rows; + /* Whether to send 'set_db_change_aware'. 
*/ + bool set_db_change_aware; + /* Clustered servers. */ uint64_t min_index; /* Minimum allowed index, to avoid regression. */ bool leader_only; /* If true, do not connect to Raft followers. */ @@ -331,6 +334,7 @@ ovsdb_cs_create(const char *db_name, int max_version, cs->request_id = NULL; cs->leader_only = true; cs->shuffle_remotes = true; + cs->set_db_change_aware = true; hmap_init(&cs->server_rows); return cs; @@ -461,7 +465,7 @@ ovsdb_cs_process_response(struct ovsdb_cs *cs, struct jsonrpc_msg *msg) cs->server.monitor_version = cs->server.max_version; ovsdb_cs_db_parse_monitor_reply(&cs->server, msg->result, cs->server.monitor_version); - if (ovsdb_cs_check_server_db(cs)) { + if (ovsdb_cs_check_server_db(cs) && cs->set_db_change_aware) { ovsdb_cs_send_db_change_aware(cs); } } else { @@ -1150,6 +1154,22 @@ ovsdb_cs_send_cond_change(struct ovsdb_cs *cs) } } +/* Database change awareness. */ + +/* By default, or if 'set_db_change_aware' is true, 'cs' will send + * 'set_db_change_aware' request to the server after receiving the _SERVER data + * (when the server supports it), which is useful for clients that intends to + * keep long connections to the server. Otherwise, 'cs' will not send the + * 'set_db_change_aware' request, which is more reasonable for short-lived + * connections to avoid unnecessary processing at the server side and possible + * error handling due to connections being closed by the clients before the + * responses are sent by the server. */ +void +ovsdb_cs_set_db_change_aware(struct ovsdb_cs *cs, bool set_db_change_aware) +{ + cs->set_db_change_aware = set_db_change_aware; +} + /* Clustered servers. 
*/ /* By default, or if 'leader_only' is true, when 'cs' connects to a clustered diff --git a/lib/ovsdb-cs.h b/lib/ovsdb-cs.h index 5d5b58f0a0a..4cf9ca2b99c 100644 --- a/lib/ovsdb-cs.h +++ b/lib/ovsdb-cs.h @@ -142,6 +142,9 @@ unsigned int ovsdb_cs_set_condition(struct ovsdb_cs *, const char *table, const struct json *condition); unsigned int ovsdb_cs_get_condition_seqno(const struct ovsdb_cs *); +/* Database change awareness. */ +void ovsdb_cs_set_db_change_aware(struct ovsdb_cs *, bool set_db_change_aware); + /* Clustered servers. */ void ovsdb_cs_set_leader_only(struct ovsdb_cs *, bool leader_only); void ovsdb_cs_set_shuffle_remotes(struct ovsdb_cs *, bool shuffle); diff --git a/lib/ovsdb-idl.c b/lib/ovsdb-idl.c index dbdfe45d87e..634fbb56df2 100644 --- a/lib/ovsdb-idl.c +++ b/lib/ovsdb-idl.c @@ -321,6 +321,14 @@ ovsdb_idl_set_shuffle_remotes(struct ovsdb_idl *idl, bool shuffle) ovsdb_cs_set_shuffle_remotes(idl->cs, shuffle); } +/* Passes 'set_db_change_aware' to ovsdb_cs_set_db_change_aware(). See that + * function for documentation. */ +void +ovsdb_idl_set_db_change_aware(struct ovsdb_idl *idl, bool set_db_change_aware) +{ + ovsdb_cs_set_db_change_aware(idl->cs, set_db_change_aware); +} + /* Reset min_index to 0. This prevents a situation where the client * thinks all databases have stale data, when they actually have all * been destroyed and rebuilt from scratch. 
diff --git a/lib/ovsdb-idl.h b/lib/ovsdb-idl.h index 9a3e19f2055..86fd2bd36f2 100644 --- a/lib/ovsdb-idl.h +++ b/lib/ovsdb-idl.h @@ -66,6 +66,8 @@ struct ovsdb_idl *ovsdb_idl_create_unconnected( const struct ovsdb_idl_class *, bool monitor_everything_by_default); void ovsdb_idl_set_remote(struct ovsdb_idl *, const char *remote, bool retry); void ovsdb_idl_set_shuffle_remotes(struct ovsdb_idl *, bool shuffle); +void ovsdb_idl_set_db_change_aware(struct ovsdb_idl *, + bool set_db_change_aware); void ovsdb_idl_reset_min_index(struct ovsdb_idl *); void ovsdb_idl_destroy(struct ovsdb_idl *); From 43266915a486bf2f7d52909cf58731b0077f0075 Mon Sep 17 00:00:00 2001 From: Han Zhou Date: Thu, 12 Jan 2023 09:08:54 -0800 Subject: [PATCH 127/833] ovs-vsctl: Do not send 'set_db_change_aware'. ovs-vsctl's connections are short-lived, so it doesn't care about db status changes. Reported-by: Tobias Hofmann Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2021-February/050914.html Acked-by: Dumitru Ceara Signed-off-by: Han Zhou Signed-off-by: Ilya Maximets --- utilities/ovs-vsctl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/utilities/ovs-vsctl.c b/utilities/ovs-vsctl.c index c1d47000616..2f5ac1a2622 100644 --- a/utilities/ovs-vsctl.c +++ b/utilities/ovs-vsctl.c @@ -180,6 +180,7 @@ main(int argc, char *argv[]) ovsdb_idl_set_shuffle_remotes(idl, shuffle_remotes); ovsdb_idl_set_remote(idl, db, retry); ovsdb_idl_set_leader_only(idl, leader_only); + ovsdb_idl_set_db_change_aware(idl, false); run_prerequisites(commands, n_commands, idl); /* Execute the commands. From 8986d4d5564401eeef3dea828b51fe8bae2cc8aa Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 16 Jan 2023 17:09:54 +0100 Subject: [PATCH 128/833] Prepare for 3.1.0.
Acked-by: Aaron Conole Signed-off-by: Ilya Maximets --- Documentation/faq/releases.rst | 1 + NEWS | 2 +- configure.ac | 2 +- debian/changelog | 4 ++-- debian/rules | 4 ++-- 5 files changed, 7 insertions(+), 6 deletions(-) diff --git a/Documentation/faq/releases.rst b/Documentation/faq/releases.rst index 53ce230047c..9e1b4226200 100644 --- a/Documentation/faq/releases.rst +++ b/Documentation/faq/releases.rst @@ -216,6 +216,7 @@ Q: What DPDK version does each Open vSwitch release work with? 2.16.x 20.11.6 2.17.x 21.11.2 3.0.x 21.11.2 + 3.1.x 22.11.1 ============ ======== Q: Are all the DPDK releases that OVS versions work with maintained? diff --git a/NEWS b/NEWS index e31b092be8a..82acf8e72a0 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,4 @@ -Post-v3.0.0 +v3.1.0 - xx xxx xxxx -------------------- - ovs-vswitchd now detects changes in CPU affinity and adjusts the number of handler and revalidator threads if necessary. diff --git a/configure.ac b/configure.ac index adfd9f00679..9bf896c0133 100644 --- a/configure.ac +++ b/configure.ac @@ -13,7 +13,7 @@ # limitations under the License. 
AC_PREREQ(2.63) -AC_INIT(openvswitch, 3.0.90, bugs@openvswitch.org) +AC_INIT(openvswitch, 3.1.0, bugs@openvswitch.org) AC_CONFIG_SRCDIR([vswitchd/ovs-vswitchd.c]) AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_AUX_DIR([build-aux]) diff --git a/debian/changelog b/debian/changelog index 32cea72d9ca..6b99df5af01 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,8 +1,8 @@ -openvswitch (3.0.90-1) unstable; urgency=low +openvswitch (3.1.0-1) unstable; urgency=low * New upstream version - -- Open vSwitch team Mon, 15 Aug 2022 17:43:59 +0200 + -- Open vSwitch team Mon, 16 Jan 2023 16:51:00 +0100 openvswitch (3.0.0-1) unstable; urgency=low diff --git a/debian/rules b/debian/rules index ddbd4dc5c15..28c249d07cd 100755 --- a/debian/rules +++ b/debian/rules @@ -134,8 +134,8 @@ override_dh_python3: # Helper target for creating snapshots from upstream git DATE=$(shell date +%Y%m%d) # Upstream branch to track -BRANCH=branch-3.0 -VERSION=3.0.0 +BRANCH=branch-3.1 +VERSION=3.1.0 get-orig-snapshot: rm -Rf openvswitch-upstream From b02356ebbf6b049bfbd8938bbeb6cdb717001797 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 16 Jan 2023 17:09:55 +0100 Subject: [PATCH 129/833] Prepare for post-3.1.0 (3.1.90). Acked-by: Aaron Conole Signed-off-by: Ilya Maximets --- NEWS | 4 ++++ configure.ac | 2 +- debian/changelog | 6 ++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/NEWS b/NEWS index 82acf8e72a0..83c126b0024 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,7 @@ +Post-v3.1.0 +-------------------- + + v3.1.0 - xx xxx xxxx -------------------- - ovs-vswitchd now detects changes in CPU affinity and adjusts the number diff --git a/configure.ac b/configure.ac index 9bf896c0133..d05e544b549 100644 --- a/configure.ac +++ b/configure.ac @@ -13,7 +13,7 @@ # limitations under the License. 
AC_PREREQ(2.63) -AC_INIT(openvswitch, 3.1.0, bugs@openvswitch.org) +AC_INIT(openvswitch, 3.1.90, bugs@openvswitch.org) AC_CONFIG_SRCDIR([vswitchd/ovs-vswitchd.c]) AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_AUX_DIR([build-aux]) diff --git a/debian/changelog b/debian/changelog index 6b99df5af01..c62bb5646db 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +openvswitch (3.1.90-1) unstable; urgency=low + + * New upstream version + + -- Open vSwitch team Mon, 16 Jan 2023 16:51:01 +0100 + openvswitch (3.1.0-1) unstable; urgency=low * New upstream version From 7402dae8f463f9ca091efb69bb346bfa64c3935b Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 3 Jan 2023 17:22:36 +0100 Subject: [PATCH 130/833] ovsdb: Fix database statistics during the database replacement. The counter for the number of atoms has to be re-set to the number from the new database, otherwise the value will be incorrect. For example, this is causing the atom counter doubling after online conversion of a clustered database. Miscounting may also lead to increased memory consumption by the transaction history or otherwise too aggressive transaction history sweep. Fixes: 317b1bfd7dd3 ("ovsdb: Don't let transaction history grow larger than the database.") Acked-by: Han Zhou Signed-off-by: Ilya Maximets --- ovsdb/ovsdb.c | 3 +++ tests/ovsdb-server.at | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/ovsdb/ovsdb.c b/ovsdb/ovsdb.c index 11786f37660..afec96264ca 100644 --- a/ovsdb/ovsdb.c +++ b/ovsdb/ovsdb.c @@ -715,5 +715,8 @@ ovsdb_replace(struct ovsdb *dst, struct ovsdb *src) dst->rbac_role = ovsdb_get_table(dst, "RBAC_Role"); + /* Get statistics from the new database. 
*/ + dst->n_atoms = src->n_atoms; + ovsdb_destroy(src); } diff --git a/tests/ovsdb-server.at b/tests/ovsdb-server.at index 0828e6d04c1..bf539b6e5be 100644 --- a/tests/ovsdb-server.at +++ b/tests/ovsdb-server.at @@ -1308,6 +1308,24 @@ dnl After removing all the bridges, the number of atoms in the database dnl should return to its initial value. AT_CHECK([test $(get_memory_value atoms) -eq $initial_db_atoms]) +dnl Add a few more resources. +for i in $(seq 1 10); do + cmd=$(add_ports $i $(($i / 4 + 1))) + AT_CHECK([ovs-vsctl --no-wait add-br br$i $cmd]) +done +check_atoms + +db_atoms_before_conversion=$(get_memory_value atoms) + +dnl Trigger online conversion. +AT_CHECK([ovsdb-client convert $abs_top_srcdir/vswitchd/vswitch.ovsschema], + [0], [ignore], [ignore]) + +dnl Check that conversion didn't change the number of atoms and the history +dnl still has a reasonable size. +check_atoms +AT_CHECK([test $(get_memory_value atoms) -eq $db_atoms_before_conversion]) + OVS_APP_EXIT_AND_WAIT([ovsdb-server]) AT_CLEANUP From e24b68fa708c1c31388bee24ebe781dc49b284da Mon Sep 17 00:00:00 2001 From: David Marchand Date: Tue, 17 Jan 2023 09:04:25 +0100 Subject: [PATCH 131/833] netdev-dpdk: Fix deadlock due to virtqueue stats retrieval. As Ilya reported, we have an ABBA deadlock between DPDK vq->access_lock and OVS dev->mutex when OVS main thread refreshes statistics, while a vring state change event is being processed for the same vhost port. To break from this situation, move vring state change notifications handling from the vhost-events DPDK thread to a dedicated thread using a lockless queue. Besides, for the case when a bogus/malicious guest is sending continuous updates, add a counter of pending updates in the queue and warn if a threshold of 1000 entries is reached.
Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2023-January/401101.html Fixes: 3b29286db1c5 ("netdev-dpdk: Add per virtqueue statistics.") Reviewed-by: Maxime Coquelin Signed-off-by: David Marchand Signed-off-by: Ilya Maximets --- lib/netdev-dpdk.c | 122 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 102 insertions(+), 20 deletions(-) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 5e2d64651db..ab5b8223efd 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -49,6 +49,7 @@ #include "dpif-netdev.h" #include "fatal-signal.h" #include "if-notifier.h" +#include "mpsc-queue.h" #include "netdev-provider.h" #include "netdev-vport.h" #include "odp-util.h" @@ -4255,30 +4256,38 @@ destroy_device(int vid) } } -static int -vring_state_changed(int vid, uint16_t queue_id, int enable) +static struct mpsc_queue vhost_state_change_queue + = MPSC_QUEUE_INITIALIZER(&vhost_state_change_queue); +static atomic_uint64_t vhost_state_change_queue_size; + +struct vhost_state_change { + struct mpsc_queue_node node; + char ifname[IF_NAME_SZ]; + uint16_t queue_id; + int enable; +}; + +static void +vring_state_changed__(struct vhost_state_change *sc) { struct netdev_dpdk *dev; bool exists = false; - int qid = queue_id / VIRTIO_QNUM; - bool is_rx = (queue_id % VIRTIO_QNUM) == VIRTIO_TXQ; - char ifname[IF_NAME_SZ]; - - rte_vhost_get_ifname(vid, ifname, sizeof ifname); + int qid = sc->queue_id / VIRTIO_QNUM; + bool is_rx = (sc->queue_id % VIRTIO_QNUM) == VIRTIO_TXQ; ovs_mutex_lock(&dpdk_mutex); LIST_FOR_EACH (dev, list_node, &dpdk_list) { ovs_mutex_lock(&dev->mutex); - if (nullable_string_is_equal(ifname, dev->vhost_id)) { + if (nullable_string_is_equal(sc->ifname, dev->vhost_id)) { if (is_rx) { bool old_state = dev->vhost_rxq_enabled[qid]; - dev->vhost_rxq_enabled[qid] = enable != 0; + dev->vhost_rxq_enabled[qid] = sc->enable != 0; if (old_state != dev->vhost_rxq_enabled[qid]) { netdev_change_seq_changed(&dev->up); } } else { - if (enable) { + if 
(sc->enable) { dev->tx_q[qid].map = qid; } else { dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED; @@ -4295,11 +4304,69 @@ vring_state_changed(int vid, uint16_t queue_id, int enable) if (exists) { VLOG_INFO("State of queue %d ( %s_qid %d ) of vhost device '%s' " - "changed to \'%s\'", queue_id, is_rx == true ? "rx" : "tx", - qid, ifname, (enable == 1) ? "enabled" : "disabled"); + "changed to \'%s\'", sc->queue_id, is_rx ? "rx" : "tx", + qid, sc->ifname, sc->enable == 1 ? "enabled" : "disabled"); } else { - VLOG_INFO("vHost Device '%s' not found", ifname); - return -1; + VLOG_INFO("vHost Device '%s' not found", sc->ifname); + } +} + +#define NETDEV_DPDK_VHOST_EVENTS_BACKOFF_MIN 1 +#define NETDEV_DPDK_VHOST_EVENTS_BACKOFF_MAX 64 +static void * +netdev_dpdk_vhost_events_main(void *arg OVS_UNUSED) +{ + mpsc_queue_acquire(&vhost_state_change_queue); + + for (;;) { + struct mpsc_queue_node *node; + uint64_t backoff; + + backoff = NETDEV_DPDK_VHOST_EVENTS_BACKOFF_MIN; + while (mpsc_queue_tail(&vhost_state_change_queue) == NULL) { + xnanosleep(backoff * 1E6); + if (backoff < NETDEV_DPDK_VHOST_EVENTS_BACKOFF_MAX) { + backoff <<= 1; + } + } + + MPSC_QUEUE_FOR_EACH_POP (node, &vhost_state_change_queue) { + struct vhost_state_change *sc; + + sc = CONTAINER_OF(node, struct vhost_state_change, node); + vring_state_changed__(sc); + free(sc); + atomic_count_dec64(&vhost_state_change_queue_size); + } + } + + OVS_NOT_REACHED(); + mpsc_queue_release(&vhost_state_change_queue); + + return NULL; +} + +static int +vring_state_changed(int vid, uint16_t queue_id, int enable) +{ + static struct vlog_rate_limit vhost_rl = VLOG_RATE_LIMIT_INIT(5, 5); + struct vhost_state_change *sc; + + sc = xmalloc(sizeof *sc); + if (!rte_vhost_get_ifname(vid, sc->ifname, sizeof sc->ifname)) { + uint64_t queue_size; + + sc->queue_id = queue_id; + sc->enable = enable; + mpsc_queue_insert(&vhost_state_change_queue, &sc->node); + queue_size = atomic_count_inc64(&vhost_state_change_queue_size); + if (queue_size 
>= 1000) { + VLOG_WARN_RL(&vhost_rl, "vring state change queue has %"PRIu64" " + "entries. Last update was for socket %s.", queue_size, + sc->ifname); + } + } else { + free(sc); } return 0; @@ -4366,12 +4433,6 @@ netdev_dpdk_get_vid(const struct netdev_dpdk *dev) return ovsrcu_index_get(&dev->vid); } -struct ingress_policer * -netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev) -{ - return ovsrcu_get(struct ingress_policer *, &dev->ingress_policer); -} - static int netdev_dpdk_class_init(void) { @@ -4409,8 +4470,27 @@ netdev_dpdk_class_init(void) return 0; } +static int +netdev_dpdk_vhost_class_init(void) +{ + static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; + + if (ovsthread_once_start(&once)) { + ovs_thread_create("ovs_vhost", netdev_dpdk_vhost_events_main, NULL); + ovsthread_once_done(&once); + } + + return 0; +} + /* QoS Functions */ +struct ingress_policer * +netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev) +{ + return ovsrcu_get(struct ingress_policer *, &dev->ingress_policer); +} + /* * Initialize QoS configuration operations. */ @@ -5751,6 +5831,7 @@ static const struct netdev_class dpdk_class = { static const struct netdev_class dpdk_vhost_class = { .type = "dpdkvhostuser", NETDEV_DPDK_CLASS_COMMON, + .init = netdev_dpdk_vhost_class_init, .construct = netdev_dpdk_vhost_construct, .destruct = netdev_dpdk_vhost_destruct, .send = netdev_dpdk_vhost_send, @@ -5766,6 +5847,7 @@ static const struct netdev_class dpdk_vhost_class = { static const struct netdev_class dpdk_vhost_client_class = { .type = "dpdkvhostuserclient", NETDEV_DPDK_CLASS_COMMON, + .init = netdev_dpdk_vhost_class_init, .construct = netdev_dpdk_vhost_client_construct, .destruct = netdev_dpdk_vhost_destruct, .set_config = netdev_dpdk_vhost_client_set_config, From f62629a55894546ff043e8a116c3c57aff73c285 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Wed, 18 Jan 2023 16:23:55 +0000 Subject: [PATCH 132/833] dpif-netdev: Set timer slack for PMD threads. 
The default Linux timer slack groups timer expirations into 50 uS intervals. With some traffic patterns this can mean that returning to process packets after a sleep takes too long and packets are dropped. Add a helper to util.c and use it to reduce the timer slack for PMD threads, so that sleeps with smaller resolutions can be done to prevent sleeping for too long. Fixes: de3bbdc479a9 ("dpif-netdev: Add PMD load based sleeping.") Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2023-January/401121.html Reported-by: Ilya Maximets Signed-off-by: David Marchand Co-authored-by: Kevin Traynor Signed-off-by: Kevin Traynor Signed-off-by: Ilya Maximets --- Documentation/topics/dpdk/pmd.rst | 5 ----- lib/dpif-netdev.c | 4 ++++ lib/util.c | 16 ++++++++++++++++ lib/util.h | 1 + 4 files changed, 21 insertions(+), 5 deletions(-) diff --git a/Documentation/topics/dpdk/pmd.rst b/Documentation/topics/dpdk/pmd.rst index 604ac3f6b1d..0c3bb717f85 100644 --- a/Documentation/topics/dpdk/pmd.rst +++ b/Documentation/topics/dpdk/pmd.rst @@ -373,10 +373,5 @@ system configuration (e.g. enabling processor C-states) and workloads. extra latency before the PMD thread returns to processing packets at full rate. -.. note:: - - By default Linux kernel groups timer expirations and this can add an - overhead of up to 50 microseconds to a requested timer expiration. - .. _ovs-vswitchd(8): http://openvswitch.org/support/dist-docs/ovs-vswitchd.8.html diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index a47d54c6fde..4f06e3f4eb1 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -171,6 +171,9 @@ static struct odp_support dp_netdev_support = { /* Time in microseconds to try RCU quiescing. */ #define PMD_RCU_QUIESCE_INTERVAL 10000LL +/* Timer resolution for PMD threads in nanoseconds. */ +#define PMD_TIMER_RES_NS 1000 + /* Number of pkts Rx on an interface that will stop pmd thread sleeping.
*/ #define PMD_SLEEP_THRESH (NETDEV_MAX_BURST / 2) /* Time in uS to increment a pmd thread sleep time. */ @@ -6962,6 +6965,7 @@ pmd_thread_main(void *f_) poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list); dfc_cache_init(&pmd->flow_cache); pmd_alloc_static_tx_qid(pmd); + set_timer_resolution(PMD_TIMER_RES_NS); reload: atomic_count_init(&pmd->pmd_overloaded, 0); diff --git a/lib/util.c b/lib/util.c index 7576eb06eb3..96a71550d91 100644 --- a/lib/util.c +++ b/lib/util.c @@ -25,6 +25,9 @@ #include #include #include +#ifdef __linux__ +#include +#endif #include #include #include "bitmap.h" @@ -2419,6 +2422,19 @@ xnanosleep_no_quiesce(uint64_t nanoseconds) xnanosleep__(nanoseconds); } +#if __linux__ +void +set_timer_resolution(unsigned long nanoseconds) +{ + prctl(PR_SET_TIMERSLACK, nanoseconds); +} +#else +void +set_timer_resolution(unsigned long nanoseconds OVS_UNUSED) +{ +} +#endif + /* Determine whether standard output is a tty or not. This is useful to decide * whether to use color output or not when --color option for utilities is set * to `auto`. diff --git a/lib/util.h b/lib/util.h index f35f330217c..62801e85f55 100644 --- a/lib/util.h +++ b/lib/util.h @@ -594,6 +594,7 @@ ovs_u128_is_superset(ovs_u128 super, ovs_u128 sub) void xsleep(unsigned int seconds); void xnanosleep(uint64_t nanoseconds); void xnanosleep_no_quiesce(uint64_t nanoseconds); +void set_timer_resolution(unsigned long nanoseconds); bool is_stdout_a_tty(void); From 948767a18d5c2fb73baf85e62f52fd5da990d077 Mon Sep 17 00:00:00 2001 From: Kevin Traynor Date: Wed, 18 Jan 2023 16:23:56 +0000 Subject: [PATCH 133/833] dpif-netdev: Set PMD load based sleep start/inc to 1 us. Now that the timer slack for the PMD threads is reduced we can also reduce the start/increment for PMD load based sleeping to match it. This will further reduce initial sleep times making it more resilient to interfaces that might be sensitive to large sleep times. 
Signed-off-by: Kevin Traynor Reviewed-by: David Marchand Signed-off-by: Ilya Maximets --- Documentation/topics/dpdk/pmd.rst | 15 ++++++--------- lib/dpif-netdev.c | 3 +-- tests/pmd.at | 6 +++--- vswitchd/vswitch.xml | 4 ---- 4 files changed, 10 insertions(+), 18 deletions(-) diff --git a/Documentation/topics/dpdk/pmd.rst b/Documentation/topics/dpdk/pmd.rst index 0c3bb717f85..e70986d16b2 100644 --- a/Documentation/topics/dpdk/pmd.rst +++ b/Documentation/topics/dpdk/pmd.rst @@ -334,10 +334,7 @@ when there is no load or very-low load on all the Rx queues they poll. This can be enabled by setting the max requested sleep time (in microseconds) for a PMD thread:: - $ ovs-vsctl set open_vswitch . other_config:pmd-maxsleep=500 - -Non-zero values will be rounded up to the nearest 10 microseconds to avoid -requesting very small sleep times. + $ ovs-vsctl set open_vswitch . other_config:pmd-maxsleep=50 With a non-zero max value a PMD may request to sleep by an incrementing amount of time up to the maximum time. If at any point the threshold of at least half @@ -356,12 +353,12 @@ Sleep time statistics for 10 secs can be seen with:: $ ovs-appctl dpif-netdev/pmd-stats-clear \ && sleep 10 && ovs-appctl dpif-netdev/pmd-perf-show -Example output, showing that during the last 10 seconds, 76.8% of iterations -had a sleep of some length. The total amount of sleep time was 9.15 seconds and -the average sleep time per iteration was 46 microseconds:: +Example output, showing that during the last 10 seconds, 74.5% of iterations +had a sleep of some length. The total amount of sleep time was 9.06 seconds +and the average sleep time where a sleep was requested was 9 microseconds:: - - sleep iterations: 153994 ( 76.8 % of iterations) - Sleep time (us): 9159399 ( 59 us/iteration avg.) + - sleep iterations: 977037 ( 74.5 % of iterations) + Sleep time (us): 9068841 ( 9 us/iteration avg.) Any potential power saving from PMD load based sleeping is dependent on the system configuration (e.g. 
enabling processor C-states) and workloads. diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 4f06e3f4eb1..c9f7179c3b4 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -177,7 +177,7 @@ static struct odp_support dp_netdev_support = { /* Number of pkts Rx on an interface that will stop pmd thread sleeping. */ #define PMD_SLEEP_THRESH (NETDEV_MAX_BURST / 2) /* Time in uS to increment a pmd thread sleep time. */ -#define PMD_SLEEP_INC_US 10 +#define PMD_SLEEP_INC_US 1 struct dpcls { struct cmap_node node; /* Within dp_netdev_pmd_thread.classifiers */ @@ -4983,7 +4983,6 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config) set_pmd_auto_lb(dp, autolb_state, log_autolb); pmd_max_sleep = smap_get_ullong(other_config, "pmd-maxsleep", 0); - pmd_max_sleep = ROUND_UP(pmd_max_sleep, 10); pmd_max_sleep = MIN(PMD_RCU_QUIESCE_INTERVAL, pmd_max_sleep); atomic_read_relaxed(&dp->pmd_max_sleep, &cur_pmd_max_sleep); if (first_set_config || pmd_max_sleep != cur_pmd_max_sleep) { diff --git a/tests/pmd.at b/tests/pmd.at index e0f58f7a606..c707f762c78 100644 --- a/tests/pmd.at +++ b/tests/pmd.at @@ -1266,7 +1266,7 @@ OVS_WAIT_UNTIL([tail ovs-vswitchd.log | grep "PMD load based sleeps are disabled dnl Check low value max sleep get_log_next_line_num AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-maxsleep="1"]) -OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD max sleep request is 10 usecs."]) +OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD max sleep request is 1 usecs."]) OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD load based sleeps are enabled."]) dnl Check high value max sleep @@ -1294,8 +1294,8 @@ OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD max sleep request OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD load based sleeps are enabled."]) dnl Check rounding get_log_next_line_num -AT_CHECK([ovs-vsctl set open_vswitch . 
other_config:pmd-maxsleep="491"]) -OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD max sleep request is 500 usecs."]) +AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-maxsleep="499"]) +OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD max sleep request is 499 usecs."]) OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD load based sleeps are enabled."]) OVS_VSWITCHD_STOP diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 8c4acfb1817..2b57fc0e3f1 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -806,10 +806,6 @@ that the PMD will not sleep regardless of the load from the Rx queues that it polls.

-

- To avoid requesting very small sleeps (e.g. less than 10 us) the - value will be rounded up to the nearest 10 us. -

The maximum value is 10000 microseconds.

From e0e4266a90a28c2313cde6951d32d70680af327d Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 3 Jan 2023 18:47:35 +0100 Subject: [PATCH 134/833] ovsdb-types: Add functions to compare types for equality. Will be used in the next commit to optimize database conversion. Acked-by: Han Zhou Signed-off-by: Ilya Maximets --- lib/ovsdb-types.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++ lib/ovsdb-types.h | 3 +++ 2 files changed, 64 insertions(+) diff --git a/lib/ovsdb-types.c b/lib/ovsdb-types.c index 61efe59cffa..197cee1c67a 100644 --- a/lib/ovsdb-types.c +++ b/lib/ovsdb-types.c @@ -275,6 +275,58 @@ ovsdb_base_type_is_valid(const struct ovsdb_base_type *base) } } +bool +ovsdb_base_type_equals(const struct ovsdb_base_type *a, + const struct ovsdb_base_type *b) +{ + if (a == b) { + return true; + } + + if (a->type != b->type) { + return false; + } + + if ((a->enum_ && !b->enum_) || (!a->enum_ && b->enum_)) { + return false; + } else if (a->enum_ && + !ovsdb_datum_equals(a->enum_, b->enum_, + ovsdb_base_type_get_enum_type(a->type))) { + return false; + } + + switch (a->type) { + case OVSDB_TYPE_VOID: + return true; + + case OVSDB_TYPE_INTEGER: + return a->integer.min == b->integer.min + && a->integer.max == b->integer.max; + + case OVSDB_TYPE_REAL: + return a->real.min == b->real.min && a->real.max == b->real.max; + + case OVSDB_TYPE_BOOLEAN: + return true; + + case OVSDB_TYPE_STRING: + return a->string.minLen == b->string.minLen + && a->string.maxLen == b->string.maxLen; + + case OVSDB_TYPE_UUID: + /* Not comparing the table pointer here, only the table name, as this + * function can be used to compare types from different databases, so + * pointers will be different. 
*/ + return a->uuid.refType == b->uuid.refType + && nullable_string_is_equal(a->uuid.refTableName, + b->uuid.refTableName); + + case OVSDB_N_TYPES: + default: + OVS_NOT_REACHED(); + } +} + bool ovsdb_base_type_has_constraints(const struct ovsdb_base_type *base) { @@ -568,6 +620,15 @@ ovsdb_type_is_valid(const struct ovsdb_type *type) && type->n_max >= 1); } +bool +ovsdb_type_equals(const struct ovsdb_type *a, const struct ovsdb_type *b) +{ + return ovsdb_base_type_equals(&a->key, &b->key) + && ovsdb_base_type_equals(&a->value, &b->value) + && a->n_min == b->n_min + && a->n_max == b->n_max; +} + static struct ovsdb_error * n_from_json(const struct json *json, unsigned int *n) { diff --git a/lib/ovsdb-types.h b/lib/ovsdb-types.h index b9eb0928df6..9777efea332 100644 --- a/lib/ovsdb-types.h +++ b/lib/ovsdb-types.h @@ -107,6 +107,8 @@ void ovsdb_base_type_clone(struct ovsdb_base_type *, void ovsdb_base_type_destroy(struct ovsdb_base_type *); bool ovsdb_base_type_is_valid(const struct ovsdb_base_type *); +bool ovsdb_base_type_equals(const struct ovsdb_base_type *, + const struct ovsdb_base_type *); bool ovsdb_base_type_has_constraints(const struct ovsdb_base_type *); void ovsdb_base_type_clear_constraints(struct ovsdb_base_type *); const struct ovsdb_type *ovsdb_base_type_get_enum_type(enum ovsdb_atomic_type); @@ -157,6 +159,7 @@ void ovsdb_type_clone(struct ovsdb_type *, const struct ovsdb_type *); void ovsdb_type_destroy(struct ovsdb_type *); bool ovsdb_type_is_valid(const struct ovsdb_type *); +bool ovsdb_type_equals(const struct ovsdb_type *, const struct ovsdb_type *); static inline bool ovsdb_type_is_scalar(const struct ovsdb_type *); static inline bool ovsdb_type_is_optional(const struct ovsdb_type *); From b7f540129b587616d9977020bb529dcd8490f5c4 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 3 Jan 2023 18:47:36 +0100 Subject: [PATCH 135/833] ovsdb: Don't convert unchanged columns during database conversion. 
Column conversion involves converting it to json and back. These are heavy operations and completely unnecessary if the column type didn't change. Most of the time schema changes only add new columns/tables without changing existing ones at all. Clone the column instead to save some time. This will also save time while destroying the original database since we will only need to reduce reference counters on unchanged datum objects that were cloned instead of actually freeing them. Additionally, moving the column lookup into a separate loop, so we don't perform an shash lookup for each column of each row. Testing with 440 MB OVN_Southbound database shows 70% speed up of the ovsdb_convert() function. Execution time reduced from 15 to 4.4 seconds, 3.5 of which is a post-conversion transaction replay. Overall time required for the online database conversion reduced from 37 to 25 seconds. Acked-by: Han Zhou Signed-off-by: Ilya Maximets --- ovsdb/file.c | 59 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 11 deletions(-) diff --git a/ovsdb/file.c b/ovsdb/file.c index fdc289ad1b7..2d887e53ebc 100644 --- a/ovsdb/file.c +++ b/ovsdb/file.c @@ -270,22 +270,48 @@ ovsdb_convert_table(struct ovsdb_txn *txn, const struct ovsdb_table *src_table, struct ovsdb_table *dst_table) { + const struct ovsdb_column **dst_columns; + struct ovsdb_error *error = NULL; const struct ovsdb_row *src_row; + unsigned long *src_equal; + struct shash_node *node; + size_t n_src_columns; + + n_src_columns = shash_count(&src_table->schema->columns); + src_equal = bitmap_allocate(n_src_columns); + dst_columns = xzalloc(n_src_columns * sizeof *dst_columns); + + SHASH_FOR_EACH (node, &src_table->schema->columns) { + const struct ovsdb_column *src_column = node->data; + + if (src_column->index == OVSDB_COL_UUID || + src_column->index == OVSDB_COL_VERSION) { + continue; + } + + const struct ovsdb_column *dst_column = + shash_find_data(&dst_table->schema->columns, 
src_column->name); + + if (!dst_column) { + continue; + } + + dst_columns[src_column->index] = dst_column; + + if (ovsdb_type_equals(&src_column->type, &dst_column->type)) { + bitmap_set1(src_equal, src_column->index); + } + } + HMAP_FOR_EACH (src_row, hmap_node, &src_table->rows) { struct ovsdb_row *dst_row = ovsdb_row_create(dst_table); *ovsdb_row_get_uuid_rw(dst_row) = *ovsdb_row_get_uuid(src_row); - struct shash_node *node; SHASH_FOR_EACH (node, &src_table->schema->columns) { const struct ovsdb_column *src_column = node->data; - if (src_column->index == OVSDB_COL_UUID || - src_column->index == OVSDB_COL_VERSION) { - continue; - } + const struct ovsdb_column *dst_column; - const struct ovsdb_column *dst_column - = shash_find_data(&dst_table->schema->columns, - src_column->name); + dst_column = dst_columns[src_column->index]; if (!dst_column) { continue; } @@ -293,19 +319,30 @@ ovsdb_convert_table(struct ovsdb_txn *txn, ovsdb_datum_destroy(&dst_row->fields[dst_column->index], &dst_column->type); - struct ovsdb_error *error = ovsdb_datum_convert( + if (bitmap_is_set(src_equal, src_column->index)) { + /* This column didn't change - no need to convert. */ + ovsdb_datum_clone(&dst_row->fields[dst_column->index], + &src_row->fields[src_column->index]); + continue; + } + + error = ovsdb_datum_convert( &dst_row->fields[dst_column->index], &dst_column->type, &src_row->fields[src_column->index], &src_column->type); if (error) { ovsdb_datum_init_empty(&dst_row->fields[dst_column->index]); ovsdb_row_destroy(dst_row); - return error; + goto exit; } } ovsdb_txn_row_insert(txn, dst_row); } - return NULL; + +exit: + free(dst_columns); + bitmap_free(src_equal); + return error; } /* Copies the data in 'src', converts it into the schema specified in From ebaee446240133f5ec5064553535dfe392f60999 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 9 Jan 2023 19:28:46 +0100 Subject: [PATCH 136/833] netdev-dpdk: Free mbufs in bulk. 
rte_pktmbuf_free_bulk() function was introduced in 19.11 and became stable in 21.11. Use it to free arrays of mbufs instead of freeing packets one by one. In simple V2V testing with 64B packets, 2 PMD threads and bidirectional traffic this change improves performance by 3.5 - 4.5 %. Reviewed-by: David Marchand Acked-by: Kevin Traynor Signed-off-by: Ilya Maximets --- lib/netdev-dpdk.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index ab5b8223efd..fb0dd43f75c 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -2287,13 +2287,8 @@ netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid, } if (OVS_UNLIKELY(nb_tx != cnt)) { - /* Free buffers, which we couldn't transmit, one at a time (each - * packet could come from a different mempool) */ - int i; - - for (i = nb_tx; i < cnt; i++) { - rte_pktmbuf_free(pkts[i]); - } + /* Free buffers, which we couldn't transmit. */ + rte_pktmbuf_free_bulk(&pkts[nb_tx], cnt - nb_tx); } return cnt - nb_tx; @@ -2769,9 +2764,7 @@ netdev_dpdk_vhost_send(struct netdev *netdev, int qid, } pkts = (struct rte_mbuf **) batch->packets; - for (int i = 0; i < vhost_batch_cnt; i++) { - rte_pktmbuf_free(pkts[i]); - } + rte_pktmbuf_free_bulk(pkts, vhost_batch_cnt); return 0; } From e5b3cb9995445cd83ab1d2811bfd79ca03dd46d4 Mon Sep 17 00:00:00 2001 From: Han Zhou Date: Mon, 16 Jan 2023 19:01:29 -0800 Subject: [PATCH 137/833] revalidator: Allow min-revalidator-pps to be 0. Today the minimum value for this setting is 1. This patch allows it to be 0, meaning not checking pps at all, and always do revalidation. This is particularly useful for environments where some of the applications with long-lived connections may have very low traffic for certain period but have high rate of burst periodically. It is desirable to keep the datapath flows instead of periodically deleting them to avoid burst of packet miss to userspace. 
When setting to 0, there may be more datapath flows to be revalidated, resulting in higher CPU cost of revalidator threads. This is the downside but in certain cases this is still more desirable than packet misses to user space. Signed-off-by: Han Zhou Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif-upcall.c | 4 ++++ ofproto/ofproto.c | 2 +- vswitchd/vswitch.xml | 3 ++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c index ad96354966f..442141ccd91 100644 --- a/ofproto/ofproto-dpif-upcall.c +++ b/ofproto/ofproto-dpif-upcall.c @@ -2099,6 +2099,10 @@ should_revalidate(const struct udpif *udpif, uint64_t packets, { long long int metric, now, duration; + if (!ofproto_min_revalidate_pps) { + return true; + } + if (!used) { /* Always revalidate the first time a flow is dumped. */ return true; diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c index 17f636ed9dc..e4a1bee769d 100644 --- a/ofproto/ofproto.c +++ b/ofproto/ofproto.c @@ -724,7 +724,7 @@ ofproto_set_max_revalidator(unsigned max_revalidator) void ofproto_set_min_revalidate_pps(unsigned min_revalidate_pps) { - ofproto_min_revalidate_pps = min_revalidate_pps ? min_revalidate_pps : 1; + ofproto_min_revalidate_pps = min_revalidate_pps; } /* If forward_bpdu is true, the NORMAL action will forward frames with diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 2b57fc0e3f1..64f23302dd1 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -205,10 +205,11 @@
+ type='{"type": "integer", "minInteger": 0}'>

Set minimum pps that flow must have in order to be revalidated when revalidation duration exceeds half of max-revalidator config variable. + Setting to 0 means always revalidate flows regardless of pps.

The default is 5. From c3ed0bf34b8a77af4dd4e6d7a6991ce92b7f1aaf Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Mon, 23 Jan 2023 11:57:19 +0100 Subject: [PATCH 138/833] tests/mfex: Silence Blowfish/CAST5 deprecation warnings. On Fedora 37 (at least), MFEX unit tests are failing because of deprecation warnings: $ python3 tests/mfex_fuzzy.py test_traffic.pcap 2000 /usr/lib/python3.11/site-packages/scapy/layers/ipsec.py:471: CryptographyDeprecationWarning: Blowfish has been deprecated cipher=algorithms.Blowfish, /usr/lib/python3.11/site-packages/scapy/layers/ipsec.py:485: CryptographyDeprecationWarning: CAST5 has been deprecated cipher=algorithms.CAST5, Signed-off-by: Robin Jarry Signed-off-by: David Marchand Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- tests/mfex_fuzzy.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/mfex_fuzzy.py b/tests/mfex_fuzzy.py index ee2183f8eb3..15f7f4e517b 100755 --- a/tests/mfex_fuzzy.py +++ b/tests/mfex_fuzzy.py @@ -1,7 +1,15 @@ #!/usr/bin/python3 import sys +import warnings +import cryptography +warnings.filterwarnings( + "ignore", + category=cryptography.CryptographyDeprecationWarning, + message=r"(blowfish|cast5)", +) +# flake8: noqa: E402 from scapy.all import RandMAC, RandIP, PcapWriter, RandIP6, RandShort, fuzz from scapy.all import IPv6, Dot1Q, IP, Ether, UDP, TCP, random From 6ad35dd80e0c4695c6ff0ae8605738f5bb1b56e7 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Mon, 23 Jan 2023 12:03:29 +0100 Subject: [PATCH 139/833] utilities: Add revalidator measurement script and needed USDT probes. This patch adds a Python script that can be used to analyze the revalidator runs by providing statistics (including some real time graphs). The USDT events can also be captured to a file and used for later offline analysis. The following blog explains the Open vSwitch revalidator implementation and how this tool can help you understand what is happening in your system. 
https://developers.redhat.com/articles/2022/10/19/open-vswitch-revalidator-process-explained Signed-off-by: Eelco Chaudron Acked-by: Adrian Moreno Acked-by: Simon Horman Signed-off-by: Ilya Maximets --- Documentation/topics/usdt-probes.rst | 84 +++ ofproto/ofproto-dpif-upcall.c | 11 + utilities/automake.mk | 2 + utilities/usdt-scripts/reval_monitor.py | 894 ++++++++++++++++++++++++ 4 files changed, 991 insertions(+) create mode 100755 utilities/usdt-scripts/reval_monitor.py diff --git a/Documentation/topics/usdt-probes.rst b/Documentation/topics/usdt-probes.rst index 004817b1c54..e527f43bab6 100644 --- a/Documentation/topics/usdt-probes.rst +++ b/Documentation/topics/usdt-probes.rst @@ -214,6 +214,10 @@ Available probes in ``ovs_vswitchd``: - dpif_recv:recv_upcall - main:poll_block - main:run_start +- revalidate_ukey\_\_:entry +- revalidate_ukey\_\_:exit +- udpif_revalidator:start_dump +- udpif_revalidator:sweep_done dpif_netlink_operate\_\_:op_flow_del @@ -328,6 +332,7 @@ probe main:run_start ~~~~~~~~~~~~~~~~~~~~ **Description**: + The ovs-vswitchd's main process contains a loop that runs every time some work needs to be done. This probe gets triggered every time the loop starts from the beginning. See also the ``main:poll_block`` probe below. @@ -345,6 +350,7 @@ probe main:poll_block ~~~~~~~~~~~~~~~~~~~~~ **Description**: + The ovs-vswitchd's main process contains a loop that runs every time some work needs to be done. This probe gets triggered every time the loop is done, and it's about to wait for being re-started by a poll_block() call returning. @@ -359,6 +365,84 @@ See also the ``main:run_start`` probe above. - ``utilities/usdt-scripts/bridge_loop.bt`` +revalidate_ukey\_\_:entry +~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Description**: + +This probe gets triggered on entry of the revalidate_ukey__() function. 
+ +**Arguments**: + +- *arg0*: ``(struct udpif *) udpif`` +- *arg1*: ``(struct udpif_key *) ukey`` +- *arg2*: ``(uint16_t) tcp_flags`` +- *arg3*: ``(struct ofpbuf *) odp_actions`` +- *arg4*: ``(struct recirc_refs *) recircs`` +- *arg5*: ``(struct xlate_cache *) xcache`` + +**Script references**: + +- ``utilities/usdt-scripts/reval_monitor.py`` + + +revalidate_ukey\_\_:exit +~~~~~~~~~~~~~~~~~~~~~~~~ + +**Description**: + +This probe gets triggered right before the revalidate_ukey__() function exits. + +**Arguments**: + +- *arg0*: ``(struct udpif *) udpif`` +- *arg1*: ``(struct udpif_key *) ukey`` +- *arg2*: ``(enum reval_result) result`` + +**Script references**: + +*None* + + +udpif_revalidator:start_dump +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Description**: + +The ovs-vswitchd's revalidator process contains a loop that runs every time +revalidation work is needed. This probe gets triggered every time the +dump phase has started. + +**Arguments**: + +- *arg0*: ``(struct udpif *) udpif`` +- *arg1*: ``(size_t) n_flows`` + +**Script references**: + +- ``utilities/usdt-scripts/reval_monitor.py`` + + +udpif_revalidator:sweep_done +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Description**: + +The ovs-vswitchd's revalidator process contains a loop that runs every time +revalidation work is needed. This probe gets triggered every time the +sweep phase was completed. 
+ +**Arguments**: + +- *arg0*: ``(struct udpif *) udpif`` +- *arg1*: ``(size_t) n_flows`` +- *arg2*: ``(unsigned) MIN(ofproto_max_idle, ofproto_max_revalidator)`` + +**Script references**: + +- ``utilities/usdt-scripts/reval_monitor.py`` + + Adding your own probes ---------------------- diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c index 442141ccd91..31ac02d116f 100644 --- a/ofproto/ofproto-dpif-upcall.c +++ b/ofproto/ofproto-dpif-upcall.c @@ -42,6 +42,7 @@ #include "seq.h" #include "tunnel.h" #include "unixctl.h" +#include "openvswitch/usdt-probes.h" #include "openvswitch/vlog.h" #include "lib/netdev-provider.h" @@ -978,6 +979,7 @@ udpif_revalidator(void *arg) terse_dump = udpif_use_ufid(udpif); udpif->dump = dpif_flow_dump_create(udpif->dpif, terse_dump, NULL); + OVS_USDT_PROBE(udpif_revalidator, start_dump, udpif, n_flows); } } @@ -1029,6 +1031,9 @@ udpif_revalidator(void *arg) duration); } + OVS_USDT_PROBE(udpif_revalidator, sweep_done, udpif, n_flows, + MIN(ofproto_max_idle, ofproto_max_revalidator)); + poll_timer_wait_until(start_time + MIN(ofproto_max_idle, ofproto_max_revalidator)); seq_wait(udpif->reval_seq, last_reval_seq); @@ -2239,6 +2244,9 @@ revalidate_ukey__(struct udpif *udpif, const struct udpif_key *ukey, .wc = &wc, }; + OVS_USDT_PROBE(revalidate_ukey__, entry, udpif, ukey, tcp_flags, + odp_actions, recircs, xcache); + result = UKEY_DELETE; xoutp = NULL; netflow = NULL; @@ -2302,6 +2310,9 @@ revalidate_ukey__(struct udpif *udpif, const struct udpif_key *ukey, netflow_flow_clear(netflow, &ctx.flow); } xlate_out_uninit(xoutp); + + OVS_USDT_PROBE(revalidate_ukey__, exit, udpif, ukey, result); + return result; } diff --git a/utilities/automake.mk b/utilities/automake.mk index b020511c61c..37d679f8227 100644 --- a/utilities/automake.mk +++ b/utilities/automake.mk @@ -23,6 +23,7 @@ scripts_DATA += utilities/ovs-lib usdt_SCRIPTS += \ utilities/usdt-scripts/bridge_loop.bt \ utilities/usdt-scripts/dpif_nl_exec_monitor.py \ + 
utilities/usdt-scripts/reval_monitor.py \ utilities/usdt-scripts/upcall_cost.py \ utilities/usdt-scripts/upcall_monitor.py @@ -69,6 +70,7 @@ EXTRA_DIST += \ utilities/docker/debian/build-kernel-modules.sh \ utilities/usdt-scripts/bridge_loop.bt \ utilities/usdt-scripts/dpif_nl_exec_monitor.py \ + utilities/usdt-scripts/reval_monitor.py \ utilities/usdt-scripts/upcall_cost.py \ utilities/usdt-scripts/upcall_monitor.py MAN_ROOTS += \ diff --git a/utilities/usdt-scripts/reval_monitor.py b/utilities/usdt-scripts/reval_monitor.py new file mode 100755 index 00000000000..5f69998c806 --- /dev/null +++ b/utilities/usdt-scripts/reval_monitor.py @@ -0,0 +1,894 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2022 Red Hat, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Script information: +# ------------------- +# reval_monitor.py uses various user-defined tracepoints to get all the +# revalidator-process related variables and will display them in a (dynamic) +# graph. In addition, it will also dump the data to the console +# in a CSV format. Note that all the graphical output can be disabled. +# +# All the USDT events can be saved to a file and than can be used to +# replay the trace offline and still get the plots. 
+# +# The script can simply be invoked without any options, and it will try +# to find the running ovs-vswitchd instance: +# +# # ./reval_monitor.py +# # Starting trace @2022-09-20T04:07:43.588749 (08:07:43 UTC) +# ts_start, ts_complete, n_flows, n_reval_flows, avg_n_flows, max_n_flows, +# flow_limit, dump_duration, poll_wait, actual_wait +# 1741367714251645, 1741367714532545, 0, 0, 0, 10000, 69000, 1, 500, 500.52 +# 1741368215056961, 1741368215318223, 0, 0, 0, 10000, 69000, 1, 500, 500.55 +# 1741368715865871, 1741368716107089, 0, 0, 0, 10000, 69000, 1, 500, 499.48 +# ^C# Stopping trace @2022-09-20T04:07:49.893827 (08:07:49 UTC) +# +# IMPORTANT NOTE: This script only works when a single datapath is configured! +# 2nd IMPORTANT NOTE: ovs-vswitchd either needs to be built with debug info +# or the debug info package needs to be installed! +# +# The following are the available options: +# +# usage: reval_monitor.py [-h] [-c] [--buffer-page-count NUMBER] +# [-D [DEBUG]] [-g] [--no-ukey-count] +# [-p VSWITCHD_PID] [-P PAHOLE] [-r FILE] [-R] +# [-u SECONDS] [-w FILE] [-W FILE] +# +# options: +# -h, --help show this help message and exit +# -c, --compress-output +# Compress output, i.e.
only dump changes in +# the dataset +# --buffer-page-count NUMBER +# Number of BPF ring buffer pages, default +# 1024 +# -D [DEBUG], --debug [DEBUG] +# Enable eBPF debugging +# -g, --no-gui Do not use the gui to display plots +# --no-ukey-count No revalidate_ukey() counting +# -p VSWITCHD_PID, --pid VSWITCHD_PID +# ovs-vswitch's PID +# -P PAHOLE, --pahole PAHOLE +# Pahole executable to use, default pahole +# -r FILE, --read-events FILE +# Read events from <FILE> instead of +# installing tracepoints +# -R, --no-realtime-plots +# Do not show realtime plot while tracing +# -u SECONDS, --update-interval SECONDS +# Seconds to wait between real time update, +# default 1 +# -w FILE, --write-events FILE +# Write events to <FILE> +# -W FILE, --write-charts FILE +# Write overall charts to <FILE>.png + +# [-D [DEBUG]] [-g] [--no-ukey-count] +# [-p VSWITCHD_PID] [-r FILE] [-R] +# [-u SECONDS] [-w FILE] [-W FILE] +# +# The -g option disables all GUI output of matplotlib, -R only disables the +# real-time plots. As real-time plots are rather slow, the -u option can be +# used to only update the graph every x seconds, which might speed up the +# processing. +# +# The --no-ukey-count option disables counting of the number of flows actually +# being revalidated against the current OpenFlow ruleset. This will not install +# the specific tracepoint which would be called for each flow being +# revalidated. +# +# What is plotted in the graphs (and dumped in the CSV output)? +# - n_flows: Number of flows active in the system. +# - n_reval_flows: Number of flows that were revalidated against the OpenFlow +# ruleset. +# - dump_duration: Time it took to dump and process all flows. +# - avg_n_flows: Average number of flows in the system. +# - max_n_flows: Maximum number of flows in the system. +# - flow_limit: Dynamic flow limit. +# - poll_wait: Time requested for the poll wait. +# - actual_wait: Time it took to be woken up. +# +# Dependencies: +# This script needs the 'readelf' binary to be available.
In addition, it also +# needs pahole to be installed, and it needs a version that is equal or newer +# than the following commit on the next branch: +# +# https://git.kernel.org/pub/scm/devel/pahole/pahole.git/?h=next +# c55b13b9d785 ("WIP: Remove DW_TAG_atomic_type when encoding BTF") +# +# To use a locally compiled pahole the --pahole option can be used. +# For example: +# # ./reval_monitor.py --pahole ~/pahole/build/pahole -g +# Starting trace @2022-12-20T14:57:26.077815 (13:57:26 UTC) +# ts_start, ts_complete, n_flows, n_reval_flows, avg_n_flows, max_n_flows, \ +# flow_limit, dump_duration, poll_wait, actual_wait +# 4202771850983494, 4202771851472838, 0, 0, 0, 0, 10000, 1, 500, 15.06 +# 4202771866531996, 4202771867713366, 0, 0, 0, 0, 10000, 1, 500, 4.23 +# 4202771871941979, 4202771872749915, 0, 0, 0, 0, 10000, 1, 500, 500.02 +# 4202772372770361, 4202772373531820, 0, 0, 0, 0, 10000, 1, 500, 499.96 +# 4202772873487942, 4202772874514753, 0, 0, 0, 0, 10000, 1, 500, 500.01 +# 4202773374528435, 4202773375695054, 0, 0, 0, 0, 10000, 1, 500, 500.01 +# 4202773875701559, 4202773876880763, 0, 0, 0, 0, 10000, 1, 500, 500.04 +# 4202774376925058, 4202774377905799, 0, 0, 0, 0, 10000, 1, 500, 500.03 +# ^C# Stopping trace @2022-12-20T14:57:40.391730 (13:57:40 UTC) +# + +try: + from bcc import BPF, USDT, USDTException +except ModuleNotFoundError: + print("WARNING: Can't find the BPF Compiler Collection (BCC) tools!") + print(" This is NOT problem if you analyzing previously collected" + " data.\n") + +from collections import namedtuple +from enum import IntEnum +from pathlib import Path + +import argparse +import ast +import datetime +import re +import subprocess +import sys + +import pytz +import psutil +import matplotlib.pyplot as plt + +# +# Actual eBPF source code +# +EBPF_SOURCE = """ +#include + + + +enum { + +}; + +struct event_t { + u64 ts; + u32 pid; + u32 id; + u64 n_flows; + u32 avg_n_flows; + u32 max_n_flows; + u32 flow_limit; + u32 dump_duration; + u32 poll_wait; +}; 
+ + +BPF_RINGBUF_OUTPUT(events, ); +BPF_TABLE("percpu_array", uint32_t, uint64_t, dropcnt, 1); + +static struct event_t *get_event(uint32_t id) { + struct event_t *event = events.ringbuf_reserve(sizeof(struct event_t)); + + if (!event) { + dropcnt.increment(0); + return NULL; + } + + event->id = id; + event->ts = bpf_ktime_get_ns(); + event->pid = bpf_get_current_pid_tgid(); + + return event; +} + +int probe__start_dump(struct pt_regs *ctx) { + struct event_t *event = get_event(EVENT_START_DUMP); + if (!event) + return 1; + + events.ringbuf_submit(event, 0); + return 0; +}; + +int probe__sweep_done(struct pt_regs *ctx) { + struct udpif udpif; + + bpf_usdt_readarg_p(1, ctx, &udpif, sizeof(udpif)); + + struct event_t *event = get_event(EVENT_SWEEP_DONE); + if (!event) + return 1; + + event->avg_n_flows = udpif.avg_n_flows; + event->max_n_flows = udpif.max_n_flows; + event->flow_limit = udpif.flow_limit; + event->dump_duration = udpif.dump_duration; + + bpf_usdt_readarg(2, ctx, &event->n_flows); + bpf_usdt_readarg(3, ctx, &event->poll_wait); + + events.ringbuf_submit(event, 0); + return 0; +}; + +int probe__reval_entry(struct pt_regs *ctx) { + struct event_t *event = get_event(EVENT_REVAL_ENTRY); + if (!event) + return 1; + + events.ringbuf_submit(event, 0); + return 0; +}; +""" + + +# +# event_to_dict() +# +def event_to_dict(event): + return dict([(field, getattr(event, field)) + for (field, _) in event._fields_ + if isinstance(getattr(event, field), (int, bytes))]) + + +# +# print_csv_header() +# +def print_csv_header(): + print("ts_start, ts_complete, n_flows, n_reval_flows, avg_n_flows, " + "max_n_flows, flow_limit, dump_duration, poll_wait, actual_wait") + + +# +# Event enum +# +Event = IntEnum("Event", ["START_DUMP", + "SWEEP_DONE", + "REVAL_ENTRY"], start=0) + + +# +# process_event() +# +def process_event(ctx, data, size): + event = b['events'].event(data) + _process_event(event) + + +def _process_event(event): + global graph + + if export_file is not None: + 
export_file.write("event = {}\n".format(event_to_dict(event))) + + if event.id == Event.START_DUMP and not state['running']: + start = state["last_start"] + done = state["last_done"] + if done and start: + actual_wait = (event.ts - done.ts) / 1000000 + csv = "{}, {}, {}, {}, {}, {}, {}, {}, {}, {:.2f}".format( + start.ts, done.ts, done.n_flows, graph.ukey_count, + done.avg_n_flows, done.max_n_flows, done.flow_limit, + done.dump_duration, done.poll_wait, actual_wait) + + if graph.base_time == 0: + graph = graph._replace(base_time=done.ts) + + graph.time.append((done.ts - graph.base_time) / 1000000000) + graph.n_flows.append(done.n_flows) + graph.n_reval_flows.append(graph.ukey_count) + graph.avg_n_flows.append(done.avg_n_flows) + graph.max_n_flows.append(done.max_n_flows) + graph.flow_limit.append(done.flow_limit) + graph.dump_duration.append(done.dump_duration) + graph.poll_wait.append(done.poll_wait) + graph.actual_wait.append(actual_wait) + + if not options.no_gui and not options.no_realtime_plots: + updated_graph = dynamic_plot_update( + graph, refresh=options.update_interval) + if updated_graph is None: + raise KeyboardInterrupt + graph = updated_graph + + if options.compress_output: + last_csv = state["last_csv"] + if not last_csv or \ + csv.split(",")[2:-1] != last_csv.split(",")[2:-1] or \ + abs((event.ts - done.ts) / 1000000 - done.poll_wait) > 100: + print(csv) + else: + state["last_not_printed_csv"] = csv + + state["last_csv"] = csv + else: + print(csv) + + state["last_start"] = event + state['running'] = True + graph = graph._replace(ukey_count=0) + elif event.id == Event.SWEEP_DONE and state['running']: + state["last_done"] = event + state['running'] = False + elif event.id == Event.REVAL_ENTRY and state['running']: + graph = graph._replace(ukey_count=graph.ukey_count + 1) + + +# +# run_program() +# +def run_program(command): + try: + process = subprocess.run(command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + encoding='utf8', + check=True) 
+ + except subprocess.CalledProcessError as perror: + return perror.returncode, perror.stdout + + return 0, process.stdout + + +# +# get_ovs_definitions() +# +def get_ovs_definitions(objects, pahole="pahole", pid=None): + if pid is None: + raise ValueError("A valid pid value should be supplied!") + + if not isinstance(objects, list): + objects = [objects] + + if len(objects) == 0: + raise ValueError("Must supply at least one object!") + + vswitchd = Path("/proc/{}/exe".format(str(pid))).resolve() + + object_str = ','.join(objects) + + def run_pahole(debug_file): + error, result = run_program([pahole, "-C", object_str, "--compile", + debug_file]) + + if error: + if "pahole: {}: Invalid argument".format(debug_file) not in result: + print("ERROR: Pahole failed to get ovs-vswitchd data " + "structures!\n{}".format(re.sub('^', ' ' * 7, + result.rstrip(), + flags=re.MULTILINE))) + sys.exit(-1) + + return None + + if bool(re.search("pahole: type .* not found", result)): + return None + + return result + + def run_readelf(bin_file): + error, result = run_program(['readelf', "-n", + "--debug-dump=links", bin_file]) + + if error: + print("ERROR: Failed 'readelf' on \"{}\"!\n{}". + format(bin_file, re.sub('^', ' ' * 7, result, + flags=re.MULTILINE))) + sys.exit(-1) + + return result + + def get_debug_file(bin_file): + elf_result = run_readelf(bin_file) + match = re.search("Build ID: ([0-9a-fA-F]+)", elf_result) + if not match: + print("ERROR: Can't find build ID to read debug symbols!") + sys.exit(-1) + + dbg_file = "/usr/lib/debug/.build-id/{}/{}.debug".format( + match.group(1)[:2], match.group(1)[2:]) + + return dbg_file + + def get_from_shared_library(debug_file): + ovs_libs = ['libofproto', 'libopenvswitch', 'libovsdb', 'libsflow', + 'libvtep'] + error, ldd_result = run_program(['ldd', debug_file]) + + if error: + print("ERROR: Failed 'ldd' on \"{}\"!\n{}". 
+ format(debug_file, re.sub('^', ' ' * 7, ldd_result, + flags=re.MULTILINE))) + sys.exit(-1) + + for lib in ovs_libs: + match = re.search(r"^\s*{}.* => (.*) \(.*\)$".format(lib), + ldd_result, flags=re.MULTILINE) + if match is None: + continue + + result = run_pahole(match.group(1)) + if result is None: + result = run_pahole(get_debug_file(match.group(1))) + + if result: + return result + + return None + + # + # First try to find the debug data as part of the executable. + # + result = run_pahole(vswitchd) + + if result is None: + print("INFO: Failed to find debug info in \"{}\"!".format(vswitchd)) + + # + # Get additional .debug information if available. + # + dbg_file = get_debug_file(vswitchd) + result = run_pahole(dbg_file) + if result is None: + print("INFO: Failed to find debug info in \"{}\"!".format( + dbg_file)) + + # + # Try to get information from shared libraries if used. + # + result = get_from_shared_library(vswitchd) + + if result is None: + print("ERROR: Failed to find needed data structures through pahole!") + sys.exit(-1) + + # + # We need an empty _Atomic definition to avoid compiler complaints. + # + result = "#define _Atomic\n" + result + + # + # Remove the uint64_t definition as it conflicts with the kernel one. 
+ # + result = re.sub("^typedef.*uint64_t;$", "", result, flags=re.MULTILINE) + + return result + + +# +# next_power_of_two() +# +def next_power_of_two(val): + np = 1 + while np < val: + np *= 2 + return np + + +# +# dynamic_plot_init() +# +def dynamic_plot_init(real_time=True): + + if real_time: + lines = [] + fig, axs = plt.subplots(4, figsize=(19, 10)) + fig.suptitle('Revalidator Handling') + for ax in axs: + ax.grid() + + axs[0].set_ylabel("Numer of flows", weight='bold') + axs[1].set_ylabel("Time spend (ms)", weight='bold') + axs[2].set_ylabel("Numer of flows", weight='bold') + axs[3].set_ylabel("Time spend (ms)", weight='bold') + axs[3].set_xlabel("Time (seconds since start)", weight='bold') + + lines.append(axs[0].plot([], [], label="n_flows", marker='o')[0]) + lines.append(axs[0].plot([], [], label="n_reval_flows")[0]) + axs[0].legend(bbox_to_anchor=(1, 1), loc='upper left', + borderaxespad=0.5) + axs[0].set_xlim(0, 30) + axs[0].set_ylim(-4, 104) + + lines.append(axs[1].plot([], [], color="orange", + label="dump_duration")[0]) + axs[1].legend(bbox_to_anchor=(1, 1), + loc='upper left', borderaxespad=0.5) + axs[1].set_xlim(0, 30) + axs[1].set_ylim(-0.4, 10.4) + + lines.append(axs[2].plot([], [], label="avg_n_flows")[0]) + lines.append(axs[2].plot([], [], label="max_n_flows")[0]) + lines.append(axs[2].plot([], [], label="flow_limit")[0]) + axs[2].legend(bbox_to_anchor=(1, 1), loc='upper left', + borderaxespad=0.5) + axs[2].set_xlim(0, 30) + axs[2].set_ylim(-600, 15600) + + lines.append(axs[3].plot([], [], label="poll_wait")[0]) + lines.append(axs[3].plot([], [], label="actual_wait")[0]) + axs[3].legend(bbox_to_anchor=(1, 1), loc='upper left', + borderaxespad=0.5) + axs[3].set_xlim(0, 30) + axs[3].set_ylim(-20, 520) + + fig.tight_layout() + + plt.ion() + plt.show() + else: + fig = None + axs = None + lines = None + + graph_data = {"base_time": 0, + "l_index": 0, + "fig": fig, + "axs": axs, + "lines": lines, + "last_update": 0, + "ukey_count": 0, + "time": [], + 
"n_flows": [], + "n_reval_flows": [], + "avg_n_flows": [], + "max_n_flows": [], + "flow_limit": [], + "dump_duration": [], + "poll_wait": [], + "actual_wait": []} + + return namedtuple("GraphData", graph_data.keys())(*graph_data.values()) + + +# +# dynamic_plot_update() +# +def dynamic_plot_update(graph_data, refresh=1): + + if graph_data.last_update != 0 and \ + (graph_data.time[-1] - graph_data.last_update) < refresh: + return graph_data + + graph_data = graph_data._replace(last_update=graph_data.time[-1]) + + if (graph_data.time[-1] - graph_data.time[graph_data.l_index]) > 30: + for i in range(graph_data.l_index + 1, len(graph_data.time)): + if (graph_data.time[-1] - graph_data.time[i]) <= 30: + graph_data = graph_data._replace(l_index=i) + break + + for line in graph_data.lines: + line.set_xdata(graph_data.time[graph_data.l_index:]) + + graph_data.lines[0].set_ydata(graph_data.n_flows[graph_data.l_index:]) + graph_data.lines[1].set_ydata( + graph_data.n_reval_flows[graph_data.l_index:]) + graph_data.lines[2].set_ydata( + graph_data.dump_duration[graph_data.l_index:]) + graph_data.lines[3].set_ydata(graph_data.avg_n_flows[graph_data.l_index:]) + graph_data.lines[4].set_ydata(graph_data.max_n_flows[graph_data.l_index:]) + graph_data.lines[5].set_ydata(graph_data.flow_limit[graph_data.l_index:]) + graph_data.lines[6].set_ydata(graph_data.poll_wait[graph_data.l_index:]) + graph_data.lines[7].set_ydata(graph_data.actual_wait[graph_data.l_index:]) + + for ax in graph_data.axs: + if graph_data.l_index == 0: + ax.autoscale(enable=True, axis='y') + else: + ax.autoscale(enable=True) + + ax.relim(visible_only=True) + ax.autoscale_view(tight=True, scalex=True, scaley=True) + + try: + graph_data.fig.canvas.draw() + graph_data.fig.canvas.flush_events() + except KeyboardInterrupt: + return None + + return graph_data + + +# +# show_graph() +# +def show_graph(graph_data, gui=False, file_name=None): + + if len(graph_data.time) == 0 or (not gui and file_name is None): + return + 
+ plt.ioff() + + fig, (nf_ax, dd_ax, f_ax, t_ax) = plt.subplots(4, figsize=(19, 10)) + fig.suptitle('Revalidator Handling') + nf_ax.grid() + f_ax.grid() + dd_ax.grid() + t_ax.grid() + + nf_ax.set_ylabel("Numer of flows", weight='bold') + f_ax.set_ylabel("Numer of flows", weight='bold') + dd_ax.set_ylabel("Time spend (ms)", weight='bold') + t_ax.set_ylabel("Time spend (ms)", weight='bold') + t_ax.set_xlabel("Time (seconds since start)", weight='bold') + + nf_ax.plot(graph_data.time, graph_data.n_flows, label="n_flows") + nf_ax.plot(graph_data.time, graph_data.n_reval_flows, + label="n_reval_flows") + nf_ax.legend(bbox_to_anchor=(1, 1), loc='upper left', borderaxespad=0.5) + + dd_ax.plot(graph_data.time, graph_data.dump_duration, color="orange", + label="dump_duration") + dd_ax.legend(bbox_to_anchor=(1, 1), loc='upper left', borderaxespad=0.5) + + f_ax.plot(graph_data.time, graph_data.avg_n_flows, label="avg_n_flows") + f_ax.plot(graph_data.time, graph_data.max_n_flows, label="max_n_flows") + f_ax.plot(graph_data.time, graph_data.flow_limit, label="flow_limit") + f_ax.legend(bbox_to_anchor=(1, 1), loc='upper left', borderaxespad=0.5) + + t_ax.plot(graph_data.time, graph_data.poll_wait, label="poll_wait") + t_ax.plot(graph_data.time, graph_data.actual_wait, label="actual_wait") + t_ax.legend(bbox_to_anchor=(1, 1), loc='upper left', borderaxespad=0.5) + + fig.tight_layout() + + if file_name is not None and file_name != "": + fig.savefig(file_name + '.png') + + if gui: + try: + plt.show() + except KeyboardInterrupt: + pass + + plt.close(fig) + + +# +# process_events_from_file() +# +def process_events_from_file(file_name): + try: + with open(file_name, 'r') as fd: + print("- Reading events from \"{}\"...".format(file_name)) + + print_csv_header() + for entry in fd: + entry.rstrip() + if entry.startswith('event = {'): + event = ast.literal_eval(entry[8:]) + event = namedtuple("EventObject", + event.keys())(*event.values()) + try: + _process_event(event) + except 
KeyboardInterrupt: + break + + except (FileNotFoundError, PermissionError): + print("ERROR: Can't open file \"{}\" for reading!".format(file_name)) + sys.exit(-1) + + show_graph(graph, gui=not options.no_gui, file_name=options.write_charts) + + +# +# main() +# +def main(): + # + # Don't like these globals, but ctx passing does not seem to work with the + # existing open_ring_buffer() API :( + # + global b + global export_file + global options + global state + global graph + + # + # Argument parsing + # + parser = argparse.ArgumentParser() + + parser.add_argument("-c", "--compress-output", action="store_true", + help="Compress output, i.e. only dump changes in " + "the dataset") + parser.add_argument("--buffer-page-count", + help="Number of BPF ring buffer pages, default 1024", + type=int, default=1024, metavar="NUMBER") + parser.add_argument("-D", "--debug", + help="Enable eBPF debugging", + type=int, const=0x3f, default=0, nargs='?') + parser.add_argument("-g", "--no-gui", action="store_true", + help="Do not use the gui to display plots") + parser.add_argument("--no-ukey-count", action="store_true", + help="No revalidate_ukey() counting") + parser.add_argument("-p", "--pid", metavar="VSWITCHD_PID", + help="ovs-vswitch's PID", + type=int, default=None) + parser.add_argument("-P", "--pahole", metavar="PAHOLE", + help="Pahole executable to use, default pahole", + type=str, default="pahole") + parser.add_argument("-r", "--read-events", + help="Read events from instead of installing " + "tracepoints", type=str, default=None, metavar="FILE") + parser.add_argument("-R", "--no-realtime-plots", action="store_true", + help="Do not show realtime plot while tracing") + parser.add_argument("-u", "--update-interval", + help="Seconds to wait between real time update, " + "default 1", type=float, default=1, metavar="SECONDS") + parser.add_argument("-w", "--write-events", + help="Write events to ", + type=str, default=None, metavar="FILE") + parser.add_argument("-W", 
"--write-charts", + help="Write overall charts to .png", + type=str, default=None, metavar="FILE") + + options = parser.parse_args() + + # + # Find the PID of the ovs-vswitchd daemon if not specified. + # + if options.pid is None and options.read_events is None: + for proc in psutil.process_iter(): + if 'ovs-vswitchd' in proc.name(): + if options.pid is not None: + print("ERROR: Multiple ovs-vswitchd daemons running, " + "use the -p option!") + sys.exit(-1) + + options.pid = proc.pid + + # + # Error checking on input parameters. + # + if options.pid is None and options.read_events is None: + print("ERROR: Failed to find ovs-vswitchd's PID!") + sys.exit(-1) + + if options.read_events is not None and options.write_events is not None: + print("ERROR: Either supply the read or write events option, " + "not both!") + sys.exit(-1) + + options.buffer_page_count = next_power_of_two(options.buffer_page_count) + + # + # Define the state and graph. + # + state = {"last_start": None, + "last_done": None, + "running": False, + "last_csv": None, + "last_not_printed_csv": None} + + export_file = None + + graph = dynamic_plot_init(real_time=(not options.no_gui + and not options.no_realtime_plots)) + + # + # Process events from file if required. + # + if options.read_events is not None: + process_events_from_file(options.read_events) + sys.exit(0) + + # + # Open write handle if needed. + # + if options.write_events is not None: + try: + export_file = open(options.write_events, "w") + except (FileNotFoundError, IOError, PermissionError) as e: + print("ERROR: Can't create export file \"{}\": {}".format( + options.write_events, e.strerror)) + sys.exit(-1) + + # + # Attach the usdt probe. 
+ # + u = USDT(pid=int(options.pid)) + try: + u.enable_probe(probe="start_dump", fn_name="probe__start_dump") + u.enable_probe(probe="sweep_done", fn_name="probe__sweep_done") + if not options.no_ukey_count: + u.enable_probe(probe="revalidate_ukey__:entry", + fn_name="probe__reval_entry") + except USDTException as e: + print("ERROR: {}".format( + (re.sub('^', ' ' * 7, str(e), flags=re.MULTILINE)).strip(). + replace("--with-dtrace or --enable-dtrace", + "--enable-usdt-probes"))) + sys.exit(-1) + + # + # Attach probe to running process. + # + source = EBPF_SOURCE.replace("", "\n".join( + [" EVENT_{} = {},".format( + event.name, event.value) for event in Event])) + source = source.replace("", + str(options.buffer_page_count)) + source = source.replace("", + get_ovs_definitions("udpif", pid=options.pid, + pahole=options.pahole)) + + b = BPF(text=source, usdt_contexts=[u], debug=options.debug) + + # + # Print header. + # + ltz = datetime.datetime.now() + utc = ltz.astimezone(pytz.utc) + time_string = "# Starting trace @{} ({} UTC)".format( + ltz.isoformat(), utc.strftime("%H:%M:%S")) + + if export_file is not None: + export_file.write(time_string + "\n") + + print(time_string) + print_csv_header() + + # + # Process all events. + b['events'].open_ring_buffer(process_event) + while 1: + try: + b.ring_buffer_poll() + except KeyboardInterrupt: + break + + dropcnt = b.get_table("dropcnt") + for k in dropcnt.keys(): + count = dropcnt.sum(k).value + if k.value == 0 and count > 0: + print("\n# WARNING: Not all upcalls were captured, {} were " + "dropped!\n# Increase the BPF ring buffer size " + "with the --buffer-page-count option.".format(count)) + + # + # Display footer. 
+ # + if state["last_not_printed_csv"] is not None: + print(state["last_not_printed_csv"]) + + ltz = datetime.datetime.now() + utc = ltz.astimezone(pytz.utc) + time_string = "# Stopping trace @{} ({} UTC)".format( + ltz.isoformat(), utc.strftime("%H:%M:%S")) + + if export_file is not None: + export_file.write(time_string + "\n") + + print(time_string) + + # + # Close event file if used. + # + if options.write_events is not None: + export_file.close() + + # + # Do final graph if requested. + # + show_graph(graph, gui=not options.no_gui, file_name=options.write_charts) + + +# +# Start main() as the default entry point... +# +if __name__ == '__main__': + main() From 4f0a728a590d34e65bcfcf7efe130f8905ea5857 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 23 Jan 2023 15:05:10 +0100 Subject: [PATCH 140/833] system-traffic.at: Skip the 'ICMP6 Related' test if nc is missing. Test fails if 'nc' is not available, it should be skipped instead. Fixes: b020a416e24c ("System Tests: Enhance NAT tests.") Reviewed-by: David Marchand Reviewed-by: Simon Horman Signed-off-by: Ilya Maximets --- tests/system-traffic.at | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 503455cc635..fa605d16d99 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -6432,6 +6432,7 @@ AT_CLEANUP AT_SETUP([conntrack - IPv6 ICMP6 Related with SNAT]) AT_SKIP_IF([test $HAVE_TCPDUMP = no]) +AT_SKIP_IF([test $HAVE_NC = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_NAT() OVS_TRAFFIC_VSWITCHD_START() From 7db18054ffee964b5a32c5f88e7dd3a0ee60b82e Mon Sep 17 00:00:00 2001 From: Kevin Traynor Date: Tue, 24 Jan 2023 09:59:32 +0000 Subject: [PATCH 141/833] dpif-netdev-perf: Remove not a number stat value. Some stats in pmd-perf-show don't check for divide by zero which results in not a number (-nan). This is a normal case for some of the stats when there are no Rx queues assigned to the PMD thread core. 
It is not obvious what -nan is to a user so add a check for divide by zero and set stat to 0 if present. Before patch: pmd thread numa_id 1 core_id 9: Iterations: 0 (-nan us/it) - Used TSC cycles: 0 ( 0.0 % of total cycles) - idle iterations: 0 ( -nan % of used cycles) - busy iterations: 0 ( -nan % of used cycles) After patch: pmd thread numa_id 1 core_id 9: Iterations: 0 (0.00 us/it) - Used TSC cycles: 0 ( 0.0 % of total cycles) - idle iterations: 0 ( 0.0 % of used cycles) - busy iterations: 0 ( 0.0 % of used cycles) Acked-by: Mike Pattrick Signed-off-by: Kevin Traynor Signed-off-by: Ilya Maximets --- lib/dpif-netdev-perf.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lib/dpif-netdev-perf.c b/lib/dpif-netdev-perf.c index 1a7bab04c0c..a552948acbe 100644 --- a/lib/dpif-netdev-perf.c +++ b/lib/dpif-netdev-perf.c @@ -241,12 +241,14 @@ pmd_perf_format_overall_stats(struct ds *str, struct pmd_perf_stats *s, " - sleep iterations: %12"PRIu64" (%5.1f %% of iterations)\n" " Sleep time (us): %12.0f (%3.0f us/iteration avg.)\n", tot_iter, - (tot_cycles + tot_sleep_cycles) * us_per_cycle / tot_iter, + tot_iter + ? (tot_cycles + tot_sleep_cycles) * us_per_cycle / tot_iter + : 0, tot_cycles, 100.0 * (tot_cycles / duration) / tsc_hz, idle_iter, - 100.0 * stats[PMD_CYCLES_ITER_IDLE] / tot_cycles, + tot_cycles ? 100.0 * stats[PMD_CYCLES_ITER_IDLE] / tot_cycles : 0, busy_iter, - 100.0 * stats[PMD_CYCLES_ITER_BUSY] / tot_cycles, + tot_cycles ? 100.0 * stats[PMD_CYCLES_ITER_BUSY] / tot_cycles : 0, sleep_iter, tot_iter ? 100.0 * sleep_iter / tot_iter : 0, tot_sleep_cycles * us_per_cycle, sleep_iter ? (tot_sleep_cycles * us_per_cycle) / sleep_iter : 0); From 3beff0a6b09e183f46249b94865b56ea7c4c0350 Mon Sep 17 00:00:00 2001 From: Kevin Traynor Date: Tue, 24 Jan 2023 09:59:33 +0000 Subject: [PATCH 142/833] dpif-netdev-perf: Add metric averages when no iterations. pmd-perf-show with pmd-perf-metrics=true displays a histogram with averages. 
However, averages were not displayed when there is no iterations. They will be all zero so it is not hiding useful information but the stats look incomplete without them, especially when they are displayed for some PMD thread cores and not others. The histogram print is large and this is just an extra couple of lines, so might as well print them all the time to ensure that the user does not think there is something missing from the display. Before patch: Histograms cycles/it 499 0 716 0 1025 0 1469 0 After patch: Histograms cycles/it 499 0 716 0 1025 0 1469 0 --------------- cycles/it 0 Acked-by: Mike Pattrick Signed-off-by: Kevin Traynor Signed-off-by: Ilya Maximets --- lib/dpif-netdev-perf.c | 49 ++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/lib/dpif-netdev-perf.c b/lib/dpif-netdev-perf.c index a552948acbe..79ea5e3bef2 100644 --- a/lib/dpif-netdev-perf.c +++ b/lib/dpif-netdev-perf.c @@ -337,29 +337,32 @@ pmd_perf_format_histograms(struct ds *str, struct pmd_perf_stats *s) ">", s->max_vhost_qfill.bin[i], ">", s->upcalls.bin[i], ">", s->cycles_per_upcall.bin[i]); - if (s->totals.iterations > 0) { - ds_put_cstr(str, - "-----------------------------------------------------" - "-----------------------------------------------------" - "------------------------------------------------\n"); - ds_put_format(str, - " %-21s %-21s %-21s %-21s %-21s %-21s %-21s\n", - "cycles/it", "packets/it", "cycles/pkt", "pkts/batch", - "vhost qlen", "upcalls/it", "cycles/upcall"); - ds_put_format(str, - " %-21"PRIu64" %-21.5f %-21"PRIu64 - " %-21.5f %-21.5f %-21.5f %-21"PRIu32"\n", - s->totals.cycles / s->totals.iterations, - 1.0 * s->totals.pkts / s->totals.iterations, - s->totals.pkts - ? s->totals.busy_cycles / s->totals.pkts : 0, - s->totals.batches - ? 
1.0 * s->totals.pkts / s->totals.batches : 0, - 1.0 * s->totals.max_vhost_qfill / s->totals.iterations, - 1.0 * s->totals.upcalls / s->totals.iterations, - s->totals.upcalls - ? s->totals.upcall_cycles / s->totals.upcalls : 0); - } + ds_put_cstr(str, + "-----------------------------------------------------" + "-----------------------------------------------------" + "------------------------------------------------\n"); + ds_put_format(str, + " %-21s %-21s %-21s %-21s %-21s %-21s %-21s\n", + "cycles/it", "packets/it", "cycles/pkt", "pkts/batch", + "vhost qlen", "upcalls/it", "cycles/upcall"); + ds_put_format(str, + " %-21"PRIu64" %-21.5f %-21"PRIu64 + " %-21.5f %-21.5f %-21.5f %-21"PRIu32"\n", + s->totals.iterations + ? s->totals.cycles / s->totals.iterations : 0, + s->totals.iterations + ? 1.0 * s->totals.pkts / s->totals.iterations : 0, + s->totals.pkts + ? s->totals.busy_cycles / s->totals.pkts : 0, + s->totals.batches + ? 1.0 * s->totals.pkts / s->totals.batches : 0, + s->totals.iterations + ? 1.0 * s->totals.max_vhost_qfill / s->totals.iterations + : 0, + s->totals.iterations + ? 1.0 * s->totals.upcalls / s->totals.iterations : 0, + s->totals.upcalls + ? s->totals.upcall_cycles / s->totals.upcalls : 0); } void From 9117f4d54f6af76d3ba0bf2e1732cb16440b87f8 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 25 Jan 2023 14:48:52 +0100 Subject: [PATCH 143/833] netdev-offload-tc: Fix misaligned access to ct label. UndefinedBehaviorSanitizer: lib/netdev-offload-tc.c:1356:50: runtime error: member access within misaligned address 0x60700001a89c for type 'const struct (unnamed struct at lib/netdev-offload-tc.c:1350:27)', which requires 8 byte alignment 0x60700001a89c: note: pointer points here 24 00 04 00 01 00 00 05 00 00 0d 00 0a 00 00 00 00 00 00 00 ... 
^ 0 0xd5d183 in parse_put_flow_ct_action lib/netdev-offload-tc.c:1356:50 1 0xd5783f in netdev_tc_parse_nl_actions lib/netdev-offload-tc.c:2015:19 2 0xd4027c in netdev_tc_flow_put lib/netdev-offload-tc.c:2355:11 3 0x9666d7 in netdev_flow_put lib/netdev-offload.c:318:14 4 0xcd4c0a in parse_flow_put lib/dpif-netlink.c:2297:11 5 0xcd4c0a in try_send_to_netdev lib/dpif-netlink.c:2384:15 6 0xcd4c0a in dpif_netlink_operate lib/dpif-netlink.c:2455:23 7 0x87d40e in dpif_operate lib/dpif.c:1372:13 8 0x6d43e9 in handle_upcalls ofproto/ofproto-dpif-upcall.c:1674:5 9 0x6d43e9 in recv_upcalls ofproto/ofproto-dpif-upcall.c:905:9 10 0x6cf6ea in udpif_upcall_handler ofproto/ofproto-dpif-upcall.c:801:13 11 0xb6d7ea in ovsthread_wrapper lib/ovs-thread.c:423:12 12 0x7f5ccf017801 in start_thread 13 0x7f5ccefb744f in __GI___clone3 Fixes: 9221c721bec0 ("netdev-offload-tc: Add conntrack label and mark support") Reviewed-by: Simon Horman Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- lib/netdev-offload-tc.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c index ce7f8ad9730..15d1c36aa04 100644 --- a/lib/netdev-offload-tc.c +++ b/lib/netdev-offload-tc.c @@ -843,13 +843,13 @@ parse_tc_flower_to_actions__(struct tc_flower *flower, struct ofpbuf *buf, struct { ovs_u128 key; ovs_u128 mask; - } *ct_label; + } ct_label = { + .key = action->ct.label, + .mask = action->ct.label_mask, + }; - ct_label = nl_msg_put_unspec_uninit(buf, - OVS_CT_ATTR_LABELS, - sizeof *ct_label); - ct_label->key = action->ct.label; - ct_label->mask = action->ct.label_mask; + nl_msg_put_unspec(buf, OVS_CT_ATTR_LABELS, + &ct_label, sizeof ct_label); } if (action->ct.nat_type) { @@ -1339,13 +1339,14 @@ parse_put_flow_ct_action(struct tc_flower *flower, break; case OVS_CT_ATTR_LABELS: { const struct { - ovs_u128 key; - ovs_u128 mask; + ovs_32aligned_u128 key; + ovs_32aligned_u128 mask; } *ct_label; ct_label = 
nl_attr_get_unspec(ct_attr, sizeof *ct_label); - action->ct.label = ct_label->key; - action->ct.label_mask = ct_label->mask; + action->ct.label = get_32aligned_u128(&ct_label->key); + action->ct.label_mask = + get_32aligned_u128(&ct_label->mask); } break; } From 6e5661d17d908c20c570a5210c2948537ea01b06 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 25 Jan 2023 10:45:10 +0100 Subject: [PATCH 144/833] system-traffic: Remove unnecessary dependency on nc. The "conntrack - ICMP related to original direction" test does not use nc and therefore does not need to be skipped if nc is not present. Fixes: d0e4206230b3 ("tests: ICMP related to original direction test.") Reported-by: David Marchand Reviewed-by: Louis Peens Signed-off-by: Simon Horman Acked-by: Ilya Maximets Reviewed-by: David Marchand --- tests/system-traffic.at | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index fa605d16d99..6d8651a44c4 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -3384,7 +3384,6 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([conntrack - ICMP related to original direction]) -AT_SKIP_IF([test $HAVE_NC = no]) CHECK_CONNTRACK() OVS_TRAFFIC_VSWITCHD_START() From 3f85b11d50bbac9b7e746a1a061dd361ab2fa00f Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 25 Jan 2023 10:52:53 +0100 Subject: [PATCH 145/833] system-offloads-traffic: Skip tests if nc is not present. The following tests use the nc command and should be skipped if nc is not present. 
- "offloads - check interface meter offloading - offloads disabled" - "offloads - check interface meter offloading - offloads enabled" Fixes: 5660b89a309d ("dpif-netlink: Offloading meter to tc police action") Reported-by: David Marchand Reviewed-by: Louis Peens Signed-off-by: Simon Horman Acked-by: Ilya Maximets Reviewed-by: David Marchand --- tests/system-offloads-traffic.at | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/system-offloads-traffic.at b/tests/system-offloads-traffic.at index 1a60570801e..16a4c1a0088 100644 --- a/tests/system-offloads-traffic.at +++ b/tests/system-offloads-traffic.at @@ -181,6 +181,7 @@ AT_CLEANUP AT_SETUP([offloads - check interface meter offloading - offloads disabled]) AT_KEYWORDS([dp-meter]) +AT_SKIP_IF([test $HAVE_NC = "no"]) OVS_TRAFFIC_VSWITCHD_START() AT_CHECK([ovs-ofctl -O OpenFlow13 add-meter br0 'meter=1 pktps bands=type=drop rate=1']) @@ -230,6 +231,7 @@ AT_CLEANUP AT_SETUP([offloads - check interface meter offloading - offloads enabled]) AT_KEYWORDS([offload-meter]) AT_SKIP_IF([test $SUPPORT_TC_INGRESS_PPS = "no"]) +AT_SKIP_IF([test $HAVE_NC = "no"]) OVS_TRAFFIC_VSWITCHD_START([], [], [-- set Open_vSwitch . other_config:hw-offload=true]) AT_CHECK([ovs-ofctl -O OpenFlow13 add-meter br0 'meter=1 pktps bands=type=drop rate=1']) From e1e5eac5b0167c65c802bd60ed37605b1e1c9c92 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Fri, 27 Jan 2023 12:16:36 +0100 Subject: [PATCH 146/833] tc: Add TCA_KIND flower to delete and get operation to avoid rtnl_lock(). A long long time ago, an effort was made to make tc flower rtnl_lock() free. However, on the OVS part we forgot to add the TCA_KIND "flower" attribute, which tells the kernel to skip the lock. This patch corrects this by adding the attribute for the delete and get operations. The kernel code calls tcf_proto_is_unlocked() to determine whether the rtnl_lock() is needed for the specific tc protocol. 
It does this in the tc_new_tfilter(), tc_del_tfilter(), and in tc_get_tfilter(). If the name is not set, tcf_proto_is_unlocked() will always return false. If set, the specific protocol is queried for unlocked support. Fixes: f98e418fbdb6 ("tc: Add tc flower functions") Signed-off-by: Eelco Chaudron Reviewed-by: Roi Dayan Signed-off-by: Ilya Maximets --- lib/netdev-linux.c | 2 +- lib/netdev-offload-tc.c | 20 ++++++++++---------- lib/tc.c | 10 +++++++++- lib/tc.h | 3 ++- 4 files changed, 22 insertions(+), 13 deletions(-) diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index f6d7a1b9743..65bdd51dbaa 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -2735,7 +2735,7 @@ tc_del_matchall_policer(struct netdev *netdev) } id = tc_make_tcf_id(ifindex, block_id, prio, TC_INGRESS); - err = tc_del_filter(&id); + err = tc_del_filter(&id, "matchall"); if (err) { return err; } diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c index 15d1c36aa04..6e1bbaa2855 100644 --- a/lib/netdev-offload-tc.c +++ b/lib/netdev-offload-tc.c @@ -239,7 +239,7 @@ del_filter_and_ufid_mapping(struct tcf_id *id, const ovs_u128 *ufid) { int err; - err = tc_del_filter(id); + err = tc_del_flower_filter(id); if (!err) { del_ufid_tc_mapping(ufid); } @@ -461,7 +461,7 @@ delete_chains_from_netdev(struct netdev *netdev, struct tcf_id *id) */ HMAP_FOR_EACH_POP (chain_node, node, &map) { id->chain = chain_node->chain; - tc_del_filter(id); + tc_del_flower_filter(id); free(chain_node); } } @@ -482,7 +482,7 @@ netdev_tc_flow_flush(struct netdev *netdev) continue; } - err = tc_del_filter(&data->id); + err = tc_del_flower_filter(&data->id); if (!err) { del_ufid_tc_mapping_unlocked(&data->ufid); } @@ -2499,13 +2499,13 @@ probe_multi_mask_per_prio(int ifindex) id2 = tc_make_tcf_id(ifindex, block_id, prio, TC_INGRESS); error = tc_replace_flower(&id2, &flower); - tc_del_filter(&id1); + tc_del_flower_filter(&id1); if (error) { goto out; } - tc_del_filter(&id2); + tc_del_flower_filter(&id2); 
multi_mask_per_prio = true; VLOG_INFO("probe tc: multiple masks on single tc prio is supported."); @@ -2557,7 +2557,7 @@ probe_ct_state_support(int ifindex) goto out_del; } - tc_del_filter(&id); + tc_del_flower_filter(&id); ct_state_support = OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | OVS_CS_F_TRACKED | @@ -2571,7 +2571,7 @@ probe_ct_state_support(int ifindex) goto out_del; } - tc_del_filter(&id); + tc_del_flower_filter(&id); /* Test for ct_state INVALID support */ memset(&flower, 0, sizeof flower); @@ -2582,7 +2582,7 @@ probe_ct_state_support(int ifindex) goto out; } - tc_del_filter(&id); + tc_del_flower_filter(&id); ct_state_support |= OVS_CS_F_INVALID; /* Test for ct_state REPLY support */ @@ -2598,7 +2598,7 @@ probe_ct_state_support(int ifindex) ct_state_support |= OVS_CS_F_REPLY_DIR; out_del: - tc_del_filter(&id); + tc_del_flower_filter(&id); out: tc_add_del_qdisc(ifindex, false, 0, TC_INGRESS); VLOG_INFO("probe tc: supported ovs ct_state bits: 0x%x", ct_state_support); @@ -2751,7 +2751,7 @@ netdev_tc_init_flow_api(struct netdev *netdev) /* fallback here if delete chains fail */ if (!get_chain_supported) { - tc_del_filter(&id); + tc_del_flower_filter(&id); } /* make sure there is no ingress/egress qdisc */ diff --git a/lib/tc.c b/lib/tc.c index 447ab376ee0..1fb2b4a92ca 100644 --- a/lib/tc.c +++ b/lib/tc.c @@ -2337,14 +2337,21 @@ parse_netlink_to_tc_policer(struct ofpbuf *reply, uint32_t police_idx[]) } int -tc_del_filter(struct tcf_id *id) +tc_del_filter(struct tcf_id *id, const char *kind) { struct ofpbuf request; request_from_tcf_id(id, 0, RTM_DELTFILTER, NLM_F_ACK, &request); + nl_msg_put_string(&request, TCA_KIND, kind); return tc_transact(&request, NULL); } +int +tc_del_flower_filter(struct tcf_id *id) +{ + return tc_del_filter(id, "flower"); +} + int tc_get_flower(struct tcf_id *id, struct tc_flower *flower) { @@ -2353,6 +2360,7 @@ tc_get_flower(struct tcf_id *id, struct tc_flower *flower) int error; request_from_tcf_id(id, 0, RTM_GETTFILTER, NLM_F_ECHO, 
&request); + nl_msg_put_string(&request, TCA_KIND, "flower"); error = tc_transact(&request, &reply); if (error) { return error; diff --git a/lib/tc.h b/lib/tc.h index a828fd3e3f1..ea4ce806bc8 100644 --- a/lib/tc.h +++ b/lib/tc.h @@ -384,7 +384,8 @@ struct tc_flower { }; int tc_replace_flower(struct tcf_id *id, struct tc_flower *flower); -int tc_del_filter(struct tcf_id *id); +int tc_del_filter(struct tcf_id *id, const char *kind); +int tc_del_flower_filter(struct tcf_id *id); int tc_get_flower(struct tcf_id *id, struct tc_flower *flower); int tc_dump_flower_start(struct tcf_id *id, struct nl_dump *dump, bool terse); int tc_dump_tc_chain_start(struct tcf_id *id, struct nl_dump *dump); From e22e1f6725b99007b469fcccdbb48f525f302757 Mon Sep 17 00:00:00 2001 From: wangchuanlei Date: Wed, 18 Jan 2023 20:31:17 -0500 Subject: [PATCH 147/833] dpctl: Add support to count upcall packets. Add support to count upcall packets per port, both succeed and failed, which is a better way to see how many packets upcalled on each interface. Acked-by: Eelco Chaudron Signed-off-by: wangchuanlei Signed-off-by: Ilya Maximets --- NEWS | 4 ++++ include/linux/openvswitch.h | 14 ++++++++++++++ include/openvswitch/netdev.h | 4 ++++ lib/dpctl.c | 4 ++++ lib/dpif-netlink.c | 17 +++++++++++++++++ lib/dpif-netlink.h | 2 ++ lib/netdev-linux.c | 24 +++++++++++++----------- vswitchd/bridge.c | 4 +++- 8 files changed, 61 insertions(+), 12 deletions(-) diff --git a/NEWS b/NEWS index 83c126b0024..fe6055a2700 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,9 @@ Post-v3.1.0 -------------------- + - Linux kernel datapath: + * OVS now collects per-interface upcall statistics that can be obtained + via 'ovs-appctl dpctl/show -s' or the interface's statistics column + in OVSDB. Available with upstream kernel 6.2+. 
v3.1.0 - xx xxx xxxx diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h index 8bb5abdc834..bc8f7499184 100644 --- a/include/linux/openvswitch.h +++ b/include/linux/openvswitch.h @@ -301,11 +301,25 @@ enum ovs_vport_attr { OVS_VPORT_ATTR_PAD, OVS_VPORT_ATTR_IFINDEX, OVS_VPORT_ATTR_NETNSID, + OVS_VPORT_ATTR_UPCALL_STATS, __OVS_VPORT_ATTR_MAX }; #define OVS_VPORT_ATTR_MAX (__OVS_VPORT_ATTR_MAX - 1) +/** + * enum ovs_vport_upcall_attr - attributes for %OVS_VPORT_UPCALL* commands + * @OVS_VPORT_UPCALL_ATTR_SUCCESS: 64-bit upcall success packets. + * @OVS_VPORT_UPCALL_ATTR_FAIL: 64-bit upcall fail packets. + */ +enum ovs_vport_upcall_attr { + OVS_VPORT_UPCALL_ATTR_SUCCESS, + OVS_VPORT_UPCALL_ATTR_FAIL, + __OVS_VPORT_UPCALL_ATTR_MAX, +}; + +#define OVS_VPORT_UPCALL_ATTR_MAX (__OVS_VPORT_UPCALL_ATTR_MAX - 1) + enum { OVS_VXLAN_EXT_UNSPEC, OVS_VXLAN_EXT_GBP, diff --git a/include/openvswitch/netdev.h b/include/openvswitch/netdev.h index cf48f86915f..cafd6fd7bee 100644 --- a/include/openvswitch/netdev.h +++ b/include/openvswitch/netdev.h @@ -87,6 +87,10 @@ struct netdev_stats { uint64_t rx_oversize_errors; uint64_t rx_fragmented_errors; uint64_t rx_jabber_errors; + + /* Datapath upcall statistics. */ + uint64_t upcall_packets; /* Rx packets forwarded to userspace. */ + uint64_t upcall_errors; /* Rx packets failed forwarding to userspace. 
*/ }; /* Structure representation of custom statistics counter */ diff --git a/lib/dpctl.c b/lib/dpctl.c index d12d9b8a5e8..c501a0cd76b 100644 --- a/lib/dpctl.c +++ b/lib/dpctl.c @@ -750,6 +750,10 @@ show_dpif(struct dpif *dpif, struct dpctl_params *dpctl_p) print_stat(dpctl_p, " TX bytes:", s.tx_bytes); print_human_size(dpctl_p, s.tx_bytes); dpctl_print(dpctl_p, "\n"); + + print_stat(dpctl_p, " UPCALL packets:", s.upcall_packets); + print_stat(dpctl_p, " errors:", s.upcall_errors); + dpctl_print(dpctl_p, "\n"); } else { dpctl_print(dpctl_p, ", could not retrieve stats (%s)", ovs_strerror(error)); diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c index 026b0daa8d8..586fb8893d2 100644 --- a/lib/dpif-netlink.c +++ b/lib/dpif-netlink.c @@ -4685,6 +4685,8 @@ dpif_netlink_vport_from_ofpbuf(struct dpif_netlink_vport *vport, .optional = true }, [OVS_VPORT_ATTR_OPTIONS] = { .type = NL_A_NESTED, .optional = true }, [OVS_VPORT_ATTR_NETNSID] = { .type = NL_A_U32, .optional = true }, + [OVS_VPORT_ATTR_UPCALL_STATS] = { .type = NL_A_NESTED, + .optional = true }, }; dpif_netlink_vport_init(vport); @@ -4716,6 +4718,21 @@ dpif_netlink_vport_from_ofpbuf(struct dpif_netlink_vport *vport, if (a[OVS_VPORT_ATTR_STATS]) { vport->stats = nl_attr_get(a[OVS_VPORT_ATTR_STATS]); } + if (a[OVS_VPORT_ATTR_UPCALL_STATS]) { + const struct nlattr *nla; + size_t left; + + NL_NESTED_FOR_EACH (nla, left, a[OVS_VPORT_ATTR_UPCALL_STATS]) { + if (nl_attr_type(nla) == OVS_VPORT_UPCALL_ATTR_SUCCESS) { + vport->upcall_success = nl_attr_get_u64(nla); + } else if (nl_attr_type(nla) == OVS_VPORT_UPCALL_ATTR_FAIL) { + vport->upcall_fail = nl_attr_get_u64(nla); + } + } + } else { + vport->upcall_success = UINT64_MAX; + vport->upcall_fail = UINT64_MAX; + } if (a[OVS_VPORT_ATTR_OPTIONS]) { vport->options = nl_attr_get(a[OVS_VPORT_ATTR_OPTIONS]); vport->options_len = nl_attr_get_size(a[OVS_VPORT_ATTR_OPTIONS]); diff --git a/lib/dpif-netlink.h b/lib/dpif-netlink.h index 24294bc42dc..4909fe16089 100644 --- 
a/lib/dpif-netlink.h +++ b/lib/dpif-netlink.h @@ -44,6 +44,8 @@ struct dpif_netlink_vport { uint32_t n_upcall_pids; const uint32_t *upcall_pids; /* OVS_VPORT_ATTR_UPCALL_PID. */ const struct ovs_vport_stats *stats; /* OVS_VPORT_ATTR_STATS. */ + uint64_t upcall_success; /* OVS_VPORT_UPCALL_ATTR_SUCCESS. */ + uint64_t upcall_fail; /* OVS_VPORT_UPCALL_ATTR_FAIL. */ const struct nlattr *options; /* OVS_VPORT_ATTR_OPTIONS. */ size_t options_len; }; diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 65bdd51dbaa..7c19c40163f 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -2156,16 +2156,16 @@ swap_uint64(uint64_t *a, uint64_t *b) * 'src' is allowed to be misaligned. */ static void netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst, - const struct ovs_vport_stats *src) -{ - dst->rx_packets = get_32aligned_u64(&src->rx_packets); - dst->tx_packets = get_32aligned_u64(&src->tx_packets); - dst->rx_bytes = get_32aligned_u64(&src->rx_bytes); - dst->tx_bytes = get_32aligned_u64(&src->tx_bytes); - dst->rx_errors = get_32aligned_u64(&src->rx_errors); - dst->tx_errors = get_32aligned_u64(&src->tx_errors); - dst->rx_dropped = get_32aligned_u64(&src->rx_dropped); - dst->tx_dropped = get_32aligned_u64(&src->tx_dropped); + const struct dpif_netlink_vport *vport) +{ + dst->rx_packets = get_32aligned_u64(&vport->stats->rx_packets); + dst->tx_packets = get_32aligned_u64(&vport->stats->tx_packets); + dst->rx_bytes = get_32aligned_u64(&vport->stats->rx_bytes); + dst->tx_bytes = get_32aligned_u64(&vport->stats->tx_bytes); + dst->rx_errors = get_32aligned_u64(&vport->stats->rx_errors); + dst->tx_errors = get_32aligned_u64(&vport->stats->tx_errors); + dst->rx_dropped = get_32aligned_u64(&vport->stats->rx_dropped); + dst->tx_dropped = get_32aligned_u64(&vport->stats->tx_dropped); dst->multicast = 0; dst->collisions = 0; dst->rx_length_errors = 0; @@ -2179,6 +2179,8 @@ netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst, dst->tx_fifo_errors = 0; 
dst->tx_heartbeat_errors = 0; dst->tx_window_errors = 0; + dst->upcall_packets = vport->upcall_success; + dst->upcall_errors = vport->upcall_fail; } static int @@ -2196,7 +2198,7 @@ get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats) return EOPNOTSUPP; } - netdev_stats_from_ovs_vport_stats(stats, reply.stats); + netdev_stats_from_ovs_vport_stats(stats, &reply); ofpbuf_delete(buf); diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index bfb2adef1dd..abf2afe5737 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -2626,7 +2626,9 @@ iface_refresh_stats(struct iface *iface) IFACE_STAT(rx_undersized_errors, "rx_undersized_errors") \ IFACE_STAT(rx_oversize_errors, "rx_oversize_errors") \ IFACE_STAT(rx_fragmented_errors, "rx_fragmented_errors") \ - IFACE_STAT(rx_jabber_errors, "rx_jabber_errors") + IFACE_STAT(rx_jabber_errors, "rx_jabber_errors") \ + IFACE_STAT(upcall_packets, "upcall_packets") \ + IFACE_STAT(upcall_errors, "upcall_errors") #define IFACE_STAT(MEMBER, NAME) + 1 enum { N_IFACE_STATS = IFACE_STATS }; From 4fd2d46c01f2fabcd2fbcdc805d85d9d9190fc8c Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 31 Jan 2023 17:31:23 +0100 Subject: [PATCH 148/833] AUTHORS: Add wangchuanlei. Signed-off-by: Ilya Maximets --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 2df76c56f11..c82570fb6e3 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -495,6 +495,7 @@ lic121 lic121@chinatelecom.cn lzhecheng lzhecheng@vmware.com parameswaran krishnamurthy parkrish@gmail.com solomon liwei.solomon@gmail.com +wangchuanlei wangchuanlei@inspur.com wenxu wenxu@ucloud.cn wisd0me ak47izatool@gmail.com xushengping shengping.xu@huawei.com From d6501c66050ad7ad54081f2104a01a28f1b3ce42 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 17 Jan 2023 18:08:27 +0100 Subject: [PATCH 149/833] sparse: Fix numa.h for libnuma >= 2.0.13. 
Current numa.h header for sparse re-defines functions in a way that breaks the header from libnuma 2.0.13+, because the original issue was fixed in that version: https://github.com/numactl/numactl/commit/25dcde021dd4f1a1dcac2ba0094f1cb441a2e4a5 Sparse errors as a result: lib/netdev-afxdp.c: note: in included file (through include/sparse/numa.h): /usr/include/numa.h:346:26: error: macro "numa_get_interleave_mask_compat" passed 1 arguments, but takes just 0 /usr/include/numa.h:376:26: error: macro "numa_get_membind_compat" passed 1 arguments, but takes just 0 /usr/include/numa.h:406:26: error: macro "numa_get_run_node_mask_compat" passed 1 arguments, but takes just 0 /usr/include/numa.h:347:1: error: Expected ; at end of declaration /usr/include/numa.h:347:1: error: got { /usr/include/numa.h:351:9: error: 'tp' has implicit type It's hard to adjust defines to work with both versions of a header. Just defining all the functions we actually use in OVS instead and not including the original header. Fixes: e8568993e062 ("netdev-afxdp: NUMA-aware memory allocation for XSK related memory.") Reviewed-by: David Marchand Signed-off-by: Ilya Maximets --- include/sparse/numa.h | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/include/sparse/numa.h b/include/sparse/numa.h index 3691a0eaf72..a185972e31a 100644 --- a/include/sparse/numa.h +++ b/include/sparse/numa.h @@ -18,10 +18,21 @@ #error "Use this header only with sparse. It is not a correct implementation." #endif -/* Avoid sparse warning: non-ANSI function declaration of function" */ -#define numa_get_membind_compat() numa_get_membind_compat(void) -#define numa_get_interleave_mask_compat() numa_get_interleave_mask_compat(void) -#define numa_get_run_node_mask_compat() numa_get_run_node_mask_compat(void) +#ifndef __NUMA_H_SPARSE +#define __NUMA_H_SPARSE 1 -/* Get actual definitions for us to annotate and build on. 
*/ -#include_next +/* Avoid sparse warning "non-ANSI function declaration of function" with + * libnuma < 2.0.13. */ + +struct bitmask { + unsigned long size; + unsigned long *maskp; +}; + +int numa_available(void); +struct bitmask *numa_allocate_nodemask(void); +void numa_bitmask_free(struct bitmask *); +void numa_set_localalloc(void); +void numa_set_preferred(int node); + +#endif /* for sparse. */ From b1f58f5072d6c934aafadc8ee27832ffac003db5 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Wed, 1 Feb 2023 12:12:14 +0100 Subject: [PATCH 150/833] netdev-offload-tc: Preserve tc statistics when flow gets modified. When a flow gets modified, i.e. the actions are changed, the tc layer will remove and re-add the flow. This is causing all the counters to be reset. This patch will remember the previous tc counters and adjust any requests for statistics. This is done in a similar way as the rte_flow implementation. It also updates the check_pkt_len tc test to purge the flows, so we do not use existing updated tc flow counters, but start with a freshly installed set of datapath flows.
Signed-off-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- lib/netdev-offload-tc.c | 99 +++++++++++++++++++++++++++----- lib/tc.h | 1 - tests/system-offloads-traffic.at | 78 ++++++++++++++++++++++--- tests/system-traffic.at | 63 ++++++++++++++++++++ 4 files changed, 218 insertions(+), 23 deletions(-) diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c index 6e1bbaa2855..134c241576a 100644 --- a/lib/netdev-offload-tc.c +++ b/lib/netdev-offload-tc.c @@ -97,6 +97,12 @@ static int netdev_tc_parse_nl_actions(struct netdev *netdev, bool *recirc_act, bool more_actions, struct tc_action **need_jump_update); +static void parse_tc_flower_to_stats(struct tc_flower *flower, + struct dpif_flow_stats *stats); + +static int get_ufid_adjust_stats(const ovs_u128 *ufid, + struct dpif_flow_stats *stats); + static bool is_internal_port(const char *type) { @@ -193,6 +199,9 @@ static struct ovs_mutex ufid_lock = OVS_MUTEX_INITIALIZER; * @ufid: ufid assigned to the flow * @id: tc filter id (tcf_id) * @netdev: netdev associated with the tc rule + * @adjust_stats: When flow gets updated with new actions, we need to adjust + * the reported stats to include previous values as the hardware + * rule is removed and re-added. This stats copy is used for it. */ struct ufid_tc_data { struct hmap_node ufid_to_tc_node; @@ -200,6 +209,7 @@ struct ufid_tc_data { ovs_u128 ufid; struct tcf_id id; struct netdev *netdev; + struct dpif_flow_stats adjust_stats; }; static void @@ -233,12 +243,38 @@ del_ufid_tc_mapping(const ovs_u128 *ufid) ovs_mutex_unlock(&ufid_lock); } +static void +netdev_tc_adjust_stats(struct dpif_flow_stats *stats, + const struct dpif_flow_stats *adjust_stats) +{ + /* Do not try to restore the stats->used, as in terse mode dumps TC doesn't + * report TCA_ACT_OPTIONS, so the 'lastused' value is not available, hence + * we report used as 0. + * tcp_flags is not collected by tc, so no need to update it. 
*/ + stats->n_bytes += adjust_stats->n_bytes; + stats->n_packets += adjust_stats->n_packets; +} + /* Wrapper function to delete filter and ufid tc mapping */ static int -del_filter_and_ufid_mapping(struct tcf_id *id, const ovs_u128 *ufid) +del_filter_and_ufid_mapping(struct tcf_id *id, const ovs_u128 *ufid, + struct dpif_flow_stats *stats) { + struct tc_flower flower; int err; + if (stats) { + memset(stats, 0, sizeof *stats); + if (!tc_get_flower(id, &flower)) { + struct dpif_flow_stats adjust_stats; + + parse_tc_flower_to_stats(&flower, stats); + if (!get_ufid_adjust_stats(ufid, &adjust_stats)) { + netdev_tc_adjust_stats(stats, &adjust_stats); + } + } + } + err = tc_del_flower_filter(id); if (!err) { del_ufid_tc_mapping(ufid); @@ -249,7 +285,7 @@ del_filter_and_ufid_mapping(struct tcf_id *id, const ovs_u128 *ufid) /* Add ufid entry to ufid_to_tc hashmap. */ static void add_ufid_tc_mapping(struct netdev *netdev, const ovs_u128 *ufid, - struct tcf_id *id) + struct tcf_id *id, struct dpif_flow_stats *stats) { struct ufid_tc_data *new_data = xzalloc(sizeof *new_data); size_t ufid_hash = hash_bytes(ufid, sizeof *ufid, 0); @@ -261,6 +297,9 @@ add_ufid_tc_mapping(struct netdev *netdev, const ovs_u128 *ufid, new_data->ufid = *ufid; new_data->id = *id; new_data->netdev = netdev_ref(netdev); + if (stats) { + new_data->adjust_stats = *stats; + } ovs_mutex_lock(&ufid_lock); hmap_insert(&ufid_to_tc, &new_data->ufid_to_tc_node, ufid_hash); @@ -292,6 +331,30 @@ get_ufid_tc_mapping(const ovs_u128 *ufid, struct tcf_id *id) return ENOENT; } +/* Get adjust_stats from ufid_to_tc hashmap. + * + * Returns 0 if successful and fills stats with adjust_stats. + * Otherwise returns the error. 
+*/ +static int +get_ufid_adjust_stats(const ovs_u128 *ufid, struct dpif_flow_stats *stats) +{ + size_t ufid_hash = hash_bytes(ufid, sizeof *ufid, 0); + struct ufid_tc_data *data; + + ovs_mutex_lock(&ufid_lock); + HMAP_FOR_EACH_WITH_HASH (data, ufid_to_tc_node, ufid_hash, &ufid_to_tc) { + if (ovs_u128_equals(*ufid, data->ufid)) { + *stats = data->adjust_stats; + ovs_mutex_unlock(&ufid_lock); + return 0; + } + } + ovs_mutex_unlock(&ufid_lock); + + return ENOENT; +} + /* Find ufid entry in ufid_to_tc hashmap using tcf_id id. * The result is saved in ufid. * @@ -1193,6 +1256,7 @@ netdev_tc_flow_dump_next(struct netdev_flow_dump *dump, get_tc_qdisc_hook(netdev)); while (nl_dump_next(dump->nl_dump, &nl_flow, rbuffer)) { + struct dpif_flow_stats adjust_stats; struct tc_flower flower; if (parse_netlink_to_tc_flower(&nl_flow, &id, &flower, dump->terse)) { @@ -1210,6 +1274,10 @@ netdev_tc_flow_dump_next(struct netdev_flow_dump *dump, continue; } + if (!get_ufid_adjust_stats(ufid, &adjust_stats)) { + netdev_tc_adjust_stats(stats, &adjust_stats); + } + match->wc.masks.in_port.odp_port = u32_to_odp(UINT32_MAX); match->flow.in_port.odp_port = dump->port; match_set_recirc_id(match, id.chain); @@ -2059,6 +2127,7 @@ netdev_tc_flow_put(struct netdev *netdev, struct match *match, struct flow *mask = &match->wc.masks; const struct flow_tnl *tnl = &match->flow.tunnel; struct flow_tnl *tnl_mask = &mask->tunnel; + struct dpif_flow_stats adjust_stats; bool recirc_act = false; uint32_t block_id = 0; struct tcf_id id; @@ -2352,10 +2421,12 @@ netdev_tc_flow_put(struct netdev *netdev, struct match *match, return EOPNOTSUPP; } + memset(&adjust_stats, 0, sizeof adjust_stats); if (get_ufid_tc_mapping(ufid, &id) == 0) { VLOG_DBG_RL(&rl, "updating old handle: %d prio: %d", id.handle, id.prio); - info->tc_modify_flow_deleted = !del_filter_and_ufid_mapping(&id, ufid); + info->tc_modify_flow_deleted = !del_filter_and_ufid_mapping( + &id, ufid, &adjust_stats); } prio = 
get_prio_for_tc_flower(&flower); @@ -2373,8 +2444,9 @@ netdev_tc_flow_put(struct netdev *netdev, struct match *match, if (!err) { if (stats) { memset(stats, 0, sizeof *stats); + netdev_tc_adjust_stats(stats, &adjust_stats); } - add_ufid_tc_mapping(netdev, ufid, &id); + add_ufid_tc_mapping(netdev, ufid, &id, &adjust_stats); } return err; @@ -2415,6 +2487,13 @@ netdev_tc_flow_get(struct netdev *netdev, parse_tc_flower_to_match(netdev, &flower, match, actions, stats, attrs, buf, false); + if (stats) { + struct dpif_flow_stats adjust_stats; + + if (!get_ufid_adjust_stats(ufid, &adjust_stats)) { + netdev_tc_adjust_stats(stats, &adjust_stats); + } + } match->wc.masks.in_port.odp_port = u32_to_odp(UINT32_MAX); match->flow.in_port.odp_port = in_port; match_set_recirc_id(match, id.chain); @@ -2427,7 +2506,6 @@ netdev_tc_flow_del(struct netdev *netdev OVS_UNUSED, const ovs_u128 *ufid, struct dpif_flow_stats *stats) { - struct tc_flower flower; struct tcf_id id; int error; @@ -2436,16 +2514,7 @@ netdev_tc_flow_del(struct netdev *netdev OVS_UNUSED, return error; } - if (stats) { - memset(stats, 0, sizeof *stats); - if (!tc_get_flower(&id, &flower)) { - parse_tc_flower_to_stats(&flower, stats); - } - } - - error = del_filter_and_ufid_mapping(&id, ufid); - - return error; + return del_filter_and_ufid_mapping(&id, ufid, stats); } static int diff --git a/lib/tc.h b/lib/tc.h index ea4ce806bc8..cdd3b4f60ec 100644 --- a/lib/tc.h +++ b/lib/tc.h @@ -343,7 +343,6 @@ static inline bool is_tcf_id_eq(struct tcf_id *id1, struct tcf_id *id2) { return id1->prio == id2->prio - && id1->handle == id2->handle && id1->handle == id2->handle && id1->hook == id2->hook && id1->block_id == id2->block_id diff --git a/tests/system-offloads-traffic.at b/tests/system-offloads-traffic.at index 16a4c1a0088..8775f99226d 100644 --- a/tests/system-offloads-traffic.at +++ b/tests/system-offloads-traffic.at @@ -397,7 +397,7 @@ AT_CHECK([cat p4.pcap | awk 'NF{print $NF}' | uniq -c | awk '{$1=$1;print}'], [0 # This 
test verifies the total packet counters work when individual branches # are taken. -AT_CHECK([ovs-appctl revalidator/wait], [0]) +AT_CHECK([ovs-appctl revalidator/purge], [0]) AT_CHECK([ovs-ofctl del-flows br0]) AT_DATA([flows.txt], [dnl table=0,in_port=2 actions=output:1 @@ -417,9 +417,9 @@ NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 1024 10.1.1.2 | FORMAT_PIN 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) -AT_CHECK([ovs-appctl dpctl/dump-flows | grep "eth_type(0x0800)" | DUMP_CLEAN_SORTED | sed 's/bytes:11440/bytes:11720/'], [0], [dnl -in_port(2),eth(),eth_type(0x0800),ipv4(frag=no), packets:20, bytes:11720, used:0.001s, actions:check_pkt_len(size=200,gt(3),le(3)) -in_port(3),eth(),eth_type(0x0800),ipv4(frag=no), packets:20, bytes:11720, used:0.001s, actions:output +AT_CHECK([ovs-appctl dpctl/dump-flows | grep "eth_type(0x0800)" | DUMP_CLEAN_SORTED | sed 's/bytes:11348/bytes:11614/'], [0], [dnl +in_port(2),eth(),eth_type(0x0800),ipv4(frag=no), packets:19, bytes:11614, used:0.001s, actions:check_pkt_len(size=200,gt(3),le(3)) +in_port(3),eth(),eth_type(0x0800),ipv4(frag=no), packets:19, bytes:11614, used:0.001s, actions:output ]) @@ -492,7 +492,7 @@ NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 1024 10.1.1.2 | FORMAT_PIN OVS_CHECK_ACTIONS([check_pkt_len(size=200,gt(3,5),le(3,4))]) -AT_CHECK([ovs-appctl revalidator/wait], [0]) +AT_CHECK([ovs-appctl revalidator/purge], [0]) AT_CHECK([ovs-ofctl del-flows br0]) AT_DATA([flows.txt], [dnl table=0,in_port=2 actions=output:1 @@ -517,9 +517,9 @@ NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 1024 10.1.1.2 | FORMAT_PIN 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) -AT_CHECK([ovs-appctl dpctl/dump-flows type=tc,offloaded | grep "eth_type(0x0800)" | DUMP_CLEAN_SORTED | sed -e 's/bytes:11348/bytes:11614/' -e 's/bytes:11440/bytes:11720/'], [0], 
[dnl +AT_CHECK([ovs-appctl dpctl/dump-flows type=tc,offloaded | grep "eth_type(0x0800)" | DUMP_CLEAN_SORTED | sed -e 's/bytes:11348/bytes:11614/'], [0], [dnl in_port(2),eth(),eth_type(0x0800),ipv4(proto=1,tos=0/0xfc,frag=no), packets:19, bytes:11614, used:0.001s, actions:check_pkt_len(size=200,gt(set(ipv4(tos=0x4/0xfc)),4),le(set(ipv4(tos=0x8/0xfc)),5)),3 -in_port(3),eth(),eth_type(0x0800),ipv4(frag=no), packets:20, bytes:11720, used:0.001s, actions:output +in_port(3),eth(),eth_type(0x0800),ipv4(frag=no), packets:19, bytes:11614, used:0.001s, actions:output ]) sleep 1 @@ -680,3 +680,67 @@ OVS_CHECK_ACTIONS([check_pkt_len(size=200,gt(5),le(check_pkt_len(size=100,gt(5), OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP + + +AT_SETUP([offloads - simulated flow action update]) +OVS_TRAFFIC_VSWITCHD_START([], [], [-- set Open_vSwitch . other_config:hw-offload=true]) + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +AT_DATA([flows.txt], [dnl +add in_port=ovs-p0,actions=ovs-p1,br0 +add in_port=ovs-p1,actions=ovs-p0,br0 +]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +10 packets transmitted, 10 received, 0% packet loss, time 0ms +]) + +AT_CHECK([ovs-appctl dpctl/dump-flows | grep "eth_type(0x0800)" | sort | dnl + strip_recirc | strip_used | dnl + sed 's/,packet_type(ns=[[0-9]]*,id=[[0-9]]*),/,/;s/,eth(),/,/;s/bytes:756/bytes:882/'], + [0], [dnl +recirc_id(),in_port(2),eth_type(0x0800),ipv4(frag=no), packets:9, bytes:882, used:0.0s, actions:3,1 +recirc_id(),in_port(3),eth_type(0x0800),ipv4(frag=no), packets:9, bytes:882, used:0.0s, actions:2,1 +]) + +AT_DATA([flows2.txt], [dnl +modify in_port=ovs-p0,actions=ovs-p1 +modify in_port=ovs-p1,actions=ovs-p0 +]) +AT_CHECK([ovs-ofctl add-flows br0 flows2.txt]) +AT_CHECK([ovs-appctl revalidator/wait], [0]) + +NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -w 2 10.1.1.2 | FORMAT_PING], 
[0], [dnl +10 packets transmitted, 10 received, 0% packet loss, time 0ms +]) + +AT_CHECK([ovs-appctl dpctl/dump-flows | grep "eth_type(0x0800)" | sort | dnl + strip_recirc | strip_used | dnl + sed -e 's/,packet_type(ns=[[0-9]]*,id=[[0-9]]*),/,/;s/,eth(),/,/;s/bytes:1596/bytes:1862/'], + [0], [dnl +recirc_id(),in_port(2),eth_type(0x0800),ipv4(frag=no), packets:19, bytes:1862, used:0.0s, actions:3 +recirc_id(),in_port(3),eth_type(0x0800),ipv4(frag=no), packets:19, bytes:1862, used:0.0s, actions:2 +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) +AT_CHECK([ovs-appctl revalidator/wait], [0]) + +NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +10 packets transmitted, 10 received, 0% packet loss, time 0ms +]) + +AT_CHECK([ovs-appctl dpctl/dump-flows | grep "eth_type(0x0800)" | sort | dnl + strip_recirc | strip_used | dnl + sed 's/,packet_type(ns=[[0-9]]*,id=[[0-9]]*),/,/;s/,eth(),/,/;s/bytes:2436/bytes:2842/'], + [0], [dnl +recirc_id(),in_port(2),eth_type(0x0800),ipv4(frag=no), packets:29, bytes:2842, used:0.0s, actions:3,1 +recirc_id(),in_port(3),eth_type(0x0800),ipv4(frag=no), packets:29, bytes:2842, used:0.0s, actions:2,1 +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 6d8651a44c4..b1b01380aff 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -1924,6 +1924,69 @@ masks-cache:size:256 OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([datapath - simulated flow action update]) +OVS_TRAFFIC_VSWITCHD_START() + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +AT_DATA([flows.txt], [dnl +add in_port=ovs-p0,actions=ovs-p1,br0 +add in_port=ovs-p1,actions=ovs-p0,br0 +]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +10 packets transmitted, 10 received, 0% packet loss, time 0ms +]) + 
+AT_CHECK([ovs-appctl dpctl/dump-flows | grep "eth_type(0x0800)" | sort | dnl + strip_recirc | strip_used | dnl + sed 's/,packet_type(ns=[[0-9]]*,id=[[0-9]]*),/,/;s/,eth(),/,/;s/bytes:756/bytes:882/'], + [0], [dnl +recirc_id(),in_port(2),eth_type(0x0800),ipv4(frag=no), packets:9, bytes:882, used:0.0s, actions:3,1 +recirc_id(),in_port(3),eth_type(0x0800),ipv4(frag=no), packets:9, bytes:882, used:0.0s, actions:2,1 +]) + +AT_DATA([flows2.txt], [dnl +modify in_port=ovs-p0,actions=ovs-p1 +modify in_port=ovs-p1,actions=ovs-p0 +]) +AT_CHECK([ovs-ofctl add-flows br0 flows2.txt]) +AT_CHECK([ovs-appctl revalidator/wait], [0]) + +NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +10 packets transmitted, 10 received, 0% packet loss, time 0ms +]) + +AT_CHECK([ovs-appctl dpctl/dump-flows | grep "eth_type(0x0800)" | sort | dnl + strip_recirc | strip_used | dnl + sed -e 's/,packet_type(ns=[[0-9]]*,id=[[0-9]]*),/,/;s/,eth(),/,/;s/bytes:1596/bytes:1862/'], + [0], [dnl +recirc_id(),in_port(2),eth_type(0x0800),ipv4(frag=no), packets:19, bytes:1862, used:0.0s, actions:3 +recirc_id(),in_port(3),eth_type(0x0800),ipv4(frag=no), packets:19, bytes:1862, used:0.0s, actions:2 +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) +AT_CHECK([ovs-appctl revalidator/wait], [0]) + +NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +10 packets transmitted, 10 received, 0% packet loss, time 0ms +]) + +AT_CHECK([ovs-appctl dpctl/dump-flows | grep "eth_type(0x0800)" | sort | dnl + strip_recirc | strip_used | dnl + sed 's/,packet_type(ns=[[0-9]]*,id=[[0-9]]*),/,/;s/,eth(),/,/;s/bytes:2436/bytes:2842/'], + [0], [dnl +recirc_id(),in_port(2),eth_type(0x0800),ipv4(frag=no), packets:29, bytes:2842, used:0.0s, actions:3,1 +recirc_id(),in_port(3),eth_type(0x0800),ipv4(frag=no), packets:29, bytes:2842, used:0.0s, actions:2,1 +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + AT_BANNER([MPLS]) AT_SETUP([mpls - encap header dp-support]) From 
531c17023cd21cdbdc876b544507fc4f723afd7b Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Mon, 30 Jan 2023 17:04:15 -0500 Subject: [PATCH 151/833] netdev-dummy: Allocate dummy_packet_stream on cacheline boundary. UB Sanitizer report: lib/netdev-dummy.c:197:15: runtime error: member access within misaligned address 0x00000217a7f0 for type 'struct dummy_packet_stream', which requires 64 byte alignment ^ #0 dummy_packet_stream_init lib/netdev-dummy.c:197 #1 dummy_packet_stream_create lib/netdev-dummy.c:208 #2 dummy_packet_conn_set_config lib/netdev-dummy.c:436 [...] Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/netdev-dummy.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c index 72cb9547110..5d59c9c0312 100644 --- a/lib/netdev-dummy.c +++ b/lib/netdev-dummy.c @@ -204,7 +204,7 @@ dummy_packet_stream_create(struct stream *stream) { struct dummy_packet_stream *s; - s = xzalloc(sizeof *s); + s = xzalloc_cacheline(sizeof *s); dummy_packet_stream_init(s, stream); return s; @@ -350,7 +350,7 @@ dummy_packet_conn_close(struct dummy_packet_conn *conn) pstream_close(pconn->pstream); for (i = 0; i < pconn->n_streams; i++) { dummy_packet_stream_close(pconn->streams[i]); - free(pconn->streams[i]); + free_cacheline(pconn->streams[i]); } free(pconn->streams); pconn->pstream = NULL; @@ -359,7 +359,7 @@ dummy_packet_conn_close(struct dummy_packet_conn *conn) case ACTIVE: dummy_packet_stream_close(rconn->rstream); - free(rconn->rstream); + free_cacheline(rconn->rstream); rconn->rstream = NULL; reconnect_destroy(rconn->reconnect); rconn->reconnect = NULL; @@ -469,7 +469,7 @@ dummy_pconn_run(struct netdev_dummy *dev) pconn->streams = xrealloc(pconn->streams, ((pconn->n_streams + 1) * sizeof s)); - s = xmalloc(sizeof *s); + s = xmalloc_cacheline(sizeof *s); pconn->streams[pconn->n_streams++] = s; dummy_packet_stream_init(s, new_stream); } else if (error != EAGAIN) { @@ -489,7 +489,7 @@ 
dummy_pconn_run(struct netdev_dummy *dev) stream_get_name(s->stream), ovs_retval_to_string(error)); dummy_packet_stream_close(s); - free(s); + free_cacheline(s); pconn->streams[i] = pconn->streams[--pconn->n_streams]; } else { i++; From 4339e7b19f721d6f6813118effb47144f4a2aade Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Mon, 30 Jan 2023 17:04:16 -0500 Subject: [PATCH 152/833] dp-packet: Allocate on cacheline boundary with DPDK. UB Sanitizer report: lib/dp-packet.h:587:22: runtime error: member access within misaligned address 0x000001ecde10 for type 'struct dp_packet', which requires 64 byte alignment #0 in dp_packet_set_base lib/dp-packet.h:587 #1 in dp_packet_use__ lib/dp-packet.c:46 #2 in dp_packet_use lib/dp-packet.c:60 #3 in dp_packet_init lib/dp-packet.c:126 #4 in dp_packet_new lib/dp-packet.c:150 [...] Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/dp-packet.c | 4 ++++ lib/dp-packet.h | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/lib/dp-packet.c b/lib/dp-packet.c index 61e405460a2..ae8ab5800e4 100644 --- a/lib/dp-packet.c +++ b/lib/dp-packet.c @@ -146,7 +146,11 @@ dp_packet_uninit(struct dp_packet *b) struct dp_packet * dp_packet_new(size_t size) { +#ifdef DPDK_NETDEV + struct dp_packet *b = xmalloc_cacheline(sizeof *b); +#else struct dp_packet *b = xmalloc(sizeof *b); +#endif dp_packet_init(b, size); return b; } diff --git a/lib/dp-packet.h b/lib/dp-packet.h index ed1e5b3f6d1..b3e6a5d10c7 100644 --- a/lib/dp-packet.h +++ b/lib/dp-packet.h @@ -257,7 +257,11 @@ dp_packet_delete(struct dp_packet *b) } dp_packet_uninit(b); +#ifdef DPDK_NETDEV + free_cacheline(b); +#else free(b); +#endif } } From e85e8a7541cbe7d9c57bd34ae99d47edad92f111 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Mon, 30 Jan 2023 17:04:17 -0500 Subject: [PATCH 153/833] hash: Avoid 64bit crc intrinsics on 32bit aligned data. 
UB Sanitizer report: lib/hash.h:219:17: runtime error: load of misaligned address 0x7ffc164a88b4 for type 'const uint64_t', which requires 8 byte alignment #0 in hash_words_inline lib/hash.h:219 #1 in hash_words lib/hash.h:297 [...] Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/hash.h | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/lib/hash.h b/lib/hash.h index 60a39a40b8a..7b7f70c112a 100644 --- a/lib/hash.h +++ b/lib/hash.h @@ -187,13 +187,72 @@ static inline uint32_t hash_finish(uint64_t hash, uint64_t final) return hash ^ (uint32_t)hash >> 16; /* Increase entropy in LSBs. */ } +static inline uint32_t +hash_finish32(uint64_t hash, uint32_t final, uint32_t semifinal) +{ + /* The finishing multiplier 0x805204f3 has been experimentally + * derived to pass the testsuite hash tests. */ + hash = _mm_crc32_u32(hash, semifinal); + hash = _mm_crc32_u32(hash, final) * 0x805204f3; + return hash ^ ((uint32_t) hash >> 16); /* Increase entropy in LSBs. 
*/ +} + +static inline uint32_t +hash_words_32aligned(const uint32_t *p_, size_t n_words, uint32_t basis) +{ + const uint32_t *p = (const void *) p_; + uint32_t hash1 = basis; + uint32_t hash2 = 0; + uint32_t hash3 = n_words; + const uint32_t *endp = (const uint32_t *) p + n_words; + const uint32_t *limit = p + n_words - 6; + + while (p <= limit) { + hash1 = _mm_crc32_u32(hash1, p[0]); + hash1 = _mm_crc32_u32(hash1, p[1]); + hash2 = _mm_crc32_u32(hash2, p[2]); + hash2 = _mm_crc32_u32(hash2, p[3]); + hash3 = _mm_crc32_u32(hash3, p[4]); + hash3 = _mm_crc32_u32(hash3, p[5]); + p += 6; + } + switch (endp - (const uint32_t *) p) { + case 1: + hash1 = _mm_crc32_u32(hash1, p[0]); + break; + case 2: + hash1 = _mm_crc32_u32(hash1, p[0]); + hash1 = _mm_crc32_u32(hash1, p[1]); + break; + case 3: + hash1 = _mm_crc32_u32(hash1, p[0]); + hash1 = _mm_crc32_u32(hash1, p[1]); + hash2 = _mm_crc32_u32(hash2, p[2]); + break; + case 4: + hash1 = _mm_crc32_u32(hash1, p[0]); + hash1 = _mm_crc32_u32(hash1, p[1]); + hash2 = _mm_crc32_u32(hash2, p[2]); + hash2 = _mm_crc32_u32(hash2, p[3]); + break; + case 5: + hash1 = _mm_crc32_u32(hash1, p[0]); + hash1 = _mm_crc32_u32(hash1, p[1]); + hash2 = _mm_crc32_u32(hash2, p[2]); + hash2 = _mm_crc32_u32(hash2, p[3]); + hash3 = _mm_crc32_u32(hash3, p[4]); + break; + } + return hash_finish32(hash1, hash2, hash3); +} + /* Returns the hash of the 'n' 32-bit words at 'p_', starting from 'basis'. * We access 'p_' as a uint64_t pointer, which is fine for __SSE_4_2__. * * This is inlined for the compiler to have access to the 'n_words', which * in many cases is a constant. 
*/ static inline uint32_t -hash_words_inline(const uint32_t p_[], size_t n_words, uint32_t basis) +hash_words_inline(const uint32_t *p_, size_t n_words, uint32_t basis) { const uint64_t *p = (const void *)p_; uint64_t hash1 = basis; @@ -202,6 +261,10 @@ hash_words_inline(const uint32_t p_[], size_t n_words, uint32_t basis) const uint32_t *endp = (const uint32_t *)p + n_words; const uint64_t *limit = p + n_words / 2 - 3; + if (OVS_UNLIKELY(((intptr_t) p & ((sizeof(uint64_t)) - 1)) != 0)) { + return hash_words_32aligned(p_, n_words, basis); + } + while (p <= limit) { hash1 = _mm_crc32_u64(hash1, p[0]); hash2 = _mm_crc32_u64(hash2, p[1]); From 5dfc8309d9df10075c6dd85250acda6daaeca6fd Mon Sep 17 00:00:00 2001 From: Peng He Date: Sun, 27 Nov 2022 07:28:55 +0000 Subject: [PATCH 154/833] ofproto-dpif-upcall: New ukey needs to take the old ukey's dump seq. The userspace datapath manages all the megaflows by a cmap. The cmap data structure will grow/shrink during the datapath processing and it will re-position megaflows. This might result in two revalidator threads processing the same megaflow during one dump stage. Consider a situation where revalidator 1 processes a megaflow A and decides to delete it from the datapath; in the meantime, this megaflow A is also queued in the process batch of revalidator 2. Normally it's ok for revalidators to process the same megaflow multiple times, as the dump_seq shows it's already dumped and the stats will not be contributed twice. Assume that right after A is deleted, a PMD thread again generates a new megaflow B which has the same match and actions as A. The ukey of megaflow B will replace the one of megaflow A. Now the ukey B is new to the revalidator system and its dump seq is 0.
Now since the dump seq of ukey B is 0, when processing megaflow A, revalidator 2 will not recognize that megaflow A has already been dumped by revalidator 1 and will contribute the old megaflow A's stats again. This results in inconsistent stats between ukeys and megaflows. To fix this, the newly generated ukey B should take the dump_seq of the replaced ukey A to avoid the same megaflow being revalidated twice in one dump stage. We observed in the production environment that the OpenFlow rules' stats are sometimes amplified compared to the actual value. Signed-off-by: Peng He Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif-upcall.c | 1 + 1 file changed, 1 insertion(+) diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c index 31ac02d116f..db7570ee2a7 100644 --- a/ofproto/ofproto-dpif-upcall.c +++ b/ofproto/ofproto-dpif-upcall.c @@ -1893,6 +1893,7 @@ try_ukey_replace(struct umap *umap, struct udpif_key *old_ukey, ovs_mutex_lock(&new_ukey->mutex); cmap_replace(&umap->cmap, &old_ukey->cmap_node, &new_ukey->cmap_node, new_ukey->hash); + new_ukey->dump_seq = old_ukey->dump_seq; ovsrcu_postpone(ukey_delete__, old_ukey); transition_ukey(old_ukey, UKEY_DELETED); transition_ukey(new_ukey, UKEY_VISIBLE); From f68e757ef1f6f3b10e7d5fabdd09631f9fe20da1 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 7 Feb 2023 15:03:57 +0100 Subject: [PATCH 155/833] tests: Include working system-traffic tests into the system-offloads-testsuite. Include and run the system-traffic.at tests as part of the system offload testsuite. Exclude all the tests that will not run without any special modifications. Lowered log level for the "recirc_id sharing not supported" message, so tests will not fail with older kernels. This is not an error level message, but should be debug, like all other EOPNOTSUPP-related log messages.
Signed-off-by: Eelco Chaudron Acked-by: Roi Dayan Reviewed-by: Simon Horman Tested-by: Simon Horman Signed-off-by: Ilya Maximets --- lib/netdev-offload-tc.c | 2 +- tests/automake.mk | 3 +- tests/system-kmod-macros.at | 5 ++++ tests/system-offloads-testsuite-macros.at | 36 +++++++++++++++++++++++ tests/system-offloads-testsuite.at | 3 ++ tests/system-traffic.at | 27 +++++++++++++++++ tests/system-userspace-macros.at | 5 ++++ 7 files changed, 79 insertions(+), 2 deletions(-) create mode 100644 tests/system-offloads-testsuite-macros.at diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c index 134c241576a..a13f2fe6bb9 100644 --- a/lib/netdev-offload-tc.c +++ b/lib/netdev-offload-tc.c @@ -2417,7 +2417,7 @@ netdev_tc_flow_put(struct netdev *netdev, struct match *match, } if ((chain || recirc_act) && !info->recirc_id_shared_with_tc) { - VLOG_ERR_RL(&error_rl, "flow_put: recirc_id sharing not supported"); + VLOG_DBG_RL(&rl, "flow_put: recirc_id sharing not supported"); return EOPNOTSUPP; } diff --git a/tests/automake.mk b/tests/automake.mk index c8de3fe28d2..86e496a5b9f 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -186,7 +186,8 @@ SYSTEM_TESTSUITE_AT = \ SYSTEM_OFFLOADS_TESTSUITE_AT = \ tests/system-common-macros.at \ tests/system-offloads-traffic.at \ - tests/system-offloads-testsuite.at + tests/system-offloads-testsuite.at \ + tests/system-offloads-testsuite-macros.at SYSTEM_DPDK_TESTSUITE_AT = \ tests/system-common-macros.at \ diff --git a/tests/system-kmod-macros.at b/tests/system-kmod-macros.at index 11920e60b66..822a80618d6 100644 --- a/tests/system-kmod-macros.at +++ b/tests/system-kmod-macros.at @@ -224,3 +224,8 @@ m4_define([VSCTL_ADD_DATAPATH_TABLE], # or necessary for the userspace datapath as it is checking for a kernel # specific regression. m4_define([CHECK_L3L4_CONNTRACK_REASM]) + +# CHECK_NO_TC_OFFLOAD +# +# The kernel module tests do not use TC offload. 
+m4_define([CHECK_NO_TC_OFFLOAD]) diff --git a/tests/system-offloads-testsuite-macros.at b/tests/system-offloads-testsuite-macros.at new file mode 100644 index 00000000000..2129cf7f034 --- /dev/null +++ b/tests/system-offloads-testsuite-macros.at @@ -0,0 +1,36 @@ +AT_COPYRIGHT([Copyright (c) 2022 Red Hat, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at: + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.]) + +# The goal is to run as many as possible of the system-traffic tests with +# OVS tc offload enabled. We do this by overriding the +# OVS_TRAFFIC_VSWITCHD_START() with offloading enabled. +m4_define([OVS_TRAFFIC_VSWITCHD_START], + [AT_CHECK([modprobe openvswitch]) + on_exit 'modprobe -r openvswitch' + m4_foreach([mod], [[vport_geneve], [vport_gre], [vport_lisp], [vport_stt], [vport_vxlan]], + [modprobe -q mod || echo "Module mod not loaded." + on_exit 'modprobe -q -r mod' + ]) + on_exit 'ovs-dpctl del-dp ovs-system' + on_exit 'ovs-appctl dpctl/flush-conntrack' + _OVS_VSWITCHD_START([], [-- set Open_vSwitch . other_config:hw-offload=true $3]) + dnl Add bridges, ports, etc. + AT_CHECK([ovs-vsctl -- _ADD_BR([br0]) -- $1 m4_if([$2], [], [], [| uuidfilt])], [0], [$2]) +]) + +# Macro to exclude tests that will fail with TC offload enabled. 
+m4_define([CHECK_NO_TC_OFFLOAD], +[ + AT_SKIP_IF([:]) +]) diff --git a/tests/system-offloads-testsuite.at b/tests/system-offloads-testsuite.at index eb5d2d4b329..23637d4f522 100644 --- a/tests/system-offloads-testsuite.at +++ b/tests/system-offloads-testsuite.at @@ -23,3 +23,6 @@ m4_include([tests/system-common-macros.at]) m4_include([tests/system-kmod-macros.at]) m4_include([tests/system-offloads-traffic.at]) + +m4_include([tests/system-offloads-testsuite-macros.at]) +m4_include([tests/system-traffic.at]) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index b1b01380aff..5d15c4712bc 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -1199,6 +1199,7 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - mpls actions]) +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START([_ADD_BR([br1])]) ADD_NAMESPACES(at_ns0, at_ns1) @@ -1236,6 +1237,7 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - multiple mpls label pop]) +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START([_ADD_BR([br1])]) ADD_NAMESPACES(at_ns0, at_ns1) @@ -1638,6 +1640,7 @@ dnl br-underlay: with IP: 172.31.1.100 dnl ns0: connect to br-underlay, with IP: 10.1.1.1 AT_SETUP([datapath - truncate and output to gre tunnel by simulated packets]) OVS_CHECK_MIN_KERNEL(3, 10) +CHECK_NO_TC_OFFLOAD() AT_SKIP_IF([test $HAVE_NC = no]) OVS_TRAFFIC_VSWITCHD_START() @@ -1772,6 +1775,7 @@ AT_SETUP([datapath - truncate and output to gre tunnel]) AT_SKIP_IF([test $HAVE_NC = no]) OVS_CHECK_KERNEL_EXCL(3, 10, 4, 15) OVS_CHECK_GRE() +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() ADD_BR([br-underlay]) @@ -2919,6 +2923,7 @@ AT_CLEANUP AT_SETUP([conntrack - zones from other field, more tests]) CHECK_CONNTRACK() +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(at_ns0, at_ns1) @@ -3050,6 +3055,7 @@ AT_CLEANUP AT_SETUP([conntrack - multiple namespaces, internal ports]) CHECK_CONNTRACK() CHECK_CONNTRACK_LOCAL_STACK() +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START( 
[set-fail-mode br0 secure -- ]) @@ -3672,6 +3678,7 @@ AT_CLEANUP AT_SETUP([conntrack - IPv4 fragmentation + cvlan]) CHECK_CONNTRACK() +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START([set Open_vSwitch . other_config:vlan-limit=0]) OVS_CHECK_8021AD() @@ -4195,6 +4202,7 @@ AT_SETUP([conntrack - Fragmentation over vxlan]) OVS_CHECK_VXLAN() CHECK_CONNTRACK() CHECK_CONNTRACK_LOCAL_STACK() +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() ADD_BR([br-underlay]) @@ -4385,6 +4393,7 @@ AT_CLEANUP AT_SETUP([conntrack - zone-based timeout policy]) CHECK_CONNTRACK() CHECK_CONNTRACK_TIMEOUT() +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(at_ns0, at_ns1) @@ -4878,6 +4887,7 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([conntrack - FTP]) +CHECK_NO_TC_OFFLOAD() AT_SKIP_IF([test $HAVE_FTP = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_ALG() @@ -4987,6 +4997,7 @@ AT_SETUP([conntrack - FTP over IPv6]) AT_SKIP_IF([test $HAVE_FTP = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_ALG() +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(at_ns0, at_ns1) @@ -5042,6 +5053,7 @@ AT_SETUP([conntrack - IPv6 FTP Passive]) AT_SKIP_IF([test $HAVE_FTP = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_ALG() +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(at_ns0, at_ns1) @@ -5101,6 +5113,7 @@ AT_SETUP([conntrack - FTP with multiple expectations]) AT_SKIP_IF([test $HAVE_FTP = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_ALG() +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(at_ns0, at_ns1) @@ -5167,6 +5180,7 @@ AT_SETUP([conntrack - TFTP]) AT_SKIP_IF([test $HAVE_TFTP = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_ALG() +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(at_ns0, at_ns1) @@ -5802,6 +5816,7 @@ m4_define([CHECK_FTP_NAT], CHECK_CONNTRACK() CHECK_CONNTRACK_NAT() CHECK_CONNTRACK_ALG() + CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() @@ -6109,6 +6124,7 @@ AT_SKIP_IF([test $HAVE_FTP = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_NAT() 
CHECK_CONNTRACK_ALG() +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() @@ -6169,6 +6185,7 @@ AT_SKIP_IF([test $HAVE_FTP = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_NAT() CHECK_CONNTRACK_ALG() +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() @@ -6229,6 +6246,7 @@ AT_SKIP_IF([test $HAVE_FTP = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_NAT() CHECK_CONNTRACK_ALG() +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() @@ -6289,6 +6307,7 @@ AT_SKIP_IF([test $HAVE_FTP = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_NAT() CHECK_CONNTRACK_ALG() +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() @@ -6349,6 +6368,7 @@ AT_SKIP_IF([test $HAVE_FTP = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_NAT() CHECK_CONNTRACK_ALG() +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() @@ -6551,6 +6571,7 @@ AT_SKIP_IF([test $HAVE_FTP = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_NAT() CHECK_CONNTRACK_ALG() +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() @@ -6611,6 +6632,7 @@ AT_SKIP_IF([test $HAVE_FTP = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_NAT() CHECK_CONNTRACK_ALG() +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() @@ -6672,6 +6694,7 @@ AT_SKIP_IF([test $HAVE_FTP = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_NAT() CHECK_CONNTRACK_ALG() +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(at_ns0, at_ns1) @@ -6732,6 +6755,7 @@ AT_SKIP_IF([test $HAVE_TFTP = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_NAT() CHECK_CONNTRACK_ALG() +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() @@ -7106,6 +7130,7 @@ AT_SETUP([conntrack - Multiple ICMP traverse]) dnl This tracks sending ICMP packets via conntrack multiple times for the dnl same packet CHECK_CONNTRACK() +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() OVS_CHECK_CT_CLEAR() @@ -7148,6 +7173,7 @@ AT_CLEANUP AT_SETUP([conntrack - can match and clear ct_state from outside OVS]) CHECK_CONNTRACK_LOCAL_STACK() +CHECK_NO_TC_OFFLOAD() OVS_CHECK_TUNNEL_TSO() OVS_CHECK_GENEVE() @@ -7196,6 +7222,7 @@ AT_CLEANUP AT_BANNER([IGMP]) AT_SETUP([IGMP - flood under 
normal action]) +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(at_ns0, at_ns1) diff --git a/tests/system-userspace-macros.at b/tests/system-userspace-macros.at index b34a84775bf..610fa2e94ae 100644 --- a/tests/system-userspace-macros.at +++ b/tests/system-userspace-macros.at @@ -325,3 +325,8 @@ m4_define([CHECK_L3L4_CONNTRACK_REASM], [ AT_SKIP_IF([:]) ]) + +# CHECK_NO_TC_OFFLOAD +# +# Userspace tests do not use TC offload. +m4_define([CHECK_NO_TC_OFFLOAD]) From 3209287103efd422bbdc7ba8ad6b9572b9fbd17b Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 7 Feb 2023 15:04:17 +0100 Subject: [PATCH 156/833] test: Do not use MPLS implicit null label in test cases. TC flower does not allow the push of the implicit null labels (RFC3032). Avoid the use of such labels in the MPLS test cases. Signed-off-by: Eelco Chaudron Acked-by: Roi Dayan Reviewed-by: Simon Horman Tested-by: Simon Horman Signed-off-by: Ilya Maximets --- tests/system-traffic.at | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 5d15c4712bc..796adb4ca08 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -1199,7 +1199,6 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - mpls actions]) -CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START([_ADD_BR([br1])]) ADD_NAMESPACES(at_ns0, at_ns1) @@ -1216,8 +1215,8 @@ AT_CHECK([ovs-vsctl add-port br0 patch0]) AT_CHECK([ovs-vsctl add-port br1 patch1]) AT_DATA([flows.txt], [dnl -table=0,priority=100,dl_type=0x0800 actions=push_mpls:0x8847,set_mpls_label:3,resubmit(,1) -table=0,priority=100,dl_type=0x8847,mpls_label=3 actions=pop_mpls:0x0800,resubmit(,1) +table=0,priority=100,dl_type=0x0800 actions=push_mpls:0x8847,set_mpls_label:4,resubmit(,1) +table=0,priority=100,dl_type=0x8847,mpls_label=4 actions=pop_mpls:0x0800,resubmit(,1) table=0,priority=10 actions=resubmit(,1) table=1,priority=10 actions=normal ]) @@ -1237,7 +1236,6 @@ 
OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - multiple mpls label pop]) -CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START([_ADD_BR([br1])]) ADD_NAMESPACES(at_ns0, at_ns1) @@ -1254,10 +1252,10 @@ AT_CHECK([ovs-vsctl add-port br0 patch0]) AT_CHECK([ovs-vsctl add-port br1 patch1]) AT_DATA([flows.txt], [dnl -table=0,priority=100,dl_type=0x0800 actions=push_mpls:0x8847,set_mpls_label:3,push_mpls:0x8847,set_mpls_label:2,push_mpls:0x8847,set_mpls_label:1,resubmit(,3) +table=0,priority=100,dl_type=0x0800 actions=push_mpls:0x8847,set_mpls_label:4,push_mpls:0x8847,set_mpls_label:2,push_mpls:0x8847,set_mpls_label:1,resubmit(,3) table=0,priority=100,dl_type=0x8847,mpls_label=1 actions=pop_mpls:0x8847,resubmit(,1) table=1,priority=100,dl_type=0x8847,mpls_label=2 actions=pop_mpls:0x8847,resubmit(,2) -table=2,priority=100,dl_type=0x8847,mpls_label=3 actions=pop_mpls:0x0800,resubmit(,3) +table=2,priority=100,dl_type=0x8847,mpls_label=4 actions=pop_mpls:0x0800,resubmit(,3) table=0,priority=10 actions=resubmit(,3) table=3,priority=10 actions=normal ]) From 564d09ef53ebecca66f0f932a82e2acad9ed0567 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 7 Feb 2023 15:04:42 +0100 Subject: [PATCH 157/833] netdev-offload-tc: Fix tc conntrack force commit support. tc was not setting the OVS_CT_ATTR_FORCE_COMMIT flag when a forced commit was requested. This patch will fix this. 
Fixes: 576126a931cd ("netdev-offload-tc: Add conntrack support") Signed-off-by: Eelco Chaudron Acked-by: Roi Dayan Reviewed-by: Simon Horman Tested-by: Simon Horman Signed-off-by: Ilya Maximets --- lib/netdev-offload-tc.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c index a13f2fe6bb9..5a2b2665104 100644 --- a/lib/netdev-offload-tc.c +++ b/lib/netdev-offload-tc.c @@ -888,7 +888,11 @@ parse_tc_flower_to_actions__(struct tc_flower *flower, struct ofpbuf *buf, ct_offset = nl_msg_start_nested(buf, OVS_ACTION_ATTR_CT); if (action->ct.commit) { - nl_msg_put_flag(buf, OVS_CT_ATTR_COMMIT); + if (action->ct.force) { + nl_msg_put_flag(buf, OVS_CT_ATTR_FORCE_COMMIT); + } else { + nl_msg_put_flag(buf, OVS_CT_ATTR_COMMIT); + } } if (action->ct.zone) { @@ -1377,7 +1381,12 @@ parse_put_flow_ct_action(struct tc_flower *flower, NL_ATTR_FOR_EACH_UNSAFE (ct_attr, ct_left, ct, ct_len) { switch (nl_attr_type(ct_attr)) { case OVS_CT_ATTR_COMMIT: { - action->ct.commit = true; + action->ct.commit = true; + } + break; + case OVS_CT_ATTR_FORCE_COMMIT: { + action->ct.commit = true; + action->ct.force = true; } break; case OVS_CT_ATTR_ZONE: { From 7a176f9636932bf7339fd8a3dbe194f28de871d3 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 7 Feb 2023 15:05:07 +0100 Subject: [PATCH 158/833] test: Flush datapath when changing rules on the fly. Flush datapath flows as TC flows take some more time to be flushed out. The flush speeds this up. 
Signed-off-by: Eelco Chaudron Acked-by: Roi Dayan Reviewed-by: Simon Horman Tested-by: Simon Horman Signed-off-by: Ilya Maximets --- tests/system-traffic.at | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 796adb4ca08..8ac35489a30 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -2911,6 +2911,9 @@ AT_CHECK([ovs-appctl dpctl/dump-flows --names filter=in_port=ovs-p0 dnl AT_CHECK([ovs-ofctl mod-flows br0 dnl 'priority=100,ct_state=-trk,tcp,in_port="ovs-p0" actions=ct(table=0,zone=15)']) +dnl Wait for a flow flush as some datapaths (read TC) might take time to clear. +AT_CHECK([ovs-appctl revalidator/wait], [0]) + NS_CHECK_EXEC([at_ns0], [wget 10.1.1.2 -t 3 -T 1 --retry-connrefused -v -o wget0.log]) AT_CHECK([ovs-appctl dpctl/dump-flows --names filter=in_port=ovs-p0 dnl @@ -2921,7 +2924,6 @@ AT_CLEANUP AT_SETUP([conntrack - zones from other field, more tests]) CHECK_CONNTRACK() -CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(at_ns0, at_ns1) @@ -2960,6 +2962,9 @@ AT_CHECK([ovs-appctl dpctl/dump-flows --names filter=in_port=ovs-p0 dnl AT_CHECK([ovs-ofctl mod-flows br0 'priority=100,ct_state=-trk,tcp,in_port="ovs-p0" actions=ct(table=0,zone=15,commit,exec(load:0xffff000f->NXM_NX_CT_LABEL[[0..31]]))']) +dnl Wait for a flow flush as some datapaths (read TC) might take time to clear. +AT_CHECK([ovs-appctl revalidator/wait], [0]) + NS_CHECK_EXEC([at_ns0], [wget 10.1.1.2 -t 3 -T 1 --retry-connrefused -v -o wget0.log]) AT_CHECK([ovs-appctl dpctl/dump-flows --names filter=in_port=ovs-p0 dnl From b292cce2ff5336524874b80da26b60c653984ca4 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 7 Feb 2023 15:05:25 +0100 Subject: [PATCH 159/833] netdev-offload-tc: Conntrack ALGs are not supported with tc. tc does not support conntrack ALGs. Even worse, with tc enabled, they should not be used/configured at all. 
This is because even though TC will ignore the rules with ALG configured, i.e., they will flow through the kernel module, return traffic might flow through a tc conntrack rule, and it will not invoke the ALG helper. Fixes: 576126a931cd ("netdev-offload-tc: Add conntrack support") Signed-off-by: Eelco Chaudron Acked-by: Roi Dayan Reviewed-by: Simon Horman Tested-by: Simon Horman Signed-off-by: Ilya Maximets --- Documentation/howto/tc-offload.rst | 11 +++++++++++ lib/netdev-offload-tc.c | 4 ++++ tests/system-offloads-testsuite-macros.at | 6 ++++++ tests/system-traffic.at | 15 --------------- 4 files changed, 21 insertions(+), 15 deletions(-) diff --git a/Documentation/howto/tc-offload.rst b/Documentation/howto/tc-offload.rst index f6482c8aff0..681dff13e08 100644 --- a/Documentation/howto/tc-offload.rst +++ b/Documentation/howto/tc-offload.rst @@ -112,3 +112,14 @@ First flow packet not processed by meter Packets that are received by ovs-vswitchd through an upcall before the actual meter flow is installed, are not passing TC police action and therefore are not considered for policing. + +Conntrack Application Layer Gateways (ALG) +++++++++++++++++++++++++++++++++++++++++++ + +TC does not support conntrack helpers, i.e., ALGs. TC will not offload flows if +the ALG keyword is present within the ct() action. However, this will not allow +ALGs to work within the datapath, as the return traffic without the ALG keyword +might run through a TC rule, which internally will not call the conntrack +helper required. + +So if ALG support is required, tc offload must be disabled. diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c index 5a2b2665104..4fb9d9f2127 100644 --- a/lib/netdev-offload-tc.c +++ b/lib/netdev-offload-tc.c @@ -1426,6 +1426,10 @@ parse_put_flow_ct_action(struct tc_flower *flower, get_32aligned_u128(&ct_label->mask); } break; + /* The following option we do not support in tc-ct, and should + * not be ignored for proper operation. 
*/ + case OVS_CT_ATTR_HELPER: + return EOPNOTSUPP; } } diff --git a/tests/system-offloads-testsuite-macros.at b/tests/system-offloads-testsuite-macros.at index 2129cf7f034..5d7044f4263 100644 --- a/tests/system-offloads-testsuite-macros.at +++ b/tests/system-offloads-testsuite-macros.at @@ -34,3 +34,9 @@ m4_define([CHECK_NO_TC_OFFLOAD], [ AT_SKIP_IF([:]) ]) + +# Conntrack ALGs are not supported for tc. +m4_define([CHECK_CONNTRACK_ALG], +[ + AT_SKIP_IF([:]) +]) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 8ac35489a30..1ea180be63d 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -4890,7 +4890,6 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([conntrack - FTP]) -CHECK_NO_TC_OFFLOAD() AT_SKIP_IF([test $HAVE_FTP = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_ALG() @@ -5000,7 +4999,6 @@ AT_SETUP([conntrack - FTP over IPv6]) AT_SKIP_IF([test $HAVE_FTP = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_ALG() -CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(at_ns0, at_ns1) @@ -5056,7 +5054,6 @@ AT_SETUP([conntrack - IPv6 FTP Passive]) AT_SKIP_IF([test $HAVE_FTP = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_ALG() -CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(at_ns0, at_ns1) @@ -5116,7 +5113,6 @@ AT_SETUP([conntrack - FTP with multiple expectations]) AT_SKIP_IF([test $HAVE_FTP = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_ALG() -CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(at_ns0, at_ns1) @@ -5183,7 +5179,6 @@ AT_SETUP([conntrack - TFTP]) AT_SKIP_IF([test $HAVE_TFTP = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_ALG() -CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(at_ns0, at_ns1) @@ -5819,7 +5814,6 @@ m4_define([CHECK_FTP_NAT], CHECK_CONNTRACK() CHECK_CONNTRACK_NAT() CHECK_CONNTRACK_ALG() - CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() @@ -6127,7 +6121,6 @@ AT_SKIP_IF([test $HAVE_FTP = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_NAT() CHECK_CONNTRACK_ALG() -CHECK_NO_TC_OFFLOAD() 
OVS_TRAFFIC_VSWITCHD_START() @@ -6188,7 +6181,6 @@ AT_SKIP_IF([test $HAVE_FTP = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_NAT() CHECK_CONNTRACK_ALG() -CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() @@ -6249,7 +6241,6 @@ AT_SKIP_IF([test $HAVE_FTP = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_NAT() CHECK_CONNTRACK_ALG() -CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() @@ -6310,7 +6301,6 @@ AT_SKIP_IF([test $HAVE_FTP = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_NAT() CHECK_CONNTRACK_ALG() -CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() @@ -6371,7 +6361,6 @@ AT_SKIP_IF([test $HAVE_FTP = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_NAT() CHECK_CONNTRACK_ALG() -CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() @@ -6574,7 +6563,6 @@ AT_SKIP_IF([test $HAVE_FTP = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_NAT() CHECK_CONNTRACK_ALG() -CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() @@ -6635,7 +6623,6 @@ AT_SKIP_IF([test $HAVE_FTP = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_NAT() CHECK_CONNTRACK_ALG() -CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() @@ -6697,7 +6684,6 @@ AT_SKIP_IF([test $HAVE_FTP = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_NAT() CHECK_CONNTRACK_ALG() -CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(at_ns0, at_ns1) @@ -6758,7 +6744,6 @@ AT_SKIP_IF([test $HAVE_TFTP = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_NAT() CHECK_CONNTRACK_ALG() -CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() From 3655ddb4f5ecb325bb0a332e56afa07a28449de3 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 7 Feb 2023 15:05:46 +0100 Subject: [PATCH 160/833] test: Tc does not support conntrack timeout, skip the related test. The tc conntrack implementation does not support the timeout option. The current implementation is silently ignoring the timeout option by adding a general conntrack entry. This patch will skip the related test by overriding the support macro. 
Signed-off-by: Eelco Chaudron Acked-by: Roi Dayan Reviewed-by: Simon Horman Tested-by: Simon Horman Signed-off-by: Ilya Maximets --- tests/system-offloads-testsuite-macros.at | 6 ++++++ tests/system-traffic.at | 1 - 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/system-offloads-testsuite-macros.at b/tests/system-offloads-testsuite-macros.at index 5d7044f4263..322166b8c4d 100644 --- a/tests/system-offloads-testsuite-macros.at +++ b/tests/system-offloads-testsuite-macros.at @@ -40,3 +40,9 @@ m4_define([CHECK_CONNTRACK_ALG], [ AT_SKIP_IF([:]) ]) + +# Conntrack timeout not supported for tc. +m4_define([CHECK_CONNTRACK_TIMEOUT], +[ + AT_SKIP_IF([:]) +]) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 1ea180be63d..b5705498556 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -4396,7 +4396,6 @@ AT_CLEANUP AT_SETUP([conntrack - zone-based timeout policy]) CHECK_CONNTRACK() CHECK_CONNTRACK_TIMEOUT() -CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(at_ns0, at_ns1) From 25b6f5585ba04e92560ef7f5cd37826caf54cc68 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 7 Feb 2023 15:05:57 +0100 Subject: [PATCH 161/833] test: Fix 'conntrack - Multiple ICMP traverse' for tc case. tc does not include ethernet header length in packet byte count. This fix will allow the packets that go through tc to be 14 bytes less. This difference in the TC implementation is already described in tc-offload.rst.
Signed-off-by: Eelco Chaudron Acked-by: Roi Dayan Reviewed-by: Simon Horman Tested-by: Simon Horman Signed-off-by: Ilya Maximets --- tests/system-traffic.at | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index b5705498556..4f3e767896d 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -7117,7 +7117,6 @@ AT_SETUP([conntrack - Multiple ICMP traverse]) dnl This tracks sending ICMP packets via conntrack multiple times for the dnl same packet CHECK_CONNTRACK() -CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() OVS_CHECK_CT_CLEAR() @@ -7149,7 +7148,7 @@ AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1)], [0], [dnl icmp,orig=(src=10.1.1.1,dst=10.1.1.2,id=,type=8,code=0),reply=(src=10.1.1.2,dst=10.1.1.1,id=,type=0,code=0) ]) -AT_CHECK([ovs-ofctl dump-flows br0 | grep table=2, | OFPROTO_CLEAR_DURATION_IDLE], +AT_CHECK([ovs-ofctl dump-flows br0 | grep table=2, | OFPROTO_CLEAR_DURATION_IDLE | sed 's/n_bytes=70,/n_bytes=84,/'], [0], [dnl cookie=0x0, duration=, table=2, n_packets=2, n_bytes=84, idle_age=, priority=10,ct_state=+new+trk,in_port=1 actions=drop cookie=0x0, duration=, table=2, n_packets=0, n_bytes=0, idle_age=, priority=10,ct_state=+est+trk actions=drop From d57299fc6cb3fc802faaa56e41514f28b87477de Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 7 Feb 2023 15:06:07 +0100 Subject: [PATCH 162/833] odp-util: Make odp_flow_key_from_flow__ nlattr order the same as the kernel. Make the order of the Netlink attributes for odp_flow_key_from_flow__() the same as the kernel will return them. This will make sure the attributes displayed in the dpctl/dump-flows output appear in the same order for all datapath. 
Signed-off-by: Eelco Chaudron Acked-by: Roi Dayan Reviewed-by: Simon Horman Tested-by: Simon Horman Signed-off-by: Ilya Maximets --- lib/odp-util.c | 21 ++++----- tests/dpif-netdev.at | 28 ++++++------ tests/mcast-snooping.at | 4 +- tests/nsh.at | 10 ++--- tests/odp.at | 84 +++++++++++++++++------------------ tests/ofproto-dpif.at | 30 ++++++------- tests/packet-type-aware.at | 22 ++++----- tests/pmd.at | 2 +- tests/system-traffic.at | 1 - tests/tunnel-push-pop-ipv6.at | 2 +- tests/tunnel-push-pop.at | 2 +- tests/tunnel.at | 2 +- 12 files changed, 102 insertions(+), 106 deletions(-) diff --git a/lib/odp-util.c b/lib/odp-util.c index 5fc312f8c00..dbd4554d062 100644 --- a/lib/odp-util.c +++ b/lib/odp-util.c @@ -6204,6 +6204,11 @@ odp_flow_key_from_flow__(const struct odp_flow_key_parms *parms, const struct flow *mask = parms->mask; const struct flow *data = export_mask ? mask : flow; + if (parms->support.recirc) { + nl_msg_put_u32(buf, OVS_KEY_ATTR_RECIRC_ID, data->recirc_id); + nl_msg_put_u32(buf, OVS_KEY_ATTR_DP_HASH, data->dp_hash); + } + nl_msg_put_u32(buf, OVS_KEY_ATTR_PRIORITY, data->skb_priority); if (flow_tnl_dst_is_set(&flow->tunnel) || @@ -6212,6 +6217,12 @@ odp_flow_key_from_flow__(const struct odp_flow_key_parms *parms, parms->key_buf, NULL); } + /* Add an ingress port attribute if this is a mask or 'in_port.odp_port' + * is not the magical value "ODPP_NONE". 
*/ + if (export_mask || flow->in_port.odp_port != ODPP_NONE) { + nl_msg_put_odp_port(buf, OVS_KEY_ATTR_IN_PORT, data->in_port.odp_port); + } + nl_msg_put_u32(buf, OVS_KEY_ATTR_SKB_MARK, data->pkt_mark); if (parms->support.ct_state) { @@ -6255,16 +6266,6 @@ odp_flow_key_from_flow__(const struct odp_flow_key_parms *parms, ct->ipv6_proto = data->ct_nw_proto; } } - if (parms->support.recirc) { - nl_msg_put_u32(buf, OVS_KEY_ATTR_RECIRC_ID, data->recirc_id); - nl_msg_put_u32(buf, OVS_KEY_ATTR_DP_HASH, data->dp_hash); - } - - /* Add an ingress port attribute if this is a mask or 'in_port.odp_port' - * is not the magical value "ODPP_NONE". */ - if (export_mask || flow->in_port.odp_port != ODPP_NONE) { - nl_msg_put_odp_port(buf, OVS_KEY_ATTR_IN_PORT, data->in_port.odp_port); - } nl_msg_put_be32(buf, OVS_KEY_ATTR_PACKET_TYPE, data->packet_type); diff --git a/tests/dpif-netdev.at b/tests/dpif-netdev.at index 9af70a68d75..baab60a2221 100644 --- a/tests/dpif-netdev.at +++ b/tests/dpif-netdev.at @@ -72,13 +72,13 @@ ovs-appctl time/warp 5000 AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:01,dst=50:54:00:00:02:00),eth_type(0x0800),ipv4(src=10.0.0.1,dst=10.0.0.2,proto=6,tos=0,ttl=64,frag=no),tcp(src=8,dst=9),tcp_flags(ack)']) OVS_WAIT_UNTIL([grep "miss upcall" ovs-vswitchd.log]) AT_CHECK([grep -A 1 'miss upcall' ovs-vswitchd.log | tail -n 1], [0], [dnl -skb_priority(0),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:01,dst=50:54:00:00:02:00),eth_type(0x0800),ipv4(src=10.0.0.1,dst=10.0.0.2,proto=6,tos=0,ttl=64,frag=no),tcp(src=8,dst=9),tcp_flags(ack) +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:01,dst=50:54:00:00:02:00),eth_type(0x0800),ipv4(src=10.0.0.1,dst=10.0.0.2,proto=6,tos=0,ttl=64,frag=no),tcp(src=8,dst=9),tcp_flags(ack) ]) AT_CHECK([ovs-appctl 
netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:05,dst=50:54:00:00:06:00),eth_type(0x0800),ipv4(src=10.0.0.5,dst=10.0.0.6,proto=6,tos=0,ttl=64,frag=no),tcp(src=8,dst=9),tcp_flags(ack)' --len 1024]) OVS_WAIT_UNTIL([test `grep -c "miss upcall" ovs-vswitchd.log` -ge 2]) AT_CHECK([grep -A 1 'miss upcall' ovs-vswitchd.log | tail -n 1], [0], [dnl -skb_priority(0),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:05,dst=50:54:00:00:06:00),eth_type(0x0800),ipv4(src=10.0.0.5,dst=10.0.0.6,proto=6,tos=0,ttl=64,frag=no),tcp(src=8,dst=9),tcp_flags(ack) +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:05,dst=50:54:00:00:06:00),eth_type(0x0800),ipv4(src=10.0.0.5,dst=10.0.0.6,proto=6,tos=0,ttl=64,frag=no),tcp(src=8,dst=9),tcp_flags(ack) ]) OVS_VSWITCHD_STOP AT_CLEANUP @@ -139,7 +139,7 @@ m4_define([DPIF_NETDEV_MISS_FLOW_INSTALL], OVS_WAIT_UNTIL([grep "miss upcall" ovs-vswitchd.log]) AT_CHECK([grep -A 1 'miss upcall' ovs-vswitchd.log | tail -n 1], [0], [dnl -skb_priority(0),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0) +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0) ]) AT_CHECK([filter_flow_install < ovs-vswitchd.log | strip_xout], [0], [dnl recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(frag=no), actions: @@ -152,11 +152,11 @@ 
recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50: OVS_WAIT_UNTIL([test `grep -c "miss upcall" ovs-vswitchd.log` -ge 2]) AT_CHECK([grep -A 1 'miss upcall' ovs-vswitchd.log | tail -n 1], [0], [dnl -skb_priority(0),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0) +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0) ]) AT_CHECK([filter_flow_install < ovs-vswitchd.log | strip_xout], [0], [dnl +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(-new-est-rel-rpl-inv-trk-snat-dnat),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0), actions: recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(frag=no), actions: -skb_priority(0),skb_mark(0),ct_state(-new-est-rel-rpl-inv-trk-snat-dnat),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0), actions: ]) OVS_VSWITCHD_STOP @@ -187,7 +187,7 @@ m4_define([DPIF_NETDEV_FLOW_PUT_MODIFY], OVS_WAIT_UNTIL([grep "miss upcall" ovs-vswitchd.log]) AT_CHECK([grep -A 1 'miss upcall' ovs-vswitchd.log | tail -n 1], [0], [dnl 
-skb_priority(0),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=00:06:07:08:09:0a,dst=00:01:02:03:04:05),eth_type(0x8100),vlan(vid=1000,pcp=5),encap(eth_type(0x0800),ipv4(src=127.0.0.1,dst=127.0.0.1,proto=0,tos=0,ttl=64,frag=no)) +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=00:06:07:08:09:0a,dst=00:01:02:03:04:05),eth_type(0x8100),vlan(vid=1000,pcp=5),encap(eth_type(0x0800),ipv4(src=127.0.0.1,dst=127.0.0.1,proto=0,tos=0,ttl=64,frag=no)) ]) ovs-appctl revalidator/wait # Dump the datapath flow to see that it goes to p2 ("actions:2"). @@ -236,11 +236,11 @@ m4_define([DPIF_NETDEV_MISS_FLOW_DUMP], OVS_WAIT_UNTIL([grep "miss upcall" ovs-vswitchd.log]) AT_CHECK([grep -A 1 'miss upcall' ovs-vswitchd.log | tail -n 1], [0], [dnl -skb_priority(0),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0) +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0) ]) ovs-appctl revalidator/wait AT_CHECK([filter_flow_dump < ovs-vswitchd.log | strip_xout], [0], [dnl -skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),recirc_id(0),dp_hash(0/0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2/0.0.0.0,dst=10.0.0.1/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no),icmp(type=8/0,code=0/0), packets:0, bytes:0, used:never, actions: 
+recirc_id(0),dp_hash(0/0),skb_priority(0/0),in_port(1),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2/0.0.0.0,dst=10.0.0.1/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no),icmp(type=8/0,code=0/0), packets:0, bytes:0, used:never, actions: ]) # Now, the same again without megaflows. @@ -252,12 +252,12 @@ skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label OVS_WAIT_UNTIL([test `grep -c "miss upcall" ovs-vswitchd.log` -ge 2]) AT_CHECK([grep -A 1 'miss upcall' ovs-vswitchd.log | tail -n 1], [0], [dnl -skb_priority(0),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0) +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0) ]) ovs-appctl revalidator/wait AT_CHECK([filter_flow_dump < ovs-vswitchd.log | strip_xout], [0], [dnl -skb_priority(0),skb_mark(0),ct_state(0/0xff),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0), packets:0, bytes:0, used:never, actions: -skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),recirc_id(0),dp_hash(0/0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2/0.0.0.0,dst=10.0.0.1/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no),icmp(type=8/0,code=0/0), packets:0, bytes:0, used:never, 
actions: +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(0/0xff),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0), packets:0, bytes:0, used:never, actions: +recirc_id(0),dp_hash(0/0),skb_priority(0/0),in_port(1),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2/0.0.0.0,dst=10.0.0.1/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no),icmp(type=8/0,code=0/0), packets:0, bytes:0, used:never, actions: ]) OVS_VSWITCHD_STOP @@ -423,7 +423,7 @@ m4_define([DPIF_NETDEV_FLOW_HW_OFFLOAD], OVS_WAIT_UNTIL([grep "miss upcall" ovs-vswitchd.log]) AT_CHECK([grep -A 1 'miss upcall' ovs-vswitchd.log | tail -n 1], [0], [dnl -skb_priority(0),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=00:06:07:08:09:0a,dst=00:01:02:03:04:05),eth_type(0x0800),ipv4(src=127.0.0.1,dst=127.0.0.1,proto=0,tos=0,ttl=64,frag=no) +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=00:06:07:08:09:0a,dst=00:01:02:03:04:05),eth_type(0x0800),ipv4(src=127.0.0.1,dst=127.0.0.1,proto=0,tos=0,ttl=64,frag=no) ]) # Check that flow successfully offloaded. 
OVS_WAIT_UNTIL([grep "succeed to add netdev flow" ovs-vswitchd.log]) @@ -489,7 +489,7 @@ m4_define([DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS], OVS_WAIT_UNTIL([grep "miss upcall" ovs-vswitchd.log]) AT_CHECK([grep -A 1 'miss upcall' ovs-vswitchd.log | tail -n 1], [0], [dnl -skb_priority(0),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),dnl +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),dnl packet_type(ns=0,id=0),eth(src=00:06:07:08:09:0a,dst=00:01:02:03:04:05),eth_type(0x8100),vlan(vid=99,pcp=7),encap(eth_type(0x0800),ipv4(src=127.0.0.1,dst=127.0.0.1,proto=17,tos=0,ttl=64,frag=no),udp(src=81,dst=82)) ]) # Check that flow successfully offloaded. @@ -566,7 +566,7 @@ m4_define([DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS_VID_ARP], OVS_WAIT_UNTIL([grep "miss upcall" ovs-vswitchd.log]) AT_CHECK([grep -A 1 'miss upcall' ovs-vswitchd.log | tail -n 1], [0], [dnl -skb_priority(0),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),dnl +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),dnl packet_type(ns=0,id=0),eth(src=00:06:07:08:09:0a,dst=00:01:02:03:04:05),eth_type(0x8100),vlan(vid=99,pcp=7),encap(eth_type(0x0806),arp(sip=127.0.0.1,tip=127.0.0.1,op=1,sha=00:0b:0c:0d:0e:0f,tha=00:00:00:00:00:00)) ]) # Check that flow successfully offloaded. 
diff --git a/tests/mcast-snooping.at b/tests/mcast-snooping.at index fe475e7b38c..d5b7c4774c7 100644 --- a/tests/mcast-snooping.at +++ b/tests/mcast-snooping.at @@ -277,9 +277,9 @@ AT_CHECK([ovs-appctl dpctl/dump-flows | grep -e .*ipv4 | sort | dnl sed 's/pid=[[0-9]]*,// s/,packet_type(ns=[[0-9]]*,id=[[0-9]]*),/,/'], [0], [dnl -ct_state(+new-inv+trk),recirc_id(),in_port(1),eth_type(0x0800),ipv4(proto=1,frag=no), packets:0, bytes:0, used:never, actions:2 -ct_state(+new-inv+trk),recirc_id(),in_port(1),eth_type(0x0800),ipv4(proto=2,frag=no), packets:0, bytes:0, used:never, actions:userspace(controller(reason=1,dont_send=0,continuation=0,recirc_id=,rule_cookie=0,controller_id=0,max_len=65535)) recirc_id(),in_port(1),eth_type(0x0800),ipv4(frag=no), packets:0, bytes:0, used:0.0s, actions:ct(zone=64000),recirc() +recirc_id(),in_port(1),ct_state(+new-inv+trk),eth_type(0x0800),ipv4(proto=1,frag=no), packets:0, bytes:0, used:never, actions:2 +recirc_id(),in_port(1),ct_state(+new-inv+trk),eth_type(0x0800),ipv4(proto=2,frag=no), packets:0, bytes:0, used:never, actions:userspace(controller(reason=1,dont_send=0,continuation=0,recirc_id=,rule_cookie=0,controller_id=0,max_len=65535)) ]) AT_CLEANUP diff --git a/tests/nsh.at b/tests/nsh.at index 6b7b6856f26..55296e5593a 100644 --- a/tests/nsh.at +++ b/tests/nsh.at @@ -725,8 +725,8 @@ AT_CHECK([ ovs-appctl dpctl/dump-flows dummy@ovs-dummy | strip_used | grep -v ipv6 | sort ], [0], [flow-dump from the main thread: recirc_id(0),in_port(4),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.10.30,frag=no), packets:1, bytes:98, used:0.0s, actions:pop_eth,push_nsh(flags=0,ttl=63,mdtype=1,np=1,spi=0x3000,si=255,c1=0x0,c2=0x0,c3=0x0,c4=0x0),tnl_push(tnl_port(4789),header(size=50,type=4,eth(dst=aa:55:00:00:00:03,src=aa:55:00:00:00:01,dl_type=0x0800),ipv4(src=10.0.0.1,dst=10.0.0.3,proto=17,tos=0,ttl=64,frag=0x4000),udp(src=0,dst=4789,csum=0x0),vxlan(flags=0xc000004,vni=0x0)),out_port(1)),set(ipv4(src=30.0.0.1,dst=30.0.0.3)),tnl_pop(4789) 
-tunnel(tun_id=0x0,src=30.0.0.1,dst=30.0.0.3,flags(-df-csum+key)),recirc_id(0),in_port(4789),packet_type(ns=1,id=0x894f),eth_type(0x894f),nsh(np=1,spi=0x3000,si=255), packets:1, bytes:108, used:0.0s, actions:pop_nsh(),recirc(0x1) -tunnel(tun_id=0x0,src=30.0.0.1,dst=30.0.0.3,flags(-df-csum+key)),recirc_id(0x1),in_port(4789),packet_type(ns=1,id=0x800),eth_type(0x0800),ipv4(frag=no), packets:1, bytes:84, used:0.0s, actions:push_eth(src=00:00:00:00:00:00,dst=aa:55:aa:55:00:03),6 +recirc_id(0),tunnel(tun_id=0x0,src=30.0.0.1,dst=30.0.0.3,flags(-df-csum+key)),in_port(4789),packet_type(ns=1,id=0x894f),eth_type(0x894f),nsh(np=1,spi=0x3000,si=255), packets:1, bytes:108, used:0.0s, actions:pop_nsh(),recirc(0x1) +recirc_id(0x1),tunnel(tun_id=0x0,src=30.0.0.1,dst=30.0.0.3,flags(-df-csum+key)),in_port(4789),packet_type(ns=1,id=0x800),eth_type(0x0800),ipv4(frag=no), packets:1, bytes:84, used:0.0s, actions:push_eth(src=00:00:00:00:00:00,dst=aa:55:aa:55:00:03),6 ]) AT_CHECK([ @@ -779,9 +779,9 @@ AT_CHECK([ ovs-appctl dpctl/dump-flows dummy@ovs-dummy | strip_used | grep -v ipv6 | sort ], [0], [flow-dump from the main thread: recirc_id(0),in_port(4),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.10.20/255.255.255.248,frag=no), packets:1, bytes:98, used:0.0s, actions:pop_eth,push_nsh(flags=0,ttl=63,mdtype=1,np=1,spi=0x3020,si=255,c1=0x0,c2=0x0,c3=0x0,c4=0x0),tnl_push(tnl_port(4789),header(size=50,type=4,eth(dst=aa:55:00:00:00:02,src=aa:55:00:00:00:01,dl_type=0x0800),ipv4(src=10.0.0.1,dst=10.0.0.2,proto=17,tos=0,ttl=64,frag=0x4000),udp(src=0,dst=4789,csum=0x0),vxlan(flags=0xc000004,vni=0x0)),out_port(1)),set(ipv4(src=20.0.0.1,dst=20.0.0.2)),tnl_pop(4789) -tunnel(tun_id=0x0,src=20.0.0.1,dst=20.0.0.2,flags(-df-csum+key)),recirc_id(0),in_port(4789),packet_type(ns=1,id=0x894f),eth_type(0x894f),nsh(spi=0x3020,si=255), packets:1, bytes:108, used:0.0s, 
actions:push_eth(src=00:00:00:00:00:00,dst=11:22:33:44:55:66),set(nsh(spi=0x3020,si=254)),pop_eth,tnl_push(tnl_port(4789),header(size=50,type=4,eth(dst=aa:55:00:00:00:03,src=aa:55:00:00:00:02,dl_type=0x0800),ipv4(src=20.0.0.2,dst=20.0.0.3,proto=17,tos=0,ttl=64,frag=0x4000),udp(src=0,dst=4789,csum=0x0),vxlan(flags=0xc000004,vni=0x0)),out_port(2)),set(ipv4(src=30.0.0.2,dst=30.0.0.3)),tnl_pop(4789) -tunnel(tun_id=0x0,src=30.0.0.2,dst=30.0.0.3,flags(-df-csum+key)),recirc_id(0),in_port(4789),packet_type(ns=1,id=0x894f),eth_type(0x894f),nsh(np=1,spi=0x3020,si=254), packets:1, bytes:108, used:0.0s, actions:pop_nsh(),recirc(0x2) -tunnel(tun_id=0x0,src=30.0.0.2,dst=30.0.0.3,flags(-df-csum+key)),recirc_id(0x2),in_port(4789),packet_type(ns=1,id=0x800),eth_type(0x0800),ipv4(frag=no), packets:1, bytes:84, used:0.0s, actions:push_eth(src=00:00:00:00:00:00,dst=aa:55:aa:55:00:03),6 +recirc_id(0),tunnel(tun_id=0x0,src=20.0.0.1,dst=20.0.0.2,flags(-df-csum+key)),in_port(4789),packet_type(ns=1,id=0x894f),eth_type(0x894f),nsh(spi=0x3020,si=255), packets:1, bytes:108, used:0.0s, actions:push_eth(src=00:00:00:00:00:00,dst=11:22:33:44:55:66),set(nsh(spi=0x3020,si=254)),pop_eth,tnl_push(tnl_port(4789),header(size=50,type=4,eth(dst=aa:55:00:00:00:03,src=aa:55:00:00:00:02,dl_type=0x0800),ipv4(src=20.0.0.2,dst=20.0.0.3,proto=17,tos=0,ttl=64,frag=0x4000),udp(src=0,dst=4789,csum=0x0),vxlan(flags=0xc000004,vni=0x0)),out_port(2)),set(ipv4(src=30.0.0.2,dst=30.0.0.3)),tnl_pop(4789) +recirc_id(0),tunnel(tun_id=0x0,src=30.0.0.2,dst=30.0.0.3,flags(-df-csum+key)),in_port(4789),packet_type(ns=1,id=0x894f),eth_type(0x894f),nsh(np=1,spi=0x3020,si=254), packets:1, bytes:108, used:0.0s, actions:pop_nsh(),recirc(0x2) +recirc_id(0x2),tunnel(tun_id=0x0,src=30.0.0.2,dst=30.0.0.3,flags(-df-csum+key)),in_port(4789),packet_type(ns=1,id=0x800),eth_type(0x0800),ipv4(frag=no), packets:1, bytes:84, used:0.0s, actions:push_eth(src=00:00:00:00:00:00,dst=aa:55:aa:55:00:03),6 ]) AT_CHECK([ diff --git a/tests/odp.at 
b/tests/odp.at index 41eb726e922..26cda296723 100644 --- a/tests/odp.at +++ b/tests/odp.at @@ -3,92 +3,87 @@ AT_BANNER([datapath parsing and formatting]) AT_SETUP([OVS datapath key parsing and formatting - valid forms]) dnl We could add a test for invalid forms, but that's less important. AT_DATA([odp-base.txt], [dnl -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x1234) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0800),ipv4(src=35.8.2.41,dst=172.16.0.20,proto=5,tos=0x80,ttl=128,frag=no) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0800),ipv4(src=35.8.2.41,dst=172.16.0.20,proto=5,tos=0x81,ttl=128,frag=no) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0800),ipv4(src=35.8.2.41,dst=172.16.0.20,proto=5,tos=0x80,ttl=128,frag=first) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0800),ipv4(src=35.8.2.41,dst=172.16.0.20,proto=5,tos=0x80,ttl=128,frag=later) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0800),ipv4(src=35.8.2.41,dst=172.16.0.20,proto=6,tos=0,ttl=128,frag=no),tcp(src=80,dst=8080) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0800),ipv4(src=35.8.2.41,dst=172.16.0.20,proto=17,tos=0,ttl=128,frag=no),udp(src=81,dst=6632) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0800),ipv4(src=35.8.2.41,dst=172.16.0.20,proto=1,tos=0,ttl=128,frag=no),icmp(type=1,code=2) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=10,tclass=0x70,hlimit=128,frag=no) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=10,tclass=0x71,hlimit=128,frag=no) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=10,tclass=0x70,hlimit=128,frag=first) 
-in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=10,tclass=0x70,hlimit=128,frag=later) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=6,tclass=0,hlimit=128,frag=no),tcp(src=80,dst=8080) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=17,tclass=0,hlimit=128,frag=no),udp(src=6630,dst=22) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=58,tclass=0,hlimit=128,frag=no),icmpv6(type=1,code=2) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=58,tclass=0,hlimit=128,frag=no),icmpv6(type=136,code=0),nd(target=::3,sll=00:05:06:07:08:09,tll=00:0a:0b:0c:0d:0e) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0806),arp(sip=1.2.3.4,tip=5.6.7.8,op=1,sha=00:0f:10:11:12:13,tha=00:14:15:16:17:18) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=58,tclass=0,hlimit=128,frag=no),icmpv6(type=136,code=0),nd(target=::3,sll=00:05:06:07:08:09,tll=00:0a:0b:0c:0d:0e) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=58,tclass=0,hlimit=128,frag=no),icmpv6(type=136,code=0),nd(target=::3,sll=00:05:06:07:08:09,tll=00:0a:0b:0c:0d:0e),nd_ext(nd_reserved=0x0,nd_options_type=2) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x8847),mpls(label=100,tc=3,ttl=64,bos=1) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x8847),mpls(label=100,tc=7,ttl=100,bos=1) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x8847),mpls(label=100,tc=7,ttl=100,bos=0) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x8848),mpls(label=1000,tc=4,ttl=200,bos=1) 
-in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x8848),mpls(label=1000,tc=4,ttl=200,bos=0) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x1234) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0800),ipv4(src=35.8.2.41,dst=172.16.0.20,proto=5,tos=0x80,ttl=128,frag=no) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0800),ipv4(src=35.8.2.41,dst=172.16.0.20,proto=5,tos=0x81,ttl=128,frag=no) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0800),ipv4(src=35.8.2.41,dst=172.16.0.20,proto=5,tos=0x80,ttl=128,frag=first) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0800),ipv4(src=35.8.2.41,dst=172.16.0.20,proto=5,tos=0x80,ttl=128,frag=later) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0800),ipv4(src=35.8.2.41,dst=172.16.0.20,proto=6,tos=0,ttl=128,frag=no),tcp(src=80,dst=8080) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0800),ipv4(src=35.8.2.41,dst=172.16.0.20,proto=17,tos=0,ttl=128,frag=no),udp(src=81,dst=6632) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0800),ipv4(src=35.8.2.41,dst=172.16.0.20,proto=1,tos=0,ttl=128,frag=no),icmp(type=1,code=2) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=10,tclass=0x70,hlimit=128,frag=no) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=10,tclass=0x71,hlimit=128,frag=no) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=10,tclass=0x70,hlimit=128,frag=first) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=10,tclass=0x70,hlimit=128,frag=later) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=6,tclass=0,hlimit=128,frag=no),tcp(src=80,dst=8080) 
+eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=17,tclass=0,hlimit=128,frag=no),udp(src=6630,dst=22) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=58,tclass=0,hlimit=128,frag=no),icmpv6(type=1,code=2) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=58,tclass=0,hlimit=128,frag=no),icmpv6(type=136,code=0),nd(target=::3,sll=00:05:06:07:08:09,tll=00:0a:0b:0c:0d:0e) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0806),arp(sip=1.2.3.4,tip=5.6.7.8,op=1,sha=00:0f:10:11:12:13,tha=00:14:15:16:17:18) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=58,tclass=0,hlimit=128,frag=no),icmpv6(type=136,code=0),nd(target=::3,sll=00:05:06:07:08:09,tll=00:0a:0b:0c:0d:0e) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=58,tclass=0,hlimit=128,frag=no),icmpv6(type=136,code=0),nd(target=::3,sll=00:05:06:07:08:09,tll=00:0a:0b:0c:0d:0e),nd_ext(nd_reserved=0x0,nd_options_type=2) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x8847),mpls(label=100,tc=3,ttl=64,bos=1) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x8847),mpls(label=100,tc=7,ttl=100,bos=1) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x8847),mpls(label=100,tc=7,ttl=100,bos=0) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x8848),mpls(label=1000,tc=4,ttl=200,bos=1) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x8848),mpls(label=1000,tc=4,ttl=200,bos=0) ]) (echo '# Valid forms without tun_id or VLAN header.' 
- sed 's/^/skb_priority(0),skb_mark(0),recirc_id(0),dp_hash(0),/' odp-base.txt - - sed ' -s/^/skb_priority(0),skb_mark(0),recirc_id(0),dp_hash(0),/ -' odp-base.txt - + sed 's/^/recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),/' odp-base.txt echo echo '# Valid forms with tunnel header.' - sed 's/^/skb_priority(0),tunnel(tun_id=0x7f10354,src=10.10.10.10,dst=20.20.20.20,ttl=64,flags(csum|key)),skb_mark(0x1234),recirc_id(0),dp_hash(0),/' odp-base.txt + sed 's/^/recirc_id(0),dp_hash(0),skb_priority(0),tunnel(tun_id=0x7f10354,src=10.10.10.10,dst=20.20.20.20,ttl=64,flags(csum|key)),in_port(1),skb_mark(0x1234),/' odp-base.txt echo echo '# Valid forms with VLAN header.' - sed 's/^/skb_priority(0),skb_mark(0),recirc_id(0),dp_hash(0),/ + sed 's/^/recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),/ s/\(eth([[^)]]*)\),*/\1,eth_type(0x8100),vlan(vid=99,pcp=7),encap(/ s/$/)/' odp-base.txt echo echo '# Valid forms with MPLS header.' - sed 's/^/skb_priority(0),skb_mark(0),recirc_id(0),dp_hash(0),/ + sed 's/^/recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),/ s/\(eth([[^)]]*),?\)/\1,eth_type(0x8847),mpls(label=100,tc=7,ttl=64,bos=1)/' odp-base.txt echo echo '# Valid forms with MPLS multicast header.' - sed 's/^/skb_priority(0),skb_mark(0),recirc_id(0),dp_hash(0),/ + sed 's/^/recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),/ s/\(eth([[^)]]*),?\)/\1,eth_type(0x8848),mpls(label=100,tc=7,ttl=64,bos=1)/' odp-base.txt echo echo '# Valid forms with tunnel and VLAN headers.' 
- sed 's/^/skb_priority(0),tunnel(tun_id=0xfedcba9876543210,src=10.0.0.1,dst=10.0.0.2,tos=0x8,ttl=128,flags(key)),skb_mark(0),recirc_id(0),dp_hash(0),/ + sed 's/^/recirc_id(0),dp_hash(0),skb_priority(0),tunnel(tun_id=0xfedcba9876543210,src=10.0.0.1,dst=10.0.0.2,tos=0x8,ttl=128,flags(key)),in_port(1),skb_mark(0),/ s/\(eth([[^)]]*)\),*/\1,eth_type(0x8100),vlan(vid=99,pcp=7),encap(/ s/$/)/' odp-base.txt echo echo '# Valid forms with QOS priority, tunnel, and VLAN headers.' - sed 's/^/skb_priority(0x1234),tunnel(tun_id=0xfedcba9876543210,src=10.10.10.10,dst=20.20.20.20,tos=0x8,ttl=64,flags(key)),skb_mark(0),recirc_id(0),dp_hash(0),/ + sed 's/^/recirc_id(0),dp_hash(0),skb_priority(0x1234),tunnel(tun_id=0xfedcba9876543210,src=10.10.10.10,dst=20.20.20.20,tos=0x8,ttl=64,flags(key)),in_port(1),skb_mark(0),/ s/\(eth([[^)]]*)\),*/\1,eth_type(0x8100),vlan(vid=99,pcp=7),encap(/ s/$/)/' odp-base.txt echo echo '# Valid forms with conntrack fields.' - sed 's/^/skb_priority(0),skb_mark(0),ct_mark(0x12345678),ct_label(0x1234567890abcdef1234567890abcdef),recirc_id(0),dp_hash(0),/' odp-base.txt + sed 's/^/recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_mark(0x12345678),ct_label(0x1234567890abcdef1234567890abcdef),/' odp-base.txt echo echo '# Valid forms with IP first fragment.' -sed 's/^/skb_priority(0),skb_mark(0),recirc_id(0),dp_hash(0),/' odp-base.txt | sed -n 's/,frag=no),/,frag=first),/p' + sed 's/^/recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),/' odp-base.txt | sed -n 's/,frag=no),/,frag=first),/p' echo echo '# Valid forms with IP later fragment.' -sed 's/^/skb_priority(0),skb_mark(0),recirc_id(0),dp_hash(0),/' odp-base.txt | sed -n 's/,frag=no),.*/,frag=later)/p' + sed 's/^/recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),/' odp-base.txt | sed -n 's/,frag=no),.*/,frag=later)/p' echo echo '# Valid forms with tunnel and ERSPAN v1 headers.' 
- sed 's/^/skb_priority(0),tunnel(tun_id=0xfedcba9876543210,src=10.0.0.1,dst=10.0.0.2,ttl=128,erspan(ver=1,idx=0x7),flags(df|key)),skb_mark(0),recirc_id(0),dp_hash(0),/' odp-base.txt + sed 's/^/recirc_id(0),dp_hash(0),skb_priority(0),tunnel(tun_id=0xfedcba9876543210,src=10.0.0.1,dst=10.0.0.2,ttl=128,erspan(ver=1,idx=0x7),flags(df|key)),in_port(1),skb_mark(0),/' odp-base.txt echo echo '# Valid forms with tunnel and ERSPAN v2 headers.' - sed 's/^/skb_priority(0),tunnel(tun_id=0xfedcba9876543210,src=10.0.0.1,dst=10.0.0.2,ttl=128,erspan(ver=2,dir=1,hwid=0x7),flags(df|key)),skb_mark(0),recirc_id(0),dp_hash(0),/' odp-base.txt + sed 's/^/recirc_id(0),dp_hash(0),skb_priority(0),tunnel(tun_id=0xfedcba9876543210,src=10.0.0.1,dst=10.0.0.2,ttl=128,erspan(ver=2,dir=1,hwid=0x7),flags(df|key)),in_port(1),skb_mark(0),/' odp-base.txt ) > odp-in.txt AT_CAPTURE_FILE([odp-in.txt]) @@ -102,8 +97,9 @@ s/^/ODP_FIT_TOO_LITTLE: / dnl Some fields are always printed for this test, because wildcards aren't dnl specified. We can skip these. 
sed -i'back' 's/\(skb_mark(0)\),\(ct\)/\1,ct_state(0),ct_zone(0),\2/' odp-out.txt -sed -i'back' 's/\(skb_mark([[^)]]*)\),\(recirc\)/\1,ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),\2/' odp-out.txt -sed -i'back' 's/\(in_port(1)\),\(eth\)/\1,packet_type(ns=0,id=0),\2/' odp-out.txt +sed -i'back' 's/\(skb_mark([[^)]]*)\),\(eth\)/\1,ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),\2/' odp-out.txt +sed -i'back' 's/\(ct_label([[^)]]*)\),\(eth\)/\1,packet_type(ns=0,id=0),\2/' odp-out.txt + AT_CHECK_UNQUOTED([ovstest test-odp parse-keys < odp-in.txt], [0], [`cat odp-out.txt` ]) AT_CHECK_UNQUOTED([cat odp-in.txt | sed 's/^#.*//' | sed 's/$/ actions:drop/' | test-dpparse.py]) diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at index fa6111c1ed2..222415ac096 100644 --- a/tests/ofproto-dpif.at +++ b/tests/ofproto-dpif.at @@ -8777,12 +8777,12 @@ recirc_id(0),in_port(3),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), p ]) AT_CHECK([ovs-appctl dpif/dump-flows -m br0 | strip_ufid | strip_used | sort], [0], [dnl -skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),recirc_id(0),dp_hash(0/0),in_port(p1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:05/00:00:00:00:00:00,dst=50:54:00:00:00:07/00:00:00:00:00:00),eth_type(0x0800),ipv4(src=192.168.0.1/0.0.0.0,dst=192.168.0.2/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no),icmp(type=8/0,code=0/0), packets:0, bytes:0, used:never, actions:drop -skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),recirc_id(0),dp_hash(0/0),in_port(p2),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:07/00:00:00:00:00:00,dst=50:54:00:00:00:05/00:00:00:00:00:00),eth_type(0x0800),ipv4(src=192.168.0.2/0.0.0.0,dst=192.168.0.1/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no),icmp(type=0/0,code=0/0), packets:0, bytes:0, used:never, actions:drop 
+recirc_id(0),dp_hash(0/0),skb_priority(0/0),in_port(p1),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:05/00:00:00:00:00:00,dst=50:54:00:00:00:07/00:00:00:00:00:00),eth_type(0x0800),ipv4(src=192.168.0.1/0.0.0.0,dst=192.168.0.2/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no),icmp(type=8/0,code=0/0), packets:0, bytes:0, used:never, actions:drop +recirc_id(0),dp_hash(0/0),skb_priority(0/0),in_port(p2),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:07/00:00:00:00:00:00,dst=50:54:00:00:00:05/00:00:00:00:00:00),eth_type(0x0800),ipv4(src=192.168.0.2/0.0.0.0,dst=192.168.0.1/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no),icmp(type=0/0,code=0/0), packets:0, bytes:0, used:never, actions:drop ]) AT_CHECK([ovs-appctl dpif/dump-flows -m br1 | strip_ufid | strip_used | sort], [0], [dnl -skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),recirc_id(0),dp_hash(0/0),in_port(p3),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09/00:00:00:00:00:00,dst=50:54:00:00:00:0a/00:00:00:00:00:00),eth_type(0x0800),ipv4(src=10.0.0.2/0.0.0.0,dst=10.0.0.1/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no),icmp(type=8/0,code=0/0), packets:0, bytes:0, used:never, actions:drop +recirc_id(0),dp_hash(0/0),skb_priority(0/0),in_port(p3),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09/00:00:00:00:00:00,dst=50:54:00:00:00:0a/00:00:00:00:00:00),eth_type(0x0800),ipv4(src=10.0.0.2/0.0.0.0,dst=10.0.0.1/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no),icmp(type=8/0,code=0/0), packets:0, bytes:0, used:never, actions:drop ]) OVS_VSWITCHD_STOP @@ -8942,10 +8942,10 @@ recirc_id(0),in_port(101),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), ]) AT_CHECK([grep -e 'in_port(100).*packets:9' ovs-vswitchd.log | strip_ufid | filter_flow_dump], [0], [dnl 
-skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),recirc_id(0),dp_hash(0/0),in_port(100),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:05/00:00:00:00:00:00,dst=50:54:00:00:00:07/00:00:00:00:00:00),eth_type(0x0800),ipv4(src=192.168.0.1/0.0.0.0,dst=192.168.0.2/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no),icmp(type=8/0,code=0/0), packets:9, bytes:954, used:0.0s, actions:101,3,2 +recirc_id(0),dp_hash(0/0),skb_priority(0/0),in_port(100),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:05/00:00:00:00:00:00,dst=50:54:00:00:00:07/00:00:00:00:00:00),eth_type(0x0800),ipv4(src=192.168.0.1/0.0.0.0,dst=192.168.0.2/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no),icmp(type=8/0,code=0/0), packets:9, bytes:954, used:0.0s, actions:101,3,2 ]) AT_CHECK([grep -e 'in_port(101).*packets:4' ovs-vswitchd.log | strip_ufid | filter_flow_dump], [0], [dnl -skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),recirc_id(0),dp_hash(0/0),in_port(101),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:07/00:00:00:00:00:00,dst=50:54:00:00:00:05/00:00:00:00:00:00),eth_type(0x0800),ipv4(src=192.168.0.2/0.0.0.0,dst=192.168.0.1/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no),icmp(type=8/0,code=0/0), packets:4, bytes:424, used:0.0s, actions:100,2,3 +recirc_id(0),dp_hash(0/0),skb_priority(0/0),in_port(101),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:07/00:00:00:00:00:00,dst=50:54:00:00:00:05/00:00:00:00:00:00),eth_type(0x0800),ipv4(src=192.168.0.2/0.0.0.0,dst=192.168.0.1/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no),icmp(type=8/0,code=0/0), packets:4, bytes:424, used:0.0s, actions:100,2,3 ]) AT_CHECK([ovs-ofctl dump-ports br0 pbr0], [0], [dnl @@ -9637,12 +9637,12 @@ table=0 in_port=1,ip,nw_dst=10.0.0.3 actions=drop done sleep 1 AT_CHECK([strip_ufid < ovs-vswitchd.log | filter_flow_install | strip_used], [0], [dnl 
-skb_priority(0),skb_mark(0),ct_state(-new-est-rel-rpl-inv-trk-snat-dnat),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0), actions:2 -skb_priority(0),skb_mark(0),ct_state(-new-est-rel-rpl-inv-trk-snat-dnat),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=10.0.0.4,dst=10.0.0.3,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0), actions:drop +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(-new-est-rel-rpl-inv-trk-snat-dnat),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0), actions:2 +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(-new-est-rel-rpl-inv-trk-snat-dnat),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=10.0.0.4,dst=10.0.0.3,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0), actions:drop ]) AT_CHECK([strip_ufid < ovs-vswitchd.log | filter_flow_dump | grep 'packets:3'], [0], [dnl -skb_priority(0),skb_mark(0),ct_state(0/0xff),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0), packets:3, bytes:318, used:0.0s, actions:2 
-skb_priority(0),skb_mark(0),ct_state(0/0xff),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=10.0.0.4,dst=10.0.0.3,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0), packets:3, bytes:318, used:0.0s, actions:drop +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(0/0xff),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0), packets:3, bytes:318, used:0.0s, actions:2 +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(0/0xff),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=10.0.0.4,dst=10.0.0.3,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0), packets:3, bytes:318, used:0.0s, actions:drop ]) OVS_VSWITCHD_STOP AT_CLEANUP]) @@ -10344,7 +10344,7 @@ recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x1234), packets:5, byte ]) AT_CHECK([grep 'modify' ovs-vswitchd.log | strip_ufid ], [0], [dnl -dpif|DBG|dummy@ovs-dummy: put[[modify]] skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),recirc_id(0),dp_hash(0/0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09/00:00:00:00:00:00,dst=50:54:00:00:00:0a/00:00:00:00:00:00),eth_type(0x1234), actions:push_vlan(vid=4,pcp=0),100 +dpif|DBG|dummy@ovs-dummy: put[[modify]] recirc_id(0),dp_hash(0/0),skb_priority(0/0),in_port(1),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09/00:00:00:00:00:00,dst=50:54:00:00:00:0a/00:00:00:00:00:00),eth_type(0x1234), actions:push_vlan(vid=4,pcp=0),100 ]) OVS_VSWITCHD_STOP AT_CLEANUP @@ -10425,8 +10425,8 @@ recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x8100),vlan(vid=99,pcp= # are 
wildcarded. AT_CHECK([grep '\(modify\)\|\(flow_add\)' ovs-vswitchd.log | strip_ufid ], [0], [dnl dpif_netdev|DBG|flow_add: recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x1234), actions:100 -dpif|DBG|dummy@ovs-dummy: put[[modify]] skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),recirc_id(0),dp_hash(0/0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09/00:00:00:00:00:00,dst=50:54:00:00:00:0a/00:00:00:00:00:00),eth_type(0x1234), actions:drop -dpif|DBG|dummy@ovs-dummy: put[[modify]] skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),recirc_id(0),dp_hash(0/0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09/00:00:00:00:00:00,dst=50:54:00:00:00:0a/00:00:00:00:00:00),eth_type(0x1234), actions:100 +dpif|DBG|dummy@ovs-dummy: put[[modify]] recirc_id(0),dp_hash(0/0),skb_priority(0/0),in_port(1),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09/00:00:00:00:00:00,dst=50:54:00:00:00:0a/00:00:00:00:00:00),eth_type(0x1234), actions:drop +dpif|DBG|dummy@ovs-dummy: put[[modify]] recirc_id(0),dp_hash(0/0),skb_priority(0/0),in_port(1),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09/00:00:00:00:00:00,dst=50:54:00:00:00:0a/00:00:00:00:00:00),eth_type(0x1234), actions:100 dpif_netdev|DBG|flow_add: recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x8100),vlan(vid=99,pcp=7/0x0),encap(eth_type(0x1234)), actions:drop ]) OVS_VSWITCHD_STOP @@ -10752,10 +10752,10 @@ AT_CHECK([ovs-appctl netdev-dummy/receive p2 'in_port(2),eth(src=50:54:00:00:00: AT_CHECK([cat ovs-vswitchd.log | strip_ufid | filter_flow_install], [0], [dnl -ct_state(+new-est+trk),recirc_id(0x1),in_port(2),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:drop -ct_state(-new+est+trk),recirc_id(0x1),in_port(2),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(proto=17,frag=no), 
actions:1 recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(proto=17,frag=no), actions:ct(commit),2 recirc_id(0),in_port(2),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(proto=17,frag=no), actions:ct,recirc(0x1) +recirc_id(0x1),in_port(2),ct_state(+new-est+trk),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:drop +recirc_id(0x1),in_port(2),ct_state(-new+est+trk),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(proto=17,frag=no), actions:1 ]) OVS_VSWITCHD_STOP @@ -11161,9 +11161,9 @@ AT_CHECK([ovs-appctl netdev-dummy/receive p2 'in_port(2),eth(src=50:54:00:00:00: ovs-appctl revalidator/wait AT_CHECK([cat ovs-vswitchd.log | strip_ufid | filter_flow_install], [0], [dnl -ct_state(+rpl+trk),ct_label(0x1),recirc_id(0x1),in_port(2),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:1 recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(proto=17,frag=no),udp(src=1), actions:ct(commit,label=0x1),2 recirc_id(0),in_port(2),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:ct,recirc(0x1) +recirc_id(0x1),in_port(2),ct_state(+rpl+trk),ct_label(0x1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:1 ]) OVS_VSWITCHD_STOP diff --git a/tests/packet-type-aware.at b/tests/packet-type-aware.at index 3b5c66fe526..acfb0913169 100644 --- a/tests/packet-type-aware.at +++ b/tests/packet-type-aware.at @@ -327,7 +327,7 @@ AT_CHECK([ ovs-appctl dpctl/dump-flows --names dummy@ovs-dummy | strip_used | grep -v ipv6 | sort ], [0], [flow-dump from the main thread: recirc_id(0),in_port(n1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.10.30,tos=0/0x3,frag=no), packets:1, bytes:98, used:0.0s, actions:tnl_push(tnl_port(gre_sys),header(size=38,type=3,eth(dst=aa:55:00:00:00:03,src=aa:55:00:00:00:01,dl_type=0x0800),ipv4(src=10.0.0.1,dst=10.0.0.3,proto=47,tos=0,ttl=64,frag=0x4000),gre((flags=0x0,proto=0x6558))),out_port(br-p1)),set(ipv4(src=30.0.0.1,dst=30.0.0.3)),tnl_pop(gre_sys) 
-tunnel(src=30.0.0.1,dst=30.0.0.3,flags(-df-csum)),recirc_id(0),in_port(gre_sys),packet_type(ns=0,id=0),eth(dst=1e:2c:e9:2a:66:9e),eth_type(0x0800),ipv4(dst=192.168.10.30,frag=no), packets:1, bytes:98, used:0.0s, actions:set(eth(dst=aa:55:aa:55:00:03)),n3 +recirc_id(0),tunnel(src=30.0.0.1,dst=30.0.0.3,flags(-df-csum)),in_port(gre_sys),packet_type(ns=0,id=0),eth(dst=1e:2c:e9:2a:66:9e),eth_type(0x0800),ipv4(dst=192.168.10.30,frag=no), packets:1, bytes:98, used:0.0s, actions:set(eth(dst=aa:55:aa:55:00:03)),n3 ]) # Clear up megaflow cache @@ -345,7 +345,7 @@ AT_CHECK([ ovs-appctl dpctl/dump-flows --names dummy@ovs-dummy | strip_used | grep -v ipv6 | sort ], [0], [flow-dump from the main thread: recirc_id(0),in_port(n1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.10.20,tos=0/0x3,frag=no), packets:1, bytes:98, used:0.0s, actions:tnl_push(tnl_port(gre_sys),header(size=38,type=3,eth(dst=aa:55:00:00:00:02,src=aa:55:00:00:00:01,dl_type=0x0800),ipv4(src=10.0.0.1,dst=10.0.0.2,proto=47,tos=0,ttl=64,frag=0x4000),gre((flags=0x0,proto=0x6558))),out_port(br-p1)),set(ipv4(src=20.0.0.1,dst=20.0.0.2)),tnl_pop(gre_sys) -tunnel(src=20.0.0.1,dst=20.0.0.2,flags(-df-csum)),recirc_id(0),in_port(gre_sys),packet_type(ns=0,id=0),eth(dst=46:1e:7d:1a:95:a1),eth_type(0x0800),ipv4(dst=192.168.10.20,frag=no), packets:1, bytes:98, used:0.0s, actions:set(eth(dst=aa:55:aa:55:00:02)),n2 +recirc_id(0),tunnel(src=20.0.0.1,dst=20.0.0.2,flags(-df-csum)),in_port(gre_sys),packet_type(ns=0,id=0),eth(dst=46:1e:7d:1a:95:a1),eth_type(0x0800),ipv4(dst=192.168.10.20,frag=no), packets:1, bytes:98, used:0.0s, actions:set(eth(dst=aa:55:aa:55:00:02)),n2 ]) # Clear up megaflow cache @@ -363,7 +363,7 @@ AT_CHECK([ ovs-appctl dpctl/dump-flows --names dummy@ovs-dummy | strip_used | grep -v ipv6 | sort ], [0], [flow-dump from the main thread: recirc_id(0),in_port(n2),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.10.10,tos=0/0x3,frag=no), packets:1, bytes:98, used:0.0s, 
actions:tnl_push(tnl_port(gre_sys),header(size=38,type=3,eth(dst=aa:55:00:00:00:01,src=aa:55:00:00:00:02,dl_type=0x0800),ipv4(src=20.0.0.2,dst=20.0.0.1,proto=47,tos=0,ttl=64,frag=0x4000),gre((flags=0x0,proto=0x6558))),out_port(br-p2)),set(ipv4(src=10.0.0.2,dst=10.0.0.1)),tnl_pop(gre_sys) -tunnel(src=10.0.0.2,dst=10.0.0.1,flags(-df-csum)),recirc_id(0),in_port(gre_sys),packet_type(ns=0,id=0),eth(dst=3a:6d:d2:09:9c:ab),eth_type(0x0800),ipv4(dst=192.168.10.10,frag=no), packets:1, bytes:98, used:0.0s, actions:set(eth(dst=aa:55:aa:55:00:01)),n1 +recirc_id(0),tunnel(src=10.0.0.2,dst=10.0.0.1,flags(-df-csum)),in_port(gre_sys),packet_type(ns=0,id=0),eth(dst=3a:6d:d2:09:9c:ab),eth_type(0x0800),ipv4(dst=192.168.10.10,frag=no), packets:1, bytes:98, used:0.0s, actions:set(eth(dst=aa:55:aa:55:00:01)),n1 ]) # Clear up megaflow cache @@ -381,8 +381,8 @@ AT_CHECK([ ovs-appctl dpctl/dump-flows --names dummy@ovs-dummy | strip_used | grep -v ipv6 | sort ], [0], [flow-dump from the main thread: recirc_id(0),in_port(n2),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.10.30,tos=0/0x3,frag=no), packets:1, bytes:98, used:0.0s, actions:tnl_push(tnl_port(gre_sys),header(size=38,type=3,eth(dst=aa:55:00:00:00:01,src=aa:55:00:00:00:02,dl_type=0x0800),ipv4(src=20.0.0.2,dst=20.0.0.1,proto=47,tos=0,ttl=64,frag=0x4000),gre((flags=0x0,proto=0x6558))),out_port(br-p2)),set(ipv4(src=10.0.0.2,dst=10.0.0.1)),tnl_pop(gre_sys) -tunnel(src=10.0.0.2,dst=10.0.0.1,flags(-df-csum)),recirc_id(0),in_port(gre_sys),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.10.30,tos=0/0x3,frag=no), packets:1, bytes:98, used:0.0s, actions:tnl_push(tnl_port(gre_sys),header(size=38,type=3,eth(dst=aa:55:00:00:00:03,src=aa:55:00:00:00:01,dl_type=0x0800),ipv4(src=10.0.0.1,dst=10.0.0.3,proto=47,tos=0,ttl=64,frag=0x4000),gre((flags=0x0,proto=0x6558))),out_port(br-p1)),set(ipv4(src=30.0.0.1,dst=30.0.0.3)),tnl_pop(gre_sys) 
-tunnel(src=30.0.0.1,dst=30.0.0.3,flags(-df-csum)),recirc_id(0),in_port(gre_sys),packet_type(ns=0,id=0),eth(dst=1e:2c:e9:2a:66:9e),eth_type(0x0800),ipv4(dst=192.168.10.30,frag=no), packets:1, bytes:98, used:0.0s, actions:set(eth(dst=aa:55:aa:55:00:03)),n3 +recirc_id(0),tunnel(src=10.0.0.2,dst=10.0.0.1,flags(-df-csum)),in_port(gre_sys),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.10.30,tos=0/0x3,frag=no), packets:1, bytes:98, used:0.0s, actions:tnl_push(tnl_port(gre_sys),header(size=38,type=3,eth(dst=aa:55:00:00:00:03,src=aa:55:00:00:00:01,dl_type=0x0800),ipv4(src=10.0.0.1,dst=10.0.0.3,proto=47,tos=0,ttl=64,frag=0x4000),gre((flags=0x0,proto=0x6558))),out_port(br-p1)),set(ipv4(src=30.0.0.1,dst=30.0.0.3)),tnl_pop(gre_sys) +recirc_id(0),tunnel(src=30.0.0.1,dst=30.0.0.3,flags(-df-csum)),in_port(gre_sys),packet_type(ns=0,id=0),eth(dst=1e:2c:e9:2a:66:9e),eth_type(0x0800),ipv4(dst=192.168.10.30,frag=no), packets:1, bytes:98, used:0.0s, actions:set(eth(dst=aa:55:aa:55:00:03)),n3 ]) # Clear up megaflow cache @@ -400,8 +400,8 @@ AT_CHECK([ ovs-appctl dpctl/dump-flows --names dummy@ovs-dummy | strip_used | grep -v ipv6 | sort ], [0], [flow-dump from the main thread: recirc_id(0),in_port(n3),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.10.10,tos=0/0x3,frag=no), packets:1, bytes:98, used:0.0s, actions:pop_eth,tnl_push(tnl_port(gre_sys),header(size=38,type=3,eth(dst=aa:55:00:00:00:02,src=aa:55:00:00:00:03,dl_type=0x0800),ipv4(src=30.0.0.3,dst=30.0.0.2,proto=47,tos=0,ttl=64,frag=0x4000),gre((flags=0x0,proto=0x800))),out_port(br-p3)),set(ipv4(src=20.0.0.3,dst=20.0.0.2)),tnl_pop(gre_sys) -tunnel(src=10.0.0.2,dst=10.0.0.1,flags(-df-csum)),recirc_id(0),in_port(gre_sys),packet_type(ns=1,id=0x800),eth_type(0x0800),ipv4(dst=192.168.10.10,frag=no), packets:1, bytes:84, used:0.0s, actions:push_eth(src=00:00:00:00:00:00,dst=aa:55:aa:55:00:01),n1 
-tunnel(src=20.0.0.3,dst=20.0.0.2,flags(-df-csum)),recirc_id(0),in_port(gre_sys),packet_type(ns=1,id=0x800),eth_type(0x0800),ipv4(dst=192.168.10.10,tos=0/0x3,frag=no), packets:1, bytes:84, used:0.0s, actions:tnl_push(tnl_port(gre_sys),header(size=38,type=3,eth(dst=aa:55:00:00:00:01,src=aa:55:00:00:00:02,dl_type=0x0800),ipv4(src=20.0.0.2,dst=20.0.0.1,proto=47,tos=0,ttl=64,frag=0x4000),gre((flags=0x0,proto=0x800))),out_port(br-p2)),set(ipv4(src=10.0.0.2,dst=10.0.0.1)),tnl_pop(gre_sys) +recirc_id(0),tunnel(src=10.0.0.2,dst=10.0.0.1,flags(-df-csum)),in_port(gre_sys),packet_type(ns=1,id=0x800),eth_type(0x0800),ipv4(dst=192.168.10.10,frag=no), packets:1, bytes:84, used:0.0s, actions:push_eth(src=00:00:00:00:00:00,dst=aa:55:aa:55:00:01),n1 +recirc_id(0),tunnel(src=20.0.0.3,dst=20.0.0.2,flags(-df-csum)),in_port(gre_sys),packet_type(ns=1,id=0x800),eth_type(0x0800),ipv4(dst=192.168.10.10,tos=0/0x3,frag=no), packets:1, bytes:84, used:0.0s, actions:tnl_push(tnl_port(gre_sys),header(size=38,type=3,eth(dst=aa:55:00:00:00:01,src=aa:55:00:00:00:02,dl_type=0x0800),ipv4(src=20.0.0.2,dst=20.0.0.1,proto=47,tos=0,ttl=64,frag=0x4000),gre((flags=0x0,proto=0x800))),out_port(br-p2)),set(ipv4(src=10.0.0.2,dst=10.0.0.1)),tnl_pop(gre_sys) ]) # Clear up megaflow cache @@ -419,7 +419,7 @@ AT_CHECK([ ovs-appctl dpctl/dump-flows --names dummy@ovs-dummy | strip_used | grep -v ipv6 | sort ], [0], [flow-dump from the main thread: recirc_id(0),in_port(n3),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.10.20,tos=0/0x3,frag=no), packets:1, bytes:98, used:0.0s, actions:tnl_push(tnl_port(gre_sys),header(size=38,type=3,eth(dst=aa:55:00:00:00:02,src=aa:55:00:00:00:03,dl_type=0x0800),ipv4(src=30.0.0.3,dst=30.0.0.2,proto=47,tos=0,ttl=64,frag=0x4000),gre((flags=0x0,proto=0x6558))),out_port(br-p3)),set(ipv4(src=20.0.0.3,dst=20.0.0.2)),tnl_pop(gre_sys) 
-tunnel(src=20.0.0.3,dst=20.0.0.2,flags(-df-csum)),recirc_id(0),in_port(gre_sys),packet_type(ns=0,id=0),eth(dst=46:1e:7d:1a:95:a1),eth_type(0x0800),ipv4(dst=192.168.10.20,frag=no), packets:1, bytes:98, used:0.0s, actions:set(eth(dst=aa:55:aa:55:00:02)),n2 +recirc_id(0),tunnel(src=20.0.0.3,dst=20.0.0.2,flags(-df-csum)),in_port(gre_sys),packet_type(ns=0,id=0),eth(dst=46:1e:7d:1a:95:a1),eth_type(0x0800),ipv4(dst=192.168.10.20,frag=no), packets:1, bytes:98, used:0.0s, actions:set(eth(dst=aa:55:aa:55:00:02)),n2 ]) ### Check the received packets @@ -505,7 +505,7 @@ AT_CHECK([ ovs-appctl dpctl/dump-flows --names dummy@ovs-dummy | strip_used | grep -v ipv6 | sort ], [0], [flow-dump from the main thread: recirc_id(0),in_port(n3),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.10.20,tos=0/0x3,frag=no), packets:1, bytes:98, used:0.0s, actions:pop_eth,tnl_push(tnl_port(gre_sys),header(size=38,type=3,eth(dst=aa:55:00:00:00:02,src=aa:55:00:00:00:03,dl_type=0x0800),ipv4(src=30.0.0.3,dst=30.0.0.2,proto=47,tos=0,ttl=64,frag=0x4000),gre((flags=0x0,proto=0x800))),out_port(br-p3)),set(ipv4(src=20.0.0.3,dst=20.0.0.2)),tnl_pop(gre_sys) -tunnel(src=20.0.0.3,dst=20.0.0.2,flags(-df-csum)),recirc_id(0),in_port(gre_sys),packet_type(ns=1,id=0x800),eth_type(0x0800),ipv4(dst=192.168.10.20,frag=no), packets:1, bytes:84, used:0.0s, actions:drop +recirc_id(0),tunnel(src=20.0.0.3,dst=20.0.0.2,flags(-df-csum)),in_port(gre_sys),packet_type(ns=1,id=0x800),eth_type(0x0800),ipv4(dst=192.168.10.20,frag=no), packets:1, bytes:84, used:0.0s, actions:drop ]) OVS_VSWITCHD_STOP(["/The Open vSwitch kernel module is probably not loaded/d"]) @@ -1020,8 +1020,8 @@ AT_CHECK([ ovs-appctl dpctl/dump-flows --names dummy@ovs-dummy | strip_used | grep -v ipv6 | sort ], [0], [flow-dump from the main thread: recirc_id(0),in_port(p0),packet_type(ns=0,id=0),eth(src=aa:bb:cc:00:00:02,dst=aa:bb:cc:00:00:01),eth_type(0x0800),ipv4(dst=20.0.0.1,proto=47,frag=no), packets:3, bytes:378, used:0.0s, actions:tnl_pop(gre_sys) 
-tunnel(src=20.0.0.2,dst=20.0.0.1,flags(-df-csum)),recirc_id(0),in_port(gre_sys),packet_type(ns=1,id=0x8847),eth_type(0x8847),mpls(label=999/0x0,tc=0/0,ttl=64/0x0,bos=1/1), packets:3, bytes:264, used:0.0s, actions:push_eth(src=00:00:00:00:00:00,dst=00:00:00:00:00:00),pop_mpls(eth_type=0x800),recirc(0x1) -tunnel(src=20.0.0.2,dst=20.0.0.1,flags(-df-csum)),recirc_id(0x1),in_port(gre_sys),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(ttl=64,frag=no), packets:3, bytes:294, used:0.0s, actions:set(ipv4(ttl=63)),int-br +recirc_id(0),tunnel(src=20.0.0.2,dst=20.0.0.1,flags(-df-csum)),in_port(gre_sys),packet_type(ns=1,id=0x8847),eth_type(0x8847),mpls(label=999/0x0,tc=0/0,ttl=64/0x0,bos=1/1), packets:3, bytes:264, used:0.0s, actions:push_eth(src=00:00:00:00:00:00,dst=00:00:00:00:00:00),pop_mpls(eth_type=0x800),recirc(0x1) +recirc_id(0x1),tunnel(src=20.0.0.2,dst=20.0.0.1,flags(-df-csum)),in_port(gre_sys),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(ttl=64,frag=no), packets:3, bytes:294, used:0.0s, actions:set(ipv4(ttl=63)),int-br ]) ovs-appctl time/warp 1000 diff --git a/tests/pmd.at b/tests/pmd.at index c707f762c78..48f3d432d22 100644 --- a/tests/pmd.at +++ b/tests/pmd.at @@ -455,7 +455,7 @@ for i in `seq 0 19`; ovs-appctl time/warp 100 AT_CHECK([grep -A 1 'miss upcall' ovs-vswitchd.log | tail -n 1], [0], [dnl -skb_priority(0),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:77,dst=50:54:00:00:01:78),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0) +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:77,dst=50:54:00:00:01:78),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0) ]) AT_CHECK([cat ovs-vswitchd.log | filter_flow_install | strip_xout], [0], [dnl 
recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:77,dst=50:54:00:00:01:78),eth_type(0x0800),ipv4(frag=no), actions: diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 4f3e767896d..ce0f14cf1dd 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -7159,7 +7159,6 @@ AT_CLEANUP AT_SETUP([conntrack - can match and clear ct_state from outside OVS]) CHECK_CONNTRACK_LOCAL_STACK() -CHECK_NO_TC_OFFLOAD() OVS_CHECK_TUNNEL_TSO() OVS_CHECK_GENEVE() diff --git a/tests/tunnel-push-pop-ipv6.at b/tests/tunnel-push-pop-ipv6.at index c96b77cd15f..2cf306c67ec 100644 --- a/tests/tunnel-push-pop-ipv6.at +++ b/tests/tunnel-push-pop-ipv6.at @@ -459,7 +459,7 @@ AT_CHECK([ovs-ofctl dump-ports int-br | grep 'port 5'], [0], [dnl port 5: rx pkts=1, bytes=98, drop=?, errs=?, frame=?, over=?, crc=? ]) AT_CHECK([ovs-appctl dpif/dump-flows int-br | grep 'in_port(6081)'], [0], [dnl -tunnel(tun_id=0x7b,ipv6_src=2001:cafe::92,ipv6_dst=2001:cafe::88,geneve({class=0xffff,type=0x80,len=4,0xa/0xf}{class=0xffff,type=0,len=4}),flags(-df-csum+key)),recirc_id(0),in_port(6081),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), packets:0, bytes:0, used:never, actions:userspace(pid=0,controller(reason=1,dont_send=0,continuation=0,recirc_id=3,rule_cookie=0,controller_id=0,max_len=65535)) +recirc_id(0),tunnel(tun_id=0x7b,ipv6_src=2001:cafe::92,ipv6_dst=2001:cafe::88,geneve({class=0xffff,type=0x80,len=4,0xa/0xf}{class=0xffff,type=0,len=4}),flags(-df-csum+key)),in_port(6081),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), packets:0, bytes:0, used:never, actions:userspace(pid=0,controller(reason=1,dont_send=0,continuation=0,recirc_id=3,rule_cookie=0,controller_id=0,max_len=65535)) ]) dnl Receive VXLAN with different MAC and verify that the neigh cache gets updated diff --git a/tests/tunnel-push-pop.at b/tests/tunnel-push-pop.at index 013ecbcaa80..b1440f59045 100644 --- a/tests/tunnel-push-pop.at +++ b/tests/tunnel-push-pop.at @@ -608,7 +608,7 @@ 
AT_CHECK([ovs-ofctl dump-ports int-br | grep 'port 5'], [0], [dnl port 5: rx pkts=1, bytes=98, drop=?, errs=?, frame=?, over=?, crc=? ]) AT_CHECK([ovs-appctl dpif/dump-flows int-br | grep 'in_port(6081)' | sed -e 's/recirc_id=[[0-9]]*/recirc_id=/g'], [0], [dnl -tunnel(tun_id=0x7b,src=1.1.2.92,dst=1.1.2.88,geneve({class=0xffff,type=0x80,len=4,0xa/0xf}{class=0xffff,type=0,len=4}),flags(-df-csum+key)),recirc_id(0),in_port(6081),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), packets:0, bytes:0, used:never, actions:userspace(pid=0,controller(reason=1,dont_send=0,continuation=0,recirc_id=,rule_cookie=0,controller_id=0,max_len=65535)) +recirc_id(0),tunnel(tun_id=0x7b,src=1.1.2.92,dst=1.1.2.88,geneve({class=0xffff,type=0x80,len=4,0xa/0xf}{class=0xffff,type=0,len=4}),flags(-df-csum+key)),in_port(6081),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), packets:0, bytes:0, used:never, actions:userspace(pid=0,controller(reason=1,dont_send=0,continuation=0,recirc_id=,rule_cookie=0,controller_id=0,max_len=65535)) ]) dnl Receive VXLAN with different MAC and verify that the neigh cache gets updated diff --git a/tests/tunnel.at b/tests/tunnel.at index 037b4c39081..78cc3f3e99a 100644 --- a/tests/tunnel.at +++ b/tests/tunnel.at @@ -126,7 +126,7 @@ AT_CHECK([ovs-appctl dpif/show | tail -n +3], [0], [dnl AT_CHECK([ovs-appctl dpctl/add-flow "tunnel(dst=1.1.1.1,src=3.3.3.200/255.255.255.0,tp_dst=123,tp_src=1,ttl=64),recirc_id(0),in_port(1),eth(),eth_type(0x0800),ipv4()" "2"]) AT_CHECK([ovs-appctl dpctl/dump-flows | tail -1], [0], [dnl -tunnel(src=3.3.3.200/255.255.255.0,dst=1.1.1.1,ttl=64,tp_src=1,tp_dst=123),recirc_id(0),in_port(1),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:2 +recirc_id(0),tunnel(src=3.3.3.200/255.255.255.0,dst=1.1.1.1,ttl=64,tp_src=1,tp_dst=123),in_port(1),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:2 ]) OVS_VSWITCHD_STOP From a08a589ce494fffccefadc4a767197796d30da2f Mon Sep 17 00:00:00 2001 From: Eelco Chaudron 
Date: Tue, 7 Feb 2023 15:06:23 +0100 Subject: [PATCH 163/833] netdev-offload-tc: If the flow has not been used, report it as such. If a tc flow was installed but has not yet been used, report it as such. In addition, add a delay to the "IGMP - flood under normal action" test case to make it work with many repetitions. This delay is also present in other ICMP/IGMP tests. Fixes: f98e418fbdb6 ("tc: Add tc flower functions") Signed-off-by: Eelco Chaudron Acked-by: Roi Dayan Reviewed-by: Simon Horman Tested-by: Simon Horman Signed-off-by: Ilya Maximets --- lib/tc.c | 14 +++++++++++++- tests/system-traffic.at | 1 - 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/lib/tc.c b/lib/tc.c index 1fb2b4a92ca..4c07e22162e 100644 --- a/lib/tc.c +++ b/lib/tc.c @@ -1366,7 +1366,19 @@ get_user_hz(void) static void nl_parse_tcf(const struct tcf_t *tm, struct tc_flower *flower) { - uint64_t lastused = time_msec() - (tm->lastuse * 1000 / get_user_hz()); + uint64_t lastused; + + /* On creation both tm->install and tm->lastuse are set to jiffies + * by the kernel. So if both values are the same, the flow has not been + * used yet. + * + * Note that tm->firstuse can not be used due to some kernel bug, i.e., + * hardware offloaded flows do not update tm->firstuse. 
*/ + if (tm->lastuse == tm->install) { + lastused = 0; + } else { + lastused = time_msec() - (tm->lastuse * 1000 / get_user_hz()); + } if (flower->lastused < lastused) { flower->lastused = lastused; diff --git a/tests/system-traffic.at b/tests/system-traffic.at index ce0f14cf1dd..76f1f39a23c 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -7207,7 +7207,6 @@ AT_CLEANUP AT_BANNER([IGMP]) AT_SETUP([IGMP - flood under normal action]) -CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(at_ns0, at_ns1) From 594d1fee5b04c0a219cc87488507b021820020a1 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 7 Feb 2023 15:07:24 +0100 Subject: [PATCH 164/833] tests: Fix reading of OpenFlow byte counters in GRE test cases. With some datapaths, read TC, it takes a bit longer to update the OpenFlow statistics. Rather than adding an additional delay, try to read the counters multiple times until we get the desired value. Signed-off-by: Eelco Chaudron Acked-by: Roi Dayan Reviewed-by: Simon Horman Signed-off-by: Ilya Maximets --- tests/system-traffic.at | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 76f1f39a23c..2d46e063959 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -1638,7 +1638,6 @@ dnl br-underlay: with IP: 172.31.1.100 dnl ns0: connect to br-underlay, with IP: 10.1.1.1 AT_SETUP([datapath - truncate and output to gre tunnel by simulated packets]) OVS_CHECK_MIN_KERNEL(3, 10) -CHECK_NO_TC_OFFLOAD() AT_SKIP_IF([test $HAVE_NC = no]) OVS_TRAFFIC_VSWITCHD_START() @@ -1709,9 +1708,8 @@ AT_CHECK([ovs-ofctl dump-flows br0 | grep "in_port=2" | sed -n 's/.*\(n\_bytes=[ n_bytes=242 ]) dnl After truncation = outer ETH(14) + outer IP(20) + GRE(4) + 100 = 138B -AT_CHECK([ovs-ofctl dump-flows br-underlay | grep "in_port=LOCAL" | sed -n 's/.*\(n\_bytes=[[0-9]]*\).*/\1/p'], [0], [dnl -n_bytes=138 -]) +OVS_WAIT_UNTIL_EQUAL([ovs-ofctl 
dump-flows br-underlay | grep "in_port=LOCAL" | sed -n 's/.*\(n\_bytes=[[0-9]]*\).*/\1/p'], [dnl +n_bytes=138]) dnl check tunnel pop path, from at_ns0 to at_ns1 dnl This 200-byte packet is simulated on behalf of ns_gre0 @@ -1719,9 +1717,9 @@ ovs-ofctl -O OpenFlow13 packet-out br-underlay "in_port=1 packet=02908ca8a149faa dnl After truncation = 100 byte at loopback device p2(4) AT_CHECK([ovs-appctl revalidator/purge], [0]) -AT_CHECK([ovs-ofctl dump-flows br0 | grep "in_port=4" | ofctl_strip], [0], [dnl - n_packets=1, n_bytes=100, priority=1,ip,in_port=4 actions=drop -]) +OVS_WAIT_UNTIL_EQUAL([ovs-ofctl dump-flows br0 | grep "in_port=4" | ofctl_strip], [dnl + n_packets=1, n_bytes=100, priority=1,ip,in_port=4 actions=drop]) + dnl SLOW_ACTION: disable datapath truncate support dnl Repeat the test above, but exercise the SLOW_ACTION code path @@ -1746,9 +1744,8 @@ AT_CHECK([ovs-ofctl dump-flows br0 | grep "in_port=2" | sed -n 's/.*\(n\_bytes=[ n_bytes=242 ]) dnl After truncation = outer ETH(14) + outer IP(20) + GRE(4) + 100 = 138B -AT_CHECK([ovs-ofctl dump-flows br-underlay | grep "in_port=LOCAL" | sed -n 's/.*\(n\_bytes=[[0-9]]*\).*/\1/p'], [0], [dnl -n_bytes=138 -]) +OVS_WAIT_UNTIL_EQUAL([ovs-ofctl dump-flows br-underlay | grep "in_port=LOCAL" | sed -n 's/.*\(n\_bytes=[[0-9]]*\).*/\1/p'], [dnl +n_bytes=138]) dnl check tunnel pop path, from at_ns0 to at_ns1 dnl This 200-byte packet is simulated on behalf of ns_gre0 @@ -1773,7 +1770,6 @@ AT_SETUP([datapath - truncate and output to gre tunnel]) AT_SKIP_IF([test $HAVE_NC = no]) OVS_CHECK_KERNEL_EXCL(3, 10, 4, 15) OVS_CHECK_GRE() -CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() ADD_BR([br-underlay]) From 7bb0c33d78a2731b568c97209e0ec08baecbce3d Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 7 Feb 2023 15:07:56 +0100 Subject: [PATCH 165/833] tests: Comment currently failing TC system-traffic tests. I commented the three remaining failures when running tc with the system-traffic tests. 
In addition I ran the following test to verify we did not see any failures with recheck enabled: for i in {1..50}; do make check-offloads || \ make check-offloads TESTSUITEFLAGS="--recheck" || break; \ echo "ALL_50_OK: $i"; done; Unfortunately, a bunch of test cases showed occasional failures. For now, they are excluded from the test cases and need further investigation. They are: datapath - truncate and output to gre tunnel datapath - truncate and output to gre tunnel by simulated packets These tests were executed on a Fedora37 machine with the kernel 6.1.5-200.fc37.x86_64 installed. Signed-off-by: Eelco Chaudron Acked-by: Roi Dayan Reviewed-by: Simon Horman Tested-by: Simon Horman Signed-off-by: Ilya Maximets --- tests/system-offloads-testsuite-macros.at | 21 +++++++++++++++++++++ tests/system-traffic.at | 2 ++ 2 files changed, 23 insertions(+) diff --git a/tests/system-offloads-testsuite-macros.at b/tests/system-offloads-testsuite-macros.at index 322166b8c4d..e50dc07fbcc 100644 --- a/tests/system-offloads-testsuite-macros.at +++ b/tests/system-offloads-testsuite-macros.at @@ -30,6 +30,27 @@ m4_define([OVS_TRAFFIC_VSWITCHD_START], ]) # Macro to exclude tests that will fail with TC offload enabled. +# We currently have the below tests disabled in system-traffic.at +# for the following reasons: +# +# TC does not support moving ports to a different namespace than vswitchd's +# namespace, so we need to disable this test. +# - 'conntrack - multiple namespaces, internal ports' +# +# The kernel's tcf_ct_act() function does not seem to take care of any (QinQ) +# VLAN headers causing commits to fail. However, if this is solved, we have to +# make sure conntrack does not break the VLAN boundary, i.e., putting together +# two packets with different CVLAN+SVLAN values. +# - 'conntrack - IPv4 fragmentation + cvlan' +# +# Fragmentation handling in ct zone 9 does not seem to work correctly. +# When moving this test over to the default zone all works fine.
+# - 'conntrack - Fragmentation over vxlan' +# +# Occasionally we fail with invalid byte counts. +# - 'datapath - truncate and output to gre tunnel by simulated packets' +# - 'datapath - truncate and output to gre tunnel' +# m4_define([CHECK_NO_TC_OFFLOAD], [ AT_SKIP_IF([:]) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 2d46e063959..c8b0acdd0eb 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -1639,6 +1639,7 @@ dnl ns0: connect to br-underlay, with IP: 10.1.1.1 AT_SETUP([datapath - truncate and output to gre tunnel by simulated packets]) OVS_CHECK_MIN_KERNEL(3, 10) AT_SKIP_IF([test $HAVE_NC = no]) +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() ADD_BR([br-underlay], [set bridge br-underlay other-config:hwaddr=\"02:90:8c:a8:a1:49\"]) @@ -1770,6 +1771,7 @@ AT_SETUP([datapath - truncate and output to gre tunnel]) AT_SKIP_IF([test $HAVE_NC = no]) OVS_CHECK_KERNEL_EXCL(3, 10, 4, 15) OVS_CHECK_GRE() +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() ADD_BR([br-underlay]) From 5f219af8b3c723d7117c075941ab80fe6a6c768f Mon Sep 17 00:00:00 2001 From: Frode Nordahl Date: Thu, 9 Feb 2023 08:48:24 +0100 Subject: [PATCH 166/833] ovsdb-server: Fix handling of DNS name for listener configuration. Commit 08e9e5337383 fixed proper initialization of the dns-resolve module, and made DNS resolution asynchronous. A side effect of that change revealed a long-standing logic bug which broke ovsdb-server listener configuration using DNS names. Previously this worked because the DNS resolution would block, now that DNS resolution is asynchronous the code before this change would assume the error from jsonrpc_pstream_open meant the remote was a specification for an active outgoing connection, even when that was not the case. To fix this a couple of changes were made to socket-util: 1) Pass optional result of dns resolution from inet_parse_passive.
When (re-)configuring listeners that use DNS names, we may need to know whether the provided connection string is invalid or if the provided DNS name has finished resolving. 2) Check dns resolution status in inet_open_passive. If the connection string is valid, and contains a DNS name, inet_open_passive will now return -EAGAIN if dns resolution failed. DNS resolution failure may either mean the asynchronous resolver has not completed yet, or that the name does not resolve. Reported-at: https://bugs.launchpad.net/bugs/1998781 Fixes: 08e9e5337383 ("ovsdb: raft: Fix inability to read the database with DNS host names.") Fixes: 771680d96fb6 ("DNS: Add basic support for asynchronous DNS resolving") Signed-off-by: Frode Nordahl Signed-off-by: Ilya Maximets --- lib/socket-util.c | 13 ++++++++++--- lib/socket-util.h | 3 ++- ovsdb/jsonrpc-server.c | 43 ++++++++++++++++++++++++++---------------- 3 files changed, 39 insertions(+), 20 deletions(-) diff --git a/lib/socket-util.c b/lib/socket-util.c index 38705cc51e0..3eb3a3816b7 100644 --- a/lib/socket-util.c +++ b/lib/socket-util.c @@ -660,7 +660,8 @@ inet_open_active(int style, const char *target, int default_port, * zeros '*ss' and returns false. 
*/ bool inet_parse_passive(const char *target_, int default_port, - struct sockaddr_storage *ss) + struct sockaddr_storage *ss, + bool resolve_host, bool *dns_failure) { char *target = xstrdup(target_); char *port, *host; @@ -672,7 +673,7 @@ inet_parse_passive(const char *target_, int default_port, ok = false; } else { ok = parse_sockaddr_components(ss, host, port, default_port, - target_, true, NULL); + target_, resolve_host, dns_failure); } if (!ok) { memset(ss, 0, sizeof *ss); @@ -710,8 +711,14 @@ inet_open_passive(int style, const char *target, int default_port, struct sockaddr_storage ss; int fd = 0, error; unsigned int yes = 1; + bool dns_failure; - if (!inet_parse_passive(target, default_port, &ss)) { + if (!inet_parse_passive(target, default_port, &ss, true, &dns_failure)) { + if (dns_failure) { + /* DNS failure means asynchronous DNS resolution is in progress, + * or that the name does currently not resolve. */ + return -EAGAIN; + } return -EAFNOSUPPORT; } kernel_chooses_port = ss_get_port(&ss) == 0; diff --git a/lib/socket-util.h b/lib/socket-util.h index bf66393df94..4eec627e3ed 100644 --- a/lib/socket-util.h +++ b/lib/socket-util.h @@ -55,7 +55,8 @@ int inet_open_active(int style, const char *target, int default_port, struct sockaddr_storage *ssp, int *fdp, uint8_t dscp); bool inet_parse_passive(const char *target, int default_port, - struct sockaddr_storage *ssp); + struct sockaddr_storage *ssp, + bool resolve_host, bool *dns_failure); int inet_open_passive(int style, const char *target, int default_port, struct sockaddr_storage *ssp, uint8_t dscp, bool kernel_print_port); diff --git a/ovsdb/jsonrpc-server.c b/ovsdb/jsonrpc-server.c index 916a1f414e5..17868f5b720 100644 --- a/ovsdb/jsonrpc-server.c +++ b/ovsdb/jsonrpc-server.c @@ -267,25 +267,36 @@ ovsdb_jsonrpc_server_add_remote(struct ovsdb_jsonrpc_server *svr, int error; error = jsonrpc_pstream_open(name, &listener, options->dscp); - if (error && error != EAFNOSUPPORT) { - VLOG_ERR_RL(&rl, "%s: 
listen failed: %s", name, ovs_strerror(error)); - return NULL; - } + switch (error) { + case 0: + case EAFNOSUPPORT: + remote = xmalloc(sizeof *remote); + remote->server = svr; + remote->listener = listener; + ovs_list_init(&remote->sessions); + remote->dscp = options->dscp; + remote->read_only = options->read_only; + remote->role = nullable_xstrdup(options->role); + shash_add(&svr->remotes, name, remote); + if (!listener) { + /* Not a listener, attempt creation of active jsonrpc session. */ + ovsdb_jsonrpc_session_create(remote, + jsonrpc_session_open(name, true), + svr->read_only || remote->read_only); + } + return remote; - remote = xmalloc(sizeof *remote); - remote->server = svr; - remote->listener = listener; - ovs_list_init(&remote->sessions); - remote->dscp = options->dscp; - remote->read_only = options->read_only; - remote->role = nullable_xstrdup(options->role); - shash_add(&svr->remotes, name, remote); + case EAGAIN: + VLOG_DBG_RL(&rl, "%s: listen failed: " + "DNS resolution in progress or host not found", name); + return NULL; - if (!listener) { - ovsdb_jsonrpc_session_create(remote, jsonrpc_session_open(name, true), - svr->read_only || remote->read_only); + default: + VLOG_ERR_RL(&rl, "%s: listen failed: %s", name, + ovs_strerror(error)); + return NULL; } - return remote; + OVS_NOT_REACHED(); } static void From fc3d5e1dad0a45a84fae2b6157a5d55e40adf429 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Thu, 9 Feb 2023 13:57:59 +0100 Subject: [PATCH 167/833] sparse: Fix build with DPDK and GCC 12. rte_vect.h pulls in some AVX512 intrinsics headers added in GCC 12 [1] that trigger a lot of warnings: libtool: compile: env "REAL_CC=ccache gcc" "CHECK=sparse -Wsparse-error -I ../include/sparse -I ../include -m64 -I /usr/local/include " cgcc -target=x86_64 -target=host_os_specs -D__MMX__=1 -D__MMX_WITH_SSE__=1 -D__SSE2_MATH__=1 -D__SSE_MATH__=1 -D__SSE__=1 -D__SSE2__=1 -DHAVE_CONFIG_H -I. -I..
-I ../include -I ./include -I ../lib -I ./lib -Wstrict-prototypes -Wall -Wextra -Wno-sign-compare -Wpointer-arith -Wformat -Wformat-security -Wswitch-enum -Wunused-parameter -Wbad-function-cast -Wcast-align -Wstrict-prototypes -Wold-style-definition -Wmissing-prototypes -Wmissing-field-initializers -fno-strict-aliasing -Wswitch-bool -Wlogical-not-parentheses -Wsizeof-array-argument -Wbool-compare -Wshift-negative-value -Wduplicated-cond -Wshadow -Wmultistatement-macros -Wcast-align=strict -mssse3 -I/home/dmarchan/git/pub/dpdk.org/22.11/install/include -include rte_config.h -I/usr/local/include -Werror -D_FILE_OFFSET_BITS=64 -g -O2 -MT lib/bfd.lo -MD -MP -MF lib/.deps/bfd.Tpo -c ../lib/bfd.c -o lib/bfd.o ../lib/bfd.c: note: in included file (through /usr/lib/gcc/x86_64-redhat-linux/12//include/immintrin.h, /usr/lib/gcc/x86_64-redhat-linux/12//include/x86intrin.h, ...): /usr/lib/gcc/x86_64-redhat-linux/12//include/avx512fp16intrin.h:38:9: error: '_Float16' has implicit type /usr/lib/gcc/x86_64-redhat-linux/12//include/avx512fp16intrin.h:38:18: error: Expected ; at end of declaration /usr/lib/gcc/x86_64-redhat-linux/12//include/avx512fp16intrin.h:38:18: error: got __v8hf /usr/lib/gcc/x86_64-redhat-linux/12//include/avx512fp16intrin.h:62:41: error: Expected ; at end of statement /usr/lib/gcc/x86_64-redhat-linux/12//include/avx512fp16intrin.h:62:41: error: got { /usr/lib/gcc/x86_64-redhat-linux/12//include/avx512fp16intrin.h:420:32: error: Expected ) in expression /usr/lib/gcc/x86_64-redhat-linux/12//include/avx512fp16intrin.h:420:32: error: got __A /usr/lib/gcc/x86_64-redhat-linux/12//include/avx512fp16intrin.h:2271:61: error: Expected ) in function call /usr/lib/gcc/x86_64-redhat-linux/12//include/avx512fp16intrin.h:2271:61: error: got __A /usr/lib/gcc/x86_64-redhat-linux/12//include/avx512fp16intrin.h:2279:61: error: Expected ) in function call /usr/lib/gcc/x86_64-redhat-linux/12//include/avx512fp16intrin.h:2279:61: error: got __A 
/usr/lib/gcc/x86_64-redhat-linux/12//include/avx512fp16intrin.h:2328:50: error: Expected ) in function call [...] Besides, the list of headers included by rte_memcpy.h is now out of sync with DPDK. OVS takes care to include the right headers in its sources. Simply make this header self-sufficient. 1: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=a68412117fa4 Signed-off-by: David Marchand Signed-off-by: Ilya Maximets --- include/sparse/rte_memcpy.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/include/sparse/rte_memcpy.h b/include/sparse/rte_memcpy.h index 5cd3f013ea8..ec88500242a 100644 --- a/include/sparse/rte_memcpy.h +++ b/include/sparse/rte_memcpy.h @@ -20,11 +20,8 @@ #error "Use this header only with sparse. It is not a correct implementation." #endif -/* Include the same headers as the real rte_memcpy(). */ -#include +#include #include -#include -#include /* Declare the same functions as the real rte_memcpy.h, without defining them. * This gives sparse the information it needs without provoking sparse's From 0a7587034dc903119a71572efe812f1e1ac163f8 Mon Sep 17 00:00:00 2001 From: Ales Musil Date: Thu, 9 Feb 2023 13:29:39 +0100 Subject: [PATCH 168/833] conntrack: Properly unNAT inner header of related traffic. The inner header was not handled properly. Simplify the code, which allows proper handling of the inner headers.
Reported-at: https://bugzilla.redhat.com/2137754 Acked-by: Paolo Valerio Signed-off-by: Ales Musil Signed-off-by: Ilya Maximets --- lib/conntrack.c | 252 ++++++++++++++-------------------------- tests/system-traffic.at | 107 +++++++++++++++++ 2 files changed, 196 insertions(+), 163 deletions(-) diff --git a/lib/conntrack.c b/lib/conntrack.c index 550b2be9b91..524670e45d4 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -764,109 +764,59 @@ handle_alg_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx, } static void -pat_packet(struct dp_packet *pkt, const struct conn *conn) +pat_packet(struct dp_packet *pkt, const struct conn_key *key) { - if (conn->nat_action & NAT_ACTION_SRC) { - if (conn->key.nw_proto == IPPROTO_TCP) { - struct tcp_header *th = dp_packet_l4(pkt); - packet_set_tcp_port(pkt, conn->rev_key.dst.port, th->tcp_dst); - } else if (conn->key.nw_proto == IPPROTO_UDP) { - struct udp_header *uh = dp_packet_l4(pkt); - packet_set_udp_port(pkt, conn->rev_key.dst.port, uh->udp_dst); - } - } else if (conn->nat_action & NAT_ACTION_DST) { - if (conn->key.nw_proto == IPPROTO_TCP) { - packet_set_tcp_port(pkt, conn->rev_key.dst.port, - conn->rev_key.src.port); - } else if (conn->key.nw_proto == IPPROTO_UDP) { - packet_set_udp_port(pkt, conn->rev_key.dst.port, - conn->rev_key.src.port); - } + if (key->nw_proto == IPPROTO_TCP) { + packet_set_tcp_port(pkt, key->dst.port, key->src.port); + } else if (key->nw_proto == IPPROTO_UDP) { + packet_set_udp_port(pkt, key->dst.port, key->src.port); } } -static void -nat_packet(struct dp_packet *pkt, const struct conn *conn, bool related) +static uint16_t +nat_action_reverse(uint16_t nat_action) { - if (conn->nat_action & NAT_ACTION_SRC) { - pkt->md.ct_state |= CS_SRC_NAT; - if (conn->key.dl_type == htons(ETH_TYPE_IP)) { - struct ip_header *nh = dp_packet_l3(pkt); - packet_set_ipv4_addr(pkt, &nh->ip_src, - conn->rev_key.dst.addr.ipv4); - } else { - struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt); - 
packet_set_ipv6_addr(pkt, conn->key.nw_proto, - nh6->ip6_src.be32, - &conn->rev_key.dst.addr.ipv6, true); - } - if (!related) { - pat_packet(pkt, conn); - } - } else if (conn->nat_action & NAT_ACTION_DST) { - pkt->md.ct_state |= CS_DST_NAT; - if (conn->key.dl_type == htons(ETH_TYPE_IP)) { - struct ip_header *nh = dp_packet_l3(pkt); - packet_set_ipv4_addr(pkt, &nh->ip_dst, - conn->rev_key.src.addr.ipv4); - } else { - struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt); - packet_set_ipv6_addr(pkt, conn->key.nw_proto, - nh6->ip6_dst.be32, - &conn->rev_key.src.addr.ipv6, true); - } - if (!related) { - pat_packet(pkt, conn); - } + if (nat_action & NAT_ACTION_SRC) { + nat_action ^= NAT_ACTION_SRC; + nat_action |= NAT_ACTION_DST; + } else if (nat_action & NAT_ACTION_DST) { + nat_action ^= NAT_ACTION_DST; + nat_action |= NAT_ACTION_SRC; } + return nat_action; } static void -un_pat_packet(struct dp_packet *pkt, const struct conn *conn) +nat_packet_ipv4(struct dp_packet *pkt, const struct conn_key *key, + uint16_t nat_action) { - if (conn->nat_action & NAT_ACTION_SRC) { - if (conn->key.nw_proto == IPPROTO_TCP) { - struct tcp_header *th = dp_packet_l4(pkt); - packet_set_tcp_port(pkt, th->tcp_src, conn->key.src.port); - } else if (conn->key.nw_proto == IPPROTO_UDP) { - struct udp_header *uh = dp_packet_l4(pkt); - packet_set_udp_port(pkt, uh->udp_src, conn->key.src.port); - } - } else if (conn->nat_action & NAT_ACTION_DST) { - if (conn->key.nw_proto == IPPROTO_TCP) { - packet_set_tcp_port(pkt, conn->key.dst.port, conn->key.src.port); - } else if (conn->key.nw_proto == IPPROTO_UDP) { - packet_set_udp_port(pkt, conn->key.dst.port, conn->key.src.port); - } + struct ip_header *nh = dp_packet_l3(pkt); + + if (nat_action & NAT_ACTION_SRC) { + packet_set_ipv4_addr(pkt, &nh->ip_src, key->dst.addr.ipv4); + } else if (nat_action & NAT_ACTION_DST) { + packet_set_ipv4_addr(pkt, &nh->ip_dst, key->src.addr.ipv4); } } static void -reverse_pat_packet(struct dp_packet *pkt, const struct conn 
*conn) +nat_packet_ipv6(struct dp_packet *pkt, const struct conn_key *key, + uint16_t nat_action) { - if (conn->nat_action & NAT_ACTION_SRC) { - if (conn->key.nw_proto == IPPROTO_TCP) { - struct tcp_header *th_in = dp_packet_l4(pkt); - packet_set_tcp_port(pkt, conn->key.src.port, - th_in->tcp_dst); - } else if (conn->key.nw_proto == IPPROTO_UDP) { - struct udp_header *uh_in = dp_packet_l4(pkt); - packet_set_udp_port(pkt, conn->key.src.port, - uh_in->udp_dst); - } - } else if (conn->nat_action & NAT_ACTION_DST) { - if (conn->key.nw_proto == IPPROTO_TCP) { - packet_set_tcp_port(pkt, conn->key.src.port, - conn->key.dst.port); - } else if (conn->key.nw_proto == IPPROTO_UDP) { - packet_set_udp_port(pkt, conn->key.src.port, - conn->key.dst.port); - } + struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt); + + if (nat_action & NAT_ACTION_SRC) { + packet_set_ipv6_addr(pkt, key->nw_proto, nh6->ip6_src.be32, + &key->dst.addr.ipv6, true); + } else if (nat_action & NAT_ACTION_DST) { + packet_set_ipv6_addr(pkt, key->nw_proto, nh6->ip6_dst.be32, + &key->src.addr.ipv6, true); } } static void -reverse_nat_packet(struct dp_packet *pkt, const struct conn *conn) +nat_inner_packet(struct dp_packet *pkt, struct conn_key *key, + uint16_t nat_action) { char *tail = dp_packet_tail(pkt); uint16_t pad = dp_packet_l2_pad_size(pkt); @@ -875,98 +825,77 @@ reverse_nat_packet(struct dp_packet *pkt, const struct conn *conn) uint16_t orig_l3_ofs = pkt->l3_ofs; uint16_t orig_l4_ofs = pkt->l4_ofs; - if (conn->key.dl_type == htons(ETH_TYPE_IP)) { - struct ip_header *nh = dp_packet_l3(pkt); - struct icmp_header *icmp = dp_packet_l4(pkt); - struct ip_header *inner_l3 = (struct ip_header *) (icmp + 1); - /* This call is already verified to succeed during the code path from - * 'conn_key_extract()' which calls 'extract_l4_icmp()'. 
*/ - extract_l3_ipv4(&inner_key, inner_l3, tail - ((char *)inner_l3) - pad, + void *l3 = dp_packet_l3(pkt); + void *l4 = dp_packet_l4(pkt); + void *inner_l3; + /* These calls are already verified to succeed during the code path from + * 'conn_key_extract()' which calls + * 'extract_l4_icmp()'/'extract_l4_icmp6()'. */ + if (key->dl_type == htons(ETH_TYPE_IP)) { + inner_l3 = (char *) l4 + sizeof(struct icmp_header); + extract_l3_ipv4(&inner_key, inner_l3, tail - ((char *) inner_l3) - pad, &inner_l4, false); - pkt->l3_ofs += (char *) inner_l3 - (char *) nh; - pkt->l4_ofs += inner_l4 - (char *) icmp; + } else { + inner_l3 = (char *) l4 + sizeof(struct icmp6_data_header); + extract_l3_ipv6(&inner_key, inner_l3, tail - ((char *) inner_l3) - pad, + &inner_l4); + } + pkt->l3_ofs += (char *) inner_l3 - (char *) l3; + pkt->l4_ofs += inner_l4 - (char *) l4; - if (conn->nat_action & NAT_ACTION_SRC) { - packet_set_ipv4_addr(pkt, &inner_l3->ip_src, - conn->key.src.addr.ipv4); - } else if (conn->nat_action & NAT_ACTION_DST) { - packet_set_ipv4_addr(pkt, &inner_l3->ip_dst, - conn->key.dst.addr.ipv4); - } + /* Reverse the key for inner packet. */ + struct conn_key rev_key = *key; + conn_key_reverse(&rev_key); + + pat_packet(pkt, &rev_key); + + if (key->dl_type == htons(ETH_TYPE_IP)) { + nat_packet_ipv4(pkt, &rev_key, nat_action); - reverse_pat_packet(pkt, conn); + struct icmp_header *icmp = (struct icmp_header *) l4; icmp->icmp_csum = 0; icmp->icmp_csum = csum(icmp, tail - (char *) icmp - pad); } else { - struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt); - struct icmp6_data_header *icmp6 = dp_packet_l4(pkt); - struct ovs_16aligned_ip6_hdr *inner_l3_6 = - (struct ovs_16aligned_ip6_hdr *) (icmp6 + 1); - /* This call is already verified to succeed during the code path from - * 'conn_key_extract()' which calls 'extract_l4_icmp6()'. 
*/ - extract_l3_ipv6(&inner_key, inner_l3_6, - tail - ((char *)inner_l3_6) - pad, - &inner_l4); - pkt->l3_ofs += (char *) inner_l3_6 - (char *) nh6; - pkt->l4_ofs += inner_l4 - (char *) icmp6; - - if (conn->nat_action & NAT_ACTION_SRC) { - packet_set_ipv6_addr(pkt, conn->key.nw_proto, - inner_l3_6->ip6_src.be32, - &conn->key.src.addr.ipv6, true); - } else if (conn->nat_action & NAT_ACTION_DST) { - packet_set_ipv6_addr(pkt, conn->key.nw_proto, - inner_l3_6->ip6_dst.be32, - &conn->key.dst.addr.ipv6, true); - } - reverse_pat_packet(pkt, conn); + nat_packet_ipv6(pkt, &rev_key, nat_action); + + struct icmp6_data_header *icmp6 = (struct icmp6_data_header *) l4; icmp6->icmp6_base.icmp6_cksum = 0; - icmp6->icmp6_base.icmp6_cksum = packet_csum_upperlayer6(nh6, icmp6, - IPPROTO_ICMPV6, tail - (char *) icmp6 - pad); + icmp6->icmp6_base.icmp6_cksum = + packet_csum_upperlayer6(l3, icmp6, IPPROTO_ICMPV6, + tail - (char *) icmp6 - pad); } + pkt->l3_ofs = orig_l3_ofs; pkt->l4_ofs = orig_l4_ofs; } static void -un_nat_packet(struct dp_packet *pkt, const struct conn *conn, - bool related) +nat_packet(struct dp_packet *pkt, struct conn *conn, bool reply, bool related) { - if (conn->nat_action & NAT_ACTION_SRC) { - pkt->md.ct_state |= CS_DST_NAT; - if (conn->key.dl_type == htons(ETH_TYPE_IP)) { - struct ip_header *nh = dp_packet_l3(pkt); - packet_set_ipv4_addr(pkt, &nh->ip_dst, - conn->key.src.addr.ipv4); - } else { - struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt); - packet_set_ipv6_addr(pkt, conn->key.nw_proto, - nh6->ip6_dst.be32, - &conn->key.src.addr.ipv6, true); - } + struct conn_key *key = reply ? &conn->key : &conn->rev_key; + uint16_t nat_action = reply ? nat_action_reverse(conn->nat_action) + : conn->nat_action; - if (OVS_UNLIKELY(related)) { - reverse_nat_packet(pkt, conn); - } else { - un_pat_packet(pkt, conn); - } - } else if (conn->nat_action & NAT_ACTION_DST) { + /* Update ct_state. 
*/ + if (nat_action & NAT_ACTION_SRC) { pkt->md.ct_state |= CS_SRC_NAT; - if (conn->key.dl_type == htons(ETH_TYPE_IP)) { - struct ip_header *nh = dp_packet_l3(pkt); - packet_set_ipv4_addr(pkt, &nh->ip_src, - conn->key.dst.addr.ipv4); - } else { - struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt); - packet_set_ipv6_addr(pkt, conn->key.nw_proto, - nh6->ip6_src.be32, - &conn->key.dst.addr.ipv6, true); - } + } else if (nat_action & NAT_ACTION_DST) { + pkt->md.ct_state |= CS_DST_NAT; + } + + /* Reverse the key for outer header. */ + if (key->dl_type == htons(ETH_TYPE_IP)) { + nat_packet_ipv4(pkt, key, nat_action); + } else { + nat_packet_ipv6(pkt, key, nat_action); + } + if (nat_action & NAT_ACTION_SRC || nat_action & NAT_ACTION_DST) { if (OVS_UNLIKELY(related)) { - reverse_nat_packet(pkt, conn); + nat_action = nat_action_reverse(nat_action); + nat_inner_packet(pkt, key, nat_action); } else { - un_pat_packet(pkt, conn); + pat_packet(pkt, key); } } } @@ -1082,7 +1011,7 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt, memcpy(nc, nat_conn, sizeof *nc); } - nat_packet(pkt, nc, ctx->icmp_related); + nat_packet(pkt, nc, false, ctx->icmp_related); memcpy(&nat_conn->key, &nc->rev_key, sizeof nat_conn->key); memcpy(&nat_conn->rev_key, &nc->key, sizeof nat_conn->rev_key); nat_conn->conn_type = CT_CONN_TYPE_UN_NAT; @@ -1185,11 +1114,8 @@ handle_nat(struct dp_packet *pkt, struct conn *conn, if (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) { pkt->md.ct_state &= ~(CS_SRC_NAT | CS_DST_NAT); } - if (reply) { - un_nat_packet(pkt, conn, related); - } else { - nat_packet(pkt, conn, related); - } + + nat_packet(pkt, conn, reply, related); } } diff --git a/tests/system-traffic.at b/tests/system-traffic.at index c8b0acdd0eb..3a15b88a259 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -7202,6 +7202,113 @@ recirc_id(0),in_port(br-underlay),ct_state(+trk),eth(src=f0:00:00:01:01:02,dst=f OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([conntrack - ICMP 
from different source related with NAT]) +AT_SKIP_IF([test $HAVE_NC = no]) +AT_SKIP_IF([test $HAVE_TCPDUMP = no]) +CHECK_CONNTRACK() +CHECK_CONNTRACK_NAT() +OVS_TRAFFIC_VSWITCHD_START() + +ADD_NAMESPACES(client, server) + +ADD_VETH(client, client, br0, "192.168.20.10/24", "00:00:00:00:20:10") +ADD_VETH(server, server, br0, "192.168.10.20/24", "00:00:00:00:10:20") + +dnl Send traffic from client to CT, do DNAT if the traffic is new otherwise send it to server +AT_DATA([flows.txt], [dnl +table=0,ip,actions=ct(table=1,zone=42,nat) +table=1,in_port=ovs-client,ip,ct_state=+trk+new,actions=ct(commit,table=2,zone=42,nat(dst=192.168.10.20) +table=1,icmp,ct_state=+trk+rel-rpl,actions=ct(commit,table=2,zone=42,nat) +table=1,ip,actions=resubmit(,2) +table=2,in_port=ovs-client,ip,ct_state=+trk+new,actions=output:ovs-server +table=2,in_port=ovs-client,icmp,ct_state=+trk+rel,actions=output:ovs-server +table=2,in_port=ovs-server,icmp,ct_state=+trk+rel,actions=output:ovs-client +table=2,in_port=ovs-server,ip,ct_state=+trk+rpl,actions=output:ovs-client +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +rm server.pcap +OVS_DAEMONIZE([tcpdump -l -U -i ovs-server -w server.pcap 2>tcpdump0_err], [tcpdump0.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump0_err]) + +dnl Send UDP client->server +AT_CHECK([ovs-ofctl packet-out br0 "in_port=ovs-client,\ +packet=00000000102000000000201008004500001C000040000A11C762C0A8140AC0A814140001000200080000,actions=resubmit(,0)"]) +dnl Send UDP response server->client +AT_CHECK([ovs-ofctl packet-out br0 "in_port=ovs-server,\ +packet=00000000201000000000102008004500001C000040000A11D162C0A80A14C0A8140A0002000100080000,actions=resubmit(,0)"]) +dnl Fake router sending ICMP need frag router->server +AT_CHECK([ovs-ofctl packet-out br0 "in_port=ovs-client,\ +packet=000000001020000000002000080045000038011F0000FF011140C0A81401C0A814140304F778000005784500001C000040000A11C762C0A81414C0A8140A0002000100080000,\ +actions=resubmit(,0)" +]) + +AT_CHECK([ovs-appctl 
revalidator/purge], [0]) +AT_CHECK([ovs-ofctl -O OpenFlow15 dump-flows br0 | ofctl_strip | sort ], [0], [dnl + n_packets=3, n_bytes=154, reset_counts ip actions=ct(table=1,zone=42,nat) + table=1, n_packets=1, n_bytes=42, reset_counts ct_state=+new+trk,ip,in_port=1 actions=ct(commit,table=2,zone=42,nat(dst=192.168.10.20)) + table=1, n_packets=1, n_bytes=42, reset_counts ip actions=resubmit(,2) + table=1, n_packets=1, n_bytes=70, reset_counts ct_state=+rel-rpl+trk,icmp actions=ct(commit,table=2,zone=42,nat) + table=2, n_packets=1, n_bytes=42, reset_counts ct_state=+new+trk,ip,in_port=1 actions=output:2 + table=2, n_packets=1, n_bytes=42, reset_counts ct_state=+rpl+trk,ip,in_port=2 actions=output:1 + table=2, n_packets=1, n_bytes=70, reset_counts ct_state=+rel+trk,icmp,in_port=1 actions=output:2 + table=2, reset_counts ct_state=+rel+trk,icmp,in_port=2 actions=output:1 +OFPST_FLOW reply (OF1.5): +]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "192.168.20.10"], [0], [dnl +udp,orig=(src=192.168.20.10,dst=192.168.20.20,sport=1,dport=2),reply=(src=192.168.10.20,dst=192.168.20.10,sport=2,dport=1),zone=42 +]) + +OVS_WAIT_UNTIL([ovs-pcap server.pcap | grep 000000001020000000002000]) + +AT_CHECK([ovs-pcap server.pcap | grep 000000001020000000002000], [0], [dnl +000000001020000000002000080045000038011f0000ff011b40c0a81401c0a80a140304f778000005784500001c000040000a11d162c0a80a14c0a8140a0002000100080000 +]) + +dnl Check the ICMP error in reply direction +AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=42]) + +rm client.pcap +OVS_DAEMONIZE([tcpdump -l -U -i ovs-client -w client.pcap 2>tcpdump1_err], [tcpdump1.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump1_err]) + +dnl Send UDP client->server +AT_CHECK([ovs-ofctl packet-out br0 "in_port=ovs-client,\ +packet=00000000102000000000201008004500001C000040000A11C762C0A8140AC0A814140001000200080000,actions=resubmit(,0)"]) +dnl Fake router sending ICMP need frag router->client +AT_CHECK([ovs-ofctl packet-out br0 
"in_port=ovs-server,\ +packet=000000002010000000002000080045000038011F0000FF01114AC0A81401C0A8140A0304F778000005784500001C000040000A11D162C0A8140AC0A80A140001000200080000,\ +actions=resubmit(,0)" +]) + +AT_CHECK([ovs-appctl revalidator/purge], [0]) +AT_CHECK([ovs-ofctl -O OpenFlow15 dump-flows br0 | ofctl_strip | sort ], [0], [dnl + n_packets=5, n_bytes=266, reset_counts ip actions=ct(table=1,zone=42,nat) + table=1, n_packets=1, n_bytes=70, reset_counts ct_state=+rel-rpl+trk,icmp actions=ct(commit,table=2,zone=42,nat) + table=1, n_packets=2, n_bytes=112, reset_counts ip actions=resubmit(,2) + table=1, n_packets=2, n_bytes=84, reset_counts ct_state=+new+trk,ip,in_port=1 actions=ct(commit,table=2,zone=42,nat(dst=192.168.10.20)) + table=2, n_packets=1, n_bytes=42, reset_counts ct_state=+rpl+trk,ip,in_port=2 actions=output:1 + table=2, n_packets=1, n_bytes=70, reset_counts ct_state=+rel+trk,icmp,in_port=1 actions=output:2 + table=2, n_packets=1, n_bytes=70, reset_counts ct_state=+rel+trk,icmp,in_port=2 actions=output:1 + table=2, n_packets=2, n_bytes=84, reset_counts ct_state=+new+trk,ip,in_port=1 actions=output:2 +OFPST_FLOW reply (OF1.5): +]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "192.168.20.10"], [0], [dnl +udp,orig=(src=192.168.20.10,dst=192.168.20.20,sport=1,dport=2),reply=(src=192.168.10.20,dst=192.168.20.10,sport=2,dport=1),zone=42 +]) + +OVS_WAIT_UNTIL([ovs-pcap client.pcap | grep 000000002010000000002000]) + +AT_CHECK([ovs-pcap client.pcap | grep 000000002010000000002000], [0], [dnl +000000002010000000002000080045000038011f0000ff011137c0a81414c0a8140a0304f778000005784500001c000040000a11c762c0a8140ac0a814140001000200080000 +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + AT_BANNER([IGMP]) AT_SETUP([IGMP - flood under normal action]) From 6c24851f433a13e99fc48a0cd8a0e90ab873f901 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Thu, 5 Jan 2023 13:56:59 +0100 Subject: [PATCH 169/833] ofproto-dpif-upcall: Use last known stats ukey stats on revalidate 
missed dp flows. Instead of using all zero stats when executing a revalidate for missed dp flows, use the last known stats to avoid odd statistics being used. As these zero stats are stored in the ukey, the next time revalidate_ukey() is called the delta between the new stats and the zero stats is used, which would cause an additional increase in total packets/bytes. Signed-off-by: Eelco Chaudron Acked-by: Michael Santana Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif-upcall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c index db7570ee2a7..fc94078cbba 100644 --- a/ofproto/ofproto-dpif-upcall.c +++ b/ofproto/ofproto-dpif-upcall.c @@ -2889,7 +2889,7 @@ revalidator_sweep__(struct revalidator *revalidator, bool purge) } else { struct dpif_flow_stats stats; COVERAGE_INC(revalidate_missed_dp_flow); - memset(&stats, 0, sizeof stats); + memcpy(&stats, &ukey->stats, sizeof stats); result = revalidate_ukey(udpif, ukey, &stats, &odp_actions, reval_seq, &recircs, false); } From cd1cf6a24b708b6f133fa83913ab3e4f586a200c Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 14 Feb 2023 14:53:45 +0100 Subject: [PATCH 170/833] test: Remove duplicate test from system-offloads-traffic.at. Remove the "offloads - simulated flow action update" test case, as it's covered by the "datapath - simulated flow action update" test. 
Fixes: b1f58f5072d6 ("netdev-offload-tc: Preserve tc statistics when flow gets modified.") Signed-off-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- tests/system-offloads-traffic.at | 64 -------------------------------- 1 file changed, 64 deletions(-) diff --git a/tests/system-offloads-traffic.at b/tests/system-offloads-traffic.at index 8775f99226d..f2bf9c0639a 100644 --- a/tests/system-offloads-traffic.at +++ b/tests/system-offloads-traffic.at @@ -680,67 +680,3 @@ OVS_CHECK_ACTIONS([check_pkt_len(size=200,gt(5),le(check_pkt_len(size=100,gt(5), OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP - - -AT_SETUP([offloads - simulated flow action update]) -OVS_TRAFFIC_VSWITCHD_START([], [], [-- set Open_vSwitch . other_config:hw-offload=true]) - -ADD_NAMESPACES(at_ns0, at_ns1) - -ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") -ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") - -AT_DATA([flows.txt], [dnl -add in_port=ovs-p0,actions=ovs-p1,br0 -add in_port=ovs-p1,actions=ovs-p0,br0 -]) -AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) - -NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl -10 packets transmitted, 10 received, 0% packet loss, time 0ms -]) - -AT_CHECK([ovs-appctl dpctl/dump-flows | grep "eth_type(0x0800)" | sort | dnl - strip_recirc | strip_used | dnl - sed 's/,packet_type(ns=[[0-9]]*,id=[[0-9]]*),/,/;s/,eth(),/,/;s/bytes:756/bytes:882/'], - [0], [dnl -recirc_id(),in_port(2),eth_type(0x0800),ipv4(frag=no), packets:9, bytes:882, used:0.0s, actions:3,1 -recirc_id(),in_port(3),eth_type(0x0800),ipv4(frag=no), packets:9, bytes:882, used:0.0s, actions:2,1 -]) - -AT_DATA([flows2.txt], [dnl -modify in_port=ovs-p0,actions=ovs-p1 -modify in_port=ovs-p1,actions=ovs-p0 -]) -AT_CHECK([ovs-ofctl add-flows br0 flows2.txt]) -AT_CHECK([ovs-appctl revalidator/wait], [0]) - -NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl -10 packets transmitted, 10 received, 0% packet loss, time 0ms -]) - -AT_CHECK([ovs-appctl dpctl/dump-flows | 
grep "eth_type(0x0800)" | sort | dnl - strip_recirc | strip_used | dnl - sed -e 's/,packet_type(ns=[[0-9]]*,id=[[0-9]]*),/,/;s/,eth(),/,/;s/bytes:1596/bytes:1862/'], - [0], [dnl -recirc_id(),in_port(2),eth_type(0x0800),ipv4(frag=no), packets:19, bytes:1862, used:0.0s, actions:3 -recirc_id(),in_port(3),eth_type(0x0800),ipv4(frag=no), packets:19, bytes:1862, used:0.0s, actions:2 -]) - -AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) -AT_CHECK([ovs-appctl revalidator/wait], [0]) - -NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl -10 packets transmitted, 10 received, 0% packet loss, time 0ms -]) - -AT_CHECK([ovs-appctl dpctl/dump-flows | grep "eth_type(0x0800)" | sort | dnl - strip_recirc | strip_used | dnl - sed 's/,packet_type(ns=[[0-9]]*,id=[[0-9]]*),/,/;s/,eth(),/,/;s/bytes:2436/bytes:2842/'], - [0], [dnl -recirc_id(),in_port(2),eth_type(0x0800),ipv4(frag=no), packets:29, bytes:2842, used:0.0s, actions:3,1 -recirc_id(),in_port(3),eth_type(0x0800),ipv4(frag=no), packets:29, bytes:2842, used:0.0s, actions:2,1 -]) - -OVS_TRAFFIC_VSWITCHD_STOP -AT_CLEANUP From 1f47d73996b0c565f9ce035c899a042f2ea394a6 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 16 Feb 2023 13:53:52 +0100 Subject: [PATCH 171/833] Set release date for 3.1.0. Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- NEWS | 2 +- debian/changelog | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/NEWS b/NEWS index fe6055a2700..391badd7cfd 100644 --- a/NEWS +++ b/NEWS @@ -6,7 +6,7 @@ Post-v3.1.0 in OVSDB. Available with upstream kernel 6.2+. -v3.1.0 - xx xxx xxxx +v3.1.0 - 16 Feb 2023 -------------------- - ovs-vswitchd now detects changes in CPU affinity and adjusts the number of handler and revalidator threads if necessary. 
diff --git a/debian/changelog b/debian/changelog index c62bb5646db..9a87224b283 100644 --- a/debian/changelog +++ b/debian/changelog @@ -8,7 +8,7 @@ openvswitch (3.1.0-1) unstable; urgency=low * New upstream version - -- Open vSwitch team Mon, 16 Jan 2023 16:51:00 +0100 + -- Open vSwitch team Thu, 16 Feb 2023 13:52:24 +0100 openvswitch (3.0.0-1) unstable; urgency=low From 4f27d5a024dddf6bcaafce7dac95514dd2d12c48 Mon Sep 17 00:00:00 2001 From: Vladislav Odintsov Date: Fri, 10 Feb 2023 19:02:29 +0300 Subject: [PATCH 172/833] utilities: Add support to set umask in ovs-ctl. This patch adds new ovs-ctl options to pass umask configuration, allowing OVS daemons to set the requested group permissions on their sockets. The previous behaviour (when used with a systemd service unit) created sockets with a 0750 permissions mask (group has no write permission). Write permission for the group is reasonable in a use case where ovs-vswitchd or ovsdb-server runs as a non-privileged user:group (say, openvswitch:openvswitch) and the unix socket needs to be accessed from a process running as another non-privileged user. In this case the administrator has to add that user to the openvswitch group and can then connect to OVS sockets from a process running under that user. Two new ovs-ctl options, --ovsdb-server-umask and --ovs-vswitchd-umask, were added to manage umask values for the respective daemons. This is useful for systemd users: both the ovs-vswitchd and ovsdb-server systemd units read options from a single /etc/sysconfig/openvswitch configuration file. So, with separate options it is possible to set the umask only for a specific daemon. OPTIONS="--ovsdb-server-umask=0002" in the /etc/sysconfig/openvswitch file will set the umask to 0002 only before starting ovsdb-server, while OPTIONS="--ovs-vswitchd-umask=0002" will set the umask for the ovs-vswitchd daemon. The previous behaviour (not setting umask) is left as the default.
Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2023-January/401501.html Acked-by: Eelco Chaudron Signed-off-by: Vladislav Odintsov Signed-off-by: Ilya Maximets --- NEWS | 4 ++++ utilities/ovs-ctl.in | 16 ++++++++++++---- utilities/ovs-lib.in | 17 ++++++++++++++--- 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/NEWS b/NEWS index 391badd7cfd..e43334b33f9 100644 --- a/NEWS +++ b/NEWS @@ -4,6 +4,10 @@ Post-v3.1.0 * OVS now collects per-interface upcall statistics that can be obtained via 'ovs-appctl dpctl/show -s' or the interface's statistics column in OVSDB. Available with upstream kernel 6.2+. + - ovs-ctl: + * Added new options --[ovsdb-server|ovs-vswitchd]-umask=MODE to set umask + value when starting OVS daemons. E.g., use --ovsdb-server-umask=0002 + in order to create OVSDB sockets with access mode of 0770. v3.1.0 - 16 Feb 2023 diff --git a/utilities/ovs-ctl.in b/utilities/ovs-ctl.in index d9155258868..0b2820c3611 100644 --- a/utilities/ovs-ctl.in +++ b/utilities/ovs-ctl.in @@ -156,8 +156,8 @@ do_start_ovsdb () { [ "$OVS_USER" != "" ] && set "$@" --user "$OVS_USER" [ "$OVSDB_SERVER_OPTIONS" != "" ] && set "$@" $OVSDB_SERVER_OPTIONS - start_daemon "$OVSDB_SERVER_PRIORITY" "$OVSDB_SERVER_WRAPPER" "$@" \ - || return 1 + start_daemon "$OVSDB_SERVER_PRIORITY" "$OVSDB_SERVER_WRAPPER" \ + "$OVSDB_SERVER_UMASK" "$@" || return 1 # Initialize database settings. ovs_vsctl -- init -- set Open_vSwitch . 
db-version="$schemaver" \ @@ -226,8 +226,8 @@ do_start_forwarding () { [ "$OVS_USER" != "" ] && set "$@" --user "$OVS_USER" [ "$OVS_VSWITCHD_OPTIONS" != "" ] &&set "$@" $OVS_VSWITCHD_OPTIONS - start_daemon "$OVS_VSWITCHD_PRIORITY" "$OVS_VSWITCHD_WRAPPER" "$@" || - return 1 + start_daemon "$OVS_VSWITCHD_PRIORITY" "$OVS_VSWITCHD_WRAPPER" \ + "$OVS_VSWITCHD_UMASK" "$@" || return 1 fi } @@ -348,6 +348,8 @@ set_defaults () { OVS_VSWITCHD_WRAPPER= OVSDB_SERVER_OPTIONS= OVS_VSWITCHD_OPTIONS= + OVSDB_SERVER_UMASK= + OVS_VSWITCHD_UMASK= DB_FILE=$dbdir/conf.db DB_SOCK=$rundir/db.sock @@ -421,6 +423,12 @@ Other important options for "start", "restart" and "force-reload-kmod": add given key-value pair to Open_vSwitch external-ids --delete-bridges delete all bridges just before starting ovs-vswitchd --ovs-user="user[:group]" pass the --user flag to ovs daemons + --ovsdb-server-umask=MODE Set umask prior to run ovsdb-server daemon. + This is useful to manage daemon's sockets permissions. + Default is not to change umask (inherited from shell). + --ovs-vswitchd-umask=MODE Set umask prior to run ovs-vswitchd daemon. + This is useful to manage daemon's sockets permissions. + Default is not to change umask (inherited from shell). Less important options for "start", "restart" and "force-reload-kmod": --daemon-cwd=DIR set working dir for OVS daemons (default: $DAEMON_CWD) diff --git a/utilities/ovs-lib.in b/utilities/ovs-lib.in index 13477a6a9e9..7812a94ee8b 100644 --- a/utilities/ovs-lib.in +++ b/utilities/ovs-lib.in @@ -165,9 +165,9 @@ install_dir () { } start_daemon () { - priority=$1 - wrapper=$2 - shift; shift + priority=$1 && shift + wrapper=$1 && shift + umask=$1 && shift daemon=$1 strace="" @@ -223,8 +223,19 @@ start_daemon () { set nice -n "$priority" "$@" fi + # Set requested umask if any and turn previous value back. 
+ if [ -n "$umask" ]; then + previuos_umask_value=$(umask) + umask "$umask" + fi + action "Starting $daemon" "$@" || return 1 + # If umask was set, turn umask value to previous value. + if [ -n "$umask" ]; then + umask "$previuos_umask_value" + fi + if test X"$strace" != X; then # Strace doesn't have the -D option so we attach after the fact. setsid $strace -o "$logdir/$daemon.strace.log" \ From f3c6cb907cec035bf2d038e58e37749a78f3149b Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 20 Feb 2023 20:01:14 +0100 Subject: [PATCH 173/833] AUTHORS: Add Vladislav Odintsov. Signed-off-by: Ilya Maximets --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index c82570fb6e3..8a286de9915 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -450,6 +450,7 @@ Venkata Anil Kommaddi vkommadi@redhat.com Vishal Deep Ajmera vishal.deep.ajmera@ericsson.com Vivien Bernet-Rollande vbr@soprive.net Vlad Buslov vladbu@nvidia.com +Vladislav Odintsov odivlad@gmail.com Volkan Atlı volkan.atli@b-ulltech.com Wan Junjie wanjunjie@bytedance.com Wang Li wangli39@baidu.com From a6195e2c4236cbe16b3649940fac3b08493eabb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miika=20Pet=C3=A4j=C3=A4niemi?= Date: Wed, 7 Dec 2022 14:03:39 +0200 Subject: [PATCH 174/833] netdev-linux: Add jitter parameter to the netem qos options. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds jitter option to enable emulating latency fluctuation with netem. Submitted-at: https://github.com/openvswitch/ovs/pull/407 Signed-off-by: Miika Petäjäniemi Signed-off-by: Ilya Maximets --- NEWS | 2 ++ lib/netdev-linux.c | 26 ++++++++++++++++++-------- vswitchd/vswitch.xml | 4 ++++ 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/NEWS b/NEWS index e43334b33f9..85b34962145 100644 --- a/NEWS +++ b/NEWS @@ -8,6 +8,8 @@ Post-v3.1.0 * Added new options --[ovsdb-server|ovs-vswitchd]-umask=MODE to set umask value when starting OVS daemons. 
E.g., use --ovsdb-server-umask=0002 in order to create OVSDB sockets with access mode of 0770. + - QoS: + * Added new configuration option 'jitter' for a linux-netem QoS type. v3.1.0 - 16 Feb 2023 diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 7c19c40163f..36620199ec8 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -4347,6 +4347,7 @@ struct netem { uint32_t latency; uint32_t limit; uint32_t loss; + uint32_t jitter; }; static struct netem * @@ -4358,7 +4359,7 @@ netem_get__(const struct netdev *netdev_) static void netem_install__(struct netdev *netdev_, uint32_t latency, - uint32_t limit, uint32_t loss) + uint32_t limit, uint32_t loss, uint32_t jitter) { struct netdev_linux *netdev = netdev_linux_cast(netdev_); struct netem *netem; @@ -4368,13 +4369,14 @@ netem_install__(struct netdev *netdev_, uint32_t latency, netem->latency = latency; netem->limit = limit; netem->loss = loss; + netem->jitter = jitter; netdev->tc = &netem->tc; } static int netem_setup_qdisc__(struct netdev *netdev, uint32_t latency, - uint32_t limit, uint32_t loss) + uint32_t limit, uint32_t loss, uint32_t jitter) { struct tc_netem_qopt opt; struct ofpbuf request; @@ -4410,6 +4412,7 @@ netem_setup_qdisc__(struct netdev *netdev, uint32_t latency, } opt.latency = tc_time_to_ticks(latency); + opt.jitter = tc_time_to_ticks(jitter); nl_msg_put_string(&request, TCA_KIND, "netem"); nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt); @@ -4417,9 +4420,10 @@ netem_setup_qdisc__(struct netdev *netdev, uint32_t latency, error = tc_transact(&request, NULL); if (error) { VLOG_WARN_RL(&rl, "failed to replace %s qdisc, " - "latency %u, limit %u, loss %u error %d(%s)", + "latency %u, limit %u, loss %u, jitter %u " + "error %d(%s)", netdev_get_name(netdev), - opt.latency, opt.limit, opt.loss, + opt.latency, opt.limit, opt.loss, opt.jitter, error, ovs_strerror(error)); } return error; @@ -4432,6 +4436,7 @@ netem_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED, netem->latency = 
smap_get_ullong(details, "latency", 0); netem->limit = smap_get_ullong(details, "limit", 0); netem->loss = smap_get_ullong(details, "loss", 0); + netem->jitter = smap_get_ullong(details, "jitter", 0); if (!netem->limit) { netem->limit = 1000; @@ -4446,9 +4451,10 @@ netem_tc_install(struct netdev *netdev, const struct smap *details) netem_parse_qdisc_details__(netdev, details, &netem); error = netem_setup_qdisc__(netdev, netem.latency, - netem.limit, netem.loss); + netem.limit, netem.loss, netem.jitter); if (!error) { - netem_install__(netdev, netem.latency, netem.limit, netem.loss); + netem_install__(netdev, netem.latency, + netem.limit, netem.loss, netem.jitter); } return error; } @@ -4464,7 +4470,8 @@ netem_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg) error = tc_parse_qdisc(nlmsg, &kind, &nlattr); if (error == 0) { netem = nl_attr_get(nlattr); - netem_install__(netdev, netem->latency, netem->limit, netem->loss); + netem_install__(netdev, netem->latency, + netem->limit, netem->loss, netem->jitter); return 0; } @@ -4486,6 +4493,7 @@ netem_qdisc_get(const struct netdev *netdev, struct smap *details) smap_add_format(details, "latency", "%u", netem->latency); smap_add_format(details, "limit", "%u", netem->limit); smap_add_format(details, "loss", "%u", netem->loss); + smap_add_format(details, "jitter", "%u", netem->jitter); return 0; } @@ -4495,10 +4503,12 @@ netem_qdisc_set(struct netdev *netdev, const struct smap *details) struct netem netem; netem_parse_qdisc_details__(netdev, details, &netem); - netem_install__(netdev, netem.latency, netem.limit, netem.loss); + netem_install__(netdev, netem.latency, + netem.limit, netem.loss, netem.jitter); netem_get__(netdev)->latency = netem.latency; netem_get__(netdev)->limit = netem.limit; netem_get__(netdev)->loss = netem.loss; + netem_get__(netdev)->jitter = netem.jitter; return 0; } diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 64f23302dd1..05ac1fbe5ef 100644 --- a/vswitchd/vswitch.xml +++ 
b/vswitchd/vswitch.xml @@ -4895,6 +4895,10 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \ Adds an independent loss probability to the packets outgoing from the chosen network interface. + + Adds the provided jitter to the latency outgoing to the + chosen network interface. The jitter value expressed in us. + From 481e3fa6903e02d1036d4036d3b2bd358e2dd1b4 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 21 Feb 2023 14:09:38 +0100 Subject: [PATCH 175/833] =?UTF-8?q?AUTHORS:=20Add=20Miika=20Pet=C3=A4j?= =?UTF-8?q?=C3=A4niemi.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ilya Maximets --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 8a286de9915..836070649bb 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -310,6 +310,7 @@ Michal Weglicki michalx.weglicki@intel.com Michele Baldessari michele@acksyn.org Mickey Spiegel mickeys.dev@gmail.com Miguel Angel Ajo majopela@redhat.com +Miika Petäjäniemi miika.petajaniemi@solita.fi Mijo Safradin mijo@linux.vnet.ibm.com Mika Vaisanen mika.vaisanen@gmail.com Mike Pattrick mkp@redhat.com From 564dc0f2cc1a29ae5a3a87eb6a8ad9ebf84d8057 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Mon, 20 Feb 2023 16:24:23 -0500 Subject: [PATCH 176/833] MAINTAINERS: Move myself to emeritus status. I have not been active in OVS development in long enough that I should move to emeritus status. 
Signed-off-by: Russell Bryant Signed-off-by: Ilya Maximets --- MAINTAINERS.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS.rst b/MAINTAINERS.rst index 1dc406170f2..5df9aab78d4 100644 --- a/MAINTAINERS.rst +++ b/MAINTAINERS.rst @@ -65,8 +65,6 @@ This is the current list of active Open vSwitch committers: - jpettit@ovn.org * - Pravin B Shelar - pshelar@ovn.org - * - Russell Bryant - - russell@ovn.org * - Simon Horman - horms@ovn.org * - Thomas Graf @@ -91,6 +89,8 @@ More information about Emeritus Committers can be found here: - ejj@eecs.berkeley.edu * - Joe Stringer - joe@ovn.org + * - Russell Bryant + - russell@ovn.org .. Cut here for the Documentation/internals/maintainers.rst From 2b1c70656503ad9ce1e50a8b2457a846557bd20b Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Fri, 10 Feb 2023 17:03:13 +0100 Subject: [PATCH 177/833] ofproto-ipfix: Use per-domain template timeouts. IPFIX templates have to be sent for each Observation Domain ID. Currently, a timer is kept at each dpif_ipfix_exporter to send them. This works fine for per-bridge sampling where there is only one Observation Domain ID per exporter. However, this does not work for per-flow sampling where more than one Observation Domain ID can be specified by the controller. In this case, ovs-vswitchd will only send template information for one (arbitrary) DomainID. Fix per-flow sampling by using an hmap to keep a timer for each Observation Domain ID.
Signed-off-by: Adrian Moreno Reviewed-by: Simon Horman Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif-ipfix.c | 129 ++++++++++++++++++++++++++++------- 1 file changed, 105 insertions(+), 24 deletions(-) diff --git a/ofproto/ofproto-dpif-ipfix.c b/ofproto/ofproto-dpif-ipfix.c index 742eed39981..f13478a8842 100644 --- a/ofproto/ofproto-dpif-ipfix.c +++ b/ofproto/ofproto-dpif-ipfix.c @@ -124,11 +124,18 @@ struct dpif_ipfix_port { uint32_t ifindex; }; +struct dpif_ipfix_domain { + struct hmap_node hmap_node; /* In struct dpif_ipfix_exporter's domains. */ + time_t last_template_set_time; +}; + struct dpif_ipfix_exporter { uint32_t exporter_id; /* Exporting Process identifier */ - struct collectors *collectors; uint32_t seq_number; - time_t last_template_set_time; + struct collectors *collectors; + struct hmap domains; /* Contains struct dpif_ipfix_domain indexed by + observation domain id. */ + time_t last_stats_sent_time; struct hmap cache_flow_key_map; /* ipfix_flow_cache_entry. */ struct ovs_list cache_flow_start_timestamp_list; /* ipfix_flow_cache_entry. */ uint32_t cache_active_timeout; /* In seconds. 
*/ @@ -617,6 +624,9 @@ static void get_export_time_now(uint64_t *, uint32_t *); static void dpif_ipfix_cache_expire_now(struct dpif_ipfix_exporter *, bool); +static void dpif_ipfix_exporter_del_domain(struct dpif_ipfix_exporter *, + struct dpif_ipfix_domain *); + static bool ofproto_ipfix_bridge_exporter_options_equal( const struct ofproto_ipfix_bridge_exporter_options *a, @@ -697,13 +707,14 @@ dpif_ipfix_exporter_init(struct dpif_ipfix_exporter *exporter) exporter->exporter_id = ++exporter_total_count; exporter->collectors = NULL; exporter->seq_number = 1; - exporter->last_template_set_time = 0; + exporter->last_stats_sent_time = 0; hmap_init(&exporter->cache_flow_key_map); ovs_list_init(&exporter->cache_flow_start_timestamp_list); exporter->cache_active_timeout = 0; exporter->cache_max_flows = 0; exporter->virtual_obs_id = NULL; exporter->virtual_obs_len = 0; + hmap_init(&exporter->domains); memset(&exporter->ipfix_global_stats, 0, sizeof(struct dpif_ipfix_global_stats)); @@ -711,6 +722,7 @@ dpif_ipfix_exporter_init(struct dpif_ipfix_exporter *exporter) static void dpif_ipfix_exporter_clear(struct dpif_ipfix_exporter *exporter) + OVS_REQUIRES(mutex) { /* Flush the cache with flow end reason "forced end." 
*/ dpif_ipfix_cache_expire_now(exporter, true); @@ -719,22 +731,29 @@ dpif_ipfix_exporter_clear(struct dpif_ipfix_exporter *exporter) exporter->exporter_id = 0; exporter->collectors = NULL; exporter->seq_number = 1; - exporter->last_template_set_time = 0; + exporter->last_stats_sent_time = 0; exporter->cache_active_timeout = 0; exporter->cache_max_flows = 0; free(exporter->virtual_obs_id); exporter->virtual_obs_id = NULL; exporter->virtual_obs_len = 0; + struct dpif_ipfix_domain *dom; + HMAP_FOR_EACH_SAFE (dom, hmap_node, &exporter->domains) { + dpif_ipfix_exporter_del_domain(exporter, dom); + } + memset(&exporter->ipfix_global_stats, 0, sizeof(struct dpif_ipfix_global_stats)); } static void dpif_ipfix_exporter_destroy(struct dpif_ipfix_exporter *exporter) + OVS_REQUIRES(mutex) { dpif_ipfix_exporter_clear(exporter); hmap_destroy(&exporter->cache_flow_key_map); + hmap_destroy(&exporter->domains); } static bool @@ -742,7 +761,7 @@ dpif_ipfix_exporter_set_options(struct dpif_ipfix_exporter *exporter, const struct sset *targets, const uint32_t cache_active_timeout, const uint32_t cache_max_flows, - const char *virtual_obs_id) + const char *virtual_obs_id) OVS_REQUIRES(mutex) { size_t virtual_obs_len; collectors_destroy(exporter->collectors); @@ -769,6 +788,37 @@ dpif_ipfix_exporter_set_options(struct dpif_ipfix_exporter *exporter, return true; } +static struct dpif_ipfix_domain * +dpif_ipfix_exporter_find_domain(const struct dpif_ipfix_exporter *exporter, + uint32_t domain_id) OVS_REQUIRES(mutex) +{ + struct dpif_ipfix_domain *dom; + HMAP_FOR_EACH_WITH_HASH (dom, hmap_node, hash_int(domain_id, 0), + &exporter->domains) { + return dom; + } + return NULL; +} + +static struct dpif_ipfix_domain * +dpif_ipfix_exporter_insert_domain(struct dpif_ipfix_exporter *exporter, + const uint32_t domain_id) OVS_REQUIRES(mutex) +{ + struct dpif_ipfix_domain *dom = xmalloc(sizeof *dom); + dom->last_template_set_time = 0; + hmap_insert(&exporter->domains, &dom->hmap_node, 
hash_int(domain_id, 0)); + return dom; +} + +static void +dpif_ipfix_exporter_del_domain(struct dpif_ipfix_exporter *exporter, + struct dpif_ipfix_domain *dom) + OVS_REQUIRES(mutex) +{ + hmap_remove(&exporter->domains, &dom->hmap_node); + free(dom); +} + static struct dpif_ipfix_port * dpif_ipfix_find_port(const struct dpif_ipfix *di, odp_port_t odp_port) OVS_REQUIRES(mutex) @@ -909,6 +959,7 @@ dpif_ipfix_bridge_exporter_init(struct dpif_ipfix_bridge_exporter *exporter) static void dpif_ipfix_bridge_exporter_clear(struct dpif_ipfix_bridge_exporter *exporter) + OVS_REQUIRES(mutex) { dpif_ipfix_exporter_clear(&exporter->exporter); ofproto_ipfix_bridge_exporter_options_destroy(exporter->options); @@ -918,6 +969,7 @@ dpif_ipfix_bridge_exporter_clear(struct dpif_ipfix_bridge_exporter *exporter) static void dpif_ipfix_bridge_exporter_destroy(struct dpif_ipfix_bridge_exporter *exporter) + OVS_REQUIRES(mutex) { dpif_ipfix_bridge_exporter_clear(exporter); dpif_ipfix_exporter_destroy(&exporter->exporter); @@ -927,7 +979,7 @@ static void dpif_ipfix_bridge_exporter_set_options( struct dpif_ipfix_bridge_exporter *exporter, const struct ofproto_ipfix_bridge_exporter_options *options, - bool *options_changed) + bool *options_changed) OVS_REQUIRES(mutex) { if (!options || sset_is_empty(&options->targets)) { /* No point in doing any work if there are no targets. 
*/ @@ -1003,6 +1055,7 @@ dpif_ipfix_flow_exporter_init(struct dpif_ipfix_flow_exporter *exporter) static void dpif_ipfix_flow_exporter_clear(struct dpif_ipfix_flow_exporter *exporter) + OVS_REQUIRES(mutex) { dpif_ipfix_exporter_clear(&exporter->exporter); ofproto_ipfix_flow_exporter_options_destroy(exporter->options); @@ -1011,6 +1064,7 @@ dpif_ipfix_flow_exporter_clear(struct dpif_ipfix_flow_exporter *exporter) static void dpif_ipfix_flow_exporter_destroy(struct dpif_ipfix_flow_exporter *exporter) + OVS_REQUIRES(mutex) { dpif_ipfix_flow_exporter_clear(exporter); dpif_ipfix_exporter_destroy(&exporter->exporter); @@ -1020,7 +1074,7 @@ static bool dpif_ipfix_flow_exporter_set_options( struct dpif_ipfix_flow_exporter *exporter, const struct ofproto_ipfix_flow_exporter_options *options, - bool *options_changed) + bool *options_changed) OVS_REQUIRES(mutex) { if (sset_is_empty(&options->targets)) { /* No point in doing any work if there are no targets. */ @@ -1071,6 +1125,7 @@ dpif_ipfix_flow_exporter_set_options( static void remove_flow_exporter(struct dpif_ipfix *di, struct dpif_ipfix_flow_exporter_map_node *node) + OVS_REQUIRES(mutex) { hmap_remove(&di->flow_exporter_map, &node->node); dpif_ipfix_flow_exporter_destroy(&node->exporter); @@ -2000,6 +2055,7 @@ static void ipfix_cache_update(struct dpif_ipfix_exporter *exporter, struct ipfix_flow_cache_entry *entry, enum ipfix_sampled_packet_type sampled_pkt_type) + OVS_REQUIRES(mutex) { struct ipfix_flow_cache_entry *old_entry; size_t current_flows = 0; @@ -2811,14 +2867,36 @@ dpif_ipfix_flow_sample(struct dpif_ipfix *di, const struct dp_packet *packet, ovs_mutex_unlock(&mutex); } +static bool +dpif_ipfix_should_send_template(struct dpif_ipfix_exporter *exporter, + const uint32_t observation_domain_id, + const uint32_t export_time_sec) + OVS_REQUIRES(mutex) +{ + struct dpif_ipfix_domain *domain; + domain = dpif_ipfix_exporter_find_domain(exporter, + observation_domain_id); + if (!domain) { + /* First time we see this 
obs_domain_id. */ + domain = dpif_ipfix_exporter_insert_domain(exporter, + observation_domain_id); + } + + if ((domain->last_template_set_time + IPFIX_TEMPLATE_INTERVAL) + <= export_time_sec) { + domain->last_template_set_time = export_time_sec; + return true; + } + return false; +} + static void dpif_ipfix_cache_expire(struct dpif_ipfix_exporter *exporter, bool forced_end, const uint64_t export_time_usec, - const uint32_t export_time_sec) + const uint32_t export_time_sec) OVS_REQUIRES(mutex) { struct ipfix_flow_cache_entry *entry; uint64_t max_flow_start_timestamp_usec; - bool template_msg_sent = false; enum ipfix_flow_end_reason flow_end_reason; if (ovs_list_is_empty(&exporter->cache_flow_start_timestamp_list)) { @@ -2844,25 +2922,28 @@ dpif_ipfix_cache_expire(struct dpif_ipfix_exporter *exporter, break; } - ovs_list_remove(&entry->cache_flow_start_timestamp_list_node); - hmap_remove(&exporter->cache_flow_key_map, - &entry->flow_key_map_node); + /* XXX: Make frequency of the (Options) Template and Exporter Process + * Statistics transmission configurable. + * Cf. IETF RFC 5101 Section 4.3. and 10.3.6. */ + if ((exporter->last_stats_sent_time + IPFIX_TEMPLATE_INTERVAL) + <= export_time_sec) { + exporter->last_stats_sent_time = export_time_sec; + ipfix_send_exporter_data_msg(exporter, export_time_sec); + } - /* XXX: Make frequency of the (Options) Template and Exporter Process - * Statistics transmission configurable. - * Cf. IETF RFC 5101 Section 4.3. and 10.3.6. 
*/ - if (!template_msg_sent - && (exporter->last_template_set_time + IPFIX_TEMPLATE_INTERVAL) - <= export_time_sec) { + if (dpif_ipfix_should_send_template(exporter, + entry->flow_key.obs_domain_id, + export_time_sec)) { + VLOG_DBG("Sending templates for ObservationDomainID %"PRIu32, + entry->flow_key.obs_domain_id); ipfix_send_template_msgs(exporter, export_time_sec, entry->flow_key.obs_domain_id); - exporter->last_template_set_time = export_time_sec; - template_msg_sent = true; - - /* Send Exporter Process Statistics. */ - ipfix_send_exporter_data_msg(exporter, export_time_sec); } + ovs_list_remove(&entry->cache_flow_start_timestamp_list_node); + hmap_remove(&exporter->cache_flow_key_map, + &entry->flow_key_map_node); + /* XXX: Group multiple data records for the same obs domain id * into the same message. */ ipfix_send_data_msg(exporter, export_time_sec, entry, flow_end_reason); @@ -2883,7 +2964,7 @@ get_export_time_now(uint64_t *export_time_usec, uint32_t *export_time_sec) static void dpif_ipfix_cache_expire_now(struct dpif_ipfix_exporter *exporter, - bool forced_end) + bool forced_end) OVS_REQUIRES(mutex) { uint64_t export_time_usec; uint32_t export_time_sec; From 71e5669af6bf1d8f7ba0afbea24ce811d23e7d2e Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Tue, 21 Feb 2023 14:18:13 -0500 Subject: [PATCH 178/833] ovs-actions: Correct typo in ovs-actions man page. There was a minor typo in the ovs-actions man page. Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- Documentation/ref/ovs-actions.7.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/ref/ovs-actions.7.rst b/Documentation/ref/ovs-actions.7.rst index b59b7634fa0..d1389565564 100644 --- a/Documentation/ref/ovs-actions.7.rst +++ b/Documentation/ref/ovs-actions.7.rst @@ -1380,7 +1380,7 @@ The ``delete_field`` action | ``delete_field:``\ *field* The ``delete_field`` action deletes a *field* in the syntax described under -`Field Specifications`_ above. 
Currently, only the ``tun_metadta`` fields are +`Field Specifications`_ above. Currently, only the ``tun_metadata`` fields are supported. This action was added in Open vSwitch 2.14. From b0d9a1efccb931d3eb71661b1506c2bdde58d4d9 Mon Sep 17 00:00:00 2001 From: Liang Mancang Date: Tue, 21 Feb 2023 17:19:01 +0800 Subject: [PATCH 179/833] conntrack: Fix conntrack_clean may access the same exp_list each time. When an exp_list contains more than clean_end nodes and these nodes will not expire immediately, every time we call conntrack_clean it uses the same next_sweep to get the exp_list. Actually, we should increment i every time after we call ct_sweep. Fixes: 3d9c1b855a5f ("conntrack: Replace timeout based expiration lists with rculists.") Acked-by: Paolo Valerio Signed-off-by: Liang Mancang Signed-off-by: Ilya Maximets --- lib/conntrack.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/conntrack.c b/lib/conntrack.c index 524670e45d4..8cf7779c670 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -1512,12 +1512,12 @@ conntrack_clean(struct conntrack *ct, long long now) clean_end = n_conn_limit / 64; for (i = ct->next_sweep; i < N_EXP_LISTS; i++) { - count += ct_sweep(ct, &ct->exp_lists[i], now); - if (count > clean_end) { next_wakeup = 0; break; } + + count += ct_sweep(ct, &ct->exp_lists[i], now); } ct->next_sweep = (i < N_EXP_LISTS) ? i : 0; From c156f9bc502a18858a4d9d6d04cae7982f341e6a Mon Sep 17 00:00:00 2001 From: Viacheslav Galaktionov Date: Tue, 21 Feb 2023 17:02:25 +0400 Subject: [PATCH 180/833] ofproto: Include flow cookies in bridge/dump-flows output. Cookies are an important part of flow descriptions and must be available to the end user.
Signed-off-by: Viacheslav Galaktionov Signed-off-by: Ilya Maximets --- ofproto/ofproto.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c index e4a1bee769d..863b34d25bb 100644 --- a/ofproto/ofproto.c +++ b/ofproto/ofproto.c @@ -4824,6 +4824,10 @@ flow_stats_ds(struct ofproto *ofproto, struct rule *rule, struct ds *results, created = rule->created; ovs_mutex_unlock(&rule->mutex); + if (rule->flow_cookie != 0) { + ds_put_format(results, "cookie=0x%"PRIx64", ", + ntohll(rule->flow_cookie)); + } if (rule->table_id != 0) { ds_put_format(results, "table_id=%"PRIu8", ", rule->table_id); } From cf288fdfe2bfc11909a6d14ee55039e193f96460 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 21 Feb 2023 21:05:13 +0100 Subject: [PATCH 181/833] AUTHORS: Add Liang Mancang and Viacheslav Galaktionov. Signed-off-by: Ilya Maximets --- AUTHORS.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 836070649bb..ac1c37747dd 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -262,6 +262,7 @@ Leif Madsen lmadsen@redhat.com Leo Alterman Li RongQing lirongqing@baidu.com Lian-min Wang liang-min.wang@intel.com +Liang Mancang liangmc1@chinatelecom.cn Lin Huang linhuang@ruijie.com.cn Liu Chang liuchang@cmss.chinamobile.com Lilijun jerry.lilijun@huawei.com @@ -448,6 +449,7 @@ Usman Ansari ua1422@gmail.com Valient Gough vgough@pobox.com Vasu Dasari vdasari@gmail.com Venkata Anil Kommaddi vkommadi@redhat.com +Viacheslav Galaktionov viacheslav.galaktionov@arknetworks.am Vishal Deep Ajmera vishal.deep.ajmera@ericsson.com Vivien Bernet-Rollande vbr@soprive.net Vlad Buslov vladbu@nvidia.com From b5313a8ceca8ee45b3e8fd862324c6a43f562bd9 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 22 Feb 2023 22:12:46 +0100 Subject: [PATCH 182/833] ofproto: Fix re-creation of tunnel backing interfaces on restart. 
Tunnel OpenFlow ports do not exist in the datapath; instead, there is a tunnel backing interface that serves all the tunnels of the same type. For example, if the geneve port 'my_tunnel' is added to OVS, it will create the 'genev_sys_6081' datapath port, if it doesn't already exist, and use this port as a tunnel output. However, while creating/opening a new datapath after a restart, ovs-vswitchd only has a list of names of OpenFlow interfaces, and it thinks that each datapath port that is not on the list is a stale port that needs to be removed. This is obviously not correct for tunnel backing interfaces, which can serve multiple tunnel ports and do not match OpenFlow port names. This causes removal and re-creation of all the tunnel backing interfaces in the datapath on OVS restart, disrupting existing connections. It's hard to tell from the name alone whether an interface is a tunnel backing interface or someone just named a normal interface this way. So, instead of trying to determine that, do not remove any interfaces at all while we don't yet know the types of the ports we actually need, and assume that all the ports that are currently not in the list of OF ports are tunnel backing ports. Later, revalidation of tunnel backing ports in type_run() will determine which ports are still needed and which should be removed. It's OK to add even non-tunnel stale ports into tnl_backers; they will be cleaned up the same way as stale tunnel backers.
Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2023-February/052215.html Reviewed-by: Simon Horman Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif.c | 28 ++++++------------- tests/system-interface.at | 59 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 19 deletions(-) diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index f87e27a8cd7..fad7342b0b0 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -714,12 +714,6 @@ close_dpif_backer(struct dpif_backer *backer, bool del) free(backer); } -/* Datapath port slated for removal from datapath. */ -struct odp_garbage { - struct ovs_list list_node; - odp_port_t odp_port; -}; - static void check_support(struct dpif_backer *backer); static int @@ -729,8 +723,6 @@ open_dpif_backer(const char *type, struct dpif_backer **backerp) struct dpif_port_dump port_dump; struct dpif_port port; struct shash_node *node; - struct ovs_list garbage_list; - struct odp_garbage *garbage; struct sset names; char *backer_name; @@ -792,25 +784,23 @@ open_dpif_backer(const char *type, struct dpif_backer **backerp) dpif_flow_flush(backer->dpif); } - /* Loop through the ports already on the datapath and remove any - * that we don't need anymore. */ - ovs_list_init(&garbage_list); + /* Loop through the ports already on the datapath and find ones that are + * not on the initial OpenFlow ports list. These are stale ports, that we + * do not need anymore, or tunnel backing interfaces, that do not generally + * match the name of OpenFlow tunnel ports, or both. Add all of them to + * the list of tunnel backers. type_run() will garbage collect those that + * are not active tunnel backing interfaces during revalidation. 
*/ dpif_port_dump_start(&port_dump, backer->dpif); while (dpif_port_dump_next(&port_dump, &port)) { node = shash_find(&init_ofp_ports, port.name); if (!node && strcmp(port.name, dpif_base_name(backer->dpif))) { - garbage = xmalloc(sizeof *garbage); - garbage->odp_port = port.port_no; - ovs_list_push_front(&garbage_list, &garbage->list_node); + simap_put(&backer->tnl_backers, port.name, + odp_to_u32(port.port_no)); + backer->need_revalidate = REV_RECONFIGURE; } } dpif_port_dump_done(&port_dump); - LIST_FOR_EACH_POP (garbage, list_node, &garbage_list) { - dpif_port_del(backer->dpif, garbage->odp_port, false); - free(garbage); - } - shash_add(&all_dpif_backers, type, backer); check_support(backer); diff --git a/tests/system-interface.at b/tests/system-interface.at index 784bada12cb..3bf339582dd 100644 --- a/tests/system-interface.at +++ b/tests/system-interface.at @@ -63,3 +63,62 @@ AT_CHECK([ [stdout], [Device "br-p1" does not exist.] ) AT_CLEANUP + +AT_SETUP([interface - datapath ports garbage collection]) +OVS_CHECK_GENEVE() +OVS_TRAFFIC_VSWITCHD_START() + +dnl Not relevant for userspace datapath. +AT_SKIP_IF([! ovs-appctl dpctl/show | grep -q ovs-system]) + +AT_CHECK([ovs-vsctl add-port br0 tunnel_port dnl + -- set Interface tunnel_port dnl + type=geneve options:remote_ip=flow options:key=123]) + +AT_CHECK([ip link add ovs-veth0 type veth peer name ovs-veth1]) +on_exit 'ip link del ovs-veth0' + +AT_CHECK([ovs-vsctl add-port br0 ovs-veth0]) + +OVS_WAIT_UNTIL([ip link show | grep -q " genev_sys_[[0-9]]*: .* ovs-system "]) + +dnl Store the output of ip link for geneve port to compare ifindex later. 
+AT_CHECK([ip link show | grep " genev_sys_[[0-9]]*: .* ovs-system " > geneve.0]) + +AT_CHECK([ovs-appctl dpctl/show | grep port], [0], [dnl + port 0: ovs-system (internal) + port 1: br0 (internal) + port 2: genev_sys_6081 (geneve: packet_type=ptap) + port 3: ovs-veth0 +]) + +OVS_APP_EXIT_AND_WAIT_BY_TARGET([ovs-vswitchd], [ovs-vswitchd.pid]) + +dnl Check that geneve backing interface is still in the datapath. +AT_CHECK([ip link show | grep " genev_sys_[[0-9]]*: .* ovs-system " | diff -u - geneve.0]) + +dnl Remove the veth port from the database while ovs-vswitchd is down. +AT_CHECK([ovs-vsctl --no-wait del-port ovs-veth0]) + +dnl Check that it is still tied to the OVS datapath. +AT_CHECK([ip link show ovs-veth0 | grep -q ovs-system]) + +dnl Bring ovs-vswitchd back up. +AT_CHECK([ovs-vswitchd --detach --no-chdir --pidfile --log-file -vdpif:dbg], + [0], [], [stderr]) + +dnl Wait for the veth port to be removed from the datapath. +OVS_WAIT_WHILE([ip link show ovs-veth0 | grep -q ovs-system]) + +AT_CHECK([ovs-appctl dpctl/show | grep port], [0], [dnl + port 0: ovs-system (internal) + port 1: br0 (internal) + port 2: genev_sys_6081 (geneve: packet_type=ptap) +]) + +dnl Check that geneve backing interface is still in the datapath and it wasn't +dnl re-created, i.e. the ifindex is the same. +AT_CHECK([ip link show | grep " genev_sys_[[0-9]]*: .* ovs-system " | diff -u - geneve.0]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP From f1f278f5e125be47038379e208de6d2508ace277 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Fri, 24 Feb 2023 12:05:04 +0100 Subject: [PATCH 183/833] ipfix: Make template and stats interval configurable. Add options to the IPFIX table configure the interval to send statistics and template information. 
Reviewed-by: Simon Horman Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- NEWS | 2 ++ ofproto/ofproto-dpif-ipfix.c | 38 ++++++++++++++++++++++++++---------- ofproto/ofproto.h | 9 +++++++++ vswitchd/bridge.c | 17 ++++++++++++++++ vswitchd/vswitch.ovsschema | 14 +++++++++++-- vswitchd/vswitch.xml | 20 +++++++++++++++++++ 6 files changed, 88 insertions(+), 12 deletions(-) diff --git a/NEWS b/NEWS index 85b34962145..ad84898ce80 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,7 @@ Post-v3.1.0 -------------------- + - IPFIX template and statistics intervals can now be configured through two + new options in the IPFIX table: 'template_interval' and 'stats_interval'. - Linux kernel datapath: * OVS now collects per-interface upcall statistics that can be obtained via 'ovs-appctl dpctl/show -s' or the interface's statistics column diff --git a/ofproto/ofproto-dpif-ipfix.c b/ofproto/ofproto-dpif-ipfix.c index f13478a8842..e6c2968f7e9 100644 --- a/ofproto/ofproto-dpif-ipfix.c +++ b/ofproto/ofproto-dpif-ipfix.c @@ -140,6 +140,8 @@ struct dpif_ipfix_exporter { struct ovs_list cache_flow_start_timestamp_list; /* ipfix_flow_cache_entry. */ uint32_t cache_active_timeout; /* In seconds. */ uint32_t cache_max_flows; + uint32_t stats_interval; + uint32_t template_interval; char *virtual_obs_id; uint8_t virtual_obs_len; @@ -174,11 +176,6 @@ struct dpif_ipfix { #define IPFIX_VERSION 0x000a -/* When using UDP, IPFIX Template Records must be re-sent regularly. - * The standard default interval is 10 minutes (600 seconds). - * Cf. IETF RFC 5101 Section 10.3.6. */ -#define IPFIX_TEMPLATE_INTERVAL 600 - /* Cf. IETF RFC 5101 Section 3.1. 
*/ OVS_PACKED( struct ipfix_header { @@ -637,6 +634,8 @@ ofproto_ipfix_bridge_exporter_options_equal( && a->sampling_rate == b->sampling_rate && a->cache_active_timeout == b->cache_active_timeout && a->cache_max_flows == b->cache_max_flows + && a->stats_interval == b->stats_interval + && a->template_interval == b->template_interval && a->enable_tunnel_sampling == b->enable_tunnel_sampling && a->enable_input_sampling == b->enable_input_sampling && a->enable_output_sampling == b->enable_output_sampling @@ -674,6 +673,8 @@ ofproto_ipfix_flow_exporter_options_equal( return (a->collector_set_id == b->collector_set_id && a->cache_active_timeout == b->cache_active_timeout && a->cache_max_flows == b->cache_max_flows + && a->stats_interval == b->stats_interval + && a->template_interval == b->template_interval && a->enable_tunnel_sampling == b->enable_tunnel_sampling && sset_equals(&a->targets, &b->targets) && nullable_string_is_equal(a->virtual_obs_id, b->virtual_obs_id)); @@ -712,6 +713,9 @@ dpif_ipfix_exporter_init(struct dpif_ipfix_exporter *exporter) ovs_list_init(&exporter->cache_flow_start_timestamp_list); exporter->cache_active_timeout = 0; exporter->cache_max_flows = 0; + exporter->stats_interval = OFPROTO_IPFIX_DEFAULT_TEMPLATE_INTERVAL; + exporter->template_interval = OFPROTO_IPFIX_DEFAULT_TEMPLATE_INTERVAL; + exporter->last_stats_sent_time = 0; exporter->virtual_obs_id = NULL; exporter->virtual_obs_len = 0; hmap_init(&exporter->domains); @@ -734,6 +738,9 @@ dpif_ipfix_exporter_clear(struct dpif_ipfix_exporter *exporter) exporter->last_stats_sent_time = 0; exporter->cache_active_timeout = 0; exporter->cache_max_flows = 0; + exporter->stats_interval = OFPROTO_IPFIX_DEFAULT_TEMPLATE_INTERVAL; + exporter->template_interval = OFPROTO_IPFIX_DEFAULT_TEMPLATE_INTERVAL; + exporter->last_stats_sent_time = 0; free(exporter->virtual_obs_id); exporter->virtual_obs_id = NULL; exporter->virtual_obs_len = 0; @@ -761,6 +768,8 @@ dpif_ipfix_exporter_set_options(struct 
dpif_ipfix_exporter *exporter, const struct sset *targets, const uint32_t cache_active_timeout, const uint32_t cache_max_flows, + const uint32_t stats_interval, + const uint32_t template_interval, const char *virtual_obs_id) OVS_REQUIRES(mutex) { size_t virtual_obs_len; @@ -775,6 +784,8 @@ dpif_ipfix_exporter_set_options(struct dpif_ipfix_exporter *exporter, } exporter->cache_active_timeout = cache_active_timeout; exporter->cache_max_flows = cache_max_flows; + exporter->stats_interval = stats_interval; + exporter->template_interval = template_interval; virtual_obs_len = virtual_obs_id ? strlen(virtual_obs_id) : 0; if (virtual_obs_len > IPFIX_VIRTUAL_OBS_MAX_LEN) { VLOG_WARN_RL(&rl, "Virtual obsevation ID too long (%d bytes), " @@ -1007,6 +1018,7 @@ dpif_ipfix_bridge_exporter_set_options( if (!dpif_ipfix_exporter_set_options( &exporter->exporter, &options->targets, options->cache_active_timeout, options->cache_max_flows, + options->stats_interval, options->template_interval, options->virtual_obs_id)) { return; } @@ -1022,6 +1034,14 @@ dpif_ipfix_bridge_exporter_set_options( exporter->probability = MAX(1, UINT32_MAX / exporter->options->sampling_rate); + /* Configure static observation_domain_id. */ + struct dpif_ipfix_domain *dom; + HMAP_FOR_EACH_SAFE (dom, hmap_node, &(exporter->exporter.domains)) { + dpif_ipfix_exporter_del_domain(&exporter->exporter, dom); + } + dpif_ipfix_exporter_insert_domain(&exporter->exporter, + options->obs_domain_id); + /* Run over the cache as some entries might have expired after * changing the timeouts. 
*/ dpif_ipfix_cache_expire_now(&exporter->exporter, false); @@ -1102,6 +1122,7 @@ dpif_ipfix_flow_exporter_set_options( if (!dpif_ipfix_exporter_set_options( &exporter->exporter, &options->targets, options->cache_active_timeout, options->cache_max_flows, + options->stats_interval, options->template_interval, options->virtual_obs_id)) { return false; } @@ -2882,7 +2903,7 @@ dpif_ipfix_should_send_template(struct dpif_ipfix_exporter *exporter, observation_domain_id); } - if ((domain->last_template_set_time + IPFIX_TEMPLATE_INTERVAL) + if ((domain->last_template_set_time + exporter->template_interval) <= export_time_sec) { domain->last_template_set_time = export_time_sec; return true; @@ -2922,10 +2943,7 @@ dpif_ipfix_cache_expire(struct dpif_ipfix_exporter *exporter, break; } - /* XXX: Make frequency of the (Options) Template and Exporter Process - * Statistics transmission configurable. - * Cf. IETF RFC 5101 Section 4.3. and 10.3.6. */ - if ((exporter->last_stats_sent_time + IPFIX_TEMPLATE_INTERVAL) + if ((exporter->last_stats_sent_time + exporter->stats_interval) <= export_time_sec) { exporter->last_stats_sent_time = export_time_sec; ipfix_send_exporter_data_msg(exporter, export_time_sec); diff --git a/ofproto/ofproto.h b/ofproto/ofproto.h index 4e15167ab72..c79f372bce5 100644 --- a/ofproto/ofproto.h +++ b/ofproto/ofproto.h @@ -72,6 +72,11 @@ struct ofproto_sflow_options { char *control_ip; }; +/* When using UDP, IPFIX Template Records must be re-sent regularly. + * The standard default interval is 10 minutes (600 seconds). + * Cf. IETF RFC 5101 Section 10.3.6. */ +#define OFPROTO_IPFIX_DEFAULT_TEMPLATE_INTERVAL 600 + struct ofproto_ipfix_bridge_exporter_options { struct sset targets; uint32_t sampling_rate; @@ -79,6 +84,8 @@ struct ofproto_ipfix_bridge_exporter_options { uint32_t obs_point_id; /* Bridge-wide Observation Point ID. 
*/ uint32_t cache_active_timeout; uint32_t cache_max_flows; + uint32_t template_interval; + uint32_t stats_interval; bool enable_tunnel_sampling; bool enable_input_sampling; bool enable_output_sampling; @@ -90,6 +97,8 @@ struct ofproto_ipfix_flow_exporter_options { struct sset targets; uint32_t cache_active_timeout; uint32_t cache_max_flows; + uint32_t template_interval; + uint32_t stats_interval; bool enable_tunnel_sampling; char *virtual_obs_id; }; diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index abf2afe5737..307a515279d 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -1542,6 +1542,17 @@ bridge_configure_ipfix(struct bridge *br) if (be_cfg->cache_max_flows) { be_opts.cache_max_flows = *be_cfg->cache_max_flows; } + if (be_cfg->stats_interval) { + be_opts.stats_interval = *be_cfg->stats_interval; + } else { + be_opts.stats_interval = OFPROTO_IPFIX_DEFAULT_TEMPLATE_INTERVAL; + } + if (be_cfg->template_interval) { + be_opts.template_interval = *be_cfg->template_interval; + } else { + be_opts.template_interval = + OFPROTO_IPFIX_DEFAULT_TEMPLATE_INTERVAL; + } be_opts.enable_tunnel_sampling = smap_get_bool(&be_cfg->other_config, "enable-tunnel-sampling", true); @@ -1570,6 +1581,12 @@ bridge_configure_ipfix(struct bridge *br) ? *fe_cfg->ipfix->cache_active_timeout : 0; opts->cache_max_flows = fe_cfg->ipfix->cache_max_flows ? *fe_cfg->ipfix->cache_max_flows : 0; + opts->stats_interval = fe_cfg->ipfix->stats_interval + ? *fe_cfg->ipfix->stats_interval + : OFPROTO_IPFIX_DEFAULT_TEMPLATE_INTERVAL; + opts->template_interval = fe_cfg->ipfix->template_interval + ? 
*fe_cfg->ipfix->template_interval + : OFPROTO_IPFIX_DEFAULT_TEMPLATE_INTERVAL; opts->enable_tunnel_sampling = smap_get_bool( &fe_cfg->ipfix->other_config, "enable-tunnel-sampling", true); diff --git a/vswitchd/vswitch.ovsschema b/vswitchd/vswitch.ovsschema index 1a49cdffea7..2d395ff952c 100644 --- a/vswitchd/vswitch.ovsschema +++ b/vswitchd/vswitch.ovsschema @@ -1,6 +1,6 @@ {"name": "Open_vSwitch", - "version": "8.3.1", - "cksum": "3012963480 26720", + "version": "8.4.0", + "cksum": "2738838700 27127", "tables": { "Open_vSwitch": { "columns": { @@ -531,6 +531,16 @@ "minInteger": 0, "maxInteger": 4294967295}, "min": 0, "max": 1}}, + "stats_interval": { + "type": {"key": {"type": "integer", + "minInteger": 1, + "maxInteger": 3600}, + "min": 0, "max": 1}}, + "template_interval": { + "type": {"key": {"type": "integer", + "minInteger": 1, + "maxInteger": 3600}, + "min": 0, "max": 1}}, "other_config": { "type": {"key": "string", "value": "string", "min": 0, "max": "unlimited"}}, diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 05ac1fbe5ef..12708a3131d 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -6596,6 +6596,26 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \ disabled. + +

+ Interval (in seconds) for sending IPFIX exporting process statistics + according to IETF RFC 5101 Section 4.3. +

+

+ Default value is 600 +

+
+ + +

+ Interval (in seconds) for sending IPFIX Template information for each + Observation Domain ID. +

+

+ Default value is 600 +

+
+

From 5f0fdf5e2c2e959048fc8ea8be1a57d518805644 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 22 Feb 2023 14:42:34 +0100 Subject: [PATCH 184/833] test: Move check for tc ingress pps support to test script. Move the check for tc ingress pps support from atlocal to the test script. The existing check has several problems: 1. Stderr from failing commands is output when executing various make targets. 2. There are various failure conditions that lead to veth0 and veth1 being created but not cleaned up. 3. The check seems to execute for many make targets. And it attempts to temporarily modify system state. This seems inappropriate. 4. veth0 and veth1 seem far too generic and could easily conflict with other parts of the system. All these problems are addressed by this patch. Signed-off-by: Simon Horman Reviewed-by: Louis Peens Acked-by: Ilya Maximets --- tests/atlocal.in | 11 ----------- tests/system-offloads-traffic.at | 14 ++++++++++++-- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/tests/atlocal.in b/tests/atlocal.in index e02248f6f82..85966858629 100644 --- a/tests/atlocal.in +++ b/tests/atlocal.in @@ -172,17 +172,6 @@ fi # Set HAVE_TC find_command tc -# When HAVE_TC=yes, check if the current tc supports adding pps filter -SUPPORT_TC_INGRESS_PPS="no" -if test $HAVE_TC="yes"; then - ip link add veth0 type veth peer name veth1 - tc qdisc add dev veth0 handle ffff: ingress - if tc filter add dev veth0 parent ffff: u32 match u32 0 0 police pkts_rate 100 pkts_burst 10; then - SUPPORT_TC_INGRESS_PPS="yes" - fi - ip link del veth0 -fi - # Set HAVE_TCPDUMP find_command tcpdump diff --git a/tests/system-offloads-traffic.at b/tests/system-offloads-traffic.at index f2bf9c0639a..7558812ebc0 100644 --- a/tests/system-offloads-traffic.at +++ b/tests/system-offloads-traffic.at @@ -18,6 +18,16 @@ m4_define([OVS_CHECK_ACTIONS], [ [0], [$1]) ]) +m4_define([CHECK_TC_INGRESS_PPS], +[ + AT_SKIP_IF([test $HAVE_TC = "no"]) + AT_CHECK([ip link add ovs_tc_pps0 type veth peer name 
ovs_tc_pps1 dnl + || exit 77]) + on_exit 'ip link del ovs_tc_pps0' + AT_CHECK([tc qdisc add dev ovs_tc_pps0 handle ffff: ingress || exit 77]) + AT_CHECK([tc filter add dev ovs_tc_pps0 parent ffff: u32 match dnl + u32 0 0 police pkts_rate 100 pkts_burst 10 || exit 77]) +]) AT_SETUP([offloads - ping between two ports - offloads disabled]) OVS_TRAFFIC_VSWITCHD_START() @@ -132,7 +142,7 @@ AT_CLEANUP AT_SETUP([offloads - set ingress_policing_kpkts_rate and ingress_policing_kpkts_burst - offloads disabled]) AT_KEYWORDS([ingress_policing_kpkts]) -AT_SKIP_IF([test $SUPPORT_TC_INGRESS_PPS = "no"]) +CHECK_TC_INGRESS_PPS() OVS_TRAFFIC_VSWITCHD_START() AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:hw-offload=false]) AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"]) @@ -156,7 +166,7 @@ AT_CLEANUP AT_SETUP([offloads - set ingress_policing_kpkts_rate and ingress_policing_kpkts_burst - offloads enabled]) AT_KEYWORDS([ingress_policing_kpkts]) -AT_SKIP_IF([test $SUPPORT_TC_INGRESS_PPS = "no"]) +CHECK_TC_INGRESS_PPS() OVS_TRAFFIC_VSWITCHD_START([], [], [-- set Open_vSwitch . other_config:hw-offload=true]) AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"]) ADD_NAMESPACES(at_ns0) From 8bd68806307863bd706504fd662c00069e0b31f4 Mon Sep 17 00:00:00 2001 From: Paolo Valerio Date: Mon, 27 Feb 2023 17:51:10 +0100 Subject: [PATCH 185/833] system-traffic.at: Add icmp error tests while dnatting address and port. The two tests verify, for both icmp and icmpv6, that the correct port translation happens in the inner packet in case an error is received in the reply direction. 
Reviewed-by: Simon Horman Tested-by: Simon Horman Signed-off-by: Paolo Valerio Signed-off-by: Ilya Maximets --- tests/system-traffic.at | 74 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 3a15b88a259..380372430b6 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -3561,6 +3561,43 @@ AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(172.16.0.3)], [0], [dnl OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([conntrack - ICMP related NAT with single port]) +AT_SKIP_IF([test $HAVE_TCPDUMP = no]) +CHECK_CONNTRACK() +CHECK_CONNTRACK_NAT() +OVS_TRAFFIC_VSWITCHD_START() + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24", "f0:00:00:01:01:01") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24", "f0:00:00:01:01:02") + +AT_DATA([flows.txt], [dnl +table=0,ip,ct_state=-trk,actions=ct(table=0,nat) +table=0,in_port=ovs-p0,ct_state=+trk+new,udp,actions=ct(commit,nat(dst=10.1.1.2:8080)),ovs-p1 +table=0,in_port=ovs-p1,ct_state=+trk+rel+rpl,icmp,actions=ovs-p0 +]) + +AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) + +rm p0.pcap +OVS_DAEMONIZE([tcpdump -l -U -i ovs-p0 -w p0.pcap 2> tcpdump0_err], [tcpdump0.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump0_err]) + +dnl Send UDP packet from 10.1.1.1:1234 to 10.1.1.240:80 +AT_CHECK([ovs-ofctl packet-out br0 "in_port=ovs-p0,packet=f00000010102f0000001010108004500002944c140004011df100a0101010a0101f004d2005000156b24646573745f756e72656163680a,actions=resubmit(,0)"]) +dnl Send "destination unreachable" response +AT_CHECK([ovs-ofctl packet-out br0 "in_port=ovs-p1,packet=f00000010101f00000010102080045c000456a3700004001f9bc0a0101020a01010103031328000000004500002944c140004011dffe0a0101010a01010204d21f9000154cd2646573745f756e72656163680a,actions=resubmit(,0)"]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=10\.1\.1\.1," | sort], [0], [dnl 
+udp,orig=(src=10.1.1.1,dst=10.1.1.240,sport=1234,dport=80),reply=(src=10.1.1.2,dst=10.1.1.1,sport=8080,dport=1234) +]) + +OVS_WAIT_UNTIL([ovs-pcap p0.pcap | grep -q "f00000010101f00000010102080045c000456a3700004001f8ce0a0101f00a01010103031416000000004500002944c140004011df100a0101010a0101f004d2005000156b24646573745f756e72656163680a"]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([conntrack - IPv4 fragmentation]) CHECK_CONNTRACK() OVS_TRAFFIC_VSWITCHD_START() @@ -6555,6 +6592,43 @@ udp,orig=(src=fc00::1,dst=fc00::2,sport=,dport=),reply=(src=fc OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([conntrack - ICMPv6 related NAT with single port]) +AT_SKIP_IF([test $HAVE_TCPDUMP = no]) +CHECK_CONNTRACK() +CHECK_CONNTRACK_NAT() +OVS_TRAFFIC_VSWITCHD_START() + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "fc00::1/96", "f0:00:00:01:01:01", [], "nodad") +ADD_VETH(p1, at_ns1, br0, "fc00::2/96", "f0:00:00:01:01:02", [], "nodad") + +AT_DATA([flows.txt], [dnl +table=0,ipv6,ct_state=-trk,actions=ct(table=0,nat) +table=0,in_port=ovs-p0,ct_state=+trk+new,udp6,actions=ct(commit,nat(dst=[[fc00::2]]:8080)),ovs-p1 +table=0,in_port=ovs-p1,ct_state=+trk+rel+rpl,icmp6,actions=ovs-p0 +]) + +AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) + +rm p0.pcap +OVS_DAEMONIZE([tcpdump -l -U -i ovs-p0 -w p0.pcap 2> tcpdump0_err], [tcpdump0.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump0_err]) + +dnl Send UDP packet from [[fc00::1]]:1234 to [[fc00::240]]:80 +AT_CHECK([ovs-ofctl packet-out br0 "in_port=ovs-p0,packet=f00000010102f0000001010186dd60066ced00151140fc000000000000000000000000000001fc00000000000000000000000000024004d20050001587d4646573745f756e72656163680a,actions=resubmit(,0)"]) +dnl Send "destination unreachable" response +AT_CHECK([ovs-ofctl packet-out br0 
"in_port=ovs-p1,packet=f00000010101f0000001010286dd600733ed00453a40fc000000000000000000000000000002fc000000000000000000000000000001010428550000000060066ced00151140fc000000000000000000000000000001fc00000000000000000000000000000204d21f9000156ad2646573745f756e72656163680a,actions=resubmit(,0)"]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=fc00::1," | sort], [0], [dnl +udp,orig=(src=fc00::1,dst=fc00::240,sport=1234,dport=80),reply=(src=fc00::2,dst=fc00::1,sport=8080,dport=1234) +]) + +OVS_WAIT_UNTIL([ovs-pcap p0.pcap | grep -q "f00000010101f0000001010286dd600733ed00453a40fc000000000000000000000000000240fc000000000000000000000000000001010426170000000060066ced00151140fc000000000000000000000000000001fc00000000000000000000000000024004d20050001587d4646573745f756e72656163680a"]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([conntrack - IPv6 FTP with SNAT]) AT_SKIP_IF([test $HAVE_FTP = no]) CHECK_CONNTRACK() From 489553b1c21692063931a9f50b6849b23128443c Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 17 Feb 2023 21:09:59 +0100 Subject: [PATCH 186/833] classifier: Fix missing masks on a final stage with ports trie. Flow lookup doesn't include masks of the final stage in a resulting flow wildcards in case that stage had L4 ports match. Only the result of ports trie lookup is added to the mask. It might be sufficient in many cases, but it's not correct, because ports trie is not how we decided that the packet didn't match in this subtable. In fact, we used a full subtable mask in order to determine that, so all the subtable mask bits has to be added. Ports trie can still be used to adjust ports' mask, but it is not sufficient to determine that the packet didn't match. Assuming we have following 2 OpenFlow rules on the bridge: table=0, priority=10,tcp,tp_dst=80,tcp_flags=+psh actions=drop table=0, priority=0 actions=output(1) The first high priority rule supposed to drop all the TCP data traffic sent on port 80. 
The handshake, however, is allowed for forwarding. Both 'tcp_flags' and 'tp_dst' are on the final stage in the flow. Since the stage mask from that stage is not incorporated into the flow wildcards and only ports mask is getting updated, we have the following megaflow for the SYN packet that has no match on 'tcp_flags': $ ovs-appctl ofproto/trace br0 "in_port=br0,tcp,tp_dst=80,tcp_flags=syn" Megaflow: recirc_id=0,eth,tcp,in_port=LOCAL,nw_frag=no,tp_dst=80 Datapath actions: 1 If this flow is getting installed into datapath flow table, all the packets for port 80, regardless of TCP flags, will be forwarded. Fix this by incorporating all the looked-at bits from the final stage into the stages map in order to get all the necessary wildcards. Ports mask has to be updated as a last step, because it doesn't cover the full 64-bit slot in the flowmap. With this change, in the example above, OVS is producing correct flow wildcards including match on TCP flags: Megaflow: recirc_id=0,eth,tcp,in_port=LOCAL,nw_frag=no,tp_dst=80,tcp_flags=-psh Datapath actions: 1 This way only -psh packets will be forwarded, as expected. This issue affects all other fields on stage 4, not only TCP flags. Tests included to cover tcp_flags, nd_target and ct_tp_src/dst. First two are frequently used, ct ones are sharing the same flowmap slot with L4 ports, so important to test. Before the pre-computation of stage masks, flow wildcards were updated during lookup, so there was no issue. The bits of the final stage were lost with the introduction of 'stages_map'. Recent adjustment of segment boundaries exposed 'tcp_flags' to the issue. 
Reported-at: https://github.com/openvswitch/ovs-issues/issues/272 Fixes: ca44218515f0 ("classifier: Adjust segment boundary to execute prerequisite processing.") Fixes: fa2fdbf8d0c1 ("classifier: Pre-compute stage masks.") Acked-by: Aaron Conole Signed-off-by: Ilya Maximets --- lib/classifier.c | 25 ++++++++++--- tests/classifier.at | 88 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 5 deletions(-) diff --git a/lib/classifier.c b/lib/classifier.c index 0a89626cc30..18dbfc83ad4 100644 --- a/lib/classifier.c +++ b/lib/classifier.c @@ -1695,6 +1695,8 @@ find_match_wc(const struct cls_subtable *subtable, ovs_version_t version, const struct cls_match *rule = NULL; struct flowmap stages_map = FLOWMAP_EMPTY_INITIALIZER; unsigned int mask_offset = 0; + bool adjust_ports_mask = false; + ovs_be32 ports_mask; int i; /* Try to finish early by checking fields in segments. */ @@ -1722,6 +1724,9 @@ find_match_wc(const struct cls_subtable *subtable, ovs_version_t version, subtable->index_maps[i], flow, wc)) { goto no_match; } + /* Accumulate the map used so far. */ + stages_map = flowmap_or(stages_map, subtable->index_maps[i]); + hash = flow_hash_in_minimask_range(flow, &subtable->mask, subtable->index_maps[i], &mask_offset, &basis); @@ -1731,14 +1736,16 @@ find_match_wc(const struct cls_subtable *subtable, ovs_version_t version, * unwildcarding all the ports bits, use the ports trie to figure out a * smaller set of bits to unwildcard. 
*/ unsigned int mbits; - ovs_be32 value, plens, mask; + ovs_be32 value, plens; - mask = miniflow_get_ports(&subtable->mask.masks); - value = ((OVS_FORCE ovs_be32 *)flow)[TP_PORTS_OFS32] & mask; + ports_mask = miniflow_get_ports(&subtable->mask.masks); + value = ((OVS_FORCE ovs_be32 *) flow)[TP_PORTS_OFS32] & ports_mask; mbits = trie_lookup_value(&subtable->ports_trie, &value, &plens, 32); - ((OVS_FORCE ovs_be32 *)&wc->masks)[TP_PORTS_OFS32] |= - mask & be32_prefix_mask(mbits); + ports_mask &= be32_prefix_mask(mbits); + ports_mask |= ((OVS_FORCE ovs_be32 *) &wc->masks)[TP_PORTS_OFS32]; + + adjust_ports_mask = true; goto no_match; } @@ -1751,6 +1758,14 @@ find_match_wc(const struct cls_subtable *subtable, ovs_version_t version, /* Unwildcard the bits in stages so far, as they were used in determining * there is no match. */ flow_wildcards_fold_minimask_in_map(wc, &subtable->mask, stages_map); + if (adjust_ports_mask) { + /* This has to be done after updating flow wildcards to overwrite + * the ports mask back. We can't simply disable the corresponding bit + * in the stages map, because it has 64-bit resolution, i.e. one + * bit covers not only tp_src/dst, but also ct_tp_src/dst, which are + * not covered by the trie. 
*/ + ((OVS_FORCE ovs_be32 *) &wc->masks)[TP_PORTS_OFS32] = ports_mask; + } return NULL; } diff --git a/tests/classifier.at b/tests/classifier.at index f652b59837b..de2705653e0 100644 --- a/tests/classifier.at +++ b/tests/classifier.at @@ -65,6 +65,94 @@ Datapath actions: 2 OVS_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([flow classifier - lookup segmentation - final stage]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 3 +AT_DATA([flows.txt], [dnl +table=0 in_port=1 priority=33,tcp,tp_dst=80,tcp_flags=+psh,action=output(2) +table=0 in_port=1 priority=0,ip,action=drop +table=0 in_port=2 priority=16,icmp6,nw_ttl=255,icmp_type=135,icmp_code=0,nd_target=1000::1 ,action=output(1) +table=0 in_port=2 priority=0,ip,action=drop +table=0 in_port=3 action=resubmit(,1) +table=1 in_port=3 priority=45,ct_state=+trk+rpl,ct_nw_proto=6,ct_tp_src=3/0x1,tcp,tp_dst=80,tcp_flags=+psh,action=output(2) +table=1 in_port=3 priority=10,ip,action=drop +]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +AT_CHECK([ovs-appctl ofproto/trace br0 'in_port=1,dl_src=50:54:00:00:00:05,dl_dst=50:54:00:00:00:07,dl_type=0x0800,nw_src=192.168.0.1,nw_dst=192.168.0.2,nw_proto=6,nw_tos=0,nw_ttl=128,tp_src=8,tp_dst=80,tcp_flags=syn'], [0], [stdout]) +AT_CHECK([tail -2 stdout], [0], + [Megaflow: recirc_id=0,eth,tcp,in_port=1,nw_frag=no,tp_dst=80,tcp_flags=-psh +Datapath actions: drop +]) +AT_CHECK([ovs-appctl ofproto/trace br0 'in_port=1,dl_src=50:54:00:00:00:05,dl_dst=50:54:00:00:00:07,dl_type=0x0800,nw_src=192.168.0.1,nw_dst=192.168.0.2,nw_proto=6,nw_tos=0,nw_ttl=128,tp_src=8,tp_dst=80,tcp_flags=syn|ack'], [0], [stdout]) +AT_CHECK([tail -2 stdout], [0], + [Megaflow: recirc_id=0,eth,tcp,in_port=1,nw_frag=no,tp_dst=80,tcp_flags=-psh +Datapath actions: drop +]) +AT_CHECK([ovs-appctl ofproto/trace br0 'in_port=1,dl_src=50:54:00:00:00:05,dl_dst=50:54:00:00:00:07,dl_type=0x0800,nw_src=192.168.0.1,nw_dst=192.168.0.2,nw_proto=6,nw_tos=0,nw_ttl=128,tp_src=8,tp_dst=80,tcp_flags=ack|psh'], [0], [stdout]) +AT_CHECK([tail -2 
stdout], [0], + [Megaflow: recirc_id=0,eth,tcp,in_port=1,nw_frag=no,tp_dst=80,tcp_flags=+psh +Datapath actions: 2 +]) +AT_CHECK([ovs-appctl ofproto/trace br0 'in_port=1,dl_src=50:54:00:00:00:05,dl_dst=50:54:00:00:00:07,dl_type=0x0800,nw_src=192.168.0.1,nw_dst=192.168.0.2,nw_proto=6,nw_tos=0,nw_ttl=128,tp_src=8,tp_dst=80'], [0], [stdout]) +AT_CHECK([tail -2 stdout], [0], + [Megaflow: recirc_id=0,eth,tcp,in_port=1,nw_frag=no,tp_dst=80,tcp_flags=-psh +Datapath actions: drop +]) +AT_CHECK([ovs-appctl ofproto/trace br0 'in_port=1,dl_src=50:54:00:00:00:05,dl_dst=50:54:00:00:00:07,dl_type=0x0800,nw_src=192.168.0.1,nw_dst=192.168.0.2,nw_proto=6,nw_tos=0,nw_ttl=128,tp_src=8,tp_dst=79'], [0], [stdout]) +AT_CHECK([tail -2 stdout], [0], + [Megaflow: recirc_id=0,eth,tcp,in_port=1,nw_frag=no,tp_dst=0x40/0xfff0,tcp_flags=-psh +Datapath actions: drop +]) + +dnl Having both the port and the tcp flags in the resulting megaflow below +dnl is redundant, but that is how ports trie logic is implemented. +AT_CHECK([ovs-appctl ofproto/trace br0 'in_port=1,dl_src=50:54:00:00:00:05,dl_dst=50:54:00:00:00:07,dl_type=0x0800,nw_src=192.168.0.1,nw_dst=192.168.0.2,nw_proto=6,nw_tos=0,nw_ttl=128,tp_src=8,tp_dst=81'], [0], [stdout]) +AT_CHECK([tail -2 stdout], [0], + [Megaflow: recirc_id=0,eth,tcp,in_port=1,nw_frag=no,tp_dst=81,tcp_flags=-psh +Datapath actions: drop +]) + +dnl nd_target is redundant in the megaflow below and it is also not relevant +dnl for an icmp reply. Datapath may discard that match, but it is OK as long +dnl as we have prerequisites (icmp_type) in the match as well. 
+AT_CHECK([ovs-appctl ofproto/trace br0 "in_port=2,eth_src=f6:d2:b0:19:5e:7b,eth_dst=d2:49:19:91:78:fe,dl_type=0x86dd,ipv6_src=1000::3,ipv6_dst=1000::4,nw_proto=58,nw_ttl=255,icmpv6_type=128,icmpv6_code=0"], [0], [stdout]) +AT_CHECK([tail -2 stdout], [0], + [Megaflow: recirc_id=0,eth,icmp6,in_port=2,nw_ttl=255,nw_frag=no,icmp_type=0x80/0xfc,nd_target=:: +Datapath actions: drop +]) + +AT_CHECK([ovs-appctl ofproto/trace br0 "in_port=2,eth_src=f6:d2:b0:19:5e:7b,eth_dst=d2:49:19:91:78:fe,dl_type=0x86dd,ipv6_src=1000::3,ipv6_dst=1000::4,nw_proto=58,nw_ttl=255,icmpv6_type=135,icmpv6_code=0"], [0], [stdout]) +AT_CHECK([tail -2 stdout], [0], + [Megaflow: recirc_id=0,eth,icmp6,in_port=2,nw_ttl=255,nw_frag=no,icmp_type=0x87/0xff,icmp_code=0x0/0xff,nd_target=:: +Datapath actions: drop +]) +AT_CHECK([ovs-appctl ofproto/trace br0 "in_port=2,eth_src=f6:d2:b0:19:5e:7b,eth_dst=d2:49:19:91:78:fe,dl_type=0x86dd,ipv6_src=1000::3,ipv6_dst=1000::4,nw_proto=58,nw_ttl=255,icmpv6_type=135,icmpv6_code=0,nd_target=1000::1"], [0], [stdout]) +AT_CHECK([tail -2 stdout], [0], + [Megaflow: recirc_id=0,eth,icmp6,in_port=2,nw_ttl=255,nw_frag=no,icmp_type=0x87/0xff,icmp_code=0x0/0xff,nd_target=1000::1 +Datapath actions: 1 +]) +AT_CHECK([ovs-appctl ofproto/trace br0 "in_port=2,eth_src=f6:d2:b0:19:5e:7b,eth_dst=d2:49:19:91:78:fe,dl_type=0x86dd,ipv6_src=1000::3,ipv6_dst=1000::4,nw_proto=58,nw_ttl=255,icmpv6_type=135,icmpv6_code=0,nd_target=1000::2"], [0], [stdout]) +AT_CHECK([tail -2 stdout], [0], + [Megaflow: recirc_id=0,eth,icmp6,in_port=2,nw_ttl=255,nw_frag=no,icmp_type=0x87/0xff,icmp_code=0x0/0xff,nd_target=1000::2 +Datapath actions: drop +]) + +dnl Check that ports' mask doesn't affect ct ports. 
+AT_CHECK([ovs-appctl ofproto/trace br0 'in_port=3,ct_state=trk|rpl,ct_nw_proto=6,ct_tp_src=3,dl_src=50:54:00:00:00:05,dl_dst=50:54:00:00:00:07,dl_type=0x0800,nw_src=192.168.0.1,nw_dst=192.168.0.2,nw_proto=6,nw_tos=0,nw_ttl=128,tp_src=8,tp_dst=80,tcp_flags=psh'], [0], [stdout]) +AT_CHECK([tail -2 stdout], [0], + [Megaflow: recirc_id=0,ct_state=+rpl+trk,ct_nw_proto=6,ct_tp_src=0x1/0x1,eth,tcp,in_port=3,nw_frag=no,tp_dst=80,tcp_flags=+psh +Datapath actions: 2 +]) +AT_CHECK([ovs-appctl ofproto/trace br0 'in_port=3,ct_state=trk|rpl,ct_nw_proto=6,ct_tp_src=3,dl_src=50:54:00:00:00:05,dl_dst=50:54:00:00:00:07,dl_type=0x0800,nw_src=192.168.0.1,nw_dst=192.168.0.2,nw_proto=6,nw_tos=0,nw_ttl=128,tp_src=8,tp_dst=79,tcp_flags=psh'], [0], [stdout]) +AT_CHECK([tail -2 stdout], [0], + [Megaflow: recirc_id=0,ct_state=+rpl+trk,ct_nw_proto=6,ct_tp_src=0x1/0x1,eth,tcp,in_port=3,nw_frag=no,tp_dst=0x40/0xfff0,tcp_flags=+psh +Datapath actions: drop +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + AT_BANNER([flow classifier prefix lookup]) AT_SETUP([flow classifier - prefix lookup]) OVS_VSWITCHD_START From 4d69c19000357812fcbe8202a10822d57ac9cc43 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Mon, 27 Feb 2023 16:29:26 +0100 Subject: [PATCH 187/833] ofproto-dpif-upcall: Reset ukey's last stats value if the datapath changed. When the ukey's action set changes, it could cause the flow to use a different datapath, for example, when it moves from tc to kernel. This will cause the cached previous datapath statistics to be used. This change will reset the cached statistics when a change in datapath is discovered. 
Reviewed-by: Simon Horman Signed-off-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- lib/dpif-netdev.c | 1 + lib/dpif-netlink.c | 1 + lib/dpif-provider.h | 8 +++++ lib/dpif.c | 6 ++++ lib/dpif.h | 1 + ofproto/ofproto-dpif-upcall.c | 41 ++++++++++++++++++++-- tests/system-offloads-traffic.at | 60 ++++++++++++++++++++++++++++++++ 7 files changed, 116 insertions(+), 2 deletions(-) diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index c9f7179c3b4..aed2c8fbbe9 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -9616,6 +9616,7 @@ dpif_netdev_bond_stats_get(struct dpif *dpif, uint32_t bond_id, const struct dpif_class dpif_netdev_class = { "netdev", true, /* cleanup_required */ + true, /* synced_dp_layers */ dpif_netdev_init, dpif_netdev_enumerate, dpif_netdev_port_open_type, diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c index 586fb8893d2..7875e573e64 100644 --- a/lib/dpif-netlink.c +++ b/lib/dpif-netlink.c @@ -4515,6 +4515,7 @@ dpif_netlink_cache_set_size(struct dpif *dpif_, uint32_t level, uint32_t size) const struct dpif_class dpif_netlink_class = { "system", false, /* cleanup_required */ + false, /* synced_dp_layers */ NULL, /* init */ dpif_netlink_enumerate, NULL, diff --git a/lib/dpif-provider.h b/lib/dpif-provider.h index 12477a24fee..b8ead8a02a0 100644 --- a/lib/dpif-provider.h +++ b/lib/dpif-provider.h @@ -127,6 +127,14 @@ struct dpif_class { * datapaths that can not exist without it (e.g. netdev datapath). */ bool cleanup_required; + /* If 'true' the specific dpif implementation synchronizes the various + * datapath implementation layers, i.e., the dpif's layer in combination + * with the underlying netdev offload layers. For example, dpif-netlink + * does not sync its kernel flows with the tc ones, i.e., only one gets + * installed. On the other hand, dpif-netdev installs both flows, + * internally keeps track of both, and represents them as one. 
*/ + bool synced_dp_layers; + /* Called when the dpif provider is registered, typically at program * startup. Returning an error from this function will prevent any * datapath with this class from being created. diff --git a/lib/dpif.c b/lib/dpif.c index fe4db83fbfe..3305401fe01 100644 --- a/lib/dpif.c +++ b/lib/dpif.c @@ -2109,3 +2109,9 @@ dpif_cache_set_size(struct dpif *dpif, uint32_t level, uint32_t size) ? dpif->dpif_class->cache_set_size(dpif, level, size) : EOPNOTSUPP; } + +bool +dpif_synced_dp_layers(struct dpif *dpif) +{ + return dpif->dpif_class->synced_dp_layers; +} diff --git a/lib/dpif.h b/lib/dpif.h index 6cb4dae6d8d..129cbf6a1d5 100644 --- a/lib/dpif.h +++ b/lib/dpif.h @@ -939,6 +939,7 @@ int dpif_get_pmds_for_port(const struct dpif * dpif, odp_port_t port_no, char *dpif_get_dp_version(const struct dpif *); bool dpif_supports_tnl_push_pop(const struct dpif *); bool dpif_supports_explicit_drop_action(const struct dpif *); +bool dpif_synced_dp_layers(struct dpif *); /* Log functions. 
*/ struct vlog_module; diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c index fc94078cbba..4031e766f1d 100644 --- a/ofproto/ofproto-dpif-upcall.c +++ b/ofproto/ofproto-dpif-upcall.c @@ -48,17 +48,20 @@ #define UPCALL_MAX_BATCH 64 #define REVALIDATE_MAX_BATCH 50 +#define UINT64_THREE_QUARTERS (UINT64_MAX / 4 * 3) VLOG_DEFINE_THIS_MODULE(ofproto_dpif_upcall); COVERAGE_DEFINE(dumped_duplicate_flow); COVERAGE_DEFINE(dumped_new_flow); COVERAGE_DEFINE(handler_duplicate_upcall); -COVERAGE_DEFINE(upcall_ukey_contention); -COVERAGE_DEFINE(upcall_ukey_replace); COVERAGE_DEFINE(revalidate_missed_dp_flow); +COVERAGE_DEFINE(ukey_dp_change); +COVERAGE_DEFINE(ukey_invalid_stat_reset); COVERAGE_DEFINE(upcall_flow_limit_hit); COVERAGE_DEFINE(upcall_flow_limit_kill); +COVERAGE_DEFINE(upcall_ukey_contention); +COVERAGE_DEFINE(upcall_ukey_replace); /* A thread that reads upcalls from dpif, forwards each upcall's packet, * and possibly sets up a kernel flow as a cache. */ @@ -288,6 +291,7 @@ struct udpif_key { struct ovs_mutex mutex; /* Guards the following. */ struct dpif_flow_stats stats OVS_GUARDED; /* Last known stats.*/ + const char *dp_layer OVS_GUARDED; /* Last known dp_layer. */ long long int created OVS_GUARDED; /* Estimate of creation time. */ uint64_t dump_seq OVS_GUARDED; /* Tracks udpif->dump_seq. */ uint64_t reval_seq OVS_GUARDED; /* Tracks udpif->reval_seq. */ @@ -1771,6 +1775,7 @@ ukey_create__(const struct nlattr *key, size_t key_len, ukey->created = ukey->flow_time = time_msec(); memset(&ukey->stats, 0, sizeof ukey->stats); ukey->stats.used = used; + ukey->dp_layer = NULL; ukey->xcache = NULL; ukey->offloaded = false; @@ -2357,6 +2362,13 @@ revalidate_ukey(struct udpif *udpif, struct udpif_key *ukey, ? 
stats->n_bytes - ukey->stats.n_bytes : 0); + if (stats->n_packets < ukey->stats.n_packets && + ukey->stats.n_packets < UINT64_THREE_QUARTERS) { + /* Report cases where the packet counter is lower than the previous + * instance, but exclude the potential wrapping of an uint64_t. */ + COVERAGE_INC(ukey_invalid_stat_reset); + } + if (need_revalidate) { if (should_revalidate(udpif, push.n_packets, ukey->stats.used)) { if (!ukey->xcache) { @@ -2470,6 +2482,15 @@ push_dp_ops(struct udpif *udpif, struct ukey_op *ops, size_t n_ops) push->tcp_flags = stats->tcp_flags | op->ukey->stats.tcp_flags; push->n_packets = stats->n_packets - op->ukey->stats.n_packets; push->n_bytes = stats->n_bytes - op->ukey->stats.n_bytes; + + if (stats->n_packets < op->ukey->stats.n_packets && + op->ukey->stats.n_packets < UINT64_THREE_QUARTERS) { + /* Report cases where the packet counter is lower than the + * previous instance, but exclude the potential wrapping of an + * uint64_t. */ + COVERAGE_INC(ukey_invalid_stat_reset); + } + ovs_mutex_unlock(&op->ukey->mutex); } else { push = stats; @@ -2774,6 +2795,22 @@ revalidate(struct revalidator *revalidator) continue; } + ukey->offloaded = f->attrs.offloaded; + if (!ukey->dp_layer + || (!dpif_synced_dp_layers(udpif->dpif) + && strcmp(ukey->dp_layer, f->attrs.dp_layer))) { + + if (ukey->dp_layer) { + /* The dp_layer has changed this is probably due to an + * earlier revalidate cycle moving it to/from hw offload. + * In this case we should reset the ukey stored statistics, + * as they are from the deleted DP flow. 
*/ + COVERAGE_INC(ukey_dp_change); + memset(&ukey->stats, 0, sizeof ukey->stats); + } + ukey->dp_layer = f->attrs.dp_layer; + } + already_dumped = ukey->dump_seq == dump_seq; if (already_dumped) { /* The flow has already been handled during this flow dump diff --git a/tests/system-offloads-traffic.at b/tests/system-offloads-traffic.at index 7558812ebc0..eb331d6ce18 100644 --- a/tests/system-offloads-traffic.at +++ b/tests/system-offloads-traffic.at @@ -690,3 +690,63 @@ OVS_CHECK_ACTIONS([check_pkt_len(size=200,gt(5),le(check_pkt_len(size=100,gt(5), OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP + + +AT_SETUP([offloads - offload flow to none-offload]) +OVS_TRAFFIC_VSWITCHD_START([], [], [-- set Open_vSwitch . other_config:hw-offload=true]) + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +AT_DATA([flows.txt], [dnl +add in_port=ovs-p0,actions=ovs-p1 +add in_port=ovs-p1,actions=ovs-p0 +]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +10 packets transmitted, 10 received, 0% packet loss, time 0ms +]) + +AT_CHECK([ovs-appctl dpctl/dump-flows type=tc | grep "eth_type(0x0800)" | sort | strip_recirc | strip_used], [0], [dnl +recirc_id(),in_port(2),eth(),eth_type(0x0800),ipv4(frag=no), packets:9, bytes:756, used:0.0s, actions:3 +recirc_id(),in_port(3),eth(),eth_type(0x0800),ipv4(frag=no), packets:9, bytes:756, used:0.0s, actions:2 +]) + +dnl Here we use an output action with truncate, which will force a kernel flow. 
+AT_DATA([flows2.txt], [dnl +modify in_port=ovs-p0,actions=output(port=ovs-p1, max_len=128) +modify in_port=ovs-p1,actions=output(port=ovs-p0, max_len=128) +]) +AT_CHECK([ovs-ofctl add-flows br0 flows2.txt]) +AT_CHECK([ovs-appctl revalidator/wait], [0]) + +NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +10 packets transmitted, 10 received, 0% packet loss, time 0ms +]) + +AT_CHECK([ovs-appctl dpctl/dump-flows type=ovs | grep "eth_type(0x0800)" | sort | strip_recirc | strip_used], [0], [dnl +recirc_id(),in_port(2),eth(),eth_type(0x0800),ipv4(frag=no), packets:10, bytes:980, used:0.0s, actions:trunc(128),3 +recirc_id(),in_port(3),eth(),eth_type(0x0800),ipv4(frag=no), packets:10, bytes:980, used:0.0s, actions:trunc(128),2 +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) +AT_CHECK([ovs-appctl revalidator/wait], [0]) + +NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +10 packets transmitted, 10 received, 0% packet loss, time 0ms +]) + +AT_CHECK([ovs-appctl dpctl/dump-flows type=tc | grep "eth_type(0x0800)" | sort | strip_recirc | strip_used], [0], [dnl +recirc_id(),in_port(2),eth(),eth_type(0x0800),ipv4(frag=no), packets:10, bytes:840, used:0.0s, actions:3 +recirc_id(),in_port(3),eth(),eth_type(0x0800),ipv4(frag=no), packets:10, bytes:840, used:0.0s, actions:2 +]) + +AT_CHECK([ovs-appctl coverage/read-counter ukey_invalid_stat_reset], [0], [dnl +0 +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP From bfc0d5da350775f9872b57817169eaf146fb5461 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Mon, 27 Feb 2023 16:30:11 +0100 Subject: [PATCH 188/833] ofproto-dpif-upcall: Include hardware offloaded flows in total flows. The revalidator process uses the internal call udpif_get_n_flows() to get the total number of flows installed in the system. It uses this value for various decisions on flow installation and removal. 
With the tc offload this value is incorrect, as the hardware offloaded flows are not included. With rte_flow offload this is not a problem as dpif netdev keeps both in sync. This patch will include the hardware offloaded flows if the underlying dpif implementation is not syncing them. Reviewed-by: Simon Horman Signed-off-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif-upcall.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c index 4031e766f1d..06873d47791 100644 --- a/ofproto/ofproto-dpif-upcall.c +++ b/ofproto/ofproto-dpif-upcall.c @@ -785,6 +785,17 @@ udpif_get_n_flows(struct udpif *udpif) atomic_store_relaxed(&udpif->n_flows_timestamp, now); dpif_get_dp_stats(udpif->dpif, &stats); flow_count = stats.n_flows; + + if (!dpif_synced_dp_layers(udpif->dpif)) { + /* If the dpif layer does not sync the flows, we need to include + * the hardware offloaded flows separately. */ + uint64_t hw_flows; + + if (!dpif_get_n_offloaded_flows(udpif->dpif, &hw_flows)) { + flow_count += hw_flows; + } + } + atomic_store_relaxed(&udpif->n_flows, flow_count); ovs_mutex_unlock(&udpif->n_flows_mutex); } else { From e3c821f8ca866508b8aba70b08ed016898a06625 Mon Sep 17 00:00:00 2001 From: Wilson Peng Date: Wed, 9 Nov 2022 11:31:46 +0800 Subject: [PATCH 189/833] netdev-windows: Add checking when creating netdev with system type on Windows In the recent Antrea project testing, some ports could not be created on Windows. When doing debug, our team found there is one case happening when multiple ports are waiting to be created with correct port number. Some system type port will be created netdev successfully and it will cause conflict as in the dpif side it will be internal type. So finally the port creation will fail and it could not be easily recovered. With the patch, on Windows the netdev creating will be blocked for system type when the ovs_type got on dpif is internal. 
More detailed case description is in the reported issue No.262 with link below. Reported-at:https://github.com/openvswitch/ovs-issues/issues/262 Signed-off-by: Wilson Peng Signed-off-by: Alin Gabriel Serdean --- lib/netdev-windows.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/lib/netdev-windows.c b/lib/netdev-windows.c index 4ad45ffa1b2..3fad501e3ee 100644 --- a/lib/netdev-windows.c +++ b/lib/netdev-windows.c @@ -156,6 +156,7 @@ netdev_windows_system_construct(struct netdev *netdev_) struct netdev_windows_netdev_info info; struct ofpbuf *buf; int ret; + const char *type = NULL; /* Query the attributes and runtime status of the netdev. */ ret = query_netdev(netdev_get_name(&netdev->up), &info, &buf); @@ -167,6 +168,16 @@ netdev_windows_system_construct(struct netdev *netdev_) } ofpbuf_delete(buf); + /* Don't create netdev if ovs-type is "internal" + * but the type of netdev->up is "system". */ + type = netdev_get_type(&netdev->up); + if (type && !strcmp(type, "system") && + (info.ovs_type == OVS_VPORT_TYPE_INTERNAL)) { + VLOG_DBG("construct device %s, ovs_type: %u failed", + netdev_get_name(&netdev->up), info.ovs_type); + return 1; + } + netdev->change_seq = 1; netdev->dev_type = info.ovs_type; netdev->port_no = info.port_no; From 71ca8393b7005d4705336ad1bc1be9ce9ae49ef9 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 28 Feb 2023 18:30:56 -0800 Subject: [PATCH 190/833] treewide: Remove uses of ATOMIC_VAR_INIT. ATOMIC_VAR_INIT has a trivial definition `#define ATOMIC_VAR_INIT(value) (value)`, is deprecated in C17/C++20, and will be removed in newer standards in newer GCC/Clang (e.g. https://reviews.llvm.org/D144196). 
Signed-off-by: Fangrui Song Signed-off-by: Ilya Maximets --- lib/dpdk.c | 2 +- lib/mpsc-queue.h | 6 +++--- lib/ovs-atomic-clang.h | 2 -- lib/ovs-atomic-gcc4+.h | 1 - lib/ovs-atomic-gcc4.7+.h | 1 - lib/ovs-atomic-i586.h | 1 - lib/ovs-atomic-msvc.h | 1 - lib/ovs-atomic-pthreads.h | 1 - lib/ovs-atomic-x86_64.h | 1 - lib/ovs-atomic.h | 5 ++--- lib/ovs-rcu.h | 4 ++-- lib/ovs-replay.c | 2 +- lib/versions.h | 2 +- lib/vlog.c | 2 +- ofproto/ofproto-dpif-upcall.c | 4 ++-- tests/test-atomic.c | 12 ++++++------ 16 files changed, 19 insertions(+), 28 deletions(-) diff --git a/lib/dpdk.c b/lib/dpdk.c index 240babc03e6..d76d53f8f16 100644 --- a/lib/dpdk.c +++ b/lib/dpdk.c @@ -47,7 +47,7 @@ VLOG_DEFINE_THIS_MODULE(dpdk); static FILE *log_stream = NULL; /* Stream for DPDK log redirection */ /* Indicates successful initialization of DPDK. */ -static atomic_bool dpdk_initialized = ATOMIC_VAR_INIT(false); +static atomic_bool dpdk_initialized = false; static bool args_contains(const struct svec *args, const char *value) diff --git a/lib/mpsc-queue.h b/lib/mpsc-queue.h index 8c7109621a1..70c2d7a01ec 100644 --- a/lib/mpsc-queue.h +++ b/lib/mpsc-queue.h @@ -116,9 +116,9 @@ struct mpsc_queue { }; #define MPSC_QUEUE_INITIALIZER(Q) { \ - .head = ATOMIC_VAR_INIT(&(Q)->stub), \ - .tail = ATOMIC_VAR_INIT(&(Q)->stub), \ - .stub = { .next = ATOMIC_VAR_INIT(NULL) }, \ + .head = &(Q)->stub, \ + .tail = &(Q)->stub, \ + .stub = { .next = NULL }, \ .read_lock = OVS_MUTEX_INITIALIZER, \ } diff --git a/lib/ovs-atomic-clang.h b/lib/ovs-atomic-clang.h index cdf02a512a9..0fc643c8a97 100644 --- a/lib/ovs-atomic-clang.h +++ b/lib/ovs-atomic-clang.h @@ -23,8 +23,6 @@ #define ATOMIC(TYPE) _Atomic(TYPE) -#define ATOMIC_VAR_INIT(VALUE) (VALUE) - #define atomic_init(OBJECT, VALUE) __c11_atomic_init(OBJECT, VALUE) /* Clang hard-codes these exact values internally but does not appear to diff --git a/lib/ovs-atomic-gcc4+.h b/lib/ovs-atomic-gcc4+.h index f9accde1a39..1917df69007 100644 --- a/lib/ovs-atomic-gcc4+.h 
+++ b/lib/ovs-atomic-gcc4+.h @@ -43,7 +43,6 @@ typedef enum { #define IS_LOCKLESS_ATOMIC(OBJECT) (sizeof(OBJECT) <= sizeof(void *)) -#define ATOMIC_VAR_INIT(VALUE) VALUE #define atomic_init(OBJECT, VALUE) (*(OBJECT) = (VALUE), (void) 0) static inline void diff --git a/lib/ovs-atomic-gcc4.7+.h b/lib/ovs-atomic-gcc4.7+.h index 846e0577520..9680e546fc1 100644 --- a/lib/ovs-atomic-gcc4.7+.h +++ b/lib/ovs-atomic-gcc4.7+.h @@ -30,7 +30,6 @@ typedef enum { memory_order_seq_cst = __ATOMIC_SEQ_CST } memory_order; -#define ATOMIC_VAR_INIT(VALUE) (VALUE) #define atomic_init(OBJECT, VALUE) (*(OBJECT) = (VALUE), (void) 0) #define atomic_thread_fence __atomic_thread_fence diff --git a/lib/ovs-atomic-i586.h b/lib/ovs-atomic-i586.h index 35a0959ffca..2b651865215 100644 --- a/lib/ovs-atomic-i586.h +++ b/lib/ovs-atomic-i586.h @@ -119,7 +119,6 @@ typedef enum { #define IS_LOCKLESS_ATOMIC(OBJECT) \ (sizeof(OBJECT) <= 8 && IS_POW2(sizeof(OBJECT))) -#define ATOMIC_VAR_INIT(VALUE) VALUE #define atomic_init(OBJECT, VALUE) (*(OBJECT) = (VALUE), (void) 0) /* diff --git a/lib/ovs-atomic-msvc.h b/lib/ovs-atomic-msvc.h index fb8cd03bd69..3a71f61aeec 100644 --- a/lib/ovs-atomic-msvc.h +++ b/lib/ovs-atomic-msvc.h @@ -59,7 +59,6 @@ typedef enum { #define IS_LOCKLESS_ATOMIC(OBJECT) \ (sizeof(OBJECT) <= 8 && IS_POW2(sizeof(OBJECT))) -#define ATOMIC_VAR_INIT(VALUE) (VALUE) #define atomic_init(OBJECT, VALUE) (*(OBJECT) = (VALUE), (void) 0) static inline void diff --git a/lib/ovs-atomic-pthreads.h b/lib/ovs-atomic-pthreads.h index 570a67fe4cb..0e4263fe288 100644 --- a/lib/ovs-atomic-pthreads.h +++ b/lib/ovs-atomic-pthreads.h @@ -42,7 +42,6 @@ typedef enum { memory_order_seq_cst } memory_order; -#define ATOMIC_VAR_INIT(VALUE) (VALUE) #define atomic_init(OBJECT, VALUE) (*(OBJECT) = (VALUE), (void) 0) static inline void diff --git a/lib/ovs-atomic-x86_64.h b/lib/ovs-atomic-x86_64.h index 3bdaf2f08e9..2f538699f18 100644 --- a/lib/ovs-atomic-x86_64.h +++ b/lib/ovs-atomic-x86_64.h @@ -120,7 +120,6 @@ 
typedef enum { #define IS_LOCKLESS_ATOMIC(OBJECT) \ (sizeof(OBJECT) <= 8 && IS_POW2(sizeof(OBJECT))) -#define ATOMIC_VAR_INIT(VALUE) VALUE #define atomic_init(OBJECT, VALUE) (*(OBJECT) = (VALUE), (void) 0) /* diff --git a/lib/ovs-atomic.h b/lib/ovs-atomic.h index 8fdce0cf804..ab9ce6b2e0f 100644 --- a/lib/ovs-atomic.h +++ b/lib/ovs-atomic.h @@ -91,10 +91,9 @@ * Life Cycle * ========== * - * To initialize an atomic variable at its point of definition, use - * ATOMIC_VAR_INIT: + * To initialize an atomic variable at its point of definition, use: * - * static atomic_int ai = ATOMIC_VAR_INIT(123); + * static atomic_int ai = 123; * * To initialize an atomic variable in code, use atomic_init(): * diff --git a/lib/ovs-rcu.h b/lib/ovs-rcu.h index 8b397b7fb0c..a1c15c1266e 100644 --- a/lib/ovs-rcu.h +++ b/lib/ovs-rcu.h @@ -175,7 +175,7 @@ #if __GNUC__ #define OVSRCU_TYPE(TYPE) struct { ATOMIC(TYPE) p; } -#define OVSRCU_INITIALIZER(VALUE) { ATOMIC_VAR_INIT(VALUE) } +#define OVSRCU_INITIALIZER(VALUE) { VALUE } #define ovsrcu_get__(TYPE, VAR, ORDER) \ ({ \ TYPE value__; \ @@ -207,7 +207,7 @@ #else /* not GNU C */ struct ovsrcu_pointer { ATOMIC(void *) p; }; #define OVSRCU_TYPE(TYPE) struct ovsrcu_pointer -#define OVSRCU_INITIALIZER(VALUE) { ATOMIC_VAR_INIT(VALUE) } +#define OVSRCU_INITIALIZER(VALUE) { VALUE } static inline void * ovsrcu_get__(const struct ovsrcu_pointer *pointer, memory_order order) { diff --git a/lib/ovs-replay.c b/lib/ovs-replay.c index f386246c7ef..551c7f56d3b 100644 --- a/lib/ovs-replay.c +++ b/lib/ovs-replay.c @@ -34,7 +34,7 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 25); static struct ovs_mutex replay_mutex = OVS_MUTEX_INITIALIZER; static int replay_seqno OVS_GUARDED_BY(replay_mutex) = 0; -static atomic_int replay_state = ATOMIC_VAR_INIT(OVS_REPLAY_NONE); +static atomic_int replay_state = OVS_REPLAY_NONE; static char *dirname = NULL; diff --git a/lib/versions.h b/lib/versions.h index d92f0a319e6..724880cb7e4 100644 --- a/lib/versions.h 
+++ b/lib/versions.h @@ -36,7 +36,7 @@ struct versions { }; #define VERSIONS_INITIALIZER(ADD, REMOVE) \ - (struct versions){ ADD, ATOMIC_VAR_INIT(REMOVE) } + (struct versions){ ADD, REMOVE } static inline void versions_set_remove_version(struct versions *versions, ovs_version_t version) diff --git a/lib/vlog.c b/lib/vlog.c index 0a615bb664b..9ddea48b85f 100644 --- a/lib/vlog.c +++ b/lib/vlog.c @@ -118,7 +118,7 @@ static struct ovs_list vlog_modules OVS_GUARDED_BY(log_file_mutex) static int syslog_fd OVS_GUARDED_BY(pattern_rwlock) = -1; /* Log facility configuration. */ -static atomic_int log_facility = ATOMIC_VAR_INIT(0); +static atomic_int log_facility = 0; /* Facility name and its value. */ struct vlog_facility { diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c index 06873d47791..4dab51dff0c 100644 --- a/ofproto/ofproto-dpif-upcall.c +++ b/ofproto/ofproto-dpif-upcall.c @@ -420,8 +420,8 @@ static int udpif_flow_unprogram(struct udpif *udpif, struct udpif_key *ukey, static upcall_callback upcall_cb; static dp_purge_callback dp_purge_cb; -static atomic_bool enable_megaflows = ATOMIC_VAR_INIT(true); -static atomic_bool enable_ufid = ATOMIC_VAR_INIT(true); +static atomic_bool enable_megaflows = true; +static atomic_bool enable_ufid = true; void udpif_init(void) diff --git a/tests/test-atomic.c b/tests/test-atomic.c index 4b1374b70b2..7853c3e59f2 100644 --- a/tests/test-atomic.c +++ b/tests/test-atomic.c @@ -28,7 +28,7 @@ VLOG_DEFINE_THIS_MODULE(test_atomic); #define TEST_ATOMIC_TYPE(ATOMIC_TYPE, BASE_TYPE) \ { \ - ATOMIC_TYPE x = ATOMIC_VAR_INIT(1); \ + ATOMIC_TYPE x = 1; \ BASE_TYPE value, orig; \ \ atomic_read(&x, &value); \ @@ -71,7 +71,7 @@ VLOG_DEFINE_THIS_MODULE(test_atomic); #define TEST_ATOMIC_TYPE_EXPLICIT(ATOMIC_TYPE, BASE_TYPE, \ ORDER_READ, ORDER_STORE, ORDER_RMW) \ { \ - ATOMIC_TYPE x = ATOMIC_VAR_INIT(1); \ + ATOMIC_TYPE x = 1; \ BASE_TYPE value, orig; \ \ atomic_read_explicit(&x, &value, ORDER_READ); \ @@ -181,7 +181,7 @@ 
struct atomic_aux { ATOMIC(uint64_t) data64; }; -static ATOMIC(struct atomic_aux *) paux = ATOMIC_VAR_INIT(NULL); +static ATOMIC(struct atomic_aux *) paux = NULL; static struct atomic_aux *auxes = NULL; #define ATOMIC_ITEM_COUNT 1000000 @@ -229,7 +229,7 @@ atomic_producer(void * arg1 OVS_UNUSED) for (i = 0; i < ATOMIC_ITEM_COUNT; i++) { struct atomic_aux *aux = &auxes[i]; - aux->count = ATOMIC_VAR_INIT(i); + aux->count = i; aux->b = i + 42; /* Publish the new item. */ @@ -337,9 +337,9 @@ test_acq_rel(void) a = 0; aux->b = 0; - aux->count = ATOMIC_VAR_INIT(0); + aux->count = 0; atomic_init(&aux->data, NULL); - aux->data64 = ATOMIC_VAR_INIT(0); + aux->data64 = 0; reader = ovs_thread_create("reader", atomic_reader, aux); writer = ovs_thread_create("writer", atomic_writer, aux); From f65d1951dfd06be469111c754d35890f7491bc5d Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 6 Mar 2023 19:57:00 +0100 Subject: [PATCH 191/833] AUTHORS: Add Fangrui Song. Signed-off-by: Ilya Maximets --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index ac1c37747dd..20f83176d5b 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -163,6 +163,7 @@ Ethan J. Jackson ejj@eecs.berkeley.edu Ethan Rahn erahn@arista.com Eziz Durdyyev ezizdurdy@gmail.com Fabrizio D'Angelo fdangelo@redhat.com +Fangrui Song maskray@google.com Fengqi Li lifengqi@inspur.com Flavio Fernandes flavio@flaviof.com Flavio Leitner fbl@redhat.com From de6589799e7e810d5ff243b7b00fd2af5bf99ff2 Mon Sep 17 00:00:00 2001 From: Nobuhiro MIKI Date: Mon, 6 Mar 2023 11:49:15 +0900 Subject: [PATCH 192/833] netdev-dummy: Support multiple IP addresses. This is useful in test cases where multiple IPv4/IPv6 addresses are assigned together. 
Acked-by: Eelco Chaudron Reviewed-by: Simon Horman Signed-off-by: Nobuhiro MIKI Signed-off-by: Ilya Maximets --- lib/netdev-dummy.c | 70 +++++++++++++++++++++++++++++----------------- 1 file changed, 44 insertions(+), 26 deletions(-) diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c index 5d59c9c0312..7467e9fbcb9 100644 --- a/lib/netdev-dummy.c +++ b/lib/netdev-dummy.c @@ -136,8 +136,7 @@ struct netdev_dummy { struct pcap_file *tx_pcap, *rxq_pcap OVS_GUARDED; - struct in_addr address, netmask; - struct in6_addr ipv6, ipv6_mask; + struct ovs_list addrs OVS_GUARDED; struct ovs_list rxes OVS_GUARDED; /* List of child "netdev_rxq_dummy"s. */ struct hmap offloaded_flows OVS_GUARDED; @@ -161,6 +160,12 @@ struct netdev_rxq_dummy { struct seq *seq; /* Reports newly queued packets. */ }; +struct netdev_addr_dummy { + struct in6_addr address; + struct in6_addr netmask; + struct ovs_list node; /* In netdev_dummy's "addrs" list. */ +}; + static unixctl_cb_func netdev_dummy_set_admin_state; static int netdev_dummy_construct(struct netdev *); static void netdev_dummy_queue_packet(struct netdev_dummy *, @@ -169,6 +174,7 @@ static void netdev_dummy_queue_packet(struct netdev_dummy *, static void dummy_packet_stream_close(struct dummy_packet_stream *); static void pkt_list_delete(struct ovs_list *); +static void addr_list_delete(struct ovs_list *); static bool is_dummy_class(const struct netdev_class *class) @@ -720,6 +726,7 @@ netdev_dummy_construct(struct netdev *netdev_) dummy_packet_conn_init(&netdev->conn); ovs_list_init(&netdev->rxes); + ovs_list_init(&netdev->addrs); hmap_init(&netdev->offloaded_flows); ovs_mutex_unlock(&netdev->mutex); @@ -756,6 +763,7 @@ netdev_dummy_destruct(struct netdev *netdev_) free(off_flow); } hmap_destroy(&netdev->offloaded_flows); + addr_list_delete(&netdev->addrs); ovs_mutex_unlock(&netdev->mutex); ovs_mutex_destroy(&netdev->mutex); @@ -803,32 +811,24 @@ netdev_dummy_get_addr_list(const struct netdev *netdev_, struct in6_addr **paddr struct 
netdev_dummy *netdev = netdev_dummy_cast(netdev_); int cnt = 0, i = 0, err = 0; struct in6_addr *addr, *mask; + struct netdev_addr_dummy *addr_dummy; ovs_mutex_lock(&netdev->mutex); - if (netdev->address.s_addr != INADDR_ANY) { - cnt++; - } - if (ipv6_addr_is_set(&netdev->ipv6)) { - cnt++; - } + cnt = ovs_list_size(&netdev->addrs); if (!cnt) { err = EADDRNOTAVAIL; goto out; } addr = xmalloc(sizeof *addr * cnt); mask = xmalloc(sizeof *mask * cnt); - if (netdev->address.s_addr != INADDR_ANY) { - in6_addr_set_mapped_ipv4(&addr[i], netdev->address.s_addr); - in6_addr_set_mapped_ipv4(&mask[i], netdev->netmask.s_addr); - i++; - } - if (ipv6_addr_is_set(&netdev->ipv6)) { - memcpy(&addr[i], &netdev->ipv6, sizeof *addr); - memcpy(&mask[i], &netdev->ipv6_mask, sizeof *mask); + LIST_FOR_EACH (addr_dummy, node, &netdev->addrs) { + memcpy(&addr[i], &addr_dummy->address, sizeof *addr); + memcpy(&mask[i], &addr_dummy->netmask, sizeof *mask); i++; } + if (paddr) { *paddr = addr; *pmask = mask; @@ -844,14 +844,16 @@ netdev_dummy_get_addr_list(const struct netdev *netdev_, struct in6_addr **paddr } static int -netdev_dummy_set_in4(struct netdev *netdev_, struct in_addr address, +netdev_dummy_add_in4(struct netdev *netdev_, struct in_addr address, struct in_addr netmask) { struct netdev_dummy *netdev = netdev_dummy_cast(netdev_); + struct netdev_addr_dummy *addr_dummy = xmalloc(sizeof *addr_dummy); ovs_mutex_lock(&netdev->mutex); - netdev->address = address; - netdev->netmask = netmask; + in6_addr_set_mapped_ipv4(&addr_dummy->address, address.s_addr); + in6_addr_set_mapped_ipv4(&addr_dummy->netmask, netmask.s_addr); + ovs_list_push_back(&netdev->addrs, &addr_dummy->node); netdev_change_seq_changed(netdev_); ovs_mutex_unlock(&netdev->mutex); @@ -859,14 +861,16 @@ netdev_dummy_set_in4(struct netdev *netdev_, struct in_addr address, } static int -netdev_dummy_set_in6(struct netdev *netdev_, struct in6_addr *in6, +netdev_dummy_add_in6(struct netdev *netdev_, struct in6_addr *in6, struct 
in6_addr *mask) { struct netdev_dummy *netdev = netdev_dummy_cast(netdev_); + struct netdev_addr_dummy *addr_dummy = xmalloc(sizeof *addr_dummy); ovs_mutex_lock(&netdev->mutex); - netdev->ipv6 = *in6; - netdev->ipv6_mask = *mask; + addr_dummy->address = *in6; + addr_dummy->netmask = *mask; + ovs_list_push_back(&netdev->addrs, &addr_dummy->node); netdev_change_seq_changed(netdev_); ovs_mutex_unlock(&netdev->mutex); @@ -1178,7 +1182,10 @@ netdev_dummy_send(struct netdev *netdev, int qid, dummy_packet_conn_send(&dev->conn, buffer, size); /* Reply to ARP requests for 'dev''s assigned IP address. */ - if (dev->address.s_addr) { + struct netdev_addr_dummy *addr_dummy; + LIST_FOR_EACH (addr_dummy, node, &dev->addrs) { + ovs_be32 address = in6_addr_get_mapped_ipv4(&addr_dummy->address); + struct dp_packet dp; struct flow flow; @@ -1186,11 +1193,12 @@ netdev_dummy_send(struct netdev *netdev, int qid, flow_extract(&dp, &flow); if (flow.dl_type == htons(ETH_TYPE_ARP) && flow.nw_proto == ARP_OP_REQUEST - && flow.nw_dst == dev->address.s_addr) { + && flow.nw_dst == address) { struct dp_packet *reply = dp_packet_new(0); compose_arp(reply, ARP_OP_REPLY, dev->hwaddr, flow.dl_src, false, flow.nw_dst, flow.nw_src); netdev_dummy_queue_packet(dev, reply, NULL, 0); + break; } } @@ -1677,6 +1685,16 @@ pkt_list_delete(struct ovs_list *l) } } +static void +addr_list_delete(struct ovs_list *l) +{ + struct netdev_addr_dummy *addr_dummy; + + LIST_FOR_EACH_POP (addr_dummy, node, l) { + free(addr_dummy); + } +} + static struct dp_packet * eth_from_packet(const char *s) { @@ -2009,7 +2027,7 @@ netdev_dummy_ip4addr(struct unixctl_conn *conn, int argc OVS_UNUSED, error = ip_parse_masked(argv[2], &ip.s_addr, &mask.s_addr); if (!error) { - netdev_dummy_set_in4(netdev, ip, mask); + netdev_dummy_add_in4(netdev, ip, mask); unixctl_command_reply(conn, "OK"); } else { unixctl_command_reply_error(conn, error); @@ -2038,7 +2056,7 @@ netdev_dummy_ip6addr(struct unixctl_conn *conn, int argc OVS_UNUSED, 
struct in6_addr mask; mask = ipv6_create_mask(plen); - netdev_dummy_set_in6(netdev, &ip6, &mask); + netdev_dummy_add_in6(netdev, &ip6, &mask); unixctl_command_reply(conn, "OK"); } else { unixctl_command_reply_error(conn, error); From 915f084b9ff80f32e265c66c9b1aa51f9bbbd275 Mon Sep 17 00:00:00 2001 From: Nobuhiro MIKI Date: Mon, 6 Mar 2023 11:49:16 +0900 Subject: [PATCH 193/833] ovs-router: Cleanup parser for ovs/route/add command. This patch cleans up the parser to accept pkt_mark and gw in any order. pkt_mark and gw are normally expected to be specified exactly once. However, as with other tools, if specified multiple times, the last specification is used. Also, pkt_mark and gw have separate prefix strings so they can be parsed in any order. Acked-by: Eelco Chaudron Reviewed-by: Simon Horman Signed-off-by: Nobuhiro MIKI Signed-off-by: Ilya Maximets --- lib/ovs-router.c | 53 +++++++++++++++++++++++++-------------------- tests/ovs-router.at | 27 +++++++++++++++++++---- 2 files changed, 52 insertions(+), 28 deletions(-) diff --git a/lib/ovs-router.c b/lib/ovs-router.c index 5d0fbd503e9..b5ac1edb6c6 100644 --- a/lib/ovs-router.c +++ b/lib/ovs-router.c @@ -345,41 +345,46 @@ ovs_router_add(struct unixctl_conn *conn, int argc, struct in6_addr ip6; uint32_t mark = 0; unsigned int plen; + ovs_be32 gw = 0; + bool is_ipv6; ovs_be32 ip; int err; + int i; if (scan_ipv4_route(argv[1], &ip, &plen)) { - ovs_be32 gw = 0; - - if (argc > 3) { - if (!ovs_scan(argv[3], "pkt_mark=%"SCNi32, &mark) && - !ip_parse(argv[3], &gw)) { - unixctl_command_reply_error(conn, "Invalid pkt_mark or gateway"); - return; - } - } in6_addr_set_mapped_ipv4(&ip6, ip); - if (gw) { - in6_addr_set_mapped_ipv4(&gw6, gw); - } plen += 96; + is_ipv6 = false; } else if (scan_ipv6_route(argv[1], &ip6, &plen)) { - if (argc > 3) { - if (!ovs_scan(argv[3], "pkt_mark=%"SCNi32, &mark) && - !ipv6_parse(argv[3], &gw6)) { - unixctl_command_reply_error(conn, "Invalid pkt_mark or IPv6 gateway"); - return; - } - } + is_ipv6 
= true; } else { - unixctl_command_reply_error(conn, "Invalid parameters"); + unixctl_command_reply_error(conn, + "Invalid 'ip_addr/prefix_len' parameter"); return; } - if (argc > 4) { - if (!ovs_scan(argv[4], "pkt_mark=%"SCNi32, &mark)) { - unixctl_command_reply_error(conn, "Invalid pkt_mark"); - return; + + /* Parse optional parameters. */ + for (i = 3; i < argc; i++) { + if (ovs_scan(argv[i], "pkt_mark=%"SCNi32, &mark)) { + continue; + } + + if (is_ipv6) { + if (ipv6_parse(argv[i], &gw6)) { + continue; + } + } else { + if (ip_parse(argv[i], &gw)) { + continue; + } } + + unixctl_command_reply_error(conn, "Invalid pkt_mark or IP gateway"); + return; + } + + if (gw) { + in6_addr_set_mapped_ipv4(&gw6, gw); } err = ovs_router_insert__(mark, plen + 32, false, &ip6, plen, argv[2], &gw6); diff --git a/tests/ovs-router.at b/tests/ovs-router.at index 6dacc2954bc..a36990f1ea1 100644 --- a/tests/ovs-router.at +++ b/tests/ovs-router.at @@ -1,14 +1,33 @@ AT_BANNER([ovs-router]) -AT_SETUP([appctl - route/add with gateway]) +AT_SETUP([appctl - route/add with gateway and pkt_mark]) AT_KEYWORDS([ovs_router]) -OVS_VSWITCHD_START([add-port br0 p2 -- set Interface p2 type=gre \ - options:local_ip=2.2.2.2 options:remote_ip=1.1.1.1 \ - -- add-port br0 p1 -- set interface p1 type=dummy]) +OVS_VSWITCHD_START([add-port br0 p1 -- set Interface p1 type=dummy]) AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 2.2.2.2/24], [0], [OK ]) +AT_CHECK([ovs-appctl ovs/route/add 2.2.2.3/32 br0 pkt_mark=1], [0], [OK +]) AT_CHECK([ovs-appctl ovs/route/add 1.1.1.0/24 br0 2.2.2.10], [0], [OK ]) +AT_CHECK([ovs-appctl ovs/route/add 1.1.2.0/24 br0 2.2.2.10 pkt_mark=2], [0], [OK +]) +AT_CHECK([ovs-appctl ovs/route/add 1.1.3.0/24 br0 pkt_mark=3], [2], [], [dnl +Error while inserting route. 
+ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl ovs/route/add 1.1.foo.bar/24 br0 2.2.2.10], [2], [], [dnl +Invalid 'ip_addr/prefix_len' parameter +ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl ovs/route/add 2.2.2.4/24 br0 pkt_mark=baz], [2], [], [dnl +Invalid pkt_mark or IP gateway +ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl ovs/route/show | grep User | sort], [0], [dnl +User: 1.1.1.0/24 dev br0 GW 2.2.2.10 SRC 2.2.2.2 +User: 1.1.2.0/24 MARK 2 dev br0 GW 2.2.2.10 SRC 2.2.2.2 +User: 2.2.2.3/32 MARK 1 dev br0 SRC 2.2.2.2 +]) OVS_VSWITCHD_STOP AT_CLEANUP From 01acf09f746e4678e81b545b38ca682171628d02 Mon Sep 17 00:00:00 2001 From: Nobuhiro MIKI Date: Mon, 6 Mar 2023 11:49:17 +0900 Subject: [PATCH 194/833] ofproto: Fix man page for tunnel related commands. Fixed the manual page to indicate that both IPv4/IPv6 are supported. Also added missing pkt_mark on one side and fixed the "gw" and "bridge" notation quirks. 
Acked-by: Eelco Chaudron Reviewed-by: Simon Horman Signed-off-by: Nobuhiro MIKI Signed-off-by: Ilya Maximets --- lib/ovs-router.c | 6 +++--- ofproto/ofproto-tnl-unixctl.man | 8 ++++---- tests/ovs-router.at | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/lib/ovs-router.c b/lib/ovs-router.c index b5ac1edb6c6..02fce9095a9 100644 --- a/lib/ovs-router.c +++ b/lib/ovs-router.c @@ -359,7 +359,7 @@ ovs_router_add(struct unixctl_conn *conn, int argc, is_ipv6 = true; } else { unixctl_command_reply_error(conn, - "Invalid 'ip_addr/prefix_len' parameter"); + "Invalid 'ip/plen' parameter"); return; } @@ -537,12 +537,12 @@ ovs_router_init(void) fatal_signal_add_hook(ovs_router_flush_handler, NULL, NULL, true); classifier_init(&cls, NULL); unixctl_command_register("ovs/route/add", - "ip_addr/prefix_len out_br_name [gw] " + "ip/plen output_bridge [gw] " "[pkt_mark=mark]", 2, 4, ovs_router_add, NULL); unixctl_command_register("ovs/route/show", "", 0, 0, ovs_router_show, NULL); - unixctl_command_register("ovs/route/del", "ip_addr/prefix_len " + unixctl_command_register("ovs/route/del", "ip/plen " "[pkt_mark=mark]", 1, 2, ovs_router_del, NULL); unixctl_command_register("ovs/route/lookup", "ip_addr " diff --git a/ofproto/ofproto-tnl-unixctl.man b/ofproto/ofproto-tnl-unixctl.man index 13a465119a9..6ed7e7fcea9 100644 --- a/ofproto/ofproto-tnl-unixctl.man +++ b/ofproto/ofproto-tnl-unixctl.man @@ -1,8 +1,8 @@ .SS "OPENVSWITCH TUNNELING COMMANDS" These commands query and modify OVS tunnel components. . -.IP "\fBovs/route/add ipv4_address/plen output_bridge [GW]\fR" -Adds ipv4_address/plen route to vswitchd routing table. output_bridge +.IP "\fBovs/route/add ip/plen output_bridge [gw] [pkt_mark=mark]\fR" +Adds ip/plen route to vswitchd routing table. output_bridge needs to be OVS bridge name. This command is useful if OVS cached routes does not look right. . @@ -10,8 +10,8 @@ routes does not look right. 
Print all routes in OVS routing table, This includes routes cached from system routing table and user configured routes. . -.IP "\fBovs/route/del ipv4_address/plen\fR" -Delete ipv4_address/plen route from OVS routing table. +.IP "\fBovs/route/del ip/plen [pkt_mark=mark]\fR" +Delete ip/plen route from OVS routing table. . .IP "\fBtnl/neigh/show\fR" .IP "\fBtnl/arp/show\fR" diff --git a/tests/ovs-router.at b/tests/ovs-router.at index a36990f1ea1..ec3b1dffe58 100644 --- a/tests/ovs-router.at +++ b/tests/ovs-router.at @@ -16,7 +16,7 @@ Error while inserting route. ovs-appctl: ovs-vswitchd: server returned an error ]) AT_CHECK([ovs-appctl ovs/route/add 1.1.foo.bar/24 br0 2.2.2.10], [2], [], [dnl -Invalid 'ip_addr/prefix_len' parameter +Invalid 'ip/plen' parameter ovs-appctl: ovs-vswitchd: server returned an error ]) AT_CHECK([ovs-appctl ovs/route/add 2.2.2.4/24 br0 pkt_mark=baz], [2], [], [dnl From b801f1aa001cf0537cc64b268a49c7988b78cbf5 Mon Sep 17 00:00:00 2001 From: Nobuhiro MIKI Date: Mon, 6 Mar 2023 11:49:18 +0900 Subject: [PATCH 195/833] ovs-router: Introduce src option in ovs/route/add command. When adding a route with ovs/route/add command, the source address in "ovs_router_entry" structure is always the FIRST address that the interface has. See "ovs_router_get_netdev_source_address" function for more information. If an interface has multiple ipv4 and/or ipv6 addresses, there are use cases where the user wants to control the source address. This patch therefore addresses this issue by adding a src parameter. Note that same constraints also exist when caching routes from Kernel FIB with Netlink, but are not dealt with in this patch. 
Acked-by: Eelco Chaudron Reviewed-by: Simon Horman Signed-off-by: Nobuhiro MIKI Signed-off-by: Ilya Maximets --- NEWS | 3 ++ lib/ovs-router.c | 86 +++++++++++++++++++++++++++++---- ofproto/ofproto-tnl-unixctl.man | 5 +- tests/ovs-router.at | 80 +++++++++++++++++++++++++++++- 4 files changed, 161 insertions(+), 13 deletions(-) diff --git a/NEWS b/NEWS index ad84898ce80..3fe30bbf6dd 100644 --- a/NEWS +++ b/NEWS @@ -6,6 +6,9 @@ Post-v3.1.0 * OVS now collects per-interface upcall statistics that can be obtained via 'ovs-appctl dpctl/show -s' or the interface's statistics column in OVSDB. Available with upstream kernel 6.2+. + - ovs-appctl: + * Add support for selecting the source address with the + 'ovs-appctl ovs/route/add' command. - ovs-ctl: * Added new options --[ovsdb-server|ovs-vswitchd]-umask=MODE to set umask value when starting OVS daemons. E.g., use --ovsdb-server-umask=0002 diff --git a/lib/ovs-router.c b/lib/ovs-router.c index 02fce9095a9..3107f2d5607 100644 --- a/lib/ovs-router.c +++ b/lib/ovs-router.c @@ -164,6 +164,46 @@ static void rt_init_match(struct match *match, uint32_t mark, match->flow.pkt_mark = mark; } +static int +verify_prefsrc(const struct in6_addr *ip6_dst, + const char output_bridge[], + struct in6_addr *prefsrc) +{ + struct in6_addr *mask, *addr6; + struct netdev *dev; + int err, n_in6, i; + + err = netdev_open(output_bridge, NULL, &dev); + if (err) { + return err; + } + + err = netdev_get_addr_list(dev, &addr6, &mask, &n_in6); + if (err) { + goto out; + } + + for (i = 0; i < n_in6; i++) { + struct in6_addr a1, a2; + a1 = ipv6_addr_bitand(ip6_dst, &mask[i]); + a2 = ipv6_addr_bitand(prefsrc, &mask[i]); + + /* Check that the interface has "prefsrc" and + * it is same broadcast domain with "ip6_dst". 
*/ + if (IN6_ARE_ADDR_EQUAL(prefsrc, &addr6[i]) && + IN6_ARE_ADDR_EQUAL(&a1, &a2)) { + goto out; + } + } + err = ENOENT; + +out: + free(addr6); + free(mask); + netdev_close(dev); + return err; +} + int ovs_router_get_netdev_source_address(const struct in6_addr *ip6_dst, const char output_bridge[], @@ -217,8 +257,12 @@ static int ovs_router_insert__(uint32_t mark, uint8_t priority, bool local, const struct in6_addr *ip6_dst, uint8_t plen, const char output_bridge[], - const struct in6_addr *gw) + const struct in6_addr *gw, + const struct in6_addr *ip6_src) { + int (*get_src_addr)(const struct in6_addr *ip6_dst, + const char output_bridge[], + struct in6_addr *prefsrc); const struct cls_rule *cr; struct ovs_router_entry *p; struct match match; @@ -236,11 +280,17 @@ ovs_router_insert__(uint32_t mark, uint8_t priority, bool local, p->plen = plen; p->local = local; p->priority = priority; - err = ovs_router_get_netdev_source_address(ip6_dst, output_bridge, - &p->src_addr); + + if (ipv6_addr_is_set(ip6_src)) { + p->src_addr = *ip6_src; + get_src_addr = verify_prefsrc; + } else { + get_src_addr = ovs_router_get_netdev_source_address; + } + + err = get_src_addr(ip6_dst, output_bridge, &p->src_addr); if (err && ipv6_addr_is_set(gw)) { - err = ovs_router_get_netdev_source_address(gw, output_bridge, - &p->src_addr); + err = get_src_addr(gw, output_bridge, &p->src_addr); } if (err) { struct ds ds = DS_EMPTY_INITIALIZER; @@ -274,7 +324,8 @@ ovs_router_insert(uint32_t mark, const struct in6_addr *ip_dst, uint8_t plen, { if (use_system_routing_table) { uint8_t priority = local ? 
plen + 64 : plen; - ovs_router_insert__(mark, priority, local, ip_dst, plen, output_bridge, gw); + ovs_router_insert__(mark, priority, local, ip_dst, plen, + output_bridge, gw, &in6addr_any); } } @@ -341,10 +392,13 @@ static void ovs_router_add(struct unixctl_conn *conn, int argc, const char *argv[], void *aux OVS_UNUSED) { + struct in6_addr src6 = in6addr_any; struct in6_addr gw6 = in6addr_any; + char src6_s[IPV6_SCAN_LEN + 1]; struct in6_addr ip6; uint32_t mark = 0; unsigned int plen; + ovs_be32 src = 0; ovs_be32 gw = 0; bool is_ipv6; ovs_be32 ip; @@ -370,24 +424,36 @@ ovs_router_add(struct unixctl_conn *conn, int argc, } if (is_ipv6) { + if (ovs_scan(argv[i], "src="IPV6_SCAN_FMT, src6_s) && + ipv6_parse(src6_s, &src6)) { + continue; + } if (ipv6_parse(argv[i], &gw6)) { continue; } } else { + if (ovs_scan(argv[i], "src="IP_SCAN_FMT, IP_SCAN_ARGS(&src))) { + continue; + } if (ip_parse(argv[i], &gw)) { continue; } } - unixctl_command_reply_error(conn, "Invalid pkt_mark or IP gateway"); + unixctl_command_reply_error(conn, + "Invalid pkt_mark, IP gateway or src_ip"); return; } if (gw) { in6_addr_set_mapped_ipv4(&gw6, gw); } + if (src) { + in6_addr_set_mapped_ipv4(&src6, src); + } - err = ovs_router_insert__(mark, plen + 32, false, &ip6, plen, argv[2], &gw6); + err = ovs_router_insert__(mark, plen + 32, false, &ip6, plen, argv[2], + &gw6, &src6); if (err) { unixctl_command_reply_error(conn, "Error while inserting route."); } else { @@ -538,8 +604,8 @@ ovs_router_init(void) classifier_init(&cls, NULL); unixctl_command_register("ovs/route/add", "ip/plen output_bridge [gw] " - "[pkt_mark=mark]", - 2, 4, ovs_router_add, NULL); + "[pkt_mark=mark] [src=src_ip]", + 2, 5, ovs_router_add, NULL); unixctl_command_register("ovs/route/show", "", 0, 0, ovs_router_show, NULL); unixctl_command_register("ovs/route/del", "ip/plen " diff --git a/ofproto/ofproto-tnl-unixctl.man b/ofproto/ofproto-tnl-unixctl.man index 6ed7e7fcea9..a801cfdccc5 100644 --- a/ofproto/ofproto-tnl-unixctl.man 
+++ b/ofproto/ofproto-tnl-unixctl.man @@ -1,8 +1,9 @@ .SS "OPENVSWITCH TUNNELING COMMANDS" These commands query and modify OVS tunnel components. . -.IP "\fBovs/route/add ip/plen output_bridge [gw] [pkt_mark=mark]\fR" -Adds ip/plen route to vswitchd routing table. output_bridge +.IP "\fBovs/route/add \fIip\fB/\fIplen\fB \fIoutput_bridge\fB \ +[\fIgw\fB] [pkt_mark=\fImark\fB] [src=\fIsrc_ip\fB]\fR" +Adds \fIip\fR/\fIplen\fR route to vswitchd routing table. \fIoutput_bridge\fR needs to be OVS bridge name. This command is useful if OVS cached routes does not look right. . diff --git a/tests/ovs-router.at b/tests/ovs-router.at index ec3b1dffe58..b3314b3dff0 100644 --- a/tests/ovs-router.at +++ b/tests/ovs-router.at @@ -20,7 +20,7 @@ Invalid 'ip/plen' parameter ovs-appctl: ovs-vswitchd: server returned an error ]) AT_CHECK([ovs-appctl ovs/route/add 2.2.2.4/24 br0 pkt_mark=baz], [2], [], [dnl -Invalid pkt_mark or IP gateway +Invalid pkt_mark, IP gateway or src_ip ovs-appctl: ovs-vswitchd: server returned an error ]) AT_CHECK([ovs-appctl ovs/route/show | grep User | sort], [0], [dnl @@ -31,6 +31,84 @@ User: 2.2.2.3/32 MARK 1 dev br0 SRC 2.2.2.2 OVS_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([appctl - route/add with src - ipv4]) +AT_KEYWORDS([ovs_router]) +OVS_VSWITCHD_START([add-port br0 p1 -- set Interface p1 type=dummy]) +AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 192.168.9.2/24], [0], [OK +]) +AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 192.168.9.3/24], [0], [OK +]) +AT_CHECK([ovs-appctl ovs/route/add 192.168.9.11/32 br0 src=192.168.9.3], [0], [OK +]) +AT_CHECK([ovs-appctl ovs/route/add 192.168.10.12/32 br0 192.168.9.1 src=192.168.9.3], [0], [OK +]) +AT_CHECK([ovs-appctl ovs/route/add 192.168.10.13/32 br0 192.168.9.1 pkt_mark=13 src=192.168.9.3], [0], [OK +]) +AT_CHECK([ovs-appctl ovs/route/add 192.168.10.14/32 br0 192.168.9.1 pkt_mark=14 src=192.168.9.2], [0], [OK +]) +AT_CHECK([ovs-appctl ovs/route/add 192.168.10.15/32 br0 192.168.9.1 src=foo.bar.9.200], [2], [], [dnl 
+Invalid pkt_mark, IP gateway or src_ip +ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl ovs/route/add 192.168.10.16/32 br0 192.168.9.1 src=192.168.9.200], [2], [], [dnl +Error while inserting route. +ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl ovs/route/add 192.168.10.17/32 br0 192.168.11.1 src=192.168.9.3], [2], [], [dnl +Error while inserting route. +ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl ovs/route/add 192.168.10.18/32 br0 src=192.168.9.3], [2], [], [dnl +Error while inserting route. +ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl ovs/route/show | grep User | grep 192.168.10 | sort], [0], [dnl +User: 192.168.10.12/32 dev br0 GW 192.168.9.1 SRC 192.168.9.3 +User: 192.168.10.13/32 MARK 13 dev br0 GW 192.168.9.1 SRC 192.168.9.3 +User: 192.168.10.14/32 MARK 14 dev br0 GW 192.168.9.1 SRC 192.168.9.2 +]) +OVS_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([appctl - route/add with src - ipv6]) +AT_KEYWORDS([ovs_router]) +OVS_VSWITCHD_START([add-port br0 p1 -- set Interface p1 type=dummy]) +AT_CHECK([ovs-appctl netdev-dummy/ip6addr br0 2001:db8:cafe::2/64], [0], [OK +]) +AT_CHECK([ovs-appctl netdev-dummy/ip6addr br0 2001:db8:cafe::3/64], [0], [OK +]) +AT_CHECK([ovs-appctl ovs/route/add 2001:db8:cafe::11/128 br0 src=2001:db8:cafe::3], [0], [OK +]) +AT_CHECK([ovs-appctl ovs/route/add 2001:db8:beef::12/128 br0 2001:db8:cafe::1 src=2001:db8:cafe::3], [0], [OK +]) +AT_CHECK([ovs-appctl ovs/route/add 2001:db8:beef::13/128 br0 2001:db8:cafe::1 pkt_mark=13 src=2001:db8:cafe::3], [0], [OK +]) +AT_CHECK([ovs-appctl ovs/route/add 2001:db8:beef::14/128 br0 2001:db8:cafe::1 pkt_mark=14 src=2001:db8:cafe::2], [0], [OK +]) +AT_CHECK([ovs-appctl ovs/route/add 2001:db8:beef::15/128 br0 2001:db8:cafe::1 src=foo:bar:2001:db8:cafe], [2], [], [dnl +Invalid pkt_mark, IP gateway or src_ip +ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl 
ovs/route/add 2001:db8:beef::16/128 br0 2001:db8:cafe::1 src=2001:db8:cafe::200], [2], [], [dnl +Error while inserting route. +ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl ovs/route/add 2001:db8:beef::17/128 br0 2001:db8:face::1 src=2001:db8:cafe::3], [2], [], [dnl +Error while inserting route. +ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl ovs/route/add 2001:db8:beef::18/128 br0 src=2001:db8:cafe::3], [2], [], [dnl +Error while inserting route. +ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl ovs/route/show | grep User | grep 2001:db8:beef | sort], [0], [dnl +User: 2001:db8:beef::12/128 dev br0 GW 2001:db8:cafe::1 SRC 2001:db8:cafe::3 +User: 2001:db8:beef::13/128 MARK 13 dev br0 GW 2001:db8:cafe::1 SRC 2001:db8:cafe::3 +User: 2001:db8:beef::14/128 MARK 14 dev br0 GW 2001:db8:cafe::1 SRC 2001:db8:cafe::2 +]) +OVS_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([appctl - route/lookup]) AT_KEYWORDS([ovs_router]) OVS_VSWITCHD_START([add-port br0 p1 -- set Interface p1 type=dummy]) From 49e534cd3764e853f70f01b63196f320c9a5790e Mon Sep 17 00:00:00 2001 From: Nobuhiro MIKI Date: Mon, 6 Mar 2023 11:49:19 +0900 Subject: [PATCH 196/833] route-table: Retrieving the preferred source address from Netlink. We can use the "ip route add ... src ..." command to set the preferred source address for each entry in the kernel FIB. OVS has a mechanism to cache the FIB, but the preferred source address is ignored and calculated with its own logic. This patch resolves the difference between kernel FIB and OVS route table cache by retrieving the RTA_PREFSRC attribute of Netlink messages. 
Acked-by: Eelco Chaudron Reviewed-by: Simon Horman Signed-off-by: Nobuhiro MIKI Signed-off-by: Ilya Maximets --- NEWS | 2 ++ lib/ovs-router.c | 6 +++--- lib/ovs-router.h | 3 ++- lib/route-table.c | 16 +++++++++++++++- tests/system-route.at | 39 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 61 insertions(+), 5 deletions(-) diff --git a/NEWS b/NEWS index 3fe30bbf6dd..72b9024e6d8 100644 --- a/NEWS +++ b/NEWS @@ -6,6 +6,8 @@ Post-v3.1.0 * OVS now collects per-interface upcall statistics that can be obtained via 'ovs-appctl dpctl/show -s' or the interface's statistics column in OVSDB. Available with upstream kernel 6.2+. + - OVS route table in userspace now takes into account preferred source + address from cached kernel routes. - ovs-appctl: * Add support for selecting the source address with the 'ovs-appctl ovs/route/add' command. diff --git a/lib/ovs-router.c b/lib/ovs-router.c index 3107f2d5607..7c04bb0e6b1 100644 --- a/lib/ovs-router.c +++ b/lib/ovs-router.c @@ -319,13 +319,13 @@ ovs_router_insert__(uint32_t mark, uint8_t priority, bool local, void ovs_router_insert(uint32_t mark, const struct in6_addr *ip_dst, uint8_t plen, - bool local, const char output_bridge[], - const struct in6_addr *gw) + bool local, const char output_bridge[], + const struct in6_addr *gw, const struct in6_addr *prefsrc) { if (use_system_routing_table) { uint8_t priority = local ? 
plen + 64 : plen; ovs_router_insert__(mark, priority, local, ip_dst, plen, - output_bridge, gw, &in6addr_any); + output_bridge, gw, prefsrc); } } diff --git a/lib/ovs-router.h b/lib/ovs-router.h index d8ce3c00ded..eb4ff85d9e6 100644 --- a/lib/ovs-router.h +++ b/lib/ovs-router.h @@ -32,7 +32,8 @@ bool ovs_router_lookup(uint32_t mark, const struct in6_addr *ip_dst, void ovs_router_init(void); void ovs_router_insert(uint32_t mark, const struct in6_addr *ip_dst, uint8_t plen, bool local, - const char output_bridge[], const struct in6_addr *gw); + const char output_bridge[], const struct in6_addr *gw, + const struct in6_addr *prefsrc); void ovs_router_flush(void); void ovs_router_disable_system_routing_table(void); diff --git a/lib/route-table.c b/lib/route-table.c index ac82cf262f8..9927dcc1854 100644 --- a/lib/route-table.c +++ b/lib/route-table.c @@ -51,6 +51,7 @@ struct route_data { /* Extracted from Netlink attributes. */ struct in6_addr rta_dst; /* 0 if missing. */ + struct in6_addr rta_prefsrc; /* 0 if missing. */ struct in6_addr rta_gw; char ifname[IFNAMSIZ]; /* Interface name. 
*/ uint32_t mark; @@ -201,6 +202,7 @@ route_table_parse(struct ofpbuf *buf, struct route_table_msg *change) [RTA_OIF] = { .type = NL_A_U32, .optional = true }, [RTA_GATEWAY] = { .type = NL_A_U32, .optional = true }, [RTA_MARK] = { .type = NL_A_U32, .optional = true }, + [RTA_PREFSRC] = { .type = NL_A_U32, .optional = true }, }; static const struct nl_policy policy6[] = { @@ -208,6 +210,7 @@ route_table_parse(struct ofpbuf *buf, struct route_table_msg *change) [RTA_OIF] = { .type = NL_A_U32, .optional = true }, [RTA_MARK] = { .type = NL_A_U32, .optional = true }, [RTA_GATEWAY] = { .type = NL_A_IPV6, .optional = true }, + [RTA_PREFSRC] = { .type = NL_A_IPV6, .optional = true }, }; struct nlattr *attrs[ARRAY_SIZE(policy)]; @@ -274,6 +277,16 @@ route_table_parse(struct ofpbuf *buf, struct route_table_msg *change) } else if (ipv4) { in6_addr_set_mapped_ipv4(&change->rd.rta_dst, 0); } + if (attrs[RTA_PREFSRC]) { + if (ipv4) { + ovs_be32 prefsrc; + prefsrc = nl_attr_get_be32(attrs[RTA_PREFSRC]); + in6_addr_set_mapped_ipv4(&change->rd.rta_prefsrc, prefsrc); + } else { + change->rd.rta_prefsrc = + nl_attr_get_in6_addr(attrs[RTA_PREFSRC]); + } + } if (attrs[RTA_GATEWAY]) { if (ipv4) { ovs_be32 gw; @@ -309,7 +322,8 @@ route_table_handle_msg(const struct route_table_msg *change) const struct route_data *rd = &change->rd; ovs_router_insert(rd->mark, &rd->rta_dst, rd->rtm_dst_len, - rd->local, rd->ifname, &rd->rta_gw); + rd->local, rd->ifname, &rd->rta_gw, + &rd->rta_prefsrc); } } diff --git a/tests/system-route.at b/tests/system-route.at index 270956d13f6..114aaebc77f 100644 --- a/tests/system-route.at +++ b/tests/system-route.at @@ -25,3 +25,42 @@ OVS_WAIT_UNTIL([test `ovs-appctl ovs/route/show | grep -c 'p1-route'` -eq 0 ]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([ovs-route - add system route with src - ipv4]) +AT_KEYWORDS([route]) +OVS_TRAFFIC_VSWITCHD_START() +AT_CHECK([ip link set br0 up]) + +AT_CHECK([ip addr add 192.168.9.2/24 dev br0], [0], [stdout]) 
+AT_CHECK([ip addr add 192.168.9.3/24 dev br0], [0], [stdout]) + +AT_CHECK([ip route add 192.168.10.12/32 dev br0 via 192.168.9.1 src 192.168.9.2], [0], [stdout]) +AT_CHECK([ip route add 192.168.10.13/32 dev br0 via 192.168.9.1 src 192.168.9.3], [0], [stdout]) + +OVS_WAIT_UNTIL_EQUAL([ovs-appctl ovs/route/show | grep -E '192.168.10.1[[23]]/32' | sort], [dnl +Cached: 192.168.10.12/32 dev br0 GW 192.168.9.1 SRC 192.168.9.2 +Cached: 192.168.10.13/32 dev br0 GW 192.168.9.1 SRC 192.168.9.3]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([ovs-route - add system route with src - ipv6]) +AT_KEYWORDS([route]) +OVS_TRAFFIC_VSWITCHD_START() +AT_CHECK([ip link set br0 up]) + +AT_CHECK([ip -6 addr add fc00:db8:cafe::2/64 dev br0], [0], [stdout]) +AT_CHECK([ip -6 addr add fc00:db8:cafe::3/64 dev br0], [0], [stdout]) + +dnl If we try to add a route immediately after assigning ipv6 addresses, +dnl iproute2 would give us "Invalid source address" error, +dnl so wait a while to succeed. +OVS_WAIT_UNTIL([ip -6 route add fc00:db8:beef::12/128 via fc00:db8:cafe::1 dev br0 src fc00:db8:cafe::3]) +OVS_WAIT_UNTIL([ip -6 route add fc00:db8:beef::13/128 via fc00:db8:cafe::1 dev br0 src fc00:db8:cafe::2]) + +OVS_WAIT_UNTIL_EQUAL([ovs-appctl ovs/route/show | grep -E 'fc00:db8:beef::1[[23]]/128' | sort], [dnl +Cached: fc00:db8:beef::12/128 dev br0 GW fc00:db8:cafe::1 SRC fc00:db8:cafe::3 +Cached: fc00:db8:beef::13/128 dev br0 GW fc00:db8:cafe::1 SRC fc00:db8:cafe::2]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP From 51778134d4c8a84801230b1e5a7d59e180d9e8b5 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Mon, 13 Mar 2023 15:31:40 +0100 Subject: [PATCH 197/833] system-traffic: Fix conntrack test cases which are failing with af_xdp. The recently added test cases below are not passing on the af_xdp datapath due to tcpdump not working on the OVS ports with this datapath. 
conntrack - ICMP related NAT with single port conntrack - ICMPv6 related NAT with single port conntrack - ICMP from different source related with NAT The tests are changed to attach tcpdump on the associated veth port in the netns. Tests are now passing with all datapaths (afxdp, kernel, userspace, and offloads). Fixes: 8bd688063078 ("system-traffic.at: Add icmp error tests while dnatting address and port.") Fixes: 0a7587034dc9 ("conntrack: Properly unNAT inner header of related traffic.") Signed-off-by: Eelco Chaudron Acked-by: Ales Musil Signed-off-by: Ilya Maximets --- tests/system-traffic.at | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 380372430b6..2558f3b24d7 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -3581,7 +3581,7 @@ table=0,in_port=ovs-p1,ct_state=+trk+rel+rpl,icmp,actions=ovs-p0 AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) rm p0.pcap -OVS_DAEMONIZE([tcpdump -l -U -i ovs-p0 -w p0.pcap 2> tcpdump0_err], [tcpdump0.pid]) +NETNS_DAEMONIZE([at_ns0], [tcpdump -l -U -i p0 -w p0.pcap 2> tcpdump0_err], [tcpdump0.pid]) OVS_WAIT_UNTIL([grep "listening" tcpdump0_err]) dnl Send UDP packet from 10.1.1.1:1234 to 10.1.1.240:80 @@ -6612,7 +6612,7 @@ table=0,in_port=ovs-p1,ct_state=+trk+rel+rpl,icmp6,actions=ovs-p0 AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) rm p0.pcap -OVS_DAEMONIZE([tcpdump -l -U -i ovs-p0 -w p0.pcap 2> tcpdump0_err], [tcpdump0.pid]) +NETNS_DAEMONIZE([at_ns0], [tcpdump -l -U -i p0 -w p0.pcap 2> tcpdump0_err], [tcpdump0.pid]) OVS_WAIT_UNTIL([grep "listening" tcpdump0_err]) dnl Send UDP packet from [[fc00::1]]:1234 to [[fc00::240]]:80 @@ -7303,7 +7303,7 @@ table=2,in_port=ovs-server,ip,ct_state=+trk+rpl,actions=output:ovs-client AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) rm server.pcap -OVS_DAEMONIZE([tcpdump -l -U -i ovs-server -w server.pcap 2>tcpdump0_err], [tcpdump0.pid]) +NETNS_DAEMONIZE([server], [tcpdump -l -U -i 
server -w server.pcap 2>tcpdump0_err], [tcpdump0.pid]) OVS_WAIT_UNTIL([grep "listening" tcpdump0_err]) dnl Send UDP client->server @@ -7345,7 +7345,7 @@ dnl Check the ICMP error in reply direction AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=42]) rm client.pcap -OVS_DAEMONIZE([tcpdump -l -U -i ovs-client -w client.pcap 2>tcpdump1_err], [tcpdump1.pid]) +NETNS_DAEMONIZE([client], [tcpdump -l -U -i client -w client.pcap 2>tcpdump1_err], [tcpdump1.pid]) OVS_WAIT_UNTIL([grep "listening" tcpdump1_err]) dnl Send UDP client->server From 29720e378e96b27bc250aac9b287a67e023650fd Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Wed, 8 Mar 2023 13:55:44 +0100 Subject: [PATCH 198/833] ofproto-dpif-upcall: Wait for valid hw flow stats before applying min-revalidate-pps. Depending on the driver implementation, it can take from 0.2 seconds up to 2 seconds before offloaded flow statistics are updated. This is true for both TC and rte_flow-based offloading. This is causing a problem with min-revalidate-pps, as old statistic values are used during this period. This fix will wait for at least 2 seconds, by default, before assuming no packets were received during this period. 
Reviewed-by: Simon Horman Signed-off-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif-upcall.c | 25 +++++++++++++++---------- ofproto/ofproto-provider.h | 5 +++++ ofproto/ofproto.c | 10 ++++++++++ ofproto/ofproto.h | 2 ++ vswitchd/bridge.c | 3 +++ vswitchd/vswitch.xml | 13 +++++++++++++ 6 files changed, 48 insertions(+), 10 deletions(-) diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c index 4dab51dff0c..cac118c61de 100644 --- a/ofproto/ofproto-dpif-upcall.c +++ b/ofproto/ofproto-dpif-upcall.c @@ -2116,10 +2116,12 @@ ukey_delete(struct umap *umap, struct udpif_key *ukey) } static bool -should_revalidate(const struct udpif *udpif, uint64_t packets, - long long int used) +should_revalidate(const struct udpif *udpif, const struct udpif_key *ukey, + uint64_t packets) + OVS_REQUIRES(ukey->mutex) { long long int metric, now, duration; + long long int used = ukey->stats.used; if (!ofproto_min_revalidate_pps) { return true; @@ -2150,8 +2152,12 @@ should_revalidate(const struct udpif *udpif, uint64_t packets, duration = now - used; metric = duration / packets; - if (metric < 1000 / ofproto_min_revalidate_pps) { - /* The flow is receiving more than min-revalidate-pps, so keep it. */ + if (metric < 1000 / ofproto_min_revalidate_pps || + (ukey->offloaded && duration < ofproto_offloaded_stats_delay)) { + /* The flow is receiving more than min-revalidate-pps, so keep it. + * Or it's a hardware offloaded flow that might take up to X seconds + * to update its statistics. Until we are sure the statistics had a + * chance to be updated, also keep it. 
*/ return true; } return false; @@ -2355,7 +2361,7 @@ static enum reval_result revalidate_ukey(struct udpif *udpif, struct udpif_key *ukey, const struct dpif_flow_stats *stats, struct ofpbuf *odp_actions, uint64_t reval_seq, - struct recirc_refs *recircs, bool offloaded) + struct recirc_refs *recircs) OVS_REQUIRES(ukey->mutex) { bool need_revalidate = ukey->reval_seq != reval_seq; @@ -2381,7 +2387,7 @@ revalidate_ukey(struct udpif *udpif, struct udpif_key *ukey, } if (need_revalidate) { - if (should_revalidate(udpif, push.n_packets, ukey->stats.used)) { + if (should_revalidate(udpif, ukey, push.n_packets)) { if (!ukey->xcache) { ukey->xcache = xlate_cache_new(); } else { @@ -2397,7 +2403,7 @@ revalidate_ukey(struct udpif *udpif, struct udpif_key *ukey, /* Stats for deleted flows will be attributed upon flow deletion. Skip. */ if (result != UKEY_DELETE) { - xlate_push_stats(ukey->xcache, &push, offloaded); + xlate_push_stats(ukey->xcache, &push, ukey->offloaded); ukey->stats = *stats; ukey->reval_seq = reval_seq; } @@ -2853,8 +2859,7 @@ revalidate(struct revalidator *revalidator) result = UKEY_DELETE; } else { result = revalidate_ukey(udpif, ukey, &stats, &odp_actions, - reval_seq, &recircs, - f->attrs.offloaded); + reval_seq, &recircs); } ukey->dump_seq = dump_seq; @@ -2939,7 +2944,7 @@ revalidator_sweep__(struct revalidator *revalidator, bool purge) COVERAGE_INC(revalidate_missed_dp_flow); memcpy(&stats, &ukey->stats, sizeof stats); result = revalidate_ukey(udpif, ukey, &stats, &odp_actions, - reval_seq, &recircs, false); + reval_seq, &recircs); } if (result != UKEY_KEEP) { /* Clears 'recircs' if filled by revalidate_ukey(). */ diff --git a/ofproto/ofproto-provider.h b/ofproto/ofproto-provider.h index a84ddc1d06a..143ded6904e 100644 --- a/ofproto/ofproto-provider.h +++ b/ofproto/ofproto-provider.h @@ -541,6 +541,11 @@ extern unsigned ofproto_max_revalidator; * duration exceeds half of max-revalidator config variable. 
*/ extern unsigned ofproto_min_revalidate_pps; +/* Worst case delay (in ms) it might take before statistics of offloaded flows + * are updated. Offloaded flows younger than this delay will always be + * revalidated regardless of ofproto_min_revalidate_pps. */ +extern unsigned ofproto_offloaded_stats_delay; + /* Number of upcall handler and revalidator threads. Only affects the * ofproto-dpif implementation. */ extern uint32_t n_handlers, n_revalidators; diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c index 863b34d25bb..11cc0c6f602 100644 --- a/ofproto/ofproto.c +++ b/ofproto/ofproto.c @@ -311,6 +311,7 @@ unsigned ofproto_flow_limit = OFPROTO_FLOW_LIMIT_DEFAULT; unsigned ofproto_max_idle = OFPROTO_MAX_IDLE_DEFAULT; unsigned ofproto_max_revalidator = OFPROTO_MAX_REVALIDATOR_DEFAULT; unsigned ofproto_min_revalidate_pps = OFPROTO_MIN_REVALIDATE_PPS_DEFAULT; +unsigned ofproto_offloaded_stats_delay = OFPROTO_OFFLOADED_STATS_DELAY; uint32_t n_handlers, n_revalidators; @@ -727,6 +728,15 @@ ofproto_set_min_revalidate_pps(unsigned min_revalidate_pps) ofproto_min_revalidate_pps = min_revalidate_pps; } +/* Set worst case delay (in ms) it might take before statistics of offloaded + * flows are updated. Offloaded flows younger than this delay will always be + * revalidated regardless of ofproto_min_revalidate_pps. */ +void +ofproto_set_offloaded_stats_delay(unsigned offloaded_stats_delay) +{ + ofproto_offloaded_stats_delay = offloaded_stats_delay; +} + /* If forward_bpdu is true, the NORMAL action will forward frames with * reserved (e.g. STP) destination Ethernet addresses. if forward_bpdu is false, * the NORMAL action will drop these frames. 
*/ diff --git a/ofproto/ofproto.h b/ofproto/ofproto.h index c79f372bce5..8efdb20a072 100644 --- a/ofproto/ofproto.h +++ b/ofproto/ofproto.h @@ -320,6 +320,7 @@ int ofproto_port_dump_done(struct ofproto_port_dump *); #define OFPROTO_MAX_IDLE_DEFAULT 10000 /* ms */ #define OFPROTO_MAX_REVALIDATOR_DEFAULT 500 /* ms */ #define OFPROTO_MIN_REVALIDATE_PPS_DEFAULT 5 +#define OFPROTO_OFFLOADED_STATS_DELAY 2000 /* ms */ const char *ofproto_port_open_type(const struct ofproto *, const char *port_type); @@ -349,6 +350,7 @@ void ofproto_set_flow_limit(unsigned limit); void ofproto_set_max_idle(unsigned max_idle); void ofproto_set_max_revalidator(unsigned max_revalidator); void ofproto_set_min_revalidate_pps(unsigned min_revalidate_pps); +void ofproto_set_offloaded_stats_delay(unsigned offloaded_stats_delay); void ofproto_set_forward_bpdu(struct ofproto *, bool forward_bpdu); void ofproto_set_mac_table_config(struct ofproto *, unsigned idle_time, size_t max_entries); diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index 307a515279d..f5dc59ad06e 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -832,6 +832,9 @@ bridge_reconfigure(const struct ovsrec_open_vswitch *ovs_cfg) ofproto_set_min_revalidate_pps( smap_get_uint(&ovs_cfg->other_config, "min-revalidate-pps", OFPROTO_MIN_REVALIDATE_PPS_DEFAULT)); + ofproto_set_offloaded_stats_delay( + smap_get_uint(&ovs_cfg->other_config, "offloaded-stats-delay", + OFPROTO_OFFLOADED_STATS_DELAY)); ofproto_set_vlan_limit(smap_get_int(&ovs_cfg->other_config, "vlan-limit", LEGACY_MAX_VLAN_HEADERS)); ofproto_set_bundle_idle_timeout(smap_get_uint(&ovs_cfg->other_config, diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 12708a3131d..3e94b969ce7 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -216,6 +216,19 @@

+ +

+ Set worst case delay (in ms) it might take before statistics of + offloaded flows are updated. Offloaded flows younger than this + delay will always be revalidated regardless of + <ref column="other_config" key="min-revalidate-pps"/>. 

+

+ The default is 2000. +

+
+

From a4cd2afea5c3ba08f38379a7356ffb3bf5662d5c Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Thu, 9 Mar 2023 13:30:16 +0100 Subject: [PATCH 199/833] ofproto-dpif-upcall: Remove redundant time_msec() in revalidate(). Remove one of two consecutive time_msec() calls in the revalidate() function. We take the time stamp after udpif_get_n_flows(), to avoid any potential delays in getting the number of offloaded flows. Signed-off-by: Eelco Chaudron Reviewed-by: Simon Horman Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif-upcall.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c index cac118c61de..cd57fdbd9e6 100644 --- a/ofproto/ofproto-dpif-upcall.c +++ b/ofproto/ofproto-dpif-upcall.c @@ -2750,8 +2750,6 @@ revalidate(struct revalidator *revalidator) break; } - now = time_msec(); - /* In normal operation we want to keep flows around until they have * been idle for 'ofproto_max_idle' milliseconds. However: * @@ -2788,7 +2786,7 @@ revalidate(struct revalidator *revalidator) max_idle = n_dp_flows > flow_limit ? 100 : ofproto_max_idle; - udpif->dpif->current_ms = time_msec(); + udpif->dpif->current_ms = now = time_msec(); for (f = flows; f < &flows[n_dumped]; f++) { long long int used = f->stats.used; struct recirc_refs recircs = RECIRC_REFS_EMPTY_INITIALIZER; From ebe98c587deb1bc011d9ec6263dc25bed9df0d3b Mon Sep 17 00:00:00 2001 From: Ales Musil Date: Mon, 13 Mar 2023 08:16:34 +0100 Subject: [PATCH 200/833] dpctl: Fix flush-conntrack with datapath as argument. Specifying datapath with "dpctl/flush-conntrack" didn't work as expected and caused error: ovs-dpctl: field system@ovs-system missing value (Invalid argument) To prevent that, check if we have datapath as first argument and use it accordingly. Also add couple of test cases to ensure that everything works as expected. 
Fixes: a9ae73b916ba ("ofp, dpif: Allow CT flush based on partial match.") Signed-off-by: Ales Musil Reviewed-by: Roi Dayan Reviewed-by: Simon Horman Signed-off-by: Ilya Maximets --- lib/dpctl.c | 12 +++++++++--- tests/system-traffic.at | 42 +++++++++++++++++++++++++++++++++++++++++ utilities/ovs-ofctl.c | 4 ++++ 3 files changed, 55 insertions(+), 3 deletions(-) diff --git a/lib/dpctl.c b/lib/dpctl.c index c501a0cd76b..59cc4f58c98 100644 --- a/lib/dpctl.c +++ b/lib/dpctl.c @@ -1717,10 +1717,16 @@ dpctl_flush_conntrack(int argc, const char *argv[], uint16_t zone, *pzone = NULL; int error; int args = argc - 1; + int zone_pos = 1; + + if (dp_arg_exists(argc, argv)) { + args--; + zone_pos = 2; + } /* Parse zone. */ - if (args && !strncmp(argv[1], "zone=", 5)) { - if (!ovs_scan(argv[1], "zone=%"SCNu16, &zone)) { + if (args && !strncmp(argv[zone_pos], "zone=", 5)) { + if (!ovs_scan(argv[zone_pos], "zone=%"SCNu16, &zone)) { ds_put_cstr(&ds, "failed to parse zone"); error = EINVAL; goto error; @@ -1748,7 +1754,7 @@ dpctl_flush_conntrack(int argc, const char *argv[], } /* Report error if there is more than one unparsed argument. */ - if (args > 1) { + if (args > 0) { ds_put_cstr(&ds, "invalid arguments"); error = EINVAL; goto error; diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 2558f3b24d7..39a48175271 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -2360,8 +2360,10 @@ priority=100,in_port=2,icmp,action=ct(zone=5,commit),1 ]) AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) +dp=$(ovs-appctl dpctl/dump-dps) m4_foreach([FLUSH_CMD], [[ovs-appctl dpctl/flush-conntrack], + [ovs-appctl dpctl/flush-conntrack $dp], [ovs-ofctl ct-flush br0]], [ AS_BOX([Testing with FLUSH_CMD]) @@ -2504,8 +2506,48 @@ udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10. 
AT_CHECK([FLUSH_CMD zone=5 '' 'ct_nw_src=10.1.1.1']) AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [1]) + +dnl Test UDP from port 1 and 2, flush without arguments +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) + + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1" | sort], [0], [dnl +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1) +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 ]) +AT_CHECK([FLUSH_CMD]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [1]) +]) + +dnl Test flush with invalid arguments + +AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=invalid 'ct_nw_src=10.1.1.1' 'ct_nw_dst=10.1.1.1'], [2], [ignore], [stderr]) +AT_CHECK([grep -q "failed to parse zone" stderr]) + +AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=1 'ct_nw_src=10.1.1.1,invalid=invalid' 'ct_nw_dst=10.1.1.1'], [2], [ignore], [stderr]) +AT_CHECK([grep -q "invalid conntrack tuple field: invalid" stderr]) + +AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=1 'ct_nw_src=invalid' 'ct_nw_dst=10.1.1.1'], [2], [ignore], [stderr]) +AT_CHECK([grep -q "failed to parse field ct_nw_src" stderr]) + +AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=1 'ct_nw_src=10.1.1.1' 'ct_nw_dst=10.1.1.1' invalid], [2], [ignore], [stderr]) +AT_CHECK([grep -q "invalid arguments" stderr]) + +AT_CHECK([ovs-appctl dpctl/flush-conntrack $dp zone=1 'ct_nw_src=10.1.1.1' 'ct_nw_dst=10.1.1.1' invalid], [2], [ignore], [stderr]) +AT_CHECK([grep -q "command takes at most 4 arguments" stderr]) + +AT_CHECK([ovs-appctl dpctl/flush-conntrack $dp 'ct_nw_src=10.1.1.1' 
'ct_nw_dst=10.1.1.1' invalid], [2], [ignore], [stderr]) +AT_CHECK([grep -q "invalid arguments" stderr]) + +AT_CHECK([ovs-ofctl ct-flush br0 zone=1 'ct_nw_src=10.1.1.1' 'ct_nw_dst=10.1.1.1' invalid], [1], [ignore], [stderr]) +AT_CHECK([grep -q "command takes at most 4 arguments" stderr]) + +AT_CHECK([ovs-ofctl ct-flush br0 'ct_nw_src=10.1.1.1' 'ct_nw_dst=10.1.1.1' invalid], [1], [ignore], [stderr]) +AT_CHECK([grep -q "Invalid arguments" stderr]) + OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP diff --git a/utilities/ovs-ofctl.c b/utilities/ovs-ofctl.c index eabec18a367..3ce4e82ec0b 100644 --- a/utilities/ovs-ofctl.c +++ b/utilities/ovs-ofctl.c @@ -3089,6 +3089,10 @@ ofctl_ct_flush(struct ovs_cmdl_context *ctx) args--; } + if (args > 0) { + ovs_fatal(0, "Invalid arguments"); + } + open_vconn(ctx->argv[1], &vconn); enum ofp_version version = vconn_get_version(vconn); struct ofpbuf *msg = ofp_ct_match_encode(&match, pzone, version); From e90a0727f17f6ad915a32735a8c0b282f2c8cd6f Mon Sep 17 00:00:00 2001 From: Ales Musil Date: Mon, 13 Mar 2023 08:16:35 +0100 Subject: [PATCH 201/833] vswitch: Add missing documentation for "ct_flush" capability. Fixes: 08146bf7d9b4 ("openflow: Add extension to flush CT by generic match.") Signed-off-by: Ales Musil Reviewed-by: Simon Horman Signed-off-by: Ilya Maximets --- vswitchd/vswitch.xml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 3e94b969ce7..88e2c94e2f0 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -6314,6 +6314,12 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \ translated to an ephemeral port. If there is no collision, no SNAT is performed. + + True if the datapath supports CT flush OpenFlow Nicira extension + called NXT_CT_FLUSH. The NXT_CT_FLUSH + extensions allows to flush CT entries based on specified parameters. 
+ From 07cf5810de8da12c700324bc421bde92376abe06 Mon Sep 17 00:00:00 2001 From: Aaron Conole Date: Thu, 16 Mar 2023 08:00:39 -0400 Subject: [PATCH 202/833] dpdk: Allow retaining CAP_SYS_RAWIO privileges. Open vSwitch generally tries to let the underlying operating system manage the low level details of hardware, for example DMA mapping, bus arbitration, etc. However, when using DPDK, the underlying operating system yields control of many of these details to userspace for management. In the case of some DPDK port drivers, configuring rte_flow or even allocating resources may require access to iopl/ioperm calls, which are guarded by the CAP_SYS_RAWIO privilege on linux systems. These calls are dangerous, and can allow a process to completely compromise a system. However, they are needed in the case of some userspace driver code which manages the hardware (for example, the mlx implementation of backend support for rte_flow). Here, we create an opt-in flag passed to the command line to allow this access. We need to do this before ever accessing the database, because we want to drop all privileges asap, and cannot wait for a connection to the database to be established and functional before dropping. There may be distribution specific ways to do capability management as well (using for example, systemd), but they are not as universal to the vswitchd as a flag. 
Reviewed-by: Simon Horman Signed-off-by: Aaron Conole Acked-by: Flavio Leitner Acked-by: Gaetan Rivet Signed-off-by: Ilya Maximets --- NEWS | 4 ++++ lib/daemon-unix.c | 33 +++++++++++++++++++++++++-------- lib/daemon-windows.c | 6 ++++-- lib/daemon.c | 2 +- lib/daemon.h | 4 ++-- ovsdb/ovsdb-client.c | 6 +++--- ovsdb/ovsdb-server.c | 4 ++-- tests/test-netflow.c | 2 +- tests/test-sflow.c | 2 +- tests/test-unixctl.c | 2 +- utilities/ovs-ofctl.c | 4 ++-- utilities/ovs-testcontroller.c | 4 ++-- vswitchd/ovs-vswitchd.8.in | 9 +++++++++ vswitchd/ovs-vswitchd.c | 11 ++++++++++- 14 files changed, 67 insertions(+), 26 deletions(-) diff --git a/NEWS b/NEWS index 72b9024e6d8..8771ee618ae 100644 --- a/NEWS +++ b/NEWS @@ -17,6 +17,10 @@ Post-v3.1.0 in order to create OVSDB sockets with access mode of 0770. - QoS: * Added new configuration option 'jitter' for a linux-netem QoS type. + - DPDK: + * ovs-vswitchd will keep the CAP_SYS_RAWIO capability when started + with the --hw-rawio-access command line option. This allows the + process extra privileges when mapping physical interconnect memory. v3.1.0 - 16 Feb 2023 diff --git a/lib/daemon-unix.c b/lib/daemon-unix.c index 1a7ba427d7a..4fdc6e3c496 100644 --- a/lib/daemon-unix.c +++ b/lib/daemon-unix.c @@ -88,7 +88,8 @@ static bool switch_user = false; static uid_t uid; static gid_t gid; static char *user = NULL; -static void daemon_become_new_user__(bool access_datapath); +static void daemon_become_new_user__(bool access_datapath, + bool access_hardware_ports); static void check_already_running(void); static int lock_pidfile(FILE *, int command); @@ -443,13 +444,13 @@ monitor_daemon(pid_t daemon_pid) * daemonize_complete()) or that it failed to start up (by exiting with a * nonzero exit code). 
*/ void -daemonize_start(bool access_datapath) +daemonize_start(bool access_datapath, bool access_hardware_ports) { assert_single_threaded(); daemonize_fd = -1; if (switch_user) { - daemon_become_new_user__(access_datapath); + daemon_become_new_user__(access_datapath, access_hardware_ports); switch_user = false; } @@ -807,7 +808,8 @@ daemon_become_new_user_unix(void) /* Linux specific implementation of daemon_become_new_user() * using libcap-ng. */ static void -daemon_become_new_user_linux(bool access_datapath OVS_UNUSED) +daemon_become_new_user_linux(bool access_datapath OVS_UNUSED, + bool access_hardware_ports OVS_UNUSED) { #if defined __linux__ && HAVE_LIBCAPNG int ret; @@ -827,6 +829,20 @@ daemon_become_new_user_linux(bool access_datapath OVS_UNUSED) ret = capng_update(CAPNG_ADD, cap_sets, CAP_NET_ADMIN) || capng_update(CAPNG_ADD, cap_sets, CAP_NET_RAW) || capng_update(CAPNG_ADD, cap_sets, CAP_NET_BROADCAST); +#ifdef DPDK_NETDEV + if (access_hardware_ports && !ret) { + ret = capng_update(CAPNG_ADD, cap_sets, CAP_SYS_RAWIO); + if (!ret) { + VLOG_INFO("The Linux capability CAP_SYS_RAWIO " + "is enabled."); + } + } +#else + if (access_hardware_ports) { + VLOG_WARN("No driver requires Linux capability " + "CAP_SYS_RAWIO, disabling it."); + } +#endif } } else { ret = -1; @@ -854,7 +870,7 @@ daemon_become_new_user_linux(bool access_datapath OVS_UNUSED) } static void -daemon_become_new_user__(bool access_datapath) +daemon_become_new_user__(bool access_datapath, bool access_hardware_ports) { /* If vlog file has been created, change its owner to the non-root user * as specifed by the --user option. */ @@ -862,7 +878,8 @@ daemon_become_new_user__(bool access_datapath) if (LINUX) { if (LIBCAPNG) { - daemon_become_new_user_linux(access_datapath); + daemon_become_new_user_linux(access_datapath, + access_hardware_ports); } else { VLOG_FATAL("%s: fail to downgrade user using libcap-ng. 
" "(libcap-ng is not configured at compile time), " @@ -877,11 +894,11 @@ daemon_become_new_user__(bool access_datapath) * However, there in case the user switch needs to be done * before daemonize_start(), the following API can be used. */ void -daemon_become_new_user(bool access_datapath) +daemon_become_new_user(bool access_datapath, bool access_hardware_ports) { assert_single_threaded(); if (switch_user) { - daemon_become_new_user__(access_datapath); + daemon_become_new_user__(access_datapath, access_hardware_ports); /* daemonize_start() should not switch user again. */ switch_user = false; } diff --git a/lib/daemon-windows.c b/lib/daemon-windows.c index 7e5f264f5b9..4e6bbe0f040 100644 --- a/lib/daemon-windows.c +++ b/lib/daemon-windows.c @@ -498,7 +498,8 @@ make_pidfile(void) } void -daemonize_start(bool access_datapath OVS_UNUSED) +daemonize_start(bool access_datapath OVS_UNUSED, + bool access_hardware_ports OVS_UNUSED) { if (pidfile) { make_pidfile(); @@ -526,7 +527,8 @@ daemonize_complete(void) } void -daemon_become_new_user(bool access_datapath OVS_UNUSED) +daemon_become_new_user(bool access_datapath OVS_UNUSED, + bool access_hardware_ports OVS_UNUSED) { } diff --git a/lib/daemon.c b/lib/daemon.c index 3249c5ab4b5..1e1c019eb1b 100644 --- a/lib/daemon.c +++ b/lib/daemon.c @@ -48,7 +48,7 @@ get_detach(void) void daemonize(void) { - daemonize_start(false); + daemonize_start(false, false); daemonize_complete(); } diff --git a/lib/daemon.h b/lib/daemon.h index 09415749636..42372d14630 100644 --- a/lib/daemon.h +++ b/lib/daemon.h @@ -167,10 +167,10 @@ void set_detach(void); bool get_detach(void); void daemon_save_fd(int fd); void daemonize(void); -void daemonize_start(bool access_datapath); +void daemonize_start(bool access_datapath, bool access_hardware_ports); void daemonize_complete(void); void daemon_set_new_user(const char * user_spec); -void daemon_become_new_user(bool access_datapath); +void daemon_become_new_user(bool access_datapath, bool 
access_hardware_ports); void daemon_usage(void); void daemon_disable_self_confinement(void); bool daemon_should_self_confine(void); diff --git a/ovsdb/ovsdb-client.c b/ovsdb/ovsdb-client.c index f1b8d649105..bae2c5f0414 100644 --- a/ovsdb/ovsdb-client.c +++ b/ovsdb/ovsdb-client.c @@ -250,7 +250,7 @@ main(int argc, char *argv[]) parse_options(argc, argv); fatal_ignore_sigpipe(); - daemon_become_new_user(false); + daemon_become_new_user(false, false); if (optind >= argc) { ovs_fatal(0, "missing command name; use --help for help"); } @@ -1392,7 +1392,7 @@ do_monitor__(struct jsonrpc *rpc, const char *database, daemon_save_fd(STDOUT_FILENO); daemon_save_fd(STDERR_FILENO); - daemonize_start(false); + daemonize_start(false, false); if (get_detach()) { int error; @@ -2276,7 +2276,7 @@ do_lock(struct jsonrpc *rpc, const char *method, const char *lock) getting a reply of the previous request. */ daemon_save_fd(STDOUT_FILENO); - daemonize_start(false); + daemonize_start(false, false); lock_req_init(&lock_req, method, lock); if (get_detach()) { diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c index 33ca4910d70..4fea2dbda7b 100644 --- a/ovsdb/ovsdb-server.c +++ b/ovsdb/ovsdb-server.c @@ -341,7 +341,7 @@ main(int argc, char *argv[]) &run_command, &sync_from, &sync_exclude, &active); is_backup = sync_from && !active; - daemon_become_new_user(false); + daemon_become_new_user(false, false); /* Create and initialize 'config_tmpfile' as a temporary file to hold * ovsdb-server's most basic configuration, and then save our initial @@ -359,7 +359,7 @@ main(int argc, char *argv[]) save_config__(config_tmpfile, &remotes, &db_filenames, sync_from, sync_exclude, is_backup); - daemonize_start(false); + daemonize_start(false, false); /* Load the saved config. 
*/ load_config(config_tmpfile, &remotes, &db_filenames, &sync_from, diff --git a/tests/test-netflow.c b/tests/test-netflow.c index d2322d4509a..7f89cfcae0d 100644 --- a/tests/test-netflow.c +++ b/tests/test-netflow.c @@ -195,7 +195,7 @@ test_netflow_main(int argc, char *argv[]) } daemon_save_fd(STDOUT_FILENO); - daemonize_start(false); + daemonize_start(false, false); error = unixctl_server_create(NULL, &server); if (error) { diff --git a/tests/test-sflow.c b/tests/test-sflow.c index 460d4d6c54d..3c617bdd168 100644 --- a/tests/test-sflow.c +++ b/tests/test-sflow.c @@ -709,7 +709,7 @@ test_sflow_main(int argc, char *argv[]) } daemon_save_fd(STDOUT_FILENO); - daemonize_start(false); + daemonize_start(false, false); error = unixctl_server_create(NULL, &server); if (error) { diff --git a/tests/test-unixctl.c b/tests/test-unixctl.c index 3eadf54cd90..9e89827895a 100644 --- a/tests/test-unixctl.c +++ b/tests/test-unixctl.c @@ -83,7 +83,7 @@ test_unixctl_main(int argc, char *argv[]) fatal_ignore_sigpipe(); parse_options(&argc, &argv, &unixctl_path); - daemonize_start(false); + daemonize_start(false, false); int retval = unixctl_server_create(unixctl_path, &unixctl); if (retval) { exit(EXIT_FAILURE); diff --git a/utilities/ovs-ofctl.c b/utilities/ovs-ofctl.c index 3ce4e82ec0b..24d0941cf2e 100644 --- a/utilities/ovs-ofctl.c +++ b/utilities/ovs-ofctl.c @@ -173,7 +173,7 @@ main(int argc, char *argv[]) ctx.argc = argc - optind; ctx.argv = argv + optind; - daemon_become_new_user(false); + daemon_become_new_user(false, false); if (read_only) { ovs_cmdl_run_command_read_only(&ctx, get_all_commands()); } else { @@ -2127,7 +2127,7 @@ monitor_vconn(struct vconn *vconn, bool reply_to_echo_requests, int error; daemon_save_fd(STDERR_FILENO); - daemonize_start(false); + daemonize_start(false, false); error = unixctl_server_create(unixctl_path, &server); if (error) { ovs_fatal(error, "failed to create unixctl server"); diff --git a/utilities/ovs-testcontroller.c 
b/utilities/ovs-testcontroller.c index b489ff5fc7a..9f2fbfdf51e 100644 --- a/utilities/ovs-testcontroller.c +++ b/utilities/ovs-testcontroller.c @@ -109,7 +109,7 @@ main(int argc, char *argv[]) parse_options(argc, argv); fatal_ignore_sigpipe(); - daemon_become_new_user(false); + daemon_become_new_user(false, false); if (argc - optind < 1) { ovs_fatal(0, "at least one vconn argument required; " @@ -148,7 +148,7 @@ main(int argc, char *argv[]) ovs_fatal(0, "no active or passive switch connections"); } - daemonize_start(false); + daemonize_start(false, false); retval = unixctl_server_create(unixctl_path, &unixctl); if (retval) { diff --git a/vswitchd/ovs-vswitchd.8.in b/vswitchd/ovs-vswitchd.8.in index 9569265fcb6..10c6e077bac 100644 --- a/vswitchd/ovs-vswitchd.8.in +++ b/vswitchd/ovs-vswitchd.8.in @@ -81,6 +81,15 @@ unavailable or unsuccessful. .SS "DPDK Options" For details on initializing \fBovs\-vswitchd\fR to use DPDK ports, refer to the documentation or \fBovs\-vswitchd.conf.db\fR(5). +.SS "DPDK HW Access Options" +.IP "\fB\-\-hw\-rawio\-access\fR" +Tells \fBovs\-vswitchd\fR to retain the \fBCAP_SYS_RAWIO\fR capability, +to allow userspace drivers access to raw hardware memory. This will +also allow the \fBovs\-vswitchd\fR daemon to call \fBiopl()\fR and +\fBioperm()\fR functions as well as access memory devices to set port +access. This is a \fBvery\fR powerful capability, so generally only +enable as needed for specific hardware (for example mlx5 with full +hardware offload via rte_flow). .SS "Daemon Options" .ds DD \ \fBovs\-vswitchd\fR detaches only after it has connected to the \ diff --git a/vswitchd/ovs-vswitchd.c b/vswitchd/ovs-vswitchd.c index 407bfc60eb6..a244d2f7095 100644 --- a/vswitchd/ovs-vswitchd.c +++ b/vswitchd/ovs-vswitchd.c @@ -60,6 +60,9 @@ VLOG_DEFINE_THIS_MODULE(vswitchd); * the kernel from paging any of its memory to disk. */ static bool want_mlockall; +/* --hw-rawio-access: If set, retains CAP_SYS_RAWIO privileges. 
*/ +static bool hw_rawio_access; + static unixctl_cb_func ovs_vswitchd_exit; static char *parse_options(int argc, char *argv[], char **unixctl_path); @@ -89,7 +92,7 @@ main(int argc, char *argv[]) remote = parse_options(argc, argv, &unixctl_path); fatal_ignore_sigpipe(); - daemonize_start(true); + daemonize_start(true, hw_rawio_access); if (want_mlockall) { #ifdef HAVE_MLOCKALL @@ -169,6 +172,7 @@ parse_options(int argc, char *argv[], char **unixctl_pathp) OPT_DPDK, SSL_OPTION_ENUMS, OPT_DUMMY_NUMA, + OPT_HW_RAWIO_ACCESS, }; static const struct option long_options[] = { {"help", no_argument, NULL, 'h'}, @@ -185,6 +189,7 @@ parse_options(int argc, char *argv[], char **unixctl_pathp) {"disable-system-route", no_argument, NULL, OPT_DISABLE_SYSTEM_ROUTE}, {"dpdk", optional_argument, NULL, OPT_DPDK}, {"dummy-numa", required_argument, NULL, OPT_DUMMY_NUMA}, + {"hw-rawio-access", no_argument, NULL, OPT_HW_RAWIO_ACCESS}, {NULL, 0, NULL, 0}, }; char *short_options = ovs_cmdl_long_options_to_short_options(long_options); @@ -249,6 +254,10 @@ parse_options(int argc, char *argv[], char **unixctl_pathp) ovs_numa_set_dummy(optarg); break; + case OPT_HW_RAWIO_ACCESS: + hw_rawio_access = true; + break; + default: abort(); } From b3935cf90e31da0bb9ce8a99df62616c64446a49 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Tue, 21 Mar 2023 09:56:16 -0400 Subject: [PATCH 203/833] tests/mfex: Retain support for cryptography pre-v37. Prior to v37.0.0, CryptographyDeprecationWarning could not be imported from __init__.py resulting in: Traceback (most recent call last): File "mfex_fuzzy.py", line 9, in category=cryptography.CryptographyDeprecationWarning, AttributeError: module 'cryptography' has no attribute 'CryptographyDeprecationWarning' This import was only added to __init__ to deprecate python3.6. Importing the exception from cryptography.utils is the compatible option. 
Fixes: c3ed0bf34b8a ("tests/mfex: Silence Blowfish/CAST5 deprecation warnings.") Acked-by: Eelco Chaudron Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- tests/mfex_fuzzy.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/mfex_fuzzy.py b/tests/mfex_fuzzy.py index 15f7f4e517b..30028ba7a04 100755 --- a/tests/mfex_fuzzy.py +++ b/tests/mfex_fuzzy.py @@ -3,12 +3,13 @@ import sys import warnings -import cryptography +from cryptography.utils import CryptographyDeprecationWarning warnings.filterwarnings( "ignore", - category=cryptography.CryptographyDeprecationWarning, + category=CryptographyDeprecationWarning, message=r"(blowfish|cast5)", ) + # flake8: noqa: E402 from scapy.all import RandMAC, RandIP, PcapWriter, RandIP6, RandShort, fuzz from scapy.all import IPv6, Dot1Q, IP, Ether, UDP, TCP, random From d53ee36aa6e8c309adb0b26b80fafa5d7eb3996a Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Mon, 20 Mar 2023 12:56:49 +0100 Subject: [PATCH 204/833] netdev-offload-tc: Fix parse_tc_flower_to_actions() reporting errors. parse_tc_flower_to_actions() was not reporting errors, which would cause parse_tc_flower_to_match() to ignore them. 
Fixes: dd03672f7bbb ("netdev-offload-tc: Move flower_to_match action handling to isolated function.") Signed-off-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- lib/netdev-offload-tc.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c index 4fb9d9f2127..247c1ff8b72 100644 --- a/lib/netdev-offload-tc.c +++ b/lib/netdev-offload-tc.c @@ -871,7 +871,7 @@ parse_tc_flower_to_actions__(struct tc_flower *flower, struct ofpbuf *buf, outport = netdev_ifindex_to_odp_port(action->out.ifindex_out); if (!outport) { - return ENOENT; + return -ENOENT; } } nl_msg_put_u32(buf, OVS_ACTION_ATTR_OUTPUT, odp_to_u32(outport)); @@ -964,7 +964,7 @@ parse_tc_flower_to_actions__(struct tc_flower *flower, struct ofpbuf *buf, uint32_t meter_id; if (police_idx_lookup(action->police.index, &meter_id)) { - return ENOENT; + return -ENOENT; } nl_msg_put_u32(buf, OVS_ACTION_ATTR_METER, meter_id); } @@ -983,6 +983,9 @@ parse_tc_flower_to_actions__(struct tc_flower *flower, struct ofpbuf *buf, buf, OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER); i = parse_tc_flower_to_actions__(flower, buf, i + 1, action->police.result_jump); + if (i < 0) { + return i; + } nl_msg_end_nested(buf, act_offset); act_offset = nl_msg_start_nested( @@ -994,6 +997,9 @@ parse_tc_flower_to_actions__(struct tc_flower *flower, struct ofpbuf *buf, } if (jump != 0) { i = parse_tc_flower_to_actions__(flower, buf, i, jump); + if (i < 0) { + return i; + } } nl_msg_end_nested(buf, act_offset); @@ -1013,11 +1019,11 @@ parse_tc_flower_to_actions__(struct tc_flower *flower, struct ofpbuf *buf, return i; } -static void +static int parse_tc_flower_to_actions(struct tc_flower *flower, struct ofpbuf *buf) { - parse_tc_flower_to_actions__(flower, buf, 0, 0); + return parse_tc_flower_to_actions__(flower, buf, 0, 0); } static int @@ -1030,9 +1036,10 @@ parse_tc_flower_to_match(const struct netdev *netdev, struct ofpbuf *buf, bool terse) { - 
size_t act_off; struct tc_flower_key *key = &flower->key; struct tc_flower_key *mask = &flower->mask; + size_t act_off; + int err; if (terse) { return parse_tc_flower_terse_to_match(flower, match, stats, attrs); @@ -1229,7 +1236,10 @@ parse_tc_flower_to_match(const struct netdev *netdev, } act_off = nl_msg_start_nested(buf, OVS_FLOW_ATTR_ACTIONS); - parse_tc_flower_to_actions(flower, buf); + err = parse_tc_flower_to_actions(flower, buf); + if (err < 0) { + return -err; + } nl_msg_end_nested(buf, act_off); *actions = ofpbuf_at_assert(buf, act_off, sizeof(struct nlattr)); @@ -2490,15 +2500,23 @@ netdev_tc_flow_get(struct netdev *netdev, err = tc_get_flower(&id, &flower); if (err) { - VLOG_ERR_RL(&error_rl, "flow get failed (dev %s prio %d handle %d): %s", + VLOG_ERR_RL(&error_rl, + "flow get failed (dev %s prio %d handle %d): %s", netdev_get_name(netdev), id.prio, id.handle, ovs_strerror(err)); return err; } in_port = netdev_ifindex_to_odp_port(id.ifindex); - parse_tc_flower_to_match(netdev, &flower, match, actions, - stats, attrs, buf, false); + err = parse_tc_flower_to_match(netdev, &flower, match, actions, + stats, attrs, buf, false); + if (err) { + VLOG_ERR_RL(&error_rl, + "flow get parse failed (dev %s prio %d handle %d): %s", + netdev_get_name(netdev), id.prio, id.handle, + ovs_strerror(err)); + return err; + } if (stats) { struct dpif_flow_stats adjust_stats; From 79f936744916cfbc6952308d33012c994eb274de Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Mon, 27 Mar 2023 10:40:10 +0200 Subject: [PATCH 205/833] dpif-netlink: Always create at least 1 handler. Ensure at least 1 handler is created even if something goes wrong during cpu detection or prime number calculation. 
Fixes: a5cacea5f988 ("handlers: Create additional handler threads when using CPU isolation.") Suggested-by: Aaron Conole Acked-by: Mike Pattrick Acked-by: Michael Santana Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- lib/dpif-netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c index 7875e573e64..ebe7b5cb145 100644 --- a/lib/dpif-netlink.c +++ b/lib/dpif-netlink.c @@ -2582,7 +2582,7 @@ dpif_netlink_calculate_n_handlers(void) n_handlers = MIN(next_prime_num, total_cores); } - return n_handlers; + return MAX(n_handlers, 1); } static int From b354cee2e0a79a5fbe7140e964bb30e7c57919d6 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Mon, 27 Mar 2023 10:40:11 +0200 Subject: [PATCH 206/833] ovs-thread: Fix cpus not read for the first 10s. With the current implementation the available CPUs will not be read until 10s have passed since the system's boot. For systems that boot faster, this can make ovs-vswitchd create fewer handlers than necessary for some time. Fixes: 0d23948a598a ("ovs-thread: Detect changes in number of CPUs.") Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2180460 Suggested-by: Ilya Maximets Acked-by: Mike Pattrick Acked-by: Michael Santana Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- lib/ovs-thread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/ovs-thread.c b/lib/ovs-thread.c index 2d382f1e8bc..ac5d2c3d029 100644 --- a/lib/ovs-thread.c +++ b/lib/ovs-thread.c @@ -674,7 +674,7 @@ count_cpu_cores(void) static int cpu_cores; ovs_mutex_lock(&cpu_cores_mutex); - if (now - last_updated >= COUNT_CPU_UPDATE_TIME_MS) { + if (!last_updated || now - last_updated >= COUNT_CPU_UPDATE_TIME_MS) { last_updated = now; cpu_cores = count_cpu_cores__(); } From 0db74e0eb41b1cd105d7ef2d27d55af47cc7cf87 Mon Sep 17 00:00:00 2001 From: Nobuhiro MIKI Date: Wed, 29 Mar 2023 14:51:14 +0900 Subject: [PATCH 207/833] tests: Define new ADD_VETH_NS macro. 
The new ADD_VETH_NS macro creates two netns and connects them with a veth pair. We can use it for testing in a generic purpose. e.g. ADD_VETH_NS([ns1], [p1], [1.1.1.1/24], [ns2], [p2], [1.1.1.2/24]) Signed-off-by: Nobuhiro MIKI Signed-off-by: Ilya Maximets --- tests/system-common-macros.at | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/system-common-macros.at b/tests/system-common-macros.at index 8b9f5c75254..0077a8609c0 100644 --- a/tests/system-common-macros.at +++ b/tests/system-common-macros.at @@ -126,6 +126,22 @@ m4_define([ADD_VETH_BOND], ] ) +# ADD_VETH_NS([ns1], [port1], [ip_addr1], [ns2], [port2], [ip_addr2]) +# +# Add a pair of veth ports in 'ns1' and 'ns2'. The port names are 'port1' +# and 'port2' respectively, and the IP addresses 'ip_addr1' and 'ip_addr2' +# are assigned to each port. +m4_define([ADD_VETH_NS], + [ AT_CHECK([ip link add $2 type veth peer name $5]), + AT_CHECK([ip link set $2 netns $1]) + AT_CHECK([ip link set $5 netns $4]) + NS_CHECK_EXEC([$1], [ip link set $2 up]) + NS_CHECK_EXEC([$4], [ip link set $5 up]) + NS_CHECK_EXEC([$1], [ip addr add $3 dev $2]) + NS_CHECK_EXEC([$4], [ip addr add $6 dev $5]) + ] +) + # ADD_VLAN([port], [namespace], [vlan-id], [ip-addr]) # # Add a VLAN device named 'port' within 'namespace'. It will be configured From 57b9fc50dd2c9fe929a84306994fd9d943dc4246 Mon Sep 17 00:00:00 2001 From: Nobuhiro MIKI Date: Wed, 29 Mar 2023 14:51:15 +0900 Subject: [PATCH 208/833] tnl-ports: Support multiple nw_protos. In some tunnels, inner packet needs to support both IPv4 and IPv6. Therefore, this patch improves to allow two protocols to be tied together in one tunneling. 
Signed-off-by: Nobuhiro MIKI Signed-off-by: Ilya Maximets --- lib/tnl-ports.c | 80 +++++++++++++++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 32 deletions(-) diff --git a/lib/tnl-ports.c b/lib/tnl-ports.c index 050eafa6b8c..829457ee50f 100644 --- a/lib/tnl-ports.c +++ b/lib/tnl-ports.c @@ -161,40 +161,28 @@ map_insert_ipdev__(struct ip_device *ip_dev, char dev_name[], } } -static uint8_t -tnl_type_to_nw_proto(const char type[]) +static void +tnl_type_to_nw_proto(const char type[], uint8_t nw_protos[2]) { - if (!strcmp(type, "geneve")) { - return IPPROTO_UDP; - } - if (!strcmp(type, "stt")) { - return IPPROTO_TCP; - } - if (!strcmp(type, "gre") || !strcmp(type, "erspan") || - !strcmp(type, "ip6erspan") || !strcmp(type, "ip6gre")) { - return IPPROTO_GRE; - } - if (!strcmp(type, "vxlan")) { - return IPPROTO_UDP; - } - if (!strcmp(type, "gtpu")) { - return IPPROTO_UDP; + nw_protos[0] = nw_protos[1] = 0; + + if (!strcmp(type, "geneve") || !strcmp(type, "vxlan") || + !strcmp(type, "gtpu")) { + nw_protos[0] = IPPROTO_UDP; + } else if (!strcmp(type, "stt")) { + nw_protos[0] = IPPROTO_TCP; + } else if (!strcmp(type, "gre") || !strcmp(type, "erspan") || + !strcmp(type, "ip6erspan") || !strcmp(type, "ip6gre")) { + nw_protos[0] = IPPROTO_GRE; } - return 0; } -void -tnl_port_map_insert(odp_port_t port, ovs_be16 tp_port, - const char dev_name[], const char type[]) +static void +tnl_port_map_insert__(odp_port_t port, ovs_be16 tp_port, + const char dev_name[], uint8_t nw_proto) { struct tnl_port *p; struct ip_device *ip_dev; - uint8_t nw_proto; - - nw_proto = tnl_type_to_nw_proto(type); - if (!nw_proto) { - return; - } ovs_mutex_lock(&mutex); LIST_FOR_EACH(p, node, &port_list) { @@ -220,6 +208,22 @@ tnl_port_map_insert(odp_port_t port, ovs_be16 tp_port, ovs_mutex_unlock(&mutex); } +void +tnl_port_map_insert(odp_port_t port, ovs_be16 tp_port, + const char dev_name[], const char type[]) +{ + uint8_t nw_protos[2]; + int i; + + tnl_type_to_nw_proto(type, 
nw_protos); + + for (i = 0; i < 2; i++) { + if (nw_protos[i]) { + tnl_port_map_insert__(port, tp_port, dev_name, nw_protos[i]); + } + } +} + static void tnl_port_unref(const struct cls_rule *cr) { @@ -256,14 +260,11 @@ ipdev_map_delete(struct ip_device *ip_dev, ovs_be16 tp_port, uint8_t nw_proto) } } -void -tnl_port_map_delete(odp_port_t port, const char type[]) +static void +tnl_port_map_delete__(odp_port_t port, uint8_t nw_proto) { struct tnl_port *p; struct ip_device *ip_dev; - uint8_t nw_proto; - - nw_proto = tnl_type_to_nw_proto(type); ovs_mutex_lock(&mutex); LIST_FOR_EACH_SAFE (p, node, &port_list) { @@ -280,6 +281,21 @@ tnl_port_map_delete(odp_port_t port, const char type[]) ovs_mutex_unlock(&mutex); } +void +tnl_port_map_delete(odp_port_t port, const char type[]) +{ + uint8_t nw_protos[2]; + int i; + + tnl_type_to_nw_proto(type, nw_protos); + + for (i = 0; i < 2; i++) { + if (nw_protos[i]) { + tnl_port_map_delete__(port, nw_protos[i]); + } + } +} + /* 'flow' is non-const to allow for temporary modifications during the lookup. * Any changes are restored before returning. */ odp_port_t From 349112f975ed3a9876d7bde92ba0622d2384f0c4 Mon Sep 17 00:00:00 2001 From: Nobuhiro MIKI Date: Wed, 29 Mar 2023 14:51:16 +0900 Subject: [PATCH 209/833] flow: Support rt_hdr in parse_ipv6_ext_hdrs(). Checks whether IPPROTO_ROUTING exists in the IPv6 extension headers. If it exists, the first address is retrieved. If NULL is specified for "frag_hdr" and/or "rt_hdr", those addresses in the header are not reported to the caller. Of course, "frag_hdr" and "rt_hdr" are properly parsed inside this function. 
Signed-off-by: Nobuhiro MIKI Signed-off-by: Ilya Maximets --- lib/conntrack.c | 4 ++-- lib/flow.c | 48 +++++++++++++++++++++++++++++++++++++----------- lib/flow.h | 3 ++- lib/ipf.c | 15 ++++++++------- lib/packets.h | 9 +++++++++ 5 files changed, 58 insertions(+), 21 deletions(-) diff --git a/lib/conntrack.c b/lib/conntrack.c index 8cf7779c670..f86fa26f466 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -1617,8 +1617,8 @@ extract_l3_ipv6(struct conn_key *key, const void *data, size_t size, uint8_t nw_proto = ip6->ip6_nxt; uint8_t nw_frag = 0; - const struct ovs_16aligned_ip6_frag *frag_hdr; - if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag, &frag_hdr)) { + if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag, + NULL, NULL)) { return false; } diff --git a/lib/flow.c b/lib/flow.c index c3a3aa3ce45..9501a259e9d 100644 --- a/lib/flow.c +++ b/lib/flow.c @@ -479,9 +479,17 @@ parse_icmpv6(const void **datap, size_t *sizep, static inline bool parse_ipv6_ext_hdrs__(const void **datap, size_t *sizep, uint8_t *nw_proto, uint8_t *nw_frag, - const struct ovs_16aligned_ip6_frag **frag_hdr) + const struct ovs_16aligned_ip6_frag **frag_hdr, + const struct ip6_rt_hdr **rt_hdr) { - *frag_hdr = NULL; + if (frag_hdr) { + *frag_hdr = NULL; + } + + if (rt_hdr) { + *rt_hdr = NULL; + } + while (1) { if (OVS_LIKELY((*nw_proto != IPPROTO_HOPOPTS) && (*nw_proto != IPPROTO_ROUTING) @@ -504,7 +512,6 @@ parse_ipv6_ext_hdrs__(const void **datap, size_t *sizep, uint8_t *nw_proto, } if ((*nw_proto == IPPROTO_HOPOPTS) - || (*nw_proto == IPPROTO_ROUTING) || (*nw_proto == IPPROTO_DSTOPTS)) { /* These headers, while different, have the fields we care * about in the same location and with the same @@ -515,6 +522,18 @@ parse_ipv6_ext_hdrs__(const void **datap, size_t *sizep, uint8_t *nw_proto, (ext_hdr->ip6e_len + 1) * 8))) { return false; } + } else if (*nw_proto == IPPROTO_ROUTING) { + const struct ip6_rt_hdr *tmp; + if (!rt_hdr) { + rt_hdr = &tmp; + } + + *rt_hdr = *datap; + 
*nw_proto = (*rt_hdr)->nexthdr; + if (OVS_UNLIKELY(!data_try_pull(datap, sizep, + ((*rt_hdr)->hdrlen + 1) * 8))) { + return false; + } } else if (*nw_proto == IPPROTO_AH) { /* A standard AH definition isn't available, but the fields * we care about are in the same location as the generic @@ -527,6 +546,11 @@ parse_ipv6_ext_hdrs__(const void **datap, size_t *sizep, uint8_t *nw_proto, return false; } } else if (*nw_proto == IPPROTO_FRAGMENT) { + const struct ovs_16aligned_ip6_frag *tmp; + if (!frag_hdr) { + frag_hdr = &tmp; + } + *frag_hdr = *datap; *nw_proto = (*frag_hdr)->ip6f_nxt; @@ -561,15 +585,19 @@ parse_ipv6_ext_hdrs__(const void **datap, size_t *sizep, uint8_t *nw_proto, * has FLOW_NW_FRAG_LATER set. Both first and later fragments have * FLOW_NW_FRAG_ANY set in 'nw_frag'. * + * If a routing header is found, '*rt_hdr' is set to the routing + * header and otherwise set to NULL. + * * A return value of false indicates that there was a problem parsing * the extension headers.*/ bool parse_ipv6_ext_hdrs(const void **datap, size_t *sizep, uint8_t *nw_proto, uint8_t *nw_frag, - const struct ovs_16aligned_ip6_frag **frag_hdr) + const struct ovs_16aligned_ip6_frag **frag_hdr, + const struct ip6_rt_hdr **rt_hdr) { return parse_ipv6_ext_hdrs__(datap, sizep, nw_proto, nw_frag, - frag_hdr); + frag_hdr, rt_hdr); } bool @@ -945,9 +973,8 @@ miniflow_extract(struct dp_packet *packet, struct miniflow *dst) nw_ttl = nh->ip6_hlim; nw_proto = nh->ip6_nxt; - const struct ovs_16aligned_ip6_frag *frag_hdr; - if (!parse_ipv6_ext_hdrs__(&data, &size, &nw_proto, &nw_frag, - &frag_hdr)) { + if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag, + NULL, NULL)) { goto out; } @@ -1200,10 +1227,9 @@ parse_tcp_flags(struct dp_packet *packet, plen = ntohs(nh->ip6_plen); /* Never pull padding. 
*/ dp_packet_set_l2_pad_size(packet, size - plen); size = plen; - const struct ovs_16aligned_ip6_frag *frag_hdr; nw_proto = nh->ip6_nxt; - if (!parse_ipv6_ext_hdrs__(&data, &size, &nw_proto, &nw_frag, - &frag_hdr)) { + if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag, + NULL, NULL)) { return 0; } } else { diff --git a/lib/flow.h b/lib/flow.h index c647ad83c25..a9d026e1ce3 100644 --- a/lib/flow.h +++ b/lib/flow.h @@ -132,7 +132,8 @@ void packet_expand(struct dp_packet *, const struct flow *, size_t size); bool parse_ipv6_ext_hdrs(const void **datap, size_t *sizep, uint8_t *nw_proto, uint8_t *nw_frag, - const struct ovs_16aligned_ip6_frag **frag_hdr); + const struct ovs_16aligned_ip6_frag **frag_hdr, + const struct ip6_rt_hdr **rt_hdr); bool parse_nsh(const void **datap, size_t *sizep, struct ovs_key_nsh *key); uint16_t parse_tcp_flags(struct dp_packet *packet, ovs_be16 *dl_type_p, uint8_t *nw_frag_p, ovs_be16 *first_vlan_tci_p); diff --git a/lib/ipf.c b/lib/ipf.c index d452663743c..affd440f638 100644 --- a/lib/ipf.c +++ b/lib/ipf.c @@ -485,9 +485,9 @@ ipf_reassemble_v6_frags(struct ipf_list *ipf_list) const void *data = l3 + 1; size_t datasize = pl; - const struct ovs_16aligned_ip6_frag *frag_hdr = NULL; - if (!parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag, &frag_hdr) - || !nw_frag || !frag_hdr) { + const struct ovs_16aligned_ip6_frag *frag_hdr; + if (!parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag, &frag_hdr, + NULL) || !nw_frag || !frag_hdr) { ipf_print_reass_packet("Unparsed reassembled v6 packet; v6 hdr:", l3); dp_packet_delete(pkt); @@ -678,9 +678,9 @@ ipf_is_valid_v6_frag(struct ipf *ipf, struct dp_packet *pkt) uint8_t nw_proto = l3->ip6_nxt; const void *data = l3 + 1; size_t datasize = l3_size - l3_hdr_size; - const struct ovs_16aligned_ip6_frag *frag_hdr = NULL; + const struct ovs_16aligned_ip6_frag *frag_hdr; if (!parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag, - &frag_hdr) || !nw_frag || !frag_hdr) { + &frag_hdr, 
NULL) || !nw_frag || !frag_hdr) { return false; } @@ -721,9 +721,10 @@ ipf_v6_key_extract(struct dp_packet *pkt, ovs_be16 dl_type, uint16_t zone, uint8_t nw_proto = l3->ip6_nxt; const void *data = l3 + 1; size_t datasize = dp_packet_l3_size(pkt) - sizeof *l3; - const struct ovs_16aligned_ip6_frag *frag_hdr = NULL; + const struct ovs_16aligned_ip6_frag *frag_hdr; - parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag, &frag_hdr); + parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag, &frag_hdr, + NULL); ovs_assert(nw_frag && frag_hdr); ovs_be16 ip6f_offlg = frag_hdr->ip6f_offlg; *start_data_byte = ntohs(ip6f_offlg & IP6F_OFF_MASK) + diff --git a/lib/packets.h b/lib/packets.h index 8626aac8d53..70cd072228a 100644 --- a/lib/packets.h +++ b/lib/packets.h @@ -988,6 +988,15 @@ struct ovs_16aligned_ip6_frag { ovs_16aligned_be32 ip6f_ident; }; +#define IP6_RT_HDR_LEN 4 +struct ip6_rt_hdr { + uint8_t nexthdr; + uint8_t hdrlen; + uint8_t type; + uint8_t segments_left; +}; +BUILD_ASSERT_DECL(IP6_RT_HDR_LEN == sizeof(struct ip6_rt_hdr)); + #define ICMP6_HEADER_LEN 4 struct icmp6_header { uint8_t icmp6_type; From 03fc1ad78521544c7269355ec72fec8c2373b96d Mon Sep 17 00:00:00 2001 From: Nobuhiro MIKI Date: Wed, 29 Mar 2023 14:51:17 +0900 Subject: [PATCH 210/833] userspace: Add SRv6 tunnel support. SRv6 (Segment Routing IPv6) tunnel vport is responsible for encapsulation and decapsulation the inner packets with IPv6 header and an extended header called SRH (Segment Routing Header). See spec in: https://datatracker.ietf.org/doc/html/rfc8754 This patch implements SRv6 tunneling in userspace datapath. It uses `remote_ip` and `local_ip` options as with existing tunnel protocols. It also adds a dedicated `srv6_segs` option to define a sequence of routers called segment list. 
Signed-off-by: Nobuhiro MIKI Signed-off-by: Ilya Maximets --- Documentation/faq/configuration.rst | 21 +++++ Documentation/faq/releases.rst | 1 + NEWS | 2 + include/linux/openvswitch.h | 1 + include/sparse/netinet/in.h | 1 + lib/dpif-netlink-rtnl.c | 5 ++ lib/dpif-netlink.c | 5 ++ lib/netdev-native-tnl.c | 130 ++++++++++++++++++++++++++++ lib/netdev-native-tnl.h | 10 +++ lib/netdev-vport.c | 53 ++++++++++++ lib/netdev.h | 4 + lib/packets.h | 15 ++++ lib/tnl-ports.c | 5 +- ofproto/ofproto-dpif-xlate.c | 4 + tests/system-kmod-macros.at | 8 ++ tests/system-traffic.at | 124 ++++++++++++++++++++++++++ tests/system-userspace-macros.at | 6 ++ tests/tunnel.at | 56 ++++++++++++ 18 files changed, 450 insertions(+), 1 deletion(-) diff --git a/Documentation/faq/configuration.rst b/Documentation/faq/configuration.rst index dc6c92446f9..4df390dc2d9 100644 --- a/Documentation/faq/configuration.rst +++ b/Documentation/faq/configuration.rst @@ -238,6 +238,27 @@ Q: Does Open vSwitch support GTP-U? set int gtpu0 type=gtpu options:key= \ options:remote_ip=172.31.1.1 +Q: Does Open vSwitch support SRv6? + + A: Yes. Starting with version 3.2, the Open vSwitch userspace + datapath supports SRv6 (Segment Routing over IPv6). The following + example shows tunneling to fc00:300::1 via fc00:100::1 and fc00:200::1. + In the current implementation, if "IPv6 in IPv6" or "IPv4 in IPv6" packets + are routed to this interface, and these packets are not SRv6 packets, they + may be dropped, so be careful in workloads with a mix of these tunnels. + Also note the following restrictions: + + * Segment list length is limited to 6. + * SRv6 packets with other than segments_left = 0 are simply dropped. + + :: + + $ ovs-vsctl add-br br0 + $ ovs-vsctl add-port br0 srv6_0 -- \ + set int srv6_0 type=srv6 \ + options:remote_ip=fc00:100::1 \ + options:srv6_segs="fc00:100::1,fc00:200::1,fc00:300::1" + Q: How do I connect two bridges? A: First, why do you want to do this? 
Two connected bridges are not much diff --git a/Documentation/faq/releases.rst b/Documentation/faq/releases.rst index 9e1b4226200..9fb679e307d 100644 --- a/Documentation/faq/releases.rst +++ b/Documentation/faq/releases.rst @@ -151,6 +151,7 @@ Q: Are all features available with all datapaths? Tunnel - ERSPAN 4.18 2.10 2.10 NO Tunnel - ERSPAN-IPv6 4.18 2.10 2.10 NO Tunnel - GTP-U NO NO 2.14 NO + Tunnel - SRv6 NO NO 3.2 NO Tunnel - Bareudp 5.7 NO NO NO QoS - Policing YES 1.1 2.6 NO QoS - Shaping YES 1.1 NO NO diff --git a/NEWS b/NEWS index 8771ee618ae..b6418c36e95 100644 --- a/NEWS +++ b/NEWS @@ -21,6 +21,8 @@ Post-v3.1.0 * ovs-vswitchd will keep the CAP_SYS_RAWIO capability when started with the --hw-rawio-access command line option. This allows the process extra privileges when mapping physical interconnect memory. + - SRv6 Tunnel Protocol + * Added support for userspace datapath (only). v3.1.0 - 16 Feb 2023 diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h index bc8f7499184..e305c331516 100644 --- a/include/linux/openvswitch.h +++ b/include/linux/openvswitch.h @@ -254,6 +254,7 @@ enum ovs_vport_type { OVS_VPORT_TYPE_IP6GRE = 109, OVS_VPORT_TYPE_GTPU = 110, OVS_VPORT_TYPE_BAREUDP = 111, /* Bareudp tunnel. */ + OVS_VPORT_TYPE_SRV6 = 112, /* SRv6 tunnel. 
*/ __OVS_VPORT_TYPE_MAX }; diff --git a/include/sparse/netinet/in.h b/include/sparse/netinet/in.h index 21deceb28d4..00927281643 100644 --- a/include/sparse/netinet/in.h +++ b/include/sparse/netinet/in.h @@ -68,6 +68,7 @@ struct sockaddr_in6 { #define IPPROTO_HOPOPTS 0 #define IPPROTO_ICMP 1 #define IPPROTO_IGMP 2 +#define IPPROTO_IPIP 4 #define IPPROTO_TCP 6 #define IPPROTO_UDP 17 #define IPPROTO_ROUTING 43 diff --git a/lib/dpif-netlink-rtnl.c b/lib/dpif-netlink-rtnl.c index 4fc42daed2d..5788294ae0d 100644 --- a/lib/dpif-netlink-rtnl.c +++ b/lib/dpif-netlink-rtnl.c @@ -129,6 +129,8 @@ vport_type_to_kind(enum ovs_vport_type type, } case OVS_VPORT_TYPE_GTPU: return NULL; + case OVS_VPORT_TYPE_SRV6: + return "srv6"; case OVS_VPORT_TYPE_BAREUDP: return "bareudp"; case OVS_VPORT_TYPE_NETDEV: @@ -319,6 +321,7 @@ dpif_netlink_rtnl_verify(const struct netdev_tunnel_config *tnl_cfg, case OVS_VPORT_TYPE_LISP: case OVS_VPORT_TYPE_STT: case OVS_VPORT_TYPE_GTPU: + case OVS_VPORT_TYPE_SRV6: case OVS_VPORT_TYPE_UNSPEC: case __OVS_VPORT_TYPE_MAX: default: @@ -411,6 +414,7 @@ dpif_netlink_rtnl_create(const struct netdev_tunnel_config *tnl_cfg, case OVS_VPORT_TYPE_LISP: case OVS_VPORT_TYPE_STT: case OVS_VPORT_TYPE_GTPU: + case OVS_VPORT_TYPE_SRV6: case OVS_VPORT_TYPE_UNSPEC: case __OVS_VPORT_TYPE_MAX: default: @@ -519,6 +523,7 @@ dpif_netlink_rtnl_port_destroy(const char *name, const char *type) case OVS_VPORT_TYPE_ERSPAN: case OVS_VPORT_TYPE_IP6ERSPAN: case OVS_VPORT_TYPE_IP6GRE: + case OVS_VPORT_TYPE_SRV6: case OVS_VPORT_TYPE_BAREUDP: return dpif_netlink_rtnl_destroy(name); case OVS_VPORT_TYPE_NETDEV: diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c index ebe7b5cb145..55b5b0a8549 100644 --- a/lib/dpif-netlink.c +++ b/lib/dpif-netlink.c @@ -919,6 +919,9 @@ get_vport_type(const struct dpif_netlink_vport *vport) case OVS_VPORT_TYPE_GTPU: return "gtpu"; + case OVS_VPORT_TYPE_SRV6: + return "srv6"; + case OVS_VPORT_TYPE_BAREUDP: return "bareudp"; @@ -957,6 +960,8 @@ 
netdev_to_ovs_vport_type(const char *type) return OVS_VPORT_TYPE_GRE; } else if (!strcmp(type, "gtpu")) { return OVS_VPORT_TYPE_GTPU; + } else if (!strcmp(type, "srv6")) { + return OVS_VPORT_TYPE_SRV6; } else if (!strcmp(type, "bareudp")) { return OVS_VPORT_TYPE_BAREUDP; } else { diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c index b89dfdd52a8..9abdf51076a 100644 --- a/lib/netdev-native-tnl.c +++ b/lib/netdev-native-tnl.c @@ -845,6 +845,136 @@ netdev_gtpu_build_header(const struct netdev *netdev, return 0; } +int +netdev_srv6_build_header(const struct netdev *netdev, + struct ovs_action_push_tnl *data, + const struct netdev_tnl_build_header_params *params) +{ + struct netdev_vport *dev = netdev_vport_cast(netdev); + struct netdev_tunnel_config *tnl_cfg; + const struct in6_addr *segs; + struct srv6_base_hdr *srh; + struct in6_addr *s; + ovs_be16 dl_type; + int err = 0; + int nr_segs; + int i; + + ovs_mutex_lock(&dev->mutex); + tnl_cfg = &dev->tnl_cfg; + + if (tnl_cfg->srv6_num_segs) { + nr_segs = tnl_cfg->srv6_num_segs; + segs = tnl_cfg->srv6_segs; + } else { + /* + * If explicit segment list setting is omitted, tunnel destination + * is considered to be the first segment list. 
+ */ + nr_segs = 1; + segs = &params->flow->tunnel.ipv6_dst; + } + + if (!ipv6_addr_equals(&segs[0], &params->flow->tunnel.ipv6_dst)) { + err = EINVAL; + goto out; + } + + srh = netdev_tnl_ip_build_header(data, params, IPPROTO_ROUTING); + srh->rt_hdr.segments_left = nr_segs - 1; + srh->rt_hdr.type = IPV6_SRCRT_TYPE_4; + srh->rt_hdr.hdrlen = 2 * nr_segs; + srh->last_entry = nr_segs - 1; + srh->flags = 0; + srh->tag = 0; + + dl_type = params->flow->dl_type; + if (dl_type == htons(ETH_TYPE_IP)) { + srh->rt_hdr.nexthdr = IPPROTO_IPIP; + } else if (dl_type == htons(ETH_TYPE_IPV6)) { + srh->rt_hdr.nexthdr = IPPROTO_IPV6; + } else { + err = EOPNOTSUPP; + goto out; + } + + s = ALIGNED_CAST(struct in6_addr *, + (char *) srh + sizeof *srh); + for (i = 0; i < nr_segs; i++) { + /* Segment list is written to the header in reverse order. */ + memcpy(s, &segs[nr_segs - i - 1], sizeof *s); + s++; + } + + data->header_len += sizeof *srh + 8 * srh->rt_hdr.hdrlen; + data->tnl_type = OVS_VPORT_TYPE_SRV6; +out: + ovs_mutex_unlock(&dev->mutex); + + return err; +} + +void +netdev_srv6_push_header(const struct netdev *netdev OVS_UNUSED, + struct dp_packet *packet, + const struct ovs_action_push_tnl *data) +{ + int ip_tot_size; + + netdev_tnl_push_ip_header(packet, data->header, + data->header_len, &ip_tot_size); +} + +struct dp_packet * +netdev_srv6_pop_header(struct dp_packet *packet) +{ + const struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(packet); + size_t size = dp_packet_l3_size(packet) - IPV6_HEADER_LEN; + struct pkt_metadata *md = &packet->md; + struct flow_tnl *tnl = &md->tunnel; + const struct ip6_rt_hdr *rt_hdr; + uint8_t nw_proto = nh->ip6_nxt; + const void *data = nh + 1; + uint8_t nw_frag = 0; + unsigned int hlen; + + /* + * Verifies that the routing header is present in the IPv6 + * extension headers and that its type is SRv6.
+ */ + if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag, + NULL, &rt_hdr)) { + goto err; + } + + if (!rt_hdr || rt_hdr->type != IPV6_SRCRT_TYPE_4) { + goto err; + } + + if (rt_hdr->segments_left > 0) { + VLOG_WARN_RL(&err_rl, "invalid srv6 segments_left=%d\n", + rt_hdr->segments_left); + goto err; + } + + if (rt_hdr->nexthdr == IPPROTO_IPIP) { + packet->packet_type = htonl(PT_IPV4); + } else if (rt_hdr->nexthdr == IPPROTO_IPV6) { + packet->packet_type = htonl(PT_IPV6); + } else { + goto err; + } + + pkt_metadata_init_tnl(md); + netdev_tnl_ip_extract_tnl_md(packet, tnl, &hlen); + dp_packet_reset_packet(packet, hlen); + + return packet; +err: + dp_packet_delete(packet); + return NULL; +} + struct dp_packet * netdev_vxlan_pop_header(struct dp_packet *packet) { diff --git a/lib/netdev-native-tnl.h b/lib/netdev-native-tnl.h index 22ae2ce5369..4dad8f978cc 100644 --- a/lib/netdev-native-tnl.h +++ b/lib/netdev-native-tnl.h @@ -65,6 +65,16 @@ netdev_gtpu_build_header(const struct netdev *netdev, struct ovs_action_push_tnl *data, const struct netdev_tnl_build_header_params *p); +struct dp_packet *netdev_srv6_pop_header(struct dp_packet *); + +void netdev_srv6_push_header(const struct netdev *, + struct dp_packet *, + const struct ovs_action_push_tnl *); + +int netdev_srv6_build_header(const struct netdev *, + struct ovs_action_push_tnl *, + const struct netdev_tnl_build_header_params *); + void netdev_tnl_push_udp_header(const struct netdev *netdev, struct dp_packet *packet, diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c index 3b39278650d..663ee8606c3 100644 --- a/lib/netdev-vport.c +++ b/lib/netdev-vport.c @@ -424,6 +424,35 @@ parse_tunnel_ip(const char *value, bool accept_mcast, bool *flow, return 0; } +static int +parse_srv6_segs(char *s, struct in6_addr *segs, uint8_t *num_segs) +{ + char *save_ptr = NULL; + char *token; + + if (!s) { + return EINVAL; + } + + *num_segs = 0; + + while ((token = strtok_r(s, ",", &save_ptr)) != NULL) { + if (*num_segs == 
SRV6_MAX_SEGS) { + return EINVAL; + } + + if (inet_pton(AF_INET6, token, segs) != 1) { + return EINVAL; + } + + segs++; + (*num_segs)++; + s = NULL; + } + + return 0; +} + enum tunnel_layers { TNL_L2 = 1 << 0, /* 1 if a tunnel type can carry Ethernet traffic. */ TNL_L3 = 1 << 1 /* 1 if a tunnel type can carry L3 traffic. */ @@ -443,6 +472,8 @@ tunnel_supported_layers(const char *type, return TNL_L3; } else if (!strcmp(type, "bareudp")) { return TNL_L3; + } else if (!strcmp(type, "srv6")) { + return TNL_L3; } else { return TNL_L2; } @@ -750,6 +781,17 @@ set_tunnel_config(struct netdev *dev_, const struct smap *args, char **errp) goto out; } } + } else if (!strcmp(node->key, "srv6_segs")) { + err = parse_srv6_segs(node->value, + tnl_cfg.srv6_segs, + &tnl_cfg.srv6_num_segs); + + switch (err) { + case EINVAL: + ds_put_format(&errors, "%s: bad %s 'srv6_segs'\n", + name, node->value); + break; + } } else if (!strcmp(node->key, "payload_type")) { if (!strcmp(node->value, "mpls")) { tnl_cfg.payload_ethertype = htons(ETH_TYPE_MPLS); @@ -1290,6 +1332,17 @@ netdev_vport_tunnel_register(void) }, {{NULL, NULL, 0, 0}} }, + { "srv6_sys", + { + TUNNEL_FUNCTIONS_COMMON, + .type = "srv6", + .build_header = netdev_srv6_build_header, + .push_header = netdev_srv6_push_header, + .pop_header = netdev_srv6_pop_header, + .get_ifindex = NETDEV_VPORT_GET_IFINDEX, + }, + {{NULL, NULL, 0, 0}} + }, }; static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; diff --git a/lib/netdev.h b/lib/netdev.h index acf174927d2..ff207f56c28 100644 --- a/lib/netdev.h +++ b/lib/netdev.h @@ -140,6 +140,10 @@ struct netdev_tunnel_config { bool erspan_idx_flow; bool erspan_dir_flow; bool erspan_hwid_flow; + + uint8_t srv6_num_segs; + #define SRV6_MAX_SEGS 6 + struct in6_addr srv6_segs[SRV6_MAX_SEGS]; }; void netdev_run(void); diff --git a/lib/packets.h b/lib/packets.h index 70cd072228a..9465bec16c9 100644 --- a/lib/packets.h +++ b/lib/packets.h @@ -706,6 +706,10 @@ char *ip_parse_cidr_len(const char *s, 
int *n, ovs_be32 *ip, #define IPPROTO_IGMP 2 #endif +#ifndef IPPROTO_IPIP +#define IPPROTO_IPIP 4 +#endif + #ifndef IPPROTO_UDPLITE #define IPPROTO_UDPLITE 136 #endif @@ -1523,6 +1527,17 @@ BUILD_ASSERT_DECL(sizeof(struct vxlanhdr) == 8); #define VXLAN_F_GPE 0x4000 #define VXLAN_HF_GPE 0x04000000 +/* SRv6 protocol header. */ +#define IPV6_SRCRT_TYPE_4 4 +#define SRV6_BASE_HDR_LEN 8 +struct srv6_base_hdr { + struct ip6_rt_hdr rt_hdr; + uint8_t last_entry; + uint8_t flags; + ovs_be16 tag; +}; +BUILD_ASSERT_DECL(sizeof(struct srv6_base_hdr) == SRV6_BASE_HDR_LEN); + /* Input values for PACKET_TYPE macros have to be in host byte order. * The _BE postfix indicates result is in network byte order. Otherwise result * is in host byte order. */ diff --git a/lib/tnl-ports.c b/lib/tnl-ports.c index 829457ee50f..f16409a0bf0 100644 --- a/lib/tnl-ports.c +++ b/lib/tnl-ports.c @@ -126,7 +126,7 @@ map_insert(odp_port_t port, struct eth_addr mac, struct in6_addr *addr, /* XXX: No fragments support. */ match.wc.masks.nw_frag = FLOW_NW_FRAG_MASK; - /* 'tp_port' is zero for GRE tunnels. In this case it + /* 'tp_port' is zero for GRE and SRv6 tunnels. In this case it * doesn't make sense to match on UDP port numbers. 
*/ if (tp_port) { match.wc.masks.tp_dst = OVS_BE16_MAX; @@ -174,6 +174,9 @@ tnl_type_to_nw_proto(const char type[], uint8_t nw_protos[2]) } else if (!strcmp(type, "gre") || !strcmp(type, "erspan") || !strcmp(type, "ip6erspan") || !strcmp(type, "ip6gre")) { nw_protos[0] = IPPROTO_GRE; + } else if (!strcmp(type, "srv6")) { + nw_protos[0] = IPPROTO_IPIP; + nw_protos[1] = IPPROTO_IPV6; } } diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index a9cf3cbee0b..dee4c7d63af 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -3632,6 +3632,10 @@ propagate_tunnel_data_to_flow(struct xlate_ctx *ctx, struct eth_addr dmac, case OVS_VPORT_TYPE_BAREUDP: nw_proto = IPPROTO_UDP; break; + case OVS_VPORT_TYPE_SRV6: + nw_proto = (flow->dl_type == htons(ETH_TYPE_IP)) + ? IPPROTO_IPIP : IPPROTO_IPV6; + break; case OVS_VPORT_TYPE_LISP: case OVS_VPORT_TYPE_STT: case OVS_VPORT_TYPE_UNSPEC: diff --git a/tests/system-kmod-macros.at b/tests/system-kmod-macros.at index 822a80618d6..fb15a5a7ce0 100644 --- a/tests/system-kmod-macros.at +++ b/tests/system-kmod-macros.at @@ -202,6 +202,14 @@ m4_define([OVS_CHECK_KERNEL_EXCL], AT_SKIP_IF([ ! ( test $version -lt $1 || ( test $version -eq $1 && test $sublevel -lt $2 ) || test $version -gt $3 || ( test $version -eq $3 && test $sublevel -gt $4 ) ) ]) ]) +# OVS_CHECK_SRV6() +# +# The kernel datapath does not support this feature. +m4_define([OVS_CHECK_SRV6], +[ + AT_SKIP_IF([:]) +]) + # CHECK_LATER_IPV6_FRAGMENTS() # # Upstream kernels beetween 4.20 and 5.19 are not parsing IPv6 fragments diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 39a48175271..4c378e1d02b 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -1164,6 +1164,130 @@ OVS_WAIT_UNTIL([cat p0.pcap | grep -E "IP6 fc00:100::100 > fc00:100::1: GREv0, . 
OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([datapath - ping over srv6 tunnel]) +OVS_CHECK_TUNNEL_TSO() +OVS_CHECK_SRV6() + +OVS_TRAFFIC_VSWITCHD_START() + +ADD_NAMESPACES(at_ns0) +ADD_NAMESPACES(at_ns1) +NS_EXEC([at_ns0], [sysctl -w net.ipv6.conf.default.seg6_enabled=1]) +NS_EXEC([at_ns0], [sysctl -w net.ipv4.conf.default.forwarding=1]) +NS_EXEC([at_ns0], [sysctl -w net.ipv6.conf.default.forwarding=1]) +NS_EXEC([at_ns0], [sysctl -w net.ipv6.conf.all.seg6_enabled=1]) +NS_EXEC([at_ns0], [sysctl -w net.ipv4.conf.all.forwarding=1]) +NS_EXEC([at_ns0], [sysctl -w net.ipv6.conf.all.forwarding=1]) + +dnl Set up underlay link from host into the namespace 'at_ns0' +dnl using veth pair. Kernel side tunnel endpoint (SID) is +dnl 'fc00:a::1/128', so add it to the route. +dnl Only IPPROTO_IPIP(4) and IPPROTO_ICMPV6(58) are needed in underlay link. +ADD_BR([br-underlay]) +ADD_VETH(p0, at_ns0, br-underlay, "fc00::1/64", [], [], "nodad") +AT_CHECK([ovs-ofctl add-flow br-underlay "priority=1,actions=drop"]) +AT_CHECK([ovs-ofctl add-flow br-underlay "priority=100,ipv6,nw_proto=4,actions=normal"]) +AT_CHECK([ovs-ofctl add-flow br-underlay "priority=100,ipv6,nw_proto=58,actions=normal"]) +AT_CHECK([ip addr add dev br-underlay "fc00::100/64" nodad]) +AT_CHECK([ip link set dev br-underlay up]) +AT_CHECK([ip route add fc00:a::1/128 dev br-underlay via fc00::1]) + +dnl Set up tunnel endpoints on OVS outside the namespace. 
+ADD_OVS_TUNNEL6([srv6], [br0], [at_srv6], [fc00:a::1], [10.100.100.100/24]) +AT_CHECK([ovs-vsctl set bridge br0 other_config:hwaddr=aa:55:aa:55:00:00]) +AT_CHECK([ip route add 10.1.1.0/24 dev br0 via 10.100.100.1]) +AT_CHECK([arp -s 10.100.100.1 aa:55:aa:55:00:01]) +AT_CHECK([ovs-ofctl add-flow br0 in_port=LOCAL,actions=output:at_srv6]) +AT_CHECK([ovs-ofctl add-flow br0 in_port=at_srv6,actions=mod_dl_dst:aa:55:aa:55:00:00,output:LOCAL]) + +dnl Set up tunnel endpoints on the namespace 'at_ns0', +dnl and overlay port on the namespace 'at_ns1' +ADD_VETH_NS([at_ns0], [veth0], [10.1.1.2/24], [at_ns1], [veth1], [10.1.1.1/24]) +NS_CHECK_EXEC([at_ns0], [ip sr tunsrc set fc00:a::1]) +NS_CHECK_EXEC([at_ns0], [ip route add 10.100.100.0/24 encap seg6 mode encap segs fc00::100 dev p0]) +NS_CHECK_EXEC([at_ns0], [ip -6 route add fc00:a::1 encap seg6local action End.DX4 nh4 0.0.0.0 dev veth0]) +NS_CHECK_EXEC([at_ns1], [ip route add 10.100.100.0/24 via 10.1.1.2 dev veth1]) + +dnl Linux seems to take a little time to get its IPv6 stack in order. Without +dnl waiting, we get occasional failures due to the following error: +dnl "connect: Cannot assign requested address" +OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::100]) + +dnl First, check the underlay. +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00::100 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Okay, now check the overlay. 
+NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -w 2 10.100.100.100 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([datapath - ping6 over srv6 tunnel]) +OVS_CHECK_TUNNEL_TSO() +OVS_CHECK_SRV6() + +OVS_TRAFFIC_VSWITCHD_START() + +ADD_NAMESPACES(at_ns0) +ADD_NAMESPACES(at_ns1) +NS_EXEC([at_ns0], [sysctl -w net.ipv6.conf.default.seg6_enabled=1]) +NS_EXEC([at_ns0], [sysctl -w net.ipv6.conf.default.forwarding=1]) +NS_EXEC([at_ns0], [sysctl -w net.ipv6.conf.all.seg6_enabled=1]) +NS_EXEC([at_ns0], [sysctl -w net.ipv6.conf.all.forwarding=1]) + +dnl Set up underlay link from host into the namespace 'at_ns0' +dnl using veth pair. Kernel side tunnel endpoint (SID) is +dnl 'fc00:a::1/128', so add it to the route. +dnl Only IPPROTO_IPV6(41) and IPPROTO_ICMPV6(58) are needed in underlay link. +ADD_BR([br-underlay]) +ADD_VETH(p0, at_ns0, br-underlay, "fc00::1/64", [], [], "nodad") +AT_CHECK([ovs-ofctl add-flow br-underlay "priority=1,actions=drop"]) +AT_CHECK([ovs-ofctl add-flow br-underlay "priority=100,ipv6,nw_proto=41,actions=normal"]) +AT_CHECK([ovs-ofctl add-flow br-underlay "priority=100,ipv6,nw_proto=58,actions=normal"]) +AT_CHECK([ip addr add dev br-underlay "fc00::100/64" nodad]) +AT_CHECK([ip link set dev br-underlay up]) +AT_CHECK([ip -6 route add fc00:a::1/128 dev br-underlay via fc00::1]) + +dnl Set up tunnel endpoints on OVS outside the namespace. 
+ADD_OVS_TUNNEL6([srv6], [br0], [at_srv6], [fc00:a::1], [fc00:100::100/64]) +AT_CHECK([ovs-vsctl set bridge br0 other_config:hwaddr=aa:55:aa:55:00:00]) +AT_CHECK([ip addr add dev br0 fc00:100::100/64]) +AT_CHECK([ip -6 route add fc00:1::1/128 dev br0 via fc00:100::1]) +AT_CHECK([ip -6 neigh add fc00:100::1 lladdr aa:55:aa:55:00:01 dev br0]) +AT_CHECK([ovs-ofctl add-flow br0 in_port=LOCAL,actions=output:at_srv6]) +AT_CHECK([ovs-ofctl add-flow br0 in_port=at_srv6,actions=mod_dl_dst:aa:55:aa:55:00:00,output:LOCAL]) + +dnl Set up tunnel endpoints on the namespace 'at_ns0', +dnl and overlay port on the namespace 'at_ns1' +ADD_VETH_NS([at_ns0], [veth0], [fc00:1::2/64], [at_ns1], [veth1], [fc00:1::1/64]) +NS_CHECK_EXEC([at_ns0], [ip sr tunsrc set fc00:a::1]) +NS_CHECK_EXEC([at_ns0], [ip -6 route add fc00:100::0/64 encap seg6 mode encap segs fc00::100 dev p0]) +NS_CHECK_EXEC([at_ns0], [ip -6 route add fc00:a::1 encap seg6local action End.DX6 nh6 :: dev veth0]) +NS_CHECK_EXEC([at_ns1], [ip -6 route add fc00:100::/64 via fc00:1::2 dev veth1]) + +dnl Linux seems to take a little time to get its IPv6 stack in order. Without +dnl waiting, we get occasional failures due to the following error: +dnl "connect: Cannot assign requested address" +OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::100]) +OVS_WAIT_UNTIL([ip netns exec at_ns1 ping6 -c 1 fc00:100::100]) + +dnl First, check the underlay. +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00::100 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Okay, now check the overlay. 
+NS_CHECK_EXEC([at_ns1], [ping6 -q -c 3 -i 0.3 -w 2 fc00:100::100 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([datapath - clone action]) OVS_TRAFFIC_VSWITCHD_START() diff --git a/tests/system-userspace-macros.at b/tests/system-userspace-macros.at index 610fa2e94ae..482079386a4 100644 --- a/tests/system-userspace-macros.at +++ b/tests/system-userspace-macros.at @@ -301,6 +301,12 @@ m4_define([OVS_CHECK_KERNEL_EXCL], AT_SKIP_IF([:]) ]) +# OVS_CHECK_SRV6() +m4_define([OVS_CHECK_SRV6], + [AT_SKIP_IF([! ip -6 route add fc00::1/96 encap seg6 mode encap dev lo 2>&1 >/dev/null]) + AT_CHECK([ip -6 route del fc00::1/96 2>&1 >/dev/null]) + OVS_CHECK_FIREWALL()]) + # CHECK_LATER_IPV6_FRAGMENTS() # # Userspace is parsing later IPv6 fragments correctly. diff --git a/tests/tunnel.at b/tests/tunnel.at index 78cc3f3e99a..ddeb66bc9fb 100644 --- a/tests/tunnel.at +++ b/tests/tunnel.at @@ -1223,3 +1223,59 @@ AT_CHECK([ovs-vsctl add-port br0 p1 -- set int p1 type=dummy]) OVS_APP_EXIT_AND_WAIT([ovs-vswitchd]) OVS_APP_EXIT_AND_WAIT([ovsdb-server])] AT_CLEANUP + +AT_SETUP([tunnel - SRV6 basic]) +OVS_VSWITCHD_START([add-port br0 p1 -- set Interface p1 type=dummy \ + ofport_request=1 \ + -- add-port br0 p2 -- set Interface p2 type=srv6 \ + options:remote_ip=flow \ + ofport_request=2]) +OVS_VSWITCHD_DISABLE_TUNNEL_PUSH_POP + +dnl First setup dummy interface IP address, then add the route +dnl so that tnl-port table can get valid IP address for the device. 
+AT_CHECK([ovs-appctl netdev-dummy/ip6addr br0 fc00::1/64], [0], [OK +]) +AT_CHECK([ovs-appctl ovs/route/add fc00::0/64 br0], [0], [OK +]) +AT_CHECK([ovs-appctl ovs/route/show], [0], [dnl +Route Table: +User: fc00::/64 dev br0 SRC fc00::1 +]) + +AT_DATA([flows.txt], [dnl +in_port=1,actions=set_field:fc00::2->tun_ipv6_dst,output:2 +in_port=2,actions=1 +]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +AT_CHECK([ovs-appctl dpif/show | tail -n +3], [0], [dnl + br0 65534/100: (dummy-internal) + p1 1/1: (dummy) + p2 2/6: (srv6: remote_ip=flow) +]) + +AT_CHECK([ovs-appctl tnl/ports/show |sort], [0], [dnl +Listening ports: +srv6_sys (6) ref_cnt=1 +srv6_sys (6) ref_cnt=1 +]) + +AT_CHECK([ovs-appctl ofproto/list-tunnels], [0], [dnl +port 6: p2 (srv6: ::->flow, key=0, legacy_l3, dp port=6, ttl=64) +]) + +dnl Encap: ipv4 inner packet +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=6,tos=4,ttl=128,frag=no),tcp(src=8,dst=9)'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], + [Datapath actions: set(tunnel(ipv6_dst=fc00::2,ttl=64,flags(df))),pop_eth,6 +]) + +dnl Encap: ipv6 inner packet +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x86dd),ipv6(src=2001:cafe::92,dst=2001:cafe::88,label=0,proto=47,tclass=0x0,hlimit=64)'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], + [Datapath actions: set(tunnel(ipv6_dst=fc00::2,ttl=64,flags(df))),pop_eth,6 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP From 7381fd440a88ae92ca3bbc6b2ee34c5d5861a061 Mon Sep 17 00:00:00 2001 From: Nobuhiro MIKI Date: Wed, 29 Mar 2023 14:51:18 +0900 Subject: [PATCH 211/833] odp: Add SRv6 tunnel actions. This patch adds ODP actions for SRv6 and its tests. 
Signed-off-by: Nobuhiro MIKI Signed-off-by: Ilya Maximets --- lib/odp-util.c | 70 +++++++++++++++++++++++++++++++++++ python/ovs/flow/odp.py | 8 ++++ python/ovs/tests/test_odp.py | 16 ++++++++ tests/odp.at | 12 +++++- tests/tunnel-push-pop-ipv6.at | 23 ++++++++++++ 5 files changed, 127 insertions(+), 2 deletions(-) diff --git a/lib/odp-util.c b/lib/odp-util.c index dbd4554d062..2ec889c417e 100644 --- a/lib/odp-util.c +++ b/lib/odp-util.c @@ -715,6 +715,24 @@ format_odp_tnl_push_header(struct ds *ds, struct ovs_action_push_tnl *data) } ds_put_char(ds, ')'); + } else if (data->tnl_type == OVS_VPORT_TYPE_SRV6) { + const struct srv6_base_hdr *srh; + struct in6_addr *segs; + int nr_segs; + int i; + + srh = (const struct srv6_base_hdr *) l4; + segs = ALIGNED_CAST(struct in6_addr *, srh + 1); + nr_segs = srh->last_entry + 1; + + ds_put_format(ds, "srv6("); + ds_put_format(ds, "segments_left=%d", srh->rt_hdr.segments_left); + ds_put_format(ds, ",segs("); + for (i = 0; i < nr_segs; i++) { + ds_put_format(ds, i > 0 ? 
"," : ""); + ipv6_format_addr(&segs[nr_segs - i - 1], ds); + } + ds_put_format(ds, "))"); } else if (data->tnl_type == OVS_VPORT_TYPE_GRE || data->tnl_type == OVS_VPORT_TYPE_IP6GRE) { const struct gre_base_hdr *greh; @@ -1534,6 +1552,7 @@ ovs_parse_tnl_push(const char *s, struct ovs_action_push_tnl *data) uint8_t hwid, dir; uint32_t teid; uint8_t gtpu_flags, gtpu_msgtype; + uint8_t segments_left; if (!ovs_scan_len(s, &n, "tnl_push(tnl_port(%"SCNi32"),", &data->tnl_port)) { return -EINVAL; @@ -1775,6 +1794,57 @@ ovs_parse_tnl_push(const char *s, struct ovs_action_push_tnl *data) tnl_type = OVS_VPORT_TYPE_GTPU; header_len = sizeof *eth + ip_len + sizeof *udp + sizeof *gtph; + } else if (ovs_scan_len(s, &n, "srv6(segments_left=%"SCNu8, + &segments_left)) { + struct srv6_base_hdr *srh = (struct srv6_base_hdr *) (ip6 + 1); + char seg_s[IPV6_SCAN_LEN + 1]; + struct in6_addr *segs; + struct in6_addr seg; + uint8_t n_segs = 0; + + if (segments_left + 1 > SRV6_MAX_SEGS) { + return -EINVAL; + } + + ip6->ip6_nxt = IPPROTO_ROUTING; + + srh->rt_hdr.hdrlen = 2 * (segments_left + 1); + srh->rt_hdr.segments_left = segments_left; + srh->rt_hdr.type = IPV6_SRCRT_TYPE_4; + srh->last_entry = segments_left; + + tnl_type = OVS_VPORT_TYPE_SRV6; + header_len = sizeof *eth + ip_len + + sizeof *srh + 8 * srh->rt_hdr.hdrlen; + /* Parse segment list. 
*/ + if (!ovs_scan_len(s, &n, ",segs(")) { + return -EINVAL; + } + + segs = ALIGNED_CAST(struct in6_addr *, srh + 1); + segs += segments_left; + + while (ovs_scan_len(s, &n, IPV6_SCAN_FMT, seg_s) + && inet_pton(AF_INET6, seg_s, &seg) == 1) { + if (n_segs == segments_left + 1) { + return -EINVAL; + } + + memcpy(segs--, &seg, sizeof *segs); + n_segs++; + + if (s[n] == ',') { + n++; + } + } + + if (!ovs_scan_len(s, &n, ")))")) { + return -EINVAL; + } + + if (n_segs != segments_left + 1) { + return -EINVAL; + } } else { return -EINVAL; } diff --git a/python/ovs/flow/odp.py b/python/ovs/flow/odp.py index db63afc8d64..88aee17fb2a 100644 --- a/python/ovs/flow/odp.py +++ b/python/ovs/flow/odp.py @@ -474,6 +474,14 @@ def _tnl_action_decoder_args(): } ) ), + "srv6": nested_kv_decoder( + KVDecoders( + { + "segments_left": decode_int, + "segs": decode_default, + } + ) + ), } ) ), diff --git a/python/ovs/tests/test_odp.py b/python/ovs/tests/test_odp.py index f8017ca8a16..a50d3185cc6 100644 --- a/python/ovs/tests/test_odp.py +++ b/python/ovs/tests/test_odp.py @@ -452,6 +452,22 @@ def test_odp_fields(input_string, expected): ), ], ), + ( + "actions:tnl_push(header(srv6(segments_left=1,segs(2001:cafe::90,2001:cafe::91))))", # noqa: E501 + [ + KeyValue( + "tnl_push", + { + "header": { + "srv6": { + "segments_left": 1, + "segs": "2001:cafe::90,2001:cafe::91", + } + } + }, + ), + ], + ), ( "actions:clone(1),clone(clone(push_vlan(vid=12,pcp=0),2),1)", [ diff --git a/tests/odp.at b/tests/odp.at index 26cda296723..ba20604e43d 100644 --- a/tests/odp.at +++ b/tests/odp.at @@ -342,6 +342,8 @@ tnl_push(tnl_port(6),header(size=70,type=4,eth(dst=f8:bc:12:44:34:b6,src=f8:bc:1 tnl_push(tnl_port(6),header(size=70,type=5,eth(dst=f8:bc:12:44:34:b6,src=f8:bc:12:46:58:e0,dl_type=0x86dd),ipv6(src=2001:cafe::88,dst=2001:cafe::92,label=0,proto=17,tclass=0x0,hlimit=64),udp(src=0,dst=6081,csum=0x0),geneve(oam,vni=0x1c7)),out_port(1)) 
tnl_push(tnl_port(6),header(size=78,type=5,eth(dst=f8:bc:12:44:34:b6,src=f8:bc:12:46:58:e0,dl_type=0x86dd),ipv6(src=2001:cafe::88,dst=2001:cafe::92,label=0,proto=17,tclass=0x0,hlimit=64),udp(src=0,dst=6081,csum=0x0),geneve(crit,vni=0x1c7,options({class=0xffff,type=0x80,len=4,0xa}))),out_port(1)) tnl_push(tnl_port(6),header(size=70,type=5,eth(dst=f8:bc:12:44:34:b6,src=f8:bc:12:46:58:e0,dl_type=0x86dd),ipv6(src=2001:cafe::88,dst=2001:cafe::92,label=0,proto=17,tclass=0x0,hlimit=64),udp(src=0,dst=6081,csum=0xffff),geneve(vni=0x1c7)),out_port(1)) +tnl_push(tnl_port(6),header(size=78,type=112,eth(dst=f8:bc:12:44:34:b6,src=f8:bc:12:46:58:e0,dl_type=0x86dd),ipv6(src=2001:cafe::88,dst=2001:cafe::92,label=0,proto=43,tclass=0x0,hlimit=64),srv6(segments_left=0,segs(2001:cafe::90))),out_port(1)) +tnl_push(tnl_port(6),header(size=110,type=112,eth(dst=f8:bc:12:44:34:b6,src=f8:bc:12:46:58:e0,dl_type=0x86dd),ipv6(src=2001:cafe::88,dst=2001:cafe::92,label=0,proto=43,tclass=0x0,hlimit=64),srv6(segments_left=2,segs(2001:cafe::90,2001:cafe::91,2001:cafe::92))),out_port(1)) ct ct(commit) ct(commit,zone=5) @@ -400,8 +402,14 @@ AT_CLEANUP AT_SETUP([OVS datapath actions parsing and formatting - invalid forms]) dnl This caused a hang in older versions. 
-AT_CHECK([echo 'encap_nsh@:{@' | ovstest test-odp parse-actions -], [0], [dnl +AT_DATA([actions.txt], [dnl +encap_nsh@:{@ +tnl_push(tnl_port(6),header(size=94,type=112,eth(dst=f8:bc:12:44:34:b6,src=f8:bc:12:46:58:e0,dl_type=0x86dd),ipv6(src=2001:cafe::88,dst=2001:cafe::92,label=0,proto=43,tclass=0x0,hlimit=64),srv6(segments_left=2,segs(2001:cafe::90,2001:cafe::91))),out_port(1)) +tnl_push(tnl_port(6),header(size=126,type=112,eth(dst=f8:bc:12:44:34:b6,src=f8:bc:12:46:58:e0,dl_type=0x86dd),ipv6(src=2001:cafe::88,dst=2001:cafe::92,label=0,proto=43,tclass=0x0,hlimit=64),srv6(segments_left=2,segs(2001:cafe::90,2001:cafe::91,2001:cafe::92,2001:cafe::93))),out_port(1)) +]) +AT_CHECK_UNQUOTED([ovstest test-odp parse-actions < actions.txt], [0], [dnl +odp_actions_from_string: error +odp_actions_from_string: error odp_actions_from_string: error ]) AT_CLEANUP diff --git a/tests/tunnel-push-pop-ipv6.at b/tests/tunnel-push-pop-ipv6.at index 2cf306c67ec..e300fe3a0d2 100644 --- a/tests/tunnel-push-pop-ipv6.at +++ b/tests/tunnel-push-pop-ipv6.at @@ -202,6 +202,8 @@ AT_CHECK([ovs-vsctl add-port int-br t2 -- set Interface t2 type=vxlan \ options:remote_ip=flow options:key=123 ofport_request=5\ -- add-port int-br t5 -- set Interface t5 type=gre \ options:remote_ip=2001:cafe::92 options:key=455 options:packet_type=legacy_l3 ofport_request=6\ + -- add-port int-br t6 -- set Interface t6 type=srv6 \ + options:remote_ip=2001:cafe::92 ofport_request=7\ ], [0]) AT_CHECK([ovs-appctl dpif/show], [0], [dnl @@ -216,12 +218,15 @@ dummy@ovs-dummy: hit:0 missed:0 t3 4/4789: (vxlan: csum=true, out_key=flow, remote_ip=2001:cafe::93) t4 5/6081: (geneve: key=123, remote_ip=flow) t5 6/3: (gre: key=455, packet_type=legacy_l3, remote_ip=2001:cafe::92) + t6 7/6: (srv6: remote_ip=2001:cafe::92) ]) AT_CHECK([ovs-appctl tnl/ports/show |sort], [0], [dnl Listening ports: genev_sys_6081 (6081) ref_cnt=1 gre_sys (3) ref_cnt=2 +srv6_sys (6) ref_cnt=1 +srv6_sys (6) ref_cnt=1 vxlan_sys_4789 (4789) ref_cnt=2 ]) @@ 
-363,6 +368,8 @@ AT_CHECK([ovs-appctl tnl/ports/show |sort], [0], [dnl Listening ports: genev_sys_6081 (6081) ref_cnt=1 gre_sys (3) ref_cnt=2 +srv6_sys (6) ref_cnt=1 +srv6_sys (6) ref_cnt=1 vxlan_sys_4789 (4789) ref_cnt=2 ]) @@ -384,6 +391,12 @@ AT_CHECK([tail -1 stdout], [0], [Datapath actions: tnl_pop(6081) ]) +dnl Check SRv6 tunnel pop +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x86dd),ipv6(src=2001:cafe::92,dst=2001:cafe::88,label=0,proto=4,tclass=0x0,hlimit=64)'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], + [Datapath actions: tnl_pop(6) +]) + dnl Check VXLAN tunnel push AT_CHECK([ovs-ofctl add-flow int-br action=2]) AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(2),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:01),eth_type(0x0800),ipv4(src=1.1.3.88,dst=1.1.3.112,proto=47,tos=0,ttl=64,frag=no)'], [0], [stdout]) @@ -405,6 +418,13 @@ AT_CHECK([tail -1 stdout], [0], [Datapath actions: tnl_push(tnl_port(3),header(size=62,type=109,eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x86dd),ipv6(src=2001:cafe::88,dst=2001:cafe::92,label=0,proto=47,tclass=0x0,hlimit=64),gre((flags=0x2000,proto=0x6558),key=0x1c8)),out_port(100)),1 ]) +dnl Check SRv6 tunnel push +AT_CHECK([ovs-ofctl add-flow int-br action=7]) +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(2),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:01),eth_type(0x0800),ipv4(src=1.1.3.88,dst=1.1.3.112,proto=47,tos=0,ttl=64,frag=no)'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], + [Datapath actions: pop_eth,tnl_push(tnl_port(6),header(size=78,type=112,eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x86dd),ipv6(src=2001:cafe::88,dst=2001:cafe::92,label=0,proto=43,tclass=0x0,hlimit=64),srv6(segments_left=0,segs(2001:cafe::92))),out_port(100)),1 +]) + dnl Check Geneve tunnel push AT_CHECK([ovs-ofctl add-flow int-br "actions=set_field:2001:cafe::92->tun_ipv6_dst,5"]) AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 
'in_port(2),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:01),eth_type(0x0800),ipv4(src=1.1.3.88,dst=1.1.3.112,proto=47,tos=0,ttl=64,frag=no)'], [0], [stdout]) @@ -510,6 +530,8 @@ AT_CHECK([ovs-appctl tnl/ports/show |sort], [0], [dnl Listening ports: genev_sys_6081 (6081) ref_cnt=1 gre_sys (3) ref_cnt=1 +srv6_sys (6) ref_cnt=1 +srv6_sys (6) ref_cnt=1 vxlan_sys_4789 (4789) ref_cnt=1 vxlan_sys_4790 (4790) ref_cnt=1 ]) @@ -518,6 +540,7 @@ AT_CHECK([ovs-vsctl del-port int-br t1 \ -- del-port int-br t2 \ -- del-port int-br t4 \ -- del-port int-br t5 \ + -- del-port int-br t6 \ ], [0]) dnl Check tunnel lookup entries after deleting all remaining tunnel ports From 306583b56868edc5f9bcc2a21ac07e334891a6c7 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Tue, 28 Mar 2023 11:21:53 -0400 Subject: [PATCH 212/833] netdev-tc-offloads: Fix misaligned 8 byte read. UB Sanitizer report: lib/netdev-offload-tc.c:1276:19: runtime error: load of misaligned address 0x7f74e801976c for type 'union ovs_u128', which requires 8 byte alignment 0 in netdev_tc_flow_dump_next lib/netdev-offload-tc.c:1276 1 in netdev_flow_dump_next lib/netdev-offload.c:303 2 in dpif_netlink_flow_dump_next lib/dpif-netlink.c:1921 [...] 
Fixes: 8f7620e6a406 ("netdev-tc-offloads: Implement netdev flow dump api using tc interface") Acked-by: Eelco Chaudron Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/netdev-offload-tc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c index 247c1ff8b72..4721f016076 100644 --- a/lib/netdev-offload-tc.c +++ b/lib/netdev-offload-tc.c @@ -1282,8 +1282,8 @@ netdev_tc_flow_dump_next(struct netdev_flow_dump *dump, continue; } - if (flower.act_cookie.len) { - *ufid = *((ovs_u128 *) flower.act_cookie.data); + if (flower.act_cookie.len >= sizeof *ufid) { + *ufid = get_32aligned_u128(flower.act_cookie.data); } else if (!find_ufid(netdev, &id, ufid)) { continue; } From 0f34ecbd5a73b00f1dd8c6675c7be6fec6e094d4 Mon Sep 17 00:00:00 2001 From: Nobuhiro MIKI Date: Thu, 30 Mar 2023 16:29:47 +0900 Subject: [PATCH 213/833] vswitch.xml: Add description of SRv6 tunnel and related options. The description of SRv6 was missing in vswitch.xml, which is used to generate the man page, so this patch adds it. Fixes: 03fc1ad78521 ("userspace: Add SRv6 tunnel support.") Signed-off-by: Nobuhiro MIKI Signed-off-by: Ilya Maximets --- vswitchd/vswitch.xml | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 88e2c94e2f0..edb5eafa04c 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -2845,6 +2845,16 @@

+
srv6
+
+

+ Segment Routing IPv6 (SRv6) tunnel encapsulates L3 traffic as + "IPv6 in IPv6" or "IPv4 in IPv6" with Segment Routing Header (SRH) + defined in RFC 8754. The segment list in SRH can be set using a + SRv6 specific option. +

+
+
@@ -2853,8 +2863,8 @@

These options apply to interfaces with of geneve, bareudp, gre, - ip6gre, vxlan, lisp and - stt. + ip6gre, vxlan, lisp, + stt and srv6.

@@ -2867,7 +2877,8 @@ considered more specific than if a port defines one and another port defines the other. is not applicable for bareudp - tunnels. Hence it is not considered while identifying a bareudp tunnel. + and srv6 tunnels. Hence it is not considered while identifying + bareudp or srv6 tunnels.

@@ -2935,8 +2946,9 @@

- Optional, not applicable for bareudp. The key that - received packets must contain, one of: + Optional, not applicable for bareudp and + srv6. The key that received packets must contain, + one of:

    @@ -2965,8 +2977,9 @@

    - Optional, not applicable for bareudp. The key to be set - on outgoing packets, one of: + Optional, not applicable for bareudp and + srv6. The key to be set on outgoing packets, + one of:

      @@ -3264,6 +3277,18 @@ + + +

      + Specifies the segment list in Segment Routing Header (SRH). + It consists of a comma-separated list of segments represented + in IPv6 format, e.g. "fc00:100::1,fc00:200::1,fc00:300::1". + Note that the first segment must be the same as + . +

      +
      +
      +

      These options apply only to patch ports, that is, interfaces From daeab9548acc07fec839a46fd6bfd2a392bb91a0 Mon Sep 17 00:00:00 2001 From: Daniel Alvarez Sanchez Date: Wed, 29 Mar 2023 16:26:38 -0400 Subject: [PATCH 214/833] db-ctl-base: Partially revert b8bf410a5. The commit b8bf410a5 [0] broke the `ovs-vsctl add` command which now overwrites the value if it existed already. This patch reverts the code around the `cmd_add` function to restore the previous behavior. It also adds testing coverage for this functionality. [0] https://github.com/openvswitch/ovs/commit/b8bf410a5c94173da02279b369d75875c4035959 Fixes: b8bf410a5c94 ("db-ctl-base: Use partial map/set updates for last add/set commands.") Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2182767 Acked-by: Dumitru Ceara Signed-off-by: Daniel Alvarez Sanchez Signed-off-by: Ilya Maximets --- lib/db-ctl-base.c | 42 +++++++++--------------------------------- tests/ovs-vsctl.at | 8 +++----- 2 files changed, 12 insertions(+), 38 deletions(-) diff --git a/lib/db-ctl-base.c b/lib/db-ctl-base.c index 134496ef3f6..5d2635946d3 100644 --- a/lib/db-ctl-base.c +++ b/lib/db-ctl-base.c @@ -1492,7 +1492,7 @@ cmd_add(struct ctl_context *ctx) const struct ovsdb_idl_column *column; const struct ovsdb_idl_row *row; const struct ovsdb_type *type; - struct ovsdb_datum new; + struct ovsdb_datum old; int i; ctx->error = get_table(table_name, &table); @@ -1516,13 +1516,7 @@ cmd_add(struct ctl_context *ctx) } type = &column->type; - - if (ctx->last_command) { - ovsdb_datum_init_empty(&new); - } else { - ovsdb_datum_clone(&new, ovsdb_idl_read(row, column)); - } - + ovsdb_datum_clone(&old, ovsdb_idl_read(row, column)); for (i = 4; i < ctx->argc; i++) { struct ovsdb_type add_type; struct ovsdb_datum add; @@ -1533,41 +1527,23 @@ cmd_add(struct ctl_context *ctx) ctx->error = ovsdb_datum_from_string(&add, &add_type, ctx->argv[i], ctx->symtab); if (ctx->error) { - ovsdb_datum_destroy(&new, &column->type); + 
ovsdb_datum_destroy(&old, &column->type); return; } - ovsdb_datum_union(&new, &add, type); + ovsdb_datum_union(&old, &add, type); ovsdb_datum_destroy(&add, type); } - - if (!ctx->last_command && new.n > type->n_max) { + if (old.n > type->n_max) { ctl_error(ctx, "\"add\" operation would put %u %s in column %s of " "table %s but the maximum number is %u", - new.n, + old.n, type->value.type == OVSDB_TYPE_VOID ? "values" : "pairs", column->name, table->name, type->n_max); - ovsdb_datum_destroy(&new, &column->type); + ovsdb_datum_destroy(&old, &column->type); return; } - - if (ctx->last_command) { - /* Partial updates can only be made one by one. */ - for (i = 0; i < new.n; i++) { - struct ovsdb_datum *datum = xmalloc(sizeof *datum); - - ovsdb_datum_init_empty(datum); - ovsdb_datum_add_from_index_unsafe(datum, &new, i, type); - if (ovsdb_type_is_map(type)) { - ovsdb_idl_txn_write_partial_map(row, column, datum); - } else { - ovsdb_idl_txn_write_partial_set(row, column, datum); - } - } - ovsdb_datum_destroy(&new, &column->type); - } else { - ovsdb_idl_txn_verify(row, column); - ovsdb_idl_txn_write(row, column, &new); - } + ovsdb_idl_txn_verify(row, column); + ovsdb_idl_txn_write(row, column, &old); invalidate_cache(ctx); } diff --git a/tests/ovs-vsctl.at b/tests/ovs-vsctl.at index a92156f001c..a368bff6ede 100644 --- a/tests/ovs-vsctl.at +++ b/tests/ovs-vsctl.at @@ -425,6 +425,7 @@ AT_CHECK([RUN_OVS_VSCTL_ONELINE( [add-port a a1], [add-bond a bond0 a2 a3], [br-set-external-id a key0 value0], + [add Bridge a external_ids key0=value1], [set port a1 external-ids:key1=value1], [set interface a2 external-ids:key2=value2], [set interface a2 external-ids:key3=value3], @@ -446,6 +447,7 @@ AT_CHECK([RUN_OVS_VSCTL_ONELINE( + key0=value0 value0 @@ -1071,13 +1073,9 @@ AT_CHECK([RUN_OVS_VSCTL([set controller br1 'connection-mode=xyz'])], AT_CHECK([RUN_OVS_VSCTL([set controller br1 connection-mode:x=y])], [1], [], [ovs-vsctl: cannot specify key to set for non-map column connection_mode 
]) -AT_CHECK([RUN_OVS_VSCTL([add bridge br1 datapath_id x y -- show])], +AT_CHECK([RUN_OVS_VSCTL([add bridge br1 datapath_id x y])], [1], [], [ovs-vsctl: "add" operation would put 2 values in column datapath_id of table Bridge but the maximum number is 1 ]) -AT_CHECK([RUN_OVS_VSCTL([add bridge br1 datapath_id x y])], [1], [], [stderr]) -AT_CHECK([sed "/^.*|WARN|.*/d" < stderr], [0], [dnl -ovs-vsctl: transaction error: {"details":"set must have 0 to 1 members but 2 are present","error":"syntax error","syntax":"[[\"set\",[\"x\",\"y\"]]]"} -]) AT_CHECK([RUN_OVS_VSCTL([remove netflow `cat netflow-uuid` targets '"1.2.3.4:567"'])], [1], [], [ovs-vsctl: "remove" operation would put 0 values in column targets of table NetFlow but the minimum number is 1 ]) From f9507c1ea4343efddbc5c5425c8938b74d3a6260 Mon Sep 17 00:00:00 2001 From: Faicker Mo Date: Thu, 30 Mar 2023 17:27:23 +0800 Subject: [PATCH 215/833] netdev-offload-tc: Del ufid mapping if device not exist. The device may be deleted and re-added with a different ifindex. The tc rules on the device are removed when the device is deleted, so the function tc_del_filter fails when the flow is later deleted, and the mapping of ufid to tc is not removed. Traffic will then trigger the same flow (with the same ufid) to be put to tc on the new device, and a duplicated ufid mapping will be added. If the hashmap is expanded, the old mapping entry will be the first entry, and now the dp flow can't be deleted.
Signed-off-by: Faicker Mo Acked-by: Eelco Chaudron Reviewed-by: Simon Horman Tested-by: Simon Horman Signed-off-by: Ilya Maximets --- lib/netdev-offload-tc.c | 3 +- tests/system-offloads-traffic.at | 55 ++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c index 4721f016076..c9662081fc6 100644 --- a/lib/netdev-offload-tc.c +++ b/lib/netdev-offload-tc.c @@ -276,8 +276,9 @@ del_filter_and_ufid_mapping(struct tcf_id *id, const ovs_u128 *ufid, } err = tc_del_flower_filter(id); - if (!err) { + if (!err || err == ENODEV) { del_ufid_tc_mapping(ufid); + return 0; } return err; } diff --git a/tests/system-offloads-traffic.at b/tests/system-offloads-traffic.at index eb331d6ce18..da18597cd85 100644 --- a/tests/system-offloads-traffic.at +++ b/tests/system-offloads-traffic.at @@ -750,3 +750,58 @@ AT_CHECK([ovs-appctl coverage/read-counter ukey_invalid_stat_reset], [0], [dnl OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([offloads - delete ufid mapping if device not exist - offloads enabled]) +OVS_TRAFFIC_VSWITCHD_START([], [], [-- set Open_vSwitch . 
other_config:hw-offload=true]) + +AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"]) + +ADD_NAMESPACES(at_ns0, at_ns1, at_ns2) + +dnl Disable IPv6 to skip unexpected flow +AT_CHECK([sysctl -w net.ipv6.conf.br0.disable_ipv6=1], [0], [ignore]) +NS_CHECK_EXEC([at_ns0], [sysctl -w net.ipv6.conf.all.disable_ipv6=1], [0], [ignore]) +NS_CHECK_EXEC([at_ns1], [sysctl -w net.ipv6.conf.all.disable_ipv6=1], [0], [ignore]) +NS_CHECK_EXEC([at_ns2], [sysctl -w net.ipv6.conf.all.disable_ipv6=1], [0], [ignore]) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24", "aa:1a:54:e9:c5:56") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +NS_CHECK_EXEC([at_ns0], [ping -q -c 2 -i 0.2 10.1.1.2 | FORMAT_PING], [0], [dnl +2 packets transmitted, 2 received, 0% packet loss, time 0ms +]) + +dnl Delete and add interface ovs-p0/p0 +AT_CHECK([ip link del dev ovs-p0]) +AT_CHECK([ip link add p0 type veth peer name ovs-p0 || return 77]) +AT_CHECK([ip link set p0 netns at_ns0]) +AT_CHECK([ip link set dev ovs-p0 up]) +NS_CHECK_EXEC([at_ns0], [ip addr add dev p0 "10.1.1.1/24"]) +NS_CHECK_EXEC([at_ns0], [ip link set dev p0 up]) +NS_CHECK_EXEC([at_ns0], [ip link set dev p0 address "aa:1a:54:e9:c5:56"]) + +AT_CHECK([ovs-appctl revalidator/purge], [0]) + +dnl Generate flows to trigger the hmap expand once +ADD_VETH(p2, at_ns2, br0, "10.1.1.3/24") +NS_CHECK_EXEC([at_ns0], [ping -q -c 2 -i 0.2 10.1.1.2 | FORMAT_PING], [0], [dnl +2 packets transmitted, 2 received, 0% packet loss, time 0ms +]) +NS_CHECK_EXEC([at_ns0], [ping -q -c 2 -i 0.2 10.1.1.3 | FORMAT_PING], [0], [dnl +2 packets transmitted, 2 received, 0% packet loss, time 0ms +]) + +AT_CHECK([ovs-appctl revalidator/purge], [0]) +dnl Fix purge fail occasionally +AT_CHECK([ovs-appctl revalidator/purge], [0]) + +AT_CHECK([test $(ovs-appctl dpctl/dump-flows | grep -c "eth_type(0x0800)") -eq 0], [0], [ignore]) + +OVS_TRAFFIC_VSWITCHD_STOP(["/could not open network device ovs-p0/d +/on nonexistent port/d +/failed to flow_get/d +/Failed to acquire udpif_key/d +/No such 
device/d +/failed to offload flow/d +"]) +AT_CLEANUP From b5354766805ca885072d06d1cab01c4da7537d64 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 3 Apr 2023 20:41:23 +0200 Subject: [PATCH 216/833] AUTHORS: Add Faicker Mo. Signed-off-by: Ilya Maximets --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 20f83176d5b..00a6dd5f7ce 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -163,6 +163,7 @@ Ethan J. Jackson ejj@eecs.berkeley.edu Ethan Rahn erahn@arista.com Eziz Durdyyev ezizdurdy@gmail.com Fabrizio D'Angelo fdangelo@redhat.com +Faicker Mo faicker.mo@ucloud.cn Fangrui Song maskray@google.com Fengqi Li lifengqi@inspur.com Flavio Fernandes flavio@flaviof.com From e41bdb17613ba2df284f0f6aed98dbb1c2e2e081 Mon Sep 17 00:00:00 2001 From: Lin Huang Date: Fri, 31 Mar 2023 11:16:56 +0800 Subject: [PATCH 217/833] conntrack-tp: Fix clang warning. Declaration of 'struct conn' will not be visible outside of this function. Declaration of 'struct conntrack' will not be visible outside of this function. Declaration of 'struct timeout_policy' will not be visible outside of this function. Signed-off-by: Lin Huang Signed-off-by: Ilya Maximets --- lib/conntrack-tp.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lib/conntrack-tp.h b/lib/conntrack-tp.h index 4d411d19fd5..7ece2eae2f9 100644 --- a/lib/conntrack-tp.h +++ b/lib/conntrack-tp.h @@ -17,8 +17,15 @@ #ifndef CONNTRACK_TP_H #define CONNTRACK_TP_H 1 +#include + #define CT_DPIF_NETDEV_TP_MIN 30 + enum ct_timeout; +struct conn; +struct conntrack; +struct timeout_policy; + void timeout_policy_init(struct conntrack *ct); int timeout_policy_update(struct conntrack *ct, struct timeout_policy *tp); int timeout_policy_delete(struct conntrack *ct, uint32_t tp_id); From 9d840923d32124fe427de76e8234c49d64e4bb77 Mon Sep 17 00:00:00 2001 From: Aaron Conole Date: Fri, 31 Mar 2023 17:17:27 -0400 Subject: [PATCH 218/833] ofproto-dpif-xlate: Always mask ip proto field. 
The ofproto layer currently treats the nw_proto field as overloaded: it indicates both that a proper nw layer exists and the value contained in the header for the nw proto. However, this is incorrect behavior, as relevant standards permit any value, including '0', to be treated as a valid value. Because of this overload, when the ofproto layer builds the action list for a packet with nw_proto of 0, it won't build the complete action list that we expect to be built for the packet. That causes bad behavior where all packets passing the datapath fall into an incomplete action set. The fix here is to unwildcard nw_proto, allowing us to preserve setting actions for protocols which we know have support for the actions we program. This means that traffic which contains nw_proto == 0 cannot cause connectivity breakage with other traffic on the link. Reported-by: David Marchand Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2134873 Acked-by: Ilya Maximets Signed-off-by: Aaron Conole Signed-off-by: Ilya Maximets --- include/openvswitch/meta-flow.h | 4 + lib/meta-flow.c | 25 +++++ ofproto/ofproto-dpif-xlate.c | 8 ++ tests/ofproto-dpif.at | 18 ++-- tests/ofproto.at | 182 ++++++++++++++++++++++++++++++++ tests/packet-type-aware.at | 2 +- 6 files changed, 229 insertions(+), 10 deletions(-) diff --git a/include/openvswitch/meta-flow.h b/include/openvswitch/meta-flow.h index 045dce8f5fa..3b0220aaa25 100644 --- a/include/openvswitch/meta-flow.h +++ b/include/openvswitch/meta-flow.h @@ -2366,6 +2366,10 @@ void mf_format_subvalue(const union mf_subvalue *subvalue, struct ds *s); void field_array_set(enum mf_field_id id, const union mf_value *, struct field_array *); +/* Mask the required l3 prerequisites if a 'set' action occurs.
*/ +void mf_set_mask_l3_prereqs(const struct mf_field *, const struct flow *, + struct flow_wildcards *); + #ifdef __cplusplus } #endif diff --git a/lib/meta-flow.c b/lib/meta-flow.c index c576ae6202a..474344194fa 100644 --- a/lib/meta-flow.c +++ b/lib/meta-flow.c @@ -3676,3 +3676,28 @@ mf_bitmap_not(struct mf_bitmap x) bitmap_not(x.bm, MFF_N_IDS); return x; } + +void +mf_set_mask_l3_prereqs(const struct mf_field *mf, const struct flow *fl, + struct flow_wildcards *wc) +{ + if (is_ip_any(fl) && + ((mf->id == MFF_IPV4_SRC) || + (mf->id == MFF_IPV4_DST) || + (mf->id == MFF_IPV6_SRC) || + (mf->id == MFF_IPV6_DST) || + (mf->id == MFF_IPV6_LABEL) || + (mf->id == MFF_IP_DSCP) || + (mf->id == MFF_IP_ECN) || + (mf->id == MFF_IP_TTL))) { + WC_MASK_FIELD(wc, nw_proto); + } else if ((fl->dl_type == htons(ETH_TYPE_ARP)) && + ((mf->id == MFF_ARP_OP) || + (mf->id == MFF_ARP_SHA) || + (mf->id == MFF_ARP_THA) || + (mf->id == MFF_ARP_SPA) || + (mf->id == MFF_ARP_TPA))) { + /* mask only the lower 8 bits. 
*/ + wc->masks.nw_proto = 0xff; + } +} diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index dee4c7d63af..c0117771809 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -5215,6 +5215,7 @@ compose_dec_ttl(struct xlate_ctx *ctx, struct ofpact_cnt_ids *ids) } ctx->wc->masks.nw_ttl = 0xff; + WC_MASK_FIELD(ctx->wc, nw_proto); if (flow->nw_ttl > 1) { flow->nw_ttl--; return false; @@ -7132,6 +7133,7 @@ do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len, case OFPACT_SET_IPV4_SRC: if (flow->dl_type == htons(ETH_TYPE_IP)) { memset(&wc->masks.nw_src, 0xff, sizeof wc->masks.nw_src); + WC_MASK_FIELD(wc, nw_proto); flow->nw_src = ofpact_get_SET_IPV4_SRC(a)->ipv4; } break; @@ -7139,12 +7141,14 @@ do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len, case OFPACT_SET_IPV4_DST: if (flow->dl_type == htons(ETH_TYPE_IP)) { memset(&wc->masks.nw_dst, 0xff, sizeof wc->masks.nw_dst); + WC_MASK_FIELD(wc, nw_proto); flow->nw_dst = ofpact_get_SET_IPV4_DST(a)->ipv4; } break; case OFPACT_SET_IP_DSCP: if (is_ip_any(flow)) { + WC_MASK_FIELD(wc, nw_proto); wc->masks.nw_tos |= IP_DSCP_MASK; flow->nw_tos &= ~IP_DSCP_MASK; flow->nw_tos |= ofpact_get_SET_IP_DSCP(a)->dscp; @@ -7153,6 +7157,7 @@ do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len, case OFPACT_SET_IP_ECN: if (is_ip_any(flow)) { + WC_MASK_FIELD(wc, nw_proto); wc->masks.nw_tos |= IP_ECN_MASK; flow->nw_tos &= ~IP_ECN_MASK; flow->nw_tos |= ofpact_get_SET_IP_ECN(a)->ecn; @@ -7161,6 +7166,7 @@ do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len, case OFPACT_SET_IP_TTL: if (is_ip_any(flow)) { + WC_MASK_FIELD(wc, nw_proto); wc->masks.nw_ttl = 0xff; flow->nw_ttl = ofpact_get_SET_IP_TTL(a)->ttl; } @@ -7228,6 +7234,7 @@ do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len, /* Set the field only if the packet actually has it. 
*/ if (mf_are_prereqs_ok(mf, flow, wc)) { + mf_set_mask_l3_prereqs(mf, flow, wc); mf_mask_field_masked(mf, ofpact_set_field_mask(set_field), wc); mf_set_flow_value_masked(mf, set_field->value, ofpact_set_field_mask(set_field), @@ -7284,6 +7291,7 @@ do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len, case OFPACT_DEC_TTL: wc->masks.nw_ttl = 0xff; + WC_MASK_FIELD(wc, nw_proto); if (compose_dec_ttl(ctx, ofpact_get_DEC_TTL(a))) { return; } diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at index 222415ac096..62291de4ac1 100644 --- a/tests/ofproto-dpif.at +++ b/tests/ofproto-dpif.at @@ -849,7 +849,7 @@ table=2 ip actions=set_field:192.168.3.91->ip_src,output(11) AT_CHECK([ovs-ofctl -O OpenFlow12 add-flows br0 flows.txt]) AT_CHECK([ovs-appctl ofproto/trace br0 'in_port=1,dl_src=50:54:00:00:00:05,dl_dst=50:54:00:00:00:07,dl_type=0x0800,nw_src=192.168.0.1,nw_dst=192.168.0.2,nw_proto=1,nw_tos=0,nw_ttl=128,nw_frag=no,icmp_type=8,icmp_code=0'], [0], [stdout]) AT_CHECK([tail -2 stdout], [0], - [Megaflow: recirc_id=0,eth,ip,in_port=1,nw_src=192.168.0.1,nw_frag=no + [Megaflow: recirc_id=0,eth,icmp,in_port=1,nw_src=192.168.0.1,nw_frag=no Datapath actions: 10,set(ipv4(src=192.168.3.91)),11,set(ipv4(src=192.168.3.90)),13 ]) OVS_VSWITCHD_STOP @@ -912,7 +912,7 @@ AT_CHECK([ovs-appctl ofproto/trace br0 'in_port=1,dl_src=50:54:00:00:00:05,dl_ds # Must match on the source address to be able to restore it's value for # the second bucket AT_CHECK([tail -2 stdout], [0], - [Megaflow: recirc_id=0,eth,ip,in_port=1,nw_src=192.168.0.1,nw_frag=no + [Megaflow: recirc_id=0,eth,icmp,in_port=1,nw_src=192.168.0.1,nw_frag=no Datapath actions: set(ipv4(src=192.168.3.90)),10,set(ipv4(src=192.168.0.1)),11 ]) OVS_VSWITCHD_STOP @@ -944,7 +944,7 @@ done AT_CHECK([ovs-appctl dpctl/dump-flows | sed 's/dp_hash(.*\/0xf)/dp_hash(0xXXXX\/0xf)/' | sed 's/packets.*actions:/actions:/' | strip_ufid | strip_used | sort], [0], [dnl flow-dump from the main thread: 
recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:hash(sym_l4(0)),recirc(0x1) -recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(src=192.168.0.1,frag=no), actions:set(ipv4(src=192.168.3.90)),10,set(ipv4(src=192.168.0.1)),10 +recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(src=192.168.0.1,proto=1,frag=no), actions:set(ipv4(src=192.168.3.90)),10,set(ipv4(src=192.168.0.1)),10 ]) OVS_VSWITCHD_STOP @@ -959,7 +959,7 @@ AT_CHECK([ovs-appctl ofproto/trace br0 'in_port=1,dl_src=50:54:00:00:00:05,dl_ds # Must match on the source address to be able to restore it's value for # the third bucket AT_CHECK([tail -2 stdout], [0], - [Megaflow: recirc_id=0,eth,ip,in_port=1,nw_src=192.168.0.1,nw_frag=no + [Megaflow: recirc_id=0,eth,icmp,in_port=1,nw_src=192.168.0.1,nw_frag=no Datapath actions: set(ipv4(src=192.168.3.90)),10,set(ipv4(src=192.168.0.1)),11 ]) OVS_VSWITCHD_STOP @@ -1536,17 +1536,17 @@ AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=111,tos=0,ttl=2,frag=no)' -generate], [0], [stdout]) AT_CHECK([tail -4 stdout], [0], [ Final flow: ip,in_port=1,vlan_tci=0x0000,dl_src=50:54:00:00:00:05,dl_dst=50:54:00:00:00:07,nw_src=192.168.0.1,nw_dst=192.168.0.2,nw_proto=111,nw_tos=0,nw_ecn=0,nw_ttl=1,nw_frag=no -Megaflow: recirc_id=0,eth,ip,in_port=1,nw_ttl=2,nw_frag=no +Megaflow: recirc_id=0,eth,ip,in_port=1,nw_proto=111,nw_ttl=2,nw_frag=no Datapath actions: set(ipv4(ttl=1)),2,userspace(pid=0,controller(reason=2,dont_send=0,continuation=0,recirc_id=1,rule_cookie=0,controller_id=0,max_len=65535)),4 ]) AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=111,tos=0,ttl=3,frag=no)'], [0], [stdout]) 
AT_CHECK([tail -2 stdout], [0], - [Megaflow: recirc_id=0,eth,ip,in_port=1,nw_ttl=3,nw_frag=no + [Megaflow: recirc_id=0,eth,ip,in_port=1,nw_proto=111,nw_ttl=3,nw_frag=no Datapath actions: set(ipv4(ttl=2)),2,set(ipv4(ttl=1)),3,4 ]) AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=10,tclass=0x70,hlimit=128,frag=no)'], [0], [stdout]) AT_CHECK([tail -2 stdout], [0], - [Megaflow: recirc_id=0,eth,ipv6,in_port=1,nw_ttl=128,nw_frag=no + [Megaflow: recirc_id=0,eth,ipv6,in_port=1,nw_proto=10,nw_ttl=128,nw_frag=no Datapath actions: set(ipv6(hlimit=127)),2,set(ipv6(hlimit=126)),3,4 ]) @@ -1656,7 +1656,7 @@ AT_CHECK([ovs-vsctl -- \ --id=@q2 create Queue dscp=2], [0], [ignore]) AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(9),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=1.1.1.1,dst=2.2.2.2,proto=1,tos=0xff,ttl=128,frag=no),icmp(type=8,code=0)'], [0], [stdout]) AT_CHECK([tail -2 stdout], [0], - [Megaflow: recirc_id=0,skb_priority=0,eth,ip,in_port=9,nw_tos=252,nw_frag=no + [Megaflow: recirc_id=0,skb_priority=0,eth,icmp,in_port=9,nw_tos=252,nw_frag=no Datapath actions: dnl 100,dnl set(ipv4(tos=0x4/0xfc)),set(skb_priority(0x1)),1,dnl @@ -11884,7 +11884,7 @@ ovs-ofctl dump-flows br0 AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.10.10.2,dst=10.10.10.1,proto=1,tos=1,ttl=128,frag=no),icmp(type=8,code=0)'], [0], [stdout]) AT_CHECK([tail -3 stdout], [0], [dnl -Megaflow: recirc_id=0,eth,ip,reg0=0/0x1,in_port=1,nw_src=10.10.10.2,nw_frag=no +Megaflow: recirc_id=0,eth,icmp,reg0=0/0x1,in_port=1,nw_src=10.10.10.2,nw_frag=no Datapath actions: drop Translation failed (Recursion too deep), packet is dropped. 
]) diff --git a/tests/ofproto.at b/tests/ofproto.at index a666bebcac4..2fa8486a86f 100644 --- a/tests/ofproto.at +++ b/tests/ofproto.at @@ -6538,3 +6538,185 @@ verify_deleted OVS_VSWITCHD_STOP(["/nw_dst,output=2 +table=0 in_port=1 priority=83,ip,nw_dst=192.168.1.15,actions=set_field:192.168.21.26->nw_src,output=2 +table=0 in_port=1 priority=82,ip,nw_dst=192.168.1.14,actions=set_field:0x40->nw_tos,output=2 +table=0 in_port=1 priority=0,actions=drop +]) +AT_CHECK([ovs-ofctl del-flows br0]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +dnl send a proto 0 packet to try and poison the DP flow path +AT_CHECK([ovs-appctl netdev-dummy/receive p1 \ + '5054000000075054000000050800450000548de140004000289fc0a801c4c0a8011408003bf60002001bbf080a640000000032ad010000000000101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f3031323334353637']) + +AT_CHECK([ovs-appctl dpctl/dump-flows], [0], [dnl +flow-dump from the main thread: +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.1.20,proto=0,frag=no), packets:0, bytes:0, used:never, actions:2 +]) + +dnl Send ICMP for mod nw_src and mod nw_dst +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=192.168.1.1,dst=192.168.1.21,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=192.168.1.1,dst=192.168.1.20,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) + +dnl send ICMP that will dec TTL +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=192.168.1.1,dst=192.168.1.10,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) + +dnl send ICMP that will mod TTL +AT_CHECK([ovs-appctl netdev-dummy/receive p1 
'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=192.168.1.1,dst=192.168.1.19,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) + +dnl send ICMP that will mod ECN +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=192.168.1.1,dst=192.168.1.18,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) + +dnl send ICMP that will mod TOS +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=192.168.1.1,dst=192.168.1.17,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) + +dnl send ICMP that will set DST +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=192.168.1.1,dst=192.168.1.16,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) + +dnl send ICMP that will set SRC +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=192.168.1.1,dst=192.168.1.15,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) + +dnl send ICMP that will set TOS +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=192.168.1.1,dst=192.168.1.14,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) + +AT_CHECK([ovs-appctl dpctl/dump-flows | sort], [0], [dnl +flow-dump from the main thread: +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.1.10,proto=1,ttl=64,frag=no), packets:0, bytes:0, used:never, actions:set(ipv4(ttl=63)),2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.1.14,proto=1,tos=0/0xfc,frag=no), packets:0, bytes:0, used:never, actions:set(ipv4(tos=0x40/0xfc)),2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.1.16,proto=1,frag=no), packets:0, bytes:0, 
used:never, actions:set(ipv4(dst=192.168.20.26)),2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.1.17,proto=1,tos=0/0xfc,frag=no), packets:0, bytes:0, used:never, actions:set(ipv4(tos=0x40/0xfc)),2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.1.18,proto=1,tos=0/0x3,frag=no), packets:0, bytes:0, used:never, actions:set(ipv4(tos=0x2/0x3)),2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.1.19,proto=1,ttl=64,frag=no), packets:0, bytes:0, used:never, actions:set(ipv4(ttl=8)),2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.1.20,proto=0,frag=no), packets:0, bytes:0, used:never, actions:2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.1.20,proto=1,frag=no), packets:0, bytes:0, used:never, actions:set(ipv4(dst=192.168.20.20)),2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(src=192.168.1.1,dst=192.168.1.15,proto=1,frag=no), packets:0, bytes:0, used:never, actions:set(ipv4(src=192.168.21.26)),2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(src=192.168.1.1,dst=192.168.1.21,proto=1,frag=no), packets:0, bytes:0, used:never, actions:set(ipv4(src=192.168.20.21)),2 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([ofproto - implicit mask of ipv6 proto with HOPOPT field]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 + +AT_DATA([flows.txt], [dnl +table=0 in_port=1 priority=77,ip6,ipv6_dst=111:db8::3,actions=dec_ttl,output=2 +table=0 in_port=1 priority=76,ip6,ipv6_dst=111:db8::4,actions=mod_nw_ttl:8,output=2 +table=0 in_port=1 priority=75,ip6,ipv6_dst=111:db8::5,actions=mod_nw_ecn:2,output=2 +table=0 in_port=1 priority=74,ip6,ipv6_dst=111:db8::6,actions=mod_nw_tos:0x40,output=2 +table=0 in_port=1 priority=73,ip6,ipv6_dst=111:db8::7,actions=set_field:2112:db8::2->ipv6_dst,output=2 +table=0 in_port=1 
priority=72,ip6,ipv6_dst=111:db8::8,actions=set_field:2112:db8::3->ipv6_src,output=2 +table=0 in_port=1 priority=72,ip6,ipv6_dst=111:db8::9,actions=set_field:44->ipv6_label,output=2 +table=0 in_port=1 priority=0,actions=drop +]) +AT_CHECK([ovs-ofctl del-flows br0]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +dnl send a proto 0 packet to try and poison the DP flow path +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x86dd),ipv6(src=2001:db8::1,dst=111:db8::3,proto=0,tclass=0,hlimit=64,frag=no)']) + +AT_CHECK([ovs-appctl dpctl/dump-flows], [0], [dnl +flow-dump from the main thread: +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x86dd),ipv6(dst=111:db8::3,proto=0,hlimit=0,frag=no), packets:0, bytes:0, used:never, actions:userspace(pid=0,controller(reason=2,dont_send=0,continuation=0,recirc_id=1,rule_cookie=0,controller_id=0,max_len=65535)) +]) + +dnl Send ICMP that will dec TTL and mod TTL +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x86dd),ipv6(src=2001:db8::1,dst=111:db8::3,proto=1,tclass=0,hlimit=64,frag=no),icmpv6(type=0,code=8)']) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x86dd),ipv6(src=2001:db8::1,dst=111:db8::4,proto=1,tclass=0,hlimit=64,frag=no),icmpv6(type=0,code=8)']) + +dnl send ICMP that will mod ECN +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x86dd),ipv6(src=2001:db8::1,dst=111:db8::5,proto=1,tclass=0,hlimit=64,frag=no),icmpv6(type=0,code=8)']) + +dnl send ICMP that will mod TOS +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x86dd),ipv6(src=2001:db8::1,dst=111:db8::6,proto=1,tclass=0,hlimit=64,frag=no),icmpv6(type=0,code=8)']) + +dnl send ICMP that will set DST +AT_CHECK([ovs-appctl 
netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x86dd),ipv6(src=2001:db8::1,dst=111:db8::7,proto=1,tclass=0,hlimit=64,frag=no),icmpv6(type=0,code=8)']) + +dnl send ICMP that will set SRC +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x86dd),ipv6(src=2001:db8::1,dst=111:db8::8,proto=1,tclass=0,hlimit=64,frag=no),icmpv6(type=0,code=8)']) + +dnl send ICMP that will set LABEL +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x86dd),ipv6(src=2001:db8::1,dst=111:db8::9,proto=1,tclass=0,hlimit=64,frag=no),icmpv6(type=0,code=8)']) + +AT_CHECK([ovs-appctl dpctl/dump-flows | sort], [0], [dnl +flow-dump from the main thread: +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x86dd),ipv6(dst=111:db8::3,proto=0,hlimit=0,frag=no), packets:0, bytes:0, used:never, actions:userspace(pid=0,controller(reason=2,dont_send=0,continuation=0,recirc_id=1,rule_cookie=0,controller_id=0,max_len=65535)) +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x86dd),ipv6(dst=111:db8::3,proto=1,hlimit=64,frag=no), packets:0, bytes:0, used:never, actions:set(ipv6(hlimit=63)),2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x86dd),ipv6(dst=111:db8::4,proto=1,hlimit=64,frag=no), packets:0, bytes:0, used:never, actions:set(ipv6(hlimit=8)),2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x86dd),ipv6(dst=111:db8::5,proto=1,tclass=0/0x3,frag=no), packets:0, bytes:0, used:never, actions:set(ipv6(tclass=0x2/0x3)),2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x86dd),ipv6(dst=111:db8::6,proto=1,tclass=0/0xfc,frag=no), packets:0, bytes:0, used:never, actions:set(ipv6(tclass=0x40/0xfc)),2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x86dd),ipv6(dst=111:db8::7,proto=1,frag=no), packets:0, bytes:0, used:never, actions:set(ipv6(dst=2112:db8::2)),2 
+recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x86dd),ipv6(dst=111:db8::9,label=0,proto=1,frag=no), packets:0, bytes:0, used:never, actions:set(ipv6(label=0x2c)),2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x86dd),ipv6(src=2001:db8::1,dst=111:db8::8,proto=1,frag=no), packets:0, bytes:0, used:never, actions:set(ipv6(src=2112:db8::3)),2 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([ofproto - implicit mask of ARP OPer field]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 + +AT_DATA([flows.txt], [dnl +table=0 in_port=1 priority=77,arp,arp_sha=00:01:02:03:04:06,actions=set_field:0x1->arp_op,2 +table=0 in_port=1 priority=76,arp,arp_sha=00:01:02:03:04:07,actions=set_field:00:02:03:04:05:06->arp_sha,2 +table=0 in_port=1 priority=75,arp,arp_sha=00:01:02:03:04:08,actions=set_field:ff:00:00:00:00:ff->arp_tha,2 +table=0 in_port=1 priority=74,arp,arp_sha=00:01:02:03:04:09,actions=set_field:172.31.110.26->arp_spa,2 +table=0 in_port=1 priority=73,arp,arp_sha=00:01:02:03:04:0a,actions=set_field:172.31.110.10->arp_tpa,2 +table=0 in_port=1 priority=1,actions=drop +]) + +AT_CHECK([ovs-ofctl del-flows br0]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +dnl Send op == 0 packet +AT_CHECK([ovs-appctl netdev-dummy/receive p1 \ + 'ffffffffffffaa55aa550000080600010800060400000001020304070c0a00010000000000000c0a0002']) + +AT_CHECK([ovs-appctl dpctl/dump-flows], [0], [dnl +flow-dump from the main thread: +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0806),arp(op=0,sha=00:01:02:03:04:07), packets:0, bytes:0, used:never, actions:2 +]) + +dnl Send op 2 -> set op +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0806),arp(sip=172.31.110.1,tip=172.31.110.25,op=2,sha=00:01:02:03:04:06,tha=ff:ff:ff:ff:ff:ff)']) + +dnl Send op 1 -> set SHA +AT_CHECK([ovs-appctl netdev-dummy/receive p1 
'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0806),arp(sip=172.31.110.1,tip=172.31.110.25,op=1,sha=00:01:02:03:04:07,tha=ff:ff:ff:ff:ff:ff)']) + +dnl Send op 1 -> set THA +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0806),arp(sip=172.31.110.1,tip=172.31.110.25,op=1,sha=00:01:02:03:04:08,tha=ff:ff:ff:ff:ff:ff)']) + +dnl Send op 1 -> set SIP +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0806),arp(sip=172.31.110.1,tip=172.31.110.25,op=1,sha=00:01:02:03:04:09,tha=ff:ff:ff:ff:ff:ff)']) + +dnl Send op 1 -> set TIP +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0806),arp(sip=172.31.110.1,tip=172.31.110.25,op=1,sha=00:01:02:03:04:0a,tha=ff:ff:ff:ff:ff:ff)']) + +AT_CHECK([ovs-appctl dpctl/dump-flows | sort], [0], [dnl +flow-dump from the main thread: +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0806),arp(op=0,sha=00:01:02:03:04:07), packets:0, bytes:0, used:never, actions:2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0806),arp(op=1,sha=00:01:02:03:04:07), packets:0, bytes:0, used:never, actions:userspace(pid=0,slow_path(action)) +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0806),arp(op=1,sha=00:01:02:03:04:08,tha=ff:ff:ff:ff:ff:ff), packets:0, bytes:0, used:never, actions:userspace(pid=0,slow_path(action)) +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0806),arp(op=2,sha=00:01:02:03:04:06), packets:0, bytes:0, used:never, actions:userspace(pid=0,slow_path(action)) +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0806),arp(sip=172.31.110.1,op=1,sha=00:01:02:03:04:09), packets:0, bytes:0, used:never, actions:userspace(pid=0,slow_path(action)) +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0806),arp(tip=172.31.110.25,op=1,sha=00:01:02:03:04:0a), packets:0, 
bytes:0, used:never, actions:userspace(pid=0,slow_path(action)) +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP diff --git a/tests/packet-type-aware.at b/tests/packet-type-aware.at index acfb0913169..14cebf6efa5 100644 --- a/tests/packet-type-aware.at +++ b/tests/packet-type-aware.at @@ -1021,7 +1021,7 @@ AT_CHECK([ ], [0], [flow-dump from the main thread: recirc_id(0),in_port(p0),packet_type(ns=0,id=0),eth(src=aa:bb:cc:00:00:02,dst=aa:bb:cc:00:00:01),eth_type(0x0800),ipv4(dst=20.0.0.1,proto=47,frag=no), packets:3, bytes:378, used:0.0s, actions:tnl_pop(gre_sys) recirc_id(0),tunnel(src=20.0.0.2,dst=20.0.0.1,flags(-df-csum)),in_port(gre_sys),packet_type(ns=1,id=0x8847),eth_type(0x8847),mpls(label=999/0x0,tc=0/0,ttl=64/0x0,bos=1/1), packets:3, bytes:264, used:0.0s, actions:push_eth(src=00:00:00:00:00:00,dst=00:00:00:00:00:00),pop_mpls(eth_type=0x800),recirc(0x1) -recirc_id(0x1),tunnel(src=20.0.0.2,dst=20.0.0.1,flags(-df-csum)),in_port(gre_sys),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(ttl=64,frag=no), packets:3, bytes:294, used:0.0s, actions:set(ipv4(ttl=63)),int-br +recirc_id(0x1),tunnel(src=20.0.0.2,dst=20.0.0.1,flags(-df-csum)),in_port(gre_sys),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(proto=1,ttl=64,frag=no), packets:3, bytes:294, used:0.0s, actions:set(ipv4(ttl=63)),int-br ]) ovs-appctl time/warp 1000 From 8cba7a76d55c4e01f8cb394fb99063adb1b77dce Mon Sep 17 00:00:00 2001 From: Songtao Zhan Date: Tue, 4 Apr 2023 11:16:35 +0800 Subject: [PATCH 219/833] ovs-tcpdump: Stdout is shutdown before ovs-tcpdump exit. If there is a pipe behind ovs-tcpdump (such as ovs-tcpdump -i eth0 | grep "192.168.1.1"), the child process (grep "192.168.1.1") may exit first and close the pipe when it receives SIGTERM. When the parent process (ovs-tcpdump) exits, stdout is flushed into the broken pipe, and an IOError exception is raised. To avoid such problems, ovs-tcpdump first closes stdout before exiting. 
Signed-off-by: Songtao Zhan Signed-off-by: Ilya Maximets --- utilities/ovs-tcpdump.in | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/utilities/ovs-tcpdump.in b/utilities/ovs-tcpdump.in index a49ec9f9426..420c11eb8a6 100755 --- a/utilities/ovs-tcpdump.in +++ b/utilities/ovs-tcpdump.in @@ -538,6 +538,17 @@ def main(): print(data.decode('utf-8')) raise KeyboardInterrupt except KeyboardInterrupt: + # If there is a pipe behind ovs-tcpdump (such as ovs-tcpdump + # -i eth0 | grep "192.168.1.1"), the pipe is no longer available + # after received Ctrl+C. + # If we write data to an unavailable pipe, a pipe error will be + # reported, so we turn off stdout to avoid subsequent flushing + # of data into the pipe. + try: + sys.stdout.close() + except IOError: + pass + if pipes.poll() is None: pipes.terminate() From 7864b380d8dd88b74bed6e6fbf48c65dff5afb0e Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 6 Apr 2023 22:45:27 +0200 Subject: [PATCH 220/833] AUTHORS: Add Songtao Zhan. Signed-off-by: Ilya Maximets --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 00a6dd5f7ce..0b5408a30d1 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -412,6 +412,7 @@ Shu Shen shu.shen@radisys.com Simon Horman simon.horman@corigine.com Sivaprasad Tummala sivaprasad.tummala@intel.com Somnath Chatterjee somnath.b.chatterjee@ericsson.com +Songtao Zhan zhanst1@chinatelecom.cn Sorin Vinturis svinturis@cloudbasesolutions.com Sriharsha Basavapatna sriharsha.basavapatna@broadcom.com Steffen Gebert steffen.gebert@informatik.uni-wuerzburg.de From 75eae65602c8b665d882bfb9bb8259259ad95a4a Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 5 Apr 2023 21:41:44 +0200 Subject: [PATCH 221/833] github: Test building Fedora RPMs. Testing that RPMs can be built to catch possible spec file issues like missing dependencies. GitHub seems to have an agreement with Docker Hub about rate limiting of image downloads, so it should not affect us. 
We may switch to quay.io if that will ever become a problem in the future. Reviewed-by: David Marchand Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- .github/workflows/build-and-test.yml | 37 ++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 82675b9734d..39649c1b5cd 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -242,3 +242,40 @@ jobs: with: name: deb-packages-${{ matrix.dpdk }}-dpdk path: '/home/runner/work/ovs/*.deb' + + build-linux-rpm: + name: linux rpm fedora + runs-on: ubuntu-latest + container: fedora:37 + timeout-minutes: 30 + + strategy: + fail-fast: false + + steps: + - name: checkout + uses: actions/checkout@v3 + - name: install dependencies + run: | + dnf install -y rpm-build dnf-plugins-core + sed -e 's/@VERSION@/0.0.1/' rhel/openvswitch-fedora.spec.in \ + > /tmp/ovs.spec + dnf builddep -y /tmp/ovs.spec + rm -f /tmp/ovs.spec + + - name: configure + run: ./boot.sh && ./configure + + - name: build + run: make rpm-fedora + + - name: install + run: dnf install -y rpm/rpmbuild/RPMS/*/*.rpm + + - name: upload rpm packages + uses: actions/upload-artifact@v3 + with: + name: rpm-packages + path: | + rpm/rpmbuild/SRPMS/*.rpm + rpm/rpmbuild/RPMS/*/*.rpm From 9fa612959cfb37115aac2678f10f1538b755c797 Mon Sep 17 00:00:00 2001 From: Paolo Valerio Date: Thu, 6 Apr 2023 12:10:22 +0200 Subject: [PATCH 222/833] ovs-dpctl: Add new command dpctl/ct-[sg]et-sweep-interval. Since 3d9c1b855a5f ("conntrack: Replace timeout based expiration lists with rculists.") the sweep interval changed as well as the constraints related to the sweeper. Being able to change the default reschedule time may be convenient in some conditions, like debugging. This patch introduces new commands allowing to get and set the sweep interval in ms. 
Signed-off-by: Paolo Valerio Reviewed-by: Simon Horman Signed-off-by: Ilya Maximets --- NEWS | 3 ++ lib/conntrack-private.h | 1 + lib/conntrack.c | 18 +++++++++++- lib/conntrack.h | 2 ++ lib/ct-dpif.c | 14 ++++++++++ lib/ct-dpif.h | 1 + lib/dpctl.c | 61 +++++++++++++++++++++++++++++++++++++++++ lib/dpctl.man | 9 ++++++ lib/dpif-netdev.c | 17 ++++++++++++ lib/dpif-netlink.c | 2 ++ lib/dpif-provider.h | 4 +++ tests/ofproto-dpif.at | 22 +++++++++++++++ 12 files changed, 153 insertions(+), 1 deletion(-) diff --git a/NEWS b/NEWS index b6418c36e95..1155bfbb146 100644 --- a/NEWS +++ b/NEWS @@ -11,6 +11,9 @@ Post-v3.1.0 - ovs-appctl: * Add support for selecting the source address with the 'ovs-appctl ovs/route/add' command. + * New commands "dpctl/{ct-get-sweep-interval,ct-set-sweep-interval}" that + allow to get and set, for the userspace datapath, the sweep interval + for the conntrack garbage collector. - ovs-ctl: * Added new options --[ovsdb-server|ovs-vswitchd]-umask=MODE to set umask value when starting OVS daemons. E.g., use --ovsdb-server-umask=0002 diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h index fae8b3a9baf..bb326868e9d 100644 --- a/lib/conntrack-private.h +++ b/lib/conntrack-private.h @@ -224,6 +224,7 @@ struct conntrack { struct ipf *ipf; /* Fragmentation handling context. */ uint32_t zone_limit_seq; /* Used to disambiguate zone limit counts. */ atomic_bool tcp_seq_chk; /* Check TCP sequence numbers. */ + atomic_uint32_t sweep_ms; /* Next sweep interval. 
*/ }; /* Lock acquisition order: diff --git a/lib/conntrack.c b/lib/conntrack.c index f86fa26f466..ce8a63de5b8 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -320,6 +320,7 @@ conntrack_init(void) atomic_count_init(&ct->n_conn, 0); atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT); atomic_init(&ct->tcp_seq_chk, true); + atomic_init(&ct->sweep_ms, 20000); latch_init(&ct->clean_thread_exit); ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct); ct->ipf = ipf_init(); @@ -1480,6 +1481,21 @@ set_label(struct dp_packet *pkt, struct conn *conn, } +int +conntrack_set_sweep_interval(struct conntrack *ct, uint32_t ms) +{ + atomic_store_relaxed(&ct->sweep_ms, ms); + return 0; +} + +uint32_t +conntrack_get_sweep_interval(struct conntrack *ct) +{ + uint32_t ms; + atomic_read_relaxed(&ct->sweep_ms, &ms); + return ms; +} + static size_t ct_sweep(struct conntrack *ct, struct rculist *list, long long now) OVS_NO_THREAD_SAFETY_ANALYSIS @@ -1504,7 +1520,7 @@ ct_sweep(struct conntrack *ct, struct rculist *list, long long now) static long long conntrack_clean(struct conntrack *ct, long long now) { - long long next_wakeup = now + 20 * 1000; + long long next_wakeup = now + conntrack_get_sweep_interval(ct); unsigned int n_conn_limit, i; size_t clean_end, count = 0; diff --git a/lib/conntrack.h b/lib/conntrack.h index b064abc9fa4..524ec0acb32 100644 --- a/lib/conntrack.h +++ b/lib/conntrack.h @@ -139,6 +139,8 @@ int conntrack_set_maxconns(struct conntrack *ct, uint32_t maxconns); int conntrack_get_maxconns(struct conntrack *ct, uint32_t *maxconns); int conntrack_get_nconns(struct conntrack *ct, uint32_t *nconns); int conntrack_set_tcp_seq_chk(struct conntrack *ct, bool enabled); +int conntrack_set_sweep_interval(struct conntrack *ct, uint32_t ms); +uint32_t conntrack_get_sweep_interval(struct conntrack *ct); bool conntrack_get_tcp_seq_chk(struct conntrack *ct); struct ipf *conntrack_ipf_ctx(struct conntrack *ct); struct conntrack_zone_limit zone_limit_get(struct 
conntrack *ct, diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c index d3b2783ce49..0c4b2964ff6 100644 --- a/lib/ct-dpif.c +++ b/lib/ct-dpif.c @@ -368,6 +368,20 @@ ct_dpif_del_limits(struct dpif *dpif, const struct ovs_list *zone_limits) : EOPNOTSUPP); } +int +ct_dpif_sweep(struct dpif *dpif, uint32_t *ms) +{ + if (*ms) { + return (dpif->dpif_class->ct_set_sweep_interval + ? dpif->dpif_class->ct_set_sweep_interval(dpif, *ms) + : EOPNOTSUPP); + } else { + return (dpif->dpif_class->ct_get_sweep_interval + ? dpif->dpif_class->ct_get_sweep_interval(dpif, ms) + : EOPNOTSUPP); + } +} + int ct_dpif_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable) { diff --git a/lib/ct-dpif.h b/lib/ct-dpif.h index 5edbbfd3bdc..5579ac9253b 100644 --- a/lib/ct-dpif.h +++ b/lib/ct-dpif.h @@ -298,6 +298,7 @@ int ct_dpif_set_limits(struct dpif *dpif, const uint32_t *default_limit, int ct_dpif_get_limits(struct dpif *dpif, uint32_t *default_limit, const struct ovs_list *, struct ovs_list *); int ct_dpif_del_limits(struct dpif *dpif, const struct ovs_list *); +int ct_dpif_sweep(struct dpif *, uint32_t *ms); int ct_dpif_ipf_set_enabled(struct dpif *, bool v6, bool enable); int ct_dpif_ipf_set_min_frag(struct dpif *, bool v6, uint32_t min_frag); int ct_dpif_ipf_set_max_nfrags(struct dpif *, uint32_t max_frags); diff --git a/lib/dpctl.c b/lib/dpctl.c index 59cc4f58c98..3ba40fa8fb6 100644 --- a/lib/dpctl.c +++ b/lib/dpctl.c @@ -2300,6 +2300,65 @@ dpctl_ct_get_limits(int argc, const char *argv[], return error; } +static int +dpctl_ct_get_sweep(int argc, const char *argv[], + struct dpctl_params *dpctl_p) +{ + uint32_t sweep_ms = 0; + struct dpif *dpif; + + int error = opt_dpif_open(argc, argv, dpctl_p, 2, &dpif); + if (error) { + return error; + } + + error = ct_dpif_sweep(dpif, &sweep_ms); + if (error) { + dpctl_error(dpctl_p, error, "failed to get the sweep interval"); + } else { + dpctl_print(dpctl_p, "%"PRIu32, sweep_ms); + } + + dpif_close(dpif); + return error; +} + +static int 
+dpctl_ct_set_sweep(int argc, const char *argv[], + struct dpctl_params *dpctl_p) +{ + struct ds ds = DS_EMPTY_INITIALIZER; + uint32_t sweep_ms = 0; + struct dpif *dpif; + + int error = opt_dpif_open(argc, argv, dpctl_p, 3, &dpif); + if (error) { + return error; + } + + if (!ovs_scan(argv[argc - 1], "%"SCNu32, &sweep_ms) || + sweep_ms == 0) { + ds_put_format(&ds, "invalid sweep value"); + error = EINVAL; + goto error; + } + + error = ct_dpif_sweep(dpif, &sweep_ms); + if (!error) { + dpctl_print(dpctl_p, "setting sweep interval successful\n"); + goto out; + } + + ds_put_format(&ds, "failed to set the sweep interval"); + +error: + dpctl_error(dpctl_p, error, "%s", ds_cstr(&ds)); + ds_destroy(&ds); +out: + dpif_close(dpif); + return error; +} + static int ipf_set_enabled__(int argc, const char *argv[], struct dpctl_params *dpctl_p, bool enabled) @@ -2913,6 +2972,8 @@ static const struct dpctl_command all_commands[] = { DP_RO }, { "ct-get-limits", "[dp] [zone=N1[,N2]...]", 0, 2, dpctl_ct_get_limits, DP_RO }, + { "ct-get-sweep-interval", "[dp]", 0, 1, dpctl_ct_get_sweep, DP_RO }, + { "ct-set-sweep-interval", "[dp] ms", 1, 2, dpctl_ct_set_sweep, DP_RW }, { "ipf-set-enabled", "[dp] v4|v6", 1, 2, dpctl_ipf_set_enabled, DP_RW }, { "ipf-set-disabled", "[dp] v4|v6", 1, 2, dpctl_ipf_set_disabled, DP_RW }, { "ipf-set-min-frag", "[dp] v4|v6 minfragment", 2, 3, diff --git a/lib/dpctl.man b/lib/dpctl.man index 920446e8cb6..d448596d353 100644 --- a/lib/dpctl.man +++ b/lib/dpctl.man @@ -382,6 +382,15 @@ Prints whether TCP sequence checking is enabled or disabled on \fIdp\fR. Only supported for the userspace datapath. . .TP +\*(DX\fBct\-set\-sweep\-interval\fR [\fIdp\fR] \fIms\fR +Sets the sweep interval. Only supported for the userspace datapath. +. +.TP +\*(DX\fBct\-get\-sweep\-interval\fR [\fIdp\fR] +Prints the current sweep interval in ms. Only supported for the userspace +datapath. +. 
+.TP \*(DX\fBct\-set\-limits\fR [\fIdp\fR] [\fBdefault=\fIdefault_limit\fR] [\fBzone=\fIzone\fR,\fBlimit=\fIlimit\fR]... Sets the maximum allowed number of connections in a connection tracking zone. A specific \fIzone\fR may be set to \fIlimit\fR, and multiple zones diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index aed2c8fbbe9..70b953ae6dd 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -9317,6 +9317,21 @@ dpif_netdev_ct_get_tcp_seq_chk(struct dpif *dpif, bool *enabled) return 0; } +static int +dpif_netdev_ct_set_sweep_interval(struct dpif *dpif, uint32_t ms) +{ + struct dp_netdev *dp = get_dp_netdev(dpif); + return conntrack_set_sweep_interval(dp->conntrack, ms); +} + +static int +dpif_netdev_ct_get_sweep_interval(struct dpif *dpif, uint32_t *ms) +{ + struct dp_netdev *dp = get_dp_netdev(dpif); + *ms = conntrack_get_sweep_interval(dp->conntrack); + return 0; +} + static int dpif_netdev_ct_set_limits(struct dpif *dpif, const uint32_t *default_limits, @@ -9668,6 +9683,8 @@ const struct dpif_class dpif_netdev_class = { dpif_netdev_ct_get_nconns, dpif_netdev_ct_set_tcp_seq_chk, dpif_netdev_ct_get_tcp_seq_chk, + dpif_netdev_ct_set_sweep_interval, + dpif_netdev_ct_get_sweep_interval, dpif_netdev_ct_set_limits, dpif_netdev_ct_get_limits, dpif_netdev_ct_del_limits, diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c index 55b5b0a8549..de071127788 100644 --- a/lib/dpif-netlink.c +++ b/lib/dpif-netlink.c @@ -4572,6 +4572,8 @@ const struct dpif_class dpif_netlink_class = { NULL, /* ct_get_nconns */ NULL, /* ct_set_tcp_seq_chk */ NULL, /* ct_get_tcp_seq_chk */ + NULL, /* ct_set_sweep_interval */ + NULL, /* ct_get_sweep_interval */ dpif_netlink_ct_set_limits, dpif_netlink_ct_get_limits, dpif_netlink_ct_del_limits, diff --git a/lib/dpif-provider.h b/lib/dpif-provider.h index b8ead8a02a0..a33c6ec3089 100644 --- a/lib/dpif-provider.h +++ b/lib/dpif-provider.h @@ -493,6 +493,10 @@ struct dpif_class { int (*ct_set_tcp_seq_chk)(struct dpif *, bool enabled); /* Get 
the TCP sequence checking configuration. */ int (*ct_get_tcp_seq_chk)(struct dpif *, bool *enabled); + /* Updates the sweep interval for the CT sweeper. */ + int (*ct_set_sweep_interval)(struct dpif *, uint32_t ms); + /* Get the current value of the sweep interval for the CT sweeper. */ + int (*ct_get_sweep_interval)(struct dpif *, uint32_t *ms); /* Connection tracking per zone limit */ diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at index 62291de4ac1..6824ce0bbfe 100644 --- a/tests/ofproto-dpif.at +++ b/tests/ofproto-dpif.at @@ -11721,6 +11721,28 @@ AT_CHECK([tail -1 stdout], [0], OVS_VSWITCHD_STOP AT_CLEANUP +dnl Checks the get/set sweep interval +AT_SETUP([ofproto-dpif - conntrack - change sweep interval]) +OVS_VSWITCHD_START + +# Check the default value. +AT_CHECK([ovs-appctl dpctl/ct-get-sweep-interval], [0], [dnl +20000 +]) + +# Set the interval to 5s. +AT_CHECK([ovs-appctl dpctl/ct-set-sweep-interval 5000], [0], [dnl +setting sweep interval successful +]) + +# Verify that the previous value has been applied. +AT_CHECK([ovs-appctl dpctl/ct-get-sweep-interval], [0], [dnl +5000 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([ofproto - set mtu]) OVS_VSWITCHD_START From d70688a7291edb432fd66b9230a92842fcfd3607 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Tue, 18 Apr 2023 14:58:07 +0200 Subject: [PATCH 223/833] system-offloads-traffic: Fix tc ingress pps check for meter offload. Caught during some code review. SUPPORT_TC_INGRESS_PPS has been replaced with CHECK_TC_INGRESS_PPS(). 
Fixes: 5f0fdf5e2c2e ("test: Move check for tc ingress pps support to test script.") Signed-off-by: David Marchand Signed-off-by: Simon Horman --- tests/system-offloads-traffic.at | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system-offloads-traffic.at b/tests/system-offloads-traffic.at index da18597cd85..ae302a29499 100644 --- a/tests/system-offloads-traffic.at +++ b/tests/system-offloads-traffic.at @@ -240,7 +240,7 @@ AT_CLEANUP AT_SETUP([offloads - check interface meter offloading - offloads enabled]) AT_KEYWORDS([offload-meter]) -AT_SKIP_IF([test $SUPPORT_TC_INGRESS_PPS = "no"]) +CHECK_TC_INGRESS_PPS() AT_SKIP_IF([test $HAVE_NC = "no"]) OVS_TRAFFIC_VSWITCHD_START([], [], [-- set Open_vSwitch . other_config:hw-offload=true]) From 5575539f6c98cbec91f955805ae079899396f521 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 27 Mar 2023 21:42:56 +0200 Subject: [PATCH 224/833] ovsdb-tool: Fix cluster-to-standalone for DB conversion records. If database conversion happens, both schema and the new data are present in the database record. However, the schema is just silently ignored by ovsdb-tool cluster-to-standalone. This creates data inconsistency if the new data contains new columns, for example, so the resulting database file will not be readable, or data will be lost. Fix that by re-setting the database whenever a conversion record is found and actually writing a new schema that will match the actual data. The database file will not be that similar to the original, but there is no way to represent conversion in a standalone database file format otherwise. 
Fixes: 00de46f9ee42 ("ovsdb-tool: Convert clustered db to standalone db.") Reviewed-by: Simon Horman Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- ovsdb/log.c | 17 +++++++++++ ovsdb/log.h | 3 ++ ovsdb/ovsdb-tool.c | 18 ++++++++++++ tests/ovsdb-tool.at | 69 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 107 insertions(+) diff --git a/ovsdb/log.c b/ovsdb/log.c index e42f002464b..fff7c6ba104 100644 --- a/ovsdb/log.c +++ b/ovsdb/log.c @@ -552,6 +552,23 @@ ovsdb_log_truncate(struct ovsdb_log *file) return error; } +/* Removes all the data from the log by moving current offset to zero and + * truncating the file to zero bytes. After this operation the file is empty + * and in a write state. */ +struct ovsdb_error * OVS_WARN_UNUSED_RESULT +ovsdb_log_reset(struct ovsdb_log *file) +{ + ovsdb_error_destroy(file->error); + file->offset = file->prev_offset = 0; + file->error = ovsdb_log_truncate(file); + if (file->error) { + file->state = OVSDB_LOG_WRITE_ERROR; + return ovsdb_error_clone(file->error); + } + file->state = OVSDB_LOG_WRITE; + return NULL; +} + /* Composes a log record for 'json' by filling 'header' with a header line and * 'data' with a data line (each ending with a new-line). To write the record * to a file, write 'header' followed by 'data'. 
diff --git a/ovsdb/log.h b/ovsdb/log.h index 90714ea1319..63e5681a0b6 100644 --- a/ovsdb/log.h +++ b/ovsdb/log.h @@ -66,6 +66,9 @@ struct ovsdb_error *ovsdb_log_read(struct ovsdb_log *, struct json **) OVS_WARN_UNUSED_RESULT; void ovsdb_log_unread(struct ovsdb_log *); +struct ovsdb_error *ovsdb_log_reset(struct ovsdb_log *) + OVS_WARN_UNUSED_RESULT; + void ovsdb_log_compose_record(const struct json *, const char *magic, struct ds *header, struct ds *data); diff --git a/ovsdb/ovsdb-tool.c b/ovsdb/ovsdb-tool.c index 60f353197bf..ea2b75b4671 100644 --- a/ovsdb/ovsdb-tool.c +++ b/ovsdb/ovsdb-tool.c @@ -1018,7 +1018,25 @@ raft_record_to_standalone_log(const struct raft_record *r, if (pa->n != 2) { ovs_fatal(0, "Incorrect raft record array length"); } + + struct json *schema_json = pa->elems[0]; struct json *data_json = pa->elems[1]; + + if (schema_json->type != JSON_NULL) { + /* This is a database conversion record. Reset the log and + * write the new schema. Data JSON should also be part of + * the conversion. 
*/ + struct ovsdb_schema *schema; + + if (data_json->type == JSON_NULL) { + ovs_fatal( + 0, "Invalid database conversion in the log: no data"); + } + check_ovsdb_error(ovsdb_schema_from_json(schema_json, &schema)); + ovsdb_schema_destroy(schema); + check_ovsdb_error(ovsdb_log_reset(db_log_data)); + check_ovsdb_error(ovsdb_log_write(db_log_data, schema_json)); + } if (data_json->type != JSON_NULL) { check_ovsdb_error(ovsdb_log_write(db_log_data, data_json)); } diff --git a/tests/ovsdb-tool.at b/tests/ovsdb-tool.at index 12ad6fb3fc6..5496ccda77d 100644 --- a/tests/ovsdb-tool.at +++ b/tests/ovsdb-tool.at @@ -465,6 +465,7 @@ AT_SETUP([ovsdb-tool convert-to-standalone]) AT_KEYWORDS([ovsdb file positive]) ordinal_schema > schema AT_CHECK([ovsdb-tool create-cluster db schema unix:s1.raft], [0], [stdout], [ignore]) +on_exit 'kill `cat ovsdb-server.pid`' AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file db >/dev/null 2>&1]) for txn in m4_foreach([txn], [[[["ordinals", {"op": "insert", @@ -498,3 +499,71 @@ OVS_APP_EXIT_AND_WAIT([ovsdb-server]) # Make sure both standalone and cluster db data matches. AT_CHECK([diff standalonedump clusterdump]) AT_CLEANUP + +AT_SETUP([ovsdb-tool convert-to-standalone after schema conversion]) +AT_KEYWORDS([ovsdb file positive]) +ordinal_schema > schema +AT_CHECK([ovsdb-tool create-cluster db schema unix:s1.raft], [0], [stdout], [ignore]) +on_exit 'kill `cat ovsdb-server.pid`' +AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket dnl + --log-file db >/dev/null 2>&1]) +for txn in m4_foreach([txn], [[[["ordinals", + {"op": "insert", + "table": "ordinals", + "row": {"number": 0, "name": "zero"}}, + {"op": "insert", + "table": "ordinals", + "row": {"number": 1, "name": "one"}}, + {"op": "insert", + "table": "ordinals", + "row": {"number": 2, "name": "two"}}]]]], ['txn' ]); do + AT_CHECK([ovsdb-client transact unix:socket "$txn"], [0], [ignore], [ignore]) +done + +dnl Change the schema. 
+AT_CHECK([sed 's/5\.1\.3/5.1.4/' < schema > schema2]) +AT_CHECK([sed -i'back' -e '/.*"number":.*/a \ + "is_seven": {"type": "boolean"}, + ' schema2]) + +dnl Convert the database. +AT_CHECK([ovsdb-client convert unix:socket schema2]) + +dnl Add a new row with a new column. +AT_CHECK([ovsdb-client transact unix:socket dnl + '[["ordinals", + {"op": "insert", + "table": "ordinals", + "row": {"number": 7, "name": "seven", "is_seven": true} + }]]'], [0], [ignore], [ignore]) + +AT_CHECK([ovsdb-client dump unix:socket > clusterdump]) + +AT_CHECK([uuidfilt clusterdump], [0], [dnl +ordinals table +_uuid is_seven name number +------------------------------------ -------- ----- ------ +<0> false one 1 +<1> false two 2 +<2> false zero 0 +<3> true seven 7 +]) + +OVS_APP_EXIT_AND_WAIT([ovsdb-server]) + +dnl Convert to standalone database from clustered database. +AT_CHECK(ovsdb-tool cluster-to-standalone db1 db) + +dnl Check it's a standalone db. +AT_CHECK([ovsdb-tool db-is-standalone db1]) + +dnl Dump the standalone db data. +AT_CHECK([ovsdb-server -vconsole:off -vfile -vvlog:off --detach --no-chdir dnl + --pidfile --log-file --remote=punix:db.sock db1]) +AT_CHECK([ovsdb_client_wait ordinals connected]) +AT_CHECK([ovsdb-client dump > standalonedump]) +OVS_APP_EXIT_AND_WAIT([ovsdb-server]) + +dnl Make sure both standalone and cluster db data matches. +AT_CHECK([diff standalonedump clusterdump]) +AT_CLEANUP From a73b0206ba6f3991ac1550c7c07f11fa4237a898 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 27 Mar 2023 21:42:57 +0200 Subject: [PATCH 225/833] ovsdb: Check for ephemeral columns before writing a new schema. Clustered databases do not support ephemeral columns, but ovsdb-server checks for them after the conversion result is read from the storage. It's much easier to recover if this constraint is checked before writing to the storage instead. It's not a big problem, because the check is always performed by the native ovsdb clients before sending a conversion request. 
But the server, in general, should not trust clients to do the right thing. Check in the update_schema() remains, because we shouldn't blindly trust the storage. Reviewed-by: Simon Horman Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- ovsdb/storage.c | 24 +++++++++++++++++------- ovsdb/storage.h | 2 +- ovsdb/transaction.c | 2 +- ovsdb/transaction.h | 3 ++- ovsdb/trigger.c | 5 +++-- 5 files changed, 24 insertions(+), 12 deletions(-) diff --git a/ovsdb/storage.c b/ovsdb/storage.c index e8f95ce6428..6c395106c01 100644 --- a/ovsdb/storage.c +++ b/ovsdb/storage.c @@ -623,7 +623,7 @@ ovsdb_storage_store_snapshot(struct ovsdb_storage *storage, struct ovsdb_write * OVS_WARN_UNUSED_RESULT ovsdb_storage_write_schema_change(struct ovsdb_storage *storage, - const struct json *schema, + const struct ovsdb_schema *schema, const struct json *data, const struct uuid *prereq, struct uuid *resultp) @@ -633,13 +633,23 @@ ovsdb_storage_write_schema_change(struct ovsdb_storage *storage, if (storage->error) { w->error = ovsdb_error_clone(storage->error); } else if (storage->raft) { - struct json *txn_json = json_array_create_2(json_clone(schema), - json_clone(data)); - w->command = raft_command_execute(storage->raft, txn_json, - prereq, &result); - json_destroy(txn_json); + /* Clustered storage doesn't support ephemeral columns. 
*/ + w->error = ovsdb_schema_check_for_ephemeral_columns(schema); + if (!w->error) { + struct json *schema_json, *txn_json; + + schema_json = ovsdb_schema_to_json(schema); + txn_json = json_array_create_2(schema_json, json_clone(data)); + w->command = raft_command_execute(storage->raft, txn_json, + prereq, &result); + json_destroy(txn_json); + } } else if (storage->log) { - w->error = ovsdb_storage_store_snapshot__(storage, schema, data, 0); + struct json *schema_json = ovsdb_schema_to_json(schema); + + w->error = ovsdb_storage_store_snapshot__(storage, schema_json, + data, 0); + json_destroy(schema_json); } else { /* When 'error' and 'command' are both null, it indicates that the * command is complete. This is fine since this unbacked storage drops diff --git a/ovsdb/storage.h b/ovsdb/storage.h index a1fdaa564e4..05f40ce934a 100644 --- a/ovsdb/storage.h +++ b/ovsdb/storage.h @@ -85,7 +85,7 @@ struct ovsdb_error *ovsdb_storage_store_snapshot(struct ovsdb_storage *storage, struct ovsdb_write *ovsdb_storage_write_schema_change( struct ovsdb_storage *, - const struct json *schema, const struct json *data, + const struct ovsdb_schema *, const struct json *data, const struct uuid *prereq, struct uuid *result) OVS_WARN_UNUSED_RESULT; diff --git a/ovsdb/transaction.c b/ovsdb/transaction.c index 03541af85d7..f01de2a34fd 100644 --- a/ovsdb/transaction.c +++ b/ovsdb/transaction.c @@ -1251,7 +1251,7 @@ ovsdb_txn_precheck_prereq(const struct ovsdb *db) struct ovsdb_txn_progress * ovsdb_txn_propose_schema_change(struct ovsdb *db, - const struct json *schema, + const struct ovsdb_schema *schema, const struct json *data) { struct ovsdb_txn_progress *progress = xzalloc(sizeof *progress); diff --git a/ovsdb/transaction.h b/ovsdb/transaction.h index 6b5bb7f24b2..9991f34d24c 100644 --- a/ovsdb/transaction.h +++ b/ovsdb/transaction.h @@ -21,6 +21,7 @@ struct json; struct ovsdb; +struct ovsdb_schema; struct ovsdb_table; struct uuid; @@ -41,7 +42,7 @@ struct ovsdb_error 
*ovsdb_txn_propose_commit_block(struct ovsdb_txn *, void ovsdb_txn_complete(struct ovsdb_txn *); struct ovsdb_txn_progress *ovsdb_txn_propose_schema_change( - struct ovsdb *, const struct json *schema, const struct json *data); + struct ovsdb *, const struct ovsdb_schema *, const struct json *data); bool ovsdb_txn_progress_is_complete(const struct ovsdb_txn_progress *); const struct ovsdb_error *ovsdb_txn_progress_get_error( diff --git a/ovsdb/trigger.c b/ovsdb/trigger.c index 01bb80e282b..3c93ae580f3 100644 --- a/ovsdb/trigger.c +++ b/ovsdb/trigger.c @@ -274,8 +274,8 @@ ovsdb_trigger_try(struct ovsdb_trigger *t, long long int now) if (!error) { error = ovsdb_convert(t->db, new_schema, &newdb); } - ovsdb_schema_destroy(new_schema); if (error) { + ovsdb_schema_destroy(new_schema); trigger_convert_error(t, error); return false; } @@ -286,7 +286,8 @@ ovsdb_trigger_try(struct ovsdb_trigger *t, long long int now) /* Propose the change. */ t->progress = ovsdb_txn_propose_schema_change( - t->db, new_schema_json, txn_json); + t->db, new_schema, txn_json); + ovsdb_schema_destroy(new_schema); json_destroy(txn_json); t->reply = jsonrpc_create_reply(json_object_create(), t->request->id); From 4d6cdd8e0d86d2b3b6866aaacf327d8c5e7092df Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 27 Mar 2023 21:42:58 +0200 Subject: [PATCH 226/833] ovsdb: Allow conversion records with no data in a clustered storage. If the schema with no data was read from the clustered storage, it should mean a database conversion request. In general, we can get: 1. Just data --> Transaction record. 2. Schema + Data --> Database conversion or raft snapshot install. 3. Just schema --> New. Database conversion request. We cannot distinguish between conversion and snapshot installation request in the current implementation, so we will keep handling conversion with data in the same way as before, i.e. if data is provided, we should use it. 
ovsdb-tool is updated to handle this record type as well while converting cluster to standalone. This change doesn't introduce a way for such records to appear in the database. That will be added in the future commits targeting conversion speed increase. Reviewed-by: Simon Horman Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- ovsdb/ovsdb-server.c | 65 ++++++++++++++++++++++++++++++-------------- ovsdb/ovsdb-tool.c | 35 ++++++++++++++++++------ ovsdb/relay.c | 22 ++++++++++++--- ovsdb/relay.h | 7 +++-- 4 files changed, 93 insertions(+), 36 deletions(-) diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c index 4fea2dbda7b..91c284e99c9 100644 --- a/ovsdb/ovsdb-server.c +++ b/ovsdb/ovsdb-server.c @@ -573,8 +573,9 @@ close_db(struct server_config *config, struct db *db, char *comment) } } -static void -update_schema(struct ovsdb *db, const struct ovsdb_schema *schema, void *aux) +static struct ovsdb_error * OVS_WARN_UNUSED_RESULT +update_schema(struct ovsdb *db, const struct ovsdb_schema *schema, + bool conversion_with_no_data, void *aux) { struct server_config *config = aux; @@ -586,13 +587,27 @@ update_schema(struct ovsdb *db, const struct ovsdb_schema *schema, void *aux) : xasprintf("database %s connected to storage", db->name))); } - ovsdb_replace(db, ovsdb_create(ovsdb_schema_clone(schema), NULL)); + if (db->schema && conversion_with_no_data) { + struct ovsdb *new_db = NULL; + struct ovsdb_error *error; + + error = ovsdb_convert(db, schema, &new_db); + if (error) { + /* Should never happen, because conversion should have been + * checked before writing the schema to the storage. */ + return error; + } + ovsdb_replace(db, new_db); + } else { + ovsdb_replace(db, ovsdb_create(ovsdb_schema_clone(schema), NULL)); + } /* Force update to schema in _Server database. 
*/ struct db *dbp = shash_find_data(config->all_dbs, db->name); if (dbp) { dbp->row_uuid = UUID_ZERO; } + return NULL; } static struct ovsdb_error * OVS_WARN_UNUSED_RESULT @@ -600,23 +615,30 @@ parse_txn(struct server_config *config, struct db *db, const struct ovsdb_schema *schema, const struct json *txn_json, const struct uuid *txnid) { + struct ovsdb_error *error = NULL; + struct ovsdb_txn *txn = NULL; + if (schema) { - /* We're replacing the schema (and the data). Destroy the database - * (first grabbing its storage), then replace it with the new schema. - * The transaction must also include the replacement data. + /* We're replacing the schema (and the data). If transaction includes + * replacement data, destroy the database (first grabbing its storage), + * then replace it with the new schema. If not, it's a conversion + * without data specified. In this case, convert the current database + * to a new schema instead. * * Only clustered database schema changes and snapshot installs * go through this path. 
*/ - ovs_assert(txn_json); ovs_assert(ovsdb_storage_is_clustered(db->db->storage)); - struct ovsdb_error *error = ovsdb_schema_check_for_ephemeral_columns( - schema); + error = ovsdb_schema_check_for_ephemeral_columns(schema); + if (error) { + return error; + } + + error = update_schema(db->db, schema, txn_json == NULL, config); if (error) { return error; } - update_schema(db->db, schema, config); } if (txn_json) { @@ -624,24 +646,25 @@ parse_txn(struct server_config *config, struct db *db, return ovsdb_error(NULL, "%s: data without schema", db->filename); } - struct ovsdb_txn *txn; - struct ovsdb_error *error; - error = ovsdb_file_txn_from_json(db->db, txn_json, false, &txn); - if (!error) { - ovsdb_txn_set_txnid(txnid, txn); - log_and_free_error(ovsdb_txn_replay_commit(txn)); - } - if (!error && !uuid_is_zero(txnid)) { - db->db->prereq = *txnid; - } if (error) { ovsdb_storage_unread(db->db->storage); return error; } + } else if (schema) { + /* We just performed conversion without data. Transaction history + * was destroyed. Commit a dummy transaction to set the txnid. */ + txn = ovsdb_txn_create(db->db); } - return NULL; + if (txn) { + ovsdb_txn_set_txnid(txnid, txn); + error = ovsdb_txn_replay_commit(txn); + if (!error && !uuid_is_zero(txnid)) { + db->db->prereq = *txnid; + } + } + return error; } static void diff --git a/ovsdb/ovsdb-tool.c b/ovsdb/ovsdb-tool.c index ea2b75b4671..e265365322c 100644 --- a/ovsdb/ovsdb-tool.c +++ b/ovsdb/ovsdb-tool.c @@ -1006,7 +1006,8 @@ raft_header_to_standalone_log(const struct raft_header *h, } static void -raft_record_to_standalone_log(const struct raft_record *r, +raft_record_to_standalone_log(const char *db_file_name, + const struct raft_record *r, struct ovsdb_log *db_log_data) { if (r->type == RAFT_REC_ENTRY) { @@ -1024,15 +1025,30 @@ raft_record_to_standalone_log(const struct raft_record *r, if (schema_json->type != JSON_NULL) { /* This is a database conversion record. Reset the log and - * write the new schema. 
Data JSON should also be part of - * the conversion. */ + * write the new schema. */ struct ovsdb_schema *schema; + check_ovsdb_error(ovsdb_schema_from_json(schema_json, &schema)); + if (data_json->type == JSON_NULL) { - ovs_fatal( - 0, "Invalid database conversion in the log: no data"); + /* We have a conversion request with no data. There is no + * other way as to read back what we have and convert. */ + struct ovsdb *old_db, *new_db; + + check_ovsdb_error(ovsdb_log_commit_block(db_log_data)); + + old_db = ovsdb_file_read(db_file_name, false); + check_ovsdb_error(ovsdb_convert(old_db, schema, &new_db)); + ovsdb_destroy(old_db); + + pa->elems[1] = ovsdb_to_txn_json( + new_db, "converted by ovsdb-tool", true); + ovsdb_destroy(new_db); + + json_destroy(data_json); + data_json = pa->elems[1]; } - check_ovsdb_error(ovsdb_schema_from_json(schema_json, &schema)); + ovsdb_schema_destroy(schema); check_ovsdb_error(ovsdb_log_reset(db_log_data)); check_ovsdb_error(ovsdb_log_write(db_log_data, schema_json)); @@ -1654,7 +1670,8 @@ do_compare_versions(struct ovs_cmdl_context *ctx) } static void -do_convert_to_standalone(struct ovsdb_log *log, struct ovsdb_log *db_log_data) +do_convert_to_standalone(const char *db_file_name, + struct ovsdb_log *log, struct ovsdb_log *db_log_data) { for (unsigned int i = 0; ; i++) { struct json *json; @@ -1671,7 +1688,7 @@ do_convert_to_standalone(struct ovsdb_log *log, struct ovsdb_log *db_log_data) } else { struct raft_record r; check_ovsdb_error(raft_record_from_json(&r, json)); - raft_record_to_standalone_log(&r, db_log_data); + raft_record_to_standalone_log(db_file_name, &r, db_log_data); raft_record_uninit(&r); } json_destroy(json); @@ -1694,7 +1711,7 @@ do_cluster_standalone(struct ovs_cmdl_context *ctx) if (strcmp(ovsdb_log_get_magic(log), RAFT_MAGIC) != 0) { ovs_fatal(0, "Database is not clustered db.\n"); } - do_convert_to_standalone(log, db_log_data); + do_convert_to_standalone(db_file_name, log, db_log_data); 
check_ovsdb_error(ovsdb_log_commit_block(db_log_data)); ovsdb_log_close(db_log_data); ovsdb_log_close(log); diff --git a/ovsdb/relay.c b/ovsdb/relay.c index 9ff6ed8f393..94ffe01e54e 100644 --- a/ovsdb/relay.c +++ b/ovsdb/relay.c @@ -301,6 +301,8 @@ static void ovsdb_relay_parse_update(struct relay_ctx *ctx, const struct ovsdb_cs_update_event *update) { + struct ovsdb_error *error = NULL; + if (!ctx->db) { return; } @@ -308,15 +310,27 @@ ovsdb_relay_parse_update(struct relay_ctx *ctx, if (update->monitor_reply && ctx->new_schema) { /* There was a schema change. Updating a database with a new schema * before processing monitor reply with the new data. */ - ctx->schema_change_cb(ctx->db, ctx->new_schema, - ctx->schema_change_aux); + error = ctx->schema_change_cb(ctx->db, ctx->new_schema, false, + ctx->schema_change_aux); + if (error) { + /* Should never happen, but handle this case anyway. */ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + char *s = ovsdb_error_to_string_free(error); + + VLOG_ERR_RL(&rl, "%s", s); + free(s); + + ovsdb_cs_flag_inconsistency(ctx->cs); + return; + } ovsdb_schema_destroy(ctx->new_schema); ctx->new_schema = NULL; } struct ovsdb_cs_db_update *du; - struct ovsdb_error *error = ovsdb_cs_parse_db_update(update->table_updates, - update->version, &du); + + error = ovsdb_cs_parse_db_update(update->table_updates, + update->version, &du); if (!error) { if (update->clear) { error = ovsdb_relay_clear(ctx->db); diff --git a/ovsdb/relay.h b/ovsdb/relay.h index 390ea70c827..2d66b5e5fa8 100644 --- a/ovsdb/relay.h +++ b/ovsdb/relay.h @@ -23,8 +23,11 @@ struct json; struct ovsdb; struct ovsdb_schema; -typedef void (*schema_change_callback)(struct ovsdb *, - const struct ovsdb_schema *, void *aux); +typedef struct ovsdb_error *(*schema_change_callback)( + struct ovsdb *, + const struct ovsdb_schema *, + bool conversion_with_no_data, + void *aux); void ovsdb_relay_add_db(struct ovsdb *, const char *remote, schema_change_callback 
schema_change_cb, From 08449bb470c9f99f8bdc13a14d8ef91bd0b4be4f Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 27 Mar 2023 21:42:59 +0200 Subject: [PATCH 227/833] ovsdb: Perform conversion with no data for clustered databases. Currently, database schema conversion in case of clustered database produces a transaction record with both new schema and converted database data. So, the sequence of events is following: 1. Get the new schema. 2. Convert the database to a new schema. 3. Translate the newly converted database into JSON. 4. Write the schema + data JSON to the storage. 5. Destroy converted version of a database. 6. Read schema + data JSON from the storage and parse. 7. Create a new database from a parsed database data. 8. Replace current database with the new one. Most of these steps are very computationally expensive. Also, conversion to/from JSON is much more expensive than direct database conversion with ovsdb_convert() that can make use of shallow data copies. Instead of doing all that, let's make use of previously introduced ability to not write the converted data into the storage. The process will look like this then: 1. Get the new schema. 2. Convert the database to a new schema (to verify that it is possible). 3. Write the schema to the storage. 4. Destroy converted version of a database. 5. Read the new schema from the storage and parse. 6. Convert the database to a new schema. 7. Replace current database with the new one. Most of the operations here are performed on the small schema object, instead of the actual database data. Two remaining data operations (actual conversion) are noticeably faster than conversion to/from JSON due to reference counting and shallow data copies. Steps 4-6 can be optimized later to not convert twice on the process that initiates the conversion. 
The change results in following performance improvements in conversion of OVN_Southbound database schema from version 20.23.0 to 20.27.0 (measured on a single-server RAFT cluster with no clients): | Before | After +---------+-------------------+---------+------------------ DB size | Total | Max poll interval | Total | Max poll interval --------+---------+-------------------+---------+------------------ 542 MB | 47 sec. | 26 sec. | 15 sec. | 10 sec. 225 MB | 19 sec. | 10 sec. | 6 sec. | 4.5 sec. 542 MB database had 19.5 M atoms, 225 MB database had 7.5 M atoms. Overall performance improvement is about 3x. Also, note that before this change database conversion basically doubles the database file on disk. Now it only writes a small schema JSON. Since the change requires backward-incompatible database file format changes, documentation is updated on how to perform an upgrade. Handled the same way as we did for the previous incompatible format change in 2.15 (column diffs). Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2022-December/052140.html Reviewed-by: Simon Horman Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- Documentation/ref/ovsdb.7.rst | 63 +++++++++++++++++++++++++++++++++++ NEWS | 10 ++++++ ovsdb/ovsdb-server.c | 7 ++++ ovsdb/ovsdb.c | 34 +++++++++++++++++++ ovsdb/ovsdb.h | 3 ++ ovsdb/trigger.c | 11 ++++-- 6 files changed, 125 insertions(+), 3 deletions(-) diff --git a/Documentation/ref/ovsdb.7.rst b/Documentation/ref/ovsdb.7.rst index 980ba29e760..84b153d2424 100644 --- a/Documentation/ref/ovsdb.7.rst +++ b/Documentation/ref/ovsdb.7.rst @@ -213,6 +213,12 @@ Open vSwitch 2.6 introduced support for the active-backup service model. `Upgrading from version 2.14 and earlier to 2.15 and later`_ and `Downgrading from version 2.15 and later to 2.14 and earlier`_. + Another change happened in version 3.2. 
To upgrade/downgrade the + ``ovsdb-server`` processes across this version follow the instructions + described under + `Upgrading from version 3.1 and earlier to 3.2 and later`_ and + `Downgrading from version 3.2 and later to 3.1 and earlier`_. + Clustered Database Service Model -------------------------------- @@ -287,6 +293,12 @@ schema, which is covered later under `Upgrading or Downgrading a Database`_.) `Upgrading from version 2.14 and earlier to 2.15 and later`_ and `Downgrading from version 2.15 and later to 2.14 and earlier`_. + Another change happened in version 3.2. To upgrade/downgrade the + ``ovsdb-server`` processes across this version follow the instructions + described under + `Upgrading from version 3.1 and earlier to 3.2 and later`_ and + `Downgrading from version 3.2 and later to 3.1 and earlier`_. + Clustered OVSDB does not support the OVSDB "ephemeral columns" feature. ``ovsdb-tool`` and ``ovsdb-client`` change ephemeral columns into persistent ones when they work with schemas for clustered databases. Future versions of @@ -341,6 +353,57 @@ For all service models it's required to: 3. Downgrade and restart ``ovsdb-server`` processes. +Upgrading from version 3.1 and earlier to 3.2 and later +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +There is another change of a database file format in version 3.2 that doesn't +allow older versions of ``ovsdb-server`` to read the database file modified by +the ``ovsdb-server`` version 3.2 or later. This also affects runtime +communications between servers in **cluster** service models. To upgrade the +``ovsdb-server`` processes from one version of Open vSwitch (3.1 or earlier) to +another (3.2 or higher) instructions below should be followed. (This is +different from upgrading a database schema, which is covered later under +`Upgrading or Downgrading a Database`_.) + +In case of **standalone** or **active-backup** service model no special +handling during upgrade is required. 
+ +For the **cluster** service model recommended upgrade strategy is following: + +1. Upgrade processes one at a time. Each ``ovsdb-server`` process after + upgrade should be started with ``--disable-file-no-data-conversion`` command + line argument. + +2. When all ``ovsdb-server`` processes upgraded, use ``ovs-appctl`` to invoke + ``ovsdb/file/no-data-conversion-enable`` command on each of them or restart + all ``ovsdb-server`` processes one at a time without + ``--disable-file-no-data-conversion`` command line option. + +Downgrading from version 3.2 and later to 3.1 and earlier +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Similar to upgrading covered under `Upgrading from version 3.1 and earlier to +3.2 and later`_, downgrading from the ``ovsdb-server`` version 3.2 and later +to 3.1 and earlier requires additional steps. (This is different from +upgrading a database schema, which is covered later under +`Upgrading or Downgrading a Database`_.) + +For all service models it's required to: + +1. Compact all database files via ``ovsdb-server/compact`` command with + ``ovs-appctl`` utility. This should be done for each involved + ``ovsdb-server`` process separately (single process for **standalone** + service model, all involved processes for **active-backup** and **cluster** + service models). + +2. Stop all ``ovsdb-server`` processes. Make sure that no database schema + conversion operations were performed between steps 1 and 2. For + **standalone** and **active-backup** service models, the database compaction + can be performed after stopping all the processes instead with the + ``ovsdb-tool compact`` command. + +3. Downgrade and restart ``ovsdb-server`` processes. 
+ Understanding Cluster Consistency ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/NEWS b/NEWS index 1155bfbb146..cfd4666630d 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,15 @@ Post-v3.1.0 -------------------- + - OVSDB: + * Changed format in which ovsdb schema conversion operations are stored in + clustered database files. Such operations are now allowed to contain + the bare schema (without data). This allows to significantly improve + the schema conversion performance. + New ovsdb-server process will be able to read old database format, but + old processes will *fail* to read database created by the new one, if + conversion operation is present. For the cluster service model follow + upgrade instructions in 'Upgrading from version 3.1 and earlier to 3.2 + and later' section of ovsdb(7). - IPFIX template and statistics intervals can now be configured through two new options in the IPFIX table: 'template_interval' and 'stats_interval'. - Linux kernel datapath: diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c index 91c284e99c9..b6481407628 100644 --- a/ovsdb/ovsdb-server.c +++ b/ovsdb/ovsdb-server.c @@ -1971,6 +1971,7 @@ parse_options(int argc, char *argv[], OPT_ACTIVE, OPT_NO_DBS, OPT_FILE_COLUMN_DIFF, + OPT_FILE_NO_DATA_CONVERSION, VLOG_OPTION_ENUMS, DAEMON_OPTION_ENUMS, SSL_OPTION_ENUMS, @@ -1996,6 +1997,8 @@ parse_options(int argc, char *argv[], {"active", no_argument, NULL, OPT_ACTIVE}, {"no-dbs", no_argument, NULL, OPT_NO_DBS}, {"disable-file-column-diff", no_argument, NULL, OPT_FILE_COLUMN_DIFF}, + {"disable-file-no-data-conversion", no_argument, NULL, + OPT_FILE_NO_DATA_CONVERSION}, {NULL, 0, NULL, 0}, }; char *short_options = ovs_cmdl_long_options_to_short_options(long_options); @@ -2092,6 +2095,10 @@ parse_options(int argc, char *argv[], ovsdb_file_column_diff_disable(); break; + case OPT_FILE_NO_DATA_CONVERSION: + ovsdb_no_data_conversion_disable(); + break; + case '?': exit(EXIT_FAILURE); diff --git a/ovsdb/ovsdb.c b/ovsdb/ovsdb.c index 
afec96264ca..f67b836d736 100644 --- a/ovsdb/ovsdb.c +++ b/ovsdb/ovsdb.c @@ -39,6 +39,7 @@ #include "transaction.h" #include "transaction-forward.h" #include "trigger.h" +#include "unixctl.h" #include "openvswitch/vlog.h" VLOG_DEFINE_THIS_MODULE(ovsdb); @@ -177,6 +178,39 @@ ovsdb_is_valid_version(const char *s) return ovsdb_parse_version(s, &version); } +/* If set to 'true', database schema conversion operations in the storage + * may not contain the converted data, only the schema. Currently affects + * only the clustered storage. */ +static bool use_no_data_conversion = true; + +static void +ovsdb_no_data_conversion_enable(struct unixctl_conn *conn, int argc OVS_UNUSED, + const char *argv[] OVS_UNUSED, + void *arg OVS_UNUSED) +{ + use_no_data_conversion = true; + unixctl_command_reply(conn, NULL); +} + +void +ovsdb_no_data_conversion_disable(void) +{ + if (!use_no_data_conversion) { + return; + } + use_no_data_conversion = false; + unixctl_command_register("ovsdb/file/no-data-conversion-enable", "", + 0, 0, ovsdb_no_data_conversion_enable, NULL); +} + +/* Returns true if the database storage allows conversion records without + * data specified. */ +bool +ovsdb_conversion_with_no_data_supported(const struct ovsdb *db) +{ + return use_no_data_conversion && ovsdb_storage_is_clustered(db->storage); +} + /* Returns the number of tables in 'schema''s root set. 
*/ static size_t root_set_size(const struct ovsdb_schema *schema) diff --git a/ovsdb/ovsdb.h b/ovsdb/ovsdb.h index 13d8bf407be..d45630e8f0f 100644 --- a/ovsdb/ovsdb.h +++ b/ovsdb/ovsdb.h @@ -132,6 +132,9 @@ extern size_t n_weak_refs; struct ovsdb *ovsdb_create(struct ovsdb_schema *, struct ovsdb_storage *); void ovsdb_destroy(struct ovsdb *); +void ovsdb_no_data_conversion_disable(void); +bool ovsdb_conversion_with_no_data_supported(const struct ovsdb *); + void ovsdb_get_memory_usage(const struct ovsdb *, struct simap *usage); struct ovsdb_table *ovsdb_get_table(const struct ovsdb *, const char *); diff --git a/ovsdb/trigger.c b/ovsdb/trigger.c index 3c93ae580f3..0706d66cc9d 100644 --- a/ovsdb/trigger.c +++ b/ovsdb/trigger.c @@ -280,9 +280,14 @@ ovsdb_trigger_try(struct ovsdb_trigger *t, long long int now) return false; } - /* Make the new copy into a transaction log record. */ - struct json *txn_json = ovsdb_to_txn_json( - newdb, "converted by ovsdb-server", true); + struct json *txn_json; + if (ovsdb_conversion_with_no_data_supported(t->db)) { + txn_json = json_null_create(); + } else { + /* Make the new copy into a transaction log record. */ + txn_json = ovsdb_to_txn_json( + newdb, "converted by ovsdb-server", true); + } /* Propose the change. */ t->progress = ovsdb_txn_propose_schema_change( From 172c935ed9a8332004fcc15fbf4ab43c9f5fe043 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 27 Mar 2023 21:43:00 +0200 Subject: [PATCH 228/833] ovsdb: Avoid converting database twice on an initiator. Cluster member, that initiates the schema conversion, converts the database twice. First time while verifying the possibility of the conversion, and the second time after reading conversion request back from the storage. Keep the converted database from the first time around and use it after reading the request back from the storage. This cuts in half the conversion CPU cost. 
Reviewed-by: Simon Horman Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- ovsdb/ovsdb-server.c | 22 +++++++++++++++------- ovsdb/relay.c | 4 ++-- ovsdb/relay.h | 2 ++ ovsdb/transaction.c | 6 +++--- ovsdb/transaction.h | 3 ++- ovsdb/trigger.c | 41 ++++++++++++++++++++++++++++++++++------- ovsdb/trigger.h | 7 +++++++ 7 files changed, 65 insertions(+), 20 deletions(-) diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c index b6481407628..9bad0c8ddf2 100644 --- a/ovsdb/ovsdb-server.c +++ b/ovsdb/ovsdb-server.c @@ -574,7 +574,9 @@ close_db(struct server_config *config, struct db *db, char *comment) } static struct ovsdb_error * OVS_WARN_UNUSED_RESULT -update_schema(struct ovsdb *db, const struct ovsdb_schema *schema, +update_schema(struct ovsdb *db, + const struct ovsdb_schema *schema, + const struct uuid *txnid, bool conversion_with_no_data, void *aux) { struct server_config *config = aux; @@ -591,11 +593,17 @@ update_schema(struct ovsdb *db, const struct ovsdb_schema *schema, struct ovsdb *new_db = NULL; struct ovsdb_error *error; - error = ovsdb_convert(db, schema, &new_db); - if (error) { - /* Should never happen, because conversion should have been - * checked before writing the schema to the storage. */ - return error; + /* If conversion was triggered by the current process, we might + * already have converted version of a database. */ + new_db = ovsdb_trigger_find_and_steal_converted_db(db, txnid); + if (!new_db) { + /* No luck. Converting. */ + error = ovsdb_convert(db, schema, &new_db); + if (error) { + /* Should never happen, because conversion should have been + * checked before writing the schema to the storage. 
*/ + return error; + } } ovsdb_replace(db, new_db); } else { @@ -635,7 +643,7 @@ parse_txn(struct server_config *config, struct db *db, return error; } - error = update_schema(db->db, schema, txn_json == NULL, config); + error = update_schema(db->db, schema, txnid, txn_json == NULL, config); if (error) { return error; } diff --git a/ovsdb/relay.c b/ovsdb/relay.c index 94ffe01e54e..377f3285f61 100644 --- a/ovsdb/relay.c +++ b/ovsdb/relay.c @@ -310,8 +310,8 @@ ovsdb_relay_parse_update(struct relay_ctx *ctx, if (update->monitor_reply && ctx->new_schema) { /* There was a schema change. Updating a database with a new schema * before processing monitor reply with the new data. */ - error = ctx->schema_change_cb(ctx->db, ctx->new_schema, false, - ctx->schema_change_aux); + error = ctx->schema_change_cb(ctx->db, ctx->new_schema, &UUID_ZERO, + false, ctx->schema_change_aux); if (error) { /* Should never happen, but handle this case anyway. */ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); diff --git a/ovsdb/relay.h b/ovsdb/relay.h index 2d66b5e5fa8..f841554ca9e 100644 --- a/ovsdb/relay.h +++ b/ovsdb/relay.h @@ -22,10 +22,12 @@ struct json; struct ovsdb; struct ovsdb_schema; +struct uuid; typedef struct ovsdb_error *(*schema_change_callback)( struct ovsdb *, const struct ovsdb_schema *, + const struct uuid *, bool conversion_with_no_data, void *aux); diff --git a/ovsdb/transaction.c b/ovsdb/transaction.c index f01de2a34fd..7cf4a851aac 100644 --- a/ovsdb/transaction.c +++ b/ovsdb/transaction.c @@ -1252,14 +1252,14 @@ ovsdb_txn_precheck_prereq(const struct ovsdb *db) struct ovsdb_txn_progress * ovsdb_txn_propose_schema_change(struct ovsdb *db, const struct ovsdb_schema *schema, - const struct json *data) + const struct json *data, + struct uuid *txnid) { struct ovsdb_txn_progress *progress = xzalloc(sizeof *progress); progress->storage = db->storage; - struct uuid next; struct ovsdb_write *write = ovsdb_storage_write_schema_change( - db->storage, schema, data, 
&db->prereq, &next); + db->storage, schema, data, &db->prereq, txnid); if (!ovsdb_write_is_complete(write)) { progress->write = write; } else { diff --git a/ovsdb/transaction.h b/ovsdb/transaction.h index 9991f34d24c..0e054eef3bd 100644 --- a/ovsdb/transaction.h +++ b/ovsdb/transaction.h @@ -42,7 +42,8 @@ struct ovsdb_error *ovsdb_txn_propose_commit_block(struct ovsdb_txn *, void ovsdb_txn_complete(struct ovsdb_txn *); struct ovsdb_txn_progress *ovsdb_txn_propose_schema_change( - struct ovsdb *, const struct ovsdb_schema *, const struct json *data); + struct ovsdb *, const struct ovsdb_schema *, + const struct json *data, struct uuid *txnid); bool ovsdb_txn_progress_is_complete(const struct ovsdb_txn_progress *); const struct ovsdb_error *ovsdb_txn_progress_get_error( diff --git a/ovsdb/trigger.c b/ovsdb/trigger.c index 0706d66cc9d..0edcdd89c64 100644 --- a/ovsdb/trigger.c +++ b/ovsdb/trigger.c @@ -31,6 +31,7 @@ #include "transaction-forward.h" #include "openvswitch/vlog.h" #include "util.h" +#include "uuid.h" VLOG_DEFINE_THIS_MODULE(trigger); @@ -52,6 +53,7 @@ ovsdb_trigger_init(struct ovsdb_session *session, struct ovsdb *db, trigger->db = db; ovs_list_push_back(&trigger->db->triggers, &trigger->node); trigger->request = request; + trigger->converted_db = NULL; trigger->reply = NULL; trigger->progress = NULL; trigger->txn_forward = NULL; @@ -69,6 +71,7 @@ ovsdb_trigger_destroy(struct ovsdb_trigger *trigger) ovsdb_txn_progress_destroy(trigger->progress); ovsdb_txn_forward_destroy(trigger->db, trigger->txn_forward); ovs_list_remove(&trigger->node); + ovsdb_destroy(trigger->converted_db); jsonrpc_msg_destroy(trigger->request); jsonrpc_msg_destroy(trigger->reply); free(trigger->role); @@ -143,6 +146,30 @@ ovsdb_trigger_prereplace_db(struct ovsdb_trigger *trigger) } } +/* Find among incomplete triggers one that caused database conversion + * with specified transaction ID. 
*/ +struct ovsdb * +ovsdb_trigger_find_and_steal_converted_db(const struct ovsdb *db, + const struct uuid *txnid) +{ + struct ovsdb *converted_db = NULL; + struct ovsdb_trigger *t; + + if (uuid_is_zero(txnid)) { + return NULL; + } + + LIST_FOR_EACH_SAFE (t, node, &db->triggers) { + if (t->db == db && t->converted_db + && uuid_equals(&t->conversion_txnid, txnid)) { + converted_db = t->converted_db; + t->converted_db = NULL; + break; + } + } + return converted_db; +} + bool ovsdb_trigger_run(struct ovsdb *db, long long int now) { @@ -200,7 +227,6 @@ ovsdb_trigger_try(struct ovsdb_trigger *t, long long int now) ovs_assert(!t->progress); struct ovsdb_txn *txn = NULL; - struct ovsdb *newdb = NULL; if (!strcmp(t->request->method, "transact")) { if (!ovsdb_txn_precheck_prereq(t->db)) { return false; @@ -272,7 +298,8 @@ ovsdb_trigger_try(struct ovsdb_trigger *t, long long int now) new_schema->name, t->db->schema->name); } if (!error) { - error = ovsdb_convert(t->db, new_schema, &newdb); + ovsdb_destroy(t->converted_db); + error = ovsdb_convert(t->db, new_schema, &t->converted_db); } if (error) { ovsdb_schema_destroy(new_schema); @@ -286,12 +313,12 @@ ovsdb_trigger_try(struct ovsdb_trigger *t, long long int now) } else { /* Make the new copy into a transaction log record. */ txn_json = ovsdb_to_txn_json( - newdb, "converted by ovsdb-server", true); + t->converted_db, "converted by ovsdb-server", true); } /* Propose the change. 
*/ t->progress = ovsdb_txn_propose_schema_change( - t->db, new_schema, txn_json); + t->db, new_schema, txn_json, &t->conversion_txnid); ovsdb_schema_destroy(new_schema); json_destroy(txn_json); t->reply = jsonrpc_create_reply(json_object_create(), @@ -313,13 +340,13 @@ ovsdb_trigger_try(struct ovsdb_trigger *t, long long int now) ovsdb_txn_progress_destroy(t->progress); t->progress = NULL; ovsdb_trigger_complete(t); - if (newdb) { - ovsdb_replace(t->db, newdb); + if (t->converted_db) { + ovsdb_replace(t->db, t->converted_db); + t->converted_db = NULL; return true; } return false; } - ovsdb_destroy(newdb); /* Fall through to the general handling for the "committing" state. We * abort the transaction--if and when it eventually commits, we'll read diff --git a/ovsdb/trigger.h b/ovsdb/trigger.h index d060c72e5c7..87ff4d0531b 100644 --- a/ovsdb/trigger.h +++ b/ovsdb/trigger.h @@ -17,6 +17,7 @@ #define OVSDB_TRIGGER_H 1 #include "openvswitch/list.h" +#include "openvswitch/uuid.h" struct ovsdb; @@ -54,6 +55,8 @@ struct ovsdb_trigger { struct ovs_list node; struct ovsdb_session *session; /* Session that owns this trigger. */ struct ovsdb *db; /* Database on which trigger acts. */ + struct ovsdb *converted_db; /* Result of the 'convert' request. */ + struct uuid conversion_txnid; /* txnid of the conversion request. */ struct jsonrpc_msg *request; /* Database request. */ struct jsonrpc_msg *reply; /* Result (null if none yet). 
*/ struct ovsdb_txn_progress *progress; @@ -77,6 +80,10 @@ void ovsdb_trigger_cancel(struct ovsdb_trigger *, const char *reason); void ovsdb_trigger_prereplace_db(struct ovsdb_trigger *); +struct ovsdb *ovsdb_trigger_find_and_steal_converted_db( + const struct ovsdb *, const struct uuid *) + OVS_WARN_UNUSED_RESULT; + bool ovsdb_trigger_run(struct ovsdb *, long long int now); void ovsdb_trigger_wait(struct ovsdb *, long long int now); From 07c27226ee96a3715126c50e1dbf6d8a1886b305 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 27 Mar 2023 21:43:01 +0200 Subject: [PATCH 229/833] ovsdb: Monitor: Keep and maintain the initial change set. Change sets in OVSDB monitor are storing all the changes that happened between a particular transaction ID and now. Initial change set basically contains all the data. On each monitor request a new initial change set is created by creating an empty change set and adding all the database rows. Then it is converted into JSON reply and immediately untracked and destroyed. This is causing significant performance issues if many clients are requesting new monitors at the same time. For example, that is happening after database schema conversion, because conversion triggers cancellation of all monitors. After cancellation, every client sends a new monitor request. The server then creates a new initial change set, sends a reply, destroys initial change set and repeats that for each client. On a system with 200 MB database and 500 clients, cluster of 3 servers spends 20 minutes replying to all the clients (200 MB x 500 = 100 GB): timeval|WARN|Unreasonably long 1201525ms poll interval Of course, all the clients are already disconnected due to inactivity at this point. When they are re-connecting back, server accepts new connections one at a time, so inactivity probes will not be triggered anymore, but it still takes another 20 minutes to handle all the incoming connections. 
Let's keep the initial change set around for as long as the monitor itself exists. This will allow us to not construct a new change set on each new monitor request and even utilize the JSON cache in some cases. All that at a relatively small maintenance cost, since we'll need to commit changes to one extra change set on every transaction. Measured memory usage increase due to keeping around a shallow copy of a database is about 10%. Measured CPU usage difference during normal operation is negligible. With this change it takes only 30 seconds to send out all the monitor replies in the example above. So, it's a 40x performance improvement. On a more reasonable setup with 250 nodes, the process takes up to 8-10 seconds instead of 4-5 minutes. Conditional monitoring will benefit from this change as well, however results might be less impressive due to lack of JSON cache. Reviewed-by: Simon Horman Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- ovsdb/monitor.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ovsdb/monitor.c b/ovsdb/monitor.c index 191befcae3b..3cdd03b20fa 100644 --- a/ovsdb/monitor.c +++ b/ovsdb/monitor.c @@ -609,7 +609,10 @@ ovsdb_monitor_untrack_change_set(struct ovsdb_monitor *dbmon, ovs_assert(mcs); if (--mcs->n_refs == 0) { if (mcs == dbmon->init_change_set) { - dbmon->init_change_set = NULL; + /* The initial change set should exist as long as the + * monitor itself. */ + mcs->n_refs++; + return; } else if (mcs == dbmon->new_change_set) { dbmon->new_change_set = NULL; } From 70ba6e97dbd75d5c255a339568e6ff56ed8f67fc Mon Sep 17 00:00:00 2001 From: Faicker Mo Date: Fri, 7 Apr 2023 14:30:22 +0800 Subject: [PATCH 230/833] learning-switch: Fix coredump of OpenFlow15 learning-switch. The OpenFlow15 Packet-Out message contains the match instead of the in_port. The flow.tunnel.metadata.tab is not inited but used in the loop of tun_metadata_to_nx_match. 
The coredump gdb backtrace is: 0 memcpy_from_metadata (dst=0x2f060, src=0x30880, loc=0x10) at lib/tun-metadata.c:467 1 metadata_loc_from_match_read (match=0x30598, is_masked=<..>, mask=0x30838, idx=0, map=0x0) at lib/tun-metadata.c:865 2 metadata_loc_from_match_read (is_masked=<...>, mask=0x30838, idx=0, match=0x30598, map=0x0) at lib/tun-metadata.c:854 3 tun_metadata_to_nx_match (b=0x892260, oxm=OFP15_VERSION, match=0x30598) at lib/tun-metadata.c:888 4 nx_put_raw (b=0x892260, oxm=OFP15_VERSION, match=0x30598, cookie=<...>, cookie=0, cookie_mask=<...>, cookie_mask=0) at lib/nx-match.c:1186 5 oxm_put_match (b=0x892260, match=0x30598, version=OFP15_VERSION) at lib/nx-match.c:1343 6 ofputil_encode_packet_out (po=0x30580, protocol=<...>) at lib/ofp-packet.c:1226 7 process_packet_in (sw=0x891d70, oh=<...>) at lib/learning-switch.c:619 8 lswitch_process_packet (msg=0x892210, sw=0x891d70) at lib/learning-switch.c:374 9 lswitch_run (sw=0x891d70) at lib/learning-switch.c:324 10 main (argc=<...>, argv=<...>) at utilities/ovs-testcontroller.c:180 Fix that by initing the flow metadata. Fixes: 35eb6326d5d0 ("ofp-util: Add flow metadata to ofputil_packet_out") Signed-off-by: Faicker Mo Reviewed-by: Simon Horman Signed-off-by: Ilya Maximets --- lib/learning-switch.c | 1 + tests/automake.mk | 3 ++- tests/learning-switch.at | 23 +++++++++++++++++++++++ tests/testsuite.at | 1 + 4 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 tests/learning-switch.at diff --git a/lib/learning-switch.c b/lib/learning-switch.c index 8102475cae5..cdf42935c1d 100644 --- a/lib/learning-switch.c +++ b/lib/learning-switch.c @@ -569,6 +569,7 @@ process_packet_in(struct lswitch *sw, const struct ofp_header *oh) } /* Prepare packet_out in case we need one. 
*/ + match_init_catchall(&po.flow_metadata); po.buffer_id = buffer_id; if (buffer_id == UINT32_MAX) { po.packet = dp_packet_data(&pkt); diff --git a/tests/automake.mk b/tests/automake.mk index 86e496a5b9f..720c944496b 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -110,7 +110,8 @@ TESTSUITE_AT = \ tests/mcast-snooping.at \ tests/packet-type-aware.at \ tests/nsh.at \ - tests/drop-stats.at + tests/drop-stats.at \ + tests/learning-switch.at EXTRA_DIST += $(FUZZ_REGRESSION_TESTS) FUZZ_REGRESSION_TESTS = \ diff --git a/tests/learning-switch.at b/tests/learning-switch.at new file mode 100644 index 00000000000..ac2fc1b8017 --- /dev/null +++ b/tests/learning-switch.at @@ -0,0 +1,23 @@ +AT_BANNER([learning switch]) + +### ----------------------------------------------------------------- +### learning switch OpenFlow15 test case +### ----------------------------------------------------------------- + +AT_SETUP([learning switch - OpenFlow15]) +dnl Start ovs-testcontroller +AT_CHECK([ovs-testcontroller --no-chdir --detach punix:controller --pidfile -v ptcp:], [0], [ignore]) +dnl Start ovs +OVS_VSWITCHD_START([dnl + set bridge br0 datapath_type=dummy \ + protocols=OpenFlow15 -- \ + add-port br0 p1 -- set Interface p1 type=dummy ofport_request=1 -- \ + set-controller br0 tcp:127.0.0.1:6653]) +AT_CHECK([ + ovs-appctl netdev-dummy/receive p1 1e2ce92a669e3a6dd2099cab0800450000548a53400040011addc0a80a0ac0a80a1e08006f200a4d0001fc509a58000000002715020000000000101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f3031323334353637 +], [0], [ignore]) +AT_CHECK([kill `cat ovs-testcontroller.pid`]) + +OVS_WAIT_UNTIL([! 
test -e controller]) +OVS_VSWITCHD_STOP(["/cannot find route for controller/d"]) +AT_CLEANUP diff --git a/tests/testsuite.at b/tests/testsuite.at index cf4e3eadfb5..9d77a9f512e 100644 --- a/tests/testsuite.at +++ b/tests/testsuite.at @@ -77,3 +77,4 @@ m4_include([tests/packet-type-aware.at]) m4_include([tests/nsh.at]) m4_include([tests/drop-stats.at]) m4_include([tests/pytest.at]) +m4_include([tests/learning-switch.at]) From 36c8c101cdcd668afefee94f3c0f62ef0bc6d286 Mon Sep 17 00:00:00 2001 From: Nobuhiro MIKI Date: Thu, 13 Apr 2023 14:55:49 +0900 Subject: [PATCH 231/833] doc: Fix the list of supported tunnels in README. Without distinguishing between IPv4 and IPv6, such as GRE and GRE-IPv6, nine types of tunneling are currently supported. Signed-off-by: Nobuhiro MIKI Signed-off-by: Ilya Maximets --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index a60a314feb3..e6c0d3d3061 100644 --- a/README.rst +++ b/README.rst @@ -35,7 +35,7 @@ following features: - NIC bonding with or without LACP on upstream switch - NetFlow, sFlow(R), and mirroring for increased visibility - QoS (Quality of Service) configuration, plus policing -- Geneve, GRE, VXLAN, STT, and LISP tunneling +- Geneve, GRE, VXLAN, STT, ERSPAN, GTP-U, SRv6, Bareudp, and LISP tunneling - 802.1ag connectivity fault management - OpenFlow 1.0 plus numerous extensions - Transactional configuration database with C and Python bindings From 3fa0fc5824324c11d78bf961648bb200da31d7bd Mon Sep 17 00:00:00 2001 From: Songtao Zhan Date: Wed, 19 Apr 2023 09:38:52 +0800 Subject: [PATCH 232/833] util: Fix an issue that thread name cannot be set. The name of the current thread consists of a name with a maximum length of 16 bytes and a thread ID. The final name may be longer than 16 bytes. 
If the name is longer than 16 bytes, the thread name will fail to be set. Acked-by: Eelco Chaudron Signed-off-by: Songtao Zhan Signed-off-by: Ilya Maximets --- lib/util.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/util.c b/lib/util.c index 96a71550d91..3fb3a4b40fd 100644 --- a/lib/util.c +++ b/lib/util.c @@ -645,6 +645,12 @@ set_subprogram_name(const char *subprogram_name) free(subprogram_name_set(pname)); #if HAVE_GLIBC_PTHREAD_SETNAME_NP + /* The maximum supported thread name including '\0' is 16. + * Add '>' at 0th position to highlight that the name was truncated. */ + if (strlen(pname) > 15) { + memmove(pname, &pname[strlen(pname) - 15], 15 + 1); + pname[0] = '>'; + } pthread_setname_np(pthread_self(), pname); #elif HAVE_NETBSD_PTHREAD_SETNAME_NP pthread_setname_np(pthread_self(), "%s", pname); From 8d59ab31d2a74003a3f2b83d67e2ba78e1a1225d Mon Sep 17 00:00:00 2001 From: Yunjian Wang Date: Fri, 21 Apr 2023 16:27:10 +0800 Subject: [PATCH 233/833] ofp-parse: Check ranges on string to uint32_t conversion. An unnecessary overflow would occur when the 'value' is larger than 4294967295. So it's required to check ranges to avoid uint32_t overflow. 
Reported-by: Nan Zhou Acked-by: Eelco Chaudron Reviewed-by: Simon Horman Signed-off-by: Yunjian Wang Signed-off-by: Ilya Maximets --- lib/ofp-parse.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/lib/ofp-parse.c b/lib/ofp-parse.c index a90b926efb5..102b183a8fd 100644 --- a/lib/ofp-parse.c +++ b/lib/ofp-parse.c @@ -71,16 +71,13 @@ str_to_u16(const char *str, const char *name, uint16_t *valuep) char * OVS_WARN_UNUSED_RESULT str_to_u32(const char *str, uint32_t *valuep) { - char *tail; - uint32_t value; + unsigned long long value; if (!str[0]) { return xstrdup("missing required numeric argument"); } - errno = 0; - value = strtoul(str, &tail, 0); - if (errno == EINVAL || errno == ERANGE || *tail) { + if (!str_to_ullong(str, 0, &value) || value > UINT32_MAX) { return xasprintf("invalid numeric format %s", str); } *valuep = value; From c3559dffcb0436a3ab56bee35d4823c349cfbbc6 Mon Sep 17 00:00:00 2001 From: Yunjian Wang Date: Mon, 24 Apr 2023 19:54:58 +0800 Subject: [PATCH 234/833] dpif-netlink: Fix memory leak dpif_netlink_open(). In the specific call to dpif_netlink_dp_transact() (line 398) in dpif_netlink_open(), the 'dp' content is not being used in the branch when no error is returned (starting line 430). Furthermore, the 'dp' and 'buf' variables are overwritten later in this same branch when a new netlink request is sent (line 437), which results in a memory leak. Reported by Address Sanitizer. 
Indirect leak of 1024 byte(s) in 1 object(s) allocated from: 0 0x7fe09d3bfe70 in __interceptor_malloc (/usr/lib64/libasan.so.4+0xe0e70) 1 0x8759be in xmalloc__ lib/util.c:140 2 0x875a9a in xmalloc lib/util.c:175 3 0x7ba0d2 in ofpbuf_init lib/ofpbuf.c:141 4 0x7ba1d6 in ofpbuf_new lib/ofpbuf.c:169 5 0x9057f9 in nl_sock_transact lib/netlink-socket.c:1113 6 0x907a7e in nl_transact lib/netlink-socket.c:1817 7 0x8b5abe in dpif_netlink_dp_transact lib/dpif-netlink.c:5007 8 0x89a6b5 in dpif_netlink_open lib/dpif-netlink.c:398 9 0x5de16f in do_open lib/dpif.c:348 10 0x5de69a in dpif_open lib/dpif.c:393 11 0x5de71f in dpif_create_and_open lib/dpif.c:419 12 0x47b918 in open_dpif_backer ofproto/ofproto-dpif.c:764 13 0x483e4a in construct ofproto/ofproto-dpif.c:1658 14 0x441644 in ofproto_create ofproto/ofproto.c:556 15 0x40ba5a in bridge_reconfigure vswitchd/bridge.c:885 16 0x41f1a9 in bridge_run vswitchd/bridge.c:3313 17 0x42d4fb in main vswitchd/ovs-vswitchd.c:132 18 0x7fe09cc03c86 in __libc_start_main (/usr/lib64/libc.so.6+0x25c86) Fixes: b841e3cd4a28 ("dpif-netlink: Fix feature negotiation for older kernels.") Reviewed-by: David Marchand Reviewed-by: Simon Horman Signed-off-by: Yunjian Wang Signed-off-by: Ilya Maximets --- lib/dpif-netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c index de071127788..60bd39643c7 100644 --- a/lib/dpif-netlink.c +++ b/lib/dpif-netlink.c @@ -395,7 +395,7 @@ dpif_netlink_open(const struct dpif_class *class OVS_UNUSED, const char *name, dp_request.user_features |= OVS_DP_F_UNALIGNED; dp_request.user_features |= OVS_DP_F_VPORT_PIDS; dp_request.user_features |= OVS_DP_F_UNSUPPORTED; - error = dpif_netlink_dp_transact(&dp_request, &dp, &buf); + error = dpif_netlink_dp_transact(&dp_request, NULL, NULL); if (error) { /* The Open vSwitch kernel module has two modes for dispatching * upcalls: per-vport and per-cpu. 
From b456b1a02f629c2438ef2c3f247f35c8712f12c6 Mon Sep 17 00:00:00 2001 From: Stefan Hoffmann Date: Fri, 21 Apr 2023 10:29:36 +0200 Subject: [PATCH 235/833] python-stream: Handle SSL error in do_handshake. In some cases ovsdb server or relay gets restarted, ovsdb python clients may keep the local socket open. Instead of reconnecting a lot of failures will be logged. This can be reproduced with ssl connections to the server/relay and restarting it, so it has the same IP after restart. This patch catches the Exceptions at do_handshake to recreate the connection on the client side. Reviewed-by: Simon Horman Signed-off-by: Stefan Hoffmann Signed-off-by: Luca Czesla Signed-off-by: Max Lamprecht Co-authored-by: Luca Czesla Co-authored-by: Max Lamprecht Signed-off-by: Ilya Maximets --- python/ovs/stream.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/ovs/stream.py b/python/ovs/stream.py index ac5b0fd0c64..b32341076ca 100644 --- a/python/ovs/stream.py +++ b/python/ovs/stream.py @@ -824,7 +824,8 @@ def connect(self): self.socket.do_handshake() except ssl.SSLWantReadError: return errno.EAGAIN - except ssl.SSLSyscallError as e: + except (ssl.SSLSyscallError, ssl.SSLZeroReturnError, + ssl.SSLEOFError, OSError) as e: return ovs.socket_util.get_exception_errno(e) return 0 From 572e89f418e81f63053de7c284c4dcbf960ebaee Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 26 Apr 2023 14:54:41 +0200 Subject: [PATCH 236/833] AUTHORS: Add Stefan, Luca and Max. Also, slightly re-sort the list to fix the order. Signed-off-by: Ilya Maximets --- AUTHORS.rst | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index 0b5408a30d1..4dca731fc91 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -280,8 +280,9 @@ Lucas Alvares Gomes lucasagomes@gmail.com Lucian Petrut lpetrut@cloudbasesolutions.com Luigi Rizzo rizzo@iet.unipi.it Luis E. P. 
l31g@hotmail.com -Lukasz Rzasik lukasz.rzasik@gmail.com +Luca Czesla luca.czesla@mail.schwarz Lukasz Pawlik lukaszx.pawlik@intel.com +Lukasz Rzasik lukasz.rzasik@gmail.com Maciej Józefczyk mjozefcz@redhat.com Madhu Challa challa@noironetworks.com Manohar K C manukc@gmail.com @@ -295,14 +296,15 @@ Mark Michelson mmichels@redhat.com Markos Chandras mchandras@suse.de Martin Casado casado@cs.stanford.edu Martin Fong mwfong@csl.sri.com -Martino Fornasa mf@fornasa.it Martin Varghese martin.varghese@nokia.com Martin Xu martinxu9.ovs@gmail.com Martin Zhang martinbj2008@gmail.com +Martino Fornasa mf@fornasa.it Maryam Tahhan maryam.tahhan@intel.com Matteo Croce mcroce@redhat.com Matthias May matthias.may@neratec.com Mauricio Vásquez mauricio.vasquezbernal@studenti.polito.it +Max Lamprecht max.lamprecht@mail.schwarz Maxime Coquelin maxime.coquelin@redhat.com Mehak Mahajan Michael Arnaldi arnaldimichael@gmail.com @@ -415,6 +417,7 @@ Somnath Chatterjee somnath.b.chatterjee@ericsson.com Songtao Zhan zhanst1@chinatelecom.cn Sorin Vinturis svinturis@cloudbasesolutions.com Sriharsha Basavapatna sriharsha.basavapatna@broadcom.com +Stefan Hoffmann stefan.hoffmann@cloudandheat.com Steffen Gebert steffen.gebert@informatik.uni-wuerzburg.de Sten Spans sten@blinkenlights.nl Stephane A. Sezer sas@cd80.net From 77d82289857f5cdcaaf4be06e17e750edcf0abd3 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Thu, 27 Apr 2023 14:32:58 +0300 Subject: [PATCH 237/833] tc: Fix cleaning chains. Sometimes there is a need to clean empty chains as done in delete_chains_from_netdev(). The cited commit doesn't remove the chain completely, which causes adding ingress_block later to fail. This can be reproduced by adding a bond as an ovs port, which makes ovs use ingress_block for it. While at it, add the name of the failing netdev to the log. 
Fixes: e1e5eac5b016 ("tc: Add TCA_KIND flower to delete and get operation to avoid rtnl_lock().") Signed-off-by: Roi Dayan Signed-off-by: Ilya Maximets --- lib/netdev-offload-tc.c | 11 ++++++++--- lib/tc.c | 4 +++- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c index c9662081fc6..4f26dd8cca5 100644 --- a/lib/netdev-offload-tc.c +++ b/lib/netdev-offload-tc.c @@ -525,7 +525,11 @@ delete_chains_from_netdev(struct netdev *netdev, struct tcf_id *id) */ HMAP_FOR_EACH_POP (chain_node, node, &map) { id->chain = chain_node->chain; - tc_del_flower_filter(id); + /* Delete empty chain doesn't seem to work with + * tc_del_flower_filter() so use tc_del_filter() + * without specifying TCA_KIND. + */ + tc_del_filter(id, NULL); free(chain_node); } } @@ -2879,8 +2883,9 @@ netdev_tc_init_flow_api(struct netdev *netdev) error = tc_add_del_qdisc(ifindex, true, block_id, hook); if (error && error != EEXIST) { - VLOG_INFO("failed adding ingress qdisc required for offloading: %s", - ovs_strerror(error)); + VLOG_INFO("failed adding ingress qdisc required for offloading " + "on %s: %s", + netdev_get_name(netdev), ovs_strerror(error)); return error; } diff --git a/lib/tc.c b/lib/tc.c index 4c07e22162e..5c32c6f971d 100644 --- a/lib/tc.c +++ b/lib/tc.c @@ -2354,7 +2354,9 @@ tc_del_filter(struct tcf_id *id, const char *kind) struct ofpbuf request; request_from_tcf_id(id, 0, RTM_DELTFILTER, NLM_F_ACK, &request); - nl_msg_put_string(&request, TCA_KIND, kind); + if (kind) { + nl_msg_put_string(&request, TCA_KIND, kind); + } return tc_transact(&request, NULL); } From 46240314ac483b93aef081d828b9e86fa9754feb Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 28 Apr 2023 16:17:58 +0200 Subject: [PATCH 238/833] ovsdb-idl.at: Fix write-changed-only tests without change tracking. 
The '-w' command line argument is not passed to test-ovsdb in the OVSDB_CHECK_IDL_WRITE_CHANGED_ONLY_C, so it just repeats normal tests without testing the feature. Adding the flag. And using the long version of the flag to make things more obvious and harder to overlook. Swapping the argument in the other working test as well, just for consistency. Fixes: d94cd0d3eec3 ("ovsdb-idl: Support write-only-changed IDL monitor mode.") Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- tests/ovsdb-idl.at | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/ovsdb-idl.at b/tests/ovsdb-idl.at index 5a7e76eaa95..9d28672efe6 100644 --- a/tests/ovsdb-idl.at +++ b/tests/ovsdb-idl.at @@ -94,7 +94,7 @@ m4_define([OVSDB_CHECK_IDL_WRITE_CHANGED_ONLY_C], AT_CHECK([ovsdb_start_idltest]) m4_if([$2], [], [], [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])]) - AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 idl unix:socket $3], + AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 --write-changed-only idl unix:socket $3], [0], [stdout], [ignore]) AT_CHECK([sort stdout | uuidfilt]m4_if([$6],,, [[| $6]]), [0], [$4]) @@ -1216,7 +1216,7 @@ m4_define([OVSDB_CHECK_IDL_TRACK_WRITE_CHANGED_ONLY_C], AT_CHECK([ovsdb_start_idltest]) m4_if([$2], [], [], [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])]) - AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 -c -w idl unix:socket $3], + AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 -c --write-changed-only idl unix:socket $3], [0], [stdout], [ignore]) AT_CHECK([sort stdout | uuidfilt]m4_if([$6],,, [[| $6]]), [0], [$4]) From 1a1b3106d90e517be416ae14eed625eee240517e Mon Sep 17 00:00:00 2001 From: David Marchand Date: Thu, 4 May 2023 19:10:49 +0200 Subject: [PATCH 239/833] ci: Separate DPDK from OVS build. 
Let's separate DPDK compilation from the rest of OVS build: - this avoids multiple jobs building DPDK in parallel, which especially affects builds in the dpdk-latest branch, - we separate concerns about DPDK build requirements from OVS build requirements, like python dependencies, - building DPDK does not depend on how we will link OVS against it, so we can use a single cache entry regardless of DPDK_SHARED option, Reviewed-by: Simon Horman Signed-off-by: David Marchand Signed-off-by: Ilya Maximets --- .ci/dpdk-build.sh | 54 +++++++++++++++++ .ci/dpdk-prepare.sh | 11 ++++ .ci/linux-build.sh | 64 ++------------------ .ci/linux-prepare.sh | 3 +- .github/workflows/build-and-test.yml | 89 ++++++++++++++++++++++------ Makefile.am | 2 + 6 files changed, 145 insertions(+), 78 deletions(-) create mode 100755 .ci/dpdk-build.sh create mode 100755 .ci/dpdk-prepare.sh diff --git a/.ci/dpdk-build.sh b/.ci/dpdk-build.sh new file mode 100755 index 00000000000..02dcefef618 --- /dev/null +++ b/.ci/dpdk-build.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +set -o errexit +set -x + +function build_dpdk() +{ + local VERSION_FILE="dpdk-dir/cached-version" + local DPDK_VER=$1 + local DPDK_OPTS="" + + rm -rf dpdk-dir + + if [ "${DPDK_VER##refs/*/}" != "${DPDK_VER}" ]; then + git clone --single-branch $DPDK_GIT dpdk-dir -b "${DPDK_VER##refs/*/}" + pushd dpdk-dir + git log -1 --oneline + else + wget https://fast.dpdk.org/rel/dpdk-$1.tar.xz + tar xvf dpdk-$1.tar.xz > /dev/null + DIR_NAME=$(tar -tf dpdk-$1.tar.xz | head -1 | cut -f1 -d"/") + mv ${DIR_NAME} dpdk-dir + pushd dpdk-dir + fi + + # Switching to 'default' machine to make dpdk-dir cache usable on + # different CPUs. We can't be sure that all CI machines are exactly same. + DPDK_OPTS="$DPDK_OPTS -Dmachine=default" + + # Disable building DPDK unit tests. Not needed for OVS build or tests. + DPDK_OPTS="$DPDK_OPTS -Dtests=false" + + # Disable DPDK developer mode, this results in less build checks and less + # meson verbose outputs. 
+ DPDK_OPTS="$DPDK_OPTS -Ddeveloper_mode=disabled" + + # OVS compilation and "normal" unit tests (run in the CI) do not depend on + # any DPDK driver being present. + # We can disable all drivers to save compilation time. + DPDK_OPTS="$DPDK_OPTS -Ddisable_drivers=*/*" + + # Install DPDK using prefix. + DPDK_OPTS="$DPDK_OPTS --prefix=$(pwd)/build" + + meson $DPDK_OPTS build + ninja -C build + ninja -C build install + + echo "Installed DPDK in $(pwd)" + popd + echo "${DPDK_VER}" > ${VERSION_FILE} +} + +build_dpdk $DPDK_VER diff --git a/.ci/dpdk-prepare.sh b/.ci/dpdk-prepare.sh new file mode 100755 index 00000000000..f7e6215ddac --- /dev/null +++ b/.ci/dpdk-prepare.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +set -ev + +# Installing wheel separately because it may be needed to build some +# of the packages during dependency backtracking and pip >= 22.0 will +# abort backtracking on build failures: +# https://github.com/pypa/pip/issues/10655 +pip3 install --disable-pip-version-check --user wheel +pip3 install --disable-pip-version-check --user pyelftools +pip3 install --user 'meson==0.53.2' diff --git a/.ci/linux-build.sh b/.ci/linux-build.sh index 10021fddb25..99850a94346 100755 --- a/.ci/linux-build.sh +++ b/.ci/linux-build.sh @@ -9,9 +9,7 @@ EXTRA_OPTS="--enable-Werror" function install_dpdk() { - local DPDK_VER=$1 local VERSION_FILE="dpdk-dir/cached-version" - local DPDK_OPTS="" local DPDK_LIB=$(pwd)/dpdk-dir/build/lib/x86_64-linux-gnu if [ "$DPDK_SHARED" ]; then @@ -24,63 +22,14 @@ function install_dpdk() # Export the following path for pkg-config to find the .pc file. export PKG_CONFIG_PATH=$DPDK_LIB/pkgconfig/:$PKG_CONFIG_PATH - if [ "${DPDK_VER##refs/*/}" != "${DPDK_VER}" ]; then - # Avoid using cache for git tree build. 
- rm -rf dpdk-dir - - DPDK_GIT=${DPDK_GIT:-https://dpdk.org/git/dpdk} - git clone --single-branch $DPDK_GIT dpdk-dir -b "${DPDK_VER##refs/*/}" - pushd dpdk-dir - git log -1 --oneline - else - if [ -f "${VERSION_FILE}" ]; then - VER=$(cat ${VERSION_FILE}) - if [ "${VER}" = "${DPDK_VER}" ]; then - # Update the library paths. - sudo ldconfig - echo "Found cached DPDK ${VER} build in $(pwd)/dpdk-dir" - return - fi - fi - # No cache or version mismatch. - rm -rf dpdk-dir - wget https://fast.dpdk.org/rel/dpdk-$1.tar.xz - tar xvf dpdk-$1.tar.xz > /dev/null - DIR_NAME=$(tar -tf dpdk-$1.tar.xz | head -1 | cut -f1 -d"/") - mv ${DIR_NAME} dpdk-dir - pushd dpdk-dir + if [ ! -f "${VERSION_FILE}" ]; then + echo "Could not find DPDK in $(pwd)/dpdk-dir" + return 1 fi - # Switching to 'default' machine to make dpdk-dir cache usable on - # different CPUs. We can't be sure that all CI machines are exactly same. - DPDK_OPTS="$DPDK_OPTS -Dmachine=default" - - # Disable building DPDK unit tests. Not needed for OVS build or tests. - DPDK_OPTS="$DPDK_OPTS -Dtests=false" - - # Disable DPDK developer mode, this results in less build checks and less - # meson verbose outputs. - DPDK_OPTS="$DPDK_OPTS -Ddeveloper_mode=disabled" - - # OVS compilation and "normal" unit tests (run in the CI) do not depend on - # any DPDK driver being present. - # We can disable all drivers to save compilation time. - DPDK_OPTS="$DPDK_OPTS -Ddisable_drivers=*/*" - - # Install DPDK using prefix. - DPDK_OPTS="$DPDK_OPTS --prefix=$(pwd)/build" - - CC=gcc meson $DPDK_OPTS build - ninja -C build - ninja -C build install - # Update the library paths. 
sudo ldconfig - - - echo "Installed DPDK source in $(pwd)" - popd - echo "${DPDK_VER}" > ${VERSION_FILE} + echo "Found cached DPDK $(cat ${VERSION_FILE}) build in $(pwd)/dpdk-dir" } function configure_ovs() @@ -130,10 +79,7 @@ assert ovs.json.from_string('{\"a\": 42}') == {'a': 42}" fi if [ "$DPDK" ] || [ "$DPDK_SHARED" ]; then - if [ -z "$DPDK_VER" ]; then - DPDK_VER="22.11.1" - fi - install_dpdk $DPDK_VER + install_dpdk fi if [ "$CC" = "clang" ]; then diff --git a/.ci/linux-prepare.sh b/.ci/linux-prepare.sh index f414a879c70..c28b6819a35 100755 --- a/.ci/linux-prepare.sh +++ b/.ci/linux-prepare.sh @@ -23,8 +23,7 @@ cd .. # https://github.com/pypa/pip/issues/10655 pip3 install --disable-pip-version-check --user wheel pip3 install --disable-pip-version-check --user \ - flake8 'hacking>=3.0' netaddr pyparsing sphinx setuptools pyelftools -pip3 install --user 'meson==0.53.2' + flake8 'hacking>=3.0' netaddr pyparsing sphinx setuptools # Install python test dependencies pip3 install -r python/test_requirements.txt diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 39649c1b5cd..f66ab43b0bf 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -3,12 +3,80 @@ name: Build and Test on: [push, pull_request] jobs: + build-dpdk: + env: + dependencies: gcc libnuma-dev ninja-build + CC: gcc + DPDK_GIT: https://dpdk.org/git/dpdk-stable + DPDK_VER: 22.11.1 + name: dpdk gcc + outputs: + dpdk_key: ${{ steps.gen_dpdk_key.outputs.key }} + runs-on: ubuntu-20.04 + timeout-minutes: 30 + + steps: + - name: checkout + uses: actions/checkout@v3 + + - name: update PATH + run: | + echo "$HOME/bin" >> $GITHUB_PATH + echo "$HOME/.local/bin" >> $GITHUB_PATH + + - name: create ci signature file for the dpdk cache key + # This will collect most of DPDK related lines, so hash will be different + # if something changed in a way we're building DPDK including DPDK_VER. 
+ # This also allows us to use cache from any branch as long as version + # and a way we're building DPDK stays the same. + run: | + grep -irE 'RTE_|DPDK|meson|ninja' .ci/dpdk-* > dpdk-ci-signature + grep -rwE 'DPDK_GIT|DPDK_VER' .github/ >> dpdk-ci-signature + if [ "${DPDK_VER##refs/*/}" != "${DPDK_VER}" ]; then + git ls-remote --heads $DPDK_GIT $DPDK_VER >> dpdk-ci-signature + fi + cat dpdk-ci-signature + + - name: generate ci DPDK key + id: gen_dpdk_key + env: + ci_key: ${{ hashFiles('dpdk-ci-signature') }} + run: echo 'key=dpdk-${{ env.ci_key }}' >> $GITHUB_OUTPUT + + - name: cache + id: dpdk_cache + uses: actions/cache@v3 + with: + path: dpdk-dir + key: ${{ steps.gen_dpdk_key.outputs.key }} + + - name: set up python + if: steps.dpdk_cache.outputs.cache-hit != 'true' + uses: actions/setup-python@v4 + with: + python-version: '3.9' + + - name: update APT cache + if: steps.dpdk_cache.outputs.cache-hit != 'true' + run: sudo apt update || true + - name: install common dependencies + if: steps.dpdk_cache.outputs.cache-hit != 'true' + run: sudo apt install -y ${{ env.dependencies }} + + - name: prepare + if: steps.dpdk_cache.outputs.cache-hit != 'true' + run: ./.ci/dpdk-prepare.sh + + - name: build + if: steps.dpdk_cache.outputs.cache-hit != 'true' + run: ./.ci/dpdk-build.sh + build-linux: + needs: build-dpdk env: dependencies: | - automake libtool gcc bc libjemalloc2 libjemalloc-dev \ - libssl-dev llvm-dev libelf-dev libnuma-dev libpcap-dev \ - ninja-build selinux-policy-dev libbpf-dev + automake libtool gcc bc libjemalloc2 libjemalloc-dev libssl-dev \ + llvm-dev libnuma-dev libpcap-dev selinux-policy-dev libbpf-dev ASAN: ${{ matrix.asan }} UBSAN: ${{ matrix.ubsan }} CC: ${{ matrix.compiler }} @@ -104,25 +172,12 @@ jobs: with: python-version: '3.9' - - name: create ci signature file for the dpdk cache key - if: matrix.dpdk != '' || matrix.dpdk_shared != '' - # This will collect most of DPDK related lines, so hash will be different - # if something changed in a way 
we're building DPDK including DPDK_VER. - # This also allows us to use cache from any branch as long as version - # and a way we're building DPDK stays the same. - run: | - grep -irE 'RTE_|DPDK|meson|ninja' -r .ci/ > dpdk-ci-signature - cat dpdk-ci-signature - - name: cache if: matrix.dpdk != '' || matrix.dpdk_shared != '' uses: actions/cache@v3 - env: - matrix_key: ${{ matrix.dpdk }}${{ matrix.dpdk_shared }} - ci_key: ${{ hashFiles('dpdk-ci-signature') }} with: path: dpdk-dir - key: ${{ env.matrix_key }}-${{ env.ci_key }} + key: ${{ needs.build-dpdk.outputs.dpdk_key }} - name: update APT cache run: sudo apt update || true diff --git a/Makefile.am b/Makefile.am index e605187b813..df9c33dfe63 100644 --- a/Makefile.am +++ b/Makefile.am @@ -75,6 +75,8 @@ EXTRA_DIST = \ MAINTAINERS.rst \ README.rst \ NOTICE \ + .ci/dpdk-build.sh \ + .ci/dpdk-prepare.sh \ .ci/linux-build.sh \ .ci/linux-prepare.sh \ .ci/osx-build.sh \ From 14773af4b28fd3c30832cd1cb05711fd9b345fbf Mon Sep 17 00:00:00 2001 From: Yunjian Wang Date: Sat, 6 May 2023 18:00:09 +0800 Subject: [PATCH 240/833] ofproto-dpif-xlate: Fix use-after-free when xlate_actions(). Currently, bundle->cvlans and xbundle->cvlans are pointing to the same memory location. This can cause issues if the main thread modifies bundle->cvlans and frees it while the revalidator thread is still accessing xbundle->cvlans. This leads to use-after-free error. 
AddressSanitizer: heap-use-after-free on address 0x615000007b08 at pc 0x0000004ede1e bp 0x7f3120ee0310 sp 0x7f3120ee0300 READ of size 8 at 0x615000007b08 thread T25 (revalidator25) 0 0x4ede1d in bitmap_is_set lib/bitmap.h:91 1 0x4fcb26 in xbundle_allows_cvlan ofproto/ofproto-dpif-xlate.c:2028 2 0x4fe279 in input_vid_is_valid ofproto/ofproto-dpif-xlate.c:2294 3 0x502abf in xlate_normal ofproto/ofproto-dpif-xlate.c:3051 4 0x5164dc in xlate_output_action ofproto/ofproto-dpif-xlate.c:5361 5 0x522576 in do_xlate_actions ofproto/ofproto-dpif-xlate.c:7047 6 0x52a751 in xlate_actions ofproto/ofproto-dpif-xlate.c:8061 7 0x4e2b66 in xlate_key ofproto/ofproto-dpif-upcall.c:2212 8 0x4e2e13 in xlate_ukey ofproto/ofproto-dpif-upcall.c:2227 9 0x4e345d in revalidate_ukey__ ofproto/ofproto-dpif-upcall.c:2276 10 0x4e3f85 in revalidate_ukey ofproto/ofproto-dpif-upcall.c:2395 11 0x4e7ac5 in revalidate ofproto/ofproto-dpif-upcall.c:2858 12 0x4d9ed3 in udpif_revalidator ofproto/ofproto-dpif-upcall.c:1010 13 0x7cd92e in ovsthread_wrapper lib/ovs-thread.c:423 14 0x7f312ff01f3a (/usr/lib64/libpthread.so.0+0x8f3a) 15 0x7f312fc8f51f in clone (/usr/lib64/libc.so.6+0xf851f) 0x615000007b08 is located 8 bytes inside of 512-byte region [0x615000007b00,0x615000007d00) freed by thread T0 here: 0 0x7f3130378ad8 in free (/usr/lib64/libasan.so.4+0xe0ad8) 1 0x49044e in bundle_set ofproto/ofproto-dpif.c:3431 2 0x444f92 in ofproto_bundle_register ofproto/ofproto.c:1455 3 0x40e6c9 in port_configure vswitchd/bridge.c:1300 4 0x40bcfd in bridge_reconfigure vswitchd/bridge.c:921 5 0x41f1a9 in bridge_run vswitchd/bridge.c:3313 6 0x42d4fb in main vswitchd/ovs-vswitchd.c:132 7 0x7f312fbbcc86 in __libc_start_main (/usr/lib64/libc.so.6+0x25c86) previously allocated by thread T0 here: 0 0x7f3130378e70 in __interceptor_malloc 1 0x8757fe in xmalloc__ lib/util.c:140 2 0x8758da in xmalloc lib/util.c:175 3 0x875927 in xmemdup lib/util.c:188 4 0x475f63 in bitmap_clone lib/bitmap.h:79 5 0x47797c in vlan_bitmap_clone 
lib/vlan-bitmap.h:40 6 0x49048d in bundle_set ofproto/ofproto-dpif.c:3433 7 0x444f92 in ofproto_bundle_register ofproto/ofproto.c:1455 8 0x40e6c9 in port_configure vswitchd/bridge.c:1300 9 0x40bcfd in bridge_reconfigure vswitchd/bridge.c:921 10 0x41f1a9 in bridge_run vswitchd/bridge.c:3313 11 0x42d4fb in main vswitchd/ovs-vswitchd.c:132 12 0x7f312fbbcc86 in __libc_start_main (/usr/lib64/libc.so.6+0x25c86) Fixes: fed8962aff57 ("Add new port VLAN mode "dot1q-tunnel"") Signed-off-by: Yunjian Wang Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif-xlate.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index c0117771809..29f4daa6357 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -66,6 +66,7 @@ #include "tunnel.h" #include "util.h" #include "uuid.h" +#include "vlan-bitmap.h" COVERAGE_DEFINE(xlate_actions); COVERAGE_DEFINE(xlate_actions_oversize); @@ -1028,7 +1029,10 @@ xlate_xbundle_set(struct xbundle *xbundle, xbundle->qinq_ethtype = qinq_ethtype; xbundle->vlan = vlan; xbundle->trunks = trunks; - xbundle->cvlans = cvlans; + if (!vlan_bitmap_equal(xbundle->cvlans, cvlans)) { + free(xbundle->cvlans); + xbundle->cvlans = vlan_bitmap_clone(cvlans); + } xbundle->use_priority_tags = use_priority_tags; xbundle->floodable = floodable; xbundle->protected = protected; @@ -1380,6 +1384,7 @@ xlate_xbundle_remove(struct xlate_cfg *xcfg, struct xbundle *xbundle) ovs_list_remove(&xbundle->list_node); bond_unref(xbundle->bond); lacp_unref(xbundle->lacp); + free(xbundle->cvlans); free(xbundle->name); free(xbundle); } From cd608cf96eb93ebc4aa44d1393b9cb00bfde46e5 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 9 May 2023 16:29:58 +0200 Subject: [PATCH 241/833] netdev-offload: Fix deadlock/recursive use of the netdev_hmap_rwlock rwlock. When doing performance testing with OVS v3.1 we ran into a deadlock situation with the netdev_hmap_rwlock read/write lock. 
After some debugging, it was discovered that the netdev_hmap_rwlock read lock was taken recursively. And well in the following sequence of events: netdev_ports_flow_get() It takes the read lock, while it walks all the ports in the port_to_netdev hmap and calls: - netdev_flow_get() which will call: - netdev_tc_flow_get() which will call: - netdev_ifindex_to_odp_port() This function also takes the same read lock to walk the ifindex_to_port hmap. In OVS a read/write lock does not support recursive readers. For details see the comments in ovs-thread.h. If you do this, it will lock up, mainly due to OVS setting the PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP attribute to the lock. The solution with this patch is to use two separate read/write locks, with an order guarantee to avoid another potential deadlock. Fixes: 9fe21a4fc12a ("netdev-offload: replace netdev_hmap_mutex to netdev_hmap_rwlock") Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2182541 Reviewed-by: Simon Horman Signed-off-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- lib/netdev-offload.c | 70 ++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/lib/netdev-offload.c b/lib/netdev-offload.c index 4592262bd34..a5fa6248754 100644 --- a/lib/netdev-offload.c +++ b/lib/netdev-offload.c @@ -485,11 +485,13 @@ netdev_set_hw_info(struct netdev *netdev, int type, int val) } /* Protects below port hashmaps. 
*/ -static struct ovs_rwlock netdev_hmap_rwlock = OVS_RWLOCK_INITIALIZER; +static struct ovs_rwlock ifindex_to_port_rwlock = OVS_RWLOCK_INITIALIZER; +static struct ovs_rwlock port_to_netdev_rwlock + OVS_ACQ_BEFORE(ifindex_to_port_rwlock) = OVS_RWLOCK_INITIALIZER; -static struct hmap port_to_netdev OVS_GUARDED_BY(netdev_hmap_rwlock) +static struct hmap port_to_netdev OVS_GUARDED_BY(port_to_netdev_rwlock) = HMAP_INITIALIZER(&port_to_netdev); -static struct hmap ifindex_to_port OVS_GUARDED_BY(netdev_hmap_rwlock) +static struct hmap ifindex_to_port OVS_GUARDED_BY(ifindex_to_port_rwlock) = HMAP_INITIALIZER(&ifindex_to_port); struct port_to_netdev_data { @@ -506,12 +508,12 @@ struct port_to_netdev_data { */ bool netdev_any_oor(void) - OVS_EXCLUDED(netdev_hmap_rwlock) + OVS_EXCLUDED(port_to_netdev_rwlock) { struct port_to_netdev_data *data; bool oor = false; - ovs_rwlock_rdlock(&netdev_hmap_rwlock); + ovs_rwlock_rdlock(&port_to_netdev_rwlock); HMAP_FOR_EACH (data, portno_node, &port_to_netdev) { struct netdev *dev = data->netdev; @@ -520,7 +522,7 @@ netdev_any_oor(void) break; } } - ovs_rwlock_unlock(&netdev_hmap_rwlock); + ovs_rwlock_unlock(&port_to_netdev_rwlock); return oor; } @@ -594,13 +596,13 @@ netdev_ports_flow_flush(const char *dpif_type) { struct port_to_netdev_data *data; - ovs_rwlock_rdlock(&netdev_hmap_rwlock); + ovs_rwlock_rdlock(&port_to_netdev_rwlock); HMAP_FOR_EACH (data, portno_node, &port_to_netdev) { if (netdev_get_dpif_type(data->netdev) == dpif_type) { netdev_flow_flush(data->netdev); } } - ovs_rwlock_unlock(&netdev_hmap_rwlock); + ovs_rwlock_unlock(&port_to_netdev_rwlock); } void @@ -610,7 +612,7 @@ netdev_ports_traverse(const char *dpif_type, { struct port_to_netdev_data *data; - ovs_rwlock_rdlock(&netdev_hmap_rwlock); + ovs_rwlock_rdlock(&port_to_netdev_rwlock); HMAP_FOR_EACH (data, portno_node, &port_to_netdev) { if (netdev_get_dpif_type(data->netdev) == dpif_type) { if (cb(data->netdev, data->dpif_port.port_no, aux)) { @@ -618,7 +620,7 @@ 
netdev_ports_traverse(const char *dpif_type, } } } - ovs_rwlock_unlock(&netdev_hmap_rwlock); + ovs_rwlock_unlock(&port_to_netdev_rwlock); } struct netdev_flow_dump ** @@ -629,7 +631,7 @@ netdev_ports_flow_dump_create(const char *dpif_type, int *ports, bool terse) int count = 0; int i = 0; - ovs_rwlock_rdlock(&netdev_hmap_rwlock); + ovs_rwlock_rdlock(&port_to_netdev_rwlock); HMAP_FOR_EACH (data, portno_node, &port_to_netdev) { if (netdev_get_dpif_type(data->netdev) == dpif_type) { count++; @@ -648,7 +650,7 @@ netdev_ports_flow_dump_create(const char *dpif_type, int *ports, bool terse) i++; } } - ovs_rwlock_unlock(&netdev_hmap_rwlock); + ovs_rwlock_unlock(&port_to_netdev_rwlock); *ports = i; return dumps; @@ -660,15 +662,15 @@ netdev_ports_flow_del(const char *dpif_type, const ovs_u128 *ufid, { struct port_to_netdev_data *data; - ovs_rwlock_rdlock(&netdev_hmap_rwlock); + ovs_rwlock_rdlock(&port_to_netdev_rwlock); HMAP_FOR_EACH (data, portno_node, &port_to_netdev) { if (netdev_get_dpif_type(data->netdev) == dpif_type && !netdev_flow_del(data->netdev, ufid, stats)) { - ovs_rwlock_unlock(&netdev_hmap_rwlock); + ovs_rwlock_unlock(&port_to_netdev_rwlock); return 0; } } - ovs_rwlock_unlock(&netdev_hmap_rwlock); + ovs_rwlock_unlock(&port_to_netdev_rwlock); return ENOENT; } @@ -681,16 +683,16 @@ netdev_ports_flow_get(const char *dpif_type, struct match *match, { struct port_to_netdev_data *data; - ovs_rwlock_rdlock(&netdev_hmap_rwlock); + ovs_rwlock_rdlock(&port_to_netdev_rwlock); HMAP_FOR_EACH (data, portno_node, &port_to_netdev) { if (netdev_get_dpif_type(data->netdev) == dpif_type && !netdev_flow_get(data->netdev, match, actions, ufid, stats, attrs, buf)) { - ovs_rwlock_unlock(&netdev_hmap_rwlock); + ovs_rwlock_unlock(&port_to_netdev_rwlock); return 0; } } - ovs_rwlock_unlock(&netdev_hmap_rwlock); + ovs_rwlock_unlock(&port_to_netdev_rwlock); return ENOENT; } @@ -702,7 +704,7 @@ netdev_ports_hash(odp_port_t port, const char *dpif_type) static struct port_to_netdev_data * 
netdev_ports_lookup(odp_port_t port_no, const char *dpif_type) - OVS_REQ_RDLOCK(netdev_hmap_rwlock) + OVS_REQ_RDLOCK(port_to_netdev_rwlock) { struct port_to_netdev_data *data; @@ -726,9 +728,9 @@ netdev_ports_insert(struct netdev *netdev, struct dpif_port *dpif_port) ovs_assert(dpif_type); - ovs_rwlock_wrlock(&netdev_hmap_rwlock); + ovs_rwlock_wrlock(&port_to_netdev_rwlock); if (netdev_ports_lookup(dpif_port->port_no, dpif_type)) { - ovs_rwlock_unlock(&netdev_hmap_rwlock); + ovs_rwlock_unlock(&port_to_netdev_rwlock); return EEXIST; } @@ -738,14 +740,16 @@ netdev_ports_insert(struct netdev *netdev, struct dpif_port *dpif_port) if (ifindex >= 0) { data->ifindex = ifindex; + ovs_rwlock_wrlock(&ifindex_to_port_rwlock); hmap_insert(&ifindex_to_port, &data->ifindex_node, ifindex); + ovs_rwlock_unlock(&ifindex_to_port_rwlock); } else { data->ifindex = -1; } hmap_insert(&port_to_netdev, &data->portno_node, netdev_ports_hash(dpif_port->port_no, dpif_type)); - ovs_rwlock_unlock(&netdev_hmap_rwlock); + ovs_rwlock_unlock(&port_to_netdev_rwlock); netdev_init_flow_api(netdev); @@ -758,12 +762,12 @@ netdev_ports_get(odp_port_t port_no, const char *dpif_type) struct port_to_netdev_data *data; struct netdev *ret = NULL; - ovs_rwlock_rdlock(&netdev_hmap_rwlock); + ovs_rwlock_rdlock(&port_to_netdev_rwlock); data = netdev_ports_lookup(port_no, dpif_type); if (data) { ret = netdev_ref(data->netdev); } - ovs_rwlock_unlock(&netdev_hmap_rwlock); + ovs_rwlock_unlock(&port_to_netdev_rwlock); return ret; } @@ -774,19 +778,21 @@ netdev_ports_remove(odp_port_t port_no, const char *dpif_type) struct port_to_netdev_data *data; int ret = ENOENT; - ovs_rwlock_wrlock(&netdev_hmap_rwlock); + ovs_rwlock_wrlock(&port_to_netdev_rwlock); data = netdev_ports_lookup(port_no, dpif_type); if (data) { dpif_port_destroy(&data->dpif_port); netdev_close(data->netdev); /* unref and possibly close */ hmap_remove(&port_to_netdev, &data->portno_node); if (data->ifindex >= 0) { + 
ovs_rwlock_wrlock(&ifindex_to_port_rwlock); hmap_remove(&ifindex_to_port, &data->ifindex_node); + ovs_rwlock_unlock(&ifindex_to_port_rwlock); } free(data); ret = 0; } - ovs_rwlock_unlock(&netdev_hmap_rwlock); + ovs_rwlock_unlock(&port_to_netdev_rwlock); return ret; } @@ -798,7 +804,7 @@ netdev_ports_get_n_flows(const char *dpif_type, odp_port_t port_no, struct port_to_netdev_data *data; int ret = EOPNOTSUPP; - ovs_rwlock_rdlock(&netdev_hmap_rwlock); + ovs_rwlock_rdlock(&port_to_netdev_rwlock); data = netdev_ports_lookup(port_no, dpif_type); if (data) { uint64_t thread_n_flows[MAX_OFFLOAD_THREAD_NB] = {0}; @@ -812,7 +818,7 @@ netdev_ports_get_n_flows(const char *dpif_type, odp_port_t port_no, } } } - ovs_rwlock_unlock(&netdev_hmap_rwlock); + ovs_rwlock_unlock(&port_to_netdev_rwlock); return ret; } @@ -822,14 +828,14 @@ netdev_ifindex_to_odp_port(int ifindex) struct port_to_netdev_data *data; odp_port_t ret = 0; - ovs_rwlock_rdlock(&netdev_hmap_rwlock); + ovs_rwlock_rdlock(&ifindex_to_port_rwlock); HMAP_FOR_EACH_WITH_HASH (data, ifindex_node, ifindex, &ifindex_to_port) { if (data->ifindex == ifindex) { ret = data->dpif_port.port_no; break; } } - ovs_rwlock_unlock(&netdev_hmap_rwlock); + ovs_rwlock_unlock(&ifindex_to_port_rwlock); return ret; } @@ -847,11 +853,11 @@ netdev_ports_flow_init(void) { struct port_to_netdev_data *data; - ovs_rwlock_rdlock(&netdev_hmap_rwlock); + ovs_rwlock_rdlock(&port_to_netdev_rwlock); HMAP_FOR_EACH (data, portno_node, &port_to_netdev) { netdev_init_flow_api(data->netdev); } - ovs_rwlock_unlock(&netdev_hmap_rwlock); + ovs_rwlock_unlock(&port_to_netdev_rwlock); } void From ffb8b743bb7706f6b33d0b329d011bf163976652 Mon Sep 17 00:00:00 2001 From: Zhiqi Chen Date: Wed, 10 May 2023 16:35:37 +0800 Subject: [PATCH 242/833] dpctl: Fix dereferencing null pointer in parse_ct_limit_zones(). 
Command with empty string following "dpctl/ct-get-limits zone=" such as "ovs-appctl dpctl/ct-get-limits zone=" will cause parse_ct_limit_zones() dereferencing null. Signed-off-by: Zhiqi Chen Signed-off-by: Ilya Maximets --- lib/dpctl.c | 5 +++-- tests/dpctl.at | 16 ++++++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/lib/dpctl.c b/lib/dpctl.c index 3ba40fa8fb6..15950bd50c2 100644 --- a/lib/dpctl.c +++ b/lib/dpctl.c @@ -2206,7 +2206,7 @@ parse_ct_limit_zones(const char *argv, struct ovs_list *zone_limits, argcopy = xstrdup(argv + 5); next_zone = strtok_r(argcopy, ",", &save_ptr); - do { + while (next_zone != NULL) { if (ovs_scan(next_zone, "%"SCNu16, &zone)) { ct_dpif_push_zone_limit(zone_limits, zone, 0, 0); } else { @@ -2214,7 +2214,8 @@ parse_ct_limit_zones(const char *argv, struct ovs_list *zone_limits, free(argcopy); return EINVAL; } - } while ((next_zone = strtok_r(NULL, ",", &save_ptr)) != NULL); + next_zone = strtok_r(NULL, ",", &save_ptr); + } free(argcopy); return 0; diff --git a/tests/dpctl.at b/tests/dpctl.at index 7454a51ec6b..d2f1046f8b5 100644 --- a/tests/dpctl.at +++ b/tests/dpctl.at @@ -135,3 +135,19 @@ AT_CHECK([ovs-appctl dpctl/dump-flows dummy@br0 | sort], [0], [dnl AT_CHECK([ovs-appctl dpctl/del-dp dummy@br0]) OVS_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([dpctl - ct-get-limits ct-del-limits]) +OVS_VSWITCHD_START +AT_CHECK([ovs-appctl dpctl/ct-get-limits], [0], [default limit=0 +]) +AT_CHECK([ovs-appctl dpctl/ct-get-limits zone=], [0], [default limit=0 +]) +AT_CHECK([ovs-appctl dpctl/ct-get-limits zone=,], [0], [default limit=0 +]) +AT_CHECK([ovs-appctl dpctl/ct-get-limits zone=x], [2], [], + [ovs-vswitchd: invalid zone (Invalid argument) +ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl dpctl/ct-del-limits zone=]) +OVS_VSWITCHD_STOP +AT_CLEANUP \ No newline at end of file From 64e4cca5c4925f903386367848bc8ad5df10f417 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 11 May 2023 21:13:29 
+0200 Subject: [PATCH 243/833] AUTHORS: Add Zhiqi Chen. Additionally re-sorted part of the list that was particularly not ordered. Signed-off-by: Ilya Maximets --- AUTHORS.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index 4dca731fc91..a8ff226ec10 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -491,15 +491,16 @@ Yuanhan Liu yuanhan.liu@linux.intel.com Yunjian Wang wangyunjian@huawei.com Yousong Zhou yszhou4tech@gmail.com Zak Whittington zwhitt.vmware@gmail.com -ZhengLingyun konghuarukhr@163.com -Zoltán Balogh zoltan.balogh.eth@gmail.com -Zoltan Kiss zoltan.kiss@citrix.com -Zongkai LI zealokii@gmail.com -Zhi Yong Wu zwu.kernel@gmail.com Zang MingJie zealot0630@gmail.com +ZhengLingyun konghuarukhr@163.com Zhenyu Gao sysugaozhenyu@gmail.com +Zhi Yong Wu zwu.kernel@gmail.com ZhiPeng Lu luzhipeng@uniudc.com +Zhiqi Chen chenzhiqi.123@bytedance.com Zhou Yangchao 1028519445@qq.com +Zoltan Kiss zoltan.kiss@citrix.com +Zoltán Balogh zoltan.balogh.eth@gmail.com +Zongkai LI zealokii@gmail.com aginwala amginwal@gmail.com lic121 lic121@chinatelecom.cn lzhecheng lzhecheng@vmware.com From f3f3be682dfaaf13cecb69a17767151d18787a57 Mon Sep 17 00:00:00 2001 From: Stefan Hoffmann Date: Thu, 11 May 2023 15:35:15 +0200 Subject: [PATCH 244/833] tests-ovsdb: Switch OVSDB_START_IDLTEST to macro. Define bash function as macro now. Later we can extend this macro for other usecases. 
Signed-off-by: Stefan Hoffmann Signed-off-by: Ilya Maximets --- tests/ovsdb-idl.at | 83 +++++++++++++++++++++++++--------------------- 1 file changed, 45 insertions(+), 38 deletions(-) diff --git a/tests/ovsdb-idl.at b/tests/ovsdb-idl.at index 9d28672efe6..258d79fe93f 100644 --- a/tests/ovsdb-idl.at +++ b/tests/ovsdb-idl.at @@ -1,17 +1,6 @@ AT_BANNER([OVSDB -- interface description language (IDL)]) m4_divert_text([PREPARE_TESTS], [ -# ovsdb_start_idltest [REMOTE] [SCHEMA] -# -# Creates a database using SCHEMA (default: idltest.ovsschema) and -# starts a database server listening on punix:socket and REMOTE (if -# specified). -ovsdb_start_idltest () { - ovsdb-tool create db ${2:-$abs_srcdir/idltest.ovsschema} || return $? - ovsdb-server -vconsole:warn --log-file --detach --no-chdir --pidfile --remote=punix:socket ${1:+--remote=$1} db || return $? - on_exit 'kill `cat ovsdb-server.pid`' -} - # ovsdb_cluster_leader [REMOTES] [DATABASE] # # Returns the leader of the DATABASE cluster. @@ -29,6 +18,24 @@ ovsdb_cluster_leader () { done }]) + +# OVSDB_START_IDLTEST([REMOTE], [SCHEMA]) +# +# Creates a database using SCHEMA (default: idltest.ovsschema) and +# starts a database server listening on punix:socket and REMOTE (if +# specified). 
+m4_define([OVSDB_START_IDLTEST], +[ + AT_CHECK([ovsdb-tool create db dnl + m4_if([$2], [], [$abs_srcdir/idltest.ovsschema], [$2])]) + AT_CHECK([ovsdb-server -vconsole:warn --log-file --detach --no-chdir dnl + --pidfile --remote=punix:socket dnl + m4_if([$1], [], [], [--remote=$1]) db dnl + ]) + on_exit 'kill `cat ovsdb-server.pid`' +]) + + # OVSDB_CLUSTER_START_IDLTEST([N], [REMOTE]) # # Creates a clustered database using idltest.ovsschema and starts a database @@ -77,7 +84,7 @@ m4_define([OVSDB_CLUSTER_START_IDLTEST], m4_define([OVSDB_CHECK_IDL_C], [AT_SETUP([$1 - C]) AT_KEYWORDS([ovsdb server idl positive $5]) - AT_CHECK([ovsdb_start_idltest]) + OVSDB_START_IDLTEST m4_if([$2], [], [], [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])]) AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 idl unix:socket $3], @@ -91,7 +98,7 @@ m4_define([OVSDB_CHECK_IDL_C], m4_define([OVSDB_CHECK_IDL_WRITE_CHANGED_ONLY_C], [AT_SETUP([$1 - write-changed-only - C]) AT_KEYWORDS([ovsdb server idl positive $5]) - AT_CHECK([ovsdb_start_idltest]) + OVSDB_START_IDLTEST m4_if([$2], [], [], [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])]) AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 --write-changed-only idl unix:socket $3], @@ -105,7 +112,7 @@ m4_define([OVSDB_CHECK_IDL_WRITE_CHANGED_ONLY_C], m4_define([OVSDB_CHECK_IDL_TCP_C], [AT_SETUP([$1 - C - tcp]) AT_KEYWORDS([ovsdb server idl positive tcp socket $5]) - AT_CHECK([ovsdb_start_idltest "ptcp:0:127.0.0.1"]) + OVSDB_START_IDLTEST(["ptcp:0:127.0.0.1"]) PARSE_LISTENING_PORT([ovsdb-server.log], [TCP_PORT]) m4_if([$2], [], [], @@ -123,7 +130,7 @@ m4_define([OVSDB_CHECK_IDL_TCP6_C], AT_SKIP_IF([test "$IS_WIN32" = "yes"]) AT_SKIP_IF([test $HAVE_IPV6 = no]) AT_KEYWORDS([ovsdb server idl positive tcp6 socket $5]) - AT_CHECK([ovsdb_start_idltest "ptcp:0:[[::1]]"]) + OVSDB_START_IDLTEST(["ptcp:0:[[::1]]"]) PARSE_LISTENING_PORT([ovsdb-server.log], 
[TCP_PORT]) m4_if([$2], [], [], @@ -139,7 +146,7 @@ m4_define([OVSDB_CHECK_IDL_TCP6_C], m4_define([OVSDB_CHECK_IDL_PY], [AT_SETUP([$1 - Python3]) AT_KEYWORDS([ovsdb server idl positive Python $5]) - AT_CHECK([ovsdb_start_idltest]) + OVSDB_START_IDLTEST m4_if([$2], [], [], [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])]) AT_CHECK([$PYTHON3 $srcdir/test-ovsdb.py -t10 idl $srcdir/idltest.ovsschema unix:socket $3], @@ -152,7 +159,7 @@ m4_define([OVSDB_CHECK_IDL_PY], m4_define([OVSDB_CHECK_IDL_REGISTER_COLUMNS_PY], [AT_SETUP([$1 - Python3 - register_columns]) AT_KEYWORDS([ovsdb server idl positive Python register_columns $5]) - AT_CHECK([ovsdb_start_idltest]) + OVSDB_START_IDLTEST m4_if([$2], [], [], [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])]) AT_CHECK([$PYTHON3 $srcdir/test-ovsdb.py -t10 idl $srcdir/idltest.ovsschema unix:socket ?simple:b,ba,i,ia,r,ra,s,sa,u,ua?simple3:name,uset,uref?simple4:name?simple6:name,weak_ref?link1:i,k,ka,l2?link2:i,l1?singleton:name $3], @@ -166,7 +173,7 @@ m4_define([OVSDB_CHECK_IDL_REGISTER_COLUMNS_PY], m4_define([OVSDB_CHECK_IDL_TCP_PY], [AT_SETUP([$1 - Python3 - tcp]) AT_KEYWORDS([ovsdb server idl positive Python with tcp socket $5]) - AT_CHECK([ovsdb_start_idltest "ptcp:0:127.0.0.1"]) + OVSDB_START_IDLTEST(["ptcp:0:127.0.0.1"]) PARSE_LISTENING_PORT([ovsdb-server.log], [TCP_PORT]) m4_if([$2], [], [], @@ -183,7 +190,7 @@ m4_define([OVSDB_CHECK_IDL_TCP_PY], m4_define([OVSDB_CHECK_IDL_TCP_MULTIPLE_REMOTES_PY], [AT_SETUP([$1 - Python3 (multiple remotes) - tcp]) AT_KEYWORDS([ovsdb server idl positive Python with tcp socket $5]) - AT_CHECK([ovsdb_start_idltest "ptcp:0:127.0.0.1"]) + OVSDB_START_IDLTEST(["ptcp:0:127.0.0.1"]) PARSE_LISTENING_PORT([ovsdb-server.log], [TCP_PORT]) WRONG_PORT_1=$((TCP_PORT + 101)) WRONG_PORT_2=$((TCP_PORT + 102)) @@ -203,7 +210,7 @@ m4_define([OVSDB_CHECK_IDL_TCP6_PY], AT_SKIP_IF([test "$IS_WIN32" = "yes"]) AT_SKIP_IF([test $HAVE_IPV6 = no]) 
AT_KEYWORDS([ovsdb server idl positive Python with tcp6 socket $5]) - AT_CHECK([ovsdb_start_idltest "ptcp:0:[[::1]]"]) + OVSDB_START_IDLTEST(["ptcp:0:[[::1]]"]) PARSE_LISTENING_PORT([ovsdb-server.log], [TCP_PORT]) echo "TCP_PORT=$TCP_PORT" @@ -221,7 +228,7 @@ m4_define([OVSDB_CHECK_IDL_TCP6_MULTIPLE_REMOTES_PY], AT_SKIP_IF([test "$IS_WIN32" = "yes"]) AT_SKIP_IF([test $HAVE_IPV6 = no]) AT_KEYWORDS([ovsdb server idl positive Python with tcp6 socket $5]) - AT_CHECK([ovsdb_start_idltest "ptcp:0:[[::1]]"]) + OVSDB_START_IDLTEST(["ptcp:0:[[::1]]"]) PARSE_LISTENING_PORT([ovsdb-server.log], [TCP_PORT]) WRONG_PORT_1=$((TCP_PORT + 101)) WRONG_PORT_2=$((TCP_PORT + 102)) @@ -287,13 +294,13 @@ m4_define([OVSDB_CHECK_IDL_PASSIVE_TCP_PY], [AT_SETUP([$1 - Python3 - ptcp]) AT_KEYWORDS([ovsdb server idl positive Python with tcp socket $5]) # find free TCP port - AT_CHECK([ovsdb_start_idltest "ptcp:0:127.0.0.1"]) + OVSDB_START_IDLTEST(["ptcp:0:127.0.0.1"]) PARSE_LISTENING_PORT([ovsdb-server.log], [TCP_PORT]) OVSDB_SERVER_SHUTDOWN rm -f db # start OVSDB server in passive mode - AT_CHECK([ovsdb_start_idltest "tcp:127.0.0.1:$TCP_PORT"]) + OVSDB_START_IDLTEST(["tcp:127.0.0.1:$TCP_PORT"]) AT_CHECK([$PYTHON3 $srcdir/test-ovsdb.py -t10 idl_passive $srcdir/idltest.ovsschema ptcp:127.0.0.1:$TCP_PORT $3], [0], [stdout], [ignore]) AT_CHECK([sort stdout | uuidfilt]m4_if([$6],,, [[| $6]]), @@ -473,7 +480,7 @@ OVSDB_CHECK_IDL([simple idl, writing via IDL with unicode], m4_define([OVSDB_CHECK_IDL_PY_WITH_EXPOUT], [AT_SETUP([$1 - Python3]) AT_KEYWORDS([ovsdb server idl positive Python $5]) - AT_CHECK([ovsdb_start_idltest]) + OVSDB_START_IDLTEST m4_if([$2], [], [], [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])]) AT_CHECK([$PYTHON3 $srcdir/test-ovsdb.py -t10 idl $srcdir/idltest.ovsschema unix:socket $3], @@ -990,7 +997,7 @@ AT_KEYWORDS([ovsdb server idl positive]) # table link2 and column l2 have been deleted. 
But the IDL still # expects them to be there, so this test checks that it properly # tolerates them being missing. -AT_CHECK([ovsdb_start_idltest "" "$abs_srcdir/idltest2.ovsschema"]) +OVSDB_START_IDLTEST([], ["$abs_srcdir/idltest2.ovsschema"]) AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 idl unix:socket ['["idltest", {"op": "insert", "table": "link1", @@ -1063,7 +1070,7 @@ AT_CLEANUP m4_define([OVSDB_CHECK_IDL_FETCH_COLUMNS_PY], [AT_SETUP([$1 - Python3 - fetch]) AT_KEYWORDS([ovsdb server idl positive Python increment fetch $6]) - AT_CHECK([ovsdb_start_idltest]) + OVSDB_START_IDLTEST m4_if([$2], [], [], [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])]) AT_CHECK([$PYTHON3 $srcdir/test-ovsdb.py -t10 idl $srcdir/idltest.ovsschema unix:socket [$3] $4], @@ -1107,7 +1114,7 @@ OVSDB_CHECK_IDL_FETCH_COLUMNS([simple idl, initially populated], m4_define([OVSDB_CHECK_IDL_WO_MONITOR_COND_PY], [AT_SETUP([$1 - Python3]) AT_KEYWORDS([ovsdb server idl Python monitor $4]) - AT_CHECK([ovsdb_start_idltest]) + OVSDB_START_IDLTEST AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/disable-monitor-cond]) AT_CHECK([$PYTHON3 $srcdir/test-ovsdb.py -t10 idl $srcdir/idltest.ovsschema unix:socket $2], [0], [stdout], [ignore]) @@ -1200,7 +1207,7 @@ OVSDB_CHECK_IDL_WO_MONITOR_COND([simple idl disable monitor-cond], m4_define([OVSDB_CHECK_IDL_TRACK_C], [AT_SETUP([$1 - C]) AT_KEYWORDS([ovsdb server idl tracking positive $5]) - AT_CHECK([ovsdb_start_idltest]) + OVSDB_START_IDLTEST m4_if([$2], [], [], [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])]) AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 -c idl unix:socket $3], @@ -1213,7 +1220,7 @@ m4_define([OVSDB_CHECK_IDL_TRACK_C], m4_define([OVSDB_CHECK_IDL_TRACK_WRITE_CHANGED_ONLY_C], [AT_SETUP([$1 - write-changed-only - C]) AT_KEYWORDS([ovsdb server idl tracking positive $5]) - AT_CHECK([ovsdb_start_idltest]) + OVSDB_START_IDLTEST m4_if([$2], 
[], [], [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])]) AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 -c --write-changed-only idl unix:socket $3], @@ -1716,7 +1723,7 @@ OVSDB_CHECK_IDL_TRACK([track, simple idl, initially empty, various ops], m4_define([OVSDB_CHECK_IDL_PARTIAL_UPDATE_MAP_COLUMN], [AT_SETUP([$1 - C]) AT_KEYWORDS([ovsdb server idl partial update map column positive $5]) - AT_CHECK([ovsdb_start_idltest]) + OVSDB_START_IDLTEST m4_if([$2], [], [], [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])]) AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 -c idl-partial-update-map-column unix:socket $3], @@ -1777,7 +1784,7 @@ OVSDB_CHECK_IDL_PY([partial-map update set refmap idl], m4_define([OVSDB_CHECK_IDL_PARTIAL_UPDATE_SET_COLUMN], [AT_SETUP([$1 - C]) AT_KEYWORDS([ovsdb server idl partial update set column positive $5]) - AT_CHECK([ovsdb_start_idltest]) + OVSDB_START_IDLTEST m4_if([$2], [], [], [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])]) AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 -c idl-partial-update-set-column unix:socket $3], @@ -1966,7 +1973,7 @@ OVSDB_CHECK_IDL_NOTIFY([simple idl verify notify], m4_define([OVSDB_CHECK_IDL_COMPOUND_INDEX_SINGLE_COLUMN_C], [AT_SETUP([$1 - C]) AT_KEYWORDS([ovsdb server idl compound_index_single_column compound_index positive $5]) - AT_CHECK([ovsdb_start_idltest]) + OVSDB_START_IDLTEST m4_if([$2], [], [], [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])]) # Generate the data to be tested. 
@@ -2113,7 +2120,7 @@ OVSDB_CHECK_IDL_COMPOUND_INDEX_SINGLE_COLUMN_C([Compound_index, single column te m4_define([OVSDB_CHECK_IDL_COMPOUND_INDEX_DOUBLE_COLUMN_C], [AT_SETUP([$1 - C]) AT_KEYWORDS([ovsdb server idl compound_index_double_column compound_index positive $5]) - AT_CHECK([ovsdb_start_idltest]) + OVSDB_START_IDLTEST m4_if([$2], [], [], [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])]) # Generate the data to be tested. @@ -2252,7 +2259,7 @@ OVSDB_CHECK_IDL_COMPOUND_INDEX_DOUBLE_COLUMN_C([Compound_index, double column te m4_define([OVSDB_CHECK_IDL_COMPOUND_INDEX_WITH_REF], [AT_SETUP([$1 - C]) AT_KEYWORDS([ovsdb server idl compound_index compound_index_with_ref positive $5]) - AT_CHECK([ovsdb_start_idltest]) + OVSDB_START_IDLTEST m4_if([$2], [], [], [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])]) AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 -c idl-compound-index-with-ref unix:socket $3], @@ -2280,7 +2287,7 @@ m4_define([CHECK_STREAM_OPEN_BLOCK], AT_SKIP_IF([test "$3" = "tcp6" && test "$IS_WIN32" = "yes"]) AT_SKIP_IF([test "$3" = "tcp6" && test "$HAVE_IPV6" = "no"]) AT_KEYWORDS([ovsdb server stream open_block $3]) - AT_CHECK([ovsdb_start_idltest "ptcp:0:$4"]) + OVSDB_START_IDLTEST(["ptcp:0:$4"]) PARSE_LISTENING_PORT([ovsdb-server.log], [TCP_PORT]) WRONG_PORT=$(($TCP_PORT + 101)) AT_CHECK([$2 tcp:$4:$TCP_PORT], [0], [ignore]) @@ -2468,7 +2475,7 @@ reconnect.*waiting .* seconds before reconnect) AT_SETUP([idl table and column presence check]) AT_KEYWORDS([ovsdb server idl table column check]) -AT_CHECK([ovsdb_start_idltest "" "$abs_srcdir/idltest2.ovsschema"]) +OVSDB_START_IDLTEST([], ["$abs_srcdir/idltest2.ovsschema"]) AT_CHECK(ovsdb-tool create db2 $abs_srcdir/idltest.ovsschema) AT_CHECK(ovsdb-server -vconsole:warn --log-file=ovsdb-server2.log --detach dnl @@ -2596,7 +2603,7 @@ OVSDB_CHECK_IDL_TRACK([track, insert and delete, refs to link2], 
m4_define([OVSDB_CHECK_IDL_PERS_UUID_INSERT_C], [AT_SETUP([$1 - C]) AT_KEYWORDS([idl persistent uuid insert]) - AT_CHECK([ovsdb_start_idltest "" "$abs_srcdir/idltest.ovsschema"]) + OVSDB_START_IDLTEST([], ["$abs_srcdir/idltest.ovsschema"]) AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 idl unix:socket $2], [0], [stdout], [stderr]) AT_CHECK([sort stdout], @@ -2608,7 +2615,7 @@ m4_define([OVSDB_CHECK_IDL_PERS_UUID_INSERT_C], m4_define([OVSDB_CHECK_IDL_PERS_UUID_INSERT_PY], [AT_SETUP([$1 - Python3]) AT_KEYWORDS([idl persistent uuid insert]) - AT_CHECK([ovsdb_start_idltest "" "$abs_srcdir/idltest.ovsschema"]) + OVSDB_START_IDLTEST([], ["$abs_srcdir/idltest.ovsschema"]) AT_CHECK([$PYTHON3 $srcdir/test-ovsdb.py -t10 idl $srcdir/idltest.ovsschema unix:socket $2], [0], [stdout], [stderr]) AT_CHECK([sort stdout], From 965c2955e6750f503b55d5c0af516cbb7b45f7ae Mon Sep 17 00:00:00 2001 From: Stefan Hoffmann Date: Thu, 11 May 2023 15:38:50 +0200 Subject: [PATCH 245/833] test-stream: Add ssl tests for stream open block. This tests stream.c and stream.py with ssl connection at CHECK_STREAM_OPEN_BLOCK. For the tests, ovsdb needs to be built with libssl.
Signed-off-by: Stefan Hoffmann Signed-off-by: Ilya Maximets --- tests/ovsdb-idl.at | 31 +++++++++++++++++++++++++++---- tests/test-stream.c | 12 +++++++++++- tests/test-stream.py | 18 ++++++++++++++++++ 3 files changed, 56 insertions(+), 5 deletions(-) diff --git a/tests/ovsdb-idl.at b/tests/ovsdb-idl.at index 258d79fe93f..978a6677bd6 100644 --- a/tests/ovsdb-idl.at +++ b/tests/ovsdb-idl.at @@ -28,8 +28,13 @@ m4_define([OVSDB_START_IDLTEST], [ AT_CHECK([ovsdb-tool create db dnl m4_if([$2], [], [$abs_srcdir/idltest.ovsschema], [$2])]) + PKIDIR=$abs_top_builddir/tests AT_CHECK([ovsdb-server -vconsole:warn --log-file --detach --no-chdir dnl --pidfile --remote=punix:socket dnl + m4_if(m4_substr($1, 0, 5), [pssl:], + [--private-key=$PKIDIR/testpki-privkey2.pem dnl + --certificate=$PKIDIR/testpki-cert2.pem dnl + --ca-cert=$PKIDIR/testpki-cacert.pem], []) dnl m4_if([$1], [], [], [--remote=$1]) db dnl ]) on_exit 'kill `cat ovsdb-server.pid`' @@ -2286,14 +2291,26 @@ m4_define([CHECK_STREAM_OPEN_BLOCK], [AT_SETUP([Check stream open block - $1 - $3]) AT_SKIP_IF([test "$3" = "tcp6" && test "$IS_WIN32" = "yes"]) AT_SKIP_IF([test "$3" = "tcp6" && test "$HAVE_IPV6" = "no"]) + AT_SKIP_IF([test "$3" = "ssl6" && test "$IS_WIN32" = "yes"]) + AT_SKIP_IF([test "$3" = "ssl6" && test "$HAVE_IPV6" = "no"]) + AT_SKIP_IF([test "$3" = "ssl" && test "$HAVE_OPENSSL" = "no"]) + $PYTHON3 -c "import ssl" + SSL_PRESENT=$? 
+ AT_SKIP_IF([test "$3" = "ssl" && test $SSL_PRESENT != 0]) + AT_SKIP_IF([test "$3" = "ssl6" && test "$HAVE_OPENSSL" = "no"]) + AT_SKIP_IF([test "$3" = "ssl6" && test $SSL_PRESENT != 0]) AT_KEYWORDS([ovsdb server stream open_block $3]) - OVSDB_START_IDLTEST(["ptcp:0:$4"]) + PKIDIR=$abs_top_builddir/tests + m4_define([PROTOCOL], [m4_substr([$3], [0], [3])]) + OVSDB_START_IDLTEST([m4_join([], [p], PROTOCOL, [:0:], $4)]) PARSE_LISTENING_PORT([ovsdb-server.log], [TCP_PORT]) WRONG_PORT=$(($TCP_PORT + 101)) - AT_CHECK([$2 tcp:$4:$TCP_PORT], [0], [ignore]) - AT_CHECK([$2 tcp:$4:$WRONG_PORT], [1], [ignore], [ignore]) + SSL_KEY_ARGS="$PKIDIR/testpki-privkey.pem $PKIDIR/testpki-cert.pem $PKIDIR/testpki-cacert.pem" + AT_CHECK([$2 PROTOCOL:$4:$TCP_PORT $SSL_KEY_ARGS], [0], [ignore]) + AT_CHECK([$2 PROTOCOL:$4:$WRONG_PORT $SSL_KEY_ARGS], [1], [ignore], + [ignore]) OVSDB_SERVER_SHUTDOWN - AT_CHECK([$2 tcp:$4:$TCP_PORT], [1], [ignore], [ignore]) + AT_CHECK([$2 PROTOCOL:$4:$TCP_PORT $SSL_KEY_ARGS], [1], [ignore], [ignore]) AT_CLEANUP]) CHECK_STREAM_OPEN_BLOCK([C], [test-stream], [tcp], [127.0.0.1]) @@ -2302,6 +2319,12 @@ CHECK_STREAM_OPEN_BLOCK([Python3], [$PYTHON3 $srcdir/test-stream.py], [tcp], [127.0.0.1]) CHECK_STREAM_OPEN_BLOCK([Python3], [$PYTHON3 $srcdir/test-stream.py], [tcp6], [[[::1]]]) +CHECK_STREAM_OPEN_BLOCK([C], [test-stream], [ssl], [127.0.0.1]) +CHECK_STREAM_OPEN_BLOCK([C], [test-stream], [ssl6], [[[::1]]]) +CHECK_STREAM_OPEN_BLOCK([Python3], [$PYTHON3 $srcdir/test-stream.py], + [ssl], [127.0.0.1]) +CHECK_STREAM_OPEN_BLOCK([Python3], [$PYTHON3 $srcdir/test-stream.py], + [ssl6], [[[::1]]]) # same as OVSDB_CHECK_IDL but uses Python IDL implementation with tcp # with multiple remotes to assert the idl connects to the leader of the Raft cluster diff --git a/tests/test-stream.c b/tests/test-stream.c index 68ce2c5442f..14e3bfe381d 100644 --- a/tests/test-stream.c +++ b/tests/test-stream.c @@ -19,6 +19,7 @@ #include "fatal-signal.h" #include "openvswitch/vlog.h" #include 
"stream.h" +#include "stream-ssl.h" #include "util.h" VLOG_DEFINE_THIS_MODULE(test_stream); @@ -33,7 +34,16 @@ main(int argc, char *argv[]) set_program_name(argv[0]); if (argc < 2) { - ovs_fatal(0, "usage: %s REMOTE", argv[0]); + ovs_fatal(0, "usage: %s REMOTE [SSL_KEY] [SSL_CERT] [SSL_CA]", + argv[0]); + } + if (strncmp("ssl:", argv[1], 4) == 0) { + if (argc < 5) { + ovs_fatal(0, "usage with ssl: %s REMOTE SSL_KEY SSL_CERT SSL_CA", + argv[0]); + } + stream_ssl_set_ca_cert_file(argv[4], false); + stream_ssl_set_key_and_cert(argv[2], argv[3]); } error = stream_open_block(stream_open(argv[1], &stream, DSCP_DEFAULT), diff --git a/tests/test-stream.py b/tests/test-stream.py index 93d63c019b3..a6a9c18b24b 100644 --- a/tests/test-stream.py +++ b/tests/test-stream.py @@ -15,10 +15,28 @@ import sys import ovs.stream +import ovs.util def main(argv): + if len(argv) < 2: + ovs.util.ovs_fatal(0, + "usage: %s REMOTE [SSL_KEY] [SSL_CERT] [SSL_CA]", + argv[0], + ) remote = argv[1] + + if remote.startswith("ssl:"): + if len(argv) < 5: + ovs.util.ovs_fatal( + 0, + "usage with ssl: %s REMOTE [SSL_KEY] [SSL_CERT] [SSL_CA]", + argv[0], + ) + ovs.stream.SSLStream.ssl_set_ca_cert_file(argv[4]) + ovs.stream.SSLStream.ssl_set_certificate_file(argv[3]) + ovs.stream.SSLStream.ssl_set_private_key_file(argv[2]) + err, stream = ovs.stream.Stream.open_block( ovs.stream.Stream.open(remote), 10000) From 5cb543bc59fbdececfcb5496b643e162284efba4 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Thu, 18 May 2023 09:22:44 -0400 Subject: [PATCH 246/833] MAINTAINERS.rst: Make myself an active maintainer I am currently an emeritus committer, but I would like to become active again for a short period of time to work through some governance issues preventing us from updating our committers list following our approved policies for doing so. 
Signed-off-by: Russell Bryant Acked-by: Alin Gabriel Serdean --- MAINTAINERS.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS.rst b/MAINTAINERS.rst index 5df9aab78d4..1dc406170f2 100644 --- a/MAINTAINERS.rst +++ b/MAINTAINERS.rst @@ -65,6 +65,8 @@ This is the current list of active Open vSwitch committers: - jpettit@ovn.org * - Pravin B Shelar - pshelar@ovn.org + * - Russell Bryant + - russell@ovn.org * - Simon Horman - horms@ovn.org * - Thomas Graf @@ -89,8 +91,6 @@ More information about Emeritus Committers can be found here: - ejj@eecs.berkeley.edu * - Joe Stringer - joe@ovn.org - * - Russell Bryant - - russell@ovn.org .. Cut here for the Documentation/internals/maintainers.rst From 8045c0f8de5192355ca438ed7eef77457c3c1625 Mon Sep 17 00:00:00 2001 From: Frode Nordahl Date: Fri, 12 May 2023 15:41:41 +0200 Subject: [PATCH 247/833] tests: dpdk: Pass `--no-pci` to tests that do not use physical ports. At present, the system-dpdk-testsuite makes assumptions about environment configuration, and will error out if DPDK compatible interfaces not configured for DPDK are present in the system with a message like: EAL: Probe PCI driver: net_virtio (1af4:1000) device: 0000:00:03.0 (socket -1) eth_virtio_pci_init(): Failed to init PCI device EAL: Requested device 0000:00:03.0 cannot be used The system-dpdk-testsuite is useful even with no DPDK PHY available, as the tests requiring a PHY will skip gracefully when none present. This patch extends the OVS_DPDK_START and OVS_DPDK_START_VSWITCHD macros to allow passing in values that will be set in `other_config:dpdk-extra` before the test runs. Tests that do not use physical ports are also extended to pass the `--no-pci` argument. We will use this patch in a follow-up, enabling more elaborate Debian autopkgtests for Open vSwitch. 
Signed-off-by: Frode Nordahl Signed-off-by: Ilya Maximets --- tests/system-dpdk-macros.at | 4 ++-- tests/system-dpdk.at | 36 ++++++++++++++++++------------------ 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/tests/system-dpdk-macros.at b/tests/system-dpdk-macros.at index 53fbc13206c..3920f08a5ed 100644 --- a/tests/system-dpdk-macros.at +++ b/tests/system-dpdk-macros.at @@ -42,7 +42,7 @@ m4_define([OVS_DPDK_START], OVS_DPDK_START_OVSDB() dnl Enable DPDK functionality AT_CHECK([ovs-vsctl --no-wait set Open_vSwitch . other_config:dpdk-init=true]) - OVS_DPDK_START_VSWITCHD() + OVS_DPDK_START_VSWITCHD($1) ]) # OVS_DPDK_START_OVSDB() @@ -72,7 +72,7 @@ m4_define([OVS_DPDK_START_OVSDB], # m4_define([OVS_DPDK_START_VSWITCHD], [dnl Change DPDK drivers log levels so that tests only catch errors - AT_CHECK([ovs-vsctl --no-wait set Open_vSwitch . other_config:dpdk-extra=--log-level=pmd.*:error]) + AT_CHECK([ovs-vsctl --no-wait set Open_vSwitch . other_config:dpdk-extra="--log-level=pmd.*:error $1"]) dnl Start ovs-vswitchd. AT_CHECK([ovs-vswitchd --detach --no-chdir --pidfile --log-file -vvconn -vofproto_dpif -vunixctl], [0], [stdout], [stderr]) diff --git a/tests/system-dpdk.at b/tests/system-dpdk.at index cb6c6d59075..0f58e857422 100644 --- a/tests/system-dpdk.at +++ b/tests/system-dpdk.at @@ -32,7 +32,7 @@ dnl Check if EAL init is successful AT_SETUP([OVS-DPDK - EAL init]) AT_KEYWORDS([dpdk]) OVS_DPDK_PRE_CHECK() -OVS_DPDK_START() +OVS_DPDK_START([--no-pci]) AT_CHECK([grep "DPDK Enabled - initializing..." 
ovs-vswitchd.log], [], [stdout]) AT_CHECK([grep "EAL" ovs-vswitchd.log], [], [stdout]) AT_CHECK([grep "DPDK Enabled - initialized" ovs-vswitchd.log], [], [stdout]) @@ -69,7 +69,7 @@ dnl Add vhost-user-client port AT_SETUP([OVS-DPDK - add vhost-user-client port]) AT_KEYWORDS([dpdk]) OVS_DPDK_PRE_CHECK() -OVS_DPDK_START() +OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) @@ -98,7 +98,7 @@ AT_SETUP([OVS-DPDK - ping vhost-user ports]) AT_KEYWORDS([dpdk]) OVS_DPDK_PRE_CHECK() AT_SKIP_IF([! which dpdk-testpmd >/dev/null 2>/dev/null]) -OVS_DPDK_START() +OVS_DPDK_START([--no-pci]) dnl Find number of sockets AT_CHECK([lscpu], [], [stdout]) @@ -174,7 +174,7 @@ AT_SETUP([OVS-DPDK - ping vhost-user-client ports]) AT_KEYWORDS([dpdk]) OVS_DPDK_PRE_CHECK() AT_SKIP_IF([! which dpdk-testpmd >/dev/null 2>/dev/null]) -OVS_DPDK_START() +OVS_DPDK_START([--no-pci]) dnl Find number of sockets AT_CHECK([lscpu], [], [stdout]) @@ -309,7 +309,7 @@ AT_SETUP([OVS-DPDK - Ingress policing create delete vport port]) AT_KEYWORDS([dpdk]) OVS_DPDK_PRE_CHECK() -OVS_DPDK_START() +OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS and add ingress policer AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) @@ -352,7 +352,7 @@ AT_SETUP([OVS-DPDK - Ingress policing no policing rate]) AT_KEYWORDS([dpdk]) OVS_DPDK_PRE_CHECK() -OVS_DPDK_START() +OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS and add ingress policer AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) @@ -393,7 +393,7 @@ AT_SETUP([OVS-DPDK - Ingress policing no policing burst]) AT_KEYWORDS([dpdk]) OVS_DPDK_PRE_CHECK() -OVS_DPDK_START() +OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS and add ingress policer AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) @@ -465,7 +465,7 @@ AT_SETUP([OVS-DPDK - QoS create 
delete vport port]) AT_KEYWORDS([dpdk]) OVS_DPDK_PRE_CHECK() -OVS_DPDK_START() +OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS and add egress policer AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) @@ -506,7 +506,7 @@ AT_SETUP([OVS-DPDK - QoS no cir]) AT_KEYWORDS([dpdk]) OVS_DPDK_PRE_CHECK() -OVS_DPDK_START() +OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS and add egress policer AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) @@ -541,7 +541,7 @@ AT_SETUP([OVS-DPDK - QoS no cbs]) AT_KEYWORDS([dpdk]) OVS_DPDK_PRE_CHECK() -OVS_DPDK_START() +OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS and add egress policer AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) @@ -661,7 +661,7 @@ AT_KEYWORDS([dpdk]) AT_SKIP_IF([! which dpdk-testpmd >/dev/null 2>/dev/null]) OVS_DPDK_PRE_CHECK() -OVS_DPDK_START() +OVS_DPDK_START([--no-pci]) dnl Find number of sockets AT_CHECK([lscpu], [], [stdout]) @@ -717,7 +717,7 @@ AT_KEYWORDS([dpdk]) AT_SKIP_IF([! which dpdk-testpmd >/dev/null 2>/dev/null]) OVS_DPDK_PRE_CHECK() -OVS_DPDK_START() +OVS_DPDK_START([--no-pci]) dnl Find number of sockets AT_CHECK([lscpu], [], [stdout]) @@ -856,7 +856,7 @@ AT_KEYWORDS([dpdk]) AT_SKIP_IF([! which dpdk-testpmd >/dev/null 2>/dev/null]) OVS_DPDK_PRE_CHECK() -OVS_DPDK_START() +OVS_DPDK_START([--no-pci]) dnl Find number of sockets AT_CHECK([lscpu], [], [stdout]) @@ -908,7 +908,7 @@ AT_KEYWORDS([dpdk]) AT_SKIP_IF([! which dpdk-testpmd >/dev/null 2>/dev/null]) OVS_DPDK_PRE_CHECK() -OVS_DPDK_START() +OVS_DPDK_START([--no-pci]) dnl Find number of sockets AT_CHECK([lscpu], [], [stdout]) @@ -963,7 +963,7 @@ dnl MFEX Autovalidator AT_SETUP([OVS-DPDK - MFEX Autovalidator]) AT_KEYWORDS([dpdk]) OVS_DPDK_PRE_CHECK() -OVS_DPDK_START() +OVS_DPDK_START([--no-pci]) AT_CHECK([ovs-vsctl add-br br0 -- set bridge br0 datapath_type=netdev]) AT_SKIP_IF([! 
ovs-appctl dpif-netdev/miniflow-parser-get | sed 1,4d | grep "True"], [], [dnl ]) @@ -996,7 +996,7 @@ dnl MFEX Autovalidator Fuzzy AT_SETUP([OVS-DPDK - MFEX Autovalidator Fuzzy]) AT_KEYWORDS([dpdk]) OVS_DPDK_PRE_CHECK() -OVS_DPDK_START() +OVS_DPDK_START([--no-pci]) AT_CHECK([ovs-vsctl add-br br0 -- set bridge br0 datapath_type=netdev]) AT_SKIP_IF([! ovs-appctl dpif-netdev/miniflow-parser-get | sed 1,4d | grep "True"], [], [dnl ]) @@ -1032,7 +1032,7 @@ AT_KEYWORDS([dpdk]) OVS_DPDK_PRE_CHECK() AT_SKIP_IF([! $PYTHON3 -c "import scapy"], [], []) AT_CHECK([$PYTHON3 $srcdir/mfex_fuzzy.py test_traffic.pcap 1], [], [stdout]) -OVS_DPDK_START() +OVS_DPDK_START([--no-pci]) AT_CHECK([ovs-vsctl --no-wait set Open_vSwitch . other_config:pmd-cpu-mask=0x1]) dnl Add userspace bridge and attach it to OVS AT_CHECK([ovs-vsctl add-br br0 -- set bridge br0 datapath_type=netdev]) @@ -1153,7 +1153,7 @@ AT_SETUP([OVS-DPDK - user configured mempool]) AT_KEYWORDS([dpdk]) OVS_DPDK_PRE_CHECK() OVS_DPDK_START_OVSDB() -OVS_DPDK_START_VSWITCHD() +OVS_DPDK_START_VSWITCHD([--no-pci]) AT_CHECK([ovs-vsctl --no-wait set Open_vSwitch . other_config:shared-mempool-config=8000,6000,1500]) AT_CHECK([ovs-vsctl --no-wait set Open_vSwitch . other_config:dpdk-init=true]) From be6f096fbe5e75b1969d10c1b3813499fbc32d9a Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 19 May 2023 22:05:37 +0200 Subject: [PATCH 248/833] netdev-vport: Fix unsafe handling of GRE sequence number. GRE sequence number is maintained as part of the tunnel config. This triggers tunnel reconfiguration every time set_tunnel_config() is called, because memset over tunnel config will never be equal to the new config constructed from database options. And the sequence number is incremented non-atomically without holding a mutex on tunnel push, which may lead to corruption if multiple threads are sending packets to the same tunnel. Fix that by moving sequence number to the netdev_vport structure instead and using an atomic counter.
Fixes: 0ffff4975308 ("userspace: add gre sequence number support.") Fixes: 7dc18ae96d33 ("userspace: add erspan tunnel support.") Fixes: 3c6d05a02e0f ("userspace: Add GTP-U support.") Reviewed-by: Simon Horman Signed-off-by: Ilya Maximets --- lib/netdev-native-tnl.c | 14 ++++---------- lib/netdev-vport-private.h | 4 ++++ lib/netdev-vport.c | 2 ++ lib/netdev.h | 1 - 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c index 9abdf51076a..e31d61dd5ec 100644 --- a/lib/netdev-native-tnl.c +++ b/lib/netdev-native-tnl.c @@ -452,7 +452,6 @@ netdev_gre_push_header(const struct netdev *netdev, const struct ovs_action_push_tnl *data) { struct netdev_vport *dev = netdev_vport_cast(netdev); - struct netdev_tunnel_config *tnl_cfg; struct gre_base_hdr *greh; int ip_tot_size; @@ -468,8 +467,7 @@ netdev_gre_push_header(const struct netdev *netdev, int seq_ofs = gre_header_len(greh->flags) - 4; ovs_16aligned_be32 *seq_opt = ALIGNED_CAST(ovs_16aligned_be32 *, (char *)greh + seq_ofs); - tnl_cfg = &dev->tnl_cfg; - put_16aligned_be32(seq_opt, htonl(tnl_cfg->seqno++)); + put_16aligned_be32(seq_opt, htonl(atomic_count_inc(&dev->gre_seqno))); } } @@ -605,7 +603,6 @@ netdev_erspan_push_header(const struct netdev *netdev, const struct ovs_action_push_tnl *data) { struct netdev_vport *dev = netdev_vport_cast(netdev); - struct netdev_tunnel_config *tnl_cfg; struct erspan_base_hdr *ersh; struct gre_base_hdr *greh; struct erspan_md2 *md2; @@ -615,9 +612,8 @@ netdev_erspan_push_header(const struct netdev *netdev, data->header_len, &ip_tot_size); /* update GRE seqno */ - tnl_cfg = &dev->tnl_cfg; ovs_16aligned_be32 *seqno = (ovs_16aligned_be32 *) (greh + 1); - put_16aligned_be32(seqno, htonl(tnl_cfg->seqno++)); + put_16aligned_be32(seqno, htonl(atomic_count_inc(&dev->gre_seqno))); /* update v2 timestamp */ if (greh->protocol == htons(ETH_TYPE_ERSPAN2)) { @@ -786,7 +782,6 @@ netdev_gtpu_push_header(const struct netdev *netdev, const struct 
ovs_action_push_tnl *data) { struct netdev_vport *dev = netdev_vport_cast(netdev); - struct netdev_tunnel_config *tnl_cfg; struct udp_header *udp; struct gtpuhdr *gtpuh; int ip_tot_size; @@ -801,10 +796,9 @@ netdev_gtpu_push_header(const struct netdev *netdev, gtpuh = ALIGNED_CAST(struct gtpuhdr *, udp + 1); - tnl_cfg = &dev->tnl_cfg; - if (tnl_cfg->set_seq) { + if (gtpuh->md.flags & GTPU_S_MASK) { ovs_be16 *seqno = ALIGNED_CAST(ovs_be16 *, gtpuh + 1); - *seqno = htons(tnl_cfg->seqno++); + *seqno = htons(atomic_count_inc(&dev->gre_seqno)); payload_len += sizeof(struct gtpuhdr_opt); } gtpuh->len = htons(payload_len); diff --git a/lib/netdev-vport-private.h b/lib/netdev-vport-private.h index d89a28c66c6..e3c3bdb4348 100644 --- a/lib/netdev-vport-private.h +++ b/lib/netdev-vport-private.h @@ -22,11 +22,15 @@ #include "compiler.h" #include "netdev.h" #include "netdev-provider.h" +#include "ovs-atomic.h" #include "ovs-thread.h" struct netdev_vport { struct netdev up; + /* Sequence number for outgoing GRE packets. */ + atomic_count gre_seqno; + /* Protects all members below. 
*/ struct ovs_mutex mutex; diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c index 663ee8606c3..6bbaa2feb99 100644 --- a/lib/netdev-vport.c +++ b/lib/netdev-vport.c @@ -37,6 +37,7 @@ #include "netdev-provider.h" #include "netdev-vport-private.h" #include "openvswitch/dynamic-string.h" +#include "ovs-atomic.h" #include "ovs-router.h" #include "packets.h" #include "openvswitch/poll-loop.h" @@ -198,6 +199,7 @@ netdev_vport_construct(struct netdev *netdev_) uint16_t port = 0; ovs_mutex_init(&dev->mutex); + atomic_count_init(&dev->gre_seqno, 0); eth_addr_random(&dev->etheraddr); if (name && dpif_port && (strlen(name) > strlen(dpif_port) + 1) && diff --git a/lib/netdev.h b/lib/netdev.h index ff207f56c28..1fab9127374 100644 --- a/lib/netdev.h +++ b/lib/netdev.h @@ -130,7 +130,6 @@ struct netdev_tunnel_config { enum netdev_pt_mode pt_mode; bool set_seq; - uint32_t seqno; uint32_t erspan_idx; uint8_t erspan_ver; uint8_t erspan_dir; From 0c4b299ebb2b455064ba9199aedefd3b99e0040b Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 19 May 2023 23:45:30 +0200 Subject: [PATCH 249/833] smap: Make argument of smap_add_ipv6 constant. The address is not getting modified inside. Reviewed-by: Simon Horman Signed-off-by: Ilya Maximets --- lib/smap.c | 2 +- lib/smap.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/smap.c b/lib/smap.c index c1633e2a18d..47fb3450201 100644 --- a/lib/smap.c +++ b/lib/smap.c @@ -100,7 +100,7 @@ smap_add_format(struct smap *smap, const char *key, const char *format, ...) /* Adds 'key' paired with a string representation of 'addr'. It is the * caller's responsibility to avoid duplicate keys if desirable. 
*/ void -smap_add_ipv6(struct smap *smap, const char *key, struct in6_addr *addr) +smap_add_ipv6(struct smap *smap, const char *key, const struct in6_addr *addr) { char buf[INET6_ADDRSTRLEN]; ipv6_string_mapped(buf, addr); diff --git a/lib/smap.h b/lib/smap.h index 2fe6c540a71..d1d2ae6f20a 100644 --- a/lib/smap.h +++ b/lib/smap.h @@ -100,7 +100,7 @@ struct smap_node *smap_add_nocopy(struct smap *, char *, char *); bool smap_add_once(struct smap *, const char *, const char *); void smap_add_format(struct smap *, const char *key, const char *, ...) OVS_PRINTF_FORMAT(3, 4); -void smap_add_ipv6(struct smap *, const char *, struct in6_addr *); +void smap_add_ipv6(struct smap *, const char *, const struct in6_addr *); void smap_replace(struct smap *, const char *, const char *); void smap_replace_nocopy(struct smap *, const char *, char *); From ce8828a37250feed1f8d6e23b33b936dc6a09b4e Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Sat, 20 May 2023 01:35:26 +0200 Subject: [PATCH 250/833] netdev-vport: RCU-fy tunnel config. Tunnel config can be accessed by multiple threads at the same time and it is supposed to be protected by the netdev_vport mutex. However, many functions are getting direct access to it via netdev API without taking the mutex, creating a potential for various race conditions. Fix that by protecting the tunnel config with RCU. The whole structure is replaced on configuration changes. Individual fields are never updated and the structure itself is constant. This way it can be safely used by different threads within RCU grace period. 
Reviewed-by: Simon Horman Signed-off-by: Ilya Maximets --- lib/netdev-native-tnl.c | 84 +++++------------ lib/netdev-vport-private.h | 3 +- lib/netdev-vport.c | 189 +++++++++++++++++++++---------------- lib/netdev.h | 3 + 4 files changed, 134 insertions(+), 145 deletions(-) diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c index e31d61dd5ec..53dde61f148 100644 --- a/lib/netdev-native-tnl.c +++ b/lib/netdev-native-tnl.c @@ -320,7 +320,7 @@ netdev_tnl_ip_build_header(struct ovs_action_push_tnl *data, } static void * -udp_build_header(struct netdev_tunnel_config *tnl_cfg, +udp_build_header(const struct netdev_tunnel_config *tnl_cfg, struct ovs_action_push_tnl *data, const struct netdev_tnl_build_header_params *params) { @@ -476,16 +476,11 @@ netdev_gre_build_header(const struct netdev *netdev, struct ovs_action_push_tnl *data, const struct netdev_tnl_build_header_params *params) { - struct netdev_vport *dev = netdev_vport_cast(netdev); - struct netdev_tunnel_config *tnl_cfg; + const struct netdev_tunnel_config *tnl_cfg; struct gre_base_hdr *greh; ovs_16aligned_be32 *options; unsigned int hlen; - /* XXX: RCUfy tnl_cfg. 
*/ - ovs_mutex_lock(&dev->mutex); - tnl_cfg = &dev->tnl_cfg; - greh = netdev_tnl_ip_build_header(data, params, IPPROTO_GRE); if (params->flow->packet_type == htonl(PT_ETH)) { @@ -493,8 +488,7 @@ netdev_gre_build_header(const struct netdev *netdev, } else if (pt_ns(params->flow->packet_type) == OFPHTN_ETHERTYPE) { greh->protocol = pt_ns_type_be(params->flow->packet_type); } else { - ovs_mutex_unlock(&dev->mutex); - return 1; + return EINVAL; } greh->flags = 0; @@ -505,6 +499,8 @@ netdev_gre_build_header(const struct netdev *netdev, options++; } + tnl_cfg = netdev_get_tunnel_config(netdev); + if (tnl_cfg->out_key_present) { greh->flags |= htons(GRE_KEY); put_16aligned_be32(options, be64_to_be32(params->flow->tunnel.tun_id)); @@ -517,8 +513,6 @@ netdev_gre_build_header(const struct netdev *netdev, options++; } - ovs_mutex_unlock(&dev->mutex); - hlen = (uint8_t *) options - (uint8_t *) greh; data->header_len += hlen; @@ -628,8 +622,7 @@ netdev_erspan_build_header(const struct netdev *netdev, struct ovs_action_push_tnl *data, const struct netdev_tnl_build_header_params *params) { - struct netdev_vport *dev = netdev_vport_cast(netdev); - struct netdev_tunnel_config *tnl_cfg; + const struct netdev_tunnel_config *tnl_cfg; struct gre_base_hdr *greh; struct erspan_base_hdr *ersh; unsigned int hlen; @@ -637,21 +630,19 @@ netdev_erspan_build_header(const struct netdev *netdev, int erspan_ver; uint16_t sid; - /* XXX: RCUfy tnl_cfg. 
*/ - ovs_mutex_lock(&dev->mutex); - tnl_cfg = &dev->tnl_cfg; greh = netdev_tnl_ip_build_header(data, params, IPPROTO_GRE); ersh = ERSPAN_HDR(greh); tun_id = ntohl(be64_to_be32(params->flow->tunnel.tun_id)); /* ERSPAN only has 10-bit session ID */ if (tun_id & ~ERSPAN_SID_MASK) { - ovs_mutex_unlock(&dev->mutex); - return 1; + return EINVAL; } else { sid = (uint16_t) tun_id; } + tnl_cfg = netdev_get_tunnel_config(netdev); + if (tnl_cfg->erspan_ver_flow) { erspan_ver = params->flow->tunnel.erspan_ver; } else { @@ -698,12 +689,9 @@ netdev_erspan_build_header(const struct netdev *netdev, hlen = ERSPAN_GREHDR_LEN + sizeof *ersh + ERSPAN_V2_MDSIZE; } else { VLOG_WARN_RL(&err_rl, "ERSPAN version error %d", tnl_cfg->erspan_ver); - ovs_mutex_unlock(&dev->mutex); - return 1; + return EINVAL; } - ovs_mutex_unlock(&dev->mutex); - data->header_len += hlen; if (params->is_ipv6) { @@ -809,13 +797,12 @@ netdev_gtpu_build_header(const struct netdev *netdev, struct ovs_action_push_tnl *data, const struct netdev_tnl_build_header_params *params) { - struct netdev_vport *dev = netdev_vport_cast(netdev); - struct netdev_tunnel_config *tnl_cfg; + const struct netdev_tunnel_config *tnl_cfg; struct gtpuhdr *gtph; unsigned int gtpu_hlen; - ovs_mutex_lock(&dev->mutex); - tnl_cfg = &dev->tnl_cfg; + tnl_cfg = netdev_get_tunnel_config(netdev); + gtph = udp_build_header(tnl_cfg, data, params); /* Set to default if not set in flow. 
*/ @@ -831,7 +818,6 @@ netdev_gtpu_build_header(const struct netdev *netdev, gtph->md.flags |= GTPU_S_MASK; gtpu_hlen += sizeof(struct gtpuhdr_opt); } - ovs_mutex_unlock(&dev->mutex); data->header_len += gtpu_hlen; data->tnl_type = OVS_VPORT_TYPE_GTPU; @@ -844,19 +830,15 @@ netdev_srv6_build_header(const struct netdev *netdev, struct ovs_action_push_tnl *data, const struct netdev_tnl_build_header_params *params) { - struct netdev_vport *dev = netdev_vport_cast(netdev); - struct netdev_tunnel_config *tnl_cfg; + const struct netdev_tunnel_config *tnl_cfg; const struct in6_addr *segs; struct srv6_base_hdr *srh; struct in6_addr *s; ovs_be16 dl_type; - int err = 0; int nr_segs; int i; - ovs_mutex_lock(&dev->mutex); - tnl_cfg = &dev->tnl_cfg; - + tnl_cfg = netdev_get_tunnel_config(netdev); if (tnl_cfg->srv6_num_segs) { nr_segs = tnl_cfg->srv6_num_segs; segs = tnl_cfg->srv6_segs; @@ -870,8 +852,7 @@ netdev_srv6_build_header(const struct netdev *netdev, } if (!ipv6_addr_equals(&segs[0], ¶ms->flow->tunnel.ipv6_dst)) { - err = EINVAL; - goto out; + return EINVAL; } srh = netdev_tnl_ip_build_header(data, params, IPPROTO_ROUTING); @@ -888,8 +869,7 @@ netdev_srv6_build_header(const struct netdev *netdev, } else if (dl_type == htons(ETH_TYPE_IPV6)) { srh->rt_hdr.nexthdr = IPPROTO_IPV6; } else { - err = EOPNOTSUPP; - goto out; + return EOPNOTSUPP; } s = ALIGNED_CAST(struct in6_addr *, @@ -902,10 +882,8 @@ netdev_srv6_build_header(const struct netdev *netdev, data->header_len += sizeof *srh + 8 * srh->rt_hdr.hdrlen; data->tnl_type = OVS_VPORT_TYPE_SRV6; -out: - ovs_mutex_unlock(&dev->mutex); - return err; + return 0; } void @@ -1044,13 +1022,10 @@ netdev_vxlan_build_header(const struct netdev *netdev, struct ovs_action_push_tnl *data, const struct netdev_tnl_build_header_params *params) { - struct netdev_vport *dev = netdev_vport_cast(netdev); - struct netdev_tunnel_config *tnl_cfg; + const struct netdev_tunnel_config *tnl_cfg; struct vxlanhdr *vxh; - /* XXX: RCUfy tnl_cfg. 
*/ - ovs_mutex_lock(&dev->mutex); - tnl_cfg = &dev->tnl_cfg; + tnl_cfg = netdev_get_tunnel_config(netdev); vxh = udp_build_header(tnl_cfg, data, params); @@ -1075,10 +1050,10 @@ netdev_vxlan_build_header(const struct netdev *netdev, vxh->vx_gpe.next_protocol = VXLAN_GPE_NP_ETHERNET; break; default: - goto drop; + return EINVAL; } } else { - goto drop; + return EINVAL; } } else { put_16aligned_be32(&vxh->vx_flags, htonl(VXLAN_FLAGS)); @@ -1086,14 +1061,9 @@ netdev_vxlan_build_header(const struct netdev *netdev, htonl(ntohll(params->flow->tunnel.tun_id) << 8)); } - ovs_mutex_unlock(&dev->mutex); data->header_len += sizeof *vxh; data->tnl_type = OVS_VPORT_TYPE_VXLAN; return 0; - -drop: - ovs_mutex_unlock(&dev->mutex); - return 1; } struct dp_packet * @@ -1157,22 +1127,14 @@ netdev_geneve_build_header(const struct netdev *netdev, struct ovs_action_push_tnl *data, const struct netdev_tnl_build_header_params *params) { - struct netdev_vport *dev = netdev_vport_cast(netdev); - struct netdev_tunnel_config *tnl_cfg; struct genevehdr *gnh; int opt_len; bool crit_opt; - /* XXX: RCUfy tnl_cfg. */ - ovs_mutex_lock(&dev->mutex); - tnl_cfg = &dev->tnl_cfg; - - gnh = udp_build_header(tnl_cfg, data, params); + gnh = udp_build_header(netdev_get_tunnel_config(netdev), data, params); put_16aligned_be32(&gnh->vni, htonl(ntohll(params->flow->tunnel.tun_id) << 8)); - ovs_mutex_unlock(&dev->mutex); - opt_len = tun_metadata_to_geneve_header(¶ms->flow->tunnel, gnh->options, &crit_opt); diff --git a/lib/netdev-vport-private.h b/lib/netdev-vport-private.h index e3c3bdb4348..586231057c6 100644 --- a/lib/netdev-vport-private.h +++ b/lib/netdev-vport-private.h @@ -28,6 +28,8 @@ struct netdev_vport { struct netdev up; + OVSRCU_TYPE(const struct netdev_tunnel_config *) tnl_cfg; + /* Sequence number for outgoing GRE packets. */ atomic_count gre_seqno; @@ -38,7 +40,6 @@ struct netdev_vport { struct netdev_stats stats; /* Tunnels. 
*/ - struct netdev_tunnel_config tnl_cfg; char egress_iface[IFNAMSIZ]; bool carrier_status; diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c index 6bbaa2feb99..480117a14b8 100644 --- a/lib/netdev-vport.c +++ b/lib/netdev-vport.c @@ -69,8 +69,8 @@ static int get_patch_config(const struct netdev *netdev, struct smap *args); static int get_tunnel_config(const struct netdev *, struct smap *args); static bool tunnel_check_status_change__(struct netdev_vport *); static void update_vxlan_global_cfg(struct netdev *, - struct netdev_tunnel_config *, - struct netdev_tunnel_config *); + const struct netdev_tunnel_config *, + const struct netdev_tunnel_config *); struct vport_class { const char *dpif_port; @@ -91,10 +91,16 @@ vport_class_cast(const struct netdev_class *class) return CONTAINER_OF(class, struct vport_class, netdev_class); } +static const struct netdev_tunnel_config * +vport_tunnel_config(struct netdev_vport *netdev) +{ + return ovsrcu_get(const struct netdev_tunnel_config *, &netdev->tnl_cfg); +} + static const struct netdev_tunnel_config * get_netdev_tunnel_config(const struct netdev *netdev) { - return &netdev_vport_cast(netdev)->tnl_cfg; + return vport_tunnel_config(netdev_vport_cast(netdev)); } bool @@ -135,8 +141,6 @@ netdev_vport_get_dpif_port(const struct netdev *netdev, } if (netdev_vport_needs_dst_port(netdev)) { - const struct netdev_vport *vport = netdev_vport_cast(netdev); - /* * Note: IFNAMSIZ is 16 bytes long. 
Implementations should choose * a dpif port name that is short enough to fit including any @@ -145,7 +149,7 @@ netdev_vport_get_dpif_port(const struct netdev *netdev, BUILD_ASSERT(NETDEV_VPORT_NAME_BUFSIZE >= IFNAMSIZ); ovs_assert(strlen(dpif_port) + 6 < IFNAMSIZ); snprintf(namebuf, bufsize, "%s_%d", dpif_port, - ntohs(vport->tnl_cfg.dst_port)); + ntohs(netdev_get_tunnel_config(netdev)->dst_port)); return namebuf; } else { return dpif_port; @@ -163,12 +167,14 @@ netdev_vport_route_changed(void) vports = netdev_get_vports(&n_vports); for (i = 0; i < n_vports; i++) { + const struct netdev_tunnel_config *tnl_cfg; struct netdev *netdev_ = vports[i]; struct netdev_vport *netdev = netdev_vport_cast(netdev_); ovs_mutex_lock(&netdev->mutex); /* Finds all tunnel vports. */ - if (ipv6_addr_is_set(&netdev->tnl_cfg.ipv6_dst)) { + tnl_cfg = netdev_get_tunnel_config(netdev_); + if (tnl_cfg && ipv6_addr_is_set(&tnl_cfg->ipv6_dst)) { if (tunnel_check_status_change__(netdev)) { netdev_change_seq_changed(netdev_); } @@ -208,26 +214,31 @@ netdev_vport_construct(struct netdev *netdev_) port = atoi(p); } + struct netdev_tunnel_config *tnl_cfg = xzalloc(sizeof *tnl_cfg); + /* If a destination port for tunnel ports is specified in the netdev * name, use it instead of the default one. Otherwise, use the default * destination port */ if (!strcmp(type, "geneve")) { - dev->tnl_cfg.dst_port = port ? htons(port) : htons(GENEVE_DST_PORT); + tnl_cfg->dst_port = port ? htons(port) : htons(GENEVE_DST_PORT); } else if (!strcmp(type, "vxlan")) { - dev->tnl_cfg.dst_port = port ? htons(port) : htons(VXLAN_DST_PORT); - update_vxlan_global_cfg(netdev_, NULL, &dev->tnl_cfg); + tnl_cfg->dst_port = port ? htons(port) : htons(VXLAN_DST_PORT); + update_vxlan_global_cfg(netdev_, NULL, tnl_cfg); } else if (!strcmp(type, "lisp")) { - dev->tnl_cfg.dst_port = port ? htons(port) : htons(LISP_DST_PORT); + tnl_cfg->dst_port = port ? 
htons(port) : htons(LISP_DST_PORT); } else if (!strcmp(type, "stt")) { - dev->tnl_cfg.dst_port = port ? htons(port) : htons(STT_DST_PORT); + tnl_cfg->dst_port = port ? htons(port) : htons(STT_DST_PORT); } else if (!strcmp(type, "gtpu")) { - dev->tnl_cfg.dst_port = port ? htons(port) : htons(GTPU_DST_PORT); + tnl_cfg->dst_port = port ? htons(port) : htons(GTPU_DST_PORT); } else if (!strcmp(type, "bareudp")) { - dev->tnl_cfg.dst_port = htons(port); + tnl_cfg->dst_port = htons(port); } - dev->tnl_cfg.dont_fragment = true; - dev->tnl_cfg.ttl = DEFAULT_TTL; + tnl_cfg->dont_fragment = true; + tnl_cfg->ttl = DEFAULT_TTL; + + ovsrcu_set(&dev->tnl_cfg, tnl_cfg); + return 0; } @@ -235,12 +246,15 @@ static void netdev_vport_destruct(struct netdev *netdev_) { struct netdev_vport *netdev = netdev_vport_cast(netdev_); + const struct netdev_tunnel_config *tnl_cfg = vport_tunnel_config(netdev); const char *type = netdev_get_type(netdev_); if (!strcmp(type, "vxlan")) { - update_vxlan_global_cfg(netdev_, &netdev->tnl_cfg, NULL); + update_vxlan_global_cfg(netdev_, tnl_cfg, NULL); } + ovsrcu_set(&netdev->tnl_cfg, NULL); + ovsrcu_postpone(free, CONST_CAST(struct netdev_tunnel_config *, tnl_cfg)); free(netdev->peer); ovs_mutex_destroy(&netdev->mutex); } @@ -283,15 +297,16 @@ static bool tunnel_check_status_change__(struct netdev_vport *netdev) OVS_REQUIRES(netdev->mutex) { + const struct netdev_tunnel_config *tnl_cfg = vport_tunnel_config(netdev); + const struct in6_addr *route; char iface[IFNAMSIZ]; bool status = false; - struct in6_addr *route; struct in6_addr gw; uint32_t mark; iface[0] = '\0'; - route = &netdev->tnl_cfg.ipv6_dst; - mark = netdev->tnl_cfg.egress_pkt_mark; + route = &tnl_cfg->ipv6_dst; + mark = tnl_cfg->egress_pkt_mark; if (ovs_router_lookup(mark, route, iface, NULL, &gw)) { struct netdev *egress_netdev; @@ -498,8 +513,8 @@ vxlan_get_port_ext_gbp_str(uint16_t port, bool gbp, static void update_vxlan_global_cfg(struct netdev *netdev, - struct netdev_tunnel_config 
*old_cfg, - struct netdev_tunnel_config *new_cfg) + const struct netdev_tunnel_config *old_cfg, + const struct netdev_tunnel_config *new_cfg) { unsigned int count; char namebuf[20]; @@ -543,19 +558,20 @@ static bool is_concomitant_vxlan_tunnel_present(struct netdev_vport *dev, const struct netdev_tunnel_config *tnl_cfg) { - char namebuf[20]; - const char *type = netdev_get_type(&dev->up); + const struct netdev_tunnel_config *dev_tnl_cfg = vport_tunnel_config(dev); struct vport_class *vclass = vport_class_cast(netdev_get_class(&dev->up)); + const char *type = netdev_get_type(&dev->up); + char namebuf[20]; if (strcmp(type, "vxlan")) { return false; } - if (dev->tnl_cfg.dst_port == tnl_cfg->dst_port && - (dev->tnl_cfg.exts & (1 << OVS_VXLAN_EXT_GBP)) == + if (dev_tnl_cfg->dst_port == tnl_cfg->dst_port && + (dev_tnl_cfg->exts & (1 << OVS_VXLAN_EXT_GBP)) == (tnl_cfg->exts & (1 << OVS_VXLAN_EXT_GBP))) { - if (ntohs(dev->tnl_cfg.dst_port) == VXLAN_DST_PORT) { + if (ntohs(dev_tnl_cfg->dst_port) == VXLAN_DST_PORT) { /* Special case where we kept the default port/gbp, only ok if the opposite of the default does not exits */ vxlan_get_port_ext_gbp_str(ntohs(tnl_cfg->dst_port), @@ -571,9 +587,9 @@ is_concomitant_vxlan_tunnel_present(struct netdev_vport *dev, } /* Same port: ok if no one is left with the previous configuration */ - if (dev->tnl_cfg.dst_port == tnl_cfg->dst_port) { - vxlan_get_port_ext_gbp_str(ntohs(dev->tnl_cfg.dst_port), - dev->tnl_cfg.exts & + if (dev_tnl_cfg->dst_port == tnl_cfg->dst_port) { + vxlan_get_port_ext_gbp_str(ntohs(dev_tnl_cfg->dst_port), + dev_tnl_cfg->exts & (1 << OVS_VXLAN_EXT_GBP), namebuf, sizeof(namebuf)); @@ -601,6 +617,7 @@ static int set_tunnel_config(struct netdev *dev_, const struct smap *args, char **errp) { struct netdev_vport *dev = netdev_vport_cast(dev_); + const struct netdev_tunnel_config *curr_tnl_cfg; const char *name = netdev_get_name(dev_); const char *type = netdev_get_type(dev_); struct ds errors = DS_EMPTY_INITIALIZER; @@ 
-902,11 +919,16 @@ set_tunnel_config(struct netdev *dev_, const struct smap *args, char **errp) err = EEXIST; goto out; } - update_vxlan_global_cfg(dev_, &dev->tnl_cfg, &tnl_cfg); ovs_mutex_lock(&dev->mutex); - if (memcmp(&dev->tnl_cfg, &tnl_cfg, sizeof tnl_cfg)) { - dev->tnl_cfg = tnl_cfg; + + curr_tnl_cfg = vport_tunnel_config(dev); + update_vxlan_global_cfg(dev_, curr_tnl_cfg, &tnl_cfg); + + if (memcmp(curr_tnl_cfg, &tnl_cfg, sizeof tnl_cfg)) { + ovsrcu_set(&dev->tnl_cfg, xmemdup(&tnl_cfg, sizeof tnl_cfg)); + ovsrcu_postpone(free, CONST_CAST(struct netdev_tunnel_config *, + curr_tnl_cfg)); tunnel_check_status_change__(dev); netdev_change_seq_changed(dev_); } @@ -931,61 +953,60 @@ set_tunnel_config(struct netdev *dev_, const struct smap *args, char **errp) static int get_tunnel_config(const struct netdev *dev, struct smap *args) { - struct netdev_vport *netdev = netdev_vport_cast(dev); + const struct netdev_tunnel_config *tnl_cfg = netdev_get_tunnel_config(dev); const char *type = netdev_get_type(dev); - struct netdev_tunnel_config tnl_cfg; - ovs_mutex_lock(&netdev->mutex); - tnl_cfg = netdev->tnl_cfg; - ovs_mutex_unlock(&netdev->mutex); + if (!tnl_cfg) { + return 0; + } - if (ipv6_addr_is_set(&tnl_cfg.ipv6_dst)) { - smap_add_ipv6(args, "remote_ip", &tnl_cfg.ipv6_dst); - } else if (tnl_cfg.ip_dst_flow) { + if (ipv6_addr_is_set(&tnl_cfg->ipv6_dst)) { + smap_add_ipv6(args, "remote_ip", &tnl_cfg->ipv6_dst); + } else if (tnl_cfg->ip_dst_flow) { smap_add(args, "remote_ip", "flow"); } - if (ipv6_addr_is_set(&tnl_cfg.ipv6_src)) { - smap_add_ipv6(args, "local_ip", &tnl_cfg.ipv6_src); - } else if (tnl_cfg.ip_src_flow) { + if (ipv6_addr_is_set(&tnl_cfg->ipv6_src)) { + smap_add_ipv6(args, "local_ip", &tnl_cfg->ipv6_src); + } else if (tnl_cfg->ip_src_flow) { smap_add(args, "local_ip", "flow"); } - if (tnl_cfg.in_key_flow && tnl_cfg.out_key_flow) { + if (tnl_cfg->in_key_flow && tnl_cfg->out_key_flow) { smap_add(args, "key", "flow"); - } else if (tnl_cfg.in_key_present && 
tnl_cfg.out_key_present - && tnl_cfg.in_key == tnl_cfg.out_key) { - smap_add_format(args, "key", "%"PRIu64, ntohll(tnl_cfg.in_key)); + } else if (tnl_cfg->in_key_present && tnl_cfg->out_key_present + && tnl_cfg->in_key == tnl_cfg->out_key) { + smap_add_format(args, "key", "%"PRIu64, ntohll(tnl_cfg->in_key)); } else { - if (tnl_cfg.in_key_flow) { + if (tnl_cfg->in_key_flow) { smap_add(args, "in_key", "flow"); - } else if (tnl_cfg.in_key_present) { + } else if (tnl_cfg->in_key_present) { smap_add_format(args, "in_key", "%"PRIu64, - ntohll(tnl_cfg.in_key)); + ntohll(tnl_cfg->in_key)); } - if (tnl_cfg.out_key_flow) { + if (tnl_cfg->out_key_flow) { smap_add(args, "out_key", "flow"); - } else if (tnl_cfg.out_key_present) { + } else if (tnl_cfg->out_key_present) { smap_add_format(args, "out_key", "%"PRIu64, - ntohll(tnl_cfg.out_key)); + ntohll(tnl_cfg->out_key)); } } - if (tnl_cfg.ttl_inherit) { + if (tnl_cfg->ttl_inherit) { smap_add(args, "ttl", "inherit"); - } else if (tnl_cfg.ttl != DEFAULT_TTL) { - smap_add_format(args, "ttl", "%"PRIu8, tnl_cfg.ttl); + } else if (tnl_cfg->ttl != DEFAULT_TTL) { + smap_add_format(args, "ttl", "%"PRIu8, tnl_cfg->ttl); } - if (tnl_cfg.tos_inherit) { + if (tnl_cfg->tos_inherit) { smap_add(args, "tos", "inherit"); - } else if (tnl_cfg.tos) { - smap_add_format(args, "tos", "0x%x", tnl_cfg.tos); + } else if (tnl_cfg->tos) { + smap_add_format(args, "tos", "0x%x", tnl_cfg->tos); } - if (tnl_cfg.dst_port) { - uint16_t dst_port = ntohs(tnl_cfg.dst_port); + if (tnl_cfg->dst_port) { + uint16_t dst_port = ntohs(tnl_cfg->dst_port); if ((!strcmp("geneve", type) && dst_port != GENEVE_DST_PORT) || (!strcmp("vxlan", type) && dst_port != VXLAN_DST_PORT) || @@ -997,33 +1018,33 @@ get_tunnel_config(const struct netdev *dev, struct smap *args) } } - if (tnl_cfg.csum) { + if (tnl_cfg->csum) { smap_add(args, "csum", "true"); } - if (tnl_cfg.set_seq) { + if (tnl_cfg->set_seq) { smap_add(args, "seq", "true"); } - enum tunnel_layers layers = 
tunnel_supported_layers(type, &tnl_cfg); - if (tnl_cfg.pt_mode != default_pt_mode(layers)) { + enum tunnel_layers layers = tunnel_supported_layers(type, tnl_cfg); + if (tnl_cfg->pt_mode != default_pt_mode(layers)) { smap_add(args, "packet_type", - tnl_cfg.pt_mode == NETDEV_PT_LEGACY_L2 ? "legacy_l2" - : tnl_cfg.pt_mode == NETDEV_PT_LEGACY_L3 ? "legacy_l3" + tnl_cfg->pt_mode == NETDEV_PT_LEGACY_L2 ? "legacy_l2" + : tnl_cfg->pt_mode == NETDEV_PT_LEGACY_L3 ? "legacy_l3" : "ptap"); } - if (!tnl_cfg.dont_fragment) { + if (!tnl_cfg->dont_fragment) { smap_add(args, "df_default", "false"); } - if (tnl_cfg.set_egress_pkt_mark) { + if (tnl_cfg->set_egress_pkt_mark) { smap_add_format(args, "egress_pkt_mark", - "%"PRIu32, tnl_cfg.egress_pkt_mark); + "%"PRIu32, tnl_cfg->egress_pkt_mark); } if (!strcmp("erspan", type) || !strcmp("ip6erspan", type)) { - if (tnl_cfg.erspan_ver_flow) { + if (tnl_cfg->erspan_ver_flow) { /* since version number is not determined, * assume print all other as flow */ @@ -1032,27 +1053,27 @@ get_tunnel_config(const struct netdev *dev, struct smap *args) smap_add(args, "erspan_dir", "flow"); smap_add(args, "erspan_hwid", "flow"); } else { - smap_add_format(args, "erspan_ver", "%d", tnl_cfg.erspan_ver); + smap_add_format(args, "erspan_ver", "%d", tnl_cfg->erspan_ver); - if (tnl_cfg.erspan_ver == 1) { - if (tnl_cfg.erspan_idx_flow) { + if (tnl_cfg->erspan_ver == 1) { + if (tnl_cfg->erspan_idx_flow) { smap_add(args, "erspan_idx", "flow"); } else { smap_add_format(args, "erspan_idx", "0x%x", - tnl_cfg.erspan_idx); + tnl_cfg->erspan_idx); } - } else if (tnl_cfg.erspan_ver == 2) { - if (tnl_cfg.erspan_dir_flow) { + } else if (tnl_cfg->erspan_ver == 2) { + if (tnl_cfg->erspan_dir_flow) { smap_add(args, "erspan_dir", "flow"); } else { smap_add_format(args, "erspan_dir", "%d", - tnl_cfg.erspan_dir); + tnl_cfg->erspan_dir); } - if (tnl_cfg.erspan_hwid_flow) { + if (tnl_cfg->erspan_hwid_flow) { smap_add(args, "erspan_hwid", "flow"); } else { smap_add_format(args, 
"erspan_hwid", "0x%x", - tnl_cfg.erspan_hwid); + tnl_cfg->erspan_hwid); } } } @@ -1182,9 +1203,11 @@ netdev_vport_get_stats(const struct netdev *netdev, struct netdev_stats *stats) static enum netdev_pt_mode netdev_vport_get_pt_mode(const struct netdev *netdev) { - struct netdev_vport *dev = netdev_vport_cast(netdev); + const struct netdev_tunnel_config *tnl_cfg; + + tnl_cfg = netdev_get_tunnel_config(netdev); - return dev->tnl_cfg.pt_mode; + return tnl_cfg ? tnl_cfg->pt_mode : NETDEV_PT_UNKNOWN; } diff --git a/lib/netdev.h b/lib/netdev.h index 1fab9127374..aaec9ded1af 100644 --- a/lib/netdev.h +++ b/lib/netdev.h @@ -72,6 +72,9 @@ struct sset; struct ovs_action_push_tnl; enum netdev_pt_mode { + /* Unknown mode. The netdev is not configured yet. */ + NETDEV_PT_UNKNOWN = 0, + /* The netdev is packet type aware. It can potentially carry any kind of * packet. This "modern" mode is appropriate for both netdevs that handle * only a single kind of packet (such as a virtual or physical Ethernet From eb8c19ebac76b18caf30427cb27c606d6c56c761 Mon Sep 17 00:00:00 2001 From: Nobuhiro MIKI Date: Tue, 23 May 2023 12:58:21 +0900 Subject: [PATCH 251/833] netdev-native-tnl: Add ipv6_label param in netdev_tnl_push_ip_header. For tunnels such as SRv6, some popular vendor appliances support IPv6 flowlabel based load balancing. In preparation for OVS to support it, this patch modifies the encapsulation to allow IPv6 flowlabel to be configured. Signed-off-by: Nobuhiro MIKI Signed-off-by: Ilya Maximets --- lib/netdev-native-tnl.c | 23 +++++++++++++---------- lib/netdev-native-tnl.h | 4 ++-- lib/packets.c | 2 +- lib/packets.h | 2 ++ 4 files changed, 18 insertions(+), 13 deletions(-) diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c index 53dde61f148..7c1f0a13b8d 100644 --- a/lib/netdev-native-tnl.c +++ b/lib/netdev-native-tnl.c @@ -146,8 +146,8 @@ netdev_tnl_ip_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl, * * Return pointer to the L4 header added to 'packet'. 
*/ void * -netdev_tnl_push_ip_header(struct dp_packet *packet, - const void *header, int size, int *ip_tot_size) +netdev_tnl_push_ip_header(struct dp_packet *packet, const void *header, + int size, int *ip_tot_size, ovs_be32 ipv6_label) { struct eth_header *eth; struct ip_header *ip; @@ -166,6 +166,7 @@ netdev_tnl_push_ip_header(struct dp_packet *packet, ip6 = netdev_tnl_ipv6_hdr(eth); *ip_tot_size -= IPV6_HEADER_LEN; ip6->ip6_plen = htons(*ip_tot_size); + packet_set_ipv6_flow_label(&ip6->ip6_flow, ipv6_label); packet->l4_ofs = dp_packet_size(packet) - *ip_tot_size; return ip6 + 1; } else { @@ -245,7 +246,8 @@ netdev_tnl_push_udp_header(const struct netdev *netdev OVS_UNUSED, struct udp_header *udp; int ip_tot_size; - udp = netdev_tnl_push_ip_header(packet, data->header, data->header_len, &ip_tot_size); + udp = netdev_tnl_push_ip_header(packet, data->header, data->header_len, + &ip_tot_size, 0); /* set udp src port */ udp->udp_src = netdev_tnl_get_src_port(packet); @@ -455,7 +457,8 @@ netdev_gre_push_header(const struct netdev *netdev, struct gre_base_hdr *greh; int ip_tot_size; - greh = netdev_tnl_push_ip_header(packet, data->header, data->header_len, &ip_tot_size); + greh = netdev_tnl_push_ip_header(packet, data->header, data->header_len, + &ip_tot_size, 0); if (greh->flags & htons(GRE_CSUM)) { ovs_be16 *csum_opt = (ovs_be16 *) (greh + 1); @@ -602,8 +605,8 @@ netdev_erspan_push_header(const struct netdev *netdev, struct erspan_md2 *md2; int ip_tot_size; - greh = netdev_tnl_push_ip_header(packet, data->header, - data->header_len, &ip_tot_size); + greh = netdev_tnl_push_ip_header(packet, data->header, data->header_len, + &ip_tot_size, 0); /* update GRE seqno */ ovs_16aligned_be32 *seqno = (ovs_16aligned_be32 *) (greh + 1); @@ -776,8 +779,8 @@ netdev_gtpu_push_header(const struct netdev *netdev, unsigned int payload_len; payload_len = dp_packet_size(packet); - udp = netdev_tnl_push_ip_header(packet, data->header, - data->header_len, &ip_tot_size); + udp = 
netdev_tnl_push_ip_header(packet, data->header, data->header_len, + &ip_tot_size, 0); udp->udp_src = netdev_tnl_get_src_port(packet); udp->udp_len = htons(ip_tot_size); netdev_tnl_calc_udp_csum(udp, packet, ip_tot_size); @@ -893,8 +896,8 @@ netdev_srv6_push_header(const struct netdev *netdev OVS_UNUSED, { int ip_tot_size; - netdev_tnl_push_ip_header(packet, data->header, - data->header_len, &ip_tot_size); + netdev_tnl_push_ip_header(packet, data->header, data->header_len, + &ip_tot_size, 0); } struct dp_packet * diff --git a/lib/netdev-native-tnl.h b/lib/netdev-native-tnl.h index 4dad8f978cc..3311d796ed8 100644 --- a/lib/netdev-native-tnl.h +++ b/lib/netdev-native-tnl.h @@ -138,8 +138,8 @@ void * netdev_tnl_ip_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl, unsigned int *hlen); void * -netdev_tnl_push_ip_header(struct dp_packet *packet, - const void *header, int size, int *ip_tot_size); +netdev_tnl_push_ip_header(struct dp_packet *packet, const void *header, + int size, int *ip_tot_size, ovs_be32 ipv6_label); void netdev_tnl_egress_port_range(struct unixctl_conn *conn, int argc, const char *argv[], void *aux OVS_UNUSED); diff --git a/lib/packets.c b/lib/packets.c index 06f516cb1af..7e5a52fd40e 100644 --- a/lib/packets.c +++ b/lib/packets.c @@ -1274,7 +1274,7 @@ packet_set_ipv6_addr(struct dp_packet *packet, uint8_t proto, pkt_metadata_init_conn(&packet->md); } -static void +void packet_set_ipv6_flow_label(ovs_16aligned_be32 *flow_label, ovs_be32 flow_key) { ovs_be32 old_label = get_16aligned_be32(flow_label); diff --git a/lib/packets.h b/lib/packets.h index 9465bec16c9..ac4c28e471e 100644 --- a/lib/packets.h +++ b/lib/packets.h @@ -1622,6 +1622,8 @@ void packet_set_ipv6_addr(struct dp_packet *packet, uint8_t proto, ovs_16aligned_be32 addr[4], const struct in6_addr *new_addr, bool recalculate_csum); +void packet_set_ipv6_flow_label(ovs_16aligned_be32 *flow_label, + ovs_be32 flow_key); void packet_set_tcp_port(struct dp_packet *, ovs_be16 src, ovs_be16 
dst); void packet_set_udp_port(struct dp_packet *, ovs_be16 src, ovs_be16 dst); void packet_set_sctp_port(struct dp_packet *, ovs_be16 src, ovs_be16 dst); From f328fd4892c927468dc85e545f0b581d10e662ea Mon Sep 17 00:00:00 2001 From: Nobuhiro MIKI Date: Tue, 23 May 2023 12:58:22 +0900 Subject: [PATCH 252/833] netdev-native-tnl: Add ipv6_label param in netdev_tnl_ip_build_header. For tunnels such as SRv6, some popular vendor appliances support IPv6 flowlabel based load balancing. In preparation for OVS to support it, this patch modifies the encapsulation to allow IPv6 flowlabel to be configured. Signed-off-by: Nobuhiro MIKI Signed-off-by: Ilya Maximets --- lib/netdev-native-tnl.c | 13 +++++++------ lib/netdev-native-tnl.h | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c index 7c1f0a13b8d..2b98c3884ec 100644 --- a/lib/netdev-native-tnl.c +++ b/lib/netdev-native-tnl.c @@ -278,7 +278,7 @@ eth_build_header(struct ovs_action_push_tnl *data, void * netdev_tnl_ip_build_header(struct ovs_action_push_tnl *data, const struct netdev_tnl_build_header_params *params, - uint8_t next_proto) + uint8_t next_proto, ovs_be32 ipv6_label) { void *l3; @@ -310,7 +310,8 @@ netdev_tnl_ip_build_header(struct ovs_action_push_tnl *data, ip6 = (struct ovs_16aligned_ip6_hdr *) l3; put_16aligned_be32(&ip6->ip6_flow, htonl(6 << 28) | - htonl(params->flow->tunnel.ip_tos << 20)); + htonl(params->flow->tunnel.ip_tos << 20) | + (ipv6_label & htonl(IPV6_LABEL_MASK))); ip6->ip6_hlim = params->flow->tunnel.ip_ttl; ip6->ip6_nxt = next_proto; memcpy(&ip6->ip6_src, params->s_ip, sizeof(ovs_be32[4])); @@ -328,7 +329,7 @@ udp_build_header(const struct netdev_tunnel_config *tnl_cfg, { struct udp_header *udp; - udp = netdev_tnl_ip_build_header(data, params, IPPROTO_UDP); + udp = netdev_tnl_ip_build_header(data, params, IPPROTO_UDP, 0); udp->udp_dst = tnl_cfg->dst_port; if (params->is_ipv6 || params->flow->tunnel.flags & FLOW_TNL_F_CSUM) { @@ 
-484,7 +485,7 @@ netdev_gre_build_header(const struct netdev *netdev, ovs_16aligned_be32 *options; unsigned int hlen; - greh = netdev_tnl_ip_build_header(data, params, IPPROTO_GRE); + greh = netdev_tnl_ip_build_header(data, params, IPPROTO_GRE, 0); if (params->flow->packet_type == htonl(PT_ETH)) { greh->protocol = htons(ETH_TYPE_TEB); @@ -633,7 +634,7 @@ netdev_erspan_build_header(const struct netdev *netdev, int erspan_ver; uint16_t sid; - greh = netdev_tnl_ip_build_header(data, params, IPPROTO_GRE); + greh = netdev_tnl_ip_build_header(data, params, IPPROTO_GRE, 0); ersh = ERSPAN_HDR(greh); tun_id = ntohl(be64_to_be32(params->flow->tunnel.tun_id)); @@ -858,7 +859,7 @@ netdev_srv6_build_header(const struct netdev *netdev, return EINVAL; } - srh = netdev_tnl_ip_build_header(data, params, IPPROTO_ROUTING); + srh = netdev_tnl_ip_build_header(data, params, IPPROTO_ROUTING, 0); srh->rt_hdr.segments_left = nr_segs - 1; srh->rt_hdr.type = IPV6_SRCRT_TYPE_4; srh->rt_hdr.hdrlen = 2 * nr_segs; diff --git a/lib/netdev-native-tnl.h b/lib/netdev-native-tnl.h index 3311d796ed8..eb55dd0417a 100644 --- a/lib/netdev-native-tnl.h +++ b/lib/netdev-native-tnl.h @@ -118,7 +118,7 @@ netdev_tnl_ipv6_hdr(void *eth) void * netdev_tnl_ip_build_header(struct ovs_action_push_tnl *data, const struct netdev_tnl_build_header_params *params, - uint8_t next_proto); + uint8_t next_proto, ovs_be32 ipv6_label); extern uint16_t tnl_udp_port_min; extern uint16_t tnl_udp_port_max; From 701c2dbfb8970ec5d939e5f0fe24479716174762 Mon Sep 17 00:00:00 2001 From: Nobuhiro MIKI Date: Tue, 23 May 2023 12:58:23 +0900 Subject: [PATCH 253/833] userspace: Add new option srv6_flowlabel in SRv6 tunnel. It supports flowlabel based load balancing by controlling the flowlabel of outer IPv6 header, which is already implemented in Linux kernel as seg6_flowlabel sysctl [1]. 
[1]: https://docs.kernel.org/networking/seg6-sysctl.html Signed-off-by: Nobuhiro MIKI Signed-off-by: Ilya Maximets --- lib/netdev-native-tnl.c | 35 ++++++++++++-- lib/netdev-vport.c | 8 ++++ lib/netdev.h | 12 +++++ tests/tunnel-push-pop-ipv6.at | 86 +++++++++++++++++++++++++++++++++++ vswitchd/vswitch.xml | 26 +++++++++++ 5 files changed, 164 insertions(+), 3 deletions(-) diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c index 2b98c3884ec..c2c6ca55957 100644 --- a/lib/netdev-native-tnl.c +++ b/lib/netdev-native-tnl.c @@ -859,7 +859,12 @@ netdev_srv6_build_header(const struct netdev *netdev, return EINVAL; } - srh = netdev_tnl_ip_build_header(data, params, IPPROTO_ROUTING, 0); + /* Writes the netdev_srv6_flowlabel enum value to the ipv6 + * flowlabel field. It must later be replaced by a valid value + * in the header push. */ + srh = netdev_tnl_ip_build_header(data, params, IPPROTO_ROUTING, + htonl(tnl_cfg->srv6_flowlabel)); + srh->rt_hdr.segments_left = nr_segs - 1; srh->rt_hdr.type = IPV6_SRCRT_TYPE_4; srh->rt_hdr.hdrlen = 2 * nr_segs; @@ -895,10 +900,34 @@ netdev_srv6_push_header(const struct netdev *netdev OVS_UNUSED, struct dp_packet *packet, const struct ovs_action_push_tnl *data) { + struct ovs_16aligned_ip6_hdr *inner_ip6, *outer_ip6; + enum netdev_srv6_flowlabel srv6_flowlabel; + ovs_be32 ipv6_label = 0; int ip_tot_size; + uint32_t flow; + + inner_ip6 = dp_packet_l3(packet); + outer_ip6 = netdev_tnl_ipv6_hdr((void *) data->header); + srv6_flowlabel = ntohl(get_16aligned_be32(&outer_ip6->ip6_flow)) & + IPV6_LABEL_MASK; + + switch (srv6_flowlabel) { + case SRV6_FLOWLABEL_COPY: + flow = ntohl(get_16aligned_be32(&inner_ip6->ip6_flow)); + ipv6_label = (flow >> 28) == 6 ? 
htonl(flow & IPV6_LABEL_MASK) : 0; + break; + + case SRV6_FLOWLABEL_ZERO: + ipv6_label = 0; + break; + + case SRV6_FLOWLABEL_COMPUTE: + ipv6_label = htonl(dp_packet_get_rss_hash(packet) & IPV6_LABEL_MASK); + break; + } - netdev_tnl_push_ip_header(packet, data->header, data->header_len, - &ip_tot_size, 0); + netdev_tnl_push_ip_header(packet, data->header, + data->header_len, &ip_tot_size, ipv6_label); } struct dp_packet * diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c index 480117a14b8..60caa02fbb9 100644 --- a/lib/netdev-vport.c +++ b/lib/netdev-vport.c @@ -811,6 +811,14 @@ set_tunnel_config(struct netdev *dev_, const struct smap *args, char **errp) name, node->value); break; } + } else if (!strcmp(node->key, "srv6_flowlabel")) { + if (!strcmp(node->value, "zero")) { + tnl_cfg.srv6_flowlabel = SRV6_FLOWLABEL_ZERO; + } else if (!strcmp(node->value, "compute")) { + tnl_cfg.srv6_flowlabel = SRV6_FLOWLABEL_COMPUTE; + } else { + tnl_cfg.srv6_flowlabel = SRV6_FLOWLABEL_COPY; + } } else if (!strcmp(node->key, "payload_type")) { if (!strcmp(node->value, "mpls")) { tnl_cfg.payload_ethertype = htons(ETH_TYPE_MPLS); diff --git a/lib/netdev.h b/lib/netdev.h index aaec9ded1af..67a8486bdba 100644 --- a/lib/netdev.h +++ b/lib/netdev.h @@ -100,6 +100,17 @@ enum netdev_pt_mode { NETDEV_PT_LEGACY_L3, }; +enum netdev_srv6_flowlabel { + /* Copy the flowlabel of inner packet. */ + SRV6_FLOWLABEL_COPY, + + /* Simply set flowlabel to 0. */ + SRV6_FLOWLABEL_ZERO, + + /* Set flowlabel to a hash over L3/L4 fields of the inner packet. */ + SRV6_FLOWLABEL_COMPUTE, +}; + /* Configuration specific to tunnels. 
*/ struct netdev_tunnel_config { ovs_be64 in_key; @@ -146,6 +157,7 @@ struct netdev_tunnel_config { uint8_t srv6_num_segs; #define SRV6_MAX_SEGS 6 struct in6_addr srv6_segs[SRV6_MAX_SEGS]; + enum netdev_srv6_flowlabel srv6_flowlabel; }; void netdev_run(void); diff --git a/tests/tunnel-push-pop-ipv6.at b/tests/tunnel-push-pop-ipv6.at index e300fe3a0d2..a8dd28c5b59 100644 --- a/tests/tunnel-push-pop-ipv6.at +++ b/tests/tunnel-push-pop-ipv6.at @@ -1,5 +1,91 @@ AT_BANNER([tunnel_push_pop_ipv6]) +AT_SETUP([tunnel_push_pop_ipv6 - srv6]) + +OVS_VSWITCHD_START([add-port br0 p0 -- set Interface p0 type=dummy ofport_request=1 other-config:hwaddr=aa:55:aa:55:00:00 options:pcap=p0.pcap]) +AT_CHECK([ovs-vsctl add-br int-br1 -- set bridge int-br1 datapath_type=dummy], [0]) +AT_CHECK([ovs-vsctl add-br int-br2 -- set bridge int-br2 datapath_type=dummy], [0]) +AT_CHECK([ovs-vsctl add-br int-br3 -- set bridge int-br3 datapath_type=dummy], [0]) +AT_CHECK([ovs-vsctl add-port int-br1 t1 -- set Interface t1 type=srv6 \ + options:remote_ip=2001:cafe::91 ofport_request=2 \ + options:srv6_flowlabel=copy \ + ], [0]) +AT_CHECK([ovs-vsctl add-port int-br2 t2 -- set Interface t2 type=srv6 \ + options:remote_ip=2001:cafe::92 ofport_request=3 \ + options:srv6_flowlabel=zero \ + ], [0]) +AT_CHECK([ovs-vsctl add-port int-br3 t3 -- set Interface t3 type=srv6 \ + options:remote_ip=2001:cafe::93 ofport_request=4 \ + options:srv6_flowlabel=compute \ + ], [0]) + +dnl First setup dummy interface IP address, then add the route +dnl so that tnl-port table can get valid IP address for the device. 
+AT_CHECK([ovs-appctl netdev-dummy/ip6addr br0 2001:cafe::88/24], [0], [OK +]) +AT_CHECK([ovs-appctl ovs/route/add 2001:cafe::0/24 br0], [0], [OK +]) +AT_CHECK([ovs-appctl tnl/neigh/set br0 2001:cafe::91 aa:55:aa:55:00:01], [0], [OK +]) +AT_CHECK([ovs-appctl tnl/neigh/set br0 2001:cafe::92 aa:55:aa:55:00:02], [0], [OK +]) +AT_CHECK([ovs-appctl tnl/neigh/set br0 2001:cafe::93 aa:55:aa:55:00:03], [0], [OK +]) +AT_CHECK([ovs-ofctl add-flow br0 action=1]) +AT_CHECK([ovs-ofctl add-flow int-br1 action=2]) +AT_CHECK([ovs-ofctl add-flow int-br2 action=3]) +AT_CHECK([ovs-ofctl add-flow int-br3 action=4]) + +dnl Check "srv6_flowlabel=copy". +AT_CHECK([ovs-appctl netdev-dummy/receive int-br1 'in_port(2),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x0800),ipv4(src=10.0.0.1,dst=10.0.0.2,proto=6,tos=0,ttl=64,frag=no),tcp(src=100,dst=200),tcp_flags(0x001)']) +AT_CHECK([ovs-appctl netdev-dummy/receive int-br1 'in_port(2),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x0800),ipv4(src=10.0.0.1,dst=10.0.0.3,proto=6,tos=0,ttl=64,frag=no),tcp(src=100,dst=200),tcp_flags(0x001)']) +AT_CHECK([ovs-appctl netdev-dummy/receive int-br1 'in_port(2),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x86dd),ipv6(src=2001:beef::1,dst=2001:beef::2,label=2,proto=6,tclass=0x0,hlimit=64),tcp(src=100,dst=200),tcp_flags(0x001)']) +AT_CHECK([ovs-appctl netdev-dummy/receive int-br1 'in_port(2),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x86dd),ipv6(src=2001:beef::1,dst=2001:beef::3,label=3,proto=6,tclass=0x0,hlimit=64),tcp(src=100,dst=200),tcp_flags(0x001)']) +AT_CHECK([ovs-ofctl parse-pcap p0.pcap | tail -n 4 | grep -o 'ipv6_label=0x[[0-9a-f]]*' | sort], [0], [dnl +ipv6_label=0x00000 +ipv6_label=0x00000 +ipv6_label=0x00002 +ipv6_label=0x00003 +]) + +dnl Check "srv6_flowlabel=zero". 
+AT_CHECK([ovs-appctl netdev-dummy/receive int-br2 'in_port(3),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x0800),ipv4(src=10.0.0.1,dst=10.0.0.2,proto=6,tos=0,ttl=64,frag=no),tcp(src=100,dst=200),tcp_flags(0x001)']) +AT_CHECK([ovs-appctl netdev-dummy/receive int-br2 'in_port(3),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x0800),ipv4(src=10.0.0.1,dst=10.0.0.3,proto=6,tos=0,ttl=64,frag=no),tcp(src=100,dst=200),tcp_flags(0x001)']) +AT_CHECK([ovs-appctl netdev-dummy/receive int-br2 'in_port(3),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x86dd),ipv6(src=2001:beef::1,dst=2001:beef::2,label=2,proto=6,tclass=0x0,hlimit=64),tcp(src=100,dst=200),tcp_flags(0x001)']) +AT_CHECK([ovs-appctl netdev-dummy/receive int-br2 'in_port(3),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x86dd),ipv6(src=2001:beef::1,dst=2001:beef::3,label=3,proto=6,tclass=0x0,hlimit=64),tcp(src=100,dst=200),tcp_flags(0x001)']) +AT_CHECK([ovs-ofctl parse-pcap p0.pcap | tail -n 4 | grep -o 'ipv6_label=0x[[0-9a-f]]*'], [0], [dnl +ipv6_label=0x00000 +ipv6_label=0x00000 +ipv6_label=0x00000 +ipv6_label=0x00000 +]) + +dnl dnl Check "srv6_flowlabel=compute" for different flows. 
+AT_CHECK([ovs-appctl netdev-dummy/receive int-br3 'in_port(4),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x0800),ipv4(src=10.0.0.1,dst=10.0.0.2,proto=6,tos=0,ttl=64,frag=no),tcp(src=100,dst=200),tcp_flags(0x001)']) +AT_CHECK([ovs-appctl netdev-dummy/receive int-br3 'in_port(4),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x0800),ipv4(src=10.0.0.1,dst=10.0.0.3,proto=6,tos=0,ttl=64,frag=no),tcp(src=100,dst=200),tcp_flags(0x001)']) +AT_CHECK([ovs-appctl netdev-dummy/receive int-br3 'in_port(4),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x86dd),ipv6(src=2001:beef::1,dst=2001:beef::2,label=2,proto=6,tclass=0x0,hlimit=64),tcp(src=100,dst=200),tcp_flags(0x001)']) +AT_CHECK([ovs-appctl netdev-dummy/receive int-br3 'in_port(4),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x86dd),ipv6(src=2001:beef::1,dst=2001:beef::3,label=3,proto=6,tclass=0x0,hlimit=64),tcp(src=100,dst=200),tcp_flags(0x001)']) +AT_CHECK([ovs-ofctl parse-pcap p0.pcap | tail -n 4 | grep -o 'ipv6_label=0x[[0-9a-f]]*'| sort | uniq -c | wc -l], [0], [dnl +4 +]) + +dnl dnl Check "srv6_flowlabel=compute" for same IPv4/TCP flow. +AT_CHECK([ovs-appctl netdev-dummy/receive int-br3 'in_port(4),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x0800),ipv4(src=10.0.0.1,dst=10.0.0.2,proto=6,tos=0,ttl=64,frag=no),tcp(src=100,dst=200),tcp_flags(0x001)']) +AT_CHECK([ovs-appctl netdev-dummy/receive int-br3 'in_port(4),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x0800),ipv4(src=10.0.0.1,dst=10.0.0.2,proto=6,tos=0,ttl=64,frag=no),tcp(src=100,dst=200),tcp_flags(0x002)']) +AT_CHECK([ovs-ofctl parse-pcap p0.pcap | tail -n 2 | grep -o 'ipv6_label=0x[[0-9a-f]]*' | sort | uniq -c | wc -l], [0], [dnl +1 +]) + +dnl dnl Check "srv6_flowlabel=compute" for same IPv6/TCP flow. 
+AT_CHECK([ovs-appctl netdev-dummy/receive int-br3 'in_port(4),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x86dd),ipv6(src=2001:beef::1,dst=2001:beef::2,label=2,proto=6,tclass=0x0,hlimit=64),tcp(src=100,dst=200),tcp_flags(0x001)']) +AT_CHECK([ovs-appctl netdev-dummy/receive int-br3 'in_port(4),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x86dd),ipv6(src=2001:beef::1,dst=2001:beef::2,label=3,proto=6,tclass=0x0,hlimit=64),tcp(src=100,dst=200),tcp_flags(0x002)']) +AT_CHECK([ovs-ofctl parse-pcap p0.pcap | tail -n 2 | grep -o 'ipv6_label=0x[[0-9a-f]]*' | sort | uniq -c | wc -l], [0], [dnl +1 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([tunnel_push_pop_ipv6 - ip6gre]) OVS_VSWITCHD_START([add-port br0 p0 -- set Interface p0 type=dummy ofport_request=1 other-config:hwaddr=aa:55:aa:55:00:00]) diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index edb5eafa04c..59c404bbbc7 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -3287,6 +3287,32 @@ .

      + +

      + Optional. + This option controls how the flow label in the outer IPv6 header is + configured. It gives the benefit of IPv6 flow label based + load balancing, which is supported by some popular vendor + appliances. Like the net.ipv6.seg6_flowlabel sysctl, it takes + one of the three values below: +

      +
        +
      • + By default, or if this option is copy, copy the + flowlabel of the inner IPv6 header to the flowlabel of the outer IPv6 + header. If the inner header is not IPv6, the flowlabel is set to 0. +
      • +
      • + If this option is zero, simply set flowlabel to 0. +
      • +
      • + If this option is compute, set flowlabel to a hash + over the L3/L4 fields of the inner packet. +
      • +
      +
      From 263fcdfdb8aeee6c59738efa1ae1e3abd3789816 Mon Sep 17 00:00:00 2001 From: yangchang Date: Thu, 25 May 2023 17:54:37 +0800 Subject: [PATCH 254/833] ovs-fields: Modify the width of tpa and spa. Arp_spa and arp_tpa are IP addresses, their width should be 32 bits. Reviewed-by: Simon Horman Signed-off-by: yangchang Signed-off-by: Ilya Maximets --- lib/meta-flow.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/meta-flow.xml b/lib/meta-flow.xml index a1a20366d40..bdd12f6a7bb 100644 --- a/lib/meta-flow.xml +++ b/lib/meta-flow.xml @@ -4312,9 +4312,9 @@ r r c c c. - + - + From 68d6d2777f1235cdade6ad3eb4263a4142a5b10c Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 25 May 2023 19:45:09 +0200 Subject: [PATCH 255/833] AUTHORS: Add yangchang. Signed-off-by: Ilya Maximets --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index a8ff226ec10..7175766482f 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -510,6 +510,7 @@ wangchuanlei wangchuanlei@inspur.com wenxu wenxu@ucloud.cn wisd0me ak47izatool@gmail.com xushengping shengping.xu@huawei.com +yangchang yangchang@chinatelecom.cn yinpeijun yinpeijun@huawei.com zangchuanqiang zangchuanqiang@huawei.com zhaojingjing zhao.jingjing1@zte.com.cn From d51a4ef0a63bd17acd4486f0ce38102e378599dd Mon Sep 17 00:00:00 2001 From: Frode Nordahl Date: Thu, 25 May 2023 15:07:53 +0200 Subject: [PATCH 256/833] tests: layer3-tunnels: Skip bareudp tests if not supported by kernel. The bareudp tests depend on specific kernel configuration to succeed. Skip the test if the feature is not enabled in the running kernel. 
Signed-off-by: Frode Nordahl Signed-off-by: Ilya Maximets --- tests/system-kmod-macros.at | 10 ++++++++++ tests/system-layer3-tunnels.at | 4 ++-- tests/system-userspace-macros.at | 8 ++++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/tests/system-kmod-macros.at b/tests/system-kmod-macros.at index fb15a5a7ce0..712925ded77 100644 --- a/tests/system-kmod-macros.at +++ b/tests/system-kmod-macros.at @@ -237,3 +237,13 @@ m4_define([CHECK_L3L4_CONNTRACK_REASM]) # # The kernel module tests do not use TC offload. m4_define([CHECK_NO_TC_OFFLOAD]) + +# OVS_CHECK_BAREUDP() +# +# The feature needs to be enabled in the kernel configuration (CONFIG_BAREUDP) +# to work. +m4_define([OVS_CHECK_BAREUDP], +[ + AT_SKIP_IF([! ip link add dev ovs_bareudp0 type bareudp dstport 6635 ethertype mpls_uc 2>&1 >/dev/null]) + AT_CHECK([ip link del dev ovs_bareudp0]) +]) diff --git a/tests/system-layer3-tunnels.at b/tests/system-layer3-tunnels.at index c37852b2163..81123f7309a 100644 --- a/tests/system-layer3-tunnels.at +++ b/tests/system-layer3-tunnels.at @@ -154,7 +154,7 @@ OVS_VSWITCHD_STOP AT_CLEANUP AT_SETUP([layer3 - ping over MPLS Bareudp]) -OVS_CHECK_MIN_KERNEL(5, 7) +OVS_CHECK_BAREUDP() OVS_TRAFFIC_VSWITCHD_START([_ADD_BR([br1])]) ADD_NAMESPACES(at_ns0, at_ns1) @@ -202,7 +202,7 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([layer3 - ping over Bareudp]) -OVS_CHECK_MIN_KERNEL(5, 7) +OVS_CHECK_BAREUDP() OVS_TRAFFIC_VSWITCHD_START([_ADD_BR([br1])]) ADD_NAMESPACES(at_ns0, at_ns1) diff --git a/tests/system-userspace-macros.at b/tests/system-userspace-macros.at index 482079386a4..c1855cbc5b3 100644 --- a/tests/system-userspace-macros.at +++ b/tests/system-userspace-macros.at @@ -336,3 +336,11 @@ m4_define([CHECK_L3L4_CONNTRACK_REASM], # # Userspace tests do not use TC offload. m4_define([CHECK_NO_TC_OFFLOAD]) + +# OVS_CHECK_BAREUDP() +# +# The userspace datapath does not support bareudp tunnels. 
+m4_define([OVS_CHECK_BAREUDP], +[ + AT_SKIP_IF([:]) +]) From 0826de990cd2c3d55b5f8ea4da74416cc3d40a22 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 17 May 2023 18:51:04 +0200 Subject: [PATCH 257/833] stream-ssl: Disable alerts on unexpected EOF. OpenSSL 3.0 enabled alerts for unexpected EOF by default. It supposed to alert the application whenever the connection terminated without a proper close_notify. And that should allow applications to take actions to protect themselves from potential TLS truncation attack. This is how it looks like in the log: |stream_ssl|WARN|SSL_read: error:0A000126:SSL routines::unexpected eof while reading |jsonrpc|WARN|ssl:127.0.0.1:34288: receive error: Input/output error |reconnect|WARN|ssl:127.0.0.1:34288: connection dropped (Input/output error) The problem is that clients based on OVS libraries do not wait for the proper termination if it didn't happen right away. It means that chances to have alerts on the server side for every single disconnection are very high. None of the high level protocols supported by OVS daemons can carry state between re-connections, e.g., there are no session cookies or anything like that. So, the TLS truncation attack is no applicable. Disable the alert to avoid unnecessary warnings in the log. 
Reviewed-by: Simon Horman Signed-off-by: Ilya Maximets --- lib/stream-ssl.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/stream-ssl.c b/lib/stream-ssl.c index 62da9febb66..86747e58ba2 100644 --- a/lib/stream-ssl.c +++ b/lib/stream-ssl.c @@ -1075,7 +1075,13 @@ do_ssl_init(void) VLOG_ERR("SSL_CTX_new: %s", ERR_error_string(ERR_get_error(), NULL)); return ENOPROTOOPT; } - SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3); + + long options = SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3; +#ifdef SSL_OP_IGNORE_UNEXPECTED_EOF + options |= SSL_OP_IGNORE_UNEXPECTED_EOF; +#endif + SSL_CTX_set_options(ctx, options); + #if OPENSSL_VERSION_NUMBER < 0x3000000fL SSL_CTX_set_tmp_dh_callback(ctx, tmp_dh_callback); #else From 59c908410570e852be0b5c04c85093179924410c Mon Sep 17 00:00:00 2001 From: Balazs Nemeth Date: Fri, 26 May 2023 14:03:38 +0200 Subject: [PATCH 258/833] ofproto-dpif-upcall: Don't set statistics to 0 when they jump back. The only way that stats->{n_packets,n_bytes} would decrease is due to an overflow, or if there are bugs in how statistics are handled. In the past, there were multiple issues that caused a jump backward. A workaround was in place to set the statistics to 0 in that case. When this happened while the revalidator was under heavy load, the workaround had an unintended side effect where should_revalidate returned false causing the flow to be removed because the metric it calculated was based on a bogus value. Since many of those bugs have now been identified and resolved, there is no need to set the statistics to 0. In addition, the (unlikely) overflow still needs to be handled appropriately. If an unexpected jump does happen, just log it as a warning. 
Signed-off-by: Balazs Nemeth Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif-upcall.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c index cd57fdbd9e6..04b583f816f 100644 --- a/ofproto/ofproto-dpif-upcall.c +++ b/ofproto/ofproto-dpif-upcall.c @@ -2339,6 +2339,27 @@ revalidate_ukey__(struct udpif *udpif, const struct udpif_key *ukey, return result; } +static void +log_unexpected_stats_jump(struct udpif_key *ukey, + const struct dpif_flow_stats *stats) + OVS_REQUIRES(ukey->mutex) +{ + static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 5); + struct ds ds = DS_EMPTY_INITIALIZER; + struct ofpbuf *actions; + + odp_format_ufid(&ukey->ufid, &ds); + ds_put_cstr(&ds, ", "); + odp_flow_key_format(ukey->key, ukey->key_len, &ds); + ds_put_cstr(&ds, ", actions:"); + actions = ovsrcu_get(struct ofpbuf *, &ukey->actions); + format_odp_actions(&ds, actions->data, actions->size, NULL); + VLOG_WARN_RL(&rll, "Unexpected jump in packet stats from %"PRIu64 + " to %"PRIu64" when handling ukey %s", + ukey->stats.n_packets, stats->n_packets, ds_cstr(&ds)); + ds_destroy(&ds); +} + /* Verifies that the datapath actions of 'ukey' are still correct, and pushes * 'stats' for it. * @@ -2372,18 +2393,15 @@ revalidate_ukey(struct udpif *udpif, struct udpif_key *ukey, push.used = stats->used; push.tcp_flags = stats->tcp_flags; - push.n_packets = (stats->n_packets > ukey->stats.n_packets - ? stats->n_packets - ukey->stats.n_packets - : 0); - push.n_bytes = (stats->n_bytes > ukey->stats.n_bytes - ? 
stats->n_bytes - ukey->stats.n_bytes - : 0); + push.n_packets = stats->n_packets - ukey->stats.n_packets; + push.n_bytes = stats->n_bytes - ukey->stats.n_bytes; if (stats->n_packets < ukey->stats.n_packets && ukey->stats.n_packets < UINT64_THREE_QUARTERS) { /* Report cases where the packet counter is lower than the previous * instance, but exclude the potential wrapping of an uint64_t. */ COVERAGE_INC(ukey_invalid_stat_reset); + log_unexpected_stats_jump(ukey, stats); } if (need_revalidate) { From 0af352b6dffd668bca29085444cf153936ea0a71 Mon Sep 17 00:00:00 2001 From: Kevin Traynor Date: Fri, 26 May 2023 09:57:11 +0100 Subject: [PATCH 259/833] netdev-dpdk: Remove requested descriptors from get_config. There is no need to display 'requested_rx/tx_descriptors' and 'configured_rx/tx_descriptors' as they will be the same. It is simpler to just have a single 'n_rxq/txq_desc' value. Suggested-by: Ilya Maximets Reviewed-by: David Marchand Reviewed-by: Simon Horman Signed-off-by: Kevin Traynor Signed-off-by: Ilya Maximets --- lib/netdev-dpdk.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index fb0dd43f75c..2d9afc49323 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -1740,14 +1740,8 @@ netdev_dpdk_get_config(const struct netdev *netdev, struct smap *args) smap_add_format(args, "mtu", "%d", dev->mtu); if (dev->type == DPDK_DEV_ETH) { - smap_add_format(args, "requested_rxq_descriptors", "%d", - dev->requested_rxq_size); - smap_add_format(args, "configured_rxq_descriptors", "%d", - dev->rxq_size); - smap_add_format(args, "requested_txq_descriptors", "%d", - dev->requested_txq_size); - smap_add_format(args, "configured_txq_descriptors", "%d", - dev->txq_size); + smap_add_format(args, "n_rxq_desc", "%d", dev->rxq_size); + smap_add_format(args, "n_txq_desc", "%d", dev->txq_size); if (dev->hw_ol_features & NETDEV_RX_CHECKSUM_OFFLOAD) { smap_add(args, "rx_csum_offload", "true"); } else { From 
9dad8dfd1ed9e1c4629b584b477114e11f3556b7 Mon Sep 17 00:00:00 2001 From: Kevin Traynor Date: Fri, 26 May 2023 09:57:12 +0100 Subject: [PATCH 260/833] netdev-dpdk: Check rx/tx descriptor sizes for device. By default OVS configures 2048 descriptors for tx and rx queues on DPDK devices. It also allows the user to configure those values. If the values used are not acceptable to the device then queue setup would fail. The device exposes it's max/min/alignment requirements and OVS applies some limits also. Use these to ensure an acceptable value is used for the number of descriptors on a device tx/rx. If the default or user value is not acceptable, adjust to a suitable value and log. Reported-at: https://bugzilla.redhat.com/2119876 Reviewed-by: David Marchand Reviewed-by: Simon Horman Signed-off-by: Kevin Traynor Signed-off-by: Ilya Maximets --- lib/netdev-dpdk.c | 60 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 11 deletions(-) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 2d9afc49323..6bf672d43d5 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -1910,18 +1910,56 @@ dpdk_set_rxq_config(struct netdev_dpdk *dev, const struct smap *args) static void dpdk_process_queue_size(struct netdev *netdev, const struct smap *args, - const char *flag, int default_size, int *new_size) + struct rte_eth_dev_info *info, bool is_rx) { - int queue_size = smap_get_int(args, flag, default_size); + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); + struct rte_eth_desc_lim *lim; + int default_size, queue_size, cur_size, new_requested_size; + int *cur_requested_size; + bool reconfig = false; + + if (is_rx) { + default_size = NIC_PORT_DEFAULT_RXQ_SIZE; + new_requested_size = smap_get_int(args, "n_rxq_desc", default_size); + cur_size = dev->rxq_size; + cur_requested_size = &dev->requested_rxq_size; + lim = info ? 
&info->rx_desc_lim : NULL; + } else { + default_size = NIC_PORT_DEFAULT_TXQ_SIZE; + new_requested_size = smap_get_int(args, "n_txq_desc", default_size); + cur_size = dev->txq_size; + cur_requested_size = &dev->requested_txq_size; + lim = info ? &info->tx_desc_lim : NULL; + } + + queue_size = new_requested_size; + /* Check for OVS limits. */ if (queue_size <= 0 || queue_size > NIC_PORT_MAX_Q_SIZE || !is_pow2(queue_size)) { queue_size = default_size; } - if (queue_size != *new_size) { - *new_size = queue_size; + if (lim) { + /* Check for device limits. */ + if (lim->nb_align) { + queue_size = ROUND_UP(queue_size, lim->nb_align); + } + queue_size = MIN(queue_size, lim->nb_max); + queue_size = MAX(queue_size, lim->nb_min); + } + + *cur_requested_size = queue_size; + + if (cur_size != queue_size) { netdev_request_reconfigure(netdev); + reconfig = true; + } + if (new_requested_size != queue_size) { + VLOG(reconfig ? VLL_INFO : VLL_DBG, + "%s: Unable to set the number of %s descriptors to %d. " + "Adjusted to %d.", netdev_get_name(netdev), + is_rx ? 
"rx": "tx", new_requested_size, queue_size); } } @@ -1937,22 +1975,17 @@ netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args, {RTE_ETH_FC_NONE, RTE_ETH_FC_TX_PAUSE}, {RTE_ETH_FC_RX_PAUSE, RTE_ETH_FC_FULL } }; + struct rte_eth_dev_info info; const char *new_devargs; const char *vf_mac; int err = 0; + int ret; ovs_mutex_lock(&dpdk_mutex); ovs_mutex_lock(&dev->mutex); dpdk_set_rxq_config(dev, args); - dpdk_process_queue_size(netdev, args, "n_rxq_desc", - NIC_PORT_DEFAULT_RXQ_SIZE, - &dev->requested_rxq_size); - dpdk_process_queue_size(netdev, args, "n_txq_desc", - NIC_PORT_DEFAULT_TXQ_SIZE, - &dev->requested_txq_size); - new_devargs = smap_get(args, "dpdk-devargs"); if (dev->devargs && new_devargs && strcmp(new_devargs, dev->devargs)) { @@ -2008,6 +2041,11 @@ netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args, goto out; } + ret = rte_eth_dev_info_get(dev->port_id, &info); + + dpdk_process_queue_size(netdev, args, !ret ? &info : NULL, true); + dpdk_process_queue_size(netdev, args, !ret ? &info : NULL, false); + vf_mac = smap_get(args, "dpdk-vf-mac"); if (vf_mac) { struct eth_addr mac; From c3e410a03ad0068fabf309d3714ecb9f08d66839 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Fri, 26 May 2023 17:04:49 +0200 Subject: [PATCH 261/833] netdev-offload-dpdk: Fix crash in debug log. The offload thread calling ufid_to_rte_flow_disassociate() may be the last one holding a reference on the netdev and physdev. So displaying information about them might trigger a crash when removing a physical port. 
Fixes: faf71e492263 ("netdev-dpdk: Print port name in offload API messages.") Acked-by: Mike Pattrick Signed-off-by: David Marchand Signed-off-by: Ilya Maximets --- lib/netdev-offload-dpdk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c index b3421c0996e..2d7858f51ce 100644 --- a/lib/netdev-offload-dpdk.c +++ b/lib/netdev-offload-dpdk.c @@ -2345,13 +2345,13 @@ netdev_offload_dpdk_flow_destroy(struct ufid_to_rte_flow_data *rte_flow_data) ovsrcu_get(void *, &netdev->hw_info.offload_data); data->rte_flow_counters[tid]--; - ufid_to_rte_flow_disassociate(rte_flow_data); VLOG_DBG_RL(&rl, "%s/%s: rte_flow 0x%"PRIxPTR " flow destroy %d ufid " UUID_FMT, netdev_get_name(netdev), netdev_get_name(physdev), (intptr_t) rte_flow, netdev_dpdk_get_port_id(physdev), UUID_ARGS((struct uuid *) ufid)); + ufid_to_rte_flow_disassociate(rte_flow_data); } else { VLOG_ERR("Failed flow: %s/%s: flow destroy %d ufid " UUID_FMT, netdev_get_name(netdev), netdev_get_name(physdev), From e3d0e84ed3f0a49dfb7107db56ce709aaeaf1ac7 Mon Sep 17 00:00:00 2001 From: Timothy Redaelli Date: Fri, 26 May 2023 19:16:38 +0200 Subject: [PATCH 262/833] utilities/bashcomp: Fix PS1 generation on new bash. The current implementation used to extract PS1 prompt for ovs-vsctl is broken on recent Bash releases. Starting from Bash 4.4 it's possible to use @P expansion in order to get the quoted PS1 directly. This commit makes the 2 bash completion files to use @P expansion in order to get the quoted PS1 on Bash >= 4.4. 
Reported-at: https://bugzilla.redhat.com/2170344 Reported-by: Martin Necas Signed-off-by: Timothy Redaelli Signed-off-by: Ilya Maximets --- utilities/ovs-appctl-bashcomp.bash | 7 +++++++ utilities/ovs-vsctl-bashcomp.bash | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/utilities/ovs-appctl-bashcomp.bash b/utilities/ovs-appctl-bashcomp.bash index 4384be8ae10..0a9af1a18f0 100644 --- a/utilities/ovs-appctl-bashcomp.bash +++ b/utilities/ovs-appctl-bashcomp.bash @@ -223,6 +223,13 @@ printf_stderr() { # The code below is taken from Peter Amidon. His change makes it more # robust. extract_bash_prompt() { + # On Bash 4.4+ just use the @P expansion + if ((BASH_VERSINFO[0] > 4 || + (BASH_VERSINFO[0] == 4 && BASH_VERSINFO[1] >= 4))); then + _BASH_PROMPT="${PS1@P}" + return + fi + local myPS1 v myPS1="$(sed 's/Begin prompt/\\Begin prompt/; s/End prompt/\\End prompt/' <<< "$PS1")" diff --git a/utilities/ovs-vsctl-bashcomp.bash b/utilities/ovs-vsctl-bashcomp.bash index fc8245bfb55..c5ad24fb708 100644 --- a/utilities/ovs-vsctl-bashcomp.bash +++ b/utilities/ovs-vsctl-bashcomp.bash @@ -413,6 +413,13 @@ _ovs_vsctl_get_PS1 () { return; fi + # On Bash 4.4+ just use the @P expansion + if ((BASH_VERSINFO[0] > 4 || + (BASH_VERSINFO[0] == 4 && BASH_VERSINFO[1] >= 4))); then + printf '%s\n' "${PS1@P}" + return + fi + # Original inspiration from # http://stackoverflow.com/questions/10060500/bash-how-to-evaluate-ps1-ps2, # but changed quite a lot to make it more robust. From 1335af2f55458e6924a158e6c9206ff69dc67589 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 19 May 2023 10:36:11 -0400 Subject: [PATCH 263/833] MAINTAINERS.rst: Move several people to emeritus status The following document discusses emeritus committer status: https://docs.openvswitch.org/en/latest/internals/committer-emeritus-status/ There are several people who I would guess consider themselves emeritus committers but have not formally declared it. 
Those moved to emeritus status in this commit have either explicitly communicated their desire to move or have both not been active in the last year and have not yet replied to this patch. It is easy to re-add people in the future should any emeritus committer desire to become active again. Per our policies, a vote of the majority of current committers (or the list of maintainers prior to this change) is required to move a committer to emeritus status. Signed-off-by: Russell Bryant Acked-by: Alin Gabriel Serdean Acked-by: Ansis Atteka Acked-by: Daniele Di Proietto Acked-by: Ilya Maximets Acked-by: Jesse Gross Acked-by: Justin Pettit Acked-by: Pravin B Shelar Acked-by: Simon Horman Acked-by: Thomas Graf Acked-by: William Tu CC: Andy Zhou CC: Gurucharan Shetty CC: Ian Stokes CC: Jarno Rajahalme CC: YAMAMOTO Takashi --- MAINTAINERS.rst | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/MAINTAINERS.rst b/MAINTAINERS.rst index 1dc406170f2..85b8e641658 100644 --- a/MAINTAINERS.rst +++ b/MAINTAINERS.rst @@ -41,40 +41,20 @@ This is the current list of active Open vSwitch committers: * - Name - Email - * - Alex Wang - - ee07b291@gmail.com * - Alin Serdean - aserdean@ovn.org - * - Andy Zhou - - azhou@ovn.org * - Ansis Atteka - - aatteka@nicira.com - * - Daniele Di Proietto - - daniele.di.proietto@gmail.com - * - Gurucharan Shetty - - guru@ovn.org + - ansisatteka@gmail.com * - Ian Stokes - istokes@ovn.org * - Ilya Maximets - i.maximets@ovn.org - * - Jarno Rajahalme - - jarno@ovn.org - * - Jesse Gross - - jesse@kernel.org - * - Justin Pettit - - jpettit@ovn.org - * - Pravin B Shelar - - pshelar@ovn.org * - Russell Bryant - russell@ovn.org * - Simon Horman - horms@ovn.org - * - Thomas Graf - - tgraf@noironetworks.com * - William Tu - u9012063@gmail.com - * - YAMAMOTO Takashi - - yamamoto@midokura.com The project also maintains a list of Emeritus Committers (or Maintainers). 
More information about Emeritus Committers can be found here: @@ -85,12 +65,32 @@ More information about Emeritus Committers can be found here: * - Name - Email + * - Alex Wang + - ee07b291@gmail.com + * - Andy Zhou + - azhou@ovn.org * - Ben Pfaff - blp@ovn.org + * - Daniele Di Proietto + - daniele.di.proietto@gmail.com * - Ethan J. Jackson - ejj@eecs.berkeley.edu + * - Gurucharan Shetty + - guru@ovn.org + * - Jarno Rajahalme + - jarno@ovn.org + * - Jesse Gross + - jesse@kernel.org * - Joe Stringer - joe@ovn.org + * - Justin Pettit + - jpettit@ovn.org + * - Pravin B Shelar + - pshelar@ovn.org + * - Thomas Graf + - tgraf@tgraf.ch + * - YAMAMOTO Takashi + - yamamoto@midokura.com .. Cut here for the Documentation/internals/maintainers.rst From d56366bfa05b90e7b610716ebf9164bfd06e25f1 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 17 May 2023 18:51:05 +0200 Subject: [PATCH 264/833] tests: Check ovsdb-server logs in OVSDB tests. Many OVSDB tests are not checking the server log for warnings or errors. Some are not even using the log file. It's mostly OK as we're usually checking the user-visible behavior. But it would also be nice to detect some internal warnings if there are some. Moving the OVSDB_SERVER_SHUTDOWN macro to the common place, adding the call to check_logs into it and making OVSDB tests use this macro. 
Reviewed-by: Simon Horman Signed-off-by: Ilya Maximets --- tests/ovsdb-client.at | 12 +++--- tests/ovsdb-idl.at | 10 ++++- tests/ovsdb-lock.at | 10 ++--- tests/ovsdb-macros.at | 12 ++++++ tests/ovsdb-monitor.at | 88 ++++++++++++++++++++-------------------- tests/ovsdb-server.at | 91 +++++++++++++++++++++++++----------------- tests/ovsdb-tool.at | 44 ++++++++++---------- tests/vtep-ctl.at | 2 +- 8 files changed, 152 insertions(+), 117 deletions(-) diff --git a/tests/ovsdb-client.at b/tests/ovsdb-client.at index 2d14f1ac262..68fb962bd7e 100644 --- a/tests/ovsdb-client.at +++ b/tests/ovsdb-client.at @@ -5,7 +5,7 @@ AT_KEYWORDS([ovsdb client positive]) ordinal_schema > schema on_exit 'kill `cat *.pid`' AT_CHECK([ovsdb-tool create db schema], [0], [ignore], [ignore]) -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket db], [0], [ignore], [ignore]) +AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile --remote=punix:socket db], [0], [ignore], [ignore]) AT_CHECK([ovsdb-client get-schema-version unix:socket ordinals], [0], [5.1.3 ]) AT_CHECK([ovsdb-client get-schema-cksum unix:socket ordinals], [0], [12345678 9 @@ -19,7 +19,7 @@ on_exit 'kill `cat *.pid`' ordinal_schema > schema touch .db.~lock~ AT_CHECK([ovsdb-tool create db schema], [0], [], [ignore]) -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket db], [0], [ignore], [ignore]) +AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile --remote=punix:socket db], [0], [ignore], [ignore]) AT_CHECK([ovsdb-client needs-conversion unix:socket schema], [0], [no ]) OVSDB_SERVER_SHUTDOWN @@ -31,7 +31,7 @@ ordinal_schema > schema touch .db.~lock~ on_exit 'kill `cat *.pid`' AT_CHECK([ovsdb-tool create db schema], [0], [], [ignore]) -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket db], [0], [ignore], [ignore]) +AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile --remote=punix:socket db], [0], [ignore], [ignore]) sed 
's/5\.1\.3/5.1.4/' < schema > schema2 AT_CHECK([diff schema schema2], [1], [ignore]) AT_CHECK([ovsdb-client needs-conversion unix:socket schema2], [0], [yes @@ -134,7 +134,7 @@ _uuid name number ]) dnl Stop the database server, then re-start it based on the backup. -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN AT_CHECK([ovsdb-server -vfile -vvlog:off --detach --no-chdir --pidfile --log-file --remote=punix:db.sock backup], [0]) dnl Dump a new copy of the data. @@ -195,7 +195,7 @@ ordinals table _uuid,name,number ]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN AT_CLEANUP @@ -254,7 +254,7 @@ _uuid,name,number ]) dnl Stopping the server. -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN dnl ovsdb-client should exit by itself after disconnection form the server. OVS_WAIT_WHILE([test -e ovsdb-client.pid]) diff --git a/tests/ovsdb-idl.at b/tests/ovsdb-idl.at index 978a6677bd6..df5a9d2fd20 100644 --- a/tests/ovsdb-idl.at +++ b/tests/ovsdb-idl.at @@ -278,7 +278,10 @@ m4_define([OVSDB_CHECK_IDL_SSL_PY], [0], [stdout], [ignore]) AT_CHECK([sort stdout | uuidfilt]m4_if([$6],,, [[| $6]]), [0], [$4]) - OVSDB_SERVER_SHUTDOWN + OVSDB_SERVER_SHUTDOWN([" + /unexpected SSL connection close/d + /Protocol error/d + "]) AT_CLEANUP]) m4_define([OVSDB_CHECK_IDL], @@ -2309,7 +2312,10 @@ m4_define([CHECK_STREAM_OPEN_BLOCK], AT_CHECK([$2 PROTOCOL:$4:$TCP_PORT $SSL_KEY_ARGS], [0], [ignore]) AT_CHECK([$2 PROTOCOL:$4:$WRONG_PORT $SSL_KEY_ARGS], [1], [ignore], [ignore]) - OVSDB_SERVER_SHUTDOWN + OVSDB_SERVER_SHUTDOWN([" + /unexpected SSL connection close/d + /Protocol error/d + "]) AT_CHECK([$2 PROTOCOL:$4:$TCP_PORT $SSL_KEY_ARGS], [1], [ignore], [ignore]) AT_CLEANUP]) diff --git a/tests/ovsdb-lock.at b/tests/ovsdb-lock.at index a3acd2f27a0..6bc24730273 100644 --- a/tests/ovsdb-lock.at +++ b/tests/ovsdb-lock.at @@ -12,8 +12,8 @@ m4_define([OVSDB_CHECK_LOCK_SETUP], AT_KEYWORDS([ovsdb lock $2]) ordinal_schema > schema AT_CHECK([ovsdb-tool create db 
schema], [0], [stdout], [ignore]) - AT_CAPTURE_FILE([ovsdb-server-log]) - AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file="`pwd`"/ovsdb-server-log db >/dev/null 2>&1], [0], [], [])]) + AT_CAPTURE_FILE([ovsdb-server.log]) + AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file db], [0], [ignore], [ignore])]) # # Two sessions create two locks. Both sessions should be able to get their @@ -23,7 +23,7 @@ AT_CHECK([ovsdb-client --detach --no-chdir lock unix:socket lock0 >c1-output 2>& [0], [], []) AT_CHECK([ovsdb-client --detach --no-chdir lock unix:socket lock1 >c2-output 2>&1], [0], [], []) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN AT_CHECK([cat c1-output], 0, [{"locked":true} ], []) AT_CHECK([cat c2-output], 0, [{"locked":true} @@ -40,7 +40,7 @@ AT_CHECK([ovsdb-client --detach --no-chdir --pidfile lock unix:socket lock0 >c1- AT_CHECK([ovsdb-client --detach --no-chdir lock unix:socket lock0 >c2-output 2>&1], [0], [], []) AT_CHECK([ovs-appctl -t ovsdb-client unlock lock0], [0], [], []) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN AT_CHECK([cat c1-output], 0, [{"locked":true} {} ]) @@ -60,7 +60,7 @@ AT_CHECK([ovsdb-client --detach --no-chdir lock unix:socket lock0 >c1-output 2>& AT_CHECK([ovsdb-client --detach --no-chdir --pidfile steal unix:socket lock0 >c2-output 2>&1], [0], [], []) AT_CHECK([ovs-appctl -t ovsdb-client unlock lock0], [0], [], []) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN AT_CHECK([cat c1-output], 0, [{"locked":true} stolen [["lock0"]] diff --git a/tests/ovsdb-macros.at b/tests/ovsdb-macros.at index 0f8e4bd20b3..503b8b722e5 100644 --- a/tests/ovsdb-macros.at +++ b/tests/ovsdb-macros.at @@ -13,6 +13,18 @@ m4_define([OVSDB_INIT], "row": {}}]']], [0], [ignore], [ignore])]) +dnl OVSDB_SERVER_SHUTDOWN([ALLOWLIST]) +dnl +dnl Gracefully stops ovsdb-server, checking log files for messages with +dnl severity WARN or higher and signaling 
an error if any is present. +dnl The optional ALLOWLIST may contain shell-quoted "sed" commands to +dnl delete any warnings that are actually expected, e.g.: +dnl +dnl OVSDB_SERVER_SHUTDOWN(["/expected error/d"]) +m4_define([OVSDB_SERVER_SHUTDOWN], + [AT_CHECK([check_logs $1]) + OVS_APP_EXIT_AND_WAIT_BY_TARGET([ovsdb-server], [ovsdb-server.pid])]) + # OVSDB_CHECK_POSITIVE(TITLE, TEST-OVSDB-ARGS, OUTPUT, [KEYWORDS], [PREREQ]) # # Runs "test-ovsdb TEST-OVSDB-ARGS" and checks that it exits with diff --git a/tests/ovsdb-monitor.at b/tests/ovsdb-monitor.at index 3b622b3ec05..7e1ff64f0f3 100644 --- a/tests/ovsdb-monitor.at +++ b/tests/ovsdb-monitor.at @@ -28,7 +28,7 @@ ovsdb_check_monitor () { for txn in ${1+"$@"} '[["'$db'"]]'; do AT_CHECK([ovsdb-client transact unix:socket "$txn"], [0], [ignore], [ignore]) done - OVS_APP_EXIT_AND_WAIT_BY_TARGET([ovsdb-server], [ovsdb-server.pid]) + OVSDB_SERVER_SHUTDOWN OVS_WAIT_UNTIL([test ! -e ovsdb-client.pid]) AT_CHECK_UNQUOTED([$PYTHON3 $srcdir/ovsdb-monitor-sort.py < output | uuidfilt], [0], [$output], [ignore]) } @@ -88,10 +88,10 @@ m4_define([OVSDB_CHECK_MONITOR_COND], for txn in m4_foreach([txn], [$3], ['txn' ]); do AT_CHECK([ovsdb-tool transact db "$txn"], [0], [ignore], [ignore]) done - AT_CAPTURE_FILE([ovsdb-server-log]) - AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file="`pwd`"/ovsdb-server-log db >/dev/null 2>&1]) + AT_CAPTURE_FILE([ovsdb-server.log]) + AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file db], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-server.pid`' - AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond --format=csv unix:socket $4 '[$8]' $5 $9 > output], + AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond --format=csv unix:socket $4 '[$8]' $5 $9 > output 2> ovsdb-client.stderr], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-client.pid`' for txn in 
m4_foreach([txn], [$6], ['txn' ]); do @@ -103,7 +103,7 @@ m4_define([OVSDB_CHECK_MONITOR_COND], done AT_CHECK([ovsdb-client transact unix:socket '[["$4"]]'], [0], [ignore], [ignore]) - AT_CHECK([ovs-appctl -t ovsdb-server -e exit], [0], [ignore], [ignore]) + OVSDB_SERVER_SHUTDOWN OVS_WAIT_UNTIL([test ! -e ovsdb-server.pid && test ! -e ovsdb-client.pid]) AT_CHECK([$PYTHON3 $srcdir/ovsdb-monitor-sort.py < output | uuidfilt], [0], [$7], [ignore]) AT_CLEANUP]) @@ -595,9 +595,9 @@ AT_SETUP(monitor-cond-change with many sessions pending) AT_KEYWORDS([ovsdb server monitor monitor-cond negative]) ordinal_schema > schema AT_CHECK([ovsdb-tool create db schema], [0], [stdout], [ignore]) -AT_CAPTURE_FILE([ovsdb-server-log]) +AT_CAPTURE_FILE([ovsdb-server.log]) -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file="`pwd`"/ovsdb-server-log db >/dev/null 2>&1]) +AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file db], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-server.pid`' for txn in m4_foreach([txn], [[[["ordinals", {"op": "insert", @@ -619,14 +619,14 @@ done cond='[[["name","==","ten"]]]' for i in `seq 1 990`; do - AT_CHECK([ovsdb-client -vjsonrpc --pidfile=ovsdb-client$i.pid --detach --no-chdir -d json monitor-cond --format=csv unix:socket ordinals $cond ordinals ["name"]], [0], [ignore], [ignore]) + AT_CHECK([ovsdb-client -vjsonrpc --pidfile=ovsdb-client$i.pid --detach --no-chdir -d json monitor-cond --format=csv unix:socket ordinals $cond ordinals ["name"] >ovsdb-client$i.out 2>&1], [0], [ignore], [ignore]) done -AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond --format=csv unix:socket ordinals $cond ordinals ["name"] > output], +AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond --format=csv unix:socket ordinals $cond ordinals ["name"] > output 2> ovsdb-client.stderr], [0], [ignore], [ignore]) for i in `seq 991 1000`; do - 
AT_CHECK([ovsdb-client -vjsonrpc --pidfile=ovsdb-client$i.pid --detach --no-chdir -d json monitor-cond --format=csv unix:socket ordinals $cond ordinals ["name"]], [0], [ignore], [ignore]) + AT_CHECK([ovsdb-client -vjsonrpc --pidfile=ovsdb-client$i.pid --detach --no-chdir -d json monitor-cond --format=csv unix:socket ordinals $cond ordinals ["name"] >ovsdb-client$i.out 2>&1 ], [0], [ignore], [ignore]) done for txn in m4_foreach([txn], [[[["ordinals", @@ -647,7 +647,7 @@ sleep 1 AT_CHECK([ovsdb-client transact unix:socket '[["ordinals"]]'], [0], [ignore], [ignore]) -AT_CHECK([ovs-appctl -t ovsdb-server -e exit], [0], [ignore], [ignore]) +OVSDB_SERVER_SHUTDOWN("/Too many open files/d") OVS_WAIT_UNTIL([test ! -e ovsdb-server.pid && test ! -e ovsdb-client.pid]) AT_CHECK([$PYTHON3 $srcdir/ovsdb-monitor-sort.py < output | uuidfilt], [0], [[row,action,name <0>,insert,"""ten""" @@ -666,8 +666,8 @@ AT_SETUP([monitor-cond-since not found]) AT_KEYWORDS([ovsdb server monitor monitor-cond-since positive]) ordinal_schema > schema AT_CHECK([ovsdb-tool create-cluster db schema unix:db.raft], [0], [stdout], [ignore]) -AT_CAPTURE_FILE([ovsdb-server-log]) -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file="`pwd`"/ovsdb-server-log db >/dev/null 2>&1]) +AT_CAPTURE_FILE([ovsdb-server.log]) +AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file db], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-server.pid`' for txn in m4_foreach([txn], [[[["ordinals", {"op": "insert", @@ -684,7 +684,7 @@ done # Omitting the last_id parameter in ovsdb-client monitor-cond-since command # will by default using all zero uuid, which doesn't exist in any history txn. 
-AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals '[[["name","==","one"],["name","==","ten"]]]' ordinals > output], +AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals '[[["name","==","one"],["name","==","ten"]]]' ordinals > output 2> ovsdb-client.stderr], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-client.pid`' for txn in m4_foreach([txn], [[[["ordinals", @@ -699,7 +699,7 @@ for txn in m4_foreach([txn], [[[["ordinals", done AT_CHECK([ovsdb-client transact unix:socket '[["ordinals"]]'], [0], [ignore], [ignore]) -AT_CHECK([ovs-appctl -t ovsdb-server -e exit], [0], [ignore], [ignore]) +OVSDB_SERVER_SHUTDOWN OVS_WAIT_UNTIL([test ! -e ovsdb-server.pid && test ! -e ovsdb-client.pid]) AT_CHECK([$PYTHON3 $srcdir/ovsdb-monitor-sort.py < output | uuidfilt], [0], [[found: false, last_id: <0> @@ -720,8 +720,8 @@ AT_SETUP([monitor-cond-since db restart]) AT_KEYWORDS([ovsdb server monitor monitor-cond-since positive]) ordinal_schema > schema AT_CHECK([ovsdb-tool create-cluster db schema unix:db.raft], [0], [stdout], [ignore]) -AT_CAPTURE_FILE([ovsdb-server-log]) -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file="`pwd`"/ovsdb-server-log db >/dev/null 2>&1]) +AT_CAPTURE_FILE([ovsdb-server.log]) +AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file db], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-server.pid`' for txn in m4_foreach([txn], [[[["ordinals", {"op": "insert", @@ -736,19 +736,18 @@ for txn in m4_foreach([txn], [[[["ordinals", AT_CHECK([ovsdb-client transact unix:socket "$txn"], [0], [ignore], [ignore]) done -AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals '[[["name","==","one"],["name","==","ten"]]]' ordinals > output], +AT_CHECK([ovsdb-client -vjsonrpc --pidfile 
--detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals '[[["name","==","one"],["name","==","ten"]]]' ordinals > output 2> ovsdb-client.stderr], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-client.pid`' OVS_WAIT_UNTIL([grep last_id output]) -kill `cat ovsdb-client.pid` -kill `cat ovsdb-server.pid` +OVSDB_SERVER_SHUTDOWN OVS_WAIT_UNTIL([test ! -e ovsdb-server.pid && test ! -e ovsdb-client.pid]) # Remember the last_id, which will be used for monitor-cond-since later. last_id=`grep last_id output | awk '{print $4}'` -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file="`pwd`"/ovsdb-server-log db >/dev/null 2>&1]) +AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file db], [0], [ignore], [ignore]) # Some new changes made to db after restarting the server. for txn in m4_foreach([txn], [[[["ordinals", @@ -763,12 +762,12 @@ for txn in m4_foreach([txn], [[[["ordinals", done # Use last_id to monitor and get only the new changes. -AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals $last_id '[[["name","==","one"],["name","==","ten"]]]' ordinals > output], +AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals $last_id '[[["name","==","one"],["name","==","ten"]]]' ordinals > output 2> ovsdb-client.stderr], [0], [ignore], [ignore]) AT_CHECK([ovsdb-client transact unix:socket '[["ordinals"]]'], [0], [ignore], [ignore]) -AT_CHECK([ovs-appctl -t ovsdb-server -e exit], [0], [ignore], [ignore]) +OVSDB_SERVER_SHUTDOWN OVS_WAIT_UNTIL([test ! -e ovsdb-server.pid && test ! 
-e ovsdb-client.pid]) AT_CHECK([$PYTHON3 $srcdir/ovsdb-monitor-sort.py < output | uuidfilt], [0], [[found: true, last_id: <0> @@ -784,8 +783,8 @@ AT_SETUP([monitor-cond-since found but no new rows]) AT_KEYWORDS([ovsdb server monitor monitor-cond-since positive]) ordinal_schema > schema AT_CHECK([ovsdb-tool create-cluster db schema unix:db.raft], [0], [stdout], [ignore]) -AT_CAPTURE_FILE([ovsdb-server-log]) -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file="`pwd`"/ovsdb-server-log db >/dev/null 2>&1]) +AT_CAPTURE_FILE([ovsdb-server.log]) +AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file db], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-server.pid`' for txn in m4_foreach([txn], [[[["ordinals", {"op": "insert", @@ -799,7 +798,7 @@ for txn in m4_foreach([txn], [[[["ordinals", "row": {"number": 2, "name": "two"}}]]]], ['txn' ]); do AT_CHECK([ovsdb-client transact unix:socket "$txn"], [0], [ignore], [ignore]) done -AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals '[[["name","==","one"],["name","==","ten"]]]' ordinals > output], +AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals '[[["name","==","one"],["name","==","ten"]]]' ordinals > output 2> ovsdb-client.stderr], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-client.pid`' OVS_WAIT_UNTIL([grep last_id output]) @@ -807,12 +806,12 @@ OVS_WAIT_UNTIL([grep last_id output]) kill `cat ovsdb-client.pid` OVS_WAIT_UNTIL([test ! 
-e ovsdb-client.pid]) last_id=`grep last_id output | awk '{print $4}'` -AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals $last_id '[[["name","==","one"],["name","==","ten"]]]' ordinals > output], +AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals $last_id '[[["name","==","one"],["name","==","ten"]]]' ordinals > output 2> ovsdb-client.stderr], [0], [ignore], [ignore]) AT_CHECK([ovsdb-client transact unix:socket '[["ordinals"]]'], [0], [ignore], [ignore]) -AT_CHECK([ovs-appctl -t ovsdb-server -e exit], [0], [ignore], [ignore]) +OVSDB_SERVER_SHUTDOWN OVS_WAIT_UNTIL([test ! -e ovsdb-server.pid && test ! -e ovsdb-client.pid]) AT_CHECK([$PYTHON3 $srcdir/ovsdb-monitor-sort.py < output | uuidfilt], [0], [[found: true, last_id: <0> @@ -825,17 +824,17 @@ AT_SETUP([monitor-cond-since empty db]) AT_KEYWORDS([ovsdb server monitor monitor-cond-since positive]) ordinal_schema > schema AT_CHECK([ovsdb-tool create-cluster db schema unix:db.raft], [0], [stdout], [ignore]) -AT_CAPTURE_FILE([ovsdb-server-log]) -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file="`pwd`"/ovsdb-server-log db >/dev/null 2>&1]) +AT_CAPTURE_FILE([ovsdb-server.log]) +AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file db], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-server.pid`' -AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals '[[["name","==","one"],["name","==","ten"]]]' ordinals > output], +AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals '[[["name","==","one"],["name","==","ten"]]]' ordinals > output 2> ovsdb-client.stderr], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-client.pid`' OVS_WAIT_UNTIL([grep last_id output]) 
AT_CHECK([ovsdb-client transact unix:socket '[["ordinals"]]'], [0], [ignore], [ignore]) -AT_CHECK([ovs-appctl -t ovsdb-server -e exit], [0], [ignore], [ignore]) +OVSDB_SERVER_SHUTDOWN OVS_WAIT_UNTIL([test ! -e ovsdb-server.pid && test ! -e ovsdb-client.pid]) AT_CHECK([$PYTHON3 $srcdir/ovsdb-monitor-sort.py < output | uuidfilt], [0], [[found: false, last_id: <0> @@ -848,8 +847,8 @@ AT_SETUP([monitor-cond-since condition change]) AT_KEYWORDS([ovsdb server monitor monitor-cond-since positive]) ordinal_schema > schema AT_CHECK([ovsdb-tool create-cluster db schema unix:db.raft], [0], [stdout], [ignore]) -AT_CAPTURE_FILE([ovsdb-server-log]) -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file="`pwd`"/ovsdb-server-log db >/dev/null 2>&1]) +AT_CAPTURE_FILE([ovsdb-server.log]) +AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file -vjsonrpc:file:dbg db], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-server.pid`' for txn in m4_foreach([txn], [[[["ordinals", {"op": "insert", @@ -863,7 +862,8 @@ for txn in m4_foreach([txn], [[[["ordinals", "row": {"number": 2, "name": "two"}}]]]], ['txn' ]); do AT_CHECK([ovsdb-client transact unix:socket "$txn"], [0], [ignore], [ignore]) done -AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals '[[]]' ordinals > output], [0], [ignore], [ignore]) +AT_CAPTURE_FILE([ovsdb-client.log]) +AT_CHECK([ovsdb-client -vjsonrpc --log-file --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals '[[]]' ordinals > output 2> ovsdb-client.stderr]) on_exit 'kill `cat ovsdb-client.pid`' for cond in m4_foreach([cond], [[[[["name","==","one"],["name","==","two"]]]], @@ -874,7 +874,7 @@ for cond in m4_foreach([cond], done AT_CHECK([ovsdb-client transact unix:socket '[["ordinals"]]'], [0], [ignore], [ignore]) -AT_CHECK([ovs-appctl -t ovsdb-server -e exit], [0], [ignore], [ignore]) 
+OVSDB_SERVER_SHUTDOWN OVS_WAIT_UNTIL([test ! -e ovsdb-server.pid && test ! -e ovsdb-client.pid]) AT_CHECK([$PYTHON3 $srcdir/ovsdb-monitor-sort.py < output | uuidfilt], [0], [[found: false, last_id: <0> @@ -909,8 +909,8 @@ AT_SETUP([monitor-cond-since non-cluster]) AT_KEYWORDS([ovsdb server monitor monitor-cond-since positive]) ordinal_schema > schema AT_CHECK([ovsdb-tool create db schema], [0], [stdout], [ignore]) -AT_CAPTURE_FILE([ovsdb-server-log]) -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file="`pwd`"/ovsdb-server-log db >/dev/null 2>&1]) +AT_CAPTURE_FILE([ovsdb-server.log]) +AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file db], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-server.pid`' for txn in m4_foreach([txn], [[[["ordinals", {"op": "insert", @@ -925,7 +925,7 @@ for txn in m4_foreach([txn], [[[["ordinals", AT_CHECK([ovsdb-client transact unix:socket "$txn"], [0], [ignore], [ignore]) done -AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals '[[["name","==","one"],["name","==","ten"]]]' ordinals > output], +AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals '[[["name","==","one"],["name","==","ten"]]]' ordinals > output 2> ovsdb-client.stderr], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-client.pid`' for txn in m4_foreach([txn], [[[["ordinals", @@ -940,7 +940,7 @@ for txn in m4_foreach([txn], [[[["ordinals", done AT_CHECK([ovsdb-client transact unix:socket '[["ordinals"]]'], [0], [ignore], [ignore]) -AT_CHECK([ovs-appctl -t ovsdb-server -e exit], [0], [ignore], [ignore]) +OVSDB_SERVER_SHUTDOWN OVS_WAIT_UNTIL([test ! -e ovsdb-server.pid && test ! 
-e ovsdb-client.pid]) # Transaction shouldn't be found, and last_id returned should always @@ -962,8 +962,8 @@ AT_SETUP([monitor-cond-since non-cluster non-zero last_id]) AT_KEYWORDS([ovsdb server monitor monitor-cond-since negative]) ordinal_schema > schema AT_CHECK([ovsdb-tool create db schema], [0], [stdout], [ignore]) -AT_CAPTURE_FILE([ovsdb-server-log]) -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file="`pwd`"/ovsdb-server-log db >/dev/null 2>&1]) +AT_CAPTURE_FILE([ovsdb-server.log]) +AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file db], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-server.pid`' for txn in m4_foreach([txn], [[[["ordinals", {"op": "insert", @@ -980,7 +980,7 @@ done # A non-zero uuid last_id=11111111-1111-1111-1111-111111111111 -AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals $last_id '[[["name","==","one"],["name","==","ten"]]]' ordinals > output], +AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals $last_id '[[["name","==","one"],["name","==","ten"]]]' ordinals > output 2> ovsdb-client.stderr], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-client.pid`' for txn in m4_foreach([txn], [[[["ordinals", @@ -995,7 +995,7 @@ for txn in m4_foreach([txn], [[[["ordinals", done AT_CHECK([ovsdb-client transact unix:socket '[["ordinals"]]'], [0], [ignore], [ignore]) -AT_CHECK([ovs-appctl -t ovsdb-server -e exit], [0], [ignore], [ignore]) +OVSDB_SERVER_SHUTDOWN OVS_WAIT_UNTIL([test ! -e ovsdb-server.pid && test ! 
-e ovsdb-client.pid]) # Transaction shouldn't be found, and last_id returned should always diff --git a/tests/ovsdb-server.at b/tests/ovsdb-server.at index bf539b6e5be..b53ab8f5227 100644 --- a/tests/ovsdb-server.at +++ b/tests/ovsdb-server.at @@ -1,15 +1,17 @@ AT_BANNER([OVSDB -- ovsdb-server transactions (Unix sockets)]) -m4_define([OVSDB_SERVER_SHUTDOWN], - [OVS_APP_EXIT_AND_WAIT_BY_TARGET([ovsdb-server], [ovsdb-server.pid])]) - +dnl OVSDB_SERVER_SHUTDOWN_N(N, [ALLOWLIST]) +dnl +dnl Similar to OVSDB_SERVER_SHUTDOWN, but stops the server started with N.pid +dnl pidfile and unixctlN socket. m4_define([OVSDB_SERVER_SHUTDOWN_N], - [cp $1.pid savepid$1 + [AT_CHECK([check_logs $2]) + cp $1.pid savepid$1 AT_CHECK([ovs-appctl -t "`pwd`"/unixctl$1 -e exit], [0], [ignore], [ignore]) OVS_WAIT_WHILE([kill -0 `cat savepid$1`], [kill `cat savepid$1`])]) m4_define([OVSDB_SERVER_SHUTDOWN2], - [OVSDB_SERVER_SHUTDOWN_N([2])]) + [OVSDB_SERVER_SHUTDOWN_N([2], $1)]) # OVSDB_CHECK_EXECUTION(TITLE, SCHEMA, TRANSACTIONS, OUTPUT, [KEYWORDS]) # @@ -31,7 +33,7 @@ m4_define([OVSDB_CHECK_EXECUTION], $2 > schema AT_CHECK([ovsdb-tool create db schema], [0], [stdout], [ignore]) on_exit 'kill `cat *.pid`' - AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket db], [0], [ignore], [ignore]) + AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile --remote=punix:socket db], [0], [ignore], [ignore]) m4_foreach([txn], [$3], [AT_CHECK([ovsdb-client transact unix:socket 'txn'], [0], [stdout], [ignore]) cat stdout >> output @@ -157,7 +159,7 @@ constraint_schema > schema2 AT_CHECK([ovsdb-tool create db1 schema1], [0], [ignore], [ignore]) AT_CHECK([ovsdb-tool create db2 schema2], [0], [ignore], [ignore]) on_exit 'kill `cat *.pid`' -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:db.sock db1 db2], [0], [ignore], [ignore]) +AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile --remote=punix:db.sock db1 db2], [0], [ignore], [ignore]) 
CHECK_DBS([constraints ordinals ]) @@ -177,7 +179,7 @@ AT_CHECK([ovsdb-tool create db1 schema1], [0], [ignore], [ignore]) AT_CHECK([ovsdb-tool create db2 schema2], [0], [ignore], [ignore]) # Start ovsdb-server with just a single database - db1. -AT_CHECK([ovsdb-server -vfile -vvlog:off --log-file --detach --no-chdir --pidfile --remote=punix:db.sock db1], [0]) +AT_CHECK([ovsdb-server -vfile -vvlog:off --log-file --detach --no-chdir --pidfile --remote=punix:db.sock db1], [0], [ignore], [ignore]) CHECK_DBS([ordinals ]) @@ -280,7 +282,7 @@ AT_CHECK([uuidfilt db-change-unaware.stdout], [0], [dnl <0> initial _Server ]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN(["/no database named ordinals/d"]) AT_CLEANUP AT_SETUP([ovsdb-server/add-db with --monitor]) @@ -298,7 +300,7 @@ AT_SKIP_IF([test $TESTS_WITH_UBSAN = yes]) ordinal_schema > schema AT_CHECK([ovsdb-tool create db1 schema], [0], [ignore], [ignore]) on_exit 'kill `cat *.pid`' -AT_CHECK([ovsdb-server -vfile -vvlog:off --monitor --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db1]) +AT_CHECK([ovsdb-server -vfile -vvlog:off --monitor --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db1], [0], [ignore], [ignore]) # Add the second database. 
constraint_schema > schema2 @@ -319,7 +321,10 @@ OVS_WAIT_UNTIL([ovs-appctl -t ovsdb-server version]) CHECK_DBS([constraints ordinals ]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN([" + /backtrace/d + /killed/d +"]) AT_CLEANUP AT_SETUP([ovsdb-server/add-db and remove-db with --monitor]) @@ -339,7 +344,7 @@ AT_CHECK([ovsdb-tool create db1 schema], [0], [ignore], [ignore]) constraint_schema > schema2 AT_CHECK([ovsdb-tool create db2 schema2], [0], [ignore], [ignore]) on_exit 'kill `cat *.pid`' -AT_CHECK([ovsdb-server -vfile -vvlog:off --monitor --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db1 db2]) +AT_CHECK([ovsdb-server -vfile -vvlog:off --monitor --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db1 db2], [0], [ignore], [ignore]) # Remove the second database. AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/remove-db constraints]) @@ -356,7 +361,10 @@ OVS_WAIT_UNTIL( OVS_WAIT_UNTIL([ovs-appctl -t ovsdb-server version]) CHECK_DBS([ordinals ]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN([" + /backtrace/d + /killed/d +"]) AT_CLEANUP AT_SETUP([--remote=db: implementation]) @@ -400,7 +408,7 @@ AT_CHECK( "uuid-name": "x", "row": {"target": "punix:socket2"}}]']], [0], [ignore], [ignore]) on_exit 'kill `cat *.pid`' -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=db:mydb,Root,managers --remote=db:mydb,Root,manager_options --log-file db], [0], [ignore], [ignore]) +AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile --remote=db:mydb,Root,managers --remote=db:mydb,Root,manager_options db], [0], [ignore], [ignore]) ovs-appctl -t ovsdb-server time/warp 6000 1000 AT_CHECK( [[ovsdb-client transact unix:socket1 \ @@ -420,7 +428,7 @@ AT_CHECK( [[[{"rows":[{"managers":"punix:socket1"}]},{"rows":[{"is_connected":false,"target":"punix:socket2"}]}] ]], [ignore]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN AT_CLEANUP AT_SETUP([ovsdb-server/add-remote and remove-remote]) @@ 
-428,7 +436,7 @@ AT_KEYWORDS([ovsdb server positive]) ordinal_schema > schema AT_CHECK([ovsdb-tool create db schema], [0], [ignore], [ignore]) on_exit 'kill `cat *.pid`' -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile db]) +AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile db], [0], [ignore], [ignore]) AT_CHECK([test ! -e socket1]) AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/add-remote punix:socket1]) @@ -473,7 +481,7 @@ AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/remove-remote punix:socket2]) OVS_WAIT_UNTIL([test ! -e socket2]) AT_CHECK([test ! -e socket1]) AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/list-remotes]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN AT_CLEANUP AT_SETUP([ovsdb-server/add-remote with --monitor]) @@ -491,7 +499,7 @@ AT_SKIP_IF([test $TESTS_WITH_UBSAN = yes]) ordinal_schema > schema AT_CHECK([ovsdb-tool create db schema], [0], [ignore], [ignore]) on_exit 'kill `cat *.pid`' -AT_CHECK([ovsdb-server -vfile -vvlog:off --monitor --detach --no-chdir --pidfile --log-file db]) +AT_CHECK([ovsdb-server -vfile -vvlog:off --monitor --detach --no-chdir --pidfile --log-file db], [0], [ignore], [ignore]) # Add a remote. AT_CHECK([test ! 
-e socket1]) @@ -512,7 +520,10 @@ OVS_WAIT_UNTIL( [test -s ovsdb-server.pid && test `cat ovsdb-server.pid` != `cat old.pid`]) OVS_WAIT_UNTIL([ovs-appctl -t ovsdb-server version]) OVS_WAIT_UNTIL([test -S socket1]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN([" + /backtrace/d + /killed/d +"]) AT_CLEANUP AT_SETUP([ovsdb-server/add-remote and remove-remote with --monitor]) @@ -530,7 +541,7 @@ AT_SKIP_IF([test $TESTS_WITH_UBSAN = yes]) ordinal_schema > schema AT_CHECK([ovsdb-tool create db schema], [0], [ignore], [ignore]) on_exit 'kill `cat *.pid`' -AT_CHECK([ovsdb-server -vfile -vvlog:off --monitor --detach --no-chdir --pidfile --log-file db]) +AT_CHECK([ovsdb-server -vfile -vvlog:off --monitor --detach --no-chdir --pidfile --log-file db], [0], [ignore], [ignore]) # Add a remote. AT_CHECK([test ! -e socket1]) @@ -555,7 +566,10 @@ OVS_WAIT_UNTIL( [test -s ovsdb-server.pid && test `cat ovsdb-server.pid` != `cat old.pid`]) OVS_WAIT_UNTIL([ovs-appctl -t ovsdb-server version]) AT_CHECK([test ! -e socket1]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN([" + /backtrace/d + /killed/d +"]) AT_CLEANUP AT_SETUP([SSL db: implementation]) @@ -674,7 +688,10 @@ AT_CHECK_UNQUOTED( [grep "sslv3 alert handshake failure" output], [0], [stdout], [ignore]) -OVSDB_SERVER_SHUTDOWN +OVSDB_SERVER_SHUTDOWN([" + /stream_ssl|WARN/d + /Protocol error/d +"]) AT_CLEANUP OVS_START_SHELL_HELPERS @@ -701,7 +718,7 @@ ovsdb_check_online_compaction() { fi]) dnl Start ovsdb-server. on_exit 'kill `cat *.pid`' - AT_CHECK([ovsdb-server -vvlog:off -vconsole:off --detach --no-chdir --pidfile --remote=punix:socket --log-file db], [0]) + AT_CHECK([ovsdb-server -vvlog:off -vconsole:off --detach --no-chdir --pidfile --remote=punix:socket --log-file db], [0], [ignore], [ignore]) AT_CHECK([ovsdb_client_wait unix:socket ordinals connected]) AT_CAPTURE_FILE([ovsdb-server.log]) dnl Do a bunch of random transactions that put crap in the database log. 
@@ -837,8 +854,8 @@ _uuid name number dnl Then check that the dumped data is correct. This time first kill dnl and restart the database server to ensure that the data is correct on dnl disk as well as in memory. - OVS_APP_EXIT_AND_WAIT([ovsdb-server]) - AT_CHECK([ovsdb-server -vvlog:off -vconsole:off --detach --no-chdir --pidfile --remote=punix:socket --log-file db]) + OVSDB_SERVER_SHUTDOWN + AT_CHECK([ovsdb-server -vvlog:off -vconsole:off --detach --no-chdir --pidfile --remote=punix:socket --log-file db], [0], [ignore], [ignore]) AT_CHECK([ovsdb-client dump unix:socket ordinals], [0], [stdout]) AT_CHECK([uuidfilt stdout], [0], [dnl ordinals table @@ -893,7 +910,7 @@ ovsdb_check_online_conversion() { fi]) dnl Start the database server. - AT_CHECK([ovsdb-server -vfile -vvlog:off -vconsole:off --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db], [0]) + AT_CHECK([ovsdb-server -vfile -vvlog:off -vconsole:off --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db], [0], [ignore], [ignore]) AT_CAPTURE_FILE([ovsdb-server.log]) dnl Put some data in the database. @@ -1109,9 +1126,9 @@ _uuid number ]) dnl Now kill and restart the database server to ensure that the data is dnl correct on disk as well as in memory. - OVS_APP_EXIT_AND_WAIT([ovsdb-server]) + OVSDB_SERVER_SHUTDOWN AT_CHECK([[ovsdb-server -vfile -vvlog:off -vconsole:off --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db]], - [0]) + [0], [ignore], [ignore]) AT_CHECK([ovsdb-client dump unix:db.sock ordinals | uuidfilt], [0], [dnl ordinals table _uuid number @@ -1134,7 +1151,7 @@ _uuid number AT_CHECK([test -f dir/.db.~lock~]) fi - OVS_APP_EXIT_AND_WAIT([ovsdb-server]) + OVSDB_SERVER_SHUTDOWN } OVS_END_SHELL_HELPERS @@ -1243,7 +1260,7 @@ AT_CHECK([test $logged_updates -lt $logged_nonblock_updates]) AT_CHECK_UNQUOTED([ovs-vsctl get open_vswitch . 
system_version], [0], [xyzzy$counter ]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN AT_CLEANUP AT_SETUP([ovsdb-server transaction history size]) @@ -1326,7 +1343,7 @@ dnl still has a reasonable size. check_atoms AT_CHECK([test $(get_memory_value atoms) -eq $db_atoms_before_conversion]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN AT_CLEANUP AT_BANNER([OVSDB -- ovsdb-server transactions (SSL IPv4 sockets)]) @@ -1709,7 +1726,7 @@ AT_KEYWORDS([ovsdb server replication get-active]) ordinal_schema > schema AT_CHECK([ovsdb-tool create db schema], [0], [ignore], [ignore]) on_exit 'kill `cat *.pid`' -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --sync-from=tcp:127.0.0.1:9999 db]) +AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile --sync-from=tcp:127.0.0.1:9999 db], [0], [ignore], [ignore]) AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/get-active-ovsdb-server], [0], [tcp:127.0.0.1:9999 @@ -1722,7 +1739,7 @@ AT_KEYWORDS([ovsdb server replication set-active]) ordinal_schema > schema AT_CHECK([ovsdb-tool create db schema], [0], [ignore], [ignore]) on_exit 'kill `cat *.pid`' -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile db]) +AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile db], [0], [ignore], [ignore]) AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/set-active-ovsdb-server tcp:127.0.0.1:9999]) AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/get-active-ovsdb-server], @@ -1736,7 +1753,7 @@ AT_KEYWORDS([ovsdb server replication get-exclude-tables]) ordinal_schema > schema AT_CHECK([ovsdb-tool create db schema], [0], [ignore], [ignore]) on_exit 'kill `cat *.pid`' -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --sync-exclude-tables=mydb:db1,mydb:db2 db]) +AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile --sync-exclude-tables=mydb:db1,mydb:db2 db], [0], [ignore], [ignore]) AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/get-sync-exclude-tables], [0], [mydb:db1,mydb:db2 @@ 
-2079,7 +2096,7 @@ AT_CHECK( "row": {"target": "ptcp:0:127.0.0.1", "read_only": true}}]']], [0], [ignore], [ignore]) -AT_CHECK([ovsdb-server --log-file --detach --no-chdir --pidfile --remote=db:mydb,Root,managers db], [0], [ignore], [ignore]) +AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile --remote=db:mydb,Root,managers db], [0], [ignore], [ignore]) PARSE_LISTENING_PORT([ovsdb-server.log], [TCP_PORT]) AT_CHECK([ovsdb-client get-schema-version tcp:127.0.0.1:$TCP_PORT mydb], [0], [5.1.3 ]) @@ -2310,8 +2327,8 @@ AT_CHECK([uuidfilt monitor.stdout | sed '/^$/d'], [0], [dnl <8> delete 4 four <9> insert 4 four ]) -OVSDB_SERVER_SHUTDOWN -OVSDB_SERVER_SHUTDOWN2 +OVSDB_SERVER_SHUTDOWN(["/Address already in use/d"]) +OVSDB_SERVER_SHUTDOWN2(["/Address already in use/d"]) dnl Starting a replay. AT_CHECK([ovsdb-server --replay=./replay_dir dnl diff --git a/tests/ovsdb-tool.at b/tests/ovsdb-tool.at index 5496ccda77d..d8d2b1c9990 100644 --- a/tests/ovsdb-tool.at +++ b/tests/ovsdb-tool.at @@ -118,11 +118,11 @@ AT_CHECK([[uuidfilt db | grep -v ^OVSDB | sed 's/"_date":[0-9]*/"_date":0/' | \ dnl Dump out and check the actual database contents. on_exit 'kill `cat ovsdb-server.pid`' -AT_CHECK([[ovsdb-server --detach --pidfile --no-chdir --remote=punix:socket db]], +AT_CHECK([[ovsdb-server --detach --pidfile --log-file --no-chdir --remote=punix:socket db]], [0], [stdout], [ignore]) AT_CHECK([[ovsdb-client dump unix:socket ordinals]], [0], [stdout], [ignore]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN AT_CHECK([uuidfilt stdout], [0], [dnl ordinals table @@ -151,11 +151,11 @@ dnl in it now. 
AT_CAPTURE_FILE([db]) AT_CHECK([test `wc -l < db` -eq 4]) dnl And check that the dumped data is the same too: -AT_CHECK([[ovsdb-server --detach --pidfile --no-chdir --remote=punix:socket db]], +AT_CHECK([[ovsdb-server --detach --pidfile --log-file --no-chdir --remote=punix:socket db]], [0], [stdout], [ignore]) AT_CHECK([[ovsdb-client dump unix:socket ordinals]], [0], [stdout], [ignore]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN AT_CHECK([uuidfilt stdout], [0], [dnl ordinals table @@ -196,8 +196,8 @@ AT_CHECK( done]], [0], [stdout], [ignore]) dnl Dump out and check the actual database contents. -AT_CHECK([[ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket db]], - [0]) +AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile --remote=punix:socket db], + [0], [ignore], [ignore]) AT_CHECK([ovsdb-client dump unix:socket ordinals], [0], [stdout], [ignore]) AT_CHECK([uuidfilt stdout], [0], [dnl ordinals table @@ -210,7 +210,7 @@ _uuid name number <4> two 2 <5> zero 0 ]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN dnl Now convert the database in-place. touch .db.tmp.~lock~ AT_CHECK([[ovsdb-tool convert db new-schema]], [0], [], [ignore]) @@ -220,8 +220,8 @@ dnl in it now. 
AT_CAPTURE_FILE([db]) AT_CHECK([test `wc -l < db` -eq 4]) dnl And check that the dumped data is the same except for the removed column: -AT_CHECK([[ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket db]], - [0]) +AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile --remote=punix:socket db], + [0], [ignore], [ignore]) AT_CHECK([ovsdb-client dump unix:socket ordinals], [0], [stdout], [ignore]) AT_CHECK([uuidfilt stdout], [0], [dnl ordinals table @@ -234,7 +234,7 @@ _uuid number <4> 4 <5> 5 ]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN AT_CLEANUP AT_SETUP([ovsdb-tool convert -- adding a column]) @@ -262,8 +262,8 @@ AT_CHECK( done]], [0], [stdout], [ignore]) dnl Dump out and check the actual database contents. -AT_CHECK([[ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket db]], - [0]) +AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile --remote=punix:socket db], + [0], [ignore], [ignore]) AT_CHECK([ovsdb-client dump unix:socket ordinals], [0], [stdout], [ignore]) AT_CHECK([uuidfilt stdout], [0], [dnl ordinals table @@ -276,7 +276,7 @@ _uuid number <4> 4 <5> 5 ]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN dnl Now convert the database in-place. touch .db.tmp.~lock~ AT_CHECK([[ovsdb-tool convert db new-schema]], [0], [], [ignore]) @@ -286,8 +286,8 @@ dnl in it now. 
AT_CAPTURE_FILE([db]) AT_CHECK([test `wc -l < db` -eq 4]) dnl And check that the dumped data is the same except for the added column: -AT_CHECK([[ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket db]], - [0]) +AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile --remote=punix:socket db], + [0], [ignore], [ignore]) AT_CHECK([ovsdb-client dump unix:socket ordinals], [0], [stdout], [ignore]) AT_CHECK([uuidfilt stdout], [0], [dnl ordinals table @@ -300,7 +300,7 @@ _uuid name number <4> "" 4 <5> "" 5 ]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN AT_CLEANUP AT_SETUP([ovsdb-tool unsupported cluster operations]) @@ -446,7 +446,7 @@ AT_CHECK( # Dump the data. AT_CHECK([ovsdb-server -vfile -vvlog:off --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db1]) AT_CHECK([ovsdb-client dump > expout]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN # Create a clustered database from the standalone one. ovsdb-tool create-cluster db2 db1 unix:s1.raft @@ -455,7 +455,7 @@ ovsdb-tool create-cluster db2 db1 unix:s1.raft AT_CHECK([ovsdb-server -vconsole:off -vfile -vvlog:off --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db2]) AT_CHECK([ovsdb_client_wait ordinals connected]) AT_CHECK([ovsdb-client dump > dump2]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN # Make sure that the clustered data matched the standalone data. AT_CHECK([cat dump2], [0], [expout]) @@ -482,7 +482,7 @@ done AT_CHECK([ovsdb-client transact unix:socket '[["ordinals"]]'], [0], [ignore], [ignore]) AT_CHECK([ovsdb-client dump unix:socket > clusterdump]) -AT_CHECK([ovs-appctl -t ovsdb-server -e exit], [0], [ignore], [ignore]) +OVSDB_SERVER_SHUTDOWN # Convert to standalone database from clustered database. 
AT_CHECK(ovsdb-tool cluster-to-standalone db1 db) @@ -494,7 +494,7 @@ AT_CHECK([ovsdb-tool db-is-standalone db1]) AT_CHECK([ovsdb-server -vconsole:off -vfile -vvlog:off --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db1]) AT_CHECK([ovsdb_client_wait ordinals connected]) AT_CHECK([ovsdb-client dump > standalonedump]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN # Make sure both standalone and cluster db data matches. AT_CHECK([diff standalonedump clusterdump]) @@ -549,7 +549,7 @@ _uuid is_seven name number <3> true seven 7 ]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN dnl Convert to standalone database from clustered database. AT_CHECK(ovsdb-tool cluster-to-standalone db1 db) @@ -562,7 +562,7 @@ AT_CHECK([ovsdb-server -vconsole:off -vfile -vvlog:off --detach --no-chdir dnl --pidfile --log-file --remote=punix:db.sock db1]) AT_CHECK([ovsdb_client_wait ordinals connected]) AT_CHECK([ovsdb-client dump > standalonedump]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN dnl Make sure both standalone and cluster db data matches. AT_CHECK([diff standalonedump clusterdump]) diff --git a/tests/vtep-ctl.at b/tests/vtep-ctl.at index 98067658446..e4ddfe5df03 100644 --- a/tests/vtep-ctl.at +++ b/tests/vtep-ctl.at @@ -19,7 +19,7 @@ dnl Creates an empty database in the current directory and then starts dnl an ovsdb-server on it for vtep-ctl to connect to. m4_define([VTEP_CTL_SETUP], [VTEP_OVSDB_INIT([db]) - AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket db >/dev/null 2>&1], [0], [ignore], [ignore])]) + AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile --remote=punix:socket db], [0], [ignore], [ignore])]) dnl VTEP_CTL_CLEANUP dnl From ef1da757f01670d19a34cc176031a70482ec003d Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 26 May 2023 19:18:43 +0200 Subject: [PATCH 265/833] ovsdb: condition: Process condition changes incrementally. 
In most cases, after the condition change request, the new condition is the same as the old one, plus or minus a few clauses. Today, ovsdb-server will evaluate every database row against all the old clauses and then against all the new clauses in order to tell if an update should be generated. For example, every time a new port is added, ovn-controller adds two new clauses to conditions for a Port_Binding table. And this condition may grow significantly in size, making the addition of every new port heavier on the server side. The difference between the conditions is no larger, and likely significantly smaller, than the old and new conditions combined. And if the row doesn't match clauses that are different between old and new conditions, that row should not be part of the update. It either matches both old and new, or it doesn't match either of them. If the row matches some clauses in the difference, then we need to perform a full match against old and new in order to tell if it should be added/removed/modified. This is necessary because different clauses may select the same rows. Let's generate the condition difference and use it to avoid evaluation of all the clauses for rows not affected by the condition change. Testing shows 70% reduction in total CPU time in ovn-heater's 120-node density-light test with conditional monitoring. Average CPU usage during the test phase went down from frequent 100% spikes to just 6-8%. Note: This will not help with new connections, or re-connections, or new monitor requests after database conversion. ovsdb-server will still evaluate every database row against every clause in the condition in these cases. So, it's still important to not have too many clauses in conditions for large tables. 
Reviewed-by: Simon Horman Signed-off-by: Ilya Maximets --- ovsdb/condition.c | 56 +++++++++++++++++++++++++++++++++++++ ovsdb/condition.h | 9 ++++++ ovsdb/monitor.c | 71 ++++++++++++++++++++++++++++++++--------------- 3 files changed, 114 insertions(+), 22 deletions(-) diff --git a/ovsdb/condition.c b/ovsdb/condition.c index d0016fa7f79..09c89b2a02c 100644 --- a/ovsdb/condition.c +++ b/ovsdb/condition.c @@ -497,6 +497,62 @@ ovsdb_condition_cmp_3way(const struct ovsdb_condition *a, return 0; } +/* Given conditions 'a' and 'b', composes a new condition 'diff' that contains + * clauses that are present in one of the conditions, but not in the other. + * + * If some data doesn't match the resulted 'diff' condition, that means one of: + * 1. The data matches both 'a' and 'b'. + * 2. The data does not match either 'a' or 'b'. + * + * However, that is not true if one of the original conditions is a trivial + * True or False. In this case the function will currently just return an + * empty (True) condition. */ +void +ovsdb_condition_diff(struct ovsdb_condition *diff, + const struct ovsdb_condition *a, + const struct ovsdb_condition *b) +{ + size_t i, j; + int cmp; + + ovsdb_condition_init(diff); + + if (ovsdb_condition_is_trivial(a) || ovsdb_condition_is_trivial(b)) { + return; + } + + diff->clauses = xcalloc(a->n_clauses + b->n_clauses, + sizeof *diff->clauses); + + /* Clauses are sorted. 
*/ + for (i = j = 0; i < a->n_clauses && j < b->n_clauses;) { + cmp = compare_clauses_3way_with_data(&a->clauses[i], &b->clauses[j]); + if (cmp < 0) { + ovsdb_clause_clone(&diff->clauses[diff->n_clauses++], + &a->clauses[i++]); + } else if (cmp > 0) { + ovsdb_clause_clone(&diff->clauses[diff->n_clauses++], + &b->clauses[j++]); + } else { + i++; + j++; + } + } + for (; i < a->n_clauses; i++) { + ovsdb_clause_clone(&diff->clauses[diff->n_clauses++], + &a->clauses[i]); + } + for (; j < b->n_clauses; j++) { + ovsdb_clause_clone(&diff->clauses[diff->n_clauses++], + &b->clauses[j]); + } + + diff->optimized = a->optimized && b->optimized; + if (diff->optimized) { + ovsdb_condition_optimize(diff); + } +} + void ovsdb_condition_clone(struct ovsdb_condition *to, const struct ovsdb_condition *from) diff --git a/ovsdb/condition.h b/ovsdb/condition.h index c794966ce94..95e4c4f2033 100644 --- a/ovsdb/condition.h +++ b/ovsdb/condition.h @@ -58,6 +58,9 @@ bool ovsdb_condition_match_any_clause(const struct ovsdb_datum *, unsigned int index_map[]); int ovsdb_condition_cmp_3way(const struct ovsdb_condition *a, const struct ovsdb_condition *b); +void ovsdb_condition_diff(struct ovsdb_condition *, + const struct ovsdb_condition *, + const struct ovsdb_condition *); void ovsdb_condition_clone(struct ovsdb_condition *to, const struct ovsdb_condition *from); bool ovsdb_condition_is_true(const struct ovsdb_condition *cond); @@ -66,6 +69,12 @@ const struct ovsdb_column ** ovsdb_condition_get_columns(const struct ovsdb_condition *cond, size_t *n_columns); +static inline bool +ovsdb_condition_is_trivial(const struct ovsdb_condition *cond) +{ + return ovsdb_condition_is_true(cond) || ovsdb_condition_is_false(cond); +} + static inline bool ovsdb_condition_empty_or_match_any(const struct ovsdb_datum *row_datum, const struct ovsdb_condition *cnd, diff --git a/ovsdb/monitor.c b/ovsdb/monitor.c index 3cdd03b20fa..04dcd229891 100644 --- a/ovsdb/monitor.c +++ b/ovsdb/monitor.c @@ -55,6 +55,10 @@ 
struct ovsdb_monitor_table_condition { struct ovsdb_monitor_table *mt; struct ovsdb_condition old_condition; struct ovsdb_condition new_condition; + + /* Condition composed from difference between clauses in old and new. + * Note: Empty diff condition doesn't mean that old == new. */ + struct ovsdb_condition diff_condition; }; /* Backend monitor. @@ -713,6 +717,7 @@ ovsdb_monitor_session_condition_destroy( ovsdb_condition_destroy(&mtc->new_condition); ovsdb_condition_destroy(&mtc->old_condition); + ovsdb_condition_destroy(&mtc->diff_condition); shash_delete(&condition->tables, node); free(mtc); } @@ -733,6 +738,7 @@ ovsdb_monitor_table_condition_create( mtc->table = table; ovsdb_condition_init(&mtc->old_condition); ovsdb_condition_init(&mtc->new_condition); + ovsdb_condition_init(&mtc->diff_condition); if (json_cnd) { error = ovsdb_condition_from_json(table->schema, @@ -746,7 +752,7 @@ ovsdb_monitor_table_condition_create( } shash_add(&condition->tables, table->schema->name, mtc); - /* On session startup old == new condition */ + /* On session startup old == new condition, diff is empty. 
*/ ovsdb_condition_clone(&mtc->new_condition, &mtc->old_condition); ovsdb_monitor_session_condition_set_mode(condition); @@ -758,7 +764,8 @@ ovsdb_monitor_get_table_conditions( const struct ovsdb_monitor_table *mt, const struct ovsdb_monitor_session_condition *condition, struct ovsdb_condition **old_condition, - struct ovsdb_condition **new_condition) + struct ovsdb_condition **new_condition, + struct ovsdb_condition **diff_condition) { if (!condition) { return false; @@ -772,6 +779,7 @@ ovsdb_monitor_get_table_conditions( } *old_condition = &mtc->old_condition; *new_condition = &mtc->new_condition; + *diff_condition = &mtc->diff_condition; return true; } @@ -800,6 +808,8 @@ ovsdb_monitor_table_condition_update( ovsdb_condition_destroy(&mtc->new_condition); ovsdb_condition_clone(&mtc->new_condition, &cond); ovsdb_condition_destroy(&cond); + ovsdb_condition_diff(&mtc->diff_condition, + &mtc->old_condition, &mtc->new_condition); ovsdb_monitor_condition_add_columns(dbmon, table, &mtc->new_condition); @@ -815,11 +825,14 @@ ovsdb_monitor_table_condition_updated(struct ovsdb_monitor_table *mt, shash_find_data(&condition->tables, mt->table->schema->name); if (mtc) { - /* If conditional monitoring - set old condition to new condition */ + /* If conditional monitoring - set old condition to new condition + * and clear the diff. 
*/ if (ovsdb_condition_cmp_3way(&mtc->old_condition, &mtc->new_condition)) { ovsdb_condition_destroy(&mtc->old_condition); ovsdb_condition_clone(&mtc->old_condition, &mtc->new_condition); + ovsdb_condition_destroy(&mtc->diff_condition); + ovsdb_condition_init(&mtc->diff_condition); ovsdb_monitor_session_condition_set_mode(condition); } } @@ -834,29 +847,42 @@ ovsdb_monitor_row_update_type_condition( const struct ovsdb_datum *old, const struct ovsdb_datum *new) { - struct ovsdb_condition *old_condition, *new_condition; + struct ovsdb_condition *old_condition, *new_condition, *diff_condition; enum ovsdb_monitor_selection type = ovsdb_monitor_row_update_type(initial, old, new); if (ovsdb_monitor_get_table_conditions(mt, condition, &old_condition, - &new_condition)) { - bool old_cond = !old ? false - : ovsdb_condition_empty_or_match_any(old, - old_condition, - row_type == OVSDB_MONITOR_ROW ? - mt->columns_index_map : - NULL); - bool new_cond = !new ? false - : ovsdb_condition_empty_or_match_any(new, - new_condition, - row_type == OVSDB_MONITOR_ROW ? - mt->columns_index_map : - NULL); - - if (!old_cond && !new_cond) { + &new_condition, + &diff_condition)) { + unsigned int *index_map = row_type == OVSDB_MONITOR_ROW + ? mt->columns_index_map : NULL; + bool old_cond = false, new_cond = false; + + if (old && old == new + && !ovsdb_condition_empty_or_match_any(old, diff_condition, + index_map)) { + /* Condition changed, but not the data. And the row is not + * affected by the condition change. It either mathes or + * doesn't match both old and new conditions at the same time. + * In any case, this row should not be part of the update. */ type = OJMS_NONE; + } else { + /* The row changed or the condition change affects this row. + * Need to fully check old and new conditions. 
*/ + if (old) { + old_cond = ovsdb_condition_empty_or_match_any( + old, old_condition, index_map); + } + if (new) { + new_cond = ovsdb_condition_empty_or_match_any( + new, new_condition, index_map); + } + + if (!old_cond && !new_cond) { + type = OJMS_NONE; + } } switch (type) { @@ -1155,15 +1181,16 @@ ovsdb_monitor_compose_cond_change_update( unsigned long int *changed = xmalloc(bitmap_n_bytes(max_columns)); SHASH_FOR_EACH (node, &dbmon->tables) { + struct ovsdb_condition *old_condition, *new_condition, *diff_condition; struct ovsdb_monitor_table *mt = node->data; - struct ovsdb_row *row; struct json *table_json = NULL; - struct ovsdb_condition *old_condition, *new_condition; + struct ovsdb_row *row; if (!ovsdb_monitor_get_table_conditions(mt, condition, &old_condition, - &new_condition) || + &new_condition, + &diff_condition) || !ovsdb_condition_cmp_3way(old_condition, new_condition)) { /* Nothing to update on this table */ continue; From 359cabbd6eb2cee19f6aa7db749a1a0c59e25292 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Wed, 31 May 2023 15:22:20 +0200 Subject: [PATCH 266/833] netdev-offload: Fix some typos. Caught while reviewing code. 
Fixes: aca2f8a8a6b6 ("netdev-offload-dpdk: Implement HW miss packet recover for vport.") Fixes: 241bad15d99a ("dpif-netdev: associate flow with a mark id") Acked-by: Eelco Chaudron Signed-off-by: David Marchand Signed-off-by: Ilya Maximets --- lib/netdev-offload-dpdk.c | 2 +- lib/netdev-offload.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c index 2d7858f51ce..14bc877719c 100644 --- a/lib/netdev-offload-dpdk.c +++ b/lib/netdev-offload-dpdk.c @@ -2672,7 +2672,7 @@ netdev_offload_dpdk_hw_miss_packet_recover(struct netdev *netdev, if (rte_restore_info.flags & RTE_FLOW_RESTORE_INFO_ENCAPSULATED) { if (!vport_netdev->netdev_class || !vport_netdev->netdev_class->pop_header) { - VLOG_ERR_RL(&rl, "vport nedtdev=%s with no pop_header method", + VLOG_ERR_RL(&rl, "vport netdev=%s with no pop_header method", netdev_get_name(vport_netdev)); ret = EOPNOTSUPP; goto close_vport_netdev; diff --git a/lib/netdev-offload.h b/lib/netdev-offload.h index edc843cd99a..47f8e6f48b7 100644 --- a/lib/netdev-offload.h +++ b/lib/netdev-offload.h @@ -72,7 +72,7 @@ struct offload_info { * sync with datapath recirc ids. */ /* - * The flow mark id assigened to the flow. If any pkts hit the flow, + * The flow mark id assigned to the flow. If any pkts hit the flow, * it will be in the pkt meta data. */ uint32_t flow_mark; From 8bcc6d694c8628820b0c924a4728e71e21828bd5 Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Wed, 31 May 2023 11:37:55 +0200 Subject: [PATCH 267/833] netdev-dpdk: Fix warning with gcc 13. GCC now reports uninitialized warnings from function return values. 
../lib/netdev-dpdk.c: In function 'netdev_dpdk_mempool_configure': ../lib/netdev-dpdk.c:964:22: warning: 'dmp' may be used uninitialized [-Wmaybe-uninitialized] 964 | dev->dpdk_mp = dmp; | ~~~~~~~~~~~~~^~~~~ ../lib/netdev-dpdk.c:854:21: note: 'dmp' was declared here 854 | struct dpdk_mp *dmp, *next; | ^~~ NB: this looks like a false positive, gcc 13 probably fails to see the link between reuse and dmp in dpdk_mp_get(). Reviewed-by: David Marchand Signed-off-by: Robin Jarry Signed-off-by: Ilya Maximets --- lib/netdev-dpdk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 6bf672d43d5..8cb1a77031e 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -840,7 +840,7 @@ dpdk_mp_create(struct netdev_dpdk *dev, int mtu) static struct dpdk_mp * dpdk_mp_get(struct netdev_dpdk *dev, int mtu) { - struct dpdk_mp *dmp, *next; + struct dpdk_mp *dmp = NULL, *next; bool reuse = false; ovs_mutex_lock(&dpdk_mp_mutex); From 64cdc290ef441bc3b4c2cddc230311ba58bc31b3 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 31 May 2023 21:23:41 +0200 Subject: [PATCH 268/833] appveyor: Silence the git clone of pthreads4w. Git by default reports progress on stderr. This doesn't fail the build, but upsets the powershell: git : Cloning into 'c:\pthreads4w-code'... At line:3 char:1 + git clone https://git.code.sf.net/p/pthreads4w/code c:\pthreads4w-cod ... + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + CategoryInfo : NotSpecified: (Cloning into 'c:\pthreads4w-code'...:String) [], RemoteException + FullyQualifiedErrorId : NativeCommandError Silence the git clone to avoid the warning. 
Acked-by: Alin Gabriel Serdean Signed-off-by: Ilya Maximets --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 25c3f69fb48..25f69bb8d11 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -29,7 +29,7 @@ init: cd C:\openvswitch - git clone https://git.code.sf.net/p/pthreads4w/code c:\pthreads4w-code + git clone -q https://git.code.sf.net/p/pthreads4w/code c:\pthreads4w-code python3 -m pip install pypiwin32 --disable-pip-version-check From 106ef21860c935e5e0017a88bf42b94025c4e511 Mon Sep 17 00:00:00 2001 From: Frode Nordahl Date: Tue, 6 Jun 2023 20:33:35 +0200 Subject: [PATCH 269/833] tc: Fix crash on malformed reply from kernel. The tc module combines the use of the `tc_transact` helper function for communication with the in-kernel tc infrastructure with assertions on the reply data by `ofpbuf_at_assert` on the received data prior to further processing. With the presence of bugs on the kernel side, we need to treat the kernel as an unreliable service provider and replace assertions on the reply from it with checks to avoid a fatal crash of OVS. 
For the record, the symptom of the crash is this in the log: EMER|include/openvswitch/ofpbuf.h:194: assertion offset + size <= b->size failed in ofpbuf_at_assert() And an excerpt of the backtrace looks like this: ofpbuf_at_assert (offset=16, size=20) at include/openvswitch/ofpbuf.h:194 tc_replace_flower at lib/tc.c:3223 netdev_tc_flow_put at lib/netdev-offload-tc.c:2096 netdev_flow_put at lib/netdev-offload.c:257 parse_flow_put at lib/dpif-netlink.c:2297 try_send_to_netdev at lib/dpif-netlink.c:2384 Reported-At: https://launchpad.net/bugs/2018500 Fixes: 5c039ddc64ff ("netdev-linux: Add functions to manipulate tc police action") Fixes: e7f6ba220e10 ("lib/tc: add ingress ratelimiting support for tc-offload") Fixes: f98e418fbdb6 ("tc: Add tc flower functions") Fixes: c1c9c9c4b636 ("Implement QoS framework.") Signed-off-by: Frode Nordahl Signed-off-by: Ilya Maximets --- lib/netdev-linux.c | 33 +++++++++++++++++++++--------- lib/tc.c | 50 ++++++++++++++++++++++++++++++++-------------- 2 files changed, 59 insertions(+), 24 deletions(-) diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 36620199ec8..49c74346a42 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -2714,8 +2714,16 @@ tc_add_matchall_policer(struct netdev *netdev, uint32_t kbits_rate, err = tc_transact(&request, &reply); if (!err) { - struct tcmsg *tc = - ofpbuf_at_assert(reply, NLMSG_HDRLEN, sizeof *tc); + struct ofpbuf b = ofpbuf_const_initializer(reply->data, reply->size); + struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg); + struct tcmsg *tc = ofpbuf_try_pull(&b, sizeof *tc); + + if (!nlmsg || !tc) { + VLOG_ERR_RL(&rl, + "Failed to add match all policer, malformed reply"); + ofpbuf_delete(reply); + return EPROTO; + } ofpbuf_delete(reply); } @@ -5744,26 +5752,27 @@ static int tc_update_policer_action_stats(struct ofpbuf *msg, struct ofputil_meter_stats *stats) { + struct ofpbuf b = ofpbuf_const_initializer(msg->data, msg->size); + struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, 
sizeof *nlmsg); + struct tcamsg *tca = ofpbuf_try_pull(&b, sizeof *tca); struct ovs_flow_stats stats_dropped; struct ovs_flow_stats stats_hw; struct ovs_flow_stats stats_sw; const struct nlattr *act; struct nlattr *prio; - struct tcamsg *tca; int error = 0; if (!stats) { goto exit; } - if (NLMSG_HDRLEN + sizeof *tca > msg->size) { + if (!nlmsg || !tca) { VLOG_ERR_RL(&rl, "Failed to get action stats, size error"); error = EPROTO; goto exit; } - tca = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tca); - act = nl_attr_find(msg, NLMSG_HDRLEN + sizeof *tca, TCA_ACT_TAB); + act = nl_attr_find(&b, 0, TCA_ACT_TAB); if (!act) { VLOG_ERR_RL(&rl, "Failed to get action stats, can't find attribute"); error = EPROTO; @@ -6028,20 +6037,26 @@ static int tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep, struct nlattr **options, struct netdev_queue_stats *stats) { + struct ofpbuf b = ofpbuf_const_initializer(msg->data, msg->size); + struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg); + struct tcmsg *tc = ofpbuf_try_pull(&b, sizeof *tc); static const struct nl_policy tca_policy[] = { [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false }, [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false }, }; struct nlattr *ta[ARRAY_SIZE(tca_policy)]; - if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg), - tca_policy, ta, ARRAY_SIZE(ta))) { + if (!nlmsg || !tc) { + VLOG_ERR_RL(&rl, "failed to parse class message, malformed reply"); + goto error; + } + + if (!nl_policy_parse(&b, 0, tca_policy, ta, ARRAY_SIZE(ta))) { VLOG_WARN_RL(&rl, "failed to parse class message"); goto error; } if (handlep) { - struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc); *handlep = tc->tcm_handle; } diff --git a/lib/tc.c b/lib/tc.c index 5c32c6f971d..270dc95ce53 100644 --- a/lib/tc.c +++ b/lib/tc.c @@ -36,6 +36,7 @@ #include #include "byte-order.h" +#include "coverage.h" #include "netlink-socket.h" #include "netlink.h" #include "openvswitch/ofpbuf.h" @@ -67,6 +68,8 
@@ VLOG_DEFINE_THIS_MODULE(tc); +COVERAGE_DEFINE(tc_netlink_malformed_reply); + static struct vlog_rate_limit error_rl = VLOG_RATE_LIMIT_INIT(60, 5); static enum tc_offload_policy tc_policy = TC_POLICY_NONE; @@ -2190,18 +2193,19 @@ int parse_netlink_to_tc_flower(struct ofpbuf *reply, struct tcf_id *id, struct tc_flower *flower, bool terse) { - struct tcmsg *tc; + struct ofpbuf b = ofpbuf_const_initializer(reply->data, reply->size); + struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg); + struct tcmsg *tc = ofpbuf_try_pull(&b, sizeof *tc); struct nlattr *ta[ARRAY_SIZE(tca_policy)]; const char *kind; - if (NLMSG_HDRLEN + sizeof *tc > reply->size) { + if (!nlmsg || !tc) { + COVERAGE_INC(tc_netlink_malformed_reply); return EPROTO; } memset(flower, 0, sizeof *flower); - tc = ofpbuf_at_assert(reply, NLMSG_HDRLEN, sizeof *tc); - flower->key.eth_type = (OVS_FORCE ovs_be16) tc_get_minor(tc->tcm_info); flower->mask.eth_type = OVS_BE16_MAX; id->prio = tc_get_major(tc->tcm_info); @@ -2215,8 +2219,7 @@ parse_netlink_to_tc_flower(struct ofpbuf *reply, struct tcf_id *id, return EAGAIN; } - if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof *tc, - tca_policy, ta, ARRAY_SIZE(ta))) { + if (!nl_policy_parse(&b, 0, tca_policy, ta, ARRAY_SIZE(ta))) { VLOG_ERR_RL(&error_rl, "failed to parse tca policy"); return EPROTO; } @@ -2237,13 +2240,17 @@ parse_netlink_to_tc_flower(struct ofpbuf *reply, struct tcf_id *id, int parse_netlink_to_tc_chain(struct ofpbuf *reply, uint32_t *chain) { + struct ofpbuf b = ofpbuf_const_initializer(reply->data, reply->size); + struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg); + struct tcmsg *tc = ofpbuf_try_pull(&b, sizeof *tc); struct nlattr *ta[ARRAY_SIZE(tca_chain_policy)]; - struct tcmsg *tc; - tc = ofpbuf_at_assert(reply, NLMSG_HDRLEN, sizeof *tc); + if (!nlmsg || !tc) { + COVERAGE_INC(tc_netlink_malformed_reply); + return EPROTO; + } - if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof *tc, - tca_chain_policy, ta, ARRAY_SIZE(ta))) { + if 
(!nl_policy_parse(&b, 0, tca_chain_policy, ta, ARRAY_SIZE(ta))) { VLOG_ERR_RL(&error_rl, "failed to parse tca chain policy"); return EINVAL; } @@ -2307,21 +2314,27 @@ int parse_netlink_to_tc_policer(struct ofpbuf *reply, uint32_t police_idx[]) { static struct nl_policy actions_orders_policy[TCA_ACT_MAX_PRIO] = {}; + struct ofpbuf b = ofpbuf_const_initializer(reply->data, reply->size); struct nlattr *actions_orders[ARRAY_SIZE(actions_orders_policy)]; + struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg); const int max_size = ARRAY_SIZE(actions_orders_policy); + struct tcamsg *tca = ofpbuf_try_pull(&b, sizeof *tca); const struct nlattr *actions; struct tc_flower flower; - struct tcamsg *tca; int i, cnt = 0; int err; + if (!nlmsg || !tca) { + COVERAGE_INC(tc_netlink_malformed_reply); + return EPROTO; + } + for (i = 0; i < max_size; i++) { actions_orders_policy[i].type = NL_A_NESTED; actions_orders_policy[i].optional = true; } - tca = ofpbuf_at_assert(reply, NLMSG_HDRLEN, sizeof *tca); - actions = nl_attr_find(reply, NLMSG_HDRLEN + sizeof *tca, TCA_ACT_TAB); + actions = nl_attr_find(&b, 0, TCA_ACT_TAB); if (!actions || !nl_parse_nested(actions, actions_orders_policy, actions_orders, max_size)) { VLOG_ERR_RL(&error_rl, @@ -3823,8 +3836,15 @@ tc_replace_flower(struct tcf_id *id, struct tc_flower *flower) error = tc_transact(&request, &reply); if (!error) { - struct tcmsg *tc = - ofpbuf_at_assert(reply, NLMSG_HDRLEN, sizeof *tc); + struct ofpbuf b = ofpbuf_const_initializer(reply->data, reply->size); + struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg); + struct tcmsg *tc = ofpbuf_try_pull(&b, sizeof *tc); + + if (!nlmsg || !tc) { + COVERAGE_INC(tc_netlink_malformed_reply); + ofpbuf_delete(reply); + return EPROTO; + } id->prio = tc_get_major(tc->tcm_info); id->handle = tc->tcm_handle; From 474a179aff6c4199d8007910e3f79f000af9d659 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Wed, 7 Jun 2023 10:24:40 +0200 Subject: [PATCH 270/833] cpu: Fix cpuid 
check for some AMD processors. Some venerable AMD processors do not support querying extended features (EAX=7) with cpuid. In this case, it is not a programmatic error and the runtime check should simply return the isa is unsupported. Reported-by: Davide Repetto Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2211747 Fixes: b366fa2f4947 ("dpif-netdev: Call cpuid for x86 isa availability.") Signed-off-by: David Marchand Signed-off-by: Ilya Maximets --- lib/cpu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/cpu.c b/lib/cpu.c index 0292f715ec4..fbbea400535 100644 --- a/lib/cpu.c +++ b/lib/cpu.c @@ -37,7 +37,9 @@ static bool x86_has_isa(uint32_t leaf, enum x86_reg reg, uint32_t bit) { uint32_t regs[4]; - ovs_assert(__get_cpuid_max(leaf & X86_LEAF_MASK, NULL) >= leaf); + if (__get_cpuid_max(leaf & X86_LEAF_MASK, NULL) < leaf) { + return false; + } __cpuid_count(leaf, 0, regs[EAX], regs[EBX], regs[ECX], regs[EDX]); return (regs[reg] & ((uint32_t) 1 << bit)) != 0; From 759a29dc2d97bfba9e3c5270a621beca673962ca Mon Sep 17 00:00:00 2001 From: Ales Musil Date: Tue, 30 May 2023 09:34:12 +0200 Subject: [PATCH 271/833] backtrace: Extend the backtrace functionality. Use the backtrace functions that is provided by libc, this allows us to get backtrace that is independent of the current memory map of the process. Which in turn can be used for debugging/tracing purpose. The backtrace is not 100% accurate due to various optimizations, most notably "-fomit-frame-pointer" and LTO. This might result that the line in source file doesn't correspond to the real line. However, it should be able to pinpoint at least the function where the backtrace was called. The implementation is determined during compilation based on available libraries. Libunwind has higher priority if both methods are available to keep the compatibility with current behavior. 
The backtrace is not marked as signal safe however the backtrace manual page gives more detailed explanation why it might be the case [0]. Load the "libgcc" or equivalent in advance within the "fatal_signal_init" which should ensure that subsequent calls to backtrace* do not call malloc and are signal safe. The typical backtrace will look similar to the one below: /lib64/libopenvswitch-3.1.so.0(backtrace_capture+0x1e) [0x7fc5db298dfe] /lib64/libopenvswitch-3.1.so.0(log_backtrace_at+0x57) [0x7fc5db2999e7] /lib64/libovsdb-3.1.so.0(ovsdb_txn_complete+0x7b) [0x7fc5db56247b] /lib64/libovsdb-3.1.so.0(ovsdb_txn_propose_commit_block+0x8d) [0x7fc5db563a8d] ovsdb-server(+0xa661) [0x562cfce2e661] ovsdb-server(+0x7e39) [0x562cfce2be39] /lib64/libc.so.6(+0x27b4a) [0x7fc5db048b4a] /lib64/libc.so.6(__libc_start_main+0x8b) [0x7fc5db048c0b] ovsdb-server(+0x8c35) [0x562cfce2cc35] backtrace.h elaborates on how to effectively get the line information associated with the addressed presented in the backtrace. [0] backtrace() and backtrace_symbols_fd() don't call malloc() explicitly, but they are part of libgcc, which gets loaded dynamically when first used. Dynamic loading usually triggers a call to malloc(3). 
If you need certain calls to these two functions to not allocate memory (in signal handlers, for example), you need to make sure libgcc is loaded beforehand Reported-at: https://bugzilla.redhat.com/2177760 Signed-off-by: Ales Musil Signed-off-by: Ilya Maximets --- include/openvswitch/vlog.h | 3 + lib/backtrace.c | 128 ++++++++++++++++++++++++++----------- lib/backtrace.h | 58 ++++++++++------- lib/fatal-signal.c | 53 +++++++++++++-- lib/ovsdb-error.c | 6 +- lib/vlog.c | 7 ++ m4/openvswitch.m4 | 8 ++- tests/atlocal.in | 2 + tests/daemon.at | 51 +++++++++++++++ 9 files changed, 244 insertions(+), 72 deletions(-) diff --git a/include/openvswitch/vlog.h b/include/openvswitch/vlog.h index e53ce6d8145..481e1c0f0a8 100644 --- a/include/openvswitch/vlog.h +++ b/include/openvswitch/vlog.h @@ -148,6 +148,9 @@ void vlog_set_syslog_target(const char *target); /* Write directly to log file. */ void vlog_direct_write_to_log_file_unsafe(const char *s); +/* Return the current log file descriptor. */ +int vlog_get_log_file_fd_unsafe(void); + /* Initialization. 
*/ void vlog_init(void); void vlog_enable_async(void); diff --git a/lib/backtrace.c b/lib/backtrace.c index 2853d5ff150..65c92fd723c 100644 --- a/lib/backtrace.c +++ b/lib/backtrace.c @@ -32,12 +32,27 @@ VLOG_DEFINE_THIS_MODULE(backtrace); void backtrace_capture(struct backtrace *b) { - void *frames[BACKTRACE_MAX_FRAMES]; - int i; + b->n_frames = backtrace(b->frames, BACKTRACE_MAX_FRAMES); +} + +void +backtrace_format(struct ds *ds, const struct backtrace *bt, + const char *delimiter) +{ + if (bt->n_frames) { + char **symbols = backtrace_symbols(bt->frames, bt->n_frames); + + if (!symbols) { + return; + } - b->n_frames = backtrace(frames, BACKTRACE_MAX_FRAMES); - for (i = 0; i < b->n_frames; i++) { - b->frames[i] = (uintptr_t) frames[i]; + for (int i = 0; i < bt->n_frames - 1; i++) { + ds_put_format(ds, "%s%s", symbols[i], delimiter); + } + + ds_put_format(ds, "%s", symbols[bt->n_frames - 1]); + + free(symbols); } } @@ -47,23 +62,14 @@ backtrace_capture(struct backtrace *backtrace) { backtrace->n_frames = 0; } -#endif -static char * -backtrace_format(const struct backtrace *b, struct ds *ds) +void +backtrace_format(struct ds *ds, const struct backtrace *bt OVS_UNUSED, + const char *delimiter OVS_UNUSED) { - if (b->n_frames) { - int i; - - ds_put_cstr(ds, " (backtrace:"); - for (i = 0; i < b->n_frames; i++) { - ds_put_format(ds, " 0x%08"PRIxPTR, b->frames[i]); - } - ds_put_cstr(ds, ")"); - } - - return ds_cstr(ds); + ds_put_cstr(ds, "backtrace() is not supported!\n"); } +#endif void log_backtrace_at(const char *msg, const char *where) @@ -77,41 +83,85 @@ log_backtrace_at(const char *msg, const char *where) } ds_put_cstr(&ds, where); - VLOG_ERR("%s", backtrace_format(&b, &ds)); + ds_put_cstr(&ds, " backtrace:\n"); + backtrace_format(&ds, &b, "\n"); + VLOG_ERR("%s", ds_cstr_ro(&ds)); ds_destroy(&ds); } +#if defined(HAVE_UNWIND) || defined(HAVE_BACKTRACE) +static bool +read_received_backtrace(int fd, void *dest, size_t len) +{ + VLOG_DBG("%s fd %d", __func__, fd); + 
fcntl(fd, F_SETFL, O_NONBLOCK); + memset(dest, 0, len); + + int byte_read = read(fd, dest, len); + if (byte_read < 0) { + VLOG_ERR("Read fd %d failed: %s", fd, ovs_strerror(errno)); + } + + return byte_read > 0;; +} +#else +static bool +read_received_backtrace(int fd OVS_UNUSED, void *dest OVS_UNUSED, + size_t len OVS_UNUSED) +{ + return false; +} +#endif + #ifdef HAVE_UNWIND void -log_received_backtrace(int fd) { - int byte_read; +log_received_backtrace(int fd) +{ struct unw_backtrace backtrace[UNW_MAX_DEPTH]; - VLOG_WARN("%s fd %d", __func__, fd); - fcntl(fd, F_SETFL, O_NONBLOCK); - memset(backtrace, 0, UNW_MAX_BUF); + if (read_received_backtrace(fd, backtrace, UNW_MAX_BUF)) { + struct ds ds = DS_EMPTY_INITIALIZER; + + ds_put_cstr(&ds, BACKTRACE_DUMP_MSG); - byte_read = read(fd, backtrace, UNW_MAX_BUF); - if (byte_read < 0) { - VLOG_ERR("Read fd %d failed: %s", fd, - ovs_strerror(errno)); - } else if (byte_read > 0) { - VLOG_WARN("SIGSEGV detected, backtrace:"); for (int i = 0; i < UNW_MAX_DEPTH; i++) { if (backtrace[i].func[0] == 0) { break; } - VLOG_WARN("0x%016"PRIxPTR" <%s+0x%"PRIxPTR">\n", - backtrace[i].ip, - backtrace[i].func, - backtrace[i].offset); + ds_put_format(&ds, "0x%016"PRIxPTR" <%s+0x%"PRIxPTR">\n", + backtrace[i].ip, + backtrace[i].func, + backtrace[i].offset); } + + VLOG_WARN("%s", ds_cstr_ro(&ds)); + + ds_destroy(&ds); } } -#else /* !HAVE_UNWIND */ +#elif HAVE_BACKTRACE void -log_received_backtrace(int daemonize_fd OVS_UNUSED) { - VLOG_WARN("Backtrace using libunwind not supported."); +log_received_backtrace(int fd) +{ + struct backtrace bt; + + if (read_received_backtrace(fd, &bt, sizeof bt)) { + struct ds ds = DS_EMPTY_INITIALIZER; + + bt.n_frames = MIN(bt.n_frames, BACKTRACE_MAX_FRAMES); + + ds_put_cstr(&ds, BACKTRACE_DUMP_MSG); + backtrace_format(&ds, &bt, "\n"); + VLOG_WARN("%s", ds_cstr_ro(&ds)); + + ds_destroy(&ds); + } } -#endif /* HAVE_UNWIND */ +#else +void +log_received_backtrace(int daemonize_fd OVS_UNUSED) +{ + 
VLOG_WARN("Backtrace using libunwind or backtrace() is not supported."); +} +#endif diff --git a/lib/backtrace.h b/lib/backtrace.h index 5708bf9c683..9ccafd6d47c 100644 --- a/lib/backtrace.h +++ b/lib/backtrace.h @@ -36,41 +36,53 @@ * log_backtrace_msg("your message"); <-- with a message * * - * A typical log will look like the following. The hex numbers listed after - * "backtrace" are the addresses of the backtrace. + * A typical backtrace will look like the following example: + * /lib64/libopenvswitch-3.1.so.0(backtrace_capture+0x1e) [0x7fc5db298dfe] + * /lib64/libopenvswitch-3.1.so.0(log_backtrace_at+0x57) [0x7fc5db2999e7] + * /lib64/libovsdb-3.1.so.0(ovsdb_txn_complete+0x7b) [0x7fc5db56247b] + * /lib64/libovsdb-3.1.so.0(ovsdb_txn_propose_commit_block+0x8d) + * [0x7fc5db563a8d] + * ovsdb-server(+0xa661) [0x562cfce2e661] + * ovsdb-server(+0x7e39) [0x562cfce2be39] + * /lib64/libc.so.6(+0x27b4a) [0x7fc5db048b4a] + * /lib64/libc.so.6(__libc_start_main+0x8b) [0x7fc5db048c0b] + * ovsdb-server(+0x8c35) [0x562cfce2cc35] * - * 2014-03-13T23:18:11.979Z|00002|backtrace(revalidator_6)|ERR|lib/dpif-netdev.c:1312: (backtrace: 0x00521f57 0x00460365 0x00463ea4 0x0046470b 0x0043b32d 0x0043bac3 0x0043bae2 0x0043943b 0x004c22b3 0x2b5b3ac94e9a 0x2b5b3b4a33fd) + * GDB can be used to view the exact line of the code for particular backtrace. + * One thing to keep in mind is that the lines in source files might not + * 100% correspond with the backtrace due to various optimizations as LTO etc. + * (The effect can be seen in this example). * - * The following bash command can be used to view backtrace in - * a more readable form. - * addr2line -p -e vswitchd/ovs-vswitchd + * Assuming that debuginfo for the library or binary is installed load it to + * GDB: + * $ gdb ovsdb-server + * (gdb) list *(+0x7e39) + * 0x7e39 is in main (ovsdb/ovsdb-server.c:278). 
+ * (gdb) list *(+0xa661) + * 0xa661 is in commit_txn (ovsdb/ovsdb-server.c:1173) * - * An typical run and output will look like: - * addr2line -p -e vswitchd/ovs-vswitchd 0x00521f57 0x00460365 0x00463ea4 - * 0x0046470b 0x0043b32d 0x0043bac3 0x0043bae2 0x0043943b 0x004c22b3 - * 0x2b5b3ac94e9a 0x2b5b3b4a33fd + * $ gdb /lib64/libovsdb-3.1.so.0 + * (gdb) list *(ovsdb_txn_propose_commit_block+0x8d) + * 0x3aa8d is in ovsdb_txn_propose_commit_block (ovsdb/transaction.c:1328) + * (gdb) list *(ovsdb_txn_complete+0x7b) + * 0x3947b is in ovsdb_txn_complete (./include/openvswitch/list.h:321) * - * openvswitch/lib/backtrace.c:33 - * openvswitch/lib/dpif-netdev.c:1312 - * openvswitch/lib/dpif.c:937 - * openvswitch/lib/dpif.c:1258 - * openvswitch/ofproto/ofproto-dpif-upcall.c:1440 - * openvswitch/ofproto/ofproto-dpif-upcall.c:1595 - * openvswitch/ofproto/ofproto-dpif-upcall.c:160 - * openvswitch/ofproto/ofproto-dpif-upcall.c:717 - * openvswitch/lib/ovs-thread.c:268 - * ??:0 - * ??:0 + * $ gdb /lib64/libopenvswitch-3.1.so.0 + * (gdb) list *(log_backtrace_at+0x57) + * 0x999e7 is in log_backtrace_at (lib/backtrace.c:77) + * (gdb) list *(backtrace_capture+0x1e) + * 0x98dfe is in backtrace_capture (lib/backtrace.c:35) */ #define log_backtrace() log_backtrace_at(NULL, OVS_SOURCE_LOCATOR); #define log_backtrace_msg(msg) log_backtrace_at(msg, OVS_SOURCE_LOCATOR); #define BACKTRACE_MAX_FRAMES 31 +#define BACKTRACE_DUMP_MSG "SIGSEGV detected, backtrace:\n" struct backtrace { int n_frames; - uintptr_t frames[BACKTRACE_MAX_FRAMES]; + void *frames[BACKTRACE_MAX_FRAMES]; }; #ifdef HAVE_UNWIND @@ -88,6 +100,8 @@ struct unw_backtrace { void backtrace_capture(struct backtrace *); void log_backtrace_at(const char *msg, const char *where); +void backtrace_format(struct ds *, const struct backtrace *, + const char *delimiter); void log_received_backtrace(int fd); #endif /* backtrace.h */ diff --git a/lib/fatal-signal.c b/lib/fatal-signal.c index bbb31ef2751..f80f32182ce 100644 --- 
a/lib/fatal-signal.c +++ b/lib/fatal-signal.c @@ -35,10 +35,14 @@ #include "openvswitch/type-props.h" -#ifdef HAVE_UNWIND +#if defined(HAVE_UNWIND) || defined(HAVE_BACKTRACE) #include "daemon-private.h" #endif +#ifdef HAVE_BACKTRACE +#include +#endif + #ifndef SIG_ATOMIC_MAX #define SIG_ATOMIC_MAX TYPE_MAXIMUM(sig_atomic_t) #endif @@ -94,6 +98,17 @@ fatal_signal_init(void) inited = true; ovs_mutex_init_recursive(&mutex); + + /* The dummy backtrace is needed. + * See comment for send_backtrace_to_monitor(). */ + struct backtrace dummy_bt; + + backtrace_capture(&dummy_bt); + + if (!dummy_bt.n_frames) { + VLOG_DBG("Capturing of dummy backtrace has failed."); + } + #ifndef _WIN32 xpipe_nonblocking(signal_fds); #else @@ -181,7 +196,8 @@ llong_to_hex_str(unsigned long long value, char *str) * library functions used here must be async-signal-safe. */ static inline void -send_backtrace_to_monitor(void) { +send_backtrace_to_monitor(void) +{ /* volatile added to prevent a "clobbered" error on ppc64le with gcc */ volatile int dep; struct unw_backtrace unw_bt[UNW_MAX_DEPTH]; @@ -211,11 +227,10 @@ send_backtrace_to_monitor(void) { /* Since there is no monitor daemon running, write backtrace * in current process. */ - char str[] = "SIGSEGV detected, backtrace:\n"; char ip_str[16], offset_str[6]; char line[64], fn_name[UNW_MAX_FUNCN]; - vlog_direct_write_to_log_file_unsafe(str); + vlog_direct_write_to_log_file_unsafe(BACKTRACE_DUMP_MSG); for (int i = 0; i < dep; i++) { memset(line, 0, sizeof line); @@ -239,6 +254,36 @@ send_backtrace_to_monitor(void) { } } } +#elif HAVE_BACKTRACE +/* Send the backtrace to monitor thread. + * + * Note that this runs in the signal handling context, any system + * library functions used here must be async-signal-safe. + * backtrace() is only signal safe if the "libgcc" or equivalent was loaded + * before the signal handler. 
In order to keep it safe the fatal_signal_init() + * should always call backtrace_capture which will ensure that "libgcc" or + * equivalent is loaded. + */ +static inline void +send_backtrace_to_monitor(void) +{ + struct backtrace bt; + + backtrace_capture(&bt); + + if (monitor && daemonize_fd > -1) { + ignore(write(daemonize_fd, &bt, sizeof bt)); + } else { + int log_fd = vlog_get_log_file_fd_unsafe(); + + if (log_fd < 0) { + return; + } + + vlog_direct_write_to_log_file_unsafe(BACKTRACE_DUMP_MSG); + backtrace_symbols_fd(bt.frames, bt.n_frames, log_fd); + } +} #else static inline void send_backtrace_to_monitor(void) { diff --git a/lib/ovsdb-error.c b/lib/ovsdb-error.c index a75ad36b737..9ad42b232d4 100644 --- a/lib/ovsdb-error.c +++ b/lib/ovsdb-error.c @@ -141,12 +141,8 @@ ovsdb_internal_error(struct ovsdb_error *inner_error, backtrace_capture(&backtrace); if (backtrace.n_frames) { - int i; - ds_put_cstr(&ds, " (backtrace:"); - for (i = 0; i < backtrace.n_frames; i++) { - ds_put_format(&ds, " 0x%08"PRIxPTR, backtrace.frames[i]); - } + backtrace_format(&ds, &backtrace, ", "); ds_put_char(&ds, ')'); } diff --git a/lib/vlog.c b/lib/vlog.c index 9ddea48b85f..b2653142f3f 100644 --- a/lib/vlog.c +++ b/lib/vlog.c @@ -664,6 +664,13 @@ vlog_direct_write_to_log_file_unsafe(const char *s) } } +int +vlog_get_log_file_fd_unsafe(void) + OVS_NO_THREAD_SAFETY_ANALYSIS +{ + return log_fd; +} + /* Returns 'false' if 'facility' is not a valid string. If 'facility' * is a valid string, sets 'value' with the integer value of 'facility' * and returns 'true'. */ diff --git a/m4/openvswitch.m4 b/m4/openvswitch.m4 index 14d9249b89c..47f486be49b 100644 --- a/m4/openvswitch.m4 +++ b/m4/openvswitch.m4 @@ -360,8 +360,12 @@ AC_DEFUN([OVS_CHECK_DBDIR], dnl Defines HAVE_BACKTRACE if backtrace() is found.
AC_DEFUN([OVS_CHECK_BACKTRACE], [AC_SEARCH_LIBS([backtrace], [execinfo ubacktrace], - [AC_DEFINE([HAVE_BACKTRACE], [1], - [Define to 1 if you have backtrace(3).])])]) + [HAVE_BACKTRACE=yes], [HAVE_BACKTRACE=no]) + if test "$HAVE_BACKTRACE" = "yes"; then + AC_DEFINE([HAVE_BACKTRACE], [1], [Define to 1 if you have backtrace(3).]) + fi + AM_CONDITIONAL([HAVE_BACKTRACE], [test "$HAVE_BACKTRACE" = "yes"]) + AC_SUBST([HAVE_BACKTRACE])]) dnl Defines HAVE_PERF_EVENT if linux/perf_event.h is found. AC_DEFUN([OVS_CHECK_PERF_EVENT], diff --git a/tests/atlocal.in b/tests/atlocal.in index 85966858629..18d5efae047 100644 --- a/tests/atlocal.in +++ b/tests/atlocal.in @@ -2,6 +2,8 @@ HAVE_OPENSSL='@HAVE_OPENSSL@' OPENSSL_SUPPORTS_SNI='@OPENSSL_SUPPORTS_SNI@' HAVE_UNBOUND='@HAVE_UNBOUND@' +HAVE_BACKTRACE='@HAVE_BACKTRACE@' +HAVE_UNWIND='@HAVE_UNWIND@' EGREP='@EGREP@' PYTHON3='@PYTHON3@' CFLAGS='@CFLAGS@' diff --git a/tests/daemon.at b/tests/daemon.at index d7981f9d23a..13cb8fc1c14 100644 --- a/tests/daemon.at +++ b/tests/daemon.at @@ -234,3 +234,54 @@ OVS_WAIT_UNTIL([sc query ovsdb-server | grep STATE | grep STOPPED > /dev/null 2> AT_CHECK([sc delete ovsdb-server], [0], [[[SC]] DeleteService SUCCESS ]) AT_CLEANUP + +AT_SETUP([backtrace without monitor]) +AT_SKIP_IF([test "$HAVE_BACKTRACE" = "no" && test "$HAVE_UNWIND" = "no"]) +AT_SKIP_IF([test "$IS_WIN32" = "yes"]) + +# This test intentionally causes SIGSEGV, so make Address Sanitizer ignore it. +ASAN_OPTIONS=$ASAN_OPTIONS:handle_segv=0; export ASAN_OPTIONS + +# Skip it if UB Sanitizer is being used. There's no way to disable the +# SEGV check at runtime. 
+AT_SKIP_IF([test $TESTS_WITH_UBSAN = yes]) + +AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --no-db \ + --log-file --verbose=DBG], [0], [ignore], [ignore]) +OVS_WAIT_UNTIL([test -s ovsdb-server.pid]) +child=$(cat ovsdb-server.pid) + +AT_CAPTURE_FILE([ovsdb-server.log]) + +AT_CHECK([kill -SEGV $child]) + +OVS_WAIT_UNTIL([grep -q "^SIGSEGV detected, backtrace:" ovsdb-server.log]) + +AT_CLEANUP + +AT_SETUP([backtrace with monitor]) +AT_SKIP_IF([test "$HAVE_BACKTRACE" = "no" && test "$HAVE_UNWIND" = "no"]) +AT_SKIP_IF([test "$IS_WIN32" = "yes"]) + +# This test intentionally causes SIGSEGV, so make Address Sanitizer ignore it. +ASAN_OPTIONS=$ASAN_OPTIONS:handle_segv=0; export ASAN_OPTIONS + +# Skip it if UB Sanitizer is being used. There's no way to disable the +# SEGV check at runtime. +AT_SKIP_IF([test $TESTS_WITH_UBSAN = yes]) + +on_exit 'kill $(cat *.pid)' + +AT_CHECK([ovsdb-server --detach --monitor --no-chdir --pidfile --no-db \ + --log-file --verbose=DBG], [0], [ignore], [ignore]) +OVS_WAIT_UNTIL([test -s ovsdb-server.pid]) +child=$(cat ovsdb-server.pid) + +AT_CAPTURE_FILE([ovsdb-server.log]) + +AT_CHECK([kill -SEGV $child]) + +OVS_WAIT_UNTIL([grep -q "backtrace(monitor)|WARN|SIGSEGV detected, backtrace:" ovsdb-server.log]) +OVS_WAIT_UNTIL([grep -q "daemon_unix(monitor)|ERR|1 crashes: pid .* died, killed (Segmentation fault)" ovsdb-server.log]) + +AT_CLEANUP From 469e98e16db1a67765bffd23d62c47b69dfa73cd Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 7 Jun 2023 15:08:32 +0200 Subject: [PATCH 272/833] ovsdb: monitor: Destroy initial change set when new columns added. Initial change set is preserved for as long as the monitor itself. However, if a new client has a condition on a column that is not one of the monitored columns, this column will be added to the monitor via ovsdb_monitor_condition_bind(). This new column, however, doesn't exist in the initial change set. 
That will cause ovsdb-server to malfunction or crash trying to access non-existent column during condition evaluation: ERROR: AddressSanitizer: heap-buffer-overflow READ of size 4 at 0x606000006780 thread T0 0 ovsdb_clause_evaluate ovsdb/condition.c:328:26 1 ovsdb_condition_match_any_clause ovsdb/condition.c:441:13 2 ovsdb_condition_empty_or_match_any ovsdb/condition.h:84:13 3 ovsdb_monitor_row_update_type_condition ovsdb/monitor.c:892:28 4 ovsdb_monitor_compose_row_update2 ovsdb/monitor.c:1058:12 5 ovsdb_monitor_compose_update ovsdb/monitor.c:1172:24 6 ovsdb_monitor_get_update ovsdb/monitor.c:1276:24 7 ovsdb_jsonrpc_monitor_create ovsdb/jsonrpc-server.c:1505:12 8 ovsdb_jsonrpc_session_got_request ovsdb/jsonrpc-server.c:1030:21 9 ovsdb_jsonrpc_session_run ovsdb/jsonrpc-server.c:572:17 10 ovsdb_jsonrpc_session_run_all ovsdb/jsonrpc-server.c:602:21 11 ovsdb_jsonrpc_server_run ovsdb/jsonrpc-server.c:417:9 12 main_loop ovsdb/ovsdb-server.c:222:9 13 main ovsdb/ovsdb-server.c:500:5 14 __libc_start_call_main 15 __libc_start_main@GLIBC_2.2.5 16 _start (ovsdb/ovsdb-server+0x473034) Located 0 bytes after 64-byte region [0x606000006740,0x606000006780) allocated by thread T0 here: 0 malloc (ovsdb/ovsdb-server+0x50dc82) 1 xmalloc__ lib/util.c:140:15 2 xmalloc lib/util.c:175:12 3 clone_monitor_row_data ovsdb/monitor.c:336:12 4 ovsdb_monitor_changes_update ovsdb/monitor.c:1384:23 5 ovsdb_monitor_get_initial ovsdb/monitor.c:1535:21 6 ovsdb_jsonrpc_monitor_create ovsdb/jsonrpc-server.c:1502:9 7 ovsdb_jsonrpc_session_got_request ovsdb/jsonrpc-server.c:1030:21 8 ovsdb_jsonrpc_session_run ovsdb/jsonrpc-server.c:572:17 9 ovsdb_jsonrpc_session_run_all ovsdb/jsonrpc-server.c:602:21 10 ovsdb_jsonrpc_server_run ovsdb/jsonrpc-server.c:417:9 11 main_loop ovsdb/ovsdb-server.c:222:9 12 main ovsdb/ovsdb-server.c:500:5 13 __libc_start_call_main 14 __libc_start_main@GLIBC_2.2.5 15 _start (ovsdb/ovsdb-server+0x473034) Fix that by destroying the initial change set every time new columns are added 
to the monitor. This will trigger re-generation of the change set and it will contain all the necessary columns afterwards. Fixes: 07c27226ee96 ("ovsdb: Monitor: Keep and maintain the initial change set.") Reported-by: Han Zhou Acked-by: Han Zhou Reviewed-by: Simon Horman Signed-off-by: Ilya Maximets --- ovsdb/monitor.c | 15 +++++++++- tests/ovsdb-monitor.at | 66 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 1 deletion(-) diff --git a/ovsdb/monitor.c b/ovsdb/monitor.c index 04dcd229891..4afaa89f48e 100644 --- a/ovsdb/monitor.c +++ b/ovsdb/monitor.c @@ -478,6 +478,7 @@ ovsdb_monitor_add_column(struct ovsdb_monitor *dbmon, enum ovsdb_monitor_selection select, bool monitored) { + struct ovsdb_monitor_change_set *mcs; struct ovsdb_monitor_table *mt; struct ovsdb_monitor_column *c; @@ -488,6 +489,18 @@ ovsdb_monitor_add_column(struct ovsdb_monitor *dbmon, return column->name; } + mcs = dbmon->init_change_set; + if (mcs) { + /* A new column is going to be added to the monitor. Existing + * initial change set doesn't have it, so can no longer be used. + * Initial change set is never used by more than one session at + * the same time, so it's safe to destroy it here. */ + ovs_assert(mcs->n_refs == 1); + ovsdb_monitor_json_cache_destroy(dbmon, mcs); + ovsdb_monitor_change_set_destroy(mcs); + dbmon->init_change_set = NULL; + } + if (mt->n_columns >= mt->allocated_columns) { mt->columns = x2nrealloc(mt->columns, &mt->allocated_columns, sizeof *mt->columns); @@ -614,7 +627,7 @@ ovsdb_monitor_untrack_change_set(struct ovsdb_monitor *dbmon, if (--mcs->n_refs == 0) { if (mcs == dbmon->init_change_set) { /* The initial change set should exist as long as the - * monitor itself. */ + * monitor doesn't change. 
*/ mcs->n_refs++; return; } else if (mcs == dbmon->new_change_set) { diff --git a/tests/ovsdb-monitor.at b/tests/ovsdb-monitor.at index 7e1ff64f0f3..12cd2bc3194 100644 --- a/tests/ovsdb-monitor.at +++ b/tests/ovsdb-monitor.at @@ -1011,3 +1011,69 @@ row,action,name,number,_version ]], [ignore]) AT_CLEANUP +AT_SETUP([monitor-cond initial reply with condition on non-monitored column]) +AT_KEYWORDS([ovsdb server monitor monitor-cond positive initial non-monitored]) + +ordinal_schema > schema +AT_CHECK([ovsdb-tool create db schema], [0], [stdout], [ignore]) +on_exit 'kill `cat ovsdb-server.pid`' +AT_CAPTURE_FILE([ovsdb-server.log]) +AT_CHECK([ovsdb-server --detach --no-chdir --pidfile \ + --remote=punix:socket --log-file db], [0], [ignore], [ignore]) + +dnl Initialize the database content. +for txn in m4_foreach([txn], [[[["ordinals", + {"op": "insert", + "table": "ordinals", + "row": {"number": 0, "name": "zero"}}, + {"op": "insert", + "table": "ordinals", + "row": {"number": 1, "name": "one"}}, + {"op": "insert", + "table": "ordinals", + "row": {"number": 2, "name": "two"}}]]]], ['txn' ]); do + AT_CHECK([ovsdb-client transact unix:socket "$txn"], [0], [ignore], [ignore]) +done + +dnl Start a first client that monitors only the column 'name'. +on_exit 'kill `cat client-1.pid`' +AT_CAPTURE_FILE([client-1.out]) +AT_CHECK([ovsdb-client -vjsonrpc --pidfile=client-1.pid --detach --no-chdir \ + -d json monitor-cond --format=csv unix:socket \ + ordinals '[[true]]' ordinals ["name"] \ + > client-1.out 2> client-1.err], [0], [ignore], [ignore]) +dnl Wait for the initial monitor reply. +OVS_WAIT_UNTIL([grep -q 'initial' client-1.out]) + +dnl Start a second client that monitors the column 'name', but has a condition +dnl on column 'number'. 
+on_exit 'kill `cat client-2.pid`' +AT_CAPTURE_FILE([client-2.out]) +AT_CHECK([ovsdb-client -vjsonrpc --pidfile=client-2.pid --detach --no-chdir \ + -d json monitor-cond --format=csv unix:socket \ + ordinals '[[["number", "!=", 1]]]' ordinals ["name"] \ + > client-2.out 2> client-2.err], [0], [ignore], [ignore]) +dnl Wait for the initial monitor reply. +OVS_WAIT_UNTIL([grep -q 'initial' client-2.out]) + +OVSDB_SERVER_SHUTDOWN +OVS_WAIT_UNTIL([test ! -e ovsdb-server.pid && \ + test ! -e client-1.pid && test ! -e client-2.pid]) + +dnl The first client should have all the names. +AT_CHECK([$PYTHON3 $srcdir/ovsdb-monitor-sort.py < client-1.out | uuidfilt], + [0], [dnl +row,action,name +<0>,initial,"""one""" +<1>,initial,"""two""" +<2>,initial,"""zero""" +]) + +dnl The second client should not have the name 'one'. +AT_CHECK([$PYTHON3 $srcdir/ovsdb-monitor-sort.py < client-2.out | uuidfilt], + [0], [dnl +row,action,name +<0>,initial,"""two""" +<1>,initial,"""zero""" +]) +AT_CLEANUP From 04f854f938b3310ee2ac08400d3b4bf72d070935 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 8 Jun 2023 21:16:59 +0200 Subject: [PATCH 273/833] fatal-signal: Don't share signal fds/handles with forked process. The signal_fds pipe and wevent are a mechanism to wake up the process after it received a signal and stored the number for the future processing. They are not intended for inter-process communication. However, in the current code, descriptors are not closed on fork(). The main scenario where we use fork() is a monitor process. Monitor doesn't actually use poll loops and doesn't wait on the descriptor. But when a child process is killed, it (child) sends a byte to itself, then it wakes up due to POLLIN on the pipe and terminates itself after processing all the callbacks. The byte stays unread. And the pipe is still open in the monitor process. When child dies, the monitor wakes up and forks again. New child inherits the same pipe that still contains unread data. 
This data is never read, so the child will constantly wake itself up for no reason. Interestingly enough raise(SIGSEGV) doesn't immediately kill the process. The execution continues til the end of a signal handler, so we're still able to write a byte to a pipe even in this case. Presumably because we don't have SA_NODEFER. Fix the issue by re-creating the pipe/event on fork. This way every new child will have its own notification channel and will not wake up any other processes. There was already an attempt to fix the issue, but it didn't get a follow up (see the reported-at tag). This is an alternative solution. Fixes: ff8decf1a318 ("daemon: Add support for process monitoring and restart.") Reported-at: https://patchwork.ozlabs.org/project/openvswitch/patch/20221019093147.2072-1-lifengqi@inspur.com/ Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- lib/fatal-signal.c | 49 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/lib/fatal-signal.c b/lib/fatal-signal.c index f80f32182ce..77f0c87dd48 100644 --- a/lib/fatal-signal.c +++ b/lib/fatal-signal.c @@ -82,6 +82,39 @@ static void call_hooks(int sig_nr); static BOOL WINAPI ConsoleHandlerRoutine(DWORD dwCtrlType); #endif +/* Sets up a pipe or event handle that will be used to wake up the current + * process after signal is received, so it can be processed outside of the + * signal handler context in fatal_signal_run(). 
*/ +static void +fatal_signal_create_wakeup_events(void) +{ +#ifndef _WIN32 + xpipe_nonblocking(signal_fds); +#else + wevent = CreateEvent(NULL, TRUE, FALSE, NULL); + if (!wevent) { + char *msg_buf = ovs_lasterror_to_string(); + VLOG_FATAL("Failed to create a event (%s).", msg_buf); + } +#endif +} + +static void +fatal_signal_destroy_wakeup_events(void) +{ +#ifndef _WIN32 + close(signal_fds[0]); + signal_fds[0] = -1; + close(signal_fds[1]); + signal_fds[1] = -1; +#else + ResetEvent(wevent); + CloseHandle(wevent); + wevent = NULL; +#endif +} + + /* Initializes the fatal signal handling module. Calling this function is * optional, because calling any other function in the module will also * initialize it. However, in a multithreaded program, the module must be @@ -109,15 +142,9 @@ fatal_signal_init(void) VLOG_DBG("Capturing of dummy backtrace has failed."); } -#ifndef _WIN32 - xpipe_nonblocking(signal_fds); -#else - wevent = CreateEvent(NULL, TRUE, FALSE, NULL); - if (!wevent) { - char *msg_buf = ovs_lasterror_to_string(); - VLOG_FATAL("Failed to create a event (%s).", msg_buf); - } + fatal_signal_create_wakeup_events(); +#ifdef _WIN32 /* Register a function to handle Ctrl+C. */ SetConsoleCtrlHandler(ConsoleHandlerRoutine, true); #endif @@ -501,6 +528,9 @@ do_unlink_files(void) * hooks passed a 'cancel_cb' function to fatal_signal_add_hook(), then those * functions will be called, allowing them to free resources, etc. * + * Also re-creates wake-up events, so signals in one of the processes do not + * wake up the other one. + * * Following a fork, one of the resulting processes can call this function to * allow it to terminate without calling the hooks registered before calling * this function. 
New hooks registered after calling this function will take @@ -512,6 +542,9 @@ fatal_signal_fork(void) assert_single_threaded(); + fatal_signal_destroy_wakeup_events(); + fatal_signal_create_wakeup_events(); + for (i = 0; i < n_hooks; i++) { struct hook *h = &hooks[i]; if (h->cancel_cb) { From e3ba0be48ca457ab3a1c9f1e3522e82218eca0f9 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Mon, 5 Jun 2023 15:39:02 +0200 Subject: [PATCH 274/833] seq: Make read of the current value atomic. Make the read of the current seq->value atomic, i.e., not needing to acquire the global mutex when reading it. On 64-bit systems, this incurs no overhead, and it will avoid the mutex and potentially a system call. For incrementing the value followed by waking up the threads, we are still taking the mutex, so the current behavior is not changing. The seq_read() behavior is already defined as, "Returns seq's current sequence number (which could change immediately)". So the change should not impact the current behavior. Signed-off-by: Eelco Chaudron Reviewed-by: Simon Horman Signed-off-by: Ilya Maximets --- lib/ovs-rcu.c | 2 +- lib/seq.c | 32 +++++++++++--------------------- lib/seq.h | 1 - 3 files changed, 12 insertions(+), 23 deletions(-) diff --git a/lib/ovs-rcu.c b/lib/ovs-rcu.c index 946aa04d18e..9e07d9bab66 100644 --- a/lib/ovs-rcu.c +++ b/lib/ovs-rcu.c @@ -170,7 +170,7 @@ ovsrcu_try_quiesce(void) ovs_assert(!single_threaded()); perthread = ovsrcu_perthread_get(); if (!seq_try_lock()) { - perthread->seqno = seq_read_protected(global_seqno); + perthread->seqno = seq_read(global_seqno); if (perthread->cbset) { ovsrcu_flush_cbset__(perthread, true); } diff --git a/lib/seq.c b/lib/seq.c index 99e5bf8bd10..7c2fa0c69f3 100644 --- a/lib/seq.c +++ b/lib/seq.c @@ -32,7 +32,7 @@ COVERAGE_DEFINE(seq_change); /* A sequence number object. */ struct seq { - uint64_t value OVS_GUARDED; + atomic_uint64_t value; struct hmap waiters OVS_GUARDED; /* Contains 'struct seq_waiter's. 
*/ }; @@ -72,6 +72,7 @@ static void seq_wake_waiters(struct seq *) OVS_REQUIRES(seq_mutex); struct seq * OVS_EXCLUDED(seq_mutex) seq_create(void) { + uint64_t seq_value; struct seq *seq; seq_init(); @@ -81,7 +82,8 @@ seq_create(void) COVERAGE_INC(seq_change); ovs_mutex_lock(&seq_mutex); - seq->value = seq_next++; + seq_value = seq_next++; + atomic_store_relaxed(&seq->value, seq_value); hmap_init(&seq->waiters); ovs_mutex_unlock(&seq_mutex); @@ -126,9 +128,11 @@ void seq_change_protected(struct seq *seq) OVS_REQUIRES(seq_mutex) { + uint64_t seq_value = seq_next++; + COVERAGE_INC(seq_change); - seq->value = seq_next++; + atomic_store_explicit(&seq->value, seq_value, memory_order_release); seq_wake_waiters(seq); } @@ -143,18 +147,6 @@ seq_change(struct seq *seq) ovs_mutex_unlock(&seq_mutex); } -/* Returns 'seq''s current sequence number (which could change immediately). - * - * seq_read() and seq_wait() can be used together to yield a race-free wakeup - * when an object changes, even without an ability to lock the object. See - * Usage in seq.h for details. */ -uint64_t -seq_read_protected(const struct seq *seq) - OVS_REQUIRES(seq_mutex) -{ - return seq->value; -} - /* Returns 'seq''s current sequence number (which could change immediately). * * seq_read() and seq_wait() can be used together to yield a race-free wakeup @@ -162,14 +154,12 @@ seq_read_protected(const struct seq *seq) * Usage in seq.h for details. */ uint64_t seq_read(const struct seq *seq) - OVS_EXCLUDED(seq_mutex) { uint64_t value; - ovs_mutex_lock(&seq_mutex); - value = seq_read_protected(seq); - ovs_mutex_unlock(&seq_mutex); - + /* Note that the odd CONST_CAST() is here to keep sparse happy. 
*/ + atomic_read_explicit(&CONST_CAST(struct seq *, seq)->value, &value, + memory_order_acquire); return value; } @@ -226,7 +216,7 @@ seq_wait_at(const struct seq *seq_, uint64_t value, const char *where) struct seq *seq = CONST_CAST(struct seq *, seq_); ovs_mutex_lock(&seq_mutex); - if (value == seq->value) { + if (value == seq_read(seq_)) { seq_wait__(seq, value, where); } else { poll_immediate_wake_at(where); diff --git a/lib/seq.h b/lib/seq.h index c88b9d1c814..fcfa010376d 100644 --- a/lib/seq.h +++ b/lib/seq.h @@ -128,7 +128,6 @@ void seq_unlock(void); /* For observers. */ uint64_t seq_read(const struct seq *); -uint64_t seq_read_protected(const struct seq *); void seq_wait_at(const struct seq *, uint64_t value, const char *where); #define seq_wait(seq, value) seq_wait_at(seq, value, OVS_SOURCE_LOCATOR) From 22df63c3844f8c310fca25716227c84820ec85d4 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Wed, 14 Jun 2023 15:03:24 -0400 Subject: [PATCH 275/833] Documentation: Document netdev offload. Document the implementation of netdev hardware offloading in userspace datapath. 
Signed-off-by: Flavio Leitner Co-authored-by: Flavio Leitner Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- Documentation/automake.mk | 1 + Documentation/topics/index.rst | 1 + .../topics/userspace-checksum-offloading.rst | 96 +++++++++++++++++++ 3 files changed, 98 insertions(+) create mode 100644 Documentation/topics/userspace-checksum-offloading.rst diff --git a/Documentation/automake.mk b/Documentation/automake.mk index cdf3c992660..8bd3dbb2b88 100644 --- a/Documentation/automake.mk +++ b/Documentation/automake.mk @@ -57,6 +57,7 @@ DOC_SOURCE = \ Documentation/topics/record-replay.rst \ Documentation/topics/tracing.rst \ Documentation/topics/usdt-probes.rst \ + Documentation/topics/userspace-checksum-offloading.rst \ Documentation/topics/userspace-tso.rst \ Documentation/topics/userspace-tx-steering.rst \ Documentation/topics/windows.rst \ diff --git a/Documentation/topics/index.rst b/Documentation/topics/index.rst index 90d4c66e625..f239fcf83f8 100644 --- a/Documentation/topics/index.rst +++ b/Documentation/topics/index.rst @@ -55,5 +55,6 @@ OVS userspace-tso idl-compound-indexes ovs-extensions + userspace-checksum-offloading userspace-tx-steering usdt-probes diff --git a/Documentation/topics/userspace-checksum-offloading.rst b/Documentation/topics/userspace-checksum-offloading.rst new file mode 100644 index 00000000000..036d3965faa --- /dev/null +++ b/Documentation/topics/userspace-checksum-offloading.rst @@ -0,0 +1,96 @@ +.. + Licensed under the Apache License, Version 2.0 (the "License"); you may + not use this file except in compliance with the License. You may obtain + a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the + License for the specific language governing permissions and limitations + under the License. + + Convention for heading levels in Open vSwitch documentation: + + ======= Heading 0 (reserved for the title in a document) + ------- Heading 1 + ~~~~~~~ Heading 2 + +++++++ Heading 3 + ''''''' Heading 4 + + Avoid deeper levels because they do not render well. + +======================================== +Userspace Datapath - Checksum Offloading +======================================== + +This document explains the internals of Open vSwitch support for checksum +offloading in the userspace datapath. + +Design +------ + +Open vSwitch strives to forward packets as they arrive regardless of whether +the checksum is correct or not. OVS is not responsible for fixing external +checksum issues. + +The interface (internally referred to as a netdev) can set flags indicating +whether each packet's checksum is good or bad upon receipt. If this flag is not +set, OVS will consider the validity of the packet's checksum to be unknown. + +OVS will not re-calculate or update the packet's checksum if the checksum is +already known to be correct, known to be explicitly incorrect, or destined for +an egress interface that will recalculate the checksum anyway. + +If OVS does invalidate the checksum, and the packet ingresses the datapath with +a checksum that is not known to be incorrect, OVS postpones checksum updates +until the packet egresses the datapath. This recalculation can either be +performed by OVS or be offloaded onto the NIC if the egress NIC supports +checksum offloading. + +When a packet egresses the datapath, the packet flags and the egress interface +flags are verified to make sure all required offload features to send out the +packet are available on the egress interface. If not, the data path will fall +back to an equivalent software implementation. + + +Interface (a.k.a.
Netdev) +------------------------- + +When the interface initiates, it should set the flags to tell the datapath +which offload features are supported. For example, if the driver supports IP +checksum offloading, then ``netdev->ol_flags`` should set the flag +``NETDEV_TX_OFFLOAD_IPV4_CKSUM``. + + +Rules +----- + +1) OVS should strive to forward all packets regardless of checksum. + +2) OVS must not correct a known bad packet checksum. + +3) Packet with flag ``DP_PACKET_OL_RX_IP_CKSUM_GOOD`` means that the IP + checksum is present in the packet and it is good. + +4) Packet with flag ``DP_PACKET_OL_RX_IP_CKSUM_BAD`` means that the IP + checksum is present in the packet and it is bad. Extra care should be taken + to not fix the packet during data path processing. + +5) The ingress packet parser can only set ``DP_PACKET_OL_TX_IP_CKSUM`` if the + packet has ``DP_PACKET_OL_RX_IP_CKSUM_GOOD`` to not violate rule #2. + +6) Packet with flag ``DP_PACKET_OL_TX_IPV4`` is an IPv4 packet. + +7) Packet with flag ``DP_PACKET_OL_TX_IPV6`` is an IPv6 packet. + +8) Packet with flag ``DP_PACKET_OL_TX_IP_CKSUM`` tells the datapath to skip + updating the IP checksum if the packet is modified. The IP checksum will be + calculated by the egress interface if that supports IP checksum offload, + otherwise the IP checksum will be performed in software before handing over + the packet to the interface. + +9) When there are modifications to the packet that require a checksum update, + the datapath needs to remove the ``DP_PACKET_OL_RX_IP_CKSUM_GOOD`` flag, + otherwise the checksum is assumed to be good in the packet. From 4433cc68605cdfeeedd5c9edf5d3d5596f35f9b2 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Wed, 14 Jun 2023 15:03:25 -0400 Subject: [PATCH 276/833] dpif-netdev: Show netdev offloading flags. This patch modifies netdev_get_status to include information about checksum offload status by port, allowing the user to gain insight into where checksum offloading is active.
Signed-off-by: Flavio Leitner Co-authored-by: Flavio Leitner Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/netdev-dpdk.c | 5 ----- lib/netdev.c | 29 ++++++++++++++++++++++++++--- tests/dpif-netdev.at | 18 ++++++++++++++++++ 3 files changed, 44 insertions(+), 8 deletions(-) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 8cb1a77031e..87b3f02972c 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -1747,11 +1747,6 @@ netdev_dpdk_get_config(const struct netdev *netdev, struct smap *args) } else { smap_add(args, "rx_csum_offload", "false"); } - if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) { - smap_add(args, "tx_tso_offload", "true"); - } else { - smap_add(args, "tx_tso_offload", "false"); - } smap_add(args, "lsc_interrupt_mode", dev->lsc_interrupt_mode ? "true" : "false"); diff --git a/lib/netdev.c b/lib/netdev.c index c797783782f..79b17d00662 100644 --- a/lib/netdev.c +++ b/lib/netdev.c @@ -43,6 +43,7 @@ #include "netdev-provider.h" #include "netdev-vport.h" #include "odp-netlink.h" +#include "openvswitch/json.h" #include "openflow/openflow.h" #include "packets.h" #include "openvswitch/ofp-print.h" @@ -1373,9 +1374,31 @@ netdev_get_next_hop(const struct netdev *netdev, int netdev_get_status(const struct netdev *netdev, struct smap *smap) { - return (netdev->netdev_class->get_status - ? netdev->netdev_class->get_status(netdev, smap) - : EOPNOTSUPP); + int err = EOPNOTSUPP; + + /* Set offload status only if relevant. */ + if (netdev_get_dpif_type(netdev) && + strcmp(netdev_get_dpif_type(netdev), "system")) { + +#define OL_ADD_STAT(name, bit) \ + smap_add(smap, "tx_" name "_offload", \ + netdev->ol_flags & bit ? 
"true" : "false"); + + OL_ADD_STAT("ip_csum", NETDEV_TX_OFFLOAD_IPV4_CKSUM); + OL_ADD_STAT("tcp_csum", NETDEV_TX_OFFLOAD_TCP_CKSUM); + OL_ADD_STAT("udp_csum", NETDEV_TX_OFFLOAD_UDP_CKSUM); + OL_ADD_STAT("sctp_csum", NETDEV_TX_OFFLOAD_SCTP_CKSUM); + OL_ADD_STAT("tcp_seg", NETDEV_TX_OFFLOAD_TCP_TSO); +#undef OL_ADD_STAT + + err = 0; + } + + if (!netdev->netdev_class->get_status) { + return err; + } + + return netdev->netdev_class->get_status(netdev, smap); } /* Returns all assigned IP address to 'netdev' and returns 0. diff --git a/tests/dpif-netdev.at b/tests/dpif-netdev.at index baab60a2221..60d789bebf2 100644 --- a/tests/dpif-netdev.at +++ b/tests/dpif-netdev.at @@ -650,6 +650,24 @@ AT_CHECK([ovs-appctl revalidator/resume]) OVS_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([dpif-netdev - check tx packet checksum offloading]) +OVS_VSWITCHD_START( + [add-port br0 p1 \ + -- set interface p1 type=dummy options:pstream=punix:$OVS_RUNDIR/p0.sock \ + -- set bridge br0 datapath-type=dummy \ + other-config:datapath-id=1234 fail-mode=secure]) + +AT_CHECK([ovs-vsctl get interface p1 status | sed -n 's/^{\(.*\).*}$/\1/p'], [0], [dnl +tx_ip_csum_offload="false", tx_sctp_csum_offload="false", tx_tcp_csum_offload="false", tx_tcp_seg_offload="false", tx_udp_csum_offload="false" +], []) + +AT_CHECK([ovs-vsctl get interface br0 status | sed -n 's/^{\(.*\).*}$/\1/p'], [0], [dnl +tx_ip_csum_offload="false", tx_sctp_csum_offload="false", tx_tcp_csum_offload="false", tx_tcp_seg_offload="false", tx_udp_csum_offload="false" +], []) + +OVS_VSWITCHD_STOP +AT_CLEANUP + # SEND_UDP_PKTS([p_name], [p_ofport]) # # Sends 128 packets to port 'p_name' with different UDP destination ports. From 5d11c47d3ebe0acd48280239ba9a22bc21ee6273 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Wed, 14 Jun 2023 15:03:26 -0400 Subject: [PATCH 277/833] userspace: Enable IP checksum offloading by default. 
The netdev receiving packets is supposed to provide the flags indicating if the IP checksum was verified and it is GOOD or BAD, otherwise the stack will check when appropriate in software. If the packet comes with a good checksum, then postpone the checksum calculation to the egress device if needed. When encapsulating a packet with that flag, set the checksum of the inner IP header, since offloading the inner IP header checksum is not yet supported. Calculate the IP checksum when the packet is going to be sent over a device that doesn't support the feature. Linux devices don't support IP checksum offload alone, so the support is not enabled. Signed-off-by: Flavio Leitner Co-authored-by: Flavio Leitner Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- NEWS | 4 ++ lib/conntrack.c | 19 ++++---- lib/dp-packet.c | 17 +++++++ lib/dp-packet.h | 62 +++++++++++++++++++++++-- lib/dpif-netdev-extract-avx512.c | 5 ++ lib/dpif-netdev.c | 2 + lib/flow.c | 15 ++++-- lib/ipf.c | 11 +++-- lib/netdev-dpdk.c | 71 +++++++++++++++++++---------- lib/netdev-dummy.c | 22 +++++++++ lib/netdev-native-tnl.c | 21 ++++++--- lib/netdev.c | 16 +++++++ lib/odp-execute-avx512.c | 20 +++++--- lib/odp-execute.c | 21 +++++++-- lib/packets.c | 34 +++++++++++--- tests/dpif-netdev.at | 78 ++++++++++++++++++++++++++++++++ 16 files changed, 351 insertions(+), 67 deletions(-) diff --git a/NEWS b/NEWS index cfd4666630d..d7d0f3f1528 100644 --- a/NEWS +++ b/NEWS @@ -36,6 +36,10 @@ Post-v3.1.0 process extra privileges when mapping physical interconnect memory. - SRv6 Tunnel Protocol * Added support for userspace datapath (only). + - Userspace datapath: + * IP checksum offload support is now enabled by default for interfaces + that support it. See the 'status' column in the 'interface' table to + check the status. 
v3.1.0 - 16 Feb 2023 diff --git a/lib/conntrack.c b/lib/conntrack.c index ce8a63de5b8..78c3e578cb2 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -2043,16 +2043,15 @@ conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type, ctx->key.dl_type = dl_type; if (ctx->key.dl_type == htons(ETH_TYPE_IP)) { - bool hwol_bad_l3_csum = dp_packet_ip_checksum_bad(pkt); - if (hwol_bad_l3_csum) { + if (dp_packet_ip_checksum_bad(pkt)) { ok = false; COVERAGE_INC(conntrack_l3csum_err); } else { - bool hwol_good_l3_csum = dp_packet_ip_checksum_valid(pkt) - || dp_packet_hwol_is_ipv4(pkt); - /* Validate the checksum only when hwol is not supported. */ + /* Validate the checksum only when hwol is not supported and the + * packet's checksum status is not known. */ ok = extract_l3_ipv4(&ctx->key, l3, dp_packet_l3_size(pkt), NULL, - !hwol_good_l3_csum); + !dp_packet_hwol_is_ipv4(pkt) && + !dp_packet_ip_checksum_good(pkt)); } } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) { ok = extract_l3_ipv6(&ctx->key, l3, dp_packet_l3_size(pkt), NULL); @@ -2063,8 +2062,8 @@ conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type, if (ok) { bool hwol_bad_l4_csum = dp_packet_l4_checksum_bad(pkt); if (!hwol_bad_l4_csum) { - bool hwol_good_l4_csum = dp_packet_l4_checksum_valid(pkt) - || dp_packet_hwol_tx_l4_checksum(pkt); + bool hwol_good_l4_csum = dp_packet_l4_checksum_good(pkt) + || dp_packet_hwol_tx_l4_checksum(pkt); /* Validate the checksum only when hwol is not supported. 
*/ if (extract_l4(&ctx->key, l4, dp_packet_l4_size(pkt), &ctx->icmp_related, l3, !hwol_good_l4_csum, @@ -3373,7 +3372,9 @@ handle_ftp_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx, } if (seq_skew) { ip_len = ntohs(l3_hdr->ip_tot_len) + seq_skew; - if (!dp_packet_hwol_is_ipv4(pkt)) { + if (dp_packet_hwol_tx_ip_csum(pkt)) { + dp_packet_ol_reset_ip_csum_good(pkt); + } else { l3_hdr->ip_csum = recalc_csum16(l3_hdr->ip_csum, l3_hdr->ip_tot_len, htons(ip_len)); diff --git a/lib/dp-packet.c b/lib/dp-packet.c index ae8ab5800e4..35856bf5396 100644 --- a/lib/dp-packet.c +++ b/lib/dp-packet.c @@ -21,6 +21,7 @@ #include "dp-packet.h" #include "netdev-afxdp.h" #include "netdev-dpdk.h" +#include "netdev-provider.h" #include "openvswitch/dynamic-string.h" #include "util.h" @@ -530,3 +531,19 @@ dp_packet_compare_offsets(struct dp_packet *b1, struct dp_packet *b2, } return true; } + +/* Checks if the packet 'p' is compatible with netdev_ol_flags 'flags' + * and if not, updates the packet with the software fall back. */ +void +dp_packet_ol_send_prepare(struct dp_packet *p, uint64_t flags) +{ + if (dp_packet_hwol_tx_ip_csum(p)) { + if (dp_packet_ip_checksum_good(p)) { + dp_packet_hwol_reset_tx_ip_csum(p); + } else if (!(flags & NETDEV_TX_OFFLOAD_IPV4_CKSUM)) { + dp_packet_ip_set_header_csum(p); + dp_packet_ol_set_ip_csum_good(p); + dp_packet_hwol_reset_tx_ip_csum(p); + } + } +} diff --git a/lib/dp-packet.h b/lib/dp-packet.h index b3e6a5d10c7..af0a2b7f0db 100644 --- a/lib/dp-packet.h +++ b/lib/dp-packet.h @@ -25,6 +25,7 @@ #include #endif +#include "csum.h" #include "netdev-afxdp.h" #include "netdev-dpdk.h" #include "openvswitch/list.h" @@ -83,6 +84,8 @@ enum dp_packet_offload_mask { DEF_OL_FLAG(DP_PACKET_OL_TX_UDP_CKSUM, RTE_MBUF_F_TX_UDP_CKSUM, 0x400), /* Offload SCTP checksum. */ DEF_OL_FLAG(DP_PACKET_OL_TX_SCTP_CKSUM, RTE_MBUF_F_TX_SCTP_CKSUM, 0x800), + /* Offload IP checksum. 
*/ + DEF_OL_FLAG(DP_PACKET_OL_TX_IP_CKSUM, RTE_MBUF_F_TX_IP_CKSUM, 0x1000), /* Adding new field requires adding to DP_PACKET_OL_SUPPORTED_MASK. */ }; @@ -97,7 +100,8 @@ enum dp_packet_offload_mask { DP_PACKET_OL_TX_IPV6 | \ DP_PACKET_OL_TX_TCP_CKSUM | \ DP_PACKET_OL_TX_UDP_CKSUM | \ - DP_PACKET_OL_TX_SCTP_CKSUM) + DP_PACKET_OL_TX_SCTP_CKSUM | \ + DP_PACKET_OL_TX_IP_CKSUM) #define DP_PACKET_OL_TX_L4_MASK (DP_PACKET_OL_TX_TCP_CKSUM | \ DP_PACKET_OL_TX_UDP_CKSUM | \ @@ -239,6 +243,7 @@ static inline bool dp_packet_equal(const struct dp_packet *, bool dp_packet_compare_offsets(struct dp_packet *good, struct dp_packet *test, struct ds *err_str); +void dp_packet_ol_send_prepare(struct dp_packet *, uint64_t); /* Frees memory that 'b' points to, as well as 'b' itself. */ @@ -1030,6 +1035,26 @@ dp_packet_hwol_set_tx_ipv6(struct dp_packet *b) *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_IPV6; } +/* Returns 'true' if packet 'p' is marked for IPv4 checksum offloading. */ +static inline bool +dp_packet_hwol_tx_ip_csum(const struct dp_packet *p) +{ + return !!(*dp_packet_ol_flags_ptr(p) & DP_PACKET_OL_TX_IP_CKSUM); +} + +/* Marks packet 'p' for IPv4 checksum offloading. */ +static inline void +dp_packet_hwol_set_tx_ip_csum(struct dp_packet *p) +{ + *dp_packet_ol_flags_ptr(p) |= DP_PACKET_OL_TX_IP_CKSUM; +} + +static inline void +dp_packet_hwol_reset_tx_ip_csum(struct dp_packet *p) +{ + *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_TX_IP_CKSUM; +} + /* Mark packet 'b' for TCP checksum offloading. It implies that either * the packet 'b' is marked for IPv4 or IPv6 checksum offloading. */ static inline void @@ -1063,13 +1088,31 @@ dp_packet_hwol_set_tcp_seg(struct dp_packet *b) *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG; } +/* Returns 'true' if the IP header has good integrity and the + * checksum in it is complete. 
*/ static inline bool -dp_packet_ip_checksum_valid(const struct dp_packet *p) +dp_packet_ip_checksum_good(const struct dp_packet *p) { return (*dp_packet_ol_flags_ptr(p) & DP_PACKET_OL_RX_IP_CKSUM_MASK) == DP_PACKET_OL_RX_IP_CKSUM_GOOD; } +/* Marks packet 'p' with good IPv4 checksum. */ +static inline void +dp_packet_ol_set_ip_csum_good(struct dp_packet *p) +{ + *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_RX_IP_CKSUM_BAD; + *dp_packet_ol_flags_ptr(p) |= DP_PACKET_OL_RX_IP_CKSUM_GOOD; +} + +/* Resets IP good checksum flag in packet 'p'. */ +static inline void +dp_packet_ol_reset_ip_csum_good(struct dp_packet *p) +{ + *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_RX_IP_CKSUM_GOOD; +} + +/* Returns 'true' if packet 'p' is marked with a bad IPv4 checksum. */ static inline bool dp_packet_ip_checksum_bad(const struct dp_packet *p) { @@ -1077,8 +1120,21 @@ dp_packet_ip_checksum_bad(const struct dp_packet *p) DP_PACKET_OL_RX_IP_CKSUM_BAD; } +/* Calculate and set the IPv4 header checksum in packet 'p'. */ +static inline void +dp_packet_ip_set_header_csum(struct dp_packet *p) +{ + struct ip_header *ip = dp_packet_l3(p); + + ovs_assert(ip); + ip->ip_csum = 0; + ip->ip_csum = csum(ip, sizeof *ip); +} + +/* Returns 'true' if the packet 'p' has good integrity and the + * checksum in it is correct. 
*/ static inline bool -dp_packet_l4_checksum_valid(const struct dp_packet *p) +dp_packet_l4_checksum_good(const struct dp_packet *p) { return (*dp_packet_ol_flags_ptr(p) & DP_PACKET_OL_RX_L4_CKSUM_MASK) == DP_PACKET_OL_RX_L4_CKSUM_GOOD; diff --git a/lib/dpif-netdev-extract-avx512.c b/lib/dpif-netdev-extract-avx512.c index 968845f2d3b..66884eaf041 100644 --- a/lib/dpif-netdev-extract-avx512.c +++ b/lib/dpif-netdev-extract-avx512.c @@ -698,6 +698,7 @@ mfex_ipv6_set_l2_pad_size(struct dp_packet *pkt, return -1; } dp_packet_set_l2_pad_size(pkt, len_from_ipv6 - (p_len + IPV6_HEADER_LEN)); + dp_packet_hwol_set_tx_ipv6(pkt); return 0; } @@ -728,6 +729,10 @@ mfex_ipv4_set_l2_pad_size(struct dp_packet *pkt, struct ip_header *nh, return -1; } dp_packet_set_l2_pad_size(pkt, len_from_ipv4 - ip_tot_len); + dp_packet_hwol_set_tx_ipv4(pkt); + if (dp_packet_ip_checksum_good(pkt)) { + dp_packet_hwol_set_tx_ip_csum(pkt); + } return 0; } diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 70b953ae6dd..abe63412ebf 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -7913,6 +7913,8 @@ dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_, ds_destroy(&ds); } + dp_packet_ol_send_prepare(packet_, 0); + return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata, actions, wc, put_actions, dp->upcall_aux); } diff --git a/lib/flow.c b/lib/flow.c index 9501a259e9d..9397c99254c 100644 --- a/lib/flow.c +++ b/lib/flow.c @@ -935,6 +935,10 @@ miniflow_extract(struct dp_packet *packet, struct miniflow *dst) nw_proto = nh->ip_proto; nw_frag = ipv4_get_nw_frag(nh); data_pull(&data, &size, ip_len); + dp_packet_hwol_set_tx_ipv4(packet); + if (dp_packet_ip_checksum_good(packet)) { + dp_packet_hwol_set_tx_ip_csum(packet); + } } else if (dl_type == htons(ETH_TYPE_IPV6)) { const struct ovs_16aligned_ip6_hdr *nh = data; ovs_be32 tc_flow; @@ -948,6 +952,7 @@ miniflow_extract(struct dp_packet *packet, struct miniflow *dst) } data_pull(&data, &size, sizeof *nh); + 
dp_packet_hwol_set_tx_ipv6(packet); plen = ntohs(nh->ip6_plen); dp_packet_set_l2_pad_size(packet, size - plen); size = plen; /* Never pull padding. */ @@ -3247,9 +3252,12 @@ packet_expand(struct dp_packet *p, const struct flow *flow, size_t size) struct ip_header *ip = dp_packet_l3(p); ip->ip_tot_len = htons(p->l4_ofs - p->l3_ofs + l4_len); - ip->ip_csum = 0; - ip->ip_csum = csum(ip, sizeof *ip); - + if (dp_packet_hwol_tx_ip_csum(p)) { + dp_packet_ol_reset_ip_csum_good(p); + } else { + dp_packet_ip_set_header_csum(p); + dp_packet_ol_set_ip_csum_good(p); + } pseudo_hdr_csum = packet_csum_pseudoheader(ip); } else { /* ETH_TYPE_IPV6 */ struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(p); @@ -3339,6 +3347,7 @@ flow_compose(struct dp_packet *p, const struct flow *flow, /* Checksum has already been zeroed by put_zeros call. */ ip->ip_csum = csum(ip, sizeof *ip); + dp_packet_ol_set_ip_csum_good(p); pseudo_hdr_csum = packet_csum_pseudoheader(ip); flow_compose_l4_csum(p, flow, pseudo_hdr_csum); } else if (flow->dl_type == htons(ETH_TYPE_IPV6)) { diff --git a/lib/ipf.c b/lib/ipf.c index affd440f638..7d74e2c131e 100644 --- a/lib/ipf.c +++ b/lib/ipf.c @@ -433,7 +433,9 @@ ipf_reassemble_v4_frags(struct ipf_list *ipf_list) len += rest_len; l3 = dp_packet_l3(pkt); ovs_be16 new_ip_frag_off = l3->ip_frag_off & ~htons(IP_MORE_FRAGMENTS); - if (!dp_packet_hwol_is_ipv4(pkt)) { + if (dp_packet_hwol_tx_ip_csum(pkt)) { + dp_packet_ol_reset_ip_csum_good(pkt); + } else { l3->ip_csum = recalc_csum16(l3->ip_csum, l3->ip_frag_off, new_ip_frag_off); l3->ip_csum = recalc_csum16(l3->ip_csum, l3->ip_tot_len, htons(len)); @@ -608,8 +610,7 @@ ipf_is_valid_v4_frag(struct ipf *ipf, struct dp_packet *pkt) goto invalid_pkt; } - if (OVS_UNLIKELY(!dp_packet_ip_checksum_valid(pkt) - && !dp_packet_hwol_is_ipv4(pkt) + if (OVS_UNLIKELY(!dp_packet_ip_checksum_good(pkt) && csum(l3, ip_hdr_len) != 0)) { COVERAGE_INC(ipf_l3csum_err); goto invalid_pkt; @@ -1186,7 +1187,9 @@ ipf_post_execute_reass_pkts(struct ipf 
*ipf, } else { struct ip_header *l3_frag = dp_packet_l3(frag_i->pkt); struct ip_header *l3_reass = dp_packet_l3(pkt); - if (!dp_packet_hwol_is_ipv4(frag_i->pkt)) { + if (dp_packet_hwol_tx_ip_csum(frag_i->pkt)) { + dp_packet_ol_reset_ip_csum_good(frag_i->pkt); + } else { ovs_be32 reass_ip = get_16aligned_be32(&l3_reass->ip_src); ovs_be32 frag_ip = diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 87b3f02972c..cac46eac781 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -411,8 +411,9 @@ enum dpdk_hw_ol_features { NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0, NETDEV_RX_HW_CRC_STRIP = 1 << 1, NETDEV_RX_HW_SCATTER = 1 << 2, - NETDEV_TX_TSO_OFFLOAD = 1 << 3, - NETDEV_TX_SCTP_CHECKSUM_OFFLOAD = 1 << 4, + NETDEV_TX_IPV4_CKSUM_OFFLOAD = 1 << 3, + NETDEV_TX_TSO_OFFLOAD = 1 << 4, + NETDEV_TX_SCTP_CHECKSUM_OFFLOAD = 1 << 5, }; /* @@ -1039,6 +1040,10 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq) conf.rxmode.offloads |= RTE_ETH_RX_OFFLOAD_KEEP_CRC; } + if (dev->hw_ol_features & NETDEV_TX_IPV4_CKSUM_OFFLOAD) { + conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_IPV4_CKSUM; + } + if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) { conf.txmode.offloads |= DPDK_TX_TSO_OFFLOAD_FLAGS; if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) { @@ -1179,6 +1184,12 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev) dev->hw_ol_features &= ~NETDEV_RX_HW_SCATTER; } + if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_IPV4_CKSUM) { + dev->hw_ol_features |= NETDEV_TX_IPV4_CKSUM_OFFLOAD; + } else { + dev->hw_ol_features &= ~NETDEV_TX_IPV4_CKSUM_OFFLOAD; + } + dev->hw_ol_features &= ~NETDEV_TX_TSO_OFFLOAD; if (userspace_tso_enabled()) { if ((info.tx_offload_capa & tx_tso_offload_capa) @@ -2227,13 +2238,16 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) { struct dp_packet *pkt = CONTAINER_OF(mbuf, struct dp_packet, mbuf); - if (mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK) { - mbuf->l2_len = (char *)dp_packet_l3(pkt) - (char *)dp_packet_eth(pkt); - 
mbuf->l3_len = (char *)dp_packet_l4(pkt) - (char *)dp_packet_l3(pkt); - mbuf->outer_l2_len = 0; - mbuf->outer_l3_len = 0; + if (!(mbuf->ol_flags & (RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_L4_MASK + | RTE_MBUF_F_TX_TCP_SEG))) { + return true; } + mbuf->l2_len = (char *) dp_packet_l3(pkt) - (char *) dp_packet_eth(pkt); + mbuf->l3_len = (char *) dp_packet_l4(pkt) - (char *) dp_packet_l3(pkt); + mbuf->outer_l2_len = 0; + mbuf->outer_l3_len = 0; + if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) { struct tcp_header *th = dp_packet_l4(pkt); @@ -2292,13 +2306,11 @@ netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid, uint32_t nb_tx = 0; uint16_t nb_tx_prep = cnt; - if (userspace_tso_enabled()) { - nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt); - if (nb_tx_prep != cnt) { - VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. " - "Only %u/%u are valid: %s", dev->up.name, nb_tx_prep, - cnt, rte_strerror(rte_errno)); - } + nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt); + if (nb_tx_prep != cnt) { + VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. 
" + "Only %u/%u are valid: %s", netdev_get_name(&dev->up), + nb_tx_prep, cnt, rte_strerror(rte_errno)); } while (nb_tx != nb_tx_prep) { @@ -2637,11 +2649,19 @@ dpdk_copy_dp_packet_to_mbuf(struct rte_mempool *mp, struct dp_packet *pkt_orig) memcpy(&pkt_dest->l2_pad_size, &pkt_orig->l2_pad_size, sizeof(struct dp_packet) - offsetof(struct dp_packet, l2_pad_size)); - if (mbuf_dest->ol_flags & RTE_MBUF_F_TX_L4_MASK) { - mbuf_dest->l2_len = (char *)dp_packet_l3(pkt_dest) - - (char *)dp_packet_eth(pkt_dest); - mbuf_dest->l3_len = (char *)dp_packet_l4(pkt_dest) + if (dp_packet_l3(pkt_dest)) { + if (dp_packet_eth(pkt_dest)) { + mbuf_dest->l2_len = (char *) dp_packet_l3(pkt_dest) + - (char *) dp_packet_eth(pkt_dest); + } else { + mbuf_dest->l2_len = 0; + } + if (dp_packet_l4(pkt_dest)) { + mbuf_dest->l3_len = (char *) dp_packet_l4(pkt_dest) - (char *) dp_packet_l3(pkt_dest); + } else { + mbuf_dest->l3_len = 0; + } } return pkt_dest; @@ -2699,11 +2719,9 @@ netdev_dpdk_common_send(struct netdev *netdev, struct dp_packet_batch *batch, pkt_cnt = cnt; /* Prepare each mbuf for hardware offloading. */ - if (userspace_tso_enabled()) { - cnt = netdev_dpdk_prep_hwol_batch(dev, pkts, pkt_cnt); - stats->tx_invalid_hwol_drops += pkt_cnt - cnt; - pkt_cnt = cnt; - } + cnt = netdev_dpdk_prep_hwol_batch(dev, pkts, pkt_cnt); + stats->tx_invalid_hwol_drops += pkt_cnt - cnt; + pkt_cnt = cnt; /* Apply Quality of Service policy. 
*/ cnt = netdev_dpdk_qos_run(dev, pkts, pkt_cnt, true); @@ -5260,6 +5278,13 @@ netdev_dpdk_reconfigure(struct netdev *netdev) } err = dpdk_eth_dev_init(dev); + + if (dev->hw_ol_features & NETDEV_TX_IPV4_CKSUM_OFFLOAD) { + netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM; + } else { + netdev->ol_flags &= ~NETDEV_TX_OFFLOAD_IPV4_CKSUM; + } + if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) { netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO; netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM; diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c index 7467e9fbcb9..1a54add87f0 100644 --- a/lib/netdev-dummy.c +++ b/lib/netdev-dummy.c @@ -147,6 +147,11 @@ struct netdev_dummy { int requested_n_txq OVS_GUARDED; int requested_n_rxq OVS_GUARDED; int requested_numa_id OVS_GUARDED; + + /* Enable netdev IP csum offload. */ + bool ol_ip_csum OVS_GUARDED; + /* Flag RX packet with good csum. */ + bool ol_ip_csum_set_good OVS_GUARDED; }; /* Max 'recv_queue_len' in struct netdev_dummy. */ @@ -914,6 +919,13 @@ netdev_dummy_set_config(struct netdev *netdev_, const struct smap *args, } } + netdev->ol_ip_csum_set_good = smap_get_bool(args, "ol_ip_csum_set_good", + false); + netdev->ol_ip_csum = smap_get_bool(args, "ol_ip_csum", false); + if (netdev->ol_ip_csum) { + netdev_->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM; + } + netdev_change_seq_changed(netdev_); /* 'dummy-pmd' specific config. */ @@ -1092,6 +1104,10 @@ netdev_dummy_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch, netdev->rxq_stats[rxq_->queue_id].bytes += dp_packet_size(packet); netdev->custom_stats[0].value++; netdev->custom_stats[1].value++; + if (netdev->ol_ip_csum_set_good) { + /* The netdev hardware sets the flag when the packet has good csum. 
*/ + dp_packet_ol_set_ip_csum_good(packet); + } ovs_mutex_unlock(&netdev->mutex); dp_packet_batch_init_packet(batch, packet); @@ -1173,6 +1189,12 @@ netdev_dummy_send(struct netdev *netdev, int qid, } } + if (dp_packet_hwol_tx_ip_csum(packet) && + !dp_packet_ip_checksum_good(packet)) { + dp_packet_ip_set_header_csum(packet); + dp_packet_ol_set_ip_csum_good(packet); + } + ovs_mutex_lock(&dev->mutex); dev->stats.tx_packets++; dev->txq_stats[qid].packets++; diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c index c2c6ca55957..72d24459854 100644 --- a/lib/netdev-native-tnl.c +++ b/lib/netdev-native-tnl.c @@ -88,7 +88,10 @@ netdev_tnl_ip_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl, ovs_be32 ip_src, ip_dst; - if (OVS_UNLIKELY(!dp_packet_ip_checksum_valid(packet))) { + /* A packet coming from a network device might have the + * csum already checked. In this case, skip the check. */ + if (OVS_UNLIKELY(!dp_packet_ip_checksum_good(packet)) + && !dp_packet_hwol_tx_ip_csum(packet)) { if (csum(ip, IP_IHL(ip->ip_ihl_ver) * 4)) { VLOG_WARN_RL(&err_rl, "ip packet has invalid checksum"); return NULL; @@ -142,7 +145,8 @@ netdev_tnl_ip_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl, * * This function sets the IP header's ip_tot_len field (which should be zeroed * as part of 'header') and puts its value into '*ip_tot_size' as well. Also - * updates IP header checksum, as well as the l3 and l4 offsets in 'packet'. + * updates IP header checksum if not offloaded, as well as the l3 and l4 + * offsets in the 'packet'. * * Return pointer to the L4 header added to 'packet'. 
*/ void * @@ -168,11 +172,16 @@ netdev_tnl_push_ip_header(struct dp_packet *packet, const void *header, ip6->ip6_plen = htons(*ip_tot_size); packet_set_ipv6_flow_label(&ip6->ip6_flow, ipv6_label); packet->l4_ofs = dp_packet_size(packet) - *ip_tot_size; + dp_packet_hwol_set_tx_ipv6(packet); + dp_packet_ol_reset_ip_csum_good(packet); return ip6 + 1; } else { ip = netdev_tnl_ip_hdr(eth); ip->ip_tot_len = htons(*ip_tot_size); - ip->ip_csum = recalc_csum16(ip->ip_csum, 0, ip->ip_tot_len); + /* Postpone checksum to when the packet is pushed to the port. */ + dp_packet_hwol_set_tx_ipv4(packet); + dp_packet_hwol_set_tx_ip_csum(packet); + dp_packet_ol_reset_ip_csum_good(packet); *ip_tot_size -= IP_HEADER_LEN; packet->l4_ofs = dp_packet_size(packet) - *ip_tot_size; return ip + 1; @@ -191,7 +200,7 @@ udp_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl, } if (udp->udp_csum) { - if (OVS_UNLIKELY(!dp_packet_l4_checksum_valid(packet))) { + if (OVS_UNLIKELY(!dp_packet_l4_checksum_good(packet))) { uint32_t csum; if (netdev_tnl_is_header_ipv6(dp_packet_data(packet))) { csum = packet_csum_pseudoheader6(dp_packet_l3(packet)); @@ -299,8 +308,8 @@ netdev_tnl_ip_build_header(struct ovs_action_push_tnl *data, ip->ip_frag_off = (params->flow->tunnel.flags & FLOW_TNL_F_DONT_FRAGMENT) ? htons(IP_DF) : 0; - /* Checksum has already been zeroed by eth_build_header. */ - ip->ip_csum = csum(ip, sizeof *ip); + /* The checksum will be calculated when the headers are pushed + * to the packet if offloading is not enabled. */ data->header_len += IP_HEADER_LEN; return ip + 1; diff --git a/lib/netdev.c b/lib/netdev.c index 79b17d00662..b86afbf36d4 100644 --- a/lib/netdev.c +++ b/lib/netdev.c @@ -808,6 +808,14 @@ netdev_send_prepare_packet(const uint64_t netdev_flags, return false; } + /* Packet with IP csum offloading enabled was received with verified csum. + * Leave the IP csum offloading enabled even with good checksum to the + * netdev to decide what would be the best to do. 
+ * Provide a software fallback in case the device doesn't support IP csum + * offloading. Note: Encapsulated packet must have the inner IP header + * csum already calculated. */ + dp_packet_ol_send_prepare(packet, netdev_flags); + l4_mask = dp_packet_hwol_l4_mask(packet); if (l4_mask) { if (dp_packet_hwol_l4_is_tcp(packet)) { @@ -975,7 +983,15 @@ netdev_push_header(const struct netdev *netdev, "not supported: packet dropped", netdev_get_name(netdev)); } else { + /* The packet is going to be encapsulated and there is + * no support yet for inner network header csum offloading. */ + if (dp_packet_hwol_tx_ip_csum(packet) + && !dp_packet_ip_checksum_good(packet)) { + dp_packet_ip_set_header_csum(packet); + } + netdev->netdev_class->push_header(netdev, packet, data); + pkt_metadata_init(&packet->md, data->out_port); dp_packet_batch_refill(batch, packet, i); } diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c index c28461ec1a0..9597f3554ce 100644 --- a/lib/odp-execute-avx512.c +++ b/lib/odp-execute-avx512.c @@ -450,7 +450,6 @@ action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch, DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { struct ip_header *nh = dp_packet_l3(packet); - ovs_be16 old_csum = ~nh->ip_csum; /* Load the 20 bytes of the IPv4 header. Without options, which is the * most common case it's 20 bytes, but can be up to 60 bytes. */ @@ -463,13 +462,20 @@ action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch, * (v_pkt_masked). */ __m256i v_new_hdr = _mm256_or_si256(v_key_shuf, v_pkt_masked); - /* Update the IP checksum based on updated IP values. */ - uint16_t delta = avx512_ipv4_hdr_csum_delta(v_packet, v_new_hdr); - uint32_t new_csum = old_csum + delta; - delta = csum_finish(new_csum); + if (dp_packet_hwol_tx_ip_csum(packet)) { + dp_packet_ol_reset_ip_csum_good(packet); + } else { + ovs_be16 old_csum = ~nh->ip_csum; - /* Insert new checksum. 
*/ - v_new_hdr = _mm256_insert_epi16(v_new_hdr, delta, 5); + /* Update the IP checksum based on updated IP values. */ + uint16_t delta = avx512_ipv4_hdr_csum_delta(v_packet, v_new_hdr); + uint32_t new_csum = old_csum + delta; + + delta = csum_finish(new_csum); + + /* Insert new checksum. */ + v_new_hdr = _mm256_insert_epi16(v_new_hdr, delta, 5); + } /* If ip_src or ip_dst has been modified, L4 checksum needs to * be updated too. */ diff --git a/lib/odp-execute.c b/lib/odp-execute.c index 5cf6fbec09a..37f0f717af6 100644 --- a/lib/odp-execute.c +++ b/lib/odp-execute.c @@ -169,9 +169,14 @@ odp_set_ipv4(struct dp_packet *packet, const struct ovs_key_ipv4 *key, new_tos = key->ipv4_tos | (nh->ip_tos & ~mask->ipv4_tos); if (nh->ip_tos != new_tos) { - nh->ip_csum = recalc_csum16(nh->ip_csum, - htons((uint16_t) nh->ip_tos), - htons((uint16_t) new_tos)); + if (dp_packet_hwol_tx_ip_csum(packet)) { + dp_packet_ol_reset_ip_csum_good(packet); + } else { + nh->ip_csum = recalc_csum16(nh->ip_csum, + htons((uint16_t) nh->ip_tos), + htons((uint16_t) new_tos)); + } + nh->ip_tos = new_tos; } } @@ -180,8 +185,14 @@ odp_set_ipv4(struct dp_packet *packet, const struct ovs_key_ipv4 *key, new_ttl = key->ipv4_ttl | (nh->ip_ttl & ~mask->ipv4_ttl); if (OVS_LIKELY(nh->ip_ttl != new_ttl)) { - nh->ip_csum = recalc_csum16(nh->ip_csum, htons(nh->ip_ttl << 8), - htons(new_ttl << 8)); + if (dp_packet_hwol_tx_ip_csum(packet)) { + dp_packet_ol_reset_ip_csum_good(packet); + } else { + nh->ip_csum = recalc_csum16(nh->ip_csum, + htons(nh->ip_ttl << 8), + htons(new_ttl << 8)); + } + nh->ip_ttl = new_ttl; } } diff --git a/lib/packets.c b/lib/packets.c index 7e5a52fd40e..a4ccc21f823 100644 --- a/lib/packets.c +++ b/lib/packets.c @@ -1144,7 +1144,12 @@ packet_set_ipv4_addr(struct dp_packet *packet, } } } - nh->ip_csum = recalc_csum32(nh->ip_csum, old_addr, new_addr); + + if (dp_packet_hwol_tx_ip_csum(packet)) { + dp_packet_ol_reset_ip_csum_good(packet); + } else { + nh->ip_csum = recalc_csum32(nh->ip_csum, 
old_addr, new_addr); + } put_16aligned_be32(addr, new_addr); } @@ -1311,16 +1316,26 @@ packet_set_ipv4(struct dp_packet *packet, ovs_be32 src, ovs_be32 dst, if (nh->ip_tos != tos) { uint8_t *field = &nh->ip_tos; - nh->ip_csum = recalc_csum16(nh->ip_csum, htons((uint16_t) *field), - htons((uint16_t) tos)); + if (dp_packet_hwol_tx_ip_csum(packet)) { + dp_packet_ol_reset_ip_csum_good(packet); + } else { + nh->ip_csum = recalc_csum16(nh->ip_csum, htons((uint16_t) *field), + htons((uint16_t) tos)); + } + *field = tos; } if (nh->ip_ttl != ttl) { uint8_t *field = &nh->ip_ttl; - nh->ip_csum = recalc_csum16(nh->ip_csum, htons(*field << 8), - htons(ttl << 8)); + if (dp_packet_hwol_tx_ip_csum(packet)) { + dp_packet_ol_reset_ip_csum_good(packet); + } else { + nh->ip_csum = recalc_csum16(nh->ip_csum, htons(*field << 8), + htons(ttl << 8)); + } + *field = ttl; } } @@ -1931,8 +1946,13 @@ IP_ECN_set_ce(struct dp_packet *pkt, bool is_ipv6) tos |= IP_ECN_CE; if (nh->ip_tos != tos) { - nh->ip_csum = recalc_csum16(nh->ip_csum, htons(nh->ip_tos), - htons((uint16_t) tos)); + if (dp_packet_hwol_tx_ip_csum(pkt)) { + dp_packet_ol_reset_ip_csum_good(pkt); + } else { + nh->ip_csum = recalc_csum16(nh->ip_csum, htons(nh->ip_tos), + htons((uint16_t) tos)); + } + nh->ip_tos = tos; } } diff --git a/tests/dpif-netdev.at b/tests/dpif-netdev.at index 60d789bebf2..67adf27fb19 100644 --- a/tests/dpif-netdev.at +++ b/tests/dpif-netdev.at @@ -734,3 +734,81 @@ AT_CHECK([test `ovs-vsctl get Interface p2 statistics:tx_q0_packets` -gt 0 -a dn OVS_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([userspace offload - ip csum offload]) +OVS_VSWITCHD_START( + [add-br br1 -- set bridge br1 datapath-type=dummy -- \ + add-port br1 p1 -- \ + set Interface p1 type=dummy -- \ + add-port br1 p2 -- \ + set Interface p2 type=dummy --]) + +# Modify the ip_dst addr to force changing the IP csum. +AT_CHECK([ovs-ofctl add-flow br1 in_port=p1,actions=mod_nw_dst:192.168.1.1,output:p2]) + +# Check if no offload remains ok. 
+AT_CHECK([ovs-vsctl set Interface p2 options:tx_pcap=p2.pcap]) +AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum=false]) +AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum_set_good=false]) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 \ +0a8f394fe0738abf7e2f058408004500003433e0400040068f8fc0a87b02c0a87b01d4781451a962ad5417ed297b801000e547fd00000101080a2524d2345c7fe1c4 +]) + +# Checksum should change to 0x990 with ip_dst changed to 192.168.1.1 +# by the datapath while processing the packet. +AT_CHECK([ovs-pcap p2.pcap > p2.pcap.txt 2>&1]) +AT_CHECK([tail -n 1 p2.pcap.txt], [0], [dnl +0a8f394fe0738abf7e2f058408004500003433e0400040060990c0a87b02c0a80101d4781451a962ad5417ed297b801000e5c1fd00000101080a2524d2345c7fe1c4 +]) + +# Check if packets entering the datapath with csum offloading +# enabled gets the csum updated properly by egress handling +# in the datapath and not by the netdev. +AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum=false]) +AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum_set_good=true]) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 \ +0a8f394fe0738abf7e2f058408004500003433e0400040068f8fc0a87b02c0a87b01d4781451a962ad5417ed297b801000e547fd00000101080a2524d2345c7fe1c4 +]) +AT_CHECK([ovs-pcap p2.pcap > p2.pcap.txt 2>&1]) +AT_CHECK([tail -n 1 p2.pcap.txt], [0], [dnl +0a8f394fe0738abf7e2f058408004500003433e0400040060990c0a87b02c0a80101d4781451a962ad5417ed297b801000e5c1fd00000101080a2524d2345c7fe1c4 +]) + +# Check if packets entering the datapath with csum offloading +# enabled gets the csum updated properly by netdev and not +# by the datapath. 
+AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum=true]) +AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum_set_good=true]) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 \ +0a8f394fe0738abf7e2f058408004500003433e0400040068f8fc0a87b02c0a87b01d4781451a962ad5417ed297b801000e547fd00000101080a2524d2345c7fe1c4 +]) +AT_CHECK([ovs-pcap p2.pcap > p2.pcap.txt 2>&1]) +AT_CHECK([tail -n 1 p2.pcap.txt], [0], [dnl +0a8f394fe0738abf7e2f058408004500003433e0400040060990c0a87b02c0a80101d4781451a962ad5417ed297b801000e5c1fd00000101080a2524d2345c7fe1c4 +]) + +# Push a packet with bad csum and offloading disabled to check +# if the datapath updates the csum, but does not fix the issue. +AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum=false]) +AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum_set_good=false]) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 \ +0a8f394fe0738abf7e2f058408004500003433e0400040068f03c0a87b02c0a87b01d4781451a962ad5417ed297b801000e547fd00000101080a2524d2345c7fe1c4 +]) +AT_CHECK([ovs-pcap p2.pcap > p2.pcap.txt 2>&1]) +AT_CHECK([tail -n 1 p2.pcap.txt], [0], [dnl +0a8f394fe0738abf7e2f058408004500003433e0400040060904c0a87b02c0a80101d4781451a962ad5417ed297b801000e5c1fd00000101080a2524d2345c7fe1c4 +]) + +# Push a packet with bad csum and offloading enabled to check +# if the driver updates and fixes the csum. 
+AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum=true]) +AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum_set_good=true]) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 \ +0a8f394fe0738abf7e2f058408004500003433e0400040068f03c0a87b02c0a87b01d4781451a962ad5417ed297b801000e547fd00000101080a2524d2345c7fe1c4 +]) +AT_CHECK([ovs-pcap p2.pcap > p2.pcap.txt 2>&1]) +AT_CHECK([tail -n 1 p2.pcap.txt], [0], [dnl +0a8f394fe0738abf7e2f058408004500003433e0400040060990c0a87b02c0a80101d4781451a962ad5417ed297b801000e5c1fd00000101080a2524d2345c7fe1c4 +]) +OVS_VSWITCHD_STOP +AT_CLEANUP From 3337e6d91c5b5657b3e91cff091289c4eb0aeba9 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Wed, 14 Jun 2023 15:03:27 -0400 Subject: [PATCH 278/833] userspace: Enable L4 checksum offloading by default. The netdev receiving packets is supposed to provide the flags indicating if the L4 checksum was verified and it is OK or BAD, otherwise the stack will check when appropriate by software. If the packet comes with a good checksum, then postpone the checksum calculation to the egress device if needed. When encapsulating a packet with that flag, set the checksum of the inner L4 header since that is not yet supported. Calculate the L4 checksum when the packet is going to be sent over a device that doesn't support the feature. Linux tap devices allow enabling L3 and L4 offload, so this patch enables the feature. However, the Linux socket interface remains disabled because the API doesn't allow enabling those two features without enabling TSO too. 
Signed-off-by: Flavio Leitner Co-authored-by: Flavio Leitner Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- NEWS | 6 +- lib/conntrack.c | 15 +- lib/dp-packet.c | 29 ++++ lib/dp-packet.h | 78 +++++++++- lib/dpif-netdev-extract-avx512.c | 62 +++++++- lib/flow.c | 23 +++ lib/netdev-dpdk.c | 172 +++++++++++++++------ lib/netdev-linux.c | 258 ++++++++++++++++++++++--------- lib/netdev-native-tnl.c | 32 +--- lib/netdev.c | 46 ++---- lib/odp-execute-avx512.c | 88 +++++++---- lib/packets.c | 175 ++++++++++++++++----- lib/packets.h | 3 + 13 files changed, 717 insertions(+), 270 deletions(-) diff --git a/NEWS b/NEWS index d7d0f3f1528..66d5a4ea375 100644 --- a/NEWS +++ b/NEWS @@ -37,9 +37,9 @@ Post-v3.1.0 - SRv6 Tunnel Protocol * Added support for userspace datapath (only). - Userspace datapath: - * IP checksum offload support is now enabled by default for interfaces - that support it. See the 'status' column in the 'interface' table to - check the status. + * IP and L4 checksum offload support is now enabled by default for + interfaces that support it. See the 'status' column in the 'interface' + table to check the status. v3.1.0 - 16 Feb 2023 diff --git a/lib/conntrack.c b/lib/conntrack.c index 78c3e578cb2..f5ebfa05bad 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -2060,13 +2060,12 @@ conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type, } if (ok) { - bool hwol_bad_l4_csum = dp_packet_l4_checksum_bad(pkt); - if (!hwol_bad_l4_csum) { - bool hwol_good_l4_csum = dp_packet_l4_checksum_good(pkt) - || dp_packet_hwol_tx_l4_checksum(pkt); + if (!dp_packet_l4_checksum_bad(pkt)) { /* Validate the checksum only when hwol is not supported. 
*/ if (extract_l4(&ctx->key, l4, dp_packet_l4_size(pkt), - &ctx->icmp_related, l3, !hwol_good_l4_csum, + &ctx->icmp_related, l3, + !dp_packet_l4_checksum_good(pkt) && + !dp_packet_hwol_tx_l4_checksum(pkt), NULL)) { ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis); return true; @@ -3395,8 +3394,10 @@ handle_ftp_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx, adj_seqnum(&th->tcp_seq, ec->seq_skew); } - th->tcp_csum = 0; - if (!dp_packet_hwol_tx_l4_checksum(pkt)) { + if (dp_packet_hwol_tx_l4_checksum(pkt)) { + dp_packet_ol_reset_l4_csum_good(pkt); + } else { + th->tcp_csum = 0; if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) { th->tcp_csum = packet_csum_upperlayer6(nh6, th, ctx->key.nw_proto, dp_packet_l4_size(pkt)); diff --git a/lib/dp-packet.c b/lib/dp-packet.c index 35856bf5396..27114a9a998 100644 --- a/lib/dp-packet.c +++ b/lib/dp-packet.c @@ -38,6 +38,9 @@ dp_packet_init__(struct dp_packet *b, size_t allocated, enum dp_packet_source so dp_packet_init_specific(b); /* By default assume the packet type to be Ethernet. */ b->packet_type = htonl(PT_ETH); + /* Reset csum start and offset. 
*/ + b->csum_start = 0; + b->csum_offset = 0; } static void @@ -546,4 +549,30 @@ dp_packet_ol_send_prepare(struct dp_packet *p, uint64_t flags) dp_packet_hwol_reset_tx_ip_csum(p); } } + + if (!dp_packet_hwol_tx_l4_checksum(p)) { + return; + } + + if (dp_packet_l4_checksum_good(p)) { + dp_packet_hwol_reset_tx_l4_csum(p); + return; + } + + if (dp_packet_hwol_l4_is_tcp(p) + && !(flags & NETDEV_TX_OFFLOAD_TCP_CKSUM)) { + packet_tcp_complete_csum(p); + dp_packet_ol_set_l4_csum_good(p); + dp_packet_hwol_reset_tx_l4_csum(p); + } else if (dp_packet_hwol_l4_is_udp(p) + && !(flags & NETDEV_TX_OFFLOAD_UDP_CKSUM)) { + packet_udp_complete_csum(p); + dp_packet_ol_set_l4_csum_good(p); + dp_packet_hwol_reset_tx_l4_csum(p); + } else if (!(flags & NETDEV_TX_OFFLOAD_SCTP_CKSUM) + && dp_packet_hwol_l4_is_sctp(p)) { + packet_sctp_complete_csum(p); + dp_packet_ol_set_l4_csum_good(p); + dp_packet_hwol_reset_tx_l4_csum(p); + } } diff --git a/lib/dp-packet.h b/lib/dp-packet.h index af0a2b7f0db..70ddf8aa45a 100644 --- a/lib/dp-packet.h +++ b/lib/dp-packet.h @@ -140,6 +140,8 @@ struct dp_packet { or UINT16_MAX. */ uint32_t cutlen; /* length in bytes to cut from the end. */ ovs_be32 packet_type; /* Packet type as defined in OpenFlow */ + uint16_t csum_start; /* Position to start checksumming from. */ + uint16_t csum_offset; /* Offset to place checksum. */ union { struct pkt_metadata md; uint64_t data[DP_PACKET_CONTEXT_SIZE / 8]; @@ -997,6 +999,13 @@ dp_packet_hwol_is_ipv4(const struct dp_packet *b) return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_IPV4); } +/* Returns 'true' if packet 'p' is marked as IPv6. */ +static inline bool +dp_packet_hwol_tx_ipv6(const struct dp_packet *p) +{ + return !!(*dp_packet_ol_flags_ptr(p) & DP_PACKET_OL_TX_IPV6); +} + /* Returns 'true' if packet 'b' is marked for TCP checksum offloading. 
*/ static inline bool dp_packet_hwol_l4_is_tcp(const struct dp_packet *b) @@ -1021,18 +1030,26 @@ dp_packet_hwol_l4_is_sctp(struct dp_packet *b) DP_PACKET_OL_TX_SCTP_CKSUM; } -/* Mark packet 'b' for IPv4 checksum offloading. */ static inline void -dp_packet_hwol_set_tx_ipv4(struct dp_packet *b) +dp_packet_hwol_reset_tx_l4_csum(struct dp_packet *p) +{ + *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_TX_L4_MASK; +} + +/* Mark packet 'p' as IPv4. */ +static inline void +dp_packet_hwol_set_tx_ipv4(struct dp_packet *p) { - *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_IPV4; + *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_TX_IPV6; + *dp_packet_ol_flags_ptr(p) |= DP_PACKET_OL_TX_IPV4; } -/* Mark packet 'b' for IPv6 checksum offloading. */ +/* Mark packet 'a' as IPv6. */ static inline void -dp_packet_hwol_set_tx_ipv6(struct dp_packet *b) +dp_packet_hwol_set_tx_ipv6(struct dp_packet *a) { - *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_IPV6; + *dp_packet_ol_flags_ptr(a) &= ~DP_PACKET_OL_TX_IPV4; + *dp_packet_ol_flags_ptr(a) |= DP_PACKET_OL_TX_IPV6; } /* Returns 'true' if packet 'p' is marked for IPv4 checksum offloading. */ @@ -1147,6 +1164,55 @@ dp_packet_l4_checksum_bad(const struct dp_packet *p) DP_PACKET_OL_RX_L4_CKSUM_BAD; } +/* Returns 'true' if the packet has good integrity though the + * checksum in the packet 'p' is not complete. */ +static inline bool +dp_packet_ol_l4_csum_partial(const struct dp_packet *p) +{ + return (*dp_packet_ol_flags_ptr(p) & DP_PACKET_OL_RX_L4_CKSUM_MASK) == + DP_PACKET_OL_RX_L4_CKSUM_MASK; +} + +/* Marks packet 'p' with good integrity though the checksum in the + * packet is not complete. */ +static inline void +dp_packet_ol_set_l4_csum_partial(struct dp_packet *p) +{ + *dp_packet_ol_flags_ptr(p) |= DP_PACKET_OL_RX_L4_CKSUM_MASK; +} + +/* Marks packet 'p' with good L4 checksum. 
*/ +static inline void +dp_packet_ol_set_l4_csum_good(struct dp_packet *p) +{ + *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_RX_L4_CKSUM_BAD; + *dp_packet_ol_flags_ptr(p) |= DP_PACKET_OL_RX_L4_CKSUM_GOOD; +} + +/* Marks packet 'p' with good L4 checksum as modified. */ +static inline void +dp_packet_ol_reset_l4_csum_good(struct dp_packet *p) +{ + if (!dp_packet_ol_l4_csum_partial(p)) { + *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_RX_L4_CKSUM_GOOD; + } +} + +/* Marks packet 'p' with good integrity if the 'start' and 'offset' + * match the 'csum_start' and 'csum_offset' in packet 'p'. + * The 'start' is the offset from the beginning of the packet headers. + * The 'offset' is the offset from start to place the checksum. + * The csum_start and csum_offset fields are set from the virtio_net_hdr + * struct that may be provided by a netdev on packet ingress. */ +static inline void +dp_packet_ol_l4_csum_check_partial(struct dp_packet *p, uint16_t start, + uint16_t offset) +{ + if (p->csum_start == start && p->csum_offset == offset) { + dp_packet_ol_set_l4_csum_partial(p); + } +} + static inline uint32_t ALWAYS_INLINE dp_packet_calc_hash_ipv4(const uint8_t *pkt, const uint16_t l3_ofs, uint32_t hash) diff --git a/lib/dpif-netdev-extract-avx512.c b/lib/dpif-netdev-extract-avx512.c index 66884eaf041..1bc7e8d0e08 100644 --- a/lib/dpif-netdev-extract-avx512.c +++ b/lib/dpif-netdev-extract-avx512.c @@ -698,7 +698,6 @@ mfex_ipv6_set_l2_pad_size(struct dp_packet *pkt, return -1; } dp_packet_set_l2_pad_size(pkt, len_from_ipv6 - (p_len + IPV6_HEADER_LEN)); - dp_packet_hwol_set_tx_ipv6(pkt); return 0; } @@ -729,10 +728,6 @@ mfex_ipv4_set_l2_pad_size(struct dp_packet *pkt, struct ip_header *nh, return -1; } dp_packet_set_l2_pad_size(pkt, len_from_ipv4 - ip_tot_len); - dp_packet_hwol_set_tx_ipv4(pkt); - if (dp_packet_ip_checksum_good(pkt)) { - dp_packet_hwol_set_tx_ip_csum(pkt); - } return 0; } @@ -763,6 +758,45 @@ mfex_check_tcp_data_offset(const struct tcp_header *tcp) return ret; } 
+static void +mfex_ipv4_set_hwol(struct dp_packet *pkt) +{ + dp_packet_hwol_set_tx_ipv4(pkt); + if (dp_packet_ip_checksum_good(pkt)) { + dp_packet_hwol_set_tx_ip_csum(pkt); + } +} + +static void +mfex_ipv6_set_hwol(struct dp_packet *pkt) +{ + dp_packet_hwol_set_tx_ipv6(pkt); +} + +static void +mfex_tcp_set_hwol(struct dp_packet *pkt) +{ + dp_packet_ol_l4_csum_check_partial(pkt, pkt->l4_ofs, + offsetof(struct tcp_header, + tcp_csum)); + if (dp_packet_l4_checksum_good(pkt) + || dp_packet_ol_l4_csum_partial(pkt)) { + dp_packet_hwol_set_csum_tcp(pkt); + } +} + +static void +mfex_udp_set_hwol(struct dp_packet *pkt) +{ + dp_packet_ol_l4_csum_check_partial(pkt, pkt->l4_ofs, + offsetof(struct udp_header, + udp_csum)); + if (dp_packet_l4_checksum_good(pkt) + || dp_packet_ol_l4_csum_partial(pkt)) { + dp_packet_hwol_set_csum_udp(pkt); + } +} + /* Generic loop to process any mfex profile. This code is specialized into * multiple actual MFEX implementation functions. Its marked ALWAYS_INLINE * to ensure the compiler specializes each instance. 
The code is marked "hot" @@ -864,6 +898,8 @@ mfex_avx512_process(struct dp_packet_batch *packets, const struct tcp_header *tcp = (void *)&pkt[38]; mfex_handle_tcp_flags(tcp, &blocks[7]); dp_packet_update_rss_hash_ipv4_tcp_udp(packet); + mfex_ipv4_set_hwol(packet); + mfex_tcp_set_hwol(packet); } break; case PROFILE_ETH_VLAN_IPV4_UDP: { @@ -876,6 +912,8 @@ mfex_avx512_process(struct dp_packet_batch *packets, continue; } dp_packet_update_rss_hash_ipv4_tcp_udp(packet); + mfex_ipv4_set_hwol(packet); + mfex_udp_set_hwol(packet); } break; case PROFILE_ETH_IPV4_TCP: { @@ -891,6 +929,8 @@ mfex_avx512_process(struct dp_packet_batch *packets, continue; } dp_packet_update_rss_hash_ipv4_tcp_udp(packet); + mfex_ipv4_set_hwol(packet); + mfex_tcp_set_hwol(packet); } break; case PROFILE_ETH_IPV4_UDP: { @@ -902,6 +942,8 @@ mfex_avx512_process(struct dp_packet_batch *packets, continue; } dp_packet_update_rss_hash_ipv4_tcp_udp(packet); + mfex_ipv4_set_hwol(packet); + mfex_udp_set_hwol(packet); } break; case PROFILE_ETH_IPV6_UDP: { @@ -920,6 +962,8 @@ mfex_avx512_process(struct dp_packet_batch *packets, /* Process UDP header. */ mfex_handle_ipv6_l4((void *)&pkt[54], &blocks[9]); dp_packet_update_rss_hash_ipv6_tcp_udp(packet); + mfex_ipv6_set_hwol(packet); + mfex_udp_set_hwol(packet); } break; case PROFILE_ETH_IPV6_TCP: { @@ -943,6 +987,8 @@ mfex_avx512_process(struct dp_packet_batch *packets, } mfex_handle_tcp_flags(tcp, &blocks[9]); dp_packet_update_rss_hash_ipv6_tcp_udp(packet); + mfex_ipv6_set_hwol(packet); + mfex_tcp_set_hwol(packet); } break; case PROFILE_ETH_VLAN_IPV6_TCP: { @@ -969,6 +1015,8 @@ mfex_avx512_process(struct dp_packet_batch *packets, } mfex_handle_tcp_flags(tcp, &blocks[10]); dp_packet_update_rss_hash_ipv6_tcp_udp(packet); + mfex_ipv6_set_hwol(packet); + mfex_tcp_set_hwol(packet); } break; case PROFILE_ETH_VLAN_IPV6_UDP: { @@ -990,6 +1038,8 @@ mfex_avx512_process(struct dp_packet_batch *packets, /* Process UDP header. 
*/ mfex_handle_ipv6_l4((void *)&pkt[58], &blocks[10]); dp_packet_update_rss_hash_ipv6_tcp_udp(packet); + mfex_ipv6_set_hwol(packet); + mfex_udp_set_hwol(packet); } break; case PROFILE_ETH_IPV4_NVGRE: { @@ -1000,6 +1050,8 @@ mfex_avx512_process(struct dp_packet_batch *packets, continue; } dp_packet_update_rss_hash_ipv4(packet); + mfex_ipv4_set_hwol(packet); + mfex_udp_set_hwol(packet); } break; default: diff --git a/lib/flow.c b/lib/flow.c index 9397c99254c..fe226cf0fe5 100644 --- a/lib/flow.c +++ b/lib/flow.c @@ -1054,6 +1054,13 @@ miniflow_extract(struct dp_packet *packet, struct miniflow *dst) } else if (dl_type == htons(ETH_TYPE_IPV6)) { dp_packet_update_rss_hash_ipv6_tcp_udp(packet); } + dp_packet_ol_l4_csum_check_partial(packet, packet->l4_ofs, + offsetof(struct tcp_header, + tcp_csum)); + if (dp_packet_l4_checksum_good(packet) + || dp_packet_ol_l4_csum_partial(packet)) { + dp_packet_hwol_set_csum_tcp(packet); + } } } } else if (OVS_LIKELY(nw_proto == IPPROTO_UDP)) { @@ -1069,6 +1076,13 @@ miniflow_extract(struct dp_packet *packet, struct miniflow *dst) } else if (dl_type == htons(ETH_TYPE_IPV6)) { dp_packet_update_rss_hash_ipv6_tcp_udp(packet); } + dp_packet_ol_l4_csum_check_partial(packet, packet->l4_ofs, + offsetof(struct udp_header, + udp_csum)); + if (dp_packet_l4_checksum_good(packet) + || dp_packet_ol_l4_csum_partial(packet)) { + dp_packet_hwol_set_csum_udp(packet); + } } } else if (OVS_LIKELY(nw_proto == IPPROTO_SCTP)) { if (OVS_LIKELY(size >= SCTP_HEADER_LEN)) { @@ -1078,6 +1092,13 @@ miniflow_extract(struct dp_packet *packet, struct miniflow *dst) miniflow_push_be16(mf, tp_dst, sctp->sctp_dst); miniflow_push_be16(mf, ct_tp_src, ct_tp_src); miniflow_push_be16(mf, ct_tp_dst, ct_tp_dst); + dp_packet_ol_l4_csum_check_partial(packet, packet->l4_ofs, + offsetof(struct sctp_header, + sctp_csum)); + if (dp_packet_l4_checksum_good(packet) + || dp_packet_ol_l4_csum_partial(packet)) { + dp_packet_hwol_set_csum_sctp(packet); + } } } else if (OVS_LIKELY(nw_proto 
== IPPROTO_ICMP)) { if (OVS_LIKELY(size >= ICMP_HEADER_LEN)) { @@ -3196,6 +3217,7 @@ flow_compose_l4_csum(struct dp_packet *p, const struct flow *flow, tcp->tcp_csum = 0; tcp->tcp_csum = csum_finish(csum_continue(pseudo_hdr_csum, tcp, l4_len)); + dp_packet_ol_set_l4_csum_good(p); } else if (flow->nw_proto == IPPROTO_UDP) { struct udp_header *udp = dp_packet_l4(p); @@ -3205,6 +3227,7 @@ flow_compose_l4_csum(struct dp_packet *p, const struct flow *flow, if (!udp->udp_csum) { udp->udp_csum = htons(0xffff); } + dp_packet_ol_set_l4_csum_good(p); } else if (flow->nw_proto == IPPROTO_ICMP) { struct icmp_header *icmp = dp_packet_l4(p); diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index cac46eac781..63dac689e38 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -412,8 +412,10 @@ enum dpdk_hw_ol_features { NETDEV_RX_HW_CRC_STRIP = 1 << 1, NETDEV_RX_HW_SCATTER = 1 << 2, NETDEV_TX_IPV4_CKSUM_OFFLOAD = 1 << 3, - NETDEV_TX_TSO_OFFLOAD = 1 << 4, - NETDEV_TX_SCTP_CHECKSUM_OFFLOAD = 1 << 5, + NETDEV_TX_TCP_CKSUM_OFFLOAD = 1 << 4, + NETDEV_TX_UDP_CKSUM_OFFLOAD = 1 << 5, + NETDEV_TX_SCTP_CKSUM_OFFLOAD = 1 << 6, + NETDEV_TX_TSO_OFFLOAD = 1 << 7, }; /* @@ -1008,6 +1010,37 @@ dpdk_watchdog(void *dummy OVS_UNUSED) return NULL; } +static void +netdev_dpdk_update_netdev_flag(struct netdev_dpdk *dev, + enum dpdk_hw_ol_features hw_ol_features, + enum netdev_ol_flags flag) + OVS_REQUIRES(dev->mutex) +{ + struct netdev *netdev = &dev->up; + + if (dev->hw_ol_features & hw_ol_features) { + netdev->ol_flags |= flag; + } else { + netdev->ol_flags &= ~flag; + } +} + +static void +netdev_dpdk_update_netdev_flags(struct netdev_dpdk *dev) + OVS_REQUIRES(dev->mutex) +{ + netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_IPV4_CKSUM_OFFLOAD, + NETDEV_TX_OFFLOAD_IPV4_CKSUM); + netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_TCP_CKSUM_OFFLOAD, + NETDEV_TX_OFFLOAD_TCP_CKSUM); + netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_UDP_CKSUM_OFFLOAD, + NETDEV_TX_OFFLOAD_UDP_CKSUM); + 
netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_SCTP_CKSUM_OFFLOAD, + NETDEV_TX_OFFLOAD_SCTP_CKSUM); + netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_TSO_OFFLOAD, + NETDEV_TX_OFFLOAD_TCP_TSO); +} + static int dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq) { @@ -1044,11 +1077,20 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq) conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_IPV4_CKSUM; } + if (dev->hw_ol_features & NETDEV_TX_TCP_CKSUM_OFFLOAD) { + conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_TCP_CKSUM; + } + + if (dev->hw_ol_features & NETDEV_TX_UDP_CKSUM_OFFLOAD) { + conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_UDP_CKSUM; + } + + if (dev->hw_ol_features & NETDEV_TX_SCTP_CKSUM_OFFLOAD) { + conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_SCTP_CKSUM; + } + if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) { - conf.txmode.offloads |= DPDK_TX_TSO_OFFLOAD_FLAGS; - if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) { - conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_SCTP_CKSUM; - } + conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_TCP_TSO; } /* Limit configured rss hash functions to only those supported @@ -1154,7 +1196,6 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev) struct rte_ether_addr eth_addr; int diag; int n_rxq, n_txq; - uint32_t tx_tso_offload_capa = DPDK_TX_TSO_OFFLOAD_FLAGS; uint32_t rx_chksm_offload_capa = RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_TCP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM; @@ -1190,18 +1231,28 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev) dev->hw_ol_features &= ~NETDEV_TX_IPV4_CKSUM_OFFLOAD; } + if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_TCP_CKSUM) { + dev->hw_ol_features |= NETDEV_TX_TCP_CKSUM_OFFLOAD; + } else { + dev->hw_ol_features &= ~NETDEV_TX_TCP_CKSUM_OFFLOAD; + } + + if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_UDP_CKSUM) { + dev->hw_ol_features |= NETDEV_TX_UDP_CKSUM_OFFLOAD; + } else { + dev->hw_ol_features &= ~NETDEV_TX_UDP_CKSUM_OFFLOAD; + } + + if (info.tx_offload_capa & 
RTE_ETH_TX_OFFLOAD_SCTP_CKSUM) { + dev->hw_ol_features |= NETDEV_TX_SCTP_CKSUM_OFFLOAD; + } else { + dev->hw_ol_features &= ~NETDEV_TX_SCTP_CKSUM_OFFLOAD; + } + dev->hw_ol_features &= ~NETDEV_TX_TSO_OFFLOAD; if (userspace_tso_enabled()) { - if ((info.tx_offload_capa & tx_tso_offload_capa) - == tx_tso_offload_capa) { + if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_TCP_TSO) { dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD; - if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_SCTP_CKSUM) { - dev->hw_ol_features |= NETDEV_TX_SCTP_CHECKSUM_OFFLOAD; - } else { - VLOG_WARN("%s: Tx SCTP checksum offload is not supported, " - "SCTP packets sent to this device will be dropped", - netdev_get_name(&dev->up)); - } } else { VLOG_WARN("%s: Tx TSO offload is not supported.", netdev_get_name(&dev->up)); @@ -2245,6 +2296,7 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) mbuf->l2_len = (char *) dp_packet_l3(pkt) - (char *) dp_packet_eth(pkt); mbuf->l3_len = (char *) dp_packet_l4(pkt) - (char *) dp_packet_l3(pkt); + mbuf->l4_len = 0; mbuf->outer_l2_len = 0; mbuf->outer_l3_len = 0; @@ -4181,6 +4233,7 @@ new_device(int vid) ovs_mutex_lock(&dev->mutex); if (nullable_string_is_equal(ifname, dev->vhost_id)) { uint32_t qp_num = rte_vhost_get_vring_num(vid) / VIRTIO_QNUM; + uint64_t features; /* Get NUMA information */ newnode = rte_vhost_get_numa_node(vid); @@ -4205,6 +4258,36 @@ new_device(int vid) dev->vhost_reconfigured = true; } + if (rte_vhost_get_negotiated_features(vid, &features)) { + VLOG_INFO("Error checking guest features for " + "vHost Device '%s'", dev->vhost_id); + } else { + if (features & (1ULL << VIRTIO_NET_F_GUEST_CSUM)) { + dev->hw_ol_features |= NETDEV_TX_TCP_CKSUM_OFFLOAD; + dev->hw_ol_features |= NETDEV_TX_UDP_CKSUM_OFFLOAD; + dev->hw_ol_features |= NETDEV_TX_SCTP_CKSUM_OFFLOAD; + } + + if (userspace_tso_enabled()) { + if (features & (1ULL << VIRTIO_NET_F_GUEST_TSO4) + && features & (1ULL << VIRTIO_NET_F_GUEST_TSO6)) { + + dev->hw_ol_features 
|= NETDEV_TX_TSO_OFFLOAD; + VLOG_DBG("%s: TSO enabled on vhost port", + netdev_get_name(&dev->up)); + } else { + VLOG_WARN("%s: Tx TSO offload is not supported.", + netdev_get_name(&dev->up)); + } + } + } + + /* There is no support in virtio net to offload IPv4 csum, + * but the vhost library handles IPv4 csum offloading fine. */ + dev->hw_ol_features |= NETDEV_TX_IPV4_CKSUM_OFFLOAD; + + netdev_dpdk_update_netdev_flags(dev); + ovsrcu_index_set(&dev->vid, vid); exists = true; @@ -4268,6 +4351,10 @@ destroy_device(int vid) dev->up.n_rxq * sizeof *dev->vhost_rxq_enabled); netdev_dpdk_txq_map_clear(dev); + /* Clear offload capabilities before next new_device. */ + dev->hw_ol_features = 0; + netdev_dpdk_update_netdev_flags(dev); + netdev_change_seq_changed(&dev->up); ovs_mutex_unlock(&dev->mutex); exists = true; @@ -5278,22 +5365,7 @@ netdev_dpdk_reconfigure(struct netdev *netdev) } err = dpdk_eth_dev_init(dev); - - if (dev->hw_ol_features & NETDEV_TX_IPV4_CKSUM_OFFLOAD) { - netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM; - } else { - netdev->ol_flags &= ~NETDEV_TX_OFFLOAD_IPV4_CKSUM; - } - - if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) { - netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO; - netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM; - netdev->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM; - netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM; - if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) { - netdev->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM; - } - } + netdev_dpdk_update_netdev_flags(dev); /* If both requested and actual hwaddr were previously * unset (initialized to 0), then first device init above @@ -5340,11 +5412,6 @@ dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev) memset(dev->sw_stats, 0, sizeof *dev->sw_stats); rte_spinlock_unlock(&dev->stats_lock); - if (userspace_tso_enabled()) { - dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD; - VLOG_DBG("%s: TSO enabled on vhost port", netdev_get_name(&dev->up)); - } - netdev_dpdk_remap_txqs(dev); if 
(netdev_dpdk_get_vid(dev) >= 0) { @@ -5365,6 +5432,8 @@ dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev) } } + netdev_dpdk_update_netdev_flags(dev); + return 0; } @@ -5386,8 +5455,6 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev) { struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); int err; - uint64_t vhost_flags = 0; - uint64_t vhost_unsup_flags; ovs_mutex_lock(&dev->mutex); @@ -5397,6 +5464,9 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev) * 2. A path has been specified. */ if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT) && dev->vhost_id) { + uint64_t virtio_unsup_features = 0; + uint64_t vhost_flags = 0; + /* Register client-mode device. */ vhost_flags |= RTE_VHOST_USER_CLIENT; @@ -5443,22 +5513,22 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev) } if (userspace_tso_enabled()) { - netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO; - netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM; - netdev->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM; - netdev->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM; - netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM; - vhost_unsup_flags = 1ULL << VIRTIO_NET_F_HOST_ECN - | 1ULL << VIRTIO_NET_F_HOST_UFO; + virtio_unsup_features = 1ULL << VIRTIO_NET_F_HOST_ECN + | 1ULL << VIRTIO_NET_F_HOST_UFO; + VLOG_DBG("%s: TSO enabled on vhost port", + netdev_get_name(&dev->up)); } else { - /* This disables checksum offloading and all the features - * that depends on it (TSO, UFO, ECN) according to virtio - * specification. */ - vhost_unsup_flags = 1ULL << VIRTIO_NET_F_CSUM; + /* Advertise checksum offloading to the guest, but explicitly + * disable TSO and friends. + * NOTE: we can't disable HOST_ECN which may have been wrongly + * negotiated by a running guest. 
*/ + virtio_unsup_features = 1ULL << VIRTIO_NET_F_HOST_TSO4 + | 1ULL << VIRTIO_NET_F_HOST_TSO6 + | 1ULL << VIRTIO_NET_F_HOST_UFO; } err = rte_vhost_driver_disable_features(dev->vhost_id, - vhost_unsup_flags); + virtio_unsup_features); if (err) { VLOG_ERR("rte_vhost_driver_disable_features failed for " "vhost user client port: %s\n", dev->up.name); diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 49c74346a42..3dba2ef1fe4 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -530,6 +530,11 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20); * changes in the device miimon status, so we can use atomic_count. */ static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0); +/* Very old kernels from the 2.6 era don't support vnet headers with the tun + * device. We can detect this while constructing a netdev, but need this for + * packet rx/tx. */ +static bool tap_supports_vnet_hdr = true; + static int netdev_linux_parse_vnet_hdr(struct dp_packet *b); static void netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu); static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *, @@ -938,14 +943,6 @@ netdev_linux_common_construct(struct netdev *netdev_) netnsid_unset(&netdev->netnsid); ovs_mutex_init(&netdev->mutex); - if (userspace_tso_enabled()) { - netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO; - netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM; - netdev_->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM; - netdev_->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM; - netdev_->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM; - } - return 0; } @@ -959,6 +956,16 @@ netdev_linux_construct(struct netdev *netdev_) return error; } + /* The socket interface doesn't offer the option to enable only + * csum offloading without TSO. 
*/ + if (userspace_tso_enabled()) { + netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO; + netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM; + netdev_->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM; + netdev_->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM; + netdev_->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM; + } + error = get_flags(&netdev->up, &netdev->ifi_flags); if (error == ENODEV) { if (netdev->up.netdev_class != &netdev_internal_class) { @@ -984,9 +991,12 @@ netdev_linux_construct(struct netdev *netdev_) static int netdev_linux_construct_tap(struct netdev *netdev_) { + static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; struct netdev_linux *netdev = netdev_linux_cast(netdev_); static const char tap_dev[] = "/dev/net/tun"; const char *name = netdev_->name; + unsigned long oflags; + unsigned int up; struct ifreq ifr; int error = netdev_linux_common_construct(netdev_); @@ -1004,8 +1014,21 @@ netdev_linux_construct_tap(struct netdev *netdev_) /* Create tap device. */ get_flags(&netdev->up, &netdev->ifi_flags); + + if (ovsthread_once_start(&once)) { + if (ioctl(netdev->tap_fd, TUNGETFEATURES, &up) == -1) { + VLOG_WARN("%s: querying tap features failed: %s", name, + ovs_strerror(errno)); + tap_supports_vnet_hdr = false; + } else if (!(up & IFF_VNET_HDR)) { + VLOG_WARN("TAP interfaces do not support virtio-net headers"); + tap_supports_vnet_hdr = false; + } + ovsthread_once_done(&once); + } + ifr.ifr_flags = IFF_TAP | IFF_NO_PI; - if (userspace_tso_enabled()) { + if (tap_supports_vnet_hdr) { ifr.ifr_flags |= IFF_VNET_HDR; } @@ -1030,21 +1053,23 @@ netdev_linux_construct_tap(struct netdev *netdev_) goto error_close; } + oflags = TUN_F_CSUM; if (userspace_tso_enabled()) { - /* Old kernels don't support TUNSETOFFLOAD. If TUNSETOFFLOAD is - * available, it will return EINVAL when a flag is unknown. - * Therefore, try enabling offload with no flags to check - * if TUNSETOFFLOAD support is available or not. 
*/ - if (ioctl(netdev->tap_fd, TUNSETOFFLOAD, 0) == 0 || errno != EINVAL) { - unsigned long oflags = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6; - - if (ioctl(netdev->tap_fd, TUNSETOFFLOAD, oflags) == -1) { - VLOG_WARN("%s: enabling tap offloading failed: %s", name, - ovs_strerror(errno)); - error = errno; - goto error_close; - } + oflags |= (TUN_F_TSO4 | TUN_F_TSO6); + } + + if (tap_supports_vnet_hdr + && ioctl(netdev->tap_fd, TUNSETOFFLOAD, oflags) == 0) { + netdev_->ol_flags |= (NETDEV_TX_OFFLOAD_IPV4_CKSUM + | NETDEV_TX_OFFLOAD_TCP_CKSUM + | NETDEV_TX_OFFLOAD_UDP_CKSUM); + + if (userspace_tso_enabled()) { + netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO; } + } else { + VLOG_INFO("%s: Disabling checksum and segment offloading due to " + "missing kernel support", name); } netdev->present = true; @@ -1344,18 +1369,23 @@ netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, int mtu, pkt = buffers[i]; } - if (virtio_net_hdr_size && netdev_linux_parse_vnet_hdr(pkt)) { - struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up); - struct netdev_linux *netdev = netdev_linux_cast(netdev_); + if (virtio_net_hdr_size) { + int ret = netdev_linux_parse_vnet_hdr(pkt); + if (OVS_UNLIKELY(ret)) { + struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up); + struct netdev_linux *netdev = netdev_linux_cast(netdev_); - /* Unexpected error situation: the virtio header is not present - * or corrupted. Drop the packet but continue in case next ones - * are correct. */ - dp_packet_delete(pkt); - netdev->rx_dropped += 1; - VLOG_WARN_RL(&rl, "%s: Dropped packet: Invalid virtio net header", - netdev_get_name(netdev_)); - continue; + /* Unexpected error situation: the virtio header is not + * present or corrupted or contains unsupported features. + * Drop the packet but continue in case next ones are + * correct. 
*/ + dp_packet_delete(pkt); + netdev->rx_dropped += 1; + VLOG_WARN_RL(&rl, "%s: Dropped packet: vnet header is missing " + "or corrupt: %s", netdev_get_name(netdev_), + ovs_strerror(ret)); + continue; + } } for (cmsg = CMSG_FIRSTHDR(&mmsgs[i].msg_hdr); cmsg; @@ -1413,10 +1443,13 @@ netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu, /* Use the buffer from the allocated packet below to receive MTU * sized packets and an aux_buf for extra TSO data. */ iovlen = IOV_TSO_SIZE; - virtio_net_hdr_size = sizeof(struct virtio_net_hdr); } else { /* Use only the buffer from the allocated packet. */ iovlen = IOV_STD_SIZE; + } + if (OVS_LIKELY(tap_supports_vnet_hdr)) { + virtio_net_hdr_size = sizeof(struct virtio_net_hdr); + } else { virtio_net_hdr_size = 0; } @@ -1462,7 +1495,8 @@ netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu, pkt = buffer; } - if (virtio_net_hdr_size && netdev_linux_parse_vnet_hdr(pkt)) { + if (OVS_LIKELY(virtio_net_hdr_size) && + netdev_linux_parse_vnet_hdr(pkt)) { struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up); struct netdev_linux *netdev = netdev_linux_cast(netdev_); @@ -1611,7 +1645,7 @@ netdev_linux_sock_batch_send(int sock, int ifindex, bool tso, int mtu, * on other interface types because we attach a socket filter to the rx * socket. 
*/ static int -netdev_linux_tap_batch_send(struct netdev *netdev_, bool tso, int mtu, +netdev_linux_tap_batch_send(struct netdev *netdev_, int mtu, struct dp_packet_batch *batch) { struct netdev_linux *netdev = netdev_linux_cast(netdev_); @@ -1632,7 +1666,7 @@ netdev_linux_tap_batch_send(struct netdev *netdev_, bool tso, int mtu, ssize_t retval; int error; - if (tso) { + if (OVS_LIKELY(tap_supports_vnet_hdr)) { netdev_linux_prepend_vnet_hdr(packet, mtu); } @@ -1765,7 +1799,7 @@ netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED, error = netdev_linux_sock_batch_send(sock, ifindex, tso, mtu, batch); } else { - error = netdev_linux_tap_batch_send(netdev_, tso, mtu, batch); + error = netdev_linux_tap_batch_send(netdev_, mtu, batch); } if (error) { if (error == ENOBUFS) { @@ -6846,53 +6880,76 @@ netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto) return 0; } +/* Initializes packet 'b' with features enabled in the prepended + * struct virtio_net_hdr. Returns 0 if successful, otherwise a + * positive errno value. */ static int netdev_linux_parse_vnet_hdr(struct dp_packet *b) { struct virtio_net_hdr *vnet = dp_packet_pull(b, sizeof *vnet); - uint16_t l4proto = 0; if (OVS_UNLIKELY(!vnet)) { - return -EINVAL; + return EINVAL; } if (vnet->flags == 0 && vnet->gso_type == VIRTIO_NET_HDR_GSO_NONE) { return 0; } - if (netdev_linux_parse_l2(b, &l4proto)) { - return -EINVAL; - } - if (vnet->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) { - if (l4proto == IPPROTO_TCP) { - dp_packet_hwol_set_csum_tcp(b); - } else if (l4proto == IPPROTO_UDP) { + uint16_t l4proto = 0; + + if (netdev_linux_parse_l2(b, &l4proto)) { + return EINVAL; + } + + if (l4proto == IPPROTO_UDP) { dp_packet_hwol_set_csum_udp(b); - } else if (l4proto == IPPROTO_SCTP) { - dp_packet_hwol_set_csum_sctp(b); } + /* The packet has offloaded checksum. However, there is no + * additional information like the protocol used, so it would + * require to parse the packet here. 
The checksum starting point + * and offset are going to be verified when the packet headers + * are parsed during miniflow extraction. */ + b->csum_start = (OVS_FORCE uint16_t) vnet->csum_start; + b->csum_offset = (OVS_FORCE uint16_t) vnet->csum_offset; + } else { + b->csum_start = 0; + b->csum_offset = 0; } - if (l4proto && vnet->gso_type != VIRTIO_NET_HDR_GSO_NONE) { - uint8_t allowed_mask = VIRTIO_NET_HDR_GSO_TCPV4 - | VIRTIO_NET_HDR_GSO_TCPV6 - | VIRTIO_NET_HDR_GSO_UDP; - uint8_t type = vnet->gso_type & allowed_mask; + int ret = 0; + switch (vnet->gso_type) { + case VIRTIO_NET_HDR_GSO_TCPV4: + case VIRTIO_NET_HDR_GSO_TCPV6: + /* FIXME: The packet has offloaded TCP segmentation. The gso_size + * is given and needs to be respected. */ + dp_packet_hwol_set_tcp_seg(b); + break; - if (type == VIRTIO_NET_HDR_GSO_TCPV4 - || type == VIRTIO_NET_HDR_GSO_TCPV6) { - dp_packet_hwol_set_tcp_seg(b); - } + case VIRTIO_NET_HDR_GSO_UDP: + /* UFO is not supported. */ + VLOG_WARN_RL(&rl, "Received an unsupported packet with UFO enabled."); + ret = ENOTSUP; + break; + + case VIRTIO_NET_HDR_GSO_NONE: + break; + + default: + ret = ENOTSUP; + VLOG_WARN_RL(&rl, "Received an unsupported packet with GSO type: 0x%x", + vnet->gso_type); } - return 0; + return ret; } static void netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu) { - struct virtio_net_hdr *vnet = dp_packet_push_zeros(b, sizeof *vnet); + struct virtio_net_hdr v; + struct virtio_net_hdr *vnet = &v; if (dp_packet_hwol_is_tso(b)) { uint16_t hdr_len = ((char *)dp_packet_l4(b) - (char *)dp_packet_eth(b)) @@ -6902,30 +6959,91 @@ netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu) vnet->gso_size = (OVS_FORCE __virtio16)(mtu - hdr_len); if (dp_packet_hwol_is_ipv4(b)) { vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; - } else { + } else if (dp_packet_hwol_tx_ipv6(b)) { vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; } } else { - vnet->flags = VIRTIO_NET_HDR_GSO_NONE; + vnet->hdr_len = 0; + vnet->gso_size = 0; + vnet->gso_type = 
VIRTIO_NET_HDR_GSO_NONE; } - if (dp_packet_hwol_l4_mask(b)) { - vnet->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; - vnet->csum_start = (OVS_FORCE __virtio16)((char *)dp_packet_l4(b) - - (char *)dp_packet_eth(b)); - + if (dp_packet_l4_checksum_good(b)) { + /* The packet has good L4 checksum. No need to validate again. */ + vnet->csum_start = vnet->csum_offset = (OVS_FORCE __virtio16) 0; + vnet->flags = VIRTIO_NET_HDR_F_DATA_VALID; + } else if (dp_packet_hwol_tx_l4_checksum(b)) { + /* The csum calculation is offloaded. */ if (dp_packet_hwol_l4_is_tcp(b)) { + /* Virtual I/O Device (VIRTIO) Version 1.1 + * 5.1.6.2 Packet Transmission + * If the driver negotiated VIRTIO_NET_F_CSUM, it can skip + * checksumming the packet: + * - flags has the VIRTIO_NET_HDR_F_NEEDS_CSUM set, + * - csum_start is set to the offset within the packet + * to begin checksumming, and + * - csum_offset indicates how many bytes after the + * csum_start the new (16 bit ones complement) checksum + * is placed by the device. + * The TCP checksum field in the packet is set to the sum of + * the TCP pseudo header, so that replacing it by the ones + * complement checksum of the TCP header and body will give + * the correct result. 
*/ + + struct tcp_header *tcp_hdr = dp_packet_l4(b); + ovs_be16 csum = 0; + if (dp_packet_hwol_is_ipv4(b)) { + const struct ip_header *ip_hdr = dp_packet_l3(b); + csum = ~csum_finish(packet_csum_pseudoheader(ip_hdr)); + } else if (dp_packet_hwol_tx_ipv6(b)) { + const struct ovs_16aligned_ip6_hdr *ip6_hdr = dp_packet_l3(b); + csum = ~csum_finish(packet_csum_pseudoheader6(ip6_hdr)); + } + + tcp_hdr->tcp_csum = csum; + vnet->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; + vnet->csum_start = (OVS_FORCE __virtio16) b->l4_ofs; vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof( struct tcp_header, tcp_csum); } else if (dp_packet_hwol_l4_is_udp(b)) { + struct udp_header *udp_hdr = dp_packet_l4(b); + ovs_be16 csum = 0; + + if (dp_packet_hwol_is_ipv4(b)) { + const struct ip_header *ip_hdr = dp_packet_l3(b); + csum = ~csum_finish(packet_csum_pseudoheader(ip_hdr)); + } else if (dp_packet_hwol_tx_ipv6(b)) { + const struct ovs_16aligned_ip6_hdr *ip6_hdr = dp_packet_l3(b); + csum = ~csum_finish(packet_csum_pseudoheader6(ip6_hdr)); + } + + udp_hdr->udp_csum = csum; + vnet->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; + vnet->csum_start = (OVS_FORCE __virtio16) b->l4_ofs; vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof( struct udp_header, udp_csum); } else if (dp_packet_hwol_l4_is_sctp(b)) { - vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof( - struct sctp_header, sctp_csum); + /* The Linux kernel networking stack only supports csum_start + * and csum_offset when SCTP GSO is enabled. See kernel's + * skb_csum_hwoffload_help(). Currently there is no SCTP + * segmentation offload support in OVS. */ + vnet->csum_start = vnet->csum_offset = (OVS_FORCE __virtio16) 0; + vnet->flags = 0; } else { - VLOG_WARN_RL(&rl, "Unsupported L4 protocol"); + /* This should only happen when DP_PACKET_OL_TX_L4_MASK includes + * a new flag that is not covered in above checks. */ + VLOG_WARN_RL(&rl, "Unsupported L4 checksum offload. 
" + "Flags: %"PRIu64, + (uint64_t)*dp_packet_ol_flags_ptr(b)); + vnet->csum_start = vnet->csum_offset = (OVS_FORCE __virtio16) 0; + vnet->flags = 0; } + } else { + /* Packet L4 csum is unknown. */ + vnet->csum_start = vnet->csum_offset = (OVS_FORCE __virtio16) 0; + vnet->flags = 0; } + + dp_packet_push(b, vnet, sizeof *vnet); } diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c index 72d24459854..715bbab2bec 100644 --- a/lib/netdev-native-tnl.c +++ b/lib/netdev-native-tnl.c @@ -225,28 +225,6 @@ udp_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl, return udp + 1; } -static void -netdev_tnl_calc_udp_csum(struct udp_header *udp, struct dp_packet *packet, - int ip_tot_size) -{ - uint32_t csum; - - if (netdev_tnl_is_header_ipv6(dp_packet_data(packet))) { - csum = packet_csum_pseudoheader6(netdev_tnl_ipv6_hdr( - dp_packet_data(packet))); - } else { - csum = packet_csum_pseudoheader(netdev_tnl_ip_hdr( - dp_packet_data(packet))); - } - - csum = csum_continue(csum, udp, ip_tot_size); - udp->udp_csum = csum_finish(csum); - - if (!udp->udp_csum) { - udp->udp_csum = htons(0xffff); - } -} - void netdev_tnl_push_udp_header(const struct netdev *netdev OVS_UNUSED, struct dp_packet *packet, @@ -262,8 +240,12 @@ netdev_tnl_push_udp_header(const struct netdev *netdev OVS_UNUSED, udp->udp_src = netdev_tnl_get_src_port(packet); udp->udp_len = htons(ip_tot_size); + /* Postpone checksum to the egress netdev. */ + dp_packet_hwol_set_csum_udp(packet); if (udp->udp_csum) { - netdev_tnl_calc_udp_csum(udp, packet, ip_tot_size); + dp_packet_ol_reset_l4_csum_good(packet); + } else { + dp_packet_ol_set_l4_csum_good(packet); } } @@ -793,7 +775,9 @@ netdev_gtpu_push_header(const struct netdev *netdev, &ip_tot_size, 0); udp->udp_src = netdev_tnl_get_src_port(packet); udp->udp_len = htons(ip_tot_size); - netdev_tnl_calc_udp_csum(udp, packet, ip_tot_size); + /* Postpone checksum to the egress netdev. 
*/ + dp_packet_hwol_set_csum_udp(packet); + dp_packet_ol_reset_l4_csum_good(packet); gtpuh = ALIGNED_CAST(struct gtpuhdr *, udp + 1); diff --git a/lib/netdev.c b/lib/netdev.c index b86afbf36d4..8df7f873715 100644 --- a/lib/netdev.c +++ b/lib/netdev.c @@ -799,8 +799,6 @@ static bool netdev_send_prepare_packet(const uint64_t netdev_flags, struct dp_packet *packet, char **errormsg) { - uint64_t l4_mask; - if (dp_packet_hwol_is_tso(packet) && !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) { /* Fall back to GSO in software. */ @@ -813,36 +811,16 @@ netdev_send_prepare_packet(const uint64_t netdev_flags, * netdev to decide what would be the best to do. * Provide a software fallback in case the device doesn't support IP csum * offloading. Note: Encapsulated packet must have the inner IP header + * csum already calculated. + * Packet with L4 csum offloading enabled was received with verified csum. + * Leave the L4 csum offloading enabled even with good checksum for the + * netdev to decide what would be the best to do. + * Netdev that requires pseudo header csum needs to calculate that. + * Provide a software fallback in case the netdev doesn't support L4 csum + * offloading. Note: Encapsulated packet must have the inner L4 header * csum already calculated. */ dp_packet_ol_send_prepare(packet, netdev_flags); - l4_mask = dp_packet_hwol_l4_mask(packet); - if (l4_mask) { - if (dp_packet_hwol_l4_is_tcp(packet)) { - if (!(netdev_flags & NETDEV_TX_OFFLOAD_TCP_CKSUM)) { - /* Fall back to TCP csum in software. */ - VLOG_ERR_BUF(errormsg, "No TCP checksum support"); - return false; - } - } else if (dp_packet_hwol_l4_is_udp(packet)) { - if (!(netdev_flags & NETDEV_TX_OFFLOAD_UDP_CKSUM)) { - /* Fall back to UDP csum in software. */ - VLOG_ERR_BUF(errormsg, "No UDP checksum support"); - return false; - } - } else if (dp_packet_hwol_l4_is_sctp(packet)) { - if (!(netdev_flags & NETDEV_TX_OFFLOAD_SCTP_CKSUM)) { - /* Fall back to SCTP csum in software. 
*/ - VLOG_ERR_BUF(errormsg, "No SCTP checksum support"); - return false; - } - } else { - VLOG_ERR_BUF(errormsg, "No L4 checksum support: mask: %"PRIu64, - l4_mask); - return false; - } - } - return true; } @@ -975,20 +953,16 @@ netdev_push_header(const struct netdev *netdev, size_t i, size = dp_packet_batch_size(batch); DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) { - if (OVS_UNLIKELY(dp_packet_hwol_is_tso(packet) - || dp_packet_hwol_l4_mask(packet))) { + if (OVS_UNLIKELY(dp_packet_hwol_is_tso(packet))) { COVERAGE_INC(netdev_push_header_drops); dp_packet_delete(packet); - VLOG_WARN_RL(&rl, "%s: Tunneling packets with HW offload flags is " + VLOG_WARN_RL(&rl, "%s: Tunneling packets with TSO is " "not supported: packet dropped", netdev_get_name(netdev)); } else { /* The packet is going to be encapsulated and there is * no support yet for inner network header csum offloading. */ - if (dp_packet_hwol_tx_ip_csum(packet) - && !dp_packet_ip_checksum_good(packet)) { - dp_packet_ip_set_header_csum(packet); - } + dp_packet_ol_send_prepare(packet, 0); netdev->netdev_class->push_header(netdev, packet, data); diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c index 9597f3554ce..747e04014ab 100644 --- a/lib/odp-execute-avx512.c +++ b/lib/odp-execute-avx512.c @@ -486,9 +486,11 @@ action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch, size_t l4_size = dp_packet_l4_size(packet); if (nh->ip_proto == IPPROTO_UDP && l4_size >= UDP_HEADER_LEN) { - /* New UDP checksum. */ struct udp_header *uh = dp_packet_l4(packet); - if (uh->udp_csum) { + if (dp_packet_hwol_l4_is_udp(packet)) { + dp_packet_ol_reset_l4_csum_good(packet); + } else if (uh->udp_csum) { + /* New UDP checksum. 
*/ uint16_t old_udp_checksum = ~uh->udp_csum; uint32_t udp_checksum = old_udp_checksum + delta_checksum; udp_checksum = csum_finish(udp_checksum); @@ -501,13 +503,17 @@ action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch, } } else if (nh->ip_proto == IPPROTO_TCP && l4_size >= TCP_HEADER_LEN) { - /* New TCP checksum. */ - struct tcp_header *th = dp_packet_l4(packet); - uint16_t old_tcp_checksum = ~th->tcp_csum; - uint32_t tcp_checksum = old_tcp_checksum + delta_checksum; - tcp_checksum = csum_finish(tcp_checksum); - - th->tcp_csum = tcp_checksum; + if (dp_packet_hwol_l4_is_tcp(packet)) { + dp_packet_ol_reset_l4_csum_good(packet); + } else { + /* New TCP checksum. */ + struct tcp_header *th = dp_packet_l4(packet); + uint16_t old_tcp_checksum = ~th->tcp_csum; + uint32_t tcp_checksum = old_tcp_checksum + delta_checksum; + tcp_checksum = csum_finish(tcp_checksum); + + th->tcp_csum = tcp_checksum; + } } pkt_metadata_init_conn(&packet->md); @@ -569,11 +575,22 @@ avx512_ipv6_sum_header(__m512i ip6_header) static inline uint16_t ALWAYS_INLINE __attribute__((__target__("avx512vbmi"))) -avx512_ipv6_addr_csum_delta(__m512i old_header, __m512i new_header) +avx512_ipv6_addr_csum_delta(__m512i v_packet, __m512i v_new_hdr, + bool rh_present) { - uint16_t old_delta = avx512_ipv6_sum_header(old_header); - uint16_t new_delta = avx512_ipv6_sum_header(new_header); - uint32_t csum_delta = ((uint16_t) ~old_delta) + new_delta; + __m512i v_new_hdr_for_cksum = v_new_hdr; + uint32_t csum_delta; + uint16_t old_delta; + uint16_t new_delta; + + if (rh_present) { + v_new_hdr_for_cksum = _mm512_mask_blend_epi64(0x18, v_new_hdr, + v_packet); + } + + old_delta = avx512_ipv6_sum_header(v_packet); + new_delta = avx512_ipv6_sum_header(v_new_hdr_for_cksum); + csum_delta = ((uint16_t) ~old_delta) + new_delta; return ~csum_finish(csum_delta); } @@ -656,25 +673,19 @@ action_avx512_set_ipv6(struct dp_packet_batch *batch, const struct nlattr *a) if (do_csum) { size_t l4_size = 
dp_packet_l4_size(packet); - __m512i v_new_hdr_for_cksum = v_new_hdr; uint16_t delta_checksum; - /* In case of routing header being present, checksum should not be - * updated for the destination address. */ - if (rh_present) { - v_new_hdr_for_cksum = _mm512_mask_blend_epi64(0x18, v_new_hdr, - v_packet); - } - - delta_checksum = avx512_ipv6_addr_csum_delta(v_packet, - v_new_hdr_for_cksum); - if (proto == IPPROTO_UDP && l4_size >= UDP_HEADER_LEN) { struct udp_header *uh = dp_packet_l4(packet); - - if (uh->udp_csum) { + if (dp_packet_hwol_l4_is_udp(packet)) { + dp_packet_ol_reset_l4_csum_good(packet); + } else if (uh->udp_csum) { + delta_checksum = avx512_ipv6_addr_csum_delta(v_packet, + v_new_hdr, + rh_present); uint16_t old_udp_checksum = ~uh->udp_csum; - uint32_t udp_checksum = old_udp_checksum + delta_checksum; + uint32_t udp_checksum = old_udp_checksum + + delta_checksum; udp_checksum = csum_finish(udp_checksum); @@ -684,15 +695,26 @@ action_avx512_set_ipv6(struct dp_packet_batch *batch, const struct nlattr *a) uh->udp_csum = udp_checksum; } - } else if (proto == IPPROTO_TCP && l4_size >= TCP_HEADER_LEN) { - struct tcp_header *th = dp_packet_l4(packet); - uint16_t old_tcp_checksum = ~th->tcp_csum; - uint32_t tcp_checksum = old_tcp_checksum + delta_checksum; - tcp_checksum = csum_finish(tcp_checksum); - th->tcp_csum = tcp_checksum; + } else if (proto == IPPROTO_TCP && l4_size >= TCP_HEADER_LEN) { + if (dp_packet_hwol_l4_is_tcp(packet)) { + dp_packet_ol_reset_l4_csum_good(packet); + } else { + delta_checksum = avx512_ipv6_addr_csum_delta(v_packet, + v_new_hdr, + rh_present); + struct tcp_header *th = dp_packet_l4(packet); + uint16_t old_tcp_checksum = ~th->tcp_csum; + uint32_t tcp_checksum = old_tcp_checksum + delta_checksum; + + tcp_checksum = csum_finish(tcp_checksum); + th->tcp_csum = tcp_checksum; + } } else if (proto == IPPROTO_ICMPV6 && l4_size >= sizeof(struct icmp6_header)) { + delta_checksum = avx512_ipv6_addr_csum_delta(v_packet, + v_new_hdr, + 
rh_present); struct icmp6_header *icmp = dp_packet_l4(packet); uint16_t old_icmp6_checksum = ~icmp->icmp6_cksum; uint32_t icmp6_checksum = old_icmp6_checksum + delta_checksum; diff --git a/lib/packets.c b/lib/packets.c index a4ccc21f823..462b51f92dc 100644 --- a/lib/packets.c +++ b/lib/packets.c @@ -1131,16 +1131,22 @@ packet_set_ipv4_addr(struct dp_packet *packet, pkt_metadata_init_conn(&packet->md); if (nh->ip_proto == IPPROTO_TCP && l4_size >= TCP_HEADER_LEN) { - struct tcp_header *th = dp_packet_l4(packet); - - th->tcp_csum = recalc_csum32(th->tcp_csum, old_addr, new_addr); + if (dp_packet_hwol_l4_is_tcp(packet)) { + dp_packet_ol_reset_l4_csum_good(packet); + } else { + struct tcp_header *th = dp_packet_l4(packet); + th->tcp_csum = recalc_csum32(th->tcp_csum, old_addr, new_addr); + } } else if (nh->ip_proto == IPPROTO_UDP && l4_size >= UDP_HEADER_LEN ) { - struct udp_header *uh = dp_packet_l4(packet); - - if (uh->udp_csum) { - uh->udp_csum = recalc_csum32(uh->udp_csum, old_addr, new_addr); - if (!uh->udp_csum) { - uh->udp_csum = htons(0xffff); + if (dp_packet_hwol_l4_is_udp(packet)) { + dp_packet_ol_reset_l4_csum_good(packet); + } else { + struct udp_header *uh = dp_packet_l4(packet); + if (uh->udp_csum) { + uh->udp_csum = recalc_csum32(uh->udp_csum, old_addr, new_addr); + if (!uh->udp_csum) { + uh->udp_csum = htons(0xffff); + } } } } @@ -1246,16 +1252,24 @@ packet_update_csum128(struct dp_packet *packet, uint8_t proto, size_t l4_size = dp_packet_l4_size(packet); if (proto == IPPROTO_TCP && l4_size >= TCP_HEADER_LEN) { - struct tcp_header *th = dp_packet_l4(packet); + if (dp_packet_hwol_l4_is_tcp(packet)) { + dp_packet_ol_reset_l4_csum_good(packet); + } else { + struct tcp_header *th = dp_packet_l4(packet); - th->tcp_csum = recalc_csum128(th->tcp_csum, addr, new_addr); + th->tcp_csum = recalc_csum128(th->tcp_csum, addr, new_addr); + } } else if (proto == IPPROTO_UDP && l4_size >= UDP_HEADER_LEN) { - struct udp_header *uh = dp_packet_l4(packet); + if 
(dp_packet_hwol_l4_is_udp(packet)) { + dp_packet_ol_reset_l4_csum_good(packet); + } else { + struct udp_header *uh = dp_packet_l4(packet); - if (uh->udp_csum) { - uh->udp_csum = recalc_csum128(uh->udp_csum, addr, new_addr); - if (!uh->udp_csum) { - uh->udp_csum = htons(0xffff); + if (uh->udp_csum) { + uh->udp_csum = recalc_csum128(uh->udp_csum, addr, new_addr); + if (!uh->udp_csum) { + uh->udp_csum = htons(0xffff); + } } } } else if (proto == IPPROTO_ICMPV6 && @@ -1375,7 +1389,9 @@ static void packet_set_port(ovs_be16 *port, ovs_be16 new_port, ovs_be16 *csum) { if (*port != new_port) { - *csum = recalc_csum16(*csum, *port, new_port); + if (csum) { + *csum = recalc_csum16(*csum, *port, new_port); + } *port = new_port; } } @@ -1387,9 +1403,16 @@ void packet_set_tcp_port(struct dp_packet *packet, ovs_be16 src, ovs_be16 dst) { struct tcp_header *th = dp_packet_l4(packet); + ovs_be16 *csum = NULL; + + if (dp_packet_hwol_l4_is_tcp(packet)) { + dp_packet_ol_reset_l4_csum_good(packet); + } else { + csum = &th->tcp_csum; + } - packet_set_port(&th->tcp_src, src, &th->tcp_csum); - packet_set_port(&th->tcp_dst, dst, &th->tcp_csum); + packet_set_port(&th->tcp_src, src, csum); + packet_set_port(&th->tcp_dst, dst, csum); pkt_metadata_init_conn(&packet->md); } @@ -1401,17 +1424,21 @@ packet_set_udp_port(struct dp_packet *packet, ovs_be16 src, ovs_be16 dst) { struct udp_header *uh = dp_packet_l4(packet); - if (uh->udp_csum) { - packet_set_port(&uh->udp_src, src, &uh->udp_csum); - packet_set_port(&uh->udp_dst, dst, &uh->udp_csum); + if (dp_packet_hwol_l4_is_udp(packet)) { + dp_packet_ol_reset_l4_csum_good(packet); + packet_set_port(&uh->udp_src, src, NULL); + packet_set_port(&uh->udp_dst, dst, NULL); + } else { + ovs_be16 *csum = uh->udp_csum ? 
&uh->udp_csum : NULL; + + packet_set_port(&uh->udp_src, src, csum); + packet_set_port(&uh->udp_dst, dst, csum); - if (!uh->udp_csum) { + if (csum && !uh->udp_csum) { uh->udp_csum = htons(0xffff); } - } else { - uh->udp_src = src; - uh->udp_dst = dst; } + pkt_metadata_init_conn(&packet->md); } @@ -1422,18 +1449,27 @@ void packet_set_sctp_port(struct dp_packet *packet, ovs_be16 src, ovs_be16 dst) { struct sctp_header *sh = dp_packet_l4(packet); - ovs_be32 old_csum, old_correct_csum, new_csum; - uint16_t tp_len = dp_packet_l4_size(packet); - old_csum = get_16aligned_be32(&sh->sctp_csum); - put_16aligned_be32(&sh->sctp_csum, 0); - old_correct_csum = crc32c((void *)sh, tp_len); + if (dp_packet_hwol_l4_is_sctp(packet)) { + dp_packet_ol_reset_l4_csum_good(packet); + sh->sctp_src = src; + sh->sctp_dst = dst; + } else { + ovs_be32 old_csum, old_correct_csum, new_csum; + uint16_t tp_len = dp_packet_l4_size(packet); - sh->sctp_src = src; - sh->sctp_dst = dst; + old_csum = get_16aligned_be32(&sh->sctp_csum); + put_16aligned_be32(&sh->sctp_csum, 0); + old_correct_csum = crc32c((void *) sh, tp_len); + + sh->sctp_src = src; + sh->sctp_dst = dst; + + new_csum = crc32c((void *) sh, tp_len); + put_16aligned_be32(&sh->sctp_csum, old_csum ^ old_correct_csum + ^ new_csum); + } - new_csum = crc32c((void *)sh, tp_len); - put_16aligned_be32(&sh->sctp_csum, old_csum ^ old_correct_csum ^ new_csum); pkt_metadata_init_conn(&packet->md); } @@ -1957,3 +1993,72 @@ IP_ECN_set_ce(struct dp_packet *pkt, bool is_ipv6) } } } + +/* Set TCP checksum field in packet 'p' with complete checksum. + * The packet must have the L3 and L4 offsets. 
*/ +void +packet_tcp_complete_csum(struct dp_packet *p) +{ + struct tcp_header *tcp = dp_packet_l4(p); + + tcp->tcp_csum = 0; + if (dp_packet_hwol_is_ipv4(p)) { + struct ip_header *ip = dp_packet_l3(p); + + tcp->tcp_csum = csum_finish(csum_continue(packet_csum_pseudoheader(ip), + tcp, dp_packet_l4_size(p))); + } else if (dp_packet_hwol_tx_ipv6(p)) { + struct ovs_16aligned_ip6_hdr *ip6 = dp_packet_l3(p); + + tcp->tcp_csum = packet_csum_upperlayer6(ip6, tcp, ip6->ip6_nxt, + dp_packet_l4_size(p)); + } else { + OVS_NOT_REACHED(); + } +} + +/* Set UDP checksum field in packet 'p' with complete checksum. + * The packet must have the L3 and L4 offsets. */ +void +packet_udp_complete_csum(struct dp_packet *p) +{ + struct udp_header *udp = dp_packet_l4(p); + + /* Skip csum calculation if the udp_csum is zero. */ + if (!udp->udp_csum) { + return; + } + + udp->udp_csum = 0; + if (dp_packet_hwol_is_ipv4(p)) { + struct ip_header *ip = dp_packet_l3(p); + + udp->udp_csum = csum_finish(csum_continue(packet_csum_pseudoheader(ip), + udp, dp_packet_l4_size(p))); + } else if (dp_packet_hwol_tx_ipv6(p)) { + struct ovs_16aligned_ip6_hdr *ip6 = dp_packet_l3(p); + + udp->udp_csum = packet_csum_upperlayer6(ip6, udp, ip6->ip6_nxt, + dp_packet_l4_size(p)); + } else { + OVS_NOT_REACHED(); + } + + if (!udp->udp_csum) { + udp->udp_csum = htons(0xffff); + } +} + +/* Set SCTP checksum field in packet 'p' with complete checksum. + * The packet must have the L3 and L4 offsets. 
*/ +void +packet_sctp_complete_csum(struct dp_packet *p) +{ + struct sctp_header *sh = dp_packet_l4(p); + uint16_t tp_len = dp_packet_l4_size(p); + ovs_be32 csum; + + put_16aligned_be32(&sh->sctp_csum, 0); + csum = crc32c((void *) sh, tp_len); + put_16aligned_be32(&sh->sctp_csum, csum); +} diff --git a/lib/packets.h b/lib/packets.h index ac4c28e471e..200b25cf012 100644 --- a/lib/packets.h +++ b/lib/packets.h @@ -1671,6 +1671,9 @@ uint32_t packet_csum_pseudoheader(const struct ip_header *); bool packet_rh_present(struct dp_packet *packet, uint8_t *nexthdr, bool *first_frag); void IP_ECN_set_ce(struct dp_packet *pkt, bool is_ipv6); +void packet_tcp_complete_csum(struct dp_packet *); +void packet_udp_complete_csum(struct dp_packet *); +void packet_sctp_complete_csum(struct dp_packet *); #define DNS_HEADER_LEN 12 struct dns_header { From 07f6d6a0cb519840b1a1401dc1c4dd741b5150cd Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Mon, 19 Jun 2023 17:37:05 +0200 Subject: [PATCH 279/833] Add editorconfig file. EditorConfig is a file format and collection of text editor plugins for maintaining consistent coding styles between different editors and IDEs. Initialize the file following the coding rules in Documentation/internals/contributing/coding-style.rst and add exceptions declared in build-aux/initial-tab-allowed-files. Only enforce rules for *.c and *.h files. Other files should use the default indenting rules from text editors. In order for this file to be taken into account (unless they use an editor with built-in EditorConfig support), developers will have to install a plugin. Notes: * All matching rules are considered. The last matching rule's properties will override the previous ones. * The max_line_length property is only supported by a limited number of EditorConfig plugins. It will be ignored if unsupported. 
Link: https://editorconfig.org/ Link: https://github.com/editorconfig/editorconfig-emacs Link: https://github.com/editorconfig/editorconfig-vim Link: https://github.com/editorconfig/editorconfig/wiki/EditorConfig-Properties#max_line_length Signed-off-by: Robin Jarry Signed-off-by: Ilya Maximets --- .editorconfig | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ Makefile.am | 1 + 2 files changed, 49 insertions(+) create mode 100644 .editorconfig diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 00000000000..685c7275005 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,48 @@ +# See https://editorconfig.org/ for syntax reference. + +root = true + +[*] +end_of_line = lf +insert_final_newline = true +trim_trailing_whitespace = true +charset = utf-8 + +[*.{c,h}] +indent_style = space +indent_size = 4 +max_line_length = 79 + +[include/linux/**.h] +indent_style = tab +indent_size = tab +tab_width = 8 + +[include/sparse/rte_*.h] +indent_style = tab +tab_width = 8 + +[include/windows/getopt.h] +indent_style = tab +indent_size = tab +tab_width = 8 + +[include/windows/netinet/{icmp6,ip6}.h] +indent_style = tab +indent_size = tab +tab_width = 8 + +[lib/getopt_long.c] +indent_style = tab +indent_size = tab +tab_width = 8 + +[lib/sflow*.{c,h}] +indent_style = tab +indent_size = tab +tab_width = 8 + +[lib/strsep.c] +indent_style = tab +indent_size = tab +tab_width = 8 diff --git a/Makefile.am b/Makefile.am index df9c33dfe63..db341504d37 100644 --- a/Makefile.am +++ b/Makefile.am @@ -82,6 +82,7 @@ EXTRA_DIST = \ .ci/osx-build.sh \ .ci/osx-prepare.sh \ .cirrus.yml \ + .editorconfig \ .github/workflows/build-and-test.yml \ appveyor.yml \ boot.sh \ From c91867030234284052bf6d50928b390d7889193e Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 21 Jun 2023 14:50:12 +0200 Subject: [PATCH 280/833] MAINTAINERS: Add Eelco Chaudron. Eelco Chaudron was elected by the Open vSwitch committers yesterday. This formalises his status as an Open vSwitch committer. 
Welcome Eelco! Acked-by: Alin Gabriel Serdean Acked-by: Eelco Chaudron Signed-off-by: Simon Horman Signed-off-by: Ilya Maximets --- MAINTAINERS.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS.rst b/MAINTAINERS.rst index 85b8e641658..2fc2517177e 100644 --- a/MAINTAINERS.rst +++ b/MAINTAINERS.rst @@ -45,6 +45,8 @@ This is the current list of active Open vSwitch committers: - aserdean@ovn.org * - Ansis Atteka - ansisatteka@gmail.com + * - Eelco Chaudron + - echaudro@redhat.com * - Ian Stokes - istokes@ovn.org * - Ilya Maximets From 903294cde6e19b3eccefee13875f83b5ada2774c Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Mon, 19 Jun 2023 14:57:39 +0200 Subject: [PATCH 281/833] dpif: Add coverage counters for dpif_operate() failures. Add additional error coverage counters for dpif operation failures. This could help to quickly identify netlink problems when communicating with the OVS kernel module. Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2070630 Reviewed-by: Adrian Moreno Acked-by: Aaron Conole Signed-off-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- lib/dpif.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/lib/dpif.c b/lib/dpif.c index 3305401fe01..b1cbf39c48d 100644 --- a/lib/dpif.c +++ b/lib/dpif.c @@ -55,18 +55,22 @@ VLOG_DEFINE_THIS_MODULE(dpif); COVERAGE_DEFINE(dpif_destroy); -COVERAGE_DEFINE(dpif_port_add); -COVERAGE_DEFINE(dpif_port_del); +COVERAGE_DEFINE(dpif_execute); +COVERAGE_DEFINE(dpif_execute_error); +COVERAGE_DEFINE(dpif_execute_with_help); +COVERAGE_DEFINE(dpif_flow_del); +COVERAGE_DEFINE(dpif_flow_del_error); COVERAGE_DEFINE(dpif_flow_flush); COVERAGE_DEFINE(dpif_flow_get); +COVERAGE_DEFINE(dpif_flow_get_error); COVERAGE_DEFINE(dpif_flow_put); -COVERAGE_DEFINE(dpif_flow_del); -COVERAGE_DEFINE(dpif_execute); -COVERAGE_DEFINE(dpif_purge); -COVERAGE_DEFINE(dpif_execute_with_help); -COVERAGE_DEFINE(dpif_meter_set); -COVERAGE_DEFINE(dpif_meter_get); 
+COVERAGE_DEFINE(dpif_flow_put_error); COVERAGE_DEFINE(dpif_meter_del); +COVERAGE_DEFINE(dpif_meter_get); +COVERAGE_DEFINE(dpif_meter_set); +COVERAGE_DEFINE(dpif_port_add); +COVERAGE_DEFINE(dpif_port_del); +COVERAGE_DEFINE(dpif_purge); static const struct dpif_class *base_dpif_classes[] = { #if defined(__linux__) || defined(_WIN32) @@ -1381,8 +1385,11 @@ dpif_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops, COVERAGE_INC(dpif_flow_put); log_flow_put_message(dpif, &this_module, put, error); - if (error && put->stats) { - memset(put->stats, 0, sizeof *put->stats); + if (error) { + COVERAGE_INC(dpif_flow_put_error); + if (put->stats) { + memset(put->stats, 0, sizeof *put->stats); + } } break; } @@ -1392,10 +1399,10 @@ dpif_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops, COVERAGE_INC(dpif_flow_get); if (error) { + COVERAGE_INC(dpif_flow_get_error); memset(get->flow, 0, sizeof *get->flow); } log_flow_get_message(dpif, &this_module, get, error); - break; } @@ -1404,8 +1411,11 @@ dpif_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops, COVERAGE_INC(dpif_flow_del); log_flow_del_message(dpif, &this_module, del, error); - if (error && del->stats) { - memset(del->stats, 0, sizeof *del->stats); + if (error) { + COVERAGE_INC(dpif_flow_del_error); + if (del->stats) { + memset(del->stats, 0, sizeof *del->stats); + } } break; } @@ -1414,6 +1424,9 @@ dpif_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops, COVERAGE_INC(dpif_execute); log_execute_message(dpif, &this_module, &op->execute, false, error); + if (error) { + COVERAGE_INC(dpif_execute_error); + } break; } } From d56932aac668a999da948670d87a27cc6b1a748c Mon Sep 17 00:00:00 2001 From: Dumitru Ceara Date: Fri, 23 Jun 2023 14:12:33 +0200 Subject: [PATCH 282/833] checkpatch: Ignore yml files when checking line lengths. As far as I can tell they're used mostly for CI job definitions and these tend to result in long lines. 
Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2023-June/405796.html Suggested-by: Aaron Conole Acked-by: Aaron Conole Acked-by: Eelco Chaudron Signed-off-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- utilities/checkpatch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utilities/checkpatch.py b/utilities/checkpatch.py index 0d30b71b5b7..64f0efeb474 100755 --- a/utilities/checkpatch.py +++ b/utilities/checkpatch.py @@ -195,7 +195,7 @@ def reset_counters(): # # Python isn't checked as flake8 performs these checks during build. line_length_ignore_list = re.compile( - r'\.(am|at|etc|in|m4|mk|patch|py)$|^debian/.*$') + r'\.(am|at|etc|in|m4|mk|patch|py|yml)$|^debian/.*$') # Don't enforce a requirement that leading whitespace be all spaces on # files that include these characters in their name, since these kinds From 34ace16cb8295d40cdc2d9a9544612ec1faf3c87 Mon Sep 17 00:00:00 2001 From: Kevin Traynor Date: Tue, 20 Jun 2023 18:20:39 +0100 Subject: [PATCH 283/833] tests: Add macro to common file. get_log_next_line_num() was defined in alb.at. As it may be useful in other test files, move to ofproto-macros.at. 
Suggested-by: David Marchand Signed-off-by: Kevin Traynor Acked-by: Simon Horman Signed-off-by: Ilya Maximets --- tests/alb.at | 4 ---- tests/ofproto-macros.at | 7 +++++++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/alb.at b/tests/alb.at index 922185d61d8..32dc40a1b66 100644 --- a/tests/alb.at +++ b/tests/alb.at @@ -2,10 +2,6 @@ AT_BANNER([PMD Auto Load Balance]) m4_divert_push([PREPARE_TESTS]) -get_log_next_line_num () { - LINENUM=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1)) -} - m4_divert_pop([PREPARE_TESTS]) m4_define([DUMMY_NUMA], [--dummy-numa="0,0"]) diff --git a/tests/ofproto-macros.at b/tests/ofproto-macros.at index 676d55aa956..d2e6ac768ba 100644 --- a/tests/ofproto-macros.at +++ b/tests/ofproto-macros.at @@ -265,6 +265,13 @@ check_logs () { /|EMER|/p" ${logs} } +# Gets the last line number in ovs-vswitchd.log +1. This can be used to +# help ensure that an output in the log is newly written as the result of +# a test command and it is not just matching an earlier log line. +get_log_next_line_num () { + LINENUM=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1)) +} + # add_of_br BRNUM [ARG...] add_of_br () { local brnum=$1; shift From 9b4d2ad8e8cf720e150ea038600cc85b6f79d465 Mon Sep 17 00:00:00 2001 From: Paolo Valerio Date: Fri, 23 Jun 2023 12:33:43 +0200 Subject: [PATCH 284/833] conntrack: Allow to dump userspace conntrack expectations. The patch introduces a new command, ovs-appctl dpctl/dump-conntrack-exp, that allows dumping the existing expectations for the userspace ct.
Signed-off-by: Paolo Valerio Signed-off-by: Ilya Maximets --- NEWS | 2 + lib/conntrack.c | 66 ++++++++++++++++++++++++ lib/conntrack.h | 10 +++- lib/ct-dpif.c | 87 ++++++++++++++++++++++++++++++++ lib/ct-dpif.h | 15 ++++++ lib/dpctl.c | 49 ++++++++++++++++++ lib/dpctl.man | 6 +++ lib/dpif-netdev.c | 50 ++++++++++++++++++ lib/dpif-netlink.c | 3 ++ lib/dpif-provider.h | 11 ++++ tests/system-kmod-macros.at | 9 ++++ tests/system-traffic.at | 44 ++++++++++++++++ tests/system-userspace-macros.at | 6 +++ 13 files changed, 357 insertions(+), 1 deletion(-) diff --git a/NEWS b/NEWS index 66d5a4ea375..16cdb69338b 100644 --- a/NEWS +++ b/NEWS @@ -24,6 +24,8 @@ Post-v3.1.0 * New commands "dpctl/{ct-get-sweep-interval,ct-set-sweep-interval}" that allow to get and set, for the userspace datapath, the sweep interval for the conntrack garbage collector. + * New commands "dpctl/dump-conntrack-exp" that allows to dump + conntrack's expectations for the userspace datapath. - ovs-ctl: * Added new options --[ovsdb-server|ovs-vswitchd]-umask=MODE to set umask value when starting OVS daemons. 
E.g., use --ovsdb-server-umask=0002 diff --git a/lib/conntrack.c b/lib/conntrack.c index f5ebfa05bad..4375c03e2b8 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -2670,6 +2670,72 @@ conntrack_dump_done(struct conntrack_dump *dump OVS_UNUSED) return 0; } +static void +exp_node_to_ct_dpif_exp(const struct alg_exp_node *exp, + struct ct_dpif_exp *entry) +{ + memset(entry, 0, sizeof *entry); + + conn_key_to_tuple(&exp->key, &entry->tuple_orig); + conn_key_to_tuple(&exp->parent_key, &entry->tuple_parent); + entry->zone = exp->key.zone; + entry->mark = exp->parent_mark; + memcpy(&entry->labels, &exp->parent_label, sizeof entry->labels); + entry->protoinfo.proto = exp->key.nw_proto; +} + +int +conntrack_exp_dump_start(struct conntrack *ct, struct conntrack_dump *dump, + const uint16_t *pzone) +{ + memset(dump, 0, sizeof(*dump)); + + if (pzone) { + dump->zone = *pzone; + dump->filter_zone = true; + } + + dump->ct = ct; + + return 0; +} + +int +conntrack_exp_dump_next(struct conntrack_dump *dump, struct ct_dpif_exp *entry) +{ + struct conntrack *ct = dump->ct; + struct alg_exp_node *enode; + int ret = EOF; + + ovs_rwlock_rdlock(&ct->resources_lock); + + for (;;) { + struct hmap_node *node = hmap_at_position(&ct->alg_expectations, + &dump->hmap_pos); + if (!node) { + break; + } + + enode = CONTAINER_OF(node, struct alg_exp_node, node); + + if (!dump->filter_zone || enode->key.zone == dump->zone) { + ret = 0; + exp_node_to_ct_dpif_exp(enode, entry); + break; + } + } + + ovs_rwlock_unlock(&ct->resources_lock); + + return ret; +} + +int +conntrack_exp_dump_done(struct conntrack_dump *dump OVS_UNUSED) +{ + return 0; +} + int conntrack_flush(struct conntrack *ct, const uint16_t *zone) { diff --git a/lib/conntrack.h b/lib/conntrack.h index 524ec0acb32..57d5159b61b 100644 --- a/lib/conntrack.h +++ b/lib/conntrack.h @@ -100,7 +100,10 @@ void conntrack_clear(struct dp_packet *packet); struct conntrack_dump { struct conntrack *ct; unsigned bucket; - struct cmap_position cm_pos; 
+ union { + struct cmap_position cm_pos; + struct hmap_position hmap_pos; + }; bool filter_zone; uint16_t zone; }; @@ -132,6 +135,11 @@ int conntrack_dump_start(struct conntrack *, struct conntrack_dump *, int conntrack_dump_next(struct conntrack_dump *, struct ct_dpif_entry *); int conntrack_dump_done(struct conntrack_dump *); +int conntrack_exp_dump_start(struct conntrack *, struct conntrack_dump *, + const uint16_t *); +int conntrack_exp_dump_next(struct conntrack_dump *, struct ct_dpif_exp *); +int conntrack_exp_dump_done(struct conntrack_dump *); + int conntrack_flush(struct conntrack *, const uint16_t *zone); int conntrack_flush_tuple(struct conntrack *, const struct ct_dpif_tuple *, uint16_t zone); diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c index 0c4b2964ff6..f59c6e560dd 100644 --- a/lib/ct-dpif.c +++ b/lib/ct-dpif.c @@ -101,6 +101,65 @@ ct_dpif_dump_done(struct ct_dpif_dump_state *dump) ? dpif->dpif_class->ct_dump_done(dpif, dump) : EOPNOTSUPP); } + +/* Start dumping the expectations from the connection tracker. + * + * 'dump' must be the address of a pointer to a struct ct_dpif_dump_state, + * which should be passed (unaltered) to ct_exp_dpif_dump_{next,done}(). + * + * If 'zone' is not NULL, it should point to an integer identifying a + * conntrack zone to which the dump will be limited. If it is NULL, + * conntrack entries from all zones will be dumped. + * + * If there has been a problem the function returns a non-zero value + * that represents the error. Otherwise it returns zero. */ +int +ct_exp_dpif_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump, + const uint16_t *zone) +{ + int err; + + err = (dpif->dpif_class->ct_exp_dump_start + ? dpif->dpif_class->ct_exp_dump_start(dpif, dump, zone) + : EOPNOTSUPP); + + if (!err) { + (*dump)->dpif = dpif; + } + + return err; +} + +/* Dump one expectation and put it in 'entry'. + * + * 'dump' should have been initialized by ct_exp_dpif_dump_start().
+ * + * The function returns 0, if an entry has been dumped succesfully. + * Otherwise it returns a non-zero value which can be: + * - EOF: meaning that there are no more entries to dump. + * - an error value. + * In both cases, the user should call ct_exp_dpif_dump_done(). */ +int +ct_exp_dpif_dump_next(struct ct_dpif_dump_state *dump, + struct ct_dpif_exp *entry) +{ + struct dpif *dpif = dump->dpif; + + return (dpif->dpif_class->ct_exp_dump_next + ? dpif->dpif_class->ct_exp_dump_next(dpif, dump, entry) + : EOPNOTSUPP); +} + +/* Free resources used by 'dump', if any. */ +int +ct_exp_dpif_dump_done(struct ct_dpif_dump_state *dump) +{ + struct dpif *dpif = dump->dpif; + + return (dpif->dpif_class->ct_exp_dump_done + ? dpif->dpif_class->ct_exp_dump_done(dpif, dump) + : EOPNOTSUPP); +} /* Flushing. */ @@ -462,6 +521,34 @@ ct_dpif_status_flags(uint32_t flags) } } +void +ct_dpif_format_exp_entry(const struct ct_dpif_exp *entry, struct ds *ds) +{ + ct_dpif_format_ipproto(ds, entry->tuple_orig.ip_proto); + + ds_put_cstr(ds, ",orig=("); + ct_dpif_format_tuple(ds, &entry->tuple_orig); + ds_put_cstr(ds, ")"); + + if (entry->zone) { + ds_put_format(ds, ",zone=%"PRIu16, entry->zone); + } + if (entry->mark) { + ds_put_format(ds, ",mark=%"PRIu32, entry->mark); + } + if (!ovs_u128_is_zero(entry->labels)) { + ovs_be128 value; + + ds_put_cstr(ds, ",labels="); + value = hton128(entry->labels); + ds_put_hex(ds, &value, sizeof value); + } + + ds_put_cstr(ds, ",parent=("); + ct_dpif_format_tuple(ds, &entry->tuple_parent); + ds_put_cstr(ds, ")"); +} + void ct_dpif_format_entry(const struct ct_dpif_entry *entry, struct ds *ds, bool verbose, bool print_stats) diff --git a/lib/ct-dpif.h b/lib/ct-dpif.h index 5579ac9253b..0b728b52986 100644 --- a/lib/ct-dpif.h +++ b/lib/ct-dpif.h @@ -179,6 +179,16 @@ enum ct_dpif_status_flags { #define CT_DPIF_STATUS_MASK ((CT_DPIF_STATUS_UNTRACKED << 1) - 1) +struct ct_dpif_exp { + struct ct_dpif_tuple tuple_orig; + struct ct_dpif_tuple tuple_parent; + 
uint16_t zone; + struct ct_dpif_protoinfo protoinfo; + ovs_u128 labels; + uint32_t status; + uint32_t mark; +}; + struct ct_dpif_entry { /* Const members. */ struct ct_dpif_tuple tuple_orig; @@ -286,6 +296,10 @@ int ct_dpif_dump_start(struct dpif *, struct ct_dpif_dump_state **, const uint16_t *zone, int *); int ct_dpif_dump_next(struct ct_dpif_dump_state *, struct ct_dpif_entry *); int ct_dpif_dump_done(struct ct_dpif_dump_state *); +int ct_exp_dpif_dump_start(struct dpif *, struct ct_dpif_dump_state **, + const uint16_t *zone); +int ct_exp_dpif_dump_next(struct ct_dpif_dump_state *, struct ct_dpif_exp *); +int ct_exp_dpif_dump_done(struct ct_dpif_dump_state *); int ct_dpif_flush(struct dpif *, const uint16_t *zone, const struct ofp_ct_match *); int ct_dpif_set_maxconns(struct dpif *dpif, uint32_t maxconns); @@ -310,6 +324,7 @@ int ct_dpif_ipf_dump_done(struct dpif *dpif, void *); void ct_dpif_entry_uninit(struct ct_dpif_entry *); void ct_dpif_format_entry(const struct ct_dpif_entry *, struct ds *, bool verbose, bool print_stats); +void ct_dpif_format_exp_entry(const struct ct_dpif_exp *, struct ds *); void ct_dpif_format_ipproto(struct ds *ds, uint16_t ipproto); void ct_dpif_format_tuple(struct ds *, const struct ct_dpif_tuple *); uint8_t ct_dpif_coalesce_tcp_state(uint8_t state); diff --git a/lib/dpctl.c b/lib/dpctl.c index 15950bd50c2..4394653ab3a 100644 --- a/lib/dpctl.c +++ b/lib/dpctl.c @@ -1707,6 +1707,53 @@ dpctl_dump_conntrack(int argc, const char *argv[], return error; } +static int +dpctl_dump_conntrack_exp(int argc, const char *argv[], + struct dpctl_params *dpctl_p) +{ + struct ct_dpif_dump_state *dump; + uint16_t zone, *pzone = NULL; + struct ct_dpif_exp cte; + struct dpif *dpif; + int error; + + if (argc > 1 && ovs_scan(argv[argc - 1], "zone=%"SCNu16, &zone)) { + pzone = &zone; + argc--; + } + + error = opt_dpif_open(argc, argv, dpctl_p, 2, &dpif); + if (error) { + return error; + } + + error = ct_exp_dpif_dump_start(dpif, &dump, pzone); + if 
(error) { + dpctl_error(dpctl_p, error, "starting conntrack expectations dump"); + dpif_close(dpif); + return error; + } + + while (!(error = ct_exp_dpif_dump_next(dump, &cte))) { + struct ds s = DS_EMPTY_INITIALIZER; + + ct_dpif_format_exp_entry(&cte, &s); + + dpctl_print(dpctl_p, "%s\n", ds_cstr(&s)); + ds_destroy(&s); + } + if (error == EOF) { + error = 0; + } else if (error) { + dpctl_error(dpctl_p, error, "dumping conntrack expectation"); + } + + ct_exp_dpif_dump_done(dump); + dpif_close(dpif); + + return error; +} + static int dpctl_flush_conntrack(int argc, const char *argv[], struct dpctl_params *dpctl_p) @@ -2951,6 +2998,8 @@ static const struct dpctl_command all_commands[] = { 0, 1, dpctl_offload_stats_show, DP_RO }, { "dump-conntrack", "[-m] [-s] [dp] [zone=N]", 0, 4, dpctl_dump_conntrack, DP_RO }, + { "dump-conntrack-exp", "[dp] [zone=N]", + 0, 2, dpctl_dump_conntrack_exp, DP_RO }, { "flush-conntrack", "[dp] [zone=N] [ct-orig-tuple] [ct-reply-tuple]", 0, 4, dpctl_flush_conntrack, DP_RW }, { "cache-get-size", "[dp]", 0, 1, dpctl_cache_get_size, DP_RO }, diff --git a/lib/dpctl.man b/lib/dpctl.man index d448596d353..66fc50903b0 100644 --- a/lib/dpctl.man +++ b/lib/dpctl.man @@ -302,6 +302,12 @@ are included. With \fB\-\-statistics\fR timeouts and timestamps are added to the output. . .TP +\*(DX\fBdump\-conntrack\-exp\fR [\fIdp\fR] [\fBzone=\fIzone\fR] +Prints to the console all the expectation entries in the tracker used by +\fIdp\fR. If \fBzone=\fIzone\fR is specified, only shows the expectations +in \fIzone\fR. Only supported for userspace datapath. +. +.TP \*(DX\fBflush\-conntrack\fR [\fIdp\fR] [\fBzone=\fIzone\fR] [\fIct-origin-tuple\fR [\fIct-reply-tuple\fR]] Flushes the connection entries in the tracker used by \fIdp\fR based on \fIzone\fR and connection tracking tuple \fIct-origin-tuple\fR. 
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index abe63412ebf..feab15d21cd 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -9267,6 +9267,53 @@ dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED, return err; } +static int +dpif_netdev_ct_exp_dump_start(struct dpif *dpif, + struct ct_dpif_dump_state **dump_, + const uint16_t *pzone) +{ + struct dp_netdev *dp = get_dp_netdev(dpif); + struct dp_netdev_ct_dump *dump; + + dump = xzalloc(sizeof *dump); + dump->dp = dp; + dump->ct = dp->conntrack; + + conntrack_exp_dump_start(dp->conntrack, &dump->dump, pzone); + + *dump_ = &dump->up; + + return 0; +} + +static int +dpif_netdev_ct_exp_dump_next(struct dpif *dpif OVS_UNUSED, + struct ct_dpif_dump_state *dump_, + struct ct_dpif_exp *entry) +{ + struct dp_netdev_ct_dump *dump; + + INIT_CONTAINER(dump, dump_, up); + + return conntrack_exp_dump_next(&dump->dump, entry); +} + +static int +dpif_netdev_ct_exp_dump_done(struct dpif *dpif OVS_UNUSED, + struct ct_dpif_dump_state *dump_) +{ + struct dp_netdev_ct_dump *dump; + int err; + + INIT_CONTAINER(dump, dump_, up); + + err = conntrack_exp_dump_done(&dump->dump); + + free(dump); + + return err; +} + static int dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone, const struct ct_dpif_tuple *tuple) @@ -9679,6 +9726,9 @@ const struct dpif_class dpif_netdev_class = { dpif_netdev_ct_dump_start, dpif_netdev_ct_dump_next, dpif_netdev_ct_dump_done, + dpif_netdev_ct_exp_dump_start, + dpif_netdev_ct_exp_dump_next, + dpif_netdev_ct_exp_dump_done, dpif_netdev_ct_flush, dpif_netdev_ct_set_maxconns, dpif_netdev_ct_get_maxconns, diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c index 60bd39643c7..9194971d379 100644 --- a/lib/dpif-netlink.c +++ b/lib/dpif-netlink.c @@ -4566,6 +4566,9 @@ const struct dpif_class dpif_netlink_class = { dpif_netlink_ct_dump_start, dpif_netlink_ct_dump_next, dpif_netlink_ct_dump_done, + NULL, /* ct_exp_dump_start */ + NULL, /* ct_exp_dump_next */ + NULL, /* ct_exp_dump_done */ 
dpif_netlink_ct_flush, NULL, /* ct_set_maxconns */ NULL, /* ct_get_maxconns */ diff --git a/lib/dpif-provider.h b/lib/dpif-provider.h index a33c6ec3089..1b822cb0754 100644 --- a/lib/dpif-provider.h +++ b/lib/dpif-provider.h @@ -79,6 +79,7 @@ dpif_flow_dump_thread_init(struct dpif_flow_dump_thread *thread, struct ct_dpif_dump_state; struct ct_dpif_entry; +struct ct_dpif_exp; struct ct_dpif_tuple; struct ct_dpif_timeout_policy; enum ct_features; @@ -471,6 +472,16 @@ struct dpif_class { struct ct_dpif_entry *entry); int (*ct_dump_done)(struct dpif *, struct ct_dpif_dump_state *state); + /* Starts the dump initializing the structures involved and the zone + * filter. */ + int (*ct_exp_dump_start)(struct dpif *, struct ct_dpif_dump_state **state, + const uint16_t *zone); + /* Fill the expectation 'entry' with the related information. */ + int (*ct_exp_dump_next)(struct dpif *, struct ct_dpif_dump_state *state, + struct ct_dpif_exp *entry); + /* Ends the dump cleaning up any potential pending state, if any. */ + int (*ct_exp_dump_done)(struct dpif *, struct ct_dpif_dump_state *state); + /* Flushes the connection tracking tables. The arguments have the * following behavior: * diff --git a/tests/system-kmod-macros.at b/tests/system-kmod-macros.at index 712925ded77..81601390ddb 100644 --- a/tests/system-kmod-macros.at +++ b/tests/system-kmod-macros.at @@ -123,6 +123,15 @@ m4_define([CHECK_CONNTRACK_TIMEOUT], on_exit 'modprobe -r nfnetlink_cttimeout' ]) +# CHECK_CONNTRACK_DUMP_EXPECTATIONS() +# +# Perform requirements checks for dumping conntrack expectations. 
+# +m4_define([CHECK_CONNTRACK_DUMP_EXPECTATIONS], +[ + AT_SKIP_IF([:]) +]) + # CHECK_CT_DPIF_SET_GET_MAXCONNS() # # Perform requirements checks for running ovs-dpctl ct-set-maxconns or diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 4c378e1d02b..a05ca311ca8 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -5195,6 +5195,50 @@ tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),reply=(src= OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([conntrack - FTP with expectation dump]) +AT_SKIP_IF([test $HAVE_FTP = no]) +CHECK_CONNTRACK() +CHECK_CONNTRACK_ALG() +CHECK_CONNTRACK_DUMP_EXPECTATIONS() +OVS_TRAFFIC_VSWITCHD_START() + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +AT_DATA([flows.txt], [dnl +table=0,priority=1,action=drop +table=0,priority=10,arp,action=normal +table=0,priority=10,icmp,action=normal +table=0,priority=100,in_port=1,tcp,action=ct(alg=ftp,commit),2 +table=0,priority=100,in_port=2,tcp,action=ct(table=1) +table=1,in_port=2,tcp,ct_state=+trk+est,action=1 +table=1,in_port=2,tcp,ct_state=+trk+rel,action=1 +]) + +AT_CHECK([ovs-ofctl --bundle replace-flows br0 flows.txt]) + +OVS_START_L7([at_ns1], [ftp]) + +dnl FTP requests from p0->p1 should work fine. +NS_CHECK_EXEC([at_ns0], [wget ftp://10.1.1.2 --no-passive-ftp -t 3 -T 1 --retry-connrefused -v -o wget0.log]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0], [dnl +tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),reply=(src=10.1.1.2,dst=10.1.1.1,sport=,dport=),protoinfo=(state=),helper=ftp +]) + +dnl Verify that a dump with zero entries in a zone doesn't return any entry. 
+AT_CHECK([ovs-appctl dpctl/dump-conntrack-exp zone=42], [0], [dnl +]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack-exp | FORMAT_CT(10.1.1.2)], [0], [dnl +tcp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=,dport=),parent=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=) +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([conntrack - FTP over IPv6]) AT_SKIP_IF([test $HAVE_FTP = no]) CHECK_CONNTRACK() diff --git a/tests/system-userspace-macros.at b/tests/system-userspace-macros.at index c1855cbc5b3..73e0e843b9f 100644 --- a/tests/system-userspace-macros.at +++ b/tests/system-userspace-macros.at @@ -112,6 +112,12 @@ m4_define([CHECK_CONNTRACK_ZEROIP_SNAT]) # m4_define([CHECK_CONNTRACK_TIMEOUT]) +# CHECK_CONNTRACK_DUMP_EXPECTATIONS() +# +# Perform requirements checks for dumping conntrack expectations. +# +m4_define([CHECK_CONNTRACK_DUMP_EXPECTATIONS]) + # CHECK_CT_DPIF_SET_GET_MAXCONNS() # # Perform requirements checks for running ovs-dpctl ct-set-maxconns or From 2ece9c9ac1e095427c29a722f71ff3b874d5bab2 Mon Sep 17 00:00:00 2001 From: Han Zhou Date: Sun, 25 Jun 2023 10:05:02 -0700 Subject: [PATCH 285/833] ovsdb: raft: Fix RAFT paper link. Signed-off-by: Han Zhou Signed-off-by: Ilya Maximets --- ovsdb/raft.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ovsdb/raft.h b/ovsdb/raft.h index 403ed3dd732..a5b55d9bf03 100644 --- a/ovsdb/raft.h +++ b/ovsdb/raft.h @@ -26,7 +26,8 @@ * ========== * * Based on Diego Ongaro's Ph.D. thesis, "Consensus: Bridging Theory and - * Practice", available at https://ramcloud.stanford.edu/~ongaro/thesis.pdf. + * Practice", available at + * https://github.com/ongardie/dissertation/blob/master/stanford.pdf. * References to sections, pages, and figures are from this thesis. 
Quotations * in comments also come from this work, in accordance with its license notice, * reproduced below: From c2433bdfc0d25630d66ee7a79503f19316462679 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 22 Jun 2023 00:32:20 +0200 Subject: [PATCH 286/833] dpif-netdev: Lockless meters. Current implementation of meters in the userspace datapath takes the meter lock for every packet batch. If more than one thread hits the flow with the same meter, they will lock each other. Replace the critical section with atomic operations to avoid interlocking. Meters themselves are RCU-protected, so it's safe to access them without holding a lock. Implementation does the following: 1. Tries to advance the 'used' timer of the meter with atomic compare+exchange if it's smaller than 'now'. 2. If the timer change succeeds, atomically update band buckets. 3. Atomically update packet statistics for a meter. 4. Go over buckets and try to atomically subtract the amount of packets or bytes, recording the highest exceeded band. 5. Atomically update band statistics and drop packets. Bucket manipulations are implemented with atomic compare+exchange operations with extra checks, because bucket size should never exceed the maximum and it should never go below zero. Packet statistics may be momentarily inconsistent, i.e., number of packets and the number of bytes may reflect different sets of packets. But it should be eventually consistent. And the difference at any given time should be in just few packets. For the sake of reduced code complexity PKTPS meter tries to push packets through the band one by one, even though they all have the same weight. This is also more fair if more than one thread is passing packets through the same band at the same time. Trying to predict the number of packets that can pass may also cause extra atomic operations reducing the performance. 
This implementation shows similar performance to the previous one, but should scale better with more threads hitting the same meter. Reviewed-by: Simon Horman Tested-by: Lin Huang Tested-by: Zhang YuHuang Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- NEWS | 2 + lib/dpif-netdev.c | 249 +++++++++++++++++++++++++--------------------- 2 files changed, 139 insertions(+), 112 deletions(-) diff --git a/NEWS b/NEWS index 16cdb69338b..0b5dc3db15c 100644 --- a/NEWS +++ b/NEWS @@ -39,6 +39,8 @@ Post-v3.1.0 - SRv6 Tunnel Protocol * Added support for userspace datapath (only). - Userspace datapath: + * Implementation of OpenFlow meters is now lockless allowing for better + multi-thread scalability. * IP and L4 checksum offload support is now enabled by default for interfaces that support it. See the 'status' column in the 'interface' table to check the status. diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index feab15d21cd..ab493f9d478 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -212,21 +212,21 @@ static void dpcls_remove(struct dpcls *, struct dpcls_rule *); struct dp_meter_band { uint32_t rate; uint32_t burst_size; - uint64_t bucket; /* In 1/1000 packets (for PKTPS), or in bits (for KBPS) */ - uint64_t packet_count; - uint64_t byte_count; + atomic_uint64_t bucket; /* In 1/1000 packets for PKTPS, + * or in bits for KBPS. */ + atomic_uint64_t packet_count; + atomic_uint64_t byte_count; }; struct dp_meter { struct cmap_node node; - struct ovs_mutex lock; uint32_t id; uint16_t flags; uint16_t n_bands; uint32_t max_delta_t; - uint64_t used; - uint64_t packet_count; - uint64_t byte_count; + atomic_uint64_t used; /* Time of a last use in milliseconds. 
*/ + atomic_uint64_t packet_count; + atomic_uint64_t byte_count; struct dp_meter_band bands[]; }; @@ -7165,22 +7165,56 @@ dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED, features->max_color = 0; } +/* Tries to atomically add 'n' to 'value' in terms of saturation arithmetic, + * i.e., if the result will be larger than 'max_value', will store 'max_value' + * instead. */ +static void +atomic_sat_add(atomic_uint64_t *value, uint64_t n, uint64_t max_value) +{ + uint64_t current, new_value; + + atomic_read_relaxed(value, &current); + do { + new_value = current + n; + new_value = MIN(new_value, max_value); + } while (!atomic_compare_exchange_weak_relaxed(value, &current, + new_value)); +} + +/* Tries to atomically subtract 'n' from 'value'. Does not perform the + * operation and returns 'false' if the result will be less than 'min_value'. + * Otherwise, stores the result and returns 'true'. */ +static bool +atomic_bound_sub(atomic_uint64_t *value, uint64_t n, uint64_t min_value) +{ + uint64_t current; + + atomic_read_relaxed(value, &current); + do { + if (current < min_value + n) { + return false; + } + } while (!atomic_compare_exchange_weak_relaxed(value, &current, + current - n)); + return true; +} + /* Applies the meter identified by 'meter_id' to 'packets_'. Packets * that exceed a band are dropped in-place. */ static void dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_, - uint32_t meter_id, long long int now) + uint32_t meter_id, long long int now_ms) { - struct dp_meter *meter; - struct dp_meter_band *band; - struct dp_packet *packet; - long long int long_delta_t; /* msec */ - uint32_t delta_t; /* msec */ const size_t cnt = dp_packet_batch_size(packets_); - uint32_t bytes, volume; - int exceeded_band[NETDEV_MAX_BURST]; uint32_t exceeded_rate[NETDEV_MAX_BURST]; - int exceeded_pkt = cnt; /* First packet that exceeded a band rate.
*/ + uint32_t exceeded_band[NETDEV_MAX_BURST]; + uint64_t bytes, volume, meter_used, old; + uint64_t band_packets[MAX_BANDS]; + uint64_t band_bytes[MAX_BANDS]; + struct dp_meter_band *band; + struct dp_packet *packet; + struct dp_meter *meter; + bool exceeded = false; if (meter_id >= MAX_METERS) { return; @@ -7196,116 +7230,101 @@ dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_, /* Initialize as zeroes. */ memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate); - ovs_mutex_lock(&meter->lock); - /* All packets will hit the meter at the same time. */ - long_delta_t = now / 1000 - meter->used / 1000; /* msec */ + atomic_read_relaxed(&meter->used, &meter_used); + do { + if (meter_used >= now_ms) { + /* The '>' condition means that we have several threads hitting the + * same meter, and the other one already advanced the time. */ + meter_used = now_ms; + break; + } + } while (!atomic_compare_exchange_weak_relaxed(&meter->used, + &meter_used, now_ms)); - if (long_delta_t < 0) { - /* This condition means that we have several threads fighting for a - meter lock, and the one who received the packets a bit later wins. - Assuming that all racing threads received packets at the same time - to avoid overflow. */ - long_delta_t = 0; - } + /* Refill all buckets right away, since other threads may use them. */ + if (meter_used < now_ms) { + /* All packets will hit the meter at the same time. */ + uint64_t delta_t = now_ms - meter_used; + + /* Make sure delta_t will not be too large, so that bucket will not + * wrap around below. */ + delta_t = MIN(delta_t, meter->max_delta_t); - /* Make sure delta_t will not be too large, so that bucket will not - * wrap around below. */ - delta_t = (long_delta_t > (long long int)meter->max_delta_t) - ? meter->max_delta_t : (uint32_t)long_delta_t; + for (int m = 0; m < meter->n_bands; m++) { + band = &meter->bands[m]; + /* Update band's bucket. 
We can't just use atomic add here, + * because we should never add above the max capacity. */ + atomic_sat_add(&band->bucket, delta_t * band->rate, + band->burst_size * 1000ULL); + } + } /* Update meter stats. */ - meter->used = now; - meter->packet_count += cnt; + atomic_add_relaxed(&meter->packet_count, cnt, &old); bytes = 0; DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { bytes += dp_packet_size(packet); } - meter->byte_count += bytes; + atomic_add_relaxed(&meter->byte_count, bytes, &old); /* Meters can operate in terms of packets per second or kilobits per * second. */ if (meter->flags & OFPMF13_PKTPS) { - /* Rate in packets/second, bucket 1/1000 packets. */ - /* msec * packets/sec = 1/1000 packets. */ + /* Rate in packets/second, bucket 1/1000 packets. + * msec * packets/sec = 1/1000 packets. */ volume = cnt * 1000; /* Take 'cnt' packets from the bucket. */ } else { - /* Rate in kbps, bucket in bits. */ - /* msec * kbps = bits */ + /* Rate in kbps, bucket in bits. + * msec * kbps = bits */ volume = bytes * 8; } - /* Update all bands and find the one hit with the highest rate for each - * packet (if any). */ - for (int m = 0; m < meter->n_bands; ++m) { - uint64_t max_bucket_size; - + /* Find the band hit with the highest rate for each packet (if any). */ + for (int m = 0; m < meter->n_bands; m++) { band = &meter->bands[m]; - max_bucket_size = band->burst_size * 1000ULL; - /* Update band's bucket. */ - band->bucket += (uint64_t) delta_t * band->rate; - if (band->bucket > max_bucket_size) { - band->bucket = max_bucket_size; - } /* Drain the bucket for all the packets, if possible. */ - if (band->bucket >= volume) { - band->bucket -= volume; - } else { - int band_exceeded_pkt; - - /* Band limit hit, must process packet-by-packet. */ - if (meter->flags & OFPMF13_PKTPS) { - band_exceeded_pkt = band->bucket / 1000; - band->bucket %= 1000; /* Remainder stays in bucket. */ - - /* Update the exceeding band for each exceeding packet. 
- * (Only one band will be fired by a packet, and that - * can be different for each packet.) */ - for (int i = band_exceeded_pkt; i < cnt; i++) { - if (band->rate > exceeded_rate[i]) { - exceeded_rate[i] = band->rate; - exceeded_band[i] = m; - } - } - } else { - /* Packet sizes differ, must process one-by-one. */ - band_exceeded_pkt = cnt; - DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { - uint32_t bits = dp_packet_size(packet) * 8; - - if (band->bucket >= bits) { - band->bucket -= bits; - } else { - if (i < band_exceeded_pkt) { - band_exceeded_pkt = i; - } - /* Update the exceeding band for the exceeding packet. - * (Only one band will be fired by a packet, and that - * can be different for each packet.) */ - if (band->rate > exceeded_rate[i]) { - exceeded_rate[i] = band->rate; - exceeded_band[i] = m; - } - } + if (atomic_bound_sub(&band->bucket, volume, 0)) { + continue; + } + + /* Band limit hit, must process packet-by-packet. */ + DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { + uint64_t packet_volume = (meter->flags & OFPMF13_PKTPS) + ? 1000 : (dp_packet_size(packet) * 8); + + if (!atomic_bound_sub(&band->bucket, packet_volume, 0)) { + /* Update the exceeding band for the exceeding packet. + * Only one band will be fired by a packet, and that can + * be different for each packet. */ + if (band->rate > exceeded_rate[i]) { + exceeded_rate[i] = band->rate; + exceeded_band[i] = m; + exceeded = true; } } - /* Remember the first exceeding packet. */ - if (exceeded_pkt > band_exceeded_pkt) { - exceeded_pkt = band_exceeded_pkt; - } } } + /* No need to iterate over packets if there are no drops. */ + if (!exceeded) { + return; + } + /* Fire the highest rate band exceeded by each packet, and drop * packets if needed. 
*/ + + memset(band_packets, 0, sizeof band_packets); + memset(band_bytes, 0, sizeof band_bytes); + size_t j; DP_PACKET_BATCH_REFILL_FOR_EACH (j, cnt, packet, packets_) { - if (exceeded_band[j] >= 0) { + uint32_t m = exceeded_band[j]; + + if (m != UINT32_MAX) { /* Meter drop packet. */ - band = &meter->bands[exceeded_band[j]]; - band->packet_count += 1; - band->byte_count += dp_packet_size(packet); - COVERAGE_INC(datapath_drop_meter); + band_packets[m]++; + band_bytes[m] += dp_packet_size(packet); dp_packet_delete(packet); } else { /* Meter accepts packet. */ @@ -7313,7 +7332,15 @@ dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_, } } - ovs_mutex_unlock(&meter->lock); + for (int m = 0; m < meter->n_bands; m++) { + if (!band_packets[m]) { + continue; + } + band = &meter->bands[m]; + atomic_add_relaxed(&band->packet_count, band_packets[m], &old); + atomic_add_relaxed(&band->byte_count, band_bytes[m], &old); + COVERAGE_ADD(datapath_drop_meter, band_packets[m]); + } } /* Meter set/get/del processing is still single-threaded. */ @@ -7354,13 +7381,13 @@ dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id, meter->flags = config->flags; meter->n_bands = config->n_bands; meter->max_delta_t = 0; - meter->used = time_usec(); meter->id = mid; - ovs_mutex_init_adaptive(&meter->lock); + atomic_init(&meter->used, time_msec()); /* set up bands */ for (i = 0; i < config->n_bands; ++i) { uint32_t band_max_delta_t; + uint64_t bucket_size; /* Set burst size to a workable value if none specified. */ if (config->bands[i].burst_size == 0) { @@ -7370,11 +7397,11 @@ dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id, meter->bands[i].rate = config->bands[i].rate; meter->bands[i].burst_size = config->bands[i].burst_size; /* Start with a full bucket. 
*/ - meter->bands[i].bucket = meter->bands[i].burst_size * 1000ULL; + bucket_size = meter->bands[i].burst_size * 1000ULL; + atomic_init(&meter->bands[i].bucket, bucket_size); /* Figure out max delta_t that is enough to fill any bucket. */ - band_max_delta_t - = meter->bands[i].bucket / meter->bands[i].rate; + band_max_delta_t = bucket_size / meter->bands[i].rate; if (band_max_delta_t > meter->max_delta_t) { meter->max_delta_t = band_max_delta_t; } @@ -7397,7 +7424,7 @@ dpif_netdev_meter_get(const struct dpif *dpif, { struct dp_netdev *dp = get_dp_netdev(dpif); uint32_t meter_id = meter_id_.uint32; - const struct dp_meter *meter; + struct dp_meter *meter; if (meter_id >= MAX_METERS) { return EFBIG; @@ -7411,17 +7438,15 @@ dpif_netdev_meter_get(const struct dpif *dpif, if (stats) { int i = 0; - ovs_mutex_lock(&meter->lock); - - stats->packet_in_count = meter->packet_count; - stats->byte_in_count = meter->byte_count; + atomic_read_relaxed(&meter->packet_count, &stats->packet_in_count); + atomic_read_relaxed(&meter->byte_count, &stats->byte_in_count); for (i = 0; i < n_bands && i < meter->n_bands; ++i) { - stats->bands[i].packet_count = meter->bands[i].packet_count; - stats->bands[i].byte_count = meter->bands[i].byte_count; + atomic_read_relaxed(&meter->bands[i].packet_count, + &stats->bands[i].packet_count); + atomic_read_relaxed(&meter->bands[i].byte_count, + &stats->bands[i].byte_count); } - - ovs_mutex_unlock(&meter->lock); stats->n_bands = i; } @@ -9173,7 +9198,7 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_, case OVS_ACTION_ATTR_METER: dp_netdev_run_meter(pmd->dp, packets_, nl_attr_get_u32(a), - pmd->ctx.now); + pmd->ctx.now / 1000); break; case OVS_ACTION_ATTR_PUSH_VLAN: From affb9b81834971b2b376340bb2173a5d1c10425f Mon Sep 17 00:00:00 2001 From: Gavin Li Date: Tue, 27 Jun 2023 13:48:06 +0300 Subject: [PATCH 287/833] tc: Pass tunnel entirely to tunnel option parse and put functions. 
Tc flower tunnel key options were encoded in nl_msg_put_flower_tunnel_opts and decoded in nl_parse_flower_tunnel_opts. Only geneve was supported. To avoid adding more arguments to the function to support more vxlan options in the future, change the function arguments to pass tunnel entirely to it instead of continuing to add new arguments. Reviewed-by: Roi Dayan Reviewed-by: Simon Horman Signed-off-by: Gavin Li Signed-off-by: Eelco Chaudron --- lib/tc.c | 15 ++++++++------- lib/tc.h | 34 ++++++++++++++++++---------------- 2 files changed, 26 insertions(+), 23 deletions(-) diff --git a/lib/tc.c b/lib/tc.c index 270dc95ce53..223fe6e5e5e 100644 --- a/lib/tc.c +++ b/lib/tc.c @@ -701,7 +701,7 @@ nl_parse_geneve_key(const struct nlattr *in_nlattr, static int nl_parse_flower_tunnel_opts(struct nlattr *options, - struct tun_metadata *metadata) + struct tc_flower_tunnel *tunnel) { const struct ofpbuf *msg; struct nlattr *nla; @@ -716,7 +716,7 @@ nl_parse_flower_tunnel_opts(struct nlattr *options, uint16_t type = nl_attr_type(nla); switch (type) { case TCA_FLOWER_KEY_ENC_OPTS_GENEVE: - err = nl_parse_geneve_key(nla, metadata); + err = nl_parse_geneve_key(nla, &tunnel->metadata); if (err) { return err; } @@ -828,13 +828,13 @@ nl_parse_flower_tunnel(struct nlattr **attrs, struct tc_flower *flower) if (attrs[TCA_FLOWER_KEY_ENC_OPTS] && attrs[TCA_FLOWER_KEY_ENC_OPTS_MASK]) { err = nl_parse_flower_tunnel_opts(attrs[TCA_FLOWER_KEY_ENC_OPTS], - &flower->key.tunnel.metadata); + &flower->key.tunnel); if (err) { return err; } err = nl_parse_flower_tunnel_opts(attrs[TCA_FLOWER_KEY_ENC_OPTS_MASK], - &flower->mask.tunnel.metadata); + &flower->mask.tunnel); if (err) { return err; } @@ -3446,8 +3446,9 @@ nl_msg_put_masked_value(struct ofpbuf *request, uint16_t type, static void nl_msg_put_flower_tunnel_opts(struct ofpbuf *request, uint16_t type, - struct tun_metadata *metadata) + struct tc_flower_tunnel *tunnel) { + struct tun_metadata *metadata = &tunnel->metadata; struct geneve_opt *opt;
size_t outer, inner; int len, cnt = 0; @@ -3536,9 +3537,9 @@ nl_msg_put_flower_tunnel(struct ofpbuf *request, struct tc_flower *flower) nl_msg_put_be32(request, TCA_FLOWER_KEY_ENC_KEY_ID, id); } nl_msg_put_flower_tunnel_opts(request, TCA_FLOWER_KEY_ENC_OPTS, - &flower->key.tunnel.metadata); + &flower->key.tunnel); nl_msg_put_flower_tunnel_opts(request, TCA_FLOWER_KEY_ENC_OPTS_MASK, - &flower->mask.tunnel.metadata); + &flower->mask.tunnel); } #define FLOWER_PUT_MASKED_VALUE(member, type) \ diff --git a/lib/tc.h b/lib/tc.h index cdd3b4f60ec..b9d449677ed 100644 --- a/lib/tc.h +++ b/lib/tc.h @@ -105,6 +105,23 @@ struct tc_cookie { size_t len; }; +struct tc_flower_tunnel { + struct { + ovs_be32 ipv4_src; + ovs_be32 ipv4_dst; + } ipv4; + struct { + struct in6_addr ipv6_src; + struct in6_addr ipv6_dst; + } ipv6; + uint8_t tos; + uint8_t ttl; + ovs_be16 tp_src; + ovs_be16 tp_dst; + ovs_be64 id; + struct tun_metadata metadata; +}; + struct tc_flower_key { ovs_be16 eth_type; uint8_t ip_proto; @@ -161,22 +178,7 @@ struct tc_flower_key { uint8_t rewrite_tclass; } ipv6; - struct { - struct { - ovs_be32 ipv4_src; - ovs_be32 ipv4_dst; - } ipv4; - struct { - struct in6_addr ipv6_src; - struct in6_addr ipv6_dst; - } ipv6; - uint8_t tos; - uint8_t ttl; - ovs_be16 tp_src; - ovs_be16 tp_dst; - ovs_be64 id; - struct tun_metadata metadata; - } tunnel; + struct tc_flower_tunnel tunnel; }; enum tc_action_type { From 8c3d5488da32f2a10d81e60559c6cfa3762f8f59 Mon Sep 17 00:00:00 2001 From: Gavin Li Date: Tue, 27 Jun 2023 13:48:07 +0300 Subject: [PATCH 288/833] odp-util: Extract vxlan gbp option decoding to a function. Extract vxlan gbp option decoding to odp_decode_gbp_raw to be used in following commits. 
Reviewed-by: Roi Dayan Reviewed-by: Simon Horman Signed-off-by: Gavin Li Signed-off-by: Eelco Chaudron --- lib/odp-util.c | 9 +++------ lib/odp-util.h | 8 ++++++++ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/lib/odp-util.c b/lib/odp-util.c index 2ec889c417e..f62dc86c5f9 100644 --- a/lib/odp-util.c +++ b/lib/odp-util.c @@ -3162,8 +3162,7 @@ odp_tun_key_from_attr__(const struct nlattr *attr, bool is_mask, if (ext[OVS_VXLAN_EXT_GBP]) { uint32_t gbp = nl_attr_get_u32(ext[OVS_VXLAN_EXT_GBP]); - tun->gbp_id = htons(gbp & 0xFFFF); - tun->gbp_flags = (gbp >> 16) & 0xFF; + odp_decode_gbp_raw(gbp, &tun->gbp_id, &tun->gbp_flags); } break; @@ -3753,12 +3752,10 @@ format_odp_tun_vxlan_opt(const struct nlattr *attr, ovs_be16 id, id_mask; uint8_t flags, flags_mask = 0; - id = htons(key & 0xFFFF); - flags = (key >> 16) & 0xFF; + odp_decode_gbp_raw(key, &id, &flags); if (ma) { uint32_t mask = nl_attr_get_u32(ma); - id_mask = htons(mask & 0xFFFF); - flags_mask = (mask >> 16) & 0xFF; + odp_decode_gbp_raw(mask, &id_mask, &flags_mask); } ds_put_cstr(ds, "gbp("); diff --git a/lib/odp-util.h b/lib/odp-util.h index a1d0d0fba5d..cf762bdc354 100644 --- a/lib/odp-util.h +++ b/lib/odp-util.h @@ -374,6 +374,14 @@ void odp_put_push_eth_action(struct ofpbuf *odp_actions, const struct eth_addr *eth_src, const struct eth_addr *eth_dst); +static inline void odp_decode_gbp_raw(uint32_t gbp_raw, + ovs_be16 *id, + uint8_t *flags) +{ + *id = htons(gbp_raw & 0xFFFF); + *flags = (gbp_raw >> 16) & 0xFF; +} + struct attr_len_tbl { int len; const struct attr_len_tbl *next; From 31baa7781e46604a926c6b6ae77a3d164113a38a Mon Sep 17 00:00:00 2001 From: Gavin Li Date: Tue, 27 Jun 2023 13:48:08 +0300 Subject: [PATCH 289/833] odp-util: Extract vxlan gbp option encoding to a function. Extract vxlan gbp option encoding to odp_encode_gbp_raw to be used in following commits. 
Reviewed-by: Roi Dayan Reviewed-by: Simon Horman Signed-off-by: Gavin Li Signed-off-by: Eelco Chaudron --- lib/odp-util.c | 5 +++-- lib/odp-util.h | 5 +++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/lib/odp-util.c b/lib/odp-util.c index f62dc86c5f9..d2414eb559b 100644 --- a/lib/odp-util.c +++ b/lib/odp-util.c @@ -3278,10 +3278,11 @@ tun_key_to_attr(struct ofpbuf *a, const struct flow_tnl *tun_key, if ((!tnl_type || !strcmp(tnl_type, "vxlan")) && (tun_key->gbp_flags || tun_key->gbp_id)) { size_t vxlan_opts_ofs; + uint32_t gbp_raw; vxlan_opts_ofs = nl_msg_start_nested(a, OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS); - nl_msg_put_u32(a, OVS_VXLAN_EXT_GBP, - (tun_key->gbp_flags << 16) | ntohs(tun_key->gbp_id)); + gbp_raw = odp_encode_gbp_raw(tun_key->gbp_flags, tun_key->gbp_id); + nl_msg_put_u32(a, OVS_VXLAN_EXT_GBP, gbp_raw); nl_msg_end_nested(a, vxlan_opts_ofs); } diff --git a/lib/odp-util.h b/lib/odp-util.h index cf762bdc354..163efe7a87b 100644 --- a/lib/odp-util.h +++ b/lib/odp-util.h @@ -382,6 +382,11 @@ static inline void odp_decode_gbp_raw(uint32_t gbp_raw, *flags = (gbp_raw >> 16) & 0xFF; } +static inline uint32_t odp_encode_gbp_raw(uint8_t flags, ovs_be16 id) +{ + return (flags << 16) | ntohs(id); +} + struct attr_len_tbl { int len; const struct attr_len_tbl *next; From c39d7d06f5aca8f6f43334a77ae77e4a6509cb88 Mon Sep 17 00:00:00 2001 From: Gavin Li Date: Tue, 27 Jun 2023 13:48:09 +0300 Subject: [PATCH 290/833] netlink: Add new function to add NLA_F_NESTED to nested netlink messages. Linux kernel netlink module added NLA_F_NESTED flag checking for nested netlink messages in 5.2. A nested message without the flag set will be treated as a malformed one. The check is optional and is controlled by message policy. To avoid this, add NLA_F_NESTED explicitly for all nested netlink messages with a new function nl_msg_start_nested_with_flag().
Reviewed-by: Roi Dayan Reviewed-by: Simon Horman Signed-off-by: Gavin Li Signed-off-by: Eelco Chaudron --- lib/netlink.c | 9 +++++++++ lib/netlink.h | 1 + 2 files changed, 10 insertions(+) diff --git a/lib/netlink.c b/lib/netlink.c index 6215282d6fb..1e8d5a8ec57 100644 --- a/lib/netlink.c +++ b/lib/netlink.c @@ -523,6 +523,15 @@ nl_msg_start_nested(struct ofpbuf *msg, uint16_t type) return offset; } +/* Adds the header for nested Netlink attributes to 'msg', with the specified + * 'type', and returns the header's offset within 'msg'. It's similar to + * nl_msg_start_nested() and uses NLA_F_NESTED flag mandatorily. */ +size_t +nl_msg_start_nested_with_flag(struct ofpbuf *msg, uint16_t type) +{ + return nl_msg_start_nested(msg, type | NLA_F_NESTED); +} + /* Finalizes a nested Netlink attribute in 'msg'. 'offset' should be the value * returned by nl_msg_start_nested(). */ void diff --git a/lib/netlink.h b/lib/netlink.h index e9050c31bac..008604aa60d 100644 --- a/lib/netlink.h +++ b/lib/netlink.h @@ -81,6 +81,7 @@ void nl_msg_put_string__(struct ofpbuf *, uint16_t type, const char *value, void nl_msg_put_string(struct ofpbuf *, uint16_t type, const char *value); size_t nl_msg_start_nested(struct ofpbuf *, uint16_t type); +size_t nl_msg_start_nested_with_flag(struct ofpbuf *, uint16_t type); void nl_msg_end_nested(struct ofpbuf *, size_t offset); void nl_msg_cancel_nested(struct ofpbuf *, size_t offset); bool nl_msg_end_non_empty_nested(struct ofpbuf *, size_t offset); From a4332b5e68f61bfbddae9dfb194f6815ea90385a Mon Sep 17 00:00:00 2001 From: Gavin Li Date: Tue, 27 Jun 2023 13:48:10 +0300 Subject: [PATCH 291/833] tc: Add vxlan gbp option flower match offload. Add TC offload support for filtering vxlan tunnels with gbp option. 
Reviewed-by: Gavi Teitz Reviewed-by: Roi Dayan Reviewed-by: Simon Horman Signed-off-by: Gavin Li Signed-off-by: Eelco Chaudron --- include/linux/pkt_cls.h | 13 ++++++ lib/netdev-offload-tc.c | 17 ++++++++ lib/tc.c | 92 +++++++++++++++++++++++++++++++++++------ lib/tc.h | 7 ++++ 4 files changed, 117 insertions(+), 12 deletions(-) diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h index a8cd8db5bf8..fb4a7ecea4c 100644 --- a/include/linux/pkt_cls.h +++ b/include/linux/pkt_cls.h @@ -273,6 +273,10 @@ enum { * TCA_TUNNEL_KEY_ENC_OPTS_GENEVE * attributes */ + TCA_FLOWER_KEY_ENC_OPTS_VXLAN, /* Nested + * TCA_TUNNEL_KEY_ENC_OPTS_VXLAN + * attributes + */ __TCA_FLOWER_KEY_ENC_OPTS_MAX, }; @@ -290,6 +294,15 @@ enum { #define TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX \ (__TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX - 1) +enum { + TCA_FLOWER_KEY_ENC_OPT_VXLAN_UNSPEC, + TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP, /* u32 */ + __TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX \ + (__TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX - 1) + enum { TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0), TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1), diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c index 4f26dd8cca5..1c97681bc92 100644 --- a/lib/netdev-offload-tc.c +++ b/lib/netdev-offload-tc.c @@ -1234,6 +1234,15 @@ parse_tc_flower_to_match(const struct netdev *netdev, match_set_tun_tp_dst_masked(match, flower->key.tunnel.tp_dst, flower->mask.tunnel.tp_dst); } + if (flower->mask.tunnel.gbp.id) { + match_set_tun_gbp_id_masked(match, flower->key.tunnel.gbp.id, + flower->mask.tunnel.gbp.id); + } + if (flower->mask.tunnel.gbp.flags) { + match_set_tun_gbp_flags_masked(match, + flower->key.tunnel.gbp.flags, + flower->mask.tunnel.gbp.flags); + } if (!strcmp(netdev_get_type(netdev), "geneve")) { flower_tun_opt_to_match(match, flower); @@ -2193,6 +2202,9 @@ netdev_tc_flow_put(struct netdev *netdev, struct match *match, flower.key.tunnel.ttl = tnl->ip_ttl; flower.key.tunnel.tp_src = 
tnl->tp_src; flower.key.tunnel.tp_dst = tnl->tp_dst; + flower.key.tunnel.gbp.id = tnl->gbp_id; + flower.key.tunnel.gbp.flags = tnl->gbp_flags; + flower.key.tunnel.gbp.id_present = !!tnl_mask->gbp_id; flower.mask.tunnel.ipv4.ipv4_src = tnl_mask->ip_src; flower.mask.tunnel.ipv4.ipv4_dst = tnl_mask->ip_dst; @@ -2207,6 +2219,9 @@ netdev_tc_flow_put(struct netdev *netdev, struct match *match, * Degrading the flow down to exact match for now as a workaround. */ flower.mask.tunnel.tp_dst = OVS_BE16_MAX; flower.mask.tunnel.id = (tnl->flags & FLOW_TNL_F_KEY) ? tnl_mask->tun_id : 0; + flower.mask.tunnel.gbp.id = tnl_mask->gbp_id; + flower.mask.tunnel.gbp.flags = tnl_mask->gbp_flags; + flower.mask.tunnel.gbp.id_present = !!tnl_mask->gbp_id; memset(&tnl_mask->ip_src, 0, sizeof tnl_mask->ip_src); memset(&tnl_mask->ip_dst, 0, sizeof tnl_mask->ip_dst); @@ -2218,6 +2233,8 @@ netdev_tc_flow_put(struct netdev *netdev, struct match *match, memset(&tnl_mask->tp_dst, 0, sizeof tnl_mask->tp_dst); memset(&tnl_mask->tun_id, 0, sizeof tnl_mask->tun_id); + memset(&tnl_mask->gbp_id, 0, sizeof tnl_mask->gbp_id); + memset(&tnl_mask->gbp_flags, 0, sizeof tnl_mask->gbp_flags); tnl_mask->flags &= ~FLOW_TNL_F_KEY; /* XXX: This is wrong! 
We're ignoring DF and CSUM flags configuration diff --git a/lib/tc.c b/lib/tc.c index 223fe6e5e5e..ae1ca57c9d2 100644 --- a/lib/tc.c +++ b/lib/tc.c @@ -39,6 +39,7 @@ #include "coverage.h" #include "netlink-socket.h" #include "netlink.h" +#include "odp-util.h" #include "openvswitch/ofpbuf.h" #include "openvswitch/util.h" #include "openvswitch/vlog.h" @@ -699,6 +700,38 @@ nl_parse_geneve_key(const struct nlattr *in_nlattr, return 0; } +static int +nl_parse_vxlan_key(const struct nlattr *in_nlattr, + struct tc_flower_tunnel *tunnel) +{ + const struct ofpbuf *msg; + struct nlattr *nla; + struct ofpbuf buf; + uint32_t gbp_raw; + size_t left; + + nl_attr_get_nested(in_nlattr, &buf); + msg = &buf; + + NL_ATTR_FOR_EACH (nla, left, ofpbuf_at(msg, 0, 0), msg->size) { + uint16_t type = nl_attr_type(nla); + + switch (type) { + case TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP: + gbp_raw = nl_attr_get_u32(nla); + odp_decode_gbp_raw(gbp_raw, &tunnel->gbp.id, + &tunnel->gbp.flags); + tunnel->gbp.id_present = true; + break; + default: + VLOG_WARN_RL(&error_rl, "failed to parse vxlan tun options"); + return EINVAL; + } + } + + return 0; +} + static int nl_parse_flower_tunnel_opts(struct nlattr *options, struct tc_flower_tunnel *tunnel) @@ -721,6 +754,13 @@ nl_parse_flower_tunnel_opts(struct nlattr *options, return err; } + break; + case TCA_FLOWER_KEY_ENC_OPTS_VXLAN: + err = nl_parse_vxlan_key(nla, tunnel); + if (err) { + return err; + } + break; } } @@ -3445,23 +3485,18 @@ nl_msg_put_masked_value(struct ofpbuf *request, uint16_t type, } static void -nl_msg_put_flower_tunnel_opts(struct ofpbuf *request, uint16_t type, - struct tc_flower_tunnel *tunnel) +nl_msg_put_flower_geneve(struct ofpbuf *request, + const struct tc_flower_tunnel *tunnel) { - struct tun_metadata *metadata = &tunnel->metadata; - struct geneve_opt *opt; - size_t outer, inner; + const struct tun_metadata *metadata = &tunnel->metadata; + const struct geneve_opt *opt; int len, cnt = 0; + size_t offset; len = 
metadata->present.len; - if (!len) { - return; - } - - outer = nl_msg_start_nested(request, type); while (len) { opt = &metadata->opts.gnv[cnt]; - inner = nl_msg_start_nested(request, TCA_FLOWER_KEY_ENC_OPTS_GENEVE); + offset = nl_msg_start_nested(request, TCA_FLOWER_KEY_ENC_OPTS_GENEVE); nl_msg_put_be16(request, TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS, opt->opt_class); @@ -3472,8 +3507,41 @@ nl_msg_put_flower_tunnel_opts(struct ofpbuf *request, uint16_t type, cnt += sizeof(struct geneve_opt) / 4 + opt->length; len -= sizeof(struct geneve_opt) + opt->length * 4; - nl_msg_end_nested(request, inner); + nl_msg_end_nested(request, offset); } +} + +static void +nl_msg_put_flower_vxlan_tun_opts(struct ofpbuf *request, + const struct tc_flower_tunnel *tunnel) +{ + uint32_t gbp_raw; + size_t offset; + + if (!tunnel->gbp.id_present) { + return; + } + + gbp_raw = odp_encode_gbp_raw(tunnel->gbp.flags, tunnel->gbp.id); + offset = nl_msg_start_nested_with_flag(request, + TCA_FLOWER_KEY_ENC_OPTS_VXLAN); + nl_msg_put_u32(request, TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP, gbp_raw); + nl_msg_end_nested(request, offset); +} + +static void +nl_msg_put_flower_tunnel_opts(struct ofpbuf *request, uint16_t type, + struct tc_flower_tunnel *tunnel) +{ + size_t outer; + + if (!tunnel->metadata.present.len && !tunnel->gbp.id_present) { + return; + } + + outer = nl_msg_start_nested(request, type); + nl_msg_put_flower_geneve(request, tunnel); + nl_msg_put_flower_vxlan_tun_opts(request, tunnel); nl_msg_end_nested(request, outer); } diff --git a/lib/tc.h b/lib/tc.h index b9d449677ed..95fff37b9b6 100644 --- a/lib/tc.h +++ b/lib/tc.h @@ -105,6 +105,12 @@ struct tc_cookie { size_t len; }; +struct tc_tunnel_gbp { + ovs_be16 id; + uint8_t flags; + bool id_present; +}; + struct tc_flower_tunnel { struct { ovs_be32 ipv4_src; @@ -118,6 +124,7 @@ struct tc_flower_tunnel { uint8_t ttl; ovs_be16 tp_src; ovs_be16 tp_dst; + struct tc_tunnel_gbp gbp; ovs_be64 id; struct tun_metadata metadata; }; From 
256c1e5819e9f43e414a8cddcca7ad674790e3bc Mon Sep 17 00:00:00 2001 From: Gavin Li Date: Tue, 27 Jun 2023 13:48:11 +0300 Subject: [PATCH 292/833] tc: Pass encap entirely to nl_msg_put_act_tunnel_key_set. Most of the data members of struct tc_action{ } are defined as anonymous struct in place. Instead of passing all members of an anonymous struct, which is not flexible to new members being added, expose encap as named struct and pass it entirely. Reviewed-by: Roi Dayan Reviewed-by: Simon Horman Signed-off-by: Gavin Li Signed-off-by: Eelco Chaudron --- lib/tc.c | 57 +++++++++++++++++++++++--------------------------------- lib/tc.h | 38 +++++++++++++++++++------------------ 2 files changed, 43 insertions(+), 52 deletions(-) diff --git a/lib/tc.c b/lib/tc.c index ae1ca57c9d2..7434b0150f7 100644 --- a/lib/tc.c +++ b/lib/tc.c @@ -2641,13 +2641,9 @@ nl_msg_put_act_tunnel_geneve_option(struct ofpbuf *request, } static void -nl_msg_put_act_tunnel_key_set(struct ofpbuf *request, bool id_present, - ovs_be64 id, ovs_be32 ipv4_src, - ovs_be32 ipv4_dst, struct in6_addr *ipv6_src, - struct in6_addr *ipv6_dst, - ovs_be16 tp_dst, uint8_t tos, uint8_t ttl, - struct tun_metadata *tun_metadata, - uint8_t no_csum, uint32_t action_pc) +nl_msg_put_act_tunnel_key_set(struct ofpbuf *request, + struct tc_action_encap *encap, + uint32_t action_pc) { size_t offset; @@ -2659,30 +2655,33 @@ nl_msg_put_act_tunnel_key_set(struct ofpbuf *request, bool id_present, nl_msg_put_unspec(request, TCA_TUNNEL_KEY_PARMS, &tun, sizeof tun); - ovs_be32 id32 = be64_to_be32(id); - if (id_present) { + ovs_be32 id32 = be64_to_be32(encap->id); + if (encap->id_present) { nl_msg_put_be32(request, TCA_TUNNEL_KEY_ENC_KEY_ID, id32); } - if (ipv4_dst) { - nl_msg_put_be32(request, TCA_TUNNEL_KEY_ENC_IPV4_SRC, ipv4_src); - nl_msg_put_be32(request, TCA_TUNNEL_KEY_ENC_IPV4_DST, ipv4_dst); - } else if (ipv6_addr_is_set(ipv6_dst)) { + if (encap->ipv4.ipv4_dst) { + nl_msg_put_be32(request, TCA_TUNNEL_KEY_ENC_IPV4_SRC, + 
encap->ipv4.ipv4_src); + nl_msg_put_be32(request, TCA_TUNNEL_KEY_ENC_IPV4_DST, + encap->ipv4.ipv4_dst); + } else if (ipv6_addr_is_set(&encap->ipv6.ipv6_dst)) { nl_msg_put_in6_addr(request, TCA_TUNNEL_KEY_ENC_IPV6_DST, - ipv6_dst); + &encap->ipv6.ipv6_dst); nl_msg_put_in6_addr(request, TCA_TUNNEL_KEY_ENC_IPV6_SRC, - ipv6_src); + &encap->ipv6.ipv6_src); } - if (tos) { - nl_msg_put_u8(request, TCA_TUNNEL_KEY_ENC_TOS, tos); + if (encap->tos) { + nl_msg_put_u8(request, TCA_TUNNEL_KEY_ENC_TOS, encap->tos); } - if (ttl) { - nl_msg_put_u8(request, TCA_TUNNEL_KEY_ENC_TTL, ttl); + if (encap->ttl) { + nl_msg_put_u8(request, TCA_TUNNEL_KEY_ENC_TTL, encap->ttl); } - if (tp_dst) { - nl_msg_put_be16(request, TCA_TUNNEL_KEY_ENC_DST_PORT, tp_dst); + if (encap->tp_dst) { + nl_msg_put_be16(request, TCA_TUNNEL_KEY_ENC_DST_PORT, + encap->tp_dst); } - nl_msg_put_act_tunnel_geneve_option(request, tun_metadata); - nl_msg_put_u8(request, TCA_TUNNEL_KEY_NO_CSUM, no_csum); + nl_msg_put_act_tunnel_geneve_option(request, &encap->data); + nl_msg_put_u8(request, TCA_TUNNEL_KEY_NO_CSUM, encap->no_csum); } nl_msg_end_nested(request, offset); } @@ -3305,17 +3304,7 @@ nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower) } act_offset = nl_msg_start_nested(request, act_index++); - nl_msg_put_act_tunnel_key_set(request, action->encap.id_present, - action->encap.id, - action->encap.ipv4.ipv4_src, - action->encap.ipv4.ipv4_dst, - &action->encap.ipv6.ipv6_src, - &action->encap.ipv6.ipv6_dst, - action->encap.tp_dst, - action->encap.tos, - action->encap.ttl, - &action->encap.data, - action->encap.no_csum, + nl_msg_put_act_tunnel_key_set(request, &action->encap, action_pc); nl_msg_put_act_flags(request); nl_msg_end_nested(request, act_offset); diff --git a/lib/tc.h b/lib/tc.h index 95fff37b9b6..1d648282a00 100644 --- a/lib/tc.h +++ b/lib/tc.h @@ -210,6 +210,25 @@ enum nat_type { TC_NAT_RESTORE, }; +struct tc_action_encap { + bool id_present; + ovs_be64 id; + ovs_be16 tp_src; + ovs_be16 
tp_dst; + uint8_t tos; + uint8_t ttl; + uint8_t no_csum; + struct { + ovs_be32 ipv4_src; + ovs_be32 ipv4_dst; + } ipv4; + struct { + struct in6_addr ipv6_src; + struct in6_addr ipv6_dst; + } ipv6; + struct tun_metadata data; +}; + struct tc_action { union { int chain; @@ -233,24 +252,7 @@ struct tc_action { uint8_t bos; } mpls; - struct { - bool id_present; - ovs_be64 id; - ovs_be16 tp_src; - ovs_be16 tp_dst; - uint8_t tos; - uint8_t ttl; - uint8_t no_csum; - struct { - ovs_be32 ipv4_src; - ovs_be32 ipv4_dst; - } ipv4; - struct { - struct in6_addr ipv6_src; - struct in6_addr ipv6_dst; - } ipv6; - struct tun_metadata data; - } encap; + struct tc_action_encap encap; struct { uint16_t zone; From a2a3f1983f3f3b82ad72df3764deead2f4413ffd Mon Sep 17 00:00:00 2001 From: Gavin Li Date: Tue, 27 Jun 2023 13:48:12 +0300 Subject: [PATCH 293/833] tc: Add vxlan encap action with gbp option offload. Add TC offload support for vxlan encap with gbp option. Reviewed-by: Gavi Teitz Reviewed-by: Roi Dayan Reviewed-by: Simon Horman Signed-off-by: Gavin Li Signed-off-by: Eelco Chaudron --- NEWS | 2 + acinclude.m4 | 7 ++++ include/linux/tc_act/tc_tunnel_key.h | 17 +++++++- lib/netdev-offload-tc.c | 30 +++++++++++++- lib/odp-util.c | 42 +++++++++++++------- lib/odp-util.h | 3 ++ lib/tc.c | 58 +++++++++++++++++++++++++++- lib/tc.h | 1 + 8 files changed, 143 insertions(+), 17 deletions(-) diff --git a/NEWS b/NEWS index 0b5dc3db15c..6a990c92151 100644 --- a/NEWS +++ b/NEWS @@ -44,6 +44,8 @@ Post-v3.1.0 * IP and L4 checksum offload support is now enabled by default for interfaces that support it. See the 'status' column in the 'interface' table to check the status. + - Linux TC offload: + * Add support for offloading VXLAN tunnels with the GBP extensions. 
v3.1.0 - 16 Feb 2023 diff --git a/acinclude.m4 b/acinclude.m4 index ac1eab79004..690a13c2596 100644 --- a/acinclude.m4 +++ b/acinclude.m4 @@ -191,6 +191,13 @@ AC_DEFUN([OVS_CHECK_LINUX_TC], [ [AC_DEFINE([HAVE_TCA_TUNNEL_KEY_ENC_TTL], [1], [Define to 1 if TCA_TUNNEL_KEY_ENC_TTL is available.])]) + AC_COMPILE_IFELSE([ + AC_LANG_PROGRAM([#include ], [ + int x = TCA_TUNNEL_KEY_ENC_OPTS_VXLAN; + ])], + [AC_DEFINE([HAVE_TCA_TUNNEL_KEY_ENC_OPTS_VXLAN], [1], + [Define to 1 if TCA_TUNNEL_KEY_ENC_OPTS_VXLAN is available.])]) + AC_COMPILE_IFELSE([ AC_LANG_PROGRAM([#include ], [ int x = TCA_PEDIT_KEY_EX_HDR_TYPE_UDP; diff --git a/include/linux/tc_act/tc_tunnel_key.h b/include/linux/tc_act/tc_tunnel_key.h index f13acf17dd7..17291b90bf3 100644 --- a/include/linux/tc_act/tc_tunnel_key.h +++ b/include/linux/tc_act/tc_tunnel_key.h @@ -1,7 +1,7 @@ #ifndef __LINUX_TC_ACT_TC_TUNNEL_KEY_WRAPPER_H #define __LINUX_TC_ACT_TC_TUNNEL_KEY_WRAPPER_H 1 -#if defined(__KERNEL__) || defined(HAVE_TCA_TUNNEL_KEY_ENC_TTL) +#if defined(__KERNEL__) || defined(HAVE_TCA_TUNNEL_KEY_ENC_OPTS_VXLAN) #include_next #else @@ -53,6 +53,10 @@ enum { * TCA_TUNNEL_KEY_ENC_OPTS_GENEVE * attributes */ + TCA_TUNNEL_KEY_ENC_OPTS_VXLAN, /* Nested + * TCA_TUNNEL_KEY_ENC_OPTS_VXLAN + * attributes + */ __TCA_TUNNEL_KEY_ENC_OPTS_MAX, }; @@ -70,6 +74,15 @@ enum { #define TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX \ (__TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX - 1) -#endif /* __KERNEL__ || HAVE_TCA_TUNNEL_KEY_ENC_TTL */ +enum { + TCA_TUNNEL_KEY_ENC_OPT_VXLAN_UNSPEC, + TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP, /* u32 */ + __TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX, +}; + +#define TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX \ + (__TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX - 1) + +#endif /* __KERNEL__ || HAVE_TCA_TUNNEL_KEY_ENC_OPTS_VXLAN */ #endif /* __LINUX_TC_ACT_TC_TUNNEL_KEY_WRAPPER_H */ diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c index 1c97681bc92..c43eacd4d94 100644 --- a/lib/netdev-offload-tc.c +++ b/lib/netdev-offload-tc.c @@ -668,6 +668,23 @@ static 
void parse_tc_flower_geneve_opts(struct tc_action *action, nl_msg_end_nested(buf, geneve_off); } +static void +parse_tc_flower_vxlan_tun_opts(struct tc_action *action, struct ofpbuf *buf) +{ + size_t gbp_off; + uint32_t gbp_raw; + + if (!action->encap.gbp.id_present) { + return; + } + + gbp_off = nl_msg_start_nested(buf, OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS); + gbp_raw = odp_encode_gbp_raw(action->encap.gbp.flags, + action->encap.gbp.id); + nl_msg_put_u32(buf, OVS_VXLAN_EXT_GBP, gbp_raw); + nl_msg_end_nested(buf, gbp_off); +} + static void flower_tun_opt_to_match(struct match *match, struct tc_flower *flower) { @@ -863,7 +880,7 @@ parse_tc_flower_to_actions__(struct tc_flower *flower, struct ofpbuf *buf, if (!action->encap.no_csum) { nl_msg_put_flag(buf, OVS_TUNNEL_KEY_ATTR_CSUM); } - + parse_tc_flower_vxlan_tun_opts(action, buf); parse_tc_flower_geneve_opts(action, buf); nl_msg_end_nested(buf, tunnel_offset); nl_msg_end_nested(buf, set_offset); @@ -1552,6 +1569,7 @@ parse_put_flow_set_action(struct tc_flower *flower, struct tc_action *action, action->type = TC_ACT_ENCAP; action->encap.id_present = false; + action->encap.gbp.id_present = false; action->encap.no_csum = 1; flower->action_count++; NL_ATTR_FOR_EACH_UNSAFE(tun_attr, tun_left, tunnel, tunnel_len) { @@ -1613,6 +1631,16 @@ parse_put_flow_set_action(struct tc_flower *flower, struct tc_action *action, action->encap.data.present.len = nl_attr_get_size(tun_attr); } break; + case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: { + if (odp_vxlan_tun_opts_from_attr(tun_attr, + &action->encap.gbp.id, + &action->encap.gbp.flags, + &action->encap.gbp.id_present)) { + VLOG_ERR_RL(&rl, "error parsing VXLAN options"); + return EINVAL; + } + } + break; default: VLOG_DBG_RL(&rl, "unsupported tunnel key attribute %d", nl_attr_type(tun_attr)); diff --git a/lib/odp-util.c b/lib/odp-util.c index d2414eb559b..3eb2c3cb98c 100644 --- a/lib/odp-util.c +++ b/lib/odp-util.c @@ -3149,22 +3149,12 @@ odp_tun_key_from_attr__(const struct nlattr *attr, bool 
is_mask, tun->flags |= FLOW_TNL_F_OAM; break; case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: { - static const struct nl_policy vxlan_opts_policy[] = { - [OVS_VXLAN_EXT_GBP] = { .type = NL_A_U32 }, - }; - struct nlattr *ext[ARRAY_SIZE(vxlan_opts_policy)]; - - if (!nl_parse_nested(a, vxlan_opts_policy, ext, ARRAY_SIZE(ext))) { + if (odp_vxlan_tun_opts_from_attr(a, &tun->gbp_id, + &tun->gbp_flags, + NULL)) { odp_parse_error(&rl, errorp, "error parsing VXLAN options"); return ODP_FIT_ERROR; } - - if (ext[OVS_VXLAN_EXT_GBP]) { - uint32_t gbp = nl_attr_get_u32(ext[OVS_VXLAN_EXT_GBP]); - - odp_decode_gbp_raw(gbp, &tun->gbp_id, &tun->gbp_flags); - } - break; } case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS: @@ -8844,3 +8834,29 @@ commit_odp_actions(const struct flow *flow, struct flow *base, return slow1 ? slow1 : slow2; } + +int +odp_vxlan_tun_opts_from_attr(const struct nlattr *tun_attr, ovs_be16 *id, + uint8_t *flags, bool *id_present) +{ + static const struct nl_policy vxlan_opts_policy[] = { + [OVS_VXLAN_EXT_GBP] = { .type = NL_A_U32 }, + }; + struct nlattr *ext[ARRAY_SIZE(vxlan_opts_policy)]; + + if (!nl_parse_nested(tun_attr, vxlan_opts_policy, ext, ARRAY_SIZE(ext))) { + return EINVAL; + } + + if (ext[OVS_VXLAN_EXT_GBP]) { + uint32_t gbp_raw = nl_attr_get_u32(ext[OVS_VXLAN_EXT_GBP]); + + odp_decode_gbp_raw(gbp_raw, id, flags); + } + + if (id_present) { + *id_present = !!ext[OVS_VXLAN_EXT_GBP]; + } + + return 0; +} diff --git a/lib/odp-util.h b/lib/odp-util.h index 163efe7a87b..8c7baa680dd 100644 --- a/lib/odp-util.h +++ b/lib/odp-util.h @@ -292,6 +292,9 @@ enum slow_path_reason commit_odp_actions(const struct flow *, bool pending_decap, struct ofpbuf *encap_data); +int odp_vxlan_tun_opts_from_attr(const struct nlattr *tun_attr, ovs_be16 *id, + uint8_t *flags, bool *id_present); + /* ofproto-dpif interface. * * The following types and functions are logically part of ofproto-dpif. 
diff --git a/lib/tc.c b/lib/tc.c index 7434b0150f7..e34a1a5f090 100644 --- a/lib/tc.c +++ b/lib/tc.c @@ -1290,6 +1290,35 @@ nl_parse_act_geneve_opts(const struct nlattr *in_nlattr, return 0; } +static int +nl_parse_act_vxlan_opts(struct nlattr *in_nlattr, struct tc_action *action) +{ + const struct ofpbuf *msg; + struct nlattr *nla; + struct ofpbuf buf; + size_t left; + + nl_attr_get_nested(in_nlattr, &buf); + msg = &buf; + + NL_ATTR_FOR_EACH (nla, left, ofpbuf_at(msg, 0, 0), msg->size) { + uint16_t type = nl_attr_type(nla); + int32_t gbp_raw; + + switch (type) { + case TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP: + gbp_raw = nl_attr_get_u32(nla); + odp_decode_gbp_raw(gbp_raw, &action->encap.gbp.id, + &action->encap.gbp.flags); + action->encap.gbp.id_present = true; + + break; + } + } + + return 0; +} + static int nl_parse_act_tunnel_opts(struct nlattr *options, struct tc_action *action) { @@ -1314,7 +1343,12 @@ nl_parse_act_tunnel_opts(struct nlattr *options, struct tc_action *action) if (err) { return err; } - + break; + case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN: + err = nl_parse_act_vxlan_opts(nla, action); + if (err) { + return err; + } break; } } @@ -2640,6 +2674,27 @@ nl_msg_put_act_tunnel_geneve_option(struct ofpbuf *request, nl_msg_end_nested(request, outer); } +static void +nl_msg_put_act_tunnel_vxlan_opts(struct ofpbuf *request, + struct tc_action_encap *encap) +{ + size_t outer, inner; + uint32_t gbp_raw; + + if (!encap->gbp.id_present) { + return; + } + + gbp_raw = odp_encode_gbp_raw(encap->gbp.flags, + encap->gbp.id); + outer = nl_msg_start_nested_with_flag(request, TCA_TUNNEL_KEY_ENC_OPTS); + inner = nl_msg_start_nested_with_flag(request, + TCA_TUNNEL_KEY_ENC_OPTS_VXLAN); + nl_msg_put_u32(request, TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP, gbp_raw); + nl_msg_end_nested(request, inner); + nl_msg_end_nested(request, outer); +} + static void nl_msg_put_act_tunnel_key_set(struct ofpbuf *request, struct tc_action_encap *encap, @@ -2680,6 +2735,7 @@ 
nl_msg_put_act_tunnel_key_set(struct ofpbuf *request, nl_msg_put_be16(request, TCA_TUNNEL_KEY_ENC_DST_PORT, encap->tp_dst); } + nl_msg_put_act_tunnel_vxlan_opts(request, encap); nl_msg_put_act_tunnel_geneve_option(request, &encap->data); nl_msg_put_u8(request, TCA_TUNNEL_KEY_NO_CSUM, encap->no_csum); } diff --git a/lib/tc.h b/lib/tc.h index 1d648282a00..06707ffa467 100644 --- a/lib/tc.h +++ b/lib/tc.h @@ -227,6 +227,7 @@ struct tc_action_encap { struct in6_addr ipv6_dst; } ipv6; struct tun_metadata data; + struct tc_tunnel_gbp gbp; }; struct tc_action { From 7f04588d78fe0a571f5107b0d5aeda832383f284 Mon Sep 17 00:00:00 2001 From: Gavin Li Date: Tue, 27 Jun 2023 13:48:13 +0300 Subject: [PATCH 294/833] netdev-tc-offloads: Probe for allowing vxlan gbp support. Kernels that do not support vxlan gbp would treat the rule that has vxlan gbp encap action or vxlan gbp id match differently, either reject it or just skip the action/match and continue processing the known ones. To solve the issue, probe and disallow inserting rules with vxlan gbp action/match if the kernel does not support it.
Reviewed-by: Roi Dayan Reviewed-by: Simon Horman Signed-off-by: Gavin Li Signed-off-by: Eelco Chaudron --- lib/netdev-offload-tc.c | 64 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 3 deletions(-) diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c index c43eacd4d94..b846a63c222 100644 --- a/lib/netdev-offload-tc.c +++ b/lib/netdev-offload-tc.c @@ -52,6 +52,7 @@ static struct hmap tc_to_ufid = HMAP_INITIALIZER(&tc_to_ufid); static bool multi_mask_per_prio = false; static bool block_support = false; static uint16_t ct_state_support; +static bool vxlan_gbp_support = false; struct netlink_field { int offset; @@ -668,14 +669,17 @@ static void parse_tc_flower_geneve_opts(struct tc_action *action, nl_msg_end_nested(buf, geneve_off); } -static void +static int parse_tc_flower_vxlan_tun_opts(struct tc_action *action, struct ofpbuf *buf) { size_t gbp_off; uint32_t gbp_raw; if (!action->encap.gbp.id_present) { - return; + return 0; + } + if (!vxlan_gbp_support) { + return -EOPNOTSUPP; } gbp_off = nl_msg_start_nested(buf, OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS); @@ -683,6 +687,7 @@ parse_tc_flower_vxlan_tun_opts(struct tc_action *action, struct ofpbuf *buf) action->encap.gbp.id); nl_msg_put_u32(buf, OVS_VXLAN_EXT_GBP, gbp_raw); nl_msg_end_nested(buf, gbp_off); + return 0; } static void @@ -845,6 +850,7 @@ parse_tc_flower_to_actions__(struct tc_flower *flower, struct ofpbuf *buf, size_t set_offset = nl_msg_start_nested(buf, OVS_ACTION_ATTR_SET); size_t tunnel_offset = nl_msg_start_nested(buf, OVS_KEY_ATTR_TUNNEL); + int ret; if (action->encap.id_present) { nl_msg_put_be64(buf, OVS_TUNNEL_KEY_ATTR_ID, action->encap.id); @@ -880,7 +886,10 @@ parse_tc_flower_to_actions__(struct tc_flower *flower, struct ofpbuf *buf, if (!action->encap.no_csum) { nl_msg_put_flag(buf, OVS_TUNNEL_KEY_ATTR_CSUM); } - parse_tc_flower_vxlan_tun_opts(action, buf); + ret = parse_tc_flower_vxlan_tun_opts(action, buf); + if (ret) { + return ret; + } 
parse_tc_flower_geneve_opts(action, buf); nl_msg_end_nested(buf, tunnel_offset); nl_msg_end_nested(buf, set_offset); @@ -1632,6 +1641,9 @@ parse_put_flow_set_action(struct tc_flower *flower, struct tc_action *action, } break; case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: { + if (!vxlan_gbp_support) { + return EOPNOTSUPP; + } if (odp_vxlan_tun_opts_from_attr(tun_attr, &action->encap.gbp.id, &action->encap.gbp.flags, @@ -2787,6 +2799,51 @@ probe_tc_block_support(int ifindex) } } +static void +probe_vxlan_gbp_support(int ifindex) +{ + struct tc_flower flower; + struct tcf_id id; + int block_id = 0; + int prio = 1; + int error; + + error = tc_add_del_qdisc(ifindex, true, block_id, TC_INGRESS); + if (error) { + return; + } + + memset(&flower, 0, sizeof flower); + + flower.tc_policy = TC_POLICY_SKIP_HW; + flower.key.eth_type = htons(ETH_P_IP); + flower.mask.eth_type = OVS_BE16_MAX; + flower.tunnel = true; + flower.mask.tunnel.id = OVS_BE64_MAX; + flower.mask.tunnel.ipv4.ipv4_src = OVS_BE32_MAX; + flower.mask.tunnel.ipv4.ipv4_dst = OVS_BE32_MAX; + flower.mask.tunnel.tp_dst = OVS_BE16_MAX; + flower.mask.tunnel.gbp.id = OVS_BE16_MAX; + flower.key.tunnel.ipv4.ipv4_src = htonl(0x01010101); + flower.key.tunnel.ipv4.ipv4_dst = htonl(0x01010102); + flower.key.tunnel.tp_dst = htons(46354); + flower.key.tunnel.gbp.id = htons(512); + + id = tc_make_tcf_id(ifindex, block_id, prio, TC_INGRESS); + error = tc_replace_flower(&id, &flower); + if (error) { + goto out; + } + + tc_del_flower_filter(&id); + + vxlan_gbp_support = true; + VLOG_INFO("probe tc: vxlan gbp is supported."); + +out: + tc_add_del_qdisc(ifindex, false, block_id, TC_INGRESS); +} + static int tc_get_policer_action_ids(struct hmap *map) { @@ -2914,6 +2971,7 @@ netdev_tc_init_flow_api(struct netdev *netdev) probe_multi_mask_per_prio(ifindex); probe_ct_state_support(ifindex); + probe_vxlan_gbp_support(ifindex); ovs_mutex_lock(&meter_police_ids_mutex); meter_police_ids = id_pool_create(METER_POLICE_IDS_BASE, From 
b4c7009c20e720329caa8f109f2b25edc0bcf31a Mon Sep 17 00:00:00 2001 From: Gavin Li Date: Tue, 27 Jun 2023 13:48:14 +0300 Subject: [PATCH 295/833] system-offloads-traffic.at: Add vxlan gbp offload test. Add a vxlan gbp offload test case: vxlan offloads with gbp extension - ping between two ports - offloads enabled ok Reviewed-by: Roi Dayan Reviewed-by: Simon Horman Signed-off-by: Gavin Li Signed-off-by: Eelco Chaudron --- tests/system-offloads-traffic.at | 50 ++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/tests/system-offloads-traffic.at b/tests/system-offloads-traffic.at index ae302a29499..7215e36e2d8 100644 --- a/tests/system-offloads-traffic.at +++ b/tests/system-offloads-traffic.at @@ -805,3 +805,53 @@ OVS_TRAFFIC_VSWITCHD_STOP(["/could not open network device ovs-p0/d /failed to offload flow/d "]) AT_CLEANUP + +AT_SETUP([offloads - ping over vxlan tunnel with gbp - offloads enabled]) +OVS_CHECK_TUNNEL_TSO() +OVS_CHECK_VXLAN() + +OVS_TRAFFIC_VSWITCHD_START([], [], [-- set Open_vSwitch . other_config:hw-offload=true]) +AT_SKIP_IF([! grep -q "probe tc: vxlan gbp is supported." ovs-vswitchd.log]) +ADD_BR([br-underlay]) + +AT_CHECK([ovs-ofctl add-flow br-underlay "actions=normal"]) + +ADD_NAMESPACES(at_ns0) + +dnl Set up underlay link from host into the namespace using veth pair. +ADD_VETH(p0, at_ns0, br-underlay, "172.31.1.1/24") +AT_CHECK([ip addr add dev br-underlay "172.31.1.100/24"]) +AT_CHECK([ip link set dev br-underlay up]) + +dnl Set up tunnel endpoints on OVS outside the namespace and with a native +dnl linux device inside the namespace. 
+ADD_OVS_TUNNEL([vxlan], [br0], [at_vxlan0], [172.31.1.1], [10.1.1.100/24], [options:exts=gbp]) +AT_CHECK([ovs-ofctl add-flow br0 "in_port=br0 actions=load:0x200->NXM_NX_TUN_GBP_ID[], output:at_vxlan0]") +AT_CHECK([ovs-ofctl add-flow br0 "in_port=at_vxlan0, tun_gbp_id=512 actions=output:br0"]) +AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"]) + +ADD_NATIVE_TUNNEL([vxlan], [at_vxlan1], [at_ns0], [172.31.1.100], [10.1.1.1/24], + [id 0 dstport 4789 gbp]) +NS_CHECK_EXEC([at_ns0], [iptables -I OUTPUT -p ip -j MARK --set-mark 512 2>/dev/null], [0]) +NS_CHECK_EXEC([at_ns0], [iptables -I INPUT -m mark --mark 512 -j ACCEPT 2>/dev/null], [0], [ignore]) + +dnl First, check the underlay. +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Okay, now check the overlay. +NS_CHECK_EXEC([at_ns0], [ping -q -c 1000 -i 0.01 10.1.1.100 | FORMAT_PING], [0], [dnl +1000 packets transmitted, 1000 received, 0% packet loss, time 0ms +]) + +AT_CHECK([ovs-appctl dpctl/dump-flows type=tc,offloaded | grep "eth_type(0x0800)" | grep "tp_dst=4789,vxlan(gbp(id=512))" | wc -l], [0], [dnl +1 +]) +AT_CHECK([ovs-appctl dpctl/dump-flows type=tc,offloaded | grep "eth_type(0x0800)" | grep "tp_dst=4789,vxlan(gbp(id=512,flags=0))" | wc -l], [0], [dnl +1 +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + From a5669fd51c9b1276ca03d54a5f069b5221915325 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Fri, 7 Jul 2023 15:59:23 +0200 Subject: [PATCH 296/833] netdev-dpdk: Drop TSO in case of conflicting virtio features. At some point in OVS history, some virtio features were announced as supported (ECN and UFO virtio features). The userspace TSO code, which has been added later, does not support those features and tries to disable them. This breaks OVS upgrades: if an existing VM already negotiated such features, their lack on reconnection to an upgraded OVS triggers a vhost socket disconnection by Qemu. 
This results in an endless loop because Qemu then retries with the same set of virtio features. This patch proposes to try and detect those vhost socket disconnections and fall back to restoring the old virtio features (and disabling TSO for this vhost port). Acked-by: Mike Pattrick Acked-by: Simon Horman Acked-by: Maxime Coquelin Signed-off-by: David Marchand Signed-off-by: Ilya Maximets --- Documentation/topics/userspace-tso.rst | 26 ++++++- lib/netdev-dpdk.c | 100 ++++++++++++++++++++++++- 2 files changed, 120 insertions(+), 6 deletions(-) diff --git a/Documentation/topics/userspace-tso.rst b/Documentation/topics/userspace-tso.rst index 5a43c2e86b8..c4b15f2604a 100644 --- a/Documentation/topics/userspace-tso.rst +++ b/Documentation/topics/userspace-tso.rst @@ -68,7 +68,7 @@ as follows. connection is established, `TSO` is thus advertised to the guest as an available feature: -QEMU Command Line Parameter:: +1. QEMU Command Line Parameter:: $ sudo $QEMU_DIR/x86_64-softmmu/qemu-system-x86_64 \ ... @@ -77,12 +77,34 @@ QEMU Command Line Parameter:: ... 2. Ethtool. Assuming that the guest's OS also supports `TSO`, ethtool can be -used to enable same:: + used to enable same:: $ ethtool -K eth0 sg on # scatter-gather is a prerequisite for TSO $ ethtool -K eth0 tso on $ ethtool -k eth0 +**Note:** Enabling this feature impacts the virtio features exposed by the DPDK +vHost User backend to a guest. If a guest was already connected to OvS before +enabling TSO and restarting OvS, this guest's ports won't have TSO available:: + + $ ovs-vsctl get interface vhost0 status:tx_tcp_seg_offload + "false" + +To help diagnose the issue, those ports have some additional information in +their status field in ovsdb:: + + $ ovs-vsctl get interface vhost0 status:userspace-tso + disabled + +To restore TSO for this guest's ports, its QEMU process must be stopped, +then started again. 
OvS will then report:: + + $ ovs-vsctl get interface vhost0 status:tx_tcp_seg_offload + "true" + + $ ovs-vsctl get interface vhost0 status:userspace-tso + ovs-vsctl: no key "userspace-tso" in Interface record "vhost0" column status + ~~~~~~~~~~~ Limitations ~~~~~~~~~~~ diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 63dac689e38..4415443924d 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -418,6 +418,18 @@ enum dpdk_hw_ol_features { NETDEV_TX_TSO_OFFLOAD = 1 << 7, }; +/* Flags for the netdev_dpdk virtio_features_state field. + * This is used for the virtio features recovery mechanism linked to TSO + * support. */ +#define OVS_VIRTIO_F_CLEAN (UINT8_C(1) << 0) +#define OVS_VIRTIO_F_WORKAROUND (UINT8_C(1) << 1) +#define OVS_VIRTIO_F_NEGOTIATED (UINT8_C(1) << 2) +#define OVS_VIRTIO_F_RECONF_PENDING (UINT8_C(1) << 3) +#define OVS_VIRTIO_F_CLEAN_NEGOTIATED \ + (OVS_VIRTIO_F_CLEAN | OVS_VIRTIO_F_NEGOTIATED) +#define OVS_VIRTIO_F_WORKAROUND_NEGOTIATED \ + (OVS_VIRTIO_F_WORKAROUND | OVS_VIRTIO_F_NEGOTIATED) + /* * In order to avoid confusion in variables names, following naming convention * should be used, if possible: @@ -474,7 +486,11 @@ struct netdev_dpdk { bool vhost_reconfigured; atomic_uint8_t vhost_tx_retries_max; - /* 2 pad bytes here. */ + + /* Flags for virtio features recovery mechanism. */ + uint8_t virtio_features_state; + + /* 1 pad byte here. 
*/ ); PADDED_MEMBERS(CACHE_LINE_SIZE, @@ -1359,6 +1375,7 @@ common_construct(struct netdev *netdev, dpdk_port_t port_no, dev->requested_lsc_interrupt_mode = 0; ovsrcu_index_init(&dev->vid, -1); dev->vhost_reconfigured = false; + dev->virtio_features_state = OVS_VIRTIO_F_CLEAN; dev->attached = false; dev->started = false; dev->reset_needed = false; @@ -3883,6 +3900,12 @@ netdev_dpdk_vhost_user_get_status(const struct netdev *netdev, xasprintf("%d", vring.size)); } + if (userspace_tso_enabled() + && dev->virtio_features_state & OVS_VIRTIO_F_WORKAROUND) { + + smap_add_format(args, "userspace-tso", "disabled"); + } + ovs_mutex_unlock(&dev->mutex); return 0; } @@ -4245,6 +4268,8 @@ new_device(int vid) newnode = dev->socket_id; } + dev->virtio_features_state |= OVS_VIRTIO_F_NEGOTIATED; + if (dev->requested_n_txq < qp_num || dev->requested_n_rxq < qp_num || dev->requested_socket_id != newnode @@ -4268,7 +4293,9 @@ new_device(int vid) dev->hw_ol_features |= NETDEV_TX_SCTP_CKSUM_OFFLOAD; } - if (userspace_tso_enabled()) { + if (userspace_tso_enabled() + && dev->virtio_features_state & OVS_VIRTIO_F_CLEAN) { + if (features & (1ULL << VIRTIO_NET_F_GUEST_TSO4) && features & (1ULL << VIRTIO_NET_F_GUEST_TSO6)) { @@ -4524,6 +4551,45 @@ destroy_connection(int vid) dev->requested_n_txq = qp_num; netdev_request_reconfigure(&dev->up); } + + if (!(dev->virtio_features_state & OVS_VIRTIO_F_NEGOTIATED)) { + /* The socket disconnected before reaching new_device. It + * likely means that the guest did not agree with the virtio + * features. 
*/ + VLOG_WARN_RL(&rl, "Connection on socket '%s' closed during " + "initialization.", dev->vhost_id); + } + if (!(dev->virtio_features_state & OVS_VIRTIO_F_RECONF_PENDING)) { + switch (dev->virtio_features_state) { + case OVS_VIRTIO_F_CLEAN: + dev->virtio_features_state = OVS_VIRTIO_F_WORKAROUND; + break; + + case OVS_VIRTIO_F_WORKAROUND: + dev->virtio_features_state = OVS_VIRTIO_F_CLEAN; + break; + + case OVS_VIRTIO_F_CLEAN_NEGOTIATED: + /* The virtio features were clean and got accepted by the + * guest. We expect it will be the case in the future and + * change nothing. */ + break; + + case OVS_VIRTIO_F_WORKAROUND_NEGOTIATED: + /* Let's try to go with clean virtio features on a next + * connection. */ + dev->virtio_features_state = OVS_VIRTIO_F_CLEAN; + break; + + default: + OVS_NOT_REACHED(); + } + if (!(dev->virtio_features_state & OVS_VIRTIO_F_NEGOTIATED)) { + dev->virtio_features_state |= OVS_VIRTIO_F_RECONF_PENDING; + netdev_request_reconfigure(&dev->up); + } + } + ovs_mutex_unlock(&dev->mutex); exists = true; break; @@ -5454,10 +5520,31 @@ static int netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev) { struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); + bool unregister = false; + char *vhost_id; int err; ovs_mutex_lock(&dev->mutex); + if (dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT && dev->vhost_id + && dev->virtio_features_state & OVS_VIRTIO_F_RECONF_PENDING) { + + /* This vhost-user port was registered to the vhost library already, + * but a socket disconnection happened and configuration must be + * re-evaluated wrt dev->virtio_features_state. */ + dev->vhost_driver_flags &= ~RTE_VHOST_USER_CLIENT; + vhost_id = dev->vhost_id; + unregister = true; + } + + ovs_mutex_unlock(&dev->mutex); + + if (unregister) { + dpdk_vhost_driver_unregister(dev, vhost_id); + } + + ovs_mutex_lock(&dev->mutex); + /* Configure vHost client mode if requested and if the following criteria * are met: * 1. Device hasn't been registered yet. 
@@ -5466,6 +5553,11 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev) if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT) && dev->vhost_id) { uint64_t virtio_unsup_features = 0; uint64_t vhost_flags = 0; + bool enable_tso; + + enable_tso = userspace_tso_enabled() + && dev->virtio_features_state & OVS_VIRTIO_F_CLEAN; + dev->virtio_features_state &= ~OVS_VIRTIO_F_RECONF_PENDING; /* Register client-mode device. */ vhost_flags |= RTE_VHOST_USER_CLIENT; @@ -5487,7 +5579,7 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev) } /* Enable External Buffers if TCP Segmentation Offload is enabled. */ - if (userspace_tso_enabled()) { + if (enable_tso) { vhost_flags |= RTE_VHOST_USER_EXTBUF_SUPPORT; } @@ -5512,7 +5604,7 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev) goto unlock; } - if (userspace_tso_enabled()) { + if (enable_tso) { virtio_unsup_features = 1ULL << VIRTIO_NET_F_HOST_ECN | 1ULL << VIRTIO_NET_F_HOST_UFO; VLOG_DBG("%s: TSO enabled on vhost port", From fc06ea9a18837060eebedcb54c87da537530dc04 Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Tue, 4 Jul 2023 21:59:56 +0200 Subject: [PATCH 297/833] netdev-dpdk: Add custom rx-steering configuration. Some control protocols are used to maintain link status between forwarding engines (e.g. LACP). When the system is not sized properly, the PMD threads may not be able to process all incoming traffic from the configured Rx queues. When a signaling packet of such protocols is dropped, it can cause link flapping, worsening the situation. Use the rte_flow API to redirect these protocols into a dedicated Rx queue. The assumption is made that the ratio between control protocol traffic and user data traffic is very low and thus this dedicated Rx queue will never get full. Re-program the RSS redirection table to only use the other Rx queues. The additional Rx queue will be assigned a PMD core like any other Rx queue. 
Polling that extra queue may introduce increased latency and a slight performance penalty at the benefit of preventing link flapping. This feature must be enabled per port on specific protocols via the rx-steering option. This option takes "rss" followed by a "+" separated list of protocol names. It is only supported on ethernet ports. This feature is experimental. If the user has already configured multiple Rx queues on the port, an additional one will be allocated for control packets. If the hardware cannot satisfy the number of requested Rx queues, the last Rx queue will be assigned for control plane. If only one Rx queue is available, the rx-steering feature will be disabled. If the hardware does not support the rte_flow matchers/actions, the rx-steering feature will be completely disabled on the port and regular rss will be performed instead. It cannot be enabled when other-config:hw-offload=true as it may conflict with the offloaded flows. Similarly, if hw-offload is enabled, custom rx-steering will be forcibly disabled on all ports and replaced by regular rss. Example use: ovs-vsctl add-bond br-phy bond0 phy0 phy1 -- \ set interface phy0 type=dpdk options:dpdk-devargs=0000:ca:00.0 -- \ set interface phy0 options:rx-steering=rss+lacp -- \ set interface phy1 type=dpdk options:dpdk-devargs=0000:ca:00.1 -- \ set interface phy1 options:rx-steering=rss+lacp As a starting point, only one protocol is supported: LACP. Other protocols can be added in the future. NIC compatibility should be checked. To validate that this works as intended, I used a traffic generator to generate random traffic slightly above the machine capacity at line rate on a two ports bond interface. OVS is configured to receive traffic on two VLANs and pop/push them in a br-int bridge based on tags set on patch ports. 
+----------------------+ | DUT | |+--------------------+| || br-int || in_port=patch10,actions=mod_dl_src:$patch11, || || mod_dl_dst:$tgen0, || || output:patch10 || || in_port=patch11,actions=mod_dl_src:$patch10 || || mod_dl_dst:$tgen0, || patch10 patch11 || output:patch10 |+---|-----------|----+| | | | | |+---|-----------|----+| || patch00 patch01 || || tag:10 tag:20 || || || || br-phy || default flow, action=NORMAL || || || bond0 || balance-slb, lacp=passive, lacp-time=fast || phy0 phy1 || |+------|-----|-------+| +-------|-----|--------+ | | +-------|-----|--------+ | port0 port1 | balance L3/L4, lacp=active, lacp-time=fast | lag | mode trunk VLANs 10, 20 | | | switch | | | | vlan 10 vlan 20 | mode access | port2 port3 | +-----|----------|-----+ | | +-----|----------|-----+ | tgen0 tgen1 | Random traffic that is properly balanced | | across the bond ports in both directions. | traffic generator | +----------------------+ Without rx-steering, the bond0 links are randomly switching to "defaulted" when one of the LACP packets sent by the switch is dropped because the RX queues are full and the PMD threads did not process them fast enough. When that happens, all traffic must go through a single link which causes above line rate traffic to be dropped. ~# ovs-appctl lacp/show-stats bond0 ---- bond0 statistics ---- member: phy0: TX PDUs: 347246 RX PDUs: 14865 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 168 Link Defaulted: 0 Carrier Status Changed: 0 member: phy1: TX PDUs: 347245 RX PDUs: 14919 RX Bad PDUs: 0 RX Marker Request PDUs: 0 Link Expired: 147 Link Defaulted: 1 Carrier Status Changed: 0 When rx-steering is enabled, no LACP packet is dropped and the bond links remain enabled at all times, maximizing the throughput. Neither the "Link Expired" nor the "Link Defaulted" counters are incremented anymore. This feature may be considered as "QoS". However, it does not work by limiting the rate of traffic explicitly. 
It only guarantees that some protocols have a lower chance of being dropped because the PMD cores cannot keep up with regular traffic. The choice of protocols is limited on purpose. This is not meant to be configurable by users. Some limited configurability could be considered in the future but it would expose to more potential issues if users are accidentally redirecting all traffic in the isolated queue. Acked-by: Kevin Traynor Acked-by: Aaron Conole Signed-off-by: Robin Jarry Signed-off-by: Ilya Maximets --- Documentation/topics/dpdk/phy.rst | 87 +++++++++ NEWS | 3 + lib/netdev-dpdk.c | 315 +++++++++++++++++++++++++++++- vswitchd/vswitch.xml | 44 +++++ 4 files changed, 446 insertions(+), 3 deletions(-) diff --git a/Documentation/topics/dpdk/phy.rst b/Documentation/topics/dpdk/phy.rst index 4b0fe8dded3..f66b106c46a 100644 --- a/Documentation/topics/dpdk/phy.rst +++ b/Documentation/topics/dpdk/phy.rst @@ -131,6 +131,93 @@ possible with DPDK acceleration. It is possible to configure multiple Rx queues for ``dpdk`` ports, thus ensuring this is not a bottleneck for performance. For information on configuring PMD threads, refer to :doc:`pmd`. +Traffic Rx Steering +------------------- + +.. warning:: This feature is experimental. + +Some control protocols are used to maintain link status between forwarding +engines. In SDN environments, these packets share the same physical network +with the user data traffic. + +When the system is not sized properly, the PMD threads may not be able to +process all incoming traffic from the configured Rx queues. When a signaling +packet of such protocols is dropped, it can cause link flapping, worsening the +situation. + +Some physical NICs can be programmed to put these protocols in a dedicated +hardware Rx queue using the rte_flow__ API. + +__ https://doc.dpdk.org/guides-22.11/prog_guide/rte_flow.html + +.. warning:: + + This feature is not compatible with all NICs. 
Refer to the DPDK + `compatibilty matrix`__ and vendor documentation for more details. + + __ https://doc.dpdk.org/guides-22.11/nics/overview.html + +Rx steering must be enabled for specific protocols per port. The +``rx-steering`` option takes one of the following values: + +``rss`` + Do regular RSS on all configured Rx queues. This is the default behaviour. + +``rss+lacp`` + Do regular RSS on all configured Rx queues. An extra Rx queue is configured + for LACP__ packets (ether type ``0x8809``). + + __ https://www.ieee802.org/3/ad/public/mar99/seaman_1_0399.pdf + +Example:: + + $ ovs-vsctl add-port br0 dpdk-p0 -- set Interface dpdk-p0 type=dpdk \ + options:dpdk-devargs=0000:01:00.0 options:n_rxq=2 \ + options:rx-steering=rss+lacp + +.. note:: + + If multiple Rx queues are already configured, regular hash-based RSS + (Receive Side Scaling) queue balancing is done on all but the extra Rx + queue. + +.. tip:: + + You can check if Rx steering is supported on a port with the following + command:: + + $ ovs-vsctl get interface dpdk-p0 status + {..., rss_queues="0-1", rx_steering_queue="2"} + + This will also show in ``ovs-vswitchd.log``:: + + INFO|dpdk-p0: rx-steering: redirecting lacp traffic to queue 2 + INFO|dpdk-p0: rx-steering: applying rss on queues 0-1 + + If the hardware does not support redirecting the specified protocols to + a dedicated queue, it will be explicit:: + + $ ovs-vsctl get interface dpdk-p0 status + {..., rx_steering=unsupported} + + More details can often be found in ``ovs-vswitchd.log``:: + + WARN|dpdk-p0: rx-steering: failed to add lacp flow: Unsupported pattern + +To disable Rx steering on a port, use the following command:: + + $ ovs-vsctl remove Interface dpdk-p0 options rx-steering + +You can see that it has been disabled in ``ovs-vswitchd.log``:: + + INFO|dpdk-p0: rx-steering: default rss + +.. warning:: + + This feature is mutually exclusive with ``other-config:hw-offload`` as it + may conflict with the offloaded flows. 
If both are enabled, ``rx-steering`` + will fall back to default ``rss`` mode. + .. _dpdk-phy-flow-control: Flow Control diff --git a/NEWS b/NEWS index 6a990c92151..eedaad07b13 100644 --- a/NEWS +++ b/NEWS @@ -36,6 +36,9 @@ Post-v3.1.0 * ovs-vswitchd will keep the CAP_SYS_RAWIO capability when started with the --hw-rawio-access command line option. This allows the process extra privileges when mapping physical interconnect memory. + * New experimental "rx-steering=rss+" option to redirect + certain protocols (for now, only LACP) to a dedicated hardware queue + using the rte_flow API. - SRv6 Tunnel Protocol * Added support for userspace datapath (only). - Userspace datapath: diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 4415443924d..aa87ee5468e 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -418,6 +418,10 @@ enum dpdk_hw_ol_features { NETDEV_TX_TSO_OFFLOAD = 1 << 7, }; +enum dpdk_rx_steer_flags { + DPDK_RX_STEER_LACP = 1 << 0, +}; + /* Flags for the netdev_dpdk virtio_features_state field. * This is used for the virtio features recovery mechanism linked to TSO * support. */ @@ -524,6 +528,12 @@ struct netdev_dpdk { * netdev_dpdk*_reconfigure() is called */ int requested_mtu; int requested_n_txq; + /* User input for n_rxq (see dpdk_set_rxq_config). */ + int user_n_rxq; + /* user_n_rxq + an optional rx steering queue (see + * netdev_dpdk_reconfigure). This field is different from the other + * requested_* fields as it may contain a different value than the user + * input. */ int requested_n_rxq; int requested_rxq_size; int requested_txq_size; @@ -553,6 +563,13 @@ struct netdev_dpdk { /* VF configuration. */ struct eth_addr requested_hwaddr; + + /* Requested rx queue steering flags, + * from the enum set 'dpdk_rx_steer_flags'. 
*/ + uint64_t requested_rx_steer_flags; + uint64_t rx_steer_flags; + size_t rx_steer_flows_num; + struct rte_flow **rx_steer_flows; ); PADDED_MEMBERS(CACHE_LINE_SIZE, @@ -1388,10 +1405,15 @@ common_construct(struct netdev *netdev, dpdk_port_t port_no, netdev->n_rxq = 0; netdev->n_txq = 0; + dev->user_n_rxq = NR_QUEUE; dev->requested_n_rxq = NR_QUEUE; dev->requested_n_txq = NR_QUEUE; dev->requested_rxq_size = NIC_PORT_DEFAULT_RXQ_SIZE; dev->requested_txq_size = NIC_PORT_DEFAULT_TXQ_SIZE; + dev->requested_rx_steer_flags = 0; + dev->rx_steer_flags = 0; + dev->rx_steer_flows_num = 0; + dev->rx_steer_flows = NULL; /* Initialize the flow control to NULL */ memset(&dev->fc_conf, 0, sizeof dev->fc_conf); @@ -1566,6 +1588,8 @@ common_destruct(struct netdev_dpdk *dev) ovs_mutex_destroy(&dev->mutex); } +static void dpdk_rx_steer_unconfigure(struct netdev_dpdk *); + static void netdev_dpdk_destruct(struct netdev *netdev) { @@ -1573,6 +1597,9 @@ netdev_dpdk_destruct(struct netdev *netdev) ovs_mutex_lock(&dpdk_mutex); + /* Destroy any rx-steering flows to allow RXQs to be removed. */ + dpdk_rx_steer_unconfigure(dev); + rte_eth_dev_stop(dev->port_id); dev->started = false; @@ -1812,7 +1839,7 @@ netdev_dpdk_get_config(const struct netdev *netdev, struct smap *args) ovs_mutex_lock(&dev->mutex); - smap_add_format(args, "requested_rx_queues", "%d", dev->requested_n_rxq); + smap_add_format(args, "requested_rx_queues", "%d", dev->user_n_rxq); smap_add_format(args, "configured_rx_queues", "%d", netdev->n_rxq); smap_add_format(args, "requested_tx_queues", "%d", dev->requested_n_txq); smap_add_format(args, "configured_tx_queues", "%d", netdev->n_txq); @@ -1826,6 +1853,9 @@ netdev_dpdk_get_config(const struct netdev *netdev, struct smap *args) } else { smap_add(args, "rx_csum_offload", "false"); } + if (dev->rx_steer_flags == DPDK_RX_STEER_LACP) { + smap_add(args, "rx-steering", "rss+lacp"); + } smap_add(args, "lsc_interrupt_mode", dev->lsc_interrupt_mode ? 
"true" : "false"); @@ -1976,8 +2006,8 @@ dpdk_set_rxq_config(struct netdev_dpdk *dev, const struct smap *args) int new_n_rxq; new_n_rxq = MAX(smap_get_int(args, "n_rxq", NR_QUEUE), 1); - if (new_n_rxq != dev->requested_n_rxq) { - dev->requested_n_rxq = new_n_rxq; + if (new_n_rxq != dev->user_n_rxq) { + dev->user_n_rxq = new_n_rxq; netdev_request_reconfigure(&dev->up); } } @@ -2037,6 +2067,41 @@ dpdk_process_queue_size(struct netdev *netdev, const struct smap *args, } } +static void +dpdk_set_rx_steer_config(struct netdev *netdev, struct netdev_dpdk *dev, + const struct smap *args, char **errp) +{ + const char *arg = smap_get_def(args, "rx-steering", "rss"); + uint64_t flags = 0; + + if (!strcmp(arg, "rss+lacp")) { + flags = DPDK_RX_STEER_LACP; + } else if (strcmp(arg, "rss")) { + VLOG_WARN_BUF(errp, "%s: options:rx-steering " + "unsupported parameter value '%s'", + netdev_get_name(netdev), arg); + } + + if (flags && dev->type != DPDK_DEV_ETH) { + VLOG_WARN_BUF(errp, "%s: options:rx-steering " + "is only supported on ethernet ports", + netdev_get_name(netdev)); + flags = 0; + } + + if (flags && netdev_is_flow_api_enabled()) { + VLOG_WARN_BUF(errp, "%s: options:rx-steering " + "is incompatible with hw-offload", + netdev_get_name(netdev)); + flags = 0; + } + + if (flags != dev->requested_rx_steer_flags) { + dev->requested_rx_steer_flags = flags; + netdev_request_reconfigure(netdev); + } +} + static int netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args, char **errp) @@ -2058,6 +2123,8 @@ netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args, ovs_mutex_lock(&dpdk_mutex); ovs_mutex_lock(&dev->mutex); + dpdk_set_rx_steer_config(netdev, dev, args, errp); + dpdk_set_rxq_config(dev, args); new_devargs = smap_get(args, "dpdk-devargs"); @@ -3939,9 +4006,12 @@ netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args) { struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); struct rte_eth_dev_info dev_info; + size_t 
rx_steer_flows_num; + uint64_t rx_steer_flags; const char *bus_info; uint32_t link_speed; uint32_t dev_flags; + int n_rxq; if (!rte_eth_dev_is_valid_port(dev->port_id)) { return ENODEV; @@ -3953,6 +4023,9 @@ netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args) link_speed = dev->link.link_speed; dev_flags = *dev_info.dev_flags; bus_info = rte_dev_bus_info(dev_info.device); + rx_steer_flags = dev->rx_steer_flags; + rx_steer_flows_num = dev->rx_steer_flows_num; + n_rxq = netdev->n_rxq; ovs_mutex_unlock(&dev->mutex); ovs_mutex_unlock(&dpdk_mutex); @@ -3995,6 +4068,19 @@ netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args) ETH_ADDR_ARGS(dev->hwaddr)); } + if (rx_steer_flags) { + if (!rx_steer_flows_num) { + smap_add(args, "rx_steering", "unsupported"); + } else { + smap_add_format(args, "rx_steering_queue", "%d", n_rxq - 1); + if (n_rxq > 2) { + smap_add_format(args, "rss_queues", "0-%d", n_rxq - 2); + } else { + smap_add(args, "rss_queues", "0"); + } + } + } + return 0; } @@ -5376,16 +5462,211 @@ static const struct dpdk_qos_ops trtcm_policer_ops = { .qos_queue_dump_state_init = trtcm_policer_qos_queue_dump_state_init }; +static int +dpdk_rx_steer_add_flow(struct netdev_dpdk *dev, + const struct rte_flow_item items[], + const char *desc) +{ + const struct rte_flow_attr attr = { .ingress = 1 }; + const struct rte_flow_action actions[] = { + { + .type = RTE_FLOW_ACTION_TYPE_QUEUE, + .conf = &(const struct rte_flow_action_queue) { + .index = dev->up.n_rxq - 1, + }, + }, + { .type = RTE_FLOW_ACTION_TYPE_END }, + }; + struct rte_flow_error error; + struct rte_flow *flow; + size_t num; + int err; + + set_error(&error, RTE_FLOW_ERROR_TYPE_NONE); + err = rte_flow_validate(dev->port_id, &attr, items, actions, &error); + if (err) { + VLOG_WARN("%s: rx-steering: device does not support %s flow: %s", + netdev_get_name(&dev->up), desc, + error.message ? 
error.message : ""); + goto out; + } + + set_error(&error, RTE_FLOW_ERROR_TYPE_NONE); + flow = rte_flow_create(dev->port_id, &attr, items, actions, &error); + if (flow == NULL) { + VLOG_WARN("%s: rx-steering: failed to add %s flow: %s", + netdev_get_name(&dev->up), desc, + error.message ? error.message : ""); + err = rte_errno; + goto out; + } + + num = dev->rx_steer_flows_num + 1; + dev->rx_steer_flows = xrealloc(dev->rx_steer_flows, num * sizeof flow); + dev->rx_steer_flows[dev->rx_steer_flows_num] = flow; + dev->rx_steer_flows_num = num; + + VLOG_INFO("%s: rx-steering: redirected %s traffic to rx queue %d", + netdev_get_name(&dev->up), desc, dev->up.n_rxq - 1); +out: + return err; +} + +#define RETA_CONF_SIZE (RTE_ETH_RSS_RETA_SIZE_512 / RTE_ETH_RETA_GROUP_SIZE) + +static int +dpdk_rx_steer_rss_configure(struct netdev_dpdk *dev, int rss_n_rxq) +{ + struct rte_eth_rss_reta_entry64 reta_conf[RETA_CONF_SIZE]; + struct rte_eth_dev_info info; + int err; + + rte_eth_dev_info_get(dev->port_id, &info); + + if (info.reta_size % rss_n_rxq != 0 && + info.reta_size < RTE_ETH_RSS_RETA_SIZE_128) { + /* + * Some drivers set reta_size equal to the total number of rxqs that + * are configured when it is a power of two. Since we are actually + * reconfiguring the redirection table to exclude the last rxq, we may + * end up with an imbalanced redirection table. For example, such + * configuration: + * + * options:n_rxq=3 options:rx-steering=rss+lacp + * + * Will actually configure 4 rxqs on the NIC, and the default reta to: + * + * [0, 1, 2, 3] + * + * And dpdk_rx_steer_rss_configure() will reconfigure reta to: + * + * [0, 1, 2, 0] + * + * Causing queue 0 to receive twice as much traffic as queues 1 and 2. + * + * Work around that corner case by forcing a bigger redirection table + * size to 128 entries when reta_size is not a multiple of rss_n_rxq + * and when reta_size is less than 128. This value seems to be + * supported by most of the drivers that also support rte_flow. 
+ */ + info.reta_size = RTE_ETH_RSS_RETA_SIZE_128; + } + + memset(reta_conf, 0, sizeof reta_conf); + for (uint16_t i = 0; i < info.reta_size; i++) { + uint16_t idx = i / RTE_ETH_RETA_GROUP_SIZE; + uint16_t shift = i % RTE_ETH_RETA_GROUP_SIZE; + + reta_conf[idx].mask |= 1ULL << shift; + reta_conf[idx].reta[shift] = i % rss_n_rxq; + } + + err = rte_eth_dev_rss_reta_update(dev->port_id, reta_conf, info.reta_size); + if (err < 0) { + VLOG_WARN("%s: failed to configure RSS redirection table: err=%d", + netdev_get_name(&dev->up), err); + } + + return err; +} + +static int +dpdk_rx_steer_configure(struct netdev_dpdk *dev) +{ + int err = 0; + + if (dev->up.n_rxq < 2) { + err = ENOTSUP; + VLOG_WARN("%s: rx-steering: not enough available rx queues", + netdev_get_name(&dev->up)); + goto out; + } + + if (dev->requested_rx_steer_flags & DPDK_RX_STEER_LACP) { + const struct rte_flow_item items[] = { + { + .type = RTE_FLOW_ITEM_TYPE_ETH, + .spec = &(const struct rte_flow_item_eth){ + .type = htons(ETH_TYPE_LACP), + }, + .mask = &(const struct rte_flow_item_eth){ + .type = htons(0xffff), + }, + }, + { .type = RTE_FLOW_ITEM_TYPE_END }, + }; + err = dpdk_rx_steer_add_flow(dev, items, "lacp"); + if (err) { + goto out; + } + } + + if (dev->rx_steer_flows_num) { + /* Reconfigure RSS reta in all but the rx steering queue. 
*/ + err = dpdk_rx_steer_rss_configure(dev, dev->up.n_rxq - 1); + if (err) { + goto out; + } + if (dev->up.n_rxq == 2) { + VLOG_INFO("%s: rx-steering: redirected other traffic to " + "rx queue 0", netdev_get_name(&dev->up)); + } else { + VLOG_INFO("%s: rx-steering: applied rss on rx queues 0-%u", + netdev_get_name(&dev->up), dev->up.n_rxq - 2); + } + } + +out: + return err; +} + +static void +dpdk_rx_steer_unconfigure(struct netdev_dpdk *dev) +{ + struct rte_flow_error error; + + if (!dev->rx_steer_flows_num) { + return; + } + + VLOG_DBG("%s: rx-steering: reset flows", netdev_get_name(&dev->up)); + + for (int i = 0; i < dev->rx_steer_flows_num; i++) { + set_error(&error, RTE_FLOW_ERROR_TYPE_NONE); + if (rte_flow_destroy(dev->port_id, dev->rx_steer_flows[i], &error)) { + VLOG_WARN("%s: rx-steering: failed to destroy flow: %s", + netdev_get_name(&dev->up), + error.message ? error.message : ""); + } + } + free(dev->rx_steer_flows); + dev->rx_steer_flows_num = 0; + dev->rx_steer_flows = NULL; + /* + * Most DPDK drivers seem to reset their RSS redirection table in + * rte_eth_dev_configure() or rte_eth_dev_start(), both of which are + * called in dpdk_eth_dev_init(). No need to explicitly reset it. 
+ */ +} + static int netdev_dpdk_reconfigure(struct netdev *netdev) { struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); + bool try_rx_steer; int err = 0; ovs_mutex_lock(&dev->mutex); + try_rx_steer = dev->requested_rx_steer_flags != 0; + dev->requested_n_rxq = dev->user_n_rxq; + if (try_rx_steer) { + dev->requested_n_rxq += 1; + } + if (netdev->n_txq == dev->requested_n_txq && netdev->n_rxq == dev->requested_n_rxq + && dev->rx_steer_flags == dev->requested_rx_steer_flags && dev->mtu == dev->requested_mtu && dev->lsc_interrupt_mode == dev->requested_lsc_interrupt_mode && dev->rxq_size == dev->requested_rxq_size @@ -5398,6 +5679,9 @@ netdev_dpdk_reconfigure(struct netdev *netdev) goto out; } +retry: + dpdk_rx_steer_unconfigure(dev); + if (dev->reset_needed) { rte_eth_dev_reset(dev->port_id); if_notifier_manual_report(); @@ -5422,6 +5706,7 @@ netdev_dpdk_reconfigure(struct netdev *netdev) dev->txq_size = dev->requested_txq_size; rte_free(dev->tx_q); + dev->tx_q = NULL; if (!eth_addr_equals(dev->hwaddr, dev->requested_hwaddr)) { err = netdev_dpdk_set_etheraddr__(dev, dev->requested_hwaddr); @@ -5445,6 +5730,23 @@ netdev_dpdk_reconfigure(struct netdev *netdev) */ dev->requested_hwaddr = dev->hwaddr; + if (try_rx_steer) { + err = dpdk_rx_steer_configure(dev); + if (err) { + /* No hw support, disable & recover gracefully. */ + try_rx_steer = false; + /* + * The extra queue must be explicitly removed here to ensure that + * it is unconfigured immediately. 
+ */ + dev->requested_n_rxq = dev->user_n_rxq; + goto retry; + } + } else { + VLOG_INFO("%s: rx-steering: default rss", netdev_get_name(&dev->up)); + } + dev->rx_steer_flags = dev->requested_rx_steer_flags; + dev->tx_q = netdev_dpdk_alloc_txq(netdev->n_txq); if (!dev->tx_q) { err = ENOMEM; @@ -5681,6 +5983,13 @@ netdev_dpdk_flow_api_supported(struct netdev *netdev) dev = netdev_dpdk_cast(netdev); ovs_mutex_lock(&dev->mutex); if (dev->type == DPDK_DEV_ETH) { + if (dev->requested_rx_steer_flags) { + VLOG_WARN("%s: rx-steering is mutually exclusive with hw-offload," + " falling back to default rss mode", + netdev_get_name(netdev)); + dev->requested_rx_steer_flags = 0; + netdev_request_reconfigure(netdev); + } /* TODO: Check if we able to offload some minimal flow. */ ret = true; } diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 59c404bbbc7..01408e90a40 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -3517,6 +3517,50 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \

      This option may only be used with dpdk VF representors.

      + +

      + Configure hardware Rx queue steering policy. +

      +

      + This option takes one of the following values: +

      +
      +
      rss
      +
      + Distribution of ingress packets in all Rx queues according to the + RSS algorithm. This is the default behaviour. +
      +
      rss+lacp
      +
      + Distribution of ingress packets according to the RSS algorithm on + all but the last Rx queue. An extra Rx queue is allocated for LACP + packets. +
      +
      +

      + If the user has already configured multiple Rx queues (options:n_rxq) on the port, an additional one will + be allocated for the specified protocols. Even if the hardware cannot + satisfy the requested number of Rx queues, the last Rx + queue will be used. If only one Rx queue is available or if the + hardware does not support the rte_flow matchers/actions required to + redirect the selected protocols, custom rx-steering will + fall back to default rss mode. +

      +

      + This feature is mutually exclusive with + hw-offload + as it may conflict with the offloaded flows. If both are enabled, + rx-steering will fall back to default rss + mode. +

      +

      + This option is only applicable to interfaces with type + dpdk. +

      +
      + From 8e073791d4a64ae61c040cbfacedc4124dbbf1e5 Mon Sep 17 00:00:00 2001 From: Sayali Naval Date: Wed, 5 Jul 2023 20:02:45 +0000 Subject: [PATCH 298/833] bridge: Fix unexpected values for IPFIX enable-input/output-sampling. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As per the Open vSwitch Manual ovs-vsctl(8) the Bridge IPFIX parameters can be passed as follows: ovs-vsctl -- set Bridge br0 ipfix=@i \ -- --id=@i create IPFIX targets=\"192.168.0.34:4739\" \ obs_domain_id=123 obs_point_id=456 cache_active_timeout=60 \ cache_max_flows=13 \ other_config:enable-input-sampling=false \ other_config:enable-output-sampling=false where the default values are: enable_input_sampling: true enable_output_sampling: true But in the existing code these 2 parameters take up unexpected values in some scenarios: be_opts.enable_input_sampling = !smap_get_bool(&be_cfg->other_config, "enable-input-sampling", false); be_opts.enable_output_sampling = !smap_get_bool(&be_cfg->other_config, "enable-output-sampling", false); Here, the function smap_get_bool is being used with a negation. This returns expected values for the default case (since the above code will negate “false” we get from smap_get bool function and return the value “true”) but unexpected values for the case where the sampling value is passed through the CLI. For example, if we pass "true" for other_config:enable-input-sampling in the CLI, the above code will negate the “true” value we get from the smap_bool function and return the value “false”. Same would be the case for enable_output_sampling. 
Acked-by: Adrian Moreno Signed-off-by: Sayali Naval Signed-off-by: Ilya Maximets --- vswitchd/bridge.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index f5dc59ad06e..b972d55d0b3 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -1560,11 +1560,11 @@ bridge_configure_ipfix(struct bridge *br) be_opts.enable_tunnel_sampling = smap_get_bool(&be_cfg->other_config, "enable-tunnel-sampling", true); - be_opts.enable_input_sampling = !smap_get_bool(&be_cfg->other_config, - "enable-input-sampling", false); + be_opts.enable_input_sampling = smap_get_bool(&be_cfg->other_config, + "enable-input-sampling", true); - be_opts.enable_output_sampling = !smap_get_bool(&be_cfg->other_config, - "enable-output-sampling", false); + be_opts.enable_output_sampling = smap_get_bool(&be_cfg->other_config, + "enable-output-sampling", true); virtual_obs_id = smap_get(&be_cfg->other_config, "virtual_obs_id"); be_opts.virtual_obs_id = nullable_xstrdup(virtual_obs_id); From 00782baac054039717e06cfc9c64a1c79921cafa Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 12 Jul 2023 00:14:54 +0200 Subject: [PATCH 299/833] AUTHORS: Add Sayali Naval. Signed-off-by: Ilya Maximets --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 7175766482f..c1b32b03858 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -396,6 +396,7 @@ Sanjay Sane Saurabh Mohan saurabh@cplanenetworks.com Saurabh Shah Saurabh Shrivastava saurabh.shrivastava@nuagenetworks.net +Sayali Naval sanaval@cisco.com Scott Cheloha scottcheloha@gmail.com Scott Lowe scott.lowe@scottlowe.org Scott Mann sdmnix@gmail.com From e71f1a2da130a09c798c57ac9681b1eb3f0be050 Mon Sep 17 00:00:00 2001 From: James Raphael Tiovalen Date: Wed, 14 Jun 2023 02:34:39 +0800 Subject: [PATCH 300/833] ovsdb: Assert and check return values of `ovsdb_table_schema_get_column`. 
This commit adds a few null pointer assertions and checks to some return values of `ovsdb_table_schema_get_column`. If a null pointer is encountered in these blocks, either the assertion will fail or the control flow will now be redirected to alternative paths which will output the appropriate error messages. A few ovsdb-rbac and ovsdb-server tests are also updated to verify the expected warning logs by adding said logs to the ALLOWLIST of the OVSDB_SERVER_SHUTDOWN statements. Reviewed-by: Simon Horman Acked-by: Eelco Chaudron Signed-off-by: James Raphael Tiovalen Signed-off-by: Ilya Maximets --- ovsdb/condition.c | 5 ++++- ovsdb/ovsdb-client.c | 7 +++++-- ovsdb/ovsdb-util.c | 6 ++++++ tests/ovsdb-rbac.at | 4 +++- tests/ovsdb-server.at | 8 ++++++-- 5 files changed, 24 insertions(+), 6 deletions(-) diff --git a/ovsdb/condition.c b/ovsdb/condition.c index 09c89b2a02c..5a3eb4e8a3f 100644 --- a/ovsdb/condition.c +++ b/ovsdb/condition.c @@ -47,7 +47,10 @@ ovsdb_clause_from_json(const struct ovsdb_table_schema *ts, /* Column and arg fields are not being used with boolean functions. 
* Use dummy values */ - clause->column = ovsdb_table_schema_get_column(ts, "_uuid"); + const struct ovsdb_column *uuid_column = + ovsdb_table_schema_get_column(ts, "_uuid"); + ovs_assert(uuid_column); + clause->column = uuid_column; clause->index = clause->column->index; ovsdb_datum_init_default(&clause->arg, &clause->column->type); return NULL; diff --git a/ovsdb/ovsdb-client.c b/ovsdb/ovsdb-client.c index bae2c5f0414..46484630d2d 100644 --- a/ovsdb/ovsdb-client.c +++ b/ovsdb/ovsdb-client.c @@ -1232,8 +1232,11 @@ parse_monitor_columns(char *arg, const char *server, const char *database, } free(nodes); - add_column(server, ovsdb_table_schema_get_column(table, "_version"), - columns, columns_json); + const struct ovsdb_column *version_column = + ovsdb_table_schema_get_column(table, "_version"); + + ovs_assert(version_column); + add_column(server, version_column, columns, columns_json); } if (!initial || !insert || !delete || !modify) { diff --git a/ovsdb/ovsdb-util.c b/ovsdb/ovsdb-util.c index 303191dc87d..ec453789010 100644 --- a/ovsdb/ovsdb-util.c +++ b/ovsdb/ovsdb-util.c @@ -291,9 +291,15 @@ ovsdb_util_write_string_string_column(struct ovsdb_row *row, size_t i; column = ovsdb_table_schema_get_column(row->table->schema, column_name); + if (!column) { + VLOG_WARN("No %s column present in the %s table", + column_name, row->table->schema->name); + goto unwind; + } datum = ovsdb_util_get_datum(row, column_name, OVSDB_TYPE_STRING, OVSDB_TYPE_STRING, UINT_MAX); if (!datum) { +unwind: for (i = 0; i < n; i++) { free(keys[i]); free(values[i]); diff --git a/tests/ovsdb-rbac.at b/tests/ovsdb-rbac.at index 7de3711fbd0..3172e4bf558 100644 --- a/tests/ovsdb-rbac.at +++ b/tests/ovsdb-rbac.at @@ -371,5 +371,7 @@ cat stdout >> output AT_CHECK([uuidfilt stdout], [0], [[[{"count":1}]] ], [ignore]) -OVSDB_SERVER_SHUTDOWN +OVSDB_SERVER_SHUTDOWN([" + /No status column present in the Connection table/d +"]) AT_CLEANUP diff --git a/tests/ovsdb-server.at b/tests/ovsdb-server.at index 
b53ab8f5227..8ccec80bcbd 100644 --- a/tests/ovsdb-server.at +++ b/tests/ovsdb-server.at @@ -428,7 +428,9 @@ AT_CHECK( [[[{"rows":[{"managers":"punix:socket1"}]},{"rows":[{"is_connected":false,"target":"punix:socket2"}]}] ]], [ignore]) -OVSDB_SERVER_SHUTDOWN +OVSDB_SERVER_SHUTDOWN([" + /No status column present in the Manager table/d +"]) AT_CLEANUP AT_SETUP([ovsdb-server/add-remote and remove-remote]) @@ -2110,7 +2112,9 @@ AT_CHECK([ovsdb-client transact tcp:127.0.0.1:$TCP_PORT \ cat stdout >> output AT_CHECK([uuidfilt output], [0], [[[{"details":"insert operation not allowed when database server is in read only mode","error":"not allowed"}]] ], [ignore]) -OVSDB_SERVER_SHUTDOWN +OVSDB_SERVER_SHUTDOWN([" + /No status column present in the Manager table/d +"]) AT_CLEANUP AT_SETUP([ovsdb-server replication with schema mismatch]) From e769387b42bab2db138b9b7182ee49d3f335ae4d Mon Sep 17 00:00:00 2001 From: James Raphael Tiovalen Date: Wed, 14 Jun 2023 02:34:40 +0800 Subject: [PATCH 301/833] file, monitor: Add null pointer assertions for old and new ovsdb_rows. This commit adds non-null pointer assertions in some code that performs some decisions based on old and new input ovsdb_rows. Reviewed-by: Simon Horman Acked-by: Eelco Chaudron Signed-off-by: James Raphael Tiovalen Signed-off-by: Ilya Maximets --- ovsdb/file.c | 3 +++ ovsdb/monitor.c | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ovsdb/file.c b/ovsdb/file.c index 2d887e53ebc..400b34794bb 100644 --- a/ovsdb/file.c +++ b/ovsdb/file.c @@ -522,9 +522,12 @@ ovsdb_file_txn_add_row(struct ovsdb_file_txn *ftxn, } if (row) { + ovs_assert(new || old); struct ovsdb_table *table = new ? new->table : old->table; char uuid[UUID_LEN + 1]; + ovs_assert(table); + if (table != ftxn->table) { /* Create JSON object for transaction overall. 
*/ if (!ftxn->json) { diff --git a/ovsdb/monitor.c b/ovsdb/monitor.c index 4afaa89f48e..01091fabe78 100644 --- a/ovsdb/monitor.c +++ b/ovsdb/monitor.c @@ -1372,8 +1372,11 @@ ovsdb_monitor_changes_update(const struct ovsdb_row *old, const struct ovsdb_monitor_table *mt, struct ovsdb_monitor_change_set_for_table *mcst) { + ovs_assert(new || old); const struct uuid *uuid = ovsdb_row_get_uuid(new ? new : old); - struct ovsdb_monitor_row *change; + struct ovsdb_monitor_row *change = NULL; + + ovs_assert(uuid); change = ovsdb_monitor_changes_row_find(mcst, uuid); if (!change) { From b2d45921a674dd0227225525966bb04f5abb3e55 Mon Sep 17 00:00:00 2001 From: James Raphael Tiovalen Date: Wed, 14 Jun 2023 02:34:41 +0800 Subject: [PATCH 302/833] ovs-vsctl: Fix crash when routing is enabled. In the case where routing is enabled, the bridge member of the `vsctl_port` structs is not populated. This can cause a crash if we attempt to access it. This patch fixes the crash by checking if the bridge member is valid before attempting to access it. In the `check_conflicts` function, we print both the port name and the bridge name if routing is disabled and we only print the port name if routing is enabled. 
Reviewed-by: Simon Horman Acked-by: Eelco Chaudron Signed-off-by: James Raphael Tiovalen Signed-off-by: Ilya Maximets --- utilities/ovs-vsctl.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/utilities/ovs-vsctl.c b/utilities/ovs-vsctl.c index 2f5ac1a2622..62b51230296 100644 --- a/utilities/ovs-vsctl.c +++ b/utilities/ovs-vsctl.c @@ -889,14 +889,23 @@ check_conflicts(struct vsctl_context *vsctl_ctx, const char *name, port = shash_find_data(&vsctl_ctx->ports, name); if (port) { - ctl_fatal("%s because a port named %s already exists on " - "bridge %s", msg, name, port->bridge->name); + if (port->bridge) { + ctl_fatal("%s because a port named %s already exists on " + "bridge %s", msg, name, port->bridge->name); + } else { + ctl_fatal("%s because a port named %s already exists", msg, name); + } } iface = shash_find_data(&vsctl_ctx->ifaces, name); if (iface) { - ctl_fatal("%s because an interface named %s already exists " - "on bridge %s", msg, name, iface->port->bridge->name); + if (iface->port->bridge) { + ctl_fatal("%s because an interface named %s already exists " + "on bridge %s", msg, name, iface->port->bridge->name); + } else { + ctl_fatal("%s because an interface named %s already exists", msg, + name); + } } free(msg); @@ -936,7 +945,7 @@ find_port(struct vsctl_context *vsctl_ctx, const char *name, bool must_exist) ovs_assert(vsctl_ctx->cache_valid); port = shash_find_data(&vsctl_ctx->ports, name); - if (port && !strcmp(name, port->bridge->name)) { + if (port && port->bridge && !strcmp(name, port->bridge->name)) { port = NULL; } if (must_exist && !port) { @@ -954,7 +963,8 @@ find_iface(struct vsctl_context *vsctl_ctx, const char *name, bool must_exist) ovs_assert(vsctl_ctx->cache_valid); iface = shash_find_data(&vsctl_ctx->ifaces, name); - if (iface && !strcmp(name, iface->port->bridge->name)) { + if (iface && iface->port->bridge && + !strcmp(name, iface->port->bridge->name)) { iface = NULL; } if (must_exist && !iface) { 
From f770b8c1336957115f50f70a64cb2ac98b963d1b Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 12 Jul 2023 00:31:40 +0200 Subject: [PATCH 303/833] AUTHORS: Add James Raphael Tiovalen. Signed-off-by: Ilya Maximets --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index c1b32b03858..10e5c276fcd 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -211,6 +211,7 @@ Jakub Libosvar libosvar@redhat.com Jakub Sitnicki jsitnicki@gmail.com James P. roampune@gmail.com James Page james.page@ubuntu.com +James Raphael Tiovalen jamestiotio@gmail.com Jamie Lennox jamielennox@gmail.com Jan Scheurich jan.scheurich@ericsson.com Jan Vansteenkiste jan@vstone.eu From d25c6bd8df37e50cac9aa1c18365bbc91cde3811 Mon Sep 17 00:00:00 2001 From: Chandan Somani Date: Fri, 7 Jul 2023 16:07:56 -0400 Subject: [PATCH 304/833] checkpatch: Reorganize flagged words using a list. Single out flagged words and allow for more useful details, like spelling suggestions. Signed-off-by: Chandan Somani Signed-off-by: Eelco Chaudron --- utilities/checkpatch.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/utilities/checkpatch.py b/utilities/checkpatch.py index 64f0efeb474..acf9a0102f7 100755 --- a/utilities/checkpatch.py +++ b/utilities/checkpatch.py @@ -411,6 +411,8 @@ def check_spelling(line, comment): words = filter_comments(line, True) if comment else line words = words.replace(':', ' ').split(' ') + flagged_words = [] + for word in words: skip = False strword = re.subn(r'\W+', '', word)[0].replace(',', '') @@ -435,9 +437,13 @@ def check_spelling(line, comment): skip = True if not skip: - print_warning("Check for spelling mistakes (e.g. 
\"%s\")" - % strword) - return True + flagged_words.append(strword) + + if len(flagged_words) > 0: + for mistake in flagged_words: + print_warning("Possible misspelled word: \"%s\"" % mistake) + + return True return False From 9a50170a805ab9f70a78d1b1ce5c56ec44b64886 Mon Sep 17 00:00:00 2001 From: Chandan Somani Date: Fri, 7 Jul 2023 16:07:57 -0400 Subject: [PATCH 305/833] checkpatch: Add suggestions to the spell checker. This will be useful for correcting possible spelling mistakes with ease. Suggestions limited to 3 at first, but can be made configurable in the future. Acked-by: Aaron Conole Signed-off-by: Chandan Somani Signed-off-by: Eelco Chaudron --- utilities/checkpatch.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utilities/checkpatch.py b/utilities/checkpatch.py index acf9a0102f7..e5d5029f267 100755 --- a/utilities/checkpatch.py +++ b/utilities/checkpatch.py @@ -412,6 +412,7 @@ def check_spelling(line, comment): words = words.replace(':', ' ').split(' ') flagged_words = [] + num_suggestions = 3 for word in words: skip = False @@ -442,6 +443,8 @@ def check_spelling(line, comment): if len(flagged_words) > 0: for mistake in flagged_words: print_warning("Possible misspelled word: \"%s\"" % mistake) + print("Did you mean: ", + spell_check_dict.suggest(mistake)[:num_suggestions]) return True From 799f697e51eca178262dc801ef3bf6f5509b180b Mon Sep 17 00:00:00 2001 From: Chandan Somani Date: Fri, 7 Jul 2023 16:07:58 -0400 Subject: [PATCH 306/833] checkpatch: Print subject field if misspelled or missing. This narrows down spelling errors that are in the commit subject. It also provides a subject if the subject line is missing. The provisional subject is the name of the patch file, which should provide some context about the patch. 
Acked-by: Aaron Conole Signed-off-by: Chandan Somani Signed-off-by: Eelco Chaudron --- utilities/checkpatch.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/utilities/checkpatch.py b/utilities/checkpatch.py index e5d5029f267..12bd153ee05 100755 --- a/utilities/checkpatch.py +++ b/utilities/checkpatch.py @@ -1024,6 +1024,19 @@ def ovs_checkpatch_file(filename): result = ovs_checkpatch_parse(part.get_payload(decode=False), filename, mail.get('Author', mail['From']), mail['Commit']) + if spellcheck: + if not mail['Subject'] or not mail['Subject'].strip(): + if mail['Subject']: + mail.replace_header('Subject', sys.argv[-1]) + else: + mail.add_header('Subject', sys.argv[-1]) + + print("Subject missing! Your provisional subject is", + mail['Subject']) + + if check_spelling(mail['Subject'], False): + print("Subject: %s" % mail['Subject']) + ovs_checkpatch_print_result() return result From f3e9d30041c581c4b758e402db65d44dc1c6b23e Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Wed, 12 Jul 2023 12:09:50 +0200 Subject: [PATCH 307/833] AUTHORS: Add Chandan Somani. Signed-off-by: Eelco Chaudron --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 10e5c276fcd..9657aa710e3 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -98,6 +98,7 @@ Bruce Davie bdavie@vmware.com Bryan Phillippe bp@toroki.com Carlo Andreotti c.andreotti@m3s.it Casey Barker crbarker@google.com +Chandan Somani csomani@redhat.com Chandra Sekhar Vejendla csvejend@us.ibm.com Chris Wright chrisw@sous-sol.org Christoph Jaeger cj@linux.com From 4829506b2a21ca42628ea7f73d8c4cf82cb11f9f Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Wed, 12 Jul 2023 09:37:07 -0400 Subject: [PATCH 308/833] ofproto-dpif-xlate: Reduce stack usage in recursive xlate functions. Several xlate actions used in recursive translation currently store a large amount of information on the stack. 
This can result in handler threads quickly running out of stack space before xlate_resubmit_resource_check() is able to terminate translation. This patch reduces stack usage by over 3kb from several translation actions. This patch also moves some tracing code from do_xlate_actions into its own function. Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2104779 Reviewed-by: Simon Horman Signed-off-by: Mike Pattrick Signed-off-by: Eelco Chaudron --- ofproto/ofproto-dpif-xlate.c | 259 ++++++++++++++++++++++------------- 1 file changed, 164 insertions(+), 95 deletions(-) diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 29f4daa6357..4928ea99cfc 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -501,6 +501,84 @@ ctx_cancel_freeze(struct xlate_ctx *ctx) static void finish_freezing(struct xlate_ctx *ctx); +/* These functions and structure are used to save stack space in actions that + * need to retain a large amount of xlate_ctx state. */ +struct xretained_state { + union mf_subvalue new_stack[1024 / sizeof(union mf_subvalue)]; + uint64_t actset_stub[1024 / 8]; + struct ofpbuf old_stack; + struct ofpbuf old_action_set; + struct flow old_flow; + struct flow old_base; + struct flow_tnl flow_tnl_mask; +}; + +/* The return of this function must be freed by + * xretain_state_restore_and_free(). 
*/ +static struct xretained_state * +xretain_state_save(struct xlate_ctx *ctx) +{ + struct xretained_state *retained = xmalloc(sizeof *retained); + + retained->old_flow = ctx->xin->flow; + retained->old_stack = ctx->stack; + retained->old_action_set = ctx->action_set; + ofpbuf_use_stub(&ctx->stack, retained->new_stack, + sizeof retained->new_stack); + ofpbuf_use_stub(&ctx->action_set, retained->actset_stub, + sizeof retained->actset_stub); + + return retained; +} + +static void +xretain_tunnel_mask_save(const struct xlate_ctx *ctx, + struct xretained_state *retained) +{ + retained->flow_tnl_mask = ctx->wc->masks.tunnel; +} + +static void +xretain_base_flow_save(const struct xlate_ctx *ctx, + struct xretained_state *retained) +{ + retained->old_base = ctx->base_flow; +} + +static void +xretain_base_flow_restore(struct xlate_ctx *ctx, + const struct xretained_state *retained) +{ + ctx->base_flow = retained->old_base; +} + +static void +xretain_flow_restore(struct xlate_ctx *ctx, + const struct xretained_state *retained) +{ + ctx->xin->flow = retained->old_flow; +} + +static void +xretain_tunnel_mask_restore(struct xlate_ctx *ctx, + const struct xretained_state *retained) +{ + ctx->wc->masks.tunnel = retained->flow_tnl_mask; +} + +static void +xretain_state_restore_and_free(struct xlate_ctx *ctx, + struct xretained_state *retained) +{ + ctx->xin->flow = retained->old_flow; + ofpbuf_uninit(&ctx->action_set); + ctx->action_set = retained->old_action_set; + ofpbuf_uninit(&ctx->stack); + ctx->stack = retained->old_stack; + + free(retained); +} + /* A controller may use OFPP_NONE as the ingress port to indicate that * it did not arrive on a "real" port. 
'ofpp_none_bundle' exists for * when an input bundle is needed for validation (e.g., mirroring or @@ -3915,20 +3993,17 @@ static void patch_port_output(struct xlate_ctx *ctx, const struct xport *in_dev, struct xport *out_dev, bool is_last_action) { + bool old_was_mpls = ctx->was_mpls; struct flow *flow = &ctx->xin->flow; - struct flow old_flow = ctx->xin->flow; - struct flow_tnl old_flow_tnl_wc = ctx->wc->masks.tunnel; bool old_conntrack = ctx->conntracked; - bool old_was_mpls = ctx->was_mpls; - ovs_version_t old_version = ctx->xin->tables_version; - struct ofpbuf old_stack = ctx->stack; - uint8_t new_stack[1024]; - struct ofpbuf old_action_set = ctx->action_set; + struct xretained_state *retained_state; struct ovs_list *old_trace = ctx->xin->trace; - uint64_t actset_stub[1024 / 8]; + ovs_version_t old_version = ctx->xin->tables_version; + + retained_state = xretain_state_save(ctx); + + xretain_tunnel_mask_save(ctx, retained_state); - ofpbuf_use_stub(&ctx->stack, new_stack, sizeof new_stack); - ofpbuf_use_stub(&ctx->action_set, actset_stub, sizeof actset_stub); flow->in_port.ofp_port = out_dev->ofp_port; flow->metadata = htonll(0); memset(&flow->tunnel, 0, sizeof flow->tunnel); @@ -3967,14 +4042,15 @@ patch_port_output(struct xlate_ctx *ctx, const struct xport *in_dev, } else { /* Forwarding is disabled by STP and RSTP. Let OFPP_NORMAL and * the learning action look at the packet, then drop it. */ - struct flow old_base_flow = ctx->base_flow; size_t old_size = ctx->odp_actions->size; + + xretain_base_flow_save(ctx, retained_state); mirror_mask_t old_mirrors2 = ctx->mirrors; xlate_table_action(ctx, flow->in_port.ofp_port, 0, true, true, false, is_last_action, clone_xlate_actions); ctx->mirrors = old_mirrors2; - ctx->base_flow = old_base_flow; + xretain_base_flow_restore(ctx, retained_state); ctx->odp_actions->size = old_size; /* Undo changes that may have been done for freezing. 
*/ @@ -3986,18 +4062,15 @@ patch_port_output(struct xlate_ctx *ctx, const struct xport *in_dev, if (independent_mirrors) { ctx->mirrors = old_mirrors; } - ctx->xin->flow = old_flow; ctx->xbridge = in_dev->xbridge; - ofpbuf_uninit(&ctx->action_set); - ctx->action_set = old_action_set; - ofpbuf_uninit(&ctx->stack); - ctx->stack = old_stack; /* Restore calling bridge's lookup version. */ ctx->xin->tables_version = old_version; - /* Restore to calling bridge tunneling information */ - ctx->wc->masks.tunnel = old_flow_tnl_wc; + /* Restore to calling bridge tunneling information; the ctx flow, actions, + * and stack. And free the retained state. */ + xretain_tunnel_mask_restore(ctx, retained_state); + xretain_state_restore_and_free(ctx, retained_state); /* The out bridge popping MPLS should have no effect on the original * bridge. */ @@ -4247,7 +4320,7 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port, const struct xport *xport = get_ofp_port(ctx->xbridge, ofp_port); struct flow_wildcards *wc = ctx->wc; struct flow *flow = &ctx->xin->flow; - struct flow_tnl flow_tnl; + struct flow_tnl *flow_tnl = NULL; union flow_vlan_hdr flow_vlans[FLOW_MAX_VLAN_HEADERS]; uint8_t flow_nw_tos; odp_port_t out_port, odp_port, odp_tnl_port; @@ -4261,7 +4334,6 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port, /* If 'struct flow' gets additional metadata, we'll need to zero it out * before traversing a patch port. */ BUILD_ASSERT_DECL(FLOW_WC_SEQ == 42); - memset(&flow_tnl, 0, sizeof flow_tnl); if (!check_output_prerequisites(ctx, xport, flow, check_stp)) { return; @@ -4305,7 +4377,7 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port, * the Logical (tunnel) Port are not visible for any further * matches, while explicit set actions on tunnel metadata are. 
*/ - flow_tnl = flow->tunnel; + flow_tnl = xmemdup(&flow->tunnel, sizeof *flow_tnl); odp_port = tnl_port_send(xport->ofport, flow, ctx->wc); if (odp_port == ODPP_NONE) { xlate_report(ctx, OFT_WARN, "Tunneling decided against output"); @@ -4336,7 +4408,7 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port, tnl_type = tnl_port_get_type(xport->ofport); commit_odp_tunnel_action(flow, &ctx->base_flow, ctx->odp_actions, tnl_type); - flow->tunnel = flow_tnl; /* Restore tunnel metadata */ + flow->tunnel = *flow_tnl; /* Restore tunnel metadata. */ } } else { odp_port = xport->odp_port; @@ -4380,7 +4452,8 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port, /* Output to native tunnel port. */ native_tunnel_output(ctx, xport, flow, odp_port, truncate, is_last_action); - flow->tunnel = flow_tnl; /* Restore tunnel metadata */ + ovs_assert(flow_tnl); + flow->tunnel = *flow_tnl; /* Restore tunnel metadata. */ } else if (terminate_native_tunnel(ctx, xport, flow, wc, &odp_tnl_port)) { @@ -4423,7 +4496,7 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port, xport->xbundle)); } - out: +out: /* Restore flow */ memcpy(flow->vlans, flow_vlans, sizeof flow->vlans); flow->nw_tos = flow_nw_tos; @@ -4431,6 +4504,7 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port, flow->dl_src = flow_dl_src; flow->packet_type = flow_packet_type; flow->dl_type = flow_dl_type; + free(flow_tnl); } static void @@ -5409,15 +5483,15 @@ xlate_output_reg_action(struct xlate_ctx *ctx, { uint64_t port = mf_get_subfield(&or->src, &ctx->xin->flow); if (port <= UINT16_MAX) { - xlate_report(ctx, OFT_DETAIL, "output port is %"PRIu64, port); - - union mf_subvalue value; + union mf_subvalue *value = xmalloc(sizeof *value); - memset(&value, 0xff, sizeof value); - mf_write_subfield_flow(&or->src, &value, &ctx->wc->masks); + xlate_report(ctx, OFT_DETAIL, "output port is %"PRIu64, port); + memset(value, 0xff, sizeof *value); + 
mf_write_subfield_flow(&or->src, value, &ctx->wc->masks); xlate_output_action(ctx, u16_to_ofp(port), or->max_len, false, is_last_action, false, group_bucket_action); + free(value); } else { xlate_report(ctx, OFT_WARN, "output port %"PRIu64" is out of range", port); @@ -5758,13 +5832,15 @@ xlate_sample_action(struct xlate_ctx *ctx, struct flow *flow = &ctx->xin->flow; tnl_port_send(xport->ofport, flow, ctx->wc); if (!ovs_native_tunneling_is_on(ctx->xbridge->ofproto)) { - struct flow_tnl flow_tnl = flow->tunnel; + struct flow_tnl *flow_tnl; const char *tnl_type; + flow_tnl = xmemdup(&flow->tunnel, sizeof *flow_tnl); tnl_type = tnl_port_get_type(xport->ofport); commit_odp_tunnel_action(flow, &ctx->base_flow, ctx->odp_actions, tnl_type); - flow->tunnel = flow_tnl; + flow->tunnel = *flow_tnl; + free(flow_tnl); } } else { xlate_report_error(ctx, @@ -5874,21 +5950,12 @@ clone_xlate_actions(const struct ofpact *actions, size_t actions_len, struct xlate_ctx *ctx, bool is_last_action, bool group_bucket_action OVS_UNUSED) { - struct ofpbuf old_stack = ctx->stack; - union mf_subvalue new_stack[1024 / sizeof(union mf_subvalue)]; - ofpbuf_use_stub(&ctx->stack, new_stack, sizeof new_stack); - ofpbuf_put(&ctx->stack, old_stack.data, old_stack.size); - - struct ofpbuf old_action_set = ctx->action_set; - uint64_t actset_stub[1024 / 8]; - ofpbuf_use_stub(&ctx->action_set, actset_stub, sizeof actset_stub); - ofpbuf_put(&ctx->action_set, old_action_set.data, old_action_set.size); - + struct xretained_state *retained_state; size_t offset, ac_offset; - struct flow old_flow = ctx->xin->flow; + + retained_state = xretain_state_save(ctx); if (reversible_actions(actions, actions_len) || is_last_action) { - old_flow = ctx->xin->flow; do_xlate_actions(actions, actions_len, ctx, is_last_action, false); if (!ctx->freezing) { xlate_action_set(ctx); @@ -5903,7 +5970,8 @@ clone_xlate_actions(const struct ofpact *actions, size_t actions_len, * avoid emitting those actions twice. 
Once inside * the clone, another time for the action after clone. */ xlate_commit_actions(ctx); - struct flow old_base = ctx->base_flow; + xretain_base_flow_save(ctx, retained_state); + bool old_was_mpls = ctx->was_mpls; bool old_conntracked = ctx->conntracked; @@ -5960,14 +6028,10 @@ clone_xlate_actions(const struct ofpact *actions, size_t actions_len, ctx->was_mpls = old_was_mpls; /* Restore the 'base_flow' for the next action. */ - ctx->base_flow = old_base; + xretain_base_flow_restore(ctx, retained_state); xlate_done: - ofpbuf_uninit(&ctx->action_set); - ctx->action_set = old_action_set; - ofpbuf_uninit(&ctx->stack); - ctx->stack = old_stack; - ctx->xin->flow = old_flow; + xretain_state_restore_and_free(ctx, retained_state); } static void @@ -6343,8 +6407,8 @@ compose_conntrack_action(struct xlate_ctx *ctx, struct ofpact_conntrack *ofc, { uint16_t zone; if (ofc->zone_src.field) { - union mf_subvalue value; - memset(&value, 0xff, sizeof(value)); + union mf_subvalue *value = xmalloc(sizeof *value); + memset(value, 0xff, sizeof *value); zone = mf_get_subfield(&ofc->zone_src, &ctx->xin->flow); if (ctx->xin->frozen_state) { @@ -6354,12 +6418,13 @@ compose_conntrack_action(struct xlate_ctx *ctx, struct ofpact_conntrack *ofc, * which will invalidate the megaflow with old the recirc_id. 
*/ if (!mf_is_frozen_metadata(ofc->zone_src.field)) { - mf_write_subfield_flow(&ofc->zone_src, &value, + mf_write_subfield_flow(&ofc->zone_src, value, &ctx->wc->masks); } } else { - mf_write_subfield_flow(&ofc->zone_src, &value, &ctx->wc->masks); + mf_write_subfield_flow(&ofc->zone_src, value, &ctx->wc->masks); } + free(value); } else { zone = ofc->zone_imm; } @@ -6449,16 +6514,16 @@ xlate_check_pkt_larger(struct xlate_ctx *ctx, const struct ofpact *remaining_acts, size_t remaining_acts_len) { - union mf_subvalue value; - memset(&value, 0, sizeof value); + union mf_subvalue *value = xmalloc(sizeof *value); + memset(value, 0, sizeof *value); if (!ctx->xbridge->support.check_pkt_len) { uint8_t is_pkt_larger = 0; if (ctx->xin->packet) { is_pkt_larger = dp_packet_size(ctx->xin->packet) > check_pkt_larger->pkt_len; } - value.u8_val = is_pkt_larger; - mf_write_subfield_flow(&check_pkt_larger->dst, &value, + value->u8_val = is_pkt_larger; + mf_write_subfield_flow(&check_pkt_larger->dst, value, &ctx->xin->flow); /* If datapath doesn't support check_pkt_len action, then set the * SLOW_ACTION flag. If we don't set SLOW_ACTION, we @@ -6468,22 +6533,17 @@ xlate_check_pkt_larger(struct xlate_ctx *ctx, * the packet length. This results in wrong actions being applied. 
*/ ctx->xout->slow |= SLOW_ACTION; + free(value); return; } - struct ofpbuf old_stack = ctx->stack; - union mf_subvalue new_stack[1024 / sizeof(union mf_subvalue)]; - ofpbuf_use_stub(&ctx->stack, new_stack, sizeof new_stack); - ofpbuf_put(&ctx->stack, old_stack.data, old_stack.size); + struct xretained_state *retained_state; - struct ofpbuf old_action_set = ctx->action_set; - uint64_t actset_stub[1024 / 8]; - ofpbuf_use_stub(&ctx->action_set, actset_stub, sizeof actset_stub); - ofpbuf_put(&ctx->action_set, old_action_set.data, old_action_set.size); + retained_state = xretain_state_save(ctx); - struct flow old_flow = ctx->xin->flow; xlate_commit_actions(ctx); - struct flow old_base = ctx->base_flow; + xretain_base_flow_save(ctx, retained_state); + bool old_was_mpls = ctx->was_mpls; bool old_conntracked = ctx->conntracked; @@ -6493,8 +6553,8 @@ xlate_check_pkt_larger(struct xlate_ctx *ctx, check_pkt_larger->pkt_len); size_t offset_attr = nl_msg_start_nested( ctx->odp_actions, OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER); - value.u8_val = 1; - mf_write_subfield_flow(&check_pkt_larger->dst, &value, &ctx->xin->flow); + value->u8_val = 1; + mf_write_subfield_flow(&check_pkt_larger->dst, value, &ctx->xin->flow); do_xlate_actions(remaining_acts, remaining_acts_len, ctx, true, false); if (!ctx->freezing) { xlate_action_set(ctx); @@ -6504,10 +6564,10 @@ xlate_check_pkt_larger(struct xlate_ctx *ctx, } nl_msg_end_nested(ctx->odp_actions, offset_attr); - ctx->base_flow = old_base; + xretain_base_flow_restore(ctx, retained_state); + xretain_flow_restore(ctx, retained_state); ctx->was_mpls = old_was_mpls; ctx->conntracked = old_conntracked; - ctx->xin->flow = old_flow; /* If the flow translation for the IF_GREATER case requires freezing, * then ctx->exit would be true. 
Reset to false so that we can @@ -6518,8 +6578,8 @@ xlate_check_pkt_larger(struct xlate_ctx *ctx, offset_attr = nl_msg_start_nested( ctx->odp_actions, OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL); - value.u8_val = 0; - mf_write_subfield_flow(&check_pkt_larger->dst, &value, &ctx->xin->flow); + value->u8_val = 0; + mf_write_subfield_flow(&check_pkt_larger->dst, value, &ctx->xin->flow); do_xlate_actions(remaining_acts, remaining_acts_len, ctx, true, false); if (!ctx->freezing) { xlate_action_set(ctx); @@ -6530,15 +6590,12 @@ xlate_check_pkt_larger(struct xlate_ctx *ctx, nl_msg_end_nested(ctx->odp_actions, offset_attr); nl_msg_end_nested(ctx->odp_actions, offset); - ofpbuf_uninit(&ctx->action_set); - ctx->action_set = old_action_set; - ofpbuf_uninit(&ctx->stack); - ctx->stack = old_stack; - ctx->base_flow = old_base; ctx->was_mpls = old_was_mpls; ctx->conntracked = old_conntracked; - ctx->xin->flow = old_flow; ctx->exit = old_exit; + xretain_base_flow_restore(ctx, retained_state); + xretain_state_restore_and_free(ctx, retained_state); + free(value); } static void @@ -6989,6 +7046,31 @@ xlate_ofpact_unroll_xlate(struct xlate_ctx *ctx, "cookie=%#"PRIx64, a->rule_table_id, a->rule_cookie); } +static void +xlate_trace(struct xlate_ctx *ctx, const struct ofpact *a) +{ + struct ofputil_port_map *map; + + map = xmalloc(sizeof *map); + ofputil_port_map_init(map); + + if (ctx->xin->names) { + struct ofproto_dpif *ofprotop; + + ofprotop = ofproto_dpif_lookup_by_name(ctx->xbridge->name); + ofproto_append_ports_to_map(map, ofprotop->up.ports); + } + + struct ds s = DS_EMPTY_INITIALIZER; + struct ofpact_format_params fp = { .s = &s, .port_map = map }; + + ofpacts_format(a, OFPACT_ALIGN(a->len), &fp); + xlate_report(ctx, OFT_ACTION, "%s", ds_cstr(&s)); + ds_destroy(&s); + ofputil_port_map_destroy(map); + free(map); +} + static void do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len, struct xlate_ctx *ctx, bool is_last_action, @@ -7031,20 +7113,7 @@ 
do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len, } if (OVS_UNLIKELY(ctx->xin->trace)) { - struct ofputil_port_map map = OFPUTIL_PORT_MAP_INITIALIZER(&map); - - if (ctx->xin->names) { - struct ofproto_dpif *ofprotop; - ofprotop = ofproto_dpif_lookup_by_name(ctx->xbridge->name); - ofproto_append_ports_to_map(&map, ofprotop->up.ports); - } - - struct ds s = DS_EMPTY_INITIALIZER; - struct ofpact_format_params fp = { .s = &s, .port_map = &map }; - ofpacts_format(a, OFPACT_ALIGN(a->len), &fp); - xlate_report(ctx, OFT_ACTION, "%s", ds_cstr(&s)); - ds_destroy(&s); - ofputil_port_map_destroy(&map); + xlate_trace(ctx, a); } switch (a->type) { From a5fdc45b842d5109d38e9f1564afd3e3da77d6be Mon Sep 17 00:00:00 2001 From: Viacheslav Galaktionov Date: Thu, 13 Jul 2023 12:55:07 +0400 Subject: [PATCH 309/833] netdev-dpdk: Fix build with experimental API. The set_error function is now used regardless of whether experimental APIs are allowed or not, so it must be defined unconditionally. 
Fixes: fc06ea9a1883 ("netdev-dpdk: Add custom rx-steering configuration.") Acked-by: Ivan Malov Signed-off-by: Viacheslav Galaktionov Signed-off-by: Ilya Maximets --- lib/netdev-dpdk.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h index 5cd95d00f5a..86df7a1e83c 100644 --- a/lib/netdev-dpdk.h +++ b/lib/netdev-dpdk.h @@ -52,6 +52,17 @@ netdev_dpdk_rte_flow_query_count(struct netdev *netdev, int netdev_dpdk_get_port_id(struct netdev *netdev); +static inline void +set_error(struct rte_flow_error *error, enum rte_flow_error_type type) +{ + if (!error) { + return; + } + error->type = type; + error->cause = NULL; + error->message = NULL; +} + #ifdef ALLOW_EXPERIMENTAL_API int netdev_dpdk_rte_flow_tunnel_decap_set(struct netdev *, @@ -79,17 +90,6 @@ int netdev_dpdk_rte_flow_tunnel_item_release(struct netdev *, #else -static inline void -set_error(struct rte_flow_error *error, enum rte_flow_error_type type) -{ - if (!error) { - return; - } - error->type = type; - error->cause = NULL; - error->message = NULL; -} - static inline int netdev_dpdk_rte_flow_tunnel_decap_set( struct netdev *netdev OVS_UNUSED, From 62f5aa42aada8f3a6bebbcd0bd87f79c37334c9c Mon Sep 17 00:00:00 2001 From: James Raphael Tiovalen Date: Wed, 14 Jun 2023 02:34:38 +0800 Subject: [PATCH 310/833] shash, simap, smap: Add assertions to `*_count` functions. This commit adds assertions in the functions `shash_count`, `simap_count`, and `smap_count` to ensure that the corresponding input struct pointer is not NULL. This ensures that if the return values of `shash_sort`, `simap_sort`, or `smap_sort` are NULL, then the following for loops would not attempt to access the pointer, which might result in segmentation faults or undefined behavior. 
Reviewed-by: Simon Horman Acked-by: Eelco Chaudron Signed-off-by: James Raphael Tiovalen Signed-off-by: Ilya Maximets --- lib/shash.c | 2 ++ lib/simap.c | 2 ++ lib/smap.c | 1 + 3 files changed, 5 insertions(+) diff --git a/lib/shash.c b/lib/shash.c index a7b2c645829..2bfc8eb507f 100644 --- a/lib/shash.c +++ b/lib/shash.c @@ -17,6 +17,7 @@ #include #include "openvswitch/shash.h" #include "hash.h" +#include "util.h" static struct shash_node *shash_find__(const struct shash *, const char *name, size_t name_len, @@ -100,6 +101,7 @@ shash_is_empty(const struct shash *shash) size_t shash_count(const struct shash *shash) { + ovs_assert(shash); return hmap_count(&shash->map); } diff --git a/lib/simap.c b/lib/simap.c index 0ee08d74d52..1c01d4ebe22 100644 --- a/lib/simap.c +++ b/lib/simap.c @@ -17,6 +17,7 @@ #include #include "simap.h" #include "hash.h" +#include "util.h" static size_t hash_name(const char *, size_t length); static struct simap_node *simap_find__(const struct simap *, @@ -84,6 +85,7 @@ simap_is_empty(const struct simap *simap) size_t simap_count(const struct simap *simap) { + ovs_assert(simap); return hmap_count(&simap->map); } diff --git a/lib/smap.c b/lib/smap.c index 47fb3450201..122adca2717 100644 --- a/lib/smap.c +++ b/lib/smap.c @@ -300,6 +300,7 @@ smap_is_empty(const struct smap *smap) size_t smap_count(const struct smap *smap) { + ovs_assert(smap); return hmap_count(&smap->map); } From 501f665a5a4b3eafa75f020ab77c1d62f7840172 Mon Sep 17 00:00:00 2001 From: Paolo Valerio Date: Wed, 12 Jul 2023 11:16:43 +0200 Subject: [PATCH 311/833] conntrack: Extract l4 information for SCTP. Since a27d70a89 ("conntrack: add generic IP protocol support") all the unrecognized IP protocols get handled using ct_proto_other ops and are managed as L3 using 3 tuples. This patch stores L4 information for SCTP in the conn_key so that multiple conn instances, instead of one with ports zeroed, will be created when there are multiple SCTP connections between two hosts. 
It also performs a crc32c check when not offloaded, and adds SCTP to pat_enabled. With this patch, given two SCTP associations between two hosts, tracking the connection will result in: sctp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=55884,dport=5201), reply=(src=10.1.1.1,dst=10.1.1.2,sport=5201,dport=12345),zone=1 sctp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=59874,dport=5202), reply=(src=10.1.1.1,dst=10.1.1.2,sport=5202,dport=12346),zone=1 instead of: sctp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=0,dport=0), reply=(src=10.1.1.1,dst=10.1.1.2,sport=0,dport=0),zone=1 Signed-off-by: Paolo Valerio Signed-off-by: Ilya Maximets --- NEWS | 1 + lib/conntrack.c | 86 +++++++++++++++++++++++++++++++- lib/packets.h | 11 ++++ tests/system-kmod-macros.at | 11 ++++ tests/system-traffic.at | 73 +++++++++++++++++++++++++++ tests/system-userspace-macros.at | 7 +++ 6 files changed, 188 insertions(+), 1 deletion(-) diff --git a/NEWS b/NEWS index eedaad07b13..01e8219bfa5 100644 --- a/NEWS +++ b/NEWS @@ -42,6 +42,7 @@ Post-v3.1.0 - SRv6 Tunnel Protocol * Added support for userspace datapath (only). - Userspace datapath: + * Connection tracking now supports extraction of SCTP L4 information. * Implementation of OpenFlow meters is now lockless allowing for better multi-thread scalability.
* IP and L4 checksum offload support is now enabled by default for diff --git a/lib/conntrack.c b/lib/conntrack.c index 4375c03e2b8..5f1176d333f 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -27,6 +27,7 @@ #include "conntrack-private.h" #include "conntrack-tp.h" #include "coverage.h" +#include "crc32c.h" #include "csum.h" #include "ct-dpif.h" #include "dp-packet.h" @@ -41,6 +42,7 @@ #include "random.h" #include "rculist.h" #include "timeval.h" +#include "unaligned.h" VLOG_DEFINE_THIS_MODULE(conntrack); @@ -771,6 +773,8 @@ pat_packet(struct dp_packet *pkt, const struct conn_key *key) packet_set_tcp_port(pkt, key->dst.port, key->src.port); } else if (key->nw_proto == IPPROTO_UDP) { packet_set_udp_port(pkt, key->dst.port, key->src.port); + } else if (key->nw_proto == IPPROTO_SCTP) { + packet_set_sctp_port(pkt, key->dst.port, key->src.port); } } @@ -1675,6 +1679,26 @@ checksum_valid(const struct conn_key *key, const void *data, size_t size, return valid; } +static inline bool +sctp_checksum_valid(const void *data, size_t size) +{ + struct sctp_header *sctp = (struct sctp_header *) data; + ovs_be32 rcvd_csum, csum; + bool ret; + + rcvd_csum = get_16aligned_be32(&sctp->sctp_csum); + put_16aligned_be32(&sctp->sctp_csum, 0); + csum = crc32c(data, size); + put_16aligned_be32(&sctp->sctp_csum, rcvd_csum); + + ret = (rcvd_csum == csum); + if (!ret) { + COVERAGE_INC(conntrack_l4csum_err); + } + + return ret; +} + static inline bool check_l4_tcp(const struct conn_key *key, const void *data, size_t size, const void *l3, bool validate_checksum) @@ -1711,6 +1735,47 @@ check_l4_udp(const struct conn_key *key, const void *data, size_t size, || (validate_checksum ? 
checksum_valid(key, data, size, l3) : true); } +static inline bool +sctp_check_len(const struct sctp_header *sh, size_t size) +{ + const struct sctp_chunk_header *sch; + size_t next; + + if (size < SCTP_HEADER_LEN) { + return false; + } + + /* rfc4960: Chunks (including Type, Length, and Value fields) are padded + * out by the sender with all zero bytes to be a multiple of 4 bytes long. + */ + for (next = sizeof(struct sctp_header), + sch = SCTP_NEXT_CHUNK(sh, next); + next < size; + next += ROUND_UP(ntohs(sch->length), 4), + sch = SCTP_NEXT_CHUNK(sh, next)) { + /* rfc4960: This value represents the size of the chunk in bytes, + * including the Chunk Type, Chunk Flags, Chunk Length, and Chunk Value + * fields. + * Therefore, if the Chunk Value field is zero-length, the Length + * field will be set to 4. */ + if (ntohs(sch->length) < sizeof *sch) { + return false; + } + } + + return (next == size); +} + +static inline bool +check_l4_sctp(const void *data, size_t size, bool validate_checksum) +{ + if (OVS_UNLIKELY(!sctp_check_len(data, size))) { + return false; + } + + return validate_checksum ? sctp_checksum_valid(data, size) : true; +} + static inline bool check_l4_icmp(const void *data, size_t size, bool validate_checksum) { @@ -1761,6 +1826,21 @@ extract_l4_udp(struct conn_key *key, const void *data, size_t size, return key->src.port && key->dst.port; } +static inline bool +extract_l4_sctp(struct conn_key *key, const void *data, size_t size, + size_t *chk_len) +{ + if (OVS_UNLIKELY(size < (chk_len ? 
*chk_len : SCTP_HEADER_LEN))) { + return false; + } + + const struct sctp_header *sctp = data; + key->src.port = sctp->sctp_src; + key->dst.port = sctp->sctp_dst; + + return key->src.port && key->dst.port; +} + static inline bool extract_l4(struct conn_key *key, const void *data, size_t size, bool *related, const void *l3, bool validate_checksum, size_t *chk_len); @@ -1976,6 +2056,9 @@ extract_l4(struct conn_key *key, const void *data, size_t size, bool *related, return (!related || check_l4_udp(key, data, size, l3, validate_checksum)) && extract_l4_udp(key, data, size, chk_len); + } else if (key->nw_proto == IPPROTO_SCTP) { + return (!related || check_l4_sctp(data, size, validate_checksum)) + && extract_l4_sctp(key, data, size, chk_len); } else if (key->dl_type == htons(ETH_TYPE_IP) && key->nw_proto == IPPROTO_ICMP) { return (!related || check_l4_icmp(data, size, validate_checksum)) @@ -2374,7 +2457,8 @@ nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn, uint32_t hash = nat_range_hash(conn, ct->hash_basis, nat_info); union ct_addr min_addr = {0}, max_addr = {0}, addr = {0}; bool pat_proto = conn->key.nw_proto == IPPROTO_TCP || - conn->key.nw_proto == IPPROTO_UDP; + conn->key.nw_proto == IPPROTO_UDP || + conn->key.nw_proto == IPPROTO_SCTP; uint16_t min_dport, max_dport, curr_dport; uint16_t min_sport, max_sport, curr_sport; diff --git a/lib/packets.h b/lib/packets.h index 200b25cf012..12245b7649a 100644 --- a/lib/packets.h +++ b/lib/packets.h @@ -854,6 +854,17 @@ struct sctp_header { }; BUILD_ASSERT_DECL(SCTP_HEADER_LEN == sizeof(struct sctp_header)); +#define SCTP_CHUNK_HEADER_LEN 4 +struct sctp_chunk_header { + uint8_t type; + uint8_t flags; + ovs_be16 length; +}; +BUILD_ASSERT_DECL(SCTP_CHUNK_HEADER_LEN == sizeof(struct sctp_chunk_header)); + +#define SCTP_NEXT_CHUNK(sh, off) \ + ALIGNED_CAST(struct sctp_chunk_header *, (uint8_t *) sh + off) + #define UDP_HEADER_LEN 8 struct udp_header { ovs_be16 udp_src; diff --git a/tests/system-kmod-macros.at 
b/tests/system-kmod-macros.at index 81601390ddb..5203b1df808 100644 --- a/tests/system-kmod-macros.at +++ b/tests/system-kmod-macros.at @@ -112,6 +112,17 @@ m4_define([CHECK_CONNTRACK_ZEROIP_SNAT], AT_SKIP_IF([test "$IS_WIN32" = "yes"]) ]) +# CHECK_CONNTRACK_SCTP() +# +# Perform requirements checks for running conntrack SCTP. The kernel +# optionally support nf proto sctp. +# +m4_define([CHECK_CONNTRACK_SCTP], +[ + AT_SKIP_IF([test "$IS_WIN32" = "yes"]) + AT_SKIP_IF([! test -e /proc/sys/net/netfilter/nf_conntrack_sctp_timeout_closed]) +]) + # CHECK_CONNTRACK_TIMEOUT() # # Perform requirements checks for running conntrack customized timeout tests. diff --git a/tests/system-traffic.at b/tests/system-traffic.at index a05ca311ca8..9f07f45a36a 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -4701,6 +4701,79 @@ udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),reply=(src= OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([conntrack - SCTP SNAT with port range]) +CHECK_CONNTRACK() +CHECK_CONNTRACK_SCTP() +OVS_TRAFFIC_VSWITCHD_START() + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") +NS_CHECK_EXEC([at_ns0], [ip link set dev p0 address e6:66:c1:11:11:11]) +NS_CHECK_EXEC([at_ns1], [ip link set dev p1 address e6:66:c1:22:22:22]) + +dnl Allow any traffic from ns0->ns1. Only allow return traffic from ns1->ns0. 
+AT_DATA([flows.txt], [dnl +table=0,priority=100,in_port=1,sctp,action=ct(commit,zone=1,nat(src=10.1.1.240:34567)),controller +table=0,priority=100,in_port=2,ct_state=-trk,sctp,tp_dst=34567,action=ct(table=1,zone=1,nat) +table=0,priority=0,action=drop +table=1,priority=100,in_port=2,ct_state=+trk+rpl,ct_zone=1,sctp,action=controller +table=1,priority=0,action=drop +]) + +AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) + +AT_CAPTURE_FILE([ofctl_monitor.log]) +AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log]) + +dnl Simple SCTP association local and remote single homing +dnl Send INIT. +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=e666c1222222e666c111111108004502004400004000408424300a0101010a010102d6b9303900000000c5cc426b0100002470e18ccc0001a000000affff7ae1c142000c00060005000080000004c0000004 actions=resubmit(,0)"]) +dnl Reply INIT_ACK. +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=e666c1111111e666c122222208004502012400004000408422610a0101020a0101f03039870770e18ccc97abd49a0200010425bb9dfa0001a000000a000abb90fba5000700e827a048cd1474b111490710816ec95cfc501126b200000000000000000000000000000000fa9dbb25cc8ce17000000000000000002b953b0e1d346d160a000a00a5fb90bb020087070a0101f00000000000000000000000000000000000000000393001000000000080020024fbb82eae13af8d70329bc42bb7cd7e6458d60ff1a181e9b41167c2cab54471bf0000000000000000000000000000000000000000000000000000000000000000000000000100002470e18ccc0001a000000affff7ae1c142000c00060005000080000004c00000040000000000000000000000000000000080000004c0000004 actions=resubmit(,0)"]) +dnl Send COOKIE_ECHO. 
+AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=e666c1222222e666c1111111080045020108000040004084236c0a0101010a010102d6b9303925bb9dfaf2c860300a0000e827a048cd1474b111490710816ec95cfc501126b200000000000000000000000000000000fa9dbb25cc8ce17000000000000000002b953b0e1d346d160a000a00a5fb90bb020087070a0101f00000000000000000000000000000000000000000393001000000000080020024fbb82eae13af8d70329bc42bb7cd7e6458d60ff1a181e9b41167c2cab54471bf0000000000000000000000000000000000000000000000000000000000000000000000000100002470e18ccc0001a000000affff7ae1c142000c00060005000080000004c000000400000000000000000000000000000000 actions=resubmit(,0)"]) +dnl Reply COOKIE_ACK. +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=e666c1111111e666c122222208004502002400004000408423610a0101020a0101f03039870770e18ccc0391398b0b000004 actions=resubmit(,0)"]) +dnl Send DATA. +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=e666c1222222e666c1111111080045020034000140004084243f0a0101010a010102d6b9303925bb9dfabc366345000300147ae1c1420000000000000000666f6f0a actions=resubmit(,0)"]) +dnl Reply SACK. +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=e666c1111111e666c122222208004502003042c840004084e08c0a0101020a0101f03039870770e18ccc6a990714030000107ae1c14200019ffc00000000 actions=resubmit(,0)"]) +dnl ABORT the association. 
The association cannot be gracefully terminated because of +dnl a small timeouts in SHUTDOWN_SENT in the kernel datapath that would make the test unreliable +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=e666c1222222e666c111111108004500002400010000408464510a0101010a010102d6b9303925bb9dfae3b82c3806000004 actions=resubmit(,0)"]) + +AT_CHECK([ovs-appctl revalidator/purge], [0]) + +OVS_APP_EXIT_AND_WAIT([ovs-ofctl]) + +AT_CHECK([cat ofctl_monitor.log], [0], [dnl +NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=82 in_port=1 (via action) data_len=82 (unbuffered) +sctp,vlan_tci=0x0000,dl_src=e6:66:c1:11:11:11,dl_dst=e6:66:c1:22:22:22,nw_src=10.1.1.240,nw_dst=10.1.1.2,nw_tos=0,nw_ecn=2,nw_ttl=64,nw_frag=no,tp_src=34567,tp_dst=12345 sctp_csum:9670267b +NXT_PACKET_IN2 (xid=0x0): table_id=1 cookie=0x0 total_len=306 ct_state=est|rpl|trk|dnat,ct_zone=1,ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2,ct_nw_proto=132,ct_tp_src=54969,ct_tp_dst=12345,ip,in_port=2 (via action) data_len=306 (unbuffered) +sctp,vlan_tci=0x0000,dl_src=e6:66:c1:22:22:22,dl_dst=e6:66:c1:11:11:11,nw_src=10.1.1.2,nw_dst=10.1.1.1,nw_tos=0,nw_ecn=2,nw_ttl=64,nw_frag=no,tp_src=12345,tp_dst=54969 sctp_csum:49864886 +NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=278 in_port=1 (via action) data_len=278 (unbuffered) +sctp,vlan_tci=0x0000,dl_src=e6:66:c1:11:11:11,dl_dst=e6:66:c1:22:22:22,nw_src=10.1.1.240,nw_dst=10.1.1.2,nw_tos=0,nw_ecn=2,nw_ttl=64,nw_frag=no,tp_src=34567,tp_dst=12345 sctp_csum:8c816918 +NXT_PACKET_IN2 (xid=0x0): table_id=1 cookie=0x0 total_len=50 ct_state=est|rpl|trk|dnat,ct_zone=1,ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2,ct_nw_proto=132,ct_tp_src=54969,ct_tp_dst=12345,ip,in_port=2 (via action) data_len=50 (unbuffered) +sctp,vlan_tci=0x0000,dl_src=e6:66:c1:22:22:22,dl_dst=e6:66:c1:11:11:11,nw_src=10.1.1.2,nw_dst=10.1.1.1,nw_tos=0,nw_ecn=2,nw_ttl=64,nw_frag=no,tp_src=12345,tp_dst=54969 sctp_csum:ef4749fc +NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=66 in_port=1 (via action) data_len=66 
(unbuffered) +sctp,vlan_tci=0x0000,dl_src=e6:66:c1:11:11:11,dl_dst=e6:66:c1:22:22:22,nw_src=10.1.1.240,nw_dst=10.1.1.2,nw_tos=0,nw_ecn=2,nw_ttl=64,nw_frag=no,tp_src=34567,tp_dst=12345 sctp_csum:eb2b2c17 +NXT_PACKET_IN2 (xid=0x0): table_id=1 cookie=0x0 total_len=62 ct_state=est|rpl|trk|dnat,ct_zone=1,ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2,ct_nw_proto=132,ct_tp_src=54969,ct_tp_dst=12345,ip,in_port=2 (via action) data_len=62 (unbuffered) +sctp,vlan_tci=0x0000,dl_src=e6:66:c1:22:22:22,dl_dst=e6:66:c1:11:11:11,nw_src=10.1.1.2,nw_dst=10.1.1.1,nw_tos=0,nw_ecn=2,nw_ttl=64,nw_frag=no,tp_src=12345,tp_dst=54969 sctp_csum:9b67e853 +NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=50 in_port=1 (via action) data_len=50 (unbuffered) +sctp,vlan_tci=0x0000,dl_src=e6:66:c1:11:11:11,dl_dst=e6:66:c1:22:22:22,nw_src=10.1.1.240,nw_dst=10.1.1.2,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=34567,tp_dst=12345 sctp_csum:4bb49f65 +]) + +dnl Check the ct entry +dnl protoinfo has to be removed in order to normalize the current difference between user and kernel output +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2) | sed 's/,protoinfo=.*$//' ], [], [dnl +sctp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),reply=(src=10.1.1.2,dst=10.1.1.240,sport=,dport=),zone=1 +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + dnl Check kernel datapath to make sure conntrack fills in L3 and L4 dnl protocol information AT_SETUP([conntrack - fragment reassembly with L3 L4 protocol information]) diff --git a/tests/system-userspace-macros.at b/tests/system-userspace-macros.at index 73e0e843b9f..d9b5b7e4c4d 100644 --- a/tests/system-userspace-macros.at +++ b/tests/system-userspace-macros.at @@ -106,6 +106,13 @@ m4_define([CHECK_CONNTRACK_NAT]) # m4_define([CHECK_CONNTRACK_ZEROIP_SNAT]) +# CHECK_CONNTRACK_SCTP() +# +# Perform requirements checks for running conntrack SCTP. The userspace +# datapath has no dependency, so no check is required. 
+# +m4_define([CHECK_CONNTRACK_SCTP]) + # CHECK_CONNTRACK_TIMEOUT() # # Perform requirements checks for running conntrack customized timeout tests. From 4d55a364ff60d894dce4e2e97a489d81520dc663 Mon Sep 17 00:00:00 2001 From: Terry Wilson Date: Tue, 11 Jul 2023 22:55:52 -0500 Subject: [PATCH 312/833] python: Add async DNS support. This adds a Python version of the async DNS support added in: 771680d96 DNS: Add basic support for asynchronous DNS resolving The above version uses the unbound C library, and this implementation uses the SWIG-wrapped Python version of that. In the event that the Python unbound library is not available, a warning will be logged and the resolve() method will just return None. For the case where inet_parse_active() is passed an IP address, it will not try to resolve it, so existing behavior should be preserved in the case that the unbound library is unavailable. Intentional differences from the C version are as follows: OVS_HOSTS_FILE environment variable can be set to override the system 'hosts' file. This is primarily to allow testing to be done without requiring network connectivity. Since resolution can still be done via hosts file lookup, DNS lookups are not disabled when resolv.conf cannot be loaded. The Python socket_util module has fallen behind its C equivalent. The bare minimum change was done to inet_parse_active() to support sync/async dns, as there is no equivalent to parse_sockaddr_components(), inet_parse_passive(), etc. A TODO was added to bring socket_util.py up to equivalency to the C version.
Signed-off-by: Terry Wilson Signed-off-by: Ilya Maximets --- .github/workflows/build-and-test.yml | 4 +- Documentation/intro/install/general.rst | 4 +- Documentation/intro/install/rhel.rst | 2 +- Documentation/intro/install/windows.rst | 2 +- NEWS | 3 + debian/control.in | 1 + m4/openvswitch.m4 | 8 +- python/TODO.rst | 7 + python/automake.mk | 2 + python/ovs/dns_resolve.py | 286 ++++++++++++++++++++++++ python/ovs/socket_util.py | 21 +- python/ovs/stream.py | 2 +- python/ovs/tests/test_dns_resolve.py | 280 +++++++++++++++++++++++ python/setup.py | 6 +- rhel/openvswitch-fedora.spec.in | 2 +- tests/vlog.at | 2 + 16 files changed, 615 insertions(+), 17 deletions(-) create mode 100644 python/ovs/dns_resolve.py create mode 100644 python/ovs/tests/test_dns_resolve.py diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index f66ab43b0bf..47d239f1086 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -183,10 +183,10 @@ jobs: run: sudo apt update || true - name: install common dependencies run: sudo apt install -y ${{ env.dependencies }} - - name: install libunbound libunwind + - name: install libunbound libunwind python3-unbound # GitHub Actions doesn't have 32-bit versions of these libraries. if: matrix.m32 == '' - run: sudo apt install -y libunbound-dev libunwind-dev + run: sudo apt install -y libunbound-dev libunwind-dev python3-unbound - name: install 32-bit libraries if: matrix.m32 != '' run: sudo apt install -y gcc-multilib diff --git a/Documentation/intro/install/general.rst b/Documentation/intro/install/general.rst index 42b5682fd87..19e360d47ce 100644 --- a/Documentation/intro/install/general.rst +++ b/Documentation/intro/install/general.rst @@ -90,7 +90,7 @@ need the following software: If libcap-ng is installed, then Open vSwitch will automatically build with support for it. -- Python 3.4 or later. +- Python 3.6 or later. 
- Unbound library, from http://www.unbound.net, is optional but recommended if you want to enable ovs-vswitchd and other utilities to use DNS names when @@ -208,7 +208,7 @@ simply install and run Open vSwitch you require the following software: from iproute2 (part of all major distributions and available at https://wiki.linuxfoundation.org/networking/iproute2). -- Python 3.4 or later. +- Python 3.6 or later. On Linux you should ensure that ``/dev/urandom`` exists. To support TAP devices, you must also ensure that ``/dev/net/tun`` exists. diff --git a/Documentation/intro/install/rhel.rst b/Documentation/intro/install/rhel.rst index d1fc42021a6..f2151d89071 100644 --- a/Documentation/intro/install/rhel.rst +++ b/Documentation/intro/install/rhel.rst @@ -92,7 +92,7 @@ Once that is completed, remove the file ``/tmp/ovs.spec``. If python3-sphinx package is not available in your version of RHEL, you can install it via pip with 'pip install sphinx'. -Open vSwitch requires python 3.4 or newer which is not available in older +Open vSwitch requires python 3.6 or newer which is not available in older distributions. In the case of RHEL 6.x and its derivatives, one option is to install python34 from `EPEL`_. diff --git a/Documentation/intro/install/windows.rst b/Documentation/intro/install/windows.rst index 78f60f35acf..fce099d5dc1 100644 --- a/Documentation/intro/install/windows.rst +++ b/Documentation/intro/install/windows.rst @@ -56,7 +56,7 @@ The following explains the steps in some detail. 'C:/MinGW /mingw'. -- Python 3.4 or later. +- Python 3.6 or later. Install the latest Python 3.x from python.org and verify that its path is part of Windows' PATH environment variable. diff --git a/NEWS b/NEWS index 01e8219bfa5..bda41ad4c53 100644 --- a/NEWS +++ b/NEWS @@ -50,6 +50,9 @@ Post-v3.1.0 table to check the status. - Linux TC offload: * Add support for offloading VXLAN tunnels with the GBP extensions. + - Python + * Added async DNS support. + * Dropped support for Python < 3.6. 
v3.1.0 - 16 Feb 2023 diff --git a/debian/control.in b/debian/control.in index 19f590d0645..64b0a4ce018 100644 --- a/debian/control.in +++ b/debian/control.in @@ -287,6 +287,7 @@ Depends: Suggests: python3-netaddr, python3-pyparsing, + python3-unbound, Description: Python 3 bindings for Open vSwitch Open vSwitch is a production quality, multilayer, software-based, Ethernet virtual switch. It is designed to enable massive network diff --git a/m4/openvswitch.m4 b/m4/openvswitch.m4 index 47f486be49b..47aa9da16a1 100644 --- a/m4/openvswitch.m4 +++ b/m4/openvswitch.m4 @@ -375,16 +375,16 @@ dnl Checks for valgrind/valgrind.h. AC_DEFUN([OVS_CHECK_VALGRIND], [AC_CHECK_HEADERS([valgrind/valgrind.h])]) -dnl Checks for Python 3.4 or later. +dnl Checks for Python 3.6 or later. AC_DEFUN([OVS_CHECK_PYTHON3], [AC_CACHE_CHECK( - [for Python 3 (version 3.4 or later)], + [for Python 3 (version 3.6 or later)], [ovs_cv_python3], [if test -n "$PYTHON3"; then ovs_cv_python3=$PYTHON3 else ovs_cv_python3=no - for binary in python3 python3.4 python3.5 python3.6 python3.7; do + for binary in python3 python3.6 python3.7 python3.8 python3.9 python3.10 python3.11 python3.12; do ovs_save_IFS=$IFS; IFS=$PATH_SEPARATOR for dir in $PATH; do IFS=$ovs_save_IFS @@ -401,7 +401,7 @@ else: done fi]) if test "$ovs_cv_python3" = no; then - AC_MSG_ERROR([Python 3.4 or later is required but not found in $PATH, please install it or set $PYTHON3 to point to it]) + AC_MSG_ERROR([Python 3.6 or later is required but not found in $PATH, please install it or set $PYTHON3 to point to it]) fi AC_ARG_VAR([PYTHON3]) PYTHON3=$ovs_cv_python3]) diff --git a/python/TODO.rst b/python/TODO.rst index 3a53489f128..acc5461e2f2 100644 --- a/python/TODO.rst +++ b/python/TODO.rst @@ -32,3 +32,10 @@ Python Bindings To-do List * Support write-only-changed monitor mode (equivalent of OVSDB_IDL_WRITE_CHANGED_ONLY). + +* socket_util: + + * Add equivalent functions to inet_parse_passive, parse_sockaddr_components, + et al. 
to better support using async dns. The reconnect code will + currently log a warning when inet_parse_active() returns w/o yet having + resolved an address, but will continue to connect and eventually succeed. diff --git a/python/automake.mk b/python/automake.mk index d00911828c6..82a50878741 100644 --- a/python/automake.mk +++ b/python/automake.mk @@ -16,6 +16,7 @@ ovs_pyfiles = \ python/ovs/compat/sortedcontainers/sorteddict.py \ python/ovs/compat/sortedcontainers/sortedset.py \ python/ovs/daemon.py \ + python/ovs/dns_resolve.py \ python/ovs/db/__init__.py \ python/ovs/db/custom_index.py \ python/ovs/db/data.py \ @@ -55,6 +56,7 @@ ovs_pyfiles = \ ovs_pytests = \ python/ovs/tests/test_decoders.py \ + python/ovs/tests/test_dns_resolve.py \ python/ovs/tests/test_filter.py \ python/ovs/tests/test_kv.py \ python/ovs/tests/test_list.py \ diff --git a/python/ovs/dns_resolve.py b/python/ovs/dns_resolve.py new file mode 100644 index 00000000000..41546ad5ca4 --- /dev/null +++ b/python/ovs/dns_resolve.py @@ -0,0 +1,286 @@ +# Copyright (c) 2023 Red Hat, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import collections +import enum +import functools +import ipaddress +import os +import time +import typing + +try: + import unbound # type: ignore +except ImportError: + pass + +import ovs.vlog + +vlog = ovs.vlog.Vlog("dns_resolve") + + +class ReqState(enum.Enum): + INVALID = 0 + PENDING = 1 + GOOD = 2 + ERROR = 3 + + +class DNSRequest: + def __init__(self, name: str): + self.name: str = name + self.state: ReqState = ReqState.INVALID + self.time: typing.Optional[float] = None + # set by DNSResolver._callback + self.result: typing.Optional[str] = None + self.ttl: typing.Optional[float] = None + + @property + def expired(self): + return time.time() > self.time + self.ttl + + @property + def is_valid(self): + return self.state == ReqState.GOOD and not self.expired + + def __str__(self): + return (f"DNSRequest(name={self.name}, state={self.state}, " + f"time={self.time}, result={self.result})") + + +class DefaultReqDict(collections.defaultdict): + def __init__(self): + super().__init__(DNSRequest) + + def __missing__(self, key): + ret = self.default_factory(key) + self[key] = ret + return ret + + +class UnboundException(Exception): + def __init__(self, message, errno): + try: + msg = f"{message}: {unbound.ub_strerror(errno)}" + except NameError: + msg = message + super().__init__(msg) + + +def dns_enabled(func): + @functools.wraps(func) + def wrapper(self, *args, **kwargs): + if self.dns_enabled: + return func(self, *args, **kwargs) + vlog.err("DNS support requires the python unbound library") + return wrapper + + +class DNSResolver: + def __init__(self, is_daemon: bool = False): + """Create a resolver instance + + If is_daemon is true, set the resolver to handle requests + asynchronously. 
The following environment variables are processed: + + OVS_UNBOUND_CONF: The filename for an unbound.conf file + OVS_RESOLV_CONF: A filename to override the system default resolv.conf + OVS_HOSTS_FILE: A filename to override the system default hosts file + + In the event that the unbound library is missing or fails to initialize + DNS lookup support will be disabled and the resolve() method will + return None. + """ + self._is_daemon = is_daemon + try: + self._ctx = unbound.ub_ctx() + self.dns_enabled = True + except Exception: + # The unbound docs mention that this could throw an exception + # but do not specify what exception that is. This can also + # happen with a missing unbound library. + self.dns_enabled = False + vlog.err("Failed to initialize the unbound library") + return + + # NOTE(twilson) This cache, like the C version, can grow without bound + # and has no cleanup or aging mechanism. Given our usage patterns, this + # should not be a problem. But this should not be used to resolve an + # unbounded list of addresses in a long-running daemon. + self._requests = DefaultReqDict() + + self._ub_call(self._set_unbound_conf) + + # NOTE(twilson) The C version disables DNS in this case. 
I didn't do + # that here since it could still be useful to resolve addresses from + # /etc/hosts even w/o resolv.conf + self._ub_call(self._set_resolv_conf) + self._ub_call(self._set_hosts_file) + + self._ctx.set_async(True) # Sets threaded behavior for resolve_async() + + def _ub_call(self, fn, *args, **kwargs): + """Convert UnboundExceptions into vlog warnings""" + try: + return fn(*args, **kwargs) + except UnboundException as e: + vlog.warn(e) + + @dns_enabled + def _set_unbound_conf(self): + ub_cfg = os.getenv("OVS_UNBOUND_CONF") + if ub_cfg: + retval = self._ctx.config(ub_cfg) + if retval != 0: + raise UnboundException( + "Failed to set libunbound context config", retval) + + @dns_enabled + def _set_resolv_conf(self): + filename = os.getenv("OVS_RESOLV_CONF") + # The C lib checks that the file exists and also sets filename to + # /etc/resolv.conf on non-Windows, but resolvconf already does this. + retval = self._ctx.resolvconf(filename) + if retval != 0: + location = filename or "system default nameserver" + raise UnboundException(location, retval) + + @dns_enabled + def _set_hosts_file(self): + # The C lib doesn't have the ability to set a hosts file, but it is + # useful to have, especially for writing tests that don't rely on + # network connectivity. hosts(None) uses /etc/hosts. 
+ filename = os.getenv("OVS_HOSTS_FILE") + retval = self._ctx.hosts(filename) + if retval != 0: + location = filename or "system default hosts file" + raise UnboundException(location, retval) + + @dns_enabled + def _callback(self, req: DNSRequest, err: int, result): + if err != 0 or (result.qtype == unbound.RR_TYPE_AAAA + and not result.havedata): + req.state = ReqState.ERROR + vlog.warn(f"{req.name}: failed to resolve") + return + if result.qtype == unbound.RR_TYPE_A and not result.havedata: + self._resolve_async(req, unbound.RR_TYPE_AAAA) + return + try: + ip_str = next(iter(result.data.as_raw_data())) + ip = ipaddress.ip_address(ip_str) # test if IP is valid + # NOTE (twilson) For some reason, accessing result data outside of + # _callback causes a segfault. So just grab and store what we need. + req.result = str(ip) + req.ttl = result.ttl + req.state = ReqState.GOOD + req.time = time.time() + except (ValueError, StopIteration): + req.state = ReqState.ERROR + vlog.err(f"{req.name}: failed to resolve") + + @dns_enabled + def _resolve_sync(self, name: str) -> typing.Optional[str]: + for qtype in (unbound.RR_TYPE_A, unbound.RR_TYPE_AAAA): + err, result = self._ctx.resolve(name, qtype) + if err != 0: + return None + if not result.havedata: + continue + try: + ip = ipaddress.ip_address( + next(iter(result.data.as_raw_data()))) + except (ValueError, StopIteration): + return None + return str(ip) + + return None + + @dns_enabled + def _resolve_async(self, req: DNSRequest, qtype) -> None: + err, _ = self._ctx.resolve_async(req.name, req, self._callback, + qtype) + if err != 0: + req.state = ReqState.ERROR + return None + + req.state = ReqState.PENDING + return None + + @dns_enabled + def resolve(self, name: str) -> typing.Optional[str]: + """Resolve a host name to an IP address + + If the resolver is set to handle requests asynchronously, resolve() + should be recalled until it returns a non-None result. Errors will be + logged. 
+ + :param name: The host name to resolve + :returns: The IP address or None on error or not (yet) found + """ + if not self._is_daemon: + return self._resolve_sync(name) + retval = self._ctx.process() + if retval != 0: + vlog.err(f"dns-resolve error: {unbound.ub_strerror(retval)}") + return None + req = self._requests[name] # Creates a DNSRequest if not found + if req.is_valid: + return req.result + elif req.state != ReqState.PENDING: + self._resolve_async(req, unbound.RR_TYPE_A) + return None + + +_global_resolver: typing.Optional[DNSResolver] = None + + +def init(is_daemon: bool = False) -> DNSResolver: + """Initialize a global DNSResolver + + See DNSResolver.__init__ for more details + """ + global _global_resolver + _global_resolver = DNSResolver(is_daemon) + return _global_resolver + + +def resolve(name: str) -> typing.Optional[str]: + """Resolve a host name to an IP address + + If a DNSResolver instance has not been instantiated, or if it has been + created with is_daemon=False, resolve() will synchronously resolve the + hostname. If DNSResolver has been initialized with is_daemon=True, it + will instead resolve asynchronously and resolve() will return None until + the hostname has been resolved. + + :param name: The host name to resolve + :returns: The IP address or None on error or not (yet) found + """ + if _global_resolver is None: + init() + + # mypy doesn't understand that init() sets _global_resolver, so ignore type + return _global_resolver.resolve(name) # type: ignore + + +def destroy(): + """Destroy the global DNSResolver + + This destroys the global DNSResolver instance and any outstanding + asynchronous requests. + """ + global _global_resolver + del _global_resolver + _global_resolver = None # noqa: F841 diff --git a/python/ovs/socket_util.py b/python/ovs/socket_util.py index 7b41dc44bf1..a26298b75ca 100644 --- a/python/ovs/socket_util.py +++ b/python/ovs/socket_util.py @@ -13,12 +13,14 @@ # limitations under the License. 
import errno +import ipaddress import os import os.path import random import socket import sys +from ovs import dns_resolve import ovs.fatal_signal import ovs.poller import ovs.vlog @@ -216,7 +218,7 @@ def is_valid_ipv4_address(address): return True -def inet_parse_active(target, default_port): +def _inet_parse_active(target, default_port): address = target.split(":") if len(address) >= 2: host_name = ":".join(address[0:-1]).lstrip('[').rstrip(']') @@ -229,9 +231,24 @@ def inet_parse_active(target, default_port): host_name = address[0] if not host_name: raise ValueError("%s: bad peer name format" % target) + try: + host_name = str(ipaddress.ip_address(host_name)) + except ValueError: + host_name = dns_resolve.resolve(host_name) + if not host_name: + raise ValueError("%s: bad peer name format" % target) return (host_name, port) +def inet_parse_active(target, default_port, raises=True): + try: + return _inet_parse_active(target, default_port) + except ValueError: + if raises: + raise + return ("", default_port) + + def inet_create_socket_active(style, address): try: is_addr_inet = is_valid_ipv4_address(address[0]) @@ -262,7 +279,7 @@ def inet_connect_active(sock, address, family, dscp): def inet_open_active(style, target, default_port, dscp): - address = inet_parse_active(target, default_port) + address = inet_parse_active(target, default_port, raises=False) family, sock = inet_create_socket_active(style, address) if sock is None: return family, sock diff --git a/python/ovs/stream.py b/python/ovs/stream.py index b32341076ca..82fbb0d6883 100644 --- a/python/ovs/stream.py +++ b/python/ovs/stream.py @@ -784,7 +784,7 @@ def needs_probes(): @staticmethod def _open(suffix, dscp): - address = ovs.socket_util.inet_parse_active(suffix, 0) + address = ovs.socket_util.inet_parse_active(suffix, 0, raises=False) family, sock = ovs.socket_util.inet_create_socket_active( socket.SOCK_STREAM, address) if sock is None: diff --git a/python/ovs/tests/test_dns_resolve.py 
b/python/ovs/tests/test_dns_resolve.py new file mode 100644 index 00000000000..0698e8f77d9 --- /dev/null +++ b/python/ovs/tests/test_dns_resolve.py @@ -0,0 +1,280 @@ +import contextlib +import ipaddress +import sys +import time +from unittest import mock + +import pytest + +from ovs import dns_resolve +from ovs import socket_util + + +skip_no_unbound = pytest.mark.skipif("unbound" not in dns_resolve.__dict__, + reason="Unbound not installed") + +HOSTS = [("192.0.2.1", "fake.ip4.domain", "192.0.2.1"), + ("2001:db8:2::1", "fake.ip6.domain", "2001:db8:2::1"), + ("192.0.2.2", "fake.both.domain", "192.0.2.2"), + ("2001:db8:2::2", "fake.both.domain", "192.0.2.2")] + + +def _tmp_file(path, content): + path.write_text(content) + assert content == path.read_text() + return path + + +@pytest.fixture(params=[False, True], ids=["not_daemon", "daemon"]) +def resolver_factory(monkeypatch, tmp_path, hosts_file, request): + # Allow delaying the instantiation of the DNSResolver + def resolver_factory(): + with monkeypatch.context() as m: + m.setenv("OVS_HOSTS_FILE", str(hosts_file)) + # Test with both is_daemon False and True + resolver = dns_resolve.init(request.param) + assert resolver._is_daemon == request.param + return resolver + + return resolver_factory + + +@contextlib.contextmanager +def DNSResolver(*args, **kwargs): + """Clean up after returning a dns_resolver.DNSResolver""" + resolver = dns_resolve.init(*args, **kwargs) + try: + yield resolver + finally: + dns_resolve.destroy() + assert dns_resolve._global_resolver is None + + +@pytest.fixture +def unbound_conf(tmp_path): + path = tmp_path / "unbound.conf" + content = """ + server: + verbosity: 1 + """ + return _tmp_file(path, content) + + +@pytest.fixture +def resolv_conf(tmp_path): + path = tmp_path / "resolv.conf" + content = "nameserver 127.0.0.1" + return _tmp_file(path, content) + + +@pytest.fixture +def hosts_file(tmp_path): + path = tmp_path / "hosts" + content = "\n".join(f"{ip}\t{host}" for ip, host, _ in 
HOSTS) + return _tmp_file(path, content) + + +@pytest.fixture +def missing_file(tmp_path): + f = tmp_path / "missing_file" + assert not f.exists() + return f + + +@pytest.fixture(params=[False, True], ids=["with unbound", "without unbound"]) +def missing_unbound(monkeypatch, request): + if request.param: + if "unbound" in dns_resolve.__dict__: + monkeypatch.setitem(sys.modules, 'unbound', None) + monkeypatch.delitem(dns_resolve.__dict__, "unbound") + elif "unbound" not in dns_resolve.__dict__: + pytest.skip("Unbound not installed") + return request.param + + +def test_missing_unbound(missing_unbound, resolver_factory): + resolver = resolver_factory() # Dont fail even w/o unbound + assert resolver.dns_enabled == (not missing_unbound) + + +def test_DNSRequest_defaults(): + req = dns_resolve.DNSRequest(HOSTS[0][1]) + assert HOSTS[0][1] == req.name + assert req.state == dns_resolve.ReqState.INVALID + assert req.time == req.result == req.ttl is None + assert str(req) + + +def _resolve(resolver, host, fn=dns_resolve.resolve): + """Handle sync/async lookups, giving up if more than 1 second has passed""" + + timeout = 1 + start = time.time() + name = fn(host) + if resolver and resolver._is_daemon: + while name is None: + name = fn(host) + if name: + break + time.sleep(0.01) + end = time.time() + if end - start > timeout: + break + if name: + return name + raise LookupError(f"{host} not found") + + +@pytest.mark.parametrize("ip,host,expected", HOSTS) +def test_resolve_addresses(missing_unbound, resolver_factory, ip, host, + expected): + resolver = resolver_factory() + if missing_unbound: + with pytest.raises(LookupError): + _resolve(resolver, host) + else: + result = _resolve(resolver, host) + assert ipaddress.ip_address(expected) == ipaddress.ip_address(result) + + +@pytest.mark.parametrize("ip,host,expected", HOSTS) +def test_resolve_without_init(monkeypatch, missing_unbound, ip, host, expected, + hosts_file): + # make sure we don't have a global resolver + 
dns_resolve.destroy() + with monkeypatch.context() as m: + m.setenv("OVS_HOSTS_FILE", str(hosts_file)) + if missing_unbound: + with pytest.raises(LookupError): + _resolve(None, host) + else: + res = _resolve(None, host) + assert dns_resolve._global_resolver is not None + assert dns_resolve._global_resolver._is_daemon is False + assert ipaddress.ip_address(expected) == ipaddress.ip_address(res) + + +def test_resolve_unknown_host(missing_unbound, resolver_factory): + resolver = resolver_factory() + with pytest.raises(LookupError): + _resolve(resolver, "fake.notadomain") + + +@skip_no_unbound +def test_resolve_process_error(): + with DNSResolver(True) as resolver: + with mock.patch.object(resolver._ctx, "process", return_value=-1): + assert resolver.resolve("fake.domain") is None + + +@skip_no_unbound +def test_resolve_resolve_error(): + with DNSResolver(False) as resolver: + with mock.patch.object(resolver._ctx, "resolve", + return_value=(-1, None)): + assert resolver.resolve("fake.domain") is None + + +@skip_no_unbound +def test_resolve_resolve_async_error(): + with DNSResolver(True) as resolver: + with mock.patch.object(resolver._ctx, "resolve_async", + return_value=(-1, None)): + with pytest.raises(LookupError): + _resolve(resolver, "fake.domain") + + +@pytest.mark.parametrize("file,raises", + [(None, False), + ("missing_file", dns_resolve.UnboundException), + ("unbound_conf", False)]) +def test_set_unbound_conf(monkeypatch, missing_unbound, resolver_factory, + request, file, raises): + if file: + file = str(request.getfixturevalue(file)) + monkeypatch.setenv("OVS_UNBOUND_CONF", file) + resolver = resolver_factory() # Doesn't raise + if missing_unbound: + assert resolver._set_unbound_conf() is None + return + with mock.patch.object(resolver._ctx, "config", + side_effect=resolver._ctx.config) as c: + if raises: + with pytest.raises(raises): + resolver._set_unbound_conf() + else: + resolver._set_unbound_conf() + if file: + c.assert_called_once_with(file) + else: + 
c.assert_not_called() + + +@pytest.mark.parametrize("file,raises", + [(None, False), + ("missing_file", dns_resolve.UnboundException), + ("resolv_conf", False)]) +def test_resolv_conf(monkeypatch, missing_unbound, resolver_factory, request, + file, raises): + if file: + file = str(request.getfixturevalue(file)) + monkeypatch.setenv("OVS_RESOLV_CONF", file) + resolver = resolver_factory() # Doesn't raise + if missing_unbound: + assert resolver._set_resolv_conf() is None + return + with mock.patch.object(resolver._ctx, "resolvconf", + side_effect=resolver._ctx.resolvconf) as c: + if raises: + with pytest.raises(raises): + resolver._set_resolv_conf() + else: + resolver._set_resolv_conf() + c.assert_called_once_with(file) + + +@pytest.mark.parametrize("file,raises", + [(None, False), + ("missing_file", dns_resolve.UnboundException), + ("hosts_file", False)]) +def test_hosts(monkeypatch, missing_unbound, resolver_factory, request, file, + raises): + if file: + file = str(request.getfixturevalue(file)) + monkeypatch.setenv("OVS_HOSTS_FILE", file) + resolver = resolver_factory() # Doesn't raise + if missing_unbound: + assert resolver._set_hosts_file() is None + return + with mock.patch.object(resolver._ctx, "hosts", + side_effect=resolver._ctx.hosts) as c: + if raises: + with pytest.raises(raises): + resolver._set_hosts_file() + else: + resolver._set_hosts_file() + c.assert_called_once_with(file) + + +def test_UnboundException(missing_unbound): + with pytest.raises(dns_resolve.UnboundException): + raise dns_resolve.UnboundException("Fake exception", -1) + + +@skip_no_unbound +@pytest.mark.parametrize("ip,host,expected", HOSTS) +def test_inet_parse_active(resolver_factory, ip, host, expected): + resolver = resolver_factory() + + def fn(name): + # Return the same thing _resolve() would so we can call + # this multiple times for the is_daemon=True case + return socket_util.inet_parse_active(f"{name}:6640", 6640, + raises=False)[0] or None + + # parsing IPs still works + IP = 
_resolve(resolver, ip, fn) + assert ipaddress.ip_address(ip) == ipaddress.ip_address(IP) + # parsing hosts works + IP = _resolve(resolver, host, fn) + assert ipaddress.ip_address(IP) == ipaddress.ip_address(expected) diff --git a/python/setup.py b/python/setup.py index 27684c40469..bcf832ce9ba 100644 --- a/python/setup.py +++ b/python/setup.py @@ -99,8 +99,7 @@ def build_extension(self, ext): 'Topic :: System :: Networking', 'License :: OSI Approved :: Apache Software License', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', ], ext_modules=[setuptools.Extension("ovs._json", sources=["ovs/_json.c"], @@ -110,7 +109,8 @@ def build_extension(self, ext): cmdclass={'build_ext': try_build_ext}, install_requires=['sortedcontainers'], extras_require={':sys_platform == "win32"': ['pywin32 >= 1.0'], - 'flow': ['netaddr', 'pyparsing']}, + 'flow': ['netaddr', 'pyparsing'], + 'dns': ['unbound']}, ) try: diff --git a/rhel/openvswitch-fedora.spec.in b/rhel/openvswitch-fedora.spec.in index 44899c1ca74..343a5716d16 100644 --- a/rhel/openvswitch-fedora.spec.in +++ b/rhel/openvswitch-fedora.spec.in @@ -113,7 +113,7 @@ Summary: Open vSwitch python3 bindings License: ASL 2.0 BuildArch: noarch Requires: python3 -Suggests: python3-netaddr python3-pyparsing +Suggests: python3-netaddr python3-pyparsing python3-unbound %{?python_provide:%python_provide python3-openvswitch = %{version}-%{release}} %description -n python3-openvswitch diff --git a/tests/vlog.at b/tests/vlog.at index 3e92e70a93c..785014956e7 100644 --- a/tests/vlog.at +++ b/tests/vlog.at @@ -385,6 +385,7 @@ AT_CHECK([APPCTL -t test-unixctl.py vlog/list], [0], [dnl console syslog file ------- ------ ------ daemon info info info +dns_resolve info info info fatal-signal info info info jsonrpc info info info poller info info info @@ -404,6 +405,7 @@ unixctl_server info info info console syslog file ------- ------ 
------ daemon info err dbg +dns_resolve info info dbg fatal-signal info info dbg jsonrpc info info dbg poller info info dbg From 023dcdc7a1551b59dce2e02142edda3dae533ff6 Mon Sep 17 00:00:00 2001 From: Kevin Traynor Date: Fri, 14 Jul 2023 19:06:33 +0100 Subject: [PATCH 313/833] dpif-netdev: Rename pmd-maxsleep config option. other_config:pmd-maxsleep is a config option to allow PMD thread cores to sleep under low or no load conditions. Rename it to 'pmd-sleep-max' to allow a more structured name and so that additional options or command can follow the 'pmd-sleep-xyz' pattern. Use of other_config:pmd-maxsleep is deprecated to be removed in a future release and will result in a warning. Reviewed-by: David Marchand Signed-off-by: Kevin Traynor Signed-off-by: Ilya Maximets --- Documentation/topics/dpdk/pmd.rst | 7 ++++++- NEWS | 2 ++ lib/dpif-netdev.c | 11 ++++++++++- tests/pmd.at | 12 ++++++------ vswitchd/vswitch.xml | 2 +- 5 files changed, 25 insertions(+), 9 deletions(-) diff --git a/Documentation/topics/dpdk/pmd.rst b/Documentation/topics/dpdk/pmd.rst index e70986d16b2..9e014ec7b15 100644 --- a/Documentation/topics/dpdk/pmd.rst +++ b/Documentation/topics/dpdk/pmd.rst @@ -334,7 +334,12 @@ when there is no load or very-low load on all the Rx queues they poll. This can be enabled by setting the max requested sleep time (in microseconds) for a PMD thread:: - $ ovs-vsctl set open_vswitch . other_config:pmd-maxsleep=50 + $ ovs-vsctl set open_vswitch . other_config:pmd-sleep-max=50 + +.. note:: + + Previous config name 'pmd-maxsleep' is deprecated and will be removed in a + future release. With a non-zero max value a PMD may request to sleep by an incrementing amount of time up to the maximum time. If at any point the threshold of at least half diff --git a/NEWS b/NEWS index bda41ad4c53..469691e912d 100644 --- a/NEWS +++ b/NEWS @@ -48,6 +48,8 @@ Post-v3.1.0 * IP and L4 checksum offload support is now enabled by default for interfaces that support it. 
See the 'status' column in the 'interface' table to check the status. + * 'pmd-maxsleep' other_config was renamed to 'pmd-sleep-max'. + 'pmd-maxsleep' is deprecated and will be removed in a future release. - Linux TC offload: * Add support for offloading VXLAN tunnels with the GBP extensions. - Python diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index ab493f9d478..9b11914b3ab 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -4982,7 +4982,16 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config) set_pmd_auto_lb(dp, autolb_state, log_autolb); - pmd_max_sleep = smap_get_ullong(other_config, "pmd-maxsleep", 0); + pmd_max_sleep = smap_get_ullong(other_config, "pmd-maxsleep", UINT64_MAX); + if (pmd_max_sleep != UINT64_MAX) { + VLOG_WARN("pmd-maxsleep is deprecated. " + "Please use pmd-sleep-max instead."); + } else { + pmd_max_sleep = 0; + } + + pmd_max_sleep = smap_get_ullong(other_config, "pmd-sleep-max", + pmd_max_sleep); pmd_max_sleep = MIN(PMD_RCU_QUIESCE_INTERVAL, pmd_max_sleep); atomic_read_relaxed(&dp->pmd_max_sleep, &cur_pmd_max_sleep); if (first_set_config || pmd_max_sleep != cur_pmd_max_sleep) { diff --git a/tests/pmd.at b/tests/pmd.at index 48f3d432d22..374ad7217a8 100644 --- a/tests/pmd.at +++ b/tests/pmd.at @@ -1265,36 +1265,36 @@ OVS_WAIT_UNTIL([tail ovs-vswitchd.log | grep "PMD load based sleeps are disabled dnl Check low value max sleep get_log_next_line_num -AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-maxsleep="1"]) +AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-sleep-max="1"]) OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD max sleep request is 1 usecs."]) OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD load based sleeps are enabled."]) dnl Check high value max sleep get_log_next_line_num -AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-maxsleep="10000"]) +AT_CHECK([ovs-vsctl set open_vswitch . 
other_config:pmd-sleep-max="10000"]) OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD max sleep request is 10000 usecs."]) OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD load based sleeps are enabled."]) dnl Check setting max sleep to zero get_log_next_line_num -AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-maxsleep="0"]) +AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-sleep-max="0"]) OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD max sleep request is 0 usecs."]) OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD load based sleeps are disabled."]) dnl Check above high value max sleep get_log_next_line_num -AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-maxsleep="10001"]) +AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-sleep-max="10001"]) OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD max sleep request is 10000 usecs."]) OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD load based sleeps are enabled."]) dnl Check rounding get_log_next_line_num -AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-maxsleep="490"]) +AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-sleep-max="490"]) OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD max sleep request is 490 usecs."]) OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD load based sleeps are enabled."]) dnl Check rounding get_log_next_line_num -AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-maxsleep="499"]) +AT_CHECK([ovs-vsctl set open_vswitch . 
other_config:pmd-sleep-max="499"]) OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD max sleep request is 499 usecs."]) OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD load based sleeps are enabled."]) diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 01408e90a40..cfcde34ffed 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -802,7 +802,7 @@ The default value is 25%.

      -

      From 395668a68dfbd404942c9594ed48c4c31fc940bd Mon Sep 17 00:00:00 2001 From: Kevin Traynor Date: Fri, 14 Jul 2023 19:06:34 +0100 Subject: [PATCH 314/833] pmd.at: Add macro for checking pmd sleep max time and state. This is just cosmetic. There is no change to the tests. Reviewed-by: David Marchand Signed-off-by: Kevin Traynor Signed-off-by: Ilya Maximets --- tests/pmd.at | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/tests/pmd.at b/tests/pmd.at index 374ad7217a8..4dd775bd3fd 100644 --- a/tests/pmd.at +++ b/tests/pmd.at @@ -60,6 +60,22 @@ m4_define([CHECK_PMD_THREADS_CREATED], [ fi ]) +dnl CHECK_DP_SLEEP_MAX([max_sleep], [enabled], [+line]) +dnl +dnl Checks correct pmd load based sleep is set for the datapath. +dnl Checking starts from line number 'line' in ovs-vswitchd.log . +m4_define([CHECK_DP_SLEEP_MAX], [ + SLEEP_TIME="PMD max sleep request is $1 usecs." + SLEEP_STATE="PMD load based sleeps are $2." + line_st=$3 + if [[ -z "$line_st" ]] + then + line_st="+0" + fi + OVS_WAIT_UNTIL([tail -n $line_st ovs-vswitchd.log | grep "$SLEEP_TIME"]) + OVS_WAIT_UNTIL([tail -n $line_st ovs-vswitchd.log | grep "$SLEEP_STATE"]) +]) + m4_define([SED_NUMA_CORE_PATTERN], ["s/\(numa_id \)[[0-9]]*\( core_id \)[[0-9]]*:/\1\2:/"]) m4_define([DUMMY_NUMA], [--dummy-numa="0,0,0,0"]) @@ -1255,48 +1271,41 @@ ovs-appctl: ovs-vswitchd: server returned an error OVS_VSWITCHD_STOP AT_CLEANUP -dnl Check default state AT_SETUP([PMD - pmd sleep]) OVS_VSWITCHD_START dnl Check default -OVS_WAIT_UNTIL([tail ovs-vswitchd.log | grep "PMD max sleep request is 0 usecs."]) -OVS_WAIT_UNTIL([tail ovs-vswitchd.log | grep "PMD load based sleeps are disabled."]) +CHECK_DP_SLEEP_MAX([0], [disabled], []) dnl Check low value max sleep get_log_next_line_num AT_CHECK([ovs-vsctl set open_vswitch . 
other_config:pmd-sleep-max="1"]) -OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD max sleep request is 1 usecs."]) -OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD load based sleeps are enabled."]) +CHECK_DP_SLEEP_MAX([1], [enabled], [+$LINENUM]) dnl Check high value max sleep get_log_next_line_num AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-sleep-max="10000"]) -OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD max sleep request is 10000 usecs."]) -OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD load based sleeps are enabled."]) +CHECK_DP_SLEEP_MAX([10000], [enabled], [+$LINENUM]) dnl Check setting max sleep to zero get_log_next_line_num AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-sleep-max="0"]) -OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD max sleep request is 0 usecs."]) -OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD load based sleeps are disabled."]) +CHECK_DP_SLEEP_MAX([0], [disabled], [+$LINENUM]) dnl Check above high value max sleep get_log_next_line_num AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-sleep-max="10001"]) -OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD max sleep request is 10000 usecs."]) -OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD load based sleeps are enabled."]) +CHECK_DP_SLEEP_MAX([10000], [enabled], [+$LINENUM]) dnl Check rounding get_log_next_line_num AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-sleep-max="490"]) -OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD max sleep request is 490 usecs."]) -OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD load based sleeps are enabled."]) +CHECK_DP_SLEEP_MAX([490], [enabled], [+$LINENUM]) + dnl Check rounding get_log_next_line_num AT_CHECK([ovs-vsctl set open_vswitch . 
other_config:pmd-sleep-max="499"]) -OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD max sleep request is 499 usecs."]) -OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep "PMD load based sleeps are enabled."]) +CHECK_DP_SLEEP_MAX([499], [enabled], [+$LINENUM]) OVS_VSWITCHD_STOP AT_CLEANUP From bc6a6f82e54fc75c2e746423b02f1cfd8065db4e Mon Sep 17 00:00:00 2001 From: Kevin Traynor Date: Fri, 14 Jul 2023 19:06:35 +0100 Subject: [PATCH 315/833] dpif-netdev: Add pmd-sleep-show command. Max requested sleep time and status for a PMD thread is logged at start up or when changed, but it can be convenient to have a command to dump this information explicitly. It is envisaged that this will be expanded for individual pmds in the future, hence adding to dpif_netdev_pmd_info(). Reviewed-by: David Marchand Signed-off-by: Kevin Traynor Signed-off-by: Ilya Maximets --- Documentation/topics/dpdk/pmd.rst | 4 ++++ NEWS | 2 ++ lib/dpif-netdev.c | 23 +++++++++++++++++++---- tests/pmd.at | 22 ++++++++++++++++++++++ 4 files changed, 47 insertions(+), 4 deletions(-) diff --git a/Documentation/topics/dpdk/pmd.rst b/Documentation/topics/dpdk/pmd.rst index 9e014ec7b15..affd64cc9aa 100644 --- a/Documentation/topics/dpdk/pmd.rst +++ b/Documentation/topics/dpdk/pmd.rst @@ -353,6 +353,10 @@ and can differ significantly depending on system configuration. The actual time not processing packets will be determined by the sleep and processor wake-up times and should be tested with each system configuration. +The current configuration of the PMD load based sleeping can be shown with:: + + $ ovs-appctl dpif-netdev/pmd-sleep-show + Sleep time statistics for 10 secs can be seen with:: $ ovs-appctl dpif-netdev/pmd-stats-clear \ diff --git a/NEWS b/NEWS index 469691e912d..a890ed935cc 100644 --- a/NEWS +++ b/NEWS @@ -50,6 +50,8 @@ Post-v3.1.0 table to check the status. * 'pmd-maxsleep' other_config was renamed to 'pmd-sleep-max'. 
'pmd-maxsleep' is deprecated and will be removed in a future release. + * 'ovs-appctl dpif-netdev/pmd-sleep-show' command was added to get the + max sleep configuration of PMD thread cores. - Linux TC offload: * Add support for offloading VXLAN tunnels with the GBP extensions. - Python diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 9b11914b3ab..0b623fcea45 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -701,6 +701,7 @@ enum pmd_info_type { PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */ PMD_INFO_SHOW_RXQ, /* Show poll lists of pmd threads. */ PMD_INFO_PERF_SHOW, /* Show pmd performance details. */ + PMD_INFO_SLEEP_SHOW, /* Show max sleep configuration details. */ }; static void @@ -1441,7 +1442,9 @@ dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[], unsigned int secs = 0; unsigned long long max_secs = (PMD_INTERVAL_LEN * PMD_INTERVAL_MAX) / INTERVAL_USEC_TO_SEC; - bool first_show_rxq = true; + uint64_t default_max_sleep = 0; + bool show_header = true; + ovs_mutex_lock(&dp_netdev_mutex); @@ -1489,7 +1492,7 @@ dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[], continue; } if (type == PMD_INFO_SHOW_RXQ) { - if (first_show_rxq) { + if (show_header) { if (!secs || secs > max_secs) { secs = max_secs; } else { @@ -1498,7 +1501,7 @@ dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[], } ds_put_format(&reply, "Displaying last %u seconds " "pmd usage %%\n", secs); - first_show_rxq = false; + show_header = false; } pmd_info_show_rxq(&reply, pmd, secs); } else if (type == PMD_INFO_CLEAR_STATS) { @@ -1507,6 +1510,14 @@ dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[], pmd_info_show_stats(&reply, pmd); } else if (type == PMD_INFO_PERF_SHOW) { pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux); + } else if (type == PMD_INFO_SLEEP_SHOW) { + if (show_header) { + atomic_read_relaxed(&dp->pmd_max_sleep, &default_max_sleep); + 
ds_put_format(&reply, "Default max sleep: %4"PRIu64" us", + default_max_sleep); + ds_put_cstr(&reply, "\n"); + show_header = false; + } } } free(pmd_list); @@ -1607,7 +1618,8 @@ dpif_netdev_init(void) { static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS, clear_aux = PMD_INFO_CLEAR_STATS, - poll_aux = PMD_INFO_SHOW_RXQ; + poll_aux = PMD_INFO_SHOW_RXQ, + sleep_aux = PMD_INFO_SLEEP_SHOW; unixctl_command_register("dpif-netdev/pmd-stats-show", "[-pmd core] [dp]", 0, 3, dpif_netdev_pmd_info, @@ -1619,6 +1631,9 @@ dpif_netdev_init(void) "[-secs secs] [dp]", 0, 5, dpif_netdev_pmd_info, (void *)&poll_aux); + unixctl_command_register("dpif-netdev/pmd-sleep-show", "[dp]", + 0, 1, dpif_netdev_pmd_info, + (void *)&sleep_aux); unixctl_command_register("dpif-netdev/pmd-perf-show", "[-nh] [-it iter-history-len]" " [-ms ms-history-len]" diff --git a/tests/pmd.at b/tests/pmd.at index 4dd775bd3fd..7b1652595f7 100644 --- a/tests/pmd.at +++ b/tests/pmd.at @@ -1277,35 +1277,57 @@ OVS_VSWITCHD_START dnl Check default CHECK_DP_SLEEP_MAX([0], [disabled], []) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 0 us +]) + dnl Check low value max sleep get_log_next_line_num AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-sleep-max="1"]) CHECK_DP_SLEEP_MAX([1], [enabled], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 1 us +]) dnl Check high value max sleep get_log_next_line_num AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-sleep-max="10000"]) CHECK_DP_SLEEP_MAX([10000], [enabled], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 10000 us +]) dnl Check setting max sleep to zero get_log_next_line_num AT_CHECK([ovs-vsctl set open_vswitch . 
other_config:pmd-sleep-max="0"]) CHECK_DP_SLEEP_MAX([0], [disabled], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 0 us +]) dnl Check above high value max sleep get_log_next_line_num AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-sleep-max="10001"]) CHECK_DP_SLEEP_MAX([10000], [enabled], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 10000 us +]) dnl Check rounding get_log_next_line_num AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-sleep-max="490"]) CHECK_DP_SLEEP_MAX([490], [enabled], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 490 us +]) dnl Check rounding get_log_next_line_num AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-sleep-max="499"]) CHECK_DP_SLEEP_MAX([499], [enabled], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 499 us +]) OVS_VSWITCHD_STOP AT_CLEANUP From ef4883a8df23e4a713fafafc971fad0372996059 Mon Sep 17 00:00:00 2001 From: Kevin Traynor Date: Fri, 14 Jul 2023 19:06:36 +0100 Subject: [PATCH 316/833] dpif-netdev: Remove pmd-sleep-max experimental tag. Reviewed-by: David Marchand Signed-off-by: Kevin Traynor Signed-off-by: Ilya Maximets --- Documentation/topics/dpdk/pmd.rst | 4 ++-- NEWS | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Documentation/topics/dpdk/pmd.rst b/Documentation/topics/dpdk/pmd.rst index affd64cc9aa..f43819be041 100644 --- a/Documentation/topics/dpdk/pmd.rst +++ b/Documentation/topics/dpdk/pmd.rst @@ -324,8 +324,8 @@ A user can use this option to set a minimum frequency of Rx queue to PMD reassignment due to PMD Auto Load Balance. For example, this could be set (in min) such that a reassignment is triggered at most every few hours. 
-PMD load based sleeping (Experimental) --------------------------------------- +PMD load based sleeping +----------------------- PMD threads constantly poll Rx queues which are assigned to them. In order to reduce the CPU cycles they use, they can sleep for small periods of time diff --git a/NEWS b/NEWS index a890ed935cc..6a1bc1cf3f3 100644 --- a/NEWS +++ b/NEWS @@ -52,6 +52,7 @@ Post-v3.1.0 'pmd-maxsleep' is deprecated and will be removed in a future release. * 'ovs-appctl dpif-netdev/pmd-sleep-show' command was added to get the max sleep configuration of PMD thread cores. + * Removed experimental tag from PMD load based sleeping. - Linux TC offload: * Add support for offloading VXLAN tunnels with the GBP extensions. - Python From 5392f89fed6af1643b9d7f846e10ca92dabac8ee Mon Sep 17 00:00:00 2001 From: Felix Huettner Date: Mon, 17 Jul 2023 11:06:53 +0200 Subject: [PATCH 317/833] relay: Allow setting probe interval. Previously it was not possible to set the probe interval for the connection from a relay to the backing ovsdb-server. With this change it is now possible using the `ovsdb-server/set-relay-source-probe-interval` command. Reviewed-by: Simon Horman Signed-off-by: Felix Huettner Signed-off-by: Ilya Maximets --- NEWS | 3 +++ ovsdb/ovsdb-server.c | 30 +++++++++++++++++++++++++++++- ovsdb/relay.c | 15 ++++++++++++++- ovsdb/relay.h | 8 +++++++- 4 files changed, 53 insertions(+), 3 deletions(-) diff --git a/NEWS b/NEWS index 6a1bc1cf3f3..19b1fef1aaa 100644 --- a/NEWS +++ b/NEWS @@ -10,6 +10,9 @@ Post-v3.1.0 conversion operation is present. For the cluster service model follow upgrade instructions in 'Upgrading from version 3.1 and earlier to 3.2 and later' section of ovsdb(7). + * When ovsdb-server is running in relay mode, the probe interval is + now configurable via 'ovsdb-server/set-relay-source-probe-interval' + unixctl command. 
- IPFIX template and statistics intervals can now be configured through two new options in the IPFIX table: 'template_interval' and 'stats_interval'. - Linux kernel datapath: diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c index 9bad0c8ddf2..8e623118b10 100644 --- a/ovsdb/ovsdb-server.c +++ b/ovsdb/ovsdb-server.c @@ -94,6 +94,7 @@ static unixctl_cb_func ovsdb_server_get_active_ovsdb_server; static unixctl_cb_func ovsdb_server_connect_active_ovsdb_server; static unixctl_cb_func ovsdb_server_disconnect_active_ovsdb_server; static unixctl_cb_func ovsdb_server_set_active_ovsdb_server_probe_interval; +static unixctl_cb_func ovsdb_server_set_relay_source_interval; static unixctl_cb_func ovsdb_server_set_sync_exclude_tables; static unixctl_cb_func ovsdb_server_get_sync_exclude_tables; static unixctl_cb_func ovsdb_server_get_sync_status; @@ -107,6 +108,7 @@ struct server_config { char **sync_exclude; bool *is_backup; int *replication_probe_interval; + int *relay_source_probe_interval; struct ovsdb_jsonrpc_server *jsonrpc; }; static unixctl_cb_func ovsdb_server_add_remote; @@ -328,6 +330,7 @@ main(int argc, char *argv[]) struct shash all_dbs; struct shash_node *node; int replication_probe_interval = REPLICATION_DEFAULT_PROBE_INTERVAL; + int relay_source_probe_interval = RELAY_SOURCE_DEFAULT_PROBE_INTERVAL; ovs_cmdl_proctitle_init(argc, argv); set_program_name(argv[0]); @@ -377,6 +380,7 @@ main(int argc, char *argv[]) server_config.sync_exclude = &sync_exclude; server_config.is_backup = &is_backup; server_config.replication_probe_interval = &replication_probe_interval; + server_config.relay_source_probe_interval = &relay_source_probe_interval; perf_counters_init(); @@ -472,6 +476,9 @@ main(int argc, char *argv[]) unixctl_command_register( "ovsdb-server/set-active-ovsdb-server-probe-interval", "", 1, 1, ovsdb_server_set_active_ovsdb_server_probe_interval, &server_config); + unixctl_command_register( + "ovsdb-server/set-relay-source-probe-interval", "", 1, 1, + 
ovsdb_server_set_relay_source_interval, &server_config); unixctl_command_register("ovsdb-server/set-sync-exclude-tables", "", 0, 1, ovsdb_server_set_sync_exclude_tables, &server_config); @@ -797,7 +804,8 @@ open_db(struct server_config *config, const char *filename) add_db(config, db); if (is_relay) { - ovsdb_relay_add_db(db->db, relay_remotes, update_schema, config); + ovsdb_relay_add_db(db->db, relay_remotes, update_schema, config, + *config->relay_source_probe_interval); } return NULL; } @@ -1480,6 +1488,26 @@ ovsdb_server_set_active_ovsdb_server_probe_interval(struct unixctl_conn *conn, } } +static void +ovsdb_server_set_relay_source_interval(struct unixctl_conn *conn, + int argc OVS_UNUSED, + const char *argv[], + void *config_) +{ + struct server_config *config = config_; + int probe_interval; + + if (str_to_int(argv[1], 10, &probe_interval)) { + *config->relay_source_probe_interval = probe_interval; + save_config(config); + ovsdb_relay_set_probe_interval(probe_interval); + unixctl_command_reply(conn, NULL); + } else { + unixctl_command_reply_error( + conn, "Invalid probe interval, integer value expected"); + } +} + static void ovsdb_server_set_sync_exclude_tables(struct unixctl_conn *conn, int argc OVS_UNUSED, diff --git a/ovsdb/relay.c b/ovsdb/relay.c index 377f3285f61..b035cb49210 100644 --- a/ovsdb/relay.c +++ b/ovsdb/relay.c @@ -127,7 +127,7 @@ static struct ovsdb_cs_ops relay_cs_ops = { void ovsdb_relay_add_db(struct ovsdb *db, const char *remote, schema_change_callback schema_change_cb, - void *schema_change_aux) + void *schema_change_aux, int probe_interval) { struct relay_ctx *ctx; @@ -152,10 +152,23 @@ ovsdb_relay_add_db(struct ovsdb *db, const char *remote, shash_add(&relay_dbs, db->name, ctx); ovsdb_cs_set_leader_only(ctx->cs, false); ovsdb_cs_set_remote(ctx->cs, remote, true); + ovsdb_cs_set_probe_interval(ctx->cs, probe_interval); VLOG_DBG("added database: %s, %s", db->name, remote); } +/* Updates the probe interval for all relay connections to 
the specified + * value. */ +void +ovsdb_relay_set_probe_interval(int probe_interval) +{ + struct shash_node *node; + SHASH_FOR_EACH (node, &relay_dbs) { + struct relay_ctx *ctx = node->data; + ovsdb_cs_set_probe_interval(ctx->cs, probe_interval); + } +} + void ovsdb_relay_del_db(struct ovsdb *db) { diff --git a/ovsdb/relay.h b/ovsdb/relay.h index f841554ca9e..218caad65de 100644 --- a/ovsdb/relay.h +++ b/ovsdb/relay.h @@ -19,11 +19,15 @@ #include +#include "reconnect.h" + struct json; struct ovsdb; struct ovsdb_schema; struct uuid; +#define RELAY_SOURCE_DEFAULT_PROBE_INTERVAL RECONNECT_DEFAULT_PROBE_INTERVAL + typedef struct ovsdb_error *(*schema_change_callback)( struct ovsdb *, const struct ovsdb_schema *, @@ -33,11 +37,13 @@ typedef struct ovsdb_error *(*schema_change_callback)( void ovsdb_relay_add_db(struct ovsdb *, const char *remote, schema_change_callback schema_change_cb, - void *schema_change_aux); + void *schema_change_aux, int probe_interval); void ovsdb_relay_del_db(struct ovsdb *); void ovsdb_relay_run(void); void ovsdb_relay_wait(void); +void ovsdb_relay_set_probe_interval(int probe_interval); + bool ovsdb_relay_is_connected(struct ovsdb *); #endif /* OVSDB_RELAY_H */ From 1ef3f4f78ada08f66bab7ca0bbe8dec4337423c3 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 17 Jul 2023 19:47:43 +0200 Subject: [PATCH 318/833] AUTHORS: Add Felix Huettner. 
Signed-off-by: Ilya Maximets --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 9657aa710e3..9186e1ad227 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -166,6 +166,7 @@ Eziz Durdyyev ezizdurdy@gmail.com Fabrizio D'Angelo fdangelo@redhat.com Faicker Mo faicker.mo@ucloud.cn Fangrui Song maskray@google.com +Felix Huettner felix.huettner@mail.schwarz Fengqi Li lifengqi@inspur.com Flavio Fernandes flavio@flaviof.com Flavio Leitner fbl@redhat.com From 6240c0b4c80ea3d8dd1bf77526b04b55742de2ce Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Mon, 17 Jul 2023 10:08:11 +0200 Subject: [PATCH 319/833] netdev: Add netdev_get_speed() to netdev API. Currently, the netdev's speed is being calculated by taking the link's feature bits (using netdev_get_features()) and transforming them into bps. This mechanism can be both inaccurate and difficult to maintain, mainly because we currently use the feature bits supported by OpenFlow which would have to be extended to support all new feature bits of all netdev implementations while keeping the OpenFlow API intact. In order to expose the link speed accurately for all current and future hardware, add a new netdev API call that allows the implementations to provide the current and maximum link speeds in Mbps. Internally, the logic to get the maximum supported speed still relies on feature bits so it might still get out of sync in the future. However, the maximum configurable speed is not used as much as the current speed and these feature bits are not exposed through the netdev interface so it should be easier to add more. Use this new function instead of netdev_get_features() where the link speed is needed. As a consequence of this patch, link speeds of cards are properly reported (internally in OVSDB) even if not supported by OpenFlow. A test verifies this behavior using a tap device. Also, in order to avoid using the old API, this patch adds a checkpatch.py warning if the old API is used.
Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2137567 Acked-by: Eelco Chaudron Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- include/openvswitch/netdev.h | 1 + lib/dpif.h | 4 ++- lib/netdev-bsd.c | 22 +++++++++++++++ lib/netdev-dpdk.c | 52 ++++++++++++++++++++++++++++++++++++ lib/netdev-linux-private.h | 1 + lib/netdev-linux.c | 46 ++++++++++++++++++++++++------- lib/netdev-provider.h | 9 +++++++ lib/netdev.c | 30 +++++++++++++++++++++ ofproto/ofproto-dpif-sflow.c | 11 ++++++-- ofproto/ofproto.c | 6 +++-- tests/atlocal.in | 3 +++ tests/system-interface.at | 30 +++++++++++++++++++++ utilities/checkpatch.py | 11 +++++--- vswitchd/bridge.c | 30 +++++++++++++-------- 14 files changed, 228 insertions(+), 28 deletions(-) diff --git a/include/openvswitch/netdev.h b/include/openvswitch/netdev.h index cafd6fd7bee..83e8633dda6 100644 --- a/include/openvswitch/netdev.h +++ b/include/openvswitch/netdev.h @@ -132,6 +132,7 @@ int netdev_get_features(const struct netdev *, enum netdev_features *advertised, enum netdev_features *supported, enum netdev_features *peer); +int netdev_get_speed(const struct netdev *, uint32_t *current, uint32_t *max); uint64_t netdev_features_to_bps(enum netdev_features features, uint64_t default_bps); bool netdev_features_is_full_duplex(enum netdev_features features); diff --git a/lib/dpif.h b/lib/dpif.h index 129cbf6a1d5..9e9d0aa1b0a 100644 --- a/lib/dpif.h +++ b/lib/dpif.h @@ -91,7 +91,9 @@ * * - Carrier status (netdev_get_carrier()). * - * - Speed (netdev_get_features()). + * - Link features (netdev_get_features()). + * + * - Speed (netdev_get_speed()). * * - QoS queue configuration (netdev_get_queue(), netdev_set_queue() and * related functions.) 
diff --git a/lib/netdev-bsd.c b/lib/netdev-bsd.c index 7875636cc3c..8596741aa17 100644 --- a/lib/netdev-bsd.c +++ b/lib/netdev-bsd.c @@ -1168,6 +1168,27 @@ netdev_bsd_get_features(const struct netdev *netdev, return error; } +static int +netdev_bsd_get_speed(const struct netdev *netdev, uint32_t *current, + uint32_t *max) +{ + enum netdev_features f_current, f_supported, f_advertised, f_peer; + int error; + + error = netdev_bsd_get_features(netdev, &f_current, &f_advertised, + &f_supported, &f_peer); + if (error) { + return error; + } + + *current = MIN(UINT32_MAX, + netdev_features_to_bps(f_current, 0) / 1000000ULL); + *max = MIN(UINT32_MAX, + netdev_features_to_bps(f_supported, 0) / 1000000ULL); + + return 0; +} + /* * Assigns 'addr' as 'netdev''s IPv4 address and 'mask' as its netmask. If * 'addr' is INADDR_ANY, 'netdev''s IPv4 address is cleared. Returns a @@ -1493,6 +1514,7 @@ netdev_bsd_update_flags(struct netdev *netdev_, enum netdev_flags off, .get_carrier = netdev_bsd_get_carrier, \ .get_stats = netdev_bsd_get_stats, \ .get_features = netdev_bsd_get_features, \ + .get_speed = netdev_bsd_get_speed, \ .set_in4 = netdev_bsd_set_in4, \ .get_addr_list = netdev_bsd_get_addr_list, \ .get_next_hop = netdev_bsd_get_next_hop, \ diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index aa87ee5468e..64d3b1df4c2 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -3686,6 +3686,57 @@ netdev_dpdk_get_features(const struct netdev *netdev, return 0; } +static int +netdev_dpdk_get_speed(const struct netdev *netdev, uint32_t *current, + uint32_t *max) +{ + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); + struct rte_eth_dev_info dev_info; + struct rte_eth_link link; + + ovs_mutex_lock(&dev->mutex); + link = dev->link; + rte_eth_dev_info_get(dev->port_id, &dev_info); + ovs_mutex_unlock(&dev->mutex); + + *current = link.link_speed != RTE_ETH_SPEED_NUM_UNKNOWN + ? 
link.link_speed : 0; + + if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_200G) { + *max = RTE_ETH_SPEED_NUM_200G; + } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_100G) { + *max = RTE_ETH_SPEED_NUM_100G; + } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_56G) { + *max = RTE_ETH_SPEED_NUM_56G; + } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_50G) { + *max = RTE_ETH_SPEED_NUM_50G; + } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_40G) { + *max = RTE_ETH_SPEED_NUM_40G; + } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_25G) { + *max = RTE_ETH_SPEED_NUM_25G; + } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_20G) { + *max = RTE_ETH_SPEED_NUM_20G; + } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_10G) { + *max = RTE_ETH_SPEED_NUM_10G; + } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_5G) { + *max = RTE_ETH_SPEED_NUM_5G; + } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_2_5G) { + *max = RTE_ETH_SPEED_NUM_2_5G; + } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_1G) { + *max = RTE_ETH_SPEED_NUM_1G; + } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_100M || + dev_info.speed_capa & RTE_ETH_LINK_SPEED_100M_HD) { + *max = RTE_ETH_SPEED_NUM_100M; + } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_10M || + dev_info.speed_capa & RTE_ETH_LINK_SPEED_10M_HD) { + *max = RTE_ETH_SPEED_NUM_10M; + } else { + *max = 0; + } + + return 0; +} + static struct ingress_policer * netdev_dpdk_policer_construct(uint32_t rate, uint32_t burst) { @@ -6332,6 +6383,7 @@ parse_vhost_config(const struct smap *ovs_other_config) .get_stats = netdev_dpdk_get_stats, \ .get_custom_stats = netdev_dpdk_get_custom_stats, \ .get_features = netdev_dpdk_get_features, \ + .get_speed = netdev_dpdk_get_speed, \ .get_status = netdev_dpdk_get_status, \ .reconfigure = netdev_dpdk_reconfigure, \ .rxq_recv = netdev_dpdk_rxq_recv diff --git a/lib/netdev-linux-private.h b/lib/netdev-linux-private.h index deb015bdb80..0ecf0f748f9 100644 --- a/lib/netdev-linux-private.h +++ 
b/lib/netdev-linux-private.h @@ -92,6 +92,7 @@ struct netdev_linux { enum netdev_features current; /* Cached from ETHTOOL_GSET. */ enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */ enum netdev_features supported; /* Cached from ETHTOOL_GSET. */ + uint32_t current_speed; /* Cached from ETHTOOL_GSET. */ struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */ struct tc *tc; diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 3dba2ef1fe4..599745da44e 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -2382,7 +2382,6 @@ static void netdev_linux_read_features(struct netdev_linux *netdev) { struct ethtool_cmd ecmd; - uint32_t speed; int error; if (netdev->cache_valid & VALID_FEATURES) { @@ -2496,20 +2495,20 @@ netdev_linux_read_features(struct netdev_linux *netdev) } /* Current settings. */ - speed = ethtool_cmd_speed(&ecmd); - if (speed == SPEED_10) { + netdev->current_speed = ethtool_cmd_speed(&ecmd); + if (netdev->current_speed == SPEED_10) { netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD; - } else if (speed == SPEED_100) { + } else if (netdev->current_speed == SPEED_100) { netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD; - } else if (speed == SPEED_1000) { + } else if (netdev->current_speed == SPEED_1000) { netdev->current = ecmd.duplex ? 
NETDEV_F_1GB_FD : NETDEV_F_1GB_HD; - } else if (speed == SPEED_10000) { + } else if (netdev->current_speed == SPEED_10000) { netdev->current = NETDEV_F_10GB_FD; - } else if (speed == 40000) { + } else if (netdev->current_speed == 40000) { netdev->current = NETDEV_F_40GB_FD; - } else if (speed == 100000) { + } else if (netdev->current_speed == 100000) { netdev->current = NETDEV_F_100GB_FD; - } else if (speed == 1000000) { + } else if (netdev->current_speed == 1000000) { netdev->current = NETDEV_F_1TB_FD; } else { netdev->current = 0; @@ -2563,6 +2562,33 @@ netdev_linux_get_features(const struct netdev *netdev_, return error; } +static int +netdev_linux_get_speed(const struct netdev *netdev_, uint32_t *current, + uint32_t *max) +{ + struct netdev_linux *netdev = netdev_linux_cast(netdev_); + int error; + + ovs_mutex_lock(&netdev->mutex); + if (netdev_linux_netnsid_is_remote(netdev)) { + error = EOPNOTSUPP; + goto exit; + } + + netdev_linux_read_features(netdev); + if (!netdev->get_features_error) { + *current = netdev->current_speed == SPEED_UNKNOWN + ? 0 : netdev->current_speed; + *max = MIN(UINT32_MAX, + netdev_features_to_bps(netdev->supported, 0) / 1000000ULL); + } + error = netdev->get_features_error; + +exit: + ovs_mutex_unlock(&netdev->mutex); + return error; +} + /* Set the features advertised by 'netdev' to 'advertise'. 
*/ static int netdev_linux_set_advertisements(struct netdev *netdev_, @@ -3697,6 +3723,7 @@ const struct netdev_class netdev_linux_class = { .destruct = netdev_linux_destruct, .get_stats = netdev_linux_get_stats, .get_features = netdev_linux_get_features, + .get_speed = netdev_linux_get_speed, .get_status = netdev_linux_get_status, .get_block_id = netdev_linux_get_block_id, .send = netdev_linux_send, @@ -3713,6 +3740,7 @@ const struct netdev_class netdev_tap_class = { .destruct = netdev_linux_destruct, .get_stats = netdev_tap_get_stats, .get_features = netdev_linux_get_features, + .get_speed = netdev_linux_get_speed, .get_status = netdev_linux_get_status, .send = netdev_linux_send, .rxq_construct = netdev_linux_rxq_construct, diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h index b5420947d0c..a7393c7cecf 100644 --- a/lib/netdev-provider.h +++ b/lib/netdev-provider.h @@ -500,6 +500,15 @@ struct netdev_class { enum netdev_features *supported, enum netdev_features *peer); + /* Stores the current and maximum supported link speed by 'netdev' into + * each of '*current' and '*max'. Each value represents the speed in Mbps. + * If any of the speeds is unknown, a zero value must be stored. + * + * This function may be set to null if it would always return EOPNOTSUPP. + */ + int (*get_speed)(const struct netdev *netdev, uint32_t *current, + uint32_t *max); + /* Set the features advertised by 'netdev' to 'advertise', which is a * set of NETDEV_F_* bits. * diff --git a/lib/netdev.c b/lib/netdev.c index 8df7f873715..e5ac7713d2e 100644 --- a/lib/netdev.c +++ b/lib/netdev.c @@ -1158,6 +1158,36 @@ netdev_get_features(const struct netdev *netdev, return error; } +int +netdev_get_speed(const struct netdev *netdev, uint32_t *current, uint32_t *max) +{ + uint32_t current_dummy, max_dummy; + int error; + + if (!current) { + current = &current_dummy; + } + if (!max) { + max = &max_dummy; + } + + error = netdev->netdev_class->get_speed + ?
netdev->netdev_class->get_speed(netdev, current, max) + : EOPNOTSUPP; + + if (error == EOPNOTSUPP) { + enum netdev_features current_f, supported_f; + + error = netdev_get_features(netdev, &current_f, NULL, + &supported_f, NULL); + *current = netdev_features_to_bps(current_f, 0) / 1000000; + *max = netdev_features_to_bps(supported_f, 0) / 1000000; + } else if (error) { + *current = *max = 0; + } + return error; +} + /* Returns the maximum speed of a network connection that has the NETDEV_F_* * bits in 'features', in bits per second. If no bits that indicate a speed * are set in 'features', returns 'default_bps'. */ diff --git a/ofproto/ofproto-dpif-sflow.c b/ofproto/ofproto-dpif-sflow.c index a405eb0563f..a3c83bac815 100644 --- a/ofproto/ofproto-dpif-sflow.c +++ b/ofproto/ofproto-dpif-sflow.c @@ -306,6 +306,7 @@ sflow_agent_get_counters(void *ds_, SFLPoller *poller, struct netdev_stats stats; enum netdev_flags flags; struct lacp_member_stats lacp_stats; + uint32_t curr_speed; const char *ifName; dsp = dpif_sflow_find_port(ds, u32_to_odp(poller->bridgePort)); @@ -320,13 +321,19 @@ sflow_agent_get_counters(void *ds_, SFLPoller *poller, if (!netdev_get_features(dsp->ofport->netdev, &current, NULL, NULL, NULL)) { /* The values of ifDirection come from MAU MIB (RFC 2668): 0 = unknown, 1 = full-duplex, 2 = half-duplex, 3 = in, 4=out */ - counters->ifSpeed = netdev_features_to_bps(current, 0); counters->ifDirection = (netdev_features_is_full_duplex(current) ? 1 : 2); } else { - counters->ifSpeed = 100000000; counters->ifDirection = 0; } + + netdev_get_speed(dsp->ofport->netdev, &curr_speed, NULL); + if (curr_speed) { + counters->ifSpeed = curr_speed * 1000000; + } else { + counters->ifSpeed = 100000000; + } + if (!netdev_get_flags(dsp->ofport->netdev, &flags) && flags & NETDEV_UP) { counters->ifStatus = 1; /* ifAdminStatus up.
*/ if (netdev_get_carrier(dsp->ofport->netdev)) { diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c index 11cc0c6f602..dbf4958bc24 100644 --- a/ofproto/ofproto.c +++ b/ofproto/ofproto.c @@ -2476,6 +2476,7 @@ ofport_open(struct ofproto *ofproto, struct ofputil_phy_port *pp, struct netdev **p_netdev) { + uint32_t curr_speed, max_speed; enum netdev_flags flags; struct netdev *netdev; int error; @@ -2514,8 +2515,9 @@ ofport_open(struct ofproto *ofproto, pp->state = netdev_get_carrier(netdev) ? 0 : OFPUTIL_PS_LINK_DOWN; netdev_get_features(netdev, &pp->curr, &pp->advertised, &pp->supported, &pp->peer); - pp->curr_speed = netdev_features_to_bps(pp->curr, 0) / 1000; - pp->max_speed = netdev_features_to_bps(pp->supported, 0) / 1000; + netdev_get_speed(netdev, &curr_speed, &max_speed); + pp->curr_speed = curr_speed * 1000; + pp->max_speed = max_speed * 1000; *p_netdev = netdev; return 0; diff --git a/tests/atlocal.in b/tests/atlocal.in index 18d5efae047..94b5c4d0b14 100644 --- a/tests/atlocal.in +++ b/tests/atlocal.in @@ -180,6 +180,9 @@ find_command tcpdump # Set HAVE_LFTP find_command lftp +# Set HAVE_ETHTOOL +find_command ethtool + CURL_OPT="-g -v --max-time 1 --retry 2 --retry-delay 1 --connect-timeout 1" # Determine whether "diff" supports "normal" diffs. (busybox diff does not.) 
diff --git a/tests/system-interface.at b/tests/system-interface.at index 3bf339582dd..148f011c7ee 100644 --- a/tests/system-interface.at +++ b/tests/system-interface.at @@ -122,3 +122,33 @@ AT_CHECK([ip link show | grep " genev_sys_[[0-9]]*: .* ovs-system " | diff -u - OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([interface - current speed]) +AT_SKIP_IF([test $HAVE_ETHTOOL = "no"]) +OVS_TRAFFIC_VSWITCHD_START() + +AT_CHECK([ip tuntap add tap0 mode tap]) +on_exit 'ip tuntap del tap0 mode tap' + +AT_CHECK([ip link set dev tap0 address aa:55:aa:55:00:01]) +AT_CHECK([ethtool -s tap0 speed 50000 duplex full]) +AT_CHECK([ip link set dev tap0 up]) + +AT_CHECK([ovs-vsctl add-port br0 tap0 -- set int tap0 type=tap]) + +AT_CHECK([ovs-ofctl -O OpenFlow15 -vwarn dump-ports-desc br0 tap0], [0], [stdout]) +AT_CHECK([strip_xids < stdout], [0], [dnl +OFPST_PORT_DESC reply (OF1.5): + 1(tap0): addr:aa:55:aa:55:00:01 + config: 0 + state: LIVE + current: COPPER + speed: 50000 Mbps now, 0 Mbps max +]) + +AT_CHECK([ovs-vsctl get interface tap0 link_speed], [0], [dnl +50000000000 +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP diff --git a/utilities/checkpatch.py b/utilities/checkpatch.py index 12bd153ee05..5c4aaefb374 100755 --- a/utilities/checkpatch.py +++ b/utilities/checkpatch.py @@ -671,18 +671,23 @@ def regex_warn_factory(description): easy_to_misuse_api = [ ('ovsrcu_barrier', - 'lib/ovs-rcu.c', + ['lib/ovs-rcu.c'], 'Are you sure you need to use ovsrcu_barrier(), ' 'in most cases ovsrcu_synchronize() will be fine?'), + ('netdev_features_to_bps', + ['lib/netdev.c', 'lib/netdev-bsd.c', 'lib/netdev-linux.c'], + 'Are you sure you need to use netdev_features_to_bps()? 
' + 'If you want to retrieve the current and/or maximum link speed, ' + 'consider using netdev_get_speed() instead.'), ] checks += [ {'regex': r'(\.c)(\.in)?$', - 'match_name': lambda x: x != location, + 'match_name': lambda x, loc=locations: x not in loc, 'prereq': lambda x: not is_comment_line(x), 'check': regex_function_factory(function_name), 'print': regex_warn_factory(description)} - for (function_name, location, description) in easy_to_misuse_api] + for (function_name, locations, description) in easy_to_misuse_api] def regex_operator_factory(operator): diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index b972d55d0b3..e9110c1d80d 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -1694,11 +1694,12 @@ port_configure_stp(const struct ofproto *ofproto, struct port *port, if (config_str) { port_s->path_cost = strtoul(config_str, NULL, 10); } else { - enum netdev_features current; - unsigned int mbps; + uint32_t mbps; - netdev_get_features(iface->netdev, &current, NULL, NULL, NULL); - mbps = netdev_features_to_bps(current, NETDEV_DEFAULT_BPS) / 1000000; + netdev_get_speed(iface->netdev, &mbps, NULL); + if (!mbps) { + mbps = NETDEV_DEFAULT_BPS / 1000000; + } port_s->path_cost = stp_convert_speed_to_cost(mbps); } @@ -1777,11 +1778,12 @@ port_configure_rstp(const struct ofproto *ofproto, struct port *port, if (config_str) { port_s->path_cost = strtoul(config_str, NULL, 10); } else { - enum netdev_features current; - unsigned int mbps; + uint32_t mbps; - netdev_get_features(iface->netdev, &current, NULL, NULL, NULL); - mbps = netdev_features_to_bps(current, NETDEV_DEFAULT_BPS) / 1000000; + netdev_get_speed(iface->netdev, &mbps, NULL); + if (!mbps) { + mbps = NETDEV_DEFAULT_BPS / 1000000; + } port_s->path_cost = rstp_convert_speed_to_cost(mbps); } @@ -2418,6 +2420,7 @@ iface_refresh_netdev_status(struct iface *iface) struct eth_addr mac; int64_t bps, mtu_64, ifindex64, link_resets; int mtu, error; + uint32_t mbps; if (iface_is_synthetic(iface)) { return; @@ -2456,14
+2459,19 @@ iface_refresh_netdev_status(struct iface *iface) ovsrec_interface_set_link_resets(iface->cfg, &link_resets, 1); error = netdev_get_features(iface->netdev, &current, NULL, NULL, NULL); - bps = !error ? netdev_features_to_bps(current, 0) : 0; - if (bps) { + if (!error) { ovsrec_interface_set_duplex(iface->cfg, netdev_features_is_full_duplex(current) ? "full" : "half"); - ovsrec_interface_set_link_speed(iface->cfg, &bps, 1); } else { ovsrec_interface_set_duplex(iface->cfg, NULL); + } + + netdev_get_speed(iface->netdev, &mbps, NULL); + if (mbps) { + bps = mbps * 1000000ULL; + ovsrec_interface_set_link_speed(iface->cfg, &bps, 1); + } else { ovsrec_interface_set_link_speed(iface->cfg, NULL, 0); } From b8f8fad8643518551cf742056ae8728c936674c6 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Mon, 17 Jul 2023 10:08:12 +0200 Subject: [PATCH 320/833] netdev-linux: Use speed as max rate in tc classes. Instead of relying on feature bits, use the speed value directly as maximum rate for htb and hfsc classes. There is still a limitation with the maximum rate that we can express with a 32-bit number in bytes/s (~ 34.3Gbps), but using the actual link speed instead of the feature bits, we can at least use an accurate maximum for some link speeds (such as 25Gbps) which are not supported by netdev's feature bits.
Acked-by: Eelco Chaudron Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- lib/netdev-linux.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 599745da44e..063a24254fd 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -4781,18 +4781,16 @@ htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id, } static void -htb_parse_qdisc_details__(struct netdev *netdev_, - const struct smap *details, struct htb_class *hc) +htb_parse_qdisc_details__(struct netdev *netdev, const struct smap *details, + struct htb_class *hc) { - struct netdev_linux *netdev = netdev_linux_cast(netdev_); - hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8; if (!hc->max_rate) { - enum netdev_features current; + uint32_t current_speed; - netdev_linux_read_features(netdev); - current = !netdev->get_features_error ? netdev->current : 0; - hc->max_rate = netdev_features_to_bps(current, NETDEV_DEFAULT_BPS) / 8; + netdev_get_speed(netdev, &current_speed, NULL); + hc->max_rate = current_speed ? current_speed / 8 * 1000000ULL + : NETDEV_DEFAULT_BPS / 8; } hc->min_rate = hc->max_rate; hc->burst = 0; @@ -5253,18 +5251,16 @@ hfsc_query_class__(const struct netdev *netdev, unsigned int handle, } static void -hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details, +hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details, struct hfsc_class *class) { - struct netdev_linux *netdev = netdev_linux_cast(netdev_); - uint32_t max_rate = smap_get_ullong(details, "max-rate", 0) / 8; if (!max_rate) { - enum netdev_features current; + uint32_t current_speed; - netdev_linux_read_features(netdev); - current = !netdev->get_features_error ? netdev->current : 0; - max_rate = netdev_features_to_bps(current, NETDEV_DEFAULT_BPS) / 8; + netdev_get_speed(netdev, &current_speed, NULL); + max_rate = current_speed ?
current_speed / 8 * 1000000ULL + : NETDEV_DEFAULT_BPS / 8; } class->min_rate = max_rate; From 7edfac5745872a6cabdc112bf25e93244096b946 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Mon, 17 Jul 2023 10:08:13 +0200 Subject: [PATCH 321/833] netdev-linux: Use 64bit rtab and burst calculations. tc uses these "rtab" tables to estimate the time (ticks) that it takes to send a packet of different sizes. In preparation for the introduction of 64-bit rates, add an argument to tc_put_rtab() to allow an external 64-bit rate. Also use 64bits for other burst buffer calculation functions. Acked-by: Eelco Chaudron Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- lib/netdev-linux.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 063a24254fd..128a46c5ec1 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -484,9 +484,9 @@ static const struct tc_ops *const tcs[] = { NULL }; -static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks); -static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size); -static unsigned int tc_buffer_per_jiffy(unsigned int rate); +static unsigned int tc_ticks_to_bytes(uint64_t rate, unsigned int ticks); +static unsigned int tc_bytes_to_ticks(uint64_t rate, unsigned int size); +static unsigned int tc_buffer_per_jiffy(uint64_t rate); static uint32_t tc_time_to_ticks(uint32_t time); static struct tcmsg *netdev_linux_tc_make_request(const struct netdev *, @@ -512,10 +512,11 @@ static int tc_del_qdisc(struct netdev *netdev); static int tc_query_qdisc(const struct netdev *netdev); void -tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate); +tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate, + uint64_t rate64); static int tc_calc_cell_log(unsigned int mtu); static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu); -static int 
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes); +static int tc_calc_buffer(uint64_t Bps, int mtu, uint64_t burst_bytes); /* This is set pretty low because we probably won't learn anything from the @@ -2723,7 +2724,7 @@ nl_msg_put_act_police(struct ofpbuf *request, struct tc_police *police, nl_msg_act_police_start_nest(request, ++prio, &offset, &act_offset, single_action); if (police->rate.rate) { - tc_put_rtab(request, TCA_POLICE_RATE, &police->rate); + tc_put_rtab(request, TCA_POLICE_RATE, &police->rate, 0); } if (pkts_rate) { uint64_t pkt_burst_ticks; @@ -4709,8 +4710,8 @@ htb_setup_class__(struct netdev *netdev, unsigned int handle, nl_msg_put_string(&request, TCA_KIND, "htb"); opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS); nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt); - tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate); - tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil); + tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate, 0); + tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil, 0); nl_msg_end_nested(&request, opt_offset); error = tc_transact(&request, NULL); @@ -6010,7 +6011,7 @@ read_psched(void) /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a * rate of 'rate' bytes per second. */ static unsigned int -tc_ticks_to_bytes(unsigned int rate, unsigned int ticks) +tc_ticks_to_bytes(uint64_t rate, unsigned int ticks) { read_psched(); return (rate * ticks) / ticks_per_s; @@ -6019,7 +6020,7 @@ tc_ticks_to_bytes(unsigned int rate, unsigned int ticks) /* Returns the number of ticks that it would take to transmit 'size' bytes at a * rate of 'rate' bytes per second. */ static unsigned int -tc_bytes_to_ticks(unsigned int rate, unsigned int size) +tc_bytes_to_ticks(uint64_t rate, unsigned int size) { read_psched(); return rate ? 
((unsigned long long int) ticks_per_s * size) / rate : 0; @@ -6028,7 +6029,7 @@ tc_bytes_to_ticks(unsigned int rate, unsigned int size) /* Returns the number of bytes that need to be reserved for qdisc buffering at * a transmission rate of 'rate' bytes per second. */ static unsigned int -tc_buffer_per_jiffy(unsigned int rate) +tc_buffer_per_jiffy(uint64_t rate) { read_psched(); return rate / buffer_hz; @@ -6391,15 +6392,19 @@ tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu) /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */ /* rate->cell_align = 0; */ /* distro headers. */ rate->mpu = ETH_TOTAL_MIN; - rate->rate = Bps; + rate->rate = MIN(UINT32_MAX, Bps); } /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink * attribute of the specified "type". * + * A 64-bit rate can be provided via 'rate64' in bps. + * If zero, the rate in 'rate' will be used. + * * See tc_calc_cell_log() above for a description of "rtab"s. */ void -tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate) +tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate, + uint64_t rate64) { uint32_t *rtab; unsigned int i; @@ -6410,7 +6415,7 @@ tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate) if (packet_size < rate->mpu) { packet_size = rate->mpu; } - rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size); + rtab[i] = tc_bytes_to_ticks(rate64 ? rate64 : rate->rate, packet_size); } } @@ -6419,7 +6424,7 @@ tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate) * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of * 0 is fine.) 
*/ static int -tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes) +tc_calc_buffer(uint64_t Bps, int mtu, uint64_t burst_bytes) { unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu; return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst)); From a86fea06fe7c81ba1966c0e36a45c78978655dce Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Mon, 17 Jul 2023 10:08:14 +0200 Subject: [PATCH 322/833] netdev-linux: Use 64-bit rates in htb tc classes. Currently, htb rates are capped at ~34Gbps because they are internally expressed as 32-bit fields. Move min and max rates to 64-bit fields and use TCA_HTB_RATE64 and TCA_HTB_CEIL64 to configure HTB classes to break this barrier. In order to test this, create a dummy tuntap device and set its speed to a very high value so we can try adding a QoS queue with big rates. Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2137619 Acked-by: Eelco Chaudron Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- NEWS | 1 + acinclude.m4 | 10 ++++++++++ lib/netdev-linux.c | 41 ++++++++++++++++++++++++++++++++--------- tests/atlocal.in | 1 + tests/system-traffic.at | 28 ++++++++++++++++++++++++++++ 5 files changed, 72 insertions(+), 9 deletions(-) diff --git a/NEWS b/NEWS index 19b1fef1aaa..57ba463cac3 100644 --- a/NEWS +++ b/NEWS @@ -35,6 +35,7 @@ Post-v3.1.0 in order to create OVSDB sockets with access mode of 0770. - QoS: * Added new configuration option 'jitter' for a linux-netem QoS type. + * 'linux-htb' QoS type now supports rates higher than 34 Gbps. - DPDK: * ovs-vswitchd will keep the CAP_SYS_RAWIO capability when started with the --hw-rawio-access command line option.
This allows the diff --git a/acinclude.m4 b/acinclude.m4 index 690a13c2596..28d028f371b 100644 --- a/acinclude.m4 +++ b/acinclude.m4 @@ -218,6 +218,16 @@ AC_DEFUN([OVS_CHECK_LINUX_TC], [ ])], [AC_DEFINE([HAVE_TCA_STATS_PKT64], [1], [Define to 1 if TCA_STATS_PKT64 is available.])]) + + AC_COMPILE_IFELSE([ + AC_LANG_PROGRAM([#include <linux/pkt_sched.h>], [ + int x = TCA_HTB_RATE64; + ])], + [AC_SUBST(HAVE_TCA_HTB_RATE64,yes) + AC_DEFINE([HAVE_TCA_HTB_RATE64], [1], + [Define to 1 if TCA_HTB_RATE64 is available.])], + [AC_SUBST(HAVE_TCA_HTB_RATE64,no)] + ) ]) dnl OVS_CHECK_LINUX_SCTP_CT diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 128a46c5ec1..55bb21bb3ff 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -4601,13 +4601,13 @@ static const struct tc_ops tc_ops_netem = { struct htb { struct tc tc; - unsigned int max_rate; /* In bytes/s. */ + uint64_t max_rate; /* In bytes/s. */ }; struct htb_class { struct tc_queue tc_queue; - unsigned int min_rate; /* In bytes/s. */ - unsigned int max_rate; /* In bytes/s. */ + uint64_t min_rate; /* In bytes/s. */ + uint64_t max_rate; /* In bytes/s. */ unsigned int burst; /* In bytes. */ unsigned int priority; /* Lower values are higher priorities.
*/ }; @@ -4695,8 +4695,8 @@ htb_setup_class__(struct netdev *netdev, unsigned int handle, if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) { opt.quantum = mtu; } - opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst); - opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst); + opt.buffer = tc_calc_buffer(class->min_rate, mtu, class->burst); + opt.cbuffer = tc_calc_buffer(class->max_rate, mtu, class->burst); opt.prio = class->priority; tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, @@ -4709,15 +4709,26 @@ htb_setup_class__(struct netdev *netdev, unsigned int handle, nl_msg_put_string(&request, TCA_KIND, "htb"); opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS); + +#ifdef HAVE_TCA_HTB_RATE64 + if (class->min_rate > UINT32_MAX) { + nl_msg_put_u64(&request, TCA_HTB_RATE64, class->min_rate); + } + if (class->max_rate > UINT32_MAX) { + nl_msg_put_u64(&request, TCA_HTB_CEIL64, class->max_rate); + } +#endif nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt); - tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate, 0); - tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil, 0); + + tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate, class->min_rate); + tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil, class->max_rate); nl_msg_end_nested(&request, opt_offset); error = tc_transact(&request, NULL); if (error) { VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, " - "min_rate=%u max_rate=%u burst=%u prio=%u (%s)", + "min_rate=%"PRIu64" max_rate=%"PRIu64" burst=%u prio=%u " + "(%s)", netdev_get_name(netdev), tc_get_major(handle), tc_get_minor(handle), tc_get_major(parent), tc_get_minor(parent), @@ -4737,6 +4748,10 @@ htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class) static const struct nl_policy tca_htb_policy[] = { [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false, .min_len = sizeof(struct tc_htb_opt) }, +#ifdef HAVE_TCA_HTB_RATE64 + [TCA_HTB_RATE64] = { .type = NL_A_U64, .optional = true }, 
+ [TCA_HTB_CEIL64] = { .type = NL_A_U64, .optional = true }, +#endif }; struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)]; @@ -4751,7 +4766,15 @@ htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class) htb = nl_attr_get(attrs[TCA_HTB_PARMS]); class->min_rate = htb->rate.rate; class->max_rate = htb->ceil.rate; - class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer); +#ifdef HAVE_TCA_HTB_RATE64 + if (attrs[TCA_HTB_RATE64]) { + class->min_rate = nl_attr_get_u64(attrs[TCA_HTB_RATE64]); + } + if (attrs[TCA_HTB_CEIL64]) { + class->max_rate = nl_attr_get_u64(attrs[TCA_HTB_CEIL64]); + } +#endif + class->burst = tc_ticks_to_bytes(class->min_rate, htb->buffer); class->priority = htb->prio; return 0; } diff --git a/tests/atlocal.in b/tests/atlocal.in index 94b5c4d0b14..ffdea5cc017 100644 --- a/tests/atlocal.in +++ b/tests/atlocal.in @@ -7,6 +7,7 @@ HAVE_UNWIND='@HAVE_UNWIND@' EGREP='@EGREP@' PYTHON3='@PYTHON3@' CFLAGS='@CFLAGS@' +HAVE_TCA_HTB_RATE64='@HAVE_TCA_HTB_RATE64@' # PYTHONCOERCECLOCALE=0 disables the Unicode compatibility warning on # stderr that breaks almost any Python3 test (PEP 0538) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 9f07f45a36a..ecb37303a0f 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -2354,6 +2354,34 @@ AT_CHECK([tc class show dev ovs-p1 | grep -q 'class htb .* HTB_CONF']) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([QoS - 64bit]) +AT_SKIP_IF([test $HAVE_TC = no]) +AT_SKIP_IF([test $HAVE_TCA_HTB_RATE64 = no]) +OVS_TRAFFIC_VSWITCHD_START() + +ADD_NAMESPACES(at_ns0, at_ns1) +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +dnl Configure the QoS with rates that require 64bits, i.e: > 34Gbps. 
+AT_CHECK([ovs-vsctl set port ovs-p0 qos=@qos -- set port ovs-p1 qos=@qos dnl + -- --id=@qos create qos dnl + type=linux-htb other-config:max-rate=50000000000 queues:0=@queue dnl + -- --id=@queue create queue dnl + other_config:min-rate=40000000000 other_config:max-rate=50000000000 dnl + other_config:burst=5000000], + [ignore], [ignore]) + +OVS_WAIT_UNTIL([tc qdisc show dev ovs-p0 | grep -q htb]) +OVS_WAIT_UNTIL([tc qdisc show dev ovs-p1 | grep -q htb]) + +m4_define([HTB_CONF], [rate 40Gbit ceil 50Gbit burst 620000b cburst 618750b]) +AT_CHECK([tc class show dev ovs-p0 | grep -q 'class htb .* HTB_CONF']) +AT_CHECK([tc class show dev ovs-p1 | grep -q 'class htb .* HTB_CONF']) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + AT_BANNER([conntrack]) AT_SETUP([conntrack - controller]) From 13e183da31bbcc8b81c4d1ba2dfa19abb3fa561d Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Mon, 17 Jul 2023 10:08:15 +0200 Subject: [PATCH 323/833] netdev-linux: Remove tc_matchall_fill_police. It is equivalent to tc_policer_init() so remove the duplicated function. 
Reviewed-by: Simon Horman Acked-by: Eelco Chaudron Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- lib/netdev-linux.c | 27 +++------------------------ 1 file changed, 3 insertions(+), 24 deletions(-) diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 55bb21bb3ff..e20d7409d57 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -510,6 +510,8 @@ static int tc_delete_class(const struct netdev *, unsigned int handle); static int tc_del_qdisc(struct netdev *netdev); static int tc_query_qdisc(const struct netdev *netdev); +static void tc_policer_init(struct tc_police *tc_police, uint64_t kbits_rate, + uint64_t kbits_burst); void tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate, @@ -2661,29 +2663,6 @@ netdev_linux_set_advertisements(struct netdev *netdev_, return error; } -static struct tc_police -tc_matchall_fill_police(uint32_t kbits_rate, uint32_t kbits_burst) -{ - unsigned int bsize = MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8; - unsigned int bps = ((uint64_t) kbits_rate * 1000) / 8; - struct tc_police police; - struct tc_ratespec rate; - int mtu = 65535; - - memset(&rate, 0, sizeof rate); - rate.rate = bps; - rate.cell_log = tc_calc_cell_log(mtu); - rate.mpu = ETH_TOTAL_MIN; - - memset(&police, 0, sizeof police); - police.burst = tc_bytes_to_ticks(bps, bsize); - police.action = TC_POLICE_SHOT; - police.rate = rate; - police.mtu = mtu; - - return police; -} - static void nl_msg_act_police_start_nest(struct ofpbuf *request, uint32_t prio, size_t *offset, size_t *act_offset, @@ -2764,7 +2743,7 @@ tc_add_matchall_policer(struct netdev *netdev, uint32_t kbits_rate, tcmsg->tcm_info = tc_make_handle(prio, eth_type); tcmsg->tcm_handle = handle; - pol_act = tc_matchall_fill_police(kbits_rate, kbits_burst); + tc_policer_init(&pol_act, kbits_rate, kbits_burst); nl_msg_put_string(&request, TCA_KIND, "matchall"); basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS); action_offset = nl_msg_start_nested(&request, 
TCA_MATCHALL_ACT); From 68ac6e9db7d4cb8d84acd35820a5775d43270204 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Mon, 17 Jul 2023 10:08:16 +0200 Subject: [PATCH 324/833] netdev-linux: Refactor nl_msg_put_act_police. In preparation for supporting 64-bit rates in tc policies, move the allocation and initialization of struct tc_police object inside nl_msg_put_act_police(). That way, the function is now called with the actual rates. Acked-by: Eelco Chaudron Reviewed-by: Simon Horman Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- lib/netdev-linux.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index e20d7409d57..759c98d33db 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -2689,21 +2689,26 @@ nl_msg_act_police_end_nest(struct ofpbuf *request, size_t offset, } static void -nl_msg_put_act_police(struct ofpbuf *request, struct tc_police *police, +nl_msg_put_act_police(struct ofpbuf *request, uint32_t index, + uint64_t kbits_rate, uint64_t kbits_burst, uint64_t pkts_rate, uint64_t pkts_burst, uint32_t notexceed_act, bool single_action) { size_t offset, act_offset; + struct tc_police police; uint32_t prio = 0; - if (!police->rate.rate && !pkts_rate) { + if (!kbits_rate && !pkts_rate) { return; } + tc_policer_init(&police, kbits_rate, kbits_burst); + police.index = index; + nl_msg_act_police_start_nest(request, ++prio, &offset, &act_offset, single_action); - if (police->rate.rate) { - tc_put_rtab(request, TCA_POLICE_RATE, &police->rate, 0); + if (police.rate.rate) { + tc_put_rtab(request, TCA_POLICE_RATE, &police.rate, 0); } if (pkts_rate) { uint64_t pkt_burst_ticks; @@ -2713,7 +2718,7 @@ nl_msg_put_act_police(struct ofpbuf *request, struct tc_police *police, nl_msg_put_u64(request, TCA_POLICE_PKTRATE64, pkts_rate); nl_msg_put_u64(request, TCA_POLICE_PKTBURST64, pkt_burst_ticks); } - nl_msg_put_unspec(request, TCA_POLICE_TBF, police, sizeof 
*police); + nl_msg_put_unspec(request, TCA_POLICE_TBF, &police, sizeof police); nl_msg_act_police_end_nest(request, offset, act_offset, notexceed_act); } @@ -2726,7 +2731,6 @@ tc_add_matchall_policer(struct netdev *netdev, uint32_t kbits_rate, size_t basic_offset, action_offset; uint16_t prio = TC_RESERVED_PRIORITY_POLICE; int ifindex, err = 0; - struct tc_police pol_act; struct ofpbuf request; struct ofpbuf *reply; struct tcmsg *tcmsg; @@ -2743,12 +2747,12 @@ tc_add_matchall_policer(struct netdev *netdev, uint32_t kbits_rate, tcmsg->tcm_info = tc_make_handle(prio, eth_type); tcmsg->tcm_handle = handle; - tc_policer_init(&pol_act, kbits_rate, kbits_burst); nl_msg_put_string(&request, TCA_KIND, "matchall"); basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS); action_offset = nl_msg_start_nested(&request, TCA_MATCHALL_ACT); - nl_msg_put_act_police(&request, &pol_act, kpkts_rate * 1000, - kpkts_burst * 1000, TC_ACT_UNSPEC, false); + nl_msg_put_act_police(&request, 0, kbits_rate, kbits_burst, + kpkts_rate * 1000, kpkts_burst * 1000, TC_ACT_UNSPEC, + false); nl_msg_end_nested(&request, action_offset); nl_msg_end_nested(&request, basic_offset); @@ -5743,7 +5747,6 @@ tc_add_policer(struct netdev *netdev, uint32_t kbits_rate, uint32_t kpkts_burst) { size_t basic_offset, police_offset; - struct tc_police tc_police; struct ofpbuf request; struct tcmsg *tcmsg; int error; @@ -5760,9 +5763,9 @@ tc_add_policer(struct netdev *netdev, uint32_t kbits_rate, basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS); police_offset = nl_msg_start_nested(&request, TCA_BASIC_ACT); - tc_policer_init(&tc_police, kbits_rate, kbits_burst); - nl_msg_put_act_police(&request, &tc_police, kpkts_rate * 1000ULL, - kpkts_burst * 1000ULL, TC_ACT_UNSPEC, false); + nl_msg_put_act_police(&request, 0, kbits_rate, kbits_burst, + kpkts_rate * 1000ULL, kpkts_burst * 1000ULL, + TC_ACT_UNSPEC, false); nl_msg_end_nested(&request, police_offset); nl_msg_end_nested(&request, basic_offset); @@ -5779,16 
+5782,12 @@ tc_add_policer_action(uint32_t index, uint32_t kbits_rate, uint32_t kbits_burst, uint32_t pkts_rate, uint32_t pkts_burst, bool update) { - struct tc_police tc_police; struct ofpbuf request; struct tcamsg *tcamsg; size_t offset; int flags; int error; - tc_policer_init(&tc_police, kbits_rate, kbits_burst); - tc_police.index = index; - flags = (update ? NLM_F_REPLACE : NLM_F_EXCL) | NLM_F_CREATE; tcamsg = tc_make_action_request(RTM_NEWACTION, flags, &request); if (!tcamsg) { @@ -5796,8 +5795,8 @@ tc_add_policer_action(uint32_t index, uint32_t kbits_rate, } offset = nl_msg_start_nested(&request, TCA_ACT_TAB); - nl_msg_put_act_police(&request, &tc_police, pkts_rate, pkts_burst, - TC_ACT_PIPE, true); + nl_msg_put_act_police(&request, index, kbits_rate, kbits_burst, pkts_rate, + pkts_burst, TC_ACT_PIPE, true); nl_msg_end_nested(&request, offset); error = tc_transact(&request, NULL); From 07ce41da116006e21b7299732525483eab3b90c2 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Mon, 17 Jul 2023 10:08:17 +0200 Subject: [PATCH 325/833] netdev-linux: Support 64-bit rates in tc policing. Use TCA_POLICE_RATE64 if the rate cannot be expressed using 32bits. This breaks the 32Gbps barrier. The new barrier is ~4Tbps caused by netdev's API expressing kbps rates using 32-bit integers. Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2137643 Acked-by: Eelco Chaudron Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- NEWS | 2 ++ acinclude.m4 | 10 ++++++++++ lib/netdev-linux.c | 19 ++++++++++++------- lib/netdev-linux.h | 2 +- lib/tc.c | 2 ++ tests/atlocal.in | 1 + tests/system-traffic.at | 21 +++++++++++++++++++++ 7 files changed, 49 insertions(+), 8 deletions(-) diff --git a/NEWS b/NEWS index 57ba463cac3..6df7ef599ed 100644 --- a/NEWS +++ b/NEWS @@ -36,6 +36,8 @@ Post-v3.1.0 - QoS: * Added new configuration option 'jitter' for a linux-netem QoS type. * 'linux-htb' QoS type now supports rates higher than 34 Gbps. 
+ - Ingress Policing: + * Ingress policing byte rates can now be configured higher than 34 Gbps. - DPDK: * ovs-vswitchd will keep the CAP_SYS_RAWIO capability when started with the --hw-rawio-access command line option. This allows the diff --git a/acinclude.m4 b/acinclude.m4 index 28d028f371b..f1ba046c238 100644 --- a/acinclude.m4 +++ b/acinclude.m4 @@ -228,6 +228,16 @@ AC_DEFUN([OVS_CHECK_LINUX_TC], [ [Define to 1 if TCA_HTB_RATE64 is available.])], [AC_SUBST(HAVE_TCA_HTB_RATE64,no)] ) + + AC_COMPILE_IFELSE([ + AC_LANG_PROGRAM([#include <linux/pkt_cls.h>], [ + int x = TCA_POLICE_PKTRATE64; + ])], + [AC_SUBST(HAVE_TCA_POLICE_PKTRATE64,yes) + AC_DEFINE([HAVE_TCA_POLICE_PKTRATE64], [1], + [Define to 1 if TCA_POLICE_PKTRATE64 is available.])], + [AC_SUBST(HAVE_TCA_POLICE_PKTRATE64,no)] + ) ]) dnl OVS_CHECK_LINUX_SCTP_CT diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 759c98d33db..cca3408797e 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -494,7 +494,7 @@ static struct tcmsg *netdev_linux_tc_make_request(const struct netdev *, unsigned int flags, struct ofpbuf *); -static int tc_add_policer(struct netdev *, uint32_t kbits_rate, +static int tc_add_policer(struct netdev *, uint64_t kbits_rate, uint32_t kbits_burst, uint32_t kpkts_rate, uint32_t kpkts_burst); @@ -2694,6 +2694,7 @@ nl_msg_put_act_police(struct ofpbuf *request, uint32_t index, uint64_t pkts_rate, uint64_t pkts_burst, uint32_t notexceed_act, bool single_action) { + uint64_t bytes_rate = kbits_rate / 8 * 1000; size_t offset, act_offset; struct tc_police police; uint32_t prio = 0; @@ -2708,8 +2709,13 @@ nl_msg_put_act_police(struct ofpbuf *request, uint32_t index, nl_msg_act_police_start_nest(request, ++prio, &offset, &act_offset, single_action); if (police.rate.rate) { - tc_put_rtab(request, TCA_POLICE_RATE, &police.rate, 0); + tc_put_rtab(request, TCA_POLICE_RATE, &police.rate, bytes_rate); } +#ifdef HAVE_TCA_POLICE_PKTRATE64 + if (bytes_rate > UINT32_MAX) { + nl_msg_put_u64(request,
TCA_POLICE_RATE64, bytes_rate); + } +#endif if (pkts_rate) { uint64_t pkt_burst_ticks; /* Here tc_bytes_to_ticks is used to convert packets rather than bytes @@ -2723,7 +2729,7 @@ nl_msg_put_act_police(struct ofpbuf *request, uint32_t index, } static int -tc_add_matchall_policer(struct netdev *netdev, uint32_t kbits_rate, +tc_add_matchall_policer(struct netdev *netdev, uint64_t kbits_rate, uint32_t kbits_burst, uint32_t kpkts_rate, uint32_t kpkts_burst) { @@ -5742,9 +5748,8 @@ tc_policer_init(struct tc_police *tc_police, uint64_t kbits_rate, * Returns 0 if successful, otherwise a positive errno value. */ static int -tc_add_policer(struct netdev *netdev, uint32_t kbits_rate, - uint32_t kbits_burst, uint32_t kpkts_rate, - uint32_t kpkts_burst) +tc_add_policer(struct netdev *netdev, uint64_t kbits_rate, + uint32_t kbits_burst, uint32_t kpkts_rate, uint32_t kpkts_burst) { size_t basic_offset, police_offset; struct ofpbuf request; @@ -5778,7 +5783,7 @@ tc_add_policer(struct netdev *netdev, uint32_t kbits_rate, } int -tc_add_policer_action(uint32_t index, uint32_t kbits_rate, +tc_add_policer_action(uint32_t index, uint64_t kbits_rate, uint32_t kbits_burst, uint32_t pkts_rate, uint32_t pkts_burst, bool update) { diff --git a/lib/netdev-linux.h b/lib/netdev-linux.h index 9a416ce505c..ec19b0dedc4 100644 --- a/lib/netdev-linux.h +++ b/lib/netdev-linux.h @@ -29,7 +29,7 @@ struct netdev; int netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag, const char *flag_name, bool enable); int linux_get_ifindex(const char *netdev_name); -int tc_add_policer_action(uint32_t index, uint32_t kbits_rate, +int tc_add_policer_action(uint32_t index, uint64_t kbits_rate, uint32_t kbits_burst, uint32_t pkts_rate, uint32_t pkts_burst, bool update); int tc_del_policer_action(uint32_t index, struct ofputil_meter_stats *stats); diff --git a/lib/tc.c b/lib/tc.c index e34a1a5f090..f49048cdaba 100644 --- a/lib/tc.c +++ b/lib/tc.c @@ -1504,6 +1504,8 @@ static const struct nl_policy 
police_policy[] = { [TCA_POLICE_RATE] = { .type = NL_A_UNSPEC, .min_len = 1024, .optional = true, }, + [TCA_POLICE_RATE64] = { .type = NL_A_U32, + .optional = true, }, [TCA_POLICE_PEAKRATE] = { .type = NL_A_UNSPEC, .min_len = 1024, .optional = true, }, diff --git a/tests/atlocal.in b/tests/atlocal.in index ffdea5cc017..1013098a184 100644 --- a/tests/atlocal.in +++ b/tests/atlocal.in @@ -8,6 +8,7 @@ EGREP='@EGREP@' PYTHON3='@PYTHON3@' CFLAGS='@CFLAGS@' HAVE_TCA_HTB_RATE64='@HAVE_TCA_HTB_RATE64@' +HAVE_TCA_POLICE_PKTRATE64='@HAVE_TCA_POLICE_PKTRATE64@' # PYTHONCOERCECLOCALE=0 disables the Unicode compatibility warning on # stderr that breaks almost any Python3 test (PEP 0538) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index ecb37303a0f..945037ec057 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -2382,6 +2382,27 @@ AT_CHECK([tc class show dev ovs-p1 | grep -q 'class htb .* HTB_CONF']) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([Ingress Policing - 64-bit]) +AT_SKIP_IF([test $HAVE_TC = no]) +AT_SKIP_IF([test $HAVE_TCA_POLICE_PKTRATE64 = no]) +OVS_TRAFFIC_VSWITCHD_START() +ADD_NAMESPACES(ns0) +ADD_VETH(p0, ns0, br0, "10.1.1.1/24") + +AT_CHECK([ovs-vsctl set interface ovs-p0 ingress_policing_rate=50000000]) +AT_CHECK([ovs-vsctl set interface ovs-p0 ingress_policing_burst=400000]) + +AT_CHECK([tc -o -s -d filter show dev ovs-p0 ingress | + sed -n 's/.*\(rate [[0-9]]*[[a-zA-Z]]* burst [[0-9]]*[[a-zA-Z]]*\).*/\1/; T; p; q'], + [0],[dnl +rate 50Gbit burst 74500000b +]) + +AT_CHECK([tc -s -d filter show dev ovs-p0 ingress | + grep -E "basic|matchall" > /dev/null], [0]) +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + AT_BANNER([conntrack]) AT_SETUP([conntrack - controller]) From f20980a19eb57de24aef29d1d4d5b80b61a9a982 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 17 Jul 2023 15:22:20 +0200 Subject: [PATCH 326/833] Prepare for 3.2.0. 
Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- Documentation/faq/releases.rst | 1 + NEWS | 2 +- configure.ac | 2 +- debian/changelog | 4 ++-- debian/rules | 4 ++-- 5 files changed, 7 insertions(+), 6 deletions(-) diff --git a/Documentation/faq/releases.rst b/Documentation/faq/releases.rst index 9fb679e307d..e6bda14e7b0 100644 --- a/Documentation/faq/releases.rst +++ b/Documentation/faq/releases.rst @@ -218,6 +218,7 @@ Q: What DPDK version does each Open vSwitch release work with? 2.17.x 21.11.2 3.0.x 21.11.2 3.1.x 22.11.1 + 3.2.x 22.11.1 ============ ======== Q: Are all the DPDK releases that OVS versions work with maintained? diff --git a/NEWS b/NEWS index 6df7ef599ed..1438f9f8ddb 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,4 @@ -Post-v3.1.0 +v3.2.0 - xx xxx xxxx -------------------- - OVSDB: * Changed format in which ovsdb schema conversion operations are stored in diff --git a/configure.ac b/configure.ac index d05e544b549..320509c5fc5 100644 --- a/configure.ac +++ b/configure.ac @@ -13,7 +13,7 @@ # limitations under the License. 
AC_PREREQ(2.63) -AC_INIT(openvswitch, 3.1.90, bugs@openvswitch.org) +AC_INIT(openvswitch, 3.2.0, bugs@openvswitch.org) AC_CONFIG_SRCDIR([vswitchd/ovs-vswitchd.c]) AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_AUX_DIR([build-aux]) diff --git a/debian/changelog b/debian/changelog index 9a87224b283..294b4a5dbc6 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,8 +1,8 @@ -openvswitch (3.1.90-1) unstable; urgency=low +openvswitch (3.2.0-1) unstable; urgency=low * New upstream version - -- Open vSwitch team Mon, 16 Jan 2023 16:51:01 +0100 + -- Open vSwitch team Mon, 17 Jul 2023 14:40:00 +0100 openvswitch (3.1.0-1) unstable; urgency=low diff --git a/debian/rules b/debian/rules index 28c249d07cd..dc5cc8a65b0 100755 --- a/debian/rules +++ b/debian/rules @@ -134,8 +134,8 @@ override_dh_python3: # Helper target for creating snapshots from upstream git DATE=$(shell date +%Y%m%d) # Upstream branch to track -BRANCH=branch-3.1 -VERSION=3.1.0 +BRANCH=branch-3.2 +VERSION=3.2.0 get-orig-snapshot: rm -Rf openvswitch-upstream From bffffd841ff1a201fcf5bd7007d64e1d511d6aa1 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 17 Jul 2023 15:22:21 +0200 Subject: [PATCH 327/833] Prepare for post-3.2.0 (3.2.90). Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- NEWS | 4 ++++ configure.ac | 2 +- debian/changelog | 6 ++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/NEWS b/NEWS index 1438f9f8ddb..7a852427e51 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,7 @@ +Post-v3.2.0 +-------------------- + + v3.2.0 - xx xxx xxxx -------------------- - OVSDB: diff --git a/configure.ac b/configure.ac index 320509c5fc5..c8708630e88 100644 --- a/configure.ac +++ b/configure.ac @@ -13,7 +13,7 @@ # limitations under the License. 
AC_PREREQ(2.63) -AC_INIT(openvswitch, 3.2.0, bugs@openvswitch.org) +AC_INIT(openvswitch, 3.2.90, bugs@openvswitch.org) AC_CONFIG_SRCDIR([vswitchd/ovs-vswitchd.c]) AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_AUX_DIR([build-aux]) diff --git a/debian/changelog b/debian/changelog index 294b4a5dbc6..69aac167ac1 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +openvswitch (3.2.90-1) unstable; urgency=low + + * New upstream version + + -- Open vSwitch team Mon, 17 Jul 2023 14:40:01 +0100 + openvswitch (3.2.0-1) unstable; urgency=low * New upstream version From 24520a401e061e7289411a3b29d5c8824d1054f8 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 18 Jul 2023 14:40:03 +0200 Subject: [PATCH 328/833] vswitchd: Wait for a bridge exit before replying to exit unixctl. Before the cleanup option, the bridge_exit() call was fairly fast, because it didn't include any particularly long operations. However, with the cleanup flag, this function destroys a lot of datapath resources freeing a lot of memory, waiting on RCU and talking to the kernel. That may take a noticeable amount of time, especially on a busy system or under profilers/sanitizers. However, the unixctl 'exit' command replies instantly without waiting for any work to actually be done. This may cause system test failures or other issues where scripts expect ovs-vswitchd to exit or destroy all the datapath resources shortly after appctl call. Fix that by waiting for the bridge_exit() before replying to the user. At least, all the datapath resources will actually be destroyed by the time ovs-appctl exits. Also moving a structure from stack to global. Seems cleaner this way. Since we're not replying right away and it's technically possible to have multiple clients requesting exit at the same time, storing connections in an array. 
Fixes: fe13ccdca6a2 ("vswitchd: Add --cleanup option to the 'appctl exit' command") Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- vswitchd/ovs-vswitchd.c | 46 ++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/vswitchd/ovs-vswitchd.c b/vswitchd/ovs-vswitchd.c index a244d2f7095..273af9f5d62 100644 --- a/vswitchd/ovs-vswitchd.c +++ b/vswitchd/ovs-vswitchd.c @@ -68,19 +68,19 @@ static unixctl_cb_func ovs_vswitchd_exit; static char *parse_options(int argc, char *argv[], char **unixctl_path); OVS_NO_RETURN static void usage(void); -struct ovs_vswitchd_exit_args { - bool *exiting; - bool *cleanup; -}; +static struct ovs_vswitchd_exit_args { + struct unixctl_conn **conns; + size_t n_conns; + bool exiting; + bool cleanup; +} exit_args; int main(int argc, char *argv[]) { - char *unixctl_path = NULL; struct unixctl_server *unixctl; + char *unixctl_path = NULL; char *remote; - bool exiting, cleanup; - struct ovs_vswitchd_exit_args exit_args = {&exiting, &cleanup}; int retval; set_program_name(argv[0]); @@ -111,14 +111,12 @@ main(int argc, char *argv[]) exit(EXIT_FAILURE); } unixctl_command_register("exit", "[--cleanup]", 0, 1, - ovs_vswitchd_exit, &exit_args); + ovs_vswitchd_exit, NULL); bridge_init(remote); free(remote); - exiting = false; - cleanup = false; - while (!exiting) { + while (!exit_args.exiting) { OVS_USDT_PROBE(main, run_start); memory_run(); if (memory_should_report()) { @@ -137,16 +135,22 @@ main(int argc, char *argv[]) bridge_wait(); unixctl_server_wait(unixctl); netdev_wait(); - if (exiting) { + if (exit_args.exiting) { poll_immediate_wake(); } OVS_USDT_PROBE(main, poll_block); poll_block(); if (should_service_stop()) { - exiting = true; + exit_args.exiting = true; } } - bridge_exit(cleanup); + bridge_exit(exit_args.cleanup); + + for (size_t i = 0; i < exit_args.n_conns; i++) { + unixctl_command_reply(exit_args.conns[i], NULL); + } + free(exit_args.conns); + unixctl_server_destroy(unixctl); 
service_stop(); vlog_disable_async(); @@ -304,10 +308,14 @@ usage(void) static void ovs_vswitchd_exit(struct unixctl_conn *conn, int argc, - const char *argv[], void *exit_args_) + const char *argv[], void *args OVS_UNUSED) { - struct ovs_vswitchd_exit_args *exit_args = exit_args_; - *exit_args->exiting = true; - *exit_args->cleanup = argc == 2 && !strcmp(argv[1], "--cleanup"); - unixctl_command_reply(conn, NULL); + exit_args.n_conns++; + exit_args.conns = xrealloc(exit_args.conns, + exit_args.n_conns * sizeof *exit_args.conns); + exit_args.conns[exit_args.n_conns - 1] = conn; + exit_args.exiting = true; + if (!exit_args.cleanup) { + exit_args.cleanup = argc == 2 && !strcmp(argv[1], "--cleanup"); + } } From f5188ff2147517612b410ed607e3843cdf4b51a6 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 19 Jul 2023 12:33:03 +0200 Subject: [PATCH 329/833] daemon.at: Correctly terminate ovsdb process in a backtrace test. In a backtrace test with monitor the child process will be re-started after being killed. The test doesn't wait for that to happen, so it is possible that during the test cleanup the pid in a pid file is not updated yet. Hence, the on-exit hook will not kill the process. This is causing issues in Cirrus CI, because gmake on FreeBSD waits for all child processes to exit and that never happens. Fix the issue by waiting for a new process. It's also better to exit gracefully instead of relying on the on-exit kill.
Fixes: 759a29dc2d97 ("backtrace: Extend the backtrace functionality.") Acked-by: Ales Musil Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- tests/daemon.at | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/daemon.at b/tests/daemon.at index 13cb8fc1c14..2c7fac57c79 100644 --- a/tests/daemon.at +++ b/tests/daemon.at @@ -284,4 +284,8 @@ AT_CHECK([kill -SEGV $child]) OVS_WAIT_UNTIL([grep -q "backtrace(monitor)|WARN|SIGSEGV detected, backtrace:" ovsdb-server.log]) OVS_WAIT_UNTIL([grep -q "daemon_unix(monitor)|ERR|1 crashes: pid .* died, killed (Segmentation fault)" ovsdb-server.log]) +# Wait until a new process is started before exiting, so it will be +# stopped correctly. +OVS_WAIT_UNTIL([test -s ovsdb-server.pid && test $(cat ovsdb-server.pid) != $child]) +OVS_APP_EXIT_AND_WAIT([ovsdb-server]) AT_CLEANUP From feed7f6775056b3dd55249596a7e587bc9c5fd4a Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Wed, 19 Jul 2023 12:21:09 -0400 Subject: [PATCH 330/833] ofproto-dpif-upcall: Mirror packets that are modified. Currently OVS keeps track of which mirrors that each packet has been sent to for the purpose of deduplication. However, this doesn't consider that openflow rules can make significant changes to packets after ingress. For example, OVN can create OpenFlow rules that turn an echo request into an echo response by flipping source/destination addresses and setting the ICMP type to Reply. When a mirror is configured, only the request gets mirrored even though a response is received. This can cause a false impression of the actual traffic on wire if someone inspects the mirror and doesn't see an echo reply even though one has been sent. This patch resets the mirrors every time a packet is modified, so mirrors will receive every copy of a packet that is sent for output. 
Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2155579 Acked-by: Eelco Chaudron Acked-by: Aaron Conole Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif-xlate.c | 103 +++++++++++++++++++++++++++++++++++ tests/ofproto-dpif.at | 6 +- 2 files changed, 106 insertions(+), 3 deletions(-) diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 4928ea99cfc..47ea0f47e7e 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -7046,6 +7046,107 @@ xlate_ofpact_unroll_xlate(struct xlate_ctx *ctx, "cookie=%#"PRIx64, a->rule_table_id, a->rule_cookie); } +/* Reset the mirror context if we modify the packet and would like to mirror + * the new copy. */ +static void +reset_mirror_ctx(struct xlate_ctx *ctx, const struct flow *flow, + const struct ofpact *a) +{ + switch (a->type) { + case OFPACT_STRIP_VLAN: + case OFPACT_PUSH_VLAN: + case OFPACT_SET_ETH_SRC: + case OFPACT_SET_ETH_DST: + case OFPACT_PUSH_MPLS: + case OFPACT_POP_MPLS: + case OFPACT_SET_MPLS_LABEL: + case OFPACT_SET_MPLS_TC: + case OFPACT_SET_MPLS_TTL: + case OFPACT_DEC_MPLS_TTL: + case OFPACT_DEC_NSH_TTL: + case OFPACT_DEC_TTL: + case OFPACT_SET_VLAN_VID: + case OFPACT_SET_VLAN_PCP: + case OFPACT_ENCAP: + case OFPACT_DECAP: + case OFPACT_NAT: + ctx->mirrors = 0; + return; + + case OFPACT_SET_FIELD: { + const struct ofpact_set_field *set_field; + const struct mf_field *mf; + + set_field = ofpact_get_SET_FIELD(a); + mf = set_field->field; + if (mf_are_prereqs_ok(mf, flow, NULL)) { + ctx->mirrors = 0; + } + return; + } + + case OFPACT_SET_IPV4_SRC: + case OFPACT_SET_IPV4_DST: + if (flow->dl_type == htons(ETH_TYPE_IP)) { + ctx->mirrors = 0; + } + return; + + case OFPACT_SET_IP_DSCP: + case OFPACT_SET_IP_ECN: + case OFPACT_SET_IP_TTL: + if (is_ip_any(flow)) { + ctx->mirrors = 0; + } + return; + + case OFPACT_SET_L4_SRC_PORT: + case OFPACT_SET_L4_DST_PORT: + if (is_ip_any(flow) && !(flow->nw_frag & FLOW_NW_FRAG_LATER)) { + ctx->mirrors 
= 0; + } + return; + + case OFPACT_OUTPUT_REG: + case OFPACT_OUTPUT_TRUNC: + case OFPACT_GROUP: + case OFPACT_OUTPUT: + case OFPACT_CONTROLLER: + case OFPACT_RESUBMIT: + case OFPACT_GOTO_TABLE: + case OFPACT_WRITE_METADATA: + case OFPACT_SET_TUNNEL: + case OFPACT_REG_MOVE: + case OFPACT_STACK_PUSH: + case OFPACT_STACK_POP: + case OFPACT_LEARN: + case OFPACT_ENQUEUE: + case OFPACT_SET_QUEUE: + case OFPACT_POP_QUEUE: + case OFPACT_MULTIPATH: + case OFPACT_BUNDLE: + case OFPACT_EXIT: + case OFPACT_UNROLL_XLATE: + case OFPACT_FIN_TIMEOUT: + case OFPACT_CLEAR_ACTIONS: + case OFPACT_WRITE_ACTIONS: + case OFPACT_METER: + case OFPACT_SAMPLE: + case OFPACT_CLONE: + case OFPACT_DEBUG_RECIRC: + case OFPACT_DEBUG_SLOW: + case OFPACT_CT: + case OFPACT_CT_CLEAR: + case OFPACT_CHECK_PKT_LARGER: + case OFPACT_DELETE_FIELD: + case OFPACT_NOTE: + case OFPACT_CONJUNCTION: + return; + } + + OVS_NOT_REACHED(); +} + static void xlate_trace(struct xlate_ctx *ctx, const struct ofpact *a) { @@ -7112,6 +7213,8 @@ do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len, break; } + reset_mirror_ctx(ctx, flow, a); + if (OVS_UNLIKELY(ctx->xin->trace)) { xlate_trace(ctx, a); } diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at index 6824ce0bbfe..f242f77f316 100644 --- a/tests/ofproto-dpif.at +++ b/tests/ofproto-dpif.at @@ -5349,7 +5349,7 @@ AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) flow="in_port(1),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=1,tos=0,ttl=128,frag=no),icmp(type=8,code=0)" AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "$flow"], [0], [stdout]) AT_CHECK_UNQUOTED([tail -1 stdout], [0], - [Datapath actions: 3,push_vlan(vid=17,pcp=0),2 + [Datapath actions: 3,push_vlan(vid=17,pcp=0),2,3 ]) flow="in_port(2),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=1,tos=0,ttl=128,frag=no),icmp(type=8,code=0)" @@ -5388,7 +5388,7 @@ 
flow="in_port(2),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x080 AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "$flow"], [0], [stdout]) actual=`tail -1 stdout | sed 's/Datapath actions: //'` -expected="push_vlan(vid=17,pcp=0),1,pop_vlan,push_vlan(vid=12,pcp=0),1,2,100" +expected="push_vlan(vid=12,pcp=0),100,2,1,pop_vlan,push_vlan(vid=17,pcp=0),1,pop_vlan,push_vlan(vid=12,pcp=0),100,2,1" AT_CHECK([ovs-dpctl normalize-actions "$flow" "$expected"], [0], [stdout]) mv stdout expout AT_CHECK([ovs-dpctl normalize-actions "$flow" "$actual"], [0], [expout]) @@ -5656,7 +5656,7 @@ AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) flow="in_port(1),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=1,tos=0,ttl=128,frag=no),icmp(type=8,code=0)" AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "$flow"], [0], [stdout]) AT_CHECK_UNQUOTED([tail -1 stdout], [0], - [Datapath actions: trunc(100),3,push_vlan(vid=17,pcp=0),2 + [Datapath actions: trunc(100),3,push_vlan(vid=17,pcp=0),2,trunc(100),3 ]) flow="in_port(2),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=1,tos=0,ttl=128,frag=no),icmp(type=8,code=0)" From d460c473ebf9e9ab16da44cbfbb13a4911352195 Mon Sep 17 00:00:00 2001 From: Ivan Malov Date: Sun, 16 Jul 2023 15:57:20 +0400 Subject: [PATCH 331/833] netdev-dpdk: Negotiate delivery of per-packet Rx metadata. This may be required by some PMDs in offload scenarios. Fixes: e8a2b5bf92bb ("netdev-dpdk: implement flow offload with rte flow") Signed-off-by: Ivan Malov Signed-off-by: Ilya Maximets --- lib/netdev-dpdk.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 64d3b1df4c2..8f1361e21f7 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -512,6 +512,9 @@ struct netdev_dpdk { /* Array of vhost rxq states, see vring_state_changed. 
*/ bool *vhost_rxq_enabled; + + /* Ensures that Rx metadata delivery is configured only once. */ + bool rx_metadata_delivery_configured; ); PADDED_MEMBERS(CACHE_LINE_SIZE, @@ -1220,6 +1223,45 @@ dpdk_eth_flow_ctrl_setup(struct netdev_dpdk *dev) OVS_REQUIRES(dev->mutex) } } +static void +dpdk_eth_dev_init_rx_metadata(struct netdev_dpdk *dev) +{ + uint64_t rx_metadata = 0; + int ret; + + if (dev->rx_metadata_delivery_configured) { + return; + } + + /* For the fallback offload (non-"transfer" rules). */ + rx_metadata |= RTE_ETH_RX_METADATA_USER_MARK; + +#ifdef ALLOW_EXPERIMENTAL_API + /* For the tunnel offload. */ + rx_metadata |= RTE_ETH_RX_METADATA_TUNNEL_ID; +#endif /* ALLOW_EXPERIMENTAL_API */ + + ret = rte_eth_rx_metadata_negotiate(dev->port_id, &rx_metadata); + if (ret == 0) { + if (!(rx_metadata & RTE_ETH_RX_METADATA_USER_MARK)) { + VLOG_DBG("%s: The NIC will not provide per-packet USER_MARK", + netdev_get_name(&dev->up)); + } +#ifdef ALLOW_EXPERIMENTAL_API + if (!(rx_metadata & RTE_ETH_RX_METADATA_TUNNEL_ID)) { + VLOG_DBG("%s: The NIC will not provide per-packet TUNNEL_ID", + netdev_get_name(&dev->up)); + } +#endif /* ALLOW_EXPERIMENTAL_API */ + } else { + VLOG(ret == -ENOTSUP ? VLL_DBG : VLL_WARN, + "%s: Cannot negotiate Rx metadata: %s", + netdev_get_name(&dev->up), rte_strerror(-ret)); + } + + dev->rx_metadata_delivery_configured = true; +} + static int dpdk_eth_dev_init(struct netdev_dpdk *dev) OVS_REQUIRES(dev->mutex) @@ -1233,6 +1275,18 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev) RTE_ETH_RX_OFFLOAD_TCP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM; + if (netdev_is_flow_api_enabled()) { + /* + * Full tunnel offload requires that tunnel ID metadata be + * delivered with "miss" packets from the hardware to the + * PMD. The same goes for megaflow mark metadata which is + * used in MARK + RSS offload scenario. + * + * Request delivery of such metadata. 
+ */ + dpdk_eth_dev_init_rx_metadata(dev); + } + rte_eth_dev_info_get(dev->port_id, &info); if (strstr(info.driver_name, "vf") != NULL) { @@ -1421,6 +1475,8 @@ common_construct(struct netdev *netdev, dpdk_port_t port_no, /* Initilize the hardware offload flags to 0 */ dev->hw_ol_features = 0; + dev->rx_metadata_delivery_configured = false; + dev->flags = NETDEV_UP | NETDEV_PROMISC; ovs_list_push_back(&dpdk_list, &dev->list_node); From 47520b33bdf80d039f25d2faa25f4fcdf55143ea Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 19 Jul 2023 18:14:04 +0200 Subject: [PATCH 332/833] ofproto-dpif: Fix removal of renamed datapath ports. OVS configuration is based on port names and OpenFlow port numbers. Names are stored in the database and translated later to OF ports. On the datapath level, each port has a name and a datapath port number. Port name in the database has to match datapath port name, unless it's a tunnel port. If a datapath port is renamed with 'ip link set DEV name NAME', ovs-vswitchd will wake up, destroy all the OpenFlow-related structures and clean other things up. This is because the port no longer represents the port from a database due to a name difference. However, ovs-vswitchd will not actually remove the port from the datapath, because it thinks that this port is no longer there. This is happening because lookup is performed by name and the name has changed. As a result we have a port in a datapath that is not related to any port known to ovs-vswitchd and ovs-vswitchd can't remove it. This port also occupies a datapath port number and prevents the port from being added back with a new name. Fix that by performing lookup by a datapath port number during the port destruction. The name was used only to avoid spurious warnings in a normal case where the port was successfully deleted by other parts of OVS. Adding an extra flag to avoid these warnings instead.
Fixes: 02f8d6460afd ("ofproto-dpif: Query port existence by name to prevent warnings.") Reported-at: https://github.com/openvswitch/ovs-issues/issues/284 Tested-by: Alin-Gabriel Serdean Acked-by: Alin-Gabriel Serdean Acked-by: Aaron Conole Signed-off-by: Ilya Maximets --- lib/dpctl.c | 2 +- lib/dpif.c | 16 +++++++---- lib/dpif.h | 2 +- ofproto/ofproto-dpif.c | 14 ++++++---- tests/system-interface.at | 57 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 79 insertions(+), 12 deletions(-) diff --git a/lib/dpctl.c b/lib/dpctl.c index 4394653ab3a..79b82a1767d 100644 --- a/lib/dpctl.c +++ b/lib/dpctl.c @@ -673,7 +673,7 @@ show_dpif(struct dpif *dpif, struct dpctl_params *dpctl_p) } for (int i = 0; i < n_port_nos; i++) { - if (dpif_port_query_by_number(dpif, port_nos[i], &dpif_port)) { + if (dpif_port_query_by_number(dpif, port_nos[i], &dpif_port, true)) { continue; } diff --git a/lib/dpif.c b/lib/dpif.c index b1cbf39c48d..d07241f1e7c 100644 --- a/lib/dpif.c +++ b/lib/dpif.c @@ -705,13 +705,14 @@ dpif_port_set_config(struct dpif *dpif, odp_port_t port_no, * initializes '*port' appropriately; on failure, returns a positive errno * value. * - * Retuns ENODEV if the port doesn't exist. + * Retuns ENODEV if the port doesn't exist. Will not log a warning in this + * case unless 'warn_if_not_found' is true. * * The caller owns the data in 'port' and must free it with * dpif_port_destroy() when it is no longer needed. 
*/ int dpif_port_query_by_number(const struct dpif *dpif, odp_port_t port_no, - struct dpif_port *port) + struct dpif_port *port, bool warn_if_not_found) { int error = dpif->dpif_class->port_query_by_number(dpif, port_no, port); if (!error) { @@ -719,8 +720,13 @@ dpif_port_query_by_number(const struct dpif *dpif, odp_port_t port_no, dpif_name(dpif), port_no, port->name); } else { memset(port, 0, sizeof *port); - VLOG_WARN_RL(&error_rl, "%s: failed to query port %"PRIu32": %s", - dpif_name(dpif), port_no, ovs_strerror(error)); + if (error == ENODEV && !warn_if_not_found) { + VLOG_DBG_RL(&dpmsg_rl, "%s: failed to query port %"PRIu32": %s", + dpif_name(dpif), port_no, ovs_strerror(error)); + } else { + VLOG_WARN_RL(&error_rl, "%s: failed to query port %"PRIu32": %s", + dpif_name(dpif), port_no, ovs_strerror(error)); + } } return error; } @@ -788,7 +794,7 @@ dpif_port_get_name(struct dpif *dpif, odp_port_t port_no, ovs_assert(name_size > 0); - error = dpif_port_query_by_number(dpif, port_no, &port); + error = dpif_port_query_by_number(dpif, port_no, &port, true); if (!error) { ovs_strlcpy(name, port.name, name_size); dpif_port_destroy(&port); diff --git a/lib/dpif.h b/lib/dpif.h index 9e9d0aa1b0a..0f2dc2ef3c5 100644 --- a/lib/dpif.h +++ b/lib/dpif.h @@ -463,7 +463,7 @@ void dpif_port_clone(struct dpif_port *, const struct dpif_port *); void dpif_port_destroy(struct dpif_port *); bool dpif_port_exists(const struct dpif *dpif, const char *devname); int dpif_port_query_by_number(const struct dpif *, odp_port_t port_no, - struct dpif_port *); + struct dpif_port *, bool warn_if_not_found); int dpif_port_query_by_name(const struct dpif *, const char *devname, struct dpif_port *); int dpif_port_get_name(struct dpif *, odp_port_t port_no, diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index fad7342b0b0..e22ca757ac3 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -2161,8 +2161,7 @@ port_destruct(struct ofport *port_, bool del) struct 
ofproto_dpif *ofproto = ofproto_dpif_cast(port->up.ofproto); const char *devname = netdev_get_name(port->up.netdev); const char *netdev_type = netdev_get_type(port->up.netdev); - char namebuf[NETDEV_VPORT_NAME_BUFSIZE]; - const char *dp_port_name; + struct dpif_port dpif_port; ofproto->backer->need_revalidate = REV_RECONFIGURE; xlate_txn_start(); @@ -2176,9 +2175,13 @@ port_destruct(struct ofport *port_, bool del) del = dpif_cleanup_required(ofproto->backer->dpif); } - dp_port_name = netdev_vport_get_dpif_port(port->up.netdev, namebuf, - sizeof namebuf); - if (del && dpif_port_exists(ofproto->backer->dpif, dp_port_name)) { + /* Don't try to delete ports that are not part of the datapath. */ + if (del && port->odp_port == ODPP_NONE) { + del = false; + } + + if (del && !dpif_port_query_by_number(ofproto->backer->dpif, + port->odp_port, &dpif_port, false)) { /* The underlying device is still there, so delete it. This * happens when the ofproto is being destroyed, since the caller * assumes that removal of attached ports will happen as part of @@ -2186,6 +2189,7 @@ port_destruct(struct ofport *port_, bool del) if (!port->is_tunnel) { dpif_port_del(ofproto->backer->dpif, port->odp_port, false); } + dpif_port_destroy(&dpif_port); } else if (del) { /* The underlying device is already deleted (e.g. tunctl -d). * Calling dpif_port_remove to do local cleanup for the netdev */ diff --git a/tests/system-interface.at b/tests/system-interface.at index 148f011c7ee..d4ee5c46bad 100644 --- a/tests/system-interface.at +++ b/tests/system-interface.at @@ -123,6 +123,63 @@ AT_CHECK([ip link show | grep " genev_sys_[[0-9]]*: .* ovs-system " | diff -u - OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([interface - datapath port rename]) +OVS_TRAFFIC_VSWITCHD_START() + +dnl Not relevant for userspace datapath. +AT_SKIP_IF([! 
ovs-appctl dpctl/show | grep -q ovs-system]) + +AT_CHECK([ip link add ovs-veth0 type veth peer name ovs-veth1]) +dnl We will rename ovs-veth0, so removing the peer on exit. +on_exit 'ip link del ovs-veth1' + +AT_CHECK([ovs-vsctl add-port br0 ovs-veth0]) + +OVS_WAIT_UNTIL([ip link show | grep -q "ovs-veth0.* ovs-system "]) + +AT_CHECK([ovs-appctl dpctl/show | grep port], [0], [dnl + port 0: ovs-system (internal) + port 1: br0 (internal) + port 2: ovs-veth0 +]) + +dnl Rename the interface while attached to OVS. +AT_CHECK([ip l set ovs-veth0 name ovs-new-port]) + +dnl Wait for the port to be detached from the OVS datapath. +OVS_WAIT_UNTIL([ip link show | grep "ovs-new-port" | grep -v "ovs-system"]) + +dnl Check that database indicates the error. +AT_CHECK([ovs-vsctl get interface ovs-veth0 error], [0], [dnl +"could not open network device ovs-veth0 (No such device)" +]) + +dnl Check that the port is no longer in the datapath. +AT_CHECK([ovs-appctl dpctl/show | grep port], [0], [dnl + port 0: ovs-system (internal) + port 1: br0 (internal) +]) + +dnl Rename the interface back and check that it is in use again. +AT_CHECK([ip l set ovs-new-port name ovs-veth0]) + +OVS_WAIT_UNTIL([ip link show | grep -q "ovs-veth0.* ovs-system "]) + +AT_CHECK([ovs-vsctl get interface ovs-veth0 error], [0], [dnl +[[]] +]) + +AT_CHECK([ovs-appctl dpctl/show | grep port], [0], [dnl + port 0: ovs-system (internal) + port 1: br0 (internal) + port 2: ovs-veth0 +]) + +OVS_TRAFFIC_VSWITCHD_STOP([" + /could not open network device ovs-veth0 (No such device)/d +"]) +AT_CLEANUP + AT_SETUP([interface - current speed]) AT_SKIP_IF([test $HAVE_ETHTOOL = "no"]) OVS_TRAFFIC_VSWITCHD_START() From 20a7654d240f2af5da73a32c1791442e6aac0101 Mon Sep 17 00:00:00 2001 From: Simon Jones Date: Fri, 28 Jul 2023 09:41:22 +0800 Subject: [PATCH 333/833] ovs-tcpdump: Clear auto-assigned ipv6 address of mirror port. ovs-tcpdump will add mipxxx NIC, and on some systems this NIC has IPv6 address by default. 
For a VXLAN topology, mipxxx, which has an IPv6 address, will be treated as a tunnel port, and will get erroneous actions. Prevent this by clearing the auto-assigned IPv6 address. This can also be controlled on some systems with IPv6 sysctls. Tested on CentOS Stream 8 and Ubuntu 20.04. Acked-by: Mike Pattrick Acked-by: Aaron Conole Signed-off-by: Simon Jones Signed-off-by: Ilya Maximets --- utilities/ovs-tcpdump.in | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/utilities/ovs-tcpdump.in b/utilities/ovs-tcpdump.in index 420c11eb8a6..4cbd9a5d310 100755 --- a/utilities/ovs-tcpdump.in +++ b/utilities/ovs-tcpdump.in @@ -96,6 +96,10 @@ def _install_dst_if_linux(tap_name, mtu_value=None): *(['ip', 'link', 'set', 'dev', str(tap_name), 'up'])) pipe.wait() + pipe = _doexec( + *(['ip', '-6', 'addr', 'flush', 'dev', str(tap_name)])) + pipe.wait() + def _remove_dst_if_linux(tap_name): _doexec( From aa56afb576bf020f9906ad08c2d3ff025da47a61 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 3 Aug 2023 14:15:43 +0200 Subject: [PATCH 334/833] AUTHORS: Add Simon Jones. Signed-off-by: Ilya Maximets --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 9186e1ad227..8427f91672c 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -416,6 +416,7 @@ Shashwat Srivastava shashwat.srivastava@tcs.com Shih-Hao Li shihli@vmware.com Shu Shen shu.shen@radisys.com Simon Horman simon.horman@corigine.com +Simon Jones batmanustc@gmail.com Sivaprasad Tummala sivaprasad.tummala@intel.com Somnath Chatterjee somnath.b.chatterjee@ericsson.com Songtao Zhan zhanst1@chinatelecom.cn From edfbd44ffd623fbc2fcdbcc3a13c7d9e3931aa2a Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 25 Jul 2023 11:32:17 +0200 Subject: [PATCH 335/833] ovsdb: file: Fix inability to read diffs that violate type size. Diff records in a database file may contain sets larger than a maximum set size, so constraints should not be checked on read.
They will be checked later after applying the diff to a column. Fixes: 2ccd66f594f7 ("ovsdb: Use column diffs for ovsdb and raft log entries.") Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2023-July/406685.html Reported-by: Peng He Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- ovsdb/file.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ovsdb/file.c b/ovsdb/file.c index 400b34794bb..6e7e47ca68c 100644 --- a/ovsdb/file.c +++ b/ovsdb/file.c @@ -107,7 +107,14 @@ ovsdb_file_update_row_from_json(struct ovsdb_row *row, bool converting, column_name, schema->name); } - error = ovsdb_datum_from_json(&datum, &column->type, node->data, NULL); + if (row_contains_diff) { + /* Diff may violate the type size rules. */ + error = ovsdb_transient_datum_from_json(&datum, &column->type, + node->data); + } else { + error = ovsdb_datum_from_json(&datum, &column->type, + node->data, NULL); + } if (error) { return error; } From e062465a884fb4be4658bebd0b9853144a9a7dfb Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 25 Jul 2023 11:32:18 +0200 Subject: [PATCH 336/833] ovsdb: file: Fix diff application to a default column value. On a server side, default values are just normal values. The only difference is that they are initialized from default atoms. They are allocated on a separate piece of memory as any other values, so there should not be any special treatment. Current code doesn't apply the diff to a column with default values after reading the file transaction and that breaks the logic. For example, if we have a column with a set and a minimum number of elements for a type is 1, it will be initialized with one default atom. On mutation, new values can be added and the diff will contain only these new values, while the column will contain both the new values and the default atom. While reading such transaction from a file with a diff, current code will replace the content losing the default atom. 
The only case where we need to actually replace is if this row doesn't exist and it's not actually a diff, i.e. if this row was just created to be populated with a json content. Fix that by removing the wrong check and not use values as a diff in case the row doesn't exist in a database. Fixes: 2ccd66f594f7 ("ovsdb: Use column diffs for ovsdb and raft log entries.") Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- ovsdb/file.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/ovsdb/file.c b/ovsdb/file.c index 6e7e47ca68c..8bd1d4af30f 100644 --- a/ovsdb/file.c +++ b/ovsdb/file.c @@ -118,9 +118,7 @@ ovsdb_file_update_row_from_json(struct ovsdb_row *row, bool converting, if (error) { return error; } - if (row_contains_diff - && !ovsdb_datum_is_default(&row->fields[column->index], - &column->type)) { + if (row_contains_diff) { error = ovsdb_datum_apply_diff_in_place( &row->fields[column->index], &datum, &column->type); @@ -161,8 +159,7 @@ ovsdb_file_txn_row_from_json(struct ovsdb_txn *txn, struct ovsdb_table *table, new = ovsdb_row_create(table); *ovsdb_row_get_uuid_rw(new) = *row_uuid; - error = ovsdb_file_update_row_from_json(new, converting, - row_contains_diff, json); + error = ovsdb_file_update_row_from_json(new, converting, false, json); if (error) { ovsdb_row_destroy(new); } else { From 2f1b430645f8fc0a2ffcb1240e734ed450ef2262 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 25 Jul 2023 11:32:19 +0200 Subject: [PATCH 337/833] ovsdb: relay: Fix handling of XOR updates with size constraints. Relay servers apply updates via ovsdb_table_execute_update(). XOR updates contain datum diffs, and datum diffs can be larger than the type constraints. Currently, relay will fail to parse such update into ovsdb row triggering a syntax error and a re-connection. Fix that by relaxing the size constraints for this kind of updates. 
Fixes: 026c77c58ddb ("ovsdb: New ovsdb 'relay' service model.") Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- ovsdb/execution.c | 4 ++-- ovsdb/row.c | 13 ++++++++++--- ovsdb/row.h | 3 ++- ovsdb/table.c | 5 +++-- tests/test-ovsdb.c | 13 +++++++------ 5 files changed, 24 insertions(+), 14 deletions(-) diff --git a/ovsdb/execution.c b/ovsdb/execution.c index f9b8067d02c..5587ef96fe5 100644 --- a/ovsdb/execution.c +++ b/ovsdb/execution.c @@ -320,7 +320,7 @@ parse_row(const struct json *json, const struct ovsdb_table *table, } row = ovsdb_row_create(table); - error = ovsdb_row_from_json(row, json, symtab, columns); + error = ovsdb_row_from_json(row, json, symtab, columns, false); if (error) { ovsdb_row_destroy(row); return error; @@ -764,7 +764,7 @@ ovsdb_execute_wait(struct ovsdb_execution *x, struct ovsdb_parser *parser, row = ovsdb_row_create(table); error = ovsdb_row_from_json(row, rows->array.elems[i], x->symtab, - NULL); + NULL, false); if (error) { ovsdb_row_destroy(row); break; diff --git a/ovsdb/row.c b/ovsdb/row.c index d7bfbdd365e..2b52b68161f 100644 --- a/ovsdb/row.c +++ b/ovsdb/row.c @@ -302,12 +302,14 @@ ovsdb_row_columns_to_string(const struct ovsdb_row *row, struct ovsdb_error * ovsdb_row_from_json(struct ovsdb_row *row, const struct json *json, struct ovsdb_symbol_table *symtab, - struct ovsdb_column_set *included) + struct ovsdb_column_set *included, bool is_diff) { struct ovsdb_table_schema *schema = row->table->schema; struct ovsdb_error *error; struct shash_node *node; + ovs_assert(!is_diff || !symtab); + if (json->type != JSON_OBJECT) { return ovsdb_syntax_error(json, NULL, "row must be JSON object"); } @@ -324,8 +326,13 @@ ovsdb_row_from_json(struct ovsdb_row *row, const struct json *json, column_name, schema->name); } - error = ovsdb_datum_from_json(&datum, &column->type, node->data, - symtab); + if (is_diff) { + error = ovsdb_transient_datum_from_json(&datum, &column->type, + node->data); + } else { + error = 
ovsdb_datum_from_json(&datum, &column->type, node->data, + symtab); + } if (error) { return error; } diff --git a/ovsdb/row.h b/ovsdb/row.h index ff91288fed3..59f498a20d6 100644 --- a/ovsdb/row.h +++ b/ovsdb/row.h @@ -114,7 +114,8 @@ void ovsdb_row_columns_to_string(const struct ovsdb_row *, struct ovsdb_error *ovsdb_row_from_json(struct ovsdb_row *, const struct json *, struct ovsdb_symbol_table *, - struct ovsdb_column_set *included) + struct ovsdb_column_set *included, + bool is_diff) OVS_WARN_UNUSED_RESULT; struct json *ovsdb_row_to_json(const struct ovsdb_row *, const struct ovsdb_column_set *include); diff --git a/ovsdb/table.c b/ovsdb/table.c index 66071ce2f88..0792e1580e6 100644 --- a/ovsdb/table.c +++ b/ovsdb/table.c @@ -368,7 +368,8 @@ ovsdb_table_execute_insert(struct ovsdb_txn *txn, const struct uuid *row_uuid, struct ovsdb_row *row = ovsdb_row_create(table); - struct ovsdb_error *error = ovsdb_row_from_json(row, json_row, NULL, NULL); + struct ovsdb_error *error = ovsdb_row_from_json(row, json_row, + NULL, NULL, false); if (!error) { *ovsdb_row_get_uuid_rw(row) = *row_uuid; ovsdb_txn_row_insert(txn, row); @@ -411,7 +412,7 @@ ovsdb_table_execute_update(struct ovsdb_txn *txn, const struct uuid *row_uuid, struct ovsdb_column_set columns = OVSDB_COLUMN_SET_INITIALIZER; struct ovsdb_row *update = ovsdb_row_create(table); struct ovsdb_error *error = ovsdb_row_from_json(update, json_row, - NULL, &columns); + NULL, &columns, xor); if (!error && (xor || !ovsdb_row_equal_columns(row, update, &columns))) { error = ovsdb_row_update_columns(ovsdb_txn_row_modify(txn, row), diff --git a/tests/test-ovsdb.c b/tests/test-ovsdb.c index 1bc5ac17a01..c761822e62e 100644 --- a/tests/test-ovsdb.c +++ b/tests/test-ovsdb.c @@ -870,7 +870,8 @@ do_parse_rows(struct ovs_cmdl_context *ctx) row = ovsdb_row_create(table); json = unbox_json(parse_json(ctx->argv[i])); - check_ovsdb_error(ovsdb_row_from_json(row, json, NULL, &columns)); + check_ovsdb_error(ovsdb_row_from_json(row, json, 
NULL, + &columns, false)); json_destroy(json); print_and_free_json(ovsdb_row_to_json(row, &all_columns)); @@ -937,7 +938,7 @@ do_compare_rows(struct ovs_cmdl_context *ctx) } names[i] = xstrdup(json->array.elems[0]->string); check_ovsdb_error(ovsdb_row_from_json(rows[i], json->array.elems[1], - NULL, NULL)); + NULL, NULL, false)); json_destroy(json); } for (i = 0; i < n_rows; i++) { @@ -1050,7 +1051,7 @@ do_evaluate_condition__(struct ovs_cmdl_context *ctx, int mode) for (i = 0; i < n_rows; i++) { rows[i] = ovsdb_row_create(table); check_ovsdb_error(ovsdb_row_from_json(rows[i], json->array.elems[i], - NULL, NULL)); + NULL, NULL, false)); } json_destroy(json); @@ -1224,7 +1225,7 @@ do_execute_mutations(struct ovs_cmdl_context *ctx) for (i = 0; i < n_rows; i++) { rows[i] = ovsdb_row_create(table); check_ovsdb_error(ovsdb_row_from_json(rows[i], json->array.elems[i], - NULL, NULL)); + NULL, NULL, false)); } json_destroy(json); @@ -1338,7 +1339,7 @@ do_query(struct ovs_cmdl_context *ctx) struct ovsdb_row *row = ovsdb_row_create(table); uuid_generate(ovsdb_row_get_uuid_rw(row)); check_ovsdb_error(ovsdb_row_from_json(row, json->array.elems[i], - NULL, NULL)); + NULL, NULL, false)); if (ovsdb_table_get_row(table, ovsdb_row_get_uuid(row))) { ovs_fatal(0, "duplicate UUID "UUID_FMT" in table", UUID_ARGS(ovsdb_row_get_uuid(row))); @@ -1445,7 +1446,7 @@ do_query_distinct(struct ovs_cmdl_context *ctx) row = ovsdb_row_create(table); uuid_generate(ovsdb_row_get_uuid_rw(row)); check_ovsdb_error(ovsdb_row_from_json(row, json->array.elems[i], - NULL, NULL)); + NULL, NULL, false)); /* Initialize row and find equivalence class. */ rows[i].uuid = *ovsdb_row_get_uuid(row); From d6fd6e5917dc9579f50fd271f848361c6e11a693 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 25 Jul 2023 11:32:20 +0200 Subject: [PATCH 338/833] tests: Add ovsdb execution cases for set size constraints. 
Adding an extra check to one of the ovsdb execution cases that will verify that ovsdb-server is able to read back transactions previously written to a database file. And also adding new execution tests that cover previously discovered issues with size checks on sets. Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- tests/ovsdb-execution.at | 54 ++++++++++++++++++++++++++++++++++------ tests/ovsdb-server.at | 15 ++++++++++- 2 files changed, 61 insertions(+), 8 deletions(-) diff --git a/tests/ovsdb-execution.at b/tests/ovsdb-execution.at index e72bf060697..fd1c7a2395b 100644 --- a/tests/ovsdb-execution.at +++ b/tests/ovsdb-execution.at @@ -728,6 +728,53 @@ dnl collide (only) with their previous values (succeeds). [{"count":2},{"uuid":["uuid","<6>"]},{"uuid":["uuid","<7>"]},{"rows":[{"name":"new one","number":1},{"name":"new two","number":2},{"name":"old one","number":10},{"name":"old two","number":20}]}] ]]) +OVSDB_CHECK_EXECUTION([size constraints on sets], + [constraint_schema], + [ + [[["constraints", + {"op": "insert", + "table": "b", + "row": {"b": 1} + }]]], + [[["constraints", + {"op": "mutate", + "table": "b", + "where": [], + "mutations": [["x", "delete", 0]] + }]]], + [[["constraints", + {"op": "mutate", + "table": "b", + "where": [], + "mutations": [["x", "insert", 1]] + }]]], + [[["constraints", + {"op": "update", + "table": "b", + "where": [], + "row": {"x": ["set", [3, 4]]} + }]]], + [[["constraints", + {"op": "mutate", + "table": "b", + "where": [], + "mutations": [["x", "insert", 5]] + }]]], + [[["constraints", + {"op": "mutate", + "table": "b", + "where": [], + "mutations": [["x", "delete", 4], ["x", "insert", 5]] + }]]] + ], + [[[{"uuid":["uuid","<0>"]}] +[{"details":"Attempted to store 0 elements in set of 1 to 2 integers.","error":"constraint violation"}] +[{"count":1}] +[{"count":1}] +[{"details":"Attempted to store 3 elements in set of 1 to 2 integers.","error":"constraint violation"}] +[{"count":1}] +]]) + 
OVSDB_CHECK_EXECUTION([referential integrity -- simple], [constraint_schema], [[[["constraints", @@ -751,12 +798,6 @@ OVSDB_CHECK_EXECUTION([referential integrity -- simple], {"op": "delete", "table": "b", "where": []}]]], -dnl Check that "mutate" honors number-of-elements constraints on sets and maps. - [[["constraints", - {"op": "mutate", - "table": "b", - "where": [], - "mutations": [["x", "delete", 0]]}]]], [[["constraints", {"op": "delete", "table": "a", @@ -783,7 +824,6 @@ dnl Check that "mutate" honors number-of-elements constraints on sets and maps. "where": []}]]]], [[[{"uuid":["uuid","<0>"]},{"uuid":["uuid","<1>"]},{"uuid":["uuid","<2>"]},{"uuid":["uuid","<3>"]}] [{"count":1},{"details":"cannot delete b row <0> because of 3 remaining reference(s)","error":"referential integrity violation"}] -[{"details":"Attempted to store 0 elements in set of 1 to 2 integers.","error":"constraint violation"}] [{"count":1}] [{"count":1},{"details":"cannot delete b row <0> because of 2 remaining reference(s)","error":"referential integrity violation"}] [{"count":1}] diff --git a/tests/ovsdb-server.at b/tests/ovsdb-server.at index 8ccec80bcbd..d36c3c117ec 100644 --- a/tests/ovsdb-server.at +++ b/tests/ovsdb-server.at @@ -26,6 +26,9 @@ m4_define([OVSDB_SERVER_SHUTDOWN2], # If a given UUID appears more than once it is always replaced by the # same marker. # +# Additionally, checks that records written to a database file can be +# read back producing the same in-memory database content. +# # TITLE is provided to AT_SETUP and KEYWORDS to AT_KEYWORDS. 
m4_define([OVSDB_CHECK_EXECUTION], [AT_SETUP([$1]) @@ -33,12 +36,22 @@ m4_define([OVSDB_CHECK_EXECUTION], $2 > schema AT_CHECK([ovsdb-tool create db schema], [0], [stdout], [ignore]) on_exit 'kill `cat *.pid`' - AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile --remote=punix:socket db], [0], [ignore], [ignore]) + AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile \ + --remote=punix:socket db], [0], [ignore], [ignore]) m4_foreach([txn], [$3], [AT_CHECK([ovsdb-client transact unix:socket 'txn'], [0], [stdout], [ignore]) cat stdout >> output ]) AT_CHECK([uuidfilt output], [0], [$4], [ignore]) + + AT_CHECK([ovsdb-client dump unix:socket], [0], [stdout], [ignore]) + + OVSDB_SERVER_SHUTDOWN + + AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile \ + --remote=punix:socket db], [0], [ignore], [ignore]) + OVS_WAIT_UNTIL([ovsdb-client dump unix:socket > dump2; diff stdout dump2]) + OVSDB_SERVER_SHUTDOWN AT_CLEANUP]) From bd2a80b1df4bd22446e9c80e7865c512e460a3c9 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 2 Aug 2023 15:45:32 +0200 Subject: [PATCH 339/833] ovsdb-server: Fix excessive memory usage on DB open. During initial read of a database file all the file transactions are added to the transaction history. The history run with the history size checks is only executed after the whole file is processed. If, for some reason, the file contains way too many transactions, this behavior may result in excessive memory consumption up to hundreds of GBs. For example, here is a log entry about memory usage after reading a file with 100K+ OVN NbDB transactions: |00004|memory|INFO|95650400 kB peak resident set size after 96.9 seconds |00005|memory|INFO|atoms:3083346 cells:1838767 monitors:0 raft-log:123309 txn-history:123307 txn-history-atoms:1647022868 In this particular case ovsdb-server allocated 95 GB of RAM in order to accommodate 1.6 billion ovsdb atoms in the history, while only 3 million atoms are in the actual database. 
Fix that by running history size checks after applying each file transaction. This way the memory usage while reading the database from the example stays at about the 1 GB mark. History size checks are cheap in comparison with transaction replay, so the additional calls do not reduce performance. We could've just moved the history run into ovsdb_txn_replay_commit(), but it seems more organic to call it externally, since we have init() and destroy() functions called externally as well. Since the history run will be executed shortly after reading the database and actual memory consumption peak is not always logged, there seems to be no reliable way to unit test for the issue without adding extra testing infrastructure into the code. Fixes: 695e81502794 ("ovsdb-server: Transaction history tracking.") Reported-at: https://bugzilla.redhat.com/2228464 Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- ovsdb/ovsdb-server.c | 3 ++- ovsdb/relay.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c index 8e623118b10..cf09c907961 100644 --- a/ovsdb/ovsdb-server.c +++ b/ovsdb/ovsdb-server.c @@ -235,7 +235,7 @@ main_loop(struct server_config *config, SHASH_FOR_EACH_SAFE (node, all_dbs) { struct db *db = node->data; - ovsdb_txn_history_run(db->db); + ovsdb_storage_run(db->db->storage); read_db(config, db); /* Run triggers after storage_run and read_db to make sure new raft @@ -678,6 +678,7 @@ parse_txn(struct server_config *config, struct db *db, if (!error && !uuid_is_zero(txnid)) { db->db->prereq = *txnid; } + ovsdb_txn_history_run(db->db); } return error; } diff --git a/ovsdb/relay.c b/ovsdb/relay.c index b035cb49210..27ff196b727 100644 --- a/ovsdb/relay.c +++ b/ovsdb/relay.c @@ -413,6 +413,7 @@ ovsdb_relay_run(void) } ovsdb_cs_event_destroy(event); } + ovsdb_txn_history_run(ctx->db); } } From 269053bf239dd6208acfb9538e4c9713e5b59fc5 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 2 Aug 2023 18:46:02 +0200 
Subject: [PATCH 340/833] ovsdb-tool: Fix json leak while showing clustered log. The json read from file is never freed in ovsdb-tool show-log for a clustered database: ERROR: LeakSanitizer: detected memory leaks Direct leak of 10774760 byte(s) in 269369 object(s) allocated from: 0 0x50cc32 in malloc (ovsdb/ovsdb-tool+0x50cc32) 1 0x6e7b6b in xmalloc__ lib/util.c:140:15 2 0x6e7b6b in xmalloc lib/util.c:175:12 3 0x6494f6 in json_create lib/json.c:1489:25 4 0x64a8a7 in json_object_create lib/json.c:263:25 5 0x6525f3 in json_parser_push_object lib/json.c:1311:25 6 0x6525f3 in json_parser_input lib/json.c:1409:13 7 0x64f6c4 in json_parser_feed lib/json.c:1126:17 8 0x5694b5 in parse_body ovsdb/log.c:412:9 9 0x5694b5 in ovsdb_log_read ovsdb/log.c:477:13 10 0x54d294 in do_show_log_cluster ovsdb/ovsdb-tool.c:1069:27 11 0x54d294 in do_show_log ovsdb/ovsdb-tool.c:1115:9 12 0x63b7b1 in ovs_cmdl_run_command__ lib/command-line.c:247:17 13 0x5488a5 in main ovsdb/ovsdb-tool.c:82:5 14 0xe0eb49 in __libc_start_call_main (/lib64/libc.so.6+0x27b49) 15 0xe0ec0a in __libc_start_main@GLIBC_2.2.5 (/lib64/libc.so.6+0x27c0a) 16 0x471fe4 in _start (ovsdb/ovsdb-tool+0x471fe4) Fixes: 1b1d2e6daa56 ("ovsdb: Introduce experimental support for clustered databases.") Reported-by: Dumitru Ceara Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- ovsdb/ovsdb-tool.c | 1 + 1 file changed, 1 insertion(+) diff --git a/ovsdb/ovsdb-tool.c b/ovsdb/ovsdb-tool.c index e265365322c..facd680ff3f 100644 --- a/ovsdb/ovsdb-tool.c +++ b/ovsdb/ovsdb-tool.c @@ -1094,6 +1094,7 @@ do_show_log_cluster(struct ovsdb_log *log) free(s); } + json_destroy(json); putchar('\n'); } From 2f34475a9708617eaa484044a5b485980b734b38 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 3 Aug 2023 16:23:11 +0200 Subject: [PATCH 341/833] ci: Fix OPTS not being passed to OSX builds. Before GHA, OPTS were always passed as an argument for a *-build.sh script. But that changed. 
Linux builds are using the OPTS variable from the environment, but OSX script does not, so the options are currently ignored. That wasn't a big issue until now, because SSL was not available or the build actually worked on newer branches. But GHA recently updated OpenSSL to 3.0+ and we have deprecation warnings on branches that do not support OpenSSL 3.0+ (branch 2.16) and that breaks the build. Fixes: 6cb2f5a630e3 ("github: Add GitHub Actions workflow.") Reviewed-by: David Marchand Signed-off-by: Ilya Maximets --- .ci/osx-build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/osx-build.sh b/.ci/osx-build.sh index 09df61826f1..b81744ec9b9 100755 --- a/.ci/osx-build.sh +++ b/.ci/osx-build.sh @@ -10,7 +10,7 @@ function configure_ovs() ./boot.sh && ./configure $* } -configure_ovs $EXTRA_OPTS $* +configure_ovs $EXTRA_OPTS $OPTS $* if [ "$CC" = "clang" ]; then make CFLAGS="$CFLAGS -Wno-error=unused-command-line-argument" From bbdfb332d203e9742160274b68f4f7c18d47ff72 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Sat, 5 Aug 2023 14:07:03 +0200 Subject: [PATCH 342/833] MAINTAINERS: Add Kevin Traynor. Kevin Traynor was recently elected by the Open vSwitch committers. This formalizes his status as an Open vSwitch committer. Welcome Kevin! Signed-off-by: Simon Horman Signed-off-by: Ilya Maximets --- MAINTAINERS.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS.rst b/MAINTAINERS.rst index 2fc2517177e..a3e923195c3 100644 --- a/MAINTAINERS.rst +++ b/MAINTAINERS.rst @@ -51,6 +51,8 @@ This is the current list of active Open vSwitch committers: - istokes@ovn.org * - Ilya Maximets - i.maximets@ovn.org + * - Kevin Traynor + - ktraynor@redhat.com * - Russell Bryant - russell@ovn.org * - Simon Horman From b8d4619d345b18db0e3e1b7b25936e7478061940 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 7 Aug 2023 14:28:41 +0200 Subject: [PATCH 343/833] cirrus: Update to FreeBSD 13.2. 
13.2 was released in April and 13.1 images are no longer able to update packages: pkg: repository FreeBSD contains packages for wrong OS version Fix that by updating to FreeBSD 13.2. Reported-by: David Marchand Reviewed-by: David Marchand Acked-by: Aaron Conole Signed-off-by: Ilya Maximets --- .cirrus.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cirrus.yml b/.cirrus.yml index 952d964315c..48931fa085c 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -3,7 +3,7 @@ freebsd_build_task: freebsd_instance: matrix: image_family: freebsd-12-4-snap - image_family: freebsd-13-1-snap + image_family: freebsd-13-2-snap cpu: 4 memory: 4G From da64d1b2fb0f1c165fe7b4a6860423a7ed17107e Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Sat, 12 Aug 2023 10:07:45 +0200 Subject: [PATCH 344/833] MAINTAINERS: Add Aaron Conole. Aaron Conole was recently elected by the Open vSwitch committers. This formalizes his status as an Open vSwitch committer. Welcome Aaron! Signed-off-by: Simon Horman Signed-off-by: Ilya Maximets --- MAINTAINERS.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS.rst b/MAINTAINERS.rst index a3e923195c3..99a0bd405b2 100644 --- a/MAINTAINERS.rst +++ b/MAINTAINERS.rst @@ -41,6 +41,8 @@ This is the current list of active Open vSwitch committers: * - Name - Email + * - Aaron Conole + - aconole@redhat.com * - Alin Serdean - aserdean@ovn.org * - Ansis Atteka From 21410ff800ccaa56e2b419efa5322b540cfe3448 Mon Sep 17 00:00:00 2001 From: Peng He Date: Mon, 14 Aug 2023 02:37:50 +0000 Subject: [PATCH 345/833] dpif-netdev: Fix dpif_netdev_flow_put. OVS allows overlapping megaflows, as long as the actions of these megaflows are equal. However, the current implementation of action modification relies on flow_lookup instead of UFID; this could result in looking up the wrong megaflow and making the ukeys and megaflows inconsistent. 
Just like the test case in the patch, at first we have a rule with the prefix: 10.1.2.0/24 And we will get a megaflow with the prefix 10.1.2.2/24 when a packet with IP 10.1.2.2 is received. Then suppose we change the rule into 10.1.0.0/16. OVS prefers to keep the 10.1.2.2/24 megaflow and just changes its action instead of extending the prefix into 10.1.2.2/16. Then suppose we receive a 10.1.0.2 packet. Since it misses the megaflow, this time we will have an overlapping megaflow with the right prefix: 10.1.0.2/16 Now we have two megaflows: 10.1.2.2/24 10.1.0.2/16 Last, suppose we have changed the ruleset again. The revalidator this time still decides to change the actions of both megaflows instead of deleting them. The dpif_netdev_flow_put will search for the megaflow to modify with unmasked keys; however, it might look up the wrong megaflow, as the key 10.1.2.2 matches both 10.1.2.2/24 and 10.1.0.2/16! This patch changes the megaflow lookup code in the modification path to rely on the UFID to find the correct megaflow instead of a key lookup. It falls back to a classifier lookup when the UFID was not provided, in order to support cases where the UFID was not generated from the flow data during the flow addition. 
Fixes: beb75a40fdc2 ("userspace: Switching of L3 packets in L2 pipeline") Signed-off-by: Peng He Signed-off-by: Ilya Maximets --- lib/dpif-netdev.c | 45 ++++++++++++++++++++++++++++++--------------- tests/pmd.at | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 15 deletions(-) diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 0b623fcea45..9730e0eecc9 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -4206,7 +4206,7 @@ flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd, const struct dpif_flow_put *put, struct dpif_flow_stats *stats) { - struct dp_netdev_flow *netdev_flow; + struct dp_netdev_flow *netdev_flow = NULL; int error = 0; if (stats) { @@ -4214,16 +4214,35 @@ flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd, } ovs_mutex_lock(&pmd->flow_mutex); - netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL); - if (!netdev_flow) { - if (put->flags & DPIF_FP_CREATE) { - dp_netdev_flow_add(pmd, match, ufid, put->actions, - put->actions_len, ODPP_NONE); + if (put->ufid) { + netdev_flow = dp_netdev_pmd_find_flow(pmd, put->ufid, + put->key, put->key_len); + } else { + /* Use key instead of the locally generated ufid + * to search netdev_flow. */ + netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL); + } + + if (put->flags & DPIF_FP_CREATE) { + if (!netdev_flow) { + dp_netdev_flow_add(pmd, match, ufid, + put->actions, put->actions_len, ODPP_NONE); } else { - error = ENOENT; + error = EEXIST; } - } else { - if (put->flags & DPIF_FP_MODIFY) { + goto exit; + } + + if (put->flags & DPIF_FP_MODIFY) { + if (!netdev_flow) { + error = ENOENT; + } else { + if (!put->ufid && !flow_equal(&match->flow, &netdev_flow->flow)) { + /* Overlapping flow. 
*/ + error = EINVAL; + goto exit; + } + struct dp_netdev_actions *new_actions; struct dp_netdev_actions *old_actions; @@ -4254,15 +4273,11 @@ flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd, * counter, and subtracting it before outputting the stats */ error = EOPNOTSUPP; } - ovsrcu_postpone(dp_netdev_actions_free, old_actions); - } else if (put->flags & DPIF_FP_CREATE) { - error = EEXIST; - } else { - /* Overlapping flow. */ - error = EINVAL; } } + +exit: ovs_mutex_unlock(&pmd->flow_mutex); return error; } diff --git a/tests/pmd.at b/tests/pmd.at index 7b1652595f7..7c333a901ba 100644 --- a/tests/pmd.at +++ b/tests/pmd.at @@ -1331,3 +1331,50 @@ Default max sleep: 499 us OVS_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([PMD - revalidator modify overlapping flows]) + +OVS_VSWITCHD_START( +[add-port br0 p1 \ + -- set bridge br0 datapath-type=dummy \ + -- set interface p1 type=dummy-pmd \ + -- add-port br0 p2 \ + -- set interface p2 type=dummy-pmd +], [], [], [DUMMY_NUMA]) + +dnl Add one OpenFlow rule and generate a megaflow. +AT_CHECK([ovs-ofctl add-flow br0 'table=0,in_port=p1,ip,nw_dst=10.1.2.0/24,actions=p2']) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ipv4(src=10.0.0.1,dst=10.1.2.2,proto=6),tcp(src=1,dst=2)']) + +OVS_WAIT_UNTIL_EQUAL([ovs-appctl dpctl/dump-flows | sed 's/.*core: [[0-9]]*//'], [ +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=10.1.2.2/255.255.255.0,frag=no), packets:0, bytes:0, used:never, actions:2]) + +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ipv4(src=10.0.0.1,dst=10.1.2.2,proto=6),tcp(src=1,dst=2)']) +dnl Replace OpenFlow rules, trigger the revalidation. 
+AT_CHECK([echo 'table=0,in_port=p1,ip,nw_dst=10.1.0.0/16 actions=ct(commit)' | dnl + ovs-ofctl --bundle replace-flows br0 -]) +AT_CHECK([ovs-appctl revalidator/wait]) + +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ipv4(src=10.0.0.1,dst=10.1.0.2,proto=6),tcp(src=1,dst=2)']) +OVS_WAIT_UNTIL_EQUAL([ovs-appctl dpctl/dump-flows | sed 's/.*core: [[0-9]]*//' | strip_xout_keep_actions], [ +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=10.1.0.2/255.255.0.0,frag=no), packets:0, bytes:0, used:never, actions:ct(commit) +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=10.1.2.2/255.255.255.0,frag=no), packets:0, bytes:0, used:0.0s, actions:ct(commit)]) + +dnl Hold the prefix 10.1.2.2/24 by another 10s. +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ipv4(src=10.0.0.1,dst=10.1.2.2,proto=6),tcp(src=1,dst=2)']) +dnl Send more 10.1.0.2 to make 10.1.0.0/16 tuple prepend 10.1.2.0/24 tuple in the pvector of subtables. +for i in $(seq 0 256); do + AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ipv4(src=10.0.0.1,dst=10.1.0.2,proto=6),tcp(src=1,dst=2)']) +done + +AT_CHECK([echo 'table=0,in_port=p1,ip,nw_dst=10.1.0.0/16 actions=p2' | dnl + ovs-ofctl --bundle replace-flows br0 -]) + +AT_CHECK([ovs-appctl revalidator/wait]) +AT_CHECK([ovs-appctl dpctl/dump-flows | sed 's/.*core: [[0-9]]*//' | strip_xout_keep_actions], [0], [ +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=10.1.0.2/255.255.0.0,frag=no), packets:0, bytes:0, used:0.0s, actions:2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=10.1.2.2/255.255.255.0,frag=no), packets:0, bytes:0, used:0.0s, actions:2 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP From eac54ee24af919542213b2cbbab5a208668e1b85 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 11 Aug 2023 19:23:15 +0200 Subject: [PATCH 346/833] system-traffic.at: Avoid names veth0/veth1 in SRv6 tests. 
It's fairly common to have veth0/veth1 interfaces on a system, but that breaks SRv6 tests that are trying to create them. Adding ovs- prefix to avoid name collision. Fixes: 03fc1ad78521 ("userspace: Add SRv6 tunnel support.") Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- tests/system-traffic.at | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 945037ec057..808c492a225 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -1202,11 +1202,11 @@ AT_CHECK([ovs-ofctl add-flow br0 in_port=at_srv6,actions=mod_dl_dst:aa:55:aa:55: dnl Set up tunnel endpoints on the namespace 'at_ns0', dnl and overlay port on the namespace 'at_ns1' -ADD_VETH_NS([at_ns0], [veth0], [10.1.1.2/24], [at_ns1], [veth1], [10.1.1.1/24]) +ADD_VETH_NS([at_ns0], [ovs-veth0], [10.1.1.2/24], [at_ns1], [ovs-veth1], [10.1.1.1/24]) NS_CHECK_EXEC([at_ns0], [ip sr tunsrc set fc00:a::1]) NS_CHECK_EXEC([at_ns0], [ip route add 10.100.100.0/24 encap seg6 mode encap segs fc00::100 dev p0]) -NS_CHECK_EXEC([at_ns0], [ip -6 route add fc00:a::1 encap seg6local action End.DX4 nh4 0.0.0.0 dev veth0]) -NS_CHECK_EXEC([at_ns1], [ip route add 10.100.100.0/24 via 10.1.1.2 dev veth1]) +NS_CHECK_EXEC([at_ns0], [ip -6 route add fc00:a::1 encap seg6local action End.DX4 nh4 0.0.0.0 dev ovs-veth0]) +NS_CHECK_EXEC([at_ns1], [ip route add 10.100.100.0/24 via 10.1.1.2 dev ovs-veth1]) dnl Linux seems to take a little time to get its IPv6 stack in order. 
Without dnl waiting, we get occasional failures due to the following error: @@ -1263,11 +1263,11 @@ AT_CHECK([ovs-ofctl add-flow br0 in_port=at_srv6,actions=mod_dl_dst:aa:55:aa:55: dnl Set up tunnel endpoints on the namespace 'at_ns0', dnl and overlay port on the namespace 'at_ns1' -ADD_VETH_NS([at_ns0], [veth0], [fc00:1::2/64], [at_ns1], [veth1], [fc00:1::1/64]) +ADD_VETH_NS([at_ns0], [ovs-veth0], [fc00:1::2/64], [at_ns1], [ovs-veth1], [fc00:1::1/64]) NS_CHECK_EXEC([at_ns0], [ip sr tunsrc set fc00:a::1]) NS_CHECK_EXEC([at_ns0], [ip -6 route add fc00:100::0/64 encap seg6 mode encap segs fc00::100 dev p0]) -NS_CHECK_EXEC([at_ns0], [ip -6 route add fc00:a::1 encap seg6local action End.DX6 nh6 :: dev veth0]) -NS_CHECK_EXEC([at_ns1], [ip -6 route add fc00:100::/64 via fc00:1::2 dev veth1]) +NS_CHECK_EXEC([at_ns0], [ip -6 route add fc00:a::1 encap seg6local action End.DX6 nh6 :: dev ovs-veth0]) +NS_CHECK_EXEC([at_ns1], [ip -6 route add fc00:100::/64 via fc00:1::2 dev ovs-veth1]) dnl Linux seems to take a little time to get its IPv6 stack in order. Without dnl waiting, we get occasional failures due to the following error: From cf11766cbcf162399aafb84ba5634a22bccf9e8b Mon Sep 17 00:00:00 2001 From: Peng He Date: Sat, 1 Jul 2023 05:11:16 +0000 Subject: [PATCH 347/833] ofproto-dpif-upcall: Fix push_dp_ops to handle all errors. push_dp_ops only handles delete ops errors but ignores the modify ops results. It's better to handle all the dp operation errors in a consistent way. This patch prevents the inconsistency by considering modify failure in revalidators. To note, we cannot perform two state transitions and change ukey_state into UKEY_EVICTED directly here, because, if we do so, the sweep will remove the ukey alone and leave dp flow alive. Later, the dump will retrieve the dp flow and might even recover it. This will contribute the stats of this dp flow twice. 
Signed-off-by: Peng He Signed-off-by: Eelco Chaudron --- ofproto/ofproto-dpif-upcall.c | 50 +++++++++++++++++++++++++---------- tests/dpif-netdev.at | 43 ++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 14 deletions(-) diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c index 04b583f816f..cde03abc6da 100644 --- a/ofproto/ofproto-dpif-upcall.c +++ b/ofproto/ofproto-dpif-upcall.c @@ -53,6 +53,7 @@ VLOG_DEFINE_THIS_MODULE(ofproto_dpif_upcall); COVERAGE_DEFINE(dumped_duplicate_flow); +COVERAGE_DEFINE(dumped_inconsistent_flow); COVERAGE_DEFINE(dumped_new_flow); COVERAGE_DEFINE(handler_duplicate_upcall); COVERAGE_DEFINE(revalidate_missed_dp_flow); @@ -258,6 +259,7 @@ enum ukey_state { UKEY_CREATED = 0, UKEY_VISIBLE, /* Ukey is in umap, datapath flow install is queued. */ UKEY_OPERATIONAL, /* Ukey is in umap, datapath flow is installed. */ + UKEY_INCONSISTENT, /* Ukey is in umap, datapath flow is inconsistent. */ UKEY_EVICTING, /* Ukey is in umap, datapath flow delete is queued. */ UKEY_EVICTED, /* Ukey is in umap, datapath flow is deleted. */ UKEY_DELETED, /* Ukey removed from umap, ukey free is deferred. */ @@ -1999,6 +2001,10 @@ transition_ukey_at(struct udpif_key *ukey, enum ukey_state dst, * UKEY_VISIBLE -> UKEY_EVICTED * A handler attempts to install the flow, but the datapath rejects it. * Consider that the datapath has already destroyed it. + * UKEY_OPERATIONAL -> UKEY_INCONSISTENT + * A revalidator modifies the flow with error returns. + * UKEY_INCONSISTENT -> UKEY_EVICTING + * A revalidator decides to evict the datapath flow. * UKEY_OPERATIONAL -> UKEY_EVICTING * A revalidator decides to evict the datapath flow. * UKEY_EVICTING -> UKEY_EVICTED @@ -2006,8 +2012,9 @@ transition_ukey_at(struct udpif_key *ukey, enum ukey_state dst, * UKEY_EVICTED -> UKEY_DELETED * A revalidator has removed the ukey from the umap and is deleting it. 
*/ - if (ukey->state == dst - 1 || (ukey->state == UKEY_VISIBLE && - dst < UKEY_DELETED)) { + if (ukey->state == dst - 1 || + (ukey->state == UKEY_VISIBLE && dst < UKEY_DELETED) || + (ukey->state == UKEY_OPERATIONAL && dst == UKEY_EVICTING)) { ukey->state = dst; } else { struct ds ds = DS_EMPTY_INITIALIZER; @@ -2490,26 +2497,31 @@ push_dp_ops(struct udpif *udpif, struct ukey_op *ops, size_t n_ops) for (i = 0; i < n_ops; i++) { struct ukey_op *op = &ops[i]; - struct dpif_flow_stats *push, *stats, push_buf; - - stats = op->dop.flow_del.stats; - push = &push_buf; - - if (op->dop.type != DPIF_OP_FLOW_DEL) { - /* Only deleted flows need their stats pushed. */ - continue; - } if (op->dop.error) { - /* flow_del error, 'stats' is unusable. */ if (op->ukey) { ovs_mutex_lock(&op->ukey->mutex); - transition_ukey(op->ukey, UKEY_EVICTED); + if (op->dop.type == DPIF_OP_FLOW_DEL) { + transition_ukey(op->ukey, UKEY_EVICTED); + } else { + /* Modification of the flow failed. */ + transition_ukey(op->ukey, UKEY_INCONSISTENT); + } ovs_mutex_unlock(&op->ukey->mutex); } continue; } + if (op->dop.type != DPIF_OP_FLOW_DEL) { + /* Only deleted flows need their stats pushed. */ + continue; + } + + struct dpif_flow_stats *push, *stats, push_buf; + + stats = op->dop.flow_del.stats; + push = &push_buf; + if (op->ukey) { ovs_mutex_lock(&op->ukey->mutex); transition_ukey(op->ukey, UKEY_EVICTED); @@ -2857,6 +2869,15 @@ revalidate(struct revalidator *revalidator) continue; } + if (ukey->state == UKEY_INCONSISTENT) { + ukey->dump_seq = dump_seq; + reval_op_init(&ops[n_ops++], UKEY_DELETE, udpif, ukey, + &recircs, &odp_actions); + ovs_mutex_unlock(&ukey->mutex); + COVERAGE_INC(dumped_inconsistent_flow); + continue; + } + if (ukey->state <= UKEY_OPERATIONAL) { /* The flow is now confirmed to be in the datapath. 
*/ transition_ukey(ukey, UKEY_OPERATIONAL); @@ -2945,13 +2966,14 @@ revalidator_sweep__(struct revalidator *revalidator, bool purge) } ukey_state = ukey->state; if (ukey_state == UKEY_OPERATIONAL + || (ukey_state == UKEY_INCONSISTENT) || (ukey_state == UKEY_VISIBLE && purge)) { struct recirc_refs recircs = RECIRC_REFS_EMPTY_INITIALIZER; bool seq_mismatch = (ukey->dump_seq != dump_seq && ukey->reval_seq != reval_seq); enum reval_result result; - if (purge) { + if (purge || ukey_state == UKEY_INCONSISTENT) { result = UKEY_DELETE; } else if (!seq_mismatch) { result = UKEY_KEEP; diff --git a/tests/dpif-netdev.at b/tests/dpif-netdev.at index 67adf27fb19..85119fb819e 100644 --- a/tests/dpif-netdev.at +++ b/tests/dpif-netdev.at @@ -812,3 +812,46 @@ AT_CHECK([tail -n 1 p2.pcap.txt], [0], [dnl ]) OVS_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([dpif-netdev - revalidators handle dp modification fail correctly]) +OVS_VSWITCHD_START( + [add-port br0 p1 \ + -- set interface p1 type=dummy \ + -- set bridge br0 datapath-type=dummy \ + -- add-port br0 p2 \ + -- set interface p2 type=dummy -- + ]) + +AT_CHECK([ovs-ofctl add-flow br0 'table=0,in_port=p1,actions=p2']) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ipv4(src=10.0.0.1,dst=10.0.0.2),tcp(src=1,dst=2)']) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ipv4(src=10.0.0.1,dst=10.0.0.2),tcp(src=1,dst=2)']) + +AT_CHECK([ovs-appctl dpctl/dump-flows | sed 's/.*thread://' | strip_xout_keep_actions ], [0], [ +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), packets:0, bytes:0, used:0.0s, actions:2 +]) + +dnl Wait for the dp flow to enter OPERATIONAL state. +AT_CHECK([ovs-appctl revalidator/wait]) + +AT_CHECK([ovs-appctl revalidator/pause]) + +dnl Delete all dp flows, so flow modification will fail. +AT_CHECK([ovs-appctl dpctl/del-flows]) + +AT_CHECK([ovs-appctl revalidator/resume]) + +dnl Replace OpenFlow rules, trigger revalidation and wait for it to complete. 
+AT_CHECK([echo 'table=0,in_port=p1,ip actions=ct(commit)' | ovs-ofctl --bundle replace-flows br0 -]) +AT_CHECK([ovs-appctl revalidator/wait]) + +dnl Inconsistent ukey should be deleted. +AT_CHECK([ovs-appctl upcall/show | grep keys | grep -q -v 0], [1]) + +dnl Check the log for the flow modification error. +AT_CHECK([grep -q -E ".*failed to put.*$" ovs-vswitchd.log]) + +dnl Remove warning logs to let test suite pass. +OVS_VSWITCHD_STOP(["dnl + /.*failed to put.*$/d + /.*failed to flow_del.*$/d"]) +AT_CLEANUP From 77610902b5d0db708544e349cc3b1bb2b06f13c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Rigault?= Date: Sat, 5 Aug 2023 13:12:15 +0200 Subject: [PATCH 348/833] connmgr: Count unsent async messages. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an additional coverage counter for the case where no controller is available to receive a OFPT_PACKET_IN/NXT_PACKET_IN2 message and so the message is not sent at all. This should help investigate issues where controller actions are not properly executed (for example an OVN reject ACL was supposed to be executed but ovn-controller was not started and the client ended up timing out). Acked-by: Simon Horman Signed-off-by: François Rigault Signed-off-by: Ilya Maximets --- ofproto/connmgr.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ofproto/connmgr.c b/ofproto/connmgr.c index 7b14cae7733..b092e9e04ef 100644 --- a/ofproto/connmgr.c +++ b/ofproto/connmgr.c @@ -1649,6 +1649,8 @@ connmgr_send_table_status(struct connmgr *mgr, } } +COVERAGE_DEFINE(connmgr_async_unsent); + /* Given 'pin', sends an OFPT_PACKET_IN message to each OpenFlow controller as * necessary according to their individual configurations. 
*/ void @@ -1656,6 +1658,7 @@ connmgr_send_async_msg(struct connmgr *mgr, const struct ofproto_async_msg *am) { struct ofconn *ofconn; + bool sent = false; LIST_FOR_EACH (ofconn, connmgr_node, &mgr->conns) { enum ofputil_protocol protocol = ofconn_get_protocol(ofconn); @@ -1677,6 +1680,11 @@ connmgr_send_async_msg(struct connmgr *mgr, am->pin.up.base.flow_metadata.flow.in_port.ofp_port, msg, &txq); do_send_packet_ins(ofconn, &txq); + sent = true; + } + + if (!sent) { + COVERAGE_INC(connmgr_async_unsent); } } From bd78f0e2e483a027d6be1443fb974d532d630cb9 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 15 Aug 2023 23:40:01 +0200 Subject: [PATCH 349/833] =?UTF-8?q?AUTHORS:=20Add=20Fran=C3=A7ois=20Rigaul?= =?UTF-8?q?t.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ilya Maximets --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 8427f91672c..01c9b72a0c5 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -171,6 +171,7 @@ Fengqi Li lifengqi@inspur.com Flavio Fernandes flavio@flaviof.com Flavio Leitner fbl@redhat.com Francesco Fusco ffusco@redhat.com +François Rigault frigo@amadeus.com Frédéric Tobias Christ fchrist@live.de Frode Nordahl frode.nordahl@gmail.com FUJITA Tomonori fujita.tomonori@lab.ntt.co.jp From d80df0b860ad307d8052f1b461fb667a5a53d745 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 15 Aug 2023 17:31:28 +0200 Subject: [PATCH 350/833] AUTHORS: Update email for Simon Horman. Update my email contact address for Open vSwitch activities. Signed-off-by: Simon Horman Acked-by: Ilya Maximets --- .mailmap | 5 +++-- AUTHORS.rst | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.mailmap b/.mailmap index 4773c4a2454..da46dc15924 100644 --- a/.mailmap +++ b/.mailmap @@ -79,8 +79,9 @@ Sabyasachi Sengupta Shad Ansari Shih-Hao Li -Simon Horman -Simon Horman +Simon Horman +Simon Horman +Simon Horman Stephen Finucane Thomas F. 
Herbert Thomas Graf diff --git a/AUTHORS.rst b/AUTHORS.rst index 01c9b72a0c5..af3df81e6e2 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -416,7 +416,7 @@ Shashank Ram rams@vmware.com Shashwat Srivastava shashwat.srivastava@tcs.com Shih-Hao Li shihli@vmware.com Shu Shen shu.shen@radisys.com -Simon Horman simon.horman@corigine.com +Simon Horman horms@ovn.org Simon Jones batmanustc@gmail.com Sivaprasad Tummala sivaprasad.tummala@intel.com Somnath Chatterjee somnath.b.chatterjee@ericsson.com From 0945e1a5fa3d66749facd24f8c60f732f7220f11 Mon Sep 17 00:00:00 2001 From: gordonwwang Date: Thu, 17 Aug 2023 11:04:39 +0800 Subject: [PATCH 351/833] ovs.tmac: Fix troff warning in versions above groff-1.23. When the compilation dependency is groff-1.23, the following message is displayed in the compilation log, and the compilation fails: troff:vswitchd/ovs-vswitchd.8:1298: warning: cannot select font 'CW' make[1]: *** [Makefile:6761: manpage-check] Error 1 CW font was removed and now groff warns about non-existent font: https://git.savannah.gnu.org/cgit/groff.git/commit/?id=d75ea8b2e283e37bd560e821fa4597065f36725f Fix that by replacing CW with CR. CW is supposed to be an alias of CR anyway. Submitted-at: https://github.com/openvswitch/ovs/pull/416 Co-authored-by: Xiaojie Chen Signed-off-by: Xiaojie Chen Signed-off-by: gordonwwang Signed-off-by: Ilya Maximets --- lib/ovs.tmac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/ovs.tmac b/lib/ovs.tmac index 5f8f20afa4a..97b6fa3df76 100644 --- a/lib/ovs.tmac +++ b/lib/ovs.tmac @@ -175,7 +175,7 @@ . nr mE \\n(.f . nf . nh -. ft CW +. ft CR .. . . From 41006c1b3e94c0588c2153d5a71bb78a962c8350 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 17 Aug 2023 13:25:30 +0200 Subject: [PATCH 352/833] AUTHORS: Add gordonwwang and Xiaojie Chen.
Signed-off-by: Ilya Maximets --- AUTHORS.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index af3df81e6e2..71b577418d5 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -482,6 +482,7 @@ William Tu u9012063@gmail.com Wilson Peng pweisong@vmware.com Xavier Simonart xsimonar@redhat.com Xiao Liang shaw.leon@gmail.com +Xiaojie Chen jackchanx@163.com xu rong xu.rong@zte.com.cn YAMAMOTO Takashi yamamoto@midokura.com Yalei Li liyl43@chinatelecom.cn @@ -508,6 +509,7 @@ Zoltan Kiss zoltan.kiss@citrix.com Zoltán Balogh zoltan.balogh.eth@gmail.com Zongkai LI zealokii@gmail.com aginwala amginwal@gmail.com +gordonwwang gordonwwang@tencent.com lic121 lic121@chinatelecom.cn lzhecheng lzhecheng@vmware.com parameswaran krishnamurthy parkrish@gmail.com From de86c5bbdc627117ce8a31f9c9a599c372c8e2a8 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 17 Aug 2023 15:21:57 +0200 Subject: [PATCH 353/833] Set release date for 3.2.0. Acked-by: Simon Horman Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- NEWS | 2 +- debian/changelog | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/NEWS b/NEWS index 7a852427e51..b582bdbbc78 100644 --- a/NEWS +++ b/NEWS @@ -2,7 +2,7 @@ Post-v3.2.0 -------------------- -v3.2.0 - xx xxx xxxx +v3.2.0 - 17 Aug 2023 -------------------- - OVSDB: * Changed format in which ovsdb schema conversion operations are stored in diff --git a/debian/changelog b/debian/changelog index 69aac167ac1..a42f4deaa8b 100644 --- a/debian/changelog +++ b/debian/changelog @@ -8,7 +8,7 @@ openvswitch (3.2.0-1) unstable; urgency=low * New upstream version - -- Open vSwitch team Mon, 17 Jul 2023 14:40:00 +0100 + -- Open vSwitch team Thu, 17 Aug 2023 15:20:36 +0200 openvswitch (3.1.0-1) unstable; urgency=low From e3d5616706107b4c2bdc483359cf1854611cd9e0 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 18 Aug 2023 00:04:26 +0200 Subject: [PATCH 354/833] AUTHORS: Add Ivan Malov. 
Ivan authored commit: d460c473ebf9 ("netdev-dpdk: Negotiate delivery of per-packet Rx metadata.") But I forgot to update the AUTHORS file, sorry. Signed-off-by: Ilya Maximets --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 71b577418d5..8df89f9227a 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -208,6 +208,7 @@ Ilya Maximets i.maximets@ovn.org Iman Tabrizian tabrizian@outlook.com Isaku Yamahata yamahata@valinux.co.jp Ivan Dyukov i.dyukov@samsung.com +Ivan Malov ivan.malov@arknetworks.am IWASE Yusuke iwase.yusuke@gmail.com Jaime Caamaño Ruiz jcaamano@suse.com Jakub Libosvar libosvar@redhat.com From be2cd24b129c12007be75c291992a8290d22353b Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 22 Aug 2023 12:35:38 +0200 Subject: [PATCH 355/833] compiler.h: Don't use asm and typeof with non-GNU compilers. 'asm' is a GNU extension. Compilers without GNU extensions do not understand it: dpif-netdev-perf.h:225:5: error: use of undeclared identifier 'asm' asm volatile("rdtsc" : "=a" (l), "=d" (h)); Redefining asm as __asm__ for non-C++ compilers that do not have it defined. While at it, also add typeof definition. It doesn't need a check for C++, because it's not a keyword in C++. 
Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- include/openvswitch/compiler.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/openvswitch/compiler.h b/include/openvswitch/compiler.h index cf009f82644..52614a5ac04 100644 --- a/include/openvswitch/compiler.h +++ b/include/openvswitch/compiler.h @@ -37,6 +37,16 @@ #define OVS_NO_RETURN #endif +#ifndef typeof +#define typeof __typeof__ +#endif + +#ifndef __cplusplus +#ifndef asm +#define asm __asm__ +#endif +#endif + #if __GNUC__ && !__CHECKER__ #define OVS_UNUSED __attribute__((__unused__)) #define OVS_PRINTF_FORMAT(FMT, ARG1) __attribute__((__format__(printf, FMT, ARG1))) From 1776aa17a9aba64291e499a2ec8c72f47b64768f Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 22 Aug 2023 12:35:39 +0200 Subject: [PATCH 356/833] sflow: Always enable _BSD_SOURCE. sFlow library is using BSD-style types like u_char that require _BSD_SOURCE to be defined. Also adding _DEFAULT_SOURCE, because _BSD_SOURCE cannot be used without it with glibc > 2.19: error: "_BSD_SOURCE and _SVID_SOURCE are deprecated, use _DEFAULT_SOURCE" Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- lib/automake.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/automake.mk b/lib/automake.mk index e64ee76ce79..24b0ffefee0 100644 --- a/lib/automake.mk +++ b/lib/automake.mk @@ -451,7 +451,7 @@ lib_libsflow_la_SOURCES = \ lib/sflow_poller.c \ lib/sflow_receiver.c lib_libsflow_la_CPPFLAGS = $(AM_CPPFLAGS) -lib_libsflow_la_CFLAGS = $(AM_CFLAGS) +lib_libsflow_la_CFLAGS = $(AM_CFLAGS) -D_BSD_SOURCE -D_DEFAULT_SOURCE if HAVE_WNO_UNUSED lib_libsflow_la_CFLAGS += -Wno-unused endif From f0899b1fcbf43c61db4a17f1c665284e96f35997 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 22 Aug 2023 12:35:40 +0200 Subject: [PATCH 357/833] tests: Fix order of includes in barrier/id-fpool/mpsc-queue tests. config.h must be included first, because it affects all the later included headers, even the system ones. 
Also, struct timeval requires inclusion of sys/time.h. This fixes the build with CC=clang CFLAGS='-std=c99 -fgnuc-version=0' Fixes: aec1081c7df3 ("tests: Add ovs-barrier unit test.") Fixes: 2eac33c6cc4a ("id-fpool: Module for fast ID generation.") Fixes: 5396ba5b21c4 ("mpsc-queue: Module for lock-free message passing.") Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- tests/test-barrier.c | 8 ++++---- tests/test-id-fpool.c | 4 ++-- tests/test-mpsc-queue.c | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/test-barrier.c b/tests/test-barrier.c index 3bc5291cc17..fb0ab0e695c 100644 --- a/tests/test-barrier.c +++ b/tests/test-barrier.c @@ -14,13 +14,13 @@ * limitations under the License. */ -#include - #include +#undef NDEBUG +#include -#include "ovs-thread.h" -#include "ovs-rcu.h" #include "ovstest.h" +#include "ovs-rcu.h" +#include "ovs-thread.h" #include "random.h" #include "util.h" diff --git a/tests/test-id-fpool.c b/tests/test-id-fpool.c index 25275d9aefa..27800aa9bad 100644 --- a/tests/test-id-fpool.c +++ b/tests/test-id-fpool.c @@ -14,12 +14,12 @@ * limitations under the License. */ +#include #undef NDEBUG #include #include #include - -#include +#include #include "command-line.h" #include "id-fpool.h" diff --git a/tests/test-mpsc-queue.c b/tests/test-mpsc-queue.c index a38bf9e6dfa..16aa804a034 100644 --- a/tests/test-mpsc-queue.c +++ b/tests/test-mpsc-queue.c @@ -14,12 +14,12 @@ * limitations under the License. */ +#include #undef NDEBUG #include #include #include - -#include +#include #include "command-line.h" #include "guarded-list.h" From d910fd8a0f321a5e839eb04434ed6cb024fc1a85 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 22 Aug 2023 12:35:41 +0200 Subject: [PATCH 358/833] ci: Add jobs to test -std=c99 builds. It should generally be possible to build OVS with mostly c99 standard and without GNU extensions. 
It's not a pure c99 build, because some non-standard features from POSIX variants are always enabled if available via AC_USE_SYSTEM_EXTENSIONS. Without them we'll not have some essentials like RW locks, for example. '-std=c99' doesn't disable all the GNU extensions, only some of them. It's technically possible to use '-fgnuc-version=0' with clang to fully disable all the extensions, but it is causing build assertions with the version of clang provided in Ubuntu 20.04 in GHA (some issue with packed structures not being correctly packed). We may use this option in the future after the base image upgrade. Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- .ci/linux-build.sh | 4 ++++ .github/workflows/build-and-test.yml | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/.ci/linux-build.sh b/.ci/linux-build.sh index 99850a94346..8227a574870 100755 --- a/.ci/linux-build.sh +++ b/.ci/linux-build.sh @@ -82,6 +82,10 @@ if [ "$DPDK" ] || [ "$DPDK_SHARED" ]; then install_dpdk fi +if [ "$STD" ]; then + CFLAGS_FOR_OVS="${CFLAGS_FOR_OVS} -std=$STD" +fi + if [ "$CC" = "clang" ]; then CFLAGS_FOR_OVS="${CFLAGS_FOR_OVS} -Wno-error=unused-command-line-argument" elif [ "$M32" ]; then diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 47d239f1086..bc5494e863b 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -85,6 +85,7 @@ jobs: LIBS: ${{ matrix.libs }} M32: ${{ matrix.m32 }} OPTS: ${{ matrix.opts }} + STD: ${{ matrix.std }} TESTSUITE: ${{ matrix.testsuite }} name: linux ${{ join(matrix.*, ' ') }} @@ -100,6 +101,11 @@ jobs: - compiler: clang opts: --disable-ssl + - compiler: gcc + std: c99 + - compiler: clang + std: c99 + - compiler: gcc testsuite: test - compiler: clang From f1305b5a2879602d61d02756eb5de93448f70012 Mon Sep 17 00:00:00 2001 From: Nobuhiro MIKI Date: Fri, 18 Aug 2023 18:51:34 +0900 Subject: [PATCH 359/833] tests: Add clang-analyzer-results to gitignore. 
Fixes: 30b79363b967 ("Makefile.am: Add clang static analysis support") Signed-off-by: Nobuhiro MIKI Signed-off-by: Ilya Maximets --- tests/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/.gitignore b/tests/.gitignore index 83b1cb3b489..3a8c4597564 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -3,6 +3,7 @@ /Makefile.in /atconfig /atlocal +/clang-analyzer-results/ /idltest.c /idltest.h /idltest.ovsidl From e46d455201d08725687dc90d3d0ee99fe8f70ca6 Mon Sep 17 00:00:00 2001 From: Frode Nordahl Date: Mon, 21 Aug 2023 15:53:33 +0200 Subject: [PATCH 360/833] docs: Add `nowarn` region option to tables. Starting with groff 1.23.0 a warning is produced if the tbl preprocessor is not run. A side effect of enabling it is that new warnings on table formatting are printed. As requested during the review [0] of a series [1] attempting to address this, this patch makes use of the `nowarn` region option as opposed to attempting to change the formatting. 0: https://patchwork.ozlabs.org/project/openvswitch/patch/ZM00Wfa80rOb2oCA@riva.ucam.org/#3164177 1: https://patchwork.ozlabs.org/project/openvswitch/list/?series=367378&state=* Reported-by: Lucas Nussbaum Reported-at: https://bugs.debian.org/1042358 Signed-off-by: Frode Nordahl Signed-off-by: Ilya Maximets --- build-aux/extract-ofp-fields | 6 +++--- lib/meta-flow.xml | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/build-aux/extract-ofp-fields b/build-aux/extract-ofp-fields index efec59c25b3..6366bd44618 100755 --- a/build-aux/extract-ofp-fields +++ b/build-aux/extract-ofp-fields @@ -216,7 +216,7 @@ def field_to_xml(field_node, f, body, summary): """.PP \\fB%s Field\\fR .TS -tab(;); +tab(;),nowarn; l lx.
""" % title @@ -317,7 +317,7 @@ def group_xml_to_nroff(group_node, fields): '.SH "%s"\n' % build.nroff.text_to_nroff(title.upper() + " FIELDS"), '.SS "Summary:"\n', ".TS\n", - "tab(;);\n", + "tab(;),nowarn;\n", "l l l l l l l.\n", "Name;Bytes;Mask;RW?;Prereqs;NXM/OXM Support\n", "\_;\_;\_;\_;\_;\_\n", @@ -329,7 +329,7 @@ def group_xml_to_nroff(group_node, fields): def make_oxm_classes_xml(document): - s = """tab(;); + s = """tab(;),nowarn; l l l. Prefix;Vendor;Class \_;\_;\_ diff --git a/lib/meta-flow.xml b/lib/meta-flow.xml index bdd12f6a7bb..416ea0cf224 100644 --- a/lib/meta-flow.xml +++ b/lib/meta-flow.xml @@ -3517,6 +3517,7 @@ actions=clone(load:0->NXM_OF_IN_PORT[],output:123)

      +nowarn; r r r r r. Criteria OpenFlow 1.0 OpenFlow 1.1 OpenFlow 1.2+ NXM \_ \_ \_ \_ \_ From 6180fefa835c7cad36e89f77f3d9de13c680fb88 Mon Sep 17 00:00:00 2001 From: Colin Watson Date: Mon, 21 Aug 2023 15:53:34 +0200 Subject: [PATCH 361/833] docs: Run tbl preprocessor in manpage-check rule. If we omit this, groff 1.23.0 warns: tbl preprocessor failed, or it or soelim was not run; table(s) likely not rendered (TE macro called with TW register undefined) Reported-by: Lucas Nussbaum Reported-at: https://bugs.debian.org/1042358 Signed-off-by: Colin Watson Signed-off-by: Ilya Maximets --- Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.am b/Makefile.am index db341504d37..265cf0a7b52 100644 --- a/Makefile.am +++ b/Makefile.am @@ -368,7 +368,7 @@ ALL_LOCAL += manpage-check manpage-check: $(man_MANS) $(dist_man_MANS) $(noinst_man_MANS) @error=false; \ for manpage in $?; do \ - LANG=en_US.UTF-8 groff -w mac -w delim -w escape -w input -w missing -w tab -T utf8 -man -p -z $$manpage >$@.tmp 2>&1; \ + LANG=en_US.UTF-8 groff -t -w mac -w delim -w escape -w input -w missing -w tab -T utf8 -man -p -z $$manpage >$@.tmp 2>&1; \ if grep warning: $@.tmp; then error=:; fi; \ rm -f $@.tmp; \ done; \ From eb344e0be418b4695a8a5603a590a266eaecb13a Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 23 Aug 2023 14:04:00 +0200 Subject: [PATCH 362/833] AUTHORS: Add Colin Watson and Lucas Nussbaum. 
Signed-off-by: Ilya Maximets --- AUTHORS.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 8df89f9227a..0821ecaa0f4 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -108,6 +108,7 @@ Chuck Short zulcss@ubuntu.com Cian Ferriter cian.ferriter@intel.com Ciara Loftus ciara.loftus@intel.com Clint Byrum clint@fewbar.com +Colin Watson cjwatson@ubuntu.com Cong Wang amwang@redhat.com Conner Herriges conner.herriges@ibm.com Damien Millescamps damien.millescamps@6wind.com @@ -658,6 +659,7 @@ Len Gao leng@vmware.com Linhaifeng haifeng.lin@huawei.com Logan Rosen logatronico@gmail.com Luca Falavigna dktrkranz@debian.org +Lucas Nussbaum lucas@debian.org Luiz Henrique Ozaki luiz.ozaki@gmail.com Madhu Venugopal mavenugo@gmail.com Malvika Gupta malvika.gupta@arm.com From 57cccb0763fa69adcb026b2450d0bc09def13fba Mon Sep 17 00:00:00 2001 From: Frode Nordahl Date: Tue, 22 Aug 2023 10:40:15 +0200 Subject: [PATCH 363/833] fatal-signal: Drop logging of failed dummy backtrace. Some systems may provide backtrace() in libc but for some reason not provide any frames when attempting to use it. On those systems the fatal_signal_init() function currently logs this debug message: "Capturing of dummy backtrace has failed." A consequence of this logging may be false negative test results. Logging the fact that backtrace() does not work has limited value on a production system and I propose we drop it. 
Fixes: 759a29dc2d97 ("backtrace: Extend the backtrace functionality.") Reported-at: https://launchpad.net/bugs/2032623 Acked-by: Ales Musil Signed-off-by: Frode Nordahl Signed-off-by: Ilya Maximets --- lib/fatal-signal.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lib/fatal-signal.c b/lib/fatal-signal.c index 77f0c87dd48..95315007478 100644 --- a/lib/fatal-signal.c +++ b/lib/fatal-signal.c @@ -138,10 +138,6 @@ fatal_signal_init(void) backtrace_capture(&dummy_bt); - if (!dummy_bt.n_frames) { - VLOG_DBG("Capturing of dummy backtrace has failed."); - } - fatal_signal_create_wakeup_events(); #ifdef _WIN32 From 9842d89e58e801b6b3a92ac079688b99b5669587 Mon Sep 17 00:00:00 2001 From: Colin Watson Date: Thu, 24 Aug 2023 10:31:56 +0200 Subject: [PATCH 364/833] docs: Fix rendering of VLAN Comparison Chart. tbl defaults to expecting table entries to be separated by tab characters. However, commit 5a0e4aec1af5cf7741c490bce704577e51e536b9 converted these to spaces and inadvertently broke the rendering. Use semicolons as separators instead; these are less prone to being broken by tree-wide changes, and match the style used by build-aux/extract-ofp-fields. Fixes: 5a0e4aec1af5 ("treewide: Convert leading tabs to spaces.") Reported-by: Lucas Nussbaum Reported-at: https://bugs.debian.org/1042358 Co-authored-by: Frode Nordahl Signed-off-by: Frode Nordahl Signed-off-by: Colin Watson Signed-off-by: Ilya Maximets --- lib/meta-flow.xml | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/lib/meta-flow.xml b/lib/meta-flow.xml index 416ea0cf224..ac72a44bce4 100644 --- a/lib/meta-flow.xml +++ b/lib/meta-flow.xml @@ -3517,24 +3517,24 @@ actions=clone(load:0->NXM_OF_IN_PORT[],output:123)

      -nowarn; +tab(;); r r r r r. -Criteria OpenFlow 1.0 OpenFlow 1.1 OpenFlow 1.2+ NXM -\_ \_ \_ \_ \_ -[1] \fL????\fR/\fL1\fR,\fL??\fR/\fL?\fR \fL????\fR/\fL1\fR,\fL??\fR/\fL?\fR \fL0000\fR/\fL0000\fR,\fL--\fR \fL0000\fR/\fL0000\fR -[2] \fLffff\fR/\fL0\fR,\fL??\fR/\fL?\fR \fLffff\fR/\fL0\fR,\fL??\fR/\fL?\fR \fL0000\fR/\fLffff\fR,\fL--\fR \fL0000\fR/\fLffff\fR -[3] \fL0xxx\fR/\fL0\fR,\fL??\fR/\fL1\fR \fL0xxx\fR/\fL0\fR,\fL??\fR/\fL1\fR \fL1xxx\fR/\fLffff\fR,\fL--\fR \fL1xxx\fR/\fL1fff\fR -[4] \fL????\fR/\fL1\fR,\fL0y\fR/\fL0\fR \fLfffe\fR/\fL0\fR,\fL0y\fR/\fL0\fR \fL1000\fR/\fL1000\fR,\fL0y\fR \fLz000\fR/\fLf000\fR -[5] \fL0xxx\fR/\fL0\fR,\fL0y\fR/\fL0\fR \fL0xxx\fR/\fL0\fR,\fL0y\fR/\fL0\fR \fL1xxx\fR/\fLffff\fR,\fL0y\fR \fLzxxx\fR/\fLffff\fR +Criteria;OpenFlow 1.0;OpenFlow 1.1;OpenFlow 1.2+;NXM +\_;\_;\_;\_;\_ +[1];\fL????\fR/\fL1\fR,\fL??\fR/\fL?\fR;\fL????\fR/\fL1\fR,\fL??\fR/\fL?\fR;\fL0000\fR/\fL0000\fR,\fL--\fR;\fL0000\fR/\fL0000\fR +[2];\fLffff\fR/\fL0\fR,\fL??\fR/\fL?\fR;\fLffff\fR/\fL0\fR,\fL??\fR/\fL?\fR;\fL0000\fR/\fLffff\fR,\fL--\fR;\fL0000\fR/\fLffff\fR +[3];\fL0xxx\fR/\fL0\fR,\fL??\fR/\fL1\fR;\fL0xxx\fR/\fL0\fR,\fL??\fR/\fL1\fR;\fL1xxx\fR/\fLffff\fR,\fL--\fR;\fL1xxx\fR/\fL1fff\fR +[4];\fL????\fR/\fL1\fR,\fL0y\fR/\fL0\fR;\fLfffe\fR/\fL0\fR,\fL0y\fR/\fL0\fR;\fL1000\fR/\fL1000\fR,\fL0y\fR;\fLz000\fR/\fLf000\fR +[5];\fL0xxx\fR/\fL0\fR,\fL0y\fR/\fL0\fR;\fL0xxx\fR/\fL0\fR,\fL0y\fR/\fL0\fR;\fL1xxx\fR/\fLffff\fR,\fL0y\fR;\fLzxxx\fR/\fLffff\fR .T& -r r c c r. -[6] (none) (none) \fL1001\fR/\fL1001\fR,\fL--\fR \fL1001\fR/\fL1001\fR +r c c r r. +[6];(none);(none);\fL1001\fR/\fL1001\fR,\fL--\fR;\fL1001\fR/\fL1001\fR .T& -r r c c c. -[7] (none) (none) (none) \fL3000\fR/\fL3000\fR -[8] (none) (none) (none) \fL0000\fR/\fL0fff\fR -[9] (none) (none) (none) \fL0000\fR/\fLf000\fR -[10] (none) (none) (none) \fL0000\fR/\fLefff\fR +r c c c r. 
+[7];(none);(none);(none);\fL3000\fR/\fL3000\fR +[8];(none);(none);(none);\fL0000\fR/\fL0fff\fR +[9];(none);(none);(none);\fL0000\fR/\fLf000\fR +[10];(none);(none);(none);\fL0000\fR/\fLefff\fR

      From 0e98b99240dec1fa0430205676a42bd484d24f16 Mon Sep 17 00:00:00 2001 From: Antonin Bas Date: Wed, 16 Aug 2023 17:30:59 -0700 Subject: [PATCH 365/833] doc: Fix description of max_len for controller action. From: Antonin Bas Since Open vSwitch 2.7, the max_len option has no effect, and the full packet is always sent to controllers. This was confirmed with both the kernel and netdev datapaths. Reported-by: Antonin Bas Reported-at: https://github.com/openvswitch/ovs-issues/issues/295 Acked-by: Simon Horman Signed-off-by: Antonin Bas Signed-off-by: Ilya Maximets --- Documentation/ref/ovs-actions.7.rst | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Documentation/ref/ovs-actions.7.rst b/Documentation/ref/ovs-actions.7.rst index d1389565564..36adcc5db2d 100644 --- a/Documentation/ref/ovs-actions.7.rst +++ b/Documentation/ref/ovs-actions.7.rst @@ -694,7 +694,8 @@ encapsulated in an OpenFlow ``packet-in`` message. The supported options are: Limit to *max_len* the number of bytes of the packet to send in the ``packet-in.`` A *max_len* of 0 prevents any of the packet from being sent (thus, only metadata is included). By default, the entire packet is - sent, equivalent to a *max_len* of 65535. + sent, equivalent to a *max_len* of 65535. This option has no effect in + Open vSwitch 2.7 and later: the entire packet will always be sent. ``reason=``\ *reason* Specify *reason* as the reason for sending the message in the @@ -733,6 +734,12 @@ encapsulated in an OpenFlow ``packet-in`` message. The supported options are: options require the Open vSwitch ``NXAST_CONTROLLER`` extension action added in Open vSwitch 1.6. + Open vSwitch 2.7 and later is configured to not buffer packets for the + packet-in event. As a result, the full packet is always sent to + controllers. This means that the ``max_len`` option has no effect on the + ``controller`` action, and all values (even 0) are equivalent to the default + value of 65535.
+ The ``enqueue`` action ---------------------- From 785e22f876b5bf70c403ee019897e890d7f59bbc Mon Sep 17 00:00:00 2001 From: Zhiqi Chen Date: Sun, 13 Aug 2023 17:08:14 +0800 Subject: [PATCH 366/833] dpif-netdev: Fix length calculation of netdev_flow_key. The 'len' of a netdev_flow_key initialized by netdev_flow_key_init() is always zero, which may cause errors when cloning a netdev_flow_key by netdev_flow_key_clone(). Currently the 'len' member of a netdev_flow_key initialized by netdev_flow_key_init() is not used, so this error will not cause any bad behavior for now. Fixes: c82f496c3b69 ("dpif-netdev: Use unmasked key when adding datapath flows.") Acked-by: Eelco Chaudron Signed-off-by: Zhiqi Chen Signed-off-by: Ilya Maximets --- lib/dpif-netdev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 9730e0eecc9..157694bcf0e 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -3380,14 +3380,13 @@ static inline void netdev_flow_key_init(struct netdev_flow_key *key, const struct flow *flow) { - uint64_t *dst = miniflow_values(&key->mf); uint32_t hash = 0; uint64_t value; miniflow_map_init(&key->mf, flow); miniflow_init(&key->mf, flow); - size_t n = dst - miniflow_get_values(&key->mf); + size_t n = miniflow_n_values(&key->mf); FLOW_FOR_EACH_IN_MAPS (value, flow, key->mf.map) { hash = hash_add64(hash, value); From bb0dd1135ba9aa52bf44d62fa36ab5d2d70c9cf0 Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Wed, 23 Aug 2023 16:29:00 +0200 Subject: [PATCH 367/833] python: Rename build related code to ovs_build_helpers. The python/build folder contents are completely unrelated to the ovs python bindings. These files are only used during the build for various subsystems (docs, man pages, code generation, etc.). Having that folder in that location prevents from running: cd python && python3 -m build Which is a way to generate PEP517 compatible source archives and binary wheel packages.
Rename that folder to ovs_build_helpers which is more explicit. Update all imports accordingly. Link: https://peps.python.org/pep-0517/ Link: https://pypi.org/project/build/ Signed-off-by: Robin Jarry Signed-off-by: Ilya Maximets --- Makefile.am | 2 +- build-aux/extract-ofp-fields | 12 ++++++------ build-aux/gen_ofp_field_decoders | 4 ++-- build-aux/sodepends.py | 3 ++- build-aux/soexpand.py | 3 ++- build-aux/xml2nroff | 10 +++++----- ovsdb/ovsdb-doc | 2 +- python/automake.mk | 16 ++++++++-------- python/{build => ovs_build_helpers}/__init__.py | 0 .../extract_ofp_fields.py | 0 python/{build => ovs_build_helpers}/nroff.py | 0 python/{build => ovs_build_helpers}/soutil.py | 0 12 files changed, 27 insertions(+), 25 deletions(-) rename python/{build => ovs_build_helpers}/__init__.py (100%) rename python/{build => ovs_build_helpers}/extract_ofp_fields.py (100%) rename python/{build => ovs_build_helpers}/nroff.py (100%) rename python/{build => ovs_build_helpers}/soutil.py (100%) diff --git a/Makefile.am b/Makefile.am index 265cf0a7b52..439e2bf6d53 100644 --- a/Makefile.am +++ b/Makefile.am @@ -415,7 +415,7 @@ endif CLEANFILES += flake8-check -include manpages.mk -manpages.mk: $(MAN_ROOTS) build-aux/sodepends.py python/build/soutil.py +manpages.mk: $(MAN_ROOTS) build-aux/sodepends.py python/ovs_build_helpers/soutil.py @PYTHONPATH=$$PYTHONPATH$(psep)$(srcdir)/python $(PYTHON3) $(srcdir)/build-aux/sodepends.py -I. -I$(srcdir) $(MAN_ROOTS) >$(@F).tmp @if cmp -s $(@F).tmp $@; then \ touch $@; \ diff --git a/build-aux/extract-ofp-fields b/build-aux/extract-ofp-fields index 6366bd44618..05d3e1df36b 100755 --- a/build-aux/extract-ofp-fields +++ b/build-aux/extract-ofp-fields @@ -4,9 +4,9 @@ import getopt import sys import os.path import xml.dom.minidom -import build.nroff -from build.extract_ofp_fields import ( +from ovs_build_helpers import nroff +from ovs_build_helpers.extract_ofp_fields import ( extract_ofp_fields, PREREQS, OXM_CLASSES, @@ -297,7 +297,7 @@ l lx. 
body += [".TE\n"] body += [".PP\n"] - body += [build.nroff.block_xml_to_nroff(field_node.childNodes)] + body += [nroff.block_xml_to_nroff(field_node.childNodes)] def group_xml_to_nroff(group_node, fields): @@ -310,11 +310,11 @@ def group_xml_to_nroff(group_node, fields): id_ = node.attributes["id"].nodeValue field_to_xml(node, fields[id_], body, summary) else: - body += [build.nroff.block_xml_to_nroff([node])] + body += [nroff.block_xml_to_nroff([node])] content = [ ".bp\n", - '.SH "%s"\n' % build.nroff.text_to_nroff(title.upper() + " FIELDS"), + '.SH "%s"\n' % nroff.text_to_nroff(title.upper() + " FIELDS"), '.SS "Summary:"\n', ".TS\n", "tab(;),nowarn;\n", @@ -422,7 +422,7 @@ ovs\-fields \- protocol header fields in OpenFlow and Open vSwitch elif node.nodeType == node.COMMENT_NODE: pass else: - s += build.nroff.block_xml_to_nroff([node]) + s += nroff.block_xml_to_nroff([node]) for f in fields: if "used" not in f: diff --git a/build-aux/gen_ofp_field_decoders b/build-aux/gen_ofp_field_decoders index 0b797ee8c8c..0cb6108c222 100755 --- a/build-aux/gen_ofp_field_decoders +++ b/build-aux/gen_ofp_field_decoders @@ -2,7 +2,7 @@ import argparse -import build.extract_ofp_fields as extract_fields +from ovs_build_helpers.extract_ofp_fields import extract_ofp_fields def main(): @@ -19,7 +19,7 @@ def main(): args = parser.parse_args() - fields = extract_fields.extract_ofp_fields(args.metaflow) + fields = extract_ofp_fields(args.metaflow) field_decoders = {} aliases = {} diff --git a/build-aux/sodepends.py b/build-aux/sodepends.py index 45812bcbd70..ac8dd61a4b2 100755 --- a/build-aux/sodepends.py +++ b/build-aux/sodepends.py @@ -14,9 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from build import soutil import sys +from ovs_build_helpers import soutil + def sodepends(include_dirs, filenames, dst): ok = True diff --git a/build-aux/soexpand.py b/build-aux/soexpand.py index 00adcf47a35..7d4dc0486a9 100755 --- a/build-aux/soexpand.py +++ b/build-aux/soexpand.py @@ -14,9 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from build import soutil import sys +from ovs_build_helpers import soutil + def soexpand(include_dirs, src, dst): ok = True diff --git a/build-aux/xml2nroff b/build-aux/xml2nroff index ee5553f4564..3e937910bed 100755 --- a/build-aux/xml2nroff +++ b/build-aux/xml2nroff @@ -18,7 +18,7 @@ import getopt import sys import xml.dom.minidom -import build.nroff +from ovs_build_helpers import nroff argv0 = sys.argv[0] @@ -90,10 +90,10 @@ def manpage_to_nroff(xml_file, subst, include_path, version=None): . I "\\$1" . RE .. -''' % (build.nroff.text_to_nroff(program), build.nroff.text_to_nroff(section), - build.nroff.text_to_nroff(title), build.nroff.text_to_nroff(version)) +''' % (nroff.text_to_nroff(program), nroff.text_to_nroff(section), + nroff.text_to_nroff(title), nroff.text_to_nroff(version)) - s += build.nroff.block_xml_to_nroff(doc.childNodes) + "\n" + s += nroff.block_xml_to_nroff(doc.childNodes) + "\n" return s @@ -139,7 +139,7 @@ if __name__ == "__main__": try: s = manpage_to_nroff(args[0], subst, include_path, version) - except build.nroff.error.Error as e: + except nroff.error.Error as e: sys.stderr.write("%s: %s\n" % (argv0, e.msg)) sys.exit(1) for line in s.splitlines(): diff --git a/ovsdb/ovsdb-doc b/ovsdb/ovsdb-doc index 10d0c0c1343..099770d253f 100755 --- a/ovsdb/ovsdb-doc +++ b/ovsdb/ovsdb-doc @@ -24,7 +24,7 @@ import ovs.json from ovs.db import error import ovs.db.schema -from build.nroff import * +from ovs_build_helpers.nroff import * argv0 = sys.argv[0] diff --git a/python/automake.mk b/python/automake.mk index 82a50878741..6c7ac84b9c4 100644 --- 
a/python/automake.mk +++ b/python/automake.mk @@ -66,10 +66,10 @@ ovs_pytests = \ # These python files are used at build time but not runtime, # so they are not installed. EXTRA_DIST += \ - python/build/__init__.py \ - python/build/extract_ofp_fields.py \ - python/build/nroff.py \ - python/build/soutil.py + python/ovs_build_helpers/__init__.py \ + python/ovs_build_helpers/extract_ofp_fields.py \ + python/ovs_build_helpers/nroff.py \ + python/ovs_build_helpers/soutil.py # PyPI support. EXTRA_DIST += \ @@ -88,10 +88,10 @@ PYCOV_CLEAN_FILES += $(PYFILES:.py=.py,cover) FLAKE8_PYFILES += \ $(filter-out python/ovs/compat/% python/ovs/dirs.py,$(PYFILES)) \ - python/build/__init__.py \ - python/build/extract_ofp_fields.py \ - python/build/nroff.py \ - python/build/soutil.py \ + python/ovs_build_helpers/__init__.py \ + python/ovs_build_helpers/extract_ofp_fields.py \ + python/ovs_build_helpers/nroff.py \ + python/ovs_build_helpers/soutil.py \ python/ovs/dirs.py.template \ python/setup.py diff --git a/python/build/__init__.py b/python/ovs_build_helpers/__init__.py similarity index 100% rename from python/build/__init__.py rename to python/ovs_build_helpers/__init__.py diff --git a/python/build/extract_ofp_fields.py b/python/ovs_build_helpers/extract_ofp_fields.py similarity index 100% rename from python/build/extract_ofp_fields.py rename to python/ovs_build_helpers/extract_ofp_fields.py diff --git a/python/build/nroff.py b/python/ovs_build_helpers/nroff.py similarity index 100% rename from python/build/nroff.py rename to python/ovs_build_helpers/nroff.py diff --git a/python/build/soutil.py b/python/ovs_build_helpers/soutil.py similarity index 100% rename from python/build/soutil.py rename to python/ovs_build_helpers/soutil.py From f1983a508b38868ad3fc0be74686f8f914c95b9c Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Wed, 23 Aug 2023 16:29:01 +0200 Subject: [PATCH 368/833] python: Use twine to upload sdist package to pypi.org. setup.py upload is now deprecated. 
When used, pypi.org returns an error: Upload failed (400): Invalid value for blake2_256_digest. Error: Use a valid, hex-encoded, BLAKE2 message digest. Use twine which is the recommended replacement tool to upload on pypi.org. Link: https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html#summary Reported-by: Terry Wilson Signed-off-by: Robin Jarry Signed-off-by: Ilya Maximets --- python/automake.mk | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/automake.mk b/python/automake.mk index 6c7ac84b9c4..d9f6803691d 100644 --- a/python/automake.mk +++ b/python/automake.mk @@ -112,11 +112,14 @@ ovs-install-data-local: $(INSTALL_DATA) python/ovs/dirs.py.tmp $(DESTDIR)$(pkgdatadir)/python/ovs/dirs.py rm python/ovs/dirs.py.tmp +.PHONY: python-sdist python-sdist: $(srcdir)/python/ovs/version.py $(ovs_pyfiles) python/ovs/dirs.py (cd python/ && $(PYTHON3) setup.py sdist) -pypi-upload: $(srcdir)/python/ovs/version.py $(ovs_pyfiles) python/ovs/dirs.py - (cd python/ && $(PYTHON3) setup.py sdist upload) +.PHONY: pypi-upload +pypi-upload: python-sdist + twine upload python/dist/ovs-$(VERSION).tar.gz + install-data-local: ovs-install-data-local UNINSTALL_LOCAL += ovs-uninstall-local From bf7e53bb571764c72212d708a2a3f320470132fe Mon Sep 17 00:00:00 2001 From: Robin Jarry Date: Wed, 23 Aug 2023 16:29:02 +0200 Subject: [PATCH 369/833] python: Use build to generate PEP517 compatible archives. Quoting Paul Ganssle, setuptools maintainer: * The setuptools project has stopped maintaining all direct invocations of setup.py years ago, and distutils is deprecated. There are undoubtedly many ways that your setup.py-based system is broken today, even if it's not failing loudly or obviously. * Direct invocations of setup.py cannot bootstrap their own dependencies, and so some CLI is necessary for dependency management. 
* The setuptools project no longer wants to provide any public CLI, and will be actively removing the existing interface (though the time scale for this is long). * PEP 517, 518 and other standards-based packaging are the future of the Python ecosystem and a lot of progress has been made on making this upgrade seamless. As described in the recommendations in the end of the article: `python3 setup.py sdist` should be replaced by `python3 -m build --sdist`. Link: https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html#summary Signed-off-by: Robin Jarry Signed-off-by: Ilya Maximets --- python/automake.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/automake.mk b/python/automake.mk index d9f6803691d..84cf2eab57e 100644 --- a/python/automake.mk +++ b/python/automake.mk @@ -114,7 +114,7 @@ ovs-install-data-local: .PHONY: python-sdist python-sdist: $(srcdir)/python/ovs/version.py $(ovs_pyfiles) python/ovs/dirs.py - (cd python/ && $(PYTHON3) setup.py sdist) + cd python/ && $(PYTHON3) -m build --sdist .PHONY: pypi-upload pypi-upload: python-sdist From 9b7e1a75378f806fcf782e0286d529028e6d62bf Mon Sep 17 00:00:00 2001 From: David Marchand Date: Mon, 28 Aug 2023 18:07:53 +0200 Subject: [PATCH 370/833] netdev-dpdk: Clear IP packet type when no offload is requested. OVS currently sets RTE_MBUF_F_TX_IPV[46] flags in early stages of the packet reception and keeps track of the IP packet type as the packet goes through OVS pipeline. When a packet leaves OVS and hits a DPDK driver, OVS may not request IP checksum offloading but leaves one of this packet type flag in ol_flags. The DPDK api describes that RTE_MBUF_F_TX_IPV4 must be set when requesting some Tx offloads (like RTE_MBUF_F_TX_IPSUM, RTE_MBUF_F_TX_TCP_CKSUM, .., RTE_MBUF_F_TX_TCP_SEG). Even though setting RTE_MBUF_F_TX_IPV4 without requesting a Tx offload is undefined, this can confuse some drivers (like net/iavf) which then reads zeroed l2_len and l3_len and ends up dropping the packet. 
Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2231081 Fixes: 5d11c47d3ebe ("userspace: Enable IP checksum offloading by default.") Acked-by: Mike Pattrick Signed-off-by: David Marchand Signed-off-by: Ilya Maximets --- lib/netdev-dpdk.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 8f1361e21f7..2f5a133184e 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -2431,6 +2431,7 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) if (!(mbuf->ol_flags & (RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_L4_MASK | RTE_MBUF_F_TX_TCP_SEG))) { + mbuf->ol_flags &= ~(RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IPV6); return true; } From d3bdc7c913050b79a1c88f01d7f9e774033cf5fa Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 28 Aug 2023 14:16:22 +0200 Subject: [PATCH 371/833] tests: Fix time dependency in overlapping flows modification test. On slow systems or at high testsuite concurrency sending 256 packets can take more than 10 seconds. This is causing expiration of one of the flows and a subsequent test failure. Use time warping instead to avoid the time dependency. Fixes: 21410ff800cc ("dpif-netdev: Fix dpif_netdev_flow_put.") Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2023-August/052623.html Reported-by: Sangeetha Elumalai Reviewed-by: Frode Nordahl Signed-off-by: Ilya Maximets --- tests/pmd.at | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/pmd.at b/tests/pmd.at index 7c333a901ba..7bdaca9e71f 100644 --- a/tests/pmd.at +++ b/tests/pmd.at @@ -1355,18 +1355,22 @@ AT_CHECK([echo 'table=0,in_port=p1,ip,nw_dst=10.1.0.0/16 actions=ct(commit)' | d ovs-ofctl --bundle replace-flows br0 -]) AT_CHECK([ovs-appctl revalidator/wait]) +dnl Prevent flows from expiring. 
+AT_CHECK([ovs-appctl time/stop]) + AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ipv4(src=10.0.0.1,dst=10.1.0.2,proto=6),tcp(src=1,dst=2)']) OVS_WAIT_UNTIL_EQUAL([ovs-appctl dpctl/dump-flows | sed 's/.*core: [[0-9]]*//' | strip_xout_keep_actions], [ recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=10.1.0.2/255.255.0.0,frag=no), packets:0, bytes:0, used:never, actions:ct(commit) recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=10.1.2.2/255.255.255.0,frag=no), packets:0, bytes:0, used:0.0s, actions:ct(commit)]) -dnl Hold the prefix 10.1.2.2/24 by another 10s. -AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ipv4(src=10.0.0.1,dst=10.1.2.2,proto=6),tcp(src=1,dst=2)']) dnl Send more 10.1.0.2 to make 10.1.0.0/16 tuple prepend 10.1.2.0/24 tuple in the pvector of subtables. for i in $(seq 0 256); do AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ipv4(src=10.0.0.1,dst=10.1.0.2,proto=6),tcp(src=1,dst=2)']) done +dnl Warp time enough to trigger subtable optimization. +AT_CHECK([ovs-appctl time/warp 500 2000], [0], [ignore]) + AT_CHECK([echo 'table=0,in_port=p1,ip,nw_dst=10.1.0.0/16 actions=p2' | dnl ovs-ofctl --bundle replace-flows br0 -]) From 28c0cec40627b6b6277debdc8a4d3e1ef14eb502 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 28 Aug 2023 19:12:25 +0200 Subject: [PATCH 372/833] configure: Avoid deprecated AC_PROG_CC_C99 if possible. autoconf 2.70 deprecated the AC_PROG_CC_C99 macro and the AC_PROG_CC was recommended for use instead. However, older versions of that suggested macro do not attempt enabling C99, so it is not a direct replacement. Autoconf 2.69 and older are still widely used in many distributions. Another difference is that AC_PROG_CC attempts to enable C11 in new versions of autoconf. But since we have CI jobs that check -std=c99 builds now, we can afford enabling C11 by default without risking compatibility issues. Fix a deprecation warning by using a new AC_PROG_CC macro with autoconf 2.70+. 
AC_PROG_CC and AC_PROG_CC_C99 seem to produce the same configuration script in autoconf 2.70+ anyway, so we're already kind of using a new macro on systems with a new autoconf. Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- configure.ac | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index c8708630e88..44c09b2ac4c 100644 --- a/configure.ac +++ b/configure.ac @@ -21,7 +21,11 @@ AC_CONFIG_HEADERS([config.h]) AC_CONFIG_TESTDIR([tests]) AM_INIT_AUTOMAKE([tar-pax]) -AC_PROG_CC_C99 +# AC_PROG_CC doesn't try enabling C99 in autoconf 2.69 and below, but +# AC_PROG_CC_C99 is deprecated in newer ones. In autoconf 2.70+ both +# will try enabling features up to C11. +m4_version_prereq([2.70], [AC_PROG_CC], [AC_PROG_CC_C99]) + AM_PROG_CC_C_O AC_PROG_CXX AC_PROG_CPP From 13b874f4fe89cc72e702f91183f678872ee6e88d Mon Sep 17 00:00:00 2001 From: David Marchand Date: Wed, 30 Aug 2023 10:16:33 +0200 Subject: [PATCH 373/833] tests/mfex: Don't require python cryptography. Tests using mfex_fuzzy.py will fail on systems lacking the Python cryptography library. cryptography is not required, it is only imported in mfex_fuzzy.py to silence some warnings when scapy tries to load some libraries.
Fixes: c3ed0bf34b8a ("tests/mfex: Silence Blowfish/CAST5 deprecation warnings.") Acked-by: Eelco Chaudron Signed-off-by: David Marchand Signed-off-by: Ilya Maximets --- tests/mfex_fuzzy.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tests/mfex_fuzzy.py b/tests/mfex_fuzzy.py index 30028ba7a04..50b9870641d 100755 --- a/tests/mfex_fuzzy.py +++ b/tests/mfex_fuzzy.py @@ -3,12 +3,15 @@ import sys import warnings -from cryptography.utils import CryptographyDeprecationWarning -warnings.filterwarnings( - "ignore", - category=CryptographyDeprecationWarning, - message=r"(blowfish|cast5)", -) +try: + from cryptography.utils import CryptographyDeprecationWarning + warnings.filterwarnings( + "ignore", + category=CryptographyDeprecationWarning, + message=r"(blowfish|cast5)", + ) +except ModuleNotFoundError: + pass # flake8: noqa: E402 from scapy.all import RandMAC, RandIP, PcapWriter, RandIP6, RandShort, fuzz From bb61931dc5467e3bd24ac6b2877de6f9b6ff5d87 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Wed, 30 Aug 2023 10:28:16 +0200 Subject: [PATCH 374/833] netdev-dpdk: Disable net/tap Tx L4 checksum offloads. As reported by Ales when doing some OVN integration tests with OVS 3.2, net/tap has broken L4 checksum offloads. Fixes are pending on DPDK side. Until they get in a LTS release used by OVS, disable those Tx offloads. Acked-by: Eelco Chaudron Signed-off-by: David Marchand Signed-off-by: Ilya Maximets --- lib/netdev-dpdk.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 2f5a133184e..55700250df2 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -1312,6 +1312,16 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev) dev->hw_ol_features &= ~NETDEV_RX_HW_SCATTER; } + if (!strcmp(info.driver_name, "net_tap")) { + /* FIXME: L4 checksum offloading is broken in DPDK net/tap driver. + * This workaround can be removed once the fix makes it to a DPDK + * LTS release used by OVS. 
*/ + VLOG_INFO("%s: disabled Tx L4 checksum offloads for a net/tap port.", + netdev_get_name(&dev->up)); + info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_UDP_CKSUM; + info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_TCP_CKSUM; + } + if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_IPV4_CKSUM) { dev->hw_ol_features |= NETDEV_TX_IPV4_CKSUM_OFFLOAD; } else { From 85634fd58004c06bbb5ab1b0d21fb7e3fa7980e0 Mon Sep 17 00:00:00 2001 From: Han Zhou Date: Tue, 18 Jul 2023 02:38:26 -0700 Subject: [PATCH 375/833] ovsdb: raft: Support pre-vote mechanism to deal with disruptive server. When a server becomes unstable due to system overloading or intermittent partitioning, it may miss some heartbeats and then starts election with a new term, which would disrupt the otherwise healthy cluster formed by the rest of the healthy nodes. Such situation may exist for a long time until the "flapping" server is shutdown or recovered completely, which can severely impact the availability of the cluster. The pre-vote mechanism introduced in the raft paper section 9.6 can prevent such problems. This patch implements the pre-vote mechanism. Note: during the upgrade, since the old version doesn't recognize the new optional field in the vote rpc (and the ovsdb_parse_finish validates that all fields in the jsonrpc are parsed), an error log may be noticed on old nodes if an upgraded node happens to become candidate first and vote for itself, and the vote request will be discarded. If this happens before enough nodes complete the upgrade, the vote from the upgraded node may not reach the quorum. This results in re-election, and any old nodes should be able to vote and get elected as leader. So, in unlucky cases there can be more leader elections happening during the upgrade. 
Reviewed-by: Simon Horman Signed-off-by: Han Zhou Signed-off-by: Ilya Maximets --- NEWS | 6 +++ ovsdb/raft-rpc.c | 21 +++++++++- ovsdb/raft-rpc.h | 3 ++ ovsdb/raft.c | 88 ++++++++++++++++++++++++++++++------------ tests/ovsdb-cluster.at | 43 +++++++++++++++++++++ 5 files changed, 135 insertions(+), 26 deletions(-) diff --git a/NEWS b/NEWS index b582bdbbc78..6b45492f1b7 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,11 @@ Post-v3.2.0 -------------------- + - OVSDB: + * Support pre-vote mechanism in RAFT that protects the cluster against + disruptive servers (section 9.6 of the original RAFT paper). Upgrading + from older version is supported but it may trigger more leader elections + during the process, and error logs complaining unrecognized fields may + be observed on old nodes. v3.2.0 - 17 Aug 2023 diff --git a/ovsdb/raft-rpc.c b/ovsdb/raft-rpc.c index dd14d81091f..27c3aad99c4 100644 --- a/ovsdb/raft-rpc.c +++ b/ovsdb/raft-rpc.c @@ -283,6 +283,9 @@ raft_vote_request_to_jsonrpc(const struct raft_vote_request *rq, json_object_put(args, "leadership_transfer", json_boolean_create(true)); } + if (rq->is_prevote) { + json_object_put(args, "is_prevote", json_boolean_create(true)); + } } static void @@ -294,6 +297,8 @@ raft_vote_request_from_jsonrpc(struct ovsdb_parser *p, rq->last_log_term = raft_parse_required_uint64(p, "last_log_term"); rq->leadership_transfer = raft_parse_optional_boolean(p, "leadership_transfer") == 1; + rq->is_prevote + = raft_parse_optional_boolean(p, "is_prevote") == 1; } static void @@ -305,6 +310,9 @@ raft_format_vote_request(const struct raft_vote_request *rq, struct ds *s) if (rq->leadership_transfer) { ds_put_cstr(s, " leadership_transfer=true"); } + if (rq->is_prevote) { + ds_put_cstr(s, " is_prevote=true"); + } } /* raft_vote_reply. 
*/ @@ -326,6 +334,9 @@ raft_vote_reply_to_jsonrpc(const struct raft_vote_reply *rpy, { raft_put_uint64(args, "term", rpy->term); json_object_put_format(args, "vote", UUID_FMT, UUID_ARGS(&rpy->vote)); + if (rpy->is_prevote) { + json_object_put(args, "is_prevote", json_boolean_create(true)); + } } static void @@ -334,6 +345,7 @@ raft_vote_reply_from_jsonrpc(struct ovsdb_parser *p, { rpy->term = raft_parse_required_uint64(p, "term"); rpy->vote = raft_parse_required_uuid(p, "vote"); + rpy->is_prevote = raft_parse_optional_boolean(p, "is_prevote") == 1; } static void @@ -341,6 +353,9 @@ raft_format_vote_reply(const struct raft_vote_reply *rpy, struct ds *s) { ds_put_format(s, " term=%"PRIu64, rpy->term); ds_put_format(s, " vote="SID_FMT, SID_ARGS(&rpy->vote)); + if (rpy->is_prevote) { + ds_put_cstr(s, " is_prevote=true"); + } } /* raft_add_server_request */ @@ -1007,8 +1022,10 @@ raft_rpc_get_vote(const union raft_rpc *rpc) case RAFT_RPC_BECOME_LEADER: return NULL; - case RAFT_RPC_VOTE_REPLY: - return &raft_vote_reply_cast(rpc)->vote; + case RAFT_RPC_VOTE_REPLY: { + const struct raft_vote_reply *rpy = raft_vote_reply_cast(rpc); + return rpy->is_prevote ? NULL : &rpy->vote; + } default: OVS_NOT_REACHED(); diff --git a/ovsdb/raft-rpc.h b/ovsdb/raft-rpc.h index 221f24d0012..7677c35b4e0 100644 --- a/ovsdb/raft-rpc.h +++ b/ovsdb/raft-rpc.h @@ -125,12 +125,15 @@ struct raft_vote_request { uint64_t last_log_index; /* Index of candidate's last log entry. */ uint64_t last_log_term; /* Term of candidate's last log entry. */ bool leadership_transfer; /* True to override minimum election timeout. */ + bool is_prevote; /* True: pre-vote; False: real vote (default). */ }; struct raft_vote_reply { struct raft_rpc_common common; uint64_t term; /* Current term, for candidate to update itself. */ struct uuid vote; /* Server ID of vote. */ + bool is_prevote; /* Copy of the is_prevote from the request, + * primarily for validation. 
*/ }; struct raft_add_server_request { diff --git a/ovsdb/raft.c b/ovsdb/raft.c index b2361b1737a..8effd9ad1ad 100644 --- a/ovsdb/raft.c +++ b/ovsdb/raft.c @@ -305,6 +305,11 @@ struct raft { /* Candidates only. Reinitialized at start of election. */ int n_votes; /* Number of votes for me. */ + bool prevote_passed; /* Indicates if it passed pre-vote phase. + * Pre-vote mechanism is introduced in raft + * paper section 9.6. We implement it as a + * sub-state of candidate to minimize the + * change and keep backward compatibility. */ /* Followers and candidates only. */ bool candidate_retrying; /* The earlier election timed-out and we are @@ -372,7 +377,8 @@ static void raft_become_follower(struct raft *); static void raft_reset_election_timer(struct raft *); static void raft_reset_ping_timer(struct raft *); static void raft_send_heartbeats(struct raft *); -static void raft_start_election(struct raft *, bool leadership_transfer); +static void raft_start_election(struct raft *, bool is_prevote, + bool leadership_transfer); static bool raft_truncate(struct raft *, uint64_t new_end); static void raft_get_servers_from_log(struct raft *, enum vlog_level); static void raft_get_election_timer_from_log(struct raft *); @@ -1069,7 +1075,8 @@ raft_open(struct ovsdb_log *log, struct raft **raftp) /* If there's only one server, start an election right away so that the * cluster bootstraps quickly. */ if (hmap_count(&raft->servers) == 1) { - raft_start_election(raft, false); + /* No pre-vote needed since we are the only one. 
*/ + raft_start_election(raft, false, false); } } else { raft->join_timeout = time_msec() + 1000; @@ -1360,7 +1367,7 @@ void raft_take_leadership(struct raft *raft) { if (raft->role != RAFT_LEADER) { - raft_start_election(raft, true); + raft_start_election(raft, false, true); } } @@ -1766,12 +1773,12 @@ raft_set_term(struct raft *raft, uint64_t term, const struct uuid *vote) return true; } -static void +static bool raft_accept_vote(struct raft *raft, struct raft_server *s, const struct uuid *vote) { if (uuid_equals(&s->vote, vote)) { - return; + return false; } if (!uuid_is_zero(&s->vote)) { static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1); @@ -1785,13 +1792,18 @@ raft_accept_vote(struct raft *raft, struct raft_server *s, s->vote = *vote; if (uuid_equals(vote, &raft->sid) && ++raft->n_votes > hmap_count(&raft->servers) / 2) { - raft_become_leader(raft); + return true; } + return false; } static void -raft_start_election(struct raft *raft, bool leadership_transfer) +raft_start_election(struct raft *raft, bool is_prevote, + bool leadership_transfer) { + /* Leadership transfer doesn't use pre-vote. */ + ovs_assert(!is_prevote || !leadership_transfer); + if (raft->leaving) { return; } @@ -1801,7 +1813,7 @@ raft_start_election(struct raft *raft, bool leadership_transfer) return; } - if (!raft_set_term(raft, raft->term + 1, &raft->sid)) { + if (!is_prevote && !raft_set_term(raft, raft->term + 1, &raft->sid)) { return; } @@ -1809,26 +1821,33 @@ raft_start_election(struct raft *raft, bool leadership_transfer) raft->leader_sid = UUID_ZERO; raft->role = RAFT_CANDIDATE; - /* If there was no leader elected since last election, we know we are - * retrying now. */ - raft->candidate_retrying = !raft->had_leader; - raft->had_leader = false; + raft->prevote_passed = !is_prevote; + + if (is_prevote || leadership_transfer) { + /* If there was no leader elected since last election, we know we are + * retrying now. 
*/ + raft->candidate_retrying = !raft->had_leader; + raft->had_leader = false; + + raft->election_start = time_msec(); + raft->election_won = 0; + } raft->n_votes = 0; - raft->election_start = time_msec(); - raft->election_won = 0; raft->leadership_transfer = leadership_transfer; static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); if (!VLOG_DROP_INFO(&rl)) { long long int now = time_msec(); + char *comment = is_prevote ? "pre-vote" : "vote"; if (now >= raft->election_timeout) { VLOG_INFO("term %"PRIu64": %lld ms timeout expired, " - "starting election", - raft->term, now - raft->election_base); + "starting election (%s)", + raft->term, now - raft->election_base, comment); } else { - VLOG_INFO("term %"PRIu64": starting election", raft->term); + VLOG_INFO("term %"PRIu64": starting election (%s)", + raft->term, comment); } } raft_reset_election_timer(raft); @@ -1853,6 +1872,7 @@ raft_start_election(struct raft *raft, bool leadership_transfer) ? raft->entries[raft->log_end - raft->log_start - 1].term : raft->snap.term), .leadership_transfer = leadership_transfer, + .is_prevote = is_prevote, }, }; if (failure_test != FT_DONT_SEND_VOTE_REQUEST) { @@ -1861,7 +1881,13 @@ raft_start_election(struct raft *raft, bool leadership_transfer) } /* Vote for ourselves. */ - raft_accept_vote(raft, me, &raft->sid); + if (raft_accept_vote(raft, me, &raft->sid)) { + /* We just started vote, so it shouldn't be accepted yet unless this is + * a one-node cluster. In such case we don't do pre-vote, and become + * leader immediately. 
*/ + ovs_assert(!is_prevote); + raft_become_leader(raft); + } } static void @@ -2041,10 +2067,10 @@ raft_run(struct raft *raft) raft_reset_election_timer(raft); } else { raft_become_follower(raft); - raft_start_election(raft, false); + raft_start_election(raft, true, false); } } else { - raft_start_election(raft, false); + raft_start_election(raft, true, false); } } @@ -3673,6 +3699,10 @@ raft_handle_vote_request__(struct raft *raft, return false; } + if (rq->is_prevote) { + return true; + } + /* Record a vote for the peer. */ if (!raft_set_term(raft, raft->term, &rq->common.sid)) { return false; @@ -3685,7 +3715,7 @@ raft_handle_vote_request__(struct raft *raft, static void raft_send_vote_reply(struct raft *raft, const struct uuid *dst, - const struct uuid *vote) + const struct uuid *vote, bool is_prevote) { union raft_rpc rpy = { .vote_reply = { @@ -3695,6 +3725,7 @@ raft_send_vote_reply(struct raft *raft, const struct uuid *dst, }, .term = raft->term, .vote = *vote, + .is_prevote = is_prevote, }, }; raft_send(raft, &rpy); @@ -3705,7 +3736,9 @@ raft_handle_vote_request(struct raft *raft, const struct raft_vote_request *rq) { if (raft_handle_vote_request__(raft, rq)) { - raft_send_vote_reply(raft, &rq->common.sid, &raft->vote); + raft_send_vote_reply(raft, &rq->common.sid, + rq->is_prevote ? &rq->common.sid : &raft->vote, + rq->is_prevote); } } @@ -3723,7 +3756,14 @@ raft_handle_vote_reply(struct raft *raft, struct raft_server *s = raft_find_peer(raft, &rpy->common.sid); if (s) { - raft_accept_vote(raft, s, &rpy->vote); + if (raft_accept_vote(raft, s, &rpy->vote)) { + if (raft->prevote_passed) { + raft_become_leader(raft); + } else { + /* Start the real election. 
*/ + raft_start_election(raft, false, false); + } + } } } @@ -4357,7 +4397,7 @@ raft_handle_become_leader(struct raft *raft, VLOG_INFO("received leadership transfer from %s in term %"PRIu64, raft_get_nickname(raft, &rq->common.sid, buf, sizeof buf), rq->term); - raft_start_election(raft, true); + raft_start_election(raft, false, true); } } diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at index 9fbf5dc897f..3e8bca59a30 100644 --- a/tests/ovsdb-cluster.at +++ b/tests/ovsdb-cluster.at @@ -715,6 +715,49 @@ done AT_CLEANUP + +AT_SETUP([OVSDB cluster - disruptive server]) +AT_KEYWORDS([ovsdb server negative unix cluster disruptive]) + +n=3 +AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster \ + s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr]) +cid=$(ovsdb-tool db-cid s1.db) +schema_name=$(ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema) +for i in $(seq 2 $n); do + AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft]) +done + +on_exit 'kill $(cat *.pid)' +for i in $(seq $n); do + AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir \ + --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i \ + --remote=punix:s$i.ovsdb s$i.db]) +done +for i in $(seq $n); do + AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected]) +done + +# An unstable follower shouldn't disrupt the healthy cluster - shouldn't +# trigger term change. +AT_CHECK([ovs-appctl -t $(pwd)/s2 cluster/failure-test stop-raft-rpc], [0], [ignore]) +OVS_WAIT_UNTIL([ovs-appctl -t $(pwd)/s2 cluster/status $schema_name | grep "Role: candidate"]) +AT_CHECK([ovs-appctl -t $(pwd)/s2 cluster/failure-test clear], [0], [ignore]) + +# Should step back to follower. +OVS_WAIT_UNTIL([ovs-appctl -t $(pwd)/s2 cluster/status $schema_name | grep "Role: follower"]) + +# No term change. 
+for i in $(seq $n); do + AT_CHECK([ovs-appctl -t $(pwd)/s$i cluster/status $schema_name | grep "Term: 1"], [0], [ignore]) +done + +for i in $(seq $n); do + OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s$i], [s$i.pid]) +done + +AT_CLEANUP + AT_BANNER([OVSDB - cluster tests]) From 1116459b3ba813ca0e5f458f436309688dfcd1e5 Mon Sep 17 00:00:00 2001 From: Peng He Date: Wed, 30 Aug 2023 21:29:51 +0200 Subject: [PATCH 376/833] conntrack: Remove nat_conn introducing key directionality. The patch avoids the extra allocation for nat_conn. Currently, when doing NAT, the userspace conntrack will use an extra conn for the two directions in a flow. However, each conn has actually the two keys for both orig and rev directions. This patch introduces a key_node[CT_DIRS] member as per Aaron's suggestion in the conn which consists of a key, direction, and a cmap_node for hash lookup so addressing the feedback received by the original patch [0]. With this adjustment, we also remove the assertion that connections in the table are DEFAULT while updating connection state and/or removing connections. [0] https://patchwork.ozlabs.org/project/openvswitch/patch/20201129033255.64647-2-hepeng.0320@bytedance.com/ Reported-by: Michael Plato Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2022-September/052065.html Signed-off-by: Peng He Co-authored-by: Paolo Valerio Signed-off-by: Paolo Valerio Tested-by: Frode Nordahl Acked-by: Ilya Maximets Acked-by: Aaron Conole Signed-off-by: Aaron Conole --- lib/conntrack-private.h | 19 ++- lib/conntrack-tp.c | 6 +- lib/conntrack.c | 366 ++++++++++++++++------------------------ 3 files changed, 164 insertions(+), 227 deletions(-) diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h index bb326868e9d..3fd5fccd3eb 100644 --- a/lib/conntrack-private.h +++ b/lib/conntrack-private.h @@ -49,6 +49,12 @@ struct ct_endpoint { * hashing in ct_endpoint_hash_add(). 
*/ BUILD_ASSERT_DECL(sizeof(struct ct_endpoint) == sizeof(union ct_addr) + 4); +enum key_dir { + CT_DIR_FWD = 0, + CT_DIR_REV, + CT_DIRS, +}; + /* Changes to this structure need to be reflected in conn_key_hash() * and conn_key_cmp(). */ struct conn_key { @@ -112,20 +118,18 @@ enum ct_timeout { #define N_EXP_LISTS 100 -enum OVS_PACKED_ENUM ct_conn_type { - CT_CONN_TYPE_DEFAULT, - CT_CONN_TYPE_UN_NAT, +struct conn_key_node { + enum key_dir dir; + struct conn_key key; + struct cmap_node cm_node; }; struct conn { /* Immutable data. */ - struct conn_key key; - struct conn_key rev_key; + struct conn_key_node key_node[CT_DIRS]; struct conn_key parent_key; /* Only used for orig_tuple support. */ - struct cmap_node cm_node; uint16_t nat_action; char *alg; - struct conn *nat_conn; /* The NAT 'conn' context, if there is one. */ atomic_flag reclaimed; /* False during the lifetime of the connection, * True as soon as a thread has started freeing * its memory. */ @@ -150,7 +154,6 @@ struct conn { /* Immutable data. */ bool alg_related; /* True if alg data connection. */ - enum ct_conn_type conn_type; uint32_t tp_id; /* Timeout policy ID. 
*/ }; diff --git a/lib/conntrack-tp.c b/lib/conntrack-tp.c index 89cb2704a6c..2149fdc73a7 100644 --- a/lib/conntrack-tp.c +++ b/lib/conntrack-tp.c @@ -253,7 +253,8 @@ conn_update_expiration(struct conntrack *ct, struct conn *conn, } VLOG_DBG_RL(&rl, "Update timeout %s zone=%u with policy id=%d " "val=%u sec.", - ct_timeout_str[tm], conn->key.zone, conn->tp_id, val); + ct_timeout_str[tm], conn->key_node[CT_DIR_FWD].key.zone, + conn->tp_id, val); atomic_store_relaxed(&conn->expiration, now + val * 1000); } @@ -273,7 +274,8 @@ conn_init_expiration(struct conntrack *ct, struct conn *conn, } VLOG_DBG_RL(&rl, "Init timeout %s zone=%u with policy id=%d val=%u sec.", - ct_timeout_str[tm], conn->key.zone, conn->tp_id, val); + ct_timeout_str[tm], conn->key_node[CT_DIR_FWD].key.zone, + conn->tp_id, val); conn->expiration = now + val * 1000; } diff --git a/lib/conntrack.c b/lib/conntrack.c index 5f1176d333f..47a443fba4d 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -103,7 +103,7 @@ static enum ct_update_res conn_update(struct conntrack *ct, struct conn *conn, struct conn_lookup_ctx *ctx, long long now); static long long int conn_expiration(const struct conn *); -static bool conn_expired(struct conn *, long long now); +static bool conn_expired(const struct conn *, long long now); static void conn_expire_push_front(struct conntrack *ct, struct conn *conn); static void set_mark(struct dp_packet *, struct conn *, uint32_t val, uint32_t mask); @@ -113,8 +113,7 @@ static void set_label(struct dp_packet *, struct conn *, static void *clean_thread_main(void *f_); static bool -nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn, - struct conn *nat_conn, +nat_get_unique_tuple(struct conntrack *ct, struct conn *conn, const struct nat_action_info_t *nat_info); static uint8_t @@ -208,7 +207,7 @@ static alg_helper alg_helpers[] = { #define ALG_WC_SRC_PORT 0 /* If the total number of connections goes above this value, no new connections - * are accepted; this is for 
CT_CONN_TYPE_DEFAULT connections. */ + * are accepted. */ #define DEFAULT_N_CONN_LIMIT 3000000 /* Does a member by member comparison of two conn_keys; this @@ -234,61 +233,6 @@ conn_key_cmp(const struct conn_key *key1, const struct conn_key *key2) return 1; } -static void -ct_print_conn_info(const struct conn *c, const char *log_msg, - enum vlog_level vll, bool force, bool rl_on) -{ -#define CT_VLOG(RL_ON, LEVEL, ...) \ - do { \ - if (RL_ON) { \ - static struct vlog_rate_limit rl_ = VLOG_RATE_LIMIT_INIT(5, 5); \ - vlog_rate_limit(&this_module, LEVEL, &rl_, __VA_ARGS__); \ - } else { \ - vlog(&this_module, LEVEL, __VA_ARGS__); \ - } \ - } while (0) - - if (OVS_UNLIKELY(force || vlog_is_enabled(&this_module, vll))) { - if (c->key.dl_type == htons(ETH_TYPE_IP)) { - CT_VLOG(rl_on, vll, "%s: src ip "IP_FMT" dst ip "IP_FMT" rev src " - "ip "IP_FMT" rev dst ip "IP_FMT" src/dst ports " - "%"PRIu16"/%"PRIu16" rev src/dst ports " - "%"PRIu16"/%"PRIu16" zone/rev zone " - "%"PRIu16"/%"PRIu16" nw_proto/rev nw_proto " - "%"PRIu8"/%"PRIu8, log_msg, - IP_ARGS(c->key.src.addr.ipv4), - IP_ARGS(c->key.dst.addr.ipv4), - IP_ARGS(c->rev_key.src.addr.ipv4), - IP_ARGS(c->rev_key.dst.addr.ipv4), - ntohs(c->key.src.port), ntohs(c->key.dst.port), - ntohs(c->rev_key.src.port), ntohs(c->rev_key.dst.port), - c->key.zone, c->rev_key.zone, c->key.nw_proto, - c->rev_key.nw_proto); - } else { - char ip6_s[INET6_ADDRSTRLEN]; - inet_ntop(AF_INET6, &c->key.src.addr.ipv6, ip6_s, sizeof ip6_s); - char ip6_d[INET6_ADDRSTRLEN]; - inet_ntop(AF_INET6, &c->key.dst.addr.ipv6, ip6_d, sizeof ip6_d); - char ip6_rs[INET6_ADDRSTRLEN]; - inet_ntop(AF_INET6, &c->rev_key.src.addr.ipv6, ip6_rs, - sizeof ip6_rs); - char ip6_rd[INET6_ADDRSTRLEN]; - inet_ntop(AF_INET6, &c->rev_key.dst.addr.ipv6, ip6_rd, - sizeof ip6_rd); - - CT_VLOG(rl_on, vll, "%s: src ip %s dst ip %s rev src ip %s" - " rev dst ip %s src/dst ports %"PRIu16"/%"PRIu16 - " rev src/dst ports %"PRIu16"/%"PRIu16" zone/rev zone " - "%"PRIu16"/%"PRIu16" 
nw_proto/rev nw_proto " - "%"PRIu8"/%"PRIu8, log_msg, ip6_s, ip6_d, ip6_rs, - ip6_rd, ntohs(c->key.src.port), ntohs(c->key.dst.port), - ntohs(c->rev_key.src.port), ntohs(c->rev_key.dst.port), - c->key.zone, c->rev_key.zone, c->key.nw_proto, - c->rev_key.nw_proto); - } - } -} - /* Initializes the connection tracker 'ct'. The caller is responsible for * calling 'conntrack_destroy()', when the instance is not needed anymore */ struct conntrack * @@ -477,28 +421,27 @@ conn_clean__(struct conntrack *ct, struct conn *conn) uint32_t hash; if (conn->alg) { - expectation_clean(ct, &conn->key); + expectation_clean(ct, &conn->key_node[CT_DIR_FWD].key); } - hash = conn_key_hash(&conn->key, ct->hash_basis); - cmap_remove(&ct->conns, &conn->cm_node, hash); + hash = conn_key_hash(&conn->key_node[CT_DIR_FWD].key, ct->hash_basis); + cmap_remove(&ct->conns, &conn->key_node[CT_DIR_FWD].cm_node, hash); - if (conn->nat_conn) { - hash = conn_key_hash(&conn->nat_conn->key, ct->hash_basis); - cmap_remove(&ct->conns, &conn->nat_conn->cm_node, hash); + if (conn->nat_action) { + hash = conn_key_hash(&conn->key_node[CT_DIR_REV].key, + ct->hash_basis); + cmap_remove(&ct->conns, &conn->key_node[CT_DIR_REV].cm_node, hash); } rculist_remove(&conn->node); } -/* Must be called with 'conn' of 'conn_type' CT_CONN_TYPE_DEFAULT. Also - * removes the associated nat 'conn' from the lookup datastructures. */ +/* Also removes the associated nat 'conn' from the lookup + datastructures. 
*/ static void conn_clean(struct conntrack *ct, struct conn *conn) OVS_EXCLUDED(conn->lock, ct->ct_lock) { - ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT); - if (atomic_flag_test_and_set(&conn->reclaimed)) { return; } @@ -585,34 +528,39 @@ conn_key_lookup(struct conntrack *ct, const struct conn_key *key, uint32_t hash, long long now, struct conn **conn_out, bool *reply) { - struct conn *conn; + struct conn_key_node *keyn; + struct conn *conn = NULL; bool found = false; - CMAP_FOR_EACH_WITH_HASH (conn, cm_node, hash, &ct->conns) { + CMAP_FOR_EACH_WITH_HASH (keyn, cm_node, hash, &ct->conns) { + if (keyn->dir == CT_DIR_FWD) { + conn = CONTAINER_OF(keyn, struct conn, key_node[CT_DIR_FWD]); + } else { + conn = CONTAINER_OF(keyn, struct conn, key_node[CT_DIR_REV]); + } + if (conn_expired(conn, now)) { continue; } - if (!conn_key_cmp(&conn->key, key)) { - found = true; - if (reply) { - *reply = false; + + for (int i = CT_DIR_FWD; i < CT_DIRS; i++) { + if (!conn_key_cmp(&conn->key_node[i].key, key)) { + found = true; + if (reply) { + *reply = (i == CT_DIR_REV); + } + goto out_found; } - break; - } - if (!conn_key_cmp(&conn->rev_key, key)) { - found = true; - if (reply) { - *reply = true; - } - break; } } +out_found: if (found && conn_out) { *conn_out = conn; } else if (conn_out) { *conn_out = NULL; } + return found; } @@ -646,7 +594,7 @@ write_ct_md(struct dp_packet *pkt, uint16_t zone, const struct conn *conn, if (conn->alg_related) { key = &conn->parent_key; } else { - key = &conn->key; + key = &conn->key_node[CT_DIR_FWD].key; } } else if (alg_exp) { pkt->md.ct_mark = alg_exp->parent_mark; @@ -877,7 +825,8 @@ nat_inner_packet(struct dp_packet *pkt, struct conn_key *key, static void nat_packet(struct dp_packet *pkt, struct conn *conn, bool reply, bool related) { - struct conn_key *key = reply ? &conn->key : &conn->rev_key; + enum key_dir dir = reply ? CT_DIR_FWD : CT_DIR_REV; + struct conn_key *key = &conn->key_node[dir].key; uint16_t nat_action = reply ? 
nat_action_reverse(conn->nat_action) : conn->nat_action; @@ -911,7 +860,7 @@ conn_seq_skew_set(struct conntrack *ct, const struct conn *conn_in, { struct conn *conn; - conn_lookup(ct, &conn_in->key, now, &conn, NULL); + conn_lookup(ct, &conn_in->key_node[CT_DIR_FWD].key, now, &conn, NULL); if (conn && seq_skew) { conn->seq_skew = seq_skew; conn->seq_skew_dir = seq_skew_dir; @@ -947,7 +896,6 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt, OVS_REQUIRES(ct->ct_lock) { struct conn *nc = NULL; - struct conn *nat_conn = NULL; if (!valid_new(pkt, &ctx->key)) { pkt->md.ct_state = CS_INVALID; @@ -961,6 +909,7 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt, } if (commit) { + struct conn_key_node *fwd_key_node, *rev_key_node; struct zone_limit *zl = zone_limit_lookup_or_default(ct, ctx->key.zone); if (zl && atomic_count_get(&zl->czl.count) >= zl->czl.limit) { @@ -975,9 +924,12 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt, } nc = new_conn(ct, pkt, &ctx->key, now, tp_id); - memcpy(&nc->key, &ctx->key, sizeof nc->key); - memcpy(&nc->rev_key, &nc->key, sizeof nc->rev_key); - conn_key_reverse(&nc->rev_key); + fwd_key_node = &nc->key_node[CT_DIR_FWD]; + rev_key_node = &nc->key_node[CT_DIR_REV]; + memcpy(&fwd_key_node->key, &ctx->key, sizeof fwd_key_node->key); + memcpy(&rev_key_node->key, &fwd_key_node->key, + sizeof rev_key_node->key); + conn_key_reverse(&rev_key_node->key); if (ct_verify_helper(helper, ct_alg_ctl)) { nc->alg = nullable_xstrdup(helper); @@ -992,46 +944,33 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt, if (nat_action_info) { nc->nat_action = nat_action_info->nat_action; - nat_conn = xzalloc(sizeof *nat_conn); if (alg_exp) { if (alg_exp->nat_rpl_dst) { - nc->rev_key.dst.addr = alg_exp->alg_nat_repl_addr; + rev_key_node->key.dst.addr = alg_exp->alg_nat_repl_addr; nc->nat_action = NAT_ACTION_SRC; } else { - nc->rev_key.src.addr = alg_exp->alg_nat_repl_addr; + rev_key_node->key.src.addr = 
alg_exp->alg_nat_repl_addr; nc->nat_action = NAT_ACTION_DST; } } else { - memcpy(nat_conn, nc, sizeof *nat_conn); - bool nat_res = nat_get_unique_tuple(ct, nc, nat_conn, - nat_action_info); - + bool nat_res = nat_get_unique_tuple(ct, nc, nat_action_info); if (!nat_res) { goto nat_res_exhaustion; } - - /* Update nc with nat adjustments made to nat_conn by - * nat_get_unique_tuple(). */ - memcpy(nc, nat_conn, sizeof *nc); } nat_packet(pkt, nc, false, ctx->icmp_related); - memcpy(&nat_conn->key, &nc->rev_key, sizeof nat_conn->key); - memcpy(&nat_conn->rev_key, &nc->key, sizeof nat_conn->rev_key); - nat_conn->conn_type = CT_CONN_TYPE_UN_NAT; - nat_conn->nat_action = 0; - nat_conn->alg = NULL; - nat_conn->nat_conn = NULL; - uint32_t nat_hash = conn_key_hash(&nat_conn->key, ct->hash_basis); - cmap_insert(&ct->conns, &nat_conn->cm_node, nat_hash); + uint32_t rev_hash = conn_key_hash(&rev_key_node->key, + ct->hash_basis); + cmap_insert(&ct->conns, &rev_key_node->cm_node, rev_hash); } - nc->nat_conn = nat_conn; ovs_mutex_init_adaptive(&nc->lock); - nc->conn_type = CT_CONN_TYPE_DEFAULT; atomic_flag_clear(&nc->reclaimed); - cmap_insert(&ct->conns, &nc->cm_node, ctx->hash); + fwd_key_node->dir = CT_DIR_FWD; + rev_key_node->dir = CT_DIR_REV; + cmap_insert(&ct->conns, &fwd_key_node->cm_node, ctx->hash); conn_expire_push_front(ct, nc); atomic_count_inc(&ct->n_conn); ctx->conn = nc; /* For completeness. */ @@ -1052,7 +991,6 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt, * firewall rules or a separate firewall. Also using zone partitioning * can limit DoS impact. 
*/ nat_res_exhaustion: - free(nat_conn); delete_conn__(nc); static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); VLOG_WARN_RL(&rl, "Unable to NAT due to tuple space exhaustion - " @@ -1065,7 +1003,6 @@ conn_update_state(struct conntrack *ct, struct dp_packet *pkt, struct conn_lookup_ctx *ctx, struct conn *conn, long long now) { - ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT); bool create_new_conn = false; if (ctx->icmp_related) { @@ -1092,7 +1029,8 @@ conn_update_state(struct conntrack *ct, struct dp_packet *pkt, pkt->md.ct_state = CS_INVALID; break; case CT_UPDATE_NEW: - if (conn_lookup(ct, &conn->key, now, NULL, NULL)) { + if (conn_lookup(ct, &conn->key_node[CT_DIR_FWD].key, + now, NULL, NULL)) { conn_force_expire(conn); } create_new_conn = true; @@ -1268,8 +1206,10 @@ initial_conn_lookup(struct conntrack *ct, struct conn_lookup_ctx *ctx, if (natted) { if (OVS_LIKELY(ctx->conn)) { + enum key_dir dir; ctx->reply = !ctx->reply; - ctx->key = ctx->reply ? ctx->conn->rev_key : ctx->conn->key; + dir = ctx->reply ? CT_DIR_REV : CT_DIR_FWD; + ctx->key = ctx->conn->key_node[dir].key; ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis); } else { /* A lookup failure does not necessarily imply that an @@ -1302,31 +1242,13 @@ process_one(struct conntrack *ct, struct dp_packet *pkt, /* Delete found entry if in wrong direction. 'force' implies commit. */ if (OVS_UNLIKELY(force && ctx->reply && conn)) { - if (conn_lookup(ct, &conn->key, now, NULL, NULL)) { + if (conn_lookup(ct, &conn->key_node[CT_DIR_FWD].key, + now, NULL, NULL)) { conn_force_expire(conn); } conn = NULL; } - if (OVS_LIKELY(conn)) { - if (conn->conn_type == CT_CONN_TYPE_UN_NAT) { - - ctx->reply = true; - struct conn *rev_conn = conn; /* Save for debugging. 
*/ - uint32_t hash = conn_key_hash(&conn->rev_key, ct->hash_basis); - conn_key_lookup(ct, &ctx->key, hash, now, &conn, &ctx->reply); - - if (!conn) { - pkt->md.ct_state |= CS_INVALID; - write_ct_md(pkt, zone, NULL, NULL, NULL); - char *log_msg = xasprintf("Missing parent conn %p", rev_conn); - ct_print_conn_info(rev_conn, log_msg, VLL_INFO, true, true); - free(log_msg); - return; - } - } - } - enum ct_alg_ctl_type ct_alg_ctl = get_alg_ctl_type(pkt, tp_src, tp_dst, helper); @@ -1419,8 +1341,9 @@ conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch, struct conn *conn = packet->md.conn; if (OVS_UNLIKELY(packet->md.ct_state == CS_INVALID)) { write_ct_md(packet, zone, NULL, NULL, NULL); - } else if (conn && conn->key.zone == zone && !force - && !get_alg_ctl_type(packet, tp_src, tp_dst, helper)) { + } else if (conn && + conn->key_node[CT_DIR_FWD].key.zone == zone && !force && + !get_alg_ctl_type(packet, tp_src, tp_dst, helper)) { process_one_fast(zone, setmark, setlabel, nat_action_info, conn, packet); } else if (OVS_UNLIKELY(!conn_key_extract(ct, packet, dl_type, &ctx, @@ -2269,7 +2192,7 @@ nat_ipv6_addr_increment(struct in6_addr *ipv6, uint32_t increment) } static uint32_t -nat_range_hash(const struct conn *conn, uint32_t basis, +nat_range_hash(const struct conn_key *key, uint32_t basis, const struct nat_action_info_t *nat_info) { uint32_t hash = basis; @@ -2279,11 +2202,11 @@ nat_range_hash(const struct conn *conn, uint32_t basis, hash = hash_add(hash, ((uint32_t) nat_info->max_port << 16) | nat_info->min_port); - hash = ct_endpoint_hash_add(hash, &conn->key.src); - hash = ct_endpoint_hash_add(hash, &conn->key.dst); - hash = hash_add(hash, (OVS_FORCE uint32_t) conn->key.dl_type); - hash = hash_add(hash, conn->key.nw_proto); - hash = hash_add(hash, conn->key.zone); + hash = ct_endpoint_hash_add(hash, &key->src); + hash = ct_endpoint_hash_add(hash, &key->dst); + hash = hash_add(hash, (OVS_FORCE uint32_t) key->dl_type); + hash = hash_add(hash, 
key->nw_proto); + hash = hash_add(hash, key->zone); /* The purpose of the second parameter is to distinguish hashes of data of * different length; our data always has the same length so there is no @@ -2357,7 +2280,7 @@ get_addr_in_range(union ct_addr *min, union ct_addr *max, } static void -find_addr(const struct conn *conn, union ct_addr *min, +find_addr(const struct conn_key *key, union ct_addr *min, union ct_addr *max, union ct_addr *curr, uint32_t hash, bool ipv4, const struct nat_action_info_t *nat_info) @@ -2367,9 +2290,9 @@ find_addr(const struct conn *conn, union ct_addr *min, /* All-zero case. */ if (!memcmp(min, &zero_ip, sizeof *min)) { if (nat_info->nat_action & NAT_ACTION_SRC) { - *curr = conn->key.src.addr; + *curr = key->src.addr; } else if (nat_info->nat_action & NAT_ACTION_DST) { - *curr = conn->key.dst.addr; + *curr = key->dst.addr; } } else { get_addr_in_range(min, max, curr, hash, ipv4); @@ -2388,7 +2311,7 @@ store_addr_to_key(union ct_addr *addr, struct conn_key *key, } static bool -nat_get_unique_l4(struct conntrack *ct, struct conn *nat_conn, +nat_get_unique_l4(struct conntrack *ct, struct conn_key *rev_key, ovs_be16 *port, uint16_t curr, uint16_t min, uint16_t max) { @@ -2411,8 +2334,7 @@ nat_get_unique_l4(struct conntrack *ct, struct conn *nat_conn, } *port = htons(curr); - if (!conn_lookup(ct, &nat_conn->rev_key, - time_msec(), NULL, NULL)) { + if (!conn_lookup(ct, rev_key, time_msec(), NULL, NULL)) { return true; } } @@ -2450,54 +2372,50 @@ nat_get_unique_l4(struct conntrack *ct, struct conn *nat_conn, * * If none can be found, return exhaustion to the caller. 
*/ static bool -nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn, - struct conn *nat_conn, +nat_get_unique_tuple(struct conntrack *ct, struct conn *conn, const struct nat_action_info_t *nat_info) { - uint32_t hash = nat_range_hash(conn, ct->hash_basis, nat_info); + struct conn_key *fwd_key = &conn->key_node[CT_DIR_FWD].key; + struct conn_key *rev_key = &conn->key_node[CT_DIR_REV].key; union ct_addr min_addr = {0}, max_addr = {0}, addr = {0}; - bool pat_proto = conn->key.nw_proto == IPPROTO_TCP || - conn->key.nw_proto == IPPROTO_UDP || - conn->key.nw_proto == IPPROTO_SCTP; + bool pat_proto = fwd_key->nw_proto == IPPROTO_TCP || + fwd_key->nw_proto == IPPROTO_UDP || + fwd_key->nw_proto == IPPROTO_SCTP; uint16_t min_dport, max_dport, curr_dport; uint16_t min_sport, max_sport, curr_sport; + uint32_t hash; + hash = nat_range_hash(fwd_key, ct->hash_basis, nat_info); min_addr = nat_info->min_addr; max_addr = nat_info->max_addr; - find_addr(conn, &min_addr, &max_addr, &addr, hash, - (conn->key.dl_type == htons(ETH_TYPE_IP)), nat_info); + find_addr(fwd_key, &min_addr, &max_addr, &addr, hash, + (fwd_key->dl_type == htons(ETH_TYPE_IP)), nat_info); - set_sport_range(nat_info, &conn->key, hash, &curr_sport, + set_sport_range(nat_info, fwd_key, hash, &curr_sport, &min_sport, &max_sport); - set_dport_range(nat_info, &conn->key, hash, &curr_dport, + set_dport_range(nat_info, fwd_key, hash, &curr_dport, &min_dport, &max_dport); if (pat_proto) { - nat_conn->rev_key.src.port = htons(curr_dport); - nat_conn->rev_key.dst.port = htons(curr_sport); + rev_key->src.port = htons(curr_dport); + rev_key->dst.port = htons(curr_sport); } - store_addr_to_key(&addr, &nat_conn->rev_key, - nat_info->nat_action); + store_addr_to_key(&addr, rev_key, nat_info->nat_action); if (!pat_proto) { - if (!conn_lookup(ct, &nat_conn->rev_key, - time_msec(), NULL, NULL)) { - return true; - } - - return false; + return !conn_lookup(ct, rev_key, time_msec(), NULL, NULL); } bool found = false; if 
(nat_info->nat_action & NAT_ACTION_DST_PORT) { - found = nat_get_unique_l4(ct, nat_conn, &nat_conn->rev_key.src.port, + found = nat_get_unique_l4(ct, rev_key, &rev_key->src.port, curr_dport, min_dport, max_dport); } if (!found) { - found = nat_get_unique_l4(ct, nat_conn, &nat_conn->rev_key.dst.port, + found = nat_get_unique_l4(ct, rev_key, &rev_key->dst.port, curr_sport, min_sport, max_sport); } @@ -2513,9 +2431,9 @@ conn_update(struct conntrack *ct, struct conn *conn, struct dp_packet *pkt, struct conn_lookup_ctx *ctx, long long now) { ovs_mutex_lock(&conn->lock); + uint8_t nw_proto = conn->key_node[CT_DIR_FWD].key.nw_proto; enum ct_update_res update_res = - l4_protos[conn->key.nw_proto]->conn_update(ct, conn, pkt, ctx->reply, - now); + l4_protos[nw_proto]->conn_update(ct, conn, pkt, ctx->reply, now); ovs_mutex_unlock(&conn->lock); return update_res; } @@ -2541,12 +2459,9 @@ conn_expiration(const struct conn *conn) } static bool -conn_expired(struct conn *conn, long long now) +conn_expired(const struct conn *conn, long long now) { - if (conn->conn_type == CT_CONN_TYPE_DEFAULT) { - return now >= conn_expiration(conn); - } - return false; + return now >= conn_expiration(conn); } static bool @@ -2572,9 +2487,7 @@ delete_conn__(struct conn *conn) static void delete_conn(struct conn *conn) { - ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT); ovs_mutex_destroy(&conn->lock); - free(conn->nat_conn); delete_conn__(conn); } @@ -2667,15 +2580,18 @@ static void conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry, long long now) { + const struct conn_key *rev_key = &conn->key_node[CT_DIR_REV].key; + const struct conn_key *key = &conn->key_node[CT_DIR_FWD].key; + memset(entry, 0, sizeof *entry); - conn_key_to_tuple(&conn->key, &entry->tuple_orig); - conn_key_to_tuple(&conn->rev_key, &entry->tuple_reply); + conn_key_to_tuple(key, &entry->tuple_orig); + conn_key_to_tuple(rev_key, &entry->tuple_reply); if (conn->alg_related) { 
conn_key_to_tuple(&conn->parent_key, &entry->tuple_parent); } - entry->zone = conn->key.zone; + entry->zone = key->zone; ovs_mutex_lock(&conn->lock); entry->mark = conn->mark; @@ -2683,7 +2599,7 @@ conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry, long long expiration = conn_expiration(conn) - now; - struct ct_l4_proto *class = l4_protos[conn->key.nw_proto]; + struct ct_l4_proto *class = l4_protos[key->nw_proto]; if (class->conn_get_protoinfo) { class->conn_get_protoinfo(conn, &entry->protoinfo); } @@ -2731,15 +2647,20 @@ conntrack_dump_next(struct conntrack_dump *dump, struct ct_dpif_entry *entry) if (!cm_node) { break; } + struct conn_key_node *keyn; struct conn *conn; - INIT_CONTAINER(conn, cm_node, cm_node); + INIT_CONTAINER(keyn, cm_node, cm_node); + if (keyn->dir != CT_DIR_FWD) { + continue; + } + + conn = CONTAINER_OF(keyn, struct conn, key_node[CT_DIR_FWD]); if (conn_expired(conn, now)) { continue; } - if ((!dump->filter_zone || conn->key.zone == dump->zone) && - (conn->conn_type != CT_CONN_TYPE_UN_NAT)) { + if (!dump->filter_zone || keyn->key.zone == dump->zone) { conn_to_ct_dpif_entry(conn, entry, now); return 0; } @@ -2823,14 +2744,15 @@ conntrack_exp_dump_done(struct conntrack_dump *dump OVS_UNUSED) int conntrack_flush(struct conntrack *ct, const uint16_t *zone) { + struct conn_key_node *keyn; struct conn *conn; - CMAP_FOR_EACH (conn, cm_node, &ct->conns) { - if (conn->conn_type != CT_CONN_TYPE_DEFAULT) { + CMAP_FOR_EACH (keyn, cm_node, &ct->conns) { + if (keyn->dir != CT_DIR_FWD) { continue; } - - if (!zone || *zone == conn->key.zone) { + conn = CONTAINER_OF(keyn, struct conn, key_node[CT_DIR_FWD]); + if (!zone || *zone == keyn->key.zone) { conn_clean(ct, conn); } } @@ -2842,18 +2764,18 @@ int conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple, uint16_t zone) { - int error = 0; struct conn_key key; struct conn *conn; + int error = 0; memset(&key, 0, sizeof(key)); tuple_to_conn_key(tuple, zone, &key); 
conn_lookup(ct, &key, time_msec(), &conn, NULL); - if (conn && conn->conn_type == CT_CONN_TYPE_DEFAULT) { + if (conn) { conn_clean(ct, conn); } else { - VLOG_WARN("Must flush tuple using the original pre-NATed tuple"); + VLOG_WARN("Tuple not found"); error = ENOENT; } @@ -2996,50 +2918,54 @@ expectation_create(struct conntrack *ct, ovs_be16 dst_port, const struct conn *parent_conn, bool reply, bool src_ip_wc, bool skip_nat) { + const struct conn_key *pconn_key, *pconn_rev_key; union ct_addr src_addr; union ct_addr dst_addr; union ct_addr alg_nat_repl_addr; struct alg_exp_node *alg_exp_node = xzalloc(sizeof *alg_exp_node); + pconn_key = &parent_conn->key_node[CT_DIR_FWD].key; + pconn_rev_key = &parent_conn->key_node[CT_DIR_REV].key; + if (reply) { - src_addr = parent_conn->key.src.addr; - dst_addr = parent_conn->key.dst.addr; + src_addr = pconn_key->src.addr; + dst_addr = pconn_key->dst.addr; alg_exp_node->nat_rpl_dst = true; if (skip_nat) { alg_nat_repl_addr = dst_addr; } else if (parent_conn->nat_action & NAT_ACTION_DST) { - alg_nat_repl_addr = parent_conn->rev_key.src.addr; + alg_nat_repl_addr = pconn_rev_key->src.addr; alg_exp_node->nat_rpl_dst = false; } else { - alg_nat_repl_addr = parent_conn->rev_key.dst.addr; + alg_nat_repl_addr = pconn_rev_key->dst.addr; } } else { - src_addr = parent_conn->rev_key.src.addr; - dst_addr = parent_conn->rev_key.dst.addr; + src_addr = pconn_rev_key->src.addr; + dst_addr = pconn_rev_key->dst.addr; alg_exp_node->nat_rpl_dst = false; if (skip_nat) { alg_nat_repl_addr = src_addr; } else if (parent_conn->nat_action & NAT_ACTION_DST) { - alg_nat_repl_addr = parent_conn->key.dst.addr; + alg_nat_repl_addr = pconn_key->dst.addr; alg_exp_node->nat_rpl_dst = true; } else { - alg_nat_repl_addr = parent_conn->key.src.addr; + alg_nat_repl_addr = pconn_key->src.addr; } } if (src_ip_wc) { memset(&src_addr, 0, sizeof src_addr); } - alg_exp_node->key.dl_type = parent_conn->key.dl_type; - alg_exp_node->key.nw_proto = parent_conn->key.nw_proto; - 
alg_exp_node->key.zone = parent_conn->key.zone; + alg_exp_node->key.dl_type = pconn_key->dl_type; + alg_exp_node->key.nw_proto = pconn_key->nw_proto; + alg_exp_node->key.zone = pconn_key->zone; alg_exp_node->key.src.addr = src_addr; alg_exp_node->key.dst.addr = dst_addr; alg_exp_node->key.src.port = ALG_WC_SRC_PORT; alg_exp_node->key.dst.port = dst_port; alg_exp_node->parent_mark = parent_conn->mark; alg_exp_node->parent_label = parent_conn->label; - memcpy(&alg_exp_node->parent_key, &parent_conn->key, + memcpy(&alg_exp_node->parent_key, pconn_key, sizeof alg_exp_node->parent_key); /* Take the write lock here because it is almost 100% * likely that the lookup will fail and @@ -3291,12 +3217,16 @@ process_ftp_ctl_v4(struct conntrack *ct, switch (mode) { case CT_FTP_MODE_ACTIVE: - *v4_addr_rep = conn_for_expectation->rev_key.dst.addr.ipv4; - conn_ipv4_addr = conn_for_expectation->key.src.addr.ipv4; + *v4_addr_rep = + conn_for_expectation->key_node[CT_DIR_REV].key.dst.addr.ipv4; + conn_ipv4_addr = + conn_for_expectation->key_node[CT_DIR_FWD].key.src.addr.ipv4; break; case CT_FTP_MODE_PASSIVE: - *v4_addr_rep = conn_for_expectation->key.dst.addr.ipv4; - conn_ipv4_addr = conn_for_expectation->rev_key.src.addr.ipv4; + *v4_addr_rep = + conn_for_expectation->key_node[CT_DIR_FWD].key.dst.addr.ipv4; + conn_ipv4_addr = + conn_for_expectation->key_node[CT_DIR_REV].key.src.addr.ipv4; break; case CT_TFTP_MODE: default: @@ -3328,7 +3258,7 @@ skip_ipv6_digits(char *str) static enum ftp_ctl_pkt process_ftp_ctl_v6(struct conntrack *ct, struct dp_packet *pkt, - const struct conn *conn_for_expectation, + const struct conn *conn_for_exp, union ct_addr *v6_addr_rep, char **ftp_data_start, size_t *addr_offset_from_ftp_data_start, size_t *addr_size, enum ct_alg_mode *mode) @@ -3396,24 +3326,25 @@ process_ftp_ctl_v6(struct conntrack *ct, switch (*mode) { case CT_FTP_MODE_ACTIVE: - *v6_addr_rep = conn_for_expectation->rev_key.dst.addr; + *v6_addr_rep = 
conn_for_exp->key_node[CT_DIR_REV].key.dst.addr; /* Although most servers will block this exploit, there may be some * less well managed. */ if (memcmp(&ip6_addr, &v6_addr_rep->ipv6, sizeof ip6_addr) && - memcmp(&ip6_addr, &conn_for_expectation->key.src.addr.ipv6, + memcmp(&ip6_addr, + &conn_for_exp->key_node[CT_DIR_FWD].key.src.addr.ipv6, sizeof ip6_addr)) { return CT_FTP_CTL_INVALID; } break; case CT_FTP_MODE_PASSIVE: - *v6_addr_rep = conn_for_expectation->key.dst.addr; + *v6_addr_rep = conn_for_exp->key_node[CT_DIR_FWD].key.dst.addr; break; case CT_TFTP_MODE: default: OVS_NOT_REACHED(); } - expectation_create(ct, port, conn_for_expectation, + expectation_create(ct, port, conn_for_exp, !!(pkt->md.ct_state & CS_REPLY_DIR), false, false); return CT_FTP_CTL_INTEREST; } @@ -3571,7 +3502,8 @@ handle_tftp_ctl(struct conntrack *ct, long long now OVS_UNUSED, enum ftp_ctl_pkt ftp_ctl OVS_UNUSED, bool nat OVS_UNUSED) { - expectation_create(ct, conn_for_expectation->key.src.port, + expectation_create(ct, + conn_for_expectation->key_node[CT_DIR_FWD].key.src.port, conn_for_expectation, !!(pkt->md.ct_state & CS_REPLY_DIR), false, false); } From 40546cd6e51a53048d320fc0ed6a99f1713f5335 Mon Sep 17 00:00:00 2001 From: James Raphael Tiovalen Date: Fri, 4 Aug 2023 00:19:10 +0800 Subject: [PATCH 377/833] lib, ovs-vsctl: Add zero-initializations. This commit adds zero-initializations by changing `SFL_ALLOC` from `malloc` to `xzalloc`, adding a `memset` call to `sflAlloc`, initializing a `pollfd` struct variable with zeroes, and changing some calls to `xmalloc` to `xzalloc`. This is to prevent potential data leaks or undefined behavior from potentially uninitialized variables. Some variables would always be initialized by either the code flow or the compiler. Thus, some of the associated Coverity reports might be false positives. 
That said, it is still considered best practice to zero-initialize variables upfront just in case to ensure the overall resilience and security of OVS, as long as they do not impact performance-critical code. As a bonus, it would also make static analyzer tools, such as Coverity, happy. Reviewed-by: Simon Horman Signed-off-by: James Raphael Tiovalen Signed-off-by: Ilya Maximets --- lib/latch-unix.c | 2 +- lib/sflow_agent.c | 12 ++++++++++-- lib/sflow_api.h | 2 +- utilities/ovs-vsctl.c | 8 +++++--- 4 files changed, 17 insertions(+), 7 deletions(-) diff --git a/lib/latch-unix.c b/lib/latch-unix.c index f4d10c39a03..c62bb024b44 100644 --- a/lib/latch-unix.c +++ b/lib/latch-unix.c @@ -71,7 +71,7 @@ latch_set(struct latch *latch) bool latch_is_set(const struct latch *latch) { - struct pollfd pfd; + struct pollfd pfd = {0}; int retval; pfd.fd = latch->fds[0]; diff --git a/lib/sflow_agent.c b/lib/sflow_agent.c index c95f654a59c..743774a27b3 100644 --- a/lib/sflow_agent.c +++ b/lib/sflow_agent.c @@ -510,8 +510,16 @@ void sfl_agent_sysError(SFLAgent *agent, char *modName, char *msg) static void * sflAlloc(SFLAgent *agent, size_t bytes) { - if(agent->allocFn) return (*agent->allocFn)(agent->magic, agent, bytes); - else return SFL_ALLOC(bytes); + void *alloc; + + if (agent->allocFn) { + alloc = (*agent->allocFn)(agent->magic, agent, bytes); + ovs_assert(alloc); + memset(alloc, 0, bytes); + } else { + alloc = SFL_ALLOC(bytes); + } + return alloc; } static void sflFree(SFLAgent *agent, void *obj) diff --git a/lib/sflow_api.h b/lib/sflow_api.h index a0530b37ab4..eb23e2acdb9 100644 --- a/lib/sflow_api.h +++ b/lib/sflow_api.h @@ -337,7 +337,7 @@ void sfl_agent_sysError(SFLAgent *agent, char *modName, char *msg); u_int32_t sfl_receiver_samplePacketsSent(SFLReceiver *receiver); -#define SFL_ALLOC malloc +#define SFL_ALLOC xzalloc #define SFL_FREE free #endif /* SFLOW_API_H */ diff --git a/utilities/ovs-vsctl.c b/utilities/ovs-vsctl.c index 62b51230296..56e4da313de 100644 --- 
a/utilities/ovs-vsctl.c +++ b/utilities/ovs-vsctl.c @@ -575,15 +575,18 @@ add_bridge_to_cache(struct vsctl_context *vsctl_ctx, struct ovsrec_bridge *br_cfg, const char *name, struct vsctl_bridge *parent, int vlan) { - struct vsctl_bridge *br = xmalloc(sizeof *br); + struct vsctl_bridge *br = xzalloc(sizeof *br); + br->br_cfg = br_cfg; br->name = xstrdup(name); ovs_list_init(&br->ports); br->parent = parent; br->vlan = vlan; hmap_init(&br->children); + if (parent) { struct vsctl_bridge *conflict = find_vlan_bridge(parent, vlan); + if (conflict) { VLOG_WARN("%s: bridge has multiple VLAN bridges (%s and %s) " "for VLAN %d, but only one is allowed", @@ -659,7 +662,7 @@ static struct vsctl_port * add_port_to_cache(struct vsctl_context *vsctl_ctx, struct vsctl_bridge *parent, struct ovsrec_port *port_cfg) { - struct vsctl_port *port; + struct vsctl_port *port = xzalloc(sizeof *port); if (port_cfg->tag && *port_cfg->tag >= 0 && *port_cfg->tag <= 4095) { @@ -671,7 +674,6 @@ add_port_to_cache(struct vsctl_context *vsctl_ctx, struct vsctl_bridge *parent, } } - port = xmalloc(sizeof *port); ovs_list_push_back(&parent->ports, &port->ports_node); ovs_list_init(&port->ifaces); port->port_cfg = port_cfg; From bc79a7bf033fa4cda8ccfc5481db3cfccd72650c Mon Sep 17 00:00:00 2001 From: James Raphael Tiovalen Date: Fri, 4 Aug 2023 00:19:12 +0800 Subject: [PATCH 378/833] treewide: Add `ovs_assert` to check for null pointers. This patch adds an assortment of `ovs_assert` statements to check for null pointers. We use assertions since it should be impossible for any of these pointers to be NULL. 
Reviewed-by: Simon Horman Acked-by: Eelco Chaudron Signed-off-by: James Raphael Tiovalen Signed-off-by: Ilya Maximets --- lib/dp-packet.c | 1 + lib/odp-execute.c | 4 ++++ lib/rtnetlink.c | 4 ++-- lib/shash.c | 2 +- ovsdb/jsonrpc-server.c | 4 ++++ ovsdb/monitor.c | 3 +++ ovsdb/ovsdb-server.c | 1 + ovsdb/transaction.c | 2 ++ utilities/ovs-vsctl.c | 1 + vtep/vtep-ctl.c | 1 + 10 files changed, 20 insertions(+), 3 deletions(-) diff --git a/lib/dp-packet.c b/lib/dp-packet.c index 27114a9a998..072bc4073ee 100644 --- a/lib/dp-packet.c +++ b/lib/dp-packet.c @@ -175,6 +175,7 @@ dp_packet_new_with_headroom(size_t size, size_t headroom) struct dp_packet * dp_packet_clone(const struct dp_packet *buffer) { + ovs_assert(buffer); return dp_packet_clone_with_headroom(buffer, 0); } diff --git a/lib/odp-execute.c b/lib/odp-execute.c index 37f0f717af6..eb03b57c42e 100644 --- a/lib/odp-execute.c +++ b/lib/odp-execute.c @@ -147,6 +147,8 @@ odp_set_ipv4(struct dp_packet *packet, const struct ovs_key_ipv4 *key, uint8_t new_tos; uint8_t new_ttl; + ovs_assert(nh); + if (mask->ipv4_src) { ip_src_nh = get_16aligned_be32(&nh->ip_src); new_ip_src = key->ipv4_src | (ip_src_nh & ~mask->ipv4_src); @@ -287,6 +289,8 @@ set_arp(struct dp_packet *packet, const struct ovs_key_arp *key, { struct arp_eth_header *arp = dp_packet_l3(packet); + ovs_assert(arp); + if (!mask) { arp->ar_op = key->arp_op; arp->ar_sha = key->arp_sha; diff --git a/lib/rtnetlink.c b/lib/rtnetlink.c index f67352603f7..37078d00e10 100644 --- a/lib/rtnetlink.c +++ b/lib/rtnetlink.c @@ -112,7 +112,7 @@ rtnetlink_parse(struct ofpbuf *buf, struct rtnetlink_change *change) if (parsed) { const struct ifinfomsg *ifinfo; - ifinfo = ofpbuf_at(buf, NLMSG_HDRLEN, sizeof *ifinfo); + ifinfo = ofpbuf_at_assert(buf, NLMSG_HDRLEN, sizeof *ifinfo); /* Wireless events can be spammy and cause a * lot of unnecessary churn and CPU load in @@ -175,7 +175,7 @@ rtnetlink_parse(struct ofpbuf *buf, struct rtnetlink_change *change) if (parsed) { const struct 
ifaddrmsg *ifaddr; - ifaddr = ofpbuf_at(buf, NLMSG_HDRLEN, sizeof *ifaddr); + ifaddr = ofpbuf_at_assert(buf, NLMSG_HDRLEN, sizeof *ifaddr); change->nlmsg_type = nlmsg->nlmsg_type; change->if_index = ifaddr->ifa_index; diff --git a/lib/shash.c b/lib/shash.c index 2bfc8eb507f..6af985d0baa 100644 --- a/lib/shash.c +++ b/lib/shash.c @@ -265,7 +265,7 @@ void * shash_find_and_delete_assert(struct shash *sh, const char *name) { void *data = shash_find_and_delete(sh, name); - ovs_assert(data != NULL); + ovs_assert(data); return data; } diff --git a/ovsdb/jsonrpc-server.c b/ovsdb/jsonrpc-server.c index 17868f5b720..9a77760c382 100644 --- a/ovsdb/jsonrpc-server.c +++ b/ovsdb/jsonrpc-server.c @@ -1131,6 +1131,8 @@ static void ovsdb_jsonrpc_trigger_create(struct ovsdb_jsonrpc_session *s, struct ovsdb *db, struct jsonrpc_msg *request) { + ovs_assert(db); + /* Check for duplicate ID. */ size_t hash = json_hash(request->id, 0); struct ovsdb_jsonrpc_trigger *t @@ -1391,6 +1393,8 @@ ovsdb_jsonrpc_monitor_create(struct ovsdb_jsonrpc_session *s, struct ovsdb *db, enum ovsdb_monitor_version version, const struct json *request_id) { + ovs_assert(db); + struct ovsdb_jsonrpc_monitor *m = NULL; struct ovsdb_monitor *dbmon = NULL; struct json *monitor_id, *monitor_requests; diff --git a/ovsdb/monitor.c b/ovsdb/monitor.c index 01091fabe78..9829cd39ca0 100644 --- a/ovsdb/monitor.c +++ b/ovsdb/monitor.c @@ -1322,6 +1322,7 @@ ovsdb_monitor_table_add_select(struct ovsdb_monitor *dbmon, struct ovsdb_monitor_table * mt; mt = shash_find_data(&dbmon->tables, table->schema->name); + ovs_assert(mt); mt->select |= select; } @@ -1706,6 +1707,8 @@ ovsdb_monitor_hash(const struct ovsdb_monitor *dbmon, size_t basis) for (i = 0; i < n; i++) { struct ovsdb_monitor_table *mt = nodes[i]->data; + ovs_assert(mt); + basis = hash_pointer(mt->table, basis); basis = hash_3words(mt->select, mt->n_columns, basis); diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c index cf09c907961..4d29043f4f6 100644 --- 
a/ovsdb/ovsdb-server.c +++ b/ovsdb/ovsdb-server.c @@ -2262,6 +2262,7 @@ sset_from_json(struct sset *sset, const struct json *array) sset_clear(sset); + ovs_assert(array); ovs_assert(array->type == JSON_ARRAY); for (i = 0; i < array->array.n; i++) { const struct json *elem = array->array.elems[i]; diff --git a/ovsdb/transaction.c b/ovsdb/transaction.c index 7cf4a851aac..4fdc5bcea7b 100644 --- a/ovsdb/transaction.c +++ b/ovsdb/transaction.c @@ -34,6 +34,7 @@ #include "storage.h" #include "table.h" #include "uuid.h" +#include "util.h" VLOG_DEFINE_THIS_MODULE(transaction); @@ -576,6 +577,7 @@ ovsdb_txn_update_weak_refs(struct ovsdb_txn *txn OVS_UNUSED, dst_row = CONST_CAST(struct ovsdb_row *, ovsdb_table_get_row(weak->dst_table, &weak->dst)); + ovs_assert(dst_row); ovs_assert(!ovsdb_row_find_weak_ref(dst_row, weak)); hmap_insert(&dst_row->dst_refs, &weak->dst_node, ovsdb_weak_ref_hash(weak)); diff --git a/utilities/ovs-vsctl.c b/utilities/ovs-vsctl.c index 56e4da313de..5e549df0055 100644 --- a/utilities/ovs-vsctl.c +++ b/utilities/ovs-vsctl.c @@ -820,6 +820,7 @@ vsctl_context_populate_cache(struct ctl_context *ctx) continue; } br = shash_find_data(&vsctl_ctx->bridges, br_cfg->name); + ovs_assert(br); for (j = 0; j < br_cfg->n_ports; j++) { struct ovsrec_port *port_cfg = br_cfg->ports[j]; struct vsctl_port *port; diff --git a/vtep/vtep-ctl.c b/vtep/vtep-ctl.c index e5d99714dee..61ec4801ed9 100644 --- a/vtep/vtep-ctl.c +++ b/vtep/vtep-ctl.c @@ -1065,6 +1065,7 @@ vtep_ctl_context_populate_cache(struct ctl_context *ctx) continue; } ps = shash_find_data(&vtepctl_ctx->pswitches, ps_cfg->name); + ovs_assert(ps); for (j = 0; j < ps_cfg->n_ports; j++) { struct vteprec_physical_port *port_cfg = ps_cfg->ports[j]; struct vtep_ctl_port *port; From 9a8b39b70950c533e482b3514973b66c9afba8d4 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Fri, 1 Sep 2023 12:08:33 -0400 Subject: [PATCH 379/833] ofproto-dpif-xlate: Don't reinstall removed XC_LEARN rule. 
When a revalidator thread is updating statistics for an XC_LEARN xcache entry in xlate_push_stats_entry it uses ofproto_flow_mod_learn. The revalidator will update stats for rules even if they are in a removed state or marked as invisible. However, ofproto_flow_mod_learn will detect if a flow has been removed and re-add it in that case. This can result in an old learn action replacing the new learn action that had replaced it in the first place. This change adds a new last_used parameter to ofproto_flow_mod_learn allowing the caller to provide a timestamp that will be fed into the learned rule's modified time. The provided timestamp should be the time of the last packet activity. If last_used is not set then the current time is used, as is the current behaviour. This change also adds a check when replacing a learned rule to favour the newest rule. Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2213892 Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif-xlate-cache.c | 2 +- ofproto/ofproto-dpif-xlate.c | 10 ++++- ofproto/ofproto-dpif.c | 2 +- ofproto/ofproto-provider.h | 6 ++- ofproto/ofproto.c | 60 +++++++++++++++++++++++++----- tests/learn.at | 60 ++++++++++++++++++++++++++++++ 6 files changed, 126 insertions(+), 14 deletions(-) diff --git a/ofproto/ofproto-dpif-xlate-cache.c b/ofproto/ofproto-dpif-xlate-cache.c index 9224ee2e6d5..2e1fcb3a6f7 100644 --- a/ofproto/ofproto-dpif-xlate-cache.c +++ b/ofproto/ofproto-dpif-xlate-cache.c @@ -125,7 +125,7 @@ xlate_push_stats_entry(struct xc_entry *entry, case XC_LEARN: { enum ofperr error; error = ofproto_flow_mod_learn(entry->learn.ofm, true, - entry->learn.limit, NULL); + entry->learn.limit, NULL, stats->used); if (error) { static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); VLOG_WARN_RL(&rl, "xcache LEARN action execution failed."); diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 47ea0f47e7e..d608a5f257c 100644 --- 
a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -5700,8 +5700,16 @@ xlate_learn_action(struct xlate_ctx *ctx, const struct ofpact_learn *learn) if (!error) { bool success = true; if (ctx->xin->allow_side_effects) { + long long int last_used; + + if (ctx->xin->resubmit_stats) { + last_used = ctx->xin->resubmit_stats->used; + } else { + last_used = time_msec(); + } error = ofproto_flow_mod_learn(ofm, ctx->xin->xcache != NULL, - learn->limit, &success); + learn->limit, &success, + last_used); } else if (learn->limit) { if (!ofm->temp_rule || ofm->temp_rule->state != RULE_INSERTED) { diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index e22ca757ac3..ba5706f6adc 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -4880,7 +4880,7 @@ packet_xlate(struct ofproto *ofproto_, struct ofproto_packet_out *opo) if (entry->type == XC_LEARN) { struct ofproto_flow_mod *ofm = entry->learn.ofm; - error = ofproto_flow_mod_learn_refresh(ofm); + error = ofproto_flow_mod_learn_refresh(ofm, time_msec()); if (error) { goto error_out; } diff --git a/ofproto/ofproto-provider.h b/ofproto/ofproto-provider.h index 143ded6904e..9f7b8b6e831 100644 --- a/ofproto/ofproto-provider.h +++ b/ofproto/ofproto-provider.h @@ -2027,9 +2027,11 @@ enum ofperr ofproto_flow_mod_init_for_learn(struct ofproto *, struct ofproto_flow_mod *) OVS_EXCLUDED(ofproto_mutex); enum ofperr ofproto_flow_mod_learn(struct ofproto_flow_mod *, bool keep_ref, - unsigned limit, bool *below_limit) + unsigned limit, bool *below_limit, + long long int last_used) OVS_EXCLUDED(ofproto_mutex); -enum ofperr ofproto_flow_mod_learn_refresh(struct ofproto_flow_mod *ofm); +enum ofperr ofproto_flow_mod_learn_refresh(struct ofproto_flow_mod *ofm, + long long int last_used); enum ofperr ofproto_flow_mod_learn_start(struct ofproto_flow_mod *ofm) OVS_REQUIRES(ofproto_mutex); void ofproto_flow_mod_learn_revert(struct ofproto_flow_mod *ofm) diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c index 
dbf4958bc24..e78c80d1155 100644 --- a/ofproto/ofproto.c +++ b/ofproto/ofproto.c @@ -5472,7 +5472,8 @@ ofproto_flow_mod_init_for_learn(struct ofproto *ofproto, } enum ofperr -ofproto_flow_mod_learn_refresh(struct ofproto_flow_mod *ofm) +ofproto_flow_mod_learn_refresh(struct ofproto_flow_mod *ofm, + long long int last_used) { enum ofperr error = 0; @@ -5493,9 +5494,37 @@ ofproto_flow_mod_learn_refresh(struct ofproto_flow_mod *ofm) * this function is executed the rule will be reinstated. */ if (rule->state == RULE_REMOVED) { struct cls_rule cr; + struct oftable *table = &rule->ofproto->tables[rule->table_id]; + ovs_version_t tables_version = rule->ofproto->tables_version; + + if (!cls_rule_visible_in_version(&rule->cr, tables_version)) { + const struct cls_rule *curr_cls_rule; + + /* Only check for matching classifier rules and their modified + * time, instead of also checking all rule metadata, with the goal + * of suppressing a learn action update that would replace a more + * recent rule in the classifier. */ + curr_cls_rule = classifier_find_rule_exactly(&table->cls, + &rule->cr, + tables_version); + if (curr_cls_rule) { + struct rule *curr_rule = rule_from_cls_rule(curr_cls_rule); + long long int curr_last_used; + + ovs_mutex_lock(&curr_rule->mutex); + curr_last_used = curr_rule->modified; + ovs_mutex_unlock(&curr_rule->mutex); + + if (curr_last_used > last_used) { + /* In the case of a newer visible rule, don't recreate the + * current rule. */ + return 0; + } + } + } - cls_rule_clone(&cr, &rule->cr); ovs_mutex_lock(&rule->mutex); + cls_rule_clone(&cr, &rule->cr); error = ofproto_rule_create(rule->ofproto, &cr, rule->table_id, rule->flow_cookie, rule->idle_timeout, @@ -5506,6 +5535,7 @@ ofproto_flow_mod_learn_refresh(struct ofproto_flow_mod *ofm) rule->match_tlv_bitmap, rule->ofpacts_tlv_bitmap, &ofm->temp_rule); + ofm->temp_rule->modified = last_used; ovs_mutex_unlock(&rule->mutex); if (!error) { ofproto_rule_unref(rule); /* Release old reference. 
*/ @@ -5513,7 +5543,7 @@ ofproto_flow_mod_learn_refresh(struct ofproto_flow_mod *ofm) } else { /* Refresh the existing rule. */ ovs_mutex_lock(&rule->mutex); - rule->modified = time_msec(); + rule->modified = last_used; ovs_mutex_unlock(&rule->mutex); } return error; @@ -5565,10 +5595,16 @@ ofproto_flow_mod_learn_finish(struct ofproto_flow_mod *ofm, /* Refresh 'ofm->temp_rule', for which the caller holds a reference, if already * in the classifier, insert it otherwise. If the rule has already been - * removed from the classifier, a new rule is created using 'ofm->temp_rule' as - * a template and the reference to the old 'ofm->temp_rule' is freed. If - * 'keep_ref' is true, then a reference to the current rule is held, otherwise - * it is released and 'ofm->temp_rule' is set to NULL. + * removed from the classifier and replaced by another rule, the 'last_used' + * parameter is used to determine whether the newer rule is replaced or kept. + * If 'last_used' is greater than the last modified time of an identical rule + * in the classifier, then a new rule is created using 'ofm->temp_rule' as a + * template and the reference to the old 'ofm->temp_rule' is freed. If the + * rule has been removed but another identical rule doesn't exist in the + * classifier, then it will be recreated. If the rule hasn't been removed + * from the classifier, then 'last_used' is used to update the rules modified + * time. If 'keep_ref' is true, then a reference to the current rule is held, + * otherwise it is released and 'ofm->temp_rule' is set to NULL. * * If 'limit' != 0, insertion will fail if there are more than 'limit' rules * in the same table with the same cookie. If insertion succeeds, @@ -5579,10 +5615,11 @@ ofproto_flow_mod_learn_finish(struct ofproto_flow_mod *ofm, * during the call. 
*/ enum ofperr ofproto_flow_mod_learn(struct ofproto_flow_mod *ofm, bool keep_ref, - unsigned limit, bool *below_limitp) + unsigned limit, bool *below_limitp, + long long int last_used) OVS_EXCLUDED(ofproto_mutex) { - enum ofperr error = ofproto_flow_mod_learn_refresh(ofm); + enum ofperr error = ofproto_flow_mod_learn_refresh(ofm, last_used); struct rule *rule = ofm->temp_rule; bool below_limit = true; @@ -5615,6 +5652,11 @@ ofproto_flow_mod_learn(struct ofproto_flow_mod *ofm, bool keep_ref, error = ofproto_flow_mod_learn_start(ofm); if (!error) { + /* ofproto_flow_mod_learn_start may have overwritten + * modified with current time. */ + ovs_mutex_lock(&ofm->temp_rule->mutex); + ofm->temp_rule->modified = last_used; + ovs_mutex_unlock(&ofm->temp_rule->mutex); error = ofproto_flow_mod_learn_finish(ofm, NULL); } } else { diff --git a/tests/learn.at b/tests/learn.at index d127fed3481..d0bcc83633c 100644 --- a/tests/learn.at +++ b/tests/learn.at @@ -836,3 +836,63 @@ AT_CHECK([ovs-vsctl add-br br1 -- set b br1 datapath_type=dummy]) OVS_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([learning action - flapping learn rule]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 3 + +AT_CHECK([ovs-appctl time/stop], [0], [ignore]) +AT_CHECK([[ovs-ofctl add-flow br0 'table=0,priority=2,in_port=1,actions=resubmit(,2)']]) +AT_CHECK([[ovs-ofctl add-flow br0 'table=0,priority=2,in_port=2,actions=resubmit(,2)']]) +AT_CHECK([[ovs-ofctl add-flow br0 'table=2,actions=learn(table=0,hard_timeout=3,priority=1,cookie=0x123,NXM_OF_ETH_DST[]=NXM_OF_ETH_SRC[],output:OXM_OF_IN_PORT[]),output:3']]) + +packet="eth(src=50:54:00:00:00:06,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=6,tos=0,ttl=64,frag=no),tcp(src=8,dst=9)" + +dnl Run this test a few times in a loop to reduce the likelyhood that it passes by chance. 
+for i in 1 2 3; do + AT_CHECK([ovs-appctl revalidator/pause], [0]) + AT_CHECK([ovs-appctl netdev-dummy/receive p2 $packet], [0]) + AT_CHECK([ovs-appctl time/warp 75], [0], [ignore]) + AT_CHECK([ovs-appctl netdev-dummy/receive p1 $packet], [0]) + AT_CHECK([ovs-appctl time/warp 75], [0], [ignore]) + AT_CHECK([ovs-appctl netdev-dummy/receive p2 $packet], [0]) + AT_CHECK([ovs-appctl time/warp 75], [0], [ignore]) + AT_CHECK([ovs-appctl netdev-dummy/receive p1 $packet], [0]) + AT_CHECK([ovs-appctl time/warp 75], [0], [ignore]) + + AT_CHECK([ovs-appctl revalidator/resume], [0]) + AT_CHECK([ovs-appctl revalidator/wait], [0]) + + AT_CHECK([ovs-ofctl --no-stats dump-flows br0 | ofctl_strip | sort | grep 0x123], [0], [dnl + cookie=0x123, hard_timeout=3, priority=1,dl_dst=50:54:00:00:00:06 actions=output:1 + table=2, actions=learn(table=0,hard_timeout=3,priority=1,cookie=0x123,NXM_OF_ETH_DST[[]]=NXM_OF_ETH_SRC[[]],output:OXM_OF_IN_PORT[[]]),output:3 +]) + + AT_CHECK([ovs-appctl revalidator/pause], [0]) + AT_CHECK([ovs-appctl netdev-dummy/receive p1 $packet], [0]) + AT_CHECK([ovs-appctl time/warp 75], [0], [ignore]) + AT_CHECK([ovs-appctl netdev-dummy/receive p2 $packet], [0]) + AT_CHECK([ovs-appctl time/warp 75], [0], [ignore]) + AT_CHECK([ovs-appctl netdev-dummy/receive p1 $packet], [0]) + AT_CHECK([ovs-appctl time/warp 75], [0], [ignore]) + AT_CHECK([ovs-appctl netdev-dummy/receive p2 $packet], [0]) + AT_CHECK([ovs-appctl time/warp 75], [0], [ignore]) + + AT_CHECK([ovs-appctl revalidator/resume], [0]) + AT_CHECK([ovs-appctl revalidator/wait], [0]) + + AT_CHECK([ovs-ofctl --no-stats dump-flows br0 | ofctl_strip | sort | grep 0x123], [0], [dnl + cookie=0x123, hard_timeout=3, priority=1,dl_dst=50:54:00:00:00:06 actions=output:2 + table=2, actions=learn(table=0,hard_timeout=3,priority=1,cookie=0x123,NXM_OF_ETH_DST[[]]=NXM_OF_ETH_SRC[[]],output:OXM_OF_IN_PORT[[]]),output:3 +]) +done + +dnl Wait and check for learned rule eviction due to hard timeout. 
+AT_CHECK([ovs-appctl time/warp 3200], [0], [ignore]) + +AT_CHECK([ovs-ofctl --no-stats dump-flows br0 | ofctl_strip | grep 0x123], [0], [dnl + table=2, actions=learn(table=0,hard_timeout=3,priority=1,cookie=0x123,NXM_OF_ETH_DST[[]]=NXM_OF_ETH_SRC[[]],output:OXM_OF_IN_PORT[[]]),output:3 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP From 154e4299de6a6fb0ac20863fc7a062e7348044c1 Mon Sep 17 00:00:00 2001 From: Paolo Valerio Date: Tue, 5 Sep 2023 21:13:02 +0200 Subject: [PATCH 380/833] ofproto-dpif-xlate: Fix recirculation with patch port and controller. If a packet originating from the controller recirculates after going through a patch port, it gets dropped with the following message: ofproto_dpif_upcall(handler8)|INFO|received packet on unassociated datapath port 4294967295 This happens because there's no xport_uuid in the recirculation node and at the same time in_port refers to the patch port. The patch, in the case of zeroed uuid, checks that in_port belongs to the bridge and returns the related ofproto. 
Reported-at: https://bugzilla.redhat.com/2170920 Signed-off-by: Paolo Valerio Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif-xlate.c | 12 +++++++++++- tests/ofproto-dpif.at | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index d608a5f257c..be4bd665768 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -1615,7 +1615,8 @@ xlate_lookup_ofproto_(const struct dpif_backer *backer, } ofp_port_t in_port = recirc_id_node->state.metadata.in_port; - if (in_port != OFPP_NONE && in_port != OFPP_CONTROLLER) { + if (in_port != OFPP_NONE && in_port != OFPP_CONTROLLER && + !uuid_is_zero(&recirc_id_node->state.xport_uuid)) { struct uuid xport_uuid = recirc_id_node->state.xport_uuid; xport = xport_lookup_by_uuid(xcfg, &xport_uuid); if (xport && xport->xbridge && xport->xbridge->ofproto) { @@ -1626,11 +1627,19 @@ xlate_lookup_ofproto_(const struct dpif_backer *backer, * that the packet originated from the controller via an OpenFlow * "packet-out". The right thing to do is to find just the * ofproto. There is no xport, which is OK. + * Also a zeroed xport_uuid with a valid in_port, means that + * the packet originated from OFPP_CONTROLLER passed + * through a patch port. * * OFPP_NONE can also indicate that a bond caused recirculation. */ struct uuid uuid = recirc_id_node->state.ofproto_uuid; const struct xbridge *bridge = xbridge_lookup_by_uuid(xcfg, &uuid); + if (bridge && bridge->ofproto) { + if (in_port != OFPP_CONTROLLER && in_port != OFPP_NONE && + !get_ofp_port(bridge, in_port)) { + goto xport_lookup; + } if (errorp) { *errorp = NULL; } @@ -1643,6 +1652,7 @@ xlate_lookup_ofproto_(const struct dpif_backer *backer, } } +xport_lookup: xport = xport_lookup(xcfg, tnl_port_should_receive(flow) ? 
tnl_port_receive(flow) : odp_port_to_ofport(backer, flow->in_port.odp_port)); diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at index f242f77f316..a39d0d3ae98 100644 --- a/tests/ofproto-dpif.at +++ b/tests/ofproto-dpif.at @@ -5854,6 +5854,40 @@ OVS_WAIT_UNTIL([check_flows], [ovs-ofctl dump-flows br0]) OVS_VSWITCHD_STOP AT_CLEANUP +dnl Checks for regression against a bug in which OVS dropped packets +dnl originating from a controller passing through a patch port. +AT_SETUP([ofproto-dpif - packet-out recirculation OFPP_CONTROLLER and patch port]) +OVS_VSWITCHD_START( + [add-port br0 patch-br1 -- \ + set interface patch-br1 type=patch options:peer=patch-br0 -- \ + add-br br1 -- set bridge br1 datapath-type=dummy fail-mode=secure -- \ + add-port br1 patch-br0 -- set interface patch-br0 type=patch options:peer=patch-br1 +]) + +add_of_ports --pcap br1 1 + +AT_DATA([flows-br0.txt], [dnl +table=0 icmp actions=output:patch-br1 +]) +AT_CHECK([ovs-ofctl add-flows br0 flows-br0.txt]) + +AT_DATA([flows-br1.txt], [dnl +table=0, icmp actions=ct(table=1,zone=1) +table=1, ct_state=+trk, icmp actions=p1 +]) +AT_CHECK([ovs-ofctl add-flows br1 flows-br1.txt]) + +packet=50540000000750540000000508004500005c000000008001b94dc0a80001c0a80002080013fc00000000000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f +AT_CHECK([ovs-ofctl packet-out br0 "in_port=CONTROLLER packet=$packet actions=table"]) + +OVS_WAIT_UNTIL_EQUAL([ovs-ofctl dump-flows -m br1 | grep "ct_state" | ofctl_strip], [dnl + table=1, n_packets=1, n_bytes=106, ct_state=+trk,icmp actions=output:2]) + +OVS_WAIT_UNTIL([ovs-pcap p1-tx.pcap | grep -q "$packet"]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([ofproto-dpif - debug_slow action]) OVS_VSWITCHD_START add_of_ports br0 1 2 3 From 563c50fba7c5eb8ed4c2a4e42131a7bdc670a4a6 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 1 Sep 2023 19:32:02 +0200 Subject: [PATCH 381/833] ovsdb-cluster.at: 
Remove extra ordinal schema and schema name operations. Many tests are retrieving the schema name twice and also producing an ordinal schema which is not used in these tests. Acked-by: Simon Horman Signed-off-by: Ilya Maximets --- tests/ovsdb-cluster.at | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at index 3e8bca59a30..481afc08b32 100644 --- a/tests/ovsdb-cluster.at +++ b/tests/ovsdb-cluster.at @@ -104,8 +104,6 @@ ovsdb_test_cluster_disconnect () { n=$1 leader_or_follower=$2 check_flapping=$3 - schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` - ordinal_schema > schema AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr]) cid=`ovsdb-tool db-cid s1.db` schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` @@ -221,8 +219,6 @@ AT_SETUP([OVSDB cluster - initial status should be disconnected]) AT_KEYWORDS([ovsdb server negative unix cluster disconnect]) n=3 -schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` -ordinal_schema > schema AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr]) cid=`ovsdb-tool db-cid s1.db` schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` @@ -260,8 +256,6 @@ AT_SETUP([OVSDB cluster - election timer change]) AT_KEYWORDS([ovsdb server positive unix cluster timer]) n=3 -schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` -ordinal_schema > schema AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr]) cid=`ovsdb-tool db-cid s1.db` schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` @@ -353,8 +347,6 @@ AT_SETUP([OVSDB cluster - install snapshot RPC]) AT_KEYWORDS([ovsdb server positive unix cluster snapshot]) n=3 -schema_name=`ovsdb-tool schema-name 
$abs_srcdir/idltest.ovsschema` -ordinal_schema > schema AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr]) cid=`ovsdb-tool db-cid s1.db` schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` @@ -433,8 +425,6 @@ AT_SETUP([OVSDB cluster - follower crash while joining]) AT_KEYWORDS([ovsdb server negative unix cluster join]) n=3 -schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` -ordinal_schema > schema AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db dnl $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr]) cid=`ovsdb-tool db-cid s1.db` @@ -665,8 +655,6 @@ AT_SETUP([OVSDB cluster - competing candidates]) AT_KEYWORDS([ovsdb server negative unix cluster competing-candidates]) n=3 -schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` -ordinal_schema > schema AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr]) cid=`ovsdb-tool db-cid s1.db` schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` From bac34b26a72161a686d432022364f3e9db94f385 Mon Sep 17 00:00:00 2001 From: Ales Musil Date: Mon, 4 Sep 2023 16:09:07 +0200 Subject: [PATCH 382/833] netlink-conntrack: Fix partial match of entries with SCTP. The SCTP protocol ports were excluded from the netlink encoding. In that case the nl_ct_flush_tuple() would return EOPNOTSUPP, that could result in some CT entries not being properly flushed if we would hit SCTP entry earlier than others. This at the same time allows to flush SCTP on its own in during partial match. This should still be considered a bug, because OvS currently supports SCTP CT entries, and it should also support partial flush for them the same way it supports partial flush for TCP/UDP. 
Reported-at: https://bugzilla.redhat.com/2228037 Signed-off-by: Ales Musil Signed-off-by: Ilya Maximets --- lib/netlink-conntrack.c | 3 ++- tests/system-traffic.at | 26 ++++++++++++++++++++++---- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/lib/netlink-conntrack.c b/lib/netlink-conntrack.c index 4fcde9ba1e3..492bfcffb8a 100644 --- a/lib/netlink-conntrack.c +++ b/lib/netlink-conntrack.c @@ -579,7 +579,8 @@ nl_ct_put_tuple_proto(struct ofpbuf *buf, const struct ct_dpif_tuple *tuple) nl_msg_put_u8(buf, CTA_PROTO_ICMPV6_TYPE, tuple->icmp_type); nl_msg_put_u8(buf, CTA_PROTO_ICMPV6_CODE, tuple->icmp_code); } else if (tuple->ip_proto == IPPROTO_TCP || - tuple->ip_proto == IPPROTO_UDP) { + tuple->ip_proto == IPPROTO_UDP || + tuple->ip_proto == IPPROTO_SCTP) { nl_msg_put_be16(buf, CTA_PROTO_SRC_PORT, tuple->src_port); nl_msg_put_be16(buf, CTA_PROTO_DST_PORT, tuple->dst_port); } else { diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 808c492a225..418cd32fecd 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -2516,6 +2516,7 @@ AT_CLEANUP AT_SETUP([conntrack - ct flush]) CHECK_CONNTRACK() +CHECK_CONNTRACK_SCTP() OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(at_ns0, at_ns1) @@ -2526,10 +2527,8 @@ ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") AT_DATA([flows.txt], [dnl priority=1,action=drop priority=10,arp,action=normal -priority=100,in_port=1,udp,action=ct(commit),2 -priority=100,in_port=2,udp,action=ct(zone=5,commit),1 -priority=100,in_port=1,icmp,action=ct(commit),2 -priority=100,in_port=2,icmp,action=ct(zone=5,commit),1 +priority=100,in_port=1,ip,action=ct(commit),2 +priority=100,in_port=2,ip,action=ct(zone=5,commit),1 ]) AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) @@ -2692,6 +2691,25 @@ udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10. AT_CHECK([FLUSH_CMD]) +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [1]) + +dnl Test SCTP flush based on port. 
+AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500003400010000408464410a0101010a01010200010002000000009178f7d30100001470e18ccc00000000000a000a00000000 actions=resubmit(,0)"]) +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000950540000000a08004500003400010000408464410a0101020a010101000200010000000098f29e470100001470e18ccc00000000000a000a00000000 actions=resubmit(,0)"]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1" | sed "s/,protoinfo=.*$//" | sort], [0], [dnl +sctp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1) +sctp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 +]) + +AT_CHECK([FLUSH_CMD 'ct_nw_src=10.1.1.1,ct_nw_proto=132,ct_tp_src=1,ct_tp_dst=2']) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1" | sed "s/,protoinfo=.*$//" | sort], [0], [dnl +sctp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 +]) + +AT_CHECK([FLUSH_CMD 'ct_nw_src=10.1.1.2,ct_nw_proto=132,ct_tp_src=2,ct_tp_dst=1']) + AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [1]) ]) From 0896dc19efb5825e3dce0ade09df1c31c0297c74 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Sat, 9 Sep 2023 04:18:36 +0200 Subject: [PATCH 383/833] python: idl: Fix last-id update from a monitor reply. While sending a reply to the monitor_cond_since request, server includes the last transaction ID. And it sends new IDs with each subsequent update. Current implementation doesn't use the one supplied with a monitor reply, and only takes into account IDs provided with monitor updates. That may cause various issues: 1. Performance: During initialization, the last-id is set to zero. 
If re-connection will happen after receiving a monitor reply, but before any monitor update, the client will send a new monitor request with an all-zero last-id and will re-download the whole database again. 2. Data inconsistency: Assuming one of the clients sends a transaction, but our python client disconnects before receiving a monitor update for this transaction. The last-id will point to a database state before this transaction. On re-connection, this last-id will be sent and the monitor reply will contain a diff with a new data from that transaction. But if another disconnection happens right after that, on second re-connection our python client will send another monitor_cond_since with exactly the same last-id. That will cause receiving the same set of updates again. And since it's an update2 message with a diff of the data, the client will remove previously applied result of the transaction. At this point it will have a different database view with the server potentially leading to all sorts of data inconsistency problems. Fix that by always updating the last-id from the latest monitor reply. 
Fixes: 46d44cf3be0d ("python: idl: Add monitor_cond_since support.") Acked-by: Simon Horman Acked-by: Han Zhou Signed-off-by: Ilya Maximets --- python/ovs/db/idl.py | 1 + tests/ovsdb-idl.at | 22 +++++++++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/python/ovs/db/idl.py b/python/ovs/db/idl.py index 9fc2159b04a..16ece0334cf 100644 --- a/python/ovs/db/idl.py +++ b/python/ovs/db/idl.py @@ -494,6 +494,7 @@ def run(self): if not msg.result[0]: self.__clear() self.__parse_update(msg.result[2], OVSDB_UPDATE3) + self.last_id = msg.result[1] elif self.state == self.IDL_S_DATA_MONITOR_COND_REQUESTED: self.__clear() self.__parse_update(msg.result, OVSDB_UPDATE2) diff --git a/tests/ovsdb-idl.at b/tests/ovsdb-idl.at index df5a9d2fd20..1028b023787 100644 --- a/tests/ovsdb-idl.at +++ b/tests/ovsdb-idl.at @@ -2332,6 +2332,23 @@ CHECK_STREAM_OPEN_BLOCK([Python3], [$PYTHON3 $srcdir/test-stream.py], CHECK_STREAM_OPEN_BLOCK([Python3], [$PYTHON3 $srcdir/test-stream.py], [ssl6], [[[::1]]]) +dnl OVSDB_CLUSTER_CHECK_MONITOR_COND_SINCE_TXN_IDS(LOG) +dnl +dnl Looks up transaction IDs in the log of OVSDB client application. +dnl All-zero UUID should not be sent within a monitor request more than once, +dnl unless some database requests were lost (not replied). 
+m4_define([OVSDB_CLUSTER_CHECK_MONITOR_COND_SINCE_TXN_IDS], +[ + requests=$(grep -c 'send request' $1) + replies=$(grep -c 'received reply' $1) + + if test "$requests" -eq "$replies"; then + AT_CHECK([grep 'monitor_cond_since' $1 \ + | grep -c "00000000-0000-0000-0000-000000000000" | tr -d '\n'], + [0], [1]) + fi +]) + # same as OVSDB_CHECK_IDL but uses Python IDL implementation with tcp # with multiple remotes to assert the idl connects to the leader of the Raft cluster m4_define([OVSDB_CHECK_IDL_LEADER_ONLY_PY], @@ -2347,10 +2364,11 @@ m4_define([OVSDB_CHECK_IDL_LEADER_ONLY_PY], pids=$(cat s2.pid s3.pid s1.pid | tr '\n' ',') echo $pids AT_CHECK([$PYTHON3 $srcdir/test-ovsdb.py -t30 idl-cluster $srcdir/idltest.ovsschema $remotes $pids $3], - [0], [stdout], [ignore]) + [0], [stdout], [stderr]) remote=$(ovsdb_cluster_leader $remotes "idltest") leader=$(echo $remote | cut -d'|' -f 1) AT_CHECK([grep -F -- "${leader}" stdout], [0], [ignore]) + OVSDB_CLUSTER_CHECK_MONITOR_COND_SINCE_TXN_IDS([stderr]) AT_CLEANUP]) OVSDB_CHECK_IDL_LEADER_ONLY_PY([Check Python IDL connects to leader], 3, ['remote']) @@ -2393,6 +2411,7 @@ m4_define([OVSDB_CHECK_CLUSTER_IDL_C], AT_CHECK([sort stdout | uuidfilt]m4_if([$7],,, [[| $7]]), [0], [$5]) m4_ifval([$8], [AT_CHECK([grep '$8' stderr], [1])], [], []) + OVSDB_CLUSTER_CHECK_MONITOR_COND_SINCE_TXN_IDS([stderr]) AT_CLEANUP]) # Same as OVSDB_CHECK_CLUSTER_IDL_C but uses the Python IDL implementation. @@ -2413,6 +2432,7 @@ m4_define([OVSDB_CHECK_CLUSTER_IDL_PY], AT_CHECK([sort stdout | uuidfilt]m4_if([$7],,, [[| $7]]), [0], [$5]) m4_if([$8], [AT_CHECK([grep '$8' stderr], [1])], [], []) + OVSDB_CLUSTER_CHECK_MONITOR_COND_SINCE_TXN_IDS([stderr]) AT_CLEANUP]) m4_define([OVSDB_CHECK_CLUSTER_IDL], From 1b8fa4a66aa410e9083d49c5a1fbbe524ae25024 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Mon, 11 Sep 2023 17:06:26 +0200 Subject: [PATCH 384/833] checkpatch: Add checks for the subject line. 
This patch adds WARNINGs for the subject line length and the format, i.e., the sentence should start with a capital and end with a dot. Acked-by: Simon Horman Signed-off-by: Eelco Chaudron --- tests/checkpatch.at | 29 ++++++++++++++++++++++ utilities/checkpatch.py | 53 +++++++++++++++++++++++++++++++++-------- 2 files changed, 72 insertions(+), 10 deletions(-) diff --git a/tests/checkpatch.at b/tests/checkpatch.at index fdcdb846e1c..4f6b0c7b36b 100755 --- a/tests/checkpatch.at +++ b/tests/checkpatch.at @@ -8,7 +8,14 @@ OVS_START_SHELL_HELPERS try_checkpatch() { # Take the patch to test from $1. Remove an initial four-space indent # from it and, if it is just headers with no body, add a null body. + # If it does not have a 'Subject', add a valid one. echo "$1" | sed 's/^ //' > test.patch + if grep 'Subject\:' test.patch >/dev/null 2>&1; then : + else + sed -i'' -e '1i\ +Subject: Patch this is. +' test.patch + fi if grep '---' expout >/dev/null 2>&1; then : else printf '\n---\n' >> test.patch @@ -560,3 +567,25 @@ try_checkpatch \ " AT_CLEANUP + +AT_SETUP([checkpatch - subject]) +try_checkpatch \ + "Author: A + Commit: A + Subject: netdev: invalid case and dot ending + + Signed-off-by: A" \ + "WARNING: The subject summary should start with a capital. + WARNING: The subject summary should end with a dot. + Subject: netdev: invalid case and dot ending" + +try_checkpatch \ + "Author: A + Commit: A + Subject: netdev: This is a way to long commit summary and therefor it should report a WARNING! + + Signed-off-by: A" \ + "WARNING: The subject, '<area>: <summary>', is over 70 characters, i.e., 85. + Subject: netdev: This is a way to long commit summary and therefor it should report a WARNING!" + +AT_CLEANUP diff --git a/utilities/checkpatch.py b/utilities/checkpatch.py index 5c4aaefb374..3f42c44f293 100755 --- a/utilities/checkpatch.py +++ b/utilities/checkpatch.py @@ -792,6 +792,36 @@ def run_file_checks(text): check['check'](text) +def run_subject_checks(subject, spellcheck=False): + warnings = False + + if spellcheck and check_spelling(subject, False): + warnings = True + + summary = subject[subject.rindex(': ') + 2:] + area_summary = subject[subject.index(': ') + 2:] + area_summary_len = len(area_summary) + if area_summary_len > 70: + print_warning("The subject, '<area>: <summary>', is over 70 " + "characters, i.e., %u." % area_summary_len) + warnings = True + + if summary[0].isalpha() and summary[0].islower(): + print_warning( + "The subject summary should start with a capital.") + warnings = True + + if subject[-1] not in [".", "?", "!"]: + print_warning( + "The subject summary should end with a dot.") + warnings = True + + if warnings: + print(subject) + + return warnings + + def ovs_checkpatch_parse(text, filename, author=None, committer=None): global print_file_name, total_line, checking_file, \ empty_return_check_state @@ -812,6 +842,7 @@ def ovs_checkpatch_parse(text, filename, author=None, committer=None): r'^@@ ([0-9-+]+),([0-9-+]+) ([0-9-+]+),([0-9-+]+) @@') is_author = re.compile(r'^(Author|From): (.*)$', re.I | re.M | re.S) is_committer = re.compile(r'^(Commit: )(.*)$', re.I | re.M | re.S) + is_subject = re.compile(r'^(Subject: )(.*)$', re.I | re.M | re.S) is_signature = re.compile(r'^(Signed-off-by: )(.*)$', re.I | re.M | re.S) is_co_author = re.compile(r'^(Co-authored-by: )(.*)$', @@ -911,6 +942,8 @@ def ovs_checkpatch_parse(text, filename, author=None, committer=None): committer = is_committer.match(line).group(2) elif is_author.match(line): author = is_author.match(line).group(2) + elif is_subject.match(line): + 
run_subject_checks(line, spellcheck) elif is_signature.match(line): m = is_signature.match(line) signatures.append(m.group(2)) @@ -1029,18 +1062,18 @@ def ovs_checkpatch_file(filename): result = ovs_checkpatch_parse(part.get_payload(decode=False), filename, mail.get('Author', mail['From']), mail['Commit']) - if spellcheck: - if not mail['Subject'] or not mail['Subject'].strip(): - if mail['Subject']: - mail.replace_header('Subject', sys.argv[-1]) - else: - mail.add_header('Subject', sys.argv[-1]) - print("Subject missing! Your provisional subject is", - mail['Subject']) + if not mail['Subject'] or not mail['Subject'].strip(): + if mail['Subject']: + mail.replace_header('Subject', sys.argv[-1]) + else: + mail.add_header('Subject', sys.argv[-1]) + + print("Subject missing! Your provisional subject is", + mail['Subject']) - if check_spelling(mail['Subject'], False): - print("Subject: %s" % mail['Subject']) + if run_subject_checks('Subject: ' + mail['Subject'], spellcheck): + result = True ovs_checkpatch_print_result() return result From 010c256caa63d7117e3a9e0ac2dd5b53f442aebb Mon Sep 17 00:00:00 2001 From: James Raphael Tiovalen Date: Sun, 3 Sep 2023 23:21:54 +0800 Subject: [PATCH 385/833] lib: Add non-null assertions to some return values of `dp_packet_data`. This commit adds some `ovs_assert()` checks to some return values of `dp_packet_data()` to ensure that they are not NULL and to prevent null-pointer dereferences, which might lead to unwanted crashes. We use assertions since it should be impossible for these calls to `dp_packet_data()` to return NULL. 
Reviewed-by: Simon Horman Acked-by: Aaron Conole Acked-by: Eelco Chaudron Signed-off-by: James Raphael Tiovalen Signed-off-by: Ilya Maximets --- lib/dp-packet.c | 17 ++++++++++++----- lib/netdev-native-tnl.c | 6 +++++- lib/pcap-file.c | 4 +++- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/lib/dp-packet.c b/lib/dp-packet.c index 072bc4073ee..8bc747c103f 100644 --- a/lib/dp-packet.c +++ b/lib/dp-packet.c @@ -185,12 +185,15 @@ dp_packet_clone(const struct dp_packet *buffer) struct dp_packet * dp_packet_clone_with_headroom(const struct dp_packet *buffer, size_t headroom) { + const void *data_dp = dp_packet_data(buffer); struct dp_packet *new_buffer; uint32_t mark; - new_buffer = dp_packet_clone_data_with_headroom(dp_packet_data(buffer), - dp_packet_size(buffer), - headroom); + ovs_assert(data_dp); + + new_buffer = dp_packet_clone_data_with_headroom(data_dp, + dp_packet_size(buffer), + headroom); /* Copy the following fields into the returned buffer: l2_pad_size, * l2_5_ofs, l3_ofs, l4_ofs, cutlen, packet_type and md. 
*/ memcpy(&new_buffer->l2_pad_size, &buffer->l2_pad_size, @@ -327,8 +330,12 @@ dp_packet_shift(struct dp_packet *b, int delta) : true); if (delta != 0) { - char *dst = (char *) dp_packet_data(b) + delta; - memmove(dst, dp_packet_data(b), dp_packet_size(b)); + const void *data_dp = dp_packet_data(b); + char *dst = (char *) data_dp + delta; + + ovs_assert(data_dp); + + memmove(dst, data_dp, dp_packet_size(b)); dp_packet_set_data(b, dst); } } diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c index 715bbab2bec..a0682c70fbb 100644 --- a/lib/netdev-native-tnl.c +++ b/lib/netdev-native-tnl.c @@ -43,6 +43,7 @@ #include "seq.h" #include "unaligned.h" #include "unixctl.h" +#include "util.h" #include "openvswitch/vlog.h" VLOG_DEFINE_THIS_MODULE(native_tnl); @@ -415,11 +416,14 @@ parse_gre_header(struct dp_packet *packet, struct dp_packet * netdev_gre_pop_header(struct dp_packet *packet) { + const void *data_dp = dp_packet_data(packet); struct pkt_metadata *md = &packet->md; struct flow_tnl *tnl = &md->tunnel; int hlen = sizeof(struct eth_header) + 4; - hlen += netdev_tnl_is_header_ipv6(dp_packet_data(packet)) ? + ovs_assert(data_dp); + + hlen += netdev_tnl_is_header_ipv6(data_dp) ? 
IPV6_HEADER_LEN : IP_HEADER_LEN; pkt_metadata_init_tnl(md); diff --git a/lib/pcap-file.c b/lib/pcap-file.c index 3ed7ea4880e..0c2fed77662 100644 --- a/lib/pcap-file.c +++ b/lib/pcap-file.c @@ -280,10 +280,12 @@ ovs_pcap_read(struct pcap_file *p_file, struct dp_packet **bufp, void ovs_pcap_write(struct pcap_file *p_file, struct dp_packet *buf) { + const void *data_dp = dp_packet_data(buf); struct pcaprec_hdr prh; struct timeval tv; ovs_assert(dp_packet_is_eth(buf)); + ovs_assert(data_dp); xgettimeofday(&tv); prh.ts_sec = tv.tv_sec; @@ -291,7 +293,7 @@ ovs_pcap_write(struct pcap_file *p_file, struct dp_packet *buf) prh.incl_len = dp_packet_size(buf); prh.orig_len = dp_packet_size(buf); ignore(fwrite(&prh, sizeof prh, 1, p_file->file)); - ignore(fwrite(dp_packet_data(buf), dp_packet_size(buf), 1, p_file->file)); + ignore(fwrite(data_dp, dp_packet_size(buf), 1, p_file->file)); fflush(p_file->file); } From 880a2bbb4b90c64b5d02dc5f5b16e046caa7ed87 Mon Sep 17 00:00:00 2001 From: James Raphael Tiovalen Date: Sun, 3 Sep 2023 23:21:55 +0800 Subject: [PATCH 386/833] lib, ovsdb, vtep: Add various null pointer checks. This commit adds various null pointer checks to some files in the `lib`, `ovsdb`, and `vtep` directories to fix several Coverity defects. These changes are grouped together as they perform similar checks, returning early, skipping some action, or logging a warning if a null pointer is encountered. 
Reviewed-by: Simon Horman Acked-by: Eelco Chaudron Signed-off-by: James Raphael Tiovalen Signed-off-by: Ilya Maximets --- lib/dp-packet.c | 8 ++++---- lib/dpctl.c | 12 ++++++++++++ lib/shash.c | 4 ++++ lib/sset.c | 5 +++++ ovsdb/jsonrpc-server.c | 2 +- ovsdb/monitor.c | 3 +++ ovsdb/ovsdb-client.c | 6 ++++-- ovsdb/row.c | 5 ++++- vtep/vtep-ctl.c | 17 +++++++++-------- 9 files changed, 46 insertions(+), 16 deletions(-) diff --git a/lib/dp-packet.c b/lib/dp-packet.c index 8bc747c103f..ed004c3b902 100644 --- a/lib/dp-packet.c +++ b/lib/dp-packet.c @@ -360,7 +360,7 @@ void * dp_packet_put_zeros(struct dp_packet *b, size_t size) { void *dst = dp_packet_put_uninit(b, size); - memset(dst, 0, size); + nullable_memset(dst, 0, size); return dst; } @@ -371,7 +371,7 @@ void * dp_packet_put(struct dp_packet *b, const void *p, size_t size) { void *dst = dp_packet_put_uninit(b, size); - memcpy(dst, p, size); + nullable_memcpy(dst, p, size); return dst; } @@ -443,7 +443,7 @@ void * dp_packet_push_zeros(struct dp_packet *b, size_t size) { void *dst = dp_packet_push_uninit(b, size); - memset(dst, 0, size); + nullable_memset(dst, 0, size); return dst; } @@ -454,7 +454,7 @@ void * dp_packet_push(struct dp_packet *b, const void *p, size_t size) { void *dst = dp_packet_push_uninit(b, size); - memcpy(dst, p, size); + nullable_memcpy(dst, p, size); return dst; } diff --git a/lib/dpctl.c b/lib/dpctl.c index 79b82a1767d..cd12625a160 100644 --- a/lib/dpctl.c +++ b/lib/dpctl.c @@ -336,6 +336,12 @@ dpctl_add_if(int argc OVS_UNUSED, const char *argv[], value = ""; } + if (!key) { + dpctl_error(dpctl_p, 0, "Invalid option format"); + error = EINVAL; + goto next; + } + if (!strcmp(key, "type")) { type = value; } else if (!strcmp(key, "port_no")) { @@ -454,6 +460,12 @@ dpctl_set_if(int argc, const char *argv[], struct dpctl_params *dpctl_p) value = ""; } + if (!key) { + dpctl_error(dpctl_p, 0, "Invalid option format"); + error = EINVAL; + goto next_destroy_args; + } + if (!strcmp(key, "type")) { if 
(strcmp(value, type)) { dpctl_error(dpctl_p, 0, diff --git a/lib/shash.c b/lib/shash.c index 6af985d0baa..92260cddf8c 100644 --- a/lib/shash.c +++ b/lib/shash.c @@ -205,6 +205,10 @@ shash_delete(struct shash *sh, struct shash_node *node) char * shash_steal(struct shash *sh, struct shash_node *node) { + if (!node) { + return NULL; + } + char *name = node->name; hmap_remove(&sh->map, &node->node); diff --git a/lib/sset.c b/lib/sset.c index aa179002020..fda26812906 100644 --- a/lib/sset.c +++ b/lib/sset.c @@ -261,6 +261,11 @@ char * sset_pop(struct sset *set) { const char *name = SSET_FIRST(set); + + if (!name) { + return NULL; + } + char *copy = xstrdup(name); sset_delete(set, SSET_NODE_FROM_NAME(name)); return copy; diff --git a/ovsdb/jsonrpc-server.c b/ovsdb/jsonrpc-server.c index 9a77760c382..a3ca48a7b35 100644 --- a/ovsdb/jsonrpc-server.c +++ b/ovsdb/jsonrpc-server.c @@ -1038,7 +1038,7 @@ ovsdb_jsonrpc_session_got_request(struct ovsdb_jsonrpc_session *s, request->id); } else if (!strcmp(request->method, "get_schema")) { struct ovsdb *db = ovsdb_jsonrpc_lookup_db(s, request, &reply); - if (!reply) { + if (db && !reply) { reply = jsonrpc_create_reply(ovsdb_schema_to_json(db->schema), request->id); } diff --git a/ovsdb/monitor.c b/ovsdb/monitor.c index 9829cd39ca0..4ccb51b1a98 100644 --- a/ovsdb/monitor.c +++ b/ovsdb/monitor.c @@ -483,6 +483,7 @@ ovsdb_monitor_add_column(struct ovsdb_monitor *dbmon, struct ovsdb_monitor_column *c; mt = shash_find_data(&dbmon->tables, table->schema->name); + ovs_assert(mt); /* Check for column duplication. Return duplicated column name. 
*/ if (mt->columns_index_map[column->index] != -1) { @@ -813,6 +814,8 @@ ovsdb_monitor_table_condition_update( struct ovsdb_error *error; struct ovsdb_condition cond = OVSDB_CONDITION_INITIALIZER(&cond); + ovs_assert(mtc); + error = ovsdb_condition_from_json(table->schema, cond_json, NULL, &cond); if (error) { diff --git a/ovsdb/ovsdb-client.c b/ovsdb/ovsdb-client.c index 46484630d2d..7249805bab5 100644 --- a/ovsdb/ovsdb-client.c +++ b/ovsdb/ovsdb-client.c @@ -1843,7 +1843,7 @@ do_dump(struct jsonrpc *rpc, const char *database, struct ovsdb_schema *schema; struct json *transaction; - const struct shash_node *node, **tables; + const struct shash_node *node, **tables = NULL; size_t n_tables; struct ovsdb_table_schema *tschema; const struct shash *columns; @@ -1869,8 +1869,10 @@ do_dump(struct jsonrpc *rpc, const char *database, shash_add(&custom_columns, argv[i], node->data); } } else { - tables = shash_sort(&schema->tables); n_tables = shash_count(&schema->tables); + if (n_tables) { + tables = shash_sort(&schema->tables); + } } /* Construct transaction to retrieve entire database. 
*/ diff --git a/ovsdb/row.c b/ovsdb/row.c index 2b52b68161f..6b52509a91c 100644 --- a/ovsdb/row.c +++ b/ovsdb/row.c @@ -406,7 +406,10 @@ ovsdb_row_set_add_row(struct ovsdb_row_set *set, const struct ovsdb_row *row) set->rows = x2nrealloc(set->rows, &set->allocated_rows, sizeof *set->rows); } - set->rows[set->n_rows++] = row; + + if (set->rows) { + set->rows[set->n_rows++] = row; + } } struct json * diff --git a/vtep/vtep-ctl.c b/vtep/vtep-ctl.c index 61ec4801ed9..26b8540b4a6 100644 --- a/vtep/vtep-ctl.c +++ b/vtep/vtep-ctl.c @@ -1859,18 +1859,21 @@ del_mcast_entry(struct ctl_context *ctx, const char *encap, const char *dst_ip, bool local) { struct vtep_ctl_context *vtepctl_ctx = vtep_ctl_context_cast(ctx); + struct vteprec_physical_locator_set *ploc_set_cfg; + struct vteprec_physical_locator *ploc_cfg; struct vtep_ctl_mcast_mac *mcast_mac; struct shash *mcast_shash; - struct vteprec_physical_locator *ploc_cfg; - struct vteprec_physical_locator_set *ploc_set_cfg; + struct shash_node *mcast_node; mcast_shash = local ? &ls->mcast_local : &ls->mcast_remote; - mcast_mac = shash_find_data(mcast_shash, mac); - if (!mcast_mac) { + mcast_node = shash_find(mcast_shash, mac); + if (!mcast_node || !mcast_node->data) { return; } + mcast_mac = mcast_node->data; + ploc_cfg = find_ploc(vtepctl_ctx, encap, dst_ip); if (!ploc_cfg) { /* Couldn't find the physical locator, so just ignore. 
*/ @@ -1883,8 +1886,6 @@ del_mcast_entry(struct ctl_context *ctx, del_ploc_from_mcast_mac(mcast_mac, ploc_cfg); if (ovs_list_is_empty(&mcast_mac->locators)) { - struct shash_node *node = shash_find(mcast_shash, mac); - vteprec_physical_locator_set_delete(ploc_set_cfg); if (local) { @@ -1893,8 +1894,8 @@ del_mcast_entry(struct ctl_context *ctx, vteprec_mcast_macs_remote_delete(mcast_mac->remote_cfg); } - free(node->data); - shash_delete(mcast_shash, node); + free(mcast_node->data); + shash_delete(mcast_shash, mcast_node); } else { if (local) { vteprec_mcast_macs_local_set_locator_set(mcast_mac->local_cfg, From a40c55eff92dad5ff73ad501cd6952d744eea2d2 Mon Sep 17 00:00:00 2001 From: James Raphael Tiovalen Date: Sat, 16 Sep 2023 15:28:54 +0800 Subject: [PATCH 387/833] hash: Add explicit typecasts to fix C++ compilation issues. C++ does not allow implicit conversion from void pointer to a specific pointer type. This change removes the cast from uint32_t* to void* in `hash_words_32aligned` and adds an explicit typecast from uint32_t* to uint64_t* in `hash_words_inline`. This issue was initially discovered on G++ v9.2.0 when a downstream C++ application included the hash.h header file and was compiled on an AMD Ryzen Zen 2 CPU (__SSE4_2__ && __x86_64__). On the latest G++ version, it would throw an error. On the latest GCC version with `-Wc++-compat`, it would throw a warning. 
Acked-by: Mike Pattrick Signed-off-by: James Raphael Tiovalen Signed-off-by: Ilya Maximets --- lib/hash.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/hash.h b/lib/hash.h index 7b7f70c112a..307309fd059 100644 --- a/lib/hash.h +++ b/lib/hash.h @@ -198,9 +198,8 @@ hash_finish32(uint64_t hash, uint32_t final, uint32_t semifinal) } static inline uint32_t -hash_words_32aligned(const uint32_t *p_, size_t n_words, uint32_t basis) +hash_words_32aligned(const uint32_t *p, size_t n_words, uint32_t basis) { - const uint32_t *p = (const void *) p_; uint32_t hash1 = basis; uint32_t hash2 = 0; uint32_t hash3 = n_words; @@ -254,7 +253,7 @@ hash_words_32aligned(const uint32_t *p_, size_t n_words, uint32_t basis) static inline uint32_t hash_words_inline(const uint32_t *p_, size_t n_words, uint32_t basis) { - const uint64_t *p = (const void *)p_; + const uint64_t *p = ALIGNED_CAST(const uint64_t *, p_); uint64_t hash1 = basis; uint64_t hash2 = 0; uint64_t hash3 = n_words; From 4fc02650ae0ffa9275077b3fa190fe6d1d1a4e9d Mon Sep 17 00:00:00 2001 From: Xavier Simonart Date: Fri, 22 Sep 2023 15:18:23 +0200 Subject: [PATCH 388/833] ovsdb: Fix potential leak when making diff of conditions. 
OVN unit tests highlight this: ERROR: LeakSanitizer: detected memory leaks Direct leak of 1344 byte(s) in 1 object(s) allocated from: 0 0x4db0b7 in calloc (ovsdb/ovsdb-server+0x4db0b7) 1 0x5c2162 in xcalloc__ lib/util.c:124:31 2 0x5c221c in xcalloc lib/util.c:161:12 3 0x54afbc in ovsdb_condition_diff ovsdb/condition.c:527:21 4 0x529da6 in ovsdb_monitor_table_condition_update ovsdb/monitor.c:824:5 5 0x524fa4 in ovsdb_jsonrpc_parse_monitor_cond_change_request ovsdb/jsonrpc-server.c:1557:13 6 0x5235c3 in ovsdb_jsonrpc_monitor_cond_change ovsdb/jsonrpc-server.c:1624:25 7 0x5217f2 in ovsdb_jsonrpc_session_got_request ovsdb/jsonrpc-server.c:1034:17 8 0x520ee6 in ovsdb_jsonrpc_session_run ovsdb/jsonrpc-server.c:572:17 9 0x51ffbe in ovsdb_jsonrpc_session_run_all ovsdb/jsonrpc-server.c:602:21 10 0x51fbcf in ovsdb_jsonrpc_server_run ovsdb/jsonrpc-server.c:417:9 11 0x517550 in main_loop ovsdb/ovsdb-server.c:224:9 12 0x512e80 in main ovsdb/ovsdb-server.c:507:5 13 0x7f9ecf675b74 in __libc_start_main (/lib64/libc.so.6+0x27b74) Fixes: ef1da757f016 ("ovsdb: condition: Process condition changes incrementally.") Signed-off-by: Xavier Simonart Signed-off-by: Ilya Maximets --- ovsdb/condition.c | 11 ++++++++--- ovsdb/monitor.c | 1 + tests/ovsdb-monitor.at | 1 + 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/ovsdb/condition.c b/ovsdb/condition.c index 5a3eb4e8a3f..4911fbf59be 100644 --- a/ovsdb/condition.c +++ b/ovsdb/condition.c @@ -550,9 +550,14 @@ ovsdb_condition_diff(struct ovsdb_condition *diff, &b->clauses[j]); } - diff->optimized = a->optimized && b->optimized; - if (diff->optimized) { - ovsdb_condition_optimize(diff); + if (diff->n_clauses) { + diff->optimized = a->optimized && b->optimized; + if (diff->optimized) { + ovsdb_condition_optimize(diff); + } + } else { + free(diff->clauses); + diff->clauses = NULL; } } diff --git a/ovsdb/monitor.c b/ovsdb/monitor.c index 4ccb51b1a98..d1e466faa48 100644 --- a/ovsdb/monitor.c +++ b/ovsdb/monitor.c @@ -824,6 +824,7 @@ 
ovsdb_monitor_table_condition_update( ovsdb_condition_destroy(&mtc->new_condition); ovsdb_condition_clone(&mtc->new_condition, &cond); ovsdb_condition_destroy(&cond); + ovsdb_condition_destroy(&mtc->diff_condition); ovsdb_condition_diff(&mtc->diff_condition, &mtc->old_condition, &mtc->new_condition); ovsdb_monitor_condition_add_columns(dbmon, diff --git a/tests/ovsdb-monitor.at b/tests/ovsdb-monitor.at index 12cd2bc3194..3e1df18a112 100644 --- a/tests/ovsdb-monitor.at +++ b/tests/ovsdb-monitor.at @@ -586,6 +586,7 @@ row,action,name,number,_version [[]], [], [[[[["name","==","one"],["name","==","two"]]]], + [[[["name","==","two"],["name","==","one"]]]], [[[["name","==","one"]]]], [[[false]]], [[[true]]]]) From 13dde113107b21d3e1c5f4197590f7a1ce71459e Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Wed, 27 Sep 2023 11:18:59 +0200 Subject: [PATCH 389/833] utilities: Add kernel_delay.py script to debug a busy Linux kernel. This patch adds a utility that can be used to determine if an issue is related to a lack of Linux kernel resources. 
This tool is also featured in a Red Hat developers blog article: https://developers.redhat.com/articles/2023/07/24/troubleshooting-open-vswitch-kernel-blame Reviewed-by: Adrian Moreno Acked-by: Aaron Conole Signed-off-by: Eelco Chaudron --- utilities/automake.mk | 4 + utilities/usdt-scripts/kernel_delay.py | 1420 +++++++++++++++++++++++ utilities/usdt-scripts/kernel_delay.rst | 596 ++++++++++ 3 files changed, 2020 insertions(+) create mode 100755 utilities/usdt-scripts/kernel_delay.py create mode 100644 utilities/usdt-scripts/kernel_delay.rst diff --git a/utilities/automake.mk b/utilities/automake.mk index 37d679f8227..9a2114df40a 100644 --- a/utilities/automake.mk +++ b/utilities/automake.mk @@ -23,6 +23,8 @@ scripts_DATA += utilities/ovs-lib usdt_SCRIPTS += \ utilities/usdt-scripts/bridge_loop.bt \ utilities/usdt-scripts/dpif_nl_exec_monitor.py \ + utilities/usdt-scripts/kernel_delay.py \ + utilities/usdt-scripts/kernel_delay.rst \ utilities/usdt-scripts/reval_monitor.py \ utilities/usdt-scripts/upcall_cost.py \ utilities/usdt-scripts/upcall_monitor.py @@ -70,6 +72,8 @@ EXTRA_DIST += \ utilities/docker/debian/build-kernel-modules.sh \ utilities/usdt-scripts/bridge_loop.bt \ utilities/usdt-scripts/dpif_nl_exec_monitor.py \ + utilities/usdt-scripts/kernel_delay.py \ + utilities/usdt-scripts/kernel_delay.rst \ utilities/usdt-scripts/reval_monitor.py \ utilities/usdt-scripts/upcall_cost.py \ utilities/usdt-scripts/upcall_monitor.py diff --git a/utilities/usdt-scripts/kernel_delay.py b/utilities/usdt-scripts/kernel_delay.py new file mode 100755 index 00000000000..b2012fdf20c --- /dev/null +++ b/utilities/usdt-scripts/kernel_delay.py @@ -0,0 +1,1420 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2022,2023 Red Hat, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# Script information: +# ------------------- +# This script allows a developer to quickly identify if the issue at hand +# might be related to the kernel running out of resources or if it really is +# an Open vSwitch issue. +# +# For documentation see the kernel_delay.rst file. +# +# +# Dependencies: +# ------------- +# You need to install the BCC package for your specific platform or build it +# yourself using the following instructions: +# https://raw.githubusercontent.com/iovisor/bcc/master/INSTALL.md +# +# Python needs the following additional packages installed: +# - pytz +# - psutil +# +# You can either install your distribution specific package or use pip: +# pip install pytz psutil +# +import argparse +import datetime +import os +import pytz +import psutil +import re +import sys +import time + +import ctypes as ct + +try: + from bcc import BPF, USDT, USDTException + from bcc.syscall import syscalls, syscall_name +except ModuleNotFoundError: + print("ERROR: Can't find the BPF Compiler Collection (BCC) tools!") + sys.exit(os.EX_OSFILE) + +from enum import IntEnum + + +# +# Actual eBPF source code +# +EBPF_SOURCE = """ +#include +#include + +#define MONITOR_PID + +enum { + +}; + +struct event_t { + u64 ts; + u32 tid; + u32 id; + + int user_stack_id; + int kernel_stack_id; + + u32 syscall; + u64 entry_ts; + +}; + +BPF_RINGBUF_OUTPUT(events, ); +BPF_STACK_TRACE(stack_traces, ); +BPF_TABLE("percpu_array", uint32_t, uint64_t, dropcnt, 1); +BPF_TABLE("percpu_array", uint32_t, uint64_t, trigger_miss, 1); + +BPF_ARRAY(capture_on, u64, 1); +static 
inline bool capture_enabled(u64 pid_tgid) { + int key = 0; + u64 *ret; + + if ((pid_tgid >> 32) != MONITOR_PID) + return false; + + ret = capture_on.lookup(&key); + return ret && *ret == 1; +} + +static inline bool capture_enabled__() { + int key = 0; + u64 *ret; + + ret = capture_on.lookup(&key); + return ret && *ret == 1; +} + +static struct event_t *get_event(uint32_t id) { + struct event_t *event = events.ringbuf_reserve(sizeof(struct event_t)); + + if (!event) { + dropcnt.increment(0); + return NULL; + } + + event->id = id; + event->ts = bpf_ktime_get_ns(); + event->tid = bpf_get_current_pid_tgid(); + + return event; +} + +static int start_trigger() { + int key = 0; + u64 *val = capture_on.lookup(&key); + + /* If the value is -1 we can't start as we are still processing the + * results in userspace. */ + if (!val || *val != 0) { + trigger_miss.increment(0); + return 0; + } + + struct event_t *event = get_event(EVENT_START_TRIGGER); + if (event) { + events.ringbuf_submit(event, 0); + *val = 1; + } else { + trigger_miss.increment(0); + } + return 0; +} + +static int stop_trigger() { + int key = 0; + u64 *val = capture_on.lookup(&key); + + if (!val || *val != 1) + return 0; + + struct event_t *event = get_event(EVENT_STOP_TRIGGER); + + if (event) + events.ringbuf_submit(event, 0); + + if (val) + *val = -1; + + return 0; +} + + + + + +/* + * For the syscall monitor the following probes get installed. 
+ */ +struct syscall_data_t { + u64 count; + u64 total_ns; + u64 worst_ns; +}; + +struct syscall_data_key_t { + u32 pid; + u32 tid; + u32 syscall; +}; + +BPF_HASH(syscall_start, u64, u64); +BPF_HASH(syscall_data, struct syscall_data_key_t, struct syscall_data_t); + +TRACEPOINT_PROBE(raw_syscalls, sys_enter) { + u64 pid_tgid = bpf_get_current_pid_tgid(); + + if (!capture_enabled(pid_tgid)) + return 0; + + u64 t = bpf_ktime_get_ns(); + syscall_start.update(&pid_tgid, &t); + + return 0; +} + +TRACEPOINT_PROBE(raw_syscalls, sys_exit) { + struct syscall_data_t *val, zero = {}; + struct syscall_data_key_t key; + + u64 pid_tgid = bpf_get_current_pid_tgid(); + + if (!capture_enabled(pid_tgid)) + return 0; + + key.pid = pid_tgid >> 32; + key.tid = (u32)pid_tgid; + key.syscall = args->id; + + u64 *start_ns = syscall_start.lookup(&pid_tgid); + + if (!start_ns) + return 0; + + val = syscall_data.lookup_or_try_init(&key, &zero); + if (val) { + u64 delta = bpf_ktime_get_ns() - *start_ns; + val->count++; + val->total_ns += delta; + if (val->worst_ns == 0 || delta > val->worst_ns) + val->worst_ns = delta; + + if () { + struct event_t *event = get_event(EVENT_SYSCALL); + if (event) { + event->syscall = args->id; + event->entry_ts = *start_ns; + if () { + event->user_stack_id = stack_traces.get_stackid( + args, BPF_F_USER_STACK); + event->kernel_stack_id = stack_traces.get_stackid( + args, 0); + } + events.ringbuf_submit(event, 0); + } + } + } + return 0; +} + + +/* + * For measuring the thread run time, we need the following. 
+ */ +struct run_time_data_t { + u64 count; + u64 total_ns; + u64 max_ns; + u64 min_ns; +}; + +struct pid_tid_key_t { + u32 pid; + u32 tid; +}; + +BPF_HASH(run_start, u64, u64); +BPF_HASH(run_data, struct pid_tid_key_t, struct run_time_data_t); + +static inline void thread_start_run(u64 pid_tgid, u64 ktime) +{ + run_start.update(&pid_tgid, &ktime); +} + +static inline void thread_stop_run(u32 pid, u32 tgid, u64 ktime) +{ + u64 pid_tgid = (u64) tgid << 32 | pid; + u64 *start_ns = run_start.lookup(&pid_tgid); + + if (!start_ns || *start_ns == 0) + return; + + struct run_time_data_t *val, zero = {}; + struct pid_tid_key_t key = { .pid = tgid, + .tid = pid }; + + val = run_data.lookup_or_try_init(&key, &zero); + if (val) { + u64 delta = ktime - *start_ns; + val->count++; + val->total_ns += delta; + if (val->max_ns == 0 || delta > val->max_ns) + val->max_ns = delta; + if (val->min_ns == 0 || delta < val->min_ns) + val->min_ns = delta; + } + *start_ns = 0; +} + + +/* + * For measuring the thread-ready delay, we need the following. 
+ */ +struct ready_data_t { + u64 count; + u64 total_ns; + u64 worst_ns; +}; + +BPF_HASH(ready_start, u64, u64); +BPF_HASH(ready_data, struct pid_tid_key_t, struct ready_data_t); + +static inline int sched_wakeup__(u32 pid, u32 tgid) +{ + u64 pid_tgid = (u64) tgid << 32 | pid; + + if (!capture_enabled(pid_tgid)) + return 0; + + u64 t = bpf_ktime_get_ns(); + ready_start.update(&pid_tgid, &t); + return 0; +} + +RAW_TRACEPOINT_PROBE(sched_wakeup) +{ + struct task_struct *t = (struct task_struct *)ctx->args[0]; + return sched_wakeup__(t->pid, t->tgid); +} + +RAW_TRACEPOINT_PROBE(sched_wakeup_new) +{ + struct task_struct *t = (struct task_struct *)ctx->args[0]; + return sched_wakeup__(t->pid, t->tgid); +} + +RAW_TRACEPOINT_PROBE(sched_switch) +{ + struct task_struct *prev = (struct task_struct *)ctx->args[1]; + struct task_struct *next= (struct task_struct *)ctx->args[2]; + u64 ktime = 0; + + if (!capture_enabled__()) + return 0; + + if (prev-> == TASK_RUNNING && prev->tgid == MONITOR_PID) + sched_wakeup__(prev->pid, prev->tgid); + + if (prev->tgid == MONITOR_PID) { + ktime = bpf_ktime_get_ns(); + thread_stop_run(prev->pid, prev->tgid, ktime); + } + + u64 pid_tgid = (u64)next->tgid << 32 | next->pid; + + if (next->tgid != MONITOR_PID) + return 0; + + if (ktime == 0) + ktime = bpf_ktime_get_ns(); + + u64 *start_ns = ready_start.lookup(&pid_tgid); + + if (start_ns && *start_ns != 0) { + + struct ready_data_t *val, zero = {}; + struct pid_tid_key_t key = { .pid = next->tgid, + .tid = next->pid }; + + val = ready_data.lookup_or_try_init(&key, &zero); + if (val) { + u64 delta = ktime - *start_ns; + val->count++; + val->total_ns += delta; + if (val->worst_ns == 0 || delta > val->worst_ns) + val->worst_ns = delta; + } + *start_ns = 0; + } + + thread_start_run(pid_tgid, ktime); + return 0; +} + + +/* + * For measuring the hard irq time, we need the following. 
+ */ +struct hardirq_start_data_t { + u64 start_ns; + char irq_name[32]; +}; + +struct hardirq_data_t { + u64 count; + u64 total_ns; + u64 worst_ns; +}; + +struct hardirq_data_key_t { + u32 pid; + u32 tid; + char irq_name[32]; +}; + +BPF_HASH(hardirq_start, u64, struct hardirq_start_data_t); +BPF_HASH(hardirq_data, struct hardirq_data_key_t, struct hardirq_data_t); + +TRACEPOINT_PROBE(irq, irq_handler_entry) +{ + u64 pid_tgid = bpf_get_current_pid_tgid(); + + if (!capture_enabled(pid_tgid)) + return 0; + + struct hardirq_start_data_t data = {}; + + data.start_ns = bpf_ktime_get_ns(); + TP_DATA_LOC_READ_STR(&data.irq_name, name, sizeof(data.irq_name)); + hardirq_start.update(&pid_tgid, &data); + return 0; +} + +TRACEPOINT_PROBE(irq, irq_handler_exit) +{ + u64 pid_tgid = bpf_get_current_pid_tgid(); + + if (!capture_enabled(pid_tgid)) + return 0; + + struct hardirq_start_data_t *data; + data = hardirq_start.lookup(&pid_tgid); + if (!data || data->start_ns == 0) + return 0; + + if (args->ret != IRQ_NONE) { + struct hardirq_data_t *val, zero = {}; + struct hardirq_data_key_t key = { .pid = pid_tgid >> 32, + .tid = (u32)pid_tgid }; + + bpf_probe_read_kernel(&key.irq_name, sizeof(key.irq_name), + data->irq_name); + val = hardirq_data.lookup_or_try_init(&key, &zero); + if (val) { + u64 delta = bpf_ktime_get_ns() - data->start_ns; + val->count++; + val->total_ns += delta; + if (val->worst_ns == 0 || delta > val->worst_ns) + val->worst_ns = delta; + } + } + + data->start_ns = 0; + return 0; +} + + +/* + * For measuring the soft irq time, we need the following. 
+ */ +struct softirq_start_data_t { + u64 start_ns; + u32 vec_nr; +}; + +struct softirq_data_t { + u64 count; + u64 total_ns; + u64 worst_ns; +}; + +struct softirq_data_key_t { + u32 pid; + u32 tid; + u32 vec_nr; +}; + +BPF_HASH(softirq_start, u64, struct softirq_start_data_t); +BPF_HASH(softirq_data, struct softirq_data_key_t, struct softirq_data_t); + +TRACEPOINT_PROBE(irq, softirq_entry) +{ + u64 pid_tgid = bpf_get_current_pid_tgid(); + + if (!capture_enabled(pid_tgid)) + return 0; + + struct softirq_start_data_t data = {}; + + data.start_ns = bpf_ktime_get_ns(); + data.vec_nr = args->vec; + softirq_start.update(&pid_tgid, &data); + return 0; +} + +TRACEPOINT_PROBE(irq, softirq_exit) +{ + u64 pid_tgid = bpf_get_current_pid_tgid(); + + if (!capture_enabled(pid_tgid)) + return 0; + + struct softirq_start_data_t *data; + data = softirq_start.lookup(&pid_tgid); + if (!data || data->start_ns == 0) + return 0; + + struct softirq_data_t *val, zero = {}; + struct softirq_data_key_t key = { .pid = pid_tgid >> 32, + .tid = (u32)pid_tgid, + .vec_nr = data->vec_nr}; + + val = softirq_data.lookup_or_try_init(&key, &zero); + if (val) { + u64 delta = bpf_ktime_get_ns() - data->start_ns; + val->count++; + val->total_ns += delta; + if (val->worst_ns == 0 || delta > val->worst_ns) + val->worst_ns = delta; + } + + data->start_ns = 0; + return 0; +} +""" + + +# +# time_ns() +# +try: + from time import time_ns +except ImportError: + # For compatibility with Python <= v3.6. + def time_ns(): + now = datetime.datetime.now() + return int(now.timestamp() * 1e9) + + +# +# Probe class to use for the start/stop triggers +# +class Probe(object): + ''' + The goal for this object is to support as many as possible + probe/events as supported by BCC. 
See + https://github.com/iovisor/bcc/blob/master/docs/reference_guide.md#events--arguments + ''' + def __init__(self, probe, pid=None): + self.pid = pid + self.text_probe = probe + self._parse_text_probe() + + def __str__(self): + if self.probe_type == "usdt": + return "[{}]; {}:{}:{}".format(self.text_probe, self.probe_type, + self.usdt_provider, self.usdt_probe) + elif self.probe_type == "trace": + return "[{}]; {}:{}:{}".format(self.text_probe, self.probe_type, + self.trace_system, self.trace_event) + elif self.probe_type == "kprobe" or self.probe_type == "kretprobe": + return "[{}]; {}:{}".format(self.text_probe, self.probe_type, + self.kprobe_function) + elif self.probe_type == "uprobe" or self.probe_type == "uretprobe": + return "[{}]; {}:{}".format(self.text_probe, self.probe_type, + self.uprobe_function) + else: + return "[{}] <{}:unknown probe>".format(self.text_probe, + self.probe_type) + + def _raise(self, error): + raise ValueError("[{}]; {}".format(self.text_probe, error)) + + def _verify_kprobe_probe(self): + # Nothing to verify for now, just return. + return + + def _verify_trace_probe(self): + # Nothing to verify for now, just return. + return + + def _verify_uprobe_probe(self): + # Nothing to verify for now, just return. 
+ return + + def _verify_usdt_probe(self): + if not self.pid: + self._raise("USDT probes need a valid PID.") + + usdt = USDT(pid=self.pid) + + for probe in usdt.enumerate_probes(): + if probe.provider.decode("utf-8") == self.usdt_provider and \ + probe.name.decode("utf-8") == self.usdt_probe: + return + + self._raise("Can't find UDST probe '{}:{}'".format(self.usdt_provider, + self.usdt_probe)) + + def _parse_text_probe(self): + ''' + The text probe format is defined as follows: + <probe_type>:<probe_specific> + + Types: + USDT: u|usdt:<provider>:<probe> + TRACE: t|trace:<system>:<event> + KPROBE: k|kprobe:<kernel_function> + KRETPROBE: kr|kretprobe:<kernel_function> + UPROBE: up|uprobe:<function> + URETPROBE: ur|uretprobe:<function> + ''' + args = self.text_probe.split(":") + if len(args) <= 1: + self._raise("Can't extract probe type.") + + if args[0] not in ["k", "kprobe", "kr", "kretprobe", "t", "trace", + "u", "usdt", "up", "uprobe", "ur", "uretprobe"]: + self._raise("Invalid probe type '{}'".format(args[0])) + + self.probe_type = "kprobe" if args[0] == "k" else args[0] + self.probe_type = "kretprobe" if args[0] == "kr" else self.probe_type + self.probe_type = "trace" if args[0] == "t" else self.probe_type + self.probe_type = "usdt" if args[0] == "u" else self.probe_type + self.probe_type = "uprobe" if args[0] == "up" else self.probe_type + self.probe_type = "uretprobe" if args[0] == "ur" else self.probe_type + + if self.probe_type == "usdt": + if len(args) != 3: + self._raise("Invalid number of arguments for USDT") + + self.usdt_provider = args[1] + self.usdt_probe = args[2] + self._verify_usdt_probe() + + elif self.probe_type == "trace": + if len(args) != 3: + self._raise("Invalid number of arguments for TRACE") + + self.trace_system = args[1] + self.trace_event = args[2] + self._verify_trace_probe() + + elif self.probe_type == "kprobe" or self.probe_type == "kretprobe": + if len(args) != 2: + self._raise("Invalid number of arguments for K(RET)PROBE") + self.kprobe_function = args[1] + self._verify_kprobe_probe() + + elif self.probe_type == "uprobe" or self.probe_type == 
"uretprobe": + if len(args) != 2: + self._raise("Invalid number of arguments for U(RET)PROBE") + self.uprobe_function = args[1] + self._verify_uprobe_probe() + + def _get_kprobe_c_code(self, function_name, function_content): + # + # The kprobe__* do not require a function name, so it's + # ignored in the code generation. + # + return """ +int {}__{}(struct pt_regs *ctx) {{ + {} +}} +""".format(self.probe_type, self.kprobe_function, function_content) + + def _get_trace_c_code(self, function_name, function_content): + # + # The TRACEPOINT_PROBE() do not require a function name, so it's + # ignored in the code generation. + # + return """ +TRACEPOINT_PROBE({},{}) {{ + {} +}} +""".format(self.trace_system, self.trace_event, function_content) + + def _get_uprobe_c_code(self, function_name, function_content): + return """ +int {}(struct pt_regs *ctx) {{ + {} +}} +""".format(function_name, function_content) + + def _get_usdt_c_code(self, function_name, function_content): + return """ +int {}(struct pt_regs *ctx) {{ + {} +}} +""".format(function_name, function_content) + + def get_c_code(self, function_name, function_content): + if self.probe_type == "kprobe" or self.probe_type == "kretprobe": + return self._get_kprobe_c_code(function_name, function_content) + elif self.probe_type == "trace": + return self._get_trace_c_code(function_name, function_content) + elif self.probe_type == "uprobe" or self.probe_type == "uretprobe": + return self._get_uprobe_c_code(function_name, function_content) + elif self.probe_type == "usdt": + return self._get_usdt_c_code(function_name, function_content) + + return "" + + def probe_name(self): + if self.probe_type == "kprobe" or self.probe_type == "kretprobe": + return "{}".format(self.kprobe_function) + elif self.probe_type == "trace": + return "{}:{}".format(self.trace_system, + self.trace_event) + elif self.probe_type == "uprobe" or self.probe_type == "uretprobe": + return "{}".format(self.uprobe_function) + elif self.probe_type == 
"usdt": + return "{}:{}".format(self.usdt_provider, + self.usdt_probe) + + return "" + + +# +# event_to_dict() +# +def event_to_dict(event): + return dict([(field, getattr(event, field)) + for (field, _) in event._fields_ + if isinstance(getattr(event, field), (int, bytes))]) + + +# +# Event enum +# +Event = IntEnum("Event", ["SYSCALL", "START_TRIGGER", "STOP_TRIGGER"], + start=0) + + +# +# process_event() +# +def process_event(ctx, data, size): + global start_trigger_ts + global stop_trigger_ts + + event = bpf["events"].event(data) + if event.id == Event.SYSCALL: + syscall_events.append({"tid": event.tid, + "ts_entry": event.entry_ts, + "ts_exit": event.ts, + "syscall": event.syscall, + "user_stack_id": event.user_stack_id, + "kernel_stack_id": event.kernel_stack_id}) + elif event.id == Event.START_TRIGGER: + # + # This event would have started the trigger already, so all we need to + # do is record the start timestamp. + # + start_trigger_ts = event.ts + + elif event.id == Event.STOP_TRIGGER: + # + # This event would have stopped the trigger already, so all we need to + # do is record the start timestamp. 
#
# next_power_of_two()
#
def next_power_of_two(val):
    """Return the smallest power of two that is >= val (1 for val <= 1)."""
    np = 1
    while np < val:
        np *= 2
    return np


#
# unsigned_int()
#
def unsigned_int(value):
    """argparse 'type' helper: convert value to an int and reject negatives.

    Raises argparse.ArgumentTypeError when value is not an integer or is
    below zero.  Note that zero is accepted.
    """
    try:
        value = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError("must be an integer")

    if value < 0:
        raise argparse.ArgumentTypeError("must be positive")
    return value


#
# unsigned_nonzero_int()
#
def unsigned_nonzero_int(value):
    """argparse 'type' helper: like unsigned_int(), but also rejects zero."""
    value = unsigned_int(value)
    if value == 0:
        raise argparse.ArgumentTypeError("must be nonzero")
    return value


#
# get_thread_name()
#
def get_thread_name(pid, tid):
    """Return the first line of /proc/<pid>/task/<tid>/comm.

    Falls back to the string below when that file does not exist.
    """
    try:
        with open(f"/proc/{pid}/task/{tid}/comm", encoding="utf8") as f:
            return f.readline().strip("\n")
    except FileNotFoundError:
        pass

    # NOTE(review): f-string without placeholders; a marker naming the
    # pid/tid may have been lost here -- confirm against the upstream file.
    return f""


#
# get_vec_nr_name()
#
def get_vec_nr_name(vec_nr):
    """Translate a vector number into its name.

    Valid indexes are 0 .. len(known_vec_nr) - 1; anything else yields the
    fallback string instead of raising IndexError.
    """
    known_vec_nr = ["hi", "timer", "net_tx", "net_rx", "block", "irq_poll",
                    "tasklet", "sched", "hrtimer", "rcu"]

    # '>=' (not '>'): an index equal to the list length is already out of
    # range.
    if vec_nr < 0 or vec_nr >= len(known_vec_nr):
        # NOTE(review): f-string without placeholders; an "unknown" marker
        # may have been lost here -- confirm against the upstream file.
        return f""

    return known_vec_nr[vec_nr]


#
# start/stop/reset capture
#
def start_capture():
    """Set slot 0 of the "capture_on" BPF table to 1."""
    bpf["capture_on"][ct.c_int(0)] = ct.c_int(1)


def stop_capture(force=False):
    """Clear slot 0 of the "capture_on" BPF table.

    With force=True the value 0xffff is written instead of 0.  How the eBPF
    side interprets 0xffff is not visible in this part of the file.
    """
    if force:
        bpf["capture_on"][ct.c_int(0)] = ct.c_int(0xffff)
    else:
        bpf["capture_on"][ct.c_int(0)] = ct.c_int(0)


def capture_running():
    """Return True when slot 0 of the "capture_on" BPF table equals 1."""
    return bpf["capture_on"][ct.c_int(0)].value == 1


def reset_capture():
    """Clear all start/data BPF tables and the stack trace table."""
    bpf["syscall_start"].clear()
    bpf["syscall_data"].clear()
    bpf["run_start"].clear()
    bpf["run_data"].clear()
    bpf["ready_start"].clear()
    bpf["ready_data"].clear()
    bpf["hardirq_start"].clear()
    bpf["hardirq_data"].clear()
    bpf["softirq_start"].clear()
    bpf["softirq_data"].clear()
    bpf["stack_traces"].clear()


#
# Display timestamp
#
def print_timestamp(msg):
    """Print msg followed by the local ISO timestamp and the UTC time."""
    ltz = datetime.datetime.now()
    utc = ltz.astimezone(pytz.utc)
    time_string = "{} @{} ({} UTC)".format(
        msg, ltz.isoformat(), utc.strftime("%H:%M:%S"))
    print(time_string)
#
# process_results()
#
def process_results(syscall_events=None, trigger_delta=None):
    """Print the report for one sample run.

    Reads the global 'bpf' tables ("syscall_data", "run_data", "ready_data",
    "hardirq_data", "softirq_data", "stack_traces") and the global 'options'
    (pid, stack_trace_size).

    syscall_events: optional list of dicts with the keys "ts_entry",
                    "ts_exit", "tid", "syscall", "kernel_stack_id" and
                    "user_stack_id"; when non-empty each event is printed,
                    followed by its stack traces unless
                    options.stack_trace_size is 0.
    trigger_delta:  optional stop-start trigger delta in ns; only used for
                    the first line of the report.
    """
    if trigger_delta:
        print_timestamp("# Triggered sample dump, stop-start delta {:,} ns".
                        format(trigger_delta))
    else:
        print_timestamp("# Sample dump")

    #
    # First get a list of all threads we need to report on.
    #
    threads_syscall = {k.tid for k, _ in bpf["syscall_data"].items()
                       if k.syscall != 0xffffffff}

    threads_run = {k.tid for k, _ in bpf["run_data"].items()
                   if k.pid != 0xffffffff}

    threads_ready = {k.tid for k, _ in bpf["ready_data"].items()
                     if k.pid != 0xffffffff}

    threads_hardirq = {k.tid for k, _ in bpf["hardirq_data"].items()
                       if k.pid != 0xffffffff}

    threads_softirq = {k.tid for k, _ in bpf["softirq_data"].items()
                       if k.pid != 0xffffffff}

    threads = sorted(threads_syscall | threads_run | threads_ready |
                     threads_hardirq | threads_softirq,
                     key=lambda x: get_thread_name(options.pid, x))

    #
    # Print header...
    #
    print("{:10} {:16} {}".format("TID", "THREAD", ""))
    print("{:10} {:16} {}".format("-" * 10, "-" * 16, "-" * 76))
    indent = 28 * " "

    #
    # Print all events/statistics per threads.
    #
    # Not every syscall table has a "poll" entry; poll_id is None in that
    # case, so no syscall gets excluded from the TOTAL line.
    #
    poll_id = next((k for k, v in syscalls.items() if v == b"poll"), None)
    for thread in threads:

        if thread != threads[0]:
            print("")

        #
        # SYSCALL_STATISTICS
        #
        print("{:10} {:16} {}\n{}{:20} {:>6} {:>10} {:>16} {:>16}".format(
            thread, get_thread_name(options.pid, thread),
            "[SYSCALL STATISTICS]", indent,
            "NAME", "NUMBER", "COUNT", "TOTAL ns", "MAX ns"))

        total_count = 0
        total_ns = 0
        for k, v in sorted(filter(lambda t: t[0].tid == thread,
                                  bpf["syscall_data"].items()),
                           key=lambda kv: -kv[1].total_ns):

            print("{}{:20.20} {:6} {:10} {:16,} {:16,}".format(
                indent, syscall_name(k.syscall).decode("utf-8"), k.syscall,
                v.count, v.total_ns, v.worst_ns))
            if k.syscall != poll_id:
                total_count += v.count
                total_ns += v.total_ns

        if total_count > 0:
            print("{}{:20.20} {:6} {:10} {:16,}".format(
                indent, "TOTAL( - poll):", "", total_count, total_ns))

        #
        # THREAD RUN STATISTICS
        #
        print("\n{:10} {:16} {}\n{}{:10} {:>16} {:>16} {:>16}".format(
            "", "", "[THREAD RUN STATISTICS]", indent,
            "SCHED_CNT", "TOTAL ns", "MIN ns", "MAX ns"))

        for k, v in filter(lambda t: t[0].tid == thread,
                           bpf["run_data"].items()):

            print("{}{:10} {:16,} {:16,} {:16,}".format(
                indent, v.count, v.total_ns, v.min_ns, v.max_ns))

        #
        # THREAD READY STATISTICS
        #
        print("\n{:10} {:16} {}\n{}{:10} {:>16} {:>16}".format(
            "", "", "[THREAD READY STATISTICS]", indent,
            "SCHED_CNT", "TOTAL ns", "MAX ns"))

        for k, v in filter(lambda t: t[0].tid == thread,
                           bpf["ready_data"].items()):

            print("{}{:10} {:16,} {:16,}".format(
                indent, v.count, v.total_ns, v.worst_ns))

        #
        # HARD IRQ STATISTICS
        #
        total_ns = 0
        total_count = 0
        header_printed = False
        for k, v in sorted(filter(lambda t: t[0].tid == thread,
                                  bpf["hardirq_data"].items()),
                           key=lambda kv: -kv[1].total_ns):

            if not header_printed:
                print("\n{:10} {:16} {}\n{}{:20} {:>10} {:>16} {:>16}".
                      format("", "", "[HARD IRQ STATISTICS]", indent,
                             "NAME", "COUNT", "TOTAL ns", "MAX ns"))
                header_printed = True

            print("{}{:20.20} {:10} {:16,} {:16,}".format(
                indent, k.irq_name.decode("utf-8"),
                v.count, v.total_ns, v.worst_ns))

            total_count += v.count
            total_ns += v.total_ns

        if total_count > 0:
            print("{}{:20.20} {:10} {:16,}".format(
                indent, "TOTAL:", total_count, total_ns))

        #
        # SOFT IRQ STATISTICS
        #
        total_ns = 0
        total_count = 0
        header_printed = False
        for k, v in sorted(filter(lambda t: t[0].tid == thread,
                                  bpf["softirq_data"].items()),
                           key=lambda kv: -kv[1].total_ns):

            if not header_printed:
                print("\n{:10} {:16} {}\n"
                      "{}{:20} {:>7} {:>10} {:>16} {:>16}".
                      format("", "", "[SOFT IRQ STATISTICS]", indent,
                             "NAME", "VECT_NR", "COUNT", "TOTAL ns", "MAX ns"))
                header_printed = True

            print("{}{:20.20} {:>7} {:10} {:16,} {:16,}".format(
                indent, get_vec_nr_name(k.vec_nr), k.vec_nr,
                v.count, v.total_ns, v.worst_ns))

            total_count += v.count
            total_ns += v.total_ns

        if total_count > 0:
            print("{}{:20.20} {:7} {:10} {:16,}".format(
                indent, "TOTAL:", "", total_count, total_ns))

    #
    # Print events
    #
    lost_stack_traces = 0
    if syscall_events:
        stack_traces = bpf.get_table("stack_traces")

        print("\n\n# SYSCALL EVENTS:"
              "\n{}{:>19} {:>19} {:>10} {:16} {:>10} {}".format(
                  2 * " ", "ENTRY (ns)", "EXIT (ns)", "TID", "COMM",
                  "DELTA (us)", "SYSCALL"))
        print("{}{:19} {:19} {:10} {:16} {:10} {}".format(
            2 * " ", "-" * 19, "-" * 19, "-" * 10, "-" * 16,
            "-" * 10, "-" * 16))
        for event in syscall_events:
            print("{}{:19} {:19} {:10} {:16} {:10,} {}".format(
                " " * 2,
                event["ts_entry"], event["ts_exit"], event["tid"],
                get_thread_name(options.pid, event["tid"]),
                int((event["ts_exit"] - event["ts_entry"]) / 1000),
                syscall_name(event["syscall"]).decode("utf-8")))
            #
            # Not sure where to put this, but I'll add some info on stack
            # traces here... Userspace stack traces are very limited due to
            # the fact that bcc does not support dwarf backtraces. As OVS
            # gets compiled without frame pointers we will not see much.
            # If however, OVS does get built with frame pointers, we should
            # not use the BPF_STACK_TRACE_BUILDID as it does not seem to
            # handle the debug symbols correctly. Also, note that for kernel
            # traces you should not use BPF_STACK_TRACE_BUILDID, so two
            # buffers are needed.
            #
            # Some info on manual dwarf walk support:
            #   https://github.com/iovisor/bcc/issues/3515
            #   https://github.com/iovisor/bcc/pull/4463
            #
            if options.stack_trace_size == 0:
                continue

            # A negative stack id means no stack trace was stored.
            if event["kernel_stack_id"] < 0 or event["user_stack_id"] < 0:
                lost_stack_traces += 1

            kernel_stack = stack_traces.walk(event["kernel_stack_id"]) \
                if event["kernel_stack_id"] >= 0 else []
            user_stack = stack_traces.walk(event["user_stack_id"]) \
                if event["user_stack_id"] >= 0 else []

            for addr in kernel_stack:
                print("{}{}".format(
                    " " * 10,
                    bpf.ksym(addr, show_module=True,
                             show_offset=True).decode("utf-8", "replace")))

            for addr in user_stack:
                addr_str = bpf.sym(addr, options.pid, show_module=True,
                                   show_offset=True).decode("utf-8", "replace")

                if addr_str == "[unknown]":
                    addr_str += " 0x{:x}".format(addr)

                print("{}{}".format(" " * 10, addr_str))

    #
    # Print any footer messages.
    #
    if lost_stack_traces > 0:
        print("\n#WARNING: We where not able to display {} stack traces!\n"
              "# Consider increasing the stack trace size using\n"
              "# the '--stack-trace-size' option.\n"
              "# Note that this can also happen due to a stack id\n"
              "# collision.".format(lost_stack_traces))
0 to disable", + type=unsigned_int, default=4096) + parser.add_argument("--start-trigger", metavar="TRIGGER", + help="Start trigger, see documentation for details", + type=str, default=None) + parser.add_argument("--stop-trigger", metavar="TRIGGER", + help="Stop trigger, see documentation for details", + type=str, default=None) + parser.add_argument("--trigger-delta", metavar="DURATION_NS", + help="Only report event when the trigger duration > " + "DURATION_NS, default 0 (all events)", + type=unsigned_int, const=0, default=0, nargs="?") + + options = parser.parse_args() + + # + # Find the PID of the ovs-vswitchd daemon if not specified. + # + if not options.pid: + for proc in psutil.process_iter(): + if "ovs-vswitchd" in proc.name(): + if options.pid: + print("ERROR: Multiple ovs-vswitchd daemons running, " + "use the -p option!") + sys.exit(os.EX_NOINPUT) + + options.pid = proc.pid + + # + # Error checking on input parameters. + # + if not options.pid: + print("ERROR: Failed to find ovs-vswitchd's PID!") + sys.exit(os.EX_UNAVAILABLE) + + options.buffer_page_count = next_power_of_two(options.buffer_page_count) + + # + # Make sure we are running as root, or else we can not attach the probes. + # + if os.geteuid() != 0: + print("ERROR: We need to run as root to attached probes!") + sys.exit(os.EX_NOPERM) + + # + # Setup any of the start stop triggers + # + if options.start_trigger is not None: + try: + start_trigger = Probe(options.start_trigger, pid=options.pid) + except ValueError as e: + print(f"ERROR: Invalid start trigger {str(e)}") + sys.exit(os.EX_CONFIG) + else: + start_trigger = None + + if options.stop_trigger is not None: + try: + stop_trigger = Probe(options.stop_trigger, pid=options.pid) + except ValueError as e: + print(f"ERROR: Invalid stop trigger {str(e)}") + sys.exit(os.EX_CONFIG) + else: + stop_trigger = None + + # + # Attach probe to running process. 
+ # + source = EBPF_SOURCE.replace("", "\n".join( + [" EVENT_{} = {},".format( + event.name, event.value) for event in Event])) + source = source.replace("", + str(options.buffer_page_count)) + source = source.replace("", str(options.pid)) + + if BPF.kernel_struct_has_field(b"task_struct", b"state") == 1: + source = source.replace("", "state") + else: + source = source.replace("", "__state") + + poll_id = [k for k, v in syscalls.items() if v == b"poll"][0] + if options.syscall_events is None: + syscall_trace_events = "false" + elif options.syscall_events == 0: + if not options.skip_syscall_poll_events: + syscall_trace_events = "true" + else: + syscall_trace_events = f"args->id != {poll_id}" + else: + syscall_trace_events = "delta > {}".format(options.syscall_events) + if options.skip_syscall_poll_events: + syscall_trace_events += f" && args->id != {poll_id}" + + source = source.replace("", + syscall_trace_events) + + source = source.replace("", + str(options.stack_trace_size)) + + source = source.replace("", "true" + if options.stack_trace_size > 0 else "false") + + # + # Handle start/stop probes + # + if start_trigger: + source = source.replace("", + start_trigger.get_c_code( + "start_trigger_probe", + "return start_trigger();")) + else: + source = source.replace("", "") + + if stop_trigger: + source = source.replace("", + stop_trigger.get_c_code( + "stop_trigger_probe", + "return stop_trigger();")) + else: + source = source.replace("", "") + + # + # Setup usdt or other probes that need handling trough the BFP class. + # + usdt = USDT(pid=int(options.pid)) + try: + if start_trigger and start_trigger.probe_type == "usdt": + usdt.enable_probe(probe=start_trigger.probe_name(), + fn_name="start_trigger_probe") + if stop_trigger and stop_trigger.probe_type == "usdt": + usdt.enable_probe(probe=stop_trigger.probe_name(), + fn_name="stop_trigger_probe") + + except USDTException as e: + print("ERROR: {}".format( + (re.sub("^", " " * 7, str(e), flags=re.MULTILINE)).strip(). 
+ replace("--with-dtrace or --enable-dtrace", + "--enable-usdt-probes"))) + sys.exit(os.EX_OSERR) + + bpf = BPF(text=source, usdt_contexts=[usdt], debug=options.debug) + + if start_trigger: + try: + if start_trigger.probe_type == "uprobe": + bpf.attach_uprobe(name=f"/proc/{options.pid}/exe", + sym=start_trigger.probe_name(), + fn_name="start_trigger_probe", + pid=options.pid) + + if start_trigger.probe_type == "uretprobe": + bpf.attach_uretprobe(name=f"/proc/{options.pid}/exe", + sym=start_trigger.probe_name(), + fn_name="start_trigger_probe", + pid=options.pid) + except Exception as e: + print("ERROR: Failed attaching uprobe start trigger " + f"'{start_trigger.probe_name()}';\n {str(e)}") + sys.exit(os.EX_OSERR) + + if stop_trigger: + try: + if stop_trigger.probe_type == "uprobe": + bpf.attach_uprobe(name=f"/proc/{options.pid}/exe", + sym=stop_trigger.probe_name(), + fn_name="stop_trigger_probe", + pid=options.pid) + + if stop_trigger.probe_type == "uretprobe": + bpf.attach_uretprobe(name=f"/proc/{options.pid}/exe", + sym=stop_trigger.probe_name(), + fn_name="stop_trigger_probe", + pid=options.pid) + except Exception as e: + print("ERROR: Failed attaching uprobe stop trigger" + f"'{stop_trigger.probe_name()}';\n {str(e)}") + sys.exit(os.EX_OSERR) + + # + # If no triggers are configured use the delay configuration + # + bpf["events"].open_ring_buffer(process_event) + + sample_count = 0 + while sample_count < options.sample_count: + sample_count += 1 + syscall_events = [] + + if not options.start_trigger: + print_timestamp("# Start sampling") + start_capture() + stop_time = -1 if options.stop_trigger else \ + time_ns() + options.sample_time * 1000000000 + else: + # For start triggers the stop time depends on the start trigger + # time, or depends on the stop trigger if configured. 
+ stop_time = -1 if options.stop_trigger else 0 + + while True: + keyboard_interrupt = False + try: + last_start_ts = start_trigger_ts + last_stop_ts = stop_trigger_ts + + if stop_time > 0: + delay = int((stop_time - time_ns()) / 1000000) + if delay <= 0: + break + else: + delay = -1 + + bpf.ring_buffer_poll(timeout=delay) + + if stop_time <= 0 and last_start_ts != start_trigger_ts: + print_timestamp( + "# Start sampling (trigger@{})".format( + start_trigger_ts)) + + if not options.stop_trigger: + stop_time = time_ns() + \ + options.sample_time * 1000000000 + + if last_stop_ts != stop_trigger_ts: + break + + except KeyboardInterrupt: + keyboard_interrupt = True + break + + if options.stop_trigger and not capture_running(): + print_timestamp("# Stop sampling (trigger@{})".format( + stop_trigger_ts)) + else: + print_timestamp("# Stop sampling") + + if stop_trigger_ts != 0 and start_trigger_ts != 0: + trigger_delta = stop_trigger_ts - start_trigger_ts + else: + trigger_delta = None + + if not trigger_delta or trigger_delta >= options.trigger_delta: + stop_capture(force=True) # Prevent a new trigger to start. + process_results(syscall_events=syscall_events, + trigger_delta=trigger_delta) + elif trigger_delta: + sample_count -= 1 + print_timestamp("# Sample dump skipped, delta {:,} ns".format( + trigger_delta)) + + reset_capture() + stop_capture() + + if keyboard_interrupt: + break + + if options.sample_interval > 0: + time.sleep(options.sample_interval) + + # + # Report lost events. 
+ # + dropcnt = bpf.get_table("dropcnt") + for k in dropcnt.keys(): + count = dropcnt.sum(k).value + if k.value == 0 and count > 0: + print("\n# WARNING: Not all events were captured, {} were " + "dropped!\n# Increase the BPF ring buffer size " + "with the --buffer-page-count option.".format(count)) + + if options.sample_count > 1: + trigger_miss = bpf.get_table("trigger_miss") + for k in trigger_miss.keys(): + count = trigger_miss.sum(k).value + if k.value == 0 and count > 0: + print("\n# WARNING: Not all start triggers were successful. " + "{} were missed due to\n# slow userspace " + "processing!".format(count)) + + +# +# Start main() as the default entry point... +# +if __name__ == "__main__": + main() diff --git a/utilities/usdt-scripts/kernel_delay.rst b/utilities/usdt-scripts/kernel_delay.rst new file mode 100644 index 00000000000..0ebd30afb67 --- /dev/null +++ b/utilities/usdt-scripts/kernel_delay.rst @@ -0,0 +1,596 @@ +Troubleshooting Open vSwitch: Is the kernel to blame? +===================================================== +Often, when troubleshooting Open vSwitch (OVS) in the field, you might be left +wondering if the issue is really OVS-related, or if it's a problem with the +kernel being overloaded. Messages in the log like +``Unreasonably long XXXXms poll interval`` might suggest it's OVS, but from +experience, these are mostly related to an overloaded Linux Kernel. +The kernel_delay.py tool can help you quickly identify if the focus of your +investigation should be OVS or the Linux kernel. + + +Introduction +------------ +``kernel_delay.py`` consists of a Python script that uses the BCC [#BCC]_ +framework to install eBPF probes. The data the eBPF probes collect will be +analyzed and presented to the user by the Python script. Some of the presented +data can also be captured by the individual scripts included in the BBC [#BCC]_ +framework. 
+ +kernel_delay.py has two modes of operation: + +- In **time mode**, the tool runs for a specific time and collects the + information. +- In **trigger mode**, event collection can be started and/or stopped based on + a specific eBPF probe. Currently, the following probes are supported: + - USDT probes + - Kernel tracepoints + - kprobe + - kretprobe + - uprobe + - uretprobe + + +In addition, the option, ``--sample-count``, exists to specify how many +iterations you would like to do. When using triggers, you can also ignore +samples if they are less than a number of nanoseconds with the +``--trigger-delta`` option. The latter might be useful when debugging Linux +syscalls which take a long time to complete. More on this later. Finally, you +can configure the delay between two sample runs with the ``--sample-interval`` +option. + +Before getting into more details, you can run the tool without any options +to see what the output looks like. Notice that it will try to automatically +get the process ID of the running ``ovs-vswitchd``. You can overwrite this +with the ``--pid`` option. + +.. 
code-block:: console + + $ sudo ./kernel_delay.py + # Start sampling @2023-06-08T12:17:22.725127 (10:17:22 UTC) + # Stop sampling @2023-06-08T12:17:23.224781 (10:17:23 UTC) + # Sample dump @2023-06-08T12:17:23.224855 (10:17:23 UTC) + TID THREAD + ---------- ---------------- ---------------------------------------------------------------------------- + 27090 ovs-vswitchd [SYSCALL STATISTICS] + + + 31741 revalidator122 [SYSCALL STATISTICS] + NAME NUMBER COUNT TOTAL ns MAX ns + poll 7 5 184,193,176 184,191,520 + recvmsg 47 494 125,208,756 310,331 + futex 202 8 18,768,758 4,023,039 + sendto 44 10 375,861 266,867 + sendmsg 46 4 43,294 11,213 + write 1 1 5,949 5,949 + getrusage 98 1 1,424 1,424 + read 0 1 1,292 1,292 + TOTAL( - poll): 519 144,405,334 + + [THREAD RUN STATISTICS] + SCHED_CNT TOTAL ns MIN ns MAX ns + 6 136,764,071 1,480 115,146,424 + + [THREAD READY STATISTICS] + SCHED_CNT TOTAL ns MAX ns + 7 11,334 6,636 + + [HARD IRQ STATISTICS] + NAME COUNT TOTAL ns MAX ns + eno8303-rx-1 1 3,586 3,586 + TOTAL: 1 3,586 + + [SOFT IRQ STATISTICS] + NAME VECT_NR COUNT TOTAL ns MAX ns + net_rx 3 1 17,699 17,699 + sched 7 6 13,820 3,226 + rcu 9 16 13,586 1,554 + timer 1 3 10,259 3,815 + TOTAL: 26 55,364 + + +By default, the tool will run for half a second in `time mode`. To extend this +you can use the ``--sample-time`` option. + + +What will it report +------------------- +The above sample output separates the captured data on a per-thread basis. +For this, it displays the thread's id (``TID``) and name (``THREAD``), +followed by resource-specific data. Which are: + +- ``SYSCALL STATISTICS`` +- ``THREAD RUN STATISTICS`` +- ``THREAD READY STATISTICS`` +- ``HARD IRQ STATISTICS`` +- ``SOFT IRQ STATISTICS`` + +The following sections will describe in detail what statistics they report. + + +``SYSCALL STATISTICS`` +~~~~~~~~~~~~~~~~~~~~~~ +``SYSCALL STATISTICS`` tell you which Linux system calls got executed during +the measurement interval. 
This includes the number of times the syscall was +called (``COUNT``), the total time spent in the system calls (``TOTAL ns``), +and the worst-case duration of a single call (``MAX ns``). + +It also shows the total of all system calls, but it excludes the poll system +call, as the purpose of this call is to wait for activity on a set of sockets, +and usually, the thread gets swapped out. + +Note that it only counts calls that started and stopped during the +measurement interval! + + +``THREAD RUN STATISTICS`` +~~~~~~~~~~~~~~~~~~~~~~~~~ +``THREAD RUN STATISTICS`` tell you how long the thread was running on a CPU +during the measurement interval. + +Note that these statistics only count events where the thread started and +stopped running on a CPU during the measurement interval. For example, if +this was a PMD thread, you should see zero ``SCHED_CNT`` and ``TOTAL ns``. +If not, there might be a misconfiguration. + + +``THREAD READY STATISTICS`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~ +``THREAD READY STATISTICS`` tell you the time between the thread being ready +to run and it actually running on the CPU. + +Note that these statistics only count events where the thread was getting +ready to run and started running during the measurement interval. + + +``HARD IRQ STATISTICS`` +~~~~~~~~~~~~~~~~~~~~~~~ +``HARD IRQ STATISTICS`` tell you how much time was spent servicing hard +interrupts during the thread's run time. + +It shows the interrupt name (``NAME``), the number of interrupts (``COUNT``), +the total time spent in the interrupt handler (``TOTAL ns``), and the +worst-case duration (``MAX ns``). + + +``SOFT IRQ STATISTICS`` +~~~~~~~~~~~~~~~~~~~~~~~ +``SOFT IRQ STATISTICS`` tell you how much time was spent servicing soft +interrupts during the thread's run time. + +It shows the interrupt name (``NAME``), vector number (``VECT_NR``), the +number of interrupts (``COUNT``), the total time spent in the interrupt +handler (``TOTAL ns``), and the worst-case duration (``MAX ns``).
+ + +The ``--syscall-events`` option +------------------------------- +In addition to reporting global syscall statistics in ``SYSCALL STATISTICS``, +the tool can also report each individual syscall. This can be a useful +second step if the ``SYSCALL STATISTICS`` show high latency numbers. + +All you need to do is add the ``--syscall-events`` option, with or without +the additional ``DURATION_NS`` parameter. The ``DURATION_NS`` parameter +allows you to exclude events that take less than the supplied time. + +The ``--skip-syscall-poll-events`` option allows you to exclude poll +syscalls from the report. + +Below is an example run; note that the resource-specific data is removed +to highlight the syscall events: + +.. code-block:: console + + $ sudo ./kernel_delay.py --syscall-events 50000 --skip-syscall-poll-events + # Start sampling @2023-06-13T17:10:46.460874 (15:10:46 UTC) + # Stop sampling @2023-06-13T17:10:46.960727 (15:10:46 UTC) + # Sample dump @2023-06-13T17:10:46.961033 (15:10:46 UTC) + TID THREAD + ---------- ---------------- ---------------------------------------------------------------------------- + 3359686 ipf_clean2 [SYSCALL STATISTICS] + ... + 3359635 ovs-vswitchd [SYSCALL STATISTICS] + ... + 3359697 revalidator12 [SYSCALL STATISTICS] + ... + 3359698 revalidator13 [SYSCALL STATISTICS] + ... + 3359699 revalidator14 [SYSCALL STATISTICS] + ... + 3359700 revalidator15 [SYSCALL STATISTICS] + ...
+ + # SYSCALL EVENTS: + ENTRY (ns) EXIT (ns) TID COMM DELTA (us) SYSCALL + ------------------- ------------------- ---------- ---------------- ---------- ---------------- + 2161821694935486 2161821695031201 3359699 revalidator14 95 futex + syscall_exit_to_user_mode_prepare+0x161 [kernel] + syscall_exit_to_user_mode_prepare+0x161 [kernel] + syscall_exit_to_user_mode+0x9 [kernel] + do_syscall_64+0x68 [kernel] + entry_SYSCALL_64_after_hwframe+0x72 [kernel] + __GI___lll_lock_wait+0x30 [libc.so.6] + ovs_mutex_lock_at+0x18 [ovs-vswitchd] + [unknown] 0x696c003936313a63 + 2161821695276882 2161821695333687 3359698 revalidator13 56 futex + syscall_exit_to_user_mode_prepare+0x161 [kernel] + syscall_exit_to_user_mode_prepare+0x161 [kernel] + syscall_exit_to_user_mode+0x9 [kernel] + do_syscall_64+0x68 [kernel] + entry_SYSCALL_64_after_hwframe+0x72 [kernel] + __GI___lll_lock_wait+0x30 [libc.so.6] + ovs_mutex_lock_at+0x18 [ovs-vswitchd] + [unknown] 0x696c003134313a63 + 2161821695275820 2161821695405733 3359700 revalidator15 129 futex + syscall_exit_to_user_mode_prepare+0x161 [kernel] + syscall_exit_to_user_mode_prepare+0x161 [kernel] + syscall_exit_to_user_mode+0x9 [kernel] + do_syscall_64+0x68 [kernel] + entry_SYSCALL_64_after_hwframe+0x72 [kernel] + __GI___lll_lock_wait+0x30 [libc.so.6] + ovs_mutex_lock_at+0x18 [ovs-vswitchd] + [unknown] 0x696c003936313a63 + 2161821695964969 2161821696052021 3359635 ovs-vswitchd 87 accept + syscall_exit_to_user_mode_prepare+0x161 [kernel] + syscall_exit_to_user_mode_prepare+0x161 [kernel] + syscall_exit_to_user_mode+0x9 [kernel] + do_syscall_64+0x68 [kernel] + entry_SYSCALL_64_after_hwframe+0x72 [kernel] + __GI_accept+0x4d [libc.so.6] + pfd_accept+0x3a [ovs-vswitchd] + [unknown] 0x7fff19f2bd00 + [unknown] 0xe4b8001f0f + +As you can see above, the output also shows the stackback trace. You can +disable this using the ``--stack-trace-size 0`` option. 
+ +As you can see above, the backtrace does not show a lot of useful information +due to the BCC [#BCC]_ toolkit not supporting DWARF decoding. So to further +analyze system call backtraces, you could use perf. The following perf +script can do this for you (refer to the embedded instructions): + +https://github.com/chaudron/perf_scripts/blob/master/analyze_perf_pmd_syscall.py + + +Using triggers +-------------- +The tool supports start and/or stop triggers. This will allow you to capture +statistics triggered by a specific event. The following combinations of +stop-and-start triggers can be used. + +If you only use ``--start-trigger``, the inspection starts when the trigger +happens and runs until the ``--sample-time`` number of seconds has passed. +The example below shows all the supported options in this scenario. + +.. code-block:: console + + $ sudo ./kernel_delay.py --start-trigger up:bridge_run --sample-time 4 \ + --sample-count 4 --sample-interval 1 + + +If you only use ``--stop-trigger``, the inspection starts immediately and +stops when the trigger happens. The example below shows all the supported +options in this scenario. + +.. code-block:: console + + $ sudo ./kernel_delay.py --stop-trigger ur:bridge_run \ + --sample-count 4 --sample-interval 1 + + +If you use both ``--start-trigger`` and ``--stop-trigger`` triggers, the +statistics are captured between the two first occurrences of these events. +The example below shows all the supported options in this scenario. + +.. code-block:: console + + $ sudo ./kernel_delay.py --start-trigger up:bridge_run \ + --stop-trigger ur:bridge_run \ + --sample-count 4 --sample-interval 1 \ + --trigger-delta 50000 + +What triggers are supported? Note that what ``kernel_delay.py`` calls triggers, +BCC [#BCC]_ calls events; these are eBPF tracepoints you can attach to. +For more details on the supported tracepoints, check out the BCC +documentation [#BCC_EVENT]_.
+ +The list below shows the supported triggers and their argument format: + +**USDT probes:** + [u|usdt]:{provider}:{probe} +**Kernel tracepoint:** + [t:trace]:{system}:{event} +**kprobe:** + [k:kprobe]:{kernel_function} +**kretprobe:** + [kr:kretprobe]:{kernel_function} +**uprobe:** + [up:uprobe]:{function} +**uretprobe:** + [upr:uretprobe]:{function} + +Here are a couple of trigger examples; more use-case-specific examples can be +found in the *Examples* section. + +.. code-block:: console + + --start|stop-trigger u:udpif_revalidator:start_dump + --start|stop-trigger t:openvswitch:ovs_dp_upcall + --start|stop-trigger k:ovs_dp_process_packet + --start|stop-trigger kr:ovs_dp_process_packet + --start|stop-trigger up:bridge_run + --start|stop-trigger upr:bridge_run + + +Examples +-------- +This section will give some examples of how to use this tool in real-world +scenarios. Let's start with the issue where Open vSwitch reports +``Unreasonably long XXXXms poll interval`` on your revalidator threads. Note +that there is a blog available explaining how the revalidator process works +in OVS [#REVAL_BLOG]_. + +First, let me explain this log message. It gets logged if the time delta +between two ``poll_block()`` calls is more than 1 second. In other words, +the process was spending a lot of time processing stuff that was made +available by the return of the ``poll_block()`` function. + +Do a run with the tool using the existing USDT revalidator probes as a start +and stop trigger (Note that the resource-specific data is removed from the +non-revalidator threads): + +..
code-block:: console + + $ sudo ./kernel_delay.py --start-trigger u:udpif_revalidator:start_dump --stop-trigger u:udpif_revalidator:sweep_done + # Start sampling (trigger@791777093512008) @2023-06-14T14:52:00.110303 (12:52:00 UTC) + # Stop sampling (trigger@791778281498462) @2023-06-14T14:52:01.297975 (12:52:01 UTC) + # Triggered sample dump, stop-start delta 1,187,986,454 ns @2023-06-14T14:52:01.298021 (12:52:01 UTC) + TID THREAD + ---------- ---------------- ---------------------------------------------------------------------------- + 1457761 handler24 [SYSCALL STATISTICS] + NAME NUMBER COUNT TOTAL ns MAX ns + sendmsg 46 6110 123,274,761 41,776 + recvmsg 47 136299 99,397,508 49,896 + futex 202 51 7,655,832 7,536,776 + poll 7 4068 1,202,883 2,907 + getrusage 98 2034 586,602 1,398 + sendto 44 9 213,682 27,417 + TOTAL( - poll): 144503 231,128,385 + + [THREAD RUN STATISTICS] + SCHED_CNT TOTAL ns MIN ns MAX ns + + [THREAD READY STATISTICS] + SCHED_CNT TOTAL ns MAX ns + 1 1,438 1,438 + + [SOFT IRQ STATISTICS] + NAME VECT_NR COUNT TOTAL ns MAX ns + sched 7 21 59,145 3,769 + rcu 9 50 42,917 2,234 + TOTAL: 71 102,062 + 1457733 ovs-vswitchd [SYSCALL STATISTICS] + ... + 1457792 revalidator55 [SYSCALL STATISTICS] + NAME NUMBER COUNT TOTAL ns MAX ns + futex 202 73 572,576,329 19,621,600 + recvmsg 47 815 296,697,618 405,338 + sendto 44 3 78,302 26,837 + sendmsg 46 3 38,712 13,250 + write 1 1 5,073 5,073 + TOTAL( - poll): 895 869,396,034 + + [THREAD RUN STATISTICS] + SCHED_CNT TOTAL ns MIN ns MAX ns + 48 394,350,393 1,729 140,455,796 + + [THREAD READY STATISTICS] + SCHED_CNT TOTAL ns MAX ns + 49 23,650 1,559 + + [SOFT IRQ STATISTICS] + NAME VECT_NR COUNT TOTAL ns MAX ns + sched 7 14 26,889 3,041 + rcu 9 28 23,024 1,600 + TOTAL: 42 49,913 + + +Above you see from the start of the output that the trigger took more than a +second (1,187,986,454 ns), which is already know, by looking at the output of +the ``ovs-vsctl upcall/show`` command. 
+ +From the *revalidator55*'s ``SYSCALL STATISTICS`` statistics you can see it +spent almost 870ms handling syscalls, and there were no poll() calls being +executed. The ``THREAD RUN STATISTICS`` statistics here are a bit misleading, +as it looks like OVS only spent 394ms on the CPU. But earlier, it was mentioned +that this time does not include the time being on the CPU at the start or stop +of an event. This is exactly the case here, because USDT probes were used. + +From the above data and maybe some ``top`` output, it can be determined that +the *revalidator55* thread is taking a lot of CPU time, probably because it +has to do a lot of revalidator work by itself. The solution here is to increase +the number of revalidator threads, so more work could be done in parallel. + +Here is another run of the same command in another scenario: + +.. code-block:: console + + $ sudo ./kernel_delay.py --start-trigger u:udpif_revalidator:start_dump --stop-trigger u:udpif_revalidator:sweep_done + # Start sampling (trigger@795160501758971) @2023-06-14T15:48:23.518512 (13:48:23 UTC) + # Stop sampling (trigger@795160764940201) @2023-06-14T15:48:23.781381 (13:48:23 UTC) + # Triggered sample dump, stop-start delta 263,181,230 ns @2023-06-14T15:48:23.781414 (13:48:23 UTC) + TID THREAD + ---------- ---------------- ---------------------------------------------------------------------------- + 1457733 ovs-vswitchd [SYSCALL STATISTICS] + ...
+ 1457792 revalidator55 [SYSCALL STATISTICS] + NAME NUMBER COUNT TOTAL ns MAX ns + recvmsg 47 284 193,422,110 46,248,418 + sendto 44 2 46,685 23,665 + sendmsg 46 2 24,916 12,703 + write 1 1 6,534 6,534 + TOTAL( - poll): 289 193,500,245 + + [THREAD RUN STATISTICS] + SCHED_CNT TOTAL ns MIN ns MAX ns + 2 47,333,558 331,516 47,002,042 + + [THREAD READY STATISTICS] + SCHED_CNT TOTAL ns MAX ns + 3 87,000,403 45,999,712 + + [SOFT IRQ STATISTICS] + NAME VECT_NR COUNT TOTAL ns MAX ns + sched 7 2 9,504 5,109 + TOTAL: 2 9,504 + + +Here you can see the revalidator run took about 263ms, which does not look +odd, however, the ``THREAD READY STATISTICS`` information shows that OVS was +waiting 87ms for a CPU to be run on. This means the revalidator process could +have finished 87ms faster. Looking at the ``MAX ns`` value, a worst-case delay +of almost 46ms can be seen, which hints at an overloaded system. + +One final example that uses a ``uprobe`` to get some statistics on a +``bridge_run()`` execution that takes more than 1ms. + +.. code-block:: console + + $ sudo ./kernel_delay.py --start-trigger up:bridge_run --stop-trigger ur:bridge_run --trigger-delta 1000000 + # Start sampling (trigger@2245245432101270) @2023-06-14T16:21:10.467919 (14:21:10 UTC) + # Stop sampling (trigger@2245245432414656) @2023-06-14T16:21:10.468296 (14:21:10 UTC) + # Sample dump skipped, delta 313,386 ns @2023-06-14T16:21:10.468419 (14:21:10 UTC) + # Start sampling (trigger@2245245505301745) @2023-06-14T16:21:10.540970 (14:21:10 UTC) + # Stop sampling (trigger@2245245506911119) @2023-06-14T16:21:10.542499 (14:21:10 UTC) + # Triggered sample dump, stop-start delta 1,609,374 ns @2023-06-14T16:21:10.542565 (14:21:10 UTC) + TID THREAD + ---------- ---------------- ---------------------------------------------------------------------------- + 3371035 [SYSCALL STATISTICS] + ... + 3371102 handler66 [SYSCALL STATISTICS] + ... 
+ 3366258 ovs-vswitchd [SYSCALL STATISTICS] + NAME NUMBER COUNT TOTAL ns MAX ns + futex 202 43 403,469 199,312 + clone3 435 13 174,394 30,731 + munmap 11 8 115,774 21,861 + poll 7 5 92,969 38,307 + unlink 87 2 49,918 35,741 + mprotect 10 8 47,618 13,201 + accept 43 10 31,360 6,976 + mmap 9 8 30,279 5,776 + write 1 6 27,720 11,774 + rt_sigprocmask 14 28 12,281 970 + read 0 6 9,478 2,318 + recvfrom 45 3 7,024 4,024 + sendto 44 1 4,684 4,684 + getrusage 98 5 4,594 1,342 + close 3 2 2,918 1,627 + recvmsg 47 1 2,722 2,722 + TOTAL( - poll): 144 924,233 + + [THREAD RUN STATISTICS] + SCHED_CNT TOTAL ns MIN ns MAX ns + 13 817,605 5,433 524,376 + + [THREAD READY STATISTICS] + SCHED_CNT TOTAL ns MAX ns + 14 28,646 11,566 + + [SOFT IRQ STATISTICS] + NAME VECT_NR COUNT TOTAL ns MAX ns + rcu 9 1 2,838 2,838 + TOTAL: 1 2,838 + + 3371110 revalidator74 [SYSCALL STATISTICS] + ... + 3366311 urcu3 [SYSCALL STATISTICS] + ... + + +OVS removed some of the threads and their resource-specific data, but based +on the ```` thread name, you can determine that some +threads no longer exist. In the ``ovs-vswitchd`` thread, you can see some +``clone3`` syscalls, indicating threads were created. In this example, it was +due to the deletion of a bridge, which resulted in the recreation of the +revalidator and handler threads. + + +Use with Openshift +------------------ +This section describes how you would use the tool on a node in an OpenShift +cluster. It assumes you have console access to the node, either directly or +through a debug container. + +A base fedora38 container will be used through podman, as this will allow the +use of some additional tools and packages needed. + +First the containers need to be started: + +.. 
code-block:: console + + [core@sno-master ~]$ sudo podman run -it --rm \ + -e PS1='[(DEBUG)\u@\h \W]\$ ' \ + --privileged --network=host --pid=host \ + -v /lib/modules:/lib/modules:ro \ + -v /sys/kernel/debug:/sys/kernel/debug \ + -v /proc:/proc \ + -v /:/mnt/rootdir \ + quay.io/fedora/fedora:38-x86_64 + + [(DEBUG)root@sno-master /]# + + +Next add the ``linux_delay.py`` dependencies: + +.. code-block:: console + + [(DEBUG)root@sno-master /]# dnf install -y bcc-tools perl-interpreter \ + python3-pytz python3-psutil + + +You need to install the devel, debug and source RPMs for your OVS and kernel +version: + +.. code-block:: console + + [(DEBUG)root@sno-master home]# rpm -i \ + openvswitch2.17-debuginfo-2.17.0-67.el8fdp.x86_64.rpm \ + openvswitch2.17-debugsource-2.17.0-67.el8fdp.x86_64.rpm \ + kernel-devel-4.18.0-372.41.1.el8_6.x86_64.rpm + + +Now the tool can be started. Here the above ``bridge_run()`` example is used: + +.. code-block:: console + + [(DEBUG)root@sno-master home]# ./kernel_delay.py --start-trigger up:bridge_run --stop-trigger ur:bridge_run + # Start sampling (trigger@75279117343513) @2023-06-15T11:44:07.628372 (11:44:07 UTC) + # Stop sampling (trigger@75279117443980) @2023-06-15T11:44:07.628529 (11:44:07 UTC) + # Triggered sample dump, stop-start delta 100,467 ns @2023-06-15T11:44:07.628569 (11:44:07 UTC) + TID THREAD + ---------- ---------------- ---------------------------------------------------------------------------- + 1246 ovs-vswitchd [SYSCALL STATISTICS] + NAME NUMBER COUNT TOTAL ns MAX ns + getdents64 217 2 8,560 8,162 + openat 257 1 6,951 6,951 + accept 43 4 6,942 3,763 + recvfrom 45 1 3,726 3,726 + recvmsg 47 2 2,880 2,188 + stat 4 2 1,946 1,384 + close 3 1 1,393 1,393 + fstat 5 1 1,324 1,324 + TOTAL( - poll): 14 33,722 + + [THREAD RUN STATISTICS] + SCHED_CNT TOTAL ns MIN ns MAX ns + + [THREAD READY STATISTICS] + SCHED_CNT TOTAL ns MAX ns + + +.. rubric:: Footnotes + +.. [#BCC] https://github.com/iovisor/bcc +.. 
[#BCC_EVENT] https://github.com/iovisor/bcc/blob/master/docs/reference_guide.md#events--arguments +.. [#REVAL_BLOG] https://developers.redhat.com/articles/2022/10/19/open-vswitch-revalidator-process-explained From 155f632e719bf56958e66674733156dc5acf9ea2 Mon Sep 17 00:00:00 2001 From: Brad Cowie Date: Thu, 28 Sep 2023 21:44:24 +0000 Subject: [PATCH 390/833] connmgr: Fix ofconn configuration on vswitchd startup. ofconn connection parameters, such as probe_interval and max_backoff, are always set to their default values when vswitchd starts up even if the user has configured these to be something different in ovsdb: $ ovs-vsctl set controller UUID inactivity_probe=9000 $ journalctl -u ovs-vswitchd.service | grep "inactivity" ovs|10895|rconn|DBG|dp1<->tcp:127.0.0.1:6653: idle 9 seconds, sending inactivity probe $ systemctl restart openvswitch-switch.service $ journalctl -u ovs-vswitchd.service | grep "inactivity" ovs|00848|rconn|DBG|dp1<->tcp:127.0.0.1:6653: idle 5 seconds, sending inactivity probe This bug was introduced by commit a0baa7df (connmgr: Make treatment of active and passive connections more uniform.). This happens because ofservice_reconfigure() loops over each ofconn in ofservice->conns and calls ofconn_reconfigure() on it to set the configuration parameters, however when ofservice_reconfigure() is called from ofservice_create(), ofservice->conns hasn't been populated yet so ofconn_reconfigure() is never called. This commit moves the ofservice_reconfigure() call to ofconn_create() where ofservice->conns is populated. This commit also removes the hardcoded default values for inactivity_probe (5s) and max_backoff (8s) on initial creation of the ofservice, as these config values are available from the ofproto_controller struct c. 
Signed-off-by: Brad Cowie Acked-by: Eelco Chaudron Signed-off-by: Simon Horman --- ofproto/connmgr.c | 11 ++++------- tests/ofproto.at | 28 ++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/ofproto/connmgr.c b/ofproto/connmgr.c index b092e9e04ef..f7f7b127996 100644 --- a/ofproto/connmgr.c +++ b/ofproto/connmgr.c @@ -1209,7 +1209,7 @@ ofconn_create(struct ofservice *ofservice, struct rconn *rconn, hmap_init(&ofconn->bundles); ofconn->next_bundle_expiry_check = time_msec() + BUNDLE_EXPIRY_INTERVAL; - ofconn_set_rate_limit(ofconn, settings->rate_limit, settings->burst_limit); + ofservice_reconfigure(ofservice, settings); ovs_mutex_unlock(&ofproto_mutex); } @@ -1915,10 +1915,7 @@ connmgr_count_hidden_rules(const struct connmgr *mgr) } /* Creates a new ofservice for 'target' in 'mgr'. Returns 0 if successful, - * otherwise a positive errno value. - * - * ofservice_reconfigure() must be called to fully configure the new - * ofservice. */ + * otherwise a positive errno value. 
*/ static void ofservice_create(struct connmgr *mgr, const char *target, const struct ofproto_controller *c) @@ -1928,7 +1925,8 @@ ofservice_create(struct connmgr *mgr, const char *target, struct rconn *rconn = NULL; if (!vconn_verify_name(target)) { char *name = ofconn_make_name(mgr, target); - rconn = rconn_create(5, 8, c->dscp, c->allowed_versions); + rconn = rconn_create(c->probe_interval, c->max_backoff, + c->dscp, c->allowed_versions); rconn_connect(rconn, target, name); free(name); } else if (!pvconn_verify_name(target)) { @@ -1951,7 +1949,6 @@ ofservice_create(struct connmgr *mgr, const char *target, ofservice->rconn = rconn; ofservice->pvconn = pvconn; ofservice->s = *c; - ofservice_reconfigure(ofservice, c); VLOG_INFO("%s: added %s controller \"%s\"", mgr->name, ofconn_type_to_string(ofservice->type), target); diff --git a/tests/ofproto.at b/tests/ofproto.at index 2fa8486a86f..2889f81fb17 100644 --- a/tests/ofproto.at +++ b/tests/ofproto.at @@ -6720,3 +6720,31 @@ recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0806),arp(tip=172.31.1 OVS_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([ofproto - configure inactivity probe interval]) + +# Set 6 second inactivity probe interval (default is 5 seconds). +OVS_VSWITCHD_START([set-controller br0 unix:testcontroller \ + -- set Controller br0 inactivity_probe=6000], [], [], + [-vfile:rconn:dbg]) + +# Start test openflow controller. +AT_CHECK([ovs-testcontroller -vsyslog:off --detach --no-chdir --pidfile punix:testcontroller], + [0], [ignore]) +on_exit 'kill `cat ovs-testcontroller.pid`' +OVS_WAIT_UNTIL([test -e testcontroller]) + +# After 6 seconds of inactivity there should be a log message. +OVS_WAIT_UNTIL([grep "idle 6 seconds, sending inactivity probe" ovs-vswitchd.log]) + +# Restart ovs-vswitchd with an empty ovs-vswitchd log file. 
+OVS_APP_EXIT_AND_WAIT([ovs-vswitchd]) +mv ovs-vswitchd.log ovs-vswitchd_1.log +AT_CHECK([ovs-vswitchd --enable-dummy --disable-system --disable-system-route --detach \ + --no-chdir --pidfile --log-file -vfile:rconn:dbg -vvconn -vofproto_dpif -vunixctl], + [0], [], [stderr]) + +# After 6 seconds of inactivity there should be a log message. +OVS_WAIT_UNTIL([grep "idle 6 seconds, sending inactivity probe" ovs-vswitchd.log]) +OVS_VSWITCHD_STOP(["/br0<->unix:testcontroller: connection failed/d"]) +AT_CLEANUP From b78427639fa97ca46846c8bd9744f9006c77f641 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Wed, 4 Oct 2023 14:59:48 +0200 Subject: [PATCH 391/833] Documentation: Add CVE-2022-40982, aka Downfall reference. Added a reference to the DPDK documentation as a result of analyzing the OVS code for potential performance impacts due to the Downfall mitigation. Acked-by: Aaron Conole Acked-by: Simon Horman Signed-off-by: Eelco Chaudron --- Documentation/topics/dpdk/bridge.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/topics/dpdk/bridge.rst b/Documentation/topics/dpdk/bridge.rst index 354f1ced143..00be06e37fe 100644 --- a/Documentation/topics/dpdk/bridge.rst +++ b/Documentation/topics/dpdk/bridge.rst @@ -206,6 +206,11 @@ chosen, and the 2nd occurance of that priority is not used. Put in logical terms, a subtable is chosen if its priority is greater than the previous best candidate. +Note that the ``avx512_gather`` implementation uses instructions which may be +affected by the Gather Data Sampling (GDS) vulnerability, aka Downfall, +mitigation (see documentation for CVE-2022-40982 for details). This could +result in lower performance when these mitigations are enabled. + Optimizing Specific Subtable Search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 0aeb06e1fa7e2b97d338e317eb9ee63c28b8490b Mon Sep 17 00:00:00 2001 From: Eli Britstein Date: Sun, 11 Jun 2023 18:58:26 +0300 Subject: [PATCH 392/833] netdev-offload-dpdk: Fix flushing of a physdev. 
Vport's offloads are done on the tracked orig-in-port, but the flow itself is associated in the vport's map. Removing the physdev will flush all the ports that are on its map, but not the ones on other netdevs' maps. Since flows take reference count on both their vport and their physdev, the physdev still has references on. Trying to remove it and re-add it fails with "already in use" error. Fix it by flushing the physdev's offload flows in all related netdevs, e.g. the netdev itself, or for physical devices, all vports. Fixes: adbd4301a249 ("netdev-offload-dpdk: Use per-netdev offload metadata.") Reported-by: wuxi_seu@163.com Acked-by: Simon Horman Signed-off-by: Eli Britstein Signed-off-by: Ilya Maximets --- lib/netdev-offload-dpdk.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c index 14bc877719c..992627fa231 100644 --- a/lib/netdev-offload-dpdk.c +++ b/lib/netdev-offload-dpdk.c @@ -2537,15 +2537,15 @@ netdev_offload_dpdk_flow_get(struct netdev *netdev, return ret; } -static int -netdev_offload_dpdk_flow_flush(struct netdev *netdev) +static void +flush_netdev_flows_in_related(struct netdev *netdev, struct netdev *related) { - struct cmap *map = offload_data_map(netdev); - struct ufid_to_rte_flow_data *data; unsigned int tid = netdev_offload_thread_id(); + struct cmap *map = offload_data_map(related); + struct ufid_to_rte_flow_data *data; if (!map) { - return -1; + return; } CMAP_FOR_EACH (data, node, map) { @@ -2556,6 +2556,31 @@ netdev_offload_dpdk_flow_flush(struct netdev *netdev) netdev_offload_dpdk_flow_destroy(data); } } +} + +static bool +flush_in_vport_cb(struct netdev *vport, + odp_port_t odp_port OVS_UNUSED, + void *aux) +{ + struct netdev *netdev = aux; + + /* Only vports are related to physical devices. 
*/ + if (netdev_vport_is_vport_class(vport->netdev_class)) { + flush_netdev_flows_in_related(netdev, vport); + } + + return false; +} + +static int +netdev_offload_dpdk_flow_flush(struct netdev *netdev) +{ + flush_netdev_flows_in_related(netdev, netdev); + + if (!netdev_vport_is_vport_class(netdev->netdev_class)) { + netdev_ports_traverse(netdev->dpif_type, flush_in_vport_cb, netdev); + } return 0; } From 8020eff9a0823e8173b59f139705bfeb09075311 Mon Sep 17 00:00:00 2001 From: Jakob Meng Date: Fri, 6 Oct 2023 11:29:03 +0200 Subject: [PATCH 393/833] netdev-dpdk: Document status options for VF MAC address. Fixes: f4336f504b17 ("netdev-dpdk: Add option to configure VF MAC address. ") Signed-off-by: Jakob Meng Acked-by: Simon Horman Acked-by: Eelco Chaudron Acked-by: Kevin Traynor Signed-off-by: Kevin Traynor --- vswitchd/vswitch.xml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index cfcde34ffed..797fb05bf87 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -3805,6 +3805,10 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \ Device ID of PCI device. + + Ethernet address set for this VF interface. Only reported for dpdk + VF representors. + From e9ada16292ebf11b117912509037a664be13e67e Mon Sep 17 00:00:00 2001 From: Jakob Meng Date: Fri, 6 Oct 2023 11:29:04 +0200 Subject: [PATCH 394/833] netdev-dpdk: Update docs for interface info. The status options pci-vendor_id and pci-device_id for dpdk netdevs have been replaced by bus_info. This patch updates the documentation in vswitchd/vswitch.xml accordingly. 
Fixes: a77c7796f23a ("dpdk: Update to use v22.11.1.") Signed-off-by: Jakob Meng Acked-by: Simon Horman Acked-by: Eelco Chaudron Acked-by: Kevin Traynor Signed-off-by: Kevin Traynor --- vswitchd/vswitch.xml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 797fb05bf87..006d1e6a404 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -3797,12 +3797,9 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \ Interface description string. - - Vendor ID of PCI device. - - - - Device ID of PCI device. + + Bus name and bus info such as Vendor ID and Device ID of PCI + device. From bb6ed2472fc5323ef629fe80ddb51efe69f31d44 Mon Sep 17 00:00:00 2001 From: Jakob Meng Date: Fri, 6 Oct 2023 11:29:05 +0200 Subject: [PATCH 395/833] netdev-dpdk: Document rx-steering status options. Fixes: fc06ea9a1883 ("netdev-dpdk: Add custom rx-steering configuration.") Signed-off-by: Jakob Meng Acked-by: Simon Horman Acked-by: Eelco Chaudron Acked-by: Kevin Traynor Signed-off-by: Kevin Traynor --- vswitchd/vswitch.xml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 006d1e6a404..1e2a1267d4f 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -3806,6 +3806,20 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \ Ethernet address set for this VF interface. Only reported for dpdk VF representors. + + + Hardware Rx queue steering policy in use. + + + + ID of rx steering queue. Only reported if rx-steering + is supported by hardware. + + + + IDs of rss queues. Only reported if rx-steering is + supported by hardware. + From 297db8056e104a26c4e5f1eee8459792098644b6 Mon Sep 17 00:00:00 2001 From: Kevin Traynor Date: Tue, 10 Oct 2023 11:25:58 +0100 Subject: [PATCH 396/833] AUTHORS: Add Jakob Meng. 
Signed-off-by: Kevin Traynor --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 0821ecaa0f4..9cd8a9b9229 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -212,6 +212,7 @@ Ivan Dyukov i.dyukov@samsung.com Ivan Malov ivan.malov@arknetworks.am IWASE Yusuke iwase.yusuke@gmail.com Jaime Caamaño Ruiz jcaamano@suse.com +Jakob Meng code@jakobmeng.de Jakub Libosvar libosvar@redhat.com Jakub Sitnicki jsitnicki@gmail.com James P. roampune@gmail.com From d76193008ec07ca3420b036c198e5fd149469608 Mon Sep 17 00:00:00 2001 From: James Raphael Tiovalen Date: Sat, 7 Oct 2023 16:37:38 +0800 Subject: [PATCH 397/833] tests: Add some tests for byteq module. This commit adds a non-exhaustive list of tests for some of the functions declared in `lib/byteq`. These unit tests have been executed via `make check` and they successfully passed. Acked-by: Mike Pattrick Acked-by: Simon Horman Signed-off-by: James Raphael Tiovalen Signed-off-by: Eelco Chaudron --- tests/automake.mk | 1 + tests/library.at | 13 ++++ tests/test-byteq.c | 159 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 173 insertions(+) create mode 100644 tests/test-byteq.c diff --git a/tests/automake.mk b/tests/automake.mk index 720c944496b..f8a925012d6 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -455,6 +455,7 @@ tests_ovstest_SOURCES = \ tests/test-barrier.c \ tests/test-bundle.c \ tests/test-byte-order.c \ + tests/test-byteq.c \ tests/test-classifier.c \ tests/test-ccmap.c \ tests/test-cmap.c \ diff --git a/tests/library.at b/tests/library.at index 164ae789dde..3f9df2f87d3 100644 --- a/tests/library.at +++ b/tests/library.at @@ -88,6 +88,19 @@ AT_KEYWORDS([byte order]) AT_CHECK([ovstest test-byte-order]) AT_CLEANUP +AT_SETUP([byteq - basic]) +AT_KEYWORDS([byteq]) +AT_CHECK([ovstest test-byteq basic], [0], [... 
+]) +AT_CLEANUP + +AT_SETUP([byteq - write_read]) +AT_KEYWORDS([byteq]) +AT_SKIP_IF([test "$IS_WIN32" = "yes"]) +AT_CHECK([ovstest test-byteq write_read], [0], [. +]) +AT_CLEANUP + AT_SETUP([random number generator]) AT_CHECK([ovstest test-random], [0], [dnl average=7fa2014f diff --git a/tests/test-byteq.c b/tests/test-byteq.c new file mode 100644 index 00000000000..ed2afd1fef8 --- /dev/null +++ b/tests/test-byteq.c @@ -0,0 +1,159 @@ +/* + * Copyright (C) 2023 Hewlett Packard Enterprise Development LP + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#undef NDEBUG +#include +#include +#include +#include +#include +#include "byteq.h" +#include "ovstest.h" +#include "util.h" + +static void test_byteq_main(int argc, char *argv[]); +static void test_byteq_put_get(void); +static void test_byteq_putn_get(void); +static void test_byteq_put_string(void); +static void test_byteq_write_read(void); + +#define SIZE 256 + +static void +test_byteq_put_get(void) +{ + struct byteq bq; + uint8_t buffer[SIZE]; + const char *input = "Open vSwitch"; + const int input_len = strlen(input); + + byteq_init(&bq, buffer, SIZE); + for (int i = 0; i < input_len; i++) { + byteq_put(&bq, input[i]); + } + for (int i = 0; i < input_len; i++) { + ovs_assert(byteq_get(&bq) == input[i]); + } +} + +static void +test_byteq_putn_get(void) +{ + struct byteq bq; + uint8_t buffer[SIZE]; + const char *input = "Open vSwitch"; + const int input_len = strlen(input); + + byteq_init(&bq, buffer, SIZE); + byteq_putn(&bq, input, input_len); + for (int i = 0; i < input_len; i++) { + ovs_assert(byteq_get(&bq) == input[i]); + } +} + +static void +test_byteq_put_string(void) +{ + struct byteq bq; + uint8_t buffer[SIZE]; + const char *input = "Open vSwitch"; + const int input_len = strlen(input); + + byteq_init(&bq, buffer, SIZE); + byteq_put_string(&bq, input); + for (int i = 0; i < input_len; i++) { + ovs_assert(byteq_get(&bq) == input[i]); + } +} + +static void +test_byteq_write_read(void) +{ +#ifndef _WIN32 + int fd[2]; + pid_t childpid; + int rc; + struct byteq bq; + uint8_t buffer[SIZE]; + const char *input = "Open vSwitch"; + const int input_len = strlen(input); + + byteq_init(&bq, buffer, SIZE); + byteq_put_string(&bq, input); + + rc = pipe(fd); + ovs_assert(rc == 0); + + /* Flush stdout */ + fflush(stdout); + + childpid = fork(); + ovs_assert(childpid != -1); + if (childpid == 0) { + /* Child process closes stdout */ + close(STDOUT_FILENO); + /* Child process closes up input side of pipe */ + close(fd[0]); + rc = byteq_write(&bq, fd[1]); 
+ ovs_assert(rc == 0); + exit(0); + } else { + /* Parent process closes up output side of pipe */ + close(fd[1]); + rc = byteq_read(&bq, fd[0]); + ovs_assert(rc == EOF); + for (int i = 0; i < input_len; i++) { + ovs_assert(byteq_get(&bq) == input[i]); + } + } +#endif /* _WIN32 */ +} + +static void +run_test(void (*function)(void)) +{ + function(); + printf("."); +} + +static void +test_byteq_main(int argc, char *argv[]) +{ + if (argc != 2) { + ovs_fatal(0, "exactly one argument required\n" + "the argument must be one of the following:\n" + "\tbasic\n" + "\twrite_read\n"); + } + + if (strcmp(argv[1], "write_read") == 0) { + run_test(test_byteq_write_read); + printf("\n"); + } else if (strcmp(argv[1], "basic") == 0) { + run_test(test_byteq_put_get); + run_test(test_byteq_putn_get); + run_test(test_byteq_put_string); + printf("\n"); + } else { + ovs_fatal(0, "invalid argument\n" + "the argument must be one of the following:\n" + "\tbasic\n" + "\twrite_read\n"); + } +} + +OVSTEST_REGISTER("test-byteq", test_byteq_main); From 834bd9158f365f438aa96ac0e04ba95f54f51eaf Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Mon, 9 Oct 2023 15:05:01 +0300 Subject: [PATCH 398/833] ofproto-dpif-upcall: Fix redundant mirror on geneve tunnel options. The cited commit fixed missing mirror packets by reset mirror when packets are modified but setting geneve options was also treated as a modified packet but should be treated as a part of set_tunnel which doesn't reset mirror. 
Fixes: feed7f677505 ("ofproto-dpif-upcall: Mirror packets that are modified.") Acked-by: Simon Horman Signed-off-by: Roi Dayan Signed-off-by: Eelco Chaudron --- ofproto/ofproto-dpif-xlate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index be4bd665768..e243773307b 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -7097,7 +7097,7 @@ reset_mirror_ctx(struct xlate_ctx *ctx, const struct flow *flow, set_field = ofpact_get_SET_FIELD(a); mf = set_field->field; - if (mf_are_prereqs_ok(mf, flow, NULL)) { + if (mf_are_prereqs_ok(mf, flow, NULL) && !mf_is_tun_metadata(mf)) { ctx->mirrors = 0; } return; From c92ded5515031bf256d80090b5c4f5980360c0c6 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Mon, 9 Oct 2023 15:05:02 +0300 Subject: [PATCH 399/833] tests/tunnel.at: Add geneve options mirror test. Test geneve options mirror flow doesn't add redundant mirror. Acked-by: Simon Horman Signed-off-by: Roi Dayan Signed-off-by: Eelco Chaudron --- tests/tunnel.at | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/tunnel.at b/tests/tunnel.at index ddeb66bc9fb..7e7116711a6 100644 --- a/tests/tunnel.at +++ b/tests/tunnel.at @@ -1279,3 +1279,32 @@ AT_CHECK([tail -1 stdout], [0], OVS_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([tunnel - Geneve metadata mirror]) +OVS_VSWITCHD_START([add-port br0 p1 -- set Interface p1 type=geneve \ + options:remote_ip=1.1.1.1 ofport_request=1 \ + -- add-port br0 p2 -- set Interface p2 type=dummy \ + ofport_request=2 ofport_request=2]) +OVS_VSWITCHD_DISABLE_TUNNEL_PUSH_POP +add_of_ports br0 90 +AT_CHECK([ovs-vsctl \ + set Bridge br0 mirrors=@m --\ + --id=@p90 get Port p90 --\ + --id=@m create Mirror name=mymirror select_all=true output_port=@p90], [0], [stdout]) + +AT_CHECK([ovs-ofctl add-tlv-map br0 "{class=0xffff,type=0,len=4}->tun_metadata0,{class=0xffff,type=1,len=8}->tun_metadata1"]) + +AT_DATA([flows.txt], [dnl 
+in_port=2,actions=set_field:0xa->tun_metadata0,set_field:0x1234567890abcdef->tun_metadata1,1 +tun_metadata0=0xb/0xf,actions=2 +]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +flow="in_port(2),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=1,tos=0,ttl=128,frag=no),icmp(type=8,code=0)" +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "$flow"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 90,set(tunnel(dst=1.1.1.1,ttl=64,tp_dst=6081,geneve({class=0xffff,type=0,len=4,0xa}{class=0xffff,type=0x1,len=8,0x1234567890abcdef}),flags(df))),6081 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP From f100e6a838ffaa9fbf02f29404cf9850ba002083 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Mon, 9 Oct 2023 15:05:03 +0300 Subject: [PATCH 400/833] tests: Update some tests title prefix print. Use test title prefix according to filename the test is in for tunnel.at and ofproto-dpif.at. Acked-by: Simon Horman Signed-off-by: Roi Dayan Signed-off-by: Eelco Chaudron --- tests/ofproto-dpif.at | 10 +++++----- tests/tunnel.at | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at index a39d0d3ae98..e305e7b9cd0 100644 --- a/tests/ofproto-dpif.at +++ b/tests/ofproto-dpif.at @@ -676,7 +676,7 @@ NXST_FLOW reply: OVS_VSWITCHD_STOP() AT_CLEANUP -AT_SETUP([bond - discard duplicated frames]) +AT_SETUP([ofproto-dpif - bond - discard duplicated frames]) dnl With an active/active non-lacp bond, the default behaviour dnl is to discard multicast frames on the secondary interface. OVS_VSWITCHD_START([dnl @@ -740,7 +740,7 @@ Datapath actions: drop OVS_VSWITCHD_STOP() AT_CLEANUP -AT_SETUP([bond - allow duplicated frames]) +AT_SETUP([ofproto-dpif - bond - allow duplicated frames]) dnl Receiving of duplicated multicast frames should be allowed with 'all_members_active'. 
OVS_VSWITCHD_START([dnl add-bond br0 bond0 p1 p2 -- dnl @@ -9706,7 +9706,7 @@ OVS_VSWITCHD_STOP AT_CLEANUP # Tests the bundling with various bfd and cfm configurations. -AT_SETUP([ofproto - bundle with variable bfd/cfm config]) +AT_SETUP([ofproto-dpif - bundle with variable bfd/cfm config]) OVS_VSWITCHD_START([add-br br1 -- set bridge br1 datapath-type=dummy -- \ add-bond br0 br0bond p0 p2 bond-mode=active-backup -- \ add-bond br1 br1bond p1 p3 bond-mode=active-backup -- \ @@ -11777,7 +11777,7 @@ AT_CHECK([ovs-appctl dpctl/ct-get-sweep-interval], [0], [dnl OVS_VSWITCHD_STOP AT_CLEANUP -AT_SETUP([ofproto - set mtu]) +AT_SETUP([ofproto-dpif - set mtu]) OVS_VSWITCHD_START add_of_ports br0 1 @@ -11827,7 +11827,7 @@ AT_CHECK([ovs-vsctl wait-until Interface br0 mtu=1400]) OVS_VSWITCHD_STOP AT_CLEANUP -AT_SETUP([ofproto - fragment prerequisites]) +AT_SETUP([ofproto-dpif - fragment prerequisites]) OVS_VSWITCHD_START AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg]) diff --git a/tests/tunnel.at b/tests/tunnel.at index 7e7116711a6..05613bcc343 100644 --- a/tests/tunnel.at +++ b/tests/tunnel.at @@ -603,7 +603,7 @@ AT_CHECK([ovs-appctl dpif/show | tail -n +3], [0], [dnl OVS_VSWITCHD_STOP AT_CLEANUP -AT_SETUP([ofproto-dpif - set_field - tun_src/tun_dst/tun_id]) +AT_SETUP([tunnel - set_field - tun_src/tun_dst/tun_id]) OVS_VSWITCHD_START([dnl add-port br0 p1 -- set Interface p1 type=gre options:key=flow \ options:remote_ip=1.1.1.1 ofport_request=1 \ From b16ef5200282288410d096d377ae3dfb78cf45cf Mon Sep 17 00:00:00 2001 From: Faicker Mo Date: Sat, 7 Oct 2023 11:49:42 +0800 Subject: [PATCH 401/833] tc: Add csum offload of IGMP/UDPLITE/SCTP in IP rewrite. When the IP header is modified, for example, by NAT or a ToS/TTL change, the IP header checksum needs recalculation. In addition to the IP header checksum, for UDPLITE, its checksum also needs recalculation when any of the addresses change. 
This patch adds support for TC offloading of IGMP, UDPLITE, and SCTP packets by adding the correct csum action. Acked-by: Simon Horman Signed-off-by: Faicker Mo Signed-off-by: Eelco Chaudron --- lib/tc.c | 7 ++++++- tests/system-offloads-traffic.at | 27 +++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/lib/tc.c b/lib/tc.c index f49048cdaba..ae71390bc7d 100644 --- a/lib/tc.c +++ b/lib/tc.c @@ -2973,11 +2973,16 @@ csum_update_flag(struct tc_flower *flower, } else if (flower->key.ip_proto == IPPROTO_UDP) { flower->needs_full_ip_proto_mask = true; flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_UDP; - } else if (flower->key.ip_proto == IPPROTO_ICMP) { + } else if (flower->key.ip_proto == IPPROTO_ICMP || + flower->key.ip_proto == IPPROTO_IGMP || + flower->key.ip_proto == IPPROTO_SCTP) { flower->needs_full_ip_proto_mask = true; } else if (flower->key.ip_proto == IPPROTO_ICMPV6) { flower->needs_full_ip_proto_mask = true; flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_ICMP; + } else if (flower->key.ip_proto == IPPROTO_UDPLITE) { + flower->needs_full_ip_proto_mask = true; + flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_UDPLITE; } else { VLOG_WARN_RL(&error_rl, "can't offload rewrite of IP/IPV6 with ip_proto: %d", diff --git a/tests/system-offloads-traffic.at b/tests/system-offloads-traffic.at index 7215e36e2d8..3a03d931c82 100644 --- a/tests/system-offloads-traffic.at +++ b/tests/system-offloads-traffic.at @@ -855,3 +855,30 @@ AT_CHECK([ovs-appctl dpctl/dump-flows type=tc,offloaded | grep "eth_type(0x0800) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([offloads - IGMP with ip rewrite - offloads enabled]) +OVS_TRAFFIC_VSWITCHD_START([], [], [-- set Open_vSwitch . other_config:hw-offload=true]) + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +dnl Set up the ip field modify flow. 
+AT_CHECK([ovs-ofctl add-flow br0 "priority=100 in_port=ovs-p0,ip actions=mod_nw_tos:12,output:ovs-p1"]) + +dnl Add and del multicast address to send IGMP packet. +NS_CHECK_EXEC([at_ns0], [ip addr add dev p0 224.10.10.10/24 autojoin 2>/dev/null], [0]) +NS_CHECK_EXEC([at_ns0], [ip addr del dev p0 224.10.10.10/24 2>/dev/null], [0]) + +OVS_WAIT_UNTIL([test `ovs-appctl dpctl/dump-flows type=tc,offloaded | grep "eth_type(0x0800)" | wc -l` -ge 1]) + +dnl Check the offloaded flow. +AT_CHECK([ovs-appctl dpctl/dump-flows type=tc,offloaded | grep "eth_type(0x0800)" | DUMP_CLEAN_SORTED | strip_stats], [0], [dnl +in_port(2),eth(),eth_type(0x0800),ipv4(proto=2,tos=0xc0/0xfc,frag=no), packets:0, bytes:0, used:0.001s, actions:set(ipv4(tos=0xc/0xfc)),3 +]) + +dnl Check the tc rule. +AT_CHECK([tc -d filter show dev ovs-p0 ingress | grep -q "csum (iph)"], [0]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP From c29ba54018520f957c48d947325ed50c9442b831 Mon Sep 17 00:00:00 2001 From: Faicker Mo Date: Sat, 7 Oct 2023 11:49:46 +0800 Subject: [PATCH 402/833] tc: Add IPIP/GRE protocols to offload in IP rewrite. Currently checksum recalculation is not supported with TC offload for IPIP and GRE packets. This patch adds support for TC offloading of IPIP and GRE packets by adding the correct csum action. Without this patch the following warning can be seen in the logging: Can't offload rewrite of IP/IPV6 with ip_proto: X. 
Acked-by: Simon Horman Signed-off-by: Faicker Mo Signed-off-by: Eelco Chaudron --- lib/tc.c | 4 +++- tests/system-offloads-traffic.at | 39 ++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/lib/tc.c b/lib/tc.c index ae71390bc7d..f8570363327 100644 --- a/lib/tc.c +++ b/lib/tc.c @@ -2975,7 +2975,9 @@ csum_update_flag(struct tc_flower *flower, flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_UDP; } else if (flower->key.ip_proto == IPPROTO_ICMP || flower->key.ip_proto == IPPROTO_IGMP || - flower->key.ip_proto == IPPROTO_SCTP) { + flower->key.ip_proto == IPPROTO_SCTP || + flower->key.ip_proto == IPPROTO_IPIP || + flower->key.ip_proto == IPPROTO_GRE) { flower->needs_full_ip_proto_mask = true; } else if (flower->key.ip_proto == IPPROTO_ICMPV6) { flower->needs_full_ip_proto_mask = true; diff --git a/tests/system-offloads-traffic.at b/tests/system-offloads-traffic.at index 3a03d931c82..81f3dc8c1e7 100644 --- a/tests/system-offloads-traffic.at +++ b/tests/system-offloads-traffic.at @@ -882,3 +882,42 @@ AT_CHECK([tc -d filter show dev ovs-p0 ingress | grep -q "csum (iph)"], [0]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([offloads - IPIP wth ip rewrite - offloads enabled]) +OVS_TRAFFIC_VSWITCHD_START([], [], [-- set Open_vSwitch . other_config:hw-offload=true]) + +AT_CHECK([ovs-ofctl add-flow br0 "priority=0 actions=normal"]) + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +dnl Set up the ip field modify flow. +AT_CHECK([ovs-ofctl add-flow br0 "priority=100 in_port=ovs-p0,ip,nw_dst=10.1.1.2 actions=dec_ttl,output:ovs-p1"]) +AT_CHECK([ovs-ofctl add-flow br0 "priority=100 in_port=ovs-p1,ip,nw_dst=10.1.1.1 actions=dec_ttl,output:ovs-p0"]) + +dnl Set up ipip tunnel in NS. 
+NS_CHECK_EXEC([at_ns0], [ip tunnel add ipip0 remote 10.1.1.2 2>/dev/null], [0]) +NS_CHECK_EXEC([at_ns0], [ip link set dev ipip0 up 2>/dev/null], [0]) +NS_CHECK_EXEC([at_ns0], [ip addr add dev ipip0 192.168.1.1/30 2>/dev/null], [0]) +NS_CHECK_EXEC([at_ns1], [ip tunnel add ipip0 remote 10.1.1.1 2>/dev/null], [0]) +NS_CHECK_EXEC([at_ns1], [ip link set dev ipip0 up 2>/dev/null], [0]) +NS_CHECK_EXEC([at_ns1], [ip addr add dev ipip0 192.168.1.2/30 2>/dev/null], [0]) + +dnl Check the tunnel. +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 192.168.1.2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Check the offloaded flow. +AT_CHECK([ovs-appctl dpctl/dump-flows type=tc,offloaded | grep "eth_type(0x0800)" | DUMP_CLEAN_SORTED | strip_stats], [0], [dnl +in_port(2),eth(),eth_type(0x0800),ipv4(dst=10.1.1.2,proto=4,ttl=64,frag=no), packets:0, bytes:0, used:0.001s, actions:set(ipv4(ttl=63)),3 +in_port(3),eth(),eth_type(0x0800),ipv4(dst=10.1.1.1,proto=4,ttl=64,frag=no), packets:0, bytes:0, used:0.001s, actions:set(ipv4(ttl=63)),2 +]) + +dnl Check the tc rule. +AT_CHECK([tc -d filter show dev ovs-p0 ingress | grep -q "csum (iph)"], [0]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP From 23a70e28663478b05cb8cfe132b9ad09439dd561 Mon Sep 17 00:00:00 2001 From: Zengyuan Wang Date: Tue, 17 Oct 2023 19:11:19 +0800 Subject: [PATCH 403/833] db-ctl-base: Fix memory leak of db commands. Variable "want_key" in function check_condition and variable "key" in function set_column were not destroyed in exception branch. This patch calls ovsdb_atom_destroy to release resources to avoid memory leak. 
Fixes: 79c1a00fb5a5 ("db-ctl-base: Don't die in set_column() on error.") Fixes: e09b3af3e249 ("db-ctl-base: Don't die in is_condition_satisfied() on error") Acked-by: Simon Horman Signed-off-by: Zengyuan Wang Signed-off-by: Ilya Maximets --- lib/db-ctl-base.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/db-ctl-base.c b/lib/db-ctl-base.c index 5d2635946d3..3a8068b12c0 100644 --- a/lib/db-ctl-base.c +++ b/lib/db-ctl-base.c @@ -820,6 +820,7 @@ check_condition(const struct ovsdb_idl_table_class *table, type.value.type = OVSDB_TYPE_VOID; error = ovsdb_datum_from_string(&b, &type, value_string, symtab); if (error) { + ovsdb_atom_destroy(&want_key, column->type.key.type); goto out; } @@ -1374,6 +1375,7 @@ set_column(const struct ovsdb_idl_table_class *table, error = ovsdb_atom_from_string(&value, NULL, &column->type.value, value_string, symtab); if (error) { + ovsdb_atom_destroy(&key, column->type.key.type); goto out; } From d581473cb304b910ba34fa9810ad2a25bc43d49b Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 18 Oct 2023 23:01:04 +0200 Subject: [PATCH 404/833] AUTHORS: Add Zengyuan Wang. Signed-off-by: Ilya Maximets --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 9cd8a9b9229..6b8367ef4a0 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -502,6 +502,7 @@ Yunjian Wang wangyunjian@huawei.com Yousong Zhou yszhou4tech@gmail.com Zak Whittington zwhitt.vmware@gmail.com Zang MingJie zealot0630@gmail.com +Zengyuan Wang wangzengyuan@huawei.com ZhengLingyun konghuarukhr@163.com Zhenyu Gao sysugaozhenyu@gmail.com Zhi Yong Wu zwu.kernel@gmail.com From bd86266ea9ab3452173b70912f1c967db16ef0dd Mon Sep 17 00:00:00 2001 From: David Marchand Date: Wed, 18 Oct 2023 16:23:53 +0200 Subject: [PATCH 405/833] ofproto-dpif-upcall: Pause revalidators when purging. This issue has been observed when running traffic tests with a dpdk enabled userspace datapath (though those tests are added in a separate series). 
However, the described issue also affects the kernel datapath which is why this patch is sent separately. A main thread executing the 'revalidator/purge' command could race with revalidator threads that can be dumping/sweeping the purged flows at the same time. This race can be reproduced (with dpif debug logs) by running the conntrack - ICMP related unit tests with the userspace datapath: 2023-10-09T14:11:55.242Z|00177|unixctl|DBG|received request revalidator/purge[], id=0 2023-10-09T14:11:55.242Z|00044|dpif(revalidator17)|DBG|netdev@ovs-netdev: flow_dump ufid:68ff6817-fb3b-4b30-8412-9cf175318294 , packets:0, bytes:0, used:never 2023-10-09T14:11:55.242Z|00178|dpif|DBG|netdev@ovs-netdev: flow_del ufid:07046e91-30a6-4862-9048-1a76b5a88a5b recirc_id(0),dp_hash(0),skb_priority(0),in_port(2),skb_mark(0), ct_state(0),ct_zone(0),ct_mark(0),ct_label(0), packet_type(ns=0,id=0), eth(src=a6:0a:bf:e2:f3:f2,dst=62:23:0f:f6:2c:75), eth_type(0x0800),ipv4(src=10.1.1.1,dst=10.1.1.2,proto=17,tos=0, ttl=64,frag=no),udp(src=37380,dst=10000), packets:0, bytes:0, used:never ... 2023-10-09T14:11:55.242Z|00049|dpif(revalidator17)|WARN|netdev@ovs-netdev: failed to flow_get (No such file or directory) ufid:07046e91-30a6-4862-9048-1a76b5a88a5b , packets:0, bytes:0, used:never 2023-10-09T14:11:55.242Z|00050|ofproto_dpif_upcall(revalidator17)|WARN| Failed to acquire udpif_key corresponding to unexpected flow (No such file or directory): ufid:07046e91-30a6-4862-9048-1a76b5a88a5b ... 2023-10-09T14:11:55.242Z|00183|unixctl|DBG|replying with success, id=0: "" To avoid this race, a first part of the fix is to pause (if not already paused) the revalidators while the main thread is purging the datapath flows. Then a second issue is observed by running the same unit test with the kernel datapath. 
Its dpif implementation dumps flows via a netlink request (see dpif_flow_dump_create(), dpif_netlink_flow_dump_create(), nl_dump_start(), nl_sock_send__()) in the leader revalidator thread, before pausing revalidators: 2023-10-09T14:44:28.742Z|00122|unixctl|DBG|received request revalidator/purge[], id=0 ... 2023-10-09T14:44:28.742Z|00125|dpif|DBG|system@ovs-system: flow_del ufid:70102d81-30a1-44b9-aa76-3d02a9ffd2c9 recirc_id(0),dp_hash(0), skb_priority(0),in_port(2),skb_mark(0),ct_state(0),ct_zone(0), ct_mark(0),ct_label(0),eth(src=a6:0a:bf:e2:f3:f2, dst=ff:ff:ff:ff:ff:ff),eth_type(0x0806),arp(sip=10.1.1.1, tip=10.1.1.2,op=1,sha=a6:0a:bf:e2:f3:f2,tha=00:00:00:00:00:00), packets:0, bytes:0, used:never ... 2023-10-09T14:44:28.742Z|00129|unixctl|DBG|replying with success, id=0: "" ... 2023-10-09T14:44:28.742Z|00006|dpif(revalidator21)|DBG|system@ovs-system: flow_dump ufid:70102d81-30a1-44b9-aa76-3d02a9ffd2c9 , packets:0, bytes:0, used:never ... 2023-10-09T14:44:28.742Z|00012|dpif(revalidator21)|WARN|system@ovs-system: failed to flow_get (No such file or directory) ufid:70102d81-30a1-44b9-aa76-3d02a9ffd2c9 , packets:0, bytes:0, used:never 2023-10-09T14:44:28.742Z|00013|ofproto_dpif_upcall(revalidator21)|WARN| Failed to acquire udpif_key corresponding to unexpected flow (No such file or directory): ufid:70102d81-30a1-44b9-aa76-3d02a9ffd2c9 To avoid evaluating already deleted flows, the second part of the fix is to ensure that dumping from the leader revalidator thread is done out of any pause request. As a result of this patch, the unit test "offloads - delete ufid mapping if device not exist - offloads enabled" does not need to waive the random warning logs when purging dp flows. 
Fixes: 98bb4286970d ("tests: Add command to purge revalidators of flows.") Acked-by: Eelco Chaudron Acked-by: Simon Horman Signed-off-by: David Marchand Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif-upcall.c | 17 +++++++++++++++-- tests/system-offloads-traffic.at | 2 -- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c index cde03abc6da..cc10f57b5e6 100644 --- a/ofproto/ofproto-dpif-upcall.c +++ b/ofproto/ofproto-dpif-upcall.c @@ -990,7 +990,7 @@ udpif_revalidator(void *arg) udpif->reval_exit = latch_is_set(&udpif->exit_latch); start_time = time_msec(); - if (!udpif->reval_exit) { + if (!udpif->reval_exit && !udpif->pause) { bool terse_dump; terse_dump = udpif_use_ufid(udpif); @@ -1000,10 +1000,15 @@ udpif_revalidator(void *arg) } } - /* Wait for the leader to start the flow dump. */ + /* Wait for the leader to reach this point. */ ovs_barrier_block(&udpif->reval_barrier); if (udpif->pause) { revalidator_pause(revalidator); + if (!udpif->reval_exit) { + /* The main thread resumed all validators, but the leader + * didn't start the dump, go to next iteration. 
*/ + continue; + } } if (udpif->reval_exit) { @@ -3217,11 +3222,19 @@ upcall_unixctl_purge(struct unixctl_conn *conn, int argc OVS_UNUSED, struct udpif *udpif; LIST_FOR_EACH (udpif, list_node, &all_udpifs) { + bool wake_up = false; int n; + if (!latch_is_set(&udpif->pause_latch)) { + udpif_pause_revalidators(udpif); + wake_up = true; + } for (n = 0; n < udpif->n_revalidators; n++) { revalidator_purge(&udpif->revalidators[n]); } + if (wake_up) { + udpif_resume_revalidators(udpif); + } } unixctl_command_reply(conn, ""); } diff --git a/tests/system-offloads-traffic.at b/tests/system-offloads-traffic.at index 81f3dc8c1e7..e9a4587653d 100644 --- a/tests/system-offloads-traffic.at +++ b/tests/system-offloads-traffic.at @@ -799,8 +799,6 @@ AT_CHECK([test $(ovs-appctl dpctl/dump-flows | grep -c "eth_type(0x0800)") -eq 0 OVS_TRAFFIC_VSWITCHD_STOP(["/could not open network device ovs-p0/d /on nonexistent port/d -/failed to flow_get/d -/Failed to acquire udpif_key/d /No such device/d /failed to offload flow/d "]) From a413fed99b15f3cec15aaadf6dad2a5c227fd56a Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 18 Oct 2023 22:46:10 +0200 Subject: [PATCH 406/833] tc: Improve logging of mismatched actions. Currently we log the 980-ish byte long tc_action structure as a single long hex string. That is very hard to read and hard to spot the difference between two. And most of the fields are zero. Use the sparse hex dump instead as we do for keys already. Ex.: Action 1 mismatch: - Expected Action: 00000000 f0 3c 00 00 01 00 00 00-00 00 00 00 00 00 00 00 000003d0 00 00 00 00 ff ff ff ff- - Received Action: 00000000 f0 3c 00 00 01 01 00 00-00 00 00 00 00 00 00 00 000003d0 00 00 00 00 ff ff ff ff- Without the change, each action would be a 1900+ characters long string of mostly zeroes. 
Acked-by: Simon Horman Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- lib/tc.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/lib/tc.c b/lib/tc.c index f8570363327..e9bcae4e4b1 100644 --- a/lib/tc.c +++ b/lib/tc.c @@ -3858,15 +3858,13 @@ log_tc_flower_match(const char *msg, ds_put_cstr(&s, "\nExpected Actions:\n"); for (i = 0, action = a->actions; i < a->action_count; i++, action++) { - ds_put_cstr(&s, " - "); - ds_put_hex(&s, action, sizeof *action); - ds_put_cstr(&s, "\n"); + ds_put_format(&s, " - %d -\n", i); + ds_put_sparse_hex_dump(&s, action, sizeof *action, 0, false); } - ds_put_cstr(&s, "Received Actions:\n"); + ds_put_cstr(&s, "\nReceived Actions:\n"); for (i = 0, action = b->actions; i < b->action_count; i++, action++) { - ds_put_cstr(&s, " - "); - ds_put_hex(&s, action, sizeof *action); - ds_put_cstr(&s, "\n"); + ds_put_format(&s, " - %d -\n", i); + ds_put_sparse_hex_dump(&s, action, sizeof *action, 0, false); } } else { /* Only dump the delta in actions. */ @@ -3875,12 +3873,13 @@ log_tc_flower_match(const char *msg, for (int i = 0; i < a->action_count; i++, action_a++, action_b++) { if (memcmp(action_a, action_b, sizeof *action_a)) { - ds_put_format(&s, - "\nAction %d mismatch:\n - Expected Action: ", - i); - ds_put_hex(&s, action_a, sizeof *action_a); - ds_put_cstr(&s, "\n - Received Action: "); - ds_put_hex(&s, action_b, sizeof *action_b); + ds_put_format(&s, "\nAction %d mismatch:\n" + " - Expected Action:\n", i); + ds_put_sparse_hex_dump(&s, action_a, sizeof *action_a, + 0, false); + ds_put_cstr(&s, " - Received Action:\n"); + ds_put_sparse_hex_dump(&s, action_b, sizeof *action_b, + 0, false); } } } From e388bd73b70d8f053f26b609ee939d6af7d1a10c Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 23 Oct 2023 15:31:48 +0200 Subject: [PATCH 407/833] readthedocs: Add the configuration file. 
Since last month ReadTheDocs only supports building with a new configuration file provided in the repository itself: https://blog.readthedocs.com/migrate-configuration-v2/ So, all our documentation builds have been failing for quite some time. Add the configuration file to unblock documentation updates. Need to remove the upper restriction on the sphinx version. sphinx 2.0 is very old at this point and pip fails to install it along with other dependencies on the rtd server. Note: Sphinx 2.0 moved from HTML4 to HTML5 renderer and tables no longer have borders by default. That should be addressed via CSS file in the ovs-sphinx-theme. Acked-by: Aaron Conole Signed-off-by: Ilya Maximets --- .readthedocs.yaml | 24 ++++++++++++++++++++++++ Documentation/requirements.txt | 2 +- Makefile.am | 1 + 3 files changed, 26 insertions(+), 1 deletion(-) create mode 100644 .readthedocs.yaml diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 00000000000..e481e64f1fc --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,24 @@ +# .readthedocs.yaml +# Read the Docs configuration file. +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details. + +# Required. +version: 2 + +# Set the OS, Python version, etc. +build: + os: ubuntu-22.04 + tools: + python: "3.12" + +# Build documentation in the "Documentation/" directory with Sphinx. +sphinx: + configuration: Documentation/conf.py + +# Build all formats: HTML, PDF, ePub. +formats: all + +# Declare the Python requirements. 
+python: + install: + - requirements: Documentation/requirements.txt diff --git a/Documentation/requirements.txt b/Documentation/requirements.txt index 77130c6e01b..77f44bd7654 100644 --- a/Documentation/requirements.txt +++ b/Documentation/requirements.txt @@ -1,2 +1,2 @@ -sphinx>=1.1,<2.0 +sphinx>=1.1 ovs_sphinx_theme>=1.0,<1.1 diff --git a/Makefile.am b/Makefile.am index 439e2bf6d53..94f488d1837 100644 --- a/Makefile.am +++ b/Makefile.am @@ -84,6 +84,7 @@ EXTRA_DIST = \ .cirrus.yml \ .editorconfig \ .github/workflows/build-and-test.yml \ + .readthedocs.yaml \ appveyor.yml \ boot.sh \ poc/builders/Vagrantfile \ From cc89bf8e22f838e08f3729c25384d38cfa17fdb1 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 23 Oct 2023 15:59:46 +0200 Subject: [PATCH 408/833] README: Add documentation build status badge. This should make it a little more visible that documentation build fails on ReadTheDocs. Acked-by: Aaron Conole Signed-off-by: Ilya Maximets --- README.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.rst b/README.rst index e6c0d3d3061..a2c234f4d17 100644 --- a/README.rst +++ b/README.rst @@ -12,6 +12,8 @@ Open vSwitch :target: https://ci.appveyor.com/project/blp/ovs/history .. image:: https://api.cirrus-ci.com/github/openvswitch/ovs.svg :target: https://cirrus-ci.com/github/openvswitch/ovs +.. image:: https://readthedocs.org/projects/openvswitch/badge/?version=latest + :target: https://docs.openvswitch.org/en/latest/ What is Open vSwitch? --------------------- From 6cfb3d1ff5137c9bc3e361bf76f412b7f2a9f13a Mon Sep 17 00:00:00 2001 From: Frode Nordahl Date: Sat, 21 Oct 2023 01:22:08 +0200 Subject: [PATCH 409/833] tests/system-traffic: Ensure no name resolution for tcpdump. Depending on system configuration, executing tcpdump without the -n parameter, may prolong the execution time for tcpdump while it attempts name resolution. This delay may in turn lead to test failures due to contents of tables to check being evicted. 
We recently started to see this problem with the "conntrack -IPv6 ICMP6 Related with SNAT" test. For consistency, this patch adds the -n parameter to all tcpdump calls in system-traffic.at. Acked-by: Simon Horman Signed-off-by: Frode Nordahl Signed-off-by: Ilya Maximets --- tests/system-traffic.at | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 418cd32fecd..1df2a541902 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -888,7 +888,7 @@ NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00::100 | FORMAT_PING], [0] ]) dnl Start tcpdump to capture the encapsulated packets. -NETNS_DAEMONIZE([at_ns0], [tcpdump -U -i p0 -w p0.pcap], [tcpdump.pid]) +NETNS_DAEMONIZE([at_ns0], [tcpdump -n -U -i p0 -w p0.pcap], [tcpdump.pid]) sleep 1 dnl Generate a single packet trough the controler that needs an ARP modification @@ -3814,7 +3814,7 @@ table=0,in_port=ovs-p1,ct_state=+trk+rel+rpl,icmp,actions=ovs-p0 AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) rm p0.pcap -NETNS_DAEMONIZE([at_ns0], [tcpdump -l -U -i p0 -w p0.pcap 2> tcpdump0_err], [tcpdump0.pid]) +NETNS_DAEMONIZE([at_ns0], [tcpdump -n -l -U -i p0 -w p0.pcap 2> tcpdump0_err], [tcpdump0.pid]) OVS_WAIT_UNTIL([grep "listening" tcpdump0_err]) dnl Send UDP packet from 10.1.1.1:1234 to 10.1.1.240:80 @@ -6158,7 +6158,7 @@ table=10 priority=0 action=drop AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) rm p0.pcap -OVS_DAEMONIZE([tcpdump -U -i ovs-p0 -w p0.pcap], [tcpdump.pid]) +OVS_DAEMONIZE([tcpdump -n -U -i ovs-p0 -w p0.pcap], [tcpdump.pid]) sleep 1 dnl UDP packets from ns0->ns1 should solicit "destination unreachable" response. 
@@ -6182,7 +6182,7 @@ AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2) | sed -e 's/dst= udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),reply=(src=10.1.1.2,dst=10.1.1.2XX,sport=,dport=),mark=1 ]) -AT_CHECK([tcpdump -v "icmp" -r p0.pcap 2>/dev/null | grep -E 'wrong|bad'], [1], [ignore-nolog]) +AT_CHECK([tcpdump -n -v "icmp" -r p0.pcap 2>/dev/null | grep -E 'wrong|bad'], [1], [ignore-nolog]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP @@ -6927,13 +6927,13 @@ OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::2]) AT_CHECK([ovs-appctl dpctl/flush-conntrack]) rm p0.pcap -OVS_DAEMONIZE([tcpdump -U -i ovs-p0 -w p0.pcap], [tcpdump.pid]) +OVS_DAEMONIZE([tcpdump -n -U -i ovs-p0 -w p0.pcap], [tcpdump.pid]) sleep 1 dnl UDP packets from ns0->ns1 should solicit "destination unreachable" response. NS_CHECK_EXEC([at_ns0], [bash -c "echo a | nc -6 $NC_EOF_OPT -u fc00::2 1"]) -AT_CHECK([tcpdump -v "icmp6" -r p0.pcap 2>/dev/null | grep -E 'wrong|bad'], [1], [ignore-nolog]) +AT_CHECK([tcpdump -n -v "icmp6" -r p0.pcap 2>/dev/null | grep -E 'wrong|bad'], [1], [ignore-nolog]) AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(fc00::2)], [0], [dnl udp,orig=(src=fc00::1,dst=fc00::2,sport=,dport=),reply=(src=fc00::2,dst=fc00::240,sport=,dport=) @@ -6962,7 +6962,7 @@ table=0,in_port=ovs-p1,ct_state=+trk+rel+rpl,icmp6,actions=ovs-p0 AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) rm p0.pcap -NETNS_DAEMONIZE([at_ns0], [tcpdump -l -U -i p0 -w p0.pcap 2> tcpdump0_err], [tcpdump0.pid]) +NETNS_DAEMONIZE([at_ns0], [tcpdump -n -l -U -i p0 -w p0.pcap 2> tcpdump0_err], [tcpdump0.pid]) OVS_WAIT_UNTIL([grep "listening" tcpdump0_err]) dnl Send UDP packet from [[fc00::1]]:1234 to [[fc00::240]]:80 @@ -7653,7 +7653,7 @@ table=2,in_port=ovs-server,ip,ct_state=+trk+rpl,actions=output:ovs-client AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) rm server.pcap -NETNS_DAEMONIZE([server], [tcpdump -l -U -i server -w server.pcap 2>tcpdump0_err], [tcpdump0.pid]) +NETNS_DAEMONIZE([server], 
[tcpdump -n -l -U -i server -w server.pcap 2>tcpdump0_err], [tcpdump0.pid]) OVS_WAIT_UNTIL([grep "listening" tcpdump0_err]) dnl Send UDP client->server @@ -7695,7 +7695,7 @@ dnl Check the ICMP error in reply direction AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=42]) rm client.pcap -NETNS_DAEMONIZE([client], [tcpdump -l -U -i client -w client.pcap 2>tcpdump1_err], [tcpdump1.pid]) +NETNS_DAEMONIZE([client], [tcpdump -n -l -U -i client -w client.pcap 2>tcpdump1_err], [tcpdump1.pid]) OVS_WAIT_UNTIL([grep "listening" tcpdump1_err]) dnl Send UDP client->server From 34ae81c1f4330a5934d52a3c6bcf03d8bd8ee668 Mon Sep 17 00:00:00 2001 From: Frode Nordahl Date: Sat, 21 Oct 2023 17:04:48 +0200 Subject: [PATCH 410/833] tests: Use ping timeout instead of deadline. Many system tests currently use ping with the combination of a low packet count (-c 3), short interval between sends (-i 0.3) and a _deadline_ of 2 seconds (-w 2). This combination of options may lead to a situation where more than count packets are sent however ping will stop when count packets are received. This results in a failed test due to how the result is checked, for example: ping6 -q -c 3 -i 0.3 -w 2 fc00::3 | FORMAT_PING @@ -1,2 +1,2 @@ -3 packets transmitted, 3 received, 0% packet loss, time 0ms +4 packets transmitted, 3 received, 25% packet loss, time 0ms To reiterate, in the above example there is no packet loss, but ping stops after _receiving_ 3 packets, not bothering with waiting for the response to the fourth packet it just sent out. If we look at the iputils ping manual for the -w deadline option we can read that this is expected behavior: > Specify a timeout, in seconds, before ping exits regardless of > how many packets have been sent or received. In this case ping > does not stop after count packet are sent, it waits either for > deadline expire or until count probes are answered or for some > error notification from network. 
To avoid these kinds of failures in checks where a response is expected, we replace ping -w with ping -W. We keep ping -w for checks where it is expected to NOT get a response. Acked-by: Simon Horman Signed-off-by: Frode Nordahl Signed-off-by: Ilya Maximets --- tests/system-afxdp.at | 2 +- tests/system-ipsec.at | 4 +- tests/system-layer3-tunnels.at | 20 +- tests/system-offloads-traffic.at | 58 ++--- tests/system-tap.at | 2 +- tests/system-traffic.at | 262 ++++++++++---------- tests/system-userspace-packet-type-aware.at | 6 +- 7 files changed, 177 insertions(+), 177 deletions(-) diff --git a/tests/system-afxdp.at b/tests/system-afxdp.at index 0d09906fb6c..88f66056630 100644 --- a/tests/system-afxdp.at +++ b/tests/system-afxdp.at @@ -39,7 +39,7 @@ AT_CHECK([ovs-vsctl add-port br0 ovs-p0 -- \ set interface ovs-p0 type=afxdp-nonpmd options:n_rxq=1], [0], [], [stderr]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) diff --git a/tests/system-ipsec.at b/tests/system-ipsec.at index 07f2b8fd0e8..d3d27133b97 100644 --- a/tests/system-ipsec.at +++ b/tests/system-ipsec.at @@ -141,10 +141,10 @@ m4_define([CHECK_ESP_TRAFFIC], OVS_WAIT_UNTIL([test `IPSEC_STATUS_LOADED(right)` -eq `IPSEC_STATUS_ACTIVE(right)`]) dnl Ping over IPsec tunnel - NS_CHECK_EXEC([left], [ping -q -c 3 -i 0.3 -w 2 192.0.0.2 | FORMAT_PING], [0], [dnl + NS_CHECK_EXEC([left], [ping -q -c 3 -i 0.3 -W 2 192.0.0.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) - NS_CHECK_EXEC([right], [ping -q -c 3 -i 0.3 -w 2 192.0.0.1 | FORMAT_PING], [0], [dnl + NS_CHECK_EXEC([right], [ping -q -c 3 -i 0.3 -W 2 192.0.0.1 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) diff --git a/tests/system-layer3-tunnels.at b/tests/system-layer3-tunnels.at index 
81123f7309a..6fbdedb64f6 100644 --- a/tests/system-layer3-tunnels.at +++ b/tests/system-layer3-tunnels.at @@ -34,15 +34,15 @@ AT_CHECK([ovs-ofctl add-flow br0 "priority=100 ip,nw_dst=10.1.1.2 action=mod_dl_ OVS_WAIT_UNTIL([ip netns exec at_ns0 ping -c 1 10.1.1.2]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay with different packet sizes -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -83,15 +83,15 @@ AT_CHECK([ovs-ofctl add-flow br0 "priority=100 ip,nw_dst=10.1.1.2 action=mod_dl_ OVS_WAIT_UNTIL([ip netns exec at_ns0 ping -c 1 10.1.1.2]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay with different packet sizes -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 
3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -191,11 +191,11 @@ AT_CHECK([ovs-vsctl add-port br1 patch1]) AT_CHECK([ovs-ofctl -O OpenFlow13 add-flows br0 flows0.txt]) AT_CHECK([ovs-ofctl -O OpenFlow13 add-flows br1 flows1.txt]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -w 2 10.1.1.1 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -W 2 10.1.1.1 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) OVS_TRAFFIC_VSWITCHD_STOP @@ -239,11 +239,11 @@ AT_CHECK([ovs-vsctl add-port br1 patch1]) AT_CHECK([ovs-ofctl -O OpenFlow13 add-flows br0 flows0.txt]) AT_CHECK([ovs-ofctl -O OpenFlow13 add-flows br1 flows1.txt]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -w 2 10.1.1.1 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -W 2 10.1.1.1 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) OVS_TRAFFIC_VSWITCHD_STOP diff --git a/tests/system-offloads-traffic.at b/tests/system-offloads-traffic.at index e9a4587653d..5ad6b4bfdf6 100644 --- a/tests/system-offloads-traffic.at +++ b/tests/system-offloads-traffic.at @@ -39,7 +39,7 @@ ADD_NAMESPACES(at_ns0, at_ns1) ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") -NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ]) @@ 
-72,7 +72,7 @@ ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") AT_CHECK([ovs-appctl dpctl/dump-flows], [0], [ignore]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ]) @@ -204,7 +204,7 @@ NS_CHECK_EXEC([at_ns0], [ip neigh add 10.1.1.2 lladdr f0:00:00:01:01:02 dev p0]) NS_CHECK_EXEC([at_ns1], [ip neigh add 10.1.1.1 lladdr f0:00:00:01:01:01 dev p1]) AT_CHECK([ovs-ofctl -O OpenFlow13 add-flow br0 "actions=normal"]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ]) @@ -254,7 +254,7 @@ NS_CHECK_EXEC([at_ns0], [ip neigh add 10.1.1.2 lladdr f0:00:00:01:01:02 dev p0]) NS_CHECK_EXEC([at_ns1], [ip neigh add 10.1.1.1 lladdr f0:00:00:01:01:01 dev p1]) AT_CHECK([ovs-ofctl -O OpenFlow13 add-flow br0 "actions=normal"]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ]) @@ -313,11 +313,11 @@ NETNS_DAEMONIZE([at_ns3], [tcpdump -l -n -U -i p3 dst 10.1.1.2 and icmp > p3.pca NETNS_DAEMONIZE([at_ns4], [tcpdump -l -n -U -i p4 dst 10.1.1.2 and icmp > p4.pcap 2>/dev/null], [tcpdump4.pid]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 1024 10.1.1.2 | FORMAT_PING], [0], [dnl 
+NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 1024 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) @@ -371,11 +371,11 @@ NETNS_DAEMONIZE([at_ns3], [tcpdump -l -n -U -i p3 dst 10.1.1.2 and icmp > p3.pca NETNS_DAEMONIZE([at_ns4], [tcpdump -l -n -U -i p4 dst 10.1.1.2 and icmp > p4.pcap 2>/dev/null], [tcpdump4.pid]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 1024 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 1024 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) @@ -419,11 +419,11 @@ table=4,in_port=1,reg0=0x0 actions=output:2 AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 1024 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 1024 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) @@ -447,7 +447,7 @@ table=4,in_port=1,reg0=0x0 actions=output:2 ]) AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) sleep 1 
-NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) OVS_CHECK_ACTIONS([check_pkt_len(size=200,gt(5),le(3))]) @@ -463,7 +463,7 @@ table=4,in_port=1,reg0=0x0 actions=output:2 ]) AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) OVS_CHECK_ACTIONS([check_pkt_len(size=200,gt(drop),le(3))]) @@ -479,7 +479,7 @@ table=4,in_port=1,reg0=0x1 actions=output:2 ]) AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 1024 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 1024 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) OVS_CHECK_ACTIONS([check_pkt_len(size=200,gt(3),le(drop))]) @@ -496,7 +496,7 @@ table=4,in_port=1,reg0=0x1 actions=output:2,4 ]) AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 1024 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 1024 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) OVS_CHECK_ACTIONS([check_pkt_len(size=200,gt(3,5),le(3,4))]) @@ -519,11 +519,11 @@ NETNS_DAEMONIZE([at_ns3], [tcpdump -l -n -U -i p3 
dst 10.1.1.2 and icmp > p3_2.p NETNS_DAEMONIZE([at_ns4], [tcpdump -l -n -U -i p4 dst 10.1.1.2 and icmp > p4_2.pcap 2>/dev/null], [tcpdump4_2.pid]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 1024 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 1024 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) @@ -555,7 +555,7 @@ table=4,in_port=1,reg0=0x0 actions=mod_dl_src:00:11:11:11:11:11,output:4 ]) AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) OVS_CHECK_ACTIONS([check_pkt_len(size=200,gt(set(ipv4(tos=0x4/0xfc)),4),le(set(eth(src=00:11:11:11:11:11)),5)),3]) @@ -573,7 +573,7 @@ table=4,in_port=1,reg0=0x0 actions=mod_nw_tos:8,output:4 ]) AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) OVS_CHECK_ACTIONS([check_pkt_len(size=200,gt(set(eth(src=00:11:11:11:11:11)),4),le(set(ipv4(tos=0x8/0xfc)),5)),3]) @@ -591,7 +591,7 @@ 
table=4,in_port=1,reg0=0x0 actions=output:br0 ]) AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) OVS_CHECK_ACTIONS([check_pkt_len(size=200,gt(1),le(1)),3]) @@ -608,7 +608,7 @@ table=4,in_port=1,reg0=0x1 actions=output:br0 ]) AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) OVS_CHECK_ACTIONS([check_pkt_len(size=200,gt(1),le(drop)),3]) @@ -625,7 +625,7 @@ table=4,in_port=1,reg0=0x0 actions=output:br0 ]) AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 1024 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 1024 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) @@ -642,7 +642,7 @@ table=1,in_port=1,reg1=0x2 actions=output:2 ]) AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) OVS_CHECK_ACTIONS([check_pkt_len(size=200,gt(drop),le(drop)),3]) 
@@ -662,7 +662,7 @@ table=5,in_port=1,reg0=0x0 actions=output:3 ]) AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) OVS_CHECK_ACTIONS([check_pkt_len(size=200,gt(check_pkt_len(size=400,gt(5),le(4))),le(5)),3]) @@ -682,7 +682,7 @@ table=5,in_port=1,reg0=0x0 actions=output:3 ]) AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) OVS_CHECK_ACTIONS([check_pkt_len(size=200,gt(5),le(check_pkt_len(size=100,gt(5),le(4)))),3]) @@ -706,7 +706,7 @@ add in_port=ovs-p1,actions=ovs-p0 ]) AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ]) @@ -723,7 +723,7 @@ modify in_port=ovs-p1,actions=output(port=ovs-p0, max_len=128) AT_CHECK([ovs-ofctl add-flows br0 flows2.txt]) AT_CHECK([ovs-appctl revalidator/wait], [0]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ]) @@ -735,7 +735,7 @@ recirc_id(),in_port(3),eth(),eth_type(0x0800),ipv4(frag=no), packets:10, AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) 
AT_CHECK([ovs-appctl revalidator/wait], [0]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ]) @@ -834,7 +834,7 @@ NS_CHECK_EXEC([at_ns0], [iptables -I OUTPUT -p ip -j MARK --set-mark 512 2>/dev/ NS_CHECK_EXEC([at_ns0], [iptables -I INPUT -m mark --mark 512 -j ACCEPT 2>/dev/null], [0], [ignore]) dnl First, check the underlay. -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -904,7 +904,7 @@ NS_CHECK_EXEC([at_ns1], [ip link set dev ipip0 up 2>/dev/null], [0]) NS_CHECK_EXEC([at_ns1], [ip addr add dev ipip0 192.168.1.2/30 2>/dev/null], [0]) dnl Check the tunnel. -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 192.168.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 192.168.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) diff --git a/tests/system-tap.at b/tests/system-tap.at index 871a3bda4fc..3d84a53182c 100644 --- a/tests/system-tap.at +++ b/tests/system-tap.at @@ -22,7 +22,7 @@ AT_CHECK([ip netns exec at_ns1 ip link set dev tap1 up]) AT_CHECK([ip netns exec at_ns0 ip addr add 10.1.1.1/24 dev tap0]) AT_CHECK([ip netns exec at_ns1 ip addr add 10.1.1.2/24 dev tap1]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 1df2a541902..7ea45020289 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -10,13 +10,13 @@ ADD_NAMESPACES(at_ns0, at_ns1) 
ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -33,7 +33,7 @@ ADD_NAMESPACES(at_ns0, at_ns1) ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -56,13 +56,13 @@ ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") ADD_VLAN(p0, at_ns0, 100, "10.2.2.1/24") ADD_VLAN(p1, at_ns1, 100, "10.2.2.2/24") -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], 
[dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -88,13 +88,13 @@ ADD_CVLAN(p1.4094, at_ns1, 100, "10.2.2.2/24") OVS_WAIT_UNTIL([ip netns exec at_ns0 ping -c 1 10.2.2.2]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -116,13 +116,13 @@ dnl waiting, we get occasional failures due to the following error: dnl "connect: Cannot assign requested address" OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::2]) -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -W 2 fc00::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -W 2 fc00::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -147,13 +147,13 @@ dnl waiting, we get occasional failures due to the following error: dnl "connect: Cannot assign requested address" 
OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00:1::2]) -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00:1::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00:1::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -w 2 fc00:1::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -W 2 fc00:1::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00:1::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -W 2 fc00:1::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -179,13 +179,13 @@ ADD_CVLAN(p1.4094, at_ns1, 100, "fc00:1::2/96") OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00:1::2]) -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00:1::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00:1::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -w 2 fc00:1::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -W 2 fc00:1::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00:1::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -W 2 fc00:1::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -209,13 +209,13 @@ dnl waiting, we get occasional failures due to the following error: dnl "connect: Cannot assign requested address" OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::2]) -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], 
[ping6 -q -c 3 -i 0.3 -W 2 fc00::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -W 2 fc00::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -W 2 fc00::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -253,13 +253,13 @@ priority=0,actions=NORMAL AT_CHECK([ovs-ofctl del-flows br0]) AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00::3 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00::3 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -w 2 fc00::3 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -W 2 fc00::3 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00::3 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -W 2 fc00::3 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -278,13 +278,13 @@ ADD_VETH_BOND(p1 p2, at_ns1, br0, bond0, lacp=active bond_mode=balance-tcp, "10. 
OVS_WAIT_UNTIL([ip netns exec at_ns0 ping -c 1 10.1.1.2]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -315,18 +315,18 @@ ADD_NATIVE_TUNNEL([vxlan], [at_vxlan1], [at_ns0], [172.31.1.100], [10.1.1.1/24], [id 0 dstport 4789]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay with different packet sizes -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms 
]) @@ -363,17 +363,17 @@ ADD_VLAN(at_vxlan1, at_ns0, 100, "10.1.1.1/24") ADD_VLAN(p0, at_ns0, 42, "172.31.1.1/24") dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay with different packet sizes -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -406,18 +406,18 @@ ADD_NATIVE_TUNNEL6([vxlan], [at_vxlan1], [at_ns0], [fc00::100], [10.1.1.1/24], OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::100]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00::100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00::100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay with different packet sizes -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.100 | 
FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -448,18 +448,18 @@ ADD_OVS_TUNNEL([gre], [br0], [at_gre0], [172.31.1.1], [10.1.1.100/24]) ADD_NATIVE_TUNNEL([gretap], [ns_gre0], [at_ns0], [172.31.1.100], [10.1.1.1/24]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay with different packet sizes -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -495,12 +495,12 @@ ADD_NATIVE_TUNNEL6([ip6gretap], [ns_gretap0], [at_ns0], [fc00:100::100], OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 2 fc00:100::100]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00:100::100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], 
[ping6 -q -c 3 -i 0.3 -W 2 fc00:100::100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay with different packet sizes -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) OVS_TRAFFIC_VSWITCHD_STOP @@ -532,12 +532,12 @@ ADD_OVS_TUNNEL([erspan], [br0], [at_erspan0], [172.31.1.1], [10.1.1.100/24], [op ADD_NATIVE_TUNNEL([erspan], [ns_erspan0], [at_ns0], [172.31.1.100], [10.1.1.1/24], [seq key 1 erspan_ver 1 erspan 7]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay with different packet sizes -dnl NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +dnl NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl NS_CHECK_EXEC([at_ns0], [ping -s 1200 -i 0.3 -c 3 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -569,12 +569,12 @@ ADD_OVS_TUNNEL([erspan], [br0], [at_erspan0], [172.31.1.1], [10.1.1.100/24], [op ADD_NATIVE_TUNNEL([erspan], [ns_erspan0], [at_ns0], [172.31.1.100], [10.1.1.1/24], [seq key 1 erspan_ver 2 erspan_dir egress erspan_hwid 7]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay with different packet sizes -dnl NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | 
FORMAT_PING], [0], [dnl +dnl NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl NS_CHECK_EXEC([at_ns0], [ping -s 1200 -i 0.3 -c 3 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -610,12 +610,12 @@ ADD_NATIVE_TUNNEL6([ip6erspan], [ns_erspan0], [at_ns0], [fc00:100::100], OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 2 fc00:100::100]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00:100::100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00:100::100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay with different packet sizes -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) OVS_TRAFFIC_VSWITCHD_STOP @@ -651,12 +651,12 @@ ADD_NATIVE_TUNNEL6([ip6erspan], [ns_erspan0], [at_ns0], [fc00:100::100], OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 2 fc00:100::100]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00:100::100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00:100::100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay with different packet sizes -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) OVS_TRAFFIC_VSWITCHD_STOP @@ -686,18 +686,18 @@ ADD_NATIVE_TUNNEL([geneve], [ns_gnv0], [at_ns0], [172.31.1.100], [10.1.1.1/24], [vni 0]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | 
FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay with different packet sizes -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -739,12 +739,12 @@ ADD_NATIVE_TUNNEL([geneve], [ns_gnv0], [at_ns0], [172.31.1.100], [10.1.1.1/24], [vni 0]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl ping over tunnel should work -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -837,18 +837,18 @@ ADD_NATIVE_TUNNEL6([geneve], [ns_gnv0], [at_ns0], [fc00::100], [10.1.1.1/24], OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::100]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00::100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00::100 | FORMAT_PING], 
[0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay with different packet sizes -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -883,7 +883,7 @@ AT_CHECK([ovs-ofctl add-flow br0 "table=37,actions=at_gnv0"]) OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::100]) dnl First, check the underlay. -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00::100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00::100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -930,7 +930,7 @@ NETNS_DAEMONIZE([at_ns0], [tcpdump -n -i p0 dst host 172.31.1.1 -l > p0.pcap 2>/ sleep 1 dnl First, check the underlay. 
-NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -978,7 +978,7 @@ NETNS_DAEMONIZE([at_ns0], [tcpdump -n -x -i p0 dst host 172.31.1.1 -l > p0.pcap sleep 1 dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -1031,7 +1031,7 @@ NETNS_DAEMONIZE([at_ns0], [tcpdump -n -x -i p0 dst host 172.31.1.1 -l > p0.pcap sleep 1 dnl First, check the underlay. -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -1088,7 +1088,7 @@ NETNS_DAEMONIZE([at_ns0], [tcpdump -n -x -i p0 dst host fc00:100::1 -l > p0.pcap sleep 1 dnl First, check the underlay. -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00:100::100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00:100::100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -1144,7 +1144,7 @@ NETNS_DAEMONIZE([at_ns0], [tcpdump -n -x -i p0 dst host fc00:100::1 -l > p0.pcap sleep 1 dnl First, check the underlay. -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00:100::100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00:100::100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -1214,12 +1214,12 @@ dnl "connect: Cannot assign requested address" OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::100]) dnl First, check the underlay. 
-NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00::100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00::100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay. -NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -w 2 10.100.100.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -W 2 10.100.100.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -1276,12 +1276,12 @@ OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::100]) OVS_WAIT_UNTIL([ip netns exec at_ns1 ping6 -c 1 fc00:100::100]) dnl First, check the underlay. -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00::100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00::100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay. -NS_CHECK_EXEC([at_ns1], [ping6 -q -c 3 -i 0.3 -w 2 fc00:100::100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping6 -q -c 3 -i 0.3 -W 2 fc00:100::100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -1307,7 +1307,7 @@ priority=10 in_port=2,ip,actions=clone(mod_dl_src(ae:c6:7e:54:8d:4d),mod_dl_dst( AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -1348,11 +1348,11 @@ table=1,priority=10 actions=normal AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) AT_CHECK([ovs-ofctl add-flows br1 flows.txt]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 
10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -w 2 10.1.1.1 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -W 2 10.1.1.1 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -1387,11 +1387,11 @@ table=3,priority=10 actions=normal AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) AT_CHECK([ovs-ofctl add-flows br1 flows.txt]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -w 2 10.1.1.1 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -W 2 10.1.1.1 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) OVS_TRAFFIC_VSWITCHD_STOP @@ -2062,7 +2062,7 @@ add in_port=ovs-p1,actions=ovs-p0,br0 ]) AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ]) @@ -2081,7 +2081,7 @@ modify in_port=ovs-p1,actions=ovs-p0 AT_CHECK([ovs-ofctl add-flows br0 flows2.txt]) AT_CHECK([ovs-appctl revalidator/wait], [0]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ]) @@ -2096,7 +2096,7 @@ recirc_id(),in_port(3),eth_type(0x0800),ipv4(frag=no), packets:19, bytes AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) AT_CHECK([ovs-appctl revalidator/wait], [0]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl 
+NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ]) @@ -2563,7 +2563,7 @@ AT_CHECK([FLUSH_CMD zone=5 'ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2,ct_nw_proto=17 AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0]) dnl Test ICMP traffic -NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -w 2 10.1.1.1 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -W 2 10.1.1.1 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -2763,7 +2763,7 @@ priority=100,in_port=2,icmp,ct_state=+trk+est,action=1 AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) dnl Pings from ns0->ns1 should work fine. -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -2804,7 +2804,7 @@ priority=100,in_port=2,icmp,ct_state=+trk+est,action=1 AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) dnl Pings from ns0->ns1 should work fine. -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -2904,7 +2904,7 @@ NS_CHECK_EXEC([at_ns1], [ping6 -q -c 3 -i 0.3 -w 2 fc00::1 | FORMAT_PING], [0], ]) dnl Pings from ns0->ns1 should work fine. -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -3855,12 +3855,12 @@ dnl Modify userspace conntrack fragmentation handling. DPCTL_MODIFY_FRAGMENTATION() dnl Ipv4 fragmentation connectivity check. 
-NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Ipv4 larger fragmentation connectivity check. -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -3932,12 +3932,12 @@ dnl Modify userspace conntrack fragmentation handling. DPCTL_MODIFY_FRAGMENTATION() dnl Ipv4 fragmentation connectivity check. -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Ipv4 larger fragmentation connectivity check. -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -3978,22 +3978,22 @@ AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) OVS_WAIT_UNTIL([ip netns exec at_ns0 ping -c 1 10.2.2.2]) dnl Ipv4 fragmentation connectivity check. -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Ipv4 fragmentation connectivity check. 
(outer svlan) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.255.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.255.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Ipv4 larger fragmentation connectivity check. -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Ipv4 larger fragmentation connectivity check. (outer svlan) -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.255.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.255.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -4152,12 +4152,12 @@ dnl "connect: Cannot assign requested address" OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::2]) dnl Ipv6 fragmentation connectivity check. -NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -W 2 fc00::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Ipv6 larger fragmentation connectivity check. -NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -W 2 fc00::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -4234,12 +4234,12 @@ dnl "connect: Cannot assign requested address" OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::2]) dnl Ipv4 fragmentation connectivity check. 
-NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -w 2 fc00:1::4 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -W 2 fc00:1::4 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Ipv4 larger fragmentation connectivity check. -NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00:1::4 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -W 2 fc00:1::4 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -4277,22 +4277,22 @@ AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00:1::4]) dnl Ipv6 fragmentation connectivity check. -NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -w 2 fc00:1::4 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -W 2 fc00:1::4 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Ipv6 fragmentation connectivity check. (outer svlan) -NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -w 2 fc00:ffff::4 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -W 2 fc00:ffff::4 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Ipv6 larger fragmentation connectivity check. -NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00:1::4 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -W 2 fc00:1::4 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Ipv6 larger fragmentation connectivity check. 
(outer svlan) -NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00:ffff::4 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -W 2 fc00:ffff::4 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -4504,18 +4504,18 @@ ADD_NATIVE_TUNNEL([vxlan], [at_vxlan1], [at_ns0], [172.31.1.100], [10.1.1.1/24], [id 0 dstport 4789]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay with different packet sizes -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -4564,18 +4564,18 @@ dnl "connect: Cannot assign requested address" OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::2]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay with different packet sizes -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 
fc00::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -W 2 fc00::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -W 2 fc00::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -4688,7 +4688,7 @@ dnl The default udp_single and icmp_first timeouts are 30 seconds in dnl kernel DP, and 60 seconds in userspace DP. dnl Send ICMP and UDP traffic -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) @@ -4714,7 +4714,7 @@ done AT_CHECK([ovs-vsctl --may-exist add-zone-tp $DP_TYPE zone=5 udp_first=1 udp_single=1 icmp_first=1 icmp_reply=1]) dnl Send ICMP and UDP traffic -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) @@ -4732,7 +4732,7 @@ AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0], [dnl ]) dnl Re-send ICMP and UDP traffic to test conntrack 
cache -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) @@ -4753,7 +4753,7 @@ dnl Set the timeout policy to default again. AT_CHECK([ovs-vsctl del-zone-tp $DP_TYPE zone=5]) dnl Send ICMP and UDP traffic -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) @@ -5019,7 +5019,7 @@ table=2,in_port=1,ip,ct_state=+trk+est,ct_zone=2,action=LOCAL AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) -AT_CHECK([ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +AT_CHECK([ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -5090,7 +5090,7 @@ table=4,priority=100,ip,action=output:NXM_NX_REG0[[]] AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) -AT_CHECK([ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +AT_CHECK([ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -6872,7 +6872,7 @@ dnl waiting, we get occasional failures due to the following error: dnl "connect: Cannot assign requested address" OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::240]) -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00::240 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 
fc00::240 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -7605,12 +7605,12 @@ ADD_NATIVE_TUNNEL([geneve], [ns_gnv0], [at_ns0], [172.31.1.100], [10.1.1.1/24], [vni 0]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -7837,7 +7837,7 @@ dnl CVLAN traffic should match the flow and drop AT_CHECK([ovs-appctl revalidator/purge]) AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:vlan-limit=1]) AT_CHECK([ovs-ofctl add-flow br0 "priority=100 dl_type=0x8100 action=drop"]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 1 -w 3 10.2.2.2], [1], [ignore]) +NS_CHECK_EXEC([at_ns0], [ping -q -c 1 -W 3 10.2.2.2], [1], [ignore]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP @@ -7887,11 +7887,11 @@ AT_CHECK([ovs-ofctl --bundle add-flows br2 flows-customer-br.txt]) OVS_WAIT_UNTIL([ip netns exec at_ns0 ping -c 1 10.2.2.2]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -7943,11 +7943,11 @@ AT_CHECK([ovs-ofctl --bundle add-flows br2 flows-customer-br.txt]) OVS_WAIT_UNTIL([ip netns exec at_ns0 ping -c 1 10.2.2.2]) -NS_CHECK_EXEC([at_ns0], 
[ping -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -7995,24 +7995,24 @@ AT_CHECK([ovs-vsctl set port ovs-p2 vlan_mode=dot1q-tunnel tag=4094 cvlans=100,2 OVS_WAIT_UNTIL([ip netns exec at_ns0 ping -c 1 10.2.2.2]) OVS_WAIT_UNTIL([ip netns exec at_ns0 ping -c 1 10.3.2.2]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.3.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.3.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.3.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.3.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl CVLAN 300 is not permitted by dot1q-tunnel -NS_CHECK_EXEC([at_ns0], [ping -q -c 1 -w 3 10.4.2.2], [1], [ignore]) +NS_CHECK_EXEC([at_ns0], [ping -q -c 1 -W 3 10.4.2.2], [1], [ignore]) OVS_TRAFFIC_VSWITCHD_STOP(["/dropping VLAN \(0\|300\) packet received on dot1q-tunnel port/d"]) AT_CLEANUP @@ -8041,11 +8041,11 @@ AT_CHECK([ovs-ofctl 
--bundle add-flows br0 flows-br0.txt]) OVS_WAIT_UNTIL([ip netns exec at_ns0 ping -c 1 10.2.2.2]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) diff --git a/tests/system-userspace-packet-type-aware.at b/tests/system-userspace-packet-type-aware.at index 974304758f8..aac178edaf9 100644 --- a/tests/system-userspace-packet-type-aware.at +++ b/tests/system-userspace-packet-type-aware.at @@ -335,7 +335,7 @@ AT_CHECK([ # Ping between N1 and N3, via the L2 GRE tunnel between br-in1 and br-in3 -NS_CHECK_EXEC([ns1], [ping -q -c 3 -i 0.3 -w 2 $N3_IP | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([ns1], [ping -q -c 3 -i 0.3 -W 2 $N3_IP | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -363,7 +363,7 @@ AT_CHECK([ # Ping between N1 and N2, via the L2 GRE tunnel between br-in1 and br-in2 -NS_CHECK_EXEC([ns1], [ping -q -c 3 -i 0.3 -w 2 $N2_IP | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([ns1], [ping -q -c 3 -i 0.3 -W 2 $N2_IP | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -394,7 +394,7 @@ AT_CHECK([ # Ping between N3 and N2, via the L3 GRE tunnel between br-in3 and br-in2 -NS_CHECK_EXEC([ns3], [ping -q -c 3 -i 0.3 -w 2 $N1_IP | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([ns3], [ping -q -c 3 -i 0.3 -W 2 $N1_IP | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) From 594d145410c540ceea8acc1fd1a99067439c67b4 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 26 Oct 2023 19:55:04 +0200 Subject: [PATCH 411/833] readthedocs: Use dirhtml 
builder. We used this builder before, but from the project configuration on the website. ReadTheDocs doesn't allow to change it there anymore and it doesn't allow to see the full name of the previously used builder (!!), so I failed to migrate it to the config file. The result is that older link like: https://docs.openvswitch.org/en/latest/howto/dpdk/ Now require .html: https://docs.openvswitch.org/en/latest/howto/dpdk.html Fixing now by switching the builder back. Fixes: e388bd73b70d ("readthedocs: Add the configuration file.") Reported-by: Antonin Bas Reported-by: David Marchand Reported-at: https://github.com/openvswitch/ovs-issues/issues/310 Reviewed-by: Antonin Bas Acked-by: Eelco Chaudron Reviewed-by: David Marchand Signed-off-by: Ilya Maximets --- .readthedocs.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index e481e64f1fc..7d505150ecd 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -14,6 +14,7 @@ build: # Build documentation in the "Documentation/" directory with Sphinx. sphinx: configuration: Documentation/conf.py + builder: "dirhtml" # Build all formats: HTML, PDF, ePub. formats: all From e8914353cedee52b54e1831037fd68b061e81c07 Mon Sep 17 00:00:00 2001 From: Kevin Traynor Date: Mon, 23 Oct 2023 10:41:12 +0100 Subject: [PATCH 412/833] vswitch.xml: Add dpdkvhostuser group status. Add group for dpdkvhostuser(/client) netdev. Adding as a single group as they display the same status, one of which is 'mode' to indicate if it's client or server. 
Fixes: b2e8b12f8a82 ("netdev-dpdk: add vhost-user get_status.") Signed-off-by: Kevin Traynor Acked-by: Eelco Chaudron Signed-off-by: Simon Horman --- vswitchd/vswitch.xml | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 1e2a1267d4f..2c2bdfa57f7 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -3821,6 +3821,36 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \ supported by hardware. + + +

      + dpdkvhostuser and dpdkvhostuserclient + netdev specific interface status information. +

      + + client (connecting) or server (listening) in the socket + communication. + + + virtio features bitmap as per virtio specification. + + + The number of available virtqueues. + + + The numa id of the device and guest memory. + + + The path to the socket used for communication. + + + Status of connection to the device. + + + Each virtqueue will have it's size reported, where n is the + virtqueue number from 0..(num_of_vrings-1). + +
      From 2c841eef95b18d104a4ed37065f1492bde5c27b8 Mon Sep 17 00:00:00 2001 From: Kevin Traynor Date: Mon, 23 Oct 2023 10:41:13 +0100 Subject: [PATCH 413/833] vswitch.xml: Add entry for dpdkvhostuser userspace-tso. get_status for dpdkvhostuser(/client) netdev class may display userspace-tso status. Fixes: a5669fd51c9b ("netdev-dpdk: Drop TSO in case of conflicting virtio features.") Signed-off-by: Kevin Traynor Acked-by: Eelco Chaudron Signed-off-by: Simon Horman --- vswitchd/vswitch.xml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 2c2bdfa57f7..e400043ce7f 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -3850,6 +3850,9 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \ Each virtqueue will have it's size reported, where n is the virtqueue number from 0..(num_of_vrings-1). + + Whether userspace-tso is enabled or disabled. + From fea52dc7170ca499a0da47dc3f0d5ff4b7b1a7a7 Mon Sep 17 00:00:00 2001 From: Jakob Meng Date: Mon, 30 Oct 2023 10:02:59 +0100 Subject: [PATCH 414/833] python: Remove duplicate UnixctlClient implementation. The unixctl implementation in Python has been split into three parts in the past. During this process the UnixctlClient was duplicated, in python/ovs/unixctl/client.py and python/ovs/unixctl/server.py. This patch removes the duplicate from the latter. 
Fixes: 53cf9963ccc6 ("python: Break unixctl implementation into registry, client, and server.") Signed-off-by: Jakob Meng Acked-by: Eelco Chaudron Signed-off-by: Simon Horman --- python/ovs/unixctl/server.py | 44 ------------------------------------ 1 file changed, 44 deletions(-) diff --git a/python/ovs/unixctl/server.py b/python/ovs/unixctl/server.py index 5f9b3e7393b..b9cb52fadd3 100644 --- a/python/ovs/unixctl/server.py +++ b/python/ovs/unixctl/server.py @@ -211,47 +211,3 @@ def create(path, version=None): version) return 0, UnixctlServer(listener) - - -class UnixctlClient(object): - def __init__(self, conn): - assert isinstance(conn, ovs.jsonrpc.Connection) - self._conn = conn - - def transact(self, command, argv): - assert isinstance(command, str) - assert isinstance(argv, list) - for arg in argv: - assert isinstance(arg, str) - - request = Message.create_request(command, argv) - error, reply = self._conn.transact_block(request) - - if error: - vlog.warn("error communicating with %s: %s" - % (self._conn.name, os.strerror(error))) - return error, None, None - - if reply.error is not None: - return 0, str(reply.error), None - else: - assert reply.result is not None - return 0, None, str(reply.result) - - def close(self): - self._conn.close() - self.conn = None - - @staticmethod - def create(path): - assert isinstance(path, str) - - unix = "unix:%s" % ovs.util.abs_file_name(ovs.dirs.RUNDIR, path) - error, stream = ovs.stream.Stream.open_block( - ovs.stream.Stream.open(unix)) - - if error: - vlog.warn("failed to connect to %s" % path) - return error, None - - return 0, UnixctlClient(ovs.jsonrpc.Connection(stream)) From 49096a0cf1f2be6d5509bf61eb88fdfbd5794150 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Mon, 23 Oct 2023 16:22:16 +0200 Subject: [PATCH 415/833] general: Fix Clang's static analyzer 'Dead initialization' warnings. 
Acked-by: Simon Horman Signed-off-by: Eelco Chaudron Acked-by: Ilya Maximets Signed-off-by: Simon Horman --- lib/meta-flow.c | 4 ++-- lib/ofp-actions.c | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/lib/meta-flow.c b/lib/meta-flow.c index 474344194fa..aa7cf1fcbbd 100644 --- a/lib/meta-flow.c +++ b/lib/meta-flow.c @@ -2751,8 +2751,8 @@ static char * mf_from_integer_string(const struct mf_field *mf, const char *s, uint8_t *valuep, uint8_t *maskp) { + const char *err_str; char *tail; - const char *err_str = ""; int err; err = parse_int_string(s, valuep, mf->n_bytes, &tail); @@ -2785,8 +2785,8 @@ mf_from_integer_string(const struct mf_field *mf, const char *s, static char * mf_from_packet_type_string(const char *s, ovs_be32 *packet_type) { + const char *err_str; char *tail; - const char *err_str = ""; int err; if (*s != '(') { diff --git a/lib/ofp-actions.c b/lib/ofp-actions.c index d7e5f542a04..da7b1dd31ae 100644 --- a/lib/ofp-actions.c +++ b/lib/ofp-actions.c @@ -4230,10 +4230,12 @@ encode_DELETE_FIELD(const struct ofpact_delete_field *delete_field, enum ofp_version ofp_version OVS_UNUSED, struct ofpbuf *out) { - struct nx_action_delete_field *nadf = put_NXAST_DELETE_FIELD(out); - size_t size = out->size; + size_t size; - out->size = size - sizeof nadf->pad; + put_NXAST_DELETE_FIELD(out); + size = out->size; + + out->size = size - MEMBER_SIZEOF(struct nx_action_delete_field, pad); nx_put_mff_header(out, delete_field->field, 0, false); out->size = size; } From 5b6021957b9a11e3fa4decaa030b62023043c7d3 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Mon, 23 Oct 2023 16:22:24 +0200 Subject: [PATCH 416/833] general: Fix Clang's static analyzer 'Dead assignment' warnings. This patch addresses a 'Dead assignment' warning by designating the variable as OVS_UNUSED. We opted for this approach instead of comparing it to the sizeof(struct ...) method because of concerns related to code clarity. 
Signed-off-by: Eelco Chaudron Acked-by: Ilya Maximets Signed-off-by: Simon Horman --- lib/ofp-monitor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/ofp-monitor.c b/lib/ofp-monitor.c index c27733a5264..29b0c5965c7 100644 --- a/lib/ofp-monitor.c +++ b/lib/ofp-monitor.c @@ -962,7 +962,7 @@ ofputil_decode_flow_update(struct ofputil_flow_update *update, return 0; } else if (update->event == OFPFME_PAUSED || update->event == OFPFME_RESUMED) { - struct ofp_flow_update_paused *ofup; + struct ofp_flow_update_paused *ofup OVS_UNUSED; if (length != sizeof *ofup) { goto bad_len; From 03c8e8010e78f5596f8efd7a119c6c04c9518cc2 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Mon, 23 Oct 2023 16:22:32 +0200 Subject: [PATCH 417/833] ofp-table: Fix count_common_prefix_run() function. It appears that an issue existed in the count_common_prefix_run() function from the beginning. This problem came to light while addressing 'Dead assignment' warnings identified by the Clang static analyzer. Instead of updating the extra_prefix_len with the current (next) value, the next value was inadvertently updated with extra_prefix_len. This patch rectifies this behavior. Fixes: 95a5454c5110 ("ofp-print: Abbreviate lists of fields in table features output.") Signed-off-by: Eelco Chaudron Acked-by: Ilya Maximets Signed-off-by: Simon Horman --- lib/ofp-table.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/ofp-table.c b/lib/ofp-table.c index a956754f2d5..f9bd3b7f9c8 100644 --- a/lib/ofp-table.c +++ b/lib/ofp-table.c @@ -1416,7 +1416,7 @@ count_common_prefix_run(const char *ids[], size_t n, if (!next) { break; } else if (next < extra_prefix_len) { - next = extra_prefix_len; + extra_prefix_len = next; } i++; } From 979bc94b1b7568e5f08e2360c6e3c490f4042020 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Mon, 23 Oct 2023 16:22:39 +0200 Subject: [PATCH 418/833] ovsdb: Fix Clang's static analyzer 'func null dereference' warnings. 
In the existing code, there is no path that would result in a crash. Therefore, this code is currently implemented to satisfy Clang's requirements. Nevertheless, it serves the additional purpose of preventing issues with potential new use cases of the ovsdb_mutation_set_execute() API. Signed-off-by: Eelco Chaudron Acked-by: Ilya Maximets Signed-off-by: Simon Horman --- ovsdb/mutation.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/ovsdb/mutation.c b/ovsdb/mutation.c index cbc71bc4944..79456001917 100644 --- a/ovsdb/mutation.c +++ b/ovsdb/mutation.c @@ -236,7 +236,8 @@ ovsdb_mutation_set_destroy(struct ovsdb_mutation_set *set) enum ovsdb_mutation_scalar_error { ME_OK, ME_DOM, - ME_RANGE + ME_RANGE, + ME_NOTSUP }; struct ovsdb_scalar_mutation { @@ -267,6 +268,9 @@ ovsdb_mutation_scalar_error(enum ovsdb_mutation_scalar_error error, "Result of \"%s\" operation is out of range.", ovsdb_mutator_to_string(mutator)); + case ME_NOTSUP: + return ovsdb_error(NULL, "Operation not supported."); + default: return OVSDB_BUG("unexpected error"); } @@ -514,6 +518,12 @@ div_double(double *x, double y) } } +static int +mod_double(double *x OVS_UNUSED, double y OVS_UNUSED) +{ + return ME_NOTSUP; +} + static const struct ovsdb_scalar_mutation add_mutation = { add_int, add_double, OVSDB_M_ADD }; @@ -531,5 +541,5 @@ static const struct ovsdb_scalar_mutation div_mutation = { }; static const struct ovsdb_scalar_mutation mod_mutation = { - mod_int, NULL, OVSDB_M_MOD + mod_int, mod_double, OVSDB_M_MOD }; From 08212d755ec002e28856c301e9fe3a044bcea450 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Mon, 23 Oct 2023 16:22:47 +0200 Subject: [PATCH 419/833] netdev-offload: Fix Clang's static analyzer 'Division by zero' warnings. When enabling DPDK with the configuration below, ovs-vswitchd will crash. ovs-vsctl set Open_vSwitch .
other_config:hw-offload=true This issue arises because setting the 'n-offload-threads' value to zero is not a supported configuration. This fix addresses this by implementing a check to ensure a valid 'n-offload-threads' value, both during configuration and statistics gathering. Fixes: 62c2d8a67543 ("netdev-offload: Add multi-thread API.") Signed-off-by: Eelco Chaudron Acked-by: Ilya Maximets Signed-off-by: Simon Horman --- lib/dpif-netdev.c | 4 ++++ lib/netdev-offload.c | 3 ++- tests/test-id-fpool.c | 2 +- tests/test-mpsc-queue.c | 2 +- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 157694bcf0e..b8f065d1d77 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -4748,6 +4748,10 @@ dpif_netdev_offload_stats_get(struct dpif *dpif, } nb_thread = netdev_offload_thread_nb(); + if (!nb_thread) { + return EINVAL; + } + /* nb_thread counters for the overall total as well. */ stats->size = ARRAY_SIZE(hwol_stats) * (nb_thread + 1); stats->counters = xcalloc(stats->size, sizeof *stats->counters); diff --git a/lib/netdev-offload.c b/lib/netdev-offload.c index a5fa6248754..931d634e15a 100644 --- a/lib/netdev-offload.c +++ b/lib/netdev-offload.c @@ -872,7 +872,8 @@ netdev_set_flow_api_enabled(const struct smap *ovs_other_config) offload_thread_nb = smap_get_ullong(ovs_other_config, "n-offload-threads", DEFAULT_OFFLOAD_THREAD_NB); - if (offload_thread_nb > MAX_OFFLOAD_THREAD_NB) { + if (offload_thread_nb == 0 || + offload_thread_nb > MAX_OFFLOAD_THREAD_NB) { VLOG_WARN("netdev: Invalid number of threads requested: %u", offload_thread_nb); offload_thread_nb = DEFAULT_OFFLOAD_THREAD_NB; diff --git a/tests/test-id-fpool.c b/tests/test-id-fpool.c index 27800aa9bad..7bdb8154d3c 100644 --- a/tests/test-id-fpool.c +++ b/tests/test-id-fpool.c @@ -237,7 +237,7 @@ print_result(const char *prefix) for (i = 0; i < n_threads; i++) { avg += thread_working_ms[i]; } - avg /= n_threads; + avg /= n_threads ? 
n_threads : 1; printf("%s: ", prefix); for (i = 0; i < n_threads; i++) { if (thread_working_ms[i] >= TIMEOUT_MS) { diff --git a/tests/test-mpsc-queue.c b/tests/test-mpsc-queue.c index 16aa804a034..86a223caffa 100644 --- a/tests/test-mpsc-queue.c +++ b/tests/test-mpsc-queue.c @@ -315,7 +315,7 @@ print_result(const char *prefix, int reader_elapsed) for (i = 0; i < n_threads; i++) { avg += thread_working_ms[i]; } - avg /= n_threads; + avg /= n_threads ? n_threads : 1; printf("%s: %6d", prefix, reader_elapsed); for (i = 0; i < n_threads; i++) { printf(" %6" PRIu64, thread_working_ms[i]); From 723cd4c9be4a06b87909a6b157544b530afc6954 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 30 Oct 2023 21:10:44 +0100 Subject: [PATCH 420/833] automake: Move build-aux EXTRA_DIST updates to their own file. Otherwise it's hard to keep track of all the scripts we have. Acked-by: Eelco Chaudron Reviewed-By: Ihar Hrachyshka Signed-off-by: Ilya Maximets --- build-aux/automake.mk | 10 +++++++++- datapath-windows/include/automake.mk | 2 -- include/automake.mk | 1 - include/openflow/automake.mk | 3 --- lib/automake.mk | 4 ---- 5 files changed, 9 insertions(+), 11 deletions(-) diff --git a/build-aux/automake.mk b/build-aux/automake.mk index b9a77a51cfe..8d7e8ae1904 100644 --- a/build-aux/automake.mk +++ b/build-aux/automake.mk @@ -1,11 +1,19 @@ EXTRA_DIST += \ build-aux/calculate-schema-cksum \ build-aux/cccl \ + build-aux/check-structs \ build-aux/cksum-schema-check \ build-aux/dist-docs \ build-aux/dpdkstrip.py \ - build-aux/generate-dhparams-c \ + build-aux/extract-odp-netlink-h \ + build-aux/extract-odp-netlink-macros-h \ + build-aux/extract-odp-netlink-windows-dp-h \ + build-aux/extract-ofp-actions \ + build-aux/extract-ofp-errors \ + build-aux/extract-ofp-fields \ + build-aux/extract-ofp-msgs \ build-aux/gen_ofp_field_decoders \ + build-aux/generate-dhparams-c \ build-aux/initial-tab-allowed-files \ build-aux/sodepends.py \ build-aux/soexpand.py \ diff --git 
a/datapath-windows/include/automake.mk b/datapath-windows/include/automake.mk index a354f007fd2..185a06b03ef 100644 --- a/datapath-windows/include/automake.mk +++ b/datapath-windows/include/automake.mk @@ -7,6 +7,4 @@ $(srcdir)/datapath-windows/include/OvsDpInterface.h: \ build-aux/extract-odp-netlink-windows-dp-h $(AM_V_GEN)sed -f $(srcdir)/build-aux/extract-odp-netlink-windows-dp-h < $< > $@ -EXTRA_DIST += $(srcdir)/build-aux/extract-odp-netlink-windows-dp-h - CLEANFILES += $(srcdir)/datapath-windows/include/OvsDpInterface.h diff --git a/include/automake.mk b/include/automake.mk index 1e3390ae0d9..a276c680b53 100644 --- a/include/automake.mk +++ b/include/automake.mk @@ -8,7 +8,6 @@ include/odp-netlink-macros.h: include/odp-netlink.h \ build-aux/extract-odp-netlink-macros-h $(AM_V_GEN)sh -f $(srcdir)/build-aux/extract-odp-netlink-macros-h $< > $@ -EXTRA_DIST += build-aux/extract-odp-netlink-h build-aux/extract-odp-netlink-macros-h CLEANFILES += include/odp-netlink.h include/odp-netlink-macros.h include include/openflow/automake.mk diff --git a/include/openflow/automake.mk b/include/openflow/automake.mk index a1d75756c9d..820c09f84bd 100644 --- a/include/openflow/automake.mk +++ b/include/openflow/automake.mk @@ -22,6 +22,3 @@ HSTAMP_FILES = $(openflowinclude_HEADERS:.h=.hstamp) CLEANFILES += $(HSTAMP_FILES) ALL_LOCAL += $(HSTAMP_FILES) $(HSTAMP_FILES): build-aux/check-structs $(openflowinclude_HEADERS) - -EXTRA_DIST += build-aux/check-structs - diff --git a/lib/automake.mk b/lib/automake.mk index 24b0ffefee0..1be13a420a7 100644 --- a/lib/automake.mk +++ b/lib/automake.mk @@ -642,7 +642,6 @@ lib/nx-match.inc: $(srcdir)/build-aux/extract-ofp-fields include/openvswitch/met $(AM_V_at)mv $@.tmp $@ lib/nx-match.lo: lib/nx-match.inc CLEANFILES += lib/meta-flow.inc lib/nx-match.inc -EXTRA_DIST += build-aux/extract-ofp-fields lib/ofp-actions.inc1: $(srcdir)/build-aux/extract-ofp-actions lib/ofp-actions.c $(AM_V_GEN)$(run_python) $< prototypes $(srcdir)/lib/ofp-actions.c > 
$@.tmp && mv $@.tmp $@ @@ -650,7 +649,6 @@ lib/ofp-actions.inc2: $(srcdir)/build-aux/extract-ofp-actions lib/ofp-actions.c $(AM_V_GEN)$(run_python) $< definitions $(srcdir)/lib/ofp-actions.c > $@.tmp && mv $@.tmp $@ lib/ofp-actions.lo: lib/ofp-actions.inc1 lib/ofp-actions.inc2 CLEANFILES += lib/ofp-actions.inc1 lib/ofp-actions.inc2 -EXTRA_DIST += build-aux/extract-ofp-actions lib/ofp-errors.inc: include/openvswitch/ofp-errors.h include/openflow/openflow-common.h \ $(srcdir)/build-aux/extract-ofp-errors @@ -660,14 +658,12 @@ lib/ofp-errors.inc: include/openvswitch/ofp-errors.h include/openflow/openflow-c mv $@.tmp $@ lib/ofp-errors.lo: lib/ofp-errors.inc CLEANFILES += lib/ofp-errors.inc -EXTRA_DIST += build-aux/extract-ofp-errors lib/ofp-msgs.inc: include/openvswitch/ofp-msgs.h $(srcdir)/build-aux/extract-ofp-msgs $(AM_V_GEN)$(run_python) $(srcdir)/build-aux/extract-ofp-msgs \ $(srcdir)/include/openvswitch/ofp-msgs.h $@ > $@.tmp && mv $@.tmp $@ lib/ofp-msgs.lo: lib/ofp-msgs.inc CLEANFILES += lib/ofp-msgs.inc -EXTRA_DIST += build-aux/extract-ofp-msgs # _server IDL OVSIDL_BUILT += lib/ovsdb-server-idl.c lib/ovsdb-server-idl.h lib/ovsdb-server-idl.ovsidl From 51fb99290421e8c0b3bcae610fff82d347541fdb Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 30 Oct 2023 21:10:45 +0100 Subject: [PATCH 421/833] build-aux/extract-ofp-actions: Fix flake8 and syntax errors. A few general style issues like extra spacing and lines being too long. Also, unused variables 'error_types' and 'comments'. And a few invalid escape sequences, which are not actual escape sequences, but cause actual syntax warnings starting python 3.12 and will eventually become syntax errors [1]: extract-ofp-actions:122: SyntaxWarning: invalid escape sequence '\[' comment = re.sub('\[[^]]*\]', '', comment) extract-ofp-actions:125: SyntaxWarning: invalid escape sequence '\s' m = re.match('([^:]+):\s+(.*)$', comment) These are fixed by converting to raw strings. 
[1] https://docs.python.org/3/reference/lexical_analysis.html#escape-sequences Acked-by: Eelco Chaudron Reviewed-By: Ihar Hrachyshka Signed-off-by: Ilya Maximets --- build-aux/extract-ofp-actions | 108 ++++++++++++++++++++-------------- 1 file changed, 64 insertions(+), 44 deletions(-) diff --git a/build-aux/extract-ofp-actions b/build-aux/extract-ofp-actions index 0aa6c65f316..cc5c1dbb062 100755 --- a/build-aux/extract-ofp-actions +++ b/build-aux/extract-ofp-actions @@ -17,27 +17,30 @@ version_map = {"1.0": 0x01, version_reverse_map = dict((v, k) for (k, v) in version_map.items()) # Map from vendor name to the length of the action header. -vendor_map = {"OF": (0x00000000, 4), +vendor_map = {"OF": (0x00000000, 4), "ONF": (0x4f4e4600, 10), "NX": (0x00002320, 10)} # Basic types used in action arguments. -types = {} -types['uint8_t'] = {"size": 1, "align": 1, "ntoh": None, "hton": None} -types['ovs_be16'] = {"size": 2, "align": 2, "ntoh": "ntohs", "hton": "htons"} -types['ovs_be32'] = {"size": 4, "align": 4, "ntoh": "ntohl", "hton": "htonl"} -types['ovs_be64'] = {"size": 8, "align": 8, "ntoh": "ntohll", "hton": "htonll"} -types['uint16_t'] = {"size": 2, "align": 2, "ntoh": None, "hton": None} -types['uint32_t'] = {"size": 4, "align": 4, "ntoh": None, "hton": None} -types['uint64_t'] = {"size": 8, "align": 8, "ntoh": None, "hton": None} +types = { + "uint8_t" : {"size": 1, "align": 1, "ntoh": None, "hton": None}, + "ovs_be16": {"size": 2, "align": 2, "ntoh": "ntohs", "hton": "htons"}, + "ovs_be32": {"size": 4, "align": 4, "ntoh": "ntohl", "hton": "htonl"}, + "ovs_be64": {"size": 8, "align": 8, "ntoh": "ntohll", "hton": "htonll"}, + "uint16_t": {"size": 2, "align": 2, "ntoh": None, "hton": None}, + "uint32_t": {"size": 4, "align": 4, "ntoh": None, "hton": None}, + "uint64_t": {"size": 8, "align": 8, "ntoh": None, "hton": None}, +} line = "" - +n_errors = 0 arg_structs = set() + def round_up(x, y): return int((x + (y - 1)) / y) * y + def open_file(fn): global file_name 
global input_file @@ -46,6 +49,7 @@ def open_file(fn): input_file = open(file_name) line_number = 0 + def get_line(): global input_file global line @@ -56,16 +60,18 @@ def get_line(): fatal("unexpected end of input") return line -n_errors = 0 + def error(msg): global n_errors sys.stderr.write("%s:%d: %s\n" % (file_name, line_number, msg)) n_errors += 1 + def fatal(msg): error(msg) sys.exit(1) + def usage(): argv0 = os.path.basename(sys.argv[0]) print('''\ @@ -84,10 +90,8 @@ Commands: ''' % {"argv0": argv0}) sys.exit(0) -def extract_ofp_actions(fn, definitions): - error_types = {} - comments = [] +def extract_ofp_actions(fn, definitions): names = [] domain = {} for code, size in vendor_map.values(): @@ -100,14 +104,14 @@ def extract_ofp_actions(fn, definitions): while True: get_line() - if re.match('enum ofp_raw_action_type {', line): + if re.match(r'enum ofp_raw_action_type {', line): break while True: get_line() if line.startswith('/*') or not line or line.isspace(): continue - elif re.match('}', line): + elif re.match(r'}', line): break if not line.lstrip().startswith('/*'): @@ -119,10 +123,10 @@ def extract_ofp_actions(fn, definitions): if line.startswith('/*') or not line or line.isspace(): fatal("unexpected syntax within action") comment += ' %s' % line.lstrip('* \t').rstrip(' \t\r\n') - comment = re.sub('\[[^]]*\]', '', comment) + comment = re.sub(r'\[[^]]*\]', '', comment) comment = comment[:-2].rstrip() - m = re.match('([^:]+):\s+(.*)$', comment) + m = re.match(r'([^:]+):\s+(.*)$', comment) if not m: fatal("unexpected syntax between actions") @@ -147,7 +151,9 @@ def extract_ofp_actions(fn, definitions): names.append(enum) for dst in dsts.split(', '): - m = re.match(r'([A-Z]+)([0-9.]+)(\+|-[0-9.]+)?(?:\((\d+)\))(?: is deprecated \(([^)]+)\))?$', dst) + m = re.match( + r'([A-Z]+)([0-9.]+)(\+|-[0-9.]+)?(?:\((\d+)\))(?:' + r' is deprecated \(([^)]+)\))?$', dst) if not m: fatal("%r: syntax error in destination" % dst) vendor_name = m.group(1) @@ -220,18 +226,18 
@@ def extract_ofp_actions(fn, definitions): else: max_length = min_length - info = {"enum": enum, # 0 - "deprecation": deprecation, # 1 - "file_name": file_name, # 2 - "line_number": line_number, # 3 - "min_length": min_length, # 4 - "max_length": max_length, # 5 - "arg_ofs": arg_ofs, # 6 - "arg_len": arg_len, # 7 - "base_argtype": base_argtype, # 8 - "arg_vl_mff_map": arg_vl_mff_map, # 9 - "version": version, # 10 - "type": type_} # 11 + info = {"enum": enum, # 0 + "deprecation": deprecation, # 1 + "file_name": file_name, # 2 + "line_number": line_number, # 3 + "min_length": min_length, # 4 + "max_length": max_length, # 5 + "arg_ofs": arg_ofs, # 6 + "arg_len": arg_len, # 7 + "base_argtype": base_argtype, # 8 + "arg_vl_mff_map": arg_vl_mff_map, # 9 + "version": version, # 10 + "type": type_} # 11 domain[vendor][type_][version] = info enums.setdefault(enum, []) @@ -247,9 +253,13 @@ def extract_ofp_actions(fn, definitions): """) if definitions: - print("/* Verify that structs used as actions are reasonable sizes. */") + print( + "/* Verify that structs used as actions are reasonable sizes. 
*/" + ) for s in sorted(arg_structs): - print("BUILD_ASSERT_DECL(sizeof(%s) %% OFP_ACTION_ALIGN == 0);" % s) + print( + "BUILD_ASSERT_DECL(sizeof(%s) %% OFP_ACTION_ALIGN == 0);" % s + ) print("\nstatic struct ofpact_raw_instance all_raw_instances[] = {") for vendor in domain: @@ -265,9 +275,11 @@ def extract_ofp_actions(fn, definitions): print(" %s," % d["max_length"]) print(" %s," % d["arg_ofs"]) print(" %s," % d["arg_len"]) - print(" \"%s\"," % re.sub('_RAW[0-9]*', '', d["enum"], 1)) + print(" \"%s\"," + % re.sub(r'_RAW[0-9]*', '', d["enum"], 1)) if d["deprecation"]: - print(" \"%s\"," % re.sub(r'(["\\])', r'\\\1', d["deprecation"])) + print(" \"%s\"," + % re.sub(r'(["\\])', r'\\\1', d["deprecation"])) else: print(" NULL,") print(" },") @@ -286,10 +298,11 @@ def extract_ofp_actions(fn, definitions): decl = "static inline " if base_argtype.startswith('struct'): - decl += "%s *" %base_argtype + decl += "%s *" % base_argtype else: decl += "void" - decl += "\nput_%s(struct ofpbuf *openflow" % versions[0]["enum"].replace('_RAW', '', 1) + decl += "\nput_%s(struct ofpbuf *openflow" \ + % versions[0]["enum"].replace('_RAW', '', 1) if need_ofp_version: decl += ", enum ofp_version version" if base_argtype != 'void' and not base_argtype.startswith('struct'): @@ -348,9 +361,13 @@ ofpact_decode(const struct ofp_action_header *a, enum ofp_raw_action_type raw, else: arg = "arg" if arg_vl_mff_map: - print(" return decode_%s(%s, version, vl_mff_map, tlv_bitmap, out);" % (enum, arg)) + print( + " return decode_%s(%s," % (enum, arg), + "version, vl_mff_map, tlv_bitmap, out);" + ) else: - print(" return decode_%s(%s, version, out);" % (enum, arg)) + print(" return decode_%s(%s, version, out);" + % (enum, arg)) print("") print("""\ default: @@ -366,7 +383,8 @@ ofpact_decode(const struct ofp_action_header *a, enum ofp_raw_action_type raw, arg_vl_mff_map = versions[0]["arg_vl_mff_map"] if base_argtype != 'void': if base_argtype.startswith('struct'): - prototype += "const %s *, enum 
ofp_version, " % base_argtype + prototype += "const %s *, " % base_argtype + prototype += "enum ofp_version, " else: prototype += "%s, enum ofp_version, " % base_argtype if arg_vl_mff_map: @@ -378,13 +396,15 @@ ofpact_decode(const struct ofp_action_header *a, enum ofp_raw_action_type raw, static enum ofperr ofpact_decode(const struct ofp_action_header *, enum ofp_raw_action_type raw, enum ofp_version version, - uint64_t arg, const struct vl_mff_map *vl_mff_map, + uint64_t arg, + const struct vl_mff_map *vl_mff_map, uint64_t *tlv_bitmap, struct ofpbuf *out); """) + -## ------------ ## -## Main Program ## -## ------------ ## +# ------------ # +# Main Program # +# ------------ # if __name__ == '__main__': argv0 = sys.argv[0] From 6625f6f2f2c37e932744bc3aaae6e690befcc0a5 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 30 Oct 2023 21:10:46 +0100 Subject: [PATCH 422/833] build-aux/extract-ofp-errors: Fix flake8 and syntax errors. A few general style issues like extra spacing and lines being too long, unused variable 'error_types', passing more arguments than a format string has. And a few invalid escape sequences, which are not actual escape sequences, but cause actual syntax warnings starting python 3.12 and will eventually become syntax errors [1]: extract-ofp-errors:244: SyntaxWarning: invalid escape sequence '\.' m = re.match('Expected: (.*)\.$', comment) extract-ofp-errors:249: SyntaxWarning: invalid escape sequence '\.' m = re.match('((?:.(?!\. ))+.)\.\s+(.*)$', comment) extract-ofp-errors:256: SyntaxWarning: invalid escape sequence '\s' m = re.match('\s+(?:OFPERR_([A-Z0-9_]+))(\s*=\s*OFPERR_OFS)?,', extract-ofp-errors:265: SyntaxWarning: invalid escape sequence '\[' comments.append(re.sub('\[[^]]*\]', '', comment)) These are fixed by converting to raw strings. 
[1] https://docs.python.org/3/reference/lexical_analysis.html#escape-sequences Acked-by: Eelco Chaudron Reviewed-By: Ihar Hrachyshka Signed-off-by: Ilya Maximets --- build-aux/extract-ofp-errors | 101 +++++++++++++++++++++-------------- 1 file changed, 62 insertions(+), 39 deletions(-) diff --git a/build-aux/extract-ofp-errors b/build-aux/extract-ofp-errors index 2c3fbfc881b..eeefccbee05 100755 --- a/build-aux/extract-ofp-errors +++ b/build-aux/extract-ofp-errors @@ -22,6 +22,9 @@ tokenRe = "#?" + idRe + "|[0-9]+|." inComment = False inDirective = False +n_errors = 0 + + def open_file(fn): global fileName global inputFile @@ -30,6 +33,7 @@ def open_file(fn): inputFile = open(fileName) lineNumber = 0 + def tryGetLine(): global inputFile global line @@ -38,10 +42,12 @@ def tryGetLine(): lineNumber += 1 return line != "" + def getLine(): if not tryGetLine(): fatal("unexpected end of input") + def getToken(): global token global line @@ -82,37 +88,43 @@ def getToken(): line = line[:-2] + inputFile.readline() lineNumber += 1 if line == "": - if token == None: + if token is None: fatal("unexpected end of input") token = None return False -n_errors = 0 + def error(msg): global n_errors sys.stderr.write("%s:%d: %s\n" % (fileName, lineNumber, msg)) n_errors += 1 + def fatal(msg): error(msg) sys.exit(1) + def skipDirective(): getToken() while token != '$': getToken() + def isId(s): - return re.match(idRe + "$", s) != None + return re.match(idRe + "$", s) is not None + def forceId(): if not isId(token): fatal("identifier expected") + def forceInteger(): - if not re.match('[0-9]+$', token): + if not re.match(r'[0-9]+$', token): fatal("integer expected") + def match(t): if token == t: getToken() @@ -120,10 +132,12 @@ def match(t): else: return False + def forceMatch(t): if not match(t): fatal("%s expected" % t) + def parseTaggedName(): assert token in ('struct', 'union') name = token @@ -133,26 +147,26 @@ def parseTaggedName(): getToken() return name + def print_enum(tag, 
constants, storage_class): - print (""" + print(""" %(storage_class)sconst char * %(tag)s_to_string(uint16_t value) { switch (value) {\ """ % {"tag": tag, - "bufferlen": len(tag) + 32, "storage_class": storage_class}) for constant in constants: - print (" case %s: return \"%s\";" % (constant, constant)) - print ("""\ + print(" case %s: return \"%s\";" % (constant, constant)) + print("""\ } return NULL; -}\ -""" % {"tag": tag}) +}""") + def usage(): argv0 = os.path.basename(sys.argv[0]) - print ('''\ + print('''\ %(argv0)s, for extracting OpenFlow error codes from header files usage: %(argv0)s ERROR_HEADER VENDOR_HEADER @@ -167,6 +181,7 @@ The output is suitable for use as lib/ofp-errors.inc.\ ''' % {"argv0": argv0}) sys.exit(0) + def extract_vendor_ids(fn): global vendor_map vendor_map = {} @@ -174,7 +189,10 @@ def extract_vendor_ids(fn): open_file(fn) while tryGetLine(): - m = re.match(r'#define\s+([A-Z0-9_]+)_VENDOR_ID\s+(0x[0-9a-fA-F]+|[0-9]+)', line) + m = re.match( + r'#define\s+([A-Z0-9_]+)_VENDOR_ID\s+(0x[0-9a-fA-F]+|[0-9]+)', + line + ) if not m: continue @@ -202,9 +220,8 @@ def extract_vendor_ids(fn): % (id_, vendor_reverse_map[id_], name)) vendor_reverse_map[id_] = name -def extract_ofp_errors(fn): - error_types = {} +def extract_ofp_errors(fn): comments = [] names = [] domain = {} @@ -220,14 +237,14 @@ def extract_ofp_errors(fn): while True: getLine() - if re.match('enum ofperr', line): + if re.match(r'enum ofperr', line): break while True: getLine() if line.startswith('/*') or not line or line.isspace(): continue - elif re.match('}', line): + elif re.match(r'}', line): break if not line.lstrip().startswith('/*'): @@ -241,19 +258,19 @@ def extract_ofp_errors(fn): comment += ' %s' % line.lstrip('* \t').rstrip(' \t\r\n') comment = comment[:-2].rstrip() - m = re.match('Expected: (.*)\.$', comment) + m = re.match(r'Expected: (.*)\.$', comment) if m: expected_errors[m.group(1)] = (fileName, lineNumber) continue - m = re.match('((?:.(?!\. 
))+.)\.\s+(.*)$', comment) + m = re.match(r'((?:.(?!\. ))+.)\.\s+(.*)$', comment) if not m: fatal("unexpected syntax between errors") dsts, comment = m.groups() getLine() - m = re.match('\s+(?:OFPERR_([A-Z0-9_]+))(\s*=\s*OFPERR_OFS)?,', + m = re.match(r'\s+(?:OFPERR_([A-Z0-9_]+))(\s*=\s*OFPERR_OFS)?,', line) if not m: fatal("syntax error expecting OFPERR_ enum value") @@ -262,11 +279,14 @@ def extract_ofp_errors(fn): if enum in names: fatal("%s specified twice" % enum) - comments.append(re.sub('\[[^]]*\]', '', comment)) + comments.append(re.sub(r'\[[^]]*\]', '', comment)) names.append(enum) for dst in dsts.split(', '): - m = re.match(r'([A-Z]+)([0-9.]+)(\+|-[0-9.]+)?\((\d+)(?:,(\d+))?\)$', dst) + m = re.match( + r'([A-Z]+)([0-9.]+)(\+|-[0-9.]+)?\((\d+)(?:,(\d+))?\)$', + dst + ) if not m: fatal("%r: syntax error in destination" % dst) vendor_name = m.group(1) @@ -313,8 +333,7 @@ def extract_ofp_errors(fn): # mechanism that includes a type but not a code. if v1 < version_map['1.2'] or v2 < version_map['1.2']: if code is None: - fatal("%s: NX1.0 and NX1.1 domains require code" - % (dst, vendor_name)) + fatal("%s: NX1.0 and NX1.1 domains require code" % dst) if v1 >= version_map['1.2'] or v2 >= version_map['1.2']: if code is not None: fatal("%s: NX1.2+ domains do not have codes" % dst) @@ -340,11 +359,13 @@ def extract_ofp_errors(fn): del expected_errors[msg] else: error("%s: %s." 
% (dst, msg)) - sys.stderr.write("%s:%d: %s: Here is the location " - "of the previous definition.\n" - % (domain[version][vendor][type_][code][1], - domain[version][vendor][type_][code][2], - dst)) + sys.stderr.write( + "%s:%d: %s: Here is the location " + "of the previous definition.\n" + % (domain[version][vendor][type_][code][1], + domain[version][vendor][type_][code][2], + dst) + ) else: domain[version][vendor][type_][code] = (enum, fileName, lineNumber) @@ -361,7 +382,7 @@ def extract_ofp_errors(fn): if n_errors: sys.exit(1) - print ("""\ + print("""\ /* Generated automatically; do not modify! -*- buffer-read-only: t -*- */ #define OFPERR_N_ERRORS %d @@ -386,7 +407,7 @@ static const char *error_comments[OFPERR_N_ERRORS] = { for comment in comments))) def output_domain(map, name, description, version): - print (""" + print(""" static enum ofperr %s_decode(uint32_t vendor, uint16_t type, uint16_t code) { @@ -405,16 +426,16 @@ static enum ofperr vendor_s = "(%#xULL << 32) | " % vendor else: vendor_s = "" - print (" case %s ((uint32_t) %d << 16) | %d:" % (vendor_s, + print(" case %s ((uint32_t) %d << 16) | %d:" % (vendor_s, type_, code)) - print (" return OFPERR_%s;" % enum) - print ("""\ + print(" return OFPERR_%s;" % enum) + print("""\ } return 0; }""") - print (""" + print(""" static const struct ofperr_domain %s = { "%s", %d, @@ -423,20 +444,22 @@ static const struct ofperr_domain %s = { for enum in names: if enum in map: vendor, type_, code = map[enum] - if code == None: + if code is None: code = -1 - print (" { %#8x, %2d, %3d }, /* %s */" % (vendor, type_, code, enum)) + print(" { %#8x, %2d, %3d }, /* %s */" % (vendor, type_, + code, enum)) else: - print (" { -1, -1, -1 }, /* %s */" % enum) - print ("""\ + print(" { -1, -1, -1 }, /* %s */" % enum) + print("""\ }, };""") for version_name, id_ in version_map.items(): - var = 'ofperr_of' + re.sub('[^A-Za-z0-9_]', '', version_name) + var = 'ofperr_of' + re.sub(r'[^A-Za-z0-9_]', '', version_name) description = 
"OpenFlow %s" % version_name output_domain(reverse[id_], var, description, id_) + if __name__ == '__main__': if '--help' in sys.argv: usage() From 23fb4bd4bcae99585ed64302549df142f17fc783 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 30 Oct 2023 21:10:47 +0100 Subject: [PATCH 423/833] build-aux/extract-ofp-fields: Fix flake8 and syntax errors. A few general style issues like extra spacing and block comment format. And a few invalid escape sequences, which are not actual escape sequences, but cause actual syntax warnings starting python 3.12 and will eventually become syntax errors [1]: extract-ofp-fields:323: SyntaxWarning: invalid escape sequence '\_' "\_;\_;\_;\_;\_;\_\n", extract-ofp-fields:332: SyntaxWarning: invalid escape sequence '\_' s = """tab(;); extract-ofp-fields:374: SyntaxWarning: invalid escape sequence '\-' """\ These are fixed by converting to raw strings. While doing that we also have to remove all the now unnecessary escaping from actual escape sequences like '\\'. 
[1] https://docs.python.org/3/reference/lexical_analysis.html#escape-sequences Acked-by: Eelco Chaudron Reviewed-By: Ihar Hrachyshka Signed-off-by: Ilya Maximets --- build-aux/extract-ofp-fields | 50 ++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/build-aux/extract-ofp-fields b/build-aux/extract-ofp-fields index 05d3e1df36b..89d80c20855 100755 --- a/build-aux/extract-ofp-fields +++ b/build-aux/extract-ofp-fields @@ -167,9 +167,9 @@ def make_nx_match(meta_flow_h): print(oline) -## ------------------------ ## -## Documentation Generation ## -## ------------------------ ## +# ------------------------ # +# Documentation Generation # +# ------------------------ # def field_to_xml(field_node, f, body, summary): @@ -189,9 +189,9 @@ def field_to_xml(field_node, f, body, summary): ovs_version = [int(x) for x in ovs_version_s.split(".")] if min_ovs_version is None or ovs_version < min_ovs_version: min_ovs_version = ovs_version - summary += ["\\fB%s\\fR" % f["name"]] + summary += [r"\fB%s\fR" % f["name"]] if f["extra_name"]: - summary += [" aka \\fB%s\\fR" % f["extra_name"]] + summary += [r" aka \fB%s\fR" % f["extra_name"]] summary += [";%d" % f["n_bytes"]] if f["n_bits"] != 8 * f["n_bytes"]: summary += [" (low %d bits)" % f["n_bits"]] @@ -213,8 +213,8 @@ def field_to_xml(field_node, f, body, summary): title = field_node.attributes["title"].nodeValue body += [ - """.PP -\\fB%s Field\\fR + r""".PP +\fB%s Field\fR .TS tab(;),nowarn; l lx. @@ -222,9 +222,9 @@ l lx. 
% title ] - body += ["Name:;\\fB%s\\fR" % f["name"]] + body += [r"Name:;\fB%s\fR" % f["name"]] if f["extra_name"]: - body += [" (aka \\fB%s\\fR)" % f["extra_name"]] + body += [r" (aka \fB%s\fR)" % f["extra_name"]] body += ["\n"] body += ["Width:;"] @@ -320,7 +320,8 @@ def group_xml_to_nroff(group_node, fields): "tab(;),nowarn;\n", "l l l l l l l.\n", "Name;Bytes;Mask;RW?;Prereqs;NXM/OXM Support\n", - "\_;\_;\_;\_;\_;\_\n", + r"\_;\_;\_;\_;\_;\_", + "\n", ] content += summary content += [".TE\n"] @@ -329,7 +330,7 @@ def group_xml_to_nroff(group_node, fields): def make_oxm_classes_xml(document): - s = """tab(;),nowarn; + s = r"""tab(;),nowarn; l l l. Prefix;Vendor;Class \_;\_;\_ @@ -367,42 +368,41 @@ def make_ovs_fields(meta_flow_h, meta_flow_xml): doc = document.documentElement global version - if version == None: + if version is None: version = "UNKNOWN" print( - """\ -'\\" tp -.\\" -*- mode: troff; coding: utf-8 -*- + r"""'\" tp +.\" -*- mode: troff; coding: utf-8 -*- .TH "ovs\-fields" 7 "%s" "Open vSwitch" "Open vSwitch Manual" -.fp 5 L CR \\" Make fixed-width font available as \\fL. +.fp 5 L CR \" Make fixed-width font available as \fL. .de ST . PP . RS -0.15in -. I "\\\\$1" +. I "\\$1" . RE .. .de SU . PP -. I "\\\\$1" +. I "\\$1" .. .de IQ . br . ns -. IP "\\\\$1" +. IP "\\$1" .. .de TQ . br . ns -. TP "\\\\$1" +. TP "\\$1" .. .de URL -\\\\$2 \\(laURL: \\\\$1 \\(ra\\\\$3 +\\$2 \(laURL: \\$1 \(ra\\$3 .. -.if \\n[.g] .mso www.tmac +.if \n[.g] .mso www.tmac .SH NAME ovs\-fields \- protocol header fields in OpenFlow and Open vSwitch . 
@@ -460,9 +460,9 @@ ovs\-fields \- protocol header fields in OpenFlow and Open vSwitch print(output[i]) -## ------------ ## -## Main Program ## -## ------------ ## +# ------------ # +# Main Program # +# ------------ # if __name__ == "__main__": argv0 = sys.argv[0] From 20e6309ba66fc9e39525a1fefae6c3702212d7a7 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 30 Oct 2023 21:10:48 +0100 Subject: [PATCH 424/833] build-aux/extract-ofp-msgs: Fix flake8 and syntax errors. A few general style issues like extra spacing and line length, semicolons at the end of the line and unused variable 'raw_types'. And a few invalid escape sequences, which are not actual escape sequences, but cause actual syntax warnings starting python 3.12 and will eventually become syntax errors [1]: extract-ofp-msgs:118: SyntaxWarning: invalid escape sequence '\s' m = re.match('\s+(?:OFPRAW_%s)(\d*)_([A-Z0-9_]+),?$' % type_, These are fixed by converting to raw strings. [1] https://docs.python.org/3/reference/lexical_analysis.html#escape-sequences Acked-by: Eelco Chaudron Reviewed-By: Ihar Hrachyshka Signed-off-by: Ilya Maximets --- build-aux/extract-ofp-msgs | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/build-aux/extract-ofp-msgs b/build-aux/extract-ofp-msgs index 6b3295cf64c..c26ea1d3557 100755 --- a/build-aux/extract-ofp-msgs +++ b/build-aux/extract-ofp-msgs @@ -24,6 +24,9 @@ OFPT11_STATS_REQUEST = 18 OFPT11_STATS_REPLY = 19 OFPST_VENDOR = 0xffff +n_errors = 0 + + def decode_version_range(range): if range in VERSION: return (VERSION[range], VERSION[range]) @@ -35,6 +38,7 @@ def decode_version_range(range): a, b = re.match(r'^([^-]+)-([^-]+)$', range).groups() return (VERSION[a], VERSION[b]) + def get_line(): global line global line_number @@ -43,16 +47,18 @@ def get_line(): if line == "": fatal("unexpected end of input") -n_errors = 0 + def error(msg): global n_errors sys.stderr.write("%s:%d: %s\n" % (file_name, line_number, msg)) n_errors += 
1 + def fatal(msg): error(msg) sys.exit(1) + def usage(): argv0 = os.path.basename(sys.argv[0]) print('''\ @@ -65,6 +71,7 @@ only controls #line directives in the output.\ ''' % {"argv0": argv0}) sys.exit(0) + def make_sizeof(s): m = re.match(r'(.*) up to (.*)', s) if m: @@ -73,9 +80,8 @@ def make_sizeof(s): else: return "sizeof(%s)" % s -def extract_ofp_msgs(output_file_name): - raw_types = [] +def extract_ofp_msgs(output_file_name): all_hdrs = {} all_raws = {} all_raws_order = [] @@ -108,15 +114,16 @@ def extract_ofp_msgs(output_file_name): comment += ' %s' % line.lstrip('* \t').rstrip(' \t\r\n') comment = comment[:-2].rstrip() - m = re.match(r'([A-Z]+) ([-.+\d]+|) \((\d+)\): ([^.]+)\.$', comment) + m = re.match( + r'([A-Z]+) ([-.+\d]+|) \((\d+)\): ([^.]+)\.$', comment + ) if not m: fatal("unexpected syntax between messages") type_, versions, number, contents = m.groups() number = int(number) get_line() - m = re.match('\s+(?:OFPRAW_%s)(\d*)_([A-Z0-9_]+),?$' % type_, - line) + m = re.match(r'\s+(?:OFPRAW_%s)(\d*)_([A-Z0-9_]+),?$' % type_, line) if not m: fatal("syntax error expecting OFPRAW_ enum") vinfix, name = m.groups() @@ -300,7 +307,7 @@ def extract_ofp_msgs(output_file_name): for hdrs in r['hdrs']: output.append(" { {0, NULL}, {%d, %d, %d, 0x%x, %d}, %s, 0 }," % (hdrs + (raw,))) - + output.append("};") output.append("") @@ -349,8 +356,8 @@ def extract_ofp_msgs(output_file_name): % r["human_name"]) output.append("};") - output.append(""); - output.append("static const char *type_names[] = {"); + output.append("") + output.append("static const char *type_names[] = {") for t in all_types: output.append(" \"%s\"," % t) output.append("};") @@ -378,4 +385,3 @@ if __name__ == '__main__': for line in extract_ofp_msgs(sys.argv[2]): print(line) - From 28f6e7602cc0b4229a9e7e556a3f4cad3b821c3b Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 30 Oct 2023 21:10:49 +0100 Subject: [PATCH 425/833] build-aux: Enable flake8 checks for python extraction scripts. 
These were recently updated to pass the checks, so should be added to the list in order to avoid regressions in the future. While at it, fixing the indentation. Acked-by: Eelco Chaudron Reviewed-By: Ihar Hrachyshka Signed-off-by: Ilya Maximets --- build-aux/automake.mk | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/build-aux/automake.mk b/build-aux/automake.mk index 8d7e8ae1904..d65b6da6c5a 100644 --- a/build-aux/automake.mk +++ b/build-aux/automake.mk @@ -21,8 +21,12 @@ EXTRA_DIST += \ build-aux/xml2nroff FLAKE8_PYFILES += \ - build-aux/dpdkstrip.py \ - build-aux/gen_ofp_field_decoders \ - build-aux/sodepends.py \ - build-aux/soexpand.py \ - build-aux/xml2nroff + build-aux/dpdkstrip.py \ + build-aux/extract-ofp-actions \ + build-aux/extract-ofp-errors \ + build-aux/extract-ofp-fields \ + build-aux/extract-ofp-msgs \ + build-aux/gen_ofp_field_decoders \ + build-aux/sodepends.py \ + build-aux/soexpand.py \ + build-aux/xml2nroff From fdbf0bb2aed53e70b455eb1adcfda8d8278ea690 Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Tue, 31 Oct 2023 17:12:34 +0000 Subject: [PATCH 426/833] flake8: Fix E721 check failures. E721: "do not compare types, for exact checks use `is` / `is not`, for instance checks use `isinstance()`" This fixes `make flake8-check` target when running with pycodestyle>=1.2. 
Acked-by: Eelco Chaudron Signed-off-by: Ihar Hrachyshka Signed-off-by: Ilya Maximets --- python/ovs/jsonrpc.py | 2 +- tests/test-jsonrpc.py | 4 ++-- tests/test-ovsdb.py | 14 +++++++------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/python/ovs/jsonrpc.py b/python/ovs/jsonrpc.py index d5127268aab..d9fe27aec64 100644 --- a/python/ovs/jsonrpc.py +++ b/python/ovs/jsonrpc.py @@ -377,7 +377,7 @@ def __init__(self, reconnect, rpc, remotes): self.stream = None self.pstream = None self.seqno = 0 - if type(remotes) != list: + if type(remotes) is not list: remotes = [remotes] self.remotes = remotes random.shuffle(self.remotes) diff --git a/tests/test-jsonrpc.py b/tests/test-jsonrpc.py index 1df5afa221f..8a4a1759380 100644 --- a/tests/test-jsonrpc.py +++ b/tests/test-jsonrpc.py @@ -199,13 +199,13 @@ def main(argv): sys.exit(1) func, n_args = commands[command_name] - if type(n_args) == tuple: + if type(n_args) is tuple: if len(args) < n_args[0]: sys.stderr.write("%s: \"%s\" requires at least %d arguments but " "only %d provided\n" % (argv[0], command_name, n_args, len(args))) sys.exit(1) - elif type(n_args) == int: + elif type(n_args) is int: if len(args) != n_args: sys.stderr.write("%s: \"%s\" requires %d arguments but %d " "provided\n" diff --git a/tests/test-ovsdb.py b/tests/test-ovsdb.py index a841adba4e1..71248854fc7 100644 --- a/tests/test-ovsdb.py +++ b/tests/test-ovsdb.py @@ -37,7 +37,7 @@ def unbox_json(json): - if type(json) == list and len(json) == 1: + if type(json) is list and len(json) == 1: return json[0] else: return json @@ -325,9 +325,9 @@ def substitute_uuids(json, symtab): symbol = symtab.get(json) if symbol: return str(symbol) - elif type(json) == list: + elif type(json) is list: return [substitute_uuids(element, symtab) for element in json] - elif type(json) == dict: + elif type(json) is dict: d = {} for key, value in json.items(): d[key] = substitute_uuids(value, symtab) @@ -341,10 +341,10 @@ def parse_uuids(json, symtab): name = 
"#%d#" % len(symtab) sys.stderr.write("%s = %s\n" % (name, json)) symtab[name] = json - elif type(json) == list: + elif type(json) is list: for element in json: parse_uuids(element, symtab) - elif type(json) == dict: + elif type(json) is dict: for value in json.values(): parse_uuids(value, symtab) @@ -1049,14 +1049,14 @@ def main(argv): sys.exit(1) func, n_args = commands[command_name] - if type(n_args) == tuple: + if type(n_args) is tuple: if len(args) < n_args[0]: sys.stderr.write("%s: \"%s\" requires at least %d arguments but " "only %d provided\n" % (ovs.util.PROGRAM_NAME, command_name, n_args[0], len(args))) sys.exit(1) - elif type(n_args) == int: + elif type(n_args) is int: if len(args) != n_args: sys.stderr.write("%s: \"%s\" requires %d arguments but %d " "provided\n" From bf843fd439b25e2048f8b2b466557bd3d682072d Mon Sep 17 00:00:00 2001 From: Eli Britstein Date: Thu, 2 Nov 2023 14:59:41 +0200 Subject: [PATCH 427/833] checkpatch: Don't spell check Fixes tag. Fixes tag quotes another commit that might fail in a spell check. Don't fail it. Signed-off-by: Eli Britstein Acked-by: Roi Dayan Acked-by: Eelco Chaudron Signed-off-by: Simon Horman --- utilities/checkpatch.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utilities/checkpatch.py b/utilities/checkpatch.py index 3f42c44f293..1a0ebe79be4 100755 --- a/utilities/checkpatch.py +++ b/utilities/checkpatch.py @@ -408,6 +408,9 @@ def check_spelling(line, comment): if not spell_check_dict or not spellcheck: return False + if line.startswith('Fixes: '): + return False + words = filter_comments(line, True) if comment else line words = words.replace(':', ' ').split(' ') From 169c6b2d4aba44f4e3e19c31923172ad83627af5 Mon Sep 17 00:00:00 2001 From: Jakob Meng Date: Mon, 13 Nov 2023 09:53:45 +0100 Subject: [PATCH 428/833] netdev-dummy: Sync and clean {get, set}_config() callbacks. 
For better usability, the function pairs get_config() and set_config() for netdevs should be symmetric: Options which are accepted by set_config() should be returned by get_config() and the latter should output valid options for set_config() only. This patch also moves key-value pairs which are not valid options from get_config() to the get_status() callback. The tests have been updated accordingly. Reported-at: https://bugzilla.redhat.com/1949855 Signed-off-by: Jakob Meng Reviewed-by: Robin Jarry Signed-off-by: Kevin Traynor --- lib/netdev-dummy.c | 19 +++++++++++++++---- tests/pmd.at | 26 +++++++++++++------------- 2 files changed, 28 insertions(+), 17 deletions(-) diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c index 1a54add87f0..fe82317d723 100644 --- a/lib/netdev-dummy.c +++ b/lib/netdev-dummy.c @@ -795,14 +795,25 @@ netdev_dummy_get_config(const struct netdev *dev, struct smap *args) dummy_packet_conn_get_config(&netdev->conn, args); + /* pcap, rxq_pcap and tx_pcap cannot be recovered because filenames have + * been discarded after opening file descriptors */ + + if (netdev->ol_ip_csum) { + smap_add_format(args, "ol_ip_csum", "%s", "true"); + } + + if (netdev->ol_ip_csum_set_good) { + smap_add_format(args, "ol_ip_csum_set_good", "%s", "true"); + } + /* 'dummy-pmd' specific config. 
*/ if (!netdev_is_pmd(dev)) { goto exit; } - smap_add_format(args, "requested_rx_queues", "%d", netdev->requested_n_rxq); - smap_add_format(args, "configured_rx_queues", "%d", dev->n_rxq); - smap_add_format(args, "requested_tx_queues", "%d", netdev->requested_n_txq); - smap_add_format(args, "configured_tx_queues", "%d", dev->n_txq); + + smap_add_format(args, "n_rxq", "%d", netdev->requested_n_rxq); + smap_add_format(args, "n_txq", "%d", netdev->requested_n_txq); + smap_add_format(args, "numa_id", "%d", netdev->requested_numa_id); exit: ovs_mutex_unlock(&netdev->mutex); diff --git a/tests/pmd.at b/tests/pmd.at index 7bdaca9e71f..06cc90477b0 100644 --- a/tests/pmd.at +++ b/tests/pmd.at @@ -93,11 +93,11 @@ pmd thread numa_id core_id : overhead: NOT AVAIL ]) -AT_CHECK([ovs-appctl dpif/show | sed 's/\(tx_queues=\)[[0-9]]*/\1/g'], [0], [dnl +AT_CHECK([ovs-appctl dpif/show], [0], [dnl dummy@ovs-dummy: hit:0 missed:0 br0: br0 65534/100: (dummy-internal) - p0 1/1: (dummy-pmd: configured_rx_queues=1, configured_tx_queues=, requested_rx_queues=1, requested_tx_queues=) + p0 1/1: (dummy-pmd: n_rxq=1, n_txq=1, numa_id=0) ]) OVS_VSWITCHD_STOP @@ -111,11 +111,11 @@ CHECK_PMD_THREADS_CREATED() AT_CHECK([ovs-vsctl set interface p0 options:n_rxq=8]) -AT_CHECK([ovs-appctl dpif/show | sed 's/\(tx_queues=\)[[0-9]]*/\1/g'], [0], [dnl +AT_CHECK([ovs-appctl dpif/show], [0], [dnl dummy@ovs-dummy: hit:0 missed:0 br0: br0 65534/100: (dummy-internal) - p0 1/1: (dummy-pmd: configured_rx_queues=8, configured_tx_queues=, requested_rx_queues=8, requested_tx_queues=) + p0 1/1: (dummy-pmd: n_rxq=8, n_txq=1, numa_id=0) ]) AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed SED_NUMA_CORE_PATTERN], [0], [dnl @@ -144,11 +144,11 @@ OVS_VSWITCHD_START([add-port br0 p0 -- set Interface p0 type=dummy-pmd options:n CHECK_CPU_DISCOVERED(2) CHECK_PMD_THREADS_CREATED() -AT_CHECK([ovs-appctl dpif/show | sed 's/\(tx_queues=\)[[0-9]]*/\1/g'], [0], [dnl +AT_CHECK([ovs-appctl dpif/show], [0], [dnl dummy@ovs-dummy: 
hit:0 missed:0 br0: br0 65534/100: (dummy-internal) - p0 1/1: (dummy-pmd: configured_rx_queues=8, configured_tx_queues=, requested_rx_queues=8, requested_tx_queues=) + p0 1/1: (dummy-pmd: n_rxq=8, n_txq=1, numa_id=0) ]) AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed SED_NUMA_CORE_PATTERN], [0], [dnl @@ -227,11 +227,11 @@ TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1)) CHECK_CPU_DISCOVERED(4) CHECK_PMD_THREADS_CREATED() -AT_CHECK([ovs-appctl dpif/show | sed 's/\(tx_queues=\)[[0-9]]*/\1/g'], [0], [dnl +AT_CHECK([ovs-appctl dpif/show], [0], [dnl dummy@ovs-dummy: hit:0 missed:0 br0: br0 65534/100: (dummy-internal) - p0 1/1: (dummy-pmd: configured_rx_queues=8, configured_tx_queues=, requested_rx_queues=8, requested_tx_queues=) + p0 1/1: (dummy-pmd: n_rxq=8, n_txq=1, numa_id=1) ]) AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed SED_NUMA_CORE_PATTERN], [0], [dnl @@ -436,11 +436,11 @@ AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:smc-enable=true]) sleep 1 -AT_CHECK([ovs-appctl dpif/show | sed 's/\(tx_queues=\)[[0-9]]*/\1/g'], [0], [dnl +AT_CHECK([ovs-appctl dpif/show], [0], [dnl dummy@ovs-dummy: hit:0 missed:0 br0: br0 65534/100: (dummy-internal) - p0 7/1: (dummy-pmd: configured_rx_queues=4, configured_tx_queues=, requested_rx_queues=4, requested_tx_queues=) + p0 7/1: (dummy-pmd: n_rxq=4, n_txq=1, numa_id=0) ]) AT_CHECK([ovs-appctl dpif-netdev/pmd-stats-show | sed SED_NUMA_CORE_PATTERN | sed '/cycles/d' | grep pmd -A 12], [0], [dnl @@ -604,8 +604,8 @@ icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10 dnl Check resetting to default number of rx queues after removal from the db. 
AT_CHECK([ovs-vsctl remove interface p1 options n_rxq]) -AT_CHECK([ovs-appctl dpif/show | grep p1 | sed 's/\(tx_queues=\)[[0-9]]*/\1/g'], [0], [dnl - p1 1/1: (dummy-pmd: configured_rx_queues=1, configured_tx_queues=, requested_rx_queues=1, requested_tx_queues=) +AT_CHECK([ovs-appctl dpif/show | grep p1], [0], [dnl + p1 1/1: (dummy-pmd: n_rxq=1, n_txq=1, numa_id=0) ]) OVS_VSWITCHD_STOP @@ -1152,7 +1152,7 @@ dummy@dp0: lookups: hit:0 missed:0 lost:0 flows: 0 port 0: dp0 (dummy-internal) - port 1: p1 (dummy-pmd: configured_rx_queues=1, configured_tx_queues=1, requested_rx_queues=1, requested_tx_queues=1) + port 1: p1 (dummy-pmd: n_rxq=1, n_txq=1, numa_id=0) port 2: p2 (dummy) ]) From d614f2863ffa5f0c63e17eee3b319e4233a07f53 Mon Sep 17 00:00:00 2001 From: Jakob Meng Date: Mon, 13 Nov 2023 09:53:46 +0100 Subject: [PATCH 429/833] netdev-afxdp: Sync and clean {get, set}_config() callbacks. For better usability, the function pairs get_config() and set_config() for netdevs should be symmetric: Options which are accepted by set_config() should be returned by get_config() and the latter should output valid options for set_config() only. This patch also moves key-value pairs which are not valid options from get_config() to the get_status() callback. The documentation in vswitchd/vswitch.xml for status columns has been updated accordingly. 
Reported-at: https://bugzilla.redhat.com/1949855 Signed-off-by: Jakob Meng Signed-off-by: Kevin Traynor --- Documentation/intro/install/afxdp.rst | 12 ++++-------- lib/netdev-afxdp.c | 21 +++++++++++++++++++-- lib/netdev-afxdp.h | 1 + lib/netdev-linux-private.h | 1 + lib/netdev-linux.c | 4 ++-- vswitchd/vswitch.xml | 11 +++++++++++ 6 files changed, 38 insertions(+), 12 deletions(-) diff --git a/Documentation/intro/install/afxdp.rst b/Documentation/intro/install/afxdp.rst index 51c24bf5b1e..5776614c8e5 100644 --- a/Documentation/intro/install/afxdp.rst +++ b/Documentation/intro/install/afxdp.rst @@ -219,14 +219,10 @@ Otherwise, enable debugging by:: ovs-appctl vlog/set netdev_afxdp::dbg To check which XDP mode was chosen by ``best-effort``, you can look for -``xdp-mode-in-use`` in the output of ``ovs-appctl dpctl/show``:: - - # ovs-appctl dpctl/show - netdev@ovs-netdev: - <...> - port 2: ens802f0 (afxdp: n_rxq=1, use-need-wakeup=true, - xdp-mode=best-effort, - xdp-mode-in-use=native-with-zerocopy) +``xdp-mode`` in the output of ``ovs-vsctl get interface INT status:xdp-mode``:: + + # ovs-vsctl get interface ens802f0 status:xdp-mode + "native-with-zerocopy" References ---------- diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c index 16f26bc3065..b680a147985 100644 --- a/lib/netdev-afxdp.c +++ b/lib/netdev-afxdp.c @@ -672,8 +672,6 @@ netdev_afxdp_get_config(const struct netdev *netdev, struct smap *args) ovs_mutex_lock(&dev->mutex); smap_add_format(args, "n_rxq", "%d", netdev->n_rxq); smap_add_format(args, "xdp-mode", "%s", xdp_modes[dev->xdp_mode].name); - smap_add_format(args, "xdp-mode-in-use", "%s", - xdp_modes[dev->xdp_mode_in_use].name); smap_add_format(args, "use-need-wakeup", "%s", dev->use_need_wakeup ? 
"true" : "false"); ovs_mutex_unlock(&dev->mutex); @@ -1367,3 +1365,22 @@ netdev_afxdp_get_stats(const struct netdev *netdev, return error; } + +int +netdev_afxdp_get_status(const struct netdev *netdev, struct smap *args) +{ + int error = netdev_linux_get_status(netdev, args); + + if (error) { + return error; + } + + struct netdev_linux *dev = netdev_linux_cast(netdev); + + ovs_mutex_lock(&dev->mutex); + smap_add_format(args, "xdp-mode", "%s", + xdp_modes[dev->xdp_mode_in_use].name); + ovs_mutex_unlock(&dev->mutex); + + return error; +} diff --git a/lib/netdev-afxdp.h b/lib/netdev-afxdp.h index e91cd102d28..bd3b9dfbead 100644 --- a/lib/netdev-afxdp.h +++ b/lib/netdev-afxdp.h @@ -63,6 +63,7 @@ int netdev_afxdp_set_config(struct netdev *netdev, const struct smap *args, int netdev_afxdp_get_config(const struct netdev *netdev, struct smap *args); int netdev_afxdp_get_stats(const struct netdev *netdev_, struct netdev_stats *stats); +int netdev_afxdp_get_status(const struct netdev *netdev, struct smap *args); int netdev_afxdp_get_custom_stats(const struct netdev *netdev, struct netdev_custom_stats *custom_stats); diff --git a/lib/netdev-linux-private.h b/lib/netdev-linux-private.h index 0ecf0f748f9..188e8438a32 100644 --- a/lib/netdev-linux-private.h +++ b/lib/netdev-linux-private.h @@ -50,6 +50,7 @@ struct netdev_rxq_linux { }; int netdev_linux_construct(struct netdev *); +int netdev_linux_get_status(const struct netdev *, struct smap *); void netdev_linux_run(const struct netdev_class *); int get_stats_via_netlink(const struct netdev *netdev_, diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index cca3408797e..70521e3c7f7 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -3493,7 +3493,7 @@ netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop, return ENXIO; } -static int +int netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap) { struct netdev_linux *netdev = netdev_linux_cast(netdev_); @@ -3759,7 +3759,7 @@ 
const struct netdev_class netdev_internal_class = { .destruct = netdev_afxdp_destruct, \ .get_stats = netdev_afxdp_get_stats, \ .get_custom_stats = netdev_afxdp_get_custom_stats, \ - .get_status = netdev_linux_get_status, \ + .get_status = netdev_afxdp_get_status, \ .set_config = netdev_afxdp_set_config, \ .get_config = netdev_afxdp_get_config, \ .reconfigure = netdev_afxdp_reconfigure, \ diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index e400043ce7f..81f6e872e2c 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -3854,6 +3854,17 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \ Whether userspace-tso is enabled or disabled. + + +

      + AF_XDP specific interface status options. +

      + + + XDP mode currently in use. See the xdp-mode option for a description of possible values. +
      From c19a5b48bf18592035a4185072e9c62f86a389c2 Mon Sep 17 00:00:00 2001 From: Jakob Meng Date: Mon, 13 Nov 2023 09:53:47 +0100 Subject: [PATCH 430/833] netdev-dpdk: Sync and clean {get, set}_config() callbacks. For better usability, the function pairs get_config() and set_config() for netdevs should be symmetric: Options which are accepted by set_config() should be returned by get_config() and the latter should output valid options for set_config() only. This patch moves key-value pairs which are not valid options from get_config() to the get_status() callback. For example, get_config() in lib/netdev-dpdk.c returned {configured,requested}_{rx,tx}_queues previously. For requested rx queues the proper option name is n_rxq, so requested_rx_queues has been renamed respectively. Tx queues cannot be changed by the user, hence requested_tx_queues has been dropped. Both configured_{rx,tx}_queues will be returned as n_{r,t}xq in the get_status() callback. The netdev dpdk classes no longer share a common get_config() callback, instead both the dpdk_class and the dpdk_vhost_client_class define their own callbacks. The get_config() callback for dpdk_vhost_class has been dropped because it does not have a set_config() callback. The documentation in vswitchd/vswitch.xml for status columns as well as tests have been updated accordingly. 
Reported-at: https://bugzilla.redhat.com/1949855 Signed-off-by: Jakob Meng Reviewed-by: Robin Jarry Signed-off-by: Kevin Traynor --- Documentation/topics/dpdk/phy.rst | 4 +- NEWS | 7 ++ lib/netdev-dpdk.c | 113 +++++++++++++++++++++--------- tests/system-dpdk.at | 64 ++++++++++------- vswitchd/vswitch.xml | 14 +++- 5 files changed, 143 insertions(+), 59 deletions(-) diff --git a/Documentation/topics/dpdk/phy.rst b/Documentation/topics/dpdk/phy.rst index f66b106c46a..41cc3588abf 100644 --- a/Documentation/topics/dpdk/phy.rst +++ b/Documentation/topics/dpdk/phy.rst @@ -198,7 +198,7 @@ Example:: a dedicated queue, it will be explicit:: $ ovs-vsctl get interface dpdk-p0 status - {..., rx_steering=unsupported} + {..., rx-steering=unsupported} More details can often be found in ``ovs-vswitchd.log``:: @@ -499,7 +499,7 @@ its options:: $ ovs-appctl dpctl/show [...] - port 3: dpdk-rep0 (dpdk: configured_rx_queues=1, ..., dpdk-vf-mac=00:11:22:33:44:55, ...) + port 3: dpdk-rep0 (dpdk: ..., dpdk-vf-mac=00:11:22:33:44:55, ...) $ ovs-vsctl show [...] diff --git a/NEWS b/NEWS index 6b45492f1b7..43aea97b5d2 100644 --- a/NEWS +++ b/NEWS @@ -6,6 +6,13 @@ Post-v3.2.0 from older version is supported but it may trigger more leader elections during the process, and error logs complaining unrecognized fields may be observed on old nodes. + - ovs-appctl: + * Output of 'dpctl/show' command no longer shows interface configuration + status, only values of the actual configuration options, a.k.a. + 'requested' configuration. The interface configuration status, + a.k.a. 'configured' values, can be found in the 'status' column of + the Interface table, i.e. with 'ovs-vsctl get interface <..> status'. + Reported names adjusted accordingly. 
v3.2.0 - 17 Aug 2023 diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 55700250df2..29f2b280d49 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -1905,31 +1905,41 @@ netdev_dpdk_get_config(const struct netdev *netdev, struct smap *args) ovs_mutex_lock(&dev->mutex); - smap_add_format(args, "requested_rx_queues", "%d", dev->user_n_rxq); - smap_add_format(args, "configured_rx_queues", "%d", netdev->n_rxq); - smap_add_format(args, "requested_tx_queues", "%d", dev->requested_n_txq); - smap_add_format(args, "configured_tx_queues", "%d", netdev->n_txq); - smap_add_format(args, "mtu", "%d", dev->mtu); + if (dev->devargs && dev->devargs[0]) { + smap_add_format(args, "dpdk-devargs", "%s", dev->devargs); + } - if (dev->type == DPDK_DEV_ETH) { - smap_add_format(args, "n_rxq_desc", "%d", dev->rxq_size); - smap_add_format(args, "n_txq_desc", "%d", dev->txq_size); - if (dev->hw_ol_features & NETDEV_RX_CHECKSUM_OFFLOAD) { - smap_add(args, "rx_csum_offload", "true"); - } else { - smap_add(args, "rx_csum_offload", "false"); - } - if (dev->rx_steer_flags == DPDK_RX_STEER_LACP) { - smap_add(args, "rx-steering", "rss+lacp"); - } - smap_add(args, "lsc_interrupt_mode", - dev->lsc_interrupt_mode ? 
"true" : "false"); + smap_add_format(args, "n_rxq", "%d", dev->user_n_rxq); - if (dpdk_port_is_representor(dev)) { - smap_add_format(args, "dpdk-vf-mac", ETH_ADDR_FMT, - ETH_ADDR_ARGS(dev->requested_hwaddr)); - } + if (dev->fc_conf.mode == RTE_ETH_FC_TX_PAUSE || + dev->fc_conf.mode == RTE_ETH_FC_FULL) { + smap_add(args, "rx-flow-ctrl", "true"); } + + if (dev->fc_conf.mode == RTE_ETH_FC_RX_PAUSE || + dev->fc_conf.mode == RTE_ETH_FC_FULL) { + smap_add(args, "tx-flow-ctrl", "true"); + } + + if (dev->fc_conf.autoneg) { + smap_add(args, "flow-ctrl-autoneg", "true"); + } + + smap_add_format(args, "n_rxq_desc", "%d", dev->rxq_size); + smap_add_format(args, "n_txq_desc", "%d", dev->txq_size); + + if (dev->rx_steer_flags == DPDK_RX_STEER_LACP) { + smap_add(args, "rx-steering", "rss+lacp"); + } + + smap_add(args, "dpdk-lsc-interrupt", + dev->lsc_interrupt_mode ? "true" : "false"); + + if (dpdk_port_is_representor(dev)) { + smap_add_format(args, "dpdk-vf-mac", ETH_ADDR_FMT, + ETH_ADDR_ARGS(dev->requested_hwaddr)); + } + ovs_mutex_unlock(&dev->mutex); return 0; @@ -2324,6 +2334,29 @@ netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args, return err; } +static int +netdev_dpdk_vhost_client_get_config(const struct netdev *netdev, + struct smap *args) +{ + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); + int tx_retries_max; + + ovs_mutex_lock(&dev->mutex); + + if (dev->vhost_id) { + smap_add(args, "vhost-server-path", dev->vhost_id); + } + + atomic_read_relaxed(&dev->vhost_tx_retries_max, &tx_retries_max); + if (tx_retries_max != VHOST_ENQ_RETRY_DEF) { + smap_add_format(args, "tx-retries-max", "%d", tx_retries_max); + } + + ovs_mutex_unlock(&dev->mutex); + + return 0; +} + static int netdev_dpdk_vhost_client_set_config(struct netdev *netdev, const struct smap *args, @@ -4091,6 +4124,9 @@ netdev_dpdk_vhost_user_get_status(const struct netdev *netdev, smap_add_format(args, "userspace-tso", "disabled"); } + smap_add_format(args, "n_rxq", "%d", netdev->n_rxq); + 
smap_add_format(args, "n_txq", "%d", netdev->n_txq); + ovs_mutex_unlock(&dev->mutex); return 0; } @@ -4161,6 +4197,13 @@ netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args) smap_add_format(args, "max_vfs", "%u", dev_info.max_vfs); smap_add_format(args, "max_vmdq_pools", "%u", dev_info.max_vmdq_pools); + smap_add_format(args, "n_rxq", "%d", netdev->n_rxq); + smap_add_format(args, "n_txq", "%d", netdev->n_txq); + + smap_add(args, "rx_csum_offload", + dev->hw_ol_features & NETDEV_RX_CHECKSUM_OFFLOAD + ? "true" : "false"); + /* Querying the DPDK library for iftype may be done in future, pending * support; cf. RFC 3635 Section 3.2.4. */ enum { IF_TYPE_ETHERNETCSMACD = 6 }; @@ -4186,16 +4229,21 @@ netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args) ETH_ADDR_ARGS(dev->hwaddr)); } - if (rx_steer_flags) { - if (!rx_steer_flows_num) { - smap_add(args, "rx_steering", "unsupported"); + if (rx_steer_flags && !rx_steer_flows_num) { + smap_add(args, "rx-steering", "unsupported"); + } else if (rx_steer_flags == DPDK_RX_STEER_LACP) { + smap_add(args, "rx-steering", "rss+lacp"); + } else { + ovs_assert(!rx_steer_flags); + smap_add(args, "rx-steering", "rss"); + } + + if (rx_steer_flags && rx_steer_flows_num) { + smap_add_format(args, "rx_steering_queue", "%d", n_rxq - 1); + if (n_rxq > 2) { + smap_add_format(args, "rss_queues", "0-%d", n_rxq - 2); } else { - smap_add_format(args, "rx_steering_queue", "%d", n_rxq - 1); - if (n_rxq > 2) { - smap_add_format(args, "rss_queues", "0-%d", n_rxq - 2); - } else { - smap_add(args, "rss_queues", "0"); - } + smap_add(args, "rss_queues", "0"); } } @@ -6415,7 +6463,6 @@ parse_vhost_config(const struct smap *ovs_other_config) .is_pmd = true, \ .alloc = netdev_dpdk_alloc, \ .dealloc = netdev_dpdk_dealloc, \ - .get_config = netdev_dpdk_get_config, \ .get_numa_id = netdev_dpdk_get_numa_id, \ .set_etheraddr = netdev_dpdk_set_etheraddr, \ .get_etheraddr = netdev_dpdk_get_etheraddr, \ @@ -6459,6 +6506,7 @@ static 
const struct netdev_class dpdk_class = { .type = "dpdk", NETDEV_DPDK_CLASS_BASE, .construct = netdev_dpdk_construct, + .get_config = netdev_dpdk_get_config, .set_config = netdev_dpdk_set_config, .send = netdev_dpdk_eth_send, }; @@ -6485,6 +6533,7 @@ static const struct netdev_class dpdk_vhost_client_class = { .init = netdev_dpdk_vhost_class_init, .construct = netdev_dpdk_vhost_client_construct, .destruct = netdev_dpdk_vhost_destruct, + .get_config = netdev_dpdk_vhost_client_get_config, .set_config = netdev_dpdk_vhost_client_set_config, .send = netdev_dpdk_vhost_send, .get_carrier = netdev_dpdk_vhost_get_carrier, diff --git a/tests/system-dpdk.at b/tests/system-dpdk.at index 0f58e857422..fd42aed0b38 100644 --- a/tests/system-dpdk.at +++ b/tests/system-dpdk.at @@ -588,8 +588,9 @@ AT_CHECK([ovs-vsctl show], [], [stdout]) sleep 2 dnl Check default MTU value in the datapath -AT_CHECK([ovs-appctl dpctl/show], [], [stdout]) -AT_CHECK([grep -E 'mtu=1500' stdout], [], [stdout]) +AT_CHECK([ovs-vsctl get Interface phy0 mtu], [0], [dnl +1500 +]) dnl Increase MTU value and check in the datapath AT_CHECK([ovs-vsctl set Interface phy0 mtu_request=9000]) @@ -600,8 +601,9 @@ AT_FAIL_IF([grep "Interface phy0 does not support MTU configuration" ovs-vswitch dnl Fail if error is encountered during MTU setup AT_FAIL_IF([grep "Interface phy0 MTU (9000) setup error" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([ovs-appctl dpctl/show], [], [stdout]) -AT_CHECK([grep -E 'mtu=9000' stdout], [], [stdout]) +AT_CHECK([ovs-vsctl get Interface phy0 mtu], [0], [dnl +9000 +]) dnl Clean up @@ -636,14 +638,16 @@ dnl Fail if error is encountered during MTU setup AT_FAIL_IF([grep "Interface phy0 MTU (9000) setup error" ovs-vswitchd.log], [], [stdout]) dnl Check MTU value in the datapath -AT_CHECK([ovs-appctl dpctl/show], [], [stdout]) -AT_CHECK([grep -E 'mtu=9000' stdout], [], [stdout]) +AT_CHECK([ovs-vsctl get Interface phy0 mtu], [0], [dnl +9000 +]) dnl Decrease MTU value and check in the datapath 
AT_CHECK([ovs-vsctl set Interface phy0 mtu_request=2000]) -AT_CHECK([ovs-appctl dpctl/show], [], [stdout]) -AT_CHECK([grep -E 'mtu=2000' stdout], [], [stdout]) +AT_CHECK([ovs-vsctl get Interface phy0 mtu], [0], [dnl +2000 +]) dnl Clean up @@ -686,16 +690,19 @@ tail -f /dev/null | dpdk-testpmd --socket-mem="$(cat NUMA_NODE)" --no-pci\ --single-file-segments -- -a >$OVS_RUNDIR/testpmd-dpdkvhostuserclient0.log 2>&1 & OVS_WAIT_UNTIL([grep "virtio is now ready for processing" ovs-vswitchd.log]) +OVS_WAIT_UNTIL([ovs-vsctl get Interface dpdkvhostuserclient0 link_state | grep -w up]) dnl Check default MTU value in the datapath -AT_CHECK([ovs-appctl dpctl/show], [], [stdout]) -AT_CHECK([grep -E 'mtu=1500' stdout], [], [stdout]) +AT_CHECK([ovs-vsctl get Interface dpdkvhostuserclient0 mtu], [0], [dnl +1500 +]) dnl Increase MTU value and check in the datapath AT_CHECK([ovs-vsctl set Interface dpdkvhostuserclient0 mtu_request=9000]) -AT_CHECK([ovs-appctl dpctl/show], [], [stdout]) -AT_CHECK([grep -E 'mtu=9000' stdout], [], [stdout]) +AT_CHECK([ovs-vsctl get Interface dpdkvhostuserclient0 mtu], [0], [dnl +9000 +]) dnl Clean up the testpmd now pkill -f -x -9 'tail -f /dev/null' @@ -743,16 +750,19 @@ tail -f /dev/null | dpdk-testpmd --socket-mem="$(cat NUMA_NODE)" --no-pci\ --single-file-segments -- -a >$OVS_RUNDIR/testpmd-dpdkvhostuserclient0.log 2>&1 & OVS_WAIT_UNTIL([grep "virtio is now ready for processing" ovs-vswitchd.log]) +OVS_WAIT_UNTIL([ovs-vsctl get Interface dpdkvhostuserclient0 link_state | grep -w up]) dnl Check MTU value in the datapath -AT_CHECK([ovs-appctl dpctl/show], [], [stdout]) -AT_CHECK([grep -E 'mtu=9000' stdout], [], [stdout]) +AT_CHECK([ovs-vsctl get Interface dpdkvhostuserclient0 mtu], [0], [dnl +9000 +]) dnl Decrease MTU value and check in the datapath AT_CHECK([ovs-vsctl set Interface dpdkvhostuserclient0 mtu_request=2000]) -AT_CHECK([ovs-appctl dpctl/show], [], [stdout]) -AT_CHECK([grep -E 'mtu=2000' stdout], [], [stdout]) +AT_CHECK([ovs-vsctl get 
Interface dpdkvhostuserclient0 mtu], [0], [dnl +2000 +]) dnl Clean up the testpmd now pkill -f -x -9 'tail -f /dev/null' @@ -789,8 +799,9 @@ dnl Fail if error is encountered during MTU setup AT_FAIL_IF([grep "Interface phy0 MTU (9702) setup error" ovs-vswitchd.log], [], [stdout]) dnl Check MTU value in the datapath -AT_CHECK([ovs-appctl dpctl/show], [], [stdout]) -AT_CHECK([grep -E 'mtu=9702' stdout], [], [stdout]) +AT_CHECK([ovs-vsctl get Interface phy0 mtu], [0], [dnl +9702 +]) dnl Set MTU value above upper bound and check for error AT_CHECK([ovs-vsctl set Interface phy0 mtu_request=9711]) @@ -830,8 +841,9 @@ dnl Fail if error is encountered during MTU setup AT_FAIL_IF([grep "Interface phy0 MTU (68) setup error" ovs-vswitchd.log], [], [stdout]) dnl Check MTU value in the datapath -AT_CHECK([ovs-appctl dpctl/show], [], [stdout]) -AT_CHECK([grep -E 'mtu=68' stdout], [], [stdout]) +AT_CHECK([ovs-vsctl get Interface phy0 mtu], [0], [dnl +68 +]) dnl Set MTU value below lower bound and check for error AT_CHECK([ovs-vsctl set Interface phy0 mtu_request=67]) @@ -877,10 +889,12 @@ tail -f /dev/null | dpdk-testpmd --socket-mem="$(cat NUMA_NODE)" --no-pci\ --single-file-segments -- -a >$OVS_RUNDIR/testpmd-dpdkvhostuserclient0.log 2>&1 & OVS_WAIT_UNTIL([grep "virtio is now ready for processing" ovs-vswitchd.log]) +OVS_WAIT_UNTIL([ovs-vsctl get Interface dpdkvhostuserclient0 link_state | grep -w up]) dnl Check MTU value in the datapath -AT_CHECK([ovs-appctl dpctl/show], [], [stdout]) -AT_CHECK([grep -E 'mtu=9702' stdout], [], [stdout]) +AT_CHECK([ovs-vsctl get Interface dpdkvhostuserclient0 mtu], [0], [dnl +9702 +]) dnl Set MTU value above upper bound and check for error AT_CHECK([ovs-vsctl set Interface dpdkvhostuserclient0 mtu_request=9711]) @@ -934,10 +948,12 @@ tail -f /dev/null | dpdk-testpmd --socket-mem="$(cat NUMA_NODE)" --no-pci\ --single-file-segments -- -a >$OVS_RUNDIR/testpmd-dpdkvhostuserclient0.log 2>&1 & OVS_WAIT_UNTIL([grep "virtio is now ready for processing" 
ovs-vswitchd.log]) +OVS_WAIT_UNTIL([ovs-vsctl get Interface dpdkvhostuserclient0 link_state | grep -w up]) dnl Check MTU value in the datapath -AT_CHECK([ovs-appctl dpctl/show], [], [stdout]) -AT_CHECK([grep -E 'mtu=68' stdout], [], [stdout]) +AT_CHECK([ovs-vsctl get Interface dpdkvhostuserclient0 mtu], [0], [dnl +68 +]) dnl Set MTU value below lower bound and check for error AT_CHECK([ovs-vsctl set Interface dpdkvhostuserclient0 mtu_request=67]) diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 81f6e872e2c..68392ac41d7 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -3789,6 +3789,18 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \ Maximum number of VMDq pools. + + Number of Rx queues. + + + + Number of Tx queues. + + + + Whether Rx Checksum offload is enabled or not. + + Interface type ID according to IANA ifTYPE MIB definitions. @@ -3807,7 +3819,7 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \ VF representors. - + Hardware Rx queue steering policy in use. From 8b4265c11176551194ef83c35730f8d0c5a651a8 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 3 Nov 2023 20:01:40 +0100 Subject: [PATCH 431/833] release-process: Update LTS designation schedule example. It is an example and the dates are not set in stone, so updating the table is not very important. But it's nice to see currently supported releases there as well as the near future plans. 
Acked-by: Kevin Traynor Acked-by: Simon Horman Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- Documentation/internals/release-process.rst | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/Documentation/internals/release-process.rst b/Documentation/internals/release-process.rst index 0eb8e192a0e..d939c2d3ab8 100644 --- a/Documentation/internals/release-process.rst +++ b/Documentation/internals/release-process.rst @@ -96,18 +96,22 @@ LTS designation schedule example (depends on current state of development): +---------+--------------+--------------------------------------------------+ | Version | Release Date | Actions | +---------+--------------+--------------------------------------------------+ -| 2.14 | Aug 2020 | 2.14 - new latest stable, 2.13 stable ⟶ new LTS | -+---------+--------------+--------------------------------------------------+ -| 2.15 | Feb 2021 | 2.12 - new latest stable, 2.5 LTS ⟶ EOL | -+---------+--------------+--------------------------------------------------+ -| 2.16 | Aug 2021 | 2.16 - new latest stable | -+---------+--------------+--------------------------------------------------+ | 2.17 | Feb 2022 | 2.17 - new latest stable | +---------+--------------+--------------------------------------------------+ | 3.0 | Aug 2022 | 3.0 - new latest stable, 2.17 stable ⟶ new LTS | +---------+--------------+--------------------------------------------------+ | 3.1 | Feb 2023 | 3.1 - new latest stable, 2.13 LTS ⟶ EOL | +---------+--------------+--------------------------------------------------+ +| 3.2 | Aug 2023 | 3.2 - new latest stable | ++---------+--------------+--------------------------------------------------+ +| 3.3 | Feb 2024 | 3.3 - new latest stable | ++---------+--------------+--------------------------------------------------+ +| 3.4 | Aug 2024 | 3.4 - new latest stable, 3.3 stable ⟶ new LTS | ++---------+--------------+--------------------------------------------------+ +| 3.5 | Feb 2025 | 3.5 
- new latest stable, 2.17 LTS ⟶ EOL | ++---------+--------------+--------------------------------------------------+ +| 3.6 | Aug 2025 | 3.6 - new latest stable | ++---------+--------------+--------------------------------------------------+ While branches other than LTS and the latest release are not formally maintained, the OVS project usually provides stable releases for these branches From 4d74e230730ab0643aaad14d11a6712408cbd705 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 3 Nov 2023 20:04:53 +0100 Subject: [PATCH 432/833] build-aux/extract-ofp-fields: Fix the number of Summary columns. The table has only 6 columns, not 7. This doesn't really affect rendering. Only slightly affects calculations around how much space the table needs. Fixes: 96fee5e0a2a0 ("ovs-fields: New manpage to document Open vSwitch and OpenFlow fields.") Acked-by: Simon Horman Signed-off-by: Ilya Maximets --- build-aux/extract-ofp-fields | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build-aux/extract-ofp-fields b/build-aux/extract-ofp-fields index 89d80c20855..2657d9249bb 100755 --- a/build-aux/extract-ofp-fields +++ b/build-aux/extract-ofp-fields @@ -318,7 +318,7 @@ def group_xml_to_nroff(group_node, fields): '.SS "Summary:"\n', ".TS\n", "tab(;),nowarn;\n", - "l l l l l l l.\n", + "l l l l l l.\n", "Name;Bytes;Mask;RW?;Prereqs;NXM/OXM Support\n", r"\_;\_;\_;\_;\_;\_", "\n", From 74bfe3701407cd7cff18a88b19ca48f90067464d Mon Sep 17 00:00:00 2001 From: Salem Sol Date: Sun, 5 Nov 2023 10:38:09 +0200 Subject: [PATCH 433/833] checkpatch: Add argument to skip committer signoff check. Introduce --skip-committer-signoff arg that can be used internally by groups using gerrit for code reviews and gerrit maintainers could do the rebase instead of the author or push upstream commits to be merged through gerrit. 
Signed-off-by: Salem Sol Acked-by: Roi Dayan Acked-by: Eelco Chaudron Signed-off-by: Simon Horman --- utilities/checkpatch.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/utilities/checkpatch.py b/utilities/checkpatch.py index 1a0ebe79be4..6b210fab838 100755 --- a/utilities/checkpatch.py +++ b/utilities/checkpatch.py @@ -189,6 +189,7 @@ def reset_counters(): skip_gerrit_change_id_check = False skip_block_whitespace_check = False skip_signoff_check = False +skip_committer_signoff_check = False # Don't enforce character limit on files that include these characters in their # name, as they may have legitimate reasons to have longer lines. @@ -920,7 +921,8 @@ def ovs_checkpatch_parse(text, filename, author=None, committer=None): break if (committer and author != committer - and committer not in signatures): + and committer not in signatures + and not skip_committer_signoff_check): print_error("Committer %s needs to sign off." % committer) @@ -1038,7 +1040,8 @@ def usage(): -S|--spellcheck Check C comments and commit-message for possible spelling mistakes -t|--skip-trailing-whitespace Skips the trailing whitespace test - --skip-gerrit-change-id Skips the gerrit change id test""" + --skip-gerrit-change-id Skips the gerrit change id test + --skip-committer-signoff Skips the committer sign-off test""" % sys.argv[0]) @@ -1109,6 +1112,7 @@ def partition(pred, iterable): "skip-signoff-lines", "skip-trailing-whitespace", "skip-gerrit-change-id", + "skip-committer-signoff", "spellcheck", "quiet"]) except: @@ -1129,6 +1133,8 @@ def partition(pred, iterable): skip_trailing_whitespace_check = True elif o in ("--skip-gerrit-change-id"): skip_gerrit_change_id_check = True + elif o in ("--skip-committer-signoff"): + skip_committer_signoff_check = True elif o in ("-f", "--check-file"): checking_file = True elif o in ("-S", "--spellcheck"): From 3e0d8d1f4b6320c8e5f5be9edd86bb1c0e7807af Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Sun, 5 Nov 2023 
10:38:10 +0200 Subject: [PATCH 434/833] checkpatch.at: Add cases to verify skip committer check. First case without the skip flag should fail. Second case uses the skip flag and should pass. Signed-off-by: Roi Dayan Acked-by: Eelco Chaudron Signed-off-by: Simon Horman --- tests/checkpatch.at | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/tests/checkpatch.at b/tests/checkpatch.at index 4f6b0c7b36b..caab2817bd9 100755 --- a/tests/checkpatch.at +++ b/tests/checkpatch.at @@ -1,7 +1,7 @@ AT_BANNER([checkpatch]) OVS_START_SHELL_HELPERS -# try_checkpatch PATCH [ERRORS] +# try_checkpatch PATCH [ERRORS] [checkpatch-args] # # Runs checkpatch, if installed, on the given PATCH, expecting the # specified set of ERRORS (and warnings). @@ -29,11 +29,11 @@ Subject: Patch this is. fi if test -s expout; then - AT_CHECK([$PYTHON3 $top_srcdir/utilities/checkpatch.py -q test.patch], + AT_CHECK([$PYTHON3 $top_srcdir/utilities/checkpatch.py $3 -q test.patch], [1], [stdout]) AT_CHECK([sed '/^Lines checked:/,$d' stdout], [0], [expout]) else - AT_CHECK([$PYTHON3 $top_srcdir/utilities/checkpatch.py -q test.patch]) + AT_CHECK([$PYTHON3 $top_srcdir/utilities/checkpatch.py $3 -q test.patch]) fi } OVS_END_SHELL_HELPERS @@ -589,3 +589,23 @@ try_checkpatch \ Subject: netdev: This is a way to long commit summary and therefor it should report a WARNING!" AT_CLEANUP + +AT_SETUP([checkpatch - ignore committer as signoff]) +try_checkpatch \ + "Author: A + Commit: B + Subject: netdev: Subject. + + Signed-off-by: A" \ + "ERROR: Committer B needs to sign off." + +try_checkpatch \ + "Author: A + Commit: B + Subject: netdev: Subject. + + Signed-off-by: A" \ + "" \ + "--skip-committer-signoff" + +AT_CLEANUP From c62b4ac8f8da0403658e13a2292e30e1d86caa0d Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Tue, 14 Nov 2023 17:59:37 +0000 Subject: [PATCH 435/833] ovs-ofctl: Implement compose-packet --bare [--bad-csum]. 
With --bare, it will produce a bare hexified payload with no spaces or offset indicators inserted, which is useful in tests to produce frames to pass to e.g. `ovs-ofctl receive`. With --bad-csum, it will produce a frame that has an invalid IP checksum (applicable to IPv4 only because IPv6 doesn't have checksums.) The command is now more useful in tests, where we may need to produce hex frame payloads to compare observed frames against. As an example of the tool use, a single test case is converted to it. The test uses both normal --bare and --bad-csum behaviors of the command, confirming they work as advertised. Acked-by: Simon Horman Signed-off-by: Ihar Hrachyshka Signed-off-by: Ilya Maximets --- lib/flow.c | 17 +++++++++++-- lib/flow.h | 2 +- lib/netdev-dummy.c | 4 +-- ofproto/ofproto-dpif-trace.c | 2 +- ofproto/ofproto-dpif.c | 4 +-- tests/dpif-netdev.at | 45 ++++++++++++++++------------------ utilities/ovs-ofctl.c | 47 ++++++++++++++++++++++++++++++------ 7 files changed, 81 insertions(+), 40 deletions(-) diff --git a/lib/flow.c b/lib/flow.c index fe226cf0fe5..b8f99f66be9 100644 --- a/lib/flow.c +++ b/lib/flow.c @@ -3306,6 +3306,8 @@ packet_expand(struct dp_packet *p, const struct flow *flow, size_t size) * (This is useful only for testing, obviously, and the packet isn't really * valid. Lots of fields are just zeroed.) * + * If 'bad_csum' is true, the final IP checksum is invalid. + * * For packets whose protocols can encapsulate arbitrary L7 payloads, 'l7' and * 'l7_len' determine that payload: * @@ -3318,7 +3320,7 @@ packet_expand(struct dp_packet *p, const struct flow *flow, size_t size) * from 'l7'. */ void flow_compose(struct dp_packet *p, const struct flow *flow, - const void *l7, size_t l7_len) + const void *l7, size_t l7_len, bool bad_csum) { /* Add code to this function (or its callees) for emitting new fields or * protocols. 
(This isn't essential, so it can be skipped for initial @@ -3370,7 +3372,18 @@ flow_compose(struct dp_packet *p, const struct flow *flow, /* Checksum has already been zeroed by put_zeros call. */ ip->ip_csum = csum(ip, sizeof *ip); - dp_packet_ol_set_ip_csum_good(p); + if (bad_csum) { + /* + * Internet checksum is a sum complement to zero, so any other + * value will result in an invalid checksum. Here, we flip one + * bit. + */ + ip->ip_csum ^= (OVS_FORCE ovs_be16) 0x1; + dp_packet_ip_checksum_bad(p); + } else { + dp_packet_ol_set_ip_csum_good(p); + } + pseudo_hdr_csum = packet_csum_pseudoheader(ip); flow_compose_l4_csum(p, flow, pseudo_hdr_csum); } else if (flow->dl_type == htons(ETH_TYPE_IPV6)) { diff --git a/lib/flow.h b/lib/flow.h index a9d026e1ce3..75a9be3c19d 100644 --- a/lib/flow.h +++ b/lib/flow.h @@ -127,7 +127,7 @@ void flow_set_mpls_bos(struct flow *, int idx, uint8_t stack); void flow_set_mpls_lse(struct flow *, int idx, ovs_be32 lse); void flow_compose(struct dp_packet *, const struct flow *, - const void *l7, size_t l7_len); + const void *l7, size_t l7_len, bool bad_csum); void packet_expand(struct dp_packet *, const struct flow *, size_t size); bool parse_ipv6_ext_hdrs(const void **datap, size_t *sizep, uint8_t *nw_proto, diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c index fe82317d723..8c6e6d44870 100644 --- a/lib/netdev-dummy.c +++ b/lib/netdev-dummy.c @@ -1769,7 +1769,7 @@ eth_from_flow_str(const char *s, size_t packet_size, packet = dp_packet_new(0); if (packet_size) { - flow_compose(packet, flow, NULL, 0); + flow_compose(packet, flow, NULL, 0, false); if (dp_packet_size(packet) < packet_size) { packet_expand(packet, flow, packet_size); } else if (dp_packet_size(packet) > packet_size){ @@ -1777,7 +1777,7 @@ eth_from_flow_str(const char *s, size_t packet_size, packet = NULL; } } else { - flow_compose(packet, flow, NULL, 64); + flow_compose(packet, flow, NULL, 64, false); } ofpbuf_uninit(&odp_key); diff --git a/ofproto/ofproto-dpif-trace.c 
b/ofproto/ofproto-dpif-trace.c index 527e2f17ede..b86e7fe07eb 100644 --- a/ofproto/ofproto-dpif-trace.c +++ b/ofproto/ofproto-dpif-trace.c @@ -440,7 +440,7 @@ parse_flow_and_packet(int argc, const char *argv[], if (generate_packet) { /* Generate a packet, as requested. */ packet = dp_packet_new(0); - flow_compose(packet, flow, l7, l7_len); + flow_compose(packet, flow, l7, l7_len, false); } else if (packet) { /* Use the metadata from the flow and the packet argument to * reconstruct the flow. */ diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index ba5706f6adc..9e8faf82910 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -1255,7 +1255,7 @@ check_ct_eventmask(struct dpif_backer *backer) /* Compose a dummy UDP packet. */ dp_packet_init(&packet, 0); - flow_compose(&packet, &flow, NULL, 64); + flow_compose(&packet, &flow, NULL, 64, false); /* Execute the actions. On older datapaths this fails with EINVAL, on * newer datapaths it succeeds. */ @@ -1348,7 +1348,7 @@ check_ct_timeout_policy(struct dpif_backer *backer) /* Compose a dummy UDP packet. */ dp_packet_init(&packet, 0); - flow_compose(&packet, &flow, NULL, 64); + flow_compose(&packet, &flow, NULL, 64, false); /* Execute the actions. On older datapaths this fails with EINVAL, on * newer datapaths it succeeds. */ diff --git a/tests/dpif-netdev.at b/tests/dpif-netdev.at index 85119fb819e..d0359b5eab6 100644 --- a/tests/dpif-netdev.at +++ b/tests/dpif-netdev.at @@ -746,19 +746,25 @@ OVS_VSWITCHD_START( # Modify the ip_dst addr to force changing the IP csum. AT_CHECK([ovs-ofctl add-flow br1 in_port=p1,actions=mod_nw_dst:192.168.1.1,output:p2]) +flow_s="\ + eth_src=8a:bf:7e:2f:05:84,eth_dst=0a:8f:39:4f:e0:73,dl_type=0x0800,\ + nw_src=192.168.123.2,nw_dst=192.168.123.1,nw_proto=6,nw_ttl=64,nw_frag=no,\ + tp_src=54392,tp_dst=5201,tcp_flags=ack" + +good_frame=$(ovs-ofctl compose-packet --bare "${flow_s}") + # Check if no offload remains ok. 
AT_CHECK([ovs-vsctl set Interface p2 options:tx_pcap=p2.pcap]) AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum=false]) AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum_set_good=false]) -AT_CHECK([ovs-appctl netdev-dummy/receive p1 \ -0a8f394fe0738abf7e2f058408004500003433e0400040068f8fc0a87b02c0a87b01d4781451a962ad5417ed297b801000e547fd00000101080a2524d2345c7fe1c4 -]) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 ${good_frame}]) # Checksum should change to 0x990 with ip_dst changed to 192.168.1.1 # by the datapath while processing the packet. +flow_expected=$(echo "${flow_s}" | sed 's/192.168.123.1/192.168.1.1/g') +good_expected=$(ovs-ofctl compose-packet --bare "${flow_expected}") AT_CHECK([ovs-pcap p2.pcap > p2.pcap.txt 2>&1]) -AT_CHECK([tail -n 1 p2.pcap.txt], [0], [dnl -0a8f394fe0738abf7e2f058408004500003433e0400040060990c0a87b02c0a80101d4781451a962ad5417ed297b801000e5c1fd00000101080a2524d2345c7fe1c4 +AT_CHECK_UNQUOTED([tail -n 1 p2.pcap.txt], [0], [${good_expected} ]) # Check if packets entering the datapath with csum offloading @@ -766,12 +772,9 @@ AT_CHECK([tail -n 1 p2.pcap.txt], [0], [dnl # in the datapath and not by the netdev. AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum=false]) AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum_set_good=true]) -AT_CHECK([ovs-appctl netdev-dummy/receive p1 \ -0a8f394fe0738abf7e2f058408004500003433e0400040068f8fc0a87b02c0a87b01d4781451a962ad5417ed297b801000e547fd00000101080a2524d2345c7fe1c4 -]) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 ${good_frame}]) AT_CHECK([ovs-pcap p2.pcap > p2.pcap.txt 2>&1]) -AT_CHECK([tail -n 1 p2.pcap.txt], [0], [dnl -0a8f394fe0738abf7e2f058408004500003433e0400040060990c0a87b02c0a80101d4781451a962ad5417ed297b801000e5c1fd00000101080a2524d2345c7fe1c4 +AT_CHECK_UNQUOTED([tail -n 1 p2.pcap.txt], [0], [${good_expected} ]) # Check if packets entering the datapath with csum offloading @@ -779,36 +782,30 @@ AT_CHECK([tail -n 1 p2.pcap.txt], [0], [dnl # by the datapath. 
AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum=true]) AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum_set_good=true]) -AT_CHECK([ovs-appctl netdev-dummy/receive p1 \ -0a8f394fe0738abf7e2f058408004500003433e0400040068f8fc0a87b02c0a87b01d4781451a962ad5417ed297b801000e547fd00000101080a2524d2345c7fe1c4 +AT_CHECK([ovs-appctl netdev-dummy/receive p1 ${good_frame} ]) AT_CHECK([ovs-pcap p2.pcap > p2.pcap.txt 2>&1]) -AT_CHECK([tail -n 1 p2.pcap.txt], [0], [dnl -0a8f394fe0738abf7e2f058408004500003433e0400040060990c0a87b02c0a80101d4781451a962ad5417ed297b801000e5c1fd00000101080a2524d2345c7fe1c4 +AT_CHECK_UNQUOTED([tail -n 1 p2.pcap.txt], [0], [${good_expected} ]) # Push a packet with bad csum and offloading disabled to check # if the datapath updates the csum, but does not fix the issue. +bad_frame=$(ovs-ofctl compose-packet --bare --bad-csum "${flow_s}") AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum=false]) AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum_set_good=false]) -AT_CHECK([ovs-appctl netdev-dummy/receive p1 \ -0a8f394fe0738abf7e2f058408004500003433e0400040068f03c0a87b02c0a87b01d4781451a962ad5417ed297b801000e547fd00000101080a2524d2345c7fe1c4 -]) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 ${bad_frame}]) AT_CHECK([ovs-pcap p2.pcap > p2.pcap.txt 2>&1]) -AT_CHECK([tail -n 1 p2.pcap.txt], [0], [dnl -0a8f394fe0738abf7e2f058408004500003433e0400040060904c0a87b02c0a80101d4781451a962ad5417ed297b801000e5c1fd00000101080a2524d2345c7fe1c4 +bad_expected=$(ovs-ofctl compose-packet --bare --bad-csum "${flow_expected}") +AT_CHECK_UNQUOTED([tail -n 1 p2.pcap.txt], [0], [${bad_expected} ]) # Push a packet with bad csum and offloading enabled to check # if the driver updates and fixes the csum. 
AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum=true]) AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum_set_good=true]) -AT_CHECK([ovs-appctl netdev-dummy/receive p1 \ -0a8f394fe0738abf7e2f058408004500003433e0400040068f03c0a87b02c0a87b01d4781451a962ad5417ed297b801000e547fd00000101080a2524d2345c7fe1c4 -]) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 ${bad_frame}]) AT_CHECK([ovs-pcap p2.pcap > p2.pcap.txt 2>&1]) -AT_CHECK([tail -n 1 p2.pcap.txt], [0], [dnl -0a8f394fe0738abf7e2f058408004500003433e0400040060990c0a87b02c0a80101d4781451a962ad5417ed297b801000e5c1fd00000101080a2524d2345c7fe1c4 +AT_CHECK_UNQUOTED([tail -n 1 p2.pcap.txt], [0], [${good_expected} ]) OVS_VSWITCHD_STOP AT_CLEANUP diff --git a/utilities/ovs-ofctl.c b/utilities/ovs-ofctl.c index 24d0941cf2e..0a382f336bb 100644 --- a/utilities/ovs-ofctl.c +++ b/utilities/ovs-ofctl.c @@ -154,6 +154,12 @@ static int show_stats = 1; /* --pcap: Makes "compose-packet" print a pcap on stdout. */ static int print_pcap = 0; +/* --bare: Makes "compose-packet" print a bare hexified payload. */ +static int print_bare = 0; + +/* --bad-csum: Makes "compose-packet" generate an invalid checksum. */ +static int bad_csum = 0; + /* --raw: Makes "ofp-print" read binary data from stdin. */ static int raw = 0; @@ -243,6 +249,8 @@ parse_options(int argc, char *argv[]) {"color", optional_argument, NULL, OPT_COLOR}, {"may-create", no_argument, NULL, OPT_MAY_CREATE}, {"pcap", no_argument, &print_pcap, 1}, + {"bare", no_argument, &print_bare, 1}, + {"bad-csum", no_argument, &bad_csum, 1}, {"raw", no_argument, &raw, 1}, {"read-only", no_argument, NULL, OPT_READ_ONLY}, DAEMON_LONG_OPTIONS, @@ -4948,20 +4956,33 @@ ofctl_parse_key_value(struct ovs_cmdl_context *ctx) } } -/* "compose-packet [--pcap] FLOW [L7]": Converts the OpenFlow flow - * specification FLOW to a packet with flow_compose() and prints the hex bytes - * in the packet on stdout. Also verifies that the flow extracted from that - * packet matches the original FLOW. 
+/* "compose-packet [--pcap|--bare] [--bad-csum] FLOW [L7]": Converts the + * OpenFlow flow specification FLOW to a packet with flow_compose() and prints + * the hex bytes of the packet, with offsets, to stdout. + * + * With --pcap, prints the packet in pcap format, so that you can do something + * like "ovs-ofctl --pcap compose-packet udp | tcpdump -vvvv -r-" to use + * another tool to dump the packet contents. + * + * With --bare, prints the packet as a single bare hex string with no + * spaces or offsets, so that you can pass the result directly to e.g. + * "ovs-appctl netdev-dummy/receive vif $(ovs-ofctl compose-packet --bare + * FLOW)" + * + * With --bad-csum, produces a packet with an invalid IP checksum. (For IPv4.) * - * With --pcap, prints the packet to stdout instead as a pcap file, so that you - * can do something like "ovs-ofctl --pcap compose-packet udp | tcpdump -vvvv - * -r-" to use another tool to dump the packet contents. + * Regardless of the mode, the command also verifies that the flow extracted + * from that packet matches the original FLOW. * * If L7 is specified, draws the L7 payload data from it, otherwise defaults to * 64 bytes of payload. */ static void ofctl_compose_packet(struct ovs_cmdl_context *ctx) { + if (print_pcap && print_bare) { + ovs_fatal(1, "--bare and --pcap are mutually exclusive"); + } + if (print_pcap && isatty(STDOUT_FILENO)) { ovs_fatal(1, "not writing pcap data to stdout; redirect to a file " "or pipe to tcpdump instead"); @@ -4989,7 +5010,7 @@ ofctl_compose_packet(struct ovs_cmdl_context *ctx) l7_len = dp_packet_size(&payload); l7 = dp_packet_steal_data(&payload); } - flow_compose(&p, &flow1, l7, l7_len); + flow_compose(&p, &flow1, l7, l7_len, bad_csum); free(l7); if (print_pcap) { @@ -4997,6 +5018,16 @@ ofctl_compose_packet(struct ovs_cmdl_context *ctx) ovs_pcap_write_header(p_file); ovs_pcap_write(p_file, &p); ovs_pcap_close(p_file); + } else if (print_bare) { + /* Binary to a bare hex string. 
*/ + for (int i = 0; i < dp_packet_size(&p); i++) { + uint8_t val = ((uint8_t *) dp_packet_data(&p))[i]; + /* Don't use ds_put_hex because it adds a 0x prefix and + * doesn't guarantee an even number of payload characters, which + * may be important elsewhere (e.g. in netdev-dummy/receive). */ + printf("%02" PRIx8, val); + } + } else { ovs_hex_dump(stdout, dp_packet_data(&p), dp_packet_size(&p), 0, false); } From 7b514aba0e91c535024508624724a83a3df87b71 Mon Sep 17 00:00:00 2001 From: Nobuhiro MIKI Date: Wed, 15 Nov 2023 18:47:33 +0900 Subject: [PATCH 436/833] ofproto-dpif-trace: Improve conjunctive match tracing. A conjunctive flow consists of two or more flows with conjunction actions. When input to the ofproto/trace command matches a conjunctive flow, it outputs flows of all dimensions. Acked-by: Simon Horman Signed-off-by: Nobuhiro MIKI Signed-off-by: Ilya Maximets --- NEWS | 2 + lib/classifier.c | 51 ++++++++++++++++--- lib/classifier.h | 4 +- lib/ovs-router.c | 5 +- lib/tnl-ports.c | 6 +-- ofproto/ofproto-dpif-xlate.c | 67 +++++++++++++++++++++--- ofproto/ofproto-dpif.c | 25 ++++++--- ofproto/ofproto-dpif.h | 3 +- tests/classifier.at | 99 ++++++++++++++++++++++++++++++++++++ tests/test-classifier.c | 8 +-- 10 files changed, 240 insertions(+), 30 deletions(-) diff --git a/NEWS b/NEWS index 43aea97b5d2..1d9c30533b3 100644 --- a/NEWS +++ b/NEWS @@ -7,6 +7,8 @@ Post-v3.2.0 during the process, and error logs complaining unrecognized fields may be observed on old nodes. - ovs-appctl: + * 'ofproto/trace' now reports OpenFlow rules that make up a conjunctive + flow match. * Output of 'dpctl/show' command no longer shows interface configuration status, only values of the actual configuration options, a.k.a. 'requested' configuration. The interface configuration status, 
The interface configuration status, diff --git a/lib/classifier.c b/lib/classifier.c index 18dbfc83ad4..0729bd19024 100644 --- a/lib/classifier.c +++ b/lib/classifier.c @@ -853,6 +853,32 @@ trie_ctx_init(struct trie_ctx *ctx, const struct cls_trie *trie) ctx->lookup_done = false; } +static void +insert_conj_flows(struct hmapx *conj_flows, uint32_t id, int priority, + struct cls_conjunction_set **soft, size_t n_soft) +{ + struct cls_conjunction_set *conj_set; + + if (!conj_flows) { + return; + } + + for (size_t i = 0; i < n_soft; i++) { + conj_set = soft[i]; + + if (conj_set->priority != priority) { + continue; + } + + for (size_t j = 0; j < conj_set->n; j++) { + if (conj_set->conj[j].id == id) { + hmapx_add(conj_flows, (void *) (conj_set->match->cls_rule)); + break; + } + } + } +} + struct conjunctive_match { struct hmap_node hmap_node; uint32_t id; @@ -933,11 +959,15 @@ free_conjunctive_matches(struct hmap *matches, * recursion within this function itself. * * 'flow' is non-const to allow for temporary modifications during the lookup. - * Any changes are restored before returning. */ + * Any changes are restored before returning. + * + * 'conj_flows' is an optional parameter. If it is non-null, the matching + * conjunctive flows are inserted. 
*/ static const struct cls_rule * classifier_lookup__(const struct classifier *cls, ovs_version_t version, struct flow *flow, struct flow_wildcards *wc, - bool allow_conjunctive_matches) + bool allow_conjunctive_matches, + struct hmapx *conj_flows) { struct trie_ctx trie_ctx[CLS_MAX_TRIES]; const struct cls_match *match; @@ -1097,10 +1127,15 @@ classifier_lookup__(const struct classifier *cls, ovs_version_t version, const struct cls_rule *rule; flow->conj_id = id; - rule = classifier_lookup__(cls, version, flow, wc, false); + rule = classifier_lookup__(cls, version, flow, wc, false, + NULL); flow->conj_id = saved_conj_id; if (rule) { + if (allow_conjunctive_matches) { + insert_conj_flows(conj_flows, id, soft_pri, soft, + n_soft); + } free_conjunctive_matches(&matches, cm_stubs, ARRAY_SIZE(cm_stubs)); if (soft != soft_stub) { @@ -1161,12 +1196,16 @@ classifier_lookup__(const struct classifier *cls, ovs_version_t version, * flow_wildcards_init_catchall()). * * 'flow' is non-const to allow for temporary modifications during the lookup. - * Any changes are restored before returning. */ + * Any changes are restored before returning. + * + * 'conj_flows' is an optional parameter. If it is non-null, the matching + * conjunctive flows are inserted. */ const struct cls_rule * classifier_lookup(const struct classifier *cls, ovs_version_t version, - struct flow *flow, struct flow_wildcards *wc) + struct flow *flow, struct flow_wildcards *wc, + struct hmapx *conj_flows) { - return classifier_lookup__(cls, version, flow, wc, true); + return classifier_lookup__(cls, version, flow, wc, true, conj_flows); } /* Finds and returns a rule in 'cls' with exactly the same priority and diff --git a/lib/classifier.h b/lib/classifier.h index f646a8f7429..f55a2cba998 100644 --- a/lib/classifier.h +++ b/lib/classifier.h @@ -299,6 +299,7 @@ * parallel to the rule's removal. 
*/ #include "cmap.h" +#include "hmapx.h" #include "openvswitch/match.h" #include "openvswitch/meta-flow.h" #include "pvector.h" @@ -398,7 +399,8 @@ static inline void classifier_publish(struct classifier *); * and each other. */ const struct cls_rule *classifier_lookup(const struct classifier *, ovs_version_t, struct flow *, - struct flow_wildcards *); + struct flow_wildcards *, + struct hmapx *conj_flows); bool classifier_rule_overlaps(const struct classifier *, const struct cls_rule *, ovs_version_t); const struct cls_rule *classifier_find_rule_exactly(const struct classifier *, diff --git a/lib/ovs-router.c b/lib/ovs-router.c index 7c04bb0e6b1..ca014d80ed3 100644 --- a/lib/ovs-router.c +++ b/lib/ovs-router.c @@ -115,7 +115,8 @@ ovs_router_lookup(uint32_t mark, const struct in6_addr *ip6_dst, const struct cls_rule *cr_src; struct flow flow_src = {.ipv6_dst = *src, .pkt_mark = mark}; - cr_src = classifier_lookup(&cls, OVS_VERSION_MAX, &flow_src, NULL); + cr_src = classifier_lookup(&cls, OVS_VERSION_MAX, &flow_src, NULL, + NULL); if (cr_src) { struct ovs_router_entry *p_src = ovs_router_entry_cast(cr_src); if (!p_src->local) { @@ -126,7 +127,7 @@ ovs_router_lookup(uint32_t mark, const struct in6_addr *ip6_dst, } } - cr = classifier_lookup(&cls, OVS_VERSION_MAX, &flow, NULL); + cr = classifier_lookup(&cls, OVS_VERSION_MAX, &flow, NULL, NULL); if (cr) { struct ovs_router_entry *p = ovs_router_entry_cast(cr); diff --git a/lib/tnl-ports.c b/lib/tnl-ports.c index f16409a0bf0..bb0b0b0c55f 100644 --- a/lib/tnl-ports.c +++ b/lib/tnl-ports.c @@ -112,7 +112,7 @@ map_insert(odp_port_t port, struct eth_addr mac, struct in6_addr *addr, tnl_port_init_flow(&match.flow, mac, addr, nw_proto, tp_port); do { - cr = classifier_lookup(&cls, OVS_VERSION_MAX, &match.flow, NULL); + cr = classifier_lookup(&cls, OVS_VERSION_MAX, &match.flow, NULL, NULL); p = tnl_port_cast(cr); /* Try again if the rule was released before we get the reference. 
*/ } while (p && !ovs_refcount_try_ref_rcu(&p->ref_cnt)); @@ -247,7 +247,7 @@ map_delete(struct eth_addr mac, struct in6_addr *addr, tnl_port_init_flow(&flow, mac, addr, nw_proto, tp_port); - cr = classifier_lookup(&cls, OVS_VERSION_MAX, &flow, NULL); + cr = classifier_lookup(&cls, OVS_VERSION_MAX, &flow, NULL, NULL); tnl_port_unref(cr); } @@ -305,7 +305,7 @@ odp_port_t tnl_port_map_lookup(struct flow *flow, struct flow_wildcards *wc) { const struct cls_rule *cr = classifier_lookup(&cls, OVS_VERSION_MAX, flow, - wc); + wc, NULL); return (cr) ? tnl_port_cast(cr)->portno : ODPP_NONE; } diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index e243773307b..289f8a7361d 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -229,6 +229,9 @@ struct xlate_ctx { * wants actions. */ struct ofpbuf *odp_actions; + /* Set of matching conjunctive flows, or NULL. */ + struct hmapx *conj_flows; + /* Statistics maintained by xlate_table_action(). * * These statistics limit the amount of work that a single flow @@ -866,6 +869,34 @@ xlate_report_action_set(const struct xlate_ctx *ctx, const char *verb) } } +static void +xlate_report_conj_matches(const struct xlate_ctx *ctx, + const struct ofputil_port_map *map) +{ + struct ds s = DS_EMPTY_INITIALIZER; + struct hmapx_node *node; + struct cls_rule *rule; + + /* NOTE: The conj flows have meaning in order. For each flow that is a + * component of conj flows, 'k' in 'conjunction(id, k/n)' represents the + * dimension. When there are multiple flows with the same id, it may be + * implicitly expected that they would be output in ascending order of 'k'. + * + * However, because of the use of hmapx structure and the fact that the + * classifier returns them in arbitrary order, they are output in arbitrary + * order here.
*/ + HMAPX_FOR_EACH (node, ctx->conj_flows) { + ds_clear(&s); + + rule = node->data; + + cls_rule_format(rule, ofproto_get_tun_tab(&ctx->xin->ofproto->up), + map, &s); + xlate_report(ctx, OFT_DETAIL, "conj. %s", ds_cstr(&s)); + } + + ds_destroy(&s); +} /* If tracing is enabled in 'ctx', appends a node representing 'rule' (in * OpenFlow table 'table_id') to the trace and makes this node the parent for @@ -882,6 +913,8 @@ xlate_report_table(const struct xlate_ctx *ctx, struct rule_dpif *rule, return; } + struct ofputil_port_map map = OFPUTIL_PORT_MAP_INITIALIZER(&map); + struct ds s = DS_EMPTY_INITIALIZER; ds_put_format(&s, "%2d. ", table_id); if (rule == ctx->xin->ofproto->miss_rule) { @@ -892,8 +925,6 @@ xlate_report_table(const struct xlate_ctx *ctx, struct rule_dpif *rule, ds_put_cstr(&s, "Packets are IP fragments and " "the fragment handling mode is \"drop\"."); } else { - struct ofputil_port_map map = OFPUTIL_PORT_MAP_INITIALIZER(&map); - if (ctx->xin->names) { struct ofproto_dpif *ofprotop; ofprotop = ofproto_dpif_lookup_by_name(ctx->xbridge->name); @@ -904,8 +935,6 @@ xlate_report_table(const struct xlate_ctx *ctx, struct rule_dpif *rule, ofproto_get_tun_tab(&ctx->xin->ofproto->up), &map, &s, OFP_DEFAULT_PRIORITY); - ofputil_port_map_destroy(&map); - if (ds_last(&s) != ' ') { ds_put_cstr(&s, ", "); } @@ -918,6 +947,9 @@ xlate_report_table(const struct xlate_ctx *ctx, struct rule_dpif *rule, ctx->xin->trace = &oftrace_report(ctx->xin->trace, OFT_TABLE, ds_cstr(&s))->subs; ds_destroy(&s); + + xlate_report_conj_matches(ctx, &map); + ofputil_port_map_destroy(&map); } /* If tracing is enabled in 'ctx', adds an OFT_DETAIL trace node to 'ctx' @@ -4653,7 +4685,7 @@ xlate_table_action(struct xlate_ctx *ctx, ofp_port_t in_port, uint8_t table_id, ctx->xin->resubmit_stats, &ctx->table_id, in_port, may_packet_in, honor_table_miss, - ctx->xin->xcache); + ctx->xin->xcache, ctx->conj_flows); /* Swap back. 
*/ if (with_ct_orig) { tuple_swap(&ctx->xin->flow, ctx->wc); @@ -4674,6 +4706,11 @@ xlate_table_action(struct xlate_ctx *ctx, ofp_port_t in_port, uint8_t table_id, struct ovs_list *old_trace = ctx->xin->trace; xlate_report_table(ctx, rule, table_id); + + if (OVS_UNLIKELY(ctx->xin->trace)) { + hmapx_clear(ctx->conj_flows); + } + xlate_recursively(ctx, rule, table_id <= old_table_id, is_last_action, xlator); ctx->xin->trace = old_trace; @@ -8044,6 +8081,13 @@ xlate_actions(struct xlate_in *xin, struct xlate_out *xout) COVERAGE_INC(xlate_actions); + ctx.conj_flows = NULL; + + if (OVS_UNLIKELY(xin->trace)) { + ctx.conj_flows = xzalloc(sizeof *ctx.conj_flows); + hmapx_init(ctx.conj_flows); + } + xin->trace = xlate_report(&ctx, OFT_BRIDGE, "bridge(\"%s\")", xbridge->name); if (xin->frozen_state) { @@ -8181,7 +8225,8 @@ xlate_actions(struct xlate_in *xin, struct xlate_out *xout) ctx.rule = rule_dpif_lookup_from_table( ctx.xbridge->ofproto, ctx.xin->tables_version, flow, ctx.wc, ctx.xin->resubmit_stats, &ctx.table_id, - flow->in_port.ofp_port, true, true, ctx.xin->xcache); + flow->in_port.ofp_port, true, true, ctx.xin->xcache, + ctx.conj_flows); if (ctx.xin->resubmit_stats) { rule_dpif_credit_stats(ctx.rule, ctx.xin->resubmit_stats, false); } @@ -8194,6 +8239,10 @@ xlate_actions(struct xlate_in *xin, struct xlate_out *xout) } xlate_report_table(&ctx, ctx.rule, ctx.table_id); + + if (OVS_UNLIKELY(ctx.xin->trace)) { + hmapx_clear(ctx.conj_flows); + } } /* Tunnel stats only for not-thawed packets. */ @@ -8375,6 +8424,12 @@ xlate_actions(struct xlate_in *xin, struct xlate_out *xout) ofpbuf_uninit(&scratch_actions); ofpbuf_delete(ctx.encap_data); + /* Clean up 'conj_flows' as it is no longer needed. */ + if (OVS_UNLIKELY(xin->trace)) { + hmapx_destroy(ctx.conj_flows); + free(ctx.conj_flows); + } + /* Make sure we return a "drop flow" in case of an error. 
*/ if (ctx.error) { xout->slow = 0; diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index 9e8faf82910..54e057d43ff 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -4383,15 +4383,20 @@ ofproto_dpif_get_tables_version(struct ofproto_dpif *ofproto) * a reference. * * 'flow' is non-const to allow for temporary modifications during the lookup. - * Any changes are restored before returning. */ + * Any changes are restored before returning. + * + * 'conj_flows' is an optional parameter. If it is non-null, the matching + * conjunctive flows are inserted. */ static struct rule_dpif * rule_dpif_lookup_in_table(struct ofproto_dpif *ofproto, ovs_version_t version, uint8_t table_id, struct flow *flow, - struct flow_wildcards *wc) + struct flow_wildcards *wc, + struct hmapx *conj_flows) { struct classifier *cls = &ofproto->up.tables[table_id].cls; return rule_dpif_cast(rule_from_cls_rule(classifier_lookup(cls, version, - flow, wc))); + flow, wc, + conj_flows))); } void @@ -4433,7 +4438,10 @@ ofproto_dpif_credit_table_stats(struct ofproto_dpif *ofproto, uint8_t table_id, * 'in_port'. This is needed for resubmit action support. * * 'flow' is non-const to allow for temporary modifications during the lookup. - * Any changes are restored before returning. */ + * Any changes are restored before returning. + * + * 'conj_flows' is an optional parameter. If it is non-null, the matching + * conjunctive flows are inserted. 
*/ struct rule_dpif * rule_dpif_lookup_from_table(struct ofproto_dpif *ofproto, ovs_version_t version, struct flow *flow, @@ -4441,7 +4449,8 @@ rule_dpif_lookup_from_table(struct ofproto_dpif *ofproto, const struct dpif_flow_stats *stats, uint8_t *table_id, ofp_port_t in_port, bool may_packet_in, bool honor_table_miss, - struct xlate_cache *xcache) + struct xlate_cache *xcache, + struct hmapx *conj_flows) { ovs_be16 old_tp_src = flow->tp_src, old_tp_dst = flow->tp_dst; ofp_port_t old_in_port = flow->in_port.ofp_port; @@ -4497,7 +4506,8 @@ rule_dpif_lookup_from_table(struct ofproto_dpif *ofproto, next_id++, next_id += (next_id == TBL_INTERNAL)) { *table_id = next_id; - rule = rule_dpif_lookup_in_table(ofproto, version, next_id, flow, wc); + rule = rule_dpif_lookup_in_table(ofproto, version, next_id, flow, wc, + conj_flows); if (stats) { struct oftable *tbl = &ofproto->up.tables[next_id]; unsigned long orig; @@ -6680,7 +6690,8 @@ ofproto_dpif_add_internal_flow(struct ofproto_dpif *ofproto, rule = rule_dpif_lookup_in_table(ofproto, ofproto_dpif_get_tables_version(ofproto), - TBL_INTERNAL, &match->flow, &match->wc); + TBL_INTERNAL, &match->flow, &match->wc, + NULL); if (rule) { *rulep = &rule->up; } else { diff --git a/ofproto/ofproto-dpif.h b/ofproto/ofproto-dpif.h index d8e0cd37ac5..1fe22ab41bd 100644 --- a/ofproto/ofproto-dpif.h +++ b/ofproto/ofproto-dpif.h @@ -103,7 +103,8 @@ struct rule_dpif *rule_dpif_lookup_from_table(struct ofproto_dpif *, ofp_port_t in_port, bool may_packet_in, bool honor_table_miss, - struct xlate_cache *); + struct xlate_cache *, + struct hmapx *conj_flows); void rule_dpif_credit_stats(struct rule_dpif *, const struct dpif_flow_stats *, bool); diff --git a/tests/classifier.at b/tests/classifier.at index de2705653e0..93a13f32b13 100644 --- a/tests/classifier.at +++ b/tests/classifier.at @@ -276,6 +276,13 @@ for src in 0 1 2 3 4 5 6 7; do AT_CHECK([ovs-appctl ofproto/trace br0 "in_port=1,dl_type=0x0800,nw_src=10.0.0.$src,nw_dst=10.0.0.$dst"], 
[0], [stdout]) AT_CHECK_UNQUOTED([tail -1 stdout], [0], [Datapath actions: $out ]) + dnl Check detailed output for conjunctive match. + if test $out = 3; then + AT_CHECK_UNQUOTED([cat stdout | grep conj\\. | sort], [0], [dnl + -> conj. priority=100,ip,nw_dst=10.0.0.$dst + -> conj. priority=100,ip,nw_src=10.0.0.$src +]) + fi done done OVS_VSWITCHD_STOP @@ -418,6 +425,98 @@ ovs-ofctl: "conjunction" actions may be used along with "note" but not any other OVS_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([conjunctive match with same priority]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 +AT_DATA([flows.txt], [dnl +conj_id=1,actions=2 +conj_id=2,actions=drop + +priority=10,ip,ip_dst=10.0.0.1,actions=conjunction(1,1/2) +priority=10,ip,ip_src=10.0.0.2,actions=conjunction(1,2/2) +priority=10,ip,ip_dst=10.0.0.3,actions=conjunction(2,1/2) +priority=10,ip,in_port=1,actions=conjunction(2,2/2) +]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) +# Check that "priority=10,ip,in_port=1,actions=conjunction(2,2/2)" is +# correctly excluded from the output. +AT_CHECK([ovs-appctl ofproto/trace br0 "in_port=1,dl_type=0x0800,nw_dst=10.0.0.1,nw_src=10.0.0.2" | grep conj\\. | sort], [0], [dnl + -> conj. priority=10,ip,nw_dst=10.0.0.1 + -> conj. priority=10,ip,nw_src=10.0.0.2 +]) +OVS_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([conjunctive match with metadata]) +OVS_VSWITCHD_START +AT_CHECK([ovs-ofctl add-tlv-map br0 "{class=0xffff,type=0,len=4}->tun_metadata0"]) +AT_CHECK([ovs-ofctl add-tlv-map br0 "{class=0xffff,type=1,len=8}->tun_metadata1"]) +AT_DATA([flows.txt], [dnl +conj_id=7,actions=drop + +priority=5,tun_metadata0=0x1,actions=conjunction(7,1/2) +priority=5,tun_metadata1=0x2,actions=conjunction(7,2/2) +]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) +# Check that tunnel metadata is included in the output. +AT_CHECK([ovs-appctl ofproto/trace br0 "tun_metadata0=0x1,tun_metadata1=0x2,in_port=br0" | grep conj\\. | sort], [0], [dnl + -> conj. priority=5,tun_metadata0=0x1 + -> conj. 
priority=5,tun_metadata1=0x2 +]) +OVS_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([conjunctive match with or without port map]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 +AT_DATA([flows.txt], [dnl +conj_id=1,actions=drop +conj_id=2,actions=drop + +priority=10,ip,actions=conjunction(1,1/2),conjunction(2,1/2) +priority=10,in_port=p1,actions=conjunction(1,2/2) +priority=10,in_port=p2,actions=conjunction(1,2/2) +]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) +AT_CHECK([ovs-appctl ofproto/trace br0 "ip,in_port=p1" --names | grep conj\\. | sort], [0], [dnl + -> conj. priority=10,in_port=p1 + -> conj. priority=10,ip +]) +AT_CHECK([ovs-appctl ofproto/trace br0 "ip,in_port=p2" | grep conj\\. | sort], [0], [dnl + -> conj. priority=10,in_port=2 + -> conj. priority=10,ip +]) +OVS_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([conjunctive match with resubmit]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 +AT_DATA([flows.txt], [dnl +conj_id=1,actions=resubmit(,2) +priority=10,ip,actions=conjunction(1,1/2) +priority=10,in_port=p1,actions=conjunction(1,2/2) +priority=10,in_port=p2,actions=conjunction(1,2/2) + +table=2,conj_id=7,actions=resubmit(,3) +table=2,priority=20,ip,actions=conjunction(7,1/2) +table=2,priority=20,in_port=p1,actions=conjunction(7,2/2) +table=2,priority=20,in_port=p2,actions=conjunction(7,2/2) + +table=3,actions=drop +]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) +# Check that conj_flows are reset for each table and that they are output +# exactly once. +AT_CHECK([ovs-appctl ofproto/trace br0 "ip,in_port=p1" --names | grep conj\\. | sort], [0], [dnl + -> conj. priority=10,in_port=p1 + -> conj. priority=10,ip + -> conj. priority=20,in_port=p1 + -> conj. priority=20,ip +]) +OVS_VSWITCHD_STOP +AT_CLEANUP + # Flow classifier a packet with excess of padding. 
AT_SETUP([flow classifier - packet with extra padding]) OVS_VSWITCHD_START diff --git a/tests/test-classifier.c b/tests/test-classifier.c index cff00c8fa35..2c1604a01e2 100644 --- a/tests/test-classifier.c +++ b/tests/test-classifier.c @@ -441,7 +441,7 @@ compare_classifiers(struct classifier *cls, size_t n_invisible_rules, /* This assertion is here to suppress a GCC 4.9 array-bounds warning */ ovs_assert(cls->n_tries <= CLS_MAX_TRIES); - cr0 = classifier_lookup(cls, version, &flow, &wc); + cr0 = classifier_lookup(cls, version, &flow, &wc, NULL); cr1 = tcls_lookup(tcls, &flow); assert((cr0 == NULL) == (cr1 == NULL)); if (cr0 != NULL) { @@ -454,7 +454,7 @@ compare_classifiers(struct classifier *cls, size_t n_invisible_rules, /* Make sure the rule should have been visible. */ assert(cls_rule_visible_in_version(cr0, version)); } - cr2 = classifier_lookup(cls, version, &flow, NULL); + cr2 = classifier_lookup(cls, version, &flow, NULL, NULL); assert(cr2 == cr0); } } @@ -1370,10 +1370,10 @@ lookup_classifier(void *aux_) if (aux->use_wc) { flow_wildcards_init_catchall(&wc); cr = classifier_lookup(aux->cls, version, &aux->lookup_flows[x], - &wc); + &wc, NULL); } else { cr = classifier_lookup(aux->cls, version, &aux->lookup_flows[x], - NULL); + NULL, NULL); } if (cr) { hits++; From 209667c0eef601978e9a82b5fd4b3b116b6751ca Mon Sep 17 00:00:00 2001 From: David Marchand Date: Mon, 20 Nov 2023 16:56:39 +0100 Subject: [PATCH 437/833] system-dpdk: Introduce helpers for testpmd. Rather than copy/paste everywhere, introduce helpers to control testpmd runs. Rely on --stats-period (which outputs port stats every n seconds) so that testpmd keeps running without expecting any user input. 
Acked-by: Aaron Conole Acked-by: Eelco Chaudron Signed-off-by: David Marchand Signed-off-by: Simon Horman --- tests/system-dpdk-macros.at | 38 +++++++++++++ tests/system-dpdk.at | 103 +++++++++--------------------------- 2 files changed, 62 insertions(+), 79 deletions(-) diff --git a/tests/system-dpdk-macros.at b/tests/system-dpdk-macros.at index 3920f08a5ed..2cfd26d840d 100644 --- a/tests/system-dpdk-macros.at +++ b/tests/system-dpdk-macros.at @@ -79,3 +79,41 @@ m4_define([OVS_DPDK_START_VSWITCHD], AT_CAPTURE_FILE([ovs-vswitchd.log]) on_exit "kill_ovs_vswitchd `cat ovs-vswitchd.pid`" ]) + + +# OVS_DPDK_CHECK_TESTPMD() +# +# Check dpdk-testpmd availability. +# +m4_define([OVS_DPDK_CHECK_TESTPMD], + [AT_SKIP_IF([! which dpdk-testpmd >/dev/null 2>/dev/null]) +]) + + +# OVS_DPDK_START_TESTPMD() +# +# Start dpdk-testpmd in background. +# +m4_define([OVS_DPDK_START_TESTPMD], + [AT_CHECK([lscpu], [], [stdout]) + AT_CHECK([cat stdout | grep "NUMA node(s)" | awk '{c=1; while (c++<$(3)) {printf "512,"}; print "512"}' > NUMA_NODE]) + eal_options="--socket-mem="$(cat NUMA_NODE)" --file-prefix page0 --single-file-segments --no-pci" + options="$1" + test "$options" != "${options%% -- *}" || options="$options -- " + eal_options="$eal_options ${options%% -- *}" + testpmd_options="-a --stats-period 2 ${options#* -- }" + echo "dpdk-testpmd $eal_options -- $testpmd_options" >testpmd.cmd + dpdk-testpmd $eal_options -- $testpmd_options >testpmd.log 2>&1 & \ + echo $! > testpmd.pid + on_exit "kill -9 `cat testpmd.pid`" +]) + + +# OVS_DPDK_STOP_TESTPMD() +# +# Stop background dpdk-testpmd. 
+# +m4_define([OVS_DPDK_STOP_TESTPMD], + [AT_CHECK([kill `cat testpmd.pid`]) + OVS_WAIT([kill -0 `cat testpmd.pid`], [kill -9 `cat testpmd.pid`]) +]) diff --git a/tests/system-dpdk.at b/tests/system-dpdk.at index fd42aed0b38..ab232e06da1 100644 --- a/tests/system-dpdk.at +++ b/tests/system-dpdk.at @@ -97,13 +97,9 @@ dnl Ping vhost-user port AT_SETUP([OVS-DPDK - ping vhost-user ports]) AT_KEYWORDS([dpdk]) OVS_DPDK_PRE_CHECK() -AT_SKIP_IF([! which dpdk-testpmd >/dev/null 2>/dev/null]) +OVS_DPDK_CHECK_TESTPMD() OVS_DPDK_START([--no-pci]) -dnl Find number of sockets -AT_CHECK([lscpu], [], [stdout]) -AT_CHECK([cat stdout | grep "NUMA node(s)" | awk '{c=1; while (c++<$(3)) {printf "512,"}; print "512"}' > NUMA_NODE]) - dnl Add userspace bridge and attach it to OVS AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuser0 -- set Interface dpdkvhostuser0 \ @@ -125,12 +121,8 @@ ADD_NAMESPACES(ns1, ns2) dnl Add veth device ADD_VETH(tap1, ns2, br10, "172.31.110.12/24") -dnl Execute testpmd in background -on_exit "pkill -f -x -9 'tail -f /dev/null'" -tail -f /dev/null | dpdk-testpmd --socket-mem="$(cat NUMA_NODE)" --no-pci\ - --vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostuser0" \ - --vdev="net_tap0,iface=tap0" --file-prefix page0 \ - --single-file-segments -- -a >$OVS_RUNDIR/testpmd-dpdkvhostuser0.log 2>&1 & +OVS_DPDK_START_TESTPMD([--vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostuser0" \ + --vdev="net_tap0,iface=tap0"]) OVS_WAIT_UNTIL([grep "virtio is now ready for processing" ovs-vswitchd.log]) OVS_WAIT_UNTIL([ip link show dev tap0 | grep -qw LOWER_UP]) @@ -151,8 +143,7 @@ AT_CHECK([ip netns exec ns2 ip link show], [], [stdout], [stderr]) AT_CHECK([ip netns exec ns1 ping -c 4 -I tap0 172.31.110.12], [], [stdout], [stderr]) -dnl Clean up the testpmd now -pkill -f -x -9 'tail -f /dev/null' +OVS_DPDK_STOP_TESTPMD() dnl Wait for vhost-user handling the socket disconnect. 
OVS_WAIT_UNTIL([grep "vHost Device '$OVS_RUNDIR/dpdkvhostuser0' has been removed" ovs-vswitchd.log]) @@ -173,13 +164,9 @@ dnl Ping vhost-user-client port AT_SETUP([OVS-DPDK - ping vhost-user-client ports]) AT_KEYWORDS([dpdk]) OVS_DPDK_PRE_CHECK() -AT_SKIP_IF([! which dpdk-testpmd >/dev/null 2>/dev/null]) +OVS_DPDK_CHECK_TESTPMD() OVS_DPDK_START([--no-pci]) -dnl Find number of sockets -AT_CHECK([lscpu], [], [stdout]) -AT_CHECK([cat stdout | grep "NUMA node(s)" | awk '{c=1; while (c++<$(3)) {printf "512,"}; print "512"}' > NUMA_NODE]) - dnl Add userspace bridge and attach it to OVS AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface \ @@ -200,13 +187,8 @@ ADD_NAMESPACES(ns1, ns2) dnl Add veth device ADD_VETH(tap1, ns2, br10, "172.31.110.12/24") -dnl Execute testpmd in background -on_exit "pkill -f -x -9 'tail -f /dev/null'" -tail -f /dev/null | dpdk-testpmd --socket-mem="$(cat NUMA_NODE)" --no-pci\ - --vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,queues=2,server=1" \ - --vdev="net_tap0,iface=tap0" --file-prefix page0 \ - --single-file-segments -- -a --nb-cores 2 --rxq 2 --txq 2 \ - >$OVS_RUNDIR/testpmd-dpdkvhostuserclient0.log 2>&1 & +OVS_DPDK_START_TESTPMD([--vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,queues=2,server=1" \ + --vdev="net_tap0,iface=tap0" -- --nb-cores 2 --rxq 2 --txq 2]) OVS_WAIT_UNTIL([grep "virtio is now ready for processing" ovs-vswitchd.log]) OVS_WAIT_UNTIL([ip link show dev tap0 | grep -qw LOWER_UP]) @@ -251,8 +233,7 @@ AT_CHECK([test `ovs-vsctl get interface dpdkvhostuserclient0 statistics:tx_bytes $((`ovs-vsctl get interface dpdkvhostuserclient0 statistics:tx_q0_good_bytes` + dnl `ovs-vsctl get interface dpdkvhostuserclient0 statistics:tx_q1_good_bytes`))]) -dnl Clean up the testpmd now -pkill -f -x -9 'tail -f /dev/null' +OVS_DPDK_STOP_TESTPMD() dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], 
[stderr]) @@ -663,14 +644,10 @@ dnl MTU increase vport port AT_SETUP([OVS-DPDK - MTU increase vport port]) AT_KEYWORDS([dpdk]) -AT_SKIP_IF([! which dpdk-testpmd >/dev/null 2>/dev/null]) +OVS_DPDK_CHECK_TESTPMD() OVS_DPDK_PRE_CHECK() OVS_DPDK_START([--no-pci]) -dnl Find number of sockets -AT_CHECK([lscpu], [], [stdout]) -AT_CHECK([cat stdout | grep "NUMA node(s)" | awk '{c=1; while (c++<$(3)) {printf "512,"}; print "512"}' > NUMA_NODE]) - dnl Add userspace bridge and attach it to OVS with default MTU value AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface dpdkvhostuserclient0 type=dpdkvhostuserclient options:vhost-server-path=$OVS_RUNDIR/dpdkvhostclient0], [], [stdout], [stderr]) @@ -682,12 +659,8 @@ AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." 
ovs-vswitchd.log], [], [stdout]) -dnl Execute testpmd in background -on_exit "pkill -f -x -9 'tail -f /dev/null'" -tail -f /dev/null | dpdk-testpmd --socket-mem="$(cat NUMA_NODE)" --no-pci\ - --vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1" \ - --vdev="net_tap0,iface=tap0" --file-prefix page0 \ - --single-file-segments -- -a >$OVS_RUNDIR/testpmd-dpdkvhostuserclient0.log 2>&1 & +OVS_DPDK_START_TESTPMD([--vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1" \ + --vdev="net_tap0,iface=tap0"]) OVS_WAIT_UNTIL([grep "virtio is now ready for processing" ovs-vswitchd.log]) OVS_WAIT_UNTIL([ovs-vsctl get Interface dpdkvhostuserclient0 link_state | grep -w up]) @@ -704,8 +677,7 @@ AT_CHECK([ovs-vsctl get Interface dpdkvhostuserclient0 mtu], [0], [dnl 9000 ]) -dnl Clean up the testpmd now -pkill -f -x -9 'tail -f /dev/null' +OVS_DPDK_STOP_TESTPMD() dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) @@ -722,14 +694,10 @@ dnl MTU decrease vport port AT_SETUP([OVS-DPDK - MTU decrease vport port]) AT_KEYWORDS([dpdk]) -AT_SKIP_IF([! 
which dpdk-testpmd >/dev/null 2>/dev/null]) +OVS_DPDK_CHECK_TESTPMD() OVS_DPDK_PRE_CHECK() OVS_DPDK_START([--no-pci]) -dnl Find number of sockets -AT_CHECK([lscpu], [], [stdout]) -AT_CHECK([cat stdout | grep "NUMA node(s)" | awk '{c=1; while (c++<$(3)) {printf "512,"}; print "512"}' > NUMA_NODE]) - dnl Add userspace bridge and attach it to OVS and modify MTU value AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface dpdkvhostuserclient0 type=dpdkvhostuserclient options:vhost-server-path=$OVS_RUNDIR/dpdkvhostclient0], [], [stdout], [stderr]) @@ -742,12 +710,8 @@ AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." ovs-vswitchd.log], [], [stdout]) -dnl Execute testpmd in background -on_exit "pkill -f -x -9 'tail -f /dev/null'" -tail -f /dev/null | dpdk-testpmd --socket-mem="$(cat NUMA_NODE)" --no-pci\ - --vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1" \ - --vdev="net_tap0,iface=tap0" --file-prefix page0 \ - --single-file-segments -- -a >$OVS_RUNDIR/testpmd-dpdkvhostuserclient0.log 2>&1 & +OVS_DPDK_START_TESTPMD([--vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1" \ + --vdev="net_tap0,iface=tap0"]) OVS_WAIT_UNTIL([grep "virtio is now ready for processing" ovs-vswitchd.log]) OVS_WAIT_UNTIL([ovs-vsctl get Interface dpdkvhostuserclient0 link_state | grep -w up]) @@ -764,8 +728,7 @@ AT_CHECK([ovs-vsctl get Interface dpdkvhostuserclient0 mtu], [0], [dnl 2000 ]) -dnl Clean up the testpmd now -pkill -f -x -9 'tail -f /dev/null' +OVS_DPDK_STOP_TESTPMD() dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) @@ -866,14 +829,10 @@ dnl MTU upper bound vport port 
AT_SETUP([OVS-DPDK - MTU upper bound vport port]) AT_KEYWORDS([dpdk]) -AT_SKIP_IF([! which dpdk-testpmd >/dev/null 2>/dev/null]) +OVS_DPDK_CHECK_TESTPMD() OVS_DPDK_PRE_CHECK() OVS_DPDK_START([--no-pci]) -dnl Find number of sockets -AT_CHECK([lscpu], [], [stdout]) -AT_CHECK([cat stdout | grep "NUMA node(s)" | awk '{c=1; while (c++<$(3)) {printf "512,"}; print "512"}' > NUMA_NODE]) - dnl Add userspace bridge and attach it to OVS and set MTU value to max upper bound AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface dpdkvhostuserclient0 type=dpdkvhostuserclient options:vhost-server-path=$OVS_RUNDIR/dpdkvhostclient0], [], [stdout], [stderr]) @@ -881,12 +840,8 @@ AT_CHECK([ovs-vsctl set Interface dpdkvhostuserclient0 mtu_request=9702]) AT_CHECK([ovs-vsctl show], [], [stdout]) sleep 2 -dnl Execute testpmd in background -on_exit "pkill -f -x -9 'tail -f /dev/null'" -tail -f /dev/null | dpdk-testpmd --socket-mem="$(cat NUMA_NODE)" --no-pci\ - --vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1" \ - --vdev="net_tap0,iface=tap0" --file-prefix page0 \ - --single-file-segments -- -a >$OVS_RUNDIR/testpmd-dpdkvhostuserclient0.log 2>&1 & +OVS_DPDK_START_TESTPMD([--vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1" \ + --vdev="net_tap0,iface=tap0"]) OVS_WAIT_UNTIL([grep "virtio is now ready for processing" ovs-vswitchd.log]) OVS_WAIT_UNTIL([ovs-vsctl get Interface dpdkvhostuserclient0 link_state | grep -w up]) @@ -900,8 +855,7 @@ dnl Set MTU value above upper bound and check for error AT_CHECK([ovs-vsctl set Interface dpdkvhostuserclient0 mtu_request=9711]) AT_CHECK([grep "dpdkvhostuserclient0: unsupported MTU 9711" ovs-vswitchd.log], [], [stdout]) -dnl Clean up the testpmd now -pkill -f -x -9 'tail -f /dev/null' +OVS_DPDK_STOP_TESTPMD() dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) @@ -920,14 +874,10 @@ dnl 
MTU lower bound vport port AT_SETUP([OVS-DPDK - MTU lower bound vport port]) AT_KEYWORDS([dpdk]) -AT_SKIP_IF([! which dpdk-testpmd >/dev/null 2>/dev/null]) +OVS_DPDK_CHECK_TESTPMD() OVS_DPDK_PRE_CHECK() OVS_DPDK_START([--no-pci]) -dnl Find number of sockets -AT_CHECK([lscpu], [], [stdout]) -AT_CHECK([cat stdout | grep "NUMA node(s)" | awk '{c=1; while (c++<$(3)) {printf "512,"}; print "512"}' > NUMA_NODE]) - dnl Add userspace bridge and attach it to OVS and set MTU value to min lower bound AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface dpdkvhostuserclient0 type=dpdkvhostuserclient options:vhost-server-path=$OVS_RUNDIR/dpdkvhostclient0], [], [stdout], [stderr]) @@ -940,12 +890,8 @@ AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." 
ovs-vswitchd.log], [], [stdout]) -dnl Execute testpmd in background -on_exit "pkill -f -x -9 'tail -f /dev/null'" -tail -f /dev/null | dpdk-testpmd --socket-mem="$(cat NUMA_NODE)" --no-pci\ - --vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1" \ - --vdev="net_tap0,iface=tap0" --file-prefix page0 \ - --single-file-segments -- -a >$OVS_RUNDIR/testpmd-dpdkvhostuserclient0.log 2>&1 & +OVS_DPDK_START_TESTPMD([--vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1" \ + --vdev="net_tap0,iface=tap0"]) OVS_WAIT_UNTIL([grep "virtio is now ready for processing" ovs-vswitchd.log]) OVS_WAIT_UNTIL([ovs-vsctl get Interface dpdkvhostuserclient0 link_state | grep -w up]) @@ -959,8 +905,7 @@ dnl Set MTU value below lower bound and check for error AT_CHECK([ovs-vsctl set Interface dpdkvhostuserclient0 mtu_request=67]) AT_CHECK([grep "dpdkvhostuserclient0: unsupported MTU 67" ovs-vswitchd.log], [], [stdout]) -dnl Clean up the testpmd now -pkill -f -x -9 'tail -f /dev/null' +OVS_DPDK_STOP_TESTPMD() dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) From c488f28a0eaf6e5735924481075eecc8cc3bce90 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Mon, 20 Nov 2023 16:56:40 +0100 Subject: [PATCH 438/833] system-dpdk: Don't require hugetlbfs. dpdk-testpmd does not need hugetlbfs backing as we don't require multiprocess support in OVS unit tests. Switch to --in-memory and remove the (then unneeded) check on hugetlbfs presence. 
Acked-by: Aaron Conole Acked-by: Eelco Chaudron Signed-off-by: David Marchand Signed-off-by: Simon Horman --- tests/system-dpdk-macros.at | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/system-dpdk-macros.at b/tests/system-dpdk-macros.at index 2cfd26d840d..a176a57a4bc 100644 --- a/tests/system-dpdk-macros.at +++ b/tests/system-dpdk-macros.at @@ -7,9 +7,6 @@ m4_define([OVS_DPDK_PRE_CHECK], [dnl Check Hugepages AT_CHECK([cat /proc/meminfo], [], [stdout]) AT_SKIP_IF([grep -E 'HugePages_Free: *0' stdout], [], [stdout]) - AT_CHECK([mount], [], [stdout]) - AT_CHECK([grep 'hugetlbfs' stdout], [], [stdout], []) - ]) @@ -97,7 +94,7 @@ m4_define([OVS_DPDK_CHECK_TESTPMD], m4_define([OVS_DPDK_START_TESTPMD], [AT_CHECK([lscpu], [], [stdout]) AT_CHECK([cat stdout | grep "NUMA node(s)" | awk '{c=1; while (c++<$(3)) {printf "512,"}; print "512"}' > NUMA_NODE]) - eal_options="--socket-mem="$(cat NUMA_NODE)" --file-prefix page0 --single-file-segments --no-pci" + eal_options="--in-memory --socket-mem="$(cat NUMA_NODE)" --single-file-segments --no-pci" options="$1" test "$options" != "${options%% -- *}" || options="$options -- " eal_options="$eal_options ${options%% -- *}" From ab3eca6122ad4233557b9596b84f90623e5493c1 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Mon, 20 Nov 2023 16:56:41 +0100 Subject: [PATCH 439/833] ci: Run DPDK tests in GitHub Actions. Let's enhance our coverage in the CI and run DPDK system tests. A few DPDK drivers are enabled in DPDK compilation. Put DPDK build in $PATH for dpdk-testpmd to be available. sudo drops PATH= updates and -E alone does not seem to preserve this variable. Pass PATH=$PATH when running the tests, as a workaround. Since those tests are run as root, the collection of logs is updated accordingly. In GHA, only two cores are available but some tests rely on testpmd using three lcores.
Add a DPDK_EAL_OPTIONS environment variable and use it to map all testpmd lcores to core 1 (and leave core 0 alone for OVS main and PMD threads). Signed-off-by: David Marchand Acked-by: Aaron Conole Acked-by: Eelco Chaudron Signed-off-by: Simon Horman --- .ci/dpdk-build.sh | 7 ++++--- .ci/linux-build.sh | 15 ++++++++++++++- .github/workflows/build-and-test.yml | 7 ++++--- tests/system-dpdk-macros.at | 2 +- 4 files changed, 23 insertions(+), 8 deletions(-) diff --git a/.ci/dpdk-build.sh b/.ci/dpdk-build.sh index 02dcefef618..35540f0694b 100755 --- a/.ci/dpdk-build.sh +++ b/.ci/dpdk-build.sh @@ -35,9 +35,10 @@ function build_dpdk() DPDK_OPTS="$DPDK_OPTS -Ddeveloper_mode=disabled" # OVS compilation and "normal" unit tests (run in the CI) do not depend on - # any DPDK driver being present. - # We can disable all drivers to save compilation time. - DPDK_OPTS="$DPDK_OPTS -Ddisable_drivers=*/*" + # any DPDK driver. + # check-dpdk unit tests requires testpmd and some net/ driver. + DPDK_OPTS="$DPDK_OPTS -Denable_apps=test-pmd" + DPDK_OPTS="$DPDK_OPTS -Denable_drivers=net/null,net/tap,net/virtio" # Install DPDK using prefix. DPDK_OPTS="$DPDK_OPTS --prefix=$(pwd)/build" diff --git a/.ci/linux-build.sh b/.ci/linux-build.sh index 8227a574870..aa2ecc50507 100755 --- a/.ci/linux-build.sh +++ b/.ci/linux-build.sh @@ -22,6 +22,9 @@ function install_dpdk() # Export the following path for pkg-config to find the .pc file. export PKG_CONFIG_PATH=$DPDK_LIB/pkgconfig/:$PKG_CONFIG_PATH + # Expose dpdk binaries. + export PATH=$(pwd)/dpdk-dir/build/bin:$PATH + if [ ! -f "${VERSION_FILE}" ]; then echo "Could not find DPDK in $(pwd)/dpdk-dir" return 1 @@ -113,7 +116,7 @@ fi OPTS="${EXTRA_OPTS} ${OPTS} $*" -if [ "$TESTSUITE" ]; then +if [ "$TESTSUITE" = 'test' ]; then # 'distcheck' will reconfigure with required options. # Now we only need to prepare the Makefile without sparse-wrapped CC. 
configure_ovs @@ -123,6 +126,16 @@ if [ "$TESTSUITE" ]; then TESTSUITEFLAGS=-j4 RECHECK=yes else build_ovs + for testsuite in $TESTSUITE; do + run_as_root= + if [ "${testsuite##*dpdk}" != "$testsuite" ]; then + sudo sh -c 'echo 1024 > /proc/sys/vm/nr_hugepages' || true + [ "$(cat /proc/sys/vm/nr_hugepages)" = '1024' ] + export DPDK_EAL_OPTIONS="--lcores 0@1,1@1,2@1" + run_as_root="sudo -E PATH=$PATH" + fi + $run_as_root make $testsuite TESTSUITEFLAGS=-j4 RECHECK=yes + done fi exit 0 diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index bc5494e863b..4f62efb7c33 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -123,10 +123,10 @@ jobs: opts: --enable-shared - compiler: gcc - testsuite: test + testsuite: check check-dpdk dpdk: dpdk - compiler: clang - testsuite: test + testsuite: check check-dpdk dpdk: dpdk - compiler: gcc @@ -213,7 +213,8 @@ jobs: mkdir logs cp config.log ./logs/ cp -r ./*/_build/sub/tests/testsuite.* ./logs/ || true - tar -czvf logs.tgz logs/ + sudo cp -r ./tests/*testsuite.* ./logs/ || true + sudo tar -czvf logs.tgz logs/ - name: upload logs on failure if: failure() || cancelled() diff --git a/tests/system-dpdk-macros.at b/tests/system-dpdk-macros.at index a176a57a4bc..35d14bee8f7 100644 --- a/tests/system-dpdk-macros.at +++ b/tests/system-dpdk-macros.at @@ -94,7 +94,7 @@ m4_define([OVS_DPDK_CHECK_TESTPMD], m4_define([OVS_DPDK_START_TESTPMD], [AT_CHECK([lscpu], [], [stdout]) AT_CHECK([cat stdout | grep "NUMA node(s)" | awk '{c=1; while (c++<$(3)) {printf "512,"}; print "512"}' > NUMA_NODE]) - eal_options="--in-memory --socket-mem="$(cat NUMA_NODE)" --single-file-segments --no-pci" + eal_options="$DPDK_EAL_OPTIONS --in-memory --socket-mem="$(cat NUMA_NODE)" --single-file-segments --no-pci" options="$1" test "$options" != "${options%% -- *}" || options="$options -- " eal_options="$eal_options ${options%% -- *}" From e6dd50d6154969496ed64205fa756b1986c36ab9 Mon Sep 
17 00:00:00 2001 From: David Marchand Date: Mon, 20 Nov 2023 16:56:42 +0100 Subject: [PATCH 440/833] tests: Define a macro to skip tc relying tests. Some unit tests expect that an OVS port has an associated netdevice on which they can hook tc. This will not be possible when testing the userspace datapath with DPDK. Introduce a helper (which will be overridden in system-dpdk tests) and use it in the existing tests. Acked-by: Aaron Conole Signed-off-by: David Marchand Acked-by: Eelco Chaudron Signed-off-by: Simon Horman --- tests/system-common-macros.at | 6 ++++++ tests/system-offloads-traffic.at | 6 +++--- tests/system-traffic.at | 6 +++--- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/tests/system-common-macros.at b/tests/system-common-macros.at index 0077a8609c0..0113aae8bd2 100644 --- a/tests/system-common-macros.at +++ b/tests/system-common-macros.at @@ -297,6 +297,12 @@ m4_define([OVS_START_L7], # m4_define([OFPROTO_CLEAR_DURATION_IDLE], [[sed -e 's/duration=.*s,/duration=,/g' -e 's/idle_age=[0-9]*,/idle_age=,/g']]) +# OVS_CHECK_TC_QDISC() +# +# Macro to skip tests when tc qdisc can't be applied on a OVS port.
+m4_define([OVS_CHECK_TC_QDISC], + [AT_SKIP_IF([test $HAVE_TC = no])]) + # OVS_CHECK_TUNNEL_TSO() # # Macro to be used in general tunneling tests that could be also diff --git a/tests/system-offloads-traffic.at b/tests/system-offloads-traffic.at index 5ad6b4bfdf6..0bedee7530c 100644 --- a/tests/system-offloads-traffic.at +++ b/tests/system-offloads-traffic.at @@ -20,7 +20,7 @@ m4_define([OVS_CHECK_ACTIONS], [ m4_define([CHECK_TC_INGRESS_PPS], [ - AT_SKIP_IF([test $HAVE_TC = "no"]) + OVS_CHECK_TC_QDISC() AT_CHECK([ip link add ovs_tc_pps0 type veth peer name ovs_tc_pps1 dnl || exit 77]) on_exit 'ip link del ovs_tc_pps0' @@ -95,7 +95,7 @@ AT_CLEANUP AT_SETUP([offloads - set ingress_policing_rate and ingress_policing_burst - offloads disabled]) AT_KEYWORDS([ingress_policing]) -AT_SKIP_IF([test $HAVE_TC = "no"]) +OVS_CHECK_TC_QDISC() OVS_TRAFFIC_VSWITCHD_START() AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:hw-offload=false]) AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"]) @@ -118,7 +118,7 @@ AT_CLEANUP AT_SETUP([offloads - set ingress_policing_rate and ingress_policing_burst - offloads enabled]) AT_KEYWORDS([ingress_policing]) -AT_SKIP_IF([test $HAVE_TC = "no"]) +OVS_CHECK_TC_QDISC() OVS_TRAFFIC_VSWITCHD_START([], [], [-- set Open_vSwitch . 
other_config:hw-offload=true]) AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"]) ADD_NAMESPACES(at_ns0) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 7ea45020289..a7d4ed83bdc 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -2321,7 +2321,7 @@ AT_CLEANUP AT_BANNER([QoS]) AT_SETUP([QoS - basic configuration]) -AT_SKIP_IF([test $HAVE_TC = no]) +OVS_CHECK_TC_QDISC() OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(at_ns0, at_ns1) @@ -2355,7 +2355,7 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([QoS - 64bit]) -AT_SKIP_IF([test $HAVE_TC = no]) +OVS_CHECK_TC_QDISC() AT_SKIP_IF([test $HAVE_TCA_HTB_RATE64 = no]) OVS_TRAFFIC_VSWITCHD_START() @@ -2383,7 +2383,7 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([Ingress Policing - 64-bit]) -AT_SKIP_IF([test $HAVE_TC = no]) +OVS_CHECK_TC_QDISC() AT_SKIP_IF([test $HAVE_TCA_POLICE_PKTRATE64 = no]) OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(ns0) From 818217eafee83940fabcf54a71f4a7da8468373f Mon Sep 17 00:00:00 2001 From: David Marchand Date: Mon, 20 Nov 2023 16:56:43 +0100 Subject: [PATCH 441/833] system-dpdk: Refactor OVS daemons helpers. Align system-dpdk existing helpers to other common OVS helpers so they can accept some optional arguments. Introduce a OVS_DPDK_STOP_VSWITCHD wrapper around OVS_VSWITCHD_STOP to catch dpdk related logs in a centralised fashion. 
Signed-off-by: David Marchand Acked-by: Eelco Chaudron Signed-off-by: Simon Horman --- tests/system-dpdk-macros.at | 21 ++++- tests/system-dpdk.at | 158 +++++++++++++++--------------------- 2 files changed, 82 insertions(+), 97 deletions(-) diff --git a/tests/system-dpdk-macros.at b/tests/system-dpdk-macros.at index 35d14bee8f7..7fedfd65155 100644 --- a/tests/system-dpdk-macros.at +++ b/tests/system-dpdk-macros.at @@ -36,12 +36,13 @@ m4_define([OVS_DPDK_PRE_PHY_SKIP], # m4_define([OVS_DPDK_START], [dnl start ovs dpdk - OVS_DPDK_START_OVSDB() + OVS_DPDK_START_OVSDB($3) dnl Enable DPDK functionality AT_CHECK([ovs-vsctl --no-wait set Open_vSwitch . other_config:dpdk-init=true]) - OVS_DPDK_START_VSWITCHD($1) + OVS_DPDK_START_VSWITCHD([$1], [$2]) ]) + # OVS_DPDK_START_OVSDB() # # Create an empty database and start ovsdb-server. @@ -60,9 +61,10 @@ m4_define([OVS_DPDK_START_OVSDB], AT_CAPTURE_FILE([ovsdb-server.log]) dnl Initialize database. - AT_CHECK([ovs-vsctl --no-wait init]) + AT_CHECK([ovs-vsctl --no-wait init $1]) ]) + # OVS_DPDK_START_VSWITCHD() # # Add special configuration for dpdk-init. Start ovs-vswitchd. @@ -72,12 +74,23 @@ m4_define([OVS_DPDK_START_VSWITCHD], AT_CHECK([ovs-vsctl --no-wait set Open_vSwitch . other_config:dpdk-extra="--log-level=pmd.*:error $1"]) dnl Start ovs-vswitchd. - AT_CHECK([ovs-vswitchd --detach --no-chdir --pidfile --log-file -vvconn -vofproto_dpif -vunixctl], [0], [stdout], [stderr]) + AT_CHECK([ovs-vswitchd $2 --detach --no-chdir --pidfile --log-file -vvconn -vofproto_dpif -vunixctl], [0], [stdout], [stderr]) AT_CAPTURE_FILE([ovs-vswitchd.log]) on_exit "kill_ovs_vswitchd `cat ovs-vswitchd.pid`" ]) +m4_define([OVS_DPDK_STOP_VSWITCHD], + [OVS_VSWITCHD_STOP([dnl +$1";/does not exist. 
The Open vSwitch kernel module is probably not loaded./d +/does not support MTU configuration,/d +/EAL: No \(available\|free\) .*hugepages reported/d +/Failed to enable flow control/d +/Rx checksum offload is not supported on/d +/TELEMETRY: No legacy callbacks, legacy socket not created/d"]) +]) + + # OVS_DPDK_CHECK_TESTPMD() # # Check dpdk-testpmd availability. diff --git a/tests/system-dpdk.at b/tests/system-dpdk.at index ab232e06da1..f635d7f9251 100644 --- a/tests/system-dpdk.at +++ b/tests/system-dpdk.at @@ -3,15 +3,6 @@ m4_define([CONFIGURE_VETH_OFFLOADS], AT_BANNER([OVS-DPDK unit tests]) -m4_define([SYSTEM_DPDK_ALLOWED_LOGS],[ -\@does not exist. The Open vSwitch kernel module is probably not loaded.@d -\@does not support MTU configuration,@d -\@EAL: No \(available\|free\) .*hugepages reported@d -\@Failed to enable flow control@d -\@Rx checksum offload is not supported on@d -\@TELEMETRY: No legacy callbacks, legacy socket not created@d -]) - dnl CHECK_MEMPOOL_PARAM([mtu], [numa], [+line]) dnl dnl Waits for logs to indicate that the user has configured a mempool @@ -36,7 +27,7 @@ OVS_DPDK_START([--no-pci]) AT_CHECK([grep "DPDK Enabled - initializing..." ovs-vswitchd.log], [], [stdout]) AT_CHECK([grep "EAL" ovs-vswitchd.log], [], [stdout]) AT_CHECK([grep "DPDK Enabled - initialized" ovs-vswitchd.log], [], [stdout]) -OVS_VSWITCHD_STOP("[SYSTEM_DPDK_ALLOWED_LOGS]") +OVS_DPDK_STOP_VSWITCHD AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -58,7 +49,7 @@ sleep 2 dnl Clean up AT_CHECK([ovs-vsctl del-port br10 phy0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("[SYSTEM_DPDK_ALLOWED_LOGS]") +OVS_DPDK_STOP_VSWITCHD AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -84,9 +75,8 @@ AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." 
ov dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) failed to connect: No such file or directory@d -])") +OVS_DPDK_STOP_VSWITCHD(["dnl +/VHOST_CONFIG: (.*dpdkvhostclient0) failed to connect: No such file or directory/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -150,12 +140,11 @@ OVS_WAIT_UNTIL([grep "vHost Device '$OVS_RUNDIR/dpdkvhostuser0' has been removed dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuser0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostuser0) recvmsg failed@d -\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostuser0) failed to connect: No such file or directory@d -\@dpdkvhostuser ports are considered deprecated; please migrate to dpdkvhostuserclient ports.@d -\@failed to enumerate system datapaths: No such file or directory@d -])") +OVS_DPDK_STOP_VSWITCHD(["dnl +/VHOST_CONFIG: (.*dpdkvhostuser0) recvmsg failed/d +/VHOST_CONFIG: (.*dpdkvhostuser0) failed to connect: No such file or directory/d +/dpdkvhostuser ports are considered deprecated; please migrate to dpdkvhostuserclient ports./d +/failed to enumerate system datapaths: No such file or directory/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -237,12 +226,10 @@ OVS_DPDK_STOP_TESTPMD() dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) recvmsg failed@d -\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) failed to connect: No such file or directory@d -\@dpdkvhostuser ports are considered deprecated; please migrate to dpdkvhostuserclient ports.@d -\@failed to enumerate system datapaths: No such file or directory@d 
-])") +OVS_DPDK_STOP_VSWITCHD(["dnl +/VHOST_CONFIG: (.*dpdkvhostclient0) recvmsg failed/d +/VHOST_CONFIG: (.*dpdkvhostclient0) failed to connect: No such file or directory/d +/failed to enumerate system datapaths: No such file or directory/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -278,7 +265,7 @@ AT_CHECK([grep -E 'ingress_policing_rate: 0' stdout], [], [stdout]) dnl Clean up AT_CHECK([ovs-vsctl del-port br10 phy0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("[SYSTEM_DPDK_ALLOWED_LOGS]") +OVS_DPDK_STOP_VSWITCHD AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -319,9 +306,8 @@ AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." ov dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) failed to connect: No such file or directory@d -])") +OVS_DPDK_STOP_VSWITCHD(["dnl +/VHOST_CONFIG: (.*dpdkvhostclient0) failed to connect: No such file or directory/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -360,9 +346,8 @@ AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." ov dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) failed to connect: No such file or directory@d -])") +OVS_DPDK_STOP_VSWITCHD(["dnl +/VHOST_CONFIG: (.*dpdkvhostclient0) failed to connect: No such file or directory/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -400,9 +385,8 @@ AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." 
ov dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) failed to connect: No such file or directory@d -])") +OVS_DPDK_STOP_VSWITCHD(["dnl +/VHOST_CONFIG: (.*dpdkvhostclient0) failed to connect: No such file or directory/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -434,7 +418,7 @@ AT_CHECK([grep -E 'QoS not configured on phy0' stdout], [], [stdout]) dnl Clean up AT_CHECK([ovs-vsctl del-port br10 phy0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("[SYSTEM_DPDK_ALLOWED_LOGS]") +OVS_DPDK_STOP_VSWITCHD AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -473,9 +457,8 @@ AT_CHECK([grep -E 'QoS not configured on dpdkvhostuserclient0' stdout], [], [std dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) failed to connect: No such file or directory@d -])") +OVS_DPDK_STOP_VSWITCHD(["dnl +/VHOST_CONFIG: (.*dpdkvhostclient0) failed to connect: No such file or directory/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -506,11 +489,10 @@ AT_CHECK([grep -E 'QoS not configured on dpdkvhostuserclient0' stdout], [], [std dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) failed to connect: No such file or directory@d -\@Could not create rte meter for egress policer@d -\@Failed to set QoS type egress-policer on port dpdkvhostuserclient0: Invalid argument@d -])") +OVS_DPDK_STOP_VSWITCHD(["dnl +/VHOST_CONFIG: (.*dpdkvhostclient0) failed to connect: No such file or directory/d +/Could not create 
rte meter for egress policer/d +/Failed to set QoS type egress-policer on port dpdkvhostuserclient0: Invalid argument/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -541,11 +523,10 @@ AT_CHECK([grep -E 'QoS not configured on dpdkvhostuserclient0' stdout], [], [std dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) failed to connect: No such file or directory@d -\@Could not create rte meter for egress policer@d -\@Failed to set QoS type egress-policer on port dpdkvhostuserclient0: Invalid argument@d -])") +OVS_DPDK_STOP_VSWITCHD(["dnl +/VHOST_CONFIG: (.*dpdkvhostclient0) failed to connect: No such file or directory/d +/Could not create rte meter for egress policer/d +/Failed to set QoS type egress-policer on port dpdkvhostuserclient0: Invalid argument/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -589,7 +570,7 @@ AT_CHECK([ovs-vsctl get Interface phy0 mtu], [0], [dnl dnl Clean up AT_CHECK([ovs-vsctl del-port br10 phy0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("[SYSTEM_DPDK_ALLOWED_LOGS]") +OVS_DPDK_STOP_VSWITCHD AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -633,7 +614,7 @@ AT_CHECK([ovs-vsctl get Interface phy0 mtu], [0], [dnl dnl Clean up AT_CHECK([ovs-vsctl del-port br10 phy0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("[SYSTEM_DPDK_ALLOWED_LOGS]") +OVS_DPDK_STOP_VSWITCHD AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -681,9 +662,8 @@ OVS_DPDK_STOP_TESTPMD() dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) failed to connect: No such file or directory@d -])") 
+OVS_DPDK_STOP_VSWITCHD(["dnl +/VHOST_CONFIG: (.*dpdkvhostclient0) failed to connect: No such file or directory/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -732,9 +712,8 @@ OVS_DPDK_STOP_TESTPMD() dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) failed to connect: No such file or directory@d -])") +OVS_DPDK_STOP_VSWITCHD(["dnl +/VHOST_CONFIG: (.*dpdkvhostclient0) failed to connect: No such file or directory/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -773,10 +752,9 @@ AT_CHECK([grep "phy0: unsupported MTU 9711" ovs-vswitchd.log], [], [stdout]) dnl Clean up AT_CHECK([ovs-vsctl del-port br10 phy0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@phy0: unsupported MTU 9711@d -\@failed to set MTU for network device phy0: Invalid argument@d -])") +OVS_DPDK_STOP_VSWITCHD(["dnl +/phy0: unsupported MTU 9711/d +/failed to set MTU for network device phy0: Invalid argument/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -815,10 +793,9 @@ AT_CHECK([grep "phy0: unsupported MTU 67" ovs-vswitchd.log], [], [stdout]) dnl Clean up AT_CHECK([ovs-vsctl del-port br10 phy0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@phy0: unsupported MTU 67@d -\@failed to set MTU for network device phy0: Invalid argument@d -])") +OVS_DPDK_STOP_VSWITCHD(["dnl +/phy0: unsupported MTU 67/d +/failed to set MTU for network device phy0: Invalid argument/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -859,11 +836,10 @@ OVS_DPDK_STOP_TESTPMD() dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], 
[SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) failed to connect: No such file or directory@d -\@dpdkvhostuserclient0: unsupported MTU 9711@d -\@failed to set MTU for network device dpdkvhostuserclient0: Invalid argument@d -])") +OVS_DPDK_STOP_VSWITCHD(["dnl +/VHOST_CONFIG: (.*dpdkvhostclient0) failed to connect: No such file or directory/d +/dpdkvhostuserclient0: unsupported MTU 9711/d +/failed to set MTU for network device dpdkvhostuserclient0: Invalid argument/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -909,11 +885,10 @@ OVS_DPDK_STOP_TESTPMD() dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) failed to connect: No such file or directory@d -\@dpdkvhostuserclient0: unsupported MTU 67@d -\@failed to set MTU for network device dpdkvhostuserclient0: Invalid argument@d -])") +OVS_DPDK_STOP_VSWITCHD(["dnl +/VHOST_CONFIG: (.*dpdkvhostclient0) failed to connect: No such file or directory/d +/dpdkvhostuserclient0: unsupported MTU 67/d +/failed to set MTU for network device dpdkvhostuserclient0: Invalid argument/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -948,7 +923,7 @@ OVS_WAIT_UNTIL([test `ovs-vsctl get interface p1 statistics | grep -oP 'rx_packe dnl Clean up AT_CHECK([ovs-vsctl del-port br0 p1], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("[SYSTEM_DPDK_ALLOWED_LOGS]") +OVS_DPDK_STOP_VSWITCHD AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -981,9 +956,8 @@ OVS_WAIT_UNTIL([test `ovs-vsctl get interface p1 statistics | grep -oP 'rx_packe dnl Clean up AT_CHECK([ovs-vsctl del-port br0 p1], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@upcall: datapath reached the dynamic limit of .* flows.@d -])") 
+OVS_DPDK_STOP_VSWITCHD(["dnl +/upcall: datapath reached the dynamic limit of .* flows./d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -1094,17 +1068,16 @@ ovs-appctl: ovs-vswitchd: server returned an error dnl Clean up AT_CHECK([ovs-vsctl del-port br0 p1], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@Error: unknown argument 1.@d -\@Error: invalid study_pkt_cnt value: xyz.@d -\@Error: unknown argument abcd.@d -\@Error: -pmd option requires a thread id argument.@d -\@Error: invalid study_pkt_cnt value: abcd.@d -\@Error: miniflow extract parser not changed, PMD thread passed is not valid: 'zero'. Pass a valid pmd thread ID.@d -\@Error: no miniflow extract name provided. Output of miniflow-parser-get shows implementation list.@d -\@Error: unknown miniflow extract implementation superstudy.@d -\@Error: invalid study_pkt_cnt value: -pmd.@d -])") +OVS_DPDK_STOP_VSWITCHD(["dnl +/Error: unknown argument 1./d +/Error: invalid study_pkt_cnt value: xyz./d +/Error: unknown argument abcd./d +/Error: -pmd option requires a thread id argument./d +/Error: invalid study_pkt_cnt value: abcd./d +/Error: miniflow extract parser not changed, PMD thread passed is not valid: 'zero'. Pass a valid pmd thread ID./d +/Error: no miniflow extract name provided. 
Output of miniflow-parser-get shows implementation list./d +/Error: unknown miniflow extract implementation superstudy./d +/Error: invalid study_pkt_cnt value: -pmd./d"]) AT_CLEANUP dnl dnl -------------------------------------------------------------------------- @@ -1155,7 +1128,6 @@ OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "Port p1: Requesting a mem dnl Clean up AT_CHECK([ovs-vsctl del-port br10 p1], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -])") +OVS_DPDK_STOP_VSWITCHD AT_CLEANUP dnl -------------------------------------------------------------------------- From b561bbdc27a5467a114f37408453bd63d36d8f8c Mon Sep 17 00:00:00 2001 From: David Marchand Date: Mon, 20 Nov 2023 16:56:44 +0100 Subject: [PATCH 442/833] netdev-afxdp: Postpone libbpf logging helper registration. When using the net/af_xdp DPDK driver alongside OVS native AF_XDP support, confusing logs are reported, like: netdev_dpdk|INFO|Device 'net_af_xdpp0,iface=ovs-p0' attached to DPDK dpif_netdev|INFO|PMD thread on numa_id: 0, core id: 11 created. dpif_netdev|INFO|There are 1 pmd threads on numa node 0 dpdk|INFO|Device with port_id=0 already stopped dpdk(pmd-c11/id:22)|INFO|PMD thread uses DPDK lcore 1. netdev_dpdk|WARN|Rx checksum offload is not supported on port 0 netdev_afxdp|INFO|libbpf: elf: skipping unrecognized data section(6) .xdp_run_config netdev_afxdp|INFO|libbpf: elf: skipping unrecognized data section(7) xdp_metadata netdev_afxdp|INFO|libbpf: elf: skipping unrecognized data section(7) xdp_metadata netdev_afxdp|INFO|libbpf: elf: skipping unrecognized data section(7) xdp_metadata This comes from the fact that netdev-afxdp unconditionally registers a helper for logging libbpf messages. Making both net/af_xdp and netdev-afxdp work at the same time seems difficult, so at least, ensure that netdev-afxdp won't register this helper unless a netdev is actually allocated.
Signed-off-by: David Marchand Acked-by: Eelco Chaudron Signed-off-by: Simon Horman --- lib/netdev-afxdp.c | 12 ++++++------ lib/netdev-afxdp.h | 1 - lib/netdev-linux.c | 1 - 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c index b680a147985..54029722e0b 100644 --- a/lib/netdev-afxdp.c +++ b/lib/netdev-afxdp.c @@ -1193,18 +1193,18 @@ libbpf_print(enum libbpf_print_level level, return 0; } -int netdev_afxdp_init(void) -{ - libbpf_set_print(libbpf_print); - return 0; -} - int netdev_afxdp_construct(struct netdev *netdev) { + static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; struct netdev_linux *dev = netdev_linux_cast(netdev); int ret; + if (ovsthread_once_start(&once)) { + libbpf_set_print(libbpf_print); + ovsthread_once_done(&once); + } + /* Configure common netdev-linux first. */ ret = netdev_linux_construct(netdev); if (ret) { diff --git a/lib/netdev-afxdp.h b/lib/netdev-afxdp.h index bd3b9dfbead..236a37cc844 100644 --- a/lib/netdev-afxdp.h +++ b/lib/netdev-afxdp.h @@ -47,7 +47,6 @@ struct xsk_socket_info; int netdev_afxdp_rxq_construct(struct netdev_rxq *rxq_); void netdev_afxdp_rxq_destruct(struct netdev_rxq *rxq_); -int netdev_afxdp_init(void); int netdev_afxdp_construct(struct netdev *netdev_); void netdev_afxdp_destruct(struct netdev *netdev_); int netdev_afxdp_verify_mtu_size(const struct netdev *netdev, int mtu); diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 70521e3c7f7..18b62d90c28 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -3754,7 +3754,6 @@ const struct netdev_class netdev_internal_class = { #ifdef HAVE_AF_XDP #define NETDEV_AFXDP_CLASS_COMMON \ - .init = netdev_afxdp_init, \ .construct = netdev_afxdp_construct, \ .destruct = netdev_afxdp_destruct, \ .get_stats = netdev_afxdp_get_stats, \ From 64c1d16c6aeef910c65b830252166b85a274553e Mon Sep 17 00:00:00 2001 From: David Marchand Date: Mon, 20 Nov 2023 16:56:45 +0100 Subject: [PATCH 443/833] system-dpdk: 
Remove tap interfaces from vport MTU tests. The unit tests for changing MTU with vhost-user ports are not using those tap interfaces. Signed-off-by: David Marchand Signed-off-by: Simon Horman --- tests/system-dpdk.at | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/tests/system-dpdk.at b/tests/system-dpdk.at index f635d7f9251..c125d8051f4 100644 --- a/tests/system-dpdk.at +++ b/tests/system-dpdk.at @@ -640,9 +640,7 @@ AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." ovs-vswitchd.log], [], [stdout]) -OVS_DPDK_START_TESTPMD([--vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1" \ - --vdev="net_tap0,iface=tap0"]) - +OVS_DPDK_START_TESTPMD([--vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1"]) OVS_WAIT_UNTIL([grep "virtio is now ready for processing" ovs-vswitchd.log]) OVS_WAIT_UNTIL([ovs-vsctl get Interface dpdkvhostuserclient0 link_state | grep -w up]) @@ -690,9 +688,7 @@ AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." 
ovs-vswitchd.log], [], [stdout]) -OVS_DPDK_START_TESTPMD([--vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1" \ - --vdev="net_tap0,iface=tap0"]) - +OVS_DPDK_START_TESTPMD([--vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1"]) OVS_WAIT_UNTIL([grep "virtio is now ready for processing" ovs-vswitchd.log]) OVS_WAIT_UNTIL([ovs-vsctl get Interface dpdkvhostuserclient0 link_state | grep -w up]) @@ -817,8 +813,7 @@ AT_CHECK([ovs-vsctl set Interface dpdkvhostuserclient0 mtu_request=9702]) AT_CHECK([ovs-vsctl show], [], [stdout]) sleep 2 -OVS_DPDK_START_TESTPMD([--vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1" \ - --vdev="net_tap0,iface=tap0"]) +OVS_DPDK_START_TESTPMD([--vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1"]) OVS_WAIT_UNTIL([grep "virtio is now ready for processing" ovs-vswitchd.log]) OVS_WAIT_UNTIL([ovs-vsctl get Interface dpdkvhostuserclient0 link_state | grep -w up]) @@ -866,8 +861,7 @@ AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." ovs-vswitchd.log], [], [stdout]) -OVS_DPDK_START_TESTPMD([--vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1" \ - --vdev="net_tap0,iface=tap0"]) +OVS_DPDK_START_TESTPMD([--vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1"]) OVS_WAIT_UNTIL([grep "virtio is now ready for processing" ovs-vswitchd.log]) OVS_WAIT_UNTIL([ovs-vsctl get Interface dpdkvhostuserclient0 link_state | grep -w up]) From d0a6cf57ddee5b1c966273199583b8d13232199d Mon Sep 17 00:00:00 2001 From: David Marchand Date: Mon, 20 Nov 2023 16:56:46 +0100 Subject: [PATCH 444/833] system-dpdk: Fix race in some vhost-user client MTU test. 
Adding those grep checks gives OVS enough time to be ready to connect with the testpmd virtio-user port instantiated afterward. Fixes: bf47829116a8 ("tests: Add OVS-DPDK MTU unit tests.") Signed-off-by: David Marchand Signed-off-by: Simon Horman --- tests/system-dpdk.at | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/system-dpdk.at b/tests/system-dpdk.at index c125d8051f4..b7ae4af406e 100644 --- a/tests/system-dpdk.at +++ b/tests/system-dpdk.at @@ -813,6 +813,10 @@ AT_CHECK([ovs-vsctl set Interface dpdkvhostuserclient0 mtu_request=9702]) AT_CHECK([ovs-vsctl show], [], [stdout]) sleep 2 +AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) +AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) +AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." ovs-vswitchd.log], [], [stdout]) + OVS_DPDK_START_TESTPMD([--vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1"]) OVS_WAIT_UNTIL([grep "virtio is now ready for processing" ovs-vswitchd.log]) From 1c37d869c2a7367cb2598c24a1f7f7db067575bf Mon Sep 17 00:00:00 2001 From: David Marchand Date: Mon, 20 Nov 2023 16:56:47 +0100 Subject: [PATCH 445/833] system-dpdk: Refactor tests using vhost-user ports. Introduce macros responsible for adding a vhost-user / vhost-user client port to a userspace datapath bridge and checking the associated logs.
Signed-off-by: David Marchand Signed-off-by: Simon Horman --- tests/system-dpdk.at | 123 +++++++++++++------------------------------ 1 file changed, 36 insertions(+), 87 deletions(-) diff --git a/tests/system-dpdk.at b/tests/system-dpdk.at index b7ae4af406e..4da2afd683c 100644 --- a/tests/system-dpdk.at +++ b/tests/system-dpdk.at @@ -18,6 +18,29 @@ m4_define([CHECK_MEMPOOL_PARAM], [ | grep "User configured shared mempool set for: MTU $1, NUMA $2."]) ]) +dnl ADD_VHOST_USER_CLIENT_PORT([bridge], [port], [socket]) +dnl Add a dpdk vhost-user client port to a bridge and check this port is ready +dnl to be used by looking at the logs. +m4_define([ADD_VHOST_USER_CLIENT_PORT], [ + AT_CHECK([ovs-vsctl add-port $1 $2 -- \ + set Interface $2 type=dpdkvhostuserclient options:vhost-server-path=$3], + [], [stdout], [stderr]) + OVS_WAIT_UNTIL([grep "VHOST_CONFIG: ($3) vhost-user client: socket created" ovs-vswitchd.log]) + OVS_WAIT_UNTIL([grep "vHost User device '$2' created in 'client' mode, using client socket" ovs-vswitchd.log]) + OVS_WAIT_UNTIL([grep "VHOST_CONFIG: ($3) reconnecting..." ovs-vswitchd.log]) +]) + +dnl ADD_VHOST_USER_PORT([bridge], [port], [socket]) +dnl Add a dpdk vhost-user port to a bridge and check this port is ready +dnl to be used by looking at the logs. 
+m4_define([ADD_VHOST_USER_PORT], [ + AT_CHECK([ovs-vsctl add-port $1 $2 -- set Interface $2 type=dpdkvhostuser], [], + [stdout], [stderr]) + OVS_WAIT_UNTIL([grep "VHOST_CONFIG: ($3) vhost-user server: socket created" ovs-vswitchd.log]) + OVS_WAIT_UNTIL([grep "Socket $3 created for vhost-user port $2" ovs-vswitchd.log]) + OVS_WAIT_UNTIL([grep "VHOST_CONFIG: ($3) binding succeeded" ovs-vswitchd.log]) +]) + dnl -------------------------------------------------------------------------- dnl Check if EAL init is successful AT_SETUP([OVS-DPDK - EAL init]) @@ -64,15 +87,10 @@ OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface dpdkvhostuserclient0 type=dpdkvhostuserclient options:vhost-server-path=$OVS_RUNDIR/dpdkvhostclient0], [], [stdout], [stderr]) +ADD_VHOST_USER_CLIENT_PORT([br10], [dpdkvhostuserclient0], [$OVS_RUNDIR/dpdkvhostclient0]) AT_CHECK([ovs-vsctl show], [], [stdout]) sleep 2 -dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." 
ovs-vswitchd.log], [], [stdout]) - dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) OVS_DPDK_STOP_VSWITCHD(["dnl @@ -92,19 +110,9 @@ OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuser0 -- set Interface dpdkvhostuser0 \ - type=dpdkvhostuser], [], - [stdout], [stderr]) +ADD_VHOST_USER_PORT([br10], [dpdkvhostuser0], [$OVS_RUNDIR/dpdkvhostuser0]) AT_CHECK([ovs-vsctl show], [], [stdout]) -dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostuser0) vhost-user server: socket created" \ - ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "Socket $OVS_RUNDIR/dpdkvhostuser0 created for vhost-user port dpdkvhostuser0" \ - ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostuser0) binding succeeded" ovs-vswitchd.log], [], - [stdout]) - dnl Set up namespaces ADD_NAMESPACES(ns1, ns2) @@ -158,18 +166,9 @@ OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface \ - dpdkvhostuserclient0 \ - type=dpdkvhostuserclient \ - options:vhost-server-path=$OVS_RUNDIR/dpdkvhostclient0], [], - [stdout], [stderr]) +ADD_VHOST_USER_CLIENT_PORT([br10], [dpdkvhostuserclient0], [$OVS_RUNDIR/dpdkvhostclient0]) AT_CHECK([ovs-vsctl show], [], [stdout]) -dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." 
ovs-vswitchd.log], [], [stdout]) - dnl Set up namespaces ADD_NAMESPACES(ns1, ns2) @@ -281,7 +280,7 @@ OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS and add ingress policer AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface dpdkvhostuserclient0 type=dpdkvhostuserclient options:vhost-server-path=$OVS_RUNDIR/dpdkvhostclient0], [], [stdout], [stderr]) +ADD_VHOST_USER_CLIENT_PORT([br10], [dpdkvhostuserclient0], [$OVS_RUNDIR/dpdkvhostclient0]) AT_CHECK([ovs-vsctl set interface dpdkvhostuserclient0 ingress_policing_rate=10000 ingress_policing_burst=1000]) AT_CHECK([ovs-vsctl show], [], [stdout]) sleep 2 @@ -299,11 +298,6 @@ AT_CHECK([grep -E 'ingress_policing_burst: 0' stdout], [], [stdout]) AT_CHECK([ovs-vsctl list interface dpdkvhostuserclient0], [], [stdout]) AT_CHECK([grep -E 'ingress_policing_rate: 0' stdout], [], [stdout]) -dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." 
ovs-vswitchd.log], [], [stdout]) - dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) OVS_DPDK_STOP_VSWITCHD(["dnl @@ -323,7 +317,7 @@ OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS and add ingress policer AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface dpdkvhostuserclient0 type=dpdkvhostuserclient options:vhost-server-path=$OVS_RUNDIR/dpdkvhostclient0], [], [stdout], [stderr]) +ADD_VHOST_USER_CLIENT_PORT([br10], [dpdkvhostuserclient0], [$OVS_RUNDIR/dpdkvhostclient0]) AT_CHECK([ovs-vsctl set interface dpdkvhostuserclient0 ingress_policing_burst=1000]) AT_CHECK([ovs-vsctl show], [], [stdout]) sleep 2 @@ -338,12 +332,6 @@ AT_CHECK([grep -E 'ingress_policing_burst: 1000' stdout], [], [stdout]) AT_CHECK([ovs-vsctl list interface dpdkvhostuserclient0], [], [stdout]) AT_CHECK([grep -E 'ingress_policing_rate: 0' stdout], [], [stdout]) - -dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." 
ovs-vswitchd.log], [], [stdout]) - dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) OVS_DPDK_STOP_VSWITCHD(["dnl @@ -363,7 +351,7 @@ OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS and add ingress policer AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface dpdkvhostuserclient0 type=dpdkvhostuserclient options:vhost-server-path=$OVS_RUNDIR/dpdkvhostclient0], [], [stdout], [stderr]) +ADD_VHOST_USER_CLIENT_PORT([br10], [dpdkvhostuserclient0], [$OVS_RUNDIR/dpdkvhostclient0]) AT_CHECK([ovs-vsctl set interface dpdkvhostuserclient0 ingress_policing_rate=10000]) AT_CHECK([ovs-vsctl show], [], [stdout]) sleep 2 @@ -378,11 +366,6 @@ AT_CHECK([grep -E 'ingress_policing_burst: 0' stdout], [], [stdout]) AT_CHECK([ovs-vsctl list interface dpdkvhostuserclient0], [], [stdout]) AT_CHECK([grep -E 'ingress_policing_rate: 10000' stdout], [], [stdout]) -dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." 
ovs-vswitchd.log], [], [stdout]) - dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) OVS_DPDK_STOP_VSWITCHD(["dnl @@ -434,17 +417,12 @@ OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS and add egress policer AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface dpdkvhostuserclient0 type=dpdkvhostuserclient options:vhost-server-path=$OVS_RUNDIR/dpdkvhostclient0], [], [stdout], [stderr]) +ADD_VHOST_USER_CLIENT_PORT([br10], [dpdkvhostuserclient0], [$OVS_RUNDIR/dpdkvhostclient0]) OVS_WAIT_UNTIL([ovs-vsctl set port dpdkvhostuserclient0 qos=@newqos -- --id=@newqos create qos type=egress-policer other-config:cir=1250000 \ other-config:cbs=2048]) AT_CHECK([ovs-appctl -t ovs-vswitchd qos/show dpdkvhostuserclient0], [], [stdout]) sleep 2 -dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." 
ovs-vswitchd.log], [], [stdout]) - dnl Fail if egress policer could not be created AT_FAIL_IF([grep "Could not create rte meter for egress policer" ovs-vswitchd.log], [], [stdout]) @@ -474,15 +452,10 @@ OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS and add egress policer AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface dpdkvhostuserclient0 type=dpdkvhostuserclient options:vhost-server-path=$OVS_RUNDIR/dpdkvhostclient0], [], [stdout], [stderr]) +ADD_VHOST_USER_CLIENT_PORT([br10], [dpdkvhostuserclient0], [$OVS_RUNDIR/dpdkvhostclient0]) OVS_WAIT_UNTIL([ovs-vsctl set port dpdkvhostuserclient0 qos=@newqos -- --id=@newqos create qos type=egress-policer other-config:cbs=2048]) sleep 2 -dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." 
ovs-vswitchd.log], [], [stdout]) - dnl Check egress policer was not created AT_CHECK([ovs-appctl -t ovs-vswitchd qos/show dpdkvhostuserclient0], [], [stdout]) AT_CHECK([grep -E 'QoS not configured on dpdkvhostuserclient0' stdout], [], [stdout]) @@ -508,15 +481,10 @@ OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS and add egress policer AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface dpdkvhostuserclient0 type=dpdkvhostuserclient options:vhost-server-path=$OVS_RUNDIR/dpdkvhostclient0], [], [stdout], [stderr]) +ADD_VHOST_USER_CLIENT_PORT([br10], [dpdkvhostuserclient0], [$OVS_RUNDIR/dpdkvhostclient0]) OVS_WAIT_UNTIL([ovs-vsctl set port dpdkvhostuserclient0 qos=@newqos -- --id=@newqos create qos type=egress-policer other-config:cir=1250000]) sleep 2 -dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." 
ovs-vswitchd.log], [], [stdout]) - dnl Check egress policer was not created AT_CHECK([ovs-appctl -t ovs-vswitchd qos/show dpdkvhostuserclient0], [], [stdout]) AT_CHECK([grep -E 'QoS not configured on dpdkvhostuserclient0' stdout], [], [stdout]) @@ -631,15 +599,10 @@ OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS with default MTU value AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface dpdkvhostuserclient0 type=dpdkvhostuserclient options:vhost-server-path=$OVS_RUNDIR/dpdkvhostclient0], [], [stdout], [stderr]) +ADD_VHOST_USER_CLIENT_PORT([br10], [dpdkvhostuserclient0], [$OVS_RUNDIR/dpdkvhostclient0]) AT_CHECK([ovs-vsctl show], [], [stdout]) sleep 2 -dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." 
ovs-vswitchd.log], [], [stdout]) - OVS_DPDK_START_TESTPMD([--vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1"]) OVS_WAIT_UNTIL([grep "virtio is now ready for processing" ovs-vswitchd.log]) OVS_WAIT_UNTIL([ovs-vsctl get Interface dpdkvhostuserclient0 link_state | grep -w up]) @@ -678,16 +641,11 @@ OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS and modify MTU value AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface dpdkvhostuserclient0 type=dpdkvhostuserclient options:vhost-server-path=$OVS_RUNDIR/dpdkvhostclient0], [], [stdout], [stderr]) +ADD_VHOST_USER_CLIENT_PORT([br10], [dpdkvhostuserclient0], [$OVS_RUNDIR/dpdkvhostclient0]) AT_CHECK([ovs-vsctl set Interface dpdkvhostuserclient0 mtu_request=9000]) AT_CHECK([ovs-vsctl show], [], [stdout]) sleep 2 -dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." 
ovs-vswitchd.log], [], [stdout]) - OVS_DPDK_START_TESTPMD([--vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1"]) OVS_WAIT_UNTIL([grep "virtio is now ready for processing" ovs-vswitchd.log]) OVS_WAIT_UNTIL([ovs-vsctl get Interface dpdkvhostuserclient0 link_state | grep -w up]) @@ -808,15 +766,11 @@ OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS and set MTU value to max upper bound AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface dpdkvhostuserclient0 type=dpdkvhostuserclient options:vhost-server-path=$OVS_RUNDIR/dpdkvhostclient0], [], [stdout], [stderr]) +ADD_VHOST_USER_CLIENT_PORT([br10], [dpdkvhostuserclient0], [$OVS_RUNDIR/dpdkvhostclient0]) AT_CHECK([ovs-vsctl set Interface dpdkvhostuserclient0 mtu_request=9702]) AT_CHECK([ovs-vsctl show], [], [stdout]) sleep 2 -AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." 
ovs-vswitchd.log], [], [stdout]) - OVS_DPDK_START_TESTPMD([--vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1"]) OVS_WAIT_UNTIL([grep "virtio is now ready for processing" ovs-vswitchd.log]) @@ -855,16 +809,11 @@ OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS and set MTU value to min lower bound AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface dpdkvhostuserclient0 type=dpdkvhostuserclient options:vhost-server-path=$OVS_RUNDIR/dpdkvhostclient0], [], [stdout], [stderr]) +ADD_VHOST_USER_CLIENT_PORT([br10], [dpdkvhostuserclient0], [$OVS_RUNDIR/dpdkvhostclient0]) AT_CHECK([ovs-vsctl set Interface dpdkvhostuserclient0 mtu_request=68]) AT_CHECK([ovs-vsctl show], [], [stdout]) sleep 2 -dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: ($OVS_RUNDIR/dpdkvhostclient0) reconnecting..." ovs-vswitchd.log], [], [stdout]) - OVS_DPDK_START_TESTPMD([--vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1"]) OVS_WAIT_UNTIL([grep "virtio is now ready for processing" ovs-vswitchd.log]) From 1d0ff364b069c699459b40c5b8ff108504fe8154 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Mon, 20 Nov 2023 16:56:48 +0100 Subject: [PATCH 446/833] system-dpdk: Rework cleanup for vhost-user client tests. Those tests are subject to a race when a testpmd hosting the vhost-user server is stopped and OVS has enough time to detect the vhost-user socket drop and tries to reconnect to this socket. 
In such a situation, the tests can fail as the OVS process with the vhost-user client port complains with a warning log: 2023-09-08T13:15:18.160Z|00163|dpdk|INFO|VHOST_CONFIG: (.../005/dpdkvhostclient0) vhost peer closed 2023-09-08T13:15:18.160Z|00164|netdev_dpdk|INFO|vHost Device '.../005/dpdkvhostclient0' connection has been destroyed 2023-09-08T13:15:18.160Z|00165|dpdk|INFO|VHOST_CONFIG: (.../005/dpdkvhostclient0) vhost-user client: socket created, fd: 24 2023-09-08T13:15:18.160Z|00166|dpdk|WARN|VHOST_CONFIG: (.../005/dpdkvhostclient0) failed to connect: Connection refused 2023-09-08T13:15:18.160Z|00167|dpdk|INFO|VHOST_CONFIG: (.../005/dpdkvhostclient0) reconnecting... Invert the order of the cleanup steps. Signed-off-by: David Marchand Acked-by: Eelco Chaudron Signed-off-by: Simon Horman --- tests/system-dpdk.at | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/tests/system-dpdk.at b/tests/system-dpdk.at index 4da2afd683c..5e486d1f47a 100644 --- a/tests/system-dpdk.at +++ b/tests/system-dpdk.at @@ -221,10 +221,9 @@ AT_CHECK([test `ovs-vsctl get interface dpdkvhostuserclient0 statistics:tx_bytes $((`ovs-vsctl get interface dpdkvhostuserclient0 statistics:tx_q0_good_bytes` + dnl `ovs-vsctl get interface dpdkvhostuserclient0 statistics:tx_q1_good_bytes`))]) -OVS_DPDK_STOP_TESTPMD() - dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) +OVS_DPDK_STOP_TESTPMD() OVS_DPDK_STOP_VSWITCHD(["dnl /VHOST_CONFIG: (.*dpdkvhostclient0) recvmsg failed/d /VHOST_CONFIG: (.*dpdkvhostclient0) failed to connect: No such file or directory/d @@ -619,10 +618,9 @@ AT_CHECK([ovs-vsctl get Interface dpdkvhostuserclient0 mtu], [0], [dnl 9000 ]) -OVS_DPDK_STOP_TESTPMD() - dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) +OVS_DPDK_STOP_TESTPMD() OVS_DPDK_STOP_VSWITCHD(["dnl /VHOST_CONFIG: (.*dpdkvhostclient0) failed to connect: No such file or directory/d"]) AT_CLEANUP @@ 
-662,10 +660,9 @@ AT_CHECK([ovs-vsctl get Interface dpdkvhostuserclient0 mtu], [0], [dnl 2000 ]) -OVS_DPDK_STOP_TESTPMD() - dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) +OVS_DPDK_STOP_TESTPMD() OVS_DPDK_STOP_VSWITCHD(["dnl /VHOST_CONFIG: (.*dpdkvhostclient0) failed to connect: No such file or directory/d"]) AT_CLEANUP @@ -785,10 +782,9 @@ dnl Set MTU value above upper bound and check for error AT_CHECK([ovs-vsctl set Interface dpdkvhostuserclient0 mtu_request=9711]) AT_CHECK([grep "dpdkvhostuserclient0: unsupported MTU 9711" ovs-vswitchd.log], [], [stdout]) -OVS_DPDK_STOP_TESTPMD() - dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) +OVS_DPDK_STOP_TESTPMD() OVS_DPDK_STOP_VSWITCHD(["dnl /VHOST_CONFIG: (.*dpdkvhostclient0) failed to connect: No such file or directory/d /dpdkvhostuserclient0: unsupported MTU 9711/d @@ -828,10 +824,9 @@ dnl Set MTU value below lower bound and check for error AT_CHECK([ovs-vsctl set Interface dpdkvhostuserclient0 mtu_request=67]) AT_CHECK([grep "dpdkvhostuserclient0: unsupported MTU 67" ovs-vswitchd.log], [], [stdout]) -OVS_DPDK_STOP_TESTPMD() - dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) +OVS_DPDK_STOP_TESTPMD() OVS_DPDK_STOP_VSWITCHD(["dnl /VHOST_CONFIG: (.*dpdkvhostclient0) failed to connect: No such file or directory/d /dpdkvhostuserclient0: unsupported MTU 67/d From 4e90baca89f04f55c77e3508f8e63d2ab9426caa Mon Sep 17 00:00:00 2001 From: David Marchand Date: Mon, 20 Nov 2023 16:56:49 +0100 Subject: [PATCH 447/833] system-dpdk: Run traffic tests. Integrate system-traffic.at tests as part of check-dpdk. Some tests that can't work with the userspace datapath are skipped by overriding some OVS_CHECK_* macros. ADD_VETH is implemented using the net/af_xdp DPDK driver. 
Signed-off-by: David Marchand Acked-by: Eelco Chaudron Signed-off-by: Simon Horman --- .ci/dpdk-build.sh | 3 +- .github/workflows/build-and-test.yml | 2 +- tests/system-dpdk-macros.at | 77 ++++++++++++++++++++++++++++ tests/system-dpdk-testsuite.at | 2 + tests/system-dpdk.at | 3 -- 5 files changed, 82 insertions(+), 5 deletions(-) diff --git a/.ci/dpdk-build.sh b/.ci/dpdk-build.sh index 35540f0694b..aa83e446436 100755 --- a/.ci/dpdk-build.sh +++ b/.ci/dpdk-build.sh @@ -38,7 +38,8 @@ function build_dpdk() # any DPDK driver. # check-dpdk unit tests requires testpmd and some net/ driver. DPDK_OPTS="$DPDK_OPTS -Denable_apps=test-pmd" - DPDK_OPTS="$DPDK_OPTS -Denable_drivers=net/null,net/tap,net/virtio" + enable_drivers="net/null,net/af_xdp,net/tap,net/virtio" + DPDK_OPTS="$DPDK_OPTS -Denable_drivers=$enable_drivers" # Install DPDK using prefix. DPDK_OPTS="$DPDK_OPTS --prefix=$(pwd)/build" diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 4f62efb7c33..09654205e74 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -5,7 +5,7 @@ on: [push, pull_request] jobs: build-dpdk: env: - dependencies: gcc libnuma-dev ninja-build + dependencies: gcc libbpf-dev libnuma-dev ninja-build pkgconf CC: gcc DPDK_GIT: https://dpdk.org/git/dpdk-stable DPDK_VER: 22.11.1 diff --git a/tests/system-dpdk-macros.at b/tests/system-dpdk-macros.at index 7fedfd65155..dcdfa55741c 100644 --- a/tests/system-dpdk-macros.at +++ b/tests/system-dpdk-macros.at @@ -127,3 +127,80 @@ m4_define([OVS_DPDK_STOP_TESTPMD], [AT_CHECK([kill `cat testpmd.pid`]) OVS_WAIT([kill -0 `cat testpmd.pid`], [kill -9 `cat testpmd.pid`]) ]) + + +# OVS_TRAFFIC_VSWITCHD_START([vsctl-args], [vsctl-output], [dbinit-aux-args]) +# +# Creates a database and starts ovsdb-server, starts ovs-vswitchd +# connected to that database, calls ovs-vsctl to create a bridge named +# br0 with predictable settings, passing 'vsctl-args' as additional +# commands to 
ovs-vsctl. If 'vsctl-args' causes ovs-vsctl to provide +# output (e.g. because it includes "create" commands) then 'vsctl-output' +# specifies the expected output after filtering through uuidfilt. +# 'dbinit-aux-args' are passed as additional commands to 'ovs-vsctl init' +# before starting ovs-vswitchd. +m4_define([OVS_TRAFFIC_VSWITCHD_START], + [ + OVS_DPDK_PRE_CHECK() + OVS_WAIT_WHILE([ip link show ovs-netdev]) + dnl For functional tests, no need for DPDK PCI probing. + OVS_DPDK_START([--no-pci], [--disable-system], [$3]) + dnl Add bridges, ports, etc. + OVS_WAIT_WHILE([ip link show br0]) + AT_CHECK([ovs-vsctl -- _ADD_BR([br0]) -- $1 m4_if([$2], [], [], [| uuidfilt])], [0], [$2]) +]) + + +# OVS_TRAFFIC_VSWITCHD_STOP([ALLOWLIST], [extra_cmds]) +# +# Gracefully stops ovs-vswitchd and ovsdb-server, checking their log files +# for messages with severity WARN or higher and signaling an error if any +# is present. The optional ALLOWLIST may contain shell-quoted "sed" +# commands to delete any warnings that are actually expected, e.g.: +# +# OVS_TRAFFIC_VSWITCHD_STOP(["/expected error/d"]) +# +# 'extra_cmds' are shell commands to be executed after OVS_VSWITCHD_STOP() is +# invoked. They can be used to perform additional cleanups such as name space +# removal. +m4_define([OVS_TRAFFIC_VSWITCHD_STOP], + [OVS_DPDK_STOP_VSWITCHD([$1]) + AT_CHECK([:; $2]) +]) + + +# Plug a veth into OVS via DPDK net/af_xdp. 
+m4_define([ADD_VETH], + [ AT_CHECK([ip link add $1 type veth peer name ovs-$1 || return 77]) + CONFIGURE_VETH_OFFLOADS([$1]) + AT_CHECK([ip link set $1 netns $2]) + AT_CHECK([ip link set dev ovs-$1 up]) + AT_CHECK([ovs-vsctl add-port $3 ovs-$1 -- \ + set interface ovs-$1 external-ids:iface-id="$1" -- \ + set interface ovs-$1 type=dpdk -- \ + set interface ovs-$1 options:dpdk-devargs=net_af_xdp$1,iface=ovs-$1]) + NS_CHECK_EXEC([$2], [ip addr add $4 dev $1 $7]) + NS_CHECK_EXEC([$2], [ip link set dev $1 up]) + if test -n "$5"; then + NS_CHECK_EXEC([$2], [ip link set dev $1 address $5]) + fi + if test -n "$6"; then + NS_CHECK_EXEC([$2], [ip route add default via $6]) + fi + on_exit 'ip link del ovs-$1' + ] +) + + +m4_define([OVS_CHECK_8021AD], + [AT_SKIP_IF([:])]) + + +m4_define([OVS_CHECK_TC_QDISC], + [AT_SKIP_IF([:])]) + + +m4_define([CONFIGURE_VETH_OFFLOADS], + [AT_CHECK([ethtool -K $1 tx off], [0], [ignore], [ignore]) + AT_CHECK([ethtool -K $1 txvlan off], [0], [ignore], [ignore])] +) diff --git a/tests/system-dpdk-testsuite.at b/tests/system-dpdk-testsuite.at index 382f09e9ff2..f61fbf9212a 100644 --- a/tests/system-dpdk-testsuite.at +++ b/tests/system-dpdk-testsuite.at @@ -20,6 +20,8 @@ m4_include([tests/ovs-macros.at]) m4_include([tests/ovsdb-macros.at]) m4_include([tests/ofproto-macros.at]) m4_include([tests/system-common-macros.at]) +m4_include([tests/system-userspace-macros.at]) m4_include([tests/system-dpdk-macros.at]) m4_include([tests/system-dpdk.at]) +m4_include([tests/system-traffic.at]) diff --git a/tests/system-dpdk.at b/tests/system-dpdk.at index 5e486d1f47a..17742d20a0a 100644 --- a/tests/system-dpdk.at +++ b/tests/system-dpdk.at @@ -1,6 +1,3 @@ -m4_define([CONFIGURE_VETH_OFFLOADS], - [AT_CHECK([ethtool -K $1 tx off], [0], [ignore], [ignore])]) - AT_BANNER([OVS-DPDK unit tests]) dnl CHECK_MEMPOOL_PARAM([mtu], [numa], [+line]) From 4990a9a27177f1fdcb62de1be4969e77e3aea937 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Thu, 16 Nov 2023 12:42:44 
+0100 Subject: [PATCH 448/833] mcast-snooping: Test per port explicit flooding. Various options affect how the mcast snooping module works. When multicast snooping is enabled and a reporter is known, it is still possible to flood associated packets to some other port via the mcast-snooping-flood option. If flooding unregistered traffic is disabled, it is also still possible to flood multicast traffic with the mcast-snooping-flood option. IGMP reports may have to be flooded to some ports explicitly with the mcast-snooping-flood-reports option. Test those parameters. Acked-by: Paolo Valerio Signed-off-by: David Marchand Acked-by: Eelco Chaudron Signed-off-by: Simon Horman --- tests/mcast-snooping.at | 280 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 280 insertions(+) diff --git a/tests/mcast-snooping.at b/tests/mcast-snooping.at index d5b7c4774c7..9797bca531c 100644 --- a/tests/mcast-snooping.at +++ b/tests/mcast-snooping.at @@ -105,6 +105,286 @@ AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl OVS_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([mcast - check multicast per port flooding]) +OVS_VSWITCHD_START([]) + +AT_CHECK([ + ovs-vsctl set bridge br0 \ + datapath_type=dummy \ + mcast_snooping_enable=true \ + other-config:mcast-snooping-disable-flood-unregistered=false +], [0]) + +AT_CHECK([ovs-ofctl add-flow br0 action=normal]) + +AT_CHECK([ + ovs-vsctl add-port br0 p1 \ + -- set Interface p1 type=dummy other-config:hwaddr=aa:55:aa:55:00:01 ofport_request=1 \ + -- add-port br0 p2 \ + -- set Interface p2 type=dummy other-config:hwaddr=aa:55:aa:55:00:02 ofport_request=2 \ + -- add-port br0 p3 \ + -- set Interface p3 type=dummy other-config:hwaddr=aa:55:aa:55:00:03 ofport_request=3 \ +], [0]) + +ovs-appctl time/stop + +AT_CHECK([ovs-appctl ofproto/trace "in_port(3),eth(src=aa:55:aa:55:00:ff,dst=01:00:5e:01:01:01),eth_type(0x0800),ipv4(src=10.0.0.1,dst=224.1.1.1,proto=17,tos=0,ttl=64,frag=no),udp(src=0,dst=8000)"], [0], [stdout]) +AT_CHECK([grep -v 'Datapath actions:' 
stdout], [0], [dnl +Flow: udp,in_port=3,vlan_tci=0x0000,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_src=10.0.0.1,nw_dst=224.1.1.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=0,tp_dst=8000 + +bridge("br0") +------------- + 0. priority 32768 + NORMAL + -> unregistered multicast, flooding + +Final flow: unchanged +Megaflow: recirc_id=0,eth,udp,in_port=3,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_dst=224.1.1.1,nw_frag=no +]) +AT_CHECK([sed -ne 's/^Datapath actions: \(.*\)$/\1/p' stdout | tr "," "\n" | sort -n], [0], [dnl +1 +2 +100 +]) + +# Send report packets. +AT_CHECK([ + ovs-appctl netdev-dummy/receive p1 \ + '01005E010101000C29A027A108004500001C000100004002CBAEAC10221EE001010112140CE9E0010101' +], [0]) +AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl + port VLAN GROUP Age + 1 0 224.1.1.1 0 +]) + +AT_CHECK([ovs-appctl ofproto/trace "in_port(3),eth(src=aa:55:aa:55:00:ff,dst=01:00:5e:01:01:01),eth_type(0x0800),ipv4(src=10.0.0.1,dst=224.1.1.1,proto=17,tos=0,ttl=64,frag=no),udp(src=0,dst=8000)"], [0], [dnl +Flow: udp,in_port=3,vlan_tci=0x0000,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_src=10.0.0.1,nw_dst=224.1.1.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=0,tp_dst=8000 + +bridge("br0") +------------- + 0. priority 32768 + NORMAL + -> forwarding to mcast group port + +Final flow: unchanged +Megaflow: recirc_id=0,eth,udp,in_port=3,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_dst=224.1.1.1,nw_frag=no +Datapath actions: 1 +]) + +AT_CHECK([ovs-vsctl set port p2 other_config:mcast-snooping-flood=true]) + +AT_CHECK([ovs-appctl ofproto/trace "in_port(3),eth(src=aa:55:aa:55:00:ff,dst=01:00:5e:01:01:01),eth_type(0x0800),ipv4(src=10.0.0.1,dst=224.1.1.1,proto=17,tos=0,ttl=64,frag=no),udp(src=0,dst=8000)"], [0], [dnl +Flow: udp,in_port=3,vlan_tci=0x0000,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_src=10.0.0.1,nw_dst=224.1.1.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=0,tp_dst=8000 + +bridge("br0") +------------- + 0. 
priority 32768 + NORMAL + -> forwarding to mcast group port + -> forwarding to mcast flood port + +Final flow: unchanged +Megaflow: recirc_id=0,eth,udp,in_port=3,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_dst=224.1.1.1,nw_frag=no +Datapath actions: 1,2 +]) + +AT_CHECK([ovs-vsctl set port p3 other_config:mcast-snooping-flood=true]) + +AT_CHECK([ovs-appctl ofproto/trace "in_port(3),eth(src=aa:55:aa:55:00:ff,dst=01:00:5e:01:01:01),eth_type(0x0800),ipv4(src=10.0.0.1,dst=224.1.1.1,proto=17,tos=0,ttl=64,frag=no),udp(src=0,dst=8000)"], [0], [dnl +Flow: udp,in_port=3,vlan_tci=0x0000,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_src=10.0.0.1,nw_dst=224.1.1.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=0,tp_dst=8000 + +bridge("br0") +------------- + 0. priority 32768 + NORMAL + -> forwarding to mcast group port + -> forwarding to mcast flood port + -> mcast flood port is input port, dropping + +Final flow: unchanged +Megaflow: recirc_id=0,eth,udp,in_port=3,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_dst=224.1.1.1,nw_frag=no +Datapath actions: 1,2 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + + +AT_SETUP([mcast - check multicast per port flooding (unregistered flood disabled)]) +OVS_VSWITCHD_START([]) + +AT_CHECK([ + ovs-vsctl set bridge br0 \ + datapath_type=dummy \ + mcast_snooping_enable=true \ + other-config:mcast-snooping-disable-flood-unregistered=true +], [0]) + +AT_CHECK([ovs-ofctl add-flow br0 action=normal]) + +AT_CHECK([ + ovs-vsctl add-port br0 p1 \ + -- set Interface p1 type=dummy other-config:hwaddr=aa:55:aa:55:00:01 ofport_request=1 \ + -- add-port br0 p2 \ + -- set Interface p2 type=dummy other-config:hwaddr=aa:55:aa:55:00:02 ofport_request=2 \ + -- add-port br0 p3 \ + -- set Interface p3 type=dummy other-config:hwaddr=aa:55:aa:55:00:03 ofport_request=3 \ +], [0]) + +ovs-appctl time/stop + +AT_CHECK([ovs-appctl ofproto/trace 
"in_port(3),eth(src=aa:55:aa:55:00:ff,dst=01:00:5e:01:01:01),eth_type(0x0800),ipv4(src=10.0.0.1,dst=224.1.1.1,proto=17,tos=0,ttl=64,frag=no),udp(src=0,dst=8000)"], [0], [dnl +Flow: udp,in_port=3,vlan_tci=0x0000,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_src=10.0.0.1,nw_dst=224.1.1.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=0,tp_dst=8000 + +bridge("br0") +------------- + 0. priority 32768 + NORMAL + +Final flow: unchanged +Megaflow: recirc_id=0,eth,udp,in_port=3,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_dst=224.1.1.1,nw_frag=no +Datapath actions: drop +]) + +AT_CHECK([ovs-vsctl set port p2 other_config:mcast-snooping-flood=true]) + +AT_CHECK([ovs-appctl ofproto/trace "in_port(3),eth(src=aa:55:aa:55:00:ff,dst=01:00:5e:01:01:01),eth_type(0x0800),ipv4(src=10.0.0.1,dst=224.1.1.1,proto=17,tos=0,ttl=64,frag=no),udp(src=0,dst=8000)"], [0], [dnl +Flow: udp,in_port=3,vlan_tci=0x0000,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_src=10.0.0.1,nw_dst=224.1.1.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=0,tp_dst=8000 + +bridge("br0") +------------- + 0. priority 32768 + NORMAL + -> forwarding to mcast flood port + +Final flow: unchanged +Megaflow: recirc_id=0,eth,udp,in_port=3,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_dst=224.1.1.1,nw_frag=no +Datapath actions: 2 +]) + +AT_CHECK([ovs-vsctl set port p3 other_config:mcast-snooping-flood=true]) + +AT_CHECK([ovs-appctl ofproto/trace "in_port(3),eth(src=aa:55:aa:55:00:ff,dst=01:00:5e:01:01:01),eth_type(0x0800),ipv4(src=10.0.0.1,dst=224.1.1.1,proto=17,tos=0,ttl=64,frag=no),udp(src=0,dst=8000)"], [0], [dnl +Flow: udp,in_port=3,vlan_tci=0x0000,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_src=10.0.0.1,nw_dst=224.1.1.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=0,tp_dst=8000 + +bridge("br0") +------------- + 0. 
priority 32768 + NORMAL + -> forwarding to mcast flood port + -> mcast flood port is input port, dropping + +Final flow: unchanged +Megaflow: recirc_id=0,eth,udp,in_port=3,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_dst=224.1.1.1,nw_frag=no +Datapath actions: 2 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + + +AT_SETUP([mcast - check reports per port flooding]) +OVS_VSWITCHD_START([]) + +AT_CHECK([ + ovs-vsctl set bridge br0 \ + datapath_type=dummy \ + mcast_snooping_enable=true \ + other-config:mcast-snooping-disable-flood-unregistered=false +], [0]) + +AT_CHECK([ovs-ofctl add-flow br0 action=normal]) + +AT_CHECK([ + ovs-vsctl add-port br0 p1 \ + -- set Interface p1 type=dummy other-config:hwaddr=aa:55:aa:55:00:01 ofport_request=1 \ + -- add-port br0 p2 \ + -- set Interface p2 type=dummy other-config:hwaddr=aa:55:aa:55:00:02 ofport_request=2 \ + -- add-port br0 p3 \ + -- set Interface p3 type=dummy other-config:hwaddr=aa:55:aa:55:00:03 ofport_request=3 \ +], [0]) + +ovs-appctl time/stop + +AT_CHECK([ovs-appctl ofproto/trace "in_port(1)" '01005E010101000C29A027A108004500001C000100004002CBAEAC10221EE001010112140CE9E0010101'], [0], [dnl +Flow: ip,in_port=1,vlan_tci=0x0000,dl_src=00:0c:29:a0:27:a1,dl_dst=01:00:5e:01:01:01,nw_src=172.16.34.30,nw_dst=224.1.1.1,nw_proto=2,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=18,tp_dst=20 + +bridge("br0") +------------- + 0. priority 32768 + NORMAL + -> learned that 00:0c:29:a0:27:a1 is on port p1 in VLAN 0 + -> multicast snooping learned that 224.1.1.1 is on port p1 in VLAN 0 + +Final flow: unchanged +Megaflow: recirc_id=0,eth,ip,in_port=1,dl_src=00:0c:29:a0:27:a1,dl_dst=01:00:5e:01:01:01,nw_proto=2,nw_frag=no +Datapath actions: drop +This flow is handled by the userspace slow path because it: + - Uses action(s) not supported by datapath. 
+]) + +AT_CHECK([ovs-vsctl set port p3 other_config:mcast-snooping-flood-reports=true]) + +AT_CHECK([ovs-appctl ofproto/trace "in_port(1)" '01005E010101000C29A027A108004500001C000100004002CBAEAC10221EE001010112140CE9E0010101'], [0], [dnl +Flow: ip,in_port=1,vlan_tci=0x0000,dl_src=00:0c:29:a0:27:a1,dl_dst=01:00:5e:01:01:01,nw_src=172.16.34.30,nw_dst=224.1.1.1,nw_proto=2,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=18,tp_dst=20 + +bridge("br0") +------------- + 0. priority 32768 + NORMAL + -> forwarding report to mcast flagged port + +Final flow: unchanged +Megaflow: recirc_id=0,eth,ip,in_port=1,dl_src=00:0c:29:a0:27:a1,dl_dst=01:00:5e:01:01:01,nw_proto=2,nw_frag=no +Datapath actions: 3 +This flow is handled by the userspace slow path because it: + - Uses action(s) not supported by datapath. +]) + +AT_CHECK([ovs-vsctl set port p2 other_config:mcast-snooping-flood-reports=true]) + +AT_CHECK([ovs-appctl ofproto/trace "in_port(1)" '01005E010101000C29A027A108004500001C000100004002CBAEAC10221EE001010112140CE9E0010101'], [0], [dnl +Flow: ip,in_port=1,vlan_tci=0x0000,dl_src=00:0c:29:a0:27:a1,dl_dst=01:00:5e:01:01:01,nw_src=172.16.34.30,nw_dst=224.1.1.1,nw_proto=2,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=18,tp_dst=20 + +bridge("br0") +------------- + 0. priority 32768 + NORMAL + -> forwarding report to mcast flagged port + -> forwarding report to mcast flagged port + +Final flow: unchanged +Megaflow: recirc_id=0,eth,ip,in_port=1,dl_src=00:0c:29:a0:27:a1,dl_dst=01:00:5e:01:01:01,nw_proto=2,nw_frag=no +Datapath actions: 3,2 +This flow is handled by the userspace slow path because it: + - Uses action(s) not supported by datapath. 
+]) + +AT_CHECK([ovs-vsctl set port p1 other_config:mcast-snooping-flood-reports=true]) + +AT_CHECK([ovs-appctl ofproto/trace "in_port(1)" '01005E010101000C29A027A108004500001C000100004002CBAEAC10221EE001010112140CE9E0010101'], [0], [dnl +Flow: ip,in_port=1,vlan_tci=0x0000,dl_src=00:0c:29:a0:27:a1,dl_dst=01:00:5e:01:01:01,nw_src=172.16.34.30,nw_dst=224.1.1.1,nw_proto=2,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=18,tp_dst=20 + +bridge("br0") +------------- + 0. priority 32768 + NORMAL + -> forwarding report to mcast flagged port + -> forwarding report to mcast flagged port + -> mcast port is input port, dropping the Report + +Final flow: unchanged +Megaflow: recirc_id=0,eth,ip,in_port=1,dl_src=00:0c:29:a0:27:a1,dl_dst=01:00:5e:01:01:01,nw_proto=2,nw_frag=no +Datapath actions: 3,2 +This flow is handled by the userspace slow path because it: + - Uses action(s) not supported by datapath. +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + + AT_SETUP([mcast - delete the port mdb when vlan configuration changed]) OVS_VSWITCHD_START([]) From 42c1e2efeda116457b356d0d0b67c36ff0045cc7 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Thu, 16 Nov 2023 12:42:45 +0100 Subject: [PATCH 449/833] mcast-snooping: Flush flood and report ports when deleting interfaces. When a configuration change triggers an interface destruction/creation (like for example, setting ofport_request), a port object may still be referenced as a fport or a rport in the mdb. Before the fix, when flooding multicast traffic: bridge("br0") ------------- 0. priority 32768 NORMAL -> forwarding to mcast group port >> mcast flood port is unknown, dropping -> mcast flood port is input port, dropping -> forwarding to mcast flood port Before the fix, when flooding igmp report traffic: bridge("br0") ------------- 0. 
priority 32768 NORMAL >> mcast port is unknown, dropping the report -> forwarding report to mcast flagged port -> mcast port is input port, dropping the Report -> forwarding report to mcast flagged port Add relevant cleanup and update unit tests. Fixes: 4fbbf8624868 ("mcast-snooping: Flush ports mdb when VLAN configuration changed.") Acked-by: Paolo Valerio Signed-off-by: David Marchand Acked-by: Eelco Chaudron Signed-off-by: Simon Horman --- lib/mcast-snooping.c | 17 ++++++++++++++++- tests/mcast-snooping.at | 42 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/lib/mcast-snooping.c b/lib/mcast-snooping.c index 029ca28558b..43805ae4d56 100644 --- a/lib/mcast-snooping.c +++ b/lib/mcast-snooping.c @@ -946,8 +946,9 @@ mcast_snooping_wait(struct mcast_snooping *ms) void mcast_snooping_flush_bundle(struct mcast_snooping *ms, void *port) { - struct mcast_group *g; struct mcast_mrouter_bundle *m; + struct mcast_port_bundle *p; + struct mcast_group *g; if (!mcast_snooping_enabled(ms)) { return; @@ -971,5 +972,19 @@ mcast_snooping_flush_bundle(struct mcast_snooping *ms, void *port) } } + LIST_FOR_EACH_SAFE (p, node, &ms->fport_list) { + if (p->port == port) { + mcast_snooping_flush_port(p); + ms->need_revalidate = true; + } + } + + LIST_FOR_EACH_SAFE (p, node, &ms->rport_list) { + if (p->port == port) { + mcast_snooping_flush_port(p); + ms->need_revalidate = true; + } + } + ovs_rwlock_unlock(&ms->rwlock); } diff --git a/tests/mcast-snooping.at b/tests/mcast-snooping.at index 9797bca531c..faeb7890d9c 100644 --- a/tests/mcast-snooping.at +++ b/tests/mcast-snooping.at @@ -207,6 +207,26 @@ Megaflow: recirc_id=0,eth,udp,in_port=3,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e Datapath actions: 1,2 ]) +# Change p2 ofport to force a ofbundle change and check that the mdb contains +# no stale port. 
+AT_CHECK([ovs-vsctl set interface p2 ofport_request=4]) + +AT_CHECK([ovs-appctl ofproto/trace "in_port(3),eth(src=aa:55:aa:55:00:ff,dst=01:00:5e:01:01:01),eth_type(0x0800),ipv4(src=10.0.0.1,dst=224.1.1.1,proto=17,tos=0,ttl=64,frag=no),udp(src=0,dst=8000)"], [0], [dnl +Flow: udp,in_port=3,vlan_tci=0x0000,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_src=10.0.0.1,nw_dst=224.1.1.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=0,tp_dst=8000 + +bridge("br0") +------------- + 0. priority 32768 + NORMAL + -> forwarding to mcast group port + -> mcast flood port is input port, dropping + -> forwarding to mcast flood port + +Final flow: unchanged +Megaflow: recirc_id=0,eth,udp,in_port=3,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_dst=224.1.1.1,nw_frag=no +Datapath actions: 1,2 +]) + OVS_VSWITCHD_STOP AT_CLEANUP @@ -381,6 +401,28 @@ This flow is handled by the userspace slow path because it: - Uses action(s) not supported by datapath. ]) +# Change p2 ofport to force a ofbundle change and check that the mdb contains +# no stale port. +AT_CHECK([ovs-vsctl set interface p3 ofport_request=4]) + +AT_CHECK([ovs-appctl ofproto/trace "in_port(1)" '01005E010101000C29A027A108004500001C000100004002CBAEAC10221EE001010112140CE9E0010101'], [0], [dnl +Flow: ip,in_port=1,vlan_tci=0x0000,dl_src=00:0c:29:a0:27:a1,dl_dst=01:00:5e:01:01:01,nw_src=172.16.34.30,nw_dst=224.1.1.1,nw_proto=2,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=18,tp_dst=20 + +bridge("br0") +------------- + 0. priority 32768 + NORMAL + -> forwarding report to mcast flagged port + -> mcast port is input port, dropping the Report + -> forwarding report to mcast flagged port + +Final flow: unchanged +Megaflow: recirc_id=0,eth,ip,in_port=1,dl_src=00:0c:29:a0:27:a1,dl_dst=01:00:5e:01:01:01,nw_proto=2,nw_frag=no +Datapath actions: 2,3 +This flow is handled by the userspace slow path because it: + - Uses action(s) not supported by datapath. 
+]) + OVS_VSWITCHD_STOP AT_CLEANUP From 40f651ce0711f0319da631ed2b56742b9934c2b2 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Thu, 16 Nov 2023 12:42:46 +0100 Subject: [PATCH 450/833] mcast-snooping: Fix comments format. Capitalize comments and end them with a . when needed. Signed-off-by: David Marchand Acked-by: Paolo Valerio Acked-by: Eelco Chaudron Signed-off-by: Simon Horman --- tests/mcast-snooping.at | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/mcast-snooping.at b/tests/mcast-snooping.at index faeb7890d9c..890e6aca009 100644 --- a/tests/mcast-snooping.at +++ b/tests/mcast-snooping.at @@ -31,13 +31,13 @@ dummy@ovs-dummy: hit:0 missed:0 ovs-appctl time/stop -# Send IGMPv3 query on p2 with vlan 1725 +# Send IGMPv3 query on p2 with vlan 1725. # 5c:8a:38:55:25:52 > 01:00:5e:00:00:01, ethertype 802.1Q (0x8100), length 64: vlan 1725, p 0, ethertype IPv4, # 172.17.25.1 > 224.0.0.1: igmp query v3 AT_CHECK([ovs-appctl netdev-dummy/receive p2 \ '01005e0000015c8a38552552810006bd080046c000240000000001027f00ac111901e0000001940400001164ec1e00000000027d000000000000000000000000']) -# Send IGMPv3 query on p2 with vlan 1728 +# Send IGMPv3 query on p2 with vlan 1728. # 5c:8a:38:55:25:52 > 01:00:5e:00:00:01, ethertype 802.1Q (0x8100), length 64: vlan 1728, p 0, ethertype IPv4, # 172.17.28.1 > 224.0.0.1: igmp query v3 AT_CHECK([ovs-appctl netdev-dummy/receive p2 \ @@ -51,13 +51,13 @@ AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl AT_CHECK([ovs-vsctl set Interface p2 options:tx_pcap=p2.pcap]) -# Send a multicast packet on p1 +# Send a multicast packet on p1. AT_CHECK([ ovs-appctl netdev-dummy/receive p1 \ 'in_port(1),eth(src=aa:55:aa:55:00:01,dst=01:00:5e:5e:01:01),eth_type(0x0800),ipv4(src=10.0.0.1,dst=239.94.1.1,proto=17,tos=0,ttl=64,frag=no),udp(src=0,dst=8000)' ]) -# Check this packet was forwarded exactly once to p2 and has vlan tag 1725 +# Check this packet was forwarded exactly once to p2 and has vlan tag 1725. 
# aa:55:aa:55:00:01 > 01:00:5e:5e:01:01, ethertype 802.1Q (0x8100), length 46: vlan 1725, p 0, ethertype IPv4, # 10.0.0.1.0 > 239.94.1.1.8000: UDP, length 0 AT_CHECK([ovs-pcap p2.pcap > p2.pcap.txt 2>&1]) @@ -450,7 +450,7 @@ AT_CHECK([ ovs-appctl time/stop -# send report packets +# Send report packets. AT_CHECK([ ovs-appctl netdev-dummy/receive p1 \ '01005E010101000C29A027A18100000108004500001C000100004002CBAEAC10221EE001010112140CE9E0010101' @@ -458,7 +458,7 @@ AT_CHECK([ '01005E010101000C29A027A28100000208004500001C000100004002CBAEAC10221EE001010112140CE9E0010101' ], [0]) -# send query packets +# Send query packets. AT_CHECK([ ovs-appctl netdev-dummy/receive p3 \ '01005E010101000C29A027D18100000108004500001C000100004002CBCBAC102201E00101011114EEEB00000000' @@ -505,7 +505,7 @@ AT_CHECK([ ovs-appctl time/stop -# send report packets +# Send report packets. AT_CHECK([ ovs-appctl netdev-dummy/receive p1 \ '01005E010101000C29A027A18100000108004500001C000100004002CBAEAC10221EE001010112140CE9E0010101' @@ -513,7 +513,7 @@ AT_CHECK([ '01005E010101000C29A027A28100000208004500001C000100004002CBAEAC10221EE001010112140CE9E0010101' ], [0]) -# send query packets +# Send query packets. AT_CHECK([ ovs-appctl netdev-dummy/receive p2 \ '01005E010101000C29A027D18100000108004500001C000100004002CBCBAC102201E00101011114EEEB00000000' From 3626c17443b6178f5be463996eb24284f4c26d32 Mon Sep 17 00:00:00 2001 From: Ales Musil Date: Tue, 28 Nov 2023 08:46:22 +0100 Subject: [PATCH 451/833] ovs-ofctl: Correctly mark the CT flush commands. Change the ct-flush and ct-flush-zone to RW as they do in fact modify the state. 
Fixes: 2a7c4805a76d ("Add OpenFlow command to flush conntrack table entries.") Fixes: 08146bf7d9b4 ("openflow: Add extension to flush CT by generic match.") Signed-off-by: Ales Musil Signed-off-by: Ilya Maximets --- utilities/ovs-ofctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utilities/ovs-ofctl.c b/utilities/ovs-ofctl.c index 0a382f336bb..2d413e2396d 100644 --- a/utilities/ovs-ofctl.c +++ b/utilities/ovs-ofctl.c @@ -5144,10 +5144,10 @@ static const struct ovs_cmdl_command all_commands[] = { 1, 1, ofctl_dump_ipfix_flow, OVS_RO }, { "ct-flush-zone", "switch zone", - 2, 2, ofctl_ct_flush_zone, OVS_RO }, + 2, 2, ofctl_ct_flush_zone, OVS_RW }, { "ct-flush", "switch [zone=N] [ct-orig-tuple [ct-reply-tuple]]", - 1, 4, ofctl_ct_flush, OVS_RO }, + 1, 4, ofctl_ct_flush, OVS_RW }, { "ofp-parse", "file", 1, 1, ofctl_ofp_parse, OVS_RW }, From ceb29608db455c0c8b3df8ffa953a25ec450b031 Mon Sep 17 00:00:00 2001 From: Kevin Traynor Date: Wed, 29 Nov 2023 17:14:47 +0000 Subject: [PATCH 452/833] faq: Update matching DPDK releases for older branches. Branches 2.17/3.0/3.1/3.2 are using newer DPDK LTS releases. Update the faq. Signed-off-by: Kevin Traynor Acked-by: Ilya Maximets --- Documentation/faq/releases.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/faq/releases.rst b/Documentation/faq/releases.rst index e6bda14e7b0..362bf4ec7ba 100644 --- a/Documentation/faq/releases.rst +++ b/Documentation/faq/releases.rst @@ -215,10 +215,10 @@ Q: What DPDK version does each Open vSwitch release work with? 2.14.x 19.11.13 2.15.x 20.11.6 2.16.x 20.11.6 - 2.17.x 21.11.2 - 3.0.x 21.11.2 - 3.1.x 22.11.1 - 3.2.x 22.11.1 + 2.17.x 21.11.5 + 3.0.x 21.11.5 + 3.1.x 22.11.3 + 3.2.x 22.11.3 ============ ======== Q: Are all the DPDK releases that OVS versions work with maintained? 
From cf984d5becd48a99f365fae23c7c7faf1fc13905 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 30 Nov 2023 22:11:46 +0100 Subject: [PATCH 453/833] appveyor: Use previous image to unblock CI. It may take a few days for AppVeyor to fix their broken images [1], use the 'Previous' version of the image for now to unblock CI. We'll need to revert this once the issue is fixed. [1] https://github.com/appveyor/ci/issues/3893 Acked-by: Alin Gabriel Serdean Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 25f69bb8d11..5903b90d078 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,5 +1,5 @@ version: 1.0.{build} -image: Visual Studio 2019 +image: Previous Visual Studio 2019 branches: only: - master From c8d494636758439ee41120aad9635b93889aad39 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 29 Nov 2023 17:42:02 +0100 Subject: [PATCH 454/833] cirrus: Update from FreeBSD 12 to 14. FreeBSD 12 is going EOL in December [1] and will likely become unavailable shortly after. FreeBSD 14 was released recently, so replacing the old with the new. [1] https://www.freebsd.org/security/ Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- .cirrus.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cirrus.yml b/.cirrus.yml index 48931fa085c..d8a97228095 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -2,8 +2,8 @@ freebsd_build_task: freebsd_instance: matrix: - image_family: freebsd-12-4-snap image_family: freebsd-13-2-snap + image_family: freebsd-14-0-snap cpu: 4 memory: 4G From 6c59c195266cc95c15142e05d7c444b2b1db73e1 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Tue, 21 Nov 2023 14:26:50 -0500 Subject: [PATCH 455/833] netdev-linux: Use ethtool to detect offload support. Currently when userspace-tso is enabled, netdev-linux interfaces will indicate support for all offload flags regardless of interface configuration. 
This patch checks for which offload features are enabled during netdev construction. Signed-off-by: Mike Pattrick Acked-by: Simon Horman Signed-off-by: Ilya Maximets --- lib/netdev-linux.c | 150 ++++++++++++++++++++++++++++++++++++++-- tests/ofproto-macros.at | 1 + 2 files changed, 144 insertions(+), 7 deletions(-) diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 18b62d90c28..93a5845c064 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -558,6 +558,7 @@ static bool netdev_linux_miimon_enabled(void); static void netdev_linux_miimon_run(void); static void netdev_linux_miimon_wait(void); static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup); +static void netdev_linux_set_ol(struct netdev *netdev); static bool is_tap_netdev(const struct netdev *netdev) @@ -959,14 +960,12 @@ netdev_linux_construct(struct netdev *netdev_) return error; } - /* The socket interface doesn't offer the option to enable only - * csum offloading without TSO. */ if (userspace_tso_enabled()) { - netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO; - netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM; - netdev_->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM; - netdev_->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM; - netdev_->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM; + /* The AF_PACKET socket interface uses the same option to facilitate + * both csum and segmentation offloading. However, these features can + * be toggled off or on individually at the interface level. The netdev + * flags are set based on the features indicated by ethtool. 
*/ + netdev_linux_set_ol(netdev_); } error = get_flags(&netdev->up, &netdev->ifi_flags); @@ -2381,6 +2380,143 @@ netdev_internal_get_stats(const struct netdev *netdev_, return error; } +static int +netdev_linux_read_stringset_info(struct netdev_linux *netdev, uint32_t *len) +{ + union { + struct ethtool_sset_info hdr; + struct { + uint64_t pad[2]; + uint32_t sset_len[1]; + }; + } sset_info; + int error; + + sset_info.hdr.cmd = ETHTOOL_GSSET_INFO; + sset_info.hdr.reserved = 0; + sset_info.hdr.sset_mask = 1ULL << ETH_SS_FEATURES; + + error = netdev_linux_do_ethtool(netdev_get_name(&netdev->up), + (struct ethtool_cmd *) &sset_info, + ETHTOOL_GSSET_INFO, "ETHTOOL_GSSET_INFO"); + if (error) { + return error; + } + if (sset_info.hdr.sset_mask & (1ULL << ETH_SS_FEATURES)) { + *len = sset_info.sset_len[0]; + return 0; + } else { + /* ETH_SS_FEATURES is not supported. */ + return -EOPNOTSUPP; + } +} + + +static int +netdev_linux_read_definitions(struct netdev_linux *netdev, + struct ethtool_gstrings **pstrings) +{ + struct ethtool_gstrings *strings = NULL; + uint32_t len = 0; + int error = 0; + + error = netdev_linux_read_stringset_info(netdev, &len); + if (error || !len) { + return error; + } + strings = xzalloc(sizeof *strings + len * ETH_GSTRING_LEN); + + strings->cmd = ETHTOOL_GSTRINGS; + strings->string_set = ETH_SS_FEATURES; + strings->len = len; + error = netdev_linux_do_ethtool(netdev_get_name(&netdev->up), + (struct ethtool_cmd *) strings, + ETHTOOL_GSTRINGS, "ETHTOOL_GSTRINGS"); + if (error) { + goto out; + } + + for (int i = 0; i < len; i++) { + strings->data[(i + 1) * ETH_GSTRING_LEN - 1] = 0; + } + + *pstrings = strings; + + return 0; +out: + *pstrings = NULL; + free(strings); + return error; +} + +static void +netdev_linux_set_ol(struct netdev *netdev_) +{ + struct netdev_linux *netdev = netdev_linux_cast(netdev_); + struct ethtool_gfeatures *features = NULL; + struct ethtool_gstrings *names = NULL; + int error; + + COVERAGE_INC(netdev_get_ethtool); + + error 
= netdev_linux_read_definitions(netdev, &names); + if (error) { + return; + } + + features = xzalloc(sizeof *features + + DIV_ROUND_UP(names->len, 32) * + sizeof features->features[0]); + + features->cmd = ETHTOOL_GFEATURES; + features->size = DIV_ROUND_UP(names->len, 32); + error = netdev_linux_do_ethtool(netdev_get_name(netdev_), + (struct ethtool_cmd *) features, + ETHTOOL_GFEATURES, "ETHTOOL_GFEATURES"); + + if (error) { + goto out; + } + +#define FEATURE_WORD(blocks, index, field) ((blocks)[(index) / 32U].field) +#define FEATURE_FIELD_FLAG(index) (1U << (index) % 32U) +#define FEATURE_BIT_IS_SET(blocks, index, field) \ + (FEATURE_WORD(blocks, index, field) & FEATURE_FIELD_FLAG(index)) + + netdev->up.ol_flags = 0; + static const struct { + char *string; + uint32_t value; + } t_list[] = { + {"tx-checksum-ipv4", NETDEV_TX_OFFLOAD_IPV4_CKSUM | + NETDEV_TX_OFFLOAD_TCP_CKSUM | + NETDEV_TX_OFFLOAD_UDP_CKSUM}, + {"tx-checksum-ipv6", NETDEV_TX_OFFLOAD_TCP_CKSUM | + NETDEV_TX_OFFLOAD_UDP_CKSUM}, + {"tx-checksum-ip-generic", NETDEV_TX_OFFLOAD_IPV4_CKSUM | + NETDEV_TX_OFFLOAD_TCP_CKSUM | + NETDEV_TX_OFFLOAD_UDP_CKSUM}, + {"tx-checksum-sctp", NETDEV_TX_OFFLOAD_SCTP_CKSUM}, + {"tx-tcp-segmentation", NETDEV_TX_OFFLOAD_TCP_TSO}, + }; + + for (int j = 0; j < ARRAY_SIZE(t_list); j++) { + for (int i = 0; i < names->len; i++) { + char *name = (char *) names->data + i * ETH_GSTRING_LEN; + if (strcmp(t_list[j].string, name) == 0) { + if (FEATURE_BIT_IS_SET(features->features, i, active)) { + netdev_->ol_flags |= t_list[j].value; + } + break; + } + } + } + +out: + free(names); + free(features); +} + static void netdev_linux_read_features(struct netdev_linux *netdev) { diff --git a/tests/ofproto-macros.at b/tests/ofproto-macros.at index d2e6ac768ba..5a7b7a6e77c 100644 --- a/tests/ofproto-macros.at +++ b/tests/ofproto-macros.at @@ -260,6 +260,7 @@ check_logs () { /ovs_rcu.*blocked [[0-9]]* ms waiting for .* to quiesce/d /Dropped [[0-9]]* log messages/d /setting extended ack support 
failed/d +/ETHTOOL_GSSET_INFO/d /|WARN|/p /|ERR|/p /|EMER|/p" ${logs} From e0056018c4e31c30d66abb2928ebe5deb9725e55 Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Tue, 21 Nov 2023 14:26:51 -0500 Subject: [PATCH 456/833] userspace: Respect tso/gso segment size. Currently OVS will calculate the segment size based on the MTU of the egress port. That usually happens to be correct when the ports share the same MTU, but that is not always true. Therefore, if the segment size is provided, then use that and make sure the over sized packets are dropped. Signed-off-by: Flavio Leitner Co-authored-by: Mike Pattrick Signed-off-by: Mike Pattrick Acked-by: Simon Horman Signed-off-by: Ilya Maximets --- lib/dp-packet.c | 3 ++ lib/dp-packet.h | 26 ++++++++++++++++ lib/netdev-dpdk.c | 12 +++++++- lib/netdev-linux.c | 77 +++++++++++++++++++++++++++++++++------------- 4 files changed, 95 insertions(+), 23 deletions(-) diff --git a/lib/dp-packet.c b/lib/dp-packet.c index ed004c3b902..920402369de 100644 --- a/lib/dp-packet.c +++ b/lib/dp-packet.c @@ -34,6 +34,7 @@ dp_packet_init__(struct dp_packet *b, size_t allocated, enum dp_packet_source so pkt_metadata_init(&b->md, 0); dp_packet_reset_cutlen(b); dp_packet_reset_offload(b); + dp_packet_set_tso_segsz(b, 0); /* Initialize implementation-specific fields of dp_packet. */ dp_packet_init_specific(b); /* By default assume the packet type to be Ethernet. 
*/ @@ -203,6 +204,8 @@ dp_packet_clone_with_headroom(const struct dp_packet *buffer, size_t headroom) *dp_packet_ol_flags_ptr(new_buffer) = *dp_packet_ol_flags_ptr(buffer); *dp_packet_ol_flags_ptr(new_buffer) &= DP_PACKET_OL_SUPPORTED_MASK; + dp_packet_set_tso_segsz(new_buffer, dp_packet_get_tso_segsz(buffer)); + if (dp_packet_rss_valid(buffer)) { dp_packet_set_rss_hash(new_buffer, dp_packet_get_rss_hash(buffer)); } diff --git a/lib/dp-packet.h b/lib/dp-packet.h index 70ddf8aa45a..30a1d9dc090 100644 --- a/lib/dp-packet.h +++ b/lib/dp-packet.h @@ -126,6 +126,7 @@ struct dp_packet { uint32_t ol_flags; /* Offloading flags. */ uint32_t rss_hash; /* Packet hash. */ uint32_t flow_mark; /* Packet flow mark. */ + uint16_t tso_segsz; /* TCP segment size. */ #endif enum dp_packet_source source; /* Source of memory allocated as 'base'. */ @@ -166,6 +167,9 @@ static inline void dp_packet_set_size(struct dp_packet *, uint32_t); static inline uint16_t dp_packet_get_allocated(const struct dp_packet *); static inline void dp_packet_set_allocated(struct dp_packet *, uint16_t); +static inline uint16_t dp_packet_get_tso_segsz(const struct dp_packet *); +static inline void dp_packet_set_tso_segsz(struct dp_packet *, uint16_t); + void *dp_packet_resize_l2(struct dp_packet *, int increment); void *dp_packet_resize_l2_5(struct dp_packet *, int increment); static inline void *dp_packet_eth(const struct dp_packet *); @@ -644,6 +648,17 @@ dp_packet_set_allocated(struct dp_packet *b, uint16_t s) b->mbuf.buf_len = s; } +static inline uint16_t +dp_packet_get_tso_segsz(const struct dp_packet *p) +{ + return p->mbuf.tso_segsz; +} + +static inline void +dp_packet_set_tso_segsz(struct dp_packet *p, uint16_t s) +{ + p->mbuf.tso_segsz = s; +} #else /* DPDK_NETDEV */ static inline void @@ -700,6 +715,17 @@ dp_packet_set_allocated(struct dp_packet *b, uint16_t s) b->allocated_ = s; } +static inline uint16_t +dp_packet_get_tso_segsz(const struct dp_packet *p) +{ + return p->tso_segsz; +} + +static 
inline void +dp_packet_set_tso_segsz(struct dp_packet *p, uint16_t s) +{ + p->tso_segsz = s; +} #endif /* DPDK_NETDEV */ static inline void diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 29f2b280d49..706036d4e42 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -2486,6 +2486,7 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) { struct tcp_header *th = dp_packet_l4(pkt); + int hdr_len; if (!th) { VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header" @@ -2495,7 +2496,15 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4; mbuf->ol_flags |= RTE_MBUF_F_TX_TCP_CKSUM; + hdr_len = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len; mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len; + if (OVS_UNLIKELY((hdr_len + mbuf->tso_segsz) > dev->max_packet_len)) { + VLOG_WARN_RL(&rl, "%s: Oversized TSO packet. " + "hdr: %"PRIu32", gso: %"PRIu32", max len: %"PRIu32"", + dev->up.name, hdr_len, mbuf->tso_segsz, + dev->max_packet_len); + return false; + } if (mbuf->ol_flags & RTE_MBUF_F_TX_IPV4) { mbuf->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM; @@ -2770,7 +2779,8 @@ netdev_dpdk_filter_packet_len(struct netdev_dpdk *dev, struct rte_mbuf **pkts, int cnt = 0; struct rte_mbuf *pkt; - /* Filter oversized packets, unless are marked for TSO. */ + /* Filter oversized packets. The TSO packets are filtered out + * during the offloading preparation for performance reasons. 
*/ for (i = 0; i < pkt_cnt; i++) { pkt = pkts[i]; if (OVS_UNLIKELY((pkt->pkt_len > dev->max_packet_len) diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 93a5845c064..e79a432607a 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -539,7 +539,7 @@ static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0); static bool tap_supports_vnet_hdr = true; static int netdev_linux_parse_vnet_hdr(struct dp_packet *b); -static void netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu); +static int netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu); static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *, int cmd, const char *cmd_name); static int get_flags(const struct netdev *, unsigned int *flags); @@ -1597,9 +1597,10 @@ netdev_linux_rxq_drain(struct netdev_rxq *rxq_) } static int -netdev_linux_sock_batch_send(int sock, int ifindex, bool tso, int mtu, - struct dp_packet_batch *batch) +netdev_linux_sock_batch_send(struct netdev *netdev_, int sock, int ifindex, + bool tso, int mtu, struct dp_packet_batch *batch) { + struct netdev_linux *netdev = netdev_linux_cast(netdev_); const size_t size = dp_packet_batch_size(batch); /* We don't bother setting most fields in sockaddr_ll because the * kernel ignores them for SOCK_RAW. */ @@ -1608,26 +1609,36 @@ netdev_linux_sock_batch_send(int sock, int ifindex, bool tso, int mtu, struct mmsghdr *mmsg = xmalloc(sizeof(*mmsg) * size); struct iovec *iov = xmalloc(sizeof(*iov) * size); - struct dp_packet *packet; + int cnt = 0; + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { if (tso) { - netdev_linux_prepend_vnet_hdr(packet, mtu); - } + int ret = netdev_linux_prepend_vnet_hdr(packet, mtu); + + if (OVS_UNLIKELY(ret)) { + netdev->tx_dropped += 1; + VLOG_WARN_RL(&rl, "%s: Prepend vnet hdr failed, packet " + "dropped. 
%s", netdev_get_name(netdev_), + ovs_strerror(ret)); + continue; + } + } - iov[i].iov_base = dp_packet_data(packet); - iov[i].iov_len = dp_packet_size(packet); - mmsg[i].msg_hdr = (struct msghdr) { .msg_name = &sll, - .msg_namelen = sizeof sll, - .msg_iov = &iov[i], - .msg_iovlen = 1 }; + iov[cnt].iov_base = dp_packet_data(packet); + iov[cnt].iov_len = dp_packet_size(packet); + mmsg[cnt].msg_hdr = (struct msghdr) { .msg_name = &sll, + .msg_namelen = sizeof sll, + .msg_iov = &iov[cnt], + .msg_iovlen = 1 }; + cnt++; } int error = 0; - for (uint32_t ofs = 0; ofs < size; ) { + for (uint32_t ofs = 0; ofs < cnt;) { ssize_t retval; do { - retval = sendmmsg(sock, mmsg + ofs, size - ofs, 0); + retval = sendmmsg(sock, mmsg + ofs, cnt - ofs, 0); error = retval < 0 ? errno : 0; } while (error == EINTR); if (error) { @@ -1669,7 +1680,14 @@ netdev_linux_tap_batch_send(struct netdev *netdev_, int mtu, int error; if (OVS_LIKELY(tap_supports_vnet_hdr)) { - netdev_linux_prepend_vnet_hdr(packet, mtu); + error = netdev_linux_prepend_vnet_hdr(packet, mtu); + if (OVS_UNLIKELY(error)) { + netdev->tx_dropped++; + VLOG_WARN_RL(&rl, "%s: Prepend vnet hdr failed, packet " + "dropped. %s", netdev_get_name(netdev_), + ovs_strerror(error)); + continue; + } } size = dp_packet_size(packet); @@ -1799,7 +1817,8 @@ netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED, goto free_batch; } - error = netdev_linux_sock_batch_send(sock, ifindex, tso, mtu, batch); + error = netdev_linux_sock_batch_send(netdev_, sock, ifindex, tso, mtu, + batch); } else { error = netdev_linux_tap_batch_send(netdev_, mtu, batch); } @@ -7092,8 +7111,7 @@ netdev_linux_parse_vnet_hdr(struct dp_packet *b) switch (vnet->gso_type) { case VIRTIO_NET_HDR_GSO_TCPV4: case VIRTIO_NET_HDR_GSO_TCPV6: - /* FIXME: The packet has offloaded TCP segmentation. The gso_size - * is given and needs to be respected. 
*/ + dp_packet_set_tso_segsz(b, (OVS_FORCE uint16_t) vnet->gso_size); dp_packet_hwol_set_tcp_seg(b); break; @@ -7115,18 +7133,32 @@ netdev_linux_parse_vnet_hdr(struct dp_packet *b) return ret; } -static void +/* Prepends struct virtio_net_hdr to packet 'b'. + * Returns 0 if successful, otherwise a positive errno value. + * Returns EMSGSIZE if the packet 'b' cannot be sent over MTU 'mtu'. */ +static int netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu) { struct virtio_net_hdr v; struct virtio_net_hdr *vnet = &v; if (dp_packet_hwol_is_tso(b)) { - uint16_t hdr_len = ((char *)dp_packet_l4(b) - (char *)dp_packet_eth(b)) - + TCP_HEADER_LEN; + uint16_t tso_segsz = dp_packet_get_tso_segsz(b); + struct tcp_header *tcp = dp_packet_l4(b); + int tcp_hdr_len = TCP_OFFSET(tcp->tcp_ctl) * 4; + int hdr_len = ((char *) dp_packet_l4(b) - (char *) dp_packet_eth(b)) + + tcp_hdr_len; + int max_packet_len = mtu + ETH_HEADER_LEN + VLAN_HEADER_LEN; + + if (OVS_UNLIKELY((hdr_len + tso_segsz) > max_packet_len)) { + VLOG_WARN_RL(&rl, "Oversized TSO packet. hdr_len: %"PRIu32", " + "gso: %"PRIu16", max length: %"PRIu32".", hdr_len, + tso_segsz, max_packet_len); + return EMSGSIZE; + } vnet->hdr_len = (OVS_FORCE __virtio16)hdr_len; - vnet->gso_size = (OVS_FORCE __virtio16)(mtu - hdr_len); + vnet->gso_size = (OVS_FORCE __virtio16)(tso_segsz); if (dp_packet_hwol_is_ipv4(b)) { vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; } else if (dp_packet_hwol_tx_ipv6(b)) { @@ -7216,4 +7248,5 @@ netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu) } dp_packet_push(b, vnet, sizeof *vnet); + return 0; } From 8b5fe2dc6080db0bed9969cf81bb4a007539cfbe Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Tue, 21 Nov 2023 14:26:52 -0500 Subject: [PATCH 457/833] userspace: Add Generic Segmentation Offloading. This provides a software implementation in the case the egress netdev doesn't support segmentation in hardware. 
The challenge here is to guarantee packet ordering in the original batch that may be full of TSO packets. Each TSO packet can go up to ~64kB, so with segment size of 1440 that means about 44 packets for each TSO. Each batch has 32 packets, so the total batch amounts to 1408 normal packets. The segmentation estimates the total number of packets and then the total number of batches. Then allocate enough memory and finally do the work. Finally each batch is sent in order to the netdev. Signed-off-by: Flavio Leitner Co-authored-by: Mike Pattrick Signed-off-by: Mike Pattrick Acked-by: Simon Horman Signed-off-by: Ilya Maximets --- NEWS | 4 + lib/automake.mk | 2 + lib/dp-packet-gso.c | 168 ++++++++++++++++++++++++++++++++++++++++ lib/dp-packet-gso.h | 23 ++++++ lib/dp-packet.h | 7 ++ lib/netdev-dpdk.c | 44 ++++++++--- lib/netdev.c | 139 +++++++++++++++++++++------------ lib/packets.c | 4 +- tests/system-traffic.at | 45 +++++++++++ 9 files changed, 371 insertions(+), 65 deletions(-) create mode 100644 lib/dp-packet-gso.c create mode 100644 lib/dp-packet-gso.h diff --git a/NEWS b/NEWS index 1d9c30533b3..490e275da5a 100644 --- a/NEWS +++ b/NEWS @@ -15,6 +15,10 @@ Post-v3.2.0 a.k.a. 'configured' values, can be found in the 'status' column of the Interface table, i.e. with 'ovs-vsctl get interface <..> status'. Reported names adjusted accordingly. + - Userspace datapath: + * Added support for Generic Segmentation Offloading for the cases where + TSO is enabled but not supported by an egress interface (except for + tunnel interfaces). 
v3.2.0 - 17 Aug 2023 diff --git a/lib/automake.mk b/lib/automake.mk index 1be13a420a7..0dc8a35cc43 100644 --- a/lib/automake.mk +++ b/lib/automake.mk @@ -118,6 +118,8 @@ lib_libopenvswitch_la_SOURCES = \ lib/dpctl.h \ lib/dp-packet.h \ lib/dp-packet.c \ + lib/dp-packet-gso.c \ + lib/dp-packet-gso.h \ lib/dpdk.h \ lib/dpif-netdev-extract-study.c \ lib/dpif-netdev-lookup.h \ diff --git a/lib/dp-packet-gso.c b/lib/dp-packet-gso.c new file mode 100644 index 00000000000..e2c141b32a7 --- /dev/null +++ b/lib/dp-packet-gso.c @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2023 Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include "dp-packet.h" +#include "dp-packet-gso.h" +#include "netdev-provider.h" +#include "openvswitch/vlog.h" + +VLOG_DEFINE_THIS_MODULE(dp_packet_gso); + +/* Retuns a new packet that is a segment of packet 'p'. + * + * The new packet is initialized with 'hdr_len' bytes from the + * start of packet 'p' and then appended with 'data_len' bytes + * from the 'data' buffer. + * + * Note: The packet headers are not updated. */ +static struct dp_packet * +dp_packet_gso_seg_new(const struct dp_packet *p, size_t hdr_len, + const char *data, size_t data_len) +{ + struct dp_packet *seg = dp_packet_new_with_headroom(hdr_len + data_len, + dp_packet_headroom(p)); + + /* Append the original packet headers and then the payload. 
*/ + dp_packet_put(seg, dp_packet_data(p), hdr_len); + dp_packet_put(seg, data, data_len); + + /* The new segment should have the same offsets. */ + seg->l2_5_ofs = p->l2_5_ofs; + seg->l3_ofs = p->l3_ofs; + seg->l4_ofs = p->l4_ofs; + + /* The protocol headers remain the same, so preserve hash and mark. */ + *dp_packet_rss_ptr(seg) = *dp_packet_rss_ptr(p); + *dp_packet_flow_mark_ptr(seg) = *dp_packet_flow_mark_ptr(p); + + /* The segment should inherit all the offloading flags from the + * original packet, except for the TCP segmentation, external + * buffer and indirect buffer flags. */ + *dp_packet_ol_flags_ptr(seg) = *dp_packet_ol_flags_ptr(p) + & DP_PACKET_OL_SUPPORTED_MASK; + + dp_packet_hwol_reset_tcp_seg(seg); + + return seg; +} + +/* Returns the calculated number of TCP segments in packet 'p'. */ +int +dp_packet_gso_nr_segs(struct dp_packet *p) +{ + uint16_t segsz = dp_packet_get_tso_segsz(p); + const char *data_tail; + const char *data_pos; + + data_pos = dp_packet_get_tcp_payload(p); + data_tail = (char *) dp_packet_tail(p) - dp_packet_l2_pad_size(p); + + return DIV_ROUND_UP(data_tail - data_pos, segsz); +} + +/* Perform software segmentation on packet 'p'. + * + * Segments packet 'p' into the array of preallocated batches in 'batches', + * updating the 'batches' pointer as needed and returns true. + * + * Returns false if the packet cannot be segmented. 
*/ +bool +dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches) +{ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + struct dp_packet_batch *curr_batch = *batches; + struct tcp_header *tcp_hdr; + struct ip_header *ip_hdr; + struct dp_packet *seg; + uint16_t tcp_offset; + uint16_t tso_segsz; + uint32_t tcp_seq; + uint16_t ip_id; + int hdr_len; + int seg_len; + + tso_segsz = dp_packet_get_tso_segsz(p); + if (!tso_segsz) { + VLOG_WARN_RL(&rl, "GSO packet with len %d with no segment size.", + dp_packet_size(p)); + return false; + } + + tcp_hdr = dp_packet_l4(p); + tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl); + tcp_seq = ntohl(get_16aligned_be32(&tcp_hdr->tcp_seq)); + hdr_len = ((char *) dp_packet_l4(p) - (char *) dp_packet_eth(p)) + + tcp_offset * 4; + ip_id = 0; + if (dp_packet_hwol_is_ipv4(p)) { + ip_hdr = dp_packet_l3(p); + ip_id = ntohs(ip_hdr->ip_id); + } + + const char *data_tail = (char *) dp_packet_tail(p) + - dp_packet_l2_pad_size(p); + const char *data_pos = dp_packet_get_tcp_payload(p); + int n_segs = dp_packet_gso_nr_segs(p); + + for (int i = 0; i < n_segs; i++) { + seg_len = data_tail - data_pos; + if (seg_len > tso_segsz) { + seg_len = tso_segsz; + } + + seg = dp_packet_gso_seg_new(p, hdr_len, data_pos, seg_len); + data_pos += seg_len; + + /* Update L3 header. */ + if (dp_packet_hwol_is_ipv4(seg)) { + ip_hdr = dp_packet_l3(seg); + ip_hdr->ip_tot_len = htons(sizeof *ip_hdr + + dp_packet_l4_size(seg)); + ip_hdr->ip_id = htons(ip_id); + ip_hdr->ip_csum = 0; + ip_id++; + } else { + struct ovs_16aligned_ip6_hdr *ip6_hdr = dp_packet_l3(seg); + + ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen + = htons(sizeof *ip_hdr + dp_packet_l4_size(seg)); + } + + /* Update L4 header. */ + tcp_hdr = dp_packet_l4(seg); + put_16aligned_be32(&tcp_hdr->tcp_seq, htonl(tcp_seq)); + tcp_seq += seg_len; + if (OVS_LIKELY(i < (n_segs - 1))) { + /* Reset flags PUSH and FIN unless it is the last segment. 
*/ + uint16_t tcp_flags = TCP_FLAGS(tcp_hdr->tcp_ctl) + & ~(TCP_PSH | TCP_FIN); + tcp_hdr->tcp_ctl = TCP_CTL(tcp_flags, tcp_offset); + } + + if (dp_packet_batch_is_full(curr_batch)) { + curr_batch++; + } + + dp_packet_batch_add(curr_batch, seg); + } + + *batches = curr_batch; + return true; +} diff --git a/lib/dp-packet-gso.h b/lib/dp-packet-gso.h new file mode 100644 index 00000000000..9c282fb86cc --- /dev/null +++ b/lib/dp-packet-gso.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2023 Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef DP_PACKET_GSO_H +#define DP_PACKET_GSO_H 1 + +bool dp_packet_gso(struct dp_packet *, struct dp_packet_batch **); +int dp_packet_gso_nr_segs(struct dp_packet *); + +#endif /* dp-packet-gso.h */ diff --git a/lib/dp-packet.h b/lib/dp-packet.h index 30a1d9dc090..11aa0072354 100644 --- a/lib/dp-packet.h +++ b/lib/dp-packet.h @@ -1131,6 +1131,13 @@ dp_packet_hwol_set_tcp_seg(struct dp_packet *b) *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG; } +/* Resets TCP Segmentation flag in packet 'p'. */ +static inline void +dp_packet_hwol_reset_tcp_seg(struct dp_packet *p) +{ + *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_TX_TCP_SEG; +} + /* Returns 'true' if the IP header has good integrity and the * checksum in it is complete. 
*/ static inline bool diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 706036d4e42..1ff25c24692 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -2471,6 +2471,7 @@ static bool netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) { struct dp_packet *pkt = CONTAINER_OF(mbuf, struct dp_packet, mbuf); + struct tcp_header *th; if (!(mbuf->ol_flags & (RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_L4_MASK | RTE_MBUF_F_TX_TCP_SEG))) { @@ -2483,27 +2484,36 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) mbuf->l4_len = 0; mbuf->outer_l2_len = 0; mbuf->outer_l3_len = 0; + th = dp_packet_l4(pkt); if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) { - struct tcp_header *th = dp_packet_l4(pkt); - int hdr_len; - if (!th) { VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header" " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len); return false; } + } + + if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_CKSUM) { + if (!th) { + VLOG_WARN_RL(&rl, "%s: TCP offloading without L4 header" + " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len); + return false; + } mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4; - mbuf->ol_flags |= RTE_MBUF_F_TX_TCP_CKSUM; - hdr_len = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len; mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len; - if (OVS_UNLIKELY((hdr_len + mbuf->tso_segsz) > dev->max_packet_len)) { - VLOG_WARN_RL(&rl, "%s: Oversized TSO packet. " - "hdr: %"PRIu32", gso: %"PRIu32", max len: %"PRIu32"", - dev->up.name, hdr_len, mbuf->tso_segsz, - dev->max_packet_len); - return false; + + if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) { + int hdr_len = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len; + if (OVS_UNLIKELY((hdr_len + + mbuf->tso_segsz) > dev->max_packet_len)) { + VLOG_WARN_RL(&rl, "%s: Oversized TSO packet. 
hdr: %"PRIu32", " + "gso: %"PRIu32", max len: %"PRIu32"", + dev->up.name, hdr_len, mbuf->tso_segsz, + dev->max_packet_len); + return false; + } } if (mbuf->ol_flags & RTE_MBUF_F_TX_IPV4) { @@ -2891,6 +2901,7 @@ dpdk_copy_dp_packet_to_mbuf(struct rte_mempool *mp, struct dp_packet *pkt_orig) mbuf_dest->packet_type = pkt_orig->mbuf.packet_type; mbuf_dest->ol_flags |= (pkt_orig->mbuf.ol_flags & ~(RTE_MBUF_F_EXTERNAL | RTE_MBUF_F_INDIRECT)); + mbuf_dest->tso_segsz = pkt_orig->mbuf.tso_segsz; memcpy(&pkt_dest->l2_pad_size, &pkt_orig->l2_pad_size, sizeof(struct dp_packet) - offsetof(struct dp_packet, l2_pad_size)); @@ -2949,11 +2960,20 @@ netdev_dpdk_common_send(struct netdev *netdev, struct dp_packet_batch *batch, struct rte_mbuf **pkts = (struct rte_mbuf **) batch->packets; struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); size_t cnt, pkt_cnt = dp_packet_batch_size(batch); + struct dp_packet *packet; + bool need_copy = false; memset(stats, 0, sizeof *stats); + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { + if (packet->source != DPBUF_DPDK) { + need_copy = true; + break; + } + } + /* Copy dp-packets to mbufs. 
*/ - if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) { + if (OVS_UNLIKELY(need_copy)) { cnt = dpdk_copy_batch_to_mbuf(netdev, batch); stats->tx_failure_drops += pkt_cnt - cnt; pkt_cnt = cnt; diff --git a/lib/netdev.c b/lib/netdev.c index e5ac7713d2e..3ed8049f76a 100644 --- a/lib/netdev.c +++ b/lib/netdev.c @@ -35,6 +35,7 @@ #include "coverage.h" #include "dpif.h" #include "dp-packet.h" +#include "dp-packet-gso.h" #include "openvswitch/dynamic-string.h" #include "fatal-signal.h" #include "hash.h" @@ -56,6 +57,7 @@ #include "svec.h" #include "openvswitch/vlog.h" #include "flow.h" +#include "userspace-tso.h" #include "util.h" #ifdef __linux__ #include "tc.h" @@ -67,8 +69,9 @@ COVERAGE_DEFINE(netdev_received); COVERAGE_DEFINE(netdev_sent); COVERAGE_DEFINE(netdev_add_router); COVERAGE_DEFINE(netdev_get_stats); -COVERAGE_DEFINE(netdev_send_prepare_drops); COVERAGE_DEFINE(netdev_push_header_drops); +COVERAGE_DEFINE(netdev_soft_seg_good); +COVERAGE_DEFINE(netdev_soft_seg_drops); struct netdev_saved_flags { struct netdev *netdev; @@ -792,60 +795,84 @@ netdev_get_pt_mode(const struct netdev *netdev) : NETDEV_PT_LEGACY_L2); } -/* Check if a 'packet' is compatible with 'netdev_flags'. - * If a packet is incompatible, return 'false' with the 'errormsg' - * pointing to a reason. */ -static bool -netdev_send_prepare_packet(const uint64_t netdev_flags, - struct dp_packet *packet, char **errormsg) -{ - if (dp_packet_hwol_is_tso(packet) - && !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) { - /* Fall back to GSO in software. */ - VLOG_ERR_BUF(errormsg, "No TSO support"); - return false; - } - - /* Packet with IP csum offloading enabled was received with verified csum. - * Leave the IP csum offloading enabled even with good checksum to the - * netdev to decide what would be the best to do. - * Provide a software fallback in case the device doesn't support IP csum - * offloading. Note: Encapsulated packet must have the inner IP header - * csum already calculated. 
- * Packet with L4 csum offloading enabled was received with verified csum. - * Leave the L4 csum offloading enabled even with good checksum for the - * netdev to decide what would be the best to do. - * Netdev that requires pseudo header csum needs to calculate that. - * Provide a software fallback in case the netdev doesn't support L4 csum - * offloading. Note: Encapsulated packet must have the inner L4 header - * csum already calculated. */ - dp_packet_ol_send_prepare(packet, netdev_flags); - - return true; -} - -/* Check if each packet in 'batch' is compatible with 'netdev' features, - * otherwise either fall back to software implementation or drop it. */ -static void -netdev_send_prepare_batch(const struct netdev *netdev, - struct dp_packet_batch *batch) +/* Attempts to segment GSO flagged packets and send them as multiple bundles. + * This function is only used if at least one packet in the current batch is + * flagged for TSO and the netdev does not support this. + * + * The return value is 0 if all batches sent successfully, and an error code + * from netdev_class->send() if at least one batch failed to send. */ +static int +netdev_send_tso(struct netdev *netdev, int qid, + struct dp_packet_batch *batch, bool concurrent_txq) { + struct dp_packet_batch *batches; struct dp_packet *packet; - size_t i, size = dp_packet_batch_size(batch); - - DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) { - char *errormsg = NULL; + int retval = 0; + int n_packets; + int n_batches; + int error; - if (netdev_send_prepare_packet(netdev->ol_flags, packet, &errormsg)) { - dp_packet_batch_refill(batch, packet, i); + /* Calculate the total number of packets in the batch after + * the segmentation. 
*/ + n_packets = 0; + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { + if (dp_packet_hwol_is_tso(packet)) { + n_packets += dp_packet_gso_nr_segs(packet); } else { + n_packets++; + } + } + + if (!n_packets) { + return 0; + } + + /* Allocate enough batches to store all the packets in order. */ + n_batches = DIV_ROUND_UP(n_packets, NETDEV_MAX_BURST); + batches = xmalloc(n_batches * sizeof *batches); + + struct dp_packet_batch *curr_batch = batches; + struct dp_packet_batch *last_batch = &batches[n_batches - 1]; + for (curr_batch = batches; curr_batch <= last_batch; curr_batch++) { + dp_packet_batch_init(curr_batch); + } + + /* Do the packet segmentation if TSO is flagged. */ + size_t size = dp_packet_batch_size(batch); + size_t k; + curr_batch = batches; + DP_PACKET_BATCH_REFILL_FOR_EACH (k, size, packet, batch) { + if (dp_packet_hwol_is_tso(packet)) { + if (dp_packet_gso(packet, &curr_batch)) { + COVERAGE_INC(netdev_soft_seg_good); + } else { + COVERAGE_INC(netdev_soft_seg_drops); + } dp_packet_delete(packet); - COVERAGE_INC(netdev_send_prepare_drops); - VLOG_WARN_RL(&rl, "%s: Packet dropped: %s", - netdev_get_name(netdev), errormsg); - free(errormsg); + } else { + if (dp_packet_batch_is_full(curr_batch)) { + curr_batch++; + } + + dp_packet_batch_add(curr_batch, packet); + } + } + + for (curr_batch = batches; curr_batch <= last_batch; curr_batch++) { + DP_PACKET_BATCH_FOR_EACH (i, packet, curr_batch) { + dp_packet_ol_send_prepare(packet, netdev->ol_flags); + } + + error = netdev->netdev_class->send(netdev, qid, curr_batch, + concurrent_txq); + if (!error) { + COVERAGE_INC(netdev_sent); + } else { + retval = error; } } + free(batches); + return retval; } /* Sends 'batch' on 'netdev'. 
Returns 0 if successful (for every packet), @@ -877,11 +904,21 @@ int netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch, bool concurrent_txq) { + const uint64_t netdev_flags = netdev->ol_flags; + struct dp_packet *packet; int error; - netdev_send_prepare_batch(netdev, batch); - if (OVS_UNLIKELY(dp_packet_batch_is_empty(batch))) { - return 0; + if (userspace_tso_enabled() && + !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) { + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { + if (dp_packet_hwol_is_tso(packet)) { + return netdev_send_tso(netdev, qid, batch, concurrent_txq); + } + } + } + + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { + dp_packet_ol_send_prepare(packet, netdev_flags); } error = netdev->netdev_class->send(netdev, qid, batch, concurrent_txq); diff --git a/lib/packets.c b/lib/packets.c index 462b51f92dc..dab823ba225 100644 --- a/lib/packets.c +++ b/lib/packets.c @@ -427,7 +427,7 @@ add_mpls(struct dp_packet *packet, ovs_be16 ethtype, ovs_be32 lse, } if (!l3_encap) { - struct mpls_hdr *header = dp_packet_push_uninit(packet, MPLS_HLEN); + struct mpls_hdr *header = dp_packet_resize_l2(packet, MPLS_HLEN); put_16aligned_be32(&header->mpls_lse, lse); packet->l2_5_ofs = 0; @@ -513,7 +513,7 @@ push_nsh(struct dp_packet *packet, const struct nsh_hdr *nsh_hdr_src) OVS_NOT_REACHED(); } - nsh = (struct nsh_hdr *) dp_packet_push_uninit(packet, length); + nsh = (struct nsh_hdr *) dp_packet_resize_l2(packet, length); memcpy(nsh, nsh_hdr_src, length); nsh->next_proto = next_proto; packet->packet_type = htonl(PT_NSH); diff --git a/tests/system-traffic.at b/tests/system-traffic.at index a7d4ed83bdc..a37a694c52f 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -2111,6 +2111,51 @@ recirc_id(),in_port(3),eth_type(0x0800),ipv4(frag=no), packets:29, bytes OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([datapath - netdev offload software fallback]) +AT_SKIP_IF([test $HAVE_NC = no]) +OVS_TRAFFIC_VSWITCHD_START() + +AT_CHECK([ovs-ofctl 
add-flow br0 "actions=normal"]) + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +dnl Test the case where only one side has all checksum and tso offload disabled. +AT_CHECK([ethtool -K ovs-p1 tso off], [0], [ignore], [ignore]) +AT_CHECK([ethtool -K ovs-p1 sg off], [0], [ignore], [ignore]) + +dnl Reinitialize. +AT_CHECK([ovs-vsctl del-port ovs-p1]) +AT_CHECK([ovs-vsctl add-port br0 ovs-p1]) + +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +NS_CHECK_EXEC([at_ns1], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.1 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +NETNS_DAEMONIZE([at_ns0], [nc -l 1234 > data_0], [nc1.pid]) +NETNS_DAEMONIZE([at_ns1], [nc -l 1234 > data_1], [nc2.pid]) + +AT_CHECK([dd if=/dev/urandom of=payload.bin bs=60000 count=1 2> /dev/null]) +on_exit 'rm -f payload.bin' + +NS_CHECK_EXEC([at_ns0], [nc $NC_EOF_OPT 10.1.1.2 1234 < payload.bin]) +NS_CHECK_EXEC([at_ns1], [nc $NC_EOF_OPT 10.1.1.1 1234 < payload.bin]) + +dnl Wait until transfer completes. +OVS_WAIT_WHILE([kill -0 $(cat nc1.pid) $(cat nc2.pid)]) + +AT_CHECK([diff -q payload.bin data_0], [0]) +AT_CHECK([diff -q payload.bin data_1], [0]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + AT_BANNER([MPLS]) AT_SETUP([mpls - encap header dp-support]) From 20022fbf5a3845e5b80788a6905bea9ab0ec40c7 Mon Sep 17 00:00:00 2001 From: Jakob Meng Date: Fri, 27 Oct 2023 14:10:40 +0200 Subject: [PATCH 458/833] editorconfig: Remove [*] section and trim_trailing_whitespace. Wildcard sections [*] and [**] are unsafe because properties cannot be applied safely to any filetype in general. For example, IDEs like Visual Studio Code and KDevelop store configuration files in subfolders like .vscode or .kdev4. Properties from wildcard sections also apply to those files which is not safe in general. 
Another example is patches created with 'git format-patch', which can contain trailing whitespaces. When editing a patch, e.g. to fix a typo in the title, trailing whitespaces should not be removed. Property trim_trailing_whitespace should not be defined at all because it is interpreted differently by editors. Some wipe whitespaces from the whole file, others remove them from edited lines only and a few change their behavior between releases [0]. Limiting the property to a subset of files like *.c/*.h will not mitigate the issue: Multiple definitions of a whitespace exist. Unicode considers a form feed (0x0C) to be a whitespace [1]. QChar::isSpace() [2] from Qt follows this definition, causing the Kate editor to identify a form feed as a trailing whitespace and remove it from sources [3]. When editors remove form feeds, the resulting patches are broken and cannot be applied cleanly. Removing trim_trailing_whitespace will be a minor inconvenience, in particular because utilities/checkpatch.py and thus 0-day Robot will prevent trailing whitespaces for our definition of a whitespace.
[0] https://github.com/KDE/ktexteditor/commit/94b328fc64e543d91930845d2a96ce08d3043295 [1] https://en.wikipedia.org/wiki/Whitespace_character [2] https://github.com/qt/qtbase/blob/5628600a07295db6ed6683e97fafb0c45ddea505/src/corelib/text/qchar.h#L554 [3] https://github.com/KDE/ktexteditor/blob/10210ec1dd06afa1e7b19a4fff722a8a23719161/src/document/katedocument.cpp#L5643 Fixes: 07f6d6a0cb51 ("Add editorconfig file.") Signed-off-by: Jakob Meng Acked-by: Eelco Chaudron Reviewed-by: Robin-Jarry Signed-off-by: Aaron Conole --- .editorconfig | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/.editorconfig b/.editorconfig index 685c7275005..41ba51bf305 100644 --- a/.editorconfig +++ b/.editorconfig @@ -2,15 +2,18 @@ root = true -[*] -end_of_line = lf -insert_final_newline = true -trim_trailing_whitespace = true -charset = utf-8 +# No wildcard sections [*] and [**] because properties cannot be +# applied safely to any filetype in general. + +# Property trim_trailing_whitespace should not be defined at all +# because it is interpreted differently by editors. [*.{c,h}] +charset = utf-8 +end_of_line = lf indent_style = space indent_size = 4 +insert_final_newline = true max_line_length = 79 [include/linux/**.h] From a34e306a0db44bacb824691e53dff9c56e83421a Mon Sep 17 00:00:00 2001 From: Ales Musil Date: Thu, 30 Nov 2023 08:31:07 +0100 Subject: [PATCH 459/833] ofp-ct: Return error for unknown property in CT flush. CT flush extension would silently ignore unknown properties, which could lead to potential surprise by deleting more than it was requested to. Return error on unknown property instead to avoid this problem and at the same time inform the user that the specified property is not supported. 
Fixes: 08146bf7d9b4 ("openflow: Add extension to flush CT by generic match.") Signed-off-by: Ales Musil Signed-off-by: Ilya Maximets --- lib/ofp-ct.c | 11 +++++++++++ tests/ofp-print.at | 18 ++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/lib/ofp-ct.c b/lib/ofp-ct.c index 85a9d8beca7..a140fba4702 100644 --- a/lib/ofp-ct.c +++ b/lib/ofp-ct.c @@ -31,6 +31,9 @@ #include "openvswitch/ofp-prop.h" #include "openvswitch/ofp-util.h" #include "openvswitch/packets.h" +#include "openvswitch/vlog.h" + +VLOG_DEFINE_THIS_MODULE(ofp_ct); static void ofp_ct_tuple_format(struct ds *ds, const struct ofp_ct_tuple *tuple, @@ -286,6 +289,10 @@ ofp_ct_tuple_decode_nested(struct ofpbuf *property, struct ofp_ct_tuple *tuple, case NXT_CT_TUPLE_ICMP_CODE: error = ofpprop_parse_u8(&inner, &tuple->icmp_code); break; + + default: + error = OFPPROP_UNKNOWN(false, "NXT_CT_TUPLE", type); + break; } if (error) { @@ -377,6 +384,10 @@ ofp_ct_match_decode(struct ofp_ct_match *match, bool *with_zone, } error = ofpprop_parse_u16(&property, zone_id); break; + + default: + error = OFPPROP_UNKNOWN(false, "NXT_CT_FLUSH", type); + break; } if (error) { diff --git a/tests/ofp-print.at b/tests/ofp-print.at index 14aa5541694..6a07e23c645 100644 --- a/tests/ofp-print.at +++ b/tests/ofp-print.at @@ -4180,4 +4180,22 @@ AT_CHECK([ovs-ofctl ofp-print "\ 00 01 00 20 00 00 00 00 \ 00 00 00 14 00 00 00 00 00 00 00 00 00 00 ff ff 0a 0a 00 02 00 00 00 00 \ " | grep -q OFPBPC_BAD_VALUE], [0]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 20 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 80 00 08 00 00 00 00 \ +"| grep -q OFPBPC_BAD_TYPE], [0], [ignore], [stderr]) +AT_CHECK([grep -q "unknown NXT_CT_FLUSH property type 128" stderr], [0]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 28 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 00 00 10 00 00 00 00 \ +00 80 00 08 00 50 00 00 \ +"| grep -q OFPBPC_BAD_TYPE], [0], [ignore], [stderr]) +AT_CHECK([grep -q 
"unknown NXT_CT_TUPLE property type 128" stderr], [0]) + AT_CLEANUP From 472dd66423aae651f2c453e6f6e785b954386cea Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Sat, 2 Dec 2023 00:06:59 +0100 Subject: [PATCH 460/833] netdev-offload-tc: Fix offload of tunnel key tp_src. There is no TCA_TUNNEL_KEY_ENC_SRC_PORT in the kernel, so the offload should not be attempted if OVS_TUNNEL_KEY_ATTR_TP_SRC is requested by OVS. Current code just ignores the attribute in the tunnel(set()) action leading to a flow mismatch and potential incorrect datapath behavior: |tc(handler21)|DBG|tc flower compare failed action compare ... Action 0 mismatch: - Expected Action: 00000010 01 00 00 00 00 00 00 00-00 00 00 00 00 ff 00 11 00000020 c0 5b 17 c1 00 40 00 00-0a 01 00 6d 0a 01 01 12 00000050 08 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00000060 01 02 80 01 00 18 00 0b-00 00 00 00 00 00 00 00 ... - Received Action: 00000010 01 00 00 00 00 00 00 00-00 00 00 00 00 ff 00 11 00000020 00 00 17 c1 00 40 00 00-0a 01 00 6d 0a 01 01 12 00000050 08 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00000060 01 02 80 01 00 18 00 0b-00 00 00 00 00 00 00 00 ... In the tc_action dump above we can see the difference on the second line. The action dumped from a kernel is missing 'c0 5b' - source port for a tunnel(set()) action on the second line. Removing the field from the tc_action_encap structure entirely to avoid any potential confusion. Note: In general, the source port number in the tunnel(set()) action is not particularly useful for most tunnels, because they may just ignore the value. Specs for Geneve and VXLAN suggest using a value based on the headers of the inner packet as a source port. In vast majority of scenarios the source port doesn't actually end up in the action itself. Having a mismatch between the userspace and TC leads to constant revalidation of the flow and warnings in the log. Adding a test case that demonstrates a scenario where the issue occurs - bridging of two tunnels. 
Fixes: 8f283af89298 ("netdev-tc-offloads: Implement netdev flow put using tc interface") Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2023-October/052744.html Reported-by: Vladislav Odintsov Tested-by: Vladislav Odintsov Acked-by: Eelco Chaudron Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Ilya Maximets --- lib/netdev-offload-tc.c | 4 ++- lib/tc.h | 3 +- tests/system-traffic.at | 77 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 82 insertions(+), 2 deletions(-) diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c index b846a63c222..164c7eef63e 100644 --- a/lib/netdev-offload-tc.c +++ b/lib/netdev-offload-tc.c @@ -1627,7 +1627,9 @@ parse_put_flow_set_action(struct tc_flower *flower, struct tc_action *action, } break; case OVS_TUNNEL_KEY_ATTR_TP_SRC: { - action->encap.tp_src = nl_attr_get_be16(tun_attr); + /* There is no corresponding attribute in TC. */ + VLOG_DBG_RL(&rl, "unsupported tunnel key attribute TP_SRC"); + return EOPNOTSUPP; } break; case OVS_TUNNEL_KEY_ATTR_TP_DST: { diff --git a/lib/tc.h b/lib/tc.h index 06707ffa467..fdbcf4b7cb2 100644 --- a/lib/tc.h +++ b/lib/tc.h @@ -213,7 +213,8 @@ enum nat_type { struct tc_action_encap { bool id_present; ovs_be64 id; - ovs_be16 tp_src; + /* ovs_be16 tp_src; Could have been here, but there is no + * TCA_TUNNEL_KEY_ENC_ attribute for it in the kernel. 
*/ ovs_be16 tp_dst; uint8_t tos; uint8_t ttl; diff --git a/tests/system-traffic.at b/tests/system-traffic.at index a37a694c52f..3f665e33919 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -903,6 +903,83 @@ ovs-pcap p0.pcap AT_CHECK([ovs-pcap p0.pcap | grep -Eq "^[[[:xdigit:]]]{24}86dd60000000003a1140fc000000000000000000000000000100fc000000000000000000000000000001[[[:xdigit:]]]{4}17c1003a[[[:xdigit:]]]{4}0000655800000000fffffffffffffa163e949d8008060001080006040001[[[:xdigit:]]]{12}0a0000f40000000000000a0000fe$"]) AT_CLEANUP +AT_SETUP([datapath - bridging two geneve tunnels]) +OVS_CHECK_TUNNEL_TSO() +OVS_CHECK_GENEVE() + +OVS_TRAFFIC_VSWITCHD_START() +ADD_BR([br-underlay-0]) +ADD_BR([br-underlay-1]) + +ADD_NAMESPACES(at_ns0) +ADD_NAMESPACES(at_ns1) + +dnl Set up underlay link from host into the namespaces using veth pairs. +ADD_VETH(p0, at_ns0, br-underlay-0, "172.31.1.1/24") +AT_CHECK([ip addr add dev br-underlay-0 "172.31.1.100/24"]) +AT_CHECK([ip link set dev br-underlay-0 up]) + +ADD_VETH(p1, at_ns1, br-underlay-1, "172.31.2.1/24") +AT_CHECK([ip addr add dev br-underlay-1 "172.31.2.100/24"]) +AT_CHECK([ip link set dev br-underlay-1 up]) + +dnl Set up two OVS tunnel endpoints in a root namespace and two native +dnl linux devices inside the test namespaces. 
+dnl +dnl ns_gnv0 | ns_gnv1 +dnl ip: 10.1.1.1/24 | ip: 10.1.1.2/24 +dnl remote_ip: 172.31.1.100 | remote_ip: 172.31.2.100 +dnl | | | +dnl | | | +dnl p0 | p1 +dnl ip: 172.31.1.1/24 | ip: 172.31.2.1/24 +dnl | NS0 | NS1 | +dnl ---------|------------------------+------------------|-------------------- +dnl | | +dnl br-underlay-0: br-underlay-1: +dnl ip: 172.31.1.100/24 ip: 172.31.2.100/24 +dnl ovs-p0 ovs-p1 +dnl | | +dnl | br0 | +dnl encap/decap --- ip: 10.1.1.100/24 --------- encap/decap +dnl at_gnv0 +dnl remote_ip: 172.31.1.1 +dnl at_gnv1 +dnl remote_ip: 172.31.2.1 +dnl +ADD_OVS_TUNNEL([geneve], [br0], [at_gnv0], [172.31.1.1], [10.1.1.100/24]) +ADD_NATIVE_TUNNEL([geneve], [ns_gnv0], [at_ns0], [172.31.1.100], [10.1.1.1/24], + [vni 0]) +ADD_OVS_TUNNEL([geneve], [br0], [at_gnv1], [172.31.2.1], [10.1.1.101/24]) +ADD_NATIVE_TUNNEL([geneve], [ns_gnv1], [at_ns1], [172.31.2.100], [10.1.1.2/24], + [vni 0]) + +AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"]) +AT_CHECK([ovs-ofctl add-flow br-underlay-0 "actions=normal"]) +AT_CHECK([ovs-ofctl add-flow br-underlay-1 "actions=normal"]) + +dnl First, check both underlays. +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) +NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -W 2 172.31.2.100 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Now, check the overlay with different packet sizes. 
+NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([datapath - ping over gre tunnel by simulated packets]) OVS_CHECK_TUNNEL_TSO() OVS_CHECK_MIN_KERNEL(3, 10) From 6b172358852e8aa265e2078249406ea46c9bcf36 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 1 Dec 2023 22:04:12 +0100 Subject: [PATCH 461/833] tunnel: Do not carry source port from a previous tunnel. If a packet is received from a UDP tunnel, it has a source port populated in the tunnel metadata. This field cannot be read or changed with OpenFlow or the tunnel configuration. However, while sending this packet to a different tunnel, the value remains in the metadata and is being sent to the datapath to use as a source port for this new tunnel. Tunnel implementations largely ignore this value, and it is a random value from a different tunnel anyway. Clear it while sending to a different tunnel, so the unnecessary information is not being passed to the datapath. This additionally allows traffic from one tunnel to another to be offloaded to TC, as TC doesn't allow setting the source port at all.
Tested-by: Vladislav Odintsov Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- ofproto/tunnel.c | 1 + tests/tunnel.at | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/ofproto/tunnel.c b/ofproto/tunnel.c index 3455ed233b2..80ddee78acf 100644 --- a/ofproto/tunnel.c +++ b/ofproto/tunnel.c @@ -432,6 +432,7 @@ tnl_port_send(const struct ofport_dpif *ofport, struct flow *flow, flow->tunnel.ipv6_dst = in6addr_any; } } + flow->tunnel.tp_src = 0; /* Do not carry from a previous tunnel. */ flow->tunnel.tp_dst = cfg->dst_port; if (!cfg->out_key_flow) { flow->tunnel.tun_id = cfg->out_key; diff --git a/tests/tunnel.at b/tests/tunnel.at index 05613bcc343..282651ac732 100644 --- a/tests/tunnel.at +++ b/tests/tunnel.at @@ -333,6 +333,50 @@ set(tunnel(tun_id=0x5,dst=4.4.4.4,ttl=64,flags(df|key))),1 OVS_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([tunnel - set_tunnel VXLAN]) +OVS_VSWITCHD_START([dnl + add-port br0 p1 -- set Interface p1 type=vxlan options:key=flow \ + options:remote_ip=1.1.1.1 ofport_request=1 \ + -- add-port br0 p2 -- set Interface p2 type=vxlan options:key=flow \ + options:remote_ip=2.2.2.2 ofport_request=2 \ + -- add-port br0 p3 -- set Interface p3 type=vxlan options:key=flow \ + options:remote_ip=3.3.3.3 ofport_request=3 \ + -- add-port br0 p4 -- set Interface p4 type=vxlan options:key=flow \ + options:remote_ip=4.4.4.4 ofport_request=4]) +AT_DATA([flows.txt], [dnl +actions=set_tunnel:1,output:1,set_tunnel:2,output:2,set_tunnel:3,output:3,set_tunnel:5,output:4 +]) + +OVS_VSWITCHD_DISABLE_TUNNEL_PUSH_POP +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +AT_CHECK([ovs-appctl dpif/show | tail -n +3], [0], [dnl + br0 65534/100: (dummy-internal) + p1 1/4789: (vxlan: key=flow, remote_ip=1.1.1.1) + p2 2/4789: (vxlan: key=flow, remote_ip=2.2.2.2) + p3 3/4789: (vxlan: key=flow, remote_ip=3.3.3.3) + p4 4/4789: (vxlan: key=flow, remote_ip=4.4.4.4) +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 
'in_port(100),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=1,tos=0,ttl=128,frag=no),icmp(type=8,code=0)'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], [Datapath actions: dnl +set(tunnel(tun_id=0x1,dst=1.1.1.1,ttl=64,tp_dst=4789,flags(df|key))),4789,dnl +set(tunnel(tun_id=0x2,dst=2.2.2.2,ttl=64,tp_dst=4789,flags(df|key))),4789,dnl +set(tunnel(tun_id=0x3,dst=3.3.3.3,ttl=64,tp_dst=4789,flags(df|key))),4789,dnl +set(tunnel(tun_id=0x5,dst=4.4.4.4,ttl=64,tp_dst=4789,flags(df|key))),4789 +]) + +dnl With pre-existing tunnel metadata. +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'tunnel(tun_id=0x1,src=1.1.1.1,dst=5.5.5.5,tp_src=12345,tp_dst=4789,ttl=64,flags(key)),in_port(4789),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=1,tos=0,ttl=128,frag=no),icmp(type=8,code=0)'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], [Datapath actions: dnl +set(tunnel(tun_id=0x2,dst=2.2.2.2,ttl=64,tp_dst=4789,flags(df|key))),4789,dnl +set(tunnel(tun_id=0x3,dst=3.3.3.3,ttl=64,tp_dst=4789,flags(df|key))),4789,dnl +set(tunnel(tun_id=0x5,dst=4.4.4.4,ttl=64,tp_dst=4789,flags(df|key))),4789 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([tunnel - key]) OVS_VSWITCHD_START([dnl add-port br0 p1 -- set Interface p1 type=gre options:key=1 \ From 4b9eb061b1e47b6d578505bbab00caca78b04bd7 Mon Sep 17 00:00:00 2001 From: Ales Musil Date: Mon, 4 Dec 2023 06:49:08 +0100 Subject: [PATCH 462/833] ct-dpif: Handle default zone limit the same way as other limits. Internally handle default CT zone limit as other limits that can be passed via the list with special value -1. Currently, the -1 is treated by both datapaths as default, add static asserts to make sure that this remains the case in the future. This allows us to easily delete the default zone limit. 
Signed-off-by: Ales Musil Signed-off-by: Ilya Maximets --- lib/conntrack.c | 2 +- lib/conntrack.h | 7 +++++-- lib/ct-dpif.c | 28 +++++++++++++++------------- lib/ct-dpif.h | 14 ++++++-------- lib/dpctl.c | 15 ++++++++------- lib/dpif-netdev.c | 21 ++++++--------------- lib/dpif-netlink.c | 29 ++++++----------------------- lib/dpif-provider.h | 24 +++++++++++------------- 8 files changed, 58 insertions(+), 82 deletions(-) diff --git a/lib/conntrack.c b/lib/conntrack.c index 47a443fba4d..31f00a12748 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -398,7 +398,7 @@ zone_limit_clean(struct conntrack *ct, struct zone_limit *zl) } int -zone_limit_delete(struct conntrack *ct, uint16_t zone) +zone_limit_delete(struct conntrack *ct, int32_t zone) { ovs_mutex_lock(&ct->ct_lock); struct zone_limit *zl = zone_limit_lookup_protected(ct, zone); diff --git a/lib/conntrack.h b/lib/conntrack.h index 57d5159b61b..18c182f8501 100644 --- a/lib/conntrack.h +++ b/lib/conntrack.h @@ -122,11 +122,14 @@ struct timeout_policy { enum { INVALID_ZONE = -2, - DEFAULT_ZONE = -1, /* Default zone for zone limit management. */ + DEFAULT_ZONE = OVS_ZONE_LIMIT_DEFAULT_ZONE, /* Default zone for zone + * limit management. 
*/ MIN_ZONE = 0, MAX_ZONE = 0xFFFF, }; +BUILD_ASSERT_DECL(DEFAULT_ZONE > INVALID_ZONE && DEFAULT_ZONE < MIN_ZONE); + struct ct_dpif_entry; struct ct_dpif_tuple; @@ -154,6 +157,6 @@ struct ipf *conntrack_ipf_ctx(struct conntrack *ct); struct conntrack_zone_limit zone_limit_get(struct conntrack *ct, int32_t zone); int zone_limit_update(struct conntrack *ct, int32_t zone, uint32_t limit); -int zone_limit_delete(struct conntrack *ct, uint16_t zone); +int zone_limit_delete(struct conntrack *ct, int32_t zone); #endif /* conntrack.h */ diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c index f59c6e560dd..2ee04516450 100644 --- a/lib/ct-dpif.c +++ b/lib/ct-dpif.c @@ -398,23 +398,19 @@ ct_dpif_get_tcp_seq_chk(struct dpif *dpif, bool *enabled) } int -ct_dpif_set_limits(struct dpif *dpif, const uint32_t *default_limit, - const struct ovs_list *zone_limits) +ct_dpif_set_limits(struct dpif *dpif, const struct ovs_list *zone_limits) { return (dpif->dpif_class->ct_set_limits - ? dpif->dpif_class->ct_set_limits(dpif, default_limit, - zone_limits) + ? dpif->dpif_class->ct_set_limits(dpif, zone_limits) : EOPNOTSUPP); } int -ct_dpif_get_limits(struct dpif *dpif, uint32_t *default_limit, - const struct ovs_list *zone_limits_in, +ct_dpif_get_limits(struct dpif *dpif, const struct ovs_list *zone_limits_in, struct ovs_list *zone_limits_out) { return (dpif->dpif_class->ct_get_limits - ? dpif->dpif_class->ct_get_limits(dpif, default_limit, - zone_limits_in, + ? 
dpif->dpif_class->ct_get_limits(dpif, zone_limits_in, zone_limits_out) : EOPNOTSUPP); } @@ -854,7 +850,7 @@ ct_dpif_format_tcp_stat(struct ds * ds, int tcp_state, int conn_per_state) void -ct_dpif_push_zone_limit(struct ovs_list *zone_limits, uint16_t zone, +ct_dpif_push_zone_limit(struct ovs_list *zone_limits, int32_t zone, uint32_t limit, uint32_t count) { struct ct_dpif_zone_limit *zone_limit = xmalloc(sizeof *zone_limit); @@ -928,15 +924,21 @@ ct_dpif_parse_zone_limit_tuple(const char *s, uint16_t *pzone, } void -ct_dpif_format_zone_limits(uint32_t default_limit, - const struct ovs_list *zone_limits, struct ds *ds) +ct_dpif_format_zone_limits(const struct ovs_list *zone_limits, struct ds *ds) { struct ct_dpif_zone_limit *zone_limit; - ds_put_format(ds, "default limit=%"PRIu32, default_limit); + LIST_FOR_EACH (zone_limit, node, zone_limits) { + if (zone_limit->zone == OVS_ZONE_LIMIT_DEFAULT_ZONE) { + ds_put_format(ds, "default limit=%"PRIu32, zone_limit->limit); + } + } LIST_FOR_EACH (zone_limit, node, zone_limits) { - ds_put_format(ds, "\nzone=%"PRIu16, zone_limit->zone); + if (zone_limit->zone == OVS_ZONE_LIMIT_DEFAULT_ZONE) { + continue; + } + ds_put_format(ds, "\nzone=%"PRIu16, (uint16_t) zone_limit->zone); ds_put_format(ds, ",limit=%"PRIu32, zone_limit->limit); ds_put_format(ds, ",count=%"PRIu32, zone_limit->count); } diff --git a/lib/ct-dpif.h b/lib/ct-dpif.h index 0b728b52986..c8a7c155e3c 100644 --- a/lib/ct-dpif.h +++ b/lib/ct-dpif.h @@ -237,7 +237,7 @@ struct ct_dpif_dump_state { }; struct ct_dpif_zone_limit { - uint16_t zone; + int32_t zone; uint32_t limit; /* Limit on number of entries. */ uint32_t count; /* Current number of entries. 
*/ struct ovs_list node; @@ -307,10 +307,9 @@ int ct_dpif_get_maxconns(struct dpif *dpif, uint32_t *maxconns); int ct_dpif_get_nconns(struct dpif *dpif, uint32_t *nconns); int ct_dpif_set_tcp_seq_chk(struct dpif *dpif, bool enabled); int ct_dpif_get_tcp_seq_chk(struct dpif *dpif, bool *enabled); -int ct_dpif_set_limits(struct dpif *dpif, const uint32_t *default_limit, - const struct ovs_list *); -int ct_dpif_get_limits(struct dpif *dpif, uint32_t *default_limit, - const struct ovs_list *, struct ovs_list *); +int ct_dpif_set_limits(struct dpif *dpif, const struct ovs_list *); +int ct_dpif_get_limits(struct dpif *dpif, const struct ovs_list *, + struct ovs_list *); int ct_dpif_del_limits(struct dpif *dpif, const struct ovs_list *); int ct_dpif_sweep(struct dpif *, uint32_t *ms); int ct_dpif_ipf_set_enabled(struct dpif *, bool v6, bool enable); @@ -329,13 +328,12 @@ void ct_dpif_format_ipproto(struct ds *ds, uint16_t ipproto); void ct_dpif_format_tuple(struct ds *, const struct ct_dpif_tuple *); uint8_t ct_dpif_coalesce_tcp_state(uint8_t state); void ct_dpif_format_tcp_stat(struct ds *, int, int); -void ct_dpif_push_zone_limit(struct ovs_list *, uint16_t zone, uint32_t limit, +void ct_dpif_push_zone_limit(struct ovs_list *, int32_t zone, uint32_t limit, uint32_t count); void ct_dpif_free_zone_limits(struct ovs_list *); bool ct_dpif_parse_zone_limit_tuple(const char *s, uint16_t *pzone, uint32_t *plimit, struct ds *); -void ct_dpif_format_zone_limits(uint32_t default_limit, - const struct ovs_list *, struct ds *); +void ct_dpif_format_zone_limits(const struct ovs_list *, struct ds *); bool ct_dpif_set_timeout_policy_attr_by_name(struct ct_dpif_timeout_policy *tp, const char *key, uint32_t value); bool ct_dpif_timeout_policy_support_ipproto(uint8_t ipproto); diff --git a/lib/dpctl.c b/lib/dpctl.c index cd12625a160..76f21a530a5 100644 --- a/lib/dpctl.c +++ b/lib/dpctl.c @@ -2202,7 +2202,7 @@ dpctl_ct_set_limits(int argc, const char *argv[], struct dpif *dpif; struct ds 
ds = DS_EMPTY_INITIALIZER; int i = dp_arg_exists(argc, argv) ? 2 : 1; - uint32_t default_limit, *p_default_limit = NULL; + uint32_t default_limit; struct ovs_list zone_limits = OVS_LIST_INITIALIZER(&zone_limits); int error = opt_dpif_open(argc, argv, dpctl_p, INT_MAX, &dpif); @@ -2213,7 +2213,8 @@ dpctl_ct_set_limits(int argc, const char *argv[], /* Parse default limit */ if (!strncmp(argv[i], "default=", 8)) { if (ovs_scan(argv[i], "default=%"SCNu32, &default_limit)) { - p_default_limit = &default_limit; + ct_dpif_push_zone_limit(&zone_limits, OVS_ZONE_LIMIT_DEFAULT_ZONE, + default_limit, 0); i++; } else { ds_put_cstr(&ds, "invalid default limit"); @@ -2233,7 +2234,7 @@ dpctl_ct_set_limits(int argc, const char *argv[], ct_dpif_push_zone_limit(&zone_limits, zone, limit, 0); } - error = ct_dpif_set_limits(dpif, p_default_limit, &zone_limits); + error = ct_dpif_set_limits(dpif, &zone_limits); if (!error) { ct_dpif_free_zone_limits(&zone_limits); dpif_close(dpif); @@ -2322,7 +2323,6 @@ dpctl_ct_get_limits(int argc, const char *argv[], { struct dpif *dpif; struct ds ds = DS_EMPTY_INITIALIZER; - uint32_t default_limit; int i = dp_arg_exists(argc, argv) ? 
2 : 1; struct ovs_list list_query = OVS_LIST_INITIALIZER(&list_query); struct ovs_list list_reply = OVS_LIST_INITIALIZER(&list_reply); @@ -2333,16 +2333,17 @@ dpctl_ct_get_limits(int argc, const char *argv[], } if (argc > i) { + ct_dpif_push_zone_limit(&list_query, OVS_ZONE_LIMIT_DEFAULT_ZONE, + 0, 0); error = parse_ct_limit_zones(argv[i], &list_query, &ds); if (error) { goto error; } } - error = ct_dpif_get_limits(dpif, &default_limit, &list_query, - &list_reply); + error = ct_dpif_get_limits(dpif, &list_query, &list_reply); if (!error) { - ct_dpif_format_zone_limits(default_limit, &list_reply, &ds); + ct_dpif_format_zone_limits(&list_reply, &ds); dpctl_print(dpctl_p, "%s\n", ds_cstr(&ds)); goto out; } else { diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index b8f065d1d77..9a59a1b03c2 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -9450,17 +9450,10 @@ dpif_netdev_ct_get_sweep_interval(struct dpif *dpif, uint32_t *ms) static int dpif_netdev_ct_set_limits(struct dpif *dpif, - const uint32_t *default_limits, const struct ovs_list *zone_limits) { int err = 0; struct dp_netdev *dp = get_dp_netdev(dpif); - if (default_limits) { - err = zone_limit_update(dp->conntrack, DEFAULT_ZONE, *default_limits); - if (err != 0) { - return err; - } - } struct ct_dpif_zone_limit *zone_limit; LIST_FOR_EACH (zone_limit, node, zone_limits) { @@ -9475,20 +9468,12 @@ dpif_netdev_ct_set_limits(struct dpif *dpif, static int dpif_netdev_ct_get_limits(struct dpif *dpif, - uint32_t *default_limit, const struct ovs_list *zone_limits_request, struct ovs_list *zone_limits_reply) { struct dp_netdev *dp = get_dp_netdev(dpif); struct conntrack_zone_limit czl; - czl = zone_limit_get(dp->conntrack, DEFAULT_ZONE); - if (czl.zone == DEFAULT_ZONE) { - *default_limit = czl.limit; - } else { - return EINVAL; - } - if (!ovs_list_is_empty(zone_limits_request)) { struct ct_dpif_zone_limit *zone_limit; LIST_FOR_EACH (zone_limit, node, zone_limits_request) { @@ -9502,6 +9487,12 @@ 
dpif_netdev_ct_get_limits(struct dpif *dpif, } } } else { + czl = zone_limit_get(dp->conntrack, DEFAULT_ZONE); + if (czl.zone == DEFAULT_ZONE) { + ct_dpif_push_zone_limit(zone_limits_reply, DEFAULT_ZONE, + czl.limit, 0); + } + for (int z = MIN_ZONE; z <= MAX_ZONE; z++) { czl = zone_limit_get(dp->conntrack, z); if (czl.zone == z) { diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c index 9194971d379..84e2bd8eaf5 100644 --- a/lib/dpif-netlink.c +++ b/lib/dpif-netlink.c @@ -3360,7 +3360,6 @@ dpif_netlink_ct_flush(struct dpif *dpif OVS_UNUSED, const uint16_t *zone, static int dpif_netlink_ct_set_limits(struct dpif *dpif OVS_UNUSED, - const uint32_t *default_limits, const struct ovs_list *zone_limits) { if (ovs_ct_limit_family < 0) { @@ -3378,13 +3377,6 @@ dpif_netlink_ct_set_limits(struct dpif *dpif OVS_UNUSED, size_t opt_offset; opt_offset = nl_msg_start_nested(request, OVS_CT_LIMIT_ATTR_ZONE_LIMIT); - if (default_limits) { - struct ovs_zone_limit req_zone_limit = { - .zone_id = OVS_ZONE_LIMIT_DEFAULT_ZONE, - .limit = *default_limits, - }; - nl_msg_put(request, &req_zone_limit, sizeof req_zone_limit); - } if (!ovs_list_is_empty(zone_limits)) { struct ct_dpif_zone_limit *zone_limit; @@ -3406,7 +3398,6 @@ dpif_netlink_ct_set_limits(struct dpif *dpif OVS_UNUSED, static int dpif_netlink_zone_limits_from_ofpbuf(const struct ofpbuf *buf, - uint32_t *default_limit, struct ovs_list *zone_limits) { static const struct nl_policy ovs_ct_limit_policy[] = { @@ -3439,11 +3430,8 @@ dpif_netlink_zone_limits_from_ofpbuf(const struct ofpbuf *buf, nl_attr_get(attr[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]); while (rem >= sizeof *zone_limit) { - if (zone_limit->zone_id == OVS_ZONE_LIMIT_DEFAULT_ZONE) { - *default_limit = zone_limit->limit; - } else if (zone_limit->zone_id < OVS_ZONE_LIMIT_DEFAULT_ZONE || - zone_limit->zone_id > UINT16_MAX) { - } else { + if (zone_limit->zone_id >= OVS_ZONE_LIMIT_DEFAULT_ZONE && + zone_limit->zone_id <= UINT16_MAX) { ct_dpif_push_zone_limit(zone_limits, 
zone_limit->zone_id, zone_limit->limit, zone_limit->count); } @@ -3456,7 +3444,6 @@ dpif_netlink_zone_limits_from_ofpbuf(const struct ofpbuf *buf, static int dpif_netlink_ct_get_limits(struct dpif *dpif OVS_UNUSED, - uint32_t *default_limit, const struct ovs_list *zone_limits_request, struct ovs_list *zone_limits_reply) { @@ -3477,14 +3464,11 @@ dpif_netlink_ct_get_limits(struct dpif *dpif OVS_UNUSED, size_t opt_offset = nl_msg_start_nested(request, OVS_CT_LIMIT_ATTR_ZONE_LIMIT); - struct ovs_zone_limit req_zone_limit = { - .zone_id = OVS_ZONE_LIMIT_DEFAULT_ZONE, - }; - nl_msg_put(request, &req_zone_limit, sizeof req_zone_limit); - struct ct_dpif_zone_limit *zone_limit; LIST_FOR_EACH (zone_limit, node, zone_limits_request) { - req_zone_limit.zone_id = zone_limit->zone; + struct ovs_zone_limit req_zone_limit = { + .zone_id = zone_limit->zone, + }; nl_msg_put(request, &req_zone_limit, sizeof req_zone_limit); } @@ -3497,8 +3481,7 @@ dpif_netlink_ct_get_limits(struct dpif *dpif OVS_UNUSED, goto out; } - err = dpif_netlink_zone_limits_from_ofpbuf(reply, default_limit, - zone_limits_reply); + err = dpif_netlink_zone_limits_from_ofpbuf(reply, zone_limits_reply); out: ofpbuf_delete(request); diff --git a/lib/dpif-provider.h b/lib/dpif-provider.h index 1b822cb0754..520e21e68db 100644 --- a/lib/dpif-provider.h +++ b/lib/dpif-provider.h @@ -520,19 +520,17 @@ struct dpif_class { /* Sets the max connections allowed per zone according to 'zone_limits', * a list of 'struct ct_dpif_zone_limit' entries (the 'count' member - * is not used when setting limits). If 'default_limit' is not NULL, - * modifies the default limit to '*default_limit'. */ - int (*ct_set_limits)(struct dpif *, const uint32_t *default_limit, - const struct ovs_list *zone_limits); - - /* Looks up the default per zone limit and stores that in - * 'default_limit'. 
Look up the per zone limits for all zones in - * the 'zone_limits_in' list of 'struct ct_dpif_zone_limit' entries - * (the 'limit' and 'count' members are not used), and stores the - * reply that includes the zone, the per zone limit, and the number - * of connections in the zone into 'zone_limits_out' list. */ - int (*ct_get_limits)(struct dpif *, uint32_t *default_limit, - const struct ovs_list *zone_limits_in, + * is not used when setting limits). */ + int (*ct_set_limits)(struct dpif *, const struct ovs_list *zone_limits); + + /* Looks up the per zone limits for all zones in the 'zone_limits_in' list + * of 'struct ct_dpif_zone_limit' entries (the 'limit' and 'count' members + * are not used), and stores the reply that includes the zone, the per + * zone limit, and the number of connections in the zone into + * 'zone_limits_out' list. If the 'zone_limits_in' list is empty the + * report will contain all previously set zone limits and the default + * limit. Note: The default zone limit "count" is not used. */ + int (*ct_get_limits)(struct dpif *, const struct ovs_list *zone_limits_in, struct ovs_list *zone_limits_out); /* Deletes per zone limit of all zones specified in 'zone_limits', a From 8f4b86237bd5bff4579bb8af63b270be0f458c0a Mon Sep 17 00:00:00 2001 From: Ales Musil Date: Mon, 4 Dec 2023 06:49:09 +0100 Subject: [PATCH 463/833] dpctl: Allow the default CT zone limit to be deleted. Add optional argument to dpctl ct-del-limits called "default", which allows to remove the default limit making it effectively system default. Signed-off-by: Ales Musil Signed-off-by: Ilya Maximets --- NEWS | 2 ++ lib/conntrack.c | 12 +++++++----- lib/dpctl.c | 21 +++++++++++++++------ tests/system-traffic.at | 26 ++++++++++++++++++++++++++ 4 files changed, 50 insertions(+), 11 deletions(-) diff --git a/NEWS b/NEWS index 490e275da5a..a4a27d56104 100644 --- a/NEWS +++ b/NEWS @@ -15,6 +15,8 @@ Post-v3.2.0 a.k.a. 
'configured' values, can be found in the 'status' column of the Interface table, i.e. with 'ovs-vsctl get interface <..> status'. Reported names adjusted accordingly. + * Added support for removal of default CT zone limit, e.g. + "ovs-appctl dpctl/ct-del-limits default". - Userspace datapath: * Added support for Generic Segmentation Offloading for the cases where TSO is enabled but not supported by an egress interface (except for diff --git a/lib/conntrack.c b/lib/conntrack.c index 31f00a12748..71c470661f6 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -404,13 +404,15 @@ zone_limit_delete(struct conntrack *ct, int32_t zone) struct zone_limit *zl = zone_limit_lookup_protected(ct, zone); if (zl) { zone_limit_clean(ct, zl); - ovs_mutex_unlock(&ct->ct_lock); - VLOG_INFO("Deleted zone limit for zone %d", zone); - } else { - ovs_mutex_unlock(&ct->ct_lock); - VLOG_INFO("Attempted delete of non-existent zone limit: zone %d", + } + + if (zone != DEFAULT_ZONE) { + VLOG_INFO(zl ? "Deleted zone limit for zone %d" + : "Attempted delete of non-existent zone limit: zone %d", zone); } + + ovs_mutex_unlock(&ct->ct_lock); return 0; } diff --git a/lib/dpctl.c b/lib/dpctl.c index 76f21a530a5..a8c65474712 100644 --- a/lib/dpctl.c +++ b/lib/dpctl.c @@ -2291,14 +2291,23 @@ dpctl_ct_del_limits(int argc, const char *argv[], int i = dp_arg_exists(argc, argv) ? 2 : 1; struct ovs_list zone_limits = OVS_LIST_INITIALIZER(&zone_limits); - error = opt_dpif_open(argc, argv, dpctl_p, 3, &dpif); + error = opt_dpif_open(argc, argv, dpctl_p, 4, &dpif); if (error) { return error; } - error = parse_ct_limit_zones(argv[i], &zone_limits, &ds); - if (error) { - goto error; + /* Parse default limit. 
*/ + if (!strcmp(argv[i], "default")) { + ct_dpif_push_zone_limit(&zone_limits, OVS_ZONE_LIMIT_DEFAULT_ZONE, + 0, 0); + i++; + } + + if (argc > i) { + error = parse_ct_limit_zones(argv[i], &zone_limits, &ds); + if (error) { + goto error; + } } error = ct_dpif_del_limits(dpif, &zone_limits); @@ -3031,8 +3040,8 @@ static const struct dpctl_command all_commands[] = { { "ct-get-tcp-seq-chk", "[dp]", 0, 1, dpctl_ct_get_tcp_seq_chk, DP_RO }, { "ct-set-limits", "[dp] [default=L] [zone=N,limit=L]...", 1, INT_MAX, dpctl_ct_set_limits, DP_RO }, - { "ct-del-limits", "[dp] zone=N1[,N2]...", 1, 2, dpctl_ct_del_limits, - DP_RO }, + { "ct-del-limits", "[dp] [default] [zone=N1[,N2]...]", 1, 3, + dpctl_ct_del_limits, DP_RO }, { "ct-get-limits", "[dp] [zone=N1[,N2]...]", 0, 2, dpctl_ct_get_limits, DP_RO }, { "ct-get-sweep-interval", "[dp]", 0, 1, dpctl_ct_get_sweep, DP_RO }, diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 3f665e33919..498ac8888d8 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -5317,6 +5317,32 @@ udp,orig=(src=10.1.1.3,dst=10.1.1.4,sport=1,dport=3),reply=(src=10.1.1.4,dst=10. udp,orig=(src=10.1.1.3,dst=10.1.1.4,sport=1,dport=4),reply=(src=10.1.1.4,dst=10.1.1.3,sport=4,dport=1),zone=3 ]) +dnl Test ct-del-limits for default zone. 
+ +AT_CHECK([ovs-appctl dpctl/ct-set-limits default=15 zone=4,limit=4]) +AT_CHECK([ovs-appctl dpctl/ct-get-limits zone=4], [0], [dnl +default limit=15 +zone=4,limit=4,count=0 +]) + +AT_CHECK([ovs-appctl dpctl/ct-del-limits default]) +AT_CHECK([ovs-appctl dpctl/ct-get-limits zone=4], [0], [dnl +default limit=0 +zone=4,limit=4,count=0 +]) + +AT_CHECK([ovs-appctl dpctl/ct-set-limits default=15]) +AT_CHECK([ovs-appctl dpctl/ct-get-limits zone=4], [0], [dnl +default limit=15 +zone=4,limit=4,count=0 +]) + +AT_CHECK([ovs-appctl dpctl/ct-del-limits default zone=4]) +AT_CHECK([ovs-appctl dpctl/ct-get-limits zone=4], [0], [dnl +default limit=0 +zone=4,limit=0,count=0 +]) + OVS_TRAFFIC_VSWITCHD_STOP(["dnl /could not create datapath/d /(Cannot allocate memory) on packet/d"]) From 32488336190047029ee2b2833d85f91b8134252e Mon Sep 17 00:00:00 2001 From: Ales Musil Date: Mon, 4 Dec 2023 06:49:10 +0100 Subject: [PATCH 464/833] ovs-vsctl: Add limit to CT zone. Add limit to the CT zone DB table with ovs-vsctl helper methods. The limit has two special values besides any number, 0 is unlimited and empty limit is to leave the value untouched in the datapath. This is preparation step and the value is not yet propagated to the datapath. Signed-off-by: Ales Musil Signed-off-by: Ilya Maximets --- NEWS | 5 ++ tests/ovs-vsctl.at | 88 ++++++++++++++++++++++- utilities/ovs-vsctl.8.in | 31 ++++++-- utilities/ovs-vsctl.c | 140 +++++++++++++++++++++++++++++++++++-- vswitchd/vswitch.ovsschema | 14 +++- vswitchd/vswitch.xml | 14 ++++ 6 files changed, 276 insertions(+), 16 deletions(-) diff --git a/NEWS b/NEWS index a4a27d56104..63f2842ae8f 100644 --- a/NEWS +++ b/NEWS @@ -17,6 +17,11 @@ Post-v3.2.0 Reported names adjusted accordingly. * Added support for removal of default CT zone limit, e.g. "ovs-appctl dpctl/ct-del-limits default". 
+ - ovs-vsctl: + * New commands 'set-zone-limit', 'del-zone-limit' and 'list-zone-limits' + to manage the maximum number of connections in conntrack zones via + a new 'limit' column in the 'CT_Zone' database table and + 'ct_zone_default_limit' column in the 'Datapath' table. - Userspace datapath: * Added support for Generic Segmentation Offloading for the cases where TSO is enabled but not supported by an egress interface (except for diff --git a/tests/ovs-vsctl.at b/tests/ovs-vsctl.at index a368bff6ede..febb9dadf19 100644 --- a/tests/ovs-vsctl.at +++ b/tests/ovs-vsctl.at @@ -975,6 +975,67 @@ AT_CHECK( [0], [stdout]) AT_CHECK([RUN_OVS_VSCTL([list-zone-tp netdev])], [0], [Zone:10, Timeout Policies: system default ]) +AT_CHECK([RUN_OVS_VSCTL([--if-exists del-zone-tp netdev zone=10])]) + +AT_CHECK([RUN_OVS_VSCTL([list-zone-limits netdev])]) + +AT_CHECK([RUN_OVS_VSCTL([set-zone-limit netdev 1 1])]) +AT_CHECK([RUN_OVS_VSCTL([list-zone-limits netdev])], [0], [dnl +Zone: 1, Limit: 1 +]) + +AT_CHECK([RUN_OVS_VSCTL([set-zone-limit netdev 1 5])]) +AT_CHECK([RUN_OVS_VSCTL([list-zone-limits netdev])], [0], [dnl +Zone: 1, Limit: 5 +]) + +AT_CHECK([RUN_OVS_VSCTL([del-zone-limit netdev 1])]) +AT_CHECK([RUN_OVS_VSCTL([list-zone-limits netdev])]) + +AT_CHECK([RUN_OVS_VSCTL([set-zone-limit netdev 10 5])]) +AT_CHECK([RUN_OVS_VSCTL([list-zone-limits netdev])], [0], [dnl +Zone: 10, Limit: 5 +]) + +AT_CHECK([RUN_OVS_VSCTL([add-zone-tp netdev zone=10 icmp_first=1 icmp_reply=2])]) +AT_CHECK([RUN_OVS_VSCTL([list-zone-tp netdev])], [0], [dnl +Zone:10, Timeout Policies: icmp_first=1 icmp_reply=2 +]) + +AT_CHECK([RUN_OVS_VSCTL([del-zone-limit netdev 10])]) +AT_CHECK([RUN_OVS_VSCTL([list-zone-limits netdev])]) +AT_CHECK([RUN_OVS_VSCTL([list-zone-tp netdev])], [0], [dnl +Zone:10, Timeout Policies: icmp_first=1 icmp_reply=2 +]) + +AT_CHECK([RUN_OVS_VSCTL([set-zone-limit netdev 10 5])]) +AT_CHECK([RUN_OVS_VSCTL([del-zone-tp netdev zone=10])]) +AT_CHECK([RUN_OVS_VSCTL([list-zone-limits netdev])], 
[0], [dnl +Zone: 10, Limit: 5 +]) +AT_CHECK([RUN_OVS_VSCTL([list-zone-tp netdev])], [0], [dnl +Zone:10, Timeout Policies: system default +]) + +AT_CHECK([RUN_OVS_VSCTL([set-zone-limit netdev default 5])]) +AT_CHECK([RUN_OVS_VSCTL([list-zone-limits netdev])], [0], [dnl +Default, Limit: 5 +Zone: 10, Limit: 5 +]) + +AT_CHECK([RUN_OVS_VSCTL([set-zone-limit netdev default 10])]) +AT_CHECK([RUN_OVS_VSCTL([list-zone-limits netdev])], [0], [dnl +Default, Limit: 10 +Zone: 10, Limit: 5 +]) + +AT_CHECK([RUN_OVS_VSCTL([del-zone-limit netdev default])]) +AT_CHECK([RUN_OVS_VSCTL([list-zone-limits netdev])], [0], [dnl +Zone: 10, Limit: 5 +]) + +AT_CHECK([RUN_OVS_VSCTL([--if-exists del-zone-limit netdev default])]) + AT_CHECK([RUN_OVS_VSCTL([-- --id=@m create Datapath datapath_version=0 'capabilities={recirc=true}' -- set Open_vSwitch . datapaths:"system"=@m])], [0], [stdout]) AT_CHECK([RUN_OVS_VSCTL([list-dp-cap system])], [0], [recirc=true @@ -1113,16 +1174,39 @@ AT_CHECK([RUN_OVS_VSCTL([add-zone-tp netdevxx zone=1 icmp_first=1 icmp_reply=2]) ]) AT_CHECK([RUN_OVS_VSCTL([add-zone-tp netdev zone=2 icmp_first=2 icmp_reply=3])]) AT_CHECK([RUN_OVS_VSCTL([add-zone-tp netdev zone=2 icmp_first=2 icmp_reply=3])], - [1], [], [ovs-vsctl: zone id 2 already exists + [1], [], [ovs-vsctl: zone id 2 already has a policy ]) AT_CHECK([RUN_OVS_VSCTL([list-zone-tp netdev])], [0], [Zone:2, Timeout Policies: icmp_first=2 icmp_reply=3 ]) AT_CHECK([RUN_OVS_VSCTL([del-zone-tp netdev zone=11])], - [1], [], [ovs-vsctl: zone id 11 does not exist + [1], [], [ovs-vsctl: zone id 11 does not have a policy ]) AT_CHECK([RUN_OVS_VSCTL([list-zone-tp netdev])], [0], [Zone:2, Timeout Policies: icmp_first=2 icmp_reply=3 ]) +AT_CHECK([RUN_OVS_VSCTL([set-zone-limit netdevxx 5 1])], + [1], [], [ovs-vsctl: datapath netdevxx does not exist +]) +AT_CHECK([RUN_OVS_VSCTL([set-zone-limit netdev 88888 1])], + [1], [], [ovs-vsctl: zone_id (88888) out of range +]) +AT_CHECK([RUN_OVS_VSCTL([set-zone-limit netdev 5 -1])], + [1], 
[], [ovs-vsctl: limit (-1) out of range +]) +AT_CHECK([RUN_OVS_VSCTL([del-zone-limit netdev 10])], + [1], [], [ovs-vsctl: zone_id 10 does not have a limit +]) + +AT_CHECK([RUN_OVS_VSCTL([set-zone-limit netdevxx default 1])], + [1], [], [ovs-vsctl: datapath netdevxx does not exist +]) +AT_CHECK([RUN_OVS_VSCTL([set-zone-limit netdev default -1])], + [1], [], [ovs-vsctl: limit (-1) out of range +]) +AT_CHECK([RUN_OVS_VSCTL([del-zone-limit netdev default])], + [1], [], [ovs-vsctl: datapath netdev does not have a limit +]) + AT_CHECK([RUN_OVS_VSCTL([-- --id=@m create Datapath datapath_version=0 'capabilities={recirc=true}' -- set Open_vSwitch . datapaths:"system"=@m])], [0], [stdout]) AT_CHECK([RUN_OVS_VSCTL([list-dp-cap nosystem])], [1], [], [ovs-vsctl: datapath "nosystem" record not found diff --git a/utilities/ovs-vsctl.8.in b/utilities/ovs-vsctl.8.in index 9e319aa1cf8..5ce949df496 100644 --- a/utilities/ovs-vsctl.8.in +++ b/utilities/ovs-vsctl.8.in @@ -354,7 +354,7 @@ Prints the name of the bridge that contains \fIiface\fR on standard output. . .SS "Conntrack Zone Commands" -These commands query and modify datapath CT zones and Timeout Policies. +These commands query and modify datapath CT zones, Timeout Policies and Limits. . .IP "[\fB\-\-may\-exist\fR] \fBadd\-zone\-tp \fIdatapath \fBzone=\fIzone_id \fIpolicies\fR" Creates a conntrack zone timeout policy with \fIzone_id\fR in @@ -365,20 +365,37 @@ packet and a 60-second policy for ICMP reply packets. See the \fBCT_Timeout_Policy\fR table in \fBovs-vswitchd.conf.db\fR(5) for the supported keys. .IP -Without \fB\-\-may\-exist\fR, attempting to add a \fIzone_id\fR that -already exists is an error. With \fB\-\-may\-exist\fR, -this command does nothing if \fIzone_id\fR already exists. +Without \fB\-\-may\-exist\fR, attempting to add a \fIpolicy\fR for +\fIzone_id\fR that already has a policy is an error. + With \fB\-\-may\-exist\fR, this command does nothing if policy for + \fIzone_id\fR already exists. . 
.IP "[\fB\-\-if\-exists\fR] \fBdel\-zone\-tp \fIdatapath \fBzone=\fIzone_id\fR" Delete the timeout policy associated with \fIzone_id\fR from \fIdatapath\fR. .IP -Without \fB\-\-if\-exists\fR, attempting to delete a zone that -does not exist is an error. With \fB\-\-if\-exists\fR, attempting to -delete a zone that does not exist has no effect. +Without \fB\-\-if\-exists\fR, attempting to delete a policy for zone that +does not exist or doesn't have a policy is an error. With +\fB\-\-if\-exists\fR, attempting to delete a policy that does not +exist has no effect. . .IP "\fBlist\-zone\-tp \fIdatapath\fR" Prints the timeout policies of all zones in \fIdatapath\fR. . +.IP "\fBset\-zone\-limit \fIdatapath \fIzone_id\fR|\fBdefault \fIzone_limit\fR" +Sets a conntrack zone limit with \fIzone_id\fR|\fIdefault\fR in +\fIdatapath\fR. The \fIzone_limit\fR with value \fB0\fR means unlimited. +.IP +. +.IP "[\fB\-\-if\-exists\fR] \fBdel\-zone\-limit \fIdatapath \fIzone_id\fR|\fBdefault\fR" +Delete the limit associated with \fIzone_id\fR from \fIdatapath\fR. +.IP +Without \fB\-\-if\-exists\fR, attempting to delete a limit for zone that +does not exist or doesn't have a limit is an error. With \fB\-\-if\-exists\fR, +attempting to delete a limit that does not exist has no effect. +. +.IP "\fBlist\-zone\-limits \fIdatapath\fR" +Prints the limits of all zones in \fIdatapath\fR. +. .SS "Datapath Capabilities Command" The command query datapath capabilities. . 
diff --git a/utilities/ovs-vsctl.c b/utilities/ovs-vsctl.c index 5e549df0055..495be356524 100644 --- a/utilities/ovs-vsctl.c +++ b/utilities/ovs-vsctl.c @@ -442,6 +442,13 @@ Auto Attach commands:\n\ Switch commands:\n\ emer-reset reset switch to known good state\n\ \n\ +Connection Tracking commands:\n\ + set-zone-limit DATAPATH ZONE|default LIMIT\n\ + set CT LIMIT for ZONE|default on DATAPATH\n\ + del-zone-limit DATAPATH ZONE|default\n\ + delete CT limit for ZONE|default on DATAPATH\n\ + list-zone-limits DATAPATH list all limits configured on DATAPATH\n\ +\n\ %s\ %s\ \n\ @@ -1302,8 +1309,8 @@ cmd_add_zone_tp(struct ctl_context *ctx) ctl_fatal("No timeout policy"); } - if (zone && !may_exist) { - ctl_fatal("zone id %"PRIu64" already exists", zone_id); + if (zone && zone->timeout_policy && !may_exist) { + ctl_fatal("zone id %"PRIu64" already has a policy", zone_id); } tp = create_timeout_policy(ctx, &ctx->argv[3], n_tps); @@ -1332,11 +1339,20 @@ cmd_del_zone_tp(struct ctl_context *ctx) } struct ovsrec_ct_zone *zone = find_ct_zone(dp, zone_id); - if (must_exist && !zone) { - ctl_fatal("zone id %"PRIu64" does not exist", zone_id); + if (must_exist && !(zone && zone->timeout_policy)) { + ctl_fatal("zone id %"PRIu64" does not have a policy", zone_id); } - if (zone) { + if (!zone) { + return; + } + + if (zone->limit) { + if (zone->timeout_policy) { + ovsrec_ct_timeout_policy_delete(zone->timeout_policy); + } + ovsrec_ct_zone_set_timeout_policy(zone, NULL); + } else { ovsrec_datapath_update_ct_zones_delkey(dp, zone_id); } } @@ -1371,12 +1387,118 @@ cmd_list_zone_tp(struct ctl_context *ctx) } } +static void +cmd_set_zone_limit(struct ctl_context *ctx) +{ + struct vsctl_context *vsctl_ctx = vsctl_context_cast(ctx); + int64_t zone_id = -1; + int64_t limit = -1; + + const char *dp_name = ctx->argv[1]; + + ovs_scan(ctx->argv[2], "%"SCNi64, &zone_id); + ovs_scan(ctx->argv[3], "%"SCNi64, &limit); + + struct ovsrec_datapath *dp = find_datapath(vsctl_ctx, dp_name); + if (!dp) { + 
ctl_fatal("datapath %s does not exist", dp_name); + } + + if (limit < 0 || limit > UINT32_MAX) { + ctl_fatal("limit (%"PRIi64") out of range", limit); + } + + if (!strcmp(ctx->argv[2], "default")) { + ovsrec_datapath_set_ct_zone_default_limit(dp, &limit, 1); + return; + } + + if (zone_id < 0 || zone_id > UINT16_MAX) { + ctl_fatal("zone_id (%"PRIi64") out of range", zone_id); + } + + struct ovsrec_ct_zone *zone = find_ct_zone(dp, zone_id); + if (!zone) { + zone = ovsrec_ct_zone_insert(ctx->txn); + ovsrec_datapath_update_ct_zones_setkey(dp, zone_id, zone); + } + + ovsrec_ct_zone_set_limit(zone, &limit, 1); +} + +static void +cmd_del_zone_limit(struct ctl_context *ctx) +{ + struct vsctl_context *vsctl_ctx = vsctl_context_cast(ctx); + int64_t zone_id; + + bool must_exist = !shash_find(&ctx->options, "--if-exists"); + const char *dp_name = ctx->argv[1]; + + ovs_scan(ctx->argv[2], "%"SCNi64, &zone_id); + + struct ovsrec_datapath *dp = find_datapath(vsctl_ctx, dp_name); + if (!dp) { + ctl_fatal("datapath %s does not exist", dp_name); + } + + if (!strcmp(ctx->argv[2], "default")) { + if (must_exist && !dp->ct_zone_default_limit) { + ctl_fatal("datapath %s does not have a limit", dp_name); + } + + ovsrec_datapath_set_ct_zone_default_limit(dp, NULL, 0); + return; + } + + struct ovsrec_ct_zone *zone = find_ct_zone(dp, zone_id); + if (must_exist && !(zone && zone->limit)) { + ctl_fatal("zone_id %"PRIi64" does not have a limit", zone_id); + } + + if (!zone) { + return; + } + + if (zone->timeout_policy) { + ovsrec_ct_zone_set_limit(zone, NULL, 0); + } else { + ovsrec_datapath_update_ct_zones_delkey(dp, zone_id); + } +} + +static void +cmd_list_zone_limits(struct ctl_context *ctx) +{ + struct vsctl_context *vsctl_ctx = vsctl_context_cast(ctx); + + struct ovsrec_datapath *dp = find_datapath(vsctl_ctx, ctx->argv[1]); + if (!dp) { + ctl_fatal("datapath: %s record not found", ctx->argv[1]); + } + + if (dp->ct_zone_default_limit) { + ds_put_format(&ctx->output, "Default, Limit: 
%"PRIu64"\n", + *dp->ct_zone_default_limit); + } + + for (int i = 0; i < dp->n_ct_zones; i++) { + struct ovsrec_ct_zone *zone = dp->value_ct_zones[i]; + if (zone->limit) { + ds_put_format(&ctx->output, "Zone: %"PRIu64", Limit: %"PRIu64"\n", + dp->key_ct_zones[i], *zone->limit); + } + } +} + static void pre_get_zone(struct ctl_context *ctx) { ovsdb_idl_add_column(ctx->idl, &ovsrec_open_vswitch_col_datapaths); ovsdb_idl_add_column(ctx->idl, &ovsrec_datapath_col_ct_zones); + ovsdb_idl_add_column(ctx->idl, &ovsrec_datapath_col_ct_zone_default_limit); ovsdb_idl_add_column(ctx->idl, &ovsrec_ct_zone_col_timeout_policy); + ovsdb_idl_add_column(ctx->idl, &ovsrec_ct_zone_col_limit); ovsdb_idl_add_column(ctx->idl, &ovsrec_ct_timeout_policy_col_timeouts); } @@ -3159,6 +3281,14 @@ static const struct ctl_command_syntax vsctl_commands[] = { /* Datapath capabilities. */ {"list-dp-cap", 1, 1, "", pre_get_dp_cap, cmd_list_dp_cap, NULL, "", RO}, + /* CT zone limit. */ + {"set-zone-limit", 3, 3, "ARG ARG ARG", pre_get_zone, cmd_set_zone_limit, + NULL, "", RW}, + {"del-zone-limit", 2, 2, "ARG ARG", pre_get_zone, cmd_del_zone_limit, NULL, + "--if-exists", RW}, + {"list-zone-limits", 1, 1, "ARG", pre_get_zone, cmd_list_zone_limits, NULL, + "", RO}, + {NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, RO}, }; diff --git a/vswitchd/vswitch.ovsschema b/vswitchd/vswitch.ovsschema index 2d395ff952c..e2d5e2e85e6 100644 --- a/vswitchd/vswitch.ovsschema +++ b/vswitchd/vswitch.ovsschema @@ -1,6 +1,6 @@ {"name": "Open_vSwitch", - "version": "8.4.0", - "cksum": "2738838700 27127", + "version": "8.5.0", + "cksum": "4040946650 27557", "tables": { "Open_vSwitch": { "columns": { @@ -670,6 +670,11 @@ "capabilities": { "type": {"key": "string", "value": "string", "min": 0, "max": "unlimited"}}, + "ct_zone_default_limit": { + "type": { "key": {"type": "integer", + "minInteger": 0, + "maxInteger": 4294967295}, + "min": 0, "max": 1}}, "external_ids": { "type": {"key": "string", "value": "string", "min": 0, "max": 
"unlimited"}}}}, @@ -679,6 +684,11 @@ "type": {"key": {"type": "uuid", "refTable": "CT_Timeout_Policy"}, "min": 0, "max": 1}}, + "limit": { + "type": { "key": {"type": "integer", + "minInteger": 0, + "maxInteger": 4294967295}, + "min": 0, "max": 1}}, "external_ids": { "type": {"key": "string", "value": "string", "min": 0, "max": "unlimited"}}}}, diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 68392ac41d7..eaccd85cf94 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -6488,6 +6488,14 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \ + + Default connection tracking zone limit that is applied to all zones + that didn't specify the + explicitly. If the limit is unspecified the default limit + configuration for the datapath is left intact. The value 0 means + unlimited. + + The overall purpose of these columns is described under Common Columns at the beginning of this document. @@ -6504,6 +6512,12 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \ is not specified, it defaults to the timeout policy in the system. + + Connection tracking limit for this zone. If the limit is unspecified + the will be used. + The value 0 means unlimited. + + The overall purpose of these columns is described under Common Columns at the beginning of this document. From 1b3557f53dbc8cdbba7aa318cb2659b1a8d68185 Mon Sep 17 00:00:00 2001 From: Ales Musil Date: Mon, 4 Dec 2023 06:49:11 +0100 Subject: [PATCH 465/833] vswitchd, ofproto-dpif: Propagate the CT limit from database. Propagate the CT limit that is present in the DB into datapath. The limit is currently only propagated on change and can be overwritten by the dpctl commands. 
Signed-off-by: Ales Musil Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif.c | 39 ++++++++++++++++++++ ofproto/ofproto-dpif.h | 5 +++ ofproto/ofproto-provider.h | 8 ++++ ofproto/ofproto.c | 12 ++++++ ofproto/ofproto.h | 2 + tests/system-traffic.at | 54 +++++++++++++++++++++++++++ vswitchd/bridge.c | 75 +++++++++++++++++++++++++++++--------- 7 files changed, 177 insertions(+), 18 deletions(-) diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index 54e057d43ff..bfae28d9616 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -220,6 +220,7 @@ static void ofproto_unixctl_init(void); static void ct_zone_config_init(struct dpif_backer *backer); static void ct_zone_config_uninit(struct dpif_backer *backer); static void ct_zone_timeout_policy_sweep(struct dpif_backer *backer); +static void ct_zone_limits_commit(struct dpif_backer *backer); static inline struct ofproto_dpif * ofproto_dpif_cast(const struct ofproto *ofproto) @@ -513,6 +514,7 @@ type_run(const char *type) process_dpif_port_changes(backer); ct_zone_timeout_policy_sweep(backer); + ct_zone_limits_commit(backer); return 0; } @@ -5532,6 +5534,8 @@ ct_zone_config_init(struct dpif_backer *backer) cmap_init(&backer->ct_zones); hmap_init(&backer->ct_tps); ovs_list_init(&backer->ct_tp_kill_list); + ovs_list_init(&backer->ct_zone_limits_to_add); + ovs_list_init(&backer->ct_zone_limits_to_del); clear_existing_ct_timeout_policies(backer); } @@ -5555,6 +5559,8 @@ ct_zone_config_uninit(struct dpif_backer *backer) id_pool_destroy(backer->tp_ids); cmap_destroy(&backer->ct_zones); hmap_destroy(&backer->ct_tps); + ct_dpif_free_zone_limits(&backer->ct_zone_limits_to_add); + ct_dpif_free_zone_limits(&backer->ct_zone_limits_to_del); } static void @@ -5635,6 +5641,38 @@ ct_del_zone_timeout_policy(const char *datapath_type, uint16_t zone_id) } } +static void +ct_zone_limit_update(const char *datapath_type, int32_t zone_id, + int64_t *limit) +{ + struct dpif_backer *backer = 
shash_find_data(&all_dpif_backers, + datapath_type); + if (!backer) { + return; + } + + if (limit) { + ct_dpif_push_zone_limit(&backer->ct_zone_limits_to_add, zone_id, + *limit, 0); + } else { + ct_dpif_push_zone_limit(&backer->ct_zone_limits_to_del, zone_id, 0, 0); + } +} + +static void +ct_zone_limits_commit(struct dpif_backer *backer) +{ + if (!ovs_list_is_empty(&backer->ct_zone_limits_to_add)) { + ct_dpif_set_limits(backer->dpif, &backer->ct_zone_limits_to_add); + ct_dpif_free_zone_limits(&backer->ct_zone_limits_to_add); + } + + if (!ovs_list_is_empty(&backer->ct_zone_limits_to_del)) { + ct_dpif_del_limits(backer->dpif, &backer->ct_zone_limits_to_del); + ct_dpif_free_zone_limits(&backer->ct_zone_limits_to_del); + } +} + static void get_datapath_cap(const char *datapath_type, struct smap *cap) { @@ -6925,4 +6963,5 @@ const struct ofproto_class ofproto_dpif_class = { ct_flush, /* ct_flush */ ct_set_zone_timeout_policy, ct_del_zone_timeout_policy, + ct_zone_limit_update, }; diff --git a/ofproto/ofproto-dpif.h b/ofproto/ofproto-dpif.h index 1fe22ab41bd..92d33aa6470 100644 --- a/ofproto/ofproto-dpif.h +++ b/ofproto/ofproto-dpif.h @@ -285,6 +285,11 @@ struct dpif_backer { feature than 'bt_support'. */ struct atomic_count tnl_count; + + struct ovs_list ct_zone_limits_to_add; /* CT zone limits queued for + * addition into datapath. */ + struct ovs_list ct_zone_limits_to_del; /* CT zone limits queued for + * deletion from datapath. */ }; /* All existing ofproto_backer instances, indexed by ofproto->up.type. */ diff --git a/ofproto/ofproto-provider.h b/ofproto/ofproto-provider.h index 9f7b8b6e831..face0b574cc 100644 --- a/ofproto/ofproto-provider.h +++ b/ofproto/ofproto-provider.h @@ -1921,6 +1921,14 @@ struct ofproto_class { /* Deletes the timeout policy associated with 'zone' in datapath type * 'dp_type'. */ void (*ct_del_zone_timeout_policy)(const char *dp_type, uint16_t zone); + + /* Updates the CT zone limit for specified zone. 
Setting 'zone' to + * 'OVS_ZONE_LIMIT_DEFAULT_ZONE' represents the default zone. + * 'NULL' passed as 'limit' indicates that the limit should be removed for + * the specified zone. The caller must ensure that the 'limit' value is + * within proper range (0 - UINT32_MAX). */ + void (*ct_zone_limit_update)(const char *dp_type, int32_t zone, + int64_t *limit); }; extern const struct ofproto_class ofproto_dpif_class; diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c index e78c80d1155..649add089a3 100644 --- a/ofproto/ofproto.c +++ b/ofproto/ofproto.c @@ -1026,6 +1026,18 @@ ofproto_ct_del_zone_timeout_policy(const char *datapath_type, uint16_t zone_id) } +void +ofproto_ct_zone_limit_update(const char *datapath_type, int32_t zone_id, + int64_t *limit) +{ + datapath_type = ofproto_normalize_type(datapath_type); + const struct ofproto_class *class = ofproto_class_find__(datapath_type); + + if (class && class->ct_zone_limit_update) { + class->ct_zone_limit_update(datapath_type, zone_id, limit); + } +} + /* Spanning Tree Protocol (STP) configuration. */ diff --git a/ofproto/ofproto.h b/ofproto/ofproto.h index 8efdb20a072..7ce6a65e131 100644 --- a/ofproto/ofproto.h +++ b/ofproto/ofproto.h @@ -384,6 +384,8 @@ void ofproto_ct_set_zone_timeout_policy(const char *datapath_type, struct simap *timeout_policy); void ofproto_ct_del_zone_timeout_policy(const char *datapath_type, uint16_t zone); +void ofproto_ct_zone_limit_update(const char *datapath_type, int32_t zone_id, + int64_t *limit); void ofproto_get_datapath_cap(const char *datapath_type, struct smap *dp_cap); diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 498ac8888d8..99e2bfad978 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -5343,6 +5343,60 @@ default limit=0 zone=4,limit=0,count=0 ]) +dnl Test limit set via database. 
+VSCTL_ADD_DATAPATH_TABLE() + +AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=0]) +AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=3]) + +AT_CHECK([ovs-appctl dpctl/ct-set-limits default=10]) +AT_CHECK([ovs-appctl dpctl/ct-del-limits zone=3]) +AT_CHECK([ovs-appctl dpctl/ct-get-limits], [0], [dnl +default limit=10 +zone=0,limit=5,count=0 +]) + +AT_CHECK([ovs-vsctl set-zone-limit $DP_TYPE 0 3]) +AT_CHECK([ovs-vsctl set-zone-limit $DP_TYPE 3 3]) + +OVS_WAIT_UNTIL_EQUAL([ovs-appctl dpctl/ct-get-limits], [dnl +default limit=10 +zone=0,limit=3,count=0 +zone=3,limit=3,count=0]) + +for i in 2 3 4 5 6; do + packet="50540000000a50540000000908004500001c000000000011a4c90a0101030a0101040001000${i}00080000" + AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 \ + "in_port=2 packet=${packet} actions=resubmit(,0)"]) +done + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=10\.1\.1\.3," | sort ], [0], [dnl +udp,orig=(src=10.1.1.3,dst=10.1.1.4,sport=1,dport=2),reply=(src=10.1.1.4,dst=10.1.1.3,sport=2,dport=1),zone=3 +udp,orig=(src=10.1.1.3,dst=10.1.1.4,sport=1,dport=3),reply=(src=10.1.1.4,dst=10.1.1.3,sport=3,dport=1),zone=3 +udp,orig=(src=10.1.1.3,dst=10.1.1.4,sport=1,dport=4),reply=(src=10.1.1.4,dst=10.1.1.3,sport=4,dport=1),zone=3 +]) + +AT_CHECK([ovs-appctl dpctl/ct-get-limits], [0], [dnl +default limit=10 +zone=0,limit=3,count=0 +zone=3,limit=3,count=3 +]) + +AT_CHECK([ovs-vsctl del-zone-limit $DP_TYPE 3]) +OVS_WAIT_UNTIL_EQUAL([ovs-appctl dpctl/ct-get-limits], [dnl +default limit=10 +zone=0,limit=3,count=0]) + +AT_CHECK([ovs-vsctl set-zone-limit $DP_TYPE default 5]) +OVS_WAIT_UNTIL_EQUAL([ovs-appctl dpctl/ct-get-limits], [dnl +default limit=5 +zone=0,limit=3,count=0]) + +AT_CHECK([ovs-vsctl del-zone-limit $DP_TYPE default]) +OVS_WAIT_UNTIL_EQUAL([ovs-appctl dpctl/ct-get-limits], [dnl +default limit=0 +zone=0,limit=3,count=0]) + OVS_TRAFFIC_VSWITCHD_STOP(["dnl /could not create datapath/d /(Cannot allocate memory) on packet/d"]) diff --git a/vswitchd/bridge.c 
b/vswitchd/bridge.c index e9110c1d80d..5be38b890b2 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -157,6 +157,8 @@ struct aa_mapping { /* Internal representation of conntrack zone configuration table in OVSDB. */ struct ct_zone { uint16_t zone_id; + int64_t limit; /* Limit of allowed entries. '-1' if not + * specified. */ struct simap tp; /* A map from timeout policy attribute to * timeout value. */ struct hmap_node node; /* Node in 'struct datapath' 'ct_zones' @@ -168,14 +170,15 @@ struct ct_zone { /* Internal representation of datapath configuration table in OVSDB. */ struct datapath { - char *type; /* Datapath type. */ - struct hmap ct_zones; /* Map of 'struct ct_zone' elements, indexed - * by 'zone'. */ - struct hmap_node node; /* Node in 'all_datapaths' hmap. */ - struct smap caps; /* Capabilities. */ - unsigned int last_used; /* The last idl_seqno that this 'datapath' - * used in OVSDB. This number is used for - * garbage collection. */ + char *type; /* Datapath type. */ + struct hmap ct_zones; /* Map of 'struct ct_zone' elements, + * indexed by 'zone'. */ + struct hmap_node node; /* Node in 'all_datapaths' hmap. */ + struct smap caps; /* Capabilities. */ + unsigned int last_used; /* The last idl_seqno that this 'datapath' + * used in OVSDB. This number is used for + * garbage collection. */ + int64_t ct_zone_default_limit; /* Default CT limit for all zones. */ }; /* All bridges, indexed by name. 
*/ @@ -662,6 +665,7 @@ ct_zone_alloc(uint16_t zone_id, struct ovsrec_ct_timeout_policy *tp_cfg) struct ct_zone *ct_zone = xzalloc(sizeof *ct_zone); ct_zone->zone_id = zone_id; + ct_zone->limit = -1; simap_init(&ct_zone->tp); get_timeout_policy_from_ovsrec(&ct_zone->tp, tp_cfg); return ct_zone; @@ -670,6 +674,14 @@ ct_zone_alloc(uint16_t zone_id, struct ovsrec_ct_timeout_policy *tp_cfg) static void ct_zone_remove_and_destroy(struct datapath *dp, struct ct_zone *ct_zone) { + if (!simap_is_empty(&ct_zone->tp)) { + ofproto_ct_del_zone_timeout_policy(dp->type, ct_zone->zone_id); + } + + if (ct_zone->limit > -1) { + ofproto_ct_zone_limit_update(dp->type, ct_zone->zone_id, NULL); + } + hmap_remove(&dp->ct_zones, &ct_zone->node); simap_destroy(&ct_zone->tp); free(ct_zone); @@ -706,6 +718,7 @@ datapath_create(const char *type) { struct datapath *dp = xzalloc(sizeof *dp); dp->type = xstrdup(type); + dp->ct_zone_default_limit = -1; hmap_init(&dp->ct_zones); hmap_insert(&all_datapaths, &dp->node, hash_string(type, 0)); smap_init(&dp->caps); @@ -722,6 +735,11 @@ datapath_destroy(struct datapath *dp) ct_zone_remove_and_destroy(dp, ct_zone); } + if (dp->ct_zone_default_limit > -1) { + ofproto_ct_zone_limit_update(dp->type, OVS_ZONE_LIMIT_DEFAULT_ZONE, + NULL); + } + hmap_remove(&all_datapaths, &dp->node); hmap_destroy(&dp->ct_zones); free(dp->type); @@ -743,29 +761,50 @@ ct_zones_reconfigure(struct datapath *dp, struct ovsrec_datapath *dp_cfg) struct ovsrec_ct_timeout_policy *tp_cfg = zone_cfg->timeout_policy; ct_zone = ct_zone_lookup(&dp->ct_zones, zone_id); - if (ct_zone) { - struct simap new_tp = SIMAP_INITIALIZER(&new_tp); - get_timeout_policy_from_ovsrec(&new_tp, tp_cfg); - if (update_timeout_policy(&ct_zone->tp, &new_tp)) { + if (!ct_zone) { + ct_zone = ct_zone_alloc(zone_id, tp_cfg); + hmap_insert(&dp->ct_zones, &ct_zone->node, hash_int(zone_id, 0)); + } + + struct simap new_tp = SIMAP_INITIALIZER(&new_tp); + get_timeout_policy_from_ovsrec(&new_tp, tp_cfg); + + if 
(update_timeout_policy(&ct_zone->tp, &new_tp)) { + if (simap_count(&ct_zone->tp)) { ofproto_ct_set_zone_timeout_policy(dp->type, ct_zone->zone_id, &ct_zone->tp); + } else { + ofproto_ct_del_zone_timeout_policy(dp->type, ct_zone->zone_id); } - } else { - ct_zone = ct_zone_alloc(zone_id, tp_cfg); - hmap_insert(&dp->ct_zones, &ct_zone->node, hash_int(zone_id, 0)); - ofproto_ct_set_zone_timeout_policy(dp->type, ct_zone->zone_id, - &ct_zone->tp); } + + int64_t desired_limit = zone_cfg->limit ? *zone_cfg->limit : -1; + if (ct_zone->limit != desired_limit) { + ofproto_ct_zone_limit_update(dp->type, zone_id, zone_cfg->limit); + ct_zone->limit = desired_limit; + } + ct_zone->last_used = idl_seqno; } /* Purge 'ct_zone's no longer found in the database. */ HMAP_FOR_EACH_SAFE (ct_zone, node, &dp->ct_zones) { if (ct_zone->last_used != idl_seqno) { - ofproto_ct_del_zone_timeout_policy(dp->type, ct_zone->zone_id); ct_zone_remove_and_destroy(dp, ct_zone); } } + + /* Reconfigure default CT zone limit if needed. */ + int64_t default_limit = dp_cfg->ct_zone_default_limit + ? *dp_cfg->ct_zone_default_limit + : -1; + + if (dp->ct_zone_default_limit != default_limit) { + ofproto_ct_zone_limit_update(dp->type, OVS_ZONE_LIMIT_DEFAULT_ZONE, + dp_cfg->ct_zone_default_limit); + dp->ct_zone_default_limit = default_limit; + } + } static void From 27e0349e2066dab084b3e496b81b3183dd877c35 Mon Sep 17 00:00:00 2001 From: Ales Musil Date: Mon, 4 Dec 2023 06:49:12 +0100 Subject: [PATCH 466/833] ct-dpif: Enforce CT zone limit protection. Make sure that if any zone limit was set via DB all zones are forced to be set there also. This is done by tracking which datapath has zone limit protection and it is reflected in the dpctl command. If the datapath is protected the dpctl command will return permission error. 
Signed-off-by: Ales Musil Signed-off-by: Ilya Maximets --- lib/ct-dpif.c | 25 +++++++++++++++++++ lib/ct-dpif.h | 2 ++ lib/dpctl.c | 14 +++++++++++ ofproto/ofproto-dpif.c | 13 ++++++++++ ofproto/ofproto-provider.h | 5 ++++ ofproto/ofproto.c | 11 +++++++++ ofproto/ofproto.h | 2 ++ tests/system-traffic.at | 49 ++++++++++++++++++++++++++++++++++++++ vswitchd/bridge.c | 7 ++++++ 9 files changed, 128 insertions(+) diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c index 2ee04516450..5115c886b1d 100644 --- a/lib/ct-dpif.c +++ b/lib/ct-dpif.c @@ -23,6 +23,7 @@ #include "openvswitch/ofp-ct.h" #include "openvswitch/ofp-parse.h" #include "openvswitch/vlog.h" +#include "sset.h" VLOG_DEFINE_THIS_MODULE(ct_dpif); @@ -32,6 +33,10 @@ struct flags { const char *name; }; +/* Protection for CT zone limit per datapath. */ +static struct sset ct_limit_protection = + SSET_INITIALIZER(&ct_limit_protection); + static void ct_dpif_format_counters(struct ds *, const struct ct_dpif_counters *); static void ct_dpif_format_timestamp(struct ds *, @@ -1064,3 +1069,23 @@ ct_dpif_get_features(struct dpif *dpif, enum ct_features *features) ? 
dpif->dpif_class->ct_get_features(dpif, features) : EOPNOTSUPP); } + +void +ct_dpif_set_zone_limit_protection(struct dpif *dpif, bool protected) +{ + if (sset_contains(&ct_limit_protection, dpif->full_name) == protected) { + return; + } + + if (protected) { + sset_add(&ct_limit_protection, dpif->full_name); + } else { + sset_find_and_delete(&ct_limit_protection, dpif->full_name); + } +} + +bool +ct_dpif_is_zone_limit_protected(struct dpif *dpif) +{ + return sset_contains(&ct_limit_protection, dpif->full_name); +} diff --git a/lib/ct-dpif.h b/lib/ct-dpif.h index c8a7c155e3c..c3786d5ae54 100644 --- a/lib/ct-dpif.h +++ b/lib/ct-dpif.h @@ -350,5 +350,7 @@ int ct_dpif_get_timeout_policy_name(struct dpif *dpif, uint32_t tp_id, uint16_t dl_type, uint8_t nw_proto, char **tp_name, bool *is_generic); int ct_dpif_get_features(struct dpif *dpif, enum ct_features *features); +void ct_dpif_set_zone_limit_protection(struct dpif *, bool protected); +bool ct_dpif_is_zone_limit_protected(struct dpif *); #endif /* CT_DPIF_H */ diff --git a/lib/dpctl.c b/lib/dpctl.c index a8c65474712..2a1aac5e5f5 100644 --- a/lib/dpctl.c +++ b/lib/dpctl.c @@ -2234,6 +2234,13 @@ dpctl_ct_set_limits(int argc, const char *argv[], ct_dpif_push_zone_limit(&zone_limits, zone, limit, 0); } + if (ct_dpif_is_zone_limit_protected(dpif)) { + ds_put_cstr(&ds, "the zone limits are set via database, " + "use 'ovs-vsctl set-zone-limit <...>' instead."); + error = EPERM; + goto error; + } + error = ct_dpif_set_limits(dpif, &zone_limits); if (!error) { ct_dpif_free_zone_limits(&zone_limits); @@ -2310,6 +2317,13 @@ dpctl_ct_del_limits(int argc, const char *argv[], } } + if (ct_dpif_is_zone_limit_protected(dpif)) { + ds_put_cstr(&ds, "the zone limits are set via database, " + "use 'ovs-vsctl del-zone-limit <...>' instead."); + error = EPERM; + goto error; + } + error = ct_dpif_del_limits(dpif, &zone_limits); if (!error) { goto out; diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index 
bfae28d9616..6e62ed1f982 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -5673,6 +5673,18 @@ ct_zone_limits_commit(struct dpif_backer *backer) } } +static void +ct_zone_limit_protection_update(const char *datapath_type, bool protected) +{ + struct dpif_backer *backer = shash_find_data(&all_dpif_backers, + datapath_type); + if (!backer) { + return; + } + + ct_dpif_set_zone_limit_protection(backer->dpif, protected); +} + static void get_datapath_cap(const char *datapath_type, struct smap *cap) { @@ -6964,4 +6976,5 @@ const struct ofproto_class ofproto_dpif_class = { ct_set_zone_timeout_policy, ct_del_zone_timeout_policy, ct_zone_limit_update, + ct_zone_limit_protection_update, }; diff --git a/ofproto/ofproto-provider.h b/ofproto/ofproto-provider.h index face0b574cc..83c509fcf80 100644 --- a/ofproto/ofproto-provider.h +++ b/ofproto/ofproto-provider.h @@ -1929,6 +1929,11 @@ struct ofproto_class { * within proper range (0 - UINT32_MAX). */ void (*ct_zone_limit_update)(const char *dp_type, int32_t zone, int64_t *limit); + + /* Sets the CT zone limit protection to "protected" for the specified + * datapath type. */ + void (*ct_zone_limit_protection_update)(const char *dp_type, + bool protected); }; extern const struct ofproto_class ofproto_dpif_class; diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c index 649add089a3..122a06f3032 100644 --- a/ofproto/ofproto.c +++ b/ofproto/ofproto.c @@ -1038,6 +1038,17 @@ ofproto_ct_zone_limit_update(const char *datapath_type, int32_t zone_id, } } +void +ofproto_ct_zone_limit_protection_update(const char *datapath_type, + bool protected) +{ + datapath_type = ofproto_normalize_type(datapath_type); + const struct ofproto_class *class = ofproto_class_find__(datapath_type); + + if (class && class->ct_zone_limit_protection_update) { + class->ct_zone_limit_protection_update(datapath_type, protected); + } +} /* Spanning Tree Protocol (STP) configuration. 
*/ diff --git a/ofproto/ofproto.h b/ofproto/ofproto.h index 7ce6a65e131..1c07df27518 100644 --- a/ofproto/ofproto.h +++ b/ofproto/ofproto.h @@ -386,6 +386,8 @@ void ofproto_ct_del_zone_timeout_policy(const char *datapath_type, uint16_t zone); void ofproto_ct_zone_limit_update(const char *datapath_type, int32_t zone_id, int64_t *limit); +void ofproto_ct_zone_limit_protection_update(const char *datapath_type, + bool protected); void ofproto_get_datapath_cap(const char *datapath_type, struct smap *dp_cap); diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 99e2bfad978..fa66f6f66f8 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -5397,6 +5397,55 @@ OVS_WAIT_UNTIL_EQUAL([ovs-appctl dpctl/ct-get-limits], [dnl default limit=0 zone=0,limit=3,count=0]) +dnl Try to overwrite the zone limit via dpctl command. +AT_CHECK([ovs-appctl dpctl/ct-set-limits default=15 zone=3,limit=5 zone=0,limit=5], [2], [ignore], [dnl +ovs-vswitchd: the zone limits are set via database, dnl +use 'ovs-vsctl set-zone-limit <...>' instead. (Operation not permitted) +ovs-appctl: ovs-vswitchd: server returned an error +]) + +AT_CHECK([ovs-appctl dpctl/ct-get-limits], [0], [dnl +default limit=0 +zone=0,limit=3,count=0 +]) + +AT_CHECK([ovs-appctl dpctl/ct-del-limits zone=0], [2], [ignore], [dnl +ovs-vswitchd: the zone limits are set via database, dnl +use 'ovs-vsctl del-zone-limit <...>' instead. (Operation not permitted) +ovs-appctl: ovs-vswitchd: server returned an error +]) + +AT_CHECK([ovs-appctl dpctl/ct-get-limits], [0], [dnl +default limit=0 +zone=0,limit=3,count=0 +]) + +AT_CHECK([ovs-vsctl del-zone-limit $DP_TYPE 0]) +AT_CHECK([ovs-vsctl set-zone-limit $DP_TYPE default 10]) +AT_CHECK([ovs-appctl dpctl/ct-get-limits], [0], [dnl +default limit=10 +]) + +AT_CHECK([ovs-appctl dpctl/ct-set-limits default=15 zone=1,limit=5], [2], [ignore], [dnl +ovs-vswitchd: the zone limits are set via database, dnl +use 'ovs-vsctl set-zone-limit <...>' instead. 
(Operation not permitted) +ovs-appctl: ovs-vswitchd: server returned an error +]) + +dnl Delete all zones from DB, that should remove the protection. +AT_CHECK([ovs-vsctl del-zone-limit $DP_TYPE default]) + +AT_CHECK([ovs-appctl dpctl/ct-set-limits default=15 zone=1,limit=5]) +AT_CHECK([ovs-appctl dpctl/ct-get-limits], [0], [dnl +default limit=15 +zone=1,limit=5,count=0 +]) + +AT_CHECK([ovs-appctl dpctl/ct-del-limits zone=1]) +AT_CHECK([ovs-appctl dpctl/ct-get-limits], [0], [dnl +default limit=15 +]) + OVS_TRAFFIC_VSWITCHD_STOP(["dnl /could not create datapath/d /(Cannot allocate memory) on packet/d"]) diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index 5be38b890b2..95a65fcdcd5 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -740,6 +740,7 @@ datapath_destroy(struct datapath *dp) NULL); } + ofproto_ct_zone_limit_protection_update(dp->type, false); hmap_remove(&all_datapaths, &dp->node); hmap_destroy(&dp->ct_zones); free(dp->type); @@ -752,6 +753,7 @@ static void ct_zones_reconfigure(struct datapath *dp, struct ovsrec_datapath *dp_cfg) { struct ct_zone *ct_zone; + bool protected = false; /* Add new 'ct_zone's or update existing 'ct_zone's based on the database * state. */ @@ -785,6 +787,8 @@ ct_zones_reconfigure(struct datapath *dp, struct ovsrec_datapath *dp_cfg) } ct_zone->last_used = idl_seqno; + + protected = protected || !!zone_cfg->limit; } /* Purge 'ct_zone's no longer found in the database. */ @@ -805,6 +809,9 @@ ct_zones_reconfigure(struct datapath *dp, struct ovsrec_datapath *dp_cfg) dp->ct_zone_default_limit = default_limit; } + protected = protected || !!dp_cfg->ct_zone_default_limit; + + ofproto_ct_zone_limit_protection_update(dp->type, protected); } static void From 7067ed1660f90e6dcb46aa51839f2d6ebe5b2023 Mon Sep 17 00:00:00 2001 From: Ales Musil Date: Mon, 4 Dec 2023 06:49:13 +0100 Subject: [PATCH 467/833] tests: Do not use zone 0 for CT limit system test. 
The zone 0 is default system zone, do not use this zone for the test because it might contain some entries already which could cause flakiness during the check. In order to still have the zone 0 parsing coverage add simple unit tests for dpctl. Signed-off-by: Ales Musil Signed-off-by: Ilya Maximets --- tests/dpctl.at | 8 +++++- tests/system-traffic.at | 59 ++++++++++++++++++++--------------------- 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/tests/dpctl.at b/tests/dpctl.at index d2f1046f8b5..a87f67f9870 100644 --- a/tests/dpctl.at +++ b/tests/dpctl.at @@ -136,7 +136,7 @@ AT_CHECK([ovs-appctl dpctl/del-dp dummy@br0]) OVS_VSWITCHD_STOP AT_CLEANUP -AT_SETUP([dpctl - ct-get-limits ct-del-limits]) +AT_SETUP([dpctl - ct-set-limits ct-get-limits ct-del-limits]) OVS_VSWITCHD_START AT_CHECK([ovs-appctl dpctl/ct-get-limits], [0], [default limit=0 ]) @@ -149,5 +149,11 @@ AT_CHECK([ovs-appctl dpctl/ct-get-limits zone=x], [2], [], ovs-appctl: ovs-vswitchd: server returned an error ]) AT_CHECK([ovs-appctl dpctl/ct-del-limits zone=]) +AT_CHECK([ovs-appctl dpctl/ct-set-limits zone=0,limit=0]) +AT_CHECK([ovs-appctl dpctl/ct-get-limits zone=0], [0], [default limit=0 +zone=0,limit=0,count=0 +]) +AT_CHECK([ovs-appctl dpctl/ct-del-limits zone=0]) + OVS_VSWITCHD_STOP AT_CLEANUP \ No newline at end of file diff --git a/tests/system-traffic.at b/tests/system-traffic.at index fa66f6f66f8..69ba6a18ab7 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -5246,20 +5246,20 @@ ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") AT_DATA([flows.txt], [dnl priority=1,action=drop priority=10,arp,action=normal -priority=100,in_port=1,udp,action=ct(commit),2 +priority=100,in_port=1,udp,action=ct(zone=1,commit),2 priority=100,in_port=2,udp,action=ct(zone=3,commit),1 ]) AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) -AT_CHECK([ovs-appctl dpctl/ct-set-limits default=10 zone=0,limit=5 zone=1,limit=15 zone=2,limit=3 zone=3,limit=3]) -AT_CHECK([ovs-appctl 
dpctl/ct-del-limits zone=1,2,4]) -AT_CHECK([ovs-appctl dpctl/ct-get-limits zone=0,1,2,3], [],[dnl +AT_CHECK([ovs-appctl dpctl/ct-set-limits default=10 zone=1,limit=5 zone=2,limit=3 zone=3,limit=3 zone=4,limit=15]) +AT_CHECK([ovs-appctl dpctl/ct-del-limits zone=2,4,5]) +AT_CHECK([ovs-appctl dpctl/ct-get-limits zone=1,2,3,4], [],[dnl default limit=10 -zone=0,limit=5,count=0 -zone=1,limit=10,count=0 +zone=1,limit=5,count=0 zone=2,limit=10,count=0 zone=3,limit=3,count=0 +zone=4,limit=10,count=0 ]) dnl Test UDP from port 1 @@ -5273,10 +5273,9 @@ AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a5 AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000900080000 actions=resubmit(,0)"]) AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000a00080000 actions=resubmit(,0)"]) -AT_CHECK([ovs-appctl dpctl/ct-get-limits zone=0,1,2,3,4,5], [0], [dnl +AT_CHECK([ovs-appctl dpctl/ct-get-limits zone=1,2,3,4,5], [0], [dnl default limit=10 -zone=0,limit=5,count=5 -zone=1,limit=10,count=0 +zone=1,limit=5,count=5 zone=2,limit=10,count=0 zone=3,limit=3,count=0 zone=4,limit=10,count=0 @@ -5286,16 +5285,16 @@ zone=5,limit=10,count=0 dnl Test ct-get-limits for all zones AT_CHECK([ovs-appctl dpctl/ct-get-limits], [0], [dnl default limit=10 -zone=0,limit=5,count=5 +zone=1,limit=5,count=5 zone=3,limit=3,count=0 ]) AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=10\.1\.1\.1," | sort ], [0], [dnl -udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1) -udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=3),reply=(src=10.1.1.2,dst=10.1.1.1,sport=3,dport=1) -udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=4),reply=(src=10.1.1.2,dst=10.1.1.1,sport=4,dport=1) 
-udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=5),reply=(src=10.1.1.2,dst=10.1.1.1,sport=5,dport=1) -udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=6),reply=(src=10.1.1.2,dst=10.1.1.1,sport=6,dport=1) +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),zone=1 +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=3),reply=(src=10.1.1.2,dst=10.1.1.1,sport=3,dport=1),zone=1 +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=4),reply=(src=10.1.1.2,dst=10.1.1.1,sport=4,dport=1),zone=1 +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=5),reply=(src=10.1.1.2,dst=10.1.1.1,sport=5,dport=1),zone=1 +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=6),reply=(src=10.1.1.2,dst=10.1.1.1,sport=6,dport=1),zone=1 ]) dnl Test UDP from port 2 @@ -5305,9 +5304,9 @@ AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a5 AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4c90a0101030a0101040001000500080000 actions=resubmit(,0)"]) AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4c90a0101030a0101040001000600080000 actions=resubmit(,0)"]) -AT_CHECK([ovs-appctl dpctl/ct-get-limits zone=0,3], [0], [dnl +AT_CHECK([ovs-appctl dpctl/ct-get-limits zone=1,3], [0], [dnl default limit=10 -zone=0,limit=5,count=5 +zone=1,limit=5,count=5 zone=3,limit=3,count=3 ]) @@ -5346,22 +5345,22 @@ zone=4,limit=0,count=0 dnl Test limit set via database. 
VSCTL_ADD_DATAPATH_TABLE() -AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=0]) +AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=1]) AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=3]) AT_CHECK([ovs-appctl dpctl/ct-set-limits default=10]) AT_CHECK([ovs-appctl dpctl/ct-del-limits zone=3]) AT_CHECK([ovs-appctl dpctl/ct-get-limits], [0], [dnl default limit=10 -zone=0,limit=5,count=0 +zone=1,limit=5,count=0 ]) -AT_CHECK([ovs-vsctl set-zone-limit $DP_TYPE 0 3]) +AT_CHECK([ovs-vsctl set-zone-limit $DP_TYPE 1 3]) AT_CHECK([ovs-vsctl set-zone-limit $DP_TYPE 3 3]) OVS_WAIT_UNTIL_EQUAL([ovs-appctl dpctl/ct-get-limits], [dnl default limit=10 -zone=0,limit=3,count=0 +zone=1,limit=3,count=0 zone=3,limit=3,count=0]) for i in 2 3 4 5 6; do @@ -5378,27 +5377,27 @@ udp,orig=(src=10.1.1.3,dst=10.1.1.4,sport=1,dport=4),reply=(src=10.1.1.4,dst=10. AT_CHECK([ovs-appctl dpctl/ct-get-limits], [0], [dnl default limit=10 -zone=0,limit=3,count=0 +zone=1,limit=3,count=0 zone=3,limit=3,count=3 ]) AT_CHECK([ovs-vsctl del-zone-limit $DP_TYPE 3]) OVS_WAIT_UNTIL_EQUAL([ovs-appctl dpctl/ct-get-limits], [dnl default limit=10 -zone=0,limit=3,count=0]) +zone=1,limit=3,count=0]) AT_CHECK([ovs-vsctl set-zone-limit $DP_TYPE default 5]) OVS_WAIT_UNTIL_EQUAL([ovs-appctl dpctl/ct-get-limits], [dnl default limit=5 -zone=0,limit=3,count=0]) +zone=1,limit=3,count=0]) AT_CHECK([ovs-vsctl del-zone-limit $DP_TYPE default]) OVS_WAIT_UNTIL_EQUAL([ovs-appctl dpctl/ct-get-limits], [dnl default limit=0 -zone=0,limit=3,count=0]) +zone=1,limit=3,count=0]) dnl Try to overwrite the zone limit via dpctl command. -AT_CHECK([ovs-appctl dpctl/ct-set-limits default=15 zone=3,limit=5 zone=0,limit=5], [2], [ignore], [dnl +AT_CHECK([ovs-appctl dpctl/ct-set-limits default=15 zone=3,limit=5 zone=1,limit=5], [2], [ignore], [dnl ovs-vswitchd: the zone limits are set via database, dnl use 'ovs-vsctl set-zone-limit <...>' instead. 
(Operation not permitted) ovs-appctl: ovs-vswitchd: server returned an error @@ -5406,10 +5405,10 @@ ovs-appctl: ovs-vswitchd: server returned an error AT_CHECK([ovs-appctl dpctl/ct-get-limits], [0], [dnl default limit=0 -zone=0,limit=3,count=0 +zone=1,limit=3,count=0 ]) -AT_CHECK([ovs-appctl dpctl/ct-del-limits zone=0], [2], [ignore], [dnl +AT_CHECK([ovs-appctl dpctl/ct-del-limits zone=1], [2], [ignore], [dnl ovs-vswitchd: the zone limits are set via database, dnl use 'ovs-vsctl del-zone-limit <...>' instead. (Operation not permitted) ovs-appctl: ovs-vswitchd: server returned an error @@ -5417,10 +5416,10 @@ ovs-appctl: ovs-vswitchd: server returned an error AT_CHECK([ovs-appctl dpctl/ct-get-limits], [0], [dnl default limit=0 -zone=0,limit=3,count=0 +zone=1,limit=3,count=0 ]) -AT_CHECK([ovs-vsctl del-zone-limit $DP_TYPE 0]) +AT_CHECK([ovs-vsctl del-zone-limit $DP_TYPE 1]) AT_CHECK([ovs-vsctl set-zone-limit $DP_TYPE default 10]) AT_CHECK([ovs-appctl dpctl/ct-get-limits], [0], [dnl default limit=10 From cc670e741170972de5da4cfad294991468954dfe Mon Sep 17 00:00:00 2001 From: David Marchand Date: Fri, 1 Dec 2023 15:29:31 +0100 Subject: [PATCH 468/833] system-dpdk: Wait for MTU changes to be applied. Because a DPDK backed netdev configuration is done in an asynchronous way, and a MTU change requires a reconfiguration, directly checking ovs-vswitchd logs or querying ovsdb for the interface current MTU value is racy. Add synchronisation points on the interface MTU value in ovsdb as it ensures that a netdev (re)configuration did happen. With those synchronisation points in place, error messages may be checked in logs afterward. 
Fixes: bf47829116a8 ("tests: Add OVS-DPDK MTU unit tests.") Signed-off-by: David Marchand Acked-by: Kevin Traynor Signed-off-by: Kevin Traynor --- tests/system-dpdk.at | 42 ++++++++++++------------------------------ 1 file changed, 12 insertions(+), 30 deletions(-) diff --git a/tests/system-dpdk.at b/tests/system-dpdk.at index 17742d20a0a..af092a20004 100644 --- a/tests/system-dpdk.at +++ b/tests/system-dpdk.at @@ -511,15 +511,13 @@ dnl Add userspace bridge and attach it to OVS with default MTU value AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk options:dpdk-devargs=$(cat PCI_ADDR)], [], [stdout], [stderr]) AT_CHECK([ovs-vsctl show], [], [stdout]) -sleep 2 dnl Check default MTU value in the datapath -AT_CHECK([ovs-vsctl get Interface phy0 mtu], [0], [dnl -1500 -]) +OVS_WAIT_UNTIL_EQUAL([ovs-vsctl get Interface phy0 mtu], [1500]) dnl Increase MTU value and check in the datapath AT_CHECK([ovs-vsctl set Interface phy0 mtu_request=9000]) +OVS_WAIT_UNTIL_EQUAL([ovs-vsctl get Interface phy0 mtu], [9000]) dnl Fail if MTU is not supported AT_FAIL_IF([grep "Interface phy0 does not support MTU configuration" ovs-vswitchd.log], [], [stdout]) @@ -527,10 +525,6 @@ AT_FAIL_IF([grep "Interface phy0 does not support MTU configuration" ovs-vswitch dnl Fail if error is encountered during MTU setup AT_FAIL_IF([grep "Interface phy0 MTU (9000) setup error" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([ovs-vsctl get Interface phy0 mtu], [0], [dnl -9000 -]) - dnl Clean up AT_CHECK([ovs-vsctl del-port br10 phy0], [], [stdout], [stderr]) @@ -555,7 +549,9 @@ AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk options:dpdk-devargs=$(cat PCI_ADDR)], [], [stdout], [stderr]) AT_CHECK([ovs-vsctl set Interface phy0 mtu_request=9000]) AT_CHECK([ovs-vsctl show], [], [stdout]) -sleep 2 + +dnl Check MTU value in 
the datapath +OVS_WAIT_UNTIL_EQUAL([ovs-vsctl get Interface phy0 mtu], [9000]) dnl Fail if MTU is not supported AT_FAIL_IF([grep "Interface phy0 does not support MTU configuration" ovs-vswitchd.log], [], [stdout]) @@ -563,17 +559,9 @@ AT_FAIL_IF([grep "Interface phy0 does not support MTU configuration" ovs-vswitch dnl Fail if error is encountered during MTU setup AT_FAIL_IF([grep "Interface phy0 MTU (9000) setup error" ovs-vswitchd.log], [], [stdout]) -dnl Check MTU value in the datapath -AT_CHECK([ovs-vsctl get Interface phy0 mtu], [0], [dnl -9000 -]) - dnl Decrease MTU value and check in the datapath AT_CHECK([ovs-vsctl set Interface phy0 mtu_request=2000]) - -AT_CHECK([ovs-vsctl get Interface phy0 mtu], [0], [dnl -2000 -]) +OVS_WAIT_UNTIL_EQUAL([ovs-vsctl get Interface phy0 mtu], [2000]) dnl Clean up @@ -680,7 +668,9 @@ AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk options:dpdk-devargs=$(cat PCI_ADDR)], [], [stdout], [stderr]) AT_CHECK([ovs-vsctl set Interface phy0 mtu_request=9702]) AT_CHECK([ovs-vsctl show], [], [stdout]) -sleep 2 + +dnl Check MTU value in the datapath +OVS_WAIT_UNTIL_EQUAL([ovs-vsctl get Interface phy0 mtu], [9702]) dnl Fail if MTU is not supported AT_FAIL_IF([grep "Interface phy0 does not support MTU configuration" ovs-vswitchd.log], [], [stdout]) @@ -688,11 +678,6 @@ AT_FAIL_IF([grep "Interface phy0 does not support MTU configuration" ovs-vswitch dnl Fail if error is encountered during MTU setup AT_FAIL_IF([grep "Interface phy0 MTU (9702) setup error" ovs-vswitchd.log], [], [stdout]) -dnl Check MTU value in the datapath -AT_CHECK([ovs-vsctl get Interface phy0 mtu], [0], [dnl -9702 -]) - dnl Set MTU value above upper bound and check for error AT_CHECK([ovs-vsctl set Interface phy0 mtu_request=9711]) AT_CHECK([grep "phy0: unsupported MTU 9711" ovs-vswitchd.log], [], [stdout]) @@ -721,7 +706,9 @@ AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 
datapath_type=netdev]) AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk options:dpdk-devargs=$(cat PCI_ADDR)], [], [stdout], [stderr]) AT_CHECK([ovs-vsctl set Interface phy0 mtu_request=68]) AT_CHECK([ovs-vsctl show], [], [stdout]) -sleep 2 + +dnl Check MTU value in the datapath +OVS_WAIT_UNTIL_EQUAL([ovs-vsctl get Interface phy0 mtu], [68]) dnl Fail if MTU is not supported AT_FAIL_IF([grep "Interface phy0 does not support MTU configuration" ovs-vswitchd.log], [], [stdout]) @@ -729,11 +716,6 @@ AT_FAIL_IF([grep "Interface phy0 does not support MTU configuration" ovs-vswitch dnl Fail if error is encountered during MTU setup AT_FAIL_IF([grep "Interface phy0 MTU (68) setup error" ovs-vswitchd.log], [], [stdout]) -dnl Check MTU value in the datapath -AT_CHECK([ovs-vsctl get Interface phy0 mtu], [0], [dnl -68 -]) - dnl Set MTU value below lower bound and check for error AT_CHECK([ovs-vsctl set Interface phy0 mtu_request=67]) AT_CHECK([grep "phy0: unsupported MTU 67" ovs-vswitchd.log], [], [stdout]) From 62c5d32ad4aba692881027d05c6e5e51e8b5df9c Mon Sep 17 00:00:00 2001 From: Ales Musil Date: Wed, 6 Dec 2023 08:53:16 +0100 Subject: [PATCH 469/833] ofp-prop: Add helper for parsing and storing of ovs_u128. Add helper methods that allow us to store and parse the ovs_u128 type. 
Signed-off-by: Ales Musil Signed-off-by: Ilya Maximets --- include/openvswitch/ofp-prop.h | 4 ++++ lib/ofp-prop.c | 42 ++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/include/openvswitch/ofp-prop.h b/include/openvswitch/ofp-prop.h index e676f8dc0f7..afc86a5f701 100644 --- a/include/openvswitch/ofp-prop.h +++ b/include/openvswitch/ofp-prop.h @@ -84,10 +84,12 @@ enum ofperr ofpprop_pull(struct ofpbuf *msg, struct ofpbuf *property, enum ofperr ofpprop_parse_be16(const struct ofpbuf *, ovs_be16 *value); enum ofperr ofpprop_parse_be32(const struct ofpbuf *, ovs_be32 *value); enum ofperr ofpprop_parse_be64(const struct ofpbuf *, ovs_be64 *value); +enum ofperr ofpprop_parse_be128(const struct ofpbuf *, ovs_be128 *value); enum ofperr ofpprop_parse_u8(const struct ofpbuf *, uint8_t *value); enum ofperr ofpprop_parse_u16(const struct ofpbuf *, uint16_t *value); enum ofperr ofpprop_parse_u32(const struct ofpbuf *, uint32_t *value); enum ofperr ofpprop_parse_u64(const struct ofpbuf *, uint64_t *value); +enum ofperr ofpprop_parse_u128(const struct ofpbuf *, ovs_u128 *value); enum ofperr ofpprop_parse_uuid(const struct ofpbuf *, struct uuid *); enum ofperr ofpprop_parse_nested(const struct ofpbuf *, struct ofpbuf *); @@ -98,10 +100,12 @@ void *ofpprop_put_zeros(struct ofpbuf *, uint64_t type, size_t len); void ofpprop_put_be16(struct ofpbuf *, uint64_t type, ovs_be16 value); void ofpprop_put_be32(struct ofpbuf *, uint64_t type, ovs_be32 value); void ofpprop_put_be64(struct ofpbuf *, uint64_t type, ovs_be64 value); +void ofpprop_put_be128(struct ofpbuf *, uint64_t type, ovs_be128 value); void ofpprop_put_u8(struct ofpbuf *, uint64_t type, uint8_t value); void ofpprop_put_u16(struct ofpbuf *, uint64_t type, uint16_t value); void ofpprop_put_u32(struct ofpbuf *, uint64_t type, uint32_t value); void ofpprop_put_u64(struct ofpbuf *, uint64_t type, uint64_t value); +void ofpprop_put_u128(struct ofpbuf *, uint64_t type, ovs_u128 value); void 
ofpprop_put_bitmap(struct ofpbuf *, uint64_t type, uint64_t bitmap); void ofpprop_put_flag(struct ofpbuf *, uint64_t type); void ofpprop_put_uuid(struct ofpbuf *, uint64_t type, const struct uuid *); diff --git a/lib/ofp-prop.c b/lib/ofp-prop.c index 8b2d8a85abe..0a685750c17 100644 --- a/lib/ofp-prop.c +++ b/lib/ofp-prop.c @@ -184,6 +184,20 @@ ofpprop_parse_be64(const struct ofpbuf *property, ovs_be64 *value) return 0; } +/* Attempts to parse 'property' as a property containing a 128-bit value. If + * successful, stores the value into '*value' and returns 0; otherwise returns + * an OpenFlow error. */ +enum ofperr +ofpprop_parse_be128(const struct ofpbuf *property, ovs_be128 *value) +{ + ovs_be128 *p = property->msg; + if (ofpbuf_msgsize(property) != sizeof *p) { + return OFPERR_OFPBPC_BAD_LEN; + } + *value = *p; + return 0; +} + /* Attempts to parse 'property' as a property containing a 8-bit value. If * successful, stores the value into '*value' and returns 0; otherwise returns * an OpenFlow error. */ @@ -250,6 +264,20 @@ ofpprop_parse_u64(const struct ofpbuf *property, uint64_t *value) return 0; } +/* Attempts to parse 'property' as a property containing a 128-bit value. If + * successful, stores the value into '*value' and returns 0; otherwise returns + * an OpenFlow error. */ +enum ofperr +ofpprop_parse_u128(const struct ofpbuf *property, ovs_u128 *value) +{ + ovs_be128 *p = property->msg; + if (ofpbuf_msgsize(property) != sizeof *p) { + return OFPERR_OFPBPC_BAD_LEN; + } + *value = ntoh128(*p); + return 0; +} + /* Attempts to parse 'property' as a property containing a UUID. If * successful, stores the value into '*uuid' and returns 0; otherwise returns * an OpenFlow error. */ @@ -351,6 +379,13 @@ ofpprop_put_be64(struct ofpbuf *msg, uint64_t type, ovs_be64 value) ofpprop_end(msg, start); } +/* Adds a property with the given 'type' and 128-bit 'value' to 'msg'. 
*/ +void +ofpprop_put_be128(struct ofpbuf *msg, uint64_t type, ovs_be128 value) +{ + ofpprop_put(msg, type, &value, sizeof value); +} + /* Adds a property with the given 'type' and 8-bit 'value' to 'msg'. */ void ofpprop_put_u8(struct ofpbuf *msg, uint64_t type, uint8_t value) @@ -381,6 +416,13 @@ ofpprop_put_u64(struct ofpbuf *msg, uint64_t type, uint64_t value) ofpprop_put_be64(msg, type, htonll(value)); } +/* Adds a property with the given 'type' and 128-bit 'value' to 'msg'. */ +void +ofpprop_put_u128(struct ofpbuf *msg, uint64_t type, ovs_u128 value) +{ + ofpprop_put_be128(msg, type, hton128(value)); +} + /* Appends a property to 'msg' whose type is 'type' and whose contents is a * series of property headers, one for each 1-bit in 'bitmap'. */ void From 386deb32cd7028f2d86c933d0d953f194184b984 Mon Sep 17 00:00:00 2001 From: Ales Musil Date: Wed, 6 Dec 2023 08:53:17 +0100 Subject: [PATCH 470/833] dpctl, ovs-ofctl: Unify parsing of ct-flush arguments. In order to make the command extensible unify the arguments parsing into single function. This will be later on used for the mark and labels arguments. 
Signed-off-by: Ales Musil Signed-off-by: Ilya Maximets --- include/openvswitch/ofp-ct.h | 5 ++-- lib/dpctl.c | 41 ++++--------------------------- lib/ofp-ct.c | 47 +++++++++++++++++++++++++++++++++++- tests/system-traffic.at | 2 +- utilities/ovs-ofctl.c | 38 ++++++----------------------- 5 files changed, 62 insertions(+), 71 deletions(-) diff --git a/include/openvswitch/ofp-ct.h b/include/openvswitch/ofp-ct.h index c8023c3097e..cd6192e6ff3 100644 --- a/include/openvswitch/ofp-ct.h +++ b/include/openvswitch/ofp-ct.h @@ -58,8 +58,9 @@ bool ofp_ct_tuple_is_zero(const struct ofp_ct_tuple *, uint8_t ip_proto); bool ofp_ct_tuple_is_five_tuple(const struct ofp_ct_tuple *, uint8_t ip_proto); void ofp_ct_match_format(struct ds *, const struct ofp_ct_match *); -bool ofp_ct_tuple_parse(struct ofp_ct_tuple *, const char *, - struct ds *, uint8_t *ip_proto, uint16_t *l3_type); +bool ofp_ct_match_parse(const char **, int argc, struct ds *, + struct ofp_ct_match *, bool *with_zone, + uint16_t *zone_id); enum ofperr ofp_ct_match_decode(struct ofp_ct_match *, bool *with_zone, uint16_t *zone_id, const struct ofp_header *); diff --git a/lib/dpctl.c b/lib/dpctl.c index 2a1aac5e5f5..7cc9d280568 100644 --- a/lib/dpctl.c +++ b/lib/dpctl.c @@ -1773,48 +1773,17 @@ dpctl_flush_conntrack(int argc, const char *argv[], struct dpif *dpif = NULL; struct ofp_ct_match match = {0}; struct ds ds = DS_EMPTY_INITIALIZER; - uint16_t zone, *pzone = NULL; + uint16_t zone; int error; int args = argc - 1; - int zone_pos = 1; + bool with_zone = false; if (dp_arg_exists(argc, argv)) { args--; - zone_pos = 2; - } - - /* Parse zone. */ - if (args && !strncmp(argv[zone_pos], "zone=", 5)) { - if (!ovs_scan(argv[zone_pos], "zone=%"SCNu16, &zone)) { - ds_put_cstr(&ds, "failed to parse zone"); - error = EINVAL; - goto error; - } - pzone = &zone; - args--; - } - - /* Parse ct tuples. */ - for (int i = 0; i < 2; i++) { - if (!args) { - break; - } - - struct ofp_ct_tuple *tuple = - i ? 
&match.tuple_reply : &match.tuple_orig; - const char *arg = argv[argc - args]; - - if (arg[0] && !ofp_ct_tuple_parse(tuple, arg, &ds, &match.ip_proto, - &match.l3_type)) { - error = EINVAL; - goto error; - } - args--; } - /* Report error if there is more than one unparsed argument. */ - if (args > 0) { - ds_put_cstr(&ds, "invalid arguments"); + if (args && !ofp_ct_match_parse(&argv[argc - args], args, &ds, &match, + &with_zone, &zone)) { error = EINVAL; goto error; } @@ -1825,7 +1794,7 @@ dpctl_flush_conntrack(int argc, const char *argv[], return error; } - error = ct_dpif_flush(dpif, pzone, &match); + error = ct_dpif_flush(dpif, with_zone ? &zone : NULL, &match); if (!error) { dpif_close(dpif); return 0; diff --git a/lib/ofp-ct.c b/lib/ofp-ct.c index a140fba4702..b6500f905ba 100644 --- a/lib/ofp-ct.c +++ b/lib/ofp-ct.c @@ -101,7 +101,7 @@ ofp_ct_match_format(struct ds *ds, const struct ofp_ct_match *match) /* Parses a specification of a conntrack 5-tuple from 's' into 'tuple'. * Returns true on success. Otherwise, returns false and puts the error * message in 'ds'. */ -bool +static bool ofp_ct_tuple_parse(struct ofp_ct_tuple *tuple, const char *s, struct ds *ds, uint8_t *ip_proto, uint16_t *l3_type) { @@ -219,6 +219,51 @@ ofp_ct_tuple_parse(struct ofp_ct_tuple *tuple, const char *s, return false; } +/* Parses a specification of a conntrack match from 'argv' into 'match'. + * Returns true on success. Otherwise, returns false and puts the error + * message in 'ds'. */ +bool +ofp_ct_match_parse(const char **argv, int argc, struct ds *ds, + struct ofp_ct_match *match, bool *with_zone, + uint16_t *zone_id) +{ + int args = argc; + + /* Parse zone. */ + if (args && !strncmp(argv[argc - args], "zone=", 5)) { + if (!ovs_scan(argv[argc - args], "zone=%"SCNu16, zone_id)) { + ds_put_cstr(ds, "failed to parse zone"); + return false; + } + *with_zone = true; + args--; + } + + /* Parse ct tuples. 
*/ + for (int i = 0; i < 2; i++) { + if (!args) { + break; + } + + struct ofp_ct_tuple *tuple = + i ? &match->tuple_reply : &match->tuple_orig; + const char *arg = argv[argc - args]; + + if (arg[0] && !ofp_ct_tuple_parse(tuple, arg, ds, &match->ip_proto, + &match->l3_type)) { + return false; + } + args--; + } + + if (args > 0) { + ds_put_cstr(ds, "invalid arguments"); + return false; + } + + return true; +} + static enum ofperr ofpprop_pull_ipv6(struct ofpbuf *property, struct in6_addr *addr, uint16_t *l3_type) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 69ba6a18ab7..d3f7e0a55d1 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -2859,7 +2859,7 @@ AT_CHECK([ovs-ofctl ct-flush br0 zone=1 'ct_nw_src=10.1.1.1' 'ct_nw_dst=10.1.1.1 AT_CHECK([grep -q "command takes at most 4 arguments" stderr]) AT_CHECK([ovs-ofctl ct-flush br0 'ct_nw_src=10.1.1.1' 'ct_nw_dst=10.1.1.1' invalid], [1], [ignore], [stderr]) -AT_CHECK([grep -q "Invalid arguments" stderr]) +AT_CHECK([grep -q "invalid arguments" stderr]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP diff --git a/utilities/ovs-ofctl.c b/utilities/ovs-ofctl.c index 2d413e2396d..9a18880e6ce 100644 --- a/utilities/ovs-ofctl.c +++ b/utilities/ovs-ofctl.c @@ -3068,43 +3068,19 @@ ofctl_ct_flush(struct ovs_cmdl_context *ctx) struct vconn *vconn; struct ofp_ct_match match = {0}; struct ds ds = DS_EMPTY_INITIALIZER; - uint16_t zone, *pzone = NULL; + uint16_t zone; int args = ctx->argc - 2; + bool with_zone = false; - /* Parse zone. */ - if (args && !strncmp(ctx->argv[2], "zone=", 5)) { - if (!ovs_scan(ctx->argv[2], "zone=%"SCNu16, &zone)) { - ovs_fatal(0, "Failed to parse zone"); - } - pzone = &zone; - args--; - } - - /* Parse ct tuples. */ - for (int i = 0; i < 2; i++) { - if (!args) { - break; - } - - struct ofp_ct_tuple *tuple = - i ? 
&match.tuple_reply : &match.tuple_orig; - const char *arg = ctx->argv[ctx->argc - args]; - - if (arg[0] && !ofp_ct_tuple_parse(tuple, arg, &ds, &match.ip_proto, - &match.l3_type)) { - ovs_fatal(0, "Failed to parse ct-tuple: %s", ds_cstr(&ds)); - } - args--; - } - - if (args > 0) { - ovs_fatal(0, "Invalid arguments"); + if (args && !ofp_ct_match_parse((const char **) &ctx->argv[2], + args, &ds, &match, &with_zone, &zone)) { + ovs_fatal(0, "Failed to parse CT match: %s", ds_cstr(&ds)); } open_vconn(ctx->argv[1], &vconn); enum ofp_version version = vconn_get_version(vconn); - struct ofpbuf *msg = ofp_ct_match_encode(&match, pzone, version); - + struct ofpbuf *msg = ofp_ct_match_encode(&match, with_zone ? &zone : NULL, + version); ds_destroy(&ds); transact_noreply(vconn, msg); vconn_close(vconn); From a095794bcc5ace7050f55593134b58e3ef86e7ec Mon Sep 17 00:00:00 2001 From: Ales Musil Date: Wed, 6 Dec 2023 08:53:18 +0100 Subject: [PATCH 471/833] openflow: Allow CT flush to match on mark and labels. Extend the current NX_CT_FLUSH with four additional fields, that allow to match on CT entry "mark" or "labels". This is encoded as separate TLV values which is backward compatible. Versions that do not support them will fail the command. Extend also the ovs-dpctl and ovs-ofctl command line tools with option to specify those two matching parameters for the "ct-flush" command. 
Reported-at: https://issues.redhat.com/browse/FDP-55 Signed-off-by: Ales Musil Signed-off-by: Ilya Maximets --- NEWS | 6 ++ include/openflow/nicira-ext.h | 4 + include/openvswitch/ofp-ct.h | 9 ++- lib/ct-dpif.c | 12 ++- lib/dpctl.c | 5 +- lib/ofp-ct.c | 135 +++++++++++++++++++++++++++++++++- tests/ofp-print.at | 84 +++++++++++++++++++++ tests/ovs-ofctl.at | 36 +++++++++ tests/system-traffic.at | 112 ++++++++++++++++++---------- utilities/ovs-ofctl.8.in | 15 ++-- utilities/ovs-ofctl.c | 11 ++- 11 files changed, 371 insertions(+), 58 deletions(-) diff --git a/NEWS b/NEWS index 63f2842ae8f..591d5e47005 100644 --- a/NEWS +++ b/NEWS @@ -6,6 +6,10 @@ Post-v3.2.0 from older version is supported but it may trigger more leader elections during the process, and error logs complaining unrecognized fields may be observed on old nodes. + - OpenFlow: + * NXT_CT_FLUSH extension is updated to support flushing connections + based on mark and labels. 'ct-flush' command of ovs-ofctl updated + to support these new arguments accordingly. - ovs-appctl: * 'ofproto/trace' now reports OpenFlow rules that make up a conjunctive flow match. @@ -17,6 +21,8 @@ Post-v3.2.0 Reported names adjusted accordingly. * Added support for removal of default CT zone limit, e.g. "ovs-appctl dpctl/ct-del-limits default". + * 'dpctl/flush-conntrack' is now capable of flushing connections based + on mark and labels. - ovs-vsctl: * New commands 'set-zone-limit', 'del-zone-limit' and 'list-zone-limits' to manage the maximum number of connections in conntrack zones via diff --git a/include/openflow/nicira-ext.h b/include/openflow/nicira-ext.h index 7687758985a..959845ce6d7 100644 --- a/include/openflow/nicira-ext.h +++ b/include/openflow/nicira-ext.h @@ -1075,6 +1075,10 @@ enum nx_ct_flush_tlv_type { * by 'enum nx_ct_flush_tuple_tlv_type'*/ /* Primitive types. */ NXT_CT_ZONE_ID = 2, /* be16 zone id. */ + NXT_CT_MARK = 3, /* be32 mark. */ + NXT_CT_MARK_MASK = 4, /* be32 mark mask. 
*/ + NXT_CT_LABELS = 5, /* be128 labels. */ + NXT_CT_LABELS_MASK = 6, /* be128 labels mask. */ }; /* CT flush nested TLVs. */ diff --git a/include/openvswitch/ofp-ct.h b/include/openvswitch/ofp-ct.h index cd6192e6ff3..d57b6267843 100644 --- a/include/openvswitch/ofp-ct.h +++ b/include/openvswitch/ofp-ct.h @@ -51,11 +51,16 @@ struct ofp_ct_match { struct ofp_ct_tuple tuple_orig; struct ofp_ct_tuple tuple_reply; + + uint32_t mark; + uint32_t mark_mask; + + ovs_u128 labels; + ovs_u128 labels_mask; }; bool ofp_ct_match_is_zero(const struct ofp_ct_match *); -bool ofp_ct_tuple_is_zero(const struct ofp_ct_tuple *, uint8_t ip_proto); -bool ofp_ct_tuple_is_five_tuple(const struct ofp_ct_tuple *, uint8_t ip_proto); +bool ofp_ct_match_is_five_tuple(const struct ofp_ct_match *); void ofp_ct_match_format(struct ds *, const struct ofp_ct_match *); bool ofp_ct_match_parse(const char **, int argc, struct ds *, diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c index 5115c886b1d..5a836b6683f 100644 --- a/lib/ct-dpif.c +++ b/lib/ct-dpif.c @@ -274,6 +274,15 @@ ct_dpif_entry_cmp(const struct ct_dpif_entry *entry, return false; } + if ((match->mark & match->mark_mask) != (entry->mark & match->mark_mask)) { + return false; + } + + if (!ovs_u128_equals(ovs_u128_and(match->labels, match->labels_mask), + ovs_u128_and(entry->labels, match->labels_mask))) { + return false; + } + return true; } @@ -300,8 +309,7 @@ ct_dpif_flush_tuple(struct dpif *dpif, const uint16_t *zone, /* If we have full five tuple in original and empty reply tuple just * do the flush over original tuple directly. 
*/ - if (ofp_ct_tuple_is_five_tuple(&match->tuple_orig, match->ip_proto) && - ofp_ct_tuple_is_zero(&match->tuple_reply, match->ip_proto)) { + if (ofp_ct_match_is_five_tuple(match)) { struct ct_dpif_tuple tuple; ct_dpif_tuple_from_ofp_ct_tuple(&match->tuple_orig, &tuple, diff --git a/lib/dpctl.c b/lib/dpctl.c index 7cc9d280568..34ee7d0e2de 100644 --- a/lib/dpctl.c +++ b/lib/dpctl.c @@ -3005,8 +3005,9 @@ static const struct dpctl_command all_commands[] = { 0, 4, dpctl_dump_conntrack, DP_RO }, { "dump-conntrack-exp", "[dp] [zone=N]", 0, 2, dpctl_dump_conntrack_exp, DP_RO }, - { "flush-conntrack", "[dp] [zone=N] [ct-orig-tuple] [ct-reply-tuple]", - 0, 4, dpctl_flush_conntrack, DP_RW }, + { "flush-conntrack", "[dp] [zone=N] [mark=X[/M]] [labels=Y[/N]] " + "[ct-orig-tuple [ct-reply-tuple]]", + 0, 6, dpctl_flush_conntrack, DP_RW }, { "cache-get-size", "[dp]", 0, 1, dpctl_cache_get_size, DP_RO }, { "cache-set-size", "dp cache ", 3, 3, dpctl_cache_set_size, DP_RW }, { "ct-stats-show", "[dp] [zone=N]", diff --git a/lib/ofp-ct.c b/lib/ofp-ct.c index b6500f905ba..2e12790b434 100644 --- a/lib/ofp-ct.c +++ b/lib/ofp-ct.c @@ -53,7 +53,7 @@ ofp_ct_tuple_format(struct ds *ds, const struct ofp_ct_tuple *tuple, } } -bool +static bool ofp_ct_tuple_is_zero(const struct ofp_ct_tuple *tuple, uint8_t ip_proto) { bool is_zero = ipv6_is_zero(&tuple->src) && ipv6_is_zero(&tuple->dst); @@ -65,7 +65,7 @@ ofp_ct_tuple_is_zero(const struct ofp_ct_tuple *tuple, uint8_t ip_proto) return is_zero; } -bool +static bool ofp_ct_tuple_is_five_tuple(const struct ofp_ct_tuple *tuple, uint8_t ip_proto) { /* First check if we have address. 
*/ @@ -78,17 +78,48 @@ ofp_ct_tuple_is_five_tuple(const struct ofp_ct_tuple *tuple, uint8_t ip_proto) return five_tuple; } +bool +ofp_ct_match_is_five_tuple(const struct ofp_ct_match *match) +{ + return ofp_ct_tuple_is_five_tuple(&match->tuple_orig, match->ip_proto) && + ofp_ct_tuple_is_zero(&match->tuple_reply, match->ip_proto) && + !match->mark_mask && ovs_u128_is_zero(match->labels_mask); +} + bool ofp_ct_match_is_zero(const struct ofp_ct_match *match) { return !match->ip_proto && !match->l3_type && ofp_ct_tuple_is_zero(&match->tuple_orig, match->ip_proto) && - ofp_ct_tuple_is_zero(&match->tuple_reply, match->ip_proto); + ofp_ct_tuple_is_zero(&match->tuple_reply, match->ip_proto) && + !match->mark_mask && ovs_u128_is_zero(match->labels_mask); } void ofp_ct_match_format(struct ds *ds, const struct ofp_ct_match *match) { + if (match->mark_mask) { + ds_put_format(ds, "mark=%#"PRIx32, match->mark); + if (match->mark_mask != UINT32_MAX) { + ds_put_format(ds, "/%#"PRIx32, match->mark_mask); + } + ds_put_char(ds, ' '); + } + + if (!ovs_u128_is_zero(match->labels_mask)) { + ovs_be128 be_value = hton128(match->labels); + ovs_be128 be_mask = hton128(match->labels_mask); + + ds_put_cstr(ds, "labels="); + ds_put_hex(ds, &be_value, sizeof be_value); + + if (!ovs_u128_is_ones(match->labels_mask)) { + ds_put_char(ds, '/'); + ds_put_hex(ds, &be_mask, sizeof be_mask); + } + ds_put_char(ds, ' '); + } + ds_put_cstr(ds, "'"); ofp_ct_tuple_format(ds, &match->tuple_orig, match->ip_proto, match->l3_type); @@ -98,6 +129,23 @@ ofp_ct_match_format(struct ds *ds, const struct ofp_ct_match *match) ds_put_cstr(ds, "'"); } +static inline bool +ofp_ct_masked_parse(const char *s, uint8_t *val, size_t val_len, + uint8_t *mask, size_t mask_len) +{ + char *tail; + if (!parse_int_string(s, val, val_len, &tail)) { + if (*tail != '/' || parse_int_string(tail + 1, mask, + mask_len, &tail)) { + memset(mask, UINT8_MAX, mask_len); + } + + return true; + } + + return false; +} + /* Parses a specification 
of a conntrack 5-tuple from 's' into 'tuple'. * Returns true on success. Otherwise, returns false and puts the error * message in 'ds'. */ @@ -239,6 +287,40 @@ ofp_ct_match_parse(const char **argv, int argc, struct ds *ds, args--; } + /* Parse mark. */ + if (args && !strncmp(argv[argc - args], "mark=", 5)) { + const char *s = argv[argc - args] + 5; + ovs_be32 mark_be; + ovs_be32 mask_be; + + if (ofp_ct_masked_parse(s, (uint8_t *) &mark_be, sizeof mark_be, + (uint8_t *) &mask_be, sizeof mask_be)) { + match->mark = ntohl(mark_be); + match->mark_mask = ntohl(mask_be); + } else { + ds_put_cstr(ds, "failed to parse mark"); + return false; + } + args--; + } + + /* Parse labels. */ + if (args && !strncmp(argv[argc - args], "labels=", 7)) { + const char *s = argv[argc - args] + 7; + ovs_be128 labels_be; + ovs_be128 mask_be; + + if (ofp_ct_masked_parse(s, (uint8_t *) &labels_be, sizeof labels_be, + (uint8_t *) &mask_be, sizeof mask_be)) { + match->labels = ntoh128(labels_be); + match->labels_mask = ntoh128(mask_be); + } else { + ds_put_cstr(ds, "failed to parse labels"); + return false; + } + args--; + } + /* Parse ct tuples. 
*/ for (int i = 0; i < 2; i++) { if (!args) { @@ -389,6 +471,7 @@ enum ofperr ofp_ct_match_decode(struct ofp_ct_match *match, bool *with_zone, uint16_t *zone_id, const struct ofp_header *oh) { + uint32_t tlv_flags = 0; struct ofpbuf msg = ofpbuf_const_initializer(oh, ntohs(oh->length)); ofpraw_pull_assert(&msg); @@ -430,6 +513,22 @@ ofp_ct_match_decode(struct ofp_ct_match *match, bool *with_zone, error = ofpprop_parse_u16(&property, zone_id); break; + case NXT_CT_MARK: + error = ofpprop_parse_u32(&property, &match->mark); + break; + + case NXT_CT_MARK_MASK: + error = ofpprop_parse_u32(&property, &match->mark_mask); + break; + + case NXT_CT_LABELS: + error = ofpprop_parse_u128(&property, &match->labels); + break; + + case NXT_CT_LABELS_MASK: + error = ofpprop_parse_u128(&property, &match->labels_mask); + break; + default: error = OFPPROP_UNKNOWN(false, "NXT_CT_FLUSH", type); break; @@ -438,6 +537,22 @@ ofp_ct_match_decode(struct ofp_ct_match *match, bool *with_zone, if (error) { return error; } + + if (type < (sizeof tlv_flags * CHAR_BIT)) { + tlv_flags |= (UINT32_C(1) << type); + } + } + + /* Consider the mask being all ones if it's not present but the value + * is specified. 
*/ + if (tlv_flags & (UINT32_C(1) << NXT_CT_MARK) && + !(tlv_flags & (UINT32_C(1) << NXT_CT_MARK_MASK))) { + match->mark_mask = UINT32_MAX; + } + + if (tlv_flags & (UINT32_C(1) << NXT_CT_LABELS) && + !(tlv_flags & (UINT32_C(1) << NXT_CT_LABELS_MASK))) { + match->labels_mask = OVS_U128_MAX; } return 0; @@ -461,5 +576,19 @@ ofp_ct_match_encode(const struct ofp_ct_match *match, uint16_t *zone_id, ofpprop_put_u16(msg, NXT_CT_ZONE_ID, *zone_id); } + if (match->mark_mask) { + ofpprop_put_u32(msg, NXT_CT_MARK, match->mark); + if (match->mark_mask != UINT32_MAX) { + ofpprop_put_u32(msg, NXT_CT_MARK_MASK, match->mark_mask); + } + } + + if (!ovs_u128_is_zero(match->labels_mask)) { + ofpprop_put_u128(msg, NXT_CT_LABELS, match->labels); + if (!ovs_u128_is_ones(match->labels_mask)) { + ofpprop_put_u128(msg, NXT_CT_LABELS_MASK, match->labels_mask); + } + } + return msg; } diff --git a/tests/ofp-print.at b/tests/ofp-print.at index 6a07e23c645..b2e69c10038 100644 --- a/tests/ofp-print.at +++ b/tests/ofp-print.at @@ -4093,6 +4093,84 @@ AT_CHECK([ovs-ofctl ofp-print "\ NXT_CT_FLUSH (xid=0x3): zone=13 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0,ct_nw_proto=6' 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0' ]) +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 20 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 03 00 08 00 00 00 ab \ +"], [0], [dnl +NXT_CT_FLUSH (xid=0x3): zone=0 mark=0xab 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0,ct_nw_proto=6' 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0' +]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 20 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 04 00 08 00 00 00 cd \ +"], [0], [dnl +NXT_CT_FLUSH (xid=0x3): zone=0 mark=0/0xcd 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0,ct_nw_proto=6' 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0' +]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 28 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 
00 00 \ +00 03 00 08 00 00 00 ab \ +00 04 00 08 00 00 00 cd \ +"], [0], [dnl +NXT_CT_FLUSH (xid=0x3): zone=0 mark=0xab/0xcd 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0,ct_nw_proto=6' 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0' +]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 30 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 05 00 14 00 00 00 00 00 00 00 00 00 00 00 00 00 ff ab 00 00 00 00 00 \ +"], [0], [dnl +NXT_CT_FLUSH (xid=0x3): zone=0 labels=0xffab00 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0,ct_nw_proto=6' 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0' +]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 30 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 06 00 14 00 00 00 00 00 00 00 00 00 00 00 00 00 ff cd 00 00 00 00 00 \ +"], [0], [dnl +NXT_CT_FLUSH (xid=0x3): zone=0 labels=0/0xffcd00 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0,ct_nw_proto=6' 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0' +]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 48 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 05 00 14 00 00 00 00 00 00 00 00 00 00 00 00 00 ff ab 00 00 00 00 00 \ +00 06 00 14 00 00 00 00 00 00 00 00 00 00 00 00 00 ff cd 00 00 00 00 00 \ +"], [0], [dnl +NXT_CT_FLUSH (xid=0x3): zone=0 labels=0xffab00/0xffcd00 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0,ct_nw_proto=6' 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0' +]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 38 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 03 00 08 00 00 00 ab \ +00 05 00 14 00 00 00 00 00 00 00 00 00 00 00 00 00 ff ab 00 00 00 00 00 \ +"], [0], [dnl +NXT_CT_FLUSH (xid=0x3): zone=0 mark=0xab labels=0xffab00 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0,ct_nw_proto=6' 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0' +]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 58 00 00 00 03 00 00 23 20 00 00 00 
20 \ +06 \ +00 00 00 00 00 00 00 \ +00 03 00 08 00 00 00 ab \ +00 04 00 08 00 00 00 cd \ +00 05 00 14 00 00 00 00 00 00 00 00 00 00 00 00 00 ff ab 00 00 00 00 00 \ +00 06 00 14 00 00 00 00 00 00 00 00 00 00 00 00 00 ff cd 00 00 00 00 00 \ +"], [0], [dnl +NXT_CT_FLUSH (xid=0x3): zone=0 mark=0xab/0xcd labels=0xffab00/0xffcd00 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0,ct_nw_proto=6' 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0' +]) + AT_CHECK([ovs-ofctl ofp-print "\ 01 04 00 68 00 00 00 03 00 00 23 20 00 00 00 20 \ 06 \ @@ -4198,4 +4276,10 @@ AT_CHECK([ovs-ofctl ofp-print "\ "| grep -q OFPBPC_BAD_TYPE], [0], [ignore], [stderr]) AT_CHECK([grep -q "unknown NXT_CT_TUPLE property type 128" stderr], [0]) +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 30 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 06 00 15 00 00 00 00 00 00 00 00 00 00 00 00 00 ff cd 00 00 00 00 00 \ +" | grep -q OFPBPC_BAD_LEN], [0]) AT_CLEANUP diff --git a/tests/ovs-ofctl.at b/tests/ovs-ofctl.at index 8531b2e2eb6..d03d365003b 100644 --- a/tests/ovs-ofctl.at +++ b/tests/ovs-ofctl.at @@ -3307,5 +3307,41 @@ AT_CHECK([ovs-ofctl ct-flush br0]) OVS_WAIT_UNTIL([test $(grep -c "|ct_dpif|DBG|.*ct_flush" ovs-vswitchd.log) -eq 5]) AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: " ovs-vswitchd.log]) +AT_CHECK([ovs-ofctl ct-flush br0 mark=0]) +OVS_WAIT_UNTIL([test $(grep -c "|ct_dpif|DBG|.*ct_flush" ovs-vswitchd.log) -eq 6]) +AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: zone=0 mark=0" ovs-vswitchd.log]) + +AT_CHECK([ovs-ofctl ct-flush br0 mark=0/0x5]) +OVS_WAIT_UNTIL([test $(grep -c "|ct_dpif|DBG|.*ct_flush" ovs-vswitchd.log) -eq 7]) +AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: zone=0 mark=0/0x5" ovs-vswitchd.log]) + +AT_CHECK([ovs-ofctl ct-flush br0 mark=0xabc/0xdef]) +OVS_WAIT_UNTIL([test $(grep -c "|ct_dpif|DBG|.*ct_flush" ovs-vswitchd.log) -eq 8]) +AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: zone=0 mark=0xabc/0xdef" ovs-vswitchd.log]) + +AT_CHECK([ovs-ofctl ct-flush br0 
labels=0]) +OVS_WAIT_UNTIL([test $(grep -c "|ct_dpif|DBG|.*ct_flush" ovs-vswitchd.log) -eq 9]) +AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: zone=0 labels=0" ovs-vswitchd.log]) + +AT_CHECK([ovs-ofctl ct-flush br0 labels=0/0x5]) +OVS_WAIT_UNTIL([test $(grep -c "|ct_dpif|DBG|.*ct_flush" ovs-vswitchd.log) -eq 10]) +AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: zone=0 labels=0/0x5" ovs-vswitchd.log]) + +AT_CHECK([ovs-ofctl ct-flush br0 labels=0xabc/0xdef]) +OVS_WAIT_UNTIL([test $(grep -c "|ct_dpif|DBG|.*ct_flush" ovs-vswitchd.log) -eq 11]) +AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: zone=0 labels=0xabc/0xdef" ovs-vswitchd.log]) + +AT_CHECK([ovs-ofctl ct-flush br0 zone=5 mark=25 labels=25]) +OVS_WAIT_UNTIL([test $(grep -c "|ct_dpif|DBG|.*ct_flush" ovs-vswitchd.log) -eq 12]) +AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: zone=5 mark=0x19 labels=0x19" ovs-vswitchd.log]) + +AT_CHECK([ovs-ofctl ct-flush br0 zone=5 mark=30/25 labels=30/25]) +OVS_WAIT_UNTIL([test $(grep -c "|ct_dpif|DBG|.*ct_flush" ovs-vswitchd.log) -eq 13]) +AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: zone=5 mark=0x1e/0x19 labels=0x1e/0x19" ovs-vswitchd.log]) + +AT_CHECK([ovs-ofctl ct-flush br0 zone=6 mark=30/0 labels=30/0]) +OVS_WAIT_UNTIL([test $(grep -c "|ct_dpif|DBG|.*ct_flush" ovs-vswitchd.log) -eq 14]) +AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: zone 6" ovs-vswitchd.log]) + OVS_VSWITCHD_STOP AT_CLEANUP diff --git a/tests/system-traffic.at b/tests/system-traffic.at index d3f7e0a55d1..0d6f8cf21af 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -2649,8 +2649,8 @@ ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") AT_DATA([flows.txt], [dnl priority=1,action=drop priority=10,arp,action=normal -priority=100,in_port=1,ip,action=ct(commit),2 -priority=100,in_port=2,ip,action=ct(zone=5,commit),1 +priority=100,in_port=1,ip,action=ct(commit,exec(set_field:0xaa->ct_mark)),2 +priority=100,in_port=2,ip,action=ct(zone=5,commit,exec(set_field:0xaa00000000->ct_label)),1 ]) AT_CHECK([ovs-ofctl --bundle add-flows br0 
flows.txt]) @@ -2665,7 +2665,7 @@ dnl Test UDP from port 1 AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=10\.1\.1\.1,"], [], [dnl -udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1) +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 ]) AT_CHECK([FLUSH_CMD 'ct_nw_src=10.1.1.2,ct_nw_dst=10.1.1.1,ct_nw_proto=17,ct_tp_src=2,ct_tp_dst=1']) @@ -2677,7 +2677,7 @@ dnl Test UDP from port 2 AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=10\.1\.1\.2,"], [0], [dnl -udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD zone=5 'ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2,ct_nw_proto=17,ct_tp_src=1,ct_tp_dst=2']) @@ -2691,7 +2691,7 @@ NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -W 2 10.1.1.1 | FORMAT_PING], [0], AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=10\.1\.1\.2,"], [0], [stdout]) AT_CHECK([cat stdout | FORMAT_CT(10.1.1.1)], [0],[dnl -icmp,orig=(src=10.1.1.2,dst=10.1.1.1,id=,type=8,code=0),reply=(src=10.1.1.1,dst=10.1.1.2,id=,type=0,code=0),zone=5 +icmp,orig=(src=10.1.1.2,dst=10.1.1.1,id=,type=8,code=0),reply=(src=10.1.1.1,dst=10.1.1.2,id=,type=0,code=0),zone=5,labels=0xaa00000000 ]) ICMP_ID=`cat stdout | cut -d ',' -f4 | cut -d '=' -f2` @@ -2707,14 +2707,14 @@ AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a5 AT_CHECK([ovs-appctl 
dpctl/dump-conntrack | grep "10\.1\.1\.1" | sort], [0], [dnl -udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1) -udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD 'ct_nw_proto=17,ct_tp_src=1']) AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl -udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD 'ct_nw_proto=17,ct_tp_src=2']) @@ -2727,14 +2727,14 @@ AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a5 AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1" | sort], [0], [dnl -udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1) -udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD 'ct_nw_proto=17,ct_tp_dst=2']) AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl -udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD 
'ct_nw_proto=17,ct_tp_dst=1']) @@ -2747,14 +2747,14 @@ AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a5 AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1" | sort], [0], [dnl -udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1) -udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD 'ct_nw_src=10.1.1.1']) AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl -udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD 'ct_nw_src=10.1.1.2']) @@ -2767,14 +2767,14 @@ AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a5 AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1" | sort], [0], [dnl -udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1) -udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD 'ct_nw_dst=10.1.1.2']) AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl -udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 
+udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD 'ct_nw_dst=10.1.1.1']) @@ -2787,14 +2787,14 @@ AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a5 AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1" | sort], [0], [dnl -udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1) -udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD '' 'ct_nw_src=10.1.1.2']) AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl -udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD zone=5 '' 'ct_nw_src=10.1.1.1']) @@ -2807,8 +2807,8 @@ AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a5 AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1" | sort], [0], [dnl -udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1) -udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD]) @@ -2820,46 +2820,80 @@ AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 
packet=50540000000a5 AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000950540000000a08004500003400010000408464410a0101020a010101000200010000000098f29e470100001470e18ccc00000000000a000a00000000 actions=resubmit(,0)"]) AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1" | sed "s/,protoinfo=.*$//" | sort], [0], [dnl -sctp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1) -sctp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 +sctp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 +sctp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD 'ct_nw_src=10.1.1.1,ct_nw_proto=132,ct_tp_src=1,ct_tp_dst=2']) AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1" | sed "s/,protoinfo=.*$//" | sort], [0], [dnl -sctp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 +sctp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD 'ct_nw_src=10.1.1.2,ct_nw_proto=132,ct_tp_src=2,ct_tp_dst=1']) AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [1]) + +dnl Test UDP from port 1 and 2, partial flush by mark and labels. 
+AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1" | sort], [0], [dnl +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 +]) + +AT_CHECK([FLUSH_CMD mark=0xaa]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 +]) + +AT_CHECK([FLUSH_CMD labels=0xaa00000000]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [1]) + +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1" | sort], [0], [dnl +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 +]) + +AT_CHECK([FLUSH_CMD mark=2/2]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) -dnl Test 
flush with invalid arguments +AT_CHECK([FLUSH_CMD labels=0x0200000000/0x0200000000]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [1]) + +dnl Test flush with invalid arguments. -AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=invalid 'ct_nw_src=10.1.1.1' 'ct_nw_dst=10.1.1.1'], [2], [ignore], [stderr]) +AT_CHECK([FLUSH_CMD zone=invalid 'ct_nw_src=10.1.1.1' 'ct_nw_dst=10.1.1.1'], [ignore], [ignore], [stderr]) AT_CHECK([grep -q "failed to parse zone" stderr]) -AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=1 'ct_nw_src=10.1.1.1,invalid=invalid' 'ct_nw_dst=10.1.1.1'], [2], [ignore], [stderr]) +AT_CHECK([FLUSH_CMD zone=1 'ct_nw_src=10.1.1.1,invalid=invalid' 'ct_nw_dst=10.1.1.1'], [ignore], [ignore], [stderr]) AT_CHECK([grep -q "invalid conntrack tuple field: invalid" stderr]) -AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=1 'ct_nw_src=invalid' 'ct_nw_dst=10.1.1.1'], [2], [ignore], [stderr]) +AT_CHECK([FLUSH_CMD zone=1 'ct_nw_src=invalid' 'ct_nw_dst=10.1.1.1'], [ignore], [ignore], [stderr]) AT_CHECK([grep -q "failed to parse field ct_nw_src" stderr]) -AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=1 'ct_nw_src=10.1.1.1' 'ct_nw_dst=10.1.1.1' invalid], [2], [ignore], [stderr]) +AT_CHECK([FLUSH_CMD zone=1 'ct_nw_src=10.1.1.1' 'ct_nw_dst=10.1.1.1' invalid], [ignore], [ignore], [stderr]) AT_CHECK([grep -q "invalid arguments" stderr]) -AT_CHECK([ovs-appctl dpctl/flush-conntrack $dp zone=1 'ct_nw_src=10.1.1.1' 'ct_nw_dst=10.1.1.1' invalid], [2], [ignore], [stderr]) -AT_CHECK([grep -q "command takes at most 4 arguments" stderr]) +AT_CHECK([FLUSH_CMD zone=1 mark=1 labels=1 'ct_nw_src=10.1.1.1' 'ct_nw_dst=10.1.1.1' invalid invalid], [ignore], [ignore], [stderr]) +AT_CHECK([grep -q "command takes at most 6 arguments" stderr]) -AT_CHECK([ovs-appctl dpctl/flush-conntrack $dp 'ct_nw_src=10.1.1.1' 'ct_nw_dst=10.1.1.1' invalid], [2], [ignore], [stderr]) -AT_CHECK([grep -q "invalid arguments" stderr]) - -AT_CHECK([ovs-ofctl ct-flush br0 zone=1 
'ct_nw_src=10.1.1.1' 'ct_nw_dst=10.1.1.1' invalid], [1], [ignore], [stderr]) -AT_CHECK([grep -q "command takes at most 4 arguments" stderr]) +AT_CHECK([FLUSH_CMD mark=invalid], [ignore], [ignore], [stderr]) +AT_CHECK([grep -q "failed to parse mark" stderr]) -AT_CHECK([ovs-ofctl ct-flush br0 'ct_nw_src=10.1.1.1' 'ct_nw_dst=10.1.1.1' invalid], [1], [ignore], [stderr]) -AT_CHECK([grep -q "invalid arguments" stderr]) +AT_CHECK([FLUSH_CMD labels=invalid], [ignore], [ignore], [stderr]) +AT_CHECK([grep -q "failed to parse labels" stderr]) +]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP diff --git a/utilities/ovs-ofctl.8.in b/utilities/ovs-ofctl.8.in index 0a611b2ee23..d0f99f2bb92 100644 --- a/utilities/ovs-ofctl.8.in +++ b/utilities/ovs-ofctl.8.in @@ -296,17 +296,19 @@ Flushes the connection tracking entries in \fIzone\fR on \fIswitch\fR. This command uses an Open vSwitch extension that is only in Open vSwitch 2.6 and later. . -.IP "\fBct\-flush \fIswitch [zone=N] [ct-orig-tuple [ct-reply-tuple]]\fR -Flushes the connection entries on \fIswitch\fR based on \fIzone\fR and -connection tracking tuples \fIct-[orig|reply]-tuple\fR. +.IP "\fBct\-flush \fIswitch [zone=N] [mark=X[/M]] [labels=Y[/N]] [ct-orig-tuple [ct-reply-tuple]]\fR +Flushes the connection entries on \fIswitch\fR based on \fIzone\fR, \fImark\fR, +\fIlabels\fR and connection tracking tuples \fIct-[orig|reply]-tuple\fR. .IP If \fIct-[orig|reply]-tuple\fR is not provided, flushes all the connection entries. If \fIzone\fR is specified, only flushes the connections in -\fIzone\fR. +\fIzone\fR. If \fImark\fR or \fIlabels\fR is provided, it will flush +only entries that are matching specific \fImark/labels\fR. .IP If \fIct-[orig|reply]-tuple\fR is provided, flushes the connection entry specified by \fIct-[orig|reply]-tuple\fR in \fIzone\fR. The zone defaults -to 0 if it is not provided. The userspace connection tracker requires flushing +to 0 if it is not provided.
The \fImark\fR and \fIlabels\fR default to "0/0" +if they are not provided. The userspace connection tracker requires flushing with the original pre-NATed tuple and a warning log will be otherwise generated. The tuple can be partial and will remove all connections that are matching on the specified fields. In order to specify only @@ -325,7 +327,8 @@ An example of an IPv6 TCP \fIct-[orig|reply]-tuple\fR: "ct_ipv6_src=fc00::1,ct_ipv6_dst=fc00::2,ct_nw_proto=6,ct_tp_src=1,ct_tp_dst=2" .IP This command uses an Open vSwitch extension that is only in Open vSwitch 3.1 -and later. +and later. Support for matching on \fImark\fR and \fIlabels\fR is only in +Open vSwitch 3.3 and later. . .SS "OpenFlow Switch Flow Table Commands" . diff --git a/utilities/ovs-ofctl.c b/utilities/ovs-ofctl.c index 9a18880e6ce..ba3458e55ad 100644 --- a/utilities/ovs-ofctl.c +++ b/utilities/ovs-ofctl.c @@ -494,9 +494,11 @@ usage(void) " dump-ipfix-bridge SWITCH print ipfix stats of bridge\n" " dump-ipfix-flow SWITCH print flow ipfix of a bridge\n" " ct-flush-zone SWITCH ZONE flush conntrack entries in ZONE\n" - " ct-flush SWITCH [ZONE] [CT_ORIG_TUPLE [CT_REPLY_TUPLE]]\n" + " ct-flush SWITCH [ZONE] [mark=X[/M]] [labels=Y[/N]]\n" + " [CT_ORIG_TUPLE [CT_REPLY_TUPLE]]\n" " flush conntrack entries specified\n" - " by CT_ORIG/REPLY_TUPLE and ZONE\n" + " by CT_ORIG/REPLY_TUPLE, ZONE, mark\n" + " and labels\n" "\nFor OpenFlow switches and controllers:\n" " probe TARGET probe whether TARGET is up\n" " ping TARGET [N] latency of N-byte echos\n" @@ -5122,8 +5124,9 @@ static const struct ovs_cmdl_command all_commands[] = { { "ct-flush-zone", "switch zone", 2, 2, ofctl_ct_flush_zone, OVS_RW }, - { "ct-flush", "switch [zone=N] [ct-orig-tuple [ct-reply-tuple]]", - 1, 4, ofctl_ct_flush, OVS_RW }, + { "ct-flush", "switch [zone=N] [mark=X[/M]] [labels=Y[/N]] " + "[ct-orig-tuple [ct-reply-tuple]]", + 1, 6, ofctl_ct_flush, OVS_RW }, { "ofp-parse", "file", 1, 1, ofctl_ofp_parse, OVS_RW }, From
4cbbf56e6cccd9b2593bc77395c7ca2e78f34191 Mon Sep 17 00:00:00 2001 From: Kevin Traynor Date: Thu, 14 Dec 2023 11:15:15 +0000 Subject: [PATCH 472/833] dpif-netdev: Add per PMD sleep config. Extend 'pmd-sleep-max' so that individual PMD thread cores may have a specified max sleep request value. Existing behaviour is maintained. Any PMD thread core without a value will use the global value if set or default no sleep. To set PMD thread cores 8 and 9 to never request a load based sleep and all other PMD thread cores to be able to request a max sleep of 50 usecs: $ ovs-vsctl set open_vswitch . other_config:pmd-sleep-max=50,8:0,9:0 To set PMD thread cores 10 and 11 to request a max sleep of 100 usecs and all other PMD thread cores to never request a sleep: $ ovs-vsctl set open_vswitch . other_config:pmd-sleep-max=10:100,11:100 'pmd-sleep-show' is updated to show the max sleep value for each PMD thread. Signed-off-by: Kevin Traynor Signed-off-by: Ilya Maximets --- Documentation/topics/dpdk/pmd.rst | 34 ++- NEWS | 4 + lib/dpif-netdev-private-thread.h | 3 + lib/dpif-netdev.c | 270 ++++++++++++++++++++--- tests/pmd.at | 352 ++++++++++++++++++++++++++++-- vswitchd/vswitch.xml | 31 ++- 6 files changed, 644 insertions(+), 50 deletions(-) diff --git a/Documentation/topics/dpdk/pmd.rst b/Documentation/topics/dpdk/pmd.rst index f43819be041..2e8cf5edb87 100644 --- a/Documentation/topics/dpdk/pmd.rst +++ b/Documentation/topics/dpdk/pmd.rst @@ -353,10 +353,6 @@ and can differ significantly depending on system configuration. The actual time not processing packets will be determined by the sleep and processor wake-up times and should be tested with each system configuration. -The current configuration of the PMD load based sleeping can be shown with:: - - $ ovs-appctl dpif-netdev/pmd-sleep-show - Sleep time statistics for 10 secs can be seen with:: $ ovs-appctl dpif-netdev/pmd-stats-clear \ @@ -379,5 +375,35 @@ system configuration (e.g. enabling processor C-states) and workloads. 
extra latency before the PMD thread returns to processing packets at full rate. +Maximum sleep values can also be set for individual PMD threads using +key:value pairs in the form of core:max_sleep. Any PMD thread that has been +assigned a specified value will use that. Any PMD thread that does not have +a specified value will use the current global value. + +Specified values for individual PMD threads can be added or removed at +any time. + +For example, to set PMD threads on cores 8 and 9 to never request a load based +sleep and all other PMD threads to be able to request a max sleep of +50 microseconds (us):: + + $ ovs-vsctl set open_vswitch . other_config:pmd-sleep-max=50,8:0,9:0 + +The max sleep value for each PMD thread can be checked in the logs or with:: + + $ ovs-appctl dpif-netdev/pmd-sleep-show + pmd thread numa_id 0 core_id 8: + max sleep: 0 us + pmd thread numa_id 1 core_id 9: + max sleep: 0 us + pmd thread numa_id 0 core_id 10: + max sleep: 50 us + pmd thread numa_id 1 core_id 11: + max sleep: 50 us + pmd thread numa_id 0 core_id 12: + max sleep: 50 us + pmd thread numa_id 1 core_id 13: + max sleep: 50 us + .. _ovs-vswitchd(8): http://openvswitch.org/support/dist-docs/ovs-vswitchd.8.html diff --git a/NEWS b/NEWS index 591d5e47005..270ed667340 100644 --- a/NEWS +++ b/NEWS @@ -32,6 +32,10 @@ Post-v3.2.0 * Added support for Generic Segmentation Offloading for the cases where TSO is enabled but not supported by an egress interface (except for tunnel interfaces). + * 'pmd-sleep-max' is updated to also accept pmd-thread-core:sleep-max. + The existing behaviour is maintained and a non key:value pair value + will be applied to all other PMD thread cores. 'pmd-sleep-show' is + updated to show the maximum sleep for each PMD thread core.
v3.2.0 - 17 Aug 2023 diff --git a/lib/dpif-netdev-private-thread.h b/lib/dpif-netdev-private-thread.h index 1ec3cd79470..8715b383796 100644 --- a/lib/dpif-netdev-private-thread.h +++ b/lib/dpif-netdev-private-thread.h @@ -180,6 +180,9 @@ struct dp_netdev_pmd_thread { int numa_id; /* numa node id of this pmd thread. */ bool isolated; + /* Max sleep request in microseconds. */ + atomic_uint64_t max_sleep; + /* Queue id used by this pmd thread to send packets on all netdevs if * XPS disabled for this netdev. All static_tx_qid's are unique and less * than 'cmap_count(dp->poll_threads)'. */ diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 9a59a1b03c2..0aea9d0b8bf 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -179,6 +179,11 @@ static struct odp_support dp_netdev_support = { /* Time in uS to increment a pmd thread sleep time. */ #define PMD_SLEEP_INC_US 1 +struct pmd_sleep { + unsigned core_id; + uint64_t max_sleep; +}; + struct dpcls { struct cmap_node node; /* Within dp_netdev_pmd_thread.classifiers */ odp_port_t in_port; @@ -287,8 +292,8 @@ struct dp_netdev { atomic_uint32_t emc_insert_min; /* Enable collection of PMD performance metrics. */ atomic_bool pmd_perf_metrics; - /* Max load based sleep request. */ - atomic_uint64_t pmd_max_sleep; + /* Default max load based sleep request. */ + uint64_t pmd_max_sleep_default; /* Enable the SMC cache from ovsdb config */ atomic_bool smc_enable_db; @@ -326,6 +331,9 @@ struct dp_netdev { /* Cpu mask for pin of pmd threads. */ char *pmd_cmask; + /* PMD max load based sleep request user string. 
*/ + char *max_sleep_list; + uint64_t last_tnl_conf_seq; struct conntrack *conntrack; @@ -1428,6 +1436,19 @@ dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc, ds_destroy(&reply); } +static void +pmd_info_show_sleep(struct ds *reply, unsigned core_id, int numa_id, + uint64_t pmd_max_sleep) +{ + if (core_id == NON_PMD_CORE_ID) { + return; + } + ds_put_format(reply, + "pmd thread numa_id %d core_id %d:\n" + " max sleep: %4"PRIu64" us\n", + numa_id, core_id, pmd_max_sleep); +} + static void dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[], void *aux) @@ -1442,9 +1463,8 @@ dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[], unsigned int secs = 0; unsigned long long max_secs = (PMD_INTERVAL_LEN * PMD_INTERVAL_MAX) / INTERVAL_USEC_TO_SEC; - uint64_t default_max_sleep = 0; bool show_header = true; - + uint64_t max_sleep; ovs_mutex_lock(&dp_netdev_mutex); @@ -1512,12 +1532,13 @@ dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[], pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux); } else if (type == PMD_INFO_SLEEP_SHOW) { if (show_header) { - atomic_read_relaxed(&dp->pmd_max_sleep, &default_max_sleep); - ds_put_format(&reply, "Default max sleep: %4"PRIu64" us", - default_max_sleep); - ds_put_cstr(&reply, "\n"); + ds_put_format(&reply, "Default max sleep: %4"PRIu64" us\n", + dp->pmd_max_sleep_default); show_header = false; } + atomic_read_relaxed(&pmd->max_sleep, &max_sleep); + pmd_info_show_sleep(&reply, pmd->core_id, pmd->numa_id, + max_sleep); } } free(pmd_list); @@ -1906,6 +1927,8 @@ create_dp_netdev(const char *name, const struct dpif_class *class, return error; } + dp->max_sleep_list = NULL; + dp->last_tnl_conf_seq = seq_read(tnl_conf_seq); *dpp = dp; return 0; @@ -2015,6 +2038,7 @@ dp_netdev_free(struct dp_netdev *dp) dp_netdev_meter_destroy(dp); + free(dp->max_sleep_list); free(dp->pmd_cmask); free(CONST_CAST(char *, dp->name)); free(dp); @@ -4847,6 +4871,209 @@ 
set_pmd_auto_lb(struct dp_netdev *dp, bool state, bool always_log) } } +static int +parse_pmd_sleep_list(const char *max_sleep_list, + struct pmd_sleep **pmd_sleeps) +{ + char *list, *copy, *key, *value; + int num_vals = 0; + + if (!max_sleep_list) { + return num_vals; + } + + list = copy = xstrdup(max_sleep_list); + + while (ofputil_parse_key_value(&list, &key, &value)) { + uint64_t temp, pmd_max_sleep; + char *error = NULL; + unsigned core; + int i; + + error = str_to_u64(key, &temp); + if (error) { + free(error); + continue; + } + + if (value[0] == '\0') { + /* No value specified. key is dp default. */ + core = UINT_MAX; + pmd_max_sleep = temp; + } else { + error = str_to_u64(value, &pmd_max_sleep); + if (!error && temp < UINT_MAX) { + /* Key is pmd core id. */ + core = (unsigned) temp; + } else { + free(error); + continue; + } + } + + /* Detect duplicate max sleep values. */ + for (i = 0; i < num_vals; i++) { + if ((*pmd_sleeps)[i].core_id == core) { + break; + } + } + if (i == num_vals) { + /* Not duplicate, add a new entry. */ + *pmd_sleeps = xrealloc(*pmd_sleeps, + (num_vals + 1) * sizeof **pmd_sleeps); + num_vals++; + } + + pmd_max_sleep = MIN(PMD_RCU_QUIESCE_INTERVAL, pmd_max_sleep); + + (*pmd_sleeps)[i].core_id = core; + (*pmd_sleeps)[i].max_sleep = pmd_max_sleep; + } + + free(copy); + return num_vals; +} + +static void +log_pmd_sleep(unsigned core_id, int numa_id, uint64_t pmd_max_sleep) +{ + if (core_id == NON_PMD_CORE_ID) { + return; + } + VLOG_INFO("PMD thread on numa_id: %d, core id: %2d, " + "max sleep: %4"PRIu64" us.", numa_id, core_id, pmd_max_sleep); +} + +static void +pmd_init_max_sleep(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd) +{ + uint64_t max_sleep = dp->pmd_max_sleep_default; + struct pmd_sleep *pmd_sleeps = NULL; + int num_vals; + + num_vals = parse_pmd_sleep_list(dp->max_sleep_list, &pmd_sleeps); + + /* Check if the user has set a specific value for this pmd. 
*/ + for (int i = 0; i < num_vals; i++) { + if (pmd_sleeps[i].core_id == pmd->core_id) { + max_sleep = pmd_sleeps[i].max_sleep; + break; + } + } + atomic_init(&pmd->max_sleep, max_sleep); + log_pmd_sleep(pmd->core_id, pmd->numa_id, max_sleep); + free(pmd_sleeps); +} + +static bool +assign_sleep_values_to_pmds(struct dp_netdev *dp, int num_vals, + struct pmd_sleep *pmd_sleeps) +{ + struct dp_netdev_pmd_thread *pmd; + bool value_changed = false; + + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { + uint64_t new_max_sleep, cur_pmd_max_sleep; + + if (pmd->core_id == NON_PMD_CORE_ID) { + continue; + } + + /* Default to global value. */ + new_max_sleep = dp->pmd_max_sleep_default; + + /* Check for pmd specific value. */ + for (int i = 0; i < num_vals; i++) { + if (pmd->core_id == pmd_sleeps[i].core_id) { + new_max_sleep = pmd_sleeps[i].max_sleep; + break; + } + } + atomic_read_relaxed(&pmd->max_sleep, &cur_pmd_max_sleep); + if (new_max_sleep != cur_pmd_max_sleep) { + atomic_store_relaxed(&pmd->max_sleep, new_max_sleep); + value_changed = true; + } + } + return value_changed; +} + +static void +log_all_pmd_sleeps(struct dp_netdev *dp) +{ + struct dp_netdev_pmd_thread **pmd_list = NULL; + struct dp_netdev_pmd_thread *pmd; + size_t n; + + VLOG_INFO("Default PMD thread max sleep: %4"PRIu64" us.", + dp->pmd_max_sleep_default); + + sorted_poll_thread_list(dp, &pmd_list, &n); + + for (size_t i = 0; i < n; i++) { + uint64_t cur_pmd_max_sleep; + + pmd = pmd_list[i]; + atomic_read_relaxed(&pmd->max_sleep, &cur_pmd_max_sleep); + log_pmd_sleep(pmd->core_id, pmd->numa_id, cur_pmd_max_sleep); + } + free(pmd_list); +} + +static bool +set_all_pmd_max_sleeps(struct dp_netdev *dp, const struct smap *config) +{ + const char *max_sleep_list = smap_get(config, "pmd-sleep-max"); + struct pmd_sleep *pmd_sleeps = NULL; + uint64_t default_max_sleep = 0; + bool default_changed = false; + bool pmd_changed = false; + uint64_t pmd_maxsleep; + int num_vals = 0; + + /* Check for deprecated 
'pmd-maxsleep' value. */ + pmd_maxsleep = smap_get_ullong(config, "pmd-maxsleep", UINT64_MAX); + if (pmd_maxsleep != UINT64_MAX && !max_sleep_list) { + VLOG_WARN_ONCE("pmd-maxsleep is deprecated. " + "Please use pmd-sleep-max instead."); + default_max_sleep = pmd_maxsleep; + } + + /* Check if there is no change in string or value. */ + if (!!dp->max_sleep_list == !!max_sleep_list) { + if (max_sleep_list + ? nullable_string_is_equal(max_sleep_list, dp->max_sleep_list) + : default_max_sleep == dp->pmd_max_sleep_default) { + return false; + } + } + + /* Free existing string and copy new one (if any). */ + free(dp->max_sleep_list); + dp->max_sleep_list = nullable_xstrdup(max_sleep_list); + + if (max_sleep_list) { + num_vals = parse_pmd_sleep_list(max_sleep_list, &pmd_sleeps); + + /* Check if the user has set a global value. */ + for (int i = 0; i < num_vals; i++) { + if (pmd_sleeps[i].core_id == UINT_MAX) { + default_max_sleep = pmd_sleeps[i].max_sleep; + break; + } + } + } + + if (dp->pmd_max_sleep_default != default_max_sleep) { + dp->pmd_max_sleep_default = default_max_sleep; + default_changed = true; + } + pmd_changed = assign_sleep_values_to_pmds(dp, num_vals, pmd_sleeps); + + free(pmd_sleeps); + return default_changed || pmd_changed; +} + /* Applies datapath configuration from the database. Some of the changes are * actually applied in dpif_netdev_run(). 
*/ static int @@ -4864,7 +5091,6 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config) uint64_t rebalance_intvl; uint8_t cur_rebalance_load; uint32_t rebalance_load, rebalance_improve; - uint64_t pmd_max_sleep, cur_pmd_max_sleep; bool log_autolb = false; enum sched_assignment_type pmd_rxq_assign_type; static bool first_set_config = true; @@ -5015,26 +5241,12 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config) set_pmd_auto_lb(dp, autolb_state, log_autolb); - pmd_max_sleep = smap_get_ullong(other_config, "pmd-maxsleep", UINT64_MAX); - if (pmd_max_sleep != UINT64_MAX) { - VLOG_WARN("pmd-maxsleep is deprecated. " - "Please use pmd-sleep-max instead."); - } else { - pmd_max_sleep = 0; + bool sleep_changed = set_all_pmd_max_sleeps(dp, other_config); + if (first_set_config || sleep_changed) { + log_all_pmd_sleeps(dp); } - pmd_max_sleep = smap_get_ullong(other_config, "pmd-sleep-max", - pmd_max_sleep); - pmd_max_sleep = MIN(PMD_RCU_QUIESCE_INTERVAL, pmd_max_sleep); - atomic_read_relaxed(&dp->pmd_max_sleep, &cur_pmd_max_sleep); - if (first_set_config || pmd_max_sleep != cur_pmd_max_sleep) { - atomic_store_relaxed(&dp->pmd_max_sleep, pmd_max_sleep); - VLOG_INFO("PMD max sleep request is %"PRIu64" usecs.", pmd_max_sleep); - VLOG_INFO("PMD load based sleeps are %s.", - pmd_max_sleep ? 
"enabled" : "disabled" ); - } - - first_set_config = false; + first_set_config = false; return 0; } @@ -7063,7 +7275,7 @@ pmd_thread_main(void *f_) pmd_perf_start_iteration(s); atomic_read_relaxed(&pmd->dp->smc_enable_db, &pmd->ctx.smc_enable_db); - atomic_read_relaxed(&pmd->dp->pmd_max_sleep, &max_sleep); + atomic_read_relaxed(&pmd->max_sleep, &max_sleep); for (i = 0; i < poll_cnt; i++) { @@ -7650,6 +7862,8 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp, hmap_init(&pmd->send_port_cache); cmap_init(&pmd->tx_bonds); + pmd_init_max_sleep(dp, pmd); + /* Initialize DPIF function pointer to the default configured version. */ atomic_init(&pmd->netdev_input_func, dp_netdev_impl_get_default()); diff --git a/tests/pmd.at b/tests/pmd.at index 06cc90477b0..35a44b4dfee 100644 --- a/tests/pmd.at +++ b/tests/pmd.at @@ -60,20 +60,32 @@ m4_define([CHECK_PMD_THREADS_CREATED], [ fi ]) -dnl CHECK_DP_SLEEP_MAX([max_sleep], [enabled], [+line]) +dnl CHECK_DP_SLEEP_MAX([max_sleep], [+line]) dnl -dnl Checks correct pmd load based sleep is set for the datapath. +dnl Checks correct pmd load based sleep value for the datapath. dnl Checking starts from line number 'line' in ovs-vswithd.log . m4_define([CHECK_DP_SLEEP_MAX], [ - SLEEP_TIME="PMD max sleep request is $1 usecs." - SLEEP_STATE="PMD load based sleeps are $2." - line_st=$3 + SLEEP_TIME="Default PMD thread max sleep: *[$1] us." + line_st=$2 if [[ -z "$line_st" ]] then line_st="+0" fi OVS_WAIT_UNTIL([tail -n $line_st ovs-vswitchd.log | grep "$SLEEP_TIME"]) - OVS_WAIT_UNTIL([tail -n $line_st ovs-vswitchd.log | grep "$SLEEP_STATE"]) +]) + +dnl CHECK_PMD_SLEEP_MAX([numa_id], [core_id], [max_sleep], [+line]) +dnl +dnl Checks max sleep time of each pmd with core_id. +dnl Checking starts from line number 'line' in ovs-vswitchd.log . +m4_define([CHECK_PMD_SLEEP_MAX], [ + PATTERN="PMD thread on numa_id: *[$1], core id: *[$2], max sleep: *[$3] us."
+ line_st=$4 + if [[ -z "$line_st" ]] + then + line_st="+0" + fi + OVS_WAIT_UNTIL([tail -n $line_st ovs-vswitchd.log | grep "$PATTERN"]) ]) m4_define([SED_NUMA_CORE_PATTERN], ["s/\(numa_id \)[[0-9]]*\( core_id \)[[0-9]]*:/\1\2:/"]) @@ -1272,61 +1284,371 @@ OVS_VSWITCHD_STOP AT_CLEANUP AT_SETUP([PMD - pmd sleep]) -OVS_VSWITCHD_START +OVS_VSWITCHD_START([add-port br0 p0 -- set Interface p0 type=dummy-pmd options:n_rxq=8 options:numa_id=1], [], [], [--dummy-numa 0,0,0,1,1,8,8]) dnl Check default -CHECK_DP_SLEEP_MAX([0], [disabled], []) - +CHECK_DP_SLEEP_MAX([0], []) +CHECK_PMD_SLEEP_MAX([0], [0], [0], []) +CHECK_PMD_SLEEP_MAX([1], [3], [0], []) +CHECK_PMD_SLEEP_MAX([8], [5], [0], []) AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl Default max sleep: 0 us +pmd thread numa_id 0 core_id 0: + max sleep: 0 us +pmd thread numa_id 1 core_id 3: + max sleep: 0 us +pmd thread numa_id 8 core_id 5: + max sleep: 0 us ]) dnl Check low value max sleep get_log_next_line_num AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-sleep-max="1"]) -CHECK_DP_SLEEP_MAX([1], [enabled], [+$LINENUM]) +CHECK_DP_SLEEP_MAX([1], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [1], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [1], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [1], [+$LINENUM]) AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl Default max sleep: 1 us +pmd thread numa_id 0 core_id 0: + max sleep: 1 us +pmd thread numa_id 1 core_id 3: + max sleep: 1 us +pmd thread numa_id 8 core_id 5: + max sleep: 1 us ]) dnl Check high value max sleep get_log_next_line_num AT_CHECK([ovs-vsctl set open_vswitch . 
other_config:pmd-sleep-max="10000"]) -CHECK_DP_SLEEP_MAX([10000], [enabled], [+$LINENUM]) +CHECK_DP_SLEEP_MAX([10000], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [10000], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [10000], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [10000], [+$LINENUM]) AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl Default max sleep: 10000 us +pmd thread numa_id 0 core_id 0: + max sleep: 10000 us +pmd thread numa_id 1 core_id 3: + max sleep: 10000 us +pmd thread numa_id 8 core_id 5: + max sleep: 10000 us ]) dnl Check setting max sleep to zero get_log_next_line_num AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-sleep-max="0"]) -CHECK_DP_SLEEP_MAX([0], [disabled], [+$LINENUM]) +CHECK_DP_SLEEP_MAX([0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [0], [+$LINENUM]) AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl Default max sleep: 0 us +pmd thread numa_id 0 core_id 0: + max sleep: 0 us +pmd thread numa_id 1 core_id 3: + max sleep: 0 us +pmd thread numa_id 8 core_id 5: + max sleep: 0 us ]) dnl Check above high value max sleep get_log_next_line_num AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-sleep-max="10001"]) -CHECK_DP_SLEEP_MAX([10000], [enabled], [+$LINENUM]) +CHECK_DP_SLEEP_MAX([10000], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [10000], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [10000], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [10000], [+$LINENUM]) AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl Default max sleep: 10000 us +pmd thread numa_id 0 core_id 0: + max sleep: 10000 us +pmd thread numa_id 1 core_id 3: + max sleep: 10000 us +pmd thread numa_id 8 core_id 5: + max sleep: 10000 us ]) dnl Check rounding get_log_next_line_num AT_CHECK([ovs-vsctl set open_vswitch . 
other_config:pmd-sleep-max="490"]) -CHECK_DP_SLEEP_MAX([490], [enabled], [+$LINENUM]) +CHECK_DP_SLEEP_MAX([490], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [490], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [490], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [490], [+$LINENUM]) AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl Default max sleep: 490 us +pmd thread numa_id 0 core_id 0: + max sleep: 490 us +pmd thread numa_id 1 core_id 3: + max sleep: 490 us +pmd thread numa_id 8 core_id 5: + max sleep: 490 us ]) dnl Check rounding get_log_next_line_num AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-sleep-max="499"]) -CHECK_DP_SLEEP_MAX([499], [enabled], [+$LINENUM]) +CHECK_DP_SLEEP_MAX([499], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [499], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [499], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [499], [+$LINENUM]) AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl Default max sleep: 499 us +pmd thread numa_id 0 core_id 0: + max sleep: 499 us +pmd thread numa_id 1 core_id 3: + max sleep: 499 us +pmd thread numa_id 8 core_id 5: + max sleep: 499 us +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([PMD - per PMD sleep]) +OVS_VSWITCHD_START([add-port br0 p0 -- set Interface p0 type=dummy-pmd options:n_rxq=8 options:numa_id=1], + [], [], [--dummy-numa 0,0,0,1,1,8,8]) + +dnl Check system default. +CHECK_DP_SLEEP_MAX([0], []) +CHECK_PMD_SLEEP_MAX([0], [0], [0], []) +CHECK_PMD_SLEEP_MAX([1], [3], [0], []) +CHECK_PMD_SLEEP_MAX([8], [5], [0], []) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 0 us +pmd thread numa_id 0 core_id 0: + max sleep: 0 us +pmd thread numa_id 1 core_id 3: + max sleep: 0 us +pmd thread numa_id 8 core_id 5: + max sleep: 0 us +]) + +dnl Only per PMD. +get_log_next_line_num +AT_CHECK([ovs-vsctl set Open_vSwitch . 
other_config:pmd-sleep-max=3:300,0:100,5:400]) +CHECK_DP_SLEEP_MAX([0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [100], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [300], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [400], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 0 us +pmd thread numa_id 0 core_id 0: + max sleep: 100 us +pmd thread numa_id 1 core_id 3: + max sleep: 300 us +pmd thread numa_id 8 core_id 5: + max sleep: 400 us +]) + +dnl Mix of not used default and per-PMD. +get_log_next_line_num +AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-sleep-max=50,3:300,0:100,5:200]) +CHECK_DP_SLEEP_MAX([50], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [100], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [300], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [200], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 50 us +pmd thread numa_id 0 core_id 0: + max sleep: 100 us +pmd thread numa_id 1 core_id 3: + max sleep: 300 us +pmd thread numa_id 8 core_id 5: + max sleep: 200 us +]) + +dnl Remove a per-pmd entry and use default. +get_log_next_line_num +AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-sleep-max=50,3:300]) +CHECK_DP_SLEEP_MAX([50], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [50], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [300], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [50], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 50 us +pmd thread numa_id 0 core_id 0: + max sleep: 50 us +pmd thread numa_id 1 core_id 3: + max sleep: 300 us +pmd thread numa_id 8 core_id 5: + max sleep: 50 us +]) + +dnl Mix and change values. +get_log_next_line_num +AT_CHECK([ovs-vsctl set Open_vSwitch .
other_config:pmd-sleep-max=3:400,200]) +CHECK_DP_SLEEP_MAX([200], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [200], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [400], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [200], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 200 us +pmd thread numa_id 0 core_id 0: + max sleep: 200 us +pmd thread numa_id 1 core_id 3: + max sleep: 400 us +pmd thread numa_id 8 core_id 5: + max sleep: 200 us +]) + +dnl Add values for pmds that don't exist yet. +get_log_next_line_num +AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-sleep-max=2:600,50,3:300,0:100,6:400,5:200]) +CHECK_DP_SLEEP_MAX([50], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [100], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [300], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [200], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 50 us +pmd thread numa_id 0 core_id 0: + max sleep: 100 us +pmd thread numa_id 1 core_id 3: + max sleep: 300 us +pmd thread numa_id 8 core_id 5: + max sleep: 200 us +]) + +dnl Add more cores. +get_log_next_line_num +AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=7f]) +CHECK_PMD_SLEEP_MAX([0], [1], [50], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [2], [600], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [4], [50], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [6],[400], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 50 us +pmd thread numa_id 0 core_id 0: + max sleep: 100 us +pmd thread numa_id 0 core_id 1: + max sleep: 50 us +pmd thread numa_id 0 core_id 2: + max sleep: 600 us +pmd thread numa_id 1 core_id 3: + max sleep: 300 us +pmd thread numa_id 1 core_id 4: + max sleep: 50 us +pmd thread numa_id 8 core_id 5: + max sleep: 200 us +pmd thread numa_id 8 core_id 6: + max sleep: 400 us +]) + +dnl Go back to just a global value. +get_log_next_line_num +AT_CHECK([ovs-vsctl set Open_vSwitch . 
other_config:pmd-sleep-max=90]) +CHECK_DP_SLEEP_MAX([90], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [90], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [1], [90], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [2], [90], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [90], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [4], [90], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [90], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [6], [90], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 90 us +pmd thread numa_id 0 core_id 0: + max sleep: 90 us +pmd thread numa_id 0 core_id 1: + max sleep: 90 us +pmd thread numa_id 0 core_id 2: + max sleep: 90 us +pmd thread numa_id 1 core_id 3: + max sleep: 90 us +pmd thread numa_id 1 core_id 4: + max sleep: 90 us +pmd thread numa_id 8 core_id 5: + max sleep: 90 us +pmd thread numa_id 8 core_id 6: + max sleep: 90 us +]) + +dnl Try invalid value. +get_log_next_line_num +AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-sleep-max=qwe]) +CHECK_DP_SLEEP_MAX([0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [1], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [2], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [4], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [6], [0], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 0 us +pmd thread numa_id 0 core_id 0: + max sleep: 0 us +pmd thread numa_id 0 core_id 1: + max sleep: 0 us +pmd thread numa_id 0 core_id 2: + max sleep: 0 us +pmd thread numa_id 1 core_id 3: + max sleep: 0 us +pmd thread numa_id 1 core_id 4: + max sleep: 0 us +pmd thread numa_id 8 core_id 5: + max sleep: 0 us +pmd thread numa_id 8 core_id 6: + max sleep: 0 us +]) + +dnl Try invalid key:value. +get_log_next_line_num +AT_CHECK([ovs-vsctl set Open_vSwitch . 
other_config:pmd-sleep-max=50,1:qwe,2:0]) +CHECK_DP_SLEEP_MAX([50], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [50], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [1], [50], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [2], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [50], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [4], [50], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [50], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [6], [50], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 50 us +pmd thread numa_id 0 core_id 0: + max sleep: 50 us +pmd thread numa_id 0 core_id 1: + max sleep: 50 us +pmd thread numa_id 0 core_id 2: + max sleep: 0 us +pmd thread numa_id 1 core_id 3: + max sleep: 50 us +pmd thread numa_id 1 core_id 4: + max sleep: 50 us +pmd thread numa_id 8 core_id 5: + max sleep: 50 us +pmd thread numa_id 8 core_id 6: + max sleep: 50 us +]) + +dnl Remove config. +get_log_next_line_num +AT_CHECK([ovs-vsctl remove Open_vSwitch . other_config pmd-sleep-max]) +CHECK_DP_SLEEP_MAX([0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [1], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [2], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [4], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [6], [0], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 0 us +pmd thread numa_id 0 core_id 0: + max sleep: 0 us +pmd thread numa_id 0 core_id 1: + max sleep: 0 us +pmd thread numa_id 0 core_id 2: + max sleep: 0 us +pmd thread numa_id 1 core_id 3: + max sleep: 0 us +pmd thread numa_id 1 core_id 4: + max sleep: 0 us +pmd thread numa_id 8 core_id 5: + max sleep: 0 us +pmd thread numa_id 8 core_id 6: + max sleep: 0 us ]) OVS_VSWITCHD_STOP diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index eaccd85cf94..612ba41e3b2 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ 
-802,9 +802,7 @@ The default value is 25%.

      - +

      Specifies the maximum sleep time that will be requested in microseconds per iteration for a PMD thread which has received zero @@ -823,6 +821,33 @@

      The maximum value is 10000 microseconds.

      +

      + other_config:pmd-sleep-max=<pmd-sleep-list> +

      +

      where

      +

      +

        +
      • + <pmd-sleep-list> ::= NULL | <non-empty-list> +
      • +
      • + <non-empty-list> ::= <pmd-sleep-value> | + <pmd-sleep-value> , + <non-empty-list> +
      • +
      • + <pmd-sleep-value> ::= <global-default-sleep-value> | + <pmd-core-sleep-pair> +
      • +
      • + <global-default-sleep-value> ::= <max-sleep-time> +
      • +
      • + <pmd-core-sleep-pair> ::= <core> : + <max-sleep-time> +
      • +
      +

      From ee93f364b91f22f2c1b947c3e6a5799f7e5ddcb6 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 19 Dec 2023 13:40:30 +0100 Subject: [PATCH 473/833] ci: Add JOBS variable to replace all the '-j4' instances. Add a JOBS variable, which defaults to '-j4' but can be overwritten with the same environment variable. This can be useful if you use this linux-build.sh script outside of GitHub actions on a machine with many cores. Signed-off-by: Eelco Chaudron Acked-by: Simon Horman --- .ci/linux-build.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.ci/linux-build.sh b/.ci/linux-build.sh index aa2ecc50507..67c01a64430 100755 --- a/.ci/linux-build.sh +++ b/.ci/linux-build.sh @@ -6,6 +6,7 @@ set -x CFLAGS_FOR_OVS="-g -O2" SPARSE_FLAGS="" EXTRA_OPTS="--enable-Werror" +JOBS=${JOBS:-"-j4"} function install_dpdk() { @@ -46,7 +47,7 @@ function build_ovs() configure_ovs $OPTS make selinux-policy - make -j4 + make ${JOBS} } if [ "$DEB_PACKAGE" ]; then @@ -122,8 +123,8 @@ if [ "$TESTSUITE" = 'test' ]; then configure_ovs export DISTCHECK_CONFIGURE_FLAGS="$OPTS" - make distcheck -j4 CFLAGS="${CFLAGS_FOR_OVS}" \ - TESTSUITEFLAGS=-j4 RECHECK=yes + make distcheck ${JOBS} CFLAGS="${CFLAGS_FOR_OVS}" \ + TESTSUITEFLAGS=${JOBS} RECHECK=yes else build_ovs for testsuite in $TESTSUITE; do @@ -134,7 +135,7 @@ else export DPDK_EAL_OPTIONS="--lcores 0@1,1@1,2@1" run_as_root="sudo -E PATH=$PATH" fi - $run_as_root make $testsuite TESTSUITEFLAGS=-j4 RECHECK=yes + $run_as_root make $testsuite TESTSUITEFLAGS=${JOBS} RECHECK=yes done fi From e07ae9a6d7a31b8793cf481c3ac45cb4923c1eb3 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 19 Dec 2023 13:41:02 +0100 Subject: [PATCH 474/833] ci: Add make check-ovsdb-cluster tests to GitHub action ci. This patch adds 'make check-ovsdb-cluster' tests to GitHub action ci. In addition, this patch also makes sure this test and 'make check' do not run as root. 
Signed-off-by: Eelco Chaudron Acked-by: Simon Horman --- .ci/linux-build.sh | 5 ++++- .github/workflows/build-and-test.yml | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.ci/linux-build.sh b/.ci/linux-build.sh index 67c01a64430..bb540703e2b 100755 --- a/.ci/linux-build.sh +++ b/.ci/linux-build.sh @@ -129,11 +129,14 @@ else build_ovs for testsuite in $TESTSUITE; do run_as_root= + if [ "$testsuite" != "check" ] && \ + [ "$testsuite" != "check-ovsdb-cluster" ] ; then + run_as_root="sudo -E PATH=$PATH" + fi if [ "${testsuite##*dpdk}" != "$testsuite" ]; then sudo sh -c 'echo 1024 > /proc/sys/vm/nr_hugepages' || true [ "$(cat /proc/sys/vm/nr_hugepages)" = '1024' ] export DPDK_EAL_OPTIONS="--lcores 0@1,1@1,2@1" - run_as_root="sudo -E PATH=$PATH" fi $run_as_root make $testsuite TESTSUITEFLAGS=${JOBS} RECHECK=yes done diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 09654205e74..5d441157ce8 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -164,6 +164,9 @@ jobs: m32: m32 opts: --disable-ssl + - compiler: gcc + testsuite: check-ovsdb-cluster + steps: - name: checkout uses: actions/checkout@v3 From 9dfa65dc7b0f9d58d2798a200e7df59aa751d236 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 19 Dec 2023 13:41:37 +0100 Subject: [PATCH 475/833] ci: Update the GitHub Ubuntu runner image to Ubuntu 22.04. Updating this image is a requirement for the kernel system-traffic tests to pass on Ubuntu. In addition, 20.04 might be replaced, as soon as 24.04 comes out. Or we need to do this when it becomes EOL in April 2025. 
Signed-off-by: Eelco Chaudron Acked-by: Simon Horman --- .github/workflows/build-and-test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 5d441157ce8..acb57ac46b8 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -12,7 +12,7 @@ jobs: name: dpdk gcc outputs: dpdk_key: ${{ steps.gen_dpdk_key.outputs.key }} - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 timeout-minutes: 30 steps: @@ -89,7 +89,7 @@ jobs: TESTSUITE: ${{ matrix.testsuite }} name: linux ${{ join(matrix.*, ' ') }} - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 timeout-minutes: 30 strategy: From 6660fccb1929f1bacc6a267518c71e7f3c47903b Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 19 Dec 2023 13:41:58 +0100 Subject: [PATCH 476/833] ci: Exclude tests that show random failures through GitHub actions. I ran 80 series of full tests, and the following tests showed failures: 802.1ad - vlan_limit +2023-11-20T10:32:11.245Z|00001|dpif_netdev(revalidator5)|ERR|internal error parsing flow key recirc_id(0),dp_hash(0),skb_priority(0), in_port(2),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0), packet_type(ns=0,id=0),eth(src=42:7e:4b:46:68:1b,dst=33:33:ff:46:68:1b), eth_type(0x88a8),vlan(vid=4094,pcp=0),encap(eth_type(0x8100), vlan(vid=100,pcp=0),encap(eth_type(0x86dd),ipv6( src=::,dst=ff02::1:ff46:681b,label=0,proto=58,tclass=0,hlimit=255, frag=no),icmpv6(type=135,code=0),nd(target=fe80::407e:4bff:fe46:681b, sll=00:00:00:00:00:00,tll=00:00:00:00:00:00))) +2023-11-20T10:32:11.245Z|00002|dpif(revalidator5)|WARN|netdev@ovs-netdev: failed to put[modify] (Invalid argument) ufid:ef1ca90c-dbd0-4ca7-9869-411bdffd1ece recirc_id(0),dp_hash(0/0), skb_priority(0/0),in_port(2),skb_mark(0/0),ct_state(0/0),ct_zone(0/0), ct_mark(0/0),ct_label(0/0),packet_type(ns=0,id=0), eth(src=42:7e:4b:46:68:1b,dst=33:33:ff:46:68:1b),eth_type(0x88a8), 
vlan(vid=4094,pcp=0/0x0),encap(eth_type(0x8100), vlan(vid=100/0x0,pcp=0/0x0),encap(eth_type(0x86dd), ipv6(src=::/::,dst=ff02::1:ff46:681b/::,label=0/0,proto=58/0, tclass=0/0,hlimit=255/0,frag=no),icmpv6(type=135/0,code=0/0), nd(target=fe80::407e:4bff:fe46:681b/::, sll=00:00:00:00:00:00/00:00:00:00:00:00, tll=00:00:00:00:00:00/00:00:00:00:00:00))), actions:drop conntrack - zones from other field, more tests +2023-11-20T10:45:43.015Z|00001|dpif(handler5)|WARN|system@ovs-system: execute ct(commit),3 failed (Invalid argument) on packet tcp, vlan_tci=0x0000,dl_src=42:7e:4b:46:68:1b,dl_dst=ba:72:4c:a5:31:6b, nw_src=10.1.1.1,nw_dst=10.1.1.2,nw_tos=0,nw_ecn=0,nw_ttl=64, nw_frag=no,tp_src=53738,tp_dst=80,tcp_flags=psh|ack tcp_csum:e4a conntrack - limit by zone ./system-traffic.at:5154: ovs-appctl dpctl/ct-get-limits zone=0,1,2,3,4,5 --- - 2023-11-20 10:51:09.965375141 +0000 +++ /home/runner/work/ovs/ovs/tests/system-kmod-testsuite.dir/at-groups/ 114/stdout 2023-11-20 10:51:09.956723756 +0000 @@ -1,5 +1,5 @@ default limit=10 -zone=0,limit=5,count=5 +zone=0,limit=5,count=6 As I do not see those failures when running these stand alone on the same Ubuntu distribution, I've disabled them. This patch also adds the 'CHECK_GITHUB_ACTION' macro to skip tests that won't execute successfully through GitHub actions. We could not use the -k !keyword option, as it can not be combined with a range of tests. Signed-off-by: Eelco Chaudron Reviewed-by: Simon Horman --- tests/system-common-macros.at | 4 ++++ tests/system-traffic.at | 3 +++ 2 files changed, 7 insertions(+) diff --git a/tests/system-common-macros.at b/tests/system-common-macros.at index 0113aae8bd2..0620be0c702 100644 --- a/tests/system-common-macros.at +++ b/tests/system-common-macros.at @@ -365,3 +365,7 @@ m4_define([OVS_CHECK_IPROUTE_ENCAP], # OVS_CHECK_CT_CLEAR() m4_define([OVS_CHECK_CT_CLEAR], [AT_SKIP_IF([! 
grep -q "Datapath supports ct_clear action" ovs-vswitchd.log])]) + +# OVS_CHECK_GITHUB_ACTION +m4_define([OVS_CHECK_GITHUB_ACTION], + [AT_SKIP_IF([test "$GITHUB_ACTIONS" = "true"])]) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 0d6f8cf21af..09308ac6ba7 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -3311,6 +3311,7 @@ AT_CLEANUP AT_SETUP([conntrack - zones from other field, more tests]) CHECK_CONNTRACK() +OVS_CHECK_GITHUB_ACTION() OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(at_ns0, at_ns1) @@ -5270,6 +5271,7 @@ AT_CLEANUP AT_SETUP([conntrack - limit by zone]) CHECK_CONNTRACK() +OVS_CHECK_GITHUB_ACTION() OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(at_ns0, at_ns1) @@ -8099,6 +8101,7 @@ AT_CLEANUP AT_BANNER([802.1ad]) AT_SETUP([802.1ad - vlan_limit]) +OVS_CHECK_GITHUB_ACTION() OVS_TRAFFIC_VSWITCHD_START([set Open_vSwitch . other_config:vlan-limit=0]) OVS_CHECK_8021AD() From a80883f7682158c7a6955360ee852e8279f748e9 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 19 Dec 2023 13:42:12 +0100 Subject: [PATCH 477/833] ci: Fixed tests that show random failures through GitHub actions. I ran 80 series of full tests, and the following tests showed failures: conntrack - Multiple ICMP traverse ./system-traffic.at:7571: ovs-appctl dpctl/dump-conntrack | ... -e 's/state=[0-9_A-Z]*/state=/g' | sort | uniq --- - 2023-11-20 15:36:02.591051192 +0000 +++ /home/runner/work/ovs/ovs/tests/system-kmod-testsuite.dir/... @@ -1,2 +1,9 @@ +tcp,orig=(src=10.1.1.7,dst=13.107.43.16,sport=, dport=),reply=(src=13.107.43.16,dst=10.1.1.7,sport=, dport=),protoinfo=(state=) +tcp,orig=(src=10.1.1.7,dst=168.63.129.16,sport=, dport=),reply=(src=168.63.129.16,dst=10.1.1.7,sport=, dport=),protoinfo=(state=) ... +tcp,orig=(src=20.22.98.201,dst=10.1.1.7,sport=,dport=), reply=(src=10.1.1.7,dst=20.22.98.201,sport=,dport=), protoinfo=(state=) conntrack - ct flush +++ /home/runner/work/ovs/ovs/tests/system-kmod-testsuite.dir/... 
@@ -1,3 +1,5 @@ +tcp,orig=(src=10.1.1.154,dst=13.107.42.16,sport=45300,dport=443), reply=(src=13.107.42.16,dst=10.1.1.154,sport=443,dport=45300), protoinfo=(state=ESTABLISHED) +tcp,orig=(src=10.1.1.154,dst=20.72.125.48,sport=45572,dport=443), reply=(src=20.72.125.48,dst=10.1.1.154,sport=443,dport=45572), protoinfo=(state=ESTABLISHED) These tests showed local IP addresses in the results. The tests were modified to only include the IP addresses relevant to the test case. Signed-off-by: Eelco Chaudron Reviewed-by: Simon Horman --- tests/system-traffic.at | 54 ++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 09308ac6ba7..3cdd2f12526 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -2706,170 +2706,170 @@ AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a5 AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1" | sort], [0], [dnl +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1," | sort], [0], [dnl udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD 'ct_nw_proto=17,ct_tp_src=1']) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [0], [dnl udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD 'ct_nw_proto=17,ct_tp_src=2']) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [1]) +AT_CHECK([ovs-appctl 
dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [1]) dnl Test UDP from port 1 and 2, partial flush by dst port AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1" | sort], [0], [dnl +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1," | sort], [0], [dnl udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD 'ct_nw_proto=17,ct_tp_dst=2']) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [0], [dnl udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD 'ct_nw_proto=17,ct_tp_dst=1']) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [1]) +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [1]) dnl Test UDP from port 1 and 2, partial flush by src address AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1" | sort], [0], [dnl +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1," | sort], [0], [dnl 
udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD 'ct_nw_src=10.1.1.1']) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [0], [dnl udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD 'ct_nw_src=10.1.1.2']) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [1]) +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [1]) dnl Test UDP from port 1 and 2, partial flush by dst address AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1" | sort], [0], [dnl +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1," | sort], [0], [dnl udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD 'ct_nw_dst=10.1.1.2']) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [0], [dnl udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD 'ct_nw_dst=10.1.1.1']) -AT_CHECK([ovs-appctl dpctl/dump-conntrack 
| grep "10\.1\.1\.1"], [1]) +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [1]) dnl Test UDP from port 1 and 2, partial flush by src address in reply direction AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1" | sort], [0], [dnl +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1," | sort], [0], [dnl udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD '' 'ct_nw_src=10.1.1.2']) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [0], [dnl udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD zone=5 '' 'ct_nw_src=10.1.1.1']) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [1]) +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [1]) dnl Test UDP from port 1 and 2, flush without arguments AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1" | sort], [0], [dnl +AT_CHECK([ovs-appctl 
dpctl/dump-conntrack | grep "10\.1\.1\.1," | sort], [0], [dnl udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD]) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [1]) +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [1]) dnl Test SCTP flush based on port. AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500003400010000408464410a0101010a01010200010002000000009178f7d30100001470e18ccc00000000000a000a00000000 actions=resubmit(,0)"]) AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000950540000000a08004500003400010000408464410a0101020a010101000200010000000098f29e470100001470e18ccc00000000000a000a00000000 actions=resubmit(,0)"]) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1" | sed "s/,protoinfo=.*$//" | sort], [0], [dnl +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1," | sed "s/,protoinfo=.*$//" | sort], [0], [dnl sctp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 sctp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD 'ct_nw_src=10.1.1.1,ct_nw_proto=132,ct_tp_src=1,ct_tp_dst=2']) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1" | sed "s/,protoinfo=.*$//" | sort], [0], [dnl +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1," | sed "s/,protoinfo=.*$//" | sort], [0], [dnl sctp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD 'ct_nw_src=10.1.1.2,ct_nw_proto=132,ct_tp_src=2,ct_tp_dst=1']) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep 
"10\.1\.1\.1"], [1]) +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [1]) dnl Test UDP from port 1 and 2, partial flush by mark and labels. AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1" | sort], [0], [dnl +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1," | sort], [0], [dnl udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD mark=0xaa]) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [0], [dnl udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD labels=0xaa00000000]) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [1]) +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [1]) AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1" | sort], [0], [dnl +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1," | sort], [0], [dnl 
udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD mark=2/2]) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [0], [dnl udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) AT_CHECK([FLUSH_CMD labels=0x0200000000/0x0200000000]) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [1]) +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [1]) dnl Test flush with invalid arguments. @@ -7852,7 +7852,7 @@ NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 f0 00 00 01 01 02 f0 00 sleep 1 dnl ensure CT picked up the packet -AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1)], [0], [dnl +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0], [dnl icmp,orig=(src=10.1.1.1,dst=10.1.1.2,id=,type=8,code=0),reply=(src=10.1.1.2,dst=10.1.1.1,id=,type=0,code=0) ]) From e7b51b38fe7a9d3c4d3eeaadf4c73be691ea980d Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 19 Dec 2023 13:42:27 +0100 Subject: [PATCH 478/833] ci: Add make check-kernel to GitHub actions ci. This patch adds 'make check-kernel' to the GitHub actions ci. However, to do this, some additional changes were needed. First, some of the missing test and package dependencies had to be added. Finally, we added an option to the GitHub run matrix that allows the tests to be split up, to avoid lengthy single test runs. 
Signed-off-by: Eelco Chaudron Acked-by: Simon Horman --- .ci/linux-build.sh | 3 ++- .github/workflows/build-and-test.yml | 11 ++++++++++- python/test_requirements.txt | 4 +++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/.ci/linux-build.sh b/.ci/linux-build.sh index bb540703e2b..05b944ead09 100755 --- a/.ci/linux-build.sh +++ b/.ci/linux-build.sh @@ -138,7 +138,8 @@ else [ "$(cat /proc/sys/vm/nr_hugepages)" = '1024' ] export DPDK_EAL_OPTIONS="--lcores 0@1,1@1,2@1" fi - $run_as_root make $testsuite TESTSUITEFLAGS=${JOBS} RECHECK=yes + $run_as_root make $testsuite TESTSUITEFLAGS="${JOBS} ${TEST_RANGE}" \ + RECHECK=yes done fi diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index acb57ac46b8..0b881ca9175 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -76,7 +76,8 @@ jobs: env: dependencies: | automake libtool gcc bc libjemalloc2 libjemalloc-dev libssl-dev \ - llvm-dev libnuma-dev libpcap-dev selinux-policy-dev libbpf-dev + llvm-dev libnuma-dev libpcap-dev selinux-policy-dev libbpf-dev \ + lftp libreswan ASAN: ${{ matrix.asan }} UBSAN: ${{ matrix.ubsan }} CC: ${{ matrix.compiler }} @@ -87,6 +88,7 @@ jobs: OPTS: ${{ matrix.opts }} STD: ${{ matrix.std }} TESTSUITE: ${{ matrix.testsuite }} + TEST_RANGE: ${{ matrix.test_range }} name: linux ${{ join(matrix.*, ' ') }} runs-on: ubuntu-22.04 @@ -167,6 +169,13 @@ jobs: - compiler: gcc testsuite: check-ovsdb-cluster + - compiler: gcc + testsuite: check-kernel + test_range: "-100" + - compiler: gcc + testsuite: check-kernel + test_range: "100-" + steps: - name: checkout uses: actions/checkout@v3 diff --git a/python/test_requirements.txt b/python/test_requirements.txt index 6aaee13e3fe..c85ce41add8 100644 --- a/python/test_requirements.txt +++ b/python/test_requirements.txt @@ -1,3 +1,5 @@ -pytest netaddr +pyftpdlib pyparsing +pytest +tftpy From 15f179324cb44e7f06e3a290ce60e416b868da26 Mon Sep 17 00:00:00 2001 From: Eelco 
Chaudron Date: Tue, 19 Dec 2023 13:42:41 +0100 Subject: [PATCH 479/833] ci: Add make check-offloads to GitHub actions ci. This patch also adds the 'CHECK_GITHUB_ACTION' macro to skip tests that won't execute successfully through GitHub actions. We could not use the -k !keyword option, as it can not be combined with a range of tests. Signed-off-by: Eelco Chaudron Acked-by: Simon Horman --- .ci/linux-build.sh | 2 +- .github/workflows/build-and-test.yml | 7 +++++++ tests/system-offloads-traffic.at | 2 ++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/.ci/linux-build.sh b/.ci/linux-build.sh index 05b944ead09..90581c10b7f 100755 --- a/.ci/linux-build.sh +++ b/.ci/linux-build.sh @@ -131,7 +131,7 @@ else run_as_root= if [ "$testsuite" != "check" ] && \ [ "$testsuite" != "check-ovsdb-cluster" ] ; then - run_as_root="sudo -E PATH=$PATH" + run_as_root="sudo -E PATH=$PATH GITHUB_ACTIONS=$GITHUB_ACTIONS" fi if [ "${testsuite##*dpdk}" != "$testsuite" ]; then sudo sh -c 'echo 1024 > /proc/sys/vm/nr_hugepages' || true diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 0b881ca9175..586b0cdd914 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -176,6 +176,13 @@ jobs: testsuite: check-kernel test_range: "100-" + - compiler: gcc + testsuite: check-offloads + test_range: "-100" + - compiler: gcc + testsuite: check-offloads + test_range: "100-" + steps: - name: checkout uses: actions/checkout@v3 diff --git a/tests/system-offloads-traffic.at b/tests/system-offloads-traffic.at index 0bedee7530c..6bd49a3eef3 100644 --- a/tests/system-offloads-traffic.at +++ b/tests/system-offloads-traffic.at @@ -192,6 +192,7 @@ AT_CLEANUP AT_SETUP([offloads - check interface meter offloading - offloads disabled]) AT_KEYWORDS([dp-meter]) AT_SKIP_IF([test $HAVE_NC = "no"]) +OVS_CHECK_GITHUB_ACTION() OVS_TRAFFIC_VSWITCHD_START() AT_CHECK([ovs-ofctl -O OpenFlow13 add-meter br0 'meter=1 pktps 
bands=type=drop rate=1']) @@ -240,6 +241,7 @@ AT_CLEANUP AT_SETUP([offloads - check interface meter offloading - offloads enabled]) AT_KEYWORDS([offload-meter]) +OVS_CHECK_GITHUB_ACTION() CHECK_TC_INGRESS_PPS() AT_SKIP_IF([test $HAVE_NC = "no"]) OVS_TRAFFIC_VSWITCHD_START([], [], [-- set Open_vSwitch . other_config:hw-offload=true]) From 09958e081c9ce01f5824fd5d16faa3a8295b0e34 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 19 Dec 2023 13:42:55 +0100 Subject: [PATCH 480/833] ci: Add make check-system-userspace to GitHub actions ci. This patch adds 'make check-system-userspace' to the GitHub actions ci. The tests are not split into two separate test runs as they complete in around 10 minutes. Signed-off-by: Eelco Chaudron Acked-by: Simon Horman --- .github/workflows/build-and-test.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 586b0cdd914..db0a1ac3999 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -183,6 +183,10 @@ jobs: testsuite: check-offloads test_range: "100-" + - compiler: gcc + dpdk: dpdk + testsuite: check-system-userspace + steps: - name: checkout uses: actions/checkout@v3 From adfc3d4a3289e69140d25e1f7858c37e883accd1 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 19 Dec 2023 13:43:09 +0100 Subject: [PATCH 481/833] ci: Add make check-system-tso to GitHub actions ci. This patch adds 'make check-system-tso' to the GitHub actions ci. 
Signed-off-by: Eelco Chaudron Acked-by: Simon Horman --- .github/workflows/build-and-test.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index db0a1ac3999..d74668f6162 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -187,6 +187,10 @@ jobs: dpdk: dpdk testsuite: check-system-userspace + - compiler: gcc + dpdk: dpdk + testsuite: check-system-tso + steps: - name: checkout uses: actions/checkout@v3 From 26ffd192f2147f3f53cae35b5e90bd6926c1ba8f Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 19 Dec 2023 13:44:05 +0100 Subject: [PATCH 482/833] ci: Fix dpdk build cache key generation. When new drivers are introduced, the cache key is not accurately computed. Before the commit 1a1b3106d90e ("ci: Separate DPDK from OVS build."), the DPDK build process was integrated in .ci/linux-{setup,build}.sh scripts, where specific lines were employed to generate the key. Since it is now separated in .ci/dpdk-{setup,build}.sh, this patch computes the key based on the content of those dedicated scripts. Fixes: 4e90baca89f0 ("system-dpdk: Run traffic tests.") Signed-off-by: Eelco Chaudron Acked-by: Simon Horman Reviewed-by: David Marchand --- .github/workflows/build-and-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index d74668f6162..e9a2714fbdb 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -30,7 +30,7 @@ jobs: # This also allows us to use cache from any branch as long as version # and a way we're building DPDK stays the same. 
run: | - grep -irE 'RTE_|DPDK|meson|ninja' .ci/dpdk-* > dpdk-ci-signature + cat .ci/dpdk-* > dpdk-ci-signature grep -rwE 'DPDK_GIT|DPDK_VER' .github/ >> dpdk-ci-signature if [ "${DPDK_VER##refs/*/}" != "${DPDK_VER}" ]; then git ls-remote --heads $DPDK_GIT $DPDK_VER >> dpdk-ci-signature From 67c53a89df5cdf9c93b99345fd2bd3d1e6aa7dcc Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 19 Dec 2023 13:44:29 +0100 Subject: [PATCH 483/833] ci: Allow make check-dpdk to run the MFEX tests. Currently, if you use the python/test_requirements.txt file to set up your test environment the MFEX tests will be skipped due to the Scapy package not being included. This is fixed as part of this patch. The test case change will make sure the 'MFEX Configuration' test will run without the need for Scapy and its auto-generated tests. In addition, we exclude the traffic-related MFEX tests from running on GitHub actions due to limited resources. Signed-off-by: Eelco Chaudron Acked-by: Simon Horman --- .ci/dpdk-build.sh | 2 +- .github/workflows/build-and-test.yml | 2 +- python/test_requirements.txt | 1 + tests/system-dpdk.at | 6 +++--- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.ci/dpdk-build.sh b/.ci/dpdk-build.sh index aa83e446436..d4c178ee0df 100755 --- a/.ci/dpdk-build.sh +++ b/.ci/dpdk-build.sh @@ -38,7 +38,7 @@ function build_dpdk() # any DPDK driver. # check-dpdk unit tests requires testpmd and some net/ driver. DPDK_OPTS="$DPDK_OPTS -Denable_apps=test-pmd" - enable_drivers="net/null,net/af_xdp,net/tap,net/virtio" + enable_drivers="net/null,net/af_xdp,net/tap,net/virtio,net/pcap" DPDK_OPTS="$DPDK_OPTS -Denable_drivers=$enable_drivers" # Install DPDK using prefix. 
diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index e9a2714fbdb..1e92a0e2b42 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -5,7 +5,7 @@ on: [push, pull_request] jobs: build-dpdk: env: - dependencies: gcc libbpf-dev libnuma-dev ninja-build pkgconf + dependencies: gcc libbpf-dev libnuma-dev libpcap-dev ninja-build pkgconf CC: gcc DPDK_GIT: https://dpdk.org/git/dpdk-stable DPDK_VER: 22.11.1 diff --git a/python/test_requirements.txt b/python/test_requirements.txt index c85ce41add8..5043c71e223 100644 --- a/python/test_requirements.txt +++ b/python/test_requirements.txt @@ -2,4 +2,5 @@ netaddr pyftpdlib pyparsing pytest +scapy tftpy diff --git a/tests/system-dpdk.at b/tests/system-dpdk.at index af092a20004..d19062d987e 100644 --- a/tests/system-dpdk.at +++ b/tests/system-dpdk.at @@ -819,6 +819,7 @@ dnl -------------------------------------------------------------------------- dnl MFEX Autovalidator AT_SETUP([OVS-DPDK - MFEX Autovalidator]) AT_KEYWORDS([dpdk]) +OVS_CHECK_GITHUB_ACTION() OVS_DPDK_PRE_CHECK() OVS_DPDK_START([--no-pci]) AT_CHECK([ovs-vsctl add-br br0 -- set bridge br0 datapath_type=netdev]) @@ -852,6 +853,7 @@ dnl -------------------------------------------------------------------------- dnl MFEX Autovalidator Fuzzy AT_SETUP([OVS-DPDK - MFEX Autovalidator Fuzzy]) AT_KEYWORDS([dpdk]) +OVS_CHECK_GITHUB_ACTION() OVS_DPDK_PRE_CHECK() OVS_DPDK_START([--no-pci]) AT_CHECK([ovs-vsctl add-br br0 -- set bridge br0 datapath_type=netdev]) @@ -886,13 +888,11 @@ dnl -------------------------------------------------------------------------- AT_SETUP([OVS-DPDK - MFEX Configuration]) AT_KEYWORDS([dpdk]) OVS_DPDK_PRE_CHECK() -AT_SKIP_IF([! $PYTHON3 -c "import scapy"], [], []) -AT_CHECK([$PYTHON3 $srcdir/mfex_fuzzy.py test_traffic.pcap 1], [], [stdout]) OVS_DPDK_START([--no-pci]) AT_CHECK([ovs-vsctl --no-wait set Open_vSwitch . 
other_config:pmd-cpu-mask=0x1]) dnl Add userspace bridge and attach it to OVS AT_CHECK([ovs-vsctl add-br br0 -- set bridge br0 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br0 p1 -- set Interface p1 type=dpdk options:dpdk-devargs=net_pcap1,rx_pcap=test_traffic.pcap,infinite_rx=1], [], [stdout], [stderr]) +AT_CHECK([ovs-vsctl add-port br0 p1 -- set Interface p1 type=dpdk options:dpdk-devargs=net_null0,no-rx=1], [], [stdout], [stderr]) AT_CHECK([ovs-vsctl show], [], [stdout]) AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set scalar 1], [2], From de4cccf930a989c41101e91fbdf8b0d3fa468afa Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 19 Dec 2023 13:45:44 +0100 Subject: [PATCH 484/833] ci: Add make check-afxdp to GitHub actions ci. This patch adds 'make check-afxdp' to the GitHub actions ci. The tests are not split into two separate test runs as they complete in around 10 minutes. Signed-off-by: Eelco Chaudron Acked-by: Simon Horman --- .github/workflows/build-and-test.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 1e92a0e2b42..710757693d2 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -191,6 +191,10 @@ jobs: dpdk: dpdk testsuite: check-system-tso + - compiler: gcc + dpdk: dpdk + testsuite: check-afxdp + steps: - name: checkout uses: actions/checkout@v3 From df5e5cf4318a64019d0f9bdf4fbfb398d1757269 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 21 Dec 2023 14:18:33 +0100 Subject: [PATCH 485/833] Documentation: Add section on inclusive language. As a community we should strive to be inclusive. As such it seems appropriate to adopt an word list, to help guide the use of inclusive language. This patch proposes use of the Inclusive Naming Word List v1.0. 
Link: https://inclusivenaming.org/word-lists/ Signed-off-by: Simon Horman Acked-by: Eelco Chaudron Acked-by: Aaron Conole --- Documentation/automake.mk | 1 + Documentation/index.rst | 1 + .../contributing/inclusive-language.rst | 58 +++++++++++++++++++ .../internals/contributing/index.rst | 1 + 4 files changed, 61 insertions(+) create mode 100644 Documentation/internals/contributing/inclusive-language.rst diff --git a/Documentation/automake.mk b/Documentation/automake.mk index 8bd3dbb2b88..47d2e336a0b 100644 --- a/Documentation/automake.mk +++ b/Documentation/automake.mk @@ -109,6 +109,7 @@ DOC_SOURCE = \ Documentation/internals/security.rst \ Documentation/internals/contributing/index.rst \ Documentation/internals/contributing/backporting-patches.rst \ + Documentation/internals/contributing/inclusive-language.rst \ Documentation/internals/contributing/coding-style.rst \ Documentation/internals/contributing/coding-style-windows.rst \ Documentation/internals/contributing/documentation-style.rst \ diff --git a/Documentation/index.rst b/Documentation/index.rst index 3cdc87c6984..7041384733d 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -43,6 +43,7 @@ Contributing - :doc:`internals/contributing/submitting-patches` - :doc:`internals/contributing/backporting-patches` + - :doc:`internals/contributing/inclusive-language` - :doc:`internals/contributing/coding-style` - :doc:`internals/contributing/coding-style-windows` diff --git a/Documentation/internals/contributing/inclusive-language.rst b/Documentation/internals/contributing/inclusive-language.rst new file mode 100644 index 00000000000..e8ee0958b51 --- /dev/null +++ b/Documentation/internals/contributing/inclusive-language.rst @@ -0,0 +1,58 @@ +.. + Licensed under the Apache License, Version 2.0 (the "License"); you may + not use this file except in compliance with the License. 
You may obtain + a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + License for the specific language governing permissions and limitations + under the License. + + Convention for heading levels in Open vSwitch documentation: + + ======= Heading 0 (reserved for the title in a document) + ------- Heading 1 + ~~~~~~~ Heading 2 + +++++++ Heading 3 + ''''''' Heading 4 + + Avoid deeper levels because they do not render well. + +================== +Inclusive Language +================== + +In order to help facilitate an inclusive environment in the Open vSwitch +community we recognise the role of language in framing our +communication with each other. It is important that terms that +may exclude people through racial, cultural or other bias, are avoided +as they may make people feel excluded. + +We recognise that this is subjective, and to some extent is a journey. +But we also recognise that we cannot begin that journey without taking +positive action. To this end Open vSwitch is adopting the practice +of an inclusive word list, which helps to guide the use of language within +the project. + +.. _word list: + +Word List +--------- + +The intent of this document is to formally document the acceptance of an +inclusive word list by Open vSwitch. Accordingly, this document specifies +use of the `Inclusive Naming Word List +<https://inclusivenaming.org/word-lists/>`__ v1.0 (the word list) for Open +vSwitch. + +The adoption of the word list is intended to act as a guide for +developers creating patches to the Open vSwitch repository, including both +source code and documentation, and to aid maintainers in their role of +shepherding changes into the repository.
+ +Further steps to align usage of language in Open vSwitch, including +clarification of application of the word list, to new and existing work, +may follow. diff --git a/Documentation/internals/contributing/index.rst b/Documentation/internals/contributing/index.rst index a46cb046a0f..91304e60bdc 100644 --- a/Documentation/internals/contributing/index.rst +++ b/Documentation/internals/contributing/index.rst @@ -35,4 +35,5 @@ The below guides provide information on contributing to Open vSwitch itself. coding-style coding-style-windows documentation-style + inclusive-language libopenvswitch-abi From 98ee21ef63648cf8c5a444e8d3e2723c7f68f13c Mon Sep 17 00:00:00 2001 From: David Marchand Date: Thu, 21 Dec 2023 14:52:27 +0100 Subject: [PATCH 486/833] system-dpdk: Use dummy-pmd port for packet injection. net_pcap is not always available in DPDK (like, in a dev environment when you forgot to install the libpcap-devel). On the other hand, OVS already has its own way to inject packets into a bridge. Let's make use of it. The generating script outputs a bulk of 8 packets per line (to save some cpu spent calling ovs-appctl). 
Suggested-by: Ilya Maximets Reviewed-by: Maxime Coquelin Acked-by: Eelco Chaudron Signed-off-by: David Marchand Signed-off-by: Simon Horman --- tests/automake.mk | 6 +--- tests/{mfex_fuzzy.py => genpkts.py} | 56 ++++++++++++++--------------- tests/system-dpdk.at | 24 ++++++++----- 3 files changed, 43 insertions(+), 43 deletions(-) rename tests/{mfex_fuzzy.py => genpkts.py} (66%) diff --git a/tests/automake.mk b/tests/automake.mk index f8a925012d6..2ae0aeecaff 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -146,10 +146,6 @@ $(srcdir)/tests/fuzz-regression-list.at: tests/automake.mk echo "TEST_FUZZ_REGRESSION([$$basename])"; \ done > $@.tmp && mv $@.tmp $@ -EXTRA_DIST += $(MFEX_AUTOVALIDATOR_TESTS) -MFEX_AUTOVALIDATOR_TESTS = \ - tests/mfex_fuzzy.py - OVSDB_CLUSTER_TESTSUITE_AT = \ tests/ovsdb-cluster-testsuite.at \ tests/ovsdb-execution.at \ @@ -522,7 +518,7 @@ tests_test_type_props_SOURCES = tests/test-type-props.c CHECK_PYFILES = \ tests/appctl.py \ tests/flowgen.py \ - tests/mfex_fuzzy.py \ + tests/genpkts.py \ tests/ovsdb-monitor-sort.py \ tests/test-daemon.py \ tests/test-dpparse.py \ diff --git a/tests/mfex_fuzzy.py b/tests/genpkts.py similarity index 66% rename from tests/mfex_fuzzy.py rename to tests/genpkts.py index 50b9870641d..3354e116d0c 100755 --- a/tests/mfex_fuzzy.py +++ b/tests/genpkts.py @@ -17,51 +17,44 @@ from scapy.all import RandMAC, RandIP, PcapWriter, RandIP6, RandShort, fuzz from scapy.all import IPv6, Dot1Q, IP, Ether, UDP, TCP, random -# Path for the pcap file location. -path = str(sys.argv[1]) # The number of packets generated will be size * 8. -size = int(sys.argv[2]) +size = int(sys.argv[1]) # Traffic option is used to choose between fuzzy or simple packet type. 
-if len(sys.argv) > 3: - traffic_opt = str(sys.argv[3]) +if len(sys.argv) > 2: + traffic_opt = str(sys.argv[2]) else: traffic_opt = "" -pktdump = PcapWriter(path, append=False, sync=True) - -pkt = [] - for i in range(0, size): + pkt = [] + if traffic_opt == "fuzzy": eth = Ether(src=RandMAC(), dst=RandMAC()) vlan = Dot1Q() - udp = UDP(dport=RandShort(), sport=RandShort()) ipv4 = IP(src=RandIP(), dst=RandIP(), len=random.randint(0, 100)) ipv6 = IPv6(src=RandIP6(), dst=RandIP6(), plen=random.randint(0, 100)) + udp = UDP(dport=RandShort(), sport=RandShort()) tcp = TCP(dport=RandShort(), sport=RandShort(), flags='S', dataofs=random.randint(0, 15)) # IPv4 packets with fuzzing - pkt.append(fuzz(eth / ipv4 / udp)) - pkt.append(fuzz(eth / ipv4 / tcp)) - pkt.append(fuzz(eth / vlan / ipv4 / udp)) - pkt.append(fuzz(eth / vlan / ipv4 / tcp)) + pkt.append(fuzz(eth / ipv4 / udp).build().hex()) + pkt.append(fuzz(eth / ipv4 / tcp).build().hex()) + pkt.append(fuzz(eth / vlan / ipv4 / udp).build().hex()) + pkt.append(fuzz(eth / vlan / ipv4 / tcp).build().hex()) # IPv6 packets with fuzzing - pkt.append(fuzz(eth / ipv6 / udp)) - pkt.append(fuzz(eth / ipv6 / tcp)) - pkt.append(fuzz(eth / vlan / ipv6 / udp)) - pkt.append(fuzz(eth / vlan / ipv6 / tcp)) + pkt.append(fuzz(eth / ipv6 / udp).build().hex()) + pkt.append(fuzz(eth / ipv6 / tcp).build().hex()) + pkt.append(fuzz(eth / vlan / ipv6 / udp).build().hex()) + pkt.append(fuzz(eth / vlan / ipv6 / tcp).build().hex()) else: mac_addr_src = "52:54:00:FF:FF:{:02X}".format(i % 0xff) mac_addr_dst = "80:FF:FF:FF:FF:{:02X}".format(i % 0xff) - src_port = 200 + (i % 20) - dst_port = 1000 + (i % 20) eth = Ether(src=mac_addr_src, dst=mac_addr_dst) vlan = Dot1Q(vlan=(i % 10)) - udp = UDP(dport=src_port, sport=dst_port) # IPv4 address range limits to 255 and IPv6 limit to 65535 ipv4_addr_src = "192.168.150." + str((i % 255)) ipv4_addr_dst = "200.100.198." 
+ str((i % 255)) @@ -71,18 +64,21 @@ .format(i % 0xffff) ipv4 = IP(src=ipv4_addr_src, dst=ipv4_addr_dst) ipv6 = IPv6(src=ipv6_addr_src, dst=ipv6_addr_dst) + src_port = 200 + (i % 20) + dst_port = 1000 + (i % 20) + udp = UDP(dport=src_port, sport=dst_port) tcp = TCP(dport=src_port, sport=dst_port, flags='S') # IPv4 packets - pkt.append(eth / ipv4 / udp) - pkt.append(eth / ipv4 / tcp) - pkt.append(eth / vlan / ipv4 / udp) - pkt.append(eth / vlan / ipv4 / tcp) + pkt.append((eth / ipv4 / udp).build().hex()) + pkt.append((eth / ipv4 / tcp).build().hex()) + pkt.append((eth / vlan / ipv4 / udp).build().hex()) + pkt.append((eth / vlan / ipv4 / tcp).build().hex()) # IPv6 packets - pkt.append(eth / ipv6 / udp) - pkt.append(eth / ipv6 / tcp) - pkt.append(eth / vlan / ipv6 / udp) - pkt.append(eth / vlan / ipv6 / tcp) + pkt.append((eth / ipv6 / udp).build().hex()) + pkt.append((eth / ipv6 / tcp).build().hex()) + pkt.append((eth / vlan / ipv6 / udp).build().hex()) + pkt.append((eth / vlan / ipv6 / tcp).build().hex()) -pktdump.write(pkt) + print(' '.join(pkt)) diff --git a/tests/system-dpdk.at b/tests/system-dpdk.at index d19062d987e..ab72860a639 100644 --- a/tests/system-dpdk.at +++ b/tests/system-dpdk.at @@ -821,16 +821,16 @@ AT_SETUP([OVS-DPDK - MFEX Autovalidator]) AT_KEYWORDS([dpdk]) OVS_CHECK_GITHUB_ACTION() OVS_DPDK_PRE_CHECK() -OVS_DPDK_START([--no-pci]) +OVS_DPDK_START([--no-pci], [--enable-dummy]) AT_CHECK([ovs-vsctl add-br br0 -- set bridge br0 datapath_type=netdev]) AT_SKIP_IF([! ovs-appctl dpif-netdev/miniflow-parser-get | sed 1,4d | grep "True"], [], [dnl ]) AT_SKIP_IF([! $PYTHON3 -c "import scapy"], [], []) -AT_CHECK([$PYTHON3 $srcdir/mfex_fuzzy.py test_traffic.pcap 2000], [], [stdout]) +AT_SKIP_IF([! 
$PYTHON3 $srcdir/genpkts.py 2000 > packets]) dnl Add userspace bridge and attach it to OVS -AT_CHECK([ovs-vsctl add-port br0 p1 -- set Interface p1 type=dpdk options:dpdk-devargs=net_pcap1,rx_pcap=test_traffic.pcap,infinite_rx=1], [], [stdout], [stderr]) +AT_CHECK([ovs-vsctl add-port br0 p1 -- set Interface p1 type=dummy-pmd]) AT_CHECK([ovs-vsctl show], [], [stdout]) AT_CHECK([ovs-appctl dpif-netdev/dpif-impl-set dpif_avx512], [0], [dnl @@ -841,6 +841,10 @@ AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set autovalidator], [0], [dnl Miniflow extract implementation set to autovalidator. ]) +cat packets | while read line; do + AT_CHECK([ovs-appctl netdev-dummy/receive p1 $line], [0], [ignore]) +done + OVS_WAIT_UNTIL([test `ovs-vsctl get interface p1 statistics | grep -oP 'rx_packets=\s*\K\d+'` -ge 16000]) dnl Clean up @@ -855,16 +859,16 @@ AT_SETUP([OVS-DPDK - MFEX Autovalidator Fuzzy]) AT_KEYWORDS([dpdk]) OVS_CHECK_GITHUB_ACTION() OVS_DPDK_PRE_CHECK() -OVS_DPDK_START([--no-pci]) +OVS_DPDK_START([--no-pci], [--enable-dummy]) AT_CHECK([ovs-vsctl add-br br0 -- set bridge br0 datapath_type=netdev]) AT_SKIP_IF([! ovs-appctl dpif-netdev/miniflow-parser-get | sed 1,4d | grep "True"], [], [dnl ]) AT_SKIP_IF([! $PYTHON3 -c "import scapy"], [], []) -AT_CHECK([$PYTHON3 $srcdir/mfex_fuzzy.py fuzzy.pcap 2000 fuzzy], [], [stdout]) +AT_SKIP_IF([! $PYTHON3 $srcdir/genpkts.py 2000 fuzzy > packets]) dnl Add userspace bridge and attach it to OVS -AT_CHECK([ovs-vsctl add-port br0 p1 -- set Interface p1 type=dpdk options:dpdk-devargs=net_pcap1,rx_pcap=fuzzy.pcap,infinite_rx=1], [], [stdout], [stderr]) +AT_CHECK([ovs-vsctl add-port br0 p1 -- set Interface p1 type=dummy-pmd]) AT_CHECK([ovs-vsctl show], [], [stdout]) AT_CHECK([ovs-appctl dpif-netdev/dpif-impl-set dpif_avx512], [0], [dnl @@ -875,6 +879,10 @@ AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set autovalidator], [0], [dnl Miniflow extract implementation set to autovalidator. 
]) +cat packets | while read line; do + AT_CHECK([ovs-appctl netdev-dummy/receive p1 $line], [0], [ignore]) +done + OVS_WAIT_UNTIL([test `ovs-vsctl get interface p1 statistics | grep -oP 'rx_packets=\s*\K\d+'` -ge 16000]) dnl Clean up @@ -888,11 +896,11 @@ dnl -------------------------------------------------------------------------- AT_SETUP([OVS-DPDK - MFEX Configuration]) AT_KEYWORDS([dpdk]) OVS_DPDK_PRE_CHECK() -OVS_DPDK_START([--no-pci]) +OVS_DPDK_START([--no-pci], [--enable-dummy]) AT_CHECK([ovs-vsctl --no-wait set Open_vSwitch . other_config:pmd-cpu-mask=0x1]) dnl Add userspace bridge and attach it to OVS AT_CHECK([ovs-vsctl add-br br0 -- set bridge br0 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br0 p1 -- set Interface p1 type=dpdk options:dpdk-devargs=net_null0,no-rx=1], [], [stdout], [stderr]) +AT_CHECK([ovs-vsctl add-port br0 p1 -- set Interface p1 type=dummy-pmd]) AT_CHECK([ovs-vsctl show], [], [stdout]) AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set scalar 1], [2], From 5c3810491294dd7c4aa84541cf422a754ac99365 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Thu, 21 Dec 2023 14:52:58 +0100 Subject: [PATCH 487/833] tests: Move MFEX tests to dpif-netdev. The MFEX code and tests do not depend on DPDK anymore. We can move the unit tests to dpif-netdev. Reviewed-by: Maxime Coquelin Acked-by: Eelco Chaudron Acked-by: Kumar Amber Signed-off-by: David Marchand Signed-off-by: Simon Horman --- tests/dpif-netdev.at | 165 ++++++++++++++++++++++++++++++++++++ tests/system-dpdk.at | 197 ------------------------------------------- 2 files changed, 165 insertions(+), 197 deletions(-) diff --git a/tests/dpif-netdev.at b/tests/dpif-netdev.at index d0359b5eab6..c9474af0adb 100644 --- a/tests/dpif-netdev.at +++ b/tests/dpif-netdev.at @@ -852,3 +852,168 @@ OVS_VSWITCHD_STOP(["dnl /.*failed to put.*$/d /.*failed to flow_del.*$/d"]) AT_CLEANUP + +AT_SETUP([dpif-netdev - MFEX Autovalidator]) +AT_SKIP_IF([! $PYTHON3 -c "import scapy"], [], []) +AT_SKIP_IF([! 
$PYTHON3 $srcdir/genpkts.py 2000 > packets]) +OVS_VSWITCHD_START( + [add-port br0 p1 \ + -- set Interface p1 type=dummy-pmd], [], [], [--dummy-numa="0,0,0,0,1,1,1,1"]) + +AT_SKIP_IF([! ovs-appctl dpif-netdev/miniflow-parser-get | sed 1,4d | grep "True"], [], [dnl +]) + +AT_CHECK([ovs-appctl dpif-netdev/dpif-impl-set dpif_avx512], [0], [dnl +DPIF implementation set to dpif_avx512. +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set autovalidator], [0], [dnl +Miniflow extract implementation set to autovalidator. +]) + +cat packets | while read line; do + AT_CHECK([ovs-appctl netdev-dummy/receive p1 $line], [0], [ignore]) +done + +OVS_WAIT_UNTIL([test `ovs-vsctl get interface p1 statistics | grep -oP 'rx_packets=\s*\K\d+'` -ge 16000]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([dpif-netdev - MFEX Autovalidator Fuzzy]) +AT_SKIP_IF([! $PYTHON3 -c "import scapy"], [], []) +AT_SKIP_IF([! $PYTHON3 $srcdir/genpkts.py 2000 fuzzy > packets]) +OVS_VSWITCHD_START( + [add-port br0 p1 \ + -- set Interface p1 type=dummy-pmd], [], [], [--dummy-numa="0,0,0,0,1,1,1,1"]) + +AT_SKIP_IF([! ovs-appctl dpif-netdev/miniflow-parser-get | sed 1,4d | grep "True"], [], [dnl +]) + +AT_CHECK([ovs-appctl dpif-netdev/dpif-impl-set dpif_avx512], [0], [dnl +DPIF implementation set to dpif_avx512. +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set autovalidator], [0], [dnl +Miniflow extract implementation set to autovalidator. +]) + +cat packets | while read line; do + AT_CHECK([ovs-appctl netdev-dummy/receive p1 $line], [0], [ignore]) +done + +OVS_WAIT_UNTIL([test `ovs-vsctl get interface p1 statistics | grep -oP 'rx_packets=\s*\K\d+'` -ge 16000]) + +OVS_VSWITCHD_STOP(["dnl +/upcall: datapath reached the dynamic limit of .* flows./d"]) +AT_CLEANUP + +AT_SETUP([dpif-netdev - MFEX Configuration]) +OVS_VSWITCHD_START( + [set Open_vSwitch . 
other_config:pmd-cpu-mask=0x1 \ + -- add-port br0 p1 \ + -- set Interface p1 type=dummy-pmd], [], [], [--dummy-numa="0,0,0,0,1,1,1,1"]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set scalar 1], [2], +[], [dnl +Error: unknown argument 1. +ovs-appctl: ovs-vswitchd: server returned an error +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 6 study 300 xyz], [2], +[], [dnl +Error: invalid study_pkt_cnt value: xyz. +ovs-appctl: ovs-vswitchd: server returned an error +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set scalar abcd], [2], +[], [dnl +Error: unknown argument abcd. +ovs-appctl: ovs-vswitchd: server returned an error +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 0 scalar abcd], [2], +[], [dnl +Error: unknown argument abcd. +ovs-appctl: ovs-vswitchd: server returned an error +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd], [2], +[], [dnl +Error: -pmd option requires a thread id argument. +ovs-appctl: ovs-vswitchd: server returned an error +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set tudy abcd], [2], +[], [dnl +Error: unknown argument abcd. +ovs-appctl: ovs-vswitchd: server returned an error +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 7 study abcd], [2], +[], [dnl +Error: invalid study_pkt_cnt value: abcd. +ovs-appctl: ovs-vswitchd: server returned an error +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 0 study], [0], [dnl +Miniflow extract implementation set to study, on pmd thread 0, studying 128 packets. +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 0 study 512], [0], [dnl +Miniflow extract implementation set to study, on pmd thread 0, studying 512 packets. +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set study 512], [0], [dnl +Miniflow extract implementation set to study, studying 512 packets. 
+]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set study], [0], [dnl +Miniflow extract implementation set to study, studying 128 packets. +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 0 autovalidator], [0], [dnl +Miniflow extract implementation set to autovalidator, on pmd thread 0. +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd zero study], [2], +[], [dnl +Error: miniflow extract parser not changed, PMD thread passed is not valid: 'zero'. Pass a valid pmd thread ID. +ovs-appctl: ovs-vswitchd: server returned an error +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 1], [2], +[], [dnl +Error: no miniflow extract name provided. Output of miniflow-parser-get shows implementation list. +ovs-appctl: ovs-vswitchd: server returned an error +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 1 superstudy], [2], +[], [dnl +Error: unknown miniflow extract implementation superstudy. +ovs-appctl: ovs-vswitchd: server returned an error +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set superstudy], [2], +[], [dnl +Error: unknown miniflow extract implementation superstudy. +ovs-appctl: ovs-vswitchd: server returned an error +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 1 study -pmd], [2], +[], [dnl +Error: invalid study_pkt_cnt value: -pmd. +ovs-appctl: ovs-vswitchd: server returned an error +]) + +OVS_VSWITCHD_STOP(["dnl +/Error: unknown argument 1./d +/Error: invalid study_pkt_cnt value: xyz./d +/Error: unknown argument abcd./d +/Error: -pmd option requires a thread id argument./d +/Error: invalid study_pkt_cnt value: abcd./d +/Error: miniflow extract parser not changed, PMD thread passed is not valid: 'zero'. Pass a valid pmd thread ID./d +/Error: no miniflow extract name provided. 
Output of miniflow-parser-get shows implementation list./d +/Error: unknown miniflow extract implementation superstudy./d +/Error: invalid study_pkt_cnt value: -pmd./d"]) +AT_CLEANUP diff --git a/tests/system-dpdk.at b/tests/system-dpdk.at index ab72860a639..fab3dcbeafc 100644 --- a/tests/system-dpdk.at +++ b/tests/system-dpdk.at @@ -813,203 +813,6 @@ OVS_DPDK_STOP_VSWITCHD(["dnl AT_CLEANUP dnl -------------------------------------------------------------------------- - - -dnl -------------------------------------------------------------------------- -dnl MFEX Autovalidator -AT_SETUP([OVS-DPDK - MFEX Autovalidator]) -AT_KEYWORDS([dpdk]) -OVS_CHECK_GITHUB_ACTION() -OVS_DPDK_PRE_CHECK() -OVS_DPDK_START([--no-pci], [--enable-dummy]) -AT_CHECK([ovs-vsctl add-br br0 -- set bridge br0 datapath_type=netdev]) -AT_SKIP_IF([! ovs-appctl dpif-netdev/miniflow-parser-get | sed 1,4d | grep "True"], [], [dnl -]) - -AT_SKIP_IF([! $PYTHON3 -c "import scapy"], [], []) -AT_SKIP_IF([! $PYTHON3 $srcdir/genpkts.py 2000 > packets]) - -dnl Add userspace bridge and attach it to OVS -AT_CHECK([ovs-vsctl add-port br0 p1 -- set Interface p1 type=dummy-pmd]) -AT_CHECK([ovs-vsctl show], [], [stdout]) - -AT_CHECK([ovs-appctl dpif-netdev/dpif-impl-set dpif_avx512], [0], [dnl -DPIF implementation set to dpif_avx512. -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set autovalidator], [0], [dnl -Miniflow extract implementation set to autovalidator. 
-]) - -cat packets | while read line; do - AT_CHECK([ovs-appctl netdev-dummy/receive p1 $line], [0], [ignore]) -done - -OVS_WAIT_UNTIL([test `ovs-vsctl get interface p1 statistics | grep -oP 'rx_packets=\s*\K\d+'` -ge 16000]) - -dnl Clean up -AT_CHECK([ovs-vsctl del-port br0 p1], [], [stdout], [stderr]) -OVS_DPDK_STOP_VSWITCHD -AT_CLEANUP -dnl -------------------------------------------------------------------------- - -dnl -------------------------------------------------------------------------- -dnl MFEX Autovalidator Fuzzy -AT_SETUP([OVS-DPDK - MFEX Autovalidator Fuzzy]) -AT_KEYWORDS([dpdk]) -OVS_CHECK_GITHUB_ACTION() -OVS_DPDK_PRE_CHECK() -OVS_DPDK_START([--no-pci], [--enable-dummy]) -AT_CHECK([ovs-vsctl add-br br0 -- set bridge br0 datapath_type=netdev]) -AT_SKIP_IF([! ovs-appctl dpif-netdev/miniflow-parser-get | sed 1,4d | grep "True"], [], [dnl -]) - -AT_SKIP_IF([! $PYTHON3 -c "import scapy"], [], []) -AT_SKIP_IF([! $PYTHON3 $srcdir/genpkts.py 2000 fuzzy > packets]) - -dnl Add userspace bridge and attach it to OVS -AT_CHECK([ovs-vsctl add-port br0 p1 -- set Interface p1 type=dummy-pmd]) -AT_CHECK([ovs-vsctl show], [], [stdout]) - -AT_CHECK([ovs-appctl dpif-netdev/dpif-impl-set dpif_avx512], [0], [dnl -DPIF implementation set to dpif_avx512. -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set autovalidator], [0], [dnl -Miniflow extract implementation set to autovalidator. 
-]) - -cat packets | while read line; do - AT_CHECK([ovs-appctl netdev-dummy/receive p1 $line], [0], [ignore]) -done - -OVS_WAIT_UNTIL([test `ovs-vsctl get interface p1 statistics | grep -oP 'rx_packets=\s*\K\d+'` -ge 16000]) - -dnl Clean up -AT_CHECK([ovs-vsctl del-port br0 p1], [], [stdout], [stderr]) -OVS_DPDK_STOP_VSWITCHD(["dnl -/upcall: datapath reached the dynamic limit of .* flows./d"]) -AT_CLEANUP -dnl -------------------------------------------------------------------------- - -dnl -------------------------------------------------------------------------- -AT_SETUP([OVS-DPDK - MFEX Configuration]) -AT_KEYWORDS([dpdk]) -OVS_DPDK_PRE_CHECK() -OVS_DPDK_START([--no-pci], [--enable-dummy]) -AT_CHECK([ovs-vsctl --no-wait set Open_vSwitch . other_config:pmd-cpu-mask=0x1]) -dnl Add userspace bridge and attach it to OVS -AT_CHECK([ovs-vsctl add-br br0 -- set bridge br0 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br0 p1 -- set Interface p1 type=dummy-pmd]) -AT_CHECK([ovs-vsctl show], [], [stdout]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set scalar 1], [2], -[], [dnl -Error: unknown argument 1. -ovs-appctl: ovs-vswitchd: server returned an error -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 6 study 300 xyz], [2], -[], [dnl -Error: invalid study_pkt_cnt value: xyz. -ovs-appctl: ovs-vswitchd: server returned an error -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set scalar abcd], [2], -[], [dnl -Error: unknown argument abcd. -ovs-appctl: ovs-vswitchd: server returned an error -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 0 scalar abcd], [2], -[], [dnl -Error: unknown argument abcd. -ovs-appctl: ovs-vswitchd: server returned an error -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd], [2], -[], [dnl -Error: -pmd option requires a thread id argument. 
-ovs-appctl: ovs-vswitchd: server returned an error -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set tudy abcd], [2], -[], [dnl -Error: unknown argument abcd. -ovs-appctl: ovs-vswitchd: server returned an error -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 7 study abcd], [2], -[], [dnl -Error: invalid study_pkt_cnt value: abcd. -ovs-appctl: ovs-vswitchd: server returned an error -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 0 study], [0], [dnl -Miniflow extract implementation set to study, on pmd thread 0, studying 128 packets. -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 0 study 512], [0], [dnl -Miniflow extract implementation set to study, on pmd thread 0, studying 512 packets. -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set study 512], [0], [dnl -Miniflow extract implementation set to study, studying 512 packets. -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set study], [0], [dnl -Miniflow extract implementation set to study, studying 128 packets. -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 0 autovalidator], [0], [dnl -Miniflow extract implementation set to autovalidator, on pmd thread 0. -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd zero study], [2], -[], [dnl -Error: miniflow extract parser not changed, PMD thread passed is not valid: 'zero'. Pass a valid pmd thread ID. -ovs-appctl: ovs-vswitchd: server returned an error -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 1], [2], -[], [dnl -Error: no miniflow extract name provided. Output of miniflow-parser-get shows implementation list. -ovs-appctl: ovs-vswitchd: server returned an error -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 1 superstudy], [2], -[], [dnl -Error: unknown miniflow extract implementation superstudy. 
-ovs-appctl: ovs-vswitchd: server returned an error -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set superstudy], [2], -[], [dnl -Error: unknown miniflow extract implementation superstudy. -ovs-appctl: ovs-vswitchd: server returned an error -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 1 study -pmd], [2], -[], [dnl -Error: invalid study_pkt_cnt value: -pmd. -ovs-appctl: ovs-vswitchd: server returned an error -]) - -dnl Clean up -AT_CHECK([ovs-vsctl del-port br0 p1], [], [stdout], [stderr]) -OVS_DPDK_STOP_VSWITCHD(["dnl -/Error: unknown argument 1./d -/Error: invalid study_pkt_cnt value: xyz./d -/Error: unknown argument abcd./d -/Error: -pmd option requires a thread id argument./d -/Error: invalid study_pkt_cnt value: abcd./d -/Error: miniflow extract parser not changed, PMD thread passed is not valid: 'zero'. Pass a valid pmd thread ID./d -/Error: no miniflow extract name provided. Output of miniflow-parser-get shows implementation list./d -/Error: unknown miniflow extract implementation superstudy./d -/Error: invalid study_pkt_cnt value: -pmd./d"]) -AT_CLEANUP dnl -dnl -------------------------------------------------------------------------- - dnl -------------------------------------------------------------------------- dnl Setup user configured mempools AT_SETUP([OVS-DPDK - user configured mempool]) From 4102674b3ecadb0e20e512cc661cddbbc4b3d1f6 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Tue, 12 Dec 2023 12:43:34 -0500 Subject: [PATCH 488/833] ovsdb-idl: Preserve change_seqno when deleting rows. In the case of a weak reference, clearing all change_seqno's can delete useful information. Instead of clearing all seqno's when removing track_node, only clear those values in cases including row insertion, and row deleting if no dst_arcs remain. 
Fixes: 95689f166818 ("ovsdb-idl: Preserve references for deleted rows.") Reported-at: https://issues.redhat.com/browse/FDP-193 Signed-off-by: Mike Pattrick Acked-by: Dumitru Ceara Signed-off-by: Simon Horman --- lib/ovsdb-idl.c | 11 ++++++++-- tests/ovsdb-idl.at | 50 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 2 deletions(-) diff --git a/lib/ovsdb-idl.c b/lib/ovsdb-idl.c index 634fbb56df2..ba720474b66 100644 --- a/lib/ovsdb-idl.c +++ b/lib/ovsdb-idl.c @@ -177,6 +177,7 @@ static void ovsdb_idl_row_mark_backrefs_for_reparsing(struct ovsdb_idl_row *); static void ovsdb_idl_row_track_change(struct ovsdb_idl_row *, enum ovsdb_idl_change); static void ovsdb_idl_row_untrack_change(struct ovsdb_idl_row *); +static void ovsdb_idl_row_clear_changeseqno(struct ovsdb_idl_row *); static void ovsdb_idl_txn_abort_all(struct ovsdb_idl *); static bool ovsdb_idl_txn_extract_mutations(struct ovsdb_idl_row *, @@ -1374,6 +1375,7 @@ ovsdb_idl_track_clear__(struct ovsdb_idl *idl, bool flush_all) row->updated = NULL; } ovsdb_idl_row_untrack_change(row); + ovsdb_idl_row_clear_changeseqno(row); if (ovsdb_idl_row_is_orphan(row)) { ovsdb_idl_row_unparse(row); @@ -1632,6 +1634,7 @@ ovsdb_idl_process_update(struct ovsdb_idl_table *table, ru->columns); } else if (ovsdb_idl_row_is_orphan(row)) { ovsdb_idl_row_untrack_change(row); + ovsdb_idl_row_clear_changeseqno(row); ovsdb_idl_insert_row(row, ru->columns); } else { VLOG_ERR_RL(&semantic_rl, "cannot add existing row "UUID_FMT" to " @@ -2283,11 +2286,15 @@ ovsdb_idl_row_untrack_change(struct ovsdb_idl_row *row) return; } + ovs_list_remove(&row->track_node); + ovs_list_init(&row->track_node); +} + +static void ovsdb_idl_row_clear_changeseqno(struct ovsdb_idl_row *row) +{ row->change_seqno[OVSDB_IDL_CHANGE_INSERT] = row->change_seqno[OVSDB_IDL_CHANGE_MODIFY] = row->change_seqno[OVSDB_IDL_CHANGE_DELETE] = 0; - ovs_list_remove(&row->track_node); - ovs_list_init(&row->track_node); } static struct ovsdb_idl_row * 
diff --git a/tests/ovsdb-idl.at b/tests/ovsdb-idl.at index 1028b023787..f17cfdf1047 100644 --- a/tests/ovsdb-idl.at +++ b/tests/ovsdb-idl.at @@ -1466,6 +1466,56 @@ OVSDB_CHECK_IDL_TRACK([track, simple idl, initially populated, references, singl 006: done ]]) +OVSDB_CHECK_IDL_TRACK([track, simple idl, initially populated, weak references, insert+delete batch], + [['["idltest", + {"op": "insert", + "table": "simple", + "row": {"s": "row0_s"}, + "uuid-name": "uuid_row0_s"}, + {"op": "insert", + "table": "simple6", + "row": {"name": "row0_s6", + "weak_ref": ["set", + [["named-uuid", "uuid_row0_s"]] + ]}}]']], + [['condition simple [true];simple6 [true]' \ + '["idltest", + {"op": "insert", + "table": "simple", + "row": {"s": "row1_s"}, + "uuid-name": "uuid_row1_s"}, + {"op": "mutate", + "table": "simple6", + "where": [["name", "==", "row0_s6"]], + "mutations": [["weak_ref", "insert", ["set", [["named-uuid", "uuid_row1_s"]]]]]}]' \ + '+["idltest", + {"op": "delete", + "table": "simple", + "where": [["s", "==", "row1_s"]]}]' \ + '["idltest", + {"op": "insert", + "table": "simple", + "row": {"s": "row2_s"}}]']], + [[000: simple6: conditions unchanged +000: simple: conditions unchanged +001: table simple6: inserted row: name=row0_s6 weak_ref=[<0>] uuid=<1> +001: table simple6: updated columns: name weak_ref +001: table simple: inserted row: i=0 r=0 b=false s=row0_s u=<2> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<0> +001: table simple: updated columns: s +002: {"error":null,"result":[{"uuid":["uuid","<3>"]},{"count":1}]} +003: {"error":null,"result":[{"count":1}]} +004: table simple6: name=row0_s6 weak_ref=[<0>] uuid=<1> +004: table simple6: updated columns: weak_ref +004: table simple: inserted/deleted row: i=0 r=0 b=false s=row1_s u=<2> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<3> +004: table simple: updated columns: s +005: {"error":null,"result":[{"uuid":["uuid","<4>"]}]} +006: table simple6: name=row0_s6 weak_ref=[<0>] uuid=<1> +006: table simple: i=0 r=0 b=false s=row0_s u=<2> 
ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<0> +006: table simple: inserted row: i=0 r=0 b=false s=row2_s u=<2> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<4> +006: table simple: updated columns: s +007: done +]]) + dnl This test checks that deleting both the destination and source of the dnl reference doesn't remove the reference in the source tracked record. OVSDB_CHECK_IDL_TRACK([track, simple idl, initially populated, weak references, multiple deletes], From d51d4f42d3e880dce7c13b1437c8d5d1312ecbd5 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 18 Dec 2023 03:02:40 +0100 Subject: [PATCH 489/833] ovsdb: Fix incorrect sharing of UUID and _version columns. Datum of UUID and _version columns is accessed directly via ovsdb_row_get_uuid_rw() and ovsdb_row_get_version_rw() functions instead of ovsdb_data_* functions. Meaning, the data will be directly modified even if it is shared between rows. Fix that by unsharing the data whenever RW pointer is taken. The issue was mostly hidden because weak reference assessment code always called ovsdb_datum_subtract() even if not needed. This way all the new transaction rows were always implicitly unshared. Also making ovsdb_datum_subtract() call conditional, so the issue can be hit by existing unit tests. 
Fixes: 485ac63d10f8 ("ovsdb: Add lazy-copy support for ovsdb_datum objects.") Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- ovsdb/row.h | 2 ++ ovsdb/transaction.c | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/ovsdb/row.h b/ovsdb/row.h index 59f498a20d6..6f5e58acb3f 100644 --- a/ovsdb/row.h +++ b/ovsdb/row.h @@ -130,6 +130,7 @@ ovsdb_row_get_uuid(const struct ovsdb_row *row) static inline struct uuid * ovsdb_row_get_uuid_rw(struct ovsdb_row *row) { + ovsdb_datum_unshare(&row->fields[OVSDB_COL_UUID], &ovsdb_type_uuid); return &row->fields[OVSDB_COL_UUID].keys[0].uuid; } @@ -142,6 +143,7 @@ ovsdb_row_get_version(const struct ovsdb_row *row) static inline struct uuid * ovsdb_row_get_version_rw(struct ovsdb_row *row) { + ovsdb_datum_unshare(&row->fields[OVSDB_COL_VERSION], &ovsdb_type_uuid); return &row->fields[OVSDB_COL_VERSION].keys[0].uuid; } diff --git a/ovsdb/transaction.c b/ovsdb/transaction.c index 4fdc5bcea7b..f43533a8cdc 100644 --- a/ovsdb/transaction.c +++ b/ovsdb/transaction.c @@ -733,8 +733,10 @@ assess_weak_refs(struct ovsdb_txn *txn, struct ovsdb_txn_row *txn_row) ovsdb_datum_sort_unique(&deleted_refs, &column->type); /* Removing elements that references deleted rows. */ - ovsdb_datum_subtract(datum, &column->type, - &deleted_refs, &column->type); + if (deleted_refs.n) { + ovsdb_datum_subtract(datum, &column->type, + &deleted_refs, &column->type); + } ovsdb_datum_destroy(&deleted_refs, &column->type); /* Generating the difference between old and new data. */ From 6f11d9daad526bf51ee1538c7451a04572dc7a12 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 18 Dec 2023 03:02:41 +0100 Subject: [PATCH 490/833] ovsdb: transaction: Avoid diffs for different type references. While counting strong references current code first generates a difference between old and new datums and only after it checks the types of the atoms to be strong references. Similar thing happens while assessing weak references. 
First the added/removed are generated and then we check for atoms to be weak references. Check the type first to avoid unnecessary work. This change doubles the performance of transactions that modify large sets of references. For example, with this change applied, initial read of OVSDB file containing 136K transactions of large OVN port groups and address sets on my laptop takes 24 seconds vs 43 seconds without. Fixes: 4dbff9f0a685 ("ovsdb: transaction: Incremental reassessment of weak refs.") Fixes: b2712d026eae ("ovsdb: transaction: Use diffs for strong reference counting.") Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/ovsdb-types.h | 12 ++++++++++++ ovsdb/transaction.c | 7 ++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/lib/ovsdb-types.h b/lib/ovsdb-types.h index 9777efea332..688fe56337e 100644 --- a/lib/ovsdb-types.h +++ b/lib/ovsdb-types.h @@ -238,6 +238,18 @@ static inline bool ovsdb_type_is_map(const struct ovsdb_type *type) return type->value.type != OVSDB_TYPE_VOID; } +static inline bool ovsdb_type_has_strong_refs(const struct ovsdb_type *type) +{ + return ovsdb_base_type_is_strong_ref(&type->key) + || ovsdb_base_type_is_strong_ref(&type->value); +} + +static inline bool ovsdb_type_has_weak_refs(const struct ovsdb_type *type) +{ + return ovsdb_base_type_is_weak_ref(&type->key) + || ovsdb_base_type_is_weak_ref(&type->value); +} + #ifdef __cplusplus } #endif diff --git a/ovsdb/transaction.c b/ovsdb/transaction.c index f43533a8cdc..bbe4cddc117 100644 --- a/ovsdb/transaction.c +++ b/ovsdb/transaction.c @@ -322,7 +322,8 @@ update_row_ref_count(struct ovsdb_txn *txn, struct ovsdb_txn_row *r) const struct ovsdb_column *column = node->data; struct ovsdb_error *error; - if (bitmap_is_set(r->changed, column->index)) { + if (bitmap_is_set(r->changed, column->index) + && ovsdb_type_has_strong_refs(&column->type)) { if (r->old && !r->new) { error = ovsdb_txn_adjust_row_refs( txn, r->old, column, @@ -718,6 +719,10 @@ 
assess_weak_refs(struct ovsdb_txn *txn, struct ovsdb_txn_row *txn_row) unsigned int orig_n; bool zero = false; + if (!ovsdb_type_has_weak_refs(&column->type)) { + continue; + } + orig_n = datum->n; /* Collecting all key-value pairs that references deleted rows. */ From 0ef3ebb0cfef60a0f395ee7e2cd556c1dceab125 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 18 Dec 2023 03:02:42 +0100 Subject: [PATCH 491/833] ovsdb: transaction: Don't try to diff unchanged columns. While reassessing weak references the code attempts to collect added and removed atoms, even if the column didn't change. In case the column contains a large set, it may take a significant amount of time to process. Add a check for the column actually being changed either by removing references to deleted rows or by direct removal. For example, rows in OVN Port_Group tables frequently have two large sets - 'ports' and 'acls'. In case a new ACL is added to the set without changing the ports, ports don't need to be reassessed. Fixes: 4dbff9f0a685 ("ovsdb: transaction: Incremental reassessment of weak refs.") Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- ovsdb/transaction.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/ovsdb/transaction.c b/ovsdb/transaction.c index bbe4cddc117..a482588a0b8 100644 --- a/ovsdb/transaction.c +++ b/ovsdb/transaction.c @@ -745,13 +745,17 @@ assess_weak_refs(struct ovsdb_txn *txn, struct ovsdb_txn_row *txn_row) ovsdb_datum_destroy(&deleted_refs, &column->type); /* Generating the difference between old and new data. 
*/ - if (txn_row->old) { - ovsdb_datum_added_removed(&added, &removed, - &txn_row->old->fields[column->index], - datum, &column->type); - } else { - ovsdb_datum_init_empty(&removed); - ovsdb_datum_clone(&added, datum); + ovsdb_datum_init_empty(&added); + ovsdb_datum_init_empty(&removed); + if (datum->n != orig_n + || bitmap_is_set(txn_row->changed, column->index)) { + if (txn_row->old) { + ovsdb_datum_added_removed(&added, &removed, + &txn_row->old->fields[column->index], + datum, &column->type); + } else { + ovsdb_datum_clone(&added, datum); + } } /* Checking added data and creating new references. */ From 7c3df36762ba95c40698bf44e5be905c050b5730 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 14 Dec 2023 02:04:03 +0100 Subject: [PATCH 492/833] ovsdb-server.at: Enable debug logs in active-backup tests. It's almost impossible to debug test failures without them. Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- tests/ovsdb-server.at | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/tests/ovsdb-server.at b/tests/ovsdb-server.at index d36c3c117ec..35286db37d3 100644 --- a/tests/ovsdb-server.at +++ b/tests/ovsdb-server.at @@ -1830,9 +1830,14 @@ replication_schema > schema AT_CHECK([ovsdb-tool create db1 schema], [0], [stdout], [ignore]) AT_CHECK([ovsdb-tool create db2 schema], [0], [stdout], [ignore]) -AT_CHECK([ovsdb-server --detach --no-chdir --log-file=ovsdb-server1.log --pidfile --remote=punix:db.sock db1], [0], [ignore], [ignore]) +AT_CHECK([ovsdb-server -vfile --detach --no-chdir \ + --log-file=ovsdb-server1.log --pidfile --remote=punix:db.sock db1], + [0], [ignore], [ignore]) -AT_CHECK([ovsdb-server --detach --no-chdir --log-file=ovsdb-server2.log --pidfile=2.pid --remote=punix:db2.sock --unixctl=unixctl2 db2], [0], [ignore], [ignore]) +AT_CHECK([ovsdb-server -vfile --detach --no-chdir \ + --log-file=ovsdb-server2.log --pidfile=2.pid \ + --remote=punix:db2.sock --unixctl=unixctl2 db2], + [0], [ignore], 
[ignore]) dnl Try to connect without specifying the active server. AT_CHECK([ovs-appctl -t "`pwd`"/unixctl2 ovsdb-server/connect-active-ovsdb-server], [0], @@ -2153,9 +2158,16 @@ AT_CHECK([ovsdb-tool transact db2 \ dnl Start both 'db1' and 'db2'. on_exit 'kill `cat *.pid`' -AT_CHECK([ovsdb-server --detach --no-chdir --log-file=ovsdb-server1.log --pidfile --remote=punix:db.sock --unixctl="`pwd`"/unixctl db1 --active ], [0], [ignore], [ignore]) +AT_CHECK([ovsdb-server -vfile --detach --no-chdir \ + --log-file=ovsdb-server1.log --pidfile \ + --remote=punix:db.sock \ + --unixctl="$(pwd)"/unixctl db1 --active ], + [0], [ignore], [ignore]) -AT_CHECK([ovsdb-server --detach --no-chdir --log-file=ovsdb-server2.log --pidfile=2.pid --remote=punix:db2.sock --unixctl="`pwd`"/unixctl2 db2], [0], [ignore], [ignore]) +AT_CHECK([ovsdb-server -vfile --detach --no-chdir \ + --log-file=ovsdb-server2.log --pidfile=2.pid \ + --remote=punix:db2.sock --unixctl="$(pwd)"/unixctl2 db2], + [0], [ignore], [ignore]) OVS_WAIT_UNTIL([ovs-appctl -t "`pwd`"/unixctl ovsdb-server/sync-status |grep active]) OVS_WAIT_UNTIL([ovs-appctl -t "`pwd`"/unixctl2 ovsdb-server/sync-status |grep active]) From 0a2e16b67dbf16d21435f6ec5fc69d30ed98525d Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 14 Dec 2023 02:04:04 +0100 Subject: [PATCH 493/833] tests: ovsdb: Use diff -up format for replay test. It's easier to analyze failures when the lines that are different are shown next to each other. Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- tests/ovsdb-client.at | 6 +++--- tests/ovsdb-server.at | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/ovsdb-client.at b/tests/ovsdb-client.at index 68fb962bd7e..dcddb258745 100644 --- a/tests/ovsdb-client.at +++ b/tests/ovsdb-client.at @@ -270,8 +270,8 @@ AT_CHECK([ovsdb-client --replay=./replay_dir dnl dnl Waiting for client to exit the same way as it exited during recording. 
OVS_WAIT_WHILE([test -e ovsdb-client.pid]) -AT_CHECK([diff monitor.stdout monitor-replay.stdout]) -AT_CHECK([diff monitor.stderr monitor-replay.stderr]) +AT_CHECK([diff -u monitor.stdout monitor-replay.stdout]) +AT_CHECK([diff -u monitor.stderr monitor-replay.stderr]) dnl Stripping out timestamps, PIDs and poll_loop warnings from the log. dnl Also stripping socket_util errors as sockets are not used in replay. @@ -284,6 +284,6 @@ m4_define([CLEAN_LOG_FILE], CLEAN_LOG_FILE([monitor.log], [monitor.log.clear]) CLEAN_LOG_FILE([monitor-replay.log], [monitor-replay.log.clear]) -AT_CHECK([diff monitor.log.clear monitor-replay.log.clear]) +AT_CHECK([diff -u monitor.log.clear monitor-replay.log.clear]) AT_CLEANUP diff --git a/tests/ovsdb-server.at b/tests/ovsdb-server.at index 35286db37d3..6eb758e2293 100644 --- a/tests/ovsdb-server.at +++ b/tests/ovsdb-server.at @@ -2394,6 +2394,6 @@ CLEAN_LOG_FILE([2.log], [2.log.clear]) dnl Checking that databases and logs are equal. AT_CHECK([diff db.clear ./replay_dir/db.copy.clear]) -AT_CHECK([diff 1.log.clear 2.log.clear]) +AT_CHECK([diff -u 1.log.clear 2.log.clear]) AT_CLEANUP From d07a3b798d7d04332d0f65dddacb09fc7ba04558 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 14 Dec 2023 02:04:05 +0100 Subject: [PATCH 494/833] jsonrpc: Sort JSON objects while printing debug messages. We compare the logs in some tests, for example record/replay tests. And those fail if for some reason the JSON object traversal happens in the different order. Sort the output in debug logs in order to fix sporadic test failures. Should not affect performance in real-world cases as the actual outgoing message is still not sorted. 
Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/jsonrpc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/jsonrpc.c b/lib/jsonrpc.c index c8ce5362e16..3db5f76e280 100644 --- a/lib/jsonrpc.c +++ b/lib/jsonrpc.c @@ -221,19 +221,19 @@ jsonrpc_log_msg(const struct jsonrpc *rpc, const char *title, } if (msg->params) { ds_put_cstr(&s, ", params="); - json_to_ds(msg->params, 0, &s); + json_to_ds(msg->params, JSSF_SORT, &s); } if (msg->result) { ds_put_cstr(&s, ", result="); - json_to_ds(msg->result, 0, &s); + json_to_ds(msg->result, JSSF_SORT, &s); } if (msg->error) { ds_put_cstr(&s, ", error="); - json_to_ds(msg->error, 0, &s); + json_to_ds(msg->error, JSSF_SORT, &s); } if (msg->id) { ds_put_cstr(&s, ", id="); - json_to_ds(msg->id, 0, &s); + json_to_ds(msg->id, JSSF_SORT, &s); } VLOG_DBG("%s: %s %s%s", rpc->name, title, jsonrpc_msg_type_to_string(msg->type), ds_cstr(&s)); From e951af81cbfb09338a96f567867548f825c5f6ff Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 14 Dec 2023 02:04:06 +0100 Subject: [PATCH 495/833] ovsdb: jsonrpc-server: Fix the DSCP value in default options. The DSCP_DEFAULT is not zero and is a value that supposed to be used for all connections by default. Fixes: f125905cdd3d ("Allow configuring DSCP on controller and manager connections.") Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- ovsdb/jsonrpc-server.c | 1 + 1 file changed, 1 insertion(+) diff --git a/ovsdb/jsonrpc-server.c b/ovsdb/jsonrpc-server.c index a3ca48a7b35..45f7c8038c2 100644 --- a/ovsdb/jsonrpc-server.c +++ b/ovsdb/jsonrpc-server.c @@ -215,6 +215,7 @@ ovsdb_jsonrpc_default_options(const char *target) options->probe_interval = (stream_or_pstream_needs_probes(target) ? 
RECONNECT_DEFAULT_PROBE_INTERVAL : 0); + options->dscp = DSCP_DEFAULT; return options; } From 94371c0996b4fb92133ebdda847ad279265a2803 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 14 Dec 2023 02:04:07 +0100 Subject: [PATCH 496/833] ovsdb: trigger: Do not allow conversion in read-only mode. It's not a big problem, but it would be nice to ensure that the backup database cannot be locally converted. Fixes: e51879e99b3e ("ovsdb: Make OVSDB backup sever read only") Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- ovsdb/trigger.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ovsdb/trigger.c b/ovsdb/trigger.c index 0edcdd89c64..2a48ccc643a 100644 --- a/ovsdb/trigger.c +++ b/ovsdb/trigger.c @@ -278,6 +278,14 @@ ovsdb_trigger_try(struct ovsdb_trigger *t, long long int now) return false; } + if (t->read_only) { + trigger_convert_error( + t, ovsdb_error("not allowed", "conversion is not allowed " + "for read-only database %s", + t->db->schema->name)); + return false; + } + /* Validate parameters. */ const struct json *params = t->request->params; if (params->type != JSON_ARRAY || params->array.n != 2) { From 54b3eb531aa37061e4a548b0b9bbdbd5997c60d4 Mon Sep 17 00:00:00 2001 From: Frode Nordahl Date: Fri, 5 Jan 2024 16:41:45 +0000 Subject: [PATCH 497/833] backtrace: Fix error in log_backtrace() documentation. The documentation for log_backtrace() states the backtrace is logged at DEBUG level, while in reality it is logged at ERROR level. Fixes: d0b99d38edab ("backtrace: Add log_backtrace()") Acked-by: Mike Pattrick Signed-off-by: Frode Nordahl Signed-off-by: Ilya Maximets --- lib/backtrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/backtrace.h b/lib/backtrace.h index 9ccafd6d47c..a2506da5fff 100644 --- a/lib/backtrace.h +++ b/lib/backtrace.h @@ -26,7 +26,7 @@ #endif /* log_backtrace() will save the backtrace of a running program - * into the log at the DEBUG level. + * into the log at the ERROR level. 
* * To use it, insert the following code to where backtrace is * desired: From 2535d171a311e998a3e6926c306645a46bc36f9a Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Sat, 6 Jan 2024 09:23:20 +0000 Subject: [PATCH 498/833] vconn: Count vconn_sent regardless of log level. vconn_sent counter is supposed to increase each time send() return 0, no matter if the vconn log debug is on or off. Acked-by: Eelco Chaudron Signed-off-by: Cheng Li Signed-off-by: Ilya Maximets --- lib/vconn.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/vconn.c b/lib/vconn.c index b5567622779..e9603432d2d 100644 --- a/lib/vconn.c +++ b/lib/vconn.c @@ -682,7 +682,6 @@ do_send(struct vconn *vconn, struct ofpbuf *msg) ofpmsg_update_length(msg); if (!VLOG_IS_DBG_ENABLED()) { - COVERAGE_INC(vconn_sent); retval = (vconn->vclass->send)(vconn, msg); } else { char *s = ofp_to_string(msg->data, msg->size, NULL, NULL, 1); @@ -693,6 +692,9 @@ do_send(struct vconn *vconn, struct ofpbuf *msg) } free(s); } + if (!retval) { + COVERAGE_INC(vconn_sent); + } return retval; } From 21c61243fb755c0150a8b29bec1fd3a6135758b6 Mon Sep 17 00:00:00 2001 From: Gaetan Rivet Date: Mon, 13 Nov 2023 15:00:32 +0200 Subject: [PATCH 499/833] checkpatch: Fix personal word list storage. The enchant dictionary synchronizes additions to the source file. Keep the two word source separate by adding the extra words only to the current session. 
Fixes: 999c7773a60b ("checkpatch: add a comment spell-checker") Signed-off-by: Gaetan Rivet Acked-by: Roi Dayan Acked-by: Eelco Chaudron Signed-off-by: Aaron Conole --- utilities/checkpatch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utilities/checkpatch.py b/utilities/checkpatch.py index 6b210fab838..2dd02ee6420 100755 --- a/utilities/checkpatch.py +++ b/utilities/checkpatch.py @@ -93,7 +93,7 @@ def open_spell_check_dict(): global spell_check_dict spell_check_dict = enchant.Dict("en_US") for kw in extra_keywords: - spell_check_dict.add(kw) + spell_check_dict.add_to_session(kw) return True except: From 915b97971d580a3775fc78e1495c948a6d850076 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Mon, 13 Nov 2023 15:00:33 +0200 Subject: [PATCH 500/833] checkpatch.py: Load codespell dictionary. The codespell dictionary contains a list of widely used words which enchant alone could fail on, for example: refcount, pthread, enqueuing, etc. Load that dictionary, if it exists, into the enchant spell checker. 
Signed-off-by: Roi Dayan Acked-by: Eelco Chaudron Signed-off-by: Aaron Conole --- utilities/checkpatch.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/utilities/checkpatch.py b/utilities/checkpatch.py index 2dd02ee6420..e0cab6b9f82 100755 --- a/utilities/checkpatch.py +++ b/utilities/checkpatch.py @@ -39,6 +39,15 @@ def open_spell_check_dict(): import enchant + try: + import codespell_lib + codespell_dir = os.path.dirname(codespell_lib.__file__) + codespell_file = os.path.join(codespell_dir, 'data', 'dictionary.txt') + if not os.path.exists(codespell_file): + codespell_file = '' + except: + codespell_file = '' + try: extra_keywords = ['ovs', 'vswitch', 'vswitchd', 'ovs-vswitchd', 'netdev', 'selinux', 'ovs-ctl', 'dpctl', 'ofctl', @@ -91,7 +100,16 @@ def open_spell_check_dict(): 'syscall', 'lacp', 'ipf', 'skb', 'valgrind'] global spell_check_dict + spell_check_dict = enchant.Dict("en_US") + + if codespell_file: + with open(codespell_file) as f: + for line in f.readlines(): + words = line.strip().split('>')[1].strip(', ').split(',') + for word in words: + spell_check_dict.add_to_session(word.strip()) + for kw in extra_keywords: spell_check_dict.add_to_session(kw) From 7404d25ea72d275c02e71dae76c07b2f16d5b4f2 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Wed, 10 Jan 2024 11:04:36 +0100 Subject: [PATCH 501/833] system-dpdk: Test with mlx5 devices. The DPDK unit test only runs if vfio or igb_uio kernel modules are loaded: on systems with only mlx5, this test is always skipped. Besides, the test tries to grab the first device listed by dpdk-devbind.py, regardless of the PCI device status regarding kmod binding. Remove dependency on this DPDK script and use a minimal script that reads PCI sysfs. This script is not perfect, as one can imagine PCI devices bound to vfio-pci for virtual machines. Plus, this script only tries to take over vfio-pci devices. 
mlx5 devices can't be taken over blindly as it could mean losing connectivity to the machine if the netdev was in use for this system. For those two reasons, add a new environment variable DPDK_PCI_ADDR for testers to select the PCI device of their liking. For consistency and grep, the temporary file PCI_ADDR is renamed to DPDK_PCI_ADDR. Reviewed-by: Maxime Coquelin Acked-by: Eelco Chaudron Signed-off-by: David Marchand Acked-by: Kevin Traynor Signed-off-by: Kevin Traynor --- Documentation/topics/testing.rst | 11 ++++++--- tests/automake.mk | 1 + tests/system-dpdk-find-device.py | 39 ++++++++++++++++++++++++++++++++ tests/system-dpdk-macros.at | 10 ++------ tests/system-dpdk.at | 14 ++++++------ 5 files changed, 57 insertions(+), 18 deletions(-) create mode 100755 tests/system-dpdk-find-device.py diff --git a/Documentation/topics/testing.rst b/Documentation/topics/testing.rst index 5f6940b84d9..fb9b3e77b10 100644 --- a/Documentation/topics/testing.rst +++ b/Documentation/topics/testing.rst @@ -343,15 +343,20 @@ To see a list of all the available tests, run:: These tests support a `DPDK supported NIC`_. The tests operate on a wider set of environments, for instance, when a virtual port is used. -They do require proper DPDK variables (``DPDK_DIR`` and ``DPDK_BUILD``). Moreover you need to have root privileges to load the required modules and to bind -the NIC to the DPDK-compatible driver. +a PCI device to the DPDK-compatible driver. .. _DPDK supported NIC: https://core.dpdk.org/supported/#nics +The phy test will skip if no suitable PCI device is found. +It is possible to select which PCI device is used for this test by setting the +DPDK_PCI_ADDR environment variable, which is especially useful when testing +with a mlx5 device:: + + # DPDK_PCI_ADDR=0000:82:00.0 make check-dpdk + All tests are skipped if no hugepages are configured. User must look into the DPDK manual to figure out how to `Configure hugepages`_. 
-The phy test will skip if no compatible physical device is available. .. _Configure hugepages: https://doc.dpdk.org/guides-22.11/linux_gsg/sys_reqs.html diff --git a/tests/automake.mk b/tests/automake.mk index 2ae0aeecaff..10c9fbb01f3 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -520,6 +520,7 @@ CHECK_PYFILES = \ tests/flowgen.py \ tests/genpkts.py \ tests/ovsdb-monitor-sort.py \ + tests/system-dpdk-find-device.py \ tests/test-daemon.py \ tests/test-dpparse.py \ tests/test-json.py \ diff --git a/tests/system-dpdk-find-device.py b/tests/system-dpdk-find-device.py new file mode 100755 index 00000000000..ced74e7f310 --- /dev/null +++ b/tests/system-dpdk-find-device.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 +# Copyright (c) 2024 Red Hat, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from pathlib import Path +import os +import sys + +# The tester might want to select a PCI device, if so, trust it. +if 'DPDK_PCI_ADDR' in os.environ: + print(os.environ['DPDK_PCI_ADDR']) + sys.exit(0) + +for device in sorted(Path('/sys/bus/pci/devices').iterdir()): + class_path = device / 'class' + # Only consider Network class devices + if class_path.read_text().strip() != '0x020000': + continue + kmod_path = device / 'driver' / 'module' + kmod_name = kmod_path.resolve().name + # Only care about devices bound to vfio_pci or igb_uio. 
+ if kmod_name not in ['vfio_pci', 'igb_uio']: + continue + print(device.resolve().name) + sys.exit(0) + +sys.exit(1) diff --git a/tests/system-dpdk-macros.at b/tests/system-dpdk-macros.at index dcdfa55741c..3b5a3512d43 100644 --- a/tests/system-dpdk-macros.at +++ b/tests/system-dpdk-macros.at @@ -19,14 +19,8 @@ m4_define([OVS_DPDK_PRE_PHY_SKIP], [dnl Perform the precheck OVS_DPDK_PRE_CHECK() - dnl Check if VFIO or UIO driver is loaded - AT_SKIP_IF([ ! (lsmod | grep -E "igb_uio|vfio") ], [], [stdout]) - - dnl Find PCI address candidate, skip if there is no DPDK-compatible NIC - AT_CHECK([$DPDK_DIR/usertools/dpdk-devbind.py -s | head -n +4 | tail -1], [], [stdout]) - AT_CHECK([cat stdout | cut -d" " -s -f1 > PCI_ADDR]) - AT_SKIP_IF([ ! test -s PCI_ADDR ]) - + dnl Check if a device is available for DPDK + AT_SKIP_IF([ ! $abs_top_srcdir/tests/system-dpdk-find-device.py > DPDK_PCI_ADDR ]) ]) diff --git a/tests/system-dpdk.at b/tests/system-dpdk.at index fab3dcbeafc..1c97bf77720 100644 --- a/tests/system-dpdk.at +++ b/tests/system-dpdk.at @@ -63,7 +63,7 @@ OVS_DPDK_START() dnl Add userspace bridge and attach it to OVS AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk options:dpdk-devargs=$(cat PCI_ADDR)], [], [stdout], [stderr]) +AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk options:dpdk-devargs=$(cat DPDK_PCI_ADDR)], [], [stdout], [stderr]) AT_CHECK([ovs-vsctl show], [], [stdout]) sleep 2 @@ -240,7 +240,7 @@ OVS_DPDK_START() dnl Add userspace bridge and attach it to OVS and add policer AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk options:dpdk-devargs=$(cat PCI_ADDR)], [], [stdout], [stderr]) +AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk options:dpdk-devargs=$(cat DPDK_PCI_ADDR)], [], [stdout], [stderr]) AT_CHECK([ovs-vsctl 
set interface phy0 ingress_policing_rate=10000 ingress_policing_burst=1000]) AT_CHECK([ovs-vsctl show], [], [stdout]) sleep 2 @@ -380,7 +380,7 @@ OVS_DPDK_START() dnl Add userspace bridge and attach it to OVS and add egress policer AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk options:dpdk-devargs=$(cat PCI_ADDR)], [], [stdout], [stderr]) +AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk options:dpdk-devargs=$(cat DPDK_PCI_ADDR)], [], [stdout], [stderr]) OVS_WAIT_UNTIL([ovs-vsctl set port phy0 qos=@newqos -- --id=@newqos create qos type=egress-policer other-config:cir=1250000 other-config:cbs=2048]) AT_CHECK([ovs-appctl -t ovs-vswitchd qos/show phy0], [], [stdout]) sleep 2 @@ -509,7 +509,7 @@ dnl First set MTU to its default value and confirm that value, then increase the dnl Add userspace bridge and attach it to OVS with default MTU value AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk options:dpdk-devargs=$(cat PCI_ADDR)], [], [stdout], [stderr]) +AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk options:dpdk-devargs=$(cat DPDK_PCI_ADDR)], [], [stdout], [stderr]) AT_CHECK([ovs-vsctl show], [], [stdout]) dnl Check default MTU value in the datapath @@ -546,7 +546,7 @@ dnl First set an increased MTU value and confirm that value, then decrease the M dnl Add userspace bridge and attach it to OVS and modify MTU value AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk options:dpdk-devargs=$(cat PCI_ADDR)], [], [stdout], [stderr]) +AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk options:dpdk-devargs=$(cat DPDK_PCI_ADDR)], [], [stdout], [stderr]) AT_CHECK([ovs-vsctl set Interface phy0 mtu_request=9000]) 
AT_CHECK([ovs-vsctl show], [], [stdout]) @@ -665,7 +665,7 @@ OVS_DPDK_START() dnl Add userspace bridge and attach it to OVS and set MTU value to max upper bound AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk options:dpdk-devargs=$(cat PCI_ADDR)], [], [stdout], [stderr]) +AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk options:dpdk-devargs=$(cat DPDK_PCI_ADDR)], [], [stdout], [stderr]) AT_CHECK([ovs-vsctl set Interface phy0 mtu_request=9702]) AT_CHECK([ovs-vsctl show], [], [stdout]) @@ -703,7 +703,7 @@ OVS_DPDK_START() dnl Add userspace bridge and attach it to OVS and set MTU value to min lower bound AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk options:dpdk-devargs=$(cat PCI_ADDR)], [], [stdout], [stderr]) +AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk options:dpdk-devargs=$(cat DPDK_PCI_ADDR)], [], [stdout], [stderr]) AT_CHECK([ovs-vsctl set Interface phy0 mtu_request=68]) AT_CHECK([ovs-vsctl show], [], [stdout]) From 7b74454c72e3a73bdcb7585478a97a5e9639cfe4 Mon Sep 17 00:00:00 2001 From: Brad Cowie Date: Fri, 5 Jan 2024 17:58:31 +1300 Subject: [PATCH 502/833] system-tests: Test openflow matching for ct related packets with SNAT. Linux kernel commit ebddb1404900 ("net: move the nat function to nf_nat_ovs for ovs and tc") introduced a regression into the kernel datapath which prevented the openvswitch match key from being updated when nat was undone for packets in the related conntrack state. This issue caused these packets (usually ICMP/ICMPv6 error packets) to match the wrong openflow rule. This issue was fixed in linux kernel commit e6345d2824a3 ("netfilter: nf_nat: fix action not being set for all ct states"). This test will fail for linux kernel versions v6.2 to v6.6, so test is skipped for versions lower than v6.7. 
Link: https://lore.kernel.org/netdev/20231221224311.130319-1-brad@faucet.nz/ Suggested-by: Aaron Conole Signed-off-by: Brad Cowie Signed-off-by: Aaron Conole --- tests/ofproto-macros.at | 5 +++ tests/system-traffic.at | 89 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+) diff --git a/tests/ofproto-macros.at b/tests/ofproto-macros.at index 5a7b7a6e77c..932208debe5 100644 --- a/tests/ofproto-macros.at +++ b/tests/ofproto-macros.at @@ -19,6 +19,11 @@ s/dir\/[0-9]*\/br0.mgmt/dir\/XXXX\/br0.mgmt/ ' } +# Strips out byte counters from ovs-ofctl output +ofctl_strip_bytes () { + sed 's/ n_bytes=[0-9]*,//' +} + # Filter (multiline) vconn debug messages from ovs-vswitchd.log. # Use with vconn_sub() and ofctl_strip() print_vconn_debug () { awk -F\| < ovs-vswitchd.log ' diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 3cdd2f12526..42a3bd6a26e 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -6473,6 +6473,95 @@ AT_CHECK([tcpdump -n -v "icmp" -r p0.pcap 2>/dev/null | grep -E 'wrong|bad'], [1 OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([conntrack - ICMP related with SNAT]) +AT_SKIP_IF([test $HAVE_NC = no]) +AT_SKIP_IF([test $HAVE_TCPDUMP = no]) +OVS_CHECK_MIN_KERNEL(6, 7) +CHECK_CONNTRACK() +CHECK_CONNTRACK_NAT() +OVS_TRAFFIC_VSWITCHD_START() + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +NS_CHECK_EXEC([at_ns0], [ip link set dev p0 address 80:88:88:88:88:88]) +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +dnl Allow IP traffic from ns0->ns1, rewrite source IP with SNAT to 10.1.1.254. +dnl Only allow related ICMP responses back and undo NAT to restore original IP. +AT_DATA([flows.txt], [dnl +ct_state=-trk,ip actions=ct(table=0) +ct_state=+trk,ip,in_port=1 actions=ct(commit,nat(src=10.1.1.254)),2 +ct_state=+rel+trk,icmp,in_port=2,nw_dst=10.1.1.254 actions=ct(commit,table=1,nat) +dnl +dnl Handle ICMP related packets. +dnl These should match first rule with original IPs before SNAT. 
+dnl The second rule, which matches on the SNAT IP, shouldn't match any packets. +table=1,in_port=2,ct_state=+rel+trk,icmp,nw_src=10.1.1.2,nw_dst=10.1.1.1 action=1 +table=1,in_port=2,ct_state=+rel+trk,icmp,nw_dst=10.1.1.254 action=goto_table:2 +table=1,priority=0,action=drop +dnl +dnl Drop any ICMP related packets that incorrectly reach this table. +table=2,priority=0,action=drop +dnl +dnl ARP +priority=100 arp arp_op=1 action=move:OXM_OF_ARP_TPA[[]]->NXM_NX_REG2[[]],resubmit(,8),goto_table:10 +priority=10 arp action=normal +priority=0,action=drop +dnl +dnl MAC resolution table for IP in reg2, stores mac in OXM_OF_PKT_REG0 +table=8,reg2=0x0a0101f0/0xfffffff0,action=load:0x808888888888->OXM_OF_PKT_REG0[[]] +table=8,priority=0,action=load:0->OXM_OF_PKT_REG0[[]] +dnl ARP responder mac filled in at OXM_OF_PKT_REG0, or 0 for normal action. +dnl TPA IP in reg2. +dnl Swaps the fields of the ARP message to turn a query to a response. +table=10 priority=100 arp xreg0=0 action=normal +table=10 priority=10,arp,arp_op=1,action=load:2->OXM_OF_ARP_OP[[]],move:OXM_OF_ARP_SHA[[]]->OXM_OF_ARP_THA[[]],move:OXM_OF_PKT_REG0[[0..47]]->OXM_OF_ARP_SHA[[]],move:OXM_OF_ARP_SPA[[]]->OXM_OF_ARP_TPA[[]],move:NXM_NX_REG2[[]]->OXM_OF_ARP_SPA[[]],move:NXM_OF_ETH_SRC[[]]->NXM_OF_ETH_DST[[]],move:OXM_OF_PKT_REG0[[0..47]]->NXM_OF_ETH_SRC[[]],move:NXM_OF_IN_PORT[[]]->NXM_NX_REG3[[0..15]],load:0->NXM_OF_IN_PORT[[]],output:NXM_NX_REG3[[0..15]] +table=10 priority=0 action=drop +]) + +AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) + +rm p0.pcap +OVS_DAEMONIZE([tcpdump -n -U -i ovs-p0 -w p0.pcap], [tcpdump.pid]) +sleep 1 + +dnl UDP packets from ns0->ns1 should solicit "destination unreachable" response. +NS_CHECK_EXEC([at_ns0], [bash -c "echo a | nc $NC_EOF_OPT -u 10.1.1.2 10000"]) + +dnl Flush conntrack state. +dnl To verify related packets are handled exactly the same as before flushing. 
+AT_CHECK([ovs-appctl dpctl/flush-conntrack], [0]) + +dnl Solicit another "destination unreachable" response. +dnl To verify that after flushing, the same openflow rules are matched. +NS_CHECK_EXEC([at_ns0], [bash -c "echo a | nc $NC_EOF_OPT -u 10.1.1.2 10000"]) + +AT_CHECK([ovs-appctl revalidator/purge], [0]) +AT_CHECK([ovs-ofctl -O OpenFlow15 dump-flows br0 | ofctl_strip | ofctl_strip_bytes | sort | grep -v drop], [0], [dnl + n_packets=1, priority=10,arp actions=NORMAL + n_packets=2, ct_state=+rel+trk,icmp,in_port=2,nw_dst=10.1.1.254 actions=ct(commit,table=1,nat) + n_packets=2, ct_state=+trk,ip,in_port=1 actions=ct(commit,nat(src=10.1.1.254)),output:2 + n_packets=2, priority=100,arp,arp_op=1 actions=move:NXM_OF_ARP_TPA[[]]->NXM_NX_REG2[[]],resubmit(,8),goto_table:10 + n_packets=4, ct_state=-trk,ip actions=ct(table=0) + table=1, ct_state=+rel+trk,icmp,in_port=2,nw_dst=10.1.1.254 actions=goto_table:2 + table=1, n_packets=2, ct_state=+rel+trk,icmp,in_port=2,nw_src=10.1.1.2,nw_dst=10.1.1.1 actions=output:1 + table=10, n_packets=1, priority=10,arp,arp_op=1 actions=set_field:2->arp_op,move:NXM_NX_ARP_SHA[[]]->NXM_NX_ARP_THA[[]],move:OXM_OF_PKT_REG0[[0..47]]->NXM_NX_ARP_SHA[[]],move:NXM_OF_ARP_SPA[[]]->NXM_OF_ARP_TPA[[]],move:NXM_NX_REG2[[]]->NXM_OF_ARP_SPA[[]],move:NXM_OF_ETH_SRC[[]]->NXM_OF_ETH_DST[[]],move:OXM_OF_PKT_REG0[[0..47]]->NXM_OF_ETH_SRC[[]],move:NXM_OF_IN_PORT[[]]->NXM_NX_REG3[[0..15]],set_field:0->in_port,output:NXM_NX_REG3[[0..15]] + table=10, n_packets=1, priority=100,arp,reg0=0,reg1=0 actions=NORMAL + table=8, n_packets=1, priority=0 actions=set_field:0->xreg0 + table=8, n_packets=1, reg2=0xa0101f0/0xfffffff0 actions=set_field:0x808888888888->xreg0 +OFPST_FLOW reply (OF1.5): +]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2) | sed -e 's/dst=10.1.1.2[[45]][[0-9]]/dst=10.1.1.2XX/'], [0], [dnl +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),reply=(src=10.1.1.2,dst=10.1.1.2XX,sport=,dport=) +]) + +AT_CHECK([tcpdump -n -v "icmp" -r 
p0.pcap 2>/dev/null | grep -E 'wrong|bad'], [1], [ignore-nolog]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + dnl CHECK_FTP_NAT(TITLE, IP_ADDR, FLOWS, CT_DUMP) dnl dnl Checks the implementation of conntrack with FTP ALGs in combination with From 14ef8b451f47ef3fa14456790ac8ce4eb824054f Mon Sep 17 00:00:00 2001 From: Viacheslav Galaktionov Date: Mon, 11 Dec 2023 12:51:01 +0200 Subject: [PATCH 503/833] lib/conntrack: Only use given packet in protocol detection. The current protocol detection logic relies on two pieces of metadata passed as arguments: tp_src and tp_dst, which represent the L4 source and destination port numbers from the flow that triggered the current flow rule first, and was responsible for creating the current DP flow. Since multiple network flows of many different kinds, potentially using different protocols on all layers, can be processed by one flow rule, using the metadata of some unrelated flow might lead to unexpected results. For example, ICMP type and code can be interpreted as TCP source and destination ports. This can confuse the code responsible for the helper selection, leading to errors in traffic handling and incorrect detection of related flows. One of the easiest ways to fix this problem is to simply remove the tp_src and tp_dst parameters from the picture. The current code base has no good use for them. The helper selection logic was based on these values and therefore needs to be changed. Ensure that the helper specified in a flow rule is used, given it is compatible with the L4 protocol of the packet. When a flow rule does not specify a helper, one can still be picked using the given packet's metadata like TCP/UDP ports. 
Signed-off-by: Viacheslav Galaktionov Signed-off-by: Aaron Conole --- lib/conntrack.c | 40 +++++++++++++++++----------------------- lib/conntrack.h | 2 +- lib/dpif-netdev.c | 5 ++--- tests/test-conntrack.c | 6 +++--- 4 files changed, 23 insertions(+), 30 deletions(-) diff --git a/lib/conntrack.c b/lib/conntrack.c index 71c470661f6..9bb3c17f866 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -657,8 +657,7 @@ is_ftp_ctl(const enum ct_alg_ctl_type ct_alg_ctl) } static enum ct_alg_ctl_type -get_alg_ctl_type(const struct dp_packet *pkt, ovs_be16 tp_src, ovs_be16 tp_dst, - const char *helper) +get_alg_ctl_type(const struct dp_packet *pkt, const char *helper) { /* CT_IPPORT_FTP/TFTP is used because IPPORT_FTP/TFTP in not defined * in OSX, at least in in.h. Since these values will never change, remove @@ -668,26 +667,24 @@ get_alg_ctl_type(const struct dp_packet *pkt, ovs_be16 tp_src, ovs_be16 tp_dst, uint8_t ip_proto = get_ip_proto(pkt); struct udp_header *uh = dp_packet_l4(pkt); struct tcp_header *th = dp_packet_l4(pkt); - ovs_be16 ftp_src_port = htons(CT_IPPORT_FTP); - ovs_be16 ftp_dst_port = htons(CT_IPPORT_FTP); - ovs_be16 tftp_dst_port = htons(CT_IPPORT_TFTP); + ovs_be16 ftp_port = htons(CT_IPPORT_FTP); + ovs_be16 tftp_port = htons(CT_IPPORT_TFTP); - if (OVS_UNLIKELY(tp_dst)) { - if (helper && !strncmp(helper, "ftp", strlen("ftp"))) { - ftp_dst_port = tp_dst; - } else if (helper && !strncmp(helper, "tftp", strlen("tftp"))) { - tftp_dst_port = tp_dst; + if (helper) { + if ((ip_proto == IPPROTO_TCP) && + !strncmp(helper, "ftp", strlen("ftp"))) { + return CT_ALG_CTL_FTP; } - } else if (OVS_UNLIKELY(tp_src)) { - if (helper && !strncmp(helper, "ftp", strlen("ftp"))) { - ftp_src_port = tp_src; + if ((ip_proto == IPPROTO_UDP) && + !strncmp(helper, "tftp", strlen("tftp"))) { + return CT_ALG_CTL_TFTP; } } - if (ip_proto == IPPROTO_UDP && uh->udp_dst == tftp_dst_port) { + if (ip_proto == IPPROTO_UDP && uh->udp_dst == tftp_port) { return CT_ALG_CTL_TFTP; } else if 
(ip_proto == IPPROTO_TCP && - (th->tcp_src == ftp_src_port || th->tcp_dst == ftp_dst_port)) { + (th->tcp_src == ftp_port || th->tcp_dst == ftp_port)) { return CT_ALG_CTL_FTP; } return CT_ALG_CTL_NONE; @@ -1229,8 +1226,7 @@ process_one(struct conntrack *ct, struct dp_packet *pkt, bool force, bool commit, long long now, const uint32_t *setmark, const struct ovs_key_ct_labels *setlabel, const struct nat_action_info_t *nat_action_info, - ovs_be16 tp_src, ovs_be16 tp_dst, const char *helper, - uint32_t tp_id) + const char *helper, uint32_t tp_id) { /* Reset ct_state whenever entering a new zone. */ if (pkt->md.ct_state && pkt->md.ct_zone != zone) { @@ -1251,8 +1247,7 @@ process_one(struct conntrack *ct, struct dp_packet *pkt, conn = NULL; } - enum ct_alg_ctl_type ct_alg_ctl = get_alg_ctl_type(pkt, tp_src, tp_dst, - helper); + enum ct_alg_ctl_type ct_alg_ctl = get_alg_ctl_type(pkt, helper); if (OVS_LIKELY(conn)) { if (OVS_LIKELY(!conn_update_state_alg(ct, pkt, ctx, conn, @@ -1329,7 +1324,7 @@ conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch, ovs_be16 dl_type, bool force, bool commit, uint16_t zone, const uint32_t *setmark, const struct ovs_key_ct_labels *setlabel, - ovs_be16 tp_src, ovs_be16 tp_dst, const char *helper, + const char *helper, const struct nat_action_info_t *nat_action_info, long long now, uint32_t tp_id) { @@ -1345,7 +1340,7 @@ conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch, write_ct_md(packet, zone, NULL, NULL, NULL); } else if (conn && conn->key_node[CT_DIR_FWD].key.zone == zone && !force && - !get_alg_ctl_type(packet, tp_src, tp_dst, helper)) { + !get_alg_ctl_type(packet, helper)) { process_one_fast(zone, setmark, setlabel, nat_action_info, conn, packet); } else if (OVS_UNLIKELY(!conn_key_extract(ct, packet, dl_type, &ctx, @@ -1354,8 +1349,7 @@ conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch, write_ct_md(packet, zone, NULL, NULL, NULL); } else { process_one(ct, packet, &ctx, 
zone, force, commit, now, setmark, - setlabel, nat_action_info, tp_src, tp_dst, helper, - tp_id); + setlabel, nat_action_info, helper, tp_id); } } diff --git a/lib/conntrack.h b/lib/conntrack.h index 18c182f8501..0a888be4559 100644 --- a/lib/conntrack.h +++ b/lib/conntrack.h @@ -92,7 +92,7 @@ int conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch, ovs_be16 dl_type, bool force, bool commit, uint16_t zone, const uint32_t *setmark, const struct ovs_key_ct_labels *setlabel, - ovs_be16 tp_src, ovs_be16 tp_dst, const char *helper, + const char *helper, const struct nat_action_info_t *nat_action_info, long long now, uint32_t tp_id); void conntrack_clear(struct dp_packet *packet); diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 0aea9d0b8bf..df5bbf85a05 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -9446,9 +9446,8 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_, } conntrack_execute(dp->conntrack, packets_, aux->flow->dl_type, force, - commit, zone, setmark, setlabel, aux->flow->tp_src, - aux->flow->tp_dst, helper, nat_action_info_ref, - pmd->ctx.now / 1000, tp_id); + commit, zone, setmark, setlabel, helper, + nat_action_info_ref, pmd->ctx.now / 1000, tp_id); break; } diff --git a/tests/test-conntrack.c b/tests/test-conntrack.c index 24c93e4a488..292b6c048b8 100644 --- a/tests/test-conntrack.c +++ b/tests/test-conntrack.c @@ -91,7 +91,7 @@ ct_thread_main(void *aux_) ovs_barrier_block(&barrier); for (i = 0; i < n_pkts; i += batch_size) { conntrack_execute(ct, pkt_batch, dl_type, false, true, 0, NULL, NULL, - 0, 0, NULL, NULL, now, 0); + NULL, NULL, now, 0); DP_PACKET_BATCH_FOR_EACH (j, pkt, pkt_batch) { pkt_metadata_init_conn(&pkt->md); } @@ -178,7 +178,7 @@ pcap_batch_execute_conntrack(struct conntrack *ct_, if (flow.dl_type != dl_type) { conntrack_execute(ct_, &new_batch, dl_type, false, true, 0, - NULL, NULL, 0, 0, NULL, NULL, now, 0); + NULL, NULL, NULL, NULL, now, 0); dp_packet_batch_init(&new_batch); } 
dp_packet_batch_add(&new_batch, packet); @@ -186,7 +186,7 @@ pcap_batch_execute_conntrack(struct conntrack *ct_, if (!dp_packet_batch_is_empty(&new_batch)) { conntrack_execute(ct_, &new_batch, dl_type, false, true, 0, NULL, NULL, - 0, 0, NULL, NULL, now, 0); + NULL, NULL, now, 0); } } From 8abe32f95798447b5f6de6a6e95599a1ad78b07c Mon Sep 17 00:00:00 2001 From: Viacheslav Galaktionov Date: Mon, 11 Dec 2023 12:51:02 +0200 Subject: [PATCH 504/833] conntrack: Use helpers from committed connections. When a packet hits a flow rule without an explicitly specified helper, OvS has to rely on automatic application layer gateway detection to find related connections. This works as long as services are running on their standard ports, e.g. when FTP servers use TCP port 21. However, sometimes it's necessary to run services on non-standard ports. In that case, there is no way for OvS to guess which protocol is used within a given flow. Of course, this means that no related connections can be recognized. When a connection is committed with a particular helper, it's reasonable to assume this helper will be used in subsequent CT actions, as long as they don't override it. Achieve this behaviour by using the committed connection's helper when a flow rule does not specify one. Signed-off-by: Viacheslav Galaktionov Acked-by: Ivan Malov Signed-off-by: Aaron Conole --- Documentation/faq/releases.rst | 1 + NEWS | 3 +++ lib/conntrack.c | 9 +++++++++ 3 files changed, 13 insertions(+) diff --git a/Documentation/faq/releases.rst b/Documentation/faq/releases.rst index 362bf4ec7ba..aa69eefa131 100644 --- a/Documentation/faq/releases.rst +++ b/Documentation/faq/releases.rst @@ -140,6 +140,7 @@ Q: Are all features available with all datapaths? Conntrack Zone Limit 4.18 2.10 2.13 YES Conntrack NAT 4.6 2.6 2.8 YES Conntrack NAT6 4.6 2.6 2.8 3.0 + Conntrack Helper Persist. 
YES YES 3.2 NO Tunnel - LISP NO 2.11 NO NO Tunnel - STT NO 2.4 NO YES Tunnel - GRE 3.11 1.0 2.4 YES diff --git a/NEWS b/NEWS index 270ed667340..f6b4cbf997b 100644 --- a/NEWS +++ b/NEWS @@ -36,6 +36,9 @@ Post-v3.2.0 The existing behaviour is maintained and a non key:value pair value will be applied to all other PMD thread cores.'pmd-sleep-show' is updated to show the maximum sleep for each PMD thread core. + * The userspace conntrack module no longer requires the user to specify + connection helpers in all flow rules. Instead, the helper specified + during connection commit will be used by default. v3.2.0 - 17 Aug 2023 diff --git a/lib/conntrack.c b/lib/conntrack.c index 9bb3c17f866..013709bd622 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -1247,6 +1247,10 @@ process_one(struct conntrack *ct, struct dp_packet *pkt, conn = NULL; } + if (conn && helper == NULL) { + helper = conn->alg; + } + enum ct_alg_ctl_type ct_alg_ctl = get_alg_ctl_type(pkt, helper); if (OVS_LIKELY(conn)) { @@ -1336,6 +1340,11 @@ conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch, DP_PACKET_BATCH_FOR_EACH (i, packet, pkt_batch) { struct conn *conn = packet->md.conn; + + if (helper == NULL && conn != NULL) { + helper = conn->alg; + } + if (OVS_UNLIKELY(packet->md.ct_state == CS_INVALID)) { write_ct_md(packet, zone, NULL, NULL, NULL); } else if (conn && From 8aea66599e3495dbfcb76897139dd4a771f7310e Mon Sep 17 00:00:00 2001 From: Viacheslav Galaktionov Date: Mon, 11 Dec 2023 12:51:03 +0200 Subject: [PATCH 505/833] system-traffic.at: Test conntrack + FTP server running on a non-standard port. All existing test iterations assume that the FTP server is running on a standard port, which may not always be the case. These tests helped find problems in conntrack alg processing with non-standard ports. Perform the necessary adjustments to ensure the test suite can start the L7 server on a user-provided port. 
Signed-off-by: Viacheslav Galaktionov Signed-off-by: Aaron Conole --- tests/system-common-macros.at | 15 +++-- tests/system-traffic.at | 106 ++++++++++++++++++++++++++++++++++ tests/test-l7.py | 4 ++ 3 files changed, 120 insertions(+), 5 deletions(-) diff --git a/tests/system-common-macros.at b/tests/system-common-macros.at index 0620be0c702..01ebe364ee7 100644 --- a/tests/system-common-macros.at +++ b/tests/system-common-macros.at @@ -276,18 +276,23 @@ m4_define([NETNS_DAEMONIZE], m4_define([OVS_CHECK_FIREWALL], [AT_SKIP_IF([systemctl status firewalld 2>&1 | grep running > /dev/null])]) -# OVS_START_L7([namespace], [protocol]) +# OVS_START_L7([namespace], [protocol], [port]) # -# Start a server serving 'protocol' within 'namespace'. The server will exit -# when the test finishes. +# Start a server serving 'protocol' on port 'port' within 'namespace'. +# If 'port' is not specified, the standard one for 'protocol' will be used. +# The server will exit when the test finishes. # m4_define([OVS_START_L7], [PIDFILE=$(mktemp $2XXX.pid) - NETNS_DAEMONIZE([$1], [[$PYTHON3 $srcdir/test-l7.py $2]], [$PIDFILE]) + NETNS_DAEMONIZE([$1], [[$PYTHON3 $srcdir/test-l7.py $2 $3]], [$PIDFILE]) dnl netstat doesn't print http over IPv6 as "http6"; drop the number. 
PROTO=$(echo $2 | sed -e 's/\([[a-zA-Z]]*\).*/\1/') - OVS_WAIT_UNTIL([NS_EXEC([$1], [netstat -l | grep $PROTO])]) + if test -z "$3"; then + OVS_WAIT_UNTIL([NS_EXEC([$1], [netstat -l | grep $PROTO])]) + else + OVS_WAIT_UNTIL([NS_EXEC([$1], [netstat -ln | grep :$3])]) + fi ] ) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 42a3bd6a26e..283706c6e12 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -5621,6 +5621,112 @@ tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),reply=(src= OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([conntrack - FTP non-standard port]) +AT_SKIP_IF([test $HAVE_FTP = no]) +CHECK_CONNTRACK() +CHECK_CONNTRACK_ALG() +OVS_TRAFFIC_VSWITCHD_START() + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +dnl Allow any traffic from ns0->ns1. Only allow nd, return traffic from ns1->ns0. +AT_DATA([flows1.txt], [dnl +table=0,priority=1,action=drop +table=0,priority=10,arp,action=normal +table=0,priority=10,icmp,action=normal +table=0,priority=100,in_port=1,tcp,action=ct(alg=ftp,commit),2 +table=0,priority=100,in_port=2,tcp,action=ct(table=1) +table=1,in_port=2,tcp,ct_state=+trk+est,action=1 +table=1,in_port=2,tcp,ct_state=+trk+rel,action=1 +]) + +dnl Similar policy but without allowing all traffic from ns0->ns1. 
+AT_DATA([flows2.txt], [dnl +table=0,priority=1,action=drop +table=0,priority=10,arp,action=normal +table=0,priority=10,icmp,action=normal + +dnl Allow outgoing TCP connections, and treat them as FTP +table=0,priority=100,in_port=1,tcp,action=ct(table=1) +table=1,in_port=1,tcp,ct_state=+trk+new,action=ct(commit,alg=ftp),2 +table=1,in_port=1,tcp,ct_state=+trk+est,action=2 + +dnl Allow incoming FTP data connections and responses to existing connections +table=0,priority=100,in_port=2,tcp,action=ct(table=1) +table=1,in_port=2,tcp,ct_state=+trk+new+rel,action=ct(commit),1 +table=1,in_port=2,tcp,ct_state=+trk+est,action=1 +table=1,in_port=2,tcp,ct_state=+trk-new+rel,action=1 +]) + +dnl flows3 is same as flows1, except no ALG is specified. +AT_DATA([flows3.txt], [dnl +table=0,priority=1,action=drop +table=0,priority=10,arp,action=normal +table=0,priority=10,icmp,action=normal +table=0,priority=100,in_port=1,tcp,action=ct(commit),2 +table=0,priority=100,in_port=2,tcp,action=ct(table=1) +table=1,in_port=2,tcp,ct_state=+trk+est,action=1 +table=1,in_port=2,tcp,ct_state=+trk+rel,action=1 +]) + +AT_CHECK([ovs-ofctl --bundle replace-flows br0 flows1.txt]) + +OVS_START_L7([at_ns0], [ftp], [11111]) +OVS_START_L7([at_ns1], [ftp], [11111]) + +dnl FTP requests from p1->p0 should fail due to network failure. +dnl Try 3 times, in 1 second intervals. +NS_CHECK_EXEC([at_ns1], [wget ftp://10.1.1.1:11111 --no-passive-ftp -t 3 -T 1 -v -o wget1.log], [4]) +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.1)], [0], [dnl +]) + +dnl FTP requests from p0->p1 should work fine. +NS_CHECK_EXEC([at_ns0], [wget ftp://10.1.1.2:11111 --no-passive-ftp -t 3 -T 1 --retry-connrefused -v -o wget0.log]) +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0], [dnl +tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),reply=(src=10.1.1.2,dst=10.1.1.1,sport=,dport=),protoinfo=(state=),helper=ftp +]) + +dnl Try the second set of flows. 
+AT_CHECK([ovs-ofctl --bundle replace-flows br0 flows2.txt]) +AT_CHECK([ovs-appctl dpctl/flush-conntrack]) + +dnl FTP requests from p1->p0 should fail due to network failure. +dnl Try 3 times, in 1 second intervals. +NS_CHECK_EXEC([at_ns1], [wget ftp://10.1.1.1:11111 --no-passive-ftp -t 3 -T 1 -v -o wget1.log], [4]) +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.1)], [0], [dnl +]) + +dnl Active FTP requests from p0->p1 should work fine. +NS_CHECK_EXEC([at_ns0], [wget ftp://10.1.1.2:11111 --no-passive-ftp -t 3 -T 1 --retry-connrefused -v -o wget0-1.log]) +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0], [dnl +tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),reply=(src=10.1.1.2,dst=10.1.1.1,sport=,dport=),protoinfo=(state=),helper=ftp +tcp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=,dport=),reply=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),protoinfo=(state=) +]) + +AT_CHECK([ovs-appctl dpctl/flush-conntrack]) + +dnl Passive FTP requests from p0->p1 should work fine. +NS_CHECK_EXEC([at_ns0], [wget ftp://10.1.1.2:11111 -t 3 -T 1 --retry-connrefused -v -o wget0-2.log]) +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0], [dnl +tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),reply=(src=10.1.1.2,dst=10.1.1.1,sport=,dport=),protoinfo=(state=),helper=ftp +]) + +dnl Try the third set of flows, without alg specifier. +AT_CHECK([ovs-ofctl --bundle replace-flows br0 flows3.txt]) +AT_CHECK([ovs-appctl dpctl/flush-conntrack]) + +dnl FTP control requests from p0->p1 should work fine, but helper will not be assigned. 
+NS_CHECK_EXEC([at_ns0], [wget ftp://10.1.1.2:11111 --no-passive-ftp -t 3 -T 1 --retry-connrefused -v -o wget0-3.log], [4]) +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0], [dnl +tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),reply=(src=10.1.1.2,dst=10.1.1.1,sport=,dport=),protoinfo=(state=) +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([conntrack - FTP with expectation dump]) AT_SKIP_IF([test $HAVE_FTP = no]) CHECK_CONNTRACK() diff --git a/tests/test-l7.py b/tests/test-l7.py index 32a77392c64..97cd4f29a60 100755 --- a/tests/test-l7.py +++ b/tests/test-l7.py @@ -86,6 +86,8 @@ def main(): description='Run basic application servers.') parser.add_argument('proto', default='http', nargs='?', help='protocol to serve (%s)' % protocols) + parser.add_argument('port', default=0, nargs='?', + help='server port number') args = parser.parse_args() if args.proto not in protocols: @@ -95,6 +97,8 @@ def main(): constructor = SERVERS[args.proto][0] handler = SERVERS[args.proto][1] port = SERVERS[args.proto][2] + if args.port != 0: + port = args.port srv = constructor(('', port), handler) srv.serve_forever() From c8d85a0e459401f05d177c4814ca8b60a802f13f Mon Sep 17 00:00:00 2001 From: Aaron Conole Date: Wed, 10 Jan 2024 20:31:56 -0500 Subject: [PATCH 506/833] AUTHORS: Add Brad Cowie Signed-off-by: Aaron Conole --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 6b8367ef4a0..fb03b5dfeea 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -92,6 +92,7 @@ Billy O'Mahony billy.o.mahony@intel.com Binbin Xu xu.binbin1@zte.com.cn Bodo Petermann b.petermann@syseleven.de Boleslaw Tokarski boleslaw.tokarski@jollamobile.com +Brad Cowie brad@faucet.nz Brian Haley haleyb.dev@gmail.com Brian Kruger bkruger+ovsdev@gmail.com Bruce Davie bdavie@vmware.com From ac04dfa7ec362616003f843ebfc46f074076aebb Mon Sep 17 00:00:00 2001 From: Terry Wilson Date: Mon, 18 Dec 2023 17:31:24 -0600 Subject: [PATCH 507/833] python: idl: Handle 
monitor_canceled. Currently python-ovs claims to be "db change aware" but does not parse the "monitor_canceled" notification. Transactions can continue being made, but the monitor updates will not be sent. This handles monitor_cancel similarly to how ovsdb-cs currently does. Fixes: c39751e44539 ("python: Monitor Database table to manage lifecycle of IDL client.") Signed-off-by: Terry Wilson Acked-by: Dumitru Ceara Signed-off-by: Simon Horman --- python/ovs/db/idl.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/python/ovs/db/idl.py b/python/ovs/db/idl.py index 16ece0334cf..a80da84e7a9 100644 --- a/python/ovs/db/idl.py +++ b/python/ovs/db/idl.py @@ -299,6 +299,7 @@ def __init__(self, remote, schema_helper, probe_interval=None, self._server_schema_request_id = None self._server_monitor_request_id = None self._db_change_aware_request_id = None + self._monitor_cancel_request_id = None self._server_db_name = '_Server' self._server_db_table = 'Database' self.server_tables = None @@ -481,6 +482,10 @@ def run(self): break else: self.__parse_update(msg.params[1], OVSDB_UPDATE) + elif self.handle_monitor_canceled(msg): + break + elif self.handle_monitor_cancel_reply(msg): + break elif (msg.type == ovs.jsonrpc.Message.T_REPLY and self._monitor_request_id is not None and self._monitor_request_id == msg.id): @@ -616,6 +621,33 @@ def run(self): return initial_change_seqno != self.change_seqno + def handle_monitor_canceled(self, msg): + if msg.type != msg.T_NOTIFY: + return False + if msg.method != "monitor_canceled": + return False + + if msg.params[0] == str(self.uuid): + params = [str(self.server_monitor_uuid)] + elif msg.params[0] == str(self.server_monitor_uuid): + params = [str(self.uuid)] + else: + return False + + mc_msg = ovs.jsonrpc.Message.create_request("monitor_cancel", params) + self._monitor_cancel_request_id = mc_msg.id + self.send_request(mc_msg) + self.restart_fsm() + return True + + def handle_monitor_cancel_reply(self, 
msg): + if msg.type != msg.T_REPLY: + return False + if msg.id != self._monitor_cancel_request_id: + return False + self._monitor_cancel_request_id = None + return True + def compose_cond_change(self): if not self.cond_changed: return From 67ee6308781dc24acfc0e50ab7b3e6ad71d6333e Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 8 Jan 2024 15:52:23 +0100 Subject: [PATCH 508/833] ovsdb-idl.at: Test IDL behavior during database conversion. Add new 'monitor' command to test-ovsdb utilities to make them just run IDL loop infinitely. Other commands can still be placed before the 'monitor', e.g. setting up conditions, tracking, running a few transactions. Having that, adding a couple test cases for IDL with online database conversion. Test checks that IDL receives monitor cancellation notification and successfully re-sends monitor requests. While at it, adding debug logging to ovsdb-server processes for easier debugging. While working on a test the issue was discovered that schema for standalone databases is not getting updated in the _Server database after conversion. Checking the new schema bits only for clustered databases for now. 
Signed-off-by: Ilya Maximets Acked-by: Dumitru Ceara Signed-off-by: Simon Horman --- tests/ovsdb-idl.at | 100 +++++++++++++++++++++++++++++++++++++++++--- tests/test-ovsdb.c | 7 ++++ tests/test-ovsdb.py | 27 +++++++----- 3 files changed, 119 insertions(+), 15 deletions(-) diff --git a/tests/ovsdb-idl.at b/tests/ovsdb-idl.at index f17cfdf1047..fb568dd823c 100644 --- a/tests/ovsdb-idl.at +++ b/tests/ovsdb-idl.at @@ -29,8 +29,8 @@ m4_define([OVSDB_START_IDLTEST], AT_CHECK([ovsdb-tool create db dnl m4_if([$2], [], [$abs_srcdir/idltest.ovsschema], [$2])]) PKIDIR=$abs_top_builddir/tests - AT_CHECK([ovsdb-server -vconsole:warn --log-file --detach --no-chdir dnl - --pidfile --remote=punix:socket dnl + AT_CHECK([ovsdb-server -vconsole:warn -vfile:dbg --log-file dnl + --detach --no-chdir --pidfile --remote=punix:socket dnl m4_if(m4_substr($1, 0, 5), [pssl:], [--private-key=$PKIDIR/testpki-privkey2.pem dnl --certificate=$PKIDIR/testpki-cert2.pem dnl @@ -57,9 +57,9 @@ m4_define([OVSDB_CLUSTER_START_IDLTEST], done on_exit 'kill $(cat s*.pid)' for i in $(seq $n); do - AT_CHECK([ovsdb-server -vraft -vconsole:warn --detach --no-chdir \ - --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i \ - --remote=punix:s$i.ovsdb \ + AT_CHECK([ovsdb-server -vraft -vconsole:warn -vfile:dbg --detach \ + --no-chdir --log-file=s$i.log --pidfile=s$i.pid \ + --unixctl=s$i --remote=punix:s$i.ovsdb \ m4_if([$2], [], [], [--remote=$2]) s$i.db]) done @@ -2756,3 +2756,93 @@ OVSDB_CHECK_IDL_PERS_UUID_INSERT([simple idl, persistent uuid insert], 011: done ]], [['This UUID would duplicate a UUID already present within the table or deleted within the same transaction']]) + + +m4_define([OVSDB_CHECK_IDL_CHANGE_AWARE], + [AT_SETUP([simple idl, database change aware, online conversion - $1]) + AT_KEYWORDS([ovsdb server idl db_change_aware conversion $1]) + + m4_if([$1], [clustered], + [OVSDB_CLUSTER_START_IDLTEST([1], [punix:socket])], + [OVSDB_START_IDLTEST]) + + dnl Add some data. 
+ AT_CHECK([[ovsdb-client transact unix:socket '["idltest", + {"op": "insert", + "table": "simple", + "row": {"i": 1, + "r": 2.0, + "b": true, + "s": "first row", + "u": ["uuid", "84f5c8f5-ac76-4dbc-a24f-8860eb407fc1"], + "ia": ["set", [1, 2, 3]], + "ra": ["set", [-0.5]], + "ba": ["set", [true]], + "sa": ["set", ["abc", "def"]], + "ua": ["set", [["uuid", "69443985-7806-45e2-b35f-574a04e720f9"], + ["uuid", "aad11ef0-816a-4b01-93e6-03b8b4256b98"]]]}}, + {"op": "insert", + "table": "simple", + "row": {"b": false, "s": "second row"}}, + {"op": "insert", + "table": "simple", + "row": {"b": true, "s": "third row"}} + ]']], [0], [stdout]) + + dnl Create a new schema by adding 'extra_column' to the 'simple' table. + AT_CHECK([sed 's/"ua": {/"extra_column":{"type": "string"},"ua": {/ + s/1.2.3/1.2.4/' \ + $abs_srcdir/idltest.ovsschema > new-idltest.ovsschema]) + dnl Try "needs-conversion". + AT_CHECK([ovsdb-client needs-conversion unix:socket $abs_srcdir/idltest.ovsschema], [0], [no +]) + AT_CHECK([ovsdb-client needs-conversion unix:socket new-idltest.ovsschema], [0], [yes +]) + + dnl Conditionally exclude the second row from monitoring. + m4_define([COND], [['condition simple [["b","==",true]]']]) + + dnl Start monitoring. + OVS_DAEMONIZE([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t30 \ + idl unix:socket COND monitor \ + >idl-c.out 2>idl-c.err], [idl-c.pid]) + AT_CAPTURE_FILE([idl-c.out]) + AT_CAPTURE_FILE([idl-c.err]) + + OVS_DAEMONIZE([$PYTHON3 $srcdir/test-ovsdb.py -t30 \ + idl $srcdir/idltest.ovsschema unix:socket COND monitor \ + >idl-python.out 2>idl-python.err], [idl-python.pid]) + AT_CAPTURE_FILE([idl-python.out]) + AT_CAPTURE_FILE([idl-python.err]) + + dnl Wait for monitors to receive the data. + OVS_WAIT_UNTIL([grep -q 'third row' idl-c.err]) + OVS_WAIT_UNTIL([grep -q 'third row' idl-python.err]) + + dnl Convert the database. 
+ AT_CHECK([ovsdb-client convert unix:socket new-idltest.ovsschema]) + + dnl Check for the monitor cancellation and the data being requested again. + m4_foreach([FILE], [[idl-c], [idl-python]], + [OVS_WAIT_UNTIL([grep -q 'monitor_canceled' FILE.err]) + OVS_WAIT_UNTIL([test 2 -eq $(grep -c 'send request, method="monitor_cond_since", params=."idltest"' FILE.err)]) + + dnl XXX: Checking for the new schema bits conditionally because standalone + dnl databases are not updating the schema in the _Server database properly. + m4_if([$1], [clustered], [OVS_WAIT_UNTIL([grep -q 'extra_column' FILE.err])]) + + dnl Check that there were no unexpected messages. + AT_CHECK([! grep 'unexpected' FILE.err]) + + dnl Check that the data is received twice and the condition is working. + AT_CHECK([sort FILE.out | uuidfilt], [0], +[[000: simple: change conditions +001: table simple: i=0 r=0 b=true s=third row u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> +001: table simple: i=1 r=2 b=true s=first row u=<2> ia=[1 2 3] ra=[-0.5] ba=[true] sa=[abc def] ua=[<3> <4>] uuid=<5> +002: table simple: i=0 r=0 b=true s=third row u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> +002: table simple: i=1 r=2 b=true s=first row u=<2> ia=[1 2 3] ra=[-0.5] ba=[true] sa=[abc def] ua=[<3> <4>] uuid=<5> +]])]) + AT_CLEANUP]) + +OVSDB_CHECK_IDL_CHANGE_AWARE([standalone]) +OVSDB_CHECK_IDL_CHANGE_AWARE([clustered]) diff --git a/tests/test-ovsdb.c b/tests/test-ovsdb.c index c761822e62e..d6a47de336e 100644 --- a/tests/test-ovsdb.c +++ b/tests/test-ovsdb.c @@ -2800,6 +2800,13 @@ do_idl(struct ovs_cmdl_context *ctx) } else { print_idl(idl, step++, terse); } + + /* Just run IDL forever for a simple monitoring. 
*/ + if (!strcmp(arg, "monitor")) { + seqno = ovsdb_idl_get_seqno(idl); + i--; + continue; + } } seqno = ovsdb_idl_get_seqno(idl); diff --git a/tests/test-ovsdb.py b/tests/test-ovsdb.py index 71248854fc7..48f8ee2d704 100644 --- a/tests/test-ovsdb.py +++ b/tests/test-ovsdb.py @@ -757,16 +757,23 @@ def mock_notify(event, row, updates=None): poller.block() else: # Wait for update. - while idl.change_seqno == seqno and not idl.run(): - rpc.run() - - poller = ovs.poller.Poller() - idl.wait(poller) - rpc.wait(poller) - poller.block() - - print_idl(idl, step, terse) - step += 1 + while True: + while idl.change_seqno == seqno and not idl.run(): + rpc.run() + + poller = ovs.poller.Poller() + idl.wait(poller) + rpc.wait(poller) + poller.block() + + print_idl(idl, step, terse) + step += 1 + + # Run IDL forever in case of a simple monitor, otherwise + # break and execute the command. + seqno = idl.change_seqno + if command != "monitor": + break seqno = idl.change_seqno From 7ab8f6f7c77cf16e6b81d10470fe49611fd85717 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 9 Jan 2024 20:54:03 +0100 Subject: [PATCH 509/833] ovsdb: Preserve column diffs read from the storage. Database file contains the column diff, but it is discarded once the 'new' state of a row is constructed. Keep it in the transaction row, as it can be used later by other parts of the code. Diffs do not live long, we keep them around only while transaction is alive, so should not affect memory consumption. Users for this data will be added in later commits. 
Acked-by: Mike Pattrick Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- ovsdb/execution.c | 14 +++++++--- ovsdb/file.c | 22 +++++++++++----- ovsdb/ovsdb-server.c | 7 +++-- ovsdb/table.c | 6 +++-- ovsdb/transaction.c | 62 ++++++++++++++++++++++++++++++++++---------- ovsdb/transaction.h | 6 +++-- tests/test-ovsdb.c | 2 +- 7 files changed, 88 insertions(+), 31 deletions(-) diff --git a/ovsdb/execution.c b/ovsdb/execution.c index 5587ef96fe5..8c20c3b54a1 100644 --- a/ovsdb/execution.c +++ b/ovsdb/execution.c @@ -490,9 +490,11 @@ update_row_cb(const struct ovsdb_row *row, void *ur_) ur->n_matches++; if (!ovsdb_row_equal_columns(row, ur->row, ur->columns)) { + struct ovsdb_row *rw_row; + + ovsdb_txn_row_modify(ur->txn, row, &rw_row, NULL); ovsdb_error_assert(ovsdb_row_update_columns( - ovsdb_txn_row_modify(ur->txn, row), - ur->row, ur->columns, false)); + rw_row, ur->row, ur->columns, false)); } return true; @@ -572,10 +574,14 @@ static bool mutate_row_cb(const struct ovsdb_row *row, void *mr_) { struct mutate_row_cbdata *mr = mr_; + struct ovsdb_row *rw_row; + + /* Not trying to track the row diff here, because user transactions + * may attempt to add duplicates or remove elements that do not exist. 
*/ + ovsdb_txn_row_modify(mr->txn, row, &rw_row, NULL); mr->n_matches++; - *mr->error = ovsdb_mutation_set_execute(ovsdb_txn_row_modify(mr->txn, row), - mr->mutations); + *mr->error = ovsdb_mutation_set_execute(rw_row, mr->mutations); return *mr->error == NULL; } diff --git a/ovsdb/file.c b/ovsdb/file.c index 8bd1d4af30f..77a89fd1a46 100644 --- a/ovsdb/file.c +++ b/ovsdb/file.c @@ -80,8 +80,8 @@ ovsdb_file_column_diff_disable(void) } static struct ovsdb_error * -ovsdb_file_update_row_from_json(struct ovsdb_row *row, bool converting, - bool row_contains_diff, +ovsdb_file_update_row_from_json(struct ovsdb_row *row, struct ovsdb_row *diff, + bool converting, bool row_contains_diff, const struct json *json) { struct ovsdb_table_schema *schema = row->table->schema; @@ -122,6 +122,12 @@ ovsdb_file_update_row_from_json(struct ovsdb_row *row, bool converting, error = ovsdb_datum_apply_diff_in_place( &row->fields[column->index], &datum, &column->type); + if (!error && diff) { + ovs_assert(ovsdb_datum_is_default(&diff->fields[column->index], + &column->type)); + ovsdb_datum_swap(&diff->fields[column->index], &datum); + } + ovsdb_datum_destroy(&datum, &column->type); if (error) { return error; @@ -150,16 +156,20 @@ ovsdb_file_txn_row_from_json(struct ovsdb_txn *txn, struct ovsdb_table *table, ovsdb_txn_row_delete(txn, row); return NULL; } else if (row) { - return ovsdb_file_update_row_from_json(ovsdb_txn_row_modify(txn, row), - converting, row_contains_diff, - json); + struct ovsdb_row *new, *diff = NULL; + + ovsdb_txn_row_modify(txn, row, &new, + row_contains_diff ? 
&diff : NULL); + return ovsdb_file_update_row_from_json(new, diff, converting, + row_contains_diff, json); } else { struct ovsdb_error *error; struct ovsdb_row *new; new = ovsdb_row_create(table); *ovsdb_row_get_uuid_rw(new) = *row_uuid; - error = ovsdb_file_update_row_from_json(new, converting, false, json); + error = ovsdb_file_update_row_from_json(new, NULL, converting, + false, json); if (error) { ovsdb_row_destroy(new); } else { diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c index 4d29043f4f6..dbf85fe3bb5 100644 --- a/ovsdb/ovsdb-server.c +++ b/ovsdb/ovsdb-server.c @@ -1111,7 +1111,7 @@ update_remote_row(const struct ovsdb_row *row, struct ovsdb_txn *txn, /* Bad remote spec or incorrect schema. */ return; } - rw_row = ovsdb_txn_row_modify(txn, row); + ovsdb_txn_row_modify(txn, row, &rw_row, NULL); ovsdb_jsonrpc_server_get_remote_status(jsonrpc, target, &status); /* Update status information columns. */ @@ -1301,7 +1301,10 @@ update_server_status(struct shash *all_dbs) if (!db || !db->db) { ovsdb_txn_row_delete(txn, row); } else { - update_database_status(ovsdb_txn_row_modify(txn, row), db); + struct ovsdb_row *rw_row; + + ovsdb_txn_row_modify(txn, row, &rw_row, NULL); + update_database_status(rw_row, db); } } diff --git a/ovsdb/table.c b/ovsdb/table.c index 0792e1580e6..3e89ddd44a0 100644 --- a/ovsdb/table.c +++ b/ovsdb/table.c @@ -415,8 +415,10 @@ ovsdb_table_execute_update(struct ovsdb_txn *txn, const struct uuid *row_uuid, NULL, &columns, xor); if (!error && (xor || !ovsdb_row_equal_columns(row, update, &columns))) { - error = ovsdb_row_update_columns(ovsdb_txn_row_modify(txn, row), - update, &columns, xor); + struct ovsdb_row *rw_row; + + ovsdb_txn_row_modify(txn, row, &rw_row, NULL); + error = ovsdb_row_update_columns(rw_row, update, &columns, xor); } ovsdb_column_set_destroy(&columns); diff --git a/ovsdb/transaction.c b/ovsdb/transaction.c index a482588a0b8..60c4e2acb11 100644 --- a/ovsdb/transaction.c +++ b/ovsdb/transaction.c @@ -72,6 +72,8 
@@ struct ovsdb_txn_table { * 'new'. * * - A row modified by a transaction will have non-null 'old' and 'new'. + * It may have non-null 'diff' as well in this case, but it is not + * necessary. * * - 'old' and 'new' both null indicates that a row was added then deleted * within a single transaction. Most of the time we instead delete the @@ -83,6 +85,7 @@ struct ovsdb_txn_row { struct hmap_node hmap_node; /* In ovsdb_txn_table's txn_rows hmap. */ struct ovsdb_row *old; /* The old row. */ struct ovsdb_row *new; /* The new row. */ + struct ovsdb_row *diff; /* The difference between old and new. */ size_t n_refs; /* Number of remaining references. */ /* These members are the same as the corresponding members of 'old' or @@ -155,6 +158,7 @@ ovsdb_txn_row_abort(struct ovsdb_txn *txn OVS_UNUSED, { struct ovsdb_row *old = txn_row->old; struct ovsdb_row *new = txn_row->new; + struct ovsdb_row *diff = txn_row->diff; ovsdb_txn_row_prefree(txn_row); if (!old) { @@ -184,6 +188,7 @@ ovsdb_txn_row_abort(struct ovsdb_txn *txn OVS_UNUSED, } ovsdb_row_destroy(new); + ovsdb_row_destroy(diff); free(txn_row); return NULL; @@ -250,7 +255,10 @@ find_or_make_txn_row(struct ovsdb_txn *txn, const struct ovsdb_table *table, if (!txn_row) { const struct ovsdb_row *row = ovsdb_table_get_row(table, uuid); if (row) { - txn_row = ovsdb_txn_row_modify(txn, row)->txn_row; + struct ovsdb_row *rw_row; + + ovsdb_txn_row_modify(txn, row, &rw_row, NULL); + txn_row = rw_row->txn_row; } } return txn_row; @@ -433,6 +441,9 @@ delete_garbage_row(struct ovsdb_txn *txn, struct ovsdb_txn_row *txn_row) return NULL; } + ovsdb_row_destroy(txn_row->diff); + txn_row->diff = NULL; + row = txn_row->new; txn_row->new = NULL; hmap_remove(&txn_row->table->rows, &row->hmap_node); @@ -544,6 +555,7 @@ ovsdb_txn_row_commit(struct ovsdb_txn *txn OVS_UNUSED, txn_row->new->n_refs = txn_row->n_refs; } ovsdb_row_destroy(txn_row->old); + ovsdb_row_destroy(txn_row->diff); free(txn_row); return NULL; @@ -1178,6 +1190,7 @@ 
ovsdb_txn_destroy_cloned(struct ovsdb_txn *txn) if (r->new) { ovsdb_row_destroy(r->new); } + ovs_assert(!r->diff); hmap_remove(&t->txn_rows, &r->hmap_node); free(r); } @@ -1439,7 +1452,8 @@ ovsdb_txn_create_txn_table(struct ovsdb_txn *txn, struct ovsdb_table *table) static struct ovsdb_txn_row * ovsdb_txn_row_create(struct ovsdb_txn *txn, struct ovsdb_table *table, - const struct ovsdb_row *old_, struct ovsdb_row *new) + const struct ovsdb_row *old_, struct ovsdb_row *new, + struct ovsdb_row *diff) { const struct ovsdb_row *row = old_ ? old_ : new; struct ovsdb_row *old = CONST_CAST(struct ovsdb_row *, old_); @@ -1453,6 +1467,7 @@ ovsdb_txn_row_create(struct ovsdb_txn *txn, struct ovsdb_table *table, txn_row->table = row->table; txn_row->old = old; txn_row->new = new; + txn_row->diff = diff; txn_row->n_refs = old ? old->n_refs : 0; txn_row->serial = serial - 1; @@ -1465,6 +1480,9 @@ ovsdb_txn_row_create(struct ovsdb_txn *txn, struct ovsdb_table *table, if (new) { new->txn_row = txn_row; } + if (diff) { + diff->txn_row = txn_row; + } txn_table = ovsdb_txn_create_txn_table(txn, table); hmap_insert(&txn_table->txn_rows, &txn_row->hmap_node, @@ -1473,24 +1491,38 @@ ovsdb_txn_row_create(struct ovsdb_txn *txn, struct ovsdb_table *table, return txn_row; } -struct ovsdb_row * -ovsdb_txn_row_modify(struct ovsdb_txn *txn, const struct ovsdb_row *ro_row_) +void +ovsdb_txn_row_modify(struct ovsdb_txn *txn, const struct ovsdb_row *ro_row_, + struct ovsdb_row **rw_row, struct ovsdb_row **diff) { struct ovsdb_row *ro_row = CONST_CAST(struct ovsdb_row *, ro_row_); + ovs_assert(ro_row); + ovs_assert(rw_row); + if (ro_row->txn_row) { ovs_assert(ro_row == ro_row->txn_row->new); - return ro_row; + *rw_row = ro_row; + if (diff) { + *diff = ro_row->txn_row->diff; + } else { + /* Caller will modify the row without updating the diff. + * Destroy the existing diff, if any, so it will not be + * used for this row anymore. Modification will always + * return NULL from this point on. 
*/ + ovsdb_row_destroy(ro_row->txn_row->diff); + ro_row->txn_row->diff = NULL; + } } else { struct ovsdb_table *table = ro_row->table; - struct ovsdb_row *rw_row; - rw_row = ovsdb_row_clone(ro_row); - rw_row->n_refs = ro_row->n_refs; - ovsdb_txn_row_create(txn, table, ro_row, rw_row); - hmap_replace(&table->rows, &ro_row->hmap_node, &rw_row->hmap_node); - - return rw_row; + *rw_row = ovsdb_row_clone(ro_row); + (*rw_row)->n_refs = ro_row->n_refs; + if (diff) { + *diff = ovsdb_row_create(table); + } + ovsdb_txn_row_create(txn, table, ro_row, *rw_row, diff ? *diff : NULL); + hmap_replace(&table->rows, &ro_row->hmap_node, &(*rw_row)->hmap_node); } } @@ -1502,7 +1534,7 @@ ovsdb_txn_row_insert(struct ovsdb_txn *txn, struct ovsdb_row *row) uuid_generate(ovsdb_row_get_version_rw(row)); - ovsdb_txn_row_create(txn, table, NULL, row); + ovsdb_txn_row_create(txn, table, NULL, row, NULL); hmap_insert(&table->rows, &row->hmap_node, hash); } @@ -1518,9 +1550,11 @@ ovsdb_txn_row_delete(struct ovsdb_txn *txn, const struct ovsdb_row *row_) hmap_remove(&table->rows, &row->hmap_node); if (!txn_row) { - ovsdb_txn_row_create(txn, table, row, NULL); + ovsdb_txn_row_create(txn, table, row, NULL, NULL); } else { ovs_assert(txn_row->new == row); + ovsdb_row_destroy(txn_row->diff); + txn_row->diff = NULL; if (txn_row->old) { txn_row->new = NULL; } else { diff --git a/ovsdb/transaction.h b/ovsdb/transaction.h index 0e054eef3bd..f659838dc81 100644 --- a/ovsdb/transaction.h +++ b/ovsdb/transaction.h @@ -21,6 +21,7 @@ struct json; struct ovsdb; +struct ovsdb_row; struct ovsdb_schema; struct ovsdb_table; struct uuid; @@ -50,8 +51,9 @@ const struct ovsdb_error *ovsdb_txn_progress_get_error( const struct ovsdb_txn_progress *); void ovsdb_txn_progress_destroy(struct ovsdb_txn_progress *); -struct ovsdb_row *ovsdb_txn_row_modify(struct ovsdb_txn *, - const struct ovsdb_row *); +void ovsdb_txn_row_modify(struct ovsdb_txn *, const struct ovsdb_row *, + struct ovsdb_row **row_new, + struct ovsdb_row 
**row_diff); void ovsdb_txn_row_insert(struct ovsdb_txn *, struct ovsdb_row *); void ovsdb_txn_row_delete(struct ovsdb_txn *, const struct ovsdb_row *); diff --git a/tests/test-ovsdb.c b/tests/test-ovsdb.c index d6a47de336e..c4ab899d459 100644 --- a/tests/test-ovsdb.c +++ b/tests/test-ovsdb.c @@ -1798,7 +1798,7 @@ do_transact_modify(struct ovs_cmdl_context *ctx) struct ovsdb_row *row_rw; row_ro = do_transact_find_row(ctx->argv[1]); - row_rw = ovsdb_txn_row_modify(do_transact_txn, row_ro); + ovsdb_txn_row_modify(do_transact_txn, row_ro, &row_rw, NULL); do_transact_set_i_j(row_rw, ctx->argv[2], ctx->argv[3]); } From 60457a5e9ddc33809139e91b08634eacd766abb2 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 9 Jan 2024 20:54:04 +0100 Subject: [PATCH 510/833] ovsdb: transaction: Calculate added/removed from diff. In case the difference between 'old' and 'new' rows is readily available, it can be used to construct added/removed datums instead. Diffs are typically much smaller than the column itself. This change more than doubles the performance of a transaction replay. For example, with this change applied, initial read of OVSDB file containing 136K small transactions for large OVN port groups and address sets on my laptop takes 11 seconds vs 24 seconds without. Acked-by: Mike Pattrick Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- lib/ovsdb-data.c | 28 ++++++++++++++++++++++++++++ lib/ovsdb-data.h | 1 + ovsdb/transaction.c | 15 ++++++++++++--- 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/lib/ovsdb-data.c b/lib/ovsdb-data.c index f18f74298f9..abb923ad8fa 100644 --- a/lib/ovsdb-data.c +++ b/lib/ovsdb-data.c @@ -2238,6 +2238,8 @@ ovsdb_symbol_table_insert(struct ovsdb_symbol_table *symtab, /* APIs for Generating and apply diffs. */ /* Find what needs to be added to and removed from 'old' to construct 'new'. 
+ * If the optional 'diff' is provided, it can be used to speed up processing, + * in case it is smaller than the original 'old' and 'new'. * * The 'added' and 'removed' datums are always safe; the orders of keys are * maintained since they are added in order. */ @@ -2246,6 +2248,7 @@ ovsdb_datum_added_removed(struct ovsdb_datum *added, struct ovsdb_datum *removed, const struct ovsdb_datum *old, const struct ovsdb_datum *new, + const struct ovsdb_datum *diff, const struct ovsdb_type *type) { size_t oi, ni; @@ -2258,6 +2261,31 @@ ovsdb_datum_added_removed(struct ovsdb_datum *added, return; } + /* Use diff, if provided, unless it's comparable in size. With a large + * diff, the O(n log n) binary search of each element may be slower than + * a simple O(n) comparison between old and new. */ + if (diff && diff->n * 2 < old->n + new->n) { + unsigned int idx; + + for (size_t di = 0; di < diff->n; di++) { + bool found = ovsdb_datum_find_key(old, &diff->keys[di], + type->key.type, &idx); + + if (!found) { + ovsdb_datum_add_from_index_unsafe(added, diff, di, type); + } else { + if (type->value.type != OVSDB_TYPE_VOID + && !ovsdb_atom_equals(&diff->values[di], + &old->values[idx], + type->value.type)) { + ovsdb_datum_add_from_index_unsafe(added, diff, di, type); + } + ovsdb_datum_add_from_index_unsafe(removed, old, idx, type); + } + } + return; + } + /* Generate the diff in O(n) time. 
*/ for (oi = ni = 0; oi < old->n && ni < new->n;) { int c = ovsdb_atom_compare_3way(&old->keys[oi], &new->keys[ni], diff --git a/lib/ovsdb-data.h b/lib/ovsdb-data.h index f048a8cb03d..c0408ee49ca 100644 --- a/lib/ovsdb-data.h +++ b/lib/ovsdb-data.h @@ -256,6 +256,7 @@ void ovsdb_datum_added_removed(struct ovsdb_datum *added, struct ovsdb_datum *removed, const struct ovsdb_datum *old, const struct ovsdb_datum *new, + const struct ovsdb_datum *diff, const struct ovsdb_type *type); void ovsdb_datum_diff(struct ovsdb_datum *diff, diff --git a/ovsdb/transaction.c b/ovsdb/transaction.c index 60c4e2acb11..484a88e1cc2 100644 --- a/ovsdb/transaction.c +++ b/ovsdb/transaction.c @@ -347,12 +347,13 @@ update_row_ref_count(struct ovsdb_txn *txn, struct ovsdb_txn_row *r) return error; } } else if (r->old && r->new) { - struct ovsdb_datum added, removed; + struct ovsdb_datum added, removed, *diff; + diff = r->diff ? &r->diff->fields[column->index] : NULL; ovsdb_datum_added_removed(&added, &removed, &r->old->fields[column->index], &r->new->fields[column->index], - &column->type); + diff, &column->type); error = ovsdb_txn_adjust_row_refs( txn, r->old, column, &removed, -1); @@ -762,9 +763,13 @@ assess_weak_refs(struct ovsdb_txn *txn, struct ovsdb_txn_row *txn_row) if (datum->n != orig_n || bitmap_is_set(txn_row->changed, column->index)) { if (txn_row->old) { + struct ovsdb_datum *diff; + + diff = txn_row->diff && datum->n == orig_n + ? &txn_row->diff->fields[column->index] : NULL; ovsdb_datum_added_removed(&added, &removed, &txn_row->old->fields[column->index], - datum, &column->type); + datum, diff, &column->type); } else { ovsdb_datum_clone(&added, datum); } @@ -792,6 +797,10 @@ assess_weak_refs(struct ovsdb_txn *txn, struct ovsdb_txn_row *txn_row) if (datum->n != orig_n) { bitmap_set1(txn_row->changed, column->index); + /* Can no longer rely on the previous diff. 
*/ + ovsdb_row_destroy(txn_row->diff); + txn_row->diff = NULL; + if (datum->n < column->type.n_min) { const struct uuid *row_uuid = ovsdb_row_get_uuid(txn_row->new); if (zero && !txn_row->old) { From 92ff2a2017c9031b93a994355c8482ce9ad51255 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Wed, 10 Jan 2024 12:25:55 +0100 Subject: [PATCH 511/833] ofproto-dpif-upcall: Change flow dump duration message to WARN level. Currently the 'Spent an unreasonably long Xms dumping flows' message is set to the INFO level. However, based on this, we are also drastically limiting the number of flows in the datapath, and this would warrant a WARNING level. Acked-by: Simon Horman Signed-off-by: Eelco Chaudron Acked-by: Ilya Maximets Signed-off-by: Aaron Conole --- ofproto/ofproto-dpif-upcall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c index cc10f57b5e6..cd71e3ee36d 100644 --- a/ofproto/ofproto-dpif-upcall.c +++ b/ofproto/ofproto-dpif-upcall.c @@ -1049,7 +1049,7 @@ udpif_revalidator(void *arg) atomic_store_relaxed(&udpif->flow_limit, flow_limit); if (duration > 2000) { - VLOG_INFO("Spent an unreasonably long %lldms dumping flows", + VLOG_WARN("Spent an unreasonably long %lldms dumping flows", duration); } From 969b3a5706378f4cb98702cfd875b2fac260fbaa Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Wed, 10 Jan 2024 12:25:56 +0100 Subject: [PATCH 512/833] ofproto-dpif-upcall: Add flow_limit coverage counters. Add new coverage counters that might help debugging flow_limit related issues. 
Signed-off-by: Eelco Chaudron Acked-by: Ilya Maximets Acked-by: Simon Horman Signed-off-by: Aaron Conole --- ofproto/ofproto-dpif-upcall.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c index cd71e3ee36d..b5cbeed8780 100644 --- a/ofproto/ofproto-dpif-upcall.c +++ b/ofproto/ofproto-dpif-upcall.c @@ -59,8 +59,11 @@ COVERAGE_DEFINE(handler_duplicate_upcall); COVERAGE_DEFINE(revalidate_missed_dp_flow); COVERAGE_DEFINE(ukey_dp_change); COVERAGE_DEFINE(ukey_invalid_stat_reset); +COVERAGE_DEFINE(upcall_flow_limit_grew); COVERAGE_DEFINE(upcall_flow_limit_hit); COVERAGE_DEFINE(upcall_flow_limit_kill); +COVERAGE_DEFINE(upcall_flow_limit_reduced); +COVERAGE_DEFINE(upcall_flow_limit_scaled); COVERAGE_DEFINE(upcall_ukey_contention); COVERAGE_DEFINE(upcall_ukey_replace); @@ -1039,11 +1042,14 @@ udpif_revalidator(void *arg) udpif->dump_duration = duration; if (duration > 2000) { flow_limit /= duration / 1000; + COVERAGE_INC(upcall_flow_limit_scaled); } else if (duration > 1300) { flow_limit = flow_limit * 3 / 4; + COVERAGE_INC(upcall_flow_limit_reduced); } else if (duration < 1000 && flow_limit < n_flows * 1000 / duration) { flow_limit += 1000; + COVERAGE_INC(upcall_flow_limit_grew); } flow_limit = MIN(ofproto_flow_limit, MAX(flow_limit, 1000)); atomic_store_relaxed(&udpif->flow_limit, flow_limit); From 05d6f419cd91c1dc6f676309479cde0c3b35e542 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Wed, 10 Jan 2024 12:25:57 +0100 Subject: [PATCH 513/833] timeval: Add coverage counter for long poll interval events. Martin Kennelly observes that even though this data is available to humans via the journal/log files, these aren't exactly easy for a developer to make any kind of behavioral inferences. This kind of log and counter would be useful when checking on system health to let us know that an Open vSwitch component is noticing some kind of system level hiccup. 
Add a new coverage counter to track information on these events, and let a developer or system engineer know how long these events have occurred with some historical context. Reported-at: https://lists.linuxfoundation.org/pipermail/ovs-discuss/2023-June/052523.html Suggested-by: Martin Kennelly Co-Authored-By: Aaron Conole Signed-off-by: Eelco Chaudron Acked-by: Simon Horman Acked-by: Ilya Maximets Signed-off-by: Aaron Conole --- lib/timeval.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/timeval.c b/lib/timeval.c index 193c7bab178..0abe7e555aa 100644 --- a/lib/timeval.c +++ b/lib/timeval.c @@ -41,6 +41,8 @@ VLOG_DEFINE_THIS_MODULE(timeval); +COVERAGE_DEFINE(long_poll_interval); + #if !defined(HAVE_CLOCK_GETTIME) typedef unsigned int clockid_t; static int clock_gettime(clock_t id, struct timespec *ts); @@ -644,6 +646,8 @@ log_poll_interval(long long int last_wakeup) const struct rusage *last_rusage = get_recent_rusage(); struct rusage rusage; + COVERAGE_INC(long_poll_interval); + if (!getrusage_thread(&rusage)) { VLOG_WARN("Unreasonably long %lldms poll interval" " (%lldms user, %lldms system)", From ea43621745f985e7b440c87c41796e59b67500c3 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 9 Jan 2024 23:49:01 +0100 Subject: [PATCH 514/833] ovsdb: Allow database itself to be read-only. Currently, the read-only option can be set on connections or JSON-RPC server as a whole. However, there is no way to allow modifications in one database, but not in the other. Adding an internal read-only flag for a database itself. Will be used later for running active and backup databases in a single process. Marking the _Server database as read only is not necessary, because modifications of internal databases are not allowed anyway, but it doesn't hurt. 
Acked-by: Mike Pattrick Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- ovsdb/jsonrpc-server.c | 3 ++- ovsdb/ovsdb-server.c | 3 +++ ovsdb/ovsdb.c | 2 ++ ovsdb/ovsdb.h | 3 +++ 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/ovsdb/jsonrpc-server.c b/ovsdb/jsonrpc-server.c index 45f7c8038c2..4ea4c7a4bf2 100644 --- a/ovsdb/jsonrpc-server.c +++ b/ovsdb/jsonrpc-server.c @@ -1148,7 +1148,8 @@ ovsdb_jsonrpc_trigger_create(struct ovsdb_jsonrpc_session *s, struct ovsdb *db, /* Insert into trigger table. */ t = xmalloc(sizeof *t); bool disconnect_all = ovsdb_trigger_init( - &s->up, db, &t->trigger, request, time_msec(), s->read_only, + &s->up, db, &t->trigger, request, time_msec(), + s->read_only || db->read_only, s->remote->role, jsonrpc_session_get_id(s->js)); t->id = json_clone(request->id); hmap_insert(&s->triggers, &t->hmap_node, hash); diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c index dbf85fe3bb5..dfc94db42de 100644 --- a/ovsdb/ovsdb-server.c +++ b/ovsdb/ovsdb-server.c @@ -831,8 +831,11 @@ add_server_db(struct server_config *config) db->filename = xstrdup(""); db->db = ovsdb_create(schema, ovsdb_storage_create_unbacked(NULL)); + db->db->read_only = true; + bool ok OVS_UNUSED = ovsdb_jsonrpc_server_add_db(config->jsonrpc, db->db); ovs_assert(ok); + add_db(config, db); } diff --git a/ovsdb/ovsdb.c b/ovsdb/ovsdb.c index f67b836d736..298616a64d0 100644 --- a/ovsdb/ovsdb.c +++ b/ovsdb/ovsdb.c @@ -464,6 +464,8 @@ ovsdb_create(struct ovsdb_schema *schema, struct ovsdb_storage *storage) db->n_atoms = 0; + db->read_only = false; + db->is_relay = false; ovs_list_init(&db->txn_forward_new); hmap_init(&db->txn_forward_sent); diff --git a/ovsdb/ovsdb.h b/ovsdb/ovsdb.h index d45630e8f0f..325900bc6d3 100644 --- a/ovsdb/ovsdb.h +++ b/ovsdb/ovsdb.h @@ -114,6 +114,9 @@ struct ovsdb { size_t n_atoms; /* Total number of ovsdb atoms in the database. */ + bool read_only; /* If 'true', JSON-RPC clients are not allowed to change + * the data. 
*/ + /* Relay mode. */ bool is_relay; /* True, if database is in relay mode. */ /* List that holds transactions waiting to be forwarded to the server. */ From da1a4f6994c692fe10016ba25ee48f60d02bf7b0 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 9 Jan 2024 23:49:02 +0100 Subject: [PATCH 515/833] jsonrpc-server: Add functions to convert jsonrpc options to/from json. These functions will be needed when we'll need to load/save configuration of each OVSDB remote separately. The parsing function is written in a way that it updates the provided options and doesn't create a new structure. This is done in order for different callers to have their own default values and only update them with what is provided by the user explicitly. For example, replication and relay have different default probe intervals. Acked-by: Mike Pattrick Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- ovsdb/jsonrpc-server.c | 67 ++++++++++++++++++++++++++++++++++++++++++ ovsdb/jsonrpc-server.h | 6 ++++ 2 files changed, 73 insertions(+) diff --git a/ovsdb/jsonrpc-server.c b/ovsdb/jsonrpc-server.c index 4ea4c7a4bf2..02daf1d8886 100644 --- a/ovsdb/jsonrpc-server.c +++ b/ovsdb/jsonrpc-server.c @@ -219,6 +219,73 @@ ovsdb_jsonrpc_default_options(const char *target) return options; } +struct json * +ovsdb_jsonrpc_options_to_json(const struct ovsdb_jsonrpc_options *options) +{ + struct json *json = json_object_create(); + + json_object_put(json, "max-backoff", + json_integer_create(options->max_backoff)); + json_object_put(json, "inactivity-probe", + json_integer_create(options->probe_interval)); + json_object_put(json, "read-only", + json_boolean_create(options->read_only)); + json_object_put(json, "dscp", json_integer_create(options->dscp)); + if (options->role) { + json_object_put(json, "role", json_string_create(options->role)); + } + + return json; +} + +void +ovsdb_jsonrpc_options_update_from_json(struct ovsdb_jsonrpc_options *options, + const struct json *json) +{ + const struct 
json *max_backoff, *probe_interval, *read_only, *dscp, *role; + struct ovsdb_parser parser; + struct ovsdb_error *error; + + ovsdb_parser_init(&parser, json, "JSON-RPC options"); + + max_backoff = ovsdb_parser_member(&parser, "max-backoff", + OP_INTEGER | OP_OPTIONAL); + if (max_backoff) { + options->max_backoff = json_integer(max_backoff); + } + + probe_interval = ovsdb_parser_member(&parser, "inactivity-probe", + OP_INTEGER | OP_OPTIONAL); + if (probe_interval) { + options->probe_interval = json_integer(probe_interval); + } + + read_only = ovsdb_parser_member(&parser, "read-only", + OP_BOOLEAN | OP_OPTIONAL); + if (read_only) { + options->read_only = json_boolean(read_only); + } + + dscp = ovsdb_parser_member(&parser, "dscp", OP_INTEGER | OP_OPTIONAL); + if (dscp) { + options->dscp = json_integer(dscp); + } + + role = ovsdb_parser_member(&parser, "role", OP_STRING | OP_OPTIONAL); + if (role) { + free(options->role); + options->role = nullable_xstrdup(json_string(role)); + } + + error = ovsdb_parser_finish(&parser); + if (error) { + char *s = ovsdb_error_to_string_free(error); + + VLOG_WARN("%s", s); + free(s); + } +} + /* Sets 'svr''s current set of remotes to the names in 'new_remotes', with * options in the struct ovsdb_jsonrpc_options supplied as the data values. 
* diff --git a/ovsdb/jsonrpc-server.h b/ovsdb/jsonrpc-server.h index e0653aa3974..9c49966c1ca 100644 --- a/ovsdb/jsonrpc-server.h +++ b/ovsdb/jsonrpc-server.h @@ -42,6 +42,12 @@ struct ovsdb_jsonrpc_options { struct ovsdb_jsonrpc_options * ovsdb_jsonrpc_default_options(const char *target); +struct json *ovsdb_jsonrpc_options_to_json( + const struct ovsdb_jsonrpc_options *) + OVS_WARN_UNUSED_RESULT; +void ovsdb_jsonrpc_options_update_from_json(struct ovsdb_jsonrpc_options *, + const struct json *); + void ovsdb_jsonrpc_server_set_remotes(struct ovsdb_jsonrpc_server *, const struct shash *); From 80414c3e1bd6a6a283b861aac4d644dd473eb542 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 9 Jan 2024 23:49:03 +0100 Subject: [PATCH 516/833] ovsdb: Track jsonrpc options per remote. Store JSON-RPC options for each remote separately, so it will be possible to have different configurations per remote in the future. These are also stored to and loaded from the temporary file that OVSDB is using to restore runtime configuration of the server restarted by the monitor process after a crash. 
Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- ovsdb/jsonrpc-server.c | 11 ++++ ovsdb/jsonrpc-server.h | 6 +- ovsdb/ovsdb-server.c | 143 ++++++++++++++++++++++++++++------------- 3 files changed, 114 insertions(+), 46 deletions(-) diff --git a/ovsdb/jsonrpc-server.c b/ovsdb/jsonrpc-server.c index 02daf1d8886..d5e9c311ca4 100644 --- a/ovsdb/jsonrpc-server.c +++ b/ovsdb/jsonrpc-server.c @@ -219,6 +219,17 @@ ovsdb_jsonrpc_default_options(const char *target) return options; } +struct ovsdb_jsonrpc_options * +ovsdb_jsonrpc_options_clone(const struct ovsdb_jsonrpc_options *options) +{ + struct ovsdb_jsonrpc_options *clone; + + clone = xmemdup(options, sizeof *options); + clone->role = nullable_xstrdup(options->role); + + return clone; +} + struct json * ovsdb_jsonrpc_options_to_json(const struct ovsdb_jsonrpc_options *options) { diff --git a/ovsdb/jsonrpc-server.h b/ovsdb/jsonrpc-server.h index 9c49966c1ca..39366ad7054 100644 --- a/ovsdb/jsonrpc-server.h +++ b/ovsdb/jsonrpc-server.h @@ -39,8 +39,10 @@ struct ovsdb_jsonrpc_options { int dscp; /* Dscp value for manager connections */ char *role; /* Role, for role-based access controls */ }; -struct ovsdb_jsonrpc_options * -ovsdb_jsonrpc_default_options(const char *target); +struct ovsdb_jsonrpc_options *ovsdb_jsonrpc_default_options( + const char *target); +struct ovsdb_jsonrpc_options *ovsdb_jsonrpc_options_clone( + const struct ovsdb_jsonrpc_options *); struct json *ovsdb_jsonrpc_options_to_json( const struct ovsdb_jsonrpc_options *) diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c index dfc94db42de..b0e368e8c72 100644 --- a/ovsdb/ovsdb-server.c +++ b/ovsdb/ovsdb-server.c @@ -101,7 +101,7 @@ static unixctl_cb_func ovsdb_server_get_sync_status; static unixctl_cb_func ovsdb_server_get_db_storage_status; struct server_config { - struct sset *remotes; + struct shash *remotes; struct shash *all_dbs; FILE *config_tmpfile; char **sync_from; @@ -130,29 +130,34 @@ static void remove_db(struct server_config *, 
struct shash_node *db, char *); static void close_db(struct server_config *, struct db *, char *); static void parse_options(int argc, char *argvp[], - struct sset *db_filenames, struct sset *remotes, + struct sset *db_filenames, struct shash *remotes, char **unixctl_pathp, char **run_command, char **sync_from, char **sync_exclude, bool *is_backup); OVS_NO_RETURN static void usage(void); +static struct ovsdb_jsonrpc_options *add_remote( + struct shash *remotes, const char *target, + const struct ovsdb_jsonrpc_options *); +static void free_remotes(struct shash *remotes); + static char *reconfigure_remotes(struct ovsdb_jsonrpc_server *, const struct shash *all_dbs, - struct sset *remotes); + struct shash *remotes); static char *reconfigure_ssl(const struct shash *all_dbs); static void report_error_if_changed(char *error, char **last_errorp); static void update_remote_status(const struct ovsdb_jsonrpc_server *jsonrpc, - const struct sset *remotes, + const struct shash *remotes, struct shash *all_dbs); static void update_server_status(struct shash *all_dbs); -static void save_config__(FILE *config_file, const struct sset *remotes, +static void save_config__(FILE *config_file, const struct shash *remotes, const struct sset *db_filenames, const char *sync_from, const char *sync_exclude, bool is_backup); static void save_config(struct server_config *); -static void load_config(FILE *config_file, struct sset *remotes, +static void load_config(FILE *config_file, struct shash *remotes, struct sset *db_filenames, char **sync_from, char **sync_exclude, bool *is_backup); @@ -184,7 +189,7 @@ log_and_free_error(struct ovsdb_error *error) static void main_loop(struct server_config *config, struct ovsdb_jsonrpc_server *jsonrpc, struct shash *all_dbs, - struct unixctl_server *unixctl, struct sset *remotes, + struct unixctl_server *unixctl, struct shash *remotes, struct process *run_process, bool *exiting, bool *is_backup) { char *remotes_error, *ssl_error; @@ -318,9 +323,6 @@ 
main(int argc, char *argv[]) char *run_command = NULL; struct unixctl_server *unixctl; struct ovsdb_jsonrpc_server *jsonrpc; - struct sset remotes, db_filenames; - char *sync_from, *sync_exclude; - bool is_backup; const char *db_filename; struct process *run_process; bool exiting; @@ -331,6 +333,10 @@ main(int argc, char *argv[]) struct shash_node *node; int replication_probe_interval = REPLICATION_DEFAULT_PROBE_INTERVAL; int relay_source_probe_interval = RELAY_SOURCE_DEFAULT_PROBE_INTERVAL; + struct sset db_filenames = SSET_INITIALIZER(&db_filenames); + struct shash remotes = SHASH_INITIALIZER(&remotes); + char *sync_from = NULL, *sync_exclude = NULL; + bool is_backup; ovs_cmdl_proctitle_init(argc, argv); set_program_name(argv[0]); @@ -514,7 +520,8 @@ main(int argc, char *argv[]) } ovsdb_jsonrpc_server_destroy(jsonrpc); shash_destroy(&all_dbs); - sset_destroy(&remotes); + free_remotes(&remotes); + shash_destroy(&remotes); sset_destroy(&db_filenames); free(sync_from); free(sync_exclude); @@ -971,13 +978,16 @@ query_db_string(const struct shash *all_dbs, const char *name, } static struct ovsdb_jsonrpc_options * -add_remote(struct shash *remotes, const char *target) +add_remote(struct shash *remotes, const char *target, + const struct ovsdb_jsonrpc_options *options_) { struct ovsdb_jsonrpc_options *options; options = shash_find_data(remotes, target); if (!options) { - options = ovsdb_jsonrpc_default_options(target); + options = options_ + ? 
ovsdb_jsonrpc_options_clone(options_) + : ovsdb_jsonrpc_default_options(target); shash_add(remotes, target, options); } @@ -994,7 +1004,7 @@ free_remotes(struct shash *remotes) struct ovsdb_jsonrpc_options *options = node->data; free(options->role); } - shash_destroy_free_data(remotes); + shash_clear_free_data(remotes); } } @@ -1015,7 +1025,7 @@ add_manager_options(struct shash *remotes, const struct ovsdb_row *row) return; } - options = add_remote(remotes, target); + options = add_remote(remotes, target, NULL); if (ovsdb_util_read_integer_column(row, "max_backoff", &max_backoff)) { options->max_backoff = max_backoff; } @@ -1075,7 +1085,7 @@ query_db_remotes(const char *name, const struct shash *all_dbs, datum = &row->fields[column->index]; for (i = 0; i < datum->n; i++) { - add_remote(remotes, json_string(datum->keys[i].s)); + add_remote(remotes, json_string(datum->keys[i].s), NULL); } } } else if (column->type.key.type == OVSDB_TYPE_UUID @@ -1223,19 +1233,24 @@ commit_txn(struct ovsdb_txn *txn, const char *name) static void update_remote_status(const struct ovsdb_jsonrpc_server *jsonrpc, - const struct sset *remotes, + const struct shash *remotes, struct shash *all_dbs) { - struct shash_node *node; - SHASH_FOR_EACH (node, all_dbs) { - struct db *db = node->data; + struct shash_node *db_node; + + SHASH_FOR_EACH (db_node, all_dbs) { + struct db *db = db_node->data; + if (!db->db || ovsdb_storage_is_clustered(db->db->storage)) { continue; } struct ovsdb_txn *txn = ovsdb_txn_create(db->db); - const char *remote; - SSET_FOR_EACH (remote, remotes) { + const struct shash_node *remote_node; + + SHASH_FOR_EACH (remote_node, remotes) { + const char *remote = remote_node->name; + update_remote_rows(all_dbs, db, remote, jsonrpc, txn); } commit_txn(txn, "remote status"); @@ -1345,23 +1360,27 @@ update_server_status(struct shash *all_dbs) /* Reconfigures ovsdb-server's remotes based on information in the database. 
*/ static char * reconfigure_remotes(struct ovsdb_jsonrpc_server *jsonrpc, - const struct shash *all_dbs, struct sset *remotes) + const struct shash *all_dbs, struct shash *remotes) { struct ds errors = DS_EMPTY_INITIALIZER; struct shash resolved_remotes; - const char *name; + struct shash_node *node; /* Configure remotes. */ shash_init(&resolved_remotes); - SSET_FOR_EACH (name, remotes) { + SHASH_FOR_EACH (node, remotes) { + const struct ovsdb_jsonrpc_options *options = node->data; + const char *name = node->name; + if (!strncmp(name, "db:", 3)) { query_db_remotes(name, all_dbs, &resolved_remotes, &errors); } else { - add_remote(&resolved_remotes, name); + add_remote(&resolved_remotes, name, options); } } ovsdb_jsonrpc_server_set_remotes(jsonrpc, &resolved_remotes); free_remotes(&resolved_remotes); + shash_destroy(&resolved_remotes); return errors.string; } @@ -1722,7 +1741,7 @@ ovsdb_server_add_remote(struct unixctl_conn *conn, int argc OVS_UNUSED, : parse_db_column(config->all_dbs, remote, &db, &table, &column)); if (!retval) { - if (sset_add(config->remotes, remote)) { + if (add_remote(config->remotes, remote, NULL)) { save_config(config); } unixctl_command_reply(conn, NULL); @@ -1739,11 +1758,12 @@ ovsdb_server_remove_remote(struct unixctl_conn *conn, int argc OVS_UNUSED, const char *argv[], void *config_) { struct server_config *config = config_; - struct sset_node *node; + struct ovsdb_jsonrpc_options *options; - node = sset_find(config->remotes, argv[1]); - if (node) { - sset_delete(config->remotes, node); + options = shash_find_and_delete(config->remotes, argv[1]); + if (options) { + free(options->role); + free(options); save_config(config); unixctl_command_reply(conn, NULL); } else { @@ -1756,15 +1776,15 @@ static void ovsdb_server_list_remotes(struct unixctl_conn *conn, int argc OVS_UNUSED, const char *argv[] OVS_UNUSED, void *remotes_) { - struct sset *remotes = remotes_; - const char **list, **p; + const struct shash *remotes = remotes_; + const struct 
shash_node **list; struct ds s; ds_init(&s); - list = sset_sort(remotes); - for (p = list; *p; p++) { - ds_put_format(&s, "%s\n", *p); + list = shash_sort(remotes); + for (size_t i = 0; i < shash_count(remotes); i++) { + ds_put_format(&s, "%s\n", list[i]->name); } free(list); @@ -1999,7 +2019,7 @@ ovsdb_server_get_db_storage_status(struct unixctl_conn *conn, static void parse_options(int argc, char *argv[], - struct sset *db_filenames, struct sset *remotes, + struct sset *db_filenames, struct shash *remotes, char **unixctl_pathp, char **run_command, char **sync_from, char **sync_exclude, bool *active) { @@ -2050,7 +2070,7 @@ parse_options(int argc, char *argv[], *sync_from = NULL; *sync_exclude = NULL; sset_init(db_filenames); - sset_init(remotes); + shash_init(remotes); for (;;) { int c; @@ -2061,7 +2081,7 @@ parse_options(int argc, char *argv[], switch (c) { case OPT_REMOTE: - sset_add(remotes, optarg); + add_remote(remotes, optarg, NULL); break; case OPT_UNIXCTL: @@ -2200,10 +2220,24 @@ sset_to_json(const struct sset *sset) return array; } +static struct json * +remotes_to_json(const struct shash *remotes) +{ + const struct shash_node *node; + struct json *json; + + json = json_object_create(); + SHASH_FOR_EACH (node, remotes) { + json_object_put(json, node->name, + ovsdb_jsonrpc_options_to_json(node->data)); + } + return json; +} + /* Truncates and replaces the contents of 'config_file' by a representation of * 'remotes' and 'db_filenames'. 
*/ static void -save_config__(FILE *config_file, const struct sset *remotes, +save_config__(FILE *config_file, const struct shash *remotes, const struct sset *db_filenames, const char *sync_from, const char *sync_exclude, bool is_backup) { @@ -2216,7 +2250,7 @@ save_config__(FILE *config_file, const struct sset *remotes, } obj = json_object_create(); - json_object_put(obj, "remotes", sset_to_json(remotes)); + json_object_put(obj, "remotes", remotes_to_json(remotes)); json_object_put(obj, "db_filenames", sset_to_json(db_filenames)); if (sync_from) { json_object_put(obj, "sync_from", json_string_create(sync_from)); @@ -2276,11 +2310,32 @@ sset_from_json(struct sset *sset, const struct json *array) } } +static void +remotes_from_json(struct shash *remotes, const struct json *json) +{ + struct ovsdb_jsonrpc_options *options; + const struct shash_node *node; + const struct shash *object; + + free_remotes(remotes); + + ovs_assert(json); + ovs_assert(json->type == JSON_OBJECT); + + object = json_object(json); + SHASH_FOR_EACH (node, object) { + options = ovsdb_jsonrpc_default_options(node->name); + ovsdb_jsonrpc_options_update_from_json(options, node->data); + shash_add(remotes, node->name, options); + } +} + /* Clears and replaces 'remotes' and 'dbnames' by a configuration read from * 'config_file', which must have been previously written by save_config(). 
*/ static void -load_config(FILE *config_file, struct sset *remotes, struct sset *db_filenames, - char **sync_from, char **sync_exclude, bool *is_backup) +load_config(FILE *config_file, struct shash *remotes, + struct sset *db_filenames, char **sync_from, + char **sync_exclude, bool *is_backup) { struct json *json; @@ -2293,7 +2348,7 @@ load_config(FILE *config_file, struct sset *remotes, struct sset *db_filenames, } ovs_assert(json->type == JSON_OBJECT); - sset_from_json(remotes, shash_find_data(json_object(json), "remotes")); + remotes_from_json(remotes, shash_find_data(json_object(json), "remotes")); sset_from_json(db_filenames, shash_find_data(json_object(json), "db_filenames")); From e56d3024229974c0c5adf248556f3de3f944f3f7 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 9 Jan 2024 23:49:04 +0100 Subject: [PATCH 517/833] ovsdb: Extract relay string parsing into a separate function. Small refactoring so we can re-use this function in later commits. Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- ovsdb/ovsdb-server.c | 45 +++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c index b0e368e8c72..135b8db4ef4 100644 --- a/ovsdb/ovsdb-server.c +++ b/ovsdb/ovsdb-server.c @@ -316,6 +316,34 @@ main_loop(struct server_config *config, free(remotes_error); } +/* Parsing the relay in format 'relay:DB_NAME:<list of remotes>'. + * On success, returns 'true', 'name' is set to DB_NAME, 'remotes' to + * '<list of remotes>'. Caller is responsible of freeing 'name' and + * 'remotes'. On failure, returns 'false'. 
*/ +static bool +parse_relay_args(const char *arg, char **name, char **remote) +{ + const char *relay_prefix = "relay:"; + const int relay_prefix_len = strlen(relay_prefix); + bool is_relay; + + is_relay = !strncmp(arg, relay_prefix, relay_prefix_len); + if (!is_relay) { + return false; + } + + *remote = strchr(arg + relay_prefix_len, ':'); + + if (!*remote || (*remote)[0] == '\0') { + *remote = NULL; + return false; + } + arg += relay_prefix_len; + *name = xmemdup0(arg, *remote - arg); + *remote = xstrdup(*remote + 1); + return true; +} + int main(int argc, char *argv[]) { @@ -733,15 +761,13 @@ add_db(struct server_config *config, struct db *db) static struct ovsdb_error * OVS_WARN_UNUSED_RESULT open_db(struct server_config *config, const char *filename) { - const char *relay_prefix = "relay:"; - const char *relay_remotes = NULL; - const int relay_prefix_len = strlen(relay_prefix); struct ovsdb_storage *storage; + char *relay_remotes = NULL; struct ovsdb_error *error; bool is_relay; char *name; - is_relay = !strncmp(filename, relay_prefix, relay_prefix_len); + is_relay = parse_relay_args(filename, &name, &relay_remotes); if (!is_relay) { /* If we know that the file is already open, return a good error * message. Otherwise, if the file is open, we'll fail later on with @@ -756,15 +782,7 @@ open_db(struct server_config *config, const char *filename) } name = xstrdup(filename); } else { - /* Parsing the relay in format 'relay:DB_NAME:<list of remotes>'*/ - relay_remotes = strchr(filename + relay_prefix_len, ':'); - - if (!relay_remotes || relay_remotes[0] == '\0') { - return ovsdb_error(NULL, "%s: invalid syntax", filename); - } - name = xmemdup0(filename, relay_remotes - filename); - storage = ovsdb_storage_create_unbacked(name + relay_prefix_len); - relay_remotes++; /* Skip the ':'. 
*/ + storage = ovsdb_storage_create_unbacked(name); } struct ovsdb_schema *schema; @@ -814,6 +832,7 @@ open_db(struct server_config *config, const char *filename) if (is_relay) { ovsdb_relay_add_db(db->db, relay_remotes, update_schema, config, *config->relay_source_probe_interval); + free(relay_remotes); } return NULL; } From 3ff980c854387a1c51ed7a699eb3258e25bfd547 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 9 Jan 2024 23:49:05 +0100 Subject: [PATCH 518/833] ovsdb: replication: Isolate databases from each other. Refactoring of the replication code, so each database is handled separately from each other. Supposed to work the same way as before with the only difference that each backup database will have its own connection to the source and will have its own state machine. From the user's perspective, the only visible difference is that ovsdb-server/sync-status appctl now shows the status of each database separately. If one of the connections is permanently broken, all the databases will be switched to active. This is done in order to preserve the old behavior where we had only one connection. 
Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- include/openvswitch/json.h | 1 + lib/json.c | 15 + ovsdb/ovsdb-server.c | 74 +++-- ovsdb/replication.c | 660 +++++++++++++++++-------------------- ovsdb/replication.h | 35 +- 5 files changed, 383 insertions(+), 402 deletions(-) diff --git a/include/openvswitch/json.h b/include/openvswitch/json.h index 35b403c29bd..eb92c6a9186 100644 --- a/include/openvswitch/json.h +++ b/include/openvswitch/json.h @@ -91,6 +91,7 @@ struct json *json_array_create(struct json **, size_t n); struct json *json_array_create_1(struct json *); struct json *json_array_create_2(struct json *, struct json *); struct json *json_array_create_3(struct json *, struct json *, struct json *); +bool json_array_contains_string(const struct json *, const char *); struct json *json_object_create(void); void json_object_put(struct json *, const char *name, struct json *value); diff --git a/lib/json.c b/lib/json.c index aded8bb0159..9411eeda7cc 100644 --- a/lib/json.c +++ b/lib/json.c @@ -257,6 +257,21 @@ json_array_create_3(struct json *elem0, struct json *elem1, struct json *elem2) return json_array_create(elems, 3); } +bool +json_array_contains_string(const struct json *json, const char *str) +{ + ovs_assert(json->type == JSON_ARRAY); + + for (size_t i = 0; i < json->array.n; i++) { + const struct json *elem = json->array.elems[i]; + + if (elem->type == JSON_STRING && !strcmp(json_string(elem), str)) { + return true; + } + } + return false; +} + struct json * json_object_create(void) { diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c index 135b8db4ef4..404abf1e559 100644 --- a/ovsdb/ovsdb-server.c +++ b/ovsdb/ovsdb-server.c @@ -166,12 +166,12 @@ ovsdb_replication_init(const char *sync_from, const char *exclude, struct shash *all_dbs, const struct uuid *server_uuid, int probe_interval) { - replication_init(sync_from, exclude, server_uuid, probe_interval); struct shash_node *node; SHASH_FOR_EACH (node, all_dbs) { struct db *db = 
node->data; if (node->name[0] != '_' && db->db) { - replication_add_local_db(node->name, db->db); + replication_set_db(db->db, sync_from, exclude, + server_uuid, probe_interval); } } } @@ -228,11 +228,20 @@ main_loop(struct server_config *config, report_error_if_changed(reconfigure_ssl(all_dbs), &ssl_error); ovsdb_jsonrpc_server_run(jsonrpc); + replication_run(); if (*is_backup) { - replication_run(); - if (!replication_is_alive()) { - disconnect_active_server(); - *is_backup = false; + SHASH_FOR_EACH (node, all_dbs) { + struct db *db = node->data; + if (db->db->name[0] != '_' && !replication_is_alive(db->db)) { + *is_backup = false; + break; + } + } + if (!*is_backup) { + SHASH_FOR_EACH (node, all_dbs) { + struct db *db = node->data; + replication_remove_db(db->db); + } } } @@ -283,10 +292,8 @@ main_loop(struct server_config *config, update_server_status(all_dbs); memory_wait(); - if (*is_backup) { - replication_wait(); - } + replication_wait(); ovsdb_relay_wait(); ovsdb_jsonrpc_server_wait(jsonrpc); @@ -518,7 +525,7 @@ main(int argc, char *argv[]) &server_config); unixctl_command_register("ovsdb-server/get-sync-exclude-tables", "", 0, 0, ovsdb_server_get_sync_exclude_tables, - NULL); + &server_config); unixctl_command_register("ovsdb-server/sync-status", "", 0, 0, ovsdb_server_get_sync_status, &server_config); @@ -607,6 +614,9 @@ close_db(struct server_config *config, struct db *db, char *comment) if (db->db->is_relay) { ovsdb_relay_del_db(db->db); } + if (*config->is_backup) { + replication_remove_db(db->db); + } ovsdb_destroy(db->db); free(db->filename); free(db); @@ -1504,8 +1514,12 @@ ovsdb_server_disconnect_active_ovsdb_server(struct unixctl_conn *conn, void *config_) { struct server_config *config = config_; + struct shash_node *node; - disconnect_active_server(); + SHASH_FOR_EACH (node, config->all_dbs) { + struct db *db = node->data; + replication_remove_db(db->db); + } *config->is_backup = false; save_config(config); unixctl_command_reply(conn, NULL); @@ 
-1524,7 +1538,11 @@ ovsdb_server_set_active_ovsdb_server_probe_interval(struct unixctl_conn *conn, *config->replication_probe_interval = probe_interval; save_config(config); if (*config->is_backup) { - replication_set_probe_interval(probe_interval); + const struct uuid *server_uuid; + server_uuid = ovsdb_jsonrpc_server_get_uuid(config->jsonrpc); + ovsdb_replication_init(*config->sync_from, *config->sync_exclude, + config->all_dbs, server_uuid, + *config->replication_probe_interval); } unixctl_command_reply(conn, NULL); } else { @@ -1561,7 +1579,7 @@ ovsdb_server_set_sync_exclude_tables(struct unixctl_conn *conn, { struct server_config *config = config_; - char *err = set_excluded_tables(argv[1], true); + char *err = parse_excluded_tables(argv[1]); if (!err) { free(*config->sync_exclude); *config->sync_exclude = xstrdup(argv[1]); @@ -1573,7 +1591,6 @@ ovsdb_server_set_sync_exclude_tables(struct unixctl_conn *conn, config->all_dbs, server_uuid, *config->replication_probe_interval); } - err = set_excluded_tables(argv[1], false); } unixctl_command_reply(conn, err); free(err); @@ -1583,11 +1600,11 @@ static void ovsdb_server_get_sync_exclude_tables(struct unixctl_conn *conn, int argc OVS_UNUSED, const char *argv[] OVS_UNUSED, - void *arg_ OVS_UNUSED) + void *config_) { - char *reply = get_excluded_tables(); - unixctl_command_reply(conn, reply); - free(reply); + struct server_config *config = config_; + + unixctl_command_reply(conn, *config->sync_exclude); } static void @@ -1846,13 +1863,6 @@ remove_db(struct server_config *config, struct shash_node *node, char *comment) shash_delete(config->all_dbs, node); save_config(config); - if (*config->is_backup) { - const struct uuid *server_uuid; - server_uuid = ovsdb_jsonrpc_server_get_uuid(config->jsonrpc); - ovsdb_replication_init(*config->sync_from, *config->sync_exclude, - config->all_dbs, server_uuid, - *config->replication_probe_interval); - } } static void @@ -1994,7 +2004,17 @@ ovsdb_server_get_sync_status(struct 
unixctl_conn *conn, int argc OVS_UNUSED, ds_put_format(&ds, "state: %s\n", is_backup ? "backup" : "active"); if (is_backup) { - ds_put_and_free_cstr(&ds, replication_status()); + const struct shash_node **db_nodes = shash_sort(config->all_dbs); + + for (size_t i = 0; i < shash_count(config->all_dbs); i++) { + const struct db *db = db_nodes[i]->data; + + if (db->db && db->db->name[0] != '_') { + ds_put_and_free_cstr(&ds, replication_status(db->db)); + ds_put_char(&ds, '\n'); + } + } + free(db_nodes); } unixctl_command_reply(conn, ds_cstr(&ds)); @@ -2158,7 +2178,7 @@ parse_options(int argc, char *argv[], break; case OPT_SYNC_EXCLUDE: { - char *err = set_excluded_tables(optarg, false); + char *err = parse_excluded_tables(optarg); if (err) { ovs_fatal(0, "%s", err); } diff --git a/ovsdb/replication.c b/ovsdb/replication.c index 477c69d701b..30292fb490b 100644 --- a/ovsdb/replication.c +++ b/ovsdb/replication.c @@ -38,16 +38,7 @@ VLOG_DEFINE_THIS_MODULE(replication); -static char *sync_from; static struct uuid server_uuid; -static struct jsonrpc_session *session; -static unsigned int session_seqno = UINT_MAX; - -static struct jsonrpc_msg *create_monitor_request(struct ovsdb_schema *); -static void add_monitored_table(struct ovsdb_table_schema *table, - struct json *monitor_requests); - -static struct ovsdb_error *reset_database(struct ovsdb *db); static struct ovsdb_error *process_notification(struct json *, struct ovsdb *); static struct ovsdb_error *process_table_update(struct json *table_update, @@ -55,27 +46,6 @@ static struct ovsdb_error *process_table_update(struct json *table_update, struct ovsdb *database, struct ovsdb_txn *txn); -/* Maps from db name to sset of table names. 
*/ -static struct shash excluded_tables = SHASH_INITIALIZER(&excluded_tables); - -static void excluded_tables_clear(void); -static void excluded_tables_add(const char *database, const char *table); -static bool excluded_tables_find(const char *database, const char *table); - - -/* Keep track of request IDs of all outstanding OVSDB requests. */ -static struct hmap request_ids = HMAP_INITIALIZER(&request_ids); - -struct request_ids_hmap_node { - struct hmap_node hmap; - struct json *request_id; - struct ovsdb *db; /* associated database */ -}; -void request_ids_add(const struct json *id, struct ovsdb *db); -bool request_ids_lookup_and_free(const struct json *id, struct ovsdb **db); -static void request_ids_destroy(void); -void request_ids_clear(void); - enum ovsdb_replication_state { RPL_S_INIT, RPL_S_SERVER_ID_REQUESTED, @@ -85,168 +55,215 @@ enum ovsdb_replication_state { RPL_S_REPLICATING, RPL_S_ERR /* Error, no longer replicating. */ }; -static enum ovsdb_replication_state state; - struct replication_db { struct ovsdb *db; + bool schema_version_higher; /* Points to the schema received from the active server if * the local db schema version is higher. NULL otherwise. */ struct ovsdb_schema *active_db_schema; + + char *sync_from; + char *excluded_tables_str; + struct sset excluded_tables; + + struct json *request_id; /* Id of the outstanding OVSDB request. */ + + struct jsonrpc_session *session; + unsigned int session_seqno; + + enum ovsdb_replication_state state; }; static bool is_replication_possible(struct ovsdb_schema *local_db_schema, struct ovsdb_schema *active_db_schema); +static struct jsonrpc_msg *create_monitor_request(struct replication_db *, + struct ovsdb_schema *); +static void add_monitored_table(struct ovsdb_table_schema *table, + struct json *monitor_requests); + + /* All DBs known to ovsdb-server. The actual replication dbs are stored * in 'replication dbs', which is a subset of all dbs and remote dbs whose * schema matches. 
*/ -static struct shash local_dbs = SHASH_INITIALIZER(&local_dbs); -static struct shash *replication_dbs; +static struct shash replication_dbs = SHASH_INITIALIZER(&replication_dbs); + +static void replication_db_destroy(struct replication_db *); +static struct ovsdb_error *reset_database(struct replication_db *); -static struct shash *replication_dbs_create(void); -static void replication_dbs_destroy(void); /* Find 'struct ovsdb' by name within 'replication_dbs' */ static struct replication_db *find_db(const char *db_name); + +static char *set_excluded_tables(struct replication_db *, const char *excluded) + OVS_WARN_UNUSED_RESULT; + +static void request_id_set(struct replication_db *, const struct json *id); +static void request_id_clear(struct replication_db *); +static bool request_id_compare_and_free(struct replication_db *, + const struct json *id); void -replication_init(const char *sync_from_, const char *exclude_tables, - const struct uuid *server, int probe_interval) +replication_set_db(struct ovsdb *db, const char *sync_from, + const char *exclude_tables, const struct uuid *server, + int probe_interval) { - free(sync_from); - sync_from = xstrdup(sync_from_); - /* Caller should have verified that the 'exclude_tables' is - * parseable. An error here is unexpected. */ - ovs_assert(!set_excluded_tables(exclude_tables, false)); + struct replication_db *rdb = find_db(db->name); - replication_dbs_destroy(); + if (uuid_is_zero(&server_uuid)) { + /* Keep a copy of local server uuid. 
*/ + server_uuid = *server; + } else { + ovs_assert(uuid_equals(&server_uuid, server)); + } - shash_clear(&local_dbs); - if (session) { - jsonrpc_session_close(session); + ovs_assert(sync_from); + + if (rdb + && nullable_string_is_equal(rdb->excluded_tables_str, exclude_tables) + && nullable_string_is_equal(rdb->sync_from, sync_from)) { + jsonrpc_session_set_probe_interval(rdb->session, probe_interval); + return; } - session = jsonrpc_session_open(sync_from, true); - session_seqno = UINT_MAX; + if (!rdb) { + rdb = xzalloc(sizeof *rdb); + rdb->db = db; + sset_init(&rdb->excluded_tables); + rdb->schema_version_higher = false; + shash_add(&replication_dbs, db->name, rdb); + } else { + replication_db_destroy(rdb); + } + + rdb->sync_from = xstrdup(sync_from); + rdb->excluded_tables_str = nullable_xstrdup(exclude_tables); + /* Caller should have verified that the 'exclude_tables' is + * parseable. An error here is unexpected. */ + ovs_assert(!set_excluded_tables(rdb, exclude_tables)); - jsonrpc_session_set_probe_interval(session, probe_interval); + rdb->session = jsonrpc_session_open(rdb->sync_from, true); + rdb->session_seqno = UINT_MAX; - /* Keep a copy of local server uuid. */ - server_uuid = *server; + jsonrpc_session_set_probe_interval(rdb->session, probe_interval); - state = RPL_S_INIT; + rdb->state = RPL_S_INIT; } void -replication_add_local_db(const char *database, struct ovsdb *db) +replication_remove_db(const struct ovsdb *db) { - shash_add_assert(&local_dbs, database, db); + struct replication_db *rdb; + + rdb = shash_find_and_delete(&replication_dbs, db->name); + if (rdb) { + replication_db_destroy(rdb); + free(rdb); + } } static void -send_schema_requests(const struct json *result) +send_schema_request(struct replication_db *rdb) { - for (size_t i = 0; i < result->array.n; i++) { - const struct json *name = result->array.elems[i]; - if (name->type == JSON_STRING) { - /* Send one schema request for each remote DB. 
*/ - const char *db_name = json_string(name); - struct replication_db *rdb = find_db(db_name); - if (rdb) { - struct jsonrpc_msg *request = - jsonrpc_create_request( - "get_schema", - json_array_create_1( - json_string_create(db_name)), - NULL); - - request_ids_add(request->id, rdb->db); - jsonrpc_session_send(session, request); - } - } - } + struct jsonrpc_msg *request = + jsonrpc_create_request( + "get_schema", + json_array_create_1(json_string_create(rdb->db->name)), + NULL); + + request_id_set(rdb, request->id); + jsonrpc_session_send(rdb->session, request); } -void -replication_run(void) +static void +replication_run_db(struct replication_db *rdb) { - if (!session) { + if (!rdb->session) { return; } - jsonrpc_session_run(session); + jsonrpc_session_run(rdb->session); - for (int i = 0; jsonrpc_session_is_connected(session) && i < 50; i++) { + for (int i = 0; i < 50; i++) { struct jsonrpc_msg *msg; unsigned int seqno; - seqno = jsonrpc_session_get_seqno(session); - if (seqno != session_seqno || state == RPL_S_INIT) { - session_seqno = seqno; - request_ids_clear(); + if (!jsonrpc_session_is_connected(rdb->session)) { + break; + } + + seqno = jsonrpc_session_get_seqno(rdb->session); + if (seqno != rdb->session_seqno || rdb->state == RPL_S_INIT) { + rdb->session_seqno = seqno; + request_id_clear(rdb); + struct jsonrpc_msg *request; request = jsonrpc_create_request("get_server_id", json_array_create_empty(), NULL); - request_ids_add(request->id, NULL); - jsonrpc_session_send(session, request); + request_id_set(rdb, request->id); + jsonrpc_session_send(rdb->session, request); - state = RPL_S_SERVER_ID_REQUESTED; - VLOG_DBG("send server ID request."); + rdb->state = RPL_S_SERVER_ID_REQUESTED; + VLOG_DBG("%s: send server ID request.", rdb->db->name); } - msg = jsonrpc_session_recv(session); + msg = jsonrpc_session_recv(rdb->session); if (!msg) { continue; } - if (msg->type == JSONRPC_NOTIFY && state != RPL_S_ERR + if (msg->type == JSONRPC_NOTIFY && rdb->state != 
RPL_S_ERR && !strcmp(msg->method, "update")) { if (msg->params->type == JSON_ARRAY && msg->params->array.n == 2 && msg->params->array.elems[0]->type == JSON_STRING) { char *db_name = msg->params->array.elems[0]->string; - struct replication_db *rdb = find_db(db_name); - if (rdb) { + + if (!strcmp(db_name, rdb->db->name)) { struct ovsdb_error *error; error = process_notification(msg->params->array.elems[1], rdb->db); if (error) { ovsdb_error_assert(error); - state = RPL_S_ERR; + rdb->state = RPL_S_ERR; } + } else { + VLOG_WARN("%s: received update for unexpected database %s", + rdb->db->name, db_name); + rdb->state = RPL_S_ERR; } } } else if (msg->type == JSONRPC_REPLY) { - struct replication_db *rdb; - struct ovsdb *db; - if (!request_ids_lookup_and_free(msg->id, &db)) { - VLOG_WARN("received unexpected reply"); + if (!request_id_compare_and_free(rdb, msg->id)) { + VLOG_WARN("%s: received unexpected reply.", rdb->db->name); goto next; } - switch (state) { + switch (rdb->state) { case RPL_S_SERVER_ID_REQUESTED: { struct uuid uuid; if (msg->result->type != JSON_STRING || !uuid_from_string(&uuid, json_string(msg->result))) { struct ovsdb_error *error; error = ovsdb_error("get_server_id failed", - "Server ID is not valid UUID"); + "%s: Server ID is not valid UUID", + rdb->db->name); ovsdb_error_assert(error); - state = RPL_S_ERR; + rdb->state = RPL_S_ERR; break; } if (uuid_equals(&uuid, &server_uuid)) { struct ovsdb_error *error; error = ovsdb_error("Server ID check failed", - "Self replicating is not allowed"); + "%s: Self replicating is not allowed", + rdb->db->name); ovsdb_error_assert(error); - state = RPL_S_ERR; + rdb->state = RPL_S_ERR; break; } @@ -254,25 +271,32 @@ replication_run(void) request = jsonrpc_create_request("list_dbs", json_array_create_empty(), NULL); - request_ids_add(request->id, NULL); - jsonrpc_session_send(session, request); + request_id_set(rdb, request->id); + jsonrpc_session_send(rdb->session, request); - replication_dbs_destroy(); - 
replication_dbs = replication_dbs_create(); - state = RPL_S_DB_REQUESTED; + rdb->state = RPL_S_DB_REQUESTED; break; } case RPL_S_DB_REQUESTED: if (msg->result->type != JSON_ARRAY) { struct ovsdb_error *error; error = ovsdb_error("list_dbs failed", - "list_dbs response is not array"); + "%s: list_dbs response is not array", + rdb->db->name); + ovsdb_error_assert(error); + rdb->state = RPL_S_ERR; + } else if (!json_array_contains_string(msg->result, + rdb->db->name)) { + struct ovsdb_error *error; + error = ovsdb_error("list_dbs failed", + "%s: database name is not in the list", + rdb->db->name); ovsdb_error_assert(error); - state = RPL_S_ERR; + rdb->state = RPL_S_ERR; } else { - send_schema_requests(msg->result); - VLOG_DBG("Send schema requests"); - state = RPL_S_SCHEMA_REQUESTED; + send_schema_request(rdb); + VLOG_DBG("%s: send schema request.", rdb->db->name); + rdb->state = RPL_S_SCHEMA_REQUESTED; } break; @@ -283,19 +307,22 @@ replication_run(void) error = ovsdb_schema_from_json(msg->result, &schema); if (error) { ovsdb_error_assert(error); - state = RPL_S_ERR; + rdb->state = RPL_S_ERR; + break; } - rdb = find_db(schema->name); - if (!rdb) { + if (strcmp(rdb->db->name, schema->name)) { /* Unexpected schema. */ - VLOG_WARN("unexpected schema %s", schema->name); - state = RPL_S_ERR; + VLOG_WARN("%s: unexpected schema %s.", + rdb->db->name, schema->name); + rdb->state = RPL_S_ERR; + ovsdb_schema_destroy(schema); + break; } else if (!ovsdb_schema_equal(schema, rdb->db->schema)) { /* Schmea version mismatch. 
*/ - VLOG_INFO("Schema version mismatch, checking if %s can " - "still be replicated or not.", - schema->name); + VLOG_INFO("%s: Schema version mismatch, checking if %s can" + " still be replicated or not.", + rdb->db->name, schema->name); if (is_replication_possible(rdb->db->schema, schema)) { VLOG_INFO("%s can be replicated.", schema->name); rdb->schema_version_higher = true; @@ -305,68 +332,48 @@ replication_run(void) rdb->active_db_schema = schema; } else { VLOG_INFO("%s cannot be replicated.", schema->name); - struct replication_db *r = - shash_find_and_delete(replication_dbs, - schema->name); - if (r->active_db_schema) { - ovsdb_schema_destroy(r->active_db_schema); - } - free(r); + rdb->state = RPL_S_ERR; ovsdb_schema_destroy(schema); + break; } } else { ovsdb_schema_destroy(schema); } - /* After receiving schemas, reset the local databases that - * will be monitored and send out monitor requests for them. */ - if (hmap_is_empty(&request_ids)) { - struct shash_node *node; - - if (shash_is_empty(replication_dbs)) { - VLOG_WARN("Nothing to replicate."); - state = RPL_S_ERR; - } else { - SHASH_FOR_EACH (node, replication_dbs) { - rdb = node->data; - struct jsonrpc_msg *request = - create_monitor_request( - rdb->schema_version_higher ? - rdb->active_db_schema : rdb->db->schema); - - request_ids_add(request->id, rdb->db); - jsonrpc_session_send(session, request); - VLOG_DBG("Send monitor requests"); - state = RPL_S_MONITOR_REQUESTED; - } - } - } + /* Send out a monitor request. */ + struct jsonrpc_msg *request = + create_monitor_request(rdb, rdb->schema_version_higher + ? rdb->active_db_schema + : rdb->db->schema); + + request_id_set(rdb, request->id); + jsonrpc_session_send(rdb->session, request); + VLOG_DBG("%s: send monitor request.", rdb->db->name); + rdb->state = RPL_S_MONITOR_REQUESTED; break; } case RPL_S_MONITOR_REQUESTED: { /* Reply to monitor requests. */ struct ovsdb_error *error; - VLOG_INFO("Monitor request received. 
Resetting the database"); + VLOG_INFO("%s: Monitor reply received. " + "Resetting the database.", rdb->db->name); /* Resetting the database here has few risks. If the * process_notification() fails, the database is completely * lost locally. In case that node becomes active, then * there is a chance of complete data loss in the active/standy * cluster. */ - error = reset_database(db); + error = reset_database(rdb); if (!error) { - error = process_notification(msg->result, db); + error = process_notification(msg->result, rdb->db); } if (error) { ovsdb_error_assert(error); - state = RPL_S_ERR; + rdb->state = RPL_S_ERR; } else { - /* Transition to replicating state after receiving - * all replies of "monitor" requests. */ - if (hmap_is_empty(&request_ids)) { - VLOG_DBG("Listening to monitor updates"); - state = RPL_S_REPLICATING; - } + VLOG_DBG("%s: Listening to monitor updates.", + rdb->db->name); + rdb->state = RPL_S_REPLICATING; } break; } @@ -386,24 +393,40 @@ replication_run(void) } } +void +replication_run(void) +{ + struct shash_node *node; + + SHASH_FOR_EACH (node, &replication_dbs) { + replication_run_db(node->data); + } +} + void replication_wait(void) { - if (session) { - jsonrpc_session_wait(session); - jsonrpc_session_recv_wait(session); + struct shash_node *node; + + SHASH_FOR_EACH (node, &replication_dbs) { + struct replication_db *rdb = node->data; + + if (rdb->session) { + jsonrpc_session_wait(rdb->session); + jsonrpc_session_recv_wait(rdb->session); + } } } -/* Parse 'excluded' to rebuild 'excluded_tables'. If 'dryrun' is false, the - * current set of excluded tables will be wiped out, regardless of whether - * 'excluded' can be parsed. If 'dryrun' is true, only parses 'excluded' and +/* Parse 'excluded' to rebuild 'rdb->excluded_tables'. If 'rdb' is not NULL, + * the current set of excluded tables will be wiped out, regardless of whether + * 'excluded' can be parsed. 
If 'rdb' is NULL, only parses 'excluded' and * reports any errors, without modifying the list of exclusions. * - * On error, returns the error string, which the caller is - * responsible for freeing. Returns NULL otherwise. */ -char * OVS_WARN_UNUSED_RESULT -set_excluded_tables(const char *excluded, bool dryrun) + * On error, returns the error string, which the caller is responsible for + * freeing. Returns NULL otherwise. */ +static char * OVS_WARN_UNUSED_RESULT +set_excluded_tables__(struct replication_db *rdb, const char *excluded) { struct sset set = SSET_INITIALIZER(&set); char *err = NULL; @@ -411,17 +434,22 @@ set_excluded_tables(const char *excluded, bool dryrun) if (excluded) { const char *longname; - if (!dryrun) { - /* Can only add to an empty shash. */ - excluded_tables_clear(); + if (rdb) { + /* Can only add to an empty set. */ + sset_clear(&rdb->excluded_tables); } sset_from_delimited_string(&set, excluded, " ,"); SSET_FOR_EACH (longname, &set) { + if (rdb && !strchr(longname, ':')) { + sset_add(&rdb->excluded_tables, longname); + continue; + } + char *database = xstrdup(longname), *table = NULL; strtok_r(database, ":", &table); - if (table && !dryrun) { - excluded_tables_add(database, table); + if (table && rdb && !strcmp(rdb->db->name, database)) { + sset_add(&rdb->excluded_tables, table); } free(database); @@ -434,120 +462,74 @@ set_excluded_tables(const char *excluded, bool dryrun) done: sset_destroy(&set); - if (err && !dryrun) { + if (err && rdb) { /* On error, destroy the partially built 'excluded_tables'. 
*/ - excluded_tables_clear(); + sset_clear(&rdb->excluded_tables); } return err; } char * OVS_WARN_UNUSED_RESULT -get_excluded_tables(void) +parse_excluded_tables(const char *excluded) { - struct shash_node *node; - struct sset set = SSET_INITIALIZER(&set); - - SHASH_FOR_EACH (node, &excluded_tables) { - const char *database = node->name; - const char *table; - struct sset *tables = node->data; - - SSET_FOR_EACH (table, tables) { - sset_add_and_free(&set, xasprintf("%s:%s", database, table)); - } - } - - /* Output the table list in an sorted order, so that - * the output string will not depend on the hash function - * that used to implement the hmap data structure. This is - * only useful for writting unit tests. */ - const char **sorted = sset_sort(&set); - struct ds ds = DS_EMPTY_INITIALIZER; - size_t i; - for (i = 0; i < sset_count(&set); i++) { - ds_put_format(&ds, "%s,", sorted[i]); - } - - ds_chomp(&ds, ','); - - free(sorted); - sset_destroy(&set); - - return ds_steal_cstr(&ds); + return set_excluded_tables__(NULL, excluded); } -static void -excluded_tables_clear(void) +static char * OVS_WARN_UNUSED_RESULT +set_excluded_tables(struct replication_db *rdb, const char *excluded) { - struct shash_node *node; - SHASH_FOR_EACH (node, &excluded_tables) { - struct sset *tables = node->data; - sset_destroy(tables); - } - - shash_clear_free_data(&excluded_tables); + return set_excluded_tables__(rdb, excluded); } -static void -excluded_tables_add(const char *database, const char *table) +char * OVS_WARN_UNUSED_RESULT +get_excluded_tables(const struct ovsdb *db) { - struct sset *tables = shash_find_data(&excluded_tables, database); + const struct replication_db *rdb = find_db(db->name); - if (!tables) { - tables = xmalloc(sizeof *tables); - sset_init(tables); - shash_add(&excluded_tables, database, tables); + if (!rdb) { + return xstrdup(""); } - sset_add(tables, table); -} + struct sset set = SSET_INITIALIZER(&set); + const char *table; + char *result; -static bool 
-excluded_tables_find(const char *database, const char *table) -{ - struct sset *tables = shash_find_data(&excluded_tables, database); - return tables && sset_contains(tables, table); -} + SSET_FOR_EACH (table, &rdb->excluded_tables) { + sset_add_and_free(&set, xasprintf("%s:%s", rdb->db->name, table)); + } -void -disconnect_active_server(void) -{ - jsonrpc_session_close(session); - session = NULL; + result = sset_join(&set, ",", ""); + sset_destroy(&set); + + return result; } void replication_destroy(void) { - excluded_tables_clear(); - shash_destroy(&excluded_tables); + struct shash_node *node; - if (sync_from) { - free(sync_from); - sync_from = NULL; + SHASH_FOR_EACH (node, &replication_dbs) { + replication_db_destroy(node->data); } - - request_ids_destroy(); - replication_dbs_destroy(); - - shash_destroy(&local_dbs); + shash_destroy_free_data(&replication_dbs); } static struct replication_db * find_db(const char *db_name) { - return shash_find_data(replication_dbs, db_name); + return shash_find_data(&replication_dbs, db_name); } static struct ovsdb_error * -reset_database(struct ovsdb *db) +reset_database(struct replication_db *rdb) { - struct ovsdb_txn *txn = ovsdb_txn_create(db); + struct ovsdb_txn *txn = ovsdb_txn_create(rdb->db); struct shash_node *table_node; - SHASH_FOR_EACH (table_node, &db->tables) { + SHASH_FOR_EACH (table_node, &rdb->db->tables) { /* Delete all rows if the table is not excluded. */ - if (!excluded_tables_find(db->schema->name, table_node->name)) { + if (!sset_contains(&rdb->excluded_tables, table_node->name)) { struct ovsdb_table *table = table_node->data; struct ovsdb_row *row; HMAP_FOR_EACH_SAFE (row, hmap_node, &table->rows) { @@ -565,7 +547,7 @@ reset_database(struct ovsdb *db) * Caller is responsible for disposing 'request'. 
*/ static struct jsonrpc_msg * -create_monitor_request(struct ovsdb_schema *schema) +create_monitor_request(struct replication_db *rdb, struct ovsdb_schema *schema) { struct jsonrpc_msg *request; struct json *monitor; @@ -579,7 +561,7 @@ create_monitor_request(struct ovsdb_schema *schema) struct ovsdb_table_schema *table = nodes[j]->data; /* Monitor all tables not excluded. */ - if (!excluded_tables_find(db_name, table->name)) { + if (!sset_contains(&rdb->excluded_tables, table->name)) { add_monitored_table(table, monitor_request); } } @@ -689,114 +671,76 @@ process_table_update(struct json *table_update, const char *table_name, return NULL; } -void -request_ids_add(const struct json *id, struct ovsdb *db) +static void +request_id_set(struct replication_db *rdb, const struct json *id) { - struct request_ids_hmap_node *node = xmalloc(sizeof *node); + ovs_assert(!rdb->request_id); + rdb->request_id = json_clone(id); +} - node->request_id = json_clone(id); - node->db = db; - hmap_insert(&request_ids, &node->hmap, json_hash(id, 0)); +static void +request_id_clear(struct replication_db *rdb) +{ + json_destroy(rdb->request_id); + rdb->request_id = NULL; } -/* Look up 'id' from 'request_ids', if found, remove the found id from - * 'request_ids' and free its memory. If not found, 'request_ids' does - * not change. Sets '*db' to the database for the request (NULL if not - * found). +/* Compare 'id' with sent 'request_id'. If it matches, clear the current + * 'request_id'. If it doesn't match, 'request_id' does not change. * - * Return true if 'id' is found, false otherwise. + * Return true if 'id' matches, false otherwise. 
*/ -bool -request_ids_lookup_and_free(const struct json *id, struct ovsdb **db) +static bool +request_id_compare_and_free(struct replication_db *rdb, const struct json *id) { - struct request_ids_hmap_node *node; - - HMAP_FOR_EACH_WITH_HASH (node, hmap, json_hash(id, 0), &request_ids) { - if (json_equal(id, node->request_id)) { - hmap_remove(&request_ids, &node->hmap); - *db = node->db; - json_destroy(node->request_id); - free(node); - return true; - } + if (rdb->request_id && json_equal(id, rdb->request_id)) { + request_id_clear(rdb); + return true; } - - *db = NULL; return false; } static void -request_ids_destroy(void) +replication_db_destroy(struct replication_db *rdb) { - struct request_ids_hmap_node *node; - - HMAP_FOR_EACH_POP (node, hmap, &request_ids) { - json_destroy(node->request_id); - free(node); + if (!rdb) { + return; } - hmap_destroy(&request_ids); -} -void -request_ids_clear(void) -{ - request_ids_destroy(); - hmap_init(&request_ids); -} + free(rdb->sync_from); + rdb->sync_from = NULL; -static struct shash * -replication_dbs_create(void) -{ - struct shash *new = xmalloc(sizeof *new); - shash_init(new); + free(rdb->excluded_tables_str); + rdb->excluded_tables_str = NULL; + sset_destroy(&rdb->excluded_tables); - struct shash_node *node; - SHASH_FOR_EACH (node, &local_dbs) { - struct replication_db *repl_db = xmalloc(sizeof *repl_db); - repl_db->db = node->data; - repl_db->schema_version_higher = false; - repl_db->active_db_schema = NULL; - shash_add(new, node->name, repl_db); - } + request_id_clear(rdb); - return new; -} - -static void -replication_dbs_destroy(void) -{ - if (!replication_dbs) { - return; + if (rdb->session) { + jsonrpc_session_close(rdb->session); + rdb->session = NULL; } - struct shash_node *node; - - SHASH_FOR_EACH_SAFE (node, replication_dbs) { - hmap_remove(&replication_dbs->map, &node->node); - struct replication_db *rdb = node->data; - if (rdb->active_db_schema) { - ovsdb_schema_destroy(rdb->active_db_schema); - } - free(rdb); 
- free(node->name); - free(node); + if (rdb->active_db_schema) { + ovsdb_schema_destroy(rdb->active_db_schema); + rdb->active_db_schema = NULL; } - hmap_destroy(&replication_dbs->map); - free(replication_dbs); - replication_dbs = NULL; + rdb->schema_version_higher = false; } /* Return true if replication just started or is ongoing. * Return false if the connection failed, or the replication * was not able to start. */ bool -replication_is_alive(void) +replication_is_alive(const struct ovsdb *db) { - if (session) { - return jsonrpc_session_is_alive(session) && state != RPL_S_ERR; + const struct replication_db *rdb = find_db(db->name); + + if (!rdb || !rdb->session) { + return false; } - return false; + return jsonrpc_session_is_alive(rdb->session) && rdb->state != RPL_S_ERR; } /* Return the last error reported on a connection by 'session'. The @@ -806,60 +750,60 @@ replication_is_alive(void) * Return a negative value if replication session has error, or the * replication was not able to start. */ int -replication_get_last_error(void) +replication_get_last_error(const struct ovsdb *db) { + const struct replication_db *rdb = find_db(db->name); int err = 0; - if (session) { - err = jsonrpc_session_get_last_error(session); + if (rdb && rdb->session) { + err = jsonrpc_session_get_last_error(rdb->session); if (!err) { - err = (state == RPL_S_ERR) ? ENOENT : 0; + err = (rdb->state == RPL_S_ERR) ? 
ENOENT : 0; } } return err; } -char * -replication_status(void) +char * OVS_WARN_UNUSED_RESULT +replication_status(const struct ovsdb *db) { - bool alive = session && jsonrpc_session_is_alive(session); + const struct replication_db *rdb = find_db(db->name); + + if (!rdb) { + return xasprintf("%s is not configured for replication", db->name); + } + + bool alive = rdb->session && jsonrpc_session_is_alive(rdb->session); struct ds ds = DS_EMPTY_INITIALIZER; + ds_put_format(&ds, "database: %s\n", db->name); if (alive) { - switch(state) { + switch (rdb->state) { case RPL_S_INIT: case RPL_S_SERVER_ID_REQUESTED: case RPL_S_DB_REQUESTED: case RPL_S_SCHEMA_REQUESTED: case RPL_S_MONITOR_REQUESTED: - ds_put_format(&ds, "connecting: %s", sync_from); + ds_put_format(&ds, "connecting: %s", rdb->sync_from); break; case RPL_S_REPLICATING: { - struct shash_node *node; - - ds_put_format(&ds, "replicating: %s\n", sync_from); - ds_put_cstr(&ds, "database:"); - SHASH_FOR_EACH (node, replication_dbs) { - ds_put_format(&ds, " %s,", node->name); - } - ds_chomp(&ds, ','); + ds_put_format(&ds, "replicating: %s\n", rdb->sync_from); - if (!shash_is_empty(&excluded_tables)) { - ds_put_char(&ds, '\n'); + if (!sset_is_empty(&rdb->excluded_tables)) { ds_put_cstr(&ds, "exclude: "); - ds_put_and_free_cstr(&ds, get_excluded_tables()); + ds_put_and_free_cstr(&ds, get_excluded_tables(db)); } break; } case RPL_S_ERR: - ds_put_format(&ds, "Replication to (%s) failed\n", sync_from); + ds_put_format(&ds, "Replication to (%s) failed", rdb->sync_from); break; default: OVS_NOT_REACHED(); } } else { - ds_put_format(&ds, "not connected to %s", sync_from); + ds_put_format(&ds, "not connected to %s", rdb->sync_from); } return ds_steal_cstr(&ds); } @@ -913,10 +857,12 @@ is_replication_possible(struct ovsdb_schema *local_db_schema, } void -replication_set_probe_interval(int probe_interval) +replication_set_probe_interval(const struct ovsdb *db, int probe_interval) { - if (session) { - 
jsonrpc_session_set_probe_interval(session, probe_interval); + const struct replication_db *rdb = find_db(db->name); + + if (rdb && rdb->session) { + jsonrpc_session_set_probe_interval(rdb->session, probe_interval); } } diff --git a/ovsdb/replication.h b/ovsdb/replication.h index 6d1be820f3d..5e573e1002c 100644 --- a/ovsdb/replication.h +++ b/ovsdb/replication.h @@ -26,41 +26,40 @@ struct ovsdb; * API Usage *=========== * - * - replication_init() needs to be called whenever OVSDB server switches into + * - replication_set_db() needs to be called whenever database switches into * the backup mode. * - * - replication_add_local_db() should be called immediately after to add all - * known database that OVSDB server owns, one at a time. + * - replication_remove_db() needs to be called whenever backup database + * switches into an active mode. * * - replication_destroy() should be called when OVSDB server shutdown to * reclaim resources. * * - replication_run(), replication_wait(), replication_is_alive() and * replication_get_last_error() should be call within the main loop - * whenever OVSDB server runs in the backup mode. + * whenever OVSDB has backup databases. * - * - set_excluded_tables(), get_excluded_tables(), disconnect_active_server() - * and replication_usage() are support functions used mainly by unixctl - * commands. + * - parse_excluded_tables(), get_excluded_tables() and replication_usage() + * are support functions used mainly by unixctl commands. 
*/ #define REPLICATION_DEFAULT_PROBE_INTERVAL 60000 -void replication_init(const char *sync_from, const char *exclude_tables, - const struct uuid *server, int probe_interval); +void replication_set_db(struct ovsdb *, const char *sync_from, + const char *exclude_tables, const struct uuid *server, + int probe_interval); +void replication_remove_db(const struct ovsdb *); + void replication_run(void); void replication_wait(void); void replication_destroy(void); void replication_usage(void); -void replication_add_local_db(const char *databse, struct ovsdb *db); -bool replication_is_alive(void); -int replication_get_last_error(void); -char *replication_status(void); -void replication_set_probe_interval(int); +bool replication_is_alive(const struct ovsdb *); +int replication_get_last_error(const struct ovsdb *); +char *replication_status(const struct ovsdb *); +void replication_set_probe_interval(const struct ovsdb *, int probe_interval); -char *set_excluded_tables(const char *excluded, bool dryrun) - OVS_WARN_UNUSED_RESULT; -char *get_excluded_tables(void) OVS_WARN_UNUSED_RESULT; -void disconnect_active_server(void); +char *parse_excluded_tables(const char *excluded) OVS_WARN_UNUSED_RESULT; +char *get_excluded_tables(const struct ovsdb *) OVS_WARN_UNUSED_RESULT; #endif /* ovsdb/replication.h */ From c8c0e570dcb3007e6a30e11f36aa4a23c0e3ad0b Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 9 Jan 2024 23:49:06 +0100 Subject: [PATCH 519/833] ovsdb: replication: Automatically switch read-only mode. When database is added to the replication, it should no longer accept transactions that can modify it. When it's removed from the replication, it should be writable again. Add this logic to the replication module itself, so it can be removed from the main ovsdb-server later. 
Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- ovsdb/replication.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ovsdb/replication.c b/ovsdb/replication.c index 30292fb490b..b166a56622d 100644 --- a/ovsdb/replication.c +++ b/ovsdb/replication.c @@ -150,6 +150,7 @@ replication_set_db(struct ovsdb *db, const char *sync_from, jsonrpc_session_set_probe_interval(rdb->session, probe_interval); rdb->state = RPL_S_INIT; + rdb->db->read_only = true; } void @@ -727,6 +728,7 @@ replication_db_destroy(struct replication_db *rdb) } rdb->schema_version_higher = false; + rdb->db->read_only = false; } /* Return true if replication just started or is ongoing. From e76f8472090edb29a0b1b49ea292473a598a6095 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 9 Jan 2024 23:49:07 +0100 Subject: [PATCH 520/833] ovsdb-server: Database config isolation. Add a new structure 'db_config' that holds the user-provided configuration of the database. And attach this configuration to each of the databases on the server. Each database has a service model: standalone, clustered, relay or active-backup. Relays and A-B databases have a source, each source has its own set of JSON-RPC session options. A-B also have an indicator of it being active or backup and an optional list of tables to exclude from replication. All of that should be stored per database in the temporary configuration file that is used in order to restore the config after the OVSDB crash. For that, the save/load functions are also updated. This change is written in a generic way assuming all the databases can have different configuration including service model. The only user-visible change here is a slight modification of the ovsdb-server/sync-status appctl, since it now needs to skip databases that are not active-backup and also should report active-backup databases that are currently active, i.e. not added to the replication module.
If the service model is not defined in the configuration, it is assumed to be standalone or clustered, and determined from the storage type while opening the database. If the service model is defined, but doesn't match the actual storage type in the database file, ovsdb-server will fail to open the database. This should never happen with internally generated config file, but may happen in the future with user-provided configuration files. In this case the service model is used for verification purposes only, if administrator wants to assert a particular model. Since the database 'source' connections can't use 'role' or 'read-only' options, a new flag added to corresponding JSON parsing functions to skip these fields. Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- ovsdb/jsonrpc-server.c | 39 ++- ovsdb/jsonrpc-server.h | 6 +- ovsdb/ovsdb-server.c | 779 +++++++++++++++++++++++++++++++---------- ovsdb/replication.c | 1 - tests/ovsdb-server.at | 8 +- 5 files changed, 633 insertions(+), 200 deletions(-) diff --git a/ovsdb/jsonrpc-server.c b/ovsdb/jsonrpc-server.c index d5e9c311ca4..da1fbd2502b 100644 --- a/ovsdb/jsonrpc-server.c +++ b/ovsdb/jsonrpc-server.c @@ -230,8 +230,18 @@ ovsdb_jsonrpc_options_clone(const struct ovsdb_jsonrpc_options *options) return clone; } +void +ovsdb_jsonrpc_options_free(struct ovsdb_jsonrpc_options *options) +{ + if (options) { + free(options->role); + free(options); + } +} + struct json * -ovsdb_jsonrpc_options_to_json(const struct ovsdb_jsonrpc_options *options) +ovsdb_jsonrpc_options_to_json(const struct ovsdb_jsonrpc_options *options, + bool jsonrpc_session_only) { struct json *json = json_object_create(); @@ -239,9 +249,15 @@ ovsdb_jsonrpc_options_to_json(const struct ovsdb_jsonrpc_options *options) json_integer_create(options->max_backoff)); json_object_put(json, "inactivity-probe", json_integer_create(options->probe_interval)); + json_object_put(json, "dscp", json_integer_create(options->dscp)); + + if (jsonrpc_session_only) 
{ + /* Caller is not interested in OVSDB-specific options. */ + return json; + } + json_object_put(json, "read-only", json_boolean_create(options->read_only)); - json_object_put(json, "dscp", json_integer_create(options->dscp)); if (options->role) { json_object_put(json, "role", json_string_create(options->role)); } @@ -251,7 +267,8 @@ ovsdb_jsonrpc_options_to_json(const struct ovsdb_jsonrpc_options *options) void ovsdb_jsonrpc_options_update_from_json(struct ovsdb_jsonrpc_options *options, - const struct json *json) + const struct json *json, + bool jsonrpc_session_only) { const struct json *max_backoff, *probe_interval, *read_only, *dscp, *role; struct ovsdb_parser parser; @@ -271,23 +288,29 @@ ovsdb_jsonrpc_options_update_from_json(struct ovsdb_jsonrpc_options *options, options->probe_interval = json_integer(probe_interval); } + dscp = ovsdb_parser_member(&parser, "dscp", OP_INTEGER | OP_OPTIONAL); + if (dscp) { + options->dscp = json_integer(dscp); + } + + if (jsonrpc_session_only) { + /* Caller is not interested in OVSDB-specific options. 
*/ + goto exit; + } + read_only = ovsdb_parser_member(&parser, "read-only", OP_BOOLEAN | OP_OPTIONAL); if (read_only) { options->read_only = json_boolean(read_only); } - dscp = ovsdb_parser_member(&parser, "dscp", OP_INTEGER | OP_OPTIONAL); - if (dscp) { - options->dscp = json_integer(dscp); - } - role = ovsdb_parser_member(&parser, "role", OP_STRING | OP_OPTIONAL); if (role) { free(options->role); options->role = nullable_xstrdup(json_string(role)); } +exit: error = ovsdb_parser_finish(&parser); if (error) { char *s = ovsdb_error_to_string_free(error); diff --git a/ovsdb/jsonrpc-server.h b/ovsdb/jsonrpc-server.h index 39366ad7054..9fb2baa54aa 100644 --- a/ovsdb/jsonrpc-server.h +++ b/ovsdb/jsonrpc-server.h @@ -43,12 +43,14 @@ struct ovsdb_jsonrpc_options *ovsdb_jsonrpc_default_options( const char *target); struct ovsdb_jsonrpc_options *ovsdb_jsonrpc_options_clone( const struct ovsdb_jsonrpc_options *); +void ovsdb_jsonrpc_options_free(struct ovsdb_jsonrpc_options *); struct json *ovsdb_jsonrpc_options_to_json( - const struct ovsdb_jsonrpc_options *) + const struct ovsdb_jsonrpc_options *, bool jsonrpc_session_only) OVS_WARN_UNUSED_RESULT; void ovsdb_jsonrpc_options_update_from_json(struct ovsdb_jsonrpc_options *, - const struct json *); + const struct json *, + bool jsonrpc_session_only); void ovsdb_jsonrpc_server_set_remotes(struct ovsdb_jsonrpc_server *, const struct shash *); diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c index 404abf1e559..ab32ceb8ad0 100644 --- a/ovsdb/ovsdb-server.c +++ b/ovsdb/ovsdb-server.c @@ -42,6 +42,7 @@ #include "ovsdb-data.h" #include "ovsdb-types.h" #include "ovsdb-error.h" +#include "ovsdb-parser.h" #include "openvswitch/poll-loop.h" #include "process.h" #include "replication.h" @@ -65,12 +66,6 @@ VLOG_DEFINE_THIS_MODULE(ovsdb_server); -struct db { - char *filename; - struct ovsdb *db; - struct uuid row_uuid; -}; - /* SSL configuration. 
*/ static char *private_key_file; static char *certificate_file; @@ -100,16 +95,79 @@ static unixctl_cb_func ovsdb_server_get_sync_exclude_tables; static unixctl_cb_func ovsdb_server_get_sync_status; static unixctl_cb_func ovsdb_server_get_db_storage_status; +#define SERVICE_MODELS \ + SERVICE_MODEL(UNDEFINED, undefined) \ + SERVICE_MODEL(STANDALONE, standalone) \ + SERVICE_MODEL(CLUSTERED, clustered) \ + SERVICE_MODEL(ACTIVE_BACKUP, active-backup) \ + SERVICE_MODEL(RELAY, relay) + +enum service_model { +#define SERVICE_MODEL(ENUM, NAME) SM_##ENUM, + SERVICE_MODELS +#undef SERVICE_MODEL +}; + +static const char * +service_model_to_string(enum service_model model) +{ + switch (model) { +#define SERVICE_MODEL(ENUM, NAME) \ + case SM_##ENUM: return #NAME; + SERVICE_MODELS +#undef SERVICE_MODEL + default: OVS_NOT_REACHED(); + } +} + +static enum service_model +service_model_from_string(const char *model) +{ +#define SERVICE_MODEL(ENUM, NAME) \ + if (!strcmp(model, #NAME)) { \ + return SM_##ENUM; \ + } + SERVICE_MODELS +#undef SERVICE_MODEL + + VLOG_WARN("Unrecognized database service model: '%s'", model); + + return SM_UNDEFINED; +} + +struct db_config { + enum service_model model; + char *source; /* sync-from for backup or relay source. */ + struct ovsdb_jsonrpc_options *options; /* For 'source' connection. */ + + /* Configuration specific to SM_ACTIVE_BACKUP. */ + struct { + char *sync_exclude; /* Tables to exclude. */ + bool backup; /* If true, the database is read-only and receives + * updates from the 'source'. */ + } ab; +}; + +struct db { + struct ovsdb *db; + char *filename; + struct db_config *config; + struct uuid row_uuid; +}; + struct server_config { struct shash *remotes; - struct shash *all_dbs; - FILE *config_tmpfile; + struct shash *all_dbs; /* All the currently serviced databases. + * 'struct db' by a schema name. */ + struct ovsdb_jsonrpc_server *jsonrpc; + + /* Command line + appctl configuration. 
*/ char **sync_from; char **sync_exclude; bool *is_backup; int *replication_probe_interval; int *relay_source_probe_interval; - struct ovsdb_jsonrpc_server *jsonrpc; + FILE *config_tmpfile; }; static unixctl_cb_func ovsdb_server_add_remote; static unixctl_cb_func ovsdb_server_remove_remote; @@ -123,14 +181,15 @@ static unixctl_cb_func ovsdb_server_tlog_list; static void read_db(struct server_config *, struct db *); static struct ovsdb_error *open_db(struct server_config *, - const char *filename) + const char *filename, + const struct db_config *) OVS_WARN_UNUSED_RESULT; static void add_server_db(struct server_config *); static void remove_db(struct server_config *, struct shash_node *db, char *); static void close_db(struct server_config *, struct db *, char *); static void parse_options(int argc, char *argvp[], - struct sset *db_filenames, struct shash *remotes, + struct shash *db_conf, struct shash *remotes, char **unixctl_pathp, char **run_command, char **sync_from, char **sync_exclude, bool *is_backup); @@ -153,29 +212,14 @@ static void update_remote_status(const struct ovsdb_jsonrpc_server *jsonrpc, static void update_server_status(struct shash *all_dbs); static void save_config__(FILE *config_file, const struct shash *remotes, - const struct sset *db_filenames, + const struct shash *db_conf, const char *sync_from, const char *sync_exclude, bool is_backup); static void save_config(struct server_config *); static void load_config(FILE *config_file, struct shash *remotes, - struct sset *db_filenames, char **sync_from, + struct shash *db_conf, char **sync_from, char **sync_exclude, bool *is_backup); -static void -ovsdb_replication_init(const char *sync_from, const char *exclude, - struct shash *all_dbs, const struct uuid *server_uuid, - int probe_interval) -{ - struct shash_node *node; - SHASH_FOR_EACH (node, all_dbs) { - struct db *db = node->data; - if (node->name[0] != '_' && db->db) { - replication_set_db(db->db, sync_from, exclude, - server_uuid, 
probe_interval); - } - } -} - static void log_and_free_error(struct ovsdb_error *error) { @@ -186,11 +230,52 @@ log_and_free_error(struct ovsdb_error *error) } } +static void +ovsdb_server_replication_remove_db(struct db *db) +{ + replication_remove_db(db->db); + db->config->ab.backup = false; +} + +static void +ovsdb_server_replication_run(struct server_config *config) +{ + struct shash_node *node; + bool all_alive = true; + + replication_run(); + + SHASH_FOR_EACH (node, config->all_dbs) { + struct db *db = node->data; + + if (db->config->model == SM_ACTIVE_BACKUP && db->config->ab.backup + && !replication_is_alive(db->db)) { + ovsdb_server_replication_remove_db(db); + all_alive = false; + } + } + + /* If one connection is broken, switch all databases to active, + * since they are configured via the same command line / appctl. */ + if (!all_alive && *config->is_backup) { + *config->is_backup = false; + + SHASH_FOR_EACH (node, config->all_dbs) { + struct db *db = node->data; + + if (db->config->model == SM_ACTIVE_BACKUP + && db->config->ab.backup) { + ovsdb_server_replication_remove_db(db); + } + } + } +} + static void main_loop(struct server_config *config, struct ovsdb_jsonrpc_server *jsonrpc, struct shash *all_dbs, struct unixctl_server *unixctl, struct shash *remotes, - struct process *run_process, bool *exiting, bool *is_backup) + struct process *run_process, bool *exiting) { char *remotes_error, *ssl_error; struct shash_node *node; @@ -220,7 +305,7 @@ main_loop(struct server_config *config, * the set of remotes that reconfigure_remotes() uses. 
*/ unixctl_server_run(unixctl); - ovsdb_jsonrpc_server_set_read_only(jsonrpc, *is_backup); + ovsdb_jsonrpc_server_set_read_only(jsonrpc, false); report_error_if_changed( reconfigure_remotes(jsonrpc, all_dbs, remotes), @@ -228,23 +313,7 @@ main_loop(struct server_config *config, report_error_if_changed(reconfigure_ssl(all_dbs), &ssl_error); ovsdb_jsonrpc_server_run(jsonrpc); - replication_run(); - if (*is_backup) { - SHASH_FOR_EACH (node, all_dbs) { - struct db *db = node->data; - if (db->db->name[0] != '_' && !replication_is_alive(db->db)) { - *is_backup = false; - break; - } - } - if (!*is_backup) { - SHASH_FOR_EACH (node, all_dbs) { - struct db *db = node->data; - replication_remove_db(db->db); - } - } - } - + ovsdb_server_replication_run(config); ovsdb_relay_run(); SHASH_FOR_EACH_SAFE (node, all_dbs) { @@ -351,6 +420,89 @@ parse_relay_args(const char *arg, char **name, char **remote) return true; } +static void +db_config_destroy(struct db_config *conf) +{ + if (!conf) { + return; + } + + free(conf->source); + ovsdb_jsonrpc_options_free(conf->options); + free(conf->ab.sync_exclude); + free(conf); +} + +static struct db_config * +db_config_clone(const struct db_config *c) +{ + struct db_config *conf = xmemdup(c, sizeof *c); + + conf->source = nullable_xstrdup(c->source); + if (c->options) { + conf->options = ovsdb_jsonrpc_options_clone(c->options); + } + conf->ab.sync_exclude = nullable_xstrdup(c->ab.sync_exclude); + + return conf; +} + +static struct ovsdb_jsonrpc_options * +get_jsonrpc_options(const char *target, enum service_model model) +{ + struct ovsdb_jsonrpc_options *options; + + options = ovsdb_jsonrpc_default_options(target); + if (model == SM_ACTIVE_BACKUP) { + options->probe_interval = REPLICATION_DEFAULT_PROBE_INTERVAL; + } else if (model == SM_RELAY) { + options->probe_interval = RELAY_SOURCE_DEFAULT_PROBE_INTERVAL; + } + + return options; +} + +static void +add_database_config(struct shash *db_conf, const char *opt, + const char *sync_from, const 
char *sync_exclude, + bool active) +{ + struct db_config *conf = xzalloc(sizeof *conf); + char *filename = NULL; + + if (parse_relay_args(opt, &filename, &conf->source)) { + conf->model = SM_RELAY; + conf->options = get_jsonrpc_options(conf->source, conf->model); + } else if (sync_from) { + conf->model = SM_ACTIVE_BACKUP; + conf->source = xstrdup(sync_from); + conf->options = get_jsonrpc_options(conf->source, conf->model); + conf->ab.sync_exclude = nullable_xstrdup(sync_exclude); + conf->ab.backup = !active; + filename = xstrdup(opt); + } else { + conf->model = SM_UNDEFINED; /* We'll update once the file is open. */ + filename = xstrdup(opt); + } + + conf = shash_replace_nocopy(db_conf, filename, conf); + if (conf) { + VLOG_WARN("Duplicate database configuration: %s", filename); + db_config_destroy(conf); + } +} + +static void +free_database_configs(struct shash *db_conf) +{ + struct shash_node *node; + + SHASH_FOR_EACH (node, db_conf) { + db_config_destroy(node->data); + } + shash_clear(db_conf); +} + int main(int argc, char *argv[]) { @@ -358,7 +510,6 @@ main(int argc, char *argv[]) char *run_command = NULL; struct unixctl_server *unixctl; struct ovsdb_jsonrpc_server *jsonrpc; - const char *db_filename; struct process *run_process; bool exiting; int retval; @@ -369,6 +520,7 @@ main(int argc, char *argv[]) int replication_probe_interval = REPLICATION_DEFAULT_PROBE_INTERVAL; int relay_source_probe_interval = RELAY_SOURCE_DEFAULT_PROBE_INTERVAL; struct sset db_filenames = SSET_INITIALIZER(&db_filenames); + struct shash db_conf = SHASH_INITIALIZER(&db_conf); struct shash remotes = SHASH_INITIALIZER(&remotes); char *sync_from = NULL, *sync_exclude = NULL; bool is_backup; @@ -381,7 +533,7 @@ main(int argc, char *argv[]) dns_resolve_init(true); bool active = false; - parse_options(argc, argv, &db_filenames, &remotes, &unixctl_path, + parse_options(argc, argv, &db_conf, &remotes, &unixctl_path, &run_command, &sync_from, &sync_exclude, &active); is_backup = sync_from && 
!active; @@ -400,13 +552,15 @@ main(int argc, char *argv[]) server_config.remotes = &remotes; server_config.config_tmpfile = config_tmpfile; - save_config__(config_tmpfile, &remotes, &db_filenames, sync_from, + save_config__(config_tmpfile, &remotes, &db_conf, sync_from, sync_exclude, is_backup); + free_remotes(&remotes); + free_database_configs(&db_conf); daemonize_start(false, false); /* Load the saved config. */ - load_config(config_tmpfile, &remotes, &db_filenames, &sync_from, + load_config(config_tmpfile, &remotes, &db_conf, &sync_from, &sync_exclude, &is_backup); /* Start ovsdb jsonrpc server. When running as a backup server, @@ -425,13 +579,16 @@ main(int argc, char *argv[]) perf_counters_init(); - SSET_FOR_EACH (db_filename, &db_filenames) { - struct ovsdb_error *error = open_db(&server_config, db_filename); + SHASH_FOR_EACH (node, &db_conf) { + struct ovsdb_error *error = open_db(&server_config, + node->name, node->data); if (error) { char *s = ovsdb_error_to_string_free(error); ovs_fatal(0, "%s", s); } + db_config_destroy(node->data); } + shash_clear(&db_conf); add_server_db(&server_config); char *error = reconfigure_remotes(jsonrpc, &all_dbs, &remotes); @@ -538,15 +695,8 @@ main(int argc, char *argv[]) unixctl_command_register("ovsdb-server/disable-monitor-cond", "", 0, 0, ovsdb_server_disable_monitor_cond, jsonrpc); - if (is_backup) { - const struct uuid *server_uuid; - server_uuid = ovsdb_jsonrpc_server_get_uuid(jsonrpc); - ovsdb_replication_init(sync_from, sync_exclude, &all_dbs, server_uuid, - replication_probe_interval); - } - main_loop(&server_config, jsonrpc, &all_dbs, unixctl, &remotes, - run_process, &exiting, &is_backup); + run_process, &exiting); SHASH_FOR_EACH_SAFE (node, &all_dbs) { struct db *db = node->data; @@ -557,7 +707,8 @@ main(int argc, char *argv[]) shash_destroy(&all_dbs); free_remotes(&remotes); shash_destroy(&remotes); - sset_destroy(&db_filenames); + free_database_configs(&db_conf); + shash_destroy(&db_conf); free(sync_from); 
free(sync_exclude); unixctl_server_destroy(unixctl); @@ -581,7 +732,7 @@ main(int argc, char *argv[]) * * "False negatives" are possible. */ static bool -is_already_open(struct server_config *config OVS_UNUSED, +is_already_open(struct server_config *server_config OVS_UNUSED, const char *filename OVS_UNUSED) { #ifndef _WIN32 @@ -590,11 +741,12 @@ is_already_open(struct server_config *config OVS_UNUSED, if (!stat(filename, &s)) { struct shash_node *node; - SHASH_FOR_EACH (node, config->all_dbs) { + SHASH_FOR_EACH (node, server_config->all_dbs) { struct db *db = node->data; struct stat s2; - if (!stat(db->filename, &s2) + if (db->config->model != SM_RELAY + && !stat(db->filename, &s2) && s.st_dev == s2.st_dev && s.st_ino == s2.st_ino) { return true; @@ -607,16 +759,19 @@ is_already_open(struct server_config *config OVS_UNUSED, } static void -close_db(struct server_config *config, struct db *db, char *comment) +close_db(struct server_config *server_config, struct db *db, char *comment) { if (db) { - ovsdb_jsonrpc_server_remove_db(config->jsonrpc, db->db, comment); - if (db->db->is_relay) { + ovsdb_jsonrpc_server_remove_db(server_config->jsonrpc, + db->db, comment); + if (db->config->model == SM_RELAY) { ovsdb_relay_del_db(db->db); } - if (*config->is_backup) { - replication_remove_db(db->db); + if (db->config->model == SM_ACTIVE_BACKUP + && db->config->ab.backup) { + ovsdb_server_replication_remove_db(db); } + db_config_destroy(db->config); ovsdb_destroy(db->db); free(db->filename); free(db); @@ -769,20 +924,17 @@ add_db(struct server_config *config, struct db *db) } static struct ovsdb_error * OVS_WARN_UNUSED_RESULT -open_db(struct server_config *config, const char *filename) +open_db(struct server_config *server_config, + const char *filename, const struct db_config *conf) { struct ovsdb_storage *storage; - char *relay_remotes = NULL; struct ovsdb_error *error; - bool is_relay; - char *name; - is_relay = parse_relay_args(filename, &name, &relay_remotes); - if 
(!is_relay) { + if (conf->model != SM_RELAY) { /* If we know that the file is already open, return a good error * message. Otherwise, if the file is open, we'll fail later on with * a harder to interpret file locking error. */ - if (is_already_open(config, filename)) { + if (is_already_open(server_config, filename)) { return ovsdb_error(NULL, "%s: already open", filename); } @@ -790,59 +942,78 @@ open_db(struct server_config *config, const char *filename) if (error) { return error; } - name = xstrdup(filename); } else { - storage = ovsdb_storage_create_unbacked(name); + storage = ovsdb_storage_create_unbacked(filename); + } + + enum service_model model = conf->model; + if (model == SM_UNDEFINED || model == SM_STANDALONE + || model == SM_CLUSTERED) { + /* Check the actual service model from the storage. */ + model = ovsdb_storage_is_clustered(storage) + ? SM_CLUSTERED : SM_STANDALONE; + } + if (conf->model != SM_UNDEFINED && conf->model != model) { + ovsdb_storage_close(storage); + return ovsdb_error(NULL, "%s: database is %s and not %s", + filename, service_model_to_string(model), + service_model_to_string(conf->model)); } struct ovsdb_schema *schema; - if (is_relay || ovsdb_storage_is_clustered(storage)) { + if (model == SM_RELAY || model == SM_CLUSTERED) { schema = NULL; } else { struct json *txn_json; error = ovsdb_storage_read(storage, &schema, &txn_json, NULL); if (error) { ovsdb_storage_close(storage); - free(name); return error; } ovs_assert(schema && !txn_json); } struct db *db = xzalloc(sizeof *db); - db->filename = name; + db->filename = xstrdup(filename); + db->config = db_config_clone(conf); + db->config->model = model; db->db = ovsdb_create(schema, storage); - ovsdb_jsonrpc_server_add_db(config->jsonrpc, db->db); + ovsdb_jsonrpc_server_add_db(server_config->jsonrpc, db->db); /* Enable txn history for clustered and relay modes. It is not enabled for * other modes for now, since txn id is available for clustered and relay * modes only. 
*/ - ovsdb_txn_history_init(db->db, - is_relay || ovsdb_storage_is_clustered(storage)); + ovsdb_txn_history_init(db->db, model == SM_RELAY || model == SM_CLUSTERED); - read_db(config, db); + read_db(server_config, db); error = (db->db->name[0] == '_' ? ovsdb_error(NULL, "%s: names beginning with \"_\" are reserved", db->db->name) - : shash_find(config->all_dbs, db->db->name) + : shash_find(server_config->all_dbs, db->db->name) ? ovsdb_error(NULL, "%s: duplicate database name", db->db->name) : NULL); if (error) { char *error_s = ovsdb_error_to_string(error); - close_db(config, db, + close_db(server_config, db, xasprintf("cannot complete opening %s database (%s)", db->db->name, error_s)); free(error_s); return error; } - add_db(config, db); + add_db(server_config, db); - if (is_relay) { - ovsdb_relay_add_db(db->db, relay_remotes, update_schema, config, - *config->relay_source_probe_interval); - free(relay_remotes); + if (model == SM_RELAY) { + ovsdb_relay_add_db(db->db, conf->source, update_schema, server_config, + conf->options->probe_interval); + } + if (model == SM_ACTIVE_BACKUP && conf->ab.backup) { + const struct uuid *server_uuid; + + server_uuid = ovsdb_jsonrpc_server_get_uuid(server_config->jsonrpc); + replication_set_db(db->db, conf->source, conf->ab.sync_exclude, + server_uuid, conf->options->probe_interval); } return NULL; } @@ -866,6 +1037,8 @@ add_server_db(struct server_config *config) /* We don't need txn_history for server_db. 
*/ db->filename = xstrdup(""); + db->config = xzalloc(sizeof *db->config); + db->config->model = SM_UNDEFINED; db->db = ovsdb_create(schema, ovsdb_storage_create_unbacked(NULL)); db->db->read_only = true; @@ -1031,9 +1204,10 @@ free_remotes(struct shash *remotes) SHASH_FOR_EACH (node, remotes) { struct ovsdb_jsonrpc_options *options = node->data; - free(options->role); + + ovsdb_jsonrpc_options_free(options); } - shash_clear_free_data(remotes); + shash_clear(remotes); } } @@ -1461,11 +1635,20 @@ ovsdb_server_set_active_ovsdb_server(struct unixctl_conn *conn, void *config_) { struct server_config *config = config_; + struct shash_node *node; - if (*config->sync_from) { - free(*config->sync_from); - } + free(*config->sync_from); *config->sync_from = xstrdup(argv[1]); + + SHASH_FOR_EACH (node, config->all_dbs) { + struct db *db = node->data; + + if (db->config->model == SM_ACTIVE_BACKUP) { + free(db->config->source); + db->config->source = xstrdup(argv[1]); + } + } + save_config(config); unixctl_command_reply(conn, NULL); @@ -1489,20 +1672,39 @@ ovsdb_server_connect_active_ovsdb_server(struct unixctl_conn *conn, void *config_) { struct server_config *config = config_; + struct shash_node *node; char *msg = NULL; - if ( !*config->sync_from) { + if (!*config->sync_from) { msg = "Unable to connect: active server is not specified.\n"; } else { const struct uuid *server_uuid; server_uuid = ovsdb_jsonrpc_server_get_uuid(config->jsonrpc); - ovsdb_replication_init(*config->sync_from, *config->sync_exclude, - config->all_dbs, server_uuid, - *config->replication_probe_interval); - if (!*config->is_backup) { - *config->is_backup = true; - save_config(config); + + SHASH_FOR_EACH (node, config->all_dbs) { + struct db *db = node->data; + struct db_config *conf = db->config; + + /* This command also converts standalone databases to AB. 
*/ + if (conf->model == SM_STANDALONE) { + conf->model = SM_ACTIVE_BACKUP; + conf->source = xstrdup(*config->sync_from); + conf->options = ovsdb_jsonrpc_default_options(conf->source); + conf->options->probe_interval = + *config->replication_probe_interval; + conf->ab.sync_exclude = + nullable_xstrdup(*config->sync_exclude); + conf->ab.backup = false; + } + + if (conf->model == SM_ACTIVE_BACKUP && !conf->ab.backup) { + replication_set_db(db->db, conf->source, conf->ab.sync_exclude, + server_uuid, conf->options->probe_interval); + conf->ab.backup = true; + } } + *config->is_backup = true; + save_config(config); } unixctl_command_reply(conn, msg); } @@ -1518,7 +1720,11 @@ ovsdb_server_disconnect_active_ovsdb_server(struct unixctl_conn *conn, SHASH_FOR_EACH (node, config->all_dbs) { struct db *db = node->data; - replication_remove_db(db->db); + struct db_config *conf = db->config; + + if (conf->model == SM_ACTIVE_BACKUP && conf->ab.backup) { + ovsdb_server_replication_remove_db(db); + } } *config->is_backup = false; save_config(config); @@ -1532,23 +1738,35 @@ ovsdb_server_set_active_ovsdb_server_probe_interval(struct unixctl_conn *conn, void *config_) { struct server_config *config = config_; - + struct shash_node *node; int probe_interval; - if (str_to_int(argv[1], 10, &probe_interval)) { - *config->replication_probe_interval = probe_interval; - save_config(config); - if (*config->is_backup) { - const struct uuid *server_uuid; - server_uuid = ovsdb_jsonrpc_server_get_uuid(config->jsonrpc); - ovsdb_replication_init(*config->sync_from, *config->sync_exclude, - config->all_dbs, server_uuid, - *config->replication_probe_interval); - } - unixctl_command_reply(conn, NULL); - } else { - unixctl_command_reply( + + if (!str_to_int(argv[1], 10, &probe_interval)) { + unixctl_command_reply_error( conn, "Invalid probe interval, integer value expected"); + return; + } + + const struct uuid *server_uuid; + server_uuid = ovsdb_jsonrpc_server_get_uuid(config->jsonrpc); + + 
*config->replication_probe_interval = probe_interval; + + SHASH_FOR_EACH (node, config->all_dbs) { + struct db *db = node->data; + struct db_config *conf = db->config; + + if (conf->model == SM_ACTIVE_BACKUP) { + conf->options->probe_interval = probe_interval; + if (conf->ab.backup) { + replication_set_db(db->db, conf->source, conf->ab.sync_exclude, + server_uuid, conf->options->probe_interval); + } + } } + + save_config(config); + unixctl_command_reply(conn, NULL); } static void @@ -1558,17 +1776,30 @@ ovsdb_server_set_relay_source_interval(struct unixctl_conn *conn, void *config_) { struct server_config *config = config_; + struct shash_node *node; int probe_interval; - if (str_to_int(argv[1], 10, &probe_interval)) { - *config->relay_source_probe_interval = probe_interval; - save_config(config); - ovsdb_relay_set_probe_interval(probe_interval); - unixctl_command_reply(conn, NULL); - } else { + if (!str_to_int(argv[1], 10, &probe_interval)) { unixctl_command_reply_error( conn, "Invalid probe interval, integer value expected"); + return; + } + + *config->relay_source_probe_interval = probe_interval; + + SHASH_FOR_EACH (node, config->all_dbs) { + struct db *db = node->data; + struct db_config *conf = db->config; + + if (conf->model == SM_RELAY) { + conf->options->probe_interval = probe_interval; + } } + + ovsdb_relay_set_probe_interval(probe_interval); + save_config(config); + + unixctl_command_reply(conn, NULL); } static void @@ -1578,20 +1809,36 @@ ovsdb_server_set_sync_exclude_tables(struct unixctl_conn *conn, void *config_) { struct server_config *config = config_; + struct shash_node *node; char *err = parse_excluded_tables(argv[1]); - if (!err) { - free(*config->sync_exclude); - *config->sync_exclude = xstrdup(argv[1]); - save_config(config); - if (*config->is_backup) { - const struct uuid *server_uuid; - server_uuid = ovsdb_jsonrpc_server_get_uuid(config->jsonrpc); - ovsdb_replication_init(*config->sync_from, *config->sync_exclude, - config->all_dbs, 
server_uuid, - *config->replication_probe_interval); + if (err) { + goto exit; + } + + const struct uuid *server_uuid; + server_uuid = ovsdb_jsonrpc_server_get_uuid(config->jsonrpc); + + free(*config->sync_exclude); + *config->sync_exclude = xstrdup(argv[1]); + + SHASH_FOR_EACH (node, config->all_dbs) { + struct db *db = node->data; + struct db_config *conf = db->config; + + if (conf->model == SM_ACTIVE_BACKUP) { + free(conf->ab.sync_exclude); + conf->ab.sync_exclude = xstrdup(argv[1]); + if (conf->ab.backup) { + replication_set_db(db->db, conf->source, conf->ab.sync_exclude, + server_uuid, conf->options->probe_interval); + } } } + + save_config(config); + +exit: unixctl_command_reply(conn, err); free(err); } @@ -1798,8 +2045,7 @@ ovsdb_server_remove_remote(struct unixctl_conn *conn, int argc OVS_UNUSED, options = shash_find_and_delete(config->remotes, argv[1]); if (options) { - free(options->role); - free(options); + ovsdb_jsonrpc_options_free(options); save_config(config); unixctl_command_reply(conn, NULL); } else { @@ -1836,22 +2082,26 @@ ovsdb_server_add_database(struct unixctl_conn *conn, int argc OVS_UNUSED, { struct server_config *config = config_; const char *filename = argv[1]; + const struct shash_node *node; + struct shash db_conf; + + shash_init(&db_conf); + add_database_config(&db_conf, filename, *config->sync_from, + *config->sync_exclude, !config->is_backup); + ovs_assert(shash_count(&db_conf) == 1); + node = shash_first(&db_conf); - char *error = ovsdb_error_to_string_free(open_db(config, filename)); + char *error = ovsdb_error_to_string_free(open_db(config, + node->name, node->data)); if (!error) { save_config(config); - if (*config->is_backup) { - const struct uuid *server_uuid; - server_uuid = ovsdb_jsonrpc_server_get_uuid(config->jsonrpc); - ovsdb_replication_init(*config->sync_from, *config->sync_exclude, - config->all_dbs, server_uuid, - *config->replication_probe_interval); - } unixctl_command_reply(conn, NULL); } else { 
unixctl_command_reply_error(conn, error); free(error); } + db_config_destroy(node->data); + shash_destroy(&db_conf); } static void @@ -1998,23 +2248,34 @@ ovsdb_server_get_sync_status(struct unixctl_conn *conn, int argc OVS_UNUSED, const char *argv[] OVS_UNUSED, void *config_) { struct server_config *config = config_; - bool is_backup = *config->is_backup; struct ds ds = DS_EMPTY_INITIALIZER; + bool any_backup = false; - ds_put_format(&ds, "state: %s\n", is_backup ? "backup" : "active"); + const struct shash_node **db_nodes = shash_sort(config->all_dbs); - if (is_backup) { - const struct shash_node **db_nodes = shash_sort(config->all_dbs); + for (size_t i = 0; i < shash_count(config->all_dbs); i++) { + const struct db *db = db_nodes[i]->data; - for (size_t i = 0; i < shash_count(config->all_dbs); i++) { - const struct db *db = db_nodes[i]->data; + if (db->config->model != SM_ACTIVE_BACKUP) { + continue; + } - if (db->db && db->db->name[0] != '_') { - ds_put_and_free_cstr(&ds, replication_status(db->db)); - ds_put_char(&ds, '\n'); - } + any_backup = true; + + ds_put_format(&ds, "database: %s\n", db->db->name); + ds_put_format(&ds, "state: %s\n", + db->config->ab.backup ? 
"backup" : "active"); + if (db->config->ab.backup) { + ds_put_and_free_cstr(&ds, replication_status(db->db)); } - free(db_nodes); + if (i + 1 < shash_count(config->all_dbs)) { + ds_put_char(&ds, '\n'); + } + } + free(db_nodes); + + if (!any_backup) { + ds_put_cstr(&ds, "state: active\n"); } unixctl_command_reply(conn, ds_cstr(&ds)); @@ -2058,7 +2319,7 @@ ovsdb_server_get_db_storage_status(struct unixctl_conn *conn, static void parse_options(int argc, char *argv[], - struct sset *db_filenames, struct shash *remotes, + struct shash *db_conf, struct shash *remotes, char **unixctl_pathp, char **run_command, char **sync_from, char **sync_exclude, bool *active) { @@ -2108,7 +2369,7 @@ parse_options(int argc, char *argv[], *sync_from = NULL; *sync_exclude = NULL; - sset_init(db_filenames); + shash_init(db_conf); shash_init(remotes); for (;;) { int c; @@ -2214,10 +2475,15 @@ parse_options(int argc, char *argv[], argv += optind; if (argc > 0) { for (int i = 0; i < argc; i++) { - sset_add(db_filenames, argv[i]); + add_database_config(db_conf, argv[i], *sync_from, *sync_exclude, + *active); } } else if (add_default_db) { - sset_add_and_free(db_filenames, xasprintf("%s/conf.db", ovs_dbdir())); + char *filename = xasprintf("%s/conf.db", ovs_dbdir()); + + add_database_config(db_conf, filename, *sync_from, *sync_exclude, + *active); + free(filename); } } @@ -2268,16 +2534,63 @@ remotes_to_json(const struct shash *remotes) json = json_object_create(); SHASH_FOR_EACH (node, remotes) { json_object_put(json, node->name, - ovsdb_jsonrpc_options_to_json(node->data)); + ovsdb_jsonrpc_options_to_json(node->data, false)); + } + return json; +} + +static struct json * +db_config_to_json(const struct db_config *conf) +{ + struct json *json; + + json = json_object_create(); + + if (conf->model != SM_UNDEFINED) { + json_object_put(json, "service-model", + json_string_create( + service_model_to_string(conf->model))); + } + + if (conf->source) { + struct json *source = json_object_create(); + + 
json_object_put(source, conf->source, + ovsdb_jsonrpc_options_to_json(conf->options, true)); + json_object_put(json, "source", source); + } + + if (conf->model == SM_ACTIVE_BACKUP) { + if (conf->ab.sync_exclude) { + struct sset set = SSET_INITIALIZER(&set); + + sset_from_delimited_string(&set, conf->ab.sync_exclude, " ,"); + json_object_put(json, "exclude-tables", sset_to_json(&set)); + sset_destroy(&set); + } + json_object_put(json, "backup", json_boolean_create(conf->ab.backup)); + } + return json; +} + +static struct json * +databases_to_json(const struct shash *db_conf) +{ + const struct shash_node *node; + struct json *json; + + json = json_object_create(); + SHASH_FOR_EACH (node, db_conf) { + json_object_put(json, node->name, db_config_to_json(node->data)); } return json; } /* Truncates and replaces the contents of 'config_file' by a representation of - * 'remotes' and 'db_filenames'. */ + * 'remotes', 'db_conf' and a few global replication parameters. */ static void save_config__(FILE *config_file, const struct shash *remotes, - const struct sset *db_filenames, const char *sync_from, + const struct shash *db_conf, const char *sync_from, const char *sync_exclude, bool is_backup) { struct json *obj; @@ -2290,7 +2603,8 @@ save_config__(FILE *config_file, const struct shash *remotes, obj = json_object_create(); json_object_put(obj, "remotes", remotes_to_json(remotes)); - json_object_put(obj, "db_filenames", sset_to_json(db_filenames)); + json_object_put(obj, "databases", databases_to_json(db_conf)); + if (sync_from) { json_object_put(obj, "sync_from", json_string_create(sync_from)); } @@ -2316,56 +2630,147 @@ save_config__(FILE *config_file, const struct shash *remotes, static void save_config(struct server_config *config) { - struct sset db_filenames; struct shash_node *node; + struct shash db_conf; - sset_init(&db_filenames); + shash_init(&db_conf); SHASH_FOR_EACH (node, config->all_dbs) { struct db *db = node->data; + if (node->name[0] != '_') { - 
sset_add(&db_filenames, db->filename); + shash_add(&db_conf, db->filename, db->config); } } - save_config__(config->config_tmpfile, config->remotes, &db_filenames, + save_config__(config->config_tmpfile, config->remotes, &db_conf, *config->sync_from, *config->sync_exclude, *config->is_backup); - sset_destroy(&db_filenames); + shash_destroy(&db_conf); } static void -sset_from_json(struct sset *sset, const struct json *array) +remotes_from_json(struct shash *remotes, const struct json *json) { - size_t i; + struct ovsdb_jsonrpc_options *options; + const struct shash_node *node; + const struct shash *object; - sset_clear(sset); + free_remotes(remotes); - ovs_assert(array); - ovs_assert(array->type == JSON_ARRAY); - for (i = 0; i < array->array.n; i++) { - const struct json *elem = array->array.elems[i]; - sset_add(sset, json_string(elem)); + ovs_assert(json); + ovs_assert(json->type == JSON_OBJECT); + + object = json_object(json); + SHASH_FOR_EACH (node, object) { + options = ovsdb_jsonrpc_default_options(node->name); + ovsdb_jsonrpc_options_update_from_json(options, node->data, false); + shash_add(remotes, node->name, options); } } +static struct db_config * +db_config_from_json(const char *name, const struct json *json) +{ + const struct json *model, *source, *sync_exclude, *backup; + struct db_config *conf = xzalloc(sizeof *conf); + struct ovsdb_parser parser; + struct ovsdb_error *error; + + ovsdb_parser_init(&parser, json, "database %s", name); + + model = ovsdb_parser_member(&parser, "service-model", + OP_STRING | OP_OPTIONAL); + conf->model = model ? service_model_from_string(json_string(model)) + : SM_UNDEFINED; + + if (conf->model == SM_ACTIVE_BACKUP) { + backup = ovsdb_parser_member(&parser, "backup", OP_BOOLEAN); + conf->ab.backup = backup ? 
json_boolean(backup) : false; + + sync_exclude = ovsdb_parser_member(&parser, "exclude-tables", + OP_ARRAY | OP_OPTIONAL); + if (sync_exclude) { + const struct json_array *exclude = json_array(sync_exclude); + struct sset set = SSET_INITIALIZER(&set); + + for (size_t i = 0; i < exclude->n; i++) { + if (exclude->elems[i]->type != JSON_STRING) { + ovsdb_parser_raise_error(&parser, + "'exclude-tables' must contain strings"); + break; + } + sset_add(&set, json_string(exclude->elems[i])); + } + conf->ab.sync_exclude = sset_join(&set, ",", ""); + sset_destroy(&set); + } + } + + if (conf->model == SM_ACTIVE_BACKUP || conf->model == SM_RELAY) { + enum ovsdb_parser_types type = OP_OBJECT; + + if (conf->model == SM_ACTIVE_BACKUP && !conf->ab.backup) { + /* Active database doesn't have to have a source. */ + type |= OP_OPTIONAL; + } + source = ovsdb_parser_member(&parser, "source", type); + + if (source && shash_count(json_object(source)) != 1) { + ovsdb_parser_raise_error(&parser, + "'source' should be an object with exactly one element"); + } else if (source) { + const struct shash_node *node = shash_first(json_object(source)); + const struct json *options; + + ovs_assert(node); + conf->source = xstrdup(node->name); + options = node->data; + + conf->options = get_jsonrpc_options(conf->source, conf->model); + + if (options->type == JSON_OBJECT) { + ovsdb_jsonrpc_options_update_from_json(conf->options, + options, true); + } else if (options->type != JSON_NULL) { + ovsdb_parser_raise_error(&parser, + "JSON-RPC options is not a JSON object or null"); + } + } + } + + error = ovsdb_parser_finish(&parser); + if (error) { + char *s = ovsdb_error_to_string_free(error); + + VLOG_WARN("%s", s); + free(s); + db_config_destroy(conf); + return NULL; + } + + return conf; +} + + static void -remotes_from_json(struct shash *remotes, const struct json *json) +databases_from_json(struct shash *db_conf, const struct json *json) { - struct ovsdb_jsonrpc_options *options; const struct shash_node 
*node; const struct shash *object; - free_remotes(remotes); + free_database_configs(db_conf); ovs_assert(json); ovs_assert(json->type == JSON_OBJECT); object = json_object(json); SHASH_FOR_EACH (node, object) { - options = ovsdb_jsonrpc_default_options(node->name); - ovsdb_jsonrpc_options_update_from_json(options, node->data); - shash_add(remotes, node->name, options); + struct db_config *conf = db_config_from_json(node->name, node->data); + + if (conf) { + shash_add(db_conf, node->name, conf); + } } } @@ -2373,7 +2778,7 @@ remotes_from_json(struct shash *remotes, const struct json *json) * 'config_file', which must have been previously written by save_config(). */ static void load_config(FILE *config_file, struct shash *remotes, - struct sset *db_filenames, char **sync_from, + struct shash *db_conf, char **sync_from, char **sync_exclude, bool *is_backup) { struct json *json; @@ -2388,8 +2793,8 @@ load_config(FILE *config_file, struct shash *remotes, ovs_assert(json->type == JSON_OBJECT); remotes_from_json(remotes, shash_find_data(json_object(json), "remotes")); - sset_from_json(db_filenames, - shash_find_data(json_object(json), "db_filenames")); + databases_from_json(db_conf, + shash_find_data(json_object(json), "databases")); struct json *string; string = shash_find_data(json_object(json), "sync_from"); diff --git a/ovsdb/replication.c b/ovsdb/replication.c index b166a56622d..8f3750eddae 100644 --- a/ovsdb/replication.c +++ b/ovsdb/replication.c @@ -779,7 +779,6 @@ replication_status(const struct ovsdb *db) bool alive = rdb->session && jsonrpc_session_is_alive(rdb->session); struct ds ds = DS_EMPTY_INITIALIZER; - ds_put_format(&ds, "database: %s\n", db->name); if (alive) { switch (rdb->state) { case RPL_S_INIT: diff --git a/tests/ovsdb-server.at b/tests/ovsdb-server.at index 6eb758e2293..45aa80cd676 100644 --- a/tests/ovsdb-server.at +++ b/tests/ovsdb-server.at @@ -1988,7 +1988,9 @@ OVS_WAIT_UNTIL([ovs-appctl -t "`pwd`"/unixctl2 ovsdb-server/sync-status |grep re 
dnl Switch the 'db1' to active AT_CHECK([ovs-appctl -t "`pwd`"/unixctl ovsdb-server/disconnect-active-ovsdb-server]) -AT_CHECK([ovs-appctl -t "`pwd`"/unixctl ovsdb-server/sync-status], [0], [state: active +AT_CHECK([ovs-appctl -t "`pwd`"/unixctl ovsdb-server/sync-status], [0], [dnl +database: mydb +state: active ]) dnl Issue a transaction to 'db1' @@ -2007,7 +2009,9 @@ AT_CHECK([ovs-appctl -t "`pwd`"/unixctl ovsdb-server/connect-active-ovsdb-server dnl Verify the change happend OVS_WAIT_UNTIL([ovs-appctl -t "`pwd`"/unixctl ovsdb-server/sync-status |grep replicating]) -AT_CHECK([ovs-appctl -t "`pwd`"/unixctl2 ovsdb-server/sync-status], [0], [state: active +AT_CHECK([ovs-appctl -t "`pwd`"/unixctl2 ovsdb-server/sync-status], [0], [dnl +database: mydb +state: active ]) dnl Issue an transaction to 'db2' which is now active. From 8c8a6f793fb25d083a4fe277eaec49fc0a306089 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 9 Jan 2024 23:49:08 +0100 Subject: [PATCH 521/833] ovsdb-server: Add no-op config-file option. Adding a --config-file option that will be used in the future to allow users to provide the database server configuration via a JSON file. For now, it does nothing useful, but we define it as mutually exclusive with all the command line options and UnixCtl commands that configure values that will be available via a config file. This will ensure that we don't have too many ways of configuring the same thing at the same time. New appctl command 'ovsdb-server/reload' is going to signal OVSDB server that it needs to re-read the configuration file. While at it, adding a missing 'usage' line for '--no-dbs'. This option is rarely used, so it doesn't seem to be worth a separate fix. 
Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- ovsdb/ovsdb-server.c | 110 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 109 insertions(+), 1 deletion(-) diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c index ab32ceb8ad0..70e3cf4cac1 100644 --- a/ovsdb/ovsdb-server.c +++ b/ovsdb/ovsdb-server.c @@ -95,6 +95,13 @@ static unixctl_cb_func ovsdb_server_get_sync_exclude_tables; static unixctl_cb_func ovsdb_server_get_sync_status; static unixctl_cb_func ovsdb_server_get_db_storage_status; +/* Holds the name of the configuration file passed via --config-file. + * Mutually exclusive with command-line and unixctl configuration + * that can otherwise be done via configuration file. */ +static char *config_file_path; +/* UnixCtl command to reload configuration from a configuration file. */ +static unixctl_cb_func ovsdb_server_reload; + #define SERVICE_MODELS \ SERVICE_MODEL(UNDEFINED, undefined) \ SERVICE_MODEL(STANDALONE, standalone) \ @@ -637,6 +644,8 @@ main(int argc, char *argv[]) ovsdb_server_memory_trim_on_compaction, NULL); unixctl_command_register("ovsdb-server/reconnect", "", 0, 0, ovsdb_server_reconnect, jsonrpc); + unixctl_command_register("ovsdb-server/reload", "", 0, 0, + ovsdb_server_reload, &server_config); unixctl_command_register("ovsdb-server/add-remote", "REMOTE", 1, 1, ovsdb_server_add_remote, &server_config); @@ -713,6 +722,7 @@ main(int argc, char *argv[]) free(sync_exclude); unixctl_server_destroy(unixctl); replication_destroy(); + free(config_file_path); if (run_process && process_exited(run_process)) { int status = process_status(run_process); @@ -1629,6 +1639,23 @@ report_error_if_changed(char *error, char **last_errorp) } } +static bool +check_config_file_on_unixctl(struct unixctl_conn *conn) +{ + struct ds ds = DS_EMPTY_INITIALIZER; + + if (!config_file_path) { + return false; + } + + ds_put_format(&ds, "Update the %s and use ovsdb-server/reload instead", + config_file_path); + unixctl_command_reply_error(conn, 
ds_cstr(&ds)); + ds_destroy(&ds); + + return true; +} + static void ovsdb_server_set_active_ovsdb_server(struct unixctl_conn *conn, int argc OVS_UNUSED, const char *argv[], @@ -1637,6 +1664,10 @@ ovsdb_server_set_active_ovsdb_server(struct unixctl_conn *conn, struct server_config *config = config_; struct shash_node *node; + if (check_config_file_on_unixctl(conn)) { + return; + } + free(*config->sync_from); *config->sync_from = xstrdup(argv[1]); @@ -1675,6 +1706,10 @@ ovsdb_server_connect_active_ovsdb_server(struct unixctl_conn *conn, struct shash_node *node; char *msg = NULL; + if (check_config_file_on_unixctl(conn)) { + return; + } + if (!*config->sync_from) { msg = "Unable to connect: active server is not specified.\n"; } else { @@ -1718,6 +1753,10 @@ ovsdb_server_disconnect_active_ovsdb_server(struct unixctl_conn *conn, struct server_config *config = config_; struct shash_node *node; + if (check_config_file_on_unixctl(conn)) { + return; + } + SHASH_FOR_EACH (node, config->all_dbs) { struct db *db = node->data; struct db_config *conf = db->config; @@ -1741,6 +1780,10 @@ ovsdb_server_set_active_ovsdb_server_probe_interval(struct unixctl_conn *conn, struct shash_node *node; int probe_interval; + if (check_config_file_on_unixctl(conn)) { + return; + } + if (!str_to_int(argv[1], 10, &probe_interval)) { unixctl_command_reply_error( conn, "Invalid probe interval, integer value expected"); @@ -1779,6 +1822,10 @@ ovsdb_server_set_relay_source_interval(struct unixctl_conn *conn, struct shash_node *node; int probe_interval; + if (check_config_file_on_unixctl(conn)) { + return; + } + if (!str_to_int(argv[1], 10, &probe_interval)) { unixctl_command_reply_error( conn, "Invalid probe interval, integer value expected"); @@ -1811,6 +1858,10 @@ ovsdb_server_set_sync_exclude_tables(struct unixctl_conn *conn, struct server_config *config = config_; struct shash_node *node; + if (check_config_file_on_unixctl(conn)) { + return; + } + char *err = parse_excluded_tables(argv[1]); if 
(err) { goto exit; @@ -2005,6 +2056,21 @@ ovsdb_server_reconnect(struct unixctl_conn *conn, int argc OVS_UNUSED, unixctl_command_reply(conn, NULL); } +/* "ovsdb-server/reload": makes ovsdb-server open a configuration file on + * 'config_file_path', read it and sync the runtime configuration with it. */ +static void +ovsdb_server_reload(struct unixctl_conn *conn, int argc OVS_UNUSED, + const char *argv[] OVS_UNUSED, void *config_ OVS_UNUSED) +{ + if (!config_file_path) { + unixctl_command_reply_error(conn, + "Configuration file was not specified on command line"); + } else { + unixctl_command_reply_error(conn, + "Configuration file support is not implemented yet"); + } +} + /* "ovsdb-server/add-remote REMOTE": adds REMOTE to the set of remotes that * ovsdb-server services. */ static void @@ -2019,6 +2085,10 @@ ovsdb_server_add_remote(struct unixctl_conn *conn, int argc OVS_UNUSED, const struct db *db; char *retval; + if (check_config_file_on_unixctl(conn)) { + return; + } + retval = (strncmp("db:", remote, 3) ? 
NULL : parse_db_column(config->all_dbs, remote, @@ -2043,6 +2113,10 @@ ovsdb_server_remove_remote(struct unixctl_conn *conn, int argc OVS_UNUSED, struct server_config *config = config_; struct ovsdb_jsonrpc_options *options; + if (check_config_file_on_unixctl(conn)) { + return; + } + options = shash_find_and_delete(config->remotes, argv[1]); if (options) { ovsdb_jsonrpc_options_free(options); @@ -2085,6 +2159,10 @@ ovsdb_server_add_database(struct unixctl_conn *conn, int argc OVS_UNUSED, const struct shash_node *node; struct shash db_conf; + if (check_config_file_on_unixctl(conn)) { + return; + } + shash_init(&db_conf); add_database_config(&db_conf, filename, *config->sync_from, *config->sync_exclude, !config->is_backup); @@ -2122,6 +2200,10 @@ ovsdb_server_remove_database(struct unixctl_conn *conn, int argc OVS_UNUSED, struct server_config *config = config_; struct shash_node *node; + if (check_config_file_on_unixctl(conn)) { + return; + } + node = shash_find(config->all_dbs, argv[1]); if (!node) { unixctl_command_reply_error(conn, "Failed to find the database."); @@ -2335,6 +2417,7 @@ parse_options(int argc, char *argv[], OPT_NO_DBS, OPT_FILE_COLUMN_DIFF, OPT_FILE_NO_DATA_CONVERSION, + OPT_CONFIG_FILE, VLOG_OPTION_ENUMS, DAEMON_OPTION_ENUMS, SSL_OPTION_ENUMS, @@ -2362,6 +2445,7 @@ parse_options(int argc, char *argv[], {"disable-file-column-diff", no_argument, NULL, OPT_FILE_COLUMN_DIFF}, {"disable-file-no-data-conversion", no_argument, NULL, OPT_FILE_NO_DATA_CONVERSION}, + {"config-file", required_argument, NULL, OPT_CONFIG_FILE}, {NULL, 0, NULL, 0}, }; char *short_options = ovs_cmdl_long_options_to_short_options(long_options); @@ -2462,6 +2546,11 @@ parse_options(int argc, char *argv[], ovsdb_no_data_conversion_disable(); break; + case OPT_CONFIG_FILE: + config_file_path = abs_file_name(ovs_dbdir(), optarg); + add_default_db = false; + break; + case '?': exit(EXIT_FAILURE); @@ -2473,7 +2562,19 @@ parse_options(int argc, char *argv[], argc -= optind; argv += 
optind; - if (argc > 0) { + + if (config_file_path) { + if (*sync_from || *sync_exclude || *active) { + ovs_fatal(0, "--config-file is mutually exclusive with " + "--sync-from, --sync-exclude and --active"); + } + if (shash_count(remotes)) { + ovs_fatal(0, "--config-file is mutually exclusive with --remote"); + } + if (argc > 0) { + ovs_fatal(0, "Databases should be specified in a config file"); + } + } else if (argc > 0) { for (int i = 0; i < argc; i++) { add_database_config(db_conf, argv[i], *sync_from, *sync_exclude, *active); @@ -2498,6 +2599,12 @@ usage(void) printf("\nJSON-RPC options (may be specified any number of times):\n" " --remote=REMOTE connect or listen to REMOTE\n"); stream_usage("JSON-RPC", true, true, true); + printf("\nConfiguration file:\n" + " --config-file PATH Use configuration file as a source of\n" + " database and JSON-RPC configuration.\n" + " Mutually exclusive with the DATABASE,\n" + " JSON-RPC and Syncing options.\n" + " Assumes --no-dbs.\n"); daemon_usage(); vlog_usage(); replication_usage(); @@ -2505,6 +2612,7 @@ usage(void) printf("\nOther options:\n" " --run COMMAND run COMMAND as subprocess then exit\n" " --unixctl=SOCKET override default control socket name\n" + " --no-dbs do not add default database\n" " --disable-file-column-diff\n" " don't use column diff in database file\n" " -h, --help display this help message\n" From 37ab57b41c0b5e60b67e126a3114345a3da9eac2 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 9 Jan 2024 23:49:09 +0100 Subject: [PATCH 522/833] jsonrpc-server: Re-add remotes on role changes. It is currently not possible for the role to change in runtime (unless a manual DB transaction is crafted), but it will be with addition of a config file. If the role changes, listening socket will be closed, and all the connections to this remote will be terminated. 
Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- ovsdb/jsonrpc-server.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ovsdb/jsonrpc-server.c b/ovsdb/jsonrpc-server.c index da1fbd2502b..1151a60ad04 100644 --- a/ovsdb/jsonrpc-server.c +++ b/ovsdb/jsonrpc-server.c @@ -339,7 +339,8 @@ ovsdb_jsonrpc_server_set_remotes(struct ovsdb_jsonrpc_server *svr, if (!options) { VLOG_INFO("%s: remote deconfigured", node->name); ovsdb_jsonrpc_server_del_remote(node); - } else if (options->dscp != remote->dscp) { + } else if (options->dscp != remote->dscp + || !nullable_string_is_equal(options->role, remote->role)) { ovsdb_jsonrpc_server_del_remote(node); } } From 40ce846e8699e90f433964f45bb749c2ddfc1454 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 9 Jan 2024 23:49:10 +0100 Subject: [PATCH 523/833] jsonrpc: Add function to update all options at once. It's useful to have a way to update all the JSON-RPC session options all at once and not call 3 separate functions every time. This may also allow the internals of these options to be better abstracted, i.e. allow users to not know what are these options exactly. Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- lib/jsonrpc.c | 9 +++++++++ lib/jsonrpc.h | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/lib/jsonrpc.c b/lib/jsonrpc.c index 3db5f76e280..f1ef709502c 100644 --- a/lib/jsonrpc.c +++ b/lib/jsonrpc.c @@ -1337,6 +1337,15 @@ jsonrpc_session_set_dscp(struct jsonrpc_session *s, uint8_t dscp) } } +void +jsonrpc_session_set_options(struct jsonrpc_session *s, + const struct jsonrpc_session_options *options) +{ + jsonrpc_session_set_max_backoff(s, options->max_backoff); + jsonrpc_session_set_probe_interval(s, options->probe_interval); + jsonrpc_session_set_dscp(s, options->dscp); +} + /* Sets thresholds for send backlog. 
If send backlog contains more than * 'max_n_msgs' messages or is larger than 'max_backlog_bytes' bytes, * connection will be closed (then reconnected, if that feature is enabled). */ diff --git a/lib/jsonrpc.h b/lib/jsonrpc.h index 2aa97d3fe6d..1baffcd8071 100644 --- a/lib/jsonrpc.h +++ b/lib/jsonrpc.h @@ -139,6 +139,14 @@ void jsonrpc_session_enable_reconnect(struct jsonrpc_session *); void jsonrpc_session_force_reconnect(struct jsonrpc_session *); void jsonrpc_session_reset_backoff(struct jsonrpc_session *); +struct jsonrpc_session_options { + int max_backoff; /* Maximum reconnection backoff, in msec. */ + int probe_interval; /* Max idle time before probing, in msec. */ + uint8_t dscp; /* Dscp value for passive connections. */ +}; + +void jsonrpc_session_set_options(struct jsonrpc_session *, + const struct jsonrpc_session_options *); void jsonrpc_session_set_max_backoff(struct jsonrpc_session *, int max_backoff); void jsonrpc_session_set_probe_interval(struct jsonrpc_session *, From 9a1b79c154310f28aec6e39d63331f74b08385c5 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 9 Jan 2024 23:49:11 +0100 Subject: [PATCH 524/833] ovsdb: Embed jsonrpc session options into ovsdb jsonrpc options. Just introduced structure 'jsonrpc_session_options' is the same as part of the 'ovsdb_jsonrpc_options'. In fact, these options do really belong to a lower layer. So, replace a copy of these fields with a structure, so it can be easily passed to jsonrpc's 'jsonrpc_session_set_options()'. Not creating separate JSON parsing/formatting functions to avoid creating an extra nesting level for the users who will write the JSON definition in a configuration file. I.e. keeping the JSON object flat. Also, not changing the 'db_config->options' to be 'jsonrpc_session_options', even though we don't need the 'role' or 'read-only' fields. This allows us to use the same JSON parsing function for both the remotes and database sources. 
Can be changed in the future, but for now keeping as is to avoid extra code complication. Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- ovsdb/jsonrpc-server.c | 40 ++++++++++++++++------------------------ ovsdb/jsonrpc-server.h | 5 ++--- ovsdb/ovsdb-server.c | 31 +++++++++++++++++-------------- 3 files changed, 35 insertions(+), 41 deletions(-) diff --git a/ovsdb/jsonrpc-server.c b/ovsdb/jsonrpc-server.c index 1151a60ad04..817997677bd 100644 --- a/ovsdb/jsonrpc-server.c +++ b/ovsdb/jsonrpc-server.c @@ -211,11 +211,12 @@ struct ovsdb_jsonrpc_options * ovsdb_jsonrpc_default_options(const char *target) { struct ovsdb_jsonrpc_options *options = xzalloc(sizeof *options); - options->max_backoff = RECONNECT_DEFAULT_MAX_BACKOFF; - options->probe_interval = (stream_or_pstream_needs_probes(target) - ? RECONNECT_DEFAULT_PROBE_INTERVAL - : 0); - options->dscp = DSCP_DEFAULT; + struct jsonrpc_session_options *rpc_opt = &options->rpc; + + rpc_opt->max_backoff = RECONNECT_DEFAULT_MAX_BACKOFF; + rpc_opt->probe_interval = (stream_or_pstream_needs_probes(target) + ? RECONNECT_DEFAULT_PROBE_INTERVAL : 0); + rpc_opt->dscp = DSCP_DEFAULT; return options; } @@ -246,10 +247,10 @@ ovsdb_jsonrpc_options_to_json(const struct ovsdb_jsonrpc_options *options, struct json *json = json_object_create(); json_object_put(json, "max-backoff", - json_integer_create(options->max_backoff)); + json_integer_create(options->rpc.max_backoff)); json_object_put(json, "inactivity-probe", - json_integer_create(options->probe_interval)); - json_object_put(json, "dscp", json_integer_create(options->dscp)); + json_integer_create(options->rpc.probe_interval)); + json_object_put(json, "dscp", json_integer_create(options->rpc.dscp)); if (jsonrpc_session_only) { /* Caller is not interested in OVSDB-specific options. 
*/ @@ -279,18 +280,18 @@ ovsdb_jsonrpc_options_update_from_json(struct ovsdb_jsonrpc_options *options, max_backoff = ovsdb_parser_member(&parser, "max-backoff", OP_INTEGER | OP_OPTIONAL); if (max_backoff) { - options->max_backoff = json_integer(max_backoff); + options->rpc.max_backoff = json_integer(max_backoff); } probe_interval = ovsdb_parser_member(&parser, "inactivity-probe", OP_INTEGER | OP_OPTIONAL); if (probe_interval) { - options->probe_interval = json_integer(probe_interval); + options->rpc.probe_interval = json_integer(probe_interval); } dscp = ovsdb_parser_member(&parser, "dscp", OP_INTEGER | OP_OPTIONAL); if (dscp) { - options->dscp = json_integer(dscp); + options->rpc.dscp = json_integer(dscp); } if (jsonrpc_session_only) { @@ -339,7 +340,7 @@ ovsdb_jsonrpc_server_set_remotes(struct ovsdb_jsonrpc_server *svr, if (!options) { VLOG_INFO("%s: remote deconfigured", node->name); ovsdb_jsonrpc_server_del_remote(node); - } else if (options->dscp != remote->dscp + } else if (options->rpc.dscp != remote->dscp || !nullable_string_is_equal(options->role, remote->role)) { ovsdb_jsonrpc_server_del_remote(node); } @@ -369,7 +370,7 @@ ovsdb_jsonrpc_server_add_remote(struct ovsdb_jsonrpc_server *svr, struct pstream *listener; int error; - error = jsonrpc_pstream_open(name, &listener, options->dscp); + error = jsonrpc_pstream_open(name, &listener, options->rpc.dscp); switch (error) { case 0: case EAFNOSUPPORT: @@ -377,7 +378,7 @@ ovsdb_jsonrpc_server_add_remote(struct ovsdb_jsonrpc_server *svr, remote->server = svr; remote->listener = listener; ovs_list_init(&remote->sessions); - remote->dscp = options->dscp; + remote->dscp = options->rpc.dscp; remote->read_only = options->read_only; remote->role = nullable_xstrdup(options->role); shash_add(&svr->remotes, name, remote); @@ -687,15 +688,6 @@ ovsdb_jsonrpc_session_run(struct ovsdb_jsonrpc_session *s) return jsonrpc_session_is_alive(s->js) ? 
0 : ETIMEDOUT; } -static void -ovsdb_jsonrpc_session_set_options(struct ovsdb_jsonrpc_session *session, - const struct ovsdb_jsonrpc_options *options) -{ - jsonrpc_session_set_max_backoff(session->js, options->max_backoff); - jsonrpc_session_set_probe_interval(session->js, options->probe_interval); - jsonrpc_session_set_dscp(session->js, options->dscp); -} - static void ovsdb_jsonrpc_session_run_all(struct ovsdb_jsonrpc_remote *remote) { @@ -814,7 +806,7 @@ ovsdb_jsonrpc_session_set_all_options( struct ovsdb_jsonrpc_session *s; LIST_FOR_EACH (s, node, &remote->sessions) { - ovsdb_jsonrpc_session_set_options(s, options); + jsonrpc_session_set_options(s->js, &options->rpc); } } diff --git a/ovsdb/jsonrpc-server.h b/ovsdb/jsonrpc-server.h index 9fb2baa54aa..d613cb7c70e 100644 --- a/ovsdb/jsonrpc-server.h +++ b/ovsdb/jsonrpc-server.h @@ -18,6 +18,7 @@ #include #include "openvswitch/types.h" +#include "jsonrpc.h" struct ovsdb; struct shash; @@ -33,10 +34,8 @@ void ovsdb_jsonrpc_server_destroy(struct ovsdb_jsonrpc_server *); /* Options for a remote. */ struct ovsdb_jsonrpc_options { - int max_backoff; /* Maximum reconnection backoff, in msec. */ - int probe_interval; /* Max idle time before probing, in msec. */ + struct jsonrpc_session_options rpc; /* JSON-RPC options. */ bool read_only; /* Only read-only transactions are allowed. 
*/ - int dscp; /* Dscp value for manager connections */ char *role; /* Role, for role-based access controls */ }; struct ovsdb_jsonrpc_options *ovsdb_jsonrpc_default_options( diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c index 70e3cf4cac1..ff3c885d5cf 100644 --- a/ovsdb/ovsdb-server.c +++ b/ovsdb/ovsdb-server.c @@ -461,9 +461,9 @@ get_jsonrpc_options(const char *target, enum service_model model) options = ovsdb_jsonrpc_default_options(target); if (model == SM_ACTIVE_BACKUP) { - options->probe_interval = REPLICATION_DEFAULT_PROBE_INTERVAL; + options->rpc.probe_interval = REPLICATION_DEFAULT_PROBE_INTERVAL; } else if (model == SM_RELAY) { - options->probe_interval = RELAY_SOURCE_DEFAULT_PROBE_INTERVAL; + options->rpc.probe_interval = RELAY_SOURCE_DEFAULT_PROBE_INTERVAL; } return options; @@ -1016,14 +1016,14 @@ open_db(struct server_config *server_config, if (model == SM_RELAY) { ovsdb_relay_add_db(db->db, conf->source, update_schema, server_config, - conf->options->probe_interval); + conf->options->rpc.probe_interval); } if (model == SM_ACTIVE_BACKUP && conf->ab.backup) { const struct uuid *server_uuid; server_uuid = ovsdb_jsonrpc_server_get_uuid(server_config->jsonrpc); replication_set_db(db->db, conf->source, conf->ab.sync_exclude, - server_uuid, conf->options->probe_interval); + server_uuid, conf->options->rpc.probe_interval); } return NULL; } @@ -1240,11 +1240,11 @@ add_manager_options(struct shash *remotes, const struct ovsdb_row *row) options = add_remote(remotes, target, NULL); if (ovsdb_util_read_integer_column(row, "max_backoff", &max_backoff)) { - options->max_backoff = max_backoff; + options->rpc.max_backoff = max_backoff; } if (ovsdb_util_read_integer_column(row, "inactivity_probe", &probe_interval)) { - options->probe_interval = probe_interval; + options->rpc.probe_interval = probe_interval; } if (ovsdb_util_read_bool_column(row, "read_only", &read_only)) { options->read_only = read_only; @@ -1256,13 +1256,13 @@ add_manager_options(struct 
shash *remotes, const struct ovsdb_row *row) options->role = xstrdup(role); } - options->dscp = DSCP_DEFAULT; + options->rpc.dscp = DSCP_DEFAULT; dscp_string = ovsdb_util_read_map_string_column(row, "other_config", "dscp"); if (dscp_string) { int dscp = atoi(dscp_string); if (dscp >= 0 && dscp <= 63) { - options->dscp = dscp; + options->rpc.dscp = dscp; } } } @@ -1725,7 +1725,7 @@ ovsdb_server_connect_active_ovsdb_server(struct unixctl_conn *conn, conf->model = SM_ACTIVE_BACKUP; conf->source = xstrdup(*config->sync_from); conf->options = ovsdb_jsonrpc_default_options(conf->source); - conf->options->probe_interval = + conf->options->rpc.probe_interval = *config->replication_probe_interval; conf->ab.sync_exclude = nullable_xstrdup(*config->sync_exclude); @@ -1734,7 +1734,8 @@ ovsdb_server_connect_active_ovsdb_server(struct unixctl_conn *conn, if (conf->model == SM_ACTIVE_BACKUP && !conf->ab.backup) { replication_set_db(db->db, conf->source, conf->ab.sync_exclude, - server_uuid, conf->options->probe_interval); + server_uuid, + conf->options->rpc.probe_interval); conf->ab.backup = true; } } @@ -1800,10 +1801,11 @@ ovsdb_server_set_active_ovsdb_server_probe_interval(struct unixctl_conn *conn, struct db_config *conf = db->config; if (conf->model == SM_ACTIVE_BACKUP) { - conf->options->probe_interval = probe_interval; + conf->options->rpc.probe_interval = probe_interval; if (conf->ab.backup) { replication_set_db(db->db, conf->source, conf->ab.sync_exclude, - server_uuid, conf->options->probe_interval); + server_uuid, + conf->options->rpc.probe_interval); } } } @@ -1839,7 +1841,7 @@ ovsdb_server_set_relay_source_interval(struct unixctl_conn *conn, struct db_config *conf = db->config; if (conf->model == SM_RELAY) { - conf->options->probe_interval = probe_interval; + conf->options->rpc.probe_interval = probe_interval; } } @@ -1882,7 +1884,8 @@ ovsdb_server_set_sync_exclude_tables(struct unixctl_conn *conn, conf->ab.sync_exclude = xstrdup(argv[1]); if (conf->ab.backup) { 
replication_set_db(db->db, conf->source, conf->ab.sync_exclude, - server_uuid, conf->options->probe_interval); + server_uuid, + conf->options->rpc.probe_interval); } } } From 6de317c0e091adb216348c05bb8f70c5e1ad3baa Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 9 Jan 2024 23:49:12 +0100 Subject: [PATCH 525/833] ovsdb: replication: Allow to set all jsonrpc options. Set all the options for the source connection, not only the inactivity probe interval. Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- ovsdb/ovsdb-server.c | 11 ++++------- ovsdb/replication.c | 6 +++--- ovsdb/replication.h | 3 ++- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c index ff3c885d5cf..87b06cfcd97 100644 --- a/ovsdb/ovsdb-server.c +++ b/ovsdb/ovsdb-server.c @@ -1023,7 +1023,7 @@ open_db(struct server_config *server_config, server_uuid = ovsdb_jsonrpc_server_get_uuid(server_config->jsonrpc); replication_set_db(db->db, conf->source, conf->ab.sync_exclude, - server_uuid, conf->options->rpc.probe_interval); + server_uuid, &conf->options->rpc); } return NULL; } @@ -1734,8 +1734,7 @@ ovsdb_server_connect_active_ovsdb_server(struct unixctl_conn *conn, if (conf->model == SM_ACTIVE_BACKUP && !conf->ab.backup) { replication_set_db(db->db, conf->source, conf->ab.sync_exclude, - server_uuid, - conf->options->rpc.probe_interval); + server_uuid, &conf->options->rpc); conf->ab.backup = true; } } @@ -1804,8 +1803,7 @@ ovsdb_server_set_active_ovsdb_server_probe_interval(struct unixctl_conn *conn, conf->options->rpc.probe_interval = probe_interval; if (conf->ab.backup) { replication_set_db(db->db, conf->source, conf->ab.sync_exclude, - server_uuid, - conf->options->rpc.probe_interval); + server_uuid, &conf->options->rpc); } } } @@ -1884,8 +1882,7 @@ ovsdb_server_set_sync_exclude_tables(struct unixctl_conn *conn, conf->ab.sync_exclude = xstrdup(argv[1]); if (conf->ab.backup) { replication_set_db(db->db, conf->source, 
conf->ab.sync_exclude, - server_uuid, - conf->options->rpc.probe_interval); + server_uuid, &conf->options->rpc); } } } diff --git a/ovsdb/replication.c b/ovsdb/replication.c index 8f3750eddae..56720cb105d 100644 --- a/ovsdb/replication.c +++ b/ovsdb/replication.c @@ -108,7 +108,7 @@ static bool request_id_compare_and_free(struct replication_db *, void replication_set_db(struct ovsdb *db, const char *sync_from, const char *exclude_tables, const struct uuid *server, - int probe_interval) + const struct jsonrpc_session_options *options) { struct replication_db *rdb = find_db(db->name); @@ -124,7 +124,7 @@ replication_set_db(struct ovsdb *db, const char *sync_from, if (rdb && nullable_string_is_equal(rdb->excluded_tables_str, exclude_tables) && nullable_string_is_equal(rdb->sync_from, sync_from)) { - jsonrpc_session_set_probe_interval(rdb->session, probe_interval); + jsonrpc_session_set_options(rdb->session, options); return; } @@ -147,7 +147,7 @@ replication_set_db(struct ovsdb *db, const char *sync_from, rdb->session = jsonrpc_session_open(rdb->sync_from, true); rdb->session_seqno = UINT_MAX; - jsonrpc_session_set_probe_interval(rdb->session, probe_interval); + jsonrpc_session_set_options(rdb->session, options); rdb->state = RPL_S_INIT; rdb->db->read_only = true; diff --git a/ovsdb/replication.h b/ovsdb/replication.h index 5e573e1002c..38886b6be9b 100644 --- a/ovsdb/replication.h +++ b/ovsdb/replication.h @@ -20,6 +20,7 @@ #include struct ovsdb; +struct jsonrpc_session_options; /* Replication module runs when OVSDB server runs in the backup mode. 
* @@ -47,7 +48,7 @@ struct ovsdb; void replication_set_db(struct ovsdb *, const char *sync_from, const char *exclude_tables, const struct uuid *server, - int probe_interval); + const struct jsonrpc_session_options *); void replication_remove_db(const struct ovsdb *); void replication_run(void); From dd0947b871327de9083727e17e41f45bf3dcd82d Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 9 Jan 2024 23:49:13 +0100 Subject: [PATCH 526/833] ovsdb-cs: Add function to set all jsonrpc session options. Allow setting all the options for the source connection, not only the inactivity probe interval. Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- lib/ovsdb-cs.c | 10 ++++++++++ lib/ovsdb-cs.h | 3 +++ 2 files changed, 13 insertions(+) diff --git a/lib/ovsdb-cs.c b/lib/ovsdb-cs.c index c7c147cc02b..b5eda88adbd 100644 --- a/lib/ovsdb-cs.c +++ b/lib/ovsdb-cs.c @@ -791,6 +791,16 @@ ovsdb_cs_get_last_error(const struct ovsdb_cs *cs) } } +/* Sets all the JSON-RPC session 'options' for 'cs''s current session. */ +void +ovsdb_cs_set_jsonrpc_options(const struct ovsdb_cs *cs, + const struct jsonrpc_session_options *options) +{ + if (cs->session) { + jsonrpc_session_set_options(cs->session, options); + } +} + /* Sets the "probe interval" for 'cs''s current session to 'probe_interval', in * milliseconds. 
*/ void diff --git a/lib/ovsdb-cs.h b/lib/ovsdb-cs.h index 4cf9ca2b99c..bcc3dcd7167 100644 --- a/lib/ovsdb-cs.h +++ b/lib/ovsdb-cs.h @@ -32,6 +32,7 @@ #include "openvswitch/uuid.h" struct json; +struct jsonrpc_session_options; struct ovsdb_cs; struct ovsdb_cs_ops { @@ -131,6 +132,8 @@ bool ovsdb_cs_is_alive(const struct ovsdb_cs *); bool ovsdb_cs_is_connected(const struct ovsdb_cs *); int ovsdb_cs_get_last_error(const struct ovsdb_cs *); +void ovsdb_cs_set_jsonrpc_options(const struct ovsdb_cs *, + const struct jsonrpc_session_options *); void ovsdb_cs_set_probe_interval(const struct ovsdb_cs *, int probe_interval); /* Conditional monitoring (specifying that only rows matching particular From 99d7e88495967fb2e3b4bf98030a28334f8ff2eb Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 9 Jan 2024 23:49:14 +0100 Subject: [PATCH 527/833] ovsdb: relay: Allow setting all jsonrpc session options. Allow setting all the JSON-RPC session options at once. While at it, allow updating options the same way the source can be updated while calling 'ovsdb_relay_add_db()' if the relay is already configured. 
Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- ovsdb/ovsdb-server.c | 2 +- ovsdb/relay.c | 6 ++++-- ovsdb/relay.h | 4 +++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c index 87b06cfcd97..8cea4413dfc 100644 --- a/ovsdb/ovsdb-server.c +++ b/ovsdb/ovsdb-server.c @@ -1016,7 +1016,7 @@ open_db(struct server_config *server_config, if (model == SM_RELAY) { ovsdb_relay_add_db(db->db, conf->source, update_schema, server_config, - conf->options->rpc.probe_interval); + &conf->options->rpc); } if (model == SM_ACTIVE_BACKUP && conf->ab.backup) { const struct uuid *server_uuid; diff --git a/ovsdb/relay.c b/ovsdb/relay.c index 27ff196b727..71a5b8e1cec 100644 --- a/ovsdb/relay.c +++ b/ovsdb/relay.c @@ -127,7 +127,8 @@ static struct ovsdb_cs_ops relay_cs_ops = { void ovsdb_relay_add_db(struct ovsdb *db, const char *remote, schema_change_callback schema_change_cb, - void *schema_change_aux, int probe_interval) + void *schema_change_aux, + const struct jsonrpc_session_options *options) { struct relay_ctx *ctx; @@ -138,6 +139,7 @@ ovsdb_relay_add_db(struct ovsdb *db, const char *remote, ctx = shash_find_data(&relay_dbs, db->name); if (ctx) { ovsdb_cs_set_remote(ctx->cs, remote, true); + ovsdb_cs_set_jsonrpc_options(ctx->cs, options); VLOG_DBG("%s: relay source set to '%s'", db->name, remote); return; } @@ -152,7 +154,7 @@ ovsdb_relay_add_db(struct ovsdb *db, const char *remote, shash_add(&relay_dbs, db->name, ctx); ovsdb_cs_set_leader_only(ctx->cs, false); ovsdb_cs_set_remote(ctx->cs, remote, true); - ovsdb_cs_set_probe_interval(ctx->cs, probe_interval); + ovsdb_cs_set_jsonrpc_options(ctx->cs, options); VLOG_DBG("added database: %s, %s", db->name, remote); } diff --git a/ovsdb/relay.h b/ovsdb/relay.h index 218caad65de..19cd3ef602a 100644 --- a/ovsdb/relay.h +++ b/ovsdb/relay.h @@ -22,6 +22,7 @@ #include "reconnect.h" struct json; +struct jsonrpc_session_options; struct ovsdb; struct ovsdb_schema; struct uuid; @@ 
-37,7 +38,8 @@ typedef struct ovsdb_error *(*schema_change_callback)( void ovsdb_relay_add_db(struct ovsdb *, const char *remote, schema_change_callback schema_change_cb, - void *schema_change_aux, int probe_interval); + void *schema_change_aux, + const struct jsonrpc_session_options *); void ovsdb_relay_del_db(struct ovsdb *); void ovsdb_relay_run(void); void ovsdb_relay_wait(void); From 55140090e63a644609aa5bf9ade4e1f69e31093f Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 9 Jan 2024 23:49:15 +0100 Subject: [PATCH 528/833] ovsdb-server: Allow user-provided config files. OVSDB server maintains a temporary file with the current database configuration for the case it is restarted by a monitor process after a crash. On startup the configuration from command line arguments is stored there in a JSON format, also whenever user changes the configuration with different UnixCtl commands, those changes are getting added to the file. When restarted from the crash it reads the configuration from the file and continues with all the necessary remotes and databases. This change allows it to be an external user-provided file that OVSDB server will read the configuration from. The file can be specified with a --config-file command line argument and it is mutually exclusive with most other command line arguments that set up remotes or databases, it is also mutually exclusive with use of appctl commands that modify same configurations, e.g. add/remove-db or add/remove-remote. If the user wants to change the configuration of a running server, they may change the file and call ovsdb-server/reload appctl. OVSDB server will open a file, read and parse it, compare the new configuration with the current one and adjust the running configuration as needed. OVSDB server will try to keep existing databases and connections intact, if the change can be applied without disrupting the normal operation. 
User-provided files are not trustworthy, so extra checks were added to ensure a correct file format. If the file cannot be correctly parsed, e.g. contains invalid JSON, no changes will be applied and the server will keep using the previous configuration until the next reload. If config-file is provided for active-backup databases, permanent disconnection of one of the backup databases no longer leads to switching all other databases to 'active'. Only the disconnected one will transition, since all of them have their own records in the configuration file. With this change, users can run all types of databases within the same ovsdb-server process at the same time. Simple configuration may look like this: { "remotes": { "punix:db.sock": {}, "pssl:6641": { "inactivity-probe": 16000, "read-only": false, "role": "ovn-controller" } }, "databases": { "conf.db": {}, "sb.db": { "service-model": "active-backup", "backup": true, "source": { "tcp:127.0.0.1:6644": null } }, "OVN_Northbound": { "service-model": "relay", "source": { "ssl:[fe:::1]:6642,ssl:[fe:::2]:6642": { "max-backoff": 8000, "inactivity-probe": 10000 } } } } } Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- Documentation/ref/ovsdb.7.rst | 86 +++++- Documentation/topics/ovsdb-relay.rst | 19 ++ NEWS | 4 + ovsdb/ovsdb-server.1.in | 96 ++++++- ovsdb/ovsdb-server.c | 394 ++++++++++++++++++++++----- 5 files changed, 521 insertions(+), 78 deletions(-) diff --git a/Documentation/ref/ovsdb.7.rst b/Documentation/ref/ovsdb.7.rst index 84b153d2424..46ed13e6163 100644 --- a/Documentation/ref/ovsdb.7.rst +++ b/Documentation/ref/ovsdb.7.rst @@ -155,6 +155,22 @@ standalone database, configure the server to listen on a "connection method" that the client can reach, then point the client to that connection method. See `Connection Methods`_ below for information about connection methods. +Open vSwitch 3.3 introduced support for configuration files via +``--config-file`` command line option. 
The configuration file for a server +with a **standalone** database may look like this:: + + { + "remotes": { "<connection method>": {} }, + "databases": { "<database file>": {} } + } + +``ovsdb-server`` will infer the service model from the database file itself. +However, if additional verification is desired, an optional +``"service-model": "standalone"`` can be provided for the database file inside +the inner curly braces. If the specified ``service-model`` will not match the +content of the database file, ``ovsdb-server`` will refuse to open this +database. + Active-Backup Database Service Model ------------------------------------ @@ -177,10 +193,36 @@ database file from the active server. Then use connects to the active server. At that point, the backup server will fetch a copy of the active database and keep it up-to-date until it is killed. +Open vSwitch 3.3 introduced support for configuration files via +``--config-file`` command line option. The configuration file for a backup +server in this case may look like this:: + + { + "remotes": { "<connection method>": {} }, + "databases": { + "<database file>": { + "service-model": "active-backup", + "backup": true, + "source": { + "<active>": { + "inactivity-probe": <integer>, + "max-backoff": <integer> + } + } + } + } + } + +All the fields in the ``"<database file>"`` description above are required. +Options for the ``"<active>"`` connection method (``"inactivity-probe"``, etc.) +can be omitted. + When the active server in an active-backup server pair fails, an administrator can switch the backup server to an active role with the ``ovs-appctl`` command ``ovsdb-server/disconnect-active-ovsdb-server``. Clients then have read/write -access to the now-active server. Of course, administrators are slow to respond +access to the now-active server. When the ``--config-file`` is in use, the +same can be achieved by changing the ``"backup"`` value in the file and running +``ovsdb-server/reload`` command. 
Of course, administrators are slow to respond compared to software, so in practice external management software detects the active server's failure and changes the backup server's role. For example, the "Integration Guide for Centralized Control" in the OVN documentation describes @@ -236,6 +278,22 @@ To set up a clustered database, first initialize it on a single node by running arguments, the ``create-cluster`` command can create an empty database or copy a standalone database's contents into the new database. +Open vSwitch 3.3 introduced support for configuration files via +``--config-file`` command line option. The configuration file for a server +with a **clustered** database may look like this:: + + { + "remotes": { "<connection method>": {} }, + "databases": { "<database file>": {} } + } + +``ovsdb-server`` will infer the service model from the database file itself. +However, if additional verification is desired, an optional +``"service-model": "clustered"`` can be provided for the database file inside +the inner curly braces. If the specified ``service-model`` will not match the +content of the database file, ``ovsdb-server`` will refuse to open this +database. + To configure a client to use a clustered database, first configure all of the servers to listen on a connection method that the client can reach, then point the client to all of the servers' connection methods, comma-separated. See @@ -505,6 +563,29 @@ server. ``<relay source>`` could contain a comma-separated list of connection methods, e.g. to connect to any server of the clustered database. Multiple relay servers could be started for the same relay source. +Open vSwitch 3.3 introduced support for configuration files via +``--config-file`` command line option. 
The configuration file for a relay +database server in this case may look like this:: + + { + "remotes": { "<connection method>": {} }, + "databases": { + "<DB_NAME>": { + "service-model": "relay", + "source": { + "<relay source>": { + "inactivity-probe": <integer>, + "max-backoff": <integer> + } + } + } + } + } + +Both the ``"service-model"`` and the ``"source"`` are required. Options for +the ``"<relay source>"`` connection method (``"inactivity-probe"``, etc.) +can be omitted. + Since the way relays handle read and write transactions is very similar to the clustered model where "cluster" means "set of relay servers connected to the same relay source", "follower" means "relay server" and the "leader" @@ -629,7 +710,8 @@ Creating a Database Creating and starting up the service for a new database was covered separately for each database service model in the `Service -Models`_ section, above. +Models`_ section, above. A single ``ovsdb-server`` process may serve +any number of databases with different service models at the same time. Backing Up and Restoring a Database ----------------------------------- diff --git a/Documentation/topics/ovsdb-relay.rst b/Documentation/topics/ovsdb-relay.rst index 50a3c6d07b9..75f0c6577d6 100644 --- a/Documentation/topics/ovsdb-relay.rst +++ b/Documentation/topics/ovsdb-relay.rst @@ -105,6 +105,25 @@ started like this:: $ ... $ ovsdb-server --remote=ptcp:6642:172.16.0.K relay:OVN_Southbound:$REMOTES +Open vSwitch 3.3 introduced support for configuration files via +``--config-file`` command line option. The configuration file for relay +database servers in this case may look like this:: + + { + "remotes": { "ptcp:6642:172.16.0.X": {} }, + "databases": { + "OVN_Southbound": { + "service-model": "relay", + "source": { + "$REMOTES": {} + } + } + } + } + +See ``ovsdb-server(1)`` and ``Relay Service Model`` in ``ovsdb(7)`` for more +configuration options. 
+ Every relay server could connect to any of the cluster members of their choice, fairness of load distribution is achieved by shuffling remotes. 
diff --git a/NEWS b/NEWS index f6b4cbf997b..49d74b0b951 100644 --- a/NEWS +++ b/NEWS @@ -6,6 +6,10 @@ Post-v3.2.0 from older version is supported but it may trigger more leader elections during the process, and error logs complaining unrecognized fields may be observed on old nodes. + * New command line option --config-file that allows a fine control over + remotes and database configuration, including setting options for + connection methods for relays and active-backup replication. + For more details see ovsdb-server(1) and ovsdb(7). - OpenFlow: * NXT_CT_FLUSH extension is updated to support flushing connections based on mark and labels. 'ct-flush' command of ovs-ofctl updated diff --git a/ovsdb/ovsdb-server.1.in b/ovsdb/ovsdb-server.1.in index da7a6fd5d54..9fabf2d6727 100644 --- a/ovsdb/ovsdb-server.1.in +++ b/ovsdb/ovsdb-server.1.in @@ -12,6 +12,7 @@ ovsdb\-server \- Open vSwitch database server [\fIdatabase\fR]\&... [\fIrelay:schema_name:remote\fR]\&... [\fB\-\-remote=\fIremote\fR]\&... +[\fB\-\-config\-file=\fIfile\fR] [\fB\-\-run=\fIcommand\fR] .so lib/daemon-syn.man .so lib/service-syn.man @@ -44,6 +45,11 @@ If none of database files or relay databases is specified, the default is initialized using, for example, \fBovsdb\-tool\fR's \fBcreate\fR, \fBcreate\-cluster\fR, or \fBjoin\-cluster\fR command. .PP +All types of databases can alternatively be added using a configuration +file provided via \fB\-\-config\-file\fR option. This option is mutually +exclusive with specifying \fIdatabase\fR on the command line. For a detailed +description of the configuration file format see \fBovsdb\fR(7). +.PP This OVSDB implementation supports standalone, active-backup, relay and clustered database service models, as well as database replication. See the Service Models section of \fBovsdb\fR(7) for more information. @@ -105,6 +111,74 @@ It is an error for \fIcolumn\fR to have another type. 
.IP To connect or listen on multiple connection methods, use multiple \fB\-\-remote\fR options. +.IP +Alternatively, remotes can be specified in a "remotes" section of the +configuration file, if provided using \fB\-\-config\-file\fR option. +\fB\-\-config\-file\fR and \fB\-\-remote\fR options are mutually +exclusive. +. +.IP "\fB\-\-config-file=\fIfile\fR" +Specifies a configuration file for \fBovsdb\-server\fR. This \fIfile\fR +can contain connection methods and databases used by the server. +The \fIfile\fR contains a JSON object with two main elements: +.RS +.IP "\fBremotes\fR" +JSON object that contains a set of connection methods in the following format: +"\fItarget\fR": { "\fIoption\fR": \fIvalue\fR, ... }. Where \fItarget\fR +is in the same format as \fIremote\fR in \fB\-\-remote\fR option. +\fIoption\fR can be \fBmax-backoff\fR (integer), \fBinactivity-probe\fR +(integer), \fBread-only\fR (boolean), \fBrole\fR (string) or \fBdscp\fR +(integer) with their allowed \fIvalue\fRs respectively. The meaning of these +\fIoption\fRs is the same as in configuration of \fIremote\fR via a database +row with \fB\-\-remote\fR option. +.IP "\fBdatabases\fR" +JSON object that describes databases that should be added to the +\fBovsdb\-server\fR in the following format: "\fIname\fR":{ "\fIoption\fR": +\fIvalue\fR, ... }. Where \fIname\fR is either a file name of a previously +created and initialized database or a schema name in case of relay +databases. Available \fIoption\fRs are: +.RS +.IP "\fBservice-model\fR (string)" +Describes the service model of this database. One of: \fBstandalone\fR, +\fBclustered\fR, \fBactive-backup\fR or \fBrelay\fR. This option is +required for all types, except for standalone and clustered. For these +databases the service model will be inferred from the file, if not +specified explicitly. \fBovsdb-server\fR will refuse to add a database +if the specified \fBservice-model\fR doesn't match with the provided file. 
+.IP "\fBsource\fR (JSON object; active-backup or relay)" +Describes the connection method to the active database or to the relay +source. It is a JSON object with exactly one element in the same format +as elements of "\fBremotes\fR", except that \fBread-only\fR and \fBrole\fR +options are not applicable. E.g. \fB"source": { "unix:db.sock": { +"inactivity-probe": 10000, "max-backoff": 8000 } }\fR +.IP "\fBbackup\fR (boolean; active-backup only)" +If set to \fBtrue\fR, \fBovsdb-server\fR will use this database as a +backup for the specified \fBsource\fR. Will be served as an active +database otherwise. +.IP "\fBexclude-tables\fR (JSON array of strings; active-backup only)" +List of table names that should be excluded from replication in backup mode, +e.g. \fB"exclude-tables": [ "Table_One", "Table_Two" ]\fR. +.RE +.RE +.IP +Content of the most basic configuration file may look like this: +\fB{ "remotes": { "pssl:6640": {} }, "databases": { "conf.db": {} } }\fR +.IP +Examples of configuration files for different service models can be +found in \fBovsdb\fR(7). +.IP +\fB\-\-config-file\fR option is mutually exclusive with the \fB\-\-remote\fR +as well as with specifying \fIdatabase\fR on a command line. It is also +mutually exclusive with all the \fBActive-Backup Options\fR and all the +\fBRUNTIME MANAGEMENT COMMANDS\fR that can change the configuration of +the server in conflict with the content of the file, i.e. all the commands +that manipulate remotes and databases. Read-only commands can still +be used. +.IP +In case of changes in the \fIfile\fR, users should run the +\fBovsdb-server/reload\fR command with \fBovs-appctl\fR(8) in order for +changes to take effect. +.RE . .IP "\fB\-\-run=\fIcommand\fR]" Ordinarily \fBovsdb\-server\fR runs forever, or until it is told to @@ -178,6 +252,8 @@ allow the syncing options to be specified using command line options, yet start the server, as the default, active server. 
To switch the running server to backup mode, use \fBovs-appctl(1)\fR to execute the \fBovsdb\-server/connect\-active\-ovsdb\-server\fR command. +.PP +These options are mutually exclusive with the \fB\-\-config\-file\fR. .SS "Public Key Infrastructure Options" The options described below for configuring the SSL public key infrastructure accept a special syntax for obtaining their @@ -230,6 +306,8 @@ clients. Adds a remote, as if \fB\-\-remote=\fIremote\fR had been specified on the \fBovsdb\-server\fR command line. (If \fIremote\fR is already a remote, this command succeeds without changing the configuration.) +.IP +Mutually exclusive with the \fB\-\-config\-file\fR option. . .IP "\fBovsdb\-server/remove\-remote \fIremote\fR" Removes the specified \fIremote\fR from the configuration, failing @@ -241,6 +319,8 @@ configuring a \fBdb:\fIdb\fB,\fItable\fB,\fIcolumn\fR remote. (You can remove a database source with \fBovsdb\-server/remove\-remote \fBdb:\fIdb\fB,\fItable\fB,\fIcolumn\fR, but not individual remotes found indirectly through the database.) +.IP +Mutually exclusive with the \fB\-\-config\-file\fR option. . .IP "\fBovsdb\-server/list\-remotes" Outputs a list of the currently configured remotes named on @@ -254,6 +334,8 @@ Adds the \fIdatabase\fR to the running \fBovsdb\-server\fR. \fIdatabase\fR could be a database file or a relay description in the following format: \fIrelay:schema_name:remote\fR. The database file must already have been created and initialized using, for example, \fBovsdb\-tool create\fR. +.IP +Mutually exclusive with the \fB\-\-config\-file\fR option. . .IP "\fBovsdb\-server/remove\-db \fIdatabase\fR" Removes \fIdatabase\fR from the running \fBovsdb\-server\fR. \fIdatabase\fR @@ -268,6 +350,8 @@ Any public key infrastructure options specified through this database (e.g. \fB\-\-private\-key=db:\fIdatabase,\fR... on the command line) will be disabled until another database with the same name is added again (with \fBovsdb\-server/add\-db\fR). 
+.IP +Mutually exclusive with the \fB\-\-config\-file\fR option. . .IP "\fBovsdb\-server/list\-dbs" Outputs a list of the currently configured databases added either through @@ -286,6 +370,9 @@ These commands query and update the role of \fBovsdb\-server\fR within an active-backup pair of servers. See \fBActive-Backup Options\fR, above, and \fBActive-Backup Database Service Model\fR in \fBovsdb\fR(7) for more information. +.PP +All \fBActive-Backup Commands\fR that change the state of \fBovsdb\-server\fR +are mutually exclusive with the \fB\-\-config\-file\fR option. . .IP "\fBovsdb\-server/set\-active\-ovsdb\-server \fIserver" Sets the active \fIserver\fR from which \fBovsdb\-server\fR connects through @@ -324,11 +411,10 @@ Gets the tables that are currently excluded from synchronization. Prints a summary of replication run time information. The \fBstate\fR information is always provided, indicating whether the server is running in the \fIactive\fR or the \fIbackup\fR mode. -When running in backup mode, replication connection status, which -can be either \fIconnecting\fR, \fIreplicating\fR or \fIerror\fR, are shown. -When the connection is in \fIreplicating\fR state, further output shows -the list of databases currently replicating, and the tables that are -excluded. +For all databases with active-backup service model, replication connection +status, which can be either \fIconnecting\fR, \fIreplicating\fR or +\fIerror\fR, are shown. When the connection is in \fIreplicating\fR state, +further output shows the tables that are currently excluded from replication. . .SS "Cluster Commands" These commands support the \fBovsdb\-server\fR clustered service model. 
diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c index 8cea4413dfc..d45c9e5f3d6 100644 --- a/ovsdb/ovsdb-server.c +++ b/ovsdb/ovsdb-server.c @@ -195,6 +195,13 @@ static void add_server_db(struct server_config *); static void remove_db(struct server_config *, struct shash_node *db, char *); static void close_db(struct server_config *, struct db *, char *); +static struct ovsdb_error *update_schema(struct ovsdb *, + const struct ovsdb_schema *, + const struct uuid *txnid, + bool conversion_with_no_data, + void *aux) + OVS_WARN_UNUSED_RESULT; + static void parse_options(int argc, char *argvp[], struct shash *db_conf, struct shash *remotes, char **unixctl_pathp, char **run_command, @@ -223,7 +230,7 @@ static void save_config__(FILE *config_file, const struct shash *remotes, const char *sync_from, const char *sync_exclude, bool is_backup); static void save_config(struct server_config *); -static void load_config(FILE *config_file, struct shash *remotes, +static bool load_config(FILE *config_file, struct shash *remotes, struct shash *db_conf, char **sync_from, char **sync_exclude, bool *is_backup); @@ -263,8 +270,9 @@ ovsdb_server_replication_run(struct server_config *config) } /* If one connection is broken, switch all databases to active, - * since they are configured via the same command line / appctl. */ - if (!all_alive && *config->is_backup) { + * if they are configured via the command line / appctl and so have + * shared configuration. 
*/ + if (!config_file_path && !all_alive && *config->is_backup) { *config->is_backup = false; SHASH_FOR_EACH (node, config->all_dbs) { @@ -510,6 +518,196 @@ free_database_configs(struct shash *db_conf) shash_clear(db_conf); } +static bool +service_model_can_convert(enum service_model a, enum service_model b) +{ + ovs_assert(a != SM_UNDEFINED); + + if (a == b) { + return true; + } + + if (b == SM_UNDEFINED) { + return a == SM_STANDALONE || a == SM_CLUSTERED; + } + + /* Conversion can happen only between standalone and active-backup. */ + return (a == SM_STANDALONE && b == SM_ACTIVE_BACKUP) + || (a == SM_ACTIVE_BACKUP && b == SM_STANDALONE); +} + +static void +database_update_config(struct server_config *server_config, + struct db *db, const struct db_config *new_conf) +{ + struct db_config *conf = db->config; + enum service_model model = conf->model; + + /* Stop replicating when transitioning to active or standalone. */ + if (conf->model == SM_ACTIVE_BACKUP && conf->ab.backup + && (new_conf->model == SM_STANDALONE || !new_conf->ab.backup)) { + ovsdb_server_replication_remove_db(db); + } + + db_config_destroy(conf); + conf = db->config = db_config_clone(new_conf); + + if (conf->model == SM_UNDEFINED) { + /* We're operating on the same file, the model is the same. 
*/ + conf->model = model; + } + + if (conf->model == SM_RELAY) { + ovsdb_relay_add_db(db->db, conf->source, update_schema, server_config, + &conf->options->rpc); + } + if (conf->model == SM_ACTIVE_BACKUP && conf->ab.backup) { + const struct uuid *server_uuid; + + server_uuid = ovsdb_jsonrpc_server_get_uuid(server_config->jsonrpc); + replication_set_db(db->db, conf->source, conf->ab.sync_exclude, + server_uuid, &conf->options->rpc); + } +} + +static bool +reconfigure_databases(struct server_config *server_config, + struct shash *db_conf) +{ + struct db_config *cur_conf, *new_conf; + struct shash_node *node, *conf_node; + bool res = true; + struct db *db; + + /* Remove databases that are no longer in the configuration or have + * incompatible configuration. Update compatible ones. */ + SHASH_FOR_EACH_SAFE (node, server_config->all_dbs) { + db = node->data; + + if (node->name[0] == '_') { + /* Skip internal databases. */ + continue; + } + + cur_conf = db->config; + conf_node = shash_find(db_conf, db->filename); + new_conf = conf_node ? conf_node->data : NULL; + + if (!new_conf) { + remove_db(server_config, node, + xasprintf("database %s removed from configuration", + node->name)); + continue; + } + if (!service_model_can_convert(cur_conf->model, new_conf->model)) { + remove_db(server_config, node, + xasprintf("service model changed for database %s", + node->name)); + continue; + } + database_update_config(server_config, db, new_conf); + + db_config_destroy(new_conf); + shash_delete(db_conf, conf_node); + } + + /* Create new databases. 
*/ + SHASH_FOR_EACH (node, db_conf) { + struct ovsdb_error *error = open_db(server_config, + node->name, node->data); + if (error) { + char *s = ovsdb_error_to_string_free(error); + + VLOG_WARN("failed to open database '%s': %s", node->name, s); + free(s); + res = false; + } + db_config_destroy(node->data); + } + shash_clear(db_conf); + + return res; +} + +static bool +reconfigure_ovsdb_server(struct server_config *server_config) +{ + char *sync_from = NULL, *sync_exclude = NULL; + bool is_backup = false; + struct shash remotes; + struct shash db_conf; + bool res = true; + + FILE *file = NULL; + + if (config_file_path) { + file = fopen(config_file_path, "r+b"); + if (!file) { + VLOG_ERR("failed to open configuration file '%s': %s", + config_file_path, ovs_strerror(errno)); + return false; + } else { + VLOG_INFO("loading configuration from '%s'", config_file_path); + } + } else { + file = server_config->config_tmpfile; + } + ovs_assert(file); + + shash_init(&remotes); + shash_init(&db_conf); + + if (!load_config(file, &remotes, &db_conf, + &sync_from, &sync_exclude, &is_backup)) { + if (config_file_path) { + VLOG_WARN("failed to load configuration from %s", + config_file_path); + } else { + VLOG_FATAL("failed to load configuration from a temporary file"); + } + res = false; + goto exit_close; + } + + /* Parsing was successful. Update the server configuration. 
*/ + shash_swap(server_config->remotes, &remotes); + free(*server_config->sync_from); + *server_config->sync_from = sync_from; + free(*server_config->sync_exclude); + *server_config->sync_exclude = sync_exclude; + *server_config->is_backup = is_backup; + + if (!reconfigure_databases(server_config, &db_conf)) { + VLOG_WARN("failed to configure databases"); + res = false; + } + + char *error = reconfigure_remotes(server_config->jsonrpc, + server_config->all_dbs, + server_config->remotes); + if (error) { + VLOG_WARN("failed to configure remotes: %s", error); + res = false; + } else { + error = reconfigure_ssl(server_config->all_dbs); + if (error) { + VLOG_WARN("failed to configure SSL: %s", error); + res = false; + } + } + free(error); + +exit_close: + if (config_file_path) { + fclose(file); + } + free_remotes(&remotes); + free_database_configs(&db_conf); + shash_destroy(&remotes); + shash_destroy(&db_conf); + return res; +} + int main(int argc, char *argv[]) { @@ -520,8 +718,7 @@ main(int argc, char *argv[]) struct process *run_process; bool exiting; int retval; - FILE *config_tmpfile; - struct server_config server_config; + FILE *config_tmpfile = NULL; struct shash all_dbs; struct shash_node *node; int replication_probe_interval = REPLICATION_DEFAULT_PROBE_INTERVAL; @@ -532,6 +729,16 @@ main(int argc, char *argv[]) char *sync_from = NULL, *sync_exclude = NULL; bool is_backup; + struct server_config server_config = { + .remotes = &remotes, + .all_dbs = &all_dbs, + .sync_from = &sync_from, + .sync_exclude = &sync_exclude, + .is_backup = &is_backup, + .replication_probe_interval = &replication_probe_interval, + .relay_source_probe_interval = &relay_source_probe_interval, + }; + ovs_cmdl_proctitle_init(argc, argv); set_program_name(argv[0]); service_start(&argc, &argv); @@ -546,64 +753,39 @@ main(int argc, char *argv[]) daemon_become_new_user(false, false); - /* Create and initialize 'config_tmpfile' as a temporary file to hold - * ovsdb-server's most basic 
configuration, and then save our initial - * configuration to it. When --monitor is used, this preserves the effects - * of ovs-appctl commands such as ovsdb-server/add-remote (which saves the - * new configuration) across crashes. */ - config_tmpfile = tmpfile(); - if (!config_tmpfile) { - ovs_fatal(errno, "failed to create temporary file"); + if (!config_file_path) { + /* Create and initialize 'config_tmpfile' as a temporary file to hold + * ovsdb-server's most basic configuration, and then save our initial + * configuration to it. When --monitor is used, this preserves the + * effects of ovs-appctl commands such as ovsdb-server/add-remote + * (which saves the new configuration) across crashes. */ + config_tmpfile = tmpfile(); + if (!config_tmpfile) { + ovs_fatal(errno, "failed to create temporary file"); + } + server_config.config_tmpfile = config_tmpfile; + save_config__(config_tmpfile, &remotes, &db_conf, sync_from, + sync_exclude, is_backup); } - server_config.remotes = &remotes; - server_config.config_tmpfile = config_tmpfile; - - save_config__(config_tmpfile, &remotes, &db_conf, sync_from, - sync_exclude, is_backup); free_remotes(&remotes); free_database_configs(&db_conf); daemonize_start(false, false); - /* Load the saved config. */ - load_config(config_tmpfile, &remotes, &db_conf, &sync_from, - &sync_exclude, &is_backup); - - /* Start ovsdb jsonrpc server. When running as a backup server, - * jsonrpc connections are read only. Otherwise, both read - * and write transactions are allowed. */ - jsonrpc = ovsdb_jsonrpc_server_create(is_backup); + perf_counters_init(); - shash_init(&all_dbs); - server_config.all_dbs = &all_dbs; + /* Start ovsdb jsonrpc server. Both read and write transactions are + * allowed by default, individual remotes and databases will be configured + * as read-only, if necessary. 
*/ + jsonrpc = ovsdb_jsonrpc_server_create(false); server_config.jsonrpc = jsonrpc; - server_config.sync_from = &sync_from; - server_config.sync_exclude = &sync_exclude; - server_config.is_backup = &is_backup; - server_config.replication_probe_interval = &replication_probe_interval; - server_config.relay_source_probe_interval = &relay_source_probe_interval; - perf_counters_init(); - - SHASH_FOR_EACH (node, &db_conf) { - struct ovsdb_error *error = open_db(&server_config, - node->name, node->data); - if (error) { - char *s = ovsdb_error_to_string_free(error); - ovs_fatal(0, "%s", s); - } - db_config_destroy(node->data); - } - shash_clear(&db_conf); + shash_init(&all_dbs); add_server_db(&server_config); - char *error = reconfigure_remotes(jsonrpc, &all_dbs, &remotes); - if (!error) { - error = reconfigure_ssl(&all_dbs); - } - if (error) { - ovs_fatal(0, "%s", error); + if (!reconfigure_ovsdb_server(&server_config)) { + ovs_fatal(0, "server configuration failed"); } retval = unixctl_server_create(unixctl_path, &unixctl); @@ -2060,14 +2242,21 @@ ovsdb_server_reconnect(struct unixctl_conn *conn, int argc OVS_UNUSED, * 'config_file_path', read it and sync the runtime configuration with it. */ static void ovsdb_server_reload(struct unixctl_conn *conn, int argc OVS_UNUSED, - const char *argv[] OVS_UNUSED, void *config_ OVS_UNUSED) + const char *argv[] OVS_UNUSED, void *config_) { + struct server_config *config = config_; + if (!config_file_path) { unixctl_command_reply_error(conn, "Configuration file was not specified on command line"); - } else { + return; + } + + if (!reconfigure_ovsdb_server(config)) { unixctl_command_reply_error(conn, - "Configuration file support is not implemented yet"); + "Configuration failed. 
See the log file for details."); + } else { + unixctl_command_reply(conn, NULL); } } @@ -2741,6 +2930,10 @@ save_config(struct server_config *config) struct shash_node *node; struct shash db_conf; + if (config_file_path) { + return; + } + shash_init(&db_conf); SHASH_FOR_EACH (node, config->all_dbs) { struct db *db = node->data; @@ -2757,7 +2950,7 @@ save_config(struct server_config *config) shash_destroy(&db_conf); } -static void +static bool remotes_from_json(struct shash *remotes, const struct json *json) { struct ovsdb_jsonrpc_options *options; @@ -2767,14 +2960,31 @@ remotes_from_json(struct shash *remotes, const struct json *json) free_remotes(remotes); ovs_assert(json); - ovs_assert(json->type == JSON_OBJECT); + if (json->type == JSON_NULL) { + return true; + } + if (json->type != JSON_OBJECT) { + VLOG_WARN("config: 'remotes' is not a JSON object"); + return false; + } object = json_object(json); SHASH_FOR_EACH (node, object) { options = ovsdb_jsonrpc_default_options(node->name); - ovsdb_jsonrpc_options_update_from_json(options, node->data, false); shash_add(remotes, node->name, options); + + json = node->data; + if (json->type == JSON_OBJECT) { + ovsdb_jsonrpc_options_update_from_json(options, node->data, false); + } else if (json->type != JSON_NULL) { + VLOG_WARN("%s: JSON-RPC options are not a JSON object or null", + node->name); + free_remotes(remotes); + return false; + } } + + return true; } static struct db_config * @@ -2785,12 +2995,24 @@ db_config_from_json(const char *name, const struct json *json) struct ovsdb_parser parser; struct ovsdb_error *error; + conf->model = SM_UNDEFINED; + + ovs_assert(json); + if (json->type == JSON_NULL) { + return conf; + } + ovsdb_parser_init(&parser, json, "database %s", name); model = ovsdb_parser_member(&parser, "service-model", OP_STRING | OP_OPTIONAL); - conf->model = model ? 
service_model_from_string(json_string(model)) - : SM_UNDEFINED; + if (model) { + conf->model = service_model_from_string(json_string(model)); + if (conf->model == SM_UNDEFINED) { + ovsdb_parser_raise_error(&parser, + "'%s' is not a valid service model", json_string(model)); + } + } if (conf->model == SM_ACTIVE_BACKUP) { backup = ovsdb_parser_member(&parser, "backup", OP_BOOLEAN); @@ -2861,7 +3083,7 @@ db_config_from_json(const char *name, const struct json *json) } -static void +static bool databases_from_json(struct shash *db_conf, const struct json *json) { const struct shash_node *node; @@ -2870,7 +3092,12 @@ databases_from_json(struct shash *db_conf, const struct json *json) free_database_configs(db_conf); ovs_assert(json); - ovs_assert(json->type == JSON_OBJECT); + if (json->type == JSON_NULL) { + return true; + } + if (json->type != JSON_OBJECT) { + VLOG_WARN("config: 'databases' is not a JSON object or null"); + } object = json_object(json); SHASH_FOR_EACH (node, object) { @@ -2878,13 +3105,19 @@ databases_from_json(struct shash *db_conf, const struct json *json) if (conf) { shash_add(db_conf, node->name, conf); + } else { + free_database_configs(db_conf); + return false; } } + return true; } -/* Clears and replaces 'remotes' and 'dbnames' by a configuration read from - * 'config_file', which must have been previously written by save_config(). */ -static void +/* Clears and replaces 'remotes' and 'db_conf' by a configuration read from + * 'config_file', which must have been previously written by save_config() + * or provided by the user with --config-file. + * Returns 'true', if parsing was successful, 'false' otherwise. 
*/ +static bool load_config(FILE *config_file, struct shash *remotes, struct shash *db_conf, char **sync_from, char **sync_exclude, bool *is_backup) @@ -2892,17 +3125,34 @@ load_config(FILE *config_file, struct shash *remotes, struct json *json; if (fseek(config_file, 0, SEEK_SET) != 0) { - VLOG_FATAL("seek failed in temporary file (%s)", ovs_strerror(errno)); + VLOG_WARN("config: file seek failed (%s)", ovs_strerror(errno)); + return false; } json = json_from_stream(config_file); if (json->type == JSON_STRING) { - VLOG_FATAL("reading json failed (%s)", json_string(json)); + VLOG_WARN("config: reading JSON failed (%s)", json_string(json)); + json_destroy(json); + return false; + } + if (json->type != JSON_OBJECT) { + VLOG_WARN("configuration in a file must be a JSON object"); + json_destroy(json); + return false; } - ovs_assert(json->type == JSON_OBJECT); - remotes_from_json(remotes, shash_find_data(json_object(json), "remotes")); - databases_from_json(db_conf, - shash_find_data(json_object(json), "databases")); + if (!remotes_from_json(remotes, + shash_find_data(json_object(json), "remotes"))) { + VLOG_WARN("config: failed to parse 'remotes'"); + json_destroy(json); + return false; + } + if (!databases_from_json(db_conf, shash_find_data(json_object(json), + "databases"))) { + VLOG_WARN("config: failed to parse 'databases'"); + free_remotes(remotes); + json_destroy(json); + return false; + } struct json *string; string = shash_find_data(json_object(json), "sync_from"); @@ -2913,7 +3163,9 @@ load_config(FILE *config_file, struct shash *remotes, free(*sync_exclude); *sync_exclude = string ? xstrdup(json_string(string)) : NULL; - *is_backup = json_boolean(shash_find_data(json_object(json), "is_backup")); + struct json *boolean = shash_find_data(json_object(json), "is_backup"); + *is_backup = boolean ? 
json_boolean(boolean) : false; json_destroy(json); + return true; } From 47ddc474dbd2239b4122d9dccc5630b1c7c9a838 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 9 Jan 2024 23:49:16 +0100 Subject: [PATCH 529/833] tests: ovsdb: Add relay and replication execution with config file. Basic relay and active-backup command execution tests extended to run some copies of ovsdb-server processes with --config-file. Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- tests/ovsdb-server.at | 123 +++++++++++++++++++++++++++++++++--------- 1 file changed, 97 insertions(+), 26 deletions(-) diff --git a/tests/ovsdb-server.at b/tests/ovsdb-server.at index 45aa80cd676..488dfc36f54 100644 --- a/tests/ovsdb-server.at +++ b/tests/ovsdb-server.at @@ -1593,12 +1593,30 @@ m4_define([OVSDB_CHECK_EXECUTION_RELAY], ], [0], [ignore], [ignore]) for i in $(seq 2 ${n_servers}); do - AT_CHECK([ovsdb-server --detach --no-chdir dnl - --log-file=ovsdb-server$i.log dnl - --pidfile=${i}.pid --remote=punix:db${i}.sock dnl - --unixctl=unixctl${i} -vjsonrpc:file:dbg dnl - relay:${schema_name}:unix:db$((i-1)).sock - ], [0], [ignore], [ignore]) + dnl Run every second relay with a config file. 
+ if test $(expr $i % 2) -eq 0; then + echo "{ + \"remotes\": { \"punix:db${i}.sock\": {} }, + \"databases\": { + \"${schema_name}\": { + \"service-model\": \"relay\", + \"source\": { \"unix:db$((i-1)).sock\": {} } + } + } + }" > config${i}.json + AT_CHECK([ovsdb-server --detach --no-chdir --pidfile=${i}.pid \ + --log-file=ovsdb-server$i.log \ + --unixctl=unixctl${i} -vjsonrpc:file:dbg \ + --config-file=config${i}.json + ], [0], [ignore], [ignore]) + else + AT_CHECK([ovsdb-server --detach --no-chdir \ + --log-file=ovsdb-server$i.log \ + --pidfile=${i}.pid --remote=punix:db${i}.sock \ + --unixctl=unixctl${i} -vjsonrpc:file:dbg \ + relay:${schema_name}:unix:db$((i-1)).sock + ], [0], [ignore], [ignore]) + fi done m4_foreach([txn], [$4], @@ -1645,13 +1663,14 @@ AT_BANNER([OVSDB -- ovsdb-server replication]) # OVSDB_CHECK_EXECUTION(TITLE, SCHEMA, TRANSACTIONS, OUTPUT, [KEYWORDS]) # -# Creates two databases with the given SCHEMA, and starts an ovsdb-server on +# Creates three databases with the given SCHEMA, and starts an ovsdb-server on # each database. # Runs each of the TRANSACTIONS (which should be a quoted list of # quoted strings) against one of the servers with ovsdb-client one at a -# time. The server replicates its database to the other ovsdb-server. +# time. The server replicates its database to the other two ovsdb-servers, +# one of which is configured via command line and the other via --config-file. # -# Checks that the dump of both databases are the same. +# Checks that the dump of all databases are the same. # # TITLE is provided to AT_SETUP and KEYWORDS to AT_KEYWORDS. 
m4_define([OVSDB_CHECK_EXECUTION], @@ -1660,22 +1679,43 @@ m4_define([OVSDB_CHECK_EXECUTION], $2 > schema AT_CHECK([ovsdb-tool create db1 schema], [0], [stdout], [ignore]) AT_CHECK([ovsdb-tool create db2 schema], [0], [stdout], [ignore]) - - on_exit 'kill `cat *.pid`' - AT_CHECK([ovsdb-server --detach --no-chdir --log-file=ovsdb-server1.log --pidfile --remote=punix:db.sock db1], [0], [ignore], [ignore]) - i - - AT_CHECK([ovsdb-server --detach --no-chdir --log-file=ovsdb-server2.log --pidfile=2.pid --remote=punix:db2.sock --unixctl=unixctl2 --sync-from=unix:db.sock db2], [0], [ignore], [ignore]) + AT_CHECK([ovsdb-tool create db3 schema], [0], [stdout], [ignore]) + + on_exit 'kill $(cat *.pid)' + AT_CHECK([ovsdb-server -vfile --detach --no-chdir --log-file=ovsdb-server1.log \ + --pidfile --remote=punix:db.sock db1], [0], [ignore], [ignore]) + + AT_CHECK([ovsdb-server -vfile --detach --no-chdir --log-file=ovsdb-server2.log \ + --pidfile=2.pid --remote=punix:db2.sock --unixctl=unixctl2 \ + --sync-from=unix:db.sock db2], [0], [ignore], [ignore]) + + AT_DATA([config3.json], [ + { + "remotes": { "punix:db3.sock": {} }, + "databases": { + "db3": { + "service-model": "active-backup", + "backup": true, + "source": { "unix:db.sock": {} } + } + } + } +]) + AT_CHECK([ovsdb-server -vfile --detach --no-chdir --log-file=ovsdb-server3.log \ + --pidfile=3.pid --unixctl=unixctl3 --config-file=config3.json], + [0], [ignore], [ignore]) m4_foreach([txn], [$3], [AT_CHECK([ovsdb-client transact 'txn'], [0], [stdout], [ignore]) ]) AT_CHECK([ovsdb-client dump], [0], [stdout], [ignore]) - OVS_WAIT_UNTIL([ ovsdb-client dump unix:db2.sock > dump2; diff stdout dump2]) + OVS_WAIT_UNTIL([ ovsdb-client dump unix:db2.sock > dump2; diff -u stdout dump2]) + OVS_WAIT_UNTIL([ ovsdb-client dump unix:db3.sock > dump3; diff -u stdout dump3]) OVSDB_SERVER_SHUTDOWN OVSDB_SERVER_SHUTDOWN2 + OVSDB_SERVER_SHUTDOWN_N([3]) AT_CLEANUP]) EXECUTION_EXAMPLES @@ -1684,19 +1724,22 @@ AT_BANNER([OVSDB -- ovsdb-server 
replication table-exclusion]) # OVSDB_CHECK_REPLICATION(TITLE, SCHEMA, TRANSACTIONS, OUTPUT, [KEYWORDS]) # -# Creates two databases with the given SCHEMA, and starts an +# Creates three databases with the given SCHEMA, and starts an # ovsdb-server on each database. # Runs each of the TRANSACTIONS (which should be a quoted list of # quoted strings) against one of the servers with ovsdb-client one at a -# time. The server replicates its database to the other ovsdb-server. +# time. The server replicates its database to the other two ovsdb-servers, +# one of which is configured via command line and the other via --config-file. # -# Checks that the difference between the dump of the databases is -# OUTPUT, but UUIDs in the output are replaced by markers of the form -# <N> where N is a number. The first unique UUID is replaced by <0>, +# Checks that the difference between the dump of the first and the other two +# databases is OUTPUT, but UUIDs in the output are replaced by markers of the +# form <N> where N is a number. The first unique UUID is replaced by <0>, # the next by <1>, and so on. # If a given UUID appears more than once it is always replaced by the # same marker. # +# Also checks that the dumps of the second and third databases are the same. +# # TITLE is provided to AT_SETUP and KEYWORDS to AT_KEYWORDS. 
m4_define([OVSDB_CHECK_REPLICATION], [AT_SETUP([$1]) @@ -1705,11 +1748,33 @@ m4_define([OVSDB_CHECK_REPLICATION], $2 > schema AT_CHECK([ovsdb-tool create db1 schema], [0], [stdout], [ignore]) AT_CHECK([ovsdb-tool create db2 schema], [0], [stdout], [ignore]) - - on_exit 'kill `cat *.pid`' - AT_CHECK([ovsdb-server --detach --no-chdir --log-file=ovsdb-server1.log --pidfile --remote=punix:db.sock db1], [0], [ignore], [ignore]) - - AT_CHECK([ovsdb-server --detach --no-chdir --log-file=ovsdb-server2.log --pidfile=2.pid --remote=punix:db2.sock --unixctl=unixctl2 --sync-from=unix:db.sock --sync-exclude-tables=mydb:b db2], [0], [ignore], [ignore]) + AT_CHECK([ovsdb-tool create db3 schema], [0], [stdout], [ignore]) + + on_exit 'kill $(cat *.pid)' + AT_CHECK([ovsdb-server -vfile --detach --no-chdir --log-file=ovsdb-server1.log \ + --pidfile --remote=punix:db.sock db1], [0], [ignore], [ignore]) + + AT_CHECK([ovsdb-server -vfile --detach --no-chdir --log-file=ovsdb-server2.log \ + --pidfile=2.pid --remote=punix:db2.sock --unixctl=unixctl2 \ + --sync-from=unix:db.sock --sync-exclude-tables=mydb:b db2], + [0], [ignore], [ignore]) + + AT_DATA([config3.json], [ + { + "remotes": { "punix:db3.sock": {} }, + "databases": { + "db3": { + "service-model": "active-backup", + "backup": true, + "source": { "unix:db.sock": {} }, + "exclude-tables": [["b"]] + } + } + } +]) + AT_CHECK([ovsdb-server -vfile --detach --no-chdir --log-file=ovsdb-server3.log \ + --pidfile=3.pid --unixctl=unixctl3 --config-file=config3.json], + [0], [ignore], [ignore]) m4_foreach([txn], [$3], [AT_CHECK([ ovsdb-client transact 'txn' ], [0], [stdout], [ignore]) @@ -1722,6 +1787,11 @@ m4_define([OVSDB_CHECK_REPLICATION], AT_CHECK([ovsdb-client dump unix:db2.sock], [0], [stdout], [ignore]) cat stdout > dump2 + OVS_WAIT_UNTIL([ ovsdb-client dump unix:db3.sock | grep one ]) + AT_CHECK([ovsdb-client dump unix:db3.sock], [0], [stdout], [ignore]) + cat stdout > dump3 + AT_CHECK([diff -u dump2 dump3]) + AT_CHECK([diff dump1 
dump2], [1], [stdout], [ignore]) cat stdout > output @@ -1729,6 +1799,7 @@ m4_define([OVSDB_CHECK_REPLICATION], OVSDB_SERVER_SHUTDOWN OVSDB_SERVER_SHUTDOWN2 + OVSDB_SERVER_SHUTDOWN_N([3]) AT_CLEANUP]) REPLICATION_EXAMPLES From 65b22552a06a2cec6ffdcc19f5960b579815d15c Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 9 Jan 2024 23:49:17 +0100 Subject: [PATCH 530/833] tests: ovsdb: Add configuration tests with config file. Add more tests specific to --config-file. Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- tests/ovsdb-server.at | 651 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 651 insertions(+) diff --git a/tests/ovsdb-server.at b/tests/ovsdb-server.at index 488dfc36f54..036e4cc3ba5 100644 --- a/tests/ovsdb-server.at +++ b/tests/ovsdb-server.at @@ -183,6 +183,31 @@ AT_CHECK( OVSDB_SERVER_SHUTDOWN AT_CLEANUP +AT_SETUP([database multiplexing implementation with config file]) +AT_KEYWORDS([ovsdb server positive config-file]) +ordinal_schema > schema1 +constraint_schema > schema2 +AT_CHECK([ovsdb-tool create db1 schema1], [0], [ignore], [ignore]) +AT_CHECK([ovsdb-tool create db2 schema2], [0], [ignore], [ignore]) +on_exit 'kill $(cat *.pid)' + +AT_DATA([config.json], [ +{"remotes" : { "punix:db.sock": {} }, + "databases": { "db1": {}, "db2": { "service-model": "standalone" } } } +]) + +AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile \ + --config-file=config.json], [0], [ignore], [ignore]) +CHECK_DBS([constraints +ordinals +]) +AT_CHECK( + [[ovstest test-jsonrpc request unix:db.sock get_schema [\"nonexistent\"]]], [0], + [[{"error":{"details":"get_schema request specifies unknown database nonexistent","error":"unknown database","syntax":"[\"nonexistent\"]"},"id":0,"result":null} +]]) +OVSDB_SERVER_SHUTDOWN +AT_CLEANUP + AT_SETUP([ovsdb-server/add-db and remove-db]) AT_KEYWORDS([ovsdb server positive]) on_exit 'kill `cat *.pid`' @@ -298,6 +323,155 @@ AT_CHECK([uuidfilt db-change-unaware.stdout], [0], [dnl 
OVSDB_SERVER_SHUTDOWN(["/no database named ordinals/d"]) AT_CLEANUP +AT_SETUP([ovsdb-server/add-db and remove-db with a config file]) +AT_KEYWORDS([ovsdb server positive config-file]) +on_exit 'kill $(cat *.pid)' +ordinal_schema > schema1 +constraint_schema > schema2 +AT_CHECK([ovsdb-tool create db1 schema1], [0], [ignore], [ignore]) +AT_CHECK([ovsdb-tool create db2 schema2], [0], [ignore], [ignore]) + +dnl Start ovsdb-server with just a single database - db1. +AT_DATA([config.json], [ +{ + "remotes": { + "punix:db.sock": {} + }, + "databases": { + "db1": {} + } +} +]) +AT_CAPTURE_FILE([config.json]) +AT_CHECK([ovsdb-server -vfile -vvlog:off --log-file --detach --no-chdir \ + --pidfile --config-file=config.json], [0], [ignore], [ignore]) +CHECK_DBS([ordinals +]) + +dnl Remove the database. +AT_CHECK([sed -i'back' '/db1/d' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload]) +CHECK_DBS([]) + +dnl Start monitoring processes. +AT_CHECK([ovsdb-client --detach --no-chdir --pidfile=ovsdb-client-1.pid \ + --no-db-change-aware --no-headings monitor _Server Database name \ + > db-change-unaware.stdout 2> db-change-unaware.stderr]) +AT_CHECK([ovsdb-client --detach --no-chdir --pidfile=ovsdb-client-2.pid \ + --db-change-aware --no-headings monitor _Server Database name \ + > db-change-aware.stdout 2> db-change-aware.stderr]) +AT_CAPTURE_FILE([db-change-unaware.stdout]) +AT_CAPTURE_FILE([db-change-unaware.stderr]) +AT_CAPTURE_FILE([db-change-aware.stdout]) +AT_CAPTURE_FILE([db-change-aware.stderr]) + +dnl Add the first database back. +AT_CHECK([sed -i'back' '/"databases"/a\ + "db1": {} + ' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload]) +CHECK_DBS([ordinals +]) + +dnl Add the second database. +AT_CHECK([sed -i'back' '/"databases"/a\ + "db2": {}, + ' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload]) +CHECK_DBS([constraints +ordinals +]) + +dnl The databases are responsive. 
+AT_CHECK([ovsdb-client list-tables unix:db.sock constraints], [0], [ignore], [ignore]) +AT_CHECK([ovsdb-client list-tables unix:db.sock ordinals], [0], [ignore], [ignore]) + +dnl Add an already added database. +AT_CHECK([sed -i'back' '/"databases"/a\ + "db2": {}, + ' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload]) + +dnl Fix the config back. +AT_CHECK([sed -i'back' '/db2/d' config.json]) +AT_CHECK([sed -i'back' '/"databases"/a\ + "db2": {}, + ' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload]) + +dnl Add a non-existing database. +AT_CHECK([sed -i'back' '/"databases"/a\ + "db3": {}, + ' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload], [2], [ignore], [ignore]) +OVS_WAIT_UNTIL([grep -q 'failed to configure databases' ovsdb-server.log]) +AT_CHECK([sed -i'back' '/db3/d' config.json]) + +dnl Add a remote through a db path in db1. +AT_CHECK([sed -i'back' '/"remotes"/a\ + "db:ordinals,ordinals,name": {}, + ' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/list-remotes], + [0], [db:ordinals,ordinals,name +punix:db.sock +]) + +dnl Removing db1 has no effect on its remote. +AT_CHECK([sed -i'back' '/db1/d' config.json]) +AT_CHECK([sed -i'back' 's/"db2": {},/"db2": {}/' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload], [2], [ignore], [ignore]) +CHECK_DBS([constraints +]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/list-remotes], + [0], [db:ordinals,ordinals,name +punix:db.sock +]) +AT_CHECK([ovsdb-client list-tables unix:db.sock ordinals], [1], [ignore], [ignore]) + +dnl Remove now missing remote. +AT_CHECK([sed -i'back' '/db:ordinals,ordinals,name/d' config.json]) + +dnl Remove db2. 
+AT_CHECK([sed -i'back' '/db2/d' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload]) +CHECK_DBS() +AT_CHECK([ovsdb-client list-tables unix:db.sock constraints], [1], [ignore], [ignore]) + +dnl Add a removed database. +AT_CHECK([sed -i'back' '/"databases"/a\ + "db2": {} + ' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload]) +CHECK_DBS([constraints +]) +AT_CHECK([ovsdb-client list-tables unix:db.sock constraints], [0], [ignore], [ignore]) + +# Check the monitoring results. +AT_CHECK([uuidfilt db-change-aware.stdout], [0], [dnl +<0> initial _Server + +<1> insert ordinals + +<2> insert constraints + +<1> delete ordinals + +<2> delete constraints + +<3> insert constraints +]) +AT_CHECK([uuidfilt db-change-unaware.stdout], [0], [dnl +<0> initial _Server +]) + +OVSDB_SERVER_SHUTDOWN([" + /no database named ordinals/d + /failed to open database 'db3'/d + /failed to configure databases/d +"]) +AT_CLEANUP + AT_SETUP([ovsdb-server/add-db with --monitor]) AT_KEYWORDS([ovsdb server positive]) AT_SKIP_IF([test "$IS_WIN32" = "yes"]) @@ -499,6 +673,81 @@ AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/list-remotes]) OVSDB_SERVER_SHUTDOWN AT_CLEANUP +AT_SETUP([ovsdb-server/add-remote and remove-remote with config file]) +AT_KEYWORDS([ovsdb server positive config-file]) +ordinal_schema > schema +AT_CHECK([ovsdb-tool create db schema], [0], [ignore], [ignore]) +on_exit 'kill $(cat *.pid)' + +AT_DATA([config.json], [ +{ + "remotes": { + }, + "databases": { "db": {} } +} +]) +AT_CAPTURE_FILE([config.json]) + +AT_CHECK([ovsdb-server -vfile --detach --no-chdir --log-file --pidfile \ + --config-file=config.json], [0], [ignore], [ignore]) + +AT_CHECK([test ! 
-e socket1]) +AT_CHECK([sed -i'back' '/"remotes"/a\ + "punix:socket1": {} + ' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload]) +if test "$IS_WIN32" = "yes"; then + OVS_WAIT_UNTIL([test -e socket1]) +else + OVS_WAIT_UNTIL([test -S socket1]) +fi +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/list-remotes], + [0], [punix:socket1 +]) + +AT_CHECK([test ! -e socket2]) +AT_CHECK([sed -i'back' '/"remotes"/a\ + "punix:socket2": {}, + ' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload]) +if test "$IS_WIN32" = "yes"; then + OVS_WAIT_UNTIL([test -e socket2]) +else + OVS_WAIT_UNTIL([test -S socket2]) +fi +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/list-remotes], + [0], [punix:socket1 +punix:socket2 +]) + +AT_CHECK([sed -i'back' '/"remotes"/a\ + "db:x,y,z": {}, + ' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload], [2], [ignore], [ignore]) +OVS_WAIT_UNTIL([grep -q '"db:x,y,z": no database named x' ovsdb-server.log]) +AT_CHECK([sed -i'back' '/db:x,y,z/d' config.json]) + +AT_CHECK([sed -i'back' '/punix:socket1/d' config.json]) +AT_CHECK([sed -i'back' 's/"punix:socket2": {},/"punix:socket2": {}/' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload]) +OVS_WAIT_UNTIL([test ! -e socket1]) +if test "$IS_WIN32" = "yes"; then + AT_CHECK([test -e socket2]) +else + AT_CHECK([test -S socket2]) +fi +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/list-remotes], + [0], [punix:socket2 +]) + +AT_CHECK([sed -i'back' '/punix:socket2/d' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload]) +OVS_WAIT_UNTIL([test ! -e socket2]) +AT_CHECK([test ! 
-e socket1]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/list-remotes]) +OVSDB_SERVER_SHUTDOWN(['/"db:x,y,z": no database named x/d']) +AT_CLEANUP + AT_SETUP([ovsdb-server/add-remote with --monitor]) AT_KEYWORDS([ovsdb server positive]) AT_SKIP_IF([test "$IS_WIN32" = "yes"]) @@ -2108,6 +2357,140 @@ dnl OVSDB_SERVER_SHUTDOWN dnl OVSDB_SERVER_SHUTDOWN2 AT_CLEANUP +AT_SETUP([ovsdb-server/active-backup-role-switching with config file]) +AT_KEYWORDS([ovsdb server replication active-backup-switching config-file]) +replication_schema > schema +AT_CHECK([ovsdb-tool create db1 schema], [0], [stdout], [ignore]) +AT_CHECK([ovsdb-tool create db2 schema], [0], [stdout], [ignore]) + +dnl Add some data to both DBs. +AT_CHECK([ovsdb-tool transact db1 \ +'[["mydb", + {"op": "insert", + "table": "a", + "row": {"number": 9, "name": "nine"}}]]'], [0], [ignore], [ignore]) + +AT_CHECK([ovsdb-tool transact db2 \ +'[["mydb", + {"op": "insert", + "table": "a", + "row": {"number": 9, "name": "nine"}}]]'], [0], [ignore], [ignore]) + +dnl Start both 'db1' and 'db2' in backup mode. Let them backup from each +dnl other. This is not a supported operation state, but to simulate a start +dnl up condition where an HA manager can select which one to be an active +dnl server soon after. 
+on_exit 'kill $(cat *.pid)' + +AT_DATA([config1.json], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { + "db1": { + "service-model": "active-backup", + "backup": true, + "source": { "unix:db2.sock": {} } + } + } +} +]) + +AT_CHECK([ovsdb-server -vfile --detach --no-chdir --log-file=ovsdb-server1.log \ + --pidfile=1.pid --unixctl=unixctl1 --config-file=config1.json], + [0], [ignore], [ignore]) + +AT_DATA([config2.json], [ +{ + "remotes": { "punix:db2.sock": {} }, + "databases": { + "db2": { + "service-model": "active-backup", + "backup": true, + "source": { "unix:db.sock": {} } + } + } +} +]) +AT_CHECK([ovsdb-server -vfile --detach --no-chdir --log-file=ovsdb-server2.log \ + --pidfile=2.pid --unixctl=unixctl2 --config-file=config2.json], + [0], [ignore], [ignore]) + +dnl Make sure both servers reached the replication state. +OVS_WAIT_UNTIL([ovs-appctl -t $(pwd)/unixctl1 ovsdb-server/sync-status | grep replicating]) +OVS_WAIT_UNTIL([ovs-appctl -t $(pwd)/unixctl2 ovsdb-server/sync-status | grep replicating]) + +dnl Switch the 'db1' to active. +AT_CHECK([sed -i'back' 's/"backup": true/"backup": false/' config1.json]) +AT_CHECK([ovs-appctl -t $(pwd)/unixctl1 ovsdb-server/reload]) +AT_CHECK([ovs-appctl -t $(pwd)/unixctl1 ovsdb-server/sync-status], [0], [dnl +database: mydb +state: active +]) + +dnl Issue a transaction to 'db1'. +AT_CHECK([ovsdb-client transact unix:db.sock \ +'[["mydb", + {"op": "insert", + "table": "a", + "row": {"number": 0, "name": "zero"}}]]'], [0], [ignore]) + +dnl It should be replicated to 'db2'. +OVS_WAIT_UNTIL([ovsdb-client dump unix:db2.sock | grep zero]) + +dnl Issue a transaction to 'db2', it should fail. +AT_CHECK([ovsdb-client transact unix:db2.sock \ +'[["mydb", + {"op": "insert", + "table": "a", + "row": {"number": 1, "name": "one"}}]]'], [0], [dnl +[[{"details":"insert operation not allowed when database server is in read only mode","error":"not allowed"}]] +]) + +dnl Flip the role of 'db1' and 'db2'. 
'db1' becomes backup, and 'db2' becomes active. +AT_CHECK([sed -i'back' 's/"backup": true/"backup": false/' config2.json]) +AT_CHECK([ovs-appctl -t $(pwd)/unixctl2 ovsdb-server/reload]) +AT_CHECK([sed -i'back' 's/"backup": false/"backup": true/' config1.json]) +AT_CHECK([ovs-appctl -t $(pwd)/unixctl1 ovsdb-server/reload]) + +dnl Verify the change happened. +OVS_WAIT_UNTIL([ovs-appctl -t $(pwd)/unixctl1 ovsdb-server/sync-status | grep replicating]) +AT_CHECK([ovs-appctl -t $(pwd)/unixctl2 ovsdb-server/sync-status], [0], [dnl +database: mydb +state: active +]) + +dnl Issue a transaction to 'db2' which is now active. +AT_CHECK([ovsdb-client transact unix:db2.sock \ +'[["mydb", + {"op": "insert", + "table": "b", + "row": {"number": 1, "name": "one"}}]]'], [0], [ignore]) + +dnl The transaction should be replicated to 'db1'. +OVS_WAIT_UNTIL([ovsdb-client dump unix:db.sock | grep one]) + +dnl Issue a transaction to 'db1', it should fail. +AT_CHECK([ovsdb-client transact unix:db.sock \ +'[["mydb", + {"op": "insert", + "table": "a", + "row": {"number": 2, "name": "two"}}]]'], [0], [dnl +[[{"details":"insert operation not allowed when database server is in read only mode","error":"not allowed"}]] +]) + +dnl Both servers should have the same content. 
+AT_CHECK([ovsdb-client dump unix:db.sock], [0], [stdout]) +cat stdout > dump1 + +AT_CHECK([ovsdb-client dump unix:db2.sock], [0], [stdout]) +cat stdout > dump2 + +AT_CHECK([diff -u dump1 dump2]) + +OVSDB_SERVER_SHUTDOWN_N([1]) +OVSDB_SERVER_SHUTDOWN2 +AT_CLEANUP + #ovsdb-server prevent self replicating AT_SETUP([ovsdb-server prevent self replicating]) AT_KEYWORDS([ovsdb server replication]) @@ -2472,3 +2855,271 @@ AT_CHECK([diff db.clear ./replay_dir/db.copy.clear]) AT_CHECK([diff -u 1.log.clear 2.log.clear]) AT_CLEANUP + +AT_BANNER([OVSDB -- ovsdb-server configuration file]) + +dnl TEST_CONFIG_FILE([NAME], [config], [EXIT_CODE], [FAILURE_STRINGS]) +dnl +dnl Tries the config as a data for --config-file, checks the EXIT_CODE +dnl of the ovsdb-server and checks the stderr for FAILURE_STRINGS. +dnl NAME is added to the test name and keywords. +m4_define([TEST_CONFIG_FILE], +[ + AT_SETUP([ovsdb-server config-file - $1]) + AT_KEYWORDS([ovsdb server config-file $1]) + on_exit 'kill $(cat *.pid)' + echo '$2' > config.json + AT_CAPTURE_FILE([config.json]) + ordinal_schema > schema + constraint_schema > schema2 + AT_CHECK([ovsdb-tool create db schema], [0], [ignore], [ignore]) + AT_CHECK([ovsdb-tool create db2 schema], [0], [ignore], [ignore]) + AT_CHECK([ovsdb-tool create-cluster db_cluster schema2 unix:s1.raft], + [0], [ignore], [ignore]) + AT_CHECK([ovsdb-server -vfile -vPATTERN:console:'%p|%m' -vvlog:off \ + --log-file --detach --no-chdir --pidfile \ + --config-file=config.json], [$3], [ignore], [stderr]) + m4_if([$4], [], [], [ + AT_CHECK([cat stderr | grep -v -E 'INFO|DBG' \ + | grep -v 'failed to load configuration from' > warnings]) + AT_CHECK([cat warnings], [0], [m4_if([$3], [0], [$4], [$4 +ovsdb-server: server configuration failed +])])]) + m4_if([$3$4], [0], [OVSDB_SERVER_SHUTDOWN]) + AT_CLEANUP +]) + +TEST_CONFIG_FILE([simple], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { "db": null, "db_cluster": {} } +} +], [0]) + 
+TEST_CONFIG_FILE([standalone], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { "db": { "service-model": "standalone" } } +} +], [0]) + +TEST_CONFIG_FILE([clustered], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { "db_cluster": { "service-model": "clustered" } } +} +], [0]) + +TEST_CONFIG_FILE([unknown service model], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { "db": { "service-model": "not-a-service-model" } } +} +], [1], [dnl +WARN|Unrecognized database service model: 'not-a-service-model' +WARN|syntax "{"service-model":"not-a-service-model"}": syntax error:dnl + Parsing database db failed: 'not-a-service-model' is not a valid service model +WARN|config: failed to parse 'databases']) + +TEST_CONFIG_FILE([same schema], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { "db": null, "db2": {} } +} +], [1], [dnl +WARN|failed to open database 'db2': ovsdb error: ordinals: duplicate database name +WARN|failed to configure databases]) + +TEST_CONFIG_FILE([model mismatch], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { "db": { "service-model": "clustered" } } +} +], [1], [dnl +WARN|failed to open database 'db': ovsdb error: db: database is standalone and not clustered +WARN|failed to configure databases]) + +TEST_CONFIG_FILE([model mismatch clustered], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { "db_cluster": { "service-model": "standalone" } } +} +], [1], [dnl +WARN|failed to open database 'db_cluster': ovsdb error: db_cluster: database is clustered and not standalone +WARN|failed to configure databases]) + +TEST_CONFIG_FILE([relay], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { + "RelaySchema": { + "service-model": "relay", + "source": { "unix:db2.sock": {} } + } + } +} +], [0]) + +TEST_CONFIG_FILE([relay without source], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { + "RelaySchema": { + "service-model": "relay" + } + } +} +], [1], [dnl +WARN|syntax 
"{"service-model":"relay"}": syntax error: Parsing database RelaySchema failed:dnl + Required 'source' member is missing. +WARN|config: failed to parse 'databases']) + +TEST_CONFIG_FILE([relay with options], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { + "RelaySchema": { + "service-model": "relay", + "source": { + "punix:db2.sock": { + "inactivity-probe": 10000, + "max-backoff": 8000, + "dscp": 42 + } + } + } + } +} +], [0]) + +TEST_CONFIG_FILE([relay with unrelated options], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { + "RelaySchema": { + "service-model": "relay", + "source": { + "punix:db2.sock": { + "inactivity-probe": 10000, + "max-backoff": 8000, + "dscp": 42, + "role": "My-RBAC-role" + } + } + } + } +} +], [0], [dnl +WARN|syntax "{"dscp":42,"inactivity-probe":10000,"max-backoff":8000,"role":"My-RBAC-role"}":dnl + syntax error: Parsing JSON-RPC options failed: Member 'role' is present but not allowed here. +]) + +TEST_CONFIG_FILE([unknown config], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { + "db": { "unknnown": "unknown" } + } +} +], [1], [dnl +WARN|syntax "{"unknnown":"unknown"}": syntax error: Parsing database db failed:dnl + Member 'unknnown' is present but not allowed here. 
+WARN|config: failed to parse 'databases']) + +TEST_CONFIG_FILE([active-backup active], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { + "db": { + "service-model": "active-backup", + "backup": false + } + } +} +], [0]) + +TEST_CONFIG_FILE([active-backup backup], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { + "db": { + "service-model": "active-backup", + "backup": true, + "source": { + "punix:db2.sock": { + "inactivity-probe": 100000, + "max-backoff": 16000, + "dscp": 42 + } + } + } + } +} +], [0]) + +TEST_CONFIG_FILE([active-backup backup without source], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { + "db": { + "service-model": "active-backup", + "backup": true + } + } +} +], [1], [dnl +WARN|syntax "{"backup":true,"service-model":"active-backup"}": syntax error:dnl + Parsing database db failed: Required 'source' member is missing. +WARN|config: failed to parse 'databases']) + +TEST_CONFIG_FILE([syntax error], [ +{ + "remotes": { "punix:db.sock": {}, }, + "databases": { "db": {}, "db_cluster": {} } +} +], [1], [dnl +WARN|config: reading JSON failed (line 2, column 38, byte 41: syntax error parsing object expecting string)]) + +TEST_CONFIG_FILE([complex config], [ +{ + "remotes": { + "punix:db.sock": { + "inactivity-probe": 0, + "read-only": false + }, + "pssl:0:127.0.0.1": { + "inactivity-probe": 5000, + "max-backoff": 8000, + "read-only": true, + "role": "ovn-controller", + "dscp": 48 + }, + "db:ordinals,ordinals,name": null + }, + "databases": { + "db_cluster": { + "service-model": "clustered" + }, + "OVN_Northbound": { + "service-model": "relay", + "source": { + "unix:nb.sock": { + "max-backoff": 3000, + "inactivity-probe": 16000 + } + } + }, + "db": { + "service-model": "active-backup", + "backup": true, + "source": { + "unix:active.sock": { + "max-backoff": 16000, + "inactivity-probe": 180000 + } + }, + "exclude-tables": [["IC_SB_Global", "Availability_Zone"]] + } + } +} +], [0]) From 
8893e24d9d0921aaf934f263a06ba223ef0db369 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Mon, 15 Jan 2024 15:28:14 +0100 Subject: [PATCH 531/833] dpdk: Update to use v23.11. This commit adds support for DPDK v23.11. It updates the CI script and documentation and includes the following changes coming from the dpdk-latest branch: - sparse: Add some compiler intrinsics for DPDK build. https://patchwork.ozlabs.org/project/openvswitch/list/?series=371129&state=* - ci: Cache DPDK installed libraries only. - ci: Reduce optional libraries in DPDK. https://patchwork.ozlabs.org/project/openvswitch/list/?series=383367&state=* - system-dpdk: Ignore net/ice error log about QinQ offloading. https://patchwork.ozlabs.org/project/openvswitch/list/?series=385259&state=* There is a known issue with i40e VF devices where OVS main thread may block when adding such devices as dpif-netdev dpdk ports. Acked-by: Kevin Traynor Signed-off-by: David Marchand Signed-off-by: Ilya Maximets --- .ci/dpdk-build.sh | 28 +++++++++++++++--------- .ci/linux-build.sh | 9 ++++---- .github/workflows/build-and-test.yml | 4 ++-- Documentation/faq/releases.rst | 2 +- Documentation/intro/install/dpdk.rst | 16 +++++++------- Documentation/topics/dpdk/phy.rst | 12 +++++----- Documentation/topics/dpdk/vdev.rst | 2 +- Documentation/topics/dpdk/vhost-user.rst | 2 +- Documentation/topics/testing.rst | 2 +- Documentation/topics/userspace-tso.rst | 2 +- NEWS | 9 ++++++++ debian/control.in | 2 +- include/sparse/automake.mk | 1 + include/sparse/ia32intrin.h | 23 +++++++++++++++++++ rhel/openvswitch-fedora.spec.in | 2 +- tests/system-dpdk-macros.at | 1 + 16 files changed, 80 insertions(+), 37 deletions(-) create mode 100644 include/sparse/ia32intrin.h diff --git a/.ci/dpdk-build.sh b/.ci/dpdk-build.sh index d4c178ee0df..23f3166a548 100755 --- a/.ci/dpdk-build.sh +++ b/.ci/dpdk-build.sh @@ -5,25 +5,27 @@ set -x function build_dpdk() { - local VERSION_FILE="dpdk-dir/cached-version" local DPDK_VER=$1 local DPDK_OPTS="" 
+ local DPDK_INSTALL_DIR="$(pwd)/dpdk-dir" + local VERSION_FILE="$DPDK_INSTALL_DIR/cached-version" - rm -rf dpdk-dir + rm -rf dpdk-src + rm -rf $DPDK_INSTALL_DIR if [ "${DPDK_VER##refs/*/}" != "${DPDK_VER}" ]; then - git clone --single-branch $DPDK_GIT dpdk-dir -b "${DPDK_VER##refs/*/}" - pushd dpdk-dir + git clone --single-branch $DPDK_GIT dpdk-src -b "${DPDK_VER##refs/*/}" + pushd dpdk-src git log -1 --oneline else wget https://fast.dpdk.org/rel/dpdk-$1.tar.xz tar xvf dpdk-$1.tar.xz > /dev/null DIR_NAME=$(tar -tf dpdk-$1.tar.xz | head -1 | cut -f1 -d"/") - mv ${DIR_NAME} dpdk-dir - pushd dpdk-dir + mv ${DIR_NAME} dpdk-src + pushd dpdk-src fi - # Switching to 'default' machine to make dpdk-dir cache usable on + # Switching to 'default' machine to make the dpdk cache usable on # different CPUs. We can't be sure that all CI machines are exactly same. DPDK_OPTS="$DPDK_OPTS -Dmachine=default" @@ -40,16 +42,22 @@ function build_dpdk() DPDK_OPTS="$DPDK_OPTS -Denable_apps=test-pmd" enable_drivers="net/null,net/af_xdp,net/tap,net/virtio,net/pcap" DPDK_OPTS="$DPDK_OPTS -Denable_drivers=$enable_drivers" + # OVS depends on the vhost library (and its dependencies). + # net/tap depends on the gso library. + DPDK_OPTS="$DPDK_OPTS -Denable_libs=cryptodev,dmadev,gso,vhost" # Install DPDK using prefix. - DPDK_OPTS="$DPDK_OPTS --prefix=$(pwd)/build" + DPDK_OPTS="$DPDK_OPTS --prefix=$DPDK_INSTALL_DIR" meson $DPDK_OPTS build ninja -C build ninja -C build install - - echo "Installed DPDK in $(pwd)" popd + + # Remove examples sources. 
+ rm -rf $DPDK_INSTALL_DIR/share/dpdk/examples + + echo "Installed DPDK in $DPDK_INSTALL_DIR" echo "${DPDK_VER}" > ${VERSION_FILE} } diff --git a/.ci/linux-build.sh b/.ci/linux-build.sh index 90581c10b7f..cf1462a0c4a 100755 --- a/.ci/linux-build.sh +++ b/.ci/linux-build.sh @@ -10,8 +10,9 @@ JOBS=${JOBS:-"-j4"} function install_dpdk() { - local VERSION_FILE="dpdk-dir/cached-version" - local DPDK_LIB=$(pwd)/dpdk-dir/build/lib/x86_64-linux-gnu + local DPDK_INSTALL_DIR="$(pwd)/dpdk-dir" + local VERSION_FILE="${DPDK_INSTALL_DIR}/cached-version" + local DPDK_LIB=${DPDK_INSTALL_DIR}/lib/x86_64-linux-gnu if [ "$DPDK_SHARED" ]; then EXTRA_OPTS="$EXTRA_OPTS --with-dpdk=shared" @@ -27,13 +28,13 @@ function install_dpdk() export PATH=$(pwd)/dpdk-dir/build/bin:$PATH if [ ! -f "${VERSION_FILE}" ]; then - echo "Could not find DPDK in $(pwd)/dpdk-dir" + echo "Could not find DPDK in $DPDK_INSTALL_DIR" return 1 fi # Update the library paths. sudo ldconfig - echo "Found cached DPDK $(cat ${VERSION_FILE}) build in $(pwd)/dpdk-dir" + echo "Found cached DPDK $(cat ${VERSION_FILE}) build in $DPDK_INSTALL_DIR" } function configure_ovs() diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 710757693d2..7bfb42a422d 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -7,8 +7,8 @@ jobs: env: dependencies: gcc libbpf-dev libnuma-dev libpcap-dev ninja-build pkgconf CC: gcc - DPDK_GIT: https://dpdk.org/git/dpdk-stable - DPDK_VER: 22.11.1 + DPDK_GIT: https://dpdk.org/git/dpdk + DPDK_VER: 23.11 name: dpdk gcc outputs: dpdk_key: ${{ steps.gen_dpdk_key.outputs.key }} diff --git a/Documentation/faq/releases.rst b/Documentation/faq/releases.rst index aa69eefa131..da185ae1dc4 100644 --- a/Documentation/faq/releases.rst +++ b/Documentation/faq/releases.rst @@ -236,7 +236,7 @@ Q: Are all the DPDK releases that OVS versions work with maintained? 
The latest information about DPDK stable and LTS releases can be found at `DPDK stable`_. -.. _DPDK stable: http://doc.dpdk.org/guides-22.11/contributing/stable.html +.. _DPDK stable: http://doc.dpdk.org/guides-23.11/contributing/stable.html Q: I get an error like this when I configure Open vSwitch: diff --git a/Documentation/intro/install/dpdk.rst b/Documentation/intro/install/dpdk.rst index 63a0ebb23bb..ad9bdf22c06 100644 --- a/Documentation/intro/install/dpdk.rst +++ b/Documentation/intro/install/dpdk.rst @@ -42,7 +42,7 @@ Build requirements In addition to the requirements described in :doc:`general`, building Open vSwitch with DPDK will require the following: -- DPDK 22.11.1 +- DPDK 23.11 - A `DPDK supported NIC`_ @@ -59,8 +59,8 @@ vSwitch with DPDK will require the following: Detailed system requirements can be found at `DPDK requirements`_. -.. _DPDK supported NIC: https://doc.dpdk.org/guides-22.11/nics/index.html -.. _DPDK requirements: https://doc.dpdk.org/guides-22.11/linux_gsg/sys_reqs.html +.. _DPDK supported NIC: https://doc.dpdk.org/guides-23.11/nics/index.html +.. _DPDK requirements: https://doc.dpdk.org/guides-23.11/linux_gsg/sys_reqs.html .. _dpdk-install: @@ -73,9 +73,9 @@ Install DPDK #. Download the `DPDK sources`_, extract the file and set ``DPDK_DIR``:: $ cd /usr/src/ - $ wget https://fast.dpdk.org/rel/dpdk-22.11.1.tar.xz - $ tar xf dpdk-22.11.1.tar.xz - $ export DPDK_DIR=/usr/src/dpdk-stable-22.11.1 + $ wget https://fast.dpdk.org/rel/dpdk-23.11.tar.xz + $ tar xf dpdk-23.11.tar.xz + $ export DPDK_DIR=/usr/src/dpdk-23.11 $ cd $DPDK_DIR #. Configure and install DPDK using Meson @@ -121,7 +121,7 @@ Install DPDK .. _DPDK sources: http://dpdk.org/rel .. _DPDK documentation: - https://doc.dpdk.org/guides-22.11/linux_gsg/build_dpdk.html + https://doc.dpdk.org/guides-23.11/linux_gsg/build_dpdk.html Install OVS ~~~~~~~~~~~ @@ -722,7 +722,7 @@ Limitations release notes`_. .. 
_DPDK release notes: - https://doc.dpdk.org/guides-22.11/rel_notes/release_22_11.html + https://doc.dpdk.org/guides-23.11/rel_notes/release_23_11.html - Upper bound MTU: DPDK device drivers differ in how the L2 frame for a given MTU value is calculated e.g. i40e driver includes 2 x vlan headers in diff --git a/Documentation/topics/dpdk/phy.rst b/Documentation/topics/dpdk/phy.rst index 41cc3588abf..d94eafc9a9b 100644 --- a/Documentation/topics/dpdk/phy.rst +++ b/Documentation/topics/dpdk/phy.rst @@ -117,7 +117,7 @@ tool:: For more information, refer to the `DPDK drivers documentation`_. -.. _DPDK drivers documentation: https://doc.dpdk.org/guides-22.11/linux_gsg/linux_drivers.html +.. _DPDK drivers documentation: https://doc.dpdk.org/guides-23.11/linux_gsg/linux_drivers.html .. _dpdk-phy-multiqueue: @@ -148,14 +148,14 @@ situation. Some physical NICs can be programmed to put these protocols in a dedicated hardware Rx queue using the rte_flow__ API. -__ https://doc.dpdk.org/guides-22.11/prog_guide/rte_flow.html +__ https://doc.dpdk.org/guides-23.11/prog_guide/rte_flow.html .. warning:: This feature is not compatible with all NICs. Refer to the DPDK `compatibilty matrix`__ and vendor documentation for more details. - __ https://doc.dpdk.org/guides-22.11/nics/overview.html + __ https://doc.dpdk.org/guides-23.11/nics/overview.html Rx steering must be enabled for specific protocols per port. The ``rx-steering`` option takes one of the following values: @@ -322,7 +322,7 @@ To hotplug a port with igb_uio in this case, DPDK must be configured to use physical addressing for IOVA mode. For more information regarding IOVA modes in DPDK please refer to the `DPDK IOVA Mode Detection`__. 
-__ https://doc.dpdk.org/guides-22.11/prog_guide/env_abstraction_layer.html#iova-mode-detection +__ https://doc.dpdk.org/guides-23.11/prog_guide/env_abstraction_layer.html#iova-mode-detection To configure OVS DPDK to use physical addressing for IOVA:: @@ -354,7 +354,7 @@ Representors are multi devices created on top of one PF. For more information, refer to the `DPDK documentation`__. -__ https://doc.dpdk.org/guides-22.11/prog_guide/switch_representation.html#port-representors +__ https://doc.dpdk.org/guides-23.11/prog_guide/switch_representation.html#port-representors Prior to port representors there was a one-to-one relationship between the PF and the eth device. With port representors the relationship becomes one PF to @@ -488,7 +488,7 @@ in the ``options`` column of the ``Interface`` table. kernel netdevice, and be inherited from it when Open vSwitch is restarted, even if the options described in this section are unset from Open vSwitch. -.. _bifurcated drivers: https://doc.dpdk.org/guides-22.11/linux_gsg/linux_drivers.html#bifurcated-driver +.. _bifurcated drivers: https://doc.dpdk.org/guides-23.11/linux_gsg/linux_drivers.html#bifurcated-driver - Configure the VF MAC address:: diff --git a/Documentation/topics/dpdk/vdev.rst b/Documentation/topics/dpdk/vdev.rst index 3383afce562..f1f59af5d95 100644 --- a/Documentation/topics/dpdk/vdev.rst +++ b/Documentation/topics/dpdk/vdev.rst @@ -63,4 +63,4 @@ run:: More information on the different types of virtual DPDK PMDs can be found in the `DPDK documentation`__. 
-__ https://doc.dpdk.org/guides-22.11/nics/overview.html +__ https://doc.dpdk.org/guides-23.11/nics/overview.html diff --git a/Documentation/topics/dpdk/vhost-user.rst b/Documentation/topics/dpdk/vhost-user.rst index 3a5f5be9887..e952a686b55 100644 --- a/Documentation/topics/dpdk/vhost-user.rst +++ b/Documentation/topics/dpdk/vhost-user.rst @@ -539,4 +539,4 @@ shown with:: Further information can be found in the `DPDK documentation -`__ +`__ diff --git a/Documentation/topics/testing.rst b/Documentation/topics/testing.rst index fb9b3e77b10..c6093463d31 100644 --- a/Documentation/topics/testing.rst +++ b/Documentation/topics/testing.rst @@ -358,7 +358,7 @@ with a mlx5 device:: All tests are skipped if no hugepages are configured. User must look into the DPDK manual to figure out how to `Configure hugepages`_. -.. _Configure hugepages: https://doc.dpdk.org/guides-22.11/linux_gsg/sys_reqs.html +.. _Configure hugepages: https://doc.dpdk.org/guides-23.11/linux_gsg/sys_reqs.html All the features documented under `Unit Tests`_ are available for the DPDK testsuite. diff --git a/Documentation/topics/userspace-tso.rst b/Documentation/topics/userspace-tso.rst index c4b15f2604a..a21bb2b5dee 100644 --- a/Documentation/topics/userspace-tso.rst +++ b/Documentation/topics/userspace-tso.rst @@ -46,7 +46,7 @@ datasheet for compatibility. Secondly, the NIC must have an associated DPDK Poll Mode Driver (PMD) which supports `TSO`. For a list of features per PMD, refer to the `DPDK documentation`__. -__ https://doc.dpdk.org/guides-22.11/nics/overview.html +__ https://doc.dpdk.org/guides-23.11/nics/overview.html Enabling TSO ~~~~~~~~~~~~ diff --git a/NEWS b/NEWS index 49d74b0b951..410dd68e512 100644 --- a/NEWS +++ b/NEWS @@ -43,6 +43,15 @@ Post-v3.2.0 * The userspace conntrack module no longer requires the user to specify connection helpers in all flow rules. Instead, the helper specified during connection commit will be used by default. + - DPDK: + * Add support for DPDK 23.11. 
+ +Known issues: + - DPDK: v23.11 has a change in behavior in handling i40e VF devices. This + may block and prevent OVS from adding such devices as ports in a netdev + datapath bridge. + For the details, see https://bugs.dpdk.org/show_bug.cgi?id=1337 which + describes the issue first detected in the 21.11 LTS branch. v3.2.0 - 17 Aug 2023 diff --git a/debian/control.in b/debian/control.in index 64b0a4ce018..f9eea897ed9 100644 --- a/debian/control.in +++ b/debian/control.in @@ -21,7 +21,7 @@ Build-Depends: iproute2, libcap-ng-dev, libdbus-1-dev [amd64 i386 ppc64el arm64], -# DPDK_NETDEV libdpdk-dev (>= 22.11) [amd64 i386 ppc64el arm64], +# DPDK_NETDEV libdpdk-dev (>= 23.11) [amd64 i386 ppc64el arm64], libnuma-dev [amd64 i386 ppc64el arm64], libpcap-dev [amd64 i386 ppc64el arm64], libssl-dev, diff --git a/include/sparse/automake.mk b/include/sparse/automake.mk index e966371192b..c1229870bb8 100644 --- a/include/sparse/automake.mk +++ b/include/sparse/automake.mk @@ -4,6 +4,7 @@ noinst_HEADERS += \ include/sparse/arpa/inet.h \ include/sparse/bits/floatn.h \ include/sparse/assert.h \ + include/sparse/ia32intrin.h \ include/sparse/math.h \ include/sparse/numa.h \ include/sparse/netinet/in.h \ diff --git a/include/sparse/ia32intrin.h b/include/sparse/ia32intrin.h new file mode 100644 index 00000000000..5045bf38d96 --- /dev/null +++ b/include/sparse/ia32intrin.h @@ -0,0 +1,23 @@ +/* Copyright (c) 2023 Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __CHECKER__ +#error "Use this header only with sparse. It is not a correct implementation." +#endif + +#define __builtin_ia32_rdtsc() (unsigned long long) 0 + +/* Get actual definitions for us to annotate and build on. */ +#include_next diff --git a/rhel/openvswitch-fedora.spec.in b/rhel/openvswitch-fedora.spec.in index 343a5716d16..5d24ebcda8b 100644 --- a/rhel/openvswitch-fedora.spec.in +++ b/rhel/openvswitch-fedora.spec.in @@ -71,7 +71,7 @@ BuildRequires: libcap-ng libcap-ng-devel %endif %if %{with dpdk} BuildRequires: libpcap-devel numactl-devel -BuildRequires: dpdk-devel >= 22.11 +BuildRequires: dpdk-devel >= 23.11 Provides: %{name}-dpdk = %{version}-%{release} %endif %if %{with afxdp} diff --git a/tests/system-dpdk-macros.at b/tests/system-dpdk-macros.at index 3b5a3512d43..7cf9bac1700 100644 --- a/tests/system-dpdk-macros.at +++ b/tests/system-dpdk-macros.at @@ -80,6 +80,7 @@ $1";/does not exist. The Open vSwitch kernel module is probably not loaded./d /does not support MTU configuration,/d /EAL: No \(available\|free\) .*hugepages reported/d /Failed to enable flow control/d +/ice_vsi_config_outer_vlan_stripping(): Single VLAN mode (SVM) does not support qinq/d /Rx checksum offload is not supported on/d /TELEMETRY: No legacy callbacks, legacy socket not created/d"]) ]) From bdf2f16771941569dfc591ab9cb013f427e450c4 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 16 Jan 2024 16:47:49 +0100 Subject: [PATCH 532/833] tests: Fix 'long flow dump duration' failures due to large time warp. Large time warps can cause the 'long flow dump duration' log message to happen. However, due to the level change, they could now cause failures. This patch will stop these messages from failing the tests. 
Fixes: 9bcfb8fb7784 ("ofproto-dpif-upcall: Change flow dump duration message to WARN level.") Signed-off-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- tests/ofproto-macros.at | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/ofproto-macros.at b/tests/ofproto-macros.at index 932208debe5..c870cf8197c 100644 --- a/tests/ofproto-macros.at +++ b/tests/ofproto-macros.at @@ -254,6 +254,9 @@ check_logs () { # we ignore the messages that were rate-limited, we can end up failing just # because of the announcement that rate-limiting happened (and in a racy, # timing-dependent way, too). + # + # We also ignore the "Spent an unreasonably long XXms dumping flows" as + # they can appear when large time/warps are used during tests. sed -n "$1 /reset by peer/d /Broken pipe/d @@ -266,6 +269,7 @@ check_logs () { /Dropped [[0-9]]* log messages/d /setting extended ack support failed/d /ETHTOOL_GSSET_INFO/d +/Spent an unreasonably long .*ms dumping flows/d /|WARN|/p /|ERR|/p /|EMER|/p" ${logs} From da093acc7dde5873e968d30debe50e2c64c6b3ed Mon Sep 17 00:00:00 2001 From: Ivan Malov Date: Tue, 16 Jan 2024 05:42:55 +0400 Subject: [PATCH 533/833] netdev-offload-dpdk: Replace action PORT_ID with REPRESENTED_PORT. Action PORT_ID has been deprecated. Use REPRESENTED_PORT instead. 
Acked-by: Kevin Traynor Signed-off-by: Ivan Malov Signed-off-by: Ilya Maximets --- lib/netdev-offload-dpdk.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c index 992627fa231..623005b1cb9 100644 --- a/lib/netdev-offload-dpdk.c +++ b/lib/netdev-offload-dpdk.c @@ -735,14 +735,15 @@ dump_flow_action(struct ds *s, struct ds *s_extra, ds_put_cstr(s, "rss / "); } else if (actions->type == RTE_FLOW_ACTION_TYPE_COUNT) { ds_put_cstr(s, "count / "); - } else if (actions->type == RTE_FLOW_ACTION_TYPE_PORT_ID) { - const struct rte_flow_action_port_id *port_id = actions->conf; + } else if (actions->type == RTE_FLOW_ACTION_TYPE_REPRESENTED_PORT) { + const struct rte_flow_action_ethdev *ethdev = actions->conf; - ds_put_cstr(s, "port_id "); - if (port_id) { - ds_put_format(s, "original %d id %d ", - port_id->original, port_id->id); + ds_put_cstr(s, "represented_port "); + + if (ethdev) { + ds_put_format(s, "ethdev_port_id %d ", ethdev->port_id); } + ds_put_cstr(s, "/ "); } else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) { ds_put_cstr(s, "drop / "); @@ -1776,19 +1777,22 @@ add_count_action(struct flow_actions *actions) } static int -add_port_id_action(struct flow_actions *actions, - struct netdev *outdev) +add_represented_port_action(struct flow_actions *actions, + struct netdev *outdev) { - struct rte_flow_action_port_id *port_id; + struct rte_flow_action_ethdev *ethdev; int outdev_id; outdev_id = netdev_dpdk_get_port_id(outdev); if (outdev_id < 0) { return -1; } - port_id = xzalloc(sizeof *port_id); - port_id->id = outdev_id; - add_flow_action(actions, RTE_FLOW_ACTION_TYPE_PORT_ID, port_id); + + ethdev = xzalloc(sizeof *ethdev); + ethdev->port_id = outdev_id; + + add_flow_action(actions, RTE_FLOW_ACTION_TYPE_REPRESENTED_PORT, ethdev); + return 0; } @@ -1808,7 +1812,7 @@ add_output_action(struct netdev *netdev, return -1; } if (!netdev_flow_api_equals(netdev, outdev) || - 
add_port_id_action(actions, outdev)) { + add_represented_port_action(actions, outdev)) { VLOG_DBG_RL(&rl, "%s: Output to port \'%s\' cannot be offloaded.", netdev_get_name(netdev), netdev_get_name(outdev)); ret = -1; From ed738eca39ef308f207f83463dc215df215bdf09 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 16 Jan 2024 09:08:14 +0100 Subject: [PATCH 534/833] util: Annotate function that will never return NULL. The make clang-analyze target reports a 'Dereference of null pointer' and an 'Uninitialized argument value' issue because it assumes some functions can return NULL. This patch annotates these functions, so the static analyzer is aware that they never do. Acked-by: Dumitru Ceara Acked-by: Simon Horman Signed-off-by: Eelco Chaudron --- include/openvswitch/compiler.h | 6 ++++++ lib/util.h | 38 ++++++++++++++++++---------------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/include/openvswitch/compiler.h b/include/openvswitch/compiler.h index 52614a5ac04..878c5c6a70d 100644 --- a/include/openvswitch/compiler.h +++ b/include/openvswitch/compiler.h @@ -37,6 +37,12 @@ #define OVS_NO_RETURN #endif +#if __GNUC__ && !__CHECKER__ +#define OVS_RETURNS_NONNULL __attribute__((returns_nonnull)) +#else +#define OVS_RETURNS_NONNULL +#endif + #ifndef typeof #define typeof __typeof__ #endif diff --git a/lib/util.h b/lib/util.h index 62801e85f55..f2d45bcac8a 100644 --- a/lib/util.h +++ b/lib/util.h @@ -162,28 +162,30 @@ bool memory_locked(void); OVS_NO_RETURN void out_of_memory(void); /* Allocation wrappers that abort if memory is exhausted. 
*/ -void *xmalloc(size_t) MALLOC_LIKE; -void *xcalloc(size_t, size_t) MALLOC_LIKE; -void *xzalloc(size_t) MALLOC_LIKE; -void *xrealloc(void *, size_t); -void *xmemdup(const void *, size_t) MALLOC_LIKE; -char *xmemdup0(const char *, size_t) MALLOC_LIKE; -char *xstrdup(const char *) MALLOC_LIKE; +OVS_RETURNS_NONNULL void *xmalloc(size_t) MALLOC_LIKE; +OVS_RETURNS_NONNULL void *xcalloc(size_t, size_t) MALLOC_LIKE; +OVS_RETURNS_NONNULL void *xzalloc(size_t) MALLOC_LIKE; +OVS_RETURNS_NONNULL void *xrealloc(void *, size_t); +OVS_RETURNS_NONNULL void *xmemdup(const void *, size_t) MALLOC_LIKE; +OVS_RETURNS_NONNULL char *xmemdup0(const char *, size_t) MALLOC_LIKE; +OVS_RETURNS_NONNULL char *xstrdup(const char *) MALLOC_LIKE; char *nullable_xstrdup(const char *) MALLOC_LIKE; bool nullable_string_is_equal(const char *a, const char *b); -char *xasprintf(const char *format, ...) OVS_PRINTF_FORMAT(1, 2) MALLOC_LIKE; -char *xvasprintf(const char *format, va_list) OVS_PRINTF_FORMAT(1, 0) MALLOC_LIKE; -void *x2nrealloc(void *p, size_t *n, size_t s); +OVS_RETURNS_NONNULL char *xasprintf(const char *format, ...) + OVS_PRINTF_FORMAT(1, 2) MALLOC_LIKE; +OVS_RETURNS_NONNULL char *xvasprintf(const char *format, va_list) + OVS_PRINTF_FORMAT(1, 0) MALLOC_LIKE; +OVS_RETURNS_NONNULL void *x2nrealloc(void *p, size_t *n, size_t s); /* Allocation wrappers for specialized situations where coverage counters * cannot be used. 
*/ -void *xmalloc__(size_t) MALLOC_LIKE; -void *xcalloc__(size_t, size_t) MALLOC_LIKE; -void *xzalloc__(size_t) MALLOC_LIKE; -void *xrealloc__(void *, size_t); +OVS_RETURNS_NONNULL void *xmalloc__(size_t) MALLOC_LIKE; +OVS_RETURNS_NONNULL void *xcalloc__(size_t, size_t) MALLOC_LIKE; +OVS_RETURNS_NONNULL void *xzalloc__(size_t) MALLOC_LIKE; +OVS_RETURNS_NONNULL void *xrealloc__(void *, size_t); -void *xmalloc_cacheline(size_t) MALLOC_LIKE; -void *xzalloc_cacheline(size_t) MALLOC_LIKE; +OVS_RETURNS_NONNULL void *xmalloc_cacheline(size_t) MALLOC_LIKE; +OVS_RETURNS_NONNULL void *xzalloc_cacheline(size_t) MALLOC_LIKE; void free_cacheline(void *); void ovs_strlcpy(char *dst, const char *src, size_t size); @@ -191,9 +193,9 @@ void ovs_strzcpy(char *dst, const char *src, size_t size); int string_ends_with(const char *str, const char *suffix); -void *xmalloc_pagealign(size_t) MALLOC_LIKE; +OVS_RETURNS_NONNULL void *xmalloc_pagealign(size_t) MALLOC_LIKE; void free_pagealign(void *); -void *xmalloc_size_align(size_t, size_t) MALLOC_LIKE; +OVS_RETURNS_NONNULL void *xmalloc_size_align(size_t, size_t) MALLOC_LIKE; void free_size_align(void *); /* The C standards say that neither the 'dst' nor 'src' argument to From d662eee0972400bda52f8012c634b60956a2e7ed Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Thu, 11 Jan 2024 00:08:53 +0100 Subject: [PATCH 535/833] ci: Add clang-analyze to GitHub actions. This patch identifies new static analysis issues during a GitHub action run and reports them. The process involves analyzing the changes introduced in the current commit and comparing them to those in the preceding commit. However, there are two cases when the GitHub push action runner does not provide enough details to determine the preceding commit. These cases are a new branch or a forced push. The strategy for these exceptions is to find the first common commit on any upstream branch, and use that. 
An example error output might look like this: error level: +0 -0 no changes warning level: +2 +0 New issue "deadcode.DeadStores Value stored to 'remote' is never read" (1 occurrence) file:///home/runner/work/ovs/ovs/vswitchd/ovs-vswitchd.c:86 New issue "unix.Malloc Potential leak of memory pointed to by 'remote'" (1 occurrence) file:///home/runner/work/ovs/ovs/vswitchd/ovs-vswitchd.c:95 note level: +0 -0 no changes all levels: +2 +0 Acked-by: Simon Horman Signed-off-by: Eelco Chaudron --- .ci/linux-build.sh | 30 +++++++ .ci/linux-prepare.sh | 2 +- .github/workflows/build-and-test.yml | 113 +++++++++++++++++++++++++++ 3 files changed, 144 insertions(+), 1 deletion(-) diff --git a/.ci/linux-build.sh b/.ci/linux-build.sh index cf1462a0c4a..123d420c267 100755 --- a/.ci/linux-build.sh +++ b/.ci/linux-build.sh @@ -51,6 +51,31 @@ function build_ovs() make ${JOBS} } +function clang_analyze() +{ + [ -d "./base-clang-analyzer-results" ] && cache_build=false \ + || cache_build=true + if [ "$cache_build" = true ]; then + # If this is a cache build, proceed to the base branch's directory. + pushd base_ovs_main + fi; + + configure_ovs $OPTS + + make clean + scan-build -o ./clang-analyzer-results -sarif --use-cc=${CC} make ${JOBS} + + if [ "$cache_build" = true ]; then + # Move results, so it will be picked up by the cache. + mv ./clang-analyzer-results ../base-clang-analyzer-results + popd + else + # Only do the compare on the none cache builds. + sarif --check note diff ./base-clang-analyzer-results \ + ./clang-analyzer-results + fi; +} + if [ "$DEB_PACKAGE" ]; then ./boot.sh && ./configure --with-dpdk=$DPDK && make debian mk-build-deps --install --root-cmd sudo --remove debian/control @@ -118,6 +143,11 @@ fi OPTS="${EXTRA_OPTS} ${OPTS} $*" +if [ "$CLANG_ANALYZE" ]; then + clang_analyze + exit 0 +fi + if [ "$TESTSUITE" = 'test' ]; then # 'distcheck' will reconfigure with required options. # Now we only need to prepare the Makefile without sparse-wrapped CC. 
diff --git a/.ci/linux-prepare.sh b/.ci/linux-prepare.sh index c28b6819a35..5028bdc442d 100755 --- a/.ci/linux-prepare.sh +++ b/.ci/linux-prepare.sh @@ -23,7 +23,7 @@ cd .. # https://github.com/pypa/pip/issues/10655 pip3 install --disable-pip-version-check --user wheel pip3 install --disable-pip-version-check --user \ - flake8 'hacking>=3.0' netaddr pyparsing sphinx setuptools + flake8 'hacking>=3.0' netaddr pyparsing sarif-tools sphinx setuptools # Install python test dependencies pip3 install -r python/test_requirements.txt diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 7bfb42a422d..7689f13f1c0 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -254,6 +254,119 @@ jobs: name: logs-linux-${{ join(matrix.*, '-') }} path: logs.tgz + build-clang-analyze: + needs: build-dpdk + env: + dependencies: | + automake bc clang-tools libbpf-dev libnuma-dev libpcap-dev \ + libunbound-dev libunwind-dev libssl-dev libtool llvm-dev + CC: clang + DPDK: dpdk + CLANG_ANALYZE: true + name: clang-analyze + runs-on: ubuntu-22.04 + timeout-minutes: 30 + + steps: + - name: checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: get base branch sha + id: base_branch + env: + BASE_SHA: ${{ github.event.pull_request.base.sha }} + EVENT_BEFORE: ${{ github.event.before }} + FORCED_PUSH: ${{ github.event.forced }} + run: | + if [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then + echo "sha=$BASE_SHA" >> $GITHUB_OUTPUT + else + if [ "$EVENT_BEFORE" = "0000000000000000000000000000000000000000" ] \ + || [ "$FORCED_PUSH" = true ]; then + BASE_SHA=HEAD~1 + MIN_DISTANCE=1000 + git remote add upstream https://github.com/openvswitch/ovs.git + git fetch upstream + for upstream_head in $(git ls-remote --heads upstream main master dpdk-latest branch-2.17 branch-[3456789]* | cut -f 1); do + CURR_BASE=$(git merge-base ${upstream_head} HEAD 2>/dev/null) + if [ ${CURR_BASE} ]; then + DISTANCE=$(git log 
--oneline ${CURR_BASE}..HEAD | wc -l); + if test ${MIN_DISTANCE} -gt ${DISTANCE}; then + BASE_SHA=${CURR_BASE} + MIN_DISTANCE=${DISTANCE} + fi + fi + done + echo "sha=$BASE_SHA" >> $GITHUB_OUTPUT + else + echo "sha=$EVENT_BEFORE" >> $GITHUB_OUTPUT + fi + fi + + - name: checkout base branch + env: + BASE_SHA: ${{ steps.base_branch.outputs.sha }} + run: | + cp -r $(pwd)/. /tmp/base_ovs_main && mv /tmp/base_ovs_main ./ + cd $(pwd)/base_ovs_main + git checkout ${BASE_SHA} + + - name: update PATH + run: | + echo "$HOME/bin" >> $GITHUB_PATH + echo "$HOME/.local/bin" >> $GITHUB_PATH + + - name: generate cache key + id: cache_key + run: | + ver=$(${CC} -v 2>&1 | grep ' version ' | \ + sed 's/.*version \([0-9]*\.[0-9]*\.[0-9]*\).*/\1/g') + echo "key=${CC}-${ver}-analyze-$(git -C base_ovs_main rev-parse HEAD)" \ + >> $GITHUB_OUTPUT + + - name: check for analyzer result cache + id: clang_cache + uses: actions/cache@v3 + with: + path: base-clang-analyzer-results + key: ${{ steps.cache_key.outputs.key }} + + - name: set up python + uses: actions/setup-python@v4 + with: + python-version: '3.9' + + - name: get cached dpdk-dir + uses: actions/cache/restore@v3 + with: + path: dpdk-dir + key: ${{ needs.build-dpdk.outputs.dpdk_key }} + + - name: update APT cache + run: sudo apt update || true + + - name: install common dependencies + run: sudo apt install -y ${{ env.dependencies }} + + - name: prepare + run: ./.ci/linux-prepare.sh + + - name: build base reference + if: steps.clang_cache.outputs.cache-hit != 'true' + run: ./.ci/linux-build.sh + + - name: save cache + uses: actions/cache/save@v3 + if: steps.clang_cache.outputs.cache-hit != 'true' + with: + path: base-clang-analyzer-results + key: ${{ steps.cache_key.outputs.key }} + + - name: build + run: ./.ci/linux-build.sh + build-osx: env: CC: clang From 48d4f6963b0f54a5b8badbd227fcfe00f6562202 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Wed, 10 Jan 2024 11:22:22 +0100 Subject: [PATCH 536/833] tests: Set handle_segv for 
UBSAN to allow SIGSEGV tests. Previously tests that were generating a SIGSEGV were excluded from UBSAN runs. With the correct environment variable, these tests can be run. This is working even with the clang version supplied by Ubuntu 20.04. Acked-by: Simon Horman Signed-off-by: Eelco Chaudron --- tests/atlocal.in | 10 ---------- tests/daemon.at | 28 ++++++++-------------------- tests/ovsdb-server.at | 28 ++++++++-------------------- 3 files changed, 16 insertions(+), 50 deletions(-) diff --git a/tests/atlocal.in b/tests/atlocal.in index 1013098a184..372576a915d 100644 --- a/tests/atlocal.in +++ b/tests/atlocal.in @@ -194,16 +194,6 @@ else DIFF_SUPPORTS_NORMAL_FORMAT=no fi -# Check whether UB Sanitizer is being used. -case "$CFLAGS" in -*fsanitize=undefined*) - TESTS_WITH_UBSAN=yes - ;; -*) - TESTS_WITH_UBSAN=no - ;; -esac - # Turn off proxies. unset http_proxy unset https_proxy diff --git a/tests/daemon.at b/tests/daemon.at index 2c7fac57c79..6cb8b98883d 100644 --- a/tests/daemon.at +++ b/tests/daemon.at @@ -78,12 +78,9 @@ AT_CLEANUP AT_SETUP([daemon --monitor]) AT_SKIP_IF([test "$IS_WIN32" = "yes"]) -# This test intentionally causes SIGSEGV, so make Address Sanitizer ignore it. +# This test intentionally causes SIGSEGV, so make sanitizers ignore it. ASAN_OPTIONS=$ASAN_OPTIONS:handle_segv=0; export ASAN_OPTIONS - -# Skip it if UB Sanitizer is being used. There's no way to disable the -# SEGV check at runtime. -AT_SKIP_IF([test $TESTS_WITH_UBSAN = yes]) +UBSAN_OPTIONS=$UBSAN_OPTIONS:handle_segv=0; export UBSAN_OPTIONS # Start the daemon and wait for the pidfile to get created. on_exit 'kill $(cat *.pid)' @@ -150,12 +147,9 @@ AT_CLEANUP AT_SETUP([daemon --detach --monitor]) AT_SKIP_IF([test "$IS_WIN32" = "yes"]) -# This test intentionally causes SIGSEGV, so make Address Sanitizer ignore it. +# This test intentionally causes SIGSEGV, so make sanitizers ignore it. ASAN_OPTIONS=$ASAN_OPTIONS:handle_segv=0; export ASAN_OPTIONS - -# Skip it if UB Sanitizer is being used. 
There's no way to disable the -# SEGV check at runtime. -AT_SKIP_IF([test $TESTS_WITH_UBSAN = yes]) +UBSAN_OPTIONS=$UBSAN_OPTIONS:handle_segv=0; export UBSAN_OPTIONS on_exit 'kill $(cat *.pid)' @@ -239,12 +233,9 @@ AT_SETUP([backtrace without monitor]) AT_SKIP_IF([test "$HAVE_BACKTRACE" = "no" && test "$HAVE_UNWIND" = "no"]) AT_SKIP_IF([test "$IS_WIN32" = "yes"]) -# This test intentionally causes SIGSEGV, so make Address Sanitizer ignore it. +# This test intentionally causes SIGSEGV, so make sanitizers ignore it. ASAN_OPTIONS=$ASAN_OPTIONS:handle_segv=0; export ASAN_OPTIONS - -# Skip it if UB Sanitizer is being used. There's no way to disable the -# SEGV check at runtime. -AT_SKIP_IF([test $TESTS_WITH_UBSAN = yes]) +UBSAN_OPTIONS=$UBSAN_OPTIONS:handle_segv=0; export UBSAN_OPTIONS AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --no-db \ --log-file --verbose=DBG], [0], [ignore], [ignore]) @@ -263,12 +254,9 @@ AT_SETUP([backtrace with monitor]) AT_SKIP_IF([test "$HAVE_BACKTRACE" = "no" && test "$HAVE_UNWIND" = "no"]) AT_SKIP_IF([test "$IS_WIN32" = "yes"]) -# This test intentionally causes SIGSEGV, so make Address Sanitizer ignore it. +# This test intentionally causes SIGSEGV, so make sanitizers ignore it. ASAN_OPTIONS=$ASAN_OPTIONS:handle_segv=0; export ASAN_OPTIONS - -# Skip it if UB Sanitizer is being used. There's no way to disable the -# SEGV check at runtime. -AT_SKIP_IF([test $TESTS_WITH_UBSAN = yes]) +UBSAN_OPTIONS=$UBSAN_OPTIONS:handle_segv=0; export UBSAN_OPTIONS on_exit 'kill $(cat *.pid)' diff --git a/tests/ovsdb-server.at b/tests/ovsdb-server.at index 036e4cc3ba5..347ef940a78 100644 --- a/tests/ovsdb-server.at +++ b/tests/ovsdb-server.at @@ -476,12 +476,9 @@ AT_SETUP([ovsdb-server/add-db with --monitor]) AT_KEYWORDS([ovsdb server positive]) AT_SKIP_IF([test "$IS_WIN32" = "yes"]) -# This test intentionally causes SIGSEGV, so make Address Sanitizer ignore it. +# This test intentionally causes SIGSEGV, so make sanitizers ignore it. 
ASAN_OPTIONS=$ASAN_OPTIONS:handle_segv=0; export ASAN_OPTIONS - -# Skip it if UB Sanitizer is being used. There's no way to disable the -# SEGV check at runtime. -AT_SKIP_IF([test $TESTS_WITH_UBSAN = yes]) +UBSAN_OPTIONS=$UBSAN_OPTIONS:handle_segv=0; export UBSAN_OPTIONS # Start ovsdb-server, initially with one db. ordinal_schema > schema @@ -518,12 +515,9 @@ AT_SETUP([ovsdb-server/add-db and remove-db with --monitor]) AT_KEYWORDS([ovsdb server positive]) AT_SKIP_IF([test "$IS_WIN32" = "yes"]) -# This test intentionally causes SIGSEGV, so make Address Sanitizer ignore it. +# This test intentionally causes SIGSEGV, so make sanitizers ignore it. ASAN_OPTIONS=$ASAN_OPTIONS:handle_segv=0; export ASAN_OPTIONS - -# Skip it if UB Sanitizer is being used. There's no way to disable the -# SEGV check at runtime. -AT_SKIP_IF([test $TESTS_WITH_UBSAN = yes]) +UBSAN_OPTIONS=$UBSAN_OPTIONS:handle_segv=0; export UBSAN_OPTIONS # Start ovsdb-server, initially with one db. ordinal_schema > schema @@ -752,12 +746,9 @@ AT_SETUP([ovsdb-server/add-remote with --monitor]) AT_KEYWORDS([ovsdb server positive]) AT_SKIP_IF([test "$IS_WIN32" = "yes"]) -# This test intentionally causes SIGSEGV, so make Address Sanitizer ignore it. +# This test intentionally causes SIGSEGV, so make sanitizers ignore it. ASAN_OPTIONS=$ASAN_OPTIONS:handle_segv=0; export ASAN_OPTIONS - -# Skip it if UB Sanitizer is being used. There's no way to disable the -# SEGV check at runtime. -AT_SKIP_IF([test $TESTS_WITH_UBSAN = yes]) +UBSAN_OPTIONS=$UBSAN_OPTIONS:handle_segv=0; export UBSAN_OPTIONS # Start ovsdb-server, initially with no remotes. ordinal_schema > schema @@ -794,12 +785,9 @@ AT_SETUP([ovsdb-server/add-remote and remove-remote with --monitor]) AT_KEYWORDS([ovsdb server positive]) AT_SKIP_IF([test "$IS_WIN32" = "yes"]) -# This test intentionally causes SIGSEGV, so make Address Sanitizer ignore it. +# This test intentionally causes SIGSEGV, so make sanitizers ignore it. 
ASAN_OPTIONS=$ASAN_OPTIONS:handle_segv=0; export ASAN_OPTIONS - -# Skip it if UB Sanitizer is being used. There's no way to disable the -# SEGV check at runtime. -AT_SKIP_IF([test $TESTS_WITH_UBSAN = yes]) +UBSAN_OPTIONS=$UBSAN_OPTIONS:handle_segv=0; export UBSAN_OPTIONS # Start ovsdb-server, initially with no remotes. ordinal_schema > schema From fc13c0d65f552e964c602980ec4d6c8c6ee52bc8 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Wed, 10 Jan 2024 11:22:23 +0100 Subject: [PATCH 537/833] ci: Combine the ubsan and asan sanitizer runs. This patch combines the existing ubsan and asan GitHub actions tests into one. Acked-by: Simon Horman Signed-off-by: Eelco Chaudron --- .ci/linux-build.sh | 14 ++++---------- .github/workflows/build-and-test.yml | 8 ++------ tests/atlocal.in | 4 ++-- tests/automake.mk | 3 +-- tests/ovs-macros.at | 11 +++-------- 5 files changed, 12 insertions(+), 28 deletions(-) diff --git a/.ci/linux-build.sh b/.ci/linux-build.sh index 123d420c267..7c2aebad80e 100755 --- a/.ci/linux-build.sh +++ b/.ci/linux-build.sh @@ -128,17 +128,11 @@ else CFLAGS_FOR_OVS="${CFLAGS_FOR_OVS} ${SPARSE_FLAGS}" fi -if [ "$ASAN" ]; then - # This will override default option configured in tests/atlocal.in. +if [ "$SANITIZERS" ]; then + # This will override default ASAN options configured in tests/atlocal.in. export ASAN_OPTIONS='detect_leaks=1' - CFLAGS_ASAN="-fno-omit-frame-pointer -fno-common -fsanitize=address" - CFLAGS_FOR_OVS="${CFLAGS_FOR_OVS} ${CFLAGS_ASAN}" -fi - -if [ "$UBSAN" ]; then - # Use the default options configured in tests/atlocal.in, in UBSAN_OPTIONS. 
- CFLAGS_UBSAN="-fno-omit-frame-pointer -fno-common -fsanitize=undefined" - CFLAGS_FOR_OVS="${CFLAGS_FOR_OVS} ${CFLAGS_UBSAN}" + CFLAGS_FOR_SAN="-fno-omit-frame-pointer -fno-common -fsanitize=$SANITIZERS" + CFLAGS_FOR_OVS="${CFLAGS_FOR_OVS} ${CFLAGS_FOR_SAN}" fi OPTS="${EXTRA_OPTS} ${OPTS} $*" diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 7689f13f1c0..886a5030265 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -78,14 +78,13 @@ jobs: automake libtool gcc bc libjemalloc2 libjemalloc-dev libssl-dev \ llvm-dev libnuma-dev libpcap-dev selinux-policy-dev libbpf-dev \ lftp libreswan - ASAN: ${{ matrix.asan }} - UBSAN: ${{ matrix.ubsan }} CC: ${{ matrix.compiler }} DPDK: ${{ matrix.dpdk }} DPDK_SHARED: ${{ matrix.dpdk_shared }} LIBS: ${{ matrix.libs }} M32: ${{ matrix.m32 }} OPTS: ${{ matrix.opts }} + SANITIZERS: ${{ matrix.sanitizers }} STD: ${{ matrix.std }} TESTSUITE: ${{ matrix.testsuite }} TEST_RANGE: ${{ matrix.test_range }} @@ -111,11 +110,8 @@ jobs: - compiler: gcc testsuite: test - compiler: clang + sanitizers: address,undefined testsuite: test - asan: asan - - compiler: clang - testsuite: test - ubsan: ubsan - compiler: gcc testsuite: test diff --git a/tests/atlocal.in b/tests/atlocal.in index 372576a915d..f321bae55f3 100644 --- a/tests/atlocal.in +++ b/tests/atlocal.in @@ -217,12 +217,12 @@ export OVS_CTL_TIMEOUT # # We disable leak detection because otherwise minor leaks that don't # matter break everything. -ASAN_OPTIONS=detect_leaks=0:abort_on_error=true:log_path=asan:$ASAN_OPTIONS +ASAN_OPTIONS=detect_leaks=0:abort_on_error=true:log_path=sanitizers:$ASAN_OPTIONS export ASAN_OPTIONS # Add some default flags for UndefinedBehaviorSanitizer, if it was used # for the build. 
-UBSAN_OPTIONS=print_stacktrace=1:halt_on_error=true:log_path=ubsan:$UBSAN_OPTIONS +UBSAN_OPTIONS=print_stacktrace=1:halt_on_error=true:log_path=sanitizers:$UBSAN_OPTIONS export UBSAN_OPTIONS # Check whether Python test requirements are available. diff --git a/tests/automake.mk b/tests/automake.mk index 10c9fbb01f3..d11b2138190 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -211,8 +211,7 @@ AUTOTEST_PATH = utilities:vswitchd:ovsdb:vtep:tests:ipsec:$(PTHREAD_WIN32_DIR_DL check-local: set $(SHELL) '$(TESTSUITE)' -C tests AUTOTEST_PATH=$(AUTOTEST_PATH); \ "$$@" $(TESTSUITEFLAGS) || \ - (test -z "$$(find $(TESTSUITE_DIR) -name 'asan.*')" && \ - test -z "$$(find $(TESTSUITE_DIR) -name 'ubsan.*')" && \ + (test -z "$$(find $(TESTSUITE_DIR) -name 'sanitizers.*')" && \ test X'$(RECHECK)' = Xyes && "$$@" --recheck) # Python Coverage support. diff --git a/tests/ovs-macros.at b/tests/ovs-macros.at index 39fbfceeb81..06c97855548 100644 --- a/tests/ovs-macros.at +++ b/tests/ovs-macros.at @@ -211,14 +211,9 @@ m4_divert_pop([PREPARE_TESTS]) OVS_START_SHELL_HELPERS ovs_cleanup() { - if test "$(echo asan.*)" != 'asan.*'; then - echo "Address Sanitizer reported errors in:" asan.* - cat asan.* - AT_FAIL_IF([:]) - fi - if test "$(echo ubsan.*)" != 'ubsan.*'; then - echo "Undefined Behavior Sanitizer reported errors in:" ubsan.* - cat ubsan.* + if test "$(echo sanitizers.*)" != 'sanitizers.*'; then + echo "Undefined Behavior Sanitizer or Address Sanitizer reported errors in:" sanitizers.* + cat sanitizers.* AT_FAIL_IF([:]) fi } From 8b51b2bcbd6ee19662976c1a63862a57360ccfa5 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Wed, 10 Jan 2024 11:22:24 +0100 Subject: [PATCH 538/833] ci: Add kernel and userspace ASAN/UBSAN tests. This patch adds ASAN and UBSAN GitHub action tests for both the userspace and kernel datapaths. 
Acked-by: Simon Horman Signed-off-by: Eelco Chaudron --- .github/workflows/build-and-test.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 886a5030265..3807e5f132c 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -172,6 +172,15 @@ jobs: testsuite: check-kernel test_range: "100-" + - compiler: clang + sanitizers: address,undefined + testsuite: check-kernel + test_range: "-100" + - compiler: clang + sanitizers: address,undefined + testsuite: check-kernel + test_range: "100-" + - compiler: gcc testsuite: check-offloads test_range: "-100" @@ -183,6 +192,11 @@ jobs: dpdk: dpdk testsuite: check-system-userspace + - compiler: clang + sanitizers: address,undefined + dpdk: dpdk + testsuite: check-system-userspace + - compiler: gcc dpdk: dpdk testsuite: check-system-tso From 077d0bad0436d9115d3c5e47ac1545617e001952 Mon Sep 17 00:00:00 2001 From: Mohammad Heib Date: Wed, 27 Dec 2023 13:15:22 +0200 Subject: [PATCH 539/833] mcast-snooping: Store IGMP/MLD protocol version. Store the IGMP/MLD protocol version in the mcast_group internally. The multicast snooping feature is used by many OVS consumers, and those consumers heavily rely on the OVS implementation to manage/deal with mcast groups. Some of those consumers also need to deal with/expose the mcast protocol to the end user for debuggability purposes. OVN, for example, needs to expose the protocol version to the end user to match between the protocol version used in the OVN logical switches and the uplink ports. Therefore, instead of implementing this in each OVS consumer that needs to deal with the mcast group protocol version, which would be a very complicated implementation since it relies on the OVS code, saving the protocol to the mdb inside OVS will give those consumers access to the protocol version very easily.
Signed-off-by: Mohammad Heib Signed-off-by: Eelco Chaudron --- lib/mcast-snooping.c | 20 ++++++++++++++------ lib/mcast-snooping.h | 18 ++++++++++++++++-- ofproto/ofproto-dpif-xlate.c | 7 ++++++- 3 files changed, 36 insertions(+), 9 deletions(-) diff --git a/lib/mcast-snooping.c b/lib/mcast-snooping.c index 43805ae4d56..995216a4b7d 100644 --- a/lib/mcast-snooping.c +++ b/lib/mcast-snooping.c @@ -389,7 +389,8 @@ mcast_snooping_prune_expired(struct mcast_snooping *ms, bool mcast_snooping_add_group(struct mcast_snooping *ms, const struct in6_addr *addr, - uint16_t vlan, void *port) + uint16_t vlan, void *port, + mcast_group_proto grp_proto) OVS_REQ_WRLOCK(ms->rwlock) { bool learned; @@ -424,6 +425,9 @@ mcast_snooping_add_group(struct mcast_snooping *ms, } mcast_group_insert_bundle(ms, grp, port, ms->idle_time); + /* update the protocol version. */ + grp->protocol_version = grp_proto; + /* Mark 'grp' as recently used. */ ovs_list_push_back(&ms->group_lru, &grp->group_node); return learned; @@ -431,11 +435,12 @@ mcast_snooping_add_group(struct mcast_snooping *ms, bool mcast_snooping_add_group4(struct mcast_snooping *ms, ovs_be32 ip4, - uint16_t vlan, void *port) + uint16_t vlan, void *port, + mcast_group_proto grp_proto) OVS_REQ_WRLOCK(ms->rwlock) { struct in6_addr addr = in6_addr_mapped_ipv4(ip4); - return mcast_snooping_add_group(ms, &addr, vlan, port); + return mcast_snooping_add_group(ms, &addr, vlan, port, grp_proto); } int @@ -478,7 +483,8 @@ mcast_snooping_add_report(struct mcast_snooping *ms, || record->type == IGMPV3_CHANGE_TO_INCLUDE_MODE)) { ret = mcast_snooping_leave_group4(ms, ip4, vlan, port); } else { - ret = mcast_snooping_add_group4(ms, ip4, vlan, port); + ret = mcast_snooping_add_group4(ms, ip4, vlan, port, + MCAST_GROUP_IGMPV3); } if (ret) { count++; @@ -513,7 +519,8 @@ mcast_snooping_add_mld(struct mcast_snooping *ms, switch (mld->type) { case MLD_REPORT: - ret = mcast_snooping_add_group(ms, addr, vlan, port); + ret = mcast_snooping_add_group(ms, 
addr, vlan, port, + MCAST_GROUP_MLDV1); if (ret) { count++; } @@ -545,7 +552,8 @@ mcast_snooping_add_mld(struct mcast_snooping *ms, || record->type == IGMPV3_CHANGE_TO_INCLUDE_MODE)) { ret = mcast_snooping_leave_group(ms, addr, vlan, port); } else { - ret = mcast_snooping_add_group(ms, addr, vlan, port); + ret = mcast_snooping_add_group(ms, addr, vlan, port, + MCAST_GROUP_MLDV2); } if (ret) { count++; diff --git a/lib/mcast-snooping.h b/lib/mcast-snooping.h index f120405da57..8cc8fb0fb8f 100644 --- a/lib/mcast-snooping.h +++ b/lib/mcast-snooping.h @@ -39,6 +39,15 @@ struct mcast_snooping; /* Time, in seconds, before expiring a mrouter_port due to inactivity. */ #define MCAST_MROUTER_PORT_IDLE_TIME 180 +/* Multicast group protocol. */ +typedef enum { + MCAST_GROUP_IGMPV1 = 0, + MCAST_GROUP_IGMPV2, + MCAST_GROUP_IGMPV3, + MCAST_GROUP_MLDV1, + MCAST_GROUP_MLDV2, +} mcast_group_proto; + /* Multicast group entry. * Guarded by owning 'mcast_snooping''s rwlock. */ struct mcast_group { @@ -51,6 +60,9 @@ struct mcast_group { /* VLAN tag. */ uint16_t vlan; + /* Multicast group IPv6/IPv4 Protocol version IGMPv1,2,3 or MLDv1,2 */ + mcast_group_proto protocol_version; + /* Node in parent struct mcast_snooping group_lru. */ struct ovs_list group_node OVS_GUARDED; @@ -185,10 +197,12 @@ mcast_snooping_lookup4(const struct mcast_snooping *ms, ovs_be32 ip4, /* Learning. 
*/ bool mcast_snooping_add_group(struct mcast_snooping *ms, const struct in6_addr *addr, - uint16_t vlan, void *port) + uint16_t vlan, void *port, + mcast_group_proto grp_proto) OVS_REQ_WRLOCK(ms->rwlock); bool mcast_snooping_add_group4(struct mcast_snooping *ms, ovs_be32 ip4, - uint16_t vlan, void *port) + uint16_t vlan, void *port, + mcast_group_proto grp_proto) OVS_REQ_WRLOCK(ms->rwlock); int mcast_snooping_add_report(struct mcast_snooping *ms, const struct dp_packet *p, diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 289f8a7361d..f4d1d71945a 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -2796,6 +2796,7 @@ update_mcast_snooping_table4__(const struct xlate_ctx *ctx, OVS_REQ_WRLOCK(ms->rwlock) { const struct igmp_header *igmp; + mcast_group_proto grp_proto; int count; size_t offset; ovs_be32 ip4 = flow->igmp_group_ip4; @@ -2813,7 +2814,11 @@ update_mcast_snooping_table4__(const struct xlate_ctx *ctx, switch (ntohs(flow->tp_src)) { case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: - if (mcast_snooping_add_group4(ms, ip4, vlan, in_xbundle->ofbundle)) { + grp_proto = ntohs(flow->tp_src) == IGMP_HOST_MEMBERSHIP_REPORT + ? MCAST_GROUP_IGMPV1 + : MCAST_GROUP_IGMPV2; + if (mcast_snooping_add_group4(ms, ip4, vlan, in_xbundle->ofbundle, + grp_proto)) { xlate_report_debug(ctx, OFT_DETAIL, "multicast snooping learned that " IP_FMT" is on port %s in VLAN %d", From b222593bc69b5d82849d18eb435564f5f93449d3 Mon Sep 17 00:00:00 2001 From: Mohammad Heib Date: Wed, 27 Dec 2023 13:15:23 +0200 Subject: [PATCH 540/833] mcast-snooping: Add group protocol to mdb/show output. Expose the mcast group protocol via the mdb/show command output. 
Signed-off-by: Mohammad Heib Signed-off-by: Eelco Chaudron --- NEWS | 2 + lib/mcast-snooping.c | 24 ++++++++ lib/mcast-snooping.h | 1 + ofproto/ofproto-dpif.c | 9 ++- tests/mcast-snooping.at | 130 ++++++++++++++++++++++++++++++++-------- tests/stp.at | 6 +- 6 files changed, 141 insertions(+), 31 deletions(-) diff --git a/NEWS b/NEWS index 410dd68e512..32ee6896800 100644 --- a/NEWS +++ b/NEWS @@ -45,6 +45,8 @@ Post-v3.2.0 during connection commit will be used by default. - DPDK: * Add support for DPDK 23.11. + - Support for multicast snooping to show the protocol responsible for + adding/updating the entry. Known issues: - DPDK: v23.11 has a change in behavior in handling i40e VF devices. This diff --git a/lib/mcast-snooping.c b/lib/mcast-snooping.c index 995216a4b7d..60ef8381e9a 100644 --- a/lib/mcast-snooping.c +++ b/lib/mcast-snooping.c @@ -57,6 +57,30 @@ mcast_snooping_flood_unreg(const struct mcast_snooping *ms) return ms->flood_unreg; } +char * +mcast_snooping_group_protocol_str(mcast_group_proto grp_proto) +{ + switch (grp_proto) { + case MCAST_GROUP_IGMPV1: + return "IGMPv1"; + break; + case MCAST_GROUP_IGMPV2: + return "IGMPv2"; + break; + case MCAST_GROUP_IGMPV3: + return "IGMPv3"; + break; + case MCAST_GROUP_MLDV1: + return "MLDv1"; + break; + case MCAST_GROUP_MLDV2: + return "MLDv2"; + break; + default: + return "UNKNOWN"; + } +} + bool mcast_snooping_is_query(ovs_be16 igmp_type) { diff --git a/lib/mcast-snooping.h b/lib/mcast-snooping.h index 8cc8fb0fb8f..76ab4e4f777 100644 --- a/lib/mcast-snooping.h +++ b/lib/mcast-snooping.h @@ -224,6 +224,7 @@ bool mcast_snooping_add_mrouter(struct mcast_snooping *ms, uint16_t vlan, OVS_REQ_WRLOCK(ms->rwlock); bool mcast_snooping_is_query(ovs_be16 igmp_type); bool mcast_snooping_is_membership(ovs_be16 igmp_type); +char *mcast_snooping_group_protocol_str(mcast_group_proto grp_proto); /* Flush. 
*/ void mcast_snooping_mdb_flush(struct mcast_snooping *ms); diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index 6e62ed1f982..f59d69c4d1e 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -6172,7 +6172,7 @@ ofproto_unixctl_mcast_snooping_show(struct unixctl_conn *conn, return; } - ds_put_cstr(&ds, " port VLAN GROUP Age\n"); + ds_put_cstr(&ds, " port VLAN protocol GROUP Age\n"); ovs_rwlock_rdlock(&ofproto->ms->rwlock); LIST_FOR_EACH (grp, group_node, &ofproto->ms->group_lru) { LIST_FOR_EACH(b, bundle_node, &grp->bundle_lru) { @@ -6181,7 +6181,9 @@ ofproto_unixctl_mcast_snooping_show(struct unixctl_conn *conn, bundle = b->port; ofputil_port_to_string(ofbundle_get_a_port(bundle)->up.ofp_port, NULL, name, sizeof name); - ds_put_format(&ds, "%5s %4d ", name, grp->vlan); + ds_put_format(&ds, "%5s %4d %-8s ", name, grp->vlan, + mcast_snooping_group_protocol_str( + grp->protocol_version)); ipv6_format_mapped(&grp->addr, &ds); ds_put_format(&ds, " %3d\n", mcast_bundle_age(ofproto->ms, b)); @@ -6195,8 +6197,9 @@ ofproto_unixctl_mcast_snooping_show(struct unixctl_conn *conn, bundle = mrouter->port; ofputil_port_to_string(ofbundle_get_a_port(bundle)->up.ofp_port, NULL, name, sizeof name); - ds_put_format(&ds, "%5s %4d querier %3d\n", + ds_put_format(&ds, "%5s %4d %-8s querier %3d\n", name, mrouter->vlan, + mcast_snooping_group_protocol_str(-1), mcast_mrouter_age(ofproto->ms, mrouter)); } ovs_rwlock_unlock(&ofproto->ms->rwlock); diff --git a/tests/mcast-snooping.at b/tests/mcast-snooping.at index 890e6aca009..a91b3e13a10 100644 --- a/tests/mcast-snooping.at +++ b/tests/mcast-snooping.at @@ -44,9 +44,9 @@ AT_CHECK([ovs-appctl netdev-dummy/receive p2 \ '01005e0000015c8a38552552810006c0080046c000240000000001027c00ac111c01e0000001940400001164ec1e00000000027d000000000000000000000000']) AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl - port VLAN GROUP Age - 2 1725 querier 0 - 2 1728 querier 0 + port VLAN protocol GROUP Age + 2 1725 UNKNOWN querier 0 + 2 
1728 UNKNOWN querier 0 ]) AT_CHECK([ovs-vsctl set Interface p2 options:tx_pcap=p2.pcap]) @@ -75,7 +75,7 @@ AT_CHECK([ovs-appctl netdev-dummy/receive p2 \ '01005e0000015c8a38552552810006bd080046c000240000000001027f00ac111901e0000001940400001164ec1000000000027d000000000000000000000000']) AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl - port VLAN GROUP Age + port VLAN protocol GROUP Age ]) @@ -87,8 +87,8 @@ AT_CHECK([ovs-appctl netdev-dummy/receive p2 \ '3333ff0e4c67000c290e4c6786dd600000000020000100000000000000000000000000000000ff0200000000000000000001ff0e4c673a000502000001008300e7b800000000ff0200000000000000000001ff0e4c67']) AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl - port VLAN GROUP Age - 2 0 ff02::1:ff0e:4c67 0 + port VLAN protocol GROUP Age + 2 0 MLDv1 ff02::1:ff0e:4c67 0 ]) AT_CHECK([ovs-appctl mdb/flush br0], [0], [dnl @@ -99,7 +99,7 @@ AT_CHECK([ovs-appctl netdev-dummy/receive p2 \ '3333ff0e4c67000c290e4c6786dd600000000020000100000000000000000000000000000000ff0200000000000000000001ff0e4c673a000502000001008300e7b000000000ff0200000000000000000001ff0e4c67']) AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl - port VLAN GROUP Age + port VLAN protocol GROUP Age ]) OVS_VSWITCHD_STOP @@ -154,8 +154,8 @@ AT_CHECK([ '01005E010101000C29A027A108004500001C000100004002CBAEAC10221EE001010112140CE9E0010101' ], [0]) AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl - port VLAN GROUP Age - 1 0 224.1.1.1 0 + port VLAN protocol GROUP Age + 1 0 IGMPv1 224.1.1.1 0 ]) AT_CHECK([ovs-appctl ofproto/trace "in_port(3),eth(src=aa:55:aa:55:00:ff,dst=01:00:5e:01:01:01),eth_type(0x0800),ipv4(src=10.0.0.1,dst=224.1.1.1,proto=17,tos=0,ttl=64,frag=no),udp(src=0,dst=8000)"], [0], [dnl @@ -467,19 +467,19 @@ AT_CHECK([ ], [0]) AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl - port VLAN GROUP Age - 1 1 224.1.1.1 0 - 1 2 224.1.1.1 0 - 3 1 querier 0 - 3 2 querier 0 + port VLAN protocol GROUP Age + 1 1 IGMPv1 224.1.1.1 0 + 1 2 IGMPv1 224.1.1.1 0 + 3 1 UNKNOWN querier 0 + 3 2 UNKNOWN querier 0 ]) 
AT_CHECK([ovs-vsctl set port p3 tag=2], [0]) AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl - port VLAN GROUP Age - 1 1 224.1.1.1 0 - 1 2 224.1.1.1 0 + port VLAN protocol GROUP Age + 1 1 IGMPv1 224.1.1.1 0 + 1 2 IGMPv1 224.1.1.1 0 ]) AT_CLEANUP @@ -522,19 +522,19 @@ AT_CHECK([ ], [0]) AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl - port VLAN GROUP Age - 1 1 224.1.1.1 0 - 1 2 224.1.1.1 0 - 2 1 querier 0 - 2 2 querier 0 + port VLAN protocol GROUP Age + 1 1 IGMPv1 224.1.1.1 0 + 1 2 IGMPv1 224.1.1.1 0 + 2 1 UNKNOWN querier 0 + 2 2 UNKNOWN querier 0 ]) AT_CHECK([ovs-vsctl del-port br0 p2], [0]) AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl - port VLAN GROUP Age - 1 1 224.1.1.1 0 - 1 2 224.1.1.1 0 + port VLAN protocol GROUP Age + 1 1 IGMPv1 224.1.1.1 0 + 1 2 IGMPv1 224.1.1.1 0 ]) AT_CLEANUP @@ -605,3 +605,83 @@ recirc_id(),in_port(1),ct_state(+new-inv+trk),eth_type(0x0800),ipv4(prot ]) AT_CLEANUP + +AT_SETUP([mcast - mcast_group protocol updated in mdb]) +OVS_VSWITCHD_START([]) + +AT_CHECK([ + ovs-vsctl set bridge br0 \ + datapath_type=dummy \ + mcast_snooping_enable=true \ +], [0]) + +AT_CHECK([ovs-ofctl add-flow br0 action=normal]) + +AT_CHECK([ + ovs-vsctl add-port br0 p1 -- set Interface p1 type=dummy \ + other-config:hwaddr=aa:55:aa:55:00:01 ofport_request=1 \ +], [0]) + +# Send IGMPv1 report packet. +AT_CHECK([ + ovs-appctl netdev-dummy/receive p1 \ + '01005E010101000C29A027A18100000008004500001C000100004002CBAEAC10221EE001010112140CE9E0010101' +], [0]) + +# Send IGMPv2 report packet. +AT_CHECK([ + ovs-appctl netdev-dummy/receive p1 \ + '01005e010102505400000103080046c00020000040000102f8110a000103e001010294040000160008fce0010102' +], [0]) + +# Send IGMPv3 report packet. +AT_CHECK([ + ovs-appctl netdev-dummy/receive p1 \ + '01005e000016505400000003080046c00028000040000102f9f60a000003e0000016940400002200e3e10000000104000000e9360ce6' +], [0]) + +# Check that all the ipv4 mcast groups were updated in +# the mdb with the appropriate protocol. 
+AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl + port VLAN protocol GROUP Age + 1 0 IGMPv1 224.1.1.1 0 + 1 0 IGMPv2 224.1.1.2 0 + 1 0 IGMPv3 233.54.12.230 0 +]) + +# Send IGMPv1 report packet to address 224.1.1.2 +# and make sure that the protocol will be updated to +# IGMPV1. +AT_CHECK([ + ovs-appctl netdev-dummy/receive p1 \ + '01005e010102505400000103080046c00020000040000102f8110a000103e00101029404000012000cfce0010102' +], [0]) + +AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl + port VLAN protocol GROUP Age + 1 0 IGMPv1 224.1.1.1 0 + 1 0 IGMPv3 233.54.12.230 0 + 1 0 IGMPv1 224.1.1.2 0 +]) + +# Flush the mdb. +AT_CHECK([ovs-appctl mdb/flush br0], [0], [dnl +table successfully flushed +]) + +# Send MLDV2 packet. +AT_CHECK([ovs-appctl netdev-dummy/receive p1 \ +'333300000016d0509956ddf986dd60000000001c3a01fe80000000000000712065589886fa88ff0200000000000000000000000000168f00134d0000000104000000ff0200000000000000000001ff52f3e1']) + +# Send MLDV1 packet. +AT_CHECK([ovs-appctl netdev-dummy/receive p1 \ +'3333ff0e4c67000c290e4c6786dd600000000020000100000000000000000000000000000000ff0200000000000000000001ff0e4c673a000502000001008300e7b800000000ff0200000000000000000001ff0e4c67']) + +# Check that all the ipv6 mcast groups were updated in +# the mdb with the appropriate protocol. 
+AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl + port VLAN protocol GROUP Age + 1 0 MLDv2 ff02::1:ff52:f3e1 0 + 1 0 MLDv1 ff02::1:ff0e:4c67 0 +]) +AT_CLEANUP diff --git a/tests/stp.at b/tests/stp.at index a6b6465d12a..e7bf3958a0a 100644 --- a/tests/stp.at +++ b/tests/stp.at @@ -583,13 +583,13 @@ AT_CHECK([ovs-appctl fdb/show br2], [0], [dnl ]) AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl - port VLAN GROUP Age + port VLAN protocol GROUP Age ]) AT_CHECK([ovs-appctl mdb/show br1], [0], [dnl - port VLAN GROUP Age + port VLAN protocol GROUP Age ]) AT_CHECK([ovs-appctl mdb/show br2], [0], [dnl - port VLAN GROUP Age + port VLAN protocol GROUP Age ]) AT_CLEANUP From 6ece3d57b279e276ebf8b69c36d9ae545ee1d2e8 Mon Sep 17 00:00:00 2001 From: Frode Nordahl Date: Tue, 16 Jan 2024 22:52:01 +0000 Subject: [PATCH 541/833] timeval: Add internal timewarp interface. It may be desirable to make use of time warp functionality in unit tests. Separate logic from time/stop unixctl into timeval_stop() and add a new timeval_warp() interface for directing monotonic clock into slow path and advancing the current monotonic directly. This will be used in a patch that implements unit tests for the cooperative multitasking module. Signed-off-by: Frode Nordahl Signed-off-by: Ilya Maximets --- lib/timeval.c | 28 ++++++++++++++++++++++++---- lib/timeval.h | 3 +++ 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/lib/timeval.c b/lib/timeval.c index 0abe7e555aa..10c1b9ca15a 100644 --- a/lib/timeval.c +++ b/lib/timeval.c @@ -767,17 +767,22 @@ get_cpu_usage(void) /* "time/stop" stops the monotonic time returned by e.g. time_msec() from * advancing, except due to later calls to "time/warp". 
*/ -static void -timeval_stop_cb(struct unixctl_conn *conn, - int argc OVS_UNUSED, const char *argv[] OVS_UNUSED, - void *aux OVS_UNUSED) +void +timeval_stop(void) { ovs_mutex_lock(&monotonic_clock.mutex); atomic_store_relaxed(&monotonic_clock.slow_path, true); monotonic_clock.stopped = true; xclock_gettime(monotonic_clock.id, &monotonic_clock.cache); ovs_mutex_unlock(&monotonic_clock.mutex); +} +static void +timeval_stop_cb(struct unixctl_conn *conn, + int argc OVS_UNUSED, const char *argv[] OVS_UNUSED, + void *aux OVS_UNUSED) +{ + timeval_stop(); unixctl_command_reply(conn, NULL); } @@ -818,6 +823,21 @@ timeval_warp_cb(struct unixctl_conn *conn, timewarp_work(); } +/* Direct monotonic clock into slow path and advance the current monotonic + * time by 'msecs' milliseconds directly. This is for use in unit tests. */ +void +timeval_warp(long long int msecs) +{ + struct clock *c = &monotonic_clock; + struct timespec warp; + + ovs_mutex_lock(&monotonic_clock.mutex); + atomic_store_relaxed(&monotonic_clock.slow_path, true); + msec_to_timespec(msecs, &warp); + timespec_add(&c->warp, &c->warp, &warp); + ovs_mutex_unlock(&monotonic_clock.mutex); +} + void timeval_dummy_register(void) { diff --git a/lib/timeval.h b/lib/timeval.h index 502f703d4c2..1c40530e27e 100644 --- a/lib/timeval.h +++ b/lib/timeval.h @@ -81,6 +81,9 @@ long long int time_boot_msec(void); void timewarp_run(void); +void timeval_stop(void); +void timeval_warp(long long int msecs); + #ifdef __cplusplus } #endif From 3c8a4e942d6a15faa44f2e283d436bbf57c5b841 Mon Sep 17 00:00:00 2001 From: Frode Nordahl Date: Tue, 16 Jan 2024 22:52:02 +0000 Subject: [PATCH 542/833] lib: Introduce cooperative multitasking module. One of the goals of Open vSwitch is to be as resource efficient as possible. Core parts of the program has been implemented as asynchronous state machines, and when absolutely necessary additional threads are used. 
Introduce cooperative multitasking module which allows us to interleave important processing with long running tasks while avoiding the additional resource consumption of threads and complexity of asynchronous state machines. We will use this module to ensure long running processing in the OVSDB server does not interfere with stable maintenance of the RAFT cluster in subsequent patches. Suggested-by: Ilya Maximets Signed-off-by: Frode Nordahl Signed-off-by: Ilya Maximets --- lib/automake.mk | 3 + lib/cooperative-multitasking-private.h | 33 +++ lib/cooperative-multitasking.c | 157 +++++++++++++ lib/cooperative-multitasking.h | 113 +++++++++ tests/automake.mk | 1 + tests/library.at | 10 + tests/ovsdb-server.at | 1 + tests/test-cooperative-multitasking.c | 307 +++++++++++++++++++++++++ 8 files changed, 625 insertions(+) create mode 100644 lib/cooperative-multitasking-private.h create mode 100644 lib/cooperative-multitasking.c create mode 100644 lib/cooperative-multitasking.h create mode 100644 tests/test-cooperative-multitasking.c diff --git a/lib/automake.mk b/lib/automake.mk index 0dc8a35cc43..8596171c635 100644 --- a/lib/automake.mk +++ b/lib/automake.mk @@ -94,6 +94,9 @@ lib_libopenvswitch_la_SOURCES = \ lib/conntrack-other.c \ lib/conntrack.c \ lib/conntrack.h \ + lib/cooperative-multitasking.c \ + lib/cooperative-multitasking.h \ + lib/cooperative-multitasking-private.h \ lib/coverage.c \ lib/coverage.h \ lib/cpu.c \ diff --git a/lib/cooperative-multitasking-private.h b/lib/cooperative-multitasking-private.h new file mode 100644 index 00000000000..cb83823779e --- /dev/null +++ b/lib/cooperative-multitasking-private.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2024 Canonical Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef COOPERATIVE_MULTITASKING_PRIVATE_H +#define COOPERATIVE_MULTITASKING_PRIVATE_H 1 + +#include "openvswitch/hmap.h" + +extern struct hmap cooperative_multitasking_callbacks; + +struct cm_entry { + struct hmap_node node; + void (*cb)(void *); + void *arg; + long long int threshold; + long long int last_run; + const char *name; +}; + +#endif /* COOPERATIVE_MULTITASKING_PRIVATE_H */ diff --git a/lib/cooperative-multitasking.c b/lib/cooperative-multitasking.c new file mode 100644 index 00000000000..3a91af26fe1 --- /dev/null +++ b/lib/cooperative-multitasking.c @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2024 Canonical Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "backtrace.h" +#include "cooperative-multitasking-private.h" +#include "cooperative-multitasking.h" +#include "hash.h" +#include "openvswitch/hmap.h" +#include "openvswitch/vlog.h" +#include "timeval.h" + +VLOG_DEFINE_THIS_MODULE(cooperative_multitasking); + +struct hmap cooperative_multitasking_callbacks = HMAP_INITIALIZER( + &cooperative_multitasking_callbacks); + +/* Free any data allocated by calls to cooperative_multitasking_set(). */ +void +cooperative_multitasking_destroy(void) +{ + struct cm_entry *cm_entry; + HMAP_FOR_EACH_SAFE (cm_entry, node, &cooperative_multitasking_callbacks) { + hmap_remove(&cooperative_multitasking_callbacks, &cm_entry->node); + free(cm_entry); + } +} + +/* Set/update callback as identified by 'cb' and 'arg'. + * + * 'name' is used for logging events related to this callback. + * + * The value for 'last_run' must be updated each time the callback is run. + * + * Updating the value for 'threshold' may be necessary as a consequence of + * change in runtime configuration or requirements of the part of the program + * serviced by the callback. + * + * Providing a value of 0 for 'last_run' or 'threshold' will leave the stored + * value untouched. */ +void +cooperative_multitasking_set(void (*cb)(void *), void *arg, + long long int last_run, long long int threshold, + const char *name) +{ + struct cm_entry *cm_entry; + + HMAP_FOR_EACH_WITH_HASH (cm_entry, node, hash_pointer((void *) cb, 0), + &cooperative_multitasking_callbacks) { + if (cm_entry->cb == cb && cm_entry->arg == arg) { + if (last_run) { + cm_entry->last_run = last_run; + } + + if (threshold) { + cm_entry->threshold = threshold; + } + return; + } + } + + cm_entry = xzalloc(sizeof *cm_entry); + cm_entry->cb = cb; + cm_entry->arg = arg; + cm_entry->threshold = threshold; + cm_entry->last_run = last_run ? 
last_run : time_msec(); + cm_entry->name = name; + + hmap_insert(&cooperative_multitasking_callbacks, + &cm_entry->node, hash_pointer((void *) cm_entry->cb, 0)); +} + +/* Remove callback identified by 'cb' and 'arg'. */ +void +cooperative_multitasking_remove(void (*cb)(void *), void *arg) +{ + struct cm_entry *cm_entry; + + HMAP_FOR_EACH_WITH_HASH (cm_entry, node, hash_pointer((void *) cb, 0), + &cooperative_multitasking_callbacks) { + if (cm_entry->cb == cb && cm_entry->arg == arg) { + hmap_remove(&cooperative_multitasking_callbacks, &cm_entry->node); + free(cm_entry); + return; + } + } +} + +static void +cooperative_multitasking_yield_at__(const char *source_location) +{ + long long int start = time_msec(); + struct cm_entry *cm_entry; + long long int elapsed; + bool warn; + + HMAP_FOR_EACH (cm_entry, node, &cooperative_multitasking_callbacks) { + elapsed = time_msec() - cm_entry->last_run; + + if (elapsed >= cm_entry->threshold) { + warn = elapsed - cm_entry->threshold > cm_entry->threshold / 8; + + VLOG(warn ? VLL_WARN : VLL_DBG, "%s: yield for %s(%p): " + "elapsed(%lld) >= threshold(%lld), overrun: %lld", + source_location, cm_entry->name, cm_entry->arg, elapsed, + cm_entry->threshold, elapsed - cm_entry->threshold); + + if (warn && VLOG_IS_DBG_ENABLED()) { + log_backtrace(); + } + + (*cm_entry->cb)(cm_entry->arg); + } + } + + elapsed = time_msec() - start; + if (elapsed > 1000) { + VLOG_WARN("Unreasonably long %lldms runtime for callbacks.", elapsed); + } +} + +/* Iterate over registered callbacks and execute callbacks as demanded by the + * recorded time threshold. */ +void +cooperative_multitasking_yield_at(const char *source_location) +{ + static bool yield_in_progress = false; + + if (yield_in_progress) { + VLOG_ERR_ONCE("Nested yield avoided, this is a bug! 
" + "Enable debug logging for more details."); + if (VLOG_IS_DBG_ENABLED()) { + VLOG_DBG("%s: nested yield.", source_location); + log_backtrace(); + } + return; + } + yield_in_progress = true; + + cooperative_multitasking_yield_at__(source_location); + + yield_in_progress = false; +} diff --git a/lib/cooperative-multitasking.h b/lib/cooperative-multitasking.h new file mode 100644 index 00000000000..9185c18810e --- /dev/null +++ b/lib/cooperative-multitasking.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2024 Canonical Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef COOPERATIVE_MULTITASKING_H +#define COOPERATIVE_MULTITASKING_H 1 + +/* + * cooperative-multitasking, interleaved execution for Open vSwitch. + * + * Overview + * ======== + * + * One of the goals of Open vSwitch is to be as resource efficient as + * possible. Core parts of the program has been implemented as asynchronous + * state machines, and when absolutely necessary additional threads are used. + * + * Modules with mostly synchronous and single threaded code that are expected + * to have heavy processing, can make use of the cooperative-multitasking + * interface to yield to modules that have registered callbacks at a time + * threshold. 
+ * + * Typical Usage + * ============= + * + * The module that provides the callback typically has a run() function that is + * already part of the main processing loop and can then register like this: + * + * static void my_run_cb(void *arg); + * + * static void + * my_run(struct data *my_data) + * { + * ... + * + * cooperative_multitasking_set(&my_run_cb, (void *) my_data, + * time_msec(), 1000, "my_run"); + * } + * + * static void + * my_run_cb (void *arg) + * { + * struct data *my_data = (struct data *) arg; + * + * my_run(my_data); + * } + * + * static void + * my_destroy(struct data *my_data) + * { + * ... + * + * cooperative_multitasking_remove(&my_run_cb, (void *) my_data); + * } + * + * The module that is expected to have heavy processing can yield like this: + * + * HMAP_FOR_EACH (row, hmap_node, &src_table->rows) { + * cooperative_multitasking_yield(); + * + * ... + * } + * + * Rules for implementation + * ======================== + * + * - The module that registers itself with a callback must not use the yield + * functionality inside nor should it be possible to do so via calls to other + * modules. + * + * - The module that registers the callback should be self-sufficient, i.e. + * the internal state of that module should not matter to the outside world, + * at least it should not matter for the call stack that enters the + * cooperative_multitasking_yield(). + * + * - cooperative_multitasking_yield() must not be called from places that can + * loop indefinitely, only in places that eventually end, otherwise it may + * give a false impression that the server is working fine while it is stuck + * and not actually doing any useful work. + * + * Thread-safety + * ============= + * + * The cooperative-multitasking module and functions therein are not thread + * safe and must only be used by one thread.
+ */ + +void cooperative_multitasking_destroy(void); + +void cooperative_multitasking_set(void (*cb)(void *), void *arg, + long long int last_run, + long long int threshold, + const char *name); + +void cooperative_multitasking_remove(void (*cb)(void *), void *arg); + +void cooperative_multitasking_yield_at(const char *source_location); +#define cooperative_multitasking_yield() \ + cooperative_multitasking_yield_at(OVS_SOURCE_LOCATOR) + +#endif /* COOPERATIVE_MULTITASKING_H */ diff --git a/tests/automake.mk b/tests/automake.mk index d11b2138190..04f48f2d8be 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -455,6 +455,7 @@ tests_ovstest_SOURCES = \ tests/test-ccmap.c \ tests/test-cmap.c \ tests/test-conntrack.c \ + tests/test-cooperative-multitasking.c \ tests/test-csum.c \ tests/test-flows.c \ tests/test-hash.c \ diff --git a/tests/library.at b/tests/library.at index 3f9df2f87d3..7b4acebb8a3 100644 --- a/tests/library.at +++ b/tests/library.at @@ -296,3 +296,13 @@ AT_CLEANUP AT_SETUP([uuidset module]) AT_CHECK([ovstest test-uuidset], [0], [], [ignore]) AT_CLEANUP + +AT_SETUP([cooperative-multitasking module]) +AT_CHECK([ovstest test-cooperative-multitasking], [0], []) +AT_CLEANUP + +AT_SETUP([cooperative-multitasking module nested yield detection]) +AT_CHECK([ovstest test-cooperative-multitasking-nested-yield], [0], [], [dnl +cooperative_multitasking|ERR|Nested yield avoided, this is a bug! Enable debug logging for more details. 
+]) +AT_CLEANUP diff --git a/tests/ovsdb-server.at b/tests/ovsdb-server.at index 347ef940a78..c87ecc2e36e 100644 --- a/tests/ovsdb-server.at +++ b/tests/ovsdb-server.at @@ -2833,6 +2833,7 @@ m4_define([CLEAN_LOG_FILE], [sed 's/[[0-9\-]]*T[[0-9:\.]]*Z|[[0-9]]*\(|.*$\)/\1/g' $1 | dnl sed '/|poll_loop|/d' | dnl sed '/|socket_util|/d' | dnl + sed '/|cooperative_multitasking|DBG|/d' | dnl sed 's/[[0-9]]*\.ctl/\.ctl/g'> $2]) CLEAN_LOG_FILE([1.log], [1.log.clear]) diff --git a/tests/test-cooperative-multitasking.c b/tests/test-cooperative-multitasking.c new file mode 100644 index 00000000000..f7407bb0305 --- /dev/null +++ b/tests/test-cooperative-multitasking.c @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2023 Canonical Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#undef NDEBUG +#include "cooperative-multitasking.h" +#include "cooperative-multitasking-private.h" +#include "openvswitch/hmap.h" +#include "ovstest.h" +#include "timeval.h" +#include "util.h" +#include "openvswitch/vlog.h" + +struct fixture_arg { + bool called; +}; + +static void fixture_run_wrap(void *arg); + +#define FIXTURE_RUN_NAME "fixture_run" + +static void +fixture_run(struct fixture_arg *arg) +{ + cooperative_multitasking_set(&fixture_run_wrap, (void *) arg, + time_msec(), 0, FIXTURE_RUN_NAME); + if (arg) { + arg->called = true; + } +} + +static void +fixture_run_wrap(void *arg) +{ + struct fixture_arg *fixture_arg = (struct fixture_arg *) arg; + + fixture_run(fixture_arg); +} + + +static void fixture_other_run_wrap(void *arg); + +#define FIXTURE_OTHER_RUN_NAME "fixture_other_run" + +static void +fixture_other_run(struct fixture_arg *arg) +{ + cooperative_multitasking_set(&fixture_other_run_wrap, (void *) arg, + time_msec(), 0, FIXTURE_OTHER_RUN_NAME); + if (arg) { + arg->called = true; + } +} + +static void +fixture_other_run_wrap(void *arg) +{ + struct fixture_arg *fixture_arg = (struct fixture_arg *) arg; + + fixture_other_run(fixture_arg); +} + +static void +test_cm_set_registration(void) +{ + struct cm_entry *cm_entry; + struct fixture_arg arg1 = { + .called = false, + }; + struct fixture_arg arg2 = { + .called = false, + }; + + timeval_stop(); + long long int now = time_msec(); + + cooperative_multitasking_set(&fixture_run_wrap, (void *) &arg1, 0, 1000, + FIXTURE_RUN_NAME); + cooperative_multitasking_set(&fixture_run_wrap, (void *) &arg2, 0, 2000, + FIXTURE_RUN_NAME); + cooperative_multitasking_set(&fixture_other_run_wrap, NULL, 0, 3000, + FIXTURE_OTHER_RUN_NAME); + + ovs_assert(hmap_count(&cooperative_multitasking_callbacks) == 3); + + HMAP_FOR_EACH (cm_entry, node, &cooperative_multitasking_callbacks) { + if (cm_entry->arg == (void *) &arg1) { + ovs_assert(cm_entry->cb == &fixture_run_wrap); + ovs_assert(cm_entry->threshold == 
1000); + ovs_assert(cm_entry->last_run == now); + } else if (cm_entry->arg == (void *) &arg2) { + ovs_assert(cm_entry->cb == &fixture_run_wrap); + ovs_assert(cm_entry->threshold == 2000); + ovs_assert(cm_entry->last_run == now); + } else if (cm_entry->cb == &fixture_other_run_wrap) { + ovs_assert(cm_entry->arg == NULL); + ovs_assert(cm_entry->threshold == 3000); + ovs_assert(cm_entry->last_run == now); + } else { + OVS_NOT_REACHED(); + } + } + + cooperative_multitasking_remove(&fixture_other_run_wrap, NULL); + ovs_assert(hmap_count(&cooperative_multitasking_callbacks) == 2); + cooperative_multitasking_remove(&fixture_run_wrap, (void *) &arg2); + ovs_assert(hmap_count(&cooperative_multitasking_callbacks) == 1); + + cooperative_multitasking_destroy(); +} + +static void +test_cm_set_update(void) +{ + struct cm_entry *cm_entry; + struct fixture_arg arg1 = { + .called = false, + }; + struct fixture_arg arg2 = { + .called = false, + }; + + timeval_stop(); + long long int now = time_msec(); + + /* First register a couple of callbacks. */ + cooperative_multitasking_set(&fixture_run_wrap, (void *) &arg1, 0, 0, + FIXTURE_RUN_NAME); + cooperative_multitasking_set(&fixture_run_wrap, (void *) &arg2, 0, 0, + FIXTURE_RUN_NAME); + + ovs_assert(hmap_count(&cooperative_multitasking_callbacks) == 2); + + HMAP_FOR_EACH (cm_entry, node, &cooperative_multitasking_callbacks) { + if (cm_entry->arg == (void *) &arg1) { + ovs_assert(cm_entry->threshold == 0); + ovs_assert(cm_entry->last_run == now); + } else if (cm_entry->arg == (void *) &arg2) { + ovs_assert(cm_entry->threshold == 0); + ovs_assert(cm_entry->last_run == now); + } else { + OVS_NOT_REACHED(); + } + } + + /* Update 'last_run' and 'threshold' for each callback and validate + * that the correct entry was actually updated. 
*/ + cooperative_multitasking_set(&fixture_run_wrap, (void *) &arg1, 1, 2, + FIXTURE_RUN_NAME); + cooperative_multitasking_set(&fixture_run_wrap, (void *) &arg2, 3, 4, + FIXTURE_RUN_NAME); + + HMAP_FOR_EACH (cm_entry, node, &cooperative_multitasking_callbacks) { + if (cm_entry->arg == (void *) &arg1) { + ovs_assert(cm_entry->threshold == 2); + ovs_assert(cm_entry->last_run == 1); + } else if (cm_entry->arg == (void *) &arg2) { + ovs_assert(cm_entry->threshold == 4); + ovs_assert(cm_entry->last_run == 3); + } else { + OVS_NOT_REACHED(); + } + } + + /* Confirm that providing 0 for 'last_run' or 'threshold' leaves the + * existing value untouched. */ + cooperative_multitasking_set(&fixture_run_wrap, (void *) &arg1, 0, 5, + FIXTURE_RUN_NAME); + cooperative_multitasking_set(&fixture_run_wrap, (void *) &arg2, 6, 0, + FIXTURE_RUN_NAME); + + HMAP_FOR_EACH (cm_entry, node, &cooperative_multitasking_callbacks) { + if (cm_entry->arg == (void *) &arg1) { + ovs_assert(cm_entry->threshold == 5); + ovs_assert(cm_entry->last_run == 1); + } else if (cm_entry->arg == (void *) &arg2) { + ovs_assert(cm_entry->threshold == 4); + ovs_assert(cm_entry->last_run == 6); + } else { + OVS_NOT_REACHED(); + } + } + + cooperative_multitasking_destroy(); +} + +static void +test_cm_yield(void) +{ + struct cm_entry *cm_entry; + struct fixture_arg arg1 = { + .called = false, + }; + struct fixture_arg arg2 = { + .called = false, + }; + + timeval_stop(); + long long int now = time_msec(); + + /* First register a couple of callbacks. */ + cooperative_multitasking_set(&fixture_run_wrap, (void *) &arg1, 0, 1000, + FIXTURE_RUN_NAME); + cooperative_multitasking_set(&fixture_run_wrap, (void *) &arg2, 0, 2000, + FIXTURE_RUN_NAME); + + ovs_assert(hmap_count(&cooperative_multitasking_callbacks) == 2); + + /* Call to yield should not execute callbacks until time threshold. 
*/ + cooperative_multitasking_yield(); + ovs_assert(arg1.called == false); + ovs_assert(arg2.called == false); + + HMAP_FOR_EACH (cm_entry, node, &cooperative_multitasking_callbacks) { + ovs_assert(cm_entry->last_run == now); + } + + /* Move clock forward and confirm the expected callbacks to be executed. */ + timeval_warp(1000); + timeval_stop(); + cooperative_multitasking_yield(); + ovs_assert(arg1.called == true); + ovs_assert(arg2.called == false); + + /* Move clock forward and confirm the expected callbacks to be executed. */ + arg1.called = arg2.called = false; + timeval_warp(1000); + timeval_stop(); + cooperative_multitasking_yield(); + ovs_assert(arg1.called == true); + ovs_assert(arg2.called == true); + + cooperative_multitasking_destroy(); +} + +static void fixture_buggy_run_wrap(void *arg); + +#define FIXTURE_BUGGY_RUN_NAME "fixture_buggy_run" + +static void +fixture_buggy_run(struct fixture_arg *arg) +{ + cooperative_multitasking_set(&fixture_buggy_run_wrap, (void *) arg, + time_msec(), 0, FIXTURE_BUGGY_RUN_NAME); + if (arg) { + arg->called = true; + } + /* A real run function MUST NOT directly or indirectly call yield, this is + * here to test the detection of such a programming error. */ + cooperative_multitasking_yield(); +} + +static void +fixture_buggy_run_wrap(void *arg) +{ + struct fixture_arg *fixture_arg = (struct fixture_arg *) arg; + + fixture_buggy_run(fixture_arg); +} + +static void +test_cooperative_multitasking_nested_yield(int argc OVS_UNUSED, char *argv[]) +{ + struct fixture_arg arg1 = { + .called = false, + }; + + set_program_name(argv[0]); + vlog_set_pattern(VLF_CONSOLE, "%c|%p|%m"); + vlog_set_levels(NULL, VLF_SYSLOG, VLL_OFF); + + time_msec(); /* Ensure timeval is initialized. 
*/ + + cooperative_multitasking_set(&fixture_buggy_run_wrap, (void *) &arg1, + 0, 1000, FIXTURE_BUGGY_RUN_NAME); + timeval_warp(1000); + cooperative_multitasking_yield(); + cooperative_multitasking_destroy(); +} + +static void +test_cooperative_multitasking(int argc OVS_UNUSED, char *argv[] OVS_UNUSED) +{ + time_msec(); /* Ensure timeval is initialized. */ + + test_cm_set_registration(); + test_cm_set_update(); + test_cm_yield(); +} + +OVSTEST_REGISTER("test-cooperative-multitasking", + test_cooperative_multitasking); +OVSTEST_REGISTER("test-cooperative-multitasking-nested-yield", + test_cooperative_multitasking_nested_yield); From d4a15647b91791f675ad35a989c744eba62efa95 Mon Sep 17 00:00:00 2001 From: Frode Nordahl Date: Tue, 16 Jan 2024 22:52:03 +0000 Subject: [PATCH 543/833] ovsdb: raft: Enable cooperative multitasking. The OVSDB server is mostly synchronous and single threaded. The OVSDB RAFT storage engine operates under strict deadlines with operational impact should the deadline be overrun. Register for cooperative multitasking so that long running processing elsewhere in the program may yield to allow stable maintenance of the cluster. 
Signed-off-by: Frode Nordahl Signed-off-by: Ilya Maximets --- ovsdb/raft.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/ovsdb/raft.c b/ovsdb/raft.c index 8effd9ad1ad..f463afcb3da 100644 --- a/ovsdb/raft.c +++ b/ovsdb/raft.c @@ -22,6 +22,7 @@ #include #include +#include "cooperative-multitasking.h" #include "hash.h" #include "jsonrpc.h" #include "lockfile.h" @@ -993,10 +994,13 @@ raft_reset_election_timer(struct raft *raft) raft->election_timeout = raft->election_base + duration; } +#define RAFT_TIMER_THRESHOLD(t) (t / 3) + static void raft_reset_ping_timer(struct raft *raft) { - raft->ping_timeout = time_msec() + raft->election_timer / 3; + raft->ping_timeout = + time_msec() + RAFT_TIMER_THRESHOLD(raft->election_timer); } static void @@ -1371,6 +1375,8 @@ raft_take_leadership(struct raft *raft) } } +static void raft_run_cb(void *arg); + /* Closes everything owned by 'raft' that might be visible outside the process: * network connections, commands, etc. This is part of closing 'raft'; it is * also used if 'raft' has failed in an unrecoverable way. */ @@ -1397,6 +1403,8 @@ raft_close__(struct raft *raft) LIST_FOR_EACH_SAFE (conn, list_node, &raft->conns) { raft_conn_close(conn); } + + cooperative_multitasking_remove(&raft_run_cb, raft); } /* Closes and frees 'raft'. @@ -2114,6 +2122,11 @@ raft_run(struct raft *raft) raft_reset_ping_timer(raft); } + cooperative_multitasking_set( + &raft_run_cb, (void *) raft, time_msec(), + RAFT_TIMER_THRESHOLD(raft->election_timer) + + RAFT_TIMER_THRESHOLD(raft->election_timer) / 10, "raft_run"); + /* Do this only at the end; if we did it as soon as we set raft->left or * raft->failed in handling the RemoveServerReply, then it could easily * cause references to freed memory in RPC sessions, etc. 
*/ @@ -2122,6 +2135,14 @@ raft_run(struct raft *raft) } } +static void +raft_run_cb(void *arg) +{ + struct raft *raft = (struct raft *) arg; + + raft_run(raft); +} + static void raft_wait_session(struct jsonrpc_session *js) { From 36bad31829016556a4cbaf36e5831866d59de84e Mon Sep 17 00:00:00 2001 From: Frode Nordahl Date: Tue, 16 Jan 2024 22:52:04 +0000 Subject: [PATCH 544/833] json: Add yielding json create/destroy functions. Creating and destroying JSON objects may be time consuming. Add json_serialized_object_create_with_yield() and json_destroy_with_yield() functions that make use of the cooperative multitasking module to yield during processing, allowing time sensitive tasks in other parts of the program to be completed during processing. We keep these new functions private to OVS by adding a new lib/json.h header file. The include guard in the public include/openvswitch/json.h is updated to contain the OPENVSWITCH prefix to be in line with the other public header files, allowing us to use the non-prefixed version in our private lib/json.h. Signed-off-by: Frode Nordahl Signed-off-by: Ilya Maximets --- include/openvswitch/json.h | 10 +++---- lib/automake.mk | 1 + lib/json.c | 58 ++++++++++++++++++++++++++++++++------ lib/json.h | 32 +++++++++++++++++++++ 4 files changed, 87 insertions(+), 14 deletions(-) create mode 100644 lib/json.h diff --git a/include/openvswitch/json.h b/include/openvswitch/json.h index eb92c6a9186..55544076084 100644 --- a/include/openvswitch/json.h +++ b/include/openvswitch/json.h @@ -14,8 +14,8 @@ * limitations under the License. */ -#ifndef JSON_H -#define JSON_H 1 +#ifndef OPENVSWITCH_JSON_H +#define OPENVSWITCH_JSON_H 1 /* This is an implementation of JavaScript Object Notation (JSON) as specified * by RFC 4627. 
It is intended to fully comply with RFC 4627, with the @@ -159,14 +159,14 @@ json_clone(const struct json *json_) return json; } -void json_destroy__(struct json *json); +void json_destroy__(struct json *json, bool); /* Frees 'json' and everything it points to, recursively. */ static inline void json_destroy(struct json *json) { if (json && !--json->count) { - json_destroy__(json); + json_destroy__(json, false); } } @@ -174,4 +174,4 @@ json_destroy(struct json *json) } #endif -#endif /* json.h */ +#endif /* OPENVSWITCH_JSON_H */ diff --git a/lib/automake.mk b/lib/automake.mk index 8596171c635..78d6e651645 100644 --- a/lib/automake.mk +++ b/lib/automake.mk @@ -178,6 +178,7 @@ lib_libopenvswitch_la_SOURCES = \ lib/jhash.c \ lib/jhash.h \ lib/json.c \ + lib/json.h \ lib/jsonrpc.c \ lib/jsonrpc.h \ lib/lacp.c \ diff --git a/lib/json.c b/lib/json.c index 9411eeda7cc..001f6e6ab79 100644 --- a/lib/json.c +++ b/lib/json.c @@ -24,13 +24,21 @@ #include #include +#include "cooperative-multitasking.h" #include "openvswitch/dynamic-string.h" #include "hash.h" +#include "json.h" #include "openvswitch/shash.h" #include "unicode.h" #include "util.h" #include "uuid.h" +/* Non-public JSSF_* flags. Must not overlap with public ones defined + * in include/openvswitch/json.h. */ +enum { + JSSF_YIELD = 1 << 7, +}; + /* The type of a JSON token. 
*/ enum json_token_type { T_EOF = 0, @@ -189,6 +197,14 @@ json_serialized_object_create(const struct json *src) return json; } +struct json * +json_serialized_object_create_with_yield(const struct json *src) +{ + struct json *json = json_create(JSON_SERIALIZED_OBJECT); + json->string = json_to_string(src, JSSF_SORT | JSSF_YIELD); + return json; +} + struct json * json_array_create_empty(void) { @@ -375,20 +391,20 @@ json_integer(const struct json *json) return json->integer; } -static void json_destroy_object(struct shash *object); -static void json_destroy_array(struct json_array *array); +static void json_destroy_object(struct shash *object, bool yield); +static void json_destroy_array(struct json_array *array, bool yield); /* Frees 'json' and everything it points to, recursively. */ void -json_destroy__(struct json *json) +json_destroy__(struct json *json, bool yield) { switch (json->type) { case JSON_OBJECT: - json_destroy_object(json->object); + json_destroy_object(json->object, yield); break; case JSON_ARRAY: - json_destroy_array(&json->array); + json_destroy_array(&json->array, yield); break; case JSON_STRING: @@ -410,14 +426,22 @@ json_destroy__(struct json *json) } static void -json_destroy_object(struct shash *object) +json_destroy_object(struct shash *object, bool yield) { struct shash_node *node; + if (yield) { + cooperative_multitasking_yield(); + } + SHASH_FOR_EACH_SAFE (node, object) { struct json *value = node->data; - json_destroy(value); + if (yield) { + json_destroy_with_yield(value); + } else { + json_destroy(value); + } shash_delete(object, node); } shash_destroy(object); @@ -425,12 +449,20 @@ json_destroy_object(struct shash *object) } static void -json_destroy_array(struct json_array *array) +json_destroy_array(struct json_array *array, bool yield) { size_t i; + if (yield) { + cooperative_multitasking_yield(); + } + for (i = 0; i < array->n; i++) { - json_destroy(array->elems[i]); + if (yield) { + json_destroy_with_yield(array->elems[i]); + } 
else { + json_destroy(array->elems[i]); + } } free(array->elems); } @@ -1664,6 +1696,10 @@ json_serialize_object(const struct shash *object, struct json_serializer *s) s->depth++; indent_line(s); + if (s->flags & JSSF_YIELD) { + cooperative_multitasking_yield(); + } + if (s->flags & JSSF_SORT) { const struct shash_node **nodes; size_t n, i; @@ -1697,6 +1733,10 @@ json_serialize_array(const struct json_array *array, struct json_serializer *s) ds_put_char(ds, '['); s->depth++; + if (s->flags & JSSF_YIELD) { + cooperative_multitasking_yield(); + } + if (array->n > 0) { indent_line(s); diff --git a/lib/json.h b/lib/json.h new file mode 100644 index 00000000000..4ad440b396f --- /dev/null +++ b/lib/json.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024 Canonical Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef JSON_H +#define JSON_H 1 + +#include "openvswitch/json.h" + +static inline void +json_destroy_with_yield(struct json *json) +{ + if (json && !--json->count) { + json_destroy__(json, true); + } +} + +struct json *json_serialized_object_create_with_yield(const struct json *); + +#endif /* JSON_H */ From 603890d6a7f664a41c30cf9aa2cc543d3d150d21 Mon Sep 17 00:00:00 2001 From: Frode Nordahl Date: Tue, 16 Jan 2024 22:52:05 +0000 Subject: [PATCH 545/833] ovsdb-server: Make use of cooperative multitasking. Initialize the cooperative multitasking module for the ovsdb-server. 
The server side schema conversion process used for storage engines such as RAFT is time consuming, so yield during processing. After the schema conversion is done, the processing of JSON-RPC sessions and OVSDB monitors for reconnecting clients can overrun the configured election timer. The destruction of JSON objects representing the database contents has been identified as one of the primary offenders. Make use of the yielding version of the JSON object destroy function to mitigate. This series has been tested by checking success of schema conversion, ensuring no involuntary leader change occurs with election timer configurations as low as 750 msec, on a 75MB database with ~ 100 connected clients as produced by the ovn-heater ocp-120-density-light test scenario. Signed-off-by: Frode Nordahl Signed-off-by: Ilya Maximets --- NEWS | 2 ++ ovsdb/file.c | 3 +++ ovsdb/jsonrpc-server.c | 3 +++ ovsdb/monitor.c | 15 +++++++++++---- ovsdb/ovsdb-server.c | 2 ++ ovsdb/trigger.c | 3 +++ 6 files changed, 24 insertions(+), 4 deletions(-) diff --git a/NEWS b/NEWS index 32ee6896800..9e057ad2294 100644 --- a/NEWS +++ b/NEWS @@ -10,6 +10,8 @@ Post-v3.2.0 remotes and database configuration, including setting options for connection methods for relays and active-backup replication. For more details see ovsdb-server(1) and ovsdb(7). + * Make use of cooperative multitasking to improve maintenance of RAFT + cluster during long running processing such as online schema conversion. - OpenFlow: * NXT_CT_FLUSH extension is updated to support flushing connections based on mark and labels. 
'ct-flush' command of ovs-ofctl updated diff --git a/ovsdb/file.c b/ovsdb/file.c index 77a89fd1a46..66ef87a1f16 100644 --- a/ovsdb/file.c +++ b/ovsdb/file.c @@ -23,6 +23,7 @@ #include "bitmap.h" #include "column.h" +#include "cooperative-multitasking.h" #include "log.h" #include "openvswitch/json.h" #include "lockfile.h" @@ -321,6 +322,8 @@ ovsdb_convert_table(struct ovsdb_txn *txn, struct ovsdb_row *dst_row = ovsdb_row_create(dst_table); *ovsdb_row_get_uuid_rw(dst_row) = *ovsdb_row_get_uuid(src_row); + cooperative_multitasking_yield(); + SHASH_FOR_EACH (node, &src_table->schema->columns) { const struct ovsdb_column *src_column = node->data; const struct ovsdb_column *dst_column; diff --git a/ovsdb/jsonrpc-server.c b/ovsdb/jsonrpc-server.c index 817997677bd..26a53898f0a 100644 --- a/ovsdb/jsonrpc-server.c +++ b/ovsdb/jsonrpc-server.c @@ -21,6 +21,7 @@ #include "bitmap.h" #include "column.h" +#include "cooperative-multitasking.h" #include "openvswitch/dynamic-string.h" #include "monitor.h" #include "openvswitch/json.h" @@ -694,6 +695,8 @@ ovsdb_jsonrpc_session_run_all(struct ovsdb_jsonrpc_remote *remote) struct ovsdb_jsonrpc_session *s; LIST_FOR_EACH_SAFE (s, node, &remote->sessions) { + cooperative_multitasking_yield(); + int error = ovsdb_jsonrpc_session_run(s); if (error) { ovsdb_jsonrpc_session_close(s); diff --git a/ovsdb/monitor.c b/ovsdb/monitor.c index d1e466faa48..c3bfae3d2a1 100644 --- a/ovsdb/monitor.c +++ b/ovsdb/monitor.c @@ -20,8 +20,10 @@ #include "bitmap.h" #include "column.h" +#include "cooperative-multitasking.h" #include "openvswitch/dynamic-string.h" #include "openvswitch/json.h" +#include "json.h" #include "jsonrpc.h" #include "ovsdb-error.h" #include "ovsdb-parser.h" @@ -262,7 +264,7 @@ ovsdb_monitor_json_cache_flush(struct ovsdb_monitor *dbmon) struct ovsdb_monitor_json_cache_node *node; HMAP_FOR_EACH_POP(node, hmap_node, &dbmon->json_cache) { - json_destroy(node->json); + json_destroy_with_yield(node->json); free(node); } } @@ -278,7 +280,7 
@@ ovsdb_monitor_json_cache_destroy(struct ovsdb_monitor *dbmon, = ovsdb_monitor_json_cache_search(dbmon, v, change_set); if (node) { hmap_remove(&dbmon->json_cache, &node->hmap_node); - json_destroy(node->json); + json_destroy_with_yield(node->json); free(node); } } @@ -1172,6 +1174,8 @@ ovsdb_monitor_compose_update( struct ovsdb_monitor_table *mt = mcst->mt; HMAP_FOR_EACH_SAFE (row, hmap_node, &mcst->rows) { + cooperative_multitasking_yield(); + struct json *row_json; row_json = (*row_update)(mt, condition, OVSDB_MONITOR_ROW, row, initial, changed, mcst->n_columns); @@ -1217,6 +1221,8 @@ ovsdb_monitor_compose_cond_change_update( HMAP_FOR_EACH (row, hmap_node, &mt->table->rows) { struct json *row_json; + cooperative_multitasking_yield(); + row_json = ovsdb_monitor_compose_row_update2(mt, condition, OVSDB_ROW, row, false, changed, @@ -1286,8 +1292,9 @@ ovsdb_monitor_get_update( /* Pre-serializing the object to avoid doing this * for every client. */ - json_serialized = json_serialized_object_create(json); - json_destroy(json); + json_serialized = + json_serialized_object_create_with_yield(json); + json_destroy_with_yield(json); json = json_serialized; } ovsdb_monitor_json_cache_insert(dbmon, version, mcs, diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c index d45c9e5f3d6..b51fd42fe56 100644 --- a/ovsdb/ovsdb-server.c +++ b/ovsdb/ovsdb-server.c @@ -24,6 +24,7 @@ #include "column.h" #include "command-line.h" +#include "cooperative-multitasking.h" #include "daemon.h" #include "dirs.h" #include "dns-resolve.h" @@ -915,6 +916,7 @@ main(int argc, char *argv[]) } dns_resolve_destroy(); perf_counters_destroy(); + cooperative_multitasking_destroy(); service_stop(); return 0; } diff --git a/ovsdb/trigger.c b/ovsdb/trigger.c index 2a48ccc643a..8c00fec181f 100644 --- a/ovsdb/trigger.c +++ b/ovsdb/trigger.c @@ -20,6 +20,7 @@ #include #include +#include "cooperative-multitasking.h" #include "file.h" #include "openvswitch/json.h" #include "jsonrpc.h" @@ -181,6 +182,8 @@ 
ovsdb_trigger_run(struct ovsdb *db, long long int now) bool disconnect_all = false; LIST_FOR_EACH_SAFE (t, node, &db->triggers) { + cooperative_multitasking_yield(); + if (run_triggers || now - t->created >= t->timeout_msec || t->progress || t->txn_forward) { From 6bbbb766424d3d33aaf36cbe956f08d88b20e764 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Wed, 17 Jan 2024 12:18:50 +0100 Subject: [PATCH 546/833] python: ovs: flow: Fix typo in n_packets. The key used in flows is "n_packets". Signed-off-by: Adrian Moreno Signed-off-by: Simon Horman --- python/ovs/flow/ofp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ovs/flow/ofp.py b/python/ovs/flow/ofp.py index 20231fd9f38..f1a720d7522 100644 --- a/python/ovs/flow/ofp.py +++ b/python/ovs/flow/ofp.py @@ -170,7 +170,7 @@ def _gen_info_decoders(): args = { "table": decode_int, "duration": decode_time, - "n_packet": decode_int, + "n_packets": decode_int, "n_bytes": decode_int, "cookie": decode_int, "idle_timeout": decode_time, From 9ef49ca85b9bfcf71edf35ebee549d2dd9d44145 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Wed, 17 Jan 2024 12:18:51 +0100 Subject: [PATCH 547/833] python: tests: Add info and key tests for OFPFlows. Parsing of info and matches was being tested as generic k-v parsing. Also verify we don't find any unexpected field. Also, verify the length of the kv_list meets the expectations. 
Signed-off-by: Adrian Moreno Signed-off-by: Simon Horman --- python/ovs/tests/test_ofp.py | 75 +++++++++++++++++++++++++++--------- 1 file changed, 57 insertions(+), 18 deletions(-) diff --git a/python/ovs/tests/test_ofp.py b/python/ovs/tests/test_ofp.py index 27bcf0c47cb..5d6d21d8213 100644 --- a/python/ovs/tests/test_ofp.py +++ b/python/ovs/tests/test_ofp.py @@ -6,6 +6,32 @@ from ovs.flow.decoders import EthMask, IPMask, decode_mask +def do_test_section(input_string, section, expected): + flow = OFPFlow(input_string) + kv_list = flow.section(section).data + + assert len(expected) == len(kv_list) + + for i in range(len(expected)): + assert expected[i].key == kv_list[i].key + assert expected[i].value == kv_list[i].value + + # Assert positions relative to action string are OK. + pos = flow.section(section).pos + string = flow.section(section).string + + kpos = kv_list[i].meta.kpos + kstr = kv_list[i].meta.kstring + vpos = kv_list[i].meta.vpos + vstr = kv_list[i].meta.vstring + assert string[kpos : kpos + len(kstr)] == kstr + if vpos != -1: + assert string[vpos : vpos + len(vstr)] == vstr + + # Assert string meta is correct. + assert input_string[pos : pos + len(string)] == string + + @pytest.mark.parametrize( "input_string,expected", [ @@ -570,27 +596,40 @@ def test_act(input_string, expected): if isinstance(expected, type): with pytest.raises(expected): - ofp = OFPFlow(input_string) + OFPFlow(input_string) return - ofp = OFPFlow(input_string) - actions = ofp.actions_kv + do_test_section(input_string, "actions", expected) - for i in range(len(expected)): - assert expected[i].key == actions[i].key - assert expected[i].value == actions[i].value - # Assert positions relative to action string are OK. 
- apos = ofp.section("actions").pos - astring = ofp.section("actions").string +@pytest.mark.parametrize( + "input_string,expected", + [ + ( + "cookie=0x35f946ead8d8f9e4, duration=97746.271s, table=0, n_packets=12, n_bytes=254, priority=4,in_port=1", # noqa: E501 + ( + [ + KeyValue("cookie", 0x35f946ead8d8f9e4), + KeyValue("duration", 97746.271), + KeyValue("table", 0), + KeyValue("n_packets", 12), + KeyValue("n_bytes", 254), + ], + [ + KeyValue("priority", 4), + KeyValue("in_port", 1) + ], + ), + ), + ], +) +def test_key(input_string, expected): + if isinstance(expected, type): + with pytest.raises(expected): + OFPFlow(input_string) + return - kpos = actions[i].meta.kpos - kstr = actions[i].meta.kstring - vpos = actions[i].meta.vpos - vstr = actions[i].meta.vstring - assert astring[kpos : kpos + len(kstr)] == kstr - if vpos != -1: - assert astring[vpos : vpos + len(vstr)] == vstr + input_string += " actions=drop" - # Assert astring meta is correct. - assert input_string[apos : apos + len(astring)] == astring + do_test_section(input_string, "info", expected[0]) + do_test_section(input_string, "match", expected[1]) From ab7d089612cdcea0a72150b9703e4f6c0415eb04 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Wed, 17 Jan 2024 12:18:52 +0100 Subject: [PATCH 548/833] python: ovs: flow: Add sample to nested actions. Add the sample action to those that can be called in nested actions (such as clone). 
Signed-off-by: Adrian Moreno Signed-off-by: Simon Horman --- python/ovs/flow/odp.py | 29 +++++++++++++++-------------- python/ovs/tests/test_ofp.py | 14 ++++++++++++++ 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/python/ovs/flow/odp.py b/python/ovs/flow/odp.py index 88aee17fb2a..ef7e5d6b846 100644 --- a/python/ovs/flow/odp.py +++ b/python/ovs/flow/odp.py @@ -336,6 +336,21 @@ def _action_decoders_args(): **ODPFlow._tnl_action_decoder_args(), } + _decoders["sample"] = nested_kv_decoder( + KVDecoders( + { + "sample": (lambda x: float(x.strip("%"))), + "actions": nested_kv_decoder( + KVDecoders( + decoders=_decoders, + default_free=decode_free_output, + ), + is_list=True, + ), + } + ) + ) + _decoders["clone"] = nested_kv_decoder( KVDecoders(decoders=_decoders, default_free=decode_free_output), is_list=True, @@ -343,20 +358,6 @@ def _action_decoders_args(): return { **_decoders, - "sample": nested_kv_decoder( - KVDecoders( - { - "sample": (lambda x: float(x.strip("%"))), - "actions": nested_kv_decoder( - KVDecoders( - decoders=_decoders, - default_free=decode_free_output, - ), - is_list=True, - ), - } - ) - ), "check_pkt_len": nested_kv_decoder( KVDecoders( { diff --git a/python/ovs/tests/test_ofp.py b/python/ovs/tests/test_ofp.py index 5d6d21d8213..d71ecf08aa3 100644 --- a/python/ovs/tests/test_ofp.py +++ b/python/ovs/tests/test_ofp.py @@ -571,6 +571,20 @@ def do_test_section(input_string, section, expected): ), ], ), + ( + "actions=LOCAL,clone(sample(probability=123))", + [ + KeyValue("output", {"port": "LOCAL"}), + KeyValue( + "clone", + [ + {"sample": { + "probability": 123, + }}, + ] + ), + ], + ), ( "actions=doesnotexist(1234)", ParseError, From 5e45091ea8887083aa40ba7d9577e82dd8bee6f1 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Wed, 17 Jan 2024 12:18:53 +0100 Subject: [PATCH 549/833] python: ovs: flow: Add dp hash and meter actions. Add missing actions. 
Signed-off-by: Adrian Moreno Signed-off-by: Simon Horman --- python/ovs/flow/odp.py | 9 +++++++++ python/ovs/tests/test_odp.py | 12 ++++++++++++ 2 files changed, 21 insertions(+) diff --git a/python/ovs/flow/odp.py b/python/ovs/flow/odp.py index ef7e5d6b846..46697a1bc56 100644 --- a/python/ovs/flow/odp.py +++ b/python/ovs/flow/odp.py @@ -204,6 +204,7 @@ def _action_decoders_args(): """Generate the arguments for the action KVDecoders.""" _decoders = { "drop": decode_flag, + "meter": decode_int, "lb_output": decode_int, "trunc": decode_int, "recirc": decode_int, @@ -334,6 +335,14 @@ def _action_decoders_args(): ) ), **ODPFlow._tnl_action_decoder_args(), + "hash": nested_kv_decoder( + KVDecoders( + { + "l4": decode_int, + "sym_l4": decode_int, + } + ) + ), } _decoders["sample"] = nested_kv_decoder( diff --git a/python/ovs/tests/test_odp.py b/python/ovs/tests/test_odp.py index a50d3185cc6..d60947a5c00 100644 --- a/python/ovs/tests/test_odp.py +++ b/python/ovs/tests/test_odp.py @@ -534,6 +534,18 @@ def test_odp_fields(input_string, expected): ), ], ), + ( + "actions:meter(1),hash(l4(0))", + [ + KeyValue("meter", 1), + KeyValue( + "hash", + { + "l4": 0, + } + ), + ], + ), ], ) def test_odp_actions(input_string, expected): From e72b7b6f174f4974720c94103e54be1d0a07fc38 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Wed, 17 Jan 2024 12:18:54 +0100 Subject: [PATCH 550/833] python: tests: Refactor test_odp section testing. Avoid code duplication by moving the section testing code to its own function. Also, verify the length of the kv_list meets the expectations. 
Signed-off-by: Adrian Moreno Signed-off-by: Simon Horman --- python/ovs/tests/test_odp.py | 68 +++++++++++++++--------------------- 1 file changed, 28 insertions(+), 40 deletions(-) diff --git a/python/ovs/tests/test_odp.py b/python/ovs/tests/test_odp.py index d60947a5c00..401e16b7a42 100644 --- a/python/ovs/tests/test_odp.py +++ b/python/ovs/tests/test_odp.py @@ -13,6 +13,32 @@ ) +def do_test_section(input_string, section, expected): + flow = ODPFlow(input_string) + kv_list = flow.section(section).data + + assert len(expected) == len(kv_list) + + for i in range(len(expected)): + assert expected[i].key == kv_list[i].key + assert expected[i].value == kv_list[i].value + + # Assert positions relative to action string are OK. + pos = flow.section(section).pos + string = flow.section(section).string + + kpos = kv_list[i].meta.kpos + kstr = kv_list[i].meta.kstring + vpos = kv_list[i].meta.vpos + vstr = kv_list[i].meta.vstring + assert string[kpos : kpos + len(kstr)] == kstr + if vpos != -1: + assert string[vpos : vpos + len(vstr)] == vstr + + # Assert string meta is correct. + assert input_string[pos : pos + len(string)] == string + + @pytest.mark.parametrize( "input_string,expected", [ @@ -109,26 +135,7 @@ ], ) def test_odp_fields(input_string, expected): - odp = ODPFlow(input_string) - match = odp.match_kv - for i in range(len(expected)): - assert expected[i].key == match[i].key - assert expected[i].value == match[i].value - - # Assert positions relative to action string are OK. - mpos = odp.section("match").pos - mstring = odp.section("match").string - - kpos = match[i].meta.kpos - kstr = match[i].meta.kstring - vpos = match[i].meta.vpos - vstr = match[i].meta.vstring - assert mstring[kpos : kpos + len(kstr)] == kstr - if vpos != -1: - assert mstring[vpos : vpos + len(vstr)] == vstr - - # Assert mstring meta is correct. 
- assert input_string[mpos : mpos + len(mstring)] == mstring + do_test_section(input_string, "match", expected) @pytest.mark.parametrize( @@ -549,23 +556,4 @@ def test_odp_fields(input_string, expected): ], ) def test_odp_actions(input_string, expected): - odp = ODPFlow(input_string) - actions = odp.actions_kv - for i in range(len(expected)): - assert expected[i].key == actions[i].key - assert expected[i].value == actions[i].value - - # Assert positions relative to action string are OK. - apos = odp.section("actions").pos - astring = odp.section("actions").string - - kpos = actions[i].meta.kpos - kstr = actions[i].meta.kstring - vpos = actions[i].meta.vpos - vstr = actions[i].meta.vstring - assert astring[kpos : kpos + len(kstr)] == kstr - if vpos != -1: - assert astring[vpos : vpos + len(vstr)] == vstr - - # Assert astring meta is correct. - assert input_string[apos : apos + len(astring)] == astring + do_test_section(input_string, "actions", expected) From 32f6737b5cb15b56b8bc963756a6fb237978442c Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Wed, 17 Jan 2024 12:18:55 +0100 Subject: [PATCH 551/833] python: ovs: flow: Add idle_age to openflow flows. Add missing key. 
Signed-off-by: Adrian Moreno Signed-off-by: Simon Horman --- python/ovs/flow/ofp.py | 1 + python/ovs/tests/test_ofp.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/python/ovs/flow/ofp.py b/python/ovs/flow/ofp.py index f1a720d7522..3d3226c919c 100644 --- a/python/ovs/flow/ofp.py +++ b/python/ovs/flow/ofp.py @@ -176,6 +176,7 @@ def _gen_info_decoders(): "idle_timeout": decode_time, "hard_timeout": decode_time, "hard_age": decode_time, + "idle_age": decode_time, } return KVDecoders(args) diff --git a/python/ovs/tests/test_ofp.py b/python/ovs/tests/test_ofp.py index d71ecf08aa3..7ba444b7bb4 100644 --- a/python/ovs/tests/test_ofp.py +++ b/python/ovs/tests/test_ofp.py @@ -620,7 +620,7 @@ def test_act(input_string, expected): "input_string,expected", [ ( - "cookie=0x35f946ead8d8f9e4, duration=97746.271s, table=0, n_packets=12, n_bytes=254, priority=4,in_port=1", # noqa: E501 + "cookie=0x35f946ead8d8f9e4, duration=97746.271s, table=0, n_packets=12, n_bytes=254, idle_age=117, priority=4,in_port=1", # noqa: E501 ( [ KeyValue("cookie", 0x35f946ead8d8f9e4), @@ -628,6 +628,7 @@ def test_act(input_string, expected): KeyValue("table", 0), KeyValue("n_packets", 12), KeyValue("n_bytes", 254), + KeyValue("idle_age", 117), ], [ KeyValue("priority", 4), From ea44cafae235ec48a2311e2573fa27301955aabc Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Wed, 17 Jan 2024 12:18:56 +0100 Subject: [PATCH 552/833] python: ovs: flow: Make check_pkt_len action a list. In general, most actions must be lists since the keys can be repeated. 
Signed-off-by: Adrian Moreno Signed-off-by: Simon Horman --- python/ovs/flow/odp.py | 6 ++++-- python/ovs/tests/test_odp.py | 12 ++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/python/ovs/flow/odp.py b/python/ovs/flow/odp.py index 46697a1bc56..7d9b165d464 100644 --- a/python/ovs/flow/odp.py +++ b/python/ovs/flow/odp.py @@ -375,13 +375,15 @@ def _action_decoders_args(): KVDecoders( decoders=_decoders, default_free=decode_free_output, - ) + ), + is_list=True, ), "le": nested_kv_decoder( KVDecoders( decoders=_decoders, default_free=decode_free_output, - ) + ), + is_list=True, ), } ) diff --git a/python/ovs/tests/test_odp.py b/python/ovs/tests/test_odp.py index 401e16b7a42..f19ec386e8e 100644 --- a/python/ovs/tests/test_odp.py +++ b/python/ovs/tests/test_odp.py @@ -519,24 +519,24 @@ def test_odp_fields(input_string, expected): "check_pkt_len", { "size": 200, - "gt": {"output": {"port": 4}}, - "le": {"output": {"port": 5}}, + "gt": [{"output": {"port": 4}}], + "le": [{"output": {"port": 5}}], }, ), KeyValue( "check_pkt_len", { "size": 200, - "gt": {"drop": True}, - "le": {"output": {"port": 5}}, + "gt": [{"drop": True}], + "le": [{"output": {"port": 5}}], }, ), KeyValue( "check_pkt_len", { "size": 200, - "gt": {"ct": {"nat": True}}, - "le": {"drop": True}, + "gt": [{"ct": {"nat": True}}], + "le": [{"drop": True}], }, ), ], From 253d9007587469e9539833e4ecd7912cec5d3f03 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Wed, 17 Jan 2024 12:18:57 +0100 Subject: [PATCH 553/833] python: ovs: flow: Add meter_id to controller. Add missing option to controller action. 
Signed-off-by: Adrian Moreno Signed-off-by: Simon Horman --- python/ovs/flow/ofp_act.py | 1 + python/ovs/tests/test_ofp.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/python/ovs/flow/ofp_act.py b/python/ovs/flow/ofp_act.py index c540443eaea..2c85076a34c 100644 --- a/python/ovs/flow/ofp_act.py +++ b/python/ovs/flow/ofp_act.py @@ -54,6 +54,7 @@ def decode_controller(value): "id": decode_int, "userdata": decode_default, "pause": decode_flag, + "meter_id": decode_int, } ) )(value) diff --git a/python/ovs/tests/test_ofp.py b/python/ovs/tests/test_ofp.py index 7ba444b7bb4..d098520cae0 100644 --- a/python/ovs/tests/test_ofp.py +++ b/python/ovs/tests/test_ofp.py @@ -52,6 +52,21 @@ def do_test_section(input_string, section, expected): KeyValue("controller", {"max_len": 200}), ], ), + ( + "actions=controller(max_len=123,reason=no_match,id=456,userdata=00.00.00.12.00.00.00.00,meter_id=12)", # noqa: E501 + [ + KeyValue( + "controller", + { + "max_len": 123, + "reason": "no_match", + "id": 456, + "userdata": "00.00.00.12.00.00.00.00", + "meter_id": 12, + } + ), + ], + ), ( "actions=enqueue(foo,42),enqueue:foo:42,enqueue(bar,4242)", [ From 9e3c842d5793cb9a99f39efb67344cd702097b09 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Wed, 17 Jan 2024 09:21:22 -0500 Subject: [PATCH 554/833] dp-packet: Set checksum flags during software TSO. When OVS needs to fall back on the software TSO implementation to segment a packet, it currently doesn't guarantee that IP and TCP checksum offload flags are set. However, it is possible that these are required. This is true in the case of dp_netdev_upcall(), which clears these flags. This patch explicitly sets the appropriate flags when the segmentation flag is removed, to guarantee that packets always end up with correct checksums.
Fixes: 8b5fe2dc6080 ("userspace: Add Generic Segmentation Offloading.") Acked-by: Simon Horman Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/dp-packet.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/lib/dp-packet.h b/lib/dp-packet.h index 11aa0072354..ad272f581dc 100644 --- a/lib/dp-packet.h +++ b/lib/dp-packet.h @@ -1131,11 +1131,23 @@ dp_packet_hwol_set_tcp_seg(struct dp_packet *b) *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG; } -/* Resets TCP Segmentation flag in packet 'p'. */ +/* Resets TCP Segmentation in packet 'p' and adjust flags to indicate + * L3 and L4 checksumming is now required. */ static inline void dp_packet_hwol_reset_tcp_seg(struct dp_packet *p) { - *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_TX_TCP_SEG; + uint64_t ol_flags = *dp_packet_ol_flags_ptr(p) + | DP_PACKET_OL_TX_TCP_CKSUM; + + ol_flags = ol_flags & ~(DP_PACKET_OL_TX_TCP_SEG + | DP_PACKET_OL_RX_L4_CKSUM_GOOD + | DP_PACKET_OL_RX_IP_CKSUM_GOOD); + + if (ol_flags & DP_PACKET_OL_TX_IPV4) { + ol_flags |= DP_PACKET_OL_TX_IP_CKSUM; + } + + *dp_packet_ol_flags_ptr(p) = ol_flags; } /* Returns 'true' if the IP header has good integrity and the From 76e2f20d53d381eff7a01b13add337f1750c9fe0 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Wed, 17 Jan 2024 09:21:23 -0500 Subject: [PATCH 555/833] userspace: Correct IPv6 header in software-GSO. Correct the length field in IPv6 packets when applying software fallback GSO. Previously, the field retained an IPv4 header size, which was incorrect.
Fixes: 8b5fe2dc6080 ("userspace: Add Generic Segmentation Offloading.") Acked-by: Simon Horman Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/dp-packet-gso.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/dp-packet-gso.c b/lib/dp-packet-gso.c index e2c141b32a7..847685ad989 100644 --- a/lib/dp-packet-gso.c +++ b/lib/dp-packet-gso.c @@ -142,7 +142,7 @@ dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches) struct ovs_16aligned_ip6_hdr *ip6_hdr = dp_packet_l3(seg); ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen - = htons(sizeof *ip_hdr + dp_packet_l4_size(seg)); + = htons(dp_packet_l3_size(seg) - sizeof *ip6_hdr); } /* Update L4 header. */ From 0edfe05e4256f83b7a622a40b248883222f8b134 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Wed, 17 Jan 2024 09:21:24 -0500 Subject: [PATCH 556/833] netdev-dummy: Add support and test for TSO. Test that netdev-dummy is able to send and receive segment offloaded packets. Acked-by: Simon Horman Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/netdev-dummy.c | 30 +++++++++++++++++- tests/dpif-netdev.at | 74 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+), 1 deletion(-) diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c index 8c6e6d44870..d6ef865aa65 100644 --- a/lib/netdev-dummy.c +++ b/lib/netdev-dummy.c @@ -44,6 +44,7 @@ #include "unaligned.h" #include "timeval.h" #include "unixctl.h" +#include "userspace-tso.h" #include "reconnect.h" VLOG_DEFINE_THIS_MODULE(netdev_dummy); @@ -152,6 +153,8 @@ struct netdev_dummy { bool ol_ip_csum OVS_GUARDED; /* Flag RX packet with good csum. */ bool ol_ip_csum_set_good OVS_GUARDED; + /* Set the segment size for netdev TSO support. */ + int ol_tso_segsz OVS_GUARDED; }; /* Max 'recv_queue_len' in struct netdev_dummy. 
*/ @@ -806,6 +809,10 @@ netdev_dummy_get_config(const struct netdev *dev, struct smap *args) smap_add_format(args, "ol_ip_csum_set_good", "%s", "true"); } + if (netdev->ol_tso_segsz && userspace_tso_enabled()) { + smap_add_format(args, "ol_tso_segsz", "%d", netdev->ol_tso_segsz); + } + /* 'dummy-pmd' specific config. */ if (!netdev_is_pmd(dev)) { goto exit; @@ -937,6 +944,14 @@ netdev_dummy_set_config(struct netdev *netdev_, const struct smap *args, netdev_->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM; } + if (userspace_tso_enabled()) { + netdev->ol_tso_segsz = smap_get_int(args, "ol_tso_segsz", 0); + if (netdev->ol_tso_segsz) { + netdev_->ol_flags |= (NETDEV_TX_OFFLOAD_TCP_TSO + | NETDEV_TX_OFFLOAD_TCP_CKSUM); + } + } + netdev_change_seq_changed(netdev_); /* 'dummy-pmd' specific config. */ @@ -1119,6 +1134,13 @@ netdev_dummy_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch, /* The netdev hardware sets the flag when the packet has good csum. */ dp_packet_ol_set_ip_csum_good(packet); } + + if (userspace_tso_enabled() && netdev->ol_tso_segsz) { + dp_packet_set_tso_segsz(packet, netdev->ol_tso_segsz); + dp_packet_hwol_set_tcp_seg(packet); + dp_packet_hwol_set_csum_tcp(packet); + } + ovs_mutex_unlock(&netdev->mutex); dp_packet_batch_init_packet(batch, packet); @@ -1174,6 +1196,12 @@ netdev_dummy_send(struct netdev *netdev, int qid, DP_PACKET_BATCH_FOR_EACH(i, packet, batch) { const void *buffer = dp_packet_data(packet); size_t size = dp_packet_size(packet); + bool is_tso; + + ovs_mutex_lock(&dev->mutex); + is_tso = userspace_tso_enabled() && dev->ol_tso_segsz && + dp_packet_hwol_is_tso(packet); + ovs_mutex_unlock(&dev->mutex); if (!dp_packet_is_eth(packet)) { error = EPFNOSUPPORT; @@ -1194,7 +1222,7 @@ netdev_dummy_send(struct netdev *netdev, int qid, if (eth->eth_type == htons(ETH_TYPE_VLAN)) { max_size += VLAN_HEADER_LEN; } - if (size > max_size) { + if (size > max_size && !is_tso) { error = EMSGSIZE; break; } diff --git a/tests/dpif-netdev.at 
b/tests/dpif-netdev.at index c9474af0adb..e5b9e0c3396 100644 --- a/tests/dpif-netdev.at +++ b/tests/dpif-netdev.at @@ -810,6 +810,80 @@ AT_CHECK_UNQUOTED([tail -n 1 p2.pcap.txt], [0], [${good_expected} OVS_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([userspace offload - tso]) +OVS_VSWITCHD_START( + [set Open_vSwitch . other_config:userspace-tso-enable=true -- \ + add-br br1 -- set bridge br1 datapath-type=dummy -- \ + add-port br1 p1 -- \ + set Interface p1 type=dummy -- \ + add-port br1 p2 -- \ + set Interface p2 type=dummy]) + +dnl Simple passthrough rule. +AT_CHECK([ovs-ofctl add-flow br1 in_port=p1,actions=output:p2]) + +flow_s="in_port(1),eth(src=8a:bf:7e:2f:05:84,dst=0a:8f:39:4f:e0:73),eth_type(0x0800), \ + ipv4(src=192.168.123.2,dst=192.168.123.1,proto=6,tos=1,ttl=64,frag=no), \ + tcp(src=54392,dst=5201),tcp_flags(ack)" + +flow_s_v6="in_port(1),eth(src=8a:bf:7e:2f:05:84,dst=0a:8f:39:4f:e0:73),eth_type(0x86dd), \ + ipv6(src=2001:cafe::88,dst=2001:cafe::92,proto=6), \ + tcp(src=54392,dst=5201),tcp_flags(ack)" + +dnl Send from tso to no-tso. +AT_CHECK([ovs-vsctl set Interface p2 options:tx_pcap=p2.pcap -- \ + set Interface p1 options:ol_ip_csum=true -- \ + set Interface p1 options:ol_ip_csum_set_good=false -- \ + set Interface p1 options:ol_tso_segsz=500]) + +AT_CHECK([ovs-appctl netdev-dummy/receive p1 "${flow_s}" --len 2054]) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 "${flow_s_v6}" --len 2074]) + +dnl Send from tso to tso. 
+AT_CHECK([ovs-vsctl set Interface p2 options:ol_ip_csum=true -- \ + set Interface p2 options:ol_ip_csum_set_good=false -- \ + set Interface p2 options:ol_tso_segsz=500]) + +AT_CHECK([ovs-appctl netdev-dummy/receive p1 "${flow_s}" --len 2054]) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 "${flow_s_v6}" --len 2074]) + +dnl Check that first we have: +dnl - 4x IPv4 500 byte payloads +dnl - 4x IPv6 500 byte payloads +dnl - one IPv4 2000 byte payload, and +dnl - one IPv6 2000 byte payload +zero500=$(printf '0%.0s' $(seq 1000)) +AT_CHECK_UNQUOTED([ovs-pcap p2.pcap], [0], [dnl +[0a8f394fe0738abf7e2f058408004501021c0000000040060187c0a87b02c0a87b01]dnl +[d47814510000000000000000501000004dc20000${zero500}] +[0a8f394fe0738abf7e2f058408004501021c0001000040060186c0a87b02c0a87b01]dnl +[d4781451000001f400000000501000004bce0000${zero500}] +[0a8f394fe0738abf7e2f058408004501021c0002000040060185c0a87b02c0a87b01]dnl +[d4781451000003e8000000005010000049da0000${zero500}] +[0a8f394fe0738abf7e2f058408004501021c0003000040060184c0a87b02c0a87b01]dnl +[d4781451000005dc000000005010000047e60000${zero500}] +[0a8f394fe0738abf7e2f058486dd60000000020806002001cafe0000000000000000000000]dnl +[882001cafe000000000000000000000092d4781451000000000000000050100000edfd0000]dnl +[${zero500}] +[0a8f394fe0738abf7e2f058486dd60000000020806002001cafe0000000000000000000000]dnl +[882001cafe000000000000000000000092d4781451000001f40000000050100000ec090000]dnl +[${zero500}] +[0a8f394fe0738abf7e2f058486dd60000000020806002001cafe0000000000000000000000]dnl +[882001cafe000000000000000000000092d4781451000003e80000000050100000ea150000]dnl +[${zero500}] +[0a8f394fe0738abf7e2f058486dd60000000020806002001cafe0000000000000000000000]dnl +[882001cafe000000000000000000000092d4781451000005dc0000000050100000e8210000]dnl +[${zero500}] +[0a8f394fe0738abf7e2f05840800450107f8000000004006fbaac0a87b02c0a87b01]dnl +[d478145100000000000000005010000047e60000${zero500}${zero500}${zero500}${zero500}] 
+[0a8f394fe0738abf7e2f058486dd6000000007e406002001cafe0000000000000000000000]dnl +[882001cafe000000000000000000000092d4781451000000000000000050100000e8210000]dnl +[${zero500}${zero500}${zero500}${zero500}] +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([dpif-netdev - revalidators handle dp modification fail correctly]) OVS_VSWITCHD_START( [add-port br0 p1 \ From 084c8087292c37fc26d278442dd9ed67d505b7d8 Mon Sep 17 00:00:00 2001 From: Dexia Li Date: Wed, 17 Jan 2024 14:26:30 -0500 Subject: [PATCH 557/833] userspace: Support VXLAN and GENEVE TSO. For the userspace datapath, this patch provides VXLAN and GENEVE tunnel TSO. Only userspace VXLAN and GENEVE tunnels are supported; tunnel outer and inner checksum offload are supported as well. If a netdev does not support the offload features, there is a software fallback. If a netdev does not support VXLAN and GENEVE TSO, packets will be dropped. Front-end devices can also turn off offload features with ethtool. Acked-by: Simon Horman Signed-off-by: Dexia Li Co-authored-by: Mike Pattrick Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/dp-packet.c | 41 +++++++- lib/dp-packet.h | 200 +++++++++++++++++++++++++++++++++++++--- lib/dpif-netdev.c | 4 +- lib/flow.c | 2 +- lib/netdev-dpdk.c | 86 +++++++++++++++-- lib/netdev-dummy.c | 2 +- lib/netdev-native-tnl.c | 101 +++++++++++++++++++- lib/netdev-provider.h | 4 + lib/netdev.c | 53 +++++++++-- lib/packets.c | 12 +-- lib/packets.h | 6 +- tests/dpif-netdev.at | 4 +- 12 files changed, 461 insertions(+), 54 deletions(-) diff --git a/lib/dp-packet.c b/lib/dp-packet.c index 920402369de..e7738c37a0c 100644 --- a/lib/dp-packet.c +++ b/lib/dp-packet.c @@ -546,16 +546,47 @@ dp_packet_compare_offsets(struct dp_packet *b1, struct dp_packet *b2, return true; } +void +dp_packet_tnl_outer_ol_send_prepare(struct dp_packet *p, + uint64_t flags) +{ + if (dp_packet_hwol_is_outer_ipv4_cksum(p)) { + if (!(flags & NETDEV_TX_OFFLOAD_OUTER_IP_CKSUM)) { + dp_packet_ip_set_header_csum(p, false); +
dp_packet_ol_set_ip_csum_good(p); + dp_packet_hwol_reset_outer_ipv4_csum(p); + } + } + + if (!dp_packet_hwol_is_outer_udp_cksum(p)) { + return; + } + + if (!(flags & NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM)) { + packet_udp_complete_csum(p, false); + dp_packet_ol_set_l4_csum_good(p); + dp_packet_hwol_reset_outer_udp_csum(p); + } +} + /* Checks if the packet 'p' is compatible with netdev_ol_flags 'flags' * and if not, updates the packet with the software fall back. */ void dp_packet_ol_send_prepare(struct dp_packet *p, uint64_t flags) { + bool tnl_inner = false; + + if (dp_packet_hwol_is_tunnel_geneve(p) || + dp_packet_hwol_is_tunnel_vxlan(p)) { + dp_packet_tnl_outer_ol_send_prepare(p, flags); + tnl_inner = true; + } + if (dp_packet_hwol_tx_ip_csum(p)) { if (dp_packet_ip_checksum_good(p)) { dp_packet_hwol_reset_tx_ip_csum(p); } else if (!(flags & NETDEV_TX_OFFLOAD_IPV4_CKSUM)) { - dp_packet_ip_set_header_csum(p); + dp_packet_ip_set_header_csum(p, tnl_inner); dp_packet_ol_set_ip_csum_good(p); dp_packet_hwol_reset_tx_ip_csum(p); } @@ -565,24 +596,24 @@ dp_packet_ol_send_prepare(struct dp_packet *p, uint64_t flags) return; } - if (dp_packet_l4_checksum_good(p)) { + if (dp_packet_l4_checksum_good(p) && !tnl_inner) { dp_packet_hwol_reset_tx_l4_csum(p); return; } if (dp_packet_hwol_l4_is_tcp(p) && !(flags & NETDEV_TX_OFFLOAD_TCP_CKSUM)) { - packet_tcp_complete_csum(p); + packet_tcp_complete_csum(p, tnl_inner); dp_packet_ol_set_l4_csum_good(p); dp_packet_hwol_reset_tx_l4_csum(p); } else if (dp_packet_hwol_l4_is_udp(p) && !(flags & NETDEV_TX_OFFLOAD_UDP_CKSUM)) { - packet_udp_complete_csum(p); + packet_udp_complete_csum(p, tnl_inner); dp_packet_ol_set_l4_csum_good(p); dp_packet_hwol_reset_tx_l4_csum(p); } else if (!(flags & NETDEV_TX_OFFLOAD_SCTP_CKSUM) && dp_packet_hwol_l4_is_sctp(p)) { - packet_sctp_complete_csum(p); + packet_sctp_complete_csum(p, tnl_inner); dp_packet_ol_set_l4_csum_good(p); dp_packet_hwol_reset_tx_l4_csum(p); } diff --git a/lib/dp-packet.h b/lib/dp-packet.h 
index ad272f581dc..ee1f0734ad9 100644 --- a/lib/dp-packet.h +++ b/lib/dp-packet.h @@ -86,22 +86,47 @@ enum dp_packet_offload_mask { DEF_OL_FLAG(DP_PACKET_OL_TX_SCTP_CKSUM, RTE_MBUF_F_TX_SCTP_CKSUM, 0x800), /* Offload IP checksum. */ DEF_OL_FLAG(DP_PACKET_OL_TX_IP_CKSUM, RTE_MBUF_F_TX_IP_CKSUM, 0x1000), + /* Offload packet is tunnel GENEVE. */ + DEF_OL_FLAG(DP_PACKET_OL_TX_TUNNEL_GENEVE, + RTE_MBUF_F_TX_TUNNEL_GENEVE, 0x2000), + /* Offload packet is tunnel VXLAN. */ + DEF_OL_FLAG(DP_PACKET_OL_TX_TUNNEL_VXLAN, + RTE_MBUF_F_TX_TUNNEL_VXLAN, 0x4000), + /* Offload tunnel packet, outer header is IPv4. */ + DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_IPV4, + RTE_MBUF_F_TX_OUTER_IPV4, 0x8000), + /* Offload tunnel outer IPv4 checksum. */ + DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_IP_CKSUM, + RTE_MBUF_F_TX_OUTER_IP_CKSUM, 0x10000), + /* Offload tunnel outer UDP checksum. */ + DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_UDP_CKSUM, + RTE_MBUF_F_TX_OUTER_UDP_CKSUM, 0x20000), + /* Offload tunnel packet, outer header is IPv6. */ + DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_IPV6, + RTE_MBUF_F_TX_OUTER_IPV6, 0x40000), + /* Adding new field requires adding to DP_PACKET_OL_SUPPORTED_MASK. 
*/ }; -#define DP_PACKET_OL_SUPPORTED_MASK (DP_PACKET_OL_RSS_HASH | \ - DP_PACKET_OL_FLOW_MARK | \ - DP_PACKET_OL_RX_L4_CKSUM_BAD | \ - DP_PACKET_OL_RX_IP_CKSUM_BAD | \ - DP_PACKET_OL_RX_L4_CKSUM_GOOD | \ - DP_PACKET_OL_RX_IP_CKSUM_GOOD | \ - DP_PACKET_OL_TX_TCP_SEG | \ - DP_PACKET_OL_TX_IPV4 | \ - DP_PACKET_OL_TX_IPV6 | \ - DP_PACKET_OL_TX_TCP_CKSUM | \ - DP_PACKET_OL_TX_UDP_CKSUM | \ - DP_PACKET_OL_TX_SCTP_CKSUM | \ - DP_PACKET_OL_TX_IP_CKSUM) +#define DP_PACKET_OL_SUPPORTED_MASK (DP_PACKET_OL_RSS_HASH | \ + DP_PACKET_OL_FLOW_MARK | \ + DP_PACKET_OL_RX_L4_CKSUM_BAD | \ + DP_PACKET_OL_RX_IP_CKSUM_BAD | \ + DP_PACKET_OL_RX_L4_CKSUM_GOOD | \ + DP_PACKET_OL_RX_IP_CKSUM_GOOD | \ + DP_PACKET_OL_TX_TCP_SEG | \ + DP_PACKET_OL_TX_IPV4 | \ + DP_PACKET_OL_TX_IPV6 | \ + DP_PACKET_OL_TX_TCP_CKSUM | \ + DP_PACKET_OL_TX_UDP_CKSUM | \ + DP_PACKET_OL_TX_SCTP_CKSUM | \ + DP_PACKET_OL_TX_IP_CKSUM | \ + DP_PACKET_OL_TX_TUNNEL_GENEVE | \ + DP_PACKET_OL_TX_TUNNEL_VXLAN | \ + DP_PACKET_OL_TX_OUTER_IPV4 | \ + DP_PACKET_OL_TX_OUTER_IP_CKSUM | \ + DP_PACKET_OL_TX_OUTER_UDP_CKSUM | \ + DP_PACKET_OL_TX_OUTER_IPV6) #define DP_PACKET_OL_TX_L4_MASK (DP_PACKET_OL_TX_TCP_CKSUM | \ DP_PACKET_OL_TX_UDP_CKSUM | \ @@ -139,6 +164,10 @@ struct dp_packet { * or UINT16_MAX. */ uint16_t l4_ofs; /* Transport-level header offset, or UINT16_MAX. */ + uint16_t inner_l3_ofs; /* Inner Network-level header offset, + * or UINT16_MAX. */ + uint16_t inner_l4_ofs; /* Inner Transport-level header offset, + or UINT16_MAX. */ uint32_t cutlen; /* length in bytes to cut from the end. */ ovs_be32 packet_type; /* Packet type as defined in OpenFlow */ uint16_t csum_start; /* Position to start checksumming from. */ @@ -250,6 +279,7 @@ bool dp_packet_compare_offsets(struct dp_packet *good, struct dp_packet *test, struct ds *err_str); void dp_packet_ol_send_prepare(struct dp_packet *, uint64_t); +void dp_packet_tnl_outer_ol_send_prepare(struct dp_packet *, uint64_t); /* Frees memory that 'b' points to, as well as 'b' itself. 
*/ @@ -482,6 +512,22 @@ dp_packet_l4_size(const struct dp_packet *b) : 0; } +static inline void * +dp_packet_inner_l3(const struct dp_packet *b) +{ + return b->inner_l3_ofs != UINT16_MAX + ? (char *) dp_packet_data(b) + b->inner_l3_ofs + : NULL; +} + +static inline void * +dp_packet_inner_l4(const struct dp_packet *b) +{ + return b->inner_l4_ofs != UINT16_MAX + ? (char *) dp_packet_data(b) + b->inner_l4_ofs + : NULL; +} + static inline const void * dp_packet_get_tcp_payload(const struct dp_packet *b) { @@ -539,6 +585,25 @@ dp_packet_get_nd_payload(const struct dp_packet *b) } #ifdef DPDK_NETDEV +static inline void +dp_packet_set_l2_len(struct dp_packet *b, size_t l2_len) +{ + b->mbuf.l2_len = l2_len; +} + +static inline void +dp_packet_set_l3_len(struct dp_packet *b, size_t l3_len) +{ + b->mbuf.l3_len = l3_len; +} + +static inline void +dp_packet_set_l4_len(struct dp_packet *b, size_t l4_len) +{ + b->mbuf.l4_len = l4_len; +} + + static inline uint64_t * dp_packet_ol_flags_ptr(const struct dp_packet *b) { @@ -558,6 +623,24 @@ dp_packet_flow_mark_ptr(const struct dp_packet *b) } #else +static inline void +dp_packet_set_l2_len(struct dp_packet *b OVS_UNUSED, size_t l2_len OVS_UNUSED) +{ + /* There is no implementation. */ +} + +static inline void +dp_packet_set_l3_len(struct dp_packet *b OVS_UNUSED, size_t l3_len OVS_UNUSED) +{ + /* There is no implementation. */ +} + +static inline void +dp_packet_set_l4_len(struct dp_packet *b OVS_UNUSED, size_t l4_len OVS_UNUSED) +{ + /* There is no implementation. */ +} + static inline uint32_t * dp_packet_ol_flags_ptr(const struct dp_packet *b) { @@ -619,6 +702,8 @@ dp_packet_set_size(struct dp_packet *b, uint32_t v) * (and thus 'v') will always be <= UINT16_MAX; this means that there is no * loss of accuracy in assigning 'v' to 'data_len'. */ + + ovs_assert(v <= UINT16_MAX); b->mbuf.data_len = (uint16_t)v; /* Current seg length. */ b->mbuf.pkt_len = v; /* Total length of all segments linked to * this segment. 
*/ @@ -1056,6 +1141,36 @@ dp_packet_hwol_l4_is_sctp(struct dp_packet *b) DP_PACKET_OL_TX_SCTP_CKSUM; } +/* Returns 'true' if packet 'b' is marked for tunnel GENEVE + * checksum offloading. */ +static inline bool +dp_packet_hwol_is_tunnel_geneve(struct dp_packet *b) +{ + return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_TUNNEL_GENEVE); +} + +/* Returns 'true' if packet 'b' is marked for tunnel VXLAN + * checksum offloading. */ +static inline bool +dp_packet_hwol_is_tunnel_vxlan(struct dp_packet *b) +{ + return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_TUNNEL_VXLAN); +} + +/* Returns 'true' if packet 'b' is marked for outer IPv4 checksum offload. */ +static inline bool +dp_packet_hwol_is_outer_ipv4_cksum(struct dp_packet *b) +{ + return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_OUTER_IP_CKSUM); +} + +/* Returns 'true' if packet 'b' is marked for outer UDP checksum offload. */ +static inline bool +dp_packet_hwol_is_outer_udp_cksum(struct dp_packet *b) +{ + return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_OUTER_UDP_CKSUM); +} + static inline void dp_packet_hwol_reset_tx_l4_csum(struct dp_packet *p) { @@ -1078,6 +1193,14 @@ dp_packet_hwol_set_tx_ipv6(struct dp_packet *a) *dp_packet_ol_flags_ptr(a) |= DP_PACKET_OL_TX_IPV6; } +/* Mark packet 'a' as a tunnel packet with outer IPv6 header. */ +static inline void +dp_packet_hwol_set_tx_outer_ipv6(struct dp_packet *a) +{ + *dp_packet_ol_flags_ptr(a) &= ~DP_PACKET_OL_TX_OUTER_IPV4; + *dp_packet_ol_flags_ptr(a) |= DP_PACKET_OL_TX_OUTER_IPV6; +} + /* Returns 'true' if packet 'p' is marked for IPv4 checksum offloading. */ static inline bool dp_packet_hwol_tx_ip_csum(const struct dp_packet *p) @@ -1131,6 +1254,53 @@ dp_packet_hwol_set_tcp_seg(struct dp_packet *b) *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG; } +/* Mark packet 'b' for tunnel GENEVE offloading. 
*/ +static inline void +dp_packet_hwol_set_tunnel_geneve(struct dp_packet *b) +{ + *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TUNNEL_GENEVE; +} + +/* Mark packet 'b' for tunnel VXLAN offloading. */ +static inline void +dp_packet_hwol_set_tunnel_vxlan(struct dp_packet *b) +{ + *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TUNNEL_VXLAN; +} + +/* Mark packet 'b' as a tunnel packet with outer IPv4 header. */ +static inline void +dp_packet_hwol_set_tx_outer_ipv4(struct dp_packet *b) +{ + *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_OUTER_IPV4; +} + +/* Mark packet 'b' for csum offloading in outer IPv4 header. */ +static inline void +dp_packet_hwol_set_tx_outer_ipv4_csum(struct dp_packet *b) +{ + *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_OUTER_IP_CKSUM; +} + +static inline void +dp_packet_hwol_reset_outer_ipv4_csum(struct dp_packet *p) +{ + *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_TX_OUTER_IP_CKSUM; +} + +static inline void +dp_packet_hwol_reset_outer_udp_csum(struct dp_packet *p) +{ + *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_TX_OUTER_UDP_CKSUM; +} + +/* Mark packet 'b' for csum offloading in outer UDP header. */ +static inline void +dp_packet_hwol_set_outer_udp_csum(struct dp_packet *b) +{ + *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_OUTER_UDP_CKSUM; +} + /* Resets TCP Segmentation in packet 'p' and adjust flags to indicate * L3 and L4 checksumming is now required. */ static inline void @@ -1184,9 +1354,9 @@ dp_packet_ip_checksum_bad(const struct dp_packet *p) /* Calculate and set the IPv4 header checksum in packet 'p'. */ static inline void -dp_packet_ip_set_header_csum(struct dp_packet *p) +dp_packet_ip_set_header_csum(struct dp_packet *p, bool inner) { - struct ip_header *ip = dp_packet_l3(p); + struct ip_header *ip = (inner) ? 
dp_packet_inner_l3(p) : dp_packet_l3(p); ovs_assert(ip); ip->ip_csum = 0; diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index df5bbf85a05..c1981137f92 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -8194,7 +8194,9 @@ dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_, ds_destroy(&ds); } - dp_packet_ol_send_prepare(packet_, 0); + if (type != DPIF_UC_MISS) { + dp_packet_ol_send_prepare(packet_, 0); + } return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata, actions, wc, put_actions, dp->upcall_aux); diff --git a/lib/flow.c b/lib/flow.c index b8f99f66be9..82d93570adb 100644 --- a/lib/flow.c +++ b/lib/flow.c @@ -3278,7 +3278,7 @@ packet_expand(struct dp_packet *p, const struct flow *flow, size_t size) if (dp_packet_hwol_tx_ip_csum(p)) { dp_packet_ol_reset_ip_csum_good(p); } else { - dp_packet_ip_set_header_csum(p); + dp_packet_ip_set_header_csum(p, false); dp_packet_ol_set_ip_csum_good(p); } pseudo_hdr_csum = packet_csum_pseudoheader(ip); diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 1ff25c24692..fb26825ff85 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -416,6 +416,10 @@ enum dpdk_hw_ol_features { NETDEV_TX_UDP_CKSUM_OFFLOAD = 1 << 5, NETDEV_TX_SCTP_CKSUM_OFFLOAD = 1 << 6, NETDEV_TX_TSO_OFFLOAD = 1 << 7, + NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD = 1 << 8, + NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD = 1 << 9, + NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD = 1 << 10, + NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD = 1 << 11, }; enum dpdk_rx_steer_flags { @@ -1075,6 +1079,14 @@ netdev_dpdk_update_netdev_flags(struct netdev_dpdk *dev) NETDEV_TX_OFFLOAD_SCTP_CKSUM); netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_TSO_OFFLOAD, NETDEV_TX_OFFLOAD_TCP_TSO); + netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD, + NETDEV_TX_VXLAN_TNL_TSO); + netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD, + NETDEV_TX_GENEVE_TNL_TSO); + netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD, + 
NETDEV_TX_OFFLOAD_OUTER_IP_CKSUM); + netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD, + NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM); } static int @@ -1129,6 +1141,22 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq) conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_TCP_TSO; } + if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) { + conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO; + } + + if (dev->hw_ol_features & NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD) { + conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO; + } + + if (dev->hw_ol_features & NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD) { + conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_OUTER_IPV4_CKSUM; + } + + if (dev->hw_ol_features & NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD) { + conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM; + } + /* Limit configured rss hash functions to only those supported * by the eth device. */ conf.rx_adv_conf.rss_conf.rss_hf &= info.flow_type_rss_offloads; @@ -1346,6 +1374,18 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev) dev->hw_ol_features &= ~NETDEV_TX_SCTP_CKSUM_OFFLOAD; } + if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_IPV4_CKSUM) { + dev->hw_ol_features |= NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD; + } else { + dev->hw_ol_features &= ~NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD; + } + + if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM) { + dev->hw_ol_features |= NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD; + } else { + dev->hw_ol_features &= ~NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD; + } + dev->hw_ol_features &= ~NETDEV_TX_TSO_OFFLOAD; if (userspace_tso_enabled()) { if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_TCP_TSO) { @@ -1354,6 +1394,20 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev) VLOG_WARN("%s: Tx TSO offload is not supported.", netdev_get_name(&dev->up)); } + + if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO) { + dev->hw_ol_features |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD; + } else { + VLOG_WARN("%s: Tx Vxlan tunnel TSO offload is not supported.", 
+ netdev_get_name(&dev->up)); + } + + if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO) { + dev->hw_ol_features |= NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD; + } else { + VLOG_WARN("%s: Tx Geneve tunnel TSO offload is not supported.", + netdev_get_name(&dev->up)); + } } n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq); @@ -2479,11 +2533,23 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) return true; } - mbuf->l2_len = (char *) dp_packet_l3(pkt) - (char *) dp_packet_eth(pkt); - mbuf->l3_len = (char *) dp_packet_l4(pkt) - (char *) dp_packet_l3(pkt); - mbuf->l4_len = 0; - mbuf->outer_l2_len = 0; - mbuf->outer_l3_len = 0; + /* If packet is vxlan or geneve tunnel packet, calculate outer + * l2 len and outer l3 len. Inner l2/l3/l4 len are calculated + * before. */ + if (mbuf->ol_flags & + (RTE_MBUF_F_TX_TUNNEL_GENEVE | RTE_MBUF_F_TX_TUNNEL_VXLAN)) { + mbuf->outer_l2_len = (char *) dp_packet_l3(pkt) - + (char *) dp_packet_eth(pkt); + mbuf->outer_l3_len = (char *) dp_packet_l4(pkt) - + (char *) dp_packet_l3(pkt); + } else { + mbuf->l2_len = (char *) dp_packet_l3(pkt) - + (char *) dp_packet_eth(pkt); + mbuf->l3_len = (char *) dp_packet_l4(pkt) - + (char *) dp_packet_l3(pkt); + mbuf->outer_l2_len = 0; + mbuf->outer_l3_len = 0; + } th = dp_packet_l4(pkt); if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) { @@ -2501,8 +2567,14 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) return false; } - mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4; - mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len; + if (mbuf->ol_flags & (RTE_MBUF_F_TX_TUNNEL_GENEVE | + RTE_MBUF_F_TX_TUNNEL_VXLAN)) { + mbuf->tso_segsz = dev->mtu - mbuf->l2_len - mbuf->l3_len - + mbuf->l4_len - mbuf->outer_l3_len; + } else { + mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4; + mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len; + } if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) { int hdr_len = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len; diff --git 
a/lib/netdev-dummy.c b/lib/netdev-dummy.c index d6ef865aa65..cd7e85a8188 100644 --- a/lib/netdev-dummy.c +++ b/lib/netdev-dummy.c @@ -1230,7 +1230,7 @@ netdev_dummy_send(struct netdev *netdev, int qid, if (dp_packet_hwol_tx_ip_csum(packet) && !dp_packet_ip_checksum_good(packet)) { - dp_packet_ip_set_header_csum(packet); + dp_packet_ip_set_header_csum(packet, false); dp_packet_ol_set_ip_csum_good(packet); } diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c index a0682c70fbb..fa87c6281d5 100644 --- a/lib/netdev-native-tnl.c +++ b/lib/netdev-native-tnl.c @@ -173,15 +173,29 @@ netdev_tnl_push_ip_header(struct dp_packet *packet, const void *header, ip6->ip6_plen = htons(*ip_tot_size); packet_set_ipv6_flow_label(&ip6->ip6_flow, ipv6_label); packet->l4_ofs = dp_packet_size(packet) - *ip_tot_size; - dp_packet_hwol_set_tx_ipv6(packet); + + if (dp_packet_hwol_is_tunnel_geneve(packet) || + dp_packet_hwol_is_tunnel_vxlan(packet)) { + dp_packet_hwol_set_tx_outer_ipv6(packet); + } else { + dp_packet_hwol_set_tx_ipv6(packet); + } + dp_packet_ol_reset_ip_csum_good(packet); return ip6 + 1; } else { ip = netdev_tnl_ip_hdr(eth); ip->ip_tot_len = htons(*ip_tot_size); /* Postpone checksum to when the packet is pushed to the port. */ - dp_packet_hwol_set_tx_ipv4(packet); - dp_packet_hwol_set_tx_ip_csum(packet); + if (dp_packet_hwol_is_tunnel_geneve(packet) || + dp_packet_hwol_is_tunnel_vxlan(packet)) { + dp_packet_hwol_set_tx_outer_ipv4(packet); + dp_packet_hwol_set_tx_outer_ipv4_csum(packet); + } else { + dp_packet_hwol_set_tx_ipv4(packet); + dp_packet_hwol_set_tx_ip_csum(packet); + } + dp_packet_ol_reset_ip_csum_good(packet); *ip_tot_size -= IP_HEADER_LEN; packet->l4_ofs = dp_packet_size(packet) - *ip_tot_size; @@ -226,6 +240,74 @@ udp_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl, return udp + 1; } +/* Calculate inner l2 l3 l4 len as tunnel outer header is not + * encapsulated now. 
*/ +static void +dp_packet_tnl_ol_process(struct dp_packet *packet, + const struct ovs_action_push_tnl *data) +{ + struct udp_header *udp = NULL; + uint8_t opt_len = 0; + struct eth_header *eth = NULL; + struct ip_header *ip = NULL; + struct genevehdr *gnh = NULL; + + /* l2 l3 l4 len refer to inner len, tunnel outer + * header is not encapsulated here. */ + if (dp_packet_hwol_l4_mask(packet)) { + ip = dp_packet_l3(packet); + + if (ip->ip_proto == IPPROTO_TCP) { + struct tcp_header *th = dp_packet_l4(packet); + dp_packet_set_l4_len(packet, TCP_OFFSET(th->tcp_ctl) * 4); + } else if (ip->ip_proto == IPPROTO_UDP) { + dp_packet_set_l4_len(packet, UDP_HEADER_LEN); + } else if (ip->ip_proto == IPPROTO_SCTP) { + dp_packet_set_l4_len(packet, SCTP_HEADER_LEN); + } + + dp_packet_set_l3_len(packet, (char *) dp_packet_l4(packet) - + (char *) dp_packet_l3(packet)); + + if (data->tnl_type == OVS_VPORT_TYPE_GENEVE || + data->tnl_type == OVS_VPORT_TYPE_VXLAN) { + + if (IP_VER(ip->ip_ihl_ver) == 4) { + dp_packet_hwol_set_tx_ipv4(packet); + dp_packet_hwol_tx_ip_csum(packet); + } else if (IP_VER(ip->ip_ihl_ver) == 6) { + dp_packet_hwol_set_tx_ipv6(packet); + } + } + + /* Attention please, tunnel inner l2 len is consist of udp header + * len and tunnel header len and inner l2 len. 
*/ + if (data->tnl_type == OVS_VPORT_TYPE_GENEVE) { + eth = (struct eth_header *)(data->header); + ip = (struct ip_header *)(eth + 1); + udp = (struct udp_header *)(ip + 1); + gnh = (struct genevehdr *)(udp + 1); + opt_len = gnh->opt_len * 4; + dp_packet_hwol_set_tunnel_geneve(packet); + dp_packet_set_l2_len(packet, (char *) dp_packet_l3(packet) - + (char *) dp_packet_eth(packet) + + GENEVE_BASE_HLEN + opt_len); + + packet->inner_l3_ofs = packet->l3_ofs + GENEVE_BASE_HLEN + opt_len; + packet->inner_l4_ofs = packet->l4_ofs + GENEVE_BASE_HLEN + opt_len; + + } else if (data->tnl_type == OVS_VPORT_TYPE_VXLAN) { + dp_packet_hwol_set_tunnel_vxlan(packet); + dp_packet_set_l2_len(packet, (char *) dp_packet_l3(packet) - + (char *) dp_packet_eth(packet) + + VXLAN_HLEN); + + packet->inner_l3_ofs = packet->l3_ofs + VXLAN_HLEN; + packet->inner_l4_ofs = packet->l4_ofs + VXLAN_HLEN; + } + } +} + void netdev_tnl_push_udp_header(const struct netdev *netdev OVS_UNUSED, struct dp_packet *packet, @@ -234,6 +316,7 @@ netdev_tnl_push_udp_header(const struct netdev *netdev OVS_UNUSED, struct udp_header *udp; int ip_tot_size; + dp_packet_tnl_ol_process(packet, data); udp = netdev_tnl_push_ip_header(packet, data->header, data->header_len, &ip_tot_size, 0); @@ -241,13 +324,21 @@ netdev_tnl_push_udp_header(const struct netdev *netdev OVS_UNUSED, udp->udp_src = netdev_tnl_get_src_port(packet); udp->udp_len = htons(ip_tot_size); - /* Postpone checksum to the egress netdev. 
*/ - dp_packet_hwol_set_csum_udp(packet); if (udp->udp_csum) { dp_packet_ol_reset_l4_csum_good(packet); + if (dp_packet_hwol_is_tunnel_geneve(packet) || + dp_packet_hwol_is_tunnel_vxlan(packet)) { + dp_packet_hwol_set_outer_udp_csum(packet); + } else { + dp_packet_hwol_set_csum_udp(packet); + } } else { dp_packet_ol_set_l4_csum_good(packet); } + + packet->inner_l3_ofs += packet->l4_ofs; + packet->inner_l4_ofs += packet->l4_ofs; + } static void * diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h index a7393c7cecf..22840a058b7 100644 --- a/lib/netdev-provider.h +++ b/lib/netdev-provider.h @@ -43,6 +43,10 @@ enum netdev_ol_flags { NETDEV_TX_OFFLOAD_UDP_CKSUM = 1 << 2, NETDEV_TX_OFFLOAD_SCTP_CKSUM = 1 << 3, NETDEV_TX_OFFLOAD_TCP_TSO = 1 << 4, + NETDEV_TX_VXLAN_TNL_TSO = 1 << 5, + NETDEV_TX_GENEVE_TNL_TSO = 1 << 6, + NETDEV_TX_OFFLOAD_OUTER_IP_CKSUM = 1 << 7, + NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM = 1 << 8, }; /* A network device (e.g. an Ethernet device). diff --git a/lib/netdev.c b/lib/netdev.c index 3ed8049f76a..f2d921ed633 100644 --- a/lib/netdev.c +++ b/lib/netdev.c @@ -69,6 +69,8 @@ COVERAGE_DEFINE(netdev_received); COVERAGE_DEFINE(netdev_sent); COVERAGE_DEFINE(netdev_add_router); COVERAGE_DEFINE(netdev_get_stats); +COVERAGE_DEFINE(netdev_vxlan_tso_drops); +COVERAGE_DEFINE(netdev_geneve_tso_drops); COVERAGE_DEFINE(netdev_push_header_drops); COVERAGE_DEFINE(netdev_soft_seg_good); COVERAGE_DEFINE(netdev_soft_seg_drops); @@ -912,6 +914,23 @@ netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch, !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) { DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { if (dp_packet_hwol_is_tso(packet)) { + if (dp_packet_hwol_is_tunnel_vxlan(packet) + && !(netdev_flags & NETDEV_TX_VXLAN_TNL_TSO)) { + VLOG_WARN_RL(&rl, "%s: No VXLAN TSO support", + netdev_get_name(netdev)); + COVERAGE_INC(netdev_vxlan_tso_drops); + dp_packet_delete_batch(batch, true); + return false; + } + + if (dp_packet_hwol_is_tunnel_geneve(packet) + && 
!(netdev_flags & NETDEV_TX_GENEVE_TNL_TSO)) { + VLOG_WARN_RL(&rl, "%s: No GENEVE TSO support", + netdev_get_name(netdev)); + COVERAGE_INC(netdev_geneve_tso_drops); + dp_packet_delete_batch(batch, true); + return false; + } return netdev_send_tso(netdev, qid, batch, concurrent_txq); } } @@ -990,17 +1009,31 @@ netdev_push_header(const struct netdev *netdev, size_t i, size = dp_packet_batch_size(batch); DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) { - if (OVS_UNLIKELY(dp_packet_hwol_is_tso(packet))) { + if (OVS_UNLIKELY(data->tnl_type != OVS_VPORT_TYPE_GENEVE && + data->tnl_type != OVS_VPORT_TYPE_VXLAN && + dp_packet_hwol_is_tso(packet))) { COVERAGE_INC(netdev_push_header_drops); dp_packet_delete(packet); - VLOG_WARN_RL(&rl, "%s: Tunneling packets with TSO is " - "not supported: packet dropped", - netdev_get_name(netdev)); + VLOG_WARN_RL(&rl, "%s: Tunneling packets with TSO is not " + "supported for %s tunnels: packet dropped", + netdev_get_name(netdev), netdev_get_type(netdev)); } else { - /* The packet is going to be encapsulated and there is - * no support yet for inner network header csum offloading. 
*/ - dp_packet_ol_send_prepare(packet, 0); - + if (data->tnl_type != OVS_VPORT_TYPE_GENEVE && + data->tnl_type != OVS_VPORT_TYPE_VXLAN) { + dp_packet_ol_send_prepare(packet, 0); + } else if (dp_packet_hwol_is_tunnel_geneve(packet) || + dp_packet_hwol_is_tunnel_vxlan(packet)) { + if (dp_packet_hwol_is_tso(packet)) { + COVERAGE_INC(netdev_push_header_drops); + dp_packet_delete(packet); + VLOG_WARN_RL(&rl, "%s: Tunneling packets with TSO is not " + "supported with multiple levels of " + "VXLAN or GENEVE encapsulation.", + netdev_get_name(netdev)); + continue; + } + dp_packet_ol_send_prepare(packet, 0); + } netdev->netdev_class->push_header(netdev, packet, data); pkt_metadata_init(&packet->md, data->out_port); @@ -1446,6 +1479,10 @@ netdev_get_status(const struct netdev *netdev, struct smap *smap) OL_ADD_STAT("udp_csum", NETDEV_TX_OFFLOAD_UDP_CKSUM); OL_ADD_STAT("sctp_csum", NETDEV_TX_OFFLOAD_SCTP_CKSUM); OL_ADD_STAT("tcp_seg", NETDEV_TX_OFFLOAD_TCP_TSO); + OL_ADD_STAT("vxlan_tso", NETDEV_TX_VXLAN_TNL_TSO); + OL_ADD_STAT("geneve_tso", NETDEV_TX_GENEVE_TNL_TSO); + OL_ADD_STAT("out_ip_csum", NETDEV_TX_OFFLOAD_OUTER_IP_CKSUM); + OL_ADD_STAT("out_udp_csum", NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM); #undef OL_ADD_STAT err = 0; diff --git a/lib/packets.c b/lib/packets.c index dab823ba225..d9e41346e7b 100644 --- a/lib/packets.c +++ b/lib/packets.c @@ -1997,9 +1997,9 @@ IP_ECN_set_ce(struct dp_packet *pkt, bool is_ipv6) /* Set TCP checksum field in packet 'p' with complete checksum. * The packet must have the L3 and L4 offsets. */ void -packet_tcp_complete_csum(struct dp_packet *p) +packet_tcp_complete_csum(struct dp_packet *p, bool inner) { - struct tcp_header *tcp = dp_packet_l4(p); + struct tcp_header *tcp = (inner) ? dp_packet_inner_l4(p) : dp_packet_l4(p); tcp->tcp_csum = 0; if (dp_packet_hwol_is_ipv4(p)) { @@ -2020,9 +2020,9 @@ packet_tcp_complete_csum(struct dp_packet *p) /* Set UDP checksum field in packet 'p' with complete checksum. 
* The packet must have the L3 and L4 offsets. */ void -packet_udp_complete_csum(struct dp_packet *p) +packet_udp_complete_csum(struct dp_packet *p, bool inner) { - struct udp_header *udp = dp_packet_l4(p); + struct udp_header *udp = (inner) ? dp_packet_inner_l4(p) : dp_packet_l4(p); /* Skip csum calculation if the udp_csum is zero. */ if (!udp->udp_csum) { @@ -2052,9 +2052,9 @@ packet_udp_complete_csum(struct dp_packet *p) /* Set SCTP checksum field in packet 'p' with complete checksum. * The packet must have the L3 and L4 offsets. */ void -packet_sctp_complete_csum(struct dp_packet *p) +packet_sctp_complete_csum(struct dp_packet *p, bool inner) { - struct sctp_header *sh = dp_packet_l4(p); + struct sctp_header *sh = (inner) ? dp_packet_inner_l4(p) : dp_packet_l4(p); uint16_t tp_len = dp_packet_l4_size(p); ovs_be32 csum; diff --git a/lib/packets.h b/lib/packets.h index 12245b7649a..8b6994809fe 100644 --- a/lib/packets.h +++ b/lib/packets.h @@ -1682,9 +1682,9 @@ uint32_t packet_csum_pseudoheader(const struct ip_header *); bool packet_rh_present(struct dp_packet *packet, uint8_t *nexthdr, bool *first_frag); void IP_ECN_set_ce(struct dp_packet *pkt, bool is_ipv6); -void packet_tcp_complete_csum(struct dp_packet *); -void packet_udp_complete_csum(struct dp_packet *); -void packet_sctp_complete_csum(struct dp_packet *); +void packet_tcp_complete_csum(struct dp_packet *, bool is_inner); +void packet_udp_complete_csum(struct dp_packet *, bool is_inner); +void packet_sctp_complete_csum(struct dp_packet *, bool is_inner); #define DNS_HEADER_LEN 12 struct dns_header { diff --git a/tests/dpif-netdev.at b/tests/dpif-netdev.at index e5b9e0c3396..790b5a43af9 100644 --- a/tests/dpif-netdev.at +++ b/tests/dpif-netdev.at @@ -658,11 +658,11 @@ OVS_VSWITCHD_START( other-config:datapath-id=1234 fail-mode=secure]) AT_CHECK([ovs-vsctl get interface p1 status | sed -n 's/^{\(.*\).*}$/\1/p'], [0], [dnl -tx_ip_csum_offload="false", tx_sctp_csum_offload="false", tx_tcp_csum_offload="false", 
tx_tcp_seg_offload="false", tx_udp_csum_offload="false" +tx_geneve_tso_offload="false", tx_ip_csum_offload="false", tx_out_ip_csum_offload="false", tx_out_udp_csum_offload="false", tx_sctp_csum_offload="false", tx_tcp_csum_offload="false", tx_tcp_seg_offload="false", tx_udp_csum_offload="false", tx_vxlan_tso_offload="false" ], []) AT_CHECK([ovs-vsctl get interface br0 status | sed -n 's/^{\(.*\).*}$/\1/p'], [0], [dnl -tx_ip_csum_offload="false", tx_sctp_csum_offload="false", tx_tcp_csum_offload="false", tx_tcp_seg_offload="false", tx_udp_csum_offload="false" +tx_geneve_tso_offload="false", tx_ip_csum_offload="false", tx_out_ip_csum_offload="false", tx_out_udp_csum_offload="false", tx_sctp_csum_offload="false", tx_tcp_csum_offload="false", tx_tcp_seg_offload="false", tx_udp_csum_offload="false", tx_vxlan_tso_offload="false" ], []) OVS_VSWITCHD_STOP From 85bcbbed839a8a13b2ac77dabc4ae78293d881e8 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Wed, 17 Jan 2024 14:26:31 -0500 Subject: [PATCH 558/833] userspace: Enable tunnel tests with TSO. This patch enables most of the tunnel tests in the testsuite, and adds a large TCP transfer to a vxlan and geneve test to verify TSO functionality. Some additional changes were required to accommodate these changes with netdev-linux interfaces. The test for vlan over vxlan is purposely not enabled as the traffic produced by this test gives incorrect values in the vnet header. 
Acked-by: Simon Horman Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/dp-packet.c | 8 ++- lib/dp-packet.h | 63 ++++++++++++++++----- lib/dpif-netdev-extract-avx512.c | 8 +-- lib/flow.c | 12 +--- lib/netdev-linux.c | 47 +++++++++++++--- lib/netdev-native-tnl.c | 27 +++++---- lib/packets.c | 94 +++++++++++++++++++++++++------- tests/system-traffic.at | 55 ++++++++++++------- 8 files changed, 222 insertions(+), 92 deletions(-) diff --git a/lib/dp-packet.c b/lib/dp-packet.c index e7738c37a0c..9635cac8b63 100644 --- a/lib/dp-packet.c +++ b/lib/dp-packet.c @@ -578,7 +578,6 @@ dp_packet_ol_send_prepare(struct dp_packet *p, uint64_t flags) if (dp_packet_hwol_is_tunnel_geneve(p) || dp_packet_hwol_is_tunnel_vxlan(p)) { - dp_packet_tnl_outer_ol_send_prepare(p, flags); tnl_inner = true; } @@ -593,6 +592,9 @@ dp_packet_ol_send_prepare(struct dp_packet *p, uint64_t flags) } if (!dp_packet_hwol_tx_l4_checksum(p)) { + if (tnl_inner) { + dp_packet_tnl_outer_ol_send_prepare(p, flags); + } return; } @@ -617,4 +619,8 @@ dp_packet_ol_send_prepare(struct dp_packet *p, uint64_t flags) dp_packet_ol_set_l4_csum_good(p); dp_packet_hwol_reset_tx_l4_csum(p); } + + if (tnl_inner) { + dp_packet_tnl_outer_ol_send_prepare(p, flags); + } } diff --git a/lib/dp-packet.h b/lib/dp-packet.h index ee1f0734ad9..52e52b9142d 100644 --- a/lib/dp-packet.h +++ b/lib/dp-packet.h @@ -431,6 +431,8 @@ dp_packet_reset_offsets(struct dp_packet *b) b->l2_5_ofs = UINT16_MAX; b->l3_ofs = UINT16_MAX; b->l4_ofs = UINT16_MAX; + b->inner_l3_ofs = UINT16_MAX; + b->inner_l4_ofs = UINT16_MAX; } static inline uint16_t @@ -528,6 +530,16 @@ dp_packet_inner_l4(const struct dp_packet *b) : NULL; } +static inline size_t +dp_packet_inner_l4_size(const struct dp_packet *b) +{ + return OVS_LIKELY(b->l4_ofs != UINT16_MAX) + ? 
(const char *) dp_packet_tail(b) + - (const char *) dp_packet_inner_l4(b) + - dp_packet_l2_pad_size(b) + : 0; +} + static inline const void * dp_packet_get_tcp_payload(const struct dp_packet *b) { @@ -864,14 +876,6 @@ dp_packet_set_data(struct dp_packet *b, void *data) } } -static inline void -dp_packet_reset_packet(struct dp_packet *b, int off) -{ - dp_packet_set_size(b, dp_packet_size(b) - off); - dp_packet_set_data(b, ((unsigned char *) dp_packet_data(b) + off)); - dp_packet_reset_offsets(b); -} - enum { NETDEV_MAX_BURST = 32 }; /* Maximum number packets in a batch. */ struct dp_packet_batch { @@ -1141,6 +1145,20 @@ dp_packet_hwol_l4_is_sctp(struct dp_packet *b) DP_PACKET_OL_TX_SCTP_CKSUM; } +/* Returns 'true' if packet 'b' is marked as having an outer IPv6 header. */ +static inline bool +dp_packet_hwol_is_outer_ipv6(const struct dp_packet *b) +{ + return *dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_OUTER_IPV6; +} + +/* Returns 'true' if packet 'b' is marked as having an outer IPv4 header. */ +static inline bool +dp_packet_hwol_is_outer_ipv4(const struct dp_packet *b) +{ + return *dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_OUTER_IPV4; +} + /* Returns 'true' if packet 'b' is marked for tunnel GENEVE * checksum offloading. */ static inline bool @@ -1413,21 +1431,36 @@ dp_packet_ol_reset_l4_csum_good(struct dp_packet *p) } } -/* Marks packet 'p' with good integrity if the 'start' and 'offset' - * matches with the 'csum_start' and 'csum_offset' in packet 'p'. - * The 'start' is the offset from the begin of the packet headers. - * The 'offset' is the offset from start to place the checksum. +/* Marks packet 'p' with good integrity if checksum offload locations + * were provided. In the case of encapsulated packets, these values may + * be deeper into the packet than OVS might expect. But the packet + * should still be considered to have good integrity. + * The 'csum_start' is the offset from the begin of the packet headers. 
+ * The 'csum_offset' is the offset from start to place the checksum. * The csum_start and csum_offset fields are set from the virtio_net_hdr * struct that may be provided by a netdev on packet ingress. */ static inline void -dp_packet_ol_l4_csum_check_partial(struct dp_packet *p, uint16_t start, - uint16_t offset) +dp_packet_ol_l4_csum_check_partial(struct dp_packet *p) { - if (p->csum_start == start && p->csum_offset == offset) { + if (p->csum_start && p->csum_offset) { dp_packet_ol_set_l4_csum_partial(p); } } +static inline void +dp_packet_reset_packet(struct dp_packet *b, int off) +{ + dp_packet_set_size(b, dp_packet_size(b) - off); + dp_packet_set_data(b, ((unsigned char *) dp_packet_data(b) + off)); + dp_packet_reset_offsets(b); + + if (b->csum_start >= off && b->csum_offset) { + /* Adjust values for decapsulation. */ + b->csum_start -= off; + dp_packet_ol_set_l4_csum_partial(b); + } +} + static inline uint32_t ALWAYS_INLINE dp_packet_calc_hash_ipv4(const uint8_t *pkt, const uint16_t l3_ofs, uint32_t hash) diff --git a/lib/dpif-netdev-extract-avx512.c b/lib/dpif-netdev-extract-avx512.c index 1bc7e8d0e08..57ca4c71b7c 100644 --- a/lib/dpif-netdev-extract-avx512.c +++ b/lib/dpif-netdev-extract-avx512.c @@ -776,9 +776,7 @@ mfex_ipv6_set_hwol(struct dp_packet *pkt) static void mfex_tcp_set_hwol(struct dp_packet *pkt) { - dp_packet_ol_l4_csum_check_partial(pkt, pkt->l4_ofs, - offsetof(struct tcp_header, - tcp_csum)); + dp_packet_ol_l4_csum_check_partial(pkt); if (dp_packet_l4_checksum_good(pkt) || dp_packet_ol_l4_csum_partial(pkt)) { dp_packet_hwol_set_csum_tcp(pkt); @@ -788,9 +786,7 @@ mfex_tcp_set_hwol(struct dp_packet *pkt) static void mfex_udp_set_hwol(struct dp_packet *pkt) { - dp_packet_ol_l4_csum_check_partial(pkt, pkt->l4_ofs, - offsetof(struct udp_header, - udp_csum)); + dp_packet_ol_l4_csum_check_partial(pkt); if (dp_packet_l4_checksum_good(pkt) || dp_packet_ol_l4_csum_partial(pkt)) { dp_packet_hwol_set_csum_udp(pkt); diff --git a/lib/flow.c b/lib/flow.c 
index 82d93570adb..8e3402388cb 100644 --- a/lib/flow.c +++ b/lib/flow.c @@ -1054,9 +1054,7 @@ miniflow_extract(struct dp_packet *packet, struct miniflow *dst) } else if (dl_type == htons(ETH_TYPE_IPV6)) { dp_packet_update_rss_hash_ipv6_tcp_udp(packet); } - dp_packet_ol_l4_csum_check_partial(packet, packet->l4_ofs, - offsetof(struct tcp_header, - tcp_csum)); + dp_packet_ol_l4_csum_check_partial(packet); if (dp_packet_l4_checksum_good(packet) || dp_packet_ol_l4_csum_partial(packet)) { dp_packet_hwol_set_csum_tcp(packet); @@ -1076,9 +1074,7 @@ miniflow_extract(struct dp_packet *packet, struct miniflow *dst) } else if (dl_type == htons(ETH_TYPE_IPV6)) { dp_packet_update_rss_hash_ipv6_tcp_udp(packet); } - dp_packet_ol_l4_csum_check_partial(packet, packet->l4_ofs, - offsetof(struct udp_header, - udp_csum)); + dp_packet_ol_l4_csum_check_partial(packet); if (dp_packet_l4_checksum_good(packet) || dp_packet_ol_l4_csum_partial(packet)) { dp_packet_hwol_set_csum_udp(packet); @@ -1092,9 +1088,7 @@ miniflow_extract(struct dp_packet *packet, struct miniflow *dst) miniflow_push_be16(mf, tp_dst, sctp->sctp_dst); miniflow_push_be16(mf, ct_tp_src, ct_tp_src); miniflow_push_be16(mf, ct_tp_dst, ct_tp_dst); - dp_packet_ol_l4_csum_check_partial(packet, packet->l4_ofs, - offsetof(struct sctp_header, - sctp_csum)); + dp_packet_ol_l4_csum_check_partial(packet); if (dp_packet_l4_checksum_good(packet) || dp_packet_ol_l4_csum_partial(packet)) { dp_packet_hwol_set_csum_sctp(packet); diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index e79a432607a..1b2e5b6c2bc 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -7145,8 +7145,12 @@ netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu) if (dp_packet_hwol_is_tso(b)) { uint16_t tso_segsz = dp_packet_get_tso_segsz(b); struct tcp_header *tcp = dp_packet_l4(b); + struct tcp_header *inner_tcp = dp_packet_inner_l4(b); + if (inner_tcp) { + tcp = inner_tcp; + } int tcp_hdr_len = TCP_OFFSET(tcp->tcp_ctl) * 4; - int hdr_len = ((char *) 
dp_packet_l4(b) - (char *) dp_packet_eth(b)) + int hdr_len = ((char *) tcp - (char *) dp_packet_eth(b)) + tcp_hdr_len; int max_packet_len = mtu + ETH_HEADER_LEN + VLAN_HEADER_LEN; @@ -7164,17 +7168,35 @@ netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu) } else if (dp_packet_hwol_tx_ipv6(b)) { vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; } - } else { vnet->hdr_len = 0; vnet->gso_size = 0; vnet->gso_type = VIRTIO_NET_HDR_GSO_NONE; } - if (dp_packet_l4_checksum_good(b)) { + bool l4_is_good = dp_packet_l4_checksum_good(b); + + if ((dp_packet_hwol_is_tunnel_vxlan(b) || + dp_packet_hwol_is_tunnel_geneve(b)) && + dp_packet_hwol_tx_l4_checksum(b)) { + /* This condition is needed because dp-packet doesn't currently track + * outer and inner checksum statuses seperately. In the case of these + * two tunnel types we can end up setting outer l4 as good but still + * need to complete the inner l4. */ + l4_is_good = !(dp_packet_hwol_l4_is_tcp(b) || + dp_packet_hwol_l4_is_udp(b)); + } + + if (l4_is_good) { /* The packet has good L4 checksum. No need to validate again. */ vnet->csum_start = vnet->csum_offset = (OVS_FORCE __virtio16) 0; vnet->flags = VIRTIO_NET_HDR_F_DATA_VALID; + if (!dp_packet_ip_checksum_good(b)) { + /* It is possible that L4 is good but the IP checksum isn't + * complete. For example in the case of UDP encapsulation of an ARP + * packet where the UDP checksum is 0. */ + dp_packet_ip_set_header_csum(b, false); + } } else if (dp_packet_hwol_tx_l4_checksum(b)) { /* The csum calculation is offloaded. */ if (dp_packet_hwol_l4_is_tcp(b)) { @@ -7192,20 +7214,28 @@ netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu) * the TCP pseudo header, so that replacing it by the ones * complement checksum of the TCP header and body will give * the correct result. 
*/ + void *l3_off = dp_packet_inner_l3(b); + void *l4_off = dp_packet_inner_l4(b); + + if (!l3_off || !l4_off) { + l3_off = dp_packet_l3(b); + l4_off = dp_packet_l4(b); + } - struct tcp_header *tcp_hdr = dp_packet_l4(b); + struct tcp_header *tcp_hdr = l4_off; ovs_be16 csum = 0; if (dp_packet_hwol_is_ipv4(b)) { - const struct ip_header *ip_hdr = dp_packet_l3(b); + const struct ip_header *ip_hdr = l3_off; csum = ~csum_finish(packet_csum_pseudoheader(ip_hdr)); } else if (dp_packet_hwol_tx_ipv6(b)) { - const struct ovs_16aligned_ip6_hdr *ip6_hdr = dp_packet_l3(b); + const struct ovs_16aligned_ip6_hdr *ip6_hdr = l3_off; csum = ~csum_finish(packet_csum_pseudoheader6(ip6_hdr)); } tcp_hdr->tcp_csum = csum; vnet->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; - vnet->csum_start = (OVS_FORCE __virtio16) b->l4_ofs; + vnet->csum_start = (OVS_FORCE __virtio16) ((char *) l4_off - + (char *) dp_packet_data(b)); vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof( struct tcp_header, tcp_csum); } else if (dp_packet_hwol_l4_is_udp(b)) { @@ -7222,7 +7252,8 @@ netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu) udp_hdr->udp_csum = csum; vnet->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; - vnet->csum_start = (OVS_FORCE __virtio16) b->l4_ofs; + vnet->csum_start = (OVS_FORCE __virtio16) ((char *) udp_hdr - + (char *) dp_packet_data(b));; vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof( struct udp_header, udp_csum); } else if (dp_packet_hwol_l4_is_sctp(b)) { diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c index fa87c6281d5..0d6d803fe45 100644 --- a/lib/netdev-native-tnl.c +++ b/lib/netdev-native-tnl.c @@ -215,7 +215,8 @@ udp_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl, } if (udp->udp_csum) { - if (OVS_UNLIKELY(!dp_packet_l4_checksum_good(packet))) { + if (OVS_LIKELY(!dp_packet_ol_l4_csum_partial(packet)) && + OVS_UNLIKELY(!dp_packet_l4_checksum_good(packet))) { uint32_t csum; if (netdev_tnl_is_header_ipv6(dp_packet_data(packet))) { csum = 
packet_csum_pseudoheader6(dp_packet_l3(packet)); @@ -292,18 +293,11 @@ dp_packet_tnl_ol_process(struct dp_packet *packet, dp_packet_set_l2_len(packet, (char *) dp_packet_l3(packet) - (char *) dp_packet_eth(packet) + GENEVE_BASE_HLEN + opt_len); - - packet->inner_l3_ofs = packet->l3_ofs + GENEVE_BASE_HLEN + opt_len; - packet->inner_l4_ofs = packet->l4_ofs + GENEVE_BASE_HLEN + opt_len; - } else if (data->tnl_type == OVS_VPORT_TYPE_VXLAN) { dp_packet_hwol_set_tunnel_vxlan(packet); dp_packet_set_l2_len(packet, (char *) dp_packet_l3(packet) - (char *) dp_packet_eth(packet) + VXLAN_HLEN); - - packet->inner_l3_ofs = packet->l3_ofs + VXLAN_HLEN; - packet->inner_l4_ofs = packet->l4_ofs + VXLAN_HLEN; } } } @@ -313,6 +307,8 @@ netdev_tnl_push_udp_header(const struct netdev *netdev OVS_UNUSED, struct dp_packet *packet, const struct ovs_action_push_tnl *data) { + uint16_t l3_ofs = packet->l3_ofs; + uint16_t l4_ofs = packet->l4_ofs; struct udp_header *udp; int ip_tot_size; @@ -332,13 +328,20 @@ netdev_tnl_push_udp_header(const struct netdev *netdev OVS_UNUSED, } else { dp_packet_hwol_set_csum_udp(packet); } - } else { - dp_packet_ol_set_l4_csum_good(packet); } - packet->inner_l3_ofs += packet->l4_ofs; - packet->inner_l4_ofs += packet->l4_ofs; + if (packet->csum_start && packet->csum_offset) { + dp_packet_ol_set_l4_csum_partial(packet); + } else if (!udp->udp_csum) { + dp_packet_ol_set_l4_csum_good(packet); + } + if (l3_ofs != UINT16_MAX) { + packet->inner_l3_ofs = l3_ofs + data->header_len; + } + if (l4_ofs != UINT16_MAX) { + packet->inner_l4_ofs = l4_ofs + data->header_len; + } } static void * diff --git a/lib/packets.c b/lib/packets.c index d9e41346e7b..f23d2542045 100644 --- a/lib/packets.c +++ b/lib/packets.c @@ -1999,21 +1999,44 @@ IP_ECN_set_ce(struct dp_packet *pkt, bool is_ipv6) void packet_tcp_complete_csum(struct dp_packet *p, bool inner) { - struct tcp_header *tcp = (inner) ? 
dp_packet_inner_l4(p) : dp_packet_l4(p); + struct tcp_header *tcp; + size_t tcp_sz; + void *ip_hdr; + bool is_v4; + + if (inner) { + tcp = dp_packet_inner_l4(p); + ip_hdr = dp_packet_inner_l3(p); + tcp_sz = dp_packet_inner_l4_size(p); + } else { + tcp = dp_packet_l4(p); + ip_hdr = dp_packet_l3(p); + tcp_sz = dp_packet_l4_size(p); + } + + if (!inner && dp_packet_hwol_is_outer_ipv6(p)) { + is_v4 = false; + } else if (!inner && dp_packet_hwol_is_outer_ipv4(p)) { + is_v4 = true; + } else if (dp_packet_hwol_is_ipv4(p)) { + is_v4 = true; + } else if (dp_packet_hwol_tx_ipv6(p)) { + is_v4 = false; + } else { + OVS_NOT_REACHED(); + } tcp->tcp_csum = 0; - if (dp_packet_hwol_is_ipv4(p)) { - struct ip_header *ip = dp_packet_l3(p); + if (is_v4) { + struct ip_header *ip = ip_hdr; tcp->tcp_csum = csum_finish(csum_continue(packet_csum_pseudoheader(ip), - tcp, dp_packet_l4_size(p))); - } else if (dp_packet_hwol_tx_ipv6(p)) { - struct ovs_16aligned_ip6_hdr *ip6 = dp_packet_l3(p); + tcp, tcp_sz)); + } else { + struct ovs_16aligned_ip6_hdr *ip6 = ip_hdr; tcp->tcp_csum = packet_csum_upperlayer6(ip6, tcp, ip6->ip6_nxt, - dp_packet_l4_size(p)); - } else { - OVS_NOT_REACHED(); + tcp_sz); } } @@ -2022,26 +2045,49 @@ packet_tcp_complete_csum(struct dp_packet *p, bool inner) void packet_udp_complete_csum(struct dp_packet *p, bool inner) { - struct udp_header *udp = (inner) ? dp_packet_inner_l4(p) : dp_packet_l4(p); + struct udp_header *udp; + size_t udp_sz; + void *ip_hdr; + bool is_v4; + + if (inner) { + udp = dp_packet_inner_l4(p); + ip_hdr = dp_packet_inner_l3(p); + udp_sz = dp_packet_inner_l4_size(p); + } else { + udp = dp_packet_l4(p); + ip_hdr = dp_packet_l3(p); + udp_sz = dp_packet_l4_size(p); + } /* Skip csum calculation if the udp_csum is zero. 
*/ if (!udp->udp_csum) { return; } + if (!inner && dp_packet_hwol_is_outer_ipv6(p)) { + is_v4 = false; + } else if (!inner && dp_packet_hwol_is_outer_ipv4(p)) { + is_v4 = true; + } else if (dp_packet_hwol_is_ipv4(p)) { + is_v4 = true; + } else if (dp_packet_hwol_tx_ipv6(p)) { + is_v4 = false; + } else { + OVS_NOT_REACHED(); + } + udp->udp_csum = 0; - if (dp_packet_hwol_is_ipv4(p)) { - struct ip_header *ip = dp_packet_l3(p); + if (is_v4) { + struct ip_header *ip = ip_hdr; udp->udp_csum = csum_finish(csum_continue(packet_csum_pseudoheader(ip), - udp, dp_packet_l4_size(p))); - } else if (dp_packet_hwol_tx_ipv6(p)) { - struct ovs_16aligned_ip6_hdr *ip6 = dp_packet_l3(p); + udp, udp_sz)); + } else { + struct ovs_16aligned_ip6_hdr *ip6 = ip_hdr; udp->udp_csum = packet_csum_upperlayer6(ip6, udp, ip6->ip6_nxt, - dp_packet_l4_size(p)); - } else { - OVS_NOT_REACHED(); + udp_sz); } if (!udp->udp_csum) { @@ -2054,10 +2100,18 @@ packet_udp_complete_csum(struct dp_packet *p, bool inner) void packet_sctp_complete_csum(struct dp_packet *p, bool inner) { - struct sctp_header *sh = (inner) ? 
dp_packet_inner_l4(p) : dp_packet_l4(p); - uint16_t tp_len = dp_packet_l4_size(p); + struct sctp_header *sh; + uint16_t tp_len; ovs_be32 csum; + if (inner) { + sh = dp_packet_inner_l4(p); + tp_len = dp_packet_inner_l4_size(p); + } else { + sh = dp_packet_l4(p); + tp_len = dp_packet_l4_size(p); + } + put_16aligned_be32(&sh->sctp_csum, 0); csum = crc32c((void *) sh, tp_len); put_16aligned_be32(&sh->sctp_csum, csum); diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 283706c6e12..f363a778cc7 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -292,7 +292,6 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over vxlan tunnel]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_VXLAN() OVS_TRAFFIC_VSWITCHD_START() @@ -330,6 +329,15 @@ NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PI 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) +dnl Check large bidirectional TCP. +AT_CHECK([dd if=/dev/urandom of=payload.bin bs=60000 count=1 2> /dev/null]) +OVS_DAEMONIZE([nc -l 10.1.1.100 1234 > data], [nc.pid]) +NS_CHECK_EXEC([at_ns0], [nc $NC_EOF_OPT 10.1.1.100 1234 < payload.bin]) + +dnl Wait until transfer completes before checking. +OVS_WAIT_WHILE([kill -0 $(cat nc.pid)]) +AT_CHECK([diff -q payload.bin data], [0]) + OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP @@ -381,7 +389,6 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over vxlan6 tunnel]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_VXLAN_UDP6ZEROCSUM() OVS_TRAFFIC_VSWITCHD_START() @@ -421,11 +428,18 @@ NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PI 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) +dnl Check large bidirectional TCP. +AT_CHECK([dd if=/dev/urandom of=payload.bin bs=60000 count=1 2> /dev/null]) +OVS_DAEMONIZE([nc -l 10.1.1.100 1234 > data], [nc.pid]) +NS_CHECK_EXEC([at_ns0], [nc $NC_EOF_OPT 10.1.1.100 1234 < payload.bin]) + +dnl Wait until transfer completes before checking. 
+OVS_WAIT_WHILE([kill -0 $(cat nc.pid)]) +AT_CHECK([diff -q payload.bin data], [0]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over gre tunnel]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_KERNEL_EXCL(3, 10, 4, 15) OVS_CHECK_GRE() @@ -467,7 +481,6 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over ip6gre L2 tunnel]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_KERNEL_EXCL(3, 10, 4, 15) OVS_CHECK_GRE() OVS_CHECK_ERSPAN() @@ -508,7 +521,6 @@ AT_CLEANUP AT_SETUP([datapath - ping over erspan v1 tunnel]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_KERNEL_EXCL(3, 10, 4, 15) OVS_CHECK_GRE() OVS_CHECK_ERSPAN() @@ -545,7 +557,6 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over erspan v2 tunnel]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_KERNEL_EXCL(3, 10, 4, 15) OVS_CHECK_GRE() OVS_CHECK_ERSPAN() @@ -582,7 +593,6 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over ip6erspan v1 tunnel]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_KERNEL_EXCL(3, 10, 4, 15) OVS_CHECK_GRE() OVS_CHECK_ERSPAN() @@ -622,7 +632,6 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over ip6erspan v2 tunnel]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_KERNEL_EXCL(3, 10, 4, 15) OVS_CHECK_GRE() OVS_CHECK_ERSPAN() @@ -663,7 +672,6 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over geneve tunnel]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_GENEVE() OVS_TRAFFIC_VSWITCHD_START() @@ -701,11 +709,19 @@ NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PI 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) +dnl Check large bidirectional TCP. +AT_CHECK([dd if=/dev/urandom of=payload.bin bs=60000 count=1 2> /dev/null]) +OVS_DAEMONIZE([nc -l 10.1.1.100 1234 > data], [nc.pid]) +NS_CHECK_EXEC([at_ns0], [nc $NC_EOF_OPT 10.1.1.100 1234 < payload.bin]) + +dnl Wait until transfer completes before checking. 
+OVS_WAIT_WHILE([kill -0 $(cat nc.pid)]) +AT_CHECK([diff -q payload.bin data], [0]) + OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over geneve tunnel, delete flow regression]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_GENEVE() OVS_TRAFFIC_VSWITCHD_START() @@ -760,7 +776,6 @@ OVS_TRAFFIC_VSWITCHD_STOP(["/|ERR|/d AT_CLEANUP AT_SETUP([datapath - flow resume with geneve tun_metadata]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_GENEVE() OVS_TRAFFIC_VSWITCHD_START() @@ -812,7 +827,6 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over geneve6 tunnel]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_GENEVE_UDP6ZEROCSUM() OVS_TRAFFIC_VSWITCHD_START() @@ -852,12 +866,19 @@ NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PI 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) +dnl Check large bidirectional TCP. +AT_CHECK([dd if=/dev/urandom of=payload.bin bs=60000 count=1 2> /dev/null]) +OVS_DAEMONIZE([nc -l 10.1.1.100 1234 > data], [nc.pid]) +NS_CHECK_EXEC([at_ns0], [nc $NC_EOF_OPT 10.1.1.100 1234 < payload.bin]) + +dnl Wait until transfer completes before checking. 
+OVS_WAIT_WHILE([kill -0 $(cat nc.pid)]) +AT_CHECK([diff -q payload.bin data], [0]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - slow_action on geneve6 tunnel]) AT_SKIP_IF([test $HAVE_TCPDUMP = no]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_GENEVE_UDP6ZEROCSUM() OVS_TRAFFIC_VSWITCHD_START() @@ -981,7 +1002,6 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over gre tunnel by simulated packets]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_MIN_KERNEL(3, 10) OVS_TRAFFIC_VSWITCHD_START() @@ -1028,7 +1048,6 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over erspan v1 tunnel by simulated packets]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_MIN_KERNEL(3, 10) OVS_TRAFFIC_VSWITCHD_START() @@ -1077,7 +1096,6 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over erspan v2 tunnel by simulated packets]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_MIN_KERNEL(3, 10) OVS_TRAFFIC_VSWITCHD_START() @@ -1131,7 +1149,6 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over ip6erspan v1 tunnel by simulated packets]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_MIN_KERNEL(3, 10) OVS_TRAFFIC_VSWITCHD_START() @@ -1187,7 +1204,6 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over ip6erspan v2 tunnel by simulated packets]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_MIN_KERNEL(3, 10) OVS_TRAFFIC_VSWITCHD_START() @@ -1242,7 +1258,6 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over srv6 tunnel]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_SRV6() OVS_TRAFFIC_VSWITCHD_START() @@ -1304,7 +1319,6 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping6 over srv6 tunnel]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_SRV6() OVS_TRAFFIC_VSWITCHD_START() @@ -8062,7 +8076,6 @@ AT_CLEANUP AT_SETUP([conntrack - can match and clear ct_state from outside OVS]) CHECK_CONNTRACK_LOCAL_STACK() -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_GENEVE() OVS_TRAFFIC_VSWITCHD_START() From 206dfaa365a6a552922567e7e282c822c5137208 Mon Sep 17 00:00:00 2001 From: Ilya Maximets 
Date: Wed, 17 Jan 2024 22:11:04 +0100 Subject: [PATCH 559/833] AUTHORS: Add Dexia Li. Signed-off-by: Ilya Maximets --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index fb03b5dfeea..aa9284fb164 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -136,6 +136,7 @@ David Wilder dwilder@us.ibm.com David Yang davidy@vmware.com Dennis Sam dsam@arista.com Devendra Naga devendra.aaru@gmail.com +Dexia Li dexia.li@jaguarmicro.com Dincer Beken dbeken@blackned.de Dmitry Krivenok krivenok.dmitry@gmail.com Dominic Curran dominic.curran@citrix.com From e802fe79aafc6f68099f435280747217735eba77 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 17 Jan 2024 13:11:24 +0100 Subject: [PATCH 560/833] Prepare for 3.3.0. Also fixed the Conntrack Helper Persistence version. Acked-by: Eelco Chaudron Acked-by: Kevin Traynor Signed-off-by: Ilya Maximets --- Documentation/faq/releases.rst | 3 ++- NEWS | 2 +- configure.ac | 2 +- debian/changelog | 4 ++-- debian/rules | 4 ++-- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/Documentation/faq/releases.rst b/Documentation/faq/releases.rst index da185ae1dc4..3a8387f8491 100644 --- a/Documentation/faq/releases.rst +++ b/Documentation/faq/releases.rst @@ -140,7 +140,7 @@ Q: Are all features available with all datapaths? Conntrack Zone Limit 4.18 2.10 2.13 YES Conntrack NAT 4.6 2.6 2.8 YES Conntrack NAT6 4.6 2.6 2.8 3.0 - Conntrack Helper Persist. YES YES 3.2 NO + Conntrack Helper Persist. YES YES 3.3 NO Tunnel - LISP NO 2.11 NO NO Tunnel - STT NO 2.4 NO YES Tunnel - GRE 3.11 1.0 2.4 YES @@ -220,6 +220,7 @@ Q: What DPDK version does each Open vSwitch release work with? 3.0.x 21.11.5 3.1.x 22.11.3 3.2.x 22.11.3 + 3.3.x 23.11 ============ ======== Q: Are all the DPDK releases that OVS versions work with maintained? 
diff --git a/NEWS b/NEWS index 9e057ad2294..83cf0eb78eb 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,4 @@ -Post-v3.2.0 +v3.3.0 - xx xxx xxxx -------------------- - OVSDB: * Support pre-vote mechanism in RAFT that protects the cluster against diff --git a/configure.ac b/configure.ac index 44c09b2ac4c..05afbb9cc81 100644 --- a/configure.ac +++ b/configure.ac @@ -13,7 +13,7 @@ # limitations under the License. AC_PREREQ(2.63) -AC_INIT(openvswitch, 3.2.90, bugs@openvswitch.org) +AC_INIT(openvswitch, 3.3.0, bugs@openvswitch.org) AC_CONFIG_SRCDIR([vswitchd/ovs-vswitchd.c]) AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_AUX_DIR([build-aux]) diff --git a/debian/changelog b/debian/changelog index a42f4deaa8b..545dc83138e 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,8 +1,8 @@ -openvswitch (3.2.90-1) unstable; urgency=low +openvswitch (3.3.0-1) unstable; urgency=low * New upstream version - -- Open vSwitch team Mon, 17 Jul 2023 14:40:01 +0100 + -- Open vSwitch team Wed, 17 Jan 2024 13:00:00 +0100 openvswitch (3.2.0-1) unstable; urgency=low diff --git a/debian/rules b/debian/rules index dc5cc8a65b0..075b0416284 100755 --- a/debian/rules +++ b/debian/rules @@ -134,8 +134,8 @@ override_dh_python3: # Helper target for creating snapshots from upstream git DATE=$(shell date +%Y%m%d) # Upstream branch to track -BRANCH=branch-3.2 -VERSION=3.2.0 +BRANCH=branch-3.3 +VERSION=3.3.0 get-orig-snapshot: rm -Rf openvswitch-upstream From 85ceed7c7893de73196f02ae8ec766ccd528fc69 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 17 Jan 2024 13:11:25 +0100 Subject: [PATCH 561/833] Prepare for post-3.3.0 (3.3.90). 
Acked-by: Eelco Chaudron Acked-by: Kevin Traynor Signed-off-by: Ilya Maximets --- NEWS | 4 ++++ configure.ac | 2 +- debian/changelog | 6 ++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/NEWS b/NEWS index 83cf0eb78eb..2153b480531 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,7 @@ +Post-v3.3.0 +-------------------- + + v3.3.0 - xx xxx xxxx -------------------- - OVSDB: diff --git a/configure.ac b/configure.ac index 05afbb9cc81..dd6553fea07 100644 --- a/configure.ac +++ b/configure.ac @@ -13,7 +13,7 @@ # limitations under the License. AC_PREREQ(2.63) -AC_INIT(openvswitch, 3.3.0, bugs@openvswitch.org) +AC_INIT(openvswitch, 3.3.90, bugs@openvswitch.org) AC_CONFIG_SRCDIR([vswitchd/ovs-vswitchd.c]) AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_AUX_DIR([build-aux]) diff --git a/debian/changelog b/debian/changelog index 545dc83138e..44321745503 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +openvswitch (3.3.90-1) unstable; urgency=low + + * New upstream version + + -- Open vSwitch team Wed, 17 Jan 2024 13:00:01 +0100 + openvswitch (3.3.0-1) unstable; urgency=low * New upstream version From 335a5deac3ff91448ca14651e92f39dfdd512fcf Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 18 Jan 2024 15:59:05 +0100 Subject: [PATCH 562/833] ovs-atomic: Fix inclusion of Clang header by GCC 14. GCC 14 started to advertise c_atomic extension, older versions didn't do that. Add check for __clang__, so GCC doesn't include headers designed for Clang. Another option would be to prefer stdatomic implementation instead, but some older versions of Clang are not able to use stdatomic.h supplied by GCC as described in commit: 07ece367fb5f ("ovs-atomic: Prefer Clang intrinsics over <stdatomic.h>.") This change fixes OVS build with GCC on Fedora Rawhide (40).
Reported-by: Jakob Meng Acked-by: Jakob Meng Acked-by: Eelco Chaudron Acked-by: Simon Horman Signed-off-by: Ilya Maximets --- lib/ovs-atomic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/ovs-atomic.h b/lib/ovs-atomic.h index ab9ce6b2e0f..f140d25feba 100644 --- a/lib/ovs-atomic.h +++ b/lib/ovs-atomic.h @@ -328,7 +328,7 @@ #if __CHECKER__ /* sparse doesn't understand some GCC extensions we use. */ #include "ovs-atomic-pthreads.h" - #elif __has_extension(c_atomic) + #elif __clang__ && __has_extension(c_atomic) #include "ovs-atomic-clang.h" #elif HAVE_ATOMIC && __cplusplus >= 201103L #include "ovs-atomic-c++.h" From bacd2c304a1f9cb2b31543ab1a07aa084eaf7db7 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 18 Jan 2024 17:38:19 +0100 Subject: [PATCH 563/833] dp-packet: Avoid checks while preparing non-offloading packets. Currently, dp_packet_ol_send_prepare() performs multiple checks for each offloading flag separately. That takes a noticeable amount of extra cycles for packets that do not have any offloading flags set. Skip most of the work if no checksumming flags are set. The change improves performance of direct forwarding between two virtio-user ports (V2V) by ~2.5 % and offsets all the negative effects of TSO support introduced recently. It adds an extra check to the offloading path, but it is not a default configuration and also should take much smaller hit due to lower number of larger packets. Acked-by: Mike Pattrick Acked-by: Simon Horman Signed-off-by: Ilya Maximets --- lib/dp-packet.c | 5 +++++ lib/dp-packet.h | 11 +++++++++++ 2 files changed, 16 insertions(+) diff --git a/lib/dp-packet.c b/lib/dp-packet.c index 9635cac8b63..0e23c766e1b 100644 --- a/lib/dp-packet.c +++ b/lib/dp-packet.c @@ -576,6 +576,11 @@ dp_packet_ol_send_prepare(struct dp_packet *p, uint64_t flags) { bool tnl_inner = false; + if (!dp_packet_hwol_tx_is_any_csum(p)) { + /* Only checksumming needs actions. 
*/ + return; + } + if (dp_packet_hwol_is_tunnel_geneve(p) || dp_packet_hwol_is_tunnel_vxlan(p)) { tnl_inner = true; diff --git a/lib/dp-packet.h b/lib/dp-packet.h index 52e52b9142d..939bec5c899 100644 --- a/lib/dp-packet.h +++ b/lib/dp-packet.h @@ -131,6 +131,10 @@ enum dp_packet_offload_mask { #define DP_PACKET_OL_TX_L4_MASK (DP_PACKET_OL_TX_TCP_CKSUM | \ DP_PACKET_OL_TX_UDP_CKSUM | \ DP_PACKET_OL_TX_SCTP_CKSUM) +#define DP_PACKET_OL_TX_ANY_CKSUM (DP_PACKET_OL_TX_L4_MASK | \ + DP_PACKET_OL_TX_IP_CKSUM | \ + DP_PACKET_OL_TX_OUTER_IP_CKSUM | \ + DP_PACKET_OL_TX_OUTER_UDP_CKSUM) #define DP_PACKET_OL_RX_IP_CKSUM_MASK (DP_PACKET_OL_RX_IP_CKSUM_GOOD | \ DP_PACKET_OL_RX_IP_CKSUM_BAD) #define DP_PACKET_OL_RX_L4_CKSUM_MASK (DP_PACKET_OL_RX_L4_CKSUM_GOOD | \ @@ -1189,6 +1193,13 @@ dp_packet_hwol_is_outer_udp_cksum(struct dp_packet *b) return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_OUTER_UDP_CKSUM); } +/* Returns 'true' if packet 'b' is marked for any checksum offload. */ +static inline bool +dp_packet_hwol_tx_is_any_csum(struct dp_packet *b) +{ + return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_ANY_CKSUM); +} + static inline void dp_packet_hwol_reset_tx_l4_csum(struct dp_packet *p) { From 9ca8d3a4d47901aa836a514501e429ccd7c370d5 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 18 Jan 2024 20:25:23 +0100 Subject: [PATCH 564/833] tests: mcast-snooping: Stop time for the group protocol test. 
Otherwise, it randomly fails due to age not being zero under load: tests/mcast-snooping.at:645: ovs-appctl mdb/show br0 --- - +++ /at-groups/2592/stdout @@ -1,5 +1,5 @@ port VLAN protocol GROUP Age - 1 0 IGMPv1 224.1.1.1 0 + 1 0 IGMPv1 224.1.1.1 1 1 0 IGMPv2 224.1.1.2 0 1 0 IGMPv3 233.54.12.230 0 Fixes: b222593bc69b ("mcast-snooping: Add group protocol to mdb/show output.") Acked-by: Eelco Chaudron Acked-by: Simon Horman Signed-off-by: Ilya Maximets --- tests/mcast-snooping.at | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/mcast-snooping.at b/tests/mcast-snooping.at index a91b3e13a10..adbb66c7059 100644 --- a/tests/mcast-snooping.at +++ b/tests/mcast-snooping.at @@ -622,6 +622,8 @@ AT_CHECK([ other-config:hwaddr=aa:55:aa:55:00:01 ofport_request=1 \ ], [0]) +AT_CHECK([ovs-appctl time/stop]) + # Send IGMPv1 report packet. AT_CHECK([ ovs-appctl netdev-dummy/receive p1 \ From 3eb91a8d1b9ad8afb396e57fd2797ea9c2bc0bb9 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Thu, 18 Jan 2024 17:15:00 +0100 Subject: [PATCH 565/833] netdev-dpdk: Trigger port reconfiguration in main thread for resets. When OVS (main thread) configures a DPDK netdev, it holds a netdev_dpdk mutex lock. As part of this configure operation, the net/iavf driver (used with i40e VF devices) triggers a queue count change. The PF entity (serviced by a kernel PF driver for example) handles this change and requests back that the VF driver resets the VF device. The driver then completes the VF reset operation on its side and waits for completion of the iavf-event thread responsible for handling various VF device events. On the other hand, handling of the VF reset request in this iavf-event thread results in notifying the application with a port reset request (RTE_ETH_EVENT_INTR_RESET). The OVS reset callback tries to take a hold of the same netdev_dpdk mutex and blocks the iavf-event thread. 
As a result, the net/iavf driver (still running on OVS main thread) is unable to complete as it is waiting for iavf-event to complete. To break from this situation, the OVS reset callback now won't take a netdev_dpdk mutex. Instead, the port reset request is stored in a simple array of RTE_MAX_ETHPORTS entries associated with a seq object. This is enough to let the VF driver complete this port initialization. The OVS main thread later handles the port reset request. More details in the DPDK upstream bz as this issue appeared following a change in DPDK. Link: https://bugs.dpdk.org/show_bug.cgi?id=1337 Signed-off-by: David Marchand Signed-off-by: Ilya Maximets --- NEWS | 7 ----- lib/netdev-dpdk.c | 76 ++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 61 insertions(+), 22 deletions(-) diff --git a/NEWS b/NEWS index 2153b480531..a6617546c62 100644 --- a/NEWS +++ b/NEWS @@ -54,13 +54,6 @@ v3.3.0 - xx xxx xxxx - Support for multicast snooping to show the protocol responsible for adding/updating the entry. -Known issues: - - DPDK: v23.11 has a change in behavior in handling i40e VF devices. This - may block and prevent OVS from adding such devices as ports in a netdev - datapath bridge. - For the details, see https://bugs.dpdk.org/show_bug.cgi?id=1337 which - describes the issue first detected in the 21.11 LTS branch.
- v3.2.0 - 17 Aug 2023 -------------------- diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index fb26825ff85..45f61930d40 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -58,6 +58,7 @@ #include "openvswitch/match.h" #include "openvswitch/ofp-parse.h" #include "openvswitch/ofp-print.h" +#include "openvswitch/poll-loop.h" #include "openvswitch/shash.h" #include "openvswitch/vlog.h" #include "ovs-numa.h" @@ -2101,32 +2102,73 @@ netdev_dpdk_process_devargs(struct netdev_dpdk *dev, return new_port_id; } +static struct seq *netdev_dpdk_reset_seq; +static uint64_t netdev_dpdk_last_reset_seq; +static atomic_bool netdev_dpdk_pending_reset[RTE_MAX_ETHPORTS]; + +static void +netdev_dpdk_wait(const struct netdev_class *netdev_class OVS_UNUSED) +{ + uint64_t last_reset_seq = seq_read(netdev_dpdk_reset_seq); + + if (netdev_dpdk_last_reset_seq == last_reset_seq) { + seq_wait(netdev_dpdk_reset_seq, netdev_dpdk_last_reset_seq); + } else { + poll_immediate_wake(); + } +} + +static void +netdev_dpdk_run(const struct netdev_class *netdev_class OVS_UNUSED) +{ + uint64_t reset_seq = seq_read(netdev_dpdk_reset_seq); + + if (reset_seq != netdev_dpdk_last_reset_seq) { + dpdk_port_t port_id; + + netdev_dpdk_last_reset_seq = reset_seq; + + for (port_id = 0; port_id < RTE_MAX_ETHPORTS; port_id++) { + struct netdev_dpdk *dev; + bool pending_reset; + + atomic_read_relaxed(&netdev_dpdk_pending_reset[port_id], + &pending_reset); + if (!pending_reset) { + continue; + } + atomic_store_relaxed(&netdev_dpdk_pending_reset[port_id], false); + + ovs_mutex_lock(&dpdk_mutex); + dev = netdev_dpdk_lookup_by_port_id(port_id); + if (dev) { + ovs_mutex_lock(&dev->mutex); + dev->reset_needed = true; + netdev_request_reconfigure(&dev->up); + VLOG_DBG_RL(&rl, "%s: Device reset requested.", + netdev_get_name(&dev->up)); + ovs_mutex_unlock(&dev->mutex); + } + ovs_mutex_unlock(&dpdk_mutex); + } + } +} + static int dpdk_eth_event_callback(dpdk_port_t port_id, enum rte_eth_event_type type, void *param 
OVS_UNUSED, void *ret_param OVS_UNUSED) { - struct netdev_dpdk *dev; - switch ((int) type) { case RTE_ETH_EVENT_INTR_RESET: - ovs_mutex_lock(&dpdk_mutex); - dev = netdev_dpdk_lookup_by_port_id(port_id); - if (dev) { - ovs_mutex_lock(&dev->mutex); - dev->reset_needed = true; - netdev_request_reconfigure(&dev->up); - VLOG_DBG_RL(&rl, "%s: Device reset requested.", - netdev_get_name(&dev->up)); - ovs_mutex_unlock(&dev->mutex); - } - ovs_mutex_unlock(&dpdk_mutex); + atomic_store_relaxed(&netdev_dpdk_pending_reset[port_id], true); + seq_change(netdev_dpdk_reset_seq); break; default: /* Ignore all other types. */ break; - } - return 0; + } + return 0; } static void @@ -5001,6 +5043,8 @@ netdev_dpdk_class_init(void) "[netdev]", 0, 1, netdev_dpdk_get_mempool_info, NULL); + netdev_dpdk_reset_seq = seq_create(); + netdev_dpdk_last_reset_seq = seq_read(netdev_dpdk_reset_seq); ret = rte_eth_dev_callback_register(RTE_ETH_ALL, RTE_ETH_EVENT_INTR_RESET, dpdk_eth_event_callback, NULL); @@ -6593,6 +6637,8 @@ parse_vhost_config(const struct smap *ovs_other_config) #define NETDEV_DPDK_CLASS_BASE \ NETDEV_DPDK_CLASS_COMMON, \ .init = netdev_dpdk_class_init, \ + .run = netdev_dpdk_run, \ + .wait = netdev_dpdk_wait, \ .destruct = netdev_dpdk_destruct, \ .set_tx_multiq = netdev_dpdk_set_tx_multiq, \ .get_carrier = netdev_dpdk_get_carrier, \ From 432a0b93557331e8a5f6a66456f33efd49a2fc9b Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 22 Jan 2024 18:23:06 +0100 Subject: [PATCH 566/833] ci: Run system tests in a separate namespace. GitHub runners use 10.1.0.0/16 network as their base network for eth0 interface. That is causing random system test failures when unexpected conntrack entries for this network are present, because our system tests are mainly using 10.1.1.0/24 subnet for their test networks. Run system tests in their own network namespace to avoid any unwanted interference. 
Ideally, we would run every single test in its own namespace, but that is not a trivial change and will likely be hard to backport. Still worth investigating in the future. Note: Layer3 tunnel tests with Bareudp ports rely on loopback to work, but lo interface is down by default in new namespaces. So, bringing it up. These tests are skipped in Ubuntu 22.04, because it doesn't have bareudp support, but it's better to have the change anyway, so it doesn't bite us in the future while upgrading the base image. Signed-off-by: Ilya Maximets Signed-off-by: Eelco Chaudron --- .ci/linux-build.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.ci/linux-build.sh b/.ci/linux-build.sh index 7c2aebad80e..bf9d6241d52 100755 --- a/.ci/linux-build.sh +++ b/.ci/linux-build.sh @@ -157,6 +157,10 @@ else if [ "$testsuite" != "check" ] && \ [ "$testsuite" != "check-ovsdb-cluster" ] ; then run_as_root="sudo -E PATH=$PATH GITHUB_ACTIONS=$GITHUB_ACTIONS" + sudo ip netns add ovs-system-test-ns + # Some system tests may rely on traffic loopback. + sudo ip -netns ovs-system-test-ns link set dev lo up + run_as_root="${run_as_root} ip netns exec ovs-system-test-ns" fi if [ "${testsuite##*dpdk}" != "$testsuite" ]; then sudo sh -c 'echo 1024 > /proc/sys/vm/nr_hugepages' || true From 3f74d6bf3b4ee5d14bdd83bf29c646220bb1b0fe Mon Sep 17 00:00:00 2001 From: Frode Nordahl Date: Sat, 20 Jan 2024 09:42:52 +0000 Subject: [PATCH 567/833] tests: ovsdb-server: Fix config-file same schema test. When a configuration file is used the ovsdb-server (re-)configures databases in multiple passes. First the configuration file is read and a shash is populated, second the shash is iterated over to remove/create databases. The "ovsdb-server config-file - same schema" test currently relies on a certain ordering of this shash, but we can't really rely on a specific ordering as it would be environment specific. 
The test currently fails on big endian systems such as s390x with: -WARN|failed to open database 'db2': ovsdb error: ordinals: duplicate database name +WARN|failed to open database 'db': ovsdb error: ordinals: duplicate database name Normalize the logged database name so that the test can focus on the fact that duplication is detected rather than in which order. Fixes: 55140090e63a ("ovsdb-server: Allow user-provided config files.") Acked-by: Simon Horman Signed-off-by: Frode Nordahl Signed-off-by: Ilya Maximets --- tests/ovsdb-server.at | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/ovsdb-server.at b/tests/ovsdb-server.at index c87ecc2e36e..b8ccc4c8e2f 100644 --- a/tests/ovsdb-server.at +++ b/tests/ovsdb-server.at @@ -2870,7 +2870,9 @@ m4_define([TEST_CONFIG_FILE], --config-file=config.json], [$3], [ignore], [stderr]) m4_if([$4], [], [], [ AT_CHECK([cat stderr | grep -v -E 'INFO|DBG' \ - | grep -v 'failed to load configuration from' > warnings]) + | grep -v 'failed to load configuration from' \ + | sed -e "/duplicate database name/ s/'db'/'db2'/" \ + > warnings]) AT_CHECK([cat warnings], [0], [m4_if([$3], [0], [$4], [$4 ovsdb-server: server configuration failed ])])]) From 96990ea1e4a597bff3750901ede7b92412ac443e Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Thu, 25 Jan 2024 16:46:53 -0500 Subject: [PATCH 568/833] dp-packet: Reset offload/offsets when clearing a packet. The OVN test suite identified a bug in dp_packet_ol_send_prepare() where a BFD packet flagged as double encapsulated would trigger a seg fault. The problem surfaced because bfd_put_packet was reusing a packet allocated on the stack that wasn't having its flags reset between calls. This change will reset OL flags as well as the layer offsets in data_clear(), which should fix this type of packet reuse issue in general as long as data_clear() is called in between uses. 
Fixes: 8b5fe2dc6080 ("userspace: Add Generic Segmentation Offloading.") Reported-by: Dumitru Ceara Reported-at: https://issues.redhat.com/browse/FDP-300 Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/dp-packet.h | 3 +++ lib/packets.c | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/dp-packet.h b/lib/dp-packet.h index 939bec5c899..dceb701e8d2 100644 --- a/lib/dp-packet.h +++ b/lib/dp-packet.h @@ -207,6 +207,7 @@ void *dp_packet_resize_l2(struct dp_packet *, int increment); void *dp_packet_resize_l2_5(struct dp_packet *, int increment); static inline void *dp_packet_eth(const struct dp_packet *); static inline void dp_packet_reset_offsets(struct dp_packet *); +static inline void dp_packet_reset_offload(struct dp_packet *); static inline uint16_t dp_packet_l2_pad_size(const struct dp_packet *); static inline void dp_packet_set_l2_pad_size(struct dp_packet *, uint16_t); static inline void *dp_packet_l2_5(const struct dp_packet *); @@ -380,6 +381,8 @@ dp_packet_clear(struct dp_packet *b) { dp_packet_set_data(b, dp_packet_base(b)); dp_packet_set_size(b, 0); + dp_packet_reset_offsets(b); + dp_packet_reset_offload(b); } /* Removes 'size' bytes from the head end of 'b', which must contain at least diff --git a/lib/packets.c b/lib/packets.c index f23d2542045..36c6692e5c6 100644 --- a/lib/packets.c +++ b/lib/packets.c @@ -224,7 +224,6 @@ compose_rarp(struct dp_packet *b, const struct eth_addr eth_src) arp->ar_tha = eth_src; put_16aligned_be32(&arp->ar_tpa, htonl(0)); - dp_packet_reset_offsets(b); dp_packet_set_l3(b, arp); b->packet_type = htonl(PT_ETH); } @@ -1114,7 +1113,6 @@ eth_compose(struct dp_packet *b, const struct eth_addr eth_dst, eth->eth_type = htons(eth_type); b->packet_type = htonl(PT_ETH); - dp_packet_reset_offsets(b); dp_packet_set_l3(b, data); return data; @@ -1747,7 +1745,6 @@ compose_arp__(struct dp_packet *b) arp->ar_hln = sizeof arp->ar_sha; arp->ar_pln = sizeof arp->ar_spa; - dp_packet_reset_offsets(b); 
dp_packet_set_l3(b, arp); b->packet_type = htonl(PT_ETH); From 7b838a24fcf55c7f289b801998294c8dcff09e32 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 26 Jan 2024 18:07:52 +0100 Subject: [PATCH 569/833] mcast-snooping: Remove typedef from mcast_group_proto. Typedefs are confusing and the coding style generally advises to not use them. Removing typedef until others start using it. This typedef already got me while testing an OVN update to use OVS 3.3 as a submodule, since the variable was declared in a switch statement and it wasn't clearly visible that there is a variable definition in one of the cases and braces should be used. Strangely some versions of compilers do not require braces in this case, so OVN change works locally, but not in CI. Fixes: 077d0bad0436 ("mcast-snooping: Store IGMP/MLD protocol version.") Acked-by: Mohammad Heib Signed-off-by: Ilya Maximets --- lib/mcast-snooping.c | 6 +++--- lib/mcast-snooping.h | 12 ++++++------ ofproto/ofproto-dpif-xlate.c | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/lib/mcast-snooping.c b/lib/mcast-snooping.c index 60ef8381e9a..dc5164b41c7 100644 --- a/lib/mcast-snooping.c +++ b/lib/mcast-snooping.c @@ -58,7 +58,7 @@ mcast_snooping_flood_unreg(const struct mcast_snooping *ms) } char * -mcast_snooping_group_protocol_str(mcast_group_proto grp_proto) +mcast_snooping_group_protocol_str(enum mcast_group_proto grp_proto) { switch (grp_proto) { case MCAST_GROUP_IGMPV1: @@ -414,7 +414,7 @@ bool mcast_snooping_add_group(struct mcast_snooping *ms, const struct in6_addr *addr, uint16_t vlan, void *port, - mcast_group_proto grp_proto) + enum mcast_group_proto grp_proto) OVS_REQ_WRLOCK(ms->rwlock) { bool learned; @@ -460,7 +460,7 @@ mcast_snooping_add_group(struct mcast_snooping *ms, bool mcast_snooping_add_group4(struct mcast_snooping *ms, ovs_be32 ip4, uint16_t vlan, void *port, - mcast_group_proto grp_proto) + enum mcast_group_proto grp_proto) OVS_REQ_WRLOCK(ms->rwlock) { struct in6_addr addr 
= in6_addr_mapped_ipv4(ip4); diff --git a/lib/mcast-snooping.h b/lib/mcast-snooping.h index 76ab4e4f777..de42cf826ba 100644 --- a/lib/mcast-snooping.h +++ b/lib/mcast-snooping.h @@ -40,13 +40,13 @@ struct mcast_snooping; #define MCAST_MROUTER_PORT_IDLE_TIME 180 /* Multicast group protocol. */ -typedef enum { +enum mcast_group_proto { MCAST_GROUP_IGMPV1 = 0, MCAST_GROUP_IGMPV2, MCAST_GROUP_IGMPV3, MCAST_GROUP_MLDV1, MCAST_GROUP_MLDV2, -} mcast_group_proto; +}; /* Multicast group entry. * Guarded by owning 'mcast_snooping''s rwlock. */ @@ -61,7 +61,7 @@ struct mcast_group { uint16_t vlan; /* Multicast group IPv6/IPv4 Protocol version IGMPv1,2,3 or MLDv1,2 */ - mcast_group_proto protocol_version; + enum mcast_group_proto protocol_version; /* Node in parent struct mcast_snooping group_lru. */ struct ovs_list group_node OVS_GUARDED; @@ -198,11 +198,11 @@ mcast_snooping_lookup4(const struct mcast_snooping *ms, ovs_be32 ip4, bool mcast_snooping_add_group(struct mcast_snooping *ms, const struct in6_addr *addr, uint16_t vlan, void *port, - mcast_group_proto grp_proto) + enum mcast_group_proto grp_proto) OVS_REQ_WRLOCK(ms->rwlock); bool mcast_snooping_add_group4(struct mcast_snooping *ms, ovs_be32 ip4, uint16_t vlan, void *port, - mcast_group_proto grp_proto) + enum mcast_group_proto grp_proto) OVS_REQ_WRLOCK(ms->rwlock); int mcast_snooping_add_report(struct mcast_snooping *ms, const struct dp_packet *p, @@ -224,7 +224,7 @@ bool mcast_snooping_add_mrouter(struct mcast_snooping *ms, uint16_t vlan, OVS_REQ_WRLOCK(ms->rwlock); bool mcast_snooping_is_query(ovs_be16 igmp_type); bool mcast_snooping_is_membership(ovs_be16 igmp_type); -char *mcast_snooping_group_protocol_str(mcast_group_proto grp_proto); +char *mcast_snooping_group_protocol_str(enum mcast_group_proto grp_proto); /* Flush. 
*/ void mcast_snooping_mdb_flush(struct mcast_snooping *ms); diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index f4d1d71945a..1cf4d5f7c9b 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -2796,7 +2796,7 @@ update_mcast_snooping_table4__(const struct xlate_ctx *ctx, OVS_REQ_WRLOCK(ms->rwlock) { const struct igmp_header *igmp; - mcast_group_proto grp_proto; + enum mcast_group_proto grp_proto; int count; size_t offset; ovs_be32 ip4 = flow->igmp_group_ip4; From 1be7f896af85f2777f8147df207612339b4480c0 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 26 Jan 2024 15:38:25 +0100 Subject: [PATCH 570/833] github: Update versions of action dependencies (Node.js 20). checkout@v3, cache@v3 and setup-python@v4 are using outdated Node.js 16 which is now deprecated in GHA [1], so these actions will stop working soon. Updating to most recent major versions with Node.js 20. This stops GHA from throwing warnings in every build. [1] https://github.blog/changelog/2023-09-22-github-actions-transitioning-from-node-16-to-node-20/ While at it also updating upload-artifact version to the latest version. Removed versions from the comment as the general behavior of this action regarding symlinks and wildcards doesn't seem to change between versions much. 
Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- .github/workflows/build-and-test.yml | 42 ++++++++++++++-------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 3807e5f132c..ddb42558093 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -17,7 +17,7 @@ jobs: steps: - name: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: update PATH run: | @@ -45,14 +45,14 @@ jobs: - name: cache id: dpdk_cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: dpdk-dir key: ${{ steps.gen_dpdk_key.outputs.key }} - name: set up python if: steps.dpdk_cache.outputs.cache-hit != 'true' - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.9' @@ -207,7 +207,7 @@ jobs: steps: - name: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: update PATH run: | @@ -215,13 +215,13 @@ jobs: echo "$HOME/.local/bin" >> $GITHUB_PATH - name: set up python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.9' - name: cache if: matrix.dpdk != '' || matrix.dpdk_shared != '' - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: dpdk-dir key: ${{ needs.build-dpdk.outputs.dpdk_key }} @@ -247,9 +247,9 @@ jobs: - name: copy logs on failure if: failure() || cancelled() run: | - # upload-artifact@v2 throws exceptions if it tries to upload socket + # upload-artifact throws exceptions if it tries to upload socket # files and we could have some socket files in testsuite.dir. - # Also, upload-artifact@v2 doesn't work well enough with wildcards. + # Also, upload-artifact doesn't work well enough with wildcards. # So, we're just archiving everything here to avoid any issues. 
mkdir logs cp config.log ./logs/ @@ -259,7 +259,7 @@ jobs: - name: upload logs on failure if: failure() || cancelled() - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: logs-linux-${{ join(matrix.*, '-') }} path: logs.tgz @@ -279,7 +279,7 @@ jobs: steps: - name: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -338,18 +338,18 @@ jobs: - name: check for analyzer result cache id: clang_cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: base-clang-analyzer-results key: ${{ steps.cache_key.outputs.key }} - name: set up python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.9' - name: get cached dpdk-dir - uses: actions/cache/restore@v3 + uses: actions/cache/restore@v4 with: path: dpdk-dir key: ${{ needs.build-dpdk.outputs.dpdk_key }} @@ -368,7 +368,7 @@ jobs: run: ./.ci/linux-build.sh - name: save cache - uses: actions/cache/save@v3 + uses: actions/cache/save@v4 if: steps.clang_cache.outputs.cache-hit != 'true' with: path: base-clang-analyzer-results @@ -391,13 +391,13 @@ jobs: steps: - name: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: update PATH run: | echo "$HOME/bin" >> $GITHUB_PATH echo "$HOME/.local/bin" >> $GITHUB_PATH - name: set up python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.9' - name: install dependencies @@ -408,7 +408,7 @@ jobs: run: ./.ci/osx-build.sh - name: upload logs on failure if: failure() - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: logs-osx-clang---disable-ssl path: config.log @@ -432,7 +432,7 @@ jobs: steps: - name: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: update PATH run: | @@ -454,7 +454,7 @@ jobs: run: ./.ci/linux-build.sh - name: upload deb packages - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: deb-packages-${{ matrix.dpdk }}-dpdk 
path: '/home/runner/work/ovs/*.deb' @@ -470,7 +470,7 @@ jobs: steps: - name: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: install dependencies run: | dnf install -y rpm-build dnf-plugins-core @@ -489,7 +489,7 @@ jobs: run: dnf install -y rpm/rpmbuild/RPMS/*/*.rpm - name: upload rpm packages - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: rpm-packages path: | From 6bdca15791ce53e15101c436d8524f2944336031 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 29 Jan 2024 23:33:56 +0100 Subject: [PATCH 571/833] github: Bump Fedora version to 39. Fedora 37 reached EOL in November. Switch to the most recent version to avoid potential CI failures in the future. Acked-by: Eelco Chaudron Acked-by: Simon Horman Signed-off-by: Ilya Maximets --- .github/workflows/build-and-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index ddb42558093..fc755814861 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -462,7 +462,7 @@ jobs: build-linux-rpm: name: linux rpm fedora runs-on: ubuntu-latest - container: fedora:37 + container: fedora:39 timeout-minutes: 30 strategy: From 027ae2ba1a94fcdb35cedb8e2668d0ba2311938f Mon Sep 17 00:00:00 2001 From: Kevin Traynor Date: Wed, 31 Jan 2024 17:49:37 +0000 Subject: [PATCH 572/833] faq: Update DPDK releases for older branches. Branches 2.17/3.0/3.1/3.2 are using newer DPDK LTS releases. Update the faq. 
Signed-off-by: Kevin Traynor Acked-by: Simon Horman Acked-by: Eelco Chaudron Acked-by: Ilya Maximets --- Documentation/faq/releases.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/faq/releases.rst b/Documentation/faq/releases.rst index 3a8387f8491..49b987b610c 100644 --- a/Documentation/faq/releases.rst +++ b/Documentation/faq/releases.rst @@ -216,10 +216,10 @@ Q: What DPDK version does each Open vSwitch release work with? 2.14.x 19.11.13 2.15.x 20.11.6 2.16.x 20.11.6 - 2.17.x 21.11.5 - 3.0.x 21.11.5 - 3.1.x 22.11.3 - 3.2.x 22.11.3 + 2.17.x 21.11.6 + 3.0.x 21.11.6 + 3.1.x 22.11.4 + 3.2.x 22.11.4 3.3.x 23.11 ============ ======== From 61003d0280625e15796f18d14ce1b5f46e560f3e Mon Sep 17 00:00:00 2001 From: Aaron Conole Date: Mon, 11 Dec 2023 12:04:19 -0500 Subject: [PATCH 573/833] odp: ND: Follow Open Flow spec converting from OF to DP. The OpenFlow spec doesn't require that a user specify icmp_code when specifying a type. However, the conversion for a DP flow asks that the user explicitly specified an icmp_code field to match and forces this via a mask check. This means that valid matches for icmp_type=136,... (for example) won't properly generate a full flow and there will be a much broader match installed in the kernel datapath. This can be worked around by explicitly including icmp_code, but for users that want to write flows which are installed in the kernel, it is not possible to omit icmp_code in the openflow message and still have a neighbor discovery match field included. An alternative way to fix up the flow and mask would be to modify the output of the translation in the xlate_wc_finish() to set the mask when detecting a neighbor discovery related packet. This would require additional matching logic in the xlate_wc_finish() path to validate the ICMP type/code details, and set the masks correctly. The approach taken here is to relax the requirements from the ODP side. 
This follows the OpenFlow specification and only requires that the user include an 'icmp_type=' match, rather than both 'icmp_type=..,icmp_code=0' when matching on neighbor discovery. Signed-off-by: Aaron Conole Signed-off-by: Ilya Maximets --- lib/odp-util.c | 31 +++++++++++-------------- tests/ofproto-macros.at | 15 ++++++++++++ tests/system-traffic.at | 51 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 79 insertions(+), 18 deletions(-) diff --git a/lib/odp-util.c b/lib/odp-util.c index 3eb2c3cb98c..9306c9b4d47 100644 --- a/lib/odp-util.c +++ b/lib/odp-util.c @@ -6464,12 +6464,10 @@ odp_flow_key_from_flow__(const struct odp_flow_key_parms *parms, icmpv6_key->icmpv6_code = ntohs(data->tp_dst); if (is_nd(flow, NULL) - /* Even though 'tp_src' and 'tp_dst' are 16 bits wide, ICMP - * type and code are 8 bits wide. Therefore, an exact match - * looks like htons(0xff), not htons(0xffff). See - * xlate_wc_finish() for details. */ - && (!export_mask || (data->tp_src == htons(0xff) - && data->tp_dst == htons(0xff)))) { + /* Even though 'tp_src' is 16 bits wide, ICMP type is 8 bits + * wide. Therefore, an exact match looks like htons(0xff), + * not htons(0xffff). See xlate_wc_finish() for details. */ + && (!export_mask || data->tp_src == htons(0xff))) { struct ovs_key_nd *nd_key; nd_key = nl_msg_put_unspec_uninit(buf, OVS_KEY_ATTR_ND, sizeof *nd_key); @@ -7185,20 +7183,17 @@ parse_l2_5_onward(const struct nlattr *attrs[OVS_KEY_ATTR_MAX + 1], flow->arp_sha = nd_key->nd_sll; flow->arp_tha = nd_key->nd_tll; if (is_mask) { - /* Even though 'tp_src' and 'tp_dst' are 16 bits wide, - * ICMP type and code are 8 bits wide. Therefore, an - * exact match looks like htons(0xff), not - * htons(0xffff). See xlate_wc_finish() for details. - * */ + /* Even though 'tp_src' is 16 bits wide, ICMP type + * is 8 bits wide. Therefore, an exact match looks + * like htons(0xff), not htons(0xffff). See + * xlate_wc_finish() for details. 
*/ if (!is_all_zeros(nd_key, sizeof *nd_key) && - (flow->tp_src != htons(0xff) || - flow->tp_dst != htons(0xff))) { + flow->tp_src != htons(0xff)) { odp_parse_error(&rl, errorp, - "ICMP (src,dst) masks should be " - "(0xff,0xff) but are actually " - "(%#"PRIx16",%#"PRIx16")", - ntohs(flow->tp_src), - ntohs(flow->tp_dst)); + "ICMP src mask should be " + "(0xff) but is actually " + "(%#"PRIx16")", + ntohs(flow->tp_src)); return ODP_FIT_ERROR; } else { *expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_ND; diff --git a/tests/ofproto-macros.at b/tests/ofproto-macros.at index c870cf8197c..c22fb3c79c3 100644 --- a/tests/ofproto-macros.at +++ b/tests/ofproto-macros.at @@ -146,6 +146,21 @@ strip_stats () { s/bytes:[[0-9]]*/bytes:0/' } +# Strips key32 field from output. +strip_key32 () { + sed 's/key32([[0-9 \/]]*),//' +} + +# Strips packet-type from output. +strip_ptype () { + sed 's/packet_type(ns=[[0-9]]*,id=[[0-9]]*),//' +} + +# Strips bare eth from output. +strip_eth () { + sed 's/eth(),//' +} + # Changes all 'recirc(...)' and 'recirc=...' to say 'recirc()' and # 'recirc=' respectively. This should make output easier to # compare. diff --git a/tests/system-traffic.at b/tests/system-traffic.at index f363a778cc7..62d00376c8c 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -2247,6 +2247,57 @@ AT_CHECK([diff -q payload.bin data_1], [0]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([datapath - Neighbor Discovery with loose match]) +OVS_TRAFFIC_VSWITCHD_START() + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "2001::1:0:392/64", 36:b1:ee:7c:01:03) +ADD_VETH(p1, at_ns1, br0, "2001::1:0:9/64", 36:b1:ee:7c:01:02) + +dnl Set up flows for moving icmp ND Solicit around. This should be the +dnl same for the other ND types. 
+AT_DATA([flows.txt], [dnl +table=0 priority=95 icmp6,icmp_type=136,nd_target=2001::1:0:9 actions=resubmit(,10) +table=0 priority=95 icmp6,icmp_type=136,nd_target=2001::1:0:392 actions=resubmit(,10) +table=0 priority=65 actions=resubmit(,20) +table=10 actions=NORMAL +table=20 actions=drop +]) +AT_CHECK([ovs-ofctl del-flows br0]) +AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) + +dnl Send a mismatching neighbor discovery. +NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 36 b1 ee 7c 01 02 36 b1 ee 7c 01 03 86 dd 60 00 00 00 00 20 3a ff fe 80 00 00 00 00 00 00 f8 16 3e ff fe 04 66 04 fe 80 00 00 00 00 00 00 f8 16 3e ff fe a7 dd 0e 88 00 f1 f2 20 00 00 00 30 00 00 00 00 00 00 00 00 00 00 00 00 00 00 01 02 01 36 b1 ee 7c 01 03 > /dev/null]) + +dnl Send a matching neighbor discovery. +NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 36 b1 ee 7c 01 02 36 b1 ee 7c 01 03 86 dd 60 00 00 00 00 20 3a ff fe 80 00 00 00 00 00 00 f8 16 3e ff fe 04 66 04 fe 80 00 00 00 00 00 00 f8 16 3e ff fe a7 dd 0e 88 00 fe 5f 20 00 00 00 20 01 00 00 00 00 00 00 00 00 00 01 00 00 03 92 02 01 36 b1 ee 7c 01 03 > /dev/null]) + +AT_CHECK([ovs-appctl dpctl/dump-flows | strip_stats | strip_used | dnl + strip_key32 | strip_ptype | strip_eth | strip_recirc | dnl + grep ",nd" | sort], [0], [dnl +recirc_id(),in_port(2),eth(src=36:b1:ee:7c:01:03,dst=36:b1:ee:7c:01:02),eth_type(0x86dd),ipv6(proto=58,frag=no),icmpv6(type=136),nd(target=2001::1:0:392), packets:0, bytes:0, used:never, actions:1,3 +recirc_id(),in_port(2),eth_type(0x86dd),ipv6(proto=58,frag=no),icmpv6(type=136),nd(target=3000::1), packets:0, bytes:0, used:never, actions:drop +]) + +OVS_WAIT_UNTIL([ovs-appctl dpctl/dump-flows | grep ",nd" | wc -l | grep -E ^0]) + +dnl Send a matching neighbor discovery. 
+NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 36 b1 ee 7c 01 02 36 b1 ee 7c 01 03 86 dd 60 00 00 00 00 20 3a ff fe 80 00 00 00 00 00 00 f8 16 3e ff fe 04 66 04 fe 80 00 00 00 00 00 00 f8 16 3e ff fe a7 dd 0e 88 00 fe 5f 20 00 00 00 20 01 00 00 00 00 00 00 00 00 00 01 00 00 03 92 02 01 36 b1 ee 7c 01 03 > /dev/null]) + +dnl Send a mismatching neighbor discovery. +NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 36 b1 ee 7c 01 02 36 b1 ee 7c 01 03 86 dd 60 00 00 00 00 20 3a ff fe 80 00 00 00 00 00 00 f8 16 3e ff fe 04 66 04 fe 80 00 00 00 00 00 00 f8 16 3e ff fe a7 dd 0e 88 00 f1 f2 20 00 00 00 30 00 00 00 00 00 00 00 00 00 00 00 00 00 00 01 02 01 36 b1 ee 7c 01 03 > /dev/null]) + +AT_CHECK([ovs-appctl dpctl/dump-flows | strip_stats | strip_used | dnl + strip_key32 | strip_ptype | strip_eth | strip_recirc | dnl + grep ",nd" | sort], [0], [dnl +recirc_id(),in_port(2),eth(src=36:b1:ee:7c:01:03,dst=36:b1:ee:7c:01:02),eth_type(0x86dd),ipv6(proto=58,frag=no),icmpv6(type=136),nd(target=2001::1:0:392), packets:0, bytes:0, used:never, actions:1,3 +recirc_id(),in_port(2),eth_type(0x86dd),ipv6(proto=58,frag=no),icmpv6(type=136),nd(target=3000::1), packets:0, bytes:0, used:never, actions:drop +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + AT_BANNER([MPLS]) AT_SETUP([mpls - encap header dp-support]) From be695f26fd5667bcc86d78954c4c783979088ead Mon Sep 17 00:00:00 2001 From: Timothy Redaelli Date: Thu, 23 Nov 2023 19:47:54 +0100 Subject: [PATCH 574/833] netdev-offload-tc: Check geneve metadata length. Currently ovs-vswitchd crashes, with hw offloading enabled, if a geneve packet with corrupted metadata is received, because the metadata header is not verified correctly. This commit adds a check for geneve metadata length and, if the header is wrong, the packet is not sent to flower. It also includes a system-traffic test for geneve packets with corrupted metadata. 
Fixes: a468645c6d33 ("lib/tc: add geneve with option match offload") Reported-by: Haresh Khandelwal Signed-off-by: Timothy Redaelli Signed-off-by: Ilya Maximets --- lib/netdev-offload-tc.c | 25 ++++++++++++++++++++----- tests/system-traffic.at | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 5 deletions(-) diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c index 164c7eef63e..921d5231777 100644 --- a/lib/netdev-offload-tc.c +++ b/lib/netdev-offload-tc.c @@ -1785,12 +1785,12 @@ test_key_and_mask(struct match *match) return 0; } -static void +static int flower_match_to_tun_opt(struct tc_flower *flower, const struct flow_tnl *tnl, struct flow_tnl *tnl_mask) { struct geneve_opt *opt, *opt_mask; - int len, cnt = 0; + int tot_opt_len, len, cnt = 0; /* 'flower' always has an exact match on tunnel metadata length, so having * it in a wrong format is not acceptable unless it is empty. */ @@ -1806,7 +1806,7 @@ flower_match_to_tun_opt(struct tc_flower *flower, const struct flow_tnl *tnl, memset(&tnl_mask->metadata.present.map, 0, sizeof tnl_mask->metadata.present.map); } - return; + return 0; } tnl_mask->flags &= ~FLOW_TNL_F_UDPIF; @@ -1820,7 +1820,7 @@ flower_match_to_tun_opt(struct tc_flower *flower, const struct flow_tnl *tnl, sizeof tnl_mask->metadata.present.len); if (!tnl->metadata.present.len) { - return; + return 0; } memcpy(flower->key.tunnel.metadata.opts.gnv, tnl->metadata.opts.gnv, @@ -1834,7 +1834,16 @@ flower_match_to_tun_opt(struct tc_flower *flower, const struct flow_tnl *tnl, * also not masks, but actual lengths in the 'flower' structure. 
*/ len = flower->key.tunnel.metadata.present.len; while (len) { + if (len < sizeof *opt) { + return EOPNOTSUPP; + } + opt = &flower->key.tunnel.metadata.opts.gnv[cnt]; + tot_opt_len = sizeof *opt + opt->length * 4; + if (len < tot_opt_len) { + return EOPNOTSUPP; + } + opt_mask = &flower->mask.tunnel.metadata.opts.gnv[cnt]; opt_mask->length = opt->length; @@ -1842,6 +1851,8 @@ flower_match_to_tun_opt(struct tc_flower *flower, const struct flow_tnl *tnl, cnt += sizeof(struct geneve_opt) / 4 + opt->length; len -= sizeof(struct geneve_opt) + opt->length * 4; } + + return 0; } static void @@ -2287,7 +2298,11 @@ netdev_tc_flow_put(struct netdev *netdev, struct match *match, tnl_mask->flags &= ~(FLOW_TNL_F_DONT_FRAGMENT | FLOW_TNL_F_CSUM); if (!strcmp(netdev_get_type(netdev), "geneve")) { - flower_match_to_tun_opt(&flower, tnl, tnl_mask); + err = flower_match_to_tun_opt(&flower, tnl, tnl_mask); + if (err) { + VLOG_WARN_RL(&warn_rl, "Unable to parse geneve options"); + return err; + } } flower.tunnel = true; } else { diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 62d00376c8c..4fd5dbe593f 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -1001,6 +1001,38 @@ NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([datapath - handling of geneve corrupted metadata]) +OVS_CHECK_GENEVE() + +OVS_TRAFFIC_VSWITCHD_START( + [_ADD_BR([br-underlay]) -- \ + set bridge br0 other-config:hwaddr=f2:ff:00:00:00:01 -- \ + set bridge br-underlay other-config:hwaddr=f2:ff:00:00:00:02]) + +AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"]) +AT_CHECK([ovs-ofctl add-flow br-underlay "actions=normal"]) + +ADD_NAMESPACES(at_ns0) + +dnl Set up underlay link from host into the namespace using veth pair. 
+ADD_VETH(p0, at_ns0, br-underlay, "172.31.1.1/24", f2:ff:00:00:00:03) +AT_CHECK([ip addr add dev br-underlay "172.31.1.100/24"]) +AT_CHECK([ip link set dev br-underlay up]) + +dnl Set up tunnel endpoints on OVS outside the namespace and with a native +dnl linux device inside the namespace. +ADD_OVS_TUNNEL([geneve], [br0], [at_gnv0], [172.31.1.1], [10.1.1.100/24]) +ADD_NATIVE_TUNNEL([geneve], [ns_gnv0], [at_ns0], [172.31.1.100], [10.1.1.1/24], + [vni 0], [address f2:ff:00:00:00:04]) + +NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 f2 ff 00 00 00 02 f2 ff 00 00 00 03 08 00 45 00 00 52 00 01 00 00 40 11 1f f7 ac 1f 01 01 ac 1f 01 64 de c1 17 c1 00 3e 59 e9 01 00 65 58 00 00 00 00 00 03 00 02 f2 ff 00 00 00 01 f2 ff 00 00 00 04 08 00 45 00 00 1c 00 01 00 00 40 01 64 7a 0a 01 01 01 0a 01 01 64 08 00 f7 ff 00 00 00 00 > /dev/null]) + +OVS_WAIT_UNTIL([grep -q 'Invalid Geneve tunnel metadata' ovs-vswitchd.log]) + +OVS_TRAFFIC_VSWITCHD_STOP(["/Invalid Geneve tunnel metadata on bridge br0 while processing icmp,in_port=1,vlan_tci=0x0000,dl_src=f2:ff:00:00:00:04,dl_dst=f2:ff:00:00:00:01,nw_src=10.1.1.1,nw_dst=10.1.1.100,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,icmp_type=8,icmp_code=0/d +/Unable to parse geneve options/d"]) +AT_CLEANUP + AT_SETUP([datapath - ping over gre tunnel by simulated packets]) OVS_CHECK_MIN_KERNEL(3, 10) From 2832faa22aa09b4bde51381fdfe730161fa22248 Mon Sep 17 00:00:00 2001 From: Roberto Bartzen Acosta Date: Mon, 5 Feb 2024 09:36:14 -0300 Subject: [PATCH 575/833] Documentation: Adding note about using the jemalloc library. Updating the reference documentation with the inclusion of possible building problems with libjemalloc and solution suggestions. 
Reported-at: https://bugs.launchpad.net/ubuntu/+source/openvswitch/+bug/2015748 Signed-off-by: Roberto Bartzen Acosta Acked-by: Eelco Chaudron Reviewed-by: Frode Nordahl [simon: rebased; added leading '$' to last configure example] Signed-off-by: Simon Horman --- Documentation/intro/install/general.rst | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Documentation/intro/install/general.rst b/Documentation/intro/install/general.rst index 19e360d47ce..86e85f75dbf 100644 --- a/Documentation/intro/install/general.rst +++ b/Documentation/intro/install/general.rst @@ -344,6 +344,22 @@ you wish to link with jemalloc add it to LIBS:: $ ./configure LIBS=-ljemalloc +.. note:: + Linking Open vSwitch with the jemalloc shared library may not work as + expected in certain operating system development environments. You can + override the automatic compiler decision to avoid possible linker issues by + passing ``-fno-lto`` or ``-fno-builtin`` flag since jemalloc overrides + standard built-in memory allocation functions such as malloc, calloc, etc. + Both options can solve possible jemalloc linker issues with pros and cons for + each case, feel free to choose the path that appears best to you. Disabling + LTO flag example:: + + $ ./configure LIBS=-ljemalloc CFLAGS="-g -O2 -fno-lto" + + Disabling built-in flag example:: + + $ ./configure LIBS=-ljemalloc CFLAGS="-g -O2 -fno-builtin" + .. _general-building: Building From b3fc822208e89c6ba04e1fc972da0bf4426a5847 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 9 Feb 2024 14:35:47 +0000 Subject: [PATCH 576/833] AUTHORS: Add Roberto Bartzen Acosta. 
Signed-off-by: Simon Horman --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index aa9284fb164..fc08f3bbfe8 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -385,6 +385,7 @@ Rob Adams readams@readams.net Rob Hoes rob.hoes@citrix.com Robert Wojciechowicz robertx.wojciechowicz@intel.com Robert Åkerblom-Andersson Robert.nr1@gmail.com +Roberto Bartzen Acosta roberto.acosta@luizalabs.com Robin Jarry rjarry@redhat.com Rohith Basavaraja rohith.basavaraja@gmail.com Roi Dayan roid@nvidia.com From 11b62f5e0b2fa2a2c7cd7bad9014d86bbd02caeb Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 9 Feb 2024 21:47:26 +0100 Subject: [PATCH 577/833] appveyor: Move from MinGW 32bit to msys64. AppVeyor is planning to remove support for MinGW 32bit soon. And we had a couple of incidents where it wasn't available already, so we moved to a 'Previous' image. Move to msys64 instead. While at it making the CI scripts a little nicer, moving the non-Windows parts of the preparation and build to separate files. MSYS2 has its own version of python. However, we do not support building on Windows with non-Windows python build. The main issue is the delimiter symbol in PYTHONPATH. In Windows version it has to be ';', while the python supplied with MSYS2 uses ':' as on Linux, while we detect Windows and pass ';' during the build. Renaming the binary, so the Windows version is used. Additionally switched to Python 3.12, 3.7 reached EoL some time back, though it's still available in AppVeyor. The stderr has to be redirected to stdout for scripts, because any message on stderr is treated as fatal failure by PowerShell. Scripts are running with 'set -e', so a failure of individual commands will fail the script. The OpenSSL download is still failing, but it is out of scope for this change. 
Acked-by: Alin Gabriel Serdean Signed-off-by: Ilya Maximets --- .ci/windows-build.sh | 17 ++++++++++++ .ci/windows-prepare.sh | 11 ++++++++ Makefile.am | 2 ++ appveyor.yml | 59 ++++++++++++++++-------------------------- 4 files changed, 53 insertions(+), 36 deletions(-) create mode 100644 .ci/windows-build.sh create mode 100644 .ci/windows-prepare.sh diff --git a/.ci/windows-build.sh b/.ci/windows-build.sh new file mode 100644 index 00000000000..22994fcdd60 --- /dev/null +++ b/.ci/windows-build.sh @@ -0,0 +1,17 @@ +#!/bin/bash +set -ex + +CONFIGURATION=$1 + +./boot.sh +./configure CC=build-aux/cccl LD="$(which link)" \ + LIBS="-lws2_32 -lShlwapi -liphlpapi -lwbemuuid -lole32 -loleaut32" \ + --prefix=C:/openvswitch/usr --localstatedir=C:/openvswitch/var \ + --sysconfdir=C:/openvswitch/etc --with-pthread=c:/PTHREADS-BUILT/ \ + --enable-ssl --with-openssl=C:/OpenSSL-Win64 \ + --with-vstudiotarget="${CONFIGURATION}" + +make -j4 +make datapath_windows_analyze +make install +make windows_installer diff --git a/.ci/windows-prepare.sh b/.ci/windows-prepare.sh new file mode 100644 index 00000000000..2d76add7150 --- /dev/null +++ b/.ci/windows-prepare.sh @@ -0,0 +1,11 @@ +#!/bin/bash +set -ex + +mkdir -p /var/cache/pacman/pkg/ +pacman -S --noconfirm --needed automake autoconf libtool make patch + +# Use an MSVC linker and a Windows version of Python. 
+mv $(which link) $(which link)_copy +mv $(which python3) $(which python3)_copy + +cd /c/pthreads4w-code && nmake all install diff --git a/Makefile.am b/Makefile.am index 94f488d1837..45fce1243a7 100644 --- a/Makefile.am +++ b/Makefile.am @@ -81,6 +81,8 @@ EXTRA_DIST = \ .ci/linux-prepare.sh \ .ci/osx-build.sh \ .ci/osx-prepare.sh \ + .ci/windows-build.sh \ + .ci/windows-prepare.sh \ .cirrus.yml \ .editorconfig \ .github/workflows/build-and-test.yml \ diff --git a/appveyor.yml b/appveyor.yml index 5903b90d078..373f01a43cc 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,5 +1,5 @@ version: 1.0.{build} -image: Previous Visual Studio 2019 +image: Visual Studio 2019 branches: only: - master @@ -7,54 +7,41 @@ configuration: - Debug - Release clone_folder: C:\openvswitch_compile +shallow_clone: true init: -- ps: $env:PATH ="C:\Python37;"+$env:PATH -- ps: New-Item -Type HardLink -Path "C:\Python37\python3.exe" -Value "C:\Python37\python.exe" -- ps: >- +- ps: $env:PATH ="C:\Python312-x64;"+$env:PATH +- ps: New-Item -Type HardLink -Path "C:\Python312-x64\python3.exe" + -Value "C:\Python312-x64\python.exe" +- ps: | mkdir C:\ovs-build-downloads - mkdir C:\openvswitch\driver - $source = "https://slproweb.com/download/Win64OpenSSL-1_0_2u.exe" - $destination = "C:\ovs-build-downloads\Win64OpenSSL-1_0_2u.exe" - Invoke-WebRequest $source -OutFile $destination cd C:\ovs-build-downloads - .\Win64OpenSSL-1_0_2u.exe /silent /verysilent /sp- /suppressmsgboxes - Start-Sleep -s 30 - - cd C:\openvswitch - - git clone -q https://git.code.sf.net/p/pthreads4w/code c:\pthreads4w-code - - python3 -m pip install pypiwin32 --disable-pip-version-check - cd C:\openvswitch_compile +- ps: git clone -q https://git.code.sf.net/p/pthreads4w/code c:\pthreads4w-code +- ps: python3 -m pip install pypiwin32 --disable-pip-version-check build_script: - '"C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvars64.bat"' -- C:\MinGW\msys\1.0\bin\bash -lc "echo \"C:/MinGW 
/mingw\" > /etc/fstab" -- C:\MinGW\msys\1.0\bin\bash -lc "mv /bin/link.exe /bin/link_copy.exe" -# Build pthreads -- C:\MinGW\msys\1.0\bin\bash -lc "cd /c/pthreads4w-code && nmake all install" -- C:\MinGW\msys\1.0\bin\bash -lc "cd /c/openvswitch_compile && ./boot.sh" -- C:\MinGW\msys\1.0\bin\bash -lc "cd /c/openvswitch_compile && ./configure CC=build-aux/cccl LD=\"`which link`\" LIBS=\"-lws2_32 -lShlwapi -liphlpapi -lwbemuuid -lole32 -loleaut32\" --prefix=C:/openvswitch/usr --localstatedir=C:/openvswitch/var --sysconfdir=C:/openvswitch/etc --with-pthread=c:/PTHREADS-BUILT/ --enable-ssl --with-openssl=C:/OpenSSL-Win64 --with-vstudiotarget=\"%CONFIGURATION%\"" -- C:\MinGW\msys\1.0\bin\bash -lc "cd /c/openvswitch_compile && make -j 4" -- C:\MinGW\msys\1.0\bin\bash -lc "cd /c/openvswitch_compile && make datapath_windows_analyze" -- C:\MinGW\msys\1.0\bin\bash -lc "cd /c/openvswitch_compile && make install" -- C:\MinGW\msys\1.0\bin\bash -lc "cd /c/openvswitch_compile && make windows_installer" -- cp C:\PTHREADS-BUILT\bin\pthreadVC3.dll C:\openvswitch\usr\bin -- cp C:\PTHREADS-BUILT\bin\pthreadVC3.dll C:\openvswitch\usr\sbin -- ps: cp C:\openvswitch_compile\datapath-windows\x64\Win10$env:CONFIGURATION\package\* C:\openvswitch\driver -- ps: cp C:\openvswitch_compile\datapath-windows\x64\Win10$env:CONFIGURATION\package.cer C:\openvswitch\driver -- ps: cp C:\openvswitch_compile\datapath-windows\misc\* C:\openvswitch\driver -- cp c:\openvswitch_compile\windows\ovs-windows-installer\bin\x64\Release\OpenvSwitch.msi c:\OpenvSwitch-%CONFIGURATION%.msi +- ps: C:\msys64\msys2_shell.cmd -here -defterm -no-start -use-full-path -c + ".ci/windows-prepare.sh 2>&1" +- ps: C:\msys64\msys2_shell.cmd -here -defterm -no-start -use-full-path -c + ".ci/windows-build.sh $env:CONFIGURATION 2>&1" +- ps: cp C:\PTHREADS-BUILT\bin\pthreadVC3.dll C:\openvswitch\usr\bin +- ps: cp C:\PTHREADS-BUILT\bin\pthreadVC3.dll C:\openvswitch\usr\sbin +- ps: mkdir C:\openvswitch\driver +- ps: cp 
datapath-windows\x64\Win10$env:CONFIGURATION\package\* C:\openvswitch\driver +- ps: cp datapath-windows\x64\Win10$env:CONFIGURATION\package.cer C:\openvswitch\driver +- ps: cp datapath-windows\misc\* C:\openvswitch\driver +- ps: cp windows\ovs-windows-installer\bin\x64\Release\OpenvSwitch.msi + c:\OpenvSwitch-$env:CONFIGURATION.msi after_build: - - ps: 7z a C:\ovs-master-$env:CONFIGURATION.zip C:\openvswitch - - ps: Push-AppveyorArtifact C:\ovs-master-$env:CONFIGURATION.zip - - ps: Push-AppveyorArtifact C:\OpenvSwitch-$env:CONFIGURATION.msi +- ps: 7z a C:\ovs-master-$env:CONFIGURATION.zip C:\openvswitch +- ps: Push-AppveyorArtifact C:\ovs-master-$env:CONFIGURATION.zip +- ps: Push-AppveyorArtifact C:\OpenvSwitch-$env:CONFIGURATION.msi From bf921e56775624521540d1fd5c2132beb24d06d2 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Mon, 12 Feb 2024 01:50:18 -0500 Subject: [PATCH 578/833] dp-packet: Validate correct offset for L4 inner size. This patch fixes the correctness of dp_packet_inner_l4_size() when checking for the existence of an inner L4 header. Previously it checked for the outer L4 header. This function is currently only used when a packet is already flagged for tunneling, so an incorrect determination isn't possible as long as the flags of the packet are correct. Fixes: 85bcbbed839a ("userspace: Enable tunnel tests with TSO.") Reviewed-by: David Marchand Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/dp-packet.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/dp-packet.h b/lib/dp-packet.h index dceb701e8d2..802d3f3857c 100644 --- a/lib/dp-packet.h +++ b/lib/dp-packet.h @@ -540,7 +540,7 @@ dp_packet_inner_l4(const struct dp_packet *b) static inline size_t dp_packet_inner_l4_size(const struct dp_packet *b) { - return OVS_LIKELY(b->l4_ofs != UINT16_MAX) + return OVS_LIKELY(b->inner_l4_ofs != UINT16_MAX) ? 
(const char *) dp_packet_tail(b) - (const char *) dp_packet_inner_l4(b) - dp_packet_l2_pad_size(b) From 281b8d24c695a3a69ed0b7811414c7e7c415aaaf Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Mon, 12 Feb 2024 01:50:19 -0500 Subject: [PATCH 579/833] bfd: Set proper offsets and flags in BFD packets. Previously the BFD packet creation code did not appropriately set offsets or flags. This contributed to issues involving encapsulation and the TSO code. The transition to using standard functions also means some other metadata like packet_type are set appropriately. Fixes: ccc096898c46 ("bfd: Implement Bidirectional Forwarding Detection.") Reviewed-by: David Marchand Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/bfd.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/lib/bfd.c b/lib/bfd.c index 9698576d071..9af258917bb 100644 --- a/lib/bfd.c +++ b/lib/bfd.c @@ -586,7 +586,6 @@ bfd_put_packet(struct bfd *bfd, struct dp_packet *p, { long long int min_tx, min_rx; struct udp_header *udp; - struct eth_header *eth; struct ip_header *ip; struct msg *msg; @@ -605,15 +604,13 @@ bfd_put_packet(struct bfd *bfd, struct dp_packet *p, * set. */ ovs_assert(!(bfd->flags & FLAG_POLL) || !(bfd->flags & FLAG_FINAL)); - dp_packet_reserve(p, 2); /* Properly align after the ethernet header. */ - eth = dp_packet_put_uninit(p, sizeof *eth); - eth->eth_src = eth_addr_is_zero(bfd->local_eth_src) - ? eth_src : bfd->local_eth_src; - eth->eth_dst = eth_addr_is_zero(bfd->local_eth_dst) - ? eth_addr_bfd : bfd->local_eth_dst; - eth->eth_type = htons(ETH_TYPE_IP); + ip = eth_compose(p, + eth_addr_is_zero(bfd->local_eth_dst) + ? eth_addr_bfd : bfd->local_eth_dst, + eth_addr_is_zero(bfd->local_eth_src) + ? 
eth_src : bfd->local_eth_src, + ETH_TYPE_IP, sizeof *ip + sizeof *udp + sizeof *msg); - ip = dp_packet_put_zeros(p, sizeof *ip); ip->ip_ihl_ver = IP_IHL_VER(5, 4); ip->ip_tot_len = htons(sizeof *ip + sizeof *udp + sizeof *msg); ip->ip_ttl = MAXTTL; @@ -621,15 +618,17 @@ bfd_put_packet(struct bfd *bfd, struct dp_packet *p, ip->ip_proto = IPPROTO_UDP; put_16aligned_be32(&ip->ip_src, bfd->ip_src); put_16aligned_be32(&ip->ip_dst, bfd->ip_dst); - /* Checksum has already been zeroed by put_zeros call. */ + /* Checksum has already been zeroed by eth_compose call. */ ip->ip_csum = csum(ip, sizeof *ip); + dp_packet_set_l4(p, ip + 1); - udp = dp_packet_put_zeros(p, sizeof *udp); + udp = dp_packet_l4(p); udp->udp_src = htons(bfd->udp_src); udp->udp_dst = htons(BFD_DEST_PORT); udp->udp_len = htons(sizeof *udp + sizeof *msg); + /* Checksum already zero from eth_compose. */ - msg = dp_packet_put_uninit(p, sizeof *msg); + msg = (struct msg *)(udp + 1); msg->vers_diag = (BFD_VERSION << 5) | bfd->diag; msg->flags = (bfd->state & STATE_MASK) | bfd->flags; From 8cebf7efbaf49acc57d03885a41abe990a7349bd Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Mon, 12 Feb 2024 01:50:20 -0500 Subject: [PATCH 580/833] dp-packet: Include inner offsets in adjustments and checks. Include inner offsets in functions where l3 and l4 offsets are either modified or checked. Fixes: 084c8087292c ("userspace: Support VXLAN and GENEVE TSO.") Reviewed-by: David Marchand Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/dp-packet.c | 18 +++++++++++++----- lib/odp-execute-avx512.c | 31 ++++++++++++++++++++----------- 2 files changed, 33 insertions(+), 16 deletions(-) diff --git a/lib/dp-packet.c b/lib/dp-packet.c index 0e23c766e1b..305822293b9 100644 --- a/lib/dp-packet.c +++ b/lib/dp-packet.c @@ -507,6 +507,8 @@ dp_packet_resize_l2_5(struct dp_packet *b, int increment) /* Adjust layer offsets after l2_5. 
*/ dp_packet_adjust_layer_offset(&b->l3_ofs, increment); dp_packet_adjust_layer_offset(&b->l4_ofs, increment); + dp_packet_adjust_layer_offset(&b->inner_l3_ofs, increment); + dp_packet_adjust_layer_offset(&b->inner_l4_ofs, increment); return dp_packet_data(b); } @@ -529,17 +531,23 @@ dp_packet_compare_offsets(struct dp_packet *b1, struct dp_packet *b2, if ((b1->l2_pad_size != b2->l2_pad_size) || (b1->l2_5_ofs != b2->l2_5_ofs) || (b1->l3_ofs != b2->l3_ofs) || - (b1->l4_ofs != b2->l4_ofs)) { + (b1->l4_ofs != b2->l4_ofs) || + (b1->inner_l3_ofs != b2->inner_l3_ofs) || + (b1->inner_l4_ofs != b2->inner_l4_ofs)) { if (err_str) { ds_put_format(err_str, "Packet offset comparison failed\n"); ds_put_format(err_str, "Buffer 1 offsets: l2_pad_size %u," - " l2_5_ofs : %u l3_ofs %u, l4_ofs %u\n", + " l2_5_ofs : %u l3_ofs %u, l4_ofs %u," + " inner_l3_ofs %u, inner_l4_ofs %u\n", b1->l2_pad_size, b1->l2_5_ofs, - b1->l3_ofs, b1->l4_ofs); + b1->l3_ofs, b1->l4_ofs, + b1->inner_l3_ofs, b1->inner_l4_ofs); ds_put_format(err_str, "Buffer 2 offsets: l2_pad_size %u," - " l2_5_ofs : %u l3_ofs %u, l4_ofs %u\n", + " l2_5_ofs : %u l3_ofs %u, l4_ofs %u," + " inner_l3_ofs %u, inner_l4_ofs %u\n", b2->l2_pad_size, b2->l2_5_ofs, - b2->l3_ofs, b2->l4_ofs); + b2->l3_ofs, b2->l4_ofs, + b2->inner_l3_ofs, b2->inner_l4_ofs); } return false; } diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c index 747e04014ab..50c48bfd479 100644 --- a/lib/odp-execute-avx512.c +++ b/lib/odp-execute-avx512.c @@ -35,10 +35,10 @@ VLOG_DEFINE_THIS_MODULE(odp_execute_avx512); -/* The below three build asserts make sure that l2_5_ofs, l3_ofs, and l4_ofs - * fields remain in the same order and offset to l2_padd_size. This is needed - * as the avx512_dp_packet_resize_l2() function will manipulate those fields at - * a fixed memory index based on the l2_padd_size offset. */ +/* The below build asserts make sure that the below fields remain in the same + * order and offset to l2_pad_size. 
This is needed as the + * avx512_dp_packet_resize_l2() function will manipulate those fields at a + * fixed memory index based on the l2_pad_size offset. */ BUILD_ASSERT_DECL(offsetof(struct dp_packet, l2_pad_size) + MEMBER_SIZEOF(struct dp_packet, l2_pad_size) == offsetof(struct dp_packet, l2_5_ofs)); @@ -51,6 +51,14 @@ BUILD_ASSERT_DECL(offsetof(struct dp_packet, l3_ofs) + MEMBER_SIZEOF(struct dp_packet, l3_ofs) == offsetof(struct dp_packet, l4_ofs)); +BUILD_ASSERT_DECL(offsetof(struct dp_packet, l4_ofs) + + MEMBER_SIZEOF(struct dp_packet, l4_ofs) == + offsetof(struct dp_packet, inner_l3_ofs)); + +BUILD_ASSERT_DECL(offsetof(struct dp_packet, inner_l3_ofs) + + MEMBER_SIZEOF(struct dp_packet, inner_l3_ofs) == + offsetof(struct dp_packet, inner_l4_ofs)); + /* The below build assert makes sure it's safe to read/write 128-bits starting * at the l2_pad_size location. */ BUILD_ASSERT_DECL(sizeof(struct dp_packet) - @@ -112,7 +120,7 @@ avx512_dp_packet_resize_l2(struct dp_packet *b, int resize_by_bytes) dp_packet_pull(b, -resize_by_bytes); } - /* The next step is to update the l2_5_ofs, l3_ofs and l4_ofs fields which + /* The next step is to update the l2_5_ofs to inner_l4_ofs fields which * the scalar implementation does with the dp_packet_adjust_layer_offset() * function. */ @@ -122,13 +130,14 @@ avx512_dp_packet_resize_l2(struct dp_packet *b, int resize_by_bytes) /* Set the v_u16_max register to all one's. */ const __m128i v_u16_max = _mm_cmpeq_epi16(v_zeros, v_zeros); - /* Each lane represents 16 bits in a 12-bit register. In this case the - * first three 16-bit values, which will map to the l2_5_ofs, l3_ofs and - * l4_ofs fields. */ - const uint8_t k_lanes = 0b1110; + /* Each lane represents 16 bits in a 128-bit register. Here the bitmask + * starts at l2_5_ofs with a value of 0 indicating it is not modified. Then + * five 1's to indicate modificaiton of all fields from l2_5_ofs to + * inner_l4_ofs. 
*/ + const uint8_t k_lanes = 0b111110; /* Set all 16-bit words in the 128-bits v_offset register to the value we - * need to add/substract from the l2_5_ofs, l3_ofs, and l4_ofs fields. */ + * need to add/substract from the l2_5_ofs to inner_l4_ofs fields. */ __m128i v_offset = _mm_set1_epi16(abs(resize_by_bytes)); /* Load 128 bits from the dp_packet structure starting at the l2_pad_size @@ -147,7 +156,7 @@ avx512_dp_packet_resize_l2(struct dp_packet *b, int resize_by_bytes) /* Based on the bytes adjust (positive, or negative) it will do the actual * add or subtraction. These functions will only operate on the lanes * (fields) requested based on k_cmp, i.e: - * k_cmp = [l2_5_ofs, l3_ofs, l4_ofs] + * k_cmp = [l2_5_ofs, ..., inner_l4_ofs] * for field in kcmp * v_adjust_src[field] = v_adjust_src[field] + v_offset */ From 0061a48920bcf28204670627c45cb66a1da44ae6 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Mon, 12 Feb 2024 01:50:21 -0500 Subject: [PATCH 581/833] ofproto-dpif-monitor: Remove unneeded calls to clear packets. Currently the monitor will call dp_packet_clear() on the dp_packet that is shared amongst BFD, LLDP, and CFM. However, all of these packets are created with eth_compose(), which already calls dp_packet_clear(). 
Reviewed-by: David Marchand Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif-monitor.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/ofproto/ofproto-dpif-monitor.c b/ofproto/ofproto-dpif-monitor.c index bb0e4909101..5132f9c952f 100644 --- a/ofproto/ofproto-dpif-monitor.c +++ b/ofproto/ofproto-dpif-monitor.c @@ -275,19 +275,16 @@ monitor_mport_run(struct mport *mport, struct dp_packet *packet) long long int lldp_wake_time = LLONG_MAX; if (mport->cfm && cfm_should_send_ccm(mport->cfm)) { - dp_packet_clear(packet); cfm_compose_ccm(mport->cfm, packet, mport->hw_addr); ofproto_dpif_send_packet(mport->ofport, false, packet); } if (mport->bfd && bfd_should_send_packet(mport->bfd)) { bool oam; - dp_packet_clear(packet); bfd_put_packet(mport->bfd, packet, mport->hw_addr, &oam); ofproto_dpif_send_packet(mport->ofport, oam, packet); } if (mport->lldp && lldp_should_send_packet(mport->lldp)) { - dp_packet_clear(packet); lldp_put_packet(mport->lldp, packet, mport->hw_addr); ofproto_dpif_send_packet(mport->ofport, false, packet); } From 19cffe30cfdad6f3a0fcc57069d6fb005233ee92 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Mon, 12 Feb 2024 09:18:32 +0100 Subject: [PATCH 582/833] netdev-linux: Avoid deadlock in netdev_get_speed. netdev_linux_get_speed needs to lock netdev_linux->mutex, and so do the internal tc operations. Therefore, the former cannot be called from the latter. Create a lock-free version of netdev_linux_get_speed() and call it from tc operations. Also expand the unit test to cover queues where ceil is determined by the maximum link speed. 
Fixes: b8f8fad86435 ("netdev-linux: Use speed as max rate in tc classes.") Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2024-January/052912.html Reported-by: Daryl Wang Suggested-by: Ilya Maximets Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- lib/netdev-linux.c | 32 +++++++++++++++---------- tests/system-traffic.at | 53 ++++++++++++++++++++++++++++------------- 2 files changed, 56 insertions(+), 29 deletions(-) diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 1b2e5b6c2bc..00df7f63417 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -2721,16 +2721,11 @@ netdev_linux_get_features(const struct netdev *netdev_, } static int -netdev_linux_get_speed(const struct netdev *netdev_, uint32_t *current, - uint32_t *max) +netdev_linux_get_speed_locked(struct netdev_linux *netdev, + uint32_t *current, uint32_t *max) { - struct netdev_linux *netdev = netdev_linux_cast(netdev_); - int error; - - ovs_mutex_lock(&netdev->mutex); if (netdev_linux_netnsid_is_remote(netdev)) { - error = EOPNOTSUPP; - goto exit; + return EOPNOTSUPP; } netdev_linux_read_features(netdev); @@ -2740,9 +2735,18 @@ netdev_linux_get_speed(const struct netdev *netdev_, uint32_t *current, *max = MIN(UINT32_MAX, netdev_features_to_bps(netdev->supported, 0) / 1000000ULL); } - error = netdev->get_features_error; + return netdev->get_features_error; +} -exit: +static int +netdev_linux_get_speed(const struct netdev *netdev_, uint32_t *current, + uint32_t *max) +{ + struct netdev_linux *netdev = netdev_linux_cast(netdev_); + int error; + + ovs_mutex_lock(&netdev->mutex); + error = netdev_linux_get_speed_locked(netdev, current, max); ovs_mutex_unlock(&netdev->mutex); return error; } @@ -4954,8 +4958,10 @@ htb_parse_qdisc_details__(struct netdev *netdev, const struct smap *details, hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8; if (!hc->max_rate) { uint32_t current_speed; + uint32_t max_speed OVS_UNUSED; - netdev_get_speed(netdev, &current_speed, NULL); +
netdev_linux_get_speed_locked(netdev_linux_cast(netdev), + &current_speed, &max_speed); hc->max_rate = current_speed ? current_speed / 8 * 1000000ULL : NETDEV_DEFAULT_BPS / 8; } @@ -5424,8 +5430,10 @@ hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details, uint32_t max_rate = smap_get_ullong(details, "max-rate", 0) / 8; if (!max_rate) { uint32_t current_speed; + uint32_t max_speed OVS_UNUSED; - netdev_get_speed(netdev, &current_speed, NULL); + netdev_linux_get_speed_locked(netdev_linux_cast(netdev), + &current_speed, &max_speed); max_rate = current_speed ? current_speed / 8 * 1000000ULL : NETDEV_DEFAULT_BPS / 8; } diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 4fd5dbe593f..e68fe7e1859 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -2541,34 +2541,53 @@ AT_BANNER([QoS]) AT_SETUP([QoS - basic configuration]) OVS_CHECK_TC_QDISC() +AT_SKIP_IF([test $HAVE_ETHTOOL = "no"]) OVS_TRAFFIC_VSWITCHD_START() -ADD_NAMESPACES(at_ns0, at_ns1) +AT_CHECK([ip tuntap add ovs-tap0 mode tap]) +on_exit 'ip link del ovs-tap0' +AT_CHECK([ip tuntap add ovs-tap1 mode tap]) +on_exit 'ip link del ovs-tap1' -ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") -ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") +dnl Set maximum link speed to 5Gb. +AT_CHECK([ethtool -s ovs-tap0 speed 5000 duplex full]) +AT_CHECK([ip link set dev ovs-tap0 up]) +AT_CHECK([ethtool -s ovs-tap1 speed 5000 duplex full]) +AT_CHECK([ip link set dev ovs-tap1 up]) -dnl Adding a custom qdisc to ovs-p1, ovs-p0 will have the default qdisc. -AT_CHECK([tc qdisc add dev ovs-p1 root noqueue]) -AT_CHECK([tc qdisc show dev ovs-p1 | grep -q noqueue]) +AT_CHECK([ovs-vsctl add-port br0 ovs-tap0 -- set int ovs-tap0 type=tap]) +AT_CHECK([ovs-vsctl add-port br0 ovs-tap1 -- set int ovs-tap1 type=tap]) -dnl Configure the same QoS for both ports.
-AT_CHECK([ovs-vsctl set port ovs-p0 qos=@qos -- set port ovs-p1 qos=@qos dnl - -- --id=@qos create qos dnl - type=linux-htb other-config:max-rate=3000000 queues:0=@queue dnl - -- --id=@queue create queue dnl +dnl Adding a custom qdisc to ovs-tap1, ovs-tap0 will have the default qdisc. +AT_CHECK([tc qdisc add dev ovs-tap1 root noqueue]) +AT_CHECK([tc qdisc show dev ovs-tap1 | grep -q noqueue]) + +dnl Configure the same QoS for both ports: +dnl queue0 uses fixed max-rate. +dnl queue1 relies on underlying link speed. +AT_CHECK([ovs-vsctl dnl + -- --id=@queue0 create queue dnl other_config:min-rate=2000000 other_config:max-rate=3000000 dnl - other_config:burst=3000000], + other_config:burst=3000000 dnl + -- --id=@queue1 create queue dnl + other_config:min-rate=4000000 other_config:burst=4000000 dnl + -- --id=@qos create qos dnl + type=linux-htb queues:0=@queue0 dnl + queues:1=@queue1 -- dnl + -- set port ovs-tap0 qos=@qos -- set port ovs-tap1 qos=@qos], [ignore], [ignore]) dnl Wait for qdiscs to be applied. -OVS_WAIT_UNTIL([tc qdisc show dev ovs-p0 | grep -q htb]) -OVS_WAIT_UNTIL([tc qdisc show dev ovs-p1 | grep -q htb]) +OVS_WAIT_UNTIL([tc qdisc show dev ovs-tap0 | grep -q htb]) +OVS_WAIT_UNTIL([tc qdisc show dev ovs-tap1 | grep -q htb]) dnl Check the configuration. 
-m4_define([HTB_CONF], [rate 2Mbit ceil 3Mbit burst 375000b cburst 375000b]) -AT_CHECK([tc class show dev ovs-p0 | grep -q 'class htb .* HTB_CONF']) -AT_CHECK([tc class show dev ovs-p1 | grep -q 'class htb .* HTB_CONF']) +m4_define([HTB_CONF0], [rate 2Mbit ceil 3Mbit burst 375000b cburst 375000b]) +m4_define([HTB_CONF1], [rate 4Mbit ceil 5Gbit burst 500000b cburst 500000b]) +AT_CHECK([tc class show dev ovs-tap0 | grep -q 'class htb .* HTB_CONF0']) +AT_CHECK([tc class show dev ovs-tap0 | grep -q 'class htb .* HTB_CONF1']) +AT_CHECK([tc class show dev ovs-tap1 | grep -q 'class htb .* HTB_CONF0']) +AT_CHECK([tc class show dev ovs-tap1 | grep -q 'class htb .* HTB_CONF1']) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP From dda253293fd07033ef333c8bdce925d983185978 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 13 Feb 2024 20:42:36 +0100 Subject: [PATCH 583/833] ovs-pki: Remove executable bit from private/cakey.pem. It's not an executable file. Acked-by: Mike Pattrick Acked-by: Simon Horman Signed-off-by: Ilya Maximets --- utilities/ovs-pki.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utilities/ovs-pki.in b/utilities/ovs-pki.in index e0ba910f94c..37913cb6ba1 100755 --- a/utilities/ovs-pki.in +++ b/utilities/ovs-pki.in @@ -318,7 +318,7 @@ EOF -extensions ca_cert -out cacert.pem \ -days 3650 -batch -keyfile private/cakey.pem -selfsign \ -infiles careq.pem 1>&3 2>&3 - chmod 0700 private/cakey.pem + chmod 0600 private/cakey.pem cd "$oldpwd" done From c7dd0a7b09add427555a986d504b012240f49a28 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 13 Feb 2024 20:44:41 +0100 Subject: [PATCH 584/833] ovs-pki: Remove umask trick for self-signing. The output file of this openssl command is a certificate signed with pre-existing private key. It doesn't create a private key. The restricted permissions are explicitly removed from the resulted certificate right after its generation. 
So, there is no point in creating it with restricted permissions in the first place. Fixes: 99e5e05db37a ("ovs-pki: Create private keys with restricted permissions.") Acked-by: Mike Pattrick Acked-by: Simon Horman Signed-off-by: Ilya Maximets --- utilities/ovs-pki.in | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/utilities/ovs-pki.in b/utilities/ovs-pki.in index 37913cb6ba1..b0c5389031e 100755 --- a/utilities/ovs-pki.in +++ b/utilities/ovs-pki.in @@ -545,16 +545,9 @@ elif test "$command" = self-sign; then cat > "$TMP/v3.ext" <&3 || exit $? - - # Reset the permissions on the certificate to the user's default. - cat "$arg1-cert.pem.tmp" > "$arg1-cert.pem" - rm -f "$arg1-cert.pem.tmp" + openssl x509 -in "$arg1-req.pem" -out "$arg1-cert.pem" \ + -signkey "$arg1-privkey.pem" -req -days 3650 -text \ + -extfile $TMP/v3.ext 2>&3 || exit $? else echo "$0: $command command unknown; use --help for help" >&2 exit 1 From 5df46a44e8755d065b7247a0e7d2c90e9781219f Mon Sep 17 00:00:00 2001 From: Jakob Meng Date: Fri, 26 Jan 2024 14:24:51 +0100 Subject: [PATCH 585/833] dpif-netdev: Increase MAX_RECIRC_DEPTH to 8. In a scenario where OVN does load balancing and then SNAT with an OVS userspace datapath [0], the recirc_depth may be greater than 6. In that case, ovs-vswitchd might drop packets and raise warnings: dpif_netdev|WARN|Packet dropped. Max recirculation depth exceeded. Increasing MAX_RECIRC_DEPTH to 8 solves this issue.
[0] https://github.com/ovn-org/ovn/blob/dd5cd73e3df1bfb1a215cb45d1e2e03eff1d049a/tests/system-ovn-kmod.at#L740 Reported-at: https://issues.redhat.com/browse/FDP-251 Acked-by: Simon Horman Signed-off-by: Jakob Meng Signed-off-by: Ilya Maximets --- lib/dpif-netdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index c1981137f92..46e24d204d4 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -99,7 +99,7 @@ VLOG_DEFINE_THIS_MODULE(dpif_netdev); #define FLOW_DUMP_MAX_BATCH 50 /* Use per thread recirc_depth to prevent recirculation loop. */ -#define MAX_RECIRC_DEPTH 6 +#define MAX_RECIRC_DEPTH 8 DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0) /* Use instant packet send by default. */ From 3e666ba000b5eff58da8abb4e8c694ac3f7b08d6 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 15 Feb 2024 12:55:59 +0100 Subject: [PATCH 586/833] rstp: Fix deadlock with patch ports. The cited commit removed direct call to RSTP module from a callback, but we can still enter the module after going through a patch port to a different bridge via ofproto_dpif_send_packet(). Partially revert the change going back to a recursive mutex. Adding the same test for both RSTP and STP. While STP unit tests do catch the same problem for STP (if STP mutex changed to be non-recursive), they are not actually using the same callback function as ovs-vswitchd, so it makes sense to test the implementation in ovs-vswitchd itself as well. 
Fixes: 6b90bc57e7a2 ("lib/rstp: Remove lock recursion.") Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2024-February/052925.html Reported-by: Huangzhidong Acked-by: Simon Horman Signed-off-by: Ilya Maximets --- lib/rstp.c | 6 ++++- tests/rstp.at | 57 +++++++++++++++++++++++++++++++++++++++++++++ tests/stp.at | 59 +++++++++++++++++++++++++++++++++++++++++++++++ tests/test-rstp.c | 2 ++ 4 files changed, 123 insertions(+), 1 deletion(-) diff --git a/lib/rstp.c b/lib/rstp.c index 2f01966f796..90e80945997 100644 --- a/lib/rstp.c +++ b/lib/rstp.c @@ -50,7 +50,7 @@ VLOG_DEFINE_THIS_MODULE(rstp); -struct ovs_mutex rstp_mutex = OVS_MUTEX_INITIALIZER; +struct ovs_mutex rstp_mutex; static struct ovs_list all_rstps__ = OVS_LIST_INITIALIZER(&all_rstps__); static struct ovs_list *const all_rstps OVS_GUARDED_BY(rstp_mutex) = &all_rstps__; @@ -248,6 +248,10 @@ void rstp_init(void) OVS_EXCLUDED(rstp_mutex) { + /* We need a recursive mutex because rstp_send_bpdu() could loop back + * into the rstp module through a patch port. */ + ovs_mutex_init_recursive(&rstp_mutex); + unixctl_command_register("rstp/tcn", "[bridge]", 0, 1, rstp_unixctl_tcn, NULL); unixctl_command_register("rstp/show", "[bridge]", 0, 1, rstp_unixctl_show, diff --git a/tests/rstp.at b/tests/rstp.at index 600e85dabde..e0d4bed4f05 100644 --- a/tests/rstp.at +++ b/tests/rstp.at @@ -253,3 +253,60 @@ AT_CHECK([ovs-vsctl del-port br0 p1]) OVS_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([RSTP - patch ports]) +# Create br0 with interfaces p1 and p7 +# and br1 with interfaces p2 and p8 +# with p1 and p2 being connected patch ports. 
+OVS_VSWITCHD_START( + [set port br0 other_config:rstp-enable=false -- \ + set bridge br0 rstp-enable=true +]) + +AT_CHECK([add_of_br 1 \ + set port br1 other_config:rstp-enable=false -- \ + set bridge br1 rstp-enable=true]) + +ovs-appctl time/stop + +AT_CHECK([ovs-vsctl \ + add-port br0 p1 -- \ + set interface p1 type=patch options:peer=p2 ofport_request=1 -- \ + set port p1 other_config:rstp-enable=true -- \ + add-port br1 p2 -- \ + set interface p2 type=patch options:peer=p1 ofport_request=2 -- \ + set port p2 other_config:rstp-enable=true -- \ +]) + +AT_CHECK([ovs-vsctl \ + add-port br0 p7 -- \ + set interface p7 ofport_request=7 type=dummy -- \ + set port p7 other_config:rstp-enable=false -- \ + add-port br1 p8 -- \ + set interface p8 ofport_request=8 type=dummy -- \ + set port p8 other_config:rstp-enable=false -- \ +]) + +AT_CHECK([ovs-ofctl add-flow br0 "in_port=7 icmp actions=1"]) +AT_CHECK([ovs-ofctl add-flow br0 "in_port=1 icmp actions=7"]) +AT_CHECK([ovs-ofctl add-flow br1 "in_port=8 icmp actions=2"]) +AT_CHECK([ovs-ofctl add-flow br1 "in_port=2 icmp actions=8"]) + +# Give time for RSTP to synchronize. 
+ovs-appctl time/warp 5000 500 + +OVS_WAIT_UNTIL_EQUAL([cat ovs-vswitchd.log | FILTER_STP_TOPOLOGY], [dnl +port p1: RSTP state changed from Disabled to Discarding +port p2: RSTP state changed from Disabled to Discarding +port p2: RSTP state changed from Discarding to Forwarding +port p1: RSTP state changed from Discarding to Forwarding]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(7),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)' | grep Datapath], [0], [dnl +Datapath actions: 8 +]) +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(8),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=10.0.0.3,dst=10.0.0.4,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)' | grep Datapath], [0], [dnl +Datapath actions: 7 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP diff --git a/tests/stp.at b/tests/stp.at index e7bf3958a0a..75abe8e5ca0 100644 --- a/tests/stp.at +++ b/tests/stp.at @@ -464,6 +464,65 @@ Datapath actions: 2 AT_CLEANUP +AT_SETUP([STP - patch ports]) +# Create br0 with interfaces p1 and p7 +# and br1 with interfaces p2 and p8 +# with p1 and p2 being connected patch ports. 
+OVS_VSWITCHD_START( + [set port br0 other_config:stp-enable=false -- \ + set bridge br0 stp-enable=true +]) + +AT_CHECK([add_of_br 1 \ + set port br1 other_config:stp-enable=false -- \ + set bridge br1 stp-enable=true]) + +ovs-appctl time/stop + +AT_CHECK([ovs-vsctl \ + add-port br0 p1 -- \ + set interface p1 type=patch options:peer=p2 ofport_request=1 -- \ + set port p1 other_config:stp-enable=true -- \ + add-port br1 p2 -- \ + set interface p2 type=patch options:peer=p1 ofport_request=2 -- \ + set port p2 other_config:stp-enable=true -- \ +]) + +AT_CHECK([ovs-vsctl \ + add-port br0 p7 -- \ + set interface p7 ofport_request=7 type=dummy -- \ + set port p7 other_config:stp-enable=false -- \ + add-port br1 p8 -- \ + set interface p8 ofport_request=8 type=dummy -- \ + set port p8 other_config:stp-enable=false -- \ +]) + +AT_CHECK([ovs-ofctl add-flow br0 "in_port=7 icmp actions=1"]) +AT_CHECK([ovs-ofctl add-flow br0 "in_port=1 icmp actions=7"]) +AT_CHECK([ovs-ofctl add-flow br1 "in_port=8 icmp actions=2"]) +AT_CHECK([ovs-ofctl add-flow br1 "in_port=2 icmp actions=8"]) + +# Give time for STP to synchronize. 
+ovs-appctl time/warp 30000 3000 + +OVS_WAIT_UNTIL_EQUAL([cat ovs-vswitchd.log | FILTER_STP_TOPOLOGY], [dnl +port <>: STP state changed from disabled to listening +port <>: STP state changed from disabled to listening +port <>: STP state changed from listening to learning +port <>: STP state changed from listening to learning +port <>: STP state changed from learning to forwarding +port <>: STP state changed from learning to forwarding]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(7),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)' | grep Datapath], [0], [dnl +Datapath actions: 8 +]) +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(8),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=10.0.0.3,dst=10.0.0.4,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)' | grep Datapath], [0], [dnl +Datapath actions: 7 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([STP - flush the fdb and mdb when topology changed]) OVS_VSWITCHD_START([]) diff --git a/tests/test-rstp.c b/tests/test-rstp.c index 9c1026ec1a8..707ee3a6c8a 100644 --- a/tests/test-rstp.c +++ b/tests/test-rstp.c @@ -469,6 +469,8 @@ test_rstp_main(int argc, char *argv[]) vlog_set_pattern(VLF_CONSOLE, "%c|%p|%m"); vlog_set_levels(NULL, VLF_SYSLOG, VLL_OFF); + rstp_init(); + if (argc != 2) { ovs_fatal(0, "usage: test-rstp INPUT.RSTP"); } From cb0cbffbe8fb949a0dc7f1d14782655aae566d61 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Thu, 15 Feb 2024 17:53:02 -0500 Subject: [PATCH 587/833] netdev-linux: Favour inner packet for multi-encapsulated TSO. Previously if an OVS configuration nested multiple layers of UDP tunnels like VXLAN or GENEVE on top of each other through netdev-linux interfaces, the vnet header would be incorrectly set to the outermost UDP tunnel layer instead of the intermediary tunnel layer. This resulted in the middle UDP tunnel not checksum offloading properly. 
Fixes: 85bcbbed839a ("userspace: Enable tunnel tests with TSO.") Reported-by: David Marchand Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/netdev-linux.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 00df7f63417..8964cd67014 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -7247,14 +7247,23 @@ netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu) vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof( struct tcp_header, tcp_csum); } else if (dp_packet_hwol_l4_is_udp(b)) { - struct udp_header *udp_hdr = dp_packet_l4(b); + /* Favour the inner packet when indicating checksum offsets. */ + void *l3_off = dp_packet_inner_l3(b); + void *l4_off = dp_packet_inner_l4(b); + + if (!l3_off || !l4_off) { + l3_off = dp_packet_l3(b); + l4_off = dp_packet_l4(b); + } + struct udp_header *udp_hdr = l4_off; + ovs_be16 csum = 0; if (dp_packet_hwol_is_ipv4(b)) { - const struct ip_header *ip_hdr = dp_packet_l3(b); + const struct ip_header *ip_hdr = l3_off; csum = ~csum_finish(packet_csum_pseudoheader(ip_hdr)); } else if (dp_packet_hwol_tx_ipv6(b)) { - const struct ovs_16aligned_ip6_hdr *ip6_hdr = dp_packet_l3(b); + const struct ovs_16aligned_ip6_hdr *ip6_hdr = l4_off; csum = ~csum_finish(packet_csum_pseudoheader6(ip6_hdr)); } From a2d4ad651d8b9b4deda2c19bd7a87bf40f79ddf3 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Thu, 15 Feb 2024 17:53:03 -0500 Subject: [PATCH 588/833] netdev-linux: Only repair IP checksum in IPv4. Previously a change was added to the vnet prepend code to solve for the case where no L4 checksum offloading was needed but the L3 checksum hadn't been calculated. But the added check didn't properly account for IPv6 traffic. 
Fixes: 85bcbbed839a ("userspace: Enable tunnel tests with TSO.") Reported-by: David Marchand Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/dp-packet.h | 18 +++++++++++++++++- lib/netdev-linux.c | 9 +++++---- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/lib/dp-packet.h b/lib/dp-packet.h index 802d3f3857c..770ddc1b952 100644 --- a/lib/dp-packet.h +++ b/lib/dp-packet.h @@ -1184,7 +1184,7 @@ dp_packet_hwol_is_tunnel_vxlan(struct dp_packet *b) /* Returns 'true' if packet 'b' is marked for outer IPv4 checksum offload. */ static inline bool -dp_packet_hwol_is_outer_ipv4_cksum(struct dp_packet *b) +dp_packet_hwol_is_outer_ipv4_cksum(const struct dp_packet *b) { return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_OUTER_IP_CKSUM); } @@ -1384,6 +1384,22 @@ dp_packet_ip_checksum_bad(const struct dp_packet *p) DP_PACKET_OL_RX_IP_CKSUM_BAD; } +/* Return 'true' is packet 'b' is not encapsulated and is marked for IPv4 + * checksum offload, or if 'b' is encapsulated and the outer layer is marked + * for IPv4 checksum offload. IPv6 packets and non offloaded packets return + * 'false'. */ +static inline bool +dp_packet_hwol_l3_csum_ipv4_ol(const struct dp_packet *b) +{ + if (dp_packet_hwol_is_outer_ipv4(b)) { + return dp_packet_hwol_is_outer_ipv4_cksum(b); + } else if (!dp_packet_hwol_is_outer_ipv6(b)) { + return dp_packet_hwol_tx_ip_csum(b) && + !dp_packet_ip_checksum_good(b); + } + return false; +} + /* Calculate and set the IPv4 header checksum in packet 'p'. */ static inline void dp_packet_ip_set_header_csum(struct dp_packet *p, bool inner) diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 8964cd67014..bf91ef462ef 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -7199,10 +7199,11 @@ netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu) /* The packet has good L4 checksum. No need to validate again. 
*/ vnet->csum_start = vnet->csum_offset = (OVS_FORCE __virtio16) 0; vnet->flags = VIRTIO_NET_HDR_F_DATA_VALID; - if (!dp_packet_ip_checksum_good(b)) { - /* It is possible that L4 is good but the IP checksum isn't - * complete. For example in the case of UDP encapsulation of an ARP - * packet where the UDP checksum is 0. */ + + /* It is possible that L4 is good but the IPv4 checksum isn't + * complete. For example in the case of UDP encapsulation of an ARP + * packet where the UDP checksum is 0. */ + if (dp_packet_hwol_l3_csum_ipv4_ol(b)) { dp_packet_ip_set_header_csum(b, false); } } else if (dp_packet_hwol_tx_l4_checksum(b)) { From f81d782c1906ac00852190105161efc810697c76 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Thu, 15 Feb 2024 17:53:04 -0500 Subject: [PATCH 589/833] netdev-native-tnl: Mark all vxlan/geneve packets as tunneled. Previously some packets were excluded from the tunnel mark if they weren't L4. However, this causes problems with multi encapsulated packets like arp. Due to these flags being set, additional checks are required in checksum modification code. Fixes: 084c8087292c ("userspace: Support VXLAN and GENEVE TSO.") Reported-by: David Marchand Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/dp-packet.h | 19 +++++++++++++++++-- lib/netdev-native-tnl.c | 10 ++++++++-- lib/packets.c | 8 ++++---- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/lib/dp-packet.h b/lib/dp-packet.h index 770ddc1b952..2fa17d81402 100644 --- a/lib/dp-packet.h +++ b/lib/dp-packet.h @@ -1386,8 +1386,8 @@ dp_packet_ip_checksum_bad(const struct dp_packet *p) /* Return 'true' is packet 'b' is not encapsulated and is marked for IPv4 * checksum offload, or if 'b' is encapsulated and the outer layer is marked - * for IPv4 checksum offload. IPv6 packets and non offloaded packets return - * 'false'. */ + * for IPv4 checksum offload. IPv6 packets, non offloaded packets, and IPv4 + * packets that are marked as good return 'false'. 
*/ static inline bool dp_packet_hwol_l3_csum_ipv4_ol(const struct dp_packet *b) { @@ -1400,6 +1400,21 @@ dp_packet_hwol_l3_csum_ipv4_ol(const struct dp_packet *b) return false; } +/* Return 'true' is packet 'b' is not encapsulated and is marked for IPv4 + * checksum offload, or if 'b' is encapsulated and the outer layer is marked + * for IPv4 checksum offload. IPv6 packets and non offloaded packets return + * 'false'. */ +static inline bool +dp_packet_hwol_l3_ipv4(const struct dp_packet *b) +{ + if (dp_packet_hwol_is_outer_ipv4(b)) { + return true; + } else if (!dp_packet_hwol_is_outer_ipv6(b)) { + return dp_packet_hwol_tx_ip_csum(b); + } + return false; +} + /* Calculate and set the IPv4 header checksum in packet 'p'. */ static inline void dp_packet_ip_set_header_csum(struct dp_packet *p, bool inner) diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c index 0d6d803fe45..dee9ab344e4 100644 --- a/lib/netdev-native-tnl.c +++ b/lib/netdev-native-tnl.c @@ -91,8 +91,7 @@ netdev_tnl_ip_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl, /* A packet coming from a network device might have the * csum already checked. In this case, skip the check. */ - if (OVS_UNLIKELY(!dp_packet_ip_checksum_good(packet)) - && !dp_packet_hwol_tx_ip_csum(packet)) { + if (OVS_UNLIKELY(!dp_packet_hwol_l3_csum_ipv4_ol(packet))) { if (csum(ip, IP_IHL(ip->ip_ihl_ver) * 4)) { VLOG_WARN_RL(&err_rl, "ip packet has invalid checksum"); return NULL; @@ -299,6 +298,13 @@ dp_packet_tnl_ol_process(struct dp_packet *packet, (char *) dp_packet_eth(packet) + VXLAN_HLEN); } + } else { + /* Mark non-l4 packets as tunneled. 
*/ + if (data->tnl_type == OVS_VPORT_TYPE_GENEVE) { + dp_packet_hwol_set_tunnel_geneve(packet); + } else if (data->tnl_type == OVS_VPORT_TYPE_VXLAN) { + dp_packet_hwol_set_tunnel_vxlan(packet); + } } } diff --git a/lib/packets.c b/lib/packets.c index 36c6692e5c6..5803d26f4ac 100644 --- a/lib/packets.c +++ b/lib/packets.c @@ -1149,7 +1149,7 @@ packet_set_ipv4_addr(struct dp_packet *packet, } } - if (dp_packet_hwol_tx_ip_csum(packet)) { + if (dp_packet_hwol_l3_ipv4(packet)) { dp_packet_ol_reset_ip_csum_good(packet); } else { nh->ip_csum = recalc_csum32(nh->ip_csum, old_addr, new_addr); @@ -1328,7 +1328,7 @@ packet_set_ipv4(struct dp_packet *packet, ovs_be32 src, ovs_be32 dst, if (nh->ip_tos != tos) { uint8_t *field = &nh->ip_tos; - if (dp_packet_hwol_tx_ip_csum(packet)) { + if (dp_packet_hwol_l3_ipv4(packet)) { dp_packet_ol_reset_ip_csum_good(packet); } else { nh->ip_csum = recalc_csum16(nh->ip_csum, htons((uint16_t) *field), @@ -1341,7 +1341,7 @@ packet_set_ipv4(struct dp_packet *packet, ovs_be32 src, ovs_be32 dst, if (nh->ip_ttl != ttl) { uint8_t *field = &nh->ip_ttl; - if (dp_packet_hwol_tx_ip_csum(packet)) { + if (dp_packet_hwol_l3_ipv4(packet)) { dp_packet_ol_reset_ip_csum_good(packet); } else { nh->ip_csum = recalc_csum16(nh->ip_csum, htons(*field << 8), @@ -1979,7 +1979,7 @@ IP_ECN_set_ce(struct dp_packet *pkt, bool is_ipv6) tos |= IP_ECN_CE; if (nh->ip_tos != tos) { - if (dp_packet_hwol_tx_ip_csum(pkt)) { + if (dp_packet_hwol_l3_ipv4(pkt)) { dp_packet_ol_reset_ip_csum_good(pkt); } else { nh->ip_csum = recalc_csum16(nh->ip_csum, htons(nh->ip_tos), From 619d4e6791e8f39d3744fbb44efff8ac692dedf7 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Thu, 15 Feb 2024 17:53:05 -0500 Subject: [PATCH 590/833] system-traffic.at: Add tests with UDP tunneling of UDP traffic. Previously a gap existed in the tunnel system tests where only ICMP and TCP traffic was tested. However, the code paths used for UDP traffic are different than either of those and should also be tested.
Some of the modified tests had previously checked for TCP with ncat but didn't include an appropriate check for ncat support. That check was added to these tests. Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- tests/system-traffic.at | 92 ++++++++++++++++++++++++++++++++++------- 1 file changed, 76 insertions(+), 16 deletions(-) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index e68fe7e1859..98e494abf4f 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -292,6 +292,7 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over vxlan tunnel]) +AT_SKIP_IF([test $HAVE_NC = no]) OVS_CHECK_VXLAN() OVS_TRAFFIC_VSWITCHD_START() @@ -329,14 +330,23 @@ NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PI 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) +dnl Start ncat listeners. +OVS_DAEMONIZE([nc -l 10.1.1.100 1234 > tcp_data], [nc.pid]) +NETNS_DAEMONIZE([at_ns0], [nc -l -u 10.1.1.1 4321 > udp_data], [nc2.pid]) + +dnl Verify that ncat is ready. +OVS_WAIT_UNTIL([netstat -ln | grep :1234]) +OVS_WAIT_UNTIL([NS_EXEC([at_ns0], [netstat -ln | grep :4321])]) + dnl Check large bidirectional TCP. AT_CHECK([dd if=/dev/urandom of=payload.bin bs=60000 count=1 2> /dev/null]) -OVS_DAEMONIZE([nc -l 10.1.1.100 1234 > data], [nc.pid]) NS_CHECK_EXEC([at_ns0], [nc $NC_EOF_OPT 10.1.1.100 1234 < payload.bin]) +OVS_WAIT_UNTIL([diff -q payload.bin tcp_data]) -dnl Wait until transfer completes before checking. -OVS_WAIT_WHILE([kill -0 $(cat nc.pid)]) -AT_CHECK([diff -q payload.bin data], [0]) +dnl Check UDP. 
+AT_CHECK([dd if=/dev/urandom of=payload.bin bs=600 count=1 2> /dev/null]) +AT_CHECK([nc $NC_EOF_OPT -u 10.1.1.1 4321 < payload.bin]) +OVS_WAIT_UNTIL([diff -q payload.bin udp_data]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP @@ -389,6 +399,7 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over vxlan6 tunnel]) +AT_SKIP_IF([test $HAVE_NC = no]) OVS_CHECK_VXLAN_UDP6ZEROCSUM() OVS_TRAFFIC_VSWITCHD_START() @@ -428,14 +439,24 @@ NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PI 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) +dnl Start ncat listeners. +OVS_DAEMONIZE([nc -l 10.1.1.100 1234 > tcp_data], [nc.pid]) +NETNS_DAEMONIZE([at_ns0], [nc -l -u 10.1.1.1 4321 > udp_data], [nc2.pid]) + +dnl Verify that ncat is ready. +OVS_WAIT_UNTIL([netstat -ln | grep :1234]) +OVS_WAIT_UNTIL([NS_EXEC([at_ns0], [netstat -ln | grep :4321])]) + dnl Check large bidirectional TCP. AT_CHECK([dd if=/dev/urandom of=payload.bin bs=60000 count=1 2> /dev/null]) -OVS_DAEMONIZE([nc -l 10.1.1.100 1234 > data], [nc.pid]) NS_CHECK_EXEC([at_ns0], [nc $NC_EOF_OPT 10.1.1.100 1234 < payload.bin]) +OVS_WAIT_UNTIL([diff -q payload.bin tcp_data]) + +dnl Check UDP. +AT_CHECK([dd if=/dev/urandom of=payload.bin bs=600 count=1 2> /dev/null]) +AT_CHECK([nc $NC_EOF_OPT -u 10.1.1.1 4321 < payload.bin]) +OVS_WAIT_UNTIL([diff -q payload.bin udp_data]) -dnl Wait until transfer completes before checking. -OVS_WAIT_WHILE([kill -0 $(cat nc.pid)]) -AT_CHECK([diff -q payload.bin data], [0]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP @@ -477,6 +498,24 @@ NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PI 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) +dnl Start ncat listeners. +OVS_DAEMONIZE([nc -l 10.1.1.100 1234 > tcp_data], [nc.pid]) +NETNS_DAEMONIZE([at_ns0], [nc -l -u 10.1.1.1 4321 > udp_data], [nc2.pid]) + +dnl Verify that ncat is ready. 
+OVS_WAIT_UNTIL([netstat -ln | grep :1234]) +OVS_WAIT_UNTIL([NS_EXEC([at_ns0], [netstat -ln | grep :4321])]) + +dnl Check large bidirectional TCP. +AT_CHECK([dd if=/dev/urandom of=payload.bin bs=60000 count=1 2> /dev/null]) +NS_CHECK_EXEC([at_ns0], [nc $NC_EOF_OPT 10.1.1.100 1234 < payload.bin]) +OVS_WAIT_UNTIL([diff -q payload.bin tcp_data]) + +dnl Check UDP. +AT_CHECK([dd if=/dev/urandom of=payload.bin bs=600 count=1 2> /dev/null]) +AT_CHECK([nc $NC_EOF_OPT -u 10.1.1.1 4321 < payload.bin]) +OVS_WAIT_UNTIL([diff -q payload.bin udp_data]) + OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP @@ -672,6 +711,7 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over geneve tunnel]) +AT_SKIP_IF([test $HAVE_NC = no]) OVS_CHECK_GENEVE() OVS_TRAFFIC_VSWITCHD_START() @@ -709,14 +749,23 @@ NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PI 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) +dnl Start ncat listeners. +OVS_DAEMONIZE([nc -l 10.1.1.100 1234 > tcp_data], [nc.pid]) +NETNS_DAEMONIZE([at_ns0], [nc -l -u 10.1.1.1 4321 > udp_data], [nc2.pid]) + +dnl Verify that ncat is ready. +OVS_WAIT_UNTIL([netstat -ln | grep :1234]) +OVS_WAIT_UNTIL([NS_EXEC([at_ns0], [netstat -ln | grep :4321])]) + dnl Check large bidirectional TCP. AT_CHECK([dd if=/dev/urandom of=payload.bin bs=60000 count=1 2> /dev/null]) -OVS_DAEMONIZE([nc -l 10.1.1.100 1234 > data], [nc.pid]) NS_CHECK_EXEC([at_ns0], [nc $NC_EOF_OPT 10.1.1.100 1234 < payload.bin]) +OVS_WAIT_UNTIL([diff -q payload.bin tcp_data]) -dnl Wait until transfer completes before checking. -OVS_WAIT_WHILE([kill -0 $(cat nc.pid)]) -AT_CHECK([diff -q payload.bin data], [0]) +dnl Check UDP. 
+AT_CHECK([dd if=/dev/urandom of=payload.bin bs=600 count=1 2> /dev/null]) +AT_CHECK([nc $NC_EOF_OPT -u 10.1.1.1 4321 < payload.bin]) +OVS_WAIT_UNTIL([diff -q payload.bin udp_data]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP @@ -827,6 +876,7 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over geneve6 tunnel]) +AT_SKIP_IF([test $HAVE_NC = no]) OVS_CHECK_GENEVE_UDP6ZEROCSUM() OVS_TRAFFIC_VSWITCHD_START() @@ -866,14 +916,24 @@ NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PI 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) +dnl Start ncat listeners. +OVS_DAEMONIZE([nc -l 10.1.1.100 1234 > tcp_data], [nc.pid]) +NETNS_DAEMONIZE([at_ns0], [nc -l -u 10.1.1.1 4321 > udp_data], [nc2.pid]) + +dnl Verify that ncat is ready. +OVS_WAIT_UNTIL([netstat -ln | grep :1234]) +OVS_WAIT_UNTIL([NS_EXEC([at_ns0], [netstat -ln | grep :4321])]) + dnl Check large bidirectional TCP. AT_CHECK([dd if=/dev/urandom of=payload.bin bs=60000 count=1 2> /dev/null]) -OVS_DAEMONIZE([nc -l 10.1.1.100 1234 > data], [nc.pid]) NS_CHECK_EXEC([at_ns0], [nc $NC_EOF_OPT 10.1.1.100 1234 < payload.bin]) +OVS_WAIT_UNTIL([diff -q payload.bin tcp_data]) + +dnl Check UDP. +AT_CHECK([dd if=/dev/urandom of=payload.bin bs=600 count=1 2> /dev/null]) +AT_CHECK([nc $NC_EOF_OPT -u 10.1.1.1 4321 < payload.bin]) +OVS_WAIT_UNTIL([diff -q payload.bin udp_data]) -dnl Wait until transfer completes before checking. -OVS_WAIT_WHILE([kill -0 $(cat nc.pid)]) -AT_CHECK([diff -q payload.bin data], [0]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP From 9aeda62cdd4e2eca8ad0f860c9eec20b476f4e4f Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 16 Feb 2024 12:36:43 +0100 Subject: [PATCH 591/833] Set release date for 3.3.0. Beside the date update, moving the mdb NEWS entry to a more appropriate place - ovs-appctl section. 
Acked-by: Simon Horman Signed-off-by: Ilya Maximets --- NEWS | 6 +++--- debian/changelog | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/NEWS b/NEWS index a6617546c62..0789dc0c6c2 100644 --- a/NEWS +++ b/NEWS @@ -2,7 +2,7 @@ Post-v3.3.0 -------------------- -v3.3.0 - xx xxx xxxx +v3.3.0 - 16 Feb 2024 -------------------- - OVSDB: * Support pre-vote mechanism in RAFT that protects the cluster against @@ -33,6 +33,8 @@ v3.3.0 - xx xxx xxxx "ovs-appctl dpctl/ct-del-limits default". * 'dpctl/flush-conntrack' is now capable of flushing connections based on mark and labels. + * 'mdb/show': support for multicast snooping to show the protocol + responsible for adding/updating the entry. - ovs-vsctl: * New commands 'set-zone-limit', 'del-zone-limit' and 'list-zone-limits' to manage the maximum number of connections in conntrack zones via @@ -51,8 +53,6 @@ v3.3.0 - xx xxx xxxx during connection commit will be used by default. - DPDK: * Add support for DPDK 23.11. - - Support for multicast snooping to show the protocol responsible for - adding/updating the entry. v3.2.0 - 17 Aug 2023 diff --git a/debian/changelog b/debian/changelog index 44321745503..614c46ef919 100644 --- a/debian/changelog +++ b/debian/changelog @@ -8,7 +8,7 @@ openvswitch (3.3.0-1) unstable; urgency=low * New upstream version - -- Open vSwitch team Wed, 17 Jan 2024 13:00:00 +0100 + -- Open vSwitch team Fri, 16 Feb 2024 12:25:58 +0100 openvswitch (3.2.0-1) unstable; urgency=low From 5f2af0b7a30e7de84de97556223f892ef63ec14b Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Mon, 29 Jan 2024 13:51:42 +0100 Subject: [PATCH 592/833] utilities: Add TASK_STOPPED accounting to the kernel_delay.py script. This change adds statistics for when a thread is put into the stopped state.
For example with the following: kill -STOP $(pidof ovs-vswitchd); sleep 1; kill -CONT $(pidof ovs-vswitchd); Acked-by: Simon Horman Signed-off-by: Eelco Chaudron --- utilities/usdt-scripts/kernel_delay.py | 110 ++++++++++++++++++------ utilities/usdt-scripts/kernel_delay.rst | 24 ++++++ 2 files changed, 110 insertions(+), 24 deletions(-) diff --git a/utilities/usdt-scripts/kernel_delay.py b/utilities/usdt-scripts/kernel_delay.py index b2012fdf20c..de6b0c9de4d 100755 --- a/utilities/usdt-scripts/kernel_delay.py +++ b/utilities/usdt-scripts/kernel_delay.py @@ -81,7 +81,6 @@ u32 syscall; u64 entry_ts; - }; BPF_RINGBUF_OUTPUT(events, ); @@ -220,7 +219,7 @@ u64 delta = bpf_ktime_get_ns() - *start_ns; val->count++; val->total_ns += delta; - if (val->worst_ns == 0 || delta > val->worst_ns) + if (delta > val->worst_ns) val->worst_ns = delta; if () { @@ -243,13 +242,12 @@ /* - * For measuring the thread run time, we need the following. + * For measuring the thread stopped time, we need the following. */ -struct run_time_data_t { +struct stop_time_data_t { u64 count; u64 total_ns; - u64 max_ns; - u64 min_ns; + u64 worst_ns; }; struct pid_tid_key_t { @@ -257,6 +255,43 @@ u32 tid; }; +BPF_HASH(stop_start, u64, u64); +BPF_HASH(stop_data, struct pid_tid_key_t, struct stop_time_data_t); + +static inline void thread_handle_stopped_run(u32 pid, u32 tgid, u64 ktime) +{ + u64 pid_tgid = (u64) tgid << 32 | pid; + u64 *start_ns = stop_start.lookup(&pid_tgid); + + if (!start_ns || *start_ns == 0) + return; + + struct stop_time_data_t *val, zero = {}; + struct pid_tid_key_t key = { .pid = tgid, + .tid = pid }; + + val = stop_data.lookup_or_try_init(&key, &zero); + if (val) { + u64 delta = ktime - *start_ns; + val->count++; + val->total_ns += delta; + if (delta > val->worst_ns) + val->worst_ns = delta; + } + *start_ns = 0; +} + + +/* + * For measuring the thread run time, we need the following. 
+ */ +struct run_time_data_t { + u64 count; + u64 total_ns; + u64 max_ns; + u64 min_ns; +}; + BPF_HASH(run_start, u64, u64); BPF_HASH(run_data, struct pid_tid_key_t, struct run_time_data_t); @@ -282,7 +317,7 @@ u64 delta = ktime - *start_ns; val->count++; val->total_ns += delta; - if (val->max_ns == 0 || delta > val->max_ns) + if (delta > val->max_ns) val->max_ns = delta; if (val->min_ns == 0 || delta < val->min_ns) val->min_ns = delta; @@ -312,6 +347,8 @@ u64 t = bpf_ktime_get_ns(); ready_start.update(&pid_tgid, &t); + + thread_handle_stopped_run(pid, tgid, t); return 0; } @@ -336,22 +373,26 @@ if (!capture_enabled__()) return 0; - if (prev-> == TASK_RUNNING && prev->tgid == MONITOR_PID) - sched_wakeup__(prev->pid, prev->tgid); - if (prev->tgid == MONITOR_PID) { + u64 prev_pid_tgid = (u64)next->tgid << 32 | next->pid; ktime = bpf_ktime_get_ns(); + + if (prev-> == TASK_RUNNING) + ready_start.update(&prev_pid_tgid, &ktime); + + if (prev-> & __TASK_STOPPED) + stop_start.update(&prev_pid_tgid, &ktime); + thread_stop_run(prev->pid, prev->tgid, ktime); } - u64 pid_tgid = (u64)next->tgid << 32 | next->pid; - if (next->tgid != MONITOR_PID) return 0; if (ktime == 0) ktime = bpf_ktime_get_ns(); + u64 pid_tgid = (u64)next->tgid << 32 | next->pid; u64 *start_ns = ready_start.lookup(&pid_tgid); if (start_ns && *start_ns != 0) { @@ -365,7 +406,7 @@ u64 delta = ktime - *start_ns; val->count++; val->total_ns += delta; - if (val->worst_ns == 0 || delta > val->worst_ns) + if (delta > val->worst_ns) val->worst_ns = delta; } *start_ns = 0; @@ -438,7 +479,7 @@ u64 delta = bpf_ktime_get_ns() - data->start_ns; val->count++; val->total_ns += delta; - if (val->worst_ns == 0 || delta > val->worst_ns) + if (delta > val->worst_ns) val->worst_ns = delta; } } @@ -508,7 +549,7 @@ u64 delta = bpf_ktime_get_ns() - data->start_ns; val->count++; val->total_ns += delta; - if (val->worst_ns == 0 || delta > val->worst_ns) + if (delta > val->worst_ns) val->worst_ns = delta; } @@ -844,6 +885,8 @@ def 
reset_capture(): bpf["softirq_start"].clear() bpf["softirq_data"].clear() bpf["stack_traces"].clear() + bpf["stop_start"].clear() + bpf["stop_data"].clear() # @@ -879,6 +922,9 @@ def process_results(syscall_events=None, trigger_delta=None): threads_ready = {k.tid for k, _ in bpf["ready_data"].items() if k.pid != 0xffffffff} + threads_stopped = {k.tid for k, _ in bpf["stop_data"].items() + if k.pid != 0xffffffff} + threads_hardirq = {k.tid for k, _ in bpf["hardirq_data"].items() if k.pid != 0xffffffff} @@ -886,7 +932,7 @@ def process_results(syscall_events=None, trigger_delta=None): if k.pid != 0xffffffff} threads = sorted(threads_syscall | threads_run | threads_ready | - threads_hardirq | threads_softirq, + threads_stopped | threads_hardirq | threads_softirq, key=lambda x: get_thread_name(options.pid, x)) # @@ -933,28 +979,44 @@ def process_results(syscall_events=None, trigger_delta=None): # # THREAD RUN STATISTICS # - print("\n{:10} {:16} {}\n{}{:10} {:>16} {:>16} {:>16}".format( - "", "", "[THREAD RUN STATISTICS]", indent, - "SCHED_CNT", "TOTAL ns", "MIN ns", "MAX ns")) - for k, v in filter(lambda t: t[0].tid == thread, bpf["run_data"].items()): + print("\n{:10} {:16} {}\n{}{:10} {:>16} {:>16} {:>16}".format( + "", "", "[THREAD RUN STATISTICS]", indent, + "SCHED_CNT", "TOTAL ns", "MIN ns", "MAX ns")) + print("{}{:10} {:16,} {:16,} {:16,}".format( indent, v.count, v.total_ns, v.min_ns, v.max_ns)) + break # # THREAD READY STATISTICS # - print("\n{:10} {:16} {}\n{}{:10} {:>16} {:>16}".format( - "", "", "[THREAD READY STATISTICS]", indent, - "SCHED_CNT", "TOTAL ns", "MAX ns")) - for k, v in filter(lambda t: t[0].tid == thread, bpf["ready_data"].items()): + print("\n{:10} {:16} {}\n{}{:10} {:>16} {:>16}".format( + "", "", "[THREAD READY STATISTICS]", indent, + "SCHED_CNT", "TOTAL ns", "MAX ns")) + print("{}{:10} {:16,} {:16,}".format( indent, v.count, v.total_ns, v.worst_ns)) + break + + # + # THREAD STOPPED STATISTICS + # + for k, v in filter(lambda t: t[0].tid == 
thread, + bpf["stop_data"].items()): + + print("\n{:10} {:16} {}\n{}{:10} {:>16} {:>16}".format( + "", "", "[THREAD STOPPED STATISTICS]", indent, + "STOP_CNT", "TOTAL ns", "MAX ns")) + + print("{}{:10} {:16,} {:16,}".format( + indent, v.count, v.total_ns, v.worst_ns)) + break # # HARD IRQ STATISTICS diff --git a/utilities/usdt-scripts/kernel_delay.rst b/utilities/usdt-scripts/kernel_delay.rst index 0ebd30afb67..e2e43752d20 100644 --- a/utilities/usdt-scripts/kernel_delay.rst +++ b/utilities/usdt-scripts/kernel_delay.rst @@ -75,6 +75,10 @@ with the ``--pid`` option. SCHED_CNT TOTAL ns MAX ns 7 11,334 6,636 + [THREAD STOPPED STATISTICS] + STOP_CNT TOTAL ns MAX ns + 3 3,045,728,323 1,015,739,474 + [HARD IRQ STATISTICS] NAME COUNT TOTAL ns MAX ns eno8303-rx-1 1 3,586 3,586 @@ -102,6 +106,7 @@ followed by resource-specific data. Which are: - ``SYSCALL STATISTICS`` - ``THREAD RUN STATISTICS`` - ``THREAD READY STATISTICS`` +- ``THREAD STOPPED STATISTICS`` - ``HARD IRQ STATISTICS`` - ``SOFT IRQ STATISTICS`` @@ -143,6 +148,25 @@ Note that these statistics only count events where the thread was getting ready to run and started running during the measurement interval. +``THREAD STOPPED STATISTICS`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +``THREAD STOPPED STATISTICS`` reveal the number of instances where the thread +has been scheduled out while in the running state due to its transition to +the TASK_STOPPED state. + +This behavior can be replicated by manually placing the thread in the stopped +state and subsequently resuming it. For instance: + +.. code-block:: console + + # kill -STOP $(pidof ovs-vswitchd); \ + sleep 1; \ + kill -CONT $(pidof ovs-vswitchd); + +Note that these statistics only count events where the thread was running at +the time it was put to stopped state. 
+ + ``HARD IRQ STATISTICS`` ~~~~~~~~~~~~~~~~~~~~~~~ ``HARD IRQ STATISTICS`` tell you how much time was spent servicing hard From 99413ec2610fb8b4192fa4e7f2db08486ec1e69f Mon Sep 17 00:00:00 2001 From: Paolo Valerio Date: Fri, 16 Feb 2024 18:19:13 +0100 Subject: [PATCH 593/833] conntrack: Handle random selection for port ranges. The userspace conntrack only supported hash for port selection. With the patch, both userspace and kernel datapath support the random flag. The default behavior remains the same, that is, if no flags are specified, hash is selected. Signed-off-by: Paolo Valerio Acked-by: Aaron Conole Signed-off-by: Simon Horman --- Documentation/ref/ovs-actions.7.rst | 3 +-- NEWS | 3 +++ lib/conntrack.c | 15 ++++++++------- lib/conntrack.h | 5 +++++ lib/dpif-netdev.c | 4 +++- 5 files changed, 20 insertions(+), 10 deletions(-) diff --git a/Documentation/ref/ovs-actions.7.rst b/Documentation/ref/ovs-actions.7.rst index 36adcc5db2d..80acd9070b7 100644 --- a/Documentation/ref/ovs-actions.7.rst +++ b/Documentation/ref/ovs-actions.7.rst @@ -1551,8 +1551,7 @@ following arguments: should be selected. When a port range is specified, fallback to ephemeral ports does not happen, else, it will. The port number selection can be informed by the optional ``random`` and ``hash`` flags - described below. The userspace datapath only supports the ``hash`` - behavior. + described below. The optional *flags* are: diff --git a/NEWS b/NEWS index 0789dc0c6c2..5a5caffbfc8 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,8 @@ Post-v3.3.0 -------------------- + - Userspace datapath: + * Conntrack now supports 'random' flag for selecting ports in a range + while natting. v3.3.0 - 16 Feb 2024 diff --git a/lib/conntrack.c b/lib/conntrack.c index 013709bd622..e09ecdf336c 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -2222,7 +2222,7 @@ nat_range_hash(const struct conn_key *key, uint32_t basis, /* Ports are stored in host byte order for convenience. 
*/ static void set_sport_range(const struct nat_action_info_t *ni, const struct conn_key *k, - uint32_t hash, uint16_t *curr, uint16_t *min, + uint32_t off, uint16_t *curr, uint16_t *min, uint16_t *max) { if (((ni->nat_action & NAT_ACTION_SNAT_ALL) == NAT_ACTION_SRC) || @@ -2241,19 +2241,19 @@ set_sport_range(const struct nat_action_info_t *ni, const struct conn_key *k, } else { *min = ni->min_port; *max = ni->max_port; - *curr = *min + (hash % ((*max - *min) + 1)); + *curr = *min + (off % ((*max - *min) + 1)); } } static void set_dport_range(const struct nat_action_info_t *ni, const struct conn_key *k, - uint32_t hash, uint16_t *curr, uint16_t *min, + uint32_t off, uint16_t *curr, uint16_t *min, uint16_t *max) { if (ni->nat_action & NAT_ACTION_DST_PORT) { *min = ni->min_port; *max = ni->max_port; - *curr = *min + (hash % ((*max - *min) + 1)); + *curr = *min + (off % ((*max - *min) + 1)); } else { *curr = ntohs(k->dst.port); *min = *max = *curr; @@ -2388,18 +2388,19 @@ nat_get_unique_tuple(struct conntrack *ct, struct conn *conn, fwd_key->nw_proto == IPPROTO_SCTP; uint16_t min_dport, max_dport, curr_dport; uint16_t min_sport, max_sport, curr_sport; - uint32_t hash; + uint32_t hash, port_off; hash = nat_range_hash(fwd_key, ct->hash_basis, nat_info); + port_off = nat_info->nat_flags & NAT_RANGE_RANDOM ? 
random_uint32() : hash; min_addr = nat_info->min_addr; max_addr = nat_info->max_addr; find_addr(fwd_key, &min_addr, &max_addr, &addr, hash, (fwd_key->dl_type == htons(ETH_TYPE_IP)), nat_info); - set_sport_range(nat_info, fwd_key, hash, &curr_sport, + set_sport_range(nat_info, fwd_key, port_off, &curr_sport, &min_sport, &max_sport); - set_dport_range(nat_info, fwd_key, hash, &curr_dport, + set_dport_range(nat_info, fwd_key, port_off, &curr_dport, &min_dport, &max_dport); if (pat_proto) { diff --git a/lib/conntrack.h b/lib/conntrack.h index 0a888be4559..9b0c6aa88f2 100644 --- a/lib/conntrack.h +++ b/lib/conntrack.h @@ -77,12 +77,17 @@ enum nat_action_e { NAT_ACTION_DST_PORT = 1 << 3, }; +enum nat_flags_e { + NAT_RANGE_RANDOM = 1 << 0, +}; + struct nat_action_info_t { union ct_addr min_addr; union ct_addr max_addr; uint16_t min_port; uint16_t max_port; uint16_t nat_action; + uint16_t nat_flags; }; struct conntrack *conntrack_init(void); diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 46e24d204d4..0f08fa92b1d 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -9409,9 +9409,11 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_, nl_attr_get_u16(b_nest); proto_num_max_specified = true; break; + case OVS_NAT_ATTR_PROTO_RANDOM: + nat_action_info.nat_flags |= NAT_RANGE_RANDOM; + break; case OVS_NAT_ATTR_PERSISTENT: case OVS_NAT_ATTR_PROTO_HASH: - case OVS_NAT_ATTR_PROTO_RANDOM: break; case OVS_NAT_ATTR_UNSPEC: case __OVS_NAT_ATTR_MAX: From afdc1171a8f1c0984351172691945bdc47ae3910 Mon Sep 17 00:00:00 2001 From: Paolo Valerio Date: Fri, 16 Feb 2024 18:19:14 +0100 Subject: [PATCH 594/833] conntrack: Handle persistent selection for IP addresses. The patch, when 'persistent' flag is specified, makes the IP selection in a range persistent across reboots. 
Signed-off-by: Paolo Valerio Acked-by: Aaron Conole Signed-off-by: Simon Horman --- NEWS | 3 ++- lib/conntrack.c | 25 +++++++++++++++++++------ lib/conntrack.h | 1 + lib/dpif-netdev.c | 2 ++ 4 files changed, 24 insertions(+), 7 deletions(-) diff --git a/NEWS b/NEWS index 5a5caffbfc8..c9e4064e67a 100644 --- a/NEWS +++ b/NEWS @@ -2,7 +2,8 @@ Post-v3.3.0 -------------------- - Userspace datapath: * Conntrack now supports 'random' flag for selecting ports in a range - while natting. + while natting and 'persistent' flag for selection of the IP address + from a range. v3.3.0 - 16 Feb 2024 diff --git a/lib/conntrack.c b/lib/conntrack.c index e09ecdf336c..8a7056bac3b 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -2202,17 +2202,21 @@ nat_range_hash(const struct conn_key *key, uint32_t basis, { uint32_t hash = basis; + if (!basis) { + hash = ct_addr_hash_add(hash, &key->src.addr); + } else { + hash = ct_endpoint_hash_add(hash, &key->src); + hash = ct_endpoint_hash_add(hash, &key->dst); + } + hash = ct_addr_hash_add(hash, &nat_info->min_addr); hash = ct_addr_hash_add(hash, &nat_info->max_addr); hash = hash_add(hash, ((uint32_t) nat_info->max_port << 16) | nat_info->min_port); - hash = ct_endpoint_hash_add(hash, &key->src); - hash = ct_endpoint_hash_add(hash, &key->dst); hash = hash_add(hash, (OVS_FORCE uint32_t) key->dl_type); hash = hash_add(hash, key->nw_proto); hash = hash_add(hash, key->zone); - /* The purpose of the second parameter is to distinguish hashes of data of * different length; our data always has the same length so there is no * value in counting. */ @@ -2388,10 +2392,19 @@ nat_get_unique_tuple(struct conntrack *ct, struct conn *conn, fwd_key->nw_proto == IPPROTO_SCTP; uint16_t min_dport, max_dport, curr_dport; uint16_t min_sport, max_sport, curr_sport; - uint32_t hash, port_off; + uint32_t hash, port_off, basis; + + basis = (nat_info->nat_flags & NAT_PERSISTENT) ? 
0 : ct->hash_basis; + hash = nat_range_hash(fwd_key, basis, nat_info); + + if (nat_info->nat_flags & NAT_RANGE_RANDOM) { + port_off = random_uint32(); + } else if (basis) { + port_off = hash; + } else { + port_off = nat_range_hash(fwd_key, ct->hash_basis, nat_info); + } - hash = nat_range_hash(fwd_key, ct->hash_basis, nat_info); - port_off = nat_info->nat_flags & NAT_RANGE_RANDOM ? random_uint32() : hash; min_addr = nat_info->min_addr; max_addr = nat_info->max_addr; diff --git a/lib/conntrack.h b/lib/conntrack.h index 9b0c6aa88f2..ee7da099e37 100644 --- a/lib/conntrack.h +++ b/lib/conntrack.h @@ -79,6 +79,7 @@ enum nat_action_e { enum nat_flags_e { NAT_RANGE_RANDOM = 1 << 0, + NAT_PERSISTENT = 1 << 1, }; struct nat_action_info_t { diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 0f08fa92b1d..6e4374859a1 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -9413,6 +9413,8 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_, nat_action_info.nat_flags |= NAT_RANGE_RANDOM; break; case OVS_NAT_ATTR_PERSISTENT: + nat_action_info.nat_flags |= NAT_PERSISTENT; + break; case OVS_NAT_ATTR_PROTO_HASH: break; case OVS_NAT_ATTR_UNSPEC: From 46159983d949094e041f6b32cad2c8d6e35084d6 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 15 Feb 2024 13:00:05 +0100 Subject: [PATCH 595/833] ovs-thread: Log pthread failures. Currently, failures of pthread_* functions are printed to stderr only and then OVS aborts. These error messages are hard to find and may be even just lost. Use VLOG_ABORT() instead. It will do the same thing, but will try to log the error to the log file and syslog first, if configured. Using VLOG_ABORT() instead of VLOG_FATAL() to preserve the abort() logic and not just exit with a failure code, because it's likely we want a core dump if one of these function failed. For example, we would like to have a stack trace in a core dump in case a mutex lock failed with 'deadlock avoided'. 
Acked-by: Simon Horman Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- lib/ovs-thread.c | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/lib/ovs-thread.c b/lib/ovs-thread.c index ac5d2c3d029..f8000806156 100644 --- a/lib/ovs-thread.c +++ b/lib/ovs-thread.c @@ -63,13 +63,14 @@ static bool multithreaded; \ /* Verify that 'l' was initialized. */ \ if (OVS_UNLIKELY(!l->where)) { \ - ovs_abort(0, "%s: %s() passed uninitialized ovs_"#TYPE, \ - where, __func__); \ + VLOG_ABORT("%s: %s() passed uninitialized ovs_"#TYPE, \ + where, __func__); \ } \ \ error = pthread_##TYPE##_##FUN(&l->lock); \ if (OVS_UNLIKELY(error)) { \ - ovs_abort(error, "%s: pthread_%s_%s failed", where, #TYPE, #FUN); \ + VLOG_ABORT("%s: pthread_%s_%s failed: %s", where, #TYPE, #FUN, \ + ovs_strerror(error)); \ } \ l->where = where; \ } @@ -91,13 +92,14 @@ LOCK_FUNCTION(spin, lock); \ /* Verify that 'l' was initialized. */ \ if (OVS_UNLIKELY(!l->where)) { \ - ovs_abort(0, "%s: %s() passed uninitialized ovs_"#TYPE, \ - where, __func__); \ + VLOG_ABORT("%s: %s() passed uninitialized ovs_"#TYPE, \ + where, __func__); \ } \ \ error = pthread_##TYPE##_##FUN(&l->lock); \ if (OVS_UNLIKELY(error) && error != EBUSY) { \ - ovs_abort(error, "%s: pthread_%s_%s failed", where, #TYPE, #FUN); \ + VLOG_ABORT("%s: pthread_%s_%s failed: %s", where, #TYPE, #FUN, \ + ovs_strerror(error)); \ } \ if (!error) { \ l->where = where; \ @@ -125,7 +127,8 @@ TRY_LOCK_FUNCTION(spin, trylock); l->where = WHERE; \ error = pthread_##TYPE##_##FUN(&l->lock); \ if (OVS_UNLIKELY(error)) { \ - ovs_abort(error, "pthread_%s_%s failed", #TYPE, #FUN); \ + VLOG_ABORT("%s: pthread_%s_%s failed: %s", l->where, #TYPE, #FUN, \ + ovs_strerror(error)); \ } \ } UNLOCK_FUNCTION(mutex, unlock, ""); @@ -143,7 +146,8 @@ UNLOCK_FUNCTION(spin, destroy, NULL); { \ int error = FUNCTION(arg1); \ if (OVS_UNLIKELY(error)) { \ - ovs_abort(error, "%s failed", #FUNCTION); \ + VLOG_ABORT("%s 
failed: %s", #FUNCTION, \ + ovs_strerror(error)); \ } \ } #define XPTHREAD_FUNC2(FUNCTION, PARAM1, PARAM2) \ @@ -152,7 +156,8 @@ UNLOCK_FUNCTION(spin, destroy, NULL); { \ int error = FUNCTION(arg1, arg2); \ if (OVS_UNLIKELY(error)) { \ - ovs_abort(error, "%s failed", #FUNCTION); \ + VLOG_ABORT("%s failed: %s", #FUNCTION, \ + ovs_strerror(error)); \ } \ } #define XPTHREAD_FUNC3(FUNCTION, PARAM1, PARAM2, PARAM3)\ @@ -161,7 +166,8 @@ UNLOCK_FUNCTION(spin, destroy, NULL); { \ int error = FUNCTION(arg1, arg2, arg3); \ if (OVS_UNLIKELY(error)) { \ - ovs_abort(error, "%s failed", #FUNCTION); \ + VLOG_ABORT("%s failed: %s", #FUNCTION, \ + ovs_strerror(error)); \ } \ } @@ -204,7 +210,7 @@ ovs_mutex_init__(const struct ovs_mutex *l_, int type) xpthread_mutexattr_settype(&attr, type); error = pthread_mutex_init(&l->lock, &attr); if (OVS_UNLIKELY(error)) { - ovs_abort(error, "pthread_mutex_init failed"); + VLOG_ABORT("pthread_mutex_init failed: %s", ovs_strerror(error)); } xpthread_mutexattr_destroy(&attr); } @@ -257,7 +263,7 @@ ovs_rwlock_init(const struct ovs_rwlock *l_) #endif if (OVS_UNLIKELY(error)) { - ovs_abort(error, "pthread_rwlock_init failed"); + VLOG_ABORT("pthread_rwlock_init failed: %s", ovs_strerror(error)); } } @@ -275,7 +281,7 @@ ovs_mutex_cond_wait(pthread_cond_t *cond, const struct ovs_mutex *mutex_) error = pthread_cond_wait(cond, &mutex->lock); if (OVS_UNLIKELY(error)) { - ovs_abort(error, "pthread_cond_wait failed"); + VLOG_ABORT("pthread_cond_wait failed: %s", ovs_strerror(error)); } } @@ -289,7 +295,7 @@ ovs_spin_init__(const struct ovs_spin *l_, int pshared) l->where = ""; error = pthread_spin_init(&l->lock, pshared); if (OVS_UNLIKELY(error)) { - ovs_abort(error, "pthread_spin_init failed"); + VLOG_ABORT("pthread_spin_init failed: %s", ovs_strerror(error)); } } @@ -431,13 +437,15 @@ set_min_stack_size(pthread_attr_t *attr, size_t min_stacksize) error = pthread_attr_getstacksize(attr, &stacksize); if (error) { - ovs_abort(error, 
"pthread_attr_getstacksize failed"); + VLOG_ABORT("pthread_attr_getstacksize failed: %s", + ovs_strerror(error)); } if (stacksize < min_stacksize) { error = pthread_attr_setstacksize(attr, min_stacksize); if (error) { - ovs_abort(error, "pthread_attr_setstacksize failed"); + VLOG_ABORT("pthread_attr_setstacksize failed: %s", + ovs_strerror(error)); } } } @@ -486,7 +494,7 @@ ovs_thread_create(const char *name, void *(*start)(void *), void *arg) error = pthread_create(&thread, &attr, ovsthread_wrapper, aux); if (error) { - ovs_abort(error, "pthread_create failed"); + VLOG_ABORT("pthread_create failed: %s", ovs_strerror(error)); } pthread_attr_destroy(&attr); return thread; From f9e42c6683e592208fc71917061aec439b2b239d Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 20 Feb 2024 23:35:41 +0100 Subject: [PATCH 596/833] tests: Move the non-local port as tunnel endpoint test. It's not a system test as it runs with dummy datapath and ports and it has nothing to do with layer 3 tunnels. It should be with other userspace tunnel tests. While moving also making it a little nicer visually and less error prone by requesting port numbers for all the ports. 
Acked-by: Mike Pattrick Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- tests/system-layer3-tunnels.at | 55 ----------------------- tests/tunnel-push-pop.at | 79 ++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 55 deletions(-) diff --git a/tests/system-layer3-tunnels.at b/tests/system-layer3-tunnels.at index 6fbdedb64f6..5dcdd2afae0 100644 --- a/tests/system-layer3-tunnels.at +++ b/tests/system-layer3-tunnels.at @@ -98,61 +98,6 @@ NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP -AT_SETUP([layer3 - use non-local port as tunnel endpoint]) - -OVS_VSWITCHD_START([add-port br0 p0 -- set Interface p0 type=dummy ofport_request=1]) -AT_CHECK([ovs-vsctl add-port br0 vtep0 -- set int vtep0 type=dummy], [0]) -AT_CHECK([ovs-vsctl add-br int-br -- set bridge int-br datapath_type=dummy], [0]) -AT_CHECK([ovs-vsctl add-port int-br t1 -- set Interface t1 type=gre \ - options:remote_ip=1.1.2.92 ofport_request=3], [0]) - -AT_CHECK([ovs-appctl dpif/show], [0], [dnl -dummy@ovs-dummy: hit:0 missed:0 - br0: - br0 65534/100: (dummy-internal) - p0 1/1: (dummy) - vtep0 2/2: (dummy) - int-br: - int-br 65534/3: (dummy-internal) - t1 3/4: (gre: remote_ip=1.1.2.92) -]) - -AT_CHECK([ovs-appctl netdev-dummy/ip4addr vtep0 1.1.2.88/24], [0], [OK -]) -AT_CHECK([ovs-appctl ovs/route/add 1.1.2.92/24 vtep0], [0], [OK -]) -AT_CHECK([ovs-ofctl add-flow br0 action=normal]) -AT_CHECK([ovs-ofctl add-flow int-br action=normal]) - -dnl Use arp request and reply to achieve tunnel next hop mac binding -dnl By default, vtep0's MAC address is aa:55:aa:55:00:03 -AT_CHECK([ovs-appctl netdev-dummy/receive vtep0 'recirc_id(0),in_port(2),eth(dst=ff:ff:ff:ff:ff:ff,src=aa:55:aa:55:00:03),eth_type(0x0806),arp(tip=1.1.2.92,sip=1.1.2.88,op=1,sha=aa:55:aa:55:00:03,tha=00:00:00:00:00:00)']) -AT_CHECK([ovs-appctl netdev-dummy/receive p0 
'recirc_id(0),in_port(1),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:03),eth_type(0x0806),arp(sip=1.1.2.92,tip=1.1.2.88,op=2,sha=f8:bc:12:44:34:b6,tha=aa:55:aa:55:00:03)']) - -AT_CHECK([ovs-appctl tnl/neigh/show | tail -n+3 | sort], [0], [dnl -1.1.2.92 f8:bc:12:44:34:b6 br0 -]) - -AT_CHECK([ovs-appctl ovs/route/show | tail -n+2 | sort], [0], [dnl -User: 1.1.2.0/24 dev vtep0 SRC 1.1.2.88 -]) - -dnl Check GRE tunnel pop -AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:03),eth_type(0x0800),ipv4(src=1.1.2.92,dst=1.1.2.88,proto=47,tos=0,ttl=64,frag=no)'], [0], [stdout]) - -AT_CHECK([tail -1 stdout], [0], - [Datapath actions: tnl_pop(4) -]) - -dnl Check GRE tunnel push -AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(3),eth(dst=f9:bc:12:44:34:b6,src=af:55:aa:55:00:03),eth_type(0x0800),ipv4(src=1.1.3.88,dst=1.1.3.92,proto=1,tos=0,ttl=64,frag=no)'], [0], [stdout]) -AT_CHECK([tail -1 stdout], [0], - [Datapath actions: tnl_push(tnl_port(4),header(size=38,type=3,eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:03,dl_type=0x0800),ipv4(src=1.1.2.88,dst=1.1.2.92,proto=47,tos=0,ttl=64,frag=0x4000),gre((flags=0x0,proto=0x6558))),out_port(2)),1 -]) - -OVS_VSWITCHD_STOP -AT_CLEANUP - AT_SETUP([layer3 - ping over MPLS Bareudp]) OVS_CHECK_BAREUDP() OVS_TRAFFIC_VSWITCHD_START([_ADD_BR([br1])]) diff --git a/tests/tunnel-push-pop.at b/tests/tunnel-push-pop.at index b1440f59045..e51984fde6a 100644 --- a/tests/tunnel-push-pop.at +++ b/tests/tunnel-push-pop.at @@ -993,3 +993,82 @@ udp(src=0,dst=4789,csum=0x0),vxlan(flags=0x8000000,vni=0x0)),out_port(100)),8),7 OVS_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([tunnel_push_pop - use non-local port as tunnel endpoint]) + +OVS_VSWITCHD_START([add-port br0 p0 \ + -- set Interface p0 type=dummy ofport_request=1]) + +dnl Adding another port separately to ensure that it gets an +dnl aa:55:aa:55:00:03 MAC address (dummy port number 3). 
+AT_CHECK([ovs-vsctl add-port br0 vtep0 \ + -- set interface vtep0 type=dummy ofport_request=2]) +AT_CHECK([ovs-vsctl \ + -- add-br int-br \ + -- set bridge int-br datapath_type=dummy \ + -- set Interface int-br ofport_request=3]) +AT_CHECK([ovs-vsctl \ + -- add-port int-br t1 \ + -- set Interface t1 type=gre ofport_request=4 \ + options:remote_ip=1.1.2.92 +]) + +AT_CHECK([ovs-appctl dpif/show], [0], [dnl +dummy@ovs-dummy: hit:0 missed:0 + br0: + br0 65534/100: (dummy-internal) + p0 1/1: (dummy) + vtep0 2/2: (dummy) + int-br: + int-br 65534/3: (dummy-internal) + t1 4/4: (gre: remote_ip=1.1.2.92) +]) + +AT_CHECK([ovs-appctl netdev-dummy/ip4addr vtep0 1.1.2.88/24], [0], [OK +]) +AT_CHECK([ovs-appctl ovs/route/add 1.1.2.92/24 vtep0], [0], [OK +]) +AT_CHECK([ovs-ofctl add-flow br0 action=normal]) +AT_CHECK([ovs-ofctl add-flow int-br action=normal]) + +dnl Use arp request and reply to achieve tunnel next hop mac binding. +dnl By default, vtep0's MAC address is aa:55:aa:55:00:03. +AT_CHECK([ovs-appctl netdev-dummy/receive vtep0 'recirc_id(0),in_port(2),dnl + eth(dst=ff:ff:ff:ff:ff:ff,src=aa:55:aa:55:00:03),eth_type(0x0806),dnl + arp(tip=1.1.2.92,sip=1.1.2.88,op=1,sha=aa:55:aa:55:00:03,tha=00:00:00:00:00:00)']) +AT_CHECK([ovs-appctl netdev-dummy/receive p0 'recirc_id(0),in_port(1),dnl + eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:03),eth_type(0x0806),dnl + arp(sip=1.1.2.92,tip=1.1.2.88,op=2,sha=f8:bc:12:44:34:b6,tha=aa:55:aa:55:00:03)']) + +AT_CHECK([ovs-appctl tnl/neigh/show | tail -n+3 | sort], [0], [dnl +1.1.2.92 f8:bc:12:44:34:b6 br0 +]) + +AT_CHECK([ovs-appctl ovs/route/show | tail -n+2 | sort], [0], [dnl +User: 1.1.2.0/24 dev vtep0 SRC 1.1.2.88 +]) + +dnl Check GRE tunnel pop. 
+AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1),dnl + eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:03),eth_type(0x0800),dnl + ipv4(src=1.1.2.92,dst=1.1.2.88,proto=47,tos=0,ttl=64,frag=no)'], +[0], [stdout]) + +AT_CHECK([tail -1 stdout], [0], + [Datapath actions: tnl_pop(4) +]) + +dnl Check GRE tunnel push. +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(3),dnl + eth(dst=f9:bc:12:44:34:b6,src=af:55:aa:55:00:03),eth_type(0x0800),dnl + ipv4(src=1.1.3.88,dst=1.1.3.92,proto=1,tos=0,ttl=64,frag=no)'], +[0], [stdout]) +AT_CHECK([tail -1 stdout], [0], + [Datapath actions: tnl_push(tnl_port(4),header(size=38,type=3,dnl +eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:03,dl_type=0x0800),dnl +ipv4(src=1.1.2.88,dst=1.1.2.92,proto=47,tos=0,ttl=64,frag=0x4000),dnl +gre((flags=0x0,proto=0x6558))),out_port(2)),1 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP From 7992a26ef47f4f5a1fe527cff918d3f10c9fc7fd Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Tue, 20 Feb 2024 23:35:42 +0100 Subject: [PATCH 597/833] netdev-dummy: Add local route entries for IP addresses. To mimic what kernel routing subsystem does [1], add a local route entry for every dummy IP address. This helps with OVN testing multiple chassis on a single host and allows to run better unit tests for userspace tunnels without adding route entries manually. This is also the only way to add 'local' route entries that are required for testing 'local_ip' functionality with native tunnels in userspace datapath because route lookup will reject non-local source IPs. There seems to be no way to explicitly remove an IP address from netdev-dummy, hence no code path to handle route entry cleanup. The port itself can be removed, but our tests do not normally do that. Removal can be implemented later if necessary. [1]: http://linux-ip.net/html/routing-tables.html#routing-table-local "If the machine has several IP addresses on one Ethernet interface, there will be a route to each locally hosted IP in the local routing table. 
This is a normal side effect of bringing up an IP address on an interface under linux." Acked-by: Eelco Chaudron Signed-off-by: Ihar Hrachyshka Co-authored-by: Ilya Maximets Signed-off-by: Ilya Maximets --- lib/netdev-dummy.c | 17 +++++++- lib/ovs-router.c | 14 +++++++ lib/ovs-router.h | 5 +++ tests/nsh.at | 14 ++----- tests/ofproto-dpif.at | 15 ++++--- tests/packet-type-aware.at | 21 ++++------ tests/tunnel-push-pop-ipv6.at | 32 +++++++++------ tests/tunnel-push-pop.at | 75 ++++++++++++++++++++--------------- tests/tunnel.at | 18 ++++----- 9 files changed, 126 insertions(+), 85 deletions(-) diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c index cd7e85a8188..e8bbf8d514d 100644 --- a/lib/netdev-dummy.c +++ b/lib/netdev-dummy.c @@ -39,6 +39,7 @@ #include "pcap-file.h" #include "openvswitch/poll-loop.h" #include "openvswitch/shash.h" +#include "ovs-router.h" #include "sset.h" #include "stream.h" #include "unaligned.h" @@ -2084,11 +2085,20 @@ netdev_dummy_ip4addr(struct unixctl_conn *conn, int argc OVS_UNUSED, if (netdev && is_dummy_class(netdev->netdev_class)) { struct in_addr ip, mask; + struct in6_addr ip6; + uint32_t plen; char *error; - error = ip_parse_masked(argv[2], &ip.s_addr, &mask.s_addr); + error = ip_parse_cidr(argv[2], &ip.s_addr, &plen); if (!error) { + mask.s_addr = be32_prefix_mask(plen); netdev_dummy_add_in4(netdev, ip, mask); + + /* Insert local route entry for the new address. */ + in6_addr_set_mapped_ipv4(&ip6, ip.s_addr); + ovs_router_force_insert(0, &ip6, plen + 96, true, argv[1], + &in6addr_any, &ip6); + unixctl_command_reply(conn, "OK"); } else { unixctl_command_reply_error(conn, error); @@ -2118,6 +2128,11 @@ netdev_dummy_ip6addr(struct unixctl_conn *conn, int argc OVS_UNUSED, mask = ipv6_create_mask(plen); netdev_dummy_add_in6(netdev, &ip6, &mask); + + /* Insert local route entry for the new address. 
*/ + ovs_router_force_insert(0, &ip6, plen, true, argv[1], + &in6addr_any, &ip6); + unixctl_command_reply(conn, "OK"); } else { unixctl_command_reply_error(conn, error); diff --git a/lib/ovs-router.c b/lib/ovs-router.c index ca014d80ed3..3d84c9a30a8 100644 --- a/lib/ovs-router.c +++ b/lib/ovs-router.c @@ -330,6 +330,20 @@ ovs_router_insert(uint32_t mark, const struct in6_addr *ip_dst, uint8_t plen, } } +/* The same as 'ovs_router_insert', but it adds the route even if updates + * from the system routing table are disabled. Used for unit tests. */ +void +ovs_router_force_insert(uint32_t mark, const struct in6_addr *ip_dst, + uint8_t plen, bool local, const char output_bridge[], + const struct in6_addr *gw, + const struct in6_addr *prefsrc) +{ + uint8_t priority = local ? plen + 64 : plen; + + ovs_router_insert__(mark, priority, local, ip_dst, plen, + output_bridge, gw, prefsrc); +} + static void rt_entry_delete__(const struct cls_rule *cr) { diff --git a/lib/ovs-router.h b/lib/ovs-router.h index eb4ff85d9e6..d7dc7e55f37 100644 --- a/lib/ovs-router.h +++ b/lib/ovs-router.h @@ -34,6 +34,11 @@ void ovs_router_insert(uint32_t mark, const struct in6_addr *ip_dst, uint8_t plen, bool local, const char output_bridge[], const struct in6_addr *gw, const struct in6_addr *prefsrc); +void ovs_router_force_insert(uint32_t mark, const struct in6_addr *ip_dst, + uint8_t plen, bool local, + const char output_bridge[], + const struct in6_addr *gw, + const struct in6_addr *prefsrc); void ovs_router_flush(void); void ovs_router_disable_system_routing_table(void); diff --git a/tests/nsh.at b/tests/nsh.at index 55296e5593a..0040a50b36c 100644 --- a/tests/nsh.at +++ b/tests/nsh.at @@ -521,51 +521,45 @@ AT_CHECK([ set interface vxlangpe32 type=vxlan options:exts=gpe options:remote_ip=30.0.0.2 options:packet_type=ptap ofport_request=3020 ovs-appctl netdev-dummy/ip4addr br-p1 10.0.0.1/24 - ovs-appctl ovs/route/add 10.0.0.0/24 br-p1 ovs-appctl tnl/arp/set br-p1 10.0.0.1 $HWADDR_BRP1 
ovs-appctl tnl/arp/set br-p1 10.0.0.2 $HWADDR_BRP2 ovs-appctl tnl/arp/set br-p1 10.0.0.3 $HWADDR_BRP3 ovs-appctl netdev-dummy/ip4addr br-p2 20.0.0.2/24 - ovs-appctl ovs/route/add 20.0.0.0/24 br-p2 ovs-appctl tnl/arp/set br-p2 20.0.0.1 $HWADDR_BRP1 ovs-appctl tnl/arp/set br-p2 20.0.0.2 $HWADDR_BRP2 ovs-appctl tnl/arp/set br-p2 20.0.0.3 $HWADDR_BRP3 ovs-appctl netdev-dummy/ip4addr br-p3 30.0.0.3/24 - ovs-appctl ovs/route/add 30.0.0.0/24 br-p3 ovs-appctl tnl/arp/set br-p3 30.0.0.1 $HWADDR_BRP1 ovs-appctl tnl/arp/set br-p3 30.0.0.2 $HWADDR_BRP2 ovs-appctl tnl/arp/set br-p3 30.0.0.3 $HWADDR_BRP3 ], [0], [stdout]) AT_CHECK([ - ovs-appctl ovs/route/add 10.0.0.0/24 br-p1 ovs-appctl tnl/arp/set br-p1 10.0.0.1 $HWADDR_BRP1 ovs-appctl tnl/arp/set br-p1 10.0.0.2 $HWADDR_BRP2 ovs-appctl tnl/arp/set br-p1 10.0.0.3 $HWADDR_BRP3 ], [0], [stdout]) AT_CHECK([ - ovs-appctl ovs/route/add 20.0.0.0/24 br-p2 ovs-appctl tnl/arp/set br-p2 20.0.0.1 $HWADDR_BRP1 ovs-appctl tnl/arp/set br-p2 20.0.0.2 $HWADDR_BRP2 ovs-appctl tnl/arp/set br-p2 20.0.0.3 $HWADDR_BRP3 ], [0], [stdout]) AT_CHECK([ - ovs-appctl ovs/route/add 30.0.0.0/24 br-p3 ovs-appctl tnl/arp/set br-p3 30.0.0.1 $HWADDR_BRP1 ovs-appctl tnl/arp/set br-p3 30.0.0.2 $HWADDR_BRP2 ovs-appctl tnl/arp/set br-p3 30.0.0.3 $HWADDR_BRP3 ], [0], [stdout]) AT_CHECK([ - ovs-appctl ovs/route/show | grep User: + ovs-appctl ovs/route/show | grep Cached: | sort ], [0], [dnl -User: 10.0.0.0/24 dev br-p1 SRC 10.0.0.1 -User: 20.0.0.0/24 dev br-p2 SRC 20.0.0.2 -User: 30.0.0.0/24 dev br-p3 SRC 30.0.0.3 +Cached: 10.0.0.0/24 dev br-p1 SRC 10.0.0.1 local +Cached: 20.0.0.0/24 dev br-p2 SRC 20.0.0.2 local +Cached: 30.0.0.0/24 dev br-p3 SRC 30.0.0.3 local ]) AT_CHECK([ diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at index e305e7b9cd0..daeea7775c2 100644 --- a/tests/ofproto-dpif.at +++ b/tests/ofproto-dpif.at @@ -7653,12 +7653,14 @@ dummy@ovs-dummy: hit:0 missed:0 vm1 5/3: (dummy: ifindex=2011) ]) -dnl set up route to 1.1.2.92 via br0 and 
action=normal +dnl Add 1.1.2.92 to br0 and action=normal AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 1.1.2.88/24], [0], [OK ]) -AT_CHECK([ovs-appctl ovs/route/add 1.1.2.92/24 br0], [0], [OK -]) AT_CHECK([ovs-ofctl add-flow br0 action=normal]) +dnl Checking that a local route for added IP was successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached], [0], [dnl +Cached: 1.1.2.0/24 dev br0 SRC 1.1.2.88 local +]) dnl Prime ARP Cache for 1.1.2.92 AT_CHECK([ovs-appctl netdev-dummy/receive p0 'recirc_id(0),in_port(1),eth(src=f8:bc:12:44:34:b6,dst=ff:ff:ff:ff:ff:ff),eth_type(0x0806),arp(sip=1.1.2.92,tip=1.1.2.88,op=2,sha=f8:bc:12:44:34:b6,tha=00:00:00:00:00:00)']) @@ -7669,10 +7671,13 @@ ovs-vsctl \ --id=@sf create sflow targets=\"127.0.0.1:$SFLOW_PORT\" agent=127.0.0.1 \ header=128 sampling=1 polling=0 -dnl set up route to 192.168.1.2 via br0 +dnl Add 192.168.1.2 to br0, AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 192.168.1.1/16], [0], [OK ]) -AT_CHECK([ovs-appctl ovs/route/add 192.168.0.0/16 br0], [0], [OK +dnl Checking that a local route for added IP was successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached | sort], [0], [dnl +Cached: 1.1.2.0/24 dev br0 SRC 1.1.2.88 local +Cached: 192.168.0.0/16 dev br0 SRC 192.168.1.1 local ]) dnl add rule for int-br to force packet onto tunnel. 
There is no ifindex diff --git a/tests/packet-type-aware.at b/tests/packet-type-aware.at index 14cebf6efa5..d634930fd52 100644 --- a/tests/packet-type-aware.at +++ b/tests/packet-type-aware.at @@ -142,30 +142,27 @@ AT_CHECK([ ### Setup GRE tunnels AT_CHECK([ ovs-appctl netdev-dummy/ip4addr br-p1 10.0.0.1/24 && - ovs-appctl ovs/route/add 10.0.0.0/24 br-p1 && ovs-appctl tnl/arp/set br-p1 10.0.0.1 $HWADDR_BRP1 && ovs-appctl tnl/arp/set br-p1 10.0.0.2 $HWADDR_BRP2 && ovs-appctl tnl/arp/set br-p1 10.0.0.3 $HWADDR_BRP3 && ovs-appctl netdev-dummy/ip4addr br-p2 20.0.0.2/24 && - ovs-appctl ovs/route/add 20.0.0.0/24 br-p2 && ovs-appctl tnl/arp/set br-p2 20.0.0.1 $HWADDR_BRP1 && ovs-appctl tnl/arp/set br-p2 20.0.0.2 $HWADDR_BRP2 && ovs-appctl tnl/arp/set br-p2 20.0.0.3 $HWADDR_BRP3 && ovs-appctl netdev-dummy/ip4addr br-p3 30.0.0.3/24 && - ovs-appctl ovs/route/add 30.0.0.0/24 br-p3 && ovs-appctl tnl/arp/set br-p3 30.0.0.1 $HWADDR_BRP1 && ovs-appctl tnl/arp/set br-p3 30.0.0.2 $HWADDR_BRP2 && ovs-appctl tnl/arp/set br-p3 30.0.0.3 $HWADDR_BRP3 ], [0], [ignore]) AT_CHECK([ - ovs-appctl ovs/route/show | grep User: + ovs-appctl ovs/route/show | grep Cached: | sort ], [0], [dnl -User: 10.0.0.0/24 dev br-p1 SRC 10.0.0.1 -User: 20.0.0.0/24 dev br-p2 SRC 20.0.0.2 -User: 30.0.0.0/24 dev br-p3 SRC 30.0.0.3 +Cached: 10.0.0.0/24 dev br-p1 SRC 10.0.0.1 local +Cached: 20.0.0.0/24 dev br-p2 SRC 20.0.0.2 local +Cached: 30.0.0.0/24 dev br-p3 SRC 30.0.0.3 local ]) AT_CHECK([ @@ -681,14 +678,13 @@ AT_CHECK([ AT_CHECK([ ovs-appctl netdev-dummy/ip4addr br2 10.0.0.1/24 && - ovs-appctl ovs/route/add 10.0.0.0/24 br2 && ovs-appctl tnl/arp/set br2 10.0.0.2 de:af:be:ef:ba:be ], [0], [ignore]) AT_CHECK([ - ovs-appctl ovs/route/show | grep User: + ovs-appctl ovs/route/show | grep Cached: ], [0], [dnl -User: 10.0.0.0/24 dev br2 SRC 10.0.0.1 +Cached: 10.0.0.0/24 dev br2 SRC 10.0.0.1 local ]) @@ -955,7 +951,6 @@ AT_CHECK([ AT_CHECK([ ovs-appctl netdev-dummy/ip4addr br0 20.0.0.1/24 && - ovs-appctl ovs/route/add 
20.0.0.2/24 br0 && ovs-appctl tnl/neigh/set br0 20.0.0.1 aa:bb:cc:00:00:01 && ovs-appctl tnl/neigh/set br0 20.0.0.2 aa:bb:cc:00:00:02 ], [0], [ignore]) @@ -963,9 +958,9 @@ AT_CHECK([ ovs-appctl time/warp 1000 AT_CHECK([ - ovs-appctl ovs/route/show | grep User + ovs-appctl ovs/route/show | grep Cached: ],[0], [dnl -User: 20.0.0.0/24 dev br0 SRC 20.0.0.1 +Cached: 20.0.0.0/24 dev br0 SRC 20.0.0.1 local ]) AT_CHECK([ diff --git a/tests/tunnel-push-pop-ipv6.at b/tests/tunnel-push-pop-ipv6.at index a8dd28c5b59..6d9ac684126 100644 --- a/tests/tunnel-push-pop-ipv6.at +++ b/tests/tunnel-push-pop-ipv6.at @@ -19,11 +19,12 @@ AT_CHECK([ovs-vsctl add-port int-br3 t3 -- set Interface t3 type=srv6 \ options:srv6_flowlabel=compute \ ], [0]) -dnl First setup dummy interface IP address, then add the route -dnl so that tnl-port table can get valid IP address for the device. +dnl Setup dummy interface IP address. AT_CHECK([ovs-appctl netdev-dummy/ip6addr br0 2001:cafe::88/24], [0], [OK ]) -AT_CHECK([ovs-appctl ovs/route/add 2001:cafe::0/24 br0], [0], [OK +dnl Checking that a local routes for added IPs were successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached], [0], [dnl +Cached: 2001:ca00::/24 dev br0 SRC 2001:cafe::88 local ]) AT_CHECK([ovs-appctl tnl/neigh/set br0 2001:cafe::91 aa:55:aa:55:00:01], [0], [OK ]) @@ -105,13 +106,15 @@ dummy@ovs-dummy: hit:0 missed:0 t2 2/6: (ip6gre: remote_ip=2001:cafe::92) ]) -dnl First setup dummy interface IP address, then add the route -dnl so that tnl-port table can get valid IP address for the device. +dnl Setup dummy interface IP addresses. AT_CHECK([ovs-appctl netdev-dummy/ip6addr br0 2001:cafe::88/24], [0], [OK ]) AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 1.1.2.88/24], [0], [OK ]) -AT_CHECK([ovs-appctl ovs/route/add 2001:cafe::92/24 br0], [0], [OK +dnl Checking that a local routes for added IPs were successfully installed. 
+AT_CHECK([ovs-appctl ovs/route/show | grep Cached | sort], [0], [dnl +Cached: 1.1.2.0/24 dev br0 SRC 1.1.2.88 local +Cached: 2001:ca00::/24 dev br0 SRC 2001:cafe::88 local ]) AT_CHECK([ovs-ofctl add-flow br0 action=normal]) @@ -179,13 +182,15 @@ dummy@ovs-dummy: hit:0 missed:0 t3 3/6: (ip6erspan: erspan_dir=1, erspan_hwid=0x7, erspan_ver=2, key=567, remote_ip=2001:cafe::93) ]) -dnl First setup dummy interface IP address, then add the route -dnl so that tnl-port table can get valid IP address for the device. +dnl Setup dummy interface IP addresses. AT_CHECK([ovs-appctl netdev-dummy/ip6addr br0 2001:cafe::88/24], [0], [OK ]) AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 1.1.2.88/24], [0], [OK ]) -AT_CHECK([ovs-appctl ovs/route/add 2001:cafe::92/24 br0], [0], [OK +dnl Checking that a local routes for added IPs were successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached | sort], [0], [dnl +Cached: 1.1.2.0/24 dev br0 SRC 1.1.2.88 local +Cached: 2001:ca00::/24 dev br0 SRC 2001:cafe::88 local ]) AT_CHECK([ovs-ofctl add-flow br0 action=normal]) @@ -316,14 +321,15 @@ srv6_sys (6) ref_cnt=1 vxlan_sys_4789 (4789) ref_cnt=2 ]) - -dnl First setup dummy interface IP address, then add the route -dnl so that tnl-port table can get valid IP address for the device. +dnl Setup dummy interface IP addresses. AT_CHECK([ovs-appctl netdev-dummy/ip6addr br0 2001:cafe::88/24], [0], [OK ]) AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 1.1.2.88/24], [0], [OK ]) -AT_CHECK([ovs-appctl ovs/route/add 2001:cafe::92/24 br0], [0], [OK +dnl Checking that a local routes for added IPs were successfully installed. 
+AT_CHECK([ovs-appctl ovs/route/show | grep Cached | sort], [0], [dnl +Cached: 1.1.2.0/24 dev br0 SRC 1.1.2.88 local +Cached: 2001:ca00::/24 dev br0 SRC 2001:cafe::88 local ]) AT_CHECK([ovs-ofctl add-flow br0 action=normal]) diff --git a/tests/tunnel-push-pop.at b/tests/tunnel-push-pop.at index e51984fde6a..04d17b71f7d 100644 --- a/tests/tunnel-push-pop.at +++ b/tests/tunnel-push-pop.at @@ -30,17 +30,15 @@ dummy@ovs-dummy: hit:0 missed:0 t4 5/3: (erspan: erspan_dir=flow, erspan_hwid=flow, erspan_idx=flow, erspan_ver=flow, key=56, remote_ip=flow) ]) -dnl First setup dummy interface IP address, then add the route -dnl so that tnl-port table can get valid IP address for the device. +dnl Setup dummy interface IP addresses. AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 1.1.2.88/24], [0], [OK ]) AT_CHECK([ovs-appctl netdev-dummy/ip6addr br0 2001:cafe::88/24], [0], [OK ]) - -AT_CHECK([ovs-appctl ovs/route/add 1.1.2.92/24 br0], [0], [OK -]) - -AT_CHECK([ovs-appctl ovs/route/add 1.1.2.92/24 br0 pkt_mark=1234], [0], [OK +dnl Checking that a local routes for added IPs were successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached | sort], [0], [dnl +Cached: 1.1.2.0/24 dev br0 SRC 1.1.2.88 local +Cached: 2001:ca00::/24 dev br0 SRC 2001:cafe::88 local ]) AT_CHECK([ovs-ofctl add-flow br0 action=normal]) @@ -237,18 +235,21 @@ dummy@ovs-dummy: hit:0 missed:0 t8 9/2152: (gtpu: key=123, remote_ip=1.1.2.92) ]) -dnl First setup dummy interface IP address, then add the route -dnl so that tnl-port table can get valid IP address for the device. +dnl Setup dummy interface IP addresses. AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 1.1.2.88/24], [0], [OK ]) AT_CHECK([ovs-appctl netdev-dummy/ip6addr br0 2001:cafe::88/24], [0], [OK ]) - -AT_CHECK([ovs-appctl ovs/route/add 1.1.2.92/24 br0], [0], [OK -]) - +dnl Add a static route with a mark. 
AT_CHECK([ovs-appctl ovs/route/add 1.1.2.92/24 br0 pkt_mark=1234], [0], [OK ]) +dnl Checking that local routes for added IPs and the static route with a mark +dnl were successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep br0 | sort], [0], [dnl +Cached: 1.1.2.0/24 dev br0 SRC 1.1.2.88 local +Cached: 2001:ca00::/24 dev br0 SRC 2001:cafe::88 local +User: 1.1.2.0/24 MARK 1234 dev br0 SRC 1.1.2.88 +]) AT_CHECK([ovs-ofctl add-flow br0 action=normal]) @@ -690,12 +691,12 @@ AT_CHECK([ovs-vsctl add-port int-br t2 -- set Interface t2 type=geneve \ options:remote_ip=1.1.2.92 options:key=123 ofport_request=2 \ ]) -dnl First setup dummy interface IP address, then add the route -dnl so that tnl-port table can get valid IP address for the device. +dnl Setup dummy interface IP address. AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 1.1.2.88/24], [0], [OK ]) - -AT_CHECK([ovs-appctl ovs/route/add 1.1.2.92/24 br0], [0], [OK +dnl Checking that a local route for added IP was successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached], [0], [dnl +Cached: 1.1.2.0/24 dev br0 SRC 1.1.2.88 local ]) AT_CHECK([ovs-ofctl add-flow br0 action=normal]) @@ -731,11 +732,12 @@ AT_CHECK([ovs-vsctl add-port int-br t2 dnl -- set Interface t2 type=geneve options:remote_ip=1.1.2.92 dnl options:key=123 ofport_request=2]) -dnl First setup dummy interface IP address, then add the route -dnl so that tnl-port table can get valid IP address for the device. +dnl Setup dummy interface IP address. AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 1.1.2.88/24], [0], [OK ]) -AT_CHECK([ovs-appctl ovs/route/add 1.1.2.92/24 br0], [0], [OK +dnl Checking that a local route for added IP was successfully installed. 
+AT_CHECK([ovs-appctl ovs/route/show | grep Cached], [0], [dnl +Cached: 1.1.2.0/24 dev br0 SRC 1.1.2.88 local ]) AT_CHECK([ovs-ofctl add-flow br0 action=normal]) @@ -796,8 +798,11 @@ dummy@ovs-dummy: hit:0 missed:0 AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 1.1.2.88/24], [0], [OK ]) -AT_CHECK([ovs-appctl ovs/route/add 1.1.2.92/24 br0], [0], [OK +dnl Checking that a local route for added IP was successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached], [0], [dnl +Cached: 1.1.2.0/24 dev br0 SRC 1.1.2.88 local ]) + AT_CHECK([ovs-ofctl add-flow br0 'arp,priority=1,action=normal']) dnl Use arp reply to achieve tunnel next hop mac binding @@ -840,11 +845,12 @@ AT_CHECK([ovs-vsctl add-port int-br t2 dnl -- set Interface t2 type=geneve options:remote_ip=1.1.2.92 dnl options:key=123 ofport_request=2]) -dnl First setup dummy interface IP address, then add the route -dnl so that tnl-port table can get valid IP address for the device. +dnl Setup dummy interface IP address. AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 1.1.2.88/24], [0], [OK ]) -AT_CHECK([ovs-appctl ovs/route/add 1.1.2.92/24 br0], [0], [OK +dnl Checking that a local route for added IP was successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached], [0], [dnl +Cached: 1.1.2.0/24 dev br0 SRC 1.1.2.88 local ]) AT_CHECK([ovs-ofctl add-flow br0 action=normal]) @@ -908,10 +914,12 @@ AT_CHECK([ovs-vsctl set port p8 tag=42 dnl -- set port br0 tag=42 dnl -- set port p7 tag=200]) -dnl Set IP address and route for br0. +dnl Set an IP address for br0. AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 10.0.0.2/24], [0], [OK ]) -AT_CHECK([ovs-appctl ovs/route/add 10.0.0.11/24 br0], [0], [OK +dnl Checking that a local route for added IP was successfully installed. 
+AT_CHECK([ovs-appctl ovs/route/show | grep Cached], [0], [dnl +Cached: 10.0.0.0/24 dev br0 SRC 10.0.0.2 local ]) dnl Send an ARP reply to port b8 on br0, so that packets will be forwarded @@ -953,10 +961,12 @@ AT_CHECK([ovs-vsctl add-port ovs-tun0 tun0 dnl -- add-port ovs-tun0 p7 dnl -- set interface p7 type=dummy ofport_request=7]) -dnl Set IP address and route for br0. +dnl Set an IP address for br0. AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 10.0.0.2/24], [0], [OK ]) -AT_CHECK([ovs-appctl ovs/route/add 10.0.0.11/24 br0], [0], [OK +dnl Checking that a local route for added IP was successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached], [0], [dnl +Cached: 10.0.0.0/24 dev br0 SRC 10.0.0.2 local ]) dnl Send an ARP reply to port b8 on br0, so that packets will be forwarded @@ -1026,8 +1036,11 @@ dummy@ovs-dummy: hit:0 missed:0 AT_CHECK([ovs-appctl netdev-dummy/ip4addr vtep0 1.1.2.88/24], [0], [OK ]) -AT_CHECK([ovs-appctl ovs/route/add 1.1.2.92/24 vtep0], [0], [OK +dnl Checking that a local route for added IP was successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached], [0], [dnl +Cached: 1.1.2.0/24 dev vtep0 SRC 1.1.2.88 local ]) + AT_CHECK([ovs-ofctl add-flow br0 action=normal]) AT_CHECK([ovs-ofctl add-flow int-br action=normal]) @@ -1044,10 +1057,6 @@ AT_CHECK([ovs-appctl tnl/neigh/show | tail -n+3 | sort], [0], [dnl 1.1.2.92 f8:bc:12:44:34:b6 br0 ]) -AT_CHECK([ovs-appctl ovs/route/show | tail -n+2 | sort], [0], [dnl -User: 1.1.2.0/24 dev vtep0 SRC 1.1.2.88 -]) - dnl Check GRE tunnel pop. 
AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1),dnl eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:03),eth_type(0x0800),dnl diff --git a/tests/tunnel.at b/tests/tunnel.at index 282651ac732..71e7c2df4ea 100644 --- a/tests/tunnel.at +++ b/tests/tunnel.at @@ -524,11 +524,12 @@ dummy@ovs-dummy: hit:0 missed:0 v2 3/3: (dummy-internal) ]) -dnl First setup dummy interface IP address, then add the route -dnl so that tnl-port table can get valid IP address for the device. +dnl Setup dummy interface IP address. AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 172.31.1.1/24], [0], [OK ]) -AT_CHECK([ovs-appctl ovs/route/add 172.31.1.0/24 br0], [0], [OK +dnl Checking that a local route for added IP was successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached], [0], [dnl +Cached: 172.31.1.0/24 dev br0 SRC 172.31.1.1 local ]) dnl change the flow table to bump the internal table version @@ -1276,15 +1277,12 @@ OVS_VSWITCHD_START([add-port br0 p1 -- set Interface p1 type=dummy \ ofport_request=2]) OVS_VSWITCHD_DISABLE_TUNNEL_PUSH_POP -dnl First setup dummy interface IP address, then add the route -dnl so that tnl-port table can get valid IP address for the device. +dnl Setup dummy interface IP address. AT_CHECK([ovs-appctl netdev-dummy/ip6addr br0 fc00::1/64], [0], [OK ]) -AT_CHECK([ovs-appctl ovs/route/add fc00::0/64 br0], [0], [OK -]) -AT_CHECK([ovs-appctl ovs/route/show], [0], [dnl -Route Table: -User: fc00::/64 dev br0 SRC fc00::1 +dnl Checking that a local route for added IP was successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached], [0], [dnl +Cached: fc00::/64 dev br0 SRC fc00::1 local ]) AT_DATA([flows.txt], [dnl From 166ee41d282c506d100bc2185d60af277121b55b Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 20 Feb 2024 23:35:43 +0100 Subject: [PATCH 598/833] ofproto-dpif-xlate: Fix ignoring IPv6 local_ip for native tunnels. Local IP is taken into account only in case of IPv4 address, IPv6 source is not checked. 
That leads to the source being ignored during the route lookup and ultimately to packets being encapsulated with a source IP found during a route lookup, which is likely the wrong one. Even worse, after encapsulation we have a difference between the tunnel metadata that contains a correct source IP and the generated actions that used a wrong source IP. This means that if there are OpenFlow rules in a bridge where the packet goes after encapsulation, we may match on rules that do not correspond to the actual packet we have. Add the check for IPv6 source address before the route lookup. Tests are added to check that we're actually using the configured local_ip as a source address in the packet. Also adding the same test for IPv4, since apparently we don't have any tests covering this functionality for userspace tunnels. This issue also affects the case where the source address is set via OpenFlow, e.g. 'set_field:2001:beef::88->tun_ipv6_src', but it's just a different way of populating the tunnel metadata that doesn't depend on the tunnel being a native or a kernel one. So, not adding extra tests for this case for now.
Fixes: 8e4e45887ec3 ("ofproto-dpif-xlate: makes OVS native tunneling honor tunnel-specified source addresses") Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2024-February/052938.html Reported-by: Derrick Lim Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- AUTHORS.rst | 1 + ofproto/ofproto-dpif-xlate.c | 2 + tests/tunnel-push-pop-ipv6.at | 84 +++++++++++++++++++++++++++++++++++ tests/tunnel-push-pop.at | 82 ++++++++++++++++++++++++++++++++++ 4 files changed, 169 insertions(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index fc08f3bbfe8..f99df385ba0 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -589,6 +589,7 @@ David Evans davidjoshuaevans@gmail.com David Palma palma@onesource.pt David van Moolenbroek dvmoolenbroek@aimvalley.nl Derek Cormier derek.cormier@lab.ntt.co.jp +Derrick Lim derrick.lim@rakuten.com Dhaval Badiani dbadiani@vmware.com DK Moon Ding Zhi zhi.ding@6wind.com diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 1cf4d5f7c9b..89f183182ea 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -3815,6 +3815,8 @@ native_tunnel_output(struct xlate_ctx *ctx, const struct xport *xport, if (flow->tunnel.ip_src) { in6_addr_set_mapped_ipv4(&s_ip6, flow->tunnel.ip_src); + } else if (ipv6_addr_is_set(&flow->tunnel.ipv6_src)) { + s_ip6 = flow->tunnel.ipv6_src; } err = tnl_route_lookup_flow(ctx, flow, &d_ip6, &s_ip6, &out_dev); diff --git a/tests/tunnel-push-pop-ipv6.at b/tests/tunnel-push-pop-ipv6.at index 6d9ac684126..3f2cf842927 100644 --- a/tests/tunnel-push-pop-ipv6.at +++ b/tests/tunnel-push-pop-ipv6.at @@ -642,3 +642,87 @@ Listening ports: OVS_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([tunnel_push_pop_ipv6 - local_ip configuration]) + +OVS_VSWITCHD_START( + [add-port br0 p0 \ + -- set Interface p0 type=dummy ofport_request=1 \ + other-config:hwaddr=aa:55:aa:55:00:00]) +AT_CHECK([ovs-appctl vlog/set dpif_netdev:dbg]) +AT_CHECK([ovs-vsctl add-br int-br -- set bridge int-br 
datapath_type=dummy]) +AT_CHECK([ovs-vsctl add-port int-br t2 \ + -- set Interface t2 type=geneve \ + options:local_ip=2001:beef::88 \ + options:remote_ip=2001:cafe::92 \ + options:key=123 ofport_request=2]) + +dnl Setup multiple IP addresses. +AT_CHECK([ovs-appctl netdev-dummy/ip6addr br0 2001:cafe::88/64], [0], [OK +]) +AT_CHECK([ovs-appctl netdev-dummy/ip6addr br0 2001:beef::88/64], [0], [OK +]) +dnl Checking that a local route for added IP was successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached | sort], [0], [dnl +Cached: 2001:beef::/64 dev br0 SRC 2001:beef::88 local +Cached: 2001:cafe::/64 dev br0 SRC 2001:cafe::88 local +]) +AT_CHECK([ovs-ofctl add-flow br0 action=normal]) +AT_CHECK([ovs-ofctl add-flow int-br action=normal]) + +dnl This Neighbor Advertisement from p0 has two effects: +dnl 1. The neighbor cache will learn that 2001:cafe::92 is at f8:bc:12:44:34:b6. +dnl 2. The br0 mac learning will learn that f8:bc:12:44:34:b6 is on p0. +AT_CHECK([ovs-appctl netdev-dummy/receive p0 dnl + 'recirc_id(0),in_port(1),dnl + eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x86dd),dnl + ipv6(src=2001:cafe::92,dst=2001:cafe::88,label=0,proto=58,tclass=0,hlimit=255,frag=no),dnl + icmpv6(type=136,code=0),dnl + nd(target=2001:cafe::92,sll=00:00:00:00:00:00,tll=f8:bc:12:44:34:b6)' +]) + +dnl Check that local_ip is used for encapsulation in the trace. 
+AT_CHECK([ovs-appctl ofproto/trace int-br in_port=LOCAL \ + | grep -E 'tunnel|actions'], [0], [dnl + -> output to native tunnel + -> tunneling to 2001:cafe::92 via br0 + -> tunneling from aa:55:aa:55:00:00 2001:beef::88 to f8:bc:12:44:34:b6 2001:cafe::92 +Datapath actions: tnl_push(tnl_port(6081),header(size=70,type=5,dnl +eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x86dd),dnl +ipv6(src=2001:beef::88,dst=2001:cafe::92,label=0,proto=17,tclass=0x0,hlimit=64),dnl +udp(src=0,dst=6081,csum=0xffff),geneve(vni=0x7b)),out_port(100)),1 +]) + +dnl Now check that the packet actually has the local_ip in the header. +AT_CHECK([ovs-vsctl -- set Interface p0 options:tx_pcap=p0.pcap]) + +packet=50540000000a5054000000091234 +eth=f8bc124434b6aa55aa55000086dd +ip6=60000000001e11402001beef0000000000000000000000882001cafe000000000000000000000092 +dnl Source port is based on a packet hash, so it may differ depending on the +dnl compiler flags and CPU type. Same for UDP checksum. Masked with '....'. +udp=....17c1001e.... +geneve=0000655800007b00 +encap=${eth}${ip6}${udp}${geneve} +dnl Output to tunnel from a int-br internal port. +dnl Checking that the packet arrived and it was correctly encapsulated. +AT_CHECK([ovs-appctl netdev-dummy/receive int-br "${packet}"]) +OVS_WAIT_UNTIL([test $(ovs-pcap p0.pcap | grep -c "${encap}${packet}") -eq 1]) +dnl Sending again to exercise the non-miss upcall path. +AT_CHECK([ovs-appctl netdev-dummy/receive int-br "${packet}"]) +OVS_WAIT_UNTIL([test $(ovs-pcap p0.pcap | grep -c "${encap}${packet}") -eq 2]) + +dnl Finally, checking that the datapath flow also has a local_ip. 
+AT_CHECK([ovs-appctl dpctl/dump-flows | grep tnl_push \ + | strip_ufid | strip_used], [0], [dnl +recirc_id(0),in_port(2),packet_type(ns=0,id=0),dnl +eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x1234), dnl +packets:1, bytes:14, used:0.0s, dnl +actions:tnl_push(tnl_port(6081),header(size=70,type=5,dnl +eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x86dd),dnl +ipv6(src=2001:beef::88,dst=2001:cafe::92,label=0,proto=17,tclass=0x0,hlimit=64),dnl +udp(src=0,dst=6081,csum=0xffff),geneve(vni=0x7b)),out_port(100)),1 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP diff --git a/tests/tunnel-push-pop.at b/tests/tunnel-push-pop.at index 04d17b71f7d..97405636f98 100644 --- a/tests/tunnel-push-pop.at +++ b/tests/tunnel-push-pop.at @@ -779,6 +779,88 @@ AT_CHECK([ovs-appctl dpctl/dump-flows | grep -q 'slow_path(action)'], [0]) OVS_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([tunnel_push_pop - local_ip configuration]) + +OVS_VSWITCHD_START( + [add-port br0 p0 \ + -- set Interface p0 type=dummy ofport_request=1 \ + other-config:hwaddr=aa:55:aa:55:00:00]) +AT_CHECK([ovs-appctl vlog/set dpif_netdev:dbg]) +AT_CHECK([ovs-vsctl add-br int-br -- set bridge int-br datapath_type=dummy]) +AT_CHECK([ovs-vsctl add-port int-br t2 \ + -- set Interface t2 type=geneve \ + options:local_ip=2.2.2.88 \ + options:remote_ip=1.1.2.92 \ + options:key=123 ofport_request=2]) + +dnl Setup multiple IP addresses. +AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 1.1.2.88/24], [0], [OK +]) +AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 2.2.2.88/24], [0], [OK +]) +dnl Checking that a local route for added IP was successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached | sort], [0], [dnl +Cached: 1.1.2.0/24 dev br0 SRC 1.1.2.88 local +Cached: 2.2.2.0/24 dev br0 SRC 2.2.2.88 local +]) +AT_CHECK([ovs-ofctl add-flow br0 action=normal]) +AT_CHECK([ovs-ofctl add-flow int-br action=normal]) + +dnl This ARP reply from p0 has two effects: +dnl 1. 
The ARP cache will learn that 1.1.2.92 is at f8:bc:12:44:34:b6. +dnl 2. The br0 mac learning will learn that f8:bc:12:44:34:b6 is on p0. +AT_CHECK([ovs-appctl netdev-dummy/receive p0 dnl + 'recirc_id(0),in_port(1),dnl + eth(src=f8:bc:12:44:34:b6,dst=ff:ff:ff:ff:ff:ff),eth_type(0x0806),dnl + arp(sip=1.1.2.92,tip=1.1.2.88,op=2,sha=f8:bc:12:44:34:b6,tha=00:00:00:00:00:00)' +]) + +dnl Check that local_ip is used for encapsulation in the trace. +AT_CHECK([ovs-appctl ofproto/trace int-br in_port=LOCAL \ + | grep -E 'tunnel|actions'], [0], [dnl + -> output to native tunnel + -> tunneling to 1.1.2.92 via br0 + -> tunneling from aa:55:aa:55:00:00 2.2.2.88 to f8:bc:12:44:34:b6 1.1.2.92 +Datapath actions: tnl_push(tnl_port(6081),header(size=50,type=5,dnl +eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x0800),dnl +ipv4(src=2.2.2.88,dst=1.1.2.92,proto=17,tos=0,ttl=64,frag=0x4000),dnl +udp(src=0,dst=6081,csum=0x0),geneve(vni=0x7b)),out_port(100)),1 +]) + +dnl Now check that the packet actually has the local_ip in the header. +AT_CHECK([ovs-vsctl -- set Interface p0 options:tx_pcap=p0.pcap]) + +packet=50540000000a5054000000091234 +eth=f8bc124434b6aa55aa5500000800 +ip4=450000320000400040113305020202580101025c +dnl Source port is based on a packet hash, so it may differ depending on the +dnl compiler flags and CPU type. Masked with '....'. +udp=....17c1001e0000 +geneve=0000655800007b00 +encap=${eth}${ip4}${udp}${geneve} +dnl Output to tunnel from a int-br internal port. +dnl Checking that the packet arrived and it was correctly encapsulated. +AT_CHECK([ovs-appctl netdev-dummy/receive int-br "${packet}"]) +OVS_WAIT_UNTIL([test $(ovs-pcap p0.pcap | grep -c "${encap}${packet}") -eq 1]) +dnl Sending again to exercise the non-miss upcall path. +AT_CHECK([ovs-appctl netdev-dummy/receive int-br "${packet}"]) +OVS_WAIT_UNTIL([test $(ovs-pcap p0.pcap | grep -c "${encap}${packet}") -eq 2]) + +dnl Finally, checking that the datapath flow also has a local_ip. 
+AT_CHECK([ovs-appctl dpctl/dump-flows | grep tnl_push \ + | strip_ufid | strip_used], [0], [dnl +recirc_id(0),in_port(2),packet_type(ns=0,id=0),dnl +eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x1234), dnl +packets:1, bytes:14, used:0.0s, dnl +actions:tnl_push(tnl_port(6081),header(size=50,type=5,dnl +eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x0800),dnl +ipv4(src=2.2.2.88,dst=1.1.2.92,proto=17,tos=0,ttl=64,frag=0x4000),dnl +udp(src=0,dst=6081,csum=0x0),geneve(vni=0x7b)),out_port(100)),1 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([tunnel_push_pop - underlay bridge match]) OVS_VSWITCHD_START([add-port br0 p0 -- set Interface p0 type=dummy ofport_request=1 other-config:hwaddr=aa:55:aa:55:00:00]) From f0d1beca6cbb7a27cd6b854bc975f26da0781955 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 20 Feb 2024 10:31:34 +0100 Subject: [PATCH 599/833] dpif-netdev: Do not create handler threads. Avoid unnecessary thread creation as no upcalls are generated, resulting in idle threads waiting for process termination. This optimization significantly reduces memory usage, cutting it by half on a 128 CPU/thread system during testing, with the number of threads reduced from 95 to 0. Acked-by: Mike Pattrick Signed-off-by: Eelco Chaudron --- lib/dpif-netdev.c | 10 +++++++++- ofproto/ofproto-dpif-upcall.c | 25 ++++++++++++++++--------- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 6e4374859a1..e6c53937d8b 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -5250,6 +5250,14 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config) return 0; } +static bool +dpif_netdev_number_handlers_required(struct dpif *dpif_ OVS_UNUSED, + uint32_t *n_handlers) +{ + *n_handlers = 0; + return true; +} + /* Parses affinity list and returns result in 'core_ids'. 
*/ static int parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq) @@ -9989,7 +9997,7 @@ const struct dpif_class dpif_netdev_class = { dpif_netdev_offload_stats_get, NULL, /* recv_set */ NULL, /* handlers_set */ - NULL, /* number_handlers_required */ + dpif_netdev_number_handlers_required, dpif_netdev_set_config, dpif_netdev_queue_to_priority, NULL, /* recv */ diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c index b5cbeed8780..9a5c5c29ce6 100644 --- a/ofproto/ofproto-dpif-upcall.c +++ b/ofproto/ofproto-dpif-upcall.c @@ -584,7 +584,7 @@ static void udpif_start_threads(struct udpif *udpif, uint32_t n_handlers_, uint32_t n_revalidators_) { - if (udpif && n_handlers_ && n_revalidators_) { + if (udpif && n_revalidators_) { /* Creating a thread can take a significant amount of time on some * systems, even hundred of milliseconds, so quiesce around it. */ ovsrcu_quiesce_start(); @@ -592,14 +592,19 @@ udpif_start_threads(struct udpif *udpif, uint32_t n_handlers_, udpif->n_handlers = n_handlers_; udpif->n_revalidators = n_revalidators_; - udpif->handlers = xzalloc(udpif->n_handlers * sizeof *udpif->handlers); - for (size_t i = 0; i < udpif->n_handlers; i++) { - struct handler *handler = &udpif->handlers[i]; + if (udpif->n_handlers) { + udpif->handlers = xzalloc(udpif->n_handlers + * sizeof *udpif->handlers); + for (size_t i = 0; i < udpif->n_handlers; i++) { + struct handler *handler = &udpif->handlers[i]; - handler->udpif = udpif; - handler->handler_id = i; - handler->thread = ovs_thread_create( - "handler", udpif_upcall_handler, handler); + handler->udpif = udpif; + handler->handler_id = i; + handler->thread = ovs_thread_create( + "handler", udpif_upcall_handler, handler); + } + } else { + udpif->handlers = NULL; } atomic_init(&udpif->enable_ufid, udpif->backer->rt_support.ufid); @@ -662,7 +667,9 @@ udpif_set_threads(struct udpif *udpif, uint32_t n_handlers_, if (dpif_number_handlers_required(udpif->dpif, 
&n_handlers_requested)) { forced = true; if (!n_revalidators_) { - n_revalidators_requested = n_handlers_requested / 4 + 1; + n_revalidators_requested = (n_handlers_requested + ? n_handlers_requested + : MAX(count_cpu_cores(), 2)) / 4 + 1; } else { n_revalidators_requested = n_revalidators_; } From 4c32b6d0964c09396db94bf2f3021f02cdcfd48e Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Mon, 26 Feb 2024 08:38:37 -0500 Subject: [PATCH 600/833] dp-packet: Don't offload inner csum if outer isn't supported. Some network cards support inner checksum offloading but not outer checksum offloading. Currently OVS will resolve that outer checksum but allows the network card to resolve the inner checksum, invalidating the outer checksum in the process. Now if we can't offload outer checksums, we don't offload inner either. Reported-at: https://issues.redhat.com/browse/FDP-363 Fixes: 084c8087292c ("userspace: Support VXLAN and GENEVE TSO.") Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/dp-packet.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/lib/dp-packet.c b/lib/dp-packet.c index 305822293b9..df7bf8e6b3a 100644 --- a/lib/dp-packet.c +++ b/lib/dp-packet.c @@ -592,6 +592,18 @@ dp_packet_ol_send_prepare(struct dp_packet *p, uint64_t flags) if (dp_packet_hwol_is_tunnel_geneve(p) || dp_packet_hwol_is_tunnel_vxlan(p)) { tnl_inner = true; + + /* If the TX interface doesn't support UDP tunnel offload but does + * support inner checksum offload and an outer UDP checksum is + * required, then we can't offload inner checksum either. As that would + * invalidate the outer checksum. 
*/ + if (!(flags & NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM) && + dp_packet_hwol_is_outer_udp_cksum(p)) { + flags &= ~(NETDEV_TX_OFFLOAD_TCP_CKSUM | + NETDEV_TX_OFFLOAD_UDP_CKSUM | + NETDEV_TX_OFFLOAD_SCTP_CKSUM | + NETDEV_TX_OFFLOAD_IPV4_CKSUM); + } } if (dp_packet_hwol_tx_ip_csum(p)) { From 5639aa7b6d1865eee2038a17758c26b74aabce2c Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 1 Mar 2024 09:54:14 +0000 Subject: [PATCH 601/833] Documentation: Extend copyright to 2024. IANAL, but I think we can extend the copyright attached to documentation to cover the current year: we are still actively working on the documentation. Signed-off-by: Simon Horman Acked-by: Mike Pattrick Acked-by: Eelco Chaudron --- Documentation/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/conf.py b/Documentation/conf.py index 085ca2cd67c..15785605ad8 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py @@ -48,7 +48,7 @@ # General information about the project. project = u'Open vSwitch' -copyright = u'2016-2021, The Open vSwitch Development Community' +copyright = u'2016-2024, The Open vSwitch Development Community' author = u'The Open vSwitch Development Community' # The version info for the project you're documenting, acts as replacement for From 786a89aba707256244d4d5d428eff779cd161e5d Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 1 Mar 2024 10:30:53 +0000 Subject: [PATCH 602/833] Documentation: Correct spelling errors. Correct spelling errors in .rst files flagged by codespell. Also correct some minor grammar errors in nearby documentation. 
Signed-off-by: Simon Horman Acked-by: Mike Pattrick Acked-by: Eelco Chaudron --- Documentation/howto/sflow.rst | 2 +- Documentation/howto/tc-offload.rst | 10 +++++----- Documentation/intro/install/afxdp.rst | 2 +- Documentation/intro/install/documentation.rst | 6 +++--- Documentation/intro/install/dpdk.rst | 12 ++++++------ Documentation/intro/install/fedora.rst | 4 ++-- Documentation/intro/install/general.rst | 2 +- Documentation/intro/why-ovs.rst | 2 +- Documentation/topics/dpdk/bridge.rst | 6 +++--- Documentation/topics/dpdk/phy.rst | 2 +- Documentation/topics/dpdk/vhost-user.rst | 2 +- Documentation/topics/integration.rst | 2 +- Documentation/topics/porting.rst | 2 +- Documentation/topics/record-replay.rst | 2 +- Documentation/topics/testing.rst | 4 ++-- 15 files changed, 30 insertions(+), 30 deletions(-) diff --git a/Documentation/howto/sflow.rst b/Documentation/howto/sflow.rst index 74d8b8e175f..0b378c93d44 100644 --- a/Documentation/howto/sflow.rst +++ b/Documentation/howto/sflow.rst @@ -68,7 +68,7 @@ cookbook entry, we use `sFlowTrend `__, a free sFlow collector that is a simple cross-platform Java download. Other sFlow collectors should work equally well. `hostMon` has a single NIC, `eth0`, that is connected to the -Management Network. `eth0` has an IP adress that can reach `eth1` on `host1`. +Management Network. `eth0` has an IP address that can reach `eth1` on `host1`. Two Virtual Machines ~~~~~~~~~~~~~~~~~~~~ diff --git a/Documentation/howto/tc-offload.rst b/Documentation/howto/tc-offload.rst index 681dff13e08..ee7f73f8a0c 100644 --- a/Documentation/howto/tc-offload.rst +++ b/Documentation/howto/tc-offload.rst @@ -49,7 +49,7 @@ tc-police action, see ``man tc-police``. Configuration ~~~~~~~~~~~~~ -There is no parameter change in ovs-ofctl command, to configue a meter and use +There is no parameter change in ovs-ofctl command, to configure a meter and use it for a flow in the offload way. 
Usually the commands are like:: $ ovs-ofctl -O OpenFlow13 add-meter br0 "meter=1 pktps bands=type=drop rate=1" @@ -58,10 +58,10 @@ it for a flow in the offload way. Usually the commands are like:: For more details, see ``man ovs-ofctl``. .. note:: - Each meter is mapped to one TC police action. To avovid the conflicton, the - police action index of 0x10000000-0x1fffffff are resevered for the mapping. - You can check the police actions by the command ``tc action ls action police`` - in Linux system. + Each meter is mapped to one TC police action. To avoid conflicts, the + police action indexes 0x10000000-0x1fffffff are reserved for this mapping. + You can check the police actions using the command ``tc action ls action + police`` on Linux systems. Known TC flow offload limitations diff --git a/Documentation/intro/install/afxdp.rst b/Documentation/intro/install/afxdp.rst index 5776614c8e5..964d9ef5b1d 100644 --- a/Documentation/intro/install/afxdp.rst +++ b/Documentation/intro/install/afxdp.rst @@ -150,7 +150,7 @@ To kick start end-to-end autotesting:: make check-afxdp TESTSUITEFLAGS='1' .. note:: - Not all test cases pass at this time. Currenly all cvlan tests are skipped + Not all test cases pass at this time. Currently all cvlan tests are skipped due to kernel issues. If a test case fails, check the log at:: diff --git a/Documentation/intro/install/documentation.rst b/Documentation/intro/install/documentation.rst index acf5b3a3ff3..049ca3d33be 100644 --- a/Documentation/intro/install/documentation.rst +++ b/Documentation/intro/install/documentation.rst @@ -79,9 +79,9 @@ Makefile targets:: .. important:: The ``docs-check`` target will fail if there are any syntax errors. - However, it won't catch more succint issues such as style or grammar issues. - As a result, you should always inspect changes visually to ensure the result - is as intended. + However, it won't catch more succinct issues such as style or grammar + issues. 
As a result, you should always inspect changes visually to ensure + the result is as intended. Once built, documentation is available in the ``/Documentation/_build`` folder. Open the root ``index.html`` to browse the documentation. diff --git a/Documentation/intro/install/dpdk.rst b/Documentation/intro/install/dpdk.rst index ad9bdf22c06..65156966e08 100644 --- a/Documentation/intro/install/dpdk.rst +++ b/Documentation/intro/install/dpdk.rst @@ -232,7 +232,7 @@ Mount the hugepages, if not already mounted by default:: Setup DPDK devices using VFIO ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -VFIO is prefered to the UIO driver when using recent versions of DPDK. VFIO +VFIO is preferred to the UIO driver when using recent versions of DPDK. VFIO support required support from both the kernel and BIOS. For the former, kernel version > 3.6 must be used. For the latter, you must enable VT-d in the BIOS and ensure this is configured via grub. To ensure VT-d is enabled via the BIOS, @@ -506,17 +506,17 @@ options. Affinity ~~~~~~~~ -For superior performance, DPDK pmd threads and Qemu vCPU threads needs to be -affinitized accordingly. +For superior performance, DPDK pmd threads and Qemu vCPU threads need to +have affinity set accordingly. - PMD thread Affinity A poll mode driver (pmd) thread handles the I/O of all DPDK interfaces assigned to it. A pmd thread shall poll the ports for incoming packets, switch the packets and send to tx port. A pmd thread is CPU bound, and needs - to be affinitized to isolated cores for optimum performance. Even though a - PMD thread may exist, the thread only starts consuming CPU cycles if there is - at least one receive queue assigned to the pmd. + to have affinity set to isolated cores for optimum performance. Even + though a PMD thread may exist, the thread only starts consuming CPU cycles if + there is at least one receive queue assigned to the pmd. .. note:: On NUMA systems, PCI devices are also local to a NUMA node. Unbound rx
Unbound rx diff --git a/Documentation/intro/install/fedora.rst b/Documentation/intro/install/fedora.rst index 02481597ffe..49fad844c7f 100644 --- a/Documentation/intro/install/fedora.rst +++ b/Documentation/intro/install/fedora.rst @@ -84,8 +84,8 @@ YUM:: Once that is completed, remove the file ``/tmp/ovs.spec``. -Bootstraping ------------- +Bootstrapping +------------- Refer to :ref:`general-bootstrapping`. diff --git a/Documentation/intro/install/general.rst b/Documentation/intro/install/general.rst index 86e85f75dbf..17c15426805 100644 --- a/Documentation/intro/install/general.rst +++ b/Documentation/intro/install/general.rst @@ -495,7 +495,7 @@ Start ovsdb-server using below command:: $ docker run -itd --net=host --name=ovsdb-server \ : ovsdb-server -Start ovs-vswitchd with priviledged mode as it needs to load kernel module in +Start ovs-vswitchd with privileged mode as it needs to load kernel module in host using below command:: $ docker run -itd --net=host --name=ovs-vswitchd \ diff --git a/Documentation/intro/why-ovs.rst b/Documentation/intro/why-ovs.rst index e73066a7665..80a3f2f22f2 100644 --- a/Documentation/intro/why-ovs.rst +++ b/Documentation/intro/why-ovs.rst @@ -125,7 +125,7 @@ previous hypervisor networking stacks, focusing on the need for automated and dynamic network control in large-scale Linux-based virtualization environments. The goal with Open vSwitch is to keep the in-kernel code as small as possible -(as is necessary for performance) and to re-use existing subsystems when +(as is necessary for performance) and to reuse existing subsystems when applicable (for example Open vSwitch uses the existing QoS stack). As of Linux 3.3, Open vSwitch is included as a part of the kernel and packaging for the userspace utilities are available on most popular distributions. 
diff --git a/Documentation/topics/dpdk/bridge.rst b/Documentation/topics/dpdk/bridge.rst index 00be06e37fe..583105c6425 100644 --- a/Documentation/topics/dpdk/bridge.rst +++ b/Documentation/topics/dpdk/bridge.rst @@ -98,7 +98,7 @@ datapath flows with very simple match criteria. In theory, for very simple forwarding, OVS doesn't need to parse packets at all in order to follow these rules. In practice, due to various implementation constraints, userspace datapath has to match at least on a small set of packet -fileds. Some matching criteria (for example, ingress port) are not related to +fields. Some matching criteria (for example, ingress port) are not related to the packet itself and others (for example, VLAN tag or Ethernet type) can be extracted without fully parsing the packet. This allows OVS to significantly speed up packet forwarding for these flows with simple match criteria. @@ -202,7 +202,7 @@ get command, note the updated priority of the ``avx512_gather`` function:: avx512_gather (Use count: 0, Priority: 3) If two lookup functions have the same priority, the first one in the list is -chosen, and the 2nd occurance of that priority is not used. Put in logical +chosen, and the 2nd occurrence of that priority is not used. Put in logical terms, a subtable is chosen if its priority is greater than the previous best candidate. @@ -280,7 +280,7 @@ composed of bits and blocks where the bits signify which blocks are set or have values where as the blocks hold the metadata, ip, udp, vlan, etc. These values are used by the datapath for switching decisions later. -Most modern CPUs have some SIMD (single instruction, mutiple data) +Most modern CPUs have some SIMD (single instruction, multiple data) capabilities. These SIMD instructions are able to process a vector rather than act on one variable. OVS provides multiple implementations of packet parsing functions. 
This allows the user to take advantage of SIMD instructions like diff --git a/Documentation/topics/dpdk/phy.rst b/Documentation/topics/dpdk/phy.rst index d94eafc9a9b..efd168cba80 100644 --- a/Documentation/topics/dpdk/phy.rst +++ b/Documentation/topics/dpdk/phy.rst @@ -153,7 +153,7 @@ __ https://doc.dpdk.org/guides-23.11/prog_guide/rte_flow.html .. warning:: This feature is not compatible with all NICs. Refer to the DPDK - `compatibilty matrix`__ and vendor documentation for more details. + `compatibility matrix`__ and vendor documentation for more details. __ https://doc.dpdk.org/guides-23.11/nics/overview.html diff --git a/Documentation/topics/dpdk/vhost-user.rst b/Documentation/topics/dpdk/vhost-user.rst index e952a686b55..7866543d89a 100644 --- a/Documentation/topics/dpdk/vhost-user.rst +++ b/Documentation/topics/dpdk/vhost-user.rst @@ -269,7 +269,7 @@ similar to the following:: QEMU waiting for connection on: disconnected:unix:/path/to/socket,server -QEMU will wait until the port is created sucessfully in OVS to boot the VM. +QEMU will wait until the port is created successfully in OVS to boot the VM. One benefit of using this mode is the ability for vHost ports to 'reconnect' in event of the switch crashing or being brought down. Once it is brought back up, the vHost ports will reconnect automatically and normal service will resume. diff --git a/Documentation/topics/integration.rst b/Documentation/topics/integration.rst index 58c4389abef..79bfece8211 100644 --- a/Documentation/topics/integration.rst +++ b/Documentation/topics/integration.rst @@ -250,7 +250,7 @@ with the active server:: 2. Using load balancer vip ip as a master_ip. In order to use this feature, one needs to use listen_on_master_ip_only to no. Current code for load balancer have been tested to work with tcp protocol and needs to be -tested/enchanced for ssl. Using load balancer, standby nodes will not listen on +tested/enhanced for ssl. 
Using load balancer, standby nodes will not listen on nb and sb db ports so that load balancer will always communicate to the active node and all the traffic will be sent to active node only. Standby will continue to sync using LB VIP IP in this case. diff --git a/Documentation/topics/porting.rst b/Documentation/topics/porting.rst index 839b04d52ee..b627fde1260 100644 --- a/Documentation/topics/porting.rst +++ b/Documentation/topics/porting.rst @@ -210,7 +210,7 @@ vSwitch architecture: :: - Architecure + Architecture _ | +-------------------+ diff --git a/Documentation/topics/record-replay.rst b/Documentation/topics/record-replay.rst index 14a568c2120..f723e05dd7a 100644 --- a/Documentation/topics/record-replay.rst +++ b/Documentation/topics/record-replay.rst @@ -44,7 +44,7 @@ measure performance with ``perf``, and so on. .. note:: The current version of record/replay engine does not work correctly with - internal time-based events that leats to communications with other + internal time-based events that lead to communications with other processes. For this reason it can not be used with clustered databases (RAFT implementation is heavily time dependent). In addition, recording automatically disables inactivity probes on diff --git a/Documentation/topics/testing.rst b/Documentation/topics/testing.rst index c6093463d31..9b5fc7448fe 100644 --- a/Documentation/topics/testing.rst +++ b/Documentation/topics/testing.rst @@ -409,7 +409,7 @@ options are used:: checking whether actions Autovalidator is default implementation... yes Compile OVS in debug mode to have `ovs_assert` statements error out if -there is a mis-match in the datapath classifier lookup or packet parser +there is a mismatch in the datapath classifier lookup or packet parser implementations. 
Since the AVX512 implementation of the datapath interface is disabled by @@ -492,7 +492,7 @@ Proof of Concepts ~~~~~~~~~~~~~~~~~ Proof of Concepts are documentation materialized into Ansible recipes -executed in VirtualBox or Libvirt environments orchastrated by Vagrant. +executed in VirtualBox or Libvirt environments orchestrated by Vagrant. Proof of Concepts allow developers to create small virtualized setups that demonstrate how certain Open vSwitch features are intended to work avoiding user introduced errors by overlooking instructions. Proof of Concepts From 3c52cd15acf27013d2fdfd52a3ba2c4f407d7ed7 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 1 Mar 2024 10:31:17 +0000 Subject: [PATCH 603/833] Documentation: Anuket project updates. The Anuket was formed by a merger of OPNFV and CNTT [1]. Also, VswitchPerf, aka vsperf, formerly an OPNFV project, has been renamed ViNePerf [2]. Update links and documentation accordingly. The old links were broken, this was flagged by make check-docs [1] https://anuket.io/news/2021/01/27/lf-networking-launches-anuket-an-open-source-project-to-accelerate-infrastructure-compliance-interoperability-and-5g-deployments/ [2] https://docs.opnfv.org/projects/vineperf/en/latest/release/release-notes/release-notes.html Signed-off-by: Simon Horman Acked-by: Mike Pattrick Acked-by: Eelco Chaudron --- Documentation/topics/dpdk/bridge.rst | 2 +- Documentation/topics/testing.rst | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/Documentation/topics/dpdk/bridge.rst b/Documentation/topics/dpdk/bridge.rst index 583105c6425..a077385e9b3 100644 --- a/Documentation/topics/dpdk/bridge.rst +++ b/Documentation/topics/dpdk/bridge.rst @@ -52,7 +52,7 @@ DPDK physical ports and contain all "dropped", "error" and "management" counters from ``XSTATS``. A list of all ``XSTATS`` counters can be found `here`__. 
-__ https://wiki.opnfv.org/display/fastpath/Collectd+Metrics+and+Events +__ https://wiki.anuket.io/display/HOME/Collectd+Metrics+and+Events .. note:: diff --git a/Documentation/topics/testing.rst b/Documentation/topics/testing.rst index 9b5fc7448fe..dcf10a4db2d 100644 --- a/Documentation/topics/testing.rst +++ b/Documentation/topics/testing.rst @@ -479,14 +479,15 @@ You should invoke scan-view to view analysis results. The last line of output from ``clang-analyze`` will list the command (containing results directory) that you should invoke to view the results on a browser. -vsperf ------- +ViNePerf +-------- -The vsperf project aims to develop a vSwitch test framework that can be used to -validate the suitability of different vSwitch implementations in a telco -deployment environment. More information can be found on the `OPNFV wiki`_. +The ViNePerf project, formerly known as VswitchPerf or vsperf, aims to +develop a vSwitch test framework that can be used to validate the +suitability of different vSwitch implementations in a telco deployment +environment. More information can be found on the `Anuket project wiki`_. -.. _OPNFV wiki: https://wiki.opnfv.org/display/vsperf/VSperf+Home +.. _Anuket project wiki: https://wiki.anuket.io/display/HOME/ViNePERF Proof of Concepts ~~~~~~~~~~~~~~~~~ From 2d7a2bddb1b7bb5238d40581c88cdb44a6480282 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 1 Mar 2024 10:31:55 +0000 Subject: [PATCH 604/833] Documentation: Update Pacemaker link. Update link to OCF Resource Agents documentation as the existing link is broken. Also, use HTTPS. 
Broken link flagged by make check-docs Signed-off-by: Simon Horman Acked-by: Mike Pattrick Acked-by: Eelco Chaudron --- Documentation/topics/integration.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/topics/integration.rst b/Documentation/topics/integration.rst index 79bfece8211..ee83f8d4390 100644 --- a/Documentation/topics/integration.rst +++ b/Documentation/topics/integration.rst @@ -195,7 +195,7 @@ stalled. manager which can manage a defined set of resource across a set of clustered nodes. Pacemaker manages the resource with the help of the resource agents. One among the resource agent is `OCF -`__ +`__ OCF is nothing but a shell script which accepts a set of actions and returns an appropriate status code. From cc0e7951818a48fbbd11664e1ad0a509a180b558 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 1 Mar 2024 10:32:38 +0000 Subject: [PATCH 605/833] Documentation: Update links to upstream Kernel documentation. This updates links to several upstream Kernel documents. 1. Lore is now the canonical archive for the netdev mailing list 2. net-next is now maintained by the netdev team, of which David Miller is currently a member, rather than only by David. Also, use HTTPS rather than HTTP. 3. The Netdev FAQ has evolved into the Netdev Maintainer Handbook. 4. The Kernel security document link was dead, provide the current canonical location for this document instead. 1., 2. & 3. Found by inspection 4. 
Flagged by check-docs Signed-off-by: Simon Horman Acked-by: Mike Pattrick Acked-by: Eelco Chaudron --- .../internals/contributing/backporting-patches.rst | 10 +++++----- Documentation/internals/security.rst | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Documentation/internals/contributing/backporting-patches.rst b/Documentation/internals/contributing/backporting-patches.rst index fae416eb3cd..0ef7f5beb9b 100644 --- a/Documentation/internals/contributing/backporting-patches.rst +++ b/Documentation/internals/contributing/backporting-patches.rst @@ -58,7 +58,7 @@ features which have been applied upstream, or bugfixes to the Open vSwitch datapath code. For bugfixes, the patches subsequently follow the regular Open vSwitch process as described above to reach older branches. -__ http://vger.kernel.org/vger-lists.html#netdev +__ https://lore.kernel.org/netdev/ Changes to userspace components ------------------------------- @@ -93,8 +93,8 @@ Changes to Linux kernel components The Linux kernel components in Open vSwitch go through initial review in the upstream Linux netdev community before they go into the Open vSwitch tree. As such, backports from upstream to the Open vSwitch tree may include bugfixes or -new features. The `netdev-FAQ`_ describes the general process for merging -patches to the upstream Linux tree. +new features. The `Netdev Maintainer Handbook`_ describes the general +process for merging patches to the upstream Linux tree. To keep track of the changes which are made upstream against the changes which have been backported to the Open vSwitch tree, backports should be done in the @@ -113,8 +113,8 @@ interests of keeping the Open vSwitch tree in sync with upstream `net-next`, contributors may send Open vSwitch kernel module changes independently of userspace changes. -.. _netdev-faq: https://www.kernel.org/doc/Documentation/networking/netdev-FAQ.txt -.. 
_net-next: http://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git +.. _Netdev Maintainer Handbook: https://docs.kernel.org/process/maintainer-netdev.html +.. _net-next: https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git How to backport kernel patches ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/Documentation/internals/security.rst b/Documentation/internals/security.rst index 444d07c3563..e211c16a431 100644 --- a/Documentation/internals/security.rst +++ b/Documentation/internals/security.rst @@ -90,11 +90,11 @@ Reporters may ask for a GPG key while initiating contact with the security team to deliver more sensitive reports. The Linux kernel has `its own vulnerability management process -`__. Handling -of vulnerabilities that affect both the Open vSwitch tree and the upstream -Linux kernel should be reported through both processes. Send your report as a -single email to both the kernel and OVS security teams to allow those teams to -most easily coordinate among themselves. +`__. +Handling of vulnerabilities that affect both the Open vSwitch tree and the +upstream Linux kernel should be reported through both processes. Send your +report as a single email to both the kernel and OVS security teams to allow +those teams to most easily coordinate among themselves. Step 2: Assessment ------------------ From 99c86c6c46632e6adfd699611136ea6b26d1e291 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 1 Mar 2024 12:37:36 +0100 Subject: [PATCH 606/833] github: Temporarily disable SNAT with exhaustion system test. With a new runner update, GitHub Actions had a kernel update. And it seems like something changed between kernels 6.2 and 6.5 so this test now fails very frequently. I can reproduce the same issue on RHEL 9, and I can't reproduce it on Ubuntu 23.04 (kernel 6.2). The test is creating a NAT with a single address+port pair in an attempt to simulate an address space exhaustion. 
It is expected that a first connection with wget leaves a conntrack entry in a TIME_WAIT state and the second wget should fail as long as this entry remains, because the only available address+port pair is already taken. However, very frequently (not always!) the second connection replaces the first conntrack entry with a new one and connection succeeds. There is still only one connection in the conntrack at any single moment in time, so there is seemingly no issue with the NAT, but the behavior is unexpected and the test fails. The issue is likely introduced by a new kernel feature that allows to evict connections that are in the process of closing: https://lore.kernel.org/netdev/20230626064749.75525-7-pablo@netfilter.org/ Disable the test in CI until we figure out how to fix it. Acked-by: Simon Horman Acked-by: Paolo Valerio Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- tests/system-traffic.at | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 98e494abf4f..07d09b912e0 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -6388,6 +6388,7 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([conntrack - SNAT with port range with exhaustion]) +OVS_CHECK_GITHUB_ACTION() CHECK_CONNTRACK() CHECK_CONNTRACK_NAT() OVS_TRAFFIC_VSWITCHD_START() From 6fc215de30f51e66e60a7c11083e2597850599e5 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 22 Feb 2024 16:06:32 +0100 Subject: [PATCH 607/833] ofproto-dpif-trace: Fix infinite recirculation tracing. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Trace attempts to process all the recirculations. However, if there is a recirculation loop, i.e. if every recirculation generates another recirculation, this process will never stop. It will grind until the trace fills the system memory. 
A simple reproducer: make sandbox ovs-vsctl add-br br0 ovs-vsctl add-port br0 p1 ovs-ofctl add-flow br0 "table=0,in_port=p1,ip,actions=ct(table=0)" ovs-appctl ofproto/trace br0 in_port=p1,ip Limit the number of recirculations trace is processing with a fairly arbitrary number - 4096 (loosely based on the resubmit limit, but they are not actually related). Not adding a test for this since it's only for a trace, but also because the test may lead to OOM event in a system if the test fails, which is not nice. Fixes: e6bc8e749381 ("ofproto/trace: Add support for tracing conntrack recirculation") Reported-by: Jaime Caamaño Ruiz Acked-by: Simon Horman Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif-trace.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/ofproto/ofproto-dpif-trace.c b/ofproto/ofproto-dpif-trace.c index b86e7fe07eb..87506aa7858 100644 --- a/ofproto/ofproto-dpif-trace.c +++ b/ofproto/ofproto-dpif-trace.c @@ -845,17 +845,35 @@ ofproto_trace(struct ofproto_dpif *ofproto, const struct flow *flow, bool names) { struct ovs_list recirc_queue = OVS_LIST_INITIALIZER(&recirc_queue); + int recirculations = 0; + ofproto_trace__(ofproto, flow, packet, &recirc_queue, ofpacts, ofpacts_len, output, names); struct oftrace_recirc_node *recirc_node; LIST_FOR_EACH_POP (recirc_node, node, &recirc_queue) { + if (recirculations++ > 4096) { + ds_put_cstr(output, "\n\n"); + ds_put_char_multiple(output, '=', 79); + ds_put_cstr(output, "\nTrace reached the recirculation limit." 
+ " Sopping the trace here."); + ds_put_format(output, + "\nQueued but not processed: %"PRIuSIZE + " recirculations.", + ovs_list_size(&recirc_queue) + 1); + oftrace_recirc_node_destroy(recirc_node); + break; + } ofproto_trace_recirc_node(recirc_node, next_ct_states, output); ofproto_trace__(ofproto, &recirc_node->flow, recirc_node->packet, &recirc_queue, ofpacts, ofpacts_len, output, names); oftrace_recirc_node_destroy(recirc_node); } + /* Destroy remaining recirculation nodes, if any. */ + LIST_FOR_EACH_POP (recirc_node, node, &recirc_queue) { + oftrace_recirc_node_destroy(recirc_node); + } } void From 436aba68d52891fb5775ec7651282ccf9d04176b Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Thu, 22 Feb 2024 16:54:14 +0100 Subject: [PATCH 608/833] bond: Reset stats when deleting post recirc rule. In order to properly balance bond traffic, ofproto/bond periodically reads usage statistics of the post-recirculation rules (which are added to a hidden internal table). To do that, each "struct bond_entry" (which represents a hash within a bond) stores the last seen statistics for its rule. When a hash is moved to another member (due to a bond rebalance or the previous member going down), the rule is typically just modified, i.e: same match different actions. In this case, statistics are preserved and accounting continues to work. However, if the rule gets completely deleted (e.g: when all bond members go down) and then re-created, the new rule will have 0 tx_bytes but its associated entry will still store a non-zero last-seen value. This situation leads to an overflow of the delta calculation (computed as [current_stats_value - last_seen_value]), which can affect traffic as the hash will be considered to carry a lot of traffic and rebalancing will kick in. In order to fix this situation, reset the value of last seen statistics on rule deletion. 
Implementation notes: Modifying pr_tx_bytes requires write-locking the global rwlock but a lockless version of update_recirc_rules was being maintained to avoid locking on bond_unref(). Considering the small impact of locking during bond removal, removing the lockless version and relying on clang's thread safety analysis is preferred. Also, folding Ilya's [1], i.e: fixing thread safety annotation in update_recirc_rules() to require holding write-lock. [1] https://patchwork.ozlabs.org/project/openvswitch/patch/20240209161718.1149494-1-i.maximets@ovn.org/ Reported-at: https://github.com/openvswitch/ovs-issues/issues/319 Co-authored-by: Ilya Maximets Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- ofproto/bond.c | 33 +++++++++++++++------------------ tests/ofproto-dpif.at | 17 +++++++++++++++++ 2 files changed, 32 insertions(+), 18 deletions(-) diff --git a/ofproto/bond.c b/ofproto/bond.c index cfdf44f8542..c31869a4c76 100644 --- a/ofproto/bond.c +++ b/ofproto/bond.c @@ -186,7 +186,7 @@ static struct bond_member *choose_output_member(const struct bond *, struct flow_wildcards *, uint16_t vlan) OVS_REQ_RDLOCK(rwlock); -static void update_recirc_rules__(struct bond *); +static void update_recirc_rules(struct bond *) OVS_REQ_WRLOCK(rwlock); static bool bond_may_recirc(const struct bond *); static void bond_update_post_recirc_rules__(struct bond *, bool force) OVS_REQ_WRLOCK(rwlock); @@ -299,7 +299,10 @@ bond_unref(struct bond *bond) } free(bond->hash); bond->hash = NULL; - update_recirc_rules__(bond); + + ovs_rwlock_wrlock(&rwlock); + update_recirc_rules(bond); + ovs_rwlock_unlock(&rwlock); hmap_destroy(&bond->pr_rule_ops); free(bond->primary); @@ -331,17 +334,8 @@ add_pr_rule(struct bond *bond, const struct match *match, hmap_insert(&bond->pr_rule_ops, &pr_op->hmap_node, hash); } -/* This function should almost never be called directly. - * 'update_recirc_rules()' should be called instead.
Since - * this function modifies 'bond->pr_rule_ops', it is only - * safe when 'rwlock' is held. - * - * However, when the 'bond' is the only reference in the system, - * calling this function avoid acquiring lock only to satisfy - * lock annotation. Currently, only 'bond_unref()' calls - * this function directly. */ static void -update_recirc_rules__(struct bond *bond) +update_recirc_rules(struct bond *bond) OVS_REQ_WRLOCK(rwlock) { struct match match; struct bond_pr_rule_op *pr_op; @@ -407,6 +401,15 @@ update_recirc_rules__(struct bond *bond) VLOG_ERR("failed to remove post recirculation flow %s", err_s); free(err_s); + } else if (bond->hash) { + /* If the flow deletion failed, a subsequent call to + * ofproto_dpif_add_internal_flow() would just modify the + * flow preserving its statistics. Therefore, only reset + * the entry's byte counter if it succeeds. */ + uint32_t hash = pr_op->match.flow.dp_hash & BOND_MASK; + struct bond_entry *entry = &bond->hash[hash]; + + entry->pr_tx_bytes = 0; } hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node); @@ -421,12 +424,6 @@ update_recirc_rules__(struct bond *bond) ofpbuf_uninit(&ofpacts); } -static void -update_recirc_rules(struct bond *bond) - OVS_REQ_RDLOCK(rwlock) -{ - update_recirc_rules__(bond); -} /* Updates 'bond''s overall configuration to 's'. * diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at index daeea7775c2..a1393f7f8e5 100644 --- a/tests/ofproto-dpif.at +++ b/tests/ofproto-dpif.at @@ -547,6 +547,23 @@ ovs-appctl time/warp 1000 100 ovs-appctl bond/show > bond3.txt AT_CHECK([sed -n '/member p2/,/^$/p' bond3.txt | grep 'hash'], [0], [ignore]) +# Check that both ports doing down and back up doesn't break statistics. 
+AT_CHECK([ovs-appctl netdev-dummy/set-admin-state p1 down], 0, [OK +]) +AT_CHECK([ovs-appctl netdev-dummy/set-admin-state p2 down], 0, [OK +]) +ovs-appctl time/warp 1000 100 +AT_CHECK([ovs-appctl netdev-dummy/set-admin-state p1 up], 0, [OK +]) +AT_CHECK([ovs-appctl netdev-dummy/set-admin-state p2 up], 0, [OK +]) +ovs-appctl time/warp 1000 100 + +AT_CHECK([SEND_TCP_BOND_PKTS([p5], [5], [65500])]) +# We sent 49125 KB of data total in 3 batches. No hash should have more +# than that amount of load. Just checking that it is within 5 digits. +AT_CHECK([ovs-appctl bond/show | grep -E '[[0-9]]{6}'], [1]) + OVS_VSWITCHD_STOP() AT_CLEANUP From d439c201799e9c2316ca5848e1539c9566bf294c Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 1 Mar 2024 22:10:37 +0100 Subject: [PATCH 609/833] appveyor: Print out config.log on configuration failure. We need to know exact linking / compilation errors in order to fix issues. We could have uploaded it as an artifact, but it seems easier to just print it out for now. Acked-by: Simon Horman Acked-by: Alin-Gabriel Serdean Signed-off-by: Ilya Maximets --- .ci/windows-build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/windows-build.sh b/.ci/windows-build.sh index 22994fcdd60..e54fbacf446 100644 --- a/.ci/windows-build.sh +++ b/.ci/windows-build.sh @@ -9,7 +9,7 @@ CONFIGURATION=$1 --prefix=C:/openvswitch/usr --localstatedir=C:/openvswitch/var \ --sysconfdir=C:/openvswitch/etc --with-pthread=c:/PTHREADS-BUILT/ \ --enable-ssl --with-openssl=C:/OpenSSL-Win64 \ - --with-vstudiotarget="${CONFIGURATION}" + --with-vstudiotarget="${CONFIGURATION}" || (cat config.log && exit 1) make -j4 make datapath_windows_analyze From f5fa9a0a3cffc9cfa140acda69d5e7c8ac201fee Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 1 Mar 2024 22:10:38 +0100 Subject: [PATCH 610/833] ovs-pki: Fix file permissions on Windows. 
There is no chmod or 'mkdir -m' support on Windows, so setting file permissions for keys and certificates doesn't actually work. Implementing them using icacls utility instead. ovs-pki script currently only uses 0700 and 0750 modes, so only those (and 0600) are implemented. NTFS ACLs on Windows are fairly different and more complex in comparison with Unix file permissions, so it's hard to implement these functions in a generic way. The script will fail if it encounters an unknown mode. 0700 is implemented as a F (full access) for 'Creator Owner' with no other permissions. 0750 has an additional RX (read+execute) for the 'Creator Group'. 0600 is implemented the same as 0700, since it doesn't matter for this use case to have or not to have an executable or traversal permissions managed separately from everything else and it would be a little overly verbose to give all the permissions except for X. Inheritance rules are set to (OI)(CI), so the folder itself, subfolders and files in a folder inherit those ACEs. 'umask' also doesn't work on Windows. Instead, moving the private key output files to a temporary folder that has restricted access already configured. The file will inherit these restricted ACEs. It should not be necessary to set explicit permissions for these files since moving them within the same volume should preserve ACEs. However, it might be safer to chmod them directly as well, just in case. Windows administrators will still have to be careful with private keys, because file copies do not preserve permissions and moves to different volumes do not preserve them as well. 'robocopy' with flags to copy security should be used in these cases. We may want to re-implement 'mv' with 'robocopy' if that becomes a problem in the future.
There is one more place where umask is used in the script for creation of a self-signed certificate, but it is not actually needed there since the resulting certificate doesn't need to be private, so not changing this part for now. Tested with running an empty 'make check' in AppVeyor and examining permissions for files in tests/pki: Files | Linux | Windows ---------------------+------------+-------------------------------------- controllerca | drwxr-xr-x | NT AUTHORITY\SYSTEM:(I)(OI)(CI)(F) switchca | | BUILTIN\Administrators:(I)(OI)(CI)(F) *ca\certs | | BUILTIN\Users:(I)(OI)(CI)(RX) *ca\crl | | BUILTIN\Users:(I)(CI)(AD) *ca\newcerts | | BUILTIN\Users:(I)(CI)(WD) | | APPVEYOR-VM\appveyor:(I)(F) | | CREATOR OWNER:(I)(OI)(CI)(IO)(F) ---------------------+------------+-------------------------------------- stamp | -rw-r--r-- | NT AUTHORITY\SYSTEM:(I)(F) test-cert.pem | | BUILTIN\Administrators:(I)(F) test-req.pem | | BUILTIN\Users:(I)(RX) test2-cert.pem | | APPVEYOR-VM\appveyor:(I)(F) test2-req.pem | | *ca\ca.cnf | | *ca\cacert.pem | | *ca\careq.pem | | *ca\crlnumber | | *ca\index.txt* | | *ca\serial* | | *ca\newcerts\*.pem | | ---------------------+------------+-------------------------------------- controllerca\private | drwx------ | APPVEYOR-VM\appveyor:(F) switchca\private | | CREATOR OWNER:(OI)(CI)(IO)(F) ---------------------+------------+-------------------------------------- test-privkey.pem | -rw------- | APPVEYOR-VM\appveyor:(F) test2-privkey.pem | | *ca\private\cakey.pem| | We can see that private folders and keys have only a full access from their owners. Other files and folders have some extra inherited ACEs from a containing folder.
Acked-by: Simon Horman Acked-by: Alin-Gabriel Serdean Signed-off-by: Ilya Maximets --- utilities/ovs-pki.in | 87 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 84 insertions(+), 3 deletions(-) diff --git a/utilities/ovs-pki.in b/utilities/ovs-pki.in index b0c5389031e..3d2ef911c94 100755 --- a/utilities/ovs-pki.in +++ b/utilities/ovs-pki.in @@ -57,6 +57,77 @@ FreeBSD|NetBSD|Darwin) ;; esac +case $(uname -s) in +MINGW*|MSYS*) + chmod() + { + local PERM=$1 + local FILE=$2 + local INH= + + if test -d "${FILE}"; then + # Inheritance rules for folders: apply to a folder itself, + # subfolders and files within. + INH='(OI)(CI)' + fi + + case "${PERM}" in + *700 | *600) + # Reset all own and inherited ACEs and grant full access to the + # "Creator Owner". We're giving full access even for 0600, + # because it doesn't matter for a use case of ovs-pki. + icacls "${FILE}" /inheritance:r /grant:r "*S-1-3-0:${INH}F" + ;; + *750) + # Reset all own and inherited ACEs, grant full access to the + # "Creator Owner" and a read+execute access to the "Creator Group". + icacls "${FILE}" /inheritance:r /grant:r \ + "*S-1-3-0:${INH}F" "*S-1-3-1:${INH}RX" + ;; + *) + echo >&2 "Unable to set ${PERM} mode for ${FILE}." + exit 1 + ;; + esac + } + + mkdir() + { + ARG_P= + PERM= + for arg; do + shift + case ${arg} in + -m?*) + PERM=${arg#??} + continue + ;; + -m) + PERM=$1 + shift + continue + ;; + -p) + ARG_P=-p + continue + ;; + *) + set -- "$@" "${arg}" + ;; + esac + done + + command mkdir ${ARG_P} $@ + if [ ${PERM} ]; then + for dir; do + shift + chmod ${PERM} ${dir} + done + fi + } + ;; +esac + for option; do # This option-parsing mechanism borrowed from a Autoconf-generated # configure script under the following license: @@ -466,14 +537,24 @@ CN = $cn [ v3_req ] subjectAltName = DNS:$cn EOF + # It is important to create private keys in $TMP because umask doesn't + # work on Windows and permissions there are inherited from the folder. 
+ # umask itself is still needed though to ensure correct permissions + # on non-Windows platforms. if test $keytype = rsa; then - (umask 077 && openssl genrsa -out "$1-privkey.pem" $bits) 1>&3 2>&3 \ - || exit $? + (umask 077 && openssl genrsa -out "$TMP/privkey.pem" $bits) \ + 1>&3 2>&3 || exit $? else must_exist "$dsaparam" - (umask 077 && openssl gendsa -out "$1-privkey.pem" "$dsaparam") \ + (umask 077 && openssl gendsa -out "$TMP/privkey.pem" "$dsaparam") \ 1>&3 2>&3 || exit $? fi + # Windows: applying permissions (ACEs) to the file itself, just in case. + # 'mv' should technically preserve all the inherited ACEs from a TMP + # folder, but it's better to not rely on that. + chmod 0600 "$TMP/privkey.pem" + mv "$TMP/privkey.pem" "$1-privkey.pem" + openssl req -config "$TMP/req.cnf" -new -text \ -key "$1-privkey.pem" -out "$1-req.pem" 1>&3 2>&3 } From 68e93122144d61d531849956a3037288a575e761 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 1 Mar 2024 22:10:39 +0100 Subject: [PATCH 611/833] m4: Fix linking with OpenSSL 1.1.0+ and 3+ on Windows. OpenSSL 1.1.0 changed the library names from libeay32 and ssleay32 to standard libssl and libcrypto. All the versions of OpenSSL that used old names reached their official EoL, so it should be safe to just migrate to new names. They can still be supported via premium support option, but I don't think that is important for us. 
Also, OpenSSL installers for older versions had the following folder structure: C:\OPENSSL-WIN64\ +---bin +---include | +---openssl +---lib | libeay32.lib | ssleay32.lib +---VC libeay32MD.lib libeay32MDd.lib libeay32MT.lib libeay32MTd.lib ssleay32MD.lib ssleay32MDd.lib ssleay32MT.lib ssleay32MTd.lib With newer OpenSSL 3+ the structure is different: C:\OPENSSL-WIN64 +---bin +---include | +---openssl +---lib +---VC +---x64 +---MD | libcrypto.lib | libssl.lib +---MDd | libcrypto.lib | libssl.lib +---MT | libcrypto.lib | libssl.lib +---MTd libcrypto.lib libssl.lib Basically, instead of one generic library in the lib folder and a bunch of differently named versions of it for different type of linkage, we now have multiple instances of the library located in different folders based on the linkage type. So, we have to provide an exact path in order to find the library. 'lib/VC/x64/MT' was chosen in this patch since it is a way used for building in build-aux/ccl. MD stands for dynamic linking, MT is static, 'd' stands for debug versions of the libraries. While at it, fixing documentation examples to point to Win64 default installation folder. Acked-by: Simon Horman Acked-by: Alin-Gabriel Serdean Signed-off-by: Ilya Maximets --- Documentation/intro/install/windows.rst | 6 +++--- m4/ax_check_openssl.m4 | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Documentation/intro/install/windows.rst b/Documentation/intro/install/windows.rst index fce099d5dc1..efdb8aebcea 100644 --- a/Documentation/intro/install/windows.rst +++ b/Documentation/intro/install/windows.rst @@ -112,7 +112,7 @@ The following explains the steps in some detail. `OpenSSL for Windows `__ Note down the directory where OpenSSL is installed (e.g.: - ``C:/OpenSSL-Win32``) for later use. + ``C:/OpenSSL-Win64``) for later use. .. 
note:: @@ -182,7 +182,7 @@ To configure with SSL support, add the requisite additional options: --localstatedir="C:/openvswitch/var" --sysconfdir="C:/openvswitch/etc" \ --with-pthread="C:/pthread" \ - --enable-ssl --with-openssl="C:/OpenSSL-Win32" + --enable-ssl --with-openssl="C:/OpenSSL-Win64" Finally, to the kernel module also: @@ -194,7 +194,7 @@ Finally, to the kernel module also: --localstatedir="C:/openvswitch/var" \ --sysconfdir="C:/openvswitch/etc" \ --with-pthread="C:/pthread" \ - --enable-ssl --with-openssl="C:/OpenSSL-Win32" \ + --enable-ssl --with-openssl="C:/OpenSSL-Win64" \ --with-vstudiotarget="" \ --with-vstudiotargetver="" diff --git a/m4/ax_check_openssl.m4 b/m4/ax_check_openssl.m4 index 281d4dc65eb..faa5babde26 100644 --- a/m4/ax_check_openssl.m4 +++ b/m4/ax_check_openssl.m4 @@ -81,7 +81,8 @@ AC_DEFUN([AX_CHECK_OPENSSL], [ SSL_INCLUDES="-I$ssldir/include" SSL_LDFLAGS="-L$ssldir/lib" if test "$WIN32" = "yes"; then - SSL_LIBS="-lssleay32 -llibeay32" + SSL_LDFLAGS="$SSL_LDFLAGS -L$ssldir/lib/VC/x64/MT" + SSL_LIBS="-llibssl -llibcrypto" SSL_DIR=/$(echo ${ssldir} | ${SED} -e 's/://') else SSL_LIBS="-lssl -lcrypto" From 9d8208484a350056e4e0ccc172df0455522cfbba Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 1 Mar 2024 22:10:40 +0100 Subject: [PATCH 612/833] appveyor: Build with OpenSSL 3.0. OpenSSL 1.0.2u is long deprecated and not available for download. So, our CI never actually downloads it and uses whatever is in the OpenSSL-Win64 folder provided by AppVeyor. Luckily, it happens to be OpenSSL 1.0.2u today. The oldest supported version of OpenSSL upstream today is 3.0. And it is an LTS version. 3.1 and 3.2 are not LTS. Use OpenSSL 3.0 for testing instead. This commit does a few things to achieve that: 1. Removes the folder provided by AppVeyor. This way we will fail the build if something goes wrong instead of silently using OpenSSL version provided by AppVeyor. 2. 
Obtains the JSON description of available releases and downloads the latest minor version of OpenSSL 3.0 64-bit. With this approach we should not need to update the download link that frequently. New minor releases will be picked up automatically. They should not have any breaking changes, so should be fine to use in CI. OpenSSL 3.0 is supported until at least Sep 2026. The JSON file is an official file referenced on the: https://slproweb.com/products/Win32OpenSSL.html So, it should be safe to use. 3. Executes the downloaded installer with 'Start-Process -Wait' to properly wait for installation to finish instead of just sleeping for 30 seconds. 4. Caches the downloaded installer, so we're not downloading 300 MB on each CI run as that is not nice to do. We know the hash of the latest version, so we will re-download only when the binary changes, i.e. on a new minor release. For the cache to work we need to introduce the 'install' phase, because caches are populated after 'init', but before 'install'. Alternatively, we could have just renamed 'init' to 'install', but I think it's a little nicer to have separate phases, and we can also move 'windows-prepare.sh' to the install phase. Cache is also invalidated whenever appveyor.yml changes. 
Acked-by: Simon Horman Acked-by: Alin-Gabriel Serdean Signed-off-by: Ilya Maximets --- appveyor.yml | 52 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 373f01a43cc..29cc44d6c6f 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -8,28 +8,60 @@ configuration: - Release clone_folder: C:\openvswitch_compile shallow_clone: true + init: - ps: $env:PATH ="C:\Python312-x64;"+$env:PATH - ps: New-Item -Type HardLink -Path "C:\Python312-x64\python3.exe" -Value "C:\Python312-x64\python.exe" + +cache: +- C:\ovs-build-downloads -> appveyor.yml + +install: - ps: | - mkdir C:\ovs-build-downloads + Remove-Item -Recurse -Force -Path C:/OpenSSL-Win64 + New-Item -ItemType Directory -Force -Path C:\ovs-build-downloads + + # Find and download the latest stable OpenSSl 3.0. + $URL = "https://raw.githubusercontent.com/slproweb/opensslhashes/master/win32_openssl_hashes.json" + $webData = (Invoke-WebRequest -Uri $URL).content | ConvertFrom-Json + $source = ($webData.files.PSObject.Properties | Where-Object { + $_.Value.basever -match "3.0.*" -and + $_.Value.bits -eq "64" -and + $_.Value.arch -eq "INTEL" -and + $_.Value.installer -eq "exe" -and + -not $_.Value.light + } | Select-Object Value).PSObject.Properties.Value + + Write-Host "Latest OpenSSL 3.0:" ($source | Format-List | Out-String) + + $destination = "C:\ovs-build-downloads\Win64OpenSSL.exe" + if (Test-Path $destination) { + $fileHash = (Get-FileHash $destination -Algorithm SHA256).Hash.ToLower() + if ($fileHash -ne $source.sha256) { + Write-Host "Cache miss:" $fileHash "!=" $source.sha256 + Remove-Item -Path $destination + } + } - $source = "https://slproweb.com/download/Win64OpenSSL-1_0_2u.exe" - $destination = "C:\ovs-build-downloads\Win64OpenSSL-1_0_2u.exe" - Invoke-WebRequest $source -OutFile $destination + if (Test-Path $destination) { + Write-Host "Using cached:" $destination + } else { + Write-Host "Downloading:" $source.url + 
Invoke-WebRequest $source.url -OutFile $destination + } + + Write-Host "Installing:" $destination + Start-Process -FilePath $destination ` + -ArgumentList "/silent /verysilent /sp- /suppressmsgboxes" -Wait - cd C:\ovs-build-downloads - .\Win64OpenSSL-1_0_2u.exe /silent /verysilent /sp- /suppressmsgboxes - Start-Sleep -s 30 - cd C:\openvswitch_compile - ps: git clone -q https://git.code.sf.net/p/pthreads4w/code c:\pthreads4w-code - ps: python3 -m pip install pypiwin32 --disable-pip-version-check - -build_script: - '"C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvars64.bat"' - ps: C:\msys64\msys2_shell.cmd -here -defterm -no-start -use-full-path -c ".ci/windows-prepare.sh 2>&1" + +build_script: - ps: C:\msys64\msys2_shell.cmd -here -defterm -no-start -use-full-path -c ".ci/windows-build.sh $env:CONFIGURATION 2>&1" - ps: cp C:\PTHREADS-BUILT\bin\pthreadVC3.dll C:\openvswitch\usr\bin From 29e09c80916c0014a9ffd64cb96fdc979cc3f38e Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 6 Mar 2024 10:07:36 +0000 Subject: [PATCH 613/833] vswitch.xml: Use member wording for bonds. Since the patch-set that included [1] there has been a policy of using the term member for bonds, LACP, and bundle contexts. This is consistent with the more recently adopted policy of using the inclusive naming word list v1 [2, 3]. This patch addresses two instances where the term member should be used in vswitch.xml. It does not address instances of alternative wording that require code updates, which can addressed as follow-up activity. 
[1] 91fc374a9c5a ("Eliminate use of term "slave" in bond, LACP, and bundle contexts.") [2] df5e5cf4318a ("Documentation: Add section on inclusive language.") [3] https://inclusivenaming.org/word-lists/ Signed-off-by: Simon Horman Acked-by: Kevin Traynor Acked-by: Eelco Chaudron --- vswitchd/vswitch.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 612ba41e3b2..8a1b607d71b 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -2159,7 +2159,7 @@ - If a slave interface with this name exists in the bond and + If a member interface with this name exists in the bond and is up, it will be made active. Relevant only when is active-backup or if balance-tcp falls back @@ -6291,7 +6291,7 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \ and avoids recirculation of packet in datapath. It is supported only for balance-tcp bond mode in netdev datapath. The new action gives higher performance by using bond buckets instead of post - recirculation flows for selection of slave port from bond. By default + recirculation flows for selection of member port from bond. By default this new action is disabled, however it can be enabled by setting in table. From 0c255bf763cc83240fcedfa178c7e42cc45d82ff Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 6 Mar 2024 10:07:43 +0000 Subject: [PATCH 614/833] Documentation: Update to refer to main repository. Recently OVS adopted a policy of using the inclusive naming word list v1 [1, 2]. This patch addresses the use of the term master repository by using the term main repository instead. This is as distinct from addressing the use of a master branch, which remains as a follow-up task.
[1] df5e5cf ("Documentation: Add section on inclusive language.") [2] https://inclusivenaming.org/word-lists/ Signed-off-by: Simon Horman Acked-by: Kevin Traynor Acked-by: Eelco Chaudron --- Documentation/internals/committer-grant-revocation.rst | 2 +- Documentation/intro/install/dpdk.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/internals/committer-grant-revocation.rst b/Documentation/internals/committer-grant-revocation.rst index c011df4aec0..7231762d8f1 100644 --- a/Documentation/internals/committer-grant-revocation.rst +++ b/Documentation/internals/committer-grant-revocation.rst @@ -26,7 +26,7 @@ OVS Committer Grant/Revocation Policy ===================================== An OVS committer is a participant in the project with the ability to commit -code directly to the master repository. Commit access grants a broad ability to +code directly to the main repository. Commit access grants a broad ability to affect the progress of the project as presented by its most important artifact, the code and related resources that produce working binaries of Open vSwitch. As such it represents a significant level of trust in an individual's diff --git a/Documentation/intro/install/dpdk.rst b/Documentation/intro/install/dpdk.rst index 65156966e08..c92e598d7ae 100644 --- a/Documentation/intro/install/dpdk.rst +++ b/Documentation/intro/install/dpdk.rst @@ -33,7 +33,7 @@ userspace. The :doc:`releases FAQ ` lists support for the required versions of DPDK for each version of Open vSwitch. If building OVS and - DPDK outside of the master build tree users should consult this list + DPDK outside of the main build tree users should consult this list first. Build requirements From f92b30a0ff88d54f2e1c437aa24b698702c2329b Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 6 Mar 2024 10:07:54 +0000 Subject: [PATCH 615/833] netdev-linux: Rename struct nedev_linux field as is_lag_primary. 
Recently OVS adopted a policy of using the inclusive naming word list v1 [1, 2]. This patch partially addresses the use of the term master in the context of LAG devices by using the term primary instead: the is_lag_master field of struct netdev_linux is renamed is_lag_primary. A related comment is also updated. No functional change intended. [1] df5e5cf ("Documentation: Add section on inclusive language.") [2] https://inclusivenaming.org/word-lists/ Signed-off-by: Simon Horman Acked-by: Kevin Traynor Acked-by: Eelco Chaudron --- lib/netdev-linux-private.h | 2 +- lib/netdev-linux.c | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/lib/netdev-linux-private.h b/lib/netdev-linux-private.h index 188e8438a32..8e572e3b3b1 100644 --- a/lib/netdev-linux-private.h +++ b/lib/netdev-linux-private.h @@ -105,7 +105,7 @@ struct netdev_linux { uint64_t rx_dropped; /* Packets dropped while recv from kernel. */ /* LAG information. */ - bool is_lag_master; /* True if the netdev is a LAG master. */ + bool is_lag_primary; /* True if the netdev is a LAG primary. */ int numa_id; /* NUMA node id. */ diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index bf91ef462ef..1f996454d8f 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -885,7 +885,7 @@ netdev_linux_update__(struct netdev_linux *dev, } if (change->primary && netdev_linux_kind_is_lag(change->primary)) { - dev->is_lag_master = true; + dev->is_lag_primary = true; } dev->ifindex = change->if_index; @@ -3703,8 +3703,9 @@ netdev_linux_get_block_id(struct netdev *netdev_) netdev_linux_update_via_netlink(netdev); } - /* Only assigning block ids to linux netdevs that are LAG masters. */ - if (netdev->is_lag_master) { + /* Only assigning block ids to linux netdevs that are + * LAG primary members. 
*/ + if (netdev->is_lag_primary) { block_id = netdev->ifindex; } ovs_mutex_unlock(&netdev->mutex); @@ -6903,7 +6904,7 @@ netdev_linux_update_via_netlink(struct netdev_linux *netdev) changed = true; } if (change->primary && netdev_linux_kind_is_lag(change->primary)) { - netdev->is_lag_master = true; + netdev->is_lag_primary = true; } if (changed) { netdev_change_seq_changed(&netdev->up); From b3ebc34a065ec4ade0520d907f4d6f57f4866382 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 6 Mar 2024 10:09:10 +0000 Subject: [PATCH 616/833] netdev-linux: Rename local variables as primary_*. Recently OVS adopted a policy of using the inclusive naming word list v1 [1, 2]. This patch partially addresses the use of the term master in the context of LAG devices by using the term primary instead: the local variables master_netdev and master_name are renamed as primary_netdev and primary_name. Related comments are also updated. No functional change intended. [1] df5e5cf ("Documentation: Add section on inclusive language.") [2] https://inclusivenaming.org/word-lists/ Signed-off-by: Simon Horman Acked-by: Kevin Traynor Acked-by: Eelco Chaudron --- lib/netdev-linux.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 1f996454d8f..1e904d8e631 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -686,26 +686,26 @@ netdev_linux_update_lag(struct rtnetlink_change *change) lag = shash_find_data(&lag_shash, change->ifname); if (!lag) { - struct netdev *master_netdev; - char master_name[IFNAMSIZ]; + struct netdev *primary_netdev; + char primary_name[IFNAMSIZ]; uint32_t block_id; int error = 0; - if (!if_indextoname(change->master_ifindex, master_name)) { + if (!if_indextoname(change->master_ifindex, primary_name)) { return; } - master_netdev = netdev_from_name(master_name); - if (!master_netdev) { + primary_netdev = netdev_from_name(primary_name); + if (!primary_netdev) { return; } - /* If 
LAG master is not attached to ovs, ingress block on LAG - * members shoud not be updated. */ - if (!master_netdev->auto_classified && - is_netdev_linux_class(master_netdev->netdev_class)) { - block_id = netdev_get_block_id(master_netdev); + /* If LAG primary member is not attached to ovs, + * ingress block on LAG members should not be updated. */ + if (!primary_netdev->auto_classified && + is_netdev_linux_class(primary_netdev->netdev_class)) { + block_id = netdev_get_block_id(primary_netdev); if (!block_id) { - netdev_close(master_netdev); + netdev_close(primary_netdev); return; } @@ -715,7 +715,7 @@ netdev_linux_update_lag(struct rtnetlink_change *change) /* delete ingress block in case it exists */ tc_add_del_qdisc(change->if_index, false, 0, TC_INGRESS); - /* LAG master is linux netdev so add member to same block. */ + /* LAG primary is linux netdev so add member to same block. */ error = tc_add_del_qdisc(change->if_index, true, block_id, TC_INGRESS); if (error) { @@ -726,7 +726,7 @@ netdev_linux_update_lag(struct rtnetlink_change *change) } } - netdev_close(master_netdev); + netdev_close(primary_netdev); } } else if (change->master_ifindex == 0) { /* Check if this was a lag member that has been removed. */ From e0aa15f897f1e10da691d3ecd51d8c6305e5115c Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 6 Mar 2024 10:09:51 +0000 Subject: [PATCH 617/833] utilities: Use localhost as sample hostname. Recently OVS adopted a policy of using the inclusive naming word list v1 [1, 2]. This patch addresses the use of the term master in the context of a hostname used in documentation of the kernel_delay utility. It does so by using localhost as the hostname instead. 
[1] df5e5cf ("Documentation: Add section on inclusive language.") [2] https://inclusivenaming.org/word-lists/ Signed-off-by: Simon Horman Acked-by: Kevin Traynor Acked-by: Eelco Chaudron --- utilities/usdt-scripts/kernel_delay.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/utilities/usdt-scripts/kernel_delay.rst b/utilities/usdt-scripts/kernel_delay.rst index e2e43752d20..0f6f916a71e 100644 --- a/utilities/usdt-scripts/kernel_delay.rst +++ b/utilities/usdt-scripts/kernel_delay.rst @@ -553,7 +553,7 @@ First the containers need to be started: .. code-block:: console - [core@sno-master ~]$ sudo podman run -it --rm \ + [core@localhost ~]$ sudo podman run -it --rm \ -e PS1='[(DEBUG)\u@\h \W]\$ ' \ --privileged --network=host --pid=host \ -v /lib/modules:/lib/modules:ro \ @@ -562,14 +562,14 @@ First the containers need to be started: -v /:/mnt/rootdir \ quay.io/fedora/fedora:38-x86_64 - [(DEBUG)root@sno-master /]# + [(DEBUG)root@localhost /]# Next add the ``linux_delay.py`` dependencies: .. code-block:: console - [(DEBUG)root@sno-master /]# dnf install -y bcc-tools perl-interpreter \ + [(DEBUG)root@localhost /]# dnf install -y bcc-tools perl-interpreter \ python3-pytz python3-psutil @@ -578,7 +578,7 @@ version: .. code-block:: console - [(DEBUG)root@sno-master home]# rpm -i \ + [(DEBUG)root@localhost home]# rpm -i \ openvswitch2.17-debuginfo-2.17.0-67.el8fdp.x86_64.rpm \ openvswitch2.17-debugsource-2.17.0-67.el8fdp.x86_64.rpm \ kernel-devel-4.18.0-372.41.1.el8_6.x86_64.rpm @@ -588,7 +588,7 @@ Now the tool can be started. Here the above ``bridge_run()`` example is used: .. 
code-block:: console - [(DEBUG)root@sno-master home]# ./kernel_delay.py --start-trigger up:bridge_run --stop-trigger ur:bridge_run + [(DEBUG)root@localhost home]# ./kernel_delay.py --start-trigger up:bridge_run --stop-trigger ur:bridge_run # Start sampling (trigger@75279117343513) @2023-06-15T11:44:07.628372 (11:44:07 UTC) # Stop sampling (trigger@75279117443980) @2023-06-15T11:44:07.628529 (11:44:07 UTC) # Triggered sample dump, stop-start delta 100,467 ns @2023-06-15T11:44:07.628569 (11:44:07 UTC) From 6c082a8310d5aa1bcbeb70c5d15d6bf74b0e224e Mon Sep 17 00:00:00 2001 From: Xavier Simonart Date: Mon, 4 Mar 2024 16:21:59 +0100 Subject: [PATCH 618/833] conntrack: Fix flush not flushing all elements. On netdev datapath, when a ct element was cleaned, the cmap could be shrunk, potentially causing some elements to be skipped in the flush iteration. Fixes: 967bb5c5cd90 ("conntrack: Add rcu support.") Signed-off-by: Xavier Simonart Acked-by: Mike Pattrick Signed-off-by: Simon Horman --- lib/conntrack.c | 14 ++++-------- lib/conntrack.h | 2 +- tests/system-traffic.at | 47 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 11 deletions(-) diff --git a/lib/conntrack.c b/lib/conntrack.c index 8a7056bac3b..5786424f6d9 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -2651,25 +2651,19 @@ conntrack_dump_start(struct conntrack *ct, struct conntrack_dump *dump, dump->ct = ct; *ptot_bkts = 1; /* Need to clean up the callers.
*/ + dump->cursor = cmap_cursor_start(&ct->conns); return 0; } int conntrack_dump_next(struct conntrack_dump *dump, struct ct_dpif_entry *entry) { - struct conntrack *ct = dump->ct; long long now = time_msec(); - for (;;) { - struct cmap_node *cm_node = cmap_next_position(&ct->conns, - &dump->cm_pos); - if (!cm_node) { - break; - } - struct conn_key_node *keyn; - struct conn *conn; + struct conn_key_node *keyn; + struct conn *conn; - INIT_CONTAINER(keyn, cm_node, cm_node); + CMAP_CURSOR_FOR_EACH_CONTINUE (keyn, cm_node, &dump->cursor) { if (keyn->dir != CT_DIR_FWD) { continue; } diff --git a/lib/conntrack.h b/lib/conntrack.h index ee7da099e37..8ab8b00176e 100644 --- a/lib/conntrack.h +++ b/lib/conntrack.h @@ -107,8 +107,8 @@ struct conntrack_dump { struct conntrack *ct; unsigned bucket; union { - struct cmap_position cm_pos; struct hmap_position hmap_pos; + struct cmap_cursor cursor; }; bool filter_zone; uint16_t zone; diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 07d09b912e0..2d12d558ec2 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -8390,6 +8390,53 @@ AT_CHECK([ovs-pcap client.pcap | grep 000000002010000000002000], [0], [dnl OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([conntrack - Flush many conntrack entries by port]) +CHECK_CONNTRACK() +OVS_TRAFFIC_VSWITCHD_START() + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +AT_DATA([flows.txt], [dnl +priority=100,in_port=1,udp,action=ct(zone=1,commit),2 +]) + +AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) + +dnl 20 packets from port 1 and 1 packet from port 2. 
+flow_l3="\ + eth_src=50:54:00:00:00:09,eth_dst=50:54:00:00:00:0a,dl_type=0x0800,\ + nw_src=10.1.1.1,nw_dst=10.1.1.2,nw_proto=17,nw_ttl=64,nw_frag=no" + +for i in $(seq 1 20); do + frame=$(ovs-ofctl compose-packet --bare "$flow_l3, udp_src=1,udp_dst=$i") + AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=$frame actions=resubmit(,0)"]) +done +frame=$(ovs-ofctl compose-packet --bare "$flow_l3, udp_src=2,udp_dst=1") +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=$frame actions=resubmit(,0)"]) + +: > conntrack + +for i in $(seq 1 20); do + echo "udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=${i}),reply=(src=10.1.1.2,dst=10.1.1.1,sport=${i},dport=1),zone=1" >> conntrack +done +echo "udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=2,dport=1),reply=(src=10.1.1.2,dst=10.1.1.1,sport=1,dport=2),zone=1" >> conntrack + +sort conntrack > expout + +AT_CHECK([ovs-appctl dpctl/dump-conntrack zone=1 | grep -F "src=10.1.1.1," | sort ], [0], [expout]) + +dnl Check that flushing conntrack by port 1 flush all ct for port 1 but keeps ct for port 2. +AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=1 'ct_nw_proto=17,ct_tp_src=1']) +AT_CHECK([ovs-appctl dpctl/dump-conntrack zone=1 | grep -F "src=10.1.1.1," | sort ], [0], [dnl +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=2,dport=1),reply=(src=10.1.1.2,dst=10.1.1.1,sport=1,dport=2),zone=1 +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + AT_BANNER([IGMP]) AT_SETUP([IGMP - flood under normal action]) From d2a42f396338210ff7382fc3be9e6306d627db96 Mon Sep 17 00:00:00 2001 From: Timothy Redaelli Date: Tue, 5 Mar 2024 19:55:51 +0100 Subject: [PATCH 619/833] tests: Fix "SSL db: Implementation" test with openssl > 3.2.0. In OpenSSL 3.2.0 (81b741f) all the "alert" error messages were updated to replace "sslv3" with "ssl/tls". 
This commit updates the "SSL db: implementation" test to support both the pre-openssl 3.2.0 error message: "sslv3 alert certificate unknown" and the post-openssl 3.2.0 error message: "ssl/tls alert certificate unknown". Acked-by: Eelco Chaudron Signed-off-by: Timothy Redaelli Signed-off-by: Ilya Maximets --- tests/ovsdb-server.at | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/ovsdb-server.at b/tests/ovsdb-server.at index b8ccc4c8e2f..ce6d32aee1d 100644 --- a/tests/ovsdb-server.at +++ b/tests/ovsdb-server.at @@ -936,8 +936,10 @@ AT_CHECK_UNQUOTED( [ignore]) # The error message for being unable to negotiate a shared ciphersuite # is 'sslv3 alert handshake failure'. This is not the clearest message. +# In openssl 3.2.0 all the error messages were updated to replace 'sslv3' +# with 'ssl/tls'. AT_CHECK_UNQUOTED( - [grep "sslv3 alert handshake failure" output], [0], + [grep -E "(sslv3|ssl/tls) alert handshake failure" output], [0], [stdout], [ignore]) OVSDB_SERVER_SHUTDOWN([" From 07c2ef5cd00ae416ef2ebefeab3cbc4def745275 Mon Sep 17 00:00:00 2001 From: Timothy Redaelli Date: Tue, 5 Mar 2024 21:37:31 +0100 Subject: [PATCH 620/833] bfd: Improve state change log message. A log message like this one: 2024-01-09T06:45:17.201Z|00071|bfd(handler2)|INFO|ovn-0af536-0: BFD state change: down->up "Neighbor Signaled Session Down"->"Neighbor Signaled Session Down". can be hard to read since '->' usually represents a status change, but in this case the diagnostic code stays constant. Update the log message to avoid such ambiguity. 
The log message for the above event becomes: 2024-01-09T06:45:16.211Z|00026|bfd(handler3)|INFO|ovn-0af536-0: BFD state change: (bfd.SessionState: down, bfd.LocalDiag: "Neighbor Signaled Session Down") -> (bfd.SessionState: up, bfd.LocalDiag: "Neighbor Signaled Session Down") Reported-by: Alex Stupnikov Reported-at: https://bugzilla.redhat.com/2258496 Acked-by: Eelco Chaudron Signed-off-by: Timothy Redaelli Signed-off-by: Ilya Maximets --- lib/bfd.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/bfd.c b/lib/bfd.c index 9af258917bb..b8149e78973 100644 --- a/lib/bfd.c +++ b/lib/bfd.c @@ -1130,10 +1130,11 @@ bfd_set_state(struct bfd *bfd, enum state state, enum diag diag) if (!VLOG_DROP_INFO(&rl)) { struct ds ds = DS_EMPTY_INITIALIZER; - ds_put_format(&ds, "%s: BFD state change: %s->%s" - " \"%s\"->\"%s\".\n", + ds_put_format(&ds, "%s: BFD state change: (bfd.SessionState: %s," + " bfd.LocalDiag: \"%s\") -> (bfd.SessionState: %s," + " bfd.LocalDiag: \"%s\")\n", bfd->name, bfd_state_str(bfd->state), - bfd_state_str(state), bfd_diag_str(bfd->diag), + bfd_diag_str(bfd->diag), bfd_state_str(state), bfd_diag_str(diag)); bfd_put_details(&ds, bfd); VLOG_INFO("%s", ds_cstr(&ds)); From 2c4ffd2f8a23bed1f2141e81abcc68ccba129e3f Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 7 Mar 2024 20:39:41 +0100 Subject: [PATCH 621/833] netdev-dpdk: Dump packets that fail Tx preparation. It's hard to debug situations where driver rejects packets for some reason. Dumping out the mbuf should help with that. Sample output looks like this: |netdev_dpdk(pmd-c03/id:8)|DBG|ovs-p1: First invalid packet: dump mbuf at 0x1180bce140, iova=0x2cb7ce400, buf_len=2176 pkt_len=64, ol_flags=0x2, nb_segs=1, port=65535, ptype=0 segment at 0x1180bce140, data=0x1180bce580, len=90, off=384, refcnt=1 Dump data at [0x1180bce580], len=64 00000000: 33 33 00 00 00 16 AA 27 91 F9 4D 96 86 DD 60 00 | 33.....'..M...`.
00000010: 00 00 00 24 00 01 00 00 00 00 00 00 00 00 00 00 | ...$............ 00000020: 00 00 00 00 00 00 FF 02 00 00 00 00 00 00 00 00 | ................ 00000030: 00 00 00 00 00 16 3A 00 05 02 00 00 01 00 8F 00 | ......:......... Acked-by: Eelco Chaudron Acked-by: Kevin Traynor Signed-off-by: Ilya Maximets --- lib/netdev-dpdk.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 45f61930d40..9444c53b18f 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -2664,6 +2664,35 @@ netdev_dpdk_prep_hwol_batch(struct netdev_dpdk *dev, struct rte_mbuf **pkts, return cnt; } +static void +netdev_dpdk_mbuf_dump(const char *prefix, const char *message, + const struct rte_mbuf *mbuf) +{ + static struct vlog_rate_limit dump_rl = VLOG_RATE_LIMIT_INIT(5, 5); + char *response = NULL; + FILE *stream; + size_t size; + + if (VLOG_DROP_DBG(&dump_rl)) { + return; + } + + stream = open_memstream(&response, &size); + if (!stream) { + VLOG_ERR("Unable to open memstream for mbuf dump: %s.", + ovs_strerror(errno)); + return; + } + + rte_pktmbuf_dump(stream, mbuf, rte_pktmbuf_pkt_len(mbuf)); + + fclose(stream); + + VLOG_DBG(prefix ? "%s: %s:\n%s" : "%s%s:\n%s", + prefix ? prefix : "", message, response); + free(response); +} + /* Tries to transmit 'pkts' to txq 'qid' of device 'dev'. Takes ownership of * 'pkts', even in case of failure. * @@ -2680,6 +2709,8 @@ netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid, VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. " "Only %u/%u are valid: %s", netdev_get_name(&dev->up), nb_tx_prep, cnt, rte_strerror(rte_errno)); + netdev_dpdk_mbuf_dump(netdev_get_name(&dev->up), + "First invalid packet", pkts[nb_tx_prep]); } while (nb_tx != nb_tx_prep) { From 33f45ded67a2d524ccf54cf4bb79a38d8140f14b Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Wed, 6 Mar 2024 16:40:18 -0500 Subject: [PATCH 622/833] ovsdb: Don't iterate over rows on empty mutation. 
Previously when an empty mutation was used to count the number of rows in a table, OVSDB would iterate over all rows twice. First to perform an RBAC check, and then to perform the no-operation. This change adds a short circuit to mutate operations with no conditions and an empty mutation set, returning immediately. One notable change in functionality is not performing the RBAC check in this condition, as no mutation actually takes place. Reported-by: Terry Wilson Reported-at: https://issues.redhat.com/browse/FDP-359 Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- ovsdb/execution.c | 23 +++++++++++++++++- ovsdb/mutation.h | 6 +++++ tests/ovsdb-execution.at | 51 ++++++++++++++++++++++++++++++++++++++++ tests/ovsdb-rbac.at | 23 ++++++++++++++++++ 4 files changed, 102 insertions(+), 1 deletion(-) diff --git a/ovsdb/execution.c b/ovsdb/execution.c index 8c20c3b54a1..f4cc9e802ba 100644 --- a/ovsdb/execution.c +++ b/ovsdb/execution.c @@ -585,6 +585,16 @@ mutate_row_cb(const struct ovsdb_row *row, void *mr_) return *mr->error == NULL; } +static bool +count_row_cb(const struct ovsdb_row *row OVS_UNUSED, void *rc) +{ + size_t *row_count = rc; + + (*row_count)++; + + return true; +} + static struct ovsdb_error * ovsdb_execute_mutate(struct ovsdb_execution *x, struct ovsdb_parser *parser, struct json *result) @@ -609,7 +619,18 @@ ovsdb_execute_mutate(struct ovsdb_execution *x, struct ovsdb_parser *parser, error = ovsdb_condition_from_json(table->schema, where, x->symtab, &condition); } - if (!error) { + if (!error && ovsdb_mutation_set_empty(&mutations)) { + /* Special case with no mutations, just return the row count. 
*/ + if (ovsdb_condition_empty(&condition)) { + json_object_put(result, "count", + json_integer_create(hmap_count(&table->rows))); + } else { + size_t row_count = 0; + ovsdb_query(table, &condition, count_row_cb, &row_count); + json_object_put(result, "count", + json_integer_create(row_count)); + } + } else if (!error) { mr.n_matches = 0; mr.txn = x->txn; mr.mutations = &mutations; diff --git a/ovsdb/mutation.h b/ovsdb/mutation.h index 7566ef199d6..05d4a262a98 100644 --- a/ovsdb/mutation.h +++ b/ovsdb/mutation.h @@ -69,4 +69,10 @@ void ovsdb_mutation_set_destroy(struct ovsdb_mutation_set *); struct ovsdb_error *ovsdb_mutation_set_execute( struct ovsdb_row *, const struct ovsdb_mutation_set *) OVS_WARN_UNUSED_RESULT; +static inline bool ovsdb_mutation_set_empty( + const struct ovsdb_mutation_set *ms) +{ + return ms->n_mutations == 0; +} + #endif /* ovsdb/mutation.h */ diff --git a/tests/ovsdb-execution.at b/tests/ovsdb-execution.at index fd1c7a2395b..1ffa2b73854 100644 --- a/tests/ovsdb-execution.at +++ b/tests/ovsdb-execution.at @@ -1201,4 +1201,55 @@ OVSDB_CHECK_EXECUTION([garbage collection], [{"rows":[]}] ]])]) +OVSDB_CHECK_EXECUTION([insert rows, count with mutation], + [ordinal_schema], + [[[["ordinals", + {"op": "insert", + "table": "ordinals", + "row": {"number": 0, "name": "zero"}, + "uuid-name": "first"}]]], + [[["ordinals", + {"op": "insert", + "table": "ordinals", + "row": {"number": 1, "name": "one"}, + "uuid-name": "first"}]]], + [[["ordinals", + {"op": "mutate", + "table": "ordinals", + "where": [["name", "==", "zero"]], + "mutations": []}]]], + [[["ordinals", + {"op": "mutate", + "table": "ordinals", + "where": [["name", "==", "one"]], + "mutations": []}]]], + [[["ordinals", + {"op": "insert", + "table": "ordinals", + "row": {"number": 2, "name": "one"}, + "uuid-name": "first"}]]], + [[["ordinals", + {"op": "mutate", + "table": "ordinals", + "where": [["name", "==", "one"]], + "mutations": []}]]], + [[["ordinals", + {"op": "delete", + "table": 
"ordinals", + "where": [["name", "==", "zero"]]}]]], + [[["ordinals", + {"op": "mutate", + "table": "ordinals", + "where": [], + "mutations": []}]]]], + [[[{"uuid":["uuid","<0>"]}] +[{"uuid":["uuid","<1>"]}] +[{"count":1}] +[{"count":1}] +[{"uuid":["uuid","<2>"]}] +[{"count":2}] +[{"count":1}] +[{"count":2}] +]]) + EXECUTION_EXAMPLES diff --git a/tests/ovsdb-rbac.at b/tests/ovsdb-rbac.at index 3172e4bf558..c1e5a9134eb 100644 --- a/tests/ovsdb-rbac.at +++ b/tests/ovsdb-rbac.at @@ -355,6 +355,29 @@ AT_CHECK([uuidfilt stdout], [0], [[[{"details":"RBAC rules for client \"client-2 ], [ignore]) # Test 14: +# Count the rows in other_colors. This should pass even though the RBAC +# authorization would fail because "client-2" does not match the +# "creator" column for this row. Because the RBAC check is bypassed when +# mutation is empty. +AT_CHECK([ovsdb-client transact ssl:127.0.0.1:$SSL_PORT \ + --private-key=$RBAC_PKIDIR/client-2-privkey.pem \ + --certificate=$RBAC_PKIDIR/client-2-cert.pem \ + --ca-cert=$RBAC_PKIDIR/pki/switchca/cacert.pem \ + ['["mydb", + {"op": "mutate", + "table": "other_colors", + "where": [], + "mutations": []}, + {"op": "mutate", + "table": "other_colors", + "where": [["name", "==", "seafoam"]], + "mutations": []} + ]']], [0], [stdout], [ignore]) +cat stdout >> output +AT_CHECK([uuidfilt stdout], [0], [[[{"count":1},{"count":1}]] +], [ignore]) + +# Test 15: # Attempt to delete a row from the "other_colors" table. This should pass # the RBAC authorization test because "client-1" does matches the # "creator" column for this row. From fa0dfa18d5344c82bc8387b4f79f4257a59d6f58 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 12 Mar 2024 01:50:46 +0100 Subject: [PATCH 623/833] github: Reduce ASLR entropy to be compatible with asan in llvm 14. 
Starting with image version 20240310.1.0, GitHub runners are using 32-bit entropy for ASLR: $ sudo sysctl -a | grep vm.mmap.rnd vm.mmap_rnd_bits = 32 vm.mmap_rnd_compat_bits = 16 This breaks all the asan-enabled builds, because older asan gets confused by memory mappings and crashes with segmentation fault. The issue is fixed in newer releases of llvm: https://github.com/llvm/llvm-project/commit/fb77ca05ffb4f8e666878f2f6718a9fb4d686839 https://reviews.llvm.org/D148280 But these are not available in Ubuntu 22.04 image. This should be fixed by GitHub, but until new images are available reducing ASLR entropy manually to 28 bits to make builds work. Reported-at: https://github.com/actions/runner-images/issues/9491 Acked-by: Eelco Chaudron Acked-by: Dumitru Ceara Acked-by: Aaron Conole Signed-off-by: Ilya Maximets --- .github/workflows/build-and-test.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index fc755814861..6f5139304ae 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -238,6 +238,14 @@ jobs: if: matrix.m32 != '' run: sudo apt install -y gcc-multilib + - name: Reduce ASLR entropy + if: matrix.sanitizers != '' + # Asan in llvm 14 provided in ubuntu-22.04 is incompatible with + # high-entropy ASLR configured in much newer kernels that GitHub + # runners are using leading to random crashes: + # https://github.com/actions/runner-images/issues/9491 + run: sudo sysctl -w vm.mmap_rnd_bits=28 + - name: prepare run: ./.ci/linux-prepare.sh From 7df30c86ce12833cf7f9bfc71c166c34692ceff4 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 11 Mar 2024 19:32:30 +0100 Subject: [PATCH 624/833] netdev-dpdk: Clean up all marker flags if no offloads requested. Some drivers (primarily, Intel ones) do not expect any marking flags being set if no offloads are requested. 
If these flags are present, driver will fail Tx preparation or behave abnormally. For example, ixgbe driver will refuse to process the packet with only RTE_MBUF_F_TX_TUNNEL_GENEVE and RTE_MBUF_F_TX_OUTER_IPV4 set. This pretty much breaks Geneve tunnels on these cards. An extra check is added to make sure we don't have any unexpected Tx offload flags set. Fixes: 084c8087292c ("userspace: Support VXLAN and GENEVE TSO.") Reported-at: https://github.com/openvswitch/ovs-issues/issues/321 Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/netdev-dpdk.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 9444c53b18f..8c52accff93 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -607,6 +607,9 @@ int netdev_dpdk_get_vid(const struct netdev_dpdk *dev); struct ingress_policer * netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev); +static void netdev_dpdk_mbuf_dump(const char *prefix, const char *message, + const struct rte_mbuf *); + static bool is_dpdk_class(const struct netdev_class *class) { @@ -2569,9 +2572,29 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) struct dp_packet *pkt = CONTAINER_OF(mbuf, struct dp_packet, mbuf); struct tcp_header *th; - if (!(mbuf->ol_flags & (RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_L4_MASK - | RTE_MBUF_F_TX_TCP_SEG))) { - mbuf->ol_flags &= ~(RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IPV6); + const uint64_t all_requests = (RTE_MBUF_F_TX_IP_CKSUM | + RTE_MBUF_F_TX_L4_MASK | + RTE_MBUF_F_TX_OUTER_IP_CKSUM | + RTE_MBUF_F_TX_OUTER_UDP_CKSUM | + RTE_MBUF_F_TX_TCP_SEG); + const uint64_t all_marks = (RTE_MBUF_F_TX_IPV4 | + RTE_MBUF_F_TX_IPV6 | + RTE_MBUF_F_TX_OUTER_IPV4 | + RTE_MBUF_F_TX_OUTER_IPV6 | + RTE_MBUF_F_TX_TUNNEL_MASK); + + if (!(mbuf->ol_flags & all_requests)) { + /* No offloads requested, no marks should be set. 
*/ + mbuf->ol_flags &= ~all_marks; + + uint64_t unexpected = mbuf->ol_flags & RTE_MBUF_F_TX_OFFLOAD_MASK; + if (OVS_UNLIKELY(unexpected)) { + VLOG_WARN_RL(&rl, "%s: Unexpected Tx offload flags: %#"PRIx64, + netdev_get_name(&dev->up), unexpected); + netdev_dpdk_mbuf_dump(netdev_get_name(&dev->up), + "Packet with unexpected ol_flags", mbuf); + return false; + } return true; } From f8809760fcc237ca0677b81166831400b141878f Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 13 Mar 2024 18:29:43 +0100 Subject: [PATCH 625/833] netdev-dpdk: Clear inner packet marks if no inner offloads requested. In some cases only outer offloads may be requested for a tunneled packet. In this case there is no need to mark the type of an inner packet. Clean these flags up to avoid potential confusion of DPDK drivers. Fixes: 084c8087292c ("userspace: Support VXLAN and GENEVE TSO.") Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/netdev-dpdk.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 8c52accff93..270d3e11cb5 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -2607,6 +2607,15 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) (char *) dp_packet_eth(pkt); mbuf->outer_l3_len = (char *) dp_packet_l4(pkt) - (char *) dp_packet_l3(pkt); + + /* If neither inner checksums nor TSO is requested, inner marks + * should not be set. */ + if (!(mbuf->ol_flags & (RTE_MBUF_F_TX_IP_CKSUM | + RTE_MBUF_F_TX_L4_MASK | + RTE_MBUF_F_TX_TCP_SEG))) { + mbuf->ol_flags &= ~(RTE_MBUF_F_TX_IPV4 | + RTE_MBUF_F_TX_IPV6); + } } else { mbuf->l2_len = (char *) dp_packet_l3(pkt) - (char *) dp_packet_eth(pkt); From 05e9f05d146a2a635542c40d3146b2c20ab72805 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 13 Mar 2024 18:29:44 +0100 Subject: [PATCH 626/833] netdev-dpdk: Fix TCP check during Tx offload preparation. 
RTE_MBUF_F_TX_TCP_CKSUM is not a flag, but a 2-bit field, so checking it with a simple binary 'and' is incorrect. For example, this check will succeed for a packet with UDP checksum requested as well. Fix the check to avoid wrongly initializing tso_segsz and potentially accessing UDP header via TCP structure pointer. The IPv4 checksum flag has to be set for any L4 checksum request, regardless of the type, so moving this check out of the TCP condition. Fixes: 8b5fe2dc6080 ("userspace: Add Generic Segmentation Offloading.") Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/netdev-dpdk.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 270d3e11cb5..1ae2ef3981d 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -2634,7 +2634,7 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) } } - if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_CKSUM) { + if ((mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK) == RTE_MBUF_F_TX_TCP_CKSUM) { if (!th) { VLOG_WARN_RL(&rl, "%s: TCP offloading without L4 header" " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len); @@ -2661,11 +2661,14 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) return false; } } + } - if (mbuf->ol_flags & RTE_MBUF_F_TX_IPV4) { - mbuf->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM; - } + /* If L4 checksum is requested, IPv4 should be requested as well. */ + if (mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK + && mbuf->ol_flags & RTE_MBUF_F_TX_IPV4) { + mbuf->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM; } + return true; } From 0ce82ac45e6828c5e1531b2ada044b7abbbadea5 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 13 Mar 2024 18:29:45 +0100 Subject: [PATCH 627/833] netdev-dpdk: Fix tunnel type check during Tx offload preparation. Tunnel types are not flags, but 4-bit fields, so checking them with a simple binary 'and' is incorrect and may produce false-positive matches.
While the current implementation is unlikely to cause any issues today, since both RTE_MBUF_F_TX_TUNNEL_VXLAN and RTE_MBUF_F_TX_TUNNEL_GENEVE only have 1 bit set, it is risky to have this code and it may lead to problems if we add support for other tunnel types in the future. Use proper field checks instead. Also adding a warning for unexpected tunnel types in case something goes wrong. Fixes: 084c8087292c ("userspace: Support VXLAN and GENEVE TSO.") Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/netdev-dpdk.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 1ae2ef3981d..29a6bf0328e 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -2601,8 +2601,9 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) /* If packet is vxlan or geneve tunnel packet, calculate outer * l2 len and outer l3 len. Inner l2/l3/l4 len are calculated * before. */ - if (mbuf->ol_flags & - (RTE_MBUF_F_TX_TUNNEL_GENEVE | RTE_MBUF_F_TX_TUNNEL_VXLAN)) { + const uint64_t tunnel_type = mbuf->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK; + if (tunnel_type == RTE_MBUF_F_TX_TUNNEL_GENEVE || + tunnel_type == RTE_MBUF_F_TX_TUNNEL_VXLAN) { mbuf->outer_l2_len = (char *) dp_packet_l3(pkt) - (char *) dp_packet_eth(pkt); mbuf->outer_l3_len = (char *) dp_packet_l4(pkt) - @@ -2616,6 +2617,12 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) mbuf->ol_flags &= ~(RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IPV6); } + } else if (OVS_UNLIKELY(tunnel_type)) { + VLOG_WARN_RL(&rl, "%s: Unexpected tunnel type: %#"PRIx64, + netdev_get_name(&dev->up), tunnel_type); + netdev_dpdk_mbuf_dump(netdev_get_name(&dev->up), + "Packet with unexpected tunnel type", mbuf); + return false; } else { mbuf->l2_len = (char *) dp_packet_l3(pkt) - (char *) dp_packet_eth(pkt); @@ -2641,8 +2648,7 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) return false; } - if 
(mbuf->ol_flags & (RTE_MBUF_F_TX_TUNNEL_GENEVE | - RTE_MBUF_F_TX_TUNNEL_VXLAN)) { + if (tunnel_type) { mbuf->tso_segsz = dev->mtu - mbuf->l2_len - mbuf->l3_len - mbuf->l4_len - mbuf->outer_l3_len; } else { From 86b9e653ef226985b0fef1be7c03d8be305fa0b7 Mon Sep 17 00:00:00 2001 From: Kevin Sprague Date: Tue, 5 Mar 2024 10:44:41 -0500 Subject: [PATCH 628/833] revalidator: Add a USDT probe during flow deletion with purge reason. During normal operations, it is useful to understand when a particular flow gets removed from the system. This can be useful when debugging performance issues tied to ofproto flow changes, trying to determine deployed traffic patterns, or while debugging dynamic systems where ports come and go. Prior to this change, there was a lack of visibility around flow expiration. The existing debugging infrastructure could tell us when a flow was added to the datapath, but not when it was removed or why. This change introduces a USDT probe at the point where the revalidator determines that the flow should be removed. Additionally, we track the reason for the flow eviction and provide that information as well. With this change, we can track the complete flow lifecycle for the netlink datapath by hooking the upcall tracepoint in kernel, the flow put USDT, and the revalidator USDT, letting us watch as flows are added and removed from the kernel datapath. This change only enables this information via USDT probe, so it won't be possible to access this information any other way (see: Documentation/topics/usdt-probes.rst). Also included is a script (utilities/usdt-scripts/flow_reval_monitor.py) which serves as a demonstration of how the new USDT probe might be used going forward. 
Co-authored-by: Aaron Conole Acked-by: Han Zhou Signed-off-by: Aaron Conole Signed-off-by: Kevin Sprague Signed-off-by: Eelco Chaudron --- Documentation/topics/usdt-probes.rst | 43 + ofproto/ofproto-dpif-upcall.c | 44 +- utilities/automake.mk | 3 + utilities/usdt-scripts/flow_reval_monitor.py | 977 +++++++++++++++++++ 4 files changed, 1061 insertions(+), 6 deletions(-) create mode 100755 utilities/usdt-scripts/flow_reval_monitor.py diff --git a/Documentation/topics/usdt-probes.rst b/Documentation/topics/usdt-probes.rst index e527f43bab6..b9a6c54b29f 100644 --- a/Documentation/topics/usdt-probes.rst +++ b/Documentation/topics/usdt-probes.rst @@ -214,8 +214,10 @@ Available probes in ``ovs_vswitchd``: - dpif_recv:recv_upcall - main:poll_block - main:run_start +- revalidate:flow_result - revalidate_ukey\_\_:entry - revalidate_ukey\_\_:exit +- revalidator_sweep\_\_:flow_result - udpif_revalidator:start_dump - udpif_revalidator:sweep_done @@ -443,6 +445,47 @@ sweep phase was completed. - ``utilities/usdt-scripts/reval_monitor.py`` +probe revalidate:flow_result +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Description**: +This probe is triggered when the revalidator has executed on a particular +flow key to make a determination whether to evict a flow, and the cause +for eviction. The revalidator runs periodically, and this probe will only +be triggered when a flow is flagged for revalidation. + +**Arguments**: + +- *arg0*: ``(struct udpif *) udpif`` +- *arg1*: ``(struct udpif_key *) ukey`` +- *arg2*: ``(enum reval_result) result`` +- *arg3*: ``(enum flow_del_reason) del_reason`` + +**Script references**: + +- ``utilities/usdt-scripts/flow_reval_monitor.py`` + + +probe revalidator_sweep\_\_:flow_result +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Description**: +This probe is placed in the path of the revalidator sweep, and is executed +under the condition that a flow entry is in an unexpected state, or the +flows were asked to be purged due to a user action. 
+ +**Arguments**: + +- *arg0*: ``(struct udpif *) udpif`` +- *arg1*: ``(struct udpif_key *) ukey`` +- *arg2*: ``(enum reval_result) result`` +- *arg3*: ``(enum flow_del_reason) del_reason`` + +**Script references**: + +- ``utilities/usdt-scripts/flow_reval_monitor.py`` + + Adding your own probes ---------------------- diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c index 9a5c5c29ce6..d8819563662 100644 --- a/ofproto/ofproto-dpif-upcall.c +++ b/ofproto/ofproto-dpif-upcall.c @@ -269,6 +269,20 @@ enum ukey_state { }; #define N_UKEY_STATES (UKEY_DELETED + 1) +enum flow_del_reason { + FDR_NONE = 0, /* No deletion reason for the flow. */ + FDR_AVOID_CACHING, /* Flow deleted to avoid caching. */ + FDR_BAD_ODP_FIT, /* The flow had a bad ODP flow fit. */ + FDR_FLOW_IDLE, /* The flow went unused and was deleted. */ + FDR_FLOW_LIMIT, /* All flows being killed. */ + FDR_FLOW_WILDCARDED, /* The flow needed a narrower wildcard mask. */ + FDR_NO_OFPROTO, /* The flow didn't have an associated ofproto. */ + FDR_PURGE, /* User action caused flows to be killed. */ + FDR_TOO_EXPENSIVE, /* The flow was too expensive to revalidate. */ + FDR_UPDATE_FAIL, /* Flow state transition was unexpected. */ + FDR_XLATION_ERROR, /* There was an error translating the flow. */ +}; + /* 'udpif_key's are responsible for tracking the little bit of state udpif * needs to do flow expiration which can't be pulled directly from the * datapath. 
They may be created by any handler or revalidator thread at any @@ -2279,7 +2293,8 @@ populate_xcache(struct udpif *udpif, struct udpif_key *ukey, static enum reval_result revalidate_ukey__(struct udpif *udpif, const struct udpif_key *ukey, uint16_t tcp_flags, struct ofpbuf *odp_actions, - struct recirc_refs *recircs, struct xlate_cache *xcache) + struct recirc_refs *recircs, struct xlate_cache *xcache, + enum flow_del_reason *del_reason) { struct xlate_out *xoutp; struct netflow *netflow; @@ -2300,11 +2315,13 @@ revalidate_ukey__(struct udpif *udpif, const struct udpif_key *ukey, netflow = NULL; if (xlate_ukey(udpif, ukey, tcp_flags, &ctx)) { + *del_reason = FDR_XLATION_ERROR; goto exit; } xoutp = &ctx.xout; if (xoutp->avoid_caching) { + *del_reason = FDR_AVOID_CACHING; goto exit; } @@ -2318,6 +2335,7 @@ revalidate_ukey__(struct udpif *udpif, const struct udpif_key *ukey, ofpbuf_clear(odp_actions); if (!ofproto) { + *del_reason = FDR_NO_OFPROTO; goto exit; } @@ -2329,6 +2347,7 @@ revalidate_ukey__(struct udpif *udpif, const struct udpif_key *ukey, if (odp_flow_key_to_mask(ukey->mask, ukey->mask_len, &dp_mask, &ctx.flow, NULL) == ODP_FIT_ERROR) { + *del_reason = FDR_BAD_ODP_FIT; goto exit; } @@ -2338,6 +2357,7 @@ revalidate_ukey__(struct udpif *udpif, const struct udpif_key *ukey, * down. Note that we do not know if the datapath has ignored any of the * wildcarded bits, so we may be overly conservative here. 
*/ if (flow_wildcards_has_extra(&dp_mask, ctx.wc)) { + *del_reason = FDR_FLOW_WILDCARDED; goto exit; } @@ -2407,7 +2427,7 @@ static enum reval_result revalidate_ukey(struct udpif *udpif, struct udpif_key *ukey, const struct dpif_flow_stats *stats, struct ofpbuf *odp_actions, uint64_t reval_seq, - struct recirc_refs *recircs) + struct recirc_refs *recircs, enum flow_del_reason *del_reason) OVS_REQUIRES(ukey->mutex) { bool need_revalidate = ukey->reval_seq != reval_seq; @@ -2437,8 +2457,12 @@ revalidate_ukey(struct udpif *udpif, struct udpif_key *ukey, xlate_cache_clear(ukey->xcache); } result = revalidate_ukey__(udpif, ukey, push.tcp_flags, - odp_actions, recircs, ukey->xcache); - } /* else delete; too expensive to revalidate */ + odp_actions, recircs, ukey->xcache, + del_reason); + } else { + /* Delete, since it is too expensive to revalidate. */ + *del_reason = FDR_TOO_EXPENSIVE; + } } else if (!push.n_packets || ukey->xcache || !populate_xcache(udpif, ukey, push.tcp_flags)) { result = UKEY_KEEP; @@ -2838,6 +2862,7 @@ revalidate(struct revalidator *revalidator) for (f = flows; f < &flows[n_dumped]; f++) { long long int used = f->stats.used; struct recirc_refs recircs = RECIRC_REFS_EMPTY_INITIALIZER; + enum flow_del_reason del_reason = FDR_NONE; struct dpif_flow_stats stats = f->stats; enum reval_result result; struct udpif_key *ukey; @@ -2912,9 +2937,10 @@ revalidate(struct revalidator *revalidator) } if (kill_them_all || (used && used < now - max_idle)) { result = UKEY_DELETE; + del_reason = (kill_them_all) ? FDR_FLOW_LIMIT : FDR_FLOW_IDLE; } else { result = revalidate_ukey(udpif, ukey, &stats, &odp_actions, - reval_seq, &recircs); + reval_seq, &recircs, &del_reason); } ukey->dump_seq = dump_seq; @@ -2923,6 +2949,8 @@ revalidate(struct revalidator *revalidator) udpif_update_flow_pps(udpif, ukey, f); } + OVS_USDT_PROBE(revalidate, flow_result, udpif, ukey, result, + del_reason); if (result != UKEY_KEEP) { /* Takes ownership of 'recircs'. 
*/ reval_op_init(&ops[n_ops++], result, udpif, ukey, &recircs, @@ -2975,6 +3003,7 @@ revalidator_sweep__(struct revalidator *revalidator, bool purge) size_t n_ops = 0; CMAP_FOR_EACH(ukey, cmap_node, &umap->cmap) { + enum flow_del_reason del_reason = FDR_NONE; enum ukey_state ukey_state; /* Handler threads could be holding a ukey lock while it installs a @@ -2993,6 +3022,7 @@ revalidator_sweep__(struct revalidator *revalidator, bool purge) if (purge || ukey_state == UKEY_INCONSISTENT) { result = UKEY_DELETE; + del_reason = purge ? FDR_PURGE : FDR_UPDATE_FAIL; } else if (!seq_mismatch) { result = UKEY_KEEP; } else { @@ -3000,13 +3030,15 @@ revalidator_sweep__(struct revalidator *revalidator, bool purge) COVERAGE_INC(revalidate_missed_dp_flow); memcpy(&stats, &ukey->stats, sizeof stats); result = revalidate_ukey(udpif, ukey, &stats, &odp_actions, - reval_seq, &recircs); + reval_seq, &recircs, &del_reason); } if (result != UKEY_KEEP) { /* Clears 'recircs' if filled by revalidate_ukey(). */ reval_op_init(&ops[n_ops++], result, udpif, ukey, &recircs, &odp_actions); } + OVS_USDT_PROBE(revalidator_sweep__, flow_sweep_result, udpif, + ukey, result, del_reason); } ovs_mutex_unlock(&ukey->mutex); diff --git a/utilities/automake.mk b/utilities/automake.mk index 9a2114df40a..146b8c37fbb 100644 --- a/utilities/automake.mk +++ b/utilities/automake.mk @@ -23,6 +23,7 @@ scripts_DATA += utilities/ovs-lib usdt_SCRIPTS += \ utilities/usdt-scripts/bridge_loop.bt \ utilities/usdt-scripts/dpif_nl_exec_monitor.py \ + utilities/usdt-scripts/flow_reval_monitor.py \ utilities/usdt-scripts/kernel_delay.py \ utilities/usdt-scripts/kernel_delay.rst \ utilities/usdt-scripts/reval_monitor.py \ @@ -72,6 +73,7 @@ EXTRA_DIST += \ utilities/docker/debian/build-kernel-modules.sh \ utilities/usdt-scripts/bridge_loop.bt \ utilities/usdt-scripts/dpif_nl_exec_monitor.py \ + utilities/usdt-scripts/flow_reval_monitor.py \ utilities/usdt-scripts/kernel_delay.py \ utilities/usdt-scripts/kernel_delay.rst \ 
utilities/usdt-scripts/reval_monitor.py \ @@ -146,6 +148,7 @@ FLAKE8_PYFILES += utilities/ovs-pcap.in \ utilities/ovs-tcpdump.in \ utilities/ovs-pipegen.py \ utilities/usdt-scripts/dpif_nl_exec_monitor.py \ + utilities/usdt-scripts/flow_reval_monitor.py \ utilities/usdt-scripts/upcall_monitor.py \ utilities/usdt-scripts/upcall_cost.py diff --git a/utilities/usdt-scripts/flow_reval_monitor.py b/utilities/usdt-scripts/flow_reval_monitor.py new file mode 100755 index 00000000000..534ba8fa216 --- /dev/null +++ b/utilities/usdt-scripts/flow_reval_monitor.py @@ -0,0 +1,977 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2022-2024 Redhat, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Script information: +# ------------------- +# flow_reval_monitor.py uses the dpif_netlink_operate:flow_put and +# revalidate:flow_result USDT probes to monitor flow lifetimes and +# expiration events. By default, this will show all flow_put and flow +# expiration events, along with their reasons. This will look like so: +# +# TID TIME UFID EVENT/REASON +# 71828 1549.119959874 39f0f28f-33... Insert (put) flow to ovs kernel module. +# 71828 1549.420877223 850db41c-47... Insert (put) flow to ovs kernel module. +# 71828 1550.476923456 5bacfca9-fe... Insert (put) flow to ovs kernel module. +# 71832 1559.650192299 850db41c-47... Idle flow timed out +# 71832 1561.153332825 39f0f28f-33... Idle flow timed out +# 71832 1572.684316304 5bacfca9-fe...
Idle flow timed out +# +# Flow key data can be printed using the --flow-keys option. This will +# print the equivalent datapath flow string. +# +# When filtering flows, the syntax is the same as used by +# `ovs-appctl dpctl/add-flow`. +# +# For a complete list of options, please use the '--help' or '-h' argument. +# +# Examples: +# +# To use the script on a running ovs-vswitchd to see flow keys and expiration +# events for flows with an ipv4 source of 192.168.10.10: +# $ ./flow_reval_monitor.py --flow-keys --filter-flows \ +# "ipv4(src=192.168.10.10)" +# TIME UFID EVENT/REASON +# 105082.457322742 ufid:f76fc899-376d-466b-bc74-0000b933eb97 flow_put +# ufid:f76fc899-376d-466b-bc74-0000b933eb97 has the following flow information: +# in_port(2), +# eth(src=0e:04:47:fc:74:51, dst=da:dc:c5:69:05:d7), \ +# eth_type(0x800), \ +# ipv4(src=192.168.10.10, dst=192.168.10.30, proto=1, tos=0, ttl=64,[...]), +# icmp(type=8, code=0) +# 105092.635450202 ufid:f76fc899-376d-466b-bc74-0000b933eb97 Flow timed out +# +# Notes: +# 1) No options are needed to attach when there is a single running instance +# of ovs-vswitchd. +# 2) If you're using the flow filtering option, it will only track flows that +# have been upcalled since the script began running. +# 3) When using the flow filtering option, the key size will likely need to +# be expanded to match on all the fields in the message. The default is +# kept small to keep the buffer copy sizes down when displaying +# flows (-k), but is hardcoded to 2048 when an actual filter (-l) is +# applied +# 4) The flow filtering format is a simplified form of the ODP syntax, and +# does not support masked matches, which means you will need to filter +# on exact details. The fields present are dependent on how the +# classifier and OFP rules form the ODP rules - not all fields may be +# present in a particular flow. +# 5) The flow_put filtering only happens for flows installed into the ovs +# kernel module. 
This means flows taking the HW offload path (ie: tc), +# or on DPDK side won't get matched. + +try: + from bcc import BPF + from bcc import USDT + from bcc import USDTException +except ModuleNotFoundError: + print("ERROR: Can't find the BPF Compiler Collection Tools.") + print("Please install them before running this script.") + exit(1) + +from enum import IntEnum +from ipaddress import IPv4Address, IPv6Address +from pathlib import Path + +import argparse +import psutil +import re +import struct +import subprocess +import sys + +# +# eBPF source code +# +bpf_src = """ +#include + +#define MAX_KEY +#define FLOW_FILTER + +enum probe { }; + + + +struct event_t { + u64 ts; + u32 pid; + u32 result; + u32 reason; + u32 ufid[4]; + u64 key_size; + unsigned char key[MAX_KEY]; + enum probe probe; +}; + +BPF_HASH(watchlist, ovs_u128); +BPF_RINGBUF_OUTPUT(events, ); +BPF_TABLE("percpu_array", uint32_t, uint64_t, dropcnt, 1); + +/* Hack to make a 'static' like storage object. */ +BPF_TABLE("percpu_array", uint32_t, struct udpif_key, udpk, 1); + +static struct event_t *get_event(enum probe p) { + struct event_t *event = events.ringbuf_reserve(sizeof(struct event_t)); + + if (!event) { + dropcnt.increment(0); + return NULL; + } + + event->probe = p; + event->ts = bpf_ktime_get_ns(); + event->pid = bpf_get_current_pid_tgid(); + + return event; +} + +static int emit_flow_result(struct udpif_key *ukey, ovs_u128 ufid, + u32 result, u32 reason) { + struct event_t *event = NULL; + u64 *ufid_present = NULL; + + ufid_present = watchlist.lookup(&ufid); + if (FLOW_FILTER && !ufid_present) { + return 0; + } + + event = get_event(FLOW_RESULT); + if (!event) { + /* If we can't reserve the space in the ring buffer, return 1. 
*/ + return 1; + } + + event->result = result; + event->reason = reason; + bpf_probe_read(&event->ufid, sizeof ufid, &ufid); + events.ringbuf_submit(event, 0); + + return 0; +} + +int usdt__flow_result(struct pt_regs *ctx) { + struct udpif_key *ukey = NULL; + u32 reason = 0; + u32 result = 0; + ovs_u128 ufid; + u32 zero = 0; + + ukey = udpk.lookup(&zero); + if (!ukey) { + return 1; + } + bpf_usdt_readarg_p(2, ctx, ukey, sizeof(struct udpif_key)); + bpf_usdt_readarg(3, ctx, &result); + bpf_usdt_readarg(4, ctx, &reason); + ufid = ukey->ufid; + + return emit_flow_result(ukey, ufid, result, reason); +} + +int usdt__flow_sweep_result(struct pt_regs *ctx) { + struct udpif_key *ukey = NULL; + u32 reason = 0; + u32 result = 0; + ovs_u128 ufid; + u32 zero = 0; + + ukey = udpk.lookup(&zero); + if (!ukey) { + return 1; + } + bpf_usdt_readarg_p(2, ctx, ukey, sizeof(struct udpif_key)); + bpf_usdt_readarg(3, ctx, &result); + bpf_usdt_readarg(4, ctx, &reason); + ufid = ukey->ufid; + + return emit_flow_result(ukey, ufid, result, reason); +} + +int usdt__op_flow_put(struct pt_regs *ctx) { + struct dpif_flow_put put; + ovs_u128 ufid; + + struct event_t *event = get_event(OP_FLOW_PUT); + if (!event) { + /* If we can't reserve the space in the ring buffer, return 1. 
*/ + return 1; + } + + bpf_usdt_readarg_p(2, ctx, &put, sizeof put); + bpf_probe_read(&event->ufid, sizeof event->ufid, put.ufid); + bpf_probe_read(&ufid, sizeof ufid, &event->ufid); + if (put.key_len > MAX_KEY) { + put.key_len = MAX_KEY; + } + event->key_size = put.key_len; + bpf_probe_read(&event->key, put.key_len, put.key); + event->reason = 0; + events.ringbuf_submit(event, 0); + + watchlist.increment(ufid); + return 0; +} +""" + +Event = IntEnum("Event", ["OP_FLOW_PUT", "FLOW_RESULT"], start=0) +RevalResult = IntEnum( + "reval_result", + [ + "UKEY_KEEP", + "UKEY_DELETE", + "UKEY_MODIFY", + ], + start=0, +) +FdrReasons = IntEnum( + "flow_del_reason", + [ + "FDR_NONE", + "FDR_AVOID_CACHING", + "FDR_BAD_ODP_FIT", + "FDR_FLOW_IDLE", + "FDR_FLOW_LIMIT", + "FDR_FLOW_WILDCARDED", + "FDR_NO_OFPROTO", + "FDR_PURGE", + "FDR_TOO_EXPENSIVE", + "FDR_UPDATE_FAIL", + "FDR_XLATION_ERROR", + ], + start=0, +) + +FdrReasonStrings = [ + "No deletion reason", + "Cache avoidance flag set", + "Bad ODP flow fit", + "Idle flow timed out", + "Kill all flows condition detected", + "Mask too wide - need narrower match", + "No matching ofproto rules", + "Too expensive to revalidate", + "Purged with user action", + "Flow state inconsistent after updates", + "Flow translation error", +] + + +def err(msg, code=-1): + """Prints an error to stderr and exits""" + + print(msg, file=sys.stderr) + sys.exit(code) + + +def run_program(command): + """Invokes a new process and returns stdout. 
Note that this will honor + the PATH environment variable, so best to use it sparingly, or with a + full path to binary.""" + + try: + process = subprocess.run( + command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + encoding="utf8", + check=True, + ) + + except subprocess.CalledProcessError as perror: + return perror.returncode, perror.stdout + + return 0, process.stdout + + +def get_ovs_definitions(objects, pahole="pahole", pid=None): + """Uses `pahole` or similar utility to pull object definitions from a + running OVS process. The objects argument can either be a string + or can be a list of strings. Optionally, pass a specific `pahole` + binary to use rather than the default. PID needs to be set.""" + + if pid is None: + raise ValueError("A valid pid value should be supplied!") + + if not isinstance(objects, list): + objects = [objects] + + if len(objects) == 0: + raise ValueError("Must supply at least one object!") + + vswitchd = Path(f"/proc/{pid}/exe").resolve() + + object_str = ",".join(objects) + + def run_pahole(debug_file): + """Helper designed for running pahole, or something with compatible + output""" + + error, result = run_program( + [pahole, "-C", object_str, "--compile", debug_file] + ) + + if error: + if f"pahole: {debug_file}: Invalid argument" not in result: + err( + "ERROR: Pahole failed to get ovs-vswitchd data " + "structures!\n{}".format( + re.sub( + "^", " " * 7, result.rstrip(), flags=re.MULTILINE + ) + ) + ) + + return None + + if bool(re.search("pahole: type .* not found", result)): + return None + + return result + + def run_readelf(bin_file): + """Helper designed for running readelf or something with compatible + output""" + + error, result = run_program( + ["readelf", "-n", "--debug-dump=links", bin_file] + ) + + if error: + err( + "ERROR: Failed 'readelf' on \"{}\"!\n{}".format( + bin_file, re.sub("^", " " * 7, result, flags=re.MULTILINE) + ) + ) + + return result + + def get_debug_file(bin_file): + """Runs readelf 
against the binary, and attempts to find the associated + debuginfo file.""" + elf_result = run_readelf(bin_file) + match = re.search("Build ID: ([0-9a-fA-F]+)", elf_result) + if not match: + err("ERROR: Can't find build ID to read debug symbols!") + + dbg_file = "/usr/lib/debug/.build-id/{}/{}.debug".format( + match.group(1)[:2], match.group(1)[2:] + ) + + return dbg_file + + def get_from_shared_library(debug_file): + ovs_libs = [ + "libofproto", + "libopenvswitch", + "libovsdb", + "libsflow", + "libvtep", + ] + error, ldd_result = run_program(["ldd", debug_file]) + + if error: + err( + "ERROR: Failed 'ldd' on \"{}\"!\n{}".format( + debug_file, + re.sub("^", " " * 7, ldd_result, flags=re.MULTILINE), + ) + ) + + for lib in ovs_libs: + match = re.search( + r"^\s*{}.* => (.*) \(.*\)$".format(lib), + ldd_result, + flags=re.MULTILINE, + ) + if match is None: + continue + + result = run_pahole(match.group(1)) + if result is None: + result = run_pahole(get_debug_file(match.group(1))) + + if result: + return result + + return None + + # + # First try to find the debug data as part of the executable. + # + result = run_pahole(vswitchd) + + if result is None: + print(f'INFO: Failed to find debug info in "{vswitchd}"!') + + # + # Get additional .debug information if available. + # + dbg_file = get_debug_file(vswitchd) + result = run_pahole(dbg_file) + if result is None: + print(f'INFO: Failed to find debug info in "{dbg_file}"!') + + # + # Try to get information from shared libraries if used. + # + result = get_from_shared_library(vswitchd) + + if result is None: + err(f"ERROR: Failed to find needed data structures through {pahole}") + + # + # We need an empty _Atomic definition to avoid compiler complaints. + # + result = "#define _Atomic\n" + result + + # + # Remove the uint64_t definition as it conflicts with the kernel one. 
+ # + result = re.sub("^typedef.*uint64_t;$", "", result, flags=re.MULTILINE) + + return result + + +def buffer_size_type(astr, min=64, max=2048): + """Checks whether a string passed in is a number between min and max.""" + + value = int(astr) + if min <= value <= max: + return value + else: + raise argparse.ArgumentTypeError( + "value not in range {}-{}".format(min, max) + ) + + +def format_ufid(ufid): + """Formats a UFID object into a human readable form. If ufid is None, + prints "ufid:none" instead.""" + if ufid is None: + return "ufid:none" + + return "{:08x}-{:04x}-{:04x}-{:04x}-{:04x}{:08x}".format( + ufid[0], + ufid[1] >> 16, + ufid[1] & 0xFFFF, + ufid[2] >> 16, + ufid[2] & 0, + ufid[3], + ) + + +def find_and_delete_from_watchlist(event): + """If the event ufid is in the watchlist, delete it""" + + for k, _ in b["watchlist"].items(): + key_ufid = struct.unpack("=IIII", k) + if key_ufid == tuple(event.ufid): + key = (b["watchlist"].Key * 1)(k) + b["watchlist"].items_delete_batch(key) + break + + +def handle_flow_put(event): + """Event handler for the `flow_put` action. This function will try + to populate the watchlist based on the vswitchd emitting a put event + to push an ODP flow key with associated actions into the kernel module""" + + if args.flow_keys or args.filter_flows is not None: + key = decode_key(bytes(event.key)[: event.key_size]) + flow_dict, flow_str = parse_flow_dict(key) + # For each attribute that we're watching. 
+ if args.filter_flows is not None: + if not compare_flow_to_target(args.filter_flows, flow_dict): + find_and_delete_from_watchlist(event) + return + + print( + "{:<10} {:<18.9f} {:<36} {}".format( + event.pid, + event.ts / 1000000000, + format_ufid(event.ufid), + "Insert (put) flow to ovs kernel module.", + ) + ) + + if args.flow_keys and len(flow_str): + flow_str_fields = flow_str.split("), ") + flow_str = " " + curlen = 4 + for field in flow_str_fields: + if curlen + len(field) > 79: + flow_str += "\n " + curlen = 4 + if field[-1] != ")": + field += ")" + flow_str += field + ", " + curlen += len(field) + 2 + + print(" - It holds the following key information:") + print(flow_str) + + +def compare_flow_to_target(target, flow): + """Routine to compare two flow keys""" + + for key in target: + if key not in flow: + return False + elif target[key] is True: + continue + elif target[key] == flow[key]: + continue + elif isinstance(target[key], dict) and isinstance(flow[key], dict): + return compare_flow_to_target(target[key], flow[key]) + else: + return False + return True + + +# +# parse_flow_str() +# +def parse_flow_str(flow_str): + """Loosely parses an ODP flow key into a dict for further processing""" + + f_list = [i.strip(", ") for i in flow_str.split(")")] + if f_list[-1] == "": + f_list = f_list[:-1] + flow_dict = {} + for e in f_list: + split_list = e.split("(") + k = split_list[0] + if len(split_list) == 1: + flow_dict[k] = True + elif split_list[1].count("=") == 0: + flow_dict[k] = split_list[1] + else: + sub_dict = {} + sublist = [i.strip() for i in split_list[1].split(",")] + for subkey in sublist: + brk = subkey.find("=") + sub_dict[subkey[:brk]] = subkey[brk + 1 :] + flow_dict[k] = sub_dict + return flow_dict + + +def print_expiration(event): + """Prints a UFID eviction with a reason.""" + ufid_str = format_ufid(event.ufid) + + if event.reason > len(FdrReasons): + reason = f"Unknown reason '{event.reason}'" + else: + reason = FdrReasonStrings[event.reason] 
+ + print( + "{:<10} {:<18.9f} {:<36} {:<17}".format( + event.pid, + event.ts / 1000000000, + ufid_str, + reason, + ) + ) + + +def decode_key(msg): + """Decodes netlink OVS key attribute.""" + bytes_left = len(msg) + result = {} + while bytes_left: + if bytes_left < 4: + break + nla_len, nla_type = struct.unpack("=HH", msg[:4]) + if nla_len < 4: + break + nla_data = msg[4:nla_len] + if nla_len > bytes_left: + nla_data = nla_data[: (bytes_left - 4)] + break + else: + result[get_ovs_key_attr_str(nla_type)] = nla_data + next_offset = (nla_len + 3) & (~3) + msg = msg[next_offset:] + bytes_left -= next_offset + if bytes_left: + print(f"INFO: Buffer truncated with {bytes_left} bytes left.") + return result + + +# +# get_ovs_key_attr_str() +# +def get_ovs_key_attr_str(attr): + ovs_key_attr = [ + "OVS_KEY_ATTR_UNSPEC", + "encap", + "skb_priority", + "in_port", + "eth", + "vlan", + "eth_type", + "ipv4", + "ipv6", + "tcp", + "udp", + "icmp", + "icmpv6", + "arp", + "nd", + "skb_mark", + "tunnel", + "sctp", + "tcp_flags", + "dp_hash", + "recirc_id", + "mpls", + "ct_state", + "ct_zone", + "ct_mark", + "ct_label", + "ct_tuple4", + "ct_tuple6", + "nsh", + ] + + if attr < 0 or attr > len(ovs_key_attr): + return ": {}".format(attr) + return ovs_key_attr[attr] + + +def parse_flow_dict(key_dict, decode=True): + """Processes a flow key dict (see `parse_flow_str` or `decode_key`) and + returns a tuple of both the final flow key dict, and a string that + represents and ODP-like representation. Attempts to decode the actual + data values if `decode` is true. Otherwise, this can be for a loose form + of validation. 
Throws a KeyError when it encounters an unknown flow + key.""" + + ret_str = "" + parseable = {} + skip = ["nsh", "tunnel", "mpls", "vlan"] + need_byte_swap = ["ct_label"] + ipv4addrs = ["ct_tuple4", "tunnel", "ipv4", "arp"] + ipv6addrs = ["ipv6", "nd", "ct_tuple6"] + macs = {"eth": [0, 1], "arp": [3, 4], "nd": [1, 2]} + fields = [ + ("OVS_KEY_ATTR_UNSPEC"), + ("encap",), + ("skb_priority", " 1: + data = list( + struct.unpack( + fields[attr][1], v[: struct.calcsize(fields[attr][1])] + ) + ) + if k in ipv4addrs: + if data[0].count(0) < 4: + data[0] = str(IPv4Address(data[0])) + else: + data[0] = b"\x00" + if data[1].count(0) < 4: + data[1] = str(IPv4Address(data[1])) + else: + data[1] = b"\x00" + if k in ipv6addrs: + if data[0].count(0) < 16: + data[0] = str(IPv6Address(data[0])) + else: + data[0] = b"\x00" + if data[1].count(0) < len(data[1]): + data[1] = str(IPv6Address(data[1])) + else: + data[1] = b"\x00" + if k in macs.keys(): + for e in macs[k]: + if data[e].count(0) == 6: + mac_str = b"\x00" + else: + mac_str = ":".join(["%02x" % i for i in data[e]]) + data[e] = mac_str + if decode and len(fields[attr]) > 2: + field_dict = dict(zip(fields[attr][2:], data)) + s = ", ".join(k + "=" + str(v) for k, v in field_dict.items()) + elif decode and k != "eth_type": + s = str(data[0]) + field_dict = s + else: + if decode: + s = hex(data[0]) + field_dict = s + ret_str += k + "(" + s + "), " + parseable[k] = field_dict + ret_str = ret_str[:-2] + return (parseable, ret_str) + + +def handle_event(ctx, data, size): + """Dispatches to the correct event handler based on the event probe + type. + + Once we grab the event, we have three cases. + 1. It's a revalidator probe and the reason is nonzero: A flow is expiring + 2. It's a revalidator probe and the reason is zero: flow revalidated + 3. It's a flow_put probe. + + We will ignore case 2, and report all others. 
+ """ + + event = b["events"].event(data) + if event.probe == Event.OP_FLOW_PUT: + handle_flow_put(event) + elif ( + event.probe == Event.FLOW_RESULT + and event.result == RevalResult.UKEY_DELETE + ): + print_expiration(event) + + +def main(): + # + # Don't like these globals, but ctx passing does not work with the existing + # open_ring_buffer() API :( + # + global b + global args + + # + # Argument parsing + # + parser = argparse.ArgumentParser() + parser.add_argument( + "--buffer-page-count", + help="Number of BPF ring buffer pages, default 1024", + type=int, + default=1024, + metavar="NUMBER", + ) + parser.add_argument( + "-f", + "--flow-key-size", + help="Set maximum flow key size to capture, " + "default 128 - see notes", + type=buffer_size_type, + default=128, + metavar="[128-2048]", + ) + parser.add_argument( + "-k", + "--flow-keys", + help="Print flow keys as flow strings", + action="store_true", + ) + parser.add_argument( + "-l", + "--filter-flows", + metavar="FLOW_STRING", + help="Filter flows that match the specified " "ODP-like flow", + type=str, + default=None, + nargs="*", + ) + parser.add_argument( + "-P", + "--pahole", + metavar="PAHOLE", + help="Pahole executable to use, default pahole", + type=str, + default="pahole", + ) + parser.add_argument( + "-p", + "--pid", + metavar="VSWITCHD_PID", + help="ovs-vswitchd's PID", + type=int, + default=None, + ) + parser.add_argument( + "-D", + "--debug", + help="Enable eBPF debugging", + type=int, + const=0x3F, + default=0, + nargs="?", + ) + args = parser.parse_args() + + # + # Find the PID of the ovs-vswitchd daemon if not specified. + # + if args.pid is None: + for proc in psutil.process_iter(): + if "ovs-vswitchd" in proc.name(): + if args.pid is not None: + err( + "Error: Multiple ovs-vswitchd daemons running, " + "use the -p option!" 
+ ) + + args.pid = proc.pid + # + # Error checking on input parameters + # + if args.pid is None: + err("ERROR: Failed to find ovs-vswitchd's PID!") + + # + # Attach the USDT probes + # + try: + u = USDT(pid=int(args.pid)) + u.enable_probe(probe="op_flow_put", fn_name="usdt__op_flow_put") + u.enable_probe(probe="flow_result", fn_name="usdt__flow_result") + u.enable_probe( + probe="flow_sweep_result", fn_name="usdt__flow_sweep_result" + ) + except USDTException as e: + err("Failed to attach probes due to:\n" + str(e)) + + # + # Attach the probes to the running process + # + source = bpf_src.replace( + "", str(args.buffer_page_count) + ) + + source = source.replace( + "", + get_ovs_definitions( + ["udpif_key", "ovs_u128", "dpif_flow_put"], + pid=args.pid, + pahole=args.pahole, + ), + ) + + if args.filter_flows is None: + filter_bool = 0 + + # Set the key size based on what the user wanted + source = source.replace("", str(args.flow_key_size)) + else: + filter_bool = 1 + args.filter_flows = parse_flow_str(args.filter_flows[0]) + + # Run through the parser to make sure we only filter on fields we + # understand + parse_flow_dict(args.filter_flows, False) + + # This is hardcoded here because it doesn't make sense to shrink the + # size, since the flow key might be missing fields that are matched in + # the flow filter. + source = source.replace("", "2048") + + source = source.replace("", str(filter_bool)) + + source = source.replace( + "", + "\n".join([f"{event.name} = {event.value}," for event in Event]), + ) + + b = BPF(text=source, usdt_contexts=[u], debug=args.debug) + + # + # Print header + # + print( + "{:<10} {:<18} {:<36} {:<17}".format( + "TID", "TIME", "UFID", "EVENT/REASON" + ) + ) + + # + # Dump out all events. 
+ # + b["events"].open_ring_buffer(handle_event) + while 1: + try: + b.ring_buffer_poll() + except KeyboardInterrupt: + break + + dropcnt = b.get_table("dropcnt") + for k in dropcnt.keys(): + count = dropcnt.sum(k).value + if k.value == 0 and count > 0: + print( + "\n# WARNING: Not all flow operations were captured, {} were" + " dropped!\n# Increase the BPF ring buffer size " + "with the --buffer-page-count option.".format(count) + ) + + +# +# Start main() as the default entry point +# +if __name__ == "__main__": + main() From b89a6b81451ec878316d2cc15f0a64e6aeeeffc7 Mon Sep 17 00:00:00 2001 From: Aaron Conole Date: Tue, 5 Mar 2024 10:44:42 -0500 Subject: [PATCH 629/833] rhel: Enable USDT scripts by default in Fedora builds. All supported versions of Fedora do package libbpf, so it makes sense to enable USDT support. Acked-by: Simon Horman Signed-off-by: Aaron Conole Signed-off-by: Eelco Chaudron --- rhel/openvswitch-fedora.spec.in | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/rhel/openvswitch-fedora.spec.in b/rhel/openvswitch-fedora.spec.in index 5d24ebcda8b..94b6d7431cb 100644 --- a/rhel/openvswitch-fedora.spec.in +++ b/rhel/openvswitch-fedora.spec.in @@ -28,6 +28,8 @@ %bcond_with dpdk # To disable AF_XDP support, specify '--without afxdp' when building %bcond_without afxdp +# To control the USDT support +%bcond_without usdt # If there is a need to automatically enable the package after installation, # specify the "--with autoenable" @@ -77,6 +79,9 @@ Provides: %{name}-dpdk = %{version}-%{release} %if %{with afxdp} BuildRequires: libxdp-devel libbpf-devel numactl-devel %endif +%if %{with usdt} +BuildRequires: libbpf-devel systemtap-sdt-devel +%endif BuildRequires: unbound unbound-devel Requires: openssl hostname iproute module-init-tools unbound @@ -173,6 +178,9 @@ This package provides IPsec tunneling support for OVS tunnels. 
--enable-afxdp \ %else --disable-afxdp \ +%endif +%if %{with usdt} + --enable-usdt-probes \ %endif --enable-ssl \ --disable-static \ From 679b068ac78dd55891ddd0bedc9d1fa5d8138c3d Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 19 Mar 2024 16:24:33 +0100 Subject: [PATCH 630/833] AUTHORS: Add Kevin Sprague. Signed-off-by: Eelco Chaudron --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index f99df385ba0..6b42c133a38 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -256,6 +256,7 @@ Kenneth Duda kduda@arista.com Kentaro Ebisawa ebiken.g@gmail.com Keshav Gupta keshav.gupta@ericsson.com Kevin Lo kevlo@FreeBSD.org +Kevin Sprague ksprague0711@gmail.com Kevin Traynor kevin.traynor@intel.com Khem Raj raj.khem@gmail.com Kmindg G kmindg@gmail.com From 5339ce386f3cccc4972bc49c3f68272138d58511 Mon Sep 17 00:00:00 2001 From: Ales Musil Date: Thu, 7 Mar 2024 16:01:51 +0100 Subject: [PATCH 631/833] ofpbuf: Prevent undefined behavior in ofpbuf_clone. The new_buffer data pointer is NULL when the size of the cloned buffer is 0. This is fine as there is no need to allocate space. However, the cloned buffer header/msg might be the same pointer as data. This causes undefined behavior by adding 0 to NULL pointer. Check if the data buffer is not NULL before attempting to apply the header/msg offset. 
This was caught by OVN system test: lib/ofpbuf.c:203:56: runtime error: applying zero offset to null pointer 0 0xa012fc in ofpbuf_clone_with_headroom /ovs/lib/ofpbuf.c:203:56 1 0x635fd4 in put_remote_port_redirect_overlay /controller/physical.c:397:40 2 0x635fd4 in consider_port_binding /controller/physical.c:1951:9 3 0x62e046 in physical_run /controller/physical.c:2447:9 4 0x601d98 in en_pflow_output_run /controller/ovn-controller.c:4690:5 5 0x707769 in engine_recompute /lib/inc-proc-eng.c:415:5 6 0x7060eb in engine_compute /lib/inc-proc-eng.c:454:17 7 0x7060eb in engine_run_node /lib/inc-proc-eng.c:503:14 8 0x7060eb in engine_run /lib/inc-proc-eng.c:528:9 9 0x5f9f26 in main /controller/ovn-controller.c Signed-off-by: Ales Musil Signed-off-by: Ilya Maximets --- lib/ofpbuf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/ofpbuf.c b/lib/ofpbuf.c index d3d42b41482..232ebeb97ba 100644 --- a/lib/ofpbuf.c +++ b/lib/ofpbuf.c @@ -197,12 +197,12 @@ ofpbuf_clone_with_headroom(const struct ofpbuf *b, size_t headroom) struct ofpbuf *new_buffer; new_buffer = ofpbuf_clone_data_with_headroom(b->data, b->size, headroom); - if (b->header) { + if (new_buffer->data && b->header) { ptrdiff_t header_offset = (char *) b->header - (char *) b->data; new_buffer->header = (char *) new_buffer->data + header_offset; } - if (b->msg) { + if (new_buffer->data && b->msg) { ptrdiff_t msg_offset = (char *) b->msg - (char *) b->data; new_buffer->msg = (char *) new_buffer->data + msg_offset; From 9d0a40120f9f71ed9ddf32d37d1b03b0fd7f4703 Mon Sep 17 00:00:00 2001 From: Tao Liu Date: Tue, 12 Mar 2024 22:04:11 +0800 Subject: [PATCH 632/833] ofproto-dpif: Fix tunnel with different name del/add failure. 
Reproduce: ovs-vsctl add-port br-int p0 \ -- set interface p0 type=vxlan options:remote_ip=10.10.10.1 sleep 2 ovs-vsctl --if-exists del-port p0 \ -- add-port br-int p1 \ -- set interface p1 type=vxlan options:remote_ip=10.10.10.1 ovs-vsctl: Error detected while setting up 'p1': could not add network device p1 to ofproto (File exists). vswitchd log: bridge|INFO|bridge br-int: added interface p0 on port 1106 bridge|INFO|bridge br-int: deleted interface p0 on port 1106 tunnel|WARN|p1: attempting to add tunnel port with same config as port 'p0' (::->10.10.10.1, key=0, legacy_l2, dp port=122) ofproto|WARN|br-int: could not add port p1 (File exists) bridge|WARN|could not add network device p1 to ofproto (File exists) CallTrace: bridge_reconfigure bridge_del_ports port_destroy iface_destroy__ netdev_remove <------ netdev p0 removed bridge_delete_or_reconfigure_ports OFPROTO_PORT_FOR_EACH ofproto_port_dump_next port_dump_next port_query_by_name <------ netdev_shash do not contain p0 ofproto_port_del <------ p0 do not del in ofproto bridge_add_ports bridge_add_ports__ iface_create iface_do_create ofproto_port_add <------ p1 add failed Fixes: fe83f81df977 ("netdev: Remove netdev from global shash when the user is changing interface configuration.") Acked-by: Han Zhou Tested-by: Han Zhou Signed-off-by: Tao Liu Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif.c | 18 ++++++++++++------ tests/tunnel.at | 12 ++++++++++++ 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index f59d69c4d1e..fe034f9717b 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -3904,15 +3904,21 @@ port_query_by_name(const struct ofproto *ofproto_, const char *devname, int error; if (sset_contains(&ofproto->ghost_ports, devname)) { - const char *type = netdev_get_type_from_name(devname); - /* We may be called before ofproto->up.port_by_name is populated with * the appropriate ofport. 
For this reason, we must get the name and - * type from the netdev layer directly. */ - if (type) { - const struct ofport *ofport; + * type from the netdev layer directly. + * However, when a port deleted, the corresponding netdev is also + * removed from netdev_shash. netdev_get_type_from_name returns NULL + * in such case and we should try to get type from ofport->netdev. */ + const char *type = netdev_get_type_from_name(devname); + const struct ofport *ofport = + shash_find_data(&ofproto->up.port_by_name, devname); - ofport = shash_find_data(&ofproto->up.port_by_name, devname); + if (!type && ofport && ofport->netdev) { + type = netdev_get_type(ofport->netdev); + } + + if (type) { ofproto_port->ofp_port = ofport ? ofport->ofp_port : OFPP_NONE; ofproto_port->name = xstrdup(devname); ofproto_port->type = xstrdup(type); diff --git a/tests/tunnel.at b/tests/tunnel.at index 71e7c2df4ea..9d539ee6f67 100644 --- a/tests/tunnel.at +++ b/tests/tunnel.at @@ -1269,6 +1269,18 @@ OVS_APP_EXIT_AND_WAIT([ovs-vswitchd]) OVS_APP_EXIT_AND_WAIT([ovsdb-server])] AT_CLEANUP +AT_SETUP([tunnel - re-create port with different name]) +OVS_VSWITCHD_START( + [add-port br0 p0 -- set int p0 type=vxlan options:remote_ip=10.10.10.1]) + +AT_CHECK([ovs-vsctl --if-exists del-port p0 -- \ + add-port br0 p1 -- \ + set int p1 type=vxlan options:remote_ip=10.10.10.1]) + +OVS_APP_EXIT_AND_WAIT([ovs-vswitchd]) +OVS_APP_EXIT_AND_WAIT([ovsdb-server])] +AT_CLEANUP + AT_SETUP([tunnel - SRV6 basic]) OVS_VSWITCHD_START([add-port br0 p1 -- set Interface p1 type=dummy \ ofport_request=1 \ From 3388c3451ff8ab6f0b41db50d35ca0160cc7e800 Mon Sep 17 00:00:00 2001 From: Daniel Ding Date: Wed, 20 Mar 2024 10:54:07 +0800 Subject: [PATCH 633/833] ovs-tcpdump: Fix cleanup mirror failed with twice fatal signals. After running ovs-tcpdump and inputs multiple CTRL+C, the program will raise the following exception. 
Error in atexit._run_exitfuncs: Traceback (most recent call last): File "/usr/bin/ovs-tcpdump", line 421, in cleanup_mirror ovsdb = OVSDB(db_sock) File "/usr/bin/ovs-tcpdump", line 168, in __init__ OVSDB.wait_for_db_change(self._idl_conn) # Initial Sync with DB File "/usr/bin/ovs-tcpdump", line 155, in wait_for_db_change while idl.change_seqno == seq and not idl.run(): The default handler of SIGINT is default_int_handler, so it was not registered to the signal handler. When received CTRL+C again, the program was broken, and calling hook could not be executed completely. Signed-off-by: Daniel Ding Signed-off-by: Ilya Maximets --- python/ovs/fatal_signal.py | 24 +++++++++++++----------- utilities/ovs-tcpdump.in | 32 +++++++++++--------------------- 2 files changed, 24 insertions(+), 32 deletions(-) diff --git a/python/ovs/fatal_signal.py b/python/ovs/fatal_signal.py index cb2e99e87d4..16a7e78a03f 100644 --- a/python/ovs/fatal_signal.py +++ b/python/ovs/fatal_signal.py @@ -16,6 +16,7 @@ import os import signal import sys +import threading import ovs.vlog @@ -112,29 +113,29 @@ def _unlink(file_): def _signal_handler(signr, _): _call_hooks(signr) - # Re-raise the signal with the default handling so that the program - # termination status reflects that we were killed by this signal. - signal.signal(signr, signal.SIG_DFL) - os.kill(os.getpid(), signr) - def _atexit_handler(): _call_hooks(0) -recurse = False +mutex = threading.Lock() def _call_hooks(signr): - global recurse - if recurse: + global mutex + if not mutex.acquire(blocking=False): return - recurse = True for hook, cancel, run_at_exit in _hooks: if signr != 0 or run_at_exit: hook() + if signr != 0: + # Re-raise the signal with the default handling so that the program + # termination status reflects that we were killed by this signal. 
+ signal.signal(signr, signal.SIG_DFL) + os.kill(os.getpid(), signr) + _inited = False @@ -150,7 +151,9 @@ def _init(): signal.SIGALRM] for signr in signals: - if signal.getsignal(signr) == signal.SIG_DFL: + handler = signal.getsignal(signr) + if (handler == signal.SIG_DFL or + handler == signal.default_int_handler): signal.signal(signr, _signal_handler) atexit.register(_atexit_handler) @@ -165,7 +168,6 @@ def signal_alarm(timeout): if sys.platform == "win32": import time - import threading class Alarm (threading.Thread): def __init__(self, timeout): diff --git a/utilities/ovs-tcpdump.in b/utilities/ovs-tcpdump.in index 4cbd9a5d310..eada803bb41 100755 --- a/utilities/ovs-tcpdump.in +++ b/utilities/ovs-tcpdump.in @@ -534,29 +534,19 @@ def main(): ovsdb.close_idl() pipes = _doexec(*([dump_cmd, '-i', mirror_interface] + tcpdargs)) - try: - while pipes.poll() is None: - data = pipes.stdout.readline().strip(b'\n') - if len(data) == 0: - raise KeyboardInterrupt - print(data.decode('utf-8')) - raise KeyboardInterrupt - except KeyboardInterrupt: - # If there is a pipe behind ovs-tcpdump (such as ovs-tcpdump - # -i eth0 | grep "192.168.1.1"), the pipe is no longer available - # after received Ctrl+C. - # If we write data to an unavailable pipe, a pipe error will be - # reported, so we turn off stdout to avoid subsequent flushing - # of data into the pipe. - try: - sys.stdout.close() - except IOError: - pass + while pipes.poll() is None: + data = pipes.stdout.readline().strip(b'\n') + if len(data) == 0: + break + print(data.decode('utf-8')) - if pipes.poll() is None: - pipes.terminate() + try: + sys.stdout.close() + except IOError: + pass - sys.exit(0) + if pipes.poll() is None: + pipes.terminate() if __name__ == '__main__': From 840979663d604e5d8f285bf840b8c800d895bc8d Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 20 Mar 2024 19:47:21 +0100 Subject: [PATCH 634/833] route-table: Avoid routes from non-standard routing tables. 
Currently, ovs-vswitchd is subscribed to all the routing changes in the kernel. On each change, it marks the internal routing table cache as invalid, then resets it and dumps all the routes from the kernel from scratch. The reason for that is kernel routing updates not being reliable in the sense that it's hard to tell which route is getting removed or modified. A userspace application has to track the order in which route entries are dumped from the kernel. Updates can get lost or even duplicated and the kernel doesn't provide a good mechanism to distinguish one route from another. To my knowledge, dumping all the routes from the kernel after each change is the only way to keep the cache consistent. Some more info can be found in the following never-addressed issues: https://bugzilla.redhat.com/1337860 https://bugzilla.redhat.com/1337855 It seems to be believed that NetworkManager "mostly" does incremental updates right. But it is still not completely correct, will re-dump the whole table in certain cases, and it takes a huge amount of very complicated code to do the accounting and route comparisons. Going back to ovs-vswitchd, it currently dumps routes from all the routing tables. If it gets conflicting routes from multiple tables, the cache will not be useful. The routing cache in userspace is primarily used for checking the egress port for tunneled traffic and this way also detecting link state changes for a tunnel port. For userspace datapath it is used for actual routing of the packet after sending to a native tunnel. With kernel datapath we don't really have a mechanism to know which routing table will actually be used by the kernel after encapsulation, so our lookups in the cache may be incorrect because of this as well. So, unless all the relevant routes are in the standard tables, the lookup in the userspace route cache is unreliable. Luckily, most setups are not using any complicated routing in non-standard tables that OVS has to be aware of.
It is possible, but unlikely, that standard routing tables are completely empty while some other custom table is not, and all the OVS tunnel traffic is directed to that table. That would be the only scenario where dumping non-standard tables would make sense. But it seems like this kind of setup will likely need a way to tell OVS from which table the routes should be taken, or we'll need to dump routing rules and keep a separate cache for each table, so we can first match on rules and then lookup correct routes in a specific table. I'm not sure if trying to implement all that is justified. For now, stop considering routes from non-standard tables to avoid mixing different tables together and also wasting CPU resources. This fixes a high CPU usage in ovs-vswitchd in case a BGP daemon is running on the same host and in the same network namespace with OVS using its own custom routing table. Unfortunately, there seems to be no way to tell the kernel to send updates only for particular tables. So, we'll still receive and parse all of them. But they will not result in a full cache invalidation in most cases. Linux kernel v4.20 introduced filtering support for RTM_GETROUTE dumps. So, we can make use of it and dump only standard tables when we get a relevant route update. NETLINK_GET_STRICT_CHK has to be enabled on the socket for filtering to work. There is no reason to not enable it by default, if supported. It is not used outside of NETLINK_ROUTE.
Fixes: f0e167f0dbad ("route-table: Handle route updates more robustly.") Fixes: ea83a2fcd0d3 ("lib: Show tunnel egress interface in ovsdb") Reported-at: https://github.com/openvswitch/ovs-issues/issues/185 Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2022-October/052091.html Acked-by: Aaron Conole Signed-off-by: Ilya Maximets --- lib/netlink-protocol.h | 10 ++++++ lib/netlink-socket.c | 9 +++++ lib/route-table.c | 80 +++++++++++++++++++++++++++++++++--------- tests/system-route.at | 64 +++++++++++++++++++++++++++++++++ 4 files changed, 147 insertions(+), 16 deletions(-) diff --git a/lib/netlink-protocol.h b/lib/netlink-protocol.h index 6eaa7035a4b..e4bb28ac9f6 100644 --- a/lib/netlink-protocol.h +++ b/lib/netlink-protocol.h @@ -155,6 +155,11 @@ enum { #define NLA_TYPE_MASK ~(NLA_F_NESTED | NLA_F_NET_BYTEORDER) #endif +/* Introduced in v4.4. */ +#ifndef NLM_F_DUMP_FILTERED +#define NLM_F_DUMP_FILTERED 0x20 +#endif + /* These were introduced all together in 2.6.14. (We want our programs to * support the newer kernel features even if compiled with older headers.) */ #ifndef NETLINK_ADD_MEMBERSHIP @@ -168,6 +173,11 @@ enum { #define NETLINK_LISTEN_ALL_NSID 8 #endif +/* Strict checking of netlink arguments introduced in Linux kernel v4.20. */ +#ifndef NETLINK_GET_STRICT_CHK +#define NETLINK_GET_STRICT_CHK 12 +#endif + /* These were introduced all together in 2.6.23. (We want our programs to * support the newer kernel features even if compiled with older headers.) */ #ifndef CTRL_ATTR_MCAST_GRP_MAX diff --git a/lib/netlink-socket.c b/lib/netlink-socket.c index 80da20d9f05..5cb1fc89aed 100644 --- a/lib/netlink-socket.c +++ b/lib/netlink-socket.c @@ -205,6 +205,15 @@ nl_sock_create(int protocol, struct nl_sock **sockp) } } + /* Strict checking only supported for NETLINK_ROUTE. */ + if (protocol == NETLINK_ROUTE + && setsockopt(sock->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, + &one, sizeof one) < 0) { + VLOG_RL(&rl, errno == ENOPROTOOPT ? 
VLL_DBG : VLL_WARN, + "netlink: could not enable strict checking (%s)", + ovs_strerror(errno)); + } + retval = get_socket_rcvbuf(sock->fd); if (retval < 0) { retval = -retval; diff --git a/lib/route-table.c b/lib/route-table.c index 9927dcc1854..f1fe32714e8 100644 --- a/lib/route-table.c +++ b/lib/route-table.c @@ -26,6 +26,7 @@ #include #include +#include "coverage.h" #include "hash.h" #include "netdev.h" #include "netlink.h" @@ -44,6 +45,8 @@ VLOG_DEFINE_THIS_MODULE(route_table); +COVERAGE_DEFINE(route_table_dump); + struct route_data { /* Copied from struct rtmsg. */ unsigned char rtm_dst_len; @@ -80,7 +83,7 @@ static struct nln_notifier *name_notifier = NULL; static bool route_table_valid = false; -static int route_table_reset(void); +static void route_table_reset(void); static void route_table_handle_msg(const struct route_table_msg *); static int route_table_parse(struct ofpbuf *, struct route_table_msg *); static void route_table_change(const struct route_table_msg *, void *); @@ -153,26 +156,22 @@ route_table_wait(void) ovs_mutex_unlock(&route_table_mutex); } -static int -route_table_reset(void) +static bool +route_table_dump_one_table(unsigned char id) { - struct nl_dump dump; - struct rtgenmsg *rtgenmsg; uint64_t reply_stub[NL_DUMP_BUFSIZE / 8]; struct ofpbuf request, reply, buf; - - route_map_clear(); - netdev_get_addrs_list_flush(); - route_table_valid = true; - rt_change_seq++; + struct rtmsg *rq_msg; + bool filtered = true; + struct nl_dump dump; ofpbuf_init(&request, 0); - nl_msg_put_nlmsghdr(&request, sizeof *rtgenmsg, RTM_GETROUTE, - NLM_F_REQUEST); + nl_msg_put_nlmsghdr(&request, sizeof *rq_msg, RTM_GETROUTE, NLM_F_REQUEST); - rtgenmsg = ofpbuf_put_zeros(&request, sizeof *rtgenmsg); - rtgenmsg->rtgen_family = AF_UNSPEC; + rq_msg = ofpbuf_put_zeros(&request, sizeof *rq_msg); + rq_msg->rtm_family = AF_UNSPEC; + rq_msg->rtm_table = id; nl_dump_start(&dump, NETLINK_ROUTE, &request); ofpbuf_uninit(&request); @@ -182,12 +181,43 @@ 
route_table_reset(void) struct route_table_msg msg; if (route_table_parse(&reply, &msg)) { + struct nlmsghdr *nlmsghdr = nl_msg_nlmsghdr(&reply); + + /* Older kernels do not support filtering. */ + if (!(nlmsghdr->nlmsg_flags & NLM_F_DUMP_FILTERED)) { + filtered = false; + } route_table_handle_msg(&msg); } } ofpbuf_uninit(&buf); + nl_dump_done(&dump); + + return filtered; +} + +static void +route_table_reset(void) +{ + unsigned char tables[] = { + RT_TABLE_DEFAULT, + RT_TABLE_MAIN, + RT_TABLE_LOCAL, + }; - return nl_dump_done(&dump); + route_map_clear(); + netdev_get_addrs_list_flush(); + route_table_valid = true; + rt_change_seq++; + + COVERAGE_INC(route_table_dump); + + for (size_t i = 0; i < ARRAY_SIZE(tables); i++) { + if (!route_table_dump_one_table(tables[i])) { + /* Got unfiltered reply, no need to dump further. */ + break; + } + } } /* Return RTNLGRP_IPV4_ROUTE or RTNLGRP_IPV6_ROUTE on success, 0 on parse @@ -203,6 +233,7 @@ route_table_parse(struct ofpbuf *buf, struct route_table_msg *change) [RTA_GATEWAY] = { .type = NL_A_U32, .optional = true }, [RTA_MARK] = { .type = NL_A_U32, .optional = true }, [RTA_PREFSRC] = { .type = NL_A_U32, .optional = true }, + [RTA_TABLE] = { .type = NL_A_U32, .optional = true }, }; static const struct nl_policy policy6[] = { @@ -211,6 +242,7 @@ route_table_parse(struct ofpbuf *buf, struct route_table_msg *change) [RTA_MARK] = { .type = NL_A_U32, .optional = true }, [RTA_GATEWAY] = { .type = NL_A_IPV6, .optional = true }, [RTA_PREFSRC] = { .type = NL_A_IPV6, .optional = true }, + [RTA_TABLE] = { .type = NL_A_U32, .optional = true }, }; struct nlattr *attrs[ARRAY_SIZE(policy)]; @@ -232,6 +264,7 @@ route_table_parse(struct ofpbuf *buf, struct route_table_msg *change) if (parsed) { const struct nlmsghdr *nlmsg; + uint32_t table_id; int rta_oif; /* Output interface index. 
*/ nlmsg = buf->data; @@ -247,6 +280,19 @@ route_table_parse(struct ofpbuf *buf, struct route_table_msg *change) rtm->rtm_type != RTN_LOCAL) { change->relevant = false; } + + table_id = rtm->rtm_table; + if (attrs[RTA_TABLE]) { + table_id = nl_attr_get_u32(attrs[RTA_TABLE]); + } + /* Do not consider changes in non-standard routing tables. */ + if (table_id + && table_id != RT_TABLE_DEFAULT + && table_id != RT_TABLE_MAIN + && table_id != RT_TABLE_LOCAL) { + change->relevant = false; + } + change->nlmsg_type = nlmsg->nlmsg_type; change->rd.rtm_dst_len = rtm->rtm_dst_len + (ipv4 ? 96 : 0); change->rd.local = rtm->rtm_type == RTN_LOCAL; @@ -312,7 +358,9 @@ static void route_table_change(const struct route_table_msg *change OVS_UNUSED, void *aux OVS_UNUSED) { - route_table_valid = false; + if (!change || change->relevant) { + route_table_valid = false; + } } static void diff --git a/tests/system-route.at b/tests/system-route.at index 114aaebc77f..c0ecad6cfb4 100644 --- a/tests/system-route.at +++ b/tests/system-route.at @@ -64,3 +64,67 @@ Cached: fc00:db8:beef::13/128 dev br0 GW fc00:db8:cafe::1 SRC fc00:db8:cafe::2]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP + +dnl Checks that OVS doesn't use routes from non-standard tables. +AT_SETUP([ovs-route - route tables]) +AT_KEYWORDS([route]) +OVS_TRAFFIC_VSWITCHD_START() + +dnl Create tap port. +on_exit 'ip link del p1-route' +AT_CHECK([ip tuntap add name p1-route mode tap]) +AT_CHECK([ip link set p1-route up]) + +dnl Add ip address. +AT_CHECK([ip addr add 10.0.0.17/24 dev p1-route], [0], [stdout]) + +dnl Check that OVS catches route updates. +OVS_WAIT_UNTIL_EQUAL([ovs-appctl ovs/route/show | grep 'p1-route' | sort], [dnl +Cached: 10.0.0.0/24 dev p1-route SRC 10.0.0.17 +Cached: 10.0.0.17/32 dev p1-route SRC 10.0.0.17 local]) + +dnl Add a route to the main routing table and check that OVS caches +dnl this new route. 
+AT_CHECK([ip route add 10.0.0.18/32 dev p1-route]) +OVS_WAIT_UNTIL_EQUAL([ovs-appctl ovs/route/show | grep 'p1-route' | sort], [dnl +Cached: 10.0.0.0/24 dev p1-route SRC 10.0.0.17 +Cached: 10.0.0.17/32 dev p1-route SRC 10.0.0.17 local +Cached: 10.0.0.18/32 dev p1-route SRC 10.0.0.17]) + +dnl Add a route to a custom routing table and check that OVS doesn't cache it. +AT_CHECK([ip route add 10.0.0.19/32 dev p1-route table 42]) +AT_CHECK([ip route show table 42 | grep 'p1-route' | grep -q '10.0.0.19']) +dnl Give the main thread a chance to act. +AT_CHECK([ovs-appctl revalidator/wait]) +dnl Check that OVS didn't learn this route. +AT_CHECK([ovs-appctl ovs/route/show | grep 'p1-route' | sort], [0], [dnl +Cached: 10.0.0.0/24 dev p1-route SRC 10.0.0.17 +Cached: 10.0.0.17/32 dev p1-route SRC 10.0.0.17 local +Cached: 10.0.0.18/32 dev p1-route SRC 10.0.0.17 +]) + +dnl Delete a route from the main table and check that OVS removes the route +dnl from the cache. +AT_CHECK([ip route del 10.0.0.18/32 dev p1-route]) +OVS_WAIT_UNTIL_EQUAL([ovs-appctl ovs/route/show | grep 'p1-route' | sort], [dnl +Cached: 10.0.0.0/24 dev p1-route SRC 10.0.0.17 +Cached: 10.0.0.17/32 dev p1-route SRC 10.0.0.17 local]) + +dnl Delete a route from a custom routing table and check that the cache +dnl dosn't change. +AT_CHECK([ip route del 10.0.0.19/32 dev p1-route table 42]) +dnl Give the main thread a chance to act. +AT_CHECK([ovs-appctl revalidator/wait]) +dnl Check that the cache is still the same. +AT_CHECK([ovs-appctl ovs/route/show | grep 'p1-route' | sort], [0], [dnl +Cached: 10.0.0.0/24 dev p1-route SRC 10.0.0.17 +Cached: 10.0.0.17/32 dev p1-route SRC 10.0.0.17 local +]) + +dnl Delete ip address. +AT_CHECK([ip addr del 10.0.0.17/24 dev p1-route], [0], [stdout]) +dnl Check that routes were removed from OVS. 
+OVS_WAIT_UNTIL([test $(ovs-appctl ovs/route/show | grep -c 'p1-route') -eq 0 ]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP From 3ddb31f60487c9e26102372b56dec4b705368602 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Thu, 21 Mar 2024 09:16:49 -0400 Subject: [PATCH 635/833] ovs-monitor-ipsec: LibreSwan autodetect paths. In v4.0, LibreSwan changed a default paths that had been hardcoded in ovs-monitor-ipsec, breaking some uses of this script. This patch adds support for both old and newer versions by auto detecting the version of LibreSwan and then choosing the correct path. Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=1975039 Reported-by: Qijun Ding Fixes: d6afbc00d5b3 ("ipsec: Allow custom file locations.") Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- ipsec/ovs-monitor-ipsec.in | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/ipsec/ovs-monitor-ipsec.in b/ipsec/ovs-monitor-ipsec.in index 7945162f9f3..bc7ac552379 100755 --- a/ipsec/ovs-monitor-ipsec.in +++ b/ipsec/ovs-monitor-ipsec.in @@ -457,14 +457,30 @@ conn prevent_unencrypted_vxlan CERTKEY_PREFIX = "ovs_certkey_" def __init__(self, libreswan_root_prefix, args): + # Collect version infromation + self.IPSEC = libreswan_root_prefix + "/usr/sbin/ipsec" + proc = subprocess.Popen([self.IPSEC, "--version"], + stdout=subprocess.PIPE, + encoding="latin1") + pout, perr = proc.communicate() + + v = re.match("^Libreswan (.*)$", pout) + try: + version = int(v.group(1).split(".")[0]) + except: + version = 0 + + if version >= 4: + ipsec_d = args.ipsec_d if args.ipsec_d else "/var/lib/ipsec/nss" + else: + ipsec_d = args.ipsec_d if args.ipsec_d else "/etc/ipsec.d" + ipsec_conf = args.ipsec_conf if args.ipsec_conf else "/etc/ipsec.conf" - ipsec_d = args.ipsec_d if args.ipsec_d else "/etc/ipsec.d" ipsec_secrets = (args.ipsec_secrets if args.ipsec_secrets else "/etc/ipsec.secrets") ipsec_ctl = (args.ipsec_ctl if args.ipsec_ctl else "/run/pluto/pluto.ctl") - 
self.IPSEC = libreswan_root_prefix + "/usr/sbin/ipsec" self.IPSEC_CONF = libreswan_root_prefix + ipsec_conf self.IPSEC_SECRETS = libreswan_root_prefix + ipsec_secrets self.IPSEC_D = "sql:" + libreswan_root_prefix + ipsec_d From 6f93d8e62f13271201cc4b8ab3c8dd121390c94f Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Fri, 22 Mar 2024 16:19:57 +0800 Subject: [PATCH 636/833] netdev-dpdk: Disable outer UDP checksum offload for ice/i40e driver. Fixing the issue of incorrect outer UDP checksum in packets sent by E810 or X710. We disable RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM,but also disable all the dependent offloads like RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO and RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO. Fixes: 084c8087292c ("userspace: Support VXLAN and GENEVE TSO.") Reported-at: https://github.com/openvswitch/ovs-issues/issues/321 Signed-off-by: Jun Wang Signed-off-by: Ilya Maximets --- lib/netdev-dpdk.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 29a6bf0328e..2111f776810 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -1354,6 +1354,18 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev) info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_TCP_CKSUM; } + if (!strcmp(info.driver_name, "net_ice") + || !strcmp(info.driver_name, "net_i40e")) { + /* FIXME: Driver advertises the capability but doesn't seem + * to actually support it correctly. Can remove this once + * the driver is fixed on DPDK side. 
*/ + VLOG_INFO("%s: disabled Tx outer udp checksum offloads for a " + "net/ice or net/i40e port.", netdev_get_name(&dev->up)); + info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM; + info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO; + info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO; + } + if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_IPV4_CKSUM) { dev->hw_ol_features |= NETDEV_TX_IPV4_CKSUM_OFFLOAD; } else { From e6a8a8e90084a124cad377b73f3f87537d9dba92 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 22 Mar 2024 20:44:43 +0100 Subject: [PATCH 637/833] AUTHORS: Add Jun Wang. Signed-off-by: Ilya Maximets --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 6b42c133a38..82075d32067 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -245,6 +245,7 @@ Jon Kohler jon@nutanix.com Jonathan Vestin jonavest@kau.se Jorge Arturo Sauma Vargas jorge.sauma@hpe.com Jun Nakajima jun.nakajima@intel.com +Jun Wang junwang01@cestc.cn JunhanYan juyan@redhat.com JunoZhu zhunatuzi@gmail.com Justin Pettit jpettit@ovn.org From c6538b443984e10c266d7e75e797ef2f1b722d61 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 22 Mar 2024 15:42:12 +0100 Subject: [PATCH 638/833] dpif-netdev: Fix crash due to tunnel offloading on recirculation. Recirculation involves re-parsing the packet from scratch and that process is not aware of multiple header levels nor the inner/outer offsets. So, it overwrites offsets with new ones from the outermost headers and sets offloading flags that change their meaning when the packet is marked for tunnel offloading. For example: 1. TCP packet enters OVS. 2. TCP packet gets encapsulated into UDP tunnel. 3. Recirculation happens. 4. Packet is re-parsed after recirculation with miniflow_extract() or similar function. 5. Packet is marked for UDP checksumming because we parse the outermost set of headers. But since it is tunneled, it means inner UDP checksumming. 
And that makes no sense, because the inner packet is TCP. This is causing packet drops due to malformed packets or even assertions and crashes in the code that is trying to fixup checksums for packets using incorrect metadata: SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior lib/packets.c:2061:15: runtime error: member access within null pointer of type 'struct udp_header' 0 0xbe5221 in packet_udp_complete_csum lib/packets.c:2061:15 1 0x7e5662 in dp_packet_ol_send_prepare lib/dp-packet.c:638:9 2 0x96ef89 in netdev_send lib/netdev.c:940:9 3 0x818e94 in dp_netdev_pmd_flush_output_on_port lib/dpif-netdev.c:5577:9 4 0x817606 in dp_netdev_pmd_flush_output_packets lib/dpif-netdev.c:5618:27 5 0x81cfa5 in dp_netdev_process_rxq_port lib/dpif-netdev.c:5677:9 6 0x7eefe4 in dpif_netdev_run lib/dpif-netdev.c:7001:25 7 0x610e87 in type_run ofproto/ofproto-dpif.c:367:9 8 0x5b9e80 in ofproto_type_run ofproto/ofproto.c:1879:31 9 0x55bbb4 in bridge_run__ vswitchd/bridge.c:3281:9 10 0x558b6b in bridge_run vswitchd/bridge.c:3346:5 11 0x591dc5 in main vswitchd/ovs-vswitchd.c:130:9 12 0x172b89 in __libc_start_call_main (/lib64/libc.so.6+0x27b89) 13 0x172c4a in __libc_start_main@GLIBC_2.2.5 (/lib64/libc.so.6+0x27c4a) 14 0x47eff4 in _start (vswitchd/ovs-vswitchd+0x47eff4) Tests added for both IPv4 and IPv6 cases. Though IPv6 test doesn't trigger the issue it's better to have a symmetric test. 
Fixes: 084c8087292c ("userspace: Support VXLAN and GENEVE TSO.") Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2024-March/053014.html Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/dp-packet.h | 8 ++++ lib/dpif-netdev.c | 29 +++++++++++ tests/tunnel-push-pop-ipv6.at | 90 +++++++++++++++++++++++++++++++++++ tests/tunnel-push-pop.at | 89 ++++++++++++++++++++++++++++++++++ 4 files changed, 216 insertions(+) diff --git a/lib/dp-packet.h b/lib/dp-packet.h index 2fa17d81402..3622764c47b 100644 --- a/lib/dp-packet.h +++ b/lib/dp-packet.h @@ -1300,6 +1300,14 @@ dp_packet_hwol_set_tunnel_vxlan(struct dp_packet *b) *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TUNNEL_VXLAN; } +/* Clears tunnel offloading marks. */ +static inline void +dp_packet_hwol_reset_tunnel(struct dp_packet *b) +{ + *dp_packet_ol_flags_ptr(b) &= ~(DP_PACKET_OL_TX_TUNNEL_VXLAN | + DP_PACKET_OL_TX_TUNNEL_GENEVE); +} + /* Mark packet 'b' as a tunnel packet with outer IPv4 header. */ static inline void dp_packet_hwol_set_tx_outer_ipv4(struct dp_packet *b) diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index e6c53937d8b..7e637ff8ac6 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -115,6 +115,7 @@ COVERAGE_DEFINE(datapath_drop_lock_error); COVERAGE_DEFINE(datapath_drop_userspace_action_error); COVERAGE_DEFINE(datapath_drop_tunnel_push_error); COVERAGE_DEFINE(datapath_drop_tunnel_pop_error); +COVERAGE_DEFINE(datapath_drop_tunnel_tso_recirc); COVERAGE_DEFINE(datapath_drop_recirc_error); COVERAGE_DEFINE(datapath_drop_invalid_port); COVERAGE_DEFINE(datapath_drop_invalid_bond); @@ -8920,6 +8921,34 @@ static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd, struct dp_packet_batch *packets) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + size_t i, size = dp_packet_batch_size(packets); + struct dp_packet *packet; + + DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, packets) { + if (dp_packet_hwol_is_tunnel_geneve(packet) || + 
dp_packet_hwol_is_tunnel_vxlan(packet)) { + + if (dp_packet_hwol_is_tso(packet)) { + /* Can't perform GSO in the middle of a pipeline. */ + COVERAGE_INC(datapath_drop_tunnel_tso_recirc); + dp_packet_delete(packet); + VLOG_WARN_RL(&rl, "Recirculating tunnel packets with " + "TSO is not supported"); + continue; + } + /* Have to fix all the checksums before re-parsing, because the + * packet will be treated as having a single set of headers. */ + dp_packet_ol_send_prepare(packet, 0); + /* This packet must not be marked with anything tunnel-related. */ + dp_packet_hwol_reset_tunnel(packet); + /* Clear inner offsets. Other ones are collateral, but they will + * be re-initialized on re-parsing. */ + dp_packet_reset_offsets(packet); + } + dp_packet_batch_refill(packets, packet, i); + } + dp_netdev_input__(pmd, packets, true, 0); } diff --git a/tests/tunnel-push-pop-ipv6.at b/tests/tunnel-push-pop-ipv6.at index 3f2cf842927..f1c5d42f664 100644 --- a/tests/tunnel-push-pop-ipv6.at +++ b/tests/tunnel-push-pop-ipv6.at @@ -726,3 +726,93 @@ udp(src=0,dst=6081,csum=0xffff),geneve(vni=0x7b)),out_port(100)),1 OVS_VSWITCHD_STOP AT_CLEANUP + +dnl This is a regression test for outer header checksum offloading +dnl with recirculation. +AT_SETUP([tunnel_push_pop_ipv6 - recirculation after encapsulation]) + +OVS_VSWITCHD_START( + [add-port br0 p0 \ + -- set Interface p0 type=dummy ofport_request=1 \ + other-config:hwaddr=aa:55:aa:55:00:00]) +AT_CHECK([ovs-appctl vlog/set dpif_netdev:dbg]) +AT_CHECK([ovs-vsctl add-br int-br -- set bridge int-br datapath_type=dummy]) +AT_CHECK([ovs-vsctl add-port int-br t2 \ + -- set Interface t2 type=geneve \ + options:remote_ip=2001:cafe::92 \ + options:key=123 ofport_request=2]) + +dnl Setup an IP address. +AT_CHECK([ovs-appctl netdev-dummy/ip6addr br0 2001:cafe::88/64], [0], [OK +]) +dnl Checking that a local route for added IP was successfully installed. 
+AT_CHECK([ovs-appctl ovs/route/show | grep Cached | sort], [0], [dnl +Cached: 2001:cafe::/64 dev br0 SRC 2001:cafe::88 local +]) + +dnl Add a dp-hash selection group. +AT_CHECK([ovs-ofctl add-group br0 \ + 'group_id=1234,type=select,selection_method=dp_hash,bucket=weight=1,output:p0']) +AT_CHECK([ovs-ofctl add-flow br0 in_port=br0,action=group:1234]) +AT_CHECK([ovs-ofctl add-flow br0 in_port=p0,action=normal]) + +AT_CHECK([ovs-ofctl add-flow int-br action=normal]) + +dnl This Neighbor Advertisement from p0 has two effects: +dnl 1. The neighbor cache will learn that 2001:cafe::92 is at f8:bc:12:44:34:b6. +dnl 2. The br0 mac learning will learn that f8:bc:12:44:34:b6 is on p0. +AT_CHECK([ovs-appctl netdev-dummy/receive p0 dnl + 'recirc_id(0),in_port(1),dnl + eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x86dd),dnl + ipv6(src=2001:cafe::92,dst=2001:cafe::88,label=0,proto=58,tclass=0,hlimit=255,frag=no),dnl + icmpv6(type=136,code=0),dnl + nd(target=2001:cafe::92,sll=00:00:00:00:00:00,tll=f8:bc:12:44:34:b6)' +]) + +dnl Check that selection group is used in the trace. +AT_CHECK([ovs-appctl ofproto/trace int-br in_port=LOCAL \ + | grep -E 'tunnel|actions'], [0], [dnl + -> output to native tunnel + -> tunneling to 2001:cafe::92 via br0 + -> tunneling from aa:55:aa:55:00:00 2001:cafe::88 to f8:bc:12:44:34:b6 2001:cafe::92 +Datapath actions: tnl_push(tnl_port(6081),header(size=70,type=5,dnl +eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x86dd),dnl +ipv6(src=2001:cafe::88,dst=2001:cafe::92,label=0,proto=17,tclass=0x0,hlimit=64),dnl +udp(src=0,dst=6081,csum=0xffff),geneve(vni=0x7b)),out_port(100)),dnl +hash(l4(0)),recirc(0x1) +]) + +dnl Now check that the packet is actually encapsulated and delivered. 
+AT_CHECK([ovs-vsctl -- set Interface p0 options:tx_pcap=p0.pcap]) + +packet=50540000000a5054000000091234 +eth=f8bc124434b6aa55aa55000086dd +ip6=60000000001e11402001cafe0000000000000000000000882001cafe000000000000000000000092 +dnl Source port is based on a packet hash, so it may differ depending on the +dnl compiler flags and CPU type. Same for UDP checksum. Masked with '....'. +udp=....17c1001e.... +geneve=0000655800007b00 +encap=${eth}${ip6}${udp}${geneve} +dnl Output to tunnel from a int-br internal port. +dnl Checking that the packet arrived and it was correctly encapsulated. +AT_CHECK([ovs-appctl netdev-dummy/receive int-br "${packet}"]) +OVS_WAIT_UNTIL([test $(ovs-pcap p0.pcap | grep -c "${encap}${packet}") -eq 1]) +dnl Sending again to exercise the non-miss upcall path. +AT_CHECK([ovs-appctl netdev-dummy/receive int-br "${packet}"]) +OVS_WAIT_UNTIL([test $(ovs-pcap p0.pcap | grep -c "${encap}${packet}") -eq 2]) + +dnl Finally, checking that the datapath flow is also correct. +AT_CHECK([ovs-appctl dpctl/dump-flows | grep tnl_push \ + | strip_ufid | strip_used], [0], [dnl +recirc_id(0),in_port(2),packet_type(ns=0,id=0),dnl +eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x1234), dnl +packets:1, bytes:14, used:0.0s, dnl +actions:tnl_push(tnl_port(6081),header(size=70,type=5,dnl +eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x86dd),dnl +ipv6(src=2001:cafe::88,dst=2001:cafe::92,label=0,proto=17,tclass=0x0,hlimit=64),dnl +udp(src=0,dst=6081,csum=0xffff),geneve(vni=0x7b)),out_port(100)),dnl +hash(l4(0)),recirc(0x2) +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP diff --git a/tests/tunnel-push-pop.at b/tests/tunnel-push-pop.at index 97405636f98..508737c53ec 100644 --- a/tests/tunnel-push-pop.at +++ b/tests/tunnel-push-pop.at @@ -1163,3 +1163,92 @@ gre((flags=0x0,proto=0x6558))),out_port(2)),1 OVS_VSWITCHD_STOP AT_CLEANUP + +dnl This is a regression test for outer header checksum offloading +dnl with recirculation. 
+AT_SETUP([tunnel_push_pop - recirculation after encapsulation]) + +OVS_VSWITCHD_START( + [add-port br0 p0 \ + -- set Interface p0 type=dummy ofport_request=1 \ + other-config:hwaddr=aa:55:aa:55:00:00]) +AT_CHECK([ovs-appctl vlog/set dpif_netdev:dbg]) +AT_CHECK([ovs-vsctl add-br int-br -- set bridge int-br datapath_type=dummy]) +AT_CHECK([ovs-vsctl add-port int-br t2 \ + -- set Interface t2 type=geneve \ + options:remote_ip=1.1.2.92 \ + options:key=123 ofport_request=2]) + +dnl Setup an IP address. +AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 1.1.2.88/24], [0], [OK +]) +dnl Checking that a local route for added IP was successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached | sort], [0], [dnl +Cached: 1.1.2.0/24 dev br0 SRC 1.1.2.88 local +]) + +dnl Add a dp-hash selection group. +AT_CHECK([ovs-ofctl add-group br0 \ + 'group_id=1234,type=select,selection_method=dp_hash,bucket=weight=1,output:p0']) +AT_CHECK([ovs-ofctl add-flow br0 in_port=br0,action=group:1234]) +AT_CHECK([ovs-ofctl add-flow br0 in_port=p0,action=normal]) + +AT_CHECK([ovs-ofctl add-flow int-br action=normal]) + +dnl This ARP reply from p0 has two effects: +dnl 1. The ARP cache will learn that 1.1.2.92 is at f8:bc:12:44:34:b6. +dnl 2. The br0 mac learning will learn that f8:bc:12:44:34:b6 is on p0. +AT_CHECK([ovs-appctl netdev-dummy/receive p0 dnl + 'recirc_id(0),in_port(1),dnl + eth(src=f8:bc:12:44:34:b6,dst=ff:ff:ff:ff:ff:ff),eth_type(0x0806),dnl + arp(sip=1.1.2.92,tip=1.1.2.88,op=2,sha=f8:bc:12:44:34:b6,tha=00:00:00:00:00:00)' +]) + +dnl Check that selection group is used in the trace. 
+AT_CHECK([ovs-appctl ofproto/trace int-br in_port=LOCAL \ + | grep -E 'tunnel|actions'], [0], [dnl + -> output to native tunnel + -> tunneling to 1.1.2.92 via br0 + -> tunneling from aa:55:aa:55:00:00 1.1.2.88 to f8:bc:12:44:34:b6 1.1.2.92 +Datapath actions: tnl_push(tnl_port(6081),header(size=50,type=5,dnl +eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x0800),dnl +ipv4(src=1.1.2.88,dst=1.1.2.92,proto=17,tos=0,ttl=64,frag=0x4000),dnl +udp(src=0,dst=6081,csum=0x0),geneve(vni=0x7b)),out_port(100)),dnl +hash(l4(0)),recirc(0x1) +]) + +dnl Now check that the packet is actually encapsulated and delivered. +AT_CHECK([ovs-vsctl -- set Interface p0 options:tx_pcap=p0.pcap]) + +packet=50540000000a5054000000091234 +eth=f8bc124434b6aa55aa5500000800 +ip4=450000320000400040113406010102580101025c +dnl Source port is based on a packet hash, so it may differ depending on the +dnl compiler flags and CPU type. Masked with '....'. +udp=....17c1001e0000 +geneve=0000655800007b00 +encap=${eth}${ip4}${udp}${geneve} +dnl Output to tunnel from a int-br internal port. +dnl Checking that the packet arrived and it was correctly encapsulated. +AT_CHECK([ovs-appctl netdev-dummy/receive int-br "${packet}"]) +OVS_WAIT_UNTIL([test $(ovs-pcap p0.pcap | grep -c "${encap}${packet}") -eq 1]) + +dnl Sending again to exercise the non-miss upcall path. +AT_CHECK([ovs-appctl netdev-dummy/receive int-br "${packet}"]) +OVS_WAIT_UNTIL([test $(ovs-pcap p0.pcap | grep -c "${encap}${packet}") -eq 2]) + +dnl Finally, checking that the datapath flow is also correct. 
+AT_CHECK([ovs-appctl dpctl/dump-flows | grep tnl_push \ + | strip_ufid | strip_used], [0], [dnl +recirc_id(0),in_port(2),packet_type(ns=0,id=0),dnl +eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x1234), dnl +packets:1, bytes:14, used:0.0s, dnl +actions:tnl_push(tnl_port(6081),header(size=50,type=5,dnl +eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x0800),dnl +ipv4(src=1.1.2.88,dst=1.1.2.92,proto=17,tos=0,ttl=64,frag=0x4000),dnl +udp(src=0,dst=6081,csum=0x0),geneve(vni=0x7b)),out_port(100)),dnl +hash(l4(0)),recirc(0x2) +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP From bf7c0b0a90b2b6154088ec990ec21e41702c015d Mon Sep 17 00:00:00 2001 From: Aaron Conole Date: Fri, 22 Mar 2024 09:40:16 -0400 Subject: [PATCH 639/833] ofproto-dpif-xlate: Fix continuations with associated metering. Open vSwitch supports the ability to invoke a controller action by way of a sample action with a specified meter. In the normal case, this sample action is transparently generated during xlate processing. However, when executing via a continuation, the logic to generate the sample action when finishing the context freeze was missing. The result is that the behavior when action is 'controller(pause,meter_id=1)' does not match the behavior when action is 'controller(meter_id=1)'. OVN and other controller solutions may rely on this metering to protect the control path, so it is critical to preserve metering, whether we are doing a plain old send to controller, or a continuation. 
Fixes: 77ab5fd2a95b ("Implement serializing the state of packet traversal in "continuations".") Reported-at: https://issues.redhat.com/browse/FDP-455 Tested-by: Alex Musil Signed-off-by: Aaron Conole Acked-by: Ilya Maximets --- ofproto/ofproto-dpif-xlate.c | 66 +++++++++++++++++++----------------- tests/ofproto-dpif.at | 51 ++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 32 deletions(-) diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 89f183182ea..7c495089509 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -5080,10 +5080,37 @@ put_controller_user_action(struct xlate_ctx *ctx, bool dont_send, bool continuation, uint32_t recirc_id, int len, enum ofp_packet_in_reason reason, + uint32_t provider_meter_id, uint16_t controller_id) { struct user_action_cookie cookie; + /* If the controller action didn't request a meter (indicated by a + * 'meter_id' argument other than NX_CTLR_NO_METER), see if one was + * configured through the "controller" virtual meter. + * + * Internally, ovs-vswitchd uses UINT32_MAX to indicate no meter is + * configured. */ + uint32_t meter_id; + if (provider_meter_id == UINT32_MAX) { + meter_id = ctx->xbridge->ofproto->up.controller_meter_id; + } else { + meter_id = provider_meter_id; + } + + size_t offset; + size_t ac_offset; + if (meter_id != UINT32_MAX) { + /* If controller meter is configured, generate + * clone(meter,userspace) action. 
*/ + offset = nl_msg_start_nested(ctx->odp_actions, OVS_ACTION_ATTR_SAMPLE); + nl_msg_put_u32(ctx->odp_actions, OVS_SAMPLE_ATTR_PROBABILITY, + UINT32_MAX); + ac_offset = nl_msg_start_nested(ctx->odp_actions, + OVS_SAMPLE_ATTR_ACTIONS); + nl_msg_put_u32(ctx->odp_actions, OVS_ACTION_ATTR_METER, meter_id); + } + memset(&cookie, 0, sizeof cookie); cookie.type = USER_ACTION_COOKIE_CONTROLLER; cookie.ofp_in_port = OFPP_NONE, @@ -5101,6 +5128,11 @@ put_controller_user_action(struct xlate_ctx *ctx, uint32_t pid = dpif_port_get_pid(ctx->xbridge->dpif, odp_port); odp_put_userspace_action(pid, &cookie, sizeof cookie, ODPP_NONE, false, ctx->odp_actions, NULL); + + if (meter_id != UINT32_MAX) { + nl_msg_end_nested(ctx->odp_actions, ac_offset); + nl_msg_end_nested(ctx->odp_actions, offset); + } } static void @@ -5145,32 +5177,6 @@ xlate_controller_action(struct xlate_ctx *ctx, int len, } recirc_refs_add(&ctx->xout->recircs, recirc_id); - /* If the controller action didn't request a meter (indicated by a - * 'meter_id' argument other than NX_CTLR_NO_METER), see if one was - * configured through the "controller" virtual meter. - * - * Internally, ovs-vswitchd uses UINT32_MAX to indicate no meter is - * configured. */ - uint32_t meter_id; - if (provider_meter_id == UINT32_MAX) { - meter_id = ctx->xbridge->ofproto->up.controller_meter_id; - } else { - meter_id = provider_meter_id; - } - - size_t offset; - size_t ac_offset; - if (meter_id != UINT32_MAX) { - /* If controller meter is configured, generate clone(meter, userspace) - * action. */ - offset = nl_msg_start_nested(ctx->odp_actions, OVS_ACTION_ATTR_SAMPLE); - nl_msg_put_u32(ctx->odp_actions, OVS_SAMPLE_ATTR_PROBABILITY, - UINT32_MAX); - ac_offset = nl_msg_start_nested(ctx->odp_actions, - OVS_SAMPLE_ATTR_ACTIONS); - nl_msg_put_u32(ctx->odp_actions, OVS_ACTION_ATTR_METER, meter_id); - } - /* Generate the datapath flows even if we don't send the packet-in * so that debugging more closely represents normal state. 
*/ bool dont_send = false; @@ -5178,12 +5184,7 @@ xlate_controller_action(struct xlate_ctx *ctx, int len, dont_send = true; } put_controller_user_action(ctx, dont_send, false, recirc_id, len, - reason, controller_id); - - if (meter_id != UINT32_MAX) { - nl_msg_end_nested(ctx->odp_actions, ac_offset); - nl_msg_end_nested(ctx->odp_actions, offset); - } + reason, provider_meter_id, controller_id); } /* Creates a frozen state, and allocates a unique recirc id for the given @@ -5235,6 +5236,7 @@ finish_freezing__(struct xlate_ctx *ctx, uint8_t table) put_controller_user_action(ctx, false, true, recirc_id, ctx->pause->max_len, ctx->pause->reason, + ctx->pause->provider_meter_id, ctx->pause->controller_id); } else { if (ctx->recirc_update_dp_hash) { diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at index a1393f7f8e5..3eaccb13a69 100644 --- a/tests/ofproto-dpif.at +++ b/tests/ofproto-dpif.at @@ -6195,6 +6195,57 @@ AT_CHECK([test 1 = `$PYTHON3 "$top_srcdir/utilities/ovs-pcap.in" p2-tx.pcap | wc OVS_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([ofproto-dpif - continuation with meters]) +AT_KEYWORDS([continuations pause meters]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 + +dnl Add meter with id=1. 
+AT_CHECK([ovs-ofctl -O OpenFlow13 add-meter br0 'meter=1 pktps bands=type=drop rate=1']) + +AT_DATA([flows.txt], [dnl +table=0 dl_dst=50:54:00:00:00:0a actions=goto_table(1) +table=1 dl_dst=50:54:00:00:00:0a actions=controller(pause,meter_id=1) +]) +AT_CHECK([ovs-ofctl -O OpenFlow13 add-flows br0 flows.txt]) + +on_exit 'kill $(cat ovs-ofctl.pid)' +AT_CAPTURE_FILE([ofctl_monitor.log]) +AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl -P nxt_packet_in \ + --detach --no-chdir --pidfile 2> ofctl_monitor.log]) + +AT_CHECK([ovs-appctl netdev-dummy/receive p1 \ + 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x1234)']) + +OVS_WAIT_UNTIL([test $(wc -l < ofctl_monitor.log) -ge 2]) +OVS_APP_EXIT_AND_WAIT([ovs-ofctl]) +AT_CHECK([cat ofctl_monitor.log], [0], [dnl +NXT_PACKET_IN (xid=0x0): cookie=0x0 total_len=14 in_port=1 (via action) data_len=14 (unbuffered) +vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,dl_type=0x1234 +]) + +AT_CHECK([ovs-appctl revalidator/purge], [0]) +AT_CHECK([ovs-ofctl -O OpenFlow13 dump-flows br0 | ofctl_strip | sort], [0], [dnl + n_packets=1, n_bytes=14, dl_dst=50:54:00:00:00:0a actions=goto_table:1 + table=1, n_packets=1, n_bytes=14, dl_dst=50:54:00:00:00:0a actions=controller(pause,meter_id=1) +OFPST_FLOW reply (OF1.3): +]) + +AT_CHECK([ovs-ofctl -O OpenFlow13 dump-meters br0 | ofctl_strip | sort], [0], [dnl +OFPST_METER_CONFIG reply (OF1.3): +meter=1 pktps bands= +type=drop rate=1 +]) + +AT_CHECK([ovs-ofctl -O OpenFlow13 meter-stats br0 | strip_timers], [0], [dnl +OFPST_METER reply (OF1.3) (xid=0x2): +meter:1 flow_count:0 packet_in_count:1 byte_in_count:14 duration:0.0s bands: +0: packet_count:0 byte_count:0 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([ofproto-dpif - continuation with patch port]) AT_KEYWORDS([continuations pause resume]) OVS_VSWITCHD_START( From aab379ec21c95971fe6a05fb94793d1744a864ce Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 26 Mar 2024 18:27:09 +0100 Subject: 
[PATCH 640/833] ovsdb: raft: Avoid transferring leadership to unavailable servers. The current implementation of the leadership transfer just shoots the leadership in the general direction of the first stable server in the configuration. It doesn't check if the server was active recently or even that the connection is established. This may result in sending leadership to a disconnected or otherwise unavailable server. Such behavior should not cause log truncation or any other correctness issues because the destination server would have all the append requests queued up or the connection will be dropped by the leader. In the worst case, we will have a leader-less cluster until the next election timer fires. Other servers will notice the absence of the leader and will trigger a new leader election normally. However, the potential wait for the election timer is not good as real-world setups may have high values configured. Fix that by trying to transfer to servers that we know have applied the most changes, i.e., have the highest 'match_index'. Such servers replied to the most recent append requests, so they have the highest chance of being healthy. Choose a random starting point in the list of such servers so that we don't transfer to the same server every single time. This slightly improves load distribution, but, most importantly, increases robustness of our test suite, making it cover more cases. Also check that the message was actually sent without immediate failure. If we fail to transfer to any server with the highest index, try to just transfer to any other server that is not behind the majority and then just any other server that is connected. We did actually send them all the updates (if the connection is open), they just didn't reply yet for one reason or another. It should be better than leaving the cluster without a leader. Note that there is always a chance that transfer will fail, since we're not waiting for it to be acknowledged (and must not wait).
In this case, normal election will be triggered after the election timer fires up. Fixes: 1b1d2e6daa56 ("ovsdb: Introduce experimental support for clustered databases.") Acked-by: Han Zhou Acked-by: Felix Huettner Signed-off-by: Ilya Maximets --- ovsdb/raft.c | 48 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 4 deletions(-) diff --git a/ovsdb/raft.c b/ovsdb/raft.c index f463afcb3da..b171da34552 100644 --- a/ovsdb/raft.c +++ b/ovsdb/raft.c @@ -1261,10 +1261,30 @@ raft_transfer_leadership(struct raft *raft, const char *reason) return; } - struct raft_server *s; + struct raft_server **servers, *s; + uint64_t threshold = 0; + size_t n = 0, start, i; + + servers = xmalloc(hmap_count(&raft->servers) * sizeof *servers); + HMAP_FOR_EACH (s, hmap_node, &raft->servers) { - if (!uuid_equals(&raft->sid, &s->sid) - && s->phase == RAFT_PHASE_STABLE) { + if (uuid_equals(&raft->sid, &s->sid) + || s->phase != RAFT_PHASE_STABLE) { + continue; + } + if (s->match_index > threshold) { + threshold = s->match_index; + } + servers[n++] = s; + } + + start = n ? random_range(n) : 0; + +retry: + for (i = 0; i < n; i++) { + s = servers[(start + i) % n]; + + if (s->match_index >= threshold) { struct raft_conn *conn = raft_find_conn_by_sid(raft, &s->sid); if (!conn) { continue; @@ -1280,7 +1300,10 @@ raft_transfer_leadership(struct raft *raft, const char *reason) .term = raft->term, } }; - raft_send_to_conn(raft, &rpc, conn); + + if (!raft_send_to_conn(raft, &rpc, conn)) { + continue; + } raft_record_note(raft, "transfer leadership", "transferring leadership to %s because %s", @@ -1288,6 +1311,23 @@ raft_transfer_leadership(struct raft *raft, const char *reason) break; } } + + if (n && i == n && threshold) { + if (threshold > raft->commit_index) { + /* Failed to transfer to servers with the highest 'match_index'. + * Try other servers that are not behind the majority. */ + threshold = raft->commit_index; + } else { + /* Try any other server. 
It is safe, because they either have all + * the append requests queued up for them before the leadership + * transfer message or their connection is broken and we will not + * transfer anyway. */ + threshold = 0; + } + goto retry; + } + + free(servers); } /* Send a RemoveServerRequest to the rest of the servers in the cluster. From bcad733e2ce36437ab503bc53d87dd80b9d7d336 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 26 Mar 2024 18:27:10 +0100 Subject: [PATCH 641/833] ovsdb: raft: Fix time intervals for multitasking while joining. While joining, ovsdb-server may not wake up for a duration of a join timer, which is 1 second and is by default 3x larger than a heartbeat timer. This is causing unnecessary warnings from the cooperative multitasking module that thinks that we missed the heartbeat time by a lot. Use join timer (1000) instead while joining. Fixes: d4a15647b917 ("ovsdb: raft: Enable cooperative multitasking.") Acked-by: Han Zhou Signed-off-by: Ilya Maximets --- ovsdb/raft.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/ovsdb/raft.c b/ovsdb/raft.c index b171da34552..ec3a0ff661e 100644 --- a/ovsdb/raft.c +++ b/ovsdb/raft.c @@ -280,6 +280,7 @@ struct raft { /* Used for joining a cluster. */ bool joining; /* Attempting to join the cluster? */ struct sset remote_addresses; /* Addresses to try to find other servers. */ +#define RAFT_JOIN_TIMEOUT_MS 1000 long long int join_timeout; /* Time to re-send add server request. */ /* Used for leaving a cluster. 
*/ @@ -1083,7 +1084,7 @@ raft_open(struct ovsdb_log *log, struct raft **raftp) raft_start_election(raft, false, false); } } else { - raft->join_timeout = time_msec() + 1000; + raft->join_timeout = time_msec() + RAFT_JOIN_TIMEOUT_MS; } raft_reset_ping_timer(raft); @@ -2128,7 +2129,7 @@ raft_run(struct raft *raft) } if (raft->joining && time_msec() >= raft->join_timeout) { - raft->join_timeout = time_msec() + 1000; + raft->join_timeout = time_msec() + RAFT_JOIN_TIMEOUT_MS; LIST_FOR_EACH (conn, list_node, &raft->conns) { raft_send_add_server_request(raft, conn); } @@ -2162,10 +2163,12 @@ raft_run(struct raft *raft) raft_reset_ping_timer(raft); } + uint64_t interval = raft->joining + ? RAFT_JOIN_TIMEOUT_MS + : RAFT_TIMER_THRESHOLD(raft->election_timer); cooperative_multitasking_set( &raft_run_cb, (void *) raft, time_msec(), - RAFT_TIMER_THRESHOLD(raft->election_timer) - + RAFT_TIMER_THRESHOLD(raft->election_timer) / 10, "raft_run"); + interval + interval / 10, "raft_run"); /* Do this only at the end; if we did it as soon as we set raft->left or * raft->failed in handling the RemoveServerReply, then it could easily From af5a99737e8af10ce460a190efeccffff1692864 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 26 Mar 2024 18:27:11 +0100 Subject: [PATCH 642/833] ovsdb: raft: Fix permanent joining state on a cluster member. Consider the following chain of events: 1. Have a cluster with 2 members - A and B. A is a leader. 2. C connects to A, sends a request to join the cluster. 3. A catches up C, creates an update for the 'servers' list and sends it to B and C to apply. This entry is not committed yet. 4. Before B or C can reply, A looses leadership for some reason. 5. A sends a joining failure message to C, C remains in joining state. 5. Both B and C have the new version of 'servers', so they recognize each other as valid cluster members. 6. B initiates a vote, C (or A) replies and B becomes a new leader. 7. B has a new list of servers. B commits it. 
C becomes a committed cluster member. 8. A and C receive heartbeats with a new commit index and C is now a committed cluster member for all A, B and C. However, at the end of this process, C is still in joining state as it never received a successful reply for a join request, and C is still in a COMMITTING phase for A. So, C skips some parts of the RAFT life cycle and A will refuse to transfer leadership to C if something happens in the future. More interestingly, B can actually transfer leadership to C and vote for it. A will vote for it just fine as well. After that, C becomes a new cluster leader while still in joining state. In this state C will not commit any changes. So, we have seemingly stable cluster that doesn't commit any changes! E.g.: s3 Address: unix:s3.raft Status: joining cluster Remotes for joining: unix:s3.raft unix:s2.raft unix:s1.raft Role: leader Term: 4 Leader: self Vote: self Last Election started 30095 ms ago, reason: leadership_transfer Last Election won: 30093 ms ago Election timer: 1000 Log: [2, 7] Entries not yet committed: 2 Entries not yet applied: 6 Connections: ->s1 ->s2 <-s1 <-s2 Disconnections: 0 Servers: s3 (60cf at unix:s3.raft) (self) next_index=7 match_index=6 s2 (46aa at unix:s2.raft) next_index=7 match_index=6 last msg 58 ms ago s1 (28f7 at unix:s1.raft) next_index=7 match_index=6 last msg 59 ms ago Fix the first scenario by examining server changes in committed log entries. This way server A can transition C to a STABLE phase and server C can find itself in the committed list of servers and move out from a joining state. This is similar to completing commands without receiving an explicit reply or after the role change from leader to follower. The second scenario with a leader in a joining state can be fixed when the joining server becomes leader. 
New leader's log is getting committed automatically and all servers transition into STABLE phase for it, but it should also move on from a joining state, since it leads the cluster now. It is also possible that B transfers leadership to C before the list of servers is marked as committed on other servers. In this case C will commit it's own addition to the cluster configuration. The added test usually triggers both scenarios, but it will trigger at least one of them. Fixes: 1b1d2e6daa56 ("ovsdb: Introduce experimental support for clustered databases.") Acked-by: Han Zhou Signed-off-by: Ilya Maximets --- ovsdb/raft.c | 44 ++++++++++++++++++++++++++++++++++- tests/ovsdb-cluster.at | 53 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 1 deletion(-) diff --git a/ovsdb/raft.c b/ovsdb/raft.c index ec3a0ff661e..5d7e8873254 100644 --- a/ovsdb/raft.c +++ b/ovsdb/raft.c @@ -386,6 +386,7 @@ static void raft_get_servers_from_log(struct raft *, enum vlog_level); static void raft_get_election_timer_from_log(struct raft *); static bool raft_handle_write_error(struct raft *, struct ovsdb_error *); +static bool raft_has_uncommitted_configuration(const struct raft *); static void raft_run_reconfigure(struct raft *); @@ -2848,6 +2849,18 @@ raft_become_leader(struct raft *raft) raft_reset_election_timer(raft); raft_reset_ping_timer(raft); + if (raft->joining) { + /* It is possible that the server committing this one to the list of + * servers lost leadership before the entry is committed but after + * it was already replicated to majority of servers. In this case + * other servers will recognize this one as a valid cluster member + * and may transfer leadership to it and vote for it. This way + * we're becoming a cluster leader without receiving reply for a + * join request and will commit addition of this server ourselves. 
*/ + VLOG_INFO_RL(&rl, "elected as leader while joining"); + raft->joining = false; + } + struct raft_server *s; HMAP_FOR_EACH (s, hmap_node, &raft->servers) { raft_server_init_leader(raft, s); @@ -3006,12 +3019,12 @@ raft_update_commit_index(struct raft *raft, uint64_t new_commit_index) } while (raft->commit_index < new_commit_index) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); uint64_t index = ++raft->commit_index; const struct raft_entry *e = raft_get_entry(raft, index); if (raft_entry_has_data(e)) { struct raft_command *cmd = raft_find_command_by_eid(raft, &e->eid); - static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); if (cmd) { if (!cmd->index && raft->role == RAFT_LEADER) { @@ -3055,6 +3068,35 @@ raft_update_commit_index(struct raft *raft, uint64_t new_commit_index) * reallocate raft->entries, which would invalidate 'e', so * this case must be last, after the one for 'e->data'. */ raft_run_reconfigure(raft); + } else if (e->servers && !raft_has_uncommitted_configuration(raft)) { + struct ovsdb_error *error; + struct raft_server *s; + struct hmap servers; + + error = raft_servers_from_json(e->servers, &servers); + ovs_assert(!error); + HMAP_FOR_EACH (s, hmap_node, &servers) { + struct raft_server *server = raft_find_server(raft, &s->sid); + + if (server && server->phase == RAFT_PHASE_COMMITTING) { + /* This server lost leadership while committing + * server 's', but it was committed later by a + * new leader. */ + server->phase = RAFT_PHASE_STABLE; + } + + if (raft->joining && uuid_equals(&s->sid, &raft->sid)) { + /* Leadership change happened before previous leader + * could commit the change of a servers list, but it + * was replicated and a new leader committed it. 
*/ + VLOG_INFO_RL(&rl, + "added to configuration without reply " + "(eid: "UUID_FMT", commit index: %"PRIu64")", + UUID_ARGS(&e->eid), index); + raft->joining = false; + } + } + raft_servers_destroy(&servers); } } diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at index 481afc08b32..482e4e02d5c 100644 --- a/tests/ovsdb-cluster.at +++ b/tests/ovsdb-cluster.at @@ -473,6 +473,59 @@ done AT_CLEANUP +AT_SETUP([OVSDB cluster - leadership change after replication while joining]) +AT_KEYWORDS([ovsdb server negative unix cluster join]) + +n=5 +AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db dnl + $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr]) +cid=$(ovsdb-tool db-cid s1.db) +schema_name=$(ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema) +for i in $(seq 2 $n); do + AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft]) +done + +on_exit 'kill $(cat *.pid)' +on_exit " + for i in \$(ls $(pwd)/s[[0-$n]]); do + ovs-appctl --timeout 1 -t \$i cluster/status $schema_name; + done +" + +dnl Starting servers one by one asking all exisitng servers to transfer +dnl leadership after append reply forcing the joining server to try another +dnl one that will also transfer leadership. Since transfer is happening +dnl after the servers update is replicated to other servers, one of the +dnl other servers will actually commit it. It may be a new leader from +dnl one of the old members or the new joining server itself. +for i in $(seq $n); do + dnl Make sure that all already started servers joined the cluster. 
+ for j in $(seq $((i - 1)) ); do + AT_CHECK([ovsdb_client_wait unix:s$j.ovsdb $schema_name connected]) + done + for j in $(seq $((i - 1)) ); do + OVS_WAIT_UNTIL([ovs-appctl -t "$(pwd)"/s$j \ + cluster/failure-test \ + transfer-leadership-after-sending-append-request \ + | grep -q "engaged"]) + done + + AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off \ + --detach --no-chdir --log-file=s$i.log \ + --pidfile=s$i.pid --unixctl=s$i \ + --remote=punix:s$i.ovsdb s$i.db]) +done + +dnl Make sure that all servers joined the cluster. +for i in $(seq $n); do + AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected]) +done + +for i in $(seq $n); do + OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s$i], [s$i.pid]) +done + +AT_CLEANUP OVS_START_SHELL_HELPERS From e987af503975d6d1600e59f7da1bb465eb66aca3 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 26 Mar 2024 18:27:12 +0100 Subject: [PATCH 643/833] ovsdb: raft: Fix assertion when 1-node cluster looses leadership. Some of the failure tests can make a single-node cluster to loose leadership. In this case the next raft_run() will trigger election with a pre-vote enabled. This is causing an assertion when this server attempts to vote for itself. Fix that by not using pre-voting if there is only one server. A new failure test introduced in later commit triggers this assertion every time. 
Fixes: 85634fd58004 ("ovsdb: raft: Support pre-vote mechanism to deal with disruptive server.") Acked-by: Han Zhou Signed-off-by: Ilya Maximets --- ovsdb/raft.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ovsdb/raft.c b/ovsdb/raft.c index 5d7e8873254..0c171b754c4 100644 --- a/ovsdb/raft.c +++ b/ovsdb/raft.c @@ -2120,7 +2120,7 @@ raft_run(struct raft *raft) raft_start_election(raft, true, false); } } else { - raft_start_election(raft, true, false); + raft_start_election(raft, hmap_count(&raft->servers) > 1, false); } } From 5b9feacfc8ffc06837e965c5f17269e62e0a1668 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 26 Mar 2024 18:27:13 +0100 Subject: [PATCH 644/833] ovsdb: raft: Fix inability to join after leadership change round trip. Consider the following sequence of events: 1. Cluster with 2 nodes - A and B. A is a leader. 2. C connects to A and sends a join request. 3. A sends an append request to C. C is in CATCHUP phase for A. 4. A looses leadership to B. Sends join failure notification to C. 5. C sends append reply to A. 6. A discards append reply (not leader). 7. B looses leadership back to A. 8. C sends a new join request to A. 9. A replies with failure (already in progress). 10. GoTo step 8. At this point A is waiting for an append reply that it already discarded at step 6 and fails all the new attempts of C to join with 'already in progress' verdict. C stays forever in a joining state and in a CATCHUP phase from A's perspective. This is a similar case to a sudden disconnect from a leader fixed in commit 999ba294fb4f ("ovsdb: raft: Fix inability to join the cluster after interrupted attempt."), but since we're not disconnecting, the servers are not getting destroyed. Fix that by destroying all the servers that are not yet part of the configuration after leadership is lost. This way, server C will be able to simply re-start the joining process from scratch. 
New failure test command is added in order to simulate leadership change before we receive the append reply, so it gets discarded. New cluster test is added to exercise this scenario. Fixes: 1b1d2e6daa56 ("ovsdb: Introduce experimental support for clustered databases.") Reported-at: https://github.com/ovn-org/ovn/issues/235 Acked-by: Han Zhou Signed-off-by: Ilya Maximets --- ovsdb/raft.c | 16 ++++++++++++- tests/ovsdb-cluster.at | 53 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 1 deletion(-) diff --git a/ovsdb/raft.c b/ovsdb/raft.c index 0c171b754c4..d81a1758a0c 100644 --- a/ovsdb/raft.c +++ b/ovsdb/raft.c @@ -81,6 +81,7 @@ enum raft_failure_test { FT_STOP_RAFT_RPC, FT_TRANSFER_LEADERSHIP, FT_TRANSFER_LEADERSHIP_AFTER_SEND_APPEND_REQ, + FT_TRANSFER_LEADERSHIP_AFTER_STARTING_TO_ADD, }; static enum raft_failure_test failure_test; @@ -2740,15 +2741,22 @@ raft_become_follower(struct raft *raft) * new configuration. Our AppendEntries processing will properly update * the server configuration later, if necessary. * + * However, since we're sending replies about a failure to add, those new + * servers has to be cleaned up. Otherwise, they will stuck in a 'CATCHUP' + * phase in case this server regains leadership before they join through + * the current new leader. They are not yet in 'raft->servers', so not + * part of the shared configuration. + * * Also we do not complete commands here, as they can still be completed * if their log entries have already been replicated to other servers. * If the entries were actually committed according to the new leader, our * AppendEntries processing will complete the corresponding commands. 
*/ struct raft_server *s; - HMAP_FOR_EACH (s, hmap_node, &raft->add_servers) { + HMAP_FOR_EACH_POP (s, hmap_node, &raft->add_servers) { raft_send_add_server_reply__(raft, &s->sid, s->address, false, RAFT_SERVER_LOST_LEADERSHIP); + raft_server_destroy(s); } if (raft->remove_server) { raft_send_remove_server_reply__(raft, &raft->remove_server->sid, @@ -4023,6 +4031,10 @@ raft_handle_add_server_request(struct raft *raft, "to cluster "CID_FMT, s->nickname, SID_ARGS(&s->sid), rq->address, CID_ARGS(&raft->cid)); raft_send_append_request(raft, s, 0, "initialize new server"); + + if (failure_test == FT_TRANSFER_LEADERSHIP_AFTER_STARTING_TO_ADD) { + failure_test = FT_TRANSFER_LEADERSHIP; + } } static void @@ -5148,6 +5160,8 @@ raft_unixctl_failure_test(struct unixctl_conn *conn OVS_UNUSED, } else if (!strcmp(test, "transfer-leadership-after-sending-append-request")) { failure_test = FT_TRANSFER_LEADERSHIP_AFTER_SEND_APPEND_REQ; + } else if (!strcmp(test, "transfer-leadership-after-starting-to-add")) { + failure_test = FT_TRANSFER_LEADERSHIP_AFTER_STARTING_TO_ADD; } else if (!strcmp(test, "transfer-leadership")) { failure_test = FT_TRANSFER_LEADERSHIP; } else if (!strcmp(test, "clear")) { diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at index 482e4e02d5c..9d8b4d06a4a 100644 --- a/tests/ovsdb-cluster.at +++ b/tests/ovsdb-cluster.at @@ -525,6 +525,59 @@ for i in $(seq $n); do OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s$i], [s$i.pid]) done +AT_CLEANUP + +AT_SETUP([OVSDB cluster - leadership change before replication while joining]) +AT_KEYWORDS([ovsdb server negative unix cluster join]) + +n=5 +AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db dnl + $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr]) +cid=$(ovsdb-tool db-cid s1.db) +schema_name=$(ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema) +for i in $(seq 2 $n); do + AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft]) +done + +on_exit 'kill 
$(cat *.pid)' +on_exit " + for i in \$(ls $(pwd)/s[[0-$n]]); do + ovs-appctl --timeout 1 -t \$i cluster/status $schema_name; + done +" + +dnl Starting servers one by one asking all exisitng servers to transfer +dnl leadership right after starting to add a server. Joining server will +dnl need to find a new leader that will also transfer leadership. +dnl This will continue until the same server will not become a leader +dnl for the second time and will be able to add a new server. +for i in $(seq $n); do + dnl Make sure that all already started servers joined the cluster. + for j in $(seq $((i - 1)) ); do + AT_CHECK([ovsdb_client_wait unix:s$j.ovsdb $schema_name connected]) + done + for j in $(seq $((i - 1)) ); do + OVS_WAIT_UNTIL([ovs-appctl -t "$(pwd)"/s$j \ + cluster/failure-test \ + transfer-leadership-after-starting-to-add \ + | grep -q "engaged"]) + done + + AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off \ + --detach --no-chdir --log-file=s$i.log \ + --pidfile=s$i.pid --unixctl=s$i \ + --remote=punix:s$i.ovsdb s$i.db]) +done + +dnl Make sure that all servers joined the cluster. +for i in $(seq $n); do + AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected]) +done + +for i in $(seq $n); do + OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s$i], [s$i.pid]) +done + AT_CLEANUP From 95ff912edef88160f50f01ddd906869ba6d5b9ba Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 22 Mar 2024 15:11:17 +0000 Subject: [PATCH 645/833] appveyor: Prepare for rename of primary development branch. Recently OVS adopted a policy of using the inclusive naming word list v1 [1, 2]. And in keeping with this policy it is intended to rename the primary development branch from master to main [3]. In order to help facilitate this change allow Appveyor to run on the main as well as master branch. It is intended that master branch will be removed from appveyor.yml after the primary branch has been renamed. Also, update the string included in artifacts from 'master' to 'main'. 
[1] df5e5cf4318a ("Documentation: Add section on inclusive language.") [2] https://inclusivenaming.org/word-lists/ [3] https://mail.openvswitch.org/pipermail/ovs-dev/2024-March/412686.html Signed-off-by: Simon Horman Acked-by: Ilya Maximets --- appveyor.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 29cc44d6c6f..050c7dead78 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -2,6 +2,7 @@ version: 1.0.{build} image: Visual Studio 2019 branches: only: + - main - master configuration: - Debug @@ -74,6 +75,6 @@ build_script: c:\OpenvSwitch-$env:CONFIGURATION.msi after_build: -- ps: 7z a C:\ovs-master-$env:CONFIGURATION.zip C:\openvswitch -- ps: Push-AppveyorArtifact C:\ovs-master-$env:CONFIGURATION.zip +- ps: 7z a C:\ovs-main-$env:CONFIGURATION.zip C:\openvswitch +- ps: Push-AppveyorArtifact C:\ovs-main-$env:CONFIGURATION.zip - ps: Push-AppveyorArtifact C:\OpenvSwitch-$env:CONFIGURATION.msi From 2324596f115166a771c338cd79ca7fb70cf8d8cb Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 22 Mar 2024 15:56:54 +0000 Subject: [PATCH 646/833] Documentation: Update references to kernel datapath in OVS tree. The Kernel datapath is no longer present in the primary development branch of the OVS tree. Update documentation to more clearly reflect this. Documentation relating to the kernel datapath in the OVS tree can be removed once 2.17 is EOL. Also, update wording of affected text as there is more than one upstream networking maintainer these days. 
Signed-off-by: Simon Horman Acked-by: Ilya Maximets --- Documentation/faq/releases.rst | 18 +++++++++++++++--- .../contributing/backporting-patches.rst | 13 ++++++++----- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/Documentation/faq/releases.rst b/Documentation/faq/releases.rst index 49b987b610c..26973c2adc3 100644 --- a/Documentation/faq/releases.rst +++ b/Documentation/faq/releases.rst @@ -110,7 +110,7 @@ Q: Are all features available with all datapaths? Linux OVS tree The datapath implemented by the Linux kernel module distributed with the OVS source tree. This datapath is deprecated starting with OVS - 2.15.x and support capped at Linux kernel version 5.8. As of OVS 3.0.x + 2.15 and support capped at Linux kernel version 5.8. As of OVS 3.0 the Linux OVS tree is no longer supported. Userspace @@ -256,8 +256,11 @@ Q: I get an error like this when I configure Open vSwitch: that one, because it may support the kernel that you are building against. (To find out, consult the table in the previous FAQ.) - - The Open vSwitch "master" branch may support the kernel that you are - using, so consider building the kernel module from "master". + - For Open vSwitch releases prior to 3.0, the corresponding Open + vSwitch branch may support the kernel that you are using, so consider + building the kernel module from that branch. For Open vSwitch 2.17, + the only non EOL release to which this applies, the branch is + "branch-2.17". All versions of Open vSwitch userspace are compatible with all versions of the Open vSwitch kernel module, so you do not have to use the kernel module @@ -277,6 +280,9 @@ ships as part of the upstream Linux kernel? supported, consider upgrading to a newer upstream Linux release or using the kernel module paired with the userspace distribution. + Please note that as of Open vSwitch 3.0 the kernel module is no longer + part of the Open vSwitch distribution. 
+ Q: Why do tunnels not work when using a kernel module other than the one packaged with Open vSwitch? @@ -303,6 +309,9 @@ packaged with Open vSwitch? doing this, check to make sure that the module that is loaded is the one you expect. + Please note that as of Open vSwitch 3.0 the kernel module is no longer + part of the Open vSwitch distribution. + Q: Why are UDP tunnel checksums not computed for VXLAN or Geneve? A: Generating outer UDP checksums requires kernel support that was not part @@ -311,6 +320,9 @@ Q: Why are UDP tunnel checksums not computed for VXLAN or Geneve? out-of-tree modules from Open vSwitch release 2.4 and later support UDP checksums. + Please note that as of Open vSwitch 3.0 the kernel module is no longer + part of the Open vSwitch distribution. + Q: What features are not available when using the userspace datapath? A: Tunnel virtual ports are not supported, as described in the previous diff --git a/Documentation/internals/contributing/backporting-patches.rst b/Documentation/internals/contributing/backporting-patches.rst index 0ef7f5beb9b..04bb0fc350f 100644 --- a/Documentation/internals/contributing/backporting-patches.rst +++ b/Documentation/internals/contributing/backporting-patches.rst @@ -52,11 +52,14 @@ branches. For Linux datapath code, the primary development branch is in the `net-next`_ tree as described in the section below, and patch discussion occurs on the `netdev`__ mailing list. Patches are first applied to the upstream branch by the -networking maintainer, then the contributor backports the patch to the Open -vSwitch `master` development branch. Patches in this category may include -features which have been applied upstream, or bugfixes to the Open vSwitch -datapath code. For bugfixes, the patches subsequently follow the regular Open -vSwitch process as described above to reach older branches. +networking maintainers, then the contributor backports the patch to an Open +vSwitch branch. 
Patches in this category may include features which have +been applied upstream, or bugfixes to the Open vSwitch datapath code. + +The practice for Linux datapath code described above is currently only +applicable to bugfixes for Open vSwitch 2.17. This is because all earlier +versions are EOL and all subsequent versions do not include the Linux +datapath as it is now maintained as part of the upstream Linux kernel. __ https://lore.kernel.org/netdev/ From b674e7e04e1d37400c3946f1d3ebe81c99501f7e Mon Sep 17 00:00:00 2001 From: Paolo Valerio Date: Thu, 28 Mar 2024 17:18:46 +0100 Subject: [PATCH 647/833] conntrack: Fix SNAT with exhaustion system test. Recent kernels introduced a mechanism that allows to evict colliding entries in a closing state whereas they were previously considered as parts of a non-recoverable clash. This new behavior makes "conntrack - SNAT with port range with exhaustion test" fail, as it relies on the previous assumptions. Fix it by creating and not advancing the first entry in SYN_SENT to avoid early eviction. Suggested-by: Ilya Maximets Reported-at: https://issues.redhat.com/browse/FDP-486 Signed-off-by: Paolo Valerio Signed-off-by: Ilya Maximets --- tests/system-traffic.at | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 2d12d558ec2..20b011b7e19 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -6388,7 +6388,6 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([conntrack - SNAT with port range with exhaustion]) -OVS_CHECK_GITHUB_ACTION() CHECK_CONNTRACK() CHECK_CONNTRACK_NAT() OVS_TRAFFIC_VSWITCHD_START() @@ -6398,11 +6397,11 @@ ADD_NAMESPACES(at_ns0, at_ns1) ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") NS_CHECK_EXEC([at_ns0], [ip link set dev p0 address 80:88:88:88:88:88]) ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") +NS_CHECK_EXEC([at_ns1], [ip link set dev p1 address 80:89:89:89:89:89]) dnl Allow any traffic from ns0->ns1. 
Only allow nd, return traffic from ns1->ns0. AT_DATA([flows.txt], [dnl -in_port=1,tcp,action=ct(commit,zone=1,nat(src=10.1.1.240:34568,random)),2 -in_port=2,ct_state=-trk,tcp,tp_dst=34567,action=ct(table=0,zone=1,nat) +in_port=1,tcp,action=ct(commit,zone=1,nat(src=10.1.1.240:34568)),2 in_port=2,ct_state=-trk,tcp,tp_dst=34568,action=ct(table=0,zone=1,nat) in_port=2,ct_state=+trk,ct_zone=1,tcp,action=1 dnl @@ -6426,17 +6425,28 @@ AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) dnl HTTP requests from p0->p1 should work fine. OVS_START_L7([at_ns1], [http]) -NS_CHECK_EXEC([at_ns0], [wget 10.1.1.2 -t 1 -T 1 --retry-connrefused -v -o wget0.log]) + +dnl Send a valid SYN to make conntrack pick it up. +dnl The source port used is 123 to prevent unwanted reuse in the next HTTP request. +syn_pkt=$(ovs-ofctl compose-packet --bare "eth_src=80:88:88:88:88:88,eth_dst=80:89:89:89:89:89,\ + dl_type=0x0800,nw_src=10.1.1.1,nw_dst=10.1.1.2,nw_proto=6,nw_ttl=64,nw_frag=no,tcp_flags=syn,\ + tcp_src=123,tcp_dst=80") +AT_CHECK([ovs-ofctl packet-out br0 "packet=${syn_pkt} actions=ct(commit,zone=1,nat(src=10.1.1.240:34568))"]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2) | uniq], [0], [dnl +tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),reply=(src=10.1.1.2,dst=10.1.1.240,sport=,dport=),zone=1,protoinfo=(state=) +]) NS_CHECK_EXEC([at_ns0], [wget 10.1.1.2 -t 1 -T 1 --retry-connrefused -v -o wget0.log], [4]) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2) | sed -e 's/dst=10.1.1.2[[45]][[0-9]]/dst=10.1.1.2XX/' | uniq], [0], [dnl -tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),reply=(src=10.1.1.2,dst=10.1.1.2XX,sport=,dport=),zone=1,protoinfo=(state=) +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2) | uniq], [0], [dnl +tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),reply=(src=10.1.1.2,dst=10.1.1.240,sport=,dport=),zone=1,protoinfo=(state=) ]) OVS_TRAFFIC_VSWITCHD_STOP(["dnl /Unable to NAT due to tuple space exhaustion - if 
DoS attack, use firewalling and\/or zone partitioning./d -/Dropped .* log messages in last .* seconds \(most recently, .* seconds ago\) due to excessive rate/d"]) +/Dropped .* log messages in last .* seconds \(most recently, .* seconds ago\) due to excessive rate/d +/|WARN|.* execute ct.* failed/d"]) AT_CLEANUP AT_SETUP([conntrack - more complex SNAT]) From b5e6829254cc74ed100392dd22e9db3c93b2974e Mon Sep 17 00:00:00 2001 From: Paolo Valerio Date: Thu, 28 Mar 2024 17:56:08 +0100 Subject: [PATCH 648/833] conntrack: Do not use icmp reverse helper for icmpv6. In the flush tuple code path, while populating the conn_key, reverse_icmp_type() gets called for both icmp and icmpv6 cases, while, depending on the proto, its respective helper should be called, instead. The above leads to an abort: [...] __GI_abort () at abort.c:79 reverse_icmp_type (type=128 '\200') at lib/conntrack.c:1795 tuple_to_conn_key (...) at lib/conntrack.c:2590 in conntrack_flush_tuple (...) at lib/conntrack.c:2787 in dpif_netdev_ct_flush (...) at lib/dpif-netdev.c:9618 ct_dpif_flush_tuple (...) at lib/ct-dpif.c:331 ct_dpif_flush (...) at lib/ct-dpif.c:361 dpctl_flush_conntrack (...) at lib/dpctl.c:1797 [...] Fix it by calling reverse_icmp6_type() when needed. Furthermore, self tests have been modified in order to exercise and check this behavior. 
Fixes: 271e48a0e244 ("conntrack: Support conntrack flush by ct 5-tuple") Reported-at: https://issues.redhat.com/browse/FDP-447 Signed-off-by: Paolo Valerio Signed-off-by: Ilya Maximets --- lib/conntrack.c | 4 +++- tests/system-traffic.at | 10 +++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/lib/conntrack.c b/lib/conntrack.c index 5786424f6d9..7e3ed0ee009 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -2586,7 +2586,9 @@ tuple_to_conn_key(const struct ct_dpif_tuple *tuple, uint16_t zone, key->src.icmp_type = tuple->icmp_type; key->src.icmp_code = tuple->icmp_code; key->dst.icmp_id = tuple->icmp_id; - key->dst.icmp_type = reverse_icmp_type(tuple->icmp_type); + key->dst.icmp_type = (tuple->ip_proto == IPPROTO_ICMP) + ? reverse_icmp_type(tuple->icmp_type) + : reverse_icmp6_type(tuple->icmp_type); key->dst.icmp_code = tuple->icmp_code; } else { key->src.port = tuple->src_port; diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 20b011b7e19..0008bc1720a 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -3103,7 +3103,10 @@ AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0], [dnl icmp,orig=(src=10.1.1.1,dst=10.1.1.2,id=,type=8,code=0),reply=(src=10.1.1.2,dst=10.1.1.1,id=,type=0,code=0) ]) -AT_CHECK([ovs-appctl dpctl/flush-conntrack]) +AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2']) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0], [dnl +]) dnl Pings from ns1->ns0 should fail. 
NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -w 2 10.1.1.1 | FORMAT_PING], [0], [dnl @@ -3244,6 +3247,11 @@ AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(fc00::2)], [0], [dnl icmpv6,orig=(src=fc00::1,dst=fc00::2,id=,type=128,code=0),reply=(src=fc00::2,dst=fc00::1,id=,type=129,code=0) ]) +AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_ipv6_src=fc00::1,ct_ipv6_dst=fc00::2']) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(fc00::2)], [0], [dnl +]) + OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP From 6439d694ae05b07823b5ca21a089b4ef9ab6b2dd Mon Sep 17 00:00:00 2001 From: Felix Huettner Date: Mon, 11 Mar 2024 14:15:47 +0100 Subject: [PATCH 649/833] util: Support checking for kernel versions. Extract checking for a given kernel version to a separate function. It will be used also in the next patch. Acked-by: Mike Pattrick Acked-by: Eelco Chaudron Acked-by: Aaron Conole Signed-off-by: Felix Huettner Signed-off-by: Ilya Maximets --- lib/netdev-linux.c | 15 +++------------ lib/util.c | 27 +++++++++++++++++++++++++++ lib/util.h | 4 ++++ 3 files changed, 34 insertions(+), 12 deletions(-) diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 1e904d8e631..25349c605cd 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include @@ -6428,18 +6427,10 @@ getqdisc_is_safe(void) static bool safe = false; if (ovsthread_once_start(&once)) { - struct utsname utsname; - int major, minor; - - if (uname(&utsname) == -1) { - VLOG_WARN("uname failed (%s)", ovs_strerror(errno)); - } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) { - VLOG_WARN("uname reported bad OS release (%s)", utsname.release); - } else if (major < 2 || (major == 2 && minor < 35)) { - VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s", - utsname.release); - } else { + if (ovs_kernel_is_version_or_newer(2, 35)) { safe = true; + } else { + VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel"); } 
ovsthread_once_done(&once); } diff --git a/lib/util.c b/lib/util.c index 3fb3a4b40fd..5c31d983a66 100644 --- a/lib/util.c +++ b/lib/util.c @@ -27,6 +27,7 @@ #include #ifdef __linux__ #include +#include <sys/utsname.h> #endif #include #include @@ -2500,3 +2501,29 @@ OVS_CONSTRUCTOR(winsock_start) { } } #endif + +#ifdef __linux__ +bool +ovs_kernel_is_version_or_newer(int target_major, int target_minor) +{ + static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; + static int current_major, current_minor = -1; + + if (ovsthread_once_start(&once)) { + struct utsname utsname; + + if (uname(&utsname) == -1) { + VLOG_WARN("uname failed (%s)", ovs_strerror(errno)); + } else if (!ovs_scan(utsname.release, "%d.%d", + &current_major, &current_minor)) { + VLOG_WARN("uname reported bad OS release (%s)", utsname.release); + } + ovsthread_once_done(&once); + } + if (current_major == -1 || current_minor == -1) { + return false; + } + return current_major > target_major || ( + current_major == target_major && current_minor >= target_minor); +} +#endif diff --git a/lib/util.h b/lib/util.h index f2d45bcac8a..55718fd87ca 100644 --- a/lib/util.h +++ b/lib/util.h @@ -611,4 +611,8 @@ int ftruncate(int fd, off_t length); } #endif +#ifdef __linux__ +bool ovs_kernel_is_version_or_newer(int target_major, int target_minor); +#endif + #endif /* util.h */ From 9ec849e8aa869b646c372fac552ae2609a4b5f66 Mon Sep 17 00:00:00 2001 From: Felix Huettner Date: Mon, 11 Mar 2024 14:16:00 +0100 Subject: [PATCH 650/833] netlink-conntrack: Optimize flushing ct zone. Previously the kernel did not provide a netlink interface to flush/list only conntrack entries matching a specific zone. With [1] and [2] it is now possible to flush and list conntrack entries filtered by zone. Older kernels not yet supporting this feature will ignore the filter. For the list request that means just returning all entries (which we can then filter in userspace as before). For the flush request that means deleting all conntrack entries. 
The implementation is now identical to the Windows one, so we combine them. This significantly improves the performance of flushing conntrack zones when the conntrack table is large. Since flushing a conntrack zone is normally triggered via an OpenFlow command it blocks the main ovs thread and thereby also blocks new flows from being applied. Using this new feature we can reduce the flushing time for zones by around 93%. In combination with OVN the creation of a Logical_Router (which causes the flushing of a ct zone) could block other operations, e.g. the failover of Logical_Routers (as they cause new flows to be created). This is visible from a user perspective as an ovn-controller that is idle (as it waits for vswitchd) and vswitchd reporting: "blocked 1000 ms waiting for main to quiesce" (potentially with ever increasing times). The following performance tests were run in a qemu vm with 500,000 conntrack entries distributed evenly over 500 ct zones using `ovstest test-netlink-conntrack flush zone=<zone>`. 
| flush zone with 1000 entries | flush zone with no entry | +---------------------+----------+---------------------+----------| | with the patch | without | with the patch | without | +----------+----------+----------+----------+----------+----------| | v6.8-rc4 | v6.7.1 | v6.8-rc4 | v6.8-rc4 | v6.7.1 | v6.8-rc4 | +---------+----------+----------+----------+----------+----------+----------| | Min | 0.260 | 3.946 | 3.497 | 0.228 | 3.462 | 3.212 | | Median | 0.319 | 4.237 | 4.349 | 0.298 | 4.460 | 4.010 | | 90%ile | 0.335 | 4.367 | 4.522 | 0.325 | 4.662 | 4.572 | | 99%ile | 0.348 | 4.495 | 4.773 | 0.340 | 4.931 | 6.003 | | Max | 0.362 | 4.543 | 5.054 | 0.348 | 5.390 | 6.396 | | Mean | 0.320 | 4.236 | 4.331 | 0.296 | 4.430 | 4.071 | | Total | 80.02 | 1058 | 1082 | 73.93 | 1107 | 1017 | [1]: https://github.com/torvalds/linux/commit/eff3c558bb7e61c41b53e4c8130e514a5a4df9ba [2]: https://github.com/torvalds/linux/commit/fa173a1b4e3fd1ab5451cbc57de6fc624c824b0a Acked-by: Mike Pattrick Acked-by: Aaron Conole Co-Authored-By: Luca Czesla Signed-off-by: Luca Czesla Co-Authored-By: Max Lamprecht Signed-off-by: Max Lamprecht Signed-off-by: Felix Huettner Signed-off-by: Ilya Maximets --- lib/netlink-conntrack.c | 52 ++++++++++++++++++++++++++++++++++++----- tests/system-traffic.at | 28 ++++++++++++++++++++++ 2 files changed, 74 insertions(+), 6 deletions(-) diff --git a/lib/netlink-conntrack.c b/lib/netlink-conntrack.c index 492bfcffb8a..0b3a8adf590 100644 --- a/lib/netlink-conntrack.c +++ b/lib/netlink-conntrack.c @@ -141,6 +141,9 @@ nl_ct_dump_start(struct nl_ct_dump_state **statep, const uint16_t *zone, nl_msg_put_nfgenmsg(&state->buf, 0, AF_UNSPEC, NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_GET, NLM_F_REQUEST); + if (zone) { + nl_msg_put_be16(&state->buf, CTA_ZONE, htons(*zone)); + } nl_dump_start(&state->dump, NETLINK_NETFILTER, &state->buf); ofpbuf_clear(&state->buf); @@ -263,11 +266,9 @@ nl_ct_flush_tuple(const struct ct_dpif_tuple *tuple, uint16_t zone) return err; } -#ifdef 
_WIN32 -int -nl_ct_flush_zone(uint16_t flush_zone) +static int +nl_ct_flush_zone_with_cta_zone(uint16_t flush_zone) { - /* Windows can flush a specific zone */ struct ofpbuf buf; int err; @@ -282,24 +283,63 @@ nl_ct_flush_zone(uint16_t flush_zone) return err; } + +#ifdef _WIN32 +int +nl_ct_flush_zone(uint16_t flush_zone) +{ + return nl_ct_flush_zone_with_cta_zone(flush_zone); +} #else + +static bool +netlink_flush_supports_zone(void) +{ + static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; + static bool supported = false; + + if (ovsthread_once_start(&once)) { + if (ovs_kernel_is_version_or_newer(6, 8)) { + supported = true; + } else { + VLOG_INFO("Disabling conntrack flush by zone. " + "Not supported in Linux kernel."); + } + ovsthread_once_done(&once); + } + return supported; +} + int nl_ct_flush_zone(uint16_t flush_zone) { - /* Apparently, there's no netlink interface to flush a specific zone. + /* In older kernels, there was no netlink interface to flush a specific + * conntrack zone. * This code dumps every connection, checks the zone and eventually * delete the entry. + * In newer kernels there is the option to specify a zone for filtering + * during dumps. Older kernels ignore this option. We set it here in the + * hope we only get relevant entries back, but fall back to filtering here + * to keep compatibility. + * + * This is race-prone, but it is better than using shell scripts. * - * This is race-prone, but it is better than using shell scripts. */ + * Additionally newer kernels also support flushing a zone without listing + * it first. 
*/ struct nl_dump dump; struct ofpbuf buf, reply, delete; + if (netlink_flush_supports_zone()) { + return nl_ct_flush_zone_with_cta_zone(flush_zone); + } + ofpbuf_init(&buf, NL_DUMP_BUFSIZE); ofpbuf_init(&delete, NL_DUMP_BUFSIZE); nl_msg_put_nfgenmsg(&buf, 0, AF_UNSPEC, NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_GET, NLM_F_REQUEST); + nl_msg_put_be16(&buf, CTA_ZONE, htons(flush_zone)); nl_dump_start(&dump, NETLINK_NETFILTER, &buf); ofpbuf_clear(&buf); diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 0008bc1720a..bc850d9449d 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -3069,6 +3069,34 @@ AT_CHECK([grep -q "failed to parse mark" stderr]) AT_CHECK([FLUSH_CMD labels=invalid], [ignore], [ignore], [stderr]) AT_CHECK([grep -q "failed to parse labels" stderr]) + +dnl Test UDP from port 1 and 2, partial flush by zone. +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) + + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1," | sort], [0], [dnl +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 +]) + +AT_CHECK([FLUSH_CMD zone=5]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [0], [dnl +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 +]) + +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) + +AT_CHECK([FLUSH_CMD zone=0]) + 
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [0], [dnl +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 +]) + +AT_CHECK([FLUSH_CMD]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [1]) ]) OVS_TRAFFIC_VSWITCHD_STOP From 8bb065961e54d0588317b71a73d7908bcb7fe4ca Mon Sep 17 00:00:00 2001 From: Eric Garver Date: Wed, 3 Apr 2024 10:35:28 -0400 Subject: [PATCH 651/833] dpif: Stub out unimplemented action OVS_ACTION_ATTR_DEC_TTL. This is prep for adding a different OVS_ACTION_ATTR_ enum value. This action, OVS_ACTION_ATTR_DEC_TTL, is not actually implemented. However, to make -Werror happy we must add a case to all existing switches. Acked-by: Eelco Chaudron Signed-off-by: Eric Garver Signed-off-by: Ilya Maximets --- include/linux/openvswitch.h | 9 +++++++++ lib/dpif-netdev.c | 1 + lib/dpif.c | 1 + lib/odp-execute.c | 2 ++ lib/odp-util.c | 23 +++++++++++++++++++++++ ofproto/ofproto-dpif-ipfix.c | 1 + ofproto/ofproto-dpif-sflow.c | 1 + 7 files changed, 38 insertions(+) diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h index e305c331516..d18754c84f6 100644 --- a/include/linux/openvswitch.h +++ b/include/linux/openvswitch.h @@ -1085,6 +1085,7 @@ enum ovs_action_attr { OVS_ACTION_ATTR_CLONE, /* Nested OVS_CLONE_ATTR_*. */ OVS_ACTION_ATTR_CHECK_PKT_LEN, /* Nested OVS_CHECK_PKT_LEN_ATTR_*. */ OVS_ACTION_ATTR_ADD_MPLS, /* struct ovs_action_add_mpls. */ + OVS_ACTION_ATTR_DEC_TTL, /* Nested OVS_DEC_TTL_ATTR_*. */ #ifndef __KERNEL__ OVS_ACTION_ATTR_TUNNEL_PUSH, /* struct ovs_action_push_tnl*/ @@ -1183,6 +1184,14 @@ struct ovs_zone_limit { __u32 count; }; +enum ovs_dec_ttl_attr { + OVS_DEC_TTL_ATTR_UNSPEC, + OVS_DEC_TTL_ATTR_ACTION, /* Nested struct nlattr */ + __OVS_DEC_TTL_ATTR_MAX +}; + +#define OVS_DEC_TTL_ATTR_MAX (__OVS_DEC_TTL_ATTR_MAX - 1) + #define OVS_CLONE_ATTR_EXEC 0 /* Specify an u32 value. 
When nonzero, * actions in clone will not change flow * keys. False otherwise. diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 7e637ff8ac6..c7f9e149025 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -9518,6 +9518,7 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_, case OVS_ACTION_ATTR_CHECK_PKT_LEN: case OVS_ACTION_ATTR_DROP: case OVS_ACTION_ATTR_ADD_MPLS: + case OVS_ACTION_ATTR_DEC_TTL: case __OVS_ACTION_ATTR_MAX: OVS_NOT_REACHED(); } diff --git a/lib/dpif.c b/lib/dpif.c index d07241f1e7c..0f480bec48d 100644 --- a/lib/dpif.c +++ b/lib/dpif.c @@ -1289,6 +1289,7 @@ dpif_execute_helper_cb(void *aux_, struct dp_packet_batch *packets_, case OVS_ACTION_ATTR_CHECK_PKT_LEN: case OVS_ACTION_ATTR_DROP: case OVS_ACTION_ATTR_ADD_MPLS: + case OVS_ACTION_ATTR_DEC_TTL: case __OVS_ACTION_ATTR_MAX: OVS_NOT_REACHED(); } diff --git a/lib/odp-execute.c b/lib/odp-execute.c index eb03b57c42e..081e4d43268 100644 --- a/lib/odp-execute.c +++ b/lib/odp-execute.c @@ -837,6 +837,7 @@ requires_datapath_assistance(const struct nlattr *a) case OVS_ACTION_ATTR_CT_CLEAR: case OVS_ACTION_ATTR_CHECK_PKT_LEN: case OVS_ACTION_ATTR_ADD_MPLS: + case OVS_ACTION_ATTR_DEC_TTL: case OVS_ACTION_ATTR_DROP: return false; @@ -1227,6 +1228,7 @@ odp_execute_actions(void *dp, struct dp_packet_batch *batch, bool steal, case OVS_ACTION_ATTR_RECIRC: case OVS_ACTION_ATTR_CT: case OVS_ACTION_ATTR_UNSPEC: + case OVS_ACTION_ATTR_DEC_TTL: case __OVS_ACTION_ATTR_MAX: /* The following actions are handled by the scalar implementation. 
*/ case OVS_ACTION_ATTR_POP_VLAN: diff --git a/lib/odp-util.c b/lib/odp-util.c index 9306c9b4d47..21f34d95571 100644 --- a/lib/odp-util.c +++ b/lib/odp-util.c @@ -143,6 +143,7 @@ odp_action_len(uint16_t type) case OVS_ACTION_ATTR_POP_NSH: return 0; case OVS_ACTION_ATTR_CHECK_PKT_LEN: return ATTR_LEN_VARIABLE; case OVS_ACTION_ATTR_ADD_MPLS: return sizeof(struct ovs_action_add_mpls); + case OVS_ACTION_ATTR_DEC_TTL: return ATTR_LEN_VARIABLE; case OVS_ACTION_ATTR_DROP: return sizeof(uint32_t); case OVS_ACTION_ATTR_UNSPEC: @@ -1130,6 +1131,25 @@ format_odp_check_pkt_len_action(struct ds *ds, const struct nlattr *attr, ds_put_cstr(ds, "))"); } +static void +format_dec_ttl_action(struct ds *ds, const struct nlattr *attr, + const struct hmap *portno_names) +{ + const struct nlattr *a; + unsigned int left; + + ds_put_cstr(ds,"dec_ttl(le_1("); + NL_ATTR_FOR_EACH (a, left, + nl_attr_get(attr), nl_attr_get_size(attr)) { + if (nl_attr_type(a) == OVS_DEC_TTL_ATTR_ACTION) { + format_odp_actions(ds, nl_attr_get(a), + nl_attr_get_size(a), portno_names); + break; + } + } + ds_put_format(ds, "))"); +} + static void format_odp_action(struct ds *ds, const struct nlattr *a, const struct hmap *portno_names) @@ -1283,6 +1303,9 @@ format_odp_action(struct ds *ds, const struct nlattr *a, ntohs(mpls->mpls_ethertype)); break; } + case OVS_ACTION_ATTR_DEC_TTL: + format_dec_ttl_action(ds, a, portno_names); + break; case OVS_ACTION_ATTR_DROP: ds_put_cstr(ds, "drop"); break; diff --git a/ofproto/ofproto-dpif-ipfix.c b/ofproto/ofproto-dpif-ipfix.c index e6c2968f7e9..cd65dae7e18 100644 --- a/ofproto/ofproto-dpif-ipfix.c +++ b/ofproto/ofproto-dpif-ipfix.c @@ -3135,6 +3135,7 @@ dpif_ipfix_read_actions(const struct flow *flow, case OVS_ACTION_ATTR_UNSPEC: case OVS_ACTION_ATTR_DROP: case OVS_ACTION_ATTR_ADD_MPLS: + case OVS_ACTION_ATTR_DEC_TTL: case __OVS_ACTION_ATTR_MAX: default: break; diff --git a/ofproto/ofproto-dpif-sflow.c b/ofproto/ofproto-dpif-sflow.c index a3c83bac815..4a68e9b949b 100644 --- 
a/ofproto/ofproto-dpif-sflow.c +++ b/ofproto/ofproto-dpif-sflow.c @@ -1236,6 +1236,7 @@ dpif_sflow_read_actions(const struct flow *flow, case OVS_ACTION_ATTR_CHECK_PKT_LEN: case OVS_ACTION_ATTR_DROP: case OVS_ACTION_ATTR_ADD_MPLS: + case OVS_ACTION_ATTR_DEC_TTL: case __OVS_ACTION_ATTR_MAX: default: break; From dec09fd7b28e9786ad365159a293a0bc346c0171 Mon Sep 17 00:00:00 2001 From: Eric Garver Date: Wed, 3 Apr 2024 10:35:29 -0400 Subject: [PATCH 652/833] dpif: Make get_datapath_cap() access support by pointer. This avoids copying the support struct onto the stack. Acked-by: Eelco Chaudron Signed-off-by: Eric Garver Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif.c | 59 +++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index fe034f9717b..8ff165d1ec4 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -5694,47 +5694,46 @@ ct_zone_limit_protection_update(const char *datapath_type, bool protected) static void get_datapath_cap(const char *datapath_type, struct smap *cap) { - struct odp_support odp; - struct dpif_backer_support s; + struct dpif_backer_support *s; struct dpif_backer *backer = shash_find_data(&all_dpif_backers, datapath_type); if (!backer) { return; } - s = backer->rt_support; - odp = s.odp; + s = &backer->rt_support; /* ODP_SUPPORT_FIELDS */ smap_add_format(cap, "max_vlan_headers", "%"PRIuSIZE, - odp.max_vlan_headers); - smap_add_format(cap, "max_mpls_depth", "%"PRIuSIZE, odp.max_mpls_depth); - smap_add(cap, "recirc", odp.recirc ? "true" : "false"); - smap_add(cap, "ct_state", odp.ct_state ? "true" : "false"); - smap_add(cap, "ct_zone", odp.ct_zone ? "true" : "false"); - smap_add(cap, "ct_mark", odp.ct_mark ? "true" : "false"); - smap_add(cap, "ct_label", odp.ct_label ? "true" : "false"); - smap_add(cap, "ct_state_nat", odp.ct_state_nat ? "true" : "false"); - smap_add(cap, "ct_orig_tuple", odp.ct_orig_tuple ? 
"true" : "false"); - smap_add(cap, "ct_orig_tuple6", odp.ct_orig_tuple6 ? "true" : "false"); - smap_add(cap, "nd_ext", odp.nd_ext ? "true" : "false"); + s->odp.max_vlan_headers); + smap_add_format(cap, "max_mpls_depth", "%"PRIuSIZE, s->odp.max_mpls_depth); + smap_add(cap, "recirc", s->odp.recirc ? "true" : "false"); + smap_add(cap, "ct_state", s->odp.ct_state ? "true" : "false"); + smap_add(cap, "ct_zone", s->odp.ct_zone ? "true" : "false"); + smap_add(cap, "ct_mark", s->odp.ct_mark ? "true" : "false"); + smap_add(cap, "ct_label", s->odp.ct_label ? "true" : "false"); + smap_add(cap, "ct_state_nat", s->odp.ct_state_nat ? "true" : "false"); + smap_add(cap, "ct_orig_tuple", s->odp.ct_orig_tuple ? "true" : "false"); + smap_add(cap, "ct_orig_tuple6", s->odp.ct_orig_tuple6 ? "true" : "false"); + smap_add(cap, "nd_ext", s->odp.nd_ext ? "true" : "false"); /* DPIF_SUPPORT_FIELDS */ - smap_add(cap, "masked_set_action", s.masked_set_action ? "true" : "false"); - smap_add(cap, "tnl_push_pop", s.tnl_push_pop ? "true" : "false"); - smap_add(cap, "ufid", s.ufid ? "true" : "false"); - smap_add(cap, "trunc", s.trunc ? "true" : "false"); - smap_add(cap, "clone", s.clone ? "true" : "false"); - smap_add(cap, "sample_nesting", s.sample_nesting ? "true" : "false"); - smap_add(cap, "ct_eventmask", s.ct_eventmask ? "true" : "false"); - smap_add(cap, "ct_clear", s.ct_clear ? "true" : "false"); - smap_add_format(cap, "max_hash_alg", "%"PRIuSIZE, s.max_hash_alg); - smap_add(cap, "check_pkt_len", s.check_pkt_len ? "true" : "false"); - smap_add(cap, "ct_timeout", s.ct_timeout ? "true" : "false"); + smap_add(cap, "masked_set_action", + s->masked_set_action ? "true" : "false"); + smap_add(cap, "tnl_push_pop", s->tnl_push_pop ? "true" : "false"); + smap_add(cap, "ufid", s->ufid ? "true" : "false"); + smap_add(cap, "trunc", s->trunc ? "true" : "false"); + smap_add(cap, "clone", s->clone ? "true" : "false"); + smap_add(cap, "sample_nesting", s->sample_nesting ? 
"true" : "false"); + smap_add(cap, "ct_eventmask", s->ct_eventmask ? "true" : "false"); + smap_add(cap, "ct_clear", s->ct_clear ? "true" : "false"); + smap_add_format(cap, "max_hash_alg", "%"PRIuSIZE, s->max_hash_alg); + smap_add(cap, "check_pkt_len", s->check_pkt_len ? "true" : "false"); + smap_add(cap, "ct_timeout", s->ct_timeout ? "true" : "false"); smap_add(cap, "explicit_drop_action", - s.explicit_drop_action ? "true" :"false"); - smap_add(cap, "lb_output_action", s.lb_output_action ? "true" : "false"); - smap_add(cap, "ct_zero_snat", s.ct_zero_snat ? "true" : "false"); - smap_add(cap, "add_mpls", s.add_mpls ? "true" : "false"); + s->explicit_drop_action ? "true" :"false"); + smap_add(cap, "lb_output_action", s->lb_output_action ? "true" : "false"); + smap_add(cap, "ct_zero_snat", s->ct_zero_snat ? "true" : "false"); + smap_add(cap, "add_mpls", s->add_mpls ? "true" : "false"); /* The ct_tuple_flush is implemented on dpif level, so it is supported * for all backers. */ From 54d94f8f4d0644054dfc9ac9eb205c2e34781f8d Mon Sep 17 00:00:00 2001 From: Eric Garver Date: Wed, 3 Apr 2024 10:35:30 -0400 Subject: [PATCH 653/833] dpif: Support atomic_bool field type. The next commit will convert a dp feature from bool to atomic_bool. As such we have to add support to the macros and functions. We must pass by reference instead of pass by value because all the atomic operations require a reference. 
Acked-by: Eelco Chaudron Signed-off-by: Eric Garver Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif.c | 54 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 7 deletions(-) diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index 8ff165d1ec4..c4e2e867ecd 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -717,6 +717,8 @@ close_dpif_backer(struct dpif_backer *backer, bool del) } static void check_support(struct dpif_backer *backer); +static void copy_support(struct dpif_backer_support *dst, + struct dpif_backer_support *src); static int open_dpif_backer(const char *type, struct dpif_backer **backerp) @@ -837,7 +839,7 @@ open_dpif_backer(const char *type, struct dpif_backer **backerp) * 'boottime_support' can be checked to prevent 'support' to be changed * beyond the datapath capabilities. In case 'support' is changed by * the user, 'boottime_support' can be used to restore it. */ - backer->bt_support = backer->rt_support; + copy_support(&backer->bt_support, &backer->rt_support); return error; } @@ -1611,6 +1613,24 @@ CHECK_FEATURE__(ct_orig_tuple6, ct_orig_tuple6, ct_nw_proto, 1, ETH_TYPE_IPV6) #undef CHECK_FEATURE #undef CHECK_FEATURE__ +static void +copy_support(struct dpif_backer_support *dst, struct dpif_backer_support *src) +{ +#define DPIF_SUPPORT_FIELD(TYPE, NAME, TITLE) \ + if (!strcmp(#TYPE, "atomic_bool")) { \ + bool value; \ + atomic_read_relaxed((atomic_bool *) &src->NAME, &value); \ + atomic_store_relaxed((atomic_bool *) &dst->NAME, value); \ + } else { \ + dst->NAME = src->NAME; \ + } + + DPIF_SUPPORT_FIELDS +#undef DPIF_SUPPORT_FIELD + + dst->odp = src->odp; +} + static void check_support(struct dpif_backer *backer) { @@ -6254,20 +6274,30 @@ ofproto_unixctl_dpif_dump_dps(struct unixctl_conn *conn, int argc OVS_UNUSED, } static void -show_dp_feature_bool(struct ds *ds, const char *feature, bool b) +show_dp_feature_bool(struct ds *ds, const char *feature, const bool *b) { - ds_put_format(ds, 
"%s: %s\n", feature, b ? "Yes" : "No"); + ds_put_format(ds, "%s: %s\n", feature, *b ? "Yes" : "No"); +} + +static void OVS_UNUSED +show_dp_feature_atomic_bool(struct ds *ds, const char *feature, + const atomic_bool *b) +{ + bool value; + atomic_read_relaxed((atomic_bool *) b, &value); + ds_put_format(ds, "%s: %s\n", feature, value ? "Yes" : "No"); } static void -show_dp_feature_size_t(struct ds *ds, const char *feature, size_t s) +show_dp_feature_size_t(struct ds *ds, const char *feature, const size_t *s) { - ds_put_format(ds, "%s: %"PRIuSIZE"\n", feature, s); + ds_put_format(ds, "%s: %"PRIuSIZE"\n", feature, *s); } enum dpif_support_field_type { DPIF_SUPPORT_FIELD_bool, DPIF_SUPPORT_FIELD_size_t, + DPIF_SUPPORT_FIELD_atomic_bool, }; struct dpif_support_field { @@ -6284,12 +6314,12 @@ static void dpif_show_support(const struct dpif_backer_support *support, struct ds *ds) { #define DPIF_SUPPORT_FIELD(TYPE, NAME, TITLE) \ - show_dp_feature_##TYPE (ds, TITLE, support->NAME); + show_dp_feature_##TYPE (ds, TITLE, &support->NAME); DPIF_SUPPORT_FIELDS #undef DPIF_SUPPORT_FIELD #define ODP_SUPPORT_FIELD(TYPE, NAME, TITLE) \ - show_dp_feature_##TYPE (ds, TITLE, support->odp.NAME ); + show_dp_feature_##TYPE (ds, TITLE, &support->odp.NAME ); ODP_SUPPORT_FIELDS #undef ODP_SUPPORT_FIELD } @@ -6308,6 +6338,16 @@ display_support_field(const char *name, b ? "true" : "false"); break; } + case DPIF_SUPPORT_FIELD_atomic_bool: { + bool b, v; + + atomic_read_relaxed((atomic_bool *) field->rt_ptr, &v); + atomic_read_relaxed((atomic_bool *) field->bt_ptr, &b); + ds_put_format(ds, "%s (%s) : [run time]:%s, [boot time]:%s\n", name, + field->title, v ? "true" : "false", + b ? 
"true" : "false"); + break; + } case DPIF_SUPPORT_FIELD_size_t: ds_put_format(ds, "%s (%s) : [run time]:%"PRIuSIZE ", [boot time]:%"PRIuSIZE"\n", name, From 3c8d069b9bc2307ff2740d45a61d8d9504d65a80 Mon Sep 17 00:00:00 2001 From: Eric Garver Date: Wed, 3 Apr 2024 10:35:31 -0400 Subject: [PATCH 654/833] dpif: Probe support for OVS_ACTION_ATTR_DROP. Kernel support has been added for this action. As such, we need to probe the datapath for support. Acked-by: Eelco Chaudron Signed-off-by: Eric Garver Signed-off-by: Ilya Maximets --- include/linux/openvswitch.h | 2 +- lib/dpif.c | 6 ++- lib/dpif.h | 2 +- ofproto/ofproto-dpif.c | 77 ++++++++++++++++++++++++++++++++++--- ofproto/ofproto-dpif.h | 4 +- 5 files changed, 81 insertions(+), 10 deletions(-) diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h index d18754c84f6..d9fb991ef23 100644 --- a/include/linux/openvswitch.h +++ b/include/linux/openvswitch.h @@ -1086,11 +1086,11 @@ enum ovs_action_attr { OVS_ACTION_ATTR_CHECK_PKT_LEN, /* Nested OVS_CHECK_PKT_LEN_ATTR_*. */ OVS_ACTION_ATTR_ADD_MPLS, /* struct ovs_action_add_mpls. */ OVS_ACTION_ATTR_DEC_TTL, /* Nested OVS_DEC_TTL_ATTR_*. */ + OVS_ACTION_ATTR_DROP, /* u32 xlate_error. */ #ifndef __KERNEL__ OVS_ACTION_ATTR_TUNNEL_PUSH, /* struct ovs_action_push_tnl*/ OVS_ACTION_ATTR_TUNNEL_POP, /* u32 port number. */ - OVS_ACTION_ATTR_DROP, /* u32 xlate_error. */ OVS_ACTION_ATTR_LB_OUTPUT, /* u32 bond-id. 
*/ #endif __OVS_ACTION_ATTR_MAX, /* Nothing past this will be accepted diff --git a/lib/dpif.c b/lib/dpif.c index 0f480bec48d..23eb18495a6 100644 --- a/lib/dpif.c +++ b/lib/dpif.c @@ -28,6 +28,7 @@ #include "dpctl.h" #include "dpif-netdev.h" #include "flow.h" +#include "netdev-offload.h" #include "netdev-provider.h" #include "netdev.h" #include "netlink.h" @@ -1935,9 +1936,10 @@ dpif_supports_tnl_push_pop(const struct dpif *dpif) } bool -dpif_supports_explicit_drop_action(const struct dpif *dpif) +dpif_may_support_explicit_drop_action(const struct dpif *dpif) { - return dpif_is_netdev(dpif); + /* TC does not support offloading this action. */ + return dpif_is_netdev(dpif) || !netdev_is_flow_api_enabled(); } bool diff --git a/lib/dpif.h b/lib/dpif.h index 0f2dc2ef3c5..a764e8a592b 100644 --- a/lib/dpif.h +++ b/lib/dpif.h @@ -940,7 +940,7 @@ int dpif_get_pmds_for_port(const struct dpif * dpif, odp_port_t port_no, char *dpif_get_dp_version(const struct dpif *); bool dpif_supports_tnl_push_pop(const struct dpif *); -bool dpif_supports_explicit_drop_action(const struct dpif *); +bool dpif_may_support_explicit_drop_action(const struct dpif *); bool dpif_synced_dp_layers(struct dpif *); /* Log functions. 
*/ diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index c4e2e867ecd..32d037be607 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -221,6 +221,7 @@ static void ct_zone_config_init(struct dpif_backer *backer); static void ct_zone_config_uninit(struct dpif_backer *backer); static void ct_zone_timeout_policy_sweep(struct dpif_backer *backer); static void ct_zone_limits_commit(struct dpif_backer *backer); +static bool recheck_support_explicit_drop_action(struct dpif_backer *backer); static inline struct ofproto_dpif * ofproto_dpif_cast(const struct ofproto *ofproto) @@ -391,6 +392,10 @@ type_run(const char *type) udpif_set_threads(backer->udpif, n_handlers, n_revalidators); } + if (recheck_support_explicit_drop_action(backer)) { + backer->need_revalidate = REV_RECONFIGURE; + } + if (backer->need_revalidate) { struct ofproto_dpif *ofproto; struct simap_node *node; @@ -855,7 +860,11 @@ ovs_native_tunneling_is_on(struct ofproto_dpif *ofproto) bool ovs_explicit_drop_action_supported(struct ofproto_dpif *ofproto) { - return ofproto->backer->rt_support.explicit_drop_action; + bool value; + + atomic_read_relaxed(&ofproto->backer->rt_support.explicit_drop_action, + &value); + return value; } bool @@ -1379,6 +1388,40 @@ check_ct_timeout_policy(struct dpif_backer *backer) return !error; } +/* Tests whether backer's datapath supports the OVS_ACTION_ATTR_DROP action. 
*/ +static bool +check_drop_action(struct dpif_backer *backer) +{ + struct odputil_keybuf keybuf; + uint8_t actbuf[NL_A_U32_SIZE]; + struct ofpbuf actions; + struct ofpbuf key; + bool supported; + + struct flow flow = { + .dl_type = CONSTANT_HTONS(0x1234), /* bogus */ + }; + struct odp_flow_key_parms odp_parms = { + .flow = &flow, + .probe = true, + }; + + ofpbuf_use_stack(&key, &keybuf, sizeof keybuf); + odp_flow_key_from_flow(&odp_parms, &key); + + ofpbuf_use_stack(&actions, &actbuf, sizeof actbuf); + nl_msg_put_u32(&actions, OVS_ACTION_ATTR_DROP, XLATE_OK); + + supported = dpif_may_support_explicit_drop_action(backer->dpif) && + dpif_probe_feature(backer->dpif, "drop", &key, &actions, NULL); + + VLOG_INFO("%s: Datapath %s explicit drop action", + dpif_name(backer->dpif), + (supported) ? "supports" : "does not support"); + + return supported; +} + /* Tests whether 'backer''s datapath supports the all-zero SNAT case. */ static bool dpif_supports_ct_zero_snat(struct dpif_backer *backer) @@ -1649,8 +1692,8 @@ check_support(struct dpif_backer *backer) backer->rt_support.max_hash_alg = check_max_dp_hash_alg(backer); backer->rt_support.check_pkt_len = check_check_pkt_len(backer); backer->rt_support.ct_timeout = check_ct_timeout_policy(backer); - backer->rt_support.explicit_drop_action = - dpif_supports_explicit_drop_action(backer->dpif); + atomic_store_relaxed(&backer->rt_support.explicit_drop_action, + check_drop_action(backer)); backer->rt_support.lb_output_action = dpif_supports_lb_output_action(backer->dpif); backer->rt_support.ct_zero_snat = dpif_supports_ct_zero_snat(backer); @@ -1667,6 +1710,28 @@ check_support(struct dpif_backer *backer) backer->rt_support.odp.nd_ext = check_nd_extensions(backer); } +/* TC does not support offloading the explicit drop action. As such we need to + * re-probe the datapath if hw-offload has been modified. + * Note: We don't support true --> false transition as that requires a restart. + * See netdev_set_flow_api_enabled(). 
*/ +static bool +recheck_support_explicit_drop_action(struct dpif_backer *backer) +{ + bool explicit_drop_action; + + atomic_read_relaxed(&backer->rt_support.explicit_drop_action, + &explicit_drop_action); + + if (explicit_drop_action + && !dpif_may_support_explicit_drop_action(backer->dpif)) { + ovs_assert(!check_drop_action(backer)); + atomic_store_relaxed(&backer->rt_support.explicit_drop_action, false); + return true; + } + + return false; +} + static int construct(struct ofproto *ofproto_) { @@ -5714,6 +5779,7 @@ ct_zone_limit_protection_update(const char *datapath_type, bool protected) static void get_datapath_cap(const char *datapath_type, struct smap *cap) { + bool explicit_drop_action; struct dpif_backer_support *s; struct dpif_backer *backer = shash_find_data(&all_dpif_backers, datapath_type); @@ -5749,8 +5815,9 @@ get_datapath_cap(const char *datapath_type, struct smap *cap) smap_add_format(cap, "max_hash_alg", "%"PRIuSIZE, s->max_hash_alg); smap_add(cap, "check_pkt_len", s->check_pkt_len ? "true" : "false"); smap_add(cap, "ct_timeout", s->ct_timeout ? "true" : "false"); + atomic_read_relaxed(&s->explicit_drop_action, &explicit_drop_action); smap_add(cap, "explicit_drop_action", - s->explicit_drop_action ? "true" :"false"); + explicit_drop_action ? "true" :"false"); smap_add(cap, "lb_output_action", s->lb_output_action ? "true" : "false"); smap_add(cap, "ct_zero_snat", s->ct_zero_snat ? "true" : "false"); smap_add(cap, "add_mpls", s->add_mpls ? "true" : "false"); @@ -6279,7 +6346,7 @@ show_dp_feature_bool(struct ds *ds, const char *feature, const bool *b) ds_put_format(ds, "%s: %s\n", feature, *b ? 
"Yes" : "No"); } -static void OVS_UNUSED +static void show_dp_feature_atomic_bool(struct ds *ds, const char *feature, const atomic_bool *b) { diff --git a/ofproto/ofproto-dpif.h b/ofproto/ofproto-dpif.h index 92d33aa6470..d33f73df8ae 100644 --- a/ofproto/ofproto-dpif.h +++ b/ofproto/ofproto-dpif.h @@ -51,6 +51,7 @@ #include "hmapx.h" #include "odp-util.h" #include "id-pool.h" +#include "ovs-atomic.h" #include "ovs-thread.h" #include "ofproto-provider.h" #include "util.h" @@ -202,7 +203,8 @@ struct group_dpif *group_dpif_lookup(struct ofproto_dpif *, DPIF_SUPPORT_FIELD(bool, ct_timeout, "Conntrack timeout policy") \ \ /* True if the datapath supports explicit drop action. */ \ - DPIF_SUPPORT_FIELD(bool, explicit_drop_action, "Explicit Drop action") \ + DPIF_SUPPORT_FIELD(atomic_bool, explicit_drop_action, \ + "Explicit Drop action") \ \ /* True if the datapath supports balance_tcp optimization */ \ DPIF_SUPPORT_FIELD(bool, lb_output_action, "Optimized Balance TCP mode")\ From edf56245b828ed8f22c1659c76e8d92c95a92f22 Mon Sep 17 00:00:00 2001 From: Eric Garver Date: Wed, 3 Apr 2024 10:35:32 -0400 Subject: [PATCH 655/833] tests: system-traffic: Add coverage for drop action. Exercise the drop action in the datapath. This specific tests triggers an xlate_error. For the kernel datapath skb drop reasons can then be seen while this test runs. 
# perf trace -e skb:kfree_skb --filter="reason >= 0x30000" 0.000 ping/1275884 skb:kfree_skb(skbaddr: 0xffff8acd76546000, \ location: 0xffffffffc0ee3634, protocol: 2048, reason: 196611) Acked-by: Eelco Chaudron Signed-off-by: Eric Garver Signed-off-by: Ilya Maximets --- tests/system-common-macros.at | 4 ++++ tests/system-traffic.at | 31 +++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/tests/system-common-macros.at b/tests/system-common-macros.at index 01ebe364ee7..2a68cd664e5 100644 --- a/tests/system-common-macros.at +++ b/tests/system-common-macros.at @@ -374,3 +374,7 @@ m4_define([OVS_CHECK_CT_CLEAR], # OVS_CHECK_GITHUB_ACTION m4_define([OVS_CHECK_GITHUB_ACTION], [AT_SKIP_IF([test "$GITHUB_ACTIONS" = "true"])]) + +# OVS_CHECK_DROP_ACTION() +m4_define([OVS_CHECK_DROP_ACTION], + [AT_SKIP_IF([! grep -q "Datapath supports explicit drop action" ovs-vswitchd.log])]) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index bc850d9449d..bd7647cbee6 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -2231,6 +2231,37 @@ masks-cache:size:256 OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([datapath - drop action]) +OVS_TRAFFIC_VSWITCHD_START() +OVS_CHECK_DROP_ACTION() +AT_KEYWORDS(drop_action) + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +dnl Exceed the max number of resubmits. +(echo "dl_type=0x806, actions=normal" +for i in $(seq 1 64); do + j=$(expr $i + 1) + echo "in_port=$i, actions=resubmit:$j, resubmit:$j, local" + done + echo "in_port=65, actions=local" +) > flows.txt +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +dnl Generate some traffic. 
+NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -w 2 10.1.1.2], [1], [ignore]) + +OVS_WAIT_UNTIL_EQUAL([ovs-appctl dpctl/dump-flows | grep "eth_type(0x0800)" | dnl + strip_ptype | strip_eth | strip_recirc | strip_stats | dnl + strip_used | sort], [dnl +recirc_id(),in_port(2),eth_type(0x0800),ipv4(frag=no), packets:0, bytes:0, used:0.0s, actions:drop]) + +OVS_TRAFFIC_VSWITCHD_STOP(["/WARN/d"]) +AT_CLEANUP + AT_SETUP([datapath - simulated flow action update]) OVS_TRAFFIC_VSWITCHD_START() From 337db586534148f6f9aa9f2535e398745b5ba47e Mon Sep 17 00:00:00 2001 From: Eric Garver Date: Wed, 3 Apr 2024 10:35:33 -0400 Subject: [PATCH 656/833] tests: system-offload-traffic: Verify re-probe of drop action. Verify that the explicit drop action is re-probed if the hw-offload flag is changed. Acked-by: Eelco Chaudron Signed-off-by: Eric Garver Signed-off-by: Ilya Maximets --- tests/system-offloads-traffic.at | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/system-offloads-traffic.at b/tests/system-offloads-traffic.at index 6bd49a3eef3..d1da33d96c6 100644 --- a/tests/system-offloads-traffic.at +++ b/tests/system-offloads-traffic.at @@ -921,3 +921,15 @@ AT_CHECK([tc -d filter show dev ovs-p0 ingress | grep -q "csum (iph)"], [0]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([offloads - re-probe drop action]) +OVS_TRAFFIC_VSWITCHD_START() +OVS_CHECK_DROP_ACTION() +AT_KEYWORDS(drop_action) + +dnl Trigger a re-probe of the explicit drop action. +AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:hw-offload=true]) +OVS_WAIT_UNTIL([grep -q "Datapath does not support explicit drop action" ovs-vswitchd.log]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP From ed379a810ac81bfa7d86543d7b1914b17d2c0063 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 4 Apr 2024 14:09:37 +0200 Subject: [PATCH 657/833] ofproto-dpif-upcall: Fix ukey installation failure logs and counters. ukey_install() returns boolean signaling if the ukey was installed or not. 
Installation may fail for a few reasons: 1. Conflicting ukey. 2. Mutex contention while trying to replace existing ukey. 3. The same ukey already exists and is active. Only the first case here signals an actual problem. The third one is a little odd for userspace datapath, but harmless. The second is the most common one and can easily happen during normal operation, since other threads like revalidators may be currently working on this ukey, preventing immediate access. Since only the first case is actually worth logging and it already has its own log message, remove the 'upcall installation fails' warning from upcall_cb(). This should fix most of the random failures of userspace system tests in CI. While at it, also fix the coverage counters. Mutex contention was mistakenly counted as a duplicate upcall. ukey contention for revalidators was counted only in one of two places. A new counter is added for the ukey contention on replace. We should not re-use the existing upcall_ukey_contention counter for this, since it may lead to double counting.
Fixes: 67f08985d769 ("upcall: Replace ukeys for deleted flows.") Fixes: 9cec8274ed9a ("ofproto-dpif-upcall: Add VLOG_WARN_RL logs for upcall_cb() error.") Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif-upcall.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c index d8819563662..73901b65105 100644 --- a/ofproto/ofproto-dpif-upcall.c +++ b/ofproto/ofproto-dpif-upcall.c @@ -59,6 +59,7 @@ COVERAGE_DEFINE(handler_duplicate_upcall); COVERAGE_DEFINE(revalidate_missed_dp_flow); COVERAGE_DEFINE(ukey_dp_change); COVERAGE_DEFINE(ukey_invalid_stat_reset); +COVERAGE_DEFINE(ukey_replace_contention); COVERAGE_DEFINE(upcall_flow_limit_grew); COVERAGE_DEFINE(upcall_flow_limit_hit); COVERAGE_DEFINE(upcall_flow_limit_kill); @@ -1449,8 +1450,6 @@ upcall_cb(const struct dp_packet *packet, const struct flow *flow, ovs_u128 *ufi } if (upcall.ukey && !ukey_install(udpif, upcall.ukey)) { - static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 1); - VLOG_WARN_RL(&rll, "upcall_cb failure: ukey installation fails"); error = ENOSPC; } out: @@ -1948,15 +1947,15 @@ try_ukey_replace(struct umap *umap, struct udpif_key *old_ukey, transition_ukey(old_ukey, UKEY_DELETED); transition_ukey(new_ukey, UKEY_VISIBLE); replaced = true; + COVERAGE_INC(upcall_ukey_replace); + } else { + COVERAGE_INC(handler_duplicate_upcall); } ovs_mutex_unlock(&old_ukey->mutex); - } - - if (replaced) { - COVERAGE_INC(upcall_ukey_replace); } else { - COVERAGE_INC(handler_duplicate_upcall); + COVERAGE_INC(ukey_replace_contention); } + return replaced; } @@ -3009,6 +3008,7 @@ revalidator_sweep__(struct revalidator *revalidator, bool purge) /* Handler threads could be holding a ukey lock while it installs a * new flow, so don't hang around waiting for access to it. 
*/ if (ovs_mutex_trylock(&ukey->mutex)) { + COVERAGE_INC(upcall_ukey_contention); continue; } ukey_state = ukey->state; From 95f5012bbc158345bfa17a6d7e58caedbffb2e30 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 2 Apr 2024 15:21:16 +0200 Subject: [PATCH 658/833] checkpatch: Add additional words to extra_keywords. This patch add another set of keywords based on the results of the last thousand committed patches. Acked-by: Simon Horman Signed-off-by: Eelco Chaudron --- utilities/checkpatch.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/utilities/checkpatch.py b/utilities/checkpatch.py index e0cab6b9f82..692bb88d40d 100755 --- a/utilities/checkpatch.py +++ b/utilities/checkpatch.py @@ -97,7 +97,22 @@ def open_spell_check_dict(): 'debian', 'travis', 'cirrus', 'appveyor', 'faq', 'erspan', 'const', 'hotplug', 'addresssanitizer', 'ovsdb', 'dpif', 'veth', 'rhel', 'jsonrpc', 'json', - 'syscall', 'lacp', 'ipf', 'skb', 'valgrind'] + 'syscall', 'lacp', 'ipf', 'skb', 'valgrind', + 'appctl', 'arp', 'asan', 'backport', 'backtrace', + 'chmod', 'ci', 'cpu', 'cpus', 'dnat', 'dns', 'dpcls', + 'eol', 'ethtool', 'fdb', 'freebsd', 'gcc', 'github', + 'glibc', 'gre', 'inlined', 'ip', 'ipfix', 'ipsec', + 'ixgbe', 'libbpf', 'libcrypto', 'libgcc', + 'libopenvswitch', 'libreswan', 'libssl', 'libxdp', + 'lldp', 'llvm', 'lockless', 'mcast', 'megaflows', + 'mfex', 'ncat', 'networkmanager', 'pcap', 'pedit', + 'pidfile', 'pps', 'rculist', 'rebalance', 'rebased' + 'recirculations', 'revalidators', 'rst', 'sed', + 'shrinked', 'snat', 'stderr', 'stdout', 'testpmd', + 'tftp', 'timeval', 'trie', 'tso', 'ubsan', 'ukey', + 'umask', 'unassociated', 'unixctl', 'uuid' + 'virtqueue', 'vms', 'vnet', 'vport', 'vports', + 'vtep', 'wc', 'wget', 'xenserver'] global spell_check_dict From 9185793e75435d890f18d391eaaeab0ade6f1415 Mon Sep 17 00:00:00 2001 From: Frode Nordahl Date: Mon, 8 Apr 2024 23:24:14 +0200 Subject: [PATCH 659/833] tests: Fix compatibility issue 
with Python 3.13 in vlog.at. The vlog - Python3 test makes use of output from Python Tracebacks in its test assertion. In Python 3.13 a line with tophat (``^``) markers is added below Tracebacks from calls to assert [0], which makes the test fail. This change of behavior is also backported to the Python 3.12 and 3.11 stable branches [1]. Strip lines containing one or more occurrence of the ``^`` character from the output before performing the test assertions. 0: https://github.com/python/cpython/pull/105935 1: https://github.com/python/cpython/issues/116034 Reported-at: https://launchpad.net/bugs/2060434 Signed-off-by: Frode Nordahl Signed-off-by: Ilya Maximets --- tests/vlog.at | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/vlog.at b/tests/vlog.at index 785014956e7..efe91479a63 100644 --- a/tests/vlog.at +++ b/tests/vlog.at @@ -8,6 +8,7 @@ AT_CHECK([$PYTHON3 $srcdir/test-vlog.py --log-file log_file \ AT_CHECK([sed -e 's/.*-.*-.*T..:..:..Z |//' \ -e 's/File ".*", line [[0-9]][[0-9]]*,/File , line ,/' \ +-e '/\^\+/d' \ stderr_log], [0], [dnl 0 | module_0 | EMER | emergency 1 | module_0 | ERR | error From 120140f891a2b9502fba54ae1f7031e7c4f68f1e Mon Sep 17 00:00:00 2001 From: Lin Huang Date: Tue, 9 Apr 2024 22:45:18 +0800 Subject: [PATCH 660/833] ofproto: Fix Coverity false positive. Coverity reports a false positive below: Ofproto_class_find__() may return NULL, and dereference it to cause segfault. This patch is made just to avoid false-positive Coverity report. 
Tested-by: Zhang YuHuang Signed-off-by: Lin Huang Signed-off-by: Ilya Maximets --- ofproto/ofproto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c index 122a06f3032..21c6a1d8257 100644 --- a/ofproto/ofproto.c +++ b/ofproto/ofproto.c @@ -800,7 +800,7 @@ ofproto_type_set_config(const char *datapath_type, const struct smap *cfg) datapath_type = ofproto_normalize_type(datapath_type); class = ofproto_class_find__(datapath_type); - if (class->type_set_config) { + if (class && class->type_set_config) { class->type_set_config(datapath_type, cfg); } } From 74cf01436fd2d05eea755b0a43b5f306aa06bc1f Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 20 Mar 2024 17:36:36 +0000 Subject: [PATCH 661/833] Documentation: Updates for rename of primary development branch as main. Recently OVS adopted a policy of using the inclusive naming word list v1 [1, 2]. In keeping with this policy, rename the primary development branch from 'master' to 'main'. This patch does not actually make that change, but rather updates references to the branch in documentation in the source tree. It is intended to be applied at (approximately) the same time that the change is made. OVS is currently hosted on GitHub. We can expect the following behaviour after the rename: 1. GitHub pull requests against the renamed branch are automatically re-homed on the new branch 2. GitHub Issues do not seem to be affected - at least the test issue I created had no association with a branch 3. URLs accessed via the GitHub web UI are automatically renamed (so long as a new branch called master is not created). 4.
Using the git cli command, fetch will fetch the new branch (main), and fetch -p will remove (prune) the old branch (master) [1] df5e5cf4318a ("Documentation: Add section on inclusive language.") [2] https://inclusivenaming.org/word-lists/ Signed-off-by: Simon Horman Acked-by: Eelco Chaudron Acked-by: Aaron Conole Acked-by: Ilya Maximets --- Notes: * Now is the time to raise any concerns regarding this patch. It is planned to implement this change next week. * If you have an automation that fetches the master branch then the suggested action is: 1. Before the branch rename occurs: update the automation to pull main and fall back to pulling master if that fails 2. After the rename occurs: Update the automation to only fetch main * After the change it may be necessary to update your local git configuration for checked out branches. For example: # Fetch origin: new remote main branch; remote master branch is deleted git fetch -tp origin # Rename local branch git branch -m master main # Update local main branch to use remote main branch as its upstream git branch --set-upstream-to=origin/main main * As a follow-up, after the rename, I plan to post a patch which removes references to master in CI jobs --- .../internals/committer-responsibilities.rst | 12 ++--- .../contributing/backporting-patches.rst | 12 ++--- Documentation/internals/release-process.rst | 50 +++++++++---------- Documentation/intro/install/dpdk.rst | 2 +- Documentation/intro/install/fedora.rst | 2 +- Documentation/intro/install/general.rst | 2 +- Documentation/intro/install/rhel.rst | 2 +- Documentation/topics/language-bindings.rst | 2 +- Documentation/tutorials/faucet.rst | 6 +-- Documentation/tutorials/ovs-conntrack.rst | 1 - NEWS | 3 ++ README.rst | 2 +- 12 files changed, 49 insertions(+), 47 deletions(-) diff --git a/Documentation/internals/committer-responsibilities.rst b/Documentation/internals/committer-responsibilities.rst index c35fd708913..eed2e017678 100644 ---
a/Documentation/internals/committer-responsibilities.rst +++ b/Documentation/internals/committer-responsibilities.rst @@ -73,14 +73,14 @@ If it is someone else's change, then you can ask the original submitter to address it. Regardless, you need to ensure that the problem is fixed in a timely way. The definition of "timely" depends on the severity of the problem. -If a bug is present on master and other branches, fix it on master first, then +If a bug is present on main and other branches, fix it on main first, then backport the fix to other branches. Straightforward backports do not require -additional review (beyond that for the fix on master). +additional review (beyond that for the fix on main). -Feature development should be done only on master. Occasionally it makes sense +Feature development should be done only on main. Occasionally it makes sense to add a feature to the most recent release branch, before the first actual release of that branch. These should be handled in the same way as bug fixes, -that is, first implemented on master and then backported. +that is, first implemented on main and then backported. Keep the authorship of a commit clear by maintaining a correct list of "Signed-off-by:"s. If a confusing situation comes up, as it occasionally does, @@ -99,7 +99,7 @@ Pre-Push Hook ------------- The following script can be helpful because it provides an extra -chance to check for mistakes while pushing to the master branch of OVS +chance to check for mistakes while pushing to the main branch of OVS or OVN. If you would like to use it, install it as ``hooks/pre-push`` in your ``.git`` directory and make sure to mark it as executable with ``chmod +x``. 
For maximum utility, make sure ``checkpatch.py`` is in @@ -118,7 +118,7 @@ in your ``.git`` directory and make sure to mark it as executable with while read local_ref local_sha1 remote_ref remote_sha1; do case $remote_ref in - refs/heads/master) + refs/heads/main) n=0 while read sha do diff --git a/Documentation/internals/contributing/backporting-patches.rst b/Documentation/internals/contributing/backporting-patches.rst index 04bb0fc350f..2007a429c7b 100644 --- a/Documentation/internals/contributing/backporting-patches.rst +++ b/Documentation/internals/contributing/backporting-patches.rst @@ -43,10 +43,10 @@ within Open vSwitch, but is broadly applied in the following fashion: - Maintainers backport changes from a development branch to release branches. With regards to Open vSwitch user space code and code that does not comprise -the Linux datapath and compat code, the development branch is `master` in the +the Linux datapath and compat code, the development branch is `main` in the Open vSwitch repository. Patches are applied first to this branch, then to the most recent `branch-X.Y`, then earlier `branch-X.Z`, and so on. The most common -kind of patch in this category is a bugfix which affects master and other +kind of patch in this category is a bugfix which affects main and other branches. For Linux datapath code, the primary development branch is in the `net-next`_ @@ -67,15 +67,15 @@ Changes to userspace components ------------------------------- Patches which are fixing bugs should be considered for backporting from -`master` to release branches. Open vSwitch contributors submit their patches -targeted to the `master` branch, using the ``Fixes`` tag described in -:doc:`submitting-patches`. The maintainer first applies the patch to `master`, +`main` to release branches. Open vSwitch contributors submit their patches +targeted to the `main` branch, using the ``Fixes`` tag described in +:doc:`submitting-patches`. 
The maintainer first applies the patch to `main`, then backports the patch to each older affected tree, as far back as it goes or at least to all currently supported branches. This is usually each branch back to the oldest maintained LTS release branch or the last 4 release branches if the oldest LTS is newer. -If the fix only affects a particular branch and not `master`, contributors +If the fix only affects a particular branch and not `main`, contributors should submit the change with the target branch listed in the subject line of the patch. Contributors should list all versions that the bug affects. The ``git format-patch`` argument ``--subject-prefix`` may be used when posting the diff --git a/Documentation/internals/release-process.rst b/Documentation/internals/release-process.rst index d939c2d3ab8..f0c745dc6de 100644 --- a/Documentation/internals/release-process.rst +++ b/Documentation/internals/release-process.rst @@ -34,33 +34,33 @@ or the #openvswitch IRC channel. Release Strategy ---------------- -Open vSwitch feature development takes place on the "master" branch. -Ordinarily, new features are rebased against master and applied directly. For +Open vSwitch feature development takes place on the "main" branch. +Ordinarily, new features are rebased against main and applied directly. For features that take significant development, sometimes it is more appropriate to -merge a separate branch into master; please discuss this on ovs-dev in advance. +merge a separate branch into main; please discuss this on ovs-dev in advance. The process of making a release has the following stages. See `Release Scheduling`_ for the timing of each stage: -1. "Soft freeze" of the master branch. +1. "Soft freeze" of the main branch. During the freeze, we ask committers to refrain from applying patches that add new features unless those patches were already being publicly discussed and reviewed before the freeze began. Bug fixes are welcome at any time. 
Please propose and discuss exceptions on ovs-dev. -2. Fork a release branch from master, named for the expected release number, +2. Fork a release branch from main, named for the expected release number, e.g. "branch-2.3" for the branch that will yield Open vSwitch 2.3.x. Release branches are intended for testing and stabilization. At this stage and in later stages, they should receive only bug fixes, not new features. Bug fixes applied to release branches should be backports of corresponding - bug fixes to the master branch, except for bugs present only on release + bug fixes to the main branch, except for bugs present only on release branches (which are rare in practice). At this stage, sometimes there can be exceptions to the rule that a release branch receives only bug fixes. Like bug fixes, new features on release - branches should be backports of the corresponding commits on the master + branches should be backports of the corresponding commits on the main branch. Features to be added to release branches should be limited in scope and risk and discussed on ovs-dev before creating the branch. @@ -125,10 +125,10 @@ intermediate branches). Release Numbering ----------------- -The version number on master should normally end in .90. This indicates that +The version number on main should normally end in .90. This indicates that the Open vSwitch version is "almost" the next version to branch. -Forking master into branch-x.y requires two commits to master. The first is +Forking main into branch-x.y requires two commits to main. The first is titled "Prepare for x.y.0" and increments the version number to x.y. This is the initial commit on branch-x.y. The second is titled "Prepare for post-x.y.0 (x.y.90)" and increments the version number to x.y.90. @@ -146,23 +146,23 @@ Release Scheduling Open vSwitch makes releases at the following six-month cadence. 
All dates are approximate: -+---------------+----------------+--------------------------------------+ -| Time (months) | Dates | Stage | -+---------------+----------------+--------------------------------------+ -| T | Mar 1, Sep 1 | Begin x.y release cycle | -+---------------+----------------+--------------------------------------+ -| T + 4 | Jul 1, Jan 1 | "Soft freeze" master for x.y release | -+---------------+----------------+--------------------------------------+ -| T + 4.5 | Jul 15, Jan 15 | Fork branch-x.y from master | -+---------------+----------------+--------------------------------------+ -| T + 5.5 | Aug 15, Feb 15 | Release version x.y.0 | -+---------------+----------------+--------------------------------------+ ++---------------+----------------+------------------------------------+ +| Time (months) | Dates | Stage | ++---------------+----------------+------------------------------------+ +| T | Mar 1, Sep 1 | Begin x.y release cycle | ++---------------+----------------+------------------------------------+ +| T + 4 | Jul 1, Jan 1 | "Soft freeze" main for x.y release | ++---------------+----------------+------------------------------------+ +| T + 4.5 | Jul 15, Jan 15 | Fork branch-x.y from main | ++---------------+----------------+------------------------------------+ +| T + 5.5 | Aug 15, Feb 15 | Release version x.y.0 | ++---------------+----------------+------------------------------------+ How to Branch ------------- -To branch "master" for the eventual release of OVS version x.y.0, -prepare two patches against master: +To branch "main" for the eventual release of OVS version x.y.0, +prepare two patches against main: 1. "Prepare for x.y.0." following the model of commit 836d1973c56e ("Prepare for 2.11.0."). @@ -172,12 +172,12 @@ prepare two patches against master: Post both patches to ovs-dev. Get them reviewed in the usual way. 
-Apply both patches to master, and create branch-x.y by pushing only +Apply both patches to main, and create branch-x.y by pushing only the first patch. The following command illustrates how to do both of these at once assuming the local repository HEAD points to the "Prepare for post-x.y.0" commit: - git push origin HEAD:master HEAD^:refs/heads/branch-x.y + git push origin HEAD:main HEAD^:refs/heads/branch-x.y Branching should be announced on ovs-dev. @@ -200,7 +200,7 @@ Follow these steps to release version x.y.z of OVS from branch-x.y. 4. Apply the patches to branch-x.y. -5. If z = 0, apply the first patch (only) to master. +5. If z = 0, apply the first patch (only) to main. 6. Sign a tag vx.y.z "Open vSwitch version x.y.z" and push it to the repo. diff --git a/Documentation/intro/install/dpdk.rst b/Documentation/intro/install/dpdk.rst index c92e598d7ae..f1646322c7e 100644 --- a/Documentation/intro/install/dpdk.rst +++ b/Documentation/intro/install/dpdk.rst @@ -174,7 +174,7 @@ Additional information can be found in :doc:`general`. daemon will run as a non-root user. This implies that you must have a working IOMMU. Visit the `RHEL README`__ for additional information. -__ https://github.com/openvswitch/ovs/blob/master/rhel/README.RHEL.rst +__ https://github.com/openvswitch/ovs/blob/main/rhel/README.RHEL.rst Possible issues when enabling AVX512 diff --git a/Documentation/intro/install/fedora.rst b/Documentation/intro/install/fedora.rst index 49fad844c7f..f8a6bb6b603 100644 --- a/Documentation/intro/install/fedora.rst +++ b/Documentation/intro/install/fedora.rst @@ -146,7 +146,7 @@ purpose. Refer to the `RHEL README`__ for additional usage and configuration information. 
-__ https://github.com/openvswitch/ovs/blob/master/rhel/README.RHEL.rst +__ https://github.com/openvswitch/ovs/blob/main/rhel/README.RHEL.rst Reporting Bugs -------------- diff --git a/Documentation/intro/install/general.rst b/Documentation/intro/install/general.rst index 17c15426805..2b3959a1437 100644 --- a/Documentation/intro/install/general.rst +++ b/Documentation/intro/install/general.rst @@ -37,7 +37,7 @@ repository, which you can clone into a directory named "ovs" with:: $ git clone https://github.com/openvswitch/ovs.git -Cloning the repository leaves the "master" branch initially checked +Cloning the repository leaves the "main" branch initially checked out. This is the right branch for general development. If, on the other hand, if you want to build a particular released version, you can check it out by running a command such as the following from the diff --git a/Documentation/intro/install/rhel.rst b/Documentation/intro/install/rhel.rst index f2151d89071..e442fca0c0e 100644 --- a/Documentation/intro/install/rhel.rst +++ b/Documentation/intro/install/rhel.rst @@ -211,7 +211,7 @@ implemented. Refer to `README.RHEL.rst`__ in the source tree or /usr/share/doc/openvswitch/README.RHEL.rst in the installed openvswitch package for details. -__ https://github.com/openvswitch/ovs/blob/master/rhel/README.RHEL.rst +__ https://github.com/openvswitch/ovs/blob/main/rhel/README.RHEL.rst Reporting Bugs -------------- diff --git a/Documentation/topics/language-bindings.rst b/Documentation/topics/language-bindings.rst index 414f7c73fa3..15958d76da9 100644 --- a/Documentation/topics/language-bindings.rst +++ b/Documentation/topics/language-bindings.rst @@ -49,7 +49,7 @@ required dependencies, run: or install `python3-netaddr` and `python3-pyparsing`. 
-__ https://github.com/openvswitch/ovs/tree/master/python/ovs +__ https://github.com/openvswitch/ovs/tree/main/python/ovs Third-Party Bindings -------------------- diff --git a/Documentation/tutorials/faucet.rst b/Documentation/tutorials/faucet.rst index 6aa4d39aa8a..33e4543e402 100644 --- a/Documentation/tutorials/faucet.rst +++ b/Documentation/tutorials/faucet.rst @@ -27,7 +27,7 @@ OVS Faucet Tutorial This tutorial demonstrates how Open vSwitch works with a general-purpose OpenFlow controller, using the Faucet controller as a simple way to get -started. It was tested with the "master" branch of Open vSwitch and version +started. It was tested with the "main" branch of Open vSwitch and version 1.6.15 of Faucet. It does not use advanced or recently added features in OVS or Faucet, so other versions of both pieces of software are likely to work equally well. @@ -68,7 +68,7 @@ approaches: $ git clone https://github.com/openvswitch/ovs.git $ cd ovs - The default checkout is the master branch. You will need to use the master + The default checkout is the main branch. You will need to use the main branch for this tutorial as it includes some functionality required for this tutorial. @@ -84,7 +84,7 @@ approaches: The default behaviour for some of the commands used in this tutorial changed in Open vSwitch versions 2.9.x and 2.10.x which breaks the - tutorial. We recommend following step 3 and building master from + tutorial. We recommend following step 3 and building main from source or using a system Open vSwitch that is version 2.8.x or older. If it is successful, you will find yourself in a subshell environment, which diff --git a/Documentation/tutorials/ovs-conntrack.rst b/Documentation/tutorials/ovs-conntrack.rst index e8a58c4eb29..909daf3bd79 100644 --- a/Documentation/tutorials/ovs-conntrack.rst +++ b/Documentation/tutorials/ovs-conntrack.rst @@ -35,7 +35,6 @@ to match on the TCP segments from connection setup to connection tear down. 
It will use OVS with the Linux kernel module as the datapath for this tutorial. (The datapath that utilizes the openvswitch kernel module to do the packet processing in the Linux kernel) -It was tested with the "master" branch of Open vSwitch. Definitions ----------- diff --git a/NEWS b/NEWS index c9e4064e67a..b92cec532c5 100644 --- a/NEWS +++ b/NEWS @@ -4,6 +4,9 @@ Post-v3.3.0 * Conntrack now supports 'random' flag for selecting ports in a range while natting and 'persistent' flag for selection of the IP address from a range. + - The primary development branch has been renamed from 'master' to 'main'. + The OVS tree remains hosted on GitHub. + https://github.com/openvswitch/ovs.git v3.3.0 - 16 Feb 2024 diff --git a/README.rst b/README.rst index a2c234f4d17..ca9e386c206 100644 --- a/README.rst +++ b/README.rst @@ -8,7 +8,7 @@ Open vSwitch .. image:: https://github.com/openvswitch/ovs/workflows/Build%20and%20Test/badge.svg :target: https://github.com/openvswitch/ovs/actions -.. image:: https://ci.appveyor.com/api/projects/status/github/openvswitch/ovs?branch=master&svg=true&retina=true +.. image:: https://ci.appveyor.com/api/projects/status/github/openvswitch/ovs?branch=main&svg=true&retina=true :target: https://ci.appveyor.com/project/blp/ovs/history .. image:: https://api.cirrus-ci.com/github/openvswitch/ovs.svg :target: https://cirrus-ci.com/github/openvswitch/ovs From a75e1c37aae62a4559a52907ad9962cfcba77253 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Sat, 6 Apr 2024 00:08:56 +0200 Subject: [PATCH 662/833] vlog: Log stack trace on vlog_abort. Currently, calls like ovs_assert() just print out a condition that caused assertion to fail. But it may be not enough to understand what exactly has happened, especially if assertion failed in some generic function like dp_packet_resize() or similar. Print the stack trace along with the abort message to give more context for the later debugging. 
This should be especially useful in case the issue happens in an environment with core dumps disabled. Adding the log to vlog_abort() to cover a few more cases where VLOG_ABORT is called and not only assertion failures. It would be nice to also have stack traces in case of reaching the OVS_NOT_REACHED(). But this macro is used in some places as a last resort and should not actually do more than just stopping the process immediately. And it also can be used in contexts without logging initialized. Such a change will need to be done more carefully. A better solution might be to use VLOG_ABORT() where appropriate instead. Acked-by: Kevin Traynor Acked-by: Simon Horman Signed-off-by: Ilya Maximets --- lib/vlog.c | 10 ++++++++-- tests/library.at | 4 +++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/lib/vlog.c b/lib/vlog.c index b2653142f3f..e78c785f7bb 100644 --- a/lib/vlog.c +++ b/lib/vlog.c @@ -29,6 +29,7 @@ #include #include #include "async-append.h" +#include "backtrace.h" #include "coverage.h" #include "dirs.h" #include "openvswitch/dynamic-string.h" @@ -1274,8 +1275,9 @@ vlog_fatal(const struct vlog_module *module, const char *message, ...) va_end(args); } -/* Logs 'message' to 'module' at maximum verbosity, then calls abort(). Always - * writes the message to stderr, even if the console destination is disabled. +/* Attempts to log a stack trace, logs 'message' to 'module' at maximum + * verbosity, then calls abort(). Always writes the message to stderr, even + * if the console destination is disabled. * * Choose this function instead of vlog_fatal_valist() if the daemon monitoring * facility should automatically restart the current daemon. */ @@ -1289,6 +1291,10 @@ vlog_abort_valist(const struct vlog_module *module_, * message written by the later ovs_abort_valist(). */ module->levels[VLF_CONSOLE] = VLL_OFF; + /* Printing the stack trace before the 'message', because the 'message' + * will flush the async log queue (VLL_EMER).
With a different order we + * would need to flush the queue manually again. */ + log_backtrace(); vlog_valist(module, VLL_EMER, message, args); ovs_abort_valist(0, message, args); } diff --git a/tests/library.at b/tests/library.at index 7b4acebb8a3..d962e1b3fd2 100644 --- a/tests/library.at +++ b/tests/library.at @@ -230,7 +230,9 @@ AT_CHECK([ovstest test-util -voff -vfile:info '-vPATTERN:file:%c|%p|%m' --log-fi [$exit_status], [], [stderr]) AT_CHECK([sed 's/\(opened log file\) .*/\1/ -s/|[[^|]]*: /|/' test-util.log], [0], [dnl +s/|[[^|]]*: /|/ +/backtrace/d +/|.*|/!d' test-util.log], [0], [dnl vlog|INFO|opened log file util|EMER|assertion false failed in test_assert() ]) From a35fc4ae5b4d80692022381c4bd39eecb766b5d4 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 9 Apr 2024 21:55:11 +0200 Subject: [PATCH 663/833] checkpatch: Don't spellcheck names in tags. Current code checks spelling of names in commit message tags and that makes no sense. Most of the tags are explicitly handled, but tags like 'Tested-by' or other lesser used ones are falling through to the spellchecker and need to be excluded. Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- utilities/checkpatch.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utilities/checkpatch.py b/utilities/checkpatch.py index 692bb88d40d..5a41bbef781 100755 --- a/utilities/checkpatch.py +++ b/utilities/checkpatch.py @@ -442,7 +442,8 @@ def check_spelling(line, comment): if not spell_check_dict or not spellcheck: return False - if line.startswith('Fixes: '): + is_name_tag = re.compile(r'^\s*([a-z-]+-by): (.*@.*)$', re.I | re.M | re.S) + if line.startswith('Fixes: ') or is_name_tag.match(line): return False words = filter_comments(line, True) if comment else line From 751e14c635d97fce80949f4aab6d579d3ed4fab8 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 10 Apr 2024 11:41:27 +0200 Subject: [PATCH 664/833] cirrus: Update to FreeBSD 13.3. 
13.3 was released on March 5 and 13.2 will reach EoL in June. Update now. Acked-by: Eelco Chaudron Acked-by: Kevin Traynor Signed-off-by: Ilya Maximets --- .cirrus.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cirrus.yml b/.cirrus.yml index d8a97228095..8db385f002f 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -2,7 +2,7 @@ freebsd_build_task: freebsd_instance: matrix: - image_family: freebsd-13-2-snap + image_family: freebsd-13-3-snap image_family: freebsd-14-0-snap cpu: 4 memory: 4G From 241365b6d10fe565b5ae198bc5bd9120f604e8fa Mon Sep 17 00:00:00 2001 From: Dumitru Ceara Date: Wed, 10 Apr 2024 15:56:24 +0200 Subject: [PATCH 665/833] python: Remove hacking dependency and use recent flake8. The previously enabled 'hacking' checks were only applicable to Python 2 code. OVS hasn't supported Python 2 for a while now so it's fine to remove the dependency on hacking. A similar change landed in OVN a while ago: https://github.com/ovn-org/ovn/commit/271186fa7d76 Acked-by: Simon Horman Signed-off-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- .ci/linux-prepare.sh | 2 +- Documentation/intro/install/general.rst | 5 +---- Makefile.am | 6 ------ 3 files changed, 2 insertions(+), 11 deletions(-) diff --git a/.ci/linux-prepare.sh b/.ci/linux-prepare.sh index 5028bdc442d..2a191b57fb8 100755 --- a/.ci/linux-prepare.sh +++ b/.ci/linux-prepare.sh @@ -23,7 +23,7 @@ cd ..
# https://github.com/pypa/pip/issues/10655 pip3 install --disable-pip-version-check --user wheel pip3 install --disable-pip-version-check --user \ - flake8 'hacking>=3.0' netaddr pyparsing sarif-tools sphinx setuptools + flake8 netaddr pyparsing sarif-tools sphinx setuptools # Install python test dependencies pip3 install -r python/test_requirements.txt diff --git a/Documentation/intro/install/general.rst b/Documentation/intro/install/general.rst index 2b3959a1437..eb0813b97f9 100644 --- a/Documentation/intro/install/general.rst +++ b/Documentation/intro/install/general.rst @@ -176,10 +176,7 @@ following to obtain better warnings: - clang, version 3.4 or later -- flake8 along with the hacking flake8 plugin (for Python code). The automatic - flake8 check that runs against Python code has some warnings enabled that - come from the "hacking" flake8 plugin. If it's not installed, the warnings - just won't occur until it's run on a system with "hacking" installed. +- flake8 (for Python code) - the python packages listed in "python/test_requirements.txt" (compatible with pip). 
If they are installed, the pytest-based Python unit tests will diff --git a/Makefile.am b/Makefile.am index 45fce1243a7..e6c90a911aa 100644 --- a/Makefile.am +++ b/Makefile.am @@ -402,16 +402,10 @@ ALL_LOCAL += flake8-check # F811 redefinition of unused from line (only from flake8 v2.0) # D*** -- warnings from flake8-docstrings plugin # H*** -- warnings from flake8 hacking plugin (custom style checks beyond PEP8) -# H231 Python 3.x incompatible 'except x,y:' construct -# H232 Python 3.x incompatible octal 077 should be written as 0o77 -# H233 Python 3.x incompatible use of print operator -# H238 old style class declaration, use new style (inherit from `object`) -FLAKE8_SELECT = H231,H232,H233,H238 FLAKE8_IGNORE = E121,E123,E125,E126,E127,E128,E129,E131,E203,E722,W503,W504,F811,D,H,I flake8-check: $(FLAKE8_PYFILES) $(FLAKE8_WERROR)$(AM_V_GEN) \ src='$^' && \ - flake8 $$src --select=$(FLAKE8_SELECT) $(FLAKE8_FLAGS) && \ flake8 $$src --ignore=$(FLAKE8_IGNORE) $(FLAKE8_FLAGS) && \ touch $@ endif From b34dac4c68eb60f655a9c7ffe8b1974c1b18b8c3 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 10 Apr 2024 15:09:20 +0100 Subject: [PATCH 666/833] appveyor: Remove reference to master branch. The OvS primary development branch has been renamed main so there is no longer any need for this CI configuration to refer to master. Acked-by: Alin Gabriel Serdean Acked-by: Eelco Chaudron Acked-by: Ilya Maximets Signed-off-by: Simon Horman --- appveyor.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 050c7dead78..baa84475396 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -3,7 +3,6 @@ image: Visual Studio 2019 branches: only: - main - - master configuration: - Debug - Release From acf6537124b6eb9ab68f0ba49df9454c913439ff Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 10 Apr 2024 15:09:21 +0100 Subject: [PATCH 667/833] github: Remove reference to master branch. 
The OvS primary development branch has been renamed main so there is no longer any need for this CI configuration to refer to master. Acked-by: Eelco Chaudron Acked-by: Ilya Maximets Signed-off-by: Simon Horman --- .github/workflows/build-and-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 6f5139304ae..2d64937e41b 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -307,7 +307,7 @@ jobs: MIN_DISTANCE=1000 git remote add upstream https://github.com/openvswitch/ovs.git git fetch upstream - for upstream_head in $(git ls-remote --heads upstream main master dpdk-latest branch-2.17 branch-[3456789]* | cut -f 1); do + for upstream_head in $(git ls-remote --heads upstream main dpdk-latest branch-2.17 branch-[3456789]* | cut -f 1); do CURR_BASE=$(git merge-base ${upstream_head} HEAD 2>/dev/null) if [ ${CURR_BASE} ]; then DISTANCE=$(git log --oneline ${CURR_BASE}..HEAD | wc -l); From 3cd0299aaa6c4293d0468a5ae38fb65854e1bee9 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 11 Apr 2024 00:43:28 +0200 Subject: [PATCH 668/833] ovsdb-doc: Fix syntax warning with Python 3.12 and flake8 issues. ovsdb-doc script generates the following syntax warning while running with Python 3.12: /ovsdb/ovsdb-doc:240: SyntaxWarning: invalid escape sequence '\{' s += """ This doesn't cause a build failure because so far it's only a warning, but it will become a syntax error in the future. Fix that by converting to a raw string and removing unnecessary escape sequences. Adding ovsdb-doc to flake8-check to avoid re-introducing issues in the future. This means also fixing all the other issues with the script like unused imports and variables, long lines, missing empty lines, wildcarded imports. Also cleaning up one place that handles compatibility with Python 2 types, since we do not support Python 2 for a long time now. 
Acked-by: Simon Horman Signed-off-by: Ilya Maximets --- ovsdb/automake.mk | 1 + ovsdb/ovsdb-doc | 50 +++++++++++++++++++++++------------------------ 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/ovsdb/automake.mk b/ovsdb/automake.mk index eba713bb6d7..e8149224b23 100644 --- a/ovsdb/automake.mk +++ b/ovsdb/automake.mk @@ -114,6 +114,7 @@ $(OVSIDL_BUILT): ovsdb/ovsdb-idlc.in python/ovs/dirs.py # ovsdb-doc EXTRA_DIST += ovsdb/ovsdb-doc +FLAKE8_PYFILES += ovsdb/ovsdb-doc OVSDB_DOC = $(run_python) $(srcdir)/ovsdb/ovsdb-doc ovsdb/ovsdb-doc: python/ovs/dirs.py diff --git a/ovsdb/ovsdb-doc b/ovsdb/ovsdb-doc index 099770d253f..2edf487a289 100755 --- a/ovsdb/ovsdb-doc +++ b/ovsdb/ovsdb-doc @@ -14,9 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from datetime import date import getopt -import os import sys import xml.dom.minidom @@ -24,10 +22,13 @@ import ovs.json from ovs.db import error import ovs.db.schema -from ovs_build_helpers.nroff import * +from ovs_build_helpers.nroff import block_xml_to_nroff +from ovs_build_helpers.nroff import escape_nroff_literal +from ovs_build_helpers.nroff import text_to_nroff argv0 = sys.argv[0] + def typeAndConstraintsToNroff(column): type = column.type.toEnglish(escape_nroff_literal) constraints = column.type.constraintsToEnglish(escape_nroff_literal, @@ -38,6 +39,7 @@ def typeAndConstraintsToNroff(column): type += " (must be unique within table)" return type + def columnGroupToNroff(table, groupXml, documented_columns): introNodes = [] columnNodes = [] @@ -49,7 +51,10 @@ def columnGroupToNroff(table, groupXml, documented_columns): if (columnNodes and not (node.nodeType == node.TEXT_NODE and node.data.isspace())): - raise error.Error("text follows or inside : %s" % node) + raise error.Error( + "text follows or inside : %s" + % node + ) introNodes += [node] summary = [] @@ -65,15 +70,9 @@ def columnGroupToNroff(table, groupXml, documented_columns): if 
node.hasAttribute('type'): type_string = node.attributes['type'].nodeValue type_json = ovs.json.from_string(str(type_string)) - # py2 -> py3 means str -> bytes and unicode -> str - try: - if type(type_json) in (str, unicode): - raise error.Error("%s %s:%s has invalid 'type': %s" - % (table.name, name, key, type_json)) - except: - if type(type_json) in (bytes, str): - raise error.Error("%s %s:%s has invalid 'type': %s" - % (table.name, name, key, type_json)) + if type(type_json) in (bytes, str): + raise error.Error("%s %s:%s has invalid 'type': %s" + % (table.name, name, key, type_json)) type_ = ovs.db.types.BaseType.from_json(type_json) else: type_ = column.type.value @@ -91,10 +90,11 @@ def columnGroupToNroff(table, groupXml, documented_columns): else: if type_.type != column.type.value.type: type_english = type_.toEnglish() + typeNroff += ", containing " if type_english[0] in 'aeiou': - typeNroff += ", containing an %s" % type_english + typeNroff += "an %s" % type_english else: - typeNroff += ", containing a %s" % type_english + typeNroff += "a %s" % type_english constraints = ( type_.constraintsToEnglish(escape_nroff_literal, text_to_nroff)) @@ -121,6 +121,7 @@ def columnGroupToNroff(table, groupXml, documented_columns): raise error.Error("unknown element %s in " % node.tagName) return summary, intro, body + def tableSummaryToNroff(summary, level=0): s = "" for type, name, arg in summary: @@ -132,6 +133,7 @@ def tableSummaryToNroff(summary, level=0): s += ".RE\n" return s + def tableToNroff(schema, tableXml): tableName = tableXml.attributes['name'].nodeValue table = schema.tables[tableName] @@ -156,20 +158,17 @@ def tableToNroff(schema, tableXml): return s + def docsToNroff(schemaFile, xmlFile, erFile, version=None): schema = ovs.db.schema.DbSchema.from_json(ovs.json.from_file(schemaFile)) doc = xml.dom.minidom.parse(xmlFile).documentElement - schemaDate = os.stat(schemaFile).st_mtime - xmlDate = os.stat(xmlFile).st_mtime - d = date.fromtimestamp(max(schemaDate, 
xmlDate)) - if doc.hasAttribute('name'): manpage = doc.attributes['name'].nodeValue else: manpage = schema.name - if version == None: + if version is None: version = "UNKNOWN" # Putting '\" p as the first line tells "man" that the manpage @@ -194,7 +193,6 @@ def docsToNroff(schemaFile, xmlFile, erFile, version=None): .PP ''' % (manpage, schema.version, version, text_to_nroff(manpage), schema.name) - tables = "" introNodes = [] tableNodes = [] summary = [] @@ -237,8 +235,8 @@ Purpose """ % (name, text_to_nroff(title)) if erFile: - s += """ -.\\" check if in troff mode (TTY) + s += r""" +.\" check if in troff mode (TTY) .if t \{ .bp .SH "TABLE RELATIONSHIPS" @@ -248,8 +246,8 @@ database. Each node represents a table. Tables that are part of the ``root set'' are shown with double borders. Each edge leads from the table that contains it and points to the table that its value represents. Edges are labeled with their column names, followed by a -constraint on the number of allowed values: \\fB?\\fR for zero or one, -\\fB*\\fR for zero or more, \\fB+\\fR for one or more. Thick lines +constraint on the number of allowed values: \fB?\fR for zero or one, +\fB*\fR for zero or more, \fB+\fR for one or more. Thick lines represent strong references; thin lines represent weak references. .RS -1in """ @@ -263,6 +261,7 @@ represent strong references; thin lines represent weak references. s += tableToNroff(schema, node) + "\n" return s + def usage(): print("""\ %(argv0)s: ovsdb schema documentation generator @@ -278,6 +277,7 @@ The following options are also available: """ % {'argv0': argv0}) sys.exit(0) + if __name__ == "__main__": try: try: From dd24c57deded8d376a446ba3dede0591ed8898c9 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 11 Apr 2024 00:43:29 +0200 Subject: [PATCH 669/833] ovsdb-dot: Fix flake8 issues. Missing and extra spaces, missing empty lines, unused imports and variables, long lines. 
Decided to just comment out the unused 'tail' and 'head' as they seem useful in documenting the meaning of the words. Files added to flake8-check to avoid future issues. Acked-by: Simon Horman Signed-off-by: Ilya Maximets --- ovsdb/automake.mk | 1 + ovsdb/dot2pic | 6 +++--- ovsdb/ovsdb-dot.in | 41 ++++++++++++++++++++++------------------- 3 files changed, 26 insertions(+), 22 deletions(-) diff --git a/ovsdb/automake.mk b/ovsdb/automake.mk index e8149224b23..d484fe9debb 100644 --- a/ovsdb/automake.mk +++ b/ovsdb/automake.mk @@ -120,6 +120,7 @@ ovsdb/ovsdb-doc: python/ovs/dirs.py # ovsdb-dot EXTRA_DIST += ovsdb/ovsdb-dot.in ovsdb/dot2pic +FLAKE8_PYFILES += ovsdb/ovsdb-dot.in ovsdb/dot2pic noinst_SCRIPTS += ovsdb/ovsdb-dot CLEANFILES += ovsdb/ovsdb-dot OVSDB_DOT = $(run_python) $(srcdir)/ovsdb/ovsdb-dot.in diff --git a/ovsdb/dot2pic b/ovsdb/dot2pic index 2f858e19d5b..3db6444de64 100755 --- a/ovsdb/dot2pic +++ b/ovsdb/dot2pic @@ -17,6 +17,7 @@ import getopt import sys + def dot2pic(src, dst): scale = 1.0 while True: @@ -49,8 +50,8 @@ def dot2pic(src, dst): dst.write("box at %f,%f wid %f height %f\n" % (x, y, width, height)) elif command == 'edge': - tail = words[1] - head = words[2] + # tail = words[1] + # head = words[2] n = int(words[3]) # Extract x,y coordinates. @@ -114,4 +115,3 @@ else: if font_scale: print(".ps %+d" % font_scale) print(".PE") - diff --git a/ovsdb/ovsdb-dot.in b/ovsdb/ovsdb-dot.in index 41b986c0ac7..f1eefd49cbc 100755 --- a/ovsdb/ovsdb-dot.in +++ b/ovsdb/ovsdb-dot.in @@ -1,15 +1,13 @@ #! 
@PYTHON3@ -from datetime import date import ovs.db.error import ovs.db.schema import getopt -import os -import re import sys argv0 = sys.argv[0] + def printEdge(tableName, type, baseType, label): if baseType.ref_table_name: if type.n_min == 0: @@ -31,38 +29,42 @@ def printEdge(tableName, type, baseType, label): options['label'] = '"%s%s"' % (label, arity) if baseType.ref_type == 'weak': options['style'] = 'dotted' - print ("\t%s -> %s [%s];" % ( + print("\t%s -> %s [%s];" % ( tableName, baseType.ref_table_name, - ', '.join(['%s=%s' % (k,v) for k,v in options.items()]))) + ', '.join(['%s=%s' % (k, v) for k, v in options.items()]))) + def schemaToDot(schemaFile, arrows): schema = ovs.db.schema.DbSchema.from_json(ovs.json.from_file(schemaFile)) - print ("digraph %s {" % schema.name) - print ('\trankdir=LR;') - print ('\tsize="6.5,4";') - print ('\tmargin="0";') - print ("\tnode [shape=box];") + print("digraph %s {" % schema.name) + print('\trankdir=LR;') + print('\tsize="6.5,4";') + print('\tmargin="0";') + print("\tnode [shape=box];") if not arrows: - print ("\tedge [dir=none, arrowhead=none, arrowtail=none];") + print("\tedge [dir=none, arrowhead=none, arrowtail=none];") for tableName, table in schema.tables.items(): options = {} if table.is_root: options['style'] = 'bold' - print ("\t%s [%s];" % ( + print("\t%s [%s];" % ( tableName, - ', '.join(['%s=%s' % (k,v) for k,v in options.items()]))) + ', '.join(['%s=%s' % (k, v) for k, v in options.items()]))) for columnName, column in table.columns.items(): if column.type.value: - printEdge(tableName, column.type, column.type.key, "%s key" % columnName) - printEdge(tableName, column.type, column.type.value, "%s value" % columnName) + printEdge(tableName, column.type, column.type.key, + "%s key" % columnName) + printEdge(tableName, column.type, column.type.value, + "%s value" % columnName) else: printEdge(tableName, column.type, column.type.key, columnName) - print ("}"); + print("}") + def usage(): - print ("""\ + 
print("""\ %(argv0)s: compiles ovsdb schemas to graphviz format Prints a .dot file that "dot" can render to an entity-relationship diagram usage: %(argv0)s [OPTIONS] SCHEMA @@ -75,12 +77,13 @@ The following options are also available: """ % {'argv0': argv0}) sys.exit(0) + if __name__ == "__main__": try: try: options, args = getopt.gnu_getopt(sys.argv[1:], 'hV', ['no-arrows', - 'help', 'version',]) + 'help', 'version']) except getopt.GetoptError as geo: sys.stderr.write("%s: %s\n" % (argv0, geo.msg)) sys.exit(1) @@ -92,7 +95,7 @@ if __name__ == "__main__": elif key in ['-h', '--help']: usage() elif key in ['-V', '--version']: - print ("ovsdb-dot (Open vSwitch) @VERSION@") + print("ovsdb-dot (Open vSwitch) @VERSION@") else: sys.exit(0) From 16b7475414fa1eaf0ab1d723fdb6978060511c44 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 11 Apr 2024 00:43:30 +0200 Subject: [PATCH 670/833] github: Update python to 3.12. We pinned the python version to 3.9 because we had issues building older meson 0.47.1 with python 3.10. Since then meson was updated to 0.53.2 in our CI, but we didn't reconsider the python version. Newer versions of python uncover more issues with our python files. And newer major distributions are using newer versions of python. But we do not really want to use bleeding edge of python releases either to avoid unexpected CI failures that need immediate fixes. Pin python version to 3.12 as it is the latest released version and we should not have any issues with this version. While at it, updating meson to a newer version that plays nicely with python 3.12. We do not really care much about the version we use here as long as it is able to build the version of DPDK we're using. Meson has no LTS releases, as far as I can tell, so just choosing the latest stable 1.4.x series. It should be fine to use for a next few years. Major distributions are using 1.0+ versions. Upcoming F40 and Ubuntu 24.03 have meson 1.3. 
It would also be nice to test the minimal supported version of python, but 3.6 is not available in setup-python for 22.04. The oldest is 3.7. And 3.7 is EoL, so pip fails to install some of our dependencies. The oldest version we can use today is 3.8. But, in the end, this becomes a race against older python versions reaching end of their life and packages dropping support of these versions. This may cause unexpected CI failures. So, not doing that for now. Acked-by: Simon Horman Signed-off-by: Ilya Maximets --- .ci/dpdk-prepare.sh | 2 +- .github/workflows/build-and-test.yml | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.ci/dpdk-prepare.sh b/.ci/dpdk-prepare.sh index f7e6215ddac..4424f9eb97f 100755 --- a/.ci/dpdk-prepare.sh +++ b/.ci/dpdk-prepare.sh @@ -8,4 +8,4 @@ set -ev # https://github.com/pypa/pip/issues/10655 pip3 install --disable-pip-version-check --user wheel pip3 install --disable-pip-version-check --user pyelftools -pip3 install --user 'meson==0.53.2' +pip3 install --user 'meson>=1.4,<1.5' diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 2d64937e41b..424dbab6c91 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -2,6 +2,9 @@ name: Build and Test on: [push, pull_request] +env: + python_default: 3.12 + jobs: build-dpdk: env: @@ -54,7 +57,7 @@ jobs: if: steps.dpdk_cache.outputs.cache-hit != 'true' uses: actions/setup-python@v5 with: - python-version: '3.9' + python-version: ${{ env.python_default }} - name: update APT cache if: steps.dpdk_cache.outputs.cache-hit != 'true' @@ -217,7 +220,7 @@ jobs: - name: set up python uses: actions/setup-python@v5 with: - python-version: '3.9' + python-version: ${{ env.python_default }} - name: cache if: matrix.dpdk != '' || matrix.dpdk_shared != '' @@ -354,7 +357,7 @@ jobs: - name: set up python uses: actions/setup-python@v5 with: - python-version: '3.9' + python-version: ${{ env.python_default }} - 
name: get cached dpdk-dir uses: actions/cache/restore@v4 @@ -407,7 +410,7 @@ jobs: - name: set up python uses: actions/setup-python@v5 with: - python-version: '3.9' + python-version: ${{ env.python_default }} - name: install dependencies run: brew install automake libtool - name: prepare From e1e0c6a3ed51bf442ccc7cd4f811c453e240e113 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Tue, 9 Apr 2024 09:19:02 +0200 Subject: [PATCH 671/833] checkpatch: Allow rST manpages to be added. The current __check_doc_is_listed() verifies that the new .rst file is listed in Documentation/automake.mk with the full path (i.e: "{directory}/{filename}"). While this holds true for generic documentation files, which are added to DOC_SOURCE with the full path, it's not true for rST manpages which are added only by filename to RST_MANPAGES target (see Documentation/automake.mk). This makes the current implementation of the check to incorrectly raise a warning as follows even though the patch does add the file to RST_MANPAGES: """ WARNING: New doc ovs-flowviz.8.rst not listed in Documentation/automake.mk """ Fix it by making the {dir}/ part of the docre regexp optional. 
Signed-off-by: Adrian Moreno Acked-by: Eelco Chaudron Signed-off-by: Simon Horman --- utilities/checkpatch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utilities/checkpatch.py b/utilities/checkpatch.py index 5a41bbef781..6b293770ddd 100755 --- a/utilities/checkpatch.py +++ b/utilities/checkpatch.py @@ -495,7 +495,7 @@ def __check_doc_is_listed(text, doctype, docdir, docfile): docre = re.compile(r'\n\+.*{}'.format(docfile.replace('.rst', ''))) elif doctype == 'automake': beginre = re.compile(r'\+\+\+.*Documentation/automake.mk') - docre = re.compile(r'\n\+\t{}/{}'.format(docdir, docfile)) + docre = re.compile(r'\n\+\t(?:{}/)?{}'.format(docdir, docfile)) else: raise NotImplementedError("Invalid doctype: {}".format(doctype)) From 66a8430c70c6ab67cfd32d4cd1848a80ae445154 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 12 Apr 2024 01:32:40 +0200 Subject: [PATCH 672/833] appveyor: Fix too wide OpenSSL version regexp. Current regexp is not good enough. OpenSSL 3.3.0 is now available and unfortunately the regexp is matching both 3.3.0 and 3.0.13. All the AppVeyor runs are currently failing because of this. Making it more restrictive by matching on the start of the string, explicit dots and numbers after the last dot. Hopefully, this is good enough. In addition, taking only the first result just in case it mismatches again. 
Fixes: 9d8208484a35 ("appveyor: Build with OpenSSL 3.0.") Acked-by: Simon Horman Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- appveyor.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index baa84475396..d11e4639989 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -26,12 +26,12 @@ install: $URL = "https://raw.githubusercontent.com/slproweb/opensslhashes/master/win32_openssl_hashes.json" $webData = (Invoke-WebRequest -Uri $URL).content | ConvertFrom-Json $source = ($webData.files.PSObject.Properties | Where-Object { - $_.Value.basever -match "3.0.*" -and - $_.Value.bits -eq "64" -and - $_.Value.arch -eq "INTEL" -and - $_.Value.installer -eq "exe" -and + $_.Value.basever -match "^3\.0\.[0-9]+" -and + $_.Value.bits -eq "64" -and + $_.Value.arch -eq "INTEL" -and + $_.Value.installer -eq "exe" -and -not $_.Value.light - } | Select-Object Value).PSObject.Properties.Value + } | Select-Object Value | Select -First 1).PSObject.Properties.Value Write-Host "Latest OpenSSL 3.0:" ($source | Format-List | Out-String) From 4f29804f249bf5994603b73c3250198ce0adb8d7 Mon Sep 17 00:00:00 2001 From: Roi Dayan via dev Date: Tue, 16 Apr 2024 16:21:48 +0300 Subject: [PATCH 673/833] netdev-dpdk: Fix possible memory leak configuring VF MAC address. VLOG_WARN_BUF() is allocating memory for the error string and should be used if the configuration cannot continue and error is being returned so the caller has indication of releasing the pointer. Change to VLOG_WARN() to keep the logic that error is not being returned.
Fixes: f4336f504b17 ("netdev-dpdk: Add option to configure VF MAC address.") Signed-off-by: Roi Dayan Acked-by: Gaetan Rivet Acked-by: Eli Britstein Signed-off-by: Simon Horman --- lib/netdev-dpdk.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 2111f776810..9249b9e9c64 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -2379,17 +2379,16 @@ netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args, struct eth_addr mac; if (!dpdk_port_is_representor(dev)) { - VLOG_WARN_BUF(errp, "'%s' is trying to set the VF MAC '%s' " - "but 'options:dpdk-vf-mac' is only supported for " - "VF representors.", - netdev_get_name(netdev), vf_mac); + VLOG_WARN("'%s' is trying to set the VF MAC '%s' " + "but 'options:dpdk-vf-mac' is only supported for " + "VF representors.", + netdev_get_name(netdev), vf_mac); } else if (!eth_addr_from_string(vf_mac, &mac)) { - VLOG_WARN_BUF(errp, "interface '%s': cannot parse VF MAC '%s'.", - netdev_get_name(netdev), vf_mac); + VLOG_WARN("interface '%s': cannot parse VF MAC '%s'.", + netdev_get_name(netdev), vf_mac); } else if (eth_addr_is_multicast(mac)) { - VLOG_WARN_BUF(errp, - "interface '%s': cannot set VF MAC to multicast " - "address '%s'.", netdev_get_name(netdev), vf_mac); + VLOG_WARN("interface '%s': cannot set VF MAC to multicast " + "address '%s'.", netdev_get_name(netdev), vf_mac); } else if (!eth_addr_equals(dev->requested_hwaddr, mac)) { dev->requested_hwaddr = mac; netdev_request_reconfigure(netdev); From 2b7efee031c3a2205ad2ee999275893edd083c1c Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Fri, 12 Apr 2024 02:45:17 +0000 Subject: [PATCH 674/833] socket: Increase listen backlog to 64 everywhere. 
Before the patch, the size of the backlog depended on the type of socket (UNIX vs INET) as well as on the language (C vs Python), specifically: - python used backlog size = 10 for all sockets; - C used 64 for UNIX sockets but 10 for INET sockets. This consolidates the values across the board. It effectively bumps the number of simultaneous connections to python unixctl servers to 64. Also for INET C servers too. The rationale to do it, on top of consistency, is as follows: - fmt_pkt in ovn testsuite is limited by python server listen backlog, and as was found out when adopting the tool, it is sometimes useful to run lots of parallel calls to fmt_pkt unixctl server in some tests. (See [1] for example.) - there is a recent report [2] on discuss@ ML where the reporter noticed significant listen queue overflows in some scenarios (large openstack deployments; happens during leader transition when hundreds of neutron nodes - with dozens of neutron api workers each - simultaneously reconnect to the same northbound leader.) Note: While there is no clear indication that this backlog size bump would resolve the reported issues, it would probably help somewhat. [1] https://github.com/ovn-org/ovn/commit/0baca3e519756cbe98a32526ccc637bb73468743 [2] https://mail.openvswitch.org/pipermail/ovs-discuss/2024-April/053049.html Signed-off-by: Ihar Hrachyshka Acked-by: Eelco Chaudron Signed-off-by: Simon Horman --- lib/socket-util.c | 2 +- python/ovs/stream.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/socket-util.c b/lib/socket-util.c index 3eb3a3816b7..2d89fce8501 100644 --- a/lib/socket-util.c +++ b/lib/socket-util.c @@ -760,7 +760,7 @@ inet_open_passive(int style, const char *target, int default_port, } /* Listen. 
*/ - if (style == SOCK_STREAM && listen(fd, 10) < 0) { + if (style == SOCK_STREAM && listen(fd, 64) < 0) { error = sock_errno(); VLOG_ERR("%s: listen: %s", target, sock_strerror(error)); goto error; diff --git a/python/ovs/stream.py b/python/ovs/stream.py index 82fbb0d6883..dbb6b2e1f77 100644 --- a/python/ovs/stream.py +++ b/python/ovs/stream.py @@ -620,7 +620,7 @@ def open(name): raise Exception('Unknown connection string') try: - sock.listen(10) + sock.listen(64) except socket.error as e: vlog.err("%s: listen: %s" % (name, os.strerror(e.error))) sock.close() From fbade819d2de1685c49e1deaf62d45049a5c2a27 Mon Sep 17 00:00:00 2001 From: Terry Wilson Date: Wed, 10 Apr 2024 16:38:24 -0500 Subject: [PATCH 675/833] ovsdb-idl: Add python keyword to persistent UUID test. The Python persistent UUID tests should have the keyword "python" added so that TESTSUITEFLAGS="-k python" will not miss testing them. Fixes: 55b9507e6824 ("ovsdb-idl: Add the support to specify the uuid for row insert.") Signed-off-by: Terry Wilson Tested-by: Simon Horman Signed-off-by: Simon Horman --- tests/ovsdb-idl.at | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ovsdb-idl.at b/tests/ovsdb-idl.at index fb568dd823c..c9e36d678b0 100644 --- a/tests/ovsdb-idl.at +++ b/tests/ovsdb-idl.at @@ -2713,7 +2713,7 @@ m4_define([OVSDB_CHECK_IDL_PERS_UUID_INSERT_C], m4_define([OVSDB_CHECK_IDL_PERS_UUID_INSERT_PY], [AT_SETUP([$1 - Python3]) - AT_KEYWORDS([idl persistent uuid insert]) + AT_KEYWORDS([idl python persistent uuid insert]) OVSDB_START_IDLTEST([], ["$abs_srcdir/idltest.ovsschema"]) AT_CHECK([$PYTHON3 $srcdir/test-ovsdb.py -t10 idl $srcdir/idltest.ovsschema unix:socket $2], [0], [stdout], [stderr]) From e876b046630bdd141387c5054407acdf96bc87ce Mon Sep 17 00:00:00 2001 From: Chris Riches Date: Wed, 10 Apr 2024 15:48:55 +0000 Subject: [PATCH 676/833] rhel/systemd: Set ovsdb-server timeout to 5 minutes. 
If the database is particularly large (multi-GB), ovsdb-server can take several minutes to come up. This tends to fall afoul of the default systemd start timeout, which is typically 90s, putting the service into an infinite restart loop. To avoid this, set the timeout to a more generous 5 minutes. This change brings ovsdb-server's timeout in line with ovs-vswitchd, which got the same treatment in commit c1c69e8a45 ("rhel/systemd: Set ovs-vswitchd timeout to 5 minutes"). Acked-by: Simon Horman Signed-off-by: Chris Riches Signed-off-by: Ilya Maximets --- rhel/usr_lib_systemd_system_ovsdb-server.service | 1 + 1 file changed, 1 insertion(+) diff --git a/rhel/usr_lib_systemd_system_ovsdb-server.service b/rhel/usr_lib_systemd_system_ovsdb-server.service index 49dc06e38c2..558632320cc 100644 --- a/rhel/usr_lib_systemd_system_ovsdb-server.service +++ b/rhel/usr_lib_systemd_system_ovsdb-server.service @@ -29,3 +29,4 @@ ExecStop=/usr/share/openvswitch/scripts/ovs-ctl --no-ovs-vswitchd stop ExecReload=/usr/share/openvswitch/scripts/ovs-ctl --no-ovs-vswitchd \ ${OVS_USER_OPT} \ --no-monitor restart $OPTIONS +TimeoutSec=300 From 153d563c291d025d6578292dcbcec4f7a7041132 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 23 Apr 2024 21:32:31 +0200 Subject: [PATCH 677/833] AUTHORS: Add Chris Riches. 
Signed-off-by: Ilya Maximets --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 82075d32067..68ee6d46470 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -101,6 +101,7 @@ Carlo Andreotti c.andreotti@m3s.it Casey Barker crbarker@google.com Chandan Somani csomani@redhat.com Chandra Sekhar Vejendla csvejend@us.ibm.com +Chris Riches chris.riches@nutanix.com Chris Wright chrisw@sous-sol.org Christoph Jaeger cj@linux.com Christophe Fontaine cfontain@redhat.com From d7f2150ea83a02336adbf0bf48f160664f11ce8f Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 12 Apr 2024 01:45:01 +0200 Subject: [PATCH 678/833] ovsdb: raft: Fix inability to join a cluster with a large database. Inactivity probe interval on RAFT connections depends on a value of the election timer. However, the actual value is not known until the database snapshot with the RAFT information is received by a joining server. New joining server is using a default 1 second until then. In case a new joining server is trying to join an existing cluster with a large database, it may take more than a second to generate and send an initial database snapshot. This is causing an inability to actually join this cluster. Joining server sends ADD_SERVER request, waits 1 second, sends a probe, doesn't get a reply within another second, because the leader is busy preparing and sending an initial snapshot to it, disconnects, repeat. This is not an issue for the servers that did already join, since their probe intervals are larger than election timeout. Cooperative multitasking also doesn't fully solve this issue, since it depends on election timer, which is likely higher in the existing cluster with a very big database. Fix that by using the maximum election timer value for inactivity probes until the actual value is known.
We still shouldn't completely disable the probes, because in the rare event the connection is established but the other side silently goes away, we still want to disconnect and try to re-establish the connection eventually. Since probe intervals also depend on the joining state now, update them when the server joins the cluster. Fixes: 14b2b0aad7ae ("raft: Reintroduce jsonrpc inactivity probes.") Reported-by: Terry Wilson Reported-at: https://issues.redhat.com/browse/FDP-144 Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- ovsdb/raft.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/ovsdb/raft.c b/ovsdb/raft.c index d81a1758a0c..083ebf66a72 100644 --- a/ovsdb/raft.c +++ b/ovsdb/raft.c @@ -1018,8 +1018,13 @@ raft_conn_update_probe_interval(struct raft *raft, struct raft_conn *r_conn) * inactivity probe follower will just try to initiate election * indefinitely staying in 'candidate' role. And the leader will continue * to send heartbeats to the dead connection thinking that remote server - * is still part of the cluster. */ - int probe_interval = raft->election_timer + ELECTION_RANGE_MSEC; + * is still part of the cluster. + * + * While joining, the real value of the election timeout is not known to + * this server, so using the maximum. */ + int probe_interval = (raft->joining ? ELECTION_MAX_MSEC + : raft->election_timer) + + ELECTION_RANGE_MSEC; jsonrpc_session_set_probe_interval(r_conn->js, probe_interval); } @@ -2820,6 +2825,13 @@ raft_send_heartbeats(struct raft *raft) raft_reset_ping_timer(raft); } +static void +raft_join_complete(struct raft *raft) +{ + raft->joining = false; + raft_update_probe_intervals(raft); +} + /* Initializes the fields in 's' that represent the leader's view of the * server. */ static void @@ -2866,7 +2878,7 @@ raft_become_leader(struct raft *raft) * we're becoming a cluster leader without receiving reply for a * join request and will commit addition of this server ourselves. 
*/ VLOG_INFO_RL(&rl, "elected as leader while joining"); - raft->joining = false; + raft_join_complete(raft); } struct raft_server *s; @@ -3101,7 +3113,7 @@ raft_update_commit_index(struct raft *raft, uint64_t new_commit_index) "added to configuration without reply " "(eid: "UUID_FMT", commit index: %"PRIu64")", UUID_ARGS(&e->eid), index); - raft->joining = false; + raft_join_complete(raft); } } raft_servers_destroy(&servers); @@ -4049,7 +4061,7 @@ raft_handle_add_server_reply(struct raft *raft, } if (rpy->success) { - raft->joining = false; + raft_join_complete(raft); /* It is tempting, at this point, to check that this server is part of * the current configuration. However, this is not necessarily the From bcacd805fefa776d4943190d5b1c19238b3dcf75 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 12 Apr 2024 01:45:02 +0200 Subject: [PATCH 679/833] ovsdb: raft: Fix probe intervals after install snapshot request. If the new snapshot received with INSTALL_SNAPSHOT request contains a different election timer value, the timer is updated, but the probe intervals for RAFT connections are not. Fix that by updating probe intervals whenever we get election timer from the log. Fixes: 14b2b0aad7ae ("raft: Reintroduce jsonrpc inactivity probes.") Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- ovsdb/raft.c | 1 + 1 file changed, 1 insertion(+) diff --git a/ovsdb/raft.c b/ovsdb/raft.c index 083ebf66a72..ac3d37ac409 100644 --- a/ovsdb/raft.c +++ b/ovsdb/raft.c @@ -5035,6 +5035,7 @@ raft_get_election_timer_from_log(struct raft *raft) break; } } + raft_update_probe_intervals(raft); } static void From fb46f5d29ab86f4e95997ca47098a455ce91ed67 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Wed, 17 Apr 2024 10:54:14 +0300 Subject: [PATCH 680/833] netdev-dpdk: Improve error print to the user for flow control error. When failing to get flow control parameters use VLOG_WARN_BUF() to expose the error string in ovs-vsctl show. 
Signed-off-by: Roi Dayan Suggested-by: Simon Horman Acked-by: Eli Britstein Signed-off-by: Simon Horman --- lib/netdev-dpdk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 9249b9e9c64..7b84c858e9b 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -2425,8 +2425,8 @@ netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args, } err = 0; /* Not fatal. */ } else { - VLOG_WARN("%s: Cannot get flow control parameters: %s", - netdev_get_name(netdev), rte_strerror(err)); + VLOG_WARN_BUF(errp, "%s: Cannot get flow control parameters: %s", + netdev_get_name(netdev), rte_strerror(err)); } goto out; } From 8ce5c95f089c2006709d8f6039d597d75aa73a2f Mon Sep 17 00:00:00 2001 From: Paolo Valerio Date: Mon, 22 Apr 2024 14:37:41 +0200 Subject: [PATCH 681/833] dpctl: Fix segfault on ct-{set,del}-limits. When no parameters other than the datapath are specified a segfault occurs. Fix it by checking the argument access is inside the bounds. Signed-off-by: Paolo Valerio Acked-by: Kevin Traynor Signed-off-by: Simon Horman --- lib/dpctl.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/lib/dpctl.c b/lib/dpctl.c index 34ee7d0e2de..3c555a55925 100644 --- a/lib/dpctl.c +++ b/lib/dpctl.c @@ -2168,13 +2168,20 @@ static int dpctl_ct_set_limits(int argc, const char *argv[], struct dpctl_params *dpctl_p) { - struct dpif *dpif; - struct ds ds = DS_EMPTY_INITIALIZER; + struct ovs_list zone_limits = OVS_LIST_INITIALIZER(&zone_limits); int i = dp_arg_exists(argc, argv) ? 
2 : 1; + struct ds ds = DS_EMPTY_INITIALIZER; + struct dpif *dpif = NULL; uint32_t default_limit; - struct ovs_list zone_limits = OVS_LIST_INITIALIZER(&zone_limits); + int error; + + if (i >= argc) { + ds_put_cstr(&ds, "too few arguments"); + error = EINVAL; + goto error; + } - int error = opt_dpif_open(argc, argv, dpctl_p, INT_MAX, &dpif); + error = opt_dpif_open(argc, argv, dpctl_p, INT_MAX, &dpif); if (error) { return error; } @@ -2261,11 +2268,17 @@ static int dpctl_ct_del_limits(int argc, const char *argv[], struct dpctl_params *dpctl_p) { - struct dpif *dpif; + struct ovs_list zone_limits = OVS_LIST_INITIALIZER(&zone_limits); + int i = dp_arg_exists(argc, argv) ? 2 : 1; struct ds ds = DS_EMPTY_INITIALIZER; + struct dpif *dpif = NULL; int error; - int i = dp_arg_exists(argc, argv) ? 2 : 1; - struct ovs_list zone_limits = OVS_LIST_INITIALIZER(&zone_limits); + + if (i >= argc) { + ds_put_cstr(&ds, "too few arguments"); + error = EINVAL; + goto error; + } error = opt_dpif_open(argc, argv, dpctl_p, 4, &dpif); if (error) { From 1876b2796fb970f92bc188a4d06bbdf38394cde9 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 26 Apr 2024 18:35:21 +0200 Subject: [PATCH 682/833] tests: Fix build failure with Clang 18 due to -Wformat-truncation. Cirrus CI is broken on FreeBSD 13.3 due to clang version update. It now complains about snprintf truncation the same way GCC does: tests/test-util.c:1129:16: error: 'snprintf' will always be truncated; specified size is 5, but format string expands to at least 6 [-Werror,-Wformat-truncation] 1129 | ovs_assert(snprintf(s, 5, "abcde") == 5); | ^ Clang 17 on FreeBSD 14.0 works fine, but new Clang 18.1.4 on 13.3 fails to build. Fix that by disabling Clang diagnostic the same way as we do for GCC. Unfortunately, the pragma's are compiler-specific, so cannot be combined, AFAIK. 
Acked-by: Ales Musil Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- tests/test-util.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/test-util.c b/tests/test-util.c index 7d899fbbfd9..5d88d38f26a 100644 --- a/tests/test-util.c +++ b/tests/test-util.c @@ -1116,12 +1116,16 @@ test_snprintf(struct ovs_cmdl_context *ctx OVS_UNUSED) { char s[16]; + /* GCC 7+ and Clang 18+ warn about the following calls that truncate + * a string using snprintf(). We're testing that truncation works + * properly, so temporarily disable the warning. */ #if __GNUC__ >= 7 - /* GCC 7+ warns about the following calls that truncate a string using - * snprintf(). We're testing that truncation works properly, so - * temporarily disable the warning. */ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wformat-truncation" +#endif +#if __clang_major__ >= 18 +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wformat-truncation" #endif ovs_assert(snprintf(s, 4, "abcde") == 5); ovs_assert(!strcmp(s, "abc")); @@ -1130,6 +1134,9 @@ test_snprintf(struct ovs_cmdl_context *ctx OVS_UNUSED) ovs_assert(!strcmp(s, "abcd")); #if __GNUC__ >= 7 #pragma GCC diagnostic pop +#endif +#if __clang_major__ >= 18 +#pragma clang diagnostic pop #endif ovs_assert(snprintf(s, 6, "abcde") == 5); From 169ff9ea933f74854eab8d27457990a52706eec4 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 26 Apr 2024 19:44:52 +0200 Subject: [PATCH 683/833] tc: Fix -Wgnu-variable-sized-type-not-at-end warning with Clang 18. Clang 18.1.3-2.fc41 throws a warning: lib/tc.c:3060:25: error: field 'sel' with variable sized type 'struct tc_pedit_sel' not at the end of a struct or class is a GNU extension [-Werror,-Wgnu-variable-sized-type-not-at-end] 3060 | struct tc_pedit sel; | ^ Refactor the structure into a proper union to avoid the build failure. 
Interestingly, clang 18.1.3-2.fc41 on Fedora throws a warning, but relatively the same version 18.1.3 (1) on Ubuntu 24.04 does not. Acked-by: Ales Musil Signed-off-by: Ilya Maximets --- lib/tc.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/lib/tc.c b/lib/tc.c index e9bcae4e4b1..e55ba3b1bbc 100644 --- a/lib/tc.c +++ b/lib/tc.c @@ -3056,17 +3056,17 @@ nl_msg_put_flower_rewrite_pedits(struct ofpbuf *request, struct tc_action *action, uint32_t action_pc) { - struct { + union { struct tc_pedit sel; - struct tc_pedit_key keys[MAX_PEDIT_OFFSETS]; - struct tc_pedit_key_ex keys_ex[MAX_PEDIT_OFFSETS]; - } sel = { - .sel = { - .nkeys = 0 - } - }; + uint8_t buffer[sizeof(struct tc_pedit) + + MAX_PEDIT_OFFSETS * sizeof(struct tc_pedit_key)]; + } sel; + struct tc_pedit_key_ex keys_ex[MAX_PEDIT_OFFSETS]; int i, j, err; + memset(&sel, 0, sizeof sel); + memset(keys_ex, 0, sizeof keys_ex); + for (i = 0; i < ARRAY_SIZE(flower_pedit_map); i++) { struct flower_key_to_pedit *m = &flower_pedit_map[i]; struct tc_pedit_key *pedit_key = NULL; @@ -3100,8 +3100,8 @@ nl_msg_put_flower_rewrite_pedits(struct ofpbuf *request, return EOPNOTSUPP; } - pedit_key = &sel.keys[sel.sel.nkeys]; - pedit_key_ex = &sel.keys_ex[sel.sel.nkeys]; + pedit_key = &sel.sel.keys[sel.sel.nkeys]; + pedit_key_ex = &keys_ex[sel.sel.nkeys]; pedit_key_ex->cmd = TCA_PEDIT_KEY_EX_CMD_SET; pedit_key_ex->htype = m->htype; pedit_key->off = cur_offset; @@ -3121,7 +3121,7 @@ nl_msg_put_flower_rewrite_pedits(struct ofpbuf *request, } } } - nl_msg_put_act_pedit(request, &sel.sel, sel.keys_ex, + nl_msg_put_act_pedit(request, &sel.sel, keys_ex, flower->csum_update_flags ? TC_ACT_PIPE : action_pc); return 0; From bd8e9f48f180800292c10e12f26824833f18506a Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 30 Apr 2024 16:36:32 +0200 Subject: [PATCH 684/833] sparse: Add immintrin.h header. Sparse doesn't understand _Float16 and some other types used by immintrin.h from GCC 13. 
This breaks sparse builds with DPDK on Fedora 38+ and Ubuntu 24.04. Add another sparse-specific header to work around the problem. We do need some of the functions and types defined in these headers, so we can't really stub out the whole header. Carving out the main offenders instead by defining the inclusion guards. This is fragile and depends on internals of immintrin and underlying headers, but I'm not sure what the better way to solve the issue would be. This approach should be more or less portable between compilers, because it only defines a few specific variables. We may have to add more as GCC headers change over time. This fixes the build with the following config on F38 and Ubuntu 24.04: ./configure --enable-sparse --with-dpdk=yes --enable-Werror Acked-by: Ales Musil Signed-off-by: Ilya Maximets --- include/sparse/automake.mk | 1 + include/sparse/immintrin.h | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 include/sparse/immintrin.h diff --git a/include/sparse/automake.mk b/include/sparse/automake.mk index c1229870bb8..45e6202c52e 100644 --- a/include/sparse/automake.mk +++ b/include/sparse/automake.mk @@ -1,5 +1,6 @@ noinst_HEADERS += \ include/sparse/rte_byteorder.h \ + include/sparse/immintrin.h \ include/sparse/xmmintrin.h \ include/sparse/arpa/inet.h \ include/sparse/bits/floatn.h \ diff --git a/include/sparse/immintrin.h b/include/sparse/immintrin.h new file mode 100644 index 00000000000..dd742be9f55 --- /dev/null +++ b/include/sparse/immintrin.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2024 Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __CHECKER__ +#error "Use this header only with sparse. It is not a correct implementation." +#endif + +/* Sparse doesn't know some types used by AVX512 and some other headers. + * Mark those headers as already included to avoid failures. This is fragile, + * so may need adjustments with compiler changes. */ +#define _AVX512BF16INTRIN_H_INCLUDED +#define _AVX512BF16VLINTRIN_H_INCLUDED +#define _AVXNECONVERTINTRIN_H_INCLUDED +#define _KEYLOCKERINTRIN_H_INCLUDED +#define __AVX512FP16INTRIN_H_INCLUDED +#define __AVX512FP16VLINTRIN_H_INCLUDED + +#include_next From bf82d2cce2d94443fbd40a32023eeb1d29c48e1b Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 1 May 2024 11:54:57 +0100 Subject: [PATCH 685/833] Documentation: Update Pacemaker main page link. Update link to pacemaker main page as the existing link is broken. Also, use HTTPS. Broken link flagged by make check-docs Acked-by: Eelco Chaudron Signed-off-by: Simon Horman --- Documentation/topics/integration.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/topics/integration.rst b/Documentation/topics/integration.rst index ee83f8d4390..0f40baae741 100644 --- a/Documentation/topics/integration.rst +++ b/Documentation/topics/integration.rst @@ -191,7 +191,7 @@ contents. At all times, the data can be transacted only from the active server. When the active server dies for some reason, entire OVN operations will be stalled. 
-`Pacemaker `__ is a cluster resource +`Pacemaker `__ is a cluster resource manager which can manage a defined set of resource across a set of clustered nodes. Pacemaker manages the resource with the help of the resource agents. One among the resource agent is `OCF From c176635f5131b94132b61c760fc26cbff5a9434d Mon Sep 17 00:00:00 2001 From: Felix Huettner Date: Wed, 24 Apr 2024 14:44:47 +0200 Subject: [PATCH 686/833] test-conntrack: Add per zone benchmark tool. The current test-conntrack benchmark command runs with multiple threads against a single conntrack zone. We now add a new benchmark-zones command that allows us to check the performance between multiple zones. We in there test the following scenarios for one zone while other zones also contain entries: 1. Flushing a single full zone 2. Flushing a single empty zone 3. Committing new conntrack entries against a single zone 4. Running conntrack_execute without commit against the entries of a single zone Signed-off-by: Felix Huettner Signed-off-by: Simon Horman --- tests/test-conntrack.c | 181 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 166 insertions(+), 15 deletions(-) diff --git a/tests/test-conntrack.c b/tests/test-conntrack.c index 292b6c048b8..dc8d6cff941 100644 --- a/tests/test-conntrack.c +++ b/tests/test-conntrack.c @@ -25,36 +25,48 @@ #include "ovstest.h" #include "pcap-file.h" #include "timeval.h" +#include "stopwatch.h" + +#define STOPWATCH_CT_EXECUTE_COMMIT "ct-execute-commit" +#define STOPWATCH_CT_EXECUTE_NO_COMMIT "ct-execute-no-commit" +#define STOPWATCH_FLUSH_FULL_ZONE "full-zone" +#define STOPWATCH_FLUSH_EMPTY_ZONE "empty-zone" static const char payload[] = "50540000000a50540000000908004500001c0000000000" "11a4cd0a0101010a0101020001000200080000"; +static struct dp_packet * +build_packet(uint16_t udp_src, uint16_t udp_dst, ovs_be16 *dl_type) +{ + struct udp_header *udp; + struct flow flow; + struct dp_packet *pkt = dp_packet_new(sizeof payload / 2); + + dp_packet_put_hex(pkt, 
payload, NULL); + flow_extract(pkt, &flow); + + udp = dp_packet_l4(pkt); + udp->udp_src = htons(udp_src); + udp->udp_dst = htons(udp_dst); + + *dl_type = flow.dl_type; + + return pkt; +} + static struct dp_packet_batch * prepare_packets(size_t n, bool change, unsigned tid, ovs_be16 *dl_type) { struct dp_packet_batch *pkt_batch = xzalloc(sizeof *pkt_batch); - struct flow flow; size_t i; ovs_assert(n <= ARRAY_SIZE(pkt_batch->packets)); dp_packet_batch_init(pkt_batch); for (i = 0; i < n; i++) { - struct udp_header *udp; - struct dp_packet *pkt = dp_packet_new(sizeof payload/2); - - dp_packet_put_hex(pkt, payload, NULL); - flow_extract(pkt, &flow); - - udp = dp_packet_l4(pkt); - udp->udp_src = htons(ntohs(udp->udp_src) + tid); - - if (change) { - udp->udp_dst = htons(ntohs(udp->udp_dst) + i); - } - + uint16_t udp_dst = change ? 2+1 : 2; + struct dp_packet *pkt = build_packet(1 + tid, udp_dst, dl_type); dp_packet_batch_add(pkt_batch, pkt); - *dl_type = flow.dl_type; } return pkt_batch; @@ -153,6 +165,140 @@ test_benchmark(struct ovs_cmdl_context *ctx) free(threads); } +static void +test_benchmark_zones(struct ovs_cmdl_context *ctx) +{ + unsigned long n_conns, n_zones, iterations; + long long start; + unsigned i, j; + ovs_be16 dl_type; + long long now = time_msec(); + + fatal_signal_init(); + + /* Parse arguments */ + n_conns = strtoul(ctx->argv[1], NULL, 0); + if (n_conns == 0 || n_conns >= UINT32_MAX) { + ovs_fatal(0, "n_conns must be between 1 and 2^32"); + } + n_zones = strtoul(ctx->argv[2], NULL, 0); + if (n_zones == 0 || n_zones >= UINT16_MAX) { + ovs_fatal(0, "n_zones must be between 1 and 2^16"); + } + iterations = strtoul(ctx->argv[3], NULL, 0); + if (iterations == 0) { + ovs_fatal(0, "iterations must be greater than 0"); + } + + ct = conntrack_init(); + + /* Create initial connection entries */ + start = time_msec(); + struct dp_packet_batch **pkt_batch = xzalloc(n_conns * sizeof *pkt_batch); + for (i = 0; i < n_conns; i++) { + pkt_batch[i] = 
xzalloc(sizeof(struct dp_packet_batch)); + dp_packet_batch_init(pkt_batch[i]); + uint16_t udp_src = (i & 0xFFFF0000) >> 16; + if (udp_src == 0) { + udp_src = UINT16_MAX; + } + uint16_t udp_dst = i & 0xFFFF; + if (udp_dst == 0) { + udp_dst = UINT16_MAX; + } + struct dp_packet *pkt = build_packet(udp_src, udp_dst, &dl_type); + dp_packet_batch_add(pkt_batch[i], pkt); + } + printf("initial packet generation time: %lld ms\n", time_msec() - start); + + /* Put initial entries to each zone */ + start = time_msec(); + for (i = 0; i < n_zones; i++) { + for (j = 0; j < n_conns; j++) { + conntrack_execute(ct, pkt_batch[j], dl_type, false, true, i, + NULL, NULL, NULL, NULL, now, 0); + pkt_metadata_init_conn(&pkt_batch[j]->packets[0]->md); + } + } + printf("initial insert time: %lld ms\n", time_msec() - start); + + /* Actually run the tests */ + stopwatch_create(STOPWATCH_CT_EXECUTE_COMMIT, SW_US); + stopwatch_create(STOPWATCH_CT_EXECUTE_NO_COMMIT, SW_US); + stopwatch_create(STOPWATCH_FLUSH_FULL_ZONE, SW_US); + stopwatch_create(STOPWATCH_FLUSH_EMPTY_ZONE, SW_US); + start = time_msec(); + for (i = 0; i < iterations; i++) { + /* Testing flushing a full zone */ + stopwatch_start(STOPWATCH_FLUSH_FULL_ZONE, time_usec()); + uint16_t zone = 1; + conntrack_flush(ct, &zone); + stopwatch_stop(STOPWATCH_FLUSH_FULL_ZONE, time_usec()); + + /* Now fill the zone again */ + stopwatch_start(STOPWATCH_CT_EXECUTE_COMMIT, time_usec()); + for (j = 0; j < n_conns; j++) { + conntrack_execute(ct, pkt_batch[j], dl_type, false, true, zone, + NULL, NULL, NULL, NULL, now, 0); + pkt_metadata_init_conn(&pkt_batch[j]->packets[0]->md); + } + stopwatch_stop(STOPWATCH_CT_EXECUTE_COMMIT, time_usec()); + + /* Running conntrack_execute on the now existing connections */ + stopwatch_start(STOPWATCH_CT_EXECUTE_NO_COMMIT, time_usec()); + for (j = 0; j < n_conns; j++) { + conntrack_execute(ct, pkt_batch[j], dl_type, false, false, zone, + NULL, NULL, NULL, NULL, now, 0); + 
pkt_metadata_init_conn(&pkt_batch[j]->packets[0]->md); + } + stopwatch_stop(STOPWATCH_CT_EXECUTE_NO_COMMIT, time_usec()); + + /* Testing flushing an empty zone */ + stopwatch_start(STOPWATCH_FLUSH_EMPTY_ZONE, time_usec()); + zone = UINT16_MAX; + conntrack_flush(ct, &zone); + stopwatch_stop(STOPWATCH_FLUSH_EMPTY_ZONE, time_usec()); + } + + printf("flush run time: %lld ms\n", time_msec() - start); + + stopwatch_sync(); + struct stopwatch_stats stats_ct_execute_commit = { .unit = SW_US }; + stopwatch_get_stats(STOPWATCH_CT_EXECUTE_COMMIT, &stats_ct_execute_commit); + struct stopwatch_stats stats_ct_execute_nocommit = { .unit = SW_US }; + stopwatch_get_stats(STOPWATCH_CT_EXECUTE_NO_COMMIT, + &stats_ct_execute_nocommit); + struct stopwatch_stats stats_flush_full = { .unit = SW_US }; + stopwatch_get_stats(STOPWATCH_FLUSH_FULL_ZONE, &stats_flush_full); + struct stopwatch_stats stats_flush_empty = { .unit = SW_US }; + stopwatch_get_stats(STOPWATCH_FLUSH_EMPTY_ZONE, &stats_flush_empty); + + printf("results:\n"); + printf(" | ct execute (commit) | ct execute (no commit) |" + " flush full zone | flush empty zone |\n"); + printf("+--------+---------------------+------------------------+" + "-----------------+------------------+\n"); + printf("| Min | %16llu us | %19llu us | %12llu us | %13llu us |\n", + stats_ct_execute_commit.min, stats_ct_execute_nocommit.min, + stats_flush_full.min, stats_flush_empty.min); + printf("| Max | %16llu us | %19llu us | %12llu us | %13llu us |\n", + stats_ct_execute_commit.max, stats_ct_execute_nocommit.max, + stats_flush_full.max, stats_flush_empty.max); + printf("| 95%%ile | %16.2f us | %19.2f us | %12.2f us | %13.2f us |\n", + stats_ct_execute_commit.pctl_95, stats_ct_execute_nocommit.pctl_95, + stats_flush_full.pctl_95, stats_flush_empty.pctl_95); + printf("| Avg | %16.2f us | %19.2f us | %12.2f us | %13.2f us |\n", + stats_ct_execute_commit.ewma_1, stats_ct_execute_nocommit.ewma_1, + stats_flush_full.ewma_1, stats_flush_empty.ewma_1); + + 
conntrack_destroy(ct); + for (i = 0; i < n_conns; i++) { + dp_packet_delete_batch(pkt_batch[i], true); + free(pkt_batch[i]); + } + free(pkt_batch); +} + static void pcap_batch_execute_conntrack(struct conntrack *ct_, struct dp_packet_batch *pkt_batch) @@ -264,6 +410,11 @@ static const struct ovs_cmdl_command commands[] = { * 'batch_size' (1 by default) per call, with the commit flag set. * Prints the ct_state of each packet. */ {"pcap", "file [batch_size]", 1, 2, test_pcap, OVS_RO}, + /* Creates 'n_conns' connections in 'n_zones' zones each. + * Afterwards triggers flush requests repeadeatly for the last filled zone + * and an empty zone. */ + {"benchmark-zones", "n_conns n_zones iterations", 3, 3, + test_benchmark_zones, OVS_RO}, {NULL, NULL, 0, 0, NULL, OVS_RO}, }; From 139b564dbd0a1934a4a135cc0bfb700249afd572 Mon Sep 17 00:00:00 2001 From: Felix Huettner Date: Wed, 24 Apr 2024 14:44:54 +0200 Subject: [PATCH 687/833] conntrack: Key connections by zone. Currently conntrack uses a single large cmap for all connections stored. This cmap contains all connections for all conntrack zones which are completely separate from each other. By separating each zone to its own cmap we can significantly optimize the performance when using multiple zones. The change fixes a similar issue as [1] where slow conntrack zone flush operations significantly slow down OVN router failover. The difference is just that this fix is used with dpdk, while [1] was when using the ovs kernel module. As we now need to store more cmap's the memory usage of struct conntrack increases by 524280 bytes. Additionally we need 65535 cmaps with 128 bytes each. This leads to a total memory increase of around 10MB. Running "./ovstest test-conntrack benchmark 4 33554432 32 1" shows no real difference in the multithreading behaviour against a single zone. Running the new "./ovstest test-conntrack benchmark-zones" show significant speedups as shown below. 
The values for "ct execute" are for acting on the complete zone with all its entries in total (so in the first case adding 10,000 new conntrack entries). All tests are run 1000 times. When running with 1,000 zones with 10,000 entries each we see the following results (all in microseconds): "./ovstest test-conntrack benchmark-zones 10000 1000 1000" +------+--------+---------+---------+ | Min | Max | 95%ile | Avg | +------------------------+------+--------+---------+---------+ | ct execute (commit) | | | | | | with commit | 2266 | 3505 | 2707.06 | 2592.06 | | without commit | 2411 | 12730 | 4432.50 | 2736.78 | +------------------------+------+--------+---------+---------+ | ct execute (no commit) | | | | | | with commit | 699 | 1238 | 886.15 | 722.67 | | without commit | 700 | 3377 | 1934.42 | 803.53 | +------------------------+------+--------+---------+---------+ | flush full zone | | | | | | with commit | 619 | 1122 | 901.36 | 679.15 | | without commit | 618 | 105078 | 64591 | 2886.46 | +------------------------+------+--------+---------+---------+ | flush empty zone | | | | | | with commit | 0 | 5 | 1.00 | 0.64 | | without commit | 54 | 87469 | 64520 | 2172.25 | +------------------------+------+--------+---------+---------+ When running with 10,000 zones with 1,000 entries each we see the following results (all in microseconds): "./ovstest test-conntrack benchmark-zones 1000 10000 1000" +------+--------+---------+---------+ | Min | Max | 95%ile | Avg | +------------------------+------+--------+---------+---------+ | ct execute (commit) | | | | | | with commit | 215 | 287 | 231.88 | 222.30 | | without commit | 214 | 1692 | 569.18 | 285.83 | +------------------------+------+--------+---------+---------+ | ct execute (no commit) | | | | | | with commit | 68 | 97 | 74.69 | 70.09 | | without commit | 68 | 300 | 158.40 | 82.06 | +------------------------+------+--------+---------+---------+ | flush full zone | | | | | | with commit | 47 | 211 | 56.34 | 50.34 | | without 
commit | 48 | 96330 | 63392 | 63923 | +------------------------+------+--------+---------+---------+ | flush empty zone | | | | | | with commit | 0 | 1 | 1.00 | 0.44 | | without commit | 3 | 109728 | 63923 | 3629.44 | +------------------------+------+--------+---------+---------+ Comparing the averages we see: * a moderate performance improvement for conntrack_execute with or without committing of around 6% to 23% * a significant performance improvement for flushing a full zone of around 75% to 99% * an even more significant improvement for flushing empty zones since we no longer need to check any unrelated connections [1] 9ec849e8aa869b646c372fac552ae2609a4b5f66 Signed-off-by: Felix Huettner Signed-off-by: Simon Horman --- lib/conntrack-private.h | 2 +- lib/conntrack.c | 73 ++++++++++++++++++++++++++++------------- lib/conntrack.h | 1 + 3 files changed, 52 insertions(+), 24 deletions(-) diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h index 3fd5fccd3eb..71367f211c9 100644 --- a/lib/conntrack-private.h +++ b/lib/conntrack-private.h @@ -200,7 +200,7 @@ enum ct_ephemeral_range { struct conntrack { struct ovs_mutex ct_lock; /* Protects 2 following fields. 
*/ - struct cmap conns OVS_GUARDED; + struct cmap conns[UINT16_MAX + 1] OVS_GUARDED; struct rculist exp_lists[N_EXP_LISTS]; struct cmap zone_limits OVS_GUARDED; struct cmap timeout_policies OVS_GUARDED; diff --git a/lib/conntrack.c b/lib/conntrack.c index 7e3ed0ee009..16e1c8bb58b 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -254,7 +254,9 @@ conntrack_init(void) ovs_mutex_init_adaptive(&ct->ct_lock); ovs_mutex_lock(&ct->ct_lock); - cmap_init(&ct->conns); + for (unsigned i = 0; i < ARRAY_SIZE(ct->conns); i++) { + cmap_init(&ct->conns[i]); + } for (unsigned i = 0; i < ARRAY_SIZE(ct->exp_lists); i++) { rculist_init(&ct->exp_lists[i]); } @@ -427,12 +429,14 @@ conn_clean__(struct conntrack *ct, struct conn *conn) } hash = conn_key_hash(&conn->key_node[CT_DIR_FWD].key, ct->hash_basis); - cmap_remove(&ct->conns, &conn->key_node[CT_DIR_FWD].cm_node, hash); + cmap_remove(&ct->conns[conn->key_node[CT_DIR_FWD].key.zone], + &conn->key_node[CT_DIR_FWD].cm_node, hash); if (conn->nat_action) { hash = conn_key_hash(&conn->key_node[CT_DIR_REV].key, ct->hash_basis); - cmap_remove(&ct->conns, &conn->key_node[CT_DIR_REV].cm_node, hash); + cmap_remove(&ct->conns[conn->key_node[CT_DIR_REV].key.zone], + &conn->key_node[CT_DIR_REV].cm_node, hash); } rculist_remove(&conn->node); @@ -503,7 +507,9 @@ conntrack_destroy(struct conntrack *ct) ovs_mutex_lock(&ct->ct_lock); - cmap_destroy(&ct->conns); + for (unsigned i = 0; i < ARRAY_SIZE(ct->conns); i++) { + cmap_destroy(&ct->conns[i]); + } cmap_destroy(&ct->zone_limits); cmap_destroy(&ct->timeout_policies); @@ -534,7 +540,7 @@ conn_key_lookup(struct conntrack *ct, const struct conn_key *key, struct conn *conn = NULL; bool found = false; - CMAP_FOR_EACH_WITH_HASH (keyn, cm_node, hash, &ct->conns) { + CMAP_FOR_EACH_WITH_HASH (keyn, cm_node, hash, &ct->conns[key->zone]) { if (keyn->dir == CT_DIR_FWD) { conn = CONTAINER_OF(keyn, struct conn, key_node[CT_DIR_FWD]); } else { @@ -962,14 +968,16 @@ conn_not_found(struct conntrack *ct, struct 
dp_packet *pkt, nat_packet(pkt, nc, false, ctx->icmp_related); uint32_t rev_hash = conn_key_hash(&rev_key_node->key, ct->hash_basis); - cmap_insert(&ct->conns, &rev_key_node->cm_node, rev_hash); + cmap_insert(&ct->conns[ctx->key.zone], + &rev_key_node->cm_node, rev_hash); } ovs_mutex_init_adaptive(&nc->lock); atomic_flag_clear(&nc->reclaimed); fwd_key_node->dir = CT_DIR_FWD; rev_key_node->dir = CT_DIR_REV; - cmap_insert(&ct->conns, &fwd_key_node->cm_node, ctx->hash); + cmap_insert(&ct->conns[ctx->key.zone], + &fwd_key_node->cm_node, ctx->hash); conn_expire_push_front(ct, nc); atomic_count_inc(&ct->n_conn); ctx->conn = nc; /* For completeness. */ @@ -2649,11 +2657,12 @@ conntrack_dump_start(struct conntrack *ct, struct conntrack_dump *dump, if (pzone) { dump->zone = *pzone; dump->filter_zone = true; + dump->current_zone = dump->zone; } dump->ct = ct; *ptot_bkts = 1; /* Need to clean up the callers. */ - dump->cursor = cmap_cursor_start(&ct->conns); + dump->cursor = cmap_cursor_start(&dump->ct->conns[dump->current_zone]); return 0; } @@ -2665,20 +2674,26 @@ conntrack_dump_next(struct conntrack_dump *dump, struct ct_dpif_entry *entry) struct conn_key_node *keyn; struct conn *conn; - CMAP_CURSOR_FOR_EACH_CONTINUE (keyn, cm_node, &dump->cursor) { - if (keyn->dir != CT_DIR_FWD) { - continue; - } + while (true) { + CMAP_CURSOR_FOR_EACH_CONTINUE (keyn, cm_node, &dump->cursor) { + if (keyn->dir != CT_DIR_FWD) { + continue; + } - conn = CONTAINER_OF(keyn, struct conn, key_node[CT_DIR_FWD]); - if (conn_expired(conn, now)) { - continue; - } + conn = CONTAINER_OF(keyn, struct conn, key_node[CT_DIR_FWD]); + if (conn_expired(conn, now)) { + continue; + } - if (!dump->filter_zone || keyn->key.zone == dump->zone) { conn_to_ct_dpif_entry(conn, entry, now); return 0; } + + if (dump->filter_zone || dump->current_zone == UINT16_MAX) { + break; + } + dump->current_zone++; + dump->cursor = cmap_cursor_start(&dump->ct->conns[dump->current_zone]); } return EOF; @@ -2756,20 +2771,32 @@ 
conntrack_exp_dump_done(struct conntrack_dump *dump OVS_UNUSED) return 0; } -int -conntrack_flush(struct conntrack *ct, const uint16_t *zone) +static int +conntrack_flush_zone(struct conntrack *ct, const uint16_t zone) { struct conn_key_node *keyn; struct conn *conn; - CMAP_FOR_EACH (keyn, cm_node, &ct->conns) { + CMAP_FOR_EACH (keyn, cm_node, &ct->conns[zone]) { if (keyn->dir != CT_DIR_FWD) { continue; } conn = CONTAINER_OF(keyn, struct conn, key_node[CT_DIR_FWD]); - if (!zone || *zone == keyn->key.zone) { - conn_clean(ct, conn); - } + conn_clean(ct, conn); + } + + return 0; +} + +int +conntrack_flush(struct conntrack *ct, const uint16_t *zone) +{ + if (zone) { + return conntrack_flush_zone(ct, *zone); + } + + for (unsigned i = 0; i < ARRAY_SIZE(ct->conns); i++) { + conntrack_flush_zone(ct, i); } return 0; diff --git a/lib/conntrack.h b/lib/conntrack.h index 8ab8b00176e..13bb02ea934 100644 --- a/lib/conntrack.h +++ b/lib/conntrack.h @@ -112,6 +112,7 @@ struct conntrack_dump { }; bool filter_zone; uint16_t zone; + uint16_t current_zone; }; struct conntrack_zone_limit { From 01a0fff36104790640e274f1d457084aeb5b968d Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Fri, 26 Apr 2024 16:54:48 +0000 Subject: [PATCH 688/833] docs: Document manual cluster recovery procedure. Remove the notion of cluster/leave --force since it was never implemented. Instead of these instructions, document how a broken cluster can be re-initialized with the old database contents. 
Acked-by: Simon Horman Signed-off-by: Ihar Hrachyshka Signed-off-by: Ilya Maximets --- Documentation/ref/ovsdb.7.rst | 43 +++++++++++++++++++++++++++++------ ovsdb/ovsdb-server.1.in | 3 +-- 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/Documentation/ref/ovsdb.7.rst b/Documentation/ref/ovsdb.7.rst index 46ed13e6163..a45c4ce38b6 100644 --- a/Documentation/ref/ovsdb.7.rst +++ b/Documentation/ref/ovsdb.7.rst @@ -315,16 +315,11 @@ The above methods for adding and removing servers only work for healthy clusters, that is, for clusters with no more failures than their maximum tolerance. For example, in a 3-server cluster, the failure of 2 servers prevents servers joining or leaving the cluster (as well as database access). + To prevent data loss or inconsistency, the preferred solution to this problem is to bring up enough of the failed servers to make the cluster healthy again, then if necessary remove any remaining failed servers and add new ones. If -this cannot be done, though, use ``ovs-appctl`` to invoke ``cluster/leave ---force`` on a running server. This command forces the server to which it is -directed to leave its cluster and form a new single-node cluster that contains -only itself. The data in the new cluster may be inconsistent with the former -cluster: transactions not yet replicated to the server will be lost, and -transactions not yet applied to the cluster may be committed. Afterward, any -servers in its former cluster will regard the server to have failed. +this is not an option, see the next section for `Manual cluster recovery`_. Once a server leaves a cluster, it may never rejoin it. Instead, create a new server and join it to the cluster. @@ -362,6 +357,40 @@ Clustered OVSDB does not support the OVSDB "ephemeral columns" feature. ones when they work with schemas for clustered databases. Future versions of OVSDB might add support for this feature. +Manual cluster recovery +~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
important:: + + The procedure below will result in ``cid`` and ``sid`` change. A *new* + cluster will be initialized. + +To recover a clustered database after a failure: + +1. Stop *all* old cluster ``ovsdb-server`` instances before proceeding. + +2. Pick one of the old members which will serve as a bootstrap member of the + to-be-recovered cluster. + +3. Convert its database file to the standalone format using ``ovsdb-tool + cluster-to-standalone``. + +4. Backup the standalone database file. + +5. Create a new single-node cluster with ``ovsdb-tool create-cluster`` + using the previously saved standalone database file, then start + ``ovsdb-server``. + +6. Once the single-node cluster is up and running and serves the restored data, + new members should be created and added to the cluster, as usual, with + ``ovsdb-tool join-cluster``. + +.. note:: + + The data in the new cluster may be inconsistent with the former cluster: + transactions not yet replicated to the server chosen in step 2 will be lost, + and transactions not yet applied to the cluster may be committed. + Upgrading from version 2.14 and earlier to 2.15 and later ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/ovsdb/ovsdb-server.1.in b/ovsdb/ovsdb-server.1.in index 9fabf2d6727..23b8e6e9cd8 100644 --- a/ovsdb/ovsdb-server.1.in +++ b/ovsdb/ovsdb-server.1.in @@ -461,8 +461,7 @@ This does not result in a three server cluster that lacks quorum. . .IP "\fBcluster/kick \fIdb server\fR" Start graceful removal of \fIserver\fR from \fIdb\fR's cluster, like -\fBcluster/leave\fR (without \fB\-\-force\fR) except that it can -remove any server, not just this one. +\fBcluster/leave\fR, except that it can remove any server, not just this one. 
.IP \fIserver\fR may be a server ID, as printed by \fBcluster/sid\fR, or the server's local network address as passed to \fBovsdb-tool\fR's From 855df0ad25a819d03c578ef39803695fcfebb429 Mon Sep 17 00:00:00 2001 From: Ales Musil Date: Thu, 2 May 2024 13:13:39 +0200 Subject: [PATCH 689/833] sparse: Add additional define for sparse on GCC >= 14. GCC 14 renamed one of the AVX512 defines to have only single underscore instead of two [0]. Add the single underscore define to keep compatibility with multiple GCC versions. [0] https://github.com/gcc-mirror/gcc/commit/aea8e4105553cd16799f2134d15420ccf182d732 Tested-by: Dumitru Ceara Acked-by: Simon Horman Signed-off-by: Ales Musil Signed-off-by: Ilya Maximets --- include/sparse/immintrin.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/sparse/immintrin.h b/include/sparse/immintrin.h index dd742be9f55..9a23d7f7461 100644 --- a/include/sparse/immintrin.h +++ b/include/sparse/immintrin.h @@ -26,5 +26,9 @@ #define _KEYLOCKERINTRIN_H_INCLUDED #define __AVX512FP16INTRIN_H_INCLUDED #define __AVX512FP16VLINTRIN_H_INCLUDED +/* GCC >=14 changed the '__AVX512FP16INTRIN_H_INCLUDED' to have only single + * underscore. We need both to keep compatibility between various GCC + * versions. */ +#define _AVX512FP16INTRIN_H_INCLUDED #include_next <immintrin.h> From f0e0e48ec51f06ff67ed6ebd824674a7794e4f7e Mon Sep 17 00:00:00 2001 From: Ales Musil Date: Fri, 3 May 2024 07:44:13 +0200 Subject: [PATCH 690/833] hash, jhash: Fix unaligned access to the hash remainder. Partially revert db5a101931c5, this was to avoid warning, however we shouldn't use pointer to "uint32_t" when the data are potentially unaligned [0]. Use pointer to "uint8_t" right from the start, this requires us to use ALIGNED_CAST for the get_unaligned_u32, which is fine in that case, because the function uses " __attribute__((__packed__))" struct to access the underlying "uint32_t". 
lib/hash.c:46:22: runtime error: load of misaligned address 0x507000000065 for type 'const uint32_t *' (aka 'const unsigned int *'), which requires 4 byte alignment 0x507000000065: note: pointer points here 73 62 2e 73 6f 63 6b 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ^ 0 0x6191cb in hash_bytes ovs/lib/hash.c:46:9 1 0x69d064 in hash_string ovs/lib/hash.h:404:12 2 0x69d064 in hash_name ovs/lib/shash.c:29:12 3 0x69d064 in shash_find ovs/lib/shash.c:237:49 4 0x69dada in shash_find_data ovs/lib/shash.c:251:31 5 0x507987 in add_remote ovs/ovsdb/ovsdb-server.c:1382:15 6 0x507987 in parse_options ovs/ovsdb/ovsdb-server.c:2659:13 7 0x507987 in main ovs/ovsdb/ovsdb-server.c:751:5 SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior lib/hash.c:46:22 [0] https://github.com/llvm/llvm-project/issues/90848 Fixes: db5a101931c5 ("clang: Fix the alignment warning.") Acked-by: Eelco Chaudron Acked-by: Simon Horman Signed-off-by: Ales Musil Signed-off-by: Ilya Maximets --- lib/hash.c | 7 ++++--- lib/jhash.c | 10 +++++----- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/lib/hash.c b/lib/hash.c index c722f3c3cc2..3d574de9b44 100644 --- a/lib/hash.c +++ b/lib/hash.c @@ -29,15 +29,16 @@ hash_3words(uint32_t a, uint32_t b, uint32_t c) uint32_t hash_bytes(const void *p_, size_t n, uint32_t basis) { - const uint32_t *p = p_; + const uint8_t *p = p_; size_t orig_n = n; uint32_t hash; hash = basis; while (n >= 4) { - hash = hash_add(hash, get_unaligned_u32(p)); + hash = hash_add(hash, + get_unaligned_u32(ALIGNED_CAST(const uint32_t *, p))); n -= 4; - p += 1; + p += 4; } if (n) { diff --git a/lib/jhash.c b/lib/jhash.c index c59b51b6113..a8e3f457b94 100644 --- a/lib/jhash.c +++ b/lib/jhash.c @@ -96,18 +96,18 @@ jhash_words(const uint32_t *p, size_t n, uint32_t basis) uint32_t jhash_bytes(const void *p_, size_t n, uint32_t basis) { - const uint32_t *p = p_; + const uint8_t *p = p_; uint32_t a, b, c; a = b = c = 0xdeadbeef + n + basis; while (n >= 12) { - a += 
get_unaligned_u32(p); - b += get_unaligned_u32(p + 1); - c += get_unaligned_u32(p + 2); + a += get_unaligned_u32(ALIGNED_CAST(const uint32_t *, p)); + b += get_unaligned_u32(ALIGNED_CAST(const uint32_t *, p + 4)); + c += get_unaligned_u32(ALIGNED_CAST(const uint32_t *, p + 8)); jhash_mix(&a, &b, &c); n -= 12; - p += 3; + p += 12; } if (n) { From b91f6788c4be0dd35b9f5edae14f372d68fced08 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 3 May 2024 01:36:37 +0200 Subject: [PATCH 691/833] ofproto-dpif-trace: Fix access to an out-of-scope stack memory. While tracing NAT actions, pointer to the action may be stored in the recirculation node for future reference. However, while translating actions for the group bucket in xlate_group_bucket, the action list is allocated temporarily on stack. So, in case the group translation leads to NAT, the stack pointer can be stored in the recirculation node and accessed later by the tracing mechanism when this stack memory is long gone: ==396230==ERROR: AddressSanitizer: stack-use-after-return on address 0x191844 at pc 0x64222a bp 0xa5da10 sp 0xa5da08 READ of size 1 at 0x191844 thread T0 0 0x642229 in ofproto_trace_recirc_node ofproto/ofproto-dpif-trace.c:704:49 1 0x642229 in ofproto_trace ofproto/ofproto-dpif-trace.c:867:9 2 0x6434c1 in ofproto_unixctl_trace ofproto/ofproto-dpif-trace.c:489:9 3 0xc1e491 in process_command lib/unixctl.c:310:13 4 0xc1e491 in run_connection lib/unixctl.c:344:17 5 0xc1e491 in unixctl_server_run lib/unixctl.c:395:21 6 0x53eedf in main ovs/vswitchd/ovs-vswitchd.c:131:9 7 0x2be087 in __libc_start_call_main 8 0x2be14a in __libc_start_main@GLIBC_2.2.5 9 0x42dee4 in _start (vswitchd/ovs-vswitchd+0x42dee4) Address 0x191844 is located in stack of thread T0 at offset 68 in frame 0 0x6d391f in xlate_group_bucket ofproto/ofproto-dpif-xlate.c:4751 This frame has 3 object(s): [32, 1056) 'action_list_stub' (line 4760) <== Memory access at offset 68 is inside this variable [1184, 1248) 'action_list' (line 4761) 
[1280, 1344) 'action_set' (line 4762) SUMMARY: AddressSanitizer: stack-use-after-return ofproto/ofproto-dpif-trace.c:704:49 in ofproto_trace_recirc_node Fix that by copying the action. Fixes: d072d2de011b ("ofproto-dpif-trace: Improve NAT tracing.") Reported-by: Ales Musil Reviewed-by: Adrian Moreno Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif-trace.c | 3 ++- ofproto/ofproto-dpif-trace.h | 2 +- tests/ofproto-dpif.at | 22 ++++++++++++++++++++++ 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/ofproto/ofproto-dpif-trace.c b/ofproto/ofproto-dpif-trace.c index 87506aa7858..e43d9f88c9c 100644 --- a/ofproto/ofproto-dpif-trace.c +++ b/ofproto/ofproto-dpif-trace.c @@ -102,7 +102,7 @@ oftrace_add_recirc_node(struct ovs_list *recirc_queue, node->flow = *flow; node->flow.recirc_id = recirc_id; node->flow.ct_zone = zone; - node->nat_act = ofn; + node->nat_act = ofn ? xmemdup(ofn, sizeof *ofn) : NULL; node->packet = packet ? dp_packet_clone(packet) : NULL; return true; @@ -113,6 +113,7 @@ oftrace_recirc_node_destroy(struct oftrace_recirc_node *node) { if (node) { recirc_free_id(node->recirc_id); + free(node->nat_act); dp_packet_delete(node->packet); free(node); } diff --git a/ofproto/ofproto-dpif-trace.h b/ofproto/ofproto-dpif-trace.h index f579a5ca468..f023b10cdf4 100644 --- a/ofproto/ofproto-dpif-trace.h +++ b/ofproto/ofproto-dpif-trace.h @@ -73,7 +73,7 @@ struct oftrace_recirc_node { uint32_t recirc_id; struct flow flow; struct dp_packet *packet; - const struct ofpact_nat *nat_act; + struct ofpact_nat *nat_act; }; /* A node within a next_ct_states list. 
*/ diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at index 3eaccb13a69..0b23fd6c5ea 100644 --- a/tests/ofproto-dpif.at +++ b/tests/ofproto-dpif.at @@ -947,6 +947,28 @@ AT_CHECK([tail -1 stdout], [0], OVS_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([ofproto-dpif - group with ct and dnat recirculation in action list]) +OVS_VSWITCHD_START +add_of_ports br0 1 10 +AT_CHECK([ovs-ofctl -O OpenFlow12 add-group br0 \ + 'group_id=1234,type=all,bucket=ct(nat(dst=10.10.10.7:80),commit,table=2)']) +AT_DATA([flows.txt], [dnl +table=0 ip,ct_state=-trk actions=group:1234 +table=2 ip,ct_state=+trk actions=output:10 +]) +AT_CHECK([ovs-ofctl -O OpenFlow12 add-flows br0 flows.txt]) +AT_CHECK([ovs-appctl ofproto/trace br0 ' + in_port=1,dl_src=50:54:00:00:00:05,dl_dst=50:54:00:00:00:07,dl_type=0x0800, + nw_src=192.168.0.1,nw_dst=192.168.0.2,nw_proto=1,nw_tos=0,nw_ttl=128,nw_frag=no, + icmp_type=8,icmp_code=0 +'], [0], [stdout]) +AT_CHECK([grep 'Datapath actions' stdout], [0], [dnl +Datapath actions: ct(commit,nat(dst=10.10.10.7:80)),recirc(0x1) +Datapath actions: 10 +]) +OVS_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([ofproto-dpif - group actions have no effect afterwards]) OVS_VSWITCHD_START add_of_ports br0 1 10 From 0940a51b1f5a5a9be8b859aa4af04de5776d5067 Mon Sep 17 00:00:00 2001 From: Martin Kalcok Date: Wed, 1 May 2024 13:10:43 +0200 Subject: [PATCH 692/833] ovsdb-client: Add missing arg to help for 'dump'. Help text for 'ovsdb-client dump' does not mention that it's capable of dumping specific table's contents if user supplies table's name as a third positional argument. 
Signed-off-by: Martin Kalcok Signed-off-by: Simon Horman --- ovsdb/ovsdb-client.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ovsdb/ovsdb-client.c b/ovsdb/ovsdb-client.c index 7249805bab5..cf2ecfd08a9 100644 --- a/ovsdb/ovsdb-client.c +++ b/ovsdb/ovsdb-client.c @@ -451,8 +451,9 @@ usage(void) " wait until DATABASE reaches STATE " "(\"added\" or \"connected\" or \"removed\")\n" " in DATBASE on SERVER.\n" - "\n dump [SERVER] [DATABASE]\n" - " dump contents of DATABASE on SERVER to stdout\n" + "\n dump [SERVER] [DATABASE] [TABLE]\n" + " dump contents of TABLE (or all tables) in DATABASE on SERVER\n" + " to stdout\n" "\n backup [SERVER] [DATABASE] > SNAPSHOT\n" " dump database contents in the form of a database file\n" "\n [--force] restore [SERVER] [DATABASE] < SNAPSHOT\n" From 5dfbc643f3687ce6b5b80f67481eaaead3ec53a2 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Tue, 7 May 2024 17:04:34 +0200 Subject: [PATCH 693/833] ci: Set platform parameter when building DPDK. This change has no impact, since -Dmachine=default gets converted by DPDK into -Dplatform=generic (since v21.08, see the link to DPDK commit below). Yet, switch to explicitly setting -Dplatform and avoid the following warning: 2024-04-18T14:50:16.8001092Z config/meson.build:113: WARNING: The "machine" option is deprecated. Please use "cpu_instruction_set" instead. While at it, solve another warning and call explicitly meson setup. 2024-04-18T14:50:17.0770596Z WARNING: Running the setup command as `meson [options]` instead of `meson setup [options]` is ambiguous and deprecated. 
Link: https://git.dpdk.org/dpdk/commit/?id=bf66003b51ec Signed-off-by: David Marchand Acked-by: Eelco Chaudron Signed-off-by: Simon Horman --- .ci/dpdk-build.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.ci/dpdk-build.sh b/.ci/dpdk-build.sh index 23f3166a548..e1b8e3ccbb4 100755 --- a/.ci/dpdk-build.sh +++ b/.ci/dpdk-build.sh @@ -25,9 +25,9 @@ function build_dpdk() pushd dpdk-src fi - # Switching to 'default' machine to make the dpdk cache usable on + # Switching to 'generic' platform to make the dpdk cache usable on # different CPUs. We can't be sure that all CI machines are exactly same. - DPDK_OPTS="$DPDK_OPTS -Dmachine=default" + DPDK_OPTS="$DPDK_OPTS -Dplatform=generic" # Disable building DPDK unit tests. Not needed for OVS build or tests. DPDK_OPTS="$DPDK_OPTS -Dtests=false" @@ -49,7 +49,7 @@ function build_dpdk() # Install DPDK using prefix. DPDK_OPTS="$DPDK_OPTS --prefix=$DPDK_INSTALL_DIR" - meson $DPDK_OPTS build + meson setup $DPDK_OPTS build ninja -C build ninja -C build install popd From cbc54b2fe05440adbdb4a6980aa294924a555572 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 9 May 2024 13:39:09 +0100 Subject: [PATCH 694/833] AUTHORS: Add Martin Kalcok. Signed-off-by: Simon Horman --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 68ee6d46470..5676a613a76 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -308,6 +308,7 @@ Mark Michelson mmichels@redhat.com Markos Chandras mchandras@suse.de Martin Casado casado@cs.stanford.edu Martin Fong mwfong@csl.sri.com +Martin Kalcok martin.kalcok@canonical.com Martin Varghese martin.varghese@nokia.com Martin Xu martinxu9.ovs@gmail.com Martin Zhang martinbj2008@gmail.com From 4989dc7e0e95df42d448c30b505a89a416b44e89 Mon Sep 17 00:00:00 2001 From: Xavier Simonart Date: Wed, 8 May 2024 18:29:32 +0200 Subject: [PATCH 695/833] conntrack: Do not use {0} to initialize unions. 
In the following case: union ct_addr { unsigned int ipv4; struct in6_addr ipv6; }; union ct_addr zero_ip = {0}; The ipv6 field might not be properly initialized. For instance, clang 18.1.1 does not initialize the ipv6 field. Reported-at: https://issues.redhat.com/browse/FDP-608 Acked-by: Paolo Valerio Signed-off-by: Xavier Simonart Signed-off-by: Ilya Maximets --- lib/conntrack.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/lib/conntrack.c b/lib/conntrack.c index 16e1c8bb58b..c28c71ec2f2 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -2302,7 +2302,9 @@ find_addr(const struct conn_key *key, union ct_addr *min, uint32_t hash, bool ipv4, const struct nat_action_info_t *nat_info) { - const union ct_addr zero_ip = {0}; + union ct_addr zero_ip; + + memset(&zero_ip, 0, sizeof zero_ip); /* All-zero case. */ if (!memcmp(min, &zero_ip, sizeof *min)) { @@ -2394,14 +2396,18 @@ nat_get_unique_tuple(struct conntrack *ct, struct conn *conn, { struct conn_key *fwd_key = &conn->key_node[CT_DIR_FWD].key; struct conn_key *rev_key = &conn->key_node[CT_DIR_REV].key; - union ct_addr min_addr = {0}, max_addr = {0}, addr = {0}; bool pat_proto = fwd_key->nw_proto == IPPROTO_TCP || fwd_key->nw_proto == IPPROTO_UDP || fwd_key->nw_proto == IPPROTO_SCTP; uint16_t min_dport, max_dport, curr_dport; uint16_t min_sport, max_sport, curr_sport; + union ct_addr min_addr, max_addr, addr; uint32_t hash, port_off, basis; + memset(&min_addr, 0, sizeof min_addr); + memset(&max_addr, 0, sizeof max_addr); + memset(&addr, 0, sizeof addr); + basis = (nat_info->nat_flags & NAT_PERSISTENT) ? 0 : ct->hash_basis; hash = nat_range_hash(fwd_key, basis, nat_info); From 3833506db0de7a9c7e72b82323bc1c355d2c03b3 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Fri, 10 May 2024 17:45:54 +0200 Subject: [PATCH 696/833] conntrack: Fully initialize conn struct before insertion. 
In case packets are concurrently received in both directions, there's a chance that the ones in the reverse direction get received right after the connection gets added to the connection tracker but before some of the connection's fields are fully initialized. This could cause OVS to access potentially invalid, as the lookup may end up retrieving the wrong offsets during CONTAINER_OF(), or uninitialized memory. This may happen in case of regular NAT or all-zero SNAT. Fix it by initializing early the connections fields. Fixes: 1116459b3ba8 ("conntrack: Remove nat_conn introducing key directionality.") Reported-at: https://issues.redhat.com/browse/FDP-616 Acked-by: Simon Horman Signed-off-by: Mike Pattrick Co-authored-by: Paolo Valerio Signed-off-by: Paolo Valerio Signed-off-by: Ilya Maximets --- lib/conntrack.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/lib/conntrack.c b/lib/conntrack.c index c28c71ec2f2..db44f823749 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -947,6 +947,18 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt, nc->parent_key = alg_exp->parent_key; } + ovs_mutex_init_adaptive(&nc->lock); + atomic_flag_clear(&nc->reclaimed); + fwd_key_node->dir = CT_DIR_FWD; + rev_key_node->dir = CT_DIR_REV; + + if (zl) { + nc->admit_zone = zl->czl.zone; + nc->zone_limit_seq = zl->czl.zone_limit_seq; + } else { + nc->admit_zone = INVALID_ZONE; + } + if (nat_action_info) { nc->nat_action = nat_action_info->nat_action; @@ -972,22 +984,16 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt, &rev_key_node->cm_node, rev_hash); } - ovs_mutex_init_adaptive(&nc->lock); - atomic_flag_clear(&nc->reclaimed); - fwd_key_node->dir = CT_DIR_FWD; - rev_key_node->dir = CT_DIR_REV; cmap_insert(&ct->conns[ctx->key.zone], &fwd_key_node->cm_node, ctx->hash); conn_expire_push_front(ct, nc); atomic_count_inc(&ct->n_conn); - ctx->conn = nc; /* For completeness. 
*/ + if (zl) { - nc->admit_zone = zl->czl.zone; - nc->zone_limit_seq = zl->czl.zone_limit_seq; atomic_count_inc(&zl->czl.count); - } else { - nc->admit_zone = INVALID_ZONE; } + + ctx->conn = nc; /* For completeness. */ } return nc; From 90cd797da196edc817a9733a1c2e0c019ca2e845 Mon Sep 17 00:00:00 2001 From: Peng He Date: Wed, 15 May 2024 11:28:21 +0800 Subject: [PATCH 697/833] vlog: Destroy async_append first then close log_fd. async_append stores log_fd, it should be destructed before log_fd is closed. Fixes: 81d6495fd937 ("vlog: Make thread-safe.") Acked-by: Simon Horman Signed-off-by: Peng He Signed-off-by: Ilya Maximets --- lib/vlog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/vlog.c b/lib/vlog.c index e78c785f7bb..59b524b097e 100644 --- a/lib/vlog.c +++ b/lib/vlog.c @@ -411,10 +411,10 @@ vlog_set_log_file__(char *new_log_file_name) /* Close old log file, if any. */ ovs_mutex_lock(&log_file_mutex); + async_append_destroy(log_writer); if (log_fd >= 0) { close(log_fd); } - async_append_destroy(log_writer); free(log_file_name); /* Install new log file. */ From ac7a10721f3b93cb153fe51065230c7ce0389a43 Mon Sep 17 00:00:00 2001 From: Pengfei Sun Date: Wed, 15 May 2024 14:14:13 +0800 Subject: [PATCH 698/833] table: Fix freeing global variable. In function shash_replace_nocopy, argument to free() is the address of a global variable (argument passed by function table_print_json__), which is not memory allocated by malloc(). 
ovsdb-client -f json monitor Open_vSwitch --timestamp ASan reports: ================================================================= ==1443083==ERROR: AddressSanitizer: attempting free on address which was not malloc()-ed: 0x000000535980 in thread T0 0 0xfc9eac in __interceptor_free (/usr/lib64/libasan.so.6) 1 0x4826e4 in json_destroy_object lib/json.c:445 2 0x4826e4 in json_destroy__ lib/json.c:403 3 0x4cc4e4 in table_print lib/table.c:633 4 0x410650 in monitor_print_table ovsdb/ovsdb-client.c:1019 5 0x410650 in monitor_print ovsdb/ovsdb-client.c:1040 6 0x4110cc in monitor_print ovsdb/ovsdb-client.c:1030 7 0x4110cc in do_monitor__ ovsdb/ovsdb-client.c:1503 8 0x40743c in main ovsdb/ovsdb-client.c:283 9 0xf50038 (/usr/lib64/libc.so.6+0x2b038) 10 0xf50110 in __libc_start_main (/usr/lib64/libc.so.6) 11 0x40906c in _start (/usr/local/bin/ovsdb-client) Fixes: cb139fa8b3a1 ("table: New function table_format() for formatting a table as a string.") Acked-by: Eelco Chaudron Acked-by: Simon Horman Signed-off-by: Pengfei Sun Signed-off-by: Ilya Maximets --- lib/table.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/table.c b/lib/table.c index 48d18b65182..b7addbf390f 100644 --- a/lib/table.c +++ b/lib/table.c @@ -522,7 +522,7 @@ table_print_json__(const struct table *table, const struct table_style *style, json_object_put_string(json, "caption", table->caption); } if (table->timestamp) { - json_object_put_nocopy( + json_object_put( json, "time", json_string_create_nocopy(table_format_timestamp__())); } From f61e1da46204fc0d4bb9bb78da0677ef36b30757 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 17 May 2024 13:28:23 +0200 Subject: [PATCH 699/833] AUTHORS: Add Pengfei Sun. 
Signed-off-by: Ilya Maximets --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 5676a613a76..ae6a07fa443 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -363,6 +363,7 @@ Paul Ingram Paul-Emmanuel Raoul skyper@skyplabs.net Pavithra Ramesh paramesh@vmware.com Peng He hepeng.0320@bytedance.com +Pengfei Sun sunpengfei16@huawei.com Peter Downs padowns@gmail.com Philippe Jung phil.jung@free.fr Pim van den Berg pim@nethuis.nl From 325ca0455002f8b7e265de0f3e99918ed8e68c69 Mon Sep 17 00:00:00 2001 From: Amit Prakash Shukla Date: Wed, 15 May 2024 17:45:46 +0530 Subject: [PATCH 700/833] packets: Assert for incorrect packet. Packets that are not encapsulated but metadata of the packet contains an offload flag set, will call dp_packet_inner_l4 to get TCP, UDP, SCTP header pointers. dp_packet_inner_l4 for such packets would return NULL as the inner offsets by-default are configured as UINT16_MAX. On dereferencing such pointers, segfault is observed. Add assert check for packets with incorrect header or incorrect offload flag set. [i.maximets] Note: segfault was caused by a faulty driver, not OVS logic. Assertion will allow catching such issues earlier and get more information from the process instead of letting it crash somewhere later. 
Signed-off-by: Amit Prakash Shukla Signed-off-by: Ilya Maximets --- lib/packets.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lib/packets.c b/lib/packets.c index 5803d26f4ac..ebf516d6790 100644 --- a/lib/packets.c +++ b/lib/packets.c @@ -2011,6 +2011,9 @@ packet_tcp_complete_csum(struct dp_packet *p, bool inner) tcp_sz = dp_packet_l4_size(p); } + ovs_assert(tcp); + ovs_assert(ip_hdr); + if (!inner && dp_packet_hwol_is_outer_ipv6(p)) { is_v4 = false; } else if (!inner && dp_packet_hwol_is_outer_ipv4(p)) { @@ -2057,6 +2060,9 @@ packet_udp_complete_csum(struct dp_packet *p, bool inner) udp_sz = dp_packet_l4_size(p); } + ovs_assert(udp); + ovs_assert(ip_hdr); + /* Skip csum calculation if the udp_csum is zero. */ if (!udp->udp_csum) { return; @@ -2109,6 +2115,8 @@ packet_sctp_complete_csum(struct dp_packet *p, bool inner) tp_len = dp_packet_l4_size(p); } + ovs_assert(sh); + put_16aligned_be32(&sh->sctp_csum, 0); csum = crc32c((void *) sh, tp_len); put_16aligned_be32(&sh->sctp_csum, csum); From 8d5486b653b3fea9448417e445a3a9478a4130bc Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 17 May 2024 13:37:05 +0200 Subject: [PATCH 701/833] AUTHORS: Add Amit Prakash Shukla. Signed-off-by: Ilya Maximets --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index ae6a07fa443..bb44ef14251 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -52,6 +52,7 @@ Alin Serdean aserdean@ovn.org Amber Kumar kumar.amber@intel.com Ambika Arora ambika.arora@tcs.com Amit Bose bose@noironetworks.com +Amit Prakash Shukla amitprakashs@marvell.com Amitabha Biswas azbiswas@gmail.com Anand Kumar kumaranand@vmware.com Andrea Kao eirinikos@gmail.com From ec405e8573c5ce1590171a029e6be546b3821aad Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Thu, 16 May 2024 09:57:58 -0400 Subject: [PATCH 702/833] compiler: Fix errors in Clang 17 ubsan checks. 
This patch attempts to fix a large number of ubsan error messages that take the following form: lib/netlink-notifier.c:237:13: runtime error: call to function route_table_change through pointer to incorrect function type 'void (*)(const void *, void *)' In Clang 17 the undefined behaviour sanitizer check for function pointers was enabled by default, whereas it was previously disabled while compiling C code. These warnings are a false positive in the case of OVS, as our macros already check to make sure the function parameter is the correct size. So that check is disabled in the single function that is causing all of the errors. Acked-by: Jakob Meng Acked-by: Eelco Chaudron Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- include/openvswitch/compiler.h | 11 +++++++++++ lib/ovs-rcu.c | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/openvswitch/compiler.h b/include/openvswitch/compiler.h index 878c5c6a70d..ecb91801cc1 100644 --- a/include/openvswitch/compiler.h +++ b/include/openvswitch/compiler.h @@ -69,6 +69,17 @@ #define OVS_UNLIKELY(CONDITION) (!!(CONDITION)) #endif +/* Clang 17's implementation of ubsan enables checking that function pointers + * match the type of the called function. This currently breaks ovs-rcu, which + * calls multiple different types of callbacks via a generic void *(void*) + * function pointer type. This macro enables disabling that check for specific + * functions. */ +#if __clang__ && __has_feature(undefined_behavior_sanitizer) +#define OVS_NO_SANITIZE_FUNCTION __attribute__((no_sanitize("function"))) +#else +#define OVS_NO_SANITIZE_FUNCTION +#endif + #if __has_feature(c_thread_safety_attributes) /* "clang" annotations for thread safety check. 
* diff --git a/lib/ovs-rcu.c b/lib/ovs-rcu.c index 9e07d9bab66..49afcc55c94 100644 --- a/lib/ovs-rcu.c +++ b/lib/ovs-rcu.c @@ -326,7 +326,7 @@ ovsrcu_postpone__(void (*function)(void *aux), void *aux) cb->aux = aux; } -static bool +static bool OVS_NO_SANITIZE_FUNCTION ovsrcu_call_postponed(void) { struct ovsrcu_cbset *cbset; From 0c8e626401252d0085b65742b9e4c2f682bad7c6 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 14 May 2024 15:15:34 +0200 Subject: [PATCH 703/833] utilities: Correct deletion reason in flow_reval_monitor.py. The flow_reval_monitor.py script incorrectly reported the reasons for FDR_PURGE and FDR_TOO_EXPENSIVE, as their descriptions were swapped. This patch rectifies the order using a dictionary to avoid similar problems in the future. In addition this patch also syncs the delete reason output of the script, with the comments in the code. Fixes: 86b9e653ef22 ("revalidator: Add a USDT probe during flow deletion with purge reason.") Acked-by: Ilya Maximets Acked-by: Aaron Conole Signed-off-by: Eelco Chaudron --- ofproto/ofproto-dpif-upcall.c | 24 +++++++------ utilities/usdt-scripts/flow_reval_monitor.py | 37 +++++++++++--------- 2 files changed, 34 insertions(+), 27 deletions(-) diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c index 73901b65105..83609ec62b6 100644 --- a/ofproto/ofproto-dpif-upcall.c +++ b/ofproto/ofproto-dpif-upcall.c @@ -270,18 +270,20 @@ enum ukey_state { }; #define N_UKEY_STATES (UKEY_DELETED + 1) +/* Ukey delete reasons used by USDT probes. Please keep in sync with the + * definition in utilities/usdt-scripts/flow_reval_monitor.py. */ enum flow_del_reason { - FDR_NONE = 0, /* No deletion reason for the flow. */ - FDR_AVOID_CACHING, /* Flow deleted to avoid caching. */ - FDR_BAD_ODP_FIT, /* The flow had a bad ODP flow fit. */ - FDR_FLOW_IDLE, /* The flow went unused and was deleted. */ - FDR_FLOW_LIMIT, /* All flows being killed. 
*/ - FDR_FLOW_WILDCARDED, /* The flow needed a narrower wildcard mask. */ - FDR_NO_OFPROTO, /* The flow didn't have an associated ofproto. */ - FDR_PURGE, /* User action caused flows to be killed. */ - FDR_TOO_EXPENSIVE, /* The flow was too expensive to revalidate. */ - FDR_UPDATE_FAIL, /* Flow state transition was unexpected. */ - FDR_XLATION_ERROR, /* There was an error translating the flow. */ + FDR_NONE = 0, /* No delete reason specified. */ + FDR_AVOID_CACHING, /* Cache avoidance flag set. */ + FDR_BAD_ODP_FIT, /* Bad ODP flow fit. */ + FDR_FLOW_IDLE, /* Flow idle timeout. */ + FDR_FLOW_LIMIT, /* Kill all flows condition reached. */ + FDR_FLOW_WILDCARDED, /* Flow needs a narrower wildcard mask. */ + FDR_NO_OFPROTO, /* Bridge not found. */ + FDR_PURGE, /* User requested flow deletion. */ + FDR_TOO_EXPENSIVE, /* Too expensive to revalidate. */ + FDR_UPDATE_FAIL, /* Datapath update failed. */ + FDR_XLATION_ERROR, /* Flow translation error. */ }; /* 'udpif_key's are responsible for tracking the little bit of state udpif diff --git a/utilities/usdt-scripts/flow_reval_monitor.py b/utilities/usdt-scripts/flow_reval_monitor.py index 534ba8fa216..28479a5650d 100755 --- a/utilities/usdt-scripts/flow_reval_monitor.py +++ b/utilities/usdt-scripts/flow_reval_monitor.py @@ -236,6 +236,11 @@ ], start=0, ) + +# +# The below FdrReasons and FdrReasonStrings definitions can be found in the +# ofproto/ofproto-dpif-upcall.c file. Please keep them in sync. 
+# FdrReasons = IntEnum( "flow_del_reason", [ @@ -254,19 +259,19 @@ start=0, ) -FdrReasonStrings = [ - "No deletion reason", - "Cache avoidance flag set", - "Bad ODP flow fit", - "Idle flow timed out", - "Kill all flows condition detected", - "Mask too wide - need narrower match", - "No matching ofproto rules", - "Too expensive to revalidate", - "Purged with user action", - "Flow state inconsistent after updates", - "Flow translation error", -] +FdrReasonStrings = { + FdrReasons.FDR_NONE: "No delete reason specified", + FdrReasons.FDR_AVOID_CACHING: "Cache avoidance flag set", + FdrReasons.FDR_BAD_ODP_FIT: "Bad ODP flow fit", + FdrReasons.FDR_FLOW_IDLE: "Flow idle timeout", + FdrReasons.FDR_FLOW_LIMIT: "Kill all flows condition reached", + FdrReasons.FDR_FLOW_WILDCARDED: "Flow needs a narrower wildcard mask", + FdrReasons.FDR_NO_OFPROTO: "Bridge not found", + FdrReasons.FDR_PURGE: "User requested flow deletion", + FdrReasons.FDR_TOO_EXPENSIVE: "Too expensive to revalidate", + FdrReasons.FDR_UPDATE_FAIL: "Datapath update failed", + FdrReasons.FDR_XLATION_ERROR: "Flow translation error" +} def err(msg, code=-1): @@ -572,10 +577,10 @@ def print_expiration(event): """Prints a UFID eviction with a reason.""" ufid_str = format_ufid(event.ufid) - if event.reason > len(FdrReasons): - reason = f"Unknown reason '{event.reason}'" - else: + try: reason = FdrReasonStrings[event.reason] + except KeyError: + reason = f"Unknown reason '{event.reason}'" print( "{:<10} {:<18.9f} {:<36} {:<17}".format( From 320f7e1a408e9956b9bc2144b3886eaf5795090e Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 17 May 2024 20:33:03 +0200 Subject: [PATCH 704/833] srv6: Fix misaligned writes to segment list. Segments list in SRv6 header is 16-bit aligned as most of other fields in packet headers. 
A little counter-intuitively, compilers are allowed to make alignment assumptions based on the pointer type passed to memcpy(), so they can use copy instructions that require 32-bit alignment in case of struct in6_addr pointer. Reported by UBsan in Clang 18: lib/netdev-native-tnl.c:985:16: runtime error: store to misaligned address 0x7fd9e97351ce for type 'struct in6_addr *', which requires 4 byte alignment 0x7fd9e97351ce: note: pointer points here 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ^ 0 0xc1de38 in netdev_srv6_build_header lib/netdev-native-tnl.c:985:9 1 0x6e794b in tnl_port_build_header ofproto/tunnel.c:751:11 2 0x6c9c0a in native_tunnel_output ofproto/ofproto-dpif-xlate.c:3887:11 3 0x6c9c0a in compose_output_action__ ofproto/ofproto-dpif-xlate.c:4502:13 4 0x6b6646 in compose_output_action ofproto/ofproto-dpif-xlate.c:4564:5 5 0x6b6646 in xlate_output_action ofproto/ofproto-dpif-xlate.c:5517:13 6 0x68cfee in do_xlate_actions ofproto/ofproto-dpif-xlate.c:7288:13 7 0x67fed0 in xlate_actions ofproto/ofproto-dpif-xlate.c:8314:13 8 0x6468bd in ofproto_trace__ ofproto/ofproto-dpif-trace.c:782:30 9 0x64484a in ofproto_trace ofproto/ofproto-dpif-trace.c:851:5 10 0x647469 in ofproto_unixctl_trace ofproto/ofproto-dpif-trace.c:490:9 11 0xc33771 in process_command lib/unixctl.c:310:13 12 0xc33771 in run_connection lib/unixctl.c:344:17 13 0xc33771 in unixctl_server_run lib/unixctl.c:395:21 14 0x53e6ef in main vswitchd/ovs-vswitchd.c:131:9 15 0x7f61c7 in __libc_start_call_main (/lib64/libc.so.6+0x2a1c7) 16 0x7f628a in __libc_start_main@GLIBC_2.2.5 (/lib64/libc.so.6+0x2a28a) 17 0x42ca24 in _start (vswitchd/ovs-vswitchd+0x42ca24) SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior lib/netdev-native-tnl.c:985:16 Having misaligned pointers is also generally not allowed in C, let alone accessing memory through them. Fix that by using an appropriate ovs_16aligned_in6_addr pointer instead. 
Fixes: 7381fd440a88 ("odp: Add SRv6 tunnel actions.") Fixes: 03fc1ad78521 ("userspace: Add SRv6 tunnel support.") Reviewed-by: Nobuhiro MIKI Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- lib/netdev-native-tnl.c | 5 ++--- lib/odp-util.c | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c index dee9ab344e4..b21176037bd 100644 --- a/lib/netdev-native-tnl.c +++ b/lib/netdev-native-tnl.c @@ -932,9 +932,9 @@ netdev_srv6_build_header(const struct netdev *netdev, const struct netdev_tnl_build_header_params *params) { const struct netdev_tunnel_config *tnl_cfg; + union ovs_16aligned_in6_addr *s; const struct in6_addr *segs; struct srv6_base_hdr *srh; - struct in6_addr *s; ovs_be16 dl_type; int nr_segs; int i; @@ -978,8 +978,7 @@ netdev_srv6_build_header(const struct netdev *netdev, return EOPNOTSUPP; } - s = ALIGNED_CAST(struct in6_addr *, - (char *) srh + sizeof *srh); + s = (union ovs_16aligned_in6_addr *) (srh + 1); for (i = 0; i < nr_segs; i++) { /* Segment list is written to the header in reverse order. 
*/ memcpy(s, &segs[nr_segs - i - 1], sizeof *s); diff --git a/lib/odp-util.c b/lib/odp-util.c index 21f34d95571..724e6f2bca1 100644 --- a/lib/odp-util.c +++ b/lib/odp-util.c @@ -1820,8 +1820,8 @@ ovs_parse_tnl_push(const char *s, struct ovs_action_push_tnl *data) } else if (ovs_scan_len(s, &n, "srv6(segments_left=%"SCNu8, &segments_left)) { struct srv6_base_hdr *srh = (struct srv6_base_hdr *) (ip6 + 1); + union ovs_16aligned_in6_addr *segs; char seg_s[IPV6_SCAN_LEN + 1]; - struct in6_addr *segs; struct in6_addr seg; uint8_t n_segs = 0; @@ -1844,7 +1844,7 @@ ovs_parse_tnl_push(const char *s, struct ovs_action_push_tnl *data) return -EINVAL; } - segs = ALIGNED_CAST(struct in6_addr *, srh + 1); + segs = (union ovs_16aligned_in6_addr *) (srh + 1); segs += segments_left; while (ovs_scan_len(s, &n, IPV6_SCAN_FMT, seg_s) From 4d2c64ca1b6bc009f4d9cdd975f23605d0879f1e Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 17 May 2024 20:47:11 +0200 Subject: [PATCH 705/833] atlocal: Fix setting HAVE_PYTEST on unexpected errors. If the python script throws an unexpected exception, the HAVE_PYTEST variable remains undefined. If at the same time dependencies are not actually present, pytest tests will fail instead of being skipped. Define the variable to 'no' on unexpected failures to skip the tests when dependencies cannot be verified. The issue can be reproduced on systems with python 3.12+ in case the deprecated 'pkg_resources' module is not available. Fixes: 445dceb88461 ("python: Introduce unit tests.") Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- tests/atlocal.in | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/atlocal.in b/tests/atlocal.in index f321bae55f3..466fd4ed4e3 100644 --- a/tests/atlocal.in +++ b/tests/atlocal.in @@ -242,5 +242,6 @@ with pathlib.Path(os.path.join(os.getenv("REQUIREMENT_PATH"))).open() as reqs: case $? 
in 0) HAVE_PYTEST=yes ;; 2) HAVE_PYTEST=no ;; - *) echo "$0: unexpected error probing Python unit test requirements" >&2 ;; + *) HAVE_PYTEST=no + echo "$0: unexpected error probing Python unit test requirements" >&2 ;; esac From d4bd0a2ad54b63c1e767ea2a7bff761d4676514a Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 17 May 2024 20:47:12 +0200 Subject: [PATCH 706/833] atlocal: Replace deprecated pkg_resources. 'pkg_resources' module is deprecated and no longer available in newer versions of python, so pytest tests are skipped: DeprecationWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html Unfortunately, there is no direct replacement for it and functionality is scattered between different packages. Using a new standard library importlib.metadata to find installed packages and their versions. Using packaging.requirements to parse lines from the requirements file and compare versions. This covers all we need. The 'packaging' is a project used by pip and a dependency for many other libraries, so should be available for any supported version of python. 'importlib' was introduced in python 3.8. Since we support older versions of python and 'packaging' is not part of the standard library, checking that import is possible and falling back to 'pkg_resources' if needed. We may remove the fallback when we stop supporting python below 3.8. Even though 'packaging' is a common dependency, added to the test requirements so it will not be missed in CI.
Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- python/test_requirements.txt | 1 + tests/atlocal.in | 28 ++++++++++++++++++++++------ 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/python/test_requirements.txt b/python/test_requirements.txt index 5043c71e223..a1424506b64 100644 --- a/python/test_requirements.txt +++ b/python/test_requirements.txt @@ -1,4 +1,5 @@ netaddr +packaging pyftpdlib pyparsing pytest diff --git a/tests/atlocal.in b/tests/atlocal.in index 466fd4ed4e3..8565a0bae9f 100644 --- a/tests/atlocal.in +++ b/tests/atlocal.in @@ -229,15 +229,31 @@ export UBSAN_OPTIONS REQUIREMENT_PATH=$abs_top_srcdir/python/test_requirements.txt $PYTHON3 -c ' import os import pathlib -import pkg_resources import sys +PACKAGING = True +try: + from packaging import requirements + from importlib import metadata +except ModuleNotFoundError: + PACKAGING = False + import pkg_resources + with pathlib.Path(os.path.join(os.getenv("REQUIREMENT_PATH"))).open() as reqs: - for req in pkg_resources.parse_requirements(reqs): - try: - pkg_resources.require(str(req)) - except pkg_resources.DistributionNotFound: - sys.exit(2) + if PACKAGING: + for req in reqs.readlines(): + try: + r = requirements.Requirement(req.strip()) + if metadata.version(r.name) not in r.specifier: + raise metadata.PackageNotFoundError + except metadata.PackageNotFoundError: + sys.exit(2) + else: + for req in pkg_resources.parse_requirements(reqs): + try: + pkg_resources.require(str(req)) + except pkg_resources.DistributionNotFound: + sys.exit(2) ' case $? in 0) HAVE_PYTEST=yes ;; From ce1663b5fe4fae839bd0d20736d5dceae5650f7d Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Mon, 27 May 2024 14:00:33 +0300 Subject: [PATCH 707/833] netdev-linux: Fix ethtool_cmd is partly outside array bounds. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Used Ubuntu with gcc (Ubuntu 11.2.0-19ubuntu1) 11.2.0 lib/netdev-linux.c: In function 'netdev_linux_construct': lib/netdev-linux.c:7003:15: error: array subscript 'struct ethtool_cmd[0]' is partly outside array bounds of ‘union [1]’ [-Werror=array-bounds] 7003 | ecmd->cmd = cmd; | ~~~~~~~~~~^~~~~ lib/netdev-linux.c:2411:7: note: while referencing ‘sset_info’ 2411 | } sset_info; | ^~~~~~~~~ Fixes: 6c59c195266c ("netdev-linux: Use ethtool to detect offload support.") Signed-off-by: Roi Dayan Signed-off-by: Ilya Maximets --- lib/netdev-linux.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 25349c605cd..e81883d1d11 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -2402,6 +2402,7 @@ static int netdev_linux_read_stringset_info(struct netdev_linux *netdev, uint32_t *len) { union { + struct ethtool_cmd ecmd; struct ethtool_sset_info hdr; struct { uint64_t pad[2]; From 8e5a89cd048540c0cf7b6f52d18be4d76f026a54 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Mon, 27 May 2024 15:08:41 -0400 Subject: [PATCH 708/833] netdev-offload: Fix null pointer dereference warning on dump creation. Clang's static analyzer will complain about a null pointer dereference because dumps can be set to null and then there is a loop where it could have been written to. This is a false positive, but only because the netdev dpif type won't change during this loop. Instead, return early from the netdev_ports_flow_dump_create function if dumps is NULL. 
Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/netdev-offload.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/lib/netdev-offload.c b/lib/netdev-offload.c index 931d634e15a..8a9d3655592 100644 --- a/lib/netdev-offload.c +++ b/lib/netdev-offload.c @@ -626,8 +626,8 @@ netdev_ports_traverse(const char *dpif_type, struct netdev_flow_dump ** netdev_ports_flow_dump_create(const char *dpif_type, int *ports, bool terse) { + struct netdev_flow_dump **dumps = NULL; struct port_to_netdev_data *data; - struct netdev_flow_dump **dumps; int count = 0; int i = 0; @@ -638,7 +638,11 @@ netdev_ports_flow_dump_create(const char *dpif_type, int *ports, bool terse) } } - dumps = count ? xzalloc(sizeof *dumps * count) : NULL; + if (!count) { + goto unlock; + } + + dumps = xzalloc(sizeof *dumps * count); HMAP_FOR_EACH (data, portno_node, &port_to_netdev) { if (netdev_get_dpif_type(data->netdev) == dpif_type) { @@ -650,6 +654,8 @@ netdev_ports_flow_dump_create(const char *dpif_type, int *ports, bool terse) i++; } } + +unlock: ovs_rwlock_unlock(&port_to_netdev_rwlock); *ports = i; From 8359cc422e6541af58d306803a13225b2d01265f Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Mon, 27 May 2024 15:08:42 -0400 Subject: [PATCH 709/833] netdev-native-tnl: Fix use of uninitialized offset on SRv6 header pop. Clang's static analyzer will complain about uninitialized value 'hlen' because we weren't properly checking the error code from a function that would have initialized the value. Instead, add a check for that return code. 
Fixes: 03fc1ad78521 ("userspace: Add SRv6 tunnel support.") Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/netdev-native-tnl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c index b21176037bd..d6f46ac4ae1 100644 --- a/lib/netdev-native-tnl.c +++ b/lib/netdev-native-tnl.c @@ -1067,7 +1067,10 @@ netdev_srv6_pop_header(struct dp_packet *packet) } pkt_metadata_init_tnl(md); - netdev_tnl_ip_extract_tnl_md(packet, tnl, &hlen); + if (!netdev_tnl_ip_extract_tnl_md(packet, tnl, &hlen)) { + goto err; + } + dp_packet_reset_packet(packet, hlen); return packet; From 51a2476bc2311fe26ac98af92d188a762a208457 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Mon, 27 May 2024 15:08:43 -0400 Subject: [PATCH 710/833] dpctl: Fix uninitialized value when deleting flows. Clang's static analyzer will complain about an uninitialized value because we weren't setting a value for ufid_generated in all code paths. Now we initialize this on declaration. This patch also corrects the reverse xmas tree ordering of variable declarations. Fixes: bbe2e3928747 ("dpctl: Fix broken flow deletion via ovs-dpctl due to missing ufid.") Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/dpctl.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/lib/dpctl.c b/lib/dpctl.c index 3c555a55925..a70df534202 100644 --- a/lib/dpctl.c +++ b/lib/dpctl.c @@ -1359,19 +1359,17 @@ static int dpctl_del_flow_dpif(struct dpif *dpif, const char *key_s, struct dpctl_params *dpctl_p) { + struct dpif_port_dump port_dump; struct dpif_flow_stats stats; + bool ufid_generated = false; struct dpif_port dpif_port; - struct dpif_port_dump port_dump; - struct ofpbuf key; + bool ufid_present = false; + struct simap port_names; struct ofpbuf mask; /* To be ignored.
*/ - + struct ofpbuf key; ovs_u128 ufid; - bool ufid_generated; - bool ufid_present; - struct simap port_names; int n, error; - ufid_present = false; n = odp_ufid_from_string(key_s, &ufid); if (n < 0) { dpctl_error(dpctl_p, -n, "parsing flow ufid"); From 4837b5fed31037d57fec1ae21adf210a07449fd3 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Mon, 27 May 2024 15:08:44 -0400 Subject: [PATCH 711/833] socket: Fix uninitialized values in inet_parse_ functions. Clang's static analyzer will complain about uninitialized value dns_failure because we weren't setting a value for dns_failure in all code paths. Now we initialize this in the error conditions of inet_parse_passive and inet_parse_active. Fixes: 08e9e5337383 ("ovsdb: raft: Fix inability to read the database with DNS host names.") Fixes: 5f219af8b3c7 ("ovsdb-server: Fix handling of DNS name for listener configuration.") Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/socket-util.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lib/socket-util.c b/lib/socket-util.c index 2d89fce8501..c569b7d1664 100644 --- a/lib/socket-util.c +++ b/lib/socket-util.c @@ -546,9 +546,15 @@ inet_parse_active(const char *target_, int default_port, if (!host) { VLOG_ERR("%s: host must be specified", target_); ok = false; + if (dns_failure) { + *dns_failure = false; + } } else if (!port && default_port < 0) { VLOG_ERR("%s: port must be specified", target_); ok = false; + if (dns_failure) { + *dns_failure = false; + } } else { ok = parse_sockaddr_components(ss, host, port, default_port, target_, resolve_host, dns_failure); @@ -671,6 +677,9 @@ inet_parse_passive(const char *target_, int default_port, if (!port && default_port < 0) { VLOG_ERR("%s: port must be specified", target_); ok = false; + if (dns_failure) { + *dns_failure = false; + } } else { ok = parse_sockaddr_components(ss, host, port, default_port, target_, resolve_host, dns_failure); From 0c1b9b015df66946e08de8d2e541f56397dd70e6 Mon Sep 17 00:00:00 
2001 From: Mike Pattrick Date: Mon, 27 May 2024 15:08:45 -0400 Subject: [PATCH 712/833] netdev-linux: Return an error if device feature names are empty. When retrieving a list of features supported by a network card, return with an error code if the request completed without an error but the list contains zero entries. In practice this should never happen, but it does contribute to a detection in Clang's static analyzer. Fixes: 6c59c195266c ("netdev-linux: Use ethtool to detect offload support.") Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/netdev-linux.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index e81883d1d11..7a2c055f0a5 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -2440,9 +2440,12 @@ netdev_linux_read_definitions(struct netdev_linux *netdev, int error = 0; error = netdev_linux_read_stringset_info(netdev, &len); - if (error || !len) { + if (error) { return error; + } else if (!len) { + return -EOPNOTSUPP; } + strings = xzalloc(sizeof *strings + len * ETH_GSTRING_LEN); strings->cmd = ETHTOOL_GSTRINGS; From 4d25656a231b86455b1c10b737343fcfa72052d6 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Mon, 27 May 2024 15:08:46 -0400 Subject: [PATCH 713/833] netdev-linux: Initialize link speed in error conditions. Clang's static analyzer noted that the output from netdev_linux_get_speed_locked can be checked even if this function doesn't set any values. Now we always set those values to a sane default in all cases. 
Fixes: 19cffe30cfda ("netdev-linux: Avoid deadlock in netdev_get_speed.") Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/netdev-linux.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 7a2c055f0a5..3f200f28bc1 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -2728,6 +2728,7 @@ netdev_linux_get_speed_locked(struct netdev_linux *netdev, uint32_t *current, uint32_t *max) { if (netdev_linux_netnsid_is_remote(netdev)) { + *current = *max = 0; return EOPNOTSUPP; } @@ -2737,6 +2738,8 @@ netdev_linux_get_speed_locked(struct netdev_linux *netdev, ? 0 : netdev->current_speed; *max = MIN(UINT32_MAX, netdev_features_to_bps(netdev->supported, 0) / 1000000ULL); + } else { + *current = *max = 0; } return netdev->get_features_error; } From 7af0716ea621a8cebcd9c3061fcb7a044e343f14 Mon Sep 17 00:00:00 2001 From: Emma Finn Date: Fri, 24 May 2024 09:20:18 +0000 Subject: [PATCH 714/833] odp-execute: Fix AVX checksum calculation. The AVX implementation for calculating checksums was not handling carry-over addition correctly in some cases. This patch adds an additional shuffle to add 16-bit padding to the final part of the calculation to handle such cases. This commit also adds a unit test to check the checksum carry-bits issue with actions autovalidator enabled. 
Reported-by: Eelco Chaudron Signed-off-by: Emma Finn Signed-off-by: Eelco Chaudron --- lib/odp-execute-avx512.c | 5 ++++ tests/dpif-netdev.at | 63 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c index 50c48bfd479..a74a85dc1a8 100644 --- a/lib/odp-execute-avx512.c +++ b/lib/odp-execute-avx512.c @@ -366,6 +366,8 @@ avx512_get_delta(__m256i old_header, __m256i new_header) 0xF, 0xF, 0xF, 0xF); v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta); + v_delta = _mm256_hadd_epi32(v_delta, v_zeros); + v_delta = _mm256_shuffle_epi8(v_delta, v_swap16a); v_delta = _mm256_hadd_epi32(v_delta, v_zeros); v_delta = _mm256_hadd_epi16(v_delta, v_zeros); @@ -575,6 +577,9 @@ avx512_ipv6_sum_header(__m512i ip6_header) 0xF, 0xF, 0xF, 0xF); v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta); + + v_delta = _mm256_hadd_epi32(v_delta, v_zeros); + v_delta = _mm256_shuffle_epi8(v_delta, v_swap16a); v_delta = _mm256_hadd_epi32(v_delta, v_zeros); v_delta = _mm256_hadd_epi16(v_delta, v_zeros); diff --git a/tests/dpif-netdev.at b/tests/dpif-netdev.at index 790b5a43af9..bdc24cc3071 100644 --- a/tests/dpif-netdev.at +++ b/tests/dpif-netdev.at @@ -1091,3 +1091,66 @@ OVS_VSWITCHD_STOP(["dnl /Error: unknown miniflow extract implementation superstudy./d /Error: invalid study_pkt_cnt value: -pmd./d"]) AT_CLEANUP + +AT_SETUP([datapath - Actions Autovalidator Checksum]) + +OVS_VSWITCHD_START(add-port br0 p0 -- set Interface p0 type=dummy \ + -- add-port br0 p1 -- set Interface p1 type=dummy) + +AT_CHECK([ovs-appctl odp-execute/action-impl-set autovalidator], [0], [dnl +Action implementation set to autovalidator. +]) + +dnl Add flows to trigger checksum calculation. 
+AT_DATA([flows.txt], [dnl + in_port=p0,ip,actions=mod_nw_src=10.1.1.1,p1 + in_port=p0,ipv6,actions=set_field:fc00::100->ipv6_src,p1 +]) +AT_CHECK([ovs-ofctl del-flows br0]) +AT_CHECK([ovs-ofctl -Oopenflow13 add-flows br0 flows.txt]) + +dnl Make sure checksum won't be offloaded. +AT_CHECK([ovs-vsctl set Interface p0 options:ol_ip_csum=false]) +AT_CHECK([ovs-vsctl set Interface p0 options:ol_ip_csum_set_good=false]) + +AT_CHECK([ovs-vsctl set Interface p1 options:pcap=p1.pcap]) + +dnl IPv4 packet with values that will trigger carry-over addition for checksum. +flow_s_v4=" + eth_src=47:42:86:08:17:50,eth_dst=3e:55:b5:9e:3a:fb,dl_type=0x0800, + nw_src=229.167.36.90,nw_dst=130.161.64.186,nw_proto=6,nw_ttl=64,nw_frag=no, + tp_src=54392,tp_dst=5201,tcp_flags=ack" + +good_frame=$(ovs-ofctl compose-packet --bare "${flow_s_v4}") +AT_CHECK([ovs-appctl netdev-dummy/receive p0 ${good_frame}]) + +dnl Checksum should change to 0xAC33 with ip_src changed to 10.1.1.1 +dnl by the datapath while processing the packet. +flow_expected=$(echo "${flow_s_v4}" | sed 's/229.167.36.90/10.1.1.1/g') +good_expected=$(ovs-ofctl compose-packet --bare "${flow_expected}") +AT_CHECK([ovs-pcap p1.pcap > p1.pcap.txt 2>&1]) +AT_CHECK_UNQUOTED([tail -n 1 p1.pcap.txt], [0], [${good_expected} +]) + +dnl Repeat similar test for IPv6. +flow_s_v6=" + eth_src=8a:bf:7e:2f:05:84,eth_dst=0a:8f:39:4f:e0:73,dl_type=0x86dd, + ipv6_src=2f8a:2076:3926:9e7:2d47:4bc9:9c7:17f3, + ipv6_dst=7287:10dd:2fb9:41d5:3eb2:2c7a:11b0:6258, + ipv6_label=0x51ac,nw_proto=6,nw_ttl=142,nw_frag=no, + tp_src=20405,tp_dst=20662,tcp_flags=ack" + +good_frame_v6=$(ovs-ofctl compose-packet --bare "${flow_s_v6}") +AT_CHECK([ovs-appctl netdev-dummy/receive p0 ${good_frame_v6}]) + +dnl Checksum should change to 0x59FD with ipv6_src changed to fc00::100 +dnl by the datapath while processing the packet. 
+flow_expected_v6=$(echo "${flow_s_v6}" | \ + sed 's/2f8a:2076:3926:9e7:2d47:4bc9:9c7:17f3/fc00::100/g') +good_expected_v6=$(ovs-ofctl compose-packet --bare "${flow_expected_v6}") +AT_CHECK([ovs-pcap p1.pcap > p1.pcap.txt 2>&1]) +AT_CHECK_UNQUOTED([tail -n 1 p1.pcap.txt], [0], [${good_expected_v6} +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP From 488ad8beec4485f68dd9aa74d677b165bace0b7c Mon Sep 17 00:00:00 2001 From: Kevin Traynor Date: Tue, 28 May 2024 10:25:09 +0100 Subject: [PATCH 715/833] dpdk: Use DPDK 23.11.1 release. Update the CI and docs to use DPDK 23.11.1. Signed-off-by: Kevin Traynor Acked-by: Ilya Maximets Acked-by: Eelco Chaudron --- .github/workflows/build-and-test.yml | 4 ++-- Documentation/faq/releases.rst | 10 +++++----- Documentation/intro/install/dpdk.rst | 8 ++++---- NEWS | 2 ++ 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 424dbab6c91..9d3a13ca1c9 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -10,8 +10,8 @@ jobs: env: dependencies: gcc libbpf-dev libnuma-dev libpcap-dev ninja-build pkgconf CC: gcc - DPDK_GIT: https://dpdk.org/git/dpdk - DPDK_VER: 23.11 + DPDK_GIT: https://dpdk.org/git/dpdk-stable + DPDK_VER: 23.11.1 name: dpdk gcc outputs: dpdk_key: ${{ steps.gen_dpdk_key.outputs.key }} diff --git a/Documentation/faq/releases.rst b/Documentation/faq/releases.rst index 26973c2adc3..70219d7175e 100644 --- a/Documentation/faq/releases.rst +++ b/Documentation/faq/releases.rst @@ -216,11 +216,11 @@ Q: What DPDK version does each Open vSwitch release work with? 2.14.x 19.11.13 2.15.x 20.11.6 2.16.x 20.11.6 - 2.17.x 21.11.6 - 3.0.x 21.11.6 - 3.1.x 22.11.4 - 3.2.x 22.11.4 - 3.3.x 23.11 + 2.17.x 21.11.7 + 3.0.x 21.11.7 + 3.1.x 22.11.5 + 3.2.x 22.11.5 + 3.3.x 23.11.1 ============ ======== Q: Are all the DPDK releases that OVS versions work with maintained? 
diff --git a/Documentation/intro/install/dpdk.rst b/Documentation/intro/install/dpdk.rst index f1646322c7e..63a978f0e81 100644 --- a/Documentation/intro/install/dpdk.rst +++ b/Documentation/intro/install/dpdk.rst @@ -42,7 +42,7 @@ Build requirements In addition to the requirements described in :doc:`general`, building Open vSwitch with DPDK will require the following: -- DPDK 23.11 +- DPDK 23.11.1 - A `DPDK supported NIC`_ @@ -73,9 +73,9 @@ Install DPDK #. Download the `DPDK sources`_, extract the file and set ``DPDK_DIR``:: $ cd /usr/src/ - $ wget https://fast.dpdk.org/rel/dpdk-23.11.tar.xz - $ tar xf dpdk-23.11.tar.xz - $ export DPDK_DIR=/usr/src/dpdk-23.11 + $ wget https://fast.dpdk.org/rel/dpdk-23.11.1.tar.xz + $ tar xf dpdk-23.11.1.tar.xz + $ export DPDK_DIR=/usr/src/dpdk-stable-23.11.1 $ cd $DPDK_DIR #. Configure and install DPDK using Meson diff --git a/NEWS b/NEWS index b92cec532c5..5ae0108d552 100644 --- a/NEWS +++ b/NEWS @@ -7,6 +7,8 @@ Post-v3.3.0 - The primary development branch has been renamed from 'master' to 'main'. The OVS tree remains hosted on GitHub. https://github.com/openvswitch/ovs.git + - DPDK: + * OVS validated with DPDK 23.11.1. v3.3.0 - 16 Feb 2024 From 18436bed853c8e8178c77857408974bb170c5195 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Wed, 29 May 2024 12:53:26 +0200 Subject: [PATCH 716/833] netdev-linux: Fix possible int overflow in tc_add_matchall_policer(). Fix unintentional integer overflow reported by Coverity by adding the ULL suffix to the numerical literals used in the multiplications. 
Fixes: ed2300cca0d3 ("netdev-linux: Refactor put police action netlink message") Acked-by: Mike Pattrick Acked-by: Paolo Valerio Signed-off-by: Eelco Chaudron --- lib/netdev-linux.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 3f200f28bc1..f5074f4f5f3 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -2922,8 +2922,8 @@ tc_add_matchall_policer(struct netdev *netdev, uint64_t kbits_rate, basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS); action_offset = nl_msg_start_nested(&request, TCA_MATCHALL_ACT); nl_msg_put_act_police(&request, 0, kbits_rate, kbits_burst, - kpkts_rate * 1000, kpkts_burst * 1000, TC_ACT_UNSPEC, - false); + kpkts_rate * 1000ULL, kpkts_burst * 1000ULL, + TC_ACT_UNSPEC, false); nl_msg_end_nested(&request, action_offset); nl_msg_end_nested(&request, basic_offset); From b253d74d48e0cbf51bb46b99dc145d62a0565ea2 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Wed, 29 May 2024 12:53:27 +0200 Subject: [PATCH 717/833] cfm: Fix possible integer overflow in cfm_process_heartbeat(). Fix unintentional integer overflow reported by Coverity by adding the LL suffix to the numerical literal used in the multiplication. 
Fixes: 5767a79a4059 ("cfm: Require ccm received in demand mode.") Acked-by: Mike Pattrick Signed-off-by: Eelco Chaudron --- lib/cfm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/cfm.c b/lib/cfm.c index c3742f3de20..7eb08015776 100644 --- a/lib/cfm.c +++ b/lib/cfm.c @@ -863,7 +863,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct dp_packet *p) rmp->num_health_ccm++; if (cfm->demand) { timer_set_duration(&cfm->demand_rx_ccm_t, - 100 * cfm->ccm_interval_ms); + 100LL * cfm->ccm_interval_ms); } } rmp->recv = true; From e42d3ed298941b65e8cb5900a52a79e3cb579996 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Wed, 29 May 2024 12:53:28 +0200 Subject: [PATCH 718/833] sflow: Replace libc's random() function with the OVS's random_range(). Coverity has flagged the use of a potentially unsafe function. Although this is not a concern in this case since it's not used for encryption, we should replace it with the OVS implementation to achieve better randomness. Fixes: c72e245a0e2c ("Add InMon's sFlow Agent library to the build system.") Acked-by: Mike Pattrick Signed-off-by: Eelco Chaudron --- lib/sflow_poller.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/sflow_poller.c b/lib/sflow_poller.c index 9e6a487bc07..46e40cbd42d 100644 --- a/lib/sflow_poller.c +++ b/lib/sflow_poller.c @@ -6,6 +6,7 @@ */ #include "sflow_api.h" +#include "random.h" /*_________________--------------------------__________________ _________________ sfl_poller_init __________________ @@ -88,7 +89,7 @@ void sfl_poller_set_sFlowCpInterval(SFLPoller *poller, u_int32_t sFlowCpInterval Another smoothing factor is that the tick() function called here is usually driven from a fairly "soft" polling loop rather than a hard real-time event. */ - poller->countersCountdown = 1 + (random() % sFlowCpInterval); + poller->countersCountdown = 1 + random_range(sFlowCpInterval); } else { /* Setting sFlowCpInterval to 0 disables counter polling altogether. 
Thanks to From 11e48b8ca2115d73ce53d3d85fc0e55435d7a80e Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Wed, 29 May 2024 12:53:29 +0200 Subject: [PATCH 719/833] sflow: Use uint32_t instead of time_t for tick handling in the poller. The sFlow library uses a uint32_t to configure timeout ticks, but stores this value as a time_t. Although this doesn't cause functional issues, it wastes space and confuses Coverity, potentially indicating a Y2K38 problem when storing uint32_t values in time_t. This patch updates the internal data structures to use uint32_t variables. Fixes: c72e245a0e2c ("Add InMon's sFlow Agent library to the build system.") Acked-by: Mike Pattrick Signed-off-by: Eelco Chaudron --- lib/sflow_api.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/sflow_api.h b/lib/sflow_api.h index eb23e2acdb9..f4bfa5eada8 100644 --- a/lib/sflow_api.h +++ b/lib/sflow_api.h @@ -148,7 +148,7 @@ typedef struct _SFLPoller { /* MIB fields */ SFLDataSource_instance dsi; u_int32_t sFlowCpReceiver; - time_t sFlowCpInterval; + u_int32_t sFlowCpInterval; /* public fields */ struct _SFLAgent *agent; /* pointer to my agent */ void *magic; /* ptr to pass back in getCountersFn() */ @@ -156,7 +156,7 @@ typedef struct _SFLPoller { u_int32_t bridgePort; /* port number local to bridge */ /* private fields */ SFLReceiver *myReceiver; - time_t countersCountdown; + u_int32_t countersCountdown; u_int32_t countersSampleSeqNo; } SFLPoller; From f673d0cd5fc5709e6869672a7d3c920c5ae5740d Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Wed, 29 May 2024 12:53:30 +0200 Subject: [PATCH 720/833] sflow: Fix check for disabled receive time. Changed sFlowRcvrTimeout to a uint32_t to avoid time_t warnings reported by Coverity. A uint32_t is more than large enough as this is a (seconds) tick counter and OVS is not even using this. 
Fixes: c72e245a0e2c ("Add InMon's sFlow Agent library to the build system.") Acked-by: Ilya Maximets Signed-off-by: Eelco Chaudron --- lib/sflow_api.h | 6 +++--- lib/sflow_receiver.c | 7 ++++--- ofproto/ofproto-dpif-sflow.c | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/lib/sflow_api.h b/lib/sflow_api.h index f4bfa5eada8..b884a6a7d09 100644 --- a/lib/sflow_api.h +++ b/lib/sflow_api.h @@ -97,7 +97,7 @@ typedef struct _SFLReceiver { struct _SFLReceiver *nxt; /* MIB fields */ char *sFlowRcvrOwner; - time_t sFlowRcvrTimeout; + u_int32_t sFlowRcvrTimeout; u_int32_t sFlowRcvrMaximumDatagramSize; SFLAddress sFlowRcvrAddress; u_int32_t sFlowRcvrPort; @@ -251,8 +251,8 @@ SFLSampler *sfl_agent_getSamplerByIfIndex(SFLAgent *agent, u_int32_t ifIndex); /* receiver */ char * sfl_receiver_get_sFlowRcvrOwner(SFLReceiver *receiver); void sfl_receiver_set_sFlowRcvrOwner(SFLReceiver *receiver, char *sFlowRcvrOwner); -time_t sfl_receiver_get_sFlowRcvrTimeout(SFLReceiver *receiver); -void sfl_receiver_set_sFlowRcvrTimeout(SFLReceiver *receiver, time_t sFlowRcvrTimeout); +u_int32_t sfl_receiver_get_sFlowRcvrTimeout(SFLReceiver *receiver); +void sfl_receiver_set_sFlowRcvrTimeout(SFLReceiver *receiver, u_int32_t sFlowRcvrTimeout); u_int32_t sfl_receiver_get_sFlowRcvrMaximumDatagramSize(SFLReceiver *receiver); void sfl_receiver_set_sFlowRcvrMaximumDatagramSize(SFLReceiver *receiver, u_int32_t sFlowRcvrMaximumDatagramSize); SFLAddress *sfl_receiver_get_sFlowRcvrAddress(SFLReceiver *receiver); diff --git a/lib/sflow_receiver.c b/lib/sflow_receiver.c index 4162518e3c4..3c5aec897e4 100644 --- a/lib/sflow_receiver.c +++ b/lib/sflow_receiver.c @@ -102,10 +102,10 @@ void sfl_receiver_set_sFlowRcvrOwner(SFLReceiver *receiver, char *sFlowRcvrOwner reset(receiver); } } -time_t sfl_receiver_get_sFlowRcvrTimeout(SFLReceiver *receiver) { +u_int32_t sfl_receiver_get_sFlowRcvrTimeout(SFLReceiver *receiver) { return receiver->sFlowRcvrTimeout; } -void 
sfl_receiver_set_sFlowRcvrTimeout(SFLReceiver *receiver, time_t sFlowRcvrTimeout) { +void sfl_receiver_set_sFlowRcvrTimeout(SFLReceiver *receiver, u_int32_t sFlowRcvrTimeout) { receiver->sFlowRcvrTimeout =sFlowRcvrTimeout; } u_int32_t sfl_receiver_get_sFlowRcvrMaximumDatagramSize(SFLReceiver *receiver) { @@ -146,7 +146,8 @@ void sfl_receiver_tick(SFLReceiver *receiver) // if there are any samples to send, flush them now if(receiver->sampleCollector.numSamples > 0) sendSample(receiver); // check the timeout - if(receiver->sFlowRcvrTimeout && (u_int32_t)receiver->sFlowRcvrTimeout != 0xFFFFFFFF) { + if(receiver->sFlowRcvrTimeout + && receiver->sFlowRcvrTimeout != UINT32_MAX) { // count down one tick and reset if we reach 0 if(--receiver->sFlowRcvrTimeout == 0) reset(receiver); } diff --git a/ofproto/ofproto-dpif-sflow.c b/ofproto/ofproto-dpif-sflow.c index 4a68e9b949b..80405b68a67 100644 --- a/ofproto/ofproto-dpif-sflow.c +++ b/ofproto/ofproto-dpif-sflow.c @@ -808,7 +808,7 @@ dpif_sflow_set_options(struct dpif_sflow *ds, receiver = sfl_agent_addReceiver(ds->sflow_agent); sfl_receiver_set_sFlowRcvrOwner(receiver, "Open vSwitch sFlow"); - sfl_receiver_set_sFlowRcvrTimeout(receiver, 0xffffffff); + sfl_receiver_set_sFlowRcvrTimeout(receiver, UINT32_MAX); /* Set the sampling_rate down in the datapath. */ ds->probability = MAX(1, UINT32_MAX / ds->options->sampling_rate); From 361d7bce0f68708fc06d4ab8f7b3759d7be81c75 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Wed, 29 May 2024 12:53:31 +0200 Subject: [PATCH 721/833] ofproto-dpif: Define age as time_t in ofproto_unixctl_fdb_add(). Fix the warning from Coverity about potential truncation of the time_t value when copying to a local variable by changing the local variable's type to time_t. 
Fixes: ccc24fc88d59 ("ofproto-dpif: APIs and CLI option to add/delete static fdb entry.") Acked-by: Mike Pattrick Acked-by: Paolo Valerio Signed-off-by: Eelco Chaudron --- ofproto/ofproto-dpif.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index 32d037be607..fcd7cd753ca 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -6097,7 +6097,7 @@ ofproto_unixctl_fdb_add(struct unixctl_conn *conn, int argc OVS_UNUSED, const char *port_name = argv[2]; uint16_t vlan = atoi(argv[3]); struct eth_addr mac; - int age; + time_t age; ofproto = ofproto_dpif_lookup_by_name(br_name); if (!ofproto) { From d97eee88e58013b608ce134521c40e537509c939 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Wed, 29 May 2024 12:53:32 +0200 Subject: [PATCH 722/833] db-ctl-base: Initialize the output variable in the ctx structure. Coverity flagged that the uninitialized output variable was used in the ctl_context_init_command() function. This patch initializes the variable. In addition, it also destroys the ds string in ctl_context_done() in case it's not cleared properly. 
Fixes: 07ff77ccb82a ("db-ctl-base: Make common database command code into library.") Acked-by: Ilya Maximets Acked-by: Paolo Valerio Signed-off-by: Eelco Chaudron --- lib/db-ctl-base.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/db-ctl-base.c b/lib/db-ctl-base.c index 3a8068b12c0..b3e9b92d197 100644 --- a/lib/db-ctl-base.c +++ b/lib/db-ctl-base.c @@ -2656,6 +2656,7 @@ ctl_context_init(struct ctl_context *ctx, struct ctl_command *command, struct ovsdb_symbol_table *symtab, void (*invalidate_cache_cb)(struct ctl_context *)) { + ds_init(&ctx->output); if (command) { ctl_context_init_command(ctx, command, false); } @@ -2688,6 +2689,7 @@ ctl_context_done(struct ctl_context *ctx, ctl_context_done_command(ctx, command); } invalidate_cache(ctx); + ds_destroy(&ctx->output); } char * OVS_WARN_UNUSED_RESULT From f2e4195b0c87e4c80d7e4031516beea7133d1338 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Wed, 29 May 2024 12:53:33 +0200 Subject: [PATCH 723/833] netdev-linux: Fix uninitialized gso_type case. This patch fixes an uninitialized gso_type case in netdev_linux_prepend_vnet_hdr() by returning an error. Fixes: 3337e6d91c5b ("userspace: Enable L4 checksum offloading by default.") Acked-by: Ilya Maximets Signed-off-by: Eelco Chaudron --- lib/netdev-linux.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index f5074f4f5f3..c316238cd56 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -7174,6 +7174,11 @@ netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu) vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; } else if (dp_packet_hwol_tx_ipv6(b)) { vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; + } else { + VLOG_ERR_RL(&rl, "Unknown gso_type for TSO packet. 
" + "Flags: %#"PRIx64, + (uint64_t) *dp_packet_ol_flags_ptr(b)); + return EINVAL; } } else { vnet->hdr_len = 0; From 6003a8875915138fa2f7fe78efb41318341c2cf1 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 3 Jun 2024 13:12:04 +0200 Subject: [PATCH 724/833] tests: Fix non-portable plus match in python vlog test. '\+' as a one-or-more match is a GNU extension and it doesn't work in BSD sed. This makes the python vlog test to fail on FreeBSD 14 that recently got python 3.11 in CirrusCI: | --- - 2024-06-03 10:42:26.363566000 +0000 | +++ /tmp/cirrus-ci-build/tests/testsuite.dir/at-groups/2541/stdout | @@ -7,31 +7,37 @@ | Traceback (most recent call last): | File , line , in main | assert fail | + ^^^^ Remove the '\+' match to make the line removal work. It doesn't do much for us as we would remove the same lines either way. This change makes CirrusCI green again. Fixes: 9185793e7543 ("tests: Fix compatibility issue with Python 3.13 in vlog.at.") Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- tests/vlog.at | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/vlog.at b/tests/vlog.at index efe91479a63..2768c074009 100644 --- a/tests/vlog.at +++ b/tests/vlog.at @@ -8,7 +8,7 @@ AT_CHECK([$PYTHON3 $srcdir/test-vlog.py --log-file log_file \ AT_CHECK([sed -e 's/.*-.*-.*T..:..:..Z |//' \ -e 's/File ".*", line [[0-9]][[0-9]]*,/File , line ,/' \ --e '/\^\+/d' \ +-e '/\^/d' \ stderr_log], [0], [dnl 0 | module_0 | EMER | emergency 1 | module_0 | ERR | error From 474ebd6dd508507f5202383cb2d8ca58d7cf8487 Mon Sep 17 00:00:00 2001 From: Terry Wilson Date: Mon, 6 May 2024 11:58:33 -0500 Subject: [PATCH 725/833] ovsdb-idl: Add C IDL test for "monitor" fallback. There was a Python-only test for ensuring that the library would work when connecting to an older ovsdb-server that did not support monitor_cond. This adds a C IDL version of that test. 
Signed-off-by: Terry Wilson Signed-off-by: Ilya Maximets --- tests/ovsdb-idl.at | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/tests/ovsdb-idl.at b/tests/ovsdb-idl.at index c9e36d678b0..97162707eb4 100644 --- a/tests/ovsdb-idl.at +++ b/tests/ovsdb-idl.at @@ -1119,6 +1119,19 @@ OVSDB_CHECK_IDL_FETCH_COLUMNS([simple idl, initially populated], 003: done ]]) +m4_define([OVSDB_CHECK_IDL_WO_MONITOR_COND_C], + [AT_SETUP([$1 - C]) + AT_KEYWORDS([ovsdb server idl monitor $4]) + OVSDB_START_IDLTEST + AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/disable-monitor-cond]) + + AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 idl unix:socket $2], + [0], [stdout], [ignore]) + AT_CHECK([sort stdout | uuidfilt]m4_if([$5],,, [[| $5]]), + [0], [$3]) + OVSDB_SERVER_SHUTDOWN + AT_CLEANUP]) + m4_define([OVSDB_CHECK_IDL_WO_MONITOR_COND_PY], [AT_SETUP([$1 - Python3]) AT_KEYWORDS([ovsdb server idl Python monitor $4]) @@ -1132,7 +1145,8 @@ m4_define([OVSDB_CHECK_IDL_WO_MONITOR_COND_PY], AT_CLEANUP]) m4_define([OVSDB_CHECK_IDL_WO_MONITOR_COND], - [OVSDB_CHECK_IDL_WO_MONITOR_COND_PY($@)]) + [OVSDB_CHECK_IDL_WO_MONITOR_COND_C($@) + OVSDB_CHECK_IDL_WO_MONITOR_COND_PY($@)]) OVSDB_CHECK_IDL_WO_MONITOR_COND([simple idl disable monitor-cond], From 4c0e1a1d702a357b5e48605432a715ffbb5dff71 Mon Sep 17 00:00:00 2001 From: Terry Wilson Date: Mon, 6 May 2024 11:58:34 -0500 Subject: [PATCH 726/833] python: ovsdb-idl: Use monitor_cond for _Server DB. Unlike the C IDL code, the Python version still monitors the _Server DB with "monitor" instead of "monitor_cond". This results in receiving an entire Database row every time the "index" value is updated which includes the 'schema' column. Using "monitor_cond" will result in "update2" notifications which just include the changed "index" value. Unlike the C IDL, the Python IDL requires a SchemaHelper object to instantiate the IDL, leaving it to the user of the library to call "get_schema" themselves. 
Since the Python IDL did not have support for retrieving the schema automatically and did not have a state for doing so, instead of transitioning on an error response from retrieving the _Server schema to requesting the "data" schema, this moves directly to monitoring the "data" DB. Signed-off-by: Terry Wilson Signed-off-by: Ilya Maximets --- python/ovs/db/idl.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/python/ovs/db/idl.py b/python/ovs/db/idl.py index a80da84e7a9..c1341fc2a1f 100644 --- a/python/ovs/db/idl.py +++ b/python/ovs/db/idl.py @@ -35,9 +35,9 @@ ROW_UPDATE = "update" ROW_DELETE = "delete" -OVSDB_UPDATE = 0 -OVSDB_UPDATE2 = 1 -OVSDB_UPDATE3 = 2 +OVSDB_UPDATE = "update" +OVSDB_UPDATE2 = "update2" +OVSDB_UPDATE3 = "update3" CLUSTERED = "clustered" RELAY = "relay" @@ -77,7 +77,7 @@ def __contains__(self, item): return item in self.keys() -class Monitor(enum.IntEnum): +class Monitor(enum.Enum): monitor = OVSDB_UPDATE monitor_cond = OVSDB_UPDATE2 monitor_cond_since = OVSDB_UPDATE3 @@ -465,23 +465,18 @@ def run(self): self.__parse_update(msg.params[2], OVSDB_UPDATE3) self.last_id = msg.params[1] elif (msg.type == ovs.jsonrpc.Message.T_NOTIFY - and msg.method == "update2" - and len(msg.params) == 2): - # Database contents changed. - self.__parse_update(msg.params[1], OVSDB_UPDATE2) - elif (msg.type == ovs.jsonrpc.Message.T_NOTIFY - and msg.method == "update" + and msg.method in (OVSDB_UPDATE, OVSDB_UPDATE2) and len(msg.params) == 2): # Database contents changed. 
if msg.params[0] == str(self.server_monitor_uuid): - self.__parse_update(msg.params[1], OVSDB_UPDATE, + self.__parse_update(msg.params[1], msg.method, tables=self.server_tables) self.change_seqno = previous_change_seqno if not self.__check_server_db(): self.force_reconnect() break else: - self.__parse_update(msg.params[1], OVSDB_UPDATE) + self.__parse_update(msg.params[1], msg.method) elif self.handle_monitor_canceled(msg): break elif self.handle_monitor_cancel_reply(msg): @@ -540,7 +535,7 @@ def run(self): # Reply to our "monitor" of _Server request. try: self._server_monitor_request_id = None - self.__parse_update(msg.result, OVSDB_UPDATE, + self.__parse_update(msg.result, OVSDB_UPDATE2, tables=self.server_tables) self.change_seqno = previous_change_seqno if self.__check_server_db(): @@ -579,6 +574,11 @@ def run(self): elif msg.type == ovs.jsonrpc.Message.T_NOTIFY and msg.id == "echo": # Reply to our echo request. Ignore it. pass + elif (msg.type == ovs.jsonrpc.Message.T_ERROR and + self.state == self.IDL_S_SERVER_MONITOR_REQUESTED and + msg.id == self._server_monitor_request_id): + self._server_monitor_request_id = None + self.__send_monitor_request() elif (msg.type == ovs.jsonrpc.Message.T_ERROR and self.state == ( self.IDL_S_DATA_MONITOR_COND_SINCE_REQUESTED) and @@ -912,7 +912,7 @@ def __send_server_monitor_request(self): monitor_request = {"columns": columns} monitor_requests[table.name] = [monitor_request] msg = ovs.jsonrpc.Message.create_request( - 'monitor', [self._server_db.name, + 'monitor_cond', [self._server_db.name, str(self.server_monitor_uuid), monitor_requests]) self._server_monitor_request_id = msg.id From 8e2b06af9832cf5d21fe21acfd69e0f62b78d439 Mon Sep 17 00:00:00 2001 From: Jun Gu Date: Mon, 20 May 2024 16:39:50 +0100 Subject: [PATCH 727/833] ovs-ctl: Fix typo. Correct spelling of destination in ovs-ctl.in. 
Signed-off-by: Jun Gu Acked-by: Eelco Chaudron Signed-off-by: Simon Horman --- utilities/ovs-ctl.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utilities/ovs-ctl.in b/utilities/ovs-ctl.in index 0b2820c3611..57abd3a5b45 100644 --- a/utilities/ovs-ctl.in +++ b/utilities/ovs-ctl.in @@ -456,7 +456,7 @@ File location options: Options for "enable-protocol": --protocol=PROTOCOL protocol to enable with iptables (default: gre) --sport=PORT source port to match (for tcp or udp protocol) - --dport=PORT ddestination port to match (for tcp or udp protocol) + --dport=PORT destination port to match (for tcp or udp protocol) Option for "start-ovs-ipsec": --ike-daemon=IKE_DAEMON From 4a838f627c263792aa077fcf73417c689ae3a811 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 20 May 2024 16:39:51 +0100 Subject: [PATCH 728/833] AUTHORS: Add Jun Gu. Add Jun Gu to AUTHORS file. Acked-by: Eelco Chaudron Signed-off-by: Simon Horman --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index bb44ef14251..b8cb2aef418 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -246,6 +246,7 @@ Joe Stringer joe@ovn.org Jon Kohler jon@nutanix.com Jonathan Vestin jonavest@kau.se Jorge Arturo Sauma Vargas jorge.sauma@hpe.com +Jun Gu jun.gu@easystack.cn Jun Nakajima jun.nakajima@intel.com Jun Wang junwang01@cestc.cn JunhanYan juyan@redhat.com From 90b8e4d0533cb59335241303a0ac1f41c69227d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Skytt=C3=A4?= Date: Mon, 20 May 2024 19:13:44 +0100 Subject: [PATCH 729/833] docs: afxdp: Fix CONFIG_HAVE_EBPF_JIT Kconfig option spelling. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix CONFIG_HAVE_EBPF_JIT Kconfig option spelling ("EBPF" vs "BPF"). 
Signed-off-by: Ville Skyttä Acked-by: Eelco Chaudron Signed-off-by: Simon Horman --- Documentation/intro/install/afxdp.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/intro/install/afxdp.rst b/Documentation/intro/install/afxdp.rst index 964d9ef5b1d..7fa8088c6ee 100644 --- a/Documentation/intro/install/afxdp.rst +++ b/Documentation/intro/install/afxdp.rst @@ -103,7 +103,7 @@ vSwitch with AF_XDP will require the following: * CONFIG_BPF_JIT=y (Performance) - * CONFIG_HAVE_BPF_JIT=y (Performance) + * CONFIG_HAVE_EBPF_JIT=y (Performance) * CONFIG_XDP_SOCKETS_DIAG=y (Debugging) From 1d681ffe3b208a0db4945b6389142ab18404a4d1 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 20 May 2024 19:13:45 +0100 Subject: [PATCH 730/833] =?UTF-8?q?AUTHORS:=20Add=20Ville=20Skytt=C3=A4.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Ville Skyttä to AUTHORS file. Acked-by: Eelco Chaudron Signed-off-by: Simon Horman --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index b8cb2aef418..155e484360d 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -475,6 +475,7 @@ Valient Gough vgough@pobox.com Vasu Dasari vdasari@gmail.com Venkata Anil Kommaddi vkommadi@redhat.com Viacheslav Galaktionov viacheslav.galaktionov@arknetworks.am +Ville Skyttä ville.skytta@upcloud.com Vishal Deep Ajmera vishal.deep.ajmera@ericsson.com Vivien Bernet-Rollande vbr@soprive.net Vlad Buslov vladbu@nvidia.com From 3a6b8c83619d3b0e11ff2fcea5b9a04edc2f3e4a Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Thu, 16 May 2024 11:38:31 -0400 Subject: [PATCH 731/833] ipf: Only add fragments to batch of same dl_type. When conntrack is reassembling packet fragments, the same reassembly context can be shared across multiple threads handling different packets simultaneously. 
Once a full packet is assembled, it is added to a packet batch for processing, this is most likely the batch that added it in the first place, but that isn't a guarantee. The packets in these batches should be segregated by network protocol version (ipv4 vs ipv6) for conntrack defragmentation to function appropriately. However, there are conditions where we would add a reassembled packet of one type to a batch of another. This change introduces checks to make sure that reassembled or expired fragments are only added to packet batches of the same type. Fixes: 4ea96698f667 ("Userspace datapath: Add fragmentation handling.") Reported-at: https://issues.redhat.com/browse/FDP-560 Signed-off-by: Mike Pattrick Acked-by: Paolo Valerio Acked-by: Simon Horman Signed-off-by: Aaron Conole --- lib/ipf.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/lib/ipf.c b/lib/ipf.c index 7d74e2c131e..3c8960be311 100644 --- a/lib/ipf.c +++ b/lib/ipf.c @@ -1063,6 +1063,9 @@ ipf_send_completed_frags(struct ipf *ipf, struct dp_packet_batch *pb, struct ipf_list *ipf_list; LIST_FOR_EACH_SAFE (ipf_list, list_node, &ipf->frag_complete_list) { + if ((ipf_list->key.dl_type == htons(ETH_TYPE_IPV6)) != v6) { + continue; + } if (ipf_send_frags_in_list(ipf, ipf_list, pb, IPF_FRAG_COMPLETED_LIST, v6, now)) { ipf_completed_list_clean(&ipf->frag_lists, ipf_list); @@ -1096,6 +1099,9 @@ ipf_send_expired_frags(struct ipf *ipf, struct dp_packet_batch *pb, size_t lists_removed = 0; LIST_FOR_EACH_SAFE (ipf_list, list_node, &ipf->frag_exp_list) { + if ((ipf_list->key.dl_type == htons(ETH_TYPE_IPV6)) != v6) { + continue; + } if (now <= ipf_list->expiration || lists_removed >= IPF_FRAG_LIST_MAX_EXPIRED) { break; @@ -1116,7 +1122,8 @@ ipf_send_expired_frags(struct ipf *ipf, struct dp_packet_batch *pb, /* Adds a reassmebled packet to a packet batch to be processed by the caller. 
*/ static void -ipf_execute_reass_pkts(struct ipf *ipf, struct dp_packet_batch *pb) +ipf_execute_reass_pkts(struct ipf *ipf, struct dp_packet_batch *pb, + ovs_be16 dl_type) { if (ovs_list_is_empty(&ipf->reassembled_pkt_list)) { return; @@ -1127,6 +1134,7 @@ ipf_execute_reass_pkts(struct ipf *ipf, struct dp_packet_batch *pb) LIST_FOR_EACH_SAFE (rp, rp_list_node, &ipf->reassembled_pkt_list) { if (!rp->list->reass_execute_ctx && + rp->list->key.dl_type == dl_type && ipf_dp_packet_batch_add(pb, rp->pkt, false)) { rp->list->reass_execute_ctx = rp->pkt; } @@ -1237,7 +1245,7 @@ ipf_preprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb, } if (ipf_get_enabled(ipf) || atomic_count_get(&ipf->nfrag)) { - ipf_execute_reass_pkts(ipf, pb); + ipf_execute_reass_pkts(ipf, pb, dl_type); } } From 16f6885353c2820adc3c217bd2588d35748f0849 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Thu, 16 May 2024 11:38:32 -0400 Subject: [PATCH 732/833] ipf: Handle common case of ipf defragmentation. When conntrack is reassembling packet fragments, the same reassembly context can be shared across multiple threads handling different packets simultaneously. Once a full packet is assembled, it is added to a packet batch for processing. In the case where there are multiple different pmd threads accessing conntrack simultaneously, there is a race condition where the reassembled packet may be added to an arbitrary batch even if the current batch is available. When this happens, the packet may be handled incorrectly as it is inserted into a random openflow execution pipeline, instead of the pipeline for that packet's flow. This change makes a best-effort attempt to add the defragmented packet directly to the current batch. This should succeed most of the time. 
Fixes: 4ea96698f667 ("Userspace datapath: Add fragmentation handling.") Reported-at: https://issues.redhat.com/browse/FDP-560 Signed-off-by: Mike Pattrick Acked-by: Paolo Valerio Acked-by: Simon Horman Signed-off-by: Aaron Conole --- lib/ipf.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/lib/ipf.c b/lib/ipf.c index 3c8960be311..2d715f5e9d3 100644 --- a/lib/ipf.c +++ b/lib/ipf.c @@ -506,13 +506,15 @@ ipf_reassemble_v6_frags(struct ipf_list *ipf_list) } /* Called when a frag list state transitions to another state. This is - * triggered by new fragment for the list being received.*/ -static void +* triggered by new fragment for the list being received. Returns a reassembled +* packet if this fragment has completed one. */ +static struct reassembled_pkt * ipf_list_state_transition(struct ipf *ipf, struct ipf_list *ipf_list, bool ff, bool lf, bool v6) OVS_REQUIRES(ipf->ipf_lock) { enum ipf_list_state curr_state = ipf_list->state; + struct reassembled_pkt *ret = NULL; enum ipf_list_state next_state; switch (curr_state) { case IPF_LIST_STATE_UNUSED: @@ -562,12 +564,15 @@ ipf_list_state_transition(struct ipf *ipf, struct ipf_list *ipf_list, ipf_reassembled_list_add(&ipf->reassembled_pkt_list, rp); ipf_expiry_list_remove(ipf_list); next_state = IPF_LIST_STATE_COMPLETED; + ret = rp; } else { next_state = IPF_LIST_STATE_REASS_FAIL; } } } ipf_list->state = next_state; + + return ret; } /* Some sanity checks are redundant, but prudent, in case code paths for @@ -799,7 +804,8 @@ ipf_is_frag_duped(const struct ipf_frag *frag_list, int last_inuse_idx, static bool ipf_process_frag(struct ipf *ipf, struct ipf_list *ipf_list, struct dp_packet *pkt, uint16_t start_data_byte, - uint16_t end_data_byte, bool ff, bool lf, bool v6) + uint16_t end_data_byte, bool ff, bool lf, bool v6, + struct reassembled_pkt **rp) OVS_REQUIRES(ipf->ipf_lock) { bool duped_frag = ipf_is_frag_duped(ipf_list->frag_list, @@ -820,7 +826,7 @@ 
ipf_process_frag(struct ipf *ipf, struct ipf_list *ipf_list, ipf_list->last_inuse_idx++; atomic_count_inc(&ipf->nfrag); ipf_count(ipf, v6, IPF_NFRAGS_ACCEPTED); - ipf_list_state_transition(ipf, ipf_list, ff, lf, v6); + *rp = ipf_list_state_transition(ipf, ipf_list, ff, lf, v6); } else { OVS_NOT_REACHED(); } @@ -853,7 +859,8 @@ ipf_list_init(struct ipf_list *ipf_list, struct ipf_list_key *key, * to a list of fragemnts. */ static bool ipf_handle_frag(struct ipf *ipf, struct dp_packet *pkt, ovs_be16 dl_type, - uint16_t zone, long long now, uint32_t hash_basis) + uint16_t zone, long long now, uint32_t hash_basis, + struct reassembled_pkt **rp) OVS_REQUIRES(ipf->ipf_lock) { struct ipf_list_key key; @@ -922,7 +929,7 @@ ipf_handle_frag(struct ipf *ipf, struct dp_packet *pkt, ovs_be16 dl_type, } return ipf_process_frag(ipf, ipf_list, pkt, start_data_byte, - end_data_byte, ff, lf, v6); + end_data_byte, ff, lf, v6, rp); } /* Filters out fragments from a batch of fragments and adjust the batch. */ @@ -941,11 +948,17 @@ ipf_extract_frags_from_batch(struct ipf *ipf, struct dp_packet_batch *pb, || (dl_type == htons(ETH_TYPE_IPV6) && ipf_is_valid_v6_frag(ipf, pkt)))) { + struct reassembled_pkt *rp = NULL; ovs_mutex_lock(&ipf->ipf_lock); - if (!ipf_handle_frag(ipf, pkt, dl_type, zone, now, hash_basis)) { + if (!ipf_handle_frag(ipf, pkt, dl_type, zone, now, hash_basis, + &rp)) { dp_packet_batch_refill(pb, pkt, pb_idx); } else { + if (rp && !dp_packet_batch_is_full(pb)) { + dp_packet_batch_refill(pb, rp->pkt, pb_idx); + rp->list->reass_execute_ctx = rp->pkt; + } dp_packet_delete(pkt); } ovs_mutex_unlock(&ipf->ipf_lock); From 40f0ac48ffee7936e3c87c122171f5a1143fbd2e Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 31 May 2024 23:45:10 +0200 Subject: [PATCH 733/833] tests: sendpkt: Allow different input formats. We require python 3, so instead of manually parsing bytes on input we can use built-in bytes.fromhex(). 
This function ignores whitespaces, so we can use different input formats - the old style space-separated bytes as well as pure hex strings provided by ovs-ofctl compose-packet and ovs-pcap. Acked-by: Simon Horman Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- tests/sendpkt.py | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/tests/sendpkt.py b/tests/sendpkt.py index 49ac45275a9..7cbea516548 100755 --- a/tests/sendpkt.py +++ b/tests/sendpkt.py @@ -48,28 +48,10 @@ if options.packet_type != "eth": parser.error('invalid argument to "-t"/"--type". Allowed value is "eth".') -# store the hex bytes with 0x appended at the beginning -# if not present in the user input and validate the hex bytes -hex_list = [] -for a in args[1:]: - if a[:2] != "0x": - hex_byte = "0x" + a - else: - hex_byte = a - try: - temp = int(hex_byte, 0) - except: - parser.error("invalid hex byte " + a) - - if temp > 0xff: - parser.error("hex byte " + a + " cannot be greater than 0xff!") - - hex_list.append(temp) - -if sys.version_info < (3, 0): - pkt = "".join(map(chr, hex_list)) -else: - pkt = bytes(hex_list) +# Strip '0x' prefixes from hex input, combine into a single string and +# convert to bytes. +hex_str = "".join([a[2:] if a.startswith("0x") else a for a in args[1:]]) +pkt = bytes.fromhex(hex_str) try: sockfd = socket.socket(socket.AF_PACKET, socket.SOCK_RAW) From cd4ea33b5bb0862e75d165a6baaf1308709e9ffc Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 31 May 2024 23:45:11 +0200 Subject: [PATCH 734/833] tests: Convert ND, MPLS and CT sendpkt tests to compose-packet. These tests contain plain hex dumps that are hard to read and modify. Replace with equivalent calls to ovs-ofctl compose-packet --bare and ovs-pcap. Tcpdump calls modified to write actual pcaps instead of text output, so ovs-pcap can be used while checking the results. While at it, replacing sleeps with more robust waiting for tcpdump to start listening. 
M4 macros are better than shell variables, because we can see the substitution result in the test log. So, using m4_define and m4_join extensively. Acked-by: Simon Horman Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- tests/system-traffic.at | 233 ++++++++++++++++++++++++++-------------- 1 file changed, 152 insertions(+), 81 deletions(-) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index bd7647cbee6..c4cebb0a374 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -2390,11 +2390,22 @@ table=20 actions=drop AT_CHECK([ovs-ofctl del-flows br0]) AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) +m4_define([ND_NS_PKT], [m4_join([,], + [eth_src=36:b1:ee:7c:01:03,eth_dst=36:b1:ee:7c:01:02,eth_type=0x86dd], + [ipv6_src=fe80::f816:3eff:fe04:6604,ipv6_dst=fe80::f816:3eff:fea7:dd0e], + [nw_proto=58,nw_ttl=255,nw_frag=no], + [icmpv6_type=136,icmpv6_code=0], + [nd_options_type=2,nd_tll=36:b1:ee:7c:01:03])]) + dnl Send a mismatching neighbor discovery. -NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 36 b1 ee 7c 01 02 36 b1 ee 7c 01 03 86 dd 60 00 00 00 00 20 3a ff fe 80 00 00 00 00 00 00 f8 16 3e ff fe 04 66 04 fe 80 00 00 00 00 00 00 f8 16 3e ff fe a7 dd 0e 88 00 f1 f2 20 00 00 00 30 00 00 00 00 00 00 00 00 00 00 00 00 00 00 01 02 01 36 b1 ee 7c 01 03 > /dev/null]) +NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + $(ovs-ofctl compose-packet --bare 'ND_NS_PKT,nd_target=3000::1')], + [0], [ignore]) dnl Send a matching neighbor discovery. 
-NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 36 b1 ee 7c 01 02 36 b1 ee 7c 01 03 86 dd 60 00 00 00 00 20 3a ff fe 80 00 00 00 00 00 00 f8 16 3e ff fe 04 66 04 fe 80 00 00 00 00 00 00 f8 16 3e ff fe a7 dd 0e 88 00 fe 5f 20 00 00 00 20 01 00 00 00 00 00 00 00 00 00 01 00 00 03 92 02 01 36 b1 ee 7c 01 03 > /dev/null]) +NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + $(ovs-ofctl compose-packet --bare 'ND_NS_PKT,nd_target=2001::1:0:392')], + [0], [ignore]) AT_CHECK([ovs-appctl dpctl/dump-flows | strip_stats | strip_used | dnl strip_key32 | strip_ptype | strip_eth | strip_recirc | dnl @@ -2406,10 +2417,14 @@ recirc_id(),in_port(2),eth_type(0x86dd),ipv6(proto=58,frag=no),icmpv6(ty OVS_WAIT_UNTIL([ovs-appctl dpctl/dump-flows | grep ",nd" | wc -l | grep -E ^0]) dnl Send a matching neighbor discovery. -NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 36 b1 ee 7c 01 02 36 b1 ee 7c 01 03 86 dd 60 00 00 00 00 20 3a ff fe 80 00 00 00 00 00 00 f8 16 3e ff fe 04 66 04 fe 80 00 00 00 00 00 00 f8 16 3e ff fe a7 dd 0e 88 00 fe 5f 20 00 00 00 20 01 00 00 00 00 00 00 00 00 00 01 00 00 03 92 02 01 36 b1 ee 7c 01 03 > /dev/null]) +NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + $(ovs-ofctl compose-packet --bare 'ND_NS_PKT,nd_target=2001::1:0:392')], + [0], [ignore]) dnl Send a mismatching neighbor discovery. 
-NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 36 b1 ee 7c 01 02 36 b1 ee 7c 01 03 86 dd 60 00 00 00 00 20 3a ff fe 80 00 00 00 00 00 00 f8 16 3e ff fe 04 66 04 fe 80 00 00 00 00 00 00 f8 16 3e ff fe a7 dd 0e 88 00 f1 f2 20 00 00 00 30 00 00 00 00 00 00 00 00 00 00 00 00 00 00 01 02 01 36 b1 ee 7c 01 03 > /dev/null]) +NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + $(ovs-ofctl compose-packet --bare 'ND_NS_PKT,nd_target=3000::1')], + [0], [ignore]) AT_CHECK([ovs-appctl dpctl/dump-flows | strip_stats | strip_used | dnl strip_key32 | strip_ptype | strip_eth | strip_recirc | dnl @@ -2438,20 +2453,29 @@ dnl The flow will encap a mpls header to the ip packet dnl eth/ip/icmp --> OVS --> eth/mpls/eth/ip/icmp AT_CHECK([ovs-ofctl -Oopenflow13 add-flow br0 "table=0,priority=100,dl_type=0x0800 actions=encap(mpls),set_mpls_label:2,encap(ethernet),set_field:00:00:00:00:00:02->dl_dst,set_field:00:00:00:00:00:01->dl_src,ovs-p1"]) -rm -rf p1.pcap -NETNS_DAEMONIZE([at_ns1], [tcpdump -l -n -xx -U -i p1 > p1.pcap], [tcpdump.pid]) -sleep 1 +NETNS_DAEMONIZE([at_ns1], + [tcpdump -l -n -xx -U -i p1 -w p1.pcap 2>tcpdump_err], [tcpdump.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump_err]) + +m4_define([ICMP_PKT], [m4_join([,], + [eth_src=36:b1:ee:7c:01:03,eth_dst=36:b1:ee:7c:01:02,eth_type=0x0800], + [nw_src=10.1.1.1,nw_dst=10.1.1.2], + [nw_proto=1,nw_ttl=64,nw_frag=no], + [icmp_type=8,icmp_code=0])]) -dnl The hex dump is a icmp packet. pkt=eth/ip/icmp dnl The packet is sent from p0(at_ns0) interface directed to -dnl p1(at_ns1) interface -NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 36 b1 ee 7c 01 02 36 b1 ee 7c 01 03 08 00 45 00 00 54 03 44 40 00 40 01 21 61 0a 01 01 01 0a 01 01 02 08 00 ef ac 7c e4 00 03 5b 2c 1f 61 00 00 00 00 50 0b 02 00 00 00 00 00 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f 30 31 32 33 34 35 36 37 > /dev/null]) +dnl p1(at_ns1) interface. 
+NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + $(ovs-ofctl compose-packet --bare 'ICMP_PKT')], [0], [ignore]) + +dnl Check the expected mpls encapsulated packet on the egress interface. +m4_define([MPLS_HEADER], [m4_join([,], + [eth_src=00:00:00:00:00:01,eth_dst=00:00:00:00:00:02,eth_type=0x8847], + [mpls_label=2,mpls_ttl=64,mpls_bos=1])]) -dnl Check the expected mpls encapsulated packet on the egress interface -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0000: *0000 *0000 *0002 *0000 *0000 *0001 *8847 *0000" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0010: *2140 *36b1 *ee7c *0102 *36b1 *ee7c *0103 *0800" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0020: *4500 *0054 *0344 *4000 *4001 *2161 *0a01 *0101" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0030: *0a01 *0102 *0800 *efac *7ce4 *0003 *5b2c *1f61" 2>&1 1>/dev/null]) +OVS_WAIT_UNTIL([ovs-pcap p1.pcap | grep -q "m4_join([], [^], + $(ovs-ofctl compose-packet --bare 'MPLS_HEADER'), + $(ovs-ofctl compose-packet --bare 'ICMP_PKT'), [\$])"]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP @@ -2470,20 +2494,29 @@ dnl The flow will encap a mpls header to the ip packet dnl eth/ip/icmp --> OVS --> eth/mpls/eth/ip/icmp AT_CHECK([ovs-ofctl -Oopenflow13 add-flow br0 "table=0,priority=100,dl_type=0x0800 actions=encap(mpls),set_mpls_label:2,encap(ethernet),set_field:00:00:00:00:00:02->dl_dst,set_field:00:00:00:00:00:01->dl_src,ovs-p1"]) -rm -rf p1.pcap -NETNS_DAEMONIZE([at_ns1], [tcpdump -l -n -xx -U -i p1 > p1.pcap], [tcpdump.pid]) -sleep 1 +NETNS_DAEMONIZE([at_ns1], + [tcpdump -l -n -xx -U -i p1 -w p1.pcap 2>tcpdump_err], [tcpdump.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump_err]) + +m4_define([ICMP_PKT], [m4_join([,], + [eth_src=36:b1:ee:7c:01:03,eth_dst=36:b1:ee:7c:01:02,eth_type=0x0800], + [nw_src=10.1.1.1,nw_dst=10.1.1.2], + [nw_proto=1,nw_ttl=64,nw_frag=no], + [icmp_type=8,icmp_code=0])]) -dnl The hex dump is a icmp packet. 
pkt=eth/ip/icmp dnl The packet is sent from p0(at_ns0) interface directed to -dnl p1(at_ns1) interface -NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 36 b1 ee 7c 01 02 36 b1 ee 7c 01 03 08 00 45 00 00 54 03 44 40 00 40 01 21 61 0a 01 01 01 0a 01 01 02 08 00 ef ac 7c e4 00 03 5b 2c 1f 61 00 00 00 00 50 0b 02 00 00 00 00 00 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f 30 31 32 33 34 35 36 37 > /dev/null]) +dnl p1(at_ns1) interface. +NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + $(ovs-ofctl compose-packet --bare 'ICMP_PKT')], [0], [ignore]) -dnl Check the expected mpls encapsulated packet on the egress interface -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0000: *0000 *0000 *0002 *0000 *0000 *0001 *8847 *0000" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0010: *2140 *36b1 *ee7c *0102 *36b1 *ee7c *0103 *0800" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0020: *4500 *0054 *0344 *4000 *4001 *2161 *0a01 *0101" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0030: *0a01 *0102 *0800 *efac *7ce4 *0003 *5b2c *1f61" 2>&1 1>/dev/null]) +dnl Check the expected mpls encapsulated packet on the egress interface. 
+m4_define([MPLS_HEADER], [m4_join([,], + [eth_src=00:00:00:00:00:01,eth_dst=00:00:00:00:00:02,eth_type=0x8847], + [mpls_label=2,mpls_ttl=64,mpls_bos=1])]) + +OVS_WAIT_UNTIL([ovs-pcap p1.pcap | grep -q "m4_join([], [^], + $(ovs-ofctl compose-packet --bare 'MPLS_HEADER'), + $(ovs-ofctl compose-packet --bare 'ICMP_PKT'), [\$])"]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP @@ -2503,20 +2536,29 @@ dnl The flow will encap a mpls header to the ip packet dnl eth/ip/icmp --> OVS --> eth/mpls/eth/ip/icmp AT_CHECK([ovs-ofctl -Oopenflow13 add-flow br0 "table=0,priority=100,dl_type=0x0800 actions=encap(mpls_mc),set_mpls_label:2,encap(ethernet),set_field:00:00:00:00:00:02->dl_dst,set_field:00:00:00:00:00:01->dl_src,ovs-p1"]) -rm -rf p1.pcap -NETNS_DAEMONIZE([at_ns1], [tcpdump -l -n -xx -U -i p1 > p1.pcap], [tcpdump.pid]) -sleep 1 +NETNS_DAEMONIZE([at_ns1], + [tcpdump -l -n -xx -U -i p1 -w p1.pcap 2>tcpdump_err], [tcpdump.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump_err]) + +m4_define([ICMP_PKT], [m4_join([,], + [eth_src=36:b1:ee:7c:01:03,eth_dst=36:b1:ee:7c:01:02,eth_type=0x0800], + [nw_src=10.1.1.1,nw_dst=10.1.1.2], + [nw_proto=1,nw_ttl=64,nw_frag=no], + [icmp_type=8,icmp_code=0])]) -dnl The hex dump is a icmp packet. pkt=eth/ip/icmp dnl The packet is sent from p0(at_ns0) interface directed to -dnl p1(at_ns1) interface -NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 36 b1 ee 7c 01 02 36 b1 ee 7c 01 03 08 00 45 00 00 54 03 44 40 00 40 01 21 61 0a 01 01 01 0a 01 01 02 08 00 ef ac 7c e4 00 03 5b 2c 1f 61 00 00 00 00 50 0b 02 00 00 00 00 00 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f 30 31 32 33 34 35 36 37 > /dev/null]) +dnl p1(at_ns1) interface. +NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + $(ovs-ofctl compose-packet --bare 'ICMP_PKT')], [0], [ignore]) + +dnl Check the expected mpls encapsulated packet on the egress interface. 
+m4_define([MPLS_HEADER], [m4_join([,], + [eth_src=00:00:00:00:00:01,eth_dst=00:00:00:00:00:02,eth_type=0x8848], + [mpls_label=2,mpls_ttl=64,mpls_bos=1])]) -dnl Check the expected mpls encapsulated packet on the egress interface -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0000: *0000 *0000 *0002 *0000 *0000 *0001 *8848 *0000" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0010: *2140 *36b1 *ee7c *0102 *36b1 *ee7c *0103 *0800" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0020: *4500 *0054 *0344 *4000 *4001 *2161 *0a01 *0101" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0030: *0a01 *0102 *0800 *efac *7ce4 *0003 *5b2c *1f61" 2>&1 1>/dev/null]) +OVS_WAIT_UNTIL([ovs-pcap p1.pcap | grep -q "m4_join([], [^], + $(ovs-ofctl compose-packet --bare 'MPLS_HEADER'), + $(ovs-ofctl compose-packet --bare 'ICMP_PKT'), [\$])"]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP @@ -2535,20 +2577,29 @@ dnl The flow will encap a mpls header to the ip packet dnl eth/ip/icmp --> OVS --> eth/mpls/eth/ip/icmp AT_CHECK([ovs-ofctl -Oopenflow13 add-flow br0 "table=0,priority=100,dl_type=0x0800 actions=encap(mpls_mc),set_mpls_label:2,encap(ethernet),set_field:00:00:00:00:00:02->dl_dst,set_field:00:00:00:00:00:01->dl_src,ovs-p1"]) -rm -rf p1.pcap -NETNS_DAEMONIZE([at_ns1], [tcpdump -l -n -xx -U -i p1 > p1.pcap], [tcpdump.pid]) -sleep 1 +NETNS_DAEMONIZE([at_ns1], + [tcpdump -l -n -xx -U -i p1 -w p1.pcap 2>tcpdump_err], [tcpdump.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump_err]) + +m4_define([ICMP_PKT], [m4_join([,], + [eth_src=36:b1:ee:7c:01:03,eth_dst=36:b1:ee:7c:01:02,eth_type=0x0800], + [nw_src=10.1.1.1,nw_dst=10.1.1.2], + [nw_proto=1,nw_ttl=64,nw_frag=no], + [icmp_type=8,icmp_code=0])]) -dnl The hex dump is a icmp packet. 
pkt=eth/ip/icmp dnl The packet is sent from p0(at_ns0) interface directed to -dnl p1(at_ns1) interface -NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 36 b1 ee 7c 01 02 36 b1 ee 7c 01 03 08 00 45 00 00 54 03 44 40 00 40 01 21 61 0a 01 01 01 0a 01 01 02 08 00 ef ac 7c e4 00 03 5b 2c 1f 61 00 00 00 00 50 0b 02 00 00 00 00 00 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f 30 31 32 33 34 35 36 37 > /dev/null]) +dnl p1(at_ns1) interface. +NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + $(ovs-ofctl compose-packet --bare 'ICMP_PKT')], [0], [ignore]) -dnl Check the expected mpls encapsulated packet on the egress interface -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0000: *0000 *0000 *0002 *0000 *0000 *0001 *8848 *0000" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0010: *2140 *36b1 *ee7c *0102 *36b1 *ee7c *0103 *0800" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0020: *4500 *0054 *0344 *4000 *4001 *2161 *0a01 *0101" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0030: *0a01 *0102 *0800 *efac *7ce4 *0003 *5b2c *1f61" 2>&1 1>/dev/null]) +dnl Check the expected mpls encapsulated packet on the egress interface. 
+m4_define([MPLS_HEADER], [m4_join([,], + [eth_src=00:00:00:00:00:01,eth_dst=00:00:00:00:00:02,eth_type=0x8848], + [mpls_label=2,mpls_ttl=64,mpls_bos=1])]) + +OVS_WAIT_UNTIL([ovs-pcap p1.pcap | grep -q "m4_join([], [^], + $(ovs-ofctl compose-packet --bare 'MPLS_HEADER'), + $(ovs-ofctl compose-packet --bare 'ICMP_PKT'), [\$])"]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP @@ -2569,24 +2620,30 @@ dnl eth/mpls/eth/ip/icmp --> OVS --> eth/ip/icmp AT_CHECK([ovs-ofctl -Oopenflow13 add-flow br0 "table=0,priority=100,dl_type=0x8847,mpls_label=2 actions=decap(),decap(packet_type(ns=0,type=0)),ovs-p1"]) -rm -rf p1.pcap -NETNS_DAEMONIZE([at_ns1], [tcpdump -l -n -xx -U -i p1 > p1.pcap], [tcpdump.pid]) -sleep 1 +NETNS_DAEMONIZE([at_ns1], + [tcpdump -l -n -xx -U -i p1 -w p1.pcap 2>tcpdump_err], [tcpdump.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump_err]) -dnl The hex dump is an mpls packet encapsulating ethernet packet. pkt=eth/mpls/eth/ip/icmp -dnl The packet is sent from p0(at_ns0) interface directed to -dnl p1(at_ns1) interface -NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 00 00 00 00 00 02 00 00 00 00 00 01 88 47 00 00 21 40 36 b1 ee 7c 01 02 36 b1 ee 7c 01 03 08 00 45 00 00 54 03 44 40 00 40 01 21 61 0a 01 01 01 0a 01 01 02 08 00 ef ac 7c e4 00 03 5b 2c 1f 61 00 00 00 00 50 0b 02 00 00 00 00 00 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f 30 31 32 33 34 35 36 37 > /dev/null]) +m4_define([MPLS_HEADER], [m4_join([,], + [eth_src=00:00:00:00:00:01,eth_dst=00:00:00:00:00:02,eth_type=0x8847], + [mpls_label=2,mpls_ttl=64,mpls_bos=1])]) + +m4_define([ICMP_PKT], [m4_join([,], + [eth_src=36:b1:ee:7c:01:03,eth_dst=36:b1:ee:7c:01:02,eth_type=0x0800], + [nw_src=10.1.1.1,nw_dst=10.1.1.2], + [nw_proto=1,nw_ttl=64,nw_frag=no], + [icmp_type=8,icmp_code=0])]) -dnl Check the expected decapsulated on the egress interface -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0000: *36b1 *ee7c *0102 *36b1 *ee7c *0103 *0800 *4500" 2>&1 1>/dev/null]) 
-OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0010: *0054 *0344 *4000 *4001 *2161 *0a01 *0101 *0a01" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0020: *0102 *0800 *efac *7ce4 *0003 *5b2c *1f61 *0000" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0030: *0000 *500b *0200 *0000 *0000 *1011 *1213 *1415" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0040: *1617 *1819 *1a1b *1c1d *1e1f *2021 *2223 *2425" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0050: *2627 *2829 *2a2b *2c2d *2e2f *3031 *3233 *3435" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0060: *3637" 2>&1 1>/dev/null]) +dnl The packet is an eth/mpls/eth/ip/icmp sent from p0(at_ns0) interface +dnl directed to p1(at_ns1) interface. +NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + "$(ovs-ofctl compose-packet --bare 'MPLS_HEADER')" \ + "$(ovs-ofctl compose-packet --bare 'ICMP_PKT')"], + [0], [ignore]) +dnl Check the expected decapsulated on the egress interface. +OVS_WAIT_UNTIL([ovs-pcap p1.pcap | grep -q \ + "^$(ovs-ofctl compose-packet --bare 'ICMP_PKT')\$"]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP @@ -2606,24 +2663,30 @@ dnl eth/mpls/eth/ip/icmp --> OVS --> eth/ip/icmp AT_CHECK([ovs-ofctl -Oopenflow13 add-flow br0 "table=0,priority=100,dl_type=0x8847,mpls_label=2 actions=decap(),decap(packet_type(ns=0,type=0)),ovs-p1"]) -rm -rf p1.pcap -NETNS_DAEMONIZE([at_ns1], [tcpdump -l -n -xx -U -i p1 > p1.pcap], [tcpdump.pid]) -sleep 1 +NETNS_DAEMONIZE([at_ns1], + [tcpdump -l -n -xx -U -i p1 -w p1.pcap 2>tcpdump_err], [tcpdump.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump_err]) -dnl The hex dump is an mpls packet encapsulating ethernet packet. 
pkt=eth/mpls/eth/ip/icmp -dnl The packet is sent from p0(at_ns0) interface directed to -dnl p1(at_ns1) interface -NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 00 00 00 00 00 02 00 00 00 00 00 01 88 47 00 00 21 40 36 b1 ee 7c 01 02 36 b1 ee 7c 01 03 08 00 45 00 00 54 03 44 40 00 40 01 21 61 0a 01 01 01 0a 01 01 02 08 00 ef ac 7c e4 00 03 5b 2c 1f 61 00 00 00 00 50 0b 02 00 00 00 00 00 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f 30 31 32 33 34 35 36 37 > /dev/null]) +m4_define([MPLS_HEADER], [m4_join([,], + [eth_src=00:00:00:00:00:01,eth_dst=00:00:00:00:00:02,eth_type=0x8847], + [mpls_label=2,mpls_ttl=64,mpls_bos=1])]) -dnl Check the expected decapsulated on the egress interface -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0000: *36b1 *ee7c *0102 *36b1 *ee7c *0103 *0800 *4500" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0010: *0054 *0344 *4000 *4001 *2161 *0a01 *0101 *0a01" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0020: *0102 *0800 *efac *7ce4 *0003 *5b2c *1f61 *0000" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0030: *0000 *500b *0200 *0000 *0000 *1011 *1213 *1415" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0040: *1617 *1819 *1a1b *1c1d *1e1f *2021 *2223 *2425" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0050: *2627 *2829 *2a2b *2c2d *2e2f *3031 *3233 *3435" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0060: *3637" 2>&1 1>/dev/null]) +m4_define([ICMP_PKT], [m4_join([,], + [eth_src=36:b1:ee:7c:01:03,eth_dst=36:b1:ee:7c:01:02,eth_type=0x0800], + [nw_src=10.1.1.1,nw_dst=10.1.1.2], + [nw_proto=1,nw_ttl=64,nw_frag=no], + [icmp_type=8,icmp_code=0])]) +dnl The packet is an eth/mpls/eth/ip/icmp sent from p0(at_ns0) interface +dnl directed to p1(at_ns1) interface. 
+NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + "$(ovs-ofctl compose-packet --bare 'MPLS_HEADER')" \ + "$(ovs-ofctl compose-packet --bare 'ICMP_PKT')"], + [0], [ignore]) + +dnl Check the expected decapsulated on the egress interface. +OVS_WAIT_UNTIL([ovs-pcap p1.pcap | grep -q \ + "^$(ovs-ofctl compose-packet --bare 'ICMP_PKT')\$"]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP @@ -8293,10 +8356,18 @@ table=2,priority=10 ct_state=+trk+est action=drop AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) -# sending icmp pkts, first and second -NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 f0 00 00 01 01 02 f0 00 00 01 01 01 08 00 45 00 00 1c 00 01 00 00 40 01 64 dc 0a 01 01 01 0a 01 01 02 08 00 f7 ff ff ff ff ff > /dev/null]) +m4_define([ICMP_PKT], [m4_join([,], + [eth_src=f0:00:00:01:01:01,eth_dst=f0:00:00:01:01:02,eth_type=0x0800], + [nw_src=10.1.1.1,nw_dst=10.1.1.2], + [nw_proto=1,nw_ttl=64,nw_frag=no], + [icmp_type=8,icmp_code=0])]) + +# Sending ICMP packets, first and second. +NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + $(ovs-ofctl compose-packet --bare 'ICMP_PKT' '')], [0], [ignore]) -NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 f0 00 00 01 01 02 f0 00 00 01 01 01 08 00 45 00 00 1c 00 01 00 00 40 01 64 dc 0a 01 01 01 0a 01 01 02 08 00 f7 ff ff ff ff ff > /dev/null]) +NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + $(ovs-ofctl compose-packet --bare 'ICMP_PKT' '')], [0], [ignore]) sleep 1 From ac4df0c8dbb78e516f3481612d130deb34a8c216 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 31 May 2024 23:45:12 +0200 Subject: [PATCH 735/833] nsh: Add support to compose-packet and use it in system tests. OVS can parse NSH, but can't compose. Fix that and get rid of plain hex NSH packets in system tests as they are hard to read or modify. Tcpdump calls modified to write actual pcaps instead of text output, so ovs-pcap can be used while checking the results. 
While at it, replacing sleeps with more robust waiting for tcpdump to start listening. M4 macros are better than shell variables, because we can see the substitution result in the test log. So, using m4_define and m4_join extensively. Acked-by: Simon Horman Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- lib/flow.c | 18 ++++ tests/system-traffic.at | 177 ++++++++++++++++++++++++++-------------- 2 files changed, 134 insertions(+), 61 deletions(-) diff --git a/lib/flow.c b/lib/flow.c index 8e3402388cb..dc5fb328d9c 100644 --- a/lib/flow.c +++ b/lib/flow.c @@ -3420,6 +3420,24 @@ flow_compose(struct dp_packet *p, const struct flow *flow, arp->ar_sha = flow->arp_sha; arp->ar_tha = flow->arp_tha; } + } else if (flow->dl_type == htons(ETH_TYPE_NSH)) { + struct nsh_hdr *nsh; + + nsh = dp_packet_put_zeros(p, sizeof *nsh); + dp_packet_set_l3(p, nsh); + + nsh_set_flags_ttl_len(nsh, flow->nsh.flags, flow->nsh.ttl, + flow->nsh.mdtype == NSH_M_TYPE1 + ? NSH_M_TYPE1_LEN : NSH_BASE_HDR_LEN); + nsh->next_proto = flow->nsh.np; + nsh->md_type = flow->nsh.mdtype; + put_16aligned_be32(&nsh->path_hdr, flow->nsh.path_hdr); + + if (flow->nsh.mdtype == NSH_M_TYPE1) { + for (size_t i = 0; i < 4; i++) { + put_16aligned_be32(&nsh->md1.context[i], flow->nsh.context[i]); + } + } } if (eth_type_mpls(flow->dl_type)) { diff --git a/tests/system-traffic.at b/tests/system-traffic.at index c4cebb0a374..3f1a15445ee 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -8920,21 +8920,29 @@ dnl The flow will encap a nsh header to the TCP syn packet dnl eth/ip/tcp --> OVS --> eth/nsh/eth/ip/tcp AT_CHECK([ovs-ofctl -Oopenflow13 add-flow br0 "table=0,priority=100,in_port=ovs-p0,ip,actions=encap(nsh(md_type=1)),set_field:0x1234->nsh_spi,set_field:0x11223344->nsh_c1,encap(ethernet),set_field:f2:ff:00:00:00:02->dl_dst,set_field:f2:ff:00:00:00:01->dl_src,ovs-p1"]) -NETNS_DAEMONIZE([at_ns1], [tcpdump -l -n -xx -U -i p1 > p1.pcap], [tcpdump.pid]) -sleep 1 +NETNS_DAEMONIZE([at_ns1], + 
[tcpdump -l -n -xx -U -i p1 -w p1.pcap 2>tcpdump_err], [tcpdump.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump_err]) -dnl The hex dump is a TCP syn packet. pkt=eth/ip/tcp -dnl The packet is sent from p0(at_ns0) interface directed to -dnl p1(at_ns1) interface -NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 f2 00 00 00 00 02 f2 00 00 00 00 01 08 00 45 00 00 28 00 01 00 00 40 06 b0 13 c0 a8 00 0a 0a 00 00 0a 04 00 08 00 00 00 00 c8 00 00 00 00 50 02 20 00 b8 5e 00 00 > /dev/null]) +m4_define([TCP_SYN_PKT], [m4_join([,], + [eth_src=f2:00:00:00:00:01,eth_dst=f2:00:00:00:00:02,eth_type=0x0800], + [nw_src=192.168.0.10,nw_dst=10.0.0.10], + [nw_proto=6,nw_ttl=64,nw_frag=no], + [tcp_src=1024,tcp_dst=2048,tcp_flags=syn])]) -dnl Check the expected nsh encapsulated packet on the egress interface -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0000: *f2ff *0000 *0002 *f2ff *0000 *0001 *894f *0fc6" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0010: *0103 *0012 *34ff *1122 *3344 *0000 *0000 *0000" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0020: *0000 *0000 *0000 *f200 *0000 *0002 *f200 *0000" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0030: *0001 *0800 *4500 *0028 *0001 *0000 *4006 *b013" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0040: *c0a8 *000a *0a00 *000a *0400 *0800 *0000 *00c8" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0050: *0000 *0000 *5002 *2000 *b85e *0000" 2>&1 1>/dev/null]) +dnl Send the TCP SYN packet from p0(at_ns0) interface directed to +dnl p1(at_ns1) interface. 
+NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + $(ovs-ofctl compose-packet --bare 'TCP_SYN_PKT')], [0], [ignore]) + +m4_define([NSH_HEADER], [m4_join([,], + [eth_src=f2:ff:00:00:00:01,eth_dst=f2:ff:00:00:00:02,eth_type=0x894f], + [nsh_ttl=63,nsh_np=3,nsh_spi=0x1234,nsh_si=255], + [nsh_mdtype=1,nsh_c1=0x11223344])]) + +OVS_WAIT_UNTIL([ovs-pcap p1.pcap | grep -q "m4_join([], [^], + $(ovs-ofctl compose-packet --bare 'NSH_HEADER'), + $(ovs-ofctl compose-packet --bare 'TCP_SYN_PKT'), [\$])"]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP @@ -8952,19 +8960,31 @@ dnl The flow will decap a nsh header which in turn carries a TCP syn packet dnl eth/nsh/eth/ip/tcp --> OVS --> eth/ip/tcp AT_CHECK([ovs-ofctl -Oopenflow13 add-flow br0 "table=0,priority=100,in_port=ovs-p0,dl_type=0x894f, actions=decap(),decap(), ovs-p1"]) -NETNS_DAEMONIZE([at_ns1], [tcpdump -l -n -xx -U -i p1 > p1.pcap], [tcpdump.pid]) -sleep 1 +NETNS_DAEMONIZE([at_ns1], + [tcpdump -l -n -xx -U -i p1 -w p1.pcap 2>tcpdump_err], [tcpdump.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump_err]) -dnl The hex dump is NSH packet with TCP syn payload. 
pkt=eth/nsh/eth/ip/tcp -dnl The packet is sent from p0(at_ns0) interface directed to -dnl p1(at_ns1) interface -NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 f2 ff 00 00 00 02 f2 ff 00 00 00 01 89 4f 02 06 01 03 00 00 64 03 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 10 f2 00 00 00 00 02 f2 00 00 00 00 01 08 00 45 00 00 28 00 01 00 00 40 06 b0 13 c0 a8 00 0a 0a 00 00 0a 04 00 08 00 00 00 00 c8 00 00 00 00 50 02 20 00 b8 5e 00 00 > /dev/null]) +m4_define([TCP_SYN_PKT], [m4_join([,], + [eth_src=f2:00:00:00:00:01,eth_dst=f2:00:00:00:00:02,eth_type=0x0800], + [nw_src=192.168.0.10,nw_dst=10.0.0.10], + [nw_proto=6,nw_ttl=64,nw_frag=no], + [tcp_src=1024,tcp_dst=2048,tcp_flags=syn])]) + +m4_define([NSH_HEADER], [m4_join([,], + [eth_src=f2:ff:00:00:00:01,eth_dst=f2:ff:00:00:00:02,eth_type=0x894f], + [nsh_ttl=63,nsh_np=3,nsh_spi=0x1234,nsh_si=255], + [nsh_mdtype=1,nsh_c1=0x11223344])]) + +dnl Send the NSH packet with TCP SYN payload from p0(at_ns0) interface directed +dnl to p1(at_ns1) interface. 
+NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + "$(ovs-ofctl compose-packet --bare 'NSH_HEADER')" \ + "$(ovs-ofctl compose-packet --bare 'TCP_SYN_PKT')"], + [0], [ignore]) dnl Check the expected de-capsulated TCP packet on the egress interface -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0000: *f200 *0000 *0002 *f200 *0000 *0001 *0800 *4500" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0010: *0028 *0001 *0000 *4006 *b013 *c0a8 *000a *0a00" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0020: *000a *0400 *0800 *0000 *00c8 *0000 *0000 *5002" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0030: *2000 *b85e *0000" 2>&1 1>/dev/null]) +OVS_WAIT_UNTIL([ovs-pcap p1.pcap | grep -q \ + "^$(ovs-ofctl compose-packet --bare 'TCP_SYN_PKT')\$"]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP @@ -8984,22 +9004,38 @@ dnl The flow will add another NSH header with nsh_spi=0x101, nsh_si=4, dnl nsh_ttl=7 and change the md1 context AT_CHECK([ovs-ofctl -Oopenflow13 add-flow br0 "table=0,priority=100,in_port=ovs-p0,dl_type=0x894f,nsh_spi=0x100,nsh_si=0x03,actions=decap(),decap(),encap(nsh(md_type=1)),set_field:0x07->nsh_ttl,set_field:0x0101->nsh_spi,set_field:0x04->nsh_si,set_field:0x100f0e0d->nsh_c1,set_field:0x0c0b0a09->nsh_c2,set_field:0x08070605->nsh_c3,set_field:0x04030201->nsh_c4,encap(ethernet),set_field:f2:ff:00:00:00:02->dl_dst,set_field:f2:ff:00:00:00:01->dl_src,ovs-p1"]) -NETNS_DAEMONIZE([at_ns1], [tcpdump -l -n -xx -U -i p1 > p1.pcap], [tcpdump.pid]) -sleep 1 +NETNS_DAEMONIZE([at_ns1], + [tcpdump -l -n -xx -U -i p1 -w p1.pcap 2>tcpdump_err], [tcpdump.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump_err]) -dnl The hex dump is NSH packet with TCP syn payload. 
pkt=eth/nsh/eth/ip/tcp -dnl The nsh_ttl is 8, nsh_spi is 0x100 and nsh_si is 3 -dnl The packet is sent from p0(at_ns0) interface directed to -dnl p1(at_ns1) interface -NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 f2 ff 00 00 00 02 f2 ff 00 00 00 01 89 4f 02 06 01 03 00 01 00 03 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 10 f2 00 00 00 00 02 f2 00 00 00 00 01 08 00 45 00 00 28 00 01 00 00 40 06 b0 13 c0 a8 00 0a 0a 00 00 0a 04 00 08 00 00 00 00 c8 00 00 00 00 50 02 20 00 b8 5e 00 00 > /dev/null]) +m4_define([TCP_SYN_PKT], [m4_join([,], + [eth_src=f2:00:00:00:00:01,eth_dst=f2:00:00:00:00:02,eth_type=0x0800], + [nw_src=192.168.0.10,nw_dst=10.0.0.10], + [nw_proto=6,nw_ttl=64,nw_frag=no], + [tcp_src=1024,tcp_dst=2048,tcp_flags=syn])]) + +m4_define([NSH_HEADER_1], [m4_join([,], + [eth_src=f2:ff:00:00:00:01,eth_dst=f2:ff:00:00:00:02,eth_type=0x894f], + [nsh_ttl=8,nsh_np=3,nsh_spi=0x100,nsh_si=3,nsh_mdtype=1], + [nsh_c1=0x01020304,nsh_c2=0x05060708,nsh_c3=0x090a0b0c,nsh_c4=0x0d0e0f10])]) -dnl Check the expected NSH packet with new fields in the header -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0000: *f2ff *0000 *0002 *f2ff *0000* 0001 *894f *01c6" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0010: *0103 *0001 *0104 *100f *0e0d *0c0b *0a09 *0807" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0020: *0605 *0403 *0201 *f200 *0000 *0002 *f200 *0000" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0030: *0001 *0800 *4500 *0028 *0001 *0000 *4006 *b013" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0040: *c0a8 *000a *0a00 *000a *0400 *0800 *0000 *00c8" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0050: *0000 *0000 *5002 *2000 *b85e *0000" 2>&1 1>/dev/null]) +dnl Send the NSH packet with TCP SYN payload from p0(at_ns0) interface directed +dnl to p1(at_ns1) interface. +dnl The nsh_ttl is 8, nsh_spi is 0x100 and nsh_si is 3. 
+NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + "$(ovs-ofctl compose-packet --bare 'NSH_HEADER_1')" \ + "$(ovs-ofctl compose-packet --bare 'TCP_SYN_PKT')"], + [0], [ignore]) + +m4_define([NSH_HEADER_2], [m4_join([,], + [eth_src=f2:ff:00:00:00:01,eth_dst=f2:ff:00:00:00:02,eth_type=0x894f], + [nsh_ttl=7,nsh_np=3,nsh_spi=0x101,nsh_si=4,nsh_mdtype=1], + [nsh_c1=0x100f0e0d,nsh_c2=0x0c0b0a09,nsh_c3=0x08070605,nsh_c4=0x04030201])]) + +dnl Check the expected NSH packet with new fields in the header. +OVS_WAIT_UNTIL([ovs-pcap p1.pcap | grep -q "m4_join([], [^], + $(ovs-ofctl compose-packet --bare 'NSH_HEADER_2'), + $(ovs-ofctl compose-packet --bare 'TCP_SYN_PKT'), [\$])"]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP @@ -9020,31 +9056,50 @@ dnl packet to to at_ns2. AT_CHECK([ovs-ofctl -Oopenflow13 add-flow br0 "table=0,priority=100,dl_type=0x894f,nsh_spi=0x100,nsh_si=0x02,actions=ovs-p1"]) AT_CHECK([ovs-ofctl -Oopenflow13 add-flow br0 "table=0,priority=100,dl_type=0x894f,nsh_spi=0x100,nsh_si=0x01,actions=ovs-p2"]) -NETNS_DAEMONIZE([at_ns1], [tcpdump -l -n -xx -U -i p1 > p1.pcap], [tcpdump.pid]) -NETNS_DAEMONIZE([at_ns2], [tcpdump -l -n -xx -U -i p2 > p2.pcap], [tcpdump2.pid]) -sleep 1 +NETNS_DAEMONIZE([at_ns1], + [tcpdump -l -n -xx -U -i p1 -w p1.pcap 2>tcpdump_err], [tcpdump.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump_err]) +NETNS_DAEMONIZE([at_ns2], + [tcpdump -l -n -xx -U -i p2 -w p2.pcap 2>tcpdump2_err], [tcpdump2.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump2_err]) + +m4_define([TCP_SYN_PKT], [m4_join([,], + [eth_src=f2:00:00:00:00:01,eth_dst=f2:00:00:00:00:02,eth_type=0x0800], + [nw_src=192.168.0.10,nw_dst=10.0.0.10], + [nw_proto=6,nw_ttl=64,nw_frag=no], + [tcp_src=1024,tcp_dst=2048,tcp_flags=syn])]) + +dnl First send packet from at_ns0 --> OVS with SPI=0x100 and SI=2. 
+m4_define([NSH_HEADER_1], [m4_join([,], + [eth_src=f2:ff:00:00:00:01,eth_dst=f2:ff:00:00:00:02,eth_type=0x894f], + [nsh_ttl=8,nsh_np=3,nsh_spi=0x100,nsh_si=2,nsh_mdtype=1], + [nsh_c1=0x01020304,nsh_c2=0x05060708,nsh_c3=0x090a0b0c,nsh_c4=0x0d0e0f10])]) + +NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + "$(ovs-ofctl compose-packet --bare 'NSH_HEADER_1')" \ + "$(ovs-ofctl compose-packet --bare 'TCP_SYN_PKT')"], + [0], [ignore]) + +dnl Check for the above packet on p1 interface. +OVS_WAIT_UNTIL([ovs-pcap p1.pcap | grep -q "m4_join([], [^], + $(ovs-ofctl compose-packet --bare 'NSH_HEADER_1'), + $(ovs-ofctl compose-packet --bare 'TCP_SYN_PKT'), [\$])"]) + +dnl Send the second packet from at_ns1 --> OVS with SPI=0x100 and SI=1. +m4_define([NSH_HEADER_2], [m4_join([,], + [eth_src=f2:ff:00:00:00:01,eth_dst=f2:ff:00:00:00:02,eth_type=0x894f], + [nsh_ttl=8,nsh_np=3,nsh_spi=0x100,nsh_si=1,nsh_mdtype=1], + [nsh_c1=0x01020304,nsh_c2=0x05060708,nsh_c3=0x090a0b0c,nsh_c4=0x0d0e0f10])]) + +NS_CHECK_EXEC([at_ns1], [$PYTHON3 $srcdir/sendpkt.py p1 \ + "$(ovs-ofctl compose-packet --bare 'NSH_HEADER_2')" \ + "$(ovs-ofctl compose-packet --bare 'TCP_SYN_PKT')"], + [0], [ignore]) -dnl First send packet from at_ns0 --> OVS with SPI=0x100 and SI=2 -NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 f2 ff 00 00 00 02 f2 ff 00 00 00 01 89 4f 02 06 01 03 00 01 00 02 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 10 f2 00 00 00 00 02 f2 00 00 00 00 01 08 00 45 00 00 28 00 01 00 00 40 06 b0 13 c0 a8 00 0a 0a 00 00 0a 04 00 08 00 00 00 00 c8 00 00 00 00 50 02 20 00 b8 5e 00 00 > /dev/null]) - -dnl Check for the above packet on p1 interface -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0000: *f2ff *0000 *0002 *f2ff *0000 *0001 *894f *0206" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0010: *0103 *0001 *0002 *0102 *0304 *0506 *0708 *090a" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0020: *0b0c *0d0e *0f10 *f200 *0000 *0002 *f200 *0000" 2>&1 1>/dev/null]) 
-OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0030: *0001 *0800 *4500 *0028 *0001 *0000 *4006 *b013" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0040: *c0a8 *000a *0a00 *000a *0400 *0800 *0000 *00c8" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0050: *0000 *0000 *5002 *2000 *b85e *0000" 2>&1 1>/dev/null]) - -dnl Send the second packet from at_ns1 --> OVS with SPI=0x100 and SI=1 -NS_CHECK_EXEC([at_ns1], [$PYTHON3 $srcdir/sendpkt.py p1 f2 ff 00 00 00 02 f2 ff 00 00 00 01 89 4f 01 c6 01 03 00 01 00 01 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 10 f2 00 00 00 00 02 f2 00 00 00 00 01 08 00 45 00 00 28 00 01 00 00 40 06 b0 13 c0 a8 00 0a 0a 00 00 0a 04 00 08 00 00 00 00 c8 00 00 00 00 50 02 20 00 b8 5e 00 00 > /dev/null]) - -dnl Check for the above packet on p2 interface -OVS_WAIT_UNTIL([cat p2.pcap | grep -E "0x0000: *f2ff *0000 *0002 *f2ff *0000 *0001 *894f *01c6" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p2.pcap | grep -E "0x0010: *0103 *0001 *0001 *0102 *0304 *0506 *0708 *090a" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p2.pcap | grep -E "0x0020: *0b0c *0d0e *0f10 *f200 *0000 *0002 *f200 *0000" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p2.pcap | grep -E "0x0030: *0001 *0800 *4500 *0028 *0001 *0000 *4006 *b013" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p2.pcap | grep -E "0x0040: *c0a8 *000a *0a00 *000a *0400 *0800 *0000 *00c8" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p2.pcap | grep -E "0x0050: *0000 *0000 *5002 *2000 *b85e *0000" 2>&1 1>/dev/null]) +dnl Check for the above packet on p2 interface. +OVS_WAIT_UNTIL([ovs-pcap p2.pcap | grep -q "m4_join([], [^], + $(ovs-ofctl compose-packet --bare 'NSH_HEADER_2'), + $(ovs-ofctl compose-packet --bare 'TCP_SYN_PKT'), [\$])"]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP From 792e8ee869aa89b639a104879a9e31106b79c17f Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Tue, 28 May 2024 11:34:17 +0300 Subject: [PATCH 736/833] debian: Fix tabs vs spaces. 
Getting the following message while trying to build a debian package. debian/openvswitch-switch.init debian/openvswitch-switch.postinst See above for files that use tabs for indentation. Please use spaces instead. Fix it. Signed-off-by: Roi Dayan Signed-off-by: Simon Horman --- debian/openvswitch-switch.init | 22 +++++++++++----------- debian/openvswitch-switch.postinst | 28 ++++++++++++++-------------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/debian/openvswitch-switch.init b/debian/openvswitch-switch.init index 7b9fbf61e16..96fe1f7c4c6 100755 --- a/debian/openvswitch-switch.init +++ b/debian/openvswitch-switch.init @@ -47,21 +47,21 @@ load_kmod () { start () { if ovs_ctl load-kmod; then - : + : else - echo "Module has probably not been built for this kernel." - echo "Please install Linux 3.3 or later with openvswitch kernel support." + echo "Module has probably not been built for this kernel." + echo "Please install Linux 3.3 or later with openvswitch kernel support." - if test X"$OVS_MISSING_KMOD_OK" = Xyes; then - # We're being invoked by the package postinst. Do not - # fail package installation just because the kernel module - # is not available. - exit 0 - fi + if test X"$OVS_MISSING_KMOD_OK" = Xyes; then + # We're being invoked by the package postinst. Do not + # fail package installation just because the kernel module + # is not available. + exit 0 + fi fi set ovs_ctl ${1-start} --system-id=random if test X"$FORCE_COREFILES" != X; then - set "$@" --force-corefiles="$FORCE_COREFILES" + set "$@" --force-corefiles="$FORCE_COREFILES" fi set "$@" $OVS_CTL_OPTS "$@" || exit $? 
@@ -113,7 +113,7 @@ restart () { case $1 in start) start - ;; + ;; stop | force-stop) stop ;; diff --git a/debian/openvswitch-switch.postinst b/debian/openvswitch-switch.postinst index 042e671d514..1a20a944ecc 100755 --- a/debian/openvswitch-switch.postinst +++ b/debian/openvswitch-switch.postinst @@ -30,20 +30,20 @@ case "$1" in mv "${conffile}.dpkg-bak" "${conffile}" fi - # Ensure that /etc/openvswitch/conf.db links to /var/lib/openvswitch, - # moving an existing file if there is one. - # - # Ditto for .conf.db.~lock~. - for base in conf.db .conf.db.~lock~; do - new=/var/lib/openvswitch/$base - old=/etc/openvswitch/$base - if test -f $old && test ! -e $new; then - mv $old $new - fi - if test ! -e $old && test ! -h $old; then - ln -s $new $old - fi - done + # Ensure that /etc/openvswitch/conf.db links to /var/lib/openvswitch, + # moving an existing file if there is one. + # + # Ditto for .conf.db.~lock~. + for base in conf.db .conf.db.~lock~; do + new=/var/lib/openvswitch/$base + old=/etc/openvswitch/$base + if test -f $old && test ! -e $new; then + mv $old $new + fi + if test ! -e $old && test ! -h $old; then + ln -s $new $old + fi + done ;; abort-upgrade|abort-remove|abort-deconfigure) From 6280f5d04a8daad2ad7c5723da05e92b217877e2 Mon Sep 17 00:00:00 2001 From: Maor Dickman Date: Sun, 26 May 2024 11:31:12 +0300 Subject: [PATCH 737/833] netdev-offload-tc: Reserve lower tc prio for vlan ethertype. The cited commit reserved lower tc priorities for IP ethertypes in order to give IP traffic higher priority than other management traffic. In case of vlan encap traffic, IP traffic will still get lower priority. Fix it by also reserving low priority tc prio for vlan. 
Fixes: c230c7579c14 ("netdev-offload-tc: Reserve lower tc prios for ip ethertypes") Signed-off-by: Maor Dickman Acked-by: Roi Dayan Signed-off-by: Simon Horman --- lib/netdev-offload-tc.c | 2 ++ lib/tc.h | 1 + 2 files changed, 3 insertions(+) diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c index 921d5231777..3be1c08d24f 100644 --- a/lib/netdev-offload-tc.c +++ b/lib/netdev-offload-tc.c @@ -400,6 +400,8 @@ get_next_available_prio(ovs_be16 protocol) return TC_RESERVED_PRIORITY_IPV4; } else if (protocol == htons(ETH_P_IPV6)) { return TC_RESERVED_PRIORITY_IPV6; + } else if (protocol == htons(ETH_P_8021Q)) { + return TC_RESERVED_PRIORITY_VLAN; } } diff --git a/lib/tc.h b/lib/tc.h index fdbcf4b7cb2..8442c8d8b8c 100644 --- a/lib/tc.h +++ b/lib/tc.h @@ -51,6 +51,7 @@ enum tc_flower_reserved_prio { TC_RESERVED_PRIORITY_POLICE, TC_RESERVED_PRIORITY_IPV4, TC_RESERVED_PRIORITY_IPV6, + TC_RESERVED_PRIORITY_VLAN, __TC_RESERVED_PRIORITY_MAX }; #define TC_RESERVED_PRIORITY_MAX (__TC_RESERVED_PRIORITY_MAX -1) From 041d6adeda1b110b0044dee29e5b16262f03bd11 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Thu, 30 May 2024 15:10:09 +0200 Subject: [PATCH 738/833] netdev-dpdk: Fallback to non tunnel checksum offloading. The outer checksum offloading API in DPDK is ambiguous and was implemented by Intel folks in their drivers with the assumption that any outer offloading always goes with an inner offloading request. With net/i40e and net/ice drivers, in the case of encapsulating an ARP packet in a vxlan tunnel (which results in requesting outer ip checksum with a tunnel context but no inner offloading request), a Tx failure is triggered, associated with a port MDD event. 2024-03-27T16:02:07.084Z|00018|dpdk|WARN|ice_interrupt_handler(): OICR: MDD event To avoid this situation, if no checksum or segmentation offloading is requested on the inner part of a packet, fall back to "normal" (non outer) offloading request. 
Reported-at: https://github.com/openvswitch/ovs-issues/issues/321 Fixes: 084c8087292c ("userspace: Support VXLAN and GENEVE TSO.") Fixes: f81d782c1906 ("netdev-native-tnl: Mark all vxlan/geneve packets as tunneled.") Signed-off-by: David Marchand Acked-by: Kevin Traynor Signed-off-by: Kevin Traynor --- lib/netdev-dpdk.c | 71 +++++++++++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 7b84c858e9b..e15b491ed57 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -2583,16 +2583,18 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) struct dp_packet *pkt = CONTAINER_OF(mbuf, struct dp_packet, mbuf); struct tcp_header *th; - const uint64_t all_requests = (RTE_MBUF_F_TX_IP_CKSUM | - RTE_MBUF_F_TX_L4_MASK | - RTE_MBUF_F_TX_OUTER_IP_CKSUM | - RTE_MBUF_F_TX_OUTER_UDP_CKSUM | - RTE_MBUF_F_TX_TCP_SEG); - const uint64_t all_marks = (RTE_MBUF_F_TX_IPV4 | - RTE_MBUF_F_TX_IPV6 | - RTE_MBUF_F_TX_OUTER_IPV4 | - RTE_MBUF_F_TX_OUTER_IPV6 | - RTE_MBUF_F_TX_TUNNEL_MASK); + const uint64_t all_inner_requests = (RTE_MBUF_F_TX_IP_CKSUM | + RTE_MBUF_F_TX_L4_MASK | + RTE_MBUF_F_TX_TCP_SEG); + const uint64_t all_outer_requests = (RTE_MBUF_F_TX_OUTER_IP_CKSUM | + RTE_MBUF_F_TX_OUTER_UDP_CKSUM); + const uint64_t all_requests = all_inner_requests | all_outer_requests; + const uint64_t all_inner_marks = (RTE_MBUF_F_TX_IPV4 | + RTE_MBUF_F_TX_IPV6); + const uint64_t all_outer_marks = (RTE_MBUF_F_TX_OUTER_IPV4 | + RTE_MBUF_F_TX_OUTER_IPV6 | + RTE_MBUF_F_TX_TUNNEL_MASK); + const uint64_t all_marks = all_inner_marks | all_outer_marks; if (!(mbuf->ol_flags & all_requests)) { /* No offloads requested, no marks should be set. */ @@ -2613,34 +2615,43 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) * l2 len and outer l3 len. Inner l2/l3/l4 len are calculated * before. 
*/ const uint64_t tunnel_type = mbuf->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK; - if (tunnel_type == RTE_MBUF_F_TX_TUNNEL_GENEVE || - tunnel_type == RTE_MBUF_F_TX_TUNNEL_VXLAN) { - mbuf->outer_l2_len = (char *) dp_packet_l3(pkt) - - (char *) dp_packet_eth(pkt); - mbuf->outer_l3_len = (char *) dp_packet_l4(pkt) - - (char *) dp_packet_l3(pkt); - - /* If neither inner checksums nor TSO is requested, inner marks - * should not be set. */ - if (!(mbuf->ol_flags & (RTE_MBUF_F_TX_IP_CKSUM | - RTE_MBUF_F_TX_L4_MASK | - RTE_MBUF_F_TX_TCP_SEG))) { - mbuf->ol_flags &= ~(RTE_MBUF_F_TX_IPV4 | - RTE_MBUF_F_TX_IPV6); - } - } else if (OVS_UNLIKELY(tunnel_type)) { + if (OVS_UNLIKELY(tunnel_type && + tunnel_type != RTE_MBUF_F_TX_TUNNEL_GENEVE && + tunnel_type != RTE_MBUF_F_TX_TUNNEL_VXLAN)) { VLOG_WARN_RL(&rl, "%s: Unexpected tunnel type: %#"PRIx64, netdev_get_name(&dev->up), tunnel_type); netdev_dpdk_mbuf_dump(netdev_get_name(&dev->up), "Packet with unexpected tunnel type", mbuf); return false; + } + + if (tunnel_type && (mbuf->ol_flags & all_inner_requests)) { + mbuf->outer_l2_len = (char *) dp_packet_l3(pkt) - + (char *) dp_packet_eth(pkt); + mbuf->outer_l3_len = (char *) dp_packet_l4(pkt) - + (char *) dp_packet_l3(pkt); } else { - mbuf->l2_len = (char *) dp_packet_l3(pkt) - - (char *) dp_packet_eth(pkt); - mbuf->l3_len = (char *) dp_packet_l4(pkt) - - (char *) dp_packet_l3(pkt); + if (tunnel_type) { + /* No inner offload is requested, fallback to non tunnel + * checksum offloads. */ + mbuf->ol_flags &= ~all_inner_marks; + if (mbuf->ol_flags & RTE_MBUF_F_TX_OUTER_IP_CKSUM) { + mbuf->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM; + mbuf->ol_flags |= RTE_MBUF_F_TX_IPV4; + } + if (mbuf->ol_flags & RTE_MBUF_F_TX_OUTER_UDP_CKSUM) { + mbuf->ol_flags |= RTE_MBUF_F_TX_UDP_CKSUM; + mbuf->ol_flags |= mbuf->ol_flags & RTE_MBUF_F_TX_OUTER_IPV4 + ? 
RTE_MBUF_F_TX_IPV4 : RTE_MBUF_F_TX_IPV6; + } + mbuf->ol_flags &= ~(all_outer_requests | all_outer_marks); + } mbuf->outer_l2_len = 0; mbuf->outer_l3_len = 0; + mbuf->l2_len = (char *) dp_packet_l3(pkt) - + (char *) dp_packet_eth(pkt); + mbuf->l3_len = (char *) dp_packet_l4(pkt) - + (char *) dp_packet_l3(pkt); } th = dp_packet_l4(pkt); From 29abd07e4fa6a4c5969f2b3f5d629cfb6d13763a Mon Sep 17 00:00:00 2001 From: David Marchand Date: Thu, 30 May 2024 15:10:10 +0200 Subject: [PATCH 739/833] netdev-dpdk: Disable outer UDP checksum for net/iavf. Same as the commit 6f93d8e62f13 ("netdev-dpdk: Disable outer UDP checksum offload for ice/i40e driver."), disable outer UDP checksum and related offloads for net/iavf. Fixes: 084c8087292c ("userspace: Support VXLAN and GENEVE TSO.") Signed-off-by: David Marchand Acked-by: Kevin Traynor Signed-off-by: Kevin Traynor --- lib/netdev-dpdk.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index e15b491ed57..7c910cac8e5 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -1355,12 +1355,14 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev) } if (!strcmp(info.driver_name, "net_ice") - || !strcmp(info.driver_name, "net_i40e")) { + || !strcmp(info.driver_name, "net_i40e") + || !strcmp(info.driver_name, "net_iavf")) { /* FIXME: Driver advertises the capability but doesn't seem * to actually support it correctly. Can remove this once * the driver is fixed on DPDK side. 
*/ VLOG_INFO("%s: disabled Tx outer udp checksum offloads for a " - "net/ice or net/i40e port.", netdev_get_name(&dev->up)); + "net/ice, net/i40e or net/iavf port.", + netdev_get_name(&dev->up)); info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM; info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO; info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO; From 3d2c8223ab77a2abb662331b1fb31b67a81a5b46 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Thu, 30 May 2024 15:10:11 +0200 Subject: [PATCH 740/833] netdev-dpdk: Fix inner checksum when outer is not supported. If outer checksum is not supported and OVS already set L3/L4 outer checksums in the packet, no outer mark should be left in ol_flags (as it confuses some drivers, like net/ixgbe). l2_len must be adjusted to account for the tunnel header. Fixes: 084c8087292c ("userspace: Support VXLAN and GENEVE TSO.") Signed-off-by: David Marchand Acked-by: Kevin Traynor Signed-off-by: Kevin Traynor --- lib/netdev-dpdk.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 7c910cac8e5..0c624d5d389 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -2628,10 +2628,21 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) } if (tunnel_type && (mbuf->ol_flags & all_inner_requests)) { - mbuf->outer_l2_len = (char *) dp_packet_l3(pkt) - - (char *) dp_packet_eth(pkt); - mbuf->outer_l3_len = (char *) dp_packet_l4(pkt) - - (char *) dp_packet_l3(pkt); + if (mbuf->ol_flags & all_outer_requests) { + mbuf->outer_l2_len = (char *) dp_packet_l3(pkt) - + (char *) dp_packet_eth(pkt); + mbuf->outer_l3_len = (char *) dp_packet_l4(pkt) - + (char *) dp_packet_l3(pkt); + } else { + /* If no outer offloading is requested, clear outer marks. */ + mbuf->ol_flags &= ~all_outer_marks; + mbuf->outer_l2_len = 0; + mbuf->outer_l3_len = 0; + + /* Skip outer headers. 
*/ + mbuf->l2_len += (char *) dp_packet_l4(pkt) - + (char *) dp_packet_eth(pkt); + } } else { if (tunnel_type) { /* No inner offload is requested, fallback to non tunnel From d618d091731cb4d1bea7638e1b4ad893afab4ec5 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Thu, 30 May 2024 15:10:12 +0200 Subject: [PATCH 741/833] netdev-dpdk: Refactor TSO request code. Every L3, L4 checksum offload or TSO requires a (outer) L3 length to be provided. This length is computed via dp_packet_l4(pkt) that is always set when such offloads are requested in OVS. Getting a th == NULL is a bug in OVS, so an assert() is more appropriate. Besides, filling l4_len and tso_segsz only matters to TSO, so there is no need to set it for other L4 checksum offloading requests. Signed-off-by: David Marchand Acked-by: Kevin Traynor Signed-off-by: Kevin Traynor --- lib/netdev-dpdk.c | 36 +++++++++++------------------------- 1 file changed, 11 insertions(+), 25 deletions(-) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 0c624d5d389..0dfd685467b 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -2583,7 +2583,6 @@ static bool netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) { struct dp_packet *pkt = CONTAINER_OF(mbuf, struct dp_packet, mbuf); - struct tcp_header *th; const uint64_t all_inner_requests = (RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_L4_MASK | @@ -2613,6 +2612,8 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) return true; } + ovs_assert(dp_packet_l4(pkt)); + /* If packet is vxlan or geneve tunnel packet, calculate outer * l2 len and outer l3 len. Inner l2/l3/l4 len are calculated * before. 
*/ @@ -2666,22 +2667,10 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) mbuf->l3_len = (char *) dp_packet_l4(pkt) - (char *) dp_packet_l3(pkt); } - th = dp_packet_l4(pkt); if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) { - if (!th) { - VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header" - " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len); - return false; - } - } - - if ((mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK) == RTE_MBUF_F_TX_TCP_CKSUM) { - if (!th) { - VLOG_WARN_RL(&rl, "%s: TCP offloading without L4 header" - " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len); - return false; - } + struct tcp_header *th = dp_packet_l4(pkt); + int hdr_len; if (tunnel_type) { mbuf->tso_segsz = dev->mtu - mbuf->l2_len - mbuf->l3_len - @@ -2691,16 +2680,13 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len; } - if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) { - int hdr_len = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len; - if (OVS_UNLIKELY((hdr_len + - mbuf->tso_segsz) > dev->max_packet_len)) { - VLOG_WARN_RL(&rl, "%s: Oversized TSO packet. hdr: %"PRIu32", " - "gso: %"PRIu32", max len: %"PRIu32"", - dev->up.name, hdr_len, mbuf->tso_segsz, - dev->max_packet_len); - return false; - } + hdr_len = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len; + if (OVS_UNLIKELY((hdr_len + mbuf->tso_segsz) > dev->max_packet_len)) { + VLOG_WARN_RL(&rl, "%s: Oversized TSO packet. hdr: %"PRIu32", " + "gso: %"PRIu32", max len: %"PRIu32"", + dev->up.name, hdr_len, mbuf->tso_segsz, + dev->max_packet_len); + return false; } } From 844a7cfa6eddceea7a8836caa6005f4978ed6e03 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Thu, 30 May 2024 15:10:13 +0200 Subject: [PATCH 742/833] netdev-dpdk: Use guest TSO segmentation size hint. 
In a typical setup like: guest A <-virtio-> OVS A <-vxlan-> OVS B <-virtio-> guest B TSO packets from guest A are segmented against the OVS A physical port mtu adjusted by the vxlan tunnel header size, regardless of guest A interface mtu. As an example, let's say guest A and guest B mtu are set to 1500 bytes. OVS A and OVS B physical ports mtu are set to 1600 bytes. Guest A will request TCP segmentation for 1448 bytes segments. On the other hand, OVS A will request 1498 bytes segments to the HW. This results in OVS B dropping packets because decapsulated packets are larger than the vhost-user port (serving guest B) mtu. 2024-04-17T14:13:01.239Z|00002|netdev_dpdk(pmd-c03/id:7)|WARN|vhost0: Too big size 1564 max_packet_len 1518 vhost-user ports expose a guest mtu by filling mbuf->tso_segsz. Use it as a hint. This may result in segments (on the wire) slightly shorter than the optimal size. Reported-at: https://github.com/openvswitch/ovs-issues/issues/321 Signed-off-by: David Marchand Acked-by: Kevin Traynor Signed-off-by: Kevin Traynor --- lib/netdev-dpdk.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 0dfd685467b..bda3fa94b6c 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -2670,14 +2670,19 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) { struct tcp_header *th = dp_packet_l4(pkt); + uint16_t link_tso_segsz; int hdr_len; if (tunnel_type) { - mbuf->tso_segsz = dev->mtu - mbuf->l2_len - mbuf->l3_len - - mbuf->l4_len - mbuf->outer_l3_len; + link_tso_segsz = dev->mtu - mbuf->l2_len - mbuf->l3_len - + mbuf->l4_len - mbuf->outer_l3_len; } else { mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4; - mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len; + link_tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len; + } + + if (mbuf->tso_segsz > link_tso_segsz) { + mbuf->tso_segsz = link_tso_segsz; } hdr_len = mbuf->l2_len + 
mbuf->l3_len + mbuf->l4_len; From c39a84c131f209e94fd6a98439328f69e1cc1714 Mon Sep 17 00:00:00 2001 From: David Marchand Date: Thu, 30 May 2024 15:10:14 +0200 Subject: [PATCH 743/833] netdev-dpdk: Refactor tunnel checksum offloading. All information required for checksum offloading can be deduced by already tracked dp_packet l3_ofs, l4_ofs, inner_l3_ofs and inner_l4_ofs fields. Remove DPDK specific l[2-4]_len from generic OVS code. netdev-dpdk code then fills mbuf specifics step by step: - outer_l2_len and outer_l3_len are needed for tunneling (and below features), - l2_len and l3_len are needed for IP and L4 checksum (and below features), - l4_len and tso_segsz are needed when doing TSO, Signed-off-by: David Marchand Acked-by: Kevin Traynor Signed-off-by: Kevin Traynor --- lib/dp-packet.h | 37 ------------------------------ lib/netdev-dpdk.c | 35 ++++++++++++++++++----------- lib/netdev-native-tnl.c | 50 +++++------------------------------------ 3 files changed, 27 insertions(+), 95 deletions(-) diff --git a/lib/dp-packet.h b/lib/dp-packet.h index 3622764c47b..a75b1c5cdbb 100644 --- a/lib/dp-packet.h +++ b/lib/dp-packet.h @@ -604,25 +604,6 @@ dp_packet_get_nd_payload(const struct dp_packet *b) } #ifdef DPDK_NETDEV -static inline void -dp_packet_set_l2_len(struct dp_packet *b, size_t l2_len) -{ - b->mbuf.l2_len = l2_len; -} - -static inline void -dp_packet_set_l3_len(struct dp_packet *b, size_t l3_len) -{ - b->mbuf.l3_len = l3_len; -} - -static inline void -dp_packet_set_l4_len(struct dp_packet *b, size_t l4_len) -{ - b->mbuf.l4_len = l4_len; -} - - static inline uint64_t * dp_packet_ol_flags_ptr(const struct dp_packet *b) { @@ -642,24 +623,6 @@ dp_packet_flow_mark_ptr(const struct dp_packet *b) } #else -static inline void -dp_packet_set_l2_len(struct dp_packet *b OVS_UNUSED, size_t l2_len OVS_UNUSED) -{ - /* There is no implementation. 
*/ -} - -static inline void -dp_packet_set_l3_len(struct dp_packet *b OVS_UNUSED, size_t l3_len OVS_UNUSED) -{ - /* There is no implementation. */ -} - -static inline void -dp_packet_set_l4_len(struct dp_packet *b OVS_UNUSED, size_t l4_len OVS_UNUSED) -{ - /* There is no implementation. */ -} - static inline uint32_t * dp_packet_ol_flags_ptr(const struct dp_packet *b) { diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index bda3fa94b6c..0fa37d51456 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -2583,6 +2583,9 @@ static bool netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) { struct dp_packet *pkt = CONTAINER_OF(mbuf, struct dp_packet, mbuf); + void *l2; + void *l3; + void *l4; const uint64_t all_inner_requests = (RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_L4_MASK | @@ -2612,11 +2615,6 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) return true; } - ovs_assert(dp_packet_l4(pkt)); - - /* If packet is vxlan or geneve tunnel packet, calculate outer - * l2 len and outer l3 len. Inner l2/l3/l4 len are calculated - * before. */ const uint64_t tunnel_type = mbuf->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK; if (OVS_UNLIKELY(tunnel_type && tunnel_type != RTE_MBUF_F_TX_TUNNEL_GENEVE && @@ -2634,6 +2632,11 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) (char *) dp_packet_eth(pkt); mbuf->outer_l3_len = (char *) dp_packet_l4(pkt) - (char *) dp_packet_l3(pkt); + + /* Inner L2 length must account for the tunnel header length. */ + l2 = dp_packet_l4(pkt); + l3 = dp_packet_inner_l3(pkt); + l4 = dp_packet_inner_l4(pkt); } else { /* If no outer offloading is requested, clear outer marks. */ mbuf->ol_flags &= ~all_outer_marks; @@ -2641,8 +2644,9 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) mbuf->outer_l3_len = 0; /* Skip outer headers. 
*/ - mbuf->l2_len += (char *) dp_packet_l4(pkt) - - (char *) dp_packet_eth(pkt); + l2 = dp_packet_eth(pkt); + l3 = dp_packet_inner_l3(pkt); + l4 = dp_packet_inner_l4(pkt); } } else { if (tunnel_type) { @@ -2662,22 +2666,27 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) } mbuf->outer_l2_len = 0; mbuf->outer_l3_len = 0; - mbuf->l2_len = (char *) dp_packet_l3(pkt) - - (char *) dp_packet_eth(pkt); - mbuf->l3_len = (char *) dp_packet_l4(pkt) - - (char *) dp_packet_l3(pkt); + + l2 = dp_packet_eth(pkt); + l3 = dp_packet_l3(pkt); + l4 = dp_packet_l4(pkt); } + ovs_assert(l4); + + mbuf->l2_len = (char *) l3 - (char *) l2; + mbuf->l3_len = (char *) l4 - (char *) l3; + if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) { - struct tcp_header *th = dp_packet_l4(pkt); + struct tcp_header *th = l4; uint16_t link_tso_segsz; int hdr_len; + mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4; if (tunnel_type) { link_tso_segsz = dev->mtu - mbuf->l2_len - mbuf->l3_len - mbuf->l4_len - mbuf->outer_l3_len; } else { - mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4; link_tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len; } diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c index d6f46ac4ae1..0f9f07f44b6 100644 --- a/lib/netdev-native-tnl.c +++ b/lib/netdev-native-tnl.c @@ -240,35 +240,15 @@ udp_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl, return udp + 1; } -/* Calculate inner l2 l3 l4 len as tunnel outer header is not - * encapsulated now. */ static void dp_packet_tnl_ol_process(struct dp_packet *packet, const struct ovs_action_push_tnl *data) { - struct udp_header *udp = NULL; - uint8_t opt_len = 0; - struct eth_header *eth = NULL; struct ip_header *ip = NULL; - struct genevehdr *gnh = NULL; - /* l2 l3 l4 len refer to inner len, tunnel outer - * header is not encapsulated here. 
*/ if (dp_packet_hwol_l4_mask(packet)) { ip = dp_packet_l3(packet); - if (ip->ip_proto == IPPROTO_TCP) { - struct tcp_header *th = dp_packet_l4(packet); - dp_packet_set_l4_len(packet, TCP_OFFSET(th->tcp_ctl) * 4); - } else if (ip->ip_proto == IPPROTO_UDP) { - dp_packet_set_l4_len(packet, UDP_HEADER_LEN); - } else if (ip->ip_proto == IPPROTO_SCTP) { - dp_packet_set_l4_len(packet, SCTP_HEADER_LEN); - } - - dp_packet_set_l3_len(packet, (char *) dp_packet_l4(packet) - - (char *) dp_packet_l3(packet)); - if (data->tnl_type == OVS_VPORT_TYPE_GENEVE || data->tnl_type == OVS_VPORT_TYPE_VXLAN) { @@ -279,32 +259,12 @@ dp_packet_tnl_ol_process(struct dp_packet *packet, dp_packet_hwol_set_tx_ipv6(packet); } } + } - /* Attention please, tunnel inner l2 len is consist of udp header - * len and tunnel header len and inner l2 len. */ - if (data->tnl_type == OVS_VPORT_TYPE_GENEVE) { - eth = (struct eth_header *)(data->header); - ip = (struct ip_header *)(eth + 1); - udp = (struct udp_header *)(ip + 1); - gnh = (struct genevehdr *)(udp + 1); - opt_len = gnh->opt_len * 4; - dp_packet_hwol_set_tunnel_geneve(packet); - dp_packet_set_l2_len(packet, (char *) dp_packet_l3(packet) - - (char *) dp_packet_eth(packet) + - GENEVE_BASE_HLEN + opt_len); - } else if (data->tnl_type == OVS_VPORT_TYPE_VXLAN) { - dp_packet_hwol_set_tunnel_vxlan(packet); - dp_packet_set_l2_len(packet, (char *) dp_packet_l3(packet) - - (char *) dp_packet_eth(packet) + - VXLAN_HLEN); - } - } else { - /* Mark non-l4 packets as tunneled. 
*/ - if (data->tnl_type == OVS_VPORT_TYPE_GENEVE) { - dp_packet_hwol_set_tunnel_geneve(packet); - } else if (data->tnl_type == OVS_VPORT_TYPE_VXLAN) { - dp_packet_hwol_set_tunnel_vxlan(packet); - } + if (data->tnl_type == OVS_VPORT_TYPE_GENEVE) { + dp_packet_hwol_set_tunnel_geneve(packet); + } else if (data->tnl_type == OVS_VPORT_TYPE_VXLAN) { + dp_packet_hwol_set_tunnel_vxlan(packet); } } From 2efae58940dae8e61302aeb3892a0256f08e7a9a Mon Sep 17 00:00:00 2001 From: David Marchand Date: Thu, 6 Jun 2024 15:11:11 +0200 Subject: [PATCH 744/833] system-dpdk: Fix socket conflict when starting testpmd. The DPDK telemetry library tries to connect to existing socket files so that it knows whether it can take over them. As was reported by Christian, following a fix in DPDK that got backported in v23.11.1, vhost-user unit tests that have both OVS and testpmd running at the same time reveal a conflict over the telemetry socket. This conflict shows up as an error message in OVS logs which makes those tests fail in the CI: 2024-06-06T13:03:38.351Z|00001|dpdk|ERR|TELEMETRY: Socket write base info to client failed The EAL file-prefix option affects both the directory where DPDK stores running files (like the telemetry socket) and how files backing hugepages are named (when in non --in-memory mode). Configure (again) this prefix so that testpmd runs in a dedicated directory. 
Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2024-June/414545.html Fixes: c488f28a0eaf ("system-dpdk: Don't require hugetlbfs.") Signed-off-by: David Marchand Acked-by: Kevin Traynor Acked-by: Eelco Chaudron --- tests/system-dpdk-macros.at | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system-dpdk-macros.at b/tests/system-dpdk-macros.at index 7cf9bac1700..f8ba7667390 100644 --- a/tests/system-dpdk-macros.at +++ b/tests/system-dpdk-macros.at @@ -102,7 +102,7 @@ m4_define([OVS_DPDK_CHECK_TESTPMD], m4_define([OVS_DPDK_START_TESTPMD], [AT_CHECK([lscpu], [], [stdout]) AT_CHECK([cat stdout | grep "NUMA node(s)" | awk '{c=1; while (c++<$(3)) {printf "512,"}; print "512"}' > NUMA_NODE]) - eal_options="$DPDK_EAL_OPTIONS --in-memory --socket-mem="$(cat NUMA_NODE)" --single-file-segments --no-pci" + eal_options="$DPDK_EAL_OPTIONS --in-memory --socket-mem="$(cat NUMA_NODE)" --single-file-segments --no-pci --file-prefix testpmd" options="$1" test "$options" != "${options%% -- *}" || options="$options -- " eal_options="$eal_options ${options%% -- *}" From f01a768e95e2532674b376b51193ebebb536f87d Mon Sep 17 00:00:00 2001 From: David Marchand Date: Thu, 6 Jun 2024 15:11:12 +0200 Subject: [PATCH 745/833] ci: Restore vhost-user unit tests in check-dpdk. Following a rework in the DPDK cache, the PATH variable is incorrectly set, resulting in dpdk-testpmd not being available. Because of this, vhost-user unit tests were skipped in GHA runs. Fixes: 8893e24d9d09 ("dpdk: Update to use v23.11.") Signed-off-by: David Marchand Acked-by: Kevin Traynor Acked-by: Eelco Chaudron --- .ci/linux-build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/linux-build.sh b/.ci/linux-build.sh index bf9d6241d52..702feeb3bb3 100755 --- a/.ci/linux-build.sh +++ b/.ci/linux-build.sh @@ -25,7 +25,7 @@ function install_dpdk() export PKG_CONFIG_PATH=$DPDK_LIB/pkgconfig/:$PKG_CONFIG_PATH # Expose dpdk binaries. 
- export PATH=$(pwd)/dpdk-dir/build/bin:$PATH + export PATH=$(pwd)/dpdk-dir/bin:$PATH if [ ! -f "${VERSION_FILE}" ]; then echo "Could not find DPDK in $DPDK_INSTALL_DIR" From 35e647051f984eeb305f76524f69a8bbf8c3933a Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Wed, 5 Jun 2024 15:51:38 +0200 Subject: [PATCH 746/833] checkpatch: Don't warn on pointer to pointer. Current regexp used to check whitespaces around operators does not consider that there can be more than one "*" together to express pointer to pointer. As a result, false positive warnings are raised when the patch contains a simple list of pointers, e.g: "char **errrp"). Fix the regexp to allow more than one consecutive "*" characters. Signed-off-by: Adrian Moreno Acked-by: Eelco Chaudron Signed-off-by: Simon Horman --- tests/checkpatch.at | 25 +++++++++++++++++++++++++ utilities/checkpatch.py | 2 +- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/tests/checkpatch.at b/tests/checkpatch.at index caab2817bd9..34971c514ca 100755 --- a/tests/checkpatch.at +++ b/tests/checkpatch.at @@ -353,6 +353,31 @@ try_checkpatch \ if (--mcs->n_refs==0) { " +try_checkpatch \ + "COMMON_PATCH_HEADER + +char *string; + +char **list; + +char ***ptr_list; + " + +try_checkpatch \ + "COMMON_PATCH_HEADER + +char** list; + " \ + "WARNING: Line lacks whitespace around operator + #8 FILE: A.c:1: + char** list; + " + +try_checkpatch \ + "COMMON_PATCH_HEADER + +char*** list; + " \ + "WARNING: Line lacks whitespace around operator + #8 FILE: A.c:1: + char*** list; + " + AT_CLEANUP AT_SETUP([checkpatch - check misuse APIs]) diff --git a/utilities/checkpatch.py b/utilities/checkpatch.py index 6b293770ddd..742a0bc470f 100755 --- a/utilities/checkpatch.py +++ b/utilities/checkpatch.py @@ -739,7 +739,7 @@ def regex_operator_factory(operator): '&=', '^=', '|=', '<<=', '>>=']] \ + [r'[^<" ]<[^=" ]', r'[^\->" ]>[^=" ]', - r'[^ !()/"]\*[^/]', + r'[^ !()/"\*]\*+[^/]', r'[^ !&()"]&', r'[^" +(]\+[^"+;]', r'[^" \-(]\-[^"\->;]', From 
79a3065657b2b47749b0cd38a7c660bdf86a05c0 Mon Sep 17 00:00:00 2001 From: Terry Wilson Date: Wed, 10 Apr 2024 16:38:25 -0500 Subject: [PATCH 747/833] python: ovsdb-idl: Make IndexedRows mirror hmap. The Python IDL code very closely mirrors the C IDL code, which uses an hmap to store table rows. hmap code allows duplicate keys, while IndexedRows, which is derived from DictBase does not. The persistent UUID code can attempt to temporarily add a Row with a duplicate UUID to table.rows, so IndexedRows is modified to behave similarly to the C IDL's hmap implementation. Fixes: 55b9507e6824 ("ovsdb-idl: Add the support to specify the uuid for row insert.") Signed-off-by: Terry Wilson Signed-off-by: Ilya Maximets --- python/ovs/db/custom_index.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/python/ovs/db/custom_index.py b/python/ovs/db/custom_index.py index 587caf5e3e1..3fa03d3c959 100644 --- a/python/ovs/db/custom_index.py +++ b/python/ovs/db/custom_index.py @@ -90,14 +90,21 @@ def index_create(self, name): index = self.indexes[name] = MultiColumnIndex(name) return index + def __getitem__(self, key): + return self.data[key][-1] + def __setitem__(self, key, item): - self.data[key] = item + try: + self.data[key].append(item) + except KeyError: + self.data[key] = [item] for index in self.indexes.values(): index.add(item) def __delitem__(self, key): - val = self.data[key] - del self.data[key] + val = self.data[key].pop() + if len(self.data[key]) == 0: + del self.data[key] for index in self.indexes.values(): index.remove(val) From d401291955696b26688c194b64902ebe4cd48bd8 Mon Sep 17 00:00:00 2001 From: Terry Wilson Date: Wed, 10 Apr 2024 16:38:26 -0500 Subject: [PATCH 748/833] python: ovsdb-idl: Convert new_uuid insert() arg to UUID. The argument to insert() should be a uuid.UUID object. If it isn't then a Row is created with a string uuid attribute and that row is added to table.rows with a string key instead of a UUID key. 
Fixes: 55b9507e6824 ("ovsdb-idl: Add the support to specify the uuid for row insert.") Signed-off-by: Terry Wilson Signed-off-by: Ilya Maximets --- python/ovs/db/idl.py | 2 +- tests/test-ovsdb.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ovs/db/idl.py b/python/ovs/db/idl.py index c1341fc2a1f..1a4bd3bf982 100644 --- a/python/ovs/db/idl.py +++ b/python/ovs/db/idl.py @@ -1854,7 +1854,7 @@ def commit(self): if row._data is None: op["op"] = "insert" if row._persist_uuid: - op["uuid"] = row.uuid + op["uuid"] = str(row.uuid) else: op["uuid-name"] = _uuid_name_from_uuid(row.uuid) diff --git a/tests/test-ovsdb.py b/tests/test-ovsdb.py index 48f8ee2d704..6307aa2bdec 100644 --- a/tests/test-ovsdb.py +++ b/tests/test-ovsdb.py @@ -434,7 +434,7 @@ def notify(event, row, updates=None): sys.stderr.write('"set" command requires 2 argument\n') sys.exit(1) - s = txn.insert(idl.tables["simple"], new_uuid=args[0], + s = txn.insert(idl.tables["simple"], new_uuid=uuid.UUID(args[0]), persist_uuid=True) s.i = int(args[1]) elif name == "delete": From fad8c8f7f651796fe20001d9d2679b43ed44adf8 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 27 May 2024 23:39:06 +0200 Subject: [PATCH 749/833] python: idl: Fix index not being updated on row modification. When a row is modified, python IDL doesn't perform any operations on existing client-side indexes. This means that if the column on which index is created changes, the old value will remain in the index and the new one will not be added to the index. 
Besides lookup failures, this also makes it impossible to remove modified rows, because the new column value doesn't exist in the index, causing an exception on an attempt to remove it: Traceback (most recent call last): File "ovsdbapp/backend/ovs_idl/connection.py", line 110, in run self.idl.run() File "ovs/db/idl.py", line 465, in run self.__parse_update(msg.params[2], OVSDB_UPDATE3) File "ovs/db/idl.py", line 924, in __parse_update self.__do_parse_update(update, version, self.tables) File "ovs/db/idl.py", line 964, in __do_parse_update changes = self.__process_update2(table, uuid, row_update) File "ovs/db/idl.py", line 991, in __process_update2 del table.rows[uuid] File "ovs/db/custom_index.py", line 102, in __delitem__ index.remove(val) File "ovs/db/custom_index.py", line 66, in remove self.values.remove(self.index_entry_from_row(row)) File "sortedcontainers/sortedlist.py", line 2015, in remove raise ValueError('{0!r} not in list'.format(value)) ValueError: Datapath_Binding( uuid=UUID('498e66a2-70bc-4587-a66f-0433baf82f60'), tunnel_key=16711683, load_balancers=[], external_ids={}) not in list Fix that by always removing an existing row from indexes before modification and adding it back afterwards. This ensures that old values are removed from the index and new ones are added. This behavior is consistent with the C implementation. A new test that reproduces the removal issue is added. Some extra testing infrastructure is also added to be able to handle and print out the 'indexed' table from the idltest schema.
Fixes: 13973bc41524 ("Add multi-column index support for the Python IDL") Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2024-May/053159.html Reported-by: Roberto Bartzen Acosta Acked-by: Mike Pattrick Acked-by: Dumitru Ceara Acked-by: Terry Wilson Signed-off-by: Ilya Maximets --- python/ovs/db/idl.py | 13 ++++-- tests/ovsdb-idl.at | 95 +++++++++++++++++++++++++++++++++++++++++++- tests/test-ovsdb.c | 43 ++++++++++++++++++++ tests/test-ovsdb.py | 15 +++++++ 4 files changed, 160 insertions(+), 6 deletions(-) diff --git a/python/ovs/db/idl.py b/python/ovs/db/idl.py index 1a4bd3bf982..b6d5ed6972a 100644 --- a/python/ovs/db/idl.py +++ b/python/ovs/db/idl.py @@ -1013,7 +1013,9 @@ def __process_update2(self, table, uuid, row_update): if not row: raise error.Error('Modify non-existing row') + del table.rows[uuid] old_row = self.__apply_diff(table, row, row_update['modify']) + table.rows[uuid] = row return Notice(ROW_UPDATE, row, Row(self, table, uuid, old_row)) else: raise error.Error(' unknown operation', @@ -1044,9 +1046,10 @@ def __process_update(self, table, uuid, old, new): op = ROW_UPDATE vlog.warn("cannot add existing row %s to table %s" % (uuid, table.name)) + del table.rows[uuid] + changed |= self.__row_update(table, row, new) - if op == ROW_CREATE: - table.rows[uuid] = row + table.rows[uuid] = row if changed: return Notice(ROW_CREATE, row) else: @@ -1058,9 +1061,11 @@ def __process_update(self, table, uuid, old, new): # XXX rate-limit vlog.warn("cannot modify missing row %s in table %s" % (uuid, table.name)) + else: + del table.rows[uuid] + changed |= self.__row_update(table, row, new) - if op == ROW_CREATE: - table.rows[uuid] = row + table.rows[uuid] = row if changed: return Notice(op, row, Row.from_json(self, table, uuid, old)) return False diff --git a/tests/ovsdb-idl.at b/tests/ovsdb-idl.at index 97162707eb4..b9dc0bdea1a 100644 --- a/tests/ovsdb-idl.at +++ b/tests/ovsdb-idl.at @@ -167,8 +167,17 @@ 
m4_define([OVSDB_CHECK_IDL_REGISTER_COLUMNS_PY], OVSDB_START_IDLTEST m4_if([$2], [], [], [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])]) - AT_CHECK([$PYTHON3 $srcdir/test-ovsdb.py -t10 idl $srcdir/idltest.ovsschema unix:socket ?simple:b,ba,i,ia,r,ra,s,sa,u,ua?simple3:name,uset,uref?simple4:name?simple6:name,weak_ref?link1:i,k,ka,l2?link2:i,l1?singleton:name $3], - [0], [stdout], [ignore]) + m4_define([REGISTER], m4_joinall([?], [], + [simple:b,ba,i,ia,r,ra,s,sa,u,ua], + [simple3:name,uset,uref], + [simple4:name], + [simple6:name,weak_ref], + [link1:i,k,ka,l2], + [link2:i,l1], + [indexed:i], + [singleton:name])) + AT_CHECK([$PYTHON3 $srcdir/test-ovsdb.py -t10 idl $srcdir/idltest.ovsschema \ + unix:socket REGISTER $3], [0], [stdout], [ignore]) AT_CHECK([sort stdout | uuidfilt]m4_if([$6],,, [[| $6]]), [0], [$4]) OVSDB_SERVER_SHUTDOWN @@ -747,6 +756,31 @@ OVSDB_CHECK_IDL([simple idl, conditional, multiple tables], 009: done ]]) +OVSDB_CHECK_IDL([indexed idl, modification and removal], + [], + [['["idltest", + {"op": "insert", + "table": "indexed", + "row": {"i": 123 }}]' \ + '["idltest", + {"op": "update", + "table": "indexed", + "where": [["i", "==", 123]], + "row": {"i": 456}}]' \ + '["idltest", + {"op": "delete", + "table": "indexed", + "where": [["i", "==", 456]]}]']], + [[000: empty +001: {"error":null,"result":[{"uuid":["uuid","<0>"]}]} +002: table indexed: i=123 uuid=<0> +003: {"error":null,"result":[{"count":1}]} +004: table indexed: i=456 uuid=<0> +005: {"error":null,"result":[{"count":1}]} +006: empty +007: done +]]) + OVSDB_CHECK_IDL([self-linking idl, consistent ops], [], [['["idltest", @@ -1288,6 +1322,33 @@ OVSDB_CHECK_IDL_TRACK([track, simple idl, initially populated], 003: done ]]) +OVSDB_CHECK_IDL_TRACK([track, indexed idl, modification and removal], + [], + [['["idltest", + {"op": "insert", + "table": "indexed", + "row": {"i": 123 }}]' \ + '["idltest", + {"op": "update", + "table": "indexed", + "where": [["i", "==", 123]], 
+ "row": {"i": 456}}]' \ + '["idltest", + {"op": "delete", + "table": "indexed", + "where": [["i", "==", 456]]}]']], + [[000: empty +001: {"error":null,"result":[{"uuid":["uuid","<0>"]}]} +002: table indexed: inserted row: i=123 uuid=<0> +002: table indexed: updated columns: i +003: {"error":null,"result":[{"count":1}]} +004: table indexed: i=456 uuid=<0> +004: table indexed: updated columns: i +005: {"error":null,"result":[{"count":1}]} +006: empty +007: done +]]) + dnl This test creates database with weak references and checks that orphan dnl rows created for weak references are not available for iteration via dnl list of tracked changes. @@ -2036,6 +2097,36 @@ OVSDB_CHECK_IDL_NOTIFY([simple idl verify notify], 015: done ]]) +OVSDB_CHECK_IDL_NOTIFY([indexed idl, modification and removal notify], + [['track-notify' \ + '["idltest", + {"op": "insert", + "table": "indexed", + "row": {"i": 123 }}]' \ + '["idltest", + {"op": "update", + "table": "indexed", + "where": [["i", "==", 123]], + "row": {"i": 456}}]' \ + '["idltest", + {"op": "delete", + "table": "indexed", + "where": [["i", "==", 456]]}]']], + [[000: empty +000: event:create, row={}, uuid=<0>, updates=None +000: event:create, row={}, uuid=<1>, updates=None +001: {"error":null,"result":[{"uuid":["uuid","<2>"]}]} +002: event:create, row={i=123}, uuid=<2>, updates=None +002: table indexed: i=123 uuid=<2> +003: {"error":null,"result":[{"count":1}]} +004: event:update, row={i=456}, uuid=<2>, updates={i=123} +004: table indexed: i=456 uuid=<2> +005: {"error":null,"result":[{"count":1}]} +006: empty +006: event:delete, row={i=456}, uuid=<2>, updates=None +007: done +]]) + # Tests to verify the functionality of the one column compound index. # It tests index for one column string and integer indexes. 
# The run of test-ovsdb generates the output of the display of data using the different indexes defined in diff --git a/tests/test-ovsdb.c b/tests/test-ovsdb.c index c4ab899d459..41c1525f451 100644 --- a/tests/test-ovsdb.c +++ b/tests/test-ovsdb.c @@ -2023,6 +2023,24 @@ print_idl_row_updated_link2(const struct idltest_link2 *l2, int step) } } +static void +print_idl_row_updated_indexed(const struct idltest_indexed *ind, int step) +{ + struct ds updates = DS_EMPTY_INITIALIZER; + + for (size_t i = 0; i < IDLTEST_INDEXED_N_COLUMNS; i++) { + if (idltest_indexed_is_updated(ind, i)) { + ds_put_format(&updates, " %s", idltest_indexed_columns[i].name); + } + } + if (updates.length) { + print_and_log("%03d: table %s: updated columns:%s", + step, ind->header_.table->class_->name, + ds_cstr(&updates)); + ds_destroy(&updates); + } +} + static void print_idl_row_updated_simple3(const struct idltest_simple3 *s3, int step) { @@ -2172,6 +2190,21 @@ print_idl_row_link2(const struct idltest_link2 *l2, int step, bool terse) print_idl_row_updated_link2(l2, step); } +static void +print_idl_row_indexed(const struct idltest_indexed *ind, int step, bool terse) +{ + struct ds msg = DS_EMPTY_INITIALIZER; + + ds_put_format(&msg, "i=%"PRId64, ind->i); + + char *row_msg = format_idl_row(&ind->header_, step, ds_cstr(&msg), terse); + print_and_log("%s", row_msg); + ds_destroy(&msg); + free(row_msg); + + print_idl_row_updated_indexed(ind, step); +} + static void print_idl_row_simple3(const struct idltest_simple3 *s3, int step, bool terse) { @@ -2252,6 +2285,7 @@ print_idl_row_singleton(const struct idltest_singleton *sng, int step, static void print_idl(struct ovsdb_idl *idl, int step, bool terse) { + const struct idltest_indexed *ind; const struct idltest_simple3 *s3; const struct idltest_simple4 *s4; const struct idltest_simple6 *s6; @@ -2285,6 +2319,10 @@ print_idl(struct ovsdb_idl *idl, int step, bool terse) print_idl_row_simple6(s6, step, terse); n++; } + IDLTEST_INDEXED_FOR_EACH (ind, idl) 
{ + print_idl_row_indexed(ind, step, terse); + n++; + } IDLTEST_SINGLETON_FOR_EACH (sng, idl) { print_idl_row_singleton(sng, step, terse); n++; @@ -2297,6 +2335,7 @@ print_idl(struct ovsdb_idl *idl, int step, bool terse) static void print_idl_track(struct ovsdb_idl *idl, int step, bool terse) { + const struct idltest_indexed *ind; const struct idltest_simple3 *s3; const struct idltest_simple4 *s4; const struct idltest_simple6 *s6; @@ -2329,6 +2368,10 @@ print_idl_track(struct ovsdb_idl *idl, int step, bool terse) print_idl_row_simple6(s6, step, terse); n++; } + IDLTEST_INDEXED_FOR_EACH (ind, idl) { + print_idl_row_indexed(ind, step, terse); + n++; + } if (!n) { print_and_log("%03d: empty", step); diff --git a/tests/test-ovsdb.py b/tests/test-ovsdb.py index 6307aa2bdec..67a45f044b8 100644 --- a/tests/test-ovsdb.py +++ b/tests/test-ovsdb.py @@ -228,6 +228,10 @@ def get_link2_table_printable_row(row): return s +def get_indexed_table_printable_row(row): + return "i=%s" % row.i + + def get_singleton_table_printable_row(row): return "name=%s" % row.name @@ -307,6 +311,14 @@ def print_idl(idl, step, terse=False): terse) n += 1 + if "indexed" in idl.tables: + ind = idl.tables["indexed"].rows + for row in ind.values(): + print_row("indexed", row, step, + get_indexed_table_printable_row(row), + terse) + n += 1 + if "singleton" in idl.tables: sng = idl.tables["singleton"].rows for row in sng.values(): @@ -690,6 +702,9 @@ def do_idl(schema_file, remote, *commands): idl = ovs.db.idl.Idl(remote, schema_helper, leader_only=False) if "simple3" in idl.tables: idl.index_create("simple3", "simple3_by_name") + if "indexed" in idl.tables: + idx = idl.index_create("indexed", "indexed_by_i") + idx.add_column("i") if commands: remotes = remote.split(',') From 2c1a432e2f089f54c4aa395befbc6c2f07f0d305 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Thu, 6 Jun 2024 17:15:46 +0200 Subject: [PATCH 750/833] python: ovs: flow: Fix nested check_pkt_len acts. 
Add check_pkt_len action to the decoder list that it, itself, uses. This makes nested check_pkt_len (i.e:a check_pkt_len inside another) work. Fixes: 076663b31edc ("python: Add ovs datapath flow parsing.") Reported-by: Ilya Maximets Acked-by: Eelco Chaudron Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- python/ovs/flow/odp.py | 43 ++++++++++++++++++------------------ python/ovs/tests/test_odp.py | 29 ++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 21 deletions(-) diff --git a/python/ovs/flow/odp.py b/python/ovs/flow/odp.py index 7d9b165d464..a8f8c067a90 100644 --- a/python/ovs/flow/odp.py +++ b/python/ovs/flow/odp.py @@ -365,29 +365,30 @@ def _action_decoders_args(): is_list=True, ) - return { - **_decoders, - "check_pkt_len": nested_kv_decoder( - KVDecoders( - { - "size": decode_int, - "gt": nested_kv_decoder( - KVDecoders( - decoders=_decoders, - default_free=decode_free_output, - ), - is_list=True, + _decoders["check_pkt_len"] = nested_kv_decoder( + KVDecoders( + { + "size": decode_int, + "gt": nested_kv_decoder( + KVDecoders( + decoders=_decoders, + default_free=decode_free_output, ), - "le": nested_kv_decoder( - KVDecoders( - decoders=_decoders, - default_free=decode_free_output, - ), - is_list=True, + is_list=True, + ), + "le": nested_kv_decoder( + KVDecoders( + decoders=_decoders, + default_free=decode_free_output, ), - } - ) - ), + is_list=True, + ), + } + ) + ) + + return { + **_decoders, } @staticmethod diff --git a/python/ovs/tests/test_odp.py b/python/ovs/tests/test_odp.py index f19ec386e8e..d514e9be32d 100644 --- a/python/ovs/tests/test_odp.py +++ b/python/ovs/tests/test_odp.py @@ -541,6 +541,35 @@ def test_odp_fields(input_string, expected): ), ], ), + ( + "actions:check_pkt_len(size=200,gt(check_pkt_len(size=400,gt(4),le(2))),le(check_pkt_len(size=100,gt(1),le(drop))))", # noqa: E501 + [ + KeyValue( + "check_pkt_len", + { + "size": 200, + "gt": [ + { + "check_pkt_len": { + "size": 400, + "gt": [{"output": {"port": 4}}], + 
"le": [{"output": {"port": 2}}], + } + } + ], + "le": [ + { + "check_pkt_len": { + "size": 100, + "gt": [{"output": {"port": 1}}], + "le": [{"drop": True}], + } + } + ], + }, + ) + ], + ), ( "actions:meter(1),hash(l4(0))", [ From 85d19a5edd160a91b1407561cc49296380663b61 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 10 Jun 2024 23:18:14 +0200 Subject: [PATCH 751/833] appveyor: Fix caching of OpenSSL installer. Apparently, if the cache dependency is specified, the cache folder is not checked at the end of a build and so the cache is never updated unless we change appveyor.yml. This makes the cache to not actually work, because on each build we discover that the installer is outdated, download the new one and it is not uploaded to the cache, so it is still outdated on the next build. Removing the dependency to get a normal cache behavior. We're manually comparing the hash of the cached binary with the most latest one, so we will still catch any OpenSSL updates, but now we will also upload the updated cache back. Fixes: 9d8208484a35 ("appveyor: Build with OpenSSL 3.0.") Reported-at: https://help.appveyor.com/discussions/problems/36144-cache-reports-up-to-date-while-it-is-not Acked-by: Alin-Gabriel Serdean Signed-off-by: Ilya Maximets --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index d11e4639989..d0293b2118c 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -15,7 +15,7 @@ init: -Value "C:\Python312-x64\python.exe" cache: -- C:\ovs-build-downloads -> appveyor.yml +- C:\ovs-build-downloads install: - ps: | From c00dd0c9445c8bf1a31e4f75d6ca1551ad2e9349 Mon Sep 17 00:00:00 2001 From: Kevin Traynor Date: Thu, 13 Jun 2024 10:05:00 +0100 Subject: [PATCH 752/833] dpdk: Check other_config:dpdk-extra for '--lcores'. Currently dpdk lcore args for DPDK EAL init can be generated or they can be written directly by the user through dpdk-extra. 
If dpdk-extra does not contain '-l' or '-c', a '-l' argument with a core list for DPDK EAL init will be generated. The '--lcores' argument should also be checked, as if it is used in dpdk-extra, currently a '-l' is still generated and that causes DPDK EAL init to fail: |00009|dpdk|INFO|EAL ARGS: ovs-vswitchd --lcores 0@18 --in-memory -l 0. |00012|dpdk|ERR|EAL: Option -l is ignored, because (--lcore) is set! Add check for '--lcores' in dpdk-extra config and don't generate '-l' if it is detected: |00009|dpdk|INFO|EAL ARGS: ovs-vswitchd --lcores 0@8 --in-memory. Fixes: 543342a41cbc ("DPDK: add support for v2.0.0") Signed-off-by: Kevin Traynor Reviewed-by: David Marchand Acked-by: Eelco Chaudron --- lib/dpdk.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/dpdk.c b/lib/dpdk.c index d76d53f8f16..940c43c070b 100644 --- a/lib/dpdk.c +++ b/lib/dpdk.c @@ -337,7 +337,9 @@ dpdk_init__(const struct smap *ovs_other_config) } #endif - if (args_contains(&args, "-c") || args_contains(&args, "-l")) { + if (args_contains(&args, "-c") || + args_contains(&args, "-l") || + args_contains(&args, "--lcores")) { auto_determine = false; } From 8b405f45d5f1fd112d19526e609e6669038f8974 Mon Sep 17 00:00:00 2001 From: Martin Kalcok Date: Fri, 7 Jun 2024 11:30:55 +0200 Subject: [PATCH 753/833] ovsdb-client: Add "COLUMN" arg to help for 'dump'. Help text for 'ovsdb-client dump' does not mention that it's capable of dumping a specific column's contents if the user supplies the column's name as a fourth positional argument. 
Signed-off-by: Martin Kalcok Signed-off-by: Simon Horman --- ovsdb/ovsdb-client.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ovsdb/ovsdb-client.c b/ovsdb/ovsdb-client.c index cf2ecfd08a9..b7b189c7e41 100644 --- a/ovsdb/ovsdb-client.c +++ b/ovsdb/ovsdb-client.c @@ -451,9 +451,9 @@ usage(void) " wait until DATABASE reaches STATE " "(\"added\" or \"connected\" or \"removed\")\n" " in DATBASE on SERVER.\n" - "\n dump [SERVER] [DATABASE] [TABLE]\n" - " dump contents of TABLE (or all tables) in DATABASE on SERVER\n" - " to stdout\n" + "\n dump [SERVER] [DATABASE] [TABLE [COLUMN]...]\n" + " dump contents of COLUMNs, TABLE (or all tables) in DATABASE\n" + " on SERVER to stdout\n" "\n backup [SERVER] [DATABASE] > SNAPSHOT\n" " dump database contents in the form of a database file\n" "\n [--force] restore [SERVER] [DATABASE] < SNAPSHOT\n" From 24907bd1bc1a261d3647e7224d86149f313a8774 Mon Sep 17 00:00:00 2001 From: Martin Kalcok Date: Fri, 7 Jun 2024 13:53:30 +0200 Subject: [PATCH 754/833] ovsdb-client: Document "--timeout" option in help. Add information about "-t" and "--timeout" options for ovsdb-client. The option is documented in the "Other options" section, similar to how "ovs-appctl" has it. Signed-off-by: Martin Kalcok Signed-off-by: Simon Horman --- ovsdb/ovsdb-client.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ovsdb/ovsdb-client.c b/ovsdb/ovsdb-client.c index b7b189c7e41..45501911c30 100644 --- a/ovsdb/ovsdb-client.c +++ b/ovsdb/ovsdb-client.c @@ -474,6 +474,8 @@ usage(void) vlog_usage(); ovs_replay_usage(); printf("\nOther options:\n" + " -t, --timeout=SECS limits ovsdb-client runtime to\n" + " approximately SECS seconds.\n" " -h, --help display this help message\n" " -V, --version display version information\n"); exit(EXIT_SUCCESS); From 6b09799f03548f0377788f18160a3d9db3eac752 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Wed, 19 Jun 2024 09:19:17 -0400 Subject: [PATCH 755/833] ofp-prop: Fix unaligned 128 bit access. 
When compiling with '-fsanitize=address,undefined', the "ovs-ofctl ct-flush" test will yield the following undefined behavior flagged by UBSan. This problem is caused by the fact that 128bit property put/parse functions weren't adding appropriate padding before writing or reading the value. This patch uses get_32aligned_* functions to copy the bytes as they are aligned. lib/ofp-prop.c:277:14: runtime error: load of misaligned address 0x60600000687c for type 'union ovs_be128', which requires 8 byte alignment 0x60600000687c: note: pointer points here 00 05 00 14 00 00 00 00 00 00 00 00 00 00 00 00 00 ff ab 00 ^ 0: in ofpprop_parse_u128 lib/ofp-prop.c:277 1: in ofp_ct_match_decode lib/ofp-ct.c:525 2: in ofp_print_nxt_ct_flush lib/ofp-print.c:959 3: in ofp_to_string__ lib/ofp-print.c:1206 4: in ofp_to_string lib/ofp-print.c:1264 5: in ofp_print lib/ofp-print.c:1308 6: in ofctl_ofp_print utilities/ovs-ofctl.c:4899 7: in ovs_cmdl_run_command__ lib/command-line.c:247 8: in ovs_cmdl_run_command lib/command-line.c:278 9: in main utilities/ovs-ofctl.c:186 Fixes: 62c5d32ad4ab ("ofp-prop: Add helper for parsing and storing of ovs_u128.") Acked-by: Ales Musil Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/ofp-prop.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/lib/ofp-prop.c b/lib/ofp-prop.c index 0a685750c17..0e54543bdd8 100644 --- a/lib/ofp-prop.c +++ b/lib/ofp-prop.c @@ -21,6 +21,7 @@ #include "openvswitch/ofp-errors.h" #include "openvswitch/ofp-prop.h" #include "openvswitch/vlog.h" +#include "unaligned.h" #include "util.h" #include "uuid.h" @@ -190,11 +191,12 @@ ofpprop_parse_be64(const struct ofpbuf *property, ovs_be64 *value) enum ofperr ofpprop_parse_be128(const struct ofpbuf *property, ovs_be128 *value) { - ovs_be128 *p = property->msg; + ovs_32aligned_be128 *p = property->msg; + if (ofpbuf_msgsize(property) != sizeof *p) { return OFPERR_OFPBPC_BAD_LEN; } - *value = *p; + *value = get_32aligned_be128(p); return 
0; } @@ -270,12 +272,13 @@ ofpprop_parse_u64(const struct ofpbuf *property, uint64_t *value) enum ofperr ofpprop_parse_u128(const struct ofpbuf *property, ovs_u128 *value) { - ovs_be128 *p = property->msg; - if (ofpbuf_msgsize(property) != sizeof *p) { - return OFPERR_OFPBPC_BAD_LEN; + enum ofperr error = ofpprop_parse_be128(property, (ovs_be128 *) value); + + if (!error) { + *value = ntoh128(*(ovs_be128 *) value); } - *value = ntoh128(*p); - return 0; + + return error; } /* Attempts to parse 'property' as a property containing a UUID. If From 2f196c80e7165150d956fc50de0db58b8964ee2e Mon Sep 17 00:00:00 2001 From: David Marchand Date: Wed, 19 Jun 2024 18:00:55 +0200 Subject: [PATCH 756/833] netdev-dpdk: Use LSC interrupt mode. Querying link status may get delayed for an indeterminate (long) time with mlx5 ports. This is a consequence of the mlx5 driver calling ethtool kernel API and getting stuck on the kernel RTNL lock while some other operation is in progress under this lock. One impact for long link status query is that it is called under the bond lock taken in write mode periodically in bond_run(). In parallel, datapath threads may block requesting to read bonding related info (like for example in bond_check_admissibility()). The LSC interrupt mode is available with many DPDK drivers and is used by default with testpmd. It seems safe enough to switch on this feature by default in OVS. We keep the per interface option to disable this feature in case of an unforeseen bug. 
Signed-off-by: David Marchand Reviewed-by: Robin Jarry Acked-by: Mike Pattrick Acked-by: Maxime Coquelin Acked-by: Kevin Traynor Acked-by: Aaron Conole --- Documentation/topics/dpdk/phy.rst | 4 ++-- NEWS | 3 +++ lib/netdev-dpdk.c | 13 ++++++++++++- vswitchd/vswitch.xml | 8 ++++---- 4 files changed, 21 insertions(+), 7 deletions(-) diff --git a/Documentation/topics/dpdk/phy.rst b/Documentation/topics/dpdk/phy.rst index efd168cba80..eefc25613d2 100644 --- a/Documentation/topics/dpdk/phy.rst +++ b/Documentation/topics/dpdk/phy.rst @@ -546,8 +546,8 @@ the firmware every time to fulfil this request. Note that not all PMD drivers support LSC interrupts. -The default configuration is polling mode. To set interrupt mode, option -``dpdk-lsc-interrupt`` has to be set to ``true``. +The default configuration is interrupt mode. To set polling mode, option +``dpdk-lsc-interrupt`` has to be set to ``false``. Command to set interrupt mode for a specific interface:: $ ovs-vsctl set interface options:dpdk-lsc-interrupt=true diff --git a/NEWS b/NEWS index 5ae0108d552..d05f2d0f89e 100644 --- a/NEWS +++ b/NEWS @@ -9,6 +9,9 @@ Post-v3.3.0 https://github.com/openvswitch/ovs.git - DPDK: * OVS validated with DPDK 23.11.1. + * Link status changes are now handled via interrupt mode if the DPDK + driver supports it. It is possible to revert to polling mode by setting + per interface 'options:dpdk-lsc-interrupt' to 'false'. 
v3.3.0 - 16 Feb 2024 diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 0fa37d51456..76b8e76ece5 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -2397,7 +2397,18 @@ netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args, } } - lsc_interrupt_mode = smap_get_bool(args, "dpdk-lsc-interrupt", false); + lsc_interrupt_mode = smap_get_bool(args, "dpdk-lsc-interrupt", true); + if (lsc_interrupt_mode && !(*info.dev_flags & RTE_ETH_DEV_INTR_LSC)) { + if (smap_get(args, "dpdk-lsc-interrupt")) { + VLOG_WARN_BUF(errp, "'%s': link status interrupt is not " + "supported.", netdev_get_name(netdev)); + err = EINVAL; + goto out; + } + VLOG_DBG("'%s': not enabling link status interrupt.", + netdev_get_name(netdev)); + lsc_interrupt_mode = false; + } if (dev->requested_lsc_interrupt_mode != lsc_interrupt_mode) { dev->requested_lsc_interrupt_mode = lsc_interrupt_mode; netdev_request_reconfigure(netdev); diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 8a1b607d71b..e3afb78a4e5 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -4647,12 +4647,12 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \

      - Set this value to true to configure interrupt mode for - Link State Change (LSC) detection instead of poll mode for the DPDK - interface. + Set this value to false to configure poll mode for + Link State Change (LSC) detection instead of interrupt mode for the + DPDK interface.

      - If this value is not set, poll mode is configured. + If this value is not set, interrupt mode is configured.

      This parameter has an effect only on netdev dpdk interfaces. From 3f4df4c7bfe4ecd662a31a00a89eb990752c9879 Mon Sep 17 00:00:00 2001 From: Emma Finn Date: Wed, 12 Jun 2024 10:44:23 +0000 Subject: [PATCH 757/833] odp-execute: Set IPv6 traffic class in AVX implementation. The AVX implementation for the IPv6 action did not set traffic class field. Adding support for this field to the AVX implementation. Fixes: a879beb4dbee ("odp-execute: Add ISA implementation of set_masked IPv6 action") Reported-by: Eelco Chaudron Signed-off-by: Emma Finn Signed-off-by: Eelco Chaudron --- lib/odp-execute-avx512.c | 8 ++++++++ lib/packets.c | 2 +- lib/packets.h | 1 + 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c index a74a85dc1a8..4e33e35f8bd 100644 --- a/lib/odp-execute-avx512.c +++ b/lib/odp-execute-avx512.c @@ -741,6 +741,14 @@ action_avx512_set_ipv6(struct dp_packet_batch *batch, const struct nlattr *a) } /* Write back the modified IPv6 addresses. */ _mm512_mask_storeu_epi64((void *) nh, 0x1F, v_new_hdr); + + /* Scalar method for setting IPv6 tclass field. 
*/ + if (key->ipv6_tclass) { + uint8_t old_tc = ntohl(get_16aligned_be32(&nh->ip6_flow)) >> 20; + uint8_t key_tc = key->ipv6_tclass | (old_tc & ~mask->ipv6_tclass); + + packet_set_ipv6_tc(&nh->ip6_flow, key_tc); + } } } #endif /* HAVE_AVX512VBMI */ diff --git a/lib/packets.c b/lib/packets.c index ebf516d6790..91c28daf028 100644 --- a/lib/packets.c +++ b/lib/packets.c @@ -1299,7 +1299,7 @@ packet_set_ipv6_flow_label(ovs_16aligned_be32 *flow_label, ovs_be32 flow_key) put_16aligned_be32(flow_label, new_label); } -static void +void packet_set_ipv6_tc(ovs_16aligned_be32 *flow_label, uint8_t tc) { ovs_be32 old_label = get_16aligned_be32(flow_label); diff --git a/lib/packets.h b/lib/packets.h index 8b6994809fe..a102f81634e 100644 --- a/lib/packets.h +++ b/lib/packets.h @@ -1635,6 +1635,7 @@ void packet_set_ipv6_addr(struct dp_packet *packet, uint8_t proto, bool recalculate_csum); void packet_set_ipv6_flow_label(ovs_16aligned_be32 *flow_label, ovs_be32 flow_key); +void packet_set_ipv6_tc(ovs_16aligned_be32 *flow_label, uint8_t tc); void packet_set_tcp_port(struct dp_packet *, ovs_be16 src, ovs_be16 dst); void packet_set_udp_port(struct dp_packet *, ovs_be16 src, ovs_be16 dst); void packet_set_sctp_port(struct dp_packet *, ovs_be16 src, ovs_be16 dst); From 48118494497040e71c0c60f59ab5664c5b00464c Mon Sep 17 00:00:00 2001 From: Emma Finn Date: Mon, 17 Jun 2024 14:08:37 +0000 Subject: [PATCH 758/833] odp-execute: Check IPv4 checksum offload flag in AVX. The AVX implementation for IPv4 action did not check whether the IPv4 checksum offload flag has been set and was incorrectly calculating checksums in software. Adding a check to skip AVX checksum calculation when offload flags are set. 
Fixes: 5d11c47d3ebe ("userspace: Enable IP checksum offloading by default.") Reported-by: Eelco Chaudron Acked-by: Mike Pattrick Signed-off-by: Emma Finn Signed-off-by: Eelco Chaudron --- lib/odp-execute-avx512.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c index 4e33e35f8bd..09eb685cbac 100644 --- a/lib/odp-execute-avx512.c +++ b/lib/odp-execute-avx512.c @@ -473,7 +473,7 @@ action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch, * (v_pkt_masked). */ __m256i v_new_hdr = _mm256_or_si256(v_key_shuf, v_pkt_masked); - if (dp_packet_hwol_tx_ip_csum(packet)) { + if (dp_packet_hwol_l3_ipv4(packet)) { dp_packet_ol_reset_ip_csum_good(packet); } else { ovs_be16 old_csum = ~nh->ip_csum; From 639fcf200517e757fbf651f870080c01629cbd74 Mon Sep 17 00:00:00 2001 From: Kevin Traynor Date: Wed, 12 Jun 2024 15:32:55 +0100 Subject: [PATCH 759/833] netdev-dpdk: Check pending reset when adding device. When a device reset interrupt event (RTE_ETH_EVENT_INTR_RESET) is detected for a DPDK device added to OVS, a device reset is performed. If a device reset interrupt event is detected for a device before it is added to OVS, device reset is not called. If that device is later attempted to be added to OVS, it may fail while being configured if it is still pending a reset as pending reset is not checked when adding a device. A simple way to force a reset event from the ice driver for an iavf device is to set the mac address after binding iavf dev to vfio but before adding to OVS. (note: should not be set like this in normal case). e.g. 
$ echo 2 > /sys/class/net/ens3f0/device/sriov_numvfs $ ./devbind.py -b vfio-pci 0000:d8:01.1 $ ip link set ens3f0 vf 1 mac 26:ab:e6:6f:79:4d $ ovs-vsctl add-port br0 dpdk0 -- set Interface dpdk0 type=dpdk \ options:dpdk-devargs=0000:d8:01.1 |dpdk|ERR|Port1 dev_configure = -1 |netdev_dpdk|WARN|Interface dpdk0 eth_dev setup error Operation not permitted |netdev_dpdk|ERR|Interface dpdk0(rxq:1 txq:5 lsc interrupt mode:false) configure error: Operation not permitted |dpif_netdev|ERR|Failed to set interface dpdk0 new configuration Add a check if there was any previous device reset interrupt events when a device is added to OVS. If there was, perform the reset before continuing with the rest of the configuration. netdev_dpdk_pending_reset[] already tracks device reset interrupt events for all devices, so it can be reused to check if there is a reset needed during configuration of newly added devices. By extending its usage, dev->reset_needed is no longer needed. Fixes: 3eb91a8d1b9a ("netdev-dpdk: Trigger port reconfiguration in main thread for resets.") Reviewed-by: David Marchand Signed-off-by: Kevin Traynor Signed-off-by: Ilya Maximets --- lib/netdev-dpdk.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 76b8e76ece5..78e2d070322 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -464,9 +464,8 @@ struct netdev_dpdk { bool attached; /* If true, rte_eth_dev_start() was successfully called */ bool started; - bool reset_needed; - /* 1 pad byte here. */ struct eth_addr hwaddr; + /* 2 pad bytes here. 
*/ int mtu; int socket_id; int buf_size; @@ -1531,7 +1530,6 @@ common_construct(struct netdev *netdev, dpdk_port_t port_no, dev->virtio_features_state = OVS_VIRTIO_F_CLEAN; dev->attached = false; dev->started = false; - dev->reset_needed = false; ovsrcu_init(&dev->qos_conf, NULL); @@ -2154,13 +2152,11 @@ netdev_dpdk_run(const struct netdev_class *netdev_class OVS_UNUSED) if (!pending_reset) { continue; } - atomic_store_relaxed(&netdev_dpdk_pending_reset[port_id], false); ovs_mutex_lock(&dpdk_mutex); dev = netdev_dpdk_lookup_by_port_id(port_id); if (dev) { ovs_mutex_lock(&dev->mutex); - dev->reset_needed = true; netdev_request_reconfigure(&dev->up); VLOG_DBG_RL(&rl, "%s: Device reset requested.", netdev_get_name(&dev->up)); @@ -6083,6 +6079,7 @@ static int netdev_dpdk_reconfigure(struct netdev *netdev) { struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); + bool pending_reset; bool try_rx_steer; int err = 0; @@ -6094,6 +6091,9 @@ netdev_dpdk_reconfigure(struct netdev *netdev) dev->requested_n_rxq += 1; } + atomic_read_relaxed(&netdev_dpdk_pending_reset[dev->port_id], + &pending_reset); + if (netdev->n_txq == dev->requested_n_txq && netdev->n_rxq == dev->requested_n_rxq && dev->rx_steer_flags == dev->requested_rx_steer_flags @@ -6103,7 +6103,7 @@ netdev_dpdk_reconfigure(struct netdev *netdev) && dev->txq_size == dev->requested_txq_size && eth_addr_equals(dev->hwaddr, dev->requested_hwaddr) && dev->socket_id == dev->requested_socket_id - && dev->started && !dev->reset_needed) { + && dev->started && !pending_reset) { /* Reconfiguration is unnecessary */ goto out; @@ -6112,10 +6112,14 @@ netdev_dpdk_reconfigure(struct netdev *netdev) retry: dpdk_rx_steer_unconfigure(dev); - if (dev->reset_needed) { + if (pending_reset) { + /* + * Set false before reset to avoid missing a new reset interrupt event + * in a race with event callback. 
+ */ + atomic_store_relaxed(&netdev_dpdk_pending_reset[dev->port_id], false); rte_eth_dev_reset(dev->port_id); if_notifier_manual_report(); - dev->reset_needed = false; } else { rte_eth_dev_stop(dev->port_id); } From 56e315937eeb640d5d8f305988d133390445eaee Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 14 Jun 2024 14:22:47 +0200 Subject: [PATCH 760/833] vswitchd: Only lock pages that are faulted in. The main purpose of locking the memory is to ensure that OVS can keep doing what it did before in case of increased memory pressure, e.g., during VM ingest / migration. Fulfilling this requirement can be achieved without locking all the allocated memory, but only the pages already accessed in the past (faulted in). Processing of the new traffic involves new memory allocations. Latency on these operations can't be guaranteed by the locking. The main difference would be the pre-faulting of the stack memory. However, in order to revalidate or process upcalls on the same traffic, the same amount of stack is likely needed, so all the necessary memory will already be faulted in. Switch 'mlockall' to MCL_ONFAULT to avoid consuming unnecessarily large amounts of RAM on systems with high core counts. For example, in a densely populated OVN cluster this saves about 650 MB of RAM per node on a system with 64 cores. This equates to 320 GB of allocated but unused RAM in a 500 node cluster. This also makes OVS better suited by default for small systems with limited amount of memory. The MCL_ONFAULT flag was introduced in Linux kernel 4.4 and wasn't available at the time of '--mlockall' introduction, but we can use it now. Falling back to an old way of locking in case we're running on an older kernel just in case. Only locking the faulted in pages also makes locking compatible with vhost post-copy live migration by default, because we'll no longer pre-fault all the guest's memory. 
Post-copy relies on userfaultfd to work on shared huge pages, which is only available in 4.11+ kernels. So, technically, it should not be possible for MCL_ONFAULT to fail and the call without it to succeed. But keeping the check just in case for now. Acked-by: Simon Horman Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- Documentation/ref/ovs-ctl.8.rst | 5 +++-- Documentation/topics/dpdk/vhost-user.rst | 6 ++++-- NEWS | 2 ++ lib/netdev-dpdk.c | 2 +- lib/util.c | 12 ++++++------ lib/util.h | 4 ++-- vswitchd/ovs-vswitchd.8.in | 9 +++++---- vswitchd/ovs-vswitchd.c | 17 ++++++++++++----- 8 files changed, 35 insertions(+), 22 deletions(-) diff --git a/Documentation/ref/ovs-ctl.8.rst b/Documentation/ref/ovs-ctl.8.rst index 9f077a122c2..cdbaac4dc0b 100644 --- a/Documentation/ref/ovs-ctl.8.rst +++ b/Documentation/ref/ovs-ctl.8.rst @@ -170,8 +170,9 @@ The following options are less important: * ``--no-mlockall`` By default ``ovs-ctl`` passes ``--mlockall`` to ``ovs-vswitchd``, - requesting that it lock all of its virtual memory, preventing it - from being paged to disk. This option suppresses that behavior. + requesting that it lock all of its virtual memory on page fault (on + allocation, when running on Linux kernel 4.4 and older), preventing + it from being paged to disk. This option suppresses that behavior. * ``--no-self-confinement`` diff --git a/Documentation/topics/dpdk/vhost-user.rst b/Documentation/topics/dpdk/vhost-user.rst index 7866543d89a..d9d87aa0872 100644 --- a/Documentation/topics/dpdk/vhost-user.rst +++ b/Documentation/topics/dpdk/vhost-user.rst @@ -340,8 +340,10 @@ The default value is ``false``. fixes (like userfaulfd leak) was released in 3.0.1. DPDK Post-copy feature requires avoiding to populate the guest memory - (application must not call mlock* syscall). So enabling mlockall is - incompatible with post-copy feature. + (application must not call mlock* syscall without MCL_ONFAULT). 
+ So enabling mlockall is incompatible with post-copy feature in OVS 3.3 and + older. Newer versions of OVS only lock memory pages that are faulted in, + so both features can be used at the same time. Note that during migration of vhost-user device, PMD threads hang for the time of faulted pages download from source host. Transferring 1GB hugepage diff --git a/NEWS b/NEWS index d05f2d0f89e..e7f1a9fe128 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,7 @@ Post-v3.3.0 -------------------- + - Option '--mlockall' now only locks memory pages on fault, if possible. + This also makes it compatible with vHost Post-copy Live Migration. - Userspace datapath: * Conntrack now supports 'random' flag for selecting ports in a range while natting and 'persistent' flag for selection of the IP address diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 78e2d070322..02cef6e4513 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -6719,7 +6719,7 @@ parse_vhost_config(const struct smap *ovs_other_config) vhost_postcopy_enabled = smap_get_bool(ovs_other_config, "vhost-postcopy-support", false); - if (vhost_postcopy_enabled && memory_locked()) { + if (vhost_postcopy_enabled && memory_all_locked()) { VLOG_WARN("vhost-postcopy-support and mlockall are not compatible."); vhost_postcopy_enabled = false; } diff --git a/lib/util.c b/lib/util.c index 5c31d983a66..3a6351a2f37 100644 --- a/lib/util.c +++ b/lib/util.c @@ -67,8 +67,8 @@ DEFINE_PER_THREAD_MALLOCED_DATA(char *, subprogram_name); /* --version option output. */ static char *program_version; -/* 'true' if mlockall() succeeded. */ -static bool is_memory_locked = false; +/* 'true' if mlockall() succeeded, but doesn't support ONFAULT. */ +static bool is_all_memory_locked = false; /* Buffer used by ovs_strerror() and ovs_format_message(). 
*/ DEFINE_STATIC_PER_THREAD_DATA(struct { char s[128]; }, @@ -102,15 +102,15 @@ ovs_assert_failure(const char *where, const char *function, } void -set_memory_locked(void) +set_all_memory_locked(void) { - is_memory_locked = true; + is_all_memory_locked = true; } bool -memory_locked(void) +memory_all_locked(void) { - return is_memory_locked; + return is_all_memory_locked; } void diff --git a/lib/util.h b/lib/util.h index 55718fd87ca..c486b534049 100644 --- a/lib/util.h +++ b/lib/util.h @@ -156,8 +156,8 @@ void ctl_timeout_setup(unsigned int secs); void ovs_print_version(uint8_t min_ofp, uint8_t max_ofp); -void set_memory_locked(void); -bool memory_locked(void); +void set_all_memory_locked(void); +bool memory_all_locked(void); OVS_NO_RETURN void out_of_memory(void); diff --git a/vswitchd/ovs-vswitchd.8.in b/vswitchd/ovs-vswitchd.8.in index 10c6e077bac..98e58951dcf 100644 --- a/vswitchd/ovs-vswitchd.8.in +++ b/vswitchd/ovs-vswitchd.8.in @@ -68,10 +68,11 @@ load the Open vSwitch kernel module. .PP .SH OPTIONS .IP "\fB\-\-mlockall\fR" -Causes \fBovs\-vswitchd\fR to call the \fBmlockall()\fR function, to -attempt to lock all of its process memory into physical RAM, -preventing the kernel from paging any of its memory to disk. This -helps to avoid networking interruptions due to system memory pressure. +Causes \fBovs\-vswitchd\fR to call the \fBmlockall()\fR function, to attempt to +lock all of its process memory into physical RAM on page faults (on allocation, +when running on Linux kernel 4.4 or older), preventing the kernel from paging +any of its memory to disk. This helps to avoid networking interruptions due to +system memory pressure. .IP Some systems do not support \fBmlockall()\fR at all, and other systems only allow privileged users, such as the superuser, to use it. 
diff --git a/vswitchd/ovs-vswitchd.c b/vswitchd/ovs-vswitchd.c index 273af9f5d62..6d90c73b830 100644 --- a/vswitchd/ovs-vswitchd.c +++ b/vswitchd/ovs-vswitchd.c @@ -56,7 +56,8 @@ VLOG_DEFINE_THIS_MODULE(vswitchd); -/* --mlockall: If set, locks all process memory into physical RAM, preventing +/* --mlockall: If set, locks all present process memory pages into physical + * RAM and all the new pages the moment they are faulted in, preventing * the kernel from paging any of its memory to disk. */ static bool want_mlockall; @@ -96,10 +97,16 @@ main(int argc, char *argv[]) if (want_mlockall) { #ifdef HAVE_MLOCKALL - if (mlockall(MCL_CURRENT | MCL_FUTURE)) { - VLOG_ERR("mlockall failed: %s", ovs_strerror(errno)); - } else { - set_memory_locked(); +/* MCL_ONFAULT introduced in Linux kernel 4.4. */ +#ifndef MCL_ONFAULT +#define MCL_ONFAULT 4 +#endif + if (mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) { + if (mlockall(MCL_CURRENT | MCL_FUTURE)) { + VLOG_ERR("mlockall failed: %s", ovs_strerror(errno)); + } else { + set_all_memory_locked(); + } } #else VLOG_ERR("mlockall not supported on this system"); From c1dce72a37eae7ce230db045a78617fa194661d7 Mon Sep 17 00:00:00 2001 From: Terry Wilson Date: Fri, 28 Jun 2024 14:18:41 -0500 Subject: [PATCH 761/833] python: ovsdb-idl: Add custom transaction operations. It can be useful to be able to send raw transaction operations through the Idl's connection. For example, to clean up MAC_Binding entries for floating IPs without having to monitor the MAC_Binding table which can be quite large. 
Signed-off-by: Terry Wilson Signed-off-by: Ilya Maximets --- NEWS | 2 ++ python/ovs/db/idl.py | 21 ++++++++++++- tests/ovsdb-idl.at | 27 ++++++++++++++++ tests/test-ovsdb.py | 73 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 122 insertions(+), 1 deletion(-) diff --git a/NEWS b/NEWS index e7f1a9fe128..e0359b759a6 100644 --- a/NEWS +++ b/NEWS @@ -14,6 +14,8 @@ Post-v3.3.0 * Link status changes are now handled via interrupt mode if the DPDK driver supports it. It is possible to revert to polling mode by setting per interface 'options:dpdk-lsc-interrupt' to 'false'. + - Python: + * Added custom transaction support to the Idl via add_op(). v3.3.0 - 16 Feb 2024 diff --git a/python/ovs/db/idl.py b/python/ovs/db/idl.py index b6d5ed6972a..c8cc543465c 100644 --- a/python/ovs/db/idl.py +++ b/python/ovs/db/idl.py @@ -1708,6 +1708,8 @@ def __init__(self, idl): self._inserted_rows = {} # Map from UUID to _InsertedRow + self._operations = [] + def add_comment(self, comment): """Appends 'comment' to the comments that will be passed to the OVSDB server when this transaction is committed. (The comment will be @@ -1843,7 +1845,7 @@ def commit(self): "rows": [rows]}) # Add updates. - any_updates = False + any_updates = bool(self._operations) for row in self._txn_rows.values(): if row._changes is None: if row._table.is_root: @@ -1978,6 +1980,8 @@ def commit(self): operations.append({"op": "comment", "comment": "\n".join(self._comments)}) + operations += self._operations + # Dry run? if self.dry_run: operations.append({"op": "abort"}) @@ -1996,6 +2000,21 @@ def commit(self): self.__disassemble() return self._status + def add_op(self, op): + """Add a raw OVSDB operation to the transaction + + This can be useful for re-using the existing Idl connection to take + actions that are difficult or expensive to do with the Idl itself, e.g. + bulk deleting rows from the server without downloading them into a + local cache. 
+ + All ops are applied after any other operations in the transaction. + + :param op: An "op" for an OVSDB "transact" request (rfc 7047 Sec 5.2) + :type op: dict + """ + self._operations.append(op) + def commit_block(self): """Attempts to commit this transaction, blocking until the commit either succeeds or fails. Returns the final commit status, which may diff --git a/tests/ovsdb-idl.at b/tests/ovsdb-idl.at index b9dc0bdea1a..9070ea051a6 100644 --- a/tests/ovsdb-idl.at +++ b/tests/ovsdb-idl.at @@ -2863,6 +2863,33 @@ OVSDB_CHECK_IDL_PERS_UUID_INSERT([simple idl, persistent uuid insert], [['This UUID would duplicate a UUID already present within the table or deleted within the same transaction']]) +OVSDB_CHECK_IDL_PY([simple idl, python, add_op], + [], + [['insert 1, insert 2, insert 3, insert 1' \ + 'add_op {"op": "delete", "table": "simple", "where": [["i", "==", 1]]}' \ + 'add_op {"op": "insert", "table": "simple", "row": {"i": 2}}, delete 3' \ + 'insert 2, add_op {"op": "update", "table": "simple", "row": {"i": 1}, "where": [["i", "==", 2]]}' + ]], + [[000: empty +001: commit, status=success +002: table simple: i=1 r=0 b=false s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> +002: table simple: i=1 r=0 b=false s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<2> +002: table simple: i=2 r=0 b=false s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<3> +002: table simple: i=3 r=0 b=false s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<4> +003: commit, status=success +004: table simple: i=2 r=0 b=false s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<3> +004: table simple: i=3 r=0 b=false s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<4> +005: commit, status=success +006: table simple: i=2 r=0 b=false s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<3> +006: table simple: i=2 r=0 b=false s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<5> +007: commit, status=success +008: table simple: i=1 r=0 b=false s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<3> +008: table simple: i=1 r=0 b=false s= u=<0> 
ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<5> +008: table simple: i=1 r=0 b=false s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<6> +009: done +]],[],sort) + + m4_define([OVSDB_CHECK_IDL_CHANGE_AWARE], [AT_SETUP([simple idl, database change aware, online conversion - $1]) AT_KEYWORDS([ovsdb server idl db_change_aware conversion $1]) diff --git a/tests/test-ovsdb.py b/tests/test-ovsdb.py index 67a45f044b8..60752ef4ae2 100644 --- a/tests/test-ovsdb.py +++ b/tests/test-ovsdb.py @@ -36,6 +36,66 @@ vlog.init(None) +def substitute_object_text(data, quotechar='"', obj_chars=("{}", "[]"), + tag_format="_OBJECT_{}_"): + """Replace objects in strings with tags that can later be retrieved + + Given data like: + 'cmd1 1, cmd2 {"a": {"a": "b"}}, cmd3 1 2, cmd4 ["a", "b"]' + + Return an output string: + 'cmd1 1, cmd2 _OBJECT_0_, cmd3 1 2, cmd4 _OBJECT_1_' + + and a dictionary of replaced text: + {'_OBJECT_0_': '{"a": {"a": "b"}}', '_OBJECT_1_': '["a", "b"]'} + """ + + obj_chars = dict(obj_chars) + in_quote = False + in_object = [] # Stack of nested outer object opening characters. + replaced_text = {} + output = "" + start = end = 0 + for i, c in enumerate(data): + if not in_object: + if not in_quote and c in obj_chars: + # This is the start of a non-quoted outer object that will + # be replaced by a tag. + in_object.append(c) + start = i + else: + # Regular output. + output += c + if c == quotechar: + in_quote = not in_quote + elif not in_quote: # Unquoted object. + if c == in_object[0]: + # Record on the stack that we are in a nested object of the + # same type as the outer object, this object will not be + # substituted with a tag. + in_object.append(c) + elif c == obj_chars[in_object[0]]: + # This is the closing character to this potentially nested + # object's opening character, so pop it off the stack. + in_object.pop() + if not in_object: + # This is the outer object's closing character, so record + # the substituted text and generate the tagged text. 
+ end = i + 1 + tag = tag_format.format(len(replaced_text)) + replaced_text[tag] = data[start:end] + output += tag + return output, replaced_text + + +def recover_object_text_from_list(words, json): + if not json: + return words + # NOTE(twilson) This does not handle the case of having multiple replaced + # objects in the same word, e.g. two json adjacent json strings. + return [json.get(word, word) for word in words] + + def unbox_json(json): if type(json) is list and len(json) == 1: return json[0] @@ -389,8 +449,15 @@ def idl_set(idl, commands, step): increment = False fetch_cmds = [] events = [] + # `commands` is a comma-separated list of space-separated arguments. To + # handle commands that take arguments that may contain spaces or commas, + # e.g. JSON, it is necessary to process `commands` to extract those + # arguments before splitting by ',' or ' ' below, and then re-insert them + # after the arguments are split. + commands, data = substitute_object_text(commands) for command in commands.split(','): words = command.split() + words = recover_object_text_from_list(words, data) name = words[0] args = words[1:] @@ -449,6 +516,12 @@ def notify(event, row, updates=None): s = txn.insert(idl.tables["simple"], new_uuid=uuid.UUID(args[0]), persist_uuid=True) s.i = int(args[1]) + elif name == "add_op": + if len(args) != 1: + sys.stderr.write('"add_op" command requires 1 argument\n') + sys.stderr.write(f"args={args}\n") + sys.exit(1) + txn.add_op(ovs.json.from_string(args[0])) elif name == "delete": if len(args) != 1: sys.stderr.write('"delete" command requires 1 argument\n') From 802df1e37bf317e3b99a93a78c14f33b6c8f57cb Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Fri, 28 Jun 2024 00:24:05 -0400 Subject: [PATCH 762/833] ovs-monitor-ipsec: LibreSwan autodetect version. Previously a change was made to LibreSwan necessitating the detection of version numbers. However, this change didn't properly account for all possible output from "ipsec version". 
When installed from the git repository, LibreSwan will report versions differently than when installed from a package. Fixes: 3ddb31f60487 ("ovs-monitor-ipsec: LibreSwan autodetect paths.") Signed-off-by: Mike Pattrick Signed-off-by: Simon Horman --- ipsec/ovs-monitor-ipsec.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ipsec/ovs-monitor-ipsec.in b/ipsec/ovs-monitor-ipsec.in index bc7ac552379..2b602c75f60 100755 --- a/ipsec/ovs-monitor-ipsec.in +++ b/ipsec/ovs-monitor-ipsec.in @@ -464,7 +464,7 @@ conn prevent_unencrypted_vxlan encoding="latin1") pout, perr = proc.communicate() - v = re.match("^Libreswan (.*)$", pout) + v = re.match("^Libreswan v?(.*)$", pout) try: version = int(v.group(1).split(".")[0]) except: From 239b59bdfb8cbb48ec655d50405c15a08057dbb2 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Fri, 28 Jun 2024 00:24:06 -0400 Subject: [PATCH 763/833] ovs-monitor-ipsec: LibreSwan v5 support. In version 5, LibreSwan made significant command line interface changes. This includes changing the order of command line parameters and removing the "ipsec auto" command. To maintain compatibility with previous versions, the ipsec.d version check is repurposed for this. Checking the version proved simpler than removing use of auto. There was also a change to ipsec status command that affected the tests. However, this change was backwards compatible. 
Reported-at: https://issues.redhat.com/browse/FDP-645 Reported-by: Ilya Maximets Signed-off-by: Mike Pattrick Signed-off-by: Simon Horman --- ipsec/ovs-monitor-ipsec.in | 46 +++++++++++++++++++++----------------- tests/system-ipsec.at | 8 +++---- 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/ipsec/ovs-monitor-ipsec.in b/ipsec/ovs-monitor-ipsec.in index 2b602c75f60..37c509ac682 100755 --- a/ipsec/ovs-monitor-ipsec.in +++ b/ipsec/ovs-monitor-ipsec.in @@ -459,6 +459,7 @@ conn prevent_unencrypted_vxlan def __init__(self, libreswan_root_prefix, args): # Collect version infromation self.IPSEC = libreswan_root_prefix + "/usr/sbin/ipsec" + self.IPSEC_AUTO = [self.IPSEC] proc = subprocess.Popen([self.IPSEC, "--version"], stdout=subprocess.PIPE, encoding="latin1") @@ -470,6 +471,11 @@ conn prevent_unencrypted_vxlan except: version = 0 + if version < 5: + # With v5, LibreSWAN removed the auto command, however, it is + # still required for older versions + self.IPSEC_AUTO.append("auto") + if version >= 4: ipsec_d = args.ipsec_d if args.ipsec_d else "/var/lib/ipsec/nss" else: @@ -593,7 +599,7 @@ conn prevent_unencrypted_vxlan def refresh(self, monitor): vlog.info("Refreshing LibreSwan configuration") - subprocess.call([self.IPSEC, "auto", "--ctlsocket", self.IPSEC_CTL, + subprocess.call(self.IPSEC_AUTO + ["--ctlsocket", self.IPSEC_CTL, "--config", self.IPSEC_CONF, "--rereadsecrets"]) tunnels = set(monitor.tunnels.keys()) @@ -621,7 +627,7 @@ conn prevent_unencrypted_vxlan if not tunnel or tunnel.version != ver: vlog.info("%s is outdated %u" % (conn, ver)) - subprocess.call([self.IPSEC, "auto", "--ctlsocket", + subprocess.call(self.IPSEC_AUTO + ["--ctlsocket", self.IPSEC_CTL, "--config", self.IPSEC_CONF, "--delete", conn]) elif ifname in tunnels: @@ -643,44 +649,44 @@ conn prevent_unencrypted_vxlan # Update shunt policy if changed if monitor.conf_in_use["skb_mark"] != monitor.conf["skb_mark"]: if monitor.conf["skb_mark"]: - subprocess.call([self.IPSEC, "auto", - 
"--config", self.IPSEC_CONF, + subprocess.call(self.IPSEC_AUTO + + ["--config", self.IPSEC_CONF, "--ctlsocket", self.IPSEC_CTL, "--add", "--asynchronous", "prevent_unencrypted_gre"]) - subprocess.call([self.IPSEC, "auto", - "--config", self.IPSEC_CONF, + subprocess.call(self.IPSEC_AUTO + + ["--config", self.IPSEC_CONF, "--ctlsocket", self.IPSEC_CTL, "--add", "--asynchronous", "prevent_unencrypted_geneve"]) - subprocess.call([self.IPSEC, "auto", - "--config", self.IPSEC_CONF, + subprocess.call(self.IPSEC_AUTO + + ["--config", self.IPSEC_CONF, "--ctlsocket", self.IPSEC_CTL, "--add", "--asynchronous", "prevent_unencrypted_stt"]) - subprocess.call([self.IPSEC, "auto", - "--config", self.IPSEC_CONF, + subprocess.call(self.IPSEC_AUTO + + ["--config", self.IPSEC_CONF, "--ctlsocket", self.IPSEC_CTL, "--add", "--asynchronous", "prevent_unencrypted_vxlan"]) else: - subprocess.call([self.IPSEC, "auto", - "--config", self.IPSEC_CONF, + subprocess.call(self.IPSEC_AUTO + + ["--config", self.IPSEC_CONF, "--ctlsocket", self.IPSEC_CTL, "--delete", "--asynchronous", "prevent_unencrypted_gre"]) - subprocess.call([self.IPSEC, "auto", - "--config", self.IPSEC_CONF, + subprocess.call(self.IPSEC_AUTO + + ["--config", self.IPSEC_CONF, "--ctlsocket", self.IPSEC_CTL, "--delete", "--asynchronous", "prevent_unencrypted_geneve"]) - subprocess.call([self.IPSEC, "auto", - "--config", self.IPSEC_CONF, + subprocess.call(self.IPSEC_AUTO + + ["--config", self.IPSEC_CONF, "--ctlsocket", self.IPSEC_CTL, "--delete", "--asynchronous", "prevent_unencrypted_stt"]) - subprocess.call([self.IPSEC, "auto", - "--config", self.IPSEC_CONF, + subprocess.call(self.IPSEC_AUTO + + ["--config", self.IPSEC_CONF, "--ctlsocket", self.IPSEC_CTL, "--delete", "--asynchronous", "prevent_unencrypted_vxlan"]) @@ -726,8 +732,8 @@ conn prevent_unencrypted_vxlan # the "ipsec auto --start" command is lost. Just retry to make sure # the command is received by LibreSwan. 
while True: - proc = subprocess.Popen([self.IPSEC, "auto", - "--config", self.IPSEC_CONF, + proc = subprocess.Popen(self.IPSEC_AUTO + + ["--config", self.IPSEC_CONF, "--ctlsocket", self.IPSEC_CTL, "--start", "--asynchronous", conn], diff --git a/tests/system-ipsec.at b/tests/system-ipsec.at index d3d27133b97..1e155fecea3 100644 --- a/tests/system-ipsec.at +++ b/tests/system-ipsec.at @@ -110,16 +110,16 @@ m4_define([CHECK_LIBRESWAN], dnl IPSEC_STATUS_LOADED([]) dnl dnl Get number of loaded connections from ipsec status -m4_define([IPSEC_STATUS_LOADED], [ipsec status --rundir $ovs_base/$1 | \ +m4_define([IPSEC_STATUS_LOADED], [ipsec --rundir $ovs_base/$1 status | \ grep "Total IPsec connections" | \ - sed 's/[[0-9]]* Total IPsec connections: loaded \([[0-2]]\), active \([[0-2]]\).*/\1/m']) + sed 's/[[0-9]]* *Total IPsec connections: loaded \([[0-2]]\), active \([[0-2]]\).*/\1/m']) dnl IPSEC_STATUS_ACTIVE([]) dnl dnl Get number of active connections from ipsec status -m4_define([IPSEC_STATUS_ACTIVE], [ipsec status --rundir $ovs_base/$1 | \ +m4_define([IPSEC_STATUS_ACTIVE], [ipsec --rundir $ovs_base/$1 status | \ grep "Total IPsec connections" | \ - sed 's/[[0-9]]* Total IPsec connections: loaded \([[0-2]]\), active \([[0-2]]\).*/\2/m']) + sed 's/[[0-9]]* *Total IPsec connections: loaded \([[0-2]]\), active \([[0-2]]\).*/\2/m']) dnl CHECK_ESP_TRAFFIC() dnl From 773b0fb59310da98536657e30c9c1a443c045581 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 2 Jul 2024 20:58:09 +0200 Subject: [PATCH 764/833] cirrus: Update to FreeBSD 14.1. 14.1 was released on June 4 and 14.0 will reach EoL in September. Update now. 
Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets --- .cirrus.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cirrus.yml b/.cirrus.yml index 8db385f002f..d73154a9716 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -3,7 +3,7 @@ freebsd_build_task: freebsd_instance: matrix: image_family: freebsd-13-3-snap - image_family: freebsd-14-0-snap + image_family: freebsd-14-1-snap cpu: 4 memory: 4G From f8ed13355d9d01f5437e5c27cf5b3a2f094543e5 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 27 Jun 2024 00:02:21 +0200 Subject: [PATCH 765/833] ovsdb: raft: Don't forward more than one command to the leader. Every transaction has RAFT log prerequisites. Even if transactions are not related (because RAFT doesn't actually know what data it is handling). When leader writes a new record to a RAFT storage, it is getting appended to the log right away and changes current 'eid', i.e., changes prerequisites. The leader will not try to write new records until the current one is committed, because until then the pre-check will be failing. However, that is different for the follower. Followers do not add records to the RAFT log until the leader sends an append request back. So, if there are multiple transactions pending on a follower, it will create a command for each of them and prerequisites will be set to the same values. All these commands will be sent to the leader, but only one can succeed at a time, because accepting one command immediately changes prerequisites and all other commands become non-applicable. So, out of N commands, 1 will succeed and N - 1 will fail. The cluster failure is a transient failure, so the follower will re-process all the failed transactions and send them again. 1 will succeed and N - 2 will fail. And so on, until there are no more transactions. In the end, instead of processing N transactions, the follower is performing N * (N - 1) / 2 transaction processing iterations. 
That is consuming a huge amount of CPU resources completely unnecessarily. Since there is no real chance for multiple transactions from the same follower to succeed, it's better to not send them in the first place. This also eliminates prerequisite mismatch messages on a leader in this particular case. In a test with 30 parallel shell threads executing 12K transactions total with separate ovsdb-client calls through the same follower there is about 60% performance improvement. The test takes ~100 seconds to complete without this change and ~40 seconds with this change applied. The new time is very close to what it takes to execute the same test through the cluster leader. The test can be found at the link below. Note: prerequisite failures on a leader are still possible, but mostly in a case of simultaneous transactions from different followers. It's a normal thing for a distributed database due to its nature. Link: https://mail.openvswitch.org/pipermail/ovs-dev/2024-June/415167.html Acked-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- ovsdb/raft.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- ovsdb/raft.h | 2 +- ovsdb/storage.c | 9 +++++---- ovsdb/storage.h | 5 ++++- ovsdb/transaction.c | 6 +----- 5 files changed, 55 insertions(+), 12 deletions(-) diff --git a/ovsdb/raft.c b/ovsdb/raft.c index ac3d37ac409..9c3c351b5be 100644 --- a/ovsdb/raft.c +++ b/ovsdb/raft.c @@ -2307,12 +2307,55 @@ raft_get_eid(const struct raft *raft, uint64_t index) return &raft->snap.eid; } -const struct uuid * +static const struct uuid * raft_current_eid(const struct raft *raft) { return raft_get_eid(raft, raft->log_end - 1); } +bool +raft_precheck_prereq(const struct raft *raft, const struct uuid *prereq) +{ + if (!uuid_equals(raft_current_eid(raft), prereq)) { + VLOG_DBG("%s: prerequisites (" UUID_FMT ") " + "do not match current eid (" UUID_FMT ")", + __func__, UUID_ARGS(prereq), + UUID_ARGS(raft_current_eid(raft))); + return false; + } + + /* Incomplete commands on a leader 
will not change the leader's current + * 'eid' on commit as they are already part of the leader's log. */ + if (raft->role == RAFT_LEADER) { + return true; + } + + /* Having incomplete commands on a follower means that the leader has + * these commands and they will change the prerequisites once added to + * the leader's log. + * + * There is a chance that all these commands will actually fail and the + * record with current prerequisites will in fact succeed, but, since + * these are our own commands, the chances are low. */ + struct raft_command *cmd; + HMAP_FOR_EACH (cmd, hmap_node, &raft->commands) { + /* Skip commands that are already part of the log (have non-zero + * index) and ones that do not carry any data (have zero 'eid'), + * as they can't change prerequisites. + * + * Database will not re-run triggers unless the data changes or + * one of the data-carrying triggers completes. So, pre-check must + * not fail if there are no outstanding data-carrying commands. */ + if (!cmd->index && !uuid_is_zero(&cmd->eid)) { + VLOG_DBG("%s: follower still has an incomplete command " + UUID_FMT, __func__, UUID_ARGS(&cmd->eid)); + return false; + } + } + + return true; +} + static struct raft_command * raft_command_create_completed(enum raft_command_status status) { diff --git a/ovsdb/raft.h b/ovsdb/raft.h index a5b55d9bf03..5833aaf23b2 100644 --- a/ovsdb/raft.h +++ b/ovsdb/raft.h @@ -189,5 +189,5 @@ struct ovsdb_error *raft_store_snapshot(struct raft *, void raft_take_leadership(struct raft *); void raft_transfer_leadership(struct raft *, const char *reason); -const struct uuid *raft_current_eid(const struct raft *); +bool raft_precheck_prereq(const struct raft *, const struct uuid *prereq); #endif /* lib/raft.h */ diff --git a/ovsdb/storage.c b/ovsdb/storage.c index 6c395106c01..c5aec545944 100644 --- a/ovsdb/storage.c +++ b/ovsdb/storage.c @@ -661,11 +661,12 @@ ovsdb_storage_write_schema_change(struct ovsdb_storage *storage, return w; } -const struct uuid * 
-ovsdb_storage_peek_last_eid(struct ovsdb_storage *storage) +bool +ovsdb_storage_precheck_prereq(const struct ovsdb_storage *storage, + const struct uuid *prereq) { if (!storage->raft) { - return NULL; + return true; } - return raft_current_eid(storage->raft); + return raft_precheck_prereq(storage->raft, prereq); } diff --git a/ovsdb/storage.h b/ovsdb/storage.h index 05f40ce934a..7079ea261f8 100644 --- a/ovsdb/storage.h +++ b/ovsdb/storage.h @@ -96,6 +96,9 @@ struct ovsdb_storage *ovsdb_storage_open_standalone(const char *filename, bool rw); struct ovsdb_schema *ovsdb_storage_read_schema(struct ovsdb_storage *); -const struct uuid *ovsdb_storage_peek_last_eid(struct ovsdb_storage *); +/* Checks that there is a chance for a record with specified prerequisites + * to be successfully written to the storage. */ +bool ovsdb_storage_precheck_prereq(const struct ovsdb_storage *, + const struct uuid *prereq); #endif /* ovsdb/storage.h */ diff --git a/ovsdb/transaction.c b/ovsdb/transaction.c index 484a88e1cc2..65eca647837 100644 --- a/ovsdb/transaction.c +++ b/ovsdb/transaction.c @@ -1277,11 +1277,7 @@ struct ovsdb_txn_progress { bool ovsdb_txn_precheck_prereq(const struct ovsdb *db) { - const struct uuid *eid = ovsdb_storage_peek_last_eid(db->storage); - if (!eid) { - return true; - } - return uuid_equals(&db->prereq, eid); + return ovsdb_storage_precheck_prereq(db->storage, &db->prereq); } struct ovsdb_txn_progress * From 939a5cea5b6c38e8865b19e12de3c642844b9930 Mon Sep 17 00:00:00 2001 From: Jakob Meng Date: Tue, 9 Jul 2024 09:14:17 +0200 Subject: [PATCH 766/833] Add global option for JSON output to ovs-appctl. For monitoring systems such as Prometheus it would be beneficial if OVS would expose statistics in a machine-readable format. This patch introduces support for different output formats to ovs-appctl. It gains a global option '-f,--format' which changes it to print a JSON document instead of plain-text for humans. 
For example, a later patch implements support for 'ovs-appctl --format json dpif/show'. By default, the output format is plain-text as before. A new 'set-options' command has been added to lib/unixctl.c which allows changing the output format of the commands executed afterwards on the same socket connection. It is supposed to be run by ovs-appctl transparently for the user when a specific output format has been requested. For example, when a user calls 'ovs-appctl --format json dpif/show', then ovs-appctl will call 'set-options' to set the output format as requested by the user and afterwards it will call the actual command 'dpif/show'. This ovs-appctl behaviour has been implemented in a backward compatible way. One can use an updated client (ovs-appctl) with an old server (ovs-vswitchd) and vice versa. Of course, JSON output only works when both sides have been updated. Two access functions unixctl_command_{get,set}_output_format() and a unixctl_command_reply_json function have been added to lib/unixctl.h: unixctl_command_get_output_format() is supposed to be used in commands like 'dpif/show' to query the requested output format. When JSON output has been selected, the unixctl_command_reply_json() function can be used to return JSON objects to the client (ovs-appctl) instead of plain-text with the unixctl_command_reply{,_error}() functions. When JSON has been requested but a command has not implemented JSON output, the plain-text output will be wrapped in a provisional JSON document with the following structure: {"reply":"$PLAIN_TEXT_HERE","reply-format":"plain"} Thus commands which have been executed successfully will not fail when they try to render the output at a later stage. A test for the 'version' command has been implemented which shows what the provisional JSON document looks like in practice. For a cleaner JSON document, the trailing newline has been moved from the program version string to function ovs_print_version().
This way, the plain-text output of the 'version' command has not changed. Output formatting has been moved from unixctl_client_transact() in lib/unixctl.c to utilities/ovs-appctl.c. The former merely returns the JSON objects returned from the server and the latter is now responsible for printing it properly. In popular tools like kubectl the option for output control is usually called '-o|--output' instead of '-f,--format'. But ovs-appctl already has a short option '-o' which prints the available ovs-appctl options ('--option'). The now chosen name also better aligns with ovsdb-client where '-f,--format' controls output formatting. Reported-at: https://bugzilla.redhat.com/1824861 Signed-off-by: Jakob Meng Signed-off-by: Ilya Maximets --- Documentation/ref/ovs-appctl.8.rst | 12 ++ NEWS | 3 + lib/unixctl.c | 180 ++++++++++++++++++++++------- lib/unixctl.h | 17 ++- lib/util.c | 6 +- python/ovs/unixctl/server.py | 3 - tests/appctl.py | 5 + tests/ovs-vswitchd.at | 12 ++ utilities/ovs-appctl.c | 135 +++++++++++++++++++--- 9 files changed, 305 insertions(+), 68 deletions(-) diff --git a/Documentation/ref/ovs-appctl.8.rst b/Documentation/ref/ovs-appctl.8.rst index 3ce02e9848f..148cc763295 100644 --- a/Documentation/ref/ovs-appctl.8.rst +++ b/Documentation/ref/ovs-appctl.8.rst @@ -8,6 +8,7 @@ Synopsis ``ovs-appctl`` [``--target=`` | ``-t`` ] [``--timeout=`` | ``-T`` ] +[``--format=`` | ``-f`` ] [...] ``ovs-appctl --help`` @@ -67,6 +68,17 @@ In normal use only a single option is accepted: runtime to approximately seconds. If the timeout expires, ``ovs-appctl`` exits with a ``SIGALRM`` signal. +* ``-f `` or ``--format=`` + + Tells ``ovs-appctl`` which output format to use. By default, or with a + of ``text``, ``ovs-appctl`` will print plain-text for humans. + When is ``json``, ``ovs-appctl`` will return a JSON document.
+ When ``json`` is requested, but a command has not implemented JSON + output, the plain-text output will be wrapped in a provisional JSON + document with the following structure:: + + {"reply-format":"plain","reply":"$PLAIN_TEXT_HERE"} + Common Commands =============== diff --git a/NEWS b/NEWS index e0359b759a6..f182647c7fb 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,9 @@ Post-v3.3.0 -------------------- - Option '--mlockall' now only locks memory pages on fault, if possible. This also makes it compatible with vHost Post-copy Live Migration. + - ovs-appctl: + * Added new option [-f|--format] to choose the output format, e.g. 'json' + or 'text' (by default). - Userspace datapath: * Conntrack now supports 'random' flag for selecting ports in a range while natting and 'persistent' flag for selection of the IP address diff --git a/lib/unixctl.c b/lib/unixctl.c index 103357ee91b..e7ce77e2ce0 100644 --- a/lib/unixctl.c +++ b/lib/unixctl.c @@ -17,7 +17,9 @@ #include #include "unixctl.h" #include +#include #include +#include "command-line.h" #include "coverage.h" #include "dirs.h" #include "openvswitch/dynamic-string.h" @@ -50,6 +52,8 @@ struct unixctl_conn { /* Only one request can be in progress at a time. While the request is * being processed, 'request_id' is populated, otherwise it is null. */ struct json *request_id; /* ID of the currently active request. */ + + enum unixctl_output_fmt fmt; /* Output format of current connection. */ }; /* Server for control connection. 
*/ @@ -63,6 +67,30 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); static struct shash commands = SHASH_INITIALIZER(&commands); +const char * +unixctl_output_fmt_to_string(enum unixctl_output_fmt fmt) +{ + switch (fmt) { + case UNIXCTL_OUTPUT_FMT_TEXT: return "text"; + case UNIXCTL_OUTPUT_FMT_JSON: return "json"; + default: return ""; + } +} + +bool +unixctl_output_fmt_from_string(const char *string, + enum unixctl_output_fmt *fmt) +{ + if (!strcasecmp(string, "text")) { + *fmt = UNIXCTL_OUTPUT_FMT_TEXT; + } else if (!strcasecmp(string, "json")) { + *fmt = UNIXCTL_OUTPUT_FMT_JSON; + } else { + return false; + } + return true; +} + static void unixctl_list_commands(struct unixctl_conn *conn, int argc OVS_UNUSED, const char *argv[] OVS_UNUSED, void *aux OVS_UNUSED) @@ -94,6 +122,52 @@ unixctl_version(struct unixctl_conn *conn, int argc OVS_UNUSED, unixctl_command_reply(conn, ovs_get_program_version()); } +static void +unixctl_set_options(struct unixctl_conn *conn, int argc, const char *argv[], + void *aux OVS_UNUSED) +{ + struct ovs_cmdl_parsed_option *parsed_options = NULL; + size_t n_parsed_options; + char *error = NULL; + + static const struct option options[] = { + {"format", required_argument, NULL, 'f'}, + {NULL, 0, NULL, 0}, + }; + + error = ovs_cmdl_parse_all(argc--, (char **) (argv++), options, + &parsed_options, &n_parsed_options); + if (error) { + goto error; + } + + for (size_t i = 0; i < n_parsed_options; i++) { + struct ovs_cmdl_parsed_option *parsed_option = &parsed_options[i]; + + switch (parsed_option->o->val) { + case 'f': + if (!unixctl_output_fmt_from_string(parsed_option->arg, + &conn->fmt)) { + error = xasprintf("option format has invalid value %s", + parsed_option->arg); + goto error; + } + break; + + default: + OVS_NOT_REACHED(); + } + } + + unixctl_command_reply(conn, NULL); + free(parsed_options); + return; +error: + unixctl_command_reply_error(conn, error); + free(error); + free(parsed_options); +} + /* Registers a unixctl 
command with the given 'name'. 'usage' describes the * arguments to the command; it is used only for presentation to the user in * "list-commands" output. (If 'usage' is NULL, then the command is hidden.) @@ -128,36 +202,35 @@ unixctl_command_register(const char *name, const char *usage, shash_add(&commands, name, command); } +enum unixctl_output_fmt +unixctl_command_get_output_format(struct unixctl_conn *conn) +{ + return conn->fmt; +} + +/* Takes ownership of the 'body'. */ static void unixctl_command_reply__(struct unixctl_conn *conn, - bool success, const char *body) + bool success, struct json *body) { - struct json *body_json; struct jsonrpc_msg *reply; COVERAGE_INC(unixctl_replied); ovs_assert(conn->request_id); - if (!body) { - body = ""; - } - - if (body[0] && body[strlen(body) - 1] != '\n') { - body_json = json_string_create_nocopy(xasprintf("%s\n", body)); - } else { - body_json = json_string_create(body); - } - if (success) { - reply = jsonrpc_create_reply(body_json, conn->request_id); + reply = jsonrpc_create_reply(body, conn->request_id); } else { - reply = jsonrpc_create_error(body_json, conn->request_id); + reply = jsonrpc_create_error(body, conn->request_id); } if (VLOG_IS_DBG_ENABLED()) { char *id = json_to_string(conn->request_id, 0); + char *msg = json_to_string(body, JSSF_SORT); + VLOG_DBG("replying with %s, id=%s: \"%s\"", - success ? "success" : "error", id, body); + success ? "success" : "error", id, msg); + free(msg); free(id); } @@ -169,23 +242,52 @@ unixctl_command_reply__(struct unixctl_conn *conn, } /* Replies to the active unixctl connection 'conn'. 'result' is sent to the - * client indicating the command was processed successfully. Only one call to - * unixctl_command_reply() or unixctl_command_reply_error() may be made per - * request. */ + * client indicating the command was processed successfully. 'result' should + * be plain-text; use unixctl_command_reply_json() to return a JSON document + * when JSON output has been requested. 
Only one call to + * unixctl_command_reply*() functions may be made per request. */ void unixctl_command_reply(struct unixctl_conn *conn, const char *result) { - unixctl_command_reply__(conn, true, result); + struct json *json_result = json_string_create(result ? result : ""); + + if (conn->fmt == UNIXCTL_OUTPUT_FMT_JSON) { + /* Wrap plain-text reply in provisional JSON document when JSON output + * has been requested. */ + struct json *json_reply = json_object_create(); + + json_object_put_string(json_reply, "reply-format", "plain"); + json_object_put(json_reply, "reply", json_result); + + json_result = json_reply; + } + + unixctl_command_reply__(conn, true, json_result); +} + +/* Replies to the active unixctl connection 'conn'. 'body' is sent to the + * client indicating the command was processed successfully. Use this function + * when JSON output has been requested; otherwise use unixctl_command_reply() + * for plain-text output. Only one call to unixctl_command_reply*() functions + * may be made per request. + * + * Takes ownership of the 'body'. */ +void +unixctl_command_reply_json(struct unixctl_conn *conn, struct json *body) +{ + ovs_assert(conn->fmt == UNIXCTL_OUTPUT_FMT_JSON); + unixctl_command_reply__(conn, true, body); } /* Replies to the active unixctl connection 'conn'. 'error' is sent to the - * client indicating an error occurred processing the command. Only one call to - * unixctl_command_reply() or unixctl_command_reply_error() may be made per - * request. */ + * client indicating an error occurred processing the command. 'error' should + * be plain-text. Only one call to unixctl_command_reply*() functions may be + * made per request. */ void unixctl_command_reply_error(struct unixctl_conn *conn, const char *error) { - unixctl_command_reply__(conn, false, error); + unixctl_command_reply__(conn, false, + json_string_create(error ? 
error : "")); } /* Creates a unixctl server listening on 'path', which for POSIX may be: @@ -250,6 +352,8 @@ unixctl_server_create(const char *path, struct unixctl_server **serverp) unixctl_command_register("list-commands", "", 0, 0, unixctl_list_commands, NULL); unixctl_command_register("version", "", 0, 0, unixctl_version, NULL); + unixctl_command_register("set-options", "[--format text|json]", 1, 2, + unixctl_set_options, NULL); struct unixctl_server *server = xmalloc(sizeof *server); server->listener = listener; @@ -381,6 +485,7 @@ unixctl_server_run(struct unixctl_server *server) struct unixctl_conn *conn = xzalloc(sizeof *conn); ovs_list_push_back(&server->conns, &conn->node); conn->rpc = jsonrpc_open(stream); + conn->fmt = UNIXCTL_OUTPUT_FMT_TEXT; } else if (error == EAGAIN) { break; } else { @@ -483,7 +588,7 @@ unixctl_client_create(const char *path, struct jsonrpc **client) * '*err' if not NULL. */ int unixctl_client_transact(struct jsonrpc *client, const char *command, int argc, - char *argv[], char **result, char **err) + char *argv[], struct json **result, struct json **err) { struct jsonrpc_msg *request, *reply; struct json **json_args, *params; @@ -506,24 +611,15 @@ unixctl_client_transact(struct jsonrpc *client, const char *command, int argc, return error; } - if (reply->error) { - if (reply->error->type == JSON_STRING) { - *err = xstrdup(json_string(reply->error)); - } else { - VLOG_WARN("%s: unexpected error type in JSON RPC reply: %s", - jsonrpc_get_name(client), - json_type_to_string(reply->error->type)); - error = EINVAL; - } - } else if (reply->result) { - if (reply->result->type == JSON_STRING) { - *result = xstrdup(json_string(reply->result)); - } else { - VLOG_WARN("%s: unexpected result type in JSON rpc reply: %s", - jsonrpc_get_name(client), - json_type_to_string(reply->result->type)); - error = EINVAL; - } + if (reply->result && reply->error) { + VLOG_WARN("unexpected response when communicating with %s: %s\n %s", + 
jsonrpc_get_name(client), + json_to_string(reply->result, JSSF_SORT), + json_to_string(reply->error, JSSF_SORT)); + error = EINVAL; + } else { + *result = json_nullable_clone(reply->result); + *err = json_nullable_clone(reply->error); } jsonrpc_msg_destroy(reply); diff --git a/lib/unixctl.h b/lib/unixctl.h index 4562dbc4911..1965f100dc2 100644 --- a/lib/unixctl.h +++ b/lib/unixctl.h @@ -17,10 +17,21 @@ #ifndef UNIXCTL_H #define UNIXCTL_H 1 +#include + #ifdef __cplusplus extern "C" { #endif +struct json; +enum unixctl_output_fmt { + UNIXCTL_OUTPUT_FMT_TEXT = 1 << 0, + UNIXCTL_OUTPUT_FMT_JSON = 1 << 1, +}; + +const char *unixctl_output_fmt_to_string(enum unixctl_output_fmt); +bool unixctl_output_fmt_from_string(const char *, enum unixctl_output_fmt *); + /* Server for Unix domain socket control connection. */ struct unixctl_server; int unixctl_server_create(const char *path, struct unixctl_server **); @@ -36,7 +47,7 @@ int unixctl_client_create(const char *path, struct jsonrpc **client); int unixctl_client_transact(struct jsonrpc *client, const char *command, int argc, char *argv[], - char **result, char **error); + struct json **result, struct json **error); /* Command registration. 
*/ struct unixctl_conn; @@ -45,8 +56,12 @@ typedef void unixctl_cb_func(struct unixctl_conn *, void unixctl_command_register(const char *name, const char *usage, int min_args, int max_args, unixctl_cb_func *cb, void *aux); +enum unixctl_output_fmt unixctl_command_get_output_format( + struct unixctl_conn *); void unixctl_command_reply_error(struct unixctl_conn *, const char *error); void unixctl_command_reply(struct unixctl_conn *, const char *body); +void unixctl_command_reply_json(struct unixctl_conn *, + struct json *body); #ifdef __cplusplus } diff --git a/lib/util.c b/lib/util.c index 3a6351a2f37..84e8c4966db 100644 --- a/lib/util.c +++ b/lib/util.c @@ -619,11 +619,11 @@ ovs_set_program_name(const char *argv0, const char *version) free(program_version); if (!strcmp(version, VERSION)) { - program_version = xasprintf("%s (Open vSwitch) "VERSION"\n", + program_version = xasprintf("%s (Open vSwitch) "VERSION, program_name); } else { program_version = xasprintf("%s %s\n" - "Open vSwitch Library "VERSION"\n", + "Open vSwitch Library "VERSION, program_name, version); } } @@ -760,7 +760,7 @@ ovs_get_program_name(void) void ovs_print_version(uint8_t min_ofp, uint8_t max_ofp) { - printf("%s", program_version); + printf("%s\n", program_version); if (min_ofp || max_ofp) { printf("OpenFlow versions %#x:%#x\n", min_ofp, max_ofp); } diff --git a/python/ovs/unixctl/server.py b/python/ovs/unixctl/server.py index b9cb52fadd3..d24a7092c15 100644 --- a/python/ovs/unixctl/server.py +++ b/python/ovs/unixctl/server.py @@ -87,9 +87,6 @@ def _reply_impl(self, success, body): if body is None: body = "" - if body and not body.endswith("\n"): - body += "\n" - if success: reply = Message.create_reply(body, self._request_id) else: diff --git a/tests/appctl.py b/tests/appctl.py index b85b364fac5..e5cc2813883 100644 --- a/tests/appctl.py +++ b/tests/appctl.py @@ -63,11 +63,16 @@ def main(): ovs.util.ovs_fatal(err_no, "%s: transaction error" % target) elif error is not None: 
sys.stderr.write(error) + if error and not error.endswith("\n"): + sys.stderr.write("\n") + ovs.util.ovs_error(0, "%s: server returned an error" % target) sys.exit(2) else: assert result is not None sys.stdout.write(result) + if result and not result.endswith("\n"): + sys.stdout.write("\n") if __name__ == '__main__': diff --git a/tests/ovs-vswitchd.at b/tests/ovs-vswitchd.at index 977b2eba1f2..b1ae1ae1edc 100644 --- a/tests/ovs-vswitchd.at +++ b/tests/ovs-vswitchd.at @@ -265,3 +265,15 @@ OFPT_FEATURES_REPLY: dpid:$orig_dpid OVS_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([ovs-vswitchd version]) +OVS_VSWITCHD_START + +AT_CHECK([ovs-appctl version], [0], [ignore]) +ovs_version=$(ovs-appctl version) + +AT_CHECK_UNQUOTED([ovs-appctl --format json version], [0], [dnl +{"reply":"$ovs_version","reply-format":"plain"} +]) + +AT_CLEANUP diff --git a/utilities/ovs-appctl.c b/utilities/ovs-appctl.c index ba0c172e6da..721698755c9 100644 --- a/utilities/ovs-appctl.c +++ b/utilities/ovs-appctl.c @@ -26,57 +26,105 @@ #include "daemon.h" #include "dirs.h" #include "openvswitch/dynamic-string.h" +#include "openvswitch/json.h" #include "jsonrpc.h" #include "process.h" #include "timeval.h" +#include "svec.h" #include "unixctl.h" #include "util.h" #include "openvswitch/vlog.h" static void usage(void); -static const char *parse_command_line(int argc, char *argv[]); + +/* Parsed command line args. 
*/ +struct cmdl_args { + enum unixctl_output_fmt format; + char *target; +}; + +static struct cmdl_args *cmdl_args_create(void); +static struct cmdl_args *parse_command_line(int argc, char *argv[]); static struct jsonrpc *connect_to_target(const char *target); +static char *reply_to_string(struct json *reply, enum unixctl_output_fmt fmt); int main(int argc, char *argv[]) { - char *cmd_result, *cmd_error; + struct svec opt_argv = SVEC_EMPTY_INITIALIZER; + struct json *cmd_result, *cmd_error; struct jsonrpc *client; + struct cmdl_args *args; char *cmd, **cmd_argv; - const char *target; + char *msg = NULL; int cmd_argc; int error; set_program_name(argv[0]); /* Parse command line and connect to target. */ - target = parse_command_line(argc, argv); - client = connect_to_target(target); + args = parse_command_line(argc, argv); + client = connect_to_target(args->target); + + /* Transact options request (if required) and process reply. */ + if (args->format != UNIXCTL_OUTPUT_FMT_TEXT) { + svec_add(&opt_argv, "--format"); + svec_add(&opt_argv, unixctl_output_fmt_to_string(args->format)); + } + svec_terminate(&opt_argv); + + if (!svec_is_empty(&opt_argv)) { + error = unixctl_client_transact(client, "set-options", + opt_argv.n, opt_argv.names, + &cmd_result, &cmd_error); + + if (error) { + ovs_fatal(error, "%s: transaction error", args->target); + } + + if (cmd_error) { + jsonrpc_close(client); + msg = reply_to_string(cmd_error, UNIXCTL_OUTPUT_FMT_TEXT); + fputs(msg, stderr); + free(msg); + ovs_error(0, "%s: server returned an error", args->target); + exit(2); + } - /* Transact request and process reply. */ + json_destroy(cmd_result); + json_destroy(cmd_error); + } + svec_destroy(&opt_argv); + + /* Transact command request and process reply. */ cmd = argv[optind++]; cmd_argc = argc - optind; cmd_argv = cmd_argc ? 
argv + optind : NULL; error = unixctl_client_transact(client, cmd, cmd_argc, cmd_argv, &cmd_result, &cmd_error); if (error) { - ovs_fatal(error, "%s: transaction error", target); + ovs_fatal(error, "%s: transaction error", args->target); } if (cmd_error) { jsonrpc_close(client); - fputs(cmd_error, stderr); - ovs_error(0, "%s: server returned an error", target); + msg = reply_to_string(cmd_error, UNIXCTL_OUTPUT_FMT_TEXT); + fputs(msg, stderr); + free(msg); + ovs_error(0, "%s: server returned an error", args->target); exit(2); } else if (cmd_result) { - fputs(cmd_result, stdout); + msg = reply_to_string(cmd_result, args->format); + fputs(msg, stdout); + free(msg); } else { OVS_NOT_REACHED(); } jsonrpc_close(client); - free(cmd_result); - free(cmd_error); + json_destroy(cmd_result); + json_destroy(cmd_error); + free(args); return 0; } @@ -101,13 +149,26 @@ Common commands:\n\ vlog/reopen Make the program reopen its log file\n\ Other options:\n\ --timeout=SECS wait at most SECS seconds for a response\n\ + -f, --format=FMT Output format. 
One of: 'json', or 'text'\n\ + (default: text)\n\ -h, --help Print this helpful information\n\ -V, --version Display ovs-appctl version information\n", program_name, program_name); exit(EXIT_SUCCESS); } -static const char * +static struct cmdl_args * +cmdl_args_create(void) +{ + struct cmdl_args *args = xmalloc(sizeof *args); + + args->format = UNIXCTL_OUTPUT_FMT_TEXT; + args->target = NULL; + + return args; +} + +static struct cmdl_args * parse_command_line(int argc, char *argv[]) { enum { @@ -117,6 +178,7 @@ parse_command_line(int argc, char *argv[]) static const struct option long_options[] = { {"target", required_argument, NULL, 't'}, {"execute", no_argument, NULL, 'e'}, + {"format", required_argument, NULL, 'f'}, {"help", no_argument, NULL, 'h'}, {"option", no_argument, NULL, 'o'}, {"version", no_argument, NULL, 'V'}, @@ -126,11 +188,10 @@ parse_command_line(int argc, char *argv[]) }; char *short_options_ = ovs_cmdl_long_options_to_short_options(long_options); char *short_options = xasprintf("+%s", short_options_); - const char *target; - int e_options; + struct cmdl_args *args = cmdl_args_create(); unsigned int timeout = 0; + int e_options; - target = NULL; e_options = 0; for (;;) { int option; @@ -141,10 +202,10 @@ parse_command_line(int argc, char *argv[]) } switch (option) { case 't': - if (target) { + if (args->target) { ovs_fatal(0, "-t or --target may be specified only once"); } - target = optarg; + args->target = optarg; break; case 'e': @@ -157,6 +218,12 @@ parse_command_line(int argc, char *argv[]) } break; + case 'f': + if (!unixctl_output_fmt_from_string(optarg, &args->format)) { + ovs_fatal(0, "value %s on -f or --format is invalid", optarg); + } + break; + case 'h': usage(); break; @@ -194,7 +261,10 @@ parse_command_line(int argc, char *argv[]) "(use --help for help)"); } - return target ? 
target : "ovs-vswitchd"; + if (!args->target) { + args->target = "ovs-vswitchd"; + } + return args; } static struct jsonrpc * @@ -236,3 +306,30 @@ connect_to_target(const char *target) return client; } +/* The caller is responsible for freeing the returned string, with free(), when + * it is no longer needed. */ +static char * +reply_to_string(struct json *reply, enum unixctl_output_fmt fmt) +{ + ovs_assert(reply); + + if (fmt == UNIXCTL_OUTPUT_FMT_TEXT && reply->type != JSON_STRING) { + ovs_error(0, "Unexpected reply type in JSON rpc reply: %s", + json_type_to_string(reply->type)); + exit(2); + } + + struct ds ds = DS_EMPTY_INITIALIZER; + + if (fmt == UNIXCTL_OUTPUT_FMT_TEXT) { + ds_put_cstr(&ds, json_string(reply)); + } else { + json_to_ds(reply, JSSF_SORT, &ds); + } + + if (ds_last(&ds) != EOF && ds_last(&ds) != '\n') { + ds_put_char(&ds, '\n'); + } + + return ds_steal_cstr(&ds); +} From 97a1bce6aae782b4ce994f78de6cae9e7667f0b1 Mon Sep 17 00:00:00 2001 From: Jakob Meng Date: Tue, 9 Jul 2024 09:14:18 +0200 Subject: [PATCH 767/833] python: Add option for JSON output to unixctl classes and appctl.py. This patch introduces support for different output formats to Python Unixctl* classes and appctl.py, similar to what the previous commit did for ovs-appctl. In particular, tests/appctl.py gains a global option '-f,--format' which allows users to request JSON instead of plain-text for humans. Reported-at: https://bugzilla.redhat.com/1824861 Signed-off-by: Jakob Meng Signed-off-by: Ilya Maximets --- NEWS | 2 ++ python/ovs/unixctl/__init__.py | 8 +++++ python/ovs/unixctl/client.py | 5 ++-- python/ovs/unixctl/server.py | 53 +++++++++++++++++++++++++++++----- tests/appctl.py | 39 ++++++++++++++++++++----- tests/unixctl-py.at | 7 +++++ 6 files changed, 98 insertions(+), 16 deletions(-) diff --git a/NEWS b/NEWS index f182647c7fb..c750ebae205 100644 --- a/NEWS +++ b/NEWS @@ -19,6 +19,8 @@ Post-v3.3.0 per interface 'options:dpdk-lsc-interrupt' to 'false'. 
- Python: * Added custom transaction support to the Idl via add_op(). + * Added support for different output formats like 'json' to Python's + unixctl classes. v3.3.0 - 16 Feb 2024 diff --git a/python/ovs/unixctl/__init__.py b/python/ovs/unixctl/__init__.py index 8ee31294339..b05f3df7203 100644 --- a/python/ovs/unixctl/__init__.py +++ b/python/ovs/unixctl/__init__.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import enum import sys import ovs.util @@ -19,6 +20,13 @@ commands = {} +@enum.unique +# FIXME: Use @enum.verify(enum.NAMED_FLAGS) from Python 3.11 when available. +class UnixctlOutputFormat(enum.IntFlag): + TEXT = 1 << 0 + JSON = 1 << 1 + + class _UnixctlCommand(object): def __init__(self, usage, min_args, max_args, callback, aux): self.usage = usage diff --git a/python/ovs/unixctl/client.py b/python/ovs/unixctl/client.py index 8283f99bbfc..8a6fcb1b985 100644 --- a/python/ovs/unixctl/client.py +++ b/python/ovs/unixctl/client.py @@ -14,6 +14,7 @@ import os +import ovs.json import ovs.jsonrpc import ovs.stream import ovs.util @@ -41,10 +42,10 @@ def transact(self, command, argv): return error, None, None if reply.error is not None: - return 0, str(reply.error), None + return 0, reply.error, None else: assert reply.result is not None - return 0, None, str(reply.result) + return 0, None, reply.result def close(self): self._conn.close() diff --git a/python/ovs/unixctl/server.py b/python/ovs/unixctl/server.py index d24a7092c15..9a58a38d52d 100644 --- a/python/ovs/unixctl/server.py +++ b/python/ovs/unixctl/server.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import argparse import copy import errno import os @@ -35,6 +36,7 @@ def __init__(self, rpc): assert isinstance(rpc, ovs.jsonrpc.Connection) self._rpc = rpc self._request_id = None + self._fmt = ovs.unixctl.UnixctlOutputFormat.TEXT def run(self): self._rpc.run() @@ -63,10 +65,29 @@ def run(self): return error def reply(self, body): - self._reply_impl(True, body) + assert body is None or isinstance(body, str) + + if body is None: + body = "" + + if self._fmt == ovs.unixctl.UnixctlOutputFormat.JSON: + body = { + "reply-format": "plain", + "reply": body + } + + return self._reply_impl_json(True, body) + + def reply_json(self, body): + self._reply_impl_json(True, body) def reply_error(self, body): - self._reply_impl(False, body) + assert body is None or isinstance(body, str) + + if body is None: + body = "" + + return self._reply_impl_json(False, body) # Called only by unixctl classes. def _close(self): @@ -78,15 +99,11 @@ def _wait(self, poller): if not self._rpc.get_backlog(): self._rpc.recv_wait(poller) - def _reply_impl(self, success, body): + def _reply_impl_json(self, success, body): assert isinstance(success, bool) - assert body is None or isinstance(body, str) assert self._request_id is not None - if body is None: - body = "" - if success: reply = Message.create_reply(body, self._request_id) else: @@ -133,6 +150,25 @@ def _unixctl_version(conn, unused_argv, version): conn.reply(version) +def _unixctl_set_options(conn, argv, unused_aux): + assert isinstance(conn, UnixctlConnection) + + parser = argparse.ArgumentParser() + parser.add_argument("--format", default="text", + choices=[fmt.name.lower() + for fmt in ovs.unixctl.UnixctlOutputFormat], + type=str.lower) + + try: + args = parser.parse_args(args=argv) + except argparse.ArgumentError as e: + conn.reply_error(str(e)) + return + + conn._fmt = ovs.unixctl.UnixctlOutputFormat[args.format.upper()] + conn.reply(None) + + class UnixctlServer(object): def __init__(self, listener): assert isinstance(listener, 
ovs.stream.PassiveStream) @@ -207,4 +243,7 @@ def create(path, version=None): ovs.unixctl.command_register("version", "", 0, 0, _unixctl_version, version) + ovs.unixctl.command_register("set-options", "[--format text|json]", 1, + 2, _unixctl_set_options, None) + return 0, UnixctlServer(listener) diff --git a/tests/appctl.py b/tests/appctl.py index e5cc2813883..4aca7efbc15 100644 --- a/tests/appctl.py +++ b/tests/appctl.py @@ -37,6 +37,18 @@ def connect_to_target(target): return client +def reply_to_string(reply, fmt=ovs.unixctl.UnixctlOutputFormat.TEXT): + if fmt == ovs.unixctl.UnixctlOutputFormat.TEXT: + body = str(reply) + else: + body = ovs.json.to_string(reply) + + if body and not body.endswith("\n"): + body += "\n" + + return body + + def main(): parser = argparse.ArgumentParser(description="Python Implementation of" " ovs-appctl.") @@ -49,30 +61,43 @@ def main(): help="Arguments to the command.") parser.add_argument("-T", "--timeout", metavar="SECS", help="wait at most SECS seconds for a response") + parser.add_argument("-f", "--format", metavar="FMT", + help="Output format.", default="text", + choices=[fmt.name.lower() + for fmt in ovs.unixctl.UnixctlOutputFormat], + type=str.lower) args = parser.parse_args() signal_alarm(int(args.timeout) if args.timeout else None) ovs.vlog.Vlog.init() target = args.target + format = ovs.unixctl.UnixctlOutputFormat[args.format.upper()] client = connect_to_target(target) + + if format != ovs.unixctl.UnixctlOutputFormat.TEXT: + err_no, error, _ = client.transact( + "set-options", ["--format", args.format]) + + if err_no: + ovs.util.ovs_fatal(err_no, "%s: transaction error" % target) + elif error is not None: + sys.stderr.write(reply_to_string(error)) + ovs.util.ovs_error(0, "%s: server returned an error" % target) + sys.exit(2) + err_no, error, result = client.transact(args.command, args.argv) client.close() if err_no: ovs.util.ovs_fatal(err_no, "%s: transaction error" % target) elif error is not None: - 
sys.stderr.write(error) - if error and not error.endswith("\n"): - sys.stderr.write("\n") - + sys.stderr.write(reply_to_string(error)) ovs.util.ovs_error(0, "%s: server returned an error" % target) sys.exit(2) else: assert result is not None - sys.stdout.write(result) - if result and not result.endswith("\n"): - sys.stdout.write("\n") + sys.stdout.write(reply_to_string(result, format)) if __name__ == '__main__': diff --git a/tests/unixctl-py.at b/tests/unixctl-py.at index 72400611822..f4a664dc0e8 100644 --- a/tests/unixctl-py.at +++ b/tests/unixctl-py.at @@ -100,6 +100,7 @@ The available commands are: exit help log [[arg ...]] + set-options [[--format text|json]] version vlog/close vlog/list @@ -112,6 +113,12 @@ AT_CHECK([PYAPPCTL_PY -t test-unixctl.py help], [0], [expout]) AT_CHECK([ovs-vsctl --version | sed 's/ovs-vsctl/test-unixctl.py/' | head -1 > expout]) AT_CHECK([APPCTL -t test-unixctl.py version], [0], [expout]) AT_CHECK([PYAPPCTL_PY -t test-unixctl.py version], [0], [expout]) +AT_CHECK_UNQUOTED([PYAPPCTL_PY -t test-unixctl.py --format json version], [0], [dnl +{"reply":"$(cat expout)","reply-format":"plain"} +]) +AT_CHECK_UNQUOTED([PYAPPCTL_PY -t test-unixctl.py --format JSON version], [0], [dnl +{"reply":"$(cat expout)","reply-format":"plain"} +]) AT_CHECK([APPCTL -t test-unixctl.py echo robot ninja], [0], [stdout]) AT_CHECK([cat stdout | sed -e "s/u'/'/g"], [0], [dnl From a0925cef7db40decabd5cb473e8166eef6ddd89d Mon Sep 17 00:00:00 2001 From: Jakob Meng Date: Tue, 9 Jul 2024 09:14:19 +0200 Subject: [PATCH 768/833] appctl: Add option '--pretty' for pretty-printing JSON output. With the '--pretty' option, ovs-appctl will now print JSON output in a more readable fashion, i.e. with additional line breaks, spaces and sorted dictionary keys. 
Signed-off-by: Jakob Meng Signed-off-by: Ilya Maximets --- Documentation/ref/ovs-appctl.8.rst | 8 +++++++ NEWS | 1 + tests/ovs-vswitchd.at | 6 ++++++ utilities/ovs-appctl.c | 34 ++++++++++++++++++++++++------ 4 files changed, 42 insertions(+), 7 deletions(-) diff --git a/Documentation/ref/ovs-appctl.8.rst b/Documentation/ref/ovs-appctl.8.rst index 148cc763295..7054cf559e5 100644 --- a/Documentation/ref/ovs-appctl.8.rst +++ b/Documentation/ref/ovs-appctl.8.rst @@ -9,6 +9,7 @@ Synopsis [``--target=`` | ``-t`` ] [``--timeout=`` | ``-T`` ] [``--format=`` | ``-f`` ] +[``--pretty``] [...] ``ovs-appctl --help`` @@ -79,6 +80,13 @@ In normal use only a single option is accepted: {"reply-format":"plain","reply":"$PLAIN_TEXT_HERE"} +* ``--pretty`` + + By default, JSON output is printed as compactly as possible. This option + causes JSON in output to be printed in a more readable fashion. For + example, members of objects and elements of arrays are printed one + per line, with indentation. Requires ``--format=json``. + Common Commands =============== diff --git a/NEWS b/NEWS index c750ebae205..d903d2f74a9 100644 --- a/NEWS +++ b/NEWS @@ -5,6 +5,7 @@ Post-v3.3.0 - ovs-appctl: * Added new option [-f|--format] to choose the output format, e.g. 'json' or 'text' (by default). + * Added new option [--pretty] to print JSON output in a readable fashion. 
- Userspace datapath: * Conntrack now supports 'random' flag for selecting ports in a range while natting and 'persistent' flag for selection of the IP address diff --git a/tests/ovs-vswitchd.at b/tests/ovs-vswitchd.at index b1ae1ae1edc..0f7a6085e1e 100644 --- a/tests/ovs-vswitchd.at +++ b/tests/ovs-vswitchd.at @@ -276,4 +276,10 @@ AT_CHECK_UNQUOTED([ovs-appctl --format json version], [0], [dnl {"reply":"$ovs_version","reply-format":"plain"} ]) +AT_CHECK_UNQUOTED([ovs-appctl --format json --pretty version], [0], [dnl +{ + "reply": "$ovs_version", + "reply-format": "plain"} +]) + AT_CLEANUP diff --git a/utilities/ovs-appctl.c b/utilities/ovs-appctl.c index 721698755c9..682ee100ce0 100644 --- a/utilities/ovs-appctl.c +++ b/utilities/ovs-appctl.c @@ -40,13 +40,15 @@ static void usage(void); /* Parsed command line args. */ struct cmdl_args { enum unixctl_output_fmt format; + unsigned int format_flags; char *target; }; static struct cmdl_args *cmdl_args_create(void); static struct cmdl_args *parse_command_line(int argc, char *argv[]); static struct jsonrpc *connect_to_target(const char *target); -static char *reply_to_string(struct json *reply, enum unixctl_output_fmt fmt); +static char *reply_to_string(struct json *reply, enum unixctl_output_fmt fmt, + unsigned int fmt_flags); int main(int argc, char *argv[]) @@ -84,7 +86,7 @@ main(int argc, char *argv[]) if (cmd_error) { jsonrpc_close(client); - msg = reply_to_string(cmd_error, UNIXCTL_OUTPUT_FMT_TEXT); + msg = reply_to_string(cmd_error, UNIXCTL_OUTPUT_FMT_TEXT, 0); fputs(msg, stderr); free(msg); ovs_error(0, "%s: server returned an error", args->target); @@ -108,13 +110,13 @@ main(int argc, char *argv[]) if (cmd_error) { jsonrpc_close(client); - msg = reply_to_string(cmd_error, UNIXCTL_OUTPUT_FMT_TEXT); + msg = reply_to_string(cmd_error, UNIXCTL_OUTPUT_FMT_TEXT, 0); fputs(msg, stderr); free(msg); ovs_error(0, "%s: server returned an error", args->target); exit(2); } else if (cmd_result) { - msg = 
reply_to_string(cmd_result, args->format); + msg = reply_to_string(cmd_result, args->format, args->format_flags); fputs(msg, stdout); free(msg); } else { @@ -151,6 +153,8 @@ Other options:\n\ --timeout=SECS wait at most SECS seconds for a response\n\ -f, --format=FMT Output format. One of: 'json', or 'text'\n\ (default: text)\n\ + --pretty Format the output in a more readable fashion.\n\ + Requires: --format=json.\n\ -h, --help Print this helpful information\n\ -V, --version Display ovs-appctl version information\n", program_name, program_name); @@ -163,6 +167,7 @@ cmdl_args_create(void) struct cmdl_args *args = xmalloc(sizeof *args); args->format = UNIXCTL_OUTPUT_FMT_TEXT; + args->format_flags = 0; args->target = NULL; return args; @@ -173,7 +178,8 @@ parse_command_line(int argc, char *argv[]) { enum { OPT_START = UCHAR_MAX + 1, - VLOG_OPTION_ENUMS + OPT_PRETTY, + VLOG_OPTION_ENUMS, }; static const struct option long_options[] = { {"target", required_argument, NULL, 't'}, @@ -181,6 +187,7 @@ parse_command_line(int argc, char *argv[]) {"format", required_argument, NULL, 'f'}, {"help", no_argument, NULL, 'h'}, {"option", no_argument, NULL, 'o'}, + {"pretty", no_argument, NULL, OPT_PRETTY}, {"version", no_argument, NULL, 'V'}, {"timeout", required_argument, NULL, 'T'}, VLOG_LONG_OPTIONS, @@ -190,6 +197,7 @@ parse_command_line(int argc, char *argv[]) char *short_options = xasprintf("+%s", short_options_); struct cmdl_args *args = cmdl_args_create(); unsigned int timeout = 0; + bool pretty = false; int e_options; e_options = 0; @@ -232,6 +240,10 @@ parse_command_line(int argc, char *argv[]) ovs_cmdl_print_options(long_options); exit(EXIT_SUCCESS); + case OPT_PRETTY: + pretty = true; + break; + case 'T': if (!str_to_uint(optarg, 10, &timeout) || !timeout) { ovs_fatal(0, "value %s on -T or --timeout is invalid", optarg); @@ -261,6 +273,13 @@ parse_command_line(int argc, char *argv[]) "(use --help for help)"); } + if (pretty) { + if (args->format != 
UNIXCTL_OUTPUT_FMT_JSON) { + ovs_fatal(0, "--pretty is supported with --format json only"); + } + args->format_flags |= JSSF_PRETTY; + } + if (!args->target) { args->target = "ovs-vswitchd"; } @@ -309,7 +328,8 @@ connect_to_target(const char *target) /* The caller is responsible for freeing the returned string, with free(), when * it is no longer needed. */ static char * -reply_to_string(struct json *reply, enum unixctl_output_fmt fmt) +reply_to_string(struct json *reply, enum unixctl_output_fmt fmt, + unsigned int fmt_flags) { ovs_assert(reply); @@ -324,7 +344,7 @@ reply_to_string(struct json *reply, enum unixctl_output_fmt fmt) if (fmt == UNIXCTL_OUTPUT_FMT_TEXT) { ds_put_cstr(&ds, json_string(reply)); } else { - json_to_ds(reply, JSSF_SORT, &ds); + json_to_ds(reply, JSSF_SORT | fmt_flags, &ds); } if (ds_last(&ds) != EOF && ds_last(&ds) != '\n') { From 3c572af65e54ef97aeb339c91497ba213cd9c515 Mon Sep 17 00:00:00 2001 From: Jakob Meng Date: Tue, 9 Jul 2024 09:14:20 +0200 Subject: [PATCH 769/833] python: Add option for pretty-printing JSON output to appctl.py. With the '--pretty' option, appctl.py will now print JSON output in a more readable fashion, i.e. with additional line breaks, spaces and sorted dictionary keys. The pretty-printed output from appctl.py is not strictly the same as with ovs-appctl because both use different pretty-printing implementations. 
Signed-off-by: Jakob Meng Signed-off-by: Ilya Maximets --- tests/appctl.py | 15 ++++++++++++--- tests/unixctl-py.at | 6 ++++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/tests/appctl.py b/tests/appctl.py index 4aca7efbc15..5f4b2754a33 100644 --- a/tests/appctl.py +++ b/tests/appctl.py @@ -37,11 +37,12 @@ def connect_to_target(target): return client -def reply_to_string(reply, fmt=ovs.unixctl.UnixctlOutputFormat.TEXT): +def reply_to_string(reply, fmt=ovs.unixctl.UnixctlOutputFormat.TEXT, + fmt_flags={}): if fmt == ovs.unixctl.UnixctlOutputFormat.TEXT: body = str(reply) else: - body = ovs.json.to_string(reply) + body = ovs.json.to_string(reply, **fmt_flags) if body and not body.endswith("\n"): body += "\n" @@ -66,13 +67,21 @@ def main(): choices=[fmt.name.lower() for fmt in ovs.unixctl.UnixctlOutputFormat], type=str.lower) + parser.add_argument("--pretty", action="store_true", + help="Format the output in a more readable fashion." + " Requires: --format json.") args = parser.parse_args() + if (args.format != ovs.unixctl.UnixctlOutputFormat.JSON.name.lower() + and args.pretty): + ovs.util.ovs_fatal(0, "--pretty is supported with --format json only") + signal_alarm(int(args.timeout) if args.timeout else None) ovs.vlog.Vlog.init() target = args.target format = ovs.unixctl.UnixctlOutputFormat[args.format.upper()] + format_flags = dict(pretty=True) if args.pretty else {} client = connect_to_target(target) if format != ovs.unixctl.UnixctlOutputFormat.TEXT: @@ -97,7 +106,7 @@ def main(): sys.exit(2) else: assert result is not None - sys.stdout.write(reply_to_string(result, format)) + sys.stdout.write(reply_to_string(result, format, format_flags)) if __name__ == '__main__': diff --git a/tests/unixctl-py.at b/tests/unixctl-py.at index f4a664dc0e8..ae8bd5ad189 100644 --- a/tests/unixctl-py.at +++ b/tests/unixctl-py.at @@ -119,6 +119,12 @@ AT_CHECK_UNQUOTED([PYAPPCTL_PY -t test-unixctl.py --format json version], [0], [ AT_CHECK_UNQUOTED([PYAPPCTL_PY -t 
test-unixctl.py --format JSON version], [0], [dnl {"reply":"$(cat expout)","reply-format":"plain"} ]) +AT_CHECK_UNQUOTED([PYAPPCTL_PY -t test-unixctl.py --format json --pretty version], [0], [dnl +{ + "reply":"$(cat expout)", + "reply-format":"plain" +} +]) AT_CHECK([APPCTL -t test-unixctl.py echo robot ninja], [0], [stdout]) AT_CHECK([cat stdout | sed -e "s/u'/'/g"], [0], [dnl From 379d036ac7c923463ad36628d7c703950df46b66 Mon Sep 17 00:00:00 2001 From: Jakob Meng Date: Tue, 9 Jul 2024 09:14:21 +0200 Subject: [PATCH 770/833] vswitchd: Add JSON output for 'list-commands' command. The 'list-commands' command now supports machine-readable JSON output in addition to the plain-text output for humans. Reported-at: https://bugzilla.redhat.com/1824861 Signed-off-by: Jakob Meng Signed-off-by: Ilya Maximets --- NEWS | 1 + lib/unixctl.c | 42 +++++++++++++++++++++++++++++------------- tests/ovs-vswitchd.at | 15 +++++++++++++++ 3 files changed, 45 insertions(+), 13 deletions(-) diff --git a/NEWS b/NEWS index d903d2f74a9..feebae86e94 100644 --- a/NEWS +++ b/NEWS @@ -6,6 +6,7 @@ Post-v3.3.0 * Added new option [-f|--format] to choose the output format, e.g. 'json' or 'text' (by default). * Added new option [--pretty] to print JSON output in a readable fashion. + * 'list-commands' now supports output in JSON format. 
- Userspace datapath: * Conntrack now supports 'random' flag for selecting ports in a range while natting and 'persistent' flag for selection of the IP address diff --git a/lib/unixctl.c b/lib/unixctl.c index e7ce77e2ce0..c060e86597d 100644 --- a/lib/unixctl.c +++ b/lib/unixctl.c @@ -95,24 +95,40 @@ static void unixctl_list_commands(struct unixctl_conn *conn, int argc OVS_UNUSED, const char *argv[] OVS_UNUSED, void *aux OVS_UNUSED) { - struct ds ds = DS_EMPTY_INITIALIZER; - const struct shash_node **nodes = shash_sort(&commands); - size_t i; + if (unixctl_command_get_output_format(conn) == UNIXCTL_OUTPUT_FMT_JSON) { + struct json *json_commands = json_object_create(); + const struct shash_node *node; - ds_put_cstr(&ds, "The available commands are:\n"); + SHASH_FOR_EACH (node, &commands) { + const struct unixctl_command *command = node->data; - for (i = 0; i < shash_count(&commands); i++) { - const struct shash_node *node = nodes[i]; - const struct unixctl_command *command = node->data; + if (command->usage) { + json_object_put_string(json_commands, node->name, + command->usage); + } + } + unixctl_command_reply_json(conn, json_commands); + } else { + struct ds ds = DS_EMPTY_INITIALIZER; + const struct shash_node **nodes = shash_sort(&commands); + size_t i; + + ds_put_cstr(&ds, "The available commands are:\n"); - if (command->usage) { - ds_put_format(&ds, " %-23s %s\n", node->name, command->usage); + for (i = 0; i < shash_count(&commands); ++i) { + const struct shash_node *node = nodes[i]; + const struct unixctl_command *command = node->data; + + if (command->usage) { + ds_put_format(&ds, " %-23s %s\n", node->name, + command->usage); + } } - } - free(nodes); + free(nodes); - unixctl_command_reply(conn, ds_cstr(&ds)); - ds_destroy(&ds); + unixctl_command_reply(conn, ds_cstr(&ds)); + ds_destroy(&ds); + } } static void diff --git a/tests/ovs-vswitchd.at b/tests/ovs-vswitchd.at index 0f7a6085e1e..730363e8357 100644 --- a/tests/ovs-vswitchd.at +++ b/tests/ovs-vswitchd.at 
@@ -283,3 +283,18 @@ AT_CHECK_UNQUOTED([ovs-appctl --format json --pretty version], [0], [dnl ]) AT_CLEANUP + +AT_SETUP([ovs-vswitchd list-commands]) +OVS_VSWITCHD_START + +AT_CHECK([ovs-appctl list-commands], [0], [ignore]) +AT_CHECK([ovs-appctl --format json list-commands], [0], [stdout]) + +# Check that ovs-appctl prints a single line with a trailing newline. +AT_CHECK([wc -l stdout], [0], [1 stdout +]) + +# Check that ovs-appctl prints a JSON document. +AT_CHECK([ovstest test-json stdout], [0], [ignore]) + +AT_CLEANUP From 4935e893256af22620619dc4d94cd194271a1158 Mon Sep 17 00:00:00 2001 From: Jakob Meng Date: Tue, 9 Jul 2024 09:14:22 +0200 Subject: [PATCH 771/833] ofproto: Add JSON output for 'dpif/show' command. The 'dpif/show' command now supports machine-readable JSON output in addition to the plain-text output for humans. An example would be: ovs-appctl --format json dpif/show Reported-at: https://bugzilla.redhat.com/1824861 Signed-off-by: Jakob Meng Signed-off-by: Ilya Maximets --- NEWS | 2 +- ofproto/ofproto-dpif.c | 124 ++++++++++++++++++++++++++++++++++++----- tests/ofproto-dpif.at | 40 +++++++++++++ 3 files changed, 152 insertions(+), 14 deletions(-) diff --git a/NEWS b/NEWS index feebae86e94..d18693315ea 100644 --- a/NEWS +++ b/NEWS @@ -6,7 +6,7 @@ Post-v3.3.0 * Added new option [-f|--format] to choose the output format, e.g. 'json' or 'text' (by default). * Added new option [--pretty] to print JSON output in a readable fashion. - * 'list-commands' now supports output in JSON format. + * 'dpif/show' and 'list-commands' now support output in JSON format. 
- Userspace datapath: * Conntrack now supports 'random' flag for selecting ports in a range while natting and 'persistent' flag for selection of the IP address diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index fcd7cd753ca..87dfb0043dd 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -28,6 +28,7 @@ #include "fail-open.h" #include "guarded-list.h" #include "hmapx.h" +#include "openvswitch/json.h" #include "lacp.h" #include "learn.h" #include "mac-learning.h" @@ -6519,19 +6520,108 @@ dpif_set_support(struct dpif_backer_support *rt_support, return changed; } +static struct json * +dpif_show_backer_json(struct json *backers, const struct dpif_backer *backer) +{ + struct json *json_backer = json_object_create(); + + /* Add datapath as new JSON object using its name as key. */ + json_object_put(backers, dpif_name(backer->dpif), json_backer); + + /* Add datapath's stats under "stats" key. */ + struct json *json_dp_stats = json_object_create(); + struct dpif_dp_stats dp_stats; + + dpif_get_dp_stats(backer->dpif, &dp_stats); + json_object_put_format(json_dp_stats, "hit", "%"PRIu64, dp_stats.n_hit); + json_object_put_format(json_dp_stats, "missed", "%"PRIu64, + dp_stats.n_missed); + json_object_put(json_backer, "stats", json_dp_stats); + + /* Add datapath's bridges under "bridges" key. */ + struct json *json_dp_bridges = json_object_create(); + + struct shash ofproto_shash = SHASH_INITIALIZER(&ofproto_shash); + free(get_ofprotos(&ofproto_shash)); + + struct shash_node *node; + SHASH_FOR_EACH (node, &ofproto_shash) { + struct ofproto_dpif *ofproto = node->data; + + if (ofproto->backer != backer) { + continue; + } + + /* Add bridge to "bridges" dictionary using its name as key. */ + struct json *json_ofproto = json_object_create(); + + /* Add bridge ports to the current bridge dictionary. */ + const struct shash_node *port; + SHASH_FOR_EACH (port, &ofproto->up.port_by_name) { + /* Add bridge port to a bridge's dict using port name as key. 
*/ + struct json *json_ofproto_port = json_object_create(); + struct ofport *ofport = port->data; + + /* Add OpenFlow port associated with a bridge port. */ + json_object_put_format(json_ofproto_port, "ofport", "%"PRIu32, + ofport->ofp_port); + + /* Add bridge port number. */ + odp_port_t odp_port = ofp_port_to_odp_port(ofproto, + ofport->ofp_port); + if (odp_port != ODPP_NONE) { + json_object_put_format(json_ofproto_port, "port_no", + "%"PRIu32, odp_port); + } else { + json_object_put_string(json_ofproto_port, "port_no", "none"); + } + + /* Add type of a bridge port. */ + json_object_put_string(json_ofproto_port, "type", + netdev_get_type(ofport->netdev)); + + /* Add config entries for a bridge port. */ + + struct smap config = SMAP_INITIALIZER(&config); + + if (!netdev_get_config(ofport->netdev, &config) + && smap_count(&config)) { + struct json *json_port_config = json_object_create(); + struct smap_node *cfg_node; + + SMAP_FOR_EACH (cfg_node, &config) { + json_object_put_string(json_port_config, cfg_node->key, + cfg_node->value); + } + json_object_put(json_ofproto_port, "config", json_port_config); + } + smap_destroy(&config); + + json_object_put(json_ofproto, netdev_get_name(ofport->netdev), + json_ofproto_port); + } /* End of bridge port(s). */ + + json_object_put(json_dp_bridges, ofproto->up.name, json_ofproto); + } /* End of bridge(s). 
*/ + + shash_destroy(&ofproto_shash); + + json_object_put(json_backer, "bridges", json_dp_bridges); + return json_backer; +} + static void -dpif_show_backer(const struct dpif_backer *backer, struct ds *ds) +dpif_show_backer_text(const struct dpif_backer *backer, struct ds *ds) { + struct shash ofproto_shash = SHASH_INITIALIZER(&ofproto_shash); const struct shash_node **ofprotos; struct dpif_dp_stats dp_stats; - struct shash ofproto_shash; size_t i; dpif_get_dp_stats(backer->dpif, &dp_stats); ds_put_format(ds, "%s: hit:%"PRIu64" missed:%"PRIu64"\n", dpif_name(backer->dpif), dp_stats.n_hit, dp_stats.n_missed); - shash_init(&ofproto_shash); ofprotos = get_ofprotos(&ofproto_shash); for (i = 0; i < shash_count(&ofproto_shash); i++) { struct ofproto_dpif *ofproto = ofprotos[i]->data; @@ -6587,18 +6677,26 @@ static void ofproto_unixctl_dpif_show(struct unixctl_conn *conn, int argc OVS_UNUSED, const char *argv[] OVS_UNUSED, void *aux OVS_UNUSED) { - struct ds ds = DS_EMPTY_INITIALIZER; - const struct shash_node **backers; - int i; + if (unixctl_command_get_output_format(conn) == UNIXCTL_OUTPUT_FMT_JSON) { + struct json *backers = json_object_create(); + const struct shash_node *backer; - backers = shash_sort(&all_dpif_backers); - for (i = 0; i < shash_count(&all_dpif_backers); i++) { - dpif_show_backer(backers[i]->data, &ds); - } - free(backers); + SHASH_FOR_EACH (backer, &all_dpif_backers) { + dpif_show_backer_json(backers, backer->data); + } + unixctl_command_reply_json(conn, backers); + } else { + const struct shash_node **backers = shash_sort(&all_dpif_backers); + struct ds ds = DS_EMPTY_INITIALIZER; - unixctl_command_reply(conn, ds_cstr(&ds)); - ds_destroy(&ds); + for (int i = 0; i < shash_count(&all_dpif_backers); i++) { + dpif_show_backer_text(backers[i]->data, &ds); + } + free(backers); + + unixctl_command_reply(conn, ds_cstr(&ds)); + ds_destroy(&ds); + } } static void diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at index 0b23fd6c5ea..30ef0468c8d 100644 
--- a/tests/ofproto-dpif.at +++ b/tests/ofproto-dpif.at @@ -8879,6 +8879,46 @@ dummy@ovs-dummy: hit:0 missed:0 br1 65534/101: (dummy-internal) p3 3/3: (dummy) ]) + +AT_CHECK([ovs-appctl --format json --pretty dpif/show], [0], [dnl +[{ + "dummy@ovs-dummy": { + "bridges": { + "br0": { + "br0": { + "ofport": "65534", + "port_no": "100", + "type": "dummy-internal"}, + "p1": { + "config": { + "n_rxq": "1", + "n_txq": "1", + "numa_id": "0"}, + "ofport": "1", + "port_no": "1", + "type": "dummy-pmd"}, + "p2": { + "config": { + "n_rxq": "1", + "n_txq": "1", + "numa_id": "0"}, + "ofport": "2", + "port_no": "2", + "type": "dummy-pmd"}}, + "br1": { + "br1": { + "ofport": "65534", + "port_no": "101", + "type": "dummy-internal"}, + "p3": { + "ofport": "3", + "port_no": "3", + "type": "dummy"}}}, + "stats": { + "hit": "0", + "missed": "0"}}}] +]) + OVS_VSWITCHD_STOP AT_CLEANUP From d7e77143fbdc855ce949a3f23e6dc7be5a6e0cd9 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Fri, 5 Jul 2024 16:45:01 -0400 Subject: [PATCH 772/833] tunnel: Allow UDP zero checksum with IPv6 tunnels. This patch adopts the proposed RFC 6935 by allowing null UDP checksums even if the tunnel protocol is IPv6. This is already supported by Linux through the udp6zerocsumtx tunnel option. It is disabled by default and IPv6 tunnels are flagged as requiring a checksum, but this patch enables the user to set csum=false on IPv6 tunnels. 
Acked-by: Simon Horman Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- NEWS | 11 +++++++++++ lib/netdev-native-tnl.c | 2 +- lib/netdev-vport.c | 17 +++++++++++++++-- lib/netdev.h | 18 +++++++++++++++++- ofproto/tunnel.c | 10 ++++++++-- tests/tunnel-push-pop-ipv6.at | 9 +++++++++ tests/tunnel-push-pop.at | 7 +++++++ tests/tunnel.at | 2 +- vswitchd/vswitch.xml | 12 +++++++++--- 9 files changed, 78 insertions(+), 10 deletions(-) diff --git a/NEWS b/NEWS index d18693315ea..bd97ea3c5aa 100644 --- a/NEWS +++ b/NEWS @@ -11,6 +11,10 @@ Post-v3.3.0 * Conntrack now supports 'random' flag for selecting ports in a range while natting and 'persistent' flag for selection of the IP address from a range. + * IPv6 UDP tunnel encapsulation including Geneve and VXLAN will now + honour the csum option. Configuring the interface with + "options:csum=false" now has the same effect as the udp6zerocsumtx + option has with Linux kernel UDP tunnels. - The primary development branch has been renamed from 'master' to 'main'. The OVS tree remains hosted on GitHub. https://github.com/openvswitch/ovs.git @@ -23,6 +27,13 @@ Post-v3.3.0 * Added custom transaction support to the Idl via add_op(). * Added support for different output formats like 'json' to Python's unixctl classes. + - Tunnels: + * Previously the kernel datapath did not enable UDP checksums by default + in IPv6 tunnels. This behaviour is non-standard, differs from the + Linux kernel, and is also different than the userspace datapath. Now + these tunnels will calculate checksums by default and that behaviour can + be changed with "options:csum=false" just as with the userspace + datapath. 
v3.3.0 - 16 Feb 2024 diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c index 0f9f07f44b6..16c56608d87 100644 --- a/lib/netdev-native-tnl.c +++ b/lib/netdev-native-tnl.c @@ -384,7 +384,7 @@ udp_build_header(const struct netdev_tunnel_config *tnl_cfg, udp = netdev_tnl_ip_build_header(data, params, IPPROTO_UDP, 0); udp->udp_dst = tnl_cfg->dst_port; - if (params->is_ipv6 || params->flow->tunnel.flags & FLOW_TNL_F_CSUM) { + if (params->flow->tunnel.flags & FLOW_TNL_F_CSUM) { /* Write a value in now to mark that we should compute the checksum * later. 0xffff is handy because it is transparent to the * calculation. */ diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c index 60caa02fbb9..234a4ebe127 100644 --- a/lib/netdev-vport.c +++ b/lib/netdev-vport.c @@ -702,7 +702,9 @@ set_tunnel_config(struct netdev *dev_, const struct smap *args, char **errp) tnl_cfg.dst_port = htons(atoi(node->value)); } else if (!strcmp(node->key, "csum") && has_csum) { if (!strcmp(node->value, "true")) { - tnl_cfg.csum = true; + tnl_cfg.csum = NETDEV_TNL_CSUM_ENABLED; + } else if (!strcmp(node->value, "false")) { + tnl_cfg.csum = NETDEV_TNL_CSUM_DISABLED; } } else if (!strcmp(node->key, "seq") && has_seq) { if (!strcmp(node->value, "true")) { @@ -850,6 +852,15 @@ set_tunnel_config(struct netdev *dev_, const struct smap *args, char **errp) } } + /* The default csum state for GRE is special as it does have an optional + * checksum but the default configuration isn't correlated with IP version + * like UDP tunnels are. Likewise, tunnels with no checksum at all must be + * in this state. */ + if (tnl_cfg.csum == NETDEV_TNL_CSUM_DEFAULT && + (!has_csum || strstr(type, "gre"))) { + tnl_cfg.csum = NETDEV_TNL_DEFAULT_NO_CSUM; + } + enum tunnel_layers layers = tunnel_supported_layers(type, &tnl_cfg); const char *full_type = (strcmp(type, "vxlan") ? 
type : (tnl_cfg.exts & (1 << OVS_VXLAN_EXT_GPE) @@ -1026,8 +1037,10 @@ get_tunnel_config(const struct netdev *dev, struct smap *args) } } - if (tnl_cfg->csum) { + if (tnl_cfg->csum == NETDEV_TNL_CSUM_ENABLED) { smap_add(args, "csum", "true"); + } else if (tnl_cfg->csum == NETDEV_TNL_CSUM_DISABLED) { + smap_add(args, "csum", "false"); } if (tnl_cfg->set_seq) { diff --git a/lib/netdev.h b/lib/netdev.h index 67a8486bdba..63e03d72db4 100644 --- a/lib/netdev.h +++ b/lib/netdev.h @@ -111,6 +111,22 @@ enum netdev_srv6_flowlabel { SRV6_FLOWLABEL_COMPUTE, }; +enum netdev_tnl_csum { + /* Default value for UDP tunnels if no configuration is present. Enforce + * checksum calculation in IPv6 tunnels, disable in IPv4 tunnels. */ + NETDEV_TNL_CSUM_DEFAULT = 0, + + /* Checksum explicitly to be calculated. */ + NETDEV_TNL_CSUM_ENABLED, + + /* Checksum calculation explicitly disabled. */ + NETDEV_TNL_CSUM_DISABLED, + + /* A value for when there is no checksum or the default value is no + * checksum regardless of IP version. */ + NETDEV_TNL_DEFAULT_NO_CSUM, +}; + /* Configuration specific to tunnels. */ struct netdev_tunnel_config { ovs_be64 in_key; @@ -139,7 +155,7 @@ struct netdev_tunnel_config { uint8_t tos; bool tos_inherit; - bool csum; + enum netdev_tnl_csum csum; bool dont_fragment; enum netdev_pt_mode pt_mode; diff --git a/ofproto/tunnel.c b/ofproto/tunnel.c index 80ddee78acf..f067a6c26c1 100644 --- a/ofproto/tunnel.c +++ b/ofproto/tunnel.c @@ -465,9 +465,13 @@ tnl_port_send(const struct ofport_dpif *ofport, struct flow *flow, flow->tunnel.flags &= ~(FLOW_TNL_F_MASK & ~FLOW_TNL_PUB_F_MASK); flow->tunnel.flags |= (cfg->dont_fragment ? FLOW_TNL_F_DONT_FRAGMENT : 0) - | (cfg->csum ? FLOW_TNL_F_CSUM : 0) | (cfg->out_key_present ? 
FLOW_TNL_F_KEY : 0); + if (cfg->csum == NETDEV_TNL_CSUM_ENABLED || + (cfg->csum == NETDEV_TNL_CSUM_DEFAULT && !flow->tunnel.ip_dst)) { + flow->tunnel.flags |= FLOW_TNL_F_CSUM; + } + if (cfg->set_egress_pkt_mark) { flow->pkt_mark = cfg->egress_pkt_mark; wc->masks.pkt_mark = UINT32_MAX; @@ -706,8 +710,10 @@ tnl_port_format(const struct tnl_port *tnl_port, struct ds *ds) ds_put_cstr(ds, ", df=false"); } - if (cfg->csum) { + if (cfg->csum == NETDEV_TNL_CSUM_ENABLED) { ds_put_cstr(ds, ", csum=true"); + } else if (cfg->csum == NETDEV_TNL_CSUM_DISABLED) { + ds_put_cstr(ds, ", csum=false"); } ds_put_cstr(ds, ")\n"); diff --git a/tests/tunnel-push-pop-ipv6.at b/tests/tunnel-push-pop-ipv6.at index f1c5d42f664..3edec5fbca1 100644 --- a/tests/tunnel-push-pop-ipv6.at +++ b/tests/tunnel-push-pop-ipv6.at @@ -610,6 +610,15 @@ AT_CHECK([ovs-appctl tnl/arp/show | tail -n+3 | sort], [0], [dnl 2001:cafe::93 f8:bc:12:44:34:b7 br0 ]) +dnl Disable checksum from VXLAN port. +AT_CHECK([ovs-vsctl set Interface t3 options:csum=false]) +AT_CHECK([ovs-ofctl del-flows int-br]) +AT_CHECK([ovs-ofctl add-flow int-br action=4]) +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(2),eth(src=36:b1:ee:7c:01:01,dst=36:b1:ee:7c:01:02),eth_type(0x0800),ipv4(src=1.1.3.88,dst=1.1.3.112,proto=47,tos=0,ttl=64,frag=no)'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], + [Datapath actions: tnl_push(tnl_port(4789),header(size=70,type=4,eth(dst=f8:bc:12:44:34:b7,src=aa:55:aa:55:00:00,dl_type=0x86dd),ipv6(src=2001:cafe::88,dst=2001:cafe::93,label=0,proto=17,tclass=0x0,hlimit=64),udp(src=0,dst=4789,csum=0x0),vxlan(flags=0x8000000,vni=0x0)),out_port(100)),1 +]) + ovs-appctl time/warp 10000 AT_CHECK([ovs-vsctl del-port int-br t3 \ diff --git a/tests/tunnel-push-pop.at b/tests/tunnel-push-pop.at index 508737c53ec..7ec4c31ab2d 100644 --- a/tests/tunnel-push-pop.at +++ b/tests/tunnel-push-pop.at @@ -642,6 +642,13 @@ AT_CHECK([tail -1 stdout], [0], [Datapath actions: 
tnl_push(tnl_port(4789),header(size=50,type=4,eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x0800),ipv4(src=1.1.2.88,dst=1.1.2.92,proto=17,tos=0,ttl=64,frag=0x4000),udp(src=0,dst=4789,csum=0x0),vxlan(flags=0x8000000,vni=0x7b)),out_port(100)),1 ]) +dnl Check VXLAN tunnel push with checksum. +AT_CHECK([ovs-vsctl set Interface t2 options:csum=true]) +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(2),eth(src=36:b1:ee:7c:01:01,dst=36:b1:ee:7c:01:02),eth_type(0x0800),ipv4(src=1.1.3.88,dst=1.1.3.112,proto=47,tos=0,ttl=64,frag=no)'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], + [Datapath actions: tnl_push(tnl_port(4789),header(size=50,type=4,eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x0800),ipv4(src=1.1.2.88,dst=1.1.2.92,proto=17,tos=0,ttl=64,frag=0x4000),udp(src=0,dst=4789,csum=0xffff),vxlan(flags=0x8000000,vni=0x7b)),out_port(100)),1 +]) + AT_CHECK([ovs-appctl tnl/neigh/show | tail -n+3 | sort], [0], [dnl 1.1.2.92 f8:bc:12:44:34:b6 br0 1.1.2.93 f8:bc:12:44:34:b7 br0 diff --git a/tests/tunnel.at b/tests/tunnel.at index 9d539ee6f67..31e935901d3 100644 --- a/tests/tunnel.at +++ b/tests/tunnel.at @@ -1038,7 +1038,7 @@ AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'tunnel(tun_id=0,src=1.1.1.1,dst=1.1.1.2,ttl=64),in_port(4789)'], [0], [stdout]) AT_CHECK([tail -1 stdout], [0], - [Datapath actions: set(tunnel(ipv6_dst=2001:cafe::1,ttl=64,tp_dst=4789,flags(df))),4789 + [Datapath actions: set(tunnel(ipv6_dst=2001:cafe::1,ttl=64,tp_dst=4789,flags(df|csum))),4789 ]) AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'tunnel(tun_id=0x0,ipv6_src=2001:cafe::1,ipv6_dst=2001:cafe::2,ttl=64),in_port(4789)'], [0], [stdout]) diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index e3afb78a4e5..7f2d188be12 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -3207,9 +3207,15 @@

      Optional. Compute encapsulation header (either GRE or UDP) - checksums on outgoing packets. Default is disabled, set to - true to enable. Checksums present on incoming - packets will be validated regardless of this setting. + checksums on outgoing packets. When unset (the default value), + checksum computing for outgoing packets is enabled for UDP IPv6 + tunnels, and disabled for GRE and IPv4 UDP tunnels. When set to + false, no checksums will be computed for outgoing + tunnel encapsulation headers. When true, checksums + will be computed for all outgoing tunnel encapsulation headers. + Checksums present on incoming packets will be validated + regardless of this setting. Incoming packets without a checksum + will also be accepted regardless of this setting.

      From d9de6b01c268379b689025a486c043e70c8e020e Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Sat, 13 Jul 2024 23:23:37 +0200 Subject: [PATCH 773/833] ofproto-dpif: Allow forcing dp features. Datapath features can be set with dpif/set-dp-features unixctl command. This command is not documented and therefore not supported in production but only useful for unit tests. A limitation was put in place originally to avoid enabling features at runtime that were disabled at boot time to avoid breaking the datapath in unexpected ways. But, considering users should not use this command and it should only be used for testing, we can assume whoever runs it knows what they are doing. Therefore, the limitation should be bypass-able. This patch adds a "--force" flag to the unixctl command to allow bypassing the mentioned limitation. Acked-by: Eelco Chaudron Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index 87dfb0043dd..33c80d87c0a 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -6433,7 +6433,8 @@ display_support_field(const char *name, static bool dpif_set_support(struct dpif_backer_support *rt_support, struct dpif_backer_support *bt_support, - const char *name, const char *value, struct ds *ds) + const char *name, const char *value, bool force, + struct ds *ds) { struct shash all_fields = SHASH_INITIALIZER(&all_fields); struct dpif_support_field *field; @@ -6485,8 +6486,13 @@ dpif_set_support(struct dpif_backer_support *rt_support, if (field->type == DPIF_SUPPORT_FIELD_bool) { if (!strcasecmp(value, "true")) { - if (*(bool *)field->bt_ptr) { - *(bool *)field->rt_ptr = true; + if (*(bool *) field->bt_ptr || force) { + if (force) { + VLOG_WARN( + "Enabling an unsupported feature is very dangerous" + ); + } + *(bool *) field->rt_ptr = true; changed = true; } else { 
ds_put_cstr(ds, "Can not enable features not supported by the datapth"); @@ -6818,10 +6824,19 @@ ofproto_unixctl_dpif_set_dp_features(struct unixctl_conn *conn, void *aux OVS_UNUSED) { struct ds ds = DS_EMPTY_INITIALIZER; - const char *br = argv[1]; + struct ofproto_dpif *ofproto; + bool changed, force = false; const char *name, *value; - struct ofproto_dpif *ofproto = ofproto_dpif_lookup_by_name(br); - bool changed; + const char *br; + + if (argc > 2 && !strcmp(argv[1], "--force")) { + force = true; + argc--; + argv++; + } + + br = argv[1]; + ofproto = ofproto_dpif_lookup_by_name(br); if (!ofproto) { unixctl_command_reply_error(conn, "no such bridge"); @@ -6832,7 +6847,7 @@ ofproto_unixctl_dpif_set_dp_features(struct unixctl_conn *conn, value = argc > 3 ? argv[3] : NULL; changed = dpif_set_support(&ofproto->backer->rt_support, &ofproto->backer->bt_support, - name, value, &ds); + name, value, force, &ds); if (changed) { xlate_set_support(ofproto, &ofproto->backer->rt_support); udpif_flush(ofproto->backer->udpif); @@ -6875,7 +6890,8 @@ ofproto_unixctl_init(void) unixctl_command_register("dpif/dump-flows", "[-m] [--names | --no-names] bridge", 1, INT_MAX, ofproto_unixctl_dpif_dump_flows, NULL); - unixctl_command_register("dpif/set-dp-features", "bridge", 1, 3 , + unixctl_command_register("dpif/set-dp-features", + "[--force] bridge [feature [value]]", 1, 4, ofproto_unixctl_dpif_set_dp_features, NULL); } From 1a3bd96b4fc47a8ff2b98fa02d7709c4cdc2a9b1 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Sat, 13 Jul 2024 23:23:38 +0200 Subject: [PATCH 774/833] odp-util: Add support OVS_ACTION_ATTR_PSAMPLE. Add support for parsing and formatting the new action. Also, flag OVS_ACTION_ATTR_SAMPLE as requiring datapath assistance if it contains a nested OVS_ACTION_ATTR_PSAMPLE. The reason is that the sampling rate from the parent "sample" is made available to the nested "psample" by the kernel. 
Acked-by: Eelco Chaudron Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- include/linux/openvswitch.h | 28 +++++++++++ lib/dpif-netdev.c | 1 + lib/dpif.c | 3 +- lib/odp-execute.c | 25 +++++++++- lib/odp-util.c | 91 ++++++++++++++++++++++++++++++++++++ lib/odp-util.h | 3 ++ ofproto/ofproto-dpif-ipfix.c | 1 + ofproto/ofproto-dpif-sflow.c | 1 + python/ovs/flow/odp.py | 8 ++++ tests/odp.at | 16 +++++++ 10 files changed, 175 insertions(+), 2 deletions(-) diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h index d9fb991ef23..0023b65fbbf 100644 --- a/include/linux/openvswitch.h +++ b/include/linux/openvswitch.h @@ -992,6 +992,31 @@ struct check_pkt_len_arg { }; #endif +#define OVS_PSAMPLE_COOKIE_MAX_SIZE 16 +/** + * enum ovs_psample_attr - Attributes for %OVS_ACTION_ATTR_PSAMPLE + * action. + * + * @OVS_PSAMPLE_ATTR_GROUP: 32-bit number to identify the source of the + * sample. + * @OVS_PSAMPLE_ATTR_COOKIE: An optional variable-length binary cookie that + * contains user-defined metadata. The maximum length is + * OVS_PSAMPLE_COOKIE_MAX_SIZE bytes. + * + * Sends the packet to the psample multicast group with the specified group and + * cookie. It is possible to combine this action with the + * %OVS_ACTION_ATTR_TRUNC action to limit the size of the sample. + */ +enum ovs_psample_attr { + OVS_PSAMPLE_ATTR_GROUP = 1, /* u32 number. */ + OVS_PSAMPLE_ATTR_COOKIE, /* Optional, user specified cookie. */ + + /* private: */ + __OVS_PSAMPLE_ATTR_MAX +}; + +#define OVS_PSAMPLE_ATTR_MAX (__OVS_PSAMPLE_ATTR_MAX - 1) + /** * enum ovs_action_attr - Action types. * @@ -1056,6 +1081,8 @@ struct check_pkt_len_arg { * of l3 tunnel flag in the tun_flags field of OVS_ACTION_ATTR_ADD_MPLS * argument. * @OVS_ACTION_ATTR_DROP: Explicit drop action. + * @OVS_ACTION_ATTR_PSAMPLE: Send a sample of the packet to external observers + * via psample. 
*/ enum ovs_action_attr { @@ -1087,6 +1114,7 @@ enum ovs_action_attr { OVS_ACTION_ATTR_ADD_MPLS, /* struct ovs_action_add_mpls. */ OVS_ACTION_ATTR_DEC_TTL, /* Nested OVS_DEC_TTL_ATTR_*. */ OVS_ACTION_ATTR_DROP, /* u32 xlate_error. */ + OVS_ACTION_ATTR_PSAMPLE, /* Nested OVS_PSAMPLE_ATTR_*. */ #ifndef __KERNEL__ OVS_ACTION_ATTR_TUNNEL_PUSH, /* struct ovs_action_push_tnl*/ diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index c7f9e149025..f0594e5f5ce 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -9519,6 +9519,7 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_, case OVS_ACTION_ATTR_DROP: case OVS_ACTION_ATTR_ADD_MPLS: case OVS_ACTION_ATTR_DEC_TTL: + case OVS_ACTION_ATTR_PSAMPLE: case __OVS_ACTION_ATTR_MAX: OVS_NOT_REACHED(); } diff --git a/lib/dpif.c b/lib/dpif.c index 23eb18495a6..94db4630eda 100644 --- a/lib/dpif.c +++ b/lib/dpif.c @@ -1192,6 +1192,8 @@ dpif_execute_helper_cb(void *aux_, struct dp_packet_batch *packets_, case OVS_ACTION_ATTR_TUNNEL_PUSH: case OVS_ACTION_ATTR_TUNNEL_POP: case OVS_ACTION_ATTR_USERSPACE: + case OVS_ACTION_ATTR_PSAMPLE: + case OVS_ACTION_ATTR_SAMPLE: case OVS_ACTION_ATTR_RECIRC: { struct dpif_execute execute; struct ofpbuf execute_actions; @@ -1278,7 +1280,6 @@ dpif_execute_helper_cb(void *aux_, struct dp_packet_batch *packets_, case OVS_ACTION_ATTR_POP_MPLS: case OVS_ACTION_ATTR_SET: case OVS_ACTION_ATTR_SET_MASKED: - case OVS_ACTION_ATTR_SAMPLE: case OVS_ACTION_ATTR_TRUNC: case OVS_ACTION_ATTR_PUSH_ETH: case OVS_ACTION_ATTR_POP_ETH: diff --git a/lib/odp-execute.c b/lib/odp-execute.c index 081e4d43268..15577d5394f 100644 --- a/lib/odp-execute.c +++ b/lib/odp-execute.c @@ -818,13 +818,13 @@ requires_datapath_assistance(const struct nlattr *a) case OVS_ACTION_ATTR_RECIRC: case OVS_ACTION_ATTR_CT: case OVS_ACTION_ATTR_METER: + case OVS_ACTION_ATTR_PSAMPLE: return true; case OVS_ACTION_ATTR_SET: case OVS_ACTION_ATTR_SET_MASKED: case OVS_ACTION_ATTR_PUSH_VLAN: case OVS_ACTION_ATTR_POP_VLAN: - case 
OVS_ACTION_ATTR_SAMPLE: case OVS_ACTION_ATTR_HASH: case OVS_ACTION_ATTR_PUSH_MPLS: case OVS_ACTION_ATTR_POP_MPLS: @@ -841,6 +841,28 @@ requires_datapath_assistance(const struct nlattr *a) case OVS_ACTION_ATTR_DROP: return false; + case OVS_ACTION_ATTR_SAMPLE: { + /* Nested "psample" actions rely on the datapath executing the + * parent "sample", storing the probability and making it available + * when the nested "psample" is run. */ + const struct nlattr *attr; + unsigned int left; + + NL_NESTED_FOR_EACH (attr, left, a) { + if (nl_attr_type(attr) == OVS_SAMPLE_ATTR_ACTIONS) { + const struct nlattr *act; + unsigned int act_left; + + NL_NESTED_FOR_EACH (act, act_left, attr) { + if (nl_attr_type(act) == OVS_ACTION_ATTR_PSAMPLE) { + return true; + } + } + } + } + return false; + } + case OVS_ACTION_ATTR_UNSPEC: case __OVS_ACTION_ATTR_MAX: OVS_NOT_REACHED(); @@ -1229,6 +1251,7 @@ odp_execute_actions(void *dp, struct dp_packet_batch *batch, bool steal, case OVS_ACTION_ATTR_CT: case OVS_ACTION_ATTR_UNSPEC: case OVS_ACTION_ATTR_DEC_TTL: + case OVS_ACTION_ATTR_PSAMPLE: case __OVS_ACTION_ATTR_MAX: /* The following actions are handled by the scalar implementation. 
*/ case OVS_ACTION_ATTR_POP_VLAN: diff --git a/lib/odp-util.c b/lib/odp-util.c index 724e6f2bca1..d3245223dd6 100644 --- a/lib/odp-util.c +++ b/lib/odp-util.c @@ -145,6 +145,7 @@ odp_action_len(uint16_t type) case OVS_ACTION_ATTR_ADD_MPLS: return sizeof(struct ovs_action_add_mpls); case OVS_ACTION_ATTR_DEC_TTL: return ATTR_LEN_VARIABLE; case OVS_ACTION_ATTR_DROP: return sizeof(uint32_t); + case OVS_ACTION_ATTR_PSAMPLE: return ATTR_LEN_VARIABLE; case OVS_ACTION_ATTR_UNSPEC: case __OVS_ACTION_ATTR_MAX: @@ -1150,6 +1151,28 @@ format_dec_ttl_action(struct ds *ds, const struct nlattr *attr, ds_put_format(ds, "))"); } +static void +format_odp_psample_action(struct ds *ds, const struct nlattr *attr) +{ + const struct nlattr *a; + unsigned int left; + + ds_put_cstr(ds, "psample("); + NL_NESTED_FOR_EACH (a, left, attr) { + switch (a->nla_type) { + case OVS_PSAMPLE_ATTR_GROUP: + ds_put_format(ds, "group=%"PRIu32",", nl_attr_get_u32(a)); + break; + case OVS_PSAMPLE_ATTR_COOKIE: + ds_put_cstr(ds, "cookie="); + ds_put_hex(ds, nl_attr_get(a), nl_attr_get_size(a)); + break; + } + } + ds_chomp(ds, ','); + ds_put_char(ds, ')'); +} + static void format_odp_action(struct ds *ds, const struct nlattr *a, const struct hmap *portno_names) @@ -1309,6 +1332,9 @@ format_odp_action(struct ds *ds, const struct nlattr *a, case OVS_ACTION_ATTR_DROP: ds_put_cstr(ds, "drop"); break; + case OVS_ACTION_ATTR_PSAMPLE: + format_odp_psample_action(ds, a); + break; case OVS_ACTION_ATTR_UNSPEC: case __OVS_ACTION_ATTR_MAX: default: @@ -2358,6 +2384,50 @@ parse_odp_push_nsh_action(const char *s, struct ofpbuf *actions) return ret; } +static int +parse_odp_psample_action(const char *s, struct ofpbuf *actions) +{ + char buf[2 * OVS_PSAMPLE_COOKIE_MAX_SIZE + 1]; + uint8_t cookie[OVS_PSAMPLE_COOKIE_MAX_SIZE]; + bool has_group = false; + size_t cookie_len = 0; + uint32_t group; + int n = 0; + + if (!ovs_scan_len(s, &n, "psample(")) { + return -EINVAL; + } + + while (s[n] != ')') { + n += strspn(s + n, 
delimiters); + + if (!has_group && ovs_scan_len(s, &n, "group=%"SCNi32, &group)) { + has_group = true; + continue; + } + + if (!cookie_len && + ovs_scan_len(s, &n, "cookie=0x%32[0-9a-fA-F]", buf) && n > 7) { + struct ofpbuf b; + + ofpbuf_use_stub(&b, cookie, OVS_PSAMPLE_COOKIE_MAX_SIZE); + ofpbuf_put_hex(&b, buf, &cookie_len); + ofpbuf_uninit(&b); + continue; + } + return -EINVAL; + } + n++; + + if (!has_group) { + return -EINVAL; + } + + odp_put_psample_action(actions, group, cookie_len ? cookie : NULL, + cookie_len); + return n; +} + static int parse_action_list(struct parse_odp_context *context, const char *s, struct ofpbuf *actions) @@ -2719,6 +2789,10 @@ parse_odp_action__(struct parse_odp_context *context, const char *s, } } + if (!strncmp(s, "psample(", 8)) { + return parse_odp_psample_action(s, actions); + } + { struct ovs_action_push_tnl data; int n; @@ -7828,6 +7902,23 @@ odp_put_tnl_push_action(struct ofpbuf *odp_actions, nl_msg_put_unspec(odp_actions, OVS_ACTION_ATTR_TUNNEL_PUSH, data, size); } +void +odp_put_psample_action(struct ofpbuf *odp_actions, uint32_t group_id, + uint8_t *cookie, size_t cookie_len) +{ + size_t offset = nl_msg_start_nested_with_flag(odp_actions, + OVS_ACTION_ATTR_PSAMPLE); + + nl_msg_put_u32(odp_actions, OVS_PSAMPLE_ATTR_GROUP, group_id); + if (cookie && cookie_len) { + ovs_assert(cookie_len <= OVS_PSAMPLE_COOKIE_MAX_SIZE); + nl_msg_put_unspec(odp_actions, OVS_PSAMPLE_ATTR_COOKIE, cookie, + cookie_len); + } + + nl_msg_end_nested(odp_actions, offset); +} + /* The commit_odp_actions() function and its helpers. 
*/ diff --git a/lib/odp-util.h b/lib/odp-util.h index 8c7baa680dd..e454dbfcdb5 100644 --- a/lib/odp-util.h +++ b/lib/odp-util.h @@ -376,6 +376,9 @@ void odp_put_pop_eth_action(struct ofpbuf *odp_actions); void odp_put_push_eth_action(struct ofpbuf *odp_actions, const struct eth_addr *eth_src, const struct eth_addr *eth_dst); +void odp_put_psample_action(struct ofpbuf *odp_actions, + uint32_t group_id, uint8_t *cookie, + size_t cookie_len); static inline void odp_decode_gbp_raw(uint32_t gbp_raw, ovs_be16 *id, diff --git a/ofproto/ofproto-dpif-ipfix.c b/ofproto/ofproto-dpif-ipfix.c index cd65dae7e18..15b65623351 100644 --- a/ofproto/ofproto-dpif-ipfix.c +++ b/ofproto/ofproto-dpif-ipfix.c @@ -3136,6 +3136,7 @@ dpif_ipfix_read_actions(const struct flow *flow, case OVS_ACTION_ATTR_DROP: case OVS_ACTION_ATTR_ADD_MPLS: case OVS_ACTION_ATTR_DEC_TTL: + case OVS_ACTION_ATTR_PSAMPLE: case __OVS_ACTION_ATTR_MAX: default: break; diff --git a/ofproto/ofproto-dpif-sflow.c b/ofproto/ofproto-dpif-sflow.c index 80405b68a67..fb12cf41927 100644 --- a/ofproto/ofproto-dpif-sflow.c +++ b/ofproto/ofproto-dpif-sflow.c @@ -1237,6 +1237,7 @@ dpif_sflow_read_actions(const struct flow *flow, case OVS_ACTION_ATTR_DROP: case OVS_ACTION_ATTR_ADD_MPLS: case OVS_ACTION_ATTR_DEC_TTL: + case OVS_ACTION_ATTR_PSAMPLE: case __OVS_ACTION_ATTR_MAX: default: break; diff --git a/python/ovs/flow/odp.py b/python/ovs/flow/odp.py index a8f8c067a90..572dbebe98f 100644 --- a/python/ovs/flow/odp.py +++ b/python/ovs/flow/odp.py @@ -343,6 +343,14 @@ def _action_decoders_args(): } ) ), + "psample": nested_kv_decoder( + KVDecoders( + { + "group": decode_int, + "cookie": decode_default, + } + ) + ) } _decoders["sample"] = nested_kv_decoder( diff --git a/tests/odp.at b/tests/odp.at index ba20604e43d..402b2386d37 100644 --- a/tests/odp.at +++ b/tests/odp.at @@ -393,6 +393,10 @@ check_pkt_len(size=200,gt(ct(nat)),le(drop)) 
check_pkt_len(size=200,gt(set(eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15))),le(set(eth(src=00:01:02:03:04:06,dst=10:11:12:13:14:16)))) lb_output(1) add_mpls(label=200,tc=7,ttl=64,bos=1,eth_type=0x8847) +psample(group=12,cookie=0xf1020304050607080910111213141516) +psample(group=12) +sample(sample=50.0%,actions(psample(group=12,cookie=0xf1020304))) +sample(sample=50.0%,actions(userspace(pid=42,userdata(0102030400000000)),psample(group=12))) ]) AT_CHECK_UNQUOTED([ovstest test-odp parse-actions < actions.txt], [0], [`cat actions.txt` @@ -406,11 +410,23 @@ AT_DATA([actions.txt], [dnl encap_nsh@:{@ tnl_push(tnl_port(6),header(size=94,type=112,eth(dst=f8:bc:12:44:34:b6,src=f8:bc:12:46:58:e0,dl_type=0x86dd),ipv6(src=2001:cafe::88,dst=2001:cafe::92,label=0,proto=43,tclass=0x0,hlimit=64),srv6(segments_left=2,segs(2001:cafe::90,2001:cafe::91))),out_port(1)) tnl_push(tnl_port(6),header(size=126,type=112,eth(dst=f8:bc:12:44:34:b6,src=f8:bc:12:46:58:e0,dl_type=0x86dd),ipv6(src=2001:cafe::88,dst=2001:cafe::92,label=0,proto=43,tclass=0x0,hlimit=64),srv6(segments_left=2,segs(2001:cafe::90,2001:cafe::91,2001:cafe::92,2001:cafe::93))),out_port(1)) +psample(group_id=12,cookie=0x0102030405060708090a0b0c0d0e0f0f0f) +psample(cookie=0x010203) +psample(group=12,cookie=0x010203,group=12) +psample(group=abc) +psample(group=12,cookie=wrong) +psample() ]) AT_CHECK_UNQUOTED([ovstest test-odp parse-actions < actions.txt], [0], [dnl odp_actions_from_string: error odp_actions_from_string: error odp_actions_from_string: error +odp_actions_from_string: error +odp_actions_from_string: error +odp_actions_from_string: error +odp_actions_from_string: error +odp_actions_from_string: error +odp_actions_from_string: error ]) AT_CLEANUP From d0afbf094455bbe3e048b8e39a823f904e1ac55c Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Sat, 13 Jul 2024 23:23:39 +0200 Subject: [PATCH 775/833] ofproto_dpif: Check for psample support. 
Only kernel datapath supports this action so add a function in dpif.c that checks for that. Acked-by: Eelco Chaudron Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- lib/dpif.c | 7 +++++++ lib/dpif.h | 1 + ofproto/ofproto-dpif.c | 46 ++++++++++++++++++++++++++++++++++++++++++ ofproto/ofproto-dpif.h | 6 +++++- vswitchd/vswitch.xml | 5 +++++ 5 files changed, 64 insertions(+), 1 deletion(-) diff --git a/lib/dpif.c b/lib/dpif.c index 94db4630eda..ab633fd274d 100644 --- a/lib/dpif.c +++ b/lib/dpif.c @@ -1953,6 +1953,13 @@ dpif_supports_lb_output_action(const struct dpif *dpif) return dpif_is_netdev(dpif); } +bool +dpif_may_support_psample(const struct dpif *dpif) +{ + /* Userspace datapath does not support this action. */ + return !dpif_is_netdev(dpif); +} + /* Meters */ void dpif_meter_get_features(const struct dpif *dpif, diff --git a/lib/dpif.h b/lib/dpif.h index a764e8a592b..6bef7d5b304 100644 --- a/lib/dpif.h +++ b/lib/dpif.h @@ -941,6 +941,7 @@ int dpif_get_pmds_for_port(const struct dpif * dpif, odp_port_t port_no, char *dpif_get_dp_version(const struct dpif *); bool dpif_supports_tnl_push_pop(const struct dpif *); bool dpif_may_support_explicit_drop_action(const struct dpif *); +bool dpif_may_support_psample(const struct dpif *); bool dpif_synced_dp_layers(struct dpif *); /* Log functions. */ diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index 33c80d87c0a..c2d2fab6ffb 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -874,6 +874,12 @@ ovs_lb_output_action_supported(struct ofproto_dpif *ofproto) return ofproto->backer->rt_support.lb_output_action; } +bool +ovs_psample_supported(struct ofproto_dpif *ofproto) +{ + return ofproto->backer->rt_support.psample; +} + /* Tests whether 'backer''s datapath supports recirculation. Only newer * datapaths support OVS_KEY_ATTR_RECIRC_ID in keys. We need to disable some * features on older datapaths that don't support this feature. 
@@ -1610,6 +1616,44 @@ check_add_mpls(struct dpif_backer *backer) return supported; } +/* Tests whether 'backer''s datapath supports the OVS_ACTION_ATTR_PSAMPLE + * action. */ +static bool +check_psample(struct dpif_backer *backer) +{ + uint8_t cookie[OVS_PSAMPLE_COOKIE_MAX_SIZE]; + struct odputil_keybuf keybuf; + struct ofpbuf actions; + struct ofpbuf key; + bool supported; + + /* Intentionally bogus dl_type. */ + struct flow flow = { + .dl_type = CONSTANT_HTONS(0x1234), + }; + struct odp_flow_key_parms odp_parms = { + .flow = &flow, + .probe = true, + }; + + ofpbuf_use_stack(&key, &keybuf, sizeof keybuf); + odp_flow_key_from_flow(&odp_parms, &key); + ofpbuf_init(&actions, 32); + + /* Generate a random max-size cookie. */ + random_bytes(cookie, sizeof cookie); + + odp_put_psample_action(&actions, 10, cookie, sizeof cookie); + + supported = dpif_may_support_psample(backer->dpif) && + dpif_probe_feature(backer->dpif, "psample", &key, &actions, NULL); + + ofpbuf_uninit(&actions); + VLOG_INFO("%s: Datapath %s psample action", dpif_name(backer->dpif), + supported ? "supports" : "does not support"); + return supported; +} + #define CHECK_FEATURE__(NAME, SUPPORT, FIELD, VALUE, ETHTYPE) \ static bool \ check_##NAME(struct dpif_backer *backer) \ @@ -1699,6 +1743,7 @@ check_support(struct dpif_backer *backer) dpif_supports_lb_output_action(backer->dpif); backer->rt_support.ct_zero_snat = dpif_supports_ct_zero_snat(backer); backer->rt_support.add_mpls = check_add_mpls(backer); + backer->rt_support.psample = check_psample(backer); /* Flow fields. */ backer->rt_support.odp.ct_state = check_ct_state(backer); @@ -5822,6 +5867,7 @@ get_datapath_cap(const char *datapath_type, struct smap *cap) smap_add(cap, "lb_output_action", s->lb_output_action ? "true" : "false"); smap_add(cap, "ct_zero_snat", s->ct_zero_snat ? "true" : "false"); smap_add(cap, "add_mpls", s->add_mpls ? "true" : "false"); + smap_add(cap, "psample", s->psample ? 
"true" : "false"); /* The ct_tuple_flush is implemented on dpif level, so it is supported * for all backers. */ diff --git a/ofproto/ofproto-dpif.h b/ofproto/ofproto-dpif.h index d33f73df8ae..abf9ac62ed7 100644 --- a/ofproto/ofproto-dpif.h +++ b/ofproto/ofproto-dpif.h @@ -213,7 +213,10 @@ struct group_dpif *group_dpif_lookup(struct ofproto_dpif *, DPIF_SUPPORT_FIELD(bool, ct_zero_snat, "Conntrack all-zero IP SNAT") \ \ /* True if the datapath supports add_mpls action. */ \ - DPIF_SUPPORT_FIELD(bool, add_mpls, "MPLS Label add") + DPIF_SUPPORT_FIELD(bool, add_mpls, "MPLS Label add") \ + \ + /* True if the datapath supports psample action. */ \ + DPIF_SUPPORT_FIELD(bool, psample, "psample action") /* Stores the various features which the corresponding backer supports. */ @@ -411,5 +414,6 @@ bool ofproto_dpif_ct_zone_timeout_policy_get_name( uint8_t nw_proto, char **tp_name, bool *unwildcard); bool ovs_explicit_drop_action_supported(struct ofproto_dpif *); +bool ovs_psample_supported(struct ofproto_dpif *); #endif /* ofproto-dpif.h */ diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 7f2d188be12..c754616f99e 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -6517,6 +6517,11 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \ called NXT_CT_FLUSH. The NXT_CT_FLUSH extensions allows to flush CT entries based on specified parameters. + + True if the datapath supports OVS_ACTION_ATTR_PSAMPLE. If false, + local sampling will not be supported with the Linux kernel datapath. + From 5b99ebc2687f8e972f842dcb354b9b7dfec29062 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Sat, 13 Jul 2024 23:23:40 +0200 Subject: [PATCH 776/833] ofproto: Add ofproto-dpif-lsample. Add a new resource in ofproto-dpif and the corresponding API in ofproto_provider.h to represent and local sampling configuration. 
Acked-by: Eelco Chaudron Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- ofproto/automake.mk | 2 + ofproto/ofproto-dpif-lsample.c | 185 +++++++++++++++++++++++++++++++++ ofproto/ofproto-dpif-lsample.h | 35 +++++++ ofproto/ofproto-dpif.c | 38 +++++++ ofproto/ofproto-dpif.h | 1 + ofproto/ofproto-provider.h | 9 ++ ofproto/ofproto.c | 12 +++ ofproto/ofproto.h | 8 ++ 8 files changed, 290 insertions(+) create mode 100644 ofproto/ofproto-dpif-lsample.c create mode 100644 ofproto/ofproto-dpif-lsample.h diff --git a/ofproto/automake.mk b/ofproto/automake.mk index 7c08b563bc3..cb1361b8a61 100644 --- a/ofproto/automake.mk +++ b/ofproto/automake.mk @@ -30,6 +30,8 @@ ofproto_libofproto_la_SOURCES = \ ofproto/ofproto-dpif.h \ ofproto/ofproto-dpif-ipfix.c \ ofproto/ofproto-dpif-ipfix.h \ + ofproto/ofproto-dpif-lsample.c \ + ofproto/ofproto-dpif-lsample.h \ ofproto/ofproto-dpif-mirror.c \ ofproto/ofproto-dpif-mirror.h \ ofproto/ofproto-dpif-monitor.c \ diff --git a/ofproto/ofproto-dpif-lsample.c b/ofproto/ofproto-dpif-lsample.c new file mode 100644 index 00000000000..80f5a51fd9b --- /dev/null +++ b/ofproto/ofproto-dpif-lsample.c @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2024 Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "ofproto-dpif-lsample.h" + +#include "cmap.h" +#include "hash.h" +#include "ofproto.h" +#include "openvswitch/thread.h" + +/* Dpif local sampling. 
+ * + * Thread safety: dpif_lsample allows lockless concurrent reads of local + * sampling exporters as long as the following restrictions are met: + * 1) While the last reference is being dropped, i.e: a thread is calling + * "dpif_lsample_unref" on the last reference, other threads cannot call + * "dpif_lsample_ref". + * 2) Threads do not quiese while holding references to internal + * lsample_exporter objects. + */ + +struct dpif_lsample { + struct cmap exporters; /* Contains lsample_exporter_node instances + * indexed by collector_set_id. */ + struct ovs_mutex mutex; /* Protects concurrent insertion/deletion + * of exporters. */ + struct ovs_refcount ref_cnt; /* Controls references to this instance. */ +}; + +struct lsample_exporter { + struct ofproto_lsample_options options; +}; + +struct lsample_exporter_node { + struct cmap_node node; /* In dpif_lsample->exporters. */ + struct lsample_exporter exporter; +}; + +static void +dpif_lsample_delete_exporter(struct dpif_lsample *lsample, + struct lsample_exporter_node *node) +{ + ovs_mutex_lock(&lsample->mutex); + cmap_remove(&lsample->exporters, &node->node, + hash_int(node->exporter.options.collector_set_id, 0)); + ovs_mutex_unlock(&lsample->mutex); + + ovsrcu_postpone(free, node); +} + +/* Adds an exporter with the provided options which are copied. 
*/ +static struct lsample_exporter_node * +dpif_lsample_add_exporter(struct dpif_lsample *lsample, + const struct ofproto_lsample_options *options) +{ + struct lsample_exporter_node *node; + + node = xzalloc(sizeof *node); + node->exporter.options = *options; + + ovs_mutex_lock(&lsample->mutex); + cmap_insert(&lsample->exporters, &node->node, + hash_int(options->collector_set_id, 0)); + ovs_mutex_unlock(&lsample->mutex); + + return node; +} + +static struct lsample_exporter_node * +dpif_lsample_find_exporter_node(const struct dpif_lsample *lsample, + const uint32_t collector_set_id) +{ + struct lsample_exporter_node *node; + + CMAP_FOR_EACH_WITH_HASH (node, node, hash_int(collector_set_id, 0), + &lsample->exporters) { + if (node->exporter.options.collector_set_id == collector_set_id) { + return node; + } + } + return NULL; +} + +/* Sets the lsample configuration and returns true if the configuration + * has changed. */ +bool +dpif_lsample_set_options(struct dpif_lsample *lsample, + const struct ofproto_lsample_options *options, + size_t n_options) +{ + const struct ofproto_lsample_options *opt; + struct lsample_exporter_node *node; + bool changed = false; + int i; + + for (i = 0; i < n_options; i++) { + opt = &options[i]; + node = dpif_lsample_find_exporter_node(lsample, + opt->collector_set_id); + if (!node) { + dpif_lsample_add_exporter(lsample, opt); + changed = true; + } else if (memcmp(&node->exporter.options, opt, sizeof *opt)) { + dpif_lsample_delete_exporter(lsample, node); + dpif_lsample_add_exporter(lsample, opt); + changed = true; + } + } + + /* Delete exporters that have been removed. 
*/ + CMAP_FOR_EACH (node, node, &lsample->exporters) { + for (i = 0; i < n_options; i++) { + if (node->exporter.options.collector_set_id + == options[i].collector_set_id) { + break; + } + } + if (i == n_options) { + dpif_lsample_delete_exporter(lsample, node); + changed = true; + } + } + + return changed; +} + +struct dpif_lsample * +dpif_lsample_create(void) +{ + struct dpif_lsample *lsample; + + lsample = xzalloc(sizeof *lsample); + cmap_init(&lsample->exporters); + ovs_mutex_init(&lsample->mutex); + ovs_refcount_init(&lsample->ref_cnt); + + return lsample; +} + +static void +dpif_lsample_destroy(struct dpif_lsample *lsample) +{ + if (lsample) { + struct lsample_exporter_node *node; + + CMAP_FOR_EACH (node, node, &lsample->exporters) { + dpif_lsample_delete_exporter(lsample, node); + } + cmap_destroy(&lsample->exporters); + free(lsample); + } +} + +struct dpif_lsample * +dpif_lsample_ref(const struct dpif_lsample *lsample_) +{ + struct dpif_lsample *lsample = CONST_CAST(struct dpif_lsample *, lsample_); + + if (lsample) { + ovs_refcount_ref(&lsample->ref_cnt); + } + return lsample; +} + +void +dpif_lsample_unref(struct dpif_lsample *lsample) +{ + if (lsample && ovs_refcount_unref_relaxed(&lsample->ref_cnt) == 1) { + dpif_lsample_destroy(lsample); + } +} diff --git a/ofproto/ofproto-dpif-lsample.h b/ofproto/ofproto-dpif-lsample.h new file mode 100644 index 00000000000..a491c137d20 --- /dev/null +++ b/ofproto/ofproto-dpif-lsample.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2024 Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef OFPROTO_DPIF_LSAMPLE_H +#define OFPROTO_DPIF_LSAMPLE_H 1 + +#include +#include + +struct dpif_lsample; +struct ofproto_lsample_options; + +struct dpif_lsample *dpif_lsample_create(void); + +struct dpif_lsample *dpif_lsample_ref(const struct dpif_lsample *); +void dpif_lsample_unref(struct dpif_lsample *); + +bool dpif_lsample_set_options(struct dpif_lsample *, + const struct ofproto_lsample_options *, + size_t n_opts); + +#endif /* OFPROTO_DPIF_LSAMPLE_H */ diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index c2d2fab6ffb..59b9a525287 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -50,6 +50,7 @@ #include "ofproto-dpif-sflow.h" #include "ofproto-dpif-trace.h" #include "ofproto-dpif-upcall.h" +#include "ofproto-dpif-lsample.h" #include "ofproto-dpif-xlate.h" #include "ofproto-dpif-xlate-cache.h" #include "openvswitch/ofp-actions.h" @@ -1957,6 +1958,7 @@ destruct(struct ofproto *ofproto_, bool del) netflow_unref(ofproto->netflow); dpif_sflow_unref(ofproto->sflow); dpif_ipfix_unref(ofproto->ipfix); + dpif_lsample_unref(ofproto->lsample); hmap_destroy(&ofproto->bundles); mac_learning_unref(ofproto->ml); mcast_snooping_unref(ofproto->ms); @@ -2516,6 +2518,41 @@ get_ipfix_stats(const struct ofproto *ofproto_, return dpif_ipfix_get_stats(di, bridge_ipfix, replies); } +static int +set_local_sample(struct ofproto *ofproto_, + const struct ofproto_lsample_options *options, + size_t n_opts) +{ + struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofproto_); + struct dpif_lsample *lsample = ofproto->lsample; + bool changed = false; + + if (!ofproto->backer->rt_support.psample) { + return EOPNOTSUPP; + } + + if (n_opts && !lsample) { + lsample = ofproto->lsample = dpif_lsample_create(); + changed = true; + } + + if (lsample) { + if (!n_opts) { + dpif_lsample_unref(lsample); + lsample = ofproto->lsample = NULL; + changed = true; 
+ } else if (dpif_lsample_set_options(lsample, options, n_opts)) { + changed = true; + } + } + + if (changed) { + ofproto->backer->need_revalidate = REV_RECONFIGURE; + } + + return 0; +} + static int set_cfm(struct ofport *ofport_, const struct cfm_settings *s) { @@ -7201,6 +7238,7 @@ const struct ofproto_class ofproto_dpif_class = { set_sflow, set_ipfix, get_ipfix_stats, + set_local_sample, set_cfm, cfm_status_changed, get_cfm_status, diff --git a/ofproto/ofproto-dpif.h b/ofproto/ofproto-dpif.h index abf9ac62ed7..b3dbece6711 100644 --- a/ofproto/ofproto-dpif.h +++ b/ofproto/ofproto-dpif.h @@ -331,6 +331,7 @@ struct ofproto_dpif { struct netflow *netflow; struct dpif_sflow *sflow; struct dpif_ipfix *ipfix; + struct dpif_lsample *lsample; struct hmap bundles; /* Contains "struct ofbundle"s. */ struct mac_learning *ml; struct mcast_snooping *ms; diff --git a/ofproto/ofproto-provider.h b/ofproto/ofproto-provider.h index 83c509fcf80..85991554cd9 100644 --- a/ofproto/ofproto-provider.h +++ b/ofproto/ofproto-provider.h @@ -1489,6 +1489,15 @@ struct ofproto_class { bool bridge_ipfix, struct ovs_list *replies ); + /* Configures local sampling on 'ofproto' according to the options array + * of 'options' which contains 'n_options' elements. + * + * EOPNOTSUPP as a return value indicates that 'ofproto' does not support + * local sampling. */ + int (*set_local_sample)(struct ofproto *ofproto, + const struct ofproto_lsample_options *options, + size_t n_options); + /* Configures connectivity fault management on 'ofport'. * * If 'cfm_settings' is nonnull, configures CFM according to its members. 
diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c index 21c6a1d8257..8c1efe4bf72 100644 --- a/ofproto/ofproto.c +++ b/ofproto/ofproto.c @@ -1000,6 +1000,18 @@ ofproto_get_datapath_cap(const char *datapath_type, struct smap *dp_cap) } } +int ofproto_set_local_sample(struct ofproto *ofproto, + const struct ofproto_lsample_options *options, + size_t n_options) +{ + if (ofproto->ofproto_class->set_local_sample) { + return ofproto->ofproto_class->set_local_sample(ofproto, options, + n_options); + } else { + return EOPNOTSUPP; + } +} + /* Connection tracking configuration. */ void ofproto_ct_set_zone_timeout_policy(const char *datapath_type, uint16_t zone_id, diff --git a/ofproto/ofproto.h b/ofproto/ofproto.h index 1c07df27518..f1ff80e5204 100644 --- a/ofproto/ofproto.h +++ b/ofproto/ofproto.h @@ -103,6 +103,11 @@ struct ofproto_ipfix_flow_exporter_options { char *virtual_obs_id; }; +struct ofproto_lsample_options { + uint32_t collector_set_id; + uint32_t group_id; +}; + struct ofproto_rstp_status { bool enabled; /* If false, ignore other members. */ rstp_identifier root_id; @@ -371,6 +376,9 @@ int ofproto_set_ipfix(struct ofproto *, const struct ofproto_ipfix_bridge_exporter_options *, const struct ofproto_ipfix_flow_exporter_options *, size_t); +int ofproto_set_local_sample(struct ofproto *ofproto, + const struct ofproto_lsample_options *, + size_t n_options); void ofproto_set_flow_restore_wait(bool flow_restore_wait_db); bool ofproto_get_flow_restore_wait(void); int ofproto_set_stp(struct ofproto *, const struct ofproto_stp_settings *); From d54b967e8d2160fc187b77eff202cb6d9196e055 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Sat, 13 Jul 2024 23:23:41 +0200 Subject: [PATCH 777/833] vswitchd: Add local sampling to vswitchd schema. Add as new column in the Flow_Sample_Collector_Set table named "local_group_id" which enables this feature. 
Acked-by: Eelco Chaudron Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- NEWS | 5 +++ vswitchd/bridge.c | 78 +++++++++++++++++++++++++++++++++++--- vswitchd/vswitch.ovsschema | 9 ++++- vswitchd/vswitch.xml | 41 ++++++++++++++++++-- 4 files changed, 122 insertions(+), 11 deletions(-) diff --git a/NEWS b/NEWS index bd97ea3c5aa..17ac8cd1d90 100644 --- a/NEWS +++ b/NEWS @@ -34,6 +34,11 @@ Post-v3.3.0 these tunnels will calculate checksums by default and that behaviour can be changed with "options:csum=false" just as with the userspace datapath. + - Local sampling is introduced. It reuses the OpenFlow sample action and + allows samples to be emitted locally (instead of via IPFIX) in a + datapath-specific manner. The Linux kernel datapath is the first to + support this feature by using the new datapath 'psample' action. See + 'local-group-id' column in the Flow_Sample_Collector_Set table. v3.3.0 - 16 Feb 2024 diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index 95a65fcdcd5..c5399d18c43 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -288,6 +288,7 @@ static void bridge_configure_mac_table(struct bridge *); static void bridge_configure_mcast_snooping(struct bridge *); static void bridge_configure_sflow(struct bridge *, int *sflow_bridge_number); static void bridge_configure_ipfix(struct bridge *); +static void bridge_configure_lsample(struct bridge *); static void bridge_configure_spanning_tree(struct bridge *); static void bridge_configure_tables(struct bridge *); static void bridge_configure_dp_desc(struct bridge *); @@ -989,6 +990,7 @@ bridge_reconfigure(const struct ovsrec_open_vswitch *ovs_cfg) bridge_configure_netflow(br); bridge_configure_sflow(br, &sflow_bridge_number); bridge_configure_ipfix(br); + bridge_configure_lsample(br); bridge_configure_spanning_tree(br); bridge_configure_tables(br); bridge_configure_dp_desc(br); @@ -1537,10 +1539,11 @@ ovsrec_ipfix_is_valid(const struct ovsrec_ipfix *ipfix) return ipfix && ipfix->n_targets > 0; 
} -/* Returns whether a Flow_Sample_Collector_Set row is valid. */ +/* Returns whether a Flow_Sample_Collector_Set row contains a valid IPFIX + * configuration. */ static bool -ovsrec_fscs_is_valid(const struct ovsrec_flow_sample_collector_set *fscs, - const struct bridge *br) +ovsrec_fscs_is_valid_ipfix(const struct ovsrec_flow_sample_collector_set *fscs, + const struct bridge *br) { return ovsrec_ipfix_is_valid(fscs->ipfix) && fscs->bridge == br->cfg; } @@ -1558,7 +1561,7 @@ bridge_configure_ipfix(struct bridge *br) const char *virtual_obs_id; OVSREC_FLOW_SAMPLE_COLLECTOR_SET_FOR_EACH(fe_cfg, idl) { - if (ovsrec_fscs_is_valid(fe_cfg, br)) { + if (ovsrec_fscs_is_valid_ipfix(fe_cfg, br)) { n_fe_opts++; } } @@ -1621,7 +1624,7 @@ bridge_configure_ipfix(struct bridge *br) fe_opts = xcalloc(n_fe_opts, sizeof *fe_opts); opts = fe_opts; OVSREC_FLOW_SAMPLE_COLLECTOR_SET_FOR_EACH(fe_cfg, idl) { - if (ovsrec_fscs_is_valid(fe_cfg, br)) { + if (ovsrec_fscs_is_valid_ipfix(fe_cfg, br)) { opts->collector_set_id = fe_cfg->id; sset_init(&opts->targets); sset_add_array(&opts->targets, fe_cfg->ipfix->targets, @@ -1667,6 +1670,71 @@ bridge_configure_ipfix(struct bridge *br) } } +/* Returns whether a Flow_Sample_Collector_Set row contains a valid local + * sampling configuration. */ +static bool +ovsrec_fscs_is_valid_local(const struct ovsrec_flow_sample_collector_set *fscs, + const struct bridge *br) +{ + return fscs->local_group_id && fscs->n_local_group_id == 1 && + fscs->bridge == br->cfg; +} + +/* Set local sample configuration on 'br'. */ +static void +bridge_configure_lsample(struct bridge *br) +{ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + const struct ovsrec_flow_sample_collector_set *fscs; + struct ofproto_lsample_options *opts_array, *opts; + size_t n_opts = 0; + int ret; + + /* Iterate the Flow_Sample_Collector_Set table twice. + * First to get the number of valid configuration entries, then to process + * each of them and build an array of options. 
*/ + OVSREC_FLOW_SAMPLE_COLLECTOR_SET_FOR_EACH (fscs, idl) { + if (ovsrec_fscs_is_valid_local(fscs, br)) { + n_opts++; + } + } + + if (n_opts == 0) { + ofproto_set_local_sample(br->ofproto, NULL, 0); + return; + } + + opts_array = xcalloc(n_opts, sizeof *opts_array); + opts = opts_array; + + OVSREC_FLOW_SAMPLE_COLLECTOR_SET_FOR_EACH (fscs, idl) { + if (!ovsrec_fscs_is_valid_local(fscs, br)) { + continue; + } + opts->collector_set_id = fscs->id; + opts->group_id = *fscs->local_group_id; + opts++; + } + + ret = ofproto_set_local_sample(br->ofproto, opts_array, n_opts); + + if (ret == EOPNOTSUPP) { + if (n_opts) { + VLOG_WARN_RL(&rl, + "bridge %s: ignoring local sampling configuration: " + "not supported by this datapath", + br->name); + } + } else if (ret) { + VLOG_ERR_RL(&rl, "bridge %s: error configuring local sampling: %s", + br->name, ovs_strerror(ret)); + } + + if (n_opts > 0) { + free(opts_array); + } +} + static void port_configure_stp(const struct ofproto *ofproto, struct port *port, struct ofproto_port_stp_settings *port_s, diff --git a/vswitchd/vswitch.ovsschema b/vswitchd/vswitch.ovsschema index e2d5e2e85e6..95018d10745 100644 --- a/vswitchd/vswitch.ovsschema +++ b/vswitchd/vswitch.ovsschema @@ -1,6 +1,6 @@ {"name": "Open_vSwitch", - "version": "8.5.0", - "cksum": "4040946650 27557", + "version": "8.6.0", + "cksum": "1543805939 27765", "tables": { "Open_vSwitch": { "columns": { @@ -562,6 +562,11 @@ "type": {"key": {"type": "uuid", "refTable": "IPFIX"}, "min": 0, "max": 1}}, + "local_group_id": { + "type": {"key": {"type": "integer", + "minInteger": 0, + "maxInteger": 4294967295}, + "min": 0, "max": 1}}, "external_ids": { "type": {"key": "string", "value": "string", "min": 0, "max": "unlimited"}}}, diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index c754616f99e..d89ad2d184c 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -7019,10 +7019,37 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \

      - A set of IPFIX collectors of packet samples generated by OpenFlow - sample actions. This table is used only for IPFIX - flow-based sampling, not for per-bridge sampling (see the table for a description of the two forms). + A set of IPFIX or local sampling collectors of packet samples generated + by OpenFlow sample actions. +

      + +

      + If the column ipfix contains a reference to a + valid IPFIX entry, samples will be emitted via IPFIX. This mechanism + is known as flow-based IPFIX sampling, as opposed to bridge-based + sampling (see the IPFIX table for a description of the + two forms). +

      + +

      + If the column local_group_id contains an integer and the + running datapath supports local sample emission, packets will be sent + to some local sample collector. Samples will contain the group number + specified by local_group_id which helps identify their + source as well as a 64-bit cookie resulting from the concatenation of the + observation_domain_id and the observation_point_id in network byte order. + + The way the sample is emitted and made available for local collectors + is datapath-specific. + + Currently only the Linux kernel datapath supports local sampling which is + implemented by sending the packet to the psample netlink + multicast group. +

      + +

      + Note: both local_group_id and ipfix can be + configured simultaneously.

      @@ -7041,6 +7068,12 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \ record per sampled packet to. + + Configuration of the sample group id to be used in local sampling. + + The overall purpose of these columns is described under Common Columns at the beginning of this document. From c10dbcec758d32d4cca25cedc87b433bd9ffa06a Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Sat, 13 Jul 2024 23:23:42 +0200 Subject: [PATCH 778/833] ofproto-dpif-xlate: Use psample for local sample. Use the newly added psample action to implement OpenFlow sample() actions with local sampling configuration if possible. A bit of refactoring in compose_sample_actions arguments helps make it a bit more readable. Acked-by: Eelco Chaudron Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif-lsample.c | 14 ++ ofproto/ofproto-dpif-lsample.h | 5 + ofproto/ofproto-dpif-xlate.c | 238 ++++++++++++++++++++++----------- ofproto/ofproto-dpif-xlate.h | 5 +- ofproto/ofproto-dpif.c | 2 +- tests/ofproto-dpif.at | 159 ++++++++++++++++++++++ 6 files changed, 343 insertions(+), 80 deletions(-) diff --git a/ofproto/ofproto-dpif-lsample.c b/ofproto/ofproto-dpif-lsample.c index 80f5a51fd9b..11706e3635c 100644 --- a/ofproto/ofproto-dpif-lsample.c +++ b/ofproto/ofproto-dpif-lsample.c @@ -138,6 +138,20 @@ dpif_lsample_set_options(struct dpif_lsample *lsample, return changed; } +/* Returns the group_id for a given collector_set_id, if it exists. 
*/ +bool +dpif_lsample_get_group_id(struct dpif_lsample *ps, uint32_t collector_set_id, + uint32_t *group_id) +{ + struct lsample_exporter_node *node; + + node = dpif_lsample_find_exporter_node(ps, collector_set_id); + if (node) { + *group_id = node->exporter.options.group_id; + } + return !!node; +} + struct dpif_lsample * dpif_lsample_create(void) { diff --git a/ofproto/ofproto-dpif-lsample.h b/ofproto/ofproto-dpif-lsample.h index a491c137d20..26517a64590 100644 --- a/ofproto/ofproto-dpif-lsample.h +++ b/ofproto/ofproto-dpif-lsample.h @@ -18,6 +18,7 @@ #define OFPROTO_DPIF_LSAMPLE_H 1 #include +#include #include struct dpif_lsample; @@ -32,4 +33,8 @@ bool dpif_lsample_set_options(struct dpif_lsample *, const struct ofproto_lsample_options *, size_t n_opts); +bool dpif_lsample_get_group_id(struct dpif_lsample *, + uint32_t collector_set_id, + uint32_t *group_id); + #endif /* OFPROTO_DPIF_LSAMPLE_H */ diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 7c495089509..8704aa9b9bc 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -45,6 +45,7 @@ #include "nx-match.h" #include "odp-execute.h" #include "ofproto/ofproto-dpif-ipfix.h" +#include "ofproto/ofproto-dpif-lsample.h" #include "ofproto/ofproto-dpif-mirror.h" #include "ofproto/ofproto-dpif-monitor.h" #include "ofproto/ofproto-dpif-sflow.h" @@ -116,6 +117,7 @@ struct xbridge { struct mbridge *mbridge; /* Mirroring. */ struct dpif_sflow *sflow; /* SFlow handle, or null. */ struct dpif_ipfix *ipfix; /* Ipfix handle, or null. */ + struct dpif_lsample *lsample; /* Local sample handle, or null. */ struct netflow *netflow; /* Netflow handle, or null. */ struct stp *stp; /* STP or null if disabled. */ struct rstp *rstp; /* RSTP or null if disabled. 
*/ @@ -686,6 +688,7 @@ static void xlate_xbridge_set(struct xbridge *, struct dpif *, const struct mbridge *, const struct dpif_sflow *, const struct dpif_ipfix *, + const struct dpif_lsample *, const struct netflow *, bool forward_bpdu, bool has_in_band, const struct dpif_backer_support *, @@ -1069,6 +1072,7 @@ xlate_xbridge_set(struct xbridge *xbridge, const struct mbridge *mbridge, const struct dpif_sflow *sflow, const struct dpif_ipfix *ipfix, + const struct dpif_lsample *lsample, const struct netflow *netflow, bool forward_bpdu, bool has_in_band, const struct dpif_backer_support *support, @@ -1099,6 +1103,11 @@ xlate_xbridge_set(struct xbridge *xbridge, xbridge->ipfix = dpif_ipfix_ref(ipfix); } + if (xbridge->lsample != lsample) { + dpif_lsample_unref(xbridge->lsample); + xbridge->lsample = dpif_lsample_ref(lsample); + } + if (xbridge->stp != stp) { stp_unref(xbridge->stp); xbridge->stp = stp_ref(stp); @@ -1213,9 +1222,10 @@ xlate_xbridge_copy(struct xbridge *xbridge) xlate_xbridge_set(new_xbridge, xbridge->dpif, xbridge->ml, xbridge->stp, xbridge->rstp, xbridge->ms, xbridge->mbridge, - xbridge->sflow, xbridge->ipfix, xbridge->netflow, - xbridge->forward_bpdu, xbridge->has_in_band, - &xbridge->support, xbridge->addr); + xbridge->sflow, xbridge->ipfix, xbridge->lsample, + xbridge->netflow, xbridge->forward_bpdu, + xbridge->has_in_band, &xbridge->support, + xbridge->addr); LIST_FOR_EACH (xbundle, list_node, &xbridge->xbundles) { xlate_xbundle_copy(new_xbridge, xbundle); } @@ -1372,6 +1382,7 @@ xlate_ofproto_set(struct ofproto_dpif *ofproto, const char *name, const struct mbridge *mbridge, const struct dpif_sflow *sflow, const struct dpif_ipfix *ipfix, + const struct dpif_lsample *lsample, const struct netflow *netflow, bool forward_bpdu, bool has_in_band, const struct dpif_backer_support *support) @@ -1396,7 +1407,7 @@ xlate_ofproto_set(struct ofproto_dpif *ofproto, const char *name, old_addr = xbridge->addr; xlate_xbridge_set(xbridge, dpif, ml, stp, rstp, ms, 
mbridge, sflow, ipfix, - netflow, forward_bpdu, has_in_band, support, + lsample, netflow, forward_bpdu, has_in_band, support, xbridge_addr); if (xbridge_addr != old_addr) { @@ -1428,6 +1439,7 @@ xlate_xbridge_remove(struct xlate_cfg *xcfg, struct xbridge *xbridge) mbridge_unref(xbridge->mbridge); dpif_sflow_unref(xbridge->sflow); dpif_ipfix_unref(xbridge->ipfix); + dpif_lsample_unref(xbridge->lsample); netflow_unref(xbridge->netflow); stp_unref(xbridge->stp); rstp_unref(xbridge->rstp); @@ -3357,58 +3369,91 @@ xlate_normal(struct xlate_ctx *ctx) } } -/* Appends a "sample" action for sFlow or IPFIX to 'ctx->odp_actions'. The - * 'probability' is the number of packets out of UINT32_MAX to sample. The - * 'cookie' is passed back in the callback for each sampled packet. - * 'tunnel_out_port', if not ODPP_NONE, is added as the - * OVS_USERSPACE_ATTR_EGRESS_TUN_PORT attribute. If 'include_actions', - * an OVS_USERSPACE_ATTR_ACTIONS attribute is added. If - * 'emit_set_tunnel', sample(sampling_port=1) would translate into - * datapath sample action set(tunnel(...)), sample(...) and it is used - * for sampling egress tunnel information. - */ +/* Psample-related arguments for compose_sample_action. */ +struct sample_psample_args { + uint32_t group_id; /* Group to be used in psample. */ + ovs_32aligned_be64 cookie; /* Cookie to be used in psample. */ +}; + +/* Userspace-related arguments for compose_sample_action. */ +struct sample_userspace_args { + struct user_action_cookie cookie; /* Data passed back in the upcall + * for each sampled packet. */ + odp_port_t tunnel_out_port; /* If not ODPP_NONE, it is added in + * OVS_USERSPACE_ATTR_EGRESS_TUN_PORT + * attribute. */ + bool include_actions; /* Whether OVS_USERSPACE_ATTR_ACTIONS + * is to be set. */ + +}; + +/* Arguments for compose_sample_action. */ +struct compose_sample_args { + uint32_t probability; /* Number of packets out of + * UINT32_MAX to sample. 
*/ + struct sample_userspace_args *userspace; /* Optional, + * arguments for userspace. */ + struct sample_psample_args *psample; /* Optional, + * arguments for psample. */ +}; + +/* Composes sample action according to 'args'. */ static size_t compose_sample_action(struct xlate_ctx *ctx, - const uint32_t probability, - const struct user_action_cookie *cookie, - const odp_port_t tunnel_out_port, - bool include_actions) + const struct compose_sample_args *args) { - if (probability == 0) { + if (args->probability == 0) { /* No need to generate sampling or the inner action. */ return 0; } + /* At least one of userspace or psample config must be provided. */ + ovs_assert(args->userspace || args->psample); + /* If the slow path meter is configured by the controller, * insert a meter action before the user space action. */ struct ofproto *ofproto = &ctx->xin->ofproto->up; uint32_t meter_id = ofproto->slowpath_meter_id; + size_t cookie_offset = 0; - /* When meter action is not required, avoid generate sample action - * for 100% sampling rate. */ - bool is_sample = probability < UINT32_MAX || meter_id != UINT32_MAX; + /* The meter action is only used to throttle userspace actions. + * If they are not needed and the sampling rate is 100%, avoid generating + * a sample action. 
*/ + bool is_sample = (args->probability < UINT32_MAX || + (args->userspace && meter_id != UINT32_MAX)); size_t sample_offset = 0, actions_offset = 0; if (is_sample) { sample_offset = nl_msg_start_nested(ctx->odp_actions, OVS_ACTION_ATTR_SAMPLE); nl_msg_put_u32(ctx->odp_actions, OVS_SAMPLE_ATTR_PROBABILITY, - probability); + args->probability); actions_offset = nl_msg_start_nested(ctx->odp_actions, OVS_SAMPLE_ATTR_ACTIONS); } - if (meter_id != UINT32_MAX) { - nl_msg_put_u32(ctx->odp_actions, OVS_ACTION_ATTR_METER, meter_id); + if (args->psample) { + odp_put_psample_action(ctx->odp_actions, + args->psample->group_id, + (void *) &args->psample->cookie, + sizeof args->psample->cookie); + } + + if (args->userspace) { + if (meter_id != UINT32_MAX) { + nl_msg_put_u32(ctx->odp_actions, OVS_ACTION_ATTR_METER, meter_id); + } + + odp_port_t odp_port = ofp_port_to_odp_port( + ctx->xbridge, ctx->xin->flow.in_port.ofp_port); + uint32_t pid = dpif_port_get_pid(ctx->xbridge->dpif, odp_port); + int res = odp_put_userspace_action(pid, &args->userspace->cookie, + sizeof args->userspace->cookie, + args->userspace->tunnel_out_port, + args->userspace->include_actions, + ctx->odp_actions, &cookie_offset); + ovs_assert(res == 0); } - odp_port_t odp_port = ofp_port_to_odp_port( - ctx->xbridge, ctx->xin->flow.in_port.ofp_port); - uint32_t pid = dpif_port_get_pid(ctx->xbridge->dpif, odp_port); - size_t cookie_offset; - int res = odp_put_userspace_action(pid, cookie, sizeof *cookie, - tunnel_out_port, include_actions, - ctx->odp_actions, &cookie_offset); - ovs_assert(res == 0); if (is_sample) { nl_msg_end_nested(ctx->odp_actions, actions_offset); nl_msg_end_nested(ctx->odp_actions, sample_offset); @@ -3428,19 +3473,24 @@ static size_t compose_sflow_action(struct xlate_ctx *ctx) { struct dpif_sflow *sflow = ctx->xbridge->sflow; + struct sample_userspace_args userspace; + struct compose_sample_args args = {0}; + if (!sflow || ctx->xin->flow.in_port.ofp_port == OFPP_NONE) { return 0; } - struct 
user_action_cookie cookie; + memset(&userspace, 0, sizeof userspace); + userspace.cookie.type = USER_ACTION_COOKIE_SFLOW; + userspace.cookie.ofp_in_port = ctx->xin->flow.in_port.ofp_port; + userspace.cookie.ofproto_uuid = ctx->xbridge->ofproto->uuid; + userspace.tunnel_out_port = ODPP_NONE; + userspace.include_actions = true; - memset(&cookie, 0, sizeof cookie); - cookie.type = USER_ACTION_COOKIE_SFLOW; - cookie.ofp_in_port = ctx->xin->flow.in_port.ofp_port; - cookie.ofproto_uuid = ctx->xbridge->ofproto->uuid; + args.probability = dpif_sflow_get_probability(sflow); + args.userspace = &userspace; - return compose_sample_action(ctx, dpif_sflow_get_probability(sflow), - &cookie, ODPP_NONE, true); + return compose_sample_action(ctx, &args); } /* If flow IPFIX is enabled, make sure IPFIX flow sample action @@ -3451,7 +3501,11 @@ static void compose_ipfix_action(struct xlate_ctx *ctx, odp_port_t output_odp_port) { struct dpif_ipfix *ipfix = ctx->xbridge->ipfix; - odp_port_t tunnel_out_port = ODPP_NONE; + struct sample_userspace_args userspace; + struct compose_sample_args args = {0}; + + memset(&userspace, 0, sizeof userspace); + userspace.tunnel_out_port = ODPP_NONE; if (!ipfix || (output_odp_port == ODPP_NONE && @@ -3476,21 +3530,20 @@ compose_ipfix_action(struct xlate_ctx *ctx, odp_port_t output_odp_port) */ if (dpif_ipfix_get_bridge_exporter_tunnel_sampling(ipfix) && dpif_ipfix_is_tunnel_port(ipfix, output_odp_port) ) { - tunnel_out_port = output_odp_port; + userspace.tunnel_out_port = output_odp_port; } } - struct user_action_cookie cookie; + userspace.cookie.type = USER_ACTION_COOKIE_IPFIX; + userspace.cookie.ofp_in_port = ctx->xin->flow.in_port.ofp_port; + userspace.cookie.ofproto_uuid = ctx->xbridge->ofproto->uuid; + userspace.cookie.ipfix.output_odp_port = output_odp_port; + userspace.include_actions = false; - memset(&cookie, 0, sizeof cookie); - cookie.type = USER_ACTION_COOKIE_IPFIX; - cookie.ofp_in_port = ctx->xin->flow.in_port.ofp_port; - 
cookie.ofproto_uuid = ctx->xbridge->ofproto->uuid; - cookie.ipfix.output_odp_port = output_odp_port; + args.probability = dpif_ipfix_get_bridge_exporter_probability(ipfix); + args.userspace = &userspace; - compose_sample_action(ctx, - dpif_ipfix_get_bridge_exporter_probability(ipfix), - &cookie, tunnel_out_port, false); + compose_sample_action(ctx, &args); } /* Fix "sample" action according to data collected while composing ODP actions, @@ -5847,22 +5900,16 @@ xlate_fin_timeout(struct xlate_ctx *ctx, } static void -xlate_sample_action(struct xlate_ctx *ctx, - const struct ofpact_sample *os) +xlate_fill_ipfix_sample(struct xlate_ctx *ctx, + const struct ofpact_sample *os, + const struct dpif_ipfix *ipfix, + struct sample_userspace_args *userspace) { odp_port_t output_odp_port = ODPP_NONE; - odp_port_t tunnel_out_port = ODPP_NONE; - struct dpif_ipfix *ipfix = ctx->xbridge->ipfix; bool emit_set_tunnel = false; - if (!ipfix) { - return; - } - - /* Scale the probability from 16-bit to 32-bit while representing - * the same percentage. */ - uint32_t probability = - ((uint32_t) os->probability << 16) | os->probability; + memset(userspace, 0, sizeof *userspace); + userspace->tunnel_out_port = ODPP_NONE; /* If ofp_port in flow sample action is equel to ofp_port, * this sample action is a input port action. 
*/ @@ -5879,7 +5926,7 @@ xlate_sample_action(struct xlate_ctx *ctx, if (dpif_ipfix_get_flow_exporter_tunnel_sampling(ipfix, os->collector_set_id) && dpif_ipfix_is_tunnel_port(ipfix, output_odp_port)) { - tunnel_out_port = output_odp_port; + userspace->tunnel_out_port = output_odp_port; emit_set_tunnel = true; } } @@ -5913,20 +5960,57 @@ xlate_sample_action(struct xlate_ctx *ctx, } } - struct user_action_cookie cookie; + userspace->cookie.type = USER_ACTION_COOKIE_FLOW_SAMPLE; + userspace->cookie.ofp_in_port = ctx->xin->flow.in_port.ofp_port; + userspace->cookie.ofproto_uuid = ctx->xbridge->ofproto->uuid; + userspace->cookie.flow_sample.probability = os->probability; + userspace->cookie.flow_sample.collector_set_id = os->collector_set_id; + userspace->cookie.flow_sample.obs_domain_id = os->obs_domain_id; + userspace->cookie.flow_sample.obs_point_id = os->obs_point_id; + userspace->cookie.flow_sample.output_odp_port = output_odp_port; + userspace->cookie.flow_sample.direction = os->direction; + userspace->include_actions = false; +} - memset(&cookie, 0, sizeof cookie); - cookie.type = USER_ACTION_COOKIE_FLOW_SAMPLE; - cookie.ofp_in_port = ctx->xin->flow.in_port.ofp_port; - cookie.ofproto_uuid = ctx->xbridge->ofproto->uuid; - cookie.flow_sample.probability = os->probability; - cookie.flow_sample.collector_set_id = os->collector_set_id; - cookie.flow_sample.obs_domain_id = os->obs_domain_id; - cookie.flow_sample.obs_point_id = os->obs_point_id; - cookie.flow_sample.output_odp_port = output_odp_port; - cookie.flow_sample.direction = os->direction; - - compose_sample_action(ctx, probability, &cookie, tunnel_out_port, false); +static void +xlate_sample_action(struct xlate_ctx *ctx, + const struct ofpact_sample *os) +{ + struct dpif_lsample *lsample = ctx->xbridge->lsample; + struct dpif_ipfix *ipfix = ctx->xbridge->ipfix; + struct compose_sample_args compose_args = {0}; + struct sample_userspace_args userspace; + struct sample_psample_args psample; + + if (!ipfix && 
!lsample) { + return; + } + + /* Scale the probability from 16-bit to 32-bit while representing + * the same percentage. */ + compose_args.probability = + ((uint32_t) os->probability << 16) | os->probability; + + if (ipfix) { + xlate_fill_ipfix_sample(ctx, os, ipfix, &userspace); + compose_args.userspace = &userspace; + } + + if (lsample && + dpif_lsample_get_group_id(lsample, + os->collector_set_id, + &psample.group_id)) { + psample.cookie.hi = htonl(os->obs_domain_id); + psample.cookie.lo = htonl(os->obs_point_id); + + compose_args.psample = &psample; + } + + if (!compose_args.userspace && !compose_args.psample) { + return; + } + + compose_sample_action(ctx, &compose_args); } /* Determine if an datapath action translated from the openflow action diff --git a/ofproto/ofproto-dpif-xlate.h b/ofproto/ofproto-dpif-xlate.h index 05b46fb26b1..08f9397d824 100644 --- a/ofproto/ofproto-dpif-xlate.h +++ b/ofproto/ofproto-dpif-xlate.h @@ -176,8 +176,9 @@ void xlate_ofproto_set(struct ofproto_dpif *, const char *name, struct dpif *, const struct mac_learning *, struct stp *, struct rstp *, const struct mcast_snooping *, const struct mbridge *, const struct dpif_sflow *, - const struct dpif_ipfix *, const struct netflow *, - bool forward_bpdu, bool has_in_band, + const struct dpif_ipfix *, const struct dpif_lsample *, + const struct netflow *, bool forward_bpdu, + bool has_in_band, const struct dpif_backer_support *support); void xlate_remove_ofproto(struct ofproto_dpif *); struct ofproto_dpif *xlate_ofproto_lookup(const struct uuid *uuid); diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index 59b9a525287..173a618cc90 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -486,7 +486,7 @@ type_run(const char *type) ofproto->backer->dpif, ofproto->ml, ofproto->stp, ofproto->rstp, ofproto->ms, ofproto->mbridge, ofproto->sflow, ofproto->ipfix, - ofproto->netflow, + ofproto->lsample, ofproto->netflow, ofproto->up.forward_bpdu, 
connmgr_has_in_band(ofproto->up.connmgr), &ofproto->backer->rt_support); diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at index 30ef0468c8d..e6646106ec8 100644 --- a/tests/ofproto-dpif.at +++ b/tests/ofproto-dpif.at @@ -12176,3 +12176,162 @@ AT_CHECK([test 1 = `ovs-ofctl parse-pcap p2-tx.pcap | wc -l`]) OVS_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([ofproto-dpif - Local sampling - not supported]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 + +AT_CHECK([ovs-vsctl -- --id=@br0 get Bridge br0 \ + -- create Flow_Sample_Collector_Set id=1 bridge=@br0 \ + local-group-id=10 \ + -- create Flow_Sample_Collector_Set id=2 bridge=@br0 \ + local-group-id=12], + [0], [ignore]) + +m4_define([NOT_SUPPORTED_WARN], [dnl +ignoring local sampling configuration: not supported by this datapath]) + +AT_CHECK([grep -q "NOT_SUPPORTED_WARN" ovs-vswitchd.log ]) + +AT_DATA([flows.txt], [dnl +in_port=1 actions=sample(probability=32767,obs_domain_id=100,obs_point_id=200),2 +]) + +AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) + +m4_define([TRACE_PKT], [m4_join([,], + [in_port(1)], + [eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800)], + [ipv4(src=10.10.10.2,dst=10.10.10.1,proto=1,tos=1,ttl=128,frag=no)], + [icmp(type=8,code=0)])]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'TRACE_PKT'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], [dnl +Datapath actions: 2 +]) + +OVS_VSWITCHD_STOP(["/NOT_SUPPORTED_WARN/d"]) +AT_CLEANUP + +AT_SETUP([ofproto-dpif - Local sampling - sanity check]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 3 + +dnl Enabling an unsupported feature is dangerous but we are not sending traffic.
+AT_CHECK([ovs-appctl dpif/set-dp-features --force br0 psample true], [0], [ignore]) + +AT_CHECK([ovs-vsctl -- --id=@br0 get Bridge br0 \ + -- create Flow_Sample_Collector_Set id=1 bridge=@br0 \ + local-group-id=42], + [0], [ignore]) + +AT_DATA([flows.txt], [dnl +in_port=1, actions=sample(probability=32767,collector_set_id=1,obs_domain_id=100,obs_point_id=200),3 +in_port=2, actions=sample(probability=32767,collector_set_id=20,obs_domain_id=100,obs_point_id=200),3 +]) + +AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) + +m4_define([TRACE_PKT], [m4_join([,], + [eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800)], + [ipv4(src=10.10.10.2,dst=10.10.10.1,proto=1,tos=1,ttl=128,frag=no)], + [icmp(type=8,code=0)])]) + +dnl collector_set_id does not match. +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(2) TRACE_PKT'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], [dnl +Datapath actions: 3 +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1) TRACE_PKT'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], [dnl +Datapath actions: sample(sample=50.0%,actions(psample(group=42,cookie=0x64000000c8))),3 +]) + +OVS_VSWITCHD_STOP("/Enabling an unsupported feature is very dangerous/d") +AT_CLEANUP + +AT_SETUP([ofproto-dpif - Local sampling - with IPFIX]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 + +dnl Enabling an unsupported feature is dangerous but we are not sending traffic.
+AT_CHECK([ovs-appctl dpif/set-dp-features --force br0 psample true], [0], [ignore]) + +AT_CHECK([ovs-vsctl -- --id=@br0 get Bridge br0 \ + -- --id=@i create ipfix targets=\"127.0.0.1:4739\" \ + -- create Flow_Sample_Collector_Set ipfix=@i id=1 \ + bridge=@br0 local-group-id=42], + [0], [ignore]) + +AT_DATA([flows.txt], [dnl +in_port=1, actions=sample(probability=32767,collector_set_id=1,obs_domain_id=100,obs_point_id=200),2 +]) + +AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) + +m4_define([TRACE_PKT], [m4_join([,], + [eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800)], + [ipv4(src=10.10.10.2,dst=10.10.10.1,proto=1,tos=1,ttl=128,frag=no)], + [icmp(type=8,code=0)])]) + +m4_define([EXPECTED_ACT], [m4_join([], + [sample(sample=50.0%,actions(], + [psample(group=42,cookie=0x64000000c8),], + [userspace(pid=0,], + [flow_sample(probability=32767,collector_set_id=1,obs_domain_id=100,obs_point_id=200,output_port=4294967295)], + [))),], + [2], +)]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1) TRACE_PKT'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], [dnl +Datapath actions: EXPECTED_ACT +]) + +OVS_VSWITCHD_STOP("/Enabling an unsupported feature is very dangerous/d") +AT_CLEANUP + +AT_SETUP([ofproto-dpif - Local sampling - with metered IPFIX]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 + +dnl Enabling an unsupported feature is dangerous but we are not sending traffic.
+AT_CHECK([ovs-appctl dpif/set-dp-features --force br0 psample true], [0], [ignore]) + +AT_CHECK([ovs-vsctl -- --id=@br0 get Bridge br0 \ + -- --id=@i create ipfix targets=\"127.0.0.1:4739\" \ + -- create Flow_Sample_Collector_Set ipfix=@i id=1 \ + bridge=@br0 local-group-id=42], + [0], [ignore]) + +AT_CHECK([ovs-ofctl -O OpenFlow13 add-meter br0 'meter=slowpath pktps stats bands=type=drop rate=2']) + +AT_DATA([flows.txt], [dnl +in_port=1, actions=sample(probability=32767,collector_set_id=1,obs_domain_id=100,obs_point_id=200),2 +]) + +AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) + +m4_define([TRACE_PKT], [m4_join([,], + [eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800)], + [ipv4(src=10.10.10.2,dst=10.10.10.1,proto=1,tos=1,ttl=128,frag=no)], + [icmp(type=8,code=0)])]) + +m4_define([EXPECTED_ACT], [m4_join([], + [sample(sample=50.0%,actions(], + [psample(group=42,cookie=0x64000000c8),], + [meter(0),], + [userspace(pid=0,], + [flow_sample(probability=32767,collector_set_id=1,obs_domain_id=100,obs_point_id=200,output_port=4294967295)], + [))),], + [2], +)]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1) TRACE_PKT'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], [dnl +Datapath actions: EXPECTED_ACT +]) + +OVS_VSWITCHD_STOP("/Enabling an unsupported feature is very dangerous/d") +AT_CLEANUP From 742de01a4a2ee0b6222698debcefd84dc7fbf268 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Sat, 13 Jul 2024 23:23:43 +0200 Subject: [PATCH 779/833] tests: Add test-psample testing utility. This simple program reads from psample and prints the packets to stdout. 
Acked-by: Eelco Chaudron Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- include/linux/automake.mk | 1 + include/linux/psample.h | 68 +++++++++ tests/automake.mk | 3 +- tests/test-psample.c | 290 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 361 insertions(+), 1 deletion(-) create mode 100644 include/linux/psample.h create mode 100644 tests/test-psample.c diff --git a/include/linux/automake.mk b/include/linux/automake.mk index cdae5eedc48..ac306b53c2c 100644 --- a/include/linux/automake.mk +++ b/include/linux/automake.mk @@ -3,6 +3,7 @@ noinst_HEADERS += \ include/linux/netfilter/nf_conntrack_sctp.h \ include/linux/openvswitch.h \ include/linux/pkt_cls.h \ + include/linux/psample.h \ include/linux/gen_stats.h \ include/linux/tc_act/tc_mpls.h \ include/linux/tc_act/tc_pedit.h \ diff --git a/include/linux/psample.h b/include/linux/psample.h new file mode 100644 index 00000000000..d5761b73072 --- /dev/null +++ b/include/linux/psample.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __LINUX_PSAMPLE_H +#define __LINUX_PSAMPLE_H + +enum { + PSAMPLE_ATTR_IIFINDEX, + PSAMPLE_ATTR_OIFINDEX, + PSAMPLE_ATTR_ORIGSIZE, + PSAMPLE_ATTR_SAMPLE_GROUP, + PSAMPLE_ATTR_GROUP_SEQ, + PSAMPLE_ATTR_SAMPLE_RATE, + PSAMPLE_ATTR_DATA, + PSAMPLE_ATTR_GROUP_REFCOUNT, + PSAMPLE_ATTR_TUNNEL, + + PSAMPLE_ATTR_PAD, + PSAMPLE_ATTR_OUT_TC, /* u16 */ + PSAMPLE_ATTR_OUT_TC_OCC, /* u64, bytes */ + PSAMPLE_ATTR_LATENCY, /* u64, nanoseconds */ + PSAMPLE_ATTR_TIMESTAMP, /* u64, nanoseconds */ + PSAMPLE_ATTR_PROTO, /* u16 */ + PSAMPLE_ATTR_USER_COOKIE, /* binary, user provided data */ + PSAMPLE_ATTR_SAMPLE_PROBABILITY,/* no argument, interpret rate in + * PSAMPLE_ATTR_SAMPLE_RATE as a + * probability scaled 0 - U32_MAX. 
+ */ + + __PSAMPLE_ATTR_MAX +}; + +enum psample_command { + PSAMPLE_CMD_SAMPLE, + PSAMPLE_CMD_GET_GROUP, + PSAMPLE_CMD_NEW_GROUP, + PSAMPLE_CMD_DEL_GROUP, + PSAMPLE_CMD_SAMPLE_FILTER_SET, +}; + +enum psample_tunnel_key_attr { + PSAMPLE_TUNNEL_KEY_ATTR_ID, /* be64 Tunnel ID */ + PSAMPLE_TUNNEL_KEY_ATTR_IPV4_SRC, /* be32 src IP address. */ + PSAMPLE_TUNNEL_KEY_ATTR_IPV4_DST, /* be32 dst IP address. */ + PSAMPLE_TUNNEL_KEY_ATTR_TOS, /* u8 Tunnel IP ToS. */ + PSAMPLE_TUNNEL_KEY_ATTR_TTL, /* u8 Tunnel IP TTL. */ + PSAMPLE_TUNNEL_KEY_ATTR_DONT_FRAGMENT, /* No argument, set DF. */ + PSAMPLE_TUNNEL_KEY_ATTR_CSUM, /* No argument. CSUM packet. */ + PSAMPLE_TUNNEL_KEY_ATTR_OAM, /* No argument. OAM frame. */ + PSAMPLE_TUNNEL_KEY_ATTR_GENEVE_OPTS, /* Array of Geneve options. */ + PSAMPLE_TUNNEL_KEY_ATTR_TP_SRC, /* be16 src Transport Port. */ + PSAMPLE_TUNNEL_KEY_ATTR_TP_DST, /* be16 dst Transport Port. */ + PSAMPLE_TUNNEL_KEY_ATTR_VXLAN_OPTS, /* Nested VXLAN opts* */ + PSAMPLE_TUNNEL_KEY_ATTR_IPV6_SRC, /* struct in6_addr src IPv6 address. */ + PSAMPLE_TUNNEL_KEY_ATTR_IPV6_DST, /* struct in6_addr dst IPv6 address. */ + PSAMPLE_TUNNEL_KEY_ATTR_PAD, + PSAMPLE_TUNNEL_KEY_ATTR_ERSPAN_OPTS, /* struct erspan_metadata */ + PSAMPLE_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE, /* No argument. 
IPV4_INFO_BRIDGE mode.*/ + __PSAMPLE_TUNNEL_KEY_ATTR_MAX +}; + +/* Can be overridden at runtime by module option */ +#define PSAMPLE_ATTR_MAX (__PSAMPLE_ATTR_MAX - 1) + +#define PSAMPLE_NL_MCGRP_CONFIG_NAME "config" +#define PSAMPLE_NL_MCGRP_SAMPLE_NAME "packets" +#define PSAMPLE_GENL_NAME "psample" +#define PSAMPLE_GENL_VERSION 1 + +#endif diff --git a/tests/automake.mk b/tests/automake.mk index 04f48f2d8be..edfc2cb3359 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -499,7 +499,8 @@ endif if LINUX tests_ovstest_SOURCES += \ tests/test-netlink-conntrack.c \ - tests/test-netlink-policy.c + tests/test-netlink-policy.c \ + tests/test-psample.c endif tests_ovstest_LDADD = lib/libopenvswitch.la diff --git a/tests/test-psample.c b/tests/test-psample.c new file mode 100644 index 00000000000..1494dcc8d25 --- /dev/null +++ b/tests/test-psample.c @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2024 Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#undef NDEBUG +#include +#include +#include +#include +#include + +#include + +#include "command-line.h" +#include "dp-packet.h" +#include "util.h" +#include "netlink.h" +#include "netlink-socket.h" +#include "openvswitch/ofp-actions.h" +#include "openvswitch/ofp-print.h" +#include "openvswitch/types.h" +#include "openvswitch/uuid.h" +#include "openvswitch/vlog.h" +#include "ovstest.h" + +VLOG_DEFINE_THIS_MODULE(test_psample); + +static int psample_family = 0; +static uint32_t group_id = 0; +static bool has_filter; + +static void usage(void) +{ + printf("%s: psample collector test utility\n" + "usage: %s [OPTIONS] [GROUP]\n" + "where GROUP is the psample group_id to listen on. " + "If none is provided all events are printed.\n", + program_name, program_name); + vlog_usage(); + printf("\nOther Options:\n" + " -h, --help display this help message\n"); +} + +static void parse_options(int argc, char *argv[]) +{ + enum { + VLOG_OPTION_ENUMS + }; + static const struct option long_options[] = { + {"group", required_argument, NULL, 'g'}, + {"help", no_argument, NULL, 'h'}, + VLOG_LONG_OPTIONS, + {NULL, 0, NULL, 0}, + }; + char *tmp_short_options, *short_options; + int ret = EXIT_SUCCESS; + bool do_exit = false; + + tmp_short_options = ovs_cmdl_long_options_to_short_options(long_options); + short_options = xasprintf("+%s", tmp_short_options); + + while (!do_exit) { + int option; + + option = getopt_long(argc, argv, short_options, long_options, NULL); + if (option == -1) { + break; + } + + switch (option) { + + VLOG_OPTION_HANDLERS + + case 'h': + usage(); + do_exit = true; + ret = EXIT_SUCCESS; + break; + + case '?': + do_exit = true; + ret = EXIT_FAILURE; + break; + + default: + OVS_NOT_REACHED(); + } + } + + free(tmp_short_options); + free(short_options); + if (do_exit) { + exit(ret); + } +} + +static int connect_psample_socket(struct nl_sock **sock) +{ + unsigned int psample_packet_mcgroup; + int error; + + error = 
nl_lookup_genl_family(PSAMPLE_GENL_NAME , &psample_family); + if (error) { + VLOG_ERR("PSAMPLE_GENL_NAME not found: %s", ovs_strerror(error)); + return error; + } + + error = nl_lookup_genl_mcgroup(PSAMPLE_GENL_NAME, + PSAMPLE_NL_MCGRP_SAMPLE_NAME, + &psample_packet_mcgroup); + if (error) { + VLOG_ERR("psample packet multicast group not found: %s", + ovs_strerror(error)); + return error; + } + + error = nl_sock_create(NETLINK_GENERIC, sock); + if (error) { + VLOG_ERR("cannot create netlink socket: %s ", ovs_strerror(error)); + return error; + } + + nl_sock_listen_all_nsid(*sock, true); + + error = nl_sock_join_mcgroup(*sock, psample_packet_mcgroup); + if (error) { + nl_sock_destroy(*sock); + *sock = NULL; + VLOG_ERR("cannot join psample multicast group: %s", + ovs_strerror(error)); + return error; + } + return 0; +} + +/* Internal representation of a sample. */ +struct sample { + struct dp_packet packet; + uint32_t group_id; + uint32_t rate; + uint32_t obs_domain_id; + uint32_t obs_point_id; + bool has_cookie; +}; + +static inline void +sample_clear(struct sample *sample) +{ + sample->group_id = 0; + sample->obs_domain_id = 0; + sample->obs_point_id = 0; + sample->has_cookie = false; + dp_packet_clear(&sample->packet); +} + +static int +parse_psample(struct ofpbuf *buf, struct sample *sample) +{ + static const struct nl_policy psample_packet_policy[] = { + [PSAMPLE_ATTR_SAMPLE_GROUP] = { .type = NL_A_U32 }, + [PSAMPLE_ATTR_SAMPLE_RATE] = { .type = NL_A_U32 }, + [PSAMPLE_ATTR_DATA] = { .type = NL_A_UNSPEC, + .optional = true }, + [PSAMPLE_ATTR_USER_COOKIE] = { .type = NL_A_UNSPEC, + .optional = true }, + }; + + struct ofpbuf b = ofpbuf_const_initializer(buf->data, buf->size); + struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg); + struct genlmsghdr *genl = ofpbuf_try_pull(&b, sizeof *genl); + struct nlattr *attr; + + struct nlattr *a[ARRAY_SIZE(psample_packet_policy)]; + if (!nlmsg || !genl + || !nl_policy_parse(&b, 0, psample_packet_policy, a, + 
ARRAY_SIZE(psample_packet_policy))) { + return EINVAL; + } + + attr = a[PSAMPLE_ATTR_DATA]; + if (attr) { + dp_packet_push(&sample->packet, nl_attr_get(attr), + nl_attr_get_size(attr)); + } + + sample->group_id = nl_attr_get_u32(a[PSAMPLE_ATTR_SAMPLE_GROUP]); + sample->rate = nl_attr_get_u32(a[PSAMPLE_ATTR_SAMPLE_RATE]); + + attr = a[PSAMPLE_ATTR_USER_COOKIE]; + if (attr && nl_attr_get_size(attr) == + sizeof sample->obs_domain_id + sizeof sample->obs_point_id) { + const ovs_be32 *data = nl_attr_get(attr); + + sample->has_cookie = true; + sample->obs_domain_id = ntohl(*data++); + sample->obs_point_id = ntohl(*data); + } + return 0; +} + +static void run(struct nl_sock *sock) +{ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10); + struct sample sample = {}; + int error; + + dp_packet_init(&sample.packet, 1500); + + fprintf(stdout, "Listening for psample events\n"); + fflush(stdout); + + for (;;) { + uint64_t buf_stub[4096 / 8]; + struct ofpbuf buf; + + sample_clear(&sample); + + ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub); + error = nl_sock_recv(sock, &buf, NULL, true); + + if (error == ENOBUFS) { + fprintf(stderr, "[missed events]\n"); + continue; + } else if (error == EAGAIN) { + continue; + } else if (error) { + VLOG_ERR_RL(&rl, "error reading samples: %i", error); + continue; + } + + error = parse_psample(&buf, &sample); + if (error) { + VLOG_ERR_RL(&rl, "error parsing samples: %i", error); + continue; + } + + if (!has_filter || sample.group_id == group_id) { + fprintf(stdout, "group_id=0x%"PRIx32",prob=%"PRIu32" ", + sample.group_id, sample.rate); + if (sample.has_cookie) { + fprintf(stdout, + "obs_domain=0x%"PRIx32",obs_point=0x%"PRIx32" ", + sample.obs_domain_id, sample.obs_point_id); + } + ofp_print_dp_packet(stdout, &sample.packet); + } + fflush(stdout); + } +} + +static void +test_psample_main(int argc, char *argv[]) +{ + struct nl_sock *sock; + int error; + + parse_options(argc, argv); + + if (argc - optind > 1) { + ovs_fatal(0, "at 
most one positional argument supported " + "(use --help for help)"); + } else if (argc - optind == 1) { + if (!str_to_uint(argv[optind], 10, &group_id)) { + ovs_fatal(0, "invalid group id"); + } + has_filter = true; + } + + error = connect_psample_socket(&sock); + if (error) { + ovs_fatal(error, "failed to connect to psample socket"); + } + + run(sock); +} + +OVSTEST_REGISTER("test-psample", test_psample_main); From 45034c2064d58e80b7b2a65f962900e73db4cd16 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Sat, 13 Jul 2024 23:23:44 +0200 Subject: [PATCH 780/833] tests: Test local sampling. Test simultaneous IPFIX and local sampling including slow-path. Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- tests/system-common-macros.at | 4 + tests/system-traffic.at | 306 ++++++++++++++++++++++++++++++++++ 2 files changed, 310 insertions(+) diff --git a/tests/system-common-macros.at b/tests/system-common-macros.at index 2a68cd664e5..e9be021f3ff 100644 --- a/tests/system-common-macros.at +++ b/tests/system-common-macros.at @@ -378,3 +378,7 @@ m4_define([OVS_CHECK_GITHUB_ACTION], # OVS_CHECK_DROP_ACTION() m4_define([OVS_CHECK_DROP_ACTION], [AT_SKIP_IF([! grep -q "Datapath supports explicit drop action" ovs-vswitchd.log])]) + +# OVS_CHECK_PSAMPLE() +m4_define([OVS_CHECK_PSAMPLE], + [AT_SKIP_IF([! 
grep -q "Datapath supports psample action" ovs-vswitchd.log])]) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 3f1a15445ee..120c66e5d46 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -9103,3 +9103,309 @@ OVS_WAIT_UNTIL([ovs-pcap p2.pcap | grep -q "m4_join([], [^], OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP + +AT_BANNER([local-sampling]) + +m4_define([SAMPLE_ACTION], + [sample(probability=65535,collector_set_id=$1,obs_domain_id=$2,obs_point_id=$3)]dnl +) + +AT_SETUP([psample - sanity check]) +OVS_TRAFFIC_VSWITCHD_START() +OVS_CHECK_PSAMPLE() + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +AT_CHECK([ovs-vsctl -- --id=@br0 get Bridge br0 \ + -- create Flow_Sample_Collector_Set id=1 bridge=@br0 \ + local-group-id=10 \ + -- create Flow_Sample_Collector_Set id=2 bridge=@br0 \ + local-group-id=12], + [0], [ignore]) + +AT_DATA([flows.txt], [dnl +arp actions=NORMAL +in_port=ovs-p0,ip actions=SAMPLE_ACTION(1, 2853183536, 2856341600),ovs-p1 +in_port=ovs-p1,ip actions=SAMPLE_ACTION(2, 3138396208, 3141554272),ovs-p0 +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +OVS_DAEMONIZE([ovstest test-psample > psample.out], [psample.pid]) +OVS_WAIT_UNTIL([grep -q "Listening for psample events" psample.out]) + +NS_CHECK_EXEC([at_ns0], [ping -q -c 1 10.1.1.2 | FORMAT_PING], [0], [dnl +1 packets transmitted, 1 received, 0% packet loss, time 0ms +]) + +AT_CHECK([ovs-appctl dpctl/dump-flows -m --names], [0], [stdout]) +AT_CHECK([grep -q 'actions:psample(group=10,cookie=0xaa102030aa405060),ovs-p1' stdout]) +AT_CHECK([grep -q 'actions:psample(group=12,cookie=0xbb102030bb405060),ovs-p0' stdout]) + +m4_define([SAMPLE1], [m4_join([ ], + [group_id=0xa,prob=4294967295], + [obs_domain=0xaa102030,obs_point=0xaa405060], + [.*icmp.*nw_src=10.1.1.1,nw_dst=10.1.1.2])]) + +m4_define([SAMPLE2], [m4_join([ ], + [group_id=0xc,prob=4294967295], + 
[obs_domain=0xbb102030,obs_point=0xbb405060], + [.*icmp.*nw_src=10.1.1.2,nw_dst=10.1.1.1])]) + +OVS_WAIT_UNTIL([grep -qE 'SAMPLE1' psample.out]) +OVS_WAIT_UNTIL([grep -qE 'SAMPLE2' psample.out]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([psample - sanity check IPv6]) +OVS_TRAFFIC_VSWITCHD_START() +OVS_CHECK_PSAMPLE() + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "fc00::1/96") +ADD_VETH(p1, at_ns1, br0, "fc00::2/96") + +AT_CHECK([ovs-vsctl -- --id=@br0 get Bridge br0 \ + -- create Flow_Sample_Collector_Set id=1 bridge=@br0 \ + local-group-id=10 \ + -- create Flow_Sample_Collector_Set id=2 bridge=@br0 \ + local-group-id=12], + [0], [ignore]) + +AT_DATA([flows.txt], [dnl +priority=100,in_port=ovs-p0,ip6,icmp6,icmpv6_type=128 actions=SAMPLE_ACTION(1, 2853183536, 2856341600),ovs-p1 +priority=100,in_port=ovs-p1,ip6,icmp6,icmpv6_type=129 actions=SAMPLE_ACTION(2, 3138396208, 3141554272),ovs-p0 +priority=0 actions=NORMAL +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +OVS_DAEMONIZE([ovstest test-psample > psample.out], [psample.pid]) +OVS_WAIT_UNTIL([grep -q "Listening for psample events" psample.out]) + +OVS_WAIT_UNTIL_EQUAL([ip netns exec at_ns0 ping6 -I fc00::1 -q -W 2 -c 1 fc00::2 | FORMAT_PING], [dnl +1 packets transmitted, 1 received, 0% packet loss, time 0ms]) + +AT_CHECK([ovs-appctl dpctl/dump-flows -m --names], [0], [stdout]) +AT_CHECK([grep -q 'actions:psample(group=10,cookie=0xaa102030aa405060),ovs-p1' stdout]) +AT_CHECK([grep -q 'actions:psample(group=12,cookie=0xbb102030bb405060),ovs-p0' stdout]) + +m4_define([SAMPLE1], [m4_join([ ], + [group_id=0xa,prob=4294967295], + [obs_domain=0xaa102030,obs_point=0xaa405060], + [.*icmp6.*ipv6_src=fc00::1,ipv6_dst=fc00::2])]) +m4_define([SAMPLE2], [m4_join([ ], + [group_id=0xc,prob=4294967295], + [obs_domain=0xbb102030,obs_point=0xbb405060], + [.*icmp6.*ipv6_src=fc00::2,ipv6_dst=fc00::1])]) + +OVS_WAIT_UNTIL([grep -qE 'SAMPLE1' psample.out]) +OVS_WAIT_UNTIL([grep -qE 'SAMPLE2' 
psample.out]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([psample - slow]) +OVS_TRAFFIC_VSWITCHD_START() +OVS_CHECK_PSAMPLE() + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +AT_CHECK([ovs-vsctl -- --id=@br0 get Bridge br0 \ + -- create Flow_Sample_Collector_Set id=1 bridge=@br0 \ + local-group-id=10 \ + -- create Flow_Sample_Collector_Set id=2 bridge=@br0 \ + local-group-id=12], + [0], [ignore]) + +AT_DATA([flows.txt], [dnl +arp actions=NORMAL +in_port=ovs-p0,ip actions=SAMPLE_ACTION(1, 2853183536, 2856341600),output(port=ovs-p1,max_len=200) +in_port=ovs-p1,ip actions=SAMPLE_ACTION(2, 3138396208, 3141554272),output(port=ovs-p0,max_len=200) +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +dnl Disable datapath truncate support to force actions to run in slow path. +AT_CHECK([ovs-appctl dpif/set-dp-features br0 trunc false], [0]) + +AT_CHECK([ovs-appctl ofproto/trace br0 \ + 'in_port=ovs-p0,dl_src=e4:11:22:33:44:55,dl_dst=e4:11:22:33:44:66,dl_type=0x0800,nw_src=10.1.1.1,nw_dst=10.1.1.12'], + [0], [stdout]) + +AT_CHECK([tail -3 stdout], [0], [dnl +Datapath actions: psample(group=10,cookie=0xaa102030aa405060),trunc(200),3 +This flow is handled by the userspace slow path because it: + - Uses action(s) not supported by datapath. 
+]) + +OVS_DAEMONIZE([ovstest test-psample > psample.out], [psample.pid]) +OVS_WAIT_UNTIL([grep -q "Listening for psample events" psample.out]) + +NS_CHECK_EXEC([at_ns0], [ping -q -c 1 10.1.1.2 | FORMAT_PING], [0], [dnl +1 packets transmitted, 1 received, 0% packet loss, time 0ms +]) + +m4_define([SAMPLE1], [m4_join([ ], + [group_id=0xa,prob=4294967295], + [obs_domain=0xaa102030,obs_point=0xaa405060], + [.*icmp.*nw_src=10.1.1.1,nw_dst=10.1.1.2])]) + +m4_define([SAMPLE2], [m4_join([ ], + [group_id=0xc,prob=4294967295], + [obs_domain=0xbb102030,obs_point=0xbb405060], + [.*icmp.*nw_src=10.1.1.2,nw_dst=10.1.1.1])]) + +AT_CHECK([grep -qE 'SAMPLE1' psample.out]) +AT_CHECK([grep -qE 'SAMPLE2' psample.out]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([psample - slow with probability]) +OVS_TRAFFIC_VSWITCHD_START() +OVS_CHECK_PSAMPLE() + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +AT_CHECK([ovs-vsctl -- --id=@br0 get Bridge br0 \ + -- create Flow_Sample_Collector_Set id=1 bridge=@br0 \ + local-group-id=10], + [0], [ignore]) + +dnl A probability != 100% but still pretty high (99.99847%). This ensures that +dnl the outer sample action is not optimized out. +m4_define([PROBABLE_SAMPLE_ACTION], + [sample(probability=65534,collector_set_id=$1,obs_domain_id=$2,obs_point_id=$3)]dnl +) + +AT_DATA([flows.txt], [dnl +arp actions=NORMAL +in_port=ovs-p0,ip actions=PROBABLE_SAMPLE_ACTION(1, 2853183536, 2856341600),output(port=ovs-p1,max_len=200) +in_port=ovs-p1,ip actions=PROBABLE_SAMPLE_ACTION(1, 2853183536, 2856341600),output(port=ovs-p0,max_len=200) +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +dnl Disable datapath truncate support to force actions to run in slow path. 
+AT_CHECK([ovs-appctl dpif/set-dp-features br0 trunc false], [0]) + +AT_CHECK([ovs-appctl ofproto/trace br0 \ + 'in_port=ovs-p0,dl_src=e4:11:22:33:44:55,dl_dst=e4:11:22:33:44:66,dl_type=0x0800,nw_src=10.1.1.1,nw_dst=10.1.1.12'], + [0], [stdout]) + +AT_CHECK([tail -3 stdout], [0], [dnl +Datapath actions: sample(sample=100.0%,actions(psample(group=10,cookie=0xaa102030aa405060))),trunc(200),3 +This flow is handled by the userspace slow path because it: + - Uses action(s) not supported by datapath. +]) + +OVS_DAEMONIZE([ovstest test-psample > psample.out], [psample.pid]) +OVS_WAIT_UNTIL([grep -q "Listening for psample events" psample.out]) + +dnl Sending 10 packets to decrease even more the odds of not sampling a packet. +NS_CHECK_EXEC([at_ns0], [ping -q -i 0.1 -c 10 10.1.1.2 | FORMAT_PING], [0], [dnl +10 packets transmitted, 10 received, 0% packet loss, time 0ms +]) + +m4_define([SAMPLE], [m4_join([ ], + [group_id=0xa,prob=4294901758], + [obs_domain=0xaa102030,obs_point=0xaa405060], + [.*icmp.*nw_src=10.1.1.1,nw_dst=10.1.1.2])]) + +AT_CHECK([grep -qE 'SAMPLE' psample.out]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([psample - with IPFIX]) +OVS_TRAFFIC_VSWITCHD_START() +OVS_CHECK_PSAMPLE() + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +AT_CHECK([ovs-vsctl -- --id=@br0 get Bridge br0 \ + -- --id=@i create IPFIX targets=\"127.0.0.1:4739\" \ + -- create Flow_Sample_Collector_Set id=1 ipfix=@i \ + bridge=@br0 local-group-id=10 \ + -- create Flow_Sample_Collector_Set id=2 ipfix=@i \ + bridge=@br0 local-group-id=12], + [0], [ignore]) + +AT_DATA([flows.txt], [dnl +arp actions=NORMAL +in_port=ovs-p0,ip actions=SAMPLE_ACTION(1, 2853183536, 2856341600),ovs-p1 +in_port=ovs-p1,ip actions=SAMPLE_ACTION(2, 3138396208, 3141554272),ovs-p0 +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +AT_CHECK([ovs-appctl ofproto/trace br0 \ + 
'in_port=ovs-p0,dl_src=e4:11:22:33:44:55,dl_dst=e4:11:22:33:44:66,dl_type=0x0800,nw_src=10.1.1.1,nw_dst=10.1.1.12'], + [0], [stdout]) + +m4_define([ACTIONS], [m4_join([], + [psample(group=10,cookie=0xaa102030aa405060),], + [userspace(pid=4294967295,], + [flow_sample(probability=65535,], + [collector_set_id=1,], + [obs_domain_id=2853183536,], + [obs_point_id=2856341600,], + [output_port=4294967295)),], + [3])]) + +AT_CHECK([tail -1 stdout], [0], [dnl +Datapath actions: ACTIONS +]) + +OVS_DAEMONIZE([ovstest test-psample > psample.out], [psample.pid]) +OVS_WAIT_UNTIL([grep -q "Listening for psample events" psample.out]) + +NS_CHECK_EXEC([at_ns0], [ping -q -c 1 10.1.1.2 | FORMAT_PING], [0], [dnl +1 packets transmitted, 1 received, 0% packet loss, time 0ms +]) + +m4_define([SAMPLE1], [m4_join([ ], + [group_id=0xa,prob=4294967295], + [obs_domain=0xaa102030,obs_point=0xaa405060], + [.*icmp.*nw_src=10.1.1.1,nw_dst=10.1.1.2])]) + +m4_define([SAMPLE2], [m4_join([ ], + [group_id=0xc,prob=4294967295], + [obs_domain=0xbb102030,obs_point=0xbb405060], + [.*icmp.*nw_src=10.1.1.2,nw_dst=10.1.1.1])]) + +OVS_WAIT_UNTIL([grep -qE 'SAMPLE1' psample.out]) +OVS_WAIT_UNTIL([grep -qE 'SAMPLE2' psample.out]) + +dnl Check IPFIX samples have been received. +dnl Entries can be unsorted and IFPIX packets might not have been sent (or +dnl at least tried to be sent) yet. +OVS_WAIT_UNTIL_EQUAL([ovs-ofctl dump-ipfix-flow br0 | \ + sed 's/tx pkts=[[0-9]]*/tx pkts=24/' | \ + sed 's/tx errs=[[0-9]]*/tx errs=0/' | \ + sed 's/id [[1-2]]:/id ?:/'], [dnl +NXST_IPFIX_FLOW reply (xid=0x2): 2 ids + id ?: flows=1, current flows=0, sampled pkts=1, ipv4 ok=1, ipv6 ok=0, tx pkts=24 + pkts errs=0, ipv4 errs=0, ipv6 errs=0, tx errs=0 + id ?: flows=1, current flows=0, sampled pkts=1, ipv4 ok=1, ipv6 ok=0, tx pkts=24 + pkts errs=0, ipv4 errs=0, ipv6 errs=0, tx errs=0]) + +dnl OVS will fail to send IPFIX packets because the target is localhost +dnl and the port is closed. Ignore the message it generates. 
+OVS_TRAFFIC_VSWITCHD_STOP(["/sending to collector failed/d"]) +AT_CLEANUP From 516569d31fbff5c8febd388ac3ad752e8402ebe4 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Sat, 13 Jul 2024 23:23:45 +0200 Subject: [PATCH 781/833] ofproto: xlate: Make sampled drops explicit. When the flow translation results in a datapath action list whose last action is an "observational" action, i.e: one generated for IPFIX, sFlow or local sampling applications, the packet is actually going to be dropped (and observed). In that case, add an explicit drop action so that drop statistics remain accurate. This behavior is controlled by a configurable boolean knob called "explicit_sampled_drops" Combine the "optimizations" and other odp_actions "tweaks" into a single function. Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- NEWS | 4 + ofproto/ofproto-dpif-xlate.c | 64 ++++++++++--- ofproto/ofproto-dpif-xlate.h | 4 + ofproto/ofproto-dpif.c | 6 ++ ofproto/ofproto-dpif.h | 2 + ofproto/ofproto-provider.h | 4 + ofproto/ofproto.c | 9 ++ ofproto/ofproto.h | 2 + tests/drop-stats.at | 168 +++++++++++++++++++++++++++++++++++ tests/ofproto-dpif.at | 49 ++++++++++ vswitchd/bridge.c | 4 + vswitchd/vswitch.xml | 24 +++++ 12 files changed, 326 insertions(+), 14 deletions(-) diff --git a/NEWS b/NEWS index 17ac8cd1d90..ee5aa4174c2 100644 --- a/NEWS +++ b/NEWS @@ -39,6 +39,10 @@ Post-v3.3.0 datapath-specific manner. The Linux kernel datapath is the first to support this feature by using the new datapath 'psample' action. See 'local-group-id' column in the Flow_Sample_Collector_Set table. + - A new configuration knob 'other-config:explicit-sampled-drops' in the + Open_vSwitch table controls whether an explicit drop action shall be + added at the end of datapath flows whose last action is an + observability-driven sample action. 
v3.3.0 - 16 Feb 2024 diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 8704aa9b9bc..9e7d0842a3d 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -3415,6 +3415,7 @@ compose_sample_action(struct xlate_ctx *ctx, struct ofproto *ofproto = &ctx->xin->ofproto->up; uint32_t meter_id = ofproto->slowpath_meter_id; size_t cookie_offset = 0; + size_t observe_offset; /* The meter action is only used to throttle userspace actions. * If they are not needed and the sampling rate is 100%, avoid generating @@ -3432,6 +3433,7 @@ compose_sample_action(struct xlate_ctx *ctx, } if (args->psample) { + observe_offset = ctx->odp_actions->size; odp_put_psample_action(ctx->odp_actions, args->psample->group_id, (void *) &args->psample->cookie, @@ -3443,6 +3445,7 @@ compose_sample_action(struct xlate_ctx *ctx, nl_msg_put_u32(ctx->odp_actions, OVS_ACTION_ATTR_METER, meter_id); } + observe_offset = ctx->odp_actions->size; odp_port_t odp_port = ofp_port_to_odp_port( ctx->xbridge, ctx->xin->flow.in_port.ofp_port); uint32_t pid = dpif_port_get_pid(ctx->xbridge->dpif, odp_port); @@ -3457,6 +3460,9 @@ compose_sample_action(struct xlate_ctx *ctx, if (is_sample) { nl_msg_end_nested(ctx->odp_actions, actions_offset); nl_msg_end_nested(ctx->odp_actions, sample_offset); + ctx->xout->last_observe_offset = sample_offset; + } else { + ctx->xout->last_observe_offset = observe_offset; } return cookie_offset; @@ -8053,12 +8059,16 @@ xlate_wc_finish(struct xlate_ctx *ctx) } } -/* This will optimize the odp actions generated. For now, it will remove - * trailing clone actions that are unnecessary. */ +/* This will tweak the odp actions generated. For now, it will: + * - Remove trailing clone actions that are unnecessary. + * - Add an explicit drop action if the action list is empty. + * - Add an explicit drop action if the last action is an observability + * sample. This tweak is controlled by a configurable knob. 
*/ static void -xlate_optimize_odp_actions(struct xlate_in *xin) +xlate_tweak_odp_actions(struct xlate_ctx *ctx) { - struct ofpbuf *actions = xin->odp_actions; + uint32_t last_observe_offset = ctx->xout->last_observe_offset; + struct ofpbuf *actions = ctx->xin->odp_actions; struct nlattr *last_action = NULL; struct nlattr *a; int left; @@ -8072,11 +8082,28 @@ xlate_optimize_odp_actions(struct xlate_in *xin) last_action = a; } + if (!last_action) { + if (ovs_explicit_drop_action_supported(ctx->xbridge->ofproto)) { + put_drop_action(actions, XLATE_OK); + } + return; + } + /* Remove the trailing clone() action, by directly embedding the nested * actions. */ - if (last_action && nl_attr_type(last_action) == OVS_ACTION_ATTR_CLONE) { + if (nl_attr_type(last_action) == OVS_ACTION_ATTR_CLONE) { void *dest; + if (last_observe_offset != UINT32_MAX && + (unsigned char *) actions->data + last_observe_offset > + (unsigned char *) last_action) { + /* The last sample is inside the trailing clone. + * Adjust its offset. */ + last_observe_offset -= (unsigned char *) nl_attr_get(last_action) - + (unsigned char *) last_action; + ctx->xout->last_observe_offset = last_observe_offset; + } + nl_msg_reset_size(actions, (unsigned char *) last_action - (unsigned char *) actions->data); @@ -8084,6 +8111,16 @@ xlate_optimize_odp_actions(struct xlate_in *xin) dest = nl_msg_put_uninit(actions, nl_attr_get_size(last_action)); memmove(dest, nl_attr_get(last_action), nl_attr_get_size(last_action)); } + + /* If the last action of the list is an observability action, add an + * explicit drop action so that drop statistics remain reliable. 
*/ + if (ctx->xbridge->ofproto->explicit_sampled_drops && + ovs_explicit_drop_action_supported(ctx->xbridge->ofproto) && + last_observe_offset != UINT32_MAX && + (unsigned char *) last_action == (unsigned char *) actions->data + + last_observe_offset) { + put_drop_action(actions, XLATE_OK); + } } /* Translates the flow, actions, or rule in 'xin' into datapath actions in @@ -8100,6 +8137,7 @@ xlate_actions(struct xlate_in *xin, struct xlate_out *xout) *xout = (struct xlate_out) { .slow = 0, .recircs = RECIRC_REFS_EMPTY_INITIALIZER, + .last_observe_offset = UINT32_MAX, }; struct xlate_cfg *xcfg = ovsrcu_get(struct xlate_cfg *, &xcfgp); @@ -8528,17 +8566,15 @@ xlate_actions(struct xlate_in *xin, struct xlate_out *xout) xout->slow = 0; if (xin->odp_actions) { ofpbuf_clear(xin->odp_actions); + /* Make the drop explicit if the datapath supports it. */ + if (ovs_explicit_drop_action_supported(ctx.xbridge->ofproto)) { + put_drop_action(xin->odp_actions, ctx.error); + } } } else { - /* In the non-error case, see if we can further optimize the datapath - * rules by removing redundant (clone) actions. */ - xlate_optimize_odp_actions(xin); - } - - /* Install drop action if datapath supports explicit drop action. */ - if (xin->odp_actions && !xin->odp_actions->size && - ovs_explicit_drop_action_supported(ctx.xbridge->ofproto)) { - put_drop_action(xin->odp_actions, ctx.error); + /* In the non-error case, see if we can further optimize or tweak + * datapath actions. */ + xlate_tweak_odp_actions(&ctx); } /* Since congestion drop and forwarding drop are not exactly diff --git a/ofproto/ofproto-dpif-xlate.h b/ofproto/ofproto-dpif-xlate.h index 08f9397d824..d973a634aca 100644 --- a/ofproto/ofproto-dpif-xlate.h +++ b/ofproto/ofproto-dpif-xlate.h @@ -61,6 +61,10 @@ struct xlate_out { /* Recirc action IDs on which references are held. */ struct recirc_refs recircs; + + /* Keep track of the last action whose purpose is purely observational. + * e.g: IPFIX, sFlow, local sampling. 
*/ + uint32_t last_observe_offset; }; struct xlate_in { diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index 173a618cc90..dca6a6ffab1 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -1819,6 +1819,7 @@ construct(struct ofproto *ofproto_) ofproto->change_seq = 0; ofproto->ams_seq = seq_create(); ofproto->ams_seqno = seq_read(ofproto->ams_seq); + ofproto->explicit_sampled_drops = false; SHASH_FOR_EACH_SAFE (node, &init_ofp_ports) { @@ -2091,6 +2092,11 @@ run(struct ofproto *ofproto_) } } } + + if (ofproto->explicit_sampled_drops != ofproto_explicit_sampled_drops) { + ofproto->explicit_sampled_drops = ofproto_explicit_sampled_drops; + ofproto->backer->need_revalidate = REV_RECONFIGURE; + } return 0; } diff --git a/ofproto/ofproto-dpif.h b/ofproto/ofproto-dpif.h index b3dbece6711..f8d3df5ab5a 100644 --- a/ofproto/ofproto-dpif.h +++ b/ofproto/ofproto-dpif.h @@ -365,6 +365,8 @@ struct ofproto_dpif { bool is_controller_connected; /* True if any controller admitted this * switch connection. */ + bool explicit_sampled_drops; /* If explicit drop actions must added after + * trailing sample actions. */ }; struct ofproto_dpif *ofproto_dpif_lookup_by_name(const char *name); diff --git a/ofproto/ofproto-provider.h b/ofproto/ofproto-provider.h index 85991554cd9..cce90066bfe 100644 --- a/ofproto/ofproto-provider.h +++ b/ofproto/ofproto-provider.h @@ -550,6 +550,10 @@ extern unsigned ofproto_offloaded_stats_delay; * ofproto-dpif implementation. */ extern uint32_t n_handlers, n_revalidators; +/* If an explicit datapath drop action shall be added after trailing sample + * actions coming from IPFIX / sFlow / local sampling. 
*/ +extern bool ofproto_explicit_sampled_drops; + static inline struct rule *rule_from_cls_rule(const struct cls_rule *); void ofproto_rule_expire(struct rule *rule, uint8_t reason) diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c index 8c1efe4bf72..2bd59fc9c16 100644 --- a/ofproto/ofproto.c +++ b/ofproto/ofproto.c @@ -312,6 +312,7 @@ unsigned ofproto_max_idle = OFPROTO_MAX_IDLE_DEFAULT; unsigned ofproto_max_revalidator = OFPROTO_MAX_REVALIDATOR_DEFAULT; unsigned ofproto_min_revalidate_pps = OFPROTO_MIN_REVALIDATE_PPS_DEFAULT; unsigned ofproto_offloaded_stats_delay = OFPROTO_OFFLOADED_STATS_DELAY; +bool ofproto_explicit_sampled_drops = OFPROTO_EXPLICIT_SAMPLED_DROPS_DEFAULT; uint32_t n_handlers, n_revalidators; @@ -737,6 +738,14 @@ ofproto_set_offloaded_stats_delay(unsigned offloaded_stats_delay) ofproto_offloaded_stats_delay = offloaded_stats_delay; } +/* Set if an explicit datapath drop action shall be added after trailing sample + * actions coming from IPFIX / sFlow / local sampling. */ +void +ofproto_set_explicit_sampled_drops(bool explicit_sampled_drops) +{ + ofproto_explicit_sampled_drops = explicit_sampled_drops; +} + /* If forward_bpdu is true, the NORMAL action will forward frames with * reserved (e.g. STP) destination Ethernet addresses. if forward_bpdu is false, * the NORMAL action will drop these frames. 
*/ diff --git a/ofproto/ofproto.h b/ofproto/ofproto.h index f1ff80e5204..fcf8e201d45 100644 --- a/ofproto/ofproto.h +++ b/ofproto/ofproto.h @@ -326,6 +326,7 @@ int ofproto_port_dump_done(struct ofproto_port_dump *); #define OFPROTO_MAX_REVALIDATOR_DEFAULT 500 /* ms */ #define OFPROTO_MIN_REVALIDATE_PPS_DEFAULT 5 #define OFPROTO_OFFLOADED_STATS_DELAY 2000 /* ms */ +#define OFPROTO_EXPLICIT_SAMPLED_DROPS_DEFAULT false const char *ofproto_port_open_type(const struct ofproto *, const char *port_type); @@ -398,6 +399,7 @@ void ofproto_ct_zone_limit_protection_update(const char *datapath_type, bool protected); void ofproto_get_datapath_cap(const char *datapath_type, struct smap *dp_cap); +void ofproto_set_explicit_sampled_drops(bool explicit_sampled_drops); /* Configuration of ports. */ void ofproto_port_unregister(struct ofproto *, ofp_port_t ofp_port); diff --git a/tests/drop-stats.at b/tests/drop-stats.at index 1d3af98dabe..946c998a1fc 100644 --- a/tests/drop-stats.at +++ b/tests/drop-stats.at @@ -191,3 +191,171 @@ ovs-appctl coverage/read-counter drop_action_too_many_mpls_labels OVS_VSWITCHD_STOP(["/|WARN|/d"]) AT_CLEANUP + +m4_define([ICMP_PKT], [m4_join([,], + [in_port(1),packet_type(ns=0,id=0)], + [eth(src=3a:6d:d2:09:9c:ab,dst=1e:2c:e9:2a:66:9e)], + [ipv4(src=192.168.10.10,dst=192.168.10.30,proto=1,tos=0,ttl=64,frag=no)], + [icmp(type=8,code=0)])]) + +AT_SETUP([drop-stats - bridge sampling]) + +OVS_VSWITCHD_START([dnl + set bridge br0 datapath_type=dummy \ + protocols=OpenFlow10,OpenFlow13,OpenFlow14,OpenFlow15 -- \ + add-port br0 p1 -- set Interface p1 type=dummy ofport_request=1]) + +AT_DATA([flows.txt], [dnl +table=0,in_port=1,actions=drop +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +AT_CHECK([ovs-vsctl -- set bridge br0 ipfix=@fix -- \ + --id=@fix create ipfix targets=\"127.0.0.1:4739\" \ + sampling=1], + [0], [ignore]) + +for i in $(seq 1 3); do +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ICMP_PKT'], [0], [ignore]) +done + +AT_CHECK([ovs-appctl 
dpctl/dump-flows | strip_used | sort], [0], [dnl +flow-dump from the main thread: +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), dnl +packets:2, bytes:212, used:0.0s, dnl +actions:userspace(pid=0,ipfix(output_port=4294967295)) +]) + +AT_CHECK([ovs-appctl time/warp 5000], [0], [ignore]) + +AT_CHECK([ovs-appctl coverage/read-counter drop_action_of_pipeline], [0], [dnl +0 +]) + +dnl Now activate explicit sampled drops. +AT_CHECK([ovs-vsctl set Open_vSwitch . other-config:explicit-sampled-drops=true]) +AT_CHECK([ovs-appctl revalidator/wait]) + +for i in $(seq 1 3); do +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ICMP_PKT'], [0], [ignore]) +done + +AT_CHECK([ovs-appctl dpctl/dump-flows | strip_used | sort], [0], [dnl +flow-dump from the main thread: +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), dnl +packets:5, bytes:530, used:0.0s, dnl +actions:userspace(pid=0,ipfix(output_port=4294967295)),drop +]) + +AT_CHECK([ovs-appctl time/warp 5000], [0], [ignore]) + +AT_CHECK([ovs-appctl coverage/read-counter drop_action_of_pipeline], [0], [dnl +3 +]) + +OVS_VSWITCHD_STOP(["/sending to collector failed/d"]) +AT_CLEANUP + +AT_SETUP([drop-stats - sampling action]) + +OVS_VSWITCHD_START +add_of_ports br0 1 2 3 + +AT_DATA([flows.txt], [dnl +table=0,in_port=1,actions=sample(probability=65535,collector_set_id=1) +table=0,in_port=2,actions=sample(probability=32767,collector_set_id=1),load:0->reg0 +table=0,in_port=3,actions=clone(sample(probability=65535,collector_set_id=1)) +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +AT_CHECK([ovs-vsctl --id=@br0 get Bridge br0 \ + -- --id=@ipfix create IPFIX targets=\"127.0.0.1:4739\" \ + -- create Flow_Sample_Collector_Set id=1 bridge=@br0 \ + ipfix=@ipfix], + [0], [ignore]) + +m4_define([USERSPACE_SAMPLE_ACTION], [m4_join([,], + [userspace(pid=0], + [flow_sample(probability=$1,collector_set_id=1,obs_domain_id=0], + [obs_point_id=0,output_port=4294967295))])]) + +for i in 
$(seq 1 3); do +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ICMP_PKT'], [0], [ignore]) +done + +AT_CHECK([ovs-appctl dpctl/dump-flows | strip_used | sort], [0], [dnl +flow-dump from the main thread: +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), dnl +packets:2, bytes:212, used:0.0s, dnl +actions:USERSPACE_SAMPLE_ACTION(65535) +]) + +AT_CHECK([ovs-appctl time/warp 5000], [0], [ignore]) + +AT_CHECK([ovs-appctl coverage/read-counter drop_action_of_pipeline], [0], [dnl +0 +]) + +dnl Now activate explicit sampled drops. +AT_CHECK([ovs-vsctl set Open_vSwitch . other-config:explicit-sampled-drops=true]) +AT_CHECK([ovs-appctl revalidator/wait]) + +for i in $(seq 1 3); do +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ICMP_PKT'], [0], [ignore]) +done + +AT_CHECK([ovs-appctl dpctl/dump-flows | strip_used | sort], [0], [dnl +flow-dump from the main thread: +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), dnl +packets:5, bytes:530, used:0.0s, dnl +actions:USERSPACE_SAMPLE_ACTION(65535),drop +]) + +AT_CHECK([ovs-appctl time/warp 5000], [0], [ignore]) + +AT_CHECK([ovs-appctl coverage/read-counter drop_action_of_pipeline], [0], [dnl +3 +]) + +AT_CHECK([ovs-appctl dpctl/del-flows]) + +for i in $(seq 1 3); do +AT_CHECK([ovs-appctl netdev-dummy/receive p2 'ICMP_PKT'], [0], [ignore]) +done + +AT_CHECK([ovs-appctl dpctl/dump-flows | strip_used | sort], [0], [dnl +flow-dump from the main thread: +recirc_id(0),in_port(2),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), dnl +packets:2, bytes:212, used:0.0s, dnl +actions:sample(sample=50.0%,actions(USERSPACE_SAMPLE_ACTION(32767))),drop +]) + +AT_CHECK([ovs-appctl time/warp 5000], [0], [ignore]) + +AT_CHECK([ovs-appctl coverage/read-counter drop_action_of_pipeline], [0], [dnl +6 +]) + +AT_CHECK([ovs-appctl dpctl/del-flows]) + +for i in $(seq 1 3); do +AT_CHECK([ovs-appctl netdev-dummy/receive p3 'ICMP_PKT'], [0], [ignore]) +done + +AT_CHECK([ovs-appctl dpctl/dump-flows 
| strip_used | sort], [0], [dnl +flow-dump from the main thread: +recirc_id(0),in_port(3),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), dnl +packets:2, bytes:212, used:0.0s, dnl +actions:USERSPACE_SAMPLE_ACTION(65535),drop +]) + +AT_CHECK([ovs-appctl time/warp 5000], [0], [ignore]) + +AT_CHECK([ovs-appctl coverage/read-counter drop_action_of_pipeline], [0], [dnl +9 +]) + +OVS_VSWITCHD_STOP(["/sending to collector failed/d"]) +AT_CLEANUP diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at index e6646106ec8..9415f571c24 100644 --- a/tests/ofproto-dpif.at +++ b/tests/ofproto-dpif.at @@ -12335,3 +12335,52 @@ Datapath actions: EXPECTED_ACT OVS_VSWITCHD_STOP("/Enabling an unsupported feature is very dangerous/d") AT_CLEANUP + +AT_SETUP([ofproto-dpif - Local sampling - drop]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 + +AT_CHECK([ovs-appctl dpif/set-dp-features --force br0 psample true], [0], [ignore]) + +AT_CHECK([ovs-vsctl -- --id=@br0 get Bridge br0 \ + -- create Flow_Sample_Collector_Set id=1 bridge=@br0 local-group-id=42], + [0], [ignore]) + +AT_CHECK([ovs-ofctl -O OpenFlow13 add-meter br0 'meter=slowpath pktps stats bands=type=drop rate=2']) + +AT_DATA([flows.txt], [dnl +in_port=1, actions=sample(probability=32767,collector_set_id=1,obs_domain_id=100,obs_point_id=200) +in_port=2, actions=sample(probability=65535,collector_set_id=1,obs_domain_id=100,obs_point_id=200) +]) + +AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) + +m4_define([TRACE_PKT], [m4_join([,], + [eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800)], + [ipv4(src=10.10.10.2,dst=10.10.10.1,proto=1,tos=1,ttl=128,frag=no)], + [icmp(type=8,code=0)])]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1) TRACE_PKT'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], [dnl +Datapath actions: sample(sample=50.0%,actions(psample(group=42,cookie=0x64000000c8))) +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(2) TRACE_PKT'], [0], [stdout]) 
+AT_CHECK([tail -1 stdout], [0], [dnl +Datapath actions: psample(group=42,cookie=0x64000000c8) +]) + +AT_CHECK([ovs-vsctl set Open_vSwitch . other-config:explicit-sampled-drops=true]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1) TRACE_PKT'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], [dnl +Datapath actions: sample(sample=50.0%,actions(psample(group=42,cookie=0x64000000c8))),drop +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(2) TRACE_PKT'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], [dnl +Datapath actions: psample(group=42,cookie=0x64000000c8),drop +]) + +OVS_VSWITCHD_STOP("/Enabling an unsupported feature is very dangerous/d") +AT_CLEANUP diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index c5399d18c43..86ba06e2009 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -890,6 +890,10 @@ bridge_reconfigure(const struct ovsrec_open_vswitch *ovs_cfg) smap_get_int(&ovs_cfg->other_config, "n-handler-threads", 0), smap_get_int(&ovs_cfg->other_config, "n-revalidator-threads", 0)); + ofproto_set_explicit_sampled_drops( + smap_get_bool(&ovs_cfg->other_config, "explicit-sampled-drops", + OFPROTO_EXPLICIT_SAMPLED_DROPS_DEFAULT)); + /* Destroy "struct bridge"s, "struct port"s, and "struct iface"s according * to 'ovs_cfg', with only very minimal configuration otherwise. * diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index d89ad2d184c..70e49e166a9 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -869,6 +869,30 @@ The feature is considered experimental.

      + + +

      + When a flow is installed in the datapath with an empty action list, + it indicates an implicit "drop" action. Most datapaths report this + event for statistics and monitoring (in datapath-specific ways). +

      +

      + However, if any of the per-bridge or per-flow sampling functionalities + are enabled (e.g: sFlow, IPFIX, local sampling), the action list might + not be empty, but contain an action to implement such functionality. + This makes the datapaths not report the packet drop. +

      +

      + This knob makes Open vSwitch detect when the last datapath action + comes from these sampling features and add an explicit drop action at + the end to keep drop statistics accurate. +

      +

      + The default value is false. +

      +
      +
      From c2e6836460ee783890ee69a49a067646d6ecde9f Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Sat, 13 Jul 2024 23:23:46 +0200 Subject: [PATCH 782/833] ofproto-dpif-xlate: Avoid allocating mf_subfield. "enum mf_subfield" (a 128byte object) is dynamically allocated a few times just to set it to an all-ones mask. Avoid dynamically allocating them by creating a static all-ones mask similar to what was done with "exact_match_mask". Acked-by: Eelco Chaudron Suggested-by: Eelco Chaudron Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- include/openvswitch/meta-flow.h | 3 +++ lib/meta-flow.c | 2 ++ ofproto/ofproto-dpif-xlate.c | 16 +++++----------- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/include/openvswitch/meta-flow.h b/include/openvswitch/meta-flow.h index 3b0220aaa25..aff917bcf60 100644 --- a/include/openvswitch/meta-flow.h +++ b/include/openvswitch/meta-flow.h @@ -2233,6 +2233,9 @@ union mf_subvalue { }; BUILD_ASSERT_DECL(sizeof(union mf_value) == sizeof (union mf_subvalue)); +/* A const mf_subvalue with all bits initialized to ones. 
*/ +extern const union mf_subvalue exact_sub_match_mask; + bool mf_subvalue_intersect(const union mf_subvalue *a_value, const union mf_subvalue *a_mask, const union mf_subvalue *b_value, diff --git a/lib/meta-flow.c b/lib/meta-flow.c index aa7cf1fcbbd..499be04b608 100644 --- a/lib/meta-flow.c +++ b/lib/meta-flow.c @@ -71,8 +71,10 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); #define MF_VALUE_EXACT_64 MF_VALUE_EXACT_32, MF_VALUE_EXACT_32 #define MF_VALUE_EXACT_128 MF_VALUE_EXACT_64, MF_VALUE_EXACT_64 #define MF_VALUE_EXACT_INITIALIZER { .tun_metadata = { MF_VALUE_EXACT_128 } } +#define MF_SUBVALUE_EXACT_INITIALIZER { .u8 = { MF_VALUE_EXACT_128 } } const union mf_value exact_match_mask = MF_VALUE_EXACT_INITIALIZER; +const union mf_subvalue exact_sub_match_mask = MF_SUBVALUE_EXACT_INITIALIZER; static void nxm_init(void); diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 9e7d0842a3d..79283ea161b 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -5598,15 +5598,12 @@ xlate_output_reg_action(struct xlate_ctx *ctx, { uint64_t port = mf_get_subfield(&or->src, &ctx->xin->flow); if (port <= UINT16_MAX) { - union mf_subvalue *value = xmalloc(sizeof *value); - xlate_report(ctx, OFT_DETAIL, "output port is %"PRIu64, port); - memset(value, 0xff, sizeof *value); - mf_write_subfield_flow(&or->src, value, &ctx->wc->masks); + mf_write_subfield_flow(&or->src, &exact_sub_match_mask, + &ctx->wc->masks); xlate_output_action(ctx, u16_to_ofp(port), or->max_len, false, is_last_action, false, group_bucket_action); - free(value); } else { xlate_report(ctx, OFT_WARN, "output port %"PRIu64" is out of range", port); @@ -6561,9 +6558,6 @@ compose_conntrack_action(struct xlate_ctx *ctx, struct ofpact_conntrack *ofc, { uint16_t zone; if (ofc->zone_src.field) { - union mf_subvalue *value = xmalloc(sizeof *value); - memset(value, 0xff, sizeof *value); - zone = mf_get_subfield(&ofc->zone_src, &ctx->xin->flow); if 
(ctx->xin->frozen_state) { /* If the upcall is a resume of a recirculation, we only need to @@ -6572,13 +6566,13 @@ compose_conntrack_action(struct xlate_ctx *ctx, struct ofpact_conntrack *ofc, * which will invalidate the megaflow with old the recirc_id. */ if (!mf_is_frozen_metadata(ofc->zone_src.field)) { - mf_write_subfield_flow(&ofc->zone_src, value, + mf_write_subfield_flow(&ofc->zone_src, &exact_sub_match_mask, &ctx->wc->masks); } } else { - mf_write_subfield_flow(&ofc->zone_src, value, &ctx->wc->masks); + mf_write_subfield_flow(&ofc->zone_src, &exact_sub_match_mask, + &ctx->wc->masks); } - free(value); } else { zone = ofc->zone_imm; } From 1aa9e137fe36a810271415d79735dedfedfc9f6e Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Sat, 13 Jul 2024 23:23:47 +0200 Subject: [PATCH 783/833] ofp-actions: Load data from fields in sample action. When sample action gets used as a way of sampling traffic with controller-generated metadata (i.e: obs_domain_id and obs_point_id), the controller will have to increase the number of flows to ensure each part of the pipeline contains the right metadata. As an example, if the controller decides to sample stateful traffic, it could store the computed metadata for each connection in the conntrack label. However, for established connections, a flow must be created for each different ct_label value with a sample action that contains a different hardcoded obs_domain and obs_point id. This patch adds a new version of the NXAST_RAW_SAMPLE* action (number 4) that supports specifying the observation point and domain using an OpenFlow field reference, so now the controller can express: sample(... obs_domain_id=NXM_NX_CT_LABEL[0..31], obs_point_id=NXM_NX_CT_LABEL[32..63] ... 
) Signed-off-by: Adrian Moreno Signed-off-by: Ilya Maximets --- Documentation/ref/ovs-actions.7.rst | 15 +- NEWS | 3 + include/openvswitch/ofp-actions.h | 8 +- lib/ofp-actions.c | 245 +++++++++++++++++++++++++--- ofproto/ofproto-dpif-xlate.c | 44 ++++- python/ovs/flow/ofp.py | 8 +- python/ovs/flow/ofp_act.py | 6 +- tests/ofp-actions.at | 8 + tests/ofproto-dpif.at | 55 +++++++ tests/ovs-ofctl.at | 14 ++ tests/system-traffic.at | 79 +++++++++ 11 files changed, 444 insertions(+), 41 deletions(-) diff --git a/Documentation/ref/ovs-actions.7.rst b/Documentation/ref/ovs-actions.7.rst index 80acd9070b7..30d5b98ef4c 100644 --- a/Documentation/ref/ovs-actions.7.rst +++ b/Documentation/ref/ovs-actions.7.rst @@ -2201,13 +2201,17 @@ The following *argument* forms are accepted: The unsigned 32-bit integer identifier of the set of sample collectors to send sampled packets to. Defaults to 0. - ``obs_domain_id=``\ *id* + ``obs_domain_id=``\ *value* When sending samples to IPFIX collectors, the unsigned 32-bit integer - Observation Domain ID sent in every IPFIX flow record. Defaults to 0. + Observation Domain ID sent in every IPFIX flow record. The *value* may + be specified as a 32-bit integer or a field or subfield in the syntax + described under `Field Specifications`_ above. Defaults to 0. - ``obs_point_id=``\ *id* + ``obs_point_id=``\ *value* When sending samples to IPFIX collectors, the unsigned 32-bit integer - Observation Point ID sent in every IPFIX flow record. Defaults to 0. + Observation Point ID sent in every IPFIX flow record. The *value* may + be specified as a 32-bit integer or a field or subfield in the syntax + described under `Field Specifications`_ above. Defaults to 0. ``sampling_port=``\ *port* Sample packets on *port*, which should be the ingress or egress port. This @@ -2232,6 +2236,9 @@ collector sets. **Conformance** This action is an OpenFlow extension added in Open vSwitch 2.4. 
+ Support for subfields in `obs_domain_id` and `obs_point_id` was added in + Open vSwitch 3.4. + Instructions ============ diff --git a/NEWS b/NEWS index ee5aa4174c2..10e08fbac4c 100644 --- a/NEWS +++ b/NEWS @@ -43,6 +43,9 @@ Post-v3.3.0 Open_vSwitch table controls whether an explicit drop action shall be added at the end of datapath flows whose last action is an observability-driven sample action. + - OpenFlow: + * A new version of the 'sample' action (NXAST_SAMPLE4) is introduced + that allows use of subfields in 'obs_point_id' and 'obs_domain_id'. v3.3.0 - 16 Feb 2024 diff --git a/include/openvswitch/ofp-actions.h b/include/openvswitch/ofp-actions.h index 7b57e49ad65..56dc2c1476c 100644 --- a/include/openvswitch/ofp-actions.h +++ b/include/openvswitch/ofp-actions.h @@ -1015,14 +1015,16 @@ enum nx_action_sample_direction { /* OFPACT_SAMPLE. * - * Used for NXAST_SAMPLE, NXAST_SAMPLE2, and NXAST_SAMPLE3. */ + * Used for NXAST_SAMPLE, NXAST_SAMPLE2, NXAST_SAMPLE3 and NXAST_SAMPLE4. */ struct ofpact_sample { OFPACT_PADDED_MEMBERS( struct ofpact ofpact; uint16_t probability; /* Always positive. */ uint32_t collector_set_id; - uint32_t obs_domain_id; - uint32_t obs_point_id; + uint32_t obs_domain_imm; + struct mf_subfield obs_domain_src; + uint32_t obs_point_imm; + struct mf_subfield obs_point_src; ofp_port_t sampling_port; enum nx_action_sample_direction direction; ); diff --git a/lib/ofp-actions.c b/lib/ofp-actions.c index da7b1dd31ae..2a1f5c3c4ee 100644 --- a/lib/ofp-actions.c +++ b/lib/ofp-actions.c @@ -330,6 +330,8 @@ enum ofp_raw_action_type { NXAST_RAW_SAMPLE2, /* NX1.0+(41): struct nx_action_sample2. */ NXAST_RAW_SAMPLE3, + /* NX1.0+(51): struct nx_action_sample4. VLMFF */ + NXAST_RAW_SAMPLE4, /* NX1.0+(34): struct nx_action_conjunction. */ NXAST_RAW_CONJUNCTION, @@ -6188,6 +6190,34 @@ struct nx_action_sample2 { }; OFP_ASSERT(sizeof(struct nx_action_sample2) == 32); +/* Action structure for NXAST_SAMPLE4 + * + * NXAST_SAMPLE4 was added in Open vSwitch 3.4.0. 
Compared to NXAST_SAMPLE3, + * it adds support for using field specifiers for observation_domain_id and + * observation_point_id. */ +struct nx_action_sample4 { + ovs_be16 type; /* OFPAT_VENDOR. */ + ovs_be16 len; /* Length is 40. */ + ovs_be32 vendor; /* NX_VENDOR_ID. */ + ovs_be16 subtype; /* NXAST_SAMPLE4. */ + ovs_be16 probability; /* Fraction of packets to sample. */ + ovs_be32 collector_set_id; /* ID of collector set in OVSDB. */ + ovs_be32 obs_domain_src; /* The observation_domain_id source. */ + union { + ovs_be16 obs_domain_ofs_nbits; /* Range to use from source field. */ + ovs_be32 obs_domain_imm; /* Immediate value for domain id. */ + }; + ovs_be32 obs_point_src; /* The observation_point_id source. */ + union { + ovs_be16 obs_point_ofs_nbits; /* Range to use from source field. */ + ovs_be32 obs_point_imm; /* Immediate value for point id. */ + }; + ovs_be16 sampling_port; /* Sampling port. */ + uint8_t direction; /* Sampling direction. */ + uint8_t zeros[5]; /* Pad to a multiple of 8 bytes */ + }; + OFP_ASSERT(sizeof(struct nx_action_sample4) == 40); + static enum ofperr decode_NXAST_RAW_SAMPLE(const struct nx_action_sample *nas, enum ofp_version ofp_version OVS_UNUSED, @@ -6199,11 +6229,14 @@ decode_NXAST_RAW_SAMPLE(const struct nx_action_sample *nas, sample->ofpact.raw = NXAST_RAW_SAMPLE; sample->probability = ntohs(nas->probability); sample->collector_set_id = ntohl(nas->collector_set_id); - sample->obs_domain_id = ntohl(nas->obs_domain_id); - sample->obs_point_id = ntohl(nas->obs_point_id); + sample->obs_domain_imm = ntohl(nas->obs_domain_id); + sample->obs_domain_src.field = NULL; + sample->obs_point_imm = ntohl(nas->obs_point_id); + sample->obs_point_src.field = NULL; sample->sampling_port = OFPP_NONE; sample->direction = NX_ACTION_SAMPLE_DEFAULT; - + sample->obs_domain_src.field = NULL; + sample->obs_point_src.field = NULL; if (sample->probability == 0) { return OFPERR_OFPBAC_BAD_ARGUMENT; } @@ -6220,8 +6253,10 @@ decode_SAMPLE2(const struct 
nx_action_sample2 *nas, sample->ofpact.raw = raw; sample->probability = ntohs(nas->probability); sample->collector_set_id = ntohl(nas->collector_set_id); - sample->obs_domain_id = ntohl(nas->obs_domain_id); - sample->obs_point_id = ntohl(nas->obs_point_id); + sample->obs_domain_imm = ntohl(nas->obs_domain_id); + sample->obs_domain_src.field = NULL; + sample->obs_point_imm = ntohl(nas->obs_point_id); + sample->obs_point_src.field = NULL; sample->sampling_port = u16_to_ofp(ntohs(nas->sampling_port)); sample->direction = direction; @@ -6241,41 +6276,170 @@ decode_NXAST_RAW_SAMPLE2(const struct nx_action_sample2 *nas, ofpact_put_SAMPLE(out)); } +static int +check_sample_direction(enum nx_action_sample_direction direction) +{ + if (direction != NX_ACTION_SAMPLE_DEFAULT && + direction != NX_ACTION_SAMPLE_INGRESS && + direction != NX_ACTION_SAMPLE_EGRESS) { + VLOG_WARN_RL(&rl, "invalid sample direction %"PRIu8, direction); + return OFPERR_OFPBAC_BAD_ARGUMENT; + } + return 0; +} + static enum ofperr decode_NXAST_RAW_SAMPLE3(const struct nx_action_sample2 *nas, enum ofp_version ofp_version OVS_UNUSED, struct ofpbuf *out) { struct ofpact_sample *sample = ofpact_put_SAMPLE(out); + int err; + if (!is_all_zeros(nas->zeros, sizeof nas->zeros)) { return OFPERR_NXBRC_MUST_BE_ZERO; } - if (nas->direction != NX_ACTION_SAMPLE_DEFAULT && - nas->direction != NX_ACTION_SAMPLE_INGRESS && - nas->direction != NX_ACTION_SAMPLE_EGRESS) { - VLOG_WARN_RL(&rl, "invalid sample direction %"PRIu8, nas->direction); - return OFPERR_OFPBAC_BAD_ARGUMENT; + err = check_sample_direction(nas->direction); + if (err) { + return err; } return decode_SAMPLE2(nas, NXAST_RAW_SAMPLE3, nas->direction, sample); } +static int +decode_sample_obs_id(ovs_be32 src, ovs_be16 ofs_nbits, ovs_be32 imm, + const struct vl_mff_map *vl_mff_map, uint64_t *tlv_bitmap, + struct mf_subfield *src_out, uint32_t *imm_out) +{ + if (src) { + enum ofperr error; + + src_out->ofs = nxm_decode_ofs(ofs_nbits); + src_out->n_bits = 
nxm_decode_n_bits(ofs_nbits); + error = mf_vl_mff_mf_from_nxm_header(ntohl(src), + vl_mff_map, &src_out->field, + tlv_bitmap); + if (error) { + return error; + } + + error = mf_check_src(src_out, NULL); + if (error) { + return error; + } + + if (src_out->n_bits > 32) { + VLOG_WARN_RL(&rl, "size of field used in observation_id (%d) " + "exceeds maximum (32)", src_out->n_bits); + return OFPERR_OFPBAC_BAD_ARGUMENT; + } + } else { + src_out->field = NULL; + *imm_out = ntohl(imm); + } + + return 0; +} + +static enum ofperr +decode_NXAST_RAW_SAMPLE4(const struct nx_action_sample4 *nas, + enum ofp_version ofp_version OVS_UNUSED, + const struct vl_mff_map *vl_mff_map, + uint64_t *tlv_bitmap, + struct ofpbuf *out) +{ + struct ofpact_sample *sample = ofpact_put_SAMPLE(out); + int err; + + if (!is_all_zeros(nas->zeros, sizeof nas->zeros)) { + return OFPERR_NXBRC_MUST_BE_ZERO; + } + + err = check_sample_direction(nas->direction); + if (err) { + return err; + } + + sample->ofpact.raw = NXAST_RAW_SAMPLE4; + sample->probability = ntohs(nas->probability); + sample->collector_set_id = ntohl(nas->collector_set_id); + sample->sampling_port = u16_to_ofp(ntohs(nas->sampling_port)); + sample->direction = nas->direction; + + if (sample->probability == 0) { + return OFPERR_OFPBAC_BAD_ARGUMENT; + } + + err = decode_sample_obs_id(nas->obs_domain_src, + nas->obs_domain_ofs_nbits, + nas->obs_domain_imm, + vl_mff_map, tlv_bitmap, + &sample->obs_domain_src, + &sample->obs_domain_imm); + if (err) { + return err; + } + + return decode_sample_obs_id(nas->obs_point_src, + nas->obs_point_ofs_nbits, + nas->obs_point_imm, + vl_mff_map, tlv_bitmap, + &sample->obs_point_src, + &sample->obs_point_imm); +} + static void encode_SAMPLE2(const struct ofpact_sample *sample, struct nx_action_sample2 *nas) { nas->probability = htons(sample->probability); nas->collector_set_id = htonl(sample->collector_set_id); - nas->obs_domain_id = htonl(sample->obs_domain_id); - nas->obs_point_id = 
htonl(sample->obs_point_id); + nas->obs_domain_id = htonl(sample->obs_domain_imm); + nas->obs_point_id = htonl(sample->obs_point_imm); + nas->sampling_port = htons(ofp_to_u16(sample->sampling_port)); + nas->direction = sample->direction; +} + +static void +encode_SAMPLE4(const struct ofpact_sample *sample, + struct nx_action_sample4 *nas) +{ + nas->probability = htons(sample->probability); + nas->collector_set_id = htonl(sample->collector_set_id); nas->sampling_port = htons(ofp_to_u16(sample->sampling_port)); nas->direction = sample->direction; + + if (sample->obs_domain_src.field) { + nas->obs_domain_src = + htonl(nxm_header_from_mff(sample->obs_domain_src.field)); + nas->obs_domain_ofs_nbits = + nxm_encode_ofs_nbits(sample->obs_domain_src.ofs, + sample->obs_domain_src.n_bits); + } else { + nas->obs_domain_src = htonl(0); + nas->obs_domain_imm = htonl(sample->obs_domain_imm); + } + if (sample->obs_point_src.field) { + nas->obs_point_src = + htonl(nxm_header_from_mff(sample->obs_point_src.field)); + nas->obs_point_ofs_nbits = + nxm_encode_ofs_nbits(sample->obs_point_src.ofs, + sample->obs_point_src.n_bits); + } else { + nas->obs_point_src = htonl(0); + nas->obs_point_imm = htonl(sample->obs_point_imm); + } } static void encode_SAMPLE(const struct ofpact_sample *sample, enum ofp_version ofp_version OVS_UNUSED, struct ofpbuf *out) { - if (sample->ofpact.raw == NXAST_RAW_SAMPLE3 + if (sample->ofpact.raw == NXAST_RAW_SAMPLE4 || + sample->obs_domain_src.field || + sample->obs_point_src.field) { + encode_SAMPLE4(sample, put_NXAST_SAMPLE4(out)); + } else if (sample->ofpact.raw == NXAST_RAW_SAMPLE3 || sample->direction != NX_ACTION_SAMPLE_DEFAULT) { encode_SAMPLE2(sample, put_NXAST_SAMPLE3(out)); } else if (sample->ofpact.raw == NXAST_RAW_SAMPLE2 @@ -6285,8 +6449,8 @@ encode_SAMPLE(const struct ofpact_sample *sample, struct nx_action_sample *nas = put_NXAST_SAMPLE(out); nas->probability = htons(sample->probability); nas->collector_set_id = htonl(sample->collector_set_id); 
- nas->obs_domain_id = htonl(sample->obs_domain_id); - nas->obs_point_id = htonl(sample->obs_point_id); + nas->obs_domain_id = htonl(sample->obs_domain_imm); + nas->obs_point_id = htonl(sample->obs_point_imm); } } @@ -6314,9 +6478,35 @@ parse_SAMPLE(char *arg, const struct ofpact_parse_params *pp) } else if (!strcmp(key, "collector_set_id")) { error = str_to_u32(value, &os->collector_set_id); } else if (!strcmp(key, "obs_domain_id")) { - error = str_to_u32(value, &os->obs_domain_id); + error = str_to_u32(value, &os->obs_domain_imm); + + if (error) { + free(error); + error = mf_parse_subfield(&os->obs_domain_src, value); + if (error) { + return error; + } + if (os->obs_domain_src.n_bits > 32) { + return xasprintf("size of obs_domain_id field (%d) " + "exceeds maximum (32)", + os->obs_domain_src.n_bits); + } + } } else if (!strcmp(key, "obs_point_id")) { - error = str_to_u32(value, &os->obs_point_id); + error = str_to_u32(value, &os->obs_point_imm); + + if (error) { + free(error); + error = mf_parse_subfield(&os->obs_point_src, value); + if (error) { + return error; + } + if (os->obs_point_src.n_bits > 32) { + return xasprintf("size of obs_point_id field (%d) " + "exceeds maximum (32)", + os->obs_point_src.n_bits); + } + } } else if (!strcmp(key, "sampling_port")) { if (!ofputil_port_from_string(value, pp->port_map, &os->sampling_port)) { @@ -6346,14 +6536,23 @@ format_SAMPLE(const struct ofpact_sample *a, const struct ofpact_format_params *fp) { ds_put_format(fp->s, "%ssample(%s%sprobability=%s%"PRIu16 - ",%scollector_set_id=%s%"PRIu32 - ",%sobs_domain_id=%s%"PRIu32 - ",%sobs_point_id=%s%"PRIu32, + ",%scollector_set_id=%s%"PRIu32, colors.paren, colors.end, colors.param, colors.end, a->probability, - colors.param, colors.end, a->collector_set_id, - colors.param, colors.end, a->obs_domain_id, - colors.param, colors.end, a->obs_point_id); + colors.param, colors.end, a->collector_set_id); + + ds_put_format(fp->s, ",%sobs_domain_id=%s", colors.param, colors.end); + if
(a->obs_domain_src.field) { + mf_format_subfield(&a->obs_domain_src, fp->s); + } else { + ds_put_format(fp->s, "%"PRIu32, a->obs_domain_imm); + } + ds_put_format(fp->s, ",%sobs_point_id=%s", colors.param, colors.end); + if (a->obs_point_src.field) { + mf_format_subfield(&a->obs_point_src, fp->s); + } else { + ds_put_format(fp->s, "%"PRIu32, a->obs_point_imm); + } if (a->sampling_port != OFPP_NONE) { ds_put_format(fp->s, ",%ssampling_port=%s", colors.param, colors.end); ofputil_format_port(a->sampling_port, fp->port_map, fp->s); diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 79283ea161b..3436b44755f 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -5902,6 +5902,40 @@ xlate_fin_timeout(struct xlate_ctx *ctx, } } +static uint32_t +ofpact_sample_get_domain(struct xlate_ctx *ctx, + const struct ofpact_sample *os) +{ + if (os->obs_domain_src.field) { + uint32_t obs_domain_id; + + obs_domain_id = mf_get_subfield(&os->obs_domain_src, &ctx->xin->flow); + mf_write_subfield_flow(&os->obs_domain_src, &exact_sub_match_mask, + &ctx->wc->masks); + + return obs_domain_id; + } else { + return os->obs_domain_imm; + } +} + +static uint32_t +ofpact_sample_get_point(struct xlate_ctx *ctx, + const struct ofpact_sample *os) +{ + if (os->obs_point_src.field) { + uint32_t obs_point_id; + + obs_point_id = mf_get_subfield(&os->obs_point_src, &ctx->xin->flow); + mf_write_subfield_flow(&os->obs_point_src, &exact_sub_match_mask, + &ctx->wc->masks); + + return obs_point_id; + } else { + return os->obs_point_imm; + } +} + static void xlate_fill_ipfix_sample(struct xlate_ctx *ctx, const struct ofpact_sample *os, @@ -5968,8 +6002,10 @@ xlate_fill_ipfix_sample(struct xlate_ctx *ctx, userspace->cookie.ofproto_uuid = ctx->xbridge->ofproto->uuid; userspace->cookie.flow_sample.probability = os->probability; userspace->cookie.flow_sample.collector_set_id = os->collector_set_id; - userspace->cookie.flow_sample.obs_domain_id = 
os->obs_domain_id; - userspace->cookie.flow_sample.obs_point_id = os->obs_point_id; + userspace->cookie.flow_sample.obs_domain_id = + ofpact_sample_get_domain(ctx, os); + userspace->cookie.flow_sample.obs_point_id = + ofpact_sample_get_point(ctx, os); userspace->cookie.flow_sample.output_odp_port = output_odp_port; userspace->cookie.flow_sample.direction = os->direction; userspace->include_actions = false; @@ -6003,8 +6039,8 @@ xlate_sample_action(struct xlate_ctx *ctx, dpif_lsample_get_group_id(lsample, os->collector_set_id, &psample.group_id)) { - psample.cookie.hi = htonl(os->obs_domain_id); - psample.cookie.lo = htonl(os->obs_point_id); + psample.cookie.hi = htonl(ofpact_sample_get_domain(ctx, os)); + psample.cookie.lo = htonl(ofpact_sample_get_point(ctx, os)); compose_args.psample = &psample; } diff --git a/python/ovs/flow/ofp.py b/python/ovs/flow/ofp.py index 3d3226c919c..f011b0460e4 100644 --- a/python/ovs/flow/ofp.py +++ b/python/ovs/flow/ofp.py @@ -30,7 +30,7 @@ decode_move_field, decode_dec_ttl, decode_chk_pkt_larger, - decode_zone, + decode_field_or_int, decode_learn, ) @@ -330,7 +330,7 @@ def _fw_action_decoders_args(): KVDecoders( { "commit": decode_flag, - "zone": decode_zone, + "zone": decode_field_or_int, "table": decode_int, "nat": decode_nat, "force": decode_flag, @@ -426,8 +426,8 @@ def _other_action_decoders_args(): { "probability": decode_int, "collector_set_id": decode_int, - "obs_domain_id": decode_int, - "obs_point_id": decode_int, + "obs_domain_id": decode_field_or_int, + "obs_point_id": decode_field_or_int, "sampling_port": decode_default, "ingress": decode_flag, "egress": decode_flag, diff --git a/python/ovs/flow/ofp_act.py b/python/ovs/flow/ofp_act.py index 2c85076a34c..73727428a90 100644 --- a/python/ovs/flow/ofp_act.py +++ b/python/ovs/flow/ofp_act.py @@ -246,9 +246,9 @@ def decode_chk_pkt_larger(value): return {"pkt_len": pkt_len, "dst": dst} -# CT decoders -def decode_zone(value): - """Decodes the value of the 'zone' keyword (part of 
the ct action).""" +def decode_field_or_int(value): + """Decodes a value that can be either a subfield specification or an + integer.""" try: return int(value, 0) except ValueError: diff --git a/tests/ofp-actions.at b/tests/ofp-actions.at index 40a23bb15dc..86aec12e80a 100644 --- a/tests/ofp-actions.at +++ b/tests/ofp-actions.at @@ -136,6 +136,9 @@ ffff 0020 00002320 0026 3039 00005BA0 00008707 0000B26E DDD50000 00000000 # actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789,egress) ffff 0020 00002320 0029 3039 00005BA0 00008707 0000B26E DDD50200 00000000 +# actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[],obs_point_id=NXM_NX_CT_LABEL[32..63],sampling_port=0) +ffff 0028 00002320 0033 3039 00005ba0 00000002 000f0000 0001d810 081f0000 0000 000000000000 + # bad OpenFlow10 actions: OFPBAC_BAD_LEN & ofp_actions|WARN|OpenFlow action OFPAT_OUTPUT length 240 exceeds action buffer length 8 & ofp_actions|WARN|bad action at offset 0 (OFPBAC_BAD_LEN): @@ -489,6 +492,9 @@ ffff 0020 00002320 0015 000500000000 80003039005A02fd 0400000000000000 # actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678) ffff 0018 00002320 001d 3039 00005BA0 00008707 0000B26E +# actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[],obs_point_id=NXM_NX_CT_LABEL[32..63],sampling_port=0) +ffff 0028 00002320 0033 3039 00005ba0 00000002 000f0000 0001d810 081f0000 0000 000000000000 + # bad OpenFlow11 actions: OFPBAC_BAD_OUT_PORT & ofp_actions|WARN|bad action at offset 0 (OFPBAC_BAD_OUT_PORT): & 00000000 00 00 00 10 ff ff ff ff-00 00 00 00 00 00 00 00 @@ -1121,6 +1127,8 @@ bad_action 'unroll_xlate' "UNROLL is an internal action that shouldn't be used v # sample bad_action 'sample(probability=0)' 'invalid probability value "0"' bad_action 'sample(sampling_port=asdf)' 'asdf: unknown port' +bad_action 
'sample(probability=12345,obs_point_id=NXM_NX_CT_LABEL[[0..32]])' \ + 'size of obs_point_id field (33) exceeds maximum (32)' bad_action 'sample(foo=bar)' 'invalid key "foo" in "sample" argument' bad_action 'sample' 'non-zero "probability" must be specified on sample' diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at index 9415f571c24..61b24bfd25d 100644 --- a/tests/ofproto-dpif.at +++ b/tests/ofproto-dpif.at @@ -8304,6 +8304,61 @@ AT_CHECK([ovs-vsctl destroy Flow_Sample_Collector_Set 1], [0], [ignore]) OVS_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([ofproto-dpif - Flow IPFIX sanity check - from field]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 3 + +AT_CHECK([ovs-vsctl -- --id=@br0 get Bridge br0 \ + -- --id=@ipfix create IPFIX targets=\"127.0.0.1:5500\" \ + -- --id=@cs create Flow_Sample_Collector_Set id=0 \ + bridge=@br0 ipfix=@ipfix], + [0], [ignore]) + +m4_define([SAMPLE_ACTION], + [sample(probability=65535,collector_set_id=1,obs_domain_id=NXM_OF_IN_PORT,obs_point_id=$1)]dnl +) + +dnl Store in_port in obs_domain_id and dp_hash in the obs_point_id. 
+AT_DATA([flows.txt], [dnl +priority=100,arp,action=normal +priority=10,in_port=1,ip actions=SAMPLE_ACTION(NXM_NX_DP_HASH),2 +priority=10,in_port=2,ip actions=SAMPLE_ACTION(NXM_NX_CT_LABEL[[[0..31]]]),1 +priority=10,in_port=3,ip actions=SAMPLE_ACTION(NXM_NX_CT_LABEL[[[10..14]]]),1 +]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt], [0], [ignore]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy \ + "in_port(1),dp_hash(45),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),\ + ipv4(src=10.10.10.2,dst=10.10.10.1,proto=1,tos=1,ttl=128,frag=no),icmp(type=8,code=0)"], [0], [stdout]) + +AT_CHECK([tail -2 stdout], [0], [dnl +Megaflow: recirc_id=0,dp_hash=0x2d,eth,ip,in_port=1,nw_frag=no +Datapath actions: userspace(pid=0,flow_sample(probability=65535,collector_set_id=1,obs_domain_id=1,obs_point_id=45,output_port=4294967295)),2 +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy \ + "in_port(2),ct_label(0x1234567890abcdef1234567890abcdef),\ + eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),\ + ipv4(src=10.10.10.2,dst=10.10.10.1,proto=1,tos=1,ttl=128,frag=no),icmp(type=8,code=0)"], [0], [stdout]) + +AT_CHECK([tail -2 stdout], [0], [dnl +Megaflow: recirc_id=0,ct_label=0x90abcdef/0xffffffff,eth,ip,in_port=2,nw_frag=no +Datapath actions: userspace(pid=0,flow_sample(probability=65535,collector_set_id=1,obs_domain_id=2,obs_point_id=2427178479,output_port=4294967295)),1 +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy \ + "in_port(3),ct_label(0x1234567890abcdef1234567890abcdef),\ + eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),\ + ipv4(src=10.10.10.2,dst=10.10.10.1,proto=1,tos=1,ttl=128,frag=no),icmp(type=8,code=0)"], [0], [stdout]) + +AT_CHECK([tail -2 stdout], [0], [dnl +Megaflow: recirc_id=0,ct_label=0x4c00/0x7c00,eth,ip,in_port=3,nw_frag=no +Datapath actions: userspace(pid=0,flow_sample(probability=65535,collector_set_id=1,obs_domain_id=3,obs_point_id=19,output_port=4294967295)),1 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + 
AT_SETUP([ofproto-dpif - clone action]) OVS_VSWITCHD_START add_of_ports br0 1 2 3 4 diff --git a/tests/ovs-ofctl.at b/tests/ovs-ofctl.at index d03d365003b..e2f4429ae55 100644 --- a/tests/ovs-ofctl.at +++ b/tests/ovs-ofctl.at @@ -198,6 +198,8 @@ actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_ actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,ingress) actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789) actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789,egress) +actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[],obs_point_id=NXM_NX_CT_LABEL[32..63],sampling_port=56789,egress) +actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[],obs_point_id=NXM_NX_CT_LABEL[32..63]) ip,actions=ct(nat) ip,actions=ct(commit,nat(dst)) ip,actions=ct(commit,nat(src)) @@ -233,6 +235,8 @@ OFPT_FLOW_MOD: ADD actions=sample(probability=12345,collector_set_id=23456,obs_d OFPT_FLOW_MOD: ADD actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,ingress) OFPT_FLOW_MOD: ADD actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789) OFPT_FLOW_MOD: ADD actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789,egress) +OFPT_FLOW_MOD: ADD actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[],obs_point_id=NXM_NX_CT_LABEL[32..63],sampling_port=56789,egress) +OFPT_FLOW_MOD: ADD actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[],obs_point_id=NXM_NX_CT_LABEL[32..63]) OFPT_FLOW_MOD: ADD ip actions=ct(nat) OFPT_FLOW_MOD: ADD ip actions=ct(commit,nat(dst)) OFPT_FLOW_MOD: ADD ip actions=ct(commit,nat(src)) @@ -265,6 
+269,7 @@ sctp actions=drop in_port=0 actions=resubmit:0 actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678) actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789) +actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[],obs_point_id=NXM_NX_CT_LABEL[32..63],sampling_port=0) ]]) AT_CHECK([ovs-ofctl --protocols OpenFlow11 parse-flows flows.txt @@ -286,6 +291,7 @@ OFPT_FLOW_MOD (OF1.1): ADD sctp actions=drop OFPT_FLOW_MOD (OF1.1): ADD in_port=0 actions=resubmit:0 OFPT_FLOW_MOD (OF1.1): ADD actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678) OFPT_FLOW_MOD (OF1.1): ADD actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789) +OFPT_FLOW_MOD (OF1.1): ADD actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[],obs_point_id=NXM_NX_CT_LABEL[32..63],sampling_port=0) ]]) AT_CLEANUP @@ -312,6 +318,7 @@ in_port=0 actions=mod_dl_src:11:22:33:44:55:66,mod_dl_dst:10:20:30:40:50:60 in_port=0 actions=resubmit:0 actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678) actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789) +actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[],obs_point_id=NXM_NX_CT_LABEL[32..63],sampling_port=0) ]]) AT_CHECK([ovs-ofctl --protocols OpenFlow12 parse-flows flows.txt @@ -339,6 +346,7 @@ OFPT_FLOW_MOD (OF1.2): ADD in_port=0 actions=set_field:11:22:33:44:55:66->eth_sr OFPT_FLOW_MOD (OF1.2): ADD in_port=0 actions=resubmit:0 OFPT_FLOW_MOD (OF1.2): ADD actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678) OFPT_FLOW_MOD (OF1.2): ADD 
actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789) +OFPT_FLOW_MOD (OF1.2): ADD actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[],obs_point_id=NXM_NX_CT_LABEL[32..63],sampling_port=0) ]]) AT_CLEANUP @@ -441,6 +449,7 @@ tcp,actions=fin_timeout(idle_timeout=5,hard_timeout=15) actions=controller(max_len=123,reason=invalid_ttl,id=555) actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678) actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789) +actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[],obs_point_id=NXM_NX_CT_LABEL[32..63],sampling_port=56789) mpls,mpls_label=5,mpls_tc=1,mpls_ttl=1,mpls_bos=0,actions=drop ip,actions=ct(commit,zone=5) ip,actions=ct(commit,exec(load(1->NXM_NX_CT_MARK[]))) @@ -508,6 +517,7 @@ NXT_FLOW_MOD: ADD table:255 tcp actions=fin_timeout(idle_timeout=5,hard_timeout= NXT_FLOW_MOD: ADD table:255 actions=controller(reason=invalid_ttl,max_len=123,id=555) NXT_FLOW_MOD: ADD table:255 actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678) NXT_FLOW_MOD: ADD table:255 actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789) +NXT_FLOW_MOD: ADD table:255 actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[],obs_point_id=NXM_NX_CT_LABEL[32..63],sampling_port=56789) NXT_FLOW_MOD: ADD table:255 mpls,mpls_label=5,mpls_tc=1,mpls_ttl=1,mpls_bos=0 actions=drop NXT_FLOW_MOD: ADD table:255 ip actions=ct(commit,zone=5) NXT_FLOW_MOD: ADD table:255 ip actions=ct(commit,exec(load:0x1->NXM_NX_CT_MARK[])) @@ -567,6 +577,7 @@ dl_dst=aa:bb:cc:dd:ee:ff/fe:ff:ff:ff:ff:ff,actions=drop dl_dst=aa:bb:cc:dd:ee:ff/00:00:00:00:00:00,actions=drop 
actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678) actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789) +actions=sample(probability=12341,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[[]],obs_point_id=NXM_NX_CT_LABEL[[32..63]],sampling_port=56789,egress) ip,actions=ct(commit,zone=5) ip,actions=ct(commit,exec(load(1->NXM_NX_CT_MARK[[]]))) ip,actions=ct(commit,exec(load(0x1->NXM_NX_CT_LABEL[[]]))) @@ -608,6 +619,7 @@ NXT_FLOW_MOD: ADD dl_dst=aa:bb:cc:dd:ee:ff/fe:ff:ff:ff:ff:ff actions=drop NXT_FLOW_MOD: ADD actions=drop NXT_FLOW_MOD: ADD actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678) NXT_FLOW_MOD: ADD actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789) +NXT_FLOW_MOD: ADD actions=sample(probability=12341,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[[]],obs_point_id=NXM_NX_CT_LABEL[[32..63]],sampling_port=56789,egress) NXT_FLOW_MOD: ADD ip actions=ct(commit,zone=5) NXT_FLOW_MOD: ADD ip actions=ct(commit,exec(load:0x1->NXM_NX_CT_MARK[[]])) NXT_FLOW_MOD: ADD ip actions=ct(commit,exec(load:0x1->NXM_NX_CT_LABEL[[0..63]],load:0->NXM_NX_CT_LABEL[[64..127]])) @@ -648,6 +660,7 @@ actions=push:reg0[0..31],pop:reg0 vlan_tci=0x1123/0x1fff,actions=drop actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678) actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789) +actions=sample(probability=12341,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[],obs_point_id=NXM_NX_CT_LABEL[32..63],sampling_port=56789,egress) ip,actions=ct(commit,zone=5) ip,actions=ct(commit,exec(load(1->NXM_NX_CT_MARK[]))) ip,actions=ct(commit,exec(load(1->NXM_NX_CT_LABEL[]))) @@ -688,6 +701,7 @@ NXT_FLOW_MOD: ADD actions=push:NXM_NX_REG0[],pop:NXM_NX_REG0[] NXT_FLOW_MOD: ADD 
NXM_OF_VLAN_TCI_W(1123/1fff) actions=drop NXT_FLOW_MOD: ADD actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678) NXT_FLOW_MOD: ADD actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789) +NXT_FLOW_MOD: ADD actions=sample(probability=12341,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[],obs_point_id=NXM_NX_CT_LABEL[32..63],sampling_port=56789,egress) NXT_FLOW_MOD: ADD NXM_OF_ETH_TYPE(0800) actions=ct(commit,zone=5) NXT_FLOW_MOD: ADD NXM_OF_ETH_TYPE(0800) actions=ct(commit,exec(load:0x1->NXM_NX_CT_MARK[])) NXT_FLOW_MOD: ADD NXM_OF_ETH_TYPE(0800) actions=ct(commit,exec(load:0x1->NXM_NX_CT_LABEL[0..63],load:0->NXM_NX_CT_LABEL[64..127])) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 120c66e5d46..202ff049222 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -9409,3 +9409,82 @@ dnl OVS will fail to send IPFIX packets because the target is localhost dnl and the port is closed. Ignore the message it generates. 
OVS_TRAFFIC_VSWITCHD_STOP(["/sending to collector failed/d"]) AT_CLEANUP + +AT_SETUP([psample - from ct label]) +CHECK_CONNTRACK() +OVS_TRAFFIC_VSWITCHD_START() +OVS_CHECK_PSAMPLE() + +ADD_NAMESPACES(at_ns0, at_ns1) +NS_CHECK_EXEC([at_ns0], [sysctl -w net.ipv6.conf.all.disable_ipv6=1], [0], [ignore]) +NS_CHECK_EXEC([at_ns1], [sysctl -w net.ipv6.conf.all.disable_ipv6=1], [0], [ignore]) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24", "e4:11:22:33:44:55") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24", "e4:11:22:33:44:66") + +AT_CHECK([ovs-vsctl -- --id=@br0 get Bridge br0 \ + -- --id=@ipfix create IPFIX targets=\"127.0.0.1:4739\" \ + -- create Flow_Sample_Collector_Set id=1 bridge=@br0 \ + ipfix=@ipfix, local-group-id=10 \ + -- create Flow_Sample_Collector_Set id=2 bridge=@br0 \ + ipfix=@ipfix, local-group-id=12], + [0], [ignore]) + +m4_define([CT_STORE_ACT], + [ct(zone=5,commit,exec(load:0x0bb102030->NXM_NX_CT_LABEL[[0..31]],load:0xbb405060->NXM_NX_CT_LABEL[[32..63]]))]) + +AT_DATA([flows.txt], [dnl +priority=100,ip actions=ct(zone=5, table=10) +priority=0 actions=NORMAL +table=10,priority=100,ip,ct_state=+trk+new action=SAMPLE_ACTION(1, 2853183536, 2856341600),CT_STORE_ACT,NORMAL +table=10,priority=100,ip,ct_state=+trk-new action=SAMPLE_ACTION(2, NXM_NX_CT_LABEL[[[0..31]]], NXM_NX_CT_LABEL[[[32..63]]]),NORMAL +table=10, priority=50, ip, actions=DROP +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +OVS_DAEMONIZE([ovstest test-psample > psample.out], [psample1.pid]) +OVS_WAIT_UNTIL([grep -q "Listening for psample events" psample.out]) + +NS_CHECK_EXEC([at_ns0], [ping -q -c 1 10.1.1.2 | FORMAT_PING], [0], [dnl +1 packets transmitted, 1 received, 0% packet loss, time 0ms +]) + +m4_define([SAMPLE1], [m4_join([ ], + [group_id=0xa,prob=4294967295], + [obs_domain=0xaa102030,obs_point=0xaa405060], + [.*icmp.*nw_src=10.1.1.1,nw_dst=10.1.1.2])]) + +m4_define([SAMPLE2], [m4_join([ ], + [group_id=0xc,prob=4294967295], + [obs_domain=0xbb102030,obs_point=0xbb405060], + 
[.*icmp.*nw_src=10.1.1.2,nw_dst=10.1.1.1])]) +AT_CHECK([grep -qE 'SAMPLE1' psample.out]) +AT_CHECK([grep -qE 'SAMPLE2' psample.out]) + +m4_define([FLOW_MATCH], [m4_join([], + [ct_label(0xbb405060bb102030/0xffffffffffffffff).*actions:], + [actions:psample(group=12,cookie=0xbb102030bb405060),], + [userspace(pid=[[0-9]]+,flow_sample(.*obs_domain_id=3138396208,obs_point_id=3141554272.*))] +)]) + +AT_CHECK([ovs-appctl dpctl/dump-flows --names filter=in_port=ovs-p1 \ + | grep -qE 'FLOW_MATCH' ], [0], []) + +dnl Check IPFIX samples have been received. +dnl Entries can be unsorted and IPFIX packets might not have been sent (or +dnl at least tried to be sent) yet. +OVS_WAIT_UNTIL_EQUAL([ovs-ofctl dump-ipfix-flow br0 | \ + sed 's/tx pkts=[[0-9]]*/tx pkts=24/' | \ + sed 's/tx errs=[[0-9]]*/tx errs=0/' | \ + sed 's/id [[1-2]]:/id ?:/'], [dnl +NXST_IPFIX_FLOW reply (xid=0x2): 2 ids + id ?: flows=1, current flows=0, sampled pkts=1, ipv4 ok=1, ipv6 ok=0, tx pkts=24 + pkts errs=0, ipv4 errs=0, ipv6 errs=0, tx errs=0 + id ?: flows=1, current flows=0, sampled pkts=1, ipv4 ok=1, ipv6 ok=0, tx pkts=24 + pkts errs=0, ipv4 errs=0, ipv6 errs=0, tx errs=0]) + +dnl OVS will fail to send IPFIX packets because the target is localhost +dnl and the port is closed. Ignore the message it generates. +OVS_TRAFFIC_VSWITCHD_STOP(["/sending to collector failed/d"]) +AT_CLEANUP From 9e6d43ef32152527f7887d7f316a191adb5f338c Mon Sep 17 00:00:00 2001 From: Timothy Redaelli Date: Wed, 10 Jul 2024 13:06:11 +0200 Subject: [PATCH 784/833] rhel: Make the version, displayed to the user, customizable. Since on CentOS/RHEL the builds are based on stable branches and not on tags, for debugging purposes it's better to use the downstream version as the version, so it's easier to know which commits are included in a build.
This commit adds --with-version-suffix as ./configure option in order to set an OVS version suffix that should be shown to the user via ovs-vsctl -V and, so, also on database, on ovs-vsctl show and the other utilities. --with-version-suffix is used in Fedora/CentOS/RHEL spec file in order to have the version be aligned with the downstream one. Signed-off-by: Timothy Redaelli Signed-off-by: Ilya Maximets --- Makefile.am | 3 +++ acinclude.m4 | 13 ++++++++++++ configure.ac | 1 + include/openvswitch/version.h.in | 2 +- lib/ovsdb-error.c | 2 +- lib/util.c | 8 +++++--- ovsdb/ovsdb-server.c | 3 ++- python/.gitignore | 1 + python/automake.mk | 22 +++++++++++++------- python/{setup.py => setup.py.template} | 28 +++++++++----------------- rhel/openvswitch-fedora.spec.in | 1 + utilities/ovs-dpctl-top.in | 2 +- utilities/ovs-lib.in | 2 +- utilities/ovs-parse-backtrace.in | 2 +- utilities/ovs-pcap.in | 2 +- utilities/ovs-pki.in | 2 +- utilities/ovs-tcpdump.in | 4 ++-- utilities/ovs-tcpundump.in | 2 +- utilities/ovs-vlan-test.in | 2 +- vswitchd/bridge.c | 3 ++- 20 files changed, 64 insertions(+), 41 deletions(-) rename python/{setup.py => setup.py.template} (87%) diff --git a/Makefile.am b/Makefile.am index e6c90a911aa..dc5c34a6ae8 100644 --- a/Makefile.am +++ b/Makefile.am @@ -8,6 +8,8 @@ AUTOMAKE_OPTIONS = foreign subdir-objects ACLOCAL_AMFLAGS = -I m4 +AM_DISTCHECK_CONFIGURE_FLAGS = --with-version-suffix="$(VERSION_SUFFIX)" + AM_CPPFLAGS = $(SSL_CFLAGS) AM_LDFLAGS = $(SSL_LDFLAGS) AM_LDFLAGS += $(OVS_LDFLAGS) @@ -163,6 +165,7 @@ SUFFIXES += .in -e 's,[@]PYTHON3[@],$(PYTHON3),g' \ -e 's,[@]RUNDIR[@],$(RUNDIR),g' \ -e 's,[@]VERSION[@],$(VERSION),g' \ + -e 's,[@]VERSION_SUFFIX[@],$(VERSION_SUFFIX),g' \ -e 's,[@]localstatedir[@],$(localstatedir),g' \ -e 's,[@]pkgdatadir[@],$(pkgdatadir),g' \ -e 's,[@]sysconfdir[@],$(sysconfdir),g' \ diff --git a/acinclude.m4 b/acinclude.m4 index f1ba046c238..1ace70c92a7 100644 --- a/acinclude.m4 +++ b/acinclude.m4 @@ -497,6 +497,19 @@ 
AC_DEFUN([OVS_CHECK_DPDK], [ AM_CONDITIONAL([DPDK_NETDEV], test "$DPDKLIB_FOUND" = true) ]) +dnl Append a version suffix. + +AC_DEFUN([OVS_CHECK_VERSION_SUFFIX], [ + AC_ARG_WITH([version-suffix], + [AS_HELP_STRING([--with-version-suffix=ver_suffix], + [Specify a string that will be appended + to OVS version])]) + AC_DEFINE_UNQUOTED([VERSION_SUFFIX], ["$with_version_suffix"], + [Package version suffix]) + AC_SUBST([VERSION_SUFFIX], [$with_version_suffix]) + ]) +]) + dnl Checks for net/if_dl.h. dnl dnl (We use this as a proxy for checking whether we're building on FreeBSD diff --git a/configure.ac b/configure.ac index dd6553fea07..8323e481d29 100644 --- a/configure.ac +++ b/configure.ac @@ -202,6 +202,7 @@ OVS_CHECK_LINUX_SCTP_CT OVS_CHECK_LINUX_VIRTIO_TYPES OVS_CHECK_DPDK OVS_CHECK_PRAGMA_MESSAGE +OVS_CHECK_VERSION_SUFFIX AC_SUBST([CFLAGS]) AC_SUBST([OVS_CFLAGS]) AC_SUBST([OVS_LDFLAGS]) diff --git a/include/openvswitch/version.h.in b/include/openvswitch/version.h.in index 23d8fde4f18..231f61e30c0 100644 --- a/include/openvswitch/version.h.in +++ b/include/openvswitch/version.h.in @@ -19,7 +19,7 @@ #define OPENVSWITCH_VERSION_H 1 #define OVS_PACKAGE_STRING "@PACKAGE_STRING@" -#define OVS_PACKAGE_VERSION "@PACKAGE_VERSION@" +#define OVS_PACKAGE_VERSION "@PACKAGE_VERSION@@VERSION_SUFFIX@" #define OVS_LIB_VERSION @LT_CURRENT@ #define OVS_LIB_REVISION @LT_REVISION@ diff --git a/lib/ovsdb-error.c b/lib/ovsdb-error.c index 9ad42b232d4..56512fc28dd 100644 --- a/lib/ovsdb-error.c +++ b/lib/ovsdb-error.c @@ -146,7 +146,7 @@ ovsdb_internal_error(struct ovsdb_error *inner_error, ds_put_char(&ds, ')'); } - ds_put_format(&ds, " (%s %s)", program_name, VERSION); + ds_put_format(&ds, " (%s %s)", program_name, VERSION VERSION_SUFFIX); if (inner_error) { char *s = ovsdb_error_to_string_free(inner_error); diff --git a/lib/util.c b/lib/util.c index 84e8c4966db..5253921b2c3 100644 --- a/lib/util.c +++ b/lib/util.c @@ -618,12 +618,14 @@ ovs_set_program_name(const char *argv0, const char 
*version) program_name = basename; free(program_version); - if (!strcmp(version, VERSION)) { - program_version = xasprintf("%s (Open vSwitch) "VERSION, + if (!strcmp(version, VERSION VERSION_SUFFIX)) { + program_version = xasprintf("%s (Open vSwitch) "VERSION + VERSION_SUFFIX, program_name); } else { program_version = xasprintf("%s %s\n" - "Open vSwitch Library "VERSION, + "Open vSwitch Library "VERSION + VERSION_SUFFIX, program_name, version); } } diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c index b51fd42fe56..a876f8bcf72 100644 --- a/ovsdb/ovsdb-server.c +++ b/ovsdb/ovsdb-server.c @@ -816,7 +816,8 @@ main(int argc, char *argv[]) /* ovsdb-server is usually a long-running process, in which case it * makes plenty of sense to log the version, but --run makes * ovsdb-server more like a command-line tool, so skip it. */ - VLOG_INFO("%s (Open vSwitch) %s", program_name, VERSION); + VLOG_INFO("%s (Open vSwitch) %s", program_name, + VERSION VERSION_SUFFIX); } unixctl_command_register("exit", "", 0, 0, ovsdb_server_exit, &exiting); diff --git a/python/.gitignore b/python/.gitignore index 60ace6f05b5..ad5486af838 100644 --- a/python/.gitignore +++ b/python/.gitignore @@ -1,2 +1,3 @@ dist/ *.egg-info +setup.py diff --git a/python/automake.mk b/python/automake.mk index 84cf2eab57e..d0523870d67 100644 --- a/python/automake.mk +++ b/python/automake.mk @@ -75,25 +75,24 @@ EXTRA_DIST += \ EXTRA_DIST += \ python/ovs/compat/sortedcontainers/LICENSE \ python/README.rst \ - python/setup.py \ python/test_requirements.txt # C extension support. 
EXTRA_DIST += python/ovs/_json.c -PYFILES = $(ovs_pyfiles) python/ovs/dirs.py $(ovstest_pyfiles) $(ovs_pytests) +PYFILES = $(ovs_pyfiles) python/ovs/dirs.py python/setup.py $(ovstest_pyfiles) $(ovs_pytests) EXTRA_DIST += $(PYFILES) PYCOV_CLEAN_FILES += $(PYFILES:.py=.py,cover) FLAKE8_PYFILES += \ - $(filter-out python/ovs/compat/% python/ovs/dirs.py,$(PYFILES)) \ + $(filter-out python/ovs/compat/% python/ovs/dirs.py python/setup.py,$(PYFILES)) \ python/ovs_build_helpers/__init__.py \ python/ovs_build_helpers/extract_ofp_fields.py \ python/ovs_build_helpers/nroff.py \ python/ovs_build_helpers/soutil.py \ python/ovs/dirs.py.template \ - python/setup.py + python/setup.py.template nobase_pkgdata_DATA = $(ovs_pyfiles) $(ovstest_pyfiles) ovs-install-data-local: @@ -113,7 +112,7 @@ ovs-install-data-local: rm python/ovs/dirs.py.tmp .PHONY: python-sdist -python-sdist: $(srcdir)/python/ovs/version.py $(ovs_pyfiles) python/ovs/dirs.py +python-sdist: $(srcdir)/python/ovs/version.py $(ovs_pyfiles) python/ovs/dirs.py python/setup.py cd python/ && $(PYTHON3) -m build --sdist .PHONY: pypi-upload @@ -129,8 +128,8 @@ ovs-uninstall-local: ALL_LOCAL += $(srcdir)/python/ovs/version.py $(srcdir)/python/ovs/version.py: config.status $(AM_V_GEN)$(ro_shell) > $(@F).tmp && \ - echo 'VERSION = "$(VERSION)"' >> $(@F).tmp && \ - if cmp -s $(@F).tmp $@; then touch $@; rm $(@F).tmp; else mv $(@F).tmp $@; fi + echo 'VERSION = "$(VERSION)$(VERSION_SUFFIX)"' >> $(@F).tmp && \ + if cmp -s $(@F).tmp $@; then touch $@; else cp $(@F).tmp $@; fi; rm $(@F).tmp ALL_LOCAL += $(srcdir)/python/ovs/dirs.py $(srcdir)/python/ovs/dirs.py: python/ovs/dirs.py.template @@ -147,6 +146,15 @@ $(srcdir)/python/ovs/dirs.py: python/ovs/dirs.py.template EXTRA_DIST += python/ovs/dirs.py.template CLEANFILES += python/ovs/dirs.py +ALL_LOCAL += $(srcdir)/python/setup.py +$(srcdir)/python/setup.py: python/setup.py.template config.status + $(AM_V_GEN)sed \ + -e 's,[@]VERSION[@],$(VERSION),g' \ + < 
$(srcdir)/python/setup.py.template > $(@F).tmp && \ + if cmp -s $(@F).tmp $@; then touch $@; else cp $(@F).tmp $@; fi; rm $(@F).tmp +EXTRA_DIST += python/setup.py.template +CLEANFILES += python/setup.py + EXTRA_DIST += python/TODO.rst $(srcdir)/python/ovs/flow/ofp_fields.py: $(srcdir)/build-aux/gen_ofp_field_decoders include/openvswitch/meta-flow.h diff --git a/python/setup.py b/python/setup.py.template similarity index 87% rename from python/setup.py rename to python/setup.py.template index bcf832ce9ba..e7d59f2ca3f 100644 --- a/python/setup.py +++ b/python/setup.py.template @@ -23,24 +23,16 @@ import setuptools -VERSION = "unknown" - -try: - # Try to set the version from the generated ovs/version.py - exec(open("ovs/version.py").read()) -except IOError: - print("Ensure version.py is created by running make python/ovs/version.py", - file=sys.stderr) - sys.exit(-1) - -try: - # Try to open generated ovs/dirs.py. However, in this case we - # don't need to exec() - open("ovs/dirs.py") -except IOError: - print("Ensure dirs.py is created by running make python/ovs/dirs.py", - file=sys.stderr) - sys.exit(-1) +VERSION = "@VERSION@" + +for x in ("version.py", "dirs.py"): + try: + # Try to open generated ovs/{version,dirs}.py + open(f"ovs/{x}") + except IOError: + print(f"Ensure {x} is created by running make python/ovs/{x}", + file=sys.stderr) + sys.exit(-1) ext_errors = (CCompilerError, ExecError, PlatformError) if sys.platform == 'win32': diff --git a/rhel/openvswitch-fedora.spec.in b/rhel/openvswitch-fedora.spec.in index 94b6d7431cb..f129bc64625 100644 --- a/rhel/openvswitch-fedora.spec.in +++ b/rhel/openvswitch-fedora.spec.in @@ -186,6 +186,7 @@ This package provides IPsec tunneling support for OVS tunnels. 
--disable-static \ --enable-shared \ --with-pkidir=%{_sharedstatedir}/openvswitch/pki \ + --with-version-suffix=-%{release} \ PYTHON3=%{__python3} build-aux/dpdkstrip.py \ diff --git a/utilities/ovs-dpctl-top.in b/utilities/ovs-dpctl-top.in index 2c1766eff5e..ec57eccd66e 100755 --- a/utilities/ovs-dpctl-top.in +++ b/utilities/ovs-dpctl-top.in @@ -351,7 +351,7 @@ def args_get(): # None is a special value indicating to read flows from stdin. # This handles the case # ovs-dpctl dump-flows | ovs-dpctl-flows.py - parser.add_argument("-v", "--version", version="@VERSION@", + parser.add_argument("-v", "--version", version="@VERSION@@VERSION_SUFFIX@", action="version", help="show version") parser.add_argument("-f", "--flow-file", dest="flowFiles", default=None, action="append", diff --git a/utilities/ovs-lib.in b/utilities/ovs-lib.in index 7812a94ee8b..d162227dc5e 100644 --- a/utilities/ovs-lib.in +++ b/utilities/ovs-lib.in @@ -70,7 +70,7 @@ ovs_ctl () { esac } -VERSION='@VERSION@' +VERSION='@VERSION@@VERSION_SUFFIX@' DAEMON_CWD=/ diff --git a/utilities/ovs-parse-backtrace.in b/utilities/ovs-parse-backtrace.in index f44f05cd1e1..42f831eed51 100755 --- a/utilities/ovs-parse-backtrace.in +++ b/utilities/ovs-parse-backtrace.in @@ -51,7 +51,7 @@ def addr2line(binary, addr): def main(): - parser = optparse.OptionParser(version='@VERSION@', + parser = optparse.OptionParser(version='@VERSION@@VERSION_SUFFIX@', usage="usage: %prog [binary]", description="""\ Parses the output of ovs-appctl backtrace producing a more human readable diff --git a/utilities/ovs-pcap.in b/utilities/ovs-pcap.in index 6b5f63399ec..d0ca9478869 100755 --- a/utilities/ovs-pcap.in +++ b/utilities/ovs-pcap.in @@ -85,7 +85,7 @@ if __name__ == "__main__": if key in ['-h', '--help']: usage() elif key in ['-V', '--version']: - print("ovs-pcap (Open vSwitch) @VERSION@") + print("ovs-pcap (Open vSwitch) @VERSION@@VERSION_SUFFIX@") else: sys.exit(0) diff --git a/utilities/ovs-pki.in b/utilities/ovs-pki.in index 
3d2ef911c94..69060b4ace4 100755 --- a/utilities/ovs-pki.in +++ b/utilities/ovs-pki.in @@ -189,7 +189,7 @@ EOF exit 0 ;; -V|--version) - echo "ovs-pki (Open vSwitch) @VERSION@" + echo "ovs-pki (Open vSwitch) @VERSION@@VERSION_SUFFIX@" exit 0 ;; --di*=*) diff --git a/utilities/ovs-tcpdump.in b/utilities/ovs-tcpdump.in index eada803bb41..cb46e43ba8f 100755 --- a/utilities/ovs-tcpdump.in +++ b/utilities/ovs-tcpdump.in @@ -47,7 +47,7 @@ try: from ovs.fatal_signal import add_hook except Exception: print("ERROR: Please install the correct Open vSwitch python support") - print(" libraries (version @VERSION@).") + print(" libraries (version @VERSION@@VERSION_SUFFIX@).") print(" Alternatively, check that your PYTHONPATH is pointing to") print(" the correct location.") sys.exit(1) @@ -453,7 +453,7 @@ def main(): if cur in ['-h', '--help']: usage() elif cur in ['-V', '--version']: - print("ovs-tcpdump (Open vSwitch) @VERSION@") + print("ovs-tcpdump (Open vSwitch) @VERSION@@VERSION_SUFFIX@") sys.exit(0) elif cur in ['--db-sock']: db_sock = nxt diff --git a/utilities/ovs-tcpundump.in b/utilities/ovs-tcpundump.in index ede5448b496..2a1b08332d7 100755 --- a/utilities/ovs-tcpundump.in +++ b/utilities/ovs-tcpundump.in @@ -46,7 +46,7 @@ if __name__ == "__main__": if key in ['-h', '--help']: usage() elif key in ['-V', '--version']: - print("ovs-tcpundump (Open vSwitch) @VERSION@") + print("ovs-tcpundump (Open vSwitch) @VERSION@@VERSION_SUFFIX@") sys.exit(0) else: sys.exit(0) diff --git a/utilities/ovs-vlan-test.in b/utilities/ovs-vlan-test.in index de3ae168623..3c15e2b1353 100755 --- a/utilities/ovs-vlan-test.in +++ b/utilities/ovs-vlan-test.in @@ -393,7 +393,7 @@ def main(): usage() return 0 elif key in ['-V', '--version']: - print_safe('ovs-vlan-test (Open vSwitch) @VERSION@') + print_safe('ovs-vlan-test (Open vSwitch) @VERSION@@VERSION_SUFFIX@') return 0 elif key in ['-s', '--server']: server = True diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index 86ba06e2009..88aedf6b2ee 
100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -3470,7 +3470,8 @@ bridge_run(void) vlog_enable_async(); - VLOG_INFO_ONCE("%s (Open vSwitch) %s", program_name, VERSION); + VLOG_INFO_ONCE("%s (Open vSwitch) %s", program_name, + VERSION VERSION_SUFFIX); } } From 600125b2c380c02869351e96f0c62ac1aac06547 Mon Sep 17 00:00:00 2001 From: Dumitru Ceara Date: Fri, 12 Jul 2024 15:47:55 +0200 Subject: [PATCH 785/833] ofproto: Add ofproto/detrace command to map UFIDs to OpenFlow. It improves the debugging experience if we can easily get a list of OpenFlow rules and groups that contribute to the creation of a datapath flow. The suggested workflow is: a. dump datapath flows (along with UFIDs); this also prints the core IDs (PMD IDs) when applicable. $ ovs-appctl dpctl/dump-flows -m flow-dump from pmd on cpu core: 7 ufid:7460db8f..., recirc_id(0), .... b. dump related OpenFlow rules and groups: $ ovs-appctl ofproto/detrace ufid:7460db8f... pmd=7 cookie=0x12345678, table=0 priority=100,ip,in_port=2,nw_dst=10.0.0.2,actions=resubmit(,1) cookie=0x0, table=1 priority=200,actions=group:1 group_id=1,bucket=bucket_id:0,actions=ct(commit,table=2,nat(dst=20.0.0.2)) cookie=0x0, table=2 actions=output:1 The new command only shows rules and groups attached to ukeys that are in states UKEY_VISIBLE or UKEY_OPERATIONAL. That should be fine as all other ukeys should not be relevant for the use case presented above. This commit tries to mimic the output format of the ovs-ofctl dump-flows/dump-groups commands.
Signed-off-by: Dumitru Ceara Signed-off-by: Ilya Maximets --- NEWS | 2 + include/openvswitch/ofp-group.h | 14 +++ lib/ofp-group.c | 131 +++++++++++++++++------------ ofproto/ofproto-dpif-upcall.c | 51 +++++++++++ ofproto/ofproto-dpif-xlate-cache.c | 34 ++++++++ ofproto/ofproto-dpif-xlate-cache.h | 2 + ofproto/ofproto-provider.h | 2 + ofproto/ofproto.c | 11 +-- tests/ofproto-dpif.at | 56 ++++++++++++ tests/ofproto-macros.at | 14 ++- 10 files changed, 257 insertions(+), 60 deletions(-) diff --git a/NEWS b/NEWS index 10e08fbac4c..9c576c577f7 100644 --- a/NEWS +++ b/NEWS @@ -7,6 +7,8 @@ Post-v3.3.0 or 'text' (by default). * Added new option [--pretty] to print JSON output in a readable fashion. * 'dpif/show' and 'list-commands' now support output in JSON format. + * Added 'ofproto/detrace' command that outputs the set of OpenFlow rules + and groups that contributed to the creation of a specific datapath flow. - Userspace datapath: * Conntrack now supports 'random' flag for selecting ports in a range while natting and 'persistent' flag for selection of the IP address diff --git a/include/openvswitch/ofp-group.h b/include/openvswitch/ofp-group.h index cd7af0ebff9..7cbb2f70f31 100644 --- a/include/openvswitch/ofp-group.h +++ b/include/openvswitch/ofp-group.h @@ -70,6 +70,10 @@ struct ofputil_bucket *ofputil_bucket_find(const struct ovs_list *, bool ofputil_bucket_check_duplicate_id(const struct ovs_list *); struct ofputil_bucket *ofputil_bucket_list_front(const struct ovs_list *); struct ofputil_bucket *ofputil_bucket_list_back(const struct ovs_list *); +void ofputil_bucket_format(struct ds *, const struct ofputil_bucket *, + enum ofp11_group_type, enum ofp_version, + const struct ofputil_port_map *, + const struct ofputil_table_map *); static inline bool ofputil_bucket_has_liveness(const struct ofputil_bucket *bucket) @@ -88,6 +92,8 @@ struct ofputil_group_props { void ofputil_group_properties_destroy(struct ofputil_group_props *); void 
ofputil_group_properties_copy(struct ofputil_group_props *to, const struct ofputil_group_props *from); +void ofputil_group_properties_format(const struct ofputil_group_props *, + struct ds *); /* Protocol-independent group_mod. */ struct ofputil_group_mod { uint16_t command; /* One of OFPGC15_*. */ @@ -199,6 +205,14 @@ enum ofperr ofputil_group_desc_format(struct ds *, const struct ofp_header *, enum ofperr ofputil_group_features_format(struct ds *, const struct ofp_header *); +/* Group formatting. */ +void ofputil_group_format(struct ds *s, uint32_t group_id, uint8_t type, + const struct ofputil_bucket *, + const struct ovs_list *p_buckets, + const struct ofputil_group_props *, + enum ofp_version, bool suppress_type, + const struct ofputil_port_map *, + const struct ofputil_table_map *); #ifdef __cplusplus } #endif diff --git a/lib/ofp-group.c b/lib/ofp-group.c index 737f48047b1..3edf1b01b37 100644 --- a/lib/ofp-group.c +++ b/lib/ofp-group.c @@ -1526,6 +1526,31 @@ ofputil_group_properties_destroy(struct ofputil_group_props *gp) free(gp->fields.values); } +void +ofputil_group_properties_format(const struct ofputil_group_props *gp, + struct ds *ds) +{ + if (!gp->selection_method[0]) { + return; + } + + ds_put_format(ds, ",selection_method=%s", gp->selection_method); + if (gp->selection_method_param) { + ds_put_format(ds, ",selection_method_param=%"PRIu64, + gp->selection_method_param); + } + + size_t n = bitmap_count1(gp->fields.used.bm, MFF_N_IDS); + if (n == 1) { + ds_put_cstr(ds, ",fields="); + oxm_format_field_array(ds, &gp->fields); + } else if (n > 1) { + ds_put_cstr(ds, ",fields("); + oxm_format_field_array(ds, &gp->fields); + ds_put_char(ds, ')'); + } +} + static enum ofperr parse_group_prop_ntr_selection_method(struct ofpbuf *payload, enum ofp11_group_type group_type, @@ -1813,16 +1838,45 @@ ofp_print_bucket_id(struct ds *s, const char *label, uint32_t bucket_id, ds_put_char(s, ','); } -static void -ofp_print_group(struct ds *s, uint32_t group_id, uint8_t 
type, - const struct ovs_list *p_buckets, - const struct ofputil_group_props *props, - enum ofp_version ofp_version, bool suppress_type, - const struct ofputil_port_map *port_map, - const struct ofputil_table_map *table_map) +void +ofputil_bucket_format(struct ds * s, const struct ofputil_bucket *bucket, + enum ofp11_group_type type, enum ofp_version ofp_version, + const struct ofputil_port_map *port_map, + const struct ofputil_table_map *table_map) { - struct ofputil_bucket *bucket; + ds_put_cstr(s, "bucket="); + + ofp_print_bucket_id(s, "bucket_id:", bucket->bucket_id, ofp_version); + if (bucket->weight != (type == OFPGT11_SELECT ? 1 : 0)) { + ds_put_format(s, "weight:%"PRIu16",", bucket->weight); + } + if (bucket->watch_port != OFPP_NONE) { + ds_put_cstr(s, "watch_port:"); + ofputil_format_port(bucket->watch_port, port_map, s); + ds_put_char(s, ','); + } + if (bucket->watch_group != OFPG_ANY) { + ds_put_format(s, "watch_group:%"PRIu32",", bucket->watch_group); + } + ds_put_cstr(s, "actions="); + struct ofpact_format_params fp = { + .port_map = port_map, + .table_map = table_map, + .s = s, + }; + ofpacts_format(bucket->ofpacts, bucket->ofpacts_len, &fp); +} + +void +ofputil_group_format(struct ds *s, uint32_t group_id, uint8_t type, + const struct ofputil_bucket *bucket, + const struct ovs_list *p_buckets, + const struct ofputil_group_props *props, + enum ofp_version ofp_version, bool suppress_type, + const struct ofputil_port_map *port_map, + const struct ofputil_table_map *table_map) +{ ds_put_format(s, "group_id=%"PRIu32, group_id); if (!suppress_type) { @@ -1831,57 +1885,24 @@ ofp_print_group(struct ds *s, uint32_t group_id, uint8_t type, ds_put_format(s, ",type=%s", type_str[type > 4 ? 
4 : type]); } - if (props->selection_method[0]) { - ds_put_format(s, ",selection_method=%s", props->selection_method); - if (props->selection_method_param) { - ds_put_format(s, ",selection_method_param=%"PRIu64, - props->selection_method_param); - } - - size_t n = bitmap_count1(props->fields.used.bm, MFF_N_IDS); - if (n == 1) { - ds_put_cstr(s, ",fields="); - oxm_format_field_array(s, &props->fields); - } else if (n > 1) { - ds_put_cstr(s, ",fields("); - oxm_format_field_array(s, &props->fields); - ds_put_char(s, ')'); - } - } + ofputil_group_properties_format(props, s); - if (!p_buckets) { + if (!bucket && !p_buckets) { return; } ds_put_char(s, ','); - LIST_FOR_EACH (bucket, list_node, p_buckets) { - ds_put_cstr(s, "bucket="); - - ofp_print_bucket_id(s, "bucket_id:", bucket->bucket_id, ofp_version); - if (bucket->weight != (type == OFPGT11_SELECT ? 1 : 0)) { - ds_put_format(s, "weight:%"PRIu16",", bucket->weight); - } - if (bucket->watch_port != OFPP_NONE) { - ds_put_cstr(s, "watch_port:"); - ofputil_format_port(bucket->watch_port, port_map, s); + if (bucket) { + ofputil_bucket_format(s, bucket, type, ofp_version, NULL, NULL); + } else { + LIST_FOR_EACH (bucket, list_node, p_buckets) { + ofputil_bucket_format(s, bucket, type, ofp_version, + port_map, table_map); ds_put_char(s, ','); } - if (bucket->watch_group != OFPG_ANY) { - ds_put_format(s, "watch_group:%"PRIu32",", bucket->watch_group); - } - - ds_put_cstr(s, "actions="); - struct ofpact_format_params fp = { - .port_map = port_map, - .table_map = table_map, - .s = s, - }; - ofpacts_format(bucket->ofpacts, bucket->ofpacts_len, &fp); - ds_put_char(s, ','); + ds_chomp(s, ','); } - - ds_chomp(s, ','); } enum ofperr @@ -1901,8 +1922,9 @@ ofputil_group_desc_format(struct ds *s, const struct ofp_header *oh, ds_put_char(s, '\n'); ds_put_char(s, ' '); - ofp_print_group(s, gd.group_id, gd.type, &gd.buckets, &gd.props, - oh->version, false, port_map, table_map); + ofputil_group_format(s, gd.group_id, gd.type, NULL, 
&gd.buckets, + &gd.props, oh->version, false, + port_map, table_map); ofputil_uninit_group_desc(&gd); } } @@ -2368,8 +2390,9 @@ ofputil_group_mod_format__(struct ds *s, enum ofp_version ofp_version, gm->command_bucket_id, ofp_version); } - ofp_print_group(s, gm->group_id, gm->type, &gm->buckets, &gm->props, - ofp_version, bucket_command, port_map, table_map); + ofputil_group_format(s, gm->group_id, gm->type, NULL, &gm->buckets, + &gm->props, ofp_version, bucket_command, + port_map, table_map); } enum ofperr diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c index 83609ec62b6..4d39bc5a713 100644 --- a/ofproto/ofproto-dpif-upcall.c +++ b/ofproto/ofproto-dpif-upcall.c @@ -383,6 +383,7 @@ static void upcall_unixctl_disable_ufid(struct unixctl_conn *, int argc, const char *argv[], void *aux); static void upcall_unixctl_enable_ufid(struct unixctl_conn *, int argc, const char *argv[], void *aux); + static void upcall_unixctl_set_flow_limit(struct unixctl_conn *conn, int argc, const char *argv[], void *aux); static void upcall_unixctl_dump_wait(struct unixctl_conn *conn, int argc, @@ -394,6 +395,9 @@ static void upcall_unixctl_pause(struct unixctl_conn *conn, int argc, static void upcall_unixctl_resume(struct unixctl_conn *conn, int argc, const char *argv[], void *aux); +static void upcall_unixctl_ofproto_detrace(struct unixctl_conn *, int argc, + const char *argv[], void *aux); + static struct udpif_key *ukey_create_from_upcall(struct upcall *, struct flow_wildcards *); static int ukey_create_from_dpif_flow(const struct udpif *, @@ -470,6 +474,8 @@ udpif_init(void) upcall_unixctl_pause, NULL); unixctl_command_register("revalidator/resume", NULL, 0, 0, upcall_unixctl_resume, NULL); + unixctl_command_register("ofproto/detrace", "UFID [pmd=PMD-ID]", 1, 2, + upcall_unixctl_ofproto_detrace, NULL); ovsthread_once_done(&once); } } @@ -3310,6 +3316,51 @@ upcall_unixctl_resume(struct unixctl_conn *conn, int argc OVS_UNUSED, unixctl_command_reply(conn, ""); 
} +static void +upcall_unixctl_ofproto_detrace(struct unixctl_conn *conn, int argc, + const char *argv[], void *aux OVS_UNUSED) +{ + unsigned int pmd_id = NON_PMD_CORE_ID; + const char *key_s = argv[1]; + ovs_u128 ufid; + + if (odp_ufid_from_string(key_s, &ufid) <= 0) { + unixctl_command_reply_error(conn, "failed to parse ufid"); + return; + } + + if (argc == 3) { + const char *pmd_str = argv[2]; + if (!ovs_scan(pmd_str, "pmd=%d", &pmd_id)) { + unixctl_command_reply_error(conn, + "Invalid pmd argument format. " + "Expecting 'pmd=PMD-ID'"); + return; + } + } + + struct ds ds = DS_EMPTY_INITIALIZER; + struct udpif *udpif; + + LIST_FOR_EACH (udpif, list_node, &all_udpifs) { + struct udpif_key *ukey = ukey_lookup(udpif, &ufid, pmd_id); + if (!ukey) { + continue; + } + + ovs_mutex_lock(&ukey->mutex); + /* It only makes sense to format rules for ukeys that are (still) + * in use. */ + if ((ukey->state == UKEY_VISIBLE || ukey->state == UKEY_OPERATIONAL) + && ukey->xcache) { + xlate_xcache_format(&ds, ukey->xcache); + } + ovs_mutex_unlock(&ukey->mutex); + } + unixctl_command_reply(conn, ds_cstr(&ds)); + ds_destroy(&ds); +} + /* Flows are sorted in the following order: * netdev, flow state (offloaded/kernel path), flow_pps_rate. 
diff --git a/ofproto/ofproto-dpif-xlate-cache.c b/ofproto/ofproto-dpif-xlate-cache.c index 2e1fcb3a6f7..c6d935cf0ae 100644 --- a/ofproto/ofproto-dpif-xlate-cache.c +++ b/ofproto/ofproto-dpif-xlate-cache.c @@ -301,3 +301,37 @@ xlate_cache_steal_entries(struct xlate_cache *dst, struct xlate_cache *src) memcpy(p, src_entries->data, src_entries->size); ofpbuf_clear(src_entries); } + +void +xlate_xcache_format(struct ds *s, const struct xlate_cache *xcache) +{ + struct ofpbuf entries = xcache->entries; + struct xc_entry *entry; + struct ofgroup *ofg; + + XC_ENTRY_FOR_EACH (entry, &entries) { + switch (entry->type) { + case XC_RULE: + ofproto_rule_stats_ds(s, &entry->rule->up, true); + break; + case XC_GROUP: + ofg = &entry->group.group->up; + ofputil_group_format(s, ofg->group_id, ofg->type, + entry->group.bucket, &ofg->buckets, + &ofg->props, OFP15_VERSION, + false, NULL, NULL); + break; + case XC_TABLE: + case XC_BOND: + case XC_NETDEV: + case XC_NETFLOW: + case XC_MIRROR: + case XC_LEARN: + case XC_NORMAL: + case XC_FIN_TIMEOUT: + case XC_TNL_NEIGH: + case XC_TUNNEL_HEADER: + break; + } + } +} diff --git a/ofproto/ofproto-dpif-xlate-cache.h b/ofproto/ofproto-dpif-xlate-cache.h index 0fc6d2ea60c..e701734d796 100644 --- a/ofproto/ofproto-dpif-xlate-cache.h +++ b/ofproto/ofproto-dpif-xlate-cache.h @@ -151,4 +151,6 @@ void xlate_cache_uninit(struct xlate_cache *); void xlate_cache_delete(struct xlate_cache *); void xlate_cache_steal_entries(struct xlate_cache *, struct xlate_cache *); +void xlate_xcache_format(struct ds *, const struct xlate_cache *); + #endif /* ofproto-dpif-xlate-cache.h */ diff --git a/ofproto/ofproto-provider.h b/ofproto/ofproto-provider.h index cce90066bfe..7df3f524691 100644 --- a/ofproto/ofproto-provider.h +++ b/ofproto/ofproto-provider.h @@ -450,6 +450,8 @@ void ofproto_rule_ref(struct rule *); bool ofproto_rule_try_ref(struct rule *); void ofproto_rule_unref(struct rule *); +void ofproto_rule_stats_ds(struct ds *, struct rule *, bool 
offload_stats); + static inline const struct rule_actions * rule_get_actions(const struct rule *); static inline bool rule_is_table_miss(const struct rule *); static inline bool rule_is_hidden(const struct rule *); diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c index 2bd59fc9c16..982421cddd5 100644 --- a/ofproto/ofproto.c +++ b/ofproto/ofproto.c @@ -4865,9 +4865,9 @@ handle_flow_stats_request(struct ofconn *ofconn, return 0; } -static void -flow_stats_ds(struct ofproto *ofproto, struct rule *rule, struct ds *results, - bool offload_stats) +void +ofproto_rule_stats_ds(struct ds *results, struct rule *rule, + bool offload_stats) { struct pkt_stats stats; const struct rule_actions *actions; @@ -4896,7 +4896,8 @@ flow_stats_ds(struct ofproto *ofproto, struct rule *rule, struct ds *results, ds_put_format(results, "n_offload_bytes=%"PRIu64", ", stats.n_offload_bytes); } - cls_rule_format(&rule->cr, ofproto_get_tun_tab(ofproto), NULL, results); + cls_rule_format(&rule->cr, ofproto_get_tun_tab(rule->ofproto), NULL, + results); ds_put_char(results, ','); ds_put_cstr(results, "actions="); @@ -4918,7 +4919,7 @@ ofproto_get_all_flows(struct ofproto *p, struct ds *results, struct rule *rule; CLS_FOR_EACH (rule, cr, &table->cls) { - flow_stats_ds(p, rule, results, offload_stats); + ofproto_rule_stats_ds(results, rule, offload_stats); } } } diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at index 61b24bfd25d..489514be834 100644 --- a/tests/ofproto-dpif.at +++ b/tests/ofproto-dpif.at @@ -12439,3 +12439,59 @@ Datapath actions: psample(group=42,cookie=0x64000000c8),drop OVS_VSWITCHD_STOP("/Enabling an unsupported feature is very dangerous/d") AT_CLEANUP + +AT_SETUP([ofproto-dpif - Dump OF rules corresponding to UFID]) +OVS_VSWITCHD_START + +add_of_ports br0 1 2 3 + +dnl Add some OpenFlow rules and groups. 
+AT_DATA([groups.txt], [dnl +group_id=1,type=select,selection_method=dp_hash,bucket=bucket_id:0,weight:100,actions=ct(commit,table=2,nat(dst=20.0.0.2)) +group_id=2,type=all,bucket=resubmit(,3),bucket=resubmit(,4) +]) +AT_DATA([flows.txt], [dnl +table=0,priority=100,cookie=0x12345678,in_port=p1,ip,nw_dst=10.0.0.2,actions=resubmit(,1) +table=1,priority=200,ip,actions=group:1 +table=2,ip,actions=group:2 +table=3,ip,actions=p2 +table=4,ip,actions=p3 +]) +AT_CHECK([ovs-ofctl add-groups br0 groups.txt]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ipv4(src=10.0.0.1,dst=10.0.0.2,proto=6),tcp(src=1,dst=2)']) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ipv4(src=10.0.0.1,dst=10.0.0.2,proto=6),tcp(src=1,dst=2)']) +AT_CHECK([ovs-appctl revalidator/wait]) +AT_CHECK([ovs-appctl revalidator/pause]) + +AT_CHECK([ovs-appctl dpctl/dump-flows | strip_used | strip_stats | strip_duration | strip_dp_hash | sort], [0], [dnl +flow-dump from the main thread: +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=10.0.0.2,frag=no), packets:0, bytes:0, used:0.0s, actions:hash(l4(0)),recirc(0x1) +recirc_id(0x1),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), packets:0, bytes:0, used:0.0s, actions:ct(commit,nat(dst=20.0.0.2)),recirc(0x2) +recirc_id(0x2),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), packets:0, bytes:0, used:0.0s, actions:2,3 +]) + +ufid=$(ovs-appctl dpctl/dump-flows -m filter='recirc_id(0)' | parse_ufid) +AT_CHECK([ovs-appctl ofproto/detrace $ufid | ofctl_strip], [0], [dnl +cookie=0x12345678, n_packets=2, n_bytes=236, priority=100,ip,in_port=1,nw_dst=10.0.0.2,actions=resubmit(,1) +table_id=1, n_packets=2, n_bytes=236, priority=200,ip,actions=group:1 +]) + +ufid=$(ovs-appctl dpctl/dump-flows -m filter='recirc_id(0x1)' | parse_ufid) +AT_CHECK([ovs-appctl ofproto/detrace $ufid | ofctl_strip], [0], [dnl 
+group_id=1,type=select,selection_method=dp_hash,bucket=bucket_id:0,weight:100,actions=ct(commit,table=2,nat(dst=20.0.0.2)) +]) + +ufid=$(ovs-appctl dpctl/dump-flows -m filter='recirc_id(0x2)' | parse_ufid) +AT_CHECK([ovs-appctl ofproto/detrace $ufid | ofctl_strip], [0], [dnl +table_id=2, n_packets=2, n_bytes=236, ip,actions=group:2 +table_id=3, n_packets=2, n_bytes=236, ip,actions=output:2 +table_id=4, n_packets=2, n_bytes=236, ip,actions=output:3 +group_id=2,type=all,bucket=bucket_id:0,actions=resubmit(,3),bucket=bucket_id:1,actions=resubmit(,4) +]) + +AT_CHECK([ovs-appctl revalidator/resume]) + +OVS_VSWITCHD_STOP +AT_CLEANUP diff --git a/tests/ofproto-macros.at b/tests/ofproto-macros.at index c22fb3c79c3..c27d96177b6 100644 --- a/tests/ofproto-macros.at +++ b/tests/ofproto-macros.at @@ -9,7 +9,9 @@ s/ duration=[0-9.]*s,// s/ cookie=0x0,// s/ table=0,// s/ n_packets=0,// +s/ n_offload_packets=0,// s/ n_bytes=0,// +s/ n_offload_bytes=0,// s/ idle_age=[0-9]*,// s/ hard_age=[0-9]*,// s/dp_hash=0x[0-9a-f]*\//dp_hash=0x0\// @@ -130,7 +132,7 @@ strip_used () { # Removes all 'duration=...' to make output easier to compare. strip_duration () { - sed 's/duration=[[0-9]]*\.[[0-9]]*s,//' + sed 's/duration=[[0-9.]]*s,//' } # Strips 'ufid:...' from output, to make it easier to compare. @@ -140,6 +142,10 @@ strip_ufid () { s/ufid:[[-0-9a-f]]* //' } +parse_ufid () { + grep -o 'ufid:[[-0-9a-f]]*' +} + # Strips packets: and bytes: from output strip_stats () { sed 's/packets:[[0-9]]*/packets:0/ @@ -169,6 +175,12 @@ strip_recirc() { s/recirc_id=[[x0-9]]*/recirc_id=/ s/recirc([[x0-9]]*)/recirc()/' } + +# Strips dp_hash from output. +strip_dp_hash() { + sed 's/dp_hash([[0-9a-fx/]]*),//' +} + m4_divert_pop([PREPARE_TESTS]) m4_define([TESTABLE_LOG], [-vPATTERN:ANY:'%c|%p|%m']) From 04c090c61efab91ca698af4fc3e8efaca8d5fa38 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Mon, 8 Jul 2024 14:27:41 -0400 Subject: [PATCH 786/833] ofproto-dpif-mirror: Reduce number of function parameters. 
Previously the mirror_set() and mirror_get() functions took a large number of parameters, which was inefficient and difficult to read and extend. This patch moves most of the parameters into a struct. Acked-by: Simon Horman Acked-by: Eelco Chaudron Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif-mirror.c | 60 ++++++++++++++++++----------------- ofproto/ofproto-dpif-mirror.h | 40 ++++++++++++++++++----- ofproto/ofproto-dpif-xlate.c | 28 +++++++--------- ofproto/ofproto-dpif.c | 23 +++++++------- 4 files changed, 87 insertions(+), 64 deletions(-) diff --git a/ofproto/ofproto-dpif-mirror.c b/ofproto/ofproto-dpif-mirror.c index 343b75f0ed0..4967ecc9a10 100644 --- a/ofproto/ofproto-dpif-mirror.c +++ b/ofproto/ofproto-dpif-mirror.c @@ -207,19 +207,22 @@ mirror_bundle_dst(struct mbridge *mbridge, struct ofbundle *ofbundle) } int -mirror_set(struct mbridge *mbridge, void *aux, const char *name, - struct ofbundle **srcs, size_t n_srcs, - struct ofbundle **dsts, size_t n_dsts, - unsigned long *src_vlans, struct ofbundle *out_bundle, - uint16_t snaplen, - uint16_t out_vlan) +mirror_set(struct mbridge *mbridge, void *aux, + const struct ofproto_mirror_settings *ms, + const struct mirror_bundles *mb) { struct mbundle *mbundle, *out; mirror_mask_t mirror_bit; struct mirror *mirror; struct hmapx srcs_map; /* Contains "struct ofbundle *"s. */ struct hmapx dsts_map; /* Contains "struct ofbundle *"s. 
*/ + uint16_t out_vlan; + if (!ms || !mbridge) { + return EINVAL; + } + + out_vlan = ms->out_vlan; mirror = mirror_lookup(mbridge, aux); if (!mirror) { int idx; @@ -227,7 +230,7 @@ mirror_set(struct mbridge *mbridge, void *aux, const char *name, idx = mirror_scan(mbridge); if (idx < 0) { VLOG_WARN("maximum of %d port mirrors reached, cannot create %s", - MAX_MIRRORS, name); + MAX_MIRRORS, ms->name); return EFBIG; } @@ -242,8 +245,8 @@ mirror_set(struct mbridge *mbridge, void *aux, const char *name, unsigned long *vlans = ovsrcu_get(unsigned long *, &mirror->vlans); /* Get the new configuration. */ - if (out_bundle) { - out = mbundle_lookup(mbridge, out_bundle); + if (mb->out_bundle) { + out = mbundle_lookup(mbridge, mb->out_bundle); if (!out) { mirror_destroy(mbridge, mirror->aux); return EINVAL; @@ -252,16 +255,16 @@ mirror_set(struct mbridge *mbridge, void *aux, const char *name, } else { out = NULL; } - mbundle_lookup_multiple(mbridge, srcs, n_srcs, &srcs_map); - mbundle_lookup_multiple(mbridge, dsts, n_dsts, &dsts_map); + mbundle_lookup_multiple(mbridge, mb->srcs, mb->n_srcs, &srcs_map); + mbundle_lookup_multiple(mbridge, mb->dsts, mb->n_dsts, &dsts_map); /* If the configuration has not changed, do nothing. 
*/ if (hmapx_equals(&srcs_map, &mirror->srcs) && hmapx_equals(&dsts_map, &mirror->dsts) - && vlan_bitmap_equal(vlans, src_vlans) + && vlan_bitmap_equal(vlans, ms->src_vlans) && mirror->out == out && mirror->out_vlan == out_vlan - && mirror->snaplen == snaplen) + && mirror->snaplen == ms->snaplen) { hmapx_destroy(&srcs_map); hmapx_destroy(&dsts_map); @@ -275,15 +278,15 @@ mirror_set(struct mbridge *mbridge, void *aux, const char *name, hmapx_swap(&dsts_map, &mirror->dsts); hmapx_destroy(&dsts_map); - if (vlans || src_vlans) { + if (vlans || ms->src_vlans) { ovsrcu_postpone(free, vlans); - vlans = vlan_bitmap_clone(src_vlans); + vlans = vlan_bitmap_clone(ms->src_vlans); ovsrcu_set(&mirror->vlans, vlans); } mirror->out = out; mirror->out_vlan = out_vlan; - mirror->snaplen = snaplen; + mirror->snaplen = ms->snaplen; /* Update mbundles. */ mirror_bit = MIRROR_MASK_C(1) << mirror->idx; @@ -406,23 +409,22 @@ mirror_update_stats(struct mbridge *mbridge, mirror_mask_t mirrors, /* Retrieves the mirror numbered 'index' in 'mbridge'. Returns true if such a * mirror exists, false otherwise. * - * If successful, '*vlans' receives the mirror's VLAN membership information, + * If successful 'mc->vlans' receives the mirror's VLAN membership information, * either a null pointer if the mirror includes all VLANs or a 4096-bit bitmap * in which a 1-bit indicates that the mirror includes a particular VLAN, - * '*dup_mirrors' receives a bitmap of mirrors whose output duplicates mirror - * 'index', '*out' receives the output ofbundle (if any), and '*out_vlan' - * receives the output VLAN (if any). + * 'mc->dup_mirrors' receives a bitmap of mirrors whose output duplicates + * mirror 'index', 'mc->out' receives the output ofbundle (if any), + * and 'mc->out_vlan' receives the output VLAN (if any). * * Everything returned here is assumed to be RCU protected. 
*/ bool -mirror_get(struct mbridge *mbridge, int index, const unsigned long **vlans, - mirror_mask_t *dup_mirrors, struct ofbundle **out, - int *snaplen, int *out_vlan) +mirror_get(struct mbridge *mbridge, int index, + struct mirror_config *mc) { struct mirror *mirror; - if (!mbridge) { + if (!mc || !mbridge) { return false; } @@ -433,11 +435,11 @@ mirror_get(struct mbridge *mbridge, int index, const unsigned long **vlans, /* Assume 'mirror' is RCU protected, i.e., it will not be freed until this * thread quiesces. */ - *vlans = ovsrcu_get(unsigned long *, &mirror->vlans); - *dup_mirrors = mirror->dup_mirrors; - *out = mirror->out ? mirror->out->ofbundle : NULL; - *out_vlan = mirror->out_vlan; - *snaplen = mirror->snaplen; + mc->vlans = ovsrcu_get(unsigned long *, &mirror->vlans); + mc->dup_mirrors = mirror->dup_mirrors; + mc->out_bundle = mirror->out ? mirror->out->ofbundle : NULL; + mc->out_vlan = mirror->out_vlan; + mc->snaplen = mirror->snaplen; return true; } diff --git a/ofproto/ofproto-dpif-mirror.h b/ofproto/ofproto-dpif-mirror.h index eed63ec4a48..37d57463c1f 100644 --- a/ofproto/ofproto-dpif-mirror.h +++ b/ofproto/ofproto-dpif-mirror.h @@ -22,9 +22,37 @@ #define MAX_MIRRORS 32 typedef uint32_t mirror_mask_t; +struct ofproto_mirror_settings; struct ofproto_dpif; struct ofbundle; +struct mirror_bundles { + struct ofbundle **srcs; + size_t n_srcs; + + struct ofbundle **dsts; + size_t n_dsts; + + struct ofbundle *out_bundle; +}; + +struct mirror_config { + /* A bitmap of mirrors that duplicate the current mirror. */ + mirror_mask_t dup_mirrors; + + /* VLANs of packets to select for mirroring. */ + unsigned long *vlans; /* vlan_bitmap, NULL selects all VLANs. */ + + /* Output (mutually exclusive). */ + struct ofbundle *out_bundle; /* A registered ofbundle handle or NULL. */ + uint16_t out_vlan; /* Output VLAN, not used if out_bundle is + set. */ + + /* Max size of a mirrored packet in bytes, if set to zero then no + * truncation will occur. 
*/ + uint16_t snaplen; +}; + /* The following functions are used by handler threads without any locking, * assuming RCU protection. */ @@ -38,9 +66,7 @@ mirror_mask_t mirror_bundle_dst(struct mbridge *, struct ofbundle *); void mirror_update_stats(struct mbridge*, mirror_mask_t, uint64_t packets, uint64_t bytes); -bool mirror_get(struct mbridge *, int index, const unsigned long **vlans, - mirror_mask_t *dup_mirrors, struct ofbundle **out, - int *snaplen, int *out_vlan); +bool mirror_get(struct mbridge *, int index, struct mirror_config *); /* The remaining functions are assumed to be called by the main thread only. */ @@ -50,11 +76,9 @@ bool mbridge_need_revalidate(struct mbridge *); void mbridge_register_bundle(struct mbridge *, struct ofbundle *); void mbridge_unregister_bundle(struct mbridge *, struct ofbundle *); -int mirror_set(struct mbridge *, void *aux, const char *name, - struct ofbundle **srcs, size_t n_srcs, - struct ofbundle **dsts, size_t n_dsts, - unsigned long *src_vlans, struct ofbundle *out_bundle, - uint16_t snaplen, uint16_t out_vlan); +int mirror_set(struct mbridge *, void *aux, + const struct ofproto_mirror_settings *, + const struct mirror_bundles *); void mirror_destroy(struct mbridge *, void *aux); int mirror_get_stats(struct mbridge *, void *aux, uint64_t *packets, uint64_t *bytes); diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 3436b44755f..2dc9c96b85e 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -2291,16 +2291,11 @@ mirror_packet(struct xlate_ctx *ctx, struct xbundle *xbundle, * 'used_mirrors', as long as some candidates remain. */ mirror_mask_t used_mirrors = 0; while (mirrors) { - const unsigned long *vlans; - mirror_mask_t dup_mirrors; - struct ofbundle *out; - int out_vlan; - int snaplen; + struct mirror_config mc; /* Get the details of the mirror represented by the rightmost 1-bit. 
*/ - if (OVS_UNLIKELY(!mirror_get(xbridge->mbridge, raw_ctz(mirrors), - &vlans, &dup_mirrors, - &out, &snaplen, &out_vlan))) { + if (OVS_UNLIKELY(!mirror_get(xbridge->mbridge, + raw_ctz(mirrors), &mc))) { /* The mirror got reconfigured before we got to read it's * configuration. */ mirrors = zero_rightmost_1bit(mirrors); @@ -2310,10 +2305,10 @@ mirror_packet(struct xlate_ctx *ctx, struct xbundle *xbundle, /* If this mirror selects on the basis of VLAN, and it does not select * 'vlan', then discard this mirror and go on to the next one. */ - if (vlans) { + if (mc.vlans) { ctx->wc->masks.vlans[0].tci |= htons(VLAN_CFI | VLAN_VID_MASK); } - if (vlans && !bitmap_is_set(vlans, xvlan.v[0].vid)) { + if (mc.vlans && !bitmap_is_set(mc.vlans, xvlan.v[0].vid)) { mirrors = zero_rightmost_1bit(mirrors); continue; } @@ -2325,21 +2320,22 @@ mirror_packet(struct xlate_ctx *ctx, struct xbundle *xbundle, * destination, so that we don't mirror to them again. This must be * done now to ensure that output_normal(), below, doesn't recursively * output to the same mirrors. */ - ctx->mirrors |= dup_mirrors; - ctx->mirror_snaplen = snaplen; + ctx->mirrors |= mc.dup_mirrors; + ctx->mirror_snaplen = mc.snaplen; /* Send the packet to the mirror. 
*/ - if (out) { - struct xbundle *out_xbundle = xbundle_lookup(ctx->xcfg, out); + if (mc.out_bundle) { + struct xbundle *out_xbundle = xbundle_lookup(ctx->xcfg, + mc.out_bundle); if (out_xbundle) { output_normal(ctx, out_xbundle, &xvlan); } - } else if (xvlan.v[0].vid != out_vlan + } else if (xvlan.v[0].vid != mc.out_vlan && !eth_addr_is_reserved(ctx->xin->flow.dl_dst)) { struct xbundle *xb; uint16_t old_vid = xvlan.v[0].vid; - xvlan.v[0].vid = out_vlan; + xvlan.v[0].vid = mc.out_vlan; LIST_FOR_EACH (xb, list_node, &xbridge->xbundles) { if (xbundle_includes_vlan(xb, &xvlan) && !xbundle_mirror_out(xbridge, xb)) { diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index dca6a6ffab1..15da96f7b8a 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -3819,7 +3819,7 @@ mirror_set__(struct ofproto *ofproto_, void *aux, const struct ofproto_mirror_settings *s) { struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofproto_); - struct ofbundle **srcs, **dsts; + struct mirror_bundles mb; int error; size_t i; @@ -3828,23 +3828,24 @@ mirror_set__(struct ofproto *ofproto_, void *aux, return 0; } - srcs = xmalloc(s->n_srcs * sizeof *srcs); - dsts = xmalloc(s->n_dsts * sizeof *dsts); + mb.srcs = xmalloc(s->n_srcs * sizeof *mb.srcs); + mb.dsts = xmalloc(s->n_dsts * sizeof *mb.dsts); for (i = 0; i < s->n_srcs; i++) { - srcs[i] = bundle_lookup(ofproto, s->srcs[i]); + mb.srcs[i] = bundle_lookup(ofproto, s->srcs[i]); } for (i = 0; i < s->n_dsts; i++) { - dsts[i] = bundle_lookup(ofproto, s->dsts[i]); + mb.dsts[i] = bundle_lookup(ofproto, s->dsts[i]); } - error = mirror_set(ofproto->mbridge, aux, s->name, srcs, s->n_srcs, dsts, - s->n_dsts, s->src_vlans, - bundle_lookup(ofproto, s->out_bundle), - s->snaplen, s->out_vlan); - free(srcs); - free(dsts); + mb.n_srcs = s->n_srcs; + mb.n_dsts = s->n_dsts; + mb.out_bundle = bundle_lookup(ofproto, s->out_bundle); + + error = mirror_set(ofproto->mbridge, aux, s, &mb); + free(mb.srcs); + free(mb.dsts); return error; } From 
3b1882261c8b363abc281c21a052053715a63b39 Mon Sep 17 00:00:00 2001 From: Mike Pattrick Date: Mon, 8 Jul 2024 14:27:42 -0400 Subject: [PATCH 787/833] ofproto-dpif-mirror: Add support for pre-selection filter. Currently a bridge mirror will collect all packets and tools like ovs-tcpdump can apply additional filters after they have already been duplicated by vswitchd. This can result in inefficient collection. This patch adds support to apply pre-selection to bridge mirrors, which can limit which packets are mirrored based on flow metadata. This significantly improves overall vswitchd performance during mirroring if only a subset of traffic is required. Signed-off-by: Mike Pattrick Signed-off-by: Ilya Maximets --- Documentation/ref/ovs-tcpdump.8.rst | 8 +- NEWS | 6 + lib/flow.h | 9 ++ ofproto/ofproto-dpif-mirror.c | 105 +++++++++++++++++- ofproto/ofproto-dpif-mirror.h | 8 +- ofproto/ofproto-dpif-xlate.c | 15 ++- ofproto/ofproto-dpif.c | 12 +- ofproto/ofproto.h | 3 + tests/ofproto-dpif.at | 166 ++++++++++++++++++++++++++++ utilities/ovs-tcpdump.in | 13 ++- vswitchd/bridge.c | 13 ++- vswitchd/vswitch.ovsschema | 7 +- vswitchd/vswitch.xml | 15 +++ 13 files changed, 365 insertions(+), 15 deletions(-) diff --git a/Documentation/ref/ovs-tcpdump.8.rst b/Documentation/ref/ovs-tcpdump.8.rst index b9f8cdf6f78..e7bd5e9e4fb 100644 --- a/Documentation/ref/ovs-tcpdump.8.rst +++ b/Documentation/ref/ovs-tcpdump.8.rst @@ -61,8 +61,14 @@ Options If specified, mirror all ports (optional). +* ``--filter `` + + If specified, only mirror packets that match the provided OpenFlow filter. + The available fields are documented in ``ovs-fields(7)``. + See Also ======== ``ovs-appctl(8)``, ``ovs-vswitchd(8)``, ``ovs-pcap(1)``, -``ovs-tcpundump(1)``, ``tcpdump(8)``, ``wireshark(8)``. +``ovs-fields(7)``, ``ovs-tcpundump(1)``, ``tcpdump(8)``, +``wireshark(8)``. 
diff --git a/NEWS b/NEWS index 9c576c577f7..70f1ebefe93 100644 --- a/NEWS +++ b/NEWS @@ -9,6 +9,12 @@ Post-v3.3.0 * 'dpif/show' and 'list-commands' now support output in JSON format. * Added 'ofproto/detrace' command that outputs the set of OpenFlow rules and groups that contributed to the creation of a specific datapath flow. + - ovs-vsctl: + * Added a new filter column in the Mirror table which can be used to + apply filters to mirror ports. + - ovs-tcpdump: + * Added command line parameter --filter to enable filtering the packets + that are captured by tcpdump. - Userspace datapath: * Conntrack now supports 'random' flag for selecting ports in a range while natting and 'persistent' flag for selection of the IP address diff --git a/lib/flow.h b/lib/flow.h index 75a9be3c19d..60ec4b0d780 100644 --- a/lib/flow.h +++ b/lib/flow.h @@ -939,6 +939,15 @@ flow_union_with_miniflow(struct flow *dst, const struct miniflow *src) flow_union_with_miniflow_subset(dst, src, src->map); } +/* Perform a bitwise OR of minimask 'src' mask data with the equivalent + * fields in 'dst', storing the result in 'dst'. */ +static inline void +flow_wildcards_union_with_minimask(struct flow_wildcards *dst, + const struct minimask *src) +{ + flow_union_with_miniflow_subset(&dst->masks, &src->masks, src->masks.map); +} + static inline bool is_ct_valid(const struct flow *flow, const struct flow_wildcards *mask, struct flow_wildcards *wc) diff --git a/ofproto/ofproto-dpif-mirror.c b/ofproto/ofproto-dpif-mirror.c index 4967ecc9a10..e8a2830fb44 100644 --- a/ofproto/ofproto-dpif-mirror.c +++ b/ofproto/ofproto-dpif-mirror.c @@ -21,6 +21,7 @@ #include "cmap.h" #include "hmapx.h" #include "ofproto.h" +#include "ofproto-dpif-trace.h" #include "vlan-bitmap.h" #include "openvswitch/vlog.h" @@ -48,6 +49,11 @@ struct mbundle { mirror_mask_t mirror_out; /* Mirrors that output to this mbundle. 
*/ }; +struct filtermask { + struct miniflow *flow; + struct minimask *mask; +}; + struct mirror { struct mbridge *mbridge; /* Owning ofproto. */ size_t idx; /* In ofproto's "mirrors" array. */ @@ -57,6 +63,10 @@ struct mirror { struct hmapx srcs; /* Contains "struct mbundle*"s. */ struct hmapx dsts; /* Contains "struct mbundle*"s. */ + /* Filter criteria. */ + OVSRCU_TYPE(struct filtermask *) filter_mask; + char *filter_str; + /* This is accessed by handler threads assuming RCU protection (see * mirror_get()), but can be manipulated by mirror_set() without any * explicit synchronization. */ @@ -83,6 +93,25 @@ static void mbundle_lookup_multiple(const struct mbridge *, struct ofbundle **, static int mirror_scan(struct mbridge *); static void mirror_update_dups(struct mbridge *); +static void +filtermask_free(struct filtermask *fm) +{ + free(fm->flow); + free(fm->mask); + free(fm); +} + +static struct filtermask * +filtermask_create(struct flow *flow, struct flow_wildcards *wc) +{ + struct filtermask *fm; + + fm = xmalloc(sizeof *fm); + fm->flow = miniflow_create(flow); + fm->mask = minimask_create(wc); + return fm; +} + struct mbridge * mbridge_create(void) { @@ -207,8 +236,8 @@ mirror_bundle_dst(struct mbridge *mbridge, struct ofbundle *ofbundle) } int -mirror_set(struct mbridge *mbridge, void *aux, - const struct ofproto_mirror_settings *ms, +mirror_set(struct mbridge *mbridge, const struct ofproto *ofproto, + void *aux, const struct ofproto_mirror_settings *ms, const struct mirror_bundles *mb) { struct mbundle *mbundle, *out; @@ -264,11 +293,13 @@ mirror_set(struct mbridge *mbridge, void *aux, && vlan_bitmap_equal(vlans, ms->src_vlans) && mirror->out == out && mirror->out_vlan == out_vlan - && mirror->snaplen == ms->snaplen) + && mirror->snaplen == ms->snaplen + && nullable_string_is_equal(mirror->filter_str, ms->filter) + && !ms->filter) { hmapx_destroy(&srcs_map); hmapx_destroy(&dsts_map); - return 0; + return ECANCELED; } /* XXX: Not sure if these need to be 
thread safe. */ @@ -288,6 +319,50 @@ mirror_set(struct mbridge *mbridge, void *aux, mirror->out_vlan = out_vlan; mirror->snaplen = ms->snaplen; + if (!nullable_string_is_equal(mirror->filter_str, ms->filter)) { + if (mirror->filter_str) { + ovsrcu_postpone(filtermask_free, + ovsrcu_get(struct filtermask *, + &mirror->filter_mask)); + free(mirror->filter_str); + mirror->filter_str = NULL; + ovsrcu_set(&mirror->filter_mask, NULL); + } + + if (ms->filter && strlen(ms->filter)) { + struct ofputil_port_map map = OFPUTIL_PORT_MAP_INITIALIZER(&map); + struct flow_wildcards wc; + struct flow flow; + char *err; + + ofproto_append_ports_to_map(&map, ofproto->ports); + err = parse_ofp_exact_flow(&flow, &wc, + ofproto_get_tun_tab(ofproto), + ms->filter, &map); + ofputil_port_map_destroy(&map); + if (err) { + VLOG_WARN("filter is invalid: %s", err); + free(err); + mirror_destroy(mbridge, mirror->aux); + return EINVAL; + } + + /* If the user wants to filter on in_port, they should use the srcs + * bundle. Users setting in_port could experience unexpected + * behavior, and it would be overly complex to detect all possible + * issues. So instead we attempt to extract the in_port and error + * if successful. */ + if (wc.masks.in_port.ofp_port) { + VLOG_WARN("filter is invalid due to in_port field."); + mirror_destroy(mbridge, mirror->aux); + return EINVAL; + } + + mirror->filter_str = xstrdup(ms->filter); + ovsrcu_set(&mirror->filter_mask, filtermask_create(&flow, &wc)); + } + } + /* Update mbundles. 
*/ mirror_bit = MIRROR_MASK_C(1) << mirror->idx; CMAP_FOR_EACH (mbundle, cmap_node, &mirror->mbridge->mbundles) { @@ -343,6 +418,15 @@ mirror_destroy(struct mbridge *mbridge, void *aux) ovsrcu_postpone(free, vlans); } + if (mirror->filter_str) { + ovsrcu_postpone(filtermask_free, + ovsrcu_get(struct filtermask *, + &mirror->filter_mask)); + free(mirror->filter_str); + mirror->filter_str = NULL; + ovsrcu_set(&mirror->filter_mask, NULL); + } + mbridge->mirrors[mirror->idx] = NULL; /* mirror_get() might have just read the pointer, so we must postpone the * free. */ @@ -414,7 +498,9 @@ mirror_update_stats(struct mbridge *mbridge, mirror_mask_t mirrors, * in which a 1-bit indicates that the mirror includes a particular VLAN, * 'mc->dup_mirrors' receives a bitmap of mirrors whose output duplicates * mirror 'index', 'mc->out' receives the output ofbundle (if any), - * and 'mc->out_vlan' receives the output VLAN (if any). + * and 'mc->out_vlan' receives the output VLAN (if any). In cases where the + * mirror has a filter configured 'mc->filter_flow' and 'mc->filter_mask' + * receives the flow and mask that this mirror should collect. * * Everything returned here is assumed to be RCU protected. */ @@ -422,6 +508,7 @@ bool mirror_get(struct mbridge *mbridge, int index, struct mirror_config *mc) { + struct filtermask *fm; struct mirror *mirror; if (!mc || !mbridge) { @@ -440,6 +527,14 @@ mirror_get(struct mbridge *mbridge, int index, mc->out_bundle = mirror->out ? 
mirror->out->ofbundle : NULL; mc->out_vlan = mirror->out_vlan; mc->snaplen = mirror->snaplen; + fm = ovsrcu_get(struct filtermask *, &mirror->filter_mask); + if (fm) { + mc->filter_flow = fm->flow; + mc->filter_mask = fm->mask; + } else { + mc->filter_flow = NULL; + mc->filter_mask = NULL; + } return true; } diff --git a/ofproto/ofproto-dpif-mirror.h b/ofproto/ofproto-dpif-mirror.h index 37d57463c1f..a03dd82356f 100644 --- a/ofproto/ofproto-dpif-mirror.h +++ b/ofproto/ofproto-dpif-mirror.h @@ -23,8 +23,8 @@ typedef uint32_t mirror_mask_t; struct ofproto_mirror_settings; -struct ofproto_dpif; struct ofbundle; +struct ofproto; struct mirror_bundles { struct ofbundle **srcs; @@ -43,6 +43,10 @@ struct mirror_config { /* VLANs of packets to select for mirroring. */ unsigned long *vlans; /* vlan_bitmap, NULL selects all VLANs. */ + /* Miniflow and minimask if a filter is configured, else both are NULL. */ + struct miniflow *filter_flow; + struct minimask *filter_mask; + /* Output (mutually exclusive). */ struct ofbundle *out_bundle; /* A registered ofbundle handle or NULL. 
*/ uint16_t out_vlan; /* Output VLAN, not used if out_bundle is @@ -76,7 +80,7 @@ bool mbridge_need_revalidate(struct mbridge *); void mbridge_register_bundle(struct mbridge *, struct ofbundle *); void mbridge_unregister_bundle(struct mbridge *, struct ofbundle *); -int mirror_set(struct mbridge *, void *aux, +int mirror_set(struct mbridge *, const struct ofproto *, void *aux, const struct ofproto_mirror_settings *, const struct mirror_bundles *); void mirror_destroy(struct mbridge *, void *aux); diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 2dc9c96b85e..be2c707215f 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -2262,7 +2262,8 @@ lookup_input_bundle(const struct xlate_ctx *ctx, /* Mirrors the packet represented by 'ctx' to appropriate mirror destinations, * given the packet is ingressing or egressing on 'xbundle', which has ingress - * or egress (as appropriate) mirrors 'mirrors'. */ + * or egress (as appropriate) mirrors 'mirrors'. In cases where a mirror is + * filtered, the current wildcard for the flow's current filter is modified. */ static void mirror_packet(struct xlate_ctx *ctx, struct xbundle *xbundle, mirror_mask_t mirrors) @@ -2313,6 +2314,18 @@ mirror_packet(struct xlate_ctx *ctx, struct xbundle *xbundle, continue; } + /* After the VLAN check, apply a flow mask if a filter is specified. */ + if (ctx->wc && mc.filter_flow) { + flow_wildcards_union_with_minimask(ctx->wc, mc.filter_mask); + if (!OVS_UNLIKELY( + miniflow_equal_flow_in_minimask(mc.filter_flow, + &ctx->xin->flow, + mc.filter_mask))) { + mirrors = zero_rightmost_1bit(mirrors); + continue; + } + } + /* We sent a packet to this mirror. 
*/ used_mirrors |= rightmost_1bit(mirrors); diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index 15da96f7b8a..d3c353b9d60 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -3843,7 +3843,17 @@ mirror_set__(struct ofproto *ofproto_, void *aux, mb.n_dsts = s->n_dsts; mb.out_bundle = bundle_lookup(ofproto, s->out_bundle); - error = mirror_set(ofproto->mbridge, aux, s, &mb); + error = mirror_set(ofproto->mbridge, ofproto_, aux, s, &mb); + + if (!error) { + ofproto->backer->need_revalidate = REV_RECONFIGURE; + } else if (error == ECANCELED) { + /* The user requested a change that is identical to the current state, + * the reconfiguration is canceled, but don't log an error message + * about that. */ + error = 0; + } + free(mb.srcs); free(mb.dsts); return error; diff --git a/ofproto/ofproto.h b/ofproto/ofproto.h index fcf8e201d45..642a9d001f9 100644 --- a/ofproto/ofproto.h +++ b/ofproto/ofproto.h @@ -511,6 +511,9 @@ struct ofproto_mirror_settings { uint16_t out_vlan; /* Output VLAN, only if out_bundle is NULL. */ uint16_t snaplen; /* Max packet size of a mirrored packet in byte, set to 0 equals 65535. */ + + /* Output filter. 
*/ + char *filter; }; int ofproto_mirror_register(struct ofproto *, void *aux, diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at index 489514be834..42fb66de687 100644 --- a/tests/ofproto-dpif.at +++ b/tests/ofproto-dpif.at @@ -5220,6 +5220,172 @@ AT_CHECK([tail -1 stdout], [0], OVS_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([ofproto-dpif - mirroring, filter]) +AT_KEYWORDS([mirror mirrors mirroring]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 3 +AT_CHECK([ovs-vsctl \ + set Bridge br0 mirrors=@m -- \ + --id=@p3 get Port p3 -- \ + --id=@m create Mirror name=mymirror select_all=true output_port=@p3 filter="icmp"], [0], [ignore]) + +icmp_flow="eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=1,tos=0,ttl=128,frag=no),icmp(type=8,code=0)" +tcp_flow1="eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=6,tos=0,ttl=128,frag=no),tcp(dst=443)" +tcp_flow2="eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=6,tos=0,ttl=128,frag=no),tcp(dst=80)" + +AT_CHECK([ovs-ofctl del-flows br0]) +AT_CHECK([ovs-ofctl add-flow br0 'actions=normal' ]) + +dnl Add non-matching flows, then change the mirror to match one of the flows, +dnl then add a matching flow. 
+AT_CHECK([ovs-appctl netdev-dummy/receive p1 $icmp_flow]) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 $tcp_flow1]) +AT_CHECK([ovs-vsctl set mirror mymirror filter="tcp"], [0]) +AT_CHECK([ovs-appctl revalidator/wait]) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 $tcp_flow2]) +AT_CHECK([ovs-appctl dpif/dump-flows --names br0 | strip_ufid | strip_used | sort], [0], [dnl +recirc_id(0),in_port(p1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),dnl +eth_type(0x0800),ipv4(proto=1,frag=no), packets:0, bytes:0, used:never, actions:br0,p2 +recirc_id(0),in_port(p1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),dnl +eth_type(0x0800),ipv4(proto=6,frag=no), packets:1, bytes:118, used:0.0s, actions:p3,br0,p2 +]) +AT_CHECK([ovs-appctl dpctl/dump-flows --names | strip_ufid | strip_used | sort], [0], [dnl +flow-dump from the main thread: +recirc_id(0),in_port(p1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),dnl +eth_type(0x0800),ipv4(proto=1,frag=no), packets:0, bytes:0, used:never, actions:br0,p2 +recirc_id(0),in_port(p1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),dnl +eth_type(0x0800),ipv4(proto=6,frag=no), packets:1, bytes:118, used:0.0s, actions:p3,br0,p2 +]) + +AT_CHECK([ovs-ofctl del-flows br0]) +AT_CHECK([ovs-ofctl add-flow br0 "in_port=1 actions=output:2"]) +AT_CHECK([ovs-ofctl add-flow br0 "in_port=2 actions=output:1"]) + +dnl Add mirrored flow after non-mirrored flow. 
+AT_CHECK([ovs-vsctl set mirror mymirror filter="icmp"], [0]) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 $tcp_flow1]) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 $icmp_flow]) +AT_CHECK([ovs-appctl dpif/dump-flows --names br0 | strip_ufid | strip_used | sort], [0], [dnl +recirc_id(0),in_port(p1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),dnl +eth_type(0x0800),ipv4(proto=1,frag=no), packets:1, bytes:106, used:0.0s, actions:p3,p2 +recirc_id(0),in_port(p1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),dnl +eth_type(0x0800),ipv4(proto=6,frag=no), packets:2, bytes:236, used:0.0s, actions:p2 +]) + +dnl Check one direction, only icmp should mirror. +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(1),$icmp_flow"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 3,2 +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(1),$tcp_flow1"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 2 +]) + +dnl Check other direction, only icmp should mirror. +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(2),$icmp_flow"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 3,1 +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(2),$tcp_flow1"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 1 +]) + +dnl Change filter to tcp, only tcp should mirror. 
+AT_CHECK([ovs-vsctl set mirror mymirror filter="tcp"], [0]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(1),$icmp_flow"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 2 +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(1),$tcp_flow1"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 3,2 +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(2),$icmp_flow"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 1 +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(2),$tcp_flow1"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 3,1 +]) + +dnl Invalid filter. Nothing should mirror, error should be logged. +AT_CHECK([ovs-vsctl set mirror mymirror filter="invalid"], [0]) +dnl Setting an in_port is also invalid. +AT_CHECK([ovs-vsctl set mirror mymirror filter="\"in_port=p1\""], [0]) + +dnl Each of the above two lines should produce two log messages. +OVS_WAIT_UNTIL([test $(grep -Ec "filter is invalid|mirror mymirror configuration is invalid" ovs-vswitchd.log) -eq 4]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(1),$icmp_flow"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 2 +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(1),$tcp_flow1"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 2 +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(2),$icmp_flow"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 1 +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(2),$tcp_flow1"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 1 +]) + +dnl Check more complex filter cases with partially overlapping default wildcards. 
+AT_CHECK([ovs-vsctl set mirror mymirror filter="\"tcp,tcp_dst=80\""], [0]) +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(1),$tcp_flow1"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 2 +]) + +dnl Change port number. +AT_CHECK([ovs-appctl dpif-dummy/change-port-number ovs-dummy p1 8]) +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(8),$tcp_flow2"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 3,2 +]) + +dnl Empty filter, all traffic should mirror. +AT_CHECK([ovs-vsctl clear mirror mymirror filter], [0]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(8),$icmp_flow"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 3,2 +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(8),$tcp_flow1"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 3,2 +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(2),$icmp_flow"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 3,8 +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(2),$tcp_flow1"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 3,8 +]) + +OVS_VSWITCHD_STOP(["/filter is invalid: invalid: unknown field invalid/d +/filter is invalid due to in_port field/d +/mirror mymirror configuration is invalid/d"]) +AT_CLEANUP AT_SETUP([ofproto-dpif - mirroring, select_all]) AT_KEYWORDS([mirror mirrors mirroring]) diff --git a/utilities/ovs-tcpdump.in b/utilities/ovs-tcpdump.in index cb46e43ba8f..187eafdf25b 100755 --- a/utilities/ovs-tcpdump.in +++ b/utilities/ovs-tcpdump.in @@ -142,6 +142,7 @@ The following options are available: --mirror-to The name for the mirror port to use (optional) Default 'miINTERFACE' --span If specified, mirror all ports (optional) + --filter Set an OpenFlow formatted preselection filter """ % {'prog': sys.argv[0]}) sys.exit(0) @@ -354,7 +355,7 @@ class OVSDB(object): return 
result def bridge_mirror(self, intf_name, mirror_intf_name, br_name, - mirror_select_all=False): + mirror_select_all=False, mirror_filter=None): txn = self._start_txn() mirror = txn.insert(self.get_table('Mirror')) @@ -362,6 +363,9 @@ class OVSDB(object): mirror.select_all = mirror_select_all + if mirror_filter is not None: + mirror.filter = mirror_filter + mirrored_port = self._find_row_by_name('Port', intf_name) mirror.verify('select_dst_port') @@ -445,6 +449,7 @@ def main(): mirror_interface = None mirror_select_all = False dump_cmd = 'tcpdump' + mirror_filter = None for cur, nxt in argv_tuples(sys.argv[1:]): if skip_next: @@ -474,6 +479,10 @@ def main(): elif cur in ['--span']: mirror_select_all = True continue + elif cur in ['--filter']: + mirror_filter = nxt + skip_next = True + continue tcpdargs.append(cur) if interface is None: @@ -526,7 +535,7 @@ def main(): ovsdb.make_port(mirror_interface, ovsdb.port_bridge(interface)) ovsdb.bridge_mirror(interface, mirror_interface, ovsdb.port_bridge(interface), - mirror_select_all) + mirror_select_all, mirror_filter=mirror_filter) except OVSDBException as oe: print("ERROR: Unable to properly setup the mirror: %s." % str(oe)) sys.exit(1) diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index 88aedf6b2ee..6bb687f4b13 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -5217,6 +5217,7 @@ mirror_configure(struct mirror *m) { const struct ovsrec_mirror *cfg = m->cfg; struct ofproto_mirror_settings s; + int ret; /* Set name. */ if (strcmp(cfg->name, m->name)) { @@ -5285,8 +5286,18 @@ mirror_configure(struct mirror *m) /* Get VLAN selection. */ s.src_vlans = vlan_bitmap_from_array(cfg->select_vlan, cfg->n_select_vlan); + /* Set the filter, mirror_set() will strdup this pointer. */ + s.filter = cfg->filter; + /* Configure. 
*/ - ofproto_mirror_register(m->bridge->ofproto, m, &s); + ret = ofproto_mirror_register(m->bridge->ofproto, m, &s); + if (ret == EOPNOTSUPP) { + VLOG_ERR("ofproto %s: does not support mirroring", + m->bridge->ofproto->name); + } else if (ret) { + VLOG_ERR("bridge %s: mirror %s configuration is invalid", + m->bridge->name, m->name); + } /* Clean up. */ if (s.srcs != s.dsts) { diff --git a/vswitchd/vswitch.ovsschema b/vswitchd/vswitch.ovsschema index 95018d10745..68689fe2a30 100644 --- a/vswitchd/vswitch.ovsschema +++ b/vswitchd/vswitch.ovsschema @@ -1,6 +1,6 @@ {"name": "Open_vSwitch", - "version": "8.6.0", - "cksum": "1543805939 27765", + "version": "8.7.0", + "cksum": "3751637058 27869", "tables": { "Open_vSwitch": { "columns": { @@ -461,6 +461,9 @@ "type": {"key": "string", "value": "integer", "min": 0, "max": "unlimited"}, "ephemeral": true}, + "filter": { + "type": {"key": {"type": "string"}, + "min": 0, "max": 1}}, "external_ids": { "type": {"key": "string", "value": "string", "min": 0, "max": "unlimited"}}}}, diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 70e49e166a9..36cb4e49516 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -5286,6 +5286,21 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \ VLANs on which packets are selected for mirroring. An empty set selects packets on all VLANs. + +

      + When set, only packets that match <ref column="filter"/> are + selected for mirroring. Packets that do not match are ignored + by this mirror. The <ref column="filter"/> syntax is described + in <code>ovs-fields</code>(7). However, the <code>in_port</code> + field is not supported; <ref column="select_src_port"/> should be + used to limit the mirror to a source port.

      +

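      + This filter is applied after <ref column="select_all"/>, <ref column="select_dst_port"/>, <ref column="select_src_port"/>, and + <ref column="select_vlan"/>.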

      +
      From 0aa14d912d9a29d07ebc727007a1f21e3639eea5 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 15 Jul 2024 13:23:51 +0200 Subject: [PATCH 788/833] Prepare for 3.4.0. Acked-by: Mike Pattrick Acked-by: Jakob Meng Signed-off-by: Ilya Maximets --- Documentation/faq/releases.rst | 1 + NEWS | 2 +- configure.ac | 2 +- debian/changelog | 4 ++-- debian/rules | 4 ++-- 5 files changed, 7 insertions(+), 6 deletions(-) diff --git a/Documentation/faq/releases.rst b/Documentation/faq/releases.rst index 70219d7175e..9fbee90edc1 100644 --- a/Documentation/faq/releases.rst +++ b/Documentation/faq/releases.rst @@ -221,6 +221,7 @@ Q: What DPDK version does each Open vSwitch release work with? 3.1.x 22.11.5 3.2.x 22.11.5 3.3.x 23.11.1 + 3.4.x 23.11.1 ============ ======== Q: Are all the DPDK releases that OVS versions work with maintained? diff --git a/NEWS b/NEWS index 70f1ebefe93..5290696a865 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,4 @@ -Post-v3.3.0 +v3.4.0 - xx xxx xxxx -------------------- - Option '--mlockall' now only locks memory pages on fault, if possible. This also makes it compatible with vHost Post-copy Live Migration. diff --git a/configure.ac b/configure.ac index 8323e481d29..3e39120af87 100644 --- a/configure.ac +++ b/configure.ac @@ -13,7 +13,7 @@ # limitations under the License. 
AC_PREREQ(2.63) -AC_INIT(openvswitch, 3.3.90, bugs@openvswitch.org) +AC_INIT(openvswitch, 3.4.0, bugs@openvswitch.org) AC_CONFIG_SRCDIR([vswitchd/ovs-vswitchd.c]) AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_AUX_DIR([build-aux]) diff --git a/debian/changelog b/debian/changelog index 614c46ef919..929b8ecf433 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,8 +1,8 @@ -openvswitch (3.3.90-1) unstable; urgency=low +openvswitch (3.4.0-1) unstable; urgency=low * New upstream version - -- Open vSwitch team Wed, 17 Jan 2024 13:00:01 +0100 + -- Open vSwitch team Mon, 15 Jul 2024 13:00:00 +0100 openvswitch (3.3.0-1) unstable; urgency=low diff --git a/debian/rules b/debian/rules index 075b0416284..b6f905f3cdd 100755 --- a/debian/rules +++ b/debian/rules @@ -134,8 +134,8 @@ override_dh_python3: # Helper target for creating snapshots from upstream git DATE=$(shell date +%Y%m%d) # Upstream branch to track -BRANCH=branch-3.3 -VERSION=3.3.0 +BRANCH=branch-3.4 +VERSION=3.4.0 get-orig-snapshot: rm -Rf openvswitch-upstream From 903aa8fdc8f2f491ee72dc3303f6adfde7594b49 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 15 Jul 2024 13:23:52 +0200 Subject: [PATCH 789/833] Prepare for post-3.4.0 (3.4.90). Acked-by: Mike Pattrick Acked-by: Jakob Meng Signed-off-by: Ilya Maximets --- NEWS | 4 ++++ configure.ac | 2 +- debian/changelog | 6 ++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/NEWS b/NEWS index 5290696a865..a6fc436c88d 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,7 @@ +Post-v3.4.0 +-------------------- + + v3.4.0 - xx xxx xxxx -------------------- - Option '--mlockall' now only locks memory pages on fault, if possible. diff --git a/configure.ac b/configure.ac index 3e39120af87..266e9d4799e 100644 --- a/configure.ac +++ b/configure.ac @@ -13,7 +13,7 @@ # limitations under the License. 
AC_PREREQ(2.63) -AC_INIT(openvswitch, 3.4.0, bugs@openvswitch.org) +AC_INIT(openvswitch, 3.4.90, bugs@openvswitch.org) AC_CONFIG_SRCDIR([vswitchd/ovs-vswitchd.c]) AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_AUX_DIR([build-aux]) diff --git a/debian/changelog b/debian/changelog index 929b8ecf433..3bc24aa706b 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +openvswitch (3.4.90-1) unstable; urgency=low + + * New upstream version + + -- Open vSwitch team Mon, 15 Jul 2024 13:00:01 +0100 + openvswitch (3.4.0-1) unstable; urgency=low * New upstream version From 3985fa03b5c9f5ac45952299b3c6880f401294e2 Mon Sep 17 00:00:00 2001 From: Vipul Ashri Date: Tue, 16 Jul 2024 13:57:36 +0530 Subject: [PATCH 790/833] dpctl: Fix netdev reference leak in "show" command. This specific Netdev leak is causing us stale VHU entries, where it is showing false limit reaching maximum and preventing us to create new entries for us. This leak can impact other nics also. Steps to reproduce, While running a test with a continous VM creation/deletion using an orchestration script with-in cloud environment. In parallel we have some monitoring script calling ovs-appctl dpctl/show stats commands every minute. Root-cause analysis, During VHU port delete, one of netdev references were not reduced to 0 as show_dpif call has not given-up the reference back or doing bad cleanup. This pending deference preventing VHU deletion sequence, this is found to be one of corner case inside dpctl code which results in leaking up netdev which ultimately results in stale VHU entry. After fixing this problematic cleanup, issue is not seen. 
Fixes: fceef2095222 ("dpctl: add ovs-appctl dpctl/* commands to talk to dpif-netdev") Signed-off-by: Vipul Ashri Reviewed-by: David Marchand Signed-off-by: Simon Horman --- lib/dpctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/dpctl.c b/lib/dpctl.c index a70df534202..f764cf16410 100644 --- a/lib/dpctl.c +++ b/lib/dpctl.c @@ -738,8 +738,8 @@ show_dpif(struct dpif *dpif, struct dpctl_params *dpctl_p) continue; } error = netdev_get_stats(netdev, &s); + netdev_close(netdev); if (!error) { - netdev_close(netdev); print_stat(dpctl_p, " RX packets:", s.rx_packets); print_stat(dpctl_p, " errors:", s.rx_errors); print_stat(dpctl_p, " dropped:", s.rx_dropped); From 03cd668e05c212368f434ec6b8293772bb5317ea Mon Sep 17 00:00:00 2001 From: David Marchand Date: Fri, 12 Jul 2024 10:30:23 +0200 Subject: [PATCH 791/833] dpif-netlink-rtnl: Fix netdev leak in out-of-tree tunnels probe. Caught by code review, calling netdev_open works in pair of netdev_close when no reference to a netdev must be kept. Fixes: 921c370a9df5 ("dpif-netlink: Probe for out-of-tree tunnels, decides used interface") Signed-off-by: David Marchand Acked-by: Eric Garver Signed-off-by: Simon Horman --- lib/dpif-netlink-rtnl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/dpif-netlink-rtnl.c b/lib/dpif-netlink-rtnl.c index 5788294ae0d..f7035333e63 100644 --- a/lib/dpif-netlink-rtnl.c +++ b/lib/dpif-netlink-rtnl.c @@ -566,6 +566,7 @@ dpif_netlink_rtnl_probe_oot_tunnels(void) tnl_cfg = netdev_get_tunnel_config(netdev); if (!tnl_cfg) { + netdev_close(netdev); return true; } From 6165c92a2867c00316d9b3b80d8d59aa74452988 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 16 Jul 2024 12:48:06 +0200 Subject: [PATCH 792/833] ofp-actions: Fix reporting observation point bits instead of domain. Found by Coverity: CID 397544: Incorrect expression (COPY_PASTE_ERROR) "obs_point_src" in "(*os).obs_point_src.n_bits" looks like a copy-paste error. 
Also adding a test case to cover this situation. Fixes: 1aa9e137fe36 ("ofp-actions: Load data from fields in sample action.") Acked-by: Simon Horman Signed-off-by: Ilya Maximets --- lib/ofp-actions.c | 2 +- tests/ofp-actions.at | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/ofp-actions.c b/lib/ofp-actions.c index 2a1f5c3c4ee..fe6a17b6dad 100644 --- a/lib/ofp-actions.c +++ b/lib/ofp-actions.c @@ -6489,7 +6489,7 @@ parse_SAMPLE(char *arg, const struct ofpact_parse_params *pp) if (os->obs_domain_src.n_bits > 32) { return xasprintf("size of obs_domain_id field (%d) " "exceeds maximum (32)", - os->obs_point_src.n_bits); + os->obs_domain_src.n_bits); } } } else if (!strcmp(key, "obs_point_id")) { diff --git a/tests/ofp-actions.at b/tests/ofp-actions.at index 86aec12e80a..8a0504b3cb3 100644 --- a/tests/ofp-actions.at +++ b/tests/ofp-actions.at @@ -1127,6 +1127,8 @@ bad_action 'unroll_xlate' "UNROLL is an internal action that shouldn't be used v # sample bad_action 'sample(probability=0)' 'invalid probability value "0"' bad_action 'sample(sampling_port=asdf)' 'asdf: unknown port' +bad_action 'sample(probability=12345,obs_domain_id=NXM_NX_CT_LABEL[[5..40]])' \ + 'size of obs_domain_id field (36) exceeds maximum (32)' bad_action 'sample(probability=12345,obs_point_id=NXM_NX_CT_LABEL[[0..32]])' \ 'size of obs_point_id field (33) exceeds maximum (32)' bad_action 'sample(foo=bar)' 'invalid key "foo" in "sample" argument' From d5fef714bca433f616c128ba4c5f4e05715db5e7 Mon Sep 17 00:00:00 2001 From: Ales Musil Date: Tue, 16 Jul 2024 13:45:53 +0200 Subject: [PATCH 793/833] flow: Fix unaligned access to the ND target in miniflow_extract. The data in the buffer are aligned to 2 bytes, however 'struct in6_addr' is aligned to 4 bytes. Use the 2 bytes aligned equivalent 'union ovs_16aligned_in6_addr' instead. 
This was caught by one of the OVN tests: lib/flow.c:1133:25: runtime error: load of misaligned address 0x51400009cc92 for type 'const struct in6_addr *', which requires 4 byte alignment 0x51400009cc92: note: pointer points here 00 00 00 00 10 00 00 00 00 00 00 00 00 00 00 00 00 00 ^ 0 0x8255b2 in miniflow_extract lib/flow.c:1133:25 1 0x81d921 in flow_extract lib/flow.c:671:5 2 0xa966d4 in ofp_packet_to_string lib/ofp-print.c:82:5 3 0xa76de2 in ofputil_packet_in_private_format lib/ofp-packet.c:1037:24 4 0xa99817 in ofp_print_packet_in lib/ofp-print.c:132:9 5 0xa97f46 in ofp_to_string__ lib/ofp-print.c 6 0xa97f46 in ofp_to_string lib/ofp-print.c:1264:21 7 0xc338f4 in do_send lib/vconn.c:687:19 8 0xb7f678 in try_send lib/rconn.c:1128:14 9 0xb7d725 in rconn_send__ lib/rconn.c:760:13 10 0xb7d8e7 in rconn_send_with_limit lib/rconn.c:816:17 11 0x6f70de in do_send_packet_ins ofproto/connmgr.c:1697:13 12 0x6f691f in connmgr_send_async_msg ofproto/connmgr.c:1682:9 13 0x5c2d23 in run ofproto/ofproto-dpif.c:1877:13 14 0x56737f in ofproto_run ofproto/ofproto.c:1906:13 15 0x50d4fc in bridge_run__ vswitchd/bridge.c:3287:9 16 0x50a764 in bridge_run vswitchd/bridge.c:3346:5 17 0x53eed7 in main vswitchd/ovs-vswitchd.c:130:9 Acked-by: Simon Horman Signed-off-by: Ales Musil Signed-off-by: Ilya Maximets --- lib/flow.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/flow.c b/lib/flow.c index dc5fb328d9c..9be4375246a 100644 --- a/lib/flow.c +++ b/lib/flow.c @@ -408,7 +408,8 @@ parse_ethertype(const void **datap, size_t *sizep) static inline bool parse_icmpv6(const void **datap, size_t *sizep, const struct icmp6_data_header *icmp6, - ovs_be32 *rso_flags, const struct in6_addr **nd_target, + ovs_be32 *rso_flags, + const union ovs_16aligned_in6_addr **nd_target, struct eth_addr arp_buf[2], uint8_t *opt_type) { if (icmp6->icmp6_base.icmp6_code != 0 || @@ -1117,7 +1118,7 @@ miniflow_extract(struct dp_packet *packet, struct miniflow *dst) } } else if 
(OVS_LIKELY(nw_proto == IPPROTO_ICMPV6)) { if (OVS_LIKELY(size >= sizeof(struct icmp6_data_header))) { - const struct in6_addr *nd_target; + const union ovs_16aligned_in6_addr *nd_target; struct eth_addr arp_buf[2]; /* This will populate whether we received Option 1 * or Option 2. */ From f973d9543ecd461adcb63ca8db08a51252f1a8a7 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 16 Jul 2024 14:47:17 +0200 Subject: [PATCH 794/833] ofproto-dpif-xlate: Remove misleading wc NULL check in packet mirror. 'wc' can't be NULL there and if it can we'd already crash a few lines before setting up vlan flags. The check is misleading as it makes people to assume that wc can be NULL. And it makes Coverity think the same: CID 1596572: (#1 of 1): Dereference after null check (FORWARD_NULL) 25. var_deref_op: Dereferencing null pointer ctx->wc. 14. var_compare_op: Comparing ctx->wc to null implies that ctx->wc might be null Remove the check. Fixes: 3b1882261c8b ("ofproto-dpif-mirror: Add support for pre-selection filter.") Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif-xlate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index be2c707215f..02567a961d5 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -2315,7 +2315,7 @@ mirror_packet(struct xlate_ctx *ctx, struct xbundle *xbundle, } /* After the VLAN check, apply a flow mask if a filter is specified. */ - if (ctx->wc && mc.filter_flow) { + if (mc.filter_flow) { flow_wildcards_union_with_minimask(ctx->wc, mc.filter_mask); if (!OVS_UNLIKELY( miniflow_equal_flow_in_minimask(mc.filter_flow, From ebdc3cf91ed582974d67da529e7e6292d3ee9f58 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 17 Jul 2024 12:55:02 +0200 Subject: [PATCH 795/833] docs: Define Read the Docs configuration for Sphinx HTML parameters. 
Read the Docs was always mangling the conf.py during the build to inject custom domains configured in the project settings and some other stuff. But they will stop doing that soon [1]. Adding recommended changes to the config to get this info from the environment. [1] https://about.readthedocs.com/blog/2024/07/addons-by-default/ Acked-by: Simon Horman Signed-off-by: Ilya Maximets --- Documentation/conf.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/conf.py b/Documentation/conf.py index 15785605ad8..2364405ade8 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py @@ -12,6 +12,7 @@ # All configuration values have a default; values that are commented out # serve to show the default. +import os import string import sys @@ -108,6 +109,13 @@ # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] +# Define the canonical URL for our domain configured on Read the Docs. +html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "") + +# Tell Jinja2 templates the build is running on Read the Docs. +html_context = {} +if os.environ.get("READTHEDOCS", "") == "True": + html_context["READTHEDOCS"] = True # -- Options for manual page output --------------------------------------- From f9078407a9d8dffffc391ccfe144de823436cae6 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 17 Jul 2024 14:00:56 +0200 Subject: [PATCH 796/833] ofproto-dpif-xlate: Initialize observe_offset for sample actions. 
For some reason gcc 14.1.1 from Fedora 41 thinks that the variable may end up not initialized: ofproto/ofproto-dpif-xlate.c: In function 'compose_sample_action': ofproto/ofproto-dpif-xlate.c:3465:40: error: 'observe_offset' may be used uninitialized 3465 | ctx->xout->last_observe_offset = observe_offset; | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~ ofproto/ofproto-dpif-xlate.c:3418:12: note: 'observe_offset' was declared here 3418 | size_t observe_offset; | ^~~~~~~~~~~~~~ We have an assertion in the code to ensure that at least one of the actions is present (userspace or psample), so the variable should actually be always initialized. Initialize explicitly just to silence the warning. Fixes: 516569d31fbf ("ofproto: xlate: Make sampled drops explicit.") Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- ofproto/ofproto-dpif-xlate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 02567a961d5..850597b3a48 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -3423,8 +3423,8 @@ compose_sample_action(struct xlate_ctx *ctx, * insert a meter action before the user space action. */ struct ofproto *ofproto = &ctx->xin->ofproto->up; uint32_t meter_id = ofproto->slowpath_meter_id; + size_t observe_offset = UINT32_MAX; size_t cookie_offset = 0; - size_t observe_offset; /* The meter action is only used to throttle userspace actions. * If they are not needed and the sampling rate is 100%, avoid generating From b516da14ca8b38cf65db22a5ff85921da1d13002 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 17 Jul 2024 14:00:57 +0200 Subject: [PATCH 797/833] util: Add non-NULL format assertion to xvasprintf. 
For some reason GCC 14.1.1 on Fedora 41 assumes that format can be NULL and emits a warning: lib/util.c: In function 'xvasprintf': lib/util.c:229:14: error: null format string 229 | needed = vsnprintf(NULL, 0, format, args); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ I didn't find any users where this can be true. Adding an assertion to silence the warning. In the worst case we'll find out where it is being called incorrectly. Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/util.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/util.c b/lib/util.c index 5253921b2c3..bdd6408b2a6 100644 --- a/lib/util.c +++ b/lib/util.c @@ -225,6 +225,8 @@ xvasprintf(const char *format, va_list args) size_t needed; char *s; + ovs_assert(format); + va_copy(args2, args); needed = vsnprintf(NULL, 0, format, args); From 53d9dcb9fbe05bf0432263b1826a71a1176f8916 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 17 Jul 2024 14:00:58 +0200 Subject: [PATCH 798/833] match: Fix false-positive snprintf size warning. GCC 14.1.1 of Fedora 41 thinks that 'i' can be in a full range and so 8 bytes is not enough to print it. lib/match.c: In function 'match_format': lib/match.c:1631:45: error: '%d' directive output may be truncated writing between 1 and 11 bytes into a region of size 8 1631 | snprintf(str_i, sizeof(str_i), "%d", i); | ^~ lib/match.c:1631:44: note: directive argument in the range [-2147483646, 1] 1631 | snprintf(str_i, sizeof(str_i), "%d", i); | ^~~~ lib/match.c:1631:13: note: 'snprintf' output between 2 and 12 bytes into a destination of size 8 1631 | snprintf(str_i, sizeof(str_i), "%d", i); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In practice that value can't be larger than 2, but it's not a performance critical code, so let's just increase the size to a maximum 12. 
Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/match.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/match.c b/lib/match.c index 0b9dc4278c1..9b7e06e0c7f 100644 --- a/lib/match.c +++ b/lib/match.c @@ -1618,7 +1618,7 @@ match_format(const struct match *match, ds_put_char(s, ','); } for (i = 0; i < FLOW_MAX_VLAN_HEADERS; i++) { - char str_i[8]; + char str_i[12]; if (!wc->masks.vlans[i].tci) { break; From cdba5e30edc3ebd0559ef53fb6fb92ec26b7868e Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 17 Jul 2024 14:24:16 +0100 Subject: [PATCH 799/833] AUTHORS: Add Vipul Ashri. Add Vipul Ashri to AUTHORS file. Signed-off-by: Simon Horman Signed-off-by: Ilya Maximets --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 155e484360d..28dcce4eaf7 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -760,6 +760,7 @@ Tytus Kurek Tytus.Kurek@pega.com Valentin Bud valentin@hackaserver.com Vasiliy Tolstov v.tolstov@selfip.ru Vinllen Chen cvinllen@gmail.com +Vipul Ashri vipul.ashri@ericsson.com Vishal Swarankar vishal.swarnkar@gmail.com Vjekoslav Brajkovic balkan@cs.washington.edu Voravit T. voravit@kth.se From f12f4d89063bb7b67d975d618a1792a6a834a78f Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 18 Jul 2024 14:48:03 +0100 Subject: [PATCH 800/833] Documentation: Update QEMU documentation URLs. The current QEMU documentation URLs, which point to a presumably old Git repository, appear to time out. Update it with a new links under https://www.qemu.org/docs Acked-by: Ilya Maximets Signed-off-by: Simon Horman --- Documentation/topics/dpdk/vhost-user.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/topics/dpdk/vhost-user.rst b/Documentation/topics/dpdk/vhost-user.rst index d9d87aa0872..7bba08ac216 100644 --- a/Documentation/topics/dpdk/vhost-user.rst +++ b/Documentation/topics/dpdk/vhost-user.rst @@ -312,7 +312,7 @@ predictable migration time. 
Mostly used as a second phase after the normal More information can be found in QEMU `docs`_. -.. _`docs`: https://git.qemu.org/?p=qemu.git;a=blob;f=docs/devel/migration.rst +.. _`docs`: https://www.qemu.org/docs/master/devel/migration/postcopy.html Post-copy support may be enabled via a global config value ``vhost-postcopy-support``. Setting this to ``true`` enables Post-copy support @@ -487,7 +487,7 @@ Sample XML -.. _QEMU documentation: http://git.qemu-project.org/?p=qemu.git;a=blob;f=docs/specs/vhost-user.txt;h=7890d7169;hb=HEAD +.. _QEMU documentation: https://www.qemu.org/docs/master/interop/vhost-user.html Jumbo Frames ------------ From 8f3d6c145b7cf9ae6c320a0291a715b9c4715bd6 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 18 Jul 2024 21:12:37 +0200 Subject: [PATCH 801/833] netlink-notifier: Silence the UBsan's function pointer mismatch error. There are two types of netlink notifier callbacks: nln_notify_func and rtnetlink_notify_func. The rtnetlink_notify_func is only registered via rtnetlink_notifier_create(), so there is no real case where we could use the wrong function pointer. But UBsan in Clang 17 complains that the function pointer type is not exactly the same: lib/netlink-notifier.c:237:13: runtime error: call to function name_table_change through pointer to incorrect function type 'void (*)(const void *, void *)' lib/route-table.c:406: note: name_table_change defined here 0 0xf65ed7 in nln_report lib/netlink-notifier.c:237:13 1 0xf64e2e in nln_run lib/netlink-notifier.c 2 0x50d4f2 in bridge_run vswitchd/bridge.c:3373:5 3 0x547c55 in main vswitchd/ovs-vswitchd.c:137:9 4 0x7f8149 in __libc_start_call_main 5 0x7f820a in __libc_start_main@GLIBC_2.2.5 6 0x42dfd4 in _start (vswitchd/ovs-vswitchd+0x42dfd4) Turn off function sanitizing for nln_report() the same as we do for RCU callbacks to avoid runtime errors with UBsan enabled. Reproduced with OVN test suite running multiple tests in parallel. 
Acked-by: Mike Pattrick Signed-off-by: Ilya Maximets --- lib/netlink-notifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/netlink-notifier.c b/lib/netlink-notifier.c index dfecb97789f..7ea5a418182 100644 --- a/lib/netlink-notifier.c +++ b/lib/netlink-notifier.c @@ -223,7 +223,7 @@ nln_wait(struct nln *nln) } } -void +void OVS_NO_SANITIZE_FUNCTION nln_report(const struct nln *nln, void *change, int group) { struct nln_notifier *notifier; From f5e1ab2ad992448d35475e94f8ce6e7d9ac6c8ba Mon Sep 17 00:00:00 2001 From: Derek G Foster Date: Thu, 13 Oct 2022 04:52:27 -0700 Subject: [PATCH 802/833] Apply P4-OVS changes to main branch Signed-off-by: Derek G Foster --- .gitignore | 1 + configure.ac | 3 +++ lib/dpif-netlink-rtnl.c | 12 ++++++++++++ lib/netdev-vport.c | 7 +++++-- lib/netdev.h | 4 ++++ lib/util.h | 1 - m4/ovs_check_p4ovs.m4 | 21 +++++++++++++++++++++ vswitchd/automake.mk | 27 +++++++++++++++++++++++++-- 8 files changed, 71 insertions(+), 5 deletions(-) create mode 100644 m4/ovs_check_p4ovs.m4 diff --git a/.gitignore b/.gitignore index 26ed8d3d067..b3fb29ec078 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ *.so *.suo **/*.sym +**/*.pc *~ *,cover .#* diff --git a/configure.ac b/configure.ac index 266e9d4799e..0eb62b496b5 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,5 @@ # Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc. +# Copyright (c) 2021 Intel Corporation. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -203,6 +204,8 @@ OVS_CHECK_LINUX_VIRTIO_TYPES OVS_CHECK_DPDK OVS_CHECK_PRAGMA_MESSAGE OVS_CHECK_VERSION_SUFFIX +OVS_CHECK_P4OVS + AC_SUBST([CFLAGS]) AC_SUBST([OVS_CFLAGS]) AC_SUBST([OVS_LDFLAGS]) diff --git a/lib/dpif-netlink-rtnl.c b/lib/dpif-netlink-rtnl.c index f7035333e63..7e489663478 100644 --- a/lib/dpif-netlink-rtnl.c +++ b/lib/dpif-netlink-rtnl.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2017 Red Hat, Inc. + * Copyright (c) 2021 Intel Corporation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -377,6 +378,17 @@ dpif_netlink_rtnl_create(const struct netdev_tunnel_config *tnl_cfg, /* tunnel unique info */ switch (type) { case OVS_VPORT_TYPE_VXLAN: + nl_msg_put_u8(&request, IFLA_VXLAN_TTL, tnl_cfg->ttl); + nl_msg_put_u32(&request, IFLA_VXLAN_ID, tnl_cfg->vni); + if (tnl_cfg->ipv6_dst.__in6_u.__u6_addr32[0] == 0) { + nl_msg_put_be32(&request, IFLA_VXLAN_GROUP, + tnl_cfg->ipv6_dst.__in6_u.__u6_addr32[3]); + } + if (tnl_cfg->ipv6_src.__in6_u.__u6_addr32[0] == 0) { + nl_msg_put_be32(&request, IFLA_VXLAN_LOCAL, + tnl_cfg->ipv6_src.__in6_u.__u6_addr32[3]); + } + nl_msg_put_u8(&request, IFLA_VXLAN_LEARNING, 0); nl_msg_put_u8(&request, IFLA_VXLAN_COLLECT_METADATA, 1); nl_msg_put_u8(&request, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 1); diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c index 234a4ebe127..b3d1fb041f8 100644 --- a/lib/netdev-vport.c +++ b/lib/netdev-vport.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2010, 2011, 2012, 2013, 2014, 2017 Nicira, Inc. * Copyright (c) 2016 Red Hat, Inc. + * Copyright (c) 2021 Intel Corporation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -714,8 +715,10 @@ set_tunnel_config(struct netdev *dev_, const struct smap *args, char **errp) if (!strcmp(node->value, "false")) { tnl_cfg.dont_fragment = false; } - } else if (!strcmp(node->key, "key") || - !strcmp(node->key, "in_key") || + } else if (!strcmp(node->key, "key") && strcmp(node->value, "flow")) { + /* Add VNI to tunnel config if the value is not flow */ + tnl_cfg.vni = atoi(node->value); + } else if (!strcmp(node->key, "in_key") || !strcmp(node->key, "out_key") || !strcmp(node->key, "packet_type")) { /* Handled separately below. */ diff --git a/lib/netdev.h b/lib/netdev.h index 63e03d72db4..d298cabe779 100644 --- a/lib/netdev.h +++ b/lib/netdev.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc. + * Copyright (c) 2021 Intel Corporation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -174,6 +175,9 @@ struct netdev_tunnel_config { #define SRV6_MAX_SEGS 6 struct in6_addr srv6_segs[SRV6_MAX_SEGS]; enum netdev_srv6_flowlabel srv6_flowlabel; +#ifdef P4OVS + uint32_t vni; +#endif }; void netdev_run(void); diff --git a/lib/util.h b/lib/util.h index c486b534049..c1fd120bc42 100644 --- a/lib/util.h +++ b/lib/util.h @@ -27,7 +27,6 @@ #include #include #include "compiler.h" -#include "util.h" #include "openvswitch/util.h" #if defined(__aarch64__) && __GNUC__ >= 6 #include diff --git a/m4/ovs_check_p4ovs.m4 b/m4/ovs_check_p4ovs.m4 new file mode 100644 index 00000000000..c1062327505 --- /dev/null +++ b/m4/ovs_check_p4ovs.m4 @@ -0,0 +1,21 @@ +dnl OVS_CHECK_P4OVS - Process P4 options. + +dnl Copyright(c) 2021-2022 Intel Corporation. 
+dnl SPDX-License-Identifier: Apache 2.0 + +AC_DEFUN([OVS_CHECK_P4OVS], [ + AC_ARG_WITH([p4ovs], + [AC_HELP_STRING([--with-p4ovs], [Build with P4 support])], + [have_p4ovs=true]) + AC_MSG_CHECKING([whether P4OVS is enabled]) + if test "$have_p4ovs" != true || test "$with_p4ovs" = no; then + AC_MSG_RESULT([no]) + P4OVS_VALID=false + else + AC_MSG_RESULT([yes]) + P4OVS_VALID=true + AC_DEFINE([P4OVS], [1], [System includes P4 support.]) + fi + dnl export automake conditional + AM_CONDITIONAL([P4OVS], test "$P4OVS_VALID" = true) +]) diff --git a/vswitchd/automake.mk b/vswitchd/automake.mk index 830c9a18821..1794efb7de2 100644 --- a/vswitchd/automake.mk +++ b/vswitchd/automake.mk @@ -1,19 +1,42 @@ -sbin_PROGRAMS += vswitchd/ovs-vswitchd +# vswitchd man_MANS += vswitchd/ovs-vswitchd.8 CLEANFILES += \ vswitchd/ovs-vswitchd.8 -vswitchd_ovs_vswitchd_SOURCES = \ +vswitchd_sources = \ vswitchd/bridge.c \ vswitchd/bridge.h \ vswitchd/ovs-vswitchd.c \ vswitchd/system-stats.c \ vswitchd/system-stats.h + +if P4OVS +# Build a static library instead of an executable. 
+lib_LTLIBRARIES += vswitchd/libvswitchd.la + +vswitchd_libvswitchd_la_CPPFLAGS = $(AM_CPPFLAGS) + +vswitchd_libvswitchd_la_SOURCES = \ + $(vswitchd_sources) + +vswitchd_libvswitchd_la_LIBADD = \ + ofproto/libofproto.la \ + lib/libsflow.la \ + lib/libopenvswitch.la +else +sbin_PROGRAMS += vswitchd/ovs-vswitchd + +vswitchd_ovs_vswitchd_SOURCES = \ + $(vswitchd_sources) + vswitchd_ovs_vswitchd_LDADD = \ ofproto/libofproto.la \ lib/libsflow.la \ lib/libopenvswitch.la + vswitchd_ovs_vswitchd_LDFLAGS = $(AM_LDFLAGS) $(DPDK_vswitchd_LDFLAGS) +endif + MAN_ROOTS += vswitchd/ovs-vswitchd.8.in # vswitch schema and IDL From d093cc0f1a26e45a9c29a664ab9c70a13c32e56c Mon Sep 17 00:00:00 2001 From: nupuruttarwar Date: Mon, 21 Nov 2022 08:22:32 -0800 Subject: [PATCH 803/833] Add support for P4 table entry for mac learning and tunnel creation (#90) Program a forwarding table entry when a MAC is learnt or expired, and a tunnel table entry when a vxlan port is created/deleted. Disable ovs-testcontroller when built in P4OVS mode since testcontroller references mac learning library.
In the future, the test controller will be built as a static library and linked to the sidecar before building its executable. Signed-off-by: Nupur Uttarwar Signed-off-by: Nupur Uttarwar --- include/openvswitch/automake.mk | 5 ++ include/openvswitch/ovs-p4rt.h | 58 ++++++++++++++++++++++ lib/dpif-netlink-rtnl.c | 11 ----- lib/mac-learning.c | 11 +++++ ofproto/ofproto-dpif-xlate.c | 86 +++++++++++++++++++++++++++++++++ ofproto/ofproto-dpif.c | 58 ++++++++++++++++++++++ utilities/automake.mk | 31 ++++++++++-- 7 files changed, 245 insertions(+), 15 deletions(-) create mode 100644 include/openvswitch/ovs-p4rt.h diff --git a/include/openvswitch/automake.mk b/include/openvswitch/automake.mk index 0cc1f569e0a..681eca82d43 100644 --- a/include/openvswitch/automake.mk +++ b/include/openvswitch/automake.mk @@ -52,6 +52,11 @@ openvswitchinclude_HEADERS = \ include/openvswitch/vlog.h \ include/openvswitch/nsh.h +if P4OVS +openvswitchinclude_HEADERS += \ + include/openvswitch/ovs-p4rt.h +endif + if HAVE_CXX # OVS does not use C++ itself, but it provides public header files # that a C++ compiler should accept, so when --enable-Werror is in diff --git a/include/openvswitch/ovs-p4rt.h b/include/openvswitch/ovs-p4rt.h new file mode 100644 index 00000000000..74ec5350bcd --- /dev/null +++ b/include/openvswitch/ovs-p4rt.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2022 Intel Corporation. + * SPDX-License-Identifier: Apache-2.0 + * + * Defines the public interface to an externally-supplied module + * that permits OvS to communicate with the P4 control plane.
+ */ + +#ifndef OPENVSWITCH_OVS_P4RT_H +#define OPENVSWITCH_OVS_P4RT_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct p4_ipaddr { + uint8_t family; + uint8_t prefix_len; + uint32_t v4addr; +}; + +struct tunnel_info { + uint32_t ifindex; + uint32_t port_id; + struct p4_ipaddr local_ip; + struct p4_ipaddr remote_ip; + uint16_t dst_port; + uint16_t vni; +}; + +struct vlan_info { + uint32_t vlan_id; +}; + +struct mac_learning_info { + bool is_tunnel; + bool is_vlan; + uint8_t mac_addr[6]; + union { + struct tunnel_info tnl_info; + struct vlan_info vln_info; + }; +}; + +// Function declarations +extern void ConfigFdbTableEntry(struct mac_learning_info learn_info, + bool insert_entry); +extern void ConfigTunnelTableEntry(struct tunnel_info tunnel_info, + bool insert_entry); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // OPENVSWITCH_OVS_P4RT_H + diff --git a/lib/dpif-netlink-rtnl.c b/lib/dpif-netlink-rtnl.c index 7e489663478..e2e0b40d685 100644 --- a/lib/dpif-netlink-rtnl.c +++ b/lib/dpif-netlink-rtnl.c @@ -378,17 +378,6 @@ dpif_netlink_rtnl_create(const struct netdev_tunnel_config *tnl_cfg, /* tunnel unique info */ switch (type) { case OVS_VPORT_TYPE_VXLAN: - nl_msg_put_u8(&request, IFLA_VXLAN_TTL, tnl_cfg->ttl); - nl_msg_put_u32(&request, IFLA_VXLAN_ID, tnl_cfg->vni); - if (tnl_cfg->ipv6_dst.__in6_u.__u6_addr32[0] == 0) { - nl_msg_put_be32(&request, IFLA_VXLAN_GROUP, - tnl_cfg->ipv6_dst.__in6_u.__u6_addr32[3]); - } - if (tnl_cfg->ipv6_src.__in6_u.__u6_addr32[0] == 0) { - nl_msg_put_be32(&request, IFLA_VXLAN_LOCAL, - tnl_cfg->ipv6_src.__in6_u.__u6_addr32[3]); - } - nl_msg_put_u8(&request, IFLA_VXLAN_LEARNING, 0); nl_msg_put_u8(&request, IFLA_VXLAN_COLLECT_METADATA, 1); nl_msg_put_u8(&request, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 1); diff --git a/lib/mac-learning.c b/lib/mac-learning.c index 5932e2709d0..763c032c9a0 100644 --- a/lib/mac-learning.c +++ b/lib/mac-learning.c @@ -30,6 +30,10 @@ #include "util.h" #include "vlan-bitmap.h" +#if 
defined(P4OVS) +#include "openvswitch/ovs-p4rt.h" +#endif + COVERAGE_DEFINE(mac_learning_learned); COVERAGE_DEFINE(mac_learning_expired); COVERAGE_DEFINE(mac_learning_evicted); @@ -617,6 +621,13 @@ mac_learning_expire(struct mac_learning *ml, struct mac_entry *e) mac_entry_set_port(ml, e, NULL); hmap_remove(&ml->table, &e->hmap_node); ovs_list_remove(&e->lru_node); +#if defined(P4OVS) + struct mac_learning_info fdb_info; + memset(&fdb_info, 0, sizeof(struct mac_learning_info)); + memcpy(fdb_info.mac_addr, e->mac.ea, sizeof(fdb_info.mac_addr)); + fdb_info.is_vlan = true; + ConfigFdbTableEntry(fdb_info, false); +#endif free(e); } diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 850597b3a48..345bfd8a39a 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -69,6 +69,15 @@ #include "uuid.h" #include "vlan-bitmap.h" +#if defined(P4OVS) +#include "lib/netdev.h" +#include +#include +#include +#include +#include "openvswitch/ovs-p4rt.h" +#endif //P4OVS + COVERAGE_DEFINE(xlate_actions); COVERAGE_DEFINE(xlate_actions_oversize); COVERAGE_DEFINE(xlate_actions_too_many_output); @@ -3150,6 +3159,67 @@ is_ip_local_multicast(const struct flow *flow, struct flow_wildcards *wc) } } +#if defined(P4OVS) +static int32_t +get_fdb_data(struct xport *port, struct eth_addr mac_addr, + struct mac_learning_info *fdb_info) +{ + if (!port || !port->netdev || !port->xbundle) { + return -1; + } + + memcpy(fdb_info->mac_addr, mac_addr.ea, sizeof(fdb_info->mac_addr)); + if (port->is_tunnel) { + fdb_info->is_tunnel = port->is_tunnel; + const struct netdev_tunnel_config *underlay_tnl = NULL; + underlay_tnl = netdev_get_tunnel_config(port->netdev); + if (!underlay_tnl) { + VLOG_ERR("Error retrieving netdev tunnel config"); + return -1; + } + + int underlay_ifindex = netdev_get_ifindex(port->netdev); + if (underlay_ifindex < 0) { + VLOG_ERR("Invalid tunnel ifindex"); + return -1; + } + + fdb_info->tnl_info.ifindex = (uint32_t)underlay_ifindex; + 
fdb_info->tnl_info.local_ip.v4addr = underlay_tnl->ipv6_src.__in6_u.__u6_addr32[3]; + fdb_info->tnl_info.remote_ip.v4addr = underlay_tnl->ipv6_dst.__in6_u.__u6_addr32[3]; + fdb_info->tnl_info.dst_port = underlay_tnl->dst_port; + fdb_info->tnl_info.vni = underlay_tnl->vni; + } else { + const char *port_name = port->xbundle->name; + if (strncmp(port_name, "vlan", strlen("vlan"))) { + VLOG_ERR("Not a VLAN interface, port name = %s", port_name); + return -1; + } else { + fdb_info->is_vlan = true; + int fd = socket(AF_INET, SOCK_DGRAM, 0); + if (fd == -1) { + VLOG_ERR("socket creation failed"); + return -1; + } + struct vlan_ioctl_args if_request; + memset(&if_request, 0, sizeof(if_request)); + strncpy(if_request.device1, port_name, sizeof(if_request.device1)-1); + + if_request.cmd = GET_VLAN_VID_CMD; + if (ioctl(fd, SIOCSIFVLAN, &if_request) == -1) { + close(fd); + VLOG_ERR("Error retrieving vlan id through ioctl"); + return -1; + } + fdb_info->vln_info.vlan_id = if_request.u.VID; + close(fd); + } + } + + return 0; +} +#endif + static void xlate_normal(struct xlate_ctx *ctx) { @@ -3221,6 +3291,22 @@ xlate_normal(struct xlate_ctx *ctx) update_learning_table(ctx, in_xbundle, flow->dl_src, vlan, is_grat_arp); } + +#if defined(P4OVS) + //MAC is learnt, program P4 forwarding table + struct xport *ovs_port = get_ofp_port(in_xbundle->xbridge, + flow->in_port.ofp_port); + struct mac_learning_info fdb_info; + memset(&fdb_info, 0, sizeof(fdb_info)); + + if (!get_fdb_data(ovs_port, flow->dl_src, &fdb_info)) { + ConfigFdbTableEntry(fdb_info, true); + } else { + VLOG_ERR("Error retrieving FDB information, skipping programming " + "P4 entry"); + } +#endif + if (ctx->xin->xcache && in_xbundle != &ofpp_none_bundle) { struct xc_entry *entry; diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index d3c353b9d60..3955cd4f578 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -75,6 +75,13 @@ #include "uuid.h" #include "vlan-bitmap.h" +#if defined(P4OVS) +#include 
"openvswitch/ovs-p4rt.h" +static int32_t +get_tunnel_data(struct netdev *netdev, + struct tunnel_info *tnl_info); +#endif + VLOG_DEFINE_THIS_MODULE(ofproto_dpif); COVERAGE_DEFINE(ofproto_dpif_expired); @@ -2355,6 +2362,18 @@ port_destruct(struct ofport *port_, bool del) } tnl_port_del(port, port->odp_port); +#if defined(P4OVS) + if (port->is_tunnel) { + struct tunnel_info tnl_info; + memset(&tnl_info, 0, sizeof(tnl_info)); + if (!get_tunnel_data(port->up.netdev, &tnl_info)) { + ConfigTunnelTableEntry(tnl_info, false); + } else { + VLOG_ERR("Error retrieving tunnel information, skipping programming " + "P4 entry"); + } + } +#endif sset_find_and_delete(&ofproto->ports, devname); sset_find_and_delete(&ofproto->ghost_ports, devname); bundle_remove(port_); @@ -4123,6 +4142,32 @@ port_query_by_name(const struct ofproto *ofproto_, const char *devname, return error; } +#if defined(P4OVS) +static int32_t +get_tunnel_data(struct netdev *netdev, + struct tunnel_info *tnl_info) +{ + const struct netdev_tunnel_config *underlay_tnl = NULL; + underlay_tnl = netdev_get_tunnel_config(netdev); + if (!underlay_tnl) { + VLOG_ERR("Error retrieving netdev tunnel config"); + return -1; + } + int underlay_ifindex = netdev_get_ifindex(netdev); + if (underlay_ifindex < 0) { + VLOG_ERR("Invalid tunnel ifindex"); + return -1; + } + tnl_info->ifindex = (uint32_t)underlay_ifindex; + tnl_info->local_ip.v4addr = underlay_tnl->ipv6_src.__in6_u.__u6_addr32[3]; + tnl_info->remote_ip.v4addr = underlay_tnl->ipv6_dst.__in6_u.__u6_addr32[3]; + tnl_info->dst_port = underlay_tnl->dst_port; + tnl_info->vni = underlay_tnl->vni; + + return 0; +} +#endif + static int port_add(struct ofproto *ofproto_, struct netdev *netdev) { @@ -4160,6 +4205,19 @@ port_add(struct ofproto *ofproto_, struct netdev *netdev) } else { sset_add(&ofproto->ports, devname); } + +#if defined(P4OVS) + if (netdev_get_tunnel_config(netdev)) { + struct tunnel_info tnl_info; + memset(&tnl_info, 0, sizeof(tnl_info)); + if 
(!get_tunnel_data(netdev, &tnl_info)) { + ConfigTunnelTableEntry(tnl_info, true); + } else { + VLOG_ERR("Error retrieving tunnel information, skipping programming " + "P4 entry"); + } + } +#endif return 0; } diff --git a/utilities/automake.mk b/utilities/automake.mk index 146b8c37fbb..42df6ea8eed 100644 --- a/utilities/automake.mk +++ b/utilities/automake.mk @@ -1,9 +1,18 @@ bin_PROGRAMS += \ utilities/ovs-appctl \ - utilities/ovs-testcontroller \ utilities/ovs-dpctl \ utilities/ovs-ofctl \ utilities/ovs-vsctl + +# Disable ovs-testcontroller when built in P4OVS mode since +# testcontroller references mac learning library. +# TODO:Build test controller as static library and +# link to sidecar before building it's executable + +if !P4OVS +bin_PROGRAMS += utilities/ovs-testcontroller +endif + bin_SCRIPTS += utilities/ovs-docker \ utilities/ovs-pki \ utilities/ovs-pcap \ @@ -80,17 +89,20 @@ EXTRA_DIST += \ utilities/usdt-scripts/upcall_cost.py \ utilities/usdt-scripts/upcall_monitor.py MAN_ROOTS += \ - utilities/ovs-testcontroller.8.in \ utilities/ovs-dpctl.8.in \ utilities/ovs-dpctl-top.8.in \ utilities/ovs-kmod-ctl.8 \ utilities/ovs-ofctl.8.in \ utilities/ovs-pcap.1.in \ utilities/ovs-vsctl.8.in + +if !P4OVS +MAN_ROOTS += utilities/ovs-testcontroller.8.in +endif + CLEANFILES += \ utilities/ovs-ctl \ utilities/ovs-check-dead-ifs \ - utilities/ovs-testcontroller.8 \ utilities/ovs-dpctl.8 \ utilities/ovs-dpctl-top \ utilities/ovs-dpctl-top.8 \ @@ -109,8 +121,12 @@ CLEANFILES += \ utilities/ovs-vlan-test \ utilities/ovs-vsctl.8 +if !P4OVS +CLEANFILES += \ + utilities/ovs-testcontroller.8 +endif + man_MANS += \ - utilities/ovs-testcontroller.8 \ utilities/ovs-dpctl.8 \ utilities/ovs-dpctl-top.8 \ utilities/ovs-kmod-ctl.8 \ @@ -118,11 +134,18 @@ man_MANS += \ utilities/ovs-pcap.1 \ utilities/ovs-vsctl.8 +if !P4OVS +man_MANS += \ + utilities/ovs-testcontroller.8 +endif + utilities_ovs_appctl_SOURCES = utilities/ovs-appctl.c utilities_ovs_appctl_LDADD = lib/libopenvswitch.la 
+if !P4OVS utilities_ovs_testcontroller_SOURCES = utilities/ovs-testcontroller.c utilities_ovs_testcontroller_LDADD = lib/libopenvswitch.la $(SSL_LIBS) +endif utilities_ovs_dpctl_SOURCES = utilities/ovs-dpctl.c utilities_ovs_dpctl_LDADD = lib/libopenvswitch.la From c4ec3ce5e5648600e4ca0ac8d844bfeec3144905 Mon Sep 17 00:00:00 2001 From: Nupur Uttarwar Date: Tue, 6 Dec 2022 19:24:52 -0800 Subject: [PATCH 804/833] Build ovs testcontroller as static library (#91) Build test controller as static library in P4OVS mode. The networking recipe will link it with the sidecar to generate the executable. Signed-off-by: Nupur Uttarwar --- utilities/automake.mk | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/utilities/automake.mk b/utilities/automake.mk index 42df6ea8eed..f3a16095659 100644 --- a/utilities/automake.mk +++ b/utilities/automake.mk @@ -4,12 +4,11 @@ bin_PROGRAMS += \ utilities/ovs-ofctl \ utilities/ovs-vsctl -# Disable ovs-testcontroller when built in P4OVS mode since -# testcontroller references mac learning library. 
-# TODO:Build test controller as static library and -# link to sidecar before building it's executable - -if !P4OVS +# Build test controller as static library and +# link to sidecar before building its executable +if P4OVS +lib_LTLIBRARIES += utilities/libtestcontroller.la +else bin_PROGRAMS += utilities/ovs-testcontroller endif @@ -94,11 +93,8 @@ MAN_ROOTS += \ utilities/ovs-kmod-ctl.8 \ utilities/ovs-ofctl.8.in \ utilities/ovs-pcap.1.in \ - utilities/ovs-vsctl.8.in - -if !P4OVS -MAN_ROOTS += utilities/ovs-testcontroller.8.in -endif + utilities/ovs-vsctl.8.in \ + utilities/ovs-testcontroller.8.in CLEANFILES += \ utilities/ovs-ctl \ @@ -119,12 +115,8 @@ CLEANFILES += \ utilities/ovs-tcpundump \ utilities/ovs-test \ utilities/ovs-vlan-test \ - utilities/ovs-vsctl.8 - -if !P4OVS -CLEANFILES += \ + utilities/ovs-vsctl.8 \ utilities/ovs-testcontroller.8 -endif man_MANS += \ utilities/ovs-dpctl.8 \ @@ -132,17 +124,17 @@ man_MANS += \ utilities/ovs-kmod-ctl.8 \ utilities/ovs-ofctl.8 \ utilities/ovs-pcap.1 \ - utilities/ovs-vsctl.8 - -if !P4OVS -man_MANS += \ + utilities/ovs-vsctl.8 \ utilities/ovs-testcontroller.8 -endif utilities_ovs_appctl_SOURCES = utilities/ovs-appctl.c utilities_ovs_appctl_LDADD = lib/libopenvswitch.la -if !P4OVS +if P4OVS +utilities_libtestcontroller_la_CPPFLAGS = $(AM_CPPFLAGS) +utilities_libtestcontroller_la_SOURCES = utilities/ovs-testcontroller.c +utilities_libtestcontroller_la_LIBADD = lib/libopenvswitch.la $(SSL_LIBS) +else utilities_ovs_testcontroller_SOURCES = utilities/ovs-testcontroller.c utilities_ovs_testcontroller_LDADD = lib/libopenvswitch.la $(SSL_LIBS) endif From 6f60f62c5aab3ee999064b2df6ac57f824938263 Mon Sep 17 00:00:00 2001 From: Sandeep N Date: Fri, 5 May 2023 20:54:49 +0530 Subject: [PATCH 805/833] Enable IPv6 support Changes include - Receiving IPv6 address from OVS tunnel configuration and calling ovs sidecar APIs - Changes to support Static MAC entries programming to the target.
Signed-off-by: Sandeep N --- include/openvswitch/ovs-p4rt.h | 5 ++++- ofproto/ofproto-dpif-xlate.c | 35 +++++++++++++++++++++++++++++++--- ofproto/ofproto-dpif.c | 19 ++++++++++++++++-- 3 files changed, 53 insertions(+), 6 deletions(-) diff --git a/include/openvswitch/ovs-p4rt.h b/include/openvswitch/ovs-p4rt.h index 74ec5350bcd..135595583e4 100644 --- a/include/openvswitch/ovs-p4rt.h +++ b/include/openvswitch/ovs-p4rt.h @@ -18,7 +18,10 @@ extern "C" { struct p4_ipaddr { uint8_t family; uint8_t prefix_len; - uint32_t v4addr; + union { + struct in_addr v4addr; + struct in6_addr v6addr; + } ip; }; struct tunnel_info { diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 345bfd8a39a..324f2ac0f22 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -3185,10 +3185,24 @@ get_fdb_data(struct xport *port, struct eth_addr mac_addr, } fdb_info->tnl_info.ifindex = (uint32_t)underlay_ifindex; - fdb_info->tnl_info.local_ip.v4addr = underlay_tnl->ipv6_src.__in6_u.__u6_addr32[3]; - fdb_info->tnl_info.remote_ip.v4addr = underlay_tnl->ipv6_dst.__in6_u.__u6_addr32[3]; fdb_info->tnl_info.dst_port = underlay_tnl->dst_port; fdb_info->tnl_info.vni = underlay_tnl->vni; + + if (underlay_tnl->ipv6_src.__in6_u.__u6_addr32[0]) { + /* IPv6 tunnel configuration */ + fdb_info->tnl_info.local_ip.family = AF_INET6; + fdb_info->tnl_info.local_ip.ip.v6addr = (struct in6_addr) underlay_tnl->ipv6_src; + + fdb_info->tnl_info.remote_ip.family = AF_INET6; + fdb_info->tnl_info.remote_ip.ip.v6addr = (struct in6_addr) underlay_tnl->ipv6_dst; + } else { + /* IPv4 tunnel configuration */ + fdb_info->tnl_info.local_ip.family = AF_INET; + fdb_info->tnl_info.local_ip.ip.v4addr.s_addr = underlay_tnl->ipv6_src.__in6_u.__u6_addr32[3]; + + fdb_info->tnl_info.remote_ip.family = AF_INET; + fdb_info->tnl_info.remote_ip.ip.v4addr.s_addr = underlay_tnl->ipv6_dst.__in6_u.__u6_addr32[3]; + } } else { const char *port_name = port->xbundle->name; if (strncmp(port_name, 
"vlan", strlen("vlan"))) { @@ -3293,7 +3307,7 @@ xlate_normal(struct xlate_ctx *ctx) } #if defined(P4OVS) - //MAC is learnt, program P4 forwarding table + /* Dynamic MAC is learnt, program P4 forwarding table */ struct xport *ovs_port = get_ofp_port(in_xbundle->xbridge, flow->in_port.ofp_port); struct mac_learning_info fdb_info; @@ -8861,6 +8875,21 @@ xlate_add_static_mac_entry(const struct ofproto_dpif *ofproto, return false; } +#if defined(P4OVS) + /* Static MAC is configured, program P4 forwarding table */ + struct xport *ovs_port = get_ofp_port(xbundle->xbridge, + in_port); + struct mac_learning_info fdb_info; + memset(&fdb_info, 0, sizeof(fdb_info)); + + if (!get_fdb_data(ovs_port, dl_src, &fdb_info)) { + ConfigFdbTableEntry(fdb_info, true); + } else { + VLOG_ERR("Error retrieving FDB information, skipping programming " + "P4 entry"); + } +#endif + return mac_learning_add_static_entry(ofproto->ml, dl_src, vlan, xbundle->ofbundle); } diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index 3955cd4f578..6ec2dcdc518 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -4159,8 +4159,23 @@ get_tunnel_data(struct netdev *netdev, return -1; } tnl_info->ifindex = (uint32_t)underlay_ifindex; - tnl_info->local_ip.v4addr = underlay_tnl->ipv6_src.__in6_u.__u6_addr32[3]; - tnl_info->remote_ip.v4addr = underlay_tnl->ipv6_dst.__in6_u.__u6_addr32[3]; + if (underlay_tnl->ipv6_src.__in6_u.__u6_addr32[0]) { + /* IPv6 tunnel configuration */ + tnl_info->local_ip.family = AF_INET6; + tnl_info->local_ip.ip.v6addr = (struct in6_addr) underlay_tnl->ipv6_src; + + tnl_info->remote_ip.family = AF_INET6; + tnl_info->remote_ip.ip.v6addr = (struct in6_addr) underlay_tnl->ipv6_dst; + + } else { + /* IPv4 tunnel configuration */ + tnl_info->local_ip.family = AF_INET; + tnl_info->local_ip.ip.v4addr.s_addr = underlay_tnl->ipv6_src.__in6_u.__u6_addr32[3]; + + tnl_info->remote_ip.family = AF_INET; + tnl_info->remote_ip.ip.v4addr.s_addr = 
underlay_tnl->ipv6_dst.__in6_u.__u6_addr32[3]; + } + tnl_info->dst_port = underlay_tnl->dst_port; tnl_info->vni = underlay_tnl->vni; From 91513477e2bb5f385d68639b98eeb05ac60e5a83 Mon Sep 17 00:00:00 2001 From: Derek G Foster Date: Sun, 18 Jun 2023 08:07:14 -0700 Subject: [PATCH 806/833] Add CODEOWNERS file Signed-off-by: Derek G Foster --- .github/CODEOWNERS | 1 + 1 file changed, 1 insertion(+) create mode 100644 .github/CODEOWNERS diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 00000000000..5569d581286 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1 @@ +* @ffoulkes @nupuruttarwar From c40710237cb47db57efd90a1755c80aafbcaa639 Mon Sep 17 00:00:00 2001 From: Derek G Foster Date: Wed, 9 Aug 2023 16:16:18 -0700 Subject: [PATCH 807/833] Ignore CODEOWNERS file in dist-hook-git Signed-off-by: Derek G Foster --- Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.am b/Makefile.am index dc5c34a6ae8..a4edd848540 100644 --- a/Makefile.am +++ b/Makefile.am @@ -208,7 +208,7 @@ ALL_LOCAL += dist-hook-git dist-hook-git: distfiles @if test -e $(srcdir)/.git && (git --version) >/dev/null 2>&1; then \ (cd $(srcdir) && git ls-files) | grep -v '\.gitignore$$' | \ - grep -v '\.gitattributes$$' | \ + grep -v '\.gitattributes$$' | grep -v 'CODEOWNERS' | \ LC_ALL=C sort -u > all-gitfiles; \ LC_ALL=C comm -1 -3 distfiles all-gitfiles > missing-distfiles; \ if test -s missing-distfiles; then \ From 62ea71354407076fefe84f932a5fbc0379236609 Mon Sep 17 00:00:00 2001 From: Sandeep N Date: Mon, 13 Nov 2023 02:00:27 +0530 Subject: [PATCH 808/833] Support linux networking version 2 This PR includes - Basic logic to include a unique bridge ID for each bridge creation. - Basic logic to include a unique SRC port ID for each vxlan tunnel creation. - Additional intelligence to read each port (including tunnel port) configuration and extract vlan id, vlan mode. 
- Configure other new P4 tables for vxlan_encap (V4 and V6) vxlan_encap_pop_vlan (V4 and V6) vxlan_decap (V4 and V6) vxlan_decap_push_vlan (V4 and V6) tunnel_term (V4 and V6) rx_tunnel (V4 and V6) vlan_push vlan_pop tunnel_src_port vsi_src_port - Delete P4 tables when port is deleted (including vxlan) or bridge table. Signed-off-by: Sandeep N --- include/openvswitch/ovs-p4rt.h | 41 ++++++ lib/mac-learning.c | 1 + lib/mac-learning.h | 3 + ofproto/ofproto-dpif-xlate.c | 85 ++++++++++++- ofproto/ofproto-dpif-xlate.h | 10 ++ ofproto/ofproto-dpif.c | 82 +++--------- ofproto/ofproto.h | 3 + vswitchd/bridge.c | 219 +++++++++++++++++++++++++++++++++ 8 files changed, 373 insertions(+), 71 deletions(-) diff --git a/include/openvswitch/ovs-p4rt.h b/include/openvswitch/ovs-p4rt.h index 135595583e4..1f58689450b 100644 --- a/include/openvswitch/ovs-p4rt.h +++ b/include/openvswitch/ovs-p4rt.h @@ -15,6 +15,20 @@ extern "C" { #endif +#define VPORT_ID_OFFSET 16 +#define MAX_P4_BRIDGE_ID 256 +#define P4_VXLAN_SOURCE_PORT_OFFSET 2048 + +/* This is a replica of port_vlan_mode in ofproto.h */ +enum p4_vlan_mode { + P4_PORT_VLAN_ACCESS, + P4_PORT_VLAN_TRUNK, + P4_PORT_VLAN_NATIVE_TAGGED, + P4_PORT_VLAN_NATIVE_UNTAGGED, + P4_PORT_VLAN_DOT1Q_TUNNEL, + P4_PORT_VLAN_UNSUPPORTED +}; + struct p4_ipaddr { uint8_t family; uint8_t prefix_len; @@ -24,13 +38,27 @@ struct p4_ipaddr { } ip; }; +struct port_vlan_info { + enum p4_vlan_mode port_vlan_mode; + int port_vlan; +}; + struct tunnel_info { uint32_t ifindex; uint32_t port_id; + uint32_t src_port; struct p4_ipaddr local_ip; struct p4_ipaddr remote_ip; uint16_t dst_port; uint16_t vni; + struct port_vlan_info vlan_info; + uint8_t bridge_id; +}; + +struct src_port_info { + uint8_t bridge_id; + uint16_t vlan_id; + uint32_t src_port; }; struct vlan_info { @@ -41,6 +69,9 @@ struct mac_learning_info { bool is_tunnel; bool is_vlan; uint8_t mac_addr[6]; + uint8_t bridge_id; + uint32_t src_port; + struct port_vlan_info vlan_info; union { struct 
tunnel_info tnl_info; struct vlan_info vln_info; @@ -52,6 +83,16 @@ extern void ConfigFdbTableEntry(struct mac_learning_info learn_info, bool insert_entry); extern void ConfigTunnelTableEntry(struct tunnel_info tunnel_info, bool insert_entry); +extern void ConfigTunnelSrcPortTableEntry(struct src_port_info tnl_sp, + bool insert_entry); +extern void ConfigSrcPortTableEntry(struct src_port_info vsi_sp, + bool insert_entry); +extern void ConfigVlanTableEntry(uint16_t vlan_id, + bool insert_entry); +extern void ConfigIpTunnelTermTableEntry(struct tunnel_info tunnel_info, + bool insert_entry); +extern void ConfigRxTunnelSrcTableEntry(struct tunnel_info tunnel_info, + bool insert_entry); #ifdef __cplusplus } // extern "C" diff --git a/lib/mac-learning.c b/lib/mac-learning.c index 763c032c9a0..bcefd2dd7c8 100644 --- a/lib/mac-learning.c +++ b/lib/mac-learning.c @@ -626,6 +626,7 @@ mac_learning_expire(struct mac_learning *ml, struct mac_entry *e) memset(&fdb_info, 0, sizeof(struct mac_learning_info)); memcpy(fdb_info.mac_addr, e->mac.ea, sizeof(fdb_info.mac_addr)); fdb_info.is_vlan = true; + fdb_info.bridge_id = ml->p4_bridge_id; ConfigFdbTableEntry(fdb_info, false); #endif free(e); diff --git a/lib/mac-learning.h b/lib/mac-learning.h index 270fbd70d40..48e4eafbd8c 100644 --- a/lib/mac-learning.h +++ b/lib/mac-learning.h @@ -189,6 +189,9 @@ struct mac_learning { * ports_by_ptr is a hash table indexed by the client-provided pointer. */ struct hmap ports_by_ptr; /* struct mac_learning_port hmap_nodes. */ struct heap ports_by_usage; /* struct mac_learning_port heap_nodes. 
*/ + + /* P4 specific bridge ID */ + uint8_t p4_bridge_id; }; int mac_entry_age(const struct mac_learning *ml, const struct mac_entry *e) diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 324f2ac0f22..b55c64b1b7b 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -76,6 +76,7 @@ #include #include #include "openvswitch/ovs-p4rt.h" +struct ofbundle; #endif //P4OVS COVERAGE_DEFINE(xlate_actions); @@ -165,6 +166,9 @@ struct xbundle { /* Use 802.1p tag for frames in VLAN 0? */ bool floodable; /* No port has OFPUTIL_PC_NO_FLOOD set? */ bool protected; /* Protected port mode */ +#if defined(P4OVS) + uint8_t p4_bridge_id; +#endif }; struct xport { @@ -702,6 +706,15 @@ static void xlate_xbridge_set(struct xbridge *, struct dpif *, bool forward_bpdu, bool has_in_band, const struct dpif_backer_support *, const struct xbridge_addr *); +#if defined(P4OVS) +static void xlate_xbundle_set(struct xbundle *xbundle, + enum port_vlan_mode vlan_mode, + uint16_t qinq_ethtype, int vlan, + unsigned long *trunks, unsigned long *cvlans, + enum port_priority_tags_mode, + const struct bond *bond, const struct lacp *lacp, + bool floodable, bool protected, uint8_t p4_bridge_id); +#elif static void xlate_xbundle_set(struct xbundle *xbundle, enum port_vlan_mode vlan_mode, uint16_t qinq_ethtype, int vlan, @@ -709,6 +722,7 @@ static void xlate_xbundle_set(struct xbundle *xbundle, enum port_priority_tags_mode, const struct bond *bond, const struct lacp *lacp, bool floodable, bool protected); +#endif static void xlate_xport_set(struct xport *xport, odp_port_t odp_port, const struct netdev *netdev, const struct cfm *cfm, const struct bfd *bfd, const struct lldp *lldp, @@ -1143,6 +1157,15 @@ xlate_xbridge_set(struct xbridge *xbridge, xbridge->support = *support; } +#if defined(P4OVS) +static void +xlate_xbundle_set(struct xbundle *xbundle, + enum port_vlan_mode vlan_mode, uint16_t qinq_ethtype, + int vlan, unsigned long *trunks, unsigned long 
*cvlans, + enum port_priority_tags_mode use_priority_tags, + const struct bond *bond, const struct lacp *lacp, + bool floodable, bool protected, uint8_t p4_bridge_id) +#elif static void xlate_xbundle_set(struct xbundle *xbundle, enum port_vlan_mode vlan_mode, uint16_t qinq_ethtype, @@ -1150,6 +1173,7 @@ xlate_xbundle_set(struct xbundle *xbundle, enum port_priority_tags_mode use_priority_tags, const struct bond *bond, const struct lacp *lacp, bool floodable, bool protected) +#endif { ovs_assert(xbundle->xbridge); @@ -1165,6 +1189,10 @@ xlate_xbundle_set(struct xbundle *xbundle, xbundle->floodable = floodable; xbundle->protected = protected; +#if defined(P4OVS) + xbundle->p4_bridge_id = p4_bridge_id; +#endif + if (xbundle->bond != bond) { bond_unref(xbundle->bond); xbundle->bond = bond_ref(bond); @@ -1257,10 +1285,18 @@ xlate_xbundle_copy(struct xbridge *xbridge, struct xbundle *xbundle) new_xbundle->name = xstrdup(xbundle->name); xlate_xbundle_init(new_xcfg, new_xbundle); +#if defined(P4OVS) + xlate_xbundle_set(new_xbundle, xbundle->vlan_mode, xbundle->qinq_ethtype, + xbundle->vlan, xbundle->trunks, xbundle->cvlans, + xbundle->use_priority_tags, xbundle->bond, xbundle->lacp, + xbundle->floodable, xbundle->protected, + xbundle->p4_bridge_id); +#elif xlate_xbundle_set(new_xbundle, xbundle->vlan_mode, xbundle->qinq_ethtype, xbundle->vlan, xbundle->trunks, xbundle->cvlans, xbundle->use_priority_tags, xbundle->bond, xbundle->lacp, xbundle->floodable, xbundle->protected); +#endif LIST_FOR_EACH (xport, bundle_node, &xbundle->xports) { xlate_xport_copy(xbridge, new_xbundle, xport); } @@ -1469,6 +1505,16 @@ xlate_remove_ofproto(struct ofproto_dpif *ofproto) xlate_xbridge_remove(new_xcfg, xbridge); } +#if defined(P4OVS) +void +xlate_bundle_set(struct ofproto_dpif *ofproto, struct ofbundle *ofbundle, + const char *name, enum port_vlan_mode vlan_mode, + uint16_t qinq_ethtype, int vlan, + unsigned long *trunks, unsigned long *cvlans, + enum port_priority_tags_mode 
use_priority_tags, + const struct bond *bond, const struct lacp *lacp, + bool floodable, bool protected, uint8_t p4_bridge_id) +#elif void xlate_bundle_set(struct ofproto_dpif *ofproto, struct ofbundle *ofbundle, const char *name, enum port_vlan_mode vlan_mode, @@ -1477,6 +1523,7 @@ xlate_bundle_set(struct ofproto_dpif *ofproto, struct ofbundle *ofbundle, enum port_priority_tags_mode use_priority_tags, const struct bond *bond, const struct lacp *lacp, bool floodable, bool protected) +#endif { struct xbundle *xbundle; @@ -1494,8 +1541,14 @@ xlate_bundle_set(struct ofproto_dpif *ofproto, struct ofbundle *ofbundle, free(xbundle->name); xbundle->name = xstrdup(name); +#if defined(P4OVS) + xlate_xbundle_set(xbundle, vlan_mode, qinq_ethtype, vlan, trunks, cvlans, + use_priority_tags, bond, lacp, floodable, protected, + p4_bridge_id); +#elif xlate_xbundle_set(xbundle, vlan_mode, qinq_ethtype, vlan, trunks, cvlans, use_priority_tags, bond, lacp, floodable, protected); +#endif } static void @@ -3160,6 +3213,22 @@ is_ip_local_multicast(const struct flow *flow, struct flow_wildcards *wc) } #if defined(P4OVS) +static enum p4_vlan_mode +get_p4_vlan_mode(enum port_vlan_mode vlan_mode) { + if (vlan_mode == PORT_VLAN_ACCESS) + return P4_PORT_VLAN_ACCESS; + else if (vlan_mode == PORT_VLAN_TRUNK) + return P4_PORT_VLAN_TRUNK; + else if (vlan_mode == PORT_VLAN_NATIVE_TAGGED) + return P4_PORT_VLAN_NATIVE_TAGGED; + else if (vlan_mode == PORT_VLAN_NATIVE_UNTAGGED) + return P4_PORT_VLAN_NATIVE_UNTAGGED; + else if (vlan_mode == PORT_VLAN_DOT1Q_TUNNEL) + return P4_PORT_VLAN_DOT1Q_TUNNEL; + else + return -1; +} + static int32_t get_fdb_data(struct xport *port, struct eth_addr mac_addr, struct mac_learning_info *fdb_info) @@ -3169,8 +3238,13 @@ get_fdb_data(struct xport *port, struct eth_addr mac_addr, } memcpy(fdb_info->mac_addr, mac_addr.ea, sizeof(fdb_info->mac_addr)); + fdb_info->bridge_id = port->xbundle->p4_bridge_id; + fdb_info->vlan_info.port_vlan_mode = 
get_p4_vlan_mode(port->xbundle->vlan_mode); + fdb_info->vlan_info.port_vlan = port->xbundle->vlan; + if (port->is_tunnel) { fdb_info->is_tunnel = port->is_tunnel; + const struct netdev_tunnel_config *underlay_tnl = NULL; underlay_tnl = netdev_get_tunnel_config(port->netdev); if (!underlay_tnl) { @@ -3206,8 +3280,7 @@ get_fdb_data(struct xport *port, struct eth_addr mac_addr, } else { const char *port_name = port->xbundle->name; if (strncmp(port_name, "vlan", strlen("vlan"))) { - VLOG_ERR("Not a VLAN interface, port name = %s", port_name); - return -1; + VLOG_DBG("Continue, this is latest LNW"); } else { fdb_info->is_vlan = true; int fd = socket(AF_INET, SOCK_DGRAM, 0); @@ -3315,6 +3388,7 @@ xlate_normal(struct xlate_ctx *ctx) if (!get_fdb_data(ovs_port, flow->dl_src, &fdb_info)) { ConfigFdbTableEntry(fdb_info, true); + ctx->xbridge->ml->p4_bridge_id = ovs_port->xbundle->p4_bridge_id; } else { VLOG_ERR("Error retrieving FDB information, skipping programming " "P4 entry"); @@ -8883,7 +8957,12 @@ xlate_add_static_mac_entry(const struct ofproto_dpif *ofproto, memset(&fdb_info, 0, sizeof(fdb_info)); if (!get_fdb_data(ovs_port, dl_src, &fdb_info)) { - ConfigFdbTableEntry(fdb_info, true); + struct eth_addr smac; + int err = netdev_get_etheraddr(ovs_port->netdev, &smac); + if (!err) { + ConfigFdbTableEntry(fdb_info, true); + } + ofproto->ml->p4_bridge_id = ovs_port->xbundle->p4_bridge_id; } else { VLOG_ERR("Error retrieving FDB information, skipping programming " "P4 entry"); diff --git a/ofproto/ofproto-dpif-xlate.h b/ofproto/ofproto-dpif-xlate.h index d973a634aca..be7caa45182 100644 --- a/ofproto/ofproto-dpif-xlate.h +++ b/ofproto/ofproto-dpif-xlate.h @@ -187,6 +187,15 @@ void xlate_ofproto_set(struct ofproto_dpif *, const char *name, struct dpif *, void xlate_remove_ofproto(struct ofproto_dpif *); struct ofproto_dpif *xlate_ofproto_lookup(const struct uuid *uuid); +#if defined(P4OVS) +void xlate_bundle_set(struct ofproto_dpif *, struct ofbundle *, + const char *name, 
enum port_vlan_mode, + uint16_t qinq_ethtype, int vlan, + unsigned long *trunks, unsigned long *cvlans, + enum port_priority_tags_mode, + const struct bond *, const struct lacp *, + bool floodable, bool protected, uint8_t p4_bridge_id); +#elif void xlate_bundle_set(struct ofproto_dpif *, struct ofbundle *, const char *name, enum port_vlan_mode, uint16_t qinq_ethtype, int vlan, @@ -194,6 +203,7 @@ void xlate_bundle_set(struct ofproto_dpif *, struct ofbundle *, enum port_priority_tags_mode, const struct bond *, const struct lacp *, bool floodable, bool protected); +#endif void xlate_bundle_remove(struct ofbundle *); void xlate_ofport_set(struct ofproto_dpif *, struct ofbundle *, diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index 6ec2dcdc518..9da401580c4 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -77,9 +77,6 @@ #if defined(P4OVS) #include "openvswitch/ovs-p4rt.h" -static int32_t -get_tunnel_data(struct netdev *netdev, - struct tunnel_info *tnl_info); #endif VLOG_DEFINE_THIS_MODULE(ofproto_dpif); @@ -117,6 +114,9 @@ struct ofbundle { /* Status. */ bool floodable; /* True if no port has OFPUTIL_PC_NO_FLOOD set. 
*/ +#if defined(P4OVS) + uint8_t p4_bridge_id; +#endif }; static void bundle_remove(struct ofport *); @@ -499,12 +499,22 @@ type_run(const char *type) &ofproto->backer->rt_support); HMAP_FOR_EACH (bundle, hmap_node, &ofproto->bundles) { +#if defined(P4OVS) + xlate_bundle_set(ofproto, bundle, bundle->name, + bundle->vlan_mode, bundle->qinq_ethtype, + bundle->vlan, bundle->trunks, bundle->cvlans, + bundle->use_priority_tags, + bundle->bond, bundle->lacp, + bundle->floodable, bundle->protected, + bundle->p4_bridge_id); +#elif xlate_bundle_set(ofproto, bundle, bundle->name, bundle->vlan_mode, bundle->qinq_ethtype, bundle->vlan, bundle->trunks, bundle->cvlans, bundle->use_priority_tags, bundle->bond, bundle->lacp, bundle->floodable, bundle->protected); +#endif } HMAP_FOR_EACH (ofport, up.hmap_node, &ofproto->up.ports) { @@ -2362,18 +2372,6 @@ port_destruct(struct ofport *port_, bool del) } tnl_port_del(port, port->odp_port); -#if defined(P4OVS) - if (port->is_tunnel) { - struct tunnel_info tnl_info; - memset(&tnl_info, 0, sizeof(tnl_info)); - if (!get_tunnel_data(port->up.netdev, &tnl_info)) { - ConfigTunnelTableEntry(tnl_info, false); - } else { - VLOG_ERR("Error retrieving tunnel information, skipping programming " - "P4 entry"); - } - } -#endif sset_find_and_delete(&ofproto->ports, devname); sset_find_and_delete(&ofproto->ghost_ports, devname); bundle_remove(port_); @@ -3507,6 +3505,7 @@ bundle_set(struct ofproto *ofproto_, void *aux, bundle->floodable = true; bundle->protected = false; + bundle->p4_bridge_id = s->p4_bridge_id; mbridge_register_bundle(ofproto->mbridge, bundle); } @@ -4142,47 +4141,6 @@ port_query_by_name(const struct ofproto *ofproto_, const char *devname, return error; } -#if defined(P4OVS) -static int32_t -get_tunnel_data(struct netdev *netdev, - struct tunnel_info *tnl_info) -{ - const struct netdev_tunnel_config *underlay_tnl = NULL; - underlay_tnl = netdev_get_tunnel_config(netdev); - if (!underlay_tnl) { - VLOG_ERR("Error retrieving netdev 
tunnel config"); - return -1; - } - int underlay_ifindex = netdev_get_ifindex(netdev); - if (underlay_ifindex < 0) { - VLOG_ERR("Invalid tunnel ifindex"); - return -1; - } - tnl_info->ifindex = (uint32_t)underlay_ifindex; - if (underlay_tnl->ipv6_src.__in6_u.__u6_addr32[0]) { - /* IPv6 tunnel configuration */ - tnl_info->local_ip.family = AF_INET6; - tnl_info->local_ip.ip.v6addr = (struct in6_addr) underlay_tnl->ipv6_src; - - tnl_info->remote_ip.family = AF_INET6; - tnl_info->remote_ip.ip.v6addr = (struct in6_addr) underlay_tnl->ipv6_dst; - - } else { - /* IPv4 tunnel configuration */ - tnl_info->local_ip.family = AF_INET; - tnl_info->local_ip.ip.v4addr.s_addr = underlay_tnl->ipv6_src.__in6_u.__u6_addr32[3]; - - tnl_info->remote_ip.family = AF_INET; - tnl_info->remote_ip.ip.v4addr.s_addr = underlay_tnl->ipv6_dst.__in6_u.__u6_addr32[3]; - } - - tnl_info->dst_port = underlay_tnl->dst_port; - tnl_info->vni = underlay_tnl->vni; - - return 0; -} -#endif - static int port_add(struct ofproto *ofproto_, struct netdev *netdev) { @@ -4221,18 +4179,6 @@ port_add(struct ofproto *ofproto_, struct netdev *netdev) sset_add(&ofproto->ports, devname); } -#if defined(P4OVS) - if (netdev_get_tunnel_config(netdev)) { - struct tunnel_info tnl_info; - memset(&tnl_info, 0, sizeof(tnl_info)); - if (!get_tunnel_data(netdev, &tnl_info)) { - ConfigTunnelTableEntry(tnl_info, true); - } else { - VLOG_ERR("Error retrieving tunnel information, skipping programming " - "P4 entry"); - } - } -#endif return 0; } diff --git a/ofproto/ofproto.h b/ofproto/ofproto.h index 642a9d001f9..df69f3ce620 100644 --- a/ofproto/ofproto.h +++ b/ofproto/ofproto.h @@ -484,6 +484,9 @@ struct ofproto_bundle_settings { struct lacp_member_settings *lacp_members; /* Array of n_members elements. 
*/ bool protected; /* Protected port mode */ +#if defined(P4OVS) + uint8_t p4_bridge_id; /* Unique bridge ID used by P4 tables */ +#endif }; int ofproto_bundle_register(struct ofproto *, void *aux, diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index 6bb687f4b13..29269d6fc57 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -72,6 +72,19 @@ #include "lib/vswitch-idl.h" #include "vlan-bitmap.h" +#if defined(P4OVS) +#include "openvswitch/ovs-p4rt.h" +#include + +static int32_t +get_tunnel_data(struct netdev *netdev, + struct tunnel_info *tnl_info); + +uint8_t last_p4_bridge_id_used = 0; +uint32_t unique_tunnel_src_port = P4_VXLAN_SOURCE_PORT_OFFSET; + +#endif + VLOG_DEFINE_THIS_MODULE(bridge); COVERAGE_DEFINE(bridge_reconfigure); @@ -108,6 +121,13 @@ struct port { struct bridge *bridge; char *name; +#if defined(P4OVS) + uint16_t p4_src_port; + bool is_src_port_configured; + uint16_t p4_vlan_id; + enum p4_vlan_mode p4_vlan_mode; +#endif + const struct ovsrec_port *cfg; /* An ordinary bridge port has 1 interface. @@ -119,6 +139,9 @@ struct bridge { struct hmap_node node; /* In 'all_bridges'. */ char *name; /* User-specified arbitrary name. */ char *type; /* Datapath type. */ +#if defined(P4OVS) + uint8_t p4_bridge_id; /* Unique bridge ID used by P4 tables */ +#endif struct eth_addr ea; /* Bridge Ethernet Address. */ struct eth_addr default_ea; /* Default MAC. */ const struct ovsrec_bridge *cfg; @@ -263,6 +286,7 @@ static uint64_t last_ifaces_changed; #define BRIDGE_CONTROLLER_PACKET_QUEUE_MIN_SIZE 1 #define BRIDGE_CONTROLLER_PACKET_QUEUE_MAX_SIZE 512 + static void add_del_bridges(const struct ovsrec_open_vswitch *); static void bridge_run__(void); static void bridge_create(const struct ovsrec_bridge *); @@ -1348,6 +1372,10 @@ port_configure(struct port *port) /* Protected port mode */ s.protected = cfg->protected_; +#if defined(P4OVS) + s.p4_bridge_id = port->bridge->p4_bridge_id; +#endif + /* Register. 
*/ ofproto_bundle_register(port->bridge->ofproto, port, &s); @@ -2224,6 +2252,171 @@ iface_do_create(const struct bridge *br, return error; } +#if defined(P4OVS) +static int32_t +get_tunnel_data(struct netdev *netdev, + struct tunnel_info *tnl_info) +{ + const struct netdev_tunnel_config *underlay_tnl = NULL; + underlay_tnl = netdev_get_tunnel_config(netdev); + if (!underlay_tnl) { + VLOG_ERR("Error retrieving netdev tunnel config"); + return -1; + } + int underlay_ifindex = netdev_get_ifindex(netdev); + if (underlay_ifindex < 0) { + VLOG_ERR("Invalid tunnel ifindex"); + return -1; + } + tnl_info->ifindex = (uint32_t)underlay_ifindex; + if (underlay_tnl->ipv6_src.__in6_u.__u6_addr32[0]) { + /* IPv6 tunnel configuration */ + tnl_info->local_ip.family = AF_INET6; + tnl_info->local_ip.ip.v6addr = (struct in6_addr) underlay_tnl->ipv6_src; + + tnl_info->remote_ip.family = AF_INET6; + tnl_info->remote_ip.ip.v6addr = (struct in6_addr) underlay_tnl->ipv6_dst; + + } else { + /* IPv4 tunnel configuration */ + tnl_info->local_ip.family = AF_INET; + tnl_info->local_ip.ip.v4addr.s_addr = underlay_tnl->ipv6_src.__in6_u.__u6_addr32[3]; + + tnl_info->remote_ip.family = AF_INET; + tnl_info->remote_ip.ip.v4addr.s_addr = underlay_tnl->ipv6_dst.__in6_u.__u6_addr32[3]; + } + + tnl_info->dst_port = underlay_tnl->dst_port; + tnl_info->vni = underlay_tnl->vni; + + return 0; +} + +static bool +get_p4_vlan_info(const struct ovsrec_port *cfg, + struct port *port) { + if (cfg && cfg->vlan_mode) { + if (!strcmp(cfg->vlan_mode, "native-tagged")) { + port->p4_vlan_mode = P4_PORT_VLAN_NATIVE_TAGGED; + port->p4_vlan_id = *cfg->tag; + } else if (!strcmp(cfg->vlan_mode, "native-untagged")) { + port->p4_vlan_mode = P4_PORT_VLAN_NATIVE_UNTAGGED; + port->p4_vlan_id = *cfg->tag; + } else { + /* Do Nothing, no support yet */ + port->p4_vlan_mode = P4_PORT_VLAN_UNSUPPORTED; + port->p4_vlan_id = 0; + VLOG_DBG("Unsupported VLAN mode for the P4 target"); + return false; + } + return true; + } + return false; 
+} + +static uint32_t +GetSrcPortVsiId(char *mac_addr) { + struct ether_addr *ea; + + ea = ether_aton((const char *)mac_addr); + if (!ea) { + VLOG_ERR("Cannot convert MAC address: %s to binary data", mac_addr); + return 0; + } + return ea->ether_addr_octet[1] + VPORT_ID_OFFSET; +} + +static void +ConfigureP4Target(struct bridge *br, struct port *port, + struct iface *iface, bool insert_entry) { + if (!iface->cfg || !iface->cfg->type) { + VLOG_DBG("Invalid interface data to configure P4 Target"); + return; + } + + if (!strcmp(iface->cfg->type, "internal")) { + VLOG_DBG("Ignore OVS specific internal interfaces"); + return; + } + + /* Update parent bridge's unique ID in port structure */ + if (!strcmp(iface->cfg->type, "vxlan") || + netdev_get_tunnel_config(iface->netdev)) { + /* Handling VxLAN source port addition */ + struct tunnel_info tnl_info; + + memset(&tnl_info, 0, sizeof(tnl_info)); + + if (insert_entry) { + get_p4_vlan_info(port->cfg, port); + port->p4_src_port = unique_tunnel_src_port++; + } + + if (!get_tunnel_data(iface->netdev, &tnl_info)) { + tnl_info.vlan_info.port_vlan = port->p4_vlan_id; + tnl_info.vlan_info.port_vlan_mode = port->p4_vlan_mode; + tnl_info.bridge_id = br->p4_bridge_id; + tnl_info.src_port = port->p4_src_port; + + ConfigTunnelTableEntry(tnl_info, insert_entry); + ConfigIpTunnelTermTableEntry(tnl_info, insert_entry); + ConfigRxTunnelSrcTableEntry(tnl_info, insert_entry); + } else { + VLOG_ERR("Error retrieving tunnel information, " + "skipping programming P4 entry"); + } + + if (port->p4_vlan_mode == P4_PORT_VLAN_NATIVE_TAGGED || + port->p4_vlan_mode == P4_PORT_VLAN_NATIVE_UNTAGGED) { + struct src_port_info tnl_src_port_info = {br->p4_bridge_id, + port->p4_vlan_id, + port->p4_src_port}; + /* When VLAN tag is configured */ + ConfigVlanTableEntry(port->p4_vlan_id, insert_entry); + ConfigTunnelSrcPortTableEntry(tnl_src_port_info, insert_entry); + } else { + /* Wild card VLAN 0 */ + struct src_port_info tnl_src_port_info = 
{br->p4_bridge_id, + 0, + port->p4_src_port}; + + ConfigTunnelSrcPortTableEntry(tnl_src_port_info, insert_entry); + } + port->is_src_port_configured = insert_entry; + } else if (!insert_entry || iface->cfg->mac_in_use) { + /* Handling VSI source port addition */ + if (insert_entry) { + get_p4_vlan_info(port->cfg, port); + port->p4_src_port = GetSrcPortVsiId(iface->cfg->mac_in_use); + } + + if (port->p4_src_port && + (port->p4_vlan_mode == P4_PORT_VLAN_NATIVE_TAGGED || + port->p4_vlan_mode == P4_PORT_VLAN_NATIVE_UNTAGGED)) { + struct src_port_info vsi_src_port_info = {br->p4_bridge_id, + port->p4_vlan_id, + port->p4_src_port}; + + ConfigVlanTableEntry(port->p4_vlan_id, insert_entry); + ConfigSrcPortTableEntry(vsi_src_port_info, insert_entry); + } else if (port->p4_vlan_mode == P4_PORT_VLAN_UNSUPPORTED) { + /* Do nothing, unsupported vlan mode */ + } else if (port->p4_src_port) { + struct src_port_info vsi_src_port_info = {br->p4_bridge_id, + 0, + port->p4_src_port}; + + ConfigSrcPortTableEntry(vsi_src_port_info, insert_entry); + } else { + VLOG_DBG("Invalid P4 use case for source port to " + "bridge mapping"); + } + port->is_src_port_configured = insert_entry; + } + return; +} +#endif + /* Creates a new iface on 'br' based on 'if_cfg'. The new iface has OpenFlow * port number 'ofp_port'. If ofp_port is OFPP_NONE, an OpenFlow port is * automatically allocated for the iface. Takes ownership of and @@ -2274,6 +2467,12 @@ iface_create(struct bridge *br, const struct ovsrec_interface *iface_cfg, iface_refresh_stats(iface); iface_refresh_netdev_status(iface); +#if defined(P4OVS) + if (!port->is_src_port_configured) { + ConfigureP4Target(br, port, iface, true); + } +#endif + /* Add bond fake iface if necessary. 
*/ if (port_is_bond_fake_iface(port)) { struct ofproto_port ofproto_port; @@ -3697,6 +3896,18 @@ bridge_create(const struct ovsrec_bridge *br_cfg) hmap_init(&br->mappings); hmap_insert(&all_bridges, &br->node, hash_string(br->name, 0)); + +#if defined(P4OVS) + /* TODO: Implement a better logic for unique bridge ID of type uint8. */ + if (last_p4_bridge_id_used <= MAX_P4_BRIDGE_ID) { + br->p4_bridge_id = last_p4_bridge_id_used++; + VLOG_DBG("Assigned unique P4 bridge ID of: %d, for bridge: %s", + br->p4_bridge_id, br->name); + } else { + VLOG_WARN("Unable to assign unique P4 bridge ID for bridge: %s, reached" + " max P4 bridge ID limit of %d", br->name, MAX_P4_BRIDGE_ID); + } +#endif } static void @@ -4777,6 +4988,12 @@ iface_destroy__(struct iface *iface) VLOG_INFO("bridge %s: deleted interface %s on port %d", br->name, iface->name, iface->ofp_port); +#if defined(P4OVS) + if (port->is_src_port_configured) { + ConfigureP4Target(br, port, iface, false); + } +#endif + if (br->ofproto && iface->ofp_port != OFPP_NONE) { ofproto_port_unregister(br->ofproto, iface->ofp_port); } @@ -5365,3 +5582,5 @@ discover_types(const struct ovsrec_open_vswitch *cfg) free(iface_types); sset_destroy(&types); } + + From c7195f2ce9af2700f2fd61e5be218dd08c8fc085 Mon Sep 17 00:00:00 2001 From: Sandeep N Date: Fri, 17 Nov 2023 15:56:28 +0530 Subject: [PATCH 809/833] Address review comments and fix defects Signed-off-by: Sandeep N --- include/openvswitch/ovs-p4rt.h | 66 ++++++++++++++++++---------------- lib/mac-learning.h | 5 +-- ofproto/ofproto-dpif-xlate.c | 47 ++++++++++++++++-------- ofproto/ofproto-dpif.c | 4 ++- vswitchd/bridge.c | 16 ++++++--- 5 files changed, 85 insertions(+), 53 deletions(-) diff --git a/include/openvswitch/ovs-p4rt.h b/include/openvswitch/ovs-p4rt.h index 1f58689450b..1f035886570 100644 --- a/include/openvswitch/ovs-p4rt.h +++ b/include/openvswitch/ovs-p4rt.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Intel Corporation. + * Copyright (c) 2022-2023 Intel Corporation. 
* SPDX-License-Identifier: Apache-2.0 * * Defines the public interface to an externally-supplied module @@ -15,8 +15,14 @@ extern "C" { #endif -#define VPORT_ID_OFFSET 16 -#define MAX_P4_BRIDGE_ID 256 +/* When VSI ID is used as an action, we need add an offset of 16 and populate + * the action */ +#define VSI_ID_OFFSET 16 +/* As p4 program uses 8 bits for bridge ID, current limitation is we can go max + * of 256 bridges (0-255) */ +#define MAX_P4_BRIDGE_ID 255 +/* Source port for VxLAN should start from 2048, 0 to 2047 are reserved for + * VSI/phy ports */ #define P4_VXLAN_SOURCE_PORT_OFFSET 2048 /* This is a replica of port_vlan_mode in ofproto.h */ @@ -44,38 +50,38 @@ struct port_vlan_info { }; struct tunnel_info { - uint32_t ifindex; - uint32_t port_id; - uint32_t src_port; - struct p4_ipaddr local_ip; - struct p4_ipaddr remote_ip; - uint16_t dst_port; - uint16_t vni; - struct port_vlan_info vlan_info; - uint8_t bridge_id; + uint32_t ifindex; + uint32_t port_id; + uint32_t src_port; + struct p4_ipaddr local_ip; + struct p4_ipaddr remote_ip; + uint16_t dst_port; + uint16_t vni; + struct port_vlan_info vlan_info; + uint8_t bridge_id; }; struct src_port_info { - uint8_t bridge_id; - uint16_t vlan_id; - uint32_t src_port; + uint8_t bridge_id; + uint16_t vlan_id; + uint32_t src_port; }; struct vlan_info { - uint32_t vlan_id; + uint32_t vlan_id; }; struct mac_learning_info { - bool is_tunnel; - bool is_vlan; - uint8_t mac_addr[6]; - uint8_t bridge_id; - uint32_t src_port; - struct port_vlan_info vlan_info; - union { - struct tunnel_info tnl_info; - struct vlan_info vln_info; - }; + bool is_tunnel; + bool is_vlan; + uint8_t mac_addr[6]; + uint8_t bridge_id; + uint32_t src_port; + struct port_vlan_info vlan_info; + union { + struct tunnel_info tnl_info; + struct vlan_info vln_info; + }; }; // Function declarations @@ -87,16 +93,14 @@ extern void ConfigTunnelSrcPortTableEntry(struct src_port_info tnl_sp, bool insert_entry); extern void ConfigSrcPortTableEntry(struct 
src_port_info vsi_sp, bool insert_entry); -extern void ConfigVlanTableEntry(uint16_t vlan_id, - bool insert_entry); +extern void ConfigVlanTableEntry(uint16_t vlan_id, bool insert_entry); extern void ConfigIpTunnelTermTableEntry(struct tunnel_info tunnel_info, bool insert_entry); extern void ConfigRxTunnelSrcTableEntry(struct tunnel_info tunnel_info, bool insert_entry); #ifdef __cplusplus -} // extern "C" +} // extern "C" #endif -#endif // OPENVSWITCH_OVS_P4RT_H - +#endif // OPENVSWITCH_OVS_P4RT_H diff --git a/lib/mac-learning.h b/lib/mac-learning.h index 48e4eafbd8c..44382f0bbf7 100644 --- a/lib/mac-learning.h +++ b/lib/mac-learning.h @@ -190,8 +190,9 @@ struct mac_learning { struct hmap ports_by_ptr; /* struct mac_learning_port hmap_nodes. */ struct heap ports_by_usage; /* struct mac_learning_port heap_nodes. */ - /* P4 specific bridge ID */ - uint8_t p4_bridge_id; +#if defined(P4OVS) + uint8_t p4_bridge_id; /* P4 specific bridge ID */ +#endif }; int mac_entry_age(const struct mac_learning *ml, const struct mac_entry *e) diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index b55c64b1b7b..662b8a30258 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -76,7 +76,6 @@ #include #include #include "openvswitch/ovs-p4rt.h" -struct ofbundle; #endif //P4OVS COVERAGE_DEFINE(xlate_actions); @@ -167,7 +166,7 @@ struct xbundle { bool floodable; /* No port has OFPUTIL_PC_NO_FLOOD set? 
*/ bool protected; /* Protected port mode */ #if defined(P4OVS) - uint8_t p4_bridge_id; + uint8_t p4_bridge_id; /* P4 specific bridge ID for this xbundle */ #endif }; @@ -3223,23 +3222,30 @@ get_p4_vlan_mode(enum port_vlan_mode vlan_mode) { return P4_PORT_VLAN_NATIVE_TAGGED; else if (vlan_mode == PORT_VLAN_NATIVE_UNTAGGED) return P4_PORT_VLAN_NATIVE_UNTAGGED; - else if (vlan_mode == PORT_VLAN_DOT1Q_TUNNEL) - return P4_PORT_VLAN_DOT1Q_TUNNEL; else - return -1; + return P4_PORT_VLAN_UNSUPPORTED; } static int32_t get_fdb_data(struct xport *port, struct eth_addr mac_addr, struct mac_learning_info *fdb_info) { + enum p4_vlan_mode v_mode; if (!port || !port->netdev || !port->xbundle) { return -1; } memcpy(fdb_info->mac_addr, mac_addr.ea, sizeof(fdb_info->mac_addr)); fdb_info->bridge_id = port->xbundle->p4_bridge_id; - fdb_info->vlan_info.port_vlan_mode = get_p4_vlan_mode(port->xbundle->vlan_mode); + + v_mode = get_p4_vlan_mode(port->xbundle->vlan_mode); + + if (v_mode == P4_PORT_VLAN_UNSUPPORTED) { + VLOG_DBG("Unsupported VLAN mode"); + return -1; + } + + fdb_info->vlan_info.port_vlan_mode = v_mode; fdb_info->vlan_info.port_vlan = port->xbundle->vlan; if (port->is_tunnel) { @@ -3280,6 +3286,23 @@ get_fdb_data(struct xport *port, struct eth_addr mac_addr, } else { const char *port_name = port->xbundle->name; if (strncmp(port_name, "vlan", strlen("vlan"))) { + struct eth_addr smac; + int err = netdev_get_etheraddr(port->netdev, &smac); + if (err) { + VLOG_DBG("Cannot retrieve Source MAC address for port: %s", + port_name); + return -1; + } + if (!memcmp(smac.ea, mac_addr.ea, sizeof(smac))) { + VLOG_DBG("Ignore self MAC learn use case for port: %s", + port_name); + return -1; + + } + /* this SRC port MAC is needed to configure FDB entry + * for its corresponding HOST port or Phy port. 
+ */ + fdb_info->src_port = smac.ea[1] + VSI_ID_OFFSET; VLOG_DBG("Continue, this is latest LNW"); } else { fdb_info->is_vlan = true; @@ -3390,11 +3413,11 @@ xlate_normal(struct xlate_ctx *ctx) ConfigFdbTableEntry(fdb_info, true); ctx->xbridge->ml->p4_bridge_id = ovs_port->xbundle->p4_bridge_id; } else { - VLOG_ERR("Error retrieving FDB information, skipping programming " + VLOG_DBG("Error retrieving FDB information, skipping programming " "P4 entry"); } #endif - + if (ctx->xin->xcache && in_xbundle != &ofpp_none_bundle) { struct xc_entry *entry; @@ -8957,14 +8980,10 @@ xlate_add_static_mac_entry(const struct ofproto_dpif *ofproto, memset(&fdb_info, 0, sizeof(fdb_info)); if (!get_fdb_data(ovs_port, dl_src, &fdb_info)) { - struct eth_addr smac; - int err = netdev_get_etheraddr(ovs_port->netdev, &smac); - if (!err) { - ConfigFdbTableEntry(fdb_info, true); - } + ConfigFdbTableEntry(fdb_info, true); ofproto->ml->p4_bridge_id = ovs_port->xbundle->p4_bridge_id; } else { - VLOG_ERR("Error retrieving FDB information, skipping programming " + VLOG_DBG("Error retrieving FDB information, skipping programming " "P4 entry"); } #endif diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index 9da401580c4..a65089f740b 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -115,7 +115,7 @@ struct ofbundle { /* Status. */ bool floodable; /* True if no port has OFPUTIL_PC_NO_FLOOD set. 
*/ #if defined(P4OVS) - uint8_t p4_bridge_id; + uint8_t p4_bridge_id; /* P4 specific bridge ID for this ofbundle */ #endif }; @@ -3505,7 +3505,9 @@ bundle_set(struct ofproto *ofproto_, void *aux, bundle->floodable = true; bundle->protected = false; +#if defined(P4OVS) bundle->p4_bridge_id = s->p4_bridge_id; +#endif mbridge_register_bundle(ofproto->mbridge, bundle); } diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index 29269d6fc57..00c51e56aa4 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -2323,7 +2323,7 @@ GetSrcPortVsiId(char *mac_addr) { VLOG_ERR("Cannot convert MAC address: %s to binary data", mac_addr); return 0; } - return ea->ether_addr_octet[1] + VPORT_ID_OFFSET; + return ea->ether_addr_octet[1] + VSI_ID_OFFSET; } static void @@ -2366,8 +2366,11 @@ ConfigureP4Target(struct bridge *br, struct port *port, "skipping programming P4 entry"); } - if (port->p4_vlan_mode == P4_PORT_VLAN_NATIVE_TAGGED || - port->p4_vlan_mode == P4_PORT_VLAN_NATIVE_UNTAGGED) { + if (port->p4_vlan_mode == P4_PORT_VLAN_NATIVE_TAGGED) { + /* only for native VLAN ports we need to add VLAN when + * configuring SRC port table. As this port only accepts + * TAGGED packets + */ struct src_port_info tnl_src_port_info = {br->p4_bridge_id, port->p4_vlan_id, port->p4_src_port}; @@ -2391,8 +2394,11 @@ ConfigureP4Target(struct bridge *br, struct port *port, } if (port->p4_src_port && - (port->p4_vlan_mode == P4_PORT_VLAN_NATIVE_TAGGED || - port->p4_vlan_mode == P4_PORT_VLAN_NATIVE_UNTAGGED)) { + (port->p4_vlan_mode == P4_PORT_VLAN_NATIVE_TAGGED)) { + /* only for native VLAN ports we need to add VLAN when + * configuring SRC port table. 
As this port only accepts + * TAGGED packets + */ struct src_port_info vsi_src_port_info = {br->p4_bridge_id, port->p4_vlan_id, port->p4_src_port}; From 99bc4cc10057b37c69c4da679f9426f606aaada2 Mon Sep 17 00:00:00 2001 From: Derek G Foster Date: Wed, 22 Nov 2023 10:58:58 -0800 Subject: [PATCH 810/833] Add code owners Signed-off-by: Derek G Foster --- .github/CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 5569d581286..237f241eac8 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1 +1 @@ -* @ffoulkes @nupuruttarwar +* @ffoulkes @nupuruttarwar @n-sandeep @vsureshkumarp From f7f8c5a56a0208f44695cf2c1089fbde4dc7bc58 Mon Sep 17 00:00:00 2001 From: Sandeep N Date: Wed, 22 Nov 2023 22:50:51 +0530 Subject: [PATCH 811/833] Fix OVS intermittent issue During port/bridge delete CFG value is garbage intermittently, this leads to undefined behavior while deleting p4 target rules. This review fixes garbage value issue during delete case.
Signed-off-by: Sandeep N --- vswitchd/bridge.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index 00c51e56aa4..e0ab6c372b2 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -2334,14 +2334,16 @@ ConfigureP4Target(struct bridge *br, struct port *port, return; } - if (!strcmp(iface->cfg->type, "internal")) { + /* when port is deleted, there are chances that iface->cfg is not valid + * Check if iface type only during insert case + */ + if (insert_entry && !strcmp(iface->cfg->type, "internal")) { VLOG_DBG("Ignore OVS specific internal interfaces"); return; } - /* Update parent bridge's unique ID in port structure */ - if (!strcmp(iface->cfg->type, "vxlan") || - netdev_get_tunnel_config(iface->netdev)) { + /* Check if port is of type tunnel */ + if (netdev_get_tunnel_config(iface->netdev)) { /* Handling VxLAN source port addition */ struct tunnel_info tnl_info; From 7163eed9924cc9ae6fd7341133a56b3f08f18c23 Mon Sep 17 00:00:00 2001 From: Sandeep Nagapattinam Date: Thu, 7 Dec 2023 06:14:47 +0530 Subject: [PATCH 812/833] Runtime option to disable OvS offload to P4 target (#102) * Runtime option to disable OvS offload to P4 target Control OVS offload with an environment variable during runtime. If env variable OVS_P4_OFFLOAD=false, then disable OVS offload, else if OVS_P4_OFFLOAD is not set or OVS_P4_OFFLOAD is any value other than false, then by default enable OVS offload. Signed-off-by: Sandeep N * Addressing review comments and fixing UT defects This change introduces a mutex and this mutex is taken before making a p4runtime call to program FDB entry. This is to avoid multiple parallel calls when OvS creates multiple revalidator and handler threads that each try to make a p4runtime call when a MAC is learnt.
Signed-off-by: Sandeep N --------- Signed-off-by: Sandeep N --- include/openvswitch/automake.mk | 3 +- include/openvswitch/p4ovs.h | 58 +++++++++++++++++++++++++++++++++ lib/mac-learning.c | 15 +++++---- ofproto/ofproto-dpif-xlate.c | 44 ++++++++++++++----------- vswitchd/bridge.c | 14 ++++++-- 5 files changed, 106 insertions(+), 28 deletions(-) create mode 100644 include/openvswitch/p4ovs.h diff --git a/include/openvswitch/automake.mk b/include/openvswitch/automake.mk index 681eca82d43..635909dbf04 100644 --- a/include/openvswitch/automake.mk +++ b/include/openvswitch/automake.mk @@ -54,7 +54,8 @@ openvswitchinclude_HEADERS = \ if P4OVS openvswitchinclude_HEADERS += \ - include/openvswitch/ovs-p4rt.h + include/openvswitch/ovs-p4rt.h \ + include/openvswitch/p4ovs.h endif if HAVE_CXX diff --git a/include/openvswitch/p4ovs.h b/include/openvswitch/p4ovs.h new file mode 100644 index 00000000000..51661df3b9c --- /dev/null +++ b/include/openvswitch/p4ovs.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2023 Intel Corporation. + * SPDX-License-Identifier: Apache-2.0 + * + * Defines the P4 OvS specific definitions. These need be used under + * if defined(P4OVS) scope only. + */ + +#ifndef OPENVSWITCH_P4OVS_H +#define OPENVSWITCH_P4OVS_H + +#include +#include "openvswitch/thread.h" + +#ifdef __cplusplus +extern "C" { +#endif + +extern struct ovs_mutex p4ovs_fdb_entry_lock; + +/* Control OvS offload with an environment variable during runtime. + * If env variable OVS_P4_OFFLOAD=false, then disable OVS offload, else + * if OVS_P4_OFFLOAD is not set or OVS_P4_OFFLOAD is any value other + * than false, then by default enable OVS offload. + */ +static inline bool ovs_p4_offload_enabled(void) { + const char* offload = getenv("OVS_P4_OFFLOAD"); + return (offload == NULL) || strcmp(offload, "false") != 0; +} + +/* OvS creates multiple handler and revalidator threads based on the number of + * CPU cores. These threading mechanism also associated with bridges that + * are created in OvS. 
During multiple bridge scenarios, we are seeing + * issues when a mutiple MAC's are learnt on different bridges at the same time. + * Creating a mutex and with this we are controlling p4runtime calls for each + * MAC learn. + */ +static inline void p4ovs_lock_init(const struct ovs_mutex *p4ovs_lock) { + return ovs_mutex_init(p4ovs_lock); +} + +static inline void p4ovs_lock_destroy(const struct ovs_mutex *p4ovs_lock) { + return ovs_mutex_destroy(p4ovs_lock); +} + +static inline void p4ovs_lock(const struct ovs_mutex *p4ovs_lock) { + return ovs_mutex_lock(p4ovs_lock); +} + +static inline void p4ovs_unlock(const struct ovs_mutex *p4ovs_lock) { + return ovs_mutex_unlock(p4ovs_lock) OVS_RELEASES(p4ovs_lock); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // OPENVSWITCH_P4OVS_H diff --git a/lib/mac-learning.c b/lib/mac-learning.c index bcefd2dd7c8..3a20c9c0339 100644 --- a/lib/mac-learning.c +++ b/lib/mac-learning.c @@ -32,6 +32,7 @@ #if defined(P4OVS) #include "openvswitch/ovs-p4rt.h" +#include "openvswitch/p4ovs.h" #endif COVERAGE_DEFINE(mac_learning_learned); @@ -622,12 +623,14 @@ mac_learning_expire(struct mac_learning *ml, struct mac_entry *e) hmap_remove(&ml->table, &e->hmap_node); ovs_list_remove(&e->lru_node); #if defined(P4OVS) - struct mac_learning_info fdb_info; - memset(&fdb_info, 0, sizeof(struct mac_learning_info)); - memcpy(fdb_info.mac_addr, e->mac.ea, sizeof(fdb_info.mac_addr)); - fdb_info.is_vlan = true; - fdb_info.bridge_id = ml->p4_bridge_id; - ConfigFdbTableEntry(fdb_info, false); + if (ovs_p4_offload_enabled()) { + struct mac_learning_info fdb_info; + memset(&fdb_info, 0, sizeof(fdb_info)); + memcpy(fdb_info.mac_addr, e->mac.ea, sizeof(fdb_info.mac_addr)); + fdb_info.is_vlan = true; + fdb_info.bridge_id = ml->p4_bridge_id; + ConfigFdbTableEntry(fdb_info, false); + } #endif free(e); } diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 662b8a30258..8ef303a8442 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ 
b/ofproto/ofproto-dpif-xlate.c @@ -76,6 +76,7 @@ #include #include #include "openvswitch/ovs-p4rt.h" +#include "openvswitch/p4ovs.h" #endif //P4OVS COVERAGE_DEFINE(xlate_actions); @@ -3403,19 +3404,22 @@ xlate_normal(struct xlate_ctx *ctx) } #if defined(P4OVS) + p4ovs_lock(&p4ovs_fdb_entry_lock); /* Dynamic MAC is learnt, program P4 forwarding table */ struct xport *ovs_port = get_ofp_port(in_xbundle->xbridge, flow->in_port.ofp_port); struct mac_learning_info fdb_info; memset(&fdb_info, 0, sizeof(fdb_info)); - - if (!get_fdb_data(ovs_port, flow->dl_src, &fdb_info)) { - ConfigFdbTableEntry(fdb_info, true); - ctx->xbridge->ml->p4_bridge_id = ovs_port->xbundle->p4_bridge_id; - } else { - VLOG_DBG("Error retrieving FDB information, skipping programming " - "P4 entry"); + if (ovs_p4_offload_enabled()) { + if (!get_fdb_data(ovs_port, flow->dl_src, &fdb_info)) { + ConfigFdbTableEntry(fdb_info, true); + ctx->xbridge->ml->p4_bridge_id = ovs_port->xbundle->p4_bridge_id; + } else { + VLOG_DBG("Error retrieving FDB information, skipping programming " + "P4 entry"); + } } + p4ovs_unlock(&p4ovs_fdb_entry_lock); #endif if (ctx->xin->xcache && in_xbundle != &ofpp_none_bundle) { @@ -8973,18 +8977,20 @@ xlate_add_static_mac_entry(const struct ofproto_dpif *ofproto, } #if defined(P4OVS) - /* Static MAC is configured, program P4 forwarding table */ - struct xport *ovs_port = get_ofp_port(xbundle->xbridge, - in_port); - struct mac_learning_info fdb_info; - memset(&fdb_info, 0, sizeof(fdb_info)); - - if (!get_fdb_data(ovs_port, dl_src, &fdb_info)) { - ConfigFdbTableEntry(fdb_info, true); - ofproto->ml->p4_bridge_id = ovs_port->xbundle->p4_bridge_id; - } else { - VLOG_DBG("Error retrieving FDB information, skipping programming " - "P4 entry"); + if (ovs_p4_offload_enabled()) { + /* Static MAC is configured, program P4 forwarding table */ + struct xport *ovs_port = get_ofp_port(xbundle->xbridge, + in_port); + struct mac_learning_info fdb_info; + memset(&fdb_info, 0, sizeof(fdb_info)); + + 
if (!get_fdb_data(ovs_port, dl_src, &fdb_info)) { + ConfigFdbTableEntry(fdb_info, true); + ofproto->ml->p4_bridge_id = ovs_port->xbundle->p4_bridge_id; + } else { + VLOG_DBG("Error retrieving FDB information, skipping programming " + "P4 entry"); + } } #endif diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index e0ab6c372b2..d6f33589370 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -73,16 +73,18 @@ #include "vlan-bitmap.h" #if defined(P4OVS) -#include "openvswitch/ovs-p4rt.h" #include +#include "openvswitch/ovs-p4rt.h" +#include "openvswitch/p4ovs.h" + static int32_t get_tunnel_data(struct netdev *netdev, struct tunnel_info *tnl_info); uint8_t last_p4_bridge_id_used = 0; uint32_t unique_tunnel_src_port = P4_VXLAN_SOURCE_PORT_OFFSET; - +struct ovs_mutex p4ovs_fdb_entry_lock = OVS_MUTEX_INITIALIZER; #endif VLOG_DEFINE_THIS_MODULE(bridge); @@ -558,6 +560,10 @@ bridge_init(const char *remote) rstp_init(); odp_execute_init(); +#if defined(P4OVS) + p4ovs_lock_init(&p4ovs_fdb_entry_lock); +#endif + ifaces_changed = seq_create(); last_ifaces_changed = seq_read(ifaces_changed); ifnotifier = if_notifier_create(if_change_cb, NULL); @@ -581,6 +587,10 @@ bridge_exit(bool delete_datapath) bridge_destroy(br, delete_datapath); } +#if defined(P4OVS) + p4ovs_lock_destroy(&p4ovs_fdb_entry_lock); +#endif + ovsdb_idl_destroy(idl); } From cf156d1d1c550a06616a7bafb45dea8a4bda15c5 Mon Sep 17 00:00:00 2001 From: Sandeep N Date: Fri, 2 Feb 2024 22:43:57 +0530 Subject: [PATCH 813/833] Delete p4 target rules when interface is deleted. We are not handling all delete port notifications as the OVSD configuration will be invalid/NULL when we receive delete of multiple bridges, ports in a single transaction. 
During delete path we are not looking for interface DB cfg, process delete notification with internal structures irrespective of iface->cfg is valid or not Signed-off-by: Sandeep N --- include/openvswitch/ovs-p4rt.h | 2 -- ofproto/ofproto-dpif-xlate.c | 4 ++-- vswitchd/bridge.c | 13 ++++++++----- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/include/openvswitch/ovs-p4rt.h b/include/openvswitch/ovs-p4rt.h index 1f035886570..62df14abcf6 100644 --- a/include/openvswitch/ovs-p4rt.h +++ b/include/openvswitch/ovs-p4rt.h @@ -94,8 +94,6 @@ extern void ConfigTunnelSrcPortTableEntry(struct src_port_info tnl_sp, extern void ConfigSrcPortTableEntry(struct src_port_info vsi_sp, bool insert_entry); extern void ConfigVlanTableEntry(uint16_t vlan_id, bool insert_entry); -extern void ConfigIpTunnelTermTableEntry(struct tunnel_info tunnel_info, - bool insert_entry); extern void ConfigRxTunnelSrcTableEntry(struct tunnel_info tunnel_info, bool insert_entry); diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 8ef303a8442..d0d5273b8a6 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -3404,13 +3404,13 @@ xlate_normal(struct xlate_ctx *ctx) } #if defined(P4OVS) - p4ovs_lock(&p4ovs_fdb_entry_lock); /* Dynamic MAC is learnt, program P4 forwarding table */ struct xport *ovs_port = get_ofp_port(in_xbundle->xbridge, flow->in_port.ofp_port); struct mac_learning_info fdb_info; memset(&fdb_info, 0, sizeof(fdb_info)); if (ovs_p4_offload_enabled()) { + p4ovs_lock(&p4ovs_fdb_entry_lock); if (!get_fdb_data(ovs_port, flow->dl_src, &fdb_info)) { ConfigFdbTableEntry(fdb_info, true); ctx->xbridge->ml->p4_bridge_id = ovs_port->xbundle->p4_bridge_id; @@ -3418,8 +3418,8 @@ xlate_normal(struct xlate_ctx *ctx) VLOG_DBG("Error retrieving FDB information, skipping programming " "P4 entry"); } + p4ovs_unlock(&p4ovs_fdb_entry_lock); } - p4ovs_unlock(&p4ovs_fdb_entry_lock); #endif if (ctx->xin->xcache && in_xbundle != &ofpp_none_bundle) { 
diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index d6f33589370..d27aa567b0c 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -2339,14 +2339,14 @@ GetSrcPortVsiId(char *mac_addr) { static void ConfigureP4Target(struct bridge *br, struct port *port, struct iface *iface, bool insert_entry) { - if (!iface->cfg || !iface->cfg->type) { + /* when port is deleted, there are chances that iface->cfg is not valid + * Check if iface type only during insert case + */ + if (insert_entry && (!iface->cfg || !iface->cfg->type)) { VLOG_DBG("Invalid interface data to configure P4 Target"); return; } - /* when port is deleted, there are chances that iface->cfg is not valid - * Check if iface type only during insert case - */ if (insert_entry && !strcmp(iface->cfg->type, "internal")) { VLOG_DBG("Ignore OVS specific internal interfaces"); return; @@ -2371,7 +2371,6 @@ ConfigureP4Target(struct bridge *br, struct port *port, tnl_info.src_port = port->p4_src_port; ConfigTunnelTableEntry(tnl_info, insert_entry); - ConfigIpTunnelTermTableEntry(tnl_info, insert_entry); ConfigRxTunnelSrcTableEntry(tnl_info, insert_entry); } else { VLOG_ERR("Error retrieving tunnel information, " @@ -2487,7 +2486,9 @@ iface_create(struct bridge *br, const struct ovsrec_interface *iface_cfg, #if defined(P4OVS) if (!port->is_src_port_configured) { + p4ovs_lock(&p4ovs_fdb_entry_lock); ConfigureP4Target(br, port, iface, true); + p4ovs_unlock(&p4ovs_fdb_entry_lock); } #endif @@ -5008,7 +5009,9 @@ iface_destroy__(struct iface *iface) #if defined(P4OVS) if (port->is_src_port_configured) { + p4ovs_lock(&p4ovs_fdb_entry_lock); ConfigureP4Target(br, port, iface, false); + p4ovs_unlock(&p4ovs_fdb_entry_lock); } #endif From 790d53058f0e8426839dea2f62c5b886bb837860 Mon Sep 17 00:00:00 2001 From: Satish Pitchikala Date: Sun, 21 Jan 2024 05:36:59 +0530 Subject: [PATCH 814/833] Adding geneve support in p4_ovs Signed-off-by: Satish Pitchikala --- include/openvswitch/ovs-p4rt.h | 28 ++++++++++++++++++++++++++++ 
ofproto/ofproto-dpif-xlate.c | 2 ++ vswitchd/bridge.c | 5 ++++- 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/include/openvswitch/ovs-p4rt.h b/include/openvswitch/ovs-p4rt.h index 62df14abcf6..6cafc12ba8e 100644 --- a/include/openvswitch/ovs-p4rt.h +++ b/include/openvswitch/ovs-p4rt.h @@ -35,6 +35,31 @@ enum p4_vlan_mode { P4_PORT_VLAN_UNSUPPORTED }; +enum p4_tunnel_type { + NO_MODIFY, + VXLAN_ENCAP, + VXLAN_DECAP_OUTER_HDR, + NEIGHBOR, + VLAN_PUSH, + VLAN_POP, + VXLAN_ENCAP_V6, + VXLAN_DECAP_OUTER_HDR_VLAN_PUSH, + VXLAN_ENCAP_VLAN_POP, + VXLAN_ENCAP_V6_VLAN_POP, + GENEVE_ENCAP, + GENEVE_ENCAP_V6, + GENEVE_ENCAP_VLAN_POP, + GENEVE_ENCAP_V6_VLAN_POP, + GENEVE_DECAP_OUTER_HDR, + GENEVE_DECAP_OUTER_HDR_VLAN_PUSH +}; + +enum ovs_tunnel_type { + OVS_TUNNEL_UNKNOWN = 0, + OVS_TUNNEL_VXLAN, + OVS_TUNNEL_GENEVE +}; + struct p4_ipaddr { uint8_t family; uint8_t prefix_len; @@ -59,6 +84,7 @@ struct tunnel_info { uint16_t vni; struct port_vlan_info vlan_info; uint8_t bridge_id; + uint8_t tunnel_type; }; struct src_port_info { @@ -97,6 +123,8 @@ extern void ConfigVlanTableEntry(uint16_t vlan_id, bool insert_entry); extern void ConfigRxTunnelSrcTableEntry(struct tunnel_info tunnel_info, bool insert_entry); +extern enum ovs_tunnel_type TunnelTypeStrtoEnum(const char* tnl_type); + #ifdef __cplusplus } // extern "C" #endif diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index d0d5273b8a6..45452946cb4 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -3268,6 +3268,8 @@ get_fdb_data(struct xport *port, struct eth_addr mac_addr, fdb_info->tnl_info.ifindex = (uint32_t)underlay_ifindex; fdb_info->tnl_info.dst_port = underlay_tnl->dst_port; fdb_info->tnl_info.vni = underlay_tnl->vni; + const char *tnl_type = tnl_port_get_type(port->ofport); + fdb_info->tnl_info.tunnel_type = TunnelTypeStrtoEnum(tnl_type); if (underlay_tnl->ipv6_src.__in6_u.__u6_addr32[0]) { /* IPv6 tunnel configuration */ diff --git 
a/vswitchd/bridge.c b/vswitchd/bridge.c index d27aa567b0c..d96377d583a 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -2299,7 +2299,10 @@ get_tunnel_data(struct netdev *netdev, tnl_info->dst_port = underlay_tnl->dst_port; tnl_info->vni = underlay_tnl->vni; - return 0; + const char* tnl_type = netdev_get_type(netdev); + tnl_info->tunnel_type = TunnelTypeStrtoEnum(tnl_type); + + return 0; } static bool From d07a9566cf325e2723958c275654254689f2cf24 Mon Sep 17 00:00:00 2001 From: Satish Pitchikala Date: Fri, 2 Feb 2024 06:48:30 +0530 Subject: [PATCH 815/833] Removing enum that was not needed Signed-off-by: Satish Pitchikala --- include/openvswitch/ovs-p4rt.h | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/include/openvswitch/ovs-p4rt.h b/include/openvswitch/ovs-p4rt.h index 6cafc12ba8e..24ff26db9e4 100644 --- a/include/openvswitch/ovs-p4rt.h +++ b/include/openvswitch/ovs-p4rt.h @@ -35,25 +35,6 @@ enum p4_vlan_mode { P4_PORT_VLAN_UNSUPPORTED }; -enum p4_tunnel_type { - NO_MODIFY, - VXLAN_ENCAP, - VXLAN_DECAP_OUTER_HDR, - NEIGHBOR, - VLAN_PUSH, - VLAN_POP, - VXLAN_ENCAP_V6, - VXLAN_DECAP_OUTER_HDR_VLAN_PUSH, - VXLAN_ENCAP_VLAN_POP, - VXLAN_ENCAP_V6_VLAN_POP, - GENEVE_ENCAP, - GENEVE_ENCAP_V6, - GENEVE_ENCAP_VLAN_POP, - GENEVE_ENCAP_V6_VLAN_POP, - GENEVE_DECAP_OUTER_HDR, - GENEVE_DECAP_OUTER_HDR_VLAN_PUSH -}; - enum ovs_tunnel_type { OVS_TUNNEL_UNKNOWN = 0, OVS_TUNNEL_VXLAN, From 8ace14377181a5120f6201ab8c086f3fde1ae0d6 Mon Sep 17 00:00:00 2001 From: Nupur Uttarwar Date: Tue, 2 Apr 2024 13:26:56 -0700 Subject: [PATCH 816/833] Add support for combined recipe and linux networking v3 (#108) Add support for updating P4 tables for maintaining IP and mac addresses learnt from a flow with ARP response. 
This will be used when reconstructing L2 after IPSEC packet is decrypted Signed-off-by: nupuruttarwar --- include/openvswitch/ovs-p4rt.h | 11 ++++++ lib/mac-learning.c | 12 ++++++- lib/mac-learning.h | 6 ++++ ofproto/ofproto-dpif-xlate.c | 65 ++++++++++++++++++++++++++++++++-- 4 files changed, 90 insertions(+), 4 deletions(-) diff --git a/include/openvswitch/ovs-p4rt.h b/include/openvswitch/ovs-p4rt.h index 24ff26db9e4..3ecb91388c1 100644 --- a/include/openvswitch/ovs-p4rt.h +++ b/include/openvswitch/ovs-p4rt.h @@ -84,6 +84,7 @@ struct mac_learning_info { uint8_t mac_addr[6]; uint8_t bridge_id; uint32_t src_port; + uint32_t rx_src_port; struct port_vlan_info vlan_info; union { struct tunnel_info tnl_info; @@ -91,6 +92,13 @@ struct mac_learning_info { }; }; +struct ip_mac_map_info { + uint8_t src_mac_addr[6]; + uint8_t dst_mac_addr[6]; + struct p4_ipaddr src_ip_addr; + struct p4_ipaddr dst_ip_addr; +}; + // Function declarations extern void ConfigFdbTableEntry(struct mac_learning_info learn_info, bool insert_entry); @@ -106,6 +114,9 @@ extern void ConfigRxTunnelSrcTableEntry(struct tunnel_info tunnel_info, extern enum ovs_tunnel_type TunnelTypeStrtoEnum(const char* tnl_type); +extern void ConfigIpMacMapTableEntry(struct ip_mac_map_info learn_info, + bool insert_entry); + #ifdef __cplusplus } // extern "C" #endif diff --git a/lib/mac-learning.c b/lib/mac-learning.c index 3a20c9c0339..685dbb8ea5b 100644 --- a/lib/mac-learning.c +++ b/lib/mac-learning.c @@ -630,8 +630,18 @@ mac_learning_expire(struct mac_learning *ml, struct mac_entry *e) fdb_info.is_vlan = true; fdb_info.bridge_id = ml->p4_bridge_id; ConfigFdbTableEntry(fdb_info, false); + + // Remove the corresponding ip_mac tables both for src ip and dst ip + struct ip_mac_map_info ip_info; + memset(&ip_info, 0, sizeof(ip_info)); + ip_info.src_ip_addr.family = AF_INET; + ip_info.src_ip_addr.ip.v4addr.s_addr = e->nw_src; + ip_info.dst_ip_addr.family = AF_INET; + ip_info.dst_ip_addr.ip.v4addr.s_addr = e->nw_dst; + // 
TODO: Update IPv6 fields when IPv6 support is added + ConfigIpMacMapTableEntry(ip_info, false); } -#endif +#endif // P4OVS free(e); } diff --git a/lib/mac-learning.h b/lib/mac-learning.h index 44382f0bbf7..b4b8429037f 100644 --- a/lib/mac-learning.h +++ b/lib/mac-learning.h @@ -124,6 +124,12 @@ struct mac_entry { * The client-specified data is mlport->port. */ struct mac_learning_port *mlport; struct ovs_list port_lru_node; /* In mac_learning_port's "port_lru"s. */ +#if defined(P4OVS) + /* P4 specific fields to maintain corresponding IP's */ + ovs_be32 nw_src; /* IPv4 source address or ARP SPA. */ + ovs_be32 nw_dst; /* IPv4 destination address or ARP TPA. */ + // TODO: Add IPv6 fields when IPv6 support is added +#endif }; static inline void *mac_entry_get_port(const struct mac_learning *ml, diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 45452946cb4..d8876aa81d8 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -3299,13 +3299,17 @@ get_fdb_data(struct xport *port, struct eth_addr mac_addr, if (!memcmp(smac.ea, mac_addr.ea, sizeof(smac))) { VLOG_DBG("Ignore self MAC learn use case for port: %s", port_name); - return -1; - + /* Even for self mac, FDB tables needs to be programmed since Port + * representors are used as underlay ports. Otherwise, underlay port + * traffic will go through slow path which is not advisable + */ + //return -1; } /* this SRC port MAC is needed to configure FDB entry * for its corresponding HOST port or Phy port. 
*/ fdb_info->src_port = smac.ea[1] + VSI_ID_OFFSET; + fdb_info->rx_src_port = mac_addr.ea[1] + VSI_ID_OFFSET; VLOG_DBG("Continue, this is latest LNW"); } else { fdb_info->is_vlan = true; @@ -3331,7 +3335,38 @@ get_fdb_data(struct xport *port, struct eth_addr mac_addr, return 0; } -#endif + +static inline int32_t +valid_ip_addr(ovs_be32 nw_addr) { + return + (nw_addr && nw_addr != INADDR_ANY && + nw_addr != INADDR_LOOPBACK && nw_addr != 0xffffffff); +} + +static int32_t +update_ip_mac_map_info(const struct flow *flow, + struct ip_mac_map_info *ip_mac_map_info) +{ + if (!flow) { + return -1; + } + + memcpy(ip_mac_map_info->src_mac_addr, flow->dl_src.ea, sizeof(ip_mac_map_info->src_mac_addr)); + memcpy(ip_mac_map_info->dst_mac_addr, flow->dl_dst.ea, sizeof(ip_mac_map_info->dst_mac_addr)); + + //Program the entiry only for an ARP response where we have valid IP's and MAC for both src and dst + if (valid_ip_addr(flow->nw_src) && !eth_addr_is_broadcast(flow->dl_src) && + valid_ip_addr(flow->nw_dst) && !eth_addr_is_broadcast(flow->dl_dst)) { + ip_mac_map_info->src_ip_addr.family = AF_INET; + ip_mac_map_info->src_ip_addr.ip.v4addr.s_addr = flow->nw_src; + + ip_mac_map_info->dst_ip_addr.family = AF_INET; + ip_mac_map_info->dst_ip_addr.ip.v4addr.s_addr = flow->nw_dst; + } + + return -1; +} +#endif // P4OVS static void xlate_normal(struct xlate_ctx *ctx) @@ -3401,6 +3436,7 @@ xlate_normal(struct xlate_ctx *ctx) && flow->packet_type == htonl(PT_ETH) && in_port && in_port->pt_mode != NETDEV_PT_LEGACY_L3 ) { + //The function below calls mac_learning_insert update_learning_table(ctx, in_xbundle, flow->dl_src, vlan, is_grat_arp); } @@ -3422,6 +3458,29 @@ xlate_normal(struct xlate_ctx *ctx) } p4ovs_unlock(&p4ovs_fdb_entry_lock); } + + // Update the recently added MAC entry with flow info + struct mac_entry *e; + + ovs_rwlock_wrlock(&ctx->xbridge->ml->rwlock); + e = mac_learning_lookup(ctx->xbridge->ml, flow->dl_src, vlan); + if (e) { + e->nw_src = flow->nw_src; + e->nw_dst = 
flow->nw_dst; + //TODO: Update IPv6 info in MAC entry when IPv6 support is added + } + ovs_rwlock_unlock(&ctx->xbridge->ml->rwlock); + + if (ovs_p4_offload_enabled()) { + struct ip_mac_map_info ip_info; + memset(&ip_info, 0, sizeof(ip_info)); + if (update_ip_mac_map_info(flow, &ip_info)) { + ConfigIpMacMapTableEntry(ip_info, true); + } + } else { + VLOG_DBG("P4 offload disabled, skipping programming "); + } + #endif if (ctx->xin->xcache && in_xbundle != &ofpp_none_bundle) { From 89b0cc822d4e075ec4b2f74df301f4f549d62d51 Mon Sep 17 00:00:00 2001 From: Nupur Uttarwar Date: Wed, 17 Apr 2024 15:36:40 -0700 Subject: [PATCH 817/833] Introduce option to specify grpc server address for ovs-p4rt (#109) * Introduce option to specify grpc server address for ovs-p4rt This patch does the following: - Introduce the option to specify grpc server address for ovs-p4rt client. It can be specified using --grpc-addr or -g option when starting ovs-vswitchd. Default grpc address is localhost - Introduce checks to program FDB learnt entries to hardware only if it's a new entry. 
Stress tests shows that unnecessary calls to grpc server to program entries that are already present throws duplicate entry errors and reduces the performance and also results into packet loss intermittently Signed-off-by: nupuruttarwar Co-authored-by: Derek G Foster --- include/openvswitch/ovs-p4rt.h | 14 +++--- include/openvswitch/p4ovs.h | 8 ++++ lib/mac-learning.c | 7 +-- lib/mac-learning.h | 6 +++ ofproto/ofproto-dpif-xlate.c | 86 +++++++++++++++++++--------------- vswitchd/automake.mk | 4 ++ vswitchd/bridge.c | 16 +++---- vswitchd/ovs-vswitchd.c | 19 ++++++++ vswitchd/p4ovs.c | 23 +++++++++ 9 files changed, 126 insertions(+), 57 deletions(-) create mode 100644 vswitchd/p4ovs.c diff --git a/include/openvswitch/ovs-p4rt.h b/include/openvswitch/ovs-p4rt.h index 3ecb91388c1..9a29df93b57 100644 --- a/include/openvswitch/ovs-p4rt.h +++ b/include/openvswitch/ovs-p4rt.h @@ -101,21 +101,21 @@ struct ip_mac_map_info { // Function declarations extern void ConfigFdbTableEntry(struct mac_learning_info learn_info, - bool insert_entry); + bool insert_entry, const char* grpc_addr); extern void ConfigTunnelTableEntry(struct tunnel_info tunnel_info, - bool insert_entry); + bool insert_entry, const char* grpc_addr); extern void ConfigTunnelSrcPortTableEntry(struct src_port_info tnl_sp, - bool insert_entry); + bool insert_entry, const char* grpc_addr); extern void ConfigSrcPortTableEntry(struct src_port_info vsi_sp, - bool insert_entry); -extern void ConfigVlanTableEntry(uint16_t vlan_id, bool insert_entry); + bool insert_entry, const char* grpc_addr); +extern void ConfigVlanTableEntry(uint16_t vlan_id, bool insert_entry, const char* grpc_addr); extern void ConfigRxTunnelSrcTableEntry(struct tunnel_info tunnel_info, - bool insert_entry); + bool insert_entry, const char* grpc_addr); extern enum ovs_tunnel_type TunnelTypeStrtoEnum(const char* tnl_type); extern void ConfigIpMacMapTableEntry(struct ip_mac_map_info learn_info, - bool insert_entry); + bool insert_entry, const char* 
grpc_addr); #ifdef __cplusplus } // extern "C" diff --git a/include/openvswitch/p4ovs.h b/include/openvswitch/p4ovs.h index 51661df3b9c..c5b2967b125 100644 --- a/include/openvswitch/p4ovs.h +++ b/include/openvswitch/p4ovs.h @@ -10,7 +10,11 @@ #define OPENVSWITCH_P4OVS_H #include +#include +#include + #include "openvswitch/thread.h" +#include "openvswitch/util.h" #ifdef __cplusplus extern "C" { @@ -18,6 +22,8 @@ extern "C" { extern struct ovs_mutex p4ovs_fdb_entry_lock; +extern char grpc_addr[32]; + /* Control OvS offload with an environment variable during runtime. * If env variable OVS_P4_OFFLOAD=false, then disable OVS offload, else * if OVS_P4_OFFLOAD is not set or OVS_P4_OFFLOAD is any value other @@ -51,6 +57,8 @@ static inline void p4ovs_unlock(const struct ovs_mutex *p4ovs_lock) { return ovs_mutex_unlock(p4ovs_lock) OVS_RELEASES(p4ovs_lock); } +void ovs_set_grpc_addr(const char* optarg); + #ifdef __cplusplus } // extern "C" #endif diff --git a/lib/mac-learning.c b/lib/mac-learning.c index 685dbb8ea5b..e563d8f064c 100644 --- a/lib/mac-learning.c +++ b/lib/mac-learning.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc. + * Copyright (c) 2022-2024 Intel Corporation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -457,7 +458,7 @@ mac_learning_del_static_entry(struct mac_learning *ml, * * Keep the code here synchronized with that in update_learning_table__() * below. 
*/ -static bool +bool is_mac_learning_update_needed(const struct mac_learning *ml, struct eth_addr src, int vlan, bool is_gratuitous_arp, bool is_bond, @@ -629,7 +630,7 @@ mac_learning_expire(struct mac_learning *ml, struct mac_entry *e) memcpy(fdb_info.mac_addr, e->mac.ea, sizeof(fdb_info.mac_addr)); fdb_info.is_vlan = true; fdb_info.bridge_id = ml->p4_bridge_id; - ConfigFdbTableEntry(fdb_info, false); + ConfigFdbTableEntry(fdb_info, false, grpc_addr); // Remove the corresponding ip_mac tables both for src ip and dst ip struct ip_mac_map_info ip_info; @@ -639,7 +640,7 @@ mac_learning_expire(struct mac_learning *ml, struct mac_entry *e) ip_info.dst_ip_addr.family = AF_INET; ip_info.dst_ip_addr.ip.v4addr.s_addr = e->nw_dst; // TODO: Update IPv6 fields when IPv6 support is added - ConfigIpMacMapTableEntry(ip_info, false); + ConfigIpMacMapTableEntry(ip_info, false, grpc_addr); } #endif // P4OVS free(e); diff --git a/lib/mac-learning.h b/lib/mac-learning.h index b4b8429037f..0b7eaaa35ca 100644 --- a/lib/mac-learning.h +++ b/lib/mac-learning.h @@ -229,6 +229,12 @@ bool mac_learning_may_learn(const struct mac_learning *ml, const struct eth_addr src_mac, uint16_t vlan) OVS_REQ_RDLOCK(ml->rwlock); +bool +is_mac_learning_update_needed(const struct mac_learning *ml, + struct eth_addr src, int vlan, + bool is_gratuitous_arp, bool is_bond, + void *in_port) + OVS_REQ_RDLOCK(ml->rwlock); struct mac_entry *mac_learning_insert(struct mac_learning *ml, const struct eth_addr src, uint16_t vlan) diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index d8876aa81d8..f24e3675b4a 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -3373,6 +3373,9 @@ xlate_normal(struct xlate_ctx *ctx) { struct flow_wildcards *wc = ctx->wc; struct flow *flow = &ctx->xin->flow; +#if defined(P4OVS) + bool is_mac_learn_required = false; +#endif //P4OVS struct xbundle *in_xbundle; struct xport *in_port; struct mac_entry *mac; @@ -3436,51 +3439,56 @@ 
xlate_normal(struct xlate_ctx *ctx) && flow->packet_type == htonl(PT_ETH) && in_port && in_port->pt_mode != NETDEV_PT_LEGACY_L3 ) { +#if defined(P4OVS) + is_mac_learn_required = is_mac_learning_update_needed(ctx->xbridge->ml, + flow->dl_src, vlan,is_grat_arp, + in_xbundle->bond != NULL, + in_xbundle->ofbundle); +#endif //The function below calls mac_learning_insert update_learning_table(ctx, in_xbundle, flow->dl_src, vlan, is_grat_arp); } #if defined(P4OVS) - /* Dynamic MAC is learnt, program P4 forwarding table */ - struct xport *ovs_port = get_ofp_port(in_xbundle->xbridge, - flow->in_port.ofp_port); - struct mac_learning_info fdb_info; - memset(&fdb_info, 0, sizeof(fdb_info)); - if (ovs_p4_offload_enabled()) { - p4ovs_lock(&p4ovs_fdb_entry_lock); - if (!get_fdb_data(ovs_port, flow->dl_src, &fdb_info)) { - ConfigFdbTableEntry(fdb_info, true); - ctx->xbridge->ml->p4_bridge_id = ovs_port->xbundle->p4_bridge_id; - } else { - VLOG_DBG("Error retrieving FDB information, skipping programming " - "P4 entry"); - } - p4ovs_unlock(&p4ovs_fdb_entry_lock); - } - - // Update the recently added MAC entry with flow info - struct mac_entry *e; - - ovs_rwlock_wrlock(&ctx->xbridge->ml->rwlock); - e = mac_learning_lookup(ctx->xbridge->ml, flow->dl_src, vlan); - if (e) { - e->nw_src = flow->nw_src; - e->nw_dst = flow->nw_dst; - //TODO: Update IPv6 info in MAC entry when IPv6 support is added - } - ovs_rwlock_unlock(&ctx->xbridge->ml->rwlock); - - if (ovs_p4_offload_enabled()) { - struct ip_mac_map_info ip_info; - memset(&ip_info, 0, sizeof(ip_info)); - if (update_ip_mac_map_info(flow, &ip_info)) { - ConfigIpMacMapTableEntry(ip_info, true); - } - } else { - VLOG_DBG("P4 offload disabled, skipping programming "); + if (is_mac_learn_required) { + /* Dynamic MAC is learnt, program P4 forwarding table */ + struct xport *ovs_port = get_ofp_port(in_xbundle->xbridge, + flow->in_port.ofp_port); + struct mac_learning_info fdb_info = {0}; + if (ovs_p4_offload_enabled()) { + 
p4ovs_lock(&p4ovs_fdb_entry_lock); + if (!get_fdb_data(ovs_port, flow->dl_src, &fdb_info)) { + ConfigFdbTableEntry(fdb_info, true, grpc_addr); + ctx->xbridge->ml->p4_bridge_id = ovs_port->xbundle->p4_bridge_id; + } else { + VLOG_DBG("Error retrieving FDB information, skipping programming " + "P4 entry"); + } + p4ovs_unlock(&p4ovs_fdb_entry_lock); + } + + // Update the recently added MAC entry with flow info + struct mac_entry *e; + + ovs_rwlock_wrlock(&ctx->xbridge->ml->rwlock); + e = mac_learning_lookup(ctx->xbridge->ml, flow->dl_src, vlan); + if (e) { + e->nw_src = flow->nw_src; + e->nw_dst = flow->nw_dst; + //TODO: Update IPv6 info in MAC entry when IPv6 support is added + } + ovs_rwlock_unlock(&ctx->xbridge->ml->rwlock); + + if (ovs_p4_offload_enabled()) { + struct ip_mac_map_info ip_info = {0}; + if (update_ip_mac_map_info(flow, &ip_info)) { + ConfigIpMacMapTableEntry(ip_info, true, grpc_addr); + } + } else { + VLOG_DBG("P4 offload disabled, skipping programming "); + } } - #endif if (ctx->xin->xcache && in_xbundle != &ofpp_none_bundle) { @@ -9046,7 +9054,7 @@ xlate_add_static_mac_entry(const struct ofproto_dpif *ofproto, memset(&fdb_info, 0, sizeof(fdb_info)); if (!get_fdb_data(ovs_port, dl_src, &fdb_info)) { - ConfigFdbTableEntry(fdb_info, true); + ConfigFdbTableEntry(fdb_info, true, grpc_addr); ofproto->ml->p4_bridge_id = ovs_port->xbundle->p4_bridge_id; } else { VLOG_DBG("Error retrieving FDB information, skipping programming " diff --git a/vswitchd/automake.mk b/vswitchd/automake.mk index 1794efb7de2..0ca86153b69 100644 --- a/vswitchd/automake.mk +++ b/vswitchd/automake.mk @@ -10,6 +10,10 @@ vswitchd_sources = \ vswitchd/system-stats.c \ vswitchd/system-stats.h +if P4OVS +vswitchd_sources += vswitchd/p4ovs.c +endif + if P4OVS # Build a static library instead of an executable. 
lib_LTLIBRARIES += vswitchd/libvswitchd.la diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index d96377d583a..537ad1a2ffb 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -2373,8 +2373,8 @@ ConfigureP4Target(struct bridge *br, struct port *port, tnl_info.bridge_id = br->p4_bridge_id; tnl_info.src_port = port->p4_src_port; - ConfigTunnelTableEntry(tnl_info, insert_entry); - ConfigRxTunnelSrcTableEntry(tnl_info, insert_entry); + ConfigTunnelTableEntry(tnl_info, insert_entry, grpc_addr); + ConfigRxTunnelSrcTableEntry(tnl_info, insert_entry, grpc_addr); } else { VLOG_ERR("Error retrieving tunnel information, " "skipping programming P4 entry"); @@ -2389,15 +2389,15 @@ ConfigureP4Target(struct bridge *br, struct port *port, port->p4_vlan_id, port->p4_src_port}; /* When VLAN tag is configured */ - ConfigVlanTableEntry(port->p4_vlan_id, insert_entry); - ConfigTunnelSrcPortTableEntry(tnl_src_port_info, insert_entry); + ConfigVlanTableEntry(port->p4_vlan_id, insert_entry, grpc_addr); + ConfigTunnelSrcPortTableEntry(tnl_src_port_info, insert_entry, grpc_addr); } else { /* Wild card VLAN 0 */ struct src_port_info tnl_src_port_info = {br->p4_bridge_id, 0, port->p4_src_port}; - ConfigTunnelSrcPortTableEntry(tnl_src_port_info, insert_entry); + ConfigTunnelSrcPortTableEntry(tnl_src_port_info, insert_entry, grpc_addr); } port->is_src_port_configured = insert_entry; } else if (!insert_entry || iface->cfg->mac_in_use) { @@ -2417,8 +2417,8 @@ ConfigureP4Target(struct bridge *br, struct port *port, port->p4_vlan_id, port->p4_src_port}; - ConfigVlanTableEntry(port->p4_vlan_id, insert_entry); - ConfigSrcPortTableEntry(vsi_src_port_info, insert_entry); + ConfigVlanTableEntry(port->p4_vlan_id, insert_entry, grpc_addr); + ConfigSrcPortTableEntry(vsi_src_port_info, insert_entry, grpc_addr); } else if (port->p4_vlan_mode == P4_PORT_VLAN_UNSUPPORTED) { /* Do nothing, unsupported vlan mode */ } else if (port->p4_src_port) { @@ -2426,7 +2426,7 @@ ConfigureP4Target(struct bridge *br, 
struct port *port, 0, port->p4_src_port}; - ConfigSrcPortTableEntry(vsi_src_port_info, insert_entry); + ConfigSrcPortTableEntry(vsi_src_port_info, insert_entry, grpc_addr); } else { VLOG_DBG("Invalid P4 use case for source port to " "bridge mapping"); diff --git a/vswitchd/ovs-vswitchd.c b/vswitchd/ovs-vswitchd.c index 6d90c73b830..3de92204970 100644 --- a/vswitchd/ovs-vswitchd.c +++ b/vswitchd/ovs-vswitchd.c @@ -53,6 +53,9 @@ #include "openvswitch/vlog.h" #include "lib/vswitch-idl.h" #include "lib/dns-resolve.h" +#if defined(P4OVS) +#include "openvswitch/p4ovs.h" +#endif VLOG_DEFINE_THIS_MODULE(vswitchd); @@ -184,6 +187,9 @@ parse_options(int argc, char *argv[], char **unixctl_pathp) SSL_OPTION_ENUMS, OPT_DUMMY_NUMA, OPT_HW_RAWIO_ACCESS, +#if defined(P4OVS) + OPT_GRPC_ADDR, +#endif }; static const struct option long_options[] = { {"help", no_argument, NULL, 'h'}, @@ -201,6 +207,9 @@ parse_options(int argc, char *argv[], char **unixctl_pathp) {"dpdk", optional_argument, NULL, OPT_DPDK}, {"dummy-numa", required_argument, NULL, OPT_DUMMY_NUMA}, {"hw-rawio-access", no_argument, NULL, OPT_HW_RAWIO_ACCESS}, +#if defined(P4OVS) + {"grpc-addr", optional_argument, NULL, 'g'}, +#endif {NULL, 0, NULL, 0}, }; char *short_options = ovs_cmdl_long_options_to_short_options(long_options); @@ -269,6 +278,13 @@ parse_options(int argc, char *argv[], char **unixctl_pathp) hw_rawio_access = true; break; +#if defined(P4OVS) + case OPT_GRPC_ADDR: + case 'g': + ovs_set_grpc_addr(optarg); + break; +#endif + default: abort(); } @@ -309,6 +325,9 @@ usage(void) printf("\nOther options:\n" " --unixctl=SOCKET override default control socket name\n" " -h, --help display this help message\n" +#if defined(P4OVS) + " -g, --grpc-addr gRPC server address for P4Runtime server\n" +#endif " -V, --version display version information\n"); exit(EXIT_SUCCESS); } diff --git a/vswitchd/p4ovs.c b/vswitchd/p4ovs.c new file mode 100644 index 00000000000..c416a7a8df2 --- /dev/null +++ b/vswitchd/p4ovs.c @@ -0,0 
+1,23 @@ +/* + * Copyright (c) 2024 Intel Corporation. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include + +#include "openvswitch/p4ovs.h" +#include "util.h" + +char grpc_addr[32] = "localhost:9559"; +static const char grpc_port[] = ":9559"; + +void ovs_set_grpc_addr(const char* optarg) { + if (strlen(optarg) + sizeof(grpc_port) >= sizeof(grpc_addr)) { + ovs_fatal(0, "--grpc_addr is too long (> %lu characters)", + sizeof(grpc_addr) - sizeof(grpc_port)); + } + strncpy(grpc_addr, optarg, sizeof(grpc_addr)); + strcat(grpc_addr, grpc_port); +} + From baa2fea22d75fd3d3f80e5c0c9da84946da094a7 Mon Sep 17 00:00:00 2001 From: Derek G Foster Date: Fri, 19 Apr 2024 07:53:29 -0700 Subject: [PATCH 818/833] Update codeowners file Signed-off-by: Derek G Foster --- .github/CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 237f241eac8..3522873b48c 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1 +1 @@ -* @ffoulkes @nupuruttarwar @n-sandeep @vsureshkumarp +* @ffoulkes @5abeel @n-sandeep @vsureshkumarp From e1e64bd441542810a2d8147f6ff8e2cfac76cd03 Mon Sep 17 00:00:00 2001 From: Derek G Foster Date: Fri, 19 Apr 2024 07:31:12 -0700 Subject: [PATCH 819/833] Add missing #includes - Added #includes to define all the symbols used by ovs-p4rt.h. 
Signed-off-by: Derek G Foster --- include/openvswitch/ovs-p4rt.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/openvswitch/ovs-p4rt.h b/include/openvswitch/ovs-p4rt.h index 9a29df93b57..77bd2fa3e41 100644 --- a/include/openvswitch/ovs-p4rt.h +++ b/include/openvswitch/ovs-p4rt.h @@ -9,6 +9,8 @@ #ifndef OPENVSWITCH_OVS_P4RT_H #define OPENVSWITCH_OVS_P4RT_H +#include +#include #include #ifdef __cplusplus @@ -18,9 +20,11 @@ extern "C" { /* When VSI ID is used as an action, we need add an offset of 16 and populate * the action */ #define VSI_ID_OFFSET 16 + /* As p4 program uses 8 bits for bridge ID, current limitation is we can go max * of 256 bridges (0-255) */ #define MAX_P4_BRIDGE_ID 255 + /* Source port for VxLAN should start from 2048, 0 to 2047 are reserved for * VSI/phy ports */ #define P4_VXLAN_SOURCE_PORT_OFFSET 2048 From 75ed9cd5e06f233e2b925c44b871346027ae0255 Mon Sep 17 00:00:00 2001 From: Derek G Foster Date: Fri, 19 Apr 2024 07:39:37 -0700 Subject: [PATCH 820/833] Minor housekeeping on ovs-p4rt.h - Used clang-format to format file. - Alphabetized function declarations. 
Signed-off-by: Derek G Foster --- include/openvswitch/ovs-p4rt.h | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/include/openvswitch/ovs-p4rt.h b/include/openvswitch/ovs-p4rt.h index 77bd2fa3e41..a4a6d364d84 100644 --- a/include/openvswitch/ovs-p4rt.h +++ b/include/openvswitch/ovs-p4rt.h @@ -106,20 +106,28 @@ struct ip_mac_map_info { // Function declarations extern void ConfigFdbTableEntry(struct mac_learning_info learn_info, bool insert_entry, const char* grpc_addr); -extern void ConfigTunnelTableEntry(struct tunnel_info tunnel_info, - bool insert_entry, const char* grpc_addr); -extern void ConfigTunnelSrcPortTableEntry(struct src_port_info tnl_sp, - bool insert_entry, const char* grpc_addr); + +extern void ConfigIpMacMapTableEntry(struct ip_mac_map_info learn_info, + bool insert_entry, const char* grpc_addr); + +extern void ConfigRxTunnelSrcTableEntry(struct tunnel_info tunnel_info, + bool insert_entry, + const char* grpc_addr); + extern void ConfigSrcPortTableEntry(struct src_port_info vsi_sp, bool insert_entry, const char* grpc_addr); -extern void ConfigVlanTableEntry(uint16_t vlan_id, bool insert_entry, const char* grpc_addr); -extern void ConfigRxTunnelSrcTableEntry(struct tunnel_info tunnel_info, - bool insert_entry, const char* grpc_addr); -extern enum ovs_tunnel_type TunnelTypeStrtoEnum(const char* tnl_type); +extern void ConfigTunnelSrcPortTableEntry(struct src_port_info tnl_sp, + bool insert_entry, + const char* grpc_addr); -extern void ConfigIpMacMapTableEntry(struct ip_mac_map_info learn_info, - bool insert_entry, const char* grpc_addr); +extern void ConfigTunnelTableEntry(struct tunnel_info tunnel_info, + bool insert_entry, const char* grpc_addr); + +extern void ConfigVlanTableEntry(uint16_t vlan_id, bool insert_entry, + const char* grpc_addr); + +extern enum ovs_tunnel_type TunnelTypeStrtoEnum(const char* tnl_type); #ifdef __cplusplus } // extern "C" From 55034d2f4b27330d4537d885ac65f8cef96f4c34 Mon 
Sep 17 00:00:00 2001 From: Derek Foster Date: Sun, 5 May 2024 05:57:57 -0700 Subject: [PATCH 821/833] Move ovs-p4rt.h header file - Moved ovs-p4rt.h from the include/openvswitch folder to a new include/ovsp4rt folder. Updated the buildsystem to reflect the change. - Updated #include paths from "openvswitch/ovs-p4rt.h" to "ovsp4rt/ovs-p4rt.h". - Implemented an install-data-hook to define a soft link from openvswitch/ovs-p4rt.h to ovsp4rt/ovs-p4rt.h, to maintain backward compatibility. Modified the dist-hook-git target to ignore the symlink. Signed-off-by: Derek Foster --- Makefile.am | 1 + include/automake.mk | 1 + include/openvswitch/automake.mk | 1 - include/openvswitch/ovs-p4rt.h | 137 +-------------------------- include/ovsp4rt/automake.mk | 9 ++ include/ovsp4rt/ovs-p4rt.h | 159 ++++++++++++++++++++++++++++++++ lib/mac-learning.c | 2 +- ofproto/ofproto-dpif-xlate.c | 2 +- ofproto/ofproto-dpif.c | 2 +- vswitchd/bridge.c | 2 +- 10 files changed, 175 insertions(+), 141 deletions(-) mode change 100644 => 120000 include/openvswitch/ovs-p4rt.h create mode 100644 include/ovsp4rt/automake.mk create mode 100644 include/ovsp4rt/ovs-p4rt.h diff --git a/Makefile.am b/Makefile.am index a4edd848540..b4bd2083947 100644 --- a/Makefile.am +++ b/Makefile.am @@ -209,6 +209,7 @@ dist-hook-git: distfiles @if test -e $(srcdir)/.git && (git --version) >/dev/null 2>&1; then \ (cd $(srcdir) && git ls-files) | grep -v '\.gitignore$$' | \ grep -v '\.gitattributes$$' | grep -v 'CODEOWNERS' | \ + grep -v 'openvswitch/ovs-p4rt.h' | \ LC_ALL=C sort -u > all-gitfiles; \ LC_ALL=C comm -1 -3 distfiles all-gitfiles > missing-distfiles; \ if test -s missing-distfiles; then \ diff --git a/include/automake.mk b/include/automake.mk index a276c680b53..38ce05a5433 100644 --- a/include/automake.mk +++ b/include/automake.mk @@ -12,6 +12,7 @@ CLEANFILES += include/odp-netlink.h include/odp-netlink-macros.h include include/openflow/automake.mk include include/openvswitch/automake.mk +include
include/ovsp4rt/automake.mk include include/sparse/automake.mk include include/windows/automake.mk include include/linux/automake.mk diff --git a/include/openvswitch/automake.mk b/include/openvswitch/automake.mk index 635909dbf04..47359a6d4cb 100644 --- a/include/openvswitch/automake.mk +++ b/include/openvswitch/automake.mk @@ -54,7 +54,6 @@ openvswitchinclude_HEADERS = \ if P4OVS openvswitchinclude_HEADERS += \ - include/openvswitch/ovs-p4rt.h \ include/openvswitch/p4ovs.h endif diff --git a/include/openvswitch/ovs-p4rt.h b/include/openvswitch/ovs-p4rt.h deleted file mode 100644 index a4a6d364d84..00000000000 --- a/include/openvswitch/ovs-p4rt.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2022-2023 Intel Corporation. - * SPDX-License-Identifier: Apache-2.0 - * - * Defines the public interface to an externally-supplied module - * that permits OvS to communicate with the P4 control plane. - */ - -#ifndef OPENVSWITCH_OVS_P4RT_H -#define OPENVSWITCH_OVS_P4RT_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* When VSI ID is used as an action, we need add an offset of 16 and populate - * the action */ -#define VSI_ID_OFFSET 16 - -/* As p4 program uses 8 bits for bridge ID, current limitation is we can go max - * of 256 bridges (0-255) */ -#define MAX_P4_BRIDGE_ID 255 - -/* Source port for VxLAN should start from 2048, 0 to 2047 are reserved for - * VSI/phy ports */ -#define P4_VXLAN_SOURCE_PORT_OFFSET 2048 - -/* This is a replica of port_vlan_mode in ofproto.h */ -enum p4_vlan_mode { - P4_PORT_VLAN_ACCESS, - P4_PORT_VLAN_TRUNK, - P4_PORT_VLAN_NATIVE_TAGGED, - P4_PORT_VLAN_NATIVE_UNTAGGED, - P4_PORT_VLAN_DOT1Q_TUNNEL, - P4_PORT_VLAN_UNSUPPORTED -}; - -enum ovs_tunnel_type { - OVS_TUNNEL_UNKNOWN = 0, - OVS_TUNNEL_VXLAN, - OVS_TUNNEL_GENEVE -}; - -struct p4_ipaddr { - uint8_t family; - uint8_t prefix_len; - union { - struct in_addr v4addr; - struct in6_addr v6addr; - } ip; -}; - -struct port_vlan_info { - enum p4_vlan_mode 
port_vlan_mode; - int port_vlan; -}; - -struct tunnel_info { - uint32_t ifindex; - uint32_t port_id; - uint32_t src_port; - struct p4_ipaddr local_ip; - struct p4_ipaddr remote_ip; - uint16_t dst_port; - uint16_t vni; - struct port_vlan_info vlan_info; - uint8_t bridge_id; - uint8_t tunnel_type; -}; - -struct src_port_info { - uint8_t bridge_id; - uint16_t vlan_id; - uint32_t src_port; -}; - -struct vlan_info { - uint32_t vlan_id; -}; - -struct mac_learning_info { - bool is_tunnel; - bool is_vlan; - uint8_t mac_addr[6]; - uint8_t bridge_id; - uint32_t src_port; - uint32_t rx_src_port; - struct port_vlan_info vlan_info; - union { - struct tunnel_info tnl_info; - struct vlan_info vln_info; - }; -}; - -struct ip_mac_map_info { - uint8_t src_mac_addr[6]; - uint8_t dst_mac_addr[6]; - struct p4_ipaddr src_ip_addr; - struct p4_ipaddr dst_ip_addr; -}; - -// Function declarations -extern void ConfigFdbTableEntry(struct mac_learning_info learn_info, - bool insert_entry, const char* grpc_addr); - -extern void ConfigIpMacMapTableEntry(struct ip_mac_map_info learn_info, - bool insert_entry, const char* grpc_addr); - -extern void ConfigRxTunnelSrcTableEntry(struct tunnel_info tunnel_info, - bool insert_entry, - const char* grpc_addr); - -extern void ConfigSrcPortTableEntry(struct src_port_info vsi_sp, - bool insert_entry, const char* grpc_addr); - -extern void ConfigTunnelSrcPortTableEntry(struct src_port_info tnl_sp, - bool insert_entry, - const char* grpc_addr); - -extern void ConfigTunnelTableEntry(struct tunnel_info tunnel_info, - bool insert_entry, const char* grpc_addr); - -extern void ConfigVlanTableEntry(uint16_t vlan_id, bool insert_entry, - const char* grpc_addr); - -extern enum ovs_tunnel_type TunnelTypeStrtoEnum(const char* tnl_type); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // OPENVSWITCH_OVS_P4RT_H diff --git a/include/openvswitch/ovs-p4rt.h b/include/openvswitch/ovs-p4rt.h new file mode 120000 index 00000000000..be1d2545e40 --- /dev/null +++ 
b/include/openvswitch/ovs-p4rt.h @@ -0,0 +1 @@ +../ovsp4rt/ovs-p4rt.h \ No newline at end of file diff --git a/include/ovsp4rt/automake.mk b/include/ovsp4rt/automake.mk new file mode 100644 index 00000000000..06d613908b4 --- /dev/null +++ b/include/ovsp4rt/automake.mk @@ -0,0 +1,9 @@ +if P4OVS +ovsp4rtincludedir = $(includedir)/ovsp4rt +ovsp4rtinclude_HEADERS = \ + include/ovsp4rt/ovs-p4rt.h + +install-data-hook: + cd $(DESTDIR)$(includedir)/openvswitch && \ + $(LN_S) ../ovsp4rt/ovs-p4rt.h ovs-p4rt.h +endif diff --git a/include/ovsp4rt/ovs-p4rt.h b/include/ovsp4rt/ovs-p4rt.h new file mode 100644 index 00000000000..d57baffa5cf --- /dev/null +++ b/include/ovsp4rt/ovs-p4rt.h @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2022-2024 Intel Corporation. + * SPDX-License-Identifier: Apache-2.0 + * + * Defines the public interface that permits OvS to send p4runtime + * messages to infrap4d. + * + * --------------------------------------------------------------------- + * IMPORTANT: + * + * This file is moving from the ovs repository to the ovs-p4rt directory + * in the networking-recipe repository. + * + * To maintain backward compatibility during the transition, there are + * currently TWO copies of the file, one in each repository. + * + * If you make any changes, they must be made to BOTH copies. + * --------------------------------------------------------------------- + */ + +#ifndef OVSP4RT_OVS_P4RT_H_ +#define OVSP4RT_OVS_P4RT_H_ + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +//---------------------------------------------------------------------- +// Constants +//---------------------------------------------------------------------- + +/* When VSI ID is used as an action, we need to add an offset of 16 and + * populate the action. */ +#define VSI_ID_OFFSET 16 + +/* As the p4 program uses 8 bits for bridge ID, we are currently limited + * to a maximum of 256 bridges (0-255). 
*/ +#define MAX_P4_BRIDGE_ID 255 + +/* Source ports for VxLAN should start from 2048. Ports 0 through 2047 + * are reserved for VSI/phy ports. */ +#define P4_VXLAN_SOURCE_PORT_OFFSET 2048 + +//---------------------------------------------------------------------- +// Data types +//---------------------------------------------------------------------- + +/* This is a replica of port_vlan_mode in ofproto.h */ +enum p4_vlan_mode { + P4_PORT_VLAN_ACCESS, + P4_PORT_VLAN_TRUNK, + P4_PORT_VLAN_NATIVE_TAGGED, + P4_PORT_VLAN_NATIVE_UNTAGGED, + P4_PORT_VLAN_DOT1Q_TUNNEL, + P4_PORT_VLAN_UNSUPPORTED +}; + +enum ovs_tunnel_type { + OVS_TUNNEL_UNKNOWN = 0, + OVS_TUNNEL_VXLAN, + OVS_TUNNEL_GENEVE +}; + +struct p4_ipaddr { + uint8_t family; + uint8_t prefix_len; + union { + struct in_addr v4addr; + struct in6_addr v6addr; + } ip; +}; + +struct port_vlan_info { + enum p4_vlan_mode port_vlan_mode; + int port_vlan; +}; + +struct tunnel_info { + uint32_t ifindex; + uint32_t port_id; + uint32_t src_port; + struct p4_ipaddr local_ip; + struct p4_ipaddr remote_ip; + uint16_t dst_port; + uint16_t vni; + struct port_vlan_info vlan_info; + uint8_t bridge_id; + uint8_t tunnel_type; +}; + +struct src_port_info { + uint8_t bridge_id; + uint16_t vlan_id; + uint32_t src_port; +}; + +struct vlan_info { + uint32_t vlan_id; +}; + +struct mac_learning_info { + bool is_tunnel; + bool is_vlan; + uint8_t mac_addr[6]; + uint8_t bridge_id; + uint32_t src_port; + uint32_t rx_src_port; + struct port_vlan_info vlan_info; + union { + struct tunnel_info tnl_info; + struct vlan_info vln_info; + }; +}; + +struct ip_mac_map_info { + uint8_t src_mac_addr[6]; + uint8_t dst_mac_addr[6]; + struct p4_ipaddr src_ip_addr; + struct p4_ipaddr dst_ip_addr; +}; + +//---------------------------------------------------------------------- +// Function prototypes +//---------------------------------------------------------------------- + +extern void ConfigFdbTableEntry(struct mac_learning_info learn_info, + bool insert_entry, 
const char* grpc_addr); + +extern void ConfigIpMacMapTableEntry(struct ip_mac_map_info learn_info, + bool insert_entry, const char* grpc_addr); + +extern void ConfigRxTunnelSrcTableEntry(struct tunnel_info tunnel_info, + bool insert_entry, + const char* grpc_addr); + +extern void ConfigSrcPortTableEntry(struct src_port_info vsi_sp, + bool insert_entry, const char* grpc_addr); + +extern void ConfigTunnelSrcPortTableEntry(struct src_port_info tnl_sp, + bool insert_entry, + const char* grpc_addr); + +extern void ConfigTunnelTableEntry(struct tunnel_info tunnel_info, + bool insert_entry, const char* grpc_addr); + +extern void ConfigVlanTableEntry(uint16_t vlan_id, bool insert_entry, + const char* grpc_addr); + +extern enum ovs_tunnel_type TunnelTypeStrtoEnum(const char* tnl_type); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // OVSP4RT_OVS_P4RT_H_ diff --git a/lib/mac-learning.c b/lib/mac-learning.c index e563d8f064c..8209013722a 100644 --- a/lib/mac-learning.c +++ b/lib/mac-learning.c @@ -32,7 +32,7 @@ #include "vlan-bitmap.h" #if defined(P4OVS) -#include "openvswitch/ovs-p4rt.h" +#include "ovsp4rt/ovs-p4rt.h" #include "openvswitch/p4ovs.h" #endif diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index f24e3675b4a..a3dafa93edd 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -75,7 +75,7 @@ #include #include #include -#include "openvswitch/ovs-p4rt.h" +#include "ovsp4rt/ovs-p4rt.h" #include "openvswitch/p4ovs.h" #endif //P4OVS diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index a65089f740b..43a95958509 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -76,7 +76,7 @@ #include "vlan-bitmap.h" #if defined(P4OVS) -#include "openvswitch/ovs-p4rt.h" +#include "ovsp4rt/ovs-p4rt.h" #endif VLOG_DEFINE_THIS_MODULE(ofproto_dpif); diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index 537ad1a2ffb..f08414ea8c6 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -75,7 +75,7 @@ 
#if defined(P4OVS) #include -#include "openvswitch/ovs-p4rt.h" +#include "ovsp4rt/ovs-p4rt.h" #include "openvswitch/p4ovs.h" static int32_t From 439b0e7a0b200d28586e53f6ca455434bb50d62e Mon Sep 17 00:00:00 2001 From: Derek Foster Date: Tue, 21 May 2024 10:05:50 -0700 Subject: [PATCH 822/833] Rename ovsp4rt public functions - Renamed the functions that ovs-p4rt publishes for use by OVS to conform better to C naming conventions. - Added .vscode/ and .vslick/ to .gitignore. Signed-off-by: Derek Foster --- .gitignore | 2 ++ include/ovsp4rt/ovs-p4rt.h | 33 ++++++++++++++++++--------------- lib/mac-learning.c | 4 ++-- ofproto/ofproto-dpif-xlate.c | 8 ++++---- vswitchd/bridge.c | 25 ++++++++++++++++--------- 5 files changed, 42 insertions(+), 30 deletions(-) diff --git a/.gitignore b/.gitignore index b3fb29ec078..69bc5c9cbf6 100644 --- a/.gitignore +++ b/.gitignore @@ -80,3 +80,5 @@ testsuite.tmp.orig /Documentation/_build /.venv /cxx-check +/.vscode/ +/.vslick/ diff --git a/include/ovsp4rt/ovs-p4rt.h b/include/ovsp4rt/ovs-p4rt.h index d57baffa5cf..1bdfc4078cd 100644 --- a/include/ovsp4rt/ovs-p4rt.h +++ b/include/ovsp4rt/ovs-p4rt.h @@ -127,30 +127,33 @@ struct ip_mac_map_info { // Function prototypes //---------------------------------------------------------------------- -extern void ConfigFdbTableEntry(struct mac_learning_info learn_info, - bool insert_entry, const char* grpc_addr); - -extern void ConfigIpMacMapTableEntry(struct ip_mac_map_info learn_info, +extern void ovsp4rt_config_fdb_entry(struct mac_learning_info learn_info, bool insert_entry, const char* grpc_addr); -extern void ConfigRxTunnelSrcTableEntry(struct tunnel_info tunnel_info, - bool insert_entry, - const char* grpc_addr); +extern void ovsp4rt_config_ip_mac_map_entry(struct ip_mac_map_info learn_info, + bool insert_entry, + const char* grpc_addr); -extern void ConfigSrcPortTableEntry(struct src_port_info vsi_sp, - bool insert_entry, const char* grpc_addr); +extern void 
ovsp4rt_config_rx_tunnel_src_entry(struct tunnel_info tunnel_info, + bool insert_entry, + const char* grpc_addr); -extern void ConfigTunnelSrcPortTableEntry(struct src_port_info tnl_sp, +extern void ovsp4rt_config_src_port_entry(struct src_port_info vsi_sp, bool insert_entry, const char* grpc_addr); -extern void ConfigTunnelTableEntry(struct tunnel_info tunnel_info, - bool insert_entry, const char* grpc_addr); +extern void ovsp4rt_config_tunnel_src_port_entry(struct src_port_info tnl_sp, + bool insert_entry, + const char* grpc_addr); + +extern void ovsp4rt_config_tunnel_entry(struct tunnel_info tunnel_info, + bool insert_entry, + const char* grpc_addr); -extern void ConfigVlanTableEntry(uint16_t vlan_id, bool insert_entry, - const char* grpc_addr); +extern void ovsp4rt_config_vlan_entry(uint16_t vlan_id, bool insert_entry, + const char* grpc_addr); -extern enum ovs_tunnel_type TunnelTypeStrtoEnum(const char* tnl_type); +extern enum ovs_tunnel_type ovsp4rt_str_to_tunnel_type(const char* tnl_type); #ifdef __cplusplus } // extern "C" diff --git a/lib/mac-learning.c b/lib/mac-learning.c index 8209013722a..af2bd1dca56 100644 --- a/lib/mac-learning.c +++ b/lib/mac-learning.c @@ -630,7 +630,7 @@ mac_learning_expire(struct mac_learning *ml, struct mac_entry *e) memcpy(fdb_info.mac_addr, e->mac.ea, sizeof(fdb_info.mac_addr)); fdb_info.is_vlan = true; fdb_info.bridge_id = ml->p4_bridge_id; - ConfigFdbTableEntry(fdb_info, false, grpc_addr); + ovsp4rt_config_fdb_entry(fdb_info, false, grpc_addr); // Remove the corresponding ip_mac tables both for src ip and dst ip struct ip_mac_map_info ip_info; @@ -640,7 +640,7 @@ mac_learning_expire(struct mac_learning *ml, struct mac_entry *e) ip_info.dst_ip_addr.family = AF_INET; ip_info.dst_ip_addr.ip.v4addr.s_addr = e->nw_dst; // TODO: Update IPv6 fields when IPv6 support is added - ConfigIpMacMapTableEntry(ip_info, false, grpc_addr); + ovsp4rt_config_ip_mac_map_entry(ip_info, false, grpc_addr); } #endif // P4OVS free(e); diff --git 
a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index a3dafa93edd..7635d9149d6 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -3269,7 +3269,7 @@ get_fdb_data(struct xport *port, struct eth_addr mac_addr, fdb_info->tnl_info.dst_port = underlay_tnl->dst_port; fdb_info->tnl_info.vni = underlay_tnl->vni; const char *tnl_type = tnl_port_get_type(port->ofport); - fdb_info->tnl_info.tunnel_type = TunnelTypeStrtoEnum(tnl_type); + fdb_info->tnl_info.tunnel_type = ovsp4rt_str_to_tunnel_type(tnl_type); if (underlay_tnl->ipv6_src.__in6_u.__u6_addr32[0]) { /* IPv6 tunnel configuration */ @@ -3459,7 +3459,7 @@ xlate_normal(struct xlate_ctx *ctx) if (ovs_p4_offload_enabled()) { p4ovs_lock(&p4ovs_fdb_entry_lock); if (!get_fdb_data(ovs_port, flow->dl_src, &fdb_info)) { - ConfigFdbTableEntry(fdb_info, true, grpc_addr); + ovsp4rt_config_fdb_entry(fdb_info, true, grpc_addr); ctx->xbridge->ml->p4_bridge_id = ovs_port->xbundle->p4_bridge_id; } else { VLOG_DBG("Error retrieving FDB information, skipping programming " @@ -3483,7 +3483,7 @@ xlate_normal(struct xlate_ctx *ctx) if (ovs_p4_offload_enabled()) { struct ip_mac_map_info ip_info = {0}; if (update_ip_mac_map_info(flow, &ip_info)) { - ConfigIpMacMapTableEntry(ip_info, true, grpc_addr); + ovsp4rt_config_ip_mac_map_entry(ip_info, true, grpc_addr); } } else { VLOG_DBG("P4 offload disabled, skipping programming "); @@ -9054,7 +9054,7 @@ xlate_add_static_mac_entry(const struct ofproto_dpif *ofproto, memset(&fdb_info, 0, sizeof(fdb_info)); if (!get_fdb_data(ovs_port, dl_src, &fdb_info)) { - ConfigFdbTableEntry(fdb_info, true, grpc_addr); + ovsp4rt_config_fdb_entry(fdb_info, true, grpc_addr); ofproto->ml->p4_bridge_id = ovs_port->xbundle->p4_bridge_id; } else { VLOG_DBG("Error retrieving FDB information, skipping programming " diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index f08414ea8c6..99e10cc7298 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -2300,7 +2300,7 @@ 
get_tunnel_data(struct netdev *netdev, tnl_info->vni = underlay_tnl->vni; const char* tnl_type = netdev_get_type(netdev); - tnl_info->tunnel_type = TunnelTypeStrtoEnum(tnl_type); + tnl_info->tunnel_type = ovsp4rt_str_to_tunnel_type(tnl_type); return 0; } @@ -2373,8 +2373,9 @@ ConfigureP4Target(struct bridge *br, struct port *port, tnl_info.bridge_id = br->p4_bridge_id; tnl_info.src_port = port->p4_src_port; - ConfigTunnelTableEntry(tnl_info, insert_entry, grpc_addr); - ConfigRxTunnelSrcTableEntry(tnl_info, insert_entry, grpc_addr); + ovsp4rt_config_tunnel_entry(tnl_info, insert_entry, grpc_addr); + ovsp4rt_config_rx_tunnel_src_entry(tnl_info, insert_entry, + grpc_addr); } else { VLOG_ERR("Error retrieving tunnel information, " "skipping programming P4 entry"); @@ -2389,15 +2390,18 @@ ConfigureP4Target(struct bridge *br, struct port *port, port->p4_vlan_id, port->p4_src_port}; /* When VLAN tag is configured */ - ConfigVlanTableEntry(port->p4_vlan_id, insert_entry, grpc_addr); - ConfigTunnelSrcPortTableEntry(tnl_src_port_info, insert_entry, grpc_addr); + ovsp4rt_config_vlan_entry(port->p4_vlan_id, insert_entry, + grpc_addr); + ovsp4rt_config_tunnel_src_port_entry(tnl_src_port_info, + insert_entry, grpc_addr); } else { /* Wild card VLAN 0 */ struct src_port_info tnl_src_port_info = {br->p4_bridge_id, 0, port->p4_src_port}; - ConfigTunnelSrcPortTableEntry(tnl_src_port_info, insert_entry, grpc_addr); + ovsp4rt_config_tunnel_src_port_entry(tnl_src_port_info, + insert_entry, grpc_addr); } port->is_src_port_configured = insert_entry; } else if (!insert_entry || iface->cfg->mac_in_use) { @@ -2417,8 +2421,10 @@ ConfigureP4Target(struct bridge *br, struct port *port, port->p4_vlan_id, port->p4_src_port}; - ConfigVlanTableEntry(port->p4_vlan_id, insert_entry, grpc_addr); - ConfigSrcPortTableEntry(vsi_src_port_info, insert_entry, grpc_addr); + ovsp4rt_config_vlan_entry(port->p4_vlan_id, insert_entry, + grpc_addr); + ovsp4rt_config_src_port_entry(vsi_src_port_info, insert_entry, 
+ grpc_addr); } else if (port->p4_vlan_mode == P4_PORT_VLAN_UNSUPPORTED) { /* Do nothing, unsupported vlan mode */ } else if (port->p4_src_port) { @@ -2426,7 +2432,8 @@ ConfigureP4Target(struct bridge *br, struct port *port, 0, port->p4_src_port}; - ConfigSrcPortTableEntry(vsi_src_port_info, insert_entry, grpc_addr); + ovsp4rt_config_src_port_entry(vsi_src_port_info, insert_entry, + grpc_addr); } else { VLOG_DBG("Invalid P4 use case for source port to " "bridge mapping"); From d382e8de89a8177f99e44d12e2b3dc11e962a46b Mon Sep 17 00:00:00 2001 From: Derek G Foster Date: Fri, 31 May 2024 14:07:02 -0700 Subject: [PATCH 823/833] Housekeeping and bug fixes (#116) 1. Added or updated Intel copyright notices. 2. Corrected #elif directives to #else in ofproto-dpif-xlate. 3. Added missing P4OVS conditionals in lib/netdev.h and lib/netdev-vport.c. 4. Removed trailing whitespace. Changes #2 and #3 fix compilation errors. Change #3 fixes the eight unit tests that used to fail. Signed-off-by: Derek Foster --- Makefile.am | 1 + configure.ac | 2 +- include/openvswitch/p4ovs.h | 2 +- lib/dpif-netlink-rtnl.c | 2 +- lib/mac-learning.h | 1 + lib/netdev-vport.c | 10 +++++++--- lib/netdev.h | 5 +++-- lib/util.h | 1 + ofproto/ofproto-dpif-xlate.c | 32 ++++++++++++++++++-------------- ofproto/ofproto-dpif-xlate.h | 3 ++- ofproto/ofproto-dpif.c | 5 +++-- ofproto/ofproto.h | 2 ++ vswitchd/bridge.c | 1 + vswitchd/ovs-vswitchd.c | 1 + 14 files changed, 43 insertions(+), 25 deletions(-) diff --git a/Makefile.am b/Makefile.am index b4bd2083947..0a7db4debdf 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,4 +1,5 @@ # Copyright (C) 2007-2017 Nicira, Inc. +# Copyright (c) 2023-2024 Intel Corporation. 
# # Copying and distribution of this file, with or without modification, # are permitted in any medium without royalty provided the copyright diff --git a/configure.ac b/configure.ac index 0eb62b496b5..cd13f523234 100644 --- a/configure.ac +++ b/configure.ac @@ -1,5 +1,5 @@ # Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc. -# Copyright (c) 2021 Intel Corporation. +# Copyright (c) 2021-2022 Intel Corporation. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/include/openvswitch/p4ovs.h b/include/openvswitch/p4ovs.h index c5b2967b125..8eb49bd5914 100644 --- a/include/openvswitch/p4ovs.h +++ b/include/openvswitch/p4ovs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023 Intel Corporation. + * Copyright (c) 2023-2024 Intel Corporation. * SPDX-License-Identifier: Apache-2.0 * * Defines the P4 OvS specific definitions. These need be used under diff --git a/lib/dpif-netlink-rtnl.c b/lib/dpif-netlink-rtnl.c index e2e0b40d685..174f3b7c699 100644 --- a/lib/dpif-netlink-rtnl.c +++ b/lib/dpif-netlink-rtnl.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2017 Red Hat, Inc. - * Copyright (c) 2021 Intel Corporation. + * Copyright (c) 2021-2022 Intel Corporation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/lib/mac-learning.h b/lib/mac-learning.h index 0b7eaaa35ca..0502b281153 100644 --- a/lib/mac-learning.h +++ b/lib/mac-learning.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2015 Nicira, Inc. + * Copyright (c) 2023-2024 Intel Corporation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c index b3d1fb041f8..1e11c63190c 100644 --- a/lib/netdev-vport.c +++ b/lib/netdev-vport.c @@ -1,7 +1,7 @@ /* * Copyright (c) 2010, 2011, 2012, 2013, 2014, 2017 Nicira, Inc. * Copyright (c) 2016 Red Hat, Inc. - * Copyright (c) 2021 Intel Corporation. + * Copyright (c) 2021-2022 Intel Corporation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -715,10 +715,14 @@ set_tunnel_config(struct netdev *dev_, const struct smap *args, char **errp) if (!strcmp(node->value, "false")) { tnl_cfg.dont_fragment = false; } - } else if (!strcmp(node->key, "key") && strcmp(node->value, "flow")) { +#if defined(P4OVS) + } else if (!strcmp(node->key, "key") && + strcmp(node->value, "flow")) { /* Add VNI to tunnel config if the value is not flow */ tnl_cfg.vni = atoi(node->value); - } else if (!strcmp(node->key, "in_key") || +#endif + } else if (!strcmp(node->key, "key") || + !strcmp(node->key, "in_key") || !strcmp(node->key, "out_key") || !strcmp(node->key, "packet_type")) { /* Handled separately below. */ diff --git a/lib/netdev.h b/lib/netdev.h index d298cabe779..2f2ec1e18b0 100644 --- a/lib/netdev.h +++ b/lib/netdev.h @@ -1,6 +1,6 @@ /* * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc. - * Copyright (c) 2021 Intel Corporation. + * Copyright (c) 2021-2022 Intel Corporation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -175,7 +175,8 @@ struct netdev_tunnel_config { #define SRV6_MAX_SEGS 6 struct in6_addr srv6_segs[SRV6_MAX_SEGS]; enum netdev_srv6_flowlabel srv6_flowlabel; -#ifdef P4OVS + +#if defined(P4OVS) uint32_t vni; #endif }; diff --git a/lib/util.h b/lib/util.h index c1fd120bc42..385a425b0e7 100644 --- a/lib/util.h +++ b/lib/util.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc. 
+ * Copyright (c) 2021-2022 Intel Corporation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 7635d9149d6..e0d81cf2cbc 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -1,4 +1,5 @@ /* Copyright (c) 2009-2017, 2019-2020 Nicira, Inc. + * Copyright (c) 2022-2024 Intel Corporation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -70,14 +71,15 @@ #include "vlan-bitmap.h" #if defined(P4OVS) -#include "lib/netdev.h" #include #include #include #include + +#include "lib/netdev.h" #include "ovsp4rt/ovs-p4rt.h" #include "openvswitch/p4ovs.h" -#endif //P4OVS +#endif COVERAGE_DEFINE(xlate_actions); COVERAGE_DEFINE(xlate_actions_oversize); @@ -714,7 +716,7 @@ static void xlate_xbundle_set(struct xbundle *xbundle, enum port_priority_tags_mode, const struct bond *bond, const struct lacp *lacp, bool floodable, bool protected, uint8_t p4_bridge_id); -#elif +#else static void xlate_xbundle_set(struct xbundle *xbundle, enum port_vlan_mode vlan_mode, uint16_t qinq_ethtype, int vlan, @@ -1165,7 +1167,7 @@ xlate_xbundle_set(struct xbundle *xbundle, enum port_priority_tags_mode use_priority_tags, const struct bond *bond, const struct lacp *lacp, bool floodable, bool protected, uint8_t p4_bridge_id) -#elif +#else static void xlate_xbundle_set(struct xbundle *xbundle, enum port_vlan_mode vlan_mode, uint16_t qinq_ethtype, @@ -1291,7 +1293,7 @@ xlate_xbundle_copy(struct xbridge *xbridge, struct xbundle *xbundle) xbundle->use_priority_tags, xbundle->bond, xbundle->lacp, xbundle->floodable, xbundle->protected, xbundle->p4_bridge_id); -#elif +#else xlate_xbundle_set(new_xbundle, xbundle->vlan_mode, xbundle->qinq_ethtype, xbundle->vlan, xbundle->trunks, xbundle->cvlans, xbundle->use_priority_tags, xbundle->bond, 
xbundle->lacp, @@ -1514,7 +1516,7 @@ xlate_bundle_set(struct ofproto_dpif *ofproto, struct ofbundle *ofbundle, enum port_priority_tags_mode use_priority_tags, const struct bond *bond, const struct lacp *lacp, bool floodable, bool protected, uint8_t p4_bridge_id) -#elif +#else void xlate_bundle_set(struct ofproto_dpif *ofproto, struct ofbundle *ofbundle, const char *name, enum port_vlan_mode vlan_mode, @@ -1545,7 +1547,7 @@ xlate_bundle_set(struct ofproto_dpif *ofproto, struct ofbundle *ofbundle, xlate_xbundle_set(xbundle, vlan_mode, qinq_ethtype, vlan, trunks, cvlans, use_priority_tags, bond, lacp, floodable, protected, p4_bridge_id); -#elif +#else xlate_xbundle_set(xbundle, vlan_mode, qinq_ethtype, vlan, trunks, cvlans, use_priority_tags, bond, lacp, floodable, protected); #endif @@ -3213,6 +3215,7 @@ is_ip_local_multicast(const struct flow *flow, struct flow_wildcards *wc) } #if defined(P4OVS) + static enum p4_vlan_mode get_p4_vlan_mode(enum port_vlan_mode vlan_mode) { if (vlan_mode == PORT_VLAN_ACCESS) @@ -3327,7 +3330,7 @@ get_fdb_data(struct xport *port, struct eth_addr mac_addr, close(fd); VLOG_ERR("Error retrieving vlan id through ioctl"); return -1; - } + } fdb_info->vln_info.vlan_id = if_request.u.VID; close(fd); } @@ -3362,10 +3365,11 @@ update_ip_mac_map_info(const struct flow *flow, ip_mac_map_info->dst_ip_addr.family = AF_INET; ip_mac_map_info->dst_ip_addr.ip.v4addr.s_addr = flow->nw_dst; - } + } return -1; } + #endif // P4OVS static void @@ -3375,7 +3379,7 @@ xlate_normal(struct xlate_ctx *ctx) struct flow *flow = &ctx->xin->flow; #if defined(P4OVS) bool is_mac_learn_required = false; -#endif //P4OVS +#endif struct xbundle *in_xbundle; struct xport *in_port; struct mac_entry *mac; @@ -3444,7 +3448,7 @@ xlate_normal(struct xlate_ctx *ctx) flow->dl_src, vlan,is_grat_arp, in_xbundle->bond != NULL, in_xbundle->ofbundle); -#endif +#endif //The function below calls mac_learning_insert update_learning_table(ctx, in_xbundle, flow->dl_src, vlan, is_grat_arp); 
@@ -3467,10 +3471,10 @@ xlate_normal(struct xlate_ctx *ctx) } p4ovs_unlock(&p4ovs_fdb_entry_lock); } - + // Update the recently added MAC entry with flow info struct mac_entry *e; - + ovs_rwlock_wrlock(&ctx->xbridge->ml->rwlock); e = mac_learning_lookup(ctx->xbridge->ml, flow->dl_src, vlan); if (e) { @@ -3479,7 +3483,7 @@ xlate_normal(struct xlate_ctx *ctx) //TODO: Update IPv6 info in MAC entry when IPv6 support is added } ovs_rwlock_unlock(&ctx->xbridge->ml->rwlock); - + if (ovs_p4_offload_enabled()) { struct ip_mac_map_info ip_info = {0}; if (update_ip_mac_map_info(flow, &ip_info)) { diff --git a/ofproto/ofproto-dpif-xlate.h b/ofproto/ofproto-dpif-xlate.h index be7caa45182..46977dd931f 100644 --- a/ofproto/ofproto-dpif-xlate.h +++ b/ofproto/ofproto-dpif-xlate.h @@ -1,4 +1,5 @@ /* Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc. + * Copyright (c) 2023 Intel Corporation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -195,7 +196,7 @@ void xlate_bundle_set(struct ofproto_dpif *, struct ofbundle *, enum port_priority_tags_mode, const struct bond *, const struct lacp *, bool floodable, bool protected, uint8_t p4_bridge_id); -#elif +#else void xlate_bundle_set(struct ofproto_dpif *, struct ofbundle *, const char *name, enum port_vlan_mode, uint16_t qinq_ethtype, int vlan, diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index 43a95958509..e35a0ac2319 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -1,4 +1,5 @@ -/* +/* [no original copyright notice] + * Copyright (c) 2022-2024 Intel Corporation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -507,7 +508,7 @@ type_run(const char *type) bundle->bond, bundle->lacp, bundle->floodable, bundle->protected, bundle->p4_bridge_id); -#elif +#else xlate_bundle_set(ofproto, bundle, bundle->name, bundle->vlan_mode, bundle->qinq_ethtype, bundle->vlan, bundle->trunks, bundle->cvlans, diff --git a/ofproto/ofproto.h b/ofproto/ofproto.h index df69f3ce620..ca136da5a76 100644 --- a/ofproto/ofproto.h +++ b/ofproto/ofproto.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 Nicira, Inc. + * Copyright (c) 2023 Intel Corporation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +23,7 @@ #include #include #include + #include "cfm.h" #include "classifier.h" #include "flow.h" diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index 99e10cc7298..e46a42b6e8b 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -1,4 +1,5 @@ /* Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc. + * Copyright (c) 2023-2024 Intel Corporation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/vswitchd/ovs-vswitchd.c b/vswitchd/ovs-vswitchd.c index 3de92204970..ea1a6cd0737 100644 --- a/vswitchd/ovs-vswitchd.c +++ b/vswitchd/ovs-vswitchd.c @@ -1,4 +1,5 @@ /* Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 Nicira, Inc. + * Copyright (c) 2024 Intel Corporation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
From 81faec7ad326ec11bab51b087c9c41a508c7afcc Mon Sep 17 00:00:00 2001 From: Derek G Foster Date: Thu, 13 Jun 2024 14:07:54 -0700 Subject: [PATCH 824/833] Move --grpc_addr option to P4OVS help section (#117) Signed-off-by: Derek Foster --- vswitchd/ovs-vswitchd.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vswitchd/ovs-vswitchd.c b/vswitchd/ovs-vswitchd.c index ea1a6cd0737..b25e4c33416 100644 --- a/vswitchd/ovs-vswitchd.c +++ b/vswitchd/ovs-vswitchd.c @@ -323,12 +323,14 @@ usage(void) "Configuration of DPDK via command-line is removed from this\n" "version of Open vSwitch. DPDK is configured through ovsdb.\n" ); +#if defined(P4OVS) + printf("\nP4OVS options:\n" + " -g, --grpc-addr=ADDR gRPC server address for P4Runtime server\n" + ); +#endif printf("\nOther options:\n" " --unixctl=SOCKET override default control socket name\n" " -h, --help display this help message\n" -#if defined(P4OVS) - " -g, --grpc-addr gRPC server address for P4Runtime server\n" -#endif " -V, --version display version information\n"); exit(EXIT_SUCCESS); } From edd49441be647c82c6a3f1ca25bbfe98f965b66a Mon Sep 17 00:00:00 2001 From: Sabeel Ansari <35787514+5abeel@users.noreply.github.com> Date: Fri, 14 Jun 2024 17:06:48 -0500 Subject: [PATCH 825/833] Manually terminate grpc addr string to address Coverity hit (#118) (#120) - Manually terminate grpc addr string to address Coverity hit - Revise coverity fix (#119) Signed-off-by: Sabeel Ansari Signed-off-by: Derek Foster Co-authored-by: Derek G Foster --- vswitchd/p4ovs.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vswitchd/p4ovs.c b/vswitchd/p4ovs.c index c416a7a8df2..c75f83357f4 100644 --- a/vswitchd/p4ovs.c +++ b/vswitchd/p4ovs.c @@ -13,10 +13,14 @@ char grpc_addr[32] = "localhost:9559"; static const char grpc_port[] = ":9559"; void ovs_set_grpc_addr(const char* optarg) { - if (strlen(optarg) + sizeof(grpc_port) >= sizeof(grpc_addr)) { - ovs_fatal(0, "--grpc_addr is too long (> %lu 
characters)", - sizeof(grpc_addr) - sizeof(grpc_port)); + size_t maximum = sizeof(grpc_addr) - strlen(grpc_port) - 1; + size_t actual = strlen(optarg); + + if (actual > maximum) { + ovs_fatal(0, "--grpc-addr (%lu chars) is too long (> %lu chars)", + actual, maximum); } + strncpy(grpc_addr, optarg, sizeof(grpc_addr)); strcat(grpc_addr, grpc_port); } From 28464fbac13dd16ce4577fc266a70afdd9d89239 Mon Sep 17 00:00:00 2001 From: Derek G Foster Date: Mon, 24 Jun 2024 08:41:20 -0700 Subject: [PATCH 826/833] Update ovs-p4rt.h infrastructure (#122) - Removed symbolic link from include/openvswitch/ovs-p4rt.h to include/ovsp4rt/ovs-p4rt.h. Signed-off-by: Derek Foster --- Makefile.am | 2 +- include/openvswitch/ovs-p4rt.h | 1 - include/ovsp4rt/automake.mk | 4 ---- 3 files changed, 1 insertion(+), 6 deletions(-) delete mode 120000 include/openvswitch/ovs-p4rt.h diff --git a/Makefile.am b/Makefile.am index 0a7db4debdf..f47217e0bc3 100644 --- a/Makefile.am +++ b/Makefile.am @@ -210,7 +210,7 @@ dist-hook-git: distfiles @if test -e $(srcdir)/.git && (git --version) >/dev/null 2>&1; then \ (cd $(srcdir) && git ls-files) | grep -v '\.gitignore$$' | \ grep -v '\.gitattributes$$' | grep -v 'CODEOWNERS' | \ - grep -v 'openvswitch/ovs-p4rt.h' | \ + grep -v 'ovsp4rt/ovs-p4rt.h' | \ LC_ALL=C sort -u > all-gitfiles; \ LC_ALL=C comm -1 -3 distfiles all-gitfiles > missing-distfiles; \ if test -s missing-distfiles; then \ diff --git a/include/openvswitch/ovs-p4rt.h b/include/openvswitch/ovs-p4rt.h deleted file mode 120000 index be1d2545e40..00000000000 --- a/include/openvswitch/ovs-p4rt.h +++ /dev/null @@ -1 +0,0 @@ -../ovsp4rt/ovs-p4rt.h \ No newline at end of file diff --git a/include/ovsp4rt/automake.mk b/include/ovsp4rt/automake.mk index 06d613908b4..5a93f3d701e 100644 --- a/include/ovsp4rt/automake.mk +++ b/include/ovsp4rt/automake.mk @@ -2,8 +2,4 @@ if P4OVS ovsp4rtincludedir = $(includedir)/ovsp4rt ovsp4rtinclude_HEADERS = \ include/ovsp4rt/ovs-p4rt.h - -install-data-hook: - cd 
$(DESTDIR)$(includedir)/openvswitch && \ - $(LN_S) ../ovsp4rt/ovs-p4rt.h ovs-p4rt.h endif From 1baafe761a7f44361b06182da7a7925b0cf7eff3 Mon Sep 17 00:00:00 2001 From: Derek G Foster Date: Mon, 24 Jun 2024 13:20:49 -0700 Subject: [PATCH 827/833] Update get_p4_vlan_mode() to use switch statement (#121) Signed-off-by: Derek Foster --- ofproto/ofproto-dpif-xlate.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index e0d81cf2cbc..46f15f8992d 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -3218,16 +3218,18 @@ is_ip_local_multicast(const struct flow *flow, struct flow_wildcards *wc) static enum p4_vlan_mode get_p4_vlan_mode(enum port_vlan_mode vlan_mode) { - if (vlan_mode == PORT_VLAN_ACCESS) + switch (vlan_mode) { + case PORT_VLAN_ACCESS: return P4_PORT_VLAN_ACCESS; - else if (vlan_mode == PORT_VLAN_TRUNK) + case PORT_VLAN_TRUNK: return P4_PORT_VLAN_TRUNK; - else if (vlan_mode == PORT_VLAN_NATIVE_TAGGED) + case PORT_VLAN_NATIVE_TAGGED: return P4_PORT_VLAN_NATIVE_TAGGED; - else if (vlan_mode == PORT_VLAN_NATIVE_UNTAGGED) + case PORT_VLAN_NATIVE_UNTAGGED: return P4_PORT_VLAN_NATIVE_UNTAGGED; - else + default: return P4_PORT_VLAN_UNSUPPORTED; + } } static int32_t From 03cef450dd830ff1d61b42d278a5723468837114 Mon Sep 17 00:00:00 2001 From: Derek G Foster Date: Mon, 24 Jun 2024 13:21:38 -0700 Subject: [PATCH 828/833] Rename grpc_addr to p4ovs_grpc_addr (#123) - Renamed the global `grpc_addr` variable to `p4ovs_grpc_addr`. Symbols we introduce into the global namespace should be prefixed to make their affinity clear and to reduce the likelihood of collision with other global names. 
Signed-off-by: Derek Foster --- include/openvswitch/p4ovs.h | 2 +- lib/mac-learning.c | 4 ++-- ofproto/ofproto-dpif-xlate.c | 6 +++--- vswitchd/bridge.c | 16 ++++++++-------- vswitchd/p4ovs.c | 9 ++++----- 5 files changed, 18 insertions(+), 19 deletions(-) diff --git a/include/openvswitch/p4ovs.h b/include/openvswitch/p4ovs.h index 8eb49bd5914..a209409c72d 100644 --- a/include/openvswitch/p4ovs.h +++ b/include/openvswitch/p4ovs.h @@ -22,7 +22,7 @@ extern "C" { extern struct ovs_mutex p4ovs_fdb_entry_lock; -extern char grpc_addr[32]; +extern char p4ovs_grpc_addr[32]; /* Control OvS offload with an environment variable during runtime. * If env variable OVS_P4_OFFLOAD=false, then disable OVS offload, else diff --git a/lib/mac-learning.c b/lib/mac-learning.c index af2bd1dca56..a111452405d 100644 --- a/lib/mac-learning.c +++ b/lib/mac-learning.c @@ -630,7 +630,7 @@ mac_learning_expire(struct mac_learning *ml, struct mac_entry *e) memcpy(fdb_info.mac_addr, e->mac.ea, sizeof(fdb_info.mac_addr)); fdb_info.is_vlan = true; fdb_info.bridge_id = ml->p4_bridge_id; - ovsp4rt_config_fdb_entry(fdb_info, false, grpc_addr); + ovsp4rt_config_fdb_entry(fdb_info, false, p4ovs_grpc_addr); // Remove the corresponding ip_mac tables both for src ip and dst ip struct ip_mac_map_info ip_info; @@ -640,7 +640,7 @@ mac_learning_expire(struct mac_learning *ml, struct mac_entry *e) ip_info.dst_ip_addr.family = AF_INET; ip_info.dst_ip_addr.ip.v4addr.s_addr = e->nw_dst; // TODO: Update IPv6 fields when IPv6 support is added - ovsp4rt_config_ip_mac_map_entry(ip_info, false, grpc_addr); + ovsp4rt_config_ip_mac_map_entry(ip_info, false, p4ovs_grpc_addr); } #endif // P4OVS free(e); diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 46f15f8992d..5659c6215ed 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -3465,7 +3465,7 @@ xlate_normal(struct xlate_ctx *ctx) if (ovs_p4_offload_enabled()) { p4ovs_lock(&p4ovs_fdb_entry_lock); if 
(!get_fdb_data(ovs_port, flow->dl_src, &fdb_info)) { - ovsp4rt_config_fdb_entry(fdb_info, true, grpc_addr); + ovsp4rt_config_fdb_entry(fdb_info, true, p4ovs_grpc_addr); ctx->xbridge->ml->p4_bridge_id = ovs_port->xbundle->p4_bridge_id; } else { VLOG_DBG("Error retrieving FDB information, skipping programming " @@ -3489,7 +3489,7 @@ xlate_normal(struct xlate_ctx *ctx) if (ovs_p4_offload_enabled()) { struct ip_mac_map_info ip_info = {0}; if (update_ip_mac_map_info(flow, &ip_info)) { - ovsp4rt_config_ip_mac_map_entry(ip_info, true, grpc_addr); + ovsp4rt_config_ip_mac_map_entry(ip_info, true, p4ovs_grpc_addr); } } else { VLOG_DBG("P4 offload disabled, skipping programming "); @@ -9060,7 +9060,7 @@ xlate_add_static_mac_entry(const struct ofproto_dpif *ofproto, memset(&fdb_info, 0, sizeof(fdb_info)); if (!get_fdb_data(ovs_port, dl_src, &fdb_info)) { - ovsp4rt_config_fdb_entry(fdb_info, true, grpc_addr); + ovsp4rt_config_fdb_entry(fdb_info, true, p4ovs_grpc_addr); ofproto->ml->p4_bridge_id = ovs_port->xbundle->p4_bridge_id; } else { VLOG_DBG("Error retrieving FDB information, skipping programming " diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index e46a42b6e8b..a733325ac6a 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -2374,9 +2374,9 @@ ConfigureP4Target(struct bridge *br, struct port *port, tnl_info.bridge_id = br->p4_bridge_id; tnl_info.src_port = port->p4_src_port; - ovsp4rt_config_tunnel_entry(tnl_info, insert_entry, grpc_addr); + ovsp4rt_config_tunnel_entry(tnl_info, insert_entry, p4ovs_grpc_addr); ovsp4rt_config_rx_tunnel_src_entry(tnl_info, insert_entry, - grpc_addr); + p4ovs_grpc_addr); } else { VLOG_ERR("Error retrieving tunnel information, " "skipping programming P4 entry"); @@ -2392,9 +2392,9 @@ ConfigureP4Target(struct bridge *br, struct port *port, port->p4_src_port}; /* When VLAN tag is configured */ ovsp4rt_config_vlan_entry(port->p4_vlan_id, insert_entry, - grpc_addr); + p4ovs_grpc_addr); 
ovsp4rt_config_tunnel_src_port_entry(tnl_src_port_info, - insert_entry, grpc_addr); + insert_entry, p4ovs_grpc_addr); } else { /* Wild card VLAN 0 */ struct src_port_info tnl_src_port_info = {br->p4_bridge_id, @@ -2402,7 +2402,7 @@ ConfigureP4Target(struct bridge *br, struct port *port, port->p4_src_port}; ovsp4rt_config_tunnel_src_port_entry(tnl_src_port_info, - insert_entry, grpc_addr); + insert_entry, p4ovs_grpc_addr); } port->is_src_port_configured = insert_entry; } else if (!insert_entry || iface->cfg->mac_in_use) { @@ -2423,9 +2423,9 @@ ConfigureP4Target(struct bridge *br, struct port *port, port->p4_src_port}; ovsp4rt_config_vlan_entry(port->p4_vlan_id, insert_entry, - grpc_addr); + p4ovs_grpc_addr); ovsp4rt_config_src_port_entry(vsi_src_port_info, insert_entry, - grpc_addr); + p4ovs_grpc_addr); } else if (port->p4_vlan_mode == P4_PORT_VLAN_UNSUPPORTED) { /* Do nothing, unsupported vlan mode */ } else if (port->p4_src_port) { @@ -2434,7 +2434,7 @@ ConfigureP4Target(struct bridge *br, struct port *port, port->p4_src_port}; ovsp4rt_config_src_port_entry(vsi_src_port_info, insert_entry, - grpc_addr); + p4ovs_grpc_addr); } else { VLOG_DBG("Invalid P4 use case for source port to " "bridge mapping"); diff --git a/vswitchd/p4ovs.c b/vswitchd/p4ovs.c index c75f83357f4..a8a31de168a 100644 --- a/vswitchd/p4ovs.c +++ b/vswitchd/p4ovs.c @@ -9,11 +9,11 @@ #include "openvswitch/p4ovs.h" #include "util.h" -char grpc_addr[32] = "localhost:9559"; +char p4ovs_grpc_addr[32] = "localhost:9559"; static const char grpc_port[] = ":9559"; void ovs_set_grpc_addr(const char* optarg) { - size_t maximum = sizeof(grpc_addr) - strlen(grpc_port) - 1; + size_t maximum = sizeof(p4ovs_grpc_addr) - strlen(grpc_port) - 1; size_t actual = strlen(optarg); if (actual > maximum) { @@ -21,7 +21,6 @@ void ovs_set_grpc_addr(const char* optarg) { actual, maximum); } - strncpy(grpc_addr, optarg, sizeof(grpc_addr)); - strcat(grpc_addr, grpc_port); + strncpy(p4ovs_grpc_addr, optarg, 
sizeof(p4ovs_grpc_addr)); + strcat(p4ovs_grpc_addr, grpc_port); } - From 123e31ad6bef28505f37dc2793931fb49c163710 Mon Sep 17 00:00:00 2001 From: Derek G Foster Date: Tue, 2 Jul 2024 12:40:49 -0700 Subject: [PATCH 829/833] Fix compiler warning (#125) Signed-off-by: Derek Foster --- ofproto/ofproto-dpif-xlate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 5659c6215ed..435f79fafc3 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -3227,6 +3227,7 @@ get_p4_vlan_mode(enum port_vlan_mode vlan_mode) { return P4_PORT_VLAN_NATIVE_TAGGED; case PORT_VLAN_NATIVE_UNTAGGED: return P4_PORT_VLAN_NATIVE_UNTAGGED; + case P4_PORT_VLAN_DOT1Q_TUNNEL: default: return P4_PORT_VLAN_UNSUPPORTED; } From d45d82ecf1dabd987d077782c1e128ca69c58eb7 Mon Sep 17 00:00:00 2001 From: Derek G Foster Date: Wed, 3 Jul 2024 14:40:28 -0700 Subject: [PATCH 830/833] Reengineer OvS to build with ovsp4rt libraries (#124) - Replaced OVS_CHECK_P4OVS with OVS_CHECK_OVSP4RT. - Implemented --with-ovsp4rt[=stubs] command-line flag to enable new P4 build modes. - Implemented OVSP4RT and LEGACY_P4OVS automake conditionals to specify whether to build in OVSP4RT mode or legacy P4OVS mode. - Moved p4ovs.c and p4ovs.h to the lib directory. 
Signed-off-by: Derek Foster --- configure.ac | 2 +- include/openvswitch/automake.mk | 5 -- lib/automake.mk | 6 +++ lib/mac-learning.c | 2 +- {vswitchd => lib}/p4ovs.c | 2 +- {include/openvswitch => lib}/p4ovs.h | 12 +++-- m4/ovs_check_p4ovs.m4 | 77 ++++++++++++++++++++++------ ofproto/ofproto-dpif-xlate.c | 2 +- utilities/automake.mk | 10 ++-- vswitchd/automake.mk | 9 ++-- vswitchd/bridge.c | 2 +- vswitchd/ovs-vswitchd.c | 2 +- 12 files changed, 91 insertions(+), 40 deletions(-) rename {vswitchd => lib}/p4ovs.c (95%) rename {include/openvswitch => lib}/p4ovs.h (89%) diff --git a/configure.ac b/configure.ac index cd13f523234..e075f3afc3d 100644 --- a/configure.ac +++ b/configure.ac @@ -204,7 +204,7 @@ OVS_CHECK_LINUX_VIRTIO_TYPES OVS_CHECK_DPDK OVS_CHECK_PRAGMA_MESSAGE OVS_CHECK_VERSION_SUFFIX -OVS_CHECK_P4OVS +OVS_CHECK_OVSP4RT AC_SUBST([CFLAGS]) AC_SUBST([OVS_CFLAGS]) diff --git a/include/openvswitch/automake.mk b/include/openvswitch/automake.mk index 47359a6d4cb..0cc1f569e0a 100644 --- a/include/openvswitch/automake.mk +++ b/include/openvswitch/automake.mk @@ -52,11 +52,6 @@ openvswitchinclude_HEADERS = \ include/openvswitch/vlog.h \ include/openvswitch/nsh.h -if P4OVS -openvswitchinclude_HEADERS += \ - include/openvswitch/p4ovs.h -endif - if HAVE_CXX # OVS does not use C++ itself, but it provides public header files # that a C++ compiler should accept, so when --enable-Werror is in diff --git a/lib/automake.mk b/lib/automake.mk index 78d6e651645..8cf2b66413f 100644 --- a/lib/automake.mk +++ b/lib/automake.mk @@ -432,6 +432,12 @@ lib_libopenvswitch_la_SOURCES += \ lib/stream-unix.c endif +if P4OVS +lib_libopenvswitch_la_SOURCES += \ + lib/p4ovs.c \ + lib/p4ovs.h +endif + EXTRA_DIST += \ lib/stdio.h.in \ lib/string.h.in diff --git a/lib/mac-learning.c b/lib/mac-learning.c index a111452405d..38bd838f88d 100644 --- a/lib/mac-learning.c +++ b/lib/mac-learning.c @@ -32,8 +32,8 @@ #include "vlan-bitmap.h" #if defined(P4OVS) +#include "lib/p4ovs.h" #include 
"ovsp4rt/ovs-p4rt.h" -#include "openvswitch/p4ovs.h" #endif COVERAGE_DEFINE(mac_learning_learned); diff --git a/vswitchd/p4ovs.c b/lib/p4ovs.c similarity index 95% rename from vswitchd/p4ovs.c rename to lib/p4ovs.c index a8a31de168a..3ad74f212b1 100644 --- a/vswitchd/p4ovs.c +++ b/lib/p4ovs.c @@ -6,7 +6,7 @@ #include #include -#include "openvswitch/p4ovs.h" +#include "lib/p4ovs.h" #include "util.h" char p4ovs_grpc_addr[32] = "localhost:9559"; diff --git a/include/openvswitch/p4ovs.h b/lib/p4ovs.h similarity index 89% rename from include/openvswitch/p4ovs.h rename to lib/p4ovs.h index a209409c72d..e576de65c00 100644 --- a/include/openvswitch/p4ovs.h +++ b/lib/p4ovs.h @@ -2,12 +2,14 @@ * Copyright (c) 2023-2024 Intel Corporation. * SPDX-License-Identifier: Apache-2.0 * - * Defines the P4 OvS specific definitions. These need be used under - * if defined(P4OVS) scope only. + * Definitions specific to P4OVS. + * + * OVS code that references this file must do so under protection of an + * #ifdef P4OVS conditional. */ -#ifndef OPENVSWITCH_P4OVS_H -#define OPENVSWITCH_P4OVS_H +#ifndef LIB_P4OVS_H +#define LIB_P4OVS_H #include #include @@ -63,4 +65,4 @@ void ovs_set_grpc_addr(const char* optarg); } // extern "C" #endif -#endif // OPENVSWITCH_P4OVS_H +#endif // LIB_P4OVS_H diff --git a/m4/ovs_check_p4ovs.m4 b/m4/ovs_check_p4ovs.m4 index c1062327505..76c73501c0d 100644 --- a/m4/ovs_check_p4ovs.m4 +++ b/m4/ovs_check_p4ovs.m4 @@ -1,21 +1,68 @@ -dnl OVS_CHECK_P4OVS - Process P4 options. - -dnl Copyright(c) 2021-2022 Intel Corporation. +dnl OVS_CHECK_OVSP4RT - Process P4 options. +dnl +dnl Copyright(c) 2021-2024 Intel Corporation. dnl SPDX-License-Identifier: Apache 2.0 +dnl +dnl Implements the --with-p4ovs and --with-ovsp4rt flags. +dnl +dnl For --with-ovsp4rt, the directory containing the libovsp4rt +dnl pkg-config files must be listed by the PKG_CONFIG_PATH +dnl environment variable. 
+dnl +dnl config.h symbols: +dnl P4OVS - enables P4 support +dnl +dnl autoconf symbols: +dnl OVSP4RT_CFLAGS - compiler parameters +dnl OVSP4RT_LIBS - linker parameters +dnl +dnl automake conditionals: +dnl P4OVS - enables P4 support +dnl OVSP4RT - links with ovsp4rt internally +dnl LEGACY_P4OVS - links with ovsp4rt externally +dnl +AC_DEFUN([OVS_CHECK_OVSP4RT], [ + AC_ARG_WITH([ovsp4rt], + [AC_HELP_STRING([--with-ovsp4rt@<:@=stubs@:>@], + [Build with OVSP4RT support])], + [have_ovsp4rt=true], + [have_ovsp4rt=false]) -AC_DEFUN([OVS_CHECK_P4OVS], [ AC_ARG_WITH([p4ovs], - [AC_HELP_STRING([--with-p4ovs], [Build with P4 support])], - [have_p4ovs=true]) - AC_MSG_CHECKING([whether P4OVS is enabled]) - if test "$have_p4ovs" != true || test "$with_p4ovs" = no; then - AC_MSG_RESULT([no]) - P4OVS_VALID=false - else - AC_MSG_RESULT([yes]) - P4OVS_VALID=true + [AC_HELP_STRING([--with-p4ovs], + [Build with P4 support (legacy mode)])], + [have_p4ovs=true], + [have_p4ovs=false]) + + if test $have_ovsp4rt = true; then + if test "$with_ovsp4rt" = "stubs"; then + PKG_CHECK_MODULES([ovsp4rtstubs], [libovsp4rt_stubs]) + OVSP4RT_CFLAGS=$ovsp4rtstubs_CFLAGS + OVSP4RT_LIBS=$ovsp4rtstubs_LIBS + else + PKG_CHECK_MODULES([ovsp4rt], [libovsp4rt]) + OVSP4RT_CFLAGS=$ovsp4rt_CFLAGS + OVSP4RT_LIBS=$ovsp4rt_LIBS + fi + have_p4ovs=true + fi + + legacy_p4ovs=$have_p4ovs + if test $have_ovsp4rt = true; then + legacy_p4ovs=false + fi + + dnl define variable in config.h file + if test $have_p4ovs = true; then AC_DEFINE([P4OVS], [1], [System includes P4 support.]) fi - dnl export automake conditional - AM_CONDITIONAL([P4OVS], test "$P4OVS_VALID" = true) + + dnl export autoconf variables + AC_SUBST([OVSP4RT_CFLAGS]) + AC_SUBST([OVSP4RT_LIBS]) + + dnl export automake conditionals + AM_CONDITIONAL([P4OVS], test $have_p4ovs = true) + AM_CONDITIONAL([OVSP4RT], test $have_ovsp4rt = true) + AM_CONDITIONAL([LEGACY_P4OVS], test $legacy_p4ovs = true) ]) diff --git a/ofproto/ofproto-dpif-xlate.c 
b/ofproto/ofproto-dpif-xlate.c index 435f79fafc3..35c9435faa2 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -77,8 +77,8 @@ #include #include "lib/netdev.h" +#include "lib/p4ovs.h" #include "ovsp4rt/ovs-p4rt.h" -#include "openvswitch/p4ovs.h" #endif COVERAGE_DEFINE(xlate_actions); diff --git a/utilities/automake.mk b/utilities/automake.mk index f3a16095659..c4566938c44 100644 --- a/utilities/automake.mk +++ b/utilities/automake.mk @@ -4,9 +4,8 @@ bin_PROGRAMS += \ utilities/ovs-ofctl \ utilities/ovs-vsctl -# Build test controller as static library and -# link to sidecar before building its executable -if P4OVS +if LEGACY_P4OVS +# Build a static library instead of an executable. lib_LTLIBRARIES += utilities/libtestcontroller.la else bin_PROGRAMS += utilities/ovs-testcontroller @@ -130,13 +129,16 @@ man_MANS += \ utilities_ovs_appctl_SOURCES = utilities/ovs-appctl.c utilities_ovs_appctl_LDADD = lib/libopenvswitch.la -if P4OVS +if LEGACY_P4OVS utilities_libtestcontroller_la_CPPFLAGS = $(AM_CPPFLAGS) utilities_libtestcontroller_la_SOURCES = utilities/ovs-testcontroller.c utilities_libtestcontroller_la_LIBADD = lib/libopenvswitch.la $(SSL_LIBS) else utilities_ovs_testcontroller_SOURCES = utilities/ovs-testcontroller.c utilities_ovs_testcontroller_LDADD = lib/libopenvswitch.la $(SSL_LIBS) +if OVSP4RT +utilities_ovs_testcontroller_LDADD += $(OVSP4RT_LIBS) +endif endif utilities_ovs_dpctl_SOURCES = utilities/ovs-dpctl.c diff --git a/vswitchd/automake.mk b/vswitchd/automake.mk index 0ca86153b69..038347c6ec7 100644 --- a/vswitchd/automake.mk +++ b/vswitchd/automake.mk @@ -10,11 +10,7 @@ vswitchd_sources = \ vswitchd/system-stats.c \ vswitchd/system-stats.h -if P4OVS -vswitchd_sources += vswitchd/p4ovs.c -endif - -if P4OVS +if LEGACY_P4OVS # Build a static library instead of an executable. 
lib_LTLIBRARIES += vswitchd/libvswitchd.la @@ -39,6 +35,9 @@ vswitchd_ovs_vswitchd_LDADD = \ lib/libopenvswitch.la vswitchd_ovs_vswitchd_LDFLAGS = $(AM_LDFLAGS) $(DPDK_vswitchd_LDFLAGS) +if OVSP4RT +vswitchd_ovs_vswitchd_LDFLAGS += $(OVSP4RT_LIBS) +endif endif MAN_ROOTS += vswitchd/ovs-vswitchd.8.in diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index a733325ac6a..744fbd7b084 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -76,8 +76,8 @@ #if defined(P4OVS) #include +#include "lib/p4ovs.h" #include "ovsp4rt/ovs-p4rt.h" -#include "openvswitch/p4ovs.h" static int32_t get_tunnel_data(struct netdev *netdev, diff --git a/vswitchd/ovs-vswitchd.c b/vswitchd/ovs-vswitchd.c index b25e4c33416..be68ff7f7ef 100644 --- a/vswitchd/ovs-vswitchd.c +++ b/vswitchd/ovs-vswitchd.c @@ -55,7 +55,7 @@ #include "lib/vswitch-idl.h" #include "lib/dns-resolve.h" #if defined(P4OVS) -#include "openvswitch/p4ovs.h" +#include "lib/p4ovs.h" #endif VLOG_DEFINE_THIS_MODULE(vswitchd); From a0ca844c4908e2645dfb98043016c6ede07fda64 Mon Sep 17 00:00:00 2001 From: Derek G Foster Date: Wed, 3 Jul 2024 19:04:16 -0700 Subject: [PATCH 831/833] Add processing for --with-p4ovs option (#126) Signed-off-by: Derek Foster --- m4/ovs_check_p4ovs.m4 | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/m4/ovs_check_p4ovs.m4 b/m4/ovs_check_p4ovs.m4 index 76c73501c0d..e50ad419939 100644 --- a/m4/ovs_check_p4ovs.m4 +++ b/m4/ovs_check_p4ovs.m4 @@ -45,11 +45,16 @@ AC_DEFUN([OVS_CHECK_OVSP4RT], [ OVSP4RT_LIBS=$ovsp4rt_LIBS fi have_p4ovs=true - fi - - legacy_p4ovs=$have_p4ovs - if test $have_ovsp4rt = true; then legacy_p4ovs=false + else + AC_MSG_CHECKING([whether P4OVS is enabled]) + if test $have_p4ovs == true; then + AC_MSG_RESULT([yes]) + legacy_p4ovs=true + else + AC_MSG_RESULT([no]) + legacy_p4ovs=false + fi fi dnl define variable in config.h file From a5e545ebdd90d1de369b332e5db592a8346ebd7c Mon Sep 17 00:00:00 2001 From: Derek G Foster Date: Mon, 22 Jul 2024 08:54:45 -0700 
Subject: [PATCH 832/833] Fix failing tests in P4OVS build (#127) - Downgraded three VLOG_ERR messages to VLOG_DBG. This fixes all but one of the test failures in the P4OVS build. - Shortened one of the downgraded error messages. Signed-off-by: Derek Foster --- ofproto/ofproto-dpif-xlate.c | 2 +- vswitchd/bridge.c | 13 ++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 35c9435faa2..c0fa3917fff 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -3267,7 +3267,7 @@ get_fdb_data(struct xport *port, struct eth_addr mac_addr, int underlay_ifindex = netdev_get_ifindex(port->netdev); if (underlay_ifindex < 0) { - VLOG_ERR("Invalid tunnel ifindex"); + VLOG_DBG("Invalid tunnel ifindex"); return -1; } diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index 744fbd7b084..4d7c4f819f2 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -74,18 +74,19 @@ #include "vlan-bitmap.h" #if defined(P4OVS) + #include #include "lib/p4ovs.h" #include "ovsp4rt/ovs-p4rt.h" static int32_t -get_tunnel_data(struct netdev *netdev, - struct tunnel_info *tnl_info); +get_tunnel_data(struct netdev *netdev, struct tunnel_info *tnl_info); uint8_t last_p4_bridge_id_used = 0; uint32_t unique_tunnel_src_port = P4_VXLAN_SOURCE_PORT_OFFSET; struct ovs_mutex p4ovs_fdb_entry_lock = OVS_MUTEX_INITIALIZER; + #endif VLOG_DEFINE_THIS_MODULE(bridge); @@ -2265,8 +2266,7 @@ iface_do_create(const struct bridge *br, #if defined(P4OVS) static int32_t -get_tunnel_data(struct netdev *netdev, - struct tunnel_info *tnl_info) +get_tunnel_data(struct netdev *netdev, struct tunnel_info *tnl_info) { const struct netdev_tunnel_config *underlay_tnl = NULL; underlay_tnl = netdev_get_tunnel_config(netdev); @@ -2276,7 +2276,7 @@ get_tunnel_data(struct netdev *netdev, } int underlay_ifindex = netdev_get_ifindex(netdev); if (underlay_ifindex < 0) { - VLOG_ERR("Invalid tunnel ifindex"); + VLOG_DBG("Invalid tunnel 
ifindex"); return -1; } tnl_info->ifindex = (uint32_t)underlay_ifindex; @@ -2378,8 +2378,7 @@ ConfigureP4Target(struct bridge *br, struct port *port, ovsp4rt_config_rx_tunnel_src_entry(tnl_info, insert_entry, p4ovs_grpc_addr); } else { - VLOG_ERR("Error retrieving tunnel information, " - "skipping programming P4 entry"); + VLOG_DBG("Error getting tunnel data, P4 entry not programmed"); } if (port->p4_vlan_mode == P4_PORT_VLAN_NATIVE_TAGGED) { From 652af73b10025cd8a936569e847cc43543aadad5 Mon Sep 17 00:00:00 2001 From: Derek Foster Date: Mon, 22 Jul 2024 15:01:39 -0700 Subject: [PATCH 833/833] Fix issue with static_move counter - The mac_learning_static_none_move counter was being incremented twice for the same packet, causing one of the tests to fail. Discovered that the P4 code makes its own call to the function, believing it to be a predicate with no side effects. Solved the problem by having the function return a Boolean value indicating whether a static mac port move was being attempted. The non-P4 call site in mac_learning_update() increments the counter if the returned value is True. The P4 call site in xlate_normal() ignores the returned value. 
Signed-off-by: Derek Foster --- lib/mac-learning.c | 29 ++++++++++++++++++--------- lib/mac-learning.h | 9 ++++----- ofproto/ofproto-dpif-xlate.c | 39 +++++++++++++++++++++--------------- 3 files changed, 46 insertions(+), 31 deletions(-) diff --git a/lib/mac-learning.c b/lib/mac-learning.c index 38bd838f88d..5d498cfd568 100644 --- a/lib/mac-learning.c +++ b/lib/mac-learning.c @@ -462,12 +462,15 @@ bool is_mac_learning_update_needed(const struct mac_learning *ml, struct eth_addr src, int vlan, bool is_gratuitous_arp, bool is_bond, - void *in_port) + const void *in_port, bool *is_static_move) OVS_REQ_RDLOCK(ml->rwlock) { struct mac_entry *mac; + bool is_port_move; int age; + *is_static_move = false; + if (!mac_learning_may_learn(ml, src, vlan)) { return false; } @@ -478,14 +481,13 @@ is_mac_learning_update_needed(const struct mac_learning *ml, return true; } + /* Check whether address is on a different port. */ + is_port_move = mac_entry_get_port(ml, mac) != in_port; + age = mac_entry_age(ml, mac); /* If mac is a static entry, then there is no need to update. */ if (age == MAC_ENTRY_AGE_STATIC_ENTRY) { - /* Coverage counter to increment when a packet with same - * static-mac appears on a different port. */ - if (mac_entry_get_port(ml, mac) != in_port) { - COVERAGE_INC(mac_learning_static_none_move); - } + *is_static_move = is_port_move; return false; } @@ -506,7 +508,7 @@ is_mac_learning_update_needed(const struct mac_learning *ml, } } - return mac_entry_get_port(ml, mac) != in_port /* ofbundle */; + return is_port_move; } /* Updates MAC learning table 'ml' given that a packet matching 'src' was @@ -574,7 +576,8 @@ mac_learning_update(struct mac_learning *ml, struct eth_addr src, void *in_port) OVS_EXCLUDED(ml->rwlock) { - bool need_update; + bool is_static_move = false; + bool need_update = false; bool updated = false; /* Don't learn the OFPP_NONE port. 
*/ @@ -582,8 +585,14 @@ mac_learning_update(struct mac_learning *ml, struct eth_addr src, /* First try the common case: no change to MAC learning table. */ ovs_rwlock_rdlock(&ml->rwlock); need_update = is_mac_learning_update_needed(ml, src, vlan, - is_gratuitous_arp, is_bond, - in_port); + is_gratuitous_arp, + is_bond, in_port, + &is_static_move); + if (is_static_move) { + /* Coverage counter to increment when a packet with same + * static-mac appears on a different port. */ + COVERAGE_INC(mac_learning_static_none_move); + } ovs_rwlock_unlock(&ml->rwlock); if (need_update) { diff --git a/lib/mac-learning.h b/lib/mac-learning.h index 0502b281153..f1e1e8e2ee7 100644 --- a/lib/mac-learning.h +++ b/lib/mac-learning.h @@ -230,11 +230,10 @@ bool mac_learning_may_learn(const struct mac_learning *ml, const struct eth_addr src_mac, uint16_t vlan) OVS_REQ_RDLOCK(ml->rwlock); -bool -is_mac_learning_update_needed(const struct mac_learning *ml, - struct eth_addr src, int vlan, - bool is_gratuitous_arp, bool is_bond, - void *in_port) +bool is_mac_learning_update_needed(const struct mac_learning *ml, + struct eth_addr src, int vlan, + bool is_gratuitous_arp, bool is_bond, + const void *in_port, bool *is_static_move) OVS_REQ_RDLOCK(ml->rwlock); struct mac_entry *mac_learning_insert(struct mac_learning *ml, const struct eth_addr src, diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 4cb550bd736..2cddc4a846b 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -3368,17 +3368,20 @@ update_ip_mac_map_info(const struct flow *flow, return -1; } - memcpy(ip_mac_map_info->src_mac_addr, flow->dl_src.ea, sizeof(ip_mac_map_info->src_mac_addr)); - memcpy(ip_mac_map_info->dst_mac_addr, flow->dl_dst.ea, sizeof(ip_mac_map_info->dst_mac_addr)); + memcpy(ip_mac_map_info->src_mac_addr, flow->dl_src.ea, + sizeof(ip_mac_map_info->src_mac_addr)); + memcpy(ip_mac_map_info->dst_mac_addr, flow->dl_dst.ea, + sizeof(ip_mac_map_info->dst_mac_addr)); - 
//Program the entiry only for an ARP response where we have valid IP's and MAC for both src and dst + // Program the entry only for an ARP response where we have valid IPs + // and MAC for both src and dst. if (valid_ip_addr(flow->nw_src) && !eth_addr_is_broadcast(flow->dl_src) && - valid_ip_addr(flow->nw_dst) && !eth_addr_is_broadcast(flow->dl_dst)) { - ip_mac_map_info->src_ip_addr.family = AF_INET; - ip_mac_map_info->src_ip_addr.ip.v4addr.s_addr = flow->nw_src; + valid_ip_addr(flow->nw_dst) && !eth_addr_is_broadcast(flow->dl_dst)) { + ip_mac_map_info->src_ip_addr.family = AF_INET; + ip_mac_map_info->src_ip_addr.ip.v4addr.s_addr = flow->nw_src; - ip_mac_map_info->dst_ip_addr.family = AF_INET; - ip_mac_map_info->dst_ip_addr.ip.v4addr.s_addr = flow->nw_dst; + ip_mac_map_info->dst_ip_addr.family = AF_INET; + ip_mac_map_info->dst_ip_addr.ip.v4addr.s_addr = flow->nw_dst; } return -1; @@ -3391,10 +3394,10 @@ xlate_normal(struct xlate_ctx *ctx) { struct flow_wildcards *wc = ctx->wc; struct flow *flow = &ctx->xin->flow; + struct xbundle *in_xbundle; #if defined(P4OVS) - bool is_mac_learn_required = false; + bool need_update = false; #endif - struct xbundle *in_xbundle; struct xport *in_port; struct mac_entry *mac; void *mac_port; @@ -3458,18 +3461,22 @@ xlate_normal(struct xlate_ctx *ctx) && in_port && in_port->pt_mode != NETDEV_PT_LEGACY_L3 ) { #if defined(P4OVS) - is_mac_learn_required = is_mac_learning_update_needed(ctx->xbridge->ml, - flow->dl_src, vlan,is_grat_arp, - in_xbundle->bond != NULL, - in_xbundle->ofbundle); + bool is_static_move = false; + need_update = is_mac_learning_update_needed(ctx->xbridge->ml, + flow->dl_src, + vlan, is_grat_arp, + in_xbundle->bond != NULL, + in_xbundle->ofbundle, + &is_static_move); + /* ignore is_static_move */ #endif - //The function below calls mac_learning_insert + // The function below calls mac_learning_insert update_learning_table(ctx, in_xbundle, flow->dl_src, vlan, is_grat_arp); } #if defined(P4OVS) - if 
(is_mac_learn_required) { + if (need_update) { /* Dynamic MAC is learnt, program P4 forwarding table */ struct xport *ovs_port = get_ofp_port(in_xbundle->xbridge, flow->in_port.ofp_port);