diff --git a/.gitignore b/.gitignore index 3279fa0..ea11613 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ /ovn-20.12.0.tar.gz /ovn-21.03.0.tar.gz /openvswitch-2.15.90.tar.gz +/ovn-21.06.0.tar.gz diff --git a/ovn-21.03.0.patch b/ovn-21.03.0.patch deleted file mode 100644 index 6b2d1c8..0000000 --- a/ovn-21.03.0.patch +++ /dev/null @@ -1,7237 +0,0 @@ -diff --git a/.ci/linux-prepare.sh b/.ci/linux-prepare.sh -index 0bb0ff096..83ad3958b 100755 ---- a/.ci/linux-prepare.sh -+++ b/.ci/linux-prepare.sh -@@ -12,5 +12,5 @@ set -ev - git clone git://git.kernel.org/pub/scm/devel/sparse/sparse.git - cd sparse && make -j4 HAVE_LLVM= HAVE_SQLITE= install && cd .. - --pip install --disable-pip-version-check --user six flake8 hacking --pip install --user --upgrade docutils -+pip3 install --disable-pip-version-check --user flake8 hacking sphinx pyOpenSSL -+pip3 install --upgrade --user docutils -diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml -index f3a53a8b6..91bd1e538 100644 ---- a/.github/workflows/test.yml -+++ b/.github/workflows/test.yml -@@ -13,7 +13,6 @@ jobs: - dependencies: | - automake libtool gcc bc libjemalloc1 libjemalloc-dev \ - libssl-dev llvm-dev libelf-dev libnuma-dev libpcap-dev \ -- python3-openssl python3-pip python3-sphinx \ - selinux-policy-dev - m32_dependecies: gcc-multilib - CC: ${{ matrix.compiler }} -@@ -88,11 +87,21 @@ jobs: - if: matrix.m32 != '' - run: sudo apt install -y ${{ env.m32_dependecies }} - -+ - name: update PATH -+ run: | -+ echo "$HOME/bin" >> $GITHUB_PATH -+ echo "$HOME/.local/bin" >> $GITHUB_PATH -+ -+ - name: set up python -+ uses: actions/setup-python@v2 -+ with: -+ python-version: '3.x' -+ - - name: prepare - run: ./.ci/linux-prepare.sh - - - name: build -- run: PATH="$PATH:$HOME/bin" ./.ci/linux-build.sh -+ run: ./.ci/linux-build.sh - - - name: copy logs on failure - if: failure() || cancelled() -@@ -145,10 +154,18 @@ jobs: - ref: 'master' - - name: install dependencies - run: brew install automake 
libtool -+ - name: update PATH -+ run: | -+ echo "$HOME/bin" >> $GITHUB_PATH -+ echo "$HOME/.local/bin" >> $GITHUB_PATH -+ - name: set up python -+ uses: actions/setup-python@v2 -+ with: -+ python-version: '3.x' - - name: prepare - run: ./.ci/osx-prepare.sh - - name: build -- run: PATH="$PATH:$HOME/bin" ./.ci/osx-build.sh -+ run: ./.ci/osx-build.sh - - name: upload logs on failure - if: failure() - uses: actions/upload-artifact@v2 -diff --git a/Makefile.am b/Makefile.am -index 80247b62d..1fe730dc4 100644 ---- a/Makefile.am -+++ b/Makefile.am -@@ -221,6 +221,7 @@ dist-hook-git: distfiles - grep -v '\.gitattributes$$' | \ - grep -v '\.gitmodules$$' | \ - grep -v "$(submodules)" | \ -+ grep -v 'redhat' | \ - LC_ALL=C sort -u > all-gitfiles; \ - LC_ALL=C comm -1 -3 distfiles all-gitfiles > missing-distfiles; \ - if test -s missing-distfiles; then \ -@@ -332,7 +333,7 @@ check-tabs: - @cd $(srcdir); \ - if test -e .git && (git --version) >/dev/null 2>&1 && \ - grep -ln "^ " \ -- `git ls-files | grep -v $(submodules) \ -+ `git ls-files | grep -v $(submodules) | grep -v redhat \ - | grep -v -f build-aux/initial-tab-whitelist` /dev/null \ - | $(EGREP) -v ':[ ]*/?\*'; \ - then \ -diff --git a/NEWS b/NEWS -index 5372668bf..530c5d42f 100644 ---- a/NEWS -+++ b/NEWS -@@ -1,3 +1,13 @@ -+Post-v21.03.0 -+------------------------- -+ - ovn-northd-ddlog: New implementation of northd, based on DDlog. This -+ implementation is incremental, meaning that it only recalculates what is -+ needed for the southbound database when northbound changes occur. It is -+ expected to scale better than the C implementation, for large deployments. -+ (This may take testing and tuning to be effective.) This version of OVN -+ requires DDLog 0.36. -+ - Introduce ovn-controller incremetal processing engine statistics -+ - OVN v21.03.0 - 12 Mar 2021 - ------------------------- - - Support ECMP multiple nexthops for reroute router policies. 
-diff --git a/configure.ac b/configure.ac -index 37b476d53..f3de6fef2 100644 ---- a/configure.ac -+++ b/configure.ac -@@ -13,7 +13,7 @@ - # limitations under the License. - - AC_PREREQ(2.63) --AC_INIT(ovn, 21.03.0, bugs@openvswitch.org) -+AC_INIT(ovn, 21.03.1, bugs@openvswitch.org) - AC_CONFIG_MACRO_DIR([m4]) - AC_CONFIG_AUX_DIR([build-aux]) - AC_CONFIG_HEADERS([config.h]) -diff --git a/controller/binding.c b/controller/binding.c -index 4e6c75696..451f00e34 100644 ---- a/controller/binding.c -+++ b/controller/binding.c -@@ -597,6 +597,23 @@ remove_local_lport_ids(const struct sbrec_port_binding *pb, - } - } - -+/* Corresponds to each Port_Binding.type. */ -+enum en_lport_type { -+ LP_UNKNOWN, -+ LP_VIF, -+ LP_CONTAINER, -+ LP_PATCH, -+ LP_L3GATEWAY, -+ LP_LOCALNET, -+ LP_LOCALPORT, -+ LP_L2GATEWAY, -+ LP_VTEP, -+ LP_CHASSISREDIRECT, -+ LP_VIRTUAL, -+ LP_EXTERNAL, -+ LP_REMOTE -+}; -+ - /* Local bindings. binding.c module binds the logical port (represented by - * Port_Binding rows) and sets the 'chassis' column when it sees the - * OVS interface row (of type "" or "internal") with the -@@ -608,134 +625,180 @@ remove_local_lport_ids(const struct sbrec_port_binding *pb, - * 'struct local_binding' is used. A shash of these local bindings is - * maintained with the 'external_ids:iface-id' as the key to the shash. - * -- * struct local_binding (defined in binding.h) has 3 main fields: -- * - type -- * - OVS interface row object -- * - Port_Binding row object -- * -- * An instance of 'struct local_binding' can be one of 3 types. -- * -- * BT_VIF: Represent a local binding for an OVS interface of -- * type "" or "internal" with the external_ids:iface-id -- * set. -- * -- * This can be a -- * * probable local binding - external_ids:iface-id is -- * set, but the corresponding Port_Binding row is not -- * created or is not visible to the local ovn-controller -- * instance. 
-- * -- * * a local binding - external_ids:iface-id is set and -- * which is already bound to the corresponding Port_Binding -- * row. -- * -- * It maintains a list of children -- * (of type BT_CONTAINER/BT_VIRTUAL) if any. -- * -- * BT_CONTAINER: Represents a local binding which has a parent of type -- * BT_VIF. Its Port_Binding row's 'parent' column is set to -- * its parent's Port_Binding. It shares the OVS interface row -- * with the parent. -- * Each ovn-controller when it sees a container Port_Binding, -- * it creates 'struct local_binding' for the parent -- * Port_Binding and for its even if the OVS interface row for -- * the parent is not present. -- * -- * BT_VIRTUAL: Represents a local binding which has a parent of type BT_VIF. -- * Its Port_Binding type is "virtual" and it shares the OVS -- * interface row with the parent. -- * Port_Binding of type "virtual" is claimed by pinctrl module -- * when it sees the ARP packet from the parent's VIF. -- * -+ * struct local_binding has 3 main fields: -+ * - name : 'external_ids:iface-id' of the OVS interface (key). -+ * - OVS interface row object. -+ * - List of 'binding_lport' objects with the primary lport -+ * in the front of the list (if present). - * - * An object of 'struct local_binding' is created: -- * - For each interface that has iface-id configured with the type - BT_VIF. -- * -- * - For each container Port Binding (of type BT_CONTAINER) and its -- * parent Port_Binding (of type BT_VIF), no matter if -- * they are bound to this chassis i.e even if OVS interface row for the -- * parent is not present. -+ * - For each interface that has external_ids:iface-id configured. - * -- * - For each 'virtual' Port Binding (of type BT_VIRTUAL) provided its parent -- * is bound to this chassis. -+ * - For each port binding (also referred as lport) of type 'LP_VIF' -+ * if it is a parent lport of container lports even if there is no -+ * corresponding OVS interface. 
- */ -+struct local_binding { -+ char *name; -+ const struct ovsrec_interface *iface; -+ struct ovs_list binding_lports; -+}; - --static struct local_binding * --local_binding_create(const char *name, const struct ovsrec_interface *iface, -- const struct sbrec_port_binding *pb, -- enum local_binding_type type) --{ -- struct local_binding *lbinding = xzalloc(sizeof *lbinding); -- lbinding->name = xstrdup(name); -- lbinding->type = type; -- lbinding->pb = pb; -- lbinding->iface = iface; -- shash_init(&lbinding->children); -- return lbinding; --} -- --static void --local_binding_add(struct shash *local_bindings, struct local_binding *lbinding) --{ -- shash_add(local_bindings, lbinding->name, lbinding); --} -+/* This structure represents a logical port (or port binding) -+ * which is associated with 'struct local_binding'. -+ * -+ * An instance of 'struct binding_lport' is created for a logical port -+ * - If the OVS interface's iface-id corresponds to the logical port. -+ * - If it is a container or virtual logical port and its parent -+ * has a 'local binding'. -+ * -+ */ -+struct binding_lport { -+ struct ovs_list list_node; /* Node in local_binding.binding_lports. 
*/ - --static void --local_binding_destroy(struct local_binding *lbinding) --{ -- local_bindings_destroy(&lbinding->children); -+ char *name; -+ const struct sbrec_port_binding *pb; -+ struct local_binding *lbinding; -+ enum en_lport_type type; -+}; - -- free(lbinding->name); -- free(lbinding); --} -+static struct local_binding *local_binding_create( -+ const char *name, const struct ovsrec_interface *); -+static void local_binding_add(struct shash *local_bindings, -+ struct local_binding *); -+static struct local_binding *local_binding_find( -+ struct shash *local_bindings, const char *name); -+static void local_binding_destroy(struct local_binding *, -+ struct shash *binding_lports); -+static void local_binding_delete(struct local_binding *, -+ struct shash *local_bindings, -+ struct shash *binding_lports); -+static struct binding_lport *local_binding_add_lport( -+ struct shash *binding_lports, -+ struct local_binding *, -+ const struct sbrec_port_binding *, -+ enum en_lport_type); -+static struct binding_lport *local_binding_get_primary_lport( -+ struct local_binding *); -+static bool local_binding_handle_stale_binding_lports( -+ struct local_binding *lbinding, struct binding_ctx_in *b_ctx_in, -+ struct binding_ctx_out *b_ctx_out, struct hmap *qos_map); -+ -+static struct binding_lport *binding_lport_create( -+ const struct sbrec_port_binding *, -+ struct local_binding *, enum en_lport_type); -+static void binding_lport_destroy(struct binding_lport *); -+static void binding_lport_delete(struct shash *binding_lports, -+ struct binding_lport *); -+static void binding_lport_add(struct shash *binding_lports, -+ struct binding_lport *); -+static struct binding_lport *binding_lport_find( -+ struct shash *binding_lports, const char *lport_name); -+static const struct sbrec_port_binding *binding_lport_get_parent_pb( -+ struct binding_lport *b_lprt); -+static struct binding_lport *binding_lport_check_and_cleanup( -+ struct binding_lport *, struct shash *b_lports); -+ 
-+static char *get_lport_type_str(enum en_lport_type lport_type); - - void --local_bindings_init(struct shash *local_bindings) -+local_binding_data_init(struct local_binding_data *lbinding_data) - { -- shash_init(local_bindings); -+ shash_init(&lbinding_data->bindings); -+ shash_init(&lbinding_data->lports); - } - - void --local_bindings_destroy(struct shash *local_bindings) -+local_binding_data_destroy(struct local_binding_data *lbinding_data) - { - struct shash_node *node, *next; -- SHASH_FOR_EACH_SAFE (node, next, local_bindings) { -+ -+ SHASH_FOR_EACH_SAFE (node, next, &lbinding_data->lports) { -+ struct binding_lport *b_lport = node->data; -+ binding_lport_destroy(b_lport); -+ shash_delete(&lbinding_data->lports, node); -+ } -+ -+ SHASH_FOR_EACH_SAFE (node, next, &lbinding_data->bindings) { - struct local_binding *lbinding = node->data; -- local_binding_destroy(lbinding); -- shash_delete(local_bindings, node); -+ local_binding_destroy(lbinding, &lbinding_data->lports); -+ shash_delete(&lbinding_data->bindings, node); - } - -- shash_destroy(local_bindings); -+ shash_destroy(&lbinding_data->lports); -+ shash_destroy(&lbinding_data->bindings); - } - --static --void local_binding_delete(struct shash *local_bindings, -- struct local_binding *lbinding) -+const struct sbrec_port_binding * -+local_binding_get_primary_pb(struct shash *local_bindings, const char *pb_name) - { -- shash_find_and_delete(local_bindings, lbinding->name); -- local_binding_destroy(lbinding); --} -+ struct local_binding *lbinding = -+ local_binding_find(local_bindings, pb_name); -+ struct binding_lport *b_lport = local_binding_get_primary_lport(lbinding); - --static void --local_binding_add_child(struct local_binding *lbinding, -- struct local_binding *child) --{ -- local_binding_add(&lbinding->children, child); -- child->parent = lbinding; -+ return b_lport ? 
b_lport->pb : NULL; - } - --static struct local_binding * --local_binding_find_child(struct local_binding *lbinding, -- const char *child_name) -+void -+binding_dump_local_bindings(struct local_binding_data *lbinding_data, -+ struct ds *out_data) - { -- return local_binding_find(&lbinding->children, child_name); --} -+ const struct shash_node **nodes; - --static void --local_binding_delete_child(struct local_binding *lbinding, -- struct local_binding *child) --{ -- shash_find_and_delete(&lbinding->children, child->name); -+ nodes = shash_sort(&lbinding_data->bindings); -+ size_t n = shash_count(&lbinding_data->bindings); -+ -+ ds_put_cstr(out_data, "Local bindings:\n"); -+ for (size_t i = 0; i < n; i++) { -+ const struct shash_node *node = nodes[i]; -+ struct local_binding *lbinding = node->data; -+ size_t num_lports = ovs_list_size(&lbinding->binding_lports); -+ ds_put_format(out_data, "name: [%s], OVS interface name : [%s], " -+ "num binding lports : [%"PRIuSIZE"]\n", -+ lbinding->name, -+ lbinding->iface ? 
lbinding->iface->name : "NULL", -+ num_lports); -+ -+ if (num_lports) { -+ struct shash child_lports = SHASH_INITIALIZER(&child_lports); -+ struct binding_lport *primary_lport = NULL; -+ struct binding_lport *b_lport; -+ bool first_elem = true; -+ -+ LIST_FOR_EACH (b_lport, list_node, &lbinding->binding_lports) { -+ if (first_elem && b_lport->type == LP_VIF) { -+ primary_lport = b_lport; -+ } else { -+ shash_add(&child_lports, b_lport->name, b_lport); -+ } -+ first_elem = false; -+ } -+ -+ if (primary_lport) { -+ ds_put_format(out_data, "primary lport : [%s]\n", -+ primary_lport->name); -+ } else { -+ ds_put_format(out_data, "no primary lport\n"); -+ } -+ -+ if (!shash_is_empty(&child_lports)) { -+ const struct shash_node **c_nodes = -+ shash_sort(&child_lports); -+ for (size_t j = 0; j < shash_count(&child_lports); j++) { -+ b_lport = c_nodes[j]->data; -+ ds_put_format(out_data, "child lport[%"PRIuSIZE"] : [%s], " -+ "type : [%s]\n", j + 1, b_lport->name, -+ get_lport_type_str(b_lport->type)); -+ } -+ free(c_nodes); -+ } -+ shash_destroy(&child_lports); -+ } -+ -+ ds_put_cstr(out_data, "----------------------------------------\n"); -+ } -+ -+ free(nodes); - } - - static bool -@@ -744,12 +807,6 @@ is_lport_vif(const struct sbrec_port_binding *pb) - return !pb->type[0]; - } - --static bool --is_lport_container(const struct sbrec_port_binding *pb) --{ -- return is_lport_vif(pb) && pb->parent_port && pb->parent_port[0]; --} -- - static struct tracked_binding_datapath * - tracked_binding_datapath_create(const struct sbrec_datapath_binding *dp, - bool is_new, -@@ -818,26 +875,13 @@ binding_tracked_dp_destroy(struct hmap *tracked_datapaths) - hmap_destroy(tracked_datapaths); - } - --/* Corresponds to each Port_Binding.type. 
*/ --enum en_lport_type { -- LP_UNKNOWN, -- LP_VIF, -- LP_PATCH, -- LP_L3GATEWAY, -- LP_LOCALNET, -- LP_LOCALPORT, -- LP_L2GATEWAY, -- LP_VTEP, -- LP_CHASSISREDIRECT, -- LP_VIRTUAL, -- LP_EXTERNAL, -- LP_REMOTE --}; -- - static enum en_lport_type - get_lport_type(const struct sbrec_port_binding *pb) - { - if (is_lport_vif(pb)) { -+ if (pb->parent_port && pb->parent_port[0]) { -+ return LP_CONTAINER; -+ } - return LP_VIF; - } else if (!strcmp(pb->type, "patch")) { - return LP_PATCH; -@@ -864,6 +908,41 @@ get_lport_type(const struct sbrec_port_binding *pb) - return LP_UNKNOWN; - } - -+static char * -+get_lport_type_str(enum en_lport_type lport_type) -+{ -+ switch (lport_type) { -+ case LP_VIF: -+ return "VIF"; -+ case LP_CONTAINER: -+ return "CONTAINER"; -+ case LP_VIRTUAL: -+ return "VIRTUAL"; -+ case LP_PATCH: -+ return "PATCH"; -+ case LP_CHASSISREDIRECT: -+ return "CHASSISREDIRECT"; -+ case LP_L3GATEWAY: -+ return "L3GATEWAT"; -+ case LP_LOCALNET: -+ return "PATCH"; -+ case LP_LOCALPORT: -+ return "LOCALPORT"; -+ case LP_L2GATEWAY: -+ return "L2GATEWAY"; -+ case LP_EXTERNAL: -+ return "EXTERNAL"; -+ case LP_REMOTE: -+ return "REMOTE"; -+ case LP_VTEP: -+ return "VTEP"; -+ case LP_UNKNOWN: -+ return "UNKNOWN"; -+ } -+ -+ OVS_NOT_REACHED(); -+} -+ - /* For newly claimed ports, if 'notify_up' is 'false': - * - set the 'pb.up' field to true if 'pb' has no 'parent_pb'. 
- * - set the 'pb.up' field to true if 'parent_pb.up' is 'true' (e.g., for -@@ -991,14 +1070,15 @@ release_lport(const struct sbrec_port_binding *pb, bool sb_readonly, - static bool - is_lbinding_set(struct local_binding *lbinding) - { -- return lbinding && lbinding->pb && lbinding->iface; -+ return lbinding && lbinding->iface; - } - - static bool --is_lbinding_this_chassis(struct local_binding *lbinding, -- const struct sbrec_chassis *chassis) -+is_binding_lport_this_chassis(struct binding_lport *b_lport, -+ const struct sbrec_chassis *chassis) - { -- return lbinding && lbinding->pb && lbinding->pb->chassis == chassis; -+ return (b_lport && b_lport->pb && chassis && -+ b_lport->pb->chassis == chassis); - } - - static bool -@@ -1010,15 +1090,14 @@ can_bind_on_this_chassis(const struct sbrec_chassis *chassis_rec, - || !strcmp(requested_chassis, chassis_rec->hostname); - } - --/* Returns 'true' if the 'lbinding' has children of type BT_CONTAINER, -+/* Returns 'true' if the 'lbinding' has binding lports of type LP_CONTAINER, - * 'false' otherwise. 
*/ - static bool - is_lbinding_container_parent(struct local_binding *lbinding) - { -- struct shash_node *node; -- SHASH_FOR_EACH (node, &lbinding->children) { -- struct local_binding *l = node->data; -- if (l->type == BT_CONTAINER) { -+ struct binding_lport *b_lport; -+ LIST_FOR_EACH (b_lport, list_node, &lbinding->binding_lports) { -+ if (b_lport->type == LP_CONTAINER) { - return true; - } - } -@@ -1027,66 +1106,41 @@ is_lbinding_container_parent(struct local_binding *lbinding) - } - - static bool --release_local_binding_children(const struct sbrec_chassis *chassis_rec, -- struct local_binding *lbinding, -- bool sb_readonly, -- struct hmap *tracked_dp_bindings) --{ -- struct shash_node *node; -- SHASH_FOR_EACH (node, &lbinding->children) { -- struct local_binding *l = node->data; -- if (is_lbinding_this_chassis(l, chassis_rec)) { -- if (!release_lport(l->pb, sb_readonly, tracked_dp_bindings)) { -- return false; -- } -+release_binding_lport(const struct sbrec_chassis *chassis_rec, -+ struct binding_lport *b_lport, bool sb_readonly, -+ struct binding_ctx_out *b_ctx_out) -+{ -+ if (is_binding_lport_this_chassis(b_lport, chassis_rec)) { -+ remove_local_lport_ids(b_lport->pb, b_ctx_out); -+ if (!release_lport(b_lport->pb, sb_readonly, -+ b_ctx_out->tracked_dp_bindings)) { -+ return false; - } -- -- /* Clear the local bindings' 'iface'. 
*/ -- l->iface = NULL; - } - - return true; - } - --static bool --release_local_binding(const struct sbrec_chassis *chassis_rec, -- struct local_binding *lbinding, bool sb_readonly, -- struct hmap *tracked_dp_bindings) --{ -- if (!release_local_binding_children(chassis_rec, lbinding, -- sb_readonly, tracked_dp_bindings)) { -- return false; -- } -- -- bool retval = true; -- if (is_lbinding_this_chassis(lbinding, chassis_rec)) { -- retval = release_lport(lbinding->pb, sb_readonly, tracked_dp_bindings); -- } -- -- lbinding->pb = NULL; -- lbinding->iface = NULL; -- return retval; --} -- - static bool - consider_vif_lport_(const struct sbrec_port_binding *pb, - bool can_bind, const char *vif_chassis, - struct binding_ctx_in *b_ctx_in, - struct binding_ctx_out *b_ctx_out, -- struct local_binding *lbinding, -+ struct binding_lport *b_lport, - struct hmap *qos_map) - { -- bool lbinding_set = is_lbinding_set(lbinding); -+ bool lbinding_set = b_lport && is_lbinding_set(b_lport->lbinding); -+ - if (lbinding_set) { - if (can_bind) { - /* We can claim the lport. */ - const struct sbrec_port_binding *parent_pb = -- lbinding->parent ? 
lbinding->parent->pb : NULL; -+ binding_lport_get_parent_pb(b_lport); - - if (!claim_lport(pb, parent_pb, b_ctx_in->chassis_rec, -- lbinding->iface, !b_ctx_in->ovnsb_idl_txn, -- !lbinding->parent, -- b_ctx_out->tracked_dp_bindings)){ -+ b_lport->lbinding->iface, -+ !b_ctx_in->ovnsb_idl_txn, -+ !parent_pb, b_ctx_out->tracked_dp_bindings)){ - return false; - } - -@@ -1098,7 +1152,7 @@ consider_vif_lport_(const struct sbrec_port_binding *pb, - b_ctx_out->tracked_dp_bindings); - update_local_lport_ids(pb, b_ctx_out); - update_local_lports(pb->logical_port, b_ctx_out); -- if (lbinding->iface && qos_map && b_ctx_in->ovs_idl_txn) { -+ if (b_lport->lbinding->iface && qos_map && b_ctx_in->ovs_idl_txn) { - get_qos_params(pb, qos_map); - } - } else { -@@ -1136,16 +1190,19 @@ consider_vif_lport(const struct sbrec_port_binding *pb, - vif_chassis); - - if (!lbinding) { -- lbinding = local_binding_find(b_ctx_out->local_bindings, -+ lbinding = local_binding_find(&b_ctx_out->lbinding_data->bindings, - pb->logical_port); - } - -+ struct binding_lport *b_lport = NULL; - if (lbinding) { -- lbinding->pb = pb; -+ struct shash *binding_lports = -+ &b_ctx_out->lbinding_data->lports; -+ b_lport = local_binding_add_lport(binding_lports, lbinding, pb, LP_VIF); - } - - return consider_vif_lport_(pb, can_bind, vif_chassis, b_ctx_in, -- b_ctx_out, lbinding, qos_map); -+ b_ctx_out, b_lport, qos_map); - } - - static bool -@@ -1154,9 +1211,9 @@ consider_container_lport(const struct sbrec_port_binding *pb, - struct binding_ctx_out *b_ctx_out, - struct hmap *qos_map) - { -+ struct shash *local_bindings = &b_ctx_out->lbinding_data->bindings; - struct local_binding *parent_lbinding; -- parent_lbinding = local_binding_find(b_ctx_out->local_bindings, -- pb->parent_port); -+ parent_lbinding = local_binding_find(local_bindings, pb->parent_port); - - if (!parent_lbinding) { - /* There is no local_binding for parent port. 
Create it -@@ -1171,54 +1228,61 @@ consider_container_lport(const struct sbrec_port_binding *pb, - * we want the these container ports also be claimed by the - * chassis. - * */ -- parent_lbinding = local_binding_create(pb->parent_port, NULL, NULL, -- BT_VIF); -- local_binding_add(b_ctx_out->local_bindings, parent_lbinding); -+ parent_lbinding = local_binding_create(pb->parent_port, NULL); -+ local_binding_add(local_bindings, parent_lbinding); - } - -- struct local_binding *container_lbinding = -- local_binding_find_child(parent_lbinding, pb->logical_port); -+ struct shash *binding_lports = &b_ctx_out->lbinding_data->lports; -+ struct binding_lport *container_b_lport = -+ local_binding_add_lport(binding_lports, parent_lbinding, pb, -+ LP_CONTAINER); - -- if (!container_lbinding) { -- container_lbinding = local_binding_create(pb->logical_port, -- parent_lbinding->iface, -- pb, BT_CONTAINER); -- local_binding_add_child(parent_lbinding, container_lbinding); -- } else { -- ovs_assert(container_lbinding->type == BT_CONTAINER); -- container_lbinding->pb = pb; -- container_lbinding->iface = parent_lbinding->iface; -- } -+ struct binding_lport *parent_b_lport = -+ binding_lport_find(binding_lports, pb->parent_port); - -- if (!parent_lbinding->pb) { -- parent_lbinding->pb = lport_lookup_by_name( -+ bool can_consider_c_lport = true; -+ if (!parent_b_lport || !parent_b_lport->pb) { -+ const struct sbrec_port_binding *parent_pb = lport_lookup_by_name( - b_ctx_in->sbrec_port_binding_by_name, pb->parent_port); - -- if (parent_lbinding->pb) { -+ if (parent_pb && get_lport_type(parent_pb) == LP_VIF) { - /* Its possible that the parent lport is not considered yet. - * So call consider_vif_lport() to process it first. 
*/ -- consider_vif_lport(parent_lbinding->pb, b_ctx_in, b_ctx_out, -+ consider_vif_lport(parent_pb, b_ctx_in, b_ctx_out, - parent_lbinding, qos_map); -+ parent_b_lport = binding_lport_find(binding_lports, -+ pb->parent_port); - } else { -- /* The parent lport doesn't exist. Call release_lport() to -- * release the container lport, if it was bound earlier. */ -- if (is_lbinding_this_chassis(container_lbinding, -- b_ctx_in->chassis_rec)) { -- return release_lport(pb, !b_ctx_in->ovnsb_idl_txn, -- b_ctx_out->tracked_dp_bindings); -- } -+ /* The parent lport doesn't exist. Cannot consider the container -+ * lport for binding. */ -+ can_consider_c_lport = false; -+ } -+ } - -- return true; -+ if (parent_b_lport && parent_b_lport->type != LP_VIF) { -+ can_consider_c_lport = false; -+ } -+ -+ if (!can_consider_c_lport) { -+ /* Call release_lport() to release the container lport, -+ * if it was bound earlier. */ -+ if (is_binding_lport_this_chassis(container_b_lport, -+ b_ctx_in->chassis_rec)) { -+ return release_lport(pb, !b_ctx_in->ovnsb_idl_txn, -+ b_ctx_out->tracked_dp_bindings); - } -+ -+ return true; - } - -- const char *vif_chassis = smap_get(&parent_lbinding->pb->options, -+ ovs_assert(parent_b_lport && parent_b_lport->pb); -+ const char *vif_chassis = smap_get(&parent_b_lport->pb->options, - "requested-chassis"); - bool can_bind = can_bind_on_this_chassis(b_ctx_in->chassis_rec, - vif_chassis); - - return consider_vif_lport_(pb, can_bind, vif_chassis, b_ctx_in, b_ctx_out, -- container_lbinding, qos_map); -+ container_b_lport, qos_map); - } - - static bool -@@ -1227,46 +1291,58 @@ consider_virtual_lport(const struct sbrec_port_binding *pb, - struct binding_ctx_out *b_ctx_out, - struct hmap *qos_map) - { -- struct local_binding * parent_lbinding = -- pb->virtual_parent ? local_binding_find(b_ctx_out->local_bindings, -+ struct shash *local_bindings = &b_ctx_out->lbinding_data->bindings; -+ struct local_binding *parent_lbinding = -+ pb->virtual_parent ? 
local_binding_find(local_bindings, - pb->virtual_parent) - : NULL; - -- if (parent_lbinding && !parent_lbinding->pb) { -- parent_lbinding->pb = lport_lookup_by_name( -- b_ctx_in->sbrec_port_binding_by_name, pb->virtual_parent); -- -- if (parent_lbinding->pb) { -- /* Its possible that the parent lport is not considered yet. -- * So call consider_vif_lport() to process it first. */ -- consider_vif_lport(parent_lbinding->pb, b_ctx_in, b_ctx_out, -- parent_lbinding, qos_map); -- } -- } -- -+ struct binding_lport *virtual_b_lport = NULL; - /* Unlike container lports, we don't have to create parent_lbinding if - * it is NULL. This is because, if parent_lbinding is not present, it - * means the virtual port can't bind in this chassis. - * Note: pinctrl module binds the virtual lport when it sees ARP - * packet from the parent lport. */ -- struct local_binding *virtual_lbinding = NULL; -- if (is_lbinding_this_chassis(parent_lbinding, b_ctx_in->chassis_rec)) { -- virtual_lbinding = -- local_binding_find_child(parent_lbinding, pb->logical_port); -- if (!virtual_lbinding) { -- virtual_lbinding = local_binding_create(pb->logical_port, -- parent_lbinding->iface, -- pb, BT_VIRTUAL); -- local_binding_add_child(parent_lbinding, virtual_lbinding); -- } else { -- ovs_assert(virtual_lbinding->type == BT_VIRTUAL); -- virtual_lbinding->pb = pb; -- virtual_lbinding->iface = parent_lbinding->iface; -+ if (parent_lbinding) { -+ struct shash *binding_lports = &b_ctx_out->lbinding_data->lports; -+ -+ struct binding_lport *parent_b_lport = -+ binding_lport_find(binding_lports, pb->virtual_parent); -+ -+ if (!parent_b_lport || !parent_b_lport->pb) { -+ const struct sbrec_port_binding *parent_pb = lport_lookup_by_name( -+ b_ctx_in->sbrec_port_binding_by_name, pb->virtual_parent); -+ -+ if (parent_pb && get_lport_type(parent_pb) == LP_VIF) { -+ /* Its possible that the parent lport is not considered yet. -+ * So call consider_vif_lport() to process it first. 
*/ -+ consider_vif_lport(parent_pb, b_ctx_in, b_ctx_out, -+ parent_lbinding, qos_map); -+ } -+ } -+ -+ parent_b_lport = local_binding_get_primary_lport(parent_lbinding); -+ if (is_binding_lport_this_chassis(parent_b_lport, -+ b_ctx_in->chassis_rec)) { -+ virtual_b_lport = -+ local_binding_add_lport(binding_lports, parent_lbinding, pb, -+ LP_VIRTUAL); - } - } - -- return consider_vif_lport_(pb, true, NULL, b_ctx_in, b_ctx_out, -- virtual_lbinding, qos_map); -+ if (!consider_vif_lport_(pb, true, NULL, b_ctx_in, b_ctx_out, -+ virtual_b_lport, qos_map)) { -+ return false; -+ } -+ -+ /* If the virtual lport is not bound to this chassis, then remove -+ * its entry from the local_lport_ids if present. This is required -+ * when a virtual port moves from one chassis to other.*/ -+ if (!virtual_b_lport) { -+ remove_local_lport_ids(pb, b_ctx_out); -+ } -+ -+ return true; - } - - /* Considers either claiming the lport or releasing the lport -@@ -1407,6 +1483,8 @@ build_local_bindings(struct binding_ctx_in *b_ctx_in, - continue; - } - -+ struct shash *local_bindings = -+ &b_ctx_out->lbinding_data->bindings; - for (j = 0; j < port_rec->n_interfaces; j++) { - const struct ovsrec_interface *iface_rec; - -@@ -1416,11 +1494,10 @@ build_local_bindings(struct binding_ctx_in *b_ctx_in, - - if (iface_id && ofport > 0) { - struct local_binding *lbinding = -- local_binding_find(b_ctx_out->local_bindings, iface_id); -+ local_binding_find(local_bindings, iface_id); - if (!lbinding) { -- lbinding = local_binding_create(iface_id, iface_rec, NULL, -- BT_VIF); -- local_binding_add(b_ctx_out->local_bindings, lbinding); -+ lbinding = local_binding_create(iface_id, iface_rec); -+ local_binding_add(local_bindings, lbinding); - } else { - static struct vlog_rate_limit rl = - VLOG_RATE_LIMIT_INIT(1, 5); -@@ -1431,7 +1508,6 @@ build_local_bindings(struct binding_ctx_in *b_ctx_in, - "configuration on interface [%s]", - lbinding->iface->name, iface_rec->name, - iface_rec->name); -- 
ovs_assert(lbinding->type == BT_VIF); - } - - update_local_lports(iface_id, b_ctx_out); -@@ -1494,11 +1570,11 @@ binding_run(struct binding_ctx_in *b_ctx_in, struct binding_ctx_out *b_ctx_out) - break; - - case LP_VIF: -- if (is_lport_container(pb)) { -- consider_container_lport(pb, b_ctx_in, b_ctx_out, qos_map_ptr); -- } else { -- consider_vif_lport(pb, b_ctx_in, b_ctx_out, NULL, qos_map_ptr); -- } -+ consider_vif_lport(pb, b_ctx_in, b_ctx_out, NULL, qos_map_ptr); -+ break; -+ -+ case LP_CONTAINER: -+ consider_container_lport(pb, b_ctx_in, b_ctx_out, qos_map_ptr); - break; - - case LP_VIRTUAL: -@@ -1799,39 +1875,44 @@ consider_iface_claim(const struct ovsrec_interface *iface_rec, - update_local_lports(iface_id, b_ctx_out); - smap_replace(b_ctx_out->local_iface_ids, iface_rec->name, iface_id); - -- struct local_binding *lbinding = -- local_binding_find(b_ctx_out->local_bindings, iface_id); -+ struct shash *local_bindings = &b_ctx_out->lbinding_data->bindings; -+ struct shash *binding_lports = &b_ctx_out->lbinding_data->lports; -+ struct local_binding *lbinding = local_binding_find(local_bindings, -+ iface_id); - - if (!lbinding) { -- lbinding = local_binding_create(iface_id, iface_rec, NULL, BT_VIF); -- local_binding_add(b_ctx_out->local_bindings, lbinding); -+ lbinding = local_binding_create(iface_id, iface_rec); -+ local_binding_add(local_bindings, lbinding); - } else { - lbinding->iface = iface_rec; - } - -- if (!lbinding->pb || strcmp(lbinding->name, lbinding->pb->logical_port)) { -- lbinding->pb = lport_lookup_by_name( -- b_ctx_in->sbrec_port_binding_by_name, lbinding->name); -- if (lbinding->pb && !strcmp(lbinding->pb->type, "virtual")) { -- lbinding->pb = NULL; -+ struct binding_lport *b_lport = local_binding_get_primary_lport(lbinding); -+ const struct sbrec_port_binding *pb = NULL; -+ if (!b_lport) { -+ pb = lport_lookup_by_name(b_ctx_in->sbrec_port_binding_by_name, -+ lbinding->name); -+ if (pb && get_lport_type(pb) == LP_VIF) { -+ b_lport = 
local_binding_add_lport(binding_lports, lbinding, pb, -+ LP_VIF); - } - } - -- if (lbinding->pb) { -- if (!consider_vif_lport(lbinding->pb, b_ctx_in, b_ctx_out, -- lbinding, qos_map)) { -- return false; -- } -+ if (!b_lport) { -+ /* There is no binding lport for this local binding. */ -+ return true; -+ } -+ -+ if (!consider_vif_lport(b_lport->pb, b_ctx_in, b_ctx_out, -+ lbinding, qos_map)) { -+ return false; - } - - /* Update the child local_binding's iface (if any children) and try to - * claim the container lbindings. */ -- struct shash_node *node; -- SHASH_FOR_EACH (node, &lbinding->children) { -- struct local_binding *child = node->data; -- child->iface = iface_rec; -- if (child->type == BT_CONTAINER) { -- if (!consider_container_lport(child->pb, b_ctx_in, b_ctx_out, -+ LIST_FOR_EACH (b_lport, list_node, &lbinding->binding_lports) { -+ if (b_lport->type == LP_CONTAINER) { -+ if (!consider_container_lport(b_lport->pb, b_ctx_in, b_ctx_out, - qos_map)) { - return false; - } -@@ -1862,32 +1943,42 @@ consider_iface_release(const struct ovsrec_interface *iface_rec, - struct binding_ctx_out *b_ctx_out) - { - struct local_binding *lbinding; -- lbinding = local_binding_find(b_ctx_out->local_bindings, -- iface_id); -- if (is_lbinding_this_chassis(lbinding, b_ctx_in->chassis_rec)) { -+ struct shash *local_bindings = &b_ctx_out->lbinding_data->bindings; -+ struct shash *binding_lports = &b_ctx_out->lbinding_data->lports; -+ -+ lbinding = local_binding_find(local_bindings, iface_id); -+ struct binding_lport *b_lport = local_binding_get_primary_lport(lbinding); -+ if (is_binding_lport_this_chassis(b_lport, b_ctx_in->chassis_rec)) { - struct local_datapath *ld = - get_local_datapath(b_ctx_out->local_datapaths, -- lbinding->pb->datapath->tunnel_key); -+ b_lport->pb->datapath->tunnel_key); - if (ld) { -- remove_pb_from_local_datapath(lbinding->pb, -- b_ctx_in->chassis_rec, -- b_ctx_out, ld); -+ remove_pb_from_local_datapath(b_lport->pb, -+ b_ctx_in->chassis_rec, -+ b_ctx_out, 
ld); - } - -- /* Note: release_local_binding() resets lbinding->pb and -- * lbinding->iface. -- * Cannot access these members of lbinding after this call. */ -- if (!release_local_binding(b_ctx_in->chassis_rec, lbinding, -- !b_ctx_in->ovnsb_idl_txn, -- b_ctx_out->tracked_dp_bindings)) { -- return false; -+ /* Release the primary binding lport and other children lports if -+ * any. */ -+ LIST_FOR_EACH (b_lport, list_node, &lbinding->binding_lports) { -+ if (!release_binding_lport(b_ctx_in->chassis_rec, b_lport, -+ !b_ctx_in->ovnsb_idl_txn, -+ b_ctx_out)) { -+ return false; -+ } - } -+ -+ } -+ -+ if (lbinding) { -+ /* Clear the iface of the local binding. */ -+ lbinding->iface = NULL; - } - - /* Check if the lbinding has children of type PB_CONTAINER. - * If so, don't delete the local_binding. */ - if (lbinding && !is_lbinding_container_parent(lbinding)) { -- local_binding_delete(b_ctx_out->local_bindings, lbinding); -+ local_binding_delete(lbinding, local_bindings, binding_lports); - } - - remove_local_lports(iface_id, b_ctx_out); -@@ -2088,56 +2179,35 @@ handle_deleted_lport(const struct sbrec_port_binding *pb, - } - } - --static struct local_binding * --get_lbinding_for_lport(const struct sbrec_port_binding *pb, -- enum en_lport_type lport_type, -- struct binding_ctx_out *b_ctx_out) --{ -- ovs_assert(lport_type == LP_VIF || lport_type == LP_VIRTUAL); -- -- if (lport_type == LP_VIF && !is_lport_container(pb)) { -- return local_binding_find(b_ctx_out->local_bindings, pb->logical_port); -- } -- -- struct local_binding *parent_lbinding = NULL; -- -- if (lport_type == LP_VIRTUAL) { -- if (pb->virtual_parent) { -- parent_lbinding = local_binding_find(b_ctx_out->local_bindings, -- pb->virtual_parent); -- } -- } else { -- if (pb->parent_port) { -- parent_lbinding = local_binding_find(b_ctx_out->local_bindings, -- pb->parent_port); -- } -- } -- -- return parent_lbinding -- ? 
local_binding_find(&parent_lbinding->children, pb->logical_port) -- : NULL; --} -- - static bool - handle_deleted_vif_lport(const struct sbrec_port_binding *pb, - enum en_lport_type lport_type, - struct binding_ctx_in *b_ctx_in, - struct binding_ctx_out *b_ctx_out) - { -- struct local_binding *lbinding = -- get_lbinding_for_lport(pb, lport_type, b_ctx_out); -+ struct local_binding *lbinding = NULL; -+ bool bound = false; - -- if (lbinding) { -- lbinding->pb = NULL; -- /* The port_binding 'pb' is deleted. So there is no need to -- * clear the 'chassis' column of 'pb'. But we need to do -- * for the local_binding's children. */ -- if (lbinding->type == BT_VIF && -- !release_local_binding_children( -- b_ctx_in->chassis_rec, lbinding, -- !b_ctx_in->ovnsb_idl_txn, -- b_ctx_out->tracked_dp_bindings)) { -- return false; -+ struct shash *binding_lports = &b_ctx_out->lbinding_data->lports; -+ struct binding_lport *b_lport = binding_lport_find(binding_lports, pb->logical_port); -+ if (b_lport) { -+ lbinding = b_lport->lbinding; -+ bound = is_binding_lport_this_chassis(b_lport, b_ctx_in->chassis_rec); -+ -+ /* Remove b_lport from local_binding. */ -+ binding_lport_delete(binding_lports, b_lport); -+ } -+ -+ if (bound && lbinding && lport_type == LP_VIF) { -+ /* We need to release the container/virtual binding lports (if any) if -+ * deleted 'pb' type is LP_VIF. */ -+ struct binding_lport *c_lport; -+ LIST_FOR_EACH (c_lport, list_node, &lbinding->binding_lports) { -+ if (!release_binding_lport(b_ctx_in->chassis_rec, c_lport, -+ !b_ctx_in->ovnsb_idl_txn, -+ b_ctx_out)) { -+ return false; -+ } - } - } - -@@ -2147,18 +2217,8 @@ handle_deleted_vif_lport(const struct sbrec_port_binding *pb, - * it from local_lports if there is a VIF entry. - * consider_iface_release() takes care of removing from the local_lports - * when the interface change happens. 
*/ -- if (is_lport_container(pb)) { -+ if (lport_type == LP_CONTAINER) { - remove_local_lports(pb->logical_port, b_ctx_out); -- -- /* If the container port is removed we should also remove it from -- * its parent's children set. -- */ -- if (lbinding) { -- if (lbinding->parent) { -- local_binding_delete_child(lbinding->parent, lbinding); -- } -- local_binding_destroy(lbinding); -- } - } - - handle_deleted_lport(pb, b_ctx_in, b_ctx_out); -@@ -2177,7 +2237,7 @@ handle_updated_vif_lport(const struct sbrec_port_binding *pb, - - if (lport_type == LP_VIRTUAL) { - handled = consider_virtual_lport(pb, b_ctx_in, b_ctx_out, qos_map); -- } else if (lport_type == LP_VIF && is_lport_container(pb)) { -+ } else if (lport_type == LP_CONTAINER) { - handled = consider_container_lport(pb, b_ctx_in, b_ctx_out, qos_map); - } else { - handled = consider_vif_lport(pb, b_ctx_in, b_ctx_out, NULL, qos_map); -@@ -2189,14 +2249,14 @@ handle_updated_vif_lport(const struct sbrec_port_binding *pb, - - bool now_claimed = (pb->chassis == b_ctx_in->chassis_rec); - -- if (lport_type == LP_VIRTUAL || -- (lport_type == LP_VIF && is_lport_container(pb)) || -+ if (lport_type == LP_VIRTUAL || lport_type == LP_CONTAINER || - claimed == now_claimed) { - return true; - } - -- struct local_binding *lbinding = -- local_binding_find(b_ctx_out->local_bindings, pb->logical_port); -+ struct shash *local_bindings = &b_ctx_out->lbinding_data->bindings; -+ struct local_binding *lbinding = local_binding_find(local_bindings, -+ pb->logical_port); - - /* If the ovs port backing this binding previously was removed in the - * meantime, we won't have a local_binding for it. 
-@@ -2206,12 +2266,11 @@ handle_updated_vif_lport(const struct sbrec_port_binding *pb, - return true; - } - -- struct shash_node *node; -- SHASH_FOR_EACH (node, &lbinding->children) { -- struct local_binding *child = node->data; -- if (child->type == BT_CONTAINER) { -- handled = consider_container_lport(child->pb, b_ctx_in, b_ctx_out, -- qos_map); -+ struct binding_lport *b_lport; -+ LIST_FOR_EACH (b_lport, list_node, &lbinding->binding_lports) { -+ if (b_lport->type == LP_CONTAINER) { -+ handled = consider_container_lport(b_lport->pb, b_ctx_in, -+ b_ctx_out, qos_map); - if (!handled) { - return false; - } -@@ -2256,12 +2315,25 @@ binding_handle_port_binding_changes(struct binding_ctx_in *b_ctx_in, - - enum en_lport_type lport_type = get_lport_type(pb); - -- if (lport_type == LP_VIF) { -- if (is_lport_container(pb)) { -- shash_add(&deleted_container_pbs, pb->logical_port, pb); -- } else { -- shash_add(&deleted_vif_pbs, pb->logical_port, pb); -+ struct binding_lport *b_lport = -+ binding_lport_find(&b_ctx_out->lbinding_data->lports, -+ pb->logical_port); -+ if (b_lport) { -+ /* If the 'b_lport->type' and 'lport_type' don't match, then update -+ * the b_lport->type to the updated 'lport_type'. The function -+ * binding_lport_check_and_cleanup() will cleanup the 'b_lport' -+ * if required. 
*/ -+ if (b_lport->type != lport_type) { -+ b_lport->type = lport_type; - } -+ b_lport = binding_lport_check_and_cleanup( -+ b_lport, &b_ctx_out->lbinding_data->lports); -+ } -+ -+ if (lport_type == LP_VIF) { -+ shash_add(&deleted_vif_pbs, pb->logical_port, pb); -+ } else if (lport_type == LP_CONTAINER) { -+ shash_add(&deleted_container_pbs, pb->logical_port, pb); - } else if (lport_type == LP_VIRTUAL) { - shash_add(&deleted_virtual_pbs, pb->logical_port, pb); - } else { -@@ -2272,7 +2344,7 @@ binding_handle_port_binding_changes(struct binding_ctx_in *b_ctx_in, - struct shash_node *node; - struct shash_node *node_next; - SHASH_FOR_EACH_SAFE (node, node_next, &deleted_container_pbs) { -- handled = handle_deleted_vif_lport(node->data, LP_VIF, b_ctx_in, -+ handled = handle_deleted_vif_lport(node->data, LP_CONTAINER, b_ctx_in, - b_ctx_out); - shash_delete(&deleted_container_pbs, node); - if (!handled) { -@@ -2326,12 +2398,33 @@ delete_done: - - enum en_lport_type lport_type = get_lport_type(pb); - -+ struct binding_lport *b_lport = -+ binding_lport_find(&b_ctx_out->lbinding_data->lports, -+ pb->logical_port); -+ if (b_lport) { -+ ovs_assert(b_lport->pb == pb); -+ -+ if (b_lport->type != lport_type) { -+ b_lport->type = lport_type; -+ } -+ -+ if (b_lport->lbinding) { -+ handled = local_binding_handle_stale_binding_lports( -+ b_lport->lbinding, b_ctx_in, b_ctx_out, qos_map_ptr); -+ if (!handled) { -+ /* Backout from the handling. */ -+ break; -+ } -+ } -+ } -+ - struct local_datapath *ld = - get_local_datapath(b_ctx_out->local_datapaths, - pb->datapath->tunnel_key); - - switch (lport_type) { - case LP_VIF: -+ case LP_CONTAINER: - case LP_VIRTUAL: - handled = handle_updated_vif_lport(pb, lport_type, b_ctx_in, - b_ctx_out, qos_map_ptr); -@@ -2468,11 +2561,11 @@ binding_init(void) - * available. 
- */ - void --binding_seqno_run(struct shash *local_bindings) -+binding_seqno_run(struct local_binding_data *lbinding_data) - { - const char *iface_id; - const char *iface_id_next; -- -+ struct shash *local_bindings = &lbinding_data->bindings; - SSET_FOR_EACH_SAFE (iface_id, iface_id_next, &binding_iface_released_set) { - struct shash_node *lb_node = shash_find(local_bindings, iface_id); - -@@ -2508,16 +2601,18 @@ binding_seqno_run(struct shash *local_bindings) - * If so, then this is a newly bound interface, make sure we reset the - * Port_Binding 'up' field and the OVS Interface 'external-id'. - */ -- if (lb && lb->pb && lb->iface) { -+ struct binding_lport *b_lport = local_binding_get_primary_lport(lb); -+ if (lb && b_lport && lb->iface -+ && !simap_contains(&binding_iface_seqno_map, lb->name)) { - new_ifaces = true; - - if (smap_get(&lb->iface->external_ids, OVN_INSTALLED_EXT_ID)) { - ovsrec_interface_update_external_ids_delkey( - lb->iface, OVN_INSTALLED_EXT_ID); - } -- if (lb->pb->n_up) { -+ if (b_lport->pb->n_up) { - bool up = false; -- sbrec_port_binding_set_up(lb->pb, &up, 1); -+ sbrec_port_binding_set_up(b_lport->pb, &up, 1); - } - simap_put(&binding_iface_seqno_map, lb->name, new_seqno); - } -@@ -2542,12 +2637,13 @@ binding_seqno_run(struct shash *local_bindings) - * available. 
- */ - void --binding_seqno_install(struct shash *local_bindings) -+binding_seqno_install(struct local_binding_data *lbinding_data) - { - struct ofctrl_acked_seqnos *acked_seqnos = - ofctrl_acked_seqnos_get(binding_seq_type_pb_cfg); - struct simap_node *node; - struct simap_node *node_next; -+ struct shash *local_bindings = &lbinding_data->bindings; - - SIMAP_FOR_EACH_SAFE (node, node_next, &binding_iface_seqno_map) { - struct shash_node *lb_node = shash_find(local_bindings, node->name); -@@ -2557,7 +2653,8 @@ binding_seqno_install(struct shash *local_bindings) - } - - struct local_binding *lb = lb_node->data; -- if (!lb->pb || !lb->iface) { -+ struct binding_lport *b_lport = local_binding_get_primary_lport(lb); -+ if (!b_lport || !lb->iface) { - goto del_seqno; - } - -@@ -2568,14 +2665,12 @@ binding_seqno_install(struct shash *local_bindings) - ovsrec_interface_update_external_ids_setkey(lb->iface, - OVN_INSTALLED_EXT_ID, - "true"); -- if (lb->pb->n_up) { -+ if (b_lport->pb->n_up) { - bool up = true; - -- sbrec_port_binding_set_up(lb->pb, &up, 1); -- struct shash_node *child_node; -- SHASH_FOR_EACH (child_node, &lb->children) { -- struct local_binding *lb_child = child_node->data; -- sbrec_port_binding_set_up(lb_child->pb, &up, 1); -+ sbrec_port_binding_set_up(b_lport->pb, &up, 1); -+ LIST_FOR_EACH (b_lport, list_node, &lb->binding_lports) { -+ sbrec_port_binding_set_up(b_lport->pb, &up, 1); - } - } - -@@ -2591,3 +2686,305 @@ binding_seqno_flush(void) - { - simap_clear(&binding_iface_seqno_map); - } -+ -+/* Static functions for local_lbindind and binding_lport. 
*/ -+static struct local_binding * -+local_binding_create(const char *name, const struct ovsrec_interface *iface) -+{ -+ struct local_binding *lbinding = xzalloc(sizeof *lbinding); -+ lbinding->name = xstrdup(name); -+ lbinding->iface = iface; -+ ovs_list_init(&lbinding->binding_lports); -+ -+ return lbinding; -+} -+ -+static struct local_binding * -+local_binding_find(struct shash *local_bindings, const char *name) -+{ -+ return shash_find_data(local_bindings, name); -+} -+ -+static void -+local_binding_add(struct shash *local_bindings, struct local_binding *lbinding) -+{ -+ shash_add(local_bindings, lbinding->name, lbinding); -+} -+ -+static void -+local_binding_destroy(struct local_binding *lbinding, -+ struct shash *binding_lports) -+{ -+ struct binding_lport *b_lport; -+ LIST_FOR_EACH_POP (b_lport, list_node, &lbinding->binding_lports) { -+ b_lport->lbinding = NULL; -+ binding_lport_delete(binding_lports, b_lport); -+ } -+ -+ free(lbinding->name); -+ free(lbinding); -+} -+ -+static void -+local_binding_delete(struct local_binding *lbinding, -+ struct shash *local_bindings, -+ struct shash *binding_lports) -+{ -+ shash_find_and_delete(local_bindings, lbinding->name); -+ local_binding_destroy(lbinding, binding_lports); -+} -+ -+/* Returns the primary binding lport if present in lbinding's -+ * binding lports list. A binding lport is considered primary -+ * if binding lport's type is LP_VIF and the name matches -+ * with the 'lbinding'. 
-+ */ -+static struct binding_lport * -+local_binding_get_primary_lport(struct local_binding *lbinding) -+{ -+ if (!lbinding) { -+ return NULL; -+ } -+ -+ if (!ovs_list_is_empty(&lbinding->binding_lports)) { -+ struct binding_lport *b_lport = NULL; -+ b_lport = CONTAINER_OF(ovs_list_front(&lbinding->binding_lports), -+ struct binding_lport, list_node); -+ -+ if (b_lport->type == LP_VIF && -+ !strcmp(lbinding->name, b_lport->name)) { -+ return b_lport; -+ } -+ } -+ -+ return NULL; -+} -+ -+static struct binding_lport * -+local_binding_add_lport(struct shash *binding_lports, -+ struct local_binding *lbinding, -+ const struct sbrec_port_binding *pb, -+ enum en_lport_type b_type) -+{ -+ struct binding_lport *b_lport = -+ binding_lport_find(binding_lports, pb->logical_port); -+ bool add_to_lport_list = false; -+ if (!b_lport) { -+ b_lport = binding_lport_create(pb, lbinding, b_type); -+ binding_lport_add(binding_lports, b_lport); -+ add_to_lport_list = true; -+ } else if (b_lport->lbinding != lbinding) { -+ add_to_lport_list = true; -+ if (!ovs_list_is_empty(&b_lport->list_node)) { -+ ovs_list_remove(&b_lport->list_node); -+ } -+ b_lport->lbinding = lbinding; -+ b_lport->type = b_type; -+ } -+ -+ if (add_to_lport_list) { -+ if (b_type == LP_VIF) { -+ ovs_list_push_front(&lbinding->binding_lports, &b_lport->list_node); -+ } else { -+ ovs_list_push_back(&lbinding->binding_lports, &b_lport->list_node); -+ } -+ } -+ -+ return b_lport; -+} -+ -+/* This function handles the stale binding lports of 'lbinding' if 'lbinding' -+ * doesn't have a primary binding lport. -+ */ -+static bool -+local_binding_handle_stale_binding_lports(struct local_binding *lbinding, -+ struct binding_ctx_in *b_ctx_in, -+ struct binding_ctx_out *b_ctx_out, -+ struct hmap *qos_map) -+{ -+ /* Check if this lbinding has a primary binding_lport or not. */ -+ struct binding_lport *p_lport = local_binding_get_primary_lport(lbinding); -+ if (p_lport) { -+ /* Nothing to be done. 
*/ -+ return true; -+ } -+ -+ bool handled = true; -+ struct binding_lport *b_lport, *next; -+ const struct sbrec_port_binding *pb; -+ LIST_FOR_EACH_SAFE (b_lport, next, list_node, &lbinding->binding_lports) { -+ /* Get the lport type again from the pb. Its possible that the -+ * pb type has changed. */ -+ enum en_lport_type pb_lport_type = get_lport_type(b_lport->pb); -+ if (b_lport->type == LP_VIRTUAL && pb_lport_type == LP_VIRTUAL) { -+ pb = b_lport->pb; -+ binding_lport_delete(&b_ctx_out->lbinding_data->lports, -+ b_lport); -+ handled = consider_virtual_lport(pb, b_ctx_in, b_ctx_out, qos_map); -+ } else if (b_lport->type == LP_CONTAINER && -+ pb_lport_type == LP_CONTAINER) { -+ /* For container lport, binding_lport is preserved so that when -+ * the parent port is created, it can be considered. -+ * consider_container_lport() creates the binding_lport for the parent -+ * port (with iface set to NULL). */ -+ handled = consider_container_lport(b_lport->pb, b_ctx_in, b_ctx_out, qos_map); -+ } else { -+ /* This can happen when the lport type changes from one type -+ * to another. Eg. from normal lport to external. Release the -+ * lport if it was claimed earlier and delete the b_lport. 
*/ -+ handled = release_binding_lport(b_ctx_in->chassis_rec, b_lport, -+ !b_ctx_in->ovnsb_idl_txn, -+ b_ctx_out); -+ binding_lport_delete(&b_ctx_out->lbinding_data->lports, -+ b_lport); -+ } -+ -+ if (!handled) { -+ return false; -+ } -+ } -+ -+ return handled; -+} -+ -+static struct binding_lport * -+binding_lport_create(const struct sbrec_port_binding *pb, -+ struct local_binding *lbinding, -+ enum en_lport_type type) -+{ -+ struct binding_lport *b_lport = xzalloc(sizeof *b_lport); -+ b_lport->name = xstrdup(pb->logical_port); -+ b_lport->pb = pb; -+ b_lport->type = type; -+ b_lport->lbinding = lbinding; -+ ovs_list_init(&b_lport->list_node); -+ -+ return b_lport; -+} -+ -+static void -+binding_lport_add(struct shash *binding_lports, struct binding_lport *b_lport) -+{ -+ shash_add(binding_lports, b_lport->pb->logical_port, b_lport); -+} -+ -+static struct binding_lport * -+binding_lport_find(struct shash *binding_lports, const char *lport_name) -+{ -+ if (!lport_name) { -+ return NULL; -+ } -+ -+ return shash_find_data(binding_lports, lport_name); -+} -+ -+static void -+binding_lport_destroy(struct binding_lport *b_lport) -+{ -+ if (!ovs_list_is_empty(&b_lport->list_node)) { -+ ovs_list_remove(&b_lport->list_node); -+ } -+ -+ free(b_lport->name); -+ free(b_lport); -+} -+ -+static void -+binding_lport_delete(struct shash *binding_lports, -+ struct binding_lport *b_lport) -+{ -+ shash_find_and_delete(binding_lports, b_lport->name); -+ binding_lport_destroy(b_lport); -+} -+ -+ -+static const struct sbrec_port_binding * -+binding_lport_get_parent_pb(struct binding_lport *b_lport) -+{ -+ if (!b_lport) { -+ return NULL; -+ } -+ -+ if (b_lport->type == LP_VIF) { -+ return NULL; -+ } -+ -+ struct local_binding *lbinding = b_lport->lbinding; -+ ovs_assert(lbinding); -+ -+ struct binding_lport *parent_b_lport = -+ local_binding_get_primary_lport(lbinding); -+ -+ return parent_b_lport ? 
parent_b_lport->pb : NULL; -+} -+ -+/* This function checks and cleans up the 'b_lport' if it is -+ * not in the correct state. -+ * -+ * If the 'b_lport' type is LP_VIF, then its name and its lbinding->name -+ * should match. Otherwise this should be cleaned up. -+ * -+ * If the 'b_lport' type is LP_CONTAINER, then its parent_port name should -+ * be the same as its lbinding's name. Otherwise this should be -+ * cleaned up. -+ * -+ * If the 'b_lport' type is LP_VIRTUAL, then its virtual parent name -+ * should be the same as its lbinding's name. Otherwise this -+ * should be cleaned up. -+ * -+ * If the 'b_lport' type is not LP_VIF, LP_CONTAINER or LP_VIRTUAL, it -+ * should be cleaned up. This can happen if the CMS changes -+ * the port binding type. -+ */ -+static struct binding_lport * -+binding_lport_check_and_cleanup(struct binding_lport *b_lport, -+ struct shash *binding_lports) -+{ -+ bool cleanup_blport = false; -+ -+ if (!b_lport->lbinding) { -+ cleanup_blport = true; -+ goto cleanup; -+ } -+ -+ switch (b_lport->type) { -+ case LP_VIF: -+ if (strcmp(b_lport->name, b_lport->lbinding->name)) { -+ cleanup_blport = true; -+ } -+ break; -+ -+ case LP_CONTAINER: -+ if (strcmp(b_lport->pb->parent_port, b_lport->lbinding->name)) { -+ cleanup_blport = true; -+ } -+ break; -+ -+ case LP_VIRTUAL: -+ if (!b_lport->pb->virtual_parent || -+ strcmp(b_lport->pb->virtual_parent, b_lport->lbinding->name)) { -+ cleanup_blport = true; -+ } -+ break; -+ -+ case LP_PATCH: -+ case LP_LOCALPORT: -+ case LP_VTEP: -+ case LP_L2GATEWAY: -+ case LP_L3GATEWAY: -+ case LP_CHASSISREDIRECT: -+ case LP_EXTERNAL: -+ case LP_LOCALNET: -+ case LP_REMOTE: -+ case LP_UNKNOWN: -+ cleanup_blport = true; -+ } -+ -+cleanup: -+ if (cleanup_blport) { -+ binding_lport_delete(binding_lports, b_lport); -+ return NULL; -+ } -+ -+ return b_lport; -+} -diff --git a/controller/binding.h b/controller/binding.h -index c9ebef4b1..4fc9ef207 100644 ---- a/controller/binding.h -+++ b/controller/binding.h -@@ 
-36,6 +36,7 @@ struct sbrec_chassis; - struct sbrec_port_binding_table; - struct sset; - struct sbrec_port_binding; -+struct ds; - - struct binding_ctx_in { - struct ovsdb_idl_txn *ovnsb_idl_txn; -@@ -56,7 +57,7 @@ struct binding_ctx_in { - - struct binding_ctx_out { - struct hmap *local_datapaths; -- struct shash *local_bindings; -+ struct local_binding_data *lbinding_data; - - /* sset of (potential) local lports. */ - struct sset *local_lports; -@@ -86,28 +87,16 @@ struct binding_ctx_out { - struct hmap *tracked_dp_bindings; - }; - --enum local_binding_type { -- BT_VIF, -- BT_CONTAINER, -- BT_VIRTUAL -+struct local_binding_data { -+ struct shash bindings; -+ struct shash lports; - }; - --struct local_binding { -- char *name; -- enum local_binding_type type; -- const struct ovsrec_interface *iface; -- const struct sbrec_port_binding *pb; -- -- /* shash of 'struct local_binding' representing children. */ -- struct shash children; -- struct local_binding *parent; --}; -+void local_binding_data_init(struct local_binding_data *); -+void local_binding_data_destroy(struct local_binding_data *); - --static inline struct local_binding * --local_binding_find(struct shash *local_bindings, const char *name) --{ -- return shash_find_data(local_bindings, name); --} -+const struct sbrec_port_binding *local_binding_get_primary_pb( -+ struct shash *local_bindings, const char *pb_name); - - /* Represents a tracked binding logical port. 
*/ - struct tracked_binding_lport { -@@ -128,8 +117,6 @@ bool binding_cleanup(struct ovsdb_idl_txn *ovnsb_idl_txn, - const struct sbrec_port_binding_table *, - const struct sbrec_chassis *); - --void local_bindings_init(struct shash *local_bindings); --void local_bindings_destroy(struct shash *local_bindings); - bool binding_handle_ovs_interface_changes(struct binding_ctx_in *, - struct binding_ctx_out *); - bool binding_handle_port_binding_changes(struct binding_ctx_in *, -@@ -137,7 +124,8 @@ bool binding_handle_port_binding_changes(struct binding_ctx_in *, - void binding_tracked_dp_destroy(struct hmap *tracked_datapaths); - - void binding_init(void); --void binding_seqno_run(struct shash *local_bindings); --void binding_seqno_install(struct shash *local_bindings); -+void binding_seqno_run(struct local_binding_data *lbinding_data); -+void binding_seqno_install(struct local_binding_data *lbinding_data); - void binding_seqno_flush(void); -+void binding_dump_local_bindings(struct local_binding_data *, struct ds *); - #endif /* controller/binding.h */ -diff --git a/controller/ovn-controller.8.xml b/controller/ovn-controller.8.xml -index 51c0c372c..8886df568 100644 ---- a/controller/ovn-controller.8.xml -+++ b/controller/ovn-controller.8.xml -@@ -578,6 +578,28 @@ - Displays logical flow cache statistics: enabled/disabled, per cache - type entry counts. - -+ -+
inc-engine/show-stats
ovn-controller
engine counters. For each engine
-+ node the following counters have been added:
-+ recompute
-+ compute
-+ abort
-+ inc-engine/clear-stats
ovn-controller
engine counters.
-+ OVN_Northbound
-- database for alogical switch datapath, a priority-100 flow is added
-+ database for a logical switch datapath, a priority-100 flow is added
- with the match ip
to match on IP packets and sets the action
-- reg0[0] = 1; next;
to act as a hint for table
-+ reg0[2] = 1; next;
to act as a hint for table
- Pre-stateful
to send IP packets to the connection tracker
-- for packet de-fragmentation before eventually advancing to ingress
-- table LB
.
-+ for packet de-fragmentation (and to possibly do DNAT for already
-+ established load balanced traffic) before eventually advancing to ingress
-+ table Stateful
.
- If controller_event has been enabled and load balancing rules with
- empty backends have been added in OVN_Northbound
, a 130 flow
- is added to trigger ovn-controller events whenever the chassis receives a
-@@ -470,11 +471,38 @@
-
- This table prepares flows for all possible stateful processing
- in next tables. It contains a priority-0 flow that simply moves
-- traffic to the next table. A priority-100 flow sends the packets to
-- connection tracker based on a hint provided by the previous tables
-- (with a match for reg0[0] == 1
) by using the
-- ct_next;
action.
-+ traffic to the next table.
-
ct_lb;
as the action so that the already established
-+ traffic destined to the load balancer VIP gets DNATted based on a hint
-+ provided by the previous tables (with a match
-+ for reg0[2] == 1
and on supported load balancer protocols
-+ and address families). For IPv4 traffic the flows also load the
-+ original destination IP and transport port in registers
-+ reg1
and reg2
. For IPv6 traffic the flows
-+ also load the original destination IP and transport port in
-+ registers xxreg1
and reg2
.
-+ reg0[2] == 1
) by using the
-+ ct_lb;
action. This flow is added to handle
-+ the traffic for load balancer VIPs whose protocol is not defined
-+ (mainly for ICMP traffic).
-+ reg0[0] == 1
) by using the
-+ ct_next;
action.
-+ from-lport
ACL hints- The table contains the following flows: -
-+no
ACLs configured, otherwise a
-+ priority-0 flow to advance to the next table.
-+ reg0[10]
and then advances to the next
- table.
- from-lport
ACLs
-- This table also contains a priority 0 flow with action
-- next;
, so that ACLs allow packets by default. If the
-- logical datapath has a stateful ACL or a load balancer with VIP
-+ This table contains a priority-65535 flow to advance to the next table
-+ if the logical switch has no
ACLs configured, otherwise a
-+ priority-0 flow to advance to the next table so that ACLs allow
-+ packets by default.
-+
-+ If the logical datapath has a stateful ACL or a load balancer with VIP - configured, the following flows will also be added: -
- -@@ -615,7 +653,7 @@ - - -ct_label.blocked
set.
-@@ -628,19 +666,19 @@
- ct_label.blocked
set.
- ct_label.blocked
set meaning that the connection
- should no longer be allowed due to a policy change. Packets
- in the request direction are skipped here to let a newly created
-@@ -648,11 +686,18 @@
- -+ If the logical datapath has any ACL or a load balancer with VIP -+ configured, the following flow will also be added: -+
-+ -+eth.dst = E
to allow the service
-@@ -709,33 +754,7 @@
- -- It contains a priority-0 flow that simply moves traffic to the next -- table. --
-- --
-- A priority-65535 flow with the match
-- inport == I
for all logical switch
-- datapaths to move traffic to the next table. Where I
-- is the peer of a logical router port. This flow is added to
-- skip the connection tracking of packets which enter from
-- logical router datapath to logical switch datapath.
--
-- For established connections a priority 65534 flow matches on
-- ct.est && !ct.rel && !ct.new &&
-- !ct.inv
and sets an action reg0[2] = 1; next;
to act
-- as a hint for table Stateful
to send packets through
-- connection tracker to NAT the packets. (The packet will automatically
-- get DNATed to the same IP address as the first packet in that
-- connection.)
--
ct_commit; next;
action based on a hint provided by
- the previous tables (with a match for reg0[1] == 1
).
- ct_lb;
as the action based on a hint provided by the
-- previous tables (with a match for reg0[2] == 1
and
-- on supported load balancer protocols and address families).
-- For IPv4 traffic the flows also load the original destination
-- IP and transport port in registers reg1
and
-- reg2
. For IPv6 traffic the flows also load the original
-- destination IP and transport port in registers xxreg1
and
-- reg2
.
-- - This table implements ARP/ND responder in a logical switch for known -@@ -1164,7 +1172,7 @@ output; - - - --
- This table adds the DHCPv4 options to a DHCPv4 packet from the -@@ -1225,7 +1233,7 @@ next; - - - --
- This table implements DHCP responder for the DHCP replies generated by -@@ -1306,7 +1314,7 @@ output; - - - --
- This table looks up and resolves the DNS names to the corresponding -@@ -1335,7 +1343,7 @@ reg0[4] = dns_lookup(); next; - - - --
- This table implements DNS responder for the DNS replies generated by -@@ -1370,7 +1378,7 @@ output; - - - --
- Traffic from the external
logical ports enter the ingress
-@@ -1413,7 +1421,7 @@ output;
-
-
-
--
- This table implements switching behavior. It contains these logical
-@@ -1639,9 +1647,11 @@ output;
- Moreover it contains a priority-110 flow to move IPv6 Neighbor Discovery
- traffic to the next table. If any load balancing rules exist for the
- datapath, a priority-100 flow is added with a match of ip
-- and action of reg0[0] = 1; next;
to act as a hint for
-+ and action of reg0[2] = 1; next;
to act as a hint for
- table Pre-stateful
to send IP packets to the connection
-- tracker for packet de-fragmentation.
-+ tracker for packet de-fragmentation and possibly DNAT the destination
-+ VIP to one of the selected backends for already committed load balanced
-+ traffic.
-
-@@ -1683,20 +1693,39 @@ output; -
-- This is similar to ingress table Pre-stateful
.
-+ This is similar to ingress table Pre-stateful
. This table
-+ adds the below 3 logical flows.
-
-- This is similar to ingress table LB
.
--
ct_lb;
as the action so that the already established
-+ traffic gets unDNATted from the backend IP to the load balancer VIP
-+ based on a hint provided by the previous tables with a match
-+ for reg0[2] == 1
. If the packet was not DNATted earlier,
-+ then ct_lb
functions like ct_next
.
-+ reg0[0] == 1
) by using the
-+ ct_next;
action.
-+ from-lport
ACL hintsfrom-lport
ACL hints
- This is similar to ingress table ACL hints
.
-
to-lport
ACLsto-lport
ACLs
- This is similar to ingress table ACLs
except for
-@@ -1733,28 +1762,28 @@ output;
-
-
-
--
to-lport
QoS Markingto-lport
QoS Marking
- This is similar to ingress table QoS marking
except
- they apply to to-lport
QoS rules.
-
to-lport
QoS Meterto-lport
QoS Meter
- This is similar to ingress table QoS meter
except
- they apply to to-lport
QoS rules.
-
- This is similar to ingress table Stateful
except that
- there are no rules added for load balancing new connections.
-
- This is similar to the port security logic in table
-@@ -1764,7 +1793,7 @@ output;
- ip4.src
and ip6.src
-
- This is similar to the ingress port security logic in ingress table
-@@ -2720,7 +2749,11 @@ icmp6 {
- (and optional port numbers) to load balance to. If the router is
- configured to force SNAT any load-balanced packets, the above action
- will be replaced by flags.force_snat_for_lb = 1;
-- ct_lb(args);
. If health check is enabled, then
-+ ct_lb(args);.
-+ If the load balancing rule is configured with skip_snat
-+ set to true, the above action will be replaced by
-+ flags.skip_snat_for_lb = 1; ct_lb(args);
.
-+ If health check is enabled, then
- args will only contain those endpoints whose service
- monitor status entry in OVN_Southbound
db is
- either online
or empty.
-@@ -2737,6 +2770,9 @@ icmp6 {
- with an action of ct_dnat;
. If the router is
- configured to force SNAT any load-balanced packets, the above action
- will be replaced by flags.force_snat_for_lb = 1; ct_dnat;
.
-+ If the load balancing rule is configured with skip_snat
-+ set to true, the above action will be replaced by
-+ flags.skip_snat_for_lb = 1; ct_dnat;
.
-
-
-
flags.force_snat_for_lb = 1;
- ct_lb(args);
.
-+ If the load balancing rule is configured with skip_snat
-+ set to true, the above action will be replaced by
-+ flags.skip_snat_for_lb = 1; ct_lb(args);
.
- flags.force_snat_for_lb = 1; ct_dnat;
.
-+ If the load balancing rule is configured with skip_snat
-+ set to true, the above action will be replaced by
-+ flags.skip_snat_for_lb = 1; ct_dnat;
.
-
-+ If a load balancer configured to skip snat has been applied to
-+ the Gateway router pipeline, a priority-120 flow matches
-+ flags.skip_snat_for_lb == 1 && ip
with an
-+ action next;
.
-+
- If the Gateway router in the OVN Northbound database has been -diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c -index 5a2018c2e..ae58fda16 100644 ---- a/northd/ovn-northd.c -+++ b/northd/ovn-northd.c -@@ -97,6 +97,10 @@ static bool check_lsp_is_up; - static char svc_monitor_mac[ETH_ADDR_STRLEN + 1]; - static struct eth_addr svc_monitor_mac_ea; - -+/* If this option is 'true' northd will make use of ct.inv match fields. -+ * Otherwise, it will avoid using it. The default is true. */ -+static bool use_ct_inv_match = true; -+ - /* Default probe interval for NB and SB DB connections. */ - #define DEFAULT_PROBE_INTERVAL_MSEC 5000 - static int northd_probe_interval_nb = 0; -@@ -147,32 +151,30 @@ enum ovn_stage { - PIPELINE_STAGE(SWITCH, IN, ACL, 9, "ls_in_acl") \ - PIPELINE_STAGE(SWITCH, IN, QOS_MARK, 10, "ls_in_qos_mark") \ - PIPELINE_STAGE(SWITCH, IN, QOS_METER, 11, "ls_in_qos_meter") \ -- PIPELINE_STAGE(SWITCH, IN, LB, 12, "ls_in_lb") \ -- PIPELINE_STAGE(SWITCH, IN, STATEFUL, 13, "ls_in_stateful") \ -- PIPELINE_STAGE(SWITCH, IN, PRE_HAIRPIN, 14, "ls_in_pre_hairpin") \ -- PIPELINE_STAGE(SWITCH, IN, NAT_HAIRPIN, 15, "ls_in_nat_hairpin") \ -- PIPELINE_STAGE(SWITCH, IN, HAIRPIN, 16, "ls_in_hairpin") \ -- PIPELINE_STAGE(SWITCH, IN, ARP_ND_RSP, 17, "ls_in_arp_rsp") \ -- PIPELINE_STAGE(SWITCH, IN, DHCP_OPTIONS, 18, "ls_in_dhcp_options") \ -- PIPELINE_STAGE(SWITCH, IN, DHCP_RESPONSE, 19, "ls_in_dhcp_response") \ -- PIPELINE_STAGE(SWITCH, IN, DNS_LOOKUP, 20, "ls_in_dns_lookup") \ -- PIPELINE_STAGE(SWITCH, IN, DNS_RESPONSE, 21, "ls_in_dns_response") \ -- PIPELINE_STAGE(SWITCH, IN, EXTERNAL_PORT, 22, "ls_in_external_port") \ -- PIPELINE_STAGE(SWITCH, IN, L2_LKUP, 23, "ls_in_l2_lkup") \ -- PIPELINE_STAGE(SWITCH, IN, L2_UNKNOWN, 24, "ls_in_l2_unknown") \ -+ PIPELINE_STAGE(SWITCH, IN, STATEFUL, 12, "ls_in_stateful") \ -+ PIPELINE_STAGE(SWITCH, IN, PRE_HAIRPIN, 13, "ls_in_pre_hairpin") \ -+ PIPELINE_STAGE(SWITCH, IN, NAT_HAIRPIN, 14, "ls_in_nat_hairpin") \ -+ 
PIPELINE_STAGE(SWITCH, IN, HAIRPIN, 15, "ls_in_hairpin") \ -+ PIPELINE_STAGE(SWITCH, IN, ARP_ND_RSP, 16, "ls_in_arp_rsp") \ -+ PIPELINE_STAGE(SWITCH, IN, DHCP_OPTIONS, 17, "ls_in_dhcp_options") \ -+ PIPELINE_STAGE(SWITCH, IN, DHCP_RESPONSE, 18, "ls_in_dhcp_response") \ -+ PIPELINE_STAGE(SWITCH, IN, DNS_LOOKUP, 19, "ls_in_dns_lookup") \ -+ PIPELINE_STAGE(SWITCH, IN, DNS_RESPONSE, 20, "ls_in_dns_response") \ -+ PIPELINE_STAGE(SWITCH, IN, EXTERNAL_PORT, 21, "ls_in_external_port") \ -+ PIPELINE_STAGE(SWITCH, IN, L2_LKUP, 22, "ls_in_l2_lkup") \ -+ PIPELINE_STAGE(SWITCH, IN, L2_UNKNOWN, 23, "ls_in_l2_unknown") \ - \ - /* Logical switch egress stages. */ \ - PIPELINE_STAGE(SWITCH, OUT, PRE_LB, 0, "ls_out_pre_lb") \ - PIPELINE_STAGE(SWITCH, OUT, PRE_ACL, 1, "ls_out_pre_acl") \ - PIPELINE_STAGE(SWITCH, OUT, PRE_STATEFUL, 2, "ls_out_pre_stateful") \ -- PIPELINE_STAGE(SWITCH, OUT, LB, 3, "ls_out_lb") \ -- PIPELINE_STAGE(SWITCH, OUT, ACL_HINT, 4, "ls_out_acl_hint") \ -- PIPELINE_STAGE(SWITCH, OUT, ACL, 5, "ls_out_acl") \ -- PIPELINE_STAGE(SWITCH, OUT, QOS_MARK, 6, "ls_out_qos_mark") \ -- PIPELINE_STAGE(SWITCH, OUT, QOS_METER, 7, "ls_out_qos_meter") \ -- PIPELINE_STAGE(SWITCH, OUT, STATEFUL, 8, "ls_out_stateful") \ -- PIPELINE_STAGE(SWITCH, OUT, PORT_SEC_IP, 9, "ls_out_port_sec_ip") \ -- PIPELINE_STAGE(SWITCH, OUT, PORT_SEC_L2, 10, "ls_out_port_sec_l2") \ -+ PIPELINE_STAGE(SWITCH, OUT, ACL_HINT, 3, "ls_out_acl_hint") \ -+ PIPELINE_STAGE(SWITCH, OUT, ACL, 4, "ls_out_acl") \ -+ PIPELINE_STAGE(SWITCH, OUT, QOS_MARK, 5, "ls_out_qos_mark") \ -+ PIPELINE_STAGE(SWITCH, OUT, QOS_METER, 6, "ls_out_qos_meter") \ -+ PIPELINE_STAGE(SWITCH, OUT, STATEFUL, 7, "ls_out_stateful") \ -+ PIPELINE_STAGE(SWITCH, OUT, PORT_SEC_IP, 8, "ls_out_port_sec_ip") \ -+ PIPELINE_STAGE(SWITCH, OUT, PORT_SEC_L2, 9, "ls_out_port_sec_l2") \ - \ - /* Logical router ingress stages. 
*/ \ - PIPELINE_STAGE(ROUTER, IN, ADMISSION, 0, "lr_in_admission") \ -@@ -626,6 +628,7 @@ struct ovn_datapath { - bool has_stateful_acl; - bool has_lb_vip; - bool has_unknown; -+ bool has_acls; - - /* IPAM data. */ - struct ipam_info ipam_info; -@@ -664,9 +667,6 @@ struct ovn_datapath { - struct hmap nb_pgs; - }; - --static bool ls_has_stateful_acl(struct ovn_datapath *od); --static bool ls_has_lb_vip(struct ovn_datapath *od); -- - /* Contains a NAT entry with the external addresses pre-parsed. */ - struct ovn_nat { - const struct nbrec_nat *nb; -@@ -4729,27 +4729,38 @@ ovn_ls_port_group_destroy(struct hmap *nb_pgs) - hmap_destroy(nb_pgs); - } - --static bool --ls_has_stateful_acl(struct ovn_datapath *od) -+static void -+ls_get_acl_flags(struct ovn_datapath *od) - { -- for (size_t i = 0; i < od->nbs->n_acls; i++) { -- struct nbrec_acl *acl = od->nbs->acls[i]; -- if (!strcmp(acl->action, "allow-related")) { -- return true; -+ od->has_acls = false; -+ od->has_stateful_acl = false; -+ -+ if (od->nbs->n_acls) { -+ od->has_acls = true; -+ -+ for (size_t i = 0; i < od->nbs->n_acls; i++) { -+ struct nbrec_acl *acl = od->nbs->acls[i]; -+ if (!strcmp(acl->action, "allow-related")) { -+ od->has_stateful_acl = true; -+ return; -+ } - } - } - - struct ovn_ls_port_group *ls_pg; - HMAP_FOR_EACH (ls_pg, key_node, &od->nb_pgs) { -- for (size_t i = 0; i < ls_pg->nb_pg->n_acls; i++) { -- struct nbrec_acl *acl = ls_pg->nb_pg->acls[i]; -- if (!strcmp(acl->action, "allow-related")) { -- return true; -+ if (ls_pg->nb_pg->n_acls) { -+ od->has_acls = true; -+ -+ for (size_t i = 0; i < ls_pg->nb_pg->n_acls; i++) { -+ struct nbrec_acl *acl = ls_pg->nb_pg->acls[i]; -+ if (!strcmp(acl->action, "allow-related")) { -+ od->has_stateful_acl = true; -+ return; -+ } - } - } - } -- -- return false; - } - - /* Logical switch ingress table 0: Ingress port security - L2 -@@ -5128,8 +5139,8 @@ build_pre_lb(struct ovn_datapath *od, struct hmap *lflows, - vip_configured = (vip_configured || lb->n_vips); - 
} - -- /* 'REGBIT_CONNTRACK_DEFRAG' is set to let the pre-stateful table send -- * packet to conntrack for defragmentation. -+ /* 'REGBIT_CONNTRACK_NAT' is set to let the pre-stateful table send -+ * packet to conntrack for defragmentation and possibly for unNATting. - * - * Send all the packets to conntrack in the ingress pipeline if the - * logical switch has a load balancer with VIP configured. Earlier -@@ -5159,9 +5170,9 @@ build_pre_lb(struct ovn_datapath *od, struct hmap *lflows, - */ - if (vip_configured) { - ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB, -- 100, "ip", REGBIT_CONNTRACK_DEFRAG" = 1; next;"); -+ 100, "ip", REGBIT_CONNTRACK_NAT" = 1; next;"); - ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_LB, -- 100, "ip", REGBIT_CONNTRACK_DEFRAG" = 1; next;"); -+ 100, "ip", REGBIT_CONNTRACK_NAT" = 1; next;"); - } - } - -@@ -5173,10 +5184,46 @@ build_pre_stateful(struct ovn_datapath *od, struct hmap *lflows) - ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_STATEFUL, 0, "1", "next;"); - ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_STATEFUL, 0, "1", "next;"); - -+ const char *lb_protocols[] = {"tcp", "udp", "sctp"}; -+ struct ds actions = DS_EMPTY_INITIALIZER; -+ struct ds match = DS_EMPTY_INITIALIZER; -+ -+ for (size_t i = 0; i < ARRAY_SIZE(lb_protocols); i++) { -+ ds_clear(&match); -+ ds_clear(&actions); -+ ds_put_format(&match, REGBIT_CONNTRACK_NAT" == 1 && ip4 && %s", -+ lb_protocols[i]); -+ ds_put_format(&actions, REG_ORIG_DIP_IPV4 " = ip4.dst; " -+ REG_ORIG_TP_DPORT " = %s.dst; ct_lb;", -+ lb_protocols[i]); -+ ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_STATEFUL, 120, -+ ds_cstr(&match), ds_cstr(&actions)); -+ -+ ds_clear(&match); -+ ds_clear(&actions); -+ ds_put_format(&match, REGBIT_CONNTRACK_NAT" == 1 && ip6 && %s", -+ lb_protocols[i]); -+ ds_put_format(&actions, REG_ORIG_DIP_IPV6 " = ip6.dst; " -+ REG_ORIG_TP_DPORT " = %s.dst; ct_lb;", -+ lb_protocols[i]); -+ ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_STATEFUL, 120, -+ ds_cstr(&match), ds_cstr(&actions)); -+ } -+ -+ 
ds_destroy(&actions); -+ ds_destroy(&match); -+ -+ ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_STATEFUL, 110, -+ REGBIT_CONNTRACK_NAT" == 1", "ct_lb;"); -+ -+ ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_STATEFUL, 110, -+ REGBIT_CONNTRACK_NAT" == 1", "ct_lb;"); -+ - /* If REGBIT_CONNTRACK_DEFRAG is set as 1, then the packets should be - * sent to conntrack for tracking and defragmentation. */ - ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_STATEFUL, 100, - REGBIT_CONNTRACK_DEFRAG" == 1", "ct_next;"); -+ - ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_STATEFUL, 100, - REGBIT_CONNTRACK_DEFRAG" == 1", "ct_next;"); - } -@@ -5206,7 +5253,11 @@ build_acl_hints(struct ovn_datapath *od, struct hmap *lflows) - enum ovn_stage stage = stages[i]; - - /* In any case, advance to the next stage. */ -- ovn_lflow_add(lflows, od, stage, 0, "1", "next;"); -+ if (!od->has_acls && !od->has_lb_vip) { -+ ovn_lflow_add(lflows, od, stage, UINT16_MAX, "1", "next;"); -+ } else { -+ ovn_lflow_add(lflows, od, stage, 0, "1", "next;"); -+ } - - if (!od->has_stateful_acl && !od->has_lb_vip) { - continue; -@@ -5606,10 +5657,19 @@ build_acls(struct ovn_datapath *od, struct hmap *lflows, - bool has_stateful = od->has_stateful_acl || od->has_lb_vip; - - /* Ingress and Egress ACL Table (Priority 0): Packets are allowed by -- * default. A related rule at priority 1 is added below if there -+ * default. If the logical switch has no ACLs or no load balancers, -+ * then add 65535-priority flow to advance the packet to next -+ * stage. -+ * -+ * A related rule at priority 1 is added below if there - * are any stateful ACLs in this datapath. 
*/ -- ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, 0, "1", "next;"); -- ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, 0, "1", "next;"); -+ if (!od->has_acls && !od->has_lb_vip) { -+ ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, UINT16_MAX, "1", "next;"); -+ ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, UINT16_MAX, "1", "next;"); -+ } else { -+ ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, 0, "1", "next;"); -+ ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, 0, "1", "next;"); -+ } - - if (has_stateful) { - /* Ingress and Egress ACL Table (Priority 1). -@@ -5640,21 +5700,23 @@ build_acls(struct ovn_datapath *od, struct hmap *lflows, - "ip && (!ct.est || (ct.est && ct_label.blocked == 1))", - REGBIT_CONNTRACK_COMMIT" = 1; next;"); - -- /* Ingress and Egress ACL Table (Priority 65535). -+ /* Ingress and Egress ACL Table (Priority 65532). - * - * Always drop traffic that's in an invalid state. Also drop - * reply direction packets for connections that have been marked - * for deletion (bit 0 of ct_label is set). - * - * This is enforced at a higher priority than ACLs can be defined. */ -- ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, UINT16_MAX, -- "ct.inv || (ct.est && ct.rpl && ct_label.blocked == 1)", -- "drop;"); -- ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, UINT16_MAX, -- "ct.inv || (ct.est && ct.rpl && ct_label.blocked == 1)", -- "drop;"); -+ char *match = -+ xasprintf("%s(ct.est && ct.rpl && ct_label.blocked == 1)", -+ use_ct_inv_match ? "ct.inv || " : ""); -+ ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, UINT16_MAX - 3, -+ match, "drop;"); -+ ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, UINT16_MAX - 3, -+ match, "drop;"); -+ free(match); - -- /* Ingress and Egress ACL Table (Priority 65535). -+ /* Ingress and Egress ACL Table (Priority 65535 - 3). 
- * - * Allow reply traffic that is part of an established - * conntrack entry that has not been marked for deletion -@@ -5663,14 +5725,15 @@ build_acls(struct ovn_datapath *od, struct hmap *lflows, - * direction to hit the currently defined policy from ACLs. - * - * This is enforced at a higher priority than ACLs can be defined. */ -- ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, UINT16_MAX, -- "ct.est && !ct.rel && !ct.new && !ct.inv " -- "&& ct.rpl && ct_label.blocked == 0", -- "next;"); -- ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, UINT16_MAX, -- "ct.est && !ct.rel && !ct.new && !ct.inv " -- "&& ct.rpl && ct_label.blocked == 0", -- "next;"); -+ match = xasprintf("ct.est && !ct.rel && !ct.new%s && " -+ "ct.rpl && ct_label.blocked == 0", -+ use_ct_inv_match ? " && !ct.inv" : ""); -+ -+ ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, UINT16_MAX - 3, -+ match, "next;"); -+ ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, UINT16_MAX - 3, -+ match, "next;"); -+ free(match); - - /* Ingress and Egress ACL Table (Priority 65535). - * -@@ -5683,21 +5746,21 @@ build_acls(struct ovn_datapath *od, struct hmap *lflows, - * a dynamically negotiated FTP data channel), but will allow - * related traffic such as an ICMP Port Unreachable through - * that's generated from a non-listening UDP port. */ -- ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, UINT16_MAX, -- "!ct.est && ct.rel && !ct.new && !ct.inv " -- "&& ct_label.blocked == 0", -- "next;"); -- ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, UINT16_MAX, -- "!ct.est && ct.rel && !ct.new && !ct.inv " -- "&& ct_label.blocked == 0", -- "next;"); -+ match = xasprintf("!ct.est && ct.rel && !ct.new%s && " -+ "ct_label.blocked == 0", -+ use_ct_inv_match ? " && !ct.inv" : ""); -+ ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, UINT16_MAX - 3, -+ match, "next;"); -+ ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, UINT16_MAX - 3, -+ match, "next;"); -+ free(match); - -- /* Ingress and Egress ACL Table (Priority 65535). 
-+ /* Ingress and Egress ACL Table (Priority 65532). - * - * Not to do conntrack on ND packets. */ -- ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, UINT16_MAX, -+ ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, UINT16_MAX - 3, - "nd || nd_ra || nd_rs || mldv1 || mldv2", "next;"); -- ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, UINT16_MAX, -+ ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, UINT16_MAX - 3, - "nd || nd_ra || nd_rs || mldv1 || mldv2", "next;"); - } - -@@ -5784,15 +5847,18 @@ build_acls(struct ovn_datapath *od, struct hmap *lflows, - actions); - } - -- /* Add a 34000 priority flow to advance the service monitor reply -- * packets to skip applying ingress ACLs. */ -- ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, 34000, -- "eth.dst == $svc_monitor_mac", "next;"); - -- /* Add a 34000 priority flow to advance the service monitor packets -- * generated by ovn-controller to skip applying egress ACLs. */ -- ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, 34000, -- "eth.src == $svc_monitor_mac", "next;"); -+ if (od->has_acls || od->has_lb_vip) { -+ /* Add a 34000 priority flow to advance the service monitor reply -+ * packets to skip applying ingress ACLs. */ -+ ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, 34000, -+ "eth.dst == $svc_monitor_mac", "next;"); -+ -+ /* Add a 34000 priority flow to advance the service monitor packets -+ * generated by ovn-controller to skip applying egress ACLs. */ -+ ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, 34000, -+ "eth.src == $svc_monitor_mac", "next;"); -+ } - } - - static void -@@ -5856,37 +5922,6 @@ build_qos(struct ovn_datapath *od, struct hmap *lflows) { - } - } - --static void --build_lb(struct ovn_datapath *od, struct hmap *lflows) --{ -- /* Ingress and Egress LB Table (Priority 0): Packets are allowed by -- * default. 
*/ -- ovn_lflow_add(lflows, od, S_SWITCH_IN_LB, 0, "1", "next;"); -- ovn_lflow_add(lflows, od, S_SWITCH_OUT_LB, 0, "1", "next;"); -- -- if (od->nbs->n_load_balancer) { -- for (size_t i = 0; i < od->n_router_ports; i++) { -- skip_port_from_conntrack(od, od->router_ports[i], -- S_SWITCH_IN_LB, S_SWITCH_OUT_LB, -- UINT16_MAX, lflows); -- } -- } -- -- if (od->has_lb_vip) { -- /* Ingress and Egress LB Table (Priority 65534). -- * -- * Send established traffic through conntrack for just NAT. */ -- ovn_lflow_add(lflows, od, S_SWITCH_IN_LB, UINT16_MAX - 1, -- "ct.est && !ct.rel && !ct.new && !ct.inv && " -- "ct_label.natted == 1", -- REGBIT_CONNTRACK_NAT" = 1; next;"); -- ovn_lflow_add(lflows, od, S_SWITCH_OUT_LB, UINT16_MAX - 1, -- "ct.est && !ct.rel && !ct.new && !ct.inv && " -- "ct_label.natted == 1", -- REGBIT_CONNTRACK_NAT" = 1; next;"); -- } --} -- - static void - build_lb_rules(struct ovn_datapath *od, struct hmap *lflows, - struct ovn_northd_lb *lb) -@@ -5971,48 +6006,6 @@ build_stateful(struct ovn_datapath *od, struct hmap *lflows, struct hmap *lbs) - REGBIT_CONNTRACK_COMMIT" == 1", - "ct_commit { ct_label.blocked = 0; }; next;"); - -- /* If REGBIT_CONNTRACK_NAT is set as 1, then packets should just be sent -- * through nat (without committing). -- * -- * REGBIT_CONNTRACK_COMMIT is set for new connections and -- * REGBIT_CONNTRACK_NAT is set for established connections. So they -- * don't overlap. -- * -- * In the ingress pipeline, also store the original destination IP and -- * transport port to be used when detecting hairpin packets. 
-- */ -- const char *lb_protocols[] = {"tcp", "udp", "sctp"}; -- struct ds actions = DS_EMPTY_INITIALIZER; -- struct ds match = DS_EMPTY_INITIALIZER; -- -- for (size_t i = 0; i < ARRAY_SIZE(lb_protocols); i++) { -- ds_clear(&match); -- ds_clear(&actions); -- ds_put_format(&match, REGBIT_CONNTRACK_NAT" == 1 && ip4 && %s", -- lb_protocols[i]); -- ds_put_format(&actions, REG_ORIG_DIP_IPV4 " = ip4.dst; " -- REG_ORIG_TP_DPORT " = %s.dst; ct_lb;", -- lb_protocols[i]); -- ovn_lflow_add(lflows, od, S_SWITCH_IN_STATEFUL, 100, -- ds_cstr(&match), ds_cstr(&actions)); -- -- ds_clear(&match); -- ds_clear(&actions); -- ds_put_format(&match, REGBIT_CONNTRACK_NAT" == 1 && ip6 && %s", -- lb_protocols[i]); -- ds_put_format(&actions, REG_ORIG_DIP_IPV6 " = ip6.dst; " -- REG_ORIG_TP_DPORT " = %s.dst; ct_lb;", -- lb_protocols[i]); -- ovn_lflow_add(lflows, od, S_SWITCH_IN_STATEFUL, 100, -- ds_cstr(&match), ds_cstr(&actions)); -- } -- -- ds_destroy(&actions); -- ds_destroy(&match); -- -- ovn_lflow_add(lflows, od, S_SWITCH_OUT_STATEFUL, 100, -- REGBIT_CONNTRACK_NAT" == 1", "ct_lb;"); -- - /* Load balancing rules for new connections get committed to conntrack - * table. So even if REGBIT_CONNTRACK_COMMIT is set in a previous table - * a higher priority rule for load balancing below also commits the -@@ -6759,7 +6752,7 @@ build_lswitch_flows(struct hmap *datapaths, struct hmap *lflows) - struct ds actions = DS_EMPTY_INITIALIZER; - struct ovn_datapath *od; - -- /* Ingress table 24: Destination lookup for unknown MACs (priority 0). */ -+ /* Ingress table 23: Destination lookup for unknown MACs (priority 0). 
*/ - HMAP_FOR_EACH (od, key_node, datapaths) { - if (!od->nbs) { - continue; -@@ -6794,8 +6787,8 @@ build_lswitch_lflows_pre_acl_and_acl(struct ovn_datapath *od, - struct hmap *lbs) - { - if (od->nbs) { -- od->has_stateful_acl = ls_has_stateful_acl(od); - od->has_lb_vip = ls_has_lb_vip(od); -+ ls_get_acl_flags(od); - - build_pre_acls(od, lflows); - build_pre_lb(od, lflows, meter_groups, lbs); -@@ -6803,7 +6796,6 @@ build_lswitch_lflows_pre_acl_and_acl(struct ovn_datapath *od, - build_acl_hints(od, lflows); - build_acls(od, lflows, port_groups, meter_groups); - build_qos(od, lflows); -- build_lb(od, lflows); - build_stateful(od, lflows, lbs); - build_lb_hairpin(od, lflows); - } -@@ -8573,10 +8565,16 @@ get_force_snat_ip(struct ovn_datapath *od, const char *key_type, - return true; - } - -+enum lb_snat_type { -+ NO_FORCE_SNAT, -+ FORCE_SNAT, -+ SKIP_SNAT, -+}; -+ - static void - add_router_lb_flow(struct hmap *lflows, struct ovn_datapath *od, - struct ds *match, struct ds *actions, int priority, -- bool force_snat_for_lb, struct ovn_lb_vip *lb_vip, -+ enum lb_snat_type snat_type, struct ovn_lb_vip *lb_vip, - const char *proto, struct nbrec_load_balancer *lb, - struct shash *meter_groups, struct sset *nat_entries) - { -@@ -8585,9 +8583,10 @@ add_router_lb_flow(struct hmap *lflows, struct ovn_datapath *od, - - /* A match and actions for new connections. */ - char *new_match = xasprintf("ct.new && %s", ds_cstr(match)); -- if (force_snat_for_lb) { -- char *new_actions = xasprintf("flags.force_snat_for_lb = 1; %s", -- ds_cstr(actions)); -+ if (snat_type == FORCE_SNAT || snat_type == SKIP_SNAT) { -+ char *new_actions = xasprintf("flags.%s_snat_for_lb = 1; %s", -+ snat_type == SKIP_SNAT ? 
"skip" : "force", -+ ds_cstr(actions)); - ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DNAT, priority, - new_match, new_actions, &lb->header_); - free(new_actions); -@@ -8598,11 +8597,12 @@ add_router_lb_flow(struct hmap *lflows, struct ovn_datapath *od, - - /* A match and actions for established connections. */ - char *est_match = xasprintf("ct.est && %s", ds_cstr(match)); -- if (force_snat_for_lb) { -+ if (snat_type == FORCE_SNAT || snat_type == SKIP_SNAT) { -+ char *est_actions = xasprintf("flags.%s_snat_for_lb = 1; ct_dnat;", -+ snat_type == SKIP_SNAT ? "skip" : "force"); - ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DNAT, priority, -- est_match, -- "flags.force_snat_for_lb = 1; ct_dnat;", -- &lb->header_); -+ est_match, est_actions, &lb->header_); -+ free(est_actions); - } else { - ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DNAT, priority, - est_match, "ct_dnat;", &lb->header_); -@@ -8675,11 +8675,13 @@ add_router_lb_flow(struct hmap *lflows, struct ovn_datapath *od, - ds_put_format(&undnat_match, ") && outport == %s && " - "is_chassis_resident(%s)", od->l3dgw_port->json_key, - od->l3redirect_port->json_key); -- if (force_snat_for_lb) { -+ if (snat_type == FORCE_SNAT || snat_type == SKIP_SNAT) { -+ char *action = xasprintf("flags.%s_snat_for_lb = 1; ct_dnat;", -+ snat_type == SKIP_SNAT ? 
"skip" : "force"); - ovn_lflow_add_with_hint(lflows, od, S_ROUTER_OUT_UNDNAT, 120, -- ds_cstr(&undnat_match), -- "flags.force_snat_for_lb = 1; ct_dnat;", -+ ds_cstr(&undnat_match), action, - &lb->header_); -+ free(action); - } else { - ovn_lflow_add_with_hint(lflows, od, S_ROUTER_OUT_UNDNAT, 120, - ds_cstr(&undnat_match), "ct_dnat;", -@@ -8689,6 +8691,105 @@ add_router_lb_flow(struct hmap *lflows, struct ovn_datapath *od, - ds_destroy(&undnat_match); - } - -+static void -+build_lrouter_lb_flows(struct hmap *lflows, struct ovn_datapath *od, -+ struct hmap *lbs, struct shash *meter_groups, -+ struct sset *nat_entries, struct ds *match, -+ struct ds *actions) -+{ -+ /* A set to hold all ips that need defragmentation and tracking. */ -+ struct sset all_ips = SSET_INITIALIZER(&all_ips); -+ bool lb_force_snat_ip = -+ !lport_addresses_is_empty(&od->lb_force_snat_addrs); -+ -+ for (int i = 0; i < od->nbr->n_load_balancer; i++) { -+ struct nbrec_load_balancer *nb_lb = od->nbr->load_balancer[i]; -+ struct ovn_northd_lb *lb = -+ ovn_northd_lb_find(lbs, &nb_lb->header_.uuid); -+ ovs_assert(lb); -+ -+ bool lb_skip_snat = smap_get_bool(&nb_lb->options, "skip_snat", false); -+ if (lb_skip_snat) { -+ ovn_lflow_add(lflows, od, S_ROUTER_OUT_SNAT, 120, -+ "flags.skip_snat_for_lb == 1 && ip", "next;"); -+ } -+ -+ for (size_t j = 0; j < lb->n_vips; j++) { -+ struct ovn_lb_vip *lb_vip = &lb->vips[j]; -+ struct ovn_northd_lb_vip *lb_vip_nb = &lb->vips_nb[j]; -+ ds_clear(actions); -+ build_lb_vip_actions(lb_vip, lb_vip_nb, actions, -+ lb->selection_fields, false); -+ -+ if (!sset_contains(&all_ips, lb_vip->vip_str)) { -+ sset_add(&all_ips, lb_vip->vip_str); -+ /* If there are any load balancing rules, we should send -+ * the packet to conntrack for defragmentation and -+ * tracking. This helps with two things. -+ * -+ * 1. With tracking, we can send only new connections to -+ * pick a DNAT ip address from a group. -+ * 2. 
If there are L4 ports in load balancing rules, we -+ * need the defragmentation to match on L4 ports. */ -+ ds_clear(match); -+ if (IN6_IS_ADDR_V4MAPPED(&lb_vip->vip)) { -+ ds_put_format(match, "ip && ip4.dst == %s", -+ lb_vip->vip_str); -+ } else { -+ ds_put_format(match, "ip && ip6.dst == %s", -+ lb_vip->vip_str); -+ } -+ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DEFRAG, -+ 100, ds_cstr(match), "ct_next;", -+ &nb_lb->header_); -+ } -+ -+ /* Higher priority rules are added for load-balancing in DNAT -+ * table. For every match (on a VIP[:port]), we add two flows -+ * via add_router_lb_flow(). One flow is for specific matching -+ * on ct.new with an action of "ct_lb($targets);". The other -+ * flow is for ct.est with an action of "ct_dnat;". */ -+ ds_clear(match); -+ if (IN6_IS_ADDR_V4MAPPED(&lb_vip->vip)) { -+ ds_put_format(match, "ip && ip4.dst == %s", -+ lb_vip->vip_str); -+ } else { -+ ds_put_format(match, "ip && ip6.dst == %s", -+ lb_vip->vip_str); -+ } -+ -+ int prio = 110; -+ bool is_udp = nullable_string_is_equal(nb_lb->protocol, "udp"); -+ bool is_sctp = nullable_string_is_equal(nb_lb->protocol, -+ "sctp"); -+ const char *proto = is_udp ? "udp" : is_sctp ? 
"sctp" : "tcp"; -+ -+ if (lb_vip->vip_port) { -+ ds_put_format(match, " && %s && %s.dst == %d", proto, -+ proto, lb_vip->vip_port); -+ prio = 120; -+ } -+ -+ if (od->l3redirect_port && -+ (lb_vip->n_backends || !lb_vip->empty_backend_rej)) { -+ ds_put_format(match, " && is_chassis_resident(%s)", -+ od->l3redirect_port->json_key); -+ } -+ -+ enum lb_snat_type snat_type = NO_FORCE_SNAT; -+ if (lb_skip_snat) { -+ snat_type = SKIP_SNAT; -+ } else if (lb_force_snat_ip || od->lb_force_snat_router_ip) { -+ snat_type = FORCE_SNAT; -+ } -+ add_router_lb_flow(lflows, od, match, actions, prio, -+ snat_type, lb_vip, proto, nb_lb, -+ meter_groups, nat_entries); -+ } -+ } -+ sset_destroy(&all_ips); -+} -+ - #define ND_RA_MAX_INTERVAL_MAX 1800 - #define ND_RA_MAX_INTERVAL_MIN 4 - -@@ -11002,668 +11103,643 @@ build_lrouter_ipv4_ip_input(struct ovn_port *op, - } - } - --/* NAT, Defrag and load balancing. */ - static void --build_lrouter_nat_defrag_and_lb(struct ovn_datapath *od, -- struct hmap *lflows, -- struct shash *meter_groups, -- struct hmap *lbs, -- struct ds *match, struct ds *actions) -+build_lrouter_in_unsnat_flow(struct hmap *lflows, struct ovn_datapath *od, -+ const struct nbrec_nat *nat, struct ds *match, -+ struct ds *actions, bool distributed, bool is_v6) - { -- if (od->nbr) { -+ /* Ingress UNSNAT table: It is for already established connections' -+ * reverse traffic. i.e., SNAT has already been done in egress -+ * pipeline and now the packet has entered the ingress pipeline as -+ * part of a reply. We undo the SNAT here. -+ * -+ * Undoing SNAT has to happen before DNAT processing. This is -+ * because when the packet was DNATed in ingress pipeline, it did -+ * not know about the possibility of eventual additional SNAT in -+ * egress pipeline. */ -+ if (strcmp(nat->type, "snat") && strcmp(nat->type, "dnat_and_snat")) { -+ return; -+ } - -- /* Packets are allowed by default. 
*/ -- ovn_lflow_add(lflows, od, S_ROUTER_IN_DEFRAG, 0, "1", "next;"); -- ovn_lflow_add(lflows, od, S_ROUTER_IN_UNSNAT, 0, "1", "next;"); -- ovn_lflow_add(lflows, od, S_ROUTER_OUT_SNAT, 0, "1", "next;"); -- ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 0, "1", "next;"); -- ovn_lflow_add(lflows, od, S_ROUTER_OUT_UNDNAT, 0, "1", "next;"); -- ovn_lflow_add(lflows, od, S_ROUTER_OUT_EGR_LOOP, 0, "1", "next;"); -- ovn_lflow_add(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 0, "1", "next;"); -- -- /* Send the IPv6 NS packets to next table. When ovn-controller -- * generates IPv6 NS (for the action - nd_ns{}), the injected -- * packet would go through conntrack - which is not required. */ -- ovn_lflow_add(lflows, od, S_ROUTER_OUT_SNAT, 120, "nd_ns", "next;"); -- -- /* NAT rules are only valid on Gateway routers and routers with -- * l3dgw_port (router has a port with gateway chassis -- * specified). */ -- if (!smap_get(&od->nbr->options, "chassis") && !od->l3dgw_port) { -- return; -+ bool stateless = lrouter_nat_is_stateless(nat); -+ if (!od->l3dgw_port) { -+ /* Gateway router. */ -+ ds_clear(match); -+ ds_clear(actions); -+ ds_put_format(match, "ip && ip%s.dst == %s", -+ is_v6 ? "6" : "4", nat->external_ip); -+ if (!strcmp(nat->type, "dnat_and_snat") && stateless) { -+ ds_put_format(actions, "ip%s.dst=%s; next;", -+ is_v6 ? "6" : "4", nat->logical_ip); -+ } else { -+ ds_put_cstr(actions, "ct_snat;"); - } - -- struct sset nat_entries = SSET_INITIALIZER(&nat_entries); -+ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_UNSNAT, -+ 90, ds_cstr(match), ds_cstr(actions), -+ &nat->header_); -+ } else { -+ /* Distributed router. */ - -- bool dnat_force_snat_ip = -- !lport_addresses_is_empty(&od->dnat_force_snat_addrs); -- bool lb_force_snat_ip = -- !lport_addresses_is_empty(&od->lb_force_snat_addrs); -+ /* Traffic received on l3dgw_port is subject to NAT. */ -+ ds_clear(match); -+ ds_clear(actions); -+ ds_put_format(match, "ip && ip%s.dst == %s && inport == %s", -+ is_v6 ? 
"6" : "4", nat->external_ip, -+ od->l3dgw_port->json_key); -+ if (!distributed && od->l3redirect_port) { -+ /* Flows for NAT rules that are centralized are only -+ * programmed on the gateway chassis. */ -+ ds_put_format(match, " && is_chassis_resident(%s)", -+ od->l3redirect_port->json_key); -+ } - -- for (int i = 0; i < od->nbr->n_nat; i++) { -- const struct nbrec_nat *nat; -+ if (!strcmp(nat->type, "dnat_and_snat") && stateless) { -+ ds_put_format(actions, "ip%s.dst=%s; next;", -+ is_v6 ? "6" : "4", nat->logical_ip); -+ } else { -+ ds_put_cstr(actions, "ct_snat;"); -+ } - -- nat = od->nbr->nat[i]; -+ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_UNSNAT, -+ 100, ds_cstr(match), ds_cstr(actions), -+ &nat->header_); -+ } -+} - -- ovs_be32 ip, mask; -- struct in6_addr ipv6, mask_v6, v6_exact = IN6ADDR_EXACT_INIT; -- bool is_v6 = false; -- bool stateless = lrouter_nat_is_stateless(nat); -- struct nbrec_address_set *allowed_ext_ips = -- nat->allowed_ext_ips; -- struct nbrec_address_set *exempted_ext_ips = -- nat->exempted_ext_ips; -+static void -+build_lrouter_in_dnat_flow(struct hmap *lflows, struct ovn_datapath *od, -+ const struct nbrec_nat *nat, struct ds *match, -+ struct ds *actions, bool distributed, -+ ovs_be32 mask, bool is_v6) -+{ -+ /* Ingress DNAT table: Packets enter the pipeline with destination -+ * IP address that needs to be DNATted from a external IP address -+ * to a logical IP address. */ -+ if (!strcmp(nat->type, "dnat") || !strcmp(nat->type, "dnat_and_snat")) { -+ bool stateless = lrouter_nat_is_stateless(nat); - -- if (allowed_ext_ips && exempted_ext_ips) { -- static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1); -- VLOG_WARN_RL(&rl, "NAT rule: "UUID_FMT" not applied, since " -- "both allowed and exempt external ips set", -- UUID_ARGS(&(nat->header_.uuid))); -- continue; -+ if (!od->l3dgw_port) { -+ /* Gateway router. */ -+ /* Packet when it goes from the initiator to destination. 
-+ * We need to set flags.loopback because the router can -+ * send the packet back through the same interface. */ -+ ds_clear(match); -+ ds_put_format(match, "ip && ip%s.dst == %s", -+ is_v6 ? "6" : "4", nat->external_ip); -+ ds_clear(actions); -+ if (nat->allowed_ext_ips || nat->exempted_ext_ips) { -+ lrouter_nat_add_ext_ip_match(od, lflows, match, nat, -+ is_v6, true, mask); - } - -- char *error = ip_parse_masked(nat->external_ip, &ip, &mask); -- if (error || mask != OVS_BE32_MAX) { -- free(error); -- error = ipv6_parse_masked(nat->external_ip, &ipv6, &mask_v6); -- if (error || memcmp(&mask_v6, &v6_exact, sizeof(mask_v6))) { -- /* Invalid for both IPv4 and IPv6 */ -- static struct vlog_rate_limit rl = -- VLOG_RATE_LIMIT_INIT(5, 1); -- VLOG_WARN_RL(&rl, "bad external ip %s for nat", -- nat->external_ip); -- free(error); -- continue; -- } -- /* It was an invalid IPv4 address, but valid IPv6. -- * Treat the rest of the handling of this NAT rule -- * as IPv6. */ -- is_v6 = true; -- } -- -- /* Check the validity of nat->logical_ip. 'logical_ip' can -- * be a subnet when the type is "snat". */ -- int cidr_bits; -- if (is_v6) { -- error = ipv6_parse_masked(nat->logical_ip, &ipv6, &mask_v6); -- cidr_bits = ipv6_count_cidr_bits(&mask_v6); -- } else { -- error = ip_parse_masked(nat->logical_ip, &ip, &mask); -- cidr_bits = ip_count_cidr_bits(mask); -+ if (!lport_addresses_is_empty(&od->dnat_force_snat_addrs)) { -+ /* Indicate to the future tables that a DNAT has taken -+ * place and a force SNAT needs to be done in the -+ * Egress SNAT table. 
*/ -+ ds_put_format(actions, "flags.force_snat_for_dnat = 1; "); - } -- if (!strcmp(nat->type, "snat")) { -- if (error) { -- /* Invalid for both IPv4 and IPv6 */ -- static struct vlog_rate_limit rl = -- VLOG_RATE_LIMIT_INIT(5, 1); -- VLOG_WARN_RL(&rl, "bad ip network or ip %s for snat " -- "in router "UUID_FMT"", -- nat->logical_ip, UUID_ARGS(&od->key)); -- free(error); -- continue; -- } -+ -+ if (!strcmp(nat->type, "dnat_and_snat") && stateless) { -+ ds_put_format(actions, "flags.loopback = 1; " -+ "ip%s.dst=%s; next;", -+ is_v6 ? "6" : "4", nat->logical_ip); - } else { -- if (error || (!is_v6 && mask != OVS_BE32_MAX) -- || (is_v6 && memcmp(&mask_v6, &v6_exact, -- sizeof mask_v6))) { -- /* Invalid for both IPv4 and IPv6 */ -- static struct vlog_rate_limit rl = -- VLOG_RATE_LIMIT_INIT(5, 1); -- VLOG_WARN_RL(&rl, "bad ip %s for dnat in router " -- ""UUID_FMT"", nat->logical_ip, UUID_ARGS(&od->key)); -- free(error); -- continue; -+ ds_put_format(actions, "flags.loopback = 1; ct_dnat(%s", -+ nat->logical_ip); -+ -+ if (nat->external_port_range[0]) { -+ ds_put_format(actions, ",%s", nat->external_port_range); - } -+ ds_put_format(actions, ");"); - } - -- /* For distributed router NAT, determine whether this NAT rule -- * satisfies the conditions for distributed NAT processing. */ -- bool distributed = false; -- struct eth_addr mac; -- if (od->l3dgw_port && !strcmp(nat->type, "dnat_and_snat") && -- nat->logical_port && nat->external_mac) { -- if (eth_addr_from_string(nat->external_mac, &mac)) { -- distributed = true; -- } else { -- static struct vlog_rate_limit rl = -- VLOG_RATE_LIMIT_INIT(5, 1); -- VLOG_WARN_RL(&rl, "bad mac %s for dnat in router " -- ""UUID_FMT"", nat->external_mac, UUID_ARGS(&od->key)); -- continue; -+ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DNAT, 100, -+ ds_cstr(match), ds_cstr(actions), -+ &nat->header_); -+ } else { -+ /* Distributed router. */ -+ -+ /* Traffic received on l3dgw_port is subject to NAT. 
*/ -+ ds_clear(match); -+ ds_put_format(match, "ip && ip%s.dst == %s && inport == %s", -+ is_v6 ? "6" : "4", nat->external_ip, -+ od->l3dgw_port->json_key); -+ if (!distributed && od->l3redirect_port) { -+ /* Flows for NAT rules that are centralized are only -+ * programmed on the gateway chassis. */ -+ ds_put_format(match, " && is_chassis_resident(%s)", -+ od->l3redirect_port->json_key); -+ } -+ ds_clear(actions); -+ if (nat->allowed_ext_ips || nat->exempted_ext_ips) { -+ lrouter_nat_add_ext_ip_match(od, lflows, match, nat, -+ is_v6, true, mask); -+ } -+ -+ if (!strcmp(nat->type, "dnat_and_snat") && stateless) { -+ ds_put_format(actions, "ip%s.dst=%s; next;", -+ is_v6 ? "6" : "4", nat->logical_ip); -+ } else { -+ ds_put_format(actions, "ct_dnat(%s", nat->logical_ip); -+ if (nat->external_port_range[0]) { -+ ds_put_format(actions, ",%s", nat->external_port_range); - } -+ ds_put_format(actions, ");"); - } - -- /* Ingress UNSNAT table: It is for already established connections' -- * reverse traffic. i.e., SNAT has already been done in egress -- * pipeline and now the packet has entered the ingress pipeline as -- * part of a reply. We undo the SNAT here. -- * -- * Undoing SNAT has to happen before DNAT processing. This is -- * because when the packet was DNATed in ingress pipeline, it did -- * not know about the possibility of eventual additional SNAT in -- * egress pipeline. */ -- if (!strcmp(nat->type, "snat") -- || !strcmp(nat->type, "dnat_and_snat")) { -- if (!od->l3dgw_port) { -- /* Gateway router. */ -- ds_clear(match); -- ds_clear(actions); -- ds_put_format(match, "ip && ip%s.dst == %s", -- is_v6 ? "6" : "4", -- nat->external_ip); -- if (!strcmp(nat->type, "dnat_and_snat") && stateless) { -- ds_put_format(actions, "ip%s.dst=%s; next;", -- is_v6 ? 
"6" : "4", nat->logical_ip); -- } else { -- ds_put_cstr(actions, "ct_snat;"); -- } -+ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DNAT, 100, -+ ds_cstr(match), ds_cstr(actions), -+ &nat->header_); -+ } -+ } -+} - -- ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_UNSNAT, -- 90, ds_cstr(match), -- ds_cstr(actions), -- &nat->header_); -- } else { -- /* Distributed router. */ -+static void -+build_lrouter_out_undnat_flow(struct hmap *lflows, struct ovn_datapath *od, -+ const struct nbrec_nat *nat, struct ds *match, -+ struct ds *actions, bool distributed, -+ struct eth_addr mac, bool is_v6) -+{ -+ /* Egress UNDNAT table: It is for already established connections' -+ * reverse traffic. i.e., DNAT has already been done in ingress -+ * pipeline and now the packet has entered the egress pipeline as -+ * part of a reply. We undo the DNAT here. -+ * -+ * Note that this only applies for NAT on a distributed router. -+ * Undo DNAT on a gateway router is done in the ingress DNAT -+ * pipeline stage. */ -+ if (!od->l3dgw_port || -+ (strcmp(nat->type, "dnat") && strcmp(nat->type, "dnat_and_snat"))) { -+ return; -+ } - -- /* Traffic received on l3dgw_port is subject to NAT. */ -- ds_clear(match); -- ds_clear(actions); -- ds_put_format(match, "ip && ip%s.dst == %s" -- " && inport == %s", -- is_v6 ? "6" : "4", -- nat->external_ip, -- od->l3dgw_port->json_key); -- if (!distributed && od->l3redirect_port) { -- /* Flows for NAT rules that are centralized are only -- * programmed on the gateway chassis. */ -- ds_put_format(match, " && is_chassis_resident(%s)", -- od->l3redirect_port->json_key); -- } -+ ds_clear(match); -+ ds_put_format(match, "ip && ip%s.src == %s && outport == %s", -+ is_v6 ? "6" : "4", nat->logical_ip, -+ od->l3dgw_port->json_key); -+ if (!distributed && od->l3redirect_port) { -+ /* Flows for NAT rules that are centralized are only -+ * programmed on the gateway chassis. 
*/ -+ ds_put_format(match, " && is_chassis_resident(%s)", -+ od->l3redirect_port->json_key); -+ } -+ ds_clear(actions); -+ if (distributed) { -+ ds_put_format(actions, "eth.src = "ETH_ADDR_FMT"; ", -+ ETH_ADDR_ARGS(mac)); -+ } - -- if (!strcmp(nat->type, "dnat_and_snat") && stateless) { -- ds_put_format(actions, "ip%s.dst=%s; next;", -- is_v6 ? "6" : "4", nat->logical_ip); -- } else { -- ds_put_cstr(actions, "ct_snat;"); -- } -+ if (!strcmp(nat->type, "dnat_and_snat") && -+ lrouter_nat_is_stateless(nat)) { -+ ds_put_format(actions, "ip%s.src=%s; next;", -+ is_v6 ? "6" : "4", nat->external_ip); -+ } else { -+ ds_put_format(actions, "ct_dnat;"); -+ } - -- ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_UNSNAT, -- 100, -- ds_cstr(match), ds_cstr(actions), -- &nat->header_); -- } -- } -+ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_OUT_UNDNAT, 100, -+ ds_cstr(match), ds_cstr(actions), -+ &nat->header_); -+} - -- /* Ingress DNAT table: Packets enter the pipeline with destination -- * IP address that needs to be DNATted from a external IP address -- * to a logical IP address. */ -- if (!strcmp(nat->type, "dnat") -- || !strcmp(nat->type, "dnat_and_snat")) { -- if (!od->l3dgw_port) { -- /* Gateway router. */ -- /* Packet when it goes from the initiator to destination. -- * We need to set flags.loopback because the router can -- * send the packet back through the same interface. */ -- ds_clear(match); -- ds_put_format(match, "ip && ip%s.dst == %s", -- is_v6 ? 
"6" : "4", -- nat->external_ip); -- ds_clear(actions); -- if (allowed_ext_ips || exempted_ext_ips) { -- lrouter_nat_add_ext_ip_match(od, lflows, match, nat, -- is_v6, true, mask); -- } -+static void -+build_lrouter_out_snat_flow(struct hmap *lflows, struct ovn_datapath *od, -+ const struct nbrec_nat *nat, struct ds *match, -+ struct ds *actions, bool distributed, -+ struct eth_addr mac, ovs_be32 mask, -+ int cidr_bits, bool is_v6) -+{ -+ /* Egress SNAT table: Packets enter the egress pipeline with -+ * source ip address that needs to be SNATted to a external ip -+ * address. */ -+ if (strcmp(nat->type, "snat") && strcmp(nat->type, "dnat_and_snat")) { -+ return; -+ } - -- if (dnat_force_snat_ip) { -- /* Indicate to the future tables that a DNAT has taken -- * place and a force SNAT needs to be done in the -- * Egress SNAT table. */ -- ds_put_format(actions, -- "flags.force_snat_for_dnat = 1; "); -- } -+ bool stateless = lrouter_nat_is_stateless(nat); -+ if (!od->l3dgw_port) { -+ /* Gateway router. */ -+ ds_clear(match); -+ ds_put_format(match, "ip && ip%s.src == %s", -+ is_v6 ? "6" : "4", nat->logical_ip); -+ ds_clear(actions); - -- if (!strcmp(nat->type, "dnat_and_snat") && stateless) { -- ds_put_format(actions, "flags.loopback = 1; " -- "ip%s.dst=%s; next;", -- is_v6 ? "6" : "4", nat->logical_ip); -- } else { -- ds_put_format(actions, "flags.loopback = 1; " -- "ct_dnat(%s", nat->logical_ip); -+ if (nat->allowed_ext_ips || nat->exempted_ext_ips) { -+ lrouter_nat_add_ext_ip_match(od, lflows, match, nat, -+ is_v6, false, mask); -+ } - -- if (nat->external_port_range[0]) { -- ds_put_format(actions, ",%s", -- nat->external_port_range); -- } -- ds_put_format(actions, ");"); -- } -+ if (!strcmp(nat->type, "dnat_and_snat") && stateless) { -+ ds_put_format(actions, "ip%s.src=%s; next;", -+ is_v6 ? 
"6" : "4", nat->external_ip); -+ } else { -+ ds_put_format(actions, "ct_snat(%s", nat->external_ip); - -- ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DNAT, 100, -- ds_cstr(match), ds_cstr(actions), -- &nat->header_); -- } else { -- /* Distributed router. */ -+ if (nat->external_port_range[0]) { -+ ds_put_format(actions, ",%s", -+ nat->external_port_range); -+ } -+ ds_put_format(actions, ");"); -+ } - -- /* Traffic received on l3dgw_port is subject to NAT. */ -- ds_clear(match); -- ds_put_format(match, "ip && ip%s.dst == %s" -- " && inport == %s", -- is_v6 ? "6" : "4", -- nat->external_ip, -- od->l3dgw_port->json_key); -- if (!distributed && od->l3redirect_port) { -- /* Flows for NAT rules that are centralized are only -- * programmed on the gateway chassis. */ -- ds_put_format(match, " && is_chassis_resident(%s)", -- od->l3redirect_port->json_key); -- } -- ds_clear(actions); -- if (allowed_ext_ips || exempted_ext_ips) { -- lrouter_nat_add_ext_ip_match(od, lflows, match, nat, -- is_v6, true, mask); -- } -+ /* The priority here is calculated such that the -+ * nat->logical_ip with the longest mask gets a higher -+ * priority. */ -+ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_OUT_SNAT, -+ cidr_bits + 1, ds_cstr(match), -+ ds_cstr(actions), &nat->header_); -+ } else { -+ uint16_t priority = cidr_bits + 1; - -- if (!strcmp(nat->type, "dnat_and_snat") && stateless) { -- ds_put_format(actions, "ip%s.dst=%s; next;", -- is_v6 ? "6" : "4", nat->logical_ip); -- } else { -- ds_put_format(actions, "ct_dnat(%s", nat->logical_ip); -- if (nat->external_port_range[0]) { -- ds_put_format(actions, ",%s", -- nat->external_port_range); -- } -- ds_put_format(actions, ");"); -- } -+ /* Distributed router. */ -+ ds_clear(match); -+ ds_put_format(match, "ip && ip%s.src == %s && outport == %s", -+ is_v6 ? 
"6" : "4", nat->logical_ip, -+ od->l3dgw_port->json_key); -+ if (!distributed && od->l3redirect_port) { -+ /* Flows for NAT rules that are centralized are only -+ * programmed on the gateway chassis. */ -+ priority += 128; -+ ds_put_format(match, " && is_chassis_resident(%s)", -+ od->l3redirect_port->json_key); -+ } -+ ds_clear(actions); - -- ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DNAT, 100, -- ds_cstr(match), ds_cstr(actions), -- &nat->header_); -- } -- } -+ if (nat->allowed_ext_ips || nat->exempted_ext_ips) { -+ lrouter_nat_add_ext_ip_match(od, lflows, match, nat, -+ is_v6, false, mask); -+ } - -- /* ARP resolve for NAT IPs. */ -- if (od->l3dgw_port) { -- if (!strcmp(nat->type, "snat")) { -- ds_clear(match); -- ds_put_format( -- match, "inport == %s && %s == %s", -- od->l3dgw_port->json_key, -- is_v6 ? "ip6.src" : "ip4.src", nat->external_ip); -- ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_IP_INPUT, -- 120, ds_cstr(match), "next;", -- &nat->header_); -- } -+ if (distributed) { -+ ds_put_format(actions, "eth.src = "ETH_ADDR_FMT"; ", -+ ETH_ADDR_ARGS(mac)); -+ } - -- if (!sset_contains(&nat_entries, nat->external_ip)) { -- ds_clear(match); -- ds_put_format( -- match, "outport == %s && %s == %s", -- od->l3dgw_port->json_key, -- is_v6 ? REG_NEXT_HOP_IPV6 : REG_NEXT_HOP_IPV4, -+ if (!strcmp(nat->type, "dnat_and_snat") && stateless) { -+ ds_put_format(actions, "ip%s.src=%s; next;", -+ is_v6 ? "6" : "4", nat->external_ip); -+ } else { -+ ds_put_format(actions, "ct_snat(%s", - nat->external_ip); -- ds_clear(actions); -- ds_put_format( -- actions, "eth.dst = %s; next;", -- distributed ? nat->external_mac : -- od->l3dgw_port->lrp_networks.ea_s); -- ovn_lflow_add_with_hint(lflows, od, -- S_ROUTER_IN_ARP_RESOLVE, -- 100, ds_cstr(match), -- ds_cstr(actions), -- &nat->header_); -- sset_add(&nat_entries, nat->external_ip); -- } -- } else { -- /* Add the NAT external_ip to the nat_entries even for -- * gateway routers. 
This is required for adding load balancer -- * flows.*/ -- sset_add(&nat_entries, nat->external_ip); -+ if (nat->external_port_range[0]) { -+ ds_put_format(actions, ",%s", nat->external_port_range); - } -+ ds_put_format(actions, ");"); -+ } - -- /* Egress UNDNAT table: It is for already established connections' -- * reverse traffic. i.e., DNAT has already been done in ingress -- * pipeline and now the packet has entered the egress pipeline as -- * part of a reply. We undo the DNAT here. -- * -- * Note that this only applies for NAT on a distributed router. -- * Undo DNAT on a gateway router is done in the ingress DNAT -- * pipeline stage. */ -- if (od->l3dgw_port && (!strcmp(nat->type, "dnat") -- || !strcmp(nat->type, "dnat_and_snat"))) { -- ds_clear(match); -- ds_put_format(match, "ip && ip%s.src == %s" -- " && outport == %s", -- is_v6 ? "6" : "4", -- nat->logical_ip, -- od->l3dgw_port->json_key); -- if (!distributed && od->l3redirect_port) { -- /* Flows for NAT rules that are centralized are only -- * programmed on the gateway chassis. */ -- ds_put_format(match, " && is_chassis_resident(%s)", -- od->l3redirect_port->json_key); -- } -- ds_clear(actions); -- if (distributed) { -- ds_put_format(actions, "eth.src = "ETH_ADDR_FMT"; ", -- ETH_ADDR_ARGS(mac)); -- } -+ /* The priority here is calculated such that the -+ * nat->logical_ip with the longest mask gets a higher -+ * priority. */ -+ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_OUT_SNAT, -+ priority, ds_cstr(match), -+ ds_cstr(actions), &nat->header_); -+ } -+} - -- if (!strcmp(nat->type, "dnat_and_snat") && stateless) { -- ds_put_format(actions, "ip%s.src=%s; next;", -- is_v6 ? 
"6" : "4", nat->external_ip); -- } else { -- ds_put_format(actions, "ct_dnat;"); -- } -+static void -+build_lrouter_ingress_flow(struct hmap *lflows, struct ovn_datapath *od, -+ const struct nbrec_nat *nat, struct ds *match, -+ struct ds *actions, struct eth_addr mac, -+ bool distributed, bool is_v6) -+{ -+ if (od->l3dgw_port && !strcmp(nat->type, "snat")) { -+ ds_clear(match); -+ ds_put_format( -+ match, "inport == %s && %s == %s", -+ od->l3dgw_port->json_key, -+ is_v6 ? "ip6.src" : "ip4.src", nat->external_ip); -+ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_IP_INPUT, -+ 120, ds_cstr(match), "next;", -+ &nat->header_); -+ } -+ /* Logical router ingress table 0: -+ * For NAT on a distributed router, add rules allowing -+ * ingress traffic with eth.dst matching nat->external_mac -+ * on the l3dgw_port instance where nat->logical_port is -+ * resident. */ -+ if (distributed) { -+ /* Store the ethernet address of the port receiving the packet. -+ * This will save us from having to match on inport further -+ * down in the pipeline. -+ */ -+ ds_clear(actions); -+ ds_put_format(actions, REG_INPORT_ETH_ADDR " = %s; next;", -+ od->l3dgw_port->lrp_networks.ea_s); - -- ovn_lflow_add_with_hint(lflows, od, S_ROUTER_OUT_UNDNAT, 100, -- ds_cstr(match), ds_cstr(actions), -- &nat->header_); -- } -+ ds_clear(match); -+ ds_put_format(match, -+ "eth.dst == "ETH_ADDR_FMT" && inport == %s" -+ " && is_chassis_resident(\"%s\")", -+ ETH_ADDR_ARGS(mac), -+ od->l3dgw_port->json_key, -+ nat->logical_port); -+ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ADMISSION, 50, -+ ds_cstr(match), ds_cstr(actions), -+ &nat->header_); -+ } -+} - -- /* Egress SNAT table: Packets enter the egress pipeline with -- * source ip address that needs to be SNATted to a external ip -- * address. */ -- if (!strcmp(nat->type, "snat") -- || !strcmp(nat->type, "dnat_and_snat")) { -- if (!od->l3dgw_port) { -- /* Gateway router. */ -- ds_clear(match); -- ds_put_format(match, "ip && ip%s.src == %s", -- is_v6 ? 
"6" : "4", -- nat->logical_ip); -- ds_clear(actions); -+static int -+lrouter_check_nat_entry(struct ovn_datapath *od, const struct nbrec_nat *nat, -+ ovs_be32 *mask, bool *is_v6, int *cidr_bits, -+ struct eth_addr *mac, bool *distributed) -+{ -+ struct in6_addr ipv6, mask_v6, v6_exact = IN6ADDR_EXACT_INIT; -+ ovs_be32 ip; - -- if (allowed_ext_ips || exempted_ext_ips) { -- lrouter_nat_add_ext_ip_match(od, lflows, match, nat, -- is_v6, false, mask); -- } -+ if (nat->allowed_ext_ips && nat->exempted_ext_ips) { -+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1); -+ VLOG_WARN_RL(&rl, "NAT rule: "UUID_FMT" not applied, since " -+ "both allowed and exempt external ips set", -+ UUID_ARGS(&(nat->header_.uuid))); -+ return -EINVAL; -+ } - -- if (!strcmp(nat->type, "dnat_and_snat") && stateless) { -- ds_put_format(actions, "ip%s.src=%s; next;", -- is_v6 ? "6" : "4", nat->external_ip); -- } else { -- ds_put_format(actions, "ct_snat(%s", -- nat->external_ip); -+ char *error = ip_parse_masked(nat->external_ip, &ip, mask); -+ *is_v6 = false; - -- if (nat->external_port_range[0]) { -- ds_put_format(actions, ",%s", -- nat->external_port_range); -- } -- ds_put_format(actions, ");"); -- } -+ if (error || *mask != OVS_BE32_MAX) { -+ free(error); -+ error = ipv6_parse_masked(nat->external_ip, &ipv6, &mask_v6); -+ if (error || memcmp(&mask_v6, &v6_exact, sizeof(mask_v6))) { -+ /* Invalid for both IPv4 and IPv6 */ -+ static struct vlog_rate_limit rl = -+ VLOG_RATE_LIMIT_INIT(5, 1); -+ VLOG_WARN_RL(&rl, "bad external ip %s for nat", -+ nat->external_ip); -+ free(error); -+ return -EINVAL; -+ } -+ /* It was an invalid IPv4 address, but valid IPv6. -+ * Treat the rest of the handling of this NAT rule -+ * as IPv6. */ -+ *is_v6 = true; -+ } - -- /* The priority here is calculated such that the -- * nat->logical_ip with the longest mask gets a higher -- * priority. 
*/ -- ovn_lflow_add_with_hint(lflows, od, S_ROUTER_OUT_SNAT, -- cidr_bits + 1, -- ds_cstr(match), ds_cstr(actions), -- &nat->header_); -- } else { -- uint16_t priority = cidr_bits + 1; -+ /* Check the validity of nat->logical_ip. 'logical_ip' can -+ * be a subnet when the type is "snat". */ -+ if (*is_v6) { -+ error = ipv6_parse_masked(nat->logical_ip, &ipv6, &mask_v6); -+ *cidr_bits = ipv6_count_cidr_bits(&mask_v6); -+ } else { -+ error = ip_parse_masked(nat->logical_ip, &ip, mask); -+ *cidr_bits = ip_count_cidr_bits(*mask); -+ } -+ if (!strcmp(nat->type, "snat")) { -+ if (error) { -+ /* Invalid for both IPv4 and IPv6 */ -+ static struct vlog_rate_limit rl = -+ VLOG_RATE_LIMIT_INIT(5, 1); -+ VLOG_WARN_RL(&rl, "bad ip network or ip %s for snat " -+ "in router "UUID_FMT"", -+ nat->logical_ip, UUID_ARGS(&od->key)); -+ free(error); -+ return -EINVAL; -+ } -+ } else { -+ if (error || (*is_v6 == false && *mask != OVS_BE32_MAX) -+ || (*is_v6 && memcmp(&mask_v6, &v6_exact, -+ sizeof mask_v6))) { -+ /* Invalid for both IPv4 and IPv6 */ -+ static struct vlog_rate_limit rl = -+ VLOG_RATE_LIMIT_INIT(5, 1); -+ VLOG_WARN_RL(&rl, "bad ip %s for dnat in router " -+ ""UUID_FMT"", nat->logical_ip, UUID_ARGS(&od->key)); -+ free(error); -+ return -EINVAL; -+ } -+ } - -- /* Distributed router. */ -- ds_clear(match); -- ds_put_format(match, "ip && ip%s.src == %s" -- " && outport == %s", -- is_v6 ? "6" : "4", -- nat->logical_ip, -- od->l3dgw_port->json_key); -- if (!distributed && od->l3redirect_port) { -- /* Flows for NAT rules that are centralized are only -- * programmed on the gateway chassis. */ -- priority += 128; -- ds_put_format(match, " && is_chassis_resident(%s)", -- od->l3redirect_port->json_key); -- } -- ds_clear(actions); -+ /* For distributed router NAT, determine whether this NAT rule -+ * satisfies the conditions for distributed NAT processing. 
*/ -+ *distributed = false; -+ if (od->l3dgw_port && !strcmp(nat->type, "dnat_and_snat") && -+ nat->logical_port && nat->external_mac) { -+ if (eth_addr_from_string(nat->external_mac, mac)) { -+ *distributed = true; -+ } else { -+ static struct vlog_rate_limit rl = -+ VLOG_RATE_LIMIT_INIT(5, 1); -+ VLOG_WARN_RL(&rl, "bad mac %s for dnat in router " -+ ""UUID_FMT"", nat->external_mac, UUID_ARGS(&od->key)); -+ return -EINVAL; -+ } -+ } - -- if (allowed_ext_ips || exempted_ext_ips) { -- lrouter_nat_add_ext_ip_match(od, lflows, match, nat, -- is_v6, false, mask); -- } -+ return 0; -+} - -- if (distributed) { -- ds_put_format(actions, "eth.src = "ETH_ADDR_FMT"; ", -- ETH_ADDR_ARGS(mac)); -- } -+/* NAT, Defrag and load balancing. */ -+static void -+build_lrouter_nat_defrag_and_lb(struct ovn_datapath *od, -+ struct hmap *lflows, -+ struct shash *meter_groups, -+ struct hmap *lbs, -+ struct ds *match, struct ds *actions) -+{ -+ if (!od->nbr) { -+ return; -+ } - -- if (!strcmp(nat->type, "dnat_and_snat") && stateless) { -- ds_put_format(actions, "ip%s.src=%s; next;", -- is_v6 ? "6" : "4", nat->external_ip); -- } else { -- ds_put_format(actions, "ct_snat(%s", -- nat->external_ip); -- if (nat->external_port_range[0]) { -- ds_put_format(actions, ",%s", -- nat->external_port_range); -- } -- ds_put_format(actions, ");"); -- } -+ /* Packets are allowed by default. */ -+ ovn_lflow_add(lflows, od, S_ROUTER_IN_DEFRAG, 0, "1", "next;"); -+ ovn_lflow_add(lflows, od, S_ROUTER_IN_UNSNAT, 0, "1", "next;"); -+ ovn_lflow_add(lflows, od, S_ROUTER_OUT_SNAT, 0, "1", "next;"); -+ ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 0, "1", "next;"); -+ ovn_lflow_add(lflows, od, S_ROUTER_OUT_UNDNAT, 0, "1", "next;"); -+ ovn_lflow_add(lflows, od, S_ROUTER_OUT_EGR_LOOP, 0, "1", "next;"); -+ ovn_lflow_add(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 0, "1", "next;"); -+ -+ /* Send the IPv6 NS packets to next table. 
When ovn-controller -+ * generates IPv6 NS (for the action - nd_ns{}), the injected -+ * packet would go through conntrack - which is not required. */ -+ ovn_lflow_add(lflows, od, S_ROUTER_OUT_SNAT, 120, "nd_ns", "next;"); -+ -+ /* NAT rules are only valid on Gateway routers and routers with -+ * l3dgw_port (router has a port with gateway chassis -+ * specified). */ -+ if (!smap_get(&od->nbr->options, "chassis") && !od->l3dgw_port) { -+ return; -+ } - -- /* The priority here is calculated such that the -- * nat->logical_ip with the longest mask gets a higher -- * priority. */ -- ovn_lflow_add_with_hint(lflows, od, S_ROUTER_OUT_SNAT, -- priority, ds_cstr(match), -- ds_cstr(actions), -- &nat->header_); -- } -- } -+ struct sset nat_entries = SSET_INITIALIZER(&nat_entries); - -- /* Logical router ingress table 0: -- * For NAT on a distributed router, add rules allowing -- * ingress traffic with eth.dst matching nat->external_mac -- * on the l3dgw_port instance where nat->logical_port is -- * resident. */ -- if (distributed) { -- /* Store the ethernet address of the port receiving the packet. -- * This will save us from having to match on inport further -- * down in the pipeline. 
-- */ -- ds_clear(actions); -- ds_put_format(actions, REG_INPORT_ETH_ADDR " = %s; next;", -- od->l3dgw_port->lrp_networks.ea_s); -+ bool dnat_force_snat_ip = -+ !lport_addresses_is_empty(&od->dnat_force_snat_addrs); -+ bool lb_force_snat_ip = -+ !lport_addresses_is_empty(&od->lb_force_snat_addrs); - -- ds_clear(match); -- ds_put_format(match, -- "eth.dst == "ETH_ADDR_FMT" && inport == %s" -- " && is_chassis_resident(\"%s\")", -- ETH_ADDR_ARGS(mac), -- od->l3dgw_port->json_key, -- nat->logical_port); -- ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ADMISSION, 50, -- ds_cstr(match), ds_cstr(actions), -- &nat->header_); -- } -+ for (int i = 0; i < od->nbr->n_nat; i++) { -+ const struct nbrec_nat *nat = nat = od->nbr->nat[i]; -+ struct eth_addr mac = eth_addr_broadcast; -+ bool is_v6, distributed; -+ ovs_be32 mask; -+ int cidr_bits; - -- /* Ingress Gateway Redirect Table: For NAT on a distributed -- * router, add flows that are specific to a NAT rule. These -- * flows indicate the presence of an applicable NAT rule that -- * can be applied in a distributed manner. -- * In particulr REG_SRC_IPV4/REG_SRC_IPV6 and eth.src are set to -- * NAT external IP and NAT external mac so the ARP request -- * generated in the following stage is sent out with proper IP/MAC -- * src addresses. -- */ -- if (distributed) { -- ds_clear(match); -- ds_clear(actions); -- ds_put_format(match, -- "ip%s.src == %s && outport == %s && " -- "is_chassis_resident(\"%s\")", -- is_v6 ? "6" : "4", nat->logical_ip, -- od->l3dgw_port->json_key, nat->logical_port); -- ds_put_format(actions, "eth.src = %s; %s = %s; next;", -- nat->external_mac, -- is_v6 ? 
REG_SRC_IPV6 : REG_SRC_IPV4, -- nat->external_ip); -- ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_GW_REDIRECT, -- 100, ds_cstr(match), -- ds_cstr(actions), &nat->header_); -- } -+ if (lrouter_check_nat_entry(od, nat, &mask, &is_v6, &cidr_bits, -+ &mac, &distributed) < 0) { -+ continue; -+ } - -- /* Egress Loopback table: For NAT on a distributed router. -- * If packets in the egress pipeline on the distributed -- * gateway port have ip.dst matching a NAT external IP, then -- * loop a clone of the packet back to the beginning of the -- * ingress pipeline with inport = outport. */ -- if (od->l3dgw_port) { -- /* Distributed router. */ -- ds_clear(match); -- ds_put_format(match, "ip%s.dst == %s && outport == %s", -- is_v6 ? "6" : "4", -- nat->external_ip, -- od->l3dgw_port->json_key); -- if (!distributed) { -- ds_put_format(match, " && is_chassis_resident(%s)", -- od->l3redirect_port->json_key); -- } else { -- ds_put_format(match, " && is_chassis_resident(\"%s\")", -- nat->logical_port); -- } -+ /* S_ROUTER_IN_UNSNAT */ -+ build_lrouter_in_unsnat_flow(lflows, od, nat, match, actions, distributed, -+ is_v6); -+ /* S_ROUTER_IN_DNAT */ -+ build_lrouter_in_dnat_flow(lflows, od, nat, match, actions, distributed, -+ mask, is_v6); - -+ /* ARP resolve for NAT IPs. */ -+ if (od->l3dgw_port) { -+ if (!sset_contains(&nat_entries, nat->external_ip)) { -+ ds_clear(match); -+ ds_put_format( -+ match, "outport == %s && %s == %s", -+ od->l3dgw_port->json_key, -+ is_v6 ? 
REG_NEXT_HOP_IPV6 : REG_NEXT_HOP_IPV4, -+ nat->external_ip); - ds_clear(actions); -- ds_put_format(actions, -- "clone { ct_clear; " -- "inport = outport; outport = \"\"; " -- "flags = 0; flags.loopback = 1; "); -- for (int j = 0; j < MFF_N_LOG_REGS; j++) { -- ds_put_format(actions, "reg%d = 0; ", j); -- } -- ds_put_format(actions, REGBIT_EGRESS_LOOPBACK" = 1; " -- "next(pipeline=ingress, table=%d); };", -- ovn_stage_get_table(S_ROUTER_IN_ADMISSION)); -- ovn_lflow_add_with_hint(lflows, od, S_ROUTER_OUT_EGR_LOOP, 100, -- ds_cstr(match), ds_cstr(actions), -+ ds_put_format( -+ actions, "eth.dst = %s; next;", -+ distributed ? nat->external_mac : -+ od->l3dgw_port->lrp_networks.ea_s); -+ ovn_lflow_add_with_hint(lflows, od, -+ S_ROUTER_IN_ARP_RESOLVE, -+ 100, ds_cstr(match), -+ ds_cstr(actions), - &nat->header_); -+ sset_add(&nat_entries, nat->external_ip); - } -- } -- -- /* Handle force SNAT options set in the gateway router. */ -- if (!od->l3dgw_port) { -- if (dnat_force_snat_ip) { -- if (od->dnat_force_snat_addrs.n_ipv4_addrs) { -- build_lrouter_force_snat_flows(lflows, od, "4", -- od->dnat_force_snat_addrs.ipv4_addrs[0].addr_s, -- "dnat"); -- } -- if (od->dnat_force_snat_addrs.n_ipv6_addrs) { -- build_lrouter_force_snat_flows(lflows, od, "6", -- od->dnat_force_snat_addrs.ipv6_addrs[0].addr_s, -- "dnat"); -- } -- } -- if (lb_force_snat_ip) { -- if (od->lb_force_snat_addrs.n_ipv4_addrs) { -- build_lrouter_force_snat_flows(lflows, od, "4", -- od->lb_force_snat_addrs.ipv4_addrs[0].addr_s, "lb"); -- } -- if (od->lb_force_snat_addrs.n_ipv6_addrs) { -- build_lrouter_force_snat_flows(lflows, od, "6", -- od->lb_force_snat_addrs.ipv6_addrs[0].addr_s, "lb"); -- } -+ } else { -+ /* Add the NAT external_ip to the nat_entries even for -+ * gateway routers. 
This is required for adding load balancer -+ * flows.*/ -+ sset_add(&nat_entries, nat->external_ip); -+ } -+ -+ /* S_ROUTER_OUT_UNDNAT */ -+ build_lrouter_out_undnat_flow(lflows, od, nat, match, actions, distributed, -+ mac, is_v6); -+ /* S_ROUTER_OUT_SNAT */ -+ build_lrouter_out_snat_flow(lflows, od, nat, match, actions, distributed, -+ mac, mask, cidr_bits, is_v6); -+ -+ /* S_ROUTER_IN_ADMISSION - S_ROUTER_IN_IP_INPUT */ -+ build_lrouter_ingress_flow(lflows, od, nat, match, actions, -+ mac, distributed, is_v6); -+ -+ /* Ingress Gateway Redirect Table: For NAT on a distributed -+ * router, add flows that are specific to a NAT rule. These -+ * flows indicate the presence of an applicable NAT rule that -+ * can be applied in a distributed manner. -+ * In particulr REG_SRC_IPV4/REG_SRC_IPV6 and eth.src are set to -+ * NAT external IP and NAT external mac so the ARP request -+ * generated in the following stage is sent out with proper IP/MAC -+ * src addresses. -+ */ -+ if (distributed) { -+ ds_clear(match); -+ ds_clear(actions); -+ ds_put_format(match, -+ "ip%s.src == %s && outport == %s && " -+ "is_chassis_resident(\"%s\")", -+ is_v6 ? "6" : "4", nat->logical_ip, -+ od->l3dgw_port->json_key, nat->logical_port); -+ ds_put_format(actions, "eth.src = %s; %s = %s; next;", -+ nat->external_mac, -+ is_v6 ? REG_SRC_IPV6 : REG_SRC_IPV4, -+ nat->external_ip); -+ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_GW_REDIRECT, -+ 100, ds_cstr(match), -+ ds_cstr(actions), &nat->header_); -+ } -+ -+ /* Egress Loopback table: For NAT on a distributed router. -+ * If packets in the egress pipeline on the distributed -+ * gateway port have ip.dst matching a NAT external IP, then -+ * loop a clone of the packet back to the beginning of the -+ * ingress pipeline with inport = outport. */ -+ if (od->l3dgw_port) { -+ /* Distributed router. */ -+ ds_clear(match); -+ ds_put_format(match, "ip%s.dst == %s && outport == %s", -+ is_v6 ? 
"6" : "4", -+ nat->external_ip, -+ od->l3dgw_port->json_key); -+ if (!distributed) { -+ ds_put_format(match, " && is_chassis_resident(%s)", -+ od->l3redirect_port->json_key); -+ } else { -+ ds_put_format(match, " && is_chassis_resident(\"%s\")", -+ nat->logical_port); - } - -- /* For gateway router, re-circulate every packet through -- * the DNAT zone. This helps with the following. -- * -- * Any packet that needs to be unDNATed in the reverse -- * direction gets unDNATed. Ideally this could be done in -- * the egress pipeline. But since the gateway router -- * does not have any feature that depends on the source -- * ip address being external IP address for IP routing, -- * we can do it here, saving a future re-circulation. */ -- ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 50, -- "ip", "flags.loopback = 1; ct_dnat;"); -+ ds_clear(actions); -+ ds_put_format(actions, -+ "clone { ct_clear; " -+ "inport = outport; outport = \"\"; " -+ "flags = 0; flags.loopback = 1; "); -+ for (int j = 0; j < MFF_N_LOG_REGS; j++) { -+ ds_put_format(actions, "reg%d = 0; ", j); -+ } -+ ds_put_format(actions, REGBIT_EGRESS_LOOPBACK" = 1; " -+ "next(pipeline=ingress, table=%d); };", -+ ovn_stage_get_table(S_ROUTER_IN_ADMISSION)); -+ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_OUT_EGR_LOOP, 100, -+ ds_cstr(match), ds_cstr(actions), -+ &nat->header_); - } -+ } - -- /* Load balancing and packet defrag are only valid on -- * Gateway routers or router with gateway port. */ -- if (!smap_get(&od->nbr->options, "chassis") && !od->l3dgw_port) { -- sset_destroy(&nat_entries); -- return; -+ /* Handle force SNAT options set in the gateway router. 
*/ -+ if (!od->l3dgw_port) { -+ if (dnat_force_snat_ip) { -+ if (od->dnat_force_snat_addrs.n_ipv4_addrs) { -+ build_lrouter_force_snat_flows(lflows, od, "4", -+ od->dnat_force_snat_addrs.ipv4_addrs[0].addr_s, -+ "dnat"); -+ } -+ if (od->dnat_force_snat_addrs.n_ipv6_addrs) { -+ build_lrouter_force_snat_flows(lflows, od, "6", -+ od->dnat_force_snat_addrs.ipv6_addrs[0].addr_s, -+ "dnat"); -+ } - } -- -- /* A set to hold all ips that need defragmentation and tracking. */ -- struct sset all_ips = SSET_INITIALIZER(&all_ips); -- -- for (int i = 0; i < od->nbr->n_load_balancer; i++) { -- struct nbrec_load_balancer *nb_lb = od->nbr->load_balancer[i]; -- struct ovn_northd_lb *lb = -- ovn_northd_lb_find(lbs, &nb_lb->header_.uuid); -- ovs_assert(lb); -- -- for (size_t j = 0; j < lb->n_vips; j++) { -- struct ovn_lb_vip *lb_vip = &lb->vips[j]; -- struct ovn_northd_lb_vip *lb_vip_nb = &lb->vips_nb[j]; -- ds_clear(actions); -- build_lb_vip_actions(lb_vip, lb_vip_nb, actions, -- lb->selection_fields, false); -- -- if (!sset_contains(&all_ips, lb_vip->vip_str)) { -- sset_add(&all_ips, lb_vip->vip_str); -- /* If there are any load balancing rules, we should send -- * the packet to conntrack for defragmentation and -- * tracking. This helps with two things. -- * -- * 1. With tracking, we can send only new connections to -- * pick a DNAT ip address from a group. -- * 2. If there are L4 ports in load balancing rules, we -- * need the defragmentation to match on L4 ports. */ -- ds_clear(match); -- if (IN6_IS_ADDR_V4MAPPED(&lb_vip->vip)) { -- ds_put_format(match, "ip && ip4.dst == %s", -- lb_vip->vip_str); -- } else { -- ds_put_format(match, "ip && ip6.dst == %s", -- lb_vip->vip_str); -- } -- ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DEFRAG, -- 100, ds_cstr(match), "ct_next;", -- &nb_lb->header_); -- } -- -- /* Higher priority rules are added for load-balancing in DNAT -- * table. For every match (on a VIP[:port]), we add two flows -- * via add_router_lb_flow(). 
One flow is for specific matching -- * on ct.new with an action of "ct_lb($targets);". The other -- * flow is for ct.est with an action of "ct_dnat;". */ -- ds_clear(match); -- if (IN6_IS_ADDR_V4MAPPED(&lb_vip->vip)) { -- ds_put_format(match, "ip && ip4.dst == %s", -- lb_vip->vip_str); -- } else { -- ds_put_format(match, "ip && ip6.dst == %s", -- lb_vip->vip_str); -- } -- -- int prio = 110; -- bool is_udp = nullable_string_is_equal(nb_lb->protocol, "udp"); -- bool is_sctp = nullable_string_is_equal(nb_lb->protocol, -- "sctp"); -- const char *proto = is_udp ? "udp" : is_sctp ? "sctp" : "tcp"; -- -- if (lb_vip->vip_port) { -- ds_put_format(match, " && %s && %s.dst == %d", proto, -- proto, lb_vip->vip_port); -- prio = 120; -- } -- -- if (od->l3redirect_port && -- (lb_vip->n_backends || !lb_vip->empty_backend_rej)) { -- ds_put_format(match, " && is_chassis_resident(%s)", -- od->l3redirect_port->json_key); -- } -- bool force_snat_for_lb = -- lb_force_snat_ip || od->lb_force_snat_router_ip; -- add_router_lb_flow(lflows, od, match, actions, prio, -- force_snat_for_lb, lb_vip, proto, -- nb_lb, meter_groups, &nat_entries); -+ if (lb_force_snat_ip) { -+ if (od->lb_force_snat_addrs.n_ipv4_addrs) { -+ build_lrouter_force_snat_flows(lflows, od, "4", -+ od->lb_force_snat_addrs.ipv4_addrs[0].addr_s, "lb"); -+ } -+ if (od->lb_force_snat_addrs.n_ipv6_addrs) { -+ build_lrouter_force_snat_flows(lflows, od, "6", -+ od->lb_force_snat_addrs.ipv6_addrs[0].addr_s, "lb"); - } - } -- sset_destroy(&all_ips); -+ -+ /* For gateway router, re-circulate every packet through -+ * the DNAT zone. This helps with the following. -+ * -+ * Any packet that needs to be unDNATed in the reverse -+ * direction gets unDNATed. Ideally this could be done in -+ * the egress pipeline. But since the gateway router -+ * does not have any feature that depends on the source -+ * ip address being external IP address for IP routing, -+ * we can do it here, saving a future re-circulation. 
*/ -+ ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 50, -+ "ip", "flags.loopback = 1; ct_dnat;"); -+ } -+ -+ /* Load balancing and packet defrag are only valid on -+ * Gateway routers or router with gateway port. */ -+ if (!smap_get(&od->nbr->options, "chassis") && !od->l3dgw_port) { - sset_destroy(&nat_entries); -+ return; - } -+ -+ build_lrouter_lb_flows(lflows, od, lbs, meter_groups, &nat_entries, -+ match, actions); -+ -+ sset_destroy(&nat_entries); - } - - -@@ -12909,6 +12985,9 @@ ovnnb_db_run(struct northd_context *ctx, - - use_logical_dp_groups = smap_get_bool(&nb->options, - "use_logical_dp_groups", false); -+ use_ct_inv_match = smap_get_bool(&nb->options, -+ "use_ct_inv_match", true); -+ - /* deprecated, use --event instead */ - controller_event_en = smap_get_bool(&nb->options, - "controller_event", false); -diff --git a/ovn-nb.xml b/ovn-nb.xml -index b0a4adffe..046d053e9 100644 ---- a/ovn-nb.xml -+++ b/ovn-nb.xml -@@ -227,6 +227,21 @@ -
- - -+
-+ If set to false, ovn-northd
will not use the
-+ ct.inv
field in any of the logical flow matches.
-+ The default value is true. If the NIC supports offloading
-+ OVS datapath flows but doesn't support offloading ct_state
-+ inv
flag, then the datapath flows matching on this flag
-+ (either +inv
or -inv
) will not be
-+ offloaded. CMS should consider setting use_ct_inv_match
-+ to false
in such cases. As a side effect, invalid
-+ packets are then delivered to the destination VIF instead of
-+ being dropped by OVN
.
-+
- These options control how routes are advertised between OVN
-@@ -1653,6 +1668,12 @@
- exactly one IPv4 and/or one IPv6 address on it, separated by a space
- character.
-
-+
-+ If the load balancer is configured with the skip_snat
-+ option, the force_snat_for_lb option configured for the router
-+ pipeline will not be applied for this load balancer.
-+