Blob Blame History Raw
From 16476ae504d692ec0a7440c27037d155a3f76445 Mon Sep 17 00:00:00 2001
From: Numan Siddique <numans@ovn.org>
Date: Tue, 23 Feb 2021 00:18:42 +0530
Subject: [PATCH] ofctrl: Fix the assert seen when flood removing flows with
 conj actions.

In one of the scaled deployments, ovn-controller is asserting with the
below stack trace

    ***
     (gdb) bt
       0  raise () from /lib64/libc.so.6
       1  abort () from /lib64/libc.so.6
       2  ovs_abort_valist ("%s: assertion %s failed in %s()") at lib/util.c:419
       3  vlog_abort_valist ("%s: assertion %s failed in %s()") at lib/vlog.c:1249
       4  vlog_abort ("%s: assertion %s failed in %s()") at lib/vlog.c:1263
       5  ovs_assert_failure (where="controller/ofctrl.c:1216",
                              function="flood_remove_flows_for_sb_uuid",
                              condition="ovs_list_is_empty(&f->list_node)") at lib/util.c:86
       6  flood_remove_flows_for_sb_uuid (sb_uuid=...134,
            flood_remove_nodes=...ef0) at controller/ofctrl.c:1216
       9  ofctrl_flood_remove_flows (flood_remove_nodes=...ef0) at controller/ofctrl.c:1280
       10 lflow_handle_changed_ref (ref_type=REF_TYPE_PORTGROUP,
            ref_name= "5564_pg_14...aac") at controller/lflow.c:522
       11 _flow_output_resource_ref_handler (ref_type=REF_TYPE_PORTGROUP)
            at controller/ovn-controller.c:2220
       12 engine_compute () at lib/inc-proc-eng.c:306
       13 engine_run_node (recompute_allowed=true) at lib/inc-proc-eng.c:352
       14 engine_run (recompute_allowed=true) at lib/inc-proc-eng.c:377
       15 main () at controller/ovn-controller.c:2887
    ***

The reason for this assert is different from the assertion bug fixed in
the commit 858d1dd716("ofctrl: Fix the assert seen when flood removing
flows.")

The above assertion is seen if lflow.c calls ofctrl_add_or_append_flow()
twice for the same flow uuid 'S'.  When this function is called for
the first time, a new desired flow 'f' is created and 'f->references'
will have [(S, f)].  When it is called for the 2nd time, the list
'f->references' is updated as [(S, f), (S,f)].  Later when
flood_remove_flows_for_sb_uuid() is called for 'S', the above assertion
is seen.

This scenarion can happen when ovn-controller receives an update message
from SB ovsdb-server in which a new logical flow 'F' belonging to a
datapath 'D' is added which would result in conjunction flows.  If this
datapath 'D' is added to 'local_datapaths' in the same loop, then
logical flow 'F' is first processed in lflow_add_flows_for_datapath()
and the second time in lflow_handle_changed_flows().

This issue can be fixed in 2 ways
1. Flood remove all the tracked flows in lflow_handle_changed_flows()
   first and then process the modified or newly added logical flows.

2. In the function ofctrl_add_or_append_flow(), check if there is
   already a reference for the flow uuid 'S' in the existing desired
   flows and only append the actions if it doesn't exist.

This patch has taken the approach (1) to ensure correctness of flows.

Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=1929978
Signed-off-by: Numan Siddique <numans@ovn.org>
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Numan Siddique <numans@ovn.org>

(cherry-picked from master commit c6c61b4e3462fb5201a61a226c2acaf6f4caf917)
---
 controller/lflow.c |  34 ++++-----------
 tests/ovn.at       | 104 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 112 insertions(+), 26 deletions(-)

diff --git a/controller/lflow.c b/controller/lflow.c
index 2b7d3565e..21ff51d2b 100644
--- a/controller/lflow.c
+++ b/controller/lflow.c
@@ -480,22 +480,19 @@ lflow_handle_changed_flows(struct lflow_ctx_in *l_ctx_in,
     struct controller_event_options controller_event_opts;
     controller_event_opts_init(&controller_event_opts);
 
-    /* Handle flow removing first (for deleted or updated lflows), and then
-     * handle reprocessing or adding flows, so that when the flows being
-     * removed and added with same match conditions can be processed in the
-     * proper order */
-
+    /* Flood remove the flows for all the tracked lflows.  Its possible that
+     * lflow_add_flows_for_datapath() may have been called before calling
+     * this function. */
     struct hmap flood_remove_nodes = HMAP_INITIALIZER(&flood_remove_nodes);
     struct ofctrl_flood_remove_node *ofrn, *next;
     SBREC_LOGICAL_FLOW_TABLE_FOR_EACH_TRACKED (lflow,
                                                l_ctx_in->logical_flow_table) {
+        VLOG_DBG("delete lflow "UUID_FMT, UUID_ARGS(&lflow->header_.uuid));
+        ofrn = xmalloc(sizeof *ofrn);
+        ofrn->sb_uuid = lflow->header_.uuid;
+        hmap_insert(&flood_remove_nodes, &ofrn->hmap_node,
+                    uuid_hash(&ofrn->sb_uuid));
         if (!sbrec_logical_flow_is_new(lflow)) {
-            VLOG_DBG("delete lflow "UUID_FMT,
-                     UUID_ARGS(&lflow->header_.uuid));
-            ofrn = xmalloc(sizeof *ofrn);
-            ofrn->sb_uuid = lflow->header_.uuid;
-            hmap_insert(&flood_remove_nodes, &ofrn->hmap_node,
-                        uuid_hash(&ofrn->sb_uuid));
             if (l_ctx_out->lflow_cache_map) {
                 lflow_cache_delete(l_ctx_out->lflow_cache_map, lflow);
             }
@@ -525,21 +522,6 @@ lflow_handle_changed_flows(struct lflow_ctx_in *l_ctx_in,
     }
     hmap_destroy(&flood_remove_nodes);
 
-    /* Now handle new lflows only. */
-    SBREC_LOGICAL_FLOW_TABLE_FOR_EACH_TRACKED (lflow,
-                                               l_ctx_in->logical_flow_table) {
-        if (sbrec_logical_flow_is_new(lflow)) {
-            VLOG_DBG("add lflow "UUID_FMT,
-                     UUID_ARGS(&lflow->header_.uuid));
-            if (!consider_logical_flow(lflow, &dhcp_opts, &dhcpv6_opts,
-                                       &nd_ra_opts, &controller_event_opts,
-                                       l_ctx_in, l_ctx_out)) {
-                ret = false;
-                l_ctx_out->conj_id_overflow = true;
-                break;
-            }
-        }
-    }
     dhcp_opts_destroy(&dhcp_opts);
     dhcp_opts_destroy(&dhcpv6_opts);
     nd_ra_opts_destroy(&nd_ra_opts);
diff --git a/tests/ovn.at b/tests/ovn.at
index e403fd0f4..6cb4a4cad 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -23963,3 +23963,107 @@ done
 
 OVN_CLEANUP([hv1])
 AT_CLEANUP
+
+# Test case to check that ovn-controller doesn't assert when
+# handling conjunction flows.  When ovn-controller claims
+# the first port of a logical switch datapath, it programs the flows
+# for this datapath incrementally (without full recompute).  If
+# suppose, in the same SB update from ovsdb-server, a logical flow is added
+# which results in conjunction action, then this logical flow is also
+# handled incrementally.  The newly added logical flow is processed
+# twice which results in wrong oflows and it results in an assertion
+# in ovn-controller.  Test this ovn-controller handles this scenario
+# properly and doesn't assert.
+AT_SETUP([ovn -- No ovn-controller assert when generating conjunction flows])
+ovn_start
+
+net_add n1
+sim_add hv1
+as hv1
+ovs-vsctl add-br br-phys
+ovn_attach n1 br-phys 192.168.0.10
+
+as hv1
+ovs-vsctl \
+    -- add-port br-int vif1 \
+    -- set Interface vif1 external_ids:iface-id=sw0-p1 \
+    ofport-request=1
+
+check as hv1
+ovs-vsctl set open . external_ids:ovn-monitor-all=true
+
+check ovn-nbctl ls-add sw0
+check ovn-nbctl pg-add pg1
+check ovn-nbctl pg-add pg2
+check ovn-nbctl lsp-add sw0 sw0-p2
+check ovn-nbctl lsp-set-addresses sw0-p2 "00:00:00:00:00:02 192.168.47.2"
+check ovn-nbctl lsp-add sw0 sw0-p3
+check ovn-nbctl lsp-set-addresses sw0-p3 "00:00:00:00:00:03 192.168.47.3"
+
+# Pause ovn-northd. When it is resumed, all the below NB updates
+# will be sent in one transaction.
+
+check as northd ovn-appctl -t ovn-northd pause
+check as northd-backup ovn-appctl -t ovn-northd pause
+
+check ovn-nbctl lsp-add sw0 sw0-p1
+check ovn-nbctl lsp-set-addresses sw0-p1 "00:00:00:00:00:01 192.168.47.1"
+check ovn-nbctl pg-set-ports pg1 sw0-p1 sw0-p2
+check ovn-nbctl pg-set-ports pg2 sw0-p3
+check ovn-nbctl acl-add pg1 to-lport 1002 "outport == @pg1 && ip4 && ip4.src == \$pg2_ip4 && udp && udp.dst >= 1 && udp.dst <= 65535" allow-related
+
+# resume ovn-northd now. This should result in a single update message
+# from SB ovsdb-server to ovn-controller for all the above NB updates.
+check as northd ovn-appctl -t ovn-northd resume
+
+AS_BOX([Wait for sw0-p1 to be up])
+wait_for_ports_up sw0-p1
+
+# When the port group pg1 is updated, it should not result in
+# any assert in ovn-controller.
+ovn-nbctl --wait=hv pg-set-ports pg1 sw0-p1 sw0-p2 sw0-p3
+AT_CHECK([kill -0 $(cat hv1/ovn-controller.pid)])
+check ovn-nbctl --wait=hv sync
+
+# Check OVS flows are installed properly.
+AT_CHECK([as hv1 ovs-ofctl dump-flows br-int table=45 | ofctl_strip_all | \
+    grep "priority=2002" | grep conjunction | \
+    sed 's/conjunction([[^)]]*)/conjunction()/g' | sort], [0], [dnl
+ table=45, priority=2002,udp,reg0=0x100/0x100,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x10/0xfff0 actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x100/0x100,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x100/0xff00 actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x100/0x100,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x1000/0xf000 actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x100/0x100,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x2/0xfffe actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x100/0x100,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x20/0xffe0 actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x100/0x100,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x200/0xfe00 actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x100/0x100,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x2000/0xe000 actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x100/0x100,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x4/0xfffc actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x100/0x100,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x40/0xffc0 actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x100/0x100,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x400/0xfc00 actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x100/0x100,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x4000/0xc000 actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x100/0x100,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x8/0xfff8 actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x100/0x100,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x80/0xff80 actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x100/0x100,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x800/0xf800 actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x100/0x100,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x8000/0x8000 actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x100/0x100,metadata=0x1,nw_src=192.168.47.3,tp_dst=1 actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x100/0x100,reg15=0x3,metadata=0x1,nw_src=192.168.47.3 actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x80/0x80,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x10/0xfff0 actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x80/0x80,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x100/0xff00 actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x80/0x80,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x1000/0xf000 actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x80/0x80,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x2/0xfffe actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x80/0x80,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x20/0xffe0 actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x80/0x80,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x200/0xfe00 actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x80/0x80,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x2000/0xe000 actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x80/0x80,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x4/0xfffc actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x80/0x80,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x40/0xffc0 actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x80/0x80,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x400/0xfc00 actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x80/0x80,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x4000/0xc000 actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x80/0x80,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x8/0xfff8 actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x80/0x80,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x80/0xff80 actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x80/0x80,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x800/0xf800 actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x80/0x80,metadata=0x1,nw_src=192.168.47.3,tp_dst=0x8000/0x8000 actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x80/0x80,metadata=0x1,nw_src=192.168.47.3,tp_dst=1 actions=conjunction()
+ table=45, priority=2002,udp,reg0=0x80/0x80,reg15=0x3,metadata=0x1,nw_src=192.168.47.3 actions=conjunction()
+])
+
+OVN_CLEANUP([hv1])
+AT_CLEANUP
-- 
2.29.2