diff --git a/.gitignore b/.gitignore index a7384fa..4d0e77b 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,4 @@ /dpdk-17.11.tar.xz /openvswitch-2.9.1.tar.gz /openvswitch-2.9.2.tar.gz +/openvswitch-2.10.0.tar.gz diff --git a/0001-Add-ovs.compat-module-to-python-package.patch b/0001-Add-ovs.compat-module-to-python-package.patch deleted file mode 100644 index b0b0733..0000000 --- a/0001-Add-ovs.compat-module-to-python-package.patch +++ /dev/null @@ -1,31 +0,0 @@ -From f18adea51cac4f40c50d59d7c001264a8ce83cb3 Mon Sep 17 00:00:00 2001 -From: Terry Wilson -Date: Fri, 31 Aug 2018 13:40:54 -0500 -Subject: [PATCH] Add ovs.compat module to python package - -Signed-off-by: Terry Wilson -Signed-off-by: Ben Pfaff -Acked-by: Timothy Redaelli -(cherry picked from commit 2360464d629de3acacabd960ffc02fbb5081028d) -Signed-off-by: Ben Pfaff ---- - python/setup.py | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/python/setup.py b/python/setup.py -index 0e86834ea..b52657df3 100644 ---- a/python/setup.py -+++ b/python/setup.py -@@ -63,7 +63,8 @@ setup_args = dict( - url='http://www.openvswitch.org/', - author='Open vSwitch', - author_email='dev@openvswitch.org', -- packages=['ovs', 'ovs.db', 'ovs.unixctl'], -+ packages=['ovs', 'ovs.compat', 'ovs.compat.sortedcontainers', -+ 'ovs.db', 'ovs.unixctl'], - keywords=['openvswitch', 'ovs', 'OVSDB'], - license='Apache 2.0', - classifiers=[ --- -2.17.1 - diff --git a/0001-OVN-add-CT_LB-action-to-ovn-trace.patch b/0001-OVN-add-CT_LB-action-to-ovn-trace.patch new file mode 100644 index 0000000..a712bb8 --- /dev/null +++ b/0001-OVN-add-CT_LB-action-to-ovn-trace.patch @@ -0,0 +1,218 @@ +From b37f8c15ca6ee079541b0c02ee77ce9d392b18fc Mon Sep 17 00:00:00 2001 +Message-Id: +In-Reply-To: +References: +From: Lorenzo Bianconi +Date: Thu, 20 Sep 2018 16:46:02 +0200 +Subject: [PATCH] OVN: add CT_LB action to ovn-trace + +Add CT_LB action to ovn-trace utility in order to fix the +following ovn-trace error if a load balancer rule 
is added to +OVN configuration + +ct_next(ct_state=est|trk /* default (use --ct to customize) */) { + *** ct_lb action not implemented; +}; + +Add '--lb-dst' option in order to specify the ip address to use +in VIP pool. If --lb-dst is not provided the destination ip will be +randomly chosen + +Signed-off-by: Lorenzo Bianconi +Signed-off-by: Ben Pfaff +--- + ovn/utilities/ovn-trace.8.xml | 18 ++++++- + ovn/utilities/ovn-trace.c | 98 +++++++++++++++++++++++++++++++++-- + 2 files changed, 111 insertions(+), 5 deletions(-) + +--- a/ovn/utilities/ovn-trace.8.xml ++++ b/ovn/utilities/ovn-trace.8.xml +@@ -253,9 +253,17 @@ + ct_snat) action. + + +-
ct_lb
++
ct_lb;
++
ct_lb(ip[:port]...);
+
+- Not yet implemented; currently implemented as a no-op. ++ Forks the pipeline. In one fork, sets ip4.dst (or ++ ip6.dst) to one of the load-balancer addresses and the ++ destination port to its associated port, if any, and sets ++ ct.dnat to 1. With one or more arguments, gives preference ++ to the address specified on --lb-dst, if any; without ++ arguments, uses the address and port specified on --lb-dst. ++ In the other fork, the pipeline continues without change after the ++ ct_lb action. +
+ +
ct_commit
+@@ -424,6 +432,12 @@ +

+ + ++
--lb-dst=ip[:port]
++
++ Sets the IP from VIP pool to use as destination of the packet. ++ --lb-dst is not available in daemon mode. ++
++ +
--friendly-names
+
--no-friendly-names
+
+--- a/ovn/utilities/ovn-trace.c ++++ b/ovn/utilities/ovn-trace.c +@@ -46,6 +46,7 @@ + #include "stream.h" + #include "unixctl.h" + #include "util.h" ++#include "random.h" + + VLOG_DEFINE_THIS_MODULE(ovntrace); + +@@ -77,6 +78,9 @@ static uint32_t *ct_states; + static size_t n_ct_states; + static size_t ct_state_idx; + ++/* --lb-dst: load balancer destination info. */ ++static struct ovnact_ct_lb_dst lb_dst; ++ + /* --friendly-names, --no-friendly-names: Whether to substitute human-friendly + * port and datapath names for the awkward UUIDs typically used in the actual + * logical flows. */ +@@ -187,6 +191,24 @@ parse_ct_option(const char *state_s_) + } + + static void ++parse_lb_option(const char *s) ++{ ++ struct sockaddr_storage ss; ++ if (!inet_parse_active(s, 0, &ss)) { ++ ovs_fatal(0, "%s: bad address", s); ++ } ++ ++ lb_dst.family = ss.ss_family; ++ struct in6_addr a = ss_get_address(&ss); ++ if (ss.ss_family == AF_INET) { ++ lb_dst.ipv4 = in6_addr_get_mapped_ipv4(&a); ++ } else { ++ lb_dst.ipv6 = a; ++ } ++ lb_dst.port = ss_get_port(&ss); ++} ++ ++static void + parse_options(int argc, char *argv[]) + { + enum { +@@ -202,7 +224,8 @@ parse_options(int argc, char *argv[]) + OPT_NO_FRIENDLY_NAMES, + DAEMON_OPTION_ENUMS, + SSL_OPTION_ENUMS, +- VLOG_OPTION_ENUMS ++ VLOG_OPTION_ENUMS, ++ OPT_LB_DST + }; + static const struct option long_options[] = { + {"db", required_argument, NULL, OPT_DB}, +@@ -217,6 +240,7 @@ parse_options(int argc, char *argv[]) + {"no-friendly-names", no_argument, NULL, OPT_NO_FRIENDLY_NAMES}, + {"help", no_argument, NULL, 'h'}, + {"version", no_argument, NULL, 'V'}, ++ {"lb-dst", required_argument, NULL, OPT_LB_DST}, + DAEMON_LONG_OPTIONS, + VLOG_LONG_OPTIONS, + STREAM_SSL_LONG_OPTIONS, +@@ -274,6 +298,10 @@ parse_options(int argc, char *argv[]) + use_friendly_names = false; + break; + ++ case OPT_LB_DST: ++ parse_lb_option(optarg); ++ break; ++ + case 'h': + usage(); + +@@ -1823,6 +1851,71 @@ execute_ct_nat(const struct ovnact_ct_na + } + + 
static void ++execute_ct_lb(const struct ovnact_ct_lb *ct_lb, ++ const struct ovntrace_datapath *dp, struct flow *uflow, ++ enum ovnact_pipeline pipeline, struct ovs_list *super) ++{ ++ struct flow ct_lb_flow = *uflow; ++ ++ int family = (ct_lb_flow.dl_type == htons(ETH_TYPE_IP) ? AF_INET ++ : ct_lb_flow.dl_type == htons(ETH_TYPE_IPV6) ? AF_INET6 ++ : AF_UNSPEC); ++ if (family != AF_UNSPEC) { ++ const struct ovnact_ct_lb_dst *dst = NULL; ++ if (ct_lb->n_dsts) { ++ /* For ct_lb with addresses, choose one of the addresses. */ ++ int n = 0; ++ for (int i = 0; i < ct_lb->n_dsts; i++) { ++ const struct ovnact_ct_lb_dst *d = &ct_lb->dsts[i]; ++ if (d->family != family) { ++ continue; ++ } ++ ++ /* Check for the destination specified by --lb-dst, if any. */ ++ if (lb_dst.family == family ++ && (family == AF_INET ++ ? d->ipv4 == lb_dst.ipv4 ++ : ipv6_addr_equals(&d->ipv6, &lb_dst.ipv6))) { ++ lb_dst.family = AF_UNSPEC; ++ dst = d; ++ break; ++ } ++ ++ /* Select a random destination as a fallback. */ ++ if (!random_range(++n)) { ++ dst = d; ++ } ++ } ++ ++ if (!dst) { ++ ovntrace_node_append(super, OVNTRACE_NODE_ERROR, ++ "*** no load balancing destination " ++ "(use --lb-dst)"); ++ } ++ } else if (lb_dst.family == family) { ++ /* For ct_lb without addresses, use user-specified address. 
*/ ++ dst = &lb_dst; ++ } ++ ++ if (dst) { ++ if (family == AF_INET6) { ++ ct_lb_flow.ipv6_dst = dst->ipv6; ++ } else { ++ ct_lb_flow.nw_dst = dst->ipv4; ++ } ++ if (dst->port) { ++ ct_lb_flow.tp_dst = htons(dst->port); ++ } ++ ct_lb_flow.ct_state |= CS_DST_NAT; ++ } ++ } ++ ++ struct ovntrace_node *node = ovntrace_node_append( ++ super, OVNTRACE_NODE_TRANSFORMATION, "ct_lb"); ++ trace__(dp, &ct_lb_flow, ct_lb->ltable, pipeline, &node->subs); ++} ++ ++static void + execute_log(const struct ovnact_log *log, struct flow *uflow, + struct ovs_list *super) + { +@@ -1910,8 +2003,7 @@ trace_actions(const struct ovnact *ovnac + break; + + case OVNACT_CT_LB: +- ovntrace_node_append(super, OVNTRACE_NODE_ERROR, +- "*** ct_lb action not implemented"); ++ execute_ct_lb(ovnact_get_CT_LB(a), dp, uflow, pipeline, super); + break; + + case OVNACT_CT_CLEAR: diff --git a/0001-dhparams-Fix-.c-file-generation-with-OpenSSL-1.1.1-p.patch b/0001-dhparams-Fix-.c-file-generation-with-OpenSSL-1.1.1-p.patch deleted file mode 100644 index ee28d29..0000000 --- a/0001-dhparams-Fix-.c-file-generation-with-OpenSSL-1.1.1-p.patch +++ /dev/null @@ -1,38 +0,0 @@ -From 44343cb1ca4232f23dba24cab98d3605686f5700 Mon Sep 17 00:00:00 2001 -From: Timothy Redaelli -Date: Fri, 7 Sep 2018 15:14:53 +0200 -Subject: [PATCH] dhparams: Fix .c file generation with OpenSSL >= 1.1.1-pre9 - -Since OpenSSL upstream commit 201b305a2409 -("apps/dsaparam.c generates code that is intended to be pasted or included into -an existing source file: the function is static, and the code doesn't include -dsa.h. Match the generated C source style of dsaparam.") "openssl dhparam -C" -generates the get_dh functions as static, but the functions are used inside -stream-ssl.c and so the static keyword cannot be used. - -This commit removes the static keyword from the get_dh functions during -dhparams.c file generation by restoring the current behaviour. 
- -Signed-off-by: Timothy Redaelli -Signed-off-by: Ben Pfaff -(cherry picked from commit dc041eae5019a936618c398a2a1d106f65604ccc) ---- - lib/automake.mk | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/lib/automake.mk b/lib/automake.mk -index 8ecad1241..70461ec8c 100644 ---- a/lib/automake.mk -+++ b/lib/automake.mk -@@ -438,7 +438,7 @@ lib/dhparams.c: lib/dh1024.pem lib/dh2048.pem lib/dh4096.pem - openssl dhparam -C -in $(srcdir)/lib/dh1024.pem -noout && \ - openssl dhparam -C -in $(srcdir)/lib/dh2048.pem -noout && \ - openssl dhparam -C -in $(srcdir)/lib/dh4096.pem -noout) \ -- | sed 's/\(get_dh[0-9]*\)()/\1(void)/' > lib/dhparams.c.tmp && \ -+ | sed 's/^static DH/DH/; s/\(get_dh[0-9]*\)()/\1(void)/' > lib/dhparams.c.tmp && \ - mv lib/dhparams.c.tmp lib/dhparams.c - else - lib_libopenvswitch_la_SOURCES += lib/stream-nossl.c --- -2.17.1 - diff --git a/0001-dpif-Remove-support-for-multiple-queues-per-port.patch b/0001-dpif-Remove-support-for-multiple-queues-per-port.patch new file mode 100644 index 0000000..f433271 --- /dev/null +++ b/0001-dpif-Remove-support-for-multiple-queues-per-port.patch @@ -0,0 +1,228 @@ +From 769b50349f28c5f9e4bff102bc61dadcb9b99c37 Mon Sep 17 00:00:00 2001 +From: Ben Pfaff +Date: Tue, 25 Sep 2018 15:14:13 -0700 +Subject: [PATCH] dpif: Remove support for multiple queues per port. + +Commit 69c51582ff78 ("dpif-netlink: don't allocate per thread netlink +sockets") removed dpif-netlink support for multiple queues per port. +No remaining dpif provider supports multiple queues per port, so +remove infrastructure for the feature. 
+ +CC: Matteo Croce +Signed-off-by: Ben Pfaff +Tested-by: Yifeng Sun +Reviewed-by: Yifeng Sun +--- + lib/dpif-netlink.c | 9 ++++----- + lib/dpif-provider.h | 14 ++------------ + lib/dpif.c | 15 +++------------ + lib/dpif.h | 15 +-------------- + ofproto/ofproto-dpif-upcall.c | 7 +++---- + ofproto/ofproto-dpif-xlate.c | 6 ++---- + 6 files changed, 15 insertions(+), 51 deletions(-) + +diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c +index 4736d21d4..21315033c 100644 +--- a/lib/dpif-netlink.c ++++ b/lib/dpif-netlink.c +@@ -234,7 +234,7 @@ static bool ovs_tunnels_out_of_tree = true; + static int dpif_netlink_init(void); + static int open_dpif(const struct dpif_netlink_dp *, struct dpif **); + static uint32_t dpif_netlink_port_get_pid(const struct dpif *, +- odp_port_t port_no, uint32_t hash); ++ odp_port_t port_no); + static void dpif_netlink_handler_uninit(struct dpif_handler *handler); + static int dpif_netlink_refresh_channels(struct dpif_netlink *, + uint32_t n_handlers); +@@ -991,7 +991,7 @@ dpif_netlink_port_query_by_name(const struct dpif *dpif_, const char *devname, + + static uint32_t + dpif_netlink_port_get_pid__(const struct dpif_netlink *dpif, +- odp_port_t port_no, uint32_t hash OVS_UNUSED) ++ odp_port_t port_no) + OVS_REQ_RDLOCK(dpif->upcall_lock) + { + uint32_t port_idx = odp_to_u32(port_no); +@@ -1015,14 +1015,13 @@ dpif_netlink_port_get_pid__(const struct dpif_netlink *dpif, + } + + static uint32_t +-dpif_netlink_port_get_pid(const struct dpif *dpif_, odp_port_t port_no, +- uint32_t hash) ++dpif_netlink_port_get_pid(const struct dpif *dpif_, odp_port_t port_no) + { + const struct dpif_netlink *dpif = dpif_netlink_cast(dpif_); + uint32_t ret; + + fat_rwlock_rdlock(&dpif->upcall_lock); +- ret = dpif_netlink_port_get_pid__(dpif, port_no, hash); ++ ret = dpif_netlink_port_get_pid__(dpif, port_no); + fat_rwlock_unlock(&dpif->upcall_lock); + + return ret; +diff --git a/lib/dpif-provider.h b/lib/dpif-provider.h +index debdafc42..eb3ee50a6 100644 +--- 
a/lib/dpif-provider.h ++++ b/lib/dpif-provider.h +@@ -191,16 +191,7 @@ struct dpif_class { + + /* Returns the Netlink PID value to supply in OVS_ACTION_ATTR_USERSPACE + * actions as the OVS_USERSPACE_ATTR_PID attribute's value, for use in +- * flows whose packets arrived on port 'port_no'. In the case where the +- * provider allocates multiple Netlink PIDs to a single port, it may use +- * 'hash' to spread load among them. The caller need not use a particular +- * hash function; a 5-tuple hash is suitable. +- * +- * (The datapath implementation might use some different hash function for +- * distributing packets received via flow misses among PIDs. This means +- * that packets received via flow misses might be reordered relative to +- * packets received via userspace actions. This is not ordinarily a +- * problem.) ++ * flows whose packets arrived on port 'port_no'. + * + * A 'port_no' of UINT32_MAX should be treated as a special case. The + * implementation should return a reserved PID, not allocated to any port, +@@ -212,8 +203,7 @@ struct dpif_class { + * + * A dpif provider that doesn't have meaningful Netlink PIDs can use NULL + * for this function. This is equivalent to always returning 0. */ +- uint32_t (*port_get_pid)(const struct dpif *dpif, odp_port_t port_no, +- uint32_t hash); ++ uint32_t (*port_get_pid)(const struct dpif *dpif, odp_port_t port_no); + + /* Attempts to begin dumping the ports in a dpif. On success, returns 0 + * and initializes '*statep' with any data needed for iteration. On +diff --git a/lib/dpif.c b/lib/dpif.c +index 85cf9000e..4697a4dcd 100644 +--- a/lib/dpif.c ++++ b/lib/dpif.c +@@ -737,16 +737,7 @@ dpif_port_query_by_name(const struct dpif *dpif, const char *devname, + + /* Returns the Netlink PID value to supply in OVS_ACTION_ATTR_USERSPACE + * actions as the OVS_USERSPACE_ATTR_PID attribute's value, for use in +- * flows whose packets arrived on port 'port_no'. 
In the case where the +- * provider allocates multiple Netlink PIDs to a single port, it may use +- * 'hash' to spread load among them. The caller need not use a particular +- * hash function; a 5-tuple hash is suitable. +- * +- * (The datapath implementation might use some different hash function for +- * distributing packets received via flow misses among PIDs. This means +- * that packets received via flow misses might be reordered relative to +- * packets received via userspace actions. This is not ordinarily a +- * problem.) ++ * flows whose packets arrived on port 'port_no'. + * + * A 'port_no' of ODPP_NONE is a special case: it returns a reserved PID, not + * allocated to any port, that the client may use for special purposes. +@@ -757,10 +748,10 @@ dpif_port_query_by_name(const struct dpif *dpif, const char *devname, + * update all of the flows that it installed that contain + * OVS_ACTION_ATTR_USERSPACE actions. */ + uint32_t +-dpif_port_get_pid(const struct dpif *dpif, odp_port_t port_no, uint32_t hash) ++dpif_port_get_pid(const struct dpif *dpif, odp_port_t port_no) + { + return (dpif->dpif_class->port_get_pid +- ? (dpif->dpif_class->port_get_pid)(dpif, port_no, hash) ++ ? (dpif->dpif_class->port_get_pid)(dpif, port_no) + : 0); + } + +diff --git a/lib/dpif.h b/lib/dpif.h +index 8fdfe5f00..1a35cc410 100644 +--- a/lib/dpif.h ++++ b/lib/dpif.h +@@ -274,18 +274,6 @@ + * + * - Upcalls that specify the "special" Netlink PID are queued separately. + * +- * Multiple threads may want to read upcalls simultaneously from a single +- * datapath. To support multiple threads well, one extends the above preferred +- * behavior: +- * +- * - Each port has multiple PIDs. The datapath distributes "miss" upcalls +- * across the PIDs, ensuring that a given flow is mapped in a stable way +- * to a single PID. +- * +- * - For "action" upcalls, the thread can specify its own Netlink PID or +- * other threads' Netlink PID of the same port for offloading purpose +- * (e.g. 
in a "round robin" manner). +- * + * + * Packet Format + * ============= +@@ -470,8 +458,7 @@ int dpif_port_query_by_name(const struct dpif *, const char *devname, + struct dpif_port *); + int dpif_port_get_name(struct dpif *, odp_port_t port_no, + char *name, size_t name_size); +-uint32_t dpif_port_get_pid(const struct dpif *, odp_port_t port_no, +- uint32_t hash); ++uint32_t dpif_port_get_pid(const struct dpif *, odp_port_t port_no); + + struct dpif_port_dump { + const struct dpif *dpif; +diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c +index 62222079f..0cc964a7f 100644 +--- a/ofproto/ofproto-dpif-upcall.c ++++ b/ofproto/ofproto-dpif-upcall.c +@@ -1021,7 +1021,6 @@ classify_upcall(enum dpif_upcall_type type, const struct nlattr *userdata, + * initialized with at least 128 bytes of space. */ + static void + compose_slow_path(struct udpif *udpif, struct xlate_out *xout, +- const struct flow *flow, + odp_port_t odp_in_port, ofp_port_t ofp_in_port, + struct ofpbuf *buf, uint32_t meter_id, + struct uuid *ofproto_uuid) +@@ -1038,7 +1037,7 @@ compose_slow_path(struct udpif *udpif, struct xlate_out *xout, + port = xout->slow & (SLOW_CFM | SLOW_BFD | SLOW_LACP | SLOW_STP) + ? ODPP_NONE + : odp_in_port; +- pid = dpif_port_get_pid(udpif->dpif, port, flow_hash_5tuple(flow, 0)); ++ pid = dpif_port_get_pid(udpif->dpif, port); + + size_t offset; + size_t ac_offset; +@@ -1196,7 +1195,7 @@ upcall_xlate(struct udpif *udpif, struct upcall *upcall, + odp_actions->data, odp_actions->size); + } else { + /* upcall->put_actions already initialized by upcall_receive(). 
*/ +- compose_slow_path(udpif, &upcall->xout, upcall->flow, ++ compose_slow_path(udpif, &upcall->xout, + upcall->flow->in_port.odp_port, upcall->ofp_in_port, + &upcall->put_actions, + upcall->ofproto->up.slowpath_meter_id, +@@ -2155,7 +2154,7 @@ revalidate_ukey__(struct udpif *udpif, const struct udpif_key *ukey, + goto exit; + } + +- compose_slow_path(udpif, xoutp, &ctx.flow, ctx.flow.in_port.odp_port, ++ compose_slow_path(udpif, xoutp, ctx.flow.in_port.odp_port, + ofp_in_port, odp_actions, + ofproto->up.slowpath_meter_id, &ofproto->uuid); + } +diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c +index 6949595ba..f11f60468 100644 +--- a/ofproto/ofproto-dpif-xlate.c ++++ b/ofproto/ofproto-dpif-xlate.c +@@ -3084,8 +3084,7 @@ compose_sample_action(struct xlate_ctx *ctx, + + odp_port_t odp_port = ofp_port_to_odp_port( + ctx->xbridge, ctx->xin->flow.in_port.ofp_port); +- uint32_t pid = dpif_port_get_pid(ctx->xbridge->dpif, odp_port, +- flow_hash_5tuple(&ctx->xin->flow, 0)); ++ uint32_t pid = dpif_port_get_pid(ctx->xbridge->dpif, odp_port); + size_t cookie_offset = odp_put_userspace_action(pid, cookie, + sizeof *cookie, + tunnel_out_port, +@@ -4638,8 +4637,7 @@ put_controller_user_action(struct xlate_ctx *ctx, + + odp_port_t odp_port = ofp_port_to_odp_port(ctx->xbridge, + ctx->xin->flow.in_port.ofp_port); +- uint32_t pid = dpif_port_get_pid(ctx->xbridge->dpif, odp_port, +- flow_hash_5tuple(&ctx->xin->flow, 0)); ++ uint32_t pid = dpif_port_get_pid(ctx->xbridge->dpif, odp_port); + odp_put_userspace_action(pid, &cookie, sizeof cookie, ODPP_NONE, + false, ctx->odp_actions); + } +-- +2.17.1 + diff --git a/0001-dpif-netdev-Add-round-robin-based-rxq-to-pmd-assignm.patch b/0001-dpif-netdev-Add-round-robin-based-rxq-to-pmd-assignm.patch new file mode 100644 index 0000000..e6a3f59 --- /dev/null +++ b/0001-dpif-netdev-Add-round-robin-based-rxq-to-pmd-assignm.patch @@ -0,0 +1,310 @@ +From 57ce73db12f6d3e980c0b285015c998183f26c8d Mon Sep 17 00:00:00 2001 +From: 
Kevin Traynor +Date: Fri, 31 Aug 2018 09:47:55 +0100 +Subject: [PATCH] dpif-netdev: Add round-robin based rxq to pmd assignment. + +Prior to OVS 2.9 automatic assignment of Rxqs to PMDs +(i.e. CPUs) was done by round-robin. + +That was changed in OVS 2.9 to ordering the Rxqs based on +their measured processing cycles. This was to assign the +busiest Rxqs to different PMDs, improving aggregate +throughput. + +For the most part the new scheme should be better, but +there could be situations where a user prefers a simple +round-robin scheme because Rxqs from a single port are +more likely to be spread across multiple PMDs, and/or +traffic is very bursty/unpredictable. + +Add 'pmd-rxq-assign' config to allow a user to select +round-robin based assignment. + +Signed-off-by: Kevin Traynor +Acked-by: Eelco Chaudron +Acked-by: Ilya Maximets +Signed-off-by: Ian Stokes +--- + Documentation/topics/dpdk/pmd.rst | 33 +++++++++++++--- + NEWS | 4 +- + lib/dpif-netdev.c | 83 +++++++++++++++++++++++++++++---------- + tests/pmd.at | 12 +++++- + vswitchd/vswitch.xml | 24 +++++++++++ + 5 files changed, 126 insertions(+), 30 deletions(-) + +diff --git a/Documentation/topics/dpdk/pmd.rst b/Documentation/topics/dpdk/pmd.rst +index 5f0671e..dd9172d 100644 +--- a/Documentation/topics/dpdk/pmd.rst ++++ b/Documentation/topics/dpdk/pmd.rst +@@ -113,10 +113,15 @@ means that this thread will only poll the *pinned* Rx queues. + + If ``pmd-rxq-affinity`` is not set for Rx queues, they will be assigned to PMDs +-(cores) automatically. Where known, the processing cycles that have been stored +-for each Rx queue will be used to assign Rx queue to PMDs based on a round +-robin of the sorted Rx queues. For example, take the following example, where +-there are five Rx queues and three cores - 3, 7, and 8 - available and the +-measured usage of core cycles per Rx queue over the last interval is seen to +-be: ++(cores) automatically. 
++ ++The algorithm used to automatically assign Rxqs to PMDs can be set by:: ++ ++ $ ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-assign= ++ ++By default, ``cycles`` assignment is used where the Rxqs will be ordered by ++their measured processing cycles, and then be evenly assigned in descending ++order to PMDs based on an up/down walk of the PMDs. For example, where there ++are five Rx queues and three cores - 3, 7, and 8 - available and the measured ++usage of core cycles per Rx queue over the last interval is seen to be: + + - Queue #0: 30% +@@ -132,4 +137,20 @@ The Rx queues will be assigned to the cores in the following order:: + Core 8: Q3 (60%) | Q0 (30%) + ++Alternatively, ``roundrobin`` assignment can be used, where the Rxqs are ++assigned to PMDs in a round-robined fashion. This algorithm was used by ++default prior to OVS 2.9. For example, given the following ports and queues: ++ ++- Port #0 Queue #0 (P0Q0) ++- Port #0 Queue #1 (P0Q1) ++- Port #1 Queue #0 (P1Q0) ++- Port #1 Queue #1 (P1Q1) ++- Port #1 Queue #2 (P1Q2) ++ ++The Rx queues may be assigned to the cores in the following order:: ++ ++ Core 3: P0Q0 | P1Q1 ++ Core 7: P0Q1 | P1Q2 ++ Core 8: P1Q0 | ++ + To see the current measured usage history of PMD core cycles for each Rx + queue:: +diff --git a/NEWS b/NEWS +index 04de807..87da271 100644 +--- a/NEWS ++++ b/NEWS +@@ -43,6 +43,8 @@ + * Allow init to fail and record DPDK status/version in OVS database. + * Add experimental flow hardware offload support + * Support both shared and per port mempools for DPDK devices. ++ * Add option for simple round-robin based Rxq to PMD assignment. ++ It can be set with pmd-rxq-assign. 
+ - Userspace datapath: + * Commands ovs-appctl dpif-netdev/pmd-*-show can now work on a single PMD + * Detailed PMD performance metrics available with new command + +diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c +index 52b5bc2..466d5ac 100644 +--- a/lib/dpif-netdev.c ++++ b/lib/dpif-netdev.c +@@ -342,4 +342,6 @@ struct dp_netdev { + struct id_pool *tx_qid_pool; + struct ovs_mutex tx_qid_pool_mutex; ++ /* Use measured cycles for rxq to pmd assignment. */ ++ bool pmd_rxq_assign_cyc; + + /* Protects the access of the 'struct dp_netdev_pmd_thread' +@@ -1493,4 +1495,5 @@ create_dp_netdev(const char *name, const struct dpif_class *class, + + cmap_init(&dp->poll_threads); ++ dp->pmd_rxq_assign_cyc = true; + + ovs_mutex_init(&dp->tx_qid_pool_mutex); +@@ -3717,4 +3720,6 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config) + struct dp_netdev *dp = get_dp_netdev(dpif); + const char *cmask = smap_get(other_config, "pmd-cpu-mask"); ++ const char *pmd_rxq_assign = smap_get_def(other_config, "pmd-rxq-assign", ++ "cycles"); + unsigned long long insert_prob = + smap_get_ullong(other_config, "emc-insert-inv-prob", +@@ -3779,4 +3784,18 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config) + } + } ++ ++ bool pmd_rxq_assign_cyc = !strcmp(pmd_rxq_assign, "cycles"); ++ if (!pmd_rxq_assign_cyc && strcmp(pmd_rxq_assign, "roundrobin")) { ++ VLOG_WARN("Unsupported Rxq to PMD assignment mode in pmd-rxq-assign. " ++ "Defaulting to 'cycles'."); ++ pmd_rxq_assign_cyc = true; ++ pmd_rxq_assign = "cycles"; ++ } ++ if (dp->pmd_rxq_assign_cyc != pmd_rxq_assign_cyc) { ++ dp->pmd_rxq_assign_cyc = pmd_rxq_assign_cyc; ++ VLOG_INFO("Rxq to PMD assignment mode changed to: \'%s\'.", ++ pmd_rxq_assign); ++ dp_netdev_request_reconfigure(dp); ++ } + return 0; + } +@@ -4249,8 +4268,16 @@ rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr) + } + +-/* Returns the next pmd from the numa node in +- * incrementing or decrementing order. 
*/ ++/* ++ * Returns the next pmd from the numa node. ++ * ++ * If 'updown' is 'true' it will alternate between selecting the next pmd in ++ * either an up or down walk, switching between up/down when the first or last ++ * core is reached. e.g. 1,2,3,3,2,1,1,2... ++ * ++ * If 'updown' is 'false' it will select the next pmd wrapping around when last ++ * core reached. e.g. 1,2,3,1,2,3,1,2... ++ */ + static struct dp_netdev_pmd_thread * +-rr_numa_get_pmd(struct rr_numa *numa) ++rr_numa_get_pmd(struct rr_numa *numa, bool updown) + { + int numa_idx = numa->cur_index; +@@ -4260,5 +4287,9 @@ rr_numa_get_pmd(struct rr_numa *numa) + if (numa->cur_index == numa->n_pmds-1) { + /* Reached the last pmd. */ +- numa->idx_inc = false; ++ if (updown) { ++ numa->idx_inc = false; ++ } else { ++ numa->cur_index = 0; ++ } + } else { + numa->cur_index++; +@@ -4323,7 +4354,4 @@ compare_rxq_cycles(const void *a, const void *b) + * pmds to unpinned queues. + * +- * If 'pinned' is false queues will be sorted by processing cycles they are +- * consuming and then assigned to pmds in round robin order. +- * + * The function doesn't touch the pmd threads, it just stores the assignment + * in the 'pmd' member of each rxq. */ +@@ -4338,4 +4366,5 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex) + struct rr_numa *numa = NULL; + int numa_id; ++ bool assign_cyc = dp->pmd_rxq_assign_cyc; + + HMAP_FOR_EACH (port, node, &dp->ports) { +@@ -4368,10 +4397,13 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex) + rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1)); + } +- /* Sum the queue intervals and store the cycle history. */ +- for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) { +- cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i); +- } +- dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST, cycle_hist); + ++ if (assign_cyc) { ++ /* Sum the queue intervals and store the cycle history. 
*/ ++ for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) { ++ cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i); ++ } ++ dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST, ++ cycle_hist); ++ } + /* Store the queue. */ + rxqs[n_rxqs++] = q; +@@ -4380,5 +4412,5 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex) + } + +- if (n_rxqs > 1) { ++ if (n_rxqs > 1 && assign_cyc) { + /* Sort the queues in order of the processing cycles + * they consumed during their last pmd interval. */ +@@ -4404,5 +4436,5 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex) + continue; + } +- rxqs[i]->pmd = rr_numa_get_pmd(non_local_numa); ++ rxqs[i]->pmd = rr_numa_get_pmd(non_local_numa, assign_cyc); + VLOG_WARN("There's no available (non-isolated) pmd thread " + "on numa node %d. Queue %d on port \'%s\' will " +@@ -4413,11 +4445,20 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex) + rxqs[i]->pmd->core_id, rxqs[i]->pmd->numa_id); + } else { +- rxqs[i]->pmd = rr_numa_get_pmd(numa); +- VLOG_INFO("Core %d on numa node %d assigned port \'%s\' " +- "rx queue %d (measured processing cycles %"PRIu64").", +- rxqs[i]->pmd->core_id, numa_id, +- netdev_rxq_get_name(rxqs[i]->rx), +- netdev_rxq_get_queue_id(rxqs[i]->rx), +- dp_netdev_rxq_get_cycles(rxqs[i], RXQ_CYCLES_PROC_HIST)); ++ rxqs[i]->pmd = rr_numa_get_pmd(numa, assign_cyc); ++ if (assign_cyc) { ++ VLOG_INFO("Core %d on numa node %d assigned port \'%s\' " ++ "rx queue %d " ++ "(measured processing cycles %"PRIu64").", ++ rxqs[i]->pmd->core_id, numa_id, ++ netdev_rxq_get_name(rxqs[i]->rx), ++ netdev_rxq_get_queue_id(rxqs[i]->rx), ++ dp_netdev_rxq_get_cycles(rxqs[i], ++ RXQ_CYCLES_PROC_HIST)); ++ } else { ++ VLOG_INFO("Core %d on numa node %d assigned port \'%s\' " ++ "rx queue %d.", rxqs[i]->pmd->core_id, numa_id, ++ netdev_rxq_get_name(rxqs[i]->rx), ++ netdev_rxq_get_queue_id(rxqs[i]->rx)); ++ } + } + } +diff --git a/tests/pmd.at b/tests/pmd.at +index 
4cae6c8..1f952f3 100644 +--- a/tests/pmd.at ++++ b/tests/pmd.at +@@ -62,5 +62,6 @@ m4_define([CHECK_PMD_THREADS_CREATED], [ + + m4_define([SED_NUMA_CORE_PATTERN], ["s/\(numa_id \)[[0-9]]*\( core_id \)[[0-9]]*:/\1\2:/"]) +-m4_define([SED_NUMA_CORE_QUEUE_PATTERN], ["s/1 2 5 6//;s/0 3 4 7//"]) ++m4_define([SED_NUMA_CORE_QUEUE_CYC_PATTERN], ["s/1 2 5 6//;s/0 3 4 7//"]) ++m4_define([SED_NUMA_CORE_QUEUE_PQ_PATTERN], ["s/1 3 5 7//;s/0 2 4 6//"]) + m4_define([DUMMY_NUMA], [--dummy-numa="0,0,0,0"]) + +@@ -146,9 +147,16 @@ pmd thread numa_id core_id : + ]) + ++AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-assign=cycles]) + TMP=$(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]]) + AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x3]) + CHECK_PMD_THREADS_CREATED([2], [], [+$TMP]) + +-AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed ':a;/AVAIL$/{N;s/\n//;ba;}' | parse_pmd_rxq_show_group | sed SED_NUMA_CORE_QUEUE_PATTERN], [0], [dnl ++AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed ':a;/AVAIL$/{N;s/\n//;ba;}' | parse_pmd_rxq_show_group | sed SED_NUMA_CORE_QUEUE_CYC_PATTERN], [0], [dnl ++port: p0 queue-id: ++port: p0 queue-id: ++]) ++ ++AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-assign=roundrobin]) ++AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed ':a;/AVAIL$/{N;s/\n//;ba;}' | parse_pmd_rxq_show_group | sed SED_NUMA_CORE_QUEUE_PQ_PATTERN], [0], [dnl + port: p0 queue-id: + port: p0 queue-id: +diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml +index e318151..91d132d 100644 +--- a/vswitchd/vswitch.xml ++++ b/vswitchd/vswitch.xml +@@ -433,4 +433,28 @@ + + ++ ++

++ Specifies how RX queues will be automatically assigned to CPU cores. ++ Options: ++

++
cycles
++
Rxqs will be sorted by order of measured processing cycles ++ before being assigned to CPU cores.
++
roundrobin
++
Rxqs will be round-robined across CPU cores.
++
++

++

++ The default value is cycles. ++

++

++ Changing this value will trigger an automatic re-assignment of Rxqs to ++ CPUs. Note: Rxqs mapped to CPU cores with ++ pmd-rxq-affinity are unaffected. ++

++
++ + +-- +1.8.3.1 + diff --git a/0001-dpif-netdev-Avoid-reordering-of-packets-in-a-batch-w.patch b/0001-dpif-netdev-Avoid-reordering-of-packets-in-a-batch-w.patch new file mode 100644 index 0000000..34fe6a5 --- /dev/null +++ b/0001-dpif-netdev-Avoid-reordering-of-packets-in-a-batch-w.patch @@ -0,0 +1,357 @@ +From 9b4f08cdcaf253175edda088683bdd3db9e4c097 Mon Sep 17 00:00:00 2001 +From: Vishal Deep Ajmera +Date: Fri, 27 Jul 2018 23:56:37 +0530 +Subject: [PATCH] dpif-netdev: Avoid reordering of packets in a batch with same + megaflow + +OVS reads packets in batches from a given port and packets in the +batch are subjected to potentially 3 levels of lookups to identify +the datapath megaflow entry (or flow) associated with the packet. +Each megaflow entry has a dedicated buffer in which packets that match +the flow classification criteria are collected. This buffer helps OVS +perform batch processing for all packets associated with a given flow. + +Each packet in the received batch is first subjected to lookup in the +Exact Match Cache (EMC). Each EMC entry will point to a flow. If the +EMC lookup is successful, the packet is moved from the rx batch to the +per-flow buffer. + +Packets that did not match any EMC entry are rearranged in the rx batch +at the beginning and are now subjected to a lookup in the megaflow cache. +Packets that match a megaflow cache entry are *appended* to the per-flow +buffer. + +Packets that do not match any megaflow entry are subjected to slow-path +processing through the upcall mechanism. This cannot change the order of +packets as by definition upcall processing is only done for packets +without matching megaflow entry. + +The EMC entry match fields encompass all potentially significant header +fields, typically more than specified in the associated flow's match +criteria. Hence, multiple EMC entries can point to the same flow. 
Given +that per-flow batching happens at each lookup stage, packets belonging +to the same megaflow can get re-ordered because some packets match EMC +entries while others do not. + +The following example can illustrate the issue better. Consider the +following batch of packets (labelled P1 to P8) associated with a single +TCP connection and associated with a single flow. Let us assume that +packets with just the ACK bit set in TCP flags have been received in a +prior batch also and a corresponding EMC entry exists. + +1. P1 (TCP Flag: ACK) +2. P2 (TCP Flag: ACK) +3. P3 (TCP Flag: ACK) +4. P4 (TCP Flag: ACK, PSH) +5. P5 (TCP Flag: ACK) +6. P6 (TCP Flag: ACK) +7. P7 (TCP Flag: ACK) +8. P8 (TCP Flag: ACK) + +The megaflow classification criteria do not include TCP flags while +the EMC match criteria do. Thus, all packets other than P4 match +the existing EMC entry and are moved to the per-flow packet batch. +Subsequently, packet P4 is moved to the same per-flow packet batch as +a result of the megaflow lookup. Though the packets have all been +correctly classified as being associated with the same flow, the +packet order has not been preserved because of the per-flow batching +performed during the EMC lookup stage. This packet re-ordering has +performance implications for TCP applications. + +This patch preserves the packet ordering by performing the per-flow +batching after both the EMC and megaflow lookups are complete. As an +optimization, packets are flow-batched in emc processing till any +packet in the batch has an EMC miss. + +A new flow map is maintained to keep the original order of packets +along with flow information. Post fastpath processing, packets from +the flow map are *appended* to the per-flow buffer. 
+ +Signed-off-by: Vishal Deep Ajmera +Co-authored-by: Venkatesan Pradeep +Signed-off-by: Venkatesan Pradeep +Signed-off-by: Ian Stokes +--- + lib/dpif-netdev.c | 125 +++++++++++++++++++++++++++++++++++++++------- + 1 file changed, 106 insertions(+), 19 deletions(-) + +diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c +index 7f836bb18..807a46250 100644 +--- a/lib/dpif-netdev.c ++++ b/lib/dpif-netdev.c +@@ -244,6 +244,13 @@ struct dpcls_rule { + /* 'flow' must be the last field, additional space is allocated here. */ + }; + ++/* Data structure to keep packet order till fastpath processing. */ ++struct dp_packet_flow_map { ++ struct dp_packet *packet; ++ struct dp_netdev_flow *flow; ++ uint16_t tcp_flags; ++}; ++ + static void dpcls_init(struct dpcls *); + static void dpcls_destroy(struct dpcls *); + static void dpcls_sort_subtable_vector(struct dpcls *); +@@ -5765,6 +5772,19 @@ dp_netdev_queue_batches(struct dp_packet *pkt, + packet_batch_per_flow_update(batch, pkt, tcp_flags); + } + ++static inline void ++packet_enqueue_to_flow_map(struct dp_packet *packet, ++ struct dp_netdev_flow *flow, ++ uint16_t tcp_flags, ++ struct dp_packet_flow_map *flow_map, ++ size_t index) ++{ ++ struct dp_packet_flow_map *map = &flow_map[index]; ++ map->flow = flow; ++ map->packet = packet; ++ map->tcp_flags = tcp_flags; ++} ++ + /* SMC lookup function for a batch of packets. + * By doing batching SMC lookup, we can use prefetch + * to hide memory access latency. 
+@@ -5774,8 +5794,9 @@ smc_lookup_batch(struct dp_netdev_pmd_thread *pmd, + struct netdev_flow_key *keys, + struct netdev_flow_key **missed_keys, + struct dp_packet_batch *packets_, +- struct packet_batch_per_flow batches[], +- size_t *n_batches, const int cnt) ++ const int cnt, ++ struct dp_packet_flow_map *flow_map, ++ uint8_t *index_map) + { + int i; + struct dp_packet *packet; +@@ -5783,6 +5804,8 @@ smc_lookup_batch(struct dp_netdev_pmd_thread *pmd, + struct dfc_cache *cache = &pmd->flow_cache; + struct smc_cache *smc_cache = &cache->smc_cache; + const struct cmap_node *flow_node; ++ int recv_idx; ++ uint16_t tcp_flags; + + /* Prefetch buckets for all packets */ + for (i = 0; i < cnt; i++) { +@@ -5793,6 +5816,8 @@ smc_lookup_batch(struct dp_netdev_pmd_thread *pmd, + struct dp_netdev_flow *flow = NULL; + flow_node = smc_entry_get(pmd, keys[i].hash); + bool hit = false; ++ /* Get the original order of this packet in received batch. */ ++ recv_idx = index_map[i]; + + if (OVS_LIKELY(flow_node != NULL)) { + CMAP_NODE_FOR_EACH (flow, node, flow_node) { +@@ -5800,12 +5825,17 @@ smc_lookup_batch(struct dp_netdev_pmd_thread *pmd, + * number, we need to verify that the input ports match. */ + if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, &keys[i]) && + flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) { ++ tcp_flags = miniflow_get_tcp_flags(&keys[i].mf); ++ + /* SMC hit and emc miss, we insert into EMC */ + keys[i].len = + netdev_flow_key_size(miniflow_n_values(&keys[i].mf)); + emc_probabilistic_insert(pmd, &keys[i], flow); +- dp_netdev_queue_batches(packet, flow, +- miniflow_get_tcp_flags(&keys[i].mf), batches, n_batches); ++ /* Add these packets into the flow map in the same order ++ * as received. ++ */ ++ packet_enqueue_to_flow_map(packet, flow, tcp_flags, ++ flow_map, recv_idx); + n_smc_hit++; + hit = true; + break; +@@ -5819,6 +5849,10 @@ smc_lookup_batch(struct dp_netdev_pmd_thread *pmd, + /* SMC missed. 
Group missed packets together at + * the beginning of the 'packets' array. */ + dp_packet_batch_refill(packets_, packet, i); ++ ++ /* Preserve the order of packet for flow batching. */ ++ index_map[n_missed] = recv_idx; ++ + /* Put missed keys to the pointer arrays return to the caller */ + missed_keys[n_missed++] = &keys[i]; + } +@@ -5847,6 +5881,8 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, + struct netdev_flow_key *keys, + struct netdev_flow_key **missed_keys, + struct packet_batch_per_flow batches[], size_t *n_batches, ++ struct dp_packet_flow_map *flow_map, ++ size_t *n_flows, uint8_t *index_map, + bool md_is_valid, odp_port_t port_no) + { + struct netdev_flow_key *key = &keys[0]; +@@ -5858,6 +5894,8 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, + int i; + uint16_t tcp_flags; + bool smc_enable_db; ++ size_t map_cnt = 0; ++ bool batch_enable = true; + + atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db); + atomic_read_relaxed(&pmd->dp->emc_insert_min, &cur_min); +@@ -5888,10 +5926,19 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, + if ((*recirc_depth_get() == 0) && + dp_packet_has_flow_mark(packet, &mark)) { + flow = mark_to_flow_find(pmd, mark); +- if (flow) { ++ if (OVS_LIKELY(flow)) { + tcp_flags = parse_tcp_flags(packet); +- dp_netdev_queue_batches(packet, flow, tcp_flags, batches, +- n_batches); ++ if (OVS_LIKELY(batch_enable)) { ++ dp_netdev_queue_batches(packet, flow, tcp_flags, batches, ++ n_batches); ++ } else { ++ /* Flow batching should be performed only after fast-path ++ * processing is also completed for packets with emc miss ++ * or else it will result in reordering of packets with ++ * same datapath flows. 
*/ ++ packet_enqueue_to_flow_map(packet, flow, tcp_flags, ++ flow_map, map_cnt++); ++ } + continue; + } + } +@@ -5914,13 +5961,27 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, + } + if (OVS_LIKELY(flow)) { + tcp_flags = miniflow_get_tcp_flags(&key->mf); +- dp_netdev_queue_batches(packet, flow, tcp_flags, batches, +- n_batches); + n_emc_hit++; ++ if (OVS_LIKELY(batch_enable)) { ++ dp_netdev_queue_batches(packet, flow, tcp_flags, batches, ++ n_batches); ++ } else { ++ /* Flow batching should be performed only after fast-path ++ * processing is also completed for packets with emc miss ++ * or else it will result in reordering of packets with ++ * same datapath flows. */ ++ packet_enqueue_to_flow_map(packet, flow, tcp_flags, ++ flow_map, map_cnt++); ++ } + } else { + /* Exact match cache missed. Group missed packets together at + * the beginning of the 'packets' array. */ + dp_packet_batch_refill(packets_, packet, i); ++ ++ /* Preserve the order of packet for flow batching. */ ++ index_map[n_missed] = map_cnt; ++ flow_map[map_cnt++].flow = NULL; ++ + /* 'key[n_missed]' contains the key of the current packet and it + * will be passed to SMC lookup. The next key should be extracted + * to 'keys[n_missed + 1]'. +@@ -5928,8 +5989,13 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, + * which will be returned to the caller for future processing. */ + missed_keys[n_missed] = key; + key = &keys[++n_missed]; ++ ++ /* Skip batching for subsequent packets to avoid reordering. */ ++ batch_enable = false; + } + } ++ /* Count of packets which are not flow batched. 
*/ ++ *n_flows = map_cnt; + + pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, n_emc_hit); + +@@ -5938,8 +6004,8 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, + } + + /* Packets miss EMC will do a batch lookup in SMC if enabled */ +- smc_lookup_batch(pmd, keys, missed_keys, packets_, batches, +- n_batches, n_missed); ++ smc_lookup_batch(pmd, keys, missed_keys, packets_, ++ n_missed, flow_map, index_map); + + return dp_packet_batch_size(packets_); + } +@@ -6026,8 +6092,8 @@ static inline void + fast_path_processing(struct dp_netdev_pmd_thread *pmd, + struct dp_packet_batch *packets_, + struct netdev_flow_key **keys, +- struct packet_batch_per_flow batches[], +- size_t *n_batches, ++ struct dp_packet_flow_map *flow_map, ++ uint8_t *index_map, + odp_port_t in_port) + { + const size_t cnt = dp_packet_batch_size(packets_); +@@ -6107,6 +6173,9 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd, + + DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { + struct dp_netdev_flow *flow; ++ /* Get the original order of this packet in received batch. */ ++ int recv_idx = index_map[i]; ++ uint16_t tcp_flags; + + if (OVS_UNLIKELY(!rules[i])) { + continue; +@@ -6117,9 +6186,12 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd, + smc_insert(pmd, keys[i], hash); + + emc_probabilistic_insert(pmd, keys[i], flow); +- dp_netdev_queue_batches(packet, flow, +- miniflow_get_tcp_flags(&keys[i]->mf), +- batches, n_batches); ++ /* Add these packets into the flow map in the same order ++ * as received. 
++ */ ++ tcp_flags = miniflow_get_tcp_flags(&keys[i]->mf); ++ packet_enqueue_to_flow_map(packet, flow, tcp_flags, ++ flow_map, recv_idx); + } + + pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT, +@@ -6152,18 +6224,34 @@ dp_netdev_input__(struct dp_netdev_pmd_thread *pmd, + struct netdev_flow_key *missed_keys[PKT_ARRAY_SIZE]; + struct packet_batch_per_flow batches[PKT_ARRAY_SIZE]; + size_t n_batches; ++ struct dp_packet_flow_map flow_map[PKT_ARRAY_SIZE]; ++ uint8_t index_map[PKT_ARRAY_SIZE]; ++ size_t n_flows, i; ++ + odp_port_t in_port; + + n_batches = 0; + dfc_processing(pmd, packets, keys, missed_keys, batches, &n_batches, +- md_is_valid, port_no); ++ flow_map, &n_flows, index_map, md_is_valid, port_no); ++ + if (!dp_packet_batch_is_empty(packets)) { + /* Get ingress port from first packet's metadata. */ + in_port = packets->packets[0]->md.in_port.odp_port; + fast_path_processing(pmd, packets, missed_keys, +- batches, &n_batches, in_port); ++ flow_map, index_map, in_port); + } + ++ /* Batch rest of packets which are in flow map. */ ++ for (i = 0; i < n_flows; i++) { ++ struct dp_packet_flow_map *map = &flow_map[i]; ++ ++ if (OVS_UNLIKELY(!map->flow)) { ++ continue; ++ } ++ dp_netdev_queue_batches(map->packet, map->flow, map->tcp_flags, ++ batches, &n_batches); ++ } ++ + /* All the flow batches need to be reset before any call to + * packet_batch_per_flow_execute() as it could potentially trigger + * recirculation. When a packet matching flow ‘j’ happens to be +@@ -6173,7 +6261,6 @@ dp_netdev_input__(struct dp_netdev_pmd_thread *pmd, + * already its own batches[k] still waiting to be served. So if its + * ‘batch’ member is not reset, the recirculated packet would be wrongly + * appended to batches[k] of the 1st call to dp_netdev_input__(). 
*/ +- size_t i; + for (i = 0; i < n_batches; i++) { + batches[i].flow->batch = NULL; + } +-- +2.17.1 + diff --git a/0001-dpif-netlink-don-t-allocate-per-thread-netlink-socke.patch b/0001-dpif-netlink-don-t-allocate-per-thread-netlink-socke.patch new file mode 100644 index 0000000..7c77843 --- /dev/null +++ b/0001-dpif-netlink-don-t-allocate-per-thread-netlink-socke.patch @@ -0,0 +1,669 @@ +From 4c91bc3bf8c6005db5795fe51632c1feedc4719e Mon Sep 17 00:00:00 2001 +From: Matteo Croce +Date: Tue, 18 Sep 2018 14:56:37 +0200 +Subject: [PATCH v2] dpif-netlink: don't allocate per thread netlink sockets + +When using the kernel datapath, OVS allocates a pool of sockets to handle +netlink events. The number of sockets is: ports * n-handler-threads, where +n-handler-threads is user configurable and defaults to 3/4*number of cores. + +This because vswitchd starts n-handler-threads threads, each one with a +netlink socket for every port of the switch. Every thread then, starts +listening on events on its set of sockets with epoll(). + +On setup with lot of CPUs and ports, the number of sockets easily hits +the process file descriptor limit, and ovs-vswitchd will exit with -EMFILE. + +Change the number of allocated sockets to just one per port by moving +the socket array from a per handler structure to a per datapath one, +and let all the handlers share the same sockets by using EPOLLEXCLUSIVE +epoll flag which avoids duplicate events, on systems that support it. + +The patch was tested on a 56 core machine running Linux 4.18 and latest +Open vSwitch. A bridge was created with 2000+ ports, some of them being +veth interfaces with the peer outside the bridge. The latency of the upcall +is measured by setting a single 'action=controller,local' OpenFlow rule to +force all the packets going to the slow path and then to the local port. +A tool[1] injects some packets to the veth outside the bridge, and measures +the delay until the packet is captured on the local port. 
The rx timestamp +is taken from the socket ancillary data in the attribute SO_TIMESTAMPNS, to +avoid having the scheduler delay in the measured time. + +The first test measures the average latency for an upcall generated from +a single port. To measure it, 100k packets, one every msec, are sent to a +single port and the latencies are measured. + +The second test is meant to check latency fairness among ports, namely if +latency is equal between ports or if some ports have lower priority. +The previous test is repeated for every port, and the average of the average +latencies and the standard deviation between averages are measured. + +The third test serves to measure responsiveness under load. Heavy traffic +is sent through all ports, and latency and packet loss are measured +on a single idle port. + +The fourth test is all about fairness. Heavy traffic is injected in all +ports but one, and latency and packet loss are measured on the single idle port. + +This is the test setup: + + # nproc + 56 + # ovs-vsctl show |grep -c Port + 2223 + # ovs-ofctl dump-flows ovs_upc_br + cookie=0x0, duration=4.827s, table=0, n_packets=0, n_bytes=0, actions=CONTROLLER:65535,LOCAL + # uname -a + Linux fc28 4.18.7-200.fc28.x86_64 #1 SMP Mon Sep 10 15:44:45 UTC 2018 x86_64 x86_64 x86_64 GNU/Linux + +And these are the results of the tests: + + Stock OVS Patched + netlink sockets + in use by vswitchd + lsof -p $(pidof ovs-vswitchd) \ + |grep -c GENERIC 91187 2227 + + Test 1 + one port latency + min/avg/max/mdev (us) 2.7/6.6/238.7/1.8 1.6/6.8/160.6/1.7 + + Test 2 + all port + avg latency/mdev (us) 6.51/0.97 6.86/0.17 + + Test 3 + single port latency + under load + avg/mdev (us) 7.5/5.9 3.8/4.8 + packet loss 95 % 62 % + + Test 4 + idle port latency + under load + min/avg/max/mdev (us) 0.8/1.5/210.5/0.9 1.0/2.1/344.5/1.2 + packet loss 94 % 4 % + +CPU and RAM usage do not seem to be affected; the resource usage of vswitchd +idle with 2000+ ports is unchanged: + + # ps u $(pidof ovs-vswitchd) + USER PID %CPU 
%MEM VSZ RSS TTY STAT START TIME COMMAND + openvsw+ 5430 54.3 0.3 4263964 510968 pts/1 RLl+ 16:20 0:50 ovs-vswitchd + +Additionally, to check if vswitchd is thread safe with this patch, the +following test was run for circa 48 hours: on a 56 core machine, a +bridge with kernel datapath is filled with 2200 dummy interfaces and 22 +veth, then 22 traffic generators are run in parallel piping traffic into +the veths peers outside the bridge. +To generate as many upcalls as possible, all packets were forced to the +slowpath with an openflow rule like 'action=controller,local' and packet +size was set to 64 byte. Also, to avoid overflowing the FDB early and +slowing down the upcall processing, generated mac addresses were restricted +to a small interval. vswitchd ran without problems for 48+ hours, +obviously with all the handler threads with almost 99% CPU usage. + +[1] https://github.com/teknoraver/network-tools/blob/master/weed.c + +Signed-off-by: Matteo Croce +--- +v1 -> v2: + - define EPOLLEXCLUSIVE on systems with older kernel headers + - explain the thread safety test in the commit message + + lib/dpif-netlink.c | 311 ++++++++++++--------------------------------- + 1 file changed, 82 insertions(+), 229 deletions(-) + +diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c +index e6d5a6ec5..bb565ffee 100644 +--- a/lib/dpif-netlink.c ++++ b/lib/dpif-netlink.c +@@ -78,6 +78,10 @@ enum { MAX_PORTS = USHRT_MAX }; + #define FLOW_DUMP_MAX_BATCH 50 + #define OPERATE_MAX_OPS 50 + ++#ifndef EPOLLEXCLUSIVE ++#define EPOLLEXCLUSIVE (1u << 28) ++#endif ++ + struct dpif_netlink_dp { + /* Generic Netlink header. */ + uint8_t cmd; +@@ -170,7 +174,6 @@ struct dpif_windows_vport_sock { + #endif + + struct dpif_handler { +- struct dpif_channel *channels;/* Array of channels for each handler. */ + struct epoll_event *epoll_events; + int epoll_fd; /* epoll fd that includes channel socks. */ + int n_events; /* Num events returned by epoll_wait(). 
*/ +@@ -193,6 +196,7 @@ struct dpif_netlink { + struct fat_rwlock upcall_lock; + struct dpif_handler *handlers; + uint32_t n_handlers; /* Num of upcall handlers. */ ++ struct dpif_channel *channels; /* Array of channels for each port. */ + int uc_array_size; /* Size of 'handler->channels' and */ + /* 'handler->epoll_events'. */ + +@@ -331,43 +335,6 @@ open_dpif(const struct dpif_netlink_dp *dp, struct dpif **dpifp) + return 0; + } + +-/* Destroys the netlink sockets pointed by the elements in 'socksp' +- * and frees the 'socksp'. */ +-static void +-vport_del_socksp__(struct nl_sock **socksp, uint32_t n_socks) +-{ +- size_t i; +- +- for (i = 0; i < n_socks; i++) { +- nl_sock_destroy(socksp[i]); +- } +- +- free(socksp); +-} +- +-/* Creates an array of netlink sockets. Returns an array of the +- * corresponding pointers. Records the error in 'error'. */ +-static struct nl_sock ** +-vport_create_socksp__(uint32_t n_socks, int *error) +-{ +- struct nl_sock **socksp = xzalloc(n_socks * sizeof *socksp); +- size_t i; +- +- for (i = 0; i < n_socks; i++) { +- *error = nl_sock_create(NETLINK_GENERIC, &socksp[i]); +- if (*error) { +- goto error; +- } +- } +- +- return socksp; +- +-error: +- vport_del_socksp__(socksp, n_socks); +- +- return NULL; +-} +- + #ifdef _WIN32 + static void + vport_delete_sock_pool(struct dpif_handler *handler) +@@ -422,129 +389,34 @@ error: + vport_delete_sock_pool(handler); + return error; + } +- +-/* Returns an array pointers to netlink sockets. The sockets are picked from a +- * pool. Records the error in 'error'. */ +-static struct nl_sock ** +-vport_create_socksp_windows(struct dpif_netlink *dpif, int *error) +- OVS_REQ_WRLOCK(dpif->upcall_lock) +-{ +- uint32_t n_socks = dpif->n_handlers; +- struct nl_sock **socksp; +- size_t i; +- +- ovs_assert(n_socks <= 1); +- socksp = xzalloc(n_socks * sizeof *socksp); +- +- /* Pick netlink sockets to use in a round-robin fashion from each +- * handler's pool of sockets. 
*/ +- for (i = 0; i < n_socks; i++) { +- struct dpif_handler *handler = &dpif->handlers[i]; +- struct dpif_windows_vport_sock *sock_pool = handler->vport_sock_pool; +- size_t index = handler->last_used_pool_idx; +- +- /* A pool of sockets is allocated when the handler is initialized. */ +- if (sock_pool == NULL) { +- free(socksp); +- *error = EINVAL; +- return NULL; +- } +- +- ovs_assert(index < VPORT_SOCK_POOL_SIZE); +- socksp[i] = sock_pool[index].nl_sock; +- socksp[i] = sock_pool[index].nl_sock; +- ovs_assert(socksp[i]); +- index = (index == VPORT_SOCK_POOL_SIZE - 1) ? 0 : index + 1; +- handler->last_used_pool_idx = index; +- } +- +- return socksp; +-} +- +-static void +-vport_del_socksp_windows(struct dpif_netlink *dpif, struct nl_sock **socksp) +-{ +- free(socksp); +-} + #endif /* _WIN32 */ + +-static struct nl_sock ** +-vport_create_socksp(struct dpif_netlink *dpif, int *error) +-{ +-#ifdef _WIN32 +- return vport_create_socksp_windows(dpif, error); +-#else +- return vport_create_socksp__(dpif->n_handlers, error); +-#endif +-} +- +-static void +-vport_del_socksp(struct dpif_netlink *dpif, struct nl_sock **socksp) +-{ +-#ifdef _WIN32 +- vport_del_socksp_windows(dpif, socksp); +-#else +- vport_del_socksp__(socksp, dpif->n_handlers); +-#endif +-} +- +-/* Given the array of pointers to netlink sockets 'socksp', returns +- * the array of corresponding pids. If the 'socksp' is NULL, returns +- * a single-element array of value 0. */ +-static uint32_t * +-vport_socksp_to_pids(struct nl_sock **socksp, uint32_t n_socks) +-{ +- uint32_t *pids; +- +- if (!socksp) { +- pids = xzalloc(sizeof *pids); +- } else { +- size_t i; +- +- pids = xzalloc(n_socks * sizeof *pids); +- for (i = 0; i < n_socks; i++) { +- pids[i] = nl_sock_pid(socksp[i]); +- } +- } +- +- return pids; +-} +- +-/* Given the port number 'port_idx', extracts the pids of netlink sockets +- * associated to the port and assigns it to 'upcall_pids'. 
*/ ++/* Given the port number 'port_idx', extracts the pid of netlink socket ++ * associated to the port and assigns it to 'upcall_pid'. */ + static bool +-vport_get_pids(struct dpif_netlink *dpif, uint32_t port_idx, +- uint32_t **upcall_pids) ++vport_get_pid(struct dpif_netlink *dpif, uint32_t port_idx, ++ uint32_t *upcall_pid) + { +- uint32_t *pids; +- size_t i; +- + /* Since the nl_sock can only be assigned in either all +- * or none "dpif->handlers" channels, the following check ++ * or none "dpif" channels, the following check + * would suffice. */ +- if (!dpif->handlers[0].channels[port_idx].sock) { ++ if (!dpif->channels[port_idx].sock) { + return false; + } + ovs_assert(!WINDOWS || dpif->n_handlers <= 1); + +- pids = xzalloc(dpif->n_handlers * sizeof *pids); +- +- for (i = 0; i < dpif->n_handlers; i++) { +- pids[i] = nl_sock_pid(dpif->handlers[i].channels[port_idx].sock); +- } +- +- *upcall_pids = pids; ++ *upcall_pid = nl_sock_pid(dpif->channels[port_idx].sock); + + return true; + } + + static int +-vport_add_channels(struct dpif_netlink *dpif, odp_port_t port_no, +- struct nl_sock **socksp) ++vport_add_channel(struct dpif_netlink *dpif, odp_port_t port_no, ++ struct nl_sock *socksp) + { + struct epoll_event event; + uint32_t port_idx = odp_to_u32(port_no); +- size_t i, j; ++ size_t i; + int error; + + if (dpif->handlers == NULL) { +@@ -553,7 +425,7 @@ vport_add_channels(struct dpif_netlink *dpif, odp_port_t port_no, + + /* We assume that the datapath densely chooses port numbers, which can + * therefore be used as an index into 'channels' and 'epoll_events' of +- * 'dpif->handler'. */ ++ * 'dpif'. 
*/ + if (port_idx >= dpif->uc_array_size) { + uint32_t new_size = port_idx + 1; + +@@ -563,15 +435,15 @@ vport_add_channels(struct dpif_netlink *dpif, odp_port_t port_no, + return EFBIG; + } + +- for (i = 0; i < dpif->n_handlers; i++) { +- struct dpif_handler *handler = &dpif->handlers[i]; ++ dpif->channels = xrealloc(dpif->channels, ++ new_size * sizeof *dpif->channels); + +- handler->channels = xrealloc(handler->channels, +- new_size * sizeof *handler->channels); ++ for (i = dpif->uc_array_size; i < new_size; i++) { ++ dpif->channels[i].sock = NULL; ++ } + +- for (j = dpif->uc_array_size; j < new_size; j++) { +- handler->channels[j].sock = NULL; +- } ++ for (i = 0; i < dpif->n_handlers; i++) { ++ struct dpif_handler *handler = &dpif->handlers[i]; + + handler->epoll_events = xrealloc(handler->epoll_events, + new_size * sizeof *handler->epoll_events); +@@ -581,33 +453,33 @@ vport_add_channels(struct dpif_netlink *dpif, odp_port_t port_no, + } + + memset(&event, 0, sizeof event); +- event.events = EPOLLIN; ++ event.events = EPOLLIN | EPOLLEXCLUSIVE; + event.data.u32 = port_idx; + + for (i = 0; i < dpif->n_handlers; i++) { + struct dpif_handler *handler = &dpif->handlers[i]; + + #ifndef _WIN32 +- if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(socksp[i]), ++ if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(socksp), + &event) < 0) { + error = errno; + goto error; + } + #endif +- dpif->handlers[i].channels[port_idx].sock = socksp[i]; +- dpif->handlers[i].channels[port_idx].last_poll = LLONG_MIN; + } ++ dpif->channels[port_idx].sock = socksp; ++ dpif->channels[port_idx].last_poll = LLONG_MIN; + + return 0; + + error: +- for (j = 0; j < i; j++) { + #ifndef _WIN32 +- epoll_ctl(dpif->handlers[j].epoll_fd, EPOLL_CTL_DEL, +- nl_sock_fd(socksp[j]), NULL); +-#endif +- dpif->handlers[j].channels[port_idx].sock = NULL; ++ while (i--) { ++ epoll_ctl(dpif->handlers[i].epoll_fd, EPOLL_CTL_DEL, ++ nl_sock_fd(socksp), NULL); + } ++#endif ++ 
dpif->channels[port_idx].sock = NULL; + + return error; + } +@@ -618,14 +490,8 @@ vport_del_channels(struct dpif_netlink *dpif, odp_port_t port_no) + uint32_t port_idx = odp_to_u32(port_no); + size_t i; + +- if (!dpif->handlers || port_idx >= dpif->uc_array_size) { +- return; +- } +- +- /* Since the sock can only be assigned in either all or none +- * of "dpif->handlers" channels, the following check would +- * suffice. */ +- if (!dpif->handlers[0].channels[port_idx].sock) { ++ if (!dpif->handlers || port_idx >= dpif->uc_array_size ++ || !dpif->channels[port_idx].sock) { + return; + } + +@@ -633,12 +499,14 @@ vport_del_channels(struct dpif_netlink *dpif, odp_port_t port_no) + struct dpif_handler *handler = &dpif->handlers[i]; + #ifndef _WIN32 + epoll_ctl(handler->epoll_fd, EPOLL_CTL_DEL, +- nl_sock_fd(handler->channels[port_idx].sock), NULL); +- nl_sock_destroy(handler->channels[port_idx].sock); ++ nl_sock_fd(dpif->channels[port_idx].sock), NULL); + #endif +- handler->channels[port_idx].sock = NULL; + handler->event_offset = handler->n_events = 0; + } ++#ifndef _WIN32 ++ nl_sock_destroy(dpif->channels[port_idx].sock); ++#endif ++ dpif->channels[port_idx].sock = NULL; + } + + static void +@@ -655,10 +523,7 @@ destroy_all_channels(struct dpif_netlink *dpif) + struct dpif_netlink_vport vport_request; + uint32_t upcall_pids = 0; + +- /* Since the sock can only be assigned in either all or none +- * of "dpif->handlers" channels, the following check would +- * suffice. 
*/ +- if (!dpif->handlers[0].channels[i].sock) { ++ if (!dpif->channels[i].sock) { + continue; + } + +@@ -679,11 +544,11 @@ destroy_all_channels(struct dpif_netlink *dpif) + + dpif_netlink_handler_uninit(handler); + free(handler->epoll_events); +- free(handler->channels); + } +- ++ free(dpif->channels); + free(dpif->handlers); + dpif->handlers = NULL; ++ dpif->channels = NULL; + dpif->n_handlers = 0; + dpif->uc_array_size = 0; + } +@@ -846,13 +711,12 @@ dpif_netlink_port_add__(struct dpif_netlink *dpif, const char *name, + { + struct dpif_netlink_vport request, reply; + struct ofpbuf *buf; +- struct nl_sock **socksp = NULL; +- uint32_t *upcall_pids; ++ struct nl_sock *socksp = NULL; ++ uint32_t upcall_pids; + int error = 0; + + if (dpif->handlers) { +- socksp = vport_create_socksp(dpif, &error); +- if (!socksp) { ++ if (nl_sock_create(NETLINK_GENERIC, &socksp)) { + return error; + } + } +@@ -864,9 +728,9 @@ dpif_netlink_port_add__(struct dpif_netlink *dpif, const char *name, + request.name = name; + + request.port_no = *port_nop; +- upcall_pids = vport_socksp_to_pids(socksp, dpif->n_handlers); +- request.n_upcall_pids = socksp ? dpif->n_handlers : 1; +- request.upcall_pids = upcall_pids; ++ upcall_pids = nl_sock_pid(socksp); ++ request.n_upcall_pids = 1; ++ request.upcall_pids = &upcall_pids; + + if (options) { + request.options = options->data; +@@ -882,31 +746,27 @@ dpif_netlink_port_add__(struct dpif_netlink *dpif, const char *name, + dpif_name(&dpif->dpif), *port_nop); + } + +- vport_del_socksp(dpif, socksp); ++ nl_sock_destroy(socksp); + goto exit; + } + +- if (socksp) { +- error = vport_add_channels(dpif, *port_nop, socksp); +- if (error) { +- VLOG_INFO("%s: could not add channel for port %s", +- dpif_name(&dpif->dpif), name); +- +- /* Delete the port. 
*/ +- dpif_netlink_vport_init(&request); +- request.cmd = OVS_VPORT_CMD_DEL; +- request.dp_ifindex = dpif->dp_ifindex; +- request.port_no = *port_nop; +- dpif_netlink_vport_transact(&request, NULL, NULL); +- vport_del_socksp(dpif, socksp); +- goto exit; +- } ++ error = vport_add_channel(dpif, *port_nop, socksp); ++ if (error) { ++ VLOG_INFO("%s: could not add channel for port %s", ++ dpif_name(&dpif->dpif), name); ++ ++ /* Delete the port. */ ++ dpif_netlink_vport_init(&request); ++ request.cmd = OVS_VPORT_CMD_DEL; ++ request.dp_ifindex = dpif->dp_ifindex; ++ request.port_no = *port_nop; ++ dpif_netlink_vport_transact(&request, NULL, NULL); ++ nl_sock_destroy(socksp); ++ goto exit; + } +- free(socksp); + + exit: + ofpbuf_delete(buf); +- free(upcall_pids); + + return error; + } +@@ -1131,7 +991,7 @@ dpif_netlink_port_query_by_name(const struct dpif *dpif_, const char *devname, + + static uint32_t + dpif_netlink_port_get_pid__(const struct dpif_netlink *dpif, +- odp_port_t port_no, uint32_t hash) ++ odp_port_t port_no, uint32_t hash OVS_UNUSED) + OVS_REQ_RDLOCK(dpif->upcall_lock) + { + uint32_t port_idx = odp_to_u32(port_no); +@@ -1141,14 +1001,13 @@ dpif_netlink_port_get_pid__(const struct dpif_netlink *dpif, + /* The ODPP_NONE "reserved" port number uses the "ovs-system"'s + * channel, since it is not heavily loaded. */ + uint32_t idx = port_idx >= dpif->uc_array_size ? 0 : port_idx; +- struct dpif_handler *h = &dpif->handlers[hash % dpif->n_handlers]; + + /* Needs to check in case the socket pointer is changed in between + * the holding of upcall_lock. A known case happens when the main + * thread deletes the vport while the handler thread is handling + * the upcall from that port. 
*/ +- if (h->channels[idx].sock) { +- pid = nl_sock_pid(h->channels[idx].sock); ++ if (dpif->channels[idx].sock) { ++ pid = nl_sock_pid(dpif->channels[idx].sock); + } + } + +@@ -2382,42 +2241,40 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers) + dpif_netlink_port_dump_start__(dpif, &dump); + while (!dpif_netlink_port_dump_next__(dpif, &dump, &vport, &buf)) { + uint32_t port_no = odp_to_u32(vport.port_no); +- uint32_t *upcall_pids = NULL; ++ uint32_t upcall_pid; + int error; + + if (port_no >= dpif->uc_array_size +- || !vport_get_pids(dpif, port_no, &upcall_pids)) { +- struct nl_sock **socksp = vport_create_socksp(dpif, &error); ++ || !vport_get_pid(dpif, port_no, &upcall_pid)) { ++ struct nl_sock *socksp; + +- if (!socksp) { ++ if (nl_sock_create(NETLINK_GENERIC, &socksp)) { + goto error; + } + +- error = vport_add_channels(dpif, vport.port_no, socksp); ++ error = vport_add_channel(dpif, vport.port_no, socksp); + if (error) { + VLOG_INFO("%s: could not add channels for port %s", + dpif_name(&dpif->dpif), vport.name); +- vport_del_socksp(dpif, socksp); ++ nl_sock_destroy(socksp); + retval = error; + goto error; + } +- upcall_pids = vport_socksp_to_pids(socksp, dpif->n_handlers); +- free(socksp); ++ upcall_pid = nl_sock_pid(socksp); + } + + /* Configure the vport to deliver misses to 'sock'. 
*/ + if (vport.upcall_pids[0] == 0 +- || vport.n_upcall_pids != dpif->n_handlers +- || memcmp(upcall_pids, vport.upcall_pids, n_handlers * sizeof +- *upcall_pids)) { ++ || vport.n_upcall_pids != 1 ++ || upcall_pid != vport.upcall_pids[0]) { + struct dpif_netlink_vport vport_request; + + dpif_netlink_vport_init(&vport_request); + vport_request.cmd = OVS_VPORT_CMD_SET; + vport_request.dp_ifindex = dpif->dp_ifindex; + vport_request.port_no = vport.port_no; +- vport_request.n_upcall_pids = dpif->n_handlers; +- vport_request.upcall_pids = upcall_pids; ++ vport_request.n_upcall_pids = 1; ++ vport_request.upcall_pids = &upcall_pid; + error = dpif_netlink_vport_transact(&vport_request, NULL, NULL); + if (error) { + VLOG_WARN_RL(&error_rl, +@@ -2438,11 +2295,9 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers) + if (port_no < keep_channels_nbits) { + bitmap_set1(keep_channels, port_no); + } +- free(upcall_pids); + continue; + + error: +- free(upcall_pids); + vport_del_channels(dpif, vport.port_no); + } + nl_dump_done(&dump); +@@ -2701,7 +2556,7 @@ dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id, + + while (handler->event_offset < handler->n_events) { + int idx = handler->epoll_events[handler->event_offset].data.u32; +- struct dpif_channel *ch = &dpif->handlers[handler_id].channels[idx]; ++ struct dpif_channel *ch = &dpif->channels[idx]; + + handler->event_offset++; + +@@ -2803,16 +2658,14 @@ dpif_netlink_recv_purge__(struct dpif_netlink *dpif) + OVS_REQ_WRLOCK(dpif->upcall_lock) + { + if (dpif->handlers) { +- size_t i, j; ++ size_t i; + ++ if (!dpif->channels[0].sock) { ++ return; ++ } + for (i = 0; i < dpif->uc_array_size; i++ ) { +- if (!dpif->handlers[0].channels[i].sock) { +- continue; +- } + +- for (j = 0; j < dpif->n_handlers; j++) { +- nl_sock_drain(dpif->handlers[j].channels[i].sock); +- } ++ nl_sock_drain(dpif->channels[i].sock); + } + } + } +-- +2.17.1 + diff --git 
a/0001-lib-netdev-tc-offloads-Fix-frag-first-later-translat.patch b/0001-lib-netdev-tc-offloads-Fix-frag-first-later-translat.patch deleted file mode 100644 index fa1a4a3..0000000 --- a/0001-lib-netdev-tc-offloads-Fix-frag-first-later-translat.patch +++ /dev/null @@ -1,54 +0,0 @@ -From 45a60c21fc17ba31199fa800cdce92cc1f17f06b Mon Sep 17 00:00:00 2001 -From: Roi Dayan -Date: Sun, 25 Mar 2018 12:11:48 +0300 -Subject: [PATCH 1/2] lib/netdev-tc-offloads: Fix frag first/later translation - -Fragment mask (any and later) always exists so we need to test -for FLOW_NW_FRAG_LATER only if the state is FLOW_NW_FRAG_ANY. -Before this fix we could pass frag no and first at the same time to TC -which is also not tested there for bad frag state. -This fix make sure we only pass frag first/later if is frag. - -Fixes: 83e866067ea6 ("netdev-tc-offloads: Add support for IP fragmentation") -Signed-off-by: Roi Dayan -Reviewed-by: Paul Blakey -Signed-off-by: Simon Horman ---- - lib/netdev-tc-offloads.c | 19 +++++++++++++------ - 1 file changed, 13 insertions(+), 6 deletions(-) - -diff --git a/lib/netdev-tc-offloads.c b/lib/netdev-tc-offloads.c -index f22415ee1..6db76801f 100644 ---- a/lib/netdev-tc-offloads.c -+++ b/lib/netdev-tc-offloads.c -@@ -948,14 +948,21 @@ netdev_tc_flow_put(struct netdev *netdev, struct match *match, - flower.key.ip_ttl = key->nw_ttl; - flower.mask.ip_ttl = mask->nw_ttl; - -- if (mask->nw_frag) { -- if (key->nw_frag & FLOW_NW_FRAG_ANY) -+ if (mask->nw_frag & FLOW_NW_FRAG_ANY) { -+ flower.mask.flags |= TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT; -+ -+ if (key->nw_frag & FLOW_NW_FRAG_ANY) { - flower.key.flags |= TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT; -- if (!(key->nw_frag & FLOW_NW_FRAG_LATER)) -- flower.key.flags |= TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST; - -- flower.mask.flags |= TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT; -- flower.mask.flags |= TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST; -+ if (mask->nw_frag & FLOW_NW_FRAG_LATER) { -+ flower.mask.flags |= TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST; -+ -+ 
if (!(key->nw_frag & FLOW_NW_FRAG_LATER)) { -+ flower.key.flags |= TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST; -+ } -+ } -+ } -+ - mask->nw_frag = 0; - } - --- -2.17.0 - diff --git a/0001-ofproto-dpif-Delete-system-tunnel-interface-when-rem.patch b/0001-ofproto-dpif-Delete-system-tunnel-interface-when-rem.patch deleted file mode 100644 index 0daca36..0000000 --- a/0001-ofproto-dpif-Delete-system-tunnel-interface-when-rem.patch +++ /dev/null @@ -1,50 +0,0 @@ -From f6193c08c47bfb4bc2b10114bcdea7ae6581b144 Mon Sep 17 00:00:00 2001 -From: "juyan@redhat.com" -Date: Wed, 25 Oct 2017 11:41:27 +0800 -Subject: [PATCH] ofproto-dpif: Delete system tunnel interface when remove ovs - bridge - -When a user adds the first tunnel of a given type (e.g. the first VXLAN -tunnel) to an OVS bridge, OVS adds a vport of the same type to the -kernel datapath that backs the bridge. There is the corresponding -expectation that, when the last tunnel of that type is removed from the -OVS bridges, OVS would remove the vport that represents it from the -backing kernel datapath, but OVS was not doing that. This commit fixes -the problem. - -There is not any major concern about the lingering tunnel interface, but -it's cleaner to delete it. 
- -Fixes: 921c370a9df5 ("dpif-netlink: Probe for out-of-tree tunnels, decides used interface") -Signed-off-by: JunhanYan -Signed-off-by: Ben Pfaff ---- - ofproto/ofproto-dpif.c | 5 +++++ - 1 file changed, 5 insertions(+) - -diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c -index 3365d4185..1a648c33f 100644 ---- a/ofproto/ofproto-dpif.c -+++ b/ofproto/ofproto-dpif.c -@@ -661,6 +661,8 @@ dealloc(struct ofproto *ofproto_) - static void - close_dpif_backer(struct dpif_backer *backer, bool del) - { -+ struct simap_node *node; -+ - ovs_assert(backer->refcount > 0); - - if (--backer->refcount) { -@@ -669,6 +671,9 @@ close_dpif_backer(struct dpif_backer *backer, bool del) - - udpif_destroy(backer->udpif); - -+ SIMAP_FOR_EACH (node, &backer->tnl_backers) { -+ dpif_port_del(backer->dpif, u32_to_odp(node->data), false); -+ } - simap_destroy(&backer->tnl_backers); - ovs_rwlock_destroy(&backer->odp_to_ofport_lock); - hmap_destroy(&backer->odp_to_ofport_map); --- -2.14.3 - diff --git a/0001-ovs-save-Don-t-always-include-the-default-flow-durin.patch b/0001-ovs-save-Don-t-always-include-the-default-flow-durin.patch new file mode 100644 index 0000000..24f31cf --- /dev/null +++ b/0001-ovs-save-Don-t-always-include-the-default-flow-durin.patch @@ -0,0 +1,38 @@ +From 949758946767ff79b4c3eb5eca755c6cf21643e3 Mon Sep 17 00:00:00 2001 +From: Timothy Redaelli +Date: Sun, 9 Sep 2018 14:20:02 +0200 +Subject: [PATCH] ovs-save: Don't always include the default flow during + restore + +Currently the default flow (actions=NORMAL) is present in the flow table after +the flow table is restored also when the default flow is removed. + +This commit changes the behaviour of the "ovs-save save-flows" command to use +"replace-flows" instead of "add-flows" to restore the flows. This is needed in +order to always have the new flow table as it was before restoring it. 
+ +Reported-by: Flavio Leitner +Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=1626096 +Signed-off-by: Timothy Redaelli +Acked-by: Flavio Leitner +Signed-off-by: Gurucharan Shetty +--- + utilities/ovs-save | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/utilities/ovs-save b/utilities/ovs-save +index ea8fb6a45..2294583d6 100755 +--- a/utilities/ovs-save ++++ b/utilities/ovs-save +@@ -121,7 +121,7 @@ save_flows () { + cnt++;printf "{class="$1",type="$2",len="$3"}->"$4}' + echo "'" + +- printf "%s" "ovs-ofctl -O $ofp_version add-flows ${bridge} " \ ++ printf "%s" "ovs-ofctl -O $ofp_version replace-flows ${bridge} " \ + "\"$workdir/$bridge.flows.dump\"" + + # If possible, use OpenFlow 1.4 atomic bundle transaction to add flows +-- +2.17.1 + diff --git a/0001-stream-ssl-Don-t-enable-new-TLS-versions-by-default.patch b/0001-stream-ssl-Don-t-enable-new-TLS-versions-by-default.patch deleted file mode 100644 index 77c3cce..0000000 --- a/0001-stream-ssl-Don-t-enable-new-TLS-versions-by-default.patch +++ /dev/null @@ -1,40 +0,0 @@ -From a6869520061696cb115afb7de0021556068d1134 Mon Sep 17 00:00:00 2001 -From: Timothy Redaelli -Date: Fri, 27 Jul 2018 16:29:40 +0200 -Subject: [PATCH 1/2] stream-ssl: Don't enable new TLS versions by default - -Currently protocol_flags is populated by the list of SSL and TLS -protocols by hand. This means that when a new TLS version is added to -openssl (in this case TLS v1.3 is added to openssl 1.1.1 beta) -ovsdb-server automatically enable support to it with the default ciphers. -This can be a security problem (since other ciphers can be enabled) and it -also makes a test (SSL db: implementation) to fail. - -This commit changes the 'protocol_flags' to use the list of all protocol -flags as provided by openssl library (SSL_OP_NO_SSL_MASK) so there is no -need to keep the list updated by hand. 
- -Signed-off-by: Timothy Redaelli -Signed-off-by: Ben Pfaff -(cherry picked from commit ab16d2c2871b82d1f71c652657791acd9ca51161) ---- - lib/stream-ssl.c | 3 +-- - 1 file changed, 1 insertion(+), 2 deletions(-) - -diff --git a/lib/stream-ssl.c b/lib/stream-ssl.c -index 278468083..95b0f106e 100644 ---- a/lib/stream-ssl.c -+++ b/lib/stream-ssl.c -@@ -1186,8 +1186,7 @@ stream_ssl_set_protocols(const char *arg) - } - - /* Start with all the flags off and turn them on as requested. */ -- long protocol_flags = SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3 | SSL_OP_NO_TLSv1; -- protocol_flags |= SSL_OP_NO_TLSv1_1 | SSL_OP_NO_TLSv1_2; -+ long protocol_flags = SSL_OP_NO_SSL_MASK; - - char *s = xstrdup(arg); - char *save_ptr = NULL; --- -2.17.1 - diff --git a/0002-lib-tc-Fix-sparse-warnings.patch b/0002-lib-tc-Fix-sparse-warnings.patch deleted file mode 100644 index 4a5ccb8..0000000 --- a/0002-lib-tc-Fix-sparse-warnings.patch +++ /dev/null @@ -1,50 +0,0 @@ -From 7e0f69b581705064e2fd767426c5227150a31e6f Mon Sep 17 00:00:00 2001 -From: Ian Stokes -Date: Wed, 21 Mar 2018 20:11:22 +0000 -Subject: [PATCH 2/2] lib/tc: Fix sparse warnings. - -"sparse" complains with the warning 'incorrect type in argument 1 -(different base types)' in function nl_parse_flower_ip when parsing a key -flag and in function nl_msg_put_flower_options when writing the key -flag. Fix this by using network byte order when reading and writing key -flags to netlink messages. 
- -Fixes: 83e86606 ("netdev-tc-offloads: Add support for IP fragmentation") -Signed-off-by: Ian Stokes -Signed-off-by: Ben Pfaff -Acked-by: Roi Dayan ---- - lib/tc.c | 9 +++++---- - 1 file changed, 5 insertions(+), 4 deletions(-) - -diff --git a/lib/tc.c b/lib/tc.c -index c446d8407..6daa44710 100644 ---- a/lib/tc.c -+++ b/lib/tc.c -@@ -377,8 +377,9 @@ nl_parse_flower_ip(struct nlattr **attrs, struct tc_flower *flower) { - } - - if (attrs[TCA_FLOWER_KEY_FLAGS_MASK]) { -- key->flags = ntohl(nl_attr_get_u32(attrs[TCA_FLOWER_KEY_FLAGS])); -- mask->flags = ntohl(nl_attr_get_u32(attrs[TCA_FLOWER_KEY_FLAGS_MASK])); -+ key->flags = ntohl(nl_attr_get_be32(attrs[TCA_FLOWER_KEY_FLAGS])); -+ mask->flags = -+ ntohl(nl_attr_get_be32(attrs[TCA_FLOWER_KEY_FLAGS_MASK])); - } - - if (attrs[TCA_FLOWER_KEY_IPV4_SRC_MASK]) { -@@ -1503,9 +1504,9 @@ nl_msg_put_flower_options(struct ofpbuf *request, struct tc_flower *flower) - } - - if (flower->mask.flags) { -- nl_msg_put_u32(request, TCA_FLOWER_KEY_FLAGS, -+ nl_msg_put_be32(request, TCA_FLOWER_KEY_FLAGS, - htonl(flower->key.flags)); -- nl_msg_put_u32(request, TCA_FLOWER_KEY_FLAGS_MASK, -+ nl_msg_put_be32(request, TCA_FLOWER_KEY_FLAGS_MASK, - htonl(flower->mask.flags)); - } - --- -2.17.0 - diff --git a/0002-netdev-tc-offloads-Add-support-for-IP-fragmentation.patch b/0002-netdev-tc-offloads-Add-support-for-IP-fragmentation.patch deleted file mode 100644 index 19de05a..0000000 --- a/0002-netdev-tc-offloads-Add-support-for-IP-fragmentation.patch +++ /dev/null @@ -1,185 +0,0 @@ -From a99f73a22e6303555af3f93535d03c7537da5a9a Mon Sep 17 00:00:00 2001 -From: Roi Dayan -Date: Mon, 12 Mar 2018 14:58:47 +0200 -Subject: [PATCH 2/2] netdev-tc-offloads: Add support for IP fragmentation - -Add support for frag no, first and later. 
- -Signed-off-by: Roi Dayan -Reviewed-by: Shahar Klein -Reviewed-by: Paul Blakey -Signed-off-by: Simon Horman ---- - acinclude.m4 | 6 +++--- - include/linux/pkt_cls.h | 5 +++-- - lib/netdev-tc-offloads.c | 38 ++++++++++++++++++++++++++++++++------ - lib/tc.c | 14 ++++++++++++++ - lib/tc.h | 1 + - 5 files changed, 53 insertions(+), 11 deletions(-) - -diff --git a/acinclude.m4 b/acinclude.m4 -index 176b93e8e..6a02f6527 100644 ---- a/acinclude.m4 -+++ b/acinclude.m4 -@@ -178,10 +178,10 @@ dnl Configure Linux tc compat. - AC_DEFUN([OVS_CHECK_LINUX_TC], [ - AC_COMPILE_IFELSE([ - AC_LANG_PROGRAM([#include ], [ -- int x = TCA_FLOWER_KEY_IP_TTL_MASK; -+ int x = TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST; - ])], -- [AC_DEFINE([HAVE_TCA_FLOWER_KEY_IP_TTL_MASK], [1], -- [Define to 1 if TCA_FLOWER_KEY_IP_TTL_MASK is avaiable.])]) -+ [AC_DEFINE([HAVE_TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST], [1], -+ [Define to 1 if TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST is avaiable.])]) - - AC_COMPILE_IFELSE([ - AC_LANG_PROGRAM([#include ], [ -diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h -index f7bc7ea70..60976f3f7 100644 ---- a/include/linux/pkt_cls.h -+++ b/include/linux/pkt_cls.h -@@ -1,7 +1,7 @@ - #ifndef __LINUX_PKT_CLS_WRAPPER_H - #define __LINUX_PKT_CLS_WRAPPER_H 1 - --#if defined(__KERNEL__) || defined(HAVE_TCA_FLOWER_KEY_IP_TTL_MASK) -+#if defined(__KERNEL__) || defined(HAVE_TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST) - #include_next - #else - -@@ -201,8 +201,9 @@ enum { - - enum { - TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0), -+ TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1), - }; - --#endif /* __KERNEL__ || !HAVE_TCA_FLOWER_KEY_IP_TTL_MASK */ -+#endif /* __KERNEL__ || !HAVE_TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST */ - - #endif /* __LINUX_PKT_CLS_WRAPPER_H */ -diff --git a/lib/netdev-tc-offloads.c b/lib/netdev-tc-offloads.c -index 9364d94f0..f22415ee1 100644 ---- a/lib/netdev-tc-offloads.c -+++ b/lib/netdev-tc-offloads.c -@@ -428,6 +428,27 @@ parse_tc_flower_to_match(struct tc_flower *flower, - - 
match_set_nw_ttl_masked(match, key->ip_ttl, mask->ip_ttl); - -+ if (mask->flags) { -+ uint8_t flags = 0; -+ uint8_t flags_mask = 0; -+ -+ if (mask->flags & TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT) { -+ if (key->flags & TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT) { -+ flags |= FLOW_NW_FRAG_ANY; -+ } -+ flags_mask |= FLOW_NW_FRAG_ANY; -+ } -+ -+ if (mask->flags & TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST) { -+ if (!(key->flags & TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST)) { -+ flags |= FLOW_NW_FRAG_LATER; -+ } -+ flags_mask |= FLOW_NW_FRAG_LATER; -+ } -+ -+ match_set_nw_frag_masked(match, flags, flags_mask); -+ } -+ - match_set_nw_src_masked(match, key->ipv4.ipv4_src, mask->ipv4.ipv4_src); - match_set_nw_dst_masked(match, key->ipv4.ipv4_dst, mask->ipv4.ipv4_dst); - -@@ -780,11 +801,6 @@ test_key_and_mask(struct match *match) - return EOPNOTSUPP; - } - -- if (mask->nw_frag) { -- VLOG_DBG_RL(&rl, "offloading attribute nw_frag isn't supported"); -- return EOPNOTSUPP; -- } -- - for (int i = 0; i < FLOW_MAX_MPLS_LABELS; i++) { - if (mask->mpls_lse[i]) { - VLOG_DBG_RL(&rl, "offloading attribute mpls_lse isn't supported"); -@@ -932,6 +948,17 @@ netdev_tc_flow_put(struct netdev *netdev, struct match *match, - flower.key.ip_ttl = key->nw_ttl; - flower.mask.ip_ttl = mask->nw_ttl; - -+ if (mask->nw_frag) { -+ if (key->nw_frag & FLOW_NW_FRAG_ANY) -+ flower.key.flags |= TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT; -+ if (!(key->nw_frag & FLOW_NW_FRAG_LATER)) -+ flower.key.flags |= TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST; -+ -+ flower.mask.flags |= TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT; -+ flower.mask.flags |= TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST; -+ mask->nw_frag = 0; -+ } -+ - if (key->nw_proto == IPPROTO_TCP) { - flower.key.tcp_dst = key->tp_dst; - flower.mask.tcp_dst = mask->tp_dst; -@@ -958,7 +985,6 @@ netdev_tc_flow_put(struct netdev *netdev, struct match *match, - mask->tp_dst = 0; - } - -- mask->nw_frag = 0; - mask->nw_tos = 0; - mask->nw_proto = 0; - mask->nw_ttl = 0; -diff --git a/lib/tc.c b/lib/tc.c -index 
b49bbe89b..c446d8407 100644 ---- a/lib/tc.c -+++ b/lib/tc.c -@@ -281,6 +281,8 @@ static const struct nl_policy tca_flower_policy[] = { - .optional = true, }, - [TCA_FLOWER_KEY_ENC_UDP_DST_PORT] = { .type = NL_A_U16, - .optional = true, }, -+ [TCA_FLOWER_KEY_FLAGS] = { .type = NL_A_BE32, .optional = true, }, -+ [TCA_FLOWER_KEY_FLAGS_MASK] = { .type = NL_A_BE32, .optional = true, }, - [TCA_FLOWER_KEY_IP_TTL] = { .type = NL_A_U8, - .optional = true, }, - [TCA_FLOWER_KEY_IP_TTL_MASK] = { .type = NL_A_U8, -@@ -374,6 +376,11 @@ nl_parse_flower_ip(struct nlattr **attrs, struct tc_flower *flower) { - mask->ip_proto = UINT8_MAX; - } - -+ if (attrs[TCA_FLOWER_KEY_FLAGS_MASK]) { -+ key->flags = ntohl(nl_attr_get_u32(attrs[TCA_FLOWER_KEY_FLAGS])); -+ mask->flags = ntohl(nl_attr_get_u32(attrs[TCA_FLOWER_KEY_FLAGS_MASK])); -+ } -+ - if (attrs[TCA_FLOWER_KEY_IPV4_SRC_MASK]) { - key->ipv4.ipv4_src = - nl_attr_get_be32(attrs[TCA_FLOWER_KEY_IPV4_SRC]); -@@ -1495,6 +1502,13 @@ nl_msg_put_flower_options(struct ofpbuf *request, struct tc_flower *flower) - flower->key.ip_proto); - } - -+ if (flower->mask.flags) { -+ nl_msg_put_u32(request, TCA_FLOWER_KEY_FLAGS, -+ htonl(flower->key.flags)); -+ nl_msg_put_u32(request, TCA_FLOWER_KEY_FLAGS_MASK, -+ htonl(flower->mask.flags)); -+ } -+ - if (flower->key.ip_proto == IPPROTO_UDP) { - FLOWER_PUT_MASKED_VALUE(udp_src, TCA_FLOWER_KEY_UDP_SRC); - FLOWER_PUT_MASKED_VALUE(udp_dst, TCA_FLOWER_KEY_UDP_DST); -diff --git a/lib/tc.h b/lib/tc.h -index 6af51c69b..4400a829e 100644 ---- a/lib/tc.h -+++ b/lib/tc.h -@@ -92,6 +92,7 @@ struct tc_flower_key { - - ovs_be16 encap_eth_type; - -+ uint8_t flags; - uint8_t ip_ttl; - - struct { --- -2.14.3 - diff --git a/0002-stream-ssl-Define-SSL_OP_NO_SSL_MASK-for-OpenSSL-ver.patch b/0002-stream-ssl-Define-SSL_OP_NO_SSL_MASK-for-OpenSSL-ver.patch deleted file mode 100644 index cf77159..0000000 --- a/0002-stream-ssl-Define-SSL_OP_NO_SSL_MASK-for-OpenSSL-ver.patch +++ /dev/null @@ -1,40 +0,0 @@ -From 
74f34a896ddaebce7eba66022be8868dd3b44d0a Mon Sep 17 00:00:00 2001 -From: Ben Pfaff -Date: Mon, 6 Aug 2018 15:39:44 -0700 -Subject: [PATCH 2/2] stream-ssl: Define SSL_OP_NO_SSL_MASK for OpenSSL - versions that lack it. - -10 of the travis builds are failing such as -TESTSUITE=1 KERNEL=3.16.54 for gcc and clang. - -Fixes: ab16d2c2871b ("stream-ssl: Don't enable new TLS versions by default") -CC: Timothy Redaelli -Signed-off-by: Darrell Ball -Signed-off-by: Ben Pfaff -Acked-by: Han Zhou -Acked-by: Darrell Ball -(cherry picked from commit ce679280889f0eb4ebc95b62558a20a7a5f7c0fb) ---- - lib/stream-ssl.c | 6 ++++++ - 1 file changed, 6 insertions(+) - -diff --git a/lib/stream-ssl.c b/lib/stream-ssl.c -index 95b0f106e..c64bb8f19 100644 ---- a/lib/stream-ssl.c -+++ b/lib/stream-ssl.c -@@ -1186,6 +1186,12 @@ stream_ssl_set_protocols(const char *arg) - } - - /* Start with all the flags off and turn them on as requested. */ -+#ifndef SSL_OP_NO_SSL_MASK -+ /* For old OpenSSL without this macro, this is the correct value. */ -+#define SSL_OP_NO_SSL_MASK (SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3 | \ -+ SSL_OP_NO_TLSv1 | SSL_OP_NO_TLSv1_1 | \ -+ SSL_OP_NO_TLSv1_2) -+#endif - long protocol_flags = SSL_OP_NO_SSL_MASK; - - char *s = xstrdup(arg); --- -2.17.1 - diff --git a/ofproto-dpif-xlate_Fix_translation_of_groups_with_no_bu.patch b/ofproto-dpif-xlate_Fix_translation_of_groups_with_no_bu.patch new file mode 100644 index 0000000..83656ba --- /dev/null +++ b/ofproto-dpif-xlate_Fix_translation_of_groups_with_no_bu.patch @@ -0,0 +1,41 @@ +Date: Sun, 2 Sep 2018 09:30:43 -0700 +From: Ben Pfaff +To: dev@openvswitch.org +Cc: Ben Pfaff +Subject: [ovs-dev] [PATCH] ofproto-dpif-xlate: Fix translation of groups with + no buckets. +Message-Id: <20180902163043.11210-1-blp@ovn.org> +List-Id: +X-Bogosity: Unsure, tests=bogofilter, spamicity=0.500000, version=1.2.4 + +A group can have no buckets, in which case ovs_list_back() assert-fails. +This fixes the problem. + +Found by OFTest. 
+ +Fixes: a04e58881e25 ("ofproto-dpif-xlate: Simplify translation for groups.") +Signed-off-by: Ben Pfaff +--- + ofproto/ofproto-dpif-xlate.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c +index e26f6c8f554a..507e14dd0d00 100644 +--- a/ofproto/ofproto-dpif-xlate.c ++++ b/ofproto/ofproto-dpif-xlate.c +@@ -4488,7 +4488,7 @@ xlate_group_action__(struct xlate_ctx *ctx, struct group_dpif *group, + bool is_last_action) + { + if (group->up.type == OFPGT11_ALL || group->up.type == OFPGT11_INDIRECT) { +- struct ovs_list *last_bucket = ovs_list_back(&group->up.buckets); ++ struct ovs_list *last_bucket = group->up.buckets.prev; + struct ofputil_bucket *bucket; + LIST_FOR_EACH (bucket, list_node, &group->up.buckets) { + bool is_last_bucket = &bucket->list_node == last_bucket; +-- +2.16.1 + +_______________________________________________ +dev mailing list +dev@openvswitch.org +https://mail.openvswitch.org/mailman/listinfo/ovs-dev diff --git a/openvswitch.spec b/openvswitch.spec index 5008f99..ba047be 100644 --- a/openvswitch.spec +++ b/openvswitch.spec @@ -39,8 +39,8 @@ Name: openvswitch Summary: Open vSwitch daemon/database/utilities URL: http://www.openvswitch.org/ -Version: 2.9.2 -Release: 6%{?commit0:.%{date}git%{shortcommit0}}%{?dist} +Version: 2.10.0 +Release: 1%{?commit0:.%{date}git%{shortcommit0}}%{?dist} # Nearly all of openvswitch is ASL 2.0. 
The bugtool is LGPLv2+, and the # lib/sflow*.[ch] files are SISSL @@ -64,21 +64,23 @@ ExclusiveArch: x86_64 aarch64 ppc64le s390x # ovs-patches # OVS (including OVN) backports (0 - 300) -Patch0: ovs-dev-ofproto-macros-Ignore-Dropped-log-messages-in-check_logs..patch -Patch10: 0001-ofproto-dpif-Delete-system-tunnel-interface-when-rem.patch -Patch41: 0002-netdev-tc-offloads-Add-support-for-IP-fragmentation.patch -Patch42: 0001-lib-netdev-tc-offloads-Fix-frag-first-later-translat.patch -Patch43: 0002-lib-tc-Fix-sparse-warnings.patch +Patch010: ofproto-dpif-xlate_Fix_translation_of_groups_with_no_bu.patch -Patch50: 0001-Add-ovs.compat-module-to-python-package.patch +Patch020: 0001-ovs-save-Don-t-always-include-the-default-flow-durin.patch +# Bug 1631797 +Patch030: 0001-dpif-netdev-Add-round-robin-based-rxq-to-pmd-assignm.patch -# Don't enable new TLS versions by default (needed since OpenSSL 1.1.1) -Patch310: 0001-stream-ssl-Don-t-enable-new-TLS-versions-by-default.patch -Patch311: 0002-stream-ssl-Define-SSL_OP_NO_SSL_MASK-for-OpenSSL-ver.patch +# Bug 1565205 +Patch040: 0001-dpif-netdev-Avoid-reordering-of-packets-in-a-batch-w.patch -Patch315: 0001-dhparams-Fix-.c-file-generation-with-OpenSSL-1.1.1-p.patch +# Bug 1634015 +Patch050: 0001-dpif-netlink-don-t-allocate-per-thread-netlink-socke.patch +Patch051: 0001-dpif-Remove-support-for-multiple-queues-per-port.patch + +# Bug 1635344 +Patch070: 0001-OVN-add-CT_LB-action-to-ovn-trace.patch BuildRequires: gcc-c++ BuildRequires: gcc @@ -89,6 +91,7 @@ BuildRequires: python2-devel python2-six BuildRequires: python3-devel python3-six BuildRequires: desktop-file-utils BuildRequires: groff-base graphviz +BuildRequires: unbound-devel # make check dependencies BuildRequires: procps-ng BuildRequires: python2-pyOpenSSL @@ -114,6 +117,8 @@ Requires: openssl iproute module-init-tools Requires(pre): shadow-utils Requires(post): /bin/sed +Requires(post): /usr/sbin/usermod +Requires(post): /usr/sbin/groupadd Requires(post): systemd-units 
Requires(preun): systemd-units Requires(postun): systemd-units @@ -572,6 +577,7 @@ chown -R openvswitch:openvswitch /etc/openvswitch %{_datadir}/openvswitch/scripts/ovs-save %{_datadir}/openvswitch/scripts/ovs-vtep %{_datadir}/openvswitch/scripts/ovs-ctl +%{_datadir}/openvswitch/scripts/ovs-kmod-ctl %{_datadir}/openvswitch/scripts/ovs-systemd-reload %config %{_datadir}/openvswitch/vswitch.ovsschema %config %{_datadir}/openvswitch/vtep.ovsschema @@ -604,13 +610,14 @@ chown -R openvswitch:openvswitch /etc/openvswitch %{_mandir}/man8/ovs-ctl.8* %{_mandir}/man8/ovs-dpctl.8* %{_mandir}/man8/ovs-dpctl-top.8* +%{_mandir}/man8/ovs-kmod-ctl.8.* %{_mandir}/man8/ovs-ofctl.8* %{_mandir}/man8/ovs-pki.8* %{_mandir}/man8/ovs-vsctl.8* %{_mandir}/man8/ovs-vswitchd.8* %{_mandir}/man8/ovs-parse-backtrace.8* %{_udevrulesdir}/91-vfio.rules -%doc COPYING NOTICE README.rst NEWS rhel/README.RHEL.rst +%doc LICENSE NOTICE README.rst NEWS rhel/README.RHEL.rst /var/lib/openvswitch %attr(750,openvswitch,openvswitch) /var/log/openvswitch %ghost %attr(755,root,root) %verify(not owner group) %{_rundir}/openvswitch @@ -661,6 +668,9 @@ chown -R openvswitch:openvswitch /etc/openvswitch %{_unitdir}/ovn-controller-vtep.service %changelog +* Fri Oct 05 2018 Timothy Redaelli - 2.10.0-1 +- Align with "Fast Datapath" 2.10.0-10 (#1633555) + * Fri Sep 14 2018 Timothy Redaelli - 2.9.2-6 - Backport "Add ovs.compat module to python package" (#1619712) - Backport a variant of "dhparams: Fix .c file generation with OpenSSL >= 1.1.1-pre9" diff --git a/ovs-dev-ofproto-macros-Ignore-Dropped-log-messages-in-check_logs..patch b/ovs-dev-ofproto-macros-Ignore-Dropped-log-messages-in-check_logs..patch deleted file mode 100644 index d7d5edc..0000000 --- a/ovs-dev-ofproto-macros-Ignore-Dropped-log-messages-in-check_logs..patch +++ /dev/null @@ -1,54 +0,0 @@ -From patchwork Tue Jul 3 18:32:18 2018 -Content-Type: text/plain; charset="utf-8" -MIME-Version: 1.0 -Content-Transfer-Encoding: 7bit -Subject: [ovs-dev] 
ofproto-macros: Ignore "Dropped # log messages" in - check_logs. -X-Patchwork-Submitter: Ben Pfaff -X-Patchwork-Id: 938851 -Message-Id: <20180703183218.32329-1-blp@ovn.org> -To: dev@openvswitch.org -Cc: Ben Pfaff -Date: Tue, 3 Jul 2018 11:32:18 -0700 -From: Ben Pfaff -List-Id: - -check_logs ignores some log messages, but it wasn't smart enough to ignore -the messages that said that the ignored messages had been rate-limited. -This fixes the problem. - -It's OK to ignore all rate-limiting messages because they only appear if at -least one message was not rate-limited, which check_logs will catch anyway. - -Reported-by: Timothy Redaelli -Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/046978.html -Signed-off-by: Ben Pfaff -Tested-By: Timothy Redaelli ---- - tests/ofproto-macros.at | 6 ++++++ - 1 file changed, 6 insertions(+) - -diff --git a/tests/ofproto-macros.at b/tests/ofproto-macros.at -index 7388a20a2236..2a56ae6e2f3e 100644 ---- a/tests/ofproto-macros.at -+++ b/tests/ofproto-macros.at -@@ -400,6 +400,11 @@ check_logs () { - # all "connection reset" warning logs for similar reasons (either EPIPE or - # ECONNRESET can be returned on a send depending on whether the peer had - # unconsumed data when it closed the socket). -+ # -+ # We also ignore "Dropped # log messages..." messages. Otherwise, even if -+ # we ignore the messages that were rate-limited, we can end up failing just -+ # because of the announcement that rate-limiting happened (and in a racy, -+ # timing-dependent way, too). 
- sed -n "$1 - /reset by peer/d - /Broken pipe/d -@@ -408,6 +413,7 @@ check_logs () { - /timeval.*disk: [[0-9]]* reads, [[0-9]]* writes/d - /timeval.*context switches: [[0-9]]* voluntary, [[0-9]]* involuntary/d - /ovs_rcu.*blocked [[0-9]]* ms waiting for .* to quiesce/d -+/Dropped [[0-9]]* log messages/d - /|WARN|/p - /|ERR|/p - /|EMER|/p" ${logs} diff --git a/sources b/sources index 9defd9e..7104884 100644 --- a/sources +++ b/sources @@ -1 +1 @@ -SHA512 (openvswitch-2.9.2.tar.gz) = 6122651fcdeb64793ae7cdd379f55f87ff1f989d6cb5ab67ad83125c566508b474e1875f024d2f2fc2159b90baf383d5e792cbf515a96266126f0f05511ceb0d +SHA512 (openvswitch-2.10.0.tar.gz) = f118c1c4ab4e126c3343023b03007ca9819c3c5a5ea42eaffaabdc7c50ecddede3e258574dbe0de95ed3be2e3d101612f5bdb423a7adb679987f4e501183a216