diff --git a/.github/buildomat/common.sh b/.github/buildomat/common.sh index b02113c2..0ef28786 100644 --- a/.github/buildomat/common.sh +++ b/.github/buildomat/common.sh @@ -1,14 +1,11 @@ #!/bin/bash -# The tofino2 has 20 stages, and the current sidecar.p4 needs all 20 of them. -# Specifying the number of stages isn't strictly necessary, but it allows us to -# track when we exceed the current ceiling. The underlying intention is to grow -# deliberately and thoughtfully, given the limited space on the ASIC. -# -# Note: this now seems silly since we have maxed out the number of stages, but -# we want to leave this check and note in place should we ever find a way to -# reduce our footprint below 20 stages. -TOFINO_STAGES=20 +# The tofino2 has 20 stages. With multicast as a default feature, sidecar.p4 +# needs 18 stages. Specifying the number of stages isn't strictly necessary, +# but it allows us to track when we exceed the current ceiling. The underlying +# intention is to grow deliberately and thoughtfully, given the limited space +# on the ASIC. 
+TOFINO_STAGES=18 # These describe which version of the SDE to download and where to find it SDE_COMMIT=e61fe02c3c1c384b2e212c90177fcea76a31fd4e diff --git a/.github/buildomat/jobs/image.sh b/.github/buildomat/jobs/image.sh index 02d128c6..2245d589 100755 --- a/.github/buildomat/jobs/image.sh +++ b/.github/buildomat/jobs/image.sh @@ -101,7 +101,7 @@ pfexec chown "$UID" /out banner "P4 Codegen" # Add gcc-12 so the p4 compiler can find cpp -PATH=/opt/gcc-12/bin:$PATH cargo xtask codegen --stages $TOFINO_STAGES +PATH=/opt/gcc-12/bin:$PATH cargo xtask codegen --stages $TOFINO_STAGES --multicast # Preserve all the diagnostics spit out by the compiler mkdir -p /out/p4c-diags diff --git a/.github/buildomat/jobs/multicast-test.sh b/.github/buildomat/jobs/multicast-test.sh index a6e1669f..ba0abcf1 100755 --- a/.github/buildomat/jobs/multicast-test.sh +++ b/.github/buildomat/jobs/multicast-test.sh @@ -30,5 +30,4 @@ set -o errexit set -o pipefail set -o xtrace -export MULTICAST=1 source .github/buildomat/packet-test-common.sh diff --git a/.github/buildomat/packet-test-common.sh b/.github/buildomat/packet-test-common.sh index bf6b2b47..addcfc40 100755 --- a/.github/buildomat/packet-test-common.sh +++ b/.github/buildomat/packet-test-common.sh @@ -3,21 +3,17 @@ export RUST_BACKTRACE=1 source .github/buildomat/common.sh source .github/buildomat/linux.sh -wd=`pwd` +wd=$(pwd) export WS=$wd MODEL_STARTUP_TIMEOUT=${MODEL_STARTUP_TIMEOUT:=5} STARTUP_TIMEOUT=${STARTUP_TIMEOUT:=120} -if [ x$MULTICAST == x ]; then - BUILD_FEATURES=tofino_asic - CODEGEN_FEATURES= - SWADM_FEATURES= - else - BUILD_FEATURES=tofino_asic,multicast - CODEGEN_FEATURES=--multicast - SWADM_FEATURES=--features=multicast -fi - +BUILD_FEATURES=tofino_asic +TOFINO_STAGES=18 + +CODEGEN_FEATURES=--multicast +SWADM_FEATURES="--features=multicast" + function cleanup { set +o errexit set +o pipefail @@ -66,29 +62,33 @@ fi banner "Test" sudo -E ./tools/veth_setup.sh -id=`id -un` -gr=`id -gn` +id=$(id -un) +gr=$(id -gn) sudo 
-E mkdir -p /work sudo -E chown $id:$gr /work -sudo -E ./tools/run_tofino_model.sh &> /work/simulator.log & +sudo -E ./tools/run_tofino_model.sh &>/work/simulator.log /dev/null || true sleep $MODEL_STARTUP_TIMEOUT -sudo -E ./tools/run_dpd.sh -m 127.0.0.1 &> /work/dpd.log & +sudo -E ./tools/run_dpd.sh -m 127.0.0.1 &>/work/dpd.log /dev/null || true echo "waiting for dpd to come online" set +o errexit SLEEP_TIME=5 -iters=$(( $STARTUP_TIMEOUT / $SLEEP_TIME )) -while [ 1 ] ; do - ./target/debug/swadm --host '[::1]' build-info 2> /dev/null - if [ $? == 0 ]; then - break - fi - iters=$(($iters - 1)) - if [ $iters = 0 ]; then - echo "dpd failed to come online in $STARTUP_TIMEOUT seconds" - exit 1 - fi - sleep $SLEEP_TIME +iters=$(($STARTUP_TIMEOUT / $SLEEP_TIME)) +while [ 1 ]; do + ./target/debug/swadm --host '[::1]' build-info 2>/dev/null + rc=$? + stty sane 2>/dev/null || true + if [ $rc == 0 ]; then + break + fi + iters=$(($iters - 1)) + if [ $iters = 0 ]; then + echo "dpd failed to come online in $STARTUP_TIMEOUT seconds" + exit 1 + fi + sleep $SLEEP_TIME done set -o errexit diff --git a/HEAD b/HEAD new file mode 100644 index 00000000..e69de29b diff --git a/aal/Cargo.toml b/aal/Cargo.toml index c2efa3d9..f5c6f62e 100644 --- a/aal/Cargo.toml +++ b/aal/Cargo.toml @@ -4,6 +4,7 @@ version = "0.1.0" edition = "2024" [features] +default = ["multicast"] multicast = [] [dependencies] diff --git a/asic/Cargo.toml b/asic/Cargo.toml index 39dc3cbe..a77f935e 100644 --- a/asic/Cargo.toml +++ b/asic/Cargo.toml @@ -4,6 +4,7 @@ version = "0.1.0" edition = "2024" [features] +default = ["multicast"] tofino_asic = [ "dep:lazy_static", "dep:transceiver-controller", diff --git a/asic/src/chaos/table.rs b/asic/src/chaos/table.rs index 20bf8bcb..4c3cf7ba 100644 --- a/asic/src/chaos/table.rs +++ b/asic/src/chaos/table.rs @@ -20,7 +20,7 @@ pub const ROUTE_IPV4: &str = "pipe.Ingress.l3_router.routes_ipv4"; pub const ROUTE_IPV6: &str = "pipe.Ingress.l3_router.routes_ipv6"; pub const ARP_IPV4: 
&str = "pipe.Ingress.l3_router.arp_ipv4"; pub const NEIGHBOR_IPV6: &str = "pipe.Ingress.l3_router.neighbor_ipv6"; -pub const MAC_REWRITE: &str = "pipe.Ingress.mac_rewrite.mac_rewrite"; +pub const MAC_REWRITE: &str = "pipe.Egress.unicast_mac_rewrite.mac_rewrite"; pub const SWITCH_IPV4_ADDR: &str = "pipe.Ingress.filter.switch_ipv4_addr"; pub const SWITCH_IPV6_ADDR: &str = "pipe.Ingress.filter.switch_ipv6_addr"; pub const NAT_INGRESS_IPV4: &str = "pipe.Ingress.nat_ingress.ingress_ipv4"; @@ -42,7 +42,7 @@ pub(crate) const MCAST_ROUTE_IPV4: &str = pub(crate) const MCAST_ROUTE_IPV6: &str = "pipe.Ingress.l3_router.MulticastRouter6.tbl"; pub(crate) const MCAST_MAC_REWRITE: &str = - "pipe.Egress.mac_rewrite.mac_rewrite"; + "pipe.Egress.mcast_mac_rewrite.mac_rewrite"; pub(crate) const MCAST_DECAP_PORTS: &str = "pipe.Egress.mcast_egress.tbl_decap_ports"; pub(crate) const MCAST_PORT_ID_MAPPING: &str = diff --git a/asic/src/softnpu/table.rs b/asic/src/softnpu/table.rs index 68f9975d..42cb742e 100644 --- a/asic/src/softnpu/table.rs +++ b/asic/src/softnpu/table.rs @@ -23,6 +23,12 @@ pub struct Table { } // soft-npu table names +// Route tables are idx-only in sidecar-lite; route_ttl_is_1 is ignored here. +// TODO: remove compat once sidecar-lite updates route keys/actions: +// - p4/sidecar-lite.p4: add route_ttl_is_1 to route table keys and add +// ttl_exceeded actions; set ingress.route_ttl_is_1 in router. +// - p4/softnpu.p4: add route_ttl_is_1 to ingress metadata. +// - scadm + softnpu tests: encode/decode idx + ttl in route keys. 
const ROUTER_V4_RT: &str = "ingress.router.v4_route.rtr"; const ROUTER_V4_IDX: &str = "ingress.router.v4_idx.rtr"; const ROUTER_V6_RT: &str = "ingress.router.v6_route.rtr"; @@ -44,16 +50,19 @@ const _PROXY_ARP: &str = "ingress.pxarp.proxy_arp"; const SWITCH_ADDR4: &str = "pipe.Ingress.filter.switch_ipv4_addr"; const SWITCH_ADDR6: &str = "pipe.Ingress.filter.switch_ipv6_addr"; const ROUTER4_LOOKUP_RT: &str = - "pipe.Ingress.l3_router.Router4.lookup_idx.route"; + "pipe.Ingress.l3_router.router4.lookup_idx.route"; const ROUTER4_LOOKUP_IDX: &str = - "pipe.Ingress.l3_router.Router4.lookup_idx.lookup"; + "pipe.Ingress.l3_router.router4.lookup_idx.lookup"; const ROUTER6_LOOKUP_RT: &str = - "pipe.Ingress.l3_router.Router6.lookup_idx.route"; + "pipe.Ingress.l3_router.router6.lookup_idx.route"; const ROUTER6_LOOKUP_IDX: &str = - "pipe.Ingress.l3_router.Router6.lookup_idx.lookup"; + "pipe.Ingress.l3_router.router6.lookup_idx.lookup"; const NDP: &str = "pipe.Ingress.l3_router.Ndp.tbl"; const ARP: &str = "pipe.Ingress.l3_router.Arp.tbl"; -const DPD_MAC_REWRITE: &str = "pipe.Ingress.mac_rewrite.mac_rewrite"; +const DPD_UNICAST_MAC_REWRITE: &str = + "pipe.Egress.unicast_mac_rewrite.mac_rewrite"; +#[cfg(feature = "multicast")] +const DPD_MCAST_MAC_REWRITE: &str = "pipe.Egress.mcast_mac_rewrite.mac_rewrite"; const NAT_INGRESS4: &str = "pipe.Ingress.nat_ingress.ingress_ipv4"; const NAT_INGRESS6: &str = "pipe.Ingress.nat_ingress.ingress_ipv6"; const ATTACHED_SUBNET_INGRESS4: &str = @@ -85,8 +94,12 @@ impl TableOps for Table { SWITCH_ADDR6 => (Some(LOCAL_V6.into()), Some(SWITCH_ADDR6.into())), NDP => (Some(RESOLVER_V6.into()), Some(NDP.into())), ARP => (Some(RESOLVER_V4.into()), Some(ARP.into())), - DPD_MAC_REWRITE => { - (Some(MAC_REWRITE.into()), Some(DPD_MAC_REWRITE.into())) + DPD_UNICAST_MAC_REWRITE => { + (Some(MAC_REWRITE.into()), Some(DPD_UNICAST_MAC_REWRITE.into())) + } + #[cfg(feature = "multicast")] + DPD_MCAST_MAC_REWRITE => { + (Some(MAC_REWRITE.into()), 
Some(DPD_MCAST_MAC_REWRITE.into())) } NAT_INGRESS4 => (Some(NAT_V4.into()), Some(NAT_INGRESS4.into())), NAT_INGRESS6 => (Some(NAT_V6.into()), Some(NAT_INGRESS6.into())), @@ -135,9 +148,22 @@ impl TableOps for Table { let action_data = data.action_to_ir().unwrap(); trace!(hdl.log, "entry_add called"); - trace!(hdl.log, "table: {}", table); - trace!(hdl.log, "match_data:\n{:#?}", match_data); - trace!(hdl.log, "action_data:\n{:#?}", action_data); + trace!(hdl.log, "table: {table}"); + trace!(hdl.log, "match_data:\n{match_data:#?}"); + trace!(hdl.log, "action_data:\n{action_data:#?}"); + + let is_route_table = + matches!(dpd_table.as_str(), ROUTER4_LOOKUP_RT | ROUTER6_LOOKUP_RT); + if is_route_table { + if route_ttl_is_1(&match_data.fields) { + trace!(hdl.log, "skipping ttl==1 route entry for {dpd_table}"); + return Ok(()); + } + if action_data.action == "ttl_exceeded" { + trace!(hdl.log, "skipping ttl_exceeded action for {dpd_table}"); + return Ok(()); + } + } let keyset_data = keyset_data(match_data.fields, &table); @@ -440,7 +466,23 @@ impl TableOps for Table { } ("rewrite_dst", params) } - (DPD_MAC_REWRITE, "rewrite") => { + (DPD_UNICAST_MAC_REWRITE, "rewrite") => { + let mut params = Vec::new(); + for arg in action_data.args { + match arg.value { + ValueTypes::U64(v) => { + let mac = v.to_le_bytes(); + params.extend_from_slice(&mac[0..6]); + } + ValueTypes::Ptr(v) => { + params.extend_from_slice(v.as_slice()); + } + } + } + ("rewrite", params) + } + #[cfg(feature = "multicast")] + (DPD_MCAST_MAC_REWRITE, "rewrite") => { let mut params = Vec::new(); for arg in action_data.args { match arg.value { @@ -545,10 +587,10 @@ impl TableOps for Table { }; let action = action.to_string(); trace!(hdl.log, "sending request to softnpu"); - trace!(hdl.log, "table: {}", table); - trace!(hdl.log, "action: {:#?}", action); - trace!(hdl.log, "keyset_data:\n{:#?}", keyset_data); - trace!(hdl.log, "parameter_data:\n{:#?}", parameter_data); + trace!(hdl.log, "table: {table}"); + 
trace!(hdl.log, "action: {action:#?}"); + trace!(hdl.log, "keyset_data:\n{keyset_data:#?}"); + trace!(hdl.log, "parameter_data:\n{parameter_data:#?}"); let msg = ManagementRequest::TableAdd(TableAdd { table, @@ -576,9 +618,9 @@ impl TableOps for Table { let action_data = data.action_to_ir().unwrap(); trace!(hdl.log, "entry_update called"); - trace!(hdl.log, "table: {}", table); - trace!(hdl.log, "match_data:\n{:#?}", match_data); - trace!(hdl.log, "action_data:\n{:#?}", action_data); + trace!(hdl.log, "table: {table}"); + trace!(hdl.log, "match_data:\n{match_data:#?}"); + trace!(hdl.log, "action_data:\n{action_data:#?}"); //TODO implement in softnpu Ok(()) @@ -593,17 +635,31 @@ impl TableOps for Table { None => return Ok(()), Some(id) => id.clone(), }; + let dpd_table = match &self.dpd_id { + None => return Ok(()), + Some(id) => id.clone(), + }; let match_data = key.key_to_ir().unwrap(); trace!(hdl.log, "entry_del called"); - trace!(hdl.log, "table: {}", table); - trace!(hdl.log, "match_data:\n{:#?}", match_data); + trace!(hdl.log, "table: {table}"); + trace!(hdl.log, "match_data:\n{match_data:#?}"); + + let is_route_table = + matches!(dpd_table.as_str(), ROUTER4_LOOKUP_RT | ROUTER6_LOOKUP_RT); + if is_route_table && route_ttl_is_1(&match_data.fields) { + trace!( + hdl.log, + "skipping ttl==1 route entry delete for {dpd_table}" + ); + return Ok(()); + } let keyset_data = keyset_data(match_data.fields, &table); trace!(hdl.log, "sending request to softnpu"); - trace!(hdl.log, "table: {}", table); - trace!(hdl.log, "keyset_data:\n{:#?}", keyset_data); + trace!(hdl.log, "table: {table}"); + trace!(hdl.log, "keyset_data:\n{keyset_data:#?}"); let msg = ManagementRequest::TableRemove(TableRemove { keyset_data, table }); @@ -639,12 +695,12 @@ fn keyset_data(match_data: Vec, table: &str) -> Vec { let mut data: Vec = Vec::new(); match table { RESOLVER_V4 => { - // "nexthop_ipv4" => bit<32> + // "nexthop" => bit<32> serialize_value_type(&x, &mut data); 
keyset_data.extend_from_slice(&data[..4]); } RESOLVER_V6 => { - // "nexthop_ipv4" => bit<128> + // "nexthop" => bit<128> let mut buf = Vec::new(); serialize_value_type(&x, &mut buf); buf.reverse(); @@ -654,10 +710,12 @@ fn keyset_data(match_data: Vec, table: &str) -> Vec { serialize_value_type(&x, &mut data); keyset_data.extend_from_slice(&data[..2]); } - ROUTER_V4_RT => { - // "idx" => exact => bit<16> - serialize_value_type(&x, &mut data); - keyset_data.extend_from_slice(&data[..2]); + ROUTER_V4_RT | ROUTER_V6_RT => { + // sidecar-lite route keys are idx-only. + if m.name == "idx" { + serialize_value_type(&x, &mut data); + keyset_data.extend_from_slice(&data[..2]); + } } NAT_V4 => { // "dst_addr" => hdr.ipv4.dst: exact => bit<32> @@ -747,3 +805,18 @@ fn serialize_value_type_be(x: &ValueTypes, data: &mut Vec) { } } } + +fn route_ttl_is_1(fields: &[MatchEntryField]) -> bool { + fields.iter().any(|field| { + if field.name != "route_ttl_is_1" { + return false; + } + match &field.value { + MatchEntryValue::Value(ValueTypes::U64(v)) => *v != 0, + MatchEntryValue::Value(ValueTypes::Ptr(v)) => { + v.first().is_some_and(|b| *b != 0) + } + _ => false, + } + }) +} diff --git a/config b/config new file mode 100644 index 00000000..e69de29b diff --git a/dpd-client/Cargo.toml b/dpd-client/Cargo.toml index aadd5a80..5b214413 100644 --- a/dpd-client/Cargo.toml +++ b/dpd-client/Cargo.toml @@ -5,6 +5,7 @@ edition = "2024" description = "Client library for the Dendrite data plane daemon" [features] +default = ["multicast"] multicast = ["asic/multicast"] chaos = ["asic/chaos"] tofino_asic = ["asic/tofino_asic"] diff --git a/dpd-client/tests/integration_tests/mcast.rs b/dpd-client/tests/integration_tests/mcast.rs index 4b05b8be..22326ea2 100644 --- a/dpd-client/tests/integration_tests/mcast.rs +++ b/dpd-client/tests/integration_tests/mcast.rs @@ -1442,14 +1442,6 @@ async fn test_ipv6_multicast_invalid_destination_mac() -> TestResult { let ctr_baseline = 
switch.get_counter("multicast_invalid_mac", None).await.unwrap(); - let port_label_ingress = switch.port_label(ingress).unwrap(); - - // Check the Multicast_Drop counter baseline for the ingress port - let drop_mcast_baseline = switch - .get_counter(&port_label_ingress, Some("multicast_drop")) - .await - .unwrap(); - switch.packet_test(vec![test_pkt], expected_pkts).unwrap(); check_counter_incremented( @@ -1462,17 +1454,6 @@ async fn test_ipv6_multicast_invalid_destination_mac() -> TestResult { .await .unwrap(); - // Verify that the Multicast_Drop counter also incremented - check_counter_incremented( - switch, - &port_label_ingress, - drop_mcast_baseline, - 1, - Some("multicast_drop"), - ) - .await - .unwrap(); - cleanup_test_group(switch, get_group_ip(&created_group)).await } @@ -2156,7 +2137,8 @@ async fn test_encapped_multicast_geneve_mcast_tag_to_underlay_and_external_membe #[tokio::test] #[ignore] -async fn test_ipv4_multicast_drops_ingress_is_egress_port() -> TestResult { +async fn test_ipv4_ingress_multicast_drops_ingress_is_egress_port() -> TestResult +{ let switch = &*get_switch().await; // Define test ports diff --git a/dpd-client/tests/integration_tests/table_tests.rs b/dpd-client/tests/integration_tests/table_tests.rs index 1f0a0163..e5c28c20 100644 --- a/dpd-client/tests/integration_tests/table_tests.rs +++ b/dpd-client/tests/integration_tests/table_tests.rs @@ -39,16 +39,8 @@ use dpd_client::types; // investigating. If it only changes by an entry or two, it's fine to just // adjust the constant below to match the observed result. 
// -#[cfg(feature = "multicast")] -const IPV4_LPM_SIZE: usize = 8125; // ipv4 forwarding table -#[cfg(not(feature = "multicast"))] -const IPV4_LPM_SIZE: usize = 8187; // ipv4 forwarding table - -#[cfg(feature = "multicast")] +const IPV4_LPM_SIZE: usize = 8191; // ipv4 forwarding table const IPV6_LPM_SIZE: usize = 1023; // ipv6 forwarding table -#[cfg(not(feature = "multicast"))] -const IPV6_LPM_SIZE: usize = 1023; // ipv6 forwarding table - const SWITCH_IPV4_ADDRS_SIZE: usize = 511; // ipv4 addrs assigned to our ports const SWITCH_IPV6_ADDRS_SIZE: usize = 511; // ipv6 addrs assigned to our ports const IPV4_NAT_TABLE_SIZE: usize = 1024; // nat routing table diff --git a/dpd/Cargo.toml b/dpd/Cargo.toml index 31e8a9cb..cf8a5b28 100644 --- a/dpd/Cargo.toml +++ b/dpd/Cargo.toml @@ -5,6 +5,7 @@ authors = ["nils "] edition = "2024" [features] +default = ["multicast"] multicast = ["aal/multicast", "asic/multicast"] tofino_asic = ["asic/tofino_asic"] tofino_stub = ["asic/tofino_stub"] diff --git a/dpd/p4/constants.p4 b/dpd/p4/constants.p4 index 4a483773..d0e890cc 100644 --- a/dpd/p4/constants.p4 +++ b/dpd/p4/constants.p4 @@ -4,7 +4,13 @@ // // Copyright 2026 Oxide Computer Company -const bit<16> L2_ISOLATED_FLAG = 0x8000; +// Multicast MAC prefixes per RFC 1112 and RFC 2464. +const bit<24> IPV4_MCAST_MAC_PREFIX = 0x01005e; +const bit<16> IPV6_MCAST_MAC_PREFIX = 0x3333; + +// Multicast IP address prefixes per RFC 1112 and RFC 4291. +const bit<4> IPV4_MCAST_PREFIX = 0xe; // 224.0.0.0/4 +const bit<8> IPV6_MCAST_PREFIX = 0xff; // ff00::/8 // TODO: these all need to be bigger. Early experimentation is showing that this // is going to need to come either through ATCAM/ALPM or code restructuring. 
@@ -12,6 +18,7 @@ const int IPV4_NAT_TABLE_SIZE = 1024; // nat routing table const int IPV6_NAT_TABLE_SIZE = 1024; // nat routing table const int IPV4_LPM_SIZE = 8192; // ipv4 forwarding table const int IPV6_LPM_SIZE = 1024; // ipv6 forwarding table +const int FWD_ENTRIES_PER_ROUTE = 2; // TTL compound key: forward + ttl_exceeded const int IPV4_ARP_SIZE = 512; // arp cache const int IPV6_NEIGHBOR_SIZE = 512; // ipv6 neighbor cache const int SWITCH_IPV4_ADDRS_SIZE = 512; // ipv4 addrs assigned to our ports @@ -55,16 +62,17 @@ const bit<32> SVC_COUNTER_MAX = 7; const bit<2> MULTICAST_TAG_EXTERNAL = 0; const bit<2> MULTICAST_TAG_UNDERLAY = 1; const bit<2> MULTICAST_TAG_UNDERLAY_EXTERNAL = 2; +const bit<2> MULTICAST_TAG_INVALID = 3; // Sentinel for missing/invalid header -/* IPv6 Address Mask Constants */ -const bit<128> IPV6_SCOPE_MASK = 0xffff0000000000000000000000000000; // Match ff00::/16 -const bit<128> IPV6_ULA_MASK = 0xff000000000000000000000000000000; // Match fd00::/8 - -/* IPv6 Address Pattern Constants */ -const bit<128> IPV6_ADMIN_LOCAL_PATTERN = 0xff040000000000000000000000000000; // ff04::/16 -const bit<128> IPV6_SITE_LOCAL_PATTERN = 0xff050000000000000000000000000000; // ff05::/16 -const bit<128> IPV6_ORG_SCOPE_PATTERN = 0xff080000000000000000000000000000; // ff08::/16 -const bit<128> IPV6_ULA_PATTERN = 0xfd000000000000000000000000000000; // fd00::/8 +/* IPv6 Address Constants (16-bit prefix for TCAM optimization) */ +const bit<16> IPV6_SCOPE_MASK_16 = 0xffff; // Match all 16 bits of prefix +const bit<16> IPV6_ULA_MASK_16 = 0xff00; // Match top 8 bits (fd00::/8) +const bit<16> IPV6_INTERFACE_LOCAL_16 = 0xff01; // ff01::/16 +const bit<16> IPV6_LINK_LOCAL_16 = 0xff02; // ff02::/16 +const bit<16> IPV6_ADMIN_LOCAL_16 = 0xff04; // ff04::/16 +const bit<16> IPV6_SITE_LOCAL_16 = 0xff05; // ff05::/16 +const bit<16> IPV6_ORG_SCOPE_16 = 0xff08; // ff08::/16 +const bit<16> IPV6_ULA_16 = 0xfd00; // fd00::/8 /* Reasons a packet may be dropped by the p4 pipeline */ 
const bit<8> DROP_IPV4_SWITCH_ADDR_MISS = 0x01; @@ -95,4 +103,3 @@ const bit<8> DROP_GENEVE_OPTION_MALFORMED = 0x19; const bit<8> DROP_GENEVE_OPTION_UNKNOWN = 0x1A; // MAX(DROP_xxx) + 1 const bit<32> DROP_REASON_MAX = 0x1B; - diff --git a/dpd/p4/headers.p4 b/dpd/p4/headers.p4 index 425f9958..89f2deb6 100644 --- a/dpd/p4/headers.p4 +++ b/dpd/p4/headers.p4 @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/ // -// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company const bit<16> ETHERTYPE_IPV4 = 0x0800; const bit<16> ETHERTYPE_ARP = 0x0806; @@ -184,7 +184,6 @@ header geneve_opt_mcast_h { bit<30> reserved; } - header geneve_opt_mss_h { bit<32> mss; } diff --git a/dpd/p4/metadata.p4 b/dpd/p4/metadata.p4 index fb170a80..93f0c52d 100644 --- a/dpd/p4/metadata.p4 +++ b/dpd/p4/metadata.p4 @@ -4,12 +4,98 @@ // // Copyright 2026 Oxide Computer Company +// Guard against compiler bug: RemoveMetadataInits strips explicit `= false` +// initializations, assuming parser will zero-init the PHV container. +// ComputeInitZeroContainers only marks containers for zero-init if the field +// is actually used in the parser, not just initialized. These assumptions are +// incompatible: fields initialized but only used in MAU get stale data. 
+// See: https://github.com/oxidecomputer/tofino-p4c/blob/ry/upstream-merge/rydocs/tofino-metadata-corruption.md +@pa_no_init("ingress", "meta.service_routed") +@pa_no_init("ingress", "meta.nat_egress_hit") +@pa_no_init("ingress", "meta.nat_ingress_hit") +@pa_no_init("ingress", "meta.uplink_ingress") +@pa_no_init("ingress", "meta.encap_needed") +@pa_no_init("ingress", "meta.icmp_recalc") +@pa_no_init("ingress", "meta.allow_source_mcast") +@pa_no_init("ingress", "meta.resolve_nexthop") +@pa_no_init("ingress", "meta.nexthop_is_v6") +@pa_no_init("ingress", "meta.route_ttl_is_1") +// These fields are set in the parser on some paths but not all. On paths +// that skip the set, the field is init-only and vulnerable. +@pa_no_init("ingress", "meta.is_switch_address") +@pa_no_init("ingress", "meta.is_link_local_mcastv6") + +// Force fields out of mocha containers into normal containers. Mocha containers +// only support whole-container-set operations, so isolated fields can have +// their other bits corrupted by stale data from previous packets. +// +// Without these pragmas the compiler may pack small metadata fields into mocha +// containers alongside unrelated fields. A whole-container write to one field +// then clobbers the others. The risk is highest for 1-bit booleans and fields +// with long liverange gaps between set and use. +// +// Both builds share ipv4_checksum_err: confirmed allocated to mocha MH0 where +// it shared a container with pkt_type, risking false checksum-error drops. +@pa_container_type("ingress", "meta.ipv4_checksum_err", "normal") + +// 1-bit ingress booleans: high risk of mocha packing. The compiler can +// pack up to 8 booleans into a single 8-bit mocha container, and a +// whole-container write to any one clobbers the rest. 
+@pa_container_type("ingress", "meta.dropped", "normal") +@pa_container_type("ingress", "meta.is_switch_address", "normal") +@pa_container_type("ingress", "meta.is_mcast", "normal") +@pa_container_type("ingress", "meta.allow_source_mcast", "normal") +@pa_container_type("ingress", "meta.is_link_local_mcastv6", "normal") +@pa_container_type("ingress", "meta.service_routed", "normal") +@pa_container_type("ingress", "meta.nat_egress_hit", "normal") +@pa_container_type("ingress", "meta.nat_ingress_hit", "normal") +@pa_container_type("ingress", "meta.uplink_ingress", "normal") +@pa_container_type("ingress", "meta.encap_needed", "normal") +@pa_container_type("ingress", "meta.resolve_nexthop", "normal") +@pa_container_type("ingress", "meta.route_ttl_is_1", "normal") +@pa_container_type("ingress", "meta.nexthop_is_v6", "normal") +@pa_container_type("ingress", "meta.icmp_recalc", "normal") + +// Wider ingress fields used by NAT encapsulation, checksum computation, +// and routing. Protected in both builds to avoid relying on incidental +// co-location with deparsed fields, which is fragile across compiler +// versions and PHV pressure changes. +@pa_container_type("ingress", "meta.drop_reason", "normal") +@pa_container_type("ingress", "meta.l4_src_port", "normal") +@pa_container_type("ingress", "meta.l4_dst_port", "normal") +@pa_container_type("ingress", "meta.nat_ingress_tgt", "normal") +@pa_container_type("ingress", "meta.nat_geneve_vni", "normal") +@pa_container_type("ingress", "meta.nat_inner_mac", "normal") +@pa_container_type("ingress", "meta.icmp_csum", "normal") +@pa_container_type("ingress", "meta.body_checksum", "normal") +@pa_container_type("ingress", "meta.orig_src_mac", "normal") +@pa_container_type("ingress", "meta.orig_src_ipv4", "normal") +@pa_container_type("ingress", "meta.nat_ingress_csum", "normal") +@pa_container_type("ingress", "meta.nexthop", "normal") + +// Egress bridge header fields crossing the ingress/egress boundary. 
+@pa_container_type("egress", "meta.bridge_hdr.ingress_port", "normal") +@pa_container_type("egress", "meta.bridge_hdr.is_mcast_routed", "normal") +@pa_container_type("egress", "meta.bridge_hdr.nat_egress_hit", "normal") +// Egress drop_reason is used in the final drop/forward decision in both +// builds. In the MULTICAST build, additional egress fields are set by +// multicast table actions and consumed later in the pipeline. +@pa_container_type("egress", "meta.drop_reason", "normal") +#ifdef MULTICAST +@pa_container_type("egress", "meta.vlan_id", "normal") +@pa_container_type("egress", "meta.port_number", "normal") +@pa_container_type("egress", "meta.ipv4_checksum_recalc", "normal") +#endif + /* Flexible bridge header for passing metadata between ingress and egress * pipelines. */ @flexible header bridge_h { - PortId_t ingress_port; + PortId_t ingress_port; // 9 bits + bool is_mcast_routed; // 1 bit: packet was routed to multicast (PRE) + bool nat_egress_hit; // 1 bit: NAT egress matched, check egress filter + bit<5> reserved; // 5 bits: padding to 16-bit boundary } struct sidecar_ingress_meta_t { @@ -25,8 +111,9 @@ struct sidecar_ingress_meta_t { bool uplink_ingress; // Packet arrived on an uplink port bool encap_needed; bool resolve_nexthop; // signals nexthop needs to be resolved - ipv4_addr_t nexthop_ipv4; // ip address of next router - ipv6_addr_t nexthop_ipv6; // ip address of next router + bool route_ttl_is_1; // TTL/hop_limit equals 1 (for route lookup) + bool nexthop_is_v6; // true when nexthop is IPv6 + ipv6_addr_t nexthop; // next hop address; IPv4 uses low bits bit<10> pkt_type; bit<8> drop_reason; // reason a packet was dropped bit<16> l4_src_port; // tcp or udp destination port @@ -74,41 +161,11 @@ struct sidecar_egress_meta_t { bit<8> port_number; // Port number for the outgoing port (0-255) } -struct route4_result_t { - /* - * The result of the multistage route selection process is an egress - * port and a nexthop address - */ - ipv4_addr_t 
nexthop; - ipv6_addr_t nexthop6; - PortId_t port; - - /* Did we successfully look up the route in the table? */ - bool is_hit; - bool is_v6; - - /* - * A hash of the (address,port) fields, which is used to choose between - * multiple potential routes. - */ - bit<8> ecmp_hash; - - /* Index into the target table of the first potential route */ - bit<16> idx; - /* Number of consecutive slots containing potential routes */ - bit<8> slots; - /* Which of those routes we should select, based the flow hash */ - bit<16> slot; -} - -struct route6_result_t { - /* - * The result of the multistage route selection process is an egress - * port and a nexthop address - */ - ipv6_addr_t nexthop; - PortId_t port; - +// Unified route result struct for both Router4 and Router6. +// A single instance is allocated in L3Router and passed to both +// controls, forcing the compiler to use the same PHV allocation +// and preventing liverange divergence under high PHV pressure. +struct route_result_t { /* Did we successfully look up the route in the table? 
*/ bool is_hit; diff --git a/dpd/p4/parser.p4 b/dpd/p4/parser.p4 index ba82901e..5d7e4d04 100644 --- a/dpd/p4/parser.p4 +++ b/dpd/p4/parser.p4 @@ -43,8 +43,8 @@ parser IngressParser( meta.l4_dst_port = 0; meta.l4_length = 0; meta.body_checksum = 0; - meta.nexthop_ipv4 = 0; - meta.nexthop_ipv6 = 0; + meta.nexthop = 0; + meta.nexthop_is_v6 = false; meta.orig_src_mac = 0; meta.orig_src_ipv4 = 0; meta.orig_dst_ipv4 = 0; @@ -52,9 +52,12 @@ parser IngressParser( meta.drop_reason = 0; meta.nat_ingress_csum = 0; meta.resolve_nexthop = false; + meta.route_ttl_is_1 = false; meta.bridge_hdr.setValid(); meta.bridge_hdr.ingress_port = ig_intr_md.ingress_port; + meta.bridge_hdr.is_mcast_routed = false; + meta.bridge_hdr.reserved = 0; transition port_metadata; } @@ -245,8 +248,8 @@ parser IngressParser( }); transition select(hdr.ipv6.dst_addr[127:112]) { - 16w0xff01: drop_interface_local_mcast; - 16w0xff02: set_link_local_mcast; + IPV6_INTERFACE_LOCAL_16: drop_interface_local_mcast; + IPV6_LINK_LOCAL_16: set_link_local_mcast; default: check_ipv6_mcast; } } @@ -546,7 +549,6 @@ parser EgressParser( meta.vlan_id = 0; meta.port_number = 0; - transition parse_bridge_hdr; } @@ -583,8 +585,6 @@ parser EgressParser( pkt.extract(hdr.ipv4); transition select(hdr.ipv4.protocol) { - IPPROTO_ICMP: parse_icmp; - IPPROTO_TCP: parse_tcp; IPPROTO_UDP: parse_udp; default: accept; } @@ -594,24 +594,11 @@ parser EgressParser( pkt.extract(hdr.ipv6); transition select(hdr.ipv6.next_hdr) { - IPPROTO_TCP: parse_tcp; IPPROTO_UDP: parse_udp; default: accept; } } - state parse_icmp { - pkt.extract(hdr.icmp); - - transition accept; - } - - state parse_tcp { - pkt.extract(_); - - transition accept; - } - state parse_udp { pkt.extract(hdr.udp); diff --git a/dpd/p4/route_selector.p4 b/dpd/p4/route_selector.p4 index 81590c72..1cc20eb1 100644 --- a/dpd/p4/route_selector.p4 +++ b/dpd/p4/route_selector.p4 @@ -2,8 +2,7 @@ // License, v. 2.0. 
If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/ // -// Copyright 2025 Oxide Computer Company - +// Copyright 2026 Oxide Computer Company action set_slot(bit<8> slot) { res.slot = (bit<16>) slot; diff --git a/dpd/p4/sidecar.p4 b/dpd/p4/sidecar.p4 index 2cfef006..06e9e9e6 100644 --- a/dpd/p4/sidecar.p4 +++ b/dpd/p4/sidecar.p4 @@ -48,6 +48,22 @@ const bit<9> USER_SPACE_SERVICE_PORT = 192; #define IPV6_FIELDS \ hdr.inner_ipv6 +// Common setup for sending ICMP error responses to user space. +// +// Sets up the sidecar header and routes to USER_SPACE_SERVICE_PORT. +// Callers should set meta.drop_reason and call counters as needed. +#define ICMP_ERROR_SETUP(type, code) \ + hdr.sidecar.sc_code = SC_ICMP_NEEDED; \ + hdr.sidecar.sc_pad = 0; \ + hdr.sidecar.sc_ingress = (bit<16>)ig_intr_md.ingress_port; \ + hdr.sidecar.sc_egress = (bit<16>)ig_tm_md.ucast_egress_port; \ + hdr.sidecar.sc_ether_type = hdr.ethernet.ether_type; \ + hdr.sidecar.sc_payload = (bit<128>)(type) << 8 | (bit<128>)(code); \ + hdr.sidecar.setValid(); \ + hdr.ethernet.ether_type = ETHERTYPE_SIDECAR; \ + meta.service_routed = true; \ + ig_tm_md.ucast_egress_port = USER_SPACE_SERVICE_PORT + // This control handles the calculation of Layer 4 payload length // by subtracting the IPv4 header length from the total packet length. // @@ -106,7 +122,6 @@ control Filter( DirectCounter>(CounterType_t.PACKETS_AND_BYTES) ipv4_ctr; DirectCounter>(CounterType_t.PACKETS_AND_BYTES) ipv6_ctr; #ifdef MULTICAST - Counter, PortId_t>(512, CounterType_t.PACKETS) drop_mcast_ctr; bit<16> mcast_scope; #endif /* MULTICAST */ @@ -122,11 +137,6 @@ control Filter( ipv6_ctr.count(); } - action drop_bad_mac() { - meta.drop_reason = DROP_MULTICAST_INVALID_MAC; - meta.dropped = true; - } - action claimv4() { meta.is_switch_address = true; ipv4_ctr.count(); @@ -137,7 +147,6 @@ control Filter( ipv6_ctr.count(); } - // Table of the IPv4 addresses assigned to ports on the switch. 
table switch_ipv4_addr { key = { @@ -203,8 +212,8 @@ control Filter( if (mac_byte4 != (bit<8>)ipv4_lower7 || mac_byte5 != ipv4_byte3 || mac_byte6 != ipv4_byte4) { - drop_bad_mac(); - drop_mcast_ctr.count(ig_intr_md.ingress_port); + meta.drop_reason = DROP_MULTICAST_INVALID_MAC; + meta.dropped = true; return; } } else { @@ -228,8 +237,8 @@ control Filter( // registers on the device. if (hdr.ethernet.dst_mac[47:40] != 8w0x33 || hdr.ethernet.dst_mac[39:32] != 8w0x33) { - drop_bad_mac(); - drop_mcast_ctr.count(ig_intr_md.ingress_port); + meta.drop_reason = DROP_MULTICAST_INVALID_MAC; + meta.dropped = true; return; } @@ -242,14 +251,14 @@ control Filter( hdr.ethernet.dst_mac[23:16] != hdr.ipv6.dst_addr[23:16] || hdr.ethernet.dst_mac[15:8] != hdr.ipv6.dst_addr[15:8] || hdr.ethernet.dst_mac[7:0] != hdr.ipv6.dst_addr[7:0]) { - drop_bad_mac(); - drop_mcast_ctr.count(ig_intr_md.ingress_port); + meta.drop_reason = DROP_MULTICAST_INVALID_MAC; + meta.dropped = true; return; } } #endif /* MULTICAST */ - if (!meta.is_mcast || meta.is_link_local_mcastv6 && !meta.encap_needed) { + if (!meta.is_mcast || (meta.is_link_local_mcastv6 && !meta.encap_needed)) { switch_ipv6_addr.apply(); } } @@ -316,8 +325,11 @@ control Services( // sidecar tag, which indicates which port the request arrived on. action forward_to_userspace() { hdr.sidecar.sc_code = SC_FWD_TO_USERSPACE; + hdr.sidecar.sc_pad = 0; hdr.sidecar.sc_ingress = (bit<16>)ig_intr_md.ingress_port; + hdr.sidecar.sc_egress = 0; hdr.sidecar.sc_ether_type = hdr.ethernet.ether_type; + hdr.sidecar.sc_payload = 0; hdr.sidecar.setValid(); hdr.ethernet.ether_type = ETHERTYPE_SIDECAR; meta.service_routed = true; @@ -350,6 +362,7 @@ control Services( // packets always go to the port indicated by the sidecar header. 
action mcast_inbound_link_local() { hdr.sidecar.sc_code = SC_FWD_TO_USERSPACE; + hdr.sidecar.sc_pad = 0; hdr.sidecar.sc_ingress = (bit<16>)ig_intr_md.ingress_port; hdr.sidecar.sc_egress = (bit<16>)ig_tm_md.ucast_egress_port; hdr.sidecar.sc_ether_type = hdr.ethernet.ether_type; @@ -415,11 +428,19 @@ control Services( } apply { + // TODO: This can be simplified by checking "is this a switch address" + // at the start, then dropping non-NAT packets in NatIngress directly. + // Deferred due to knock-on effects in dpd and sidecar-lite. if (meta.is_switch_address && hdr.geneve.isValid() && hdr.geneve.vni != 0) { meta.nat_egress_hit = true; } else { service.apply(); + // Detect link-local multicast for packets forwarded from userspace. + if (meta.service_routed && hdr.ipv6.isValid() && + hdr.ipv6.dst_addr[127:112] == IPV6_LINK_LOCAL_16) { + meta.is_link_local_mcastv6 = true; + } } } } @@ -702,33 +723,28 @@ control NatIngress ( apply { icmp_dst_port.apply(); - // Note: This whole conditional could be simpler as a set of */ + // Note: This whole conditional could be simpler as a set of // `const entries`, but apply (on tables) cannot be called from actions -#ifdef MULTICAST if (hdr.ipv4.isValid()) { +#ifdef MULTICAST if (meta.is_mcast) { ingress_ipv4_mcast.apply(); - } else if (!meta.encap_needed) { + } else +#endif /* MULTICAST */ + if (!meta.encap_needed) { ingress_ipv4.apply(); } } else if (hdr.ipv6.isValid()) { - // If this is a multicast packet and not a link-local multicast, - // we need to check the multicast table +#ifdef MULTICAST + // If multicast and not link-local, check the multicast table if (meta.is_mcast && !meta.is_link_local_mcastv6) { ingress_ipv6_mcast.apply(); - } else { + } else +#endif /* MULTICAST */ + { ingress_ipv6.apply(); } } -#else /* MULTICAST */ - if (hdr.ipv4.isValid()) { - if (!meta.encap_needed) { - ingress_ipv4.apply(); - } - } else if (hdr.ipv6.isValid()) { - ingress_ipv6.apply(); - } -#endif /* MULTICAST */ if (ingress_hit.apply().hit) { 
if (hdr.ipv4.isValid()) { @@ -898,7 +914,10 @@ control NatEgress ( control RouterLookupIndex6( inout sidecar_headers_t hdr, - inout route6_result_t res + inout sidecar_ingress_meta_t meta, + in ingress_intrinsic_metadata_t ig_intr_md, + inout ingress_intrinsic_metadata_for_tm_t ig_tm_md, + inout route_result_t res ) { DirectCounter>(CounterType_t.PACKETS_AND_BYTES) index_ctr; DirectCounter>(CounterType_t.PACKETS_AND_BYTES) forward_ctr; @@ -911,27 +930,39 @@ control RouterLookupIndex6( hdr.vlan.vlan_id = vlan_id; hdr.vlan.ether_type = hdr.ethernet.ether_type; hdr.ethernet.ether_type = ETHERTYPE_VLAN; - res.port = port; - res.nexthop = nexthop; + ig_tm_md.ucast_egress_port = port; + hdr.ipv6.hop_limit = hdr.ipv6.hop_limit - 1; + meta.nexthop = nexthop; + meta.nexthop_is_v6 = true; + meta.resolve_nexthop = true; forward_ctr.count(); } action forward(PortId_t port, ipv6_addr_t nexthop) { - res.port = port; - res.nexthop = nexthop; + ig_tm_md.ucast_egress_port = port; + hdr.ipv6.hop_limit = hdr.ipv6.hop_limit - 1; + meta.nexthop = nexthop; + meta.nexthop_is_v6 = true; + meta.resolve_nexthop = true; + forward_ctr.count(); + } + + action ttl_exceeded() { + ICMP_ERROR_SETUP(ICMP6_TIME_EXCEEDED, ICMP_EXC_TTL); + meta.drop_reason = DROP_IPV6_TTL_EXCEEDED; forward_ctr.count(); } /* * The table size is reduced by one here just to allow the integration - * test to pass. We want the lookup and forward tables to have the same - * capacity from dpd's perspective, and the "default" entry consumes a - * slot in the lookup table. + * test to pass. We keep the forward table capacity aligned with the + * lookup table from dpd's perspective. The route_ttl_is_1 key doubles + * the physical entries, so the size is scaled accordingly. 
*/ table route { - key = { res.idx: exact; } - actions = { forward; forward_vlan; } - const size = IPV6_LPM_SIZE - 1; + key = { res.idx: exact; meta.route_ttl_is_1: exact; } + actions = { forward; forward_vlan; ttl_exceeded; } + const size = IPV6_LPM_SIZE * FWD_ENTRIES_PER_ROUTE - 1; counters = forward_ctr; } @@ -940,8 +971,8 @@ control RouterLookupIndex6( res.idx = 0; res.slots = 0; res.slot = 0; - res.port = 0; - res.nexthop = 0; + ICMP_ERROR_SETUP(ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOROUTE); + meta.drop_reason = DROP_IPV6_UNROUTEABLE; index_ctr.count(); } @@ -953,15 +984,9 @@ control RouterLookupIndex6( action index(bit<16> idx, bit<8> slots) { res.is_hit = true; - res.idx = idx; res.slots = slots; res.slot = 0; - - // The rest of this data is extracted from the target table at - // entry `res.idx`. - res.port = 0; - res.nexthop = 0; index_ctr.count(); } @@ -996,7 +1021,10 @@ control RouterLookupIndex6( control RouterLookupIndex4( inout sidecar_headers_t hdr, - inout route4_result_t res + inout sidecar_ingress_meta_t meta, + in ingress_intrinsic_metadata_t ig_intr_md, + inout ingress_intrinsic_metadata_for_tm_t ig_tm_md, + inout route_result_t res ) { DirectCounter>(CounterType_t.PACKETS_AND_BYTES) index_ctr; DirectCounter>(CounterType_t.PACKETS_AND_BYTES) forward_ctr; @@ -1009,9 +1037,11 @@ control RouterLookupIndex4( hdr.vlan.vlan_id = vlan_id; hdr.vlan.ether_type = hdr.ethernet.ether_type; hdr.ethernet.ether_type = ETHERTYPE_VLAN; - res.port = port; - res.nexthop = nexthop; - res.is_v6 = false; + ig_tm_md.ucast_egress_port = port; + hdr.ipv4.ttl = hdr.ipv4.ttl - 1; + meta.nexthop = (ipv6_addr_t)nexthop; + meta.nexthop_is_v6 = false; + meta.resolve_nexthop = true; forward_ctr.count(); } @@ -1023,37 +1053,49 @@ control RouterLookupIndex4( hdr.vlan.vlan_id = vlan_id; hdr.vlan.ether_type = hdr.ethernet.ether_type; hdr.ethernet.ether_type = ETHERTYPE_VLAN; - res.port = port; - res.nexthop6 = nexthop; - res.is_v6 = true; + ig_tm_md.ucast_egress_port = port; + 
hdr.ipv4.ttl = hdr.ipv4.ttl - 1; + meta.nexthop = nexthop; + meta.nexthop_is_v6 = true; + meta.resolve_nexthop = true; forward_ctr.count(); } action forward(PortId_t port, ipv4_addr_t nexthop) { - res.port = port; - res.nexthop = nexthop; - res.is_v6 = false; + ig_tm_md.ucast_egress_port = port; + hdr.ipv4.ttl = hdr.ipv4.ttl - 1; + meta.nexthop = (ipv6_addr_t)nexthop; + meta.nexthop_is_v6 = false; + meta.resolve_nexthop = true; forward_ctr.count(); } action forward_v6(PortId_t port, ipv6_addr_t nexthop) { - res.port = port; - res.nexthop6 = nexthop; - res.is_v6 = true; + ig_tm_md.ucast_egress_port = port; + hdr.ipv4.ttl = hdr.ipv4.ttl - 1; + meta.nexthop = nexthop; + meta.nexthop_is_v6 = true; + meta.resolve_nexthop = true; + forward_ctr.count(); + } + + action ttl_exceeded() { + ICMP_ERROR_SETUP(ICMP_TIME_EXCEEDED, ICMP_EXC_TTL); + meta.drop_reason = DROP_IPV4_TTL_EXCEEDED; forward_ctr.count(); } /* * The table size is reduced by one here just to allow the integration - * test to pass. We want the lookup and forward tables to have the same - * capacity from dpd's perspective, and the "default" entry consumes a - * slot in the lookup table. + * test to pass. We keep the forward table capacity aligned with the + * lookup table from dpd's perspective. The route_ttl_is_1 key doubles + * the physical entries, so the size is scaled accordingly. 
*/ table route { - key = { res.idx: exact; } - actions = { forward; forward_v6; forward_vlan; forward_vlan_v6; } - const size = IPV4_LPM_SIZE - 1; - counters = forward_ctr; + key = { res.idx: exact; meta.route_ttl_is_1: exact; } + actions = { forward; forward_v6; forward_vlan; forward_vlan_v6; ttl_exceeded; } + const size = IPV4_LPM_SIZE * FWD_ENTRIES_PER_ROUTE - 1; + counters = forward_ctr; } action unreachable() { @@ -1061,8 +1103,8 @@ control RouterLookupIndex4( res.idx = 0; res.slots = 0; res.slot = 0; - res.port = 0; - res.nexthop = 0; + ICMP_ERROR_SETUP(ICMP_DEST_UNREACH, ICMP_DST_UNREACH_NET); + meta.drop_reason = DROP_IPV4_UNROUTEABLE; index_ctr.count(); } @@ -1074,15 +1116,9 @@ control RouterLookupIndex4( action index(bit<16> idx, bit<8> slots) { res.is_hit = true; - res.idx = idx; res.slots = slots; res.slot = 0; - - // The rest of this data is extracted from the target table at - // entry `res.idx`. - res.port = 0; - res.nexthop = 0; index_ctr.count(); } @@ -1135,10 +1171,11 @@ control Arp ( action request() { hdr.sidecar.sc_code = SC_ARP_NEEDED; + hdr.sidecar.sc_pad = 0; hdr.sidecar.sc_ingress = (bit<16>)ig_intr_md.ingress_port; hdr.sidecar.sc_egress = (bit<16>)ig_tm_md.ucast_egress_port; hdr.sidecar.sc_ether_type = hdr.ethernet.ether_type; - hdr.sidecar.sc_payload = (bit<128>)meta.nexthop_ipv4; + hdr.sidecar.sc_payload = (bit<128>)meta.nexthop; hdr.sidecar.setValid(); hdr.ethernet.ether_type = ETHERTYPE_SIDECAR; meta.service_routed = true; @@ -1150,7 +1187,11 @@ control Arp ( } table tbl { - key = { meta.nexthop_ipv4: exact; } + // @name required for complex key expressions (casts); provides + // the control-plane name used by Rust match_xlate. 
+ key = { + (ipv4_addr_t)meta.nexthop : exact @name("nexthop"); + } actions = { drop; request; rewrite; } default_action = request; const size = IPV4_ARP_SIZE; @@ -1183,10 +1224,11 @@ control Ndp ( action request() { hdr.sidecar.sc_code = SC_NEIGHBOR_NEEDED; + hdr.sidecar.sc_pad = 0; hdr.sidecar.sc_ingress = (bit<16>)ig_intr_md.ingress_port; hdr.sidecar.sc_egress = (bit<16>)ig_tm_md.ucast_egress_port; hdr.sidecar.sc_ether_type = hdr.ethernet.ether_type; - hdr.sidecar.sc_payload = (bit<128>)meta.nexthop_ipv6; + hdr.sidecar.sc_payload = (bit<128>)meta.nexthop; hdr.sidecar.setValid(); hdr.ethernet.ether_type = ETHERTYPE_SIDECAR; meta.service_routed = true; @@ -1198,7 +1240,7 @@ control Ndp ( } table tbl { - key = { meta.nexthop_ipv6: exact; } + key = { meta.nexthop: exact; } actions = { drop; rewrite; request; } default_action = request; const size = IPV6_NEIGHBOR_SIZE; @@ -1212,33 +1254,22 @@ control Router4 ( inout sidecar_headers_t hdr, inout sidecar_ingress_meta_t meta, in ingress_intrinsic_metadata_t ig_intr_md, - inout ingress_intrinsic_metadata_for_tm_t ig_tm_md + inout ingress_intrinsic_metadata_for_tm_t ig_tm_md, + inout route_result_t fwd ) { RouterLookupIndex4() lookup_idx; Hash>(HashAlgorithm_t.CRC8) index_hash; - action icmp_error(bit<8> type, bit<8> code) { - hdr.sidecar.sc_code = SC_ICMP_NEEDED; - hdr.sidecar.sc_ingress = (bit<16>)ig_intr_md.ingress_port; - hdr.sidecar.sc_egress = (bit<16>)ig_tm_md.ucast_egress_port; - hdr.sidecar.sc_ether_type = hdr.ethernet.ether_type; - hdr.sidecar.sc_payload = (bit<128>)type << 8 | (bit<128>)code; - hdr.sidecar.setValid(); - hdr.ethernet.ether_type = ETHERTYPE_SIDECAR; - meta.service_routed = true; - ig_tm_md.ucast_egress_port = USER_SPACE_SERVICE_PORT; - } - apply { - route4_result_t fwd; - fwd.is_v6 = false; - fwd.nexthop6 = 0; - fwd.nexthop = 0; - fwd.port = 0; + // fwd is passed in from L3Router to share PHV allocation with Router6 fwd.is_hit = false; fwd.idx = 0; fwd.slots = 0; fwd.slot = 0; + 
meta.resolve_nexthop = false; + meta.nexthop = 0; + meta.nexthop_is_v6 = false; + meta.route_ttl_is_1 = hdr.ipv4.ttl == 1; // Our route selection table is 11 bits wide, and we need 5 bits // of that for our "slot count" index. Thus, we only need 6 // bits of the 8-bit hash calculated here to complete the 11-bit @@ -1250,26 +1281,7 @@ control Router4 ( meta.l4_src_port }) & 0x3f; - lookup_idx.apply(hdr, fwd); - - if (!fwd.is_hit) { - icmp_error(ICMP_DEST_UNREACH, ICMP_DST_UNREACH_NET); - // Dont set meta.dropped because we want an error packet - // to go out. - meta.drop_reason = DROP_IPV4_UNROUTEABLE; - } else if (hdr.ipv4.ttl == 1 && !IS_SERVICE(fwd.port)) { - icmp_error(ICMP_TIME_EXCEEDED, ICMP_EXC_TTL); - // Dont set meta.dropped because we want an error packet - // to go out. - meta.drop_reason = DROP_IPV4_TTL_EXCEEDED; - } else { - hdr.ipv4.ttl = hdr.ipv4.ttl - 1; - ig_tm_md.ucast_egress_port = fwd.port; - - meta.nexthop_ipv4 = fwd.nexthop; - meta.nexthop_ipv6 = fwd.nexthop6; - meta.resolve_nexthop = true; - } + lookup_idx.apply(hdr, meta, ig_intr_md, ig_tm_md, fwd); } } @@ -1282,18 +1294,6 @@ control MulticastRouter4( ) { DirectCounter>(CounterType_t.PACKETS_AND_BYTES) ctr; - action icmp_error(bit<8> type, bit<8> code) { - hdr.sidecar.sc_code = SC_ICMP_NEEDED; - hdr.sidecar.sc_ingress = (bit<16>)ig_intr_md.ingress_port; - hdr.sidecar.sc_egress = (bit<16>)ig_tm_md.ucast_egress_port; - hdr.sidecar.sc_ether_type = hdr.ethernet.ether_type; - hdr.sidecar.sc_payload = (bit<128>)type << 8 | (bit<128>)code; - hdr.sidecar.setValid(); - hdr.ethernet.ether_type = ETHERTYPE_SIDECAR; - meta.service_routed = true; - ig_tm_md.ucast_egress_port = USER_SPACE_SERVICE_PORT; - } - action unreachable() { ctr.count(); } @@ -1333,12 +1333,12 @@ control MulticastRouter4( } if (!tbl.apply().hit) { - icmp_error(ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOROUTE); - meta.drop_reason = DROP_IPV6_UNROUTEABLE; + ICMP_ERROR_SETUP(ICMP_DEST_UNREACH, ICMP_DST_UNREACH_NET); + meta.drop_reason = 
DROP_IPV4_UNROUTEABLE; // Dont set meta.dropped because we want an error packet // to go out. } else if (hdr.ipv4.ttl == 1 && !meta.service_routed) { - icmp_error(ICMP_TIME_EXCEEDED, ICMP_EXC_TTL); + ICMP_ERROR_SETUP(ICMP_TIME_EXCEEDED, ICMP_EXC_TTL); meta.drop_reason = DROP_IPV4_TTL_INVALID; // Dont set meta.dropped because we want an error packet // to go out. @@ -1357,31 +1357,22 @@ control Router6 ( inout sidecar_headers_t hdr, inout sidecar_ingress_meta_t meta, in ingress_intrinsic_metadata_t ig_intr_md, - inout ingress_intrinsic_metadata_for_tm_t ig_tm_md + inout ingress_intrinsic_metadata_for_tm_t ig_tm_md, + inout route_result_t fwd ) { RouterLookupIndex6() lookup_idx; Hash>(HashAlgorithm_t.CRC8) index_hash; - action icmp_error(bit<8> type, bit<8> code) { - hdr.sidecar.sc_code = SC_ICMP_NEEDED; - hdr.sidecar.sc_ingress = (bit<16>)ig_intr_md.ingress_port; - hdr.sidecar.sc_egress = (bit<16>)ig_tm_md.ucast_egress_port; - hdr.sidecar.sc_ether_type = hdr.ethernet.ether_type; - hdr.sidecar.sc_payload = (bit<128>)type << 8 | (bit<128>)code; - hdr.sidecar.setValid(); - hdr.ethernet.ether_type = ETHERTYPE_SIDECAR; - meta.service_routed = true; - ig_tm_md.ucast_egress_port = USER_SPACE_SERVICE_PORT; - } - apply { - route6_result_t fwd; - fwd.nexthop = 0; - fwd.port = 0; + // fwd is passed in from L3Router to share PHV allocation with Router4 fwd.is_hit = false; fwd.idx = 0; fwd.slots = 0; fwd.slot = 0; + meta.resolve_nexthop = false; + meta.nexthop = 0; + meta.nexthop_is_v6 = false; + meta.route_ttl_is_1 = hdr.ipv6.hop_limit == 1; // Our route selection table is 11 bits wide, and we need 5 bits // of that for our "slot count" index. 
Thus, we only need 6 // bits of the 8-bit hash calculated here to complete the 11-bit @@ -1393,24 +1384,7 @@ control Router6 ( meta.l4_src_port }) & 0x3f; - lookup_idx.apply(hdr, fwd); - - if (!fwd.is_hit) { - icmp_error(ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOROUTE); - meta.drop_reason = DROP_IPV6_UNROUTEABLE; - // Dont set meta.dropped because we want an error packet - // to go out. - } else if (hdr.ipv6.hop_limit == 1 && !IS_SERVICE(fwd.port)) { - icmp_error(ICMP6_TIME_EXCEEDED, ICMP_EXC_TTL); - meta.drop_reason = DROP_IPV6_TTL_EXCEEDED; - // Dont set meta.dropped because we want an error packet - // to go out. - } else { - ig_tm_md.ucast_egress_port = fwd.port; - hdr.ipv6.hop_limit = hdr.ipv6.hop_limit - 1; - meta.resolve_nexthop = true; - meta.nexthop_ipv6 = fwd.nexthop; - } + lookup_idx.apply(hdr, meta, ig_intr_md, ig_tm_md, fwd); } } @@ -1423,18 +1397,6 @@ control MulticastRouter6 ( ) { DirectCounter>(CounterType_t.PACKETS_AND_BYTES) ctr; - action icmp_error(bit<8> type, bit<8> code) { - hdr.sidecar.sc_code = SC_ICMP_NEEDED; - hdr.sidecar.sc_ingress = (bit<16>)ig_intr_md.ingress_port; - hdr.sidecar.sc_egress = (bit<16>)ig_tm_md.ucast_egress_port; - hdr.sidecar.sc_ether_type = hdr.ethernet.ether_type; - hdr.sidecar.sc_payload = (bit<128>)type << 8 | (bit<128>)code; - hdr.sidecar.setValid(); - hdr.ethernet.ether_type = ETHERTYPE_SIDECAR; - meta.service_routed = true; - ig_tm_md.ucast_egress_port = USER_SPACE_SERVICE_PORT; - } - action unreachable() { ctr.count(); } @@ -1473,18 +1435,18 @@ control MulticastRouter6 ( } if (!tbl.apply().hit) { - icmp_error(ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOROUTE); + ICMP_ERROR_SETUP(ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOROUTE); meta.drop_reason = DROP_IPV6_UNROUTEABLE; // Dont set meta.dropped because we want an error packet // to go out. 
} else if (hdr.ipv6.hop_limit == 1) { - icmp_error(ICMP6_TIME_EXCEEDED, ICMP_EXC_TTL); + ICMP_ERROR_SETUP(ICMP6_TIME_EXCEEDED, ICMP_EXC_TTL); meta.drop_reason = DROP_IPV6_TTL_EXCEEDED; // Dont set meta.dropped because we want an error packet // to go out. } else { // Set the destination port to an invalid value - ig_tm_md.ucast_egress_port = (PortId_t)0x1ff; + ig_tm_md.ucast_egress_port = (PortId_t)0x1ff; hdr.ipv6.hop_limit = hdr.ipv6.hop_limit - 1; } } @@ -1497,59 +1459,65 @@ control L3Router( in ingress_intrinsic_metadata_t ig_intr_md, inout ingress_intrinsic_metadata_for_tm_t ig_tm_md ) { + Router4() router4; + Router6() router6; + apply { -#ifdef MULTICAST + // Shared: allocate a single route_result_t for Router4 and Router6. + // This forces the compiler to use the same PHV allocation for both, + // preventing liverange divergence under high PHV pressure. + route_result_t fwd; + fwd.is_hit = false; + fwd.ecmp_hash = 0; + fwd.idx = 0; + fwd.slots = 0; + fwd.slot = 0; + if (hdr.ipv4.isValid()) { +#ifdef MULTICAST if (meta.is_mcast && !meta.is_link_local_mcastv6) { MulticastRouter4.apply(hdr, meta, ig_intr_md, ig_tm_md); - } else { - Router4.apply(hdr, meta, ig_intr_md, ig_tm_md); + } else +#endif /* MULTICAST */ + { + router4.apply(hdr, meta, ig_intr_md, ig_tm_md, fwd); } } else if (hdr.ipv6.isValid()) { +#ifdef MULTICAST if (meta.is_mcast && !meta.is_link_local_mcastv6) { MulticastRouter6.apply(hdr, meta, ig_intr_md, ig_tm_md); - } else { - Router6.apply(hdr, meta, ig_intr_md, ig_tm_md); + } else +#endif /* MULTICAST */ + { + router6.apply(hdr, meta, ig_intr_md, ig_tm_md, fwd); } } -#else /* MULTICAST */ - if (hdr.ipv4.isValid()) { - Router4.apply(hdr, meta, ig_intr_md, ig_tm_md); - } else if (hdr.ipv6.isValid()) { - Router6.apply(hdr, meta, ig_intr_md, ig_tm_md); - } -#endif /* MULTICAST */ if (meta.resolve_nexthop) { - if (meta.nexthop_ipv4 != 0) { - Arp.apply(hdr, meta, ig_intr_md, ig_tm_md); - } else { + if (meta.nexthop_is_v6) { Ndp.apply(hdr, meta, 
ig_intr_md, ig_tm_md); + } else { + Arp.apply(hdr, meta, ig_intr_md, ig_tm_md); } } } } -/* - * XXX: this control could be moved to the Egress pipeline if we need more space - * in the Ingress pipeline. Currently unicast packets are able to bypass that - * pipeline, which is why we've tacked it on here. We could probably also merge - * it with the MacRewrite control, as they are both per-port settings, but that - * would present some weird semantics to the control plane daemon. - */ -control EgressFilter( - inout sidecar_ingress_meta_t meta, - in ingress_intrinsic_metadata_for_tm_t ig_tm_md +// Filter NAT egress traffic by port. Ports not explicitly marked as uplinks +// drop guest traffic to prevent NAT'd packets from egressing on non-uplink +// ports. Placed in the egress pipeline to avoid adding a stage to ingress. +control NatEgressFilter( + inout sidecar_egress_meta_t meta, + in egress_intrinsic_metadata_t eg_intr_md ) { action guest_traffic_not_allowed() { meta.drop_reason = DROP_NAT_EGRESS_BLOCKED; - meta.dropped = true; } action guest_traffic_allowed() { } table egress_filter { - key = { ig_tm_md.ucast_egress_port : exact; } + key = { eg_intr_md.egress_port : exact; } actions = { guest_traffic_allowed; guest_traffic_not_allowed; } const size = 256; @@ -1560,40 +1528,14 @@ control EgressFilter( egress_filter.apply(); } } - -control MacRewrite( - inout sidecar_headers_t hdr, - in PortId_t port -) { - DirectCounter>(CounterType_t.PACKETS_AND_BYTES) ctr; - - action rewrite(mac_addr_t mac) { - hdr.ethernet.src_mac = mac; - ctr.count(); - } - - table mac_rewrite { - key = { port: exact ; } - actions = { rewrite; } - - const size = 256; - counters = ctr; - } - - apply { - mac_rewrite.apply(); - } -} -#ifdef MULTICAST -/* This control is used to rewrite the source and destination MAC addresses - * for multicast packets. 
The destination MAC address is derived from the - * destination IP address, and the source MAC address is set based on the - * egress port the packet is being sent out on. +/* Rewrite the source MAC address based on the egress port. For multicast + * packets, also derive the destination MAC from the destination IP address. */ -control MulticastMacRewrite( +control MacRewrite( inout sidecar_headers_t hdr, - in PortId_t port + in PortId_t port, + in bool derive_dst_mac ) { DirectCounter>(CounterType_t.PACKETS_AND_BYTES) ctr; @@ -1611,46 +1553,32 @@ control MulticastMacRewrite( } apply { - if (mac_rewrite.apply().hit) { + bool hit = mac_rewrite.apply().hit; + // Derive multicast dst_mac only when src_mac rewrite succeeds. + if (hit && derive_dst_mac) { // Derive the destination MAC based on IP type. // IPV4: https://www.rfc-editor.org/rfc/rfc1112.html#section-6.4 // IPV6: https://www.rfc-editor.org/rfc/rfc2464.html - if (hdr.ipv4.isValid() || (!hdr.geneve.isValid() && hdr.inner_ipv4.isValid())) { + if (hdr.ipv4.isValid()) { // IPv4 multicast MAC address (01:00:5e + 23 bits of IP) - bit<48> mcast_mac = 0; - // Set the first three bytes to 01:00:5e (0x01005e) - mcast_mac = (bit<48>)0x01005e << 24; - - bit<24> ip_suffix; - // Take the last 23 bits of IPv4 address and append them - // We mask the first byte to clear the top bit - if (hdr.ipv4.isValid()) { - ip_suffix = (bit<24>)(hdr.ipv4.dst_addr & 0x007fffff); - } else { - ip_suffix = (bit<24>)(hdr.inner_ipv4.dst_addr & 0x007fffff); - } - - hdr.ethernet.dst_mac = mcast_mac | ((bit<48>)ip_suffix); - } else if (hdr.ipv6.isValid() || (!hdr.geneve.isValid() && hdr.inner_ipv6.isValid())) { + hdr.ethernet.dst_mac = + IPV4_MCAST_MAC_PREFIX ++ 1w0 ++ hdr.ipv4.dst_addr[22:0]; + } else if (!hdr.geneve.isValid() && hdr.inner_ipv4.isValid()) { + hdr.ethernet.dst_mac = + IPV4_MCAST_MAC_PREFIX ++ 1w0 ++ hdr.inner_ipv4.dst_addr[22:0]; + } else if (hdr.ipv6.isValid()) { // IPv6 multicast MAC address (33:33 + last 32 bits of IPv6) - 
bit<48> mcast_mac = 0; - // Set the first two bytes to 33:33 - mcast_mac = (bit<48>)0x3333 << 32; - - bit<48> ip_suffix; - // Take the last 32 bits of IPv6 address and append them - if (hdr.ipv6.isValid()) { - ip_suffix = (bit<48>)(hdr.ipv6.dst_addr[31:0]); - } else { - ip_suffix = (bit<48>)(hdr.inner_ipv6.dst_addr[31:0]); - } - - hdr.ethernet.dst_mac = mcast_mac | ip_suffix; + hdr.ethernet.dst_mac = + IPV6_MCAST_MAC_PREFIX ++ hdr.ipv6.dst_addr[31:0]; + } else if (!hdr.geneve.isValid() && hdr.inner_ipv6.isValid()) { + hdr.ethernet.dst_mac = + IPV6_MCAST_MAC_PREFIX ++ hdr.inner_ipv6.dst_addr[31:0]; } } } } +#ifdef MULTICAST /* This control is used to configure multicast packets for replication. * It includes actions for dropping packets with no group, allowing * source-specific multicast, and configuring multicast group IDs and hashes. @@ -1829,16 +1757,18 @@ control MulticastIngress ( NoAction; } + // Priority order: first match wins. The geneve tag entries are + // most specific (both headers valid + exact tag), followed by + // group-ID fallbacks for non-geneve multicast. const entries = { ( _, _, true, true, MULTICAST_TAG_EXTERNAL ) : invalidate_underlay_grp_and_set_decap; ( _, _, true, true, MULTICAST_TAG_UNDERLAY ) : invalidate_external_grp; ( _, _, true, true, MULTICAST_TAG_UNDERLAY_EXTERNAL ) : NoAction; ( 0, _, _, _, _ ) : invalidate_external_grp; ( _, 0, _, _, _ ) : invalidate_underlay_grp; - ( 0, 0, _, _, _ ) : invalidate_grps; } - const size = 6; + const size = 5; } // Note: SSM tables currently take one extra stage in the pipeline (17->18). 
@@ -1916,7 +1846,7 @@ control MulticastEgress ( table mcast_tag_check { key = { hdr.ipv6.isValid(): exact; - hdr.ipv6.dst_addr: ternary; + hdr.ipv6.dst_addr[127:112]: ternary @name("ipv6_scope"); hdr.geneve.isValid(): exact; hdr.geneve_opts.oxg_mcast.isValid(): exact; hdr.geneve_opts.oxg_mcast.mcast_tag: exact; @@ -1925,19 +1855,14 @@ control MulticastEgress ( actions = { NoAction; } const entries = { - // Admin-local (scope value 4): Matches IPv6 multicast addresses - // with scope ff04::/16 - ( true, IPV6_ADMIN_LOCAL_PATTERN &&& IPV6_SCOPE_MASK, true, true, 2 ) : NoAction; - // Site-local (scope value 5): Matches IPv6 multicast addresses with - // scope ff05::/16 - ( true, IPV6_SITE_LOCAL_PATTERN &&& IPV6_SCOPE_MASK, true, true, 2 ) : NoAction; - // Organization-local (scope value 8): Matches IPv6 multicast - // addresses with scope ff08::/16 - ( true, IPV6_ORG_SCOPE_PATTERN &&& IPV6_SCOPE_MASK, true, true, 2 ) : NoAction; - // ULA (Unique Local Address): Matches IPv6 addresses that start - // with fc00::/7. This is not a multicast address, but it is used - // for other internal routing purposes. 
- ( true, IPV6_ULA_PATTERN &&& IPV6_ULA_MASK, true, true, 2 ) : NoAction; + // Admin-local (scope value 4): ff04::/16 + ( true, IPV6_ADMIN_LOCAL_16 &&& IPV6_SCOPE_MASK_16, true, true, 2w2 ) : NoAction; + // Site-local (scope value 5): ff05::/16 + ( true, IPV6_SITE_LOCAL_16 &&& IPV6_SCOPE_MASK_16, true, true, 2w2 ) : NoAction; + // Organization-local (scope value 8): ff08::/16 + ( true, IPV6_ORG_SCOPE_16 &&& IPV6_SCOPE_MASK_16, true, true, 2w2 ) : NoAction; + // ULA (Unique Local Address): fd00::/8 + ( true, IPV6_ULA_16 &&& IPV6_ULA_MASK_16, true, true, 2w2 ) : NoAction; } const size = 4; @@ -2094,14 +2019,11 @@ control Ingress( NatIngress() nat_ingress; NatEgress() nat_egress; L3Router() l3_router; - EgressFilter() egress_filter; #ifdef MULTICAST MulticastIngress() mcast_ingress; #endif /* MULTICAST */ - MacRewrite() mac_rewrite; Counter, PortId_t>(512, CounterType_t.PACKETS_AND_BYTES) ingress_ctr; - Counter, PortId_t>(512, CounterType_t.PACKETS_AND_BYTES) egress_ctr; Counter, PortId_t>(512, CounterType_t.PACKETS) drop_port_ctr; Counter, bit<8>>(DROP_REASON_MAX, CounterType_t.PACKETS) drop_reason_ctr; Counter, bit<10>>(1024, CounterType_t.PACKETS) packet_ctr; @@ -2148,9 +2070,6 @@ control Ingress( if (!meta.dropped) { l3_router.apply(hdr, meta, ig_intr_md, ig_tm_md); } - if (!meta.dropped && meta.nat_egress_hit && !meta.is_mcast) { - egress_filter.apply(meta, ig_tm_md); - } } if (meta.dropped) { @@ -2159,14 +2078,18 @@ control Ingress( drop_port_ctr.count(ig_intr_md.ingress_port); drop_reason_ctr.count(meta.drop_reason); } else if (!meta.is_mcast) { - egress_ctr.count(ig_tm_md.ucast_egress_port); - if (ig_tm_md.ucast_egress_port != USER_SPACE_SERVICE_PORT) { - mac_rewrite.apply(hdr, ig_tm_md.ucast_egress_port); - } - meta.bridge_hdr.setInvalid(); - ig_tm_md.bypass_egress = 1w1; + // Unicast packets proceed to egress for MAC rewrite. + // Counted by unicast_ctr in Egress for consistency. } + // Pass state to egress via bridge header. 
+ if (meta.is_mcast && !meta.is_link_local_mcastv6) { + meta.bridge_hdr.is_mcast_routed = true; + } else { + meta.bridge_hdr.is_mcast_routed = false; + } + meta.bridge_hdr.nat_egress_hit = meta.nat_egress_hit; + if (meta.encap_needed) { // This works around a few things which cropped up in // supporting several concurrent Geneve options: @@ -2294,70 +2217,129 @@ control Egress( inout egress_intrinsic_metadata_for_deparser_t eg_dprsr_md, inout egress_intrinsic_metadata_for_output_port_t eg_oport_md ) { + // Separate MacRewrite instances for unicast and multicast paths. + // The P4 compiler requires each table to have a single deterministic + // next-table chain. Using one instance from multiple control flow + // paths causes "incompatible next-table chains" errors. Separate + // instances also provide distinct DirectCounters for traffic accounting. + NatEgressFilter() egress_filter; + MacRewrite() unicast_mac_rewrite; #ifdef MULTICAST - MulticastMacRewrite() mac_rewrite; + MacRewrite() mcast_mac_rewrite; MulticastEgress() mcast_egress; +#endif /* MULTICAST */ + Counter, PortId_t>(512, CounterType_t.PACKETS_AND_BYTES) forwarded_ctr; Counter, PortId_t>(512, CounterType_t.PACKETS_AND_BYTES) unicast_ctr; - Counter, PortId_t>(512, CounterType_t.PACKETS_AND_BYTES) mcast_ctr; Counter, PortId_t>(512, CounterType_t.PACKETS_AND_BYTES) link_local_mcast_ctr; - Counter, PortId_t>(512, CounterType_t.PACKETS_AND_BYTES) external_mcast_ctr; - Counter, PortId_t>(512, CounterType_t.PACKETS_AND_BYTES) underlay_mcast_ctr; Counter, PortId_t>(512, CounterType_t.PACKETS) drop_port_ctr; Counter, bit<8>>(DROP_REASON_MAX, CounterType_t.PACKETS) drop_reason_ctr; +#ifdef MULTICAST + Counter, PortId_t>(512, CounterType_t.PACKETS_AND_BYTES) mcast_ctr; + Counter, PortId_t>(512, CounterType_t.PACKETS_AND_BYTES) external_mcast_ctr; + Counter, PortId_t>(512, CounterType_t.PACKETS_AND_BYTES) underlay_mcast_ctr; +#endif /* MULTICAST */ apply { - // Check multicast egress packets by checking that RID is 
not 0. - bool is_egress_rid_mcast = eg_intr_md.egress_rid > 0; - // We track IPv6 multicast packets separately for counters. + // Link-local IPv6 multicast: ff02::/16 scope prefix. bool is_link_local_ipv6_mcast = false; if (hdr.ipv6.isValid()) { - bit<16> ipv6_prefix = (bit<16>)hdr.ipv6.dst_addr[127:112]; - is_link_local_ipv6_mcast = (ipv6_prefix == 16w0xff02); + if (hdr.ipv6.dst_addr[127:112] == IPV6_LINK_LOCAL_16) { + is_link_local_ipv6_mcast = true; + } } - bool is_mcast = is_egress_rid_mcast || is_link_local_ipv6_mcast; - if (is_egress_rid_mcast == true) { - if (meta.bridge_hdr.ingress_port == eg_intr_md.egress_port) { - // If the ingress port is the same as the egress port, drop - // the packet +#ifdef MULTICAST + // Multicast state from bridge header. + PortId_t ingress_port = meta.bridge_hdr.ingress_port; + bit<2> mcast_tag; + if (hdr.geneve_opts.oxg_mcast.isValid()) { + mcast_tag = hdr.geneve_opts.oxg_mcast.mcast_tag; + } else { + mcast_tag = MULTICAST_TAG_INVALID; + } + bool is_mcast_routed = meta.bridge_hdr.is_mcast_routed; + + if (eg_intr_md.egress_rid != 0) { + // Replicated multicast packet (egress_rid > 0 from PRE). + if (ingress_port == eg_intr_md.egress_port) { + // Drop if ingress port equals egress port (path filter). meta.drop_reason = DROP_MULTICAST_PATH_FILTERED; } else { mcast_egress.apply(hdr, meta, eg_intr_md, eg_dprsr_md); - mac_rewrite.apply(hdr, eg_intr_md.egress_port); + mcast_mac_rewrite.apply(hdr, eg_intr_md.egress_port, true); } - } else if (eg_intr_md.egress_rid == 0 && - eg_intr_md.egress_rid_first == 1) { - // Drop CPU copies (RID=0) to prevent unwanted packets on port 0 + } else if (is_mcast_routed) { + // CPU copy: routed to multicast but egress_rid == 0. + eg_dprsr_md.drop_ctl = 1; meta.drop_reason = DROP_MULTICAST_CPU_COPY; + } else { + // Unicast: check egress filter, then rewrite src_mac. 
+ if (meta.bridge_hdr.nat_egress_hit) { + egress_filter.apply(meta, eg_intr_md); + } + if (meta.drop_reason == 0 && + eg_intr_md.egress_port != USER_SPACE_SERVICE_PORT && + !is_link_local_ipv6_mcast) { + unicast_mac_rewrite.apply(hdr, eg_intr_md.egress_port, false); + } + } +#else /* MULTICAST */ + // Non-multicast: check egress filter for NAT traffic, + // then rewrite src_mac. + if (meta.bridge_hdr.nat_egress_hit) { + egress_filter.apply(meta, eg_intr_md); } + if (meta.drop_reason == 0 && + eg_intr_md.egress_port != USER_SPACE_SERVICE_PORT && + !is_link_local_ipv6_mcast) { + unicast_mac_rewrite.apply(hdr, eg_intr_md.egress_port, false); + } +#endif /* MULTICAST */ + // Shared: drop and forwarded counting if (meta.drop_reason != 0) { - // Handle dropped packets drop_port_ctr.count(eg_intr_md.egress_port); drop_reason_ctr.count(meta.drop_reason); eg_dprsr_md.drop_ctl = 1; - } else if (is_mcast == true) { - mcast_ctr.count(eg_intr_md.egress_port); + } else { + forwarded_ctr.count(eg_intr_md.egress_port); +#ifdef MULTICAST + // Multicast-specific counting. Use the mcast_tag + // local (captured before egress decap may strip + // geneve headers) rather than re-checking header + // validity. 
+ if (is_mcast_routed) { + mcast_ctr.count(eg_intr_md.egress_port); + if (mcast_tag == MULTICAST_TAG_UNDERLAY) { + underlay_mcast_ctr.count( + eg_intr_md.egress_port); + } else if (mcast_tag == MULTICAST_TAG_EXTERNAL) { + external_mcast_ctr.count( + eg_intr_md.egress_port); + } else if (mcast_tag == MULTICAST_TAG_UNDERLAY_EXTERNAL) { + underlay_mcast_ctr.count( + eg_intr_md.egress_port); + external_mcast_ctr.count( + eg_intr_md.egress_port); + } + } else if (is_link_local_ipv6_mcast) { + mcast_ctr.count(eg_intr_md.egress_port); + link_local_mcast_ctr.count(eg_intr_md.egress_port); + } else { + unicast_ctr.count(eg_intr_md.egress_port); + } +#else /* MULTICAST */ + // Non-multicast counter increments if (is_link_local_ipv6_mcast) { link_local_mcast_ctr.count(eg_intr_md.egress_port); - } else if (hdr.geneve.isValid()) { - external_mcast_ctr.count(eg_intr_md.egress_port); - } else if (hdr.geneve.isValid() && - hdr.geneve_opts.oxg_mcast.isValid() && - hdr.geneve_opts.oxg_mcast.mcast_tag == MULTICAST_TAG_UNDERLAY) { - underlay_mcast_ctr.count(eg_intr_md.egress_port); + } else { + unicast_ctr.count(eg_intr_md.egress_port); } - } else { - // non-multicast packets should bypass the egress - // pipeline, so we would expect this to be 0. - unicast_ctr.count(eg_intr_md.egress_port); +#endif /* MULTICAST */ } } -#else /* MULTICAST */ - apply { } -#endif /* MULTICAST */ } control EgressDeparser( diff --git a/dpd/src/counters.rs b/dpd/src/counters.rs index 2c8e42a4..cc44e67a 100644 --- a/dpd/src/counters.rs +++ b/dpd/src/counters.rs @@ -51,26 +51,22 @@ pub struct Counter { enum CounterId { Service, Ingress, - Egress, Packet, DropPort, DropReason, - #[cfg(feature = "multicast")] + Forwarded, + Unicast, + /// Link-local IPv6 multicast (ff02::/16). Not feature-gated because + /// link-local forwarding uses standard routing, not replication groups. 
+ MulticastLL, EgressDropPort, - #[cfg(feature = "multicast")] EgressDropReason, #[cfg(feature = "multicast")] - Unicast, - #[cfg(feature = "multicast")] Multicast, #[cfg(feature = "multicast")] MulticastExt, #[cfg(feature = "multicast")] - MulticastLL, - #[cfg(feature = "multicast")] MulticastUL, - #[cfg(feature = "multicast")] - MulticastDrop, } impl From for u8 { @@ -93,7 +89,7 @@ struct CounterDescription { p4_name: &'static str, } -const BASE_COUNTERS: [CounterDescription; 6] = [ +const BASE_COUNTERS: [CounterDescription; 10] = [ CounterDescription { id: CounterId::Service, client_name: "Service", @@ -109,11 +105,6 @@ const BASE_COUNTERS: [CounterDescription; 6] = [ client_name: "Packet", p4_name: "pipe.Ingress.packet_ctr", }, - CounterDescription { - id: CounterId::Egress, - client_name: "Egress", - p4_name: "pipe.Ingress.egress_ctr", - }, CounterDescription { id: CounterId::DropPort, client_name: "Ingress_Drop_Port", @@ -124,12 +115,21 @@ const BASE_COUNTERS: [CounterDescription; 6] = [ client_name: "Ingress_Drop_Reason", p4_name: "pipe.Ingress.drop_reason_ctr", }, -]; - -#[cfg(not(feature = "multicast"))] -const MULTICAST_COUNTERS: [CounterDescription; 0] = []; -#[cfg(feature = "multicast")] -const MULTICAST_COUNTERS: [CounterDescription; 8] = [ + CounterDescription { + id: CounterId::Forwarded, + client_name: "Forwarded", + p4_name: "pipe.Egress.forwarded_ctr", + }, + CounterDescription { + id: CounterId::Unicast, + client_name: "Unicast", + p4_name: "pipe.Egress.unicast_ctr", + }, + CounterDescription { + id: CounterId::MulticastLL, + client_name: "Multicast_Link_Local", + p4_name: "pipe.Egress.link_local_mcast_ctr", + }, CounterDescription { id: CounterId::EgressDropPort, client_name: "Egress_Drop_Port", @@ -140,11 +140,12 @@ const MULTICAST_COUNTERS: [CounterDescription; 8] = [ client_name: "Egress_Drop_Reason", p4_name: "pipe.Egress.drop_reason_ctr", }, - CounterDescription { - id: CounterId::Unicast, - client_name: "Unicast", - p4_name: 
"pipe.Egress.unicast_ctr", - }, +]; + +#[cfg(not(feature = "multicast"))] +const MULTICAST_COUNTERS: [CounterDescription; 0] = []; +#[cfg(feature = "multicast")] +const MULTICAST_COUNTERS: [CounterDescription; 3] = [ CounterDescription { id: CounterId::Multicast, client_name: "Multicast", @@ -155,21 +156,11 @@ const MULTICAST_COUNTERS: [CounterDescription; 8] = [ client_name: "Multicast_External", p4_name: "pipe.Egress.external_mcast_ctr", }, - CounterDescription { - id: CounterId::MulticastLL, - client_name: "Multicast_Link_Local", - p4_name: "pipe.Egress.link_local_mcast_ctr", - }, CounterDescription { id: CounterId::MulticastUL, client_name: "Multicast_Underlay", p4_name: "pipe.Egress.underlay_mcast_ctr", }, - CounterDescription { - id: CounterId::MulticastDrop, - client_name: "Multicast_Drop", - p4_name: "pipe.Ingress.filter.drop_mcast_ctr", - }, ]; /// Get the list of names by which end users can refer to a counter. @@ -435,20 +426,19 @@ pub async fn get_values( let key = match counter_id { CounterId::Packet => packet_label(idx.idx), CounterId::Service => service_label(idx.idx as u8), - CounterId::Ingress | CounterId::Egress | CounterId::DropPort => { - port_label(switch, idx.idx).await - } - CounterId::DropReason => reason_label(idx.idx as u8)?, - #[cfg(feature = "multicast")] - CounterId::EgressDropPort + CounterId::Ingress + | CounterId::DropPort + | CounterId::Forwarded | CounterId::Unicast - | CounterId::Multicast - | CounterId::MulticastExt | CounterId::MulticastLL - | CounterId::MulticastUL - | CounterId::MulticastDrop => port_label(switch, idx.idx).await, + | CounterId::EgressDropPort => port_label(switch, idx.idx).await, + CounterId::DropReason | CounterId::EgressDropReason => { + reason_label(idx.idx as u8)? 
+ } #[cfg(feature = "multicast")] - CounterId::EgressDropReason => reason_label(idx.idx as u8)?, + CounterId::Multicast + | CounterId::MulticastExt + | CounterId::MulticastUL => port_label(switch, idx.idx).await, }; if let Some(key) = key { diff --git a/dpd/src/link.rs b/dpd/src/link.rs index 4dadb56d..950afd8f 100644 --- a/dpd/src/link.rs +++ b/dpd/src/link.rs @@ -1574,6 +1574,7 @@ fn set_mac_config( )?; mcast::mcast_egress::add_port_mapping_entry(switch, asic_id)?; } + Ok(()) } @@ -1587,6 +1588,7 @@ fn clear_mac_config(switch: &Switch, asic_id: AsicId) -> DpdResult<()> { )?; mcast::mcast_egress::del_port_mapping_entry(switch, asic_id)?; } + Ok(()) } diff --git a/dpd/src/macaddrs.rs b/dpd/src/macaddrs.rs index ecf48d94..42d86aaa 100644 --- a/dpd/src/macaddrs.rs +++ b/dpd/src/macaddrs.rs @@ -427,7 +427,7 @@ impl Switch { assert_eq!(mgr.set_base_mac(temp_mac)?, None); } - // Reset ingress and egress MAC tables and Port ID table(s). + // Reset egress MAC table and Port ID table(s). MacOps::::reset(self)?; #[cfg(feature = "multicast")] { diff --git a/dpd/src/route.rs b/dpd/src/route.rs index 5cbe8b50..266f2a22 100644 --- a/dpd/src/route.rs +++ b/dpd/src/route.rs @@ -126,6 +126,21 @@ use oxnet::{IpNet, Ipv4Net, Ipv6Net}; const MAX_TARGETS_IPV4: usize = 32; const MAX_TARGETS_IPV6: usize = 32; +// Each route index maps to 2 physical entries in the forward table: +// one for normal forwarding (TTL > 1) and one for TTL exceeded (TTL == 1). +const ROUTE_FWD_ENTRIES_PER_ROUTE: u32 = 2; + +/// Convert a P4 table size to freemap size, accounting for the fact that each +/// logical route uses multiple physical entries. 
+fn freemap_size_from_table(table_size: u32) -> DpdResult { + let logical_routes = table_size / ROUTE_FWD_ENTRIES_PER_ROUTE; + u16::try_from(logical_routes).map_err(|_| { + DpdError::Invalid(format!( + "route table size {table_size} exceeds maximum supported" + )) + }) +} + #[derive(Clone, Debug, PartialEq, Eq)] struct Route { tag: String, @@ -515,18 +530,19 @@ fn add_route_locked( ) -> DpdResult<()> { info!(switch.log, "adding route {subnet} -> {:?}", route.tgt_ip); - // Verify that the slot freelist has been initialized + // Verify that the slot freelist has been initialized. + // The freemap tracks logical route indices, not physical table entries. + // Since each route uses ROUTE_FWD_ENTRIES_PER_ROUTE physical entries + // (normal forward + TTL exceeded), we divide the table size accordingly. let max_targets; if subnet.is_ipv4() { max_targets = MAX_TARGETS_IPV4; - route_data.v4_freemap.maybe_init( - switch.table_size(table::TableType::RouteFwdIpv4)? as u16, - ); + let table_size = switch.table_size(table::TableType::RouteFwdIpv4)?; + route_data.v4_freemap.maybe_init(freemap_size_from_table(table_size)?); } else { max_targets = MAX_TARGETS_IPV6; - route_data.v6_freemap.maybe_init( - switch.table_size(table::TableType::RouteFwdIpv6)? 
as u16, - ); + let table_size = switch.table_size(table::TableType::RouteFwdIpv6)?; + route_data.v6_freemap.maybe_init(freemap_size_from_table(table_size)?); } // Get the old set of targets that we'll be adding to diff --git a/dpd/src/table/arp_ipv4.rs b/dpd/src/table/arp_ipv4.rs index 2603b2f8..3e7c0294 100644 --- a/dpd/src/table/arp_ipv4.rs +++ b/dpd/src/table/arp_ipv4.rs @@ -18,7 +18,7 @@ pub const TABLE_NAME: &str = "pipe.Ingress.l3_router.Arp.tbl"; #[derive(MatchParse, Hash)] struct MatchKey { - #[match_xlate(name = "nexthop_ipv4")] + #[match_xlate(name = "nexthop")] ip: Ipv4Addr, } diff --git a/dpd/src/table/mcast/mcast_port_mac.rs b/dpd/src/table/mcast/mcast_port_mac.rs index 687d35af..efefaa82 100644 --- a/dpd/src/table/mcast/mcast_port_mac.rs +++ b/dpd/src/table/mcast/mcast_port_mac.rs @@ -2,14 +2,14 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/ // -// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company //! Table operations for multicast port MAC entries. use crate::table::{MacTable, TableType}; -/// Table for multicast port MAC entries. -pub const TABLE_NAME: &str = "pipe.Egress.mac_rewrite.mac_rewrite"; +/// Table name for multicast port MAC rewrite entries. +pub const TABLE_NAME: &str = "pipe.Egress.mcast_mac_rewrite.mac_rewrite"; /// Table for multicast port MAC entries. pub struct PortMacTable; diff --git a/dpd/src/table/mcast/mcast_replication.rs b/dpd/src/table/mcast/mcast_replication.rs index e815c92a..65570a8c 100644 --- a/dpd/src/table/mcast/mcast_replication.rs +++ b/dpd/src/table/mcast/mcast_replication.rs @@ -5,6 +5,10 @@ // Copyright 2026 Oxide Computer Company //! Table operations for multicast replication information. +//! +//! Only IPv6 replication groups are managed here. IPv4 multicast uses +//! direct forwarding via the `MulticastRouter4` P4 control without +//! replication group tracking. 
use std::net::Ipv6Addr; diff --git a/dpd/src/table/mcast/mod.rs b/dpd/src/table/mcast/mod.rs index ee3ff5b5..600f7c7a 100644 --- a/dpd/src/table/mcast/mod.rs +++ b/dpd/src/table/mcast/mod.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/ // -// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company //! Multicast table operations. diff --git a/dpd/src/table/mod.rs b/dpd/src/table/mod.rs index 749679ae..0bce7dd8 100644 --- a/dpd/src/table/mod.rs +++ b/dpd/src/table/mod.rs @@ -30,6 +30,26 @@ pub mod route_ipv4; pub mod route_ipv6; pub mod uplink; +// The service port is the CPU/userspace port. Routes targeting this port should +// forward packets even when TTL==1, bypassing the normal TTL exceeded handling. +// This matches the P4 behavior: `ttl == 1 && !IS_SERVICE(fwd.port)`. +// +// Port values match USER_SPACE_SERVICE_PORT in sidecar.p4: +// - Tofino2 (tofino_asic/softnpu/tofino_stub): port 0 +// - Tofino1 (chaos): port 192 +#[cfg(any( + feature = "softnpu", + feature = "tofino_asic", + feature = "tofino_stub" +))] +pub const SERVICE_PORT: u16 = 0; +#[cfg(not(any( + feature = "softnpu", + feature = "tofino_asic", + feature = "tofino_stub" +)))] +pub const SERVICE_PORT: u16 = 192; + const BASE_TABLES: [(&str, TableType); 15] = [ (route_ipv4::INDEX_TABLE_NAME, TableType::RouteIdxIpv4), (route_ipv4::FORWARD_TABLE_NAME, TableType::RouteFwdIpv4), @@ -395,6 +415,12 @@ pub fn get_counters( mcast::mcast_route::ipv6_counter_fetch(switch, force_sync) } #[cfg(feature = "multicast")] + TableType::PortMacMcast => { + MacOps::::counter_fetch( + switch, force_sync, + ) + } + #[cfg(feature = "multicast")] TableType::McastEgressDecapPorts => { mcast::mcast_egress::bitmap_counter_fetch(switch, force_sync) } @@ -402,12 +428,6 @@ pub fn get_counters( TableType::McastEgressPortMapping => { mcast::mcast_egress::port_mapping_counter_fetch(switch, force_sync) } - 
#[cfg(feature = "multicast")] - TableType::PortMacMcast => { - MacOps::::counter_fetch( - switch, force_sync, - ) - } } } diff --git a/dpd/src/table/neighbor_ipv6.rs b/dpd/src/table/neighbor_ipv6.rs index e06425c5..4a9d8c57 100644 --- a/dpd/src/table/neighbor_ipv6.rs +++ b/dpd/src/table/neighbor_ipv6.rs @@ -20,7 +20,7 @@ pub const TABLE_NAME: &str = "pipe.Ingress.l3_router.Ndp.tbl"; #[derive(MatchParse, Hash)] struct MatchKey { - #[match_xlate(name = "nexthop_ipv6")] + #[match_xlate(name = "nexthop")] ip: Ipv6Addr, } diff --git a/dpd/src/table/port_mac.rs b/dpd/src/table/port_mac.rs index 267a12c9..cc16b56a 100644 --- a/dpd/src/table/port_mac.rs +++ b/dpd/src/table/port_mac.rs @@ -2,11 +2,12 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/ // -// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company use super::{MacTable, TableType}; -pub const TABLE_NAME: &str = "pipe.Ingress.mac_rewrite.mac_rewrite"; +// Unicast MAC rewrite table (always present in both MULTICAST and non-MULTICAST P4) +pub const TABLE_NAME: &str = "pipe.Egress.unicast_mac_rewrite.mac_rewrite"; pub struct PortMacTable; diff --git a/dpd/src/table/route_ipv4.rs b/dpd/src/table/route_ipv4.rs index 91fbff57..d42a8c21 100644 --- a/dpd/src/table/route_ipv4.rs +++ b/dpd/src/table/route_ipv4.rs @@ -9,6 +9,7 @@ use std::net::Ipv4Addr; use std::net::Ipv6Addr; use crate::Switch; +use crate::table::SERVICE_PORT; use crate::table::*; use aal::ActionParse; use aal::MatchParse; @@ -18,19 +19,21 @@ use slog::error; use slog::info; pub const INDEX_TABLE_NAME: &str = - "pipe.Ingress.l3_router.Router4.lookup_idx.lookup"; + "pipe.Ingress.l3_router.router4.lookup_idx.lookup"; pub const FORWARD_TABLE_NAME: &str = - "pipe.Ingress.l3_router.Router4.lookup_idx.route"; + "pipe.Ingress.l3_router.router4.lookup_idx.route"; -// Used for indentifying entries in the index->route_data table +// Used for identifying 
entries in the index->route_data table #[derive(MatchParse, Hash, Debug)] struct IndexKey { #[match_xlate(type = "value")] idx: u16, + #[match_xlate(name = "route_ttl_is_1", type = "value")] + route_ttl_is_1: bool, } // Route entries stored in the index->route_data table -#[derive(ActionParse, Debug)] +#[derive(ActionParse, Debug, Clone, Copy)] enum RouteAction { #[action_xlate(name = "forward")] Forward { port: u16, nexthop: Ipv4Addr }, @@ -40,6 +43,8 @@ enum RouteAction { ForwardVlan { port: u16, nexthop: Ipv4Addr, vlan_id: u16 }, #[action_xlate(name = "forward_vlan_v6")] ForwardVlanV6 { port: u16, nexthop: Ipv6Addr, vlan_id: u16 }, + #[action_xlate(name = "ttl_exceeded")] + TtlExceeded, } // Used to identify entries in the route->index table @@ -70,17 +75,17 @@ pub fn add_route_index( match s.table_entry_add(TableType::RouteIdxIpv4, &match_key, &action_data) { Ok(()) => { info!(s.log, "added ipv4 route index"; - "route" => %cidr, - "index" => %idx, - "slots" => %slots); + "route" => %cidr, + "index" => %idx, + "slots" => %slots); Ok(()) } Err(e) => { error!(s.log, "failed to add ipv4 route index"; - "route" => %cidr, - "index" => %idx, - "slots" => %slots, - "error" => %e); + "route" => %cidr, + "index" => %idx, + "slots" => %slots, + "error" => %e); Err(e) } } @@ -94,8 +99,8 @@ pub fn delete_route_index(s: &Switch, cidr: &Ipv4Net) -> DpdResult<()> { .map(|_| info!(s.log, "deleted ipv4 index"; "route" => %cidr)) .map_err(|e| { error!(s.log, "failed to delete ipv4 index"; - "route" => %cidr, - "error" => %e); + "route" => %cidr, + "error" => %e); e }) } @@ -108,7 +113,7 @@ pub fn add_route_target( nexthop: Ipv4Addr, vlan_id: Option, ) -> DpdResult<()> { - let match_key = IndexKey { idx }; + let match_key = IndexKey { idx, route_ttl_is_1: false }; let action_data = match vlan_id { None => RouteAction::Forward { port, nexthop }, Some(vlan_id) => { @@ -120,24 +125,64 @@ pub fn add_route_target( match s.table_entry_add(TableType::RouteFwdIpv4, &match_key, 
&action_data) { Ok(()) => { info!(s.log, "added ipv4 route entry"; - "index" => idx, - "port" => port, - "nexthop" => %nexthop, - "vlan_id" => ?vlan_id); - Ok(()) + "index" => idx, + "port" => port, + "nexthop" => %nexthop, + "vlan_id" => ?vlan_id); + add_ttl_entry(s, idx, &match_key, &action_data, port) } Err(e) => { error!(s.log, "failed to add ipv4 route entry"; - "index" => idx, - "port" => port, - "nexthop" => %nexthop, - "error" => %e); + "index" => idx, + "port" => port, + "nexthop" => %nexthop, + "error" => %e); Err(e) } } } -// Add a target into the route_data table at the given index +// Add the TTL==1 entry for a route target. +// +// For service port routes, we forward even when TTL==1 (bypassing ICMP TTL exceeded). +// For all other routes, we trigger TTL exceeded handling. +// This matches the P4 behavior: `ttl == 1 && !IS_SERVICE(fwd.port)`. +fn add_ttl_entry( + s: &Switch, + idx: u16, + forward_key: &IndexKey, + forward_action: &RouteAction, + port: u16, +) -> DpdResult<()> { + let ttl_match_key = IndexKey { idx, route_ttl_is_1: true }; + + // Service port routes forward even with TTL==1 + let ttl_action = if port == SERVICE_PORT { + *forward_action + } else { + RouteAction::TtlExceeded + }; + + if let Err(e) = + s.table_entry_add(TableType::RouteFwdIpv4, &ttl_match_key, &ttl_action) + { + error!(s.log, "failed to add ipv4 ttl entry"; + "index" => idx, + "error" => %e); + if let Err(cleanup_err) = + s.table_entry_del(TableType::RouteFwdIpv4, forward_key) + { + error!(s.log, "failed to clean up ipv4 route entry"; + "index" => idx, + "error" => %cleanup_err); + } + return Err(e); + } + Ok(()) +} + +// Add a target with IPv6 nexthop into the route_data table at the given index +// (used for v4-over-v6 routing) pub fn add_route_target_v6( s: &Switch, idx: u16, @@ -145,7 +190,7 @@ pub fn add_route_target_v6( nexthop: Ipv6Addr, vlan_id: Option, ) -> DpdResult<()> { - let match_key = IndexKey { idx }; + let match_key = IndexKey { idx, route_ttl_is_1: false 
}; let action_data = match vlan_id { None => RouteAction::ForwardV6 { port, nexthop }, Some(vlan_id) => { @@ -156,36 +201,50 @@ pub fn add_route_target_v6( match s.table_entry_add(TableType::RouteFwdIpv4, &match_key, &action_data) { Ok(()) => { - info!(s.log, "added ipv4 route entry"; - "index" => idx, - "port" => port, - "nexthop" => %nexthop, - "vlan_id" => ?vlan_id); - Ok(()) + info!(s.log, "added ipv4 route entry (v6 nexthop)"; + "index" => idx, + "port" => port, + "nexthop" => %nexthop, + "vlan_id" => ?vlan_id); + add_ttl_entry(s, idx, &match_key, &action_data, port) } Err(e) => { - error!(s.log, "failed to add ipv4 route entry"; - "index" => idx, - "port" => port, - "nexthop" => %nexthop, - "error" => %e); + error!(s.log, "failed to add ipv4 route entry (v6 nexthop)"; + "index" => idx, + "port" => port, + "nexthop" => %nexthop, + "error" => %e); Err(e) } } } -// Remove the route data at the given index +// Remove the route data at the given index (both forward and ttl_exceeded entries). +// The main entry (route_ttl_is_1=false) is deleted first, followed by its TTL==1 +// companion entry. A failure to delete either entry is logged and returned to +// the caller as an error. pub fn delete_route_target(s: &Switch, idx: u16) -> DpdResult<()> { - let match_key = IndexKey { idx }; + // Delete the main entry first (route_ttl_is_1=false). + let main_key = IndexKey { idx, route_ttl_is_1: false }; + if let Err(e) = s.table_entry_del(TableType::RouteFwdIpv4, &main_key) { + error!(s.log, "failed to delete ipv4 route entry"; + "index" => %idx, + "error" => %e); + return Err(e); + } + info!(s.log, "deleted ipv4 route entry"; "index" => %idx); - s.table_entry_del(TableType::RouteFwdIpv4, &match_key) - .map(|_| info!(s.log, "deleted ipv4 route entry"; "index" => %idx)) - .map_err(|e| { - error!(s.log, "failed to delete ipv4 route entry"; - "index" => %idx, - "error" => %e); - e - }) + // Delete the TTL==1 companion entry. 
+ let ttl_key = IndexKey { idx, route_ttl_is_1: true }; + if let Err(e) = s.table_entry_del(TableType::RouteFwdIpv4, &ttl_key) { + error!(s.log, "failed to delete ipv4 route ttl==1 entry"; + "index" => %idx, + "error" => %e); + return Err(e); + } + info!(s.log, "deleted ipv4 route ttl==1 entry"; "index" => %idx); + + Ok(()) } pub fn forward_dump(s: &Switch) -> DpdResult { @@ -215,14 +274,14 @@ pub fn reset(s: &Switch) -> DpdResult<()> { .map(|_| info!(s.log, "reset ipv4 route-index table")) .map_err(|e| { error!(s.log, "failed to clear ipv4 route-index table"; - "error" => %e); + "error" => %e); e })?; s.table_clear(TableType::RouteFwdIpv4) .map(|_| info!(s.log, "reset ipv4 route-data table")) .map_err(|e| { error!(s.log, "failed to clear ipv4 route-data table"; - "error" => %e); + "error" => %e); e }) } diff --git a/dpd/src/table/route_ipv6.rs b/dpd/src/table/route_ipv6.rs index 33946d1d..ed672bc8 100644 --- a/dpd/src/table/route_ipv6.rs +++ b/dpd/src/table/route_ipv6.rs @@ -8,6 +8,7 @@ use std::convert::TryInto; use std::net::Ipv6Addr; use crate::Switch; +use crate::table::SERVICE_PORT; use crate::table::*; use aal::ActionParse; use aal::MatchParse; @@ -17,24 +18,28 @@ use slog::error; use slog::info; pub const INDEX_TABLE_NAME: &str = - "pipe.Ingress.l3_router.Router6.lookup_idx.lookup"; + "pipe.Ingress.l3_router.router6.lookup_idx.lookup"; pub const FORWARD_TABLE_NAME: &str = - "pipe.Ingress.l3_router.Router6.lookup_idx.route"; + "pipe.Ingress.l3_router.router6.lookup_idx.route"; -// Used for indentifying entries in the index->route_data table +// Used for identifying entries in the index->route_data table #[derive(MatchParse, Hash, Debug)] struct IndexKey { #[match_xlate(type = "value")] idx: u16, + #[match_xlate(name = "route_ttl_is_1", type = "value")] + route_ttl_is_1: bool, } // Route entries stored in the index->route_data table -#[derive(ActionParse, Debug)] +#[derive(ActionParse, Debug, Clone, Copy)] enum RouteAction { #[action_xlate(name = "forward")] 
Forward { port: u16, nexthop: Ipv6Addr }, #[action_xlate(name = "forward_vlan")] ForwardVlan { port: u16, nexthop: Ipv6Addr, vlan_id: u16 }, + #[action_xlate(name = "ttl_exceeded")] + TtlExceeded, } // Used to identify entries in the route->index table @@ -103,7 +108,7 @@ pub fn add_route_target( nexthop: Ipv6Addr, vlan_id: Option, ) -> DpdResult<()> { - let match_key = IndexKey { idx }; + let match_key = IndexKey { idx, route_ttl_is_1: false }; let action_data = match vlan_id { None => RouteAction::Forward { port, nexthop }, Some(vlan_id) => { @@ -119,7 +124,7 @@ pub fn add_route_target( "port" => port, "nexthop" => %nexthop, "vlan_id" => ?vlan_id); - Ok(()) + add_ttl_entry(s, idx, &match_key, &action_data, port) } Err(e) => { error!(s.log, "failed to add ipv6 route entry"; @@ -132,18 +137,70 @@ pub fn add_route_target( } } -// Remove the route data at the given index +// Add the TTL==1 entry for a route target. +// +// For service port routes, we forward even when TTL==1 (bypassing ICMP TTL exceeded). +// For all other routes, we trigger TTL exceeded handling. +// This matches the P4 behavior: `ttl == 1 && !IS_SERVICE(fwd.port)`. +fn add_ttl_entry( + s: &Switch, + idx: u16, + forward_key: &IndexKey, + forward_action: &RouteAction, + port: u16, +) -> DpdResult<()> { + let ttl_match_key = IndexKey { idx, route_ttl_is_1: true }; + + let ttl_action = if port == SERVICE_PORT { + *forward_action + } else { + RouteAction::TtlExceeded + }; + + if let Err(e) = + s.table_entry_add(TableType::RouteFwdIpv6, &ttl_match_key, &ttl_action) + { + error!(s.log, "failed to add ipv6 ttl entry"; + "index" => idx, + "error" => %e); + if let Err(cleanup_err) = + s.table_entry_del(TableType::RouteFwdIpv6, forward_key) + { + error!(s.log, "failed to clean up ipv6 route entry"; + "index" => idx, + "error" => %cleanup_err); + } + return Err(e); + } + Ok(()) +} + +// Remove the route data at the given index (both forward and ttl_exceeded entries). 
+// The main entry (route_ttl_is_1=false) is deleted first, followed by its TTL==1 +// companion entry. A failure to delete either entry is logged and returned to +// the caller as an error. pub fn delete_route_target(s: &Switch, idx: u16) -> DpdResult<()> { - let match_key = IndexKey { idx }; + // Delete the main entry first (route_ttl_is_1=false). + let main_key = IndexKey { idx, route_ttl_is_1: false }; + if let Err(e) = s.table_entry_del(TableType::RouteFwdIpv6, &main_key) { + error!(s.log, "failed to delete ipv6 route entry"; + "index" => %idx, + "error" => %e); + return Err(e); + } + info!(s.log, "deleted ipv6 route entry"; "index" => %idx); + + // Delete the TTL==1 companion entry. + let ttl_key = IndexKey { idx, route_ttl_is_1: true }; + if let Err(e) = s.table_entry_del(TableType::RouteFwdIpv6, &ttl_key) { + error!(s.log, "failed to delete ipv6 route ttl==1 entry"; + "index" => %idx, + "error" => %e); + return Err(e); + } + info!(s.log, "deleted ipv6 route ttl==1 entry"; "index" => %idx); - s.table_entry_del(TableType::RouteFwdIpv6, &match_key) - .map(|_| info!(s.log, "deleted ipv6 route entry"; "index" => %idx)) - .map_err(|e| { - error!(s.log, "failed to delete ipv6 route entry"; - "index" => %idx, - "error" => %e); - e - }) + Ok(()) } pub fn forward_dump(s: &Switch) -> DpdResult { diff --git a/dpd/src/table/uplink.rs b/dpd/src/table/uplink.rs index f42a965a..754e1b49 100644 --- a/dpd/src/table/uplink.rs +++ b/dpd/src/table/uplink.rs @@ -14,7 +14,7 @@ use aal::{ActionParse, MatchParse}; use aal_macros::*; pub const INGRESS_TABLE_NAME: &str = "pipe.Ingress.filter.uplink_ports"; -pub const EGRESS_TABLE_NAME: &str = "pipe.Ingress.egress_filter.egress_filter"; +pub const EGRESS_TABLE_NAME: &str = "pipe.Egress.egress_filter.egress_filter"; #[derive(MatchParse, Debug, Hash)] struct IngressMatchKey { @@ -30,7 +30,7 @@ enum IngressAction { #[derive(MatchParse, Debug, Hash)] struct EgressMatchKey { - 
#[match_xlate(name = "ucast_egress_port")] + #[match_xlate(name = "egress_port")] out_port: u16, } diff --git a/hooks b/hooks new file mode 100644 index 00000000..e69de29b diff --git a/objects b/objects new file mode 100644 index 00000000..e69de29b diff --git a/refs b/refs new file mode 100644 index 00000000..e69de29b diff --git a/swadm/Cargo.toml b/swadm/Cargo.toml index cab5bfde..8c3bf7b7 100644 --- a/swadm/Cargo.toml +++ b/swadm/Cargo.toml @@ -6,6 +6,7 @@ authors = ["nils "] edition = "2024" [features] +default = ["multicast"] multicast = [] [dependencies] diff --git a/swadm/tests/counters.rs b/swadm/tests/counters.rs index f01f7e15..9c24bb88 100644 --- a/swadm/tests/counters.rs +++ b/swadm/tests/counters.rs @@ -37,30 +37,28 @@ fn test_p4_counter_list() { // Verify output is not empty and contains expected counter information assert!(!stdout.is_empty(), "Counter list output should not be empty"); - // Expected P4 counters from dpd/src/counters.rs COUNTERS array + // Expected P4 counters from dpd/src/counters.rs + // BASE_COUNTERS let base_counters = [ "Service", "Ingress", "Packet", - "Egress", "Ingress_Drop_Port", "Ingress_Drop_Reason", + "Forwarded", + "Unicast", + "Multicast_Link_Local", + "Egress_Drop_Port", + "Egress_Drop_Reason", ]; + // MULTICAST_COUNTERS #[cfg(not(feature = "multicast"))] - let multicast_counters = Vec::new(); + let multicast_counters: Vec<&str> = Vec::new(); #[cfg(feature = "multicast")] - let multicast_counters = vec![ - "Egress_Drop_Port", - "Egress_Drop_Reason", - "Unicast", - "Multicast", - "Multicast_External", - "Multicast_Link_Local", - "Multicast_Underlay", - "Multicast_Drop", - ]; + let multicast_counters = + vec!["Multicast", "Multicast_External", "Multicast_Underlay"]; // Verify all expected counters are present in the output for counter in base_counters.iter().chain(multicast_counters.iter()) { diff --git a/tools/veth_setup.sh b/tools/veth_setup.sh index 1fb1195d..a3602425 100755 --- a/tools/veth_setup.sh +++ 
b/tools/veth_setup.sh @@ -4,7 +4,7 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at https://mozilla.org/MPL/2.0/ # -# Copyright 2025 Oxide Computer Company +# Copyright 2026 Oxide Computer Company function config_veth() { /usr/bin/ip link set dev $1 mtu 10240 up @@ -19,7 +19,9 @@ function config_veth() { function add_port() { veth0="veth$(($1*2))" veth1="veth$(($1*2+1))" - echo Adding $veth0 and $veth1 for port $1 + if [ "${VETH_VERBOSE:-1}" -eq 1 ]; then + echo "Adding $veth0 and $veth1 for port $1" + fi if ! /usr/bin/ip link show $veth0 &> /dev/null; then /usr/bin/ip link add name $veth0 type veth peer name $veth1 &> /dev/null @@ -39,7 +41,11 @@ else ports=16 fi -echo "building veths for $ports ports" +if [ "${VETH_VERBOSE:-1}" -eq 1 ]; then + echo "building veths for $ports ports" +else + echo "veth setup: ports 0..$ports plus 125" +fi port_list="`seq 0 $ports` 125" for port in $port_list; do