73 files changed, 4666 insertions, 563 deletions
diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net
index 1f2002df5ba2..1419103d11f9 100644
--- a/Documentation/ABI/testing/sysfs-class-net
+++ b/Documentation/ABI/testing/sysfs-class-net
@@ -337,3 +337,18 @@ Contact:	[email protected]
 Description:
 		32-bit unsigned integer counting the number of times the link has
 		been down
+
+What:		/sys/class/net/<iface>/threaded
+Date:		Jan 2021
+KernelVersion:	5.12
+Contact:	[email protected]
+Description:
+		Boolean value to control the threaded mode per device. User could
+		set this value to enable/disable threaded mode for all napi
+		belonging to this device, without the need to do device up/down.
+
+		Possible values:
+		== ==================================
+		0  threaded mode disabled for this dev
+		1  threaded mode enabled for this dev
+		== ==================================
diff --git a/Documentation/devicetree/bindings/net/brcm,bcm4908enet.yaml b/Documentation/devicetree/bindings/net/brcm,bcm4908enet.yaml
new file mode 100644
index 000000000000..5f12f51c5b19
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/brcm,bcm4908enet.yaml
@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/brcm,bcm4908enet.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Broadcom BCM4908 Ethernet controller
+
+description: Broadcom's Ethernet controller integrated into BCM4908 family SoCs
+
+maintainers:
+  - Rafał Miłecki <[email protected]>
+
+properties:
+  compatible:
+    const: brcm,bcm4908enet
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    description: RX interrupt
+
+  interrupt-names:
+    const: rx
+
+required:
+  - reg
+  - interrupts
+  - interrupt-names
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/interrupt-controller/irq.h>
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+    ethernet@80002000 {
+        compatible = "brcm,bcm4908enet";
+        reg = <0x80002000 0x1000>;
+
+        interrupts = <GIC_SPI 86 IRQ_TYPE_LEVEL_HIGH>;
+        interrupt-names = "rx";
+    };
diff --git a/Documentation/networking/device_drivers/ethernet/intel/ice.rst b/Documentation/networking/device_drivers/ethernet/intel/ice.rst
index ee43ea57d443..e7d9cbff771b 100644
--- a/Documentation/networking/device_drivers/ethernet/intel/ice.rst
+++ b/Documentation/networking/device_drivers/ethernet/intel/ice.rst
@@ -1,46 +1,1031 @@
 .. SPDX-License-Identifier: GPL-2.0+
 
-==================================================================
-Linux Base Driver for the Intel(R) Ethernet Connection E800 Series
-==================================================================
+=================================================================
+Linux Base Driver for the Intel(R) Ethernet Controller 800 Series
+=================================================================
 
 Intel ice Linux driver.
-Copyright(c) 2018 Intel Corporation.
+Copyright(c) 2018-2021 Intel Corporation.
 
 Contents
 ========
 
-- Enabling the driver
-- Support
+- Overview
+- Identifying Your Adapter
+- Important Notes
+- Additional Features & Configurations
+- Performance Optimization
 
-The driver in this release supports Intel's E800 Series of products. For
-more information, visit Intel's support page at https://support.intel.com.
 
-Enabling the driver
-===================
-The driver is enabled via the standard kernel configuration system,
-using the make command::
+The associated Virtual Function (VF) driver for this driver is iavf.
 
-  make oldconfig/menuconfig/etc.
+Driver information can be obtained using ethtool and lspci.
 
-The driver is located in the menu structure at:
+For questions related to hardware requirements, refer to the documentation
+supplied with your Intel adapter. All hardware requirements listed apply to use
+with Linux.
+
+This driver supports XDP (Express Data Path) and AF_XDP zero-copy. Note that
+XDP is blocked for frame sizes larger than 3KB.
+
+
+Identifying Your Adapter
+========================
+For information on how to identify your adapter, and for the latest Intel
+network drivers, refer to the Intel Support website:
+https://www.intel.com/support
+
+
+Important Notes
+===============
+
+Packet drops may occur under receive stress
+-------------------------------------------
+Devices based on the Intel(R) Ethernet Controller 800 Series are designed to
+tolerate a limited amount of system latency during PCIe and DMA transactions.
+If these transactions take longer than the tolerated latency, it can impact the
+length of time the packets are buffered in the device and associated memory,
+which may result in dropped packets. These packets drops typically do not have
+a noticeable impact on throughput and performance under standard workloads.
+
+If these packet drops appear to affect your workload, the following may improve
+the situation:
+
+1) Make sure that your system's physical memory is in a high-performance
+   configuration, as recommended by the platform vendor. A common
+   recommendation is for all channels to be populated with a single DIMM
+   module.
+2) In your system's BIOS/UEFI settings, select the "Performance" profile.
+3) Your distribution may provide tools like "tuned," which can help tweak
+   kernel settings to achieve better standard settings for different workloads.
+
+
+Configuring SR-IOV for improved network security
+------------------------------------------------
+In a virtualized environment, on Intel(R) Ethernet Network Adapters that
+support SR-IOV, the virtual function (VF) may be subject to malicious behavior.
+Software-generated layer two frames, like IEEE 802.3x (link flow control), IEEE
+802.1Qbb (priority based flow-control), and others of this type, are not
+expected and can throttle traffic between the host and the virtual switch,
+reducing performance. To resolve this issue, and to ensure isolation from
+unintended traffic streams, configure all SR-IOV enabled ports for VLAN tagging
+from the administrative interface on the PF. This configuration allows
+unexpected, and potentially malicious, frames to be dropped.
+
+See "Configuring VLAN Tagging on SR-IOV Enabled Adapter Ports" later in this
+README for configuration instructions.
+
+
+Do not unload port driver if VF with active VM is bound to it
+-------------------------------------------------------------
+Do not unload a port's driver if a Virtual Function (VF) with an active Virtual
+Machine (VM) is bound to it. Doing so will cause the port to appear to hang.
+Once the VM shuts down, or otherwise releases the VF, the command will
+complete.
+
+
+Important notes for SR-IOV and Link Aggregation
+-----------------------------------------------
+Link Aggregation is mutually exclusive with SR-IOV.
+
+- If Link Aggregation is active, SR-IOV VFs cannot be created on the PF.
+- If SR-IOV is active, you cannot set up Link Aggregation on the interface.
+
+Bridging and MACVLAN are also affected by this. If you wish to use bridging or
+MACVLAN with SR-IOV, you must set up bridging or MACVLAN before enabling
+SR-IOV. If you are using bridging or MACVLAN in conjunction with SR-IOV, and
+you want to remove the interface from the bridge or MACVLAN, you must follow
+these steps:
+
+1. Destroy SR-IOV VFs if they exist
+2. Remove the interface from the bridge or MACVLAN
+3. Recreate SRIOV VFs as needed
+
+
+Additional Features and Configurations
+======================================
+
+ethtool
+-------
+The driver utilizes the ethtool interface for driver configuration and
+diagnostics, as well as displaying statistical information. The latest ethtool
+version is required for this functionality. Download it at:
+https://kernel.org/pub/software/network/ethtool/
+
+NOTE: The rx_bytes value of ethtool does not match the rx_bytes value of
+Netdev, due to the 4-byte CRC being stripped by the device. The difference
+between the two rx_bytes values will be 4 x the number of Rx packets. For
+example, if Rx packets are 10 and Netdev (software statistics) displays
+rx_bytes as "X", then ethtool (hardware statistics) will display rx_bytes as
+"X+40" (4 bytes CRC x 10 packets).
+
+
+Viewing Link Messages
+---------------------
+Link messages will not be displayed to the console if the distribution is
+restricting system messages. In order to see network driver link messages on
+your console, set dmesg to eight by entering the following::
+
+  # dmesg -n 8
+
+NOTE: This setting is not saved across reboots.
+
+
+Dynamic Device Personalization
+------------------------------
+Dynamic Device Personalization (DDP) allows you to change the packet processing
+pipeline of a device by applying a profile package to the device at runtime.
+Profiles can be used to, for example, add support for new protocols, change
+existing protocols, or change default settings. DDP profiles can also be rolled
+back without rebooting the system.
+
+The DDP package loads during device initialization. The driver looks for
+``intel/ice/ddp/ice.pkg`` in your firmware root (typically ``/lib/firmware/``
+or ``/lib/firmware/updates/``) and checks that it contains a valid DDP package
+file.
+
+NOTE: Your distribution should likely have provided the latest DDP file, but if
+ice.pkg is missing, you can find it in the linux-firmware repository or from
+intel.com.
+
+If the driver is unable to load the DDP package, the device will enter Safe
+Mode. Safe Mode disables advanced and performance features and supports only
+basic traffic and minimal functionality, such as updating the NVM or
+downloading a new driver or DDP package. Safe Mode only applies to the affected
+physical function and does not impact any other PFs. See the "Intel(R) Ethernet
+Adapters and Devices User Guide" for more details on DDP and Safe Mode.
+
+NOTES:
+
+- If you encounter issues with the DDP package file, you may need to download
+  an updated driver or DDP package file. See the log messages for more
+  information.
+
+- The ice.pkg file is a symbolic link to the default DDP package file.
+
+- You cannot update the DDP package if any PF drivers are already loaded. To
+  overwrite a package, unload all PFs and then reload the driver with the new
+  package.
+
+- Only the first loaded PF per device can download a package for that device.
+
+You can install specific DDP package files for different physical devices in
+the same system. To install a specific DDP package file:
+
+1. Download the DDP package file you want for your device.
+
+2. Rename the file ice-xxxxxxxxxxxxxxxx.pkg, where 'xxxxxxxxxxxxxxxx' is the
+   unique 64-bit PCI Express device serial number (in hex) of the device you
+   want the package downloaded on. The filename must include the complete
+   serial number (including leading zeros) and be all lowercase. For example,
+   if the 64-bit serial number is b887a3ffffca0568, then the file name would be
+   ice-b887a3ffffca0568.pkg.
+
+   To find the serial number from the PCI bus address, you can use the
+   following command::
+
+     # lspci -vv -s af:00.0 | grep -i Serial
+     Capabilities: [150 v1] Device Serial Number b8-87-a3-ff-ff-ca-05-68
+
+   You can use the following command to format the serial number without the
+   dashes::
+
+     # lspci -vv -s af:00.0 | grep -i Serial | awk '{print $7}' | sed s/-//g
+     b887a3ffffca0568
+
+3. Copy the renamed DDP package file to
+   ``/lib/firmware/updates/intel/ice/ddp/``. If the directory does not yet
+   exist, create it before copying the file.
+
+4. Unload all of the PFs on the device.
+
+5. Reload the driver with the new package.
+
+NOTE: The presence of a device-specific DDP package file overrides the loading
+of the default DDP package file (ice.pkg).
+
+
+Intel(R) Ethernet Flow Director
+-------------------------------
+The Intel Ethernet Flow Director performs the following tasks:
+
+- Directs receive packets according to their flows to different queues
+- Enables tight control on routing a flow in the platform
+- Matches flows and CPU cores for flow affinity
+
+NOTE: This driver supports the following flow types:
+
+- IPv4
+- TCPv4
+- UDPv4
+- SCTPv4
+- IPv6
+- TCPv6
+- UDPv6
+- SCTPv6
+
+Each flow type supports valid combinations of IP addresses (source or
+destination) and UDP/TCP/SCTP ports (source and destination). You can supply
+only a source IP address, a source IP address and a destination port, or any
+combination of one or more of these four parameters.
+
+NOTE: This driver allows you to filter traffic based on a user-defined flexible
+two-byte pattern and offset by using the ethtool user-def and mask fields. Only
+L3 and L4 flow types are supported for user-defined flexible filters. For a
+given flow type, you must clear all Intel Ethernet Flow Director filters before
+changing the input set (for that flow type).
+
+
+Flow Director Filters
+---------------------
+Flow Director filters are used to direct traffic that matches specified
+characteristics. They are enabled through ethtool's ntuple interface. To enable
+or disable the Intel Ethernet Flow Director and these filters::
+
+  # ethtool -K <ethX> ntuple <off|on>
+
+NOTE: When you disable ntuple filters, all the user programmed filters are
+flushed from the driver cache and hardware. All needed filters must be re-added
+when ntuple is re-enabled.
+
+To display all of the active filters::
+
+  # ethtool -u <ethX>
+
+To add a new filter::
+
+  # ethtool -U <ethX> flow-type <type> src-ip <ip> [m <ip_mask>] dst-ip <ip>
+  [m <ip_mask>] src-port <port> [m <port_mask>] dst-port <port> [m <port_mask>]
+  action <queue>
+
+  Where:
+    <ethX> - the Ethernet device to program
+    <type> - can be ip4, tcp4, udp4, sctp4, ip6, tcp6, udp6, sctp6
+    <ip> - the IP address to match on
+    <ip_mask> - the IPv4 address to mask on
+              NOTE: These filters use inverted masks.
+    <port> - the port number to match on
+    <port_mask> - the 16-bit integer for masking
+              NOTE: These filters use inverted masks.
+    <queue> - the queue to direct traffic toward (-1 discards the
+              matched traffic)
+
+To delete a filter::
+
+  # ethtool -U <ethX> delete <N>
+
+  Where <N> is the filter ID displayed when printing all the active filters,
+  and may also have been specified using "loc <N>" when adding the filter.
+
+EXAMPLES:
+
+To add a filter that directs packet to queue 2::
+
+  # ethtool -U <ethX> flow-type tcp4 src-ip 192.168.10.1 dst-ip \
+  192.168.10.2 src-port 2000 dst-port 2001 action 2 [loc 1]
+
+To set a filter using only the source and destination IP address::
+
+  # ethtool -U <ethX> flow-type tcp4 src-ip 192.168.10.1 dst-ip \
+  192.168.10.2 action 2 [loc 1]
+
+To set a filter based on a user-defined pattern and offset::
+
+  # ethtool -U <ethX> flow-type tcp4 src-ip 192.168.10.1 dst-ip \
+  192.168.10.2 user-def 0x4FFFF action 2 [loc 1]
+
+  where the value of the user-def field contains the offset (4 bytes) and
+  the pattern (0xffff).
+
+To match TCP traffic sent from 192.168.0.1, port 5300, directed to 192.168.0.5,
+port 80, and then send it to queue 7::
+
+  # ethtool -U enp130s0 flow-type tcp4 src-ip 192.168.0.1 dst-ip 192.168.0.5
+  src-port 5300 dst-port 80 action 7
+
+To add a TCPv4 filter with a partial mask for a source IP subnet::
+
+  # ethtool -U <ethX> flow-type tcp4 src-ip 192.168.0.0 m 0.255.255.255 dst-ip
+  192.168.5.12 src-port 12600 dst-port 31 action 12
+
+NOTES:
+
+For each flow-type, the programmed filters must all have the same matching
+input set. For example, issuing the following two commands is acceptable::
+
+  # ethtool -U enp130s0 flow-type ip4 src-ip 192.168.0.1 src-port 5300 action 7
+  # ethtool -U enp130s0 flow-type ip4 src-ip 192.168.0.5 src-port 55 action 10
+
+Issuing the next two commands, however, is not acceptable, since the first
+specifies src-ip and the second specifies dst-ip::
+
+  # ethtool -U enp130s0 flow-type ip4 src-ip 192.168.0.1 src-port 5300 action 7
+  # ethtool -U enp130s0 flow-type ip4 dst-ip 192.168.0.5 src-port 55 action 10
+
+The second command will fail with an error. You may program multiple filters
+with the same fields, using different values, but, on one device, you may not
+program two tcp4 filters with different matching fields.
+
+The ice driver does not support matching on a subportion of a field, thus
+partial mask fields are not supported.
+
+
+Flex Byte Flow Director Filters
+-------------------------------
+The driver also supports matching user-defined data within the packet payload.
+This flexible data is specified using the "user-def" field of the ethtool
+command in the following way:
+
+.. table::
+
+    ============================== ============================
+    ``31    28    24    20    16`` ``15    12    8    4    0``
+    ``offset into packet payload`` ``2 bytes of flexible data``
+    ============================== ============================
+
+For example,
+
+::
+
+  ... user-def 0x4FFFF ...
+
+tells the filter to look 4 bytes into the payload and match that value against
+0xFFFF. The offset is based on the beginning of the payload, and not the
+beginning of the packet. Thus
+
+::
+
+  flow-type tcp4 ... user-def 0x8BEAF ...
+
+would match TCP/IPv4 packets which have the value 0xBEAF 8 bytes into the
+TCP/IPv4 payload.
+
+Note that ICMP headers are parsed as 4 bytes of header and 4 bytes of payload.
+Thus to match the first byte of the payload, you must actually add 4 bytes to
+the offset. Also note that ip4 filters match both ICMP frames as well as raw
+(unknown) ip4 frames, where the payload will be the L3 payload of the IP4
+frame.
+
+The maximum offset is 64. The hardware will only read up to 64 bytes of data
+from the payload. The offset must be even because the flexible data is 2 bytes
+long and must be aligned to byte 0 of the packet payload.
+
+The user-defined flexible offset is also considered part of the input set and
+cannot be programmed separately for multiple filters of the same type. However,
+the flexible data is not part of the input set and multiple filters may use the
+same offset but match against different data.
+
+
+RSS Hash Flow
+-------------
+Allows you to set the hash bytes per flow type and any combination of one or
+more options for Receive Side Scaling (RSS) hash byte configuration.
+
+::
+
+  # ethtool -N <ethX> rx-flow-hash <type> <option>
+
+  Where <type> is:
+    tcp4  signifying TCP over IPv4
+    udp4  signifying UDP over IPv4
+    tcp6  signifying TCP over IPv6
+    udp6  signifying UDP over IPv6
+  And <option> is one or more of:
+    s     Hash on the IP source address of the Rx packet.
+    d     Hash on the IP destination address of the Rx packet.
+    f     Hash on bytes 0 and 1 of the Layer 4 header of the Rx packet.
+    n     Hash on bytes 2 and 3 of the Layer 4 header of the Rx packet.
+
+
+Accelerated Receive Flow Steering (aRFS)
+----------------------------------------
+Devices based on the Intel(R) Ethernet Controller 800 Series support
+Accelerated Receive Flow Steering (aRFS) on the PF. aRFS is a load-balancing
+mechanism that allows you to direct packets to the same CPU where an
+application is running or consuming the packets in that flow.
+
+NOTES:
+
+- aRFS requires that ntuple filtering is enabled via ethtool.
+- aRFS support is limited to the following packet types:
+
+    - TCP over IPv4 and IPv6
+    - UDP over IPv4 and IPv6
+    - Nonfragmented packets
+
+- aRFS only supports Flow Director filters, which consist of the
+  source/destination IP addresses and source/destination ports.
+- aRFS and ethtool's ntuple interface both use the device's Flow Director. aRFS
+  and ntuple features can coexist, but you may encounter unexpected results if
+  there's a conflict between aRFS and ntuple requests. See "Intel(R) Ethernet
+  Flow Director" for additional information.
+
+To set up aRFS:
+
+1. Enable the Intel Ethernet Flow Director and ntuple filters using ethtool.
+
+::
+
+   # ethtool -K <ethX> ntuple on
+
+2. Set up the number of entries in the global flow table. For example:
+
+::
+
+   # NUM_RPS_ENTRIES=16384
+   # echo $NUM_RPS_ENTRIES > /proc/sys/net/core/rps_sock_flow_entries
+
+3. Set up the number of entries in the per-queue flow table. For example:
+
+::
+
+   # NUM_RX_QUEUES=64
+   # for file in /sys/class/net/$IFACE/queues/rx-*/rps_flow_cnt; do
+   # echo $(($NUM_RPS_ENTRIES/$NUM_RX_QUEUES)) > $file;
+   # done
+
+4. Disable the IRQ balance daemon (this is only a temporary stop of the service
+   until the next reboot).
+
+::
+
+   # systemctl stop irqbalance
+
+5. Configure the interrupt affinity.
+
+   See ``/Documentation/core-api/irq/irq-affinity.rst``
+
+
+To disable aRFS using ethtool::
+
+  # ethtool -K <ethX> ntuple off
+
+NOTE: This command will disable ntuple filters and clear any aRFS filters in
+software and hardware.
+
+Example Use Case:
+
+1. Set the server application on the desired CPU (e.g., CPU 4).
+
+::
+
+   # taskset -c 4 netserver
+
+2. Use netperf to route traffic from the client to CPU 4 on the server with
+   aRFS configured. This example uses TCP over IPv4.
+
+::
+
+   # netperf -H <Host IPv4 Address> -t TCP_STREAM
+
+
+Enabling Virtual Functions (VFs)
+--------------------------------
+Use sysfs to enable virtual functions (VF).
+
+For example, you can create 4 VFs as follows::
+
+  # echo 4 > /sys/class/net/<ethX>/device/sriov_numvfs
+
+To disable VFs, write 0 to the same file::
+
+  # echo 0 > /sys/class/net/<ethX>/device/sriov_numvfs
+
+The maximum number of VFs for the ice driver is 256 total (all ports). To check
+how many VFs each PF supports, use the following command::
+
+  # cat /sys/class/net/<ethX>/device/sriov_totalvfs
+
+Note: You cannot use SR-IOV when link aggregation (LAG)/bonding is active, and
+vice versa. To enforce this, the driver checks for this mutual exclusion.
+
+
+Displaying VF Statistics on the PF
+----------------------------------
+Use the following command to display the statistics for the PF and its VFs::
+
+  # ip -s link show dev <ethX>
+
+NOTE: The output of this command can be very large due to the maximum number of
+possible VFs.
+
+The PF driver will display a subset of the statistics for the PF and for all
+VFs that are configured. The PF will always print a statistics block for each
+of the possible VFs, and it will show zero for all unconfigured VFs.
+
+
+Configuring VLAN Tagging on SR-IOV Enabled Adapter Ports
+--------------------------------------------------------
+To configure VLAN tagging for the ports on an SR-IOV enabled adapter, use the
+following command. The VLAN configuration should be done before the VF driver
+is loaded or the VM is booted. The VF is not aware of the VLAN tag being
+inserted on transmit and removed on received frames (sometimes called "port
+VLAN" mode).
+
+::
+
+  # ip link set dev <ethX> vf <id> vlan <vlan id>
+
+For example, the following will configure PF eth0 and the first VF on VLAN 10::
+
+  # ip link set dev eth0 vf 0 vlan 10
+
+
+Enabling a VF link if the port is disconnected
+----------------------------------------------
+If the physical function (PF) link is down, you can force link up (from the
+host PF) on any virtual functions (VF) bound to the PF.
+
+For example, to force link up on VF 0 bound to PF eth0::
+
+  # ip link set eth0 vf 0 state enable
+
+Note: If the command does not work, it may not be supported by your system.
+
+
+Setting the MAC Address for a VF
+--------------------------------
+To change the MAC address for the specified VF::
+
+  # ip link set <ethX> vf 0 mac <address>
+
+For example::
+
+  # ip link set <ethX> vf 0 mac 00:01:02:03:04:05
+
+This setting lasts until the PF is reloaded.
+
+NOTE: Assigning a MAC address for a VF from the host will disable any
+subsequent requests to change the MAC address from within the VM. This is a
+security feature. The VM is not aware of this restriction, so if this is
+attempted in the VM, it will trigger MDD events.
+
+
+Trusted VFs and VF Promiscuous Mode
+-----------------------------------
+This feature allows you to designate a particular VF as trusted and allows that
+trusted VF to request selective promiscuous mode on the Physical Function (PF).
+
+To set a VF as trusted or untrusted, enter the following command in the
+Hypervisor::
+
+  # ip link set dev <ethX> vf 1 trust [on|off]
+
+NOTE: It's important to set the VF to trusted before setting promiscuous mode.
+If the VM is not trusted, the PF will ignore promiscuous mode requests from the
+VF. If the VM becomes trusted after the VF driver is loaded, you must make a
+new request to set the VF to promiscuous.
+
+Once the VF is designated as trusted, use the following commands in the VM to
+set the VF to promiscuous mode.
+
+For promiscuous all::
+
+  # ip link set <ethX> promisc on
+  Where <ethX> is a VF interface in the VM
+
+For promiscuous Multicast::
+
+  # ip link set <ethX> allmulticast on
+  Where <ethX> is a VF interface in the VM
+
+NOTE: By default, the ethtool private flag vf-true-promisc-support is set to
+"off," meaning that promiscuous mode for the VF will be limited. To set the
+promiscuous mode for the VF to true promiscuous and allow the VF to see all
+ingress traffic, use the following command::
+
+  # ethtool --set-priv-flags <ethX> vf-true-promisc-support on
+
+The vf-true-promisc-support private flag does not enable promiscuous mode;
+rather, it designates which type of promiscuous mode (limited or true) you will
+get when you enable promiscuous mode using the ip link commands above. Note
+that this is a global setting that affects the entire device. However, the
+vf-true-promisc-support private flag is only exposed to the first PF of the
+device. The PF remains in limited promiscuous mode regardless of the
+vf-true-promisc-support setting.
+
+Next, add a VLAN interface on the VF interface. For example::
+
+  # ip link add link eth2 name eth2.100 type vlan id 100
+
+Note that the order in which you set the VF to promiscuous mode and add the
+VLAN interface does not matter (you can do either first). The result in this
+example is that the VF will get all traffic that is tagged with VLAN 100.
+
+
+Malicious Driver Detection (MDD) for VFs
+----------------------------------------
+Some Intel Ethernet devices use Malicious Driver Detection (MDD) to detect
+malicious traffic from the VF and disable Tx/Rx queues or drop the offending
+packet until a VF driver reset occurs. You can view MDD messages in the PF's
+system log using the dmesg command.
+
+- If the PF driver logs MDD events from the VF, confirm that the correct VF
+  driver is installed.
+- To restore functionality, you can manually reload the VF or VM or enable
+  automatic VF resets.
+- When automatic VF resets are enabled, the PF driver will immediately reset
+  the VF and reenable queues when it detects MDD events on the receive path.
+- If automatic VF resets are disabled, the PF will not automatically reset the
+  VF when it detects MDD events.
+
+To enable or disable automatic VF resets, use the following command::
+
+  # ethtool --set-priv-flags <ethX> mdd-auto-reset-vf on|off
+
+
+MAC and VLAN Anti-Spoofing Feature for VFs
+------------------------------------------
+When a malicious driver on a Virtual Function (VF) interface attempts to send a
+spoofed packet, it is dropped by the hardware and not transmitted.
+
+NOTE: This feature can be disabled for a specific VF::
+
+  # ip link set <ethX> vf <vf id> spoofchk {off|on}
+
+
+Jumbo Frames
+------------
+Jumbo Frames support is enabled by changing the Maximum Transmission Unit (MTU)
+to a value larger than the default value of 1500.
+
+Use the ifconfig command to increase the MTU size. For example, enter the
+following where <ethX> is the interface number::
+
+  # ifconfig <ethX> mtu 9000 up
+
+Alternatively, you can use the ip command as follows::
+
+  # ip link set mtu 9000 dev <ethX>
+  # ip link set up dev <ethX>
+
+This setting is not saved across reboots.
+
+
+NOTE: The maximum MTU setting for jumbo frames is 9702. This corresponds to the
+maximum jumbo frame size of 9728 bytes.
+
+NOTE: This driver will attempt to use multiple page sized buffers to receive
+each jumbo packet. This should help to avoid buffer starvation issues when
+allocating receive packets.
+
+NOTE: Packet loss may have a greater impact on throughput when you use jumbo
+frames. If you observe a drop in performance after enabling jumbo frames,
+enabling flow control may mitigate the issue.
+
+
+Speed and Duplex Configuration
+------------------------------
+In addressing speed and duplex configuration issues, you need to distinguish
+between copper-based adapters and fiber-based adapters.
+
+In the default mode, an Intel(R) Ethernet Network Adapter using copper
+connections will attempt to auto-negotiate with its link partner to determine
+the best setting. If the adapter cannot establish link with the link partner
+using auto-negotiation, you may need to manually configure the adapter and link
+partner to identical settings to establish link and pass packets. This should
+only be needed when attempting to link with an older switch that does not
+support auto-negotiation or one that has been forced to a specific speed or
+duplex mode. Your link partner must match the setting you choose. 1 Gbps speeds
+and higher cannot be forced. Use the autonegotiation advertising setting to
+manually set devices for 1 Gbps and higher.
+
+Speed, duplex, and autonegotiation advertising are configured through the
+ethtool utility. For the latest version, download and install ethtool from the
+following website:
+
+   https://kernel.org/pub/software/network/ethtool/
+
+To see the speed configurations your device supports, run the following::
+
+  # ethtool <ethX>
+
+Caution: Only experienced network administrators should force speed and duplex
+or change autonegotiation advertising manually. The settings at the switch must
+always match the adapter settings. Adapter performance may suffer or your
+adapter may not operate if you configure the adapter differently from your
+switch.
+
+
+Data Center Bridging (DCB)
+--------------------------
+NOTE: The kernel assumes that TC0 is available, and will disable Priority Flow
+Control (PFC) on the device if TC0 is not available. To fix this, ensure TC0 is
+enabled when setting up DCB on your switch.
+
+DCB is a configuration Quality of Service implementation in hardware. It uses
+the VLAN priority tag (802.1p) to filter traffic. That means that there are 8
+different priorities that traffic can be filtered into. It also enables
+priority flow control (802.1Qbb) which can limit or eliminate the number of
+dropped packets during network stress. Bandwidth can be allocated to each of
+these priorities, which is enforced at the hardware level (802.1Qaz).
+
+DCB is normally configured on the network using the DCBX protocol (802.1Qaz), a
+specialization of LLDP (802.1AB). The ice driver supports the following
+mutually exclusive variants of DCBX support:
+
+1) Firmware-based LLDP Agent
+2) Software-based LLDP Agent
+
+In firmware-based mode, firmware intercepts all LLDP traffic and handles DCBX
+negotiation transparently for the user. In this mode, the adapter operates in
+"willing" DCBX mode, receiving DCB settings from the link partner (typically a
+switch). The local user can only query the negotiated DCB configuration. For
+information on configuring DCBX parameters on a switch, please consult the
+switch manufacturer's documentation.
+
+In software-based mode, LLDP traffic is forwarded to the network stack and user
+space, where a software agent can handle it. In this mode, the adapter can
+operate in either "willing" or "nonwilling" DCBX mode and DCB configuration can
+be both queried and set locally. This mode requires the FW-based LLDP Agent to
+be disabled.
+
+NOTE:
+
+- You can enable and disable the firmware-based LLDP Agent using an ethtool
+  private flag. Refer to the "FW-LLDP (Firmware Link Layer Discovery Protocol)"
+  section in this README for more information.
+- In software-based DCBX mode, you can configure DCB parameters using software
+  LLDP/DCBX agents that interface with the Linux kernel's DCB Netlink API. We
+  recommend using OpenLLDP as the DCBX agent when running in software mode. For
+  more information, see the OpenLLDP man pages and
+  https://github.com/intel/openlldp.
+- The driver implements the DCB netlink interface layer to allow the user space
+  to communicate with the driver and query DCB configuration for the port.
+- iSCSI with DCB is not supported.
+
+
+FW-LLDP (Firmware Link Layer Discovery Protocol)
+------------------------------------------------
+Use ethtool to change FW-LLDP settings. The FW-LLDP setting is per port and
+persists across boots.
+
+To enable LLDP::
+
+  # ethtool --set-priv-flags <ethX> fw-lldp-agent on
+
+To disable LLDP::
+
+  # ethtool --set-priv-flags <ethX> fw-lldp-agent off
+
+To check the current LLDP setting::
+
+  # ethtool --show-priv-flags <ethX>
+
+NOTE: You must enable the UEFI HII "LLDP Agent" attribute for this setting to
+take effect. If "LLDP AGENT" is set to disabled, you cannot enable it from the
+OS.
+
+
+Flow Control
+------------
+Ethernet Flow Control (IEEE 802.3x) can be configured with ethtool to enable
+receiving and transmitting pause frames for ice. When transmit is enabled,
+pause frames are generated when the receive packet buffer crosses a predefined
+threshold. When receive is enabled, the transmit unit will halt for the time
+delay specified when a pause frame is received.
+
+NOTE: You must have a flow control capable link partner.
+
+Flow Control is disabled by default.
+
+Use ethtool to change the flow control settings.
+
+To enable or disable Rx or Tx Flow Control::
+
+  # ethtool -A <ethX> rx <on|off> tx <on|off>
+
+Note: This command only enables or disables Flow Control if auto-negotiation is
+disabled. If auto-negotiation is enabled, this command changes the parameters
+used for auto-negotiation with the link partner.
+
+Note: Flow Control auto-negotiation is part of link auto-negotiation. Depending
+on your device, you may not be able to change the auto-negotiation setting.
+
+NOTE:
+
+- The ice driver requires flow control on both the port and link partner. If
+  flow control is disabled on one of the sides, the port may appear to hang on
+  heavy traffic.
+- You may encounter issues with link-level flow control (LFC) after disabling
+  DCB. The LFC status may show as enabled but traffic is not paused. To resolve
+  this issue, disable and reenable LFC using ethtool::
+
+   # ethtool -A <ethX> rx off tx off
+   # ethtool -A <ethX> rx on tx on
+
+
+NAPI
+----
+This driver supports NAPI (Rx polling mode).
+For more information on NAPI, see
+https://www.linuxfoundation.org/collaborate/workgroups/networking/napi
+
+
+MACVLAN
+-------
+This driver supports MACVLAN. Kernel support for MACVLAN can be tested by
+checking if the MACVLAN driver is loaded. You can run 'lsmod | grep macvlan' to
+see if the MACVLAN driver is loaded or run 'modprobe macvlan' to try to load
+the MACVLAN driver.
+
+NOTE:
+
+- In passthru mode, you can only set up one MACVLAN device. It will inherit the
+  MAC address of the underlying PF (Physical Function) device.
+
+
+IEEE 802.1ad (QinQ) Support
+---------------------------
+The IEEE 802.1ad standard, informally known as QinQ, allows for multiple VLAN
+IDs within a single Ethernet frame. VLAN IDs are sometimes referred to as
+"tags," and multiple VLAN IDs are thus referred to as a "tag stack." Tag stacks
+allow L2 tunneling and the ability to segregate traffic within a particular
+VLAN ID, among other uses.
+
+NOTES:
+
+- Receive checksum offloads and VLAN acceleration are not supported for 802.1ad
+  (QinQ) packets.
+
+- 0x88A8 traffic will not be received unless VLAN stripping is disabled with
+  the following command::
+
+    # ethool -K <ethX> rxvlan off
+
+- 0x88A8/0x8100 double VLANs cannot be used with 0x8100 or 0x8100/0x8100 VLANS
+  configured on the same port. 0x88a8/0x8100 traffic will not be received if
+  0x8100 VLANs are configured.
+
+- The VF can only transmit 0x88A8/0x8100 (i.e., 802.1ad/802.1Q) traffic if:
+
+    1) The VF is not assigned a port VLAN.
+    2) spoofchk is disabled from the PF. If you enable spoofchk, the VF will
+       not transmit 0x88A8/0x8100 traffic.
+
+- The VF may not receive all network traffic based on the Inner VLAN header
+  when VF true promiscuous mode (vf-true-promisc-support) and double VLANs are
+  enabled in SR-IOV mode.
+
+The following are examples of how to configure 802.1ad (QinQ)::
+
+  # ip link add link eth0 eth0.24 type vlan proto 802.1ad id 24
+  # ip link add link eth0.24 eth0.24.371 type vlan proto 802.1Q id 371
+
+  Where "24" and "371" are example VLAN IDs.
+
+
+Tunnel/Overlay Stateless Offloads
+---------------------------------
+Supported tunnels and overlays include VXLAN, GENEVE, and others depending on
+hardware and software configuration. Stateless offloads are enabled by default.
+
+To view the current state of all offloads::
+
+  # ethtool -k <ethX>
+
+
+UDP Segmentation Offload
+------------------------
+Allows the adapter to offload transmit segmentation of UDP packets with
+payloads up to 64K into valid Ethernet frames. Because the adapter hardware is
+able to complete data segmentation much faster than operating system software,
+this feature may improve transmission performance.
+In addition, the adapter may use fewer CPU resources.
+
+NOTE:
+
+- The application sending UDP packets must support UDP segmentation offload.
+
+To enable/disable UDP Segmentation Offload, issue the following command::
+
+  # ethtool -K <ethX> tx-udp-segmentation [off|on]
+
+
+Performance Optimization
+========================
+Driver defaults are meant to fit a wide variety of workloads, but if further
+optimization is required, we recommend experimenting with the following
+settings.
+
+
+Rx Descriptor Ring Size
+-----------------------
+To reduce the number of Rx packet discards, increase the number of Rx
+descriptors for each Rx ring using ethtool.
+
+  Check if the interface is dropping Rx packets due to buffers being full
+  (rx_dropped.nic can mean that there is no PCIe bandwidth)::
+
+    # ethtool -S <ethX> | grep "rx_dropped"
+
+  If the previous command shows drops on queues, it may help to increase
+  the number of descriptors using 'ethtool -G'::
+
+    # ethtool -G <ethX> rx <N>
+    Where <N> is the desired number of ring entries/descriptors
+
+  This can provide temporary buffering for issues that create latency while
+  the CPUs process descriptors.
+
+
+Interrupt Rate Limiting
+-----------------------
+This driver supports an adaptive interrupt throttle rate (ITR) mechanism that
+is tuned for general workloads. The user can customize the interrupt rate
+control for specific workloads, via ethtool, adjusting the number of
+microseconds between interrupts.
+
+To set the interrupt rate manually, you must disable adaptive mode::
+
+  # ethtool -C <ethX> adaptive-rx off adaptive-tx off
+
+For lower CPU utilization:
+
+  Disable adaptive ITR and lower Rx and Tx interrupts. The examples below
+  affect every queue of the specified interface.
+
+  Setting rx-usecs and tx-usecs to 80 will limit interrupts to about
+  12,500 interrupts per second per queue::
+
+    # ethtool -C <ethX> adaptive-rx off adaptive-tx off rx-usecs 80 tx-usecs 80
+
+For reduced latency:
+
+  Disable adaptive ITR and ITR by setting rx-usecs and tx-usecs to 0
+  using ethtool::
+
+    # ethtool -C <ethX> adaptive-rx off adaptive-tx off rx-usecs 0 tx-usecs 0
+
+Per-queue interrupt rate settings:
+
+  The following examples are for queues 1 and 3, but you can adjust other
+  queues.
+
+  To disable Rx adaptive ITR and set static Rx ITR to 10 microseconds or
+  about 100,000 interrupts/second, for queues 1 and 3::
+
+    # ethtool --per-queue <ethX> queue_mask 0xa --coalesce adaptive-rx off
+    rx-usecs 10
+
+  To show the current coalesce settings for queues 1 and 3::
+
+    # ethtool --per-queue <ethX> queue_mask 0xa --show-coalesce
+
+Bounding interrupt rates using rx-usecs-high:
+
+  :Valid Range: 0-236 (0=no limit)
+
+   The range of 0-236 microseconds provides an effective range of 4,237 to
+   250,000 interrupts per second. The value of rx-usecs-high can be set
+   independently of rx-usecs and tx-usecs in the same ethtool command, and is
+   also independent of the adaptive interrupt moderation algorithm. The
+   underlying hardware supports granularity in 4-microsecond intervals, so
+   adjacent values may result in the same interrupt rate.
+
+  The following command would disable adaptive interrupt moderation, and allow
+  a maximum of 5 microseconds before indicating a receive or transmit was
+  complete. However, instead of resulting in as many as 200,000 interrupts per
+  second, it limits total interrupts per second to 50,000 via the rx-usecs-high
+  parameter.
+
+  ::
+
+    # ethtool -C <ethX> adaptive-rx off adaptive-tx off rx-usecs-high 20
+    rx-usecs 5 tx-usecs 5
+
+
+Virtualized Environments
+------------------------
+In addition to the other suggestions in this section, the following may be
+helpful to optimize performance in VMs.
+
+  Using the appropriate mechanism (vcpupin) in the VM, pin the CPUs to
+  individual LCPUs, making sure to use a set of CPUs included in the
+  device's local_cpulist: ``/sys/class/net/<ethX>/device/local_cpulist``.
+
+  Configure as many Rx/Tx queues in the VM as available. (See the iavf driver
+  documentation for the number of queues supported.) For example::
+
+    # ethtool -L <virt_interface> rx <max> tx <max>
 
-  -> Device Drivers
-    -> Network device support (NETDEVICES [=y])
-      -> Ethernet driver support
-        -> Intel devices
-          -> Intel(R) Ethernet Connection E800 Series Support
 
 Support
 =======
 For general information, go to the Intel support website at:
-
 https://www.intel.com/support/
 
 or the Intel Wired Networking project hosted by Sourceforge at:
-
 https://sourceforge.net/projects/e1000
 
 If an issue is identified with the released source code on a supported kernel
 with a supported adapter, email the specific information related to the issue
 to [email protected].
+
+
+Trademarks
+==========
+Intel is a trademark or registered trademark of Intel Corporation or its
+subsidiaries in the United States and/or other countries.
+
+* Other names and brands may be claimed as the property of others.
diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index 61a358301f12..581bfce86dca 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -180,7 +180,7 @@ min_adv_mss - INTEGER
 
 fib_notify_on_flag_change - INTEGER
         Whether to emit RTM_NEWROUTE notifications whenever RTM_F_OFFLOAD/
-        RTM_F_TRAP flags are changed.
+        RTM_F_TRAP/RTM_F_OFFLOAD_FAILED flags are changed.
 
         After installing a route to the kernel, user space receives an
         acknowledgment, which means the route was installed in the kernel,
@@ -197,6 +197,7 @@ fib_notify_on_flag_change - INTEGER
 
         - 0 - Do not emit notifications.
         - 1 - Emit notifications.
+        - 2 - Emit notifications only for RTM_F_OFFLOAD_FAILED flag change.
 
 IP Fragmentation:
 
@@ -1445,6 +1446,25 @@ rp_filter - INTEGER
 	Default value is 0. Note that some distributions enable it
 	in startup scripts.
 
+src_valid_mark - BOOLEAN
+	- 0 - The fwmark of the packet is not included in reverse path
+	  route lookup.  This allows for asymmetric routing configurations
+	  utilizing the fwmark in only one direction, e.g., transparent
+	  proxying.
+
+	- 1 - The fwmark of the packet is included in reverse path route
+	  lookup.  This permits rp_filter to function when the fwmark is
+	  used for routing traffic in both directions.
+
+	This setting also affects the utilization of fmwark when
+	performing source address selection for ICMP replies, or
+	determining addresses stored for the IPOPT_TS_TSANDADDR and
+	IPOPT_RR IP options.
+
+	The max value from conf/{all,interface}/src_valid_mark is used.
+
+	Default value is 0.
+
 arp_filter - BOOLEAN
 	- 1 - Allows you to have multiple network interfaces on the same
 	  subnet, and have the ARPs for each interface be answered
@@ -1797,7 +1817,7 @@ nexthop_compat_mode - BOOLEAN
 
 fib_notify_on_flag_change - INTEGER
         Whether to emit RTM_NEWROUTE notifications whenever RTM_F_OFFLOAD/
-        RTM_F_TRAP flags are changed.
+        RTM_F_TRAP/RTM_F_OFFLOAD_FAILED flags are changed.
 
         After installing a route to the kernel, user space receives an
         acknowledgment, which means the route was installed in the kernel,
@@ -1814,6 +1834,7 @@ fib_notify_on_flag_change - INTEGER
 
         - 0 - Do not emit notifications.
         - 1 - Emit notifications.
+        - 2 - Emit notifications only for RTM_F_OFFLOAD_FAILED flag change.
 
 IPv6 Fragmentation:
 
diff --git a/MAINTAINERS b/MAINTAINERS
index d1b0057a9797..cbf4b94f89d4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3445,6 +3445,15 @@ F:	Documentation/devicetree/bindings/mips/brcm/
 F:	arch/mips/bcm47xx/*
 F:	arch/mips/include/asm/mach-bcm47xx/*
 
+BROADCOM BCM4908 ETHERNET DRIVER
+M:	Rafał Miłecki <[email protected]>
+M:	[email protected]
+L:	[email protected]
+S:	Maintained
+F:	Documentation/devicetree/bindings/net/brcm,bcm4908enet.yaml
+F:	drivers/net/ethernet/broadcom/bcm4908enet.*
+F:	drivers/net/ethernet/broadcom/unimac.h
+
 BROADCOM BCM5301X ARM ARCHITECTURE
 M:	Hauke Mehrtens <[email protected]>
 M:	Rafał Miłecki <[email protected]>
diff --git a/drivers/net/ethernet/broadcom/Kconfig b/drivers/net/ethernet/broadcom/Kconfig
index 4bdf8fbe75a6..bcf9e0a410fd 100644
--- a/drivers/net/ethernet/broadcom/Kconfig
+++ b/drivers/net/ethernet/broadcom/Kconfig
@@ -51,6 +51,14 @@ config B44_PCI
 	depends on B44_PCI_AUTOSELECT && B44_PCICORE_AUTOSELECT
 	default y
 
+config BCM4908ENET
+	tristate "Broadcom BCM4908 internal mac support"
+	depends on ARCH_BCM4908 || COMPILE_TEST
+	default y
+	help
+	  This driver supports Ethernet controller integrated into Broadcom
+	  BCM4908 family SoCs.
+
 config BCM63XX_ENET
 	tristate "Broadcom 63xx internal mac support"
 	depends on BCM63XX
diff --git a/drivers/net/ethernet/broadcom/Makefile b/drivers/net/ethernet/broadcom/Makefile
index 7046ad6d3d0e..379012de3569 100644
--- a/drivers/net/ethernet/broadcom/Makefile
+++ b/drivers/net/ethernet/broadcom/Makefile
@@ -4,6 +4,7 @@
 #
 
 obj-$(CONFIG_B44) += b44.o
+obj-$(CONFIG_BCM4908ENET) += bcm4908enet.o
 obj-$(CONFIG_BCM63XX_ENET) += bcm63xx_enet.o
 obj-$(CONFIG_BCMGENET) += genet/
 obj-$(CONFIG_BNX2) += bnx2.o
diff --git a/drivers/net/ethernet/broadcom/bcm4908enet.c b/drivers/net/ethernet/broadcom/bcm4908enet.c
new file mode 100644
index 000000000000..d68b328e7596
--- /dev/null
+++ b/drivers/net/ethernet/broadcom/bcm4908enet.c
@@ -0,0 +1,676 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021 Rafał Miłecki <[email protected]>
+ */
+
+#include <linux/delay.h>
+#include <linux/etherdevice.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include "bcm4908enet.h"
+#include "unimac.h"
+
+#define ENET_DMA_CH_RX_CFG			ENET_DMA_CH0_CFG
+#define ENET_DMA_CH_TX_CFG			ENET_DMA_CH1_CFG
+#define ENET_DMA_CH_RX_STATE_RAM		ENET_DMA_CH0_STATE_RAM
+#define ENET_DMA_CH_TX_STATE_RAM		ENET_DMA_CH1_STATE_RAM
+
+#define ENET_TX_BDS_NUM				200
+#define ENET_RX_BDS_NUM				200
+#define ENET_RX_BDS_NUM_MAX			8192
+
+#define ENET_DMA_INT_DEFAULTS			(ENET_DMA_CH_CFG_INT_DONE | \
+						 ENET_DMA_CH_CFG_INT_NO_DESC | \
+						 ENET_DMA_CH_CFG_INT_BUFF_DONE)
+#define ENET_DMA_MAX_BURST_LEN			8 /* in 64 bit words */
+
+#define ENET_MTU_MIN				60
+#define ENET_MTU_MAX				1500 /* Is it possible to support 2044? */
+#define ENET_MTU_MAX_EXTRA_SIZE			32 /* L2 */
+
+struct bcm4908enet_dma_ring_bd {
+	__le32 ctl;
+	__le32 addr;
+} __packed;
+
+struct bcm4908enet_dma_ring_slot {
+	struct sk_buff *skb;
+	unsigned int len;
+	dma_addr_t dma_addr;
+};
+
+struct bcm4908enet_dma_ring {
+	int is_tx;
+	int read_idx;
+	int write_idx;
+	int length;
+	u16 cfg_block;
+	u16 st_ram_block;
+
+	union {
+		void *cpu_addr;
+		struct bcm4908enet_dma_ring_bd *buf_desc;
+	};
+	dma_addr_t dma_addr;
+
+	struct bcm4908enet_dma_ring_slot *slots;
+};
+
+struct bcm4908enet {
+	struct device *dev;
+	struct net_device *netdev;
+	struct napi_struct napi;
+	void __iomem *base;
+
+	struct bcm4908enet_dma_ring tx_ring;
+	struct bcm4908enet_dma_ring rx_ring;
+};
+
+/***
+ * R/W ops
+ */
+
+static inline u32 enet_read(struct bcm4908enet *enet, u16 offset)
+{
+	return readl(enet->base + offset);
+}
+
+static inline void enet_write(struct bcm4908enet *enet, u16 offset, u32 value)
+{
+	writel(value, enet->base + offset);
+}
+
+static inline void enet_maskset(struct bcm4908enet *enet, u16 offset, u32 mask, u32 set)
+{
+	u32 val;
+
+	WARN_ON(set & ~mask);
+
+	val = enet_read(enet, offset);
+	val = (val & ~mask) | (set & mask);
+	enet_write(enet, offset, val);
+}
+
+static inline void enet_set(struct bcm4908enet *enet, u16 offset, u32 set)
+{
+	enet_maskset(enet, offset, set, set);
+}
+
+static inline u32 enet_umac_read(struct bcm4908enet *enet, u16 offset)
+{
+	return enet_read(enet, ENET_UNIMAC + offset);
+}
+
+static inline void enet_umac_write(struct bcm4908enet *enet, u16 offset, u32 value)
+{
+	enet_write(enet, ENET_UNIMAC + offset, value);
+}
+
+static inline void enet_umac_maskset(struct bcm4908enet *enet, u16 offset, u32 mask, u32 set)
+{
+	enet_maskset(enet, ENET_UNIMAC + offset, mask, set);
+}
+
+static inline void enet_umac_set(struct bcm4908enet *enet, u16 offset, u32 set)
+{
+	enet_set(enet, ENET_UNIMAC + offset, set);
+}
+
+/***
+ * Helpers
+ */
+
+static void bcm4908enet_intrs_on(struct bcm4908enet *enet)
+{
+	enet_write(enet, ENET_DMA_CH_RX_CFG + ENET_DMA_CH_CFG_INT_MASK, ENET_DMA_INT_DEFAULTS);
+}
+
+static void bcm4908enet_intrs_off(struct bcm4908enet *enet)
+{
+	enet_write(enet, ENET_DMA_CH_RX_CFG + ENET_DMA_CH_CFG_INT_MASK, 0);
+}
+
+static void bcm4908enet_intrs_ack(struct bcm4908enet *enet)
+{
+	enet_write(enet, ENET_DMA_CH_RX_CFG + ENET_DMA_CH_CFG_INT_STAT, ENET_DMA_INT_DEFAULTS);
+}
+
+/***
+ * DMA
+ */
+
+static int bcm4908_dma_alloc_buf_descs(struct bcm4908enet *enet, struct bcm4908enet_dma_ring *ring)
+{
+	int size = ring->length * sizeof(struct bcm4908enet_dma_ring_bd);
+	struct device *dev = enet->dev;
+
+	ring->cpu_addr = dma_alloc_coherent(dev, size, &ring->dma_addr, GFP_KERNEL);
+	if (!ring->cpu_addr)
+		return -ENOMEM;
+
+	if (((uintptr_t)ring->cpu_addr) & (0x40 - 1)) {
+		dev_err(dev, "Invalid DMA ring alignment\n");
+		goto err_free_buf_descs;
+	}
+
+	ring->slots = kzalloc(ring->length * sizeof(*ring->slots), GFP_KERNEL);
+	if (!ring->slots)
+		goto err_free_buf_descs;
+
+	memset(ring->cpu_addr, 0, size);
+
+	ring->read_idx = 0;
+	ring->write_idx = 0;
+
+	return 0;
+
+err_free_buf_descs:
+	dma_free_coherent(dev, size, ring->cpu_addr, ring->dma_addr);
+	return -ENOMEM;
+}
+
+static void bcm4908enet_dma_free(struct bcm4908enet *enet)
+{
+	struct bcm4908enet_dma_ring *tx_ring = &enet->tx_ring;
+	struct bcm4908enet_dma_ring *rx_ring = &enet->rx_ring;
+	struct device *dev = enet->dev;
+	int size;
+
+	size = rx_ring->length * sizeof(struct bcm4908enet_dma_ring_bd);
+	if (rx_ring->cpu_addr)
+		dma_free_coherent(dev, size, rx_ring->cpu_addr, rx_ring->dma_addr);
+	kfree(rx_ring->slots);
+
+	size = tx_ring->length * sizeof(struct bcm4908enet_dma_ring_bd);
+	if (tx_ring->cpu_addr)
+		dma_free_coherent(dev, size, tx_ring->cpu_addr, tx_ring->dma_addr);
+	kfree(tx_ring->slots);
+}
+
+static int bcm4908enet_dma_alloc(struct bcm4908enet *enet)
+{
+	struct bcm4908enet_dma_ring *tx_ring = &enet->tx_ring;
+	struct bcm4908enet_dma_ring *rx_ring = &enet->rx_ring;
+	struct device *dev = enet->dev;
+	int err;
+
+	tx_ring->length = ENET_TX_BDS_NUM;
+	tx_ring->is_tx = 1;
+	tx_ring->cfg_block = ENET_DMA_CH_TX_CFG;
+	tx_ring->st_ram_block = ENET_DMA_CH_TX_STATE_RAM;
+	err = bcm4908_dma_alloc_buf_descs(enet, tx_ring);
+	if (err) {
+		dev_err(dev, "Failed to alloc TX buf descriptors: %d\n", err);
+		return err;
+	}
+
+	rx_ring->length = ENET_RX_BDS_NUM;
+	rx_ring->is_tx = 0;
+	rx_ring->cfg_block = ENET_DMA_CH_RX_CFG;
+	rx_ring->st_ram_block = ENET_DMA_CH_RX_STATE_RAM;
+	err = bcm4908_dma_alloc_buf_descs(enet, rx_ring);
+	if (err) {
+		dev_err(dev, "Failed to alloc RX buf descriptors: %d\n", err);
+		bcm4908enet_dma_free(enet);
+		return err;
+	}
+
+	return 0;
+}
+
+static void bcm4908enet_dma_reset(struct bcm4908enet *enet)
+{
+	struct bcm4908enet_dma_ring *rings[] = { &enet->rx_ring, &enet->tx_ring };
+	int i;
+
+	/* Disable the DMA controller and channel */
+	for (i = 0; i < ARRAY_SIZE(rings); i++)
+		enet_write(enet, rings[i]->cfg_block + ENET_DMA_CH_CFG, 0);
+	enet_maskset(enet, ENET_DMA_CONTROLLER_CFG, ENET_DMA_CTRL_CFG_MASTER_EN, 0);
+
+	/* Reset channels state */
+	for (i = 0; i < ARRAY_SIZE(rings); i++) {
+		struct bcm4908enet_dma_ring *ring = rings[i];
+
+		enet_write(enet, ring->st_ram_block + ENET_DMA_CH_STATE_RAM_BASE_DESC_PTR, 0);
+		enet_write(enet, ring->st_ram_block + ENET_DMA_CH_STATE_RAM_STATE_DATA, 0);
+		enet_write(enet, ring->st_ram_block + ENET_DMA_CH_STATE_RAM_DESC_LEN_STATUS, 0);
+		enet_write(enet, ring->st_ram_block + ENET_DMA_CH_STATE_RAM_DESC_BASE_BUFPTR, 0);
+	}
+}
+
+static int bcm4908enet_dma_alloc_rx_buf(struct bcm4908enet *enet, unsigned int idx)
+{
+	struct bcm4908enet_dma_ring_bd *buf_desc = &enet->rx_ring.buf_desc[idx];
+	struct bcm4908enet_dma_ring_slot *slot = &enet->rx_ring.slots[idx];
+	struct device *dev = enet->dev;
+	u32 tmp;
+	int err;
+
+	slot->len = ENET_MTU_MAX + ENET_MTU_MAX_EXTRA_SIZE;
+
+	slot->skb = netdev_alloc_skb(enet->netdev, slot->len);
+	if (!slot->skb)
+		return -ENOMEM;
+
+	slot->dma_addr = dma_map_single(dev, slot->skb->data, slot->len, DMA_FROM_DEVICE);
+	err = dma_mapping_error(dev, slot->dma_addr);
+	if (err) {
+		dev_err(dev, "Failed to map DMA buffer: %d\n", err);
+		kfree_skb(slot->skb);
+		slot->skb = NULL;
+		return err;
+	}
+
+	tmp = slot->len << DMA_CTL_LEN_DESC_BUFLENGTH_SHIFT;
+	tmp |= DMA_CTL_STATUS_OWN;
+	if (idx == enet->rx_ring.length - 1)
+		tmp |= DMA_CTL_STATUS_WRAP;
+	buf_desc->ctl = cpu_to_le32(tmp);
+	buf_desc->addr = cpu_to_le32(slot->dma_addr);
+
+	return 0;
+}
+
+static void bcm4908enet_dma_ring_init(struct bcm4908enet *enet,
+				      struct bcm4908enet_dma_ring *ring)
+{
+	int reset_channel = 0; /* We support only 1 main channel (with TX and RX) */
+	int reset_subch = ring->is_tx ? 1 : 0;
+
+	/* Reset the DMA channel */
+	enet_write(enet, ENET_DMA_CTRL_CHANNEL_RESET, BIT(reset_channel * 2 + reset_subch));
+	enet_write(enet, ENET_DMA_CTRL_CHANNEL_RESET, 0);
+
+	enet_write(enet, ring->cfg_block + ENET_DMA_CH_CFG, 0);
+	enet_write(enet, ring->cfg_block + ENET_DMA_CH_CFG_MAX_BURST, ENET_DMA_MAX_BURST_LEN);
+	enet_write(enet, ring->cfg_block + ENET_DMA_CH_CFG_INT_MASK, 0);
+
+	enet_write(enet, ring->st_ram_block + ENET_DMA_CH_STATE_RAM_BASE_DESC_PTR,
+		   (uint32_t)ring->dma_addr);
+}
+
+static void bcm4908enet_dma_uninit(struct bcm4908enet *enet)
+{
+	struct bcm4908enet_dma_ring *rx_ring = &enet->rx_ring;
+	struct bcm4908enet_dma_ring_slot *slot;
+	struct device *dev = enet->dev;
+	int i;
+
+	for (i = rx_ring->length - 1; i >= 0; i--) {
+		slot = &rx_ring->slots[i];
+		if (!slot->skb)
+			continue;
+		dma_unmap_single(dev, slot->dma_addr, slot->len, DMA_FROM_DEVICE);
+		kfree_skb(slot->skb);
+		slot->skb = NULL;
+	}
+}
+
+static int bcm4908enet_dma_init(struct bcm4908enet *enet)
+{
+	struct bcm4908enet_dma_ring *rx_ring = &enet->rx_ring;
+	struct device *dev = enet->dev;
+	int err;
+	int i;
+
+	for (i = 0; i < rx_ring->length; i++) {
+		err = bcm4908enet_dma_alloc_rx_buf(enet, i);
+		if (err) {
+			dev_err(dev, "Failed to alloc RX buffer: %d\n", err);
+			bcm4908enet_dma_uninit(enet);
+			return err;
+		}
+	}
+
+	bcm4908enet_dma_ring_init(enet, &enet->tx_ring);
+	bcm4908enet_dma_ring_init(enet, &enet->rx_ring);
+
+	return 0;
+}
+
+static void bcm4908enet_dma_tx_ring_ensable(struct bcm4908enet *enet,
+					    struct bcm4908enet_dma_ring *ring)
+{
+	enet_write(enet, ring->cfg_block + ENET_DMA_CH_CFG, ENET_DMA_CH_CFG_ENABLE);
+}
+
+static void bcm4908enet_dma_tx_ring_disable(struct bcm4908enet *enet,
+					    struct bcm4908enet_dma_ring *ring)
+{
+	enet_write(enet, ring->cfg_block + ENET_DMA_CH_CFG, 0);
+}
+
+static void bcm4908enet_dma_rx_ring_enable(struct bcm4908enet *enet,
+					   struct bcm4908enet_dma_ring *ring)
+{
+	enet_set(enet, ring->cfg_block + ENET_DMA_CH_CFG, ENET_DMA_CH_CFG_ENABLE);
+}
+
+static void bcm4908enet_dma_rx_ring_disable(struct bcm4908enet *enet,
+					    struct bcm4908enet_dma_ring *ring)
+{
+	unsigned long deadline;
+	u32 tmp;
+
+	enet_maskset(enet, ring->cfg_block + ENET_DMA_CH_CFG, ENET_DMA_CH_CFG_ENABLE, 0);
+
+	deadline = jiffies + usecs_to_jiffies(2000);
+	do {
+		tmp = enet_read(enet, ring->cfg_block + ENET_DMA_CH_CFG);
+		if (!(tmp & ENET_DMA_CH_CFG_ENABLE))
+			return;
+		enet_maskset(enet, ring->cfg_block + ENET_DMA_CH_CFG, ENET_DMA_CH_CFG_ENABLE, 0);
+		usleep_range(10, 30);
+	} while (!time_after_eq(jiffies, deadline));
+
+	dev_warn(enet->dev, "Timeout waiting for DMA TX stop\n");
+}
+
+/***
+ * Ethernet driver
+ */
+
+static void bcm4908enet_gmac_init(struct bcm4908enet *enet)
+{
+	u32 cmd;
+
+	cmd = enet_umac_read(enet, UMAC_CMD);
+	enet_umac_write(enet, UMAC_CMD, cmd | CMD_SW_RESET);
+	enet_umac_write(enet, UMAC_CMD, cmd & ~CMD_SW_RESET);
+
+	enet_set(enet, ENET_FLUSH, ENET_FLUSH_RXFIFO_FLUSH | ENET_FLUSH_TXFIFO_FLUSH);
+	enet_maskset(enet, ENET_FLUSH, ENET_FLUSH_RXFIFO_FLUSH | ENET_FLUSH_TXFIFO_FLUSH, 0);
+
+	enet_set(enet, ENET_MIB_CTRL, ENET_MIB_CTRL_CLR_MIB);
+	enet_maskset(enet, ENET_MIB_CTRL, ENET_MIB_CTRL_CLR_MIB, 0);
+
+	cmd = enet_umac_read(enet, UMAC_CMD);
+	cmd &= ~(CMD_SPEED_MASK << CMD_SPEED_SHIFT);
+	cmd &= ~CMD_TX_EN;
+	cmd &= ~CMD_RX_EN;
+	cmd |= CMD_SPEED_1000 << CMD_SPEED_SHIFT;
+	enet_umac_write(enet, UMAC_CMD, cmd);
+
+	enet_maskset(enet, ENET_GMAC_STATUS,
+		     ENET_GMAC_STATUS_ETH_SPEED_MASK |
+		     ENET_GMAC_STATUS_HD |
+		     ENET_GMAC_STATUS_AUTO_CFG_EN |
+		     ENET_GMAC_STATUS_LINK_UP,
+		     ENET_GMAC_STATUS_ETH_SPEED_1000 |
+		     ENET_GMAC_STATUS_AUTO_CFG_EN |
+		     ENET_GMAC_STATUS_LINK_UP);
+}
+
+static irqreturn_t bcm4908enet_irq_handler(int irq, void *dev_id)
+{
+	struct bcm4908enet *enet = dev_id;
+
+	bcm4908enet_intrs_off(enet);
+	bcm4908enet_intrs_ack(enet);
+
+	napi_schedule(&enet->napi);
+
+	return IRQ_HANDLED;
+}
+
+static int bcm4908enet_open(struct net_device *netdev)
+{
+	struct bcm4908enet *enet = netdev_priv(netdev);
+	struct device *dev = enet->dev;
+	int err;
+
+	err = request_irq(netdev->irq, bcm4908enet_irq_handler, 0, "enet", enet);
+	if (err) {
+		dev_err(dev, "Failed to request IRQ %d: %d\n", netdev->irq, err);
+		return err;
+	}
+
+	bcm4908enet_gmac_init(enet);
+	bcm4908enet_dma_reset(enet);
+	bcm4908enet_dma_init(enet);
+
+	enet_umac_set(enet, UMAC_CMD, CMD_TX_EN | CMD_RX_EN);
+
+	enet_set(enet, ENET_DMA_CONTROLLER_CFG, ENET_DMA_CTRL_CFG_MASTER_EN);
+	enet_maskset(enet, ENET_DMA_CONTROLLER_CFG, ENET_DMA_CTRL_CFG_FLOWC_CH1_EN, 0);
+	bcm4908enet_dma_rx_ring_enable(enet, &enet->rx_ring);
+
+	napi_enable(&enet->napi);
+	netif_carrier_on(netdev);
+	netif_start_queue(netdev);
+
+	bcm4908enet_intrs_ack(enet);
+	bcm4908enet_intrs_on(enet);
+
+	return 0;
+}
+
+static int bcm4908enet_stop(struct net_device *netdev)
+{
+	struct bcm4908enet *enet = netdev_priv(netdev);
+
+	netif_stop_queue(netdev);
+	netif_carrier_off(netdev);
+	napi_disable(&enet->napi);
+
+	bcm4908enet_dma_rx_ring_disable(enet, &enet->rx_ring);
+	bcm4908enet_dma_tx_ring_disable(enet, &enet->tx_ring);
+
+	bcm4908enet_dma_uninit(enet);
+
+	free_irq(enet->netdev->irq, enet);
+
+	return 0;
+}
+
+static int bcm4908enet_start_xmit(struct sk_buff *skb, struct net_device *netdev)
+{
+	struct bcm4908enet *enet = netdev_priv(netdev);
+	struct bcm4908enet_dma_ring *ring = &enet->tx_ring;
+	struct bcm4908enet_dma_ring_slot *slot;
+	struct device *dev = enet->dev;
+	struct bcm4908enet_dma_ring_bd *buf_desc;
+	int free_buf_descs;
+	u32 tmp;
+
+	/* Free transmitted skbs */
+	while (ring->read_idx != ring->write_idx) {
+		buf_desc = &ring->buf_desc[ring->read_idx];
+		if (buf_desc->ctl & DMA_CTL_STATUS_OWN)
+			break;
+		slot = &ring->slots[ring->read_idx];
+
+		dma_unmap_single(dev, slot->dma_addr, slot->len, DMA_TO_DEVICE);
+		dev_kfree_skb(slot->skb);
+		if (++ring->read_idx == ring->length)
+			ring->read_idx = 0;
+	}
+
+	/* Don't use the last empty buf descriptor */
+	if (ring->read_idx <= ring->write_idx)
+		free_buf_descs = ring->read_idx - ring->write_idx + ring->length;
+	else
+		free_buf_descs = ring->read_idx - ring->write_idx;
+	if (free_buf_descs < 2)
+		return NETDEV_TX_BUSY;
+
+	/* Hardware removes OWN bit after sending data */
+	buf_desc = &ring->buf_desc[ring->write_idx];
+	if (unlikely(le32_to_cpu(buf_desc->ctl) & DMA_CTL_STATUS_OWN)) {
+		netif_stop_queue(netdev);
+		return NETDEV_TX_BUSY;
+	}
+
+	slot = &ring->slots[ring->write_idx];
+	slot->skb = skb;
+	slot->len = skb->len;
+	slot->dma_addr = dma_map_single(dev, skb->data, skb->len, DMA_TO_DEVICE);
+	if (unlikely(dma_mapping_error(dev, slot->dma_addr)))
+		return NETDEV_TX_BUSY;
+
+	tmp = skb->len << DMA_CTL_LEN_DESC_BUFLENGTH_SHIFT;
+	tmp |= DMA_CTL_STATUS_OWN;
+	tmp |= DMA_CTL_STATUS_SOP;
+	tmp |= DMA_CTL_STATUS_EOP;
+	tmp |= DMA_CTL_STATUS_APPEND_CRC;
+	if (ring->write_idx + 1 == ring->length - 1)
+		tmp |= DMA_CTL_STATUS_WRAP;
+
+	buf_desc->addr = cpu_to_le32((uint32_t)slot->dma_addr);
+	buf_desc->ctl = cpu_to_le32(tmp);
+
+	bcm4908enet_dma_tx_ring_ensable(enet, &enet->tx_ring);
+
+	if (++ring->write_idx == ring->length - 1)
+		ring->write_idx = 0;
+	enet->netdev->stats.tx_bytes += skb->len;
+	enet->netdev->stats.tx_packets++;
+
+	return NETDEV_TX_OK;
+}
+
+static int bcm4908enet_poll(struct napi_struct *napi, int weight)
+{
+	struct bcm4908enet *enet = container_of(napi, struct bcm4908enet, napi);
+	struct device *dev = enet->dev;
+	int handled = 0;
+
+	while (handled < weight) {
+		struct bcm4908enet_dma_ring_bd *buf_desc;
+		struct bcm4908enet_dma_ring_slot slot;
+		u32 ctl;
+		int len;
+		int err;
+
+		buf_desc = &enet->rx_ring.buf_desc[enet->rx_ring.read_idx];
+		ctl = le32_to_cpu(buf_desc->ctl);
+		if (ctl & DMA_CTL_STATUS_OWN)
+			break;
+
+		slot = enet->rx_ring.slots[enet->rx_ring.read_idx];
+
+		/* Provide new buffer before unpinning the old one */
+		err = bcm4908enet_dma_alloc_rx_buf(enet, enet->rx_ring.read_idx);
+		if (err)
+			break;
+
+		if (++enet->rx_ring.read_idx == enet->rx_ring.length)
+			enet->rx_ring.read_idx = 0;
+
+		len = (ctl & DMA_CTL_LEN_DESC_BUFLENGTH) >> DMA_CTL_LEN_DESC_BUFLENGTH_SHIFT;
+
+		if (len < ENET_MTU_MIN ||
+		    (ctl & (DMA_CTL_STATUS_SOP | DMA_CTL_STATUS_EOP)) != (DMA_CTL_STATUS_SOP | DMA_CTL_STATUS_EOP)) {
+			enet->netdev->stats.rx_dropped++;
+			break;
+		}
+
+		dma_unmap_single(dev, slot.dma_addr, slot.len, DMA_FROM_DEVICE);
+
+		skb_put(slot.skb, len - 4 + 2);
+		slot.skb->protocol = eth_type_trans(slot.skb, enet->netdev);
+		netif_receive_skb(slot.skb);
+
+		enet->netdev->stats.rx_packets++;
+		enet->netdev->stats.rx_bytes += len;
+	}
+
+	if (handled < weight) {
+		napi_complete_done(napi, handled);
+		bcm4908enet_intrs_on(enet);
+	}
+
+	return handled;
+}
+
+static const struct net_device_ops bcm96xx_netdev_ops = {
+	.ndo_open = bcm4908enet_open,
+	.ndo_stop = bcm4908enet_stop,
+	.ndo_start_xmit = bcm4908enet_start_xmit,
+	.ndo_set_mac_address = eth_mac_addr,
+};
+
+static int bcm4908enet_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct net_device *netdev;
+	struct bcm4908enet *enet;
+	int err;
+
+	netdev = devm_alloc_etherdev(dev, sizeof(*enet));
+	if (!netdev)
+		return -ENOMEM;
+
+	enet = netdev_priv(netdev);
+	enet->dev = dev;
+	enet->netdev = netdev;
+
+	enet->base = devm_platform_ioremap_resource(pdev, 0);
+	if (IS_ERR(enet->base)) {
+		dev_err(dev, "Failed to map registers: %ld\n", PTR_ERR(enet->base));
+		return PTR_ERR(enet->base);
+	}
+
+	netdev->irq = platform_get_irq_byname(pdev, "rx");
+	if (netdev->irq < 0)
+		return netdev->irq;
+
+	dma_set_coherent_mask(dev, DMA_BIT_MASK(32));
+
+	err = bcm4908enet_dma_alloc(enet);
+	if (err)
+		return err;
+
+	SET_NETDEV_DEV(netdev, &pdev->dev);
+	eth_hw_addr_random(netdev);
+	netdev->netdev_ops = &bcm96xx_netdev_ops;
+	netdev->min_mtu = ETH_ZLEN;
+	netdev->mtu = ENET_MTU_MAX;
+	netdev->max_mtu = ENET_MTU_MAX;
+	netif_napi_add(netdev, &enet->napi, bcm4908enet_poll, 64);
+
+	err = register_netdev(netdev);
+	if (err) {
+		bcm4908enet_dma_free(enet);
+		return err;
+	}
+
+	platform_set_drvdata(pdev, enet);
+
+	return 0;
+}
+
+static int bcm4908enet_remove(struct platform_device *pdev)
+{
+	struct bcm4908enet *enet = platform_get_drvdata(pdev);
+
+	unregister_netdev(enet->netdev);
+	netif_napi_del(&enet->napi);
+	bcm4908enet_dma_free(enet);
+
+	return 0;
+}
+
+static const struct of_device_id bcm4908enet_of_match[] = {
+	{ .compatible = "brcm,bcm4908enet"},
+	{},
+};
+
+static struct platform_driver bcm4908enet_driver = {
+	.driver = {
+		.name = "bcm4908enet",
+		.of_match_table = bcm4908enet_of_match,
+	},
+	.probe	= bcm4908enet_probe,
+	.remove = bcm4908enet_remove,
+};
+module_platform_driver(bcm4908enet_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DEVICE_TABLE(of, bcm4908enet_of_match);
diff --git a/drivers/net/ethernet/broadcom/bcm4908enet.h b/drivers/net/ethernet/broadcom/bcm4908enet.h
new file mode 100644
index 000000000000..11aadf0715d3
--- /dev/null
+++ b/drivers/net/ethernet/broadcom/bcm4908enet.h
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __BCM4908ENET_H
+#define __BCM4908ENET_H
+
+#define ENET_CONTROL					0x000
+#define ENET_MIB_CTRL					0x004
+#define  ENET_MIB_CTRL_CLR_MIB				0x00000001
+#define ENET_RX_ERR_MASK				0x008
+#define ENET_MIB_MAX_PKT_SIZE				0x00C
+#define  ENET_MIB_MAX_PKT_SIZE_VAL			0x00003fff
+#define ENET_DIAG_OUT					0x01c
+#define ENET_ENABLE_DROP_PKT				0x020
+#define ENET_IRQ_ENABLE					0x024
+#define  ENET_IRQ_ENABLE_OVFL				0x00000001
+#define ENET_GMAC_STATUS				0x028
+#define  ENET_GMAC_STATUS_ETH_SPEED_MASK		0x00000003
+#define  ENET_GMAC_STATUS_ETH_SPEED_10			0x00000000
+#define  ENET_GMAC_STATUS_ETH_SPEED_100			0x00000001
+#define  ENET_GMAC_STATUS_ETH_SPEED_1000		0x00000002
+#define  ENET_GMAC_STATUS_HD				0x00000004
+#define  ENET_GMAC_STATUS_AUTO_CFG_EN			0x00000008
+#define  ENET_GMAC_STATUS_LINK_UP			0x00000010
+#define ENET_IRQ_STATUS					0x02c
+#define  ENET_IRQ_STATUS_OVFL				0x00000001
+#define ENET_OVERFLOW_COUNTER				0x030
+#define ENET_FLUSH					0x034
+#define  ENET_FLUSH_RXFIFO_FLUSH			0x00000001
+#define  ENET_FLUSH_TXFIFO_FLUSH			0x00000002
+#define ENET_RSV_SELECT					0x038
+#define ENET_BP_FORCE					0x03c
+#define  ENET_BP_FORCE_FORCE				0x00000001
+#define ENET_DMA_RX_OK_TO_SEND_COUNT			0x040
+#define  ENET_DMA_RX_OK_TO_SEND_COUNT_VAL		0x0000000f
+#define ENET_TX_CRC_CTRL				0x044
+#define ENET_MIB					0x200
+#define ENET_UNIMAC					0x400
+#define ENET_DMA					0x800
+#define ENET_DMA_CONTROLLER_CFG				0x800
+#define  ENET_DMA_CTRL_CFG_MASTER_EN			0x00000001
+#define  ENET_DMA_CTRL_CFG_FLOWC_CH1_EN			0x00000002
+#define  ENET_DMA_CTRL_CFG_FLOWC_CH3_EN			0x00000004
+#define ENET_DMA_FLOWCTL_CH1_THRESH_LO			0x804
+#define ENET_DMA_FLOWCTL_CH1_THRESH_HI			0x808
+#define ENET_DMA_FLOWCTL_CH1_ALLOC			0x80c
+#define  ENET_DMA_FLOWCTL_CH1_ALLOC_FORCE		0x80000000
+#define ENET_DMA_FLOWCTL_CH3_THRESH_LO			0x810
+#define ENET_DMA_FLOWCTL_CH3_THRESH_HI			0x814
+#define ENET_DMA_FLOWCTL_CH3_ALLOC			0x818
+#define ENET_DMA_FLOWCTL_CH5_THRESH_LO			0x81C
+#define ENET_DMA_FLOWCTL_CH5_THRESH_HI			0x820
+#define ENET_DMA_FLOWCTL_CH5_ALLOC			0x824
+#define ENET_DMA_FLOWCTL_CH7_THRESH_LO			0x828
+#define ENET_DMA_FLOWCTL_CH7_THRESH_HI			0x82C
+#define ENET_DMA_FLOWCTL_CH7_ALLOC			0x830
+#define ENET_DMA_CTRL_CHANNEL_RESET			0x834
+#define ENET_DMA_CTRL_CHANNEL_DEBUG			0x838
+#define ENET_DMA_CTRL_GLOBAL_INTERRUPT_STATUS		0x840
+#define ENET_DMA_CTRL_GLOBAL_INTERRUPT_MASK		0x844
+#define ENET_DMA_CH0_CFG				0xa00		/* RX */
+#define ENET_DMA_CH1_CFG				0xa10		/* TX */
+#define ENET_DMA_CH0_STATE_RAM				0xc00		/* RX */
+#define ENET_DMA_CH1_STATE_RAM				0xc10		/* TX */
+
+#define ENET_DMA_CH_CFG					0x00		/* assorted configuration */
+#define  ENET_DMA_CH_CFG_ENABLE				0x00000001	/* set to enable channel */
+#define  ENET_DMA_CH_CFG_PKT_HALT			0x00000002	/* idle after an EOP flag is detected */
+#define  ENET_DMA_CH_CFG_BURST_HALT			0x00000004	/* idle after finish current memory burst */
+#define ENET_DMA_CH_CFG_INT_STAT			0x04		/* interrupts control and status */
+#define ENET_DMA_CH_CFG_INT_MASK			0x08		/* interrupts mask */
+#define  ENET_DMA_CH_CFG_INT_BUFF_DONE			0x00000001	/* buffer done */
+#define  ENET_DMA_CH_CFG_INT_DONE			0x00000002	/* packet xfer complete */
+#define  ENET_DMA_CH_CFG_INT_NO_DESC			0x00000004	/* no valid descriptors */
+#define  ENET_DMA_CH_CFG_INT_RX_ERROR			0x00000008	/* rxdma detect client protocol error */
+#define ENET_DMA_CH_CFG_MAX_BURST			0x0c		/* max burst length permitted */
+#define  ENET_DMA_CH_CFG_MAX_BURST_DESCSIZE_SEL		0x00040000	/* DMA Descriptor Size Selection */
+#define ENET_DMA_CH_CFG_SIZE				0x10
+
+#define ENET_DMA_CH_STATE_RAM_BASE_DESC_PTR		0x00		/* descriptor ring start address */
+#define ENET_DMA_CH_STATE_RAM_STATE_DATA		0x04		/* state/bytes done/ring offset */
+#define ENET_DMA_CH_STATE_RAM_DESC_LEN_STATUS		0x08		/* buffer descriptor status and len */
+#define ENET_DMA_CH_STATE_RAM_DESC_BASE_BUFPTR		0x0c		/* buffer descrpitor current processing */
+#define ENET_DMA_CH_STATE_RAM_SIZE			0x10
+
+#define DMA_CTL_STATUS_APPEND_CRC			0x00000100
+#define DMA_CTL_STATUS_APPEND_BRCM_TAG			0x00000200
+#define DMA_CTL_STATUS_PRIO				0x00000C00  /* Prio for Tx */
+#define DMA_CTL_STATUS_WRAP				0x00001000  /* */
+#define DMA_CTL_STATUS_SOP				0x00002000  /* first buffer in packet */
+#define DMA_CTL_STATUS_EOP				0x00004000  /* last buffer in packet */
+#define DMA_CTL_STATUS_OWN				0x00008000  /* cleared by DMA, set by SW */
+#define DMA_CTL_LEN_DESC_BUFLENGTH			0x0fff0000
+#define DMA_CTL_LEN_DESC_BUFLENGTH_SHIFT		16
+#define DMA_CTL_LEN_DESC_MULTICAST			0x40000000
+#define DMA_CTL_LEN_DESC_USEFPM				0x80000000
+
+#endif
diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
index e20a1b3267b9..e9e60a935f40 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
@@ -272,7 +272,7 @@ struct hnae3_ring_chain_node {
 };
 
 #define HNAE3_IS_TX_RING(node) \
-	(((node)->flag & (1 << HNAE3_RING_TYPE_B)) == HNAE3_RING_TYPE_TX)
+	(((node)->flag & 1 << HNAE3_RING_TYPE_B) == HNAE3_RING_TYPE_TX)
 
 /* device specification info from firmware */
 struct hnae3_dev_specs {
@@ -292,7 +292,6 @@ struct hnae3_client_ops {
 	int (*init_instance)(struct hnae3_handle *handle);
 	void (*uninit_instance)(struct hnae3_handle *handle, bool reset);
 	void (*link_status_change)(struct hnae3_handle *handle, bool state);
-	int (*setup_tc)(struct hnae3_handle *handle, u8 tc);
 	int (*reset_notify)(struct hnae3_handle *handle,
 			    enum hnae3_reset_notify_type type);
 	void (*process_hw_error)(struct hnae3_handle *handle,
@@ -776,9 +775,9 @@ struct hnae3_handle {
 #define hnae3_get_field(origin, mask, shift) (((origin) & (mask)) >> (shift))
 
 #define hnae3_set_bit(origin, shift, val) \
-	hnae3_set_field((origin), (0x1 << (shift)), (shift), (val))
+	hnae3_set_field(origin, 0x1 << (shift), shift, val)
 #define hnae3_get_bit(origin, shift) \
-	hnae3_get_field((origin), (0x1 << (shift)), (shift))
+	hnae3_get_field(origin, 0x1 << (shift), shift)
 
 #define HNAE3_DBG_TM_NODES		"tm_nodes"
 #define HNAE3_DBG_TM_PRI		"tm_priority"
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c
index 36c7813b5996..818ac2c7c7ea 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c
@@ -162,7 +162,7 @@ static int hns3_dbg_queue_map(struct hnae3_handle *h)
 			continue;
 
 		dev_info(&h->pdev->dev,
-			 "      %4d            %4d            %4d\n",
+			 "      %4d            %4u            %4d\n",
 			 i, global_qid, priv->ring[i].tqp_vector->vector_irq);
 	}
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index cf16d5f31f26..9565b7999426 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -32,7 +32,7 @@
 #define CREATE_TRACE_POINTS
 #include "hns3_trace.h"
 
-#define hns3_set_field(origin, shift, val)	((origin) |= ((val) << (shift)))
+#define hns3_set_field(origin, shift, val)	((origin) |= (val) << (shift))
 #define hns3_tx_bd_count(S)	DIV_ROUND_UP(S, HNS3_MAX_BD_SIZE)
 
 #define hns3_rl_err(fmt, ...)						\
@@ -2329,7 +2329,7 @@ static pci_ers_result_t hns3_error_detected(struct pci_dev *pdev,
 	struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev);
 	pci_ers_result_t ret;
 
-	dev_info(&pdev->dev, "PCI error detected, state(=%d)!!\n", state);
+	dev_info(&pdev->dev, "PCI error detected, state(=%u)!!\n", state);
 
 	if (state == pci_channel_io_perm_failure)
 		return PCI_ERS_RESULT_DISCONNECT;
@@ -4084,7 +4084,7 @@ out_when_alloc_ring_memory:
 	return -ENOMEM;
 }
 
-int hns3_uninit_all_ring(struct hns3_nic_priv *priv)
+static void hns3_uninit_all_ring(struct hns3_nic_priv *priv)
 {
 	struct hnae3_handle *h = priv->ae_handle;
 	int i;
@@ -4093,7 +4093,6 @@ int hns3_uninit_all_ring(struct hns3_nic_priv *priv)
 		hns3_fini_ring(&priv->ring[i]);
 		hns3_fini_ring(&priv->ring[i + h->kinfo.num_tqps]);
 	}
-	return 0;
 }
 
 /* Set mac addr if it is configured. or leave it to the AE driver */
@@ -4321,7 +4320,6 @@ static void hns3_client_uninit(struct hnae3_handle *handle, bool reset)
 {
 	struct net_device *netdev = handle->kinfo.netdev;
 	struct hns3_nic_priv *priv = netdev_priv(netdev);
-	int ret;
 
 	if (netdev->reg_state != NETREG_UNINITIALIZED)
 		unregister_netdev(netdev);
@@ -4347,9 +4345,7 @@ static void hns3_client_uninit(struct hnae3_handle *handle, bool reset)
 
 	hns3_nic_dealloc_vector_data(priv);
 
-	ret = hns3_uninit_all_ring(priv);
-	if (ret)
-		netdev_err(netdev, "uninit ring error\n");
+	hns3_uninit_all_ring(priv);
 
 	hns3_put_ring_config(priv);
 
@@ -4378,20 +4374,6 @@ static void hns3_link_status_change(struct hnae3_handle *handle, bool linkup)
 	}
 }
 
-static int hns3_client_setup_tc(struct hnae3_handle *handle, u8 tc)
-{
-	struct hnae3_knic_private_info *kinfo = &handle->kinfo;
-	struct net_device *ndev = kinfo->netdev;
-
-	if (tc > HNAE3_MAX_TC)
-		return -EINVAL;
-
-	if (!ndev)
-		return -ENODEV;
-
-	return hns3_nic_set_real_num_queue(ndev);
-}
-
 static void hns3_clear_tx_ring(struct hns3_enet_ring *ring)
 {
 	while (ring->next_to_clean != ring->next_to_use) {
@@ -4676,9 +4658,7 @@ static int hns3_reset_notify_uninit_enet(struct hnae3_handle *handle)
 
 	hns3_nic_dealloc_vector_data(priv);
 
-	ret = hns3_uninit_all_ring(priv);
-	if (ret)
-		netdev_err(netdev, "uninit ring error\n");
+	hns3_uninit_all_ring(priv);
 
 	hns3_put_ring_config(priv);
 
@@ -4828,7 +4808,6 @@ static const struct hnae3_client_ops client_ops = {
 	.init_instance = hns3_client_init,
 	.uninit_instance = hns3_client_uninit,
 	.link_status_change = hns3_link_status_change,
-	.setup_tc = hns3_client_setup_tc,
 	.reset_notify = hns3_reset_notify,
 	.process_hw_error = hns3_process_hw_error,
 };
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
index d70af1d0d554..d069b04ee587 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
@@ -554,7 +554,7 @@ static inline void hns3_write_reg(void __iomem *base, u32 reg, u32 value)
 }
 
 #define hns3_read_dev(a, reg) \
-	hns3_read_reg((a)->io_base, (reg))
+	hns3_read_reg((a)->io_base, reg)
 
 static inline bool hns3_nic_resetting(struct net_device *netdev)
 {
@@ -564,7 +564,7 @@ static inline bool hns3_nic_resetting(struct net_device *netdev)
 }
 
 #define hns3_write_dev(a, reg, value) \
-	hns3_write_reg((a)->io_base, (reg), (value))
+	hns3_write_reg((a)->io_base, reg, value)
 
 #define ring_to_dev(ring) ((ring)->dev)
 
@@ -588,15 +588,15 @@ static inline unsigned int hns3_page_order(struct hns3_enet_ring *ring)
 
 /* iterator for handling rings in ring group */
 #define hns3_for_each_ring(pos, head) \
-	for (pos = (head).ring; pos; pos = pos->next)
+	for (pos = (head).ring; (pos); pos = (pos)->next)
 
 #define hns3_get_handle(ndev) \
 	(((struct hns3_nic_priv *)netdev_priv(ndev))->ae_handle)
 
-#define hns3_gl_usec_to_reg(int_gl) (int_gl >> 1)
+#define hns3_gl_usec_to_reg(int_gl) ((int_gl) >> 1)
 #define hns3_gl_round_down(int_gl) round_down(int_gl, 2)
 
-#define hns3_rl_usec_to_reg(int_rl) (int_rl >> 2)
+#define hns3_rl_usec_to_reg(int_rl) ((int_rl) >> 2)
 #define hns3_rl_round_down(int_rl) round_down(int_rl, 4)
 
 void hns3_ethtool_set_ops(struct net_device *netdev);
@@ -605,7 +605,6 @@ int hns3_set_channels(struct net_device *netdev,
 
 void hns3_clean_tx_ring(struct hns3_enet_ring *ring, int budget);
 int hns3_init_all_ring(struct hns3_nic_priv *priv);
-int hns3_uninit_all_ring(struct hns3_nic_priv *priv);
 int hns3_nic_reset_all_ring(struct hnae3_handle *h);
 void hns3_fini_ring(struct hns3_enet_ring *ring);
 netdev_tx_t hns3_nic_net_xmit(struct sk_buff *skb, struct net_device *netdev);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
index 79e0a9b14b68..adcec4ea7cb9 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
@@ -456,7 +456,7 @@ static void *hns3_update_strings(u8 *data, const struct hns3_stats *stats,
 			data[ETH_GSTRING_LEN - 1] = '\0';
 
 			/* first, prepend the prefix string */
-			n1 = scnprintf(data, MAX_PREFIX_SIZE, "%s%d_",
+			n1 = scnprintf(data, MAX_PREFIX_SIZE, "%s%u_",
 				       prefix, i);
 			size_left = (ETH_GSTRING_LEN - 1) - n1;
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
index e7c915eabc8a..ff52a65b4cff 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
@@ -1144,9 +1144,9 @@ static inline void hclge_write_reg(void __iomem *base, u32 reg, u32 value)
 }
 
 #define hclge_write_dev(a, reg, value) \
-	hclge_write_reg((a)->io_base, (reg), (value))
+	hclge_write_reg((a)->io_base, reg, value)
 #define hclge_read_dev(a, reg) \
-	hclge_read_reg((a)->io_base, (reg))
+	hclge_read_reg((a)->io_base, reg)
 
 static inline u32 hclge_read_reg(u8 __iomem *base, u32 reg)
 {
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c
index e08d11b8ecf1..5bf5db91d16c 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c
@@ -176,29 +176,6 @@ static int hclge_map_update(struct hclge_dev *hdev)
 	return hclge_rss_init_hw(hdev);
 }
 
-static int hclge_client_setup_tc(struct hclge_dev *hdev)
-{
-	struct hclge_vport *vport = hdev->vport;
-	struct hnae3_client *client;
-	struct hnae3_handle *handle;
-	int ret;
-	u32 i;
-
-	for (i = 0; i < hdev->num_vmdq_vport + 1; i++) {
-		handle = &vport[i].nic;
-		client = handle->client;
-
-		if (!client || !client->ops || !client->ops->setup_tc)
-			continue;
-
-		ret = client->ops->setup_tc(handle, hdev->tm_info.num_tc);
-		if (ret)
-			return ret;
-	}
-
-	return 0;
-}
-
 static int hclge_notify_down_uinit(struct hclge_dev *hdev)
 {
 	int ret;
@@ -257,10 +234,6 @@ static int hclge_ieee_setets(struct hnae3_handle *h, struct ieee_ets *ets)
 		if (ret)
 			goto err_out;
 
-		ret = hclge_client_setup_tc(hdev);
-		if (ret)
-			goto err_out;
-
 		ret = hclge_notify_init_up(hdev);
 		if (ret)
 			return ret;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c
index 113efd4ae157..a0a33c02ce25 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c
@@ -696,17 +696,16 @@ static void hclge_dbg_dump_tm_map(struct hclge_dev *hdev,
 	u32 qset_mapping[HCLGE_BP_EXT_GRP_NUM];
 	struct hclge_qs_to_pri_link_cmd *map;
 	struct hclge_tqp_tx_queue_tc_cmd *tc;
+	u16 group_id, queue_id, qset_id;
 	enum hclge_opcode_type cmd;
+	u8 grp_num, pri_id, tc_id;
 	struct hclge_desc desc;
-	int queue_id, group_id;
-	int tc_id, qset_id;
-	int pri_id, ret;
 	u16 qs_id_l;
 	u16 qs_id_h;
-	u8 grp_num;
+	int ret;
 	u32 i;
 
-	ret = kstrtouint(cmd_buf, 0, &queue_id);
+	ret = kstrtou16(cmd_buf, 0, &queue_id);
 	queue_id = (ret != 0) ? 0 : queue_id;
 
 	cmd = HCLGE_OPC_TM_NQ_TO_QS_LINK;
@@ -754,7 +753,7 @@ static void hclge_dbg_dump_tm_map(struct hclge_dev *hdev,
 	tc_id = tc->tc_id & 0x7;
 
 	dev_info(&hdev->pdev->dev, "queue_id | qset_id | pri_id | tc_id\n");
-	dev_info(&hdev->pdev->dev, "%04d     | %04d    | %02d     | %02d\n",
+	dev_info(&hdev->pdev->dev, "%04u     | %04u    | %02u     | %02u\n",
 		 queue_id, qset_id, pri_id, tc_id);
 
 	if (!hnae3_dev_dcb_supported(hdev)) {
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
index 9ee55ee0487d..0ca7f1b984bf 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
@@ -1073,7 +1073,7 @@ static int hclge_config_ssu_hw_err_int(struct hclge_dev *hdev, bool en)
  * This function querys number of mpf and pf buffer descriptors.
  */
 static int hclge_query_bd_num(struct hclge_dev *hdev, bool is_ras,
-			      int *mpf_bd_num, int *pf_bd_num)
+			      u32 *mpf_bd_num, u32 *pf_bd_num)
 {
 	struct device *dev = &hdev->pdev->dev;
 	u32 mpf_min_bd_num, pf_min_bd_num;
@@ -1102,7 +1102,7 @@ static int hclge_query_bd_num(struct hclge_dev *hdev, bool is_ras,
 	*mpf_bd_num = le32_to_cpu(desc_bd.data[0]);
 	*pf_bd_num = le32_to_cpu(desc_bd.data[1]);
 	if (*mpf_bd_num < mpf_min_bd_num || *pf_bd_num < pf_min_bd_num) {
-		dev_err(dev, "Invalid bd num: mpf(%d), pf(%d)\n",
+		dev_err(dev, "Invalid bd num: mpf(%u), pf(%u)\n",
 			*mpf_bd_num, *pf_bd_num);
 		return -EINVAL;
 	}
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index f5a988498cc3..64a80a5933cb 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -24,7 +24,7 @@
 #include "hnae3.h"
 
 #define HCLGE_NAME			"hclge"
-#define HCLGE_STATS_READ(p, offset) (*((u64 *)((u8 *)(p) + (offset))))
+#define HCLGE_STATS_READ(p, offset) (*(u64 *)((u8 *)(p) + (offset)))
 #define HCLGE_MAC_STATS_FIELD_OFF(f) (offsetof(struct hclge_mac_stats, f))
 
 #define HCLGE_BUF_SIZE_UNIT	256U
@@ -626,7 +626,7 @@ static u8 *hclge_tqps_get_strings(struct hnae3_handle *handle, u8 *data)
 	for (i = 0; i < kinfo->num_tqps; i++) {
 		struct hclge_tqp *tqp = container_of(handle->kinfo.tqp[i],
 			struct hclge_tqp, q);
-		snprintf(buff, ETH_GSTRING_LEN, "txq%d_pktnum_rcd",
+		snprintf(buff, ETH_GSTRING_LEN, "txq%u_pktnum_rcd",
 			 tqp->index);
 		buff = buff + ETH_GSTRING_LEN;
 	}
@@ -634,7 +634,7 @@ static u8 *hclge_tqps_get_strings(struct hnae3_handle *handle, u8 *data)
 	for (i = 0; i < kinfo->num_tqps; i++) {
 		struct hclge_tqp *tqp = container_of(kinfo->tqp[i],
 			struct hclge_tqp, q);
-		snprintf(buff, ETH_GSTRING_LEN, "rxq%d_pktnum_rcd",
+		snprintf(buff, ETH_GSTRING_LEN, "rxq%u_pktnum_rcd",
 			 tqp->index);
 		buff = buff + ETH_GSTRING_LEN;
 	}
@@ -928,7 +928,7 @@ static int hclge_query_pf_resource(struct hclge_dev *hdev)
 	return 0;
 }
 
-static int hclge_parse_speed(int speed_cmd, int *speed)
+static int hclge_parse_speed(u8 speed_cmd, u32 *speed)
 {
 	switch (speed_cmd) {
 	case 6:
@@ -5595,7 +5595,7 @@ static int hclge_fd_check_ext_tuple(struct hclge_dev *hdev,
 		if (fs->m_ext.vlan_tci &&
 		    be16_to_cpu(fs->h_ext.vlan_tci) >= VLAN_N_VID) {
 			dev_err(&hdev->pdev->dev,
-				"failed to config vlan_tci, invalid vlan_tci: %u, max is %u.\n",
+				"failed to config vlan_tci, invalid vlan_tci: %u, max is %d.\n",
 				ntohs(fs->h_ext.vlan_tci), VLAN_N_VID - 1);
 			return -EINVAL;
 		}
@@ -10831,7 +10831,7 @@ static void hclge_reset_vf_rate(struct hclge_dev *hdev)
 	}
 }
 
-static int hclge_vf_rate_param_check(struct hclge_dev *hdev, int vf,
+static int hclge_vf_rate_param_check(struct hclge_dev *hdev,
 				     int min_tx_rate, int max_tx_rate)
 {
 	if (min_tx_rate != 0 ||
@@ -10852,7 +10852,7 @@ static int hclge_set_vf_rate(struct hnae3_handle *handle, int vf,
 	struct hclge_dev *hdev = vport->back;
 	int ret;
 
-	ret = hclge_vf_rate_param_check(hdev, vf, min_tx_rate, max_tx_rate);
+	ret = hclge_vf_rate_param_check(hdev, min_tx_rate, max_tx_rate);
 	if (ret)
 		return ret;
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
index 33b17a199e18..19d7f28773f3 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
@@ -46,15 +46,12 @@
 #define HCLGE_CMDQ_RX_DEPTH_REG		0x27020
 #define HCLGE_CMDQ_RX_TAIL_REG		0x27024
 #define HCLGE_CMDQ_RX_HEAD_REG		0x27028
-#define HCLGE_CMDQ_INTR_SRC_REG		0x27100
 #define HCLGE_CMDQ_INTR_STS_REG		0x27104
 #define HCLGE_CMDQ_INTR_EN_REG		0x27108
 #define HCLGE_CMDQ_INTR_GEN_REG		0x2710C
 
 /* bar registers for common func */
 #define HCLGE_VECTOR0_OTER_EN_REG	0x20600
-#define HCLGE_RAS_OTHER_STS_REG		0x20B00
-#define HCLGE_FUNC_RESET_STS_REG	0x20C00
 #define HCLGE_GRO_EN_REG		0x28000
 
 /* bar registers for rcb */
@@ -728,7 +725,7 @@ struct hclge_vf_vlan_cfg {
  *	x = (~k) & v
  *	y = (k ^ ~v) & k
  */
-#define calc_x(x, k, v) ((x) = (~(k) & (v)))
+#define calc_x(x, k, v) (x = ~(k) & (v))
 #define calc_y(y, k, v) \
 	do { \
 		const typeof(k) _k_ = (k); \
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
index 754c09ada901..52a373722589 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
@@ -56,7 +56,7 @@ static int hclge_gen_resp_to_vf(struct hclge_vport *vport,
 		resp_pf_to_vf->msg.resp_status = resp;
 	} else {
 		dev_warn(&hdev->pdev->dev,
-			 "failed to send response to VF, response status %d is out-of-bound\n",
+			 "failed to send response to VF, response status %u is out-of-bound\n",
 			 resp);
 		resp_pf_to_vf->msg.resp_status = EIO;
 	}
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
index 906d98e515aa..151afd1f0688 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
@@ -41,8 +41,9 @@ static int hclge_shaper_para_calc(u32 ir, u8 shaper_level,
 				  struct hclge_shaper_ir_para *ir_para,
 				  u32 max_tm_rate)
 {
+#define DEFAULT_SHAPER_IR_B	126
 #define DIVISOR_CLK		(1000 * 8)
-#define DIVISOR_IR_B_126	(126 * DIVISOR_CLK)
+#define DEFAULT_DIVISOR_IR_B	(DEFAULT_SHAPER_IR_B * DIVISOR_CLK)
 
 	static const u16 tick_array[HCLGE_SHAPER_LVL_CNT] = {
 		6 * 256,        /* Prioriy level */
@@ -69,10 +70,10 @@ static int hclge_shaper_para_calc(u32 ir, u8 shaper_level,
 	 * ir_calc = ---------------- * 1000
 	 *		tick * 1
 	 */
-	ir_calc = (DIVISOR_IR_B_126 + (tick >> 1) - 1) / tick;
+	ir_calc = (DEFAULT_DIVISOR_IR_B + (tick >> 1) - 1) / tick;
 
 	if (ir_calc == ir) {
-		ir_para->ir_b = 126;
+		ir_para->ir_b = DEFAULT_SHAPER_IR_B;
 		ir_para->ir_u = 0;
 		ir_para->ir_s = 0;
 
@@ -81,7 +82,8 @@ static int hclge_shaper_para_calc(u32 ir, u8 shaper_level,
 		/* Increasing the denominator to select ir_s value */
 		while (ir_calc >= ir && ir) {
 			ir_s_calc++;
-			ir_calc = DIVISOR_IR_B_126 / (tick * (1 << ir_s_calc));
+			ir_calc = DEFAULT_DIVISOR_IR_B /
+				  (tick * (1 << ir_s_calc));
 		}
 
 		ir_para->ir_b = (ir * tick * (1 << ir_s_calc) +
@@ -92,12 +94,12 @@ static int hclge_shaper_para_calc(u32 ir, u8 shaper_level,
 
 		while (ir_calc < ir) {
 			ir_u_calc++;
-			numerator = DIVISOR_IR_B_126 * (1 << ir_u_calc);
+			numerator = DEFAULT_DIVISOR_IR_B * (1 << ir_u_calc);
 			ir_calc = (numerator + (tick >> 1)) / tick;
 		}
 
 		if (ir_calc == ir) {
-			ir_para->ir_b = 126;
+			ir_para->ir_b = DEFAULT_SHAPER_IR_B;
 		} else {
 			u32 denominator = DIVISOR_CLK * (1 << --ir_u_calc);
 			ir_para->ir_b = (ir * tick + (denominator >> 1)) /
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h
index d33cb04acbef..b25d76023af0 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h
@@ -17,7 +17,7 @@
 
 /* SP or DWRR */
 #define HCLGE_TM_TX_SCHD_DWRR_MSK	BIT(0)
-#define HCLGE_TM_TX_SCHD_SP_MSK		(0xFE)
+#define HCLGE_TM_TX_SCHD_SP_MSK		0xFE
 
 #define HCLGE_ETHER_MAX_RATE	100000
 
@@ -214,8 +214,8 @@ struct hclge_pri_shaper_para {
 			   (HCLGE_TM_SHAP_##string##_MSK), \
 			   (HCLGE_TM_SHAP_##string##_LSH), val)
 #define hclge_tm_get_field(src, string) \
-			hnae3_get_field((src), (HCLGE_TM_SHAP_##string##_MSK), \
-				       (HCLGE_TM_SHAP_##string##_LSH))
+			hnae3_get_field((src), HCLGE_TM_SHAP_##string##_MSK, \
+					HCLGE_TM_SHAP_##string##_LSH)
 
 int hclge_tm_schd_init(struct hclge_dev *hdev);
 int hclge_tm_vport_map_update(struct hclge_dev *hdev);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.h
index ac2864a7ce8d..8a37a22a176b 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.h
@@ -216,8 +216,8 @@ struct hclgevf_rss_input_tuple_cmd {
 #define HCLGEVF_RSS_CFG_TBL_SIZE	16
 
 struct hclgevf_rss_indirection_table_cmd {
-	u16 start_table_index;
-	u16 rss_set_bitmap;
+	__le16 start_table_index;
+	__le16 rss_set_bitmap;
 	u8 rsv[4];
 	u8 rss_result[HCLGEVF_RSS_CFG_TBL_SIZE];
 };
@@ -229,7 +229,7 @@ struct hclgevf_rss_indirection_table_cmd {
 #define HCLGEVF_RSS_TC_VALID_B		15
 #define HCLGEVF_MAX_TC_NUM		8
 struct hclgevf_rss_tc_mode_cmd {
-	u16 rss_tc_mode[HCLGEVF_MAX_TC_NUM];
+	__le16 rss_tc_mode[HCLGEVF_MAX_TC_NUM];
 	u8 rsv[8];
 };
 
@@ -278,7 +278,6 @@ struct hclgevf_cfg_tx_queue_pointer_cmd {
 
 #define HCLGEVF_NIC_CMQ_DESC_NUM	1024
 #define HCLGEVF_NIC_CMQ_DESC_NUM_S	3
-#define HCLGEVF_NIC_CMDQ_INT_SRC_REG	0x27100
 
 #define HCLGEVF_QUERY_DEV_SPECS_BD_NUM		4
 
@@ -315,9 +314,9 @@ static inline u32 hclgevf_read_reg(u8 __iomem *base, u32 reg)
 }
 
 #define hclgevf_write_dev(a, reg, value) \
-	hclgevf_write_reg((a)->io_base, (reg), (value))
+	hclgevf_write_reg((a)->io_base, reg, value)
 #define hclgevf_read_dev(a, reg) \
-	hclgevf_read_reg((a)->io_base, (reg))
+	hclgevf_read_reg((a)->io_base, reg)
 
 #define HCLGEVF_SEND_SYNC(flag) \
 	((flag) & HCLGEVF_CMD_FLAG_NO_INTR)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
index cdb1131ba239..ece31693e624 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
@@ -180,7 +180,7 @@ static u8 *hclgevf_tqps_get_strings(struct hnae3_handle *handle, u8 *data)
 	for (i = 0; i < kinfo->num_tqps; i++) {
 		struct hclgevf_tqp *tqp = container_of(kinfo->tqp[i],
 						       struct hclgevf_tqp, q);
-		snprintf(buff, ETH_GSTRING_LEN, "txq%d_pktnum_rcd",
+		snprintf(buff, ETH_GSTRING_LEN, "txq%u_pktnum_rcd",
 			 tqp->index);
 		buff += ETH_GSTRING_LEN;
 	}
@@ -188,7 +188,7 @@ static u8 *hclgevf_tqps_get_strings(struct hnae3_handle *handle, u8 *data)
 	for (i = 0; i < kinfo->num_tqps; i++) {
 		struct hclgevf_tqp *tqp = container_of(kinfo->tqp[i],
 						       struct hclgevf_tqp, q);
-		snprintf(buff, ETH_GSTRING_LEN, "rxq%d_pktnum_rcd",
+		snprintf(buff, ETH_GSTRING_LEN, "rxq%u_pktnum_rcd",
 			 tqp->index);
 		buff += ETH_GSTRING_LEN;
 	}
@@ -658,8 +658,9 @@ static int hclgevf_set_rss_indir_table(struct hclgevf_dev *hdev)
 	for (i = 0; i < rss_cfg_tbl_num; i++) {
 		hclgevf_cmd_setup_basic_desc(&desc, HCLGEVF_OPC_RSS_INDIR_TABLE,
 					     false);
-		req->start_table_index = i * HCLGEVF_RSS_CFG_TBL_SIZE;
-		req->rss_set_bitmap = HCLGEVF_RSS_SET_BITMAP_MSK;
+		req->start_table_index =
+			cpu_to_le16(i * HCLGEVF_RSS_CFG_TBL_SIZE);
+		req->rss_set_bitmap = cpu_to_le16(HCLGEVF_RSS_SET_BITMAP_MSK);
 		for (j = 0; j < HCLGEVF_RSS_CFG_TBL_SIZE; j++)
 			req->rss_result[j] =
 				indir[i * HCLGEVF_RSS_CFG_TBL_SIZE + j];
@@ -700,12 +701,16 @@ static int hclgevf_set_rss_tc_mode(struct hclgevf_dev *hdev,  u16 rss_size)
 
 	hclgevf_cmd_setup_basic_desc(&desc, HCLGEVF_OPC_RSS_TC_MODE, false);
 	for (i = 0; i < HCLGEVF_MAX_TC_NUM; i++) {
-		hnae3_set_bit(req->rss_tc_mode[i], HCLGEVF_RSS_TC_VALID_B,
+		u16 mode = 0;
+
+		hnae3_set_bit(mode, HCLGEVF_RSS_TC_VALID_B,
 			      (tc_valid[i] & 0x1));
-		hnae3_set_field(req->rss_tc_mode[i], HCLGEVF_RSS_TC_SIZE_M,
+		hnae3_set_field(mode, HCLGEVF_RSS_TC_SIZE_M,
 				HCLGEVF_RSS_TC_SIZE_S, tc_size[i]);
-		hnae3_set_field(req->rss_tc_mode[i], HCLGEVF_RSS_TC_OFFSET_M,
+		hnae3_set_field(mode, HCLGEVF_RSS_TC_OFFSET_M,
 				HCLGEVF_RSS_TC_OFFSET_S, tc_offset[i]);
+
+		req->rss_tc_mode[i] = cpu_to_le16(mode);
 	}
 	status = hclgevf_cmd_send(&hdev->hw, &desc, 1);
 	if (status)
diff --git a/drivers/net/ethernet/intel/ice/Makefile b/drivers/net/ethernet/intel/ice/Makefile
index 6da4f43f2348..73da4f71f530 100644
--- a/drivers/net/ethernet/intel/ice/Makefile
+++ b/drivers/net/ethernet/intel/ice/Makefile
@@ -24,6 +24,7 @@ ice-y := ice_main.o	\
 	 ice_flow.o	\
 	 ice_devlink.o	\
 	 ice_fw_update.o \
+	 ice_lag.o	\
 	 ice_ethtool.o
 ice-$(CONFIG_PCI_IOV) += ice_virtchnl_pf.o ice_sriov.o
 ice-$(CONFIG_DCB) += ice_dcb.o ice_dcb_nl.o ice_dcb_lib.o
diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
index fca428c879ec..dae8280ce17c 100644
--- a/drivers/net/ethernet/intel/ice/ice.h
+++ b/drivers/net/ethernet/intel/ice/ice.h
@@ -39,6 +39,7 @@
 #include <net/devlink.h>
 #include <net/ipv6.h>
 #include <net/xdp_sock.h>
+#include <net/xdp_sock_drv.h>
 #include <net/geneve.h>
 #include <net/gre.h>
 #include <net/udp_tunnel.h>
@@ -55,6 +56,7 @@
 #include "ice_fdir.h"
 #include "ice_xsk.h"
 #include "ice_arfs.h"
+#include "ice_lag.h"
 
 #define ICE_BAR0		0
 #define ICE_REQ_DESC_MULTIPLE	32
@@ -326,9 +328,11 @@ struct ice_vsi {
 	struct ice_ring **xdp_rings;	 /* XDP ring array */
 	u16 num_xdp_txq;		 /* Used XDP queues */
 	u8 xdp_mapping_mode;		 /* ICE_MAP_MODE_[CONTIG|SCATTER] */
-	struct xsk_buff_pool **xsk_pools;
-	u16 num_xsk_pools_used;
-	u16 num_xsk_pools;
+
+	/* setup back reference, to which aggregator node this VSI
+	 * corresponds to
+	 */
+	struct ice_agg_node *agg_node;
 } ____cacheline_internodealigned_in_smp;
 
 /* struct that defines an interrupt vector */
@@ -377,6 +381,13 @@ enum ice_pf_flags {
 	ICE_PF_FLAGS_NBITS		/* must be last */
 };
 
+struct ice_agg_node {
+	u32 agg_id;
+#define ICE_MAX_VSIS_IN_AGG_NODE	64
+	u32 num_vsis;
+	u8 valid;
+};
+
 struct ice_pf {
 	struct pci_dev *pdev;
 
@@ -455,6 +466,15 @@ struct ice_pf {
 	__le64 nvm_phy_type_lo; /* NVM PHY type low */
 	__le64 nvm_phy_type_hi; /* NVM PHY type high */
 	struct ice_link_default_override_tlv link_dflt_override;
+	struct ice_lag *lag; /* Link Aggregation information */
+
+#define ICE_INVALID_AGG_NODE_ID		0
+#define ICE_PF_AGG_NODE_ID_START	1
+#define ICE_MAX_PF_AGG_NODES		32
+	struct ice_agg_node pf_agg_node[ICE_MAX_PF_AGG_NODES];
+#define ICE_VF_AGG_NODE_ID_START	65
+#define ICE_MAX_VF_AGG_NODES		32
+	struct ice_agg_node vf_agg_node[ICE_MAX_VF_AGG_NODES];
 };
 
 struct ice_netdev_priv {
@@ -517,17 +537,15 @@ static inline void ice_set_ring_xdp(struct ice_ring *ring)
  */
 static inline struct xsk_buff_pool *ice_xsk_pool(struct ice_ring *ring)
 {
-	struct xsk_buff_pool **pools = ring->vsi->xsk_pools;
 	u16 qid = ring->q_index;
 
 	if (ice_ring_is_xdp(ring))
 		qid -= ring->vsi->num_xdp_txq;
 
-	if (qid >= ring->vsi->num_xsk_pools || !pools || !pools[qid] ||
-	    !ice_is_xdp_ena_vsi(ring->vsi))
+	if (!ice_is_xdp_ena_vsi(ring->vsi))
 		return NULL;
 
-	return pools[qid];
+	return xsk_get_pool_from_qid(ring->vsi->netdev, qid);
 }
 
 /**
@@ -557,11 +575,31 @@ static inline struct ice_vsi *ice_get_ctrl_vsi(struct ice_pf *pf)
 	return pf->vsi[pf->ctrl_vsi_idx];
 }
 
+/**
+ * ice_set_sriov_cap - enable SRIOV in PF flags
+ * @pf: PF struct
+ */
+static inline void ice_set_sriov_cap(struct ice_pf *pf)
+{
+	if (pf->hw.func_caps.common_cap.sr_iov_1_1)
+		set_bit(ICE_FLAG_SRIOV_CAPABLE, pf->flags);
+}
+
+/**
+ * ice_clear_sriov_cap - disable SRIOV in PF flags
+ * @pf: PF struct
+ */
+static inline void ice_clear_sriov_cap(struct ice_pf *pf)
+{
+	clear_bit(ICE_FLAG_SRIOV_CAPABLE, pf->flags);
+}
+
 #define ICE_FD_STAT_CTR_BLOCK_COUNT	256
 #define ICE_FD_STAT_PF_IDX(base_idx) \
 			((base_idx) * ICE_FD_STAT_CTR_BLOCK_COUNT)
 #define ICE_FD_SB_STAT_IDX(base_idx) ICE_FD_STAT_PF_IDX(base_idx)
 
+bool netif_is_ice(struct net_device *dev);
 int ice_vsi_setup_tx_rings(struct ice_vsi *vsi);
 int ice_vsi_setup_rx_rings(struct ice_vsi *vsi);
 int ice_vsi_open_ctrl(struct ice_vsi *vsi);
diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
index a51470b68d54..80186589153b 100644
--- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
@@ -695,6 +695,18 @@ struct ice_aqc_sched_elem_cmd {
 	__le32 addr_low;
 };
 
+struct ice_aqc_txsched_move_grp_info_hdr {
+	__le32 src_parent_teid;
+	__le32 dest_parent_teid;
+	__le16 num_elems;
+	__le16 reserved;
+};
+
+struct ice_aqc_move_elem {
+	struct ice_aqc_txsched_move_grp_info_hdr hdr;
+	__le32 teid[];
+};
+
 struct ice_aqc_elem_info_bw {
 	__le16 bw_profile_idx;
 	__le16 bw_alloc;
@@ -1528,6 +1540,16 @@ struct ice_aqc_lldp_stop_start_specific_agent {
 	u8 reserved[15];
 };
 
+/* LLDP Filter Control (direct 0x0A0A) */
+struct ice_aqc_lldp_filter_ctrl {
+	u8 cmd_flags;
+#define ICE_AQC_LLDP_FILTER_ACTION_ADD		0x0
+#define ICE_AQC_LLDP_FILTER_ACTION_DELETE	0x1
+	u8 reserved1;
+	__le16 vsi_num;
+	u8 reserved2[12];
+};
+
 /* Get/Set RSS key (indirect 0x0B04/0x0B02) */
 struct ice_aqc_get_set_rss_key {
 #define ICE_AQC_GSET_RSS_KEY_VSI_VALID	BIT(15)
@@ -1851,6 +1873,7 @@ struct ice_aq_desc {
 		struct ice_aqc_lldp_start lldp_start;
 		struct ice_aqc_lldp_set_local_mib lldp_set_mib;
 		struct ice_aqc_lldp_stop_start_specific_agent lldp_agent_ctrl;
+		struct ice_aqc_lldp_filter_ctrl lldp_filter_ctrl;
 		struct ice_aqc_get_set_rss_lut get_set_rss_lut;
 		struct ice_aqc_get_set_rss_key get_set_rss_key;
 		struct ice_aqc_add_txqs add_txqs;
@@ -1950,6 +1973,7 @@ enum ice_adminq_opc {
 	ice_aqc_opc_add_sched_elems			= 0x0401,
 	ice_aqc_opc_cfg_sched_elems			= 0x0403,
 	ice_aqc_opc_get_sched_elems			= 0x0404,
+	ice_aqc_opc_move_sched_elems			= 0x0408,
 	ice_aqc_opc_suspend_sched_elems			= 0x0409,
 	ice_aqc_opc_resume_sched_elems			= 0x040A,
 	ice_aqc_opc_query_port_ets			= 0x040E,
@@ -1991,6 +2015,7 @@ enum ice_adminq_opc {
 	ice_aqc_opc_get_cee_dcb_cfg			= 0x0A07,
 	ice_aqc_opc_lldp_set_local_mib			= 0x0A08,
 	ice_aqc_opc_lldp_stop_start_specific_agent	= 0x0A09,
+	ice_aqc_opc_lldp_filter_ctrl			= 0x0A0A,
 
 	/* RSS commands */
 	ice_aqc_opc_set_rss_key				= 0x0B02,
diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c
index 607d33d05a0c..3d9475e222cd 100644
--- a/drivers/net/ethernet/intel/ice/ice_common.c
+++ b/drivers/net/ethernet/intel/ice/ice_common.c
@@ -110,7 +110,7 @@ ice_aq_manage_mac_read(struct ice_hw *hw, void *buf, u16 buf_size,
 	if (status)
 		return status;
 
-	resp = (struct ice_aqc_manage_mac_read_resp *)buf;
+	resp = buf;
 	flags = le16_to_cpu(cmd->flags) & ICE_AQC_MAN_MAC_READ_M;
 
 	if (!(flags & ICE_AQC_MAN_MAC_LAN_ADDR_VALID)) {
@@ -907,6 +907,7 @@ enum ice_status ice_init_hw(struct ice_hw *hw)
 		ice_debug(hw, ICE_DBG_SCHED, "Failed to get scheduler allocated resources\n");
 		goto err_unroll_alloc;
 	}
+	ice_sched_get_psm_clk_freq(hw);
 
 	/* Initialize port_info struct with scheduler data */
 	status = ice_sched_init_port(hw->port_info);
@@ -1979,7 +1980,7 @@ ice_parse_func_caps(struct ice_hw *hw, struct ice_hw_func_caps *func_p,
 	struct ice_aqc_list_caps_elem *cap_resp;
 	u32 i;
 
-	cap_resp = (struct ice_aqc_list_caps_elem *)buf;
+	cap_resp = buf;
 
 	memset(func_p, 0, sizeof(*func_p));
 
@@ -2109,7 +2110,7 @@ ice_parse_dev_caps(struct ice_hw *hw, struct ice_hw_dev_caps *dev_p,
 	struct ice_aqc_list_caps_elem *cap_resp;
 	u32 i;
 
-	cap_resp = (struct ice_aqc_list_caps_elem *)buf;
+	cap_resp = buf;
 
 	memset(dev_p, 0, sizeof(*dev_p));
 
@@ -4078,6 +4079,7 @@ static enum ice_status ice_replay_pre_init(struct ice_hw *hw)
 	for (i = 0; i < ICE_SW_LKUP_LAST; i++)
 		list_replace_init(&sw->recp_list[i].filt_rules,
 				  &sw->recp_list[i].filt_replay_rules);
+	ice_sched_replay_agg_vsi_preinit(hw);
 
 	return 0;
 }
@@ -4109,6 +4111,8 @@ enum ice_status ice_replay_vsi(struct ice_hw *hw, u16 vsi_handle)
 		return status;
 	/* Replay per VSI all filters */
 	status = ice_replay_vsi_all_fltr(hw, vsi_handle);
+	if (!status)
+		status = ice_replay_vsi_agg(hw, vsi_handle);
 	return status;
 }
 
@@ -4122,6 +4126,7 @@ void ice_replay_post(struct ice_hw *hw)
 {
 	/* Delete old entries from replay filter list head */
 	ice_rm_all_sw_replay_rule_info(hw);
+	ice_sched_replay_agg(hw);
 }
 
 /**
@@ -4366,3 +4371,50 @@ ice_aq_set_lldp_mib(struct ice_hw *hw, u8 mib_type, void *buf, u16 buf_size,
 
 	return ice_aq_send_cmd(hw, &desc, buf, buf_size, cd);
 }
+
+/**
+ * ice_fw_supports_lldp_fltr - check NVM version supports lldp_fltr_ctrl
+ * @hw: pointer to HW struct
+ */
+bool ice_fw_supports_lldp_fltr_ctrl(struct ice_hw *hw)
+{
+	if (hw->mac_type != ICE_MAC_E810)
+		return false;
+
+	if (hw->api_maj_ver == ICE_FW_API_LLDP_FLTR_MAJ) {
+		if (hw->api_min_ver > ICE_FW_API_LLDP_FLTR_MIN)
+			return true;
+		if (hw->api_min_ver == ICE_FW_API_LLDP_FLTR_MIN &&
+		    hw->api_patch >= ICE_FW_API_LLDP_FLTR_PATCH)
+			return true;
+	} else if (hw->api_maj_ver > ICE_FW_API_LLDP_FLTR_MAJ) {
+		return true;
+	}
+	return false;
+}
+
+/**
+ * ice_lldp_fltr_add_remove - add or remove a LLDP Rx switch filter
+ * @hw: pointer to HW struct
+ * @vsi_num: absolute HW index for VSI
+ * @add: boolean for if adding or removing a filter
+ */
+enum ice_status
+ice_lldp_fltr_add_remove(struct ice_hw *hw, u16 vsi_num, bool add)
+{
+	struct ice_aqc_lldp_filter_ctrl *cmd;
+	struct ice_aq_desc desc;
+
+	cmd = &desc.params.lldp_filter_ctrl;
+
+	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_lldp_filter_ctrl);
+
+	if (add)
+		cmd->cmd_flags = ICE_AQC_LLDP_FILTER_ACTION_ADD;
+	else
+		cmd->cmd_flags = ICE_AQC_LLDP_FILTER_ACTION_DELETE;
+
+	cmd->vsi_num = cpu_to_le16(vsi_num);
+
+	return ice_aq_send_cmd(hw, &desc, NULL, 0, NULL);
+}
diff --git a/drivers/net/ethernet/intel/ice/ice_common.h b/drivers/net/ethernet/intel/ice/ice_common.h
index 3ebb973878c7..baf4064fcbfe 100644
--- a/drivers/net/ethernet/intel/ice/ice_common.h
+++ b/drivers/net/ethernet/intel/ice/ice_common.h
@@ -175,4 +175,7 @@ ice_sched_query_elem(struct ice_hw *hw, u32 node_teid,
 enum ice_status
 ice_aq_set_lldp_mib(struct ice_hw *hw, u8 mib_type, void *buf, u16 buf_size,
 		    struct ice_sq_cd *cd);
+bool ice_fw_supports_lldp_fltr_ctrl(struct ice_hw *hw);
+enum ice_status
+ice_lldp_fltr_add_remove(struct ice_hw *hw, u16 vsi_num, bool add);
 #endif /* _ICE_COMMON_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_controlq.c b/drivers/net/ethernet/intel/ice/ice_controlq.c
index 4db12d1f5808..b2d8a5932b1d 100644
--- a/drivers/net/ethernet/intel/ice/ice_controlq.c
+++ b/drivers/net/ethernet/intel/ice/ice_controlq.c
@@ -838,7 +838,7 @@ static u16 ice_clean_sq(struct ice_hw *hw, struct ice_ctl_q_info *cq)
  */
 static void ice_debug_cq(struct ice_hw *hw, void *desc, void *buf, u16 buf_len)
 {
-	struct ice_aq_desc *cq_desc = (struct ice_aq_desc *)desc;
+	struct ice_aq_desc *cq_desc = desc;
 	u16 len;
 
 	if (!IS_ENABLED(CONFIG_DYNAMIC_DEBUG) &&
@@ -868,7 +868,7 @@ static void ice_debug_cq(struct ice_hw *hw, void *desc, void *buf, u16 buf_len)
 		if (buf_len < len)
 			len = buf_len;
 
-		ice_debug_array(hw, ICE_DBG_AQ_DESC_BUF, 16, 1, (u8 *)buf, len);
+		ice_debug_array(hw, ICE_DBG_AQ_DESC_BUF, 16, 1, buf, len);
 	}
 }
 
diff --git a/drivers/net/ethernet/intel/ice/ice_dcb.c b/drivers/net/ethernet/intel/ice/ice_dcb.c
index 2a3147ee0bbb..e42727941ef5 100644
--- a/drivers/net/ethernet/intel/ice/ice_dcb.c
+++ b/drivers/net/ethernet/intel/ice/ice_dcb.c
@@ -850,9 +850,9 @@ ice_get_ieee_or_cee_dcb_cfg(struct ice_port_info *pi, u8 dcbx_mode)
 		return ICE_ERR_PARAM;
 
 	if (dcbx_mode == ICE_DCBX_MODE_IEEE)
-		dcbx_cfg = &pi->local_dcbx_cfg;
+		dcbx_cfg = &pi->qos_cfg.local_dcbx_cfg;
 	else if (dcbx_mode == ICE_DCBX_MODE_CEE)
-		dcbx_cfg = &pi->desired_dcbx_cfg;
+		dcbx_cfg = &pi->qos_cfg.desired_dcbx_cfg;
 
 	/* Get Local DCB Config in case of ICE_DCBX_MODE_IEEE
 	 * or get CEE DCB Desired Config in case of ICE_DCBX_MODE_CEE
@@ -863,7 +863,7 @@ ice_get_ieee_or_cee_dcb_cfg(struct ice_port_info *pi, u8 dcbx_mode)
 		goto out;
 
 	/* Get Remote DCB Config */
-	dcbx_cfg = &pi->remote_dcbx_cfg;
+	dcbx_cfg = &pi->qos_cfg.remote_dcbx_cfg;
 	ret = ice_aq_get_dcb_cfg(pi->hw, ICE_AQ_LLDP_MIB_REMOTE,
 				 ICE_AQ_LLDP_BRID_TYPE_NEAREST_BRID, dcbx_cfg);
 	/* Don't treat ENOENT as an error for Remote MIBs */
@@ -892,14 +892,14 @@ enum ice_status ice_get_dcb_cfg(struct ice_port_info *pi)
 	ret = ice_aq_get_cee_dcb_cfg(pi->hw, &cee_cfg, NULL);
 	if (!ret) {
 		/* CEE mode */
-		dcbx_cfg = &pi->local_dcbx_cfg;
+		dcbx_cfg = &pi->qos_cfg.local_dcbx_cfg;
 		dcbx_cfg->dcbx_mode = ICE_DCBX_MODE_CEE;
 		dcbx_cfg->tlv_status = le32_to_cpu(cee_cfg.tlv_status);
 		ice_cee_to_dcb_cfg(&cee_cfg, dcbx_cfg);
 		ret = ice_get_ieee_or_cee_dcb_cfg(pi, ICE_DCBX_MODE_CEE);
 	} else if (pi->hw->adminq.sq_last_status == ICE_AQ_RC_ENOENT) {
 		/* CEE mode not enabled try querying IEEE data */
-		dcbx_cfg = &pi->local_dcbx_cfg;
+		dcbx_cfg = &pi->qos_cfg.local_dcbx_cfg;
 		dcbx_cfg->dcbx_mode = ICE_DCBX_MODE_IEEE;
 		ret = ice_get_ieee_or_cee_dcb_cfg(pi, ICE_DCBX_MODE_IEEE);
 	}
@@ -916,26 +916,26 @@ enum ice_status ice_get_dcb_cfg(struct ice_port_info *pi)
  */
 enum ice_status ice_init_dcb(struct ice_hw *hw, bool enable_mib_change)
 {
-	struct ice_port_info *pi = hw->port_info;
+	struct ice_qos_cfg *qos_cfg = &hw->port_info->qos_cfg;
 	enum ice_status ret = 0;
 
 	if (!hw->func_caps.common_cap.dcb)
 		return ICE_ERR_NOT_SUPPORTED;
 
-	pi->is_sw_lldp = true;
+	qos_cfg->is_sw_lldp = true;
 
 	/* Get DCBX status */
-	pi->dcbx_status = ice_get_dcbx_status(hw);
+	qos_cfg->dcbx_status = ice_get_dcbx_status(hw);
 
-	if (pi->dcbx_status == ICE_DCBX_STATUS_DONE ||
-	    pi->dcbx_status == ICE_DCBX_STATUS_IN_PROGRESS ||
-	    pi->dcbx_status == ICE_DCBX_STATUS_NOT_STARTED) {
+	if (qos_cfg->dcbx_status == ICE_DCBX_STATUS_DONE ||
+	    qos_cfg->dcbx_status == ICE_DCBX_STATUS_IN_PROGRESS ||
+	    qos_cfg->dcbx_status == ICE_DCBX_STATUS_NOT_STARTED) {
 		/* Get current DCBX configuration */
-		ret = ice_get_dcb_cfg(pi);
+		ret = ice_get_dcb_cfg(hw->port_info);
 		if (ret)
 			return ret;
-		pi->is_sw_lldp = false;
-	} else if (pi->dcbx_status == ICE_DCBX_STATUS_DIS) {
+		qos_cfg->is_sw_lldp = false;
+	} else if (qos_cfg->dcbx_status == ICE_DCBX_STATUS_DIS) {
 		return ICE_ERR_NOT_READY;
 	}
 
@@ -943,7 +943,7 @@ enum ice_status ice_init_dcb(struct ice_hw *hw, bool enable_mib_change)
 	if (enable_mib_change) {
 		ret = ice_aq_cfg_lldp_mib_change(hw, true, NULL);
 		if (ret)
-			pi->is_sw_lldp = true;
+			qos_cfg->is_sw_lldp = true;
 	}
 
 	return ret;
@@ -958,21 +958,21 @@ enum ice_status ice_init_dcb(struct ice_hw *hw, bool enable_mib_change)
  */
 enum ice_status ice_cfg_lldp_mib_change(struct ice_hw *hw, bool ena_mib)
 {
-	struct ice_port_info *pi = hw->port_info;
+	struct ice_qos_cfg *qos_cfg = &hw->port_info->qos_cfg;
 	enum ice_status ret;
 
 	if (!hw->func_caps.common_cap.dcb)
 		return ICE_ERR_NOT_SUPPORTED;
 
 	/* Get DCBX status */
-	pi->dcbx_status = ice_get_dcbx_status(hw);
+	qos_cfg->dcbx_status = ice_get_dcbx_status(hw);
 
-	if (pi->dcbx_status == ICE_DCBX_STATUS_DIS)
+	if (qos_cfg->dcbx_status == ICE_DCBX_STATUS_DIS)
 		return ICE_ERR_NOT_READY;
 
 	ret = ice_aq_cfg_lldp_mib_change(hw, ena_mib, NULL);
 	if (!ret)
-		pi->is_sw_lldp = !ena_mib;
+		qos_cfg->is_sw_lldp = !ena_mib;
 
 	return ret;
 }
@@ -1270,7 +1270,7 @@ enum ice_status ice_set_dcb_cfg(struct ice_port_info *pi)
 	hw = pi->hw;
 
 	/* update the HW local config */
-	dcbcfg = &pi->local_dcbx_cfg;
+	dcbcfg = &pi->qos_cfg.local_dcbx_cfg;
 	/* Allocate the LLDPDU */
 	lldpmib = devm_kzalloc(ice_hw_to_dev(hw), ICE_LLDPDU_SIZE, GFP_KERNEL);
 	if (!lldpmib)
diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c
index 36abd6b7280c..1e8f71ffc8ce 100644
--- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c
@@ -28,7 +28,7 @@ void ice_vsi_cfg_netdev_tc(struct ice_vsi *vsi, u8 ena_tc)
 	if (netdev_set_num_tc(netdev, vsi->tc_cfg.numtc))
 		return;
 
-	dcbcfg = &pf->hw.port_info->local_dcbx_cfg;
+	dcbcfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg;
 
 	ice_for_each_traffic_class(i)
 		if (vsi->tc_cfg.ena_tc & BIT(i))
@@ -134,7 +134,7 @@ static u8 ice_dcb_get_mode(struct ice_port_info *port_info, bool host)
 	else
 		mode = DCB_CAP_DCBX_LLD_MANAGED;
 
-	if (port_info->local_dcbx_cfg.dcbx_mode & ICE_DCBX_MODE_CEE)
+	if (port_info->qos_cfg.local_dcbx_cfg.dcbx_mode & ICE_DCBX_MODE_CEE)
 		return mode | DCB_CAP_DCBX_VER_CEE;
 	else
 		return mode | DCB_CAP_DCBX_VER_IEEE;
@@ -277,10 +277,10 @@ int ice_pf_dcb_cfg(struct ice_pf *pf, struct ice_dcbx_cfg *new_cfg, bool locked)
 	int ret = ICE_DCB_NO_HW_CHG;
 	struct ice_vsi *pf_vsi;
 
-	curr_cfg = &pf->hw.port_info->local_dcbx_cfg;
+	curr_cfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg;
 
 	/* FW does not care if change happened */
-	if (!pf->hw.port_info->is_sw_lldp)
+	if (!pf->hw.port_info->qos_cfg.is_sw_lldp)
 		ret = ICE_DCB_HW_CHG_RST;
 
 	/* Enable DCB tagging only when more than one TC */
@@ -327,7 +327,7 @@ int ice_pf_dcb_cfg(struct ice_pf *pf, struct ice_dcbx_cfg *new_cfg, bool locked)
 	/* Only send new config to HW if we are in SW LLDP mode. Otherwise,
 	 * the new config came from the HW in the first place.
 	 */
-	if (pf->hw.port_info->is_sw_lldp) {
+	if (pf->hw.port_info->qos_cfg.is_sw_lldp) {
 		ret = ice_set_dcb_cfg(pf->hw.port_info);
 		if (ret) {
 			dev_err(dev, "Set DCB Config failed\n");
@@ -360,7 +360,7 @@ free_cfg:
  */
 static void ice_cfg_etsrec_defaults(struct ice_port_info *pi)
 {
-	struct ice_dcbx_cfg *dcbcfg = &pi->local_dcbx_cfg;
+	struct ice_dcbx_cfg *dcbcfg = &pi->qos_cfg.local_dcbx_cfg;
 	u8 i;
 
 	/* Ensure ETS recommended DCB configuration is not already set */
@@ -446,7 +446,7 @@ void ice_dcb_rebuild(struct ice_pf *pf)
 
 	mutex_lock(&pf->tc_mutex);
 
-	if (!pf->hw.port_info->is_sw_lldp)
+	if (!pf->hw.port_info->qos_cfg.is_sw_lldp)
 		ice_cfg_etsrec_defaults(pf->hw.port_info);
 
 	ret = ice_set_dcb_cfg(pf->hw.port_info);
@@ -455,9 +455,9 @@ void ice_dcb_rebuild(struct ice_pf *pf)
 		goto dcb_error;
 	}
 
-	if (!pf->hw.port_info->is_sw_lldp) {
+	if (!pf->hw.port_info->qos_cfg.is_sw_lldp) {
 		ret = ice_cfg_lldp_mib_change(&pf->hw, true);
-		if (ret && !pf->hw.port_info->is_sw_lldp) {
+		if (ret && !pf->hw.port_info->qos_cfg.is_sw_lldp) {
 			dev_err(dev, "Failed to register for MIB changes\n");
 			goto dcb_error;
 		}
@@ -510,11 +510,12 @@ static int ice_dcb_init_cfg(struct ice_pf *pf, bool locked)
 	int ret = 0;
 
 	pi = pf->hw.port_info;
-	newcfg = kmemdup(&pi->local_dcbx_cfg, sizeof(*newcfg), GFP_KERNEL);
+	newcfg = kmemdup(&pi->qos_cfg.local_dcbx_cfg, sizeof(*newcfg),
+			 GFP_KERNEL);
 	if (!newcfg)
 		return -ENOMEM;
 
-	memset(&pi->local_dcbx_cfg, 0, sizeof(*newcfg));
+	memset(&pi->qos_cfg.local_dcbx_cfg, 0, sizeof(*newcfg));
 
 	dev_info(ice_pf_to_dev(pf), "Configuring initial DCB values\n");
 	if (ice_pf_dcb_cfg(pf, newcfg, locked))
@@ -545,7 +546,7 @@ static int ice_dcb_sw_dflt_cfg(struct ice_pf *pf, bool ets_willing, bool locked)
 	if (!dcbcfg)
 		return -ENOMEM;
 
-	memset(&pi->local_dcbx_cfg, 0, sizeof(*dcbcfg));
+	memset(&pi->qos_cfg.local_dcbx_cfg, 0, sizeof(*dcbcfg));
 
 	dcbcfg->etscfg.willing = ets_willing ? 1 : 0;
 	dcbcfg->etscfg.maxtcs = hw->func_caps.common_cap.maxtc;
@@ -608,7 +609,7 @@ static bool ice_dcb_tc_contig(u8 *prio_table)
  */
 static int ice_dcb_noncontig_cfg(struct ice_pf *pf)
 {
-	struct ice_dcbx_cfg *dcbcfg = &pf->hw.port_info->local_dcbx_cfg;
+	struct ice_dcbx_cfg *dcbcfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg;
 	struct device *dev = ice_pf_to_dev(pf);
 	int ret;
 
@@ -638,7 +639,7 @@ static int ice_dcb_noncontig_cfg(struct ice_pf *pf)
  */
 void ice_pf_dcb_recfg(struct ice_pf *pf)
 {
-	struct ice_dcbx_cfg *dcbcfg = &pf->hw.port_info->local_dcbx_cfg;
+	struct ice_dcbx_cfg *dcbcfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg;
 	u8 tc_map = 0;
 	int v, ret;
 
@@ -691,7 +692,7 @@ int ice_init_pf_dcb(struct ice_pf *pf, bool locked)
 	port_info = hw->port_info;
 
 	err = ice_init_dcb(hw, false);
-	if (err && !port_info->is_sw_lldp) {
+	if (err && !port_info->qos_cfg.is_sw_lldp) {
 		dev_err(dev, "Error initializing DCB %d\n", err);
 		goto dcb_init_err;
 	}
@@ -858,7 +859,7 @@ ice_dcb_process_lldp_set_mib_change(struct ice_pf *pf,
 		/* Update the remote cached instance and return */
 		ret = ice_aq_get_dcb_cfg(pi->hw, ICE_AQ_LLDP_MIB_REMOTE,
 					 ICE_AQ_LLDP_BRID_TYPE_NEAREST_BRID,
-					 &pi->remote_dcbx_cfg);
+					 &pi->qos_cfg.remote_dcbx_cfg);
 		if (ret) {
 			dev_err(dev, "Failed to get remote DCB config\n");
 			return;
@@ -868,10 +869,11 @@ ice_dcb_process_lldp_set_mib_change(struct ice_pf *pf,
 	mutex_lock(&pf->tc_mutex);
 
 	/* store the old configuration */
-	tmp_dcbx_cfg = pf->hw.port_info->local_dcbx_cfg;
+	tmp_dcbx_cfg = pf->hw.port_info->qos_cfg.local_dcbx_cfg;
 
 	/* Reset the old DCBX configuration data */
-	memset(&pi->local_dcbx_cfg, 0, sizeof(pi->local_dcbx_cfg));
+	memset(&pi->qos_cfg.local_dcbx_cfg, 0,
+	       sizeof(pi->qos_cfg.local_dcbx_cfg));
 
 	/* Get updated DCBX data from firmware */
 	ret = ice_get_dcb_cfg(pf->hw.port_info);
@@ -881,7 +883,8 @@ ice_dcb_process_lldp_set_mib_change(struct ice_pf *pf,
 	}
 
 	/* No change detected in DCBX configs */
-	if (!memcmp(&tmp_dcbx_cfg, &pi->local_dcbx_cfg, sizeof(tmp_dcbx_cfg))) {
+	if (!memcmp(&tmp_dcbx_cfg, &pi->qos_cfg.local_dcbx_cfg,
+		    sizeof(tmp_dcbx_cfg))) {
 		dev_dbg(dev, "No change detected in DCBX configuration.\n");
 		goto out;
 	}
@@ -889,13 +892,13 @@ ice_dcb_process_lldp_set_mib_change(struct ice_pf *pf,
 	pf->dcbx_cap = ice_dcb_get_mode(pi, false);
 
 	need_reconfig = ice_dcb_need_recfg(pf, &tmp_dcbx_cfg,
-					   &pi->local_dcbx_cfg);
-	ice_dcbnl_flush_apps(pf, &tmp_dcbx_cfg, &pi->local_dcbx_cfg);
+					   &pi->qos_cfg.local_dcbx_cfg);
+	ice_dcbnl_flush_apps(pf, &tmp_dcbx_cfg, &pi->qos_cfg.local_dcbx_cfg);
 	if (!need_reconfig)
 		goto out;
 
 	/* Enable DCB tagging only when more than one TC */
-	if (ice_dcb_get_num_tc(&pi->local_dcbx_cfg) > 1) {
+	if (ice_dcb_get_num_tc(&pi->qos_cfg.local_dcbx_cfg) > 1) {
 		dev_dbg(dev, "DCB tagging enabled (num TC > 1)\n");
 		set_bit(ICE_FLAG_DCB_ENA, pf->flags);
 	} else {
diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_nl.c b/drivers/net/ethernet/intel/ice/ice_dcb_nl.c
index 87f91b750d59..fcfefad00d1c 100644
--- a/drivers/net/ethernet/intel/ice/ice_dcb_nl.c
+++ b/drivers/net/ethernet/intel/ice/ice_dcb_nl.c
@@ -34,12 +34,10 @@ static void ice_dcbnl_devreset(struct net_device *netdev)
 static int ice_dcbnl_getets(struct net_device *netdev, struct ieee_ets *ets)
 {
 	struct ice_dcbx_cfg *dcbxcfg;
-	struct ice_port_info *pi;
 	struct ice_pf *pf;
 
 	pf = ice_netdev_to_pf(netdev);
-	pi = pf->hw.port_info;
-	dcbxcfg = &pi->local_dcbx_cfg;
+	dcbxcfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg;
 
 	ets->willing = dcbxcfg->etscfg.willing;
 	ets->ets_cap = dcbxcfg->etscfg.maxtcs;
@@ -74,7 +72,7 @@ static int ice_dcbnl_setets(struct net_device *netdev, struct ieee_ets *ets)
 	    !(pf->dcbx_cap & DCB_CAP_DCBX_VER_IEEE))
 		return -EINVAL;
 
-	new_cfg = &pf->hw.port_info->desired_dcbx_cfg;
+	new_cfg = &pf->hw.port_info->qos_cfg.desired_dcbx_cfg;
 
 	mutex_lock(&pf->tc_mutex);
 
@@ -159,6 +157,7 @@ static u8 ice_dcbnl_getdcbx(struct net_device *netdev)
 static u8 ice_dcbnl_setdcbx(struct net_device *netdev, u8 mode)
 {
 	struct ice_pf *pf = ice_netdev_to_pf(netdev);
+	struct ice_qos_cfg *qos_cfg;
 
 	/* No support for LLD_MANAGED modes or CEE+IEEE */
 	if ((mode & DCB_CAP_DCBX_LLD_MANAGED) ||
@@ -171,10 +170,11 @@ static u8 ice_dcbnl_setdcbx(struct net_device *netdev, u8 mode)
 		return ICE_DCB_NO_HW_CHG;
 
 	pf->dcbx_cap = mode;
+	qos_cfg = &pf->hw.port_info->qos_cfg;
 	if (mode & DCB_CAP_DCBX_VER_CEE)
-		pf->hw.port_info->local_dcbx_cfg.dcbx_mode = ICE_DCBX_MODE_CEE;
+		qos_cfg->local_dcbx_cfg.dcbx_mode = ICE_DCBX_MODE_CEE;
 	else
-		pf->hw.port_info->local_dcbx_cfg.dcbx_mode = ICE_DCBX_MODE_IEEE;
+		qos_cfg->local_dcbx_cfg.dcbx_mode = ICE_DCBX_MODE_IEEE;
 
 	dev_info(ice_pf_to_dev(pf), "DCBx mode = 0x%x\n", mode);
 	return ICE_DCB_HW_CHG_RST;
@@ -225,7 +225,7 @@ static int ice_dcbnl_getpfc(struct net_device *netdev, struct ieee_pfc *pfc)
 	struct ice_dcbx_cfg *dcbxcfg;
 	int i;
 
-	dcbxcfg = &pi->local_dcbx_cfg;
+	dcbxcfg = &pi->qos_cfg.local_dcbx_cfg;
 	pfc->pfc_cap = dcbxcfg->pfc.pfccap;
 	pfc->pfc_en = dcbxcfg->pfc.pfcena;
 	pfc->mbc = dcbxcfg->pfc.mbc;
@@ -256,7 +256,7 @@ static int ice_dcbnl_setpfc(struct net_device *netdev, struct ieee_pfc *pfc)
 
 	mutex_lock(&pf->tc_mutex);
 
-	new_cfg = &pf->hw.port_info->desired_dcbx_cfg;
+	new_cfg = &pf->hw.port_info->qos_cfg.desired_dcbx_cfg;
 
 	if (pfc->pfc_cap)
 		new_cfg->pfc.pfccap = pfc->pfc_cap;
@@ -293,9 +293,9 @@ ice_dcbnl_get_pfc_cfg(struct net_device *netdev, int prio, u8 *setting)
 	if (prio >= ICE_MAX_USER_PRIORITY)
 		return;
 
-	*setting = (pi->local_dcbx_cfg.pfc.pfcena >> prio) & 0x1;
+	*setting = (pi->qos_cfg.local_dcbx_cfg.pfc.pfcena >> prio) & 0x1;
 	dev_dbg(ice_pf_to_dev(pf), "Get PFC Config up=%d, setting=%d, pfcenable=0x%x\n",
-		prio, *setting, pi->local_dcbx_cfg.pfc.pfcena);
+		prio, *setting, pi->qos_cfg.local_dcbx_cfg.pfc.pfcena);
 }
 
 /**
@@ -316,7 +316,7 @@ static void ice_dcbnl_set_pfc_cfg(struct net_device *netdev, int prio, u8 set)
 	if (prio >= ICE_MAX_USER_PRIORITY)
 		return;
 
-	new_cfg = &pf->hw.port_info->desired_dcbx_cfg;
+	new_cfg = &pf->hw.port_info->qos_cfg.desired_dcbx_cfg;
 
 	new_cfg->pfc.pfccap = pf->hw.func_caps.common_cap.maxtc;
 	if (set)
@@ -338,7 +338,7 @@ static u8 ice_dcbnl_getpfcstate(struct net_device *netdev)
 	struct ice_port_info *pi = pf->hw.port_info;
 
 	/* Return enabled if any UP enabled for PFC */
-	if (pi->local_dcbx_cfg.pfc.pfcena)
+	if (pi->qos_cfg.local_dcbx_cfg.pfc.pfcena)
 		return 1;
 
 	return 0;
@@ -378,8 +378,8 @@ static u8 ice_dcbnl_setstate(struct net_device *netdev, u8 state)
 
 	if (state) {
 		set_bit(ICE_FLAG_DCB_ENA, pf->flags);
-		memcpy(&pf->hw.port_info->desired_dcbx_cfg,
-		       &pf->hw.port_info->local_dcbx_cfg,
+		memcpy(&pf->hw.port_info->qos_cfg.desired_dcbx_cfg,
+		       &pf->hw.port_info->qos_cfg.local_dcbx_cfg,
 		       sizeof(struct ice_dcbx_cfg));
 	} else {
 		clear_bit(ICE_FLAG_DCB_ENA, pf->flags);
@@ -413,7 +413,7 @@ ice_dcbnl_get_pg_tc_cfg_tx(struct net_device *netdev, int prio,
 	if (prio >= ICE_MAX_USER_PRIORITY)
 		return;
 
-	*pgid = pi->local_dcbx_cfg.etscfg.prio_table[prio];
+	*pgid = pi->qos_cfg.local_dcbx_cfg.etscfg.prio_table[prio];
 	dev_dbg(ice_pf_to_dev(pf), "Get PG config prio=%d tc=%d\n", prio,
 		*pgid);
 }
@@ -444,7 +444,7 @@ ice_dcbnl_set_pg_tc_cfg_tx(struct net_device *netdev, int tc,
 	if (tc >= ICE_MAX_TRAFFIC_CLASS)
 		return;
 
-	new_cfg = &pf->hw.port_info->desired_dcbx_cfg;
+	new_cfg = &pf->hw.port_info->qos_cfg.desired_dcbx_cfg;
 
 	/* prio_type, bwg_id and bw_pct per UP are not supported */
 
@@ -474,7 +474,7 @@ ice_dcbnl_get_pg_bwg_cfg_tx(struct net_device *netdev, int pgid, u8 *bw_pct)
 	if (pgid >= ICE_MAX_TRAFFIC_CLASS)
 		return;
 
-	*bw_pct = pi->local_dcbx_cfg.etscfg.tcbwtable[pgid];
+	*bw_pct = pi->qos_cfg.local_dcbx_cfg.etscfg.tcbwtable[pgid];
 	dev_dbg(ice_pf_to_dev(pf), "Get PG BW config tc=%d bw_pct=%d\n",
 		pgid, *bw_pct);
 }
@@ -498,7 +498,7 @@ ice_dcbnl_set_pg_bwg_cfg_tx(struct net_device *netdev, int pgid, u8 bw_pct)
 	if (pgid >= ICE_MAX_TRAFFIC_CLASS)
 		return;
 
-	new_cfg = &pf->hw.port_info->desired_dcbx_cfg;
+	new_cfg = &pf->hw.port_info->qos_cfg.desired_dcbx_cfg;
 
 	new_cfg->etscfg.tcbwtable[pgid] = bw_pct;
 }
@@ -528,7 +528,7 @@ ice_dcbnl_get_pg_tc_cfg_rx(struct net_device *netdev, int prio,
 	if (prio >= ICE_MAX_USER_PRIORITY)
 		return;
 
-	*pgid = pi->local_dcbx_cfg.etscfg.prio_table[prio];
+	*pgid = pi->qos_cfg.local_dcbx_cfg.etscfg.prio_table[prio];
 }
 
 /**
@@ -699,9 +699,9 @@ static int ice_dcbnl_setapp(struct net_device *netdev, struct dcb_app *app)
 
 	mutex_lock(&pf->tc_mutex);
 
-	new_cfg = &pf->hw.port_info->desired_dcbx_cfg;
+	new_cfg = &pf->hw.port_info->qos_cfg.desired_dcbx_cfg;
 
-	old_cfg = &pf->hw.port_info->local_dcbx_cfg;
+	old_cfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg;
 
 	if (old_cfg->numapps == ICE_DCBX_MAX_APPS) {
 		ret = -EINVAL;
@@ -751,7 +751,7 @@ static int ice_dcbnl_delapp(struct net_device *netdev, struct dcb_app *app)
 		return -EINVAL;
 
 	mutex_lock(&pf->tc_mutex);
-	old_cfg = &pf->hw.port_info->local_dcbx_cfg;
+	old_cfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg;
 
 	if (old_cfg->numapps <= 1)
 		goto delapp_out;
@@ -760,7 +760,7 @@ static int ice_dcbnl_delapp(struct net_device *netdev, struct dcb_app *app)
 	if (ret)
 		goto delapp_out;
 
-	new_cfg = &pf->hw.port_info->desired_dcbx_cfg;
+	new_cfg = &pf->hw.port_info->qos_cfg.desired_dcbx_cfg;
 
 	for (i = 1; i < new_cfg->numapps; i++) {
 		if (app->selector == new_cfg->app[i].selector &&
@@ -813,7 +813,7 @@ static u8 ice_dcbnl_cee_set_all(struct net_device *netdev)
 	    !(pf->dcbx_cap & DCB_CAP_DCBX_VER_CEE))
 		return ICE_DCB_NO_HW_CHG;
 
-	new_cfg = &pf->hw.port_info->desired_dcbx_cfg;
+	new_cfg = &pf->hw.port_info->qos_cfg.desired_dcbx_cfg;
 
 	mutex_lock(&pf->tc_mutex);
 
@@ -884,7 +884,7 @@ void ice_dcbnl_set_all(struct ice_vsi *vsi)
 	if (!test_bit(ICE_FLAG_DCB_ENA, pf->flags))
 		return;
 
-	dcbxcfg = &pi->local_dcbx_cfg;
+	dcbxcfg = &pi->qos_cfg.local_dcbx_cfg;
 
 	for (i = 0; i < dcbxcfg->numapps; i++) {
 		u8 prio, tc_map;
diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c
index e01b7e34da5e..5636c9b23896 100644
--- a/drivers/net/ethernet/intel/ice/ice_ethtool.c
+++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c
@@ -1242,6 +1242,11 @@ static int ice_set_priv_flags(struct net_device *netdev, u32 flags)
 			enum ice_status status;
 			bool dcbx_agent_status;
 
+			/* Remove rule to direct LLDP packets to default VSI.
+			 * The FW LLDP engine will now be consuming them.
+			 */
+			ice_cfg_sw_lldp(vsi, false, false);
+
 			/* AQ command to start FW LLDP agent will return an
 			 * error if the agent is already started
 			 */
@@ -1270,11 +1275,6 @@ static int ice_set_priv_flags(struct net_device *netdev, u32 flags)
 			if (status)
 				dev_dbg(dev, "Fail to init DCB\n");
 
-			/* Remove rule to direct LLDP packets to default VSI.
-			 * The FW LLDP engine will now be consuming them.
-			 */
-			ice_cfg_sw_lldp(vsi, false, false);
-
 			/* Register for MIB change events */
 			status = ice_cfg_lldp_mib_change(&pf->hw, true);
 			if (status)
@@ -2979,7 +2979,7 @@ ice_get_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause)
 	pause->rx_pause = 0;
 	pause->tx_pause = 0;
 
-	dcbx_cfg = &pi->local_dcbx_cfg;
+	dcbx_cfg = &pi->qos_cfg.local_dcbx_cfg;
 
 	pcaps = kzalloc(sizeof(*pcaps), GFP_KERNEL);
 	if (!pcaps)
@@ -3031,7 +3031,7 @@ ice_set_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause)
 
 	pi = vsi->port_info;
 	hw_link_info = &pi->phy.link_info;
-	dcbx_cfg = &pi->local_dcbx_cfg;
+	dcbx_cfg = &pi->qos_cfg.local_dcbx_cfg;
 	link_up = hw_link_info->link_info & ICE_AQ_LINK_UP;
 
 	/* Changing the port's flow control is not supported if this isn't the
diff --git a/drivers/net/ethernet/intel/ice/ice_flex_pipe.c b/drivers/net/ethernet/intel/ice/ice_flex_pipe.c
index cf5b717b9293..5e1fd30c0a0f 100644
--- a/drivers/net/ethernet/intel/ice/ice_flex_pipe.c
+++ b/drivers/net/ethernet/intel/ice/ice_flex_pipe.c
@@ -2727,7 +2727,7 @@ static void ice_fill_tbl(struct ice_hw *hw, enum ice_block block_id, u32 sid)
 		case ICE_SID_XLT1_RSS:
 		case ICE_SID_XLT1_ACL:
 		case ICE_SID_XLT1_PE:
-			xlt1 = (struct ice_xlt1_section *)sect;
+			xlt1 = sect;
 			src = xlt1->value;
 			sect_len = le16_to_cpu(xlt1->count) *
 				sizeof(*hw->blk[block_id].xlt1.t);
@@ -2740,7 +2740,7 @@ static void ice_fill_tbl(struct ice_hw *hw, enum ice_block block_id, u32 sid)
 		case ICE_SID_XLT2_RSS:
 		case ICE_SID_XLT2_ACL:
 		case ICE_SID_XLT2_PE:
-			xlt2 = (struct ice_xlt2_section *)sect;
+			xlt2 = sect;
 			src = (__force u8 *)xlt2->value;
 			sect_len = le16_to_cpu(xlt2->count) *
 				sizeof(*hw->blk[block_id].xlt2.t);
@@ -2753,7 +2753,7 @@ static void ice_fill_tbl(struct ice_hw *hw, enum ice_block block_id, u32 sid)
 		case ICE_SID_PROFID_TCAM_RSS:
 		case ICE_SID_PROFID_TCAM_ACL:
 		case ICE_SID_PROFID_TCAM_PE:
-			pid = (struct ice_prof_id_section *)sect;
+			pid = sect;
 			src = (u8 *)pid->entry;
 			sect_len = le16_to_cpu(pid->count) *
 				sizeof(*hw->blk[block_id].prof.t);
@@ -2766,7 +2766,7 @@ static void ice_fill_tbl(struct ice_hw *hw, enum ice_block block_id, u32 sid)
 		case ICE_SID_PROFID_REDIR_RSS:
 		case ICE_SID_PROFID_REDIR_ACL:
 		case ICE_SID_PROFID_REDIR_PE:
-			pr = (struct ice_prof_redir_section *)sect;
+			pr = sect;
 			src = pr->redir_value;
 			sect_len = le16_to_cpu(pr->count) *
 				sizeof(*hw->blk[block_id].prof_redir.t);
@@ -2779,7 +2779,7 @@ static void ice_fill_tbl(struct ice_hw *hw, enum ice_block block_id, u32 sid)
 		case ICE_SID_FLD_VEC_RSS:
 		case ICE_SID_FLD_VEC_ACL:
 		case ICE_SID_FLD_VEC_PE:
-			es = (struct ice_sw_fv_section *)sect;
+			es = sect;
 			src = (u8 *)es->fv;
 			sect_len = (u32)(le16_to_cpu(es->count) *
 					 hw->blk[block_id].es.fvw) *
diff --git a/drivers/net/ethernet/intel/ice/ice_hw_autogen.h b/drivers/net/ethernet/intel/ice/ice_hw_autogen.h
index 90abc8612a6a..093a1818a392 100644
--- a/drivers/net/ethernet/intel/ice/ice_hw_autogen.h
+++ b/drivers/net/ethernet/intel/ice/ice_hw_autogen.h
@@ -86,6 +86,9 @@
 #define QRXFLXP_CNTXT_RXDID_PRIO_S		8
 #define QRXFLXP_CNTXT_RXDID_PRIO_M		ICE_M(0x7, 8)
 #define QRXFLXP_CNTXT_TS_M			BIT(11)
+#define GLGEN_CLKSTAT_SRC_PSM_CLK_SRC_S		4
+#define GLGEN_CLKSTAT_SRC_PSM_CLK_SRC_M		ICE_M(0x3, 4)
+#define GLGEN_CLKSTAT_SRC			0x000B826C
 #define GLGEN_RSTAT				0x000B8188
 #define GLGEN_RSTAT_DEVSTATE_M			ICE_M(0x3, 0)
 #define GLGEN_RSTCTL				0x000B8180
diff --git a/drivers/net/ethernet/intel/ice/ice_lag.c b/drivers/net/ethernet/intel/ice/ice_lag.c
new file mode 100644
index 000000000000..4599fc3b4ed8
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/ice_lag.c
@@ -0,0 +1,445 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2018-2021, Intel Corporation. */
+
+/* Link Aggregation code */
+
+#include "ice.h"
+#include "ice_lag.h"
+
+/**
+ * ice_lag_nop_handler - no-op Rx handler to disable LAG
+ * @pskb: pointer to skb pointer
+ */
+rx_handler_result_t ice_lag_nop_handler(struct sk_buff __always_unused **pskb)
+{
+	return RX_HANDLER_PASS;
+}
+
+/**
+ * ice_lag_set_primary - set PF LAG state as Primary
+ * @lag: LAG info struct
+ */
+static void ice_lag_set_primary(struct ice_lag *lag)
+{
+	struct ice_pf *pf = lag->pf;
+
+	if (!pf)
+		return;
+
+	if (lag->role != ICE_LAG_UNSET && lag->role != ICE_LAG_BACKUP) {
+		dev_warn(ice_pf_to_dev(pf), "%s: Attempt to be Primary, but incompatible state.\n",
+			 netdev_name(lag->netdev));
+		return;
+	}
+
+	lag->role = ICE_LAG_PRIMARY;
+}
+
+/**
+ * ice_lag_set_backup - set PF LAG state to Backup
+ * @lag: LAG info struct
+ */
+static void ice_lag_set_backup(struct ice_lag *lag)
+{
+	struct ice_pf *pf = lag->pf;
+
+	if (!pf)
+		return;
+
+	if (lag->role != ICE_LAG_UNSET && lag->role != ICE_LAG_PRIMARY) {
+		dev_dbg(ice_pf_to_dev(pf), "%s: Attempt to be Backup, but incompatible state\n",
+			netdev_name(lag->netdev));
+		return;
+	}
+
+	lag->role = ICE_LAG_BACKUP;
+}
+
+/**
+ * ice_display_lag_info - print LAG info
+ * @lag: LAG info struct
+ */
+static void ice_display_lag_info(struct ice_lag *lag)
+{
+	const char *name, *peer, *upper, *role, *bonded, *master;
+	struct device *dev = &lag->pf->pdev->dev;
+
+	name = lag->netdev ? netdev_name(lag->netdev) : "unset";
+	peer = lag->peer_netdev ? netdev_name(lag->peer_netdev) : "unset";
+	upper = lag->upper_netdev ? netdev_name(lag->upper_netdev) : "unset";
+	master = lag->master ? "TRUE" : "FALSE";
+	bonded = lag->bonded ? "BONDED" : "UNBONDED";
+
+	switch (lag->role) {
+	case ICE_LAG_NONE:
+		role = "NONE";
+		break;
+	case ICE_LAG_PRIMARY:
+		role = "PRIMARY";
+		break;
+	case ICE_LAG_BACKUP:
+		role = "BACKUP";
+		break;
+	case ICE_LAG_UNSET:
+		role = "UNSET";
+		break;
+	default:
+		role = "ERROR";
+	}
+
+	dev_dbg(dev, "%s %s, peer:%s, upper:%s, role:%s, master:%s\n", name,
+		bonded, peer, upper, role, master);
+}
+
+/**
+ * ice_lag_info_event - handle NETDEV_BONDING_INFO event
+ * @lag: LAG info struct
+ * @ptr: opaque data pointer
+ *
+ * ptr is to be cast to (netdev_notifier_bonding_info *)
+ */
+static void ice_lag_info_event(struct ice_lag *lag, void *ptr)
+{
+	struct net_device *event_netdev, *netdev_tmp;
+	struct netdev_notifier_bonding_info *info;
+	struct netdev_bonding_info *bonding_info;
+	const char *lag_netdev_name;
+
+	event_netdev = netdev_notifier_info_to_dev(ptr);
+	info = ptr;
+	lag_netdev_name = netdev_name(lag->netdev);
+	bonding_info = &info->bonding_info;
+
+	if (event_netdev != lag->netdev || !lag->bonded || !lag->upper_netdev)
+		return;
+
+	if (bonding_info->master.bond_mode != BOND_MODE_ACTIVEBACKUP) {
+		netdev_dbg(lag->netdev, "Bonding event recv, but mode not active/backup\n");
+		goto lag_out;
+	}
+
+	if (strcmp(bonding_info->slave.slave_name, lag_netdev_name)) {
+		netdev_dbg(lag->netdev, "Bonding event recv, but slave info not for us\n");
+		goto lag_out;
+	}
+
+	rcu_read_lock();
+	for_each_netdev_in_bond_rcu(lag->upper_netdev, netdev_tmp) {
+		if (!netif_is_ice(netdev_tmp))
+			continue;
+
+		if (netdev_tmp && netdev_tmp != lag->netdev &&
+		    lag->peer_netdev != netdev_tmp) {
+			dev_hold(netdev_tmp);
+			lag->peer_netdev = netdev_tmp;
+		}
+	}
+	rcu_read_unlock();
+
+	if (bonding_info->slave.state)
+		ice_lag_set_backup(lag);
+	else
+		ice_lag_set_primary(lag);
+
+lag_out:
+	ice_display_lag_info(lag);
+}
+
+/**
+ * ice_lag_link - handle LAG link event
+ * @lag: LAG info struct
+ * @info: info from the netdev notifier
+ */
+static void
+ice_lag_link(struct ice_lag *lag, struct netdev_notifier_changeupper_info *info)
+{
+	struct net_device *netdev_tmp, *upper = info->upper_dev;
+	struct ice_pf *pf = lag->pf;
+	int peers = 0;
+
+	if (lag->bonded)
+		dev_warn(ice_pf_to_dev(pf), "%s Already part of a bond\n",
+			 netdev_name(lag->netdev));
+
+	rcu_read_lock();
+	for_each_netdev_in_bond_rcu(upper, netdev_tmp)
+		peers++;
+	rcu_read_unlock();
+
+	if (lag->upper_netdev != upper) {
+		dev_hold(upper);
+		lag->upper_netdev = upper;
+	}
+
+	ice_clear_sriov_cap(pf);
+
+	lag->bonded = true;
+	lag->role = ICE_LAG_UNSET;
+
+	/* if this is the first element in an LAG mark as master */
+	lag->master = !!(peers == 1);
+}
+
+/**
+ * ice_lag_unlink - handle unlink event
+ * @lag: LAG info struct
+ * @info: info from netdev notification
+ */
+static void
+ice_lag_unlink(struct ice_lag *lag,
+	       struct netdev_notifier_changeupper_info *info)
+{
+	struct net_device *netdev_tmp, *upper = info->upper_dev;
+	struct ice_pf *pf = lag->pf;
+	bool found = false;
+
+	if (!lag->bonded) {
+		netdev_dbg(lag->netdev, "bonding unlink event on non-LAG netdev\n");
+		return;
+	}
+
+	/* determine if we are in the new LAG config or not */
+	rcu_read_lock();
+	for_each_netdev_in_bond_rcu(upper, netdev_tmp) {
+		if (netdev_tmp == lag->netdev) {
+			found = true;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	if (found)
+		return;
+
+	if (lag->upper_netdev) {
+		dev_put(lag->upper_netdev);
+		lag->upper_netdev = NULL;
+	}
+
+	if (lag->peer_netdev) {
+		dev_put(lag->peer_netdev);
+		lag->peer_netdev = NULL;
+	}
+
+	ice_set_sriov_cap(pf);
+	lag->bonded = false;
+	lag->role = ICE_LAG_NONE;
+}
+
+/**
+ * ice_lag_changeupper_event - handle LAG changeupper event
+ * @lag: LAG info struct
+ * @ptr: opaque pointer data
+ *
+ * ptr is to be cast into netdev_notifier_changeupper_info
+ */
+static void ice_lag_changeupper_event(struct ice_lag *lag, void *ptr)
+{
+	struct netdev_notifier_changeupper_info *info;
+	struct net_device *netdev;
+
+	info = ptr;
+	netdev = netdev_notifier_info_to_dev(ptr);
+
+	/* not for this netdev */
+	if (netdev != lag->netdev)
+		return;
+
+	if (!info->upper_dev) {
+		netdev_dbg(netdev, "changeupper rcvd, but no upper defined\n");
+		return;
+	}
+
+	netdev_dbg(netdev, "bonding %s\n", info->linking ? "LINK" : "UNLINK");
+
+	if (!netif_is_lag_master(info->upper_dev)) {
+		netdev_dbg(netdev, "changeupper rcvd, but not master. bail\n");
+		return;
+	}
+
+	if (info->linking)
+		ice_lag_link(lag, info);
+	else
+		ice_lag_unlink(lag, info);
+
+	ice_display_lag_info(lag);
+}
+
+/**
+ * ice_lag_changelower_event - handle LAG changelower event
+ * @lag: LAG info struct
+ * @ptr: opaque data pointer
+ *
+ * ptr to be cast to netdev_notifier_changelowerstate_info
+ */
+static void ice_lag_changelower_event(struct ice_lag *lag, void *ptr)
+{
+	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+
+	if (netdev != lag->netdev)
+		return;
+
+	netdev_dbg(netdev, "bonding info\n");
+
+	if (!netif_is_lag_port(netdev))
+		netdev_dbg(netdev, "CHANGELOWER rcvd, but netdev not in LAG. Bail\n");
+}
+
+/**
+ * ice_lag_event_handler - handle LAG events from netdev
+ * @notif_blk: notifier block registered by this netdev
+ * @event: event type
+ * @ptr: opaque data containing notifier event
+ */
+static int
+ice_lag_event_handler(struct notifier_block *notif_blk, unsigned long event,
+		      void *ptr)
+{
+	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+	struct ice_lag *lag;
+
+	lag = container_of(notif_blk, struct ice_lag, notif_block);
+
+	if (!lag->netdev)
+		return NOTIFY_DONE;
+
+	/* Check that the netdev is in the working namespace */
+	if (!net_eq(dev_net(netdev), &init_net))
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_CHANGEUPPER:
+		ice_lag_changeupper_event(lag, ptr);
+		break;
+	case NETDEV_CHANGELOWERSTATE:
+		ice_lag_changelower_event(lag, ptr);
+		break;
+	case NETDEV_BONDING_INFO:
+		ice_lag_info_event(lag, ptr);
+		break;
+	default:
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+/**
+ * ice_register_lag_handler - register LAG handler on netdev
+ * @lag: LAG struct
+ */
+static int ice_register_lag_handler(struct ice_lag *lag)
+{
+	struct device *dev = ice_pf_to_dev(lag->pf);
+	struct notifier_block *notif_blk;
+
+	notif_blk = &lag->notif_block;
+
+	if (!notif_blk->notifier_call) {
+		notif_blk->notifier_call = ice_lag_event_handler;
+		if (register_netdevice_notifier(notif_blk)) {
+			notif_blk->notifier_call = NULL;
+			dev_err(dev, "FAIL register LAG event handler!\n");
+			return -EINVAL;
+		}
+		dev_dbg(dev, "LAG event handler registered\n");
+	}
+	return 0;
+}
+
+/**
+ * ice_unregister_lag_handler - unregister LAG handler on netdev
+ * @lag: LAG struct
+ */
+static void ice_unregister_lag_handler(struct ice_lag *lag)
+{
+	struct device *dev = ice_pf_to_dev(lag->pf);
+	struct notifier_block *notif_blk;
+
+	notif_blk = &lag->notif_block;
+	if (notif_blk->notifier_call) {
+		unregister_netdevice_notifier(notif_blk);
+		dev_dbg(dev, "LAG event handler unregistered\n");
+	}
+}
+
+/**
+ * ice_init_lag - initialize support for LAG
+ * @pf: PF struct
+ *
+ * Alloc memory for LAG structs and initialize the elements.
+ * Memory will be freed in ice_deinit_lag
+ */
+int ice_init_lag(struct ice_pf *pf)
+{
+	struct device *dev = ice_pf_to_dev(pf);
+	struct ice_lag *lag;
+	struct ice_vsi *vsi;
+	int err;
+
+	pf->lag = kzalloc(sizeof(*lag), GFP_KERNEL);
+	if (!pf->lag)
+		return -ENOMEM;
+	lag = pf->lag;
+
+	vsi = ice_get_main_vsi(pf);
+	if (!vsi) {
+		dev_err(dev, "couldn't get main vsi, link aggregation init fail\n");
+		err = -EIO;
+		goto lag_error;
+	}
+
+	lag->pf = pf;
+	lag->netdev = vsi->netdev;
+	lag->role = ICE_LAG_NONE;
+	lag->bonded = false;
+	lag->peer_netdev = NULL;
+	lag->upper_netdev = NULL;
+	lag->notif_block.notifier_call = NULL;
+
+	err = ice_register_lag_handler(lag);
+	if (err) {
+		dev_warn(dev, "INIT LAG: Failed to register event handler\n");
+		goto lag_error;
+	}
+
+	ice_display_lag_info(lag);
+
+	dev_dbg(dev, "INIT LAG complete\n");
+	return 0;
+
+lag_error:
+	kfree(lag);
+	pf->lag = NULL;
+	return err;
+}
+
+/**
+ * ice_deinit_lag - Clean up LAG
+ * @pf: PF struct
+ *
+ * Clean up kernel LAG info and free memory
+ * This function is meant to only be called on driver remove/shutdown
+ */
+void ice_deinit_lag(struct ice_pf *pf)
+{
+	struct ice_lag *lag;
+
+	lag = pf->lag;
+
+	if (!lag)
+		return;
+
+	if (lag->pf)
+		ice_unregister_lag_handler(lag);
+
+	if (lag->upper_netdev)
+		dev_put(lag->upper_netdev);
+
+	if (lag->peer_netdev)
+		dev_put(lag->peer_netdev);
+
+	kfree(lag);
+
+	pf->lag = NULL;
+}
diff --git a/drivers/net/ethernet/intel/ice/ice_lag.h b/drivers/net/ethernet/intel/ice/ice_lag.h
new file mode 100644
index 000000000000..c2e3688dd8fd
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/ice_lag.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2018-2021, Intel Corporation. */
+
+#ifndef _ICE_LAG_H_
+#define _ICE_LAG_H_
+
+#include <linux/netdevice.h>
+
+/* LAG roles for netdev */
+enum ice_lag_role {
+	ICE_LAG_NONE,
+	ICE_LAG_PRIMARY,
+	ICE_LAG_BACKUP,
+	ICE_LAG_UNSET
+};
+
+struct ice_pf;
+
+/* LAG info struct */
+struct ice_lag {
+	struct ice_pf *pf; /* backlink to PF struct */
+	struct net_device *netdev; /* this PF's netdev */
+	struct net_device *peer_netdev;
+	struct net_device *upper_netdev; /* upper bonding netdev */
+	struct notifier_block notif_block;
+	u8 bonded:1; /* currently bonded */
+	u8 master:1; /* this is a master */
+	u8 handler:1; /* did we register a rx_netdev_handler */
+	/* each thing blocking bonding will increment this value by one.
+	 * If this value is zero, then bonding is allowed.
+	 */
+	u16 dis_lag;
+	u8 role;
+};
+
+int ice_init_lag(struct ice_pf *pf);
+void ice_deinit_lag(struct ice_pf *pf);
+rx_handler_result_t ice_lag_nop_handler(struct sk_buff **pskb);
+
+/**
+ * ice_disable_lag - increment LAG disable count
+ * @lag: LAG struct
+ */
+static inline void ice_disable_lag(struct ice_lag *lag)
+{
+	/* If LAG this PF is not already disabled, disable it */
+	rtnl_lock();
+	if (!netdev_is_rx_handler_busy(lag->netdev)) {
+		if (!netdev_rx_handler_register(lag->netdev,
+						ice_lag_nop_handler,
+						NULL))
+			lag->handler = true;
+	}
+	rtnl_unlock();
+	lag->dis_lag++;
+}
+
+/**
+ * ice_enable_lag - decrement disable count for a PF
+ * @lag: LAG struct
+ *
+ * Decrement the disable counter for a port, and if that count reaches
+ * zero, then remove the no-op Rx handler from that netdev
+ */
+static inline void ice_enable_lag(struct ice_lag *lag)
+{
+	if (lag->dis_lag)
+		lag->dis_lag--;
+	if (!lag->dis_lag && lag->handler) {
+		rtnl_lock();
+		netdev_rx_handler_unregister(lag->netdev);
+		rtnl_unlock();
+		lag->handler = false;
+	}
+}
+
+/**
+ * ice_is_lag_dis - is LAG disabled
+ * @lag: LAG struct
+ *
+ * Return true if bonding is disabled
+ */
+static inline bool ice_is_lag_dis(struct ice_lag *lag)
+{
+	return !!(lag->dis_lag);
+}
+#endif /* _ICE_LAG_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c
index ad9c22a1b97a..8d4e2ad4328d 100644
--- a/drivers/net/ethernet/intel/ice/ice_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_lib.c
@@ -2078,7 +2078,7 @@ err_out:
 
 static void ice_vsi_set_tc_cfg(struct ice_vsi *vsi)
 {
-	struct ice_dcbx_cfg *cfg = &vsi->port_info->local_dcbx_cfg;
+	struct ice_dcbx_cfg *cfg = &vsi->port_info->qos_cfg.local_dcbx_cfg;
 
 	vsi->tc_cfg.ena_tc = ice_dcb_get_ena_tc(cfg);
 	vsi->tc_cfg.numtc = ice_dcb_get_num_tc(cfg);
@@ -2145,11 +2145,18 @@ void ice_cfg_sw_lldp(struct ice_vsi *vsi, bool tx, bool create)
 	dev = ice_pf_to_dev(pf);
 	eth_fltr = create ? ice_fltr_add_eth : ice_fltr_remove_eth;
 
-	if (tx)
+	if (tx) {
 		status = eth_fltr(vsi, ETH_P_LLDP, ICE_FLTR_TX,
 				  ICE_DROP_PACKET);
-	else
-		status = eth_fltr(vsi, ETH_P_LLDP, ICE_FLTR_RX, ICE_FWD_TO_VSI);
+	} else {
+		if (ice_fw_supports_lldp_fltr_ctrl(&pf->hw)) {
+			status = ice_lldp_fltr_add_remove(&pf->hw, vsi->vsi_num,
+							  create);
+		} else {
+			status = eth_fltr(vsi, ETH_P_LLDP, ICE_FLTR_RX,
+					  ICE_FWD_TO_VSI);
+		}
+	}
 
 	if (status)
 		dev_err(dev, "Fail %s %s LLDP rule on VSI %i error: %s\n",
@@ -2158,6 +2165,126 @@ void ice_cfg_sw_lldp(struct ice_vsi *vsi, bool tx, bool create)
 }
 
 /**
+ * ice_set_agg_vsi - sets up scheduler aggregator node and move VSI into it
+ * @vsi: pointer to the VSI
+ *
+ * This function will allocate new scheduler aggregator now if needed and will
+ * move specified VSI into it.
+ */
+static void ice_set_agg_vsi(struct ice_vsi *vsi)
+{
+	struct device *dev = ice_pf_to_dev(vsi->back);
+	struct ice_agg_node *agg_node_iter = NULL;
+	u32 agg_id = ICE_INVALID_AGG_NODE_ID;
+	struct ice_agg_node *agg_node = NULL;
+	int node_offset, max_agg_nodes = 0;
+	struct ice_port_info *port_info;
+	struct ice_pf *pf = vsi->back;
+	u32 agg_node_id_start = 0;
+	enum ice_status status;
+
+	/* create (as needed) scheduler aggregator node and move VSI into
+	 * corresponding aggregator node
+	 * - PF aggregator node to contains VSIs of type _PF and _CTRL
+	 * - VF aggregator nodes will contain VF VSI
+	 */
+	port_info = pf->hw.port_info;
+	if (!port_info)
+		return;
+
+	switch (vsi->type) {
+	case ICE_VSI_CTRL:
+	case ICE_VSI_LB:
+	case ICE_VSI_PF:
+		max_agg_nodes = ICE_MAX_PF_AGG_NODES;
+		agg_node_id_start = ICE_PF_AGG_NODE_ID_START;
+		agg_node_iter = &pf->pf_agg_node[0];
+		break;
+	case ICE_VSI_VF:
+		/* user can create 'n' VFs on a given PF, but since max children
+		 * per aggregator node can be only 64. Following code handles
+		 * aggregator(s) for VF VSIs, either selects a agg_node which
+		 * was already created provided num_vsis < 64, otherwise
+		 * select next available node, which will be created
+		 */
+		max_agg_nodes = ICE_MAX_VF_AGG_NODES;
+		agg_node_id_start = ICE_VF_AGG_NODE_ID_START;
+		agg_node_iter = &pf->vf_agg_node[0];
+		break;
+	default:
+		/* other VSI type, handle later if needed */
+		dev_dbg(dev, "unexpected VSI type %s\n",
+			ice_vsi_type_str(vsi->type));
+		return;
+	}
+
+	/* find the appropriate aggregator node */
+	for (node_offset = 0; node_offset < max_agg_nodes; node_offset++) {
+		/* see if we can find space in previously created
+		 * node if num_vsis < 64, otherwise skip
+		 */
+		if (agg_node_iter->num_vsis &&
+		    agg_node_iter->num_vsis == ICE_MAX_VSIS_IN_AGG_NODE) {
+			agg_node_iter++;
+			continue;
+		}
+
+		if (agg_node_iter->valid &&
+		    agg_node_iter->agg_id != ICE_INVALID_AGG_NODE_ID) {
+			agg_id = agg_node_iter->agg_id;
+			agg_node = agg_node_iter;
+			break;
+		}
+
+		/* find unclaimed agg_id */
+		if (agg_node_iter->agg_id == ICE_INVALID_AGG_NODE_ID) {
+			agg_id = node_offset + agg_node_id_start;
+			agg_node = agg_node_iter;
+			break;
+		}
+		/* move to next agg_node */
+		agg_node_iter++;
+	}
+
+	if (!agg_node)
+		return;
+
+	/* if selected aggregator node was not created, create it */
+	if (!agg_node->valid) {
+		status = ice_cfg_agg(port_info, agg_id, ICE_AGG_TYPE_AGG,
+				     (u8)vsi->tc_cfg.ena_tc);
+		if (status) {
+			dev_err(dev, "unable to create aggregator node with agg_id %u\n",
+				agg_id);
+			return;
+		}
+		/* aggregator node is created, store the neeeded info */
+		agg_node->valid = true;
+		agg_node->agg_id = agg_id;
+	}
+
+	/* move VSI to corresponding aggregator node */
+	status = ice_move_vsi_to_agg(port_info, agg_id, vsi->idx,
+				     (u8)vsi->tc_cfg.ena_tc);
+	if (status) {
+		dev_err(dev, "unable to move VSI idx %u into aggregator %u node",
+			vsi->idx, agg_id);
+		return;
+	}
+
+	/* keep active children count for aggregator node */
+	agg_node->num_vsis++;
+
+	/* cache the 'agg_id' in VSI, so that after reset - VSI will be moved
+	 * to aggregator node
+	 */
+	vsi->agg_node = agg_node;
+	dev_dbg(dev, "successfully moved VSI idx %u tc_bitmap 0x%x) into aggregator node %d which has num_vsis %u\n",
+		vsi->idx, vsi->tc_cfg.ena_tc, vsi->agg_node->agg_id,
+		vsi->agg_node->num_vsis);
+}
+
+/**
  * ice_vsi_setup - Set up a VSI by a given type
  * @pf: board private structure
  * @pi: pointer to the port_info instance
@@ -2327,6 +2454,8 @@ ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi,
 			ice_cfg_sw_lldp(vsi, true, true);
 		}
 
+	if (!vsi->agg_node)
+		ice_set_agg_vsi(vsi);
 	return vsi;
 
 unroll_clear_rings:
@@ -2342,6 +2471,8 @@ unroll_vsi_init:
 unroll_get_qs:
 	ice_vsi_put_qs(vsi);
 unroll_vsi_alloc:
+	if (vsi_type == ICE_VSI_VF)
+		ice_enable_lag(pf->lag);
 	ice_vsi_clear(vsi);
 
 	return NULL;
@@ -2669,6 +2800,9 @@ int ice_vsi_release(struct ice_vsi *vsi)
 		vsi->netdev = NULL;
 	}
 
+	if (vsi->type == ICE_VSI_VF &&
+	    vsi->agg_node && vsi->agg_node->valid)
+		vsi->agg_node->num_vsis--;
 	ice_vsi_clear_rings(vsi);
 
 	ice_vsi_put_qs(vsi);
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 98cd44a3ccf7..813ec6b8ac23 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -44,6 +44,11 @@ static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type);
 
 static void ice_vsi_release_all(struct ice_pf *pf);
 
+bool netif_is_ice(struct net_device *dev)
+{
+	return dev && (dev->netdev_ops == &ice_netdev_ops);
+}
+
 /**
  * ice_get_tx_pending - returns number of Tx descriptors not processed
  * @ring: the ring of descriptors
@@ -430,11 +435,19 @@ static void ice_sync_fltr_subtask(struct ice_pf *pf)
  */
 static void ice_pf_dis_all_vsi(struct ice_pf *pf, bool locked)
 {
+	int node;
 	int v;
 
 	ice_for_each_vsi(pf, v)
 		if (pf->vsi[v])
 			ice_dis_vsi(pf->vsi[v], locked);
+
+	for (node = 0; node < ICE_MAX_PF_AGG_NODES; node++)
+		pf->pf_agg_node[node].num_vsis = 0;
+
+	for (node = 0; node < ICE_MAX_VF_AGG_NODES; node++)
+		pf->vf_agg_node[node].num_vsis = 0;
+
 }
 
 /**
@@ -2476,6 +2489,22 @@ free_qmap:
 }
 
 /**
+ * ice_vsi_rx_napi_schedule - Schedule napi on RX queues from VSI
+ * @vsi: VSI to schedule napi on
+ */
+static void ice_vsi_rx_napi_schedule(struct ice_vsi *vsi)
+{
+	int i;
+
+	ice_for_each_rxq(vsi, i) {
+		struct ice_ring *rx_ring = vsi->rx_rings[i];
+
+		if (rx_ring->xsk_pool)
+			napi_schedule(&rx_ring->q_vector->napi);
+	}
+}
+
+/**
  * ice_xdp_setup_prog - Add or remove XDP eBPF program
  * @vsi: VSI to setup XDP for
  * @prog: XDP program
@@ -2519,16 +2548,8 @@ ice_xdp_setup_prog(struct ice_vsi *vsi, struct bpf_prog *prog,
 	if (if_running)
 		ret = ice_up(vsi);
 
-	if (!ret && prog && vsi->xsk_pools) {
-		int i;
-
-		ice_for_each_rxq(vsi, i) {
-			struct ice_ring *rx_ring = vsi->rx_rings[i];
-
-			if (rx_ring->xsk_pool)
-				napi_schedule(&rx_ring->q_vector->napi);
-		}
-	}
+	if (!ret && prog)
+		ice_vsi_rx_napi_schedule(vsi);
 
 	return (ret || xdp_ring_err) ? -ENOMEM : 0;
 }
@@ -3370,28 +3391,20 @@ static int ice_init_pf(struct ice_pf *pf)
  */
 static int ice_ena_msix_range(struct ice_pf *pf)
 {
+	int v_left, v_actual, v_other, v_budget = 0;
 	struct device *dev = ice_pf_to_dev(pf);
-	int v_left, v_actual, v_budget = 0;
 	int needed, err, i;
 
 	v_left = pf->hw.func_caps.common_cap.num_msix_vectors;
 
-	/* reserve one vector for miscellaneous handler */
-	needed = 1;
-	if (v_left < needed)
-		goto no_hw_vecs_left_err;
-	v_budget += needed;
-	v_left -= needed;
-
-	/* reserve vectors for LAN traffic */
-	needed = min_t(int, num_online_cpus(), v_left);
+	/* reserve for LAN miscellaneous handler */
+	needed = ICE_MIN_LAN_OICR_MSIX;
 	if (v_left < needed)
 		goto no_hw_vecs_left_err;
-	pf->num_lan_msix = needed;
 	v_budget += needed;
 	v_left -= needed;
 
-	/* reserve one vector for flow director */
+	/* reserve for flow director */
 	if (test_bit(ICE_FLAG_FD_ENA, pf->flags)) {
 		needed = ICE_FDIR_MSIX;
 		if (v_left < needed)
@@ -3400,9 +3413,19 @@ static int ice_ena_msix_range(struct ice_pf *pf)
 		v_left -= needed;
 	}
 
+	/* total used for non-traffic vectors */
+	v_other = v_budget;
+
+	/* reserve vectors for LAN traffic */
+	needed = min_t(int, num_online_cpus(), v_left);
+	if (v_left < needed)
+		goto no_hw_vecs_left_err;
+	pf->num_lan_msix = needed;
+	v_budget += needed;
+	v_left -= needed;
+
 	pf->msix_entries = devm_kcalloc(dev, v_budget,
 					sizeof(*pf->msix_entries), GFP_KERNEL);
-
 	if (!pf->msix_entries) {
 		err = -ENOMEM;
 		goto exit_err;
@@ -3414,7 +3437,6 @@ static int ice_ena_msix_range(struct ice_pf *pf)
 	/* actually reserve the vectors */
 	v_actual = pci_enable_msix_range(pf->pdev, pf->msix_entries,
 					 ICE_MIN_MSIX, v_budget);
-
 	if (v_actual < 0) {
 		dev_err(dev, "unable to reserve MSI-X vectors\n");
 		err = v_actual;
@@ -3431,7 +3453,16 @@ static int ice_ena_msix_range(struct ice_pf *pf)
 			err = -ERANGE;
 			goto msix_err;
 		} else {
-			pf->num_lan_msix = ICE_MIN_LAN_TXRX_MSIX;
+			int v_traffic = v_actual - v_other;
+
+			if (v_actual == ICE_MIN_MSIX ||
+			    v_traffic < ICE_MIN_LAN_TXRX_MSIX)
+				pf->num_lan_msix = ICE_MIN_LAN_TXRX_MSIX;
+			else
+				pf->num_lan_msix = v_traffic;
+
+			dev_notice(dev, "Enabled %d MSI-X vectors for LAN traffic.\n",
+				   pf->num_lan_msix);
 		}
 	}
 
@@ -4227,6 +4258,9 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent)
 		ice_cfg_lldp_mib_change(&pf->hw, true);
 	}
 
+	if (ice_init_lag(pf))
+		dev_warn(dev, "Failed to init link aggregation support\n");
+
 	/* print PCI link speed and width */
 	pcie_print_link_status(pf->pdev);
 
@@ -4349,6 +4383,7 @@ static void ice_remove(struct pci_dev *pdev)
 	ice_aq_cancel_waiting_tasks(pf);
 
 	mutex_destroy(&(&pf->hw)->fdir_fltr_lock);
+	ice_deinit_lag(pf);
 	if (!ice_is_safe_mode(pf))
 		ice_remove_arfs(pf);
 	ice_setup_mc_magic_wake(pf);
@@ -6152,7 +6187,7 @@ static int ice_change_mtu(struct net_device *netdev, int new_mtu)
 
 		err = ice_down(vsi);
 		if (err) {
-			netdev_err(netdev, "change MTU if_up err %d\n", err);
+			netdev_err(netdev, "change MTU if_down err %d\n", err);
 			return err;
 		}
 
diff --git a/drivers/net/ethernet/intel/ice/ice_sched.c b/drivers/net/ethernet/intel/ice/ice_sched.c
index f0912e44d4ad..2403cb38b93c 100644
--- a/drivers/net/ethernet/intel/ice/ice_sched.c
+++ b/drivers/net/ethernet/intel/ice/ice_sched.c
@@ -431,6 +431,27 @@ ice_aq_cfg_sched_elems(struct ice_hw *hw, u16 elems_req,
 }
 
 /**
+ * ice_aq_move_sched_elems - move scheduler elements
+ * @hw: pointer to the HW struct
+ * @grps_req: number of groups to move
+ * @buf: pointer to buffer
+ * @buf_size: buffer size in bytes
+ * @grps_movd: returns total number of groups moved
+ * @cd: pointer to command details structure or NULL
+ *
+ * Move scheduling elements (0x0408)
+ */
+static enum ice_status
+ice_aq_move_sched_elems(struct ice_hw *hw, u16 grps_req,
+			struct ice_aqc_move_elem *buf, u16 buf_size,
+			u16 *grps_movd, struct ice_sq_cd *cd)
+{
+	return ice_aqc_send_sched_elem_cmd(hw, ice_aqc_opc_move_sched_elems,
+					   grps_req, (void *)buf, buf_size,
+					   grps_movd, cd);
+}
+
+/**
  * ice_aq_suspend_sched_elems - suspend scheduler elements
  * @hw: pointer to the HW struct
  * @elems_req: number of elements to suspend
@@ -1022,6 +1043,28 @@ static u8 ice_sched_get_vsi_layer(struct ice_hw *hw)
 }
 
 /**
+ * ice_sched_get_agg_layer - get the current aggregator layer number
+ * @hw: pointer to the HW struct
+ *
+ * This function returns the current aggregator layer number
+ */
+static u8 ice_sched_get_agg_layer(struct ice_hw *hw)
+{
+	/* Num Layers       aggregator layer
+	 *     9               4
+	 *     7 or less       sw_entry_point_layer
+	 */
+	/* calculate the aggregator layer based on number of layers. */
+	if (hw->num_tx_sched_layers > ICE_AGG_LAYER_OFFSET + 1) {
+		u8 layer = hw->num_tx_sched_layers - ICE_AGG_LAYER_OFFSET;
+
+		if (layer > hw->sw_entry_point_layer)
+			return layer;
+	}
+	return hw->sw_entry_point_layer;
+}
+
+/**
  * ice_rm_dflt_leaf_node - remove the default leaf node in the tree
  * @pi: port information structure
  *
@@ -1239,6 +1282,46 @@ sched_query_out:
 }
 
 /**
+ * ice_sched_get_psm_clk_freq - determine the PSM clock frequency
+ * @hw: pointer to the HW struct
+ *
+ * Determine the PSM clock frequency and store in HW struct
+ */
+void ice_sched_get_psm_clk_freq(struct ice_hw *hw)
+{
+	u32 val, clk_src;
+
+	val = rd32(hw, GLGEN_CLKSTAT_SRC);
+	clk_src = (val & GLGEN_CLKSTAT_SRC_PSM_CLK_SRC_M) >>
+		GLGEN_CLKSTAT_SRC_PSM_CLK_SRC_S;
+
+#define PSM_CLK_SRC_367_MHZ 0x0
+#define PSM_CLK_SRC_416_MHZ 0x1
+#define PSM_CLK_SRC_446_MHZ 0x2
+#define PSM_CLK_SRC_390_MHZ 0x3
+
+	switch (clk_src) {
+	case PSM_CLK_SRC_367_MHZ:
+		hw->psm_clk_freq = ICE_PSM_CLK_367MHZ_IN_HZ;
+		break;
+	case PSM_CLK_SRC_416_MHZ:
+		hw->psm_clk_freq = ICE_PSM_CLK_416MHZ_IN_HZ;
+		break;
+	case PSM_CLK_SRC_446_MHZ:
+		hw->psm_clk_freq = ICE_PSM_CLK_446MHZ_IN_HZ;
+		break;
+	case PSM_CLK_SRC_390_MHZ:
+		hw->psm_clk_freq = ICE_PSM_CLK_390MHZ_IN_HZ;
+		break;
+	default:
+		ice_debug(hw, ICE_DBG_SCHED, "PSM clk_src unexpected %u\n",
+			  clk_src);
+		/* fall back to a safe default */
+		hw->psm_clk_freq = ICE_PSM_CLK_446MHZ_IN_HZ;
+	}
+}
+
+/**
  * ice_sched_find_node_in_subtree - Find node in part of base node subtree
  * @hw: pointer to the HW struct
  * @base: pointer to the base node
@@ -1364,7 +1447,7 @@ ice_sched_get_free_qparent(struct ice_port_info *pi, u16 vsi_handle, u8 tc,
 
 /**
  * ice_sched_get_vsi_node - Get a VSI node based on VSI ID
- * @hw: pointer to the HW struct
+ * @pi: pointer to the port information structure
  * @tc_node: pointer to the TC node
  * @vsi_handle: software VSI handle
  *
@@ -1372,14 +1455,14 @@ ice_sched_get_free_qparent(struct ice_port_info *pi, u16 vsi_handle, u8 tc,
  * TC branch
  */
 static struct ice_sched_node *
-ice_sched_get_vsi_node(struct ice_hw *hw, struct ice_sched_node *tc_node,
+ice_sched_get_vsi_node(struct ice_port_info *pi, struct ice_sched_node *tc_node,
 		       u16 vsi_handle)
 {
 	struct ice_sched_node *node;
 	u8 vsi_layer;
 
-	vsi_layer = ice_sched_get_vsi_layer(hw);
-	node = ice_sched_get_first_node(hw->port_info, tc_node, vsi_layer);
+	vsi_layer = ice_sched_get_vsi_layer(pi->hw);
+	node = ice_sched_get_first_node(pi, tc_node, vsi_layer);
 
 	/* Check whether it already exists */
 	while (node) {
@@ -1392,6 +1475,38 @@ ice_sched_get_vsi_node(struct ice_hw *hw, struct ice_sched_node *tc_node,
 }
 
 /**
+ * ice_sched_get_agg_node - Get an aggregator node based on aggregator ID
+ * @pi: pointer to the port information structure
+ * @tc_node: pointer to the TC node
+ * @agg_id: aggregator ID
+ *
+ * This function retrieves an aggregator node for a given aggregator ID from
+ * a given TC branch
+ */
+static struct ice_sched_node *
+ice_sched_get_agg_node(struct ice_port_info *pi, struct ice_sched_node *tc_node,
+		       u32 agg_id)
+{
+	struct ice_sched_node *node;
+	struct ice_hw *hw = pi->hw;
+	u8 agg_layer;
+
+	if (!hw)
+		return NULL;
+	agg_layer = ice_sched_get_agg_layer(hw);
+	node = ice_sched_get_first_node(pi, tc_node, agg_layer);
+
+	/* Check whether it already exists */
+	while (node) {
+		if (node->agg_id == agg_id)
+			return node;
+		node = node->sibling;
+	}
+
+	return node;
+}
+
+/**
  * ice_sched_calc_vsi_child_nodes - calculate number of VSI child nodes
  * @hw: pointer to the HW struct
  * @num_qs: number of queues
@@ -1444,7 +1559,7 @@ ice_sched_add_vsi_child_nodes(struct ice_port_info *pi, u16 vsi_handle,
 
 	qgl = ice_sched_get_qgrp_layer(hw);
 	vsil = ice_sched_get_vsi_layer(hw);
-	parent = ice_sched_get_vsi_node(hw, tc_node, vsi_handle);
+	parent = ice_sched_get_vsi_node(pi, tc_node, vsi_handle);
 	for (i = vsil + 1; i <= qgl; i++) {
 		if (!parent)
 			return ICE_ERR_CFG;
@@ -1477,7 +1592,7 @@ ice_sched_add_vsi_child_nodes(struct ice_port_info *pi, u16 vsi_handle,
 
 /**
  * ice_sched_calc_vsi_support_nodes - calculate number of VSI support nodes
- * @hw: pointer to the HW struct
+ * @pi: pointer to the port info structure
  * @tc_node: pointer to TC node
  * @num_nodes: pointer to num nodes array
  *
@@ -1486,15 +1601,15 @@ ice_sched_add_vsi_child_nodes(struct ice_port_info *pi, u16 vsi_handle,
  * layers
  */
 static void
-ice_sched_calc_vsi_support_nodes(struct ice_hw *hw,
+ice_sched_calc_vsi_support_nodes(struct ice_port_info *pi,
 				 struct ice_sched_node *tc_node, u16 *num_nodes)
 {
 	struct ice_sched_node *node;
 	u8 vsil;
 	int i;
 
-	vsil = ice_sched_get_vsi_layer(hw);
-	for (i = vsil; i >= hw->sw_entry_point_layer; i--)
+	vsil = ice_sched_get_vsi_layer(pi->hw);
+	for (i = vsil; i >= pi->hw->sw_entry_point_layer; i--)
 		/* Add intermediate nodes if TC has no children and
 		 * need at least one node for VSI
 		 */
@@ -1504,11 +1619,10 @@ ice_sched_calc_vsi_support_nodes(struct ice_hw *hw,
 			/* If intermediate nodes are reached max children
 			 * then add a new one.
 			 */
-			node = ice_sched_get_first_node(hw->port_info, tc_node,
-							(u8)i);
+			node = ice_sched_get_first_node(pi, tc_node, (u8)i);
 			/* scan all the siblings */
 			while (node) {
-				if (node->num_children < hw->max_children[i])
+				if (node->num_children < pi->hw->max_children[i])
 					break;
 				node = node->sibling;
 			}
@@ -1588,14 +1702,13 @@ ice_sched_add_vsi_to_topo(struct ice_port_info *pi, u16 vsi_handle, u8 tc)
 {
 	u16 num_nodes[ICE_AQC_TOPO_MAX_LEVEL_NUM] = { 0 };
 	struct ice_sched_node *tc_node;
-	struct ice_hw *hw = pi->hw;
 
 	tc_node = ice_sched_get_tc_node(pi, tc);
 	if (!tc_node)
 		return ICE_ERR_PARAM;
 
 	/* calculate number of supported nodes needed for this VSI */
-	ice_sched_calc_vsi_support_nodes(hw, tc_node, num_nodes);
+	ice_sched_calc_vsi_support_nodes(pi, tc_node, num_nodes);
 
 	/* add VSI supported nodes to TC subtree */
 	return ice_sched_add_vsi_support_nodes(pi, vsi_handle, tc_node,
@@ -1628,7 +1741,7 @@ ice_sched_update_vsi_child_nodes(struct ice_port_info *pi, u16 vsi_handle,
 	if (!tc_node)
 		return ICE_ERR_CFG;
 
-	vsi_node = ice_sched_get_vsi_node(hw, tc_node, vsi_handle);
+	vsi_node = ice_sched_get_vsi_node(pi, tc_node, vsi_handle);
 	if (!vsi_node)
 		return ICE_ERR_CFG;
 
@@ -1691,7 +1804,7 @@ ice_sched_cfg_vsi(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u16 maxqs,
 	vsi_ctx = ice_get_vsi_ctx(hw, vsi_handle);
 	if (!vsi_ctx)
 		return ICE_ERR_PARAM;
-	vsi_node = ice_sched_get_vsi_node(hw, tc_node, vsi_handle);
+	vsi_node = ice_sched_get_vsi_node(pi, tc_node, vsi_handle);
 
 	/* suspend the VSI if TC is not enabled */
 	if (!enable) {
@@ -1712,7 +1825,7 @@ ice_sched_cfg_vsi(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u16 maxqs,
 		if (status)
 			return status;
 
-		vsi_node = ice_sched_get_vsi_node(hw, tc_node, vsi_handle);
+		vsi_node = ice_sched_get_vsi_node(pi, tc_node, vsi_handle);
 		if (!vsi_node)
 			return ICE_ERR_CFG;
 
@@ -1821,7 +1934,7 @@ ice_sched_rm_vsi_cfg(struct ice_port_info *pi, u16 vsi_handle, u8 owner)
 		if (!tc_node)
 			continue;
 
-		vsi_node = ice_sched_get_vsi_node(pi->hw, tc_node, vsi_handle);
+		vsi_node = ice_sched_get_vsi_node(pi, tc_node, vsi_handle);
 		if (!vsi_node)
 			continue;
 
@@ -1874,6 +1987,720 @@ enum ice_status ice_rm_vsi_lan_cfg(struct ice_port_info *pi, u16 vsi_handle)
 }
 
 /**
+ * ice_get_agg_info - get the aggregator ID
+ * @hw: pointer to the hardware structure
+ * @agg_id: aggregator ID
+ *
+ * This function validates aggregator ID. The function returns info if
+ * aggregator ID is present in list otherwise it returns null.
+ */
+static struct ice_sched_agg_info *
+ice_get_agg_info(struct ice_hw *hw, u32 agg_id)
+{
+	struct ice_sched_agg_info *agg_info;
+
+	list_for_each_entry(agg_info, &hw->agg_list, list_entry)
+		if (agg_info->agg_id == agg_id)
+			return agg_info;
+
+	return NULL;
+}
+
+/**
+ * ice_sched_get_free_vsi_parent - Find a free parent node in aggregator subtree
+ * @hw: pointer to the HW struct
+ * @node: pointer to a child node
+ * @num_nodes: num nodes count array
+ *
+ * This function walks through the aggregator subtree to find a free parent
+ * node
+ */
+static struct ice_sched_node *
+ice_sched_get_free_vsi_parent(struct ice_hw *hw, struct ice_sched_node *node,
+			      u16 *num_nodes)
+{
+	u8 l = node->tx_sched_layer;
+	u8 vsil, i;
+
+	vsil = ice_sched_get_vsi_layer(hw);
+
+	/* Is it VSI parent layer ? */
+	if (l == vsil - 1)
+		return (node->num_children < hw->max_children[l]) ? node : NULL;
+
+	/* We have intermediate nodes. Let's walk through the subtree. If the
+	 * intermediate node has space to add a new node then clear the count
+	 */
+	if (node->num_children < hw->max_children[l])
+		num_nodes[l] = 0;
+	/* The below recursive call is intentional and wouldn't go more than
+	 * 2 or 3 iterations.
+	 */
+
+	for (i = 0; i < node->num_children; i++) {
+		struct ice_sched_node *parent;
+
+		parent = ice_sched_get_free_vsi_parent(hw, node->children[i],
+						       num_nodes);
+		if (parent)
+			return parent;
+	}
+
+	return NULL;
+}
+
+/**
+ * ice_sched_update_parent - update the new parent in SW DB
+ * @new_parent: pointer to a new parent node
+ * @node: pointer to a child node
+ *
+ * This function removes the child from the old parent and adds it to a new
+ * parent
+ */
+static void
+ice_sched_update_parent(struct ice_sched_node *new_parent,
+			struct ice_sched_node *node)
+{
+	struct ice_sched_node *old_parent;
+	u8 i, j;
+
+	old_parent = node->parent;
+
+	/* update the old parent children */
+	for (i = 0; i < old_parent->num_children; i++)
+		if (old_parent->children[i] == node) {
+			for (j = i + 1; j < old_parent->num_children; j++)
+				old_parent->children[j - 1] =
+					old_parent->children[j];
+			old_parent->num_children--;
+			break;
+		}
+
+	/* now move the node to a new parent */
+	new_parent->children[new_parent->num_children++] = node;
+	node->parent = new_parent;
+	node->info.parent_teid = new_parent->info.node_teid;
+}
+
+/**
+ * ice_sched_move_nodes - move child nodes to a given parent
+ * @pi: port information structure
+ * @parent: pointer to parent node
+ * @num_items: number of child nodes to be moved
+ * @list: pointer to child node teids
+ *
+ * This function move the child nodes to a given parent.
+ */
+static enum ice_status
+ice_sched_move_nodes(struct ice_port_info *pi, struct ice_sched_node *parent,
+		     u16 num_items, u32 *list)
+{
+	struct ice_aqc_move_elem *buf;
+	struct ice_sched_node *node;
+	enum ice_status status = 0;
+	u16 i, grps_movd = 0;
+	struct ice_hw *hw;
+	u16 buf_len;
+
+	hw = pi->hw;
+
+	if (!parent || !num_items)
+		return ICE_ERR_PARAM;
+
+	/* Does parent have enough space */
+	if (parent->num_children + num_items >
+	    hw->max_children[parent->tx_sched_layer])
+		return ICE_ERR_AQ_FULL;
+
+	buf_len = struct_size(buf, teid, 1);
+	buf = kzalloc(buf_len, GFP_KERNEL);
+	if (!buf)
+		return ICE_ERR_NO_MEMORY;
+
+	for (i = 0; i < num_items; i++) {
+		node = ice_sched_find_node_by_teid(pi->root, list[i]);
+		if (!node) {
+			status = ICE_ERR_PARAM;
+			goto move_err_exit;
+		}
+
+		buf->hdr.src_parent_teid = node->info.parent_teid;
+		buf->hdr.dest_parent_teid = parent->info.node_teid;
+		buf->teid[0] = node->info.node_teid;
+		buf->hdr.num_elems = cpu_to_le16(1);
+		status = ice_aq_move_sched_elems(hw, 1, buf, buf_len,
+						 &grps_movd, NULL);
+		if (status && grps_movd != 1) {
+			status = ICE_ERR_CFG;
+			goto move_err_exit;
+		}
+
+		/* update the SW DB */
+		ice_sched_update_parent(parent, node);
+	}
+
+move_err_exit:
+	kfree(buf);
+	return status;
+}
+
+/**
+ * ice_sched_move_vsi_to_agg - move VSI to aggregator node
+ * @pi: port information structure
+ * @vsi_handle: software VSI handle
+ * @agg_id: aggregator ID
+ * @tc: TC number
+ *
+ * This function moves a VSI to an aggregator node or its subtree.
+ * Intermediate nodes may be created if required.
+ */
+static enum ice_status
+ice_sched_move_vsi_to_agg(struct ice_port_info *pi, u16 vsi_handle, u32 agg_id,
+			  u8 tc)
+{
+	struct ice_sched_node *vsi_node, *agg_node, *tc_node, *parent;
+	u16 num_nodes[ICE_AQC_TOPO_MAX_LEVEL_NUM] = { 0 };
+	u32 first_node_teid, vsi_teid;
+	enum ice_status status;
+	u16 num_nodes_added;
+	u8 aggl, vsil, i;
+
+	tc_node = ice_sched_get_tc_node(pi, tc);
+	if (!tc_node)
+		return ICE_ERR_CFG;
+
+	agg_node = ice_sched_get_agg_node(pi, tc_node, agg_id);
+	if (!agg_node)
+		return ICE_ERR_DOES_NOT_EXIST;
+
+	vsi_node = ice_sched_get_vsi_node(pi, tc_node, vsi_handle);
+	if (!vsi_node)
+		return ICE_ERR_DOES_NOT_EXIST;
+
+	/* Is this VSI already part of given aggregator? */
+	if (ice_sched_find_node_in_subtree(pi->hw, agg_node, vsi_node))
+		return 0;
+
+	aggl = ice_sched_get_agg_layer(pi->hw);
+	vsil = ice_sched_get_vsi_layer(pi->hw);
+
+	/* set intermediate node count to 1 between aggregator and VSI layers */
+	for (i = aggl + 1; i < vsil; i++)
+		num_nodes[i] = 1;
+
+	/* Check if the aggregator subtree has any free node to add the VSI */
+	for (i = 0; i < agg_node->num_children; i++) {
+		parent = ice_sched_get_free_vsi_parent(pi->hw,
+						       agg_node->children[i],
+						       num_nodes);
+		if (parent)
+			goto move_nodes;
+	}
+
+	/* add new nodes */
+	parent = agg_node;
+	for (i = aggl + 1; i < vsil; i++) {
+		status = ice_sched_add_nodes_to_layer(pi, tc_node, parent, i,
+						      num_nodes[i],
+						      &first_node_teid,
+						      &num_nodes_added);
+		if (status || num_nodes[i] != num_nodes_added)
+			return ICE_ERR_CFG;
+
+		/* The newly added node can be a new parent for the next
+		 * layer nodes
+		 */
+		if (num_nodes_added)
+			parent = ice_sched_find_node_by_teid(tc_node,
+							     first_node_teid);
+		else
+			parent = parent->children[0];
+
+		if (!parent)
+			return ICE_ERR_CFG;
+	}
+
+move_nodes:
+	vsi_teid = le32_to_cpu(vsi_node->info.node_teid);
+	return ice_sched_move_nodes(pi, parent, 1, &vsi_teid);
+}
+
+/**
+ * ice_move_all_vsi_to_dflt_agg - move all VSI(s) to default aggregator
+ * @pi: port information structure
+ * @agg_info: aggregator info
+ * @tc: traffic class number
+ * @rm_vsi_info: true or false
+ *
+ * This function move all the VSI(s) to the default aggregator and delete
+ * aggregator VSI info based on passed in boolean parameter rm_vsi_info. The
+ * caller holds the scheduler lock.
+ */
+static enum ice_status
+ice_move_all_vsi_to_dflt_agg(struct ice_port_info *pi,
+			     struct ice_sched_agg_info *agg_info, u8 tc,
+			     bool rm_vsi_info)
+{
+	struct ice_sched_agg_vsi_info *agg_vsi_info;
+	struct ice_sched_agg_vsi_info *tmp;
+	enum ice_status status = 0;
+
+	list_for_each_entry_safe(agg_vsi_info, tmp, &agg_info->agg_vsi_list,
+				 list_entry) {
+		u16 vsi_handle = agg_vsi_info->vsi_handle;
+
+		/* Move VSI to default aggregator */
+		if (!ice_is_tc_ena(agg_vsi_info->tc_bitmap[0], tc))
+			continue;
+
+		status = ice_sched_move_vsi_to_agg(pi, vsi_handle,
+						   ICE_DFLT_AGG_ID, tc);
+		if (status)
+			break;
+
+		clear_bit(tc, agg_vsi_info->tc_bitmap);
+		if (rm_vsi_info && !agg_vsi_info->tc_bitmap[0]) {
+			list_del(&agg_vsi_info->list_entry);
+			devm_kfree(ice_hw_to_dev(pi->hw), agg_vsi_info);
+		}
+	}
+
+	return status;
+}
+
+/**
+ * ice_sched_is_agg_inuse - check whether the aggregator is in use or not
+ * @pi: port information structure
+ * @node: node pointer
+ *
+ * This function checks whether the aggregator is attached with any VSI or not.
+ */
+static bool
+ice_sched_is_agg_inuse(struct ice_port_info *pi, struct ice_sched_node *node)
+{
+	u8 vsil, i;
+
+	vsil = ice_sched_get_vsi_layer(pi->hw);
+	if (node->tx_sched_layer < vsil - 1) {
+		for (i = 0; i < node->num_children; i++)
+			if (ice_sched_is_agg_inuse(pi, node->children[i]))
+				return true;
+		return false;
+	} else {
+		return node->num_children ? true : false;
+	}
+}
+
+/**
+ * ice_sched_rm_agg_cfg - remove the aggregator node
+ * @pi: port information structure
+ * @agg_id: aggregator ID
+ * @tc: TC number
+ *
+ * This function removes the aggregator node and intermediate nodes if any
+ * from the given TC
+ */
+static enum ice_status
+ice_sched_rm_agg_cfg(struct ice_port_info *pi, u32 agg_id, u8 tc)
+{
+	struct ice_sched_node *tc_node, *agg_node;
+	struct ice_hw *hw = pi->hw;
+
+	tc_node = ice_sched_get_tc_node(pi, tc);
+	if (!tc_node)
+		return ICE_ERR_CFG;
+
+	agg_node = ice_sched_get_agg_node(pi, tc_node, agg_id);
+	if (!agg_node)
+		return ICE_ERR_DOES_NOT_EXIST;
+
+	/* Can't remove the aggregator node if it has children */
+	if (ice_sched_is_agg_inuse(pi, agg_node))
+		return ICE_ERR_IN_USE;
+
+	/* need to remove the whole subtree if aggregator node is the
+	 * only child.
+	 */
+	while (agg_node->tx_sched_layer > hw->sw_entry_point_layer) {
+		struct ice_sched_node *parent = agg_node->parent;
+
+		if (!parent)
+			return ICE_ERR_CFG;
+
+		if (parent->num_children > 1)
+			break;
+
+		agg_node = parent;
+	}
+
+	ice_free_sched_node(pi, agg_node);
+	return 0;
+}
+
+/**
+ * ice_rm_agg_cfg_tc - remove aggregator configuration for TC
+ * @pi: port information structure
+ * @agg_info: aggregator ID
+ * @tc: TC number
+ * @rm_vsi_info: bool value true or false
+ *
+ * This function removes aggregator reference to VSI of given TC. It removes
+ * the aggregator configuration completely for requested TC. The caller needs
+ * to hold the scheduler lock.
+ */
+static enum ice_status
+ice_rm_agg_cfg_tc(struct ice_port_info *pi, struct ice_sched_agg_info *agg_info,
+		  u8 tc, bool rm_vsi_info)
+{
+	enum ice_status status = 0;
+
+	/* If nothing to remove - return success */
+	if (!ice_is_tc_ena(agg_info->tc_bitmap[0], tc))
+		goto exit_rm_agg_cfg_tc;
+
+	status = ice_move_all_vsi_to_dflt_agg(pi, agg_info, tc, rm_vsi_info);
+	if (status)
+		goto exit_rm_agg_cfg_tc;
+
+	/* Delete aggregator node(s) */
+	status = ice_sched_rm_agg_cfg(pi, agg_info->agg_id, tc);
+	if (status)
+		goto exit_rm_agg_cfg_tc;
+
+	clear_bit(tc, agg_info->tc_bitmap);
+exit_rm_agg_cfg_tc:
+	return status;
+}
+
+/**
+ * ice_save_agg_tc_bitmap - save aggregator TC bitmap
+ * @pi: port information structure
+ * @agg_id: aggregator ID
+ * @tc_bitmap: 8 bits TC bitmap
+ *
+ * Save aggregator TC bitmap. This function needs to be called with scheduler
+ * lock held.
+ */
+static enum ice_status
+ice_save_agg_tc_bitmap(struct ice_port_info *pi, u32 agg_id,
+		       unsigned long *tc_bitmap)
+{
+	struct ice_sched_agg_info *agg_info;
+
+	agg_info = ice_get_agg_info(pi->hw, agg_id);
+	if (!agg_info)
+		return ICE_ERR_PARAM;
+	bitmap_copy(agg_info->replay_tc_bitmap, tc_bitmap,
+		    ICE_MAX_TRAFFIC_CLASS);
+	return 0;
+}
+
+/**
+ * ice_sched_add_agg_cfg - create an aggregator node
+ * @pi: port information structure
+ * @agg_id: aggregator ID
+ * @tc: TC number
+ *
+ * This function creates an aggregator node and intermediate nodes if required
+ * for the given TC
+ */
+static enum ice_status
+ice_sched_add_agg_cfg(struct ice_port_info *pi, u32 agg_id, u8 tc)
+{
+	struct ice_sched_node *parent, *agg_node, *tc_node;
+	u16 num_nodes[ICE_AQC_TOPO_MAX_LEVEL_NUM] = { 0 };
+	enum ice_status status = 0;
+	struct ice_hw *hw = pi->hw;
+	u32 first_node_teid;
+	u16 num_nodes_added;
+	u8 i, aggl;
+
+	tc_node = ice_sched_get_tc_node(pi, tc);
+	if (!tc_node)
+		return ICE_ERR_CFG;
+
+	agg_node = ice_sched_get_agg_node(pi, tc_node, agg_id);
+	/* Does Agg node already exist ? */
+	if (agg_node)
+		return status;
+
+	aggl = ice_sched_get_agg_layer(hw);
+
+	/* need one node in Agg layer */
+	num_nodes[aggl] = 1;
+
+	/* Check whether the intermediate nodes have space to add the
+	 * new aggregator. If they are full, then SW needs to allocate a new
+	 * intermediate node on those layers
+	 */
+	for (i = hw->sw_entry_point_layer; i < aggl; i++) {
+		parent = ice_sched_get_first_node(pi, tc_node, i);
+
+		/* scan all the siblings */
+		while (parent) {
+			if (parent->num_children < hw->max_children[i])
+				break;
+			parent = parent->sibling;
+		}
+
+		/* all the nodes are full, reserve one for this layer */
+		if (!parent)
+			num_nodes[i]++;
+	}
+
+	/* add the aggregator node */
+	parent = tc_node;
+	for (i = hw->sw_entry_point_layer; i <= aggl; i++) {
+		if (!parent)
+			return ICE_ERR_CFG;
+
+		status = ice_sched_add_nodes_to_layer(pi, tc_node, parent, i,
+						      num_nodes[i],
+						      &first_node_teid,
+						      &num_nodes_added);
+		if (status || num_nodes[i] != num_nodes_added)
+			return ICE_ERR_CFG;
+
+		/* The newly added node can be a new parent for the next
+		 * layer nodes
+		 */
+		if (num_nodes_added) {
+			parent = ice_sched_find_node_by_teid(tc_node,
+							     first_node_teid);
+			/* register aggregator ID with the aggregator node */
+			if (parent && i == aggl)
+				parent->agg_id = agg_id;
+		} else {
+			parent = parent->children[0];
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * ice_sched_cfg_agg - configure aggregator node
+ * @pi: port information structure
+ * @agg_id: aggregator ID
+ * @agg_type: aggregator type queue, VSI, or aggregator group
+ * @tc_bitmap: bits TC bitmap
+ *
+ * It registers a unique aggregator node into scheduler services. It
+ * allows a user to register with a unique ID to track it's resources.
+ * The aggregator type determines if this is a queue group, VSI group
+ * or aggregator group. It then creates the aggregator node(s) for requested
+ * TC(s) or removes an existing aggregator node including its configuration
+ * if indicated via tc_bitmap. Call ice_rm_agg_cfg to release aggregator
+ * resources and remove aggregator ID.
+ * This function needs to be called with scheduler lock held.
+ */
+static enum ice_status
+ice_sched_cfg_agg(struct ice_port_info *pi, u32 agg_id,
+		  enum ice_agg_type agg_type, unsigned long *tc_bitmap)
+{
+	struct ice_sched_agg_info *agg_info;
+	enum ice_status status = 0;
+	struct ice_hw *hw = pi->hw;
+	u8 tc;
+
+	agg_info = ice_get_agg_info(hw, agg_id);
+	if (!agg_info) {
+		/* Create new entry for new aggregator ID */
+		agg_info = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*agg_info),
+					GFP_KERNEL);
+		if (!agg_info)
+			return ICE_ERR_NO_MEMORY;
+
+		agg_info->agg_id = agg_id;
+		agg_info->agg_type = agg_type;
+		agg_info->tc_bitmap[0] = 0;
+
+		/* Initialize the aggregator VSI list head */
+		INIT_LIST_HEAD(&agg_info->agg_vsi_list);
+
+		/* Add new entry in aggregator list */
+		list_add(&agg_info->list_entry, &hw->agg_list);
+	}
+	/* Create aggregator node(s) for requested TC(s) */
+	ice_for_each_traffic_class(tc) {
+		if (!ice_is_tc_ena(*tc_bitmap, tc)) {
+			/* Delete aggregator cfg TC if it exists previously */
+			status = ice_rm_agg_cfg_tc(pi, agg_info, tc, false);
+			if (status)
+				break;
+			continue;
+		}
+
+		/* Check if aggregator node for TC already exists */
+		if (ice_is_tc_ena(agg_info->tc_bitmap[0], tc))
+			continue;
+
+		/* Create new aggregator node for TC */
+		status = ice_sched_add_agg_cfg(pi, agg_id, tc);
+		if (status)
+			break;
+
+		/* Save aggregator node's TC information */
+		set_bit(tc, agg_info->tc_bitmap);
+	}
+
+	return status;
+}
+
+/**
+ * ice_cfg_agg - config aggregator node
+ * @pi: port information structure
+ * @agg_id: aggregator ID
+ * @agg_type: aggregator type queue, VSI, or aggregator group
+ * @tc_bitmap: bits TC bitmap
+ *
+ * This function configures aggregator node(s).
+ */
+enum ice_status
+ice_cfg_agg(struct ice_port_info *pi, u32 agg_id, enum ice_agg_type agg_type,
+	    u8 tc_bitmap)
+{
+	unsigned long bitmap = tc_bitmap;
+	enum ice_status status;
+
+	mutex_lock(&pi->sched_lock);
+	status = ice_sched_cfg_agg(pi, agg_id, agg_type,
+				   (unsigned long *)&bitmap);
+	if (!status)
+		status = ice_save_agg_tc_bitmap(pi, agg_id,
+						(unsigned long *)&bitmap);
+	mutex_unlock(&pi->sched_lock);
+	return status;
+}
+
+/**
+ * ice_get_agg_vsi_info - get the aggregator ID
+ * @agg_info: aggregator info
+ * @vsi_handle: software VSI handle
+ *
+ * The function returns aggregator VSI info based on VSI handle. This function
+ * needs to be called with scheduler lock held.
+ */
+static struct ice_sched_agg_vsi_info *
+ice_get_agg_vsi_info(struct ice_sched_agg_info *agg_info, u16 vsi_handle)
+{
+	struct ice_sched_agg_vsi_info *agg_vsi_info;
+
+	list_for_each_entry(agg_vsi_info, &agg_info->agg_vsi_list, list_entry)
+		if (agg_vsi_info->vsi_handle == vsi_handle)
+			return agg_vsi_info;
+
+	return NULL;
+}
+
+/**
+ * ice_get_vsi_agg_info - get the aggregator info of VSI
+ * @hw: pointer to the hardware structure
+ * @vsi_handle: Sw VSI handle
+ *
+ * The function returns aggregator info of VSI represented via vsi_handle. The
+ * VSI has in this case a different aggregator than the default one. This
+ * function needs to be called with scheduler lock held.
+ */
+static struct ice_sched_agg_info *
+ice_get_vsi_agg_info(struct ice_hw *hw, u16 vsi_handle)
+{
+	struct ice_sched_agg_info *agg_info;
+
+	list_for_each_entry(agg_info, &hw->agg_list, list_entry) {
+		struct ice_sched_agg_vsi_info *agg_vsi_info;
+
+		agg_vsi_info = ice_get_agg_vsi_info(agg_info, vsi_handle);
+		if (agg_vsi_info)
+			return agg_info;
+	}
+	return NULL;
+}
+
+/**
+ * ice_save_agg_vsi_tc_bitmap - save aggregator VSI TC bitmap
+ * @pi: port information structure
+ * @agg_id: aggregator ID
+ * @vsi_handle: software VSI handle
+ * @tc_bitmap: TC bitmap of enabled TC(s)
+ *
+ * Save VSI to aggregator TC bitmap. This function needs to call with scheduler
+ * lock held.
+ */
+static enum ice_status
+ice_save_agg_vsi_tc_bitmap(struct ice_port_info *pi, u32 agg_id, u16 vsi_handle,
+			   unsigned long *tc_bitmap)
+{
+	struct ice_sched_agg_vsi_info *agg_vsi_info;
+	struct ice_sched_agg_info *agg_info;
+
+	agg_info = ice_get_agg_info(pi->hw, agg_id);
+	if (!agg_info)
+		return ICE_ERR_PARAM;
+	/* check if entry already exist */
+	agg_vsi_info = ice_get_agg_vsi_info(agg_info, vsi_handle);
+	if (!agg_vsi_info)
+		return ICE_ERR_PARAM;
+	bitmap_copy(agg_vsi_info->replay_tc_bitmap, tc_bitmap,
+		    ICE_MAX_TRAFFIC_CLASS);
+	return 0;
+}
+
+/**
+ * ice_sched_assoc_vsi_to_agg - associate/move VSI to new/default aggregator
+ * @pi: port information structure
+ * @agg_id: aggregator ID
+ * @vsi_handle: software VSI handle
+ * @tc_bitmap: TC bitmap of enabled TC(s)
+ *
+ * This function moves VSI to a new or default aggregator node. If VSI is
+ * already associated to the aggregator node then no operation is performed on
+ * the tree. This function needs to be called with scheduler lock held.
+ */
+static enum ice_status
+ice_sched_assoc_vsi_to_agg(struct ice_port_info *pi, u32 agg_id,
+			   u16 vsi_handle, unsigned long *tc_bitmap)
+{
+	struct ice_sched_agg_vsi_info *agg_vsi_info;
+	struct ice_sched_agg_info *agg_info;
+	enum ice_status status = 0;
+	struct ice_hw *hw = pi->hw;
+	u8 tc;
+
+	if (!ice_is_vsi_valid(pi->hw, vsi_handle))
+		return ICE_ERR_PARAM;
+	agg_info = ice_get_agg_info(hw, agg_id);
+	if (!agg_info)
+		return ICE_ERR_PARAM;
+	/* check if entry already exist */
+	agg_vsi_info = ice_get_agg_vsi_info(agg_info, vsi_handle);
+	if (!agg_vsi_info) {
+		/* Create new entry for VSI under aggregator list */
+		agg_vsi_info = devm_kzalloc(ice_hw_to_dev(hw),
+					    sizeof(*agg_vsi_info), GFP_KERNEL);
+		if (!agg_vsi_info)
+			return ICE_ERR_PARAM;
+
+		/* add VSI ID into the aggregator list */
+		agg_vsi_info->vsi_handle = vsi_handle;
+		list_add(&agg_vsi_info->list_entry, &agg_info->agg_vsi_list);
+	}
+	/* Move VSI node to new aggregator node for requested TC(s) */
+	ice_for_each_traffic_class(tc) {
+		if (!ice_is_tc_ena(*tc_bitmap, tc))
+			continue;
+
+		/* Move VSI to new aggregator */
+		status = ice_sched_move_vsi_to_agg(pi, vsi_handle, agg_id, tc);
+		if (status)
+			break;
+
+		set_bit(tc, agg_vsi_info->tc_bitmap);
+	}
+	return status;
+}
+
+/**
  * ice_sched_rm_unused_rl_prof - remove unused RL profile
  * @pi: port information structure
  *
@@ -1955,7 +2782,6 @@ ice_sched_cfg_node_bw_alloc(struct ice_hw *hw, struct ice_sched_node *node,
 {
 	struct ice_aqc_txsched_elem_data buf;
 	struct ice_aqc_txsched_elem *data;
-	enum ice_status status;
 
 	buf = node->info;
 	data = &buf.data;
@@ -1970,7 +2796,32 @@ ice_sched_cfg_node_bw_alloc(struct ice_hw *hw, struct ice_sched_node *node,
 	}
 
 	/* Configure element */
-	status = ice_sched_update_elem(hw, node, &buf);
+	return ice_sched_update_elem(hw, node, &buf);
+}
+
+/**
+ * ice_move_vsi_to_agg - moves VSI to new or default aggregator
+ * @pi: port information structure
+ * @agg_id: aggregator ID
+ * @vsi_handle: software VSI handle
+ * @tc_bitmap: TC bitmap of enabled TC(s)
+ *
+ * Move or associate VSI to a new or default aggregator node.
+ */
+enum ice_status
+ice_move_vsi_to_agg(struct ice_port_info *pi, u32 agg_id, u16 vsi_handle,
+		    u8 tc_bitmap)
+{
+	unsigned long bitmap = tc_bitmap;
+	enum ice_status status;
+
+	mutex_lock(&pi->sched_lock);
+	status = ice_sched_assoc_vsi_to_agg(pi, agg_id, vsi_handle,
+					    (unsigned long *)&bitmap);
+	if (!status)
+		status = ice_save_agg_vsi_tc_bitmap(pi, agg_id, vsi_handle,
+						    (unsigned long *)&bitmap);
+	mutex_unlock(&pi->sched_lock);
 	return status;
 }
 
@@ -2045,11 +2896,12 @@ static void ice_set_clear_shared_bw(struct ice_bw_type_info *bw_t_info, u32 bw)
 
 /**
  * ice_sched_calc_wakeup - calculate RL profile wakeup parameter
+ * @hw: pointer to the HW struct
  * @bw: bandwidth in Kbps
  *
  * This function calculates the wakeup parameter of RL profile.
  */
-static u16 ice_sched_calc_wakeup(s32 bw)
+static u16 ice_sched_calc_wakeup(struct ice_hw *hw, s32 bw)
 {
 	s64 bytes_per_sec, wakeup_int, wakeup_a, wakeup_b, wakeup_f;
 	s32 wakeup_f_int;
@@ -2057,7 +2909,7 @@ static u16 ice_sched_calc_wakeup(s32 bw)
 
 	/* Get the wakeup integer value */
 	bytes_per_sec = div64_long(((s64)bw * 1000), BITS_PER_BYTE);
-	wakeup_int = div64_long(ICE_RL_PROF_FREQUENCY, bytes_per_sec);
+	wakeup_int = div64_long(hw->psm_clk_freq, bytes_per_sec);
 	if (wakeup_int > 63) {
 		wakeup = (u16)((1 << 15) | wakeup_int);
 	} else {
@@ -2066,8 +2918,7 @@ static u16 ice_sched_calc_wakeup(s32 bw)
 		 */
 		wakeup_b = (s64)ICE_RL_PROF_MULTIPLIER * wakeup_int;
 		wakeup_a = div64_long((s64)ICE_RL_PROF_MULTIPLIER *
-					   ICE_RL_PROF_FREQUENCY,
-				      bytes_per_sec);
+					   hw->psm_clk_freq, bytes_per_sec);
 
 		/* Get Fraction value */
 		wakeup_f = wakeup_a - wakeup_b;
@@ -2087,13 +2938,15 @@ static u16 ice_sched_calc_wakeup(s32 bw)
 
 /**
  * ice_sched_bw_to_rl_profile - convert BW to profile parameters
+ * @hw: pointer to the HW struct
  * @bw: bandwidth in Kbps
  * @profile: profile parameters to return
  *
  * This function converts the BW to profile structure format.
  */
 static enum ice_status
-ice_sched_bw_to_rl_profile(u32 bw, struct ice_aqc_rl_profile_elem *profile)
+ice_sched_bw_to_rl_profile(struct ice_hw *hw, u32 bw,
+			   struct ice_aqc_rl_profile_elem *profile)
 {
 	enum ice_status status = ICE_ERR_PARAM;
 	s64 bytes_per_sec, ts_rate, mv_tmp;
@@ -2113,7 +2966,7 @@ ice_sched_bw_to_rl_profile(u32 bw, struct ice_aqc_rl_profile_elem *profile)
 	for (i = 0; i < 64; i++) {
 		u64 pow_result = BIT_ULL(i);
 
-		ts_rate = div64_long((s64)ICE_RL_PROF_FREQUENCY,
+		ts_rate = div64_long((s64)hw->psm_clk_freq,
 				     pow_result * ICE_RL_PROF_TS_MULTIPLIER);
 		if (ts_rate <= 0)
 			continue;
@@ -2137,7 +2990,7 @@ ice_sched_bw_to_rl_profile(u32 bw, struct ice_aqc_rl_profile_elem *profile)
 	if (found) {
 		u16 wm;
 
-		wm = ice_sched_calc_wakeup(bw);
+		wm = ice_sched_calc_wakeup(hw, bw);
 		profile->rl_multiply = cpu_to_le16(mv);
 		profile->wake_up_calc = cpu_to_le16(wm);
 		profile->rl_encode = cpu_to_le16(encode);
@@ -2206,7 +3059,7 @@ ice_sched_add_rl_profile(struct ice_port_info *pi,
 	if (!rl_prof_elem)
 		return NULL;
 
-	status = ice_sched_bw_to_rl_profile(bw, &rl_prof_elem->profile);
+	status = ice_sched_bw_to_rl_profile(hw, bw, &rl_prof_elem->profile);
 	if (status)
 		goto exit_add_rl_prof;
 
@@ -2941,6 +3794,156 @@ ice_sched_replay_node_bw(struct ice_hw *hw, struct ice_sched_node *node,
 }
 
 /**
+ * ice_sched_get_ena_tc_bitmap - get enabled TC bitmap
+ * @pi: port info struct
+ * @tc_bitmap: 8 bits TC bitmap to check
+ * @ena_tc_bitmap: 8 bits enabled TC bitmap to return
+ *
+ * This function returns enabled TC bitmap in variable ena_tc_bitmap. Some TCs
+ * may be missing, it returns enabled TCs. This function needs to be called with
+ * scheduler lock held.
+ */
+static void
+ice_sched_get_ena_tc_bitmap(struct ice_port_info *pi,
+			    unsigned long *tc_bitmap,
+			    unsigned long *ena_tc_bitmap)
+{
+	u8 tc;
+
+	/* Some TC(s) may be missing after reset, adjust for replay */
+	ice_for_each_traffic_class(tc)
+		if (ice_is_tc_ena(*tc_bitmap, tc) &&
+		    (ice_sched_get_tc_node(pi, tc)))
+			set_bit(tc, ena_tc_bitmap);
+}
+
+/**
+ * ice_sched_replay_agg - recreate aggregator node(s)
+ * @hw: pointer to the HW struct
+ *
+ * This function recreate aggregator type nodes which are not replayed earlier.
+ * It also replay aggregator BW information. These aggregator nodes are not
+ * associated with VSI type node yet.
+ */
+void ice_sched_replay_agg(struct ice_hw *hw)
+{
+	struct ice_port_info *pi = hw->port_info;
+	struct ice_sched_agg_info *agg_info;
+
+	mutex_lock(&pi->sched_lock);
+	list_for_each_entry(agg_info, &hw->agg_list, list_entry)
+		/* replay aggregator (re-create aggregator node) */
+		if (!bitmap_equal(agg_info->tc_bitmap, agg_info->replay_tc_bitmap,
+				  ICE_MAX_TRAFFIC_CLASS)) {
+			DECLARE_BITMAP(replay_bitmap, ICE_MAX_TRAFFIC_CLASS);
+			enum ice_status status;
+
+			bitmap_zero(replay_bitmap, ICE_MAX_TRAFFIC_CLASS);
+			ice_sched_get_ena_tc_bitmap(pi,
+						    agg_info->replay_tc_bitmap,
+						    replay_bitmap);
+			status = ice_sched_cfg_agg(hw->port_info,
+						   agg_info->agg_id,
+						   ICE_AGG_TYPE_AGG,
+						   replay_bitmap);
+			if (status) {
+				dev_info(ice_hw_to_dev(hw),
+					 "Replay agg id[%d] failed\n",
+					 agg_info->agg_id);
+				/* Move on to next one */
+				continue;
+			}
+		}
+	mutex_unlock(&pi->sched_lock);
+}
+
+/**
+ * ice_sched_replay_agg_vsi_preinit - Agg/VSI replay pre initialization
+ * @hw: pointer to the HW struct
+ *
+ * This function initialize aggregator(s) TC bitmap to zero. A required
+ * preinit step for replaying aggregators.
+ */
+void ice_sched_replay_agg_vsi_preinit(struct ice_hw *hw)
+{
+	struct ice_port_info *pi = hw->port_info;
+	struct ice_sched_agg_info *agg_info;
+
+	mutex_lock(&pi->sched_lock);
+	list_for_each_entry(agg_info, &hw->agg_list, list_entry) {
+		struct ice_sched_agg_vsi_info *agg_vsi_info;
+
+		agg_info->tc_bitmap[0] = 0;
+		list_for_each_entry(agg_vsi_info, &agg_info->agg_vsi_list,
+				    list_entry)
+			agg_vsi_info->tc_bitmap[0] = 0;
+	}
+	mutex_unlock(&pi->sched_lock);
+}
+
+/**
+ * ice_sched_replay_vsi_agg - replay aggregator & VSI to aggregator node(s)
+ * @hw: pointer to the HW struct
+ * @vsi_handle: software VSI handle
+ *
+ * This function replays aggregator node, VSI to aggregator type nodes, and
+ * their node bandwidth information. This function needs to be called with
+ * scheduler lock held.
+ */
+static enum ice_status
+ice_sched_replay_vsi_agg(struct ice_hw *hw, u16 vsi_handle)
+{
+	DECLARE_BITMAP(replay_bitmap, ICE_MAX_TRAFFIC_CLASS);
+	struct ice_sched_agg_vsi_info *agg_vsi_info;
+	struct ice_port_info *pi = hw->port_info;
+	struct ice_sched_agg_info *agg_info;
+	enum ice_status status;
+
+	bitmap_zero(replay_bitmap, ICE_MAX_TRAFFIC_CLASS);
+	if (!ice_is_vsi_valid(hw, vsi_handle))
+		return ICE_ERR_PARAM;
+	agg_info = ice_get_vsi_agg_info(hw, vsi_handle);
+	if (!agg_info)
+		return 0; /* Not present in list - default Agg case */
+	agg_vsi_info = ice_get_agg_vsi_info(agg_info, vsi_handle);
+	if (!agg_vsi_info)
+		return 0; /* Not present in list - default Agg case */
+	ice_sched_get_ena_tc_bitmap(pi, agg_info->replay_tc_bitmap,
+				    replay_bitmap);
+	/* Replay aggregator node associated to vsi_handle */
+	status = ice_sched_cfg_agg(hw->port_info, agg_info->agg_id,
+				   ICE_AGG_TYPE_AGG, replay_bitmap);
+	if (status)
+		return status;
+
+	bitmap_zero(replay_bitmap, ICE_MAX_TRAFFIC_CLASS);
+	ice_sched_get_ena_tc_bitmap(pi, agg_vsi_info->replay_tc_bitmap,
+				    replay_bitmap);
+	/* Move this VSI (vsi_handle) to above aggregator */
+	return ice_sched_assoc_vsi_to_agg(pi, agg_info->agg_id, vsi_handle,
+					  replay_bitmap);
+}
+
+/**
+ * ice_replay_vsi_agg - replay VSI to aggregator node
+ * @hw: pointer to the HW struct
+ * @vsi_handle: software VSI handle
+ *
+ * This function replays association of VSI to aggregator type nodes, and
+ * node bandwidth information.
+ */
+enum ice_status ice_replay_vsi_agg(struct ice_hw *hw, u16 vsi_handle)
+{
+	struct ice_port_info *pi = hw->port_info;
+	enum ice_status status;
+
+	mutex_lock(&pi->sched_lock);
+	status = ice_sched_replay_vsi_agg(hw, vsi_handle);
+	mutex_unlock(&pi->sched_lock);
+	return status;
+}
+
+/**
  * ice_sched_replay_q_bw - replay queue type node BW
  * @pi: port information structure
  * @q_ctx: queue context structure
diff --git a/drivers/net/ethernet/intel/ice/ice_sched.h b/drivers/net/ethernet/intel/ice/ice_sched.h
index 0e55ae0d446f..9beef8f0ec76 100644
--- a/drivers/net/ethernet/intel/ice/ice_sched.h
+++ b/drivers/net/ethernet/intel/ice/ice_sched.h
@@ -8,6 +8,7 @@
 
 #define ICE_QGRP_LAYER_OFFSET	2
 #define ICE_VSI_LAYER_OFFSET	4
+#define ICE_AGG_LAYER_OFFSET	6
 #define ICE_SCHED_INVAL_LAYER_NUM	0xFF
 /* Burst size is a 12 bits register that is configured while creating the RL
  * profile(s). MSB is a granularity bit and tells the granularity type
@@ -23,12 +24,16 @@
 	((BIT(11) - 1) * 64) /* In Bytes */
 #define ICE_MAX_BURST_SIZE_KBYTE_GRANULARITY	ICE_MAX_BURST_SIZE_ALLOWED
 
-#define ICE_RL_PROF_FREQUENCY 446000000
 #define ICE_RL_PROF_ACCURACY_BYTES 128
 #define ICE_RL_PROF_MULTIPLIER 10000
 #define ICE_RL_PROF_TS_MULTIPLIER 32
 #define ICE_RL_PROF_FRACTION 512
 
+#define ICE_PSM_CLK_367MHZ_IN_HZ 367647059
+#define ICE_PSM_CLK_416MHZ_IN_HZ 416666667
+#define ICE_PSM_CLK_446MHZ_IN_HZ 446428571
+#define ICE_PSM_CLK_390MHZ_IN_HZ 390625000
+
 /* BW rate limit profile parameters list entry along
  * with bandwidth maintained per layer in port info
  */
@@ -43,6 +48,8 @@ struct ice_sched_agg_vsi_info {
 	struct list_head list_entry;
 	DECLARE_BITMAP(tc_bitmap, ICE_MAX_TRAFFIC_CLASS);
 	u16 vsi_handle;
+	/* save aggregator VSI TC bitmap */
+	DECLARE_BITMAP(replay_tc_bitmap, ICE_MAX_TRAFFIC_CLASS);
 };
 
 struct ice_sched_agg_info {
@@ -51,6 +58,8 @@ struct ice_sched_agg_info {
 	DECLARE_BITMAP(tc_bitmap, ICE_MAX_TRAFFIC_CLASS);
 	u32 agg_id;
 	enum ice_agg_type agg_type;
+	/* save aggregator TC bitmap */
+	DECLARE_BITMAP(replay_tc_bitmap, ICE_MAX_TRAFFIC_CLASS);
 };
 
 /* FW AQ command calls */
@@ -60,6 +69,8 @@ ice_aq_query_sched_elems(struct ice_hw *hw, u16 elems_req,
 			 u16 *elems_ret, struct ice_sq_cd *cd);
 enum ice_status ice_sched_init_port(struct ice_port_info *pi);
 enum ice_status ice_sched_query_res_alloc(struct ice_hw *hw);
+void ice_sched_get_psm_clk_freq(struct ice_hw *hw);
+
 void ice_sched_clear_port(struct ice_port_info *pi);
 void ice_sched_cleanup_all(struct ice_hw *hw);
 void ice_sched_clear_agg(struct ice_hw *hw);
@@ -78,6 +89,14 @@ enum ice_status
 ice_sched_cfg_vsi(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u16 maxqs,
 		  u8 owner, bool enable);
 enum ice_status ice_rm_vsi_lan_cfg(struct ice_port_info *pi, u16 vsi_handle);
+
+/* Tx scheduler rate limiter functions */
+enum ice_status
+ice_cfg_agg(struct ice_port_info *pi, u32 agg_id,
+	    enum ice_agg_type agg_type, u8 tc_bitmap);
+enum ice_status
+ice_move_vsi_to_agg(struct ice_port_info *pi, u32 agg_id, u16 vsi_handle,
+		    u8 tc_bitmap);
 enum ice_status
 ice_cfg_q_bw_lmt(struct ice_port_info *pi, u16 vsi_handle, u8 tc,
 		 u16 q_handle, enum ice_rl_type rl_type, u32 bw);
@@ -85,6 +104,9 @@ enum ice_status
 ice_cfg_q_bw_dflt_lmt(struct ice_port_info *pi, u16 vsi_handle, u8 tc,
 		      u16 q_handle, enum ice_rl_type rl_type);
 enum ice_status ice_cfg_rl_burst_size(struct ice_hw *hw, u32 bytes);
+void ice_sched_replay_agg_vsi_preinit(struct ice_hw *hw);
+void ice_sched_replay_agg(struct ice_hw *hw);
+enum ice_status ice_replay_vsi_agg(struct ice_hw *hw, u16 vsi_handle);
 enum ice_status
 ice_sched_replay_q_bw(struct ice_port_info *pi, struct ice_q_ctx *q_ctx);
 #endif /* _ICE_SCHED_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c
index c33612132ddf..67c965a3f5d2 100644
--- a/drivers/net/ethernet/intel/ice/ice_switch.c
+++ b/drivers/net/ethernet/intel/ice/ice_switch.c
@@ -603,7 +603,7 @@ enum ice_status ice_get_initial_sw_cfg(struct ice_hw *hw)
 		}
 	} while (req_desc && !status);
 
-	devm_kfree(ice_hw_to_dev(hw), (void *)rbuf);
+	devm_kfree(ice_hw_to_dev(hw), rbuf);
 	return status;
 }
 
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 8ca63c6a6ba4..580419813bb2 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -1497,22 +1497,11 @@ static void ice_update_ena_itr(struct ice_q_vector *q_vector)
 	struct ice_vsi *vsi = q_vector->vsi;
 	u32 itr_val;
 
-	/* when exiting WB_ON_ITR lets set a low ITR value and trigger
-	 * interrupts to expire right away in case we have more work ready to go
-	 * already
+	/* when exiting WB_ON_ITR just reset the countdown and let ITR
+	 * resume it's normal "interrupts-enabled" path
 	 */
-	if (q_vector->itr_countdown == ICE_IN_WB_ON_ITR_MODE) {
-		itr_val = ice_buildreg_itr(rx->itr_idx, ICE_WB_ON_ITR_USECS);
-		wr32(&vsi->back->hw, GLINT_DYN_CTL(q_vector->reg_idx), itr_val);
-		/* set target back to last user set value */
-		rx->target_itr = rx->itr_setting;
-		/* set current to what we just wrote and dynamic if needed */
-		rx->current_itr = ICE_WB_ON_ITR_USECS |
-			(rx->itr_setting & ICE_ITR_DYNAMIC);
-		/* allow normal interrupt flow to start */
+	if (q_vector->itr_countdown == ICE_IN_WB_ON_ITR_MODE)
 		q_vector->itr_countdown = 0;
-		return;
-	}
 
 	/* This will do nothing if dynamic updates are not enabled */
 	ice_update_itr(q_vector, tx);
@@ -1552,10 +1541,8 @@ static void ice_update_ena_itr(struct ice_q_vector *q_vector)
 			q_vector->itr_countdown--;
 	}
 
-	if (!test_bit(__ICE_DOWN, q_vector->vsi->state))
-		wr32(&q_vector->vsi->back->hw,
-		     GLINT_DYN_CTL(q_vector->reg_idx),
-		     itr_val);
+	if (!test_bit(__ICE_DOWN, vsi->state))
+		wr32(&vsi->back->hw, GLINT_DYN_CTL(q_vector->reg_idx), itr_val);
 }
 
 /**
@@ -1565,30 +1552,29 @@ static void ice_update_ena_itr(struct ice_q_vector *q_vector)
  * We need to tell hardware to write-back completed descriptors even when
  * interrupts are disabled. Descriptors will be written back on cache line
  * boundaries without WB_ON_ITR enabled, but if we don't enable WB_ON_ITR
- * descriptors may not be written back if they don't fill a cache line until the
- * next interrupt.
+ * descriptors may not be written back if they don't fill a cache line until
+ * the next interrupt.
  *
- * This sets the write-back frequency to 2 microseconds as that is the minimum
- * value that's not 0 due to ITR granularity. Also, set the INTENA_MSK bit to
- * make sure hardware knows we aren't meddling with the INTENA_M bit.
+ * This sets the write-back frequency to whatever was set previously for the
+ * ITR indices. Also, set the INTENA_MSK bit to make sure hardware knows we
+ * aren't meddling with the INTENA_M bit.
  */
 static void ice_set_wb_on_itr(struct ice_q_vector *q_vector)
 {
 	struct ice_vsi *vsi = q_vector->vsi;
 
-	/* already in WB_ON_ITR mode no need to change it */
+	/* already in wb_on_itr mode no need to change it */
 	if (q_vector->itr_countdown == ICE_IN_WB_ON_ITR_MODE)
 		return;
 
-	if (q_vector->num_ring_rx)
-		wr32(&vsi->back->hw, GLINT_DYN_CTL(q_vector->reg_idx),
-		     ICE_GLINT_DYN_CTL_WB_ON_ITR(ICE_WB_ON_ITR_USECS,
-						 ICE_RX_ITR));
-
-	if (q_vector->num_ring_tx)
-		wr32(&vsi->back->hw, GLINT_DYN_CTL(q_vector->reg_idx),
-		     ICE_GLINT_DYN_CTL_WB_ON_ITR(ICE_WB_ON_ITR_USECS,
-						 ICE_TX_ITR));
+	/* use previously set ITR values for all of the ITR indices by
+	 * specifying ICE_ITR_NONE, which will vary in adaptive (AIM) mode and
+	 * be static in non-adaptive mode (user configured)
+	 */
+	wr32(&vsi->back->hw, GLINT_DYN_CTL(q_vector->reg_idx),
+	     ((ICE_ITR_NONE << GLINT_DYN_CTL_ITR_INDX_S) &
+	      GLINT_DYN_CTL_ITR_INDX_M) | GLINT_DYN_CTL_INTENA_MSK_M |
+	     GLINT_DYN_CTL_WB_ON_ITR_M);
 
 	q_vector->itr_countdown = ICE_IN_WB_ON_ITR_MODE;
 }
@@ -1655,8 +1641,13 @@ int ice_napi_poll(struct napi_struct *napi, int budget)
 	}
 
 	/* If work not completed, return budget and polling will return */
-	if (!clean_complete)
+	if (!clean_complete) {
+		/* Set the writeback on ITR so partial completions of
+		 * cache-lines will still continue even if we're polling.
+		 */
+		ice_set_wb_on_itr(q_vector);
 		return budget;
+	}
 
 	/* Exit the polling mode, but don't re-enable interrupts if stack might
 	 * poll us due to busy-polling
@@ -2413,7 +2404,7 @@ ice_xmit_frame_ring(struct sk_buff *skb, struct ice_ring *tx_ring)
 	/* allow CONTROL frames egress from main VSI if FW LLDP disabled */
 	if (unlikely(skb->priority == TC_PRIO_CONTROL &&
 		     vsi->type == ICE_VSI_PF &&
-		     vsi->port_info->is_sw_lldp))
+		     vsi->port_info->qos_cfg.is_sw_lldp))
 		offload.cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX |
 					ICE_TX_CTX_DESC_SWTCH_UPLINK <<
 					ICE_TXD_CTX_QW1_CMD_S);
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h
index ff1a1cbd078e..db56a0c8bfe1 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -240,7 +240,6 @@ enum ice_rx_dtype {
 #define ICE_DFLT_INTRL	0
 #define ICE_MAX_INTRL	236
 
-#define ICE_WB_ON_ITR_USECS	2
 #define ICE_IN_WB_ON_ITR_MODE	255
 /* Sets WB_ON_ITR and assumes INTENA bit is already cleared, which allows
  * setting the MSK_M bit to tell hardware to ignore the INTENA_M bit. Also,
diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h
index a98800a91045..a6cb0c35748c 100644
--- a/drivers/net/ethernet/intel/ice/ice_type.h
+++ b/drivers/net/ethernet/intel/ice/ice_type.h
@@ -403,7 +403,11 @@ struct ice_link_default_override_tlv {
 #define ice_for_each_traffic_class(_i)	\
 	for ((_i) = 0; (_i) < ICE_MAX_TRAFFIC_CLASS; (_i)++)
 
+/* ICE_DFLT_AGG_ID means that all new VM(s)/VSI node connects
+ * to driver defined policy for default aggregator
+ */
 #define ICE_INVAL_TEID 0xFFFFFFFF
+#define ICE_DFLT_AGG_ID 0
 
 struct ice_sched_node {
 	struct ice_sched_node *parent;
@@ -552,6 +556,14 @@ struct ice_dcbx_cfg {
 #define ICE_DCBX_APPS_NON_WILLING	0x1
 };
 
+struct ice_qos_cfg {
+	struct ice_dcbx_cfg local_dcbx_cfg;	/* Oper/Local Cfg */
+	struct ice_dcbx_cfg desired_dcbx_cfg;	/* CEE Desired Cfg */
+	struct ice_dcbx_cfg remote_dcbx_cfg;	/* Peer Cfg */
+	u8 dcbx_status : 3;			/* see ICE_DCBX_STATUS_DIS */
+	u8 is_sw_lldp : 1;
+};
+
 struct ice_port_info {
 	struct ice_sched_node *root;	/* Root Node per Port */
 	struct ice_hw *hw;		/* back pointer to HW instance */
@@ -575,13 +587,7 @@ struct ice_port_info {
 		sib_head[ICE_MAX_TRAFFIC_CLASS][ICE_AQC_TOPO_MAX_LEVEL_NUM];
 	/* List contain profile ID(s) and other params per layer */
 	struct list_head rl_prof_list[ICE_AQC_TOPO_MAX_LEVEL_NUM];
-	struct ice_dcbx_cfg local_dcbx_cfg;	/* Oper/Local Cfg */
-	/* DCBX info */
-	struct ice_dcbx_cfg remote_dcbx_cfg;	/* Peer Cfg */
-	struct ice_dcbx_cfg desired_dcbx_cfg;	/* CEE Desired Cfg */
-	/* LLDP/DCBX Status */
-	u8 dcbx_status:3;		/* see ICE_DCBX_STATUS_DIS */
-	u8 is_sw_lldp:1;
+	struct ice_qos_cfg qos_cfg;
 	u8 is_vf:1;
 };
 
@@ -614,6 +620,8 @@ struct ice_hw {
 	void *back;
 	struct ice_aqc_layer_props *layer_info;
 	struct ice_port_info *port_info;
+	/* PSM clock frequency for calculating RL profile params */
+	u32 psm_clk_freq;
 	u64 debug_mask;		/* bitmap for debug mask */
 	enum ice_mac_type mac_type;
 
@@ -902,4 +910,9 @@ struct ice_hw_port_stats {
 /* Hash redirection LUT for VSI - maximum array size */
 #define ICE_VSIQF_HLUT_ARRAY_SIZE	((VSIQF_HLUT_MAX_INDEX + 1) * 4)
 
+/* AQ API version for LLDP_FILTER_CONTROL */
+#define ICE_FW_API_LLDP_FLTR_MAJ	1
+#define ICE_FW_API_LLDP_FLTR_MIN	7
+#define ICE_FW_API_LLDP_FLTR_PATCH	1
+
 #endif /* _ICE_TYPE_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
index ec7f6c64132e..bf5fd812ea0e 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
@@ -1057,11 +1057,45 @@ static void ice_vf_pre_vsi_rebuild(struct ice_vf *vf)
 }
 
 /**
+ * ice_vf_rebuild_aggregator_node_cfg - rebuild aggregator node config
+ * @vsi: Pointer to VSI
+ *
+ * This function moves VSI into corresponding scheduler aggregator node
+ * based on cached value of "aggregator node info" per VSI
+ */
+static void ice_vf_rebuild_aggregator_node_cfg(struct ice_vsi *vsi)
+{
+	struct ice_pf *pf = vsi->back;
+	enum ice_status status;
+	struct device *dev;
+
+	if (!vsi->agg_node)
+		return;
+
+	dev = ice_pf_to_dev(pf);
+	if (vsi->agg_node->num_vsis == ICE_MAX_VSIS_IN_AGG_NODE) {
+		dev_dbg(dev,
+			"agg_id %u already has reached max_num_vsis %u\n",
+			vsi->agg_node->agg_id, vsi->agg_node->num_vsis);
+		return;
+	}
+
+	status = ice_move_vsi_to_agg(pf->hw.port_info, vsi->agg_node->agg_id,
+				     vsi->idx, vsi->tc_cfg.ena_tc);
+	if (status)
+		dev_dbg(dev, "unable to move VSI idx %u into aggregator %u node",
+			vsi->idx, vsi->agg_node->agg_id);
+	else
+		vsi->agg_node->num_vsis++;
+}
+
+/**
  * ice_vf_rebuild_host_cfg - host admin configuration is persistent across reset
  * @vf: VF to rebuild host configuration on
  */
 static void ice_vf_rebuild_host_cfg(struct ice_vf *vf)
 {
+	struct ice_vsi *vsi = vf->pf->vsi[vf->lan_vsi_idx];
 	struct device *dev = ice_pf_to_dev(vf->pf);
 
 	ice_vf_set_host_trust_cfg(vf);
@@ -1073,6 +1107,8 @@ static void ice_vf_rebuild_host_cfg(struct ice_vf *vf)
 	if (ice_vf_rebuild_host_vlan_cfg(vf))
 		dev_err(dev, "failed to rebuild VLAN configuration for VF %u\n",
 			vf->vf_id);
+	/* rebuild aggregator node config for main VF VSI */
+	ice_vf_rebuild_aggregator_node_cfg(vsi);
 }
 
 /**
@@ -1677,6 +1713,8 @@ int ice_sriov_configure(struct pci_dev *pdev, int num_vfs)
 	if (!num_vfs) {
 		if (!pci_vfs_assigned(pdev)) {
 			ice_free_vfs(pf);
+			if (pf->lag)
+				ice_enable_lag(pf->lag);
 			return 0;
 		}
 
@@ -1688,6 +1726,8 @@ int ice_sriov_configure(struct pci_dev *pdev, int num_vfs)
 	if (err)
 		return err;
 
+	if (pf->lag)
+		ice_disable_lag(pf->lag);
 	return num_vfs;
 }
 
@@ -2312,12 +2352,12 @@ bool ice_is_any_vf_in_promisc(struct ice_pf *pf)
 static int ice_vc_cfg_promiscuous_mode_msg(struct ice_vf *vf, u8 *msg)
 {
 	enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS;
+	bool rm_promisc, alluni = false, allmulti = false;
 	struct virtchnl_promisc_info *info =
 	    (struct virtchnl_promisc_info *)msg;
 	struct ice_pf *pf = vf->pf;
 	struct ice_vsi *vsi;
 	struct device *dev;
-	bool rm_promisc;
 	int ret = 0;
 
 	if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) {
@@ -2344,8 +2384,13 @@ static int ice_vc_cfg_promiscuous_mode_msg(struct ice_vf *vf, u8 *msg)
 		goto error_param;
 	}
 
-	rm_promisc = !(info->flags & FLAG_VF_UNICAST_PROMISC) &&
-		!(info->flags & FLAG_VF_MULTICAST_PROMISC);
+	if (info->flags & FLAG_VF_UNICAST_PROMISC)
+		alluni = true;
+
+	if (info->flags & FLAG_VF_MULTICAST_PROMISC)
+		allmulti = true;
+
+	rm_promisc = !allmulti && !alluni;
 
 	if (vsi->num_vlan || vf->port_vlan_info) {
 		struct ice_vsi *pf_vsi = ice_get_main_vsi(pf);
@@ -2399,12 +2444,12 @@ static int ice_vc_cfg_promiscuous_mode_msg(struct ice_vf *vf, u8 *msg)
 		enum ice_status status;
 		u8 promisc_m;
 
-		if (info->flags & FLAG_VF_UNICAST_PROMISC) {
+		if (alluni) {
 			if (vf->port_vlan_info || vsi->num_vlan)
 				promisc_m = ICE_UCAST_VLAN_PROMISC_BITS;
 			else
 				promisc_m = ICE_UCAST_PROMISC_BITS;
-		} else if (info->flags & FLAG_VF_MULTICAST_PROMISC) {
+		} else if (allmulti) {
 			if (vf->port_vlan_info || vsi->num_vlan)
 				promisc_m = ICE_MCAST_VLAN_PROMISC_BITS;
 			else
@@ -2432,15 +2477,16 @@ static int ice_vc_cfg_promiscuous_mode_msg(struct ice_vf *vf, u8 *msg)
 		}
 	}
 
-	if (info->flags & FLAG_VF_MULTICAST_PROMISC)
-		set_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states);
-	else
-		clear_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states);
+	if (allmulti &&
+	    !test_and_set_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states))
+		dev_info(dev, "VF %u successfully set multicast promiscuous mode\n", vf->vf_id);
+	else if (!allmulti && test_and_clear_bit(ICE_VF_STATE_MC_PROMISC, vf->vf_states))
+		dev_info(dev, "VF %u successfully unset multicast promiscuous mode\n", vf->vf_id);
 
-	if (info->flags & FLAG_VF_UNICAST_PROMISC)
-		set_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states);
-	else
-		clear_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states);
+	if (alluni && !test_and_set_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states))
+		dev_info(dev, "VF %u successfully set unicast promiscuous mode\n", vf->vf_id);
+	else if (!alluni && test_and_clear_bit(ICE_VF_STATE_UC_PROMISC, vf->vf_states))
+		dev_info(dev, "VF %u successfully unset unicast promiscuous mode\n", vf->vf_id);
 
 error_param:
 	return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE,
diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
index 1782146db644..875fa0cbef56 100644
--- a/drivers/net/ethernet/intel/ice/ice_xsk.c
+++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
@@ -260,45 +260,6 @@ free_buf:
 }
 
 /**
- * ice_xsk_alloc_pools - allocate a buffer pool for an XDP socket
- * @vsi: VSI to allocate the buffer pool on
- *
- * Returns 0 on success, negative on error
- */
-static int ice_xsk_alloc_pools(struct ice_vsi *vsi)
-{
-	if (vsi->xsk_pools)
-		return 0;
-
-	vsi->xsk_pools = kcalloc(vsi->num_xsk_pools, sizeof(*vsi->xsk_pools),
-				 GFP_KERNEL);
-
-	if (!vsi->xsk_pools) {
-		vsi->num_xsk_pools = 0;
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
-/**
- * ice_xsk_remove_pool - Remove an buffer pool for a certain ring/qid
- * @vsi: VSI from which the VSI will be removed
- * @qid: Ring/qid associated with the buffer pool
- */
-static void ice_xsk_remove_pool(struct ice_vsi *vsi, u16 qid)
-{
-	vsi->xsk_pools[qid] = NULL;
-	vsi->num_xsk_pools_used--;
-
-	if (vsi->num_xsk_pools_used == 0) {
-		kfree(vsi->xsk_pools);
-		vsi->xsk_pools = NULL;
-		vsi->num_xsk_pools = 0;
-	}
-}
-
-/**
  * ice_xsk_pool_disable - disable a buffer pool region
  * @vsi: Current VSI
  * @qid: queue ID
@@ -307,12 +268,12 @@ static void ice_xsk_remove_pool(struct ice_vsi *vsi, u16 qid)
  */
 static int ice_xsk_pool_disable(struct ice_vsi *vsi, u16 qid)
 {
-	if (!vsi->xsk_pools || qid >= vsi->num_xsk_pools ||
-	    !vsi->xsk_pools[qid])
+	struct xsk_buff_pool *pool = xsk_get_pool_from_qid(vsi->netdev, qid);
+
+	if (!pool)
 		return -EINVAL;
 
-	xsk_pool_dma_unmap(vsi->xsk_pools[qid], ICE_RX_DMA_ATTR);
-	ice_xsk_remove_pool(vsi, qid);
+	xsk_pool_dma_unmap(pool, ICE_RX_DMA_ATTR);
 
 	return 0;
 }
@@ -333,22 +294,11 @@ ice_xsk_pool_enable(struct ice_vsi *vsi, struct xsk_buff_pool *pool, u16 qid)
 	if (vsi->type != ICE_VSI_PF)
 		return -EINVAL;
 
-	if (!vsi->num_xsk_pools)
-		vsi->num_xsk_pools = min_t(u16, vsi->num_rxq, vsi->num_txq);
-	if (qid >= vsi->num_xsk_pools)
+	if (qid >= vsi->netdev->real_num_rx_queues ||
+	    qid >= vsi->netdev->real_num_tx_queues)
 		return -EINVAL;
 
-	err = ice_xsk_alloc_pools(vsi);
-	if (err)
-		return err;
-
-	if (vsi->xsk_pools && vsi->xsk_pools[qid])
-		return -EBUSY;
-
-	vsi->xsk_pools[qid] = pool;
-	vsi->num_xsk_pools_used++;
-
-	err = xsk_pool_dma_map(vsi->xsk_pools[qid], ice_pf_to_dev(vsi->back),
+	err = xsk_pool_dma_map(pool, ice_pf_to_dev(vsi->back),
 			       ICE_RX_DMA_ATTR);
 	if (err)
 		return err;
@@ -842,11 +792,8 @@ bool ice_xsk_any_rx_ring_ena(struct ice_vsi *vsi)
 {
 	int i;
 
-	if (!vsi->xsk_pools)
-		return false;
-
-	for (i = 0; i < vsi->num_xsk_pools; i++) {
-		if (vsi->xsk_pools[i])
+	ice_for_each_rxq(vsi, i) {
+		if (xsk_get_pool_from_qid(vsi->netdev, i))
 			return true;
 	}
 
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index cf111e73f81e..9ce90841f92d 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -4943,6 +4943,25 @@ mlxsw_sp_rt6_nexthop(struct mlxsw_sp_nexthop_group *nh_grp,
 }
 
 static void
+mlxsw_sp_fib4_offload_failed_flag_set(struct mlxsw_sp *mlxsw_sp,
+				      struct fib_entry_notifier_info *fen_info)
+{
+	u32 *p_dst = (u32 *) &fen_info->dst;
+	struct fib_rt_info fri;
+
+	fri.fi = fen_info->fi;
+	fri.tb_id = fen_info->tb_id;
+	fri.dst = cpu_to_be32(*p_dst);
+	fri.dst_len = fen_info->dst_len;
+	fri.tos = fen_info->tos;
+	fri.type = fen_info->type;
+	fri.offload = false;
+	fri.trap = false;
+	fri.offload_failed = true;
+	fib_alias_hw_flags_set(mlxsw_sp_net(mlxsw_sp), &fri);
+}
+
+static void
 mlxsw_sp_fib4_entry_hw_flags_set(struct mlxsw_sp *mlxsw_sp,
 				 struct mlxsw_sp_fib_entry *fib_entry)
 {
@@ -4963,6 +4982,7 @@ mlxsw_sp_fib4_entry_hw_flags_set(struct mlxsw_sp *mlxsw_sp,
 	fri.type = fib4_entry->type;
 	fri.offload = should_offload;
 	fri.trap = !should_offload;
+	fri.offload_failed = false;
 	fib_alias_hw_flags_set(mlxsw_sp_net(mlxsw_sp), &fri);
 }
 
@@ -4985,11 +5005,36 @@ mlxsw_sp_fib4_entry_hw_flags_clear(struct mlxsw_sp *mlxsw_sp,
 	fri.type = fib4_entry->type;
 	fri.offload = false;
 	fri.trap = false;
+	fri.offload_failed = false;
 	fib_alias_hw_flags_set(mlxsw_sp_net(mlxsw_sp), &fri);
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
 static void
+mlxsw_sp_fib6_offload_failed_flag_set(struct mlxsw_sp *mlxsw_sp,
+				      struct fib6_info **rt_arr,
+				      unsigned int nrt6)
+{
+	int i;
+
+	/* In IPv6 a multipath route is represented using multiple routes, so
+	 * we need to set the flags on all of them.
+	 */
+	for (i = 0; i < nrt6; i++)
+		fib6_info_hw_flags_set(mlxsw_sp_net(mlxsw_sp), rt_arr[i],
+				       false, false, true);
+}
+#else
+static void
+mlxsw_sp_fib6_offload_failed_flag_set(struct mlxsw_sp *mlxsw_sp,
+				      struct fib6_info **rt_arr,
+				      unsigned int nrt6)
+{
+}
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void
 mlxsw_sp_fib6_entry_hw_flags_set(struct mlxsw_sp *mlxsw_sp,
 				 struct mlxsw_sp_fib_entry *fib_entry)
 {
@@ -5006,7 +5051,7 @@ mlxsw_sp_fib6_entry_hw_flags_set(struct mlxsw_sp *mlxsw_sp,
 				  common);
 	list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list)
 		fib6_info_hw_flags_set(mlxsw_sp_net(mlxsw_sp), mlxsw_sp_rt6->rt,
-				       should_offload, !should_offload);
+				       should_offload, !should_offload, false);
 }
 #else
 static void
@@ -5028,7 +5073,7 @@ mlxsw_sp_fib6_entry_hw_flags_clear(struct mlxsw_sp *mlxsw_sp,
 				  common);
 	list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list)
 		fib6_info_hw_flags_set(mlxsw_sp_net(mlxsw_sp), mlxsw_sp_rt6->rt,
-				       false, false);
+				       false, false, false);
 }
 #else
 static void
@@ -7021,6 +7066,8 @@ static void mlxsw_sp_router_fib4_event_process(struct mlxsw_sp *mlxsw_sp,
 		if (err) {
 			mlxsw_sp_fib_entry_op_ctx_priv_put_all(op_ctx);
 			mlxsw_sp_router_fib_abort(mlxsw_sp);
+			mlxsw_sp_fib4_offload_failed_flag_set(mlxsw_sp,
+							      &fib_event->fen_info);
 		}
 		fib_info_put(fib_event->fen_info.fi);
 		break;
@@ -7042,6 +7089,7 @@ static void mlxsw_sp_router_fib6_event_process(struct mlxsw_sp *mlxsw_sp,
 					       struct mlxsw_sp_fib_entry_op_ctx *op_ctx,
 					       struct mlxsw_sp_fib_event *fib_event)
 {
+	struct mlxsw_sp_fib6_event *fib6_event = &fib_event->fib6_event;
 	int err;
 
 	mlxsw_sp_span_respin(mlxsw_sp);
@@ -7053,6 +7101,9 @@ static void mlxsw_sp_router_fib6_event_process(struct mlxsw_sp *mlxsw_sp,
 		if (err) {
 			mlxsw_sp_fib_entry_op_ctx_priv_put_all(op_ctx);
 			mlxsw_sp_router_fib_abort(mlxsw_sp);
+			mlxsw_sp_fib6_offload_failed_flag_set(mlxsw_sp,
+							      fib6_event->rt_arr,
+							      fib6_event->nrt6);
 		}
 		mlxsw_sp_router_fib6_event_fini(&fib_event->fib6_event);
 		break;
@@ -7062,6 +7113,9 @@ static void mlxsw_sp_router_fib6_event_process(struct mlxsw_sp *mlxsw_sp,
 		if (err) {
 			mlxsw_sp_fib_entry_op_ctx_priv_put_all(op_ctx);
 			mlxsw_sp_router_fib_abort(mlxsw_sp);
+			mlxsw_sp_fib6_offload_failed_flag_set(mlxsw_sp,
+							      fib6_event->rt_arr,
+							      fib6_event->nrt6);
 		}
 		mlxsw_sp_router_fib6_event_fini(&fib_event->fib6_event);
 		break;
diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c
index 816af1f55e2c..dbeb29fa16e8 100644
--- a/drivers/net/netdevsim/dev.c
+++ b/drivers/net/netdevsim/dev.c
@@ -1012,23 +1012,25 @@ static int nsim_dev_reload_create(struct nsim_dev *nsim_dev,
 	nsim_dev->fw_update_status = true;
 	nsim_dev->fw_update_overwrite_mask = 0;
 
-	nsim_dev->fib_data = nsim_fib_create(devlink, extack);
-	if (IS_ERR(nsim_dev->fib_data))
-		return PTR_ERR(nsim_dev->fib_data);
-
 	nsim_devlink_param_load_driverinit_values(devlink);
 
 	err = nsim_dev_dummy_region_init(nsim_dev, devlink);
 	if (err)
-		goto err_fib_destroy;
+		return err;
 
 	err = nsim_dev_traps_init(devlink);
 	if (err)
 		goto err_dummy_region_exit;
 
+	nsim_dev->fib_data = nsim_fib_create(devlink, extack);
+	if (IS_ERR(nsim_dev->fib_data)) {
+		err = PTR_ERR(nsim_dev->fib_data);
+		goto err_traps_exit;
+	}
+
 	err = nsim_dev_health_init(nsim_dev, devlink);
 	if (err)
-		goto err_traps_exit;
+		goto err_fib_destroy;
 
 	err = nsim_dev_port_add_all(nsim_dev, nsim_bus_dev->port_count);
 	if (err)
@@ -1043,12 +1045,12 @@ static int nsim_dev_reload_create(struct nsim_dev *nsim_dev,
 
 err_health_exit:
 	nsim_dev_health_exit(nsim_dev);
+err_fib_destroy:
+	nsim_fib_destroy(devlink, nsim_dev->fib_data);
 err_traps_exit:
 	nsim_dev_traps_exit(devlink);
 err_dummy_region_exit:
 	nsim_dev_dummy_region_exit(nsim_dev);
-err_fib_destroy:
-	nsim_fib_destroy(devlink, nsim_dev->fib_data);
 	return err;
 }
 
@@ -1080,15 +1082,9 @@ int nsim_dev_probe(struct nsim_bus_dev *nsim_bus_dev)
 	if (err)
 		goto err_devlink_free;
 
-	nsim_dev->fib_data = nsim_fib_create(devlink, NULL);
-	if (IS_ERR(nsim_dev->fib_data)) {
-		err = PTR_ERR(nsim_dev->fib_data);
-		goto err_resources_unregister;
-	}
-
 	err = devlink_register(devlink, &nsim_bus_dev->dev);
 	if (err)
-		goto err_fib_destroy;
+		goto err_resources_unregister;
 
 	err = devlink_params_register(devlink, nsim_devlink_params,
 				      ARRAY_SIZE(nsim_devlink_params));
@@ -1108,9 +1104,15 @@ int nsim_dev_probe(struct nsim_bus_dev *nsim_bus_dev)
 	if (err)
 		goto err_traps_exit;
 
+	nsim_dev->fib_data = nsim_fib_create(devlink, NULL);
+	if (IS_ERR(nsim_dev->fib_data)) {
+		err = PTR_ERR(nsim_dev->fib_data);
+		goto err_debugfs_exit;
+	}
+
 	err = nsim_dev_health_init(nsim_dev, devlink);
 	if (err)
-		goto err_debugfs_exit;
+		goto err_fib_destroy;
 
 	err = nsim_bpf_dev_init(nsim_dev);
 	if (err)
@@ -1128,6 +1130,8 @@ err_bpf_dev_exit:
 	nsim_bpf_dev_exit(nsim_dev);
 err_health_exit:
 	nsim_dev_health_exit(nsim_dev);
+err_fib_destroy:
+	nsim_fib_destroy(devlink, nsim_dev->fib_data);
 err_debugfs_exit:
 	nsim_dev_debugfs_exit(nsim_dev);
 err_traps_exit:
@@ -1139,8 +1143,6 @@ err_params_unregister:
 				  ARRAY_SIZE(nsim_devlink_params));
 err_dl_unregister:
 	devlink_unregister(devlink);
-err_fib_destroy:
-	nsim_fib_destroy(devlink, nsim_dev->fib_data);
 err_resources_unregister:
 	devlink_resources_unregister(devlink, NULL);
 err_devlink_free:
@@ -1157,10 +1159,10 @@ static void nsim_dev_reload_destroy(struct nsim_dev *nsim_dev)
 	debugfs_remove(nsim_dev->take_snapshot);
 	nsim_dev_port_del_all(nsim_dev);
 	nsim_dev_health_exit(nsim_dev);
+	nsim_fib_destroy(devlink, nsim_dev->fib_data);
 	nsim_dev_traps_exit(devlink);
 	nsim_dev_dummy_region_exit(nsim_dev);
 	mutex_destroy(&nsim_dev->port_list_lock);
-	nsim_fib_destroy(devlink, nsim_dev->fib_data);
 }
 
 void nsim_dev_remove(struct nsim_bus_dev *nsim_bus_dev)
diff --git a/drivers/net/netdevsim/fib.c b/drivers/net/netdevsim/fib.c
index 1779146926a5..46fb414f7ca6 100644
--- a/drivers/net/netdevsim/fib.c
+++ b/drivers/net/netdevsim/fib.c
@@ -26,6 +26,7 @@
 #include <net/fib_rules.h>
 #include <net/net_namespace.h>
 #include <net/nexthop.h>
+#include <linux/debugfs.h>
 
 #include "netdevsim.h"
 
@@ -53,6 +54,8 @@ struct nsim_fib_data {
 	struct work_struct fib_event_work;
 	struct list_head fib_event_queue;
 	spinlock_t fib_event_queue_lock; /* Protects fib event queue list */
+	struct dentry *ddir;
+	bool fail_route_offload;
 };
 
 struct nsim_fib_rt_key {
@@ -303,6 +306,25 @@ nsim_fib4_rt_lookup(struct rhashtable *fib_rt_ht,
 	return container_of(fib_rt, struct nsim_fib4_rt, common);
 }
 
+static void
+nsim_fib4_rt_offload_failed_flag_set(struct net *net,
+				     struct fib_entry_notifier_info *fen_info)
+{
+	u32 *p_dst = (u32 *)&fen_info->dst;
+	struct fib_rt_info fri;
+
+	fri.fi = fen_info->fi;
+	fri.tb_id = fen_info->tb_id;
+	fri.dst = cpu_to_be32(*p_dst);
+	fri.dst_len = fen_info->dst_len;
+	fri.tos = fen_info->tos;
+	fri.type = fen_info->type;
+	fri.offload = false;
+	fri.trap = false;
+	fri.offload_failed = true;
+	fib_alias_hw_flags_set(net, &fri);
+}
+
 static void nsim_fib4_rt_hw_flags_set(struct net *net,
 				      const struct nsim_fib4_rt *fib4_rt,
 				      bool trap)
@@ -319,6 +341,7 @@ static void nsim_fib4_rt_hw_flags_set(struct net *net,
 	fri.type = fib4_rt->type;
 	fri.offload = false;
 	fri.trap = trap;
+	fri.offload_failed = false;
 	fib_alias_hw_flags_set(net, &fri);
 }
 
@@ -383,6 +406,15 @@ static int nsim_fib4_rt_insert(struct nsim_fib_data *data,
 	struct nsim_fib4_rt *fib4_rt, *fib4_rt_old;
 	int err;
 
+	if (data->fail_route_offload) {
+		/* For testing purposes, user set debugfs fail_route_offload
+		 * value to true. Simulate hardware programming latency and then
+		 * fail.
+		 */
+		msleep(1);
+		return -EINVAL;
+	}
+
 	fib4_rt = nsim_fib4_rt_create(data, fen_info);
 	if (!fib4_rt)
 		return -ENOMEM;
@@ -405,7 +437,7 @@ static void nsim_fib4_rt_remove(struct nsim_fib_data *data,
 	struct nsim_fib4_rt *fib4_rt;
 
 	fib4_rt = nsim_fib4_rt_lookup(&data->fib_rt_ht, fen_info);
-	if (WARN_ON_ONCE(!fib4_rt))
+	if (!fib4_rt)
 		return;
 
 	rhashtable_remove_fast(&data->fib_rt_ht, &fib4_rt->common.ht_node,
@@ -422,6 +454,11 @@ static int nsim_fib4_event(struct nsim_fib_data *data,
 	switch (event) {
 	case FIB_EVENT_ENTRY_REPLACE:
 		err = nsim_fib4_rt_insert(data, fen_info);
+		if (err) {
+			struct net *net = devlink_net(data->devlink);
+
+			nsim_fib4_rt_offload_failed_flag_set(net, fen_info);
+		}
 		break;
 	case FIB_EVENT_ENTRY_DEL:
 		nsim_fib4_rt_remove(data, fen_info);
@@ -481,7 +518,7 @@ static void nsim_fib6_rt_nh_del(struct nsim_fib6_rt *fib6_rt,
 	struct nsim_fib6_rt_nh *fib6_rt_nh;
 
 	fib6_rt_nh = nsim_fib6_rt_nh_find(fib6_rt, rt);
-	if (WARN_ON_ONCE(!fib6_rt_nh))
+	if (!fib6_rt_nh)
 		return;
 
 	fib6_rt->nhs--;
@@ -563,8 +600,17 @@ static int nsim_fib6_rt_append(struct nsim_fib_data *data,
 	struct nsim_fib6_rt *fib6_rt;
 	int i, err;
 
+	if (data->fail_route_offload) {
+		/* For testing purposes, user set debugfs fail_route_offload
+		 * value to true. Simulate hardware programming latency and then
+		 * fail.
+		 */
+		msleep(1);
+		return -EINVAL;
+	}
+
 	fib6_rt = nsim_fib6_rt_lookup(&data->fib_rt_ht, rt);
-	if (WARN_ON_ONCE(!fib6_rt))
+	if (!fib6_rt)
 		return -EINVAL;
 
 	for (i = 0; i < fib6_event->nrt6; i++) {
@@ -586,6 +632,26 @@ err_fib6_rt_nh_del:
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
+static void nsim_fib6_rt_offload_failed_flag_set(struct nsim_fib_data *data,
+						 struct fib6_info **rt_arr,
+						 unsigned int nrt6)
+
+{
+	struct net *net = devlink_net(data->devlink);
+	int i;
+
+	for (i = 0; i < nrt6; i++)
+		fib6_info_hw_flags_set(net, rt_arr[i], false, false, true);
+}
+#else
+static void nsim_fib6_rt_offload_failed_flag_set(struct nsim_fib_data *data,
+						 struct fib6_info **rt_arr,
+						 unsigned int nrt6)
+{
+}
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6)
 static void nsim_fib6_rt_hw_flags_set(struct nsim_fib_data *data,
 				      const struct nsim_fib6_rt *fib6_rt,
 				      bool trap)
@@ -594,7 +660,7 @@ static void nsim_fib6_rt_hw_flags_set(struct nsim_fib_data *data,
 	struct nsim_fib6_rt_nh *fib6_rt_nh;
 
 	list_for_each_entry(fib6_rt_nh, &fib6_rt->nh_list, list)
-		fib6_info_hw_flags_set(net, fib6_rt_nh->rt, false, trap);
+		fib6_info_hw_flags_set(net, fib6_rt_nh->rt, false, trap, false);
 }
 #else
 static void nsim_fib6_rt_hw_flags_set(struct nsim_fib_data *data,
@@ -666,6 +732,15 @@ static int nsim_fib6_rt_insert(struct nsim_fib_data *data,
 	struct nsim_fib6_rt *fib6_rt, *fib6_rt_old;
 	int err;
 
+	if (data->fail_route_offload) {
+		/* For testing purposes, user set debugfs fail_route_offload
+		 * value to true. Simulate hardware programming latency and then
+		 * fail.
+		 */
+		msleep(1);
+		return -EINVAL;
+	}
+
 	fib6_rt = nsim_fib6_rt_create(data, fib6_event->rt_arr,
 				      fib6_event->nrt6);
 	if (IS_ERR(fib6_rt))
@@ -763,7 +838,7 @@ static int nsim_fib6_event(struct nsim_fib_data *data,
 			   struct nsim_fib6_event *fib6_event,
 			   unsigned long event)
 {
-	int err = 0;
+	int err;
 
 	if (fib6_event->rt_arr[0]->fib6_src.plen)
 		return 0;
@@ -771,9 +846,13 @@ static int nsim_fib6_event(struct nsim_fib_data *data,
 	switch (event) {
 	case FIB_EVENT_ENTRY_REPLACE:
 		err = nsim_fib6_rt_insert(data, fib6_event);
+		if (err)
+			goto err_rt_offload_failed_flag_set;
 		break;
 	case FIB_EVENT_ENTRY_APPEND:
 		err = nsim_fib6_rt_append(data, fib6_event);
+		if (err)
+			goto err_rt_offload_failed_flag_set;
 		break;
 	case FIB_EVENT_ENTRY_DEL:
 		nsim_fib6_rt_remove(data, fib6_event);
@@ -782,6 +861,11 @@ static int nsim_fib6_event(struct nsim_fib_data *data,
 		break;
 	}
 
+	return 0;
+
+err_rt_offload_failed_flag_set:
+	nsim_fib6_rt_offload_failed_flag_set(data, fib6_event->rt_arr,
+					     fib6_event->nrt6);
 	return err;
 }
 
@@ -1289,10 +1373,29 @@ static void nsim_fib_event_work(struct work_struct *work)
 	mutex_unlock(&data->fib_lock);
 }
 
+static int
+nsim_fib_debugfs_init(struct nsim_fib_data *data, struct nsim_dev *nsim_dev)
+{
+	data->ddir = debugfs_create_dir("fib", nsim_dev->ddir);
+	if (IS_ERR(data->ddir))
+		return PTR_ERR(data->ddir);
+
+	data->fail_route_offload = false;
+	debugfs_create_bool("fail_route_offload", 0600, data->ddir,
+			    &data->fail_route_offload);
+	return 0;
+}
+
+static void nsim_fib_debugfs_exit(struct nsim_fib_data *data)
+{
+	debugfs_remove_recursive(data->ddir);
+}
+
 struct nsim_fib_data *nsim_fib_create(struct devlink *devlink,
 				      struct netlink_ext_ack *extack)
 {
 	struct nsim_fib_data *data;
+	struct nsim_dev *nsim_dev;
 	int err;
 
 	data = kzalloc(sizeof(*data), GFP_KERNEL);
@@ -1300,10 +1403,15 @@ struct nsim_fib_data *nsim_fib_create(struct devlink *devlink,
 		return ERR_PTR(-ENOMEM);
 	data->devlink = devlink;
 
-	err = rhashtable_init(&data->nexthop_ht, &nsim_nexthop_ht_params);
+	nsim_dev = devlink_priv(devlink);
+	err = nsim_fib_debugfs_init(data, nsim_dev);
 	if (err)
 		goto err_data_free;
 
+	err = rhashtable_init(&data->nexthop_ht, &nsim_nexthop_ht_params);
+	if (err)
+		goto err_debugfs_exit;
+
 	mutex_init(&data->fib_lock);
 	INIT_LIST_HEAD(&data->fib_rt_list);
 	err = rhashtable_init(&data->fib_rt_ht, &nsim_fib_rt_ht_params);
@@ -1364,6 +1472,8 @@ err_rhashtable_nexthop_destroy:
 	rhashtable_free_and_destroy(&data->nexthop_ht, nsim_nexthop_free,
 				    data);
 	mutex_destroy(&data->fib_lock);
+err_debugfs_exit:
+	nsim_fib_debugfs_exit(data);
 err_data_free:
 	kfree(data);
 	return ERR_PTR(err);
@@ -1391,5 +1501,6 @@ void nsim_fib_destroy(struct devlink *devlink, struct nsim_fib_data *data)
 	WARN_ON_ONCE(!list_empty(&data->fib_event_queue));
 	WARN_ON_ONCE(!list_empty(&data->fib_rt_list));
 	mutex_destroy(&data->fib_lock);
+	nsim_fib_debugfs_exit(data);
 	kfree(data);
 }
diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index 8a4ec3222168..3142ba768313 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -361,96 +361,6 @@ static int bcm54811_config_init(struct phy_device *phydev)
 	return err;
 }
 
-static int bcm5482_config_init(struct phy_device *phydev)
-{
-	int err, reg;
-
-	err = bcm54xx_config_init(phydev);
-
-	if (phydev->dev_flags & PHY_BCM_FLAGS_MODE_1000BX) {
-		/*
-		 * Enable secondary SerDes and its use as an LED source
-		 */
-		reg = bcm_phy_read_shadow(phydev, BCM5482_SHD_SSD);
-		bcm_phy_write_shadow(phydev, BCM5482_SHD_SSD,
-				     reg |
-				     BCM5482_SHD_SSD_LEDM |
-				     BCM5482_SHD_SSD_EN);
-
-		/*
-		 * Enable SGMII slave mode and auto-detection
-		 */
-		reg = BCM5482_SSD_SGMII_SLAVE | MII_BCM54XX_EXP_SEL_SSD;
-		err = bcm_phy_read_exp(phydev, reg);
-		if (err < 0)
-			return err;
-		err = bcm_phy_write_exp(phydev, reg, err |
-					BCM5482_SSD_SGMII_SLAVE_EN |
-					BCM5482_SSD_SGMII_SLAVE_AD);
-		if (err < 0)
-			return err;
-
-		/*
-		 * Disable secondary SerDes powerdown
-		 */
-		reg = BCM5482_SSD_1000BX_CTL | MII_BCM54XX_EXP_SEL_SSD;
-		err = bcm_phy_read_exp(phydev, reg);
-		if (err < 0)
-			return err;
-		err = bcm_phy_write_exp(phydev, reg,
-					err & ~BCM5482_SSD_1000BX_CTL_PWRDOWN);
-		if (err < 0)
-			return err;
-
-		/*
-		 * Select 1000BASE-X register set (primary SerDes)
-		 */
-		reg = bcm_phy_read_shadow(phydev, BCM54XX_SHD_MODE);
-		bcm_phy_write_shadow(phydev, BCM54XX_SHD_MODE,
-				     reg | BCM54XX_SHD_MODE_1000BX);
-
-		/*
-		 * LED1=ACTIVITYLED, LED3=LINKSPD[2]
-		 * (Use LED1 as secondary SerDes ACTIVITY LED)
-		 */
-		bcm_phy_write_shadow(phydev, BCM5482_SHD_LEDS1,
-			BCM5482_SHD_LEDS1_LED1(BCM_LED_SRC_ACTIVITYLED) |
-			BCM5482_SHD_LEDS1_LED3(BCM_LED_SRC_LINKSPD2));
-
-		/*
-		 * Auto-negotiation doesn't seem to work quite right
-		 * in this mode, so we disable it and force it to the
-		 * right speed/duplex setting.  Only 'link status'
-		 * is important.
-		 */
-		phydev->autoneg = AUTONEG_DISABLE;
-		phydev->speed = SPEED_1000;
-		phydev->duplex = DUPLEX_FULL;
-	}
-
-	return err;
-}
-
-static int bcm5482_read_status(struct phy_device *phydev)
-{
-	int err;
-
-	err = genphy_read_status(phydev);
-
-	if (phydev->dev_flags & PHY_BCM_FLAGS_MODE_1000BX) {
-		/*
-		 * Only link status matters for 1000Base-X mode, so force
-		 * 1000 Mbit/s full-duplex status
-		 */
-		if (phydev->link) {
-			phydev->speed = SPEED_1000;
-			phydev->duplex = DUPLEX_FULL;
-		}
-	}
-
-	return err;
-}
-
 static int bcm5481_config_aneg(struct phy_device *phydev)
 {
 	struct device_node *np = phydev->mdio.dev.of_node;
@@ -800,8 +710,7 @@ static struct phy_driver broadcom_drivers[] = {
 	.phy_id_mask	= 0xfffffff0,
 	.name		= "Broadcom BCM5482",
 	/* PHY_GBIT_FEATURES */
-	.config_init	= bcm5482_config_init,
-	.read_status	= bcm5482_read_status,
+	.config_init	= bcm54xx_config_init,
 	.config_intr	= bcm_phy_config_intr,
 	.handle_interrupt = bcm_phy_handle_interrupt,
 }, {
diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index 620052c023a5..b523aa37ebf0 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -2852,7 +2852,6 @@ static struct phy_driver marvell_drivers[] = {
 		.probe = marvell_probe,
 		.config_init = m88e1145_config_init,
 		.config_aneg = m88e1101_config_aneg,
-		.read_status = genphy_read_status,
 		.config_intr = marvell_config_intr,
 		.handle_interrupt = marvell_handle_interrupt,
 		.resume = genphy_resume,
diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index 39c7c786a912..494abf608b8f 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -1368,7 +1368,6 @@ static struct phy_driver ksphy_driver[] = {
 	.driver_data	= &ksz9021_type,
 	.probe		= kszphy_probe,
 	.config_init	= ksz9131_config_init,
-	.read_status	= genphy_read_status,
 	.config_intr	= kszphy_config_intr,
 	.handle_interrupt = kszphy_handle_interrupt,
 	.get_sset_count = kszphy_get_sset_count,
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e9e7ada07ea1..1340327f7abf 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -347,6 +347,7 @@ struct napi_struct {
 	struct list_head	dev_list;
 	struct hlist_node	napi_hash_node;
 	unsigned int		napi_id;
+	struct task_struct	*thread;
 };
 
 enum {
@@ -358,6 +359,7 @@ enum {
 	NAPI_STATE_NO_BUSY_POLL,	/* Do not add in napi_hash, no busy polling */
 	NAPI_STATE_IN_BUSY_POLL,	/* sk_busy_loop() owns this NAPI */
 	NAPI_STATE_PREFER_BUSY_POLL,	/* prefer busy-polling over softirq processing*/
+	NAPI_STATE_THREADED,		/* The poll is performed inside its own thread*/
 };
 
 enum {
@@ -369,6 +371,7 @@ enum {
 	NAPIF_STATE_NO_BUSY_POLL	= BIT(NAPI_STATE_NO_BUSY_POLL),
 	NAPIF_STATE_IN_BUSY_POLL	= BIT(NAPI_STATE_IN_BUSY_POLL),
 	NAPIF_STATE_PREFER_BUSY_POLL	= BIT(NAPI_STATE_PREFER_BUSY_POLL),
+	NAPIF_STATE_THREADED		= BIT(NAPI_STATE_THREADED),
 };
 
 enum gro_result {
@@ -494,6 +497,8 @@ static inline bool napi_complete(struct napi_struct *n)
 	return napi_complete_done(n, 0);
 }
 
+int dev_set_threaded(struct net_device *dev, bool threaded);
+
 /**
  *	napi_disable - prevent NAPI from scheduling
  *	@n: NAPI context
@@ -503,20 +508,7 @@ static inline bool napi_complete(struct napi_struct *n)
  */
 void napi_disable(struct napi_struct *n);
 
-/**
- *	napi_enable - enable NAPI scheduling
- *	@n: NAPI context
- *
- * Resume NAPI from being scheduled on this context.
- * Must be paired with napi_disable.
- */
-static inline void napi_enable(struct napi_struct *n)
-{
-	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
-	smp_mb__before_atomic();
-	clear_bit(NAPI_STATE_SCHED, &n->state);
-	clear_bit(NAPI_STATE_NPSVC, &n->state);
-}
+void napi_enable(struct napi_struct *n);
 
 /**
  *	napi_synchronize - wait until NAPI is not running
@@ -1827,6 +1819,8 @@ enum netdev_priv_flags {
  *
  *	@wol_enabled:	Wake-on-LAN is enabled
  *
+ *	@threaded:	napi threaded mode is enabled
+ *
  *	@net_notifier_list:	List of per-net netdev notifier block
  *				that follow this device when it is moved
  *				to another network namespace.
@@ -2145,6 +2139,7 @@ struct net_device {
 	struct lock_class_key	*qdisc_running_key;
 	bool			proto_down;
 	unsigned		wol_enabled:1;
+	unsigned		threaded:1;
 
 	struct list_head	net_notifier_list;
 
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 1e262b23c68b..15b7fbe6b15c 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -195,7 +195,8 @@ struct fib6_info {
 					fib6_destroying:1,
 					offload:1,
 					trap:1,
-					unused:2;
+					offload_failed:1,
+					unused:1;
 
 	struct rcu_head			rcu;
 	struct nexthop			*nh;
@@ -539,7 +540,7 @@ static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric)
 	return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric));
 }
 void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i,
-			    bool offload, bool trap);
+			    bool offload, bool trap, bool offload_failed);
 
 #if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL)
 struct bpf_iter__ipv6_route {
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 2ec062aaa978..a914f33f3ed5 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -213,7 +213,8 @@ struct fib_rt_info {
 	u8			type;
 	u8			offload:1,
 				trap:1,
-				unused:6;
+				offload_failed:1,
+				unused:5;
 };
 
 struct fib_entry_notifier_info {
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index b841caa4657e..91e4ca064d61 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -319,6 +319,11 @@ enum rt_scope_t {
 #define RTM_F_FIB_MATCH	        0x2000	/* return full fib lookup match */
 #define RTM_F_OFFLOAD		0x4000	/* route is offloaded */
 #define RTM_F_TRAP		0x8000	/* route is trapping packets */
+#define RTM_F_OFFLOAD_FAILED	0x20000000 /* route offload failed, this value
+					    * is chosen to avoid conflicts with
+					    * other flags defined in
+					    * include/uapi/linux/ipv6_route.h
+					    */
 
 /* Reserved table identifiers */
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 21d74d30f5d7..7647278e46f0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -91,6 +91,7 @@
 #include <linux/etherdevice.h>
 #include <linux/ethtool.h>
 #include <linux/skbuff.h>
+#include <linux/kthread.h>
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
 #include <net/net_namespace.h>
@@ -1494,6 +1495,27 @@ void netdev_notify_peers(struct net_device *dev)
 }
 EXPORT_SYMBOL(netdev_notify_peers);
 
+static int napi_threaded_poll(void *data);
+
+static int napi_kthread_create(struct napi_struct *n)
+{
+	int err = 0;
+
+	/* Create and wake up the kthread once to put it in
+	 * TASK_INTERRUPTIBLE mode to avoid the blocked task
+	 * warning and work with loadavg.
+	 */
+	n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
+				n->dev->name, n->napi_id);
+	if (IS_ERR(n->thread)) {
+		err = PTR_ERR(n->thread);
+		pr_err("kthread_run failed with err %d\n", err);
+		n->thread = NULL;
+	}
+
+	return err;
+}
+
 static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
@@ -4265,6 +4287,22 @@ int gro_normal_batch __read_mostly = 8;
 static inline void ____napi_schedule(struct softnet_data *sd,
 				     struct napi_struct *napi)
 {
+	struct task_struct *thread;
+
+	if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
+		/* Paired with smp_mb__before_atomic() in
+		 * napi_enable()/dev_set_threaded().
+		 * Use READ_ONCE() to guarantee a complete
+		 * read on napi->thread. Only call
+		 * wake_up_process() when it's not NULL.
+		 */
+		thread = READ_ONCE(napi->thread);
+		if (thread) {
+			wake_up_process(thread);
+			return;
+		}
+	}
+
 	list_add_tail(&napi->poll_list, &sd->poll_list);
 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 }
@@ -6701,6 +6739,49 @@ static void init_gro_hash(struct napi_struct *napi)
 	napi->gro_bitmask = 0;
 }
 
+int dev_set_threaded(struct net_device *dev, bool threaded)
+{
+	struct napi_struct *napi;
+	int err = 0;
+
+	if (dev->threaded == threaded)
+		return 0;
+
+	if (threaded) {
+		list_for_each_entry(napi, &dev->napi_list, dev_list) {
+			if (!napi->thread) {
+				err = napi_kthread_create(napi);
+				if (err) {
+					threaded = false;
+					break;
+				}
+			}
+		}
+	}
+
+	dev->threaded = threaded;
+
+	/* Make sure kthread is created before THREADED bit
+	 * is set.
+	 */
+	smp_mb__before_atomic();
+
+	/* Setting/unsetting threaded mode on a napi might not immediately
+	 * take effect, if the current napi instance is actively being
+	 * polled. In this case, the switch between threaded mode and
+	 * softirq mode will happen in the next round of napi_schedule().
+	 * This should not cause hiccups/stalls to the live traffic.
+	 */
+	list_for_each_entry(napi, &dev->napi_list, dev_list) {
+		if (threaded)
+			set_bit(NAPI_STATE_THREADED, &napi->state);
+		else
+			clear_bit(NAPI_STATE_THREADED, &napi->state);
+	}
+
+	return err;
+}
+
 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 		    int (*poll)(struct napi_struct *, int), int weight)
 {
@@ -6728,6 +6809,12 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 	set_bit(NAPI_STATE_NPSVC, &napi->state);
 	list_add_rcu(&napi->dev_list, &dev->napi_list);
 	napi_hash_add(napi);
+	/* Create kthread for this napi if dev->threaded is set.
+	 * Clear dev->threaded if kthread creation failed so that
+	 * threaded mode will not be enabled in napi_enable().
+	 */
+	if (dev->threaded && napi_kthread_create(napi))
+		dev->threaded = 0;
 }
 EXPORT_SYMBOL(netif_napi_add);
 
@@ -6745,9 +6832,28 @@ void napi_disable(struct napi_struct *n)
 
 	clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
 	clear_bit(NAPI_STATE_DISABLE, &n->state);
+	clear_bit(NAPI_STATE_THREADED, &n->state);
 }
 EXPORT_SYMBOL(napi_disable);
 
+/**
+ *	napi_enable - enable NAPI scheduling
+ *	@n: NAPI context
+ *
+ * Resume NAPI from being scheduled on this context.
+ * Must be paired with napi_disable.
+ */
+void napi_enable(struct napi_struct *n)
+{
+	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
+	smp_mb__before_atomic();
+	clear_bit(NAPI_STATE_SCHED, &n->state);
+	clear_bit(NAPI_STATE_NPSVC, &n->state);
+	if (n->dev->threaded && n->thread)
+		set_bit(NAPI_STATE_THREADED, &n->state);
+}
+EXPORT_SYMBOL(napi_enable);
+
 static void flush_gro_hash(struct napi_struct *napi)
 {
 	int i;
@@ -6773,18 +6879,18 @@ void __netif_napi_del(struct napi_struct *napi)
 
 	flush_gro_hash(napi);
 	napi->gro_bitmask = 0;
+
+	if (napi->thread) {
+		kthread_stop(napi->thread);
+		napi->thread = NULL;
+	}
 }
 EXPORT_SYMBOL(__netif_napi_del);
 
-static int napi_poll(struct napi_struct *n, struct list_head *repoll)
+static int __napi_poll(struct napi_struct *n, bool *repoll)
 {
-	void *have;
 	int work, weight;
 
-	list_del_init(&n->poll_list);
-
-	have = netpoll_poll_lock(n);
-
 	weight = n->weight;
 
 	/* This NAPI_STATE_SCHED test is for avoiding a race
@@ -6804,7 +6910,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 			    n->poll, work, weight);
 
 	if (likely(work < weight))
-		goto out_unlock;
+		return work;
 
 	/* Drivers must not modify the NAPI state if they
 	 * consume the entire weight.  In such cases this code
@@ -6813,7 +6919,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 	 */
 	if (unlikely(napi_disable_pending(n))) {
 		napi_complete(n);
-		goto out_unlock;
+		return work;
 	}
 
 	/* The NAPI context has more processing work, but busy-polling
@@ -6826,7 +6932,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 			 */
 			napi_schedule(n);
 		}
-		goto out_unlock;
+		return work;
 	}
 
 	if (n->gro_bitmask) {
@@ -6844,17 +6950,79 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 	if (unlikely(!list_empty(&n->poll_list))) {
 		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
 			     n->dev ? n->dev->name : "backlog");
-		goto out_unlock;
+		return work;
 	}
 
-	list_add_tail(&n->poll_list, repoll);
+	*repoll = true;
+
+	return work;
+}
+
+static int napi_poll(struct napi_struct *n, struct list_head *repoll)
+{
+	bool do_repoll = false;
+	void *have;
+	int work;
+
+	list_del_init(&n->poll_list);
+
+	have = netpoll_poll_lock(n);
+
+	work = __napi_poll(n, &do_repoll);
+
+	if (do_repoll)
+		list_add_tail(&n->poll_list, repoll);
 
-out_unlock:
 	netpoll_poll_unlock(have);
 
 	return work;
 }
 
+static int napi_thread_wait(struct napi_struct *napi)
+{
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	while (!kthread_should_stop() && !napi_disable_pending(napi)) {
+		if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
+			WARN_ON(!list_empty(&napi->poll_list));
+			__set_current_state(TASK_RUNNING);
+			return 0;
+		}
+
+		schedule();
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+	return -1;
+}
+
+static int napi_threaded_poll(void *data)
+{
+	struct napi_struct *napi = data;
+	void *have;
+
+	while (!napi_thread_wait(napi)) {
+		for (;;) {
+			bool repoll = false;
+
+			local_bh_disable();
+
+			have = netpoll_poll_lock(napi);
+			__napi_poll(napi, &repoll);
+			netpoll_poll_unlock(have);
+
+			__kfree_skb_flush();
+			local_bh_enable();
+
+			if (!repoll)
+				break;
+
+			cond_resched();
+		}
+	}
+	return 0;
+}
+
 static __latent_entropy void net_rx_action(struct softirq_action *h)
 {
 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 91afb0b6de69..307628fdf380 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -538,6 +538,45 @@ static ssize_t phys_switch_id_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(phys_switch_id);
 
+static ssize_t threaded_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
+{
+	struct net_device *netdev = to_net_dev(dev);
+	ssize_t ret = -EINVAL;
+
+	if (!rtnl_trylock())
+		return restart_syscall();
+
+	if (dev_isalive(netdev))
+		ret = sprintf(buf, fmt_dec, netdev->threaded);
+
+	rtnl_unlock();
+	return ret;
+}
+
+static int modify_napi_threaded(struct net_device *dev, unsigned long val)
+{
+	int ret;
+
+	if (list_empty(&dev->napi_list))
+		return -EOPNOTSUPP;
+
+	if (val != 0 && val != 1)
+		return -EOPNOTSUPP;
+
+	ret = dev_set_threaded(dev, val);
+
+	return ret;
+}
+
+static ssize_t threaded_store(struct device *dev,
+			      struct device_attribute *attr,
+			      const char *buf, size_t len)
+{
+	return netdev_store(dev, attr, buf, len, modify_napi_threaded);
+}
+static DEVICE_ATTR_RW(threaded);
+
 static struct attribute *net_class_attrs[] __ro_after_init = {
 	&dev_attr_netdev_group.attr,
 	&dev_attr_type.attr,
@@ -570,6 +609,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = {
 	&dev_attr_proto_down.attr,
 	&dev_attr_carrier_up_count.attr,
 	&dev_attr_carrier_down_count.attr,
+	&dev_attr_threaded.attr,
 	NULL,
 };
 ATTRIBUTE_GROUPS(net_class);
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index 5bda5aeda579..601f5fbfc63f 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -285,7 +285,7 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb,  netdev_features_
 	esp.esph = ip_esp_hdr(skb);
 
 
-	if (!hw_offload || (hw_offload && !skb_is_gso(skb))) {
+	if (!hw_offload || !skb_is_gso(skb)) {
 		esp.nfrags = esp_output_head(x, skb, &esp);
 		if (esp.nfrags < 0)
 			return esp.nfrags;
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index aff454ef0fa3..b58db1ca4bfb 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -18,7 +18,8 @@ struct fib_alias {
 	s16			fa_default;
 	u8			offload:1,
 				trap:1,
-				unused:6;
+				offload_failed:1,
+				unused:5;
 	struct rcu_head		rcu;
 };
 
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 4c38facf91c0..a632b66bc13a 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -521,6 +521,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
 	fri.type = fa->fa_type;
 	fri.offload = fa->offload;
 	fri.trap = fa->trap;
+	fri.offload_failed = fa->offload_failed;
 	err = fib_dump_info(skb, info->portid, seq, event, &fri, nlm_flags);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
@@ -1811,6 +1812,8 @@ offload:
 		rtm->rtm_flags |= RTM_F_OFFLOAD;
 	if (fri->trap)
 		rtm->rtm_flags |= RTM_F_TRAP;
+	if (fri->offload_failed)
+		rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED;
 
 	nlmsg_end(skb, nlh);
 	return 0;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 60559b708158..25cf387cca5b 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1047,12 +1047,20 @@ void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri)
 	if (!fa_match)
 		goto out;
 
-	if (fa_match->offload == fri->offload && fa_match->trap == fri->trap)
+	if (fa_match->offload == fri->offload && fa_match->trap == fri->trap &&
+	    fa_match->offload_failed == fri->offload_failed)
 		goto out;
 
 	fa_match->offload = fri->offload;
 	fa_match->trap = fri->trap;
 
+	/* 2 means send notifications only if offload_failed was changed. */
+	if (net->ipv4.sysctl_fib_notify_on_flag_change == 2 &&
+	    fa_match->offload_failed == fri->offload_failed)
+		goto out;
+
+	fa_match->offload_failed = fri->offload_failed;
+
 	if (!net->ipv4.sysctl_fib_notify_on_flag_change)
 		goto out;
 
@@ -1290,6 +1298,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
 			new_fa->fa_default = -1;
 			new_fa->offload = 0;
 			new_fa->trap = 0;
+			new_fa->offload_failed = 0;
 
 			hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
 
@@ -1350,6 +1359,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
 	new_fa->fa_default = -1;
 	new_fa->offload = 0;
 	new_fa->trap = 0;
+	new_fa->offload_failed = 0;
 
 	/* Insert new entry to the list. */
 	err = fib_insert_alias(t, tp, l, new_fa, fa, key);
@@ -2289,6 +2299,7 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
 				fri.type = fa->fa_type;
 				fri.offload = fa->offload;
 				fri.trap = fa->trap;
+				fri.offload_failed = fa->offload_failed;
 				err = fib_dump_info(skb,
 						    NETLINK_CB(cb->skb).portid,
 						    cb->nlh->nlmsg_seq,
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index be31e2446470..02d81d79deeb 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -3304,6 +3304,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 		fri.type = rt->rt_type;
 		fri.offload = 0;
 		fri.trap = 0;
+		fri.offload_failed = 0;
 		if (res.fa_head) {
 			struct fib_alias *fa;
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index e5798b3b59d2..f55095d3ed16 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1361,7 +1361,7 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_ONE,
+		.extra2		= &two,
 	},
 	{ }
 };
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 2b804fcebcc6..153ad103ba74 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -788,7 +788,7 @@ int esp6_input_done2(struct sk_buff *skb, int err)
 	int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
 	int hdr_len = skb_network_header_len(skb);
 
-	if (!xo || (xo && !(xo->flags & CRYPTO_DONE)))
+	if (!xo || !(xo->flags & CRYPTO_DONE))
 		kfree(ESP_SKB_CB(skb)->tmp);
 
 	if (unlikely(err))
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 0d1784b0d65d..1536f4948e86 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -5619,6 +5619,8 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 			rtm->rtm_flags |= RTM_F_OFFLOAD;
 		if (rt->trap)
 			rtm->rtm_flags |= RTM_F_TRAP;
+		if (rt->offload_failed)
+			rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED;
 	}
 
 	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
@@ -6070,17 +6072,25 @@ errout:
 }
 
 void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i,
-			    bool offload, bool trap)
+			    bool offload, bool trap, bool offload_failed)
 {
 	struct sk_buff *skb;
 	int err;
 
-	if (f6i->offload == offload && f6i->trap == trap)
+	if (f6i->offload == offload && f6i->trap == trap &&
+	    f6i->offload_failed == offload_failed)
 		return;
 
 	f6i->offload = offload;
 	f6i->trap = trap;
 
+	/* 2 means send notifications only if offload_failed was changed. */
+	if (net->ipv6.sysctl.fib_notify_on_flag_change == 2 &&
+	    f6i->offload_failed == offload_failed)
+		return;
+
+	f6i->offload_failed = offload_failed;
+
 	if (!rcu_access_pointer(f6i->fib6_node))
 		/* The route was removed from the tree, do not send
 		 * notfication.
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index 392ef01e3366..263ab43ed06b 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -167,7 +167,7 @@ static struct ctl_table ipv6_table_template[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1         = SYSCTL_ZERO,
-		.extra2         = SYSCTL_ONE,
+		.extra2         = &two,
 	},
 	{ }
 };
diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c
index 697cdcfbb5e1..495b1f5c979b 100644
--- a/net/xfrm/xfrm_interface.c
+++ b/net/xfrm/xfrm_interface.c
@@ -296,7 +296,8 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
 	}
 
 	mtu = dst_mtu(dst);
-	if (skb->len > mtu) {
+	if ((!skb_is_gso(skb) && skb->len > mtu) ||
+	    (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))) {
 		skb_dst_update_pmtu_no_confirm(skb, mtu);
 
 		if (skb->protocol == htons(ETH_P_IPV6)) {
@@ -564,6 +565,11 @@ static void xfrmi_dev_setup(struct net_device *dev)
 	eth_broadcast_addr(dev->broadcast);
 }
 
+#define XFRMI_FEATURES (NETIF_F_SG |		\
+			NETIF_F_FRAGLIST |	\
+			NETIF_F_GSO_SOFTWARE |	\
+			NETIF_F_HW_CSUM)
+
 static int xfrmi_dev_init(struct net_device *dev)
 {
 	struct xfrm_if *xi = netdev_priv(dev);
@@ -581,6 +587,8 @@ static int xfrmi_dev_init(struct net_device *dev)
 	}
 
 	dev->features |= NETIF_F_LLTX;
+	dev->features |= XFRMI_FEATURES;
+	dev->hw_features |= XFRMI_FEATURES;
 
 	if (phydev) {
 		dev->needed_headroom = phydev->needed_headroom;
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 0727ac853b55..5a0ef4361e43 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -2504,7 +2504,7 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh,
 		encap = kmemdup(nla_data(attrs[XFRMA_ENCAP]),
 				sizeof(*encap), GFP_KERNEL);
 		if (!encap)
-			return 0;
+			return -ENOMEM;
 	}
 
 	err = xfrm_migrate(&pi->sel, pi->dir, type, m, n, kmp, net, encap);
diff --git a/tools/testing/selftests/drivers/net/netdevsim/fib_notifications.sh b/tools/testing/selftests/drivers/net/netdevsim/fib_notifications.sh
index 16a9dd43aefc..8d91191a098c 100755
--- a/tools/testing/selftests/drivers/net/netdevsim/fib_notifications.sh
+++ b/tools/testing/selftests/drivers/net/netdevsim/fib_notifications.sh
@@ -7,9 +7,11 @@ ALL_TESTS="
 	ipv4_route_addition_test
 	ipv4_route_deletion_test
 	ipv4_route_replacement_test
+	ipv4_route_offload_failed_test
 	ipv6_route_addition_test
 	ipv6_route_deletion_test
 	ipv6_route_replacement_test
+	ipv6_route_offload_failed_test
 "
 
 NETDEVSIM_PATH=/sys/bus/netdevsim/
@@ -17,9 +19,26 @@ DEV_ADDR=1337
 DEV=netdevsim${DEV_ADDR}
 DEVLINK_DEV=netdevsim/${DEV}
 SYSFS_NET_DIR=/sys/bus/netdevsim/devices/$DEV/net/
+DEBUGFS_DIR=/sys/kernel/debug/netdevsim/$DEV/
 NUM_NETIFS=0
 source $lib_dir/lib.sh
 
+check_rt_offload_failed()
+{
+	local outfile=$1; shift
+	local line
+
+	# Make sure that the first notification was emitted without
+	# RTM_F_OFFLOAD_FAILED flag and the second with RTM_F_OFFLOAD_FAILED
+	# flag
+	head -n 1 $outfile | grep -q "rt_offload_failed"
+	if [[ $? -eq 0 ]]; then
+		return 1
+	fi
+
+	head -n 2 $outfile | tail -n 1 | grep -q "rt_offload_failed"
+}
+
 check_rt_trap()
 {
 	local outfile=$1; shift
@@ -39,15 +58,23 @@ route_notify_check()
 {
 	local outfile=$1; shift
 	local expected_num_lines=$1; shift
+	local offload_failed=${1:-0}; shift
 
 	# check the monitor results
 	lines=`wc -l $outfile | cut "-d " -f1`
 	test $lines -eq $expected_num_lines
 	check_err $? "$expected_num_lines notifications were expected but $lines were received"
 
-	if [[ $expected_num_lines -eq 2 ]]; then
+	if [[ $expected_num_lines -eq 1 ]]; then
+		return
+	fi
+
+	if [[ $offload_failed -eq 0 ]]; then
 		check_rt_trap $outfile
 		check_err $? "Wrong RTM_F_TRAP flags in notifications"
+	else
+		check_rt_offload_failed $outfile
+		check_err $? "Wrong RTM_F_OFFLOAD_FAILED flags in notifications"
 	fi
 }
 
@@ -57,6 +84,7 @@ route_addition_check()
 	local notify=$1; shift
 	local route=$1; shift
 	local expected_num_notifications=$1; shift
+	local offload_failed=${1:-0}; shift
 
 	ip netns exec testns1 sysctl -qw net.$ip.fib_notify_on_flag_change=$notify
 
@@ -68,7 +96,7 @@ route_addition_check()
 	sleep 1
 	kill %% && wait %% &> /dev/null
 
-	route_notify_check $outfile $expected_num_notifications
+	route_notify_check $outfile $expected_num_notifications $offload_failed
 	rm -f $outfile
 
 	$IP route del $route dev dummy1
@@ -93,6 +121,13 @@ ipv4_route_addition_test()
 	expected_num_notifications=2
 	route_addition_check $ip $notify $route $expected_num_notifications
 
+	# notify=2 means emit notifications only for failed route installation,
+	# make sure a single notification will be emitted for the programmed
+	# route.
+	notify=2
+	expected_num_notifications=1
+	route_addition_check $ip $notify $route $expected_num_notifications
+
 	log_test "IPv4 route addition"
 }
 
@@ -185,11 +220,55 @@ ipv4_route_replacement_test()
 	expected_num_notifications=2
 	route_replacement_check $ip $notify $route $expected_num_notifications
 
+	# notify=2 means emit notifications only for failed route installation,
+	# make sure a single notification will be emitted for the new route.
+	notify=2
+	expected_num_notifications=1
+	route_replacement_check $ip $notify $route $expected_num_notifications
+
 	$IP link del name dummy2
 
 	log_test "IPv4 route replacement"
 }
 
+ipv4_route_offload_failed_test()
+{
+
+	RET=0
+
+	local ip="ipv4"
+	local route=192.0.2.0/24
+	local offload_failed=1
+
+	echo "y"> $DEBUGFS_DIR/fib/fail_route_offload
+	check_err $? "Failed to setup route offload to fail"
+
+	# Make sure a single notification will be emitted for the programmed
+	# route.
+	local notify=0
+	local expected_num_notifications=1
+	route_addition_check $ip $notify $route $expected_num_notifications \
+		$offload_failed
+
+	# Make sure two notifications will be emitted for the new route.
+	notify=1
+	expected_num_notifications=2
+	route_addition_check $ip $notify $route $expected_num_notifications \
+		$offload_failed
+
+	# notify=2 means emit notifications only for failed route installation,
+	# make sure two notifications will be emitted for the new route.
+	notify=2
+	expected_num_notifications=2
+	route_addition_check $ip $notify $route $expected_num_notifications \
+		$offload_failed
+
+	echo "n"> $DEBUGFS_DIR/fib/fail_route_offload
+	check_err $? "Failed to setup route offload not to fail"
+
+	log_test "IPv4 route offload failed"
+}
+
 ipv6_route_addition_test()
 {
 	RET=0
@@ -208,6 +287,13 @@ ipv6_route_addition_test()
 	expected_num_notifications=2
 	route_addition_check $ip $notify $route $expected_num_notifications
 
+	# notify=2 means emit notifications only for failed route installation,
+	# make sure a single notification will be emitted for the programmed
+	# route.
+	notify=2
+	expected_num_notifications=1
+	route_addition_check $ip $notify $route $expected_num_notifications
+
 	log_test "IPv6 route addition"
 }
 
@@ -250,11 +336,55 @@ ipv6_route_replacement_test()
 	expected_num_notifications=2
 	route_replacement_check $ip $notify $route $expected_num_notifications
 
+	# notify=2 means emit notifications only for failed route installation,
+	# make sure a single notification will be emitted for the new route.
+	notify=2
+	expected_num_notifications=1
+	route_replacement_check $ip $notify $route $expected_num_notifications
+
 	$IP link del name dummy2
 
 	log_test "IPv6 route replacement"
 }
 
+ipv6_route_offload_failed_test()
+{
+
+	RET=0
+
+	local ip="ipv6"
+	local route=2001:db8:1::/64
+	local offload_failed=1
+
+	echo "y"> $DEBUGFS_DIR/fib/fail_route_offload
+	check_err $? "Failed to setup route offload to fail"
+
+	# Make sure a single notification will be emitted for the programmed
+	# route.
+	local notify=0
+	local expected_num_notifications=1
+	route_addition_check $ip $notify $route $expected_num_notifications \
+		$offload_failed
+
+	# Make sure two notifications will be emitted for the new route.
+	notify=1
+	expected_num_notifications=2
+	route_addition_check $ip $notify $route $expected_num_notifications \
+		$offload_failed
+
+	# notify=2 means emit notifications only for failed route installation,
+	# make sure two notifications will be emitted for the new route.
+	notify=2
+	expected_num_notifications=2
+	route_addition_check $ip $notify $route $expected_num_notifications \
+		$offload_failed
+
+	echo "n"> $DEBUGFS_DIR/fib/fail_route_offload
+	check_err $? "Failed to setup route offload not to fail"
+
+	log_test "IPv6 route offload failed"
+}
+
 setup_prepare()
 {
 	modprobe netdevsim &> /dev/null