75 files changed, 4067 insertions, 2057 deletions
diff --git a/Documentation/devicetree/bindings/net/can/rcar_can.txt b/Documentation/devicetree/bindings/net/can/rcar_can.txt
index 06bb7cc334c8..94a7f33ac5e9 100644
--- a/Documentation/devicetree/bindings/net/can/rcar_can.txt
+++ b/Documentation/devicetree/bindings/net/can/rcar_can.txt
@@ -2,7 +2,9 @@ Renesas R-Car CAN controller Device Tree Bindings
 -------------------------------------------------
 
 Required properties:
-- compatible: "renesas,can-r8a7778" if CAN controller is a part of R8A7778 SoC.
+- compatible: "renesas,can-r8a7743" if CAN controller is a part of R8A7743 SoC.
+	      "renesas,can-r8a7745" if CAN controller is a part of R8A7745 SoC.
+	      "renesas,can-r8a7778" if CAN controller is a part of R8A7778 SoC.
 	      "renesas,can-r8a7779" if CAN controller is a part of R8A7779 SoC.
 	      "renesas,can-r8a7790" if CAN controller is a part of R8A7790 SoC.
 	      "renesas,can-r8a7791" if CAN controller is a part of R8A7791 SoC.
@@ -12,7 +14,8 @@ Required properties:
 	      "renesas,can-r8a7795" if CAN controller is a part of R8A7795 SoC.
 	      "renesas,can-r8a7796" if CAN controller is a part of R8A7796 SoC.
 	      "renesas,rcar-gen1-can" for a generic R-Car Gen1 compatible device.
-	      "renesas,rcar-gen2-can" for a generic R-Car Gen2 compatible device.
+	      "renesas,rcar-gen2-can" for a generic R-Car Gen2 or RZ/G1
+	      compatible device.
 	      "renesas,rcar-gen3-can" for a generic R-Car Gen3 compatible device.
 	      When compatible with the generic version, nodes must list the
 	      SoC-specific version corresponding to the platform first
diff --git a/Documentation/devicetree/bindings/powerpc/fsl/mpc5200.txt b/Documentation/devicetree/bindings/powerpc/fsl/mpc5200.txt
index 4ccb2cd5df94..d096cf461d81 100644
--- a/Documentation/devicetree/bindings/powerpc/fsl/mpc5200.txt
+++ b/Documentation/devicetree/bindings/powerpc/fsl/mpc5200.txt
@@ -195,4 +195,4 @@ External interrupts:
 
 fsl,mpc5200-mscan nodes
 -----------------------
-See file can.txt in this directory.
+See file Documentation/devicetree/bindings/powerpc/fsl/mpc5200.txt
diff --git a/Documentation/networking/00-INDEX b/Documentation/networking/00-INDEX
index f5d642c01dd3..2b89d91b376f 100644
--- a/Documentation/networking/00-INDEX
+++ b/Documentation/networking/00-INDEX
@@ -36,8 +36,6 @@ bonding.txt
 	- Linux Ethernet Bonding Driver HOWTO: link aggregation in Linux.
 bridge.txt
 	- where to get user space programs for ethernet bridging with Linux.
-can.txt
-	- documentation on CAN protocol family.
 cdc_mbim.txt
 	- 3G/LTE USB modem (Mobile Broadband Interface Model)
 checksum-offloads.txt
diff --git a/Documentation/networking/can.rst b/Documentation/networking/can.rst
new file mode 100644
index 000000000000..d23c51abf8c6
--- /dev/null
+++ b/Documentation/networking/can.rst
@@ -0,0 +1,1437 @@
+===================================
+SocketCAN - Controller Area Network
+===================================
+
+Overview / What is SocketCAN
+============================
+
+The socketcan package is an implementation of CAN protocols
+(Controller Area Network) for Linux.  CAN is a networking technology
+which has widespread use in automation, embedded devices, and
+automotive fields.  While there have been other CAN implementations
+for Linux based on character devices, SocketCAN uses the Berkeley
+socket API, the Linux network stack and implements the CAN device
+drivers as network interfaces.  The CAN socket API has been designed
+as similar as possible to the TCP/IP protocols to allow programmers,
+familiar with network programming, to easily learn how to use CAN
+sockets.
+
+
+.. _socketcan-motivation:
+
+Motivation / Why Using the Socket API
+=====================================
+
+There have been CAN implementations for Linux before SocketCAN so the
+question arises, why we have started another project.  Most existing
+implementations come as a device driver for some CAN hardware, they
+are based on character devices and provide comparatively little
+functionality.  Usually, there is only a hardware-specific device
+driver which provides a character device interface to send and
+receive raw CAN frames, directly to/from the controller hardware.
+Queueing of frames and higher-level transport protocols like ISO-TP
+have to be implemented in user space applications.  Also, most
+character-device implementations support only one single process to
+open the device at a time, similar to a serial interface.  Exchanging
+the CAN controller requires employment of another device driver and
+often the need for adaption of large parts of the application to the
+new driver's API.
+
+SocketCAN was designed to overcome all of these limitations.  A new
+protocol family has been implemented which provides a socket interface
+to user space applications and which builds upon the Linux network
+layer, enabling use of all the provided queueing functionality.  A device
+driver for CAN controller hardware registers itself with the Linux
+network layer as a network device, so that CAN frames from the
+controller can be passed up to the network layer and on to the CAN
+protocol family module and also vice-versa.  Also, the protocol family
+module provides an API for transport protocol modules to register, so
+that any number of transport protocols can be loaded or unloaded
+dynamically.  In fact, the can core module alone does not provide any
+protocol and cannot be used without loading at least one additional
+protocol module.  Multiple sockets can be opened at the same time,
+on different or the same protocol module and they can listen/send
+frames on different or the same CAN IDs.  Several sockets listening on
+the same interface for frames with the same CAN ID are all passed the
+same received matching CAN frames.  An application wishing to
+communicate using a specific transport protocol, e.g. ISO-TP, just
+selects that protocol when opening the socket, and then can read and
+write application data byte streams, without having to deal with
+CAN-IDs, frames, etc.
+
+Similar functionality visible from user-space could be provided by a
+character device, too, but this would lead to a technically inelegant
+solution for a couple of reasons:
+
+* **Intricate usage:**  Instead of passing a protocol argument to
+  socket(2) and using bind(2) to select a CAN interface and CAN ID, an
+  application would have to do all these operations using ioctl(2)s.
+
+* **Code duplication:**  A character device cannot make use of the Linux
+  network queueing code, so all that code would have to be duplicated
+  for CAN networking.
+
+* **Abstraction:**  In most existing character-device implementations, the
+  hardware-specific device driver for a CAN controller directly
+  provides the character device for the application to work with.
+  This is at least very unusual in Unix systems for both char and
+  block devices.  For example you don't have a character device for a
+  certain UART of a serial interface, a certain sound chip in your
+  computer, a SCSI or IDE controller providing access to your hard
+  disk or tape streamer device.  Instead, you have abstraction layers
+  which provide a unified character or block device interface to the
+  application on the one hand, and an interface for hardware-specific
+  device drivers on the other hand.  These abstractions are provided
+  by subsystems like the tty layer, the audio subsystem or the SCSI
+  and IDE subsystems for the devices mentioned above.
+
+  The easiest way to implement a CAN device driver is as a character
+  device without such a (complete) abstraction layer, as is done by most
+  existing drivers.  The right way, however, would be to add such a
+  layer with all the functionality like registering for certain CAN
+  IDs, supporting several open file descriptors and (de)multiplexing
+  CAN frames between them, (sophisticated) queueing of CAN frames, and
+  providing an API for device drivers to register with.  However, then
+  it would be no more difficult, or maybe even easier, to use the
+  networking framework provided by the Linux kernel, and this is what
+  SocketCAN does.
+
+The use of the networking framework of the Linux kernel is just the
+natural and most appropriate way to implement CAN for Linux.
+
+
+.. _socketcan-concept:
+
+SocketCAN Concept
+=================
+
+As described in :ref:`socketcan-motivation` the main goal of SocketCAN is to
+provide a socket interface to user space applications which builds
+upon the Linux network layer. In contrast to the commonly known
+TCP/IP and ethernet networking, the CAN bus is a broadcast-only(!)
+medium that has no MAC-layer addressing like ethernet. The CAN-identifier
+(can_id) is used for arbitration on the CAN-bus. Therefore the CAN-IDs
+have to be chosen uniquely on the bus. When designing a CAN-ECU
+network the CAN-IDs are mapped to be sent by a specific ECU.
+For this reason a CAN-ID can be treated best as a kind of source address.
+
+
+.. _socketcan-receive-lists:
+
+Receive Lists
+-------------
+
+The network transparent access of multiple applications leads to the
+problem that different applications may be interested in the same
+CAN-IDs from the same CAN network interface. The SocketCAN core
+module - which implements the protocol family CAN - provides several
+highly efficient receive lists for this reason. If e.g. a user space
+application opens a CAN RAW socket, the raw protocol module itself
+requests the (range of) CAN-IDs from the SocketCAN core that are
+requested by the user. The subscription and unsubscription of
+CAN-IDs can be done for specific CAN interfaces or for all(!) known
+CAN interfaces with the can_rx_(un)register() functions provided to
+CAN protocol modules by the SocketCAN core (see :ref:`socketcan-core-module`).
+To optimize the CPU usage at runtime the receive lists are split up
+into several specific lists per device that match the requested
+filter complexity for a given use-case.
+
+
+.. _socketcan-local-loopback1:
+
+Local Loopback of Sent Frames
+-----------------------------
+
+As known from other networking concepts the data exchanging
+applications may run on the same or different nodes without any
+change (except for the according addressing information):
+
+.. code::
+
+	 ___   ___   ___                   _______   ___
+	| _ | | _ | | _ |                 | _   _ | | _ |
+	||A|| ||B|| ||C||                 ||A| |B|| ||C||
+	|___| |___| |___|                 |_______| |___|
+	  |     |     |                       |       |
+	-----------------(1)- CAN bus -(2)---------------
+
+To ensure that application A receives the same information in the
+example (2) as it would receive in example (1) there is need for
+some kind of local loopback of the sent CAN frames on the appropriate
+node.
+
+The Linux network devices (by default) just can handle the
+transmission and reception of media dependent frames. Due to the
+arbitration on the CAN bus the transmission of a low prio CAN-ID
+may be delayed by the reception of a high prio CAN frame. To
+reflect the correct [*]_ traffic on the node the loopback of the sent
+data has to be performed right after a successful transmission. If
+the CAN network interface is not capable of performing the loopback for
+some reason the SocketCAN core can do this task as a fallback solution.
+See :ref:`socketcan-local-loopback2` for details (recommended).
+
+The loopback functionality is enabled by default to reflect standard
+networking behaviour for CAN applications. Due to some requests from
+the RT-SocketCAN group the loopback optionally may be disabled for each
+separate socket. See sockopts from the CAN RAW sockets in :ref:`socketcan-raw-sockets`.
+
+.. [*] you really like to have this when you're running analyser
+       tools like 'candump' or 'cansniffer' on the (same) node.
+
+
+.. _socketcan-network-problem-notifications:
+
+Network Problem Notifications
+-----------------------------
+
+The use of the CAN bus may lead to several problems on the physical
+and media access control layer. Detecting and logging of these lower
+layer problems is a vital requirement for CAN users to identify
+hardware issues on the physical transceiver layer as well as
+arbitration problems and error frames caused by the different
+ECUs. Occurrences of detected errors are important for diagnosis
+and have to be logged together with the exact timestamp. For this
+reason the CAN interface driver can generate so called Error Message
+Frames that can optionally be passed to the user application in the
+same way as other CAN frames. Whenever an error on the physical layer
+or the MAC layer is detected (e.g. by the CAN controller) the driver
+creates an appropriate error message frame. Error message frames can
+be requested by the user application using the common CAN filter
+mechanisms. Inside this filter definition the (interested) type of
+errors may be selected. The reception of error messages is disabled
+by default. The format of the CAN error message frame is briefly
+described in the Linux header file "include/uapi/linux/can/error.h".
+
+
+How to use SocketCAN
+====================
+
+Like TCP/IP, you first need to open a socket for communicating over a
+CAN network. Since SocketCAN implements a new protocol family, you
+need to pass PF_CAN as the first argument to the socket(2) system
+call. Currently, there are two CAN protocols to choose from, the raw
+socket protocol and the broadcast manager (BCM). So to open a socket,
+you would write::
+
+    s = socket(PF_CAN, SOCK_RAW, CAN_RAW);
+
+and::
+
+    s = socket(PF_CAN, SOCK_DGRAM, CAN_BCM);
+
+respectively.  After the successful creation of the socket, you would
+normally use the bind(2) system call to bind the socket to a CAN
+interface (which is different from TCP/IP due to different addressing
+- see :ref:`socketcan-concept`). After binding (CAN_RAW) or connecting (CAN_BCM)
+the socket, you can read(2) and write(2) from/to the socket or use
+send(2), sendto(2), sendmsg(2) and the recv* counterpart operations
+on the socket as usual. There are also CAN specific socket options
+described below.
+
+The basic CAN frame structure and the sockaddr structure are defined
+in include/linux/can.h:
+
+.. code-block:: C
+
+    struct can_frame {
+            canid_t can_id;  /* 32 bit CAN_ID + EFF/RTR/ERR flags */
+            __u8    can_dlc; /* frame payload length in byte (0 .. 8) */
+            __u8    __pad;   /* padding */
+            __u8    __res0;  /* reserved / padding */
+            __u8    __res1;  /* reserved / padding */
+            __u8    data[8] __attribute__((aligned(8)));
+    };
+
+The alignment of the (linear) payload data[] to a 64bit boundary
+allows the user to define their own structs and unions to easily access
+the CAN payload. There is no given byteorder on the CAN bus by
+default. A read(2) system call on a CAN_RAW socket transfers a
+struct can_frame to the user space.
+
+The sockaddr_can structure has an interface index like the
+PF_PACKET socket, that also binds to a specific interface:
+
+.. code-block:: C
+
+    struct sockaddr_can {
+            sa_family_t can_family;
+            int         can_ifindex;
+            union {
+                    /* transport protocol class address info (e.g. ISOTP) */
+                    struct { canid_t rx_id, tx_id; } tp;
+
+                    /* reserved for future CAN protocols address information */
+            } can_addr;
+    };
+
+To determine the interface index an appropriate ioctl() has to
+be used (example for CAN_RAW sockets without error checking):
+
+.. code-block:: C
+
+    int s;
+    struct sockaddr_can addr;
+    struct ifreq ifr;
+
+    s = socket(PF_CAN, SOCK_RAW, CAN_RAW);
+
+    strcpy(ifr.ifr_name, "can0" );
+    ioctl(s, SIOCGIFINDEX, &ifr);
+
+    addr.can_family = AF_CAN;
+    addr.can_ifindex = ifr.ifr_ifindex;
+
+    bind(s, (struct sockaddr *)&addr, sizeof(addr));
+
+    (..)
+
+To bind a socket to all(!) CAN interfaces the interface index must
+be 0 (zero). In this case the socket receives CAN frames from every
+enabled CAN interface. To determine the originating CAN interface
+the system call recvfrom(2) may be used instead of read(2). To send
+on a socket that is bound to 'any' interface sendto(2) is needed to
+specify the outgoing interface.
+
+Reading CAN frames from a bound CAN_RAW socket (see above) consists
+of reading a struct can_frame:
+
+.. code-block:: C
+
+    struct can_frame frame;
+
+    nbytes = read(s, &frame, sizeof(struct can_frame));
+
+    if (nbytes < 0) {
+            perror("can raw socket read");
+            return 1;
+    }
+
+    /* paranoid check ... */
+    if (nbytes < sizeof(struct can_frame)) {
+            fprintf(stderr, "read: incomplete CAN frame\n");
+            return 1;
+    }
+
+    /* do something with the received CAN frame */
+
+Writing CAN frames can be done similarly, with the write(2) system call::
+
+    nbytes = write(s, &frame, sizeof(struct can_frame));
+
+When the CAN interface is bound to 'any' existing CAN interface
+(addr.can_ifindex = 0) it is recommended to use recvfrom(2) if the
+information about the originating CAN interface is needed:
+
+.. code-block:: C
+
+    struct sockaddr_can addr;
+    struct ifreq ifr;
+    socklen_t len = sizeof(addr);
+    struct can_frame frame;
+
+    nbytes = recvfrom(s, &frame, sizeof(struct can_frame),
+                      0, (struct sockaddr*)&addr, &len);
+
+    /* get interface name of the received CAN frame */
+    ifr.ifr_ifindex = addr.can_ifindex;
+    ioctl(s, SIOCGIFNAME, &ifr);
+    printf("Received a CAN frame from interface %s", ifr.ifr_name);
+
+To write CAN frames on sockets bound to 'any' CAN interface the
+outgoing interface must be specified explicitly:
+
+.. code-block:: C
+
+    strcpy(ifr.ifr_name, "can0");
+    ioctl(s, SIOCGIFINDEX, &ifr);
+    addr.can_ifindex = ifr.ifr_ifindex;
+    addr.can_family  = AF_CAN;
+
+    nbytes = sendto(s, &frame, sizeof(struct can_frame),
+                    0, (struct sockaddr*)&addr, sizeof(addr));
+
+An accurate timestamp can be obtained with an ioctl(2) call after reading
+a message from the socket:
+
+.. code-block:: C
+
+    struct timeval tv;
+    ioctl(s, SIOCGSTAMP, &tv);
+
+The timestamp has a resolution of one microsecond and is set automatically
+at the reception of a CAN frame.
+
+Remark about CAN FD (flexible data rate) support:
+
+Generally the handling of CAN FD is very similar to the formerly described
+examples. The new CAN FD capable CAN controllers support two different
+bitrates for the arbitration phase and the payload phase of the CAN FD frame
+and up to 64 bytes of payload. This extended payload length breaks all the
+kernel interfaces (ABI) which heavily rely on the CAN frame with fixed eight
+bytes of payload (struct can_frame) like the CAN_RAW socket. Therefore e.g.
+the CAN_RAW socket supports a new socket option CAN_RAW_FD_FRAMES that
+switches the socket into a mode that allows the handling of CAN FD frames
+and (legacy) CAN frames simultaneously (see :ref:`socketcan-rawfd`).
+
+The struct canfd_frame is defined in include/linux/can.h:
+
+.. code-block:: C
+
+    struct canfd_frame {
+            canid_t can_id;  /* 32 bit CAN_ID + EFF/RTR/ERR flags */
+            __u8    len;     /* frame payload length in byte (0 .. 64) */
+            __u8    flags;   /* additional flags for CAN FD */
+            __u8    __res0;  /* reserved / padding */
+            __u8    __res1;  /* reserved / padding */
+            __u8    data[64] __attribute__((aligned(8)));
+    };
+
+The struct canfd_frame and the existing struct can_frame have the can_id,
+the payload length and the payload data at the same offset inside their
+structures. This allows the different structures to be handled very similarly.
+When the content of a struct can_frame is copied into a struct canfd_frame
+all structure elements can be used as-is - only the data[] becomes extended.
+
+When introducing the struct canfd_frame it turned out that the data length
+code (DLC) of the struct can_frame was used as a length information as the
+length and the DLC have a 1:1 mapping in the range of 0 .. 8. To preserve
+the easy handling of the length information the canfd_frame.len element
+contains a plain length value from 0 .. 64. So both canfd_frame.len and
+can_frame.can_dlc are equal and contain a length information and no DLC.
+For details about the distinction of CAN and CAN FD capable devices and
+the mapping to the bus-relevant data length code (DLC), see :ref:`socketcan-can-fd-driver`.
+
+The lengths of the two CAN(FD) frame structures define the maximum transfer
+unit (MTU) of the CAN(FD) network interface and skbuff data length. Two
+definitions are specified for CAN specific MTUs in include/linux/can.h:
+
+.. code-block:: C
+
+  #define CAN_MTU   (sizeof(struct can_frame))   == 16  => 'legacy' CAN frame
+  #define CANFD_MTU (sizeof(struct canfd_frame)) == 72  => CAN FD frame
+
+
+.. _socketcan-raw-sockets:
+
+RAW Protocol Sockets with can_filters (SOCK_RAW)
+------------------------------------------------
+
+Using CAN_RAW sockets is extensively comparable to the commonly
+known access to CAN character devices. To meet the new possibilities
+provided by the multi user SocketCAN approach, some reasonable
+defaults are set at RAW socket binding time:
+
+- The filters are set to exactly one filter receiving everything
+- The socket only receives valid data frames (=> no error message frames)
+- The loopback of sent CAN frames is enabled (see :ref:`socketcan-local-loopback2`)
+- The socket does not receive its own sent frames (in loopback mode)
+
+These default settings may be changed before or after binding the socket.
+To use the referenced definitions of the socket options for CAN_RAW
+sockets, include <linux/can/raw.h>.
+
+
+.. _socketcan-rawfilter:
+
+RAW socket option CAN_RAW_FILTER
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The reception of CAN frames using CAN_RAW sockets can be controlled
+by defining 0 .. n filters with the CAN_RAW_FILTER socket option.
+
+The CAN filter structure is defined in include/linux/can.h:
+
+.. code-block:: C
+
+    struct can_filter {
+            canid_t can_id;
+            canid_t can_mask;
+    };
+
+A filter matches, when:
+
+.. code-block:: C
+
+    <received_can_id> & mask == can_id & mask
+
+which is analogous to known CAN controllers hardware filter semantics.
+The filter can be inverted in this semantic, when the CAN_INV_FILTER
+bit is set in can_id element of the can_filter structure. In
+contrast to CAN controller hardware filters the user may set 0 .. n
+receive filters for each open socket separately:
+
+.. code-block:: C
+
+    struct can_filter rfilter[2];
+
+    rfilter[0].can_id   = 0x123;
+    rfilter[0].can_mask = CAN_SFF_MASK;
+    rfilter[1].can_id   = 0x200;
+    rfilter[1].can_mask = 0x700;
+
+    setsockopt(s, SOL_CAN_RAW, CAN_RAW_FILTER, &rfilter, sizeof(rfilter));
+
+To disable the reception of CAN frames on the selected CAN_RAW socket:
+
+.. code-block:: C
+
+    setsockopt(s, SOL_CAN_RAW, CAN_RAW_FILTER, NULL, 0);
+
+To set the filters to zero filters is quite obsolete as to not read
+data causes the raw socket to discard the received CAN frames. But
+having this 'send only' use-case we may remove the receive list in the
+Kernel to save a little (really a very little!) CPU usage.
+
+CAN Filter Usage Optimisation
+.............................
+
+The CAN filters are processed in per-device filter lists at CAN frame
+reception time. To reduce the number of checks that need to be performed
+while walking through the filter lists the CAN core provides an optimized
+filter handling when the filter subscription focusses on a single CAN ID.
+
+For the possible 2048 SFF CAN identifiers the identifier is used as an index
+to access the corresponding subscription list without any further checks.
+For the 2^29 possible EFF CAN identifiers a 10 bit XOR folding is used as
+hash function to retrieve the EFF table index.
+
+To benefit from the optimized filters for single CAN identifiers the
+CAN_SFF_MASK or CAN_EFF_MASK have to be set into can_filter.can_mask together
+with set CAN_EFF_FLAG and CAN_RTR_FLAG bits. A set CAN_EFF_FLAG bit in the
+can_filter.can_mask makes clear that it matters whether a SFF or EFF CAN ID is
+subscribed. E.g. in the example from above:
+
+.. code-block:: C
+
+    rfilter[0].can_id   = 0x123;
+    rfilter[0].can_mask = CAN_SFF_MASK;
+
+both SFF frames with CAN ID 0x123 and EFF frames with 0xXXXXX123 can pass.
+
+To filter for only 0x123 (SFF) and 0x12345678 (EFF) CAN identifiers the
+filter has to be defined in this way to benefit from the optimized filters:
+
+.. code-block:: C
+
+    struct can_filter rfilter[2];
+
+    rfilter[0].can_id   = 0x123;
+    rfilter[0].can_mask = (CAN_EFF_FLAG | CAN_RTR_FLAG | CAN_SFF_MASK);
+    rfilter[1].can_id   = 0x12345678 | CAN_EFF_FLAG;
+    rfilter[1].can_mask = (CAN_EFF_FLAG | CAN_RTR_FLAG | CAN_EFF_MASK);
+
+    setsockopt(s, SOL_CAN_RAW, CAN_RAW_FILTER, &rfilter, sizeof(rfilter));
+
+
+RAW Socket Option CAN_RAW_ERR_FILTER
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+As described in :ref:`socketcan-network-problem-notifications` the CAN interface driver can generate so
+called Error Message Frames that can optionally be passed to the user
+application in the same way as other CAN frames. The possible
+errors are divided into different error classes that may be filtered
+using the appropriate error mask. To register for every possible
+error condition CAN_ERR_MASK can be used as value for the error mask.
+The values for the error mask are defined in linux/can/error.h:
+
+.. code-block:: C
+
+    can_err_mask_t err_mask = ( CAN_ERR_TX_TIMEOUT | CAN_ERR_BUSOFF );
+
+    setsockopt(s, SOL_CAN_RAW, CAN_RAW_ERR_FILTER,
+               &err_mask, sizeof(err_mask));
+
+
+RAW Socket Option CAN_RAW_LOOPBACK
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To meet multi user needs the local loopback is enabled by default
+(see :ref:`socketcan-local-loopback1` for details). But in some embedded use-cases
+(e.g. when only one application uses the CAN bus) this loopback
+functionality can be disabled (separately for each socket):
+
+.. code-block:: C
+
+    int loopback = 0; /* 0 = disabled, 1 = enabled (default) */
+
+    setsockopt(s, SOL_CAN_RAW, CAN_RAW_LOOPBACK, &loopback, sizeof(loopback));
+
+
+RAW socket option CAN_RAW_RECV_OWN_MSGS
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When the local loopback is enabled, all the sent CAN frames are
+looped back to the open CAN sockets that registered for the CAN
+frames' CAN-ID on this given interface to meet the multi user
+needs. The reception of the CAN frames on the same socket that was
+sending the CAN frame is assumed to be unwanted and therefore
+disabled by default. This default behaviour may be changed on
+demand:
+
+.. code-block:: C
+
+    int recv_own_msgs = 1; /* 0 = disabled (default), 1 = enabled */
+
+    setsockopt(s, SOL_CAN_RAW, CAN_RAW_RECV_OWN_MSGS,
+               &recv_own_msgs, sizeof(recv_own_msgs));
+
+
+.. _socketcan-rawfd:
+
+RAW Socket Option CAN_RAW_FD_FRAMES
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+CAN FD support in CAN_RAW sockets can be enabled with a new socket option
+CAN_RAW_FD_FRAMES which is off by default. When the new socket option is
+not supported by the CAN_RAW socket (e.g. on older kernels), switching the
+CAN_RAW_FD_FRAMES option returns the error -ENOPROTOOPT.
+
+Once CAN_RAW_FD_FRAMES is enabled the application can send both CAN frames
+and CAN FD frames. OTOH the application has to handle CAN and CAN FD frames
+when reading from the socket:
+
+.. code-block:: C
+
+    CAN_RAW_FD_FRAMES enabled:  CAN_MTU and CANFD_MTU are allowed
+    CAN_RAW_FD_FRAMES disabled: only CAN_MTU is allowed (default)
+
+Example:
+
+.. code-block:: C
+
+    [ remember: CANFD_MTU == sizeof(struct canfd_frame) ]
+
+    struct canfd_frame cfd;
+
+    nbytes = read(s, &cfd, CANFD_MTU);
+
+    if (nbytes == CANFD_MTU) {
+            printf("got CAN FD frame with length %d\n", cfd.len);
+            /* cfd.flags contains valid data */
+    } else if (nbytes == CAN_MTU) {
+            printf("got legacy CAN frame with length %d\n", cfd.len);
+            /* cfd.flags is undefined */
+    } else {
+            fprintf(stderr, "read: invalid CAN(FD) frame\n");
+            return 1;
+    }
+
+    /* the content can be handled independently from the received MTU size */
+
+    printf("can_id: %X data length: %d data: ", cfd.can_id, cfd.len);
+    for (i = 0; i < cfd.len; i++)
+            printf("%02X ", cfd.data[i]);
+
+When reading with size CANFD_MTU only returns CAN_MTU bytes that have
+been received from the socket, a legacy CAN frame has been read into the
+provided CAN FD structure. Note that the canfd_frame.flags data field is
+not specified in the struct can_frame and therefore it is only valid in
+CANFD_MTU sized CAN FD frames.
+
+Implementation hint for new CAN applications:
+
+To build a CAN FD aware application use struct canfd_frame as basic CAN
+data structure for CAN_RAW based applications. When the application is
+executed on an older Linux kernel and switching the CAN_RAW_FD_FRAMES
+socket option returns an error: No problem. You'll get legacy CAN frames
+or CAN FD frames and can process them the same way.
+
+When sending to CAN devices make sure that the device is capable of handling
+CAN FD frames by checking if the device maximum transfer unit is CANFD_MTU.
+The CAN device MTU can be retrieved e.g. with a SIOCGIFMTU ioctl() syscall.
+
+
+RAW socket option CAN_RAW_JOIN_FILTERS
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The CAN_RAW socket can set multiple CAN identifier specific filters that
+lead to multiple filters in the af_can.c filter processing. These filters
+are independent of each other which leads to logical OR'ed filters when
+applied (see :ref:`socketcan-rawfilter`).
+
+This socket option joins the given CAN filters in the way that only CAN
+frames are passed to user space that matched *all* given CAN filters. The
+semantic for the applied filters is therefore changed to a logical AND.
+
+This is useful especially when the filterset is a combination of filters
+where the CAN_INV_FILTER flag is set in order to notch single CAN IDs or
+CAN ID ranges from the incoming traffic.
+
+
+RAW Socket Returned Message Flags
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When using the recvmsg() call, msg->msg_flags may contain the following flags:
+
+MSG_DONTROUTE:
+	set when the received frame was created on the local host.
+
+MSG_CONFIRM:
+	set when the frame was sent via the socket it is received on.
+	This flag can be interpreted as a 'transmission confirmation' when the
+	CAN driver supports the echo of frames on driver level, see
+	:ref:`socketcan-local-loopback1` and :ref:`socketcan-local-loopback2`.
+	In order to receive such messages, CAN_RAW_RECV_OWN_MSGS must be set.
+
+
+Broadcast Manager Protocol Sockets (SOCK_DGRAM)
+-----------------------------------------------
+
+The Broadcast Manager protocol provides a command based configuration
+interface to filter and send (e.g. cyclic) CAN messages in kernel space.
+
+Receive filters can be used to down sample frequent messages; detect events
+such as message contents changes, packet length changes, and do time-out
+monitoring of received messages.
+
+Periodic transmission tasks of CAN frames or a sequence of CAN frames can be
+created and modified at runtime; both the message content and the two
+possible transmit intervals can be altered.
+
+A BCM socket is not intended for sending individual CAN frames using the
+struct can_frame as known from the CAN_RAW socket. Instead a special BCM
+configuration message is defined. The basic BCM configuration message used
+to communicate with the broadcast manager and the available operations are
+defined in the linux/can/bcm.h include. The BCM message consists of a
+message header with a command ('opcode') followed by zero or more CAN frames.
+The broadcast manager sends responses to user space in the same form:
+
+.. code-block:: C
+
+    struct bcm_msg_head {
+            __u32 opcode;                   /* command */
+            __u32 flags;                    /* special flags */
+            __u32 count;                    /* run 'count' times with ival1 */
+            struct timeval ival1, ival2;    /* count and subsequent interval */
+            canid_t can_id;                 /* unique can_id for task */
+            __u32 nframes;                  /* number of can_frames following */
+            struct can_frame frames[0];
+    };
+
+The aligned payload 'frames' uses the same basic CAN frame structure defined
+at the beginning of :ref:`socketcan-rawfd` and in the include/linux/can.h include. All
+messages to the broadcast manager from user space have this structure.
+
+Note a CAN_BCM socket must be connected instead of bound after socket
+creation (example without error checking):
+
+.. code-block:: C
+
+    int s;
+    struct sockaddr_can addr;
+    struct ifreq ifr;
+
+    s = socket(PF_CAN, SOCK_DGRAM, CAN_BCM);
+
+    strcpy(ifr.ifr_name, "can0");
+    ioctl(s, SIOCGIFINDEX, &ifr);
+
+    addr.can_family = AF_CAN;
+    addr.can_ifindex = ifr.ifr_ifindex;
+
+    connect(s, (struct sockaddr *)&addr, sizeof(addr));
+
+    (..)
+
+The broadcast manager socket is able to handle any number of in flight
+transmissions or receive filters concurrently. The different RX/TX jobs are
+distinguished by the unique can_id in each BCM message. However additional
+CAN_BCM sockets are recommended to communicate on multiple CAN interfaces.
+When the broadcast manager socket is bound to 'any' CAN interface (=> the
+interface index is set to zero) the configured receive filters apply to any
+CAN interface unless the sendto() syscall is used to overrule the 'any' CAN
+interface index. When using recvfrom() instead of read() to retrieve BCM
+socket messages the originating CAN interface is provided in can_ifindex.
+
+
+Broadcast Manager Operations
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The opcode defines the operation for the broadcast manager to carry out,
+or details the broadcast manager's response to several events, including
+user requests.
+
+Transmit Operations (user space to broadcast manager):
+
+TX_SETUP:
+	Create (cyclic) transmission task.
+
+TX_DELETE:
+	Remove (cyclic) transmission task, requires only can_id.
+
+TX_READ:
+	Read properties of (cyclic) transmission task for can_id.
+
+TX_SEND:
+	Send one CAN frame.
+
+Transmit Responses (broadcast manager to user space):
+
+TX_STATUS:
+	Reply to TX_READ request (transmission task configuration).
+
+TX_EXPIRED:
+	Notification when counter finishes sending at initial interval
+	'ival1'. Requires the TX_COUNTEVT flag to be set at TX_SETUP.
+
+Receive Operations (user space to broadcast manager):
+
+RX_SETUP:
+	Create RX content filter subscription.
+
+RX_DELETE:
+	Remove RX content filter subscription, requires only can_id.
+
+RX_READ:
+	Read properties of RX content filter subscription for can_id.
+
+Receive Responses (broadcast manager to user space):
+
+RX_STATUS:
+	Reply to RX_READ request (filter task configuration).
+
+RX_TIMEOUT:
+	Cyclic message is detected to be absent (timer ival1 expired).
+
+RX_CHANGED:
+	BCM message with updated CAN frame (detected content change).
+	Sent on first message received or on receipt of revised CAN messages.
+
+
+Broadcast Manager Message Flags
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When sending a message to the broadcast manager the 'flags' element may
+contain the following flag definitions which influence the behaviour:
+
+SETTIMER:
+	Set the values of ival1, ival2 and count
+
+STARTTIMER:
+	Start the timer with the actual values of ival1, ival2
+	and count. Starting the timer simultaneously emits a CAN frame.
+
+TX_COUNTEVT:
+	Create the message TX_EXPIRED when count expires
+
+TX_ANNOUNCE:
+	A change of data by the process is emitted immediately.
+
+TX_CP_CAN_ID:
+	Copies the can_id from the message header to each
+	subsequent frame in frames. This is intended as usage simplification. For
+	TX tasks the unique can_id from the message header may differ from the
+	can_id(s) stored for transmission in the subsequent struct can_frame(s).
+
+RX_FILTER_ID:
+	Filter by can_id alone, no frames required (nframes=0).
+
+RX_CHECK_DLC:
+	A change of the DLC leads to an RX_CHANGED.
+
+RX_NO_AUTOTIMER:
+	Prevent automatically starting the timeout monitor.
+
+RX_ANNOUNCE_RESUME:
+	If passed at RX_SETUP and a receive timeout occurred, a
+	RX_CHANGED message will be generated when the (cyclic) receive restarts.
+
+TX_RESET_MULTI_IDX:
+	Reset the index for the multiple frame transmission.
+
+RX_RTR_FRAME:
+	Send reply for RTR-request (placed in op->frames[0]).
+
+
+Broadcast Manager Transmission Timers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Periodic transmission configurations may use up to two interval timers.
+In this case the BCM sends a number of messages ('count') at an interval
+'ival1', then continuing to send at another given interval 'ival2'. When
+only one timer is needed 'count' is set to zero and only 'ival2' is used.
+When the SETTIMER and STARTTIMER flags are set the timers are activated.
+The timer values can be altered at runtime when only SETTIMER is set.
+
+
+Broadcast Manager message sequence transmission
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Up to 256 CAN frames can be transmitted in a sequence in the case of a cyclic
+TX task configuration. The number of CAN frames is provided in the 'nframes'
+element of the BCM message head. The defined number of CAN frames are added
+as array to the TX_SETUP BCM configuration message:
+
+.. code-block:: C
+
+    /* create a struct to set up a sequence of four CAN frames */
+    struct {
+            struct bcm_msg_head msg_head;
+            struct can_frame frame[4];
+    } mytxmsg;
+
+    (..)
+    mytxmsg.msg_head.nframes = 4;
+    (..)
+
+    write(s, &mytxmsg, sizeof(mytxmsg));
+
+With every transmission the index in the array of CAN frames is increased
+and set to zero at index overflow.
+
+
+Broadcast Manager Receive Filter Timers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The timer values ival1 or ival2 may be set to non-zero values at RX_SETUP.
+When the SETTIMER flag is set the timers are enabled:
+
+ival1:
+	Send RX_TIMEOUT when a received message is not received again within
+	the given time. When STARTTIMER is set at RX_SETUP the timeout detection
+	is activated directly - even without a former CAN frame reception.
+
+ival2:
+	Throttle the received message rate down to the value of ival2. This
+	is useful to reduce messages for the application when the signal inside the
+	CAN frame is stateless as state changes within the ival2 period may get
+	lost.
+
+Broadcast Manager Multiplex Message Receive Filter
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To filter for content changes in multiplex message sequences an array of more
+than one CAN frame can be passed in an RX_SETUP configuration message. The
+data bytes of the first CAN frame contain the mask of relevant bits that
+have to match in the subsequent CAN frames with the received CAN frame.
+If one of the subsequent CAN frames matches, the bits in that frame's data
+mark the relevant content to be compared with the previously received content.
+Up to 257 CAN frames (multiplex filter bit mask CAN frame plus 256 CAN
+filters) can be added as array to the RX_SETUP BCM configuration message:
+
+.. code-block:: C
+
+    /* usually used to clear CAN frame data[] - beware of endian problems! */
+    #define U64_DATA(p) (*(unsigned long long*)(p)->data)
+
+    struct {
+            struct bcm_msg_head msg_head;
+            struct can_frame frame[5];
+    } msg;
+
+    msg.msg_head.opcode  = RX_SETUP;
+    msg.msg_head.can_id  = 0x42;
+    msg.msg_head.flags   = 0;
+    msg.msg_head.nframes = 5;
+    U64_DATA(&msg.frame[0]) = 0xFF00000000000000ULL; /* MUX mask */
+    U64_DATA(&msg.frame[1]) = 0x01000000000000FFULL; /* data mask (MUX 0x01) */
+    U64_DATA(&msg.frame[2]) = 0x0200FFFF000000FFULL; /* data mask (MUX 0x02) */
+    U64_DATA(&msg.frame[3]) = 0x330000FFFFFF0003ULL; /* data mask (MUX 0x33) */
+    U64_DATA(&msg.frame[4]) = 0x4F07FC0FF0000000ULL; /* data mask (MUX 0x4F) */
+
+    write(s, &msg, sizeof(msg));
+
+
+Broadcast Manager CAN FD Support
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The programming API of the CAN_BCM depends on struct can_frame which is
+given as array directly behind the bcm_msg_head structure. To follow this
+schema for the CAN FD frames a new flag 'CAN_FD_FRAME' in the bcm_msg_head
+flags indicates that the concatenated CAN frame structures behind the
+bcm_msg_head are defined as struct canfd_frame:
+
+.. code-block:: C
+
+    struct {
+            struct bcm_msg_head msg_head;
+            struct canfd_frame frame[5];
+    } msg;
+
+    msg.msg_head.opcode  = RX_SETUP;
+    msg.msg_head.can_id  = 0x42;
+    msg.msg_head.flags   = CAN_FD_FRAME;
+    msg.msg_head.nframes = 5;
+    (..)
+
+When using CAN FD frames for multiplex filtering the MUX mask is still
+expected in the first 64 bits of the struct canfd_frame data section.
+
+
+Connected Transport Protocols (SOCK_SEQPACKET)
+----------------------------------------------
+
+(to be written)
+
+
+Unconnected Transport Protocols (SOCK_DGRAM)
+--------------------------------------------
+
+(to be written)
+
+
+.. _socketcan-core-module:
+
+SocketCAN Core Module
+=====================
+
+The SocketCAN core module implements the protocol family
+PF_CAN. CAN protocol modules are loaded by the core module at
+runtime. The core module provides an interface for CAN protocol
+modules to subscribe needed CAN IDs (see :ref:`socketcan-receive-lists`).
+
+
+can.ko Module Params
+--------------------
+
+- **stats_timer**:
+  To calculate the SocketCAN core statistics
+  (e.g. current/maximum frames per second) this 1 second timer is
+  invoked at can.ko module start time by default. This timer can be
+  disabled by using stats_timer=0 on the module commandline.
+
+- **debug**:
+  (removed since SocketCAN SVN r546)
+
+
+procfs content
+--------------
+
+As described in :ref:`socketcan-receive-lists` the SocketCAN core uses several filter
+lists to deliver received CAN frames to CAN protocol modules. These
+receive lists, their filters and the count of filter matches can be
+checked in the appropriate receive list. All entries contain the
+device and a protocol module identifier::
+
+    foo@bar:~$ cat /proc/net/can/rcvlist_all
+
+    receive list 'rx_all':
+      (vcan3: no entry)
+      (vcan2: no entry)
+      (vcan1: no entry)
+      device   can_id   can_mask  function  userdata   matches  ident
+       vcan0     000    00000000  f88e6370  f6c6f400         0  raw
+      (any: no entry)
+
+In this example an application requests any CAN traffic from vcan0.
+
+The following receive lists are available in /proc/net/can::
+
+    rcvlist_all - list for unfiltered entries (no filter operations)
+    rcvlist_eff - list for single extended frame (EFF) entries
+    rcvlist_err - list for error message frames masks
+    rcvlist_fil - list for mask/value filters
+    rcvlist_inv - list for mask/value filters (inverse semantic)
+    rcvlist_sff - list for single standard frame (SFF) entries
+
+Additional procfs files in /proc/net/can::
+
+    stats       - SocketCAN core statistics (rx/tx frames, match ratios, ...)
+    reset_stats - manual statistic reset
+    version     - prints the SocketCAN core version and the ABI version
+
+
+Writing Own CAN Protocol Modules
+--------------------------------
+
+To implement a new protocol in the protocol family PF_CAN a new
+protocol has to be defined in include/linux/can.h .
+The prototypes and definitions to use the SocketCAN core can be
+accessed by including include/linux/can/core.h .
+In addition to functions that register the CAN protocol and the
+CAN device notifier chain there are functions to subscribe CAN
+frames received by CAN interfaces and to send CAN frames::
+
+    can_rx_register   - subscribe CAN frames from a specific interface
+    can_rx_unregister - unsubscribe CAN frames from a specific interface
+    can_send          - transmit a CAN frame (optional with local loopback)
+
+For details see the kerneldoc documentation in net/can/af_can.c or
+the source code of net/can/raw.c or net/can/bcm.c .
+
+
+CAN Network Drivers
+===================
+
+Writing a CAN network device driver is much easier than writing a
+CAN character device driver. Similar to other known network device
+drivers you mainly have to deal with:
+
+- TX: Put the CAN frame from the socket buffer to the CAN controller.
+- RX: Put the CAN frame from the CAN controller to the socket buffer.
+
+See e.g. Documentation/networking/netdevices.txt . The differences
+for writing a CAN network device driver are described below:
+
+
+General Settings
+----------------
+
+.. code-block:: C
+
+    dev->type  = ARPHRD_CAN; /* the netdevice hardware type */
+    dev->flags = IFF_NOARP;  /* CAN has no arp */
+
+    dev->mtu = CAN_MTU; /* sizeof(struct can_frame) -> legacy CAN interface */
+
+    or alternative, when the controller supports CAN with flexible data rate:
+    dev->mtu = CANFD_MTU; /* sizeof(struct canfd_frame) -> CAN FD interface */
+
+The struct can_frame or struct canfd_frame is the payload of each socket
+buffer (skbuff) in the protocol family PF_CAN.
+
+
+.. _socketcan-local-loopback2:
+
+Local Loopback of Sent Frames
+-----------------------------
+
+As described in :ref:`socketcan-local-loopback1` the CAN network device driver should
+support a local loopback functionality similar to the local echo
+e.g. of tty devices. In this case the driver flag IFF_ECHO has to be
+set to prevent the PF_CAN core from locally echoing sent frames
+(aka loopback) as fallback solution::
+
+    dev->flags = (IFF_NOARP | IFF_ECHO);
+
+
+CAN Controller Hardware Filters
+-------------------------------
+
+To reduce the interrupt load on deep embedded systems some CAN
+controllers support the filtering of CAN IDs or ranges of CAN IDs.
+These hardware filter capabilities vary from controller to
+controller and have to be identified as not feasible in a multi-user
+networking approach. The use of the very controller specific
+hardware filters could make sense in a very dedicated use-case, as a
+filter on driver level would affect all users in the multi-user
+system. The highly efficient filter sets inside the PF_CAN core allow
+setting multiple different filters for each socket separately.
+Therefore the use of hardware filters goes to the category 'handmade
+tuning on deep embedded systems'. The author is running a MPC603e
+@133MHz with four SJA1000 CAN controllers from 2002 under heavy bus
+load without any problems ...
+
+
+The Virtual CAN Driver (vcan)
+-----------------------------
+
+Similar to the network loopback devices, vcan offers a virtual local
+CAN interface. A fully qualified address on CAN consists of
+
+- a unique CAN Identifier (CAN ID)
+- the CAN bus this CAN ID is transmitted on (e.g. can0)
+
+so in common use cases more than one virtual CAN interface is needed.
+
+The virtual CAN interfaces allow the transmission and reception of CAN
+frames without real CAN controller hardware. Virtual CAN network
+devices are usually named 'vcanX', like vcan0 vcan1 vcan2 ...
+When compiled as a module the virtual CAN driver module is called vcan.ko
+
+Since Linux Kernel version 2.6.24 the vcan driver supports the Kernel
+netlink interface to create vcan network devices. The creation and
+removal of vcan network devices can be managed with the ip(8) tool::
+
+  - Create a virtual CAN network interface:
+       $ ip link add type vcan
+
+  - Create a virtual CAN network interface with a specific name 'vcan42':
+       $ ip link add dev vcan42 type vcan
+
+  - Remove a (virtual CAN) network interface 'vcan42':
+       $ ip link del vcan42
+
+
+The CAN Network Device Driver Interface
+---------------------------------------
+
+The CAN network device driver interface provides a generic interface
+to setup, configure and monitor CAN network devices. The user can then
+configure the CAN device, like setting the bit-timing parameters, via
+the netlink interface using the program "ip" from the "IPROUTE2"
+utility suite. The following chapter describes briefly how to use it.
+Furthermore, the interface uses a common data structure and exports a
+set of common functions, which all real CAN network device drivers
+should use. Please have a look at the SJA1000 or MSCAN driver to
+understand how to use them. The name of the module is can-dev.ko.
+
+
+Netlink interface to set/get devices properties
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The CAN device must be configured via netlink interface. The supported
+netlink message types are defined and briefly described in
+"include/linux/can/netlink.h". CAN link support for the program "ip"
+of the IPROUTE2 utility suite is available and it can be used as shown
+below:
+
+Setting CAN device properties::
+
+    $ ip link set can0 type can help
+    Usage: ip link set DEVICE type can
+        [ bitrate BITRATE [ sample-point SAMPLE-POINT] ] |
+        [ tq TQ prop-seg PROP_SEG phase-seg1 PHASE-SEG1
+          phase-seg2 PHASE-SEG2 [ sjw SJW ] ]
+
+        [ dbitrate BITRATE [ dsample-point SAMPLE-POINT] ] |
+        [ dtq TQ dprop-seg PROP_SEG dphase-seg1 PHASE-SEG1
+          dphase-seg2 PHASE-SEG2 [ dsjw SJW ] ]
+
+        [ loopback { on | off } ]
+        [ listen-only { on | off } ]
+        [ triple-sampling { on | off } ]
+        [ one-shot { on | off } ]
+        [ berr-reporting { on | off } ]
+        [ fd { on | off } ]
+        [ fd-non-iso { on | off } ]
+        [ presume-ack { on | off } ]
+
+        [ restart-ms TIME-MS ]
+        [ restart ]
+
+        Where: BITRATE       := { 1..1000000 }
+               SAMPLE-POINT  := { 0.000..0.999 }
+               TQ            := { NUMBER }
+               PROP-SEG      := { 1..8 }
+               PHASE-SEG1    := { 1..8 }
+               PHASE-SEG2    := { 1..8 }
+               SJW           := { 1..4 }
+               RESTART-MS    := { 0 | NUMBER }
+
+Display CAN device details and statistics::
+
+    $ ip -details -statistics link show can0
+    2: can0: <NOARP,UP,LOWER_UP,ECHO> mtu 16 qdisc pfifo_fast state UP qlen 10
+      link/can
+      can <TRIPLE-SAMPLING> state ERROR-ACTIVE restart-ms 100
+      bitrate 125000 sample_point 0.875
+      tq 125 prop-seg 6 phase-seg1 7 phase-seg2 2 sjw 1
+      sja1000: tseg1 1..16 tseg2 1..8 sjw 1..4 brp 1..64 brp-inc 1
+      clock 8000000
+      re-started bus-errors arbit-lost error-warn error-pass bus-off
+      41         17457      0          41         42         41
+      RX: bytes  packets  errors  dropped overrun mcast
+      140859     17608    17457   0       0       0
+      TX: bytes  packets  errors  dropped carrier collsns
+      861        112      0       41      0       0
+
+More info to the above output:
+
+"<TRIPLE-SAMPLING>"
+	Shows the list of selected CAN controller modes: LOOPBACK,
+	LISTEN-ONLY, or TRIPLE-SAMPLING.
+
+"state ERROR-ACTIVE"
+	The current state of the CAN controller: "ERROR-ACTIVE",
+	"ERROR-WARNING", "ERROR-PASSIVE", "BUS-OFF" or "STOPPED"
+
+"restart-ms 100"
+	Automatic restart delay time. If set to a non-zero value, a
+	restart of the CAN controller will be triggered automatically
+	in case of a bus-off condition after the specified delay time
+	in milliseconds. By default it's off.
+
+"bitrate 125000 sample-point 0.875"
+	Shows the real bit-rate in bits/sec and the sample-point in the
+	range 0.000..0.999. If the calculation of bit-timing parameters
+	is enabled in the kernel (CONFIG_CAN_CALC_BITTIMING=y), the
+	bit-timing can be defined by setting the "bitrate" argument.
+	Optionally the "sample-point" can be specified. By default it's
+	0.000 assuming CIA-recommended sample-points.
+
+"tq 125 prop-seg 6 phase-seg1 7 phase-seg2 2 sjw 1"
+	Shows the time quanta in ns, propagation segment, phase buffer
+	segment 1 and 2 and the synchronisation jump width in units of
+	tq. They allow to define the CAN bit-timing in a hardware
+	independent format as proposed by the Bosch CAN 2.0 spec (see
+	chapter 8 of http://www.semiconductors.bosch.de/pdf/can2spec.pdf).
+
+"sja1000: tseg1 1..16 tseg2 1..8 sjw 1..4 brp 1..64 brp-inc 1 clock 8000000"
+	Shows the bit-timing constants of the CAN controller, here the
+	"sja1000". The minimum and maximum values of the time segment 1
+	and 2, the synchronisation jump width in units of tq, the
+	bitrate pre-scaler and the CAN system clock frequency in Hz.
+	These constants could be used for user-defined (non-standard)
+	bit-timing calculation algorithms in user-space.
+
+"re-started bus-errors arbit-lost error-warn error-pass bus-off"
+	Shows the number of restarts, bus and arbitration lost errors,
+	and the state changes to the error-warning, error-passive and
+	bus-off state. RX overrun errors are listed in the "overrun"
+	field of the standard network statistics.
+
+Setting the CAN Bit-Timing
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The CAN bit-timing parameters can always be defined in a hardware
+independent format as proposed in the Bosch CAN 2.0 specification
+specifying the arguments "tq", "prop-seg", "phase-seg1", "phase-seg2"
+and "sjw"::
+
+    $ ip link set canX type can tq 125 prop-seg 6 \
+				phase-seg1 7 phase-seg2 2 sjw 1
+
+If the kernel option CONFIG_CAN_CALC_BITTIMING is enabled, CIA
+recommended CAN bit-timing parameters will be calculated if the
+bit-rate is specified with the argument "bitrate"::
+
+    $ ip link set canX type can bitrate 125000
+
+Note that this works fine for the most common CAN controllers with
+standard bit-rates but may *fail* for exotic bit-rates or CAN system
+clock frequencies. Disabling CONFIG_CAN_CALC_BITTIMING saves some
+space and allows user-space tools to solely determine and set the
+bit-timing parameters. The CAN controller specific bit-timing
+constants can be used for that purpose. They are listed by the
+following command::
+
+    $ ip -details link show can0
+    ...
+      sja1000: clock 8000000 tseg1 1..16 tseg2 1..8 sjw 1..4 brp 1..64 brp-inc 1
+
+
+Starting and Stopping the CAN Network Device
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A CAN network device is started or stopped as usual with the command
+"ifconfig canX up/down" or "ip link set canX up/down". Be aware that
+you *must* define proper bit-timing parameters for real CAN devices
+before you can start it to avoid error-prone default settings::
+
+    $ ip link set canX up type can bitrate 125000
+
+A device may enter the "bus-off" state if too many errors occurred on
+the CAN bus. Then no more messages are received or sent. An automatic
+bus-off recovery can be enabled by setting the "restart-ms" to a
+non-zero value, e.g.::
+
+    $ ip link set canX type can restart-ms 100
+
+Alternatively, the application may detect the "bus-off" condition
+by monitoring CAN error message frames and do a restart when
+appropriate with the command::
+
+    $ ip link set canX type can restart
+
+Note that a restart will also create a CAN error message frame (see
+also :ref:`socketcan-network-problem-notifications`).
+
+
+.. _socketcan-can-fd-driver:
+
+CAN FD (Flexible Data Rate) Driver Support
+------------------------------------------
+
+CAN FD capable CAN controllers support two different bitrates for the
+arbitration phase and the payload phase of the CAN FD frame. Therefore a
+second bit timing has to be specified in order to enable the CAN FD bitrate.
+
+Additionally CAN FD capable CAN controllers support up to 64 bytes of
+payload. The representation of this length in can_frame.can_dlc and
+canfd_frame.len for userspace applications and inside the Linux network
+layer is a plain value from 0 .. 64 instead of the CAN 'data length code'.
+The data length code was a 1:1 mapping to the payload length in the legacy
+CAN frames anyway. The payload length to the bus-relevant DLC mapping is
+only performed inside the CAN drivers, preferably with the helper
+functions can_dlc2len() and can_len2dlc().
+
+The CAN netdevice driver capabilities can be distinguished by the network
+device's maximum transfer unit (MTU)::
+
+  MTU = 16 (CAN_MTU)   => sizeof(struct can_frame)   => 'legacy' CAN device
+  MTU = 72 (CANFD_MTU) => sizeof(struct canfd_frame) => CAN FD capable device
+
+The CAN device MTU can be retrieved e.g. with a SIOCGIFMTU ioctl() syscall.
+N.B. CAN FD capable devices can also handle and send legacy CAN frames.
+
+When configuring CAN FD capable CAN controllers an additional 'data' bitrate
+has to be set. This bitrate for the data phase of the CAN FD frame has to be
+at least the bitrate which was configured for the arbitration phase. This
+second bitrate is specified analogously to the first bitrate but the bitrate
+setting keywords for the 'data' bitrate start with 'd' e.g. dbitrate,
+dsample-point, dsjw or dtq and similar settings. When a data bitrate is set
+within the configuration process the controller option "fd on" can be
+specified to enable the CAN FD mode in the CAN controller. This controller
+option also switches the device MTU to 72 (CANFD_MTU).
+
+The first CAN FD specification presented as whitepaper at the International
+CAN Conference 2012 needed to be improved for data integrity reasons.
+Therefore two CAN FD implementations have to be distinguished today:
+
+- ISO compliant:     The ISO 11898-1:2015 CAN FD implementation (default)
+- non-ISO compliant: The CAN FD implementation following the 2012 whitepaper
+
+Finally there are three types of CAN FD controllers:
+
+1. ISO compliant (fixed)
+2. non-ISO compliant (fixed, like the M_CAN IP core v3.0.1 in m_can.c)
+3. ISO/non-ISO CAN FD controllers (switchable, like the PEAK PCAN-USB FD)
+
+The current ISO/non-ISO mode is announced by the CAN controller driver via
+netlink and displayed by the 'ip' tool (controller option FD-NON-ISO).
+The ISO/non-ISO-mode can be altered by setting 'fd-non-iso {on|off}' for
+switchable CAN FD controllers only.
+
+Example configuring 500 kbit/s arbitration bitrate and 4 Mbit/s data bitrate::
+
+    $ ip link set can0 up type can bitrate 500000 sample-point 0.75 \
+                                   dbitrate 4000000 dsample-point 0.8 fd on
+    $ ip -details link show can0
+    5: can0: <NOARP,UP,LOWER_UP,ECHO> mtu 72 qdisc pfifo_fast state UNKNOWN \
+             mode DEFAULT group default qlen 10
+    link/can  promiscuity 0
+    can <FD> state ERROR-ACTIVE (berr-counter tx 0 rx 0) restart-ms 0
+          bitrate 500000 sample-point 0.750
+          tq 50 prop-seg 14 phase-seg1 15 phase-seg2 10 sjw 1
+          pcan_usb_pro_fd: tseg1 1..64 tseg2 1..16 sjw 1..16 brp 1..1024 \
+          brp-inc 1
+          dbitrate 4000000 dsample-point 0.800
+          dtq 12 dprop-seg 7 dphase-seg1 8 dphase-seg2 4 dsjw 1
+          pcan_usb_pro_fd: dtseg1 1..16 dtseg2 1..8 dsjw 1..4 dbrp 1..1024 \
+          dbrp-inc 1
+          clock 80000000
+
+Example when 'fd-non-iso on' is added on this switchable CAN FD adapter::
+
+   can <FD,FD-NON-ISO> state ERROR-ACTIVE (berr-counter tx 0 rx 0) restart-ms 0
+
+
+Supported CAN Hardware
+----------------------
+
+Please check the "Kconfig" file in "drivers/net/can" to get the current
+list of supported CAN hardware. On the SocketCAN project website
+(see :ref:`socketcan-resources`) there might be further drivers available, also for
+older kernel versions.
+
+
+.. _socketcan-resources:
+
+SocketCAN Resources
+===================
+
+The Linux CAN / SocketCAN project resources (project site / mailing list)
+are referenced in the MAINTAINERS file in the Linux source tree.
+Search for CAN NETWORK [LAYERS|DRIVERS].
+
+Credits
+=======
+
+- Oliver Hartkopp (PF_CAN core, filters, drivers, bcm, SJA1000 driver)
+- Urs Thuermann (PF_CAN core, kernel integration, socket interfaces, raw, vcan)
+- Jan Kiszka (RT-SocketCAN core, Socket-API reconciliation)
+- Wolfgang Grandegger (RT-SocketCAN core & drivers, Raw Socket-API reviews, CAN device driver interface, MSCAN driver)
+- Robert Schwebel (design reviews, PTXdist integration)
+- Marc Kleine-Budde (design reviews, Kernel 2.6 cleanups, drivers)
+- Benedikt Spranger (reviews)
+- Thomas Gleixner (LKML reviews, coding style, posting hints)
+- Andrey Volkov (kernel subtree structure, ioctls, MSCAN driver)
+- Matthias Brukner (first SJA1000 CAN netdevice implementation Q2/2003)
+- Klaus Hitschler (PEAK driver integration)
+- Uwe Koppe (CAN netdevices with PF_PACKET approach)
+- Michael Schulze (driver layer loopback requirement, RT CAN drivers review)
+- Pavel Pisa (Bit-timing calculation)
+- Sascha Hauer (SJA1000 platform driver)
+- Sebastian Haas (SJA1000 EMS PCI driver)
+- Markus Plessing (SJA1000 EMS PCI driver)
+- Per Dalen (SJA1000 Kvaser PCI driver)
+- Sam Ravnborg (reviews, coding style, kbuild help)
diff --git a/Documentation/networking/can.txt b/Documentation/networking/can.txt
deleted file mode 100644
index aa15b9ee2e70..000000000000
--- a/Documentation/networking/can.txt
+++ /dev/null
@@ -1,1308 +0,0 @@
-============================================================================
-
-can.txt
-
-Readme file for the Controller Area Network Protocol Family (aka SocketCAN)
-
-This file contains
-
-  1 Overview / What is SocketCAN
-
-  2 Motivation / Why using the socket API
-
-  3 SocketCAN concept
-    3.1 receive lists
-    3.2 local loopback of sent frames
-    3.3 network problem notifications
-
-  4 How to use SocketCAN
-    4.1 RAW protocol sockets with can_filters (SOCK_RAW)
-      4.1.1 RAW socket option CAN_RAW_FILTER
-      4.1.2 RAW socket option CAN_RAW_ERR_FILTER
-      4.1.3 RAW socket option CAN_RAW_LOOPBACK
-      4.1.4 RAW socket option CAN_RAW_RECV_OWN_MSGS
-      4.1.5 RAW socket option CAN_RAW_FD_FRAMES
-      4.1.6 RAW socket option CAN_RAW_JOIN_FILTERS
-      4.1.7 RAW socket returned message flags
-    4.2 Broadcast Manager protocol sockets (SOCK_DGRAM)
-      4.2.1 Broadcast Manager operations
-      4.2.2 Broadcast Manager message flags
-      4.2.3 Broadcast Manager transmission timers
-      4.2.4 Broadcast Manager message sequence transmission
-      4.2.5 Broadcast Manager receive filter timers
-      4.2.6 Broadcast Manager multiplex message receive filter
-      4.2.7 Broadcast Manager CAN FD support
-    4.3 connected transport protocols (SOCK_SEQPACKET)
-    4.4 unconnected transport protocols (SOCK_DGRAM)
-
-  5 SocketCAN core module
-    5.1 can.ko module params
-    5.2 procfs content
-    5.3 writing own CAN protocol modules
-
-  6 CAN network drivers
-    6.1 general settings
-    6.2 local loopback of sent frames
-    6.3 CAN controller hardware filters
-    6.4 The virtual CAN driver (vcan)
-    6.5 The CAN network device driver interface
-      6.5.1 Netlink interface to set/get devices properties
-      6.5.2 Setting the CAN bit-timing
-      6.5.3 Starting and stopping the CAN network device
-    6.6 CAN FD (flexible data rate) driver support
-    6.7 supported CAN hardware
-
-  7 SocketCAN resources
-
-  8 Credits
-
-============================================================================
-
-1. Overview / What is SocketCAN
---------------------------------
-
-The socketcan package is an implementation of CAN protocols
-(Controller Area Network) for Linux.  CAN is a networking technology
-which has widespread use in automation, embedded devices, and
-automotive fields.  While there have been other CAN implementations
-for Linux based on character devices, SocketCAN uses the Berkeley
-socket API, the Linux network stack and implements the CAN device
-drivers as network interfaces.  The CAN socket API has been designed
-as similar as possible to the TCP/IP protocols to allow programmers,
-familiar with network programming, to easily learn how to use CAN
-sockets.
-
-2. Motivation / Why using the socket API
-----------------------------------------
-
-There have been CAN implementations for Linux before SocketCAN so the
-question arises, why we have started another project.  Most existing
-implementations come as a device driver for some CAN hardware, they
-are based on character devices and provide comparatively little
-functionality.  Usually, there is only a hardware-specific device
-driver which provides a character device interface to send and
-receive raw CAN frames, directly to/from the controller hardware.
-Queueing of frames and higher-level transport protocols like ISO-TP
-have to be implemented in user space applications.  Also, most
-character-device implementations support only one single process to
-open the device at a time, similar to a serial interface.  Exchanging
-the CAN controller requires employment of another device driver and
-often the need for adaption of large parts of the application to the
-new driver's API.
-
-SocketCAN was designed to overcome all of these limitations.  A new
-protocol family has been implemented which provides a socket interface
-to user space applications and which builds upon the Linux network
-layer, enabling use of all the provided queueing functionality.  A device
-driver for CAN controller hardware registers itself with the Linux
-network layer as a network device, so that CAN frames from the
-controller can be passed up to the network layer and on to the CAN
-protocol family module and also vice-versa.  Also, the protocol family
-module provides an API for transport protocol modules to register, so
-that any number of transport protocols can be loaded or unloaded
-dynamically.  In fact, the can core module alone does not provide any
-protocol and cannot be used without loading at least one additional
-protocol module.  Multiple sockets can be opened at the same time,
-on different or the same protocol module and they can listen/send
-frames on different or the same CAN IDs.  Several sockets listening on
-the same interface for frames with the same CAN ID are all passed the
-same received matching CAN frames.  An application wishing to
-communicate using a specific transport protocol, e.g. ISO-TP, just
-selects that protocol when opening the socket, and then can read and
-write application data byte streams, without having to deal with
-CAN-IDs, frames, etc.
-
-Similar functionality visible from user-space could be provided by a
-character device, too, but this would lead to a technically inelegant
-solution for a couple of reasons:
-
-* Intricate usage.  Instead of passing a protocol argument to
-  socket(2) and using bind(2) to select a CAN interface and CAN ID, an
-  application would have to do all these operations using ioctl(2)s.
-
-* Code duplication.  A character device cannot make use of the Linux
-  network queueing code, so all that code would have to be duplicated
-  for CAN networking.
-
-* Abstraction.  In most existing character-device implementations, the
-  hardware-specific device driver for a CAN controller directly
-  provides the character device for the application to work with.
-  This is at least very unusual in Unix systems for both char and
-  block devices.  For example you don't have a character device for a
-  certain UART of a serial interface, a certain sound chip in your
-  computer, a SCSI or IDE controller providing access to your hard
-  disk or tape streamer device.  Instead, you have abstraction layers
-  which provide a unified character or block device interface to the
-  application on the one hand, and an interface for hardware-specific
-  device drivers on the other hand.  These abstractions are provided
-  by subsystems like the tty layer, the audio subsystem or the SCSI
-  and IDE subsystems for the devices mentioned above.
-
-  The easiest way to implement a CAN device driver is as a character
-  device without such a (complete) abstraction layer, as is done by most
-  existing drivers.  The right way, however, would be to add such a
-  layer with all the functionality like registering for certain CAN
-  IDs, supporting several open file descriptors and (de)multiplexing
-  CAN frames between them, (sophisticated) queueing of CAN frames, and
-  providing an API for device drivers to register with.  However, then
-  it would be no more difficult, or maybe even easier, to use the
-  networking framework provided by the Linux kernel, and this is what
-  SocketCAN does.
-
-  The use of the networking framework of the Linux kernel is just the
-  natural and most appropriate way to implement CAN for Linux.
-
-3. SocketCAN concept
----------------------
-
-  As described in chapter 2 it is the main goal of SocketCAN to
-  provide a socket interface to user space applications which builds
-  upon the Linux network layer. In contrast to the commonly known
-  TCP/IP and ethernet networking, the CAN bus is a broadcast-only(!)
-  medium that has no MAC-layer addressing like ethernet. The CAN-identifier
-  (can_id) is used for arbitration on the CAN-bus. Therefore the CAN-IDs
-  have to be chosen uniquely on the bus. When designing a CAN-ECU
-  network the CAN-IDs are mapped to be sent by a specific ECU.
-  For this reason a CAN-ID can be treated best as a kind of source address.
-
-  3.1 receive lists
-
-  The network transparent access of multiple applications leads to the
-  problem that different applications may be interested in the same
-  CAN-IDs from the same CAN network interface. The SocketCAN core
-  module - which implements the protocol family CAN - provides several
-  highly efficient receive lists for this reason. If e.g. a user space
-  application opens a CAN RAW socket, the raw protocol module itself
-  requests the (range of) CAN-IDs from the SocketCAN core that are
-  requested by the user. The subscription and unsubscription of
-  CAN-IDs can be done for specific CAN interfaces or for all(!) known
-  CAN interfaces with the can_rx_(un)register() functions provided to
-  CAN protocol modules by the SocketCAN core (see chapter 5).
-  To optimize the CPU usage at runtime the receive lists are split up
-  into several specific lists per device that match the requested
-  filter complexity for a given use-case.
-
-  3.2 local loopback of sent frames
-
-  As known from other networking concepts the data exchanging
-  applications may run on the same or different nodes without any
-  change (except for the according addressing information):
-
-         ___   ___   ___                   _______   ___
-        | _ | | _ | | _ |                 | _   _ | | _ |
-        ||A|| ||B|| ||C||                 ||A| |B|| ||C||
-        |___| |___| |___|                 |_______| |___|
-          |     |     |                       |       |
-        -----------------(1)- CAN bus -(2)---------------
-
-  To ensure that application A receives the same information in the
-  example (2) as it would receive in example (1) there is need for
-  some kind of local loopback of the sent CAN frames on the appropriate
-  node.
-
-  The Linux network devices (by default) just can handle the
-  transmission and reception of media dependent frames. Due to the
-  arbitration on the CAN bus the transmission of a low prio CAN-ID
-  may be delayed by the reception of a high prio CAN frame. To
-  reflect the correct* traffic on the node the loopback of the sent
-  data has to be performed right after a successful transmission. If
-  the CAN network interface is not capable of performing the loopback for
-  some reason the SocketCAN core can do this task as a fallback solution.
-  See chapter 6.2 for details (recommended).
-
-  The loopback functionality is enabled by default to reflect standard
-  networking behaviour for CAN applications. Due to some requests from
-  the RT-SocketCAN group the loopback optionally may be disabled for each
-  separate socket. See sockopts from the CAN RAW sockets in chapter 4.1.
-
-  * = you really like to have this when you're running analyser tools
-      like 'candump' or 'cansniffer' on the (same) node.
-
-  3.3 network problem notifications
-
-  The use of the CAN bus may lead to several problems on the physical
-  and media access control layer. Detecting and logging of these lower
-  layer problems is a vital requirement for CAN users to identify
-  hardware issues on the physical transceiver layer as well as
-  arbitration problems and error frames caused by the different
-  ECUs. Detected errors are important for diagnosis
-  and have to be logged together with the exact timestamp. For this
-  reason the CAN interface driver can generate so called Error Message
-  Frames that can optionally be passed to the user application in the
-  same way as other CAN frames. Whenever an error on the physical layer
-  or the MAC layer is detected (e.g. by the CAN controller) the driver
-  creates an appropriate error message frame. Error message frames can
-  be requested by the user application using the common CAN filter
-  mechanisms. Inside this filter definition the (interested) type of
-  errors may be selected. The reception of error messages is disabled
-  by default. The format of the CAN error message frame is briefly
-  described in the Linux header file "include/uapi/linux/can/error.h".
-
-4. How to use SocketCAN
-------------------------
-
-  Like TCP/IP, you first need to open a socket for communicating over a
-  CAN network. Since SocketCAN implements a new protocol family, you
-  need to pass PF_CAN as the first argument to the socket(2) system
-  call. Currently, there are two CAN protocols to choose from, the raw
-  socket protocol and the broadcast manager (BCM). So to open a socket,
-  you would write
-
-    s = socket(PF_CAN, SOCK_RAW, CAN_RAW);
-
-  and
-
-    s = socket(PF_CAN, SOCK_DGRAM, CAN_BCM);
-
-  respectively.  After the successful creation of the socket, you would
-  normally use the bind(2) system call to bind the socket to a CAN
-  interface (which is different from TCP/IP due to different addressing
-  - see chapter 3). After binding (CAN_RAW) or connecting (CAN_BCM)
-  the socket, you can read(2) and write(2) from/to the socket or use
-  send(2), sendto(2), sendmsg(2) and the recv* counterpart operations
-  on the socket as usual. There are also CAN specific socket options
-  described below.
-
-  The basic CAN frame structure and the sockaddr structure are defined
-  in include/linux/can.h:
-
-    struct can_frame {
-            canid_t can_id;  /* 32 bit CAN_ID + EFF/RTR/ERR flags */
-            __u8    can_dlc; /* frame payload length in byte (0 .. 8) */
-            __u8    __pad;   /* padding */
-            __u8    __res0;  /* reserved / padding */
-            __u8    __res1;  /* reserved / padding */
-            __u8    data[8] __attribute__((aligned(8)));
-    };
-
-  The alignment of the (linear) payload data[] to a 64bit boundary
-  allows the user to define their own structs and unions to easily access
-  the CAN payload. There is no given byteorder on the CAN bus by
-  default. A read(2) system call on a CAN_RAW socket transfers a
-  struct can_frame to the user space.
-
-  The sockaddr_can structure has an interface index like the
-  PF_PACKET socket, that also binds to a specific interface:
-
-    struct sockaddr_can {
-            sa_family_t can_family;
-            int         can_ifindex;
-            union {
-                    /* transport protocol class address info (e.g. ISOTP) */
-                    struct { canid_t rx_id, tx_id; } tp;
-
-                    /* reserved for future CAN protocols address information */
-            } can_addr;
-    };
-
-  To determine the interface index an appropriate ioctl() has to
-  be used (example for CAN_RAW sockets without error checking):
-
-    int s;
-    struct sockaddr_can addr;
-    struct ifreq ifr;
-
-    s = socket(PF_CAN, SOCK_RAW, CAN_RAW);
-
-    strcpy(ifr.ifr_name, "can0" );
-    ioctl(s, SIOCGIFINDEX, &ifr);
-
-    addr.can_family = AF_CAN;
-    addr.can_ifindex = ifr.ifr_ifindex;
-
-    bind(s, (struct sockaddr *)&addr, sizeof(addr));
-
-    (..)
-
-  To bind a socket to all(!) CAN interfaces the interface index must
-  be 0 (zero). In this case the socket receives CAN frames from every
-  enabled CAN interface. To determine the originating CAN interface
-  the system call recvfrom(2) may be used instead of read(2). To send
-  on a socket that is bound to 'any' interface sendto(2) is needed to
-  specify the outgoing interface.
-
-  Reading CAN frames from a bound CAN_RAW socket (see above) consists
-  of reading a struct can_frame:
-
-    struct can_frame frame;
-
-    nbytes = read(s, &frame, sizeof(struct can_frame));
-
-    if (nbytes < 0) {
-            perror("can raw socket read");
-            return 1;
-    }
-
-    /* paranoid check ... */
-    if (nbytes < sizeof(struct can_frame)) {
-            fprintf(stderr, "read: incomplete CAN frame\n");
-            return 1;
-    }
-
-    /* do something with the received CAN frame */
-
-  Writing CAN frames can be done similarly, with the write(2) system call:
-
-    nbytes = write(s, &frame, sizeof(struct can_frame));
-
-  When the CAN interface is bound to 'any' existing CAN interface
-  (addr.can_ifindex = 0) it is recommended to use recvfrom(2) if the
-  information about the originating CAN interface is needed:
-
-    struct sockaddr_can addr;
-    struct ifreq ifr;
-    socklen_t len = sizeof(addr);
-    struct can_frame frame;
-
-    nbytes = recvfrom(s, &frame, sizeof(struct can_frame),
-                      0, (struct sockaddr*)&addr, &len);
-
-    /* get interface name of the received CAN frame */
-    ifr.ifr_ifindex = addr.can_ifindex;
-    ioctl(s, SIOCGIFNAME, &ifr);
-    printf("Received a CAN frame from interface %s", ifr.ifr_name);
-
-  To write CAN frames on sockets bound to 'any' CAN interface the
-  outgoing interface has to be specified explicitly.
-
-    strcpy(ifr.ifr_name, "can0");
-    ioctl(s, SIOCGIFINDEX, &ifr);
-    addr.can_ifindex = ifr.ifr_ifindex;
-    addr.can_family  = AF_CAN;
-
-    nbytes = sendto(s, &frame, sizeof(struct can_frame),
-                    0, (struct sockaddr*)&addr, sizeof(addr));
-
-  An accurate timestamp can be obtained with an ioctl(2) call after reading
-  a message from the socket:
-
-    struct timeval tv;
-    ioctl(s, SIOCGSTAMP, &tv);
-
-  The timestamp has a resolution of one microsecond and is set automatically
-  at the reception of a CAN frame.
-
-  Remark about CAN FD (flexible data rate) support:
-
-  Generally the handling of CAN FD is very similar to the formerly described
-  examples. The new CAN FD capable CAN controllers support two different
-  bitrates for the arbitration phase and the payload phase of the CAN FD frame
-  and up to 64 bytes of payload. This extended payload length breaks all the
-  kernel interfaces (ABI) which heavily rely on the CAN frame with fixed eight
-  bytes of payload (struct can_frame) like the CAN_RAW socket. Therefore e.g.
-  the CAN_RAW socket supports a new socket option CAN_RAW_FD_FRAMES that
-  switches the socket into a mode that allows the handling of CAN FD frames
-  and (legacy) CAN frames simultaneously (see section 4.1.5).
-
-  The struct canfd_frame is defined in include/linux/can.h:
-
-    struct canfd_frame {
-            canid_t can_id;  /* 32 bit CAN_ID + EFF/RTR/ERR flags */
-            __u8    len;     /* frame payload length in byte (0 .. 64) */
-            __u8    flags;   /* additional flags for CAN FD */
-            __u8    __res0;  /* reserved / padding */
-            __u8    __res1;  /* reserved / padding */
-            __u8    data[64] __attribute__((aligned(8)));
-    };
-
-  The struct canfd_frame and the existing struct can_frame have the can_id,
-  the payload length and the payload data at the same offset inside their
-  structures. This allows the different structures to be handled very similarly.
-  When the content of a struct can_frame is copied into a struct canfd_frame
-  all structure elements can be used as-is - only the data[] becomes extended.
-
-  When introducing the struct canfd_frame it turned out that the data length
-  code (DLC) of the struct can_frame was used as a length information as the
-  length and the DLC have a 1:1 mapping in the range of 0 .. 8. To preserve
-  the easy handling of the length information the canfd_frame.len element
-  contains a plain length value from 0 .. 64. So both canfd_frame.len and
-  can_frame.can_dlc are equal and contain a length information and no DLC.
-  For details about the distinction of CAN and CAN FD capable devices and
-  the mapping to the bus-relevant data length code (DLC), see chapter 6.6.
-
-  The lengths of the two CAN(FD) frame structures define the maximum transfer
-  unit (MTU) of the CAN(FD) network interface and skbuff data length. Two
-  definitions are specified for CAN specific MTUs in include/linux/can.h :
-
-  #define CAN_MTU   (sizeof(struct can_frame))   == 16  => 'legacy' CAN frame
-  #define CANFD_MTU (sizeof(struct canfd_frame)) == 72  => CAN FD frame
-
-  4.1 RAW protocol sockets with can_filters (SOCK_RAW)
-
-  Using CAN_RAW sockets is largely comparable to the commonly
-  known access to CAN character devices. To meet the new possibilities
-  provided by the multi user SocketCAN approach, some reasonable
-  defaults are set at RAW socket binding time:
-
-  - The filters are set to exactly one filter receiving everything
-  - The socket only receives valid data frames (=> no error message frames)
-  - The loopback of sent CAN frames is enabled (see chapter 3.2)
-  - The socket does not receive its own sent frames (in loopback mode)
-
-  These default settings may be changed before or after binding the socket.
-  To use the referenced definitions of the socket options for CAN_RAW
-  sockets, include <linux/can/raw.h>.
-
-  4.1.1 RAW socket option CAN_RAW_FILTER
-
-  The reception of CAN frames using CAN_RAW sockets can be controlled
-  by defining 0 .. n filters with the CAN_RAW_FILTER socket option.
-
-  The CAN filter structure is defined in include/linux/can.h:
-
-    struct can_filter {
-            canid_t can_id;
-            canid_t can_mask;
-    };
-
-  A filter matches, when
-
-    <received_can_id> & mask == can_id & mask
-
-  which is analogous to known CAN controllers hardware filter semantics.
-  The filter can be inverted in this semantic, when the CAN_INV_FILTER
-  bit is set in can_id element of the can_filter structure. In
-  contrast to CAN controller hardware filters the user may set 0 .. n
-  receive filters for each open socket separately:
-
-    struct can_filter rfilter[2];
-
-    rfilter[0].can_id   = 0x123;
-    rfilter[0].can_mask = CAN_SFF_MASK;
-    rfilter[1].can_id   = 0x200;
-    rfilter[1].can_mask = 0x700;
-
-    setsockopt(s, SOL_CAN_RAW, CAN_RAW_FILTER, &rfilter, sizeof(rfilter));
-
-  To disable the reception of CAN frames on the selected CAN_RAW socket:
-
-    setsockopt(s, SOL_CAN_RAW, CAN_RAW_FILTER, NULL, 0);
-
-  Setting zero filters is rarely necessary, as simply not reading any
-  data causes the raw socket to discard the received CAN frames. But
-  for this 'send only' use-case we may remove the receive list in the
-  kernel to save a little (really a very little!) CPU usage.
-
-  4.1.1.1 CAN filter usage optimisation
-
-  The CAN filters are processed in per-device filter lists at CAN frame
-  reception time. To reduce the number of checks that need to be performed
-  while walking through the filter lists the CAN core provides an optimized
-  filter handling when the filter subscription focusses on a single CAN ID.
-
-  For the possible 2048 SFF CAN identifiers the identifier is used as an index
-  to access the corresponding subscription list without any further checks.
-  For the 2^29 possible EFF CAN identifiers a 10 bit XOR folding is used as
-  hash function to retrieve the EFF table index.
-
-  To benefit from the optimized filters for single CAN identifiers the
-  CAN_SFF_MASK or CAN_EFF_MASK have to be set into can_filter.can_mask together
-  with set CAN_EFF_FLAG and CAN_RTR_FLAG bits. A set CAN_EFF_FLAG bit in the
-  can_filter.can_mask makes clear that it matters whether a SFF or EFF CAN ID is
-  subscribed. E.g. in the example from above
-
-    rfilter[0].can_id   = 0x123;
-    rfilter[0].can_mask = CAN_SFF_MASK;
-
-  both SFF frames with CAN ID 0x123 and EFF frames with 0xXXXXX123 can pass.
-
-  To filter for only 0x123 (SFF) and 0x12345678 (EFF) CAN identifiers the
-  filter has to be defined in this way to benefit from the optimized filters:
-
-    struct can_filter rfilter[2];
-
-    rfilter[0].can_id   = 0x123;
-    rfilter[0].can_mask = (CAN_EFF_FLAG | CAN_RTR_FLAG | CAN_SFF_MASK);
-    rfilter[1].can_id   = 0x12345678 | CAN_EFF_FLAG;
-    rfilter[1].can_mask = (CAN_EFF_FLAG | CAN_RTR_FLAG | CAN_EFF_MASK);
-
-    setsockopt(s, SOL_CAN_RAW, CAN_RAW_FILTER, &rfilter, sizeof(rfilter));
-
-  4.1.2 RAW socket option CAN_RAW_ERR_FILTER
-
-  As described in chapter 3.3 the CAN interface driver can generate so
-  called Error Message Frames that can optionally be passed to the user
-  application in the same way as other CAN frames. The possible
-  errors are divided into different error classes that may be filtered
-  using the appropriate error mask. To register for every possible
-  error condition CAN_ERR_MASK can be used as value for the error mask.
-  The values for the error mask are defined in linux/can/error.h .
-
-    can_err_mask_t err_mask = ( CAN_ERR_TX_TIMEOUT | CAN_ERR_BUSOFF );
-
-    setsockopt(s, SOL_CAN_RAW, CAN_RAW_ERR_FILTER,
-               &err_mask, sizeof(err_mask));
-
-  4.1.3 RAW socket option CAN_RAW_LOOPBACK
-
-  To meet multi user needs the local loopback is enabled by default
-  (see chapter 3.2 for details). But in some embedded use-cases
-  (e.g. when only one application uses the CAN bus) this loopback
-  functionality can be disabled (separately for each socket):
-
-    int loopback = 0; /* 0 = disabled, 1 = enabled (default) */
-
-    setsockopt(s, SOL_CAN_RAW, CAN_RAW_LOOPBACK, &loopback, sizeof(loopback));
-
-  4.1.4 RAW socket option CAN_RAW_RECV_OWN_MSGS
-
-  When the local loopback is enabled, all the sent CAN frames are
-  looped back to the open CAN sockets that registered for the CAN
-  frames' CAN-ID on this given interface to meet the multi user
-  needs. The reception of the CAN frames on the same socket that was
-  sending the CAN frame is assumed to be unwanted and therefore
-  disabled by default. This default behaviour may be changed on
-  demand:
-
-    int recv_own_msgs = 1; /* 0 = disabled (default), 1 = enabled */
-
-    setsockopt(s, SOL_CAN_RAW, CAN_RAW_RECV_OWN_MSGS,
-               &recv_own_msgs, sizeof(recv_own_msgs));
-
-  4.1.5 RAW socket option CAN_RAW_FD_FRAMES
-
-  CAN FD support in CAN_RAW sockets can be enabled with a new socket option
-  CAN_RAW_FD_FRAMES which is off by default. When the new socket option is
-  not supported by the CAN_RAW socket (e.g. on older kernels), switching the
-  CAN_RAW_FD_FRAMES option returns the error -ENOPROTOOPT.
-
-  Once CAN_RAW_FD_FRAMES is enabled the application can send both CAN frames
-  and CAN FD frames. OTOH the application has to handle CAN and CAN FD frames
-  when reading from the socket.
-
-    CAN_RAW_FD_FRAMES enabled:  CAN_MTU and CANFD_MTU are allowed
-    CAN_RAW_FD_FRAMES disabled: only CAN_MTU is allowed (default)
-
-  Example:
-    [ remember: CANFD_MTU == sizeof(struct canfd_frame) ]
-
-    struct canfd_frame cfd;
-
-    nbytes = read(s, &cfd, CANFD_MTU);
-
-    if (nbytes == CANFD_MTU) {
-            printf("got CAN FD frame with length %d\n", cfd.len);
-	    /* cfd.flags contains valid data */
-    } else if (nbytes == CAN_MTU) {
-            printf("got legacy CAN frame with length %d\n", cfd.len);
-	    /* cfd.flags is undefined */
-    } else {
-            fprintf(stderr, "read: invalid CAN(FD) frame\n");
-            return 1;
-    }
-
-    /* the content can be handled independently from the received MTU size */
-
-    printf("can_id: %X data length: %d data: ", cfd.can_id, cfd.len);
-    for (i = 0; i < cfd.len; i++)
-            printf("%02X ", cfd.data[i]);
-
-  When a read with size CANFD_MTU returns only CAN_MTU bytes, a legacy
-  CAN frame has been received from the socket and read into the
-  provided CAN FD structure. Note that the canfd_frame.flags data field is
-  not specified in the struct can_frame and therefore it is only valid in
-  CANFD_MTU sized CAN FD frames.
-
-  Implementation hint for new CAN applications:
-
-  To build a CAN FD aware application use struct canfd_frame as basic CAN
-  data structure for CAN_RAW based applications. When the application is
-  executed on an older Linux kernel and switching the CAN_RAW_FD_FRAMES
-  socket option returns an error: No problem. You'll get legacy CAN frames
-  or CAN FD frames and can process them the same way.
-
-  When sending to CAN devices make sure that the device is capable of handling
-  CAN FD frames by checking if the device maximum transfer unit is CANFD_MTU.
-  The CAN device MTU can be retrieved e.g. with a SIOCGIFMTU ioctl() syscall.
-
-  4.1.6 RAW socket option CAN_RAW_JOIN_FILTERS
-
-  The CAN_RAW socket can set multiple CAN identifier specific filters that
-  lead to multiple filters in the af_can.c filter processing. These filters
-  are independent from each other which leads to logical OR'ed filters when
-  applied (see 4.1.1).
-
-  This socket option joins the given CAN filters in the way that only CAN
-  frames are passed to user space that matched *all* given CAN filters. The
-  semantic for the applied filters is therefore changed to a logical AND.
-
-  This is useful especially when the filterset is a combination of filters
-  where the CAN_INV_FILTER flag is set in order to notch single CAN IDs or
-  CAN ID ranges from the incoming traffic.
-
-  4.1.7 RAW socket returned message flags
-
-  When using recvmsg() call, the msg->msg_flags may contain following flags:
-
-    MSG_DONTROUTE: set when the received frame was created on the local host.
-
-    MSG_CONFIRM: set when the frame was sent via the socket it is received on.
-      This flag can be interpreted as a 'transmission confirmation' when the
-      CAN driver supports the echo of frames on driver level, see 3.2 and 6.2.
-      In order to receive such messages, CAN_RAW_RECV_OWN_MSGS must be set.
-
-  4.2 Broadcast Manager protocol sockets (SOCK_DGRAM)
-
-  The Broadcast Manager protocol provides a command based configuration
-  interface to filter and send (e.g. cyclic) CAN messages in kernel space.
-
-  Receive filters can be used to down sample frequent messages; detect events
-  such as message contents changes, packet length changes, and do time-out
-  monitoring of received messages.
-
-  Periodic transmission tasks of CAN frames or a sequence of CAN frames can be
-  created and modified at runtime; both the message content and the two
-  possible transmit intervals can be altered.
-
-  A BCM socket is not intended for sending individual CAN frames using the
-  struct can_frame as known from the CAN_RAW socket. Instead a special BCM
-  configuration message is defined. The basic BCM configuration message used
-  to communicate with the broadcast manager and the available operations are
-  defined in the linux/can/bcm.h include. The BCM message consists of a
-  message header with a command ('opcode') followed by zero or more CAN frames.
-  The broadcast manager sends responses to user space in the same form:
-
-    struct bcm_msg_head {
-            __u32 opcode;                   /* command */
-            __u32 flags;                    /* special flags */
-            __u32 count;                    /* run 'count' times with ival1 */
-            struct timeval ival1, ival2;    /* count and subsequent interval */
-            canid_t can_id;                 /* unique can_id for task */
-            __u32 nframes;                  /* number of can_frames following */
-            struct can_frame frames[0];
-    };
-
-  The aligned payload 'frames' uses the same basic CAN frame structure defined
-  at the beginning of section 4 and in the include/linux/can.h include. All
-  messages to the broadcast manager from user space have this structure.
-
-  Note a CAN_BCM socket must be connected instead of bound after socket
-  creation (example without error checking):
-
-    int s;
-    struct sockaddr_can addr;
-    struct ifreq ifr;
-
-    s = socket(PF_CAN, SOCK_DGRAM, CAN_BCM);
-
-    strcpy(ifr.ifr_name, "can0");
-    ioctl(s, SIOCGIFINDEX, &ifr);
-
-    addr.can_family = AF_CAN;
-    addr.can_ifindex = ifr.ifr_ifindex;
-
-    connect(s, (struct sockaddr *)&addr, sizeof(addr));
-
-    (..)
-
-  The broadcast manager socket is able to handle any number of in flight
-  transmissions or receive filters concurrently. The different RX/TX jobs are
-  distinguished by the unique can_id in each BCM message. However additional
-  CAN_BCM sockets are recommended to communicate on multiple CAN interfaces.
-  When the broadcast manager socket is bound to 'any' CAN interface (=> the
-  interface index is set to zero) the configured receive filters apply to any
-  CAN interface unless the sendto() syscall is used to overrule the 'any' CAN
-  interface index. When using recvfrom() instead of read() to retrieve BCM
-  socket messages the originating CAN interface is provided in can_ifindex.
-
-  4.2.1 Broadcast Manager operations
-
-  The opcode defines the operation for the broadcast manager to carry out,
-  or details the broadcast managers response to several events, including
-  user requests.
-
-  Transmit Operations (user space to broadcast manager):
-
-    TX_SETUP:   Create (cyclic) transmission task.
-
-    TX_DELETE:  Remove (cyclic) transmission task, requires only can_id.
-
-    TX_READ:    Read properties of (cyclic) transmission task for can_id.
-
-    TX_SEND:    Send one CAN frame.
-
-  Transmit Responses (broadcast manager to user space):
-
-    TX_STATUS:  Reply to TX_READ request (transmission task configuration).
-
-    TX_EXPIRED: Notification when counter finishes sending at initial interval
-      'ival1'. Requires the TX_COUNTEVT flag to be set at TX_SETUP.
-
-  Receive Operations (user space to broadcast manager):
-
-    RX_SETUP:   Create RX content filter subscription.
-
-    RX_DELETE:  Remove RX content filter subscription, requires only can_id.
-
-    RX_READ:    Read properties of RX content filter subscription for can_id.
-
-  Receive Responses (broadcast manager to user space):
-
-    RX_STATUS:  Reply to RX_READ request (filter task configuration).
-
-    RX_TIMEOUT: Cyclic message is detected to be absent (timer ival1 expired).
-
-    RX_CHANGED: BCM message with updated CAN frame (detected content change).
-      Sent on first message received or on receipt of revised CAN messages.
-
-  4.2.2 Broadcast Manager message flags
-
-  When sending a message to the broadcast manager the 'flags' element may
-  contain the following flag definitions which influence the behaviour:
-
-    SETTIMER:           Set the values of ival1, ival2 and count
-
-    STARTTIMER:         Start the timer with the actual values of ival1, ival2
-      and count. Starting the timer leads simultaneously to emit a CAN frame.
-
-    TX_COUNTEVT:        Create the message TX_EXPIRED when count expires
-
-    TX_ANNOUNCE:        A change of data by the process is emitted immediately.
-
-    TX_CP_CAN_ID:       Copies the can_id from the message header to each
-      subsequent frame in frames. This is intended as usage simplification. For
-      TX tasks the unique can_id from the message header may differ from the
-      can_id(s) stored for transmission in the subsequent struct can_frame(s).
-
-    RX_FILTER_ID:       Filter by can_id alone, no frames required (nframes=0).
-
-    RX_CHECK_DLC:       A change of the DLC leads to an RX_CHANGED.
-
-    RX_NO_AUTOTIMER:    Prevent automatically starting the timeout monitor.
-
-    RX_ANNOUNCE_RESUME: If passed at RX_SETUP and a receive timeout occurred, a
-      RX_CHANGED message will be generated when the (cyclic) receive restarts.
-
-    TX_RESET_MULTI_IDX: Reset the index for the multiple frame transmission.
-
-    RX_RTR_FRAME:       Send reply for RTR-request (placed in op->frames[0]).
-
-  4.2.3 Broadcast Manager transmission timers
-
-  Periodic transmission configurations may use up to two interval timers.
-  In this case the BCM sends a number of messages ('count') at an interval
-  'ival1', then continuing to send at another given interval 'ival2'. When
-  only one timer is needed 'count' is set to zero and only 'ival2' is used.
-  When SET_TIMER and START_TIMER flag were set the timers are activated.
-  The timer values can be altered at runtime when only SET_TIMER is set.
-
-  4.2.4 Broadcast Manager message sequence transmission
-
-  Up to 256 CAN frames can be transmitted in a sequence in the case of a cyclic
-  TX task configuration. The number of CAN frames is provided in the 'nframes'
-  element of the BCM message head. The defined number of CAN frames are added
-  as array to the TX_SETUP BCM configuration message.
-
-    /* create a struct to set up a sequence of four CAN frames */
-    struct {
-            struct bcm_msg_head msg_head;
-            struct can_frame frame[4];
-    } mytxmsg;
-
-    (..)
-    mytxmsg.msg_head.nframes = 4;
-    (..)
-
-    write(s, &mytxmsg, sizeof(mytxmsg));
-
-  With every transmission the index in the array of CAN frames is increased
-  and set to zero at index overflow.
-
-  4.2.5 Broadcast Manager receive filter timers
-
-  The timer values ival1 or ival2 may be set to non-zero values at RX_SETUP.
-  When the SET_TIMER flag is set the timers are enabled:
-
-  ival1: Send RX_TIMEOUT when a received message is not received again within
-    the given time. When START_TIMER is set at RX_SETUP the timeout detection
-    is activated directly - even without a former CAN frame reception.
-
-  ival2: Throttle the received message rate down to the value of ival2. This
-    is useful to reduce messages for the application when the signal inside the
-    CAN frame is stateless as state changes within the ival2 periode may get
-    lost.
-
-  4.2.6 Broadcast Manager multiplex message receive filter
-
-  To filter for content changes in multiplex message sequences an array of more
-  than one CAN frames can be passed in a RX_SETUP configuration message. The
-  data bytes of the first CAN frame contain the mask of relevant bits that
-  have to match in the subsequent CAN frames with the received CAN frame.
-  If one of the subsequent CAN frames is matching the bits in that frame data
-  mark the relevant content to be compared with the previous received content.
-  Up to 257 CAN frames (multiplex filter bit mask CAN frame plus 256 CAN
-  filters) can be added as array to the TX_SETUP BCM configuration message.
-
-    /* usually used to clear CAN frame data[] - beware of endian problems! */
-    #define U64_DATA(p) (*(unsigned long long*)(p)->data)
-
-    struct {
-            struct bcm_msg_head msg_head;
-            struct can_frame frame[5];
-    } msg;
-
-    msg.msg_head.opcode  = RX_SETUP;
-    msg.msg_head.can_id  = 0x42;
-    msg.msg_head.flags   = 0;
-    msg.msg_head.nframes = 5;
-    U64_DATA(&msg.frame[0]) = 0xFF00000000000000ULL; /* MUX mask */
-    U64_DATA(&msg.frame[1]) = 0x01000000000000FFULL; /* data mask (MUX 0x01) */
-    U64_DATA(&msg.frame[2]) = 0x0200FFFF000000FFULL; /* data mask (MUX 0x02) */
-    U64_DATA(&msg.frame[3]) = 0x330000FFFFFF0003ULL; /* data mask (MUX 0x33) */
-    U64_DATA(&msg.frame[4]) = 0x4F07FC0FF0000000ULL; /* data mask (MUX 0x4F) */
-
-    write(s, &msg, sizeof(msg));
-
-  4.2.7 Broadcast Manager CAN FD support
-
-  The programming API of the CAN_BCM depends on struct can_frame which is
-  given as array directly behind the bcm_msg_head structure. To follow this
-  schema for the CAN FD frames a new flag 'CAN_FD_FRAME' in the bcm_msg_head
-  flags indicates that the concatenated CAN frame structures behind the
-  bcm_msg_head are defined as struct canfd_frame.
-
-    struct {
-            struct bcm_msg_head msg_head;
-            struct canfd_frame frame[5];
-    } msg;
-
-    msg.msg_head.opcode  = RX_SETUP;
-    msg.msg_head.can_id  = 0x42;
-    msg.msg_head.flags   = CAN_FD_FRAME;
-    msg.msg_head.nframes = 5;
-    (..)
-
-  When using CAN FD frames for multiplex filtering the MUX mask is still
-  expected in the first 64 bit of the struct canfd_frame data section.
-
-  4.3 connected transport protocols (SOCK_SEQPACKET)
-  4.4 unconnected transport protocols (SOCK_DGRAM)
-
-
-5. SocketCAN core module
--------------------------
-
-  The SocketCAN core module implements the protocol family
-  PF_CAN. CAN protocol modules are loaded by the core module at
-  runtime. The core module provides an interface for CAN protocol
-  modules to subscribe needed CAN IDs (see chapter 3.1).
-
-  5.1 can.ko module params
-
-  - stats_timer: To calculate the SocketCAN core statistics
-    (e.g. current/maximum frames per second) this 1 second timer is
-    invoked at can.ko module start time by default. This timer can be
-    disabled by using stattimer=0 on the module commandline.
-
-  - debug: (removed since SocketCAN SVN r546)
-
-  5.2 procfs content
-
-  As described in chapter 3.1 the SocketCAN core uses several filter
-  lists to deliver received CAN frames to CAN protocol modules. These
-  receive lists, their filters and the count of filter matches can be
-  checked in the appropriate receive list. All entries contain the
-  device and a protocol module identifier:
-
-    foo@bar:~$ cat /proc/net/can/rcvlist_all
-
-    receive list 'rx_all':
-      (vcan3: no entry)
-      (vcan2: no entry)
-      (vcan1: no entry)
-      device   can_id   can_mask  function  userdata   matches  ident
-       vcan0     000    00000000  f88e6370  f6c6f400         0  raw
-      (any: no entry)
-
-  In this example an application requests any CAN traffic from vcan0.
-
-    rcvlist_all - list for unfiltered entries (no filter operations)
-    rcvlist_eff - list for single extended frame (EFF) entries
-    rcvlist_err - list for error message frames masks
-    rcvlist_fil - list for mask/value filters
-    rcvlist_inv - list for mask/value filters (inverse semantic)
-    rcvlist_sff - list for single standard frame (SFF) entries
-
-  Additional procfs files in /proc/net/can
-
-    stats       - SocketCAN core statistics (rx/tx frames, match ratios, ...)
-    reset_stats - manual statistic reset
-    version     - prints the SocketCAN core version and the ABI version
-
-  5.3 writing own CAN protocol modules
-
-  To implement a new protocol in the protocol family PF_CAN a new
-  protocol has to be defined in include/linux/can.h .
-  The prototypes and definitions to use the SocketCAN core can be
-  accessed by including include/linux/can/core.h .
-  In addition to functions that register the CAN protocol and the
-  CAN device notifier chain there are functions to subscribe CAN
-  frames received by CAN interfaces and to send CAN frames:
-
-    can_rx_register   - subscribe CAN frames from a specific interface
-    can_rx_unregister - unsubscribe CAN frames from a specific interface
-    can_send          - transmit a CAN frame (optional with local loopback)
-
-  For details see the kerneldoc documentation in net/can/af_can.c or
-  the source code of net/can/raw.c or net/can/bcm.c .
-
-6. CAN network drivers
-----------------------
-
-  Writing a CAN network device driver is much easier than writing a
-  CAN character device driver. Similar to other known network device
-  drivers you mainly have to deal with:
-
-  - TX: Put the CAN frame from the socket buffer to the CAN controller.
-  - RX: Put the CAN frame from the CAN controller to the socket buffer.
-
-  See e.g. at Documentation/networking/netdevices.txt . The differences
-  for writing CAN network device driver are described below:
-
-  6.1 general settings
-
-    dev->type  = ARPHRD_CAN; /* the netdevice hardware type */
-    dev->flags = IFF_NOARP;  /* CAN has no arp */
-
-    dev->mtu = CAN_MTU; /* sizeof(struct can_frame) -> legacy CAN interface */
-
-    or alternative, when the controller supports CAN with flexible data rate:
-    dev->mtu = CANFD_MTU; /* sizeof(struct canfd_frame) -> CAN FD interface */
-
-  The struct can_frame or struct canfd_frame is the payload of each socket
-  buffer (skbuff) in the protocol family PF_CAN.
-
-  6.2 local loopback of sent frames
-
-  As described in chapter 3.2 the CAN network device driver should
-  support a local loopback functionality similar to the local echo
-  e.g. of tty devices. In this case the driver flag IFF_ECHO has to be
-  set to prevent the PF_CAN core from locally echoing sent frames
-  (aka loopback) as fallback solution:
-
-    dev->flags = (IFF_NOARP | IFF_ECHO);
-
-  6.3 CAN controller hardware filters
-
-  To reduce the interrupt load on deep embedded systems some CAN
-  controllers support the filtering of CAN IDs or ranges of CAN IDs.
-  These hardware filter capabilities vary from controller to
-  controller and have to be identified as not feasible in a multi-user
-  networking approach. The use of the very controller specific
-  hardware filters could make sense in a very dedicated use-case, as a
-  filter on driver level would affect all users in the multi-user
-  system. The high efficient filter sets inside the PF_CAN core allow
-  to set different multiple filters for each socket separately.
-  Therefore the use of hardware filters goes to the category 'handmade
-  tuning on deep embedded systems'. The author is running a MPC603e
-  @133MHz with four SJA1000 CAN controllers from 2002 under heavy bus
-  load without any problems ...
-
-  6.4 The virtual CAN driver (vcan)
-
-  Similar to the network loopback devices, vcan offers a virtual local
-  CAN interface. A full qualified address on CAN consists of
-
-  - a unique CAN Identifier (CAN ID)
-  - the CAN bus this CAN ID is transmitted on (e.g. can0)
-
-  so in common use cases more than one virtual CAN interface is needed.
-
-  The virtual CAN interfaces allow the transmission and reception of CAN
-  frames without real CAN controller hardware. Virtual CAN network
-  devices are usually named 'vcanX', like vcan0 vcan1 vcan2 ...
-  When compiled as a module the virtual CAN driver module is called vcan.ko
-
-  Since Linux Kernel version 2.6.24 the vcan driver supports the Kernel
-  netlink interface to create vcan network devices. The creation and
-  removal of vcan network devices can be managed with the ip(8) tool:
-
-  - Create a virtual CAN network interface:
-       $ ip link add type vcan
-
-  - Create a virtual CAN network interface with a specific name 'vcan42':
-       $ ip link add dev vcan42 type vcan
-
-  - Remove a (virtual CAN) network interface 'vcan42':
-       $ ip link del vcan42
-
-  6.5 The CAN network device driver interface
-
-  The CAN network device driver interface provides a generic interface
-  to setup, configure and monitor CAN network devices. The user can then
-  configure the CAN device, like setting the bit-timing parameters, via
-  the netlink interface using the program "ip" from the "IPROUTE2"
-  utility suite. The following chapter describes briefly how to use it.
-  Furthermore, the interface uses a common data structure and exports a
-  set of common functions, which all real CAN network device drivers
-  should use. Please have a look to the SJA1000 or MSCAN driver to
-  understand how to use them. The name of the module is can-dev.ko.
-
-  6.5.1 Netlink interface to set/get devices properties
-
-  The CAN device must be configured via netlink interface. The supported
-  netlink message types are defined and briefly described in
-  "include/linux/can/netlink.h". CAN link support for the program "ip"
-  of the IPROUTE2 utility suite is available and it can be used as shown
-  below:
-
-  - Setting CAN device properties:
-
-    $ ip link set can0 type can help
-    Usage: ip link set DEVICE type can
-        [ bitrate BITRATE [ sample-point SAMPLE-POINT] ] |
-        [ tq TQ prop-seg PROP_SEG phase-seg1 PHASE-SEG1
-          phase-seg2 PHASE-SEG2 [ sjw SJW ] ]
-
-        [ dbitrate BITRATE [ dsample-point SAMPLE-POINT] ] |
-        [ dtq TQ dprop-seg PROP_SEG dphase-seg1 PHASE-SEG1
-          dphase-seg2 PHASE-SEG2 [ dsjw SJW ] ]
-
-        [ loopback { on | off } ]
-        [ listen-only { on | off } ]
-        [ triple-sampling { on | off } ]
-        [ one-shot { on | off } ]
-        [ berr-reporting { on | off } ]
-        [ fd { on | off } ]
-        [ fd-non-iso { on | off } ]
-        [ presume-ack { on | off } ]
-
-        [ restart-ms TIME-MS ]
-        [ restart ]
-
-        Where: BITRATE       := { 1..1000000 }
-               SAMPLE-POINT  := { 0.000..0.999 }
-               TQ            := { NUMBER }
-               PROP-SEG      := { 1..8 }
-               PHASE-SEG1    := { 1..8 }
-               PHASE-SEG2    := { 1..8 }
-               SJW           := { 1..4 }
-               RESTART-MS    := { 0 | NUMBER }
-
-  - Display CAN device details and statistics:
-
-    $ ip -details -statistics link show can0
-    2: can0: <NOARP,UP,LOWER_UP,ECHO> mtu 16 qdisc pfifo_fast state UP qlen 10
-      link/can
-      can <TRIPLE-SAMPLING> state ERROR-ACTIVE restart-ms 100
-      bitrate 125000 sample_point 0.875
-      tq 125 prop-seg 6 phase-seg1 7 phase-seg2 2 sjw 1
-      sja1000: tseg1 1..16 tseg2 1..8 sjw 1..4 brp 1..64 brp-inc 1
-      clock 8000000
-      re-started bus-errors arbit-lost error-warn error-pass bus-off
-      41         17457      0          41         42         41
-      RX: bytes  packets  errors  dropped overrun mcast
-      140859     17608    17457   0       0       0
-      TX: bytes  packets  errors  dropped carrier collsns
-      861        112      0       41      0       0
-
-  More info to the above output:
-
-    "<TRIPLE-SAMPLING>"
-	Shows the list of selected CAN controller modes: LOOPBACK,
-	LISTEN-ONLY, or TRIPLE-SAMPLING.
-
-    "state ERROR-ACTIVE"
-	The current state of the CAN controller: "ERROR-ACTIVE",
-	"ERROR-WARNING", "ERROR-PASSIVE", "BUS-OFF" or "STOPPED"
-
-    "restart-ms 100"
-	Automatic restart delay time. If set to a non-zero value, a
-	restart of the CAN controller will be triggered automatically
-	in case of a bus-off condition after the specified delay time
-	in milliseconds. By default it's off.
-
-    "bitrate 125000 sample-point 0.875"
-	Shows the real bit-rate in bits/sec and the sample-point in the
-	range 0.000..0.999. If the calculation of bit-timing parameters
-	is enabled in the kernel (CONFIG_CAN_CALC_BITTIMING=y), the
-	bit-timing can be defined by setting the "bitrate" argument.
-	Optionally the "sample-point" can be specified. By default it's
-	0.000 assuming CIA-recommended sample-points.
-
-    "tq 125 prop-seg 6 phase-seg1 7 phase-seg2 2 sjw 1"
-	Shows the time quanta in ns, propagation segment, phase buffer
-	segment 1 and 2 and the synchronisation jump width in units of
-	tq. They allow to define the CAN bit-timing in a hardware
-	independent format as proposed by the Bosch CAN 2.0 spec (see
-	chapter 8 of http://www.semiconductors.bosch.de/pdf/can2spec.pdf).
-
-    "sja1000: tseg1 1..16 tseg2 1..8 sjw 1..4 brp 1..64 brp-inc 1
-     clock 8000000"
-	Shows the bit-timing constants of the CAN controller, here the
-	"sja1000". The minimum and maximum values of the time segment 1
-	and 2, the synchronisation jump width in units of tq, the
-	bitrate pre-scaler and the CAN system clock frequency in Hz.
-	These constants could be used for user-defined (non-standard)
-	bit-timing calculation algorithms in user-space.
-
-    "re-started bus-errors arbit-lost error-warn error-pass bus-off"
-	Shows the number of restarts, bus and arbitration lost errors,
-	and the state changes to the error-warning, error-passive and
-	bus-off state. RX overrun errors are listed in the "overrun"
-	field of the standard network statistics.
-
-  6.5.2 Setting the CAN bit-timing
-
-  The CAN bit-timing parameters can always be defined in a hardware
-  independent format as proposed in the Bosch CAN 2.0 specification
-  specifying the arguments "tq", "prop_seg", "phase_seg1", "phase_seg2"
-  and "sjw":
-
-    $ ip link set canX type can tq 125 prop-seg 6 \
-				phase-seg1 7 phase-seg2 2 sjw 1
-
-  If the kernel option CONFIG_CAN_CALC_BITTIMING is enabled, CIA
-  recommended CAN bit-timing parameters will be calculated if the bit-
-  rate is specified with the argument "bitrate":
-
-    $ ip link set canX type can bitrate 125000
-
-  Note that this works fine for the most common CAN controllers with
-  standard bit-rates but may *fail* for exotic bit-rates or CAN system
-  clock frequencies. Disabling CONFIG_CAN_CALC_BITTIMING saves some
-  space and allows user-space tools to solely determine and set the
-  bit-timing parameters. The CAN controller specific bit-timing
-  constants can be used for that purpose. They are listed by the
-  following command:
-
-    $ ip -details link show can0
-    ...
-      sja1000: clock 8000000 tseg1 1..16 tseg2 1..8 sjw 1..4 brp 1..64 brp-inc 1
-
-  6.5.3 Starting and stopping the CAN network device
-
-  A CAN network device is started or stopped as usual with the command
-  "ifconfig canX up/down" or "ip link set canX up/down". Be aware that
-  you *must* define proper bit-timing parameters for real CAN devices
-  before you can start it to avoid error-prone default settings:
-
-    $ ip link set canX up type can bitrate 125000
-
-  A device may enter the "bus-off" state if too many errors occurred on
-  the CAN bus. Then no more messages are received or sent. An automatic
-  bus-off recovery can be enabled by setting the "restart-ms" to a
-  non-zero value, e.g.:
-
-    $ ip link set canX type can restart-ms 100
-
-  Alternatively, the application may realize the "bus-off" condition
-  by monitoring CAN error message frames and do a restart when
-  appropriate with the command:
-
-    $ ip link set canX type can restart
-
-  Note that a restart will also create a CAN error message frame (see
-  also chapter 3.3).
-
-  6.6 CAN FD (flexible data rate) driver support
-
-  CAN FD capable CAN controllers support two different bitrates for the
-  arbitration phase and the payload phase of the CAN FD frame. Therefore a
-  second bit timing has to be specified in order to enable the CAN FD bitrate.
-
-  Additionally CAN FD capable CAN controllers support up to 64 bytes of
-  payload. The representation of this length in can_frame.can_dlc and
-  canfd_frame.len for userspace applications and inside the Linux network
-  layer is a plain value from 0 .. 64 instead of the CAN 'data length code'.
-  The data length code was a 1:1 mapping to the payload length in the legacy
-  CAN frames anyway. The payload length to the bus-relevant DLC mapping is
-  only performed inside the CAN drivers, preferably with the helper
-  functions can_dlc2len() and can_len2dlc().
-
-  The CAN netdevice driver capabilities can be distinguished by the network
-  devices maximum transfer unit (MTU):
-
-  MTU = 16 (CAN_MTU)   => sizeof(struct can_frame)   => 'legacy' CAN device
-  MTU = 72 (CANFD_MTU) => sizeof(struct canfd_frame) => CAN FD capable device
-
-  The CAN device MTU can be retrieved e.g. with a SIOCGIFMTU ioctl() syscall.
-  N.B. CAN FD capable devices can also handle and send legacy CAN frames.
-
-  When configuring CAN FD capable CAN controllers an additional 'data' bitrate
-  has to be set. This bitrate for the data phase of the CAN FD frame has to be
-  at least the bitrate which was configured for the arbitration phase. This
-  second bitrate is specified analogue to the first bitrate but the bitrate
-  setting keywords for the 'data' bitrate start with 'd' e.g. dbitrate,
-  dsample-point, dsjw or dtq and similar settings. When a data bitrate is set
-  within the configuration process the controller option "fd on" can be
-  specified to enable the CAN FD mode in the CAN controller. This controller
-  option also switches the device MTU to 72 (CANFD_MTU).
-
-  The first CAN FD specification presented as whitepaper at the International
-  CAN Conference 2012 needed to be improved for data integrity reasons.
-  Therefore two CAN FD implementations have to be distinguished today:
-
-  - ISO compliant:     The ISO 11898-1:2015 CAN FD implementation (default)
-  - non-ISO compliant: The CAN FD implementation following the 2012 whitepaper
-
-  Finally there are three types of CAN FD controllers:
-
-  1. ISO compliant (fixed)
-  2. non-ISO compliant (fixed, like the M_CAN IP core v3.0.1 in m_can.c)
-  3. ISO/non-ISO CAN FD controllers (switchable, like the PEAK PCAN-USB FD)
-
-  The current ISO/non-ISO mode is announced by the CAN controller driver via
-  netlink and displayed by the 'ip' tool (controller option FD-NON-ISO).
-  The ISO/non-ISO-mode can be altered by setting 'fd-non-iso {on|off}' for
-  switchable CAN FD controllers only.
-
-  Example configuring 500 kbit/s arbitration bitrate and 4 Mbit/s data bitrate:
-
-    $ ip link set can0 up type can bitrate 500000 sample-point 0.75 \
-                                   dbitrate 4000000 dsample-point 0.8 fd on
-    $ ip -details link show can0
-    5: can0: <NOARP,UP,LOWER_UP,ECHO> mtu 72 qdisc pfifo_fast state UNKNOWN \
-             mode DEFAULT group default qlen 10
-    link/can  promiscuity 0
-    can <FD> state ERROR-ACTIVE (berr-counter tx 0 rx 0) restart-ms 0
-          bitrate 500000 sample-point 0.750
-          tq 50 prop-seg 14 phase-seg1 15 phase-seg2 10 sjw 1
-          pcan_usb_pro_fd: tseg1 1..64 tseg2 1..16 sjw 1..16 brp 1..1024 \
-          brp-inc 1
-          dbitrate 4000000 dsample-point 0.800
-          dtq 12 dprop-seg 7 dphase-seg1 8 dphase-seg2 4 dsjw 1
-          pcan_usb_pro_fd: dtseg1 1..16 dtseg2 1..8 dsjw 1..4 dbrp 1..1024 \
-          dbrp-inc 1
-          clock 80000000
-
-  Example when 'fd-non-iso on' is added on this switchable CAN FD adapter:
-   can <FD,FD-NON-ISO> state ERROR-ACTIVE (berr-counter tx 0 rx 0) restart-ms 0
-
-  6.7 Supported CAN hardware
-
-  Please check the "Kconfig" file in "drivers/net/can" to get an actual
-  list of the support CAN hardware. On the SocketCAN project website
-  (see chapter 7) there might be further drivers available, also for
-  older kernel versions.
-
-7. SocketCAN resources
------------------------
-
-  The Linux CAN / SocketCAN project resources (project site / mailing list)
-  are referenced in the MAINTAINERS file in the Linux source tree.
-  Search for CAN NETWORK [LAYERS|DRIVERS].
-
-8. Credits
-----------
-
-  Oliver Hartkopp (PF_CAN core, filters, drivers, bcm, SJA1000 driver)
-  Urs Thuermann (PF_CAN core, kernel integration, socket interfaces, raw, vcan)
-  Jan Kizka (RT-SocketCAN core, Socket-API reconciliation)
-  Wolfgang Grandegger (RT-SocketCAN core & drivers, Raw Socket-API reviews,
-                       CAN device driver interface, MSCAN driver)
-  Robert Schwebel (design reviews, PTXdist integration)
-  Marc Kleine-Budde (design reviews, Kernel 2.6 cleanups, drivers)
-  Benedikt Spranger (reviews)
-  Thomas Gleixner (LKML reviews, coding style, posting hints)
-  Andrey Volkov (kernel subtree structure, ioctls, MSCAN driver)
-  Matthias Brukner (first SJA1000 CAN netdevice implementation Q2/2003)
-  Klaus Hitschler (PEAK driver integration)
-  Uwe Koppe (CAN netdevices with PF_PACKET approach)
-  Michael Schulze (driver layer loopback requirement, RT CAN drivers review)
-  Pavel Pisa (Bit-timing calculation)
-  Sascha Hauer (SJA1000 platform driver)
-  Sebastian Haas (SJA1000 EMS PCI driver)
-  Markus Plessing (SJA1000 EMS PCI driver)
-  Per Dalen (SJA1000 Kvaser PCI driver)
-  Sam Ravnborg (reviews, coding style, kbuild help)
diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
index 87814859cfc2..a4508ec1816b 100644
--- a/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@ -1134,7 +1134,7 @@ The verifier's knowledge about the variable offset consists of:
 mask and value; no bit should ever be 1 in both.  For example, if a byte is read
 into a register from memory, the register's top 56 bits are known zero, while
 the low 8 are unknown - which is represented as the tnum (0x0; 0xff).  If we
-then OR this with 0x40, we get (0x40; 0xcf), then if we add 1 we get (0x0;
+then OR this with 0x40, we get (0x40; 0xbf), then if we add 1 we get (0x0;
 0x1ff), because of potential carries.
 Besides arithmetic, the register state can also be updated by conditional
 branches.  For instance, if a SCALAR_VALUE is compared > 8, in the 'true' branch
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst
index 7d4b15977d61..90966c2692d8 100644
--- a/Documentation/networking/index.rst
+++ b/Documentation/networking/index.rst
@@ -7,6 +7,7 @@ Contents:
    :maxdepth: 2
 
    batman-adv
+   can
    kapi
    z8530book
    msg_zerocopy
diff --git a/Documentation/networking/xfrm_device.txt b/Documentation/networking/xfrm_device.txt
index 2d9d588cd34b..50c34ca65efe 100644
--- a/Documentation/networking/xfrm_device.txt
+++ b/Documentation/networking/xfrm_device.txt
@@ -41,6 +41,7 @@ struct xfrmdev_ops {
 	void	(*xdo_dev_state_free) (struct xfrm_state *x);
 	bool	(*xdo_dev_offload_ok) (struct sk_buff *skb,
 				       struct xfrm_state *x);
+	void	(*xdo_dev_state_advance_esn) (struct xfrm_state *x);
 };
 
 The NIC driver offering ipsec offload will need to implement these
@@ -117,6 +118,8 @@ the stack in xfrm_input().
 
 	hand the packet to napi_gro_receive() as usual
 
+In ESN mode, xdo_dev_state_advance_esn() is called from xfrm_replay_advance_esn().
+The driver checks the packet sequence number and updates the HW ESN state machine if needed.
 
 When the SA is removed by the user, the driver's xdo_dev_state_delete()
 is asked to disable the offload.  Later, xdo_dev_state_free() is called
diff --git a/MAINTAINERS b/MAINTAINERS
index 51e3a0d503dc..884ee9601707 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3206,7 +3206,7 @@ W:	https://github.com/linux-can
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/mkl/linux-can.git
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/mkl/linux-can-next.git
 S:	Maintained
-F:	Documentation/networking/can.txt
+F:	Documentation/networking/can.rst
 F:	net/can/
 F:	include/linux/can/core.h
 F:	include/uapi/linux/can.h
diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index 41e2feb0cf4f..b5030e1a41d8 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -363,15 +363,7 @@ static inline int epilogue_offset(const struct jit_ctx *ctx)
 static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op)
 {
 	const u8 *tmp = bpf2a32[TMP_REG_1];
-	s32 jmp_offset;
 
-	/* checks if divisor is zero or not. If it is, then
-	 * exit directly.
-	 */
-	emit(ARM_CMP_I(rn, 0), ctx);
-	_emit(ARM_COND_EQ, ARM_MOV_I(ARM_R0, 0), ctx);
-	jmp_offset = epilogue_offset(ctx);
-	_emit(ARM_COND_EQ, ARM_B(jmp_offset), ctx);
 #if __LINUX_ARM_ARCH__ == 7
 	if (elf_hwcap & HWCAP_IDIVA) {
 		if (op == BPF_DIV)
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 0775d5ab8ee9..1d4f1da7c58f 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -390,18 +390,6 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
 	case BPF_ALU64 | BPF_DIV | BPF_X:
 	case BPF_ALU | BPF_MOD | BPF_X:
 	case BPF_ALU64 | BPF_MOD | BPF_X:
-	{
-		const u8 r0 = bpf2a64[BPF_REG_0];
-
-		/* if (src == 0) return 0 */
-		jmp_offset = 3; /* skip ahead to else path */
-		check_imm19(jmp_offset);
-		emit(A64_CBNZ(is64, src, jmp_offset), ctx);
-		emit(A64_MOVZ(1, r0, 0, 0), ctx);
-		jmp_offset = epilogue_offset(ctx);
-		check_imm26(jmp_offset);
-		emit(A64_B(jmp_offset), ctx);
-		/* else */
 		switch (BPF_OP(code)) {
 		case BPF_DIV:
 			emit(A64_UDIV(is64, dst, dst, src), ctx);
@@ -413,7 +401,6 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
 			break;
 		}
 		break;
-	}
 	case BPF_ALU | BPF_LSH | BPF_X:
 	case BPF_ALU64 | BPF_LSH | BPF_X:
 		emit(A64_LSLV(is64, dst, dst, src), ctx);
diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c
index 4e347030ed2c..3e2798bfea4f 100644
--- a/arch/mips/net/ebpf_jit.c
+++ b/arch/mips/net/ebpf_jit.c
@@ -741,16 +741,11 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
 		break;
 	case BPF_ALU | BPF_DIV | BPF_K: /* ALU_IMM */
 	case BPF_ALU | BPF_MOD | BPF_K: /* ALU_IMM */
+		if (insn->imm == 0)
+			return -EINVAL;
 		dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
 		if (dst < 0)
 			return dst;
-		if (insn->imm == 0) { /* Div by zero */
-			b_off = b_imm(exit_idx, ctx);
-			if (is_bad_offset(b_off))
-				return -E2BIG;
-			emit_instr(ctx, beq, MIPS_R_ZERO, MIPS_R_ZERO, b_off);
-			emit_instr(ctx, addu, MIPS_R_V0, MIPS_R_ZERO, MIPS_R_ZERO);
-		}
 		td = get_reg_val_type(ctx, this_idx, insn->dst_reg);
 		if (td == REG_64BIT || td == REG_32BIT_ZERO_EX)
 			/* sign extend */
@@ -770,19 +765,13 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
 		break;
 	case BPF_ALU64 | BPF_DIV | BPF_K: /* ALU_IMM */
 	case BPF_ALU64 | BPF_MOD | BPF_K: /* ALU_IMM */
+		if (insn->imm == 0)
+			return -EINVAL;
 		dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
 		if (dst < 0)
 			return dst;
-		if (insn->imm == 0) { /* Div by zero */
-			b_off = b_imm(exit_idx, ctx);
-			if (is_bad_offset(b_off))
-				return -E2BIG;
-			emit_instr(ctx, beq, MIPS_R_ZERO, MIPS_R_ZERO, b_off);
-			emit_instr(ctx, addu, MIPS_R_V0, MIPS_R_ZERO, MIPS_R_ZERO);
-		}
 		if (get_reg_val_type(ctx, this_idx, insn->dst_reg) == REG_32BIT)
 			emit_instr(ctx, dinsu, dst, MIPS_R_ZERO, 32, 32);
-
 		if (insn->imm == 1) {
 			/* div by 1 is a nop, mod by 1 is zero */
 			if (bpf_op == BPF_MOD)
@@ -860,11 +849,6 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
 			break;
 		case BPF_DIV:
 		case BPF_MOD:
-			b_off = b_imm(exit_idx, ctx);
-			if (is_bad_offset(b_off))
-				return -E2BIG;
-			emit_instr(ctx, beq, src, MIPS_R_ZERO, b_off);
-			emit_instr(ctx, movz, MIPS_R_V0, MIPS_R_ZERO, src);
 			emit_instr(ctx, ddivu, dst, src);
 			if (bpf_op == BPF_DIV)
 				emit_instr(ctx, mflo, dst);
@@ -943,11 +927,6 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
 			break;
 		case BPF_DIV:
 		case BPF_MOD:
-			b_off = b_imm(exit_idx, ctx);
-			if (is_bad_offset(b_off))
-				return -E2BIG;
-			emit_instr(ctx, beq, src, MIPS_R_ZERO, b_off);
-			emit_instr(ctx, movz, MIPS_R_V0, MIPS_R_ZERO, src);
 			emit_instr(ctx, divu, dst, src);
 			if (bpf_op == BPF_DIV)
 				emit_instr(ctx, mflo, dst);
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 217a78e84865..0a34b0cec7b7 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -381,10 +381,6 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image,
 			goto bpf_alu32_trunc;
 		case BPF_ALU | BPF_DIV | BPF_X: /* (u32) dst /= (u32) src */
 		case BPF_ALU | BPF_MOD | BPF_X: /* (u32) dst %= (u32) src */
-			PPC_CMPWI(src_reg, 0);
-			PPC_BCC_SHORT(COND_NE, (ctx->idx * 4) + 12);
-			PPC_LI(b2p[BPF_REG_0], 0);
-			PPC_JMP(exit_addr);
 			if (BPF_OP(code) == BPF_MOD) {
 				PPC_DIVWU(b2p[TMP_REG_1], dst_reg, src_reg);
 				PPC_MULW(b2p[TMP_REG_1], src_reg,
@@ -395,10 +391,6 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image,
 			goto bpf_alu32_trunc;
 		case BPF_ALU64 | BPF_DIV | BPF_X: /* dst /= src */
 		case BPF_ALU64 | BPF_MOD | BPF_X: /* dst %= src */
-			PPC_CMPDI(src_reg, 0);
-			PPC_BCC_SHORT(COND_NE, (ctx->idx * 4) + 12);
-			PPC_LI(b2p[BPF_REG_0], 0);
-			PPC_JMP(exit_addr);
 			if (BPF_OP(code) == BPF_MOD) {
 				PPC_DIVD(b2p[TMP_REG_1], dst_reg, src_reg);
 				PPC_MULD(b2p[TMP_REG_1], src_reg,
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index e50188773ff3..78a19c93b380 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -610,11 +610,6 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
 	{
 		int rc_reg = BPF_OP(insn->code) == BPF_DIV ? REG_W1 : REG_W0;
 
-		jit->seen |= SEEN_RET0;
-		/* ltr %src,%src (if src == 0 goto fail) */
-		EMIT2(0x1200, src_reg, src_reg);
-		/* jz <ret0> */
-		EMIT4_PCREL(0xa7840000, jit->ret0_ip - jit->prg);
 		/* lhi %w0,0 */
 		EMIT4_IMM(0xa7080000, REG_W0, 0);
 		/* lr %w1,%dst */
@@ -630,11 +625,6 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
 	{
 		int rc_reg = BPF_OP(insn->code) == BPF_DIV ? REG_W1 : REG_W0;
 
-		jit->seen |= SEEN_RET0;
-		/* ltgr %src,%src (if src == 0 goto fail) */
-		EMIT4(0xb9020000, src_reg, src_reg);
-		/* jz <ret0> */
-		EMIT4_PCREL(0xa7840000, jit->ret0_ip - jit->prg);
 		/* lghi %w0,0 */
 		EMIT4_IMM(0xa7090000, REG_W0, 0);
 		/* lgr %w1,%dst */
diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c
index 50a24d7bd4c5..48a25869349b 100644
--- a/arch/sparc/net/bpf_jit_comp_64.c
+++ b/arch/sparc/net/bpf_jit_comp_64.c
@@ -967,31 +967,17 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
 		emit_alu(MULX, src, dst, ctx);
 		break;
 	case BPF_ALU | BPF_DIV | BPF_X:
-		emit_cmp(src, G0, ctx);
-		emit_branch(BE|ANNUL, ctx->idx, ctx->epilogue_offset, ctx);
-		emit_loadimm(0, bpf2sparc[BPF_REG_0], ctx);
-
 		emit_write_y(G0, ctx);
 		emit_alu(DIV, src, dst, ctx);
 		break;
-
 	case BPF_ALU64 | BPF_DIV | BPF_X:
-		emit_cmp(src, G0, ctx);
-		emit_branch(BE|ANNUL, ctx->idx, ctx->epilogue_offset, ctx);
-		emit_loadimm(0, bpf2sparc[BPF_REG_0], ctx);
-
 		emit_alu(UDIVX, src, dst, ctx);
 		break;
-
 	case BPF_ALU | BPF_MOD | BPF_X: {
 		const u8 tmp = bpf2sparc[TMP_REG_1];
 
 		ctx->tmp_1_used = true;
 
-		emit_cmp(src, G0, ctx);
-		emit_branch(BE|ANNUL, ctx->idx, ctx->epilogue_offset, ctx);
-		emit_loadimm(0, bpf2sparc[BPF_REG_0], ctx);
-
 		emit_write_y(G0, ctx);
 		emit_alu3(DIV, dst, src, tmp, ctx);
 		emit_alu3(MULX, tmp, src, tmp, ctx);
@@ -1003,10 +989,6 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
 
 		ctx->tmp_1_used = true;
 
-		emit_cmp(src, G0, ctx);
-		emit_branch(BE|ANNUL, ctx->idx, ctx->epilogue_offset, ctx);
-		emit_loadimm(0, bpf2sparc[BPF_REG_0], ctx);
-
 		emit_alu3(UDIVX, dst, src, tmp, ctx);
 		emit_alu3(MULX, tmp, src, tmp, ctx);
 		emit_alu3(SUB, dst, tmp, dst, ctx);
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 5acee5139e28..4923d92f918d 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -568,26 +568,6 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 			 */
 			EMIT2(0x31, 0xd2);
 
-			if (BPF_SRC(insn->code) == BPF_X) {
-				/* if (src_reg == 0) return 0 */
-
-				/* cmp r11, 0 */
-				EMIT4(0x49, 0x83, 0xFB, 0x00);
-
-				/* jne .+9 (skip over pop, pop, xor and jmp) */
-				EMIT2(X86_JNE, 1 + 1 + 2 + 5);
-				EMIT1(0x5A); /* pop rdx */
-				EMIT1(0x58); /* pop rax */
-				EMIT2(0x31, 0xc0); /* xor eax, eax */
-
-				/* jmp cleanup_addr
-				 * addrs[i] - 11, because there are 11 bytes
-				 * after this insn: div, mov, pop, pop, mov
-				 */
-				jmp_offset = ctx->cleanup_addr - (addrs[i] - 11);
-				EMIT1_off32(0xE9, jmp_offset);
-			}
-
 			if (BPF_CLASS(insn->code) == BPF_ALU64)
 				/* div r11 */
 				EMIT3(0x49, 0xF7, 0xF3);
diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c
index cc94604b23e0..b1779566c5bb 100644
--- a/drivers/net/can/dev.c
+++ b/drivers/net/can/dev.c
@@ -412,7 +412,7 @@ EXPORT_SYMBOL_GPL(can_change_state);
  * Local echo of CAN messages
  *
  * CAN network devices *should* support a local echo functionality
- * (see Documentation/networking/can.txt). To test the handling of CAN
+ * (see Documentation/networking/can.rst). To test the handling of CAN
  * interfaces that do not support the local echo both driver types are
  * implemented. In the case that the driver does not support the echo
  * the IFF_ECHO remains clear in dev->flags. This causes the PF_CAN core
diff --git a/drivers/net/can/vcan.c b/drivers/net/can/vcan.c
index a8cb33264ff1..c2b04f505e16 100644
--- a/drivers/net/can/vcan.c
+++ b/drivers/net/can/vcan.c
@@ -61,7 +61,7 @@ MODULE_ALIAS_RTNL_LINK(DRV_NAME);
 /*
  * CAN test feature:
  * Enable the echo on driver level for testing the CAN core echo modes.
- * See Documentation/networking/can.txt for details.
+ * See Documentation/networking/can.rst for details.
  */
 
 static bool echo; /* echo testing. Default: 0 (Off) */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c
index 8b95117c2923..557fd8bfd54e 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c
@@ -1567,6 +1567,12 @@ int cudbg_collect_tid(struct cudbg_init *pdbg_init,
 	tid1->ver_hdr.size = sizeof(struct cudbg_tid_info_region_rev1) -
 			     sizeof(struct cudbg_ver_hdr);
 
+	/* If firmware is not attached/alive, use backdoor register
+	 * access to collect dump.
+	 */
+	if (!is_fw_attached(pdbg_init))
+		goto fill_tid;
+
 #define FW_PARAM_PFVF_A(param) \
 	(FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_PFVF) | \
 	 FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_PFVF_##param) | \
@@ -1604,6 +1610,9 @@ int cudbg_collect_tid(struct cudbg_init *pdbg_init,
 		tid->nhpftids = val[1] - val[0] + 1;
 	}
 
+#undef FW_PARAM_PFVF_A
+
+fill_tid:
 	tid->ntids = padap->tids.ntids;
 	tid->nstids = padap->tids.nstids;
 	tid->stid_base = padap->tids.stid_base;
@@ -1623,8 +1632,6 @@ int cudbg_collect_tid(struct cudbg_init *pdbg_init,
 	tid->ip_users = t4_read_reg(padap, LE_DB_ACT_CNT_IPV4_A);
 	tid->ipv6_users = t4_read_reg(padap, LE_DB_ACT_CNT_IPV6_A);
 
-#undef FW_PARAM_PFVF_A
-
 	return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff);
 }
 
@@ -1866,11 +1873,18 @@ int cudbg_collect_dump_context(struct cudbg_init *pdbg_init,
 		max_ctx_size = region_info[i].end - region_info[i].start + 1;
 		max_ctx_qid = max_ctx_size / SGE_CTXT_SIZE;
 
-		t4_sge_ctxt_flush(padap, padap->mbox, i);
-		rc = t4_memory_rw(padap, MEMWIN_NIC, mem_type[i],
-				  region_info[i].start, max_ctx_size,
-				  (__be32 *)ctx_buf, 1);
-		if (rc) {
+		/* If firmware is not attached/alive, use backdoor register
+		 * access to collect dump.
+		 */
+		if (is_fw_attached(pdbg_init)) {
+			t4_sge_ctxt_flush(padap, padap->mbox, i);
+
+			rc = t4_memory_rw(padap, MEMWIN_NIC, mem_type[i],
+					  region_info[i].start, max_ctx_size,
+					  (__be32 *)ctx_buf, 1);
+		}
+
+		if (rc || !is_fw_attached(pdbg_init)) {
 			max_ctx_qid = CUDBG_LOWMEM_MAX_CTXT_QIDS;
 			cudbg_get_sge_ctxt_fw(pdbg_init, max_ctx_qid, i,
 					      &buff);
@@ -1946,9 +1960,10 @@ static void cudbg_mps_rpl_backdoor(struct adapter *padap,
 	mps_rplc->rplc31_0 = htonl(t4_read_reg(padap, MPS_VF_RPLCT_MAP0_A));
 }
 
-static int cudbg_collect_tcam_index(struct adapter *padap,
+static int cudbg_collect_tcam_index(struct cudbg_init *pdbg_init,
 				    struct cudbg_mps_tcam *tcam, u32 idx)
 {
+	struct adapter *padap = pdbg_init->adap;
 	u64 tcamy, tcamx, val;
 	u32 ctl, data2;
 	int rc = 0;
@@ -2033,12 +2048,22 @@ static int cudbg_collect_tcam_index(struct adapter *padap,
 			htons(FW_LDST_CMD_FID_V(FW_LDST_MPS_RPLC) |
 			      FW_LDST_CMD_IDX_V(idx));
 
-		rc = t4_wr_mbox(padap, padap->mbox, &ldst_cmd, sizeof(ldst_cmd),
-				&ldst_cmd);
-		if (rc)
+		/* If firmware is not attached/alive, use backdoor register
+		 * access to collect dump.
+		 */
+		if (is_fw_attached(pdbg_init))
+			rc = t4_wr_mbox(padap, padap->mbox, &ldst_cmd,
+					sizeof(ldst_cmd), &ldst_cmd);
+
+		if (rc || !is_fw_attached(pdbg_init)) {
 			cudbg_mps_rpl_backdoor(padap, &mps_rplc);
-		else
+			/* Ignore error since we collected directly from
+			 * reading registers.
+			 */
+			rc = 0;
+		} else {
 			mps_rplc = ldst_cmd.u.mps.rplc;
+		}
 
 		tcam->rplc[0] = ntohl(mps_rplc.rplc31_0);
 		tcam->rplc[1] = ntohl(mps_rplc.rplc63_32);
@@ -2075,7 +2100,7 @@ int cudbg_collect_mps_tcam(struct cudbg_init *pdbg_init,
 
 	tcam = (struct cudbg_mps_tcam *)temp_buff.data;
 	for (i = 0; i < n; i++) {
-		rc = cudbg_collect_tcam_index(padap, tcam, i);
+		rc = cudbg_collect_tcam_index(pdbg_init, tcam, i);
 		if (rc) {
 			cudbg_err->sys_err = rc;
 			cudbg_put_buff(pdbg_init, &temp_buff);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index af27d2b0f79f..047609ef0515 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -195,9 +195,11 @@ static void t4_report_fw_error(struct adapter *adap)
 	u32 pcie_fw;
 
 	pcie_fw = t4_read_reg(adap, PCIE_FW_A);
-	if (pcie_fw & PCIE_FW_ERR_F)
+	if (pcie_fw & PCIE_FW_ERR_F) {
 		dev_err(adap->pdev_dev, "Firmware reports adapter error: %s\n",
 			reason[PCIE_FW_EVAL_G(pcie_fw)]);
+		adap->flags &= ~FW_OK;
+	}
 }
 
 /*
@@ -5088,7 +5090,7 @@ int t4_read_rss(struct adapter *adapter, u16 *map)
 
 static unsigned int t4_use_ldst(struct adapter *adap)
 {
-	return (adap->flags & FW_OK) || !adap->use_bd;
+	return (adap->flags & FW_OK) && !adap->use_bd;
 }
 
 /**
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
index 741020534b16..b034c7f24eda 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
@@ -1109,6 +1109,8 @@ static const struct ethtool_ops hns3vf_ethtool_ops = {
 	.set_rxfh = hns3_set_rss,
 	.get_link_ksettings = hns3_get_link_ksettings,
 	.get_channels = hns3_get_channels,
+	.get_coalesce = hns3_get_coalesce,
+	.set_coalesce = hns3_set_coalesce,
 };
 
 static const struct ethtool_ops hns3_ethtool_ops = {
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
index 96f453ff84b5..f38fc5ce9f51 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
@@ -116,6 +116,9 @@ static int hclge_get_ring_chain_from_mbx(
 	hnae_set_bit(ring_chain->flag, HNAE3_RING_TYPE_B, req->msg[3]);
 	ring_chain->tqp_index =
 			hclge_get_queue_id(vport->nic.kinfo.tqp[req->msg[4]]);
+	hnae_set_field(ring_chain->int_gl_idx, HCLGE_INT_GL_IDX_M,
+		       HCLGE_INT_GL_IDX_S,
+		       req->msg[5]);
 
 	cur_chain = ring_chain;
 
@@ -133,6 +136,11 @@ static int hclge_get_ring_chain_from_mbx(
 			[req->msg[HCLGE_RING_NODE_VARIABLE_NUM * i +
 			HCLGE_RING_MAP_MBX_BASIC_MSG_NUM + 1]]);
 
+		hnae_set_field(new_chain->int_gl_idx, HCLGE_INT_GL_IDX_M,
+			       HCLGE_INT_GL_IDX_S,
+			       req->msg[HCLGE_RING_NODE_VARIABLE_NUM * i +
+			       HCLGE_RING_MAP_MBX_BASIC_MSG_NUM + 2]);
+
 		cur_chain->next = new_chain;
 		cur_chain = new_chain;
 	}
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
index 3d2bc9a971fa..0d89965f7928 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
@@ -565,6 +565,11 @@ static int hclgevf_bind_ring_to_vector(struct hnae3_handle *handle, bool en,
 				hnae_get_bit(node->flag, HNAE3_RING_TYPE_B);
 		req->msg[HCLGEVF_RING_NODE_VARIABLE_NUM * i + 1] =
 				node->tqp_index;
+		req->msg[HCLGEVF_RING_NODE_VARIABLE_NUM * i + 2] =
+				hnae_get_field(node->int_gl_idx,
+					       HNAE3_RING_GL_IDX_M,
+					       HNAE3_RING_GL_IDX_S);
+
 		if (i == (HCLGE_MBX_VF_MSG_DATA_NUM -
 		    HCLGEVF_RING_MAP_MBX_BASIC_MSG_NUM) /
 		    HCLGEVF_RING_NODE_VARIABLE_NUM) {
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
index 7ac7ef9b37ff..61188f343955 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
@@ -4087,7 +4087,7 @@ void ixgbe_get_oem_prod_version(struct ixgbe_hw *hw,
 	hw->eeprom.ops.read(hw, NVM_OEM_PROD_VER_PTR, &offset);
 
 	/* Return is offset to OEM Product Version block is invalid */
-	if (offset == 0x0 && offset == NVM_INVALID_PTR)
+	if (offset == 0x0 || offset == NVM_INVALID_PTR)
 		return;
 
 	/* Read product version block */
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index 317351025fd7..221f15803480 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -3085,26 +3085,9 @@ static int ixgbe_get_ts_info(struct net_device *dev,
 	case ixgbe_mac_X550EM_x:
 	case ixgbe_mac_x550em_a:
 		info->rx_filters |= BIT(HWTSTAMP_FILTER_ALL);
-		/* fallthrough */
+		break;
 	case ixgbe_mac_X540:
 	case ixgbe_mac_82599EB:
-		info->so_timestamping =
-			SOF_TIMESTAMPING_TX_SOFTWARE |
-			SOF_TIMESTAMPING_RX_SOFTWARE |
-			SOF_TIMESTAMPING_SOFTWARE |
-			SOF_TIMESTAMPING_TX_HARDWARE |
-			SOF_TIMESTAMPING_RX_HARDWARE |
-			SOF_TIMESTAMPING_RAW_HARDWARE;
-
-		if (adapter->ptp_clock)
-			info->phc_index = ptp_clock_index(adapter->ptp_clock);
-		else
-			info->phc_index = -1;
-
-		info->tx_types =
-			BIT(HWTSTAMP_TX_OFF) |
-			BIT(HWTSTAMP_TX_ON);
-
 		info->rx_filters |=
 			BIT(HWTSTAMP_FILTER_PTP_V1_L4_SYNC) |
 			BIT(HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ) |
@@ -3113,6 +3096,24 @@ static int ixgbe_get_ts_info(struct net_device *dev,
 	default:
 		return ethtool_op_get_ts_info(dev, info);
 	}
+
+	info->so_timestamping =
+		SOF_TIMESTAMPING_TX_SOFTWARE |
+		SOF_TIMESTAMPING_RX_SOFTWARE |
+		SOF_TIMESTAMPING_SOFTWARE |
+		SOF_TIMESTAMPING_TX_HARDWARE |
+		SOF_TIMESTAMPING_RX_HARDWARE |
+		SOF_TIMESTAMPING_RAW_HARDWARE;
+
+	if (adapter->ptp_clock)
+		info->phc_index = ptp_clock_index(adapter->ptp_clock);
+	else
+		info->phc_index = -1;
+
+	info->tx_types =
+		BIT(HWTSTAMP_TX_OFF) |
+		BIT(HWTSTAMP_TX_ON);
+
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index bbb622f15a77..0da5aa2c8aba 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -4133,11 +4133,15 @@ void ixgbe_configure_rx_ring(struct ixgbe_adapter *adapter,
 		rxdctl &= ~0x3FFFFF;
 		rxdctl |=  0x080420;
 #if (PAGE_SIZE < 8192)
-	} else {
+	/* RXDCTL.RLPML does not work on 82599 */
+	} else if (hw->mac.type != ixgbe_mac_82599EB) {
 		rxdctl &= ~(IXGBE_RXDCTL_RLPMLMASK |
 			    IXGBE_RXDCTL_RLPML_EN);
 
-		/* Limit the maximum frame size so we don't overrun the skb */
+		/* Limit the maximum frame size so we don't overrun the skb.
+		 * This can happen in SRIOV mode when the MTU of the VF is
+		 * higher than the MTU of the PF.
+		 */
 		if (ring_uses_build_skb(ring) &&
 		    !test_bit(__IXGBE_RX_3K_BUFFER, &ring->state))
 			rxdctl |= IXGBE_MAX_2K_FRAME_BUILD_SKB |
@@ -7259,6 +7263,9 @@ static void ixgbe_watchdog_link_is_up(struct ixgbe_adapter *adapter)
 	case IXGBE_LINK_SPEED_10GB_FULL:
 		speed_str = "10 Gbps";
 		break;
+	case IXGBE_LINK_SPEED_5GB_FULL:
+		speed_str = "5 Gbps";
+		break;
 	case IXGBE_LINK_SPEED_2_5GB_FULL:
 		speed_str = "2.5 Gbps";
 		break;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
index 3bce26e77090..f470d0204771 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
@@ -949,7 +949,7 @@ static s32 ixgbe_checksum_ptr_x550(struct ixgbe_hw *hw, u16 ptr,
 	u16 length, bufsz, i, start;
 	u16 *local_buffer;
 
-	bufsz = sizeof(buf) / sizeof(buf[0]);
+	bufsz = ARRAY_SIZE(buf);
 
 	/* Read a chunk at the pointer location */
 	if (!buffer) {
diff --git a/drivers/net/ethernet/intel/ixgbevf/ethtool.c b/drivers/net/ethernet/intel/ixgbevf/ethtool.c
index ff9d05f308ee..4400e49090b4 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ethtool.c
@@ -75,6 +75,9 @@ static struct ixgbe_stats ixgbevf_gstrings_stats[] = {
 	IXGBEVF_STAT("tx_timeout_count", tx_timeout_count),
 	IXGBEVF_NETDEV_STAT(multicast),
 	IXGBEVF_STAT("rx_csum_offload_errors", hw_csum_rx_error),
+	IXGBEVF_STAT("alloc_rx_page", alloc_rx_page),
+	IXGBEVF_STAT("alloc_rx_page_failed", alloc_rx_page_failed),
+	IXGBEVF_STAT("alloc_rx_buff_failed", alloc_rx_buff_failed),
 };
 
 #define IXGBEVF_QUEUE_STATS_LEN ( \
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
index 581f44bbd7b3..f6952425c87d 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
@@ -62,7 +62,12 @@ struct ixgbevf_tx_buffer {
 struct ixgbevf_rx_buffer {
 	dma_addr_t dma;
 	struct page *page;
-	unsigned int page_offset;
+#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)
+	__u32 page_offset;
+#else
+	__u16 page_offset;
+#endif
+	__u16 pagecnt_bias;
 };
 
 struct ixgbevf_stats {
@@ -79,6 +84,7 @@ struct ixgbevf_tx_queue_stats {
 struct ixgbevf_rx_queue_stats {
 	u64 alloc_rx_page_failed;
 	u64 alloc_rx_buff_failed;
+	u64 alloc_rx_page;
 	u64 csum_err;
 };
 
@@ -260,6 +266,9 @@ static inline void ixgbevf_write_tail(struct ixgbevf_ring *ring, u32 value)
 #define MIN_MSIX_Q_VECTORS	1
 #define MIN_MSIX_COUNT		(MIN_MSIX_Q_VECTORS + NON_Q_VECTORS)
 
+#define IXGBEVF_RX_DMA_ATTR \
+	(DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)
+
 /* board specific private data structure */
 struct ixgbevf_adapter {
 	/* this field must be first, see ixgbevf_process_skb_fields */
@@ -287,8 +296,9 @@ struct ixgbevf_adapter {
 	u64 hw_csum_rx_error;
 	u64 hw_rx_no_dma_resources;
 	int num_msix_vectors;
-	u32 alloc_rx_page_failed;
-	u32 alloc_rx_buff_failed;
+	u64 alloc_rx_page_failed;
+	u64 alloc_rx_buff_failed;
+	u64 alloc_rx_page;
 
 	struct msix_entry *msix_entries;
 
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index ed5c3aea7939..9b3d43d28106 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -206,28 +206,6 @@ static void ixgbevf_set_ivar(struct ixgbevf_adapter *adapter, s8 direction,
 	}
 }
 
-static void ixgbevf_unmap_and_free_tx_resource(struct ixgbevf_ring *tx_ring,
-					struct ixgbevf_tx_buffer *tx_buffer)
-{
-	if (tx_buffer->skb) {
-		dev_kfree_skb_any(tx_buffer->skb);
-		if (dma_unmap_len(tx_buffer, len))
-			dma_unmap_single(tx_ring->dev,
-					 dma_unmap_addr(tx_buffer, dma),
-					 dma_unmap_len(tx_buffer, len),
-					 DMA_TO_DEVICE);
-	} else if (dma_unmap_len(tx_buffer, len)) {
-		dma_unmap_page(tx_ring->dev,
-			       dma_unmap_addr(tx_buffer, dma),
-			       dma_unmap_len(tx_buffer, len),
-			       DMA_TO_DEVICE);
-	}
-	tx_buffer->next_to_watch = NULL;
-	tx_buffer->skb = NULL;
-	dma_unmap_len_set(tx_buffer, len, 0);
-	/* tx_buffer must be completely set up in the transmit path */
-}
-
 static u64 ixgbevf_get_tx_completed(struct ixgbevf_ring *ring)
 {
 	return ring->stats.packets;
@@ -349,7 +327,6 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector *q_vector,
 				 DMA_TO_DEVICE);
 
 		/* clear tx_buffer data */
-		tx_buffer->skb = NULL;
 		dma_unmap_len_set(tx_buffer, len, 0);
 
 		/* unmap remaining buffers */
@@ -595,8 +572,8 @@ static bool ixgbevf_alloc_mapped_page(struct ixgbevf_ring *rx_ring,
 	}
 
 	/* map page for use */
-	dma = dma_map_page(rx_ring->dev, page, 0,
-			   PAGE_SIZE, DMA_FROM_DEVICE);
+	dma = dma_map_page_attrs(rx_ring->dev, page, 0, PAGE_SIZE,
+				 DMA_FROM_DEVICE, IXGBEVF_RX_DMA_ATTR);
 
 	/* if mapping failed free memory back to system since
 	 * there isn't much point in holding memory we can't use
@@ -604,13 +581,15 @@ static bool ixgbevf_alloc_mapped_page(struct ixgbevf_ring *rx_ring,
 	if (dma_mapping_error(rx_ring->dev, dma)) {
 		__free_page(page);
 
-		rx_ring->rx_stats.alloc_rx_buff_failed++;
+		rx_ring->rx_stats.alloc_rx_page_failed++;
 		return false;
 	}
 
 	bi->dma = dma;
 	bi->page = page;
 	bi->page_offset = 0;
+	bi->pagecnt_bias = 1;
+	rx_ring->rx_stats.alloc_rx_page++;
 
 	return true;
 }
@@ -639,6 +618,12 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_ring *rx_ring,
 		if (!ixgbevf_alloc_mapped_page(rx_ring, bi))
 			break;
 
+		/* sync the buffer for use by the device */
+		dma_sync_single_range_for_device(rx_ring->dev, bi->dma,
+						 bi->page_offset,
+						 IXGBEVF_RX_BUFSZ,
+						 DMA_FROM_DEVICE);
+
 		/* Refresh the desc even if pkt_addr didn't change
 		 * because each write-back erases this info.
 		 */
@@ -653,8 +638,8 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_ring *rx_ring,
 			i -= rx_ring->count;
 		}
 
-		/* clear the hdr_addr for the next_to_use descriptor */
-		rx_desc->read.hdr_addr = 0;
+		/* clear the length for the next_to_use descriptor */
+		rx_desc->wb.upper.length = 0;
 
 		cleaned_count--;
 	} while (cleaned_count);
@@ -741,12 +726,7 @@ static void ixgbevf_reuse_rx_page(struct ixgbevf_ring *rx_ring,
 	new_buff->page = old_buff->page;
 	new_buff->dma = old_buff->dma;
 	new_buff->page_offset = old_buff->page_offset;
-
-	/* sync the buffer for use by the device */
-	dma_sync_single_range_for_device(rx_ring->dev, new_buff->dma,
-					 new_buff->page_offset,
-					 IXGBEVF_RX_BUFSZ,
-					 DMA_FROM_DEVICE);
+	new_buff->pagecnt_bias = old_buff->pagecnt_bias;
 }
 
 static inline bool ixgbevf_page_is_reserved(struct page *page)
@@ -754,6 +734,45 @@ static inline bool ixgbevf_page_is_reserved(struct page *page)
 	return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
 }
 
+static bool ixgbevf_can_reuse_rx_page(struct ixgbevf_rx_buffer *rx_buffer,
+				      struct page *page,
+				      const unsigned int truesize)
+{
+	unsigned int pagecnt_bias = rx_buffer->pagecnt_bias--;
+
+	/* avoid re-using remote pages */
+	if (unlikely(ixgbevf_page_is_reserved(page)))
+		return false;
+
+#if (PAGE_SIZE < 8192)
+	/* if we are only owner of page we can reuse it */
+	if (unlikely(page_ref_count(page) != pagecnt_bias))
+		return false;
+
+	/* flip page offset to other buffer */
+	rx_buffer->page_offset ^= IXGBEVF_RX_BUFSZ;
+
+#else
+	/* move offset up to the next cache line */
+	rx_buffer->page_offset += truesize;
+
+	if (rx_buffer->page_offset > (PAGE_SIZE - IXGBEVF_RX_BUFSZ))
+		return false;
+
+#endif
+
+	/* If we have drained the page fragment pool we need to update
+	 * the pagecnt_bias and page count so that we fully restock the
+	 * number of references the driver holds.
+	 */
+	if (unlikely(pagecnt_bias == 1)) {
+		page_ref_add(page, USHRT_MAX);
+		rx_buffer->pagecnt_bias = USHRT_MAX;
+	}
+
+	return true;
+}
+
 /**
  * ixgbevf_add_rx_frag - Add contents of Rx buffer to sk_buff
  * @rx_ring: rx descriptor ring to transact packets on
@@ -771,12 +790,12 @@ static inline bool ixgbevf_page_is_reserved(struct page *page)
  **/
 static bool ixgbevf_add_rx_frag(struct ixgbevf_ring *rx_ring,
 				struct ixgbevf_rx_buffer *rx_buffer,
+				u16 size,
 				union ixgbe_adv_rx_desc *rx_desc,
 				struct sk_buff *skb)
 {
 	struct page *page = rx_buffer->page;
 	unsigned char *va = page_address(page) + rx_buffer->page_offset;
-	unsigned int size = le16_to_cpu(rx_desc->wb.upper.length);
 #if (PAGE_SIZE < 8192)
 	unsigned int truesize = IXGBEVF_RX_BUFSZ;
 #else
@@ -795,7 +814,6 @@ static bool ixgbevf_add_rx_frag(struct ixgbevf_ring *rx_ring,
 			return true;
 
 		/* this page cannot be reused so discard it */
-		put_page(page);
 		return false;
 	}
 
@@ -815,32 +833,7 @@ add_tail_frag:
 	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
 			(unsigned long)va & ~PAGE_MASK, size, truesize);
 
-	/* avoid re-using remote pages */
-	if (unlikely(ixgbevf_page_is_reserved(page)))
-		return false;
-
-#if (PAGE_SIZE < 8192)
-	/* if we are only owner of page we can reuse it */
-	if (unlikely(page_count(page) != 1))
-		return false;
-
-	/* flip page offset to other buffer */
-	rx_buffer->page_offset ^= IXGBEVF_RX_BUFSZ;
-
-#else
-	/* move offset up to the next cache line */
-	rx_buffer->page_offset += truesize;
-
-	if (rx_buffer->page_offset > (PAGE_SIZE - IXGBEVF_RX_BUFSZ))
-		return false;
-
-#endif
-	/* Even if we own the page, we are not allowed to use atomic_set()
-	 * This would break get_page_unless_zero() users.
-	 */
-	page_ref_inc(page);
-
-	return true;
+	return ixgbevf_can_reuse_rx_page(rx_buffer, page, truesize);
 }
 
 static struct sk_buff *ixgbevf_fetch_rx_buffer(struct ixgbevf_ring *rx_ring,
@@ -849,11 +842,19 @@ static struct sk_buff *ixgbevf_fetch_rx_buffer(struct ixgbevf_ring *rx_ring,
 {
 	struct ixgbevf_rx_buffer *rx_buffer;
 	struct page *page;
+	u16 size = le16_to_cpu(rx_desc->wb.upper.length);
 
 	rx_buffer = &rx_ring->rx_buffer_info[rx_ring->next_to_clean];
 	page = rx_buffer->page;
 	prefetchw(page);
 
+	/* we are reusing so sync this buffer for CPU use */
+	dma_sync_single_range_for_cpu(rx_ring->dev,
+				      rx_buffer->dma,
+				      rx_buffer->page_offset,
+				      size,
+				      DMA_FROM_DEVICE);
+
 	if (likely(!skb)) {
 		void *page_addr = page_address(page) +
 				  rx_buffer->page_offset;
@@ -879,21 +880,18 @@ static struct sk_buff *ixgbevf_fetch_rx_buffer(struct ixgbevf_ring *rx_ring,
 		prefetchw(skb->data);
 	}
 
-	/* we are reusing so sync this buffer for CPU use */
-	dma_sync_single_range_for_cpu(rx_ring->dev,
-				      rx_buffer->dma,
-				      rx_buffer->page_offset,
-				      IXGBEVF_RX_BUFSZ,
-				      DMA_FROM_DEVICE);
-
 	/* pull page into skb */
-	if (ixgbevf_add_rx_frag(rx_ring, rx_buffer, rx_desc, skb)) {
+	if (ixgbevf_add_rx_frag(rx_ring, rx_buffer, size, rx_desc, skb)) {
 		/* hand second half of page back to the ring */
 		ixgbevf_reuse_rx_page(rx_ring, rx_buffer);
 	} else {
-		/* we are not reusing the buffer so unmap it */
-		dma_unmap_page(rx_ring->dev, rx_buffer->dma,
-			       PAGE_SIZE, DMA_FROM_DEVICE);
+		/* We are not reusing the buffer so unmap it and free
+		 * any references we are holding to it
+		 */
+		dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma,
+				     PAGE_SIZE, DMA_FROM_DEVICE,
+				     IXGBEVF_RX_DMA_ATTR);
+		__page_frag_cache_drain(page, rx_buffer->pagecnt_bias);
 	}
 
 	/* clear contents of buffer_info */
@@ -930,7 +928,7 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
 
 		rx_desc = IXGBEVF_RX_DESC(rx_ring, rx_ring->next_to_clean);
 
-		if (!ixgbevf_test_staterr(rx_desc, IXGBE_RXD_STAT_DD))
+		if (!rx_desc->wb.upper.length)
 			break;
 
 		/* This memory barrier is needed to keep us from reading
@@ -943,8 +941,10 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
 		skb = ixgbevf_fetch_rx_buffer(rx_ring, rx_desc, skb);
 
 		/* exit if we failed to retrieve a buffer */
-		if (!skb)
+		if (!skb) {
+			rx_ring->rx_stats.alloc_rx_buff_failed++;
 			break;
+		}
 
 		cleaned_count++;
 
@@ -1553,6 +1553,10 @@ static void ixgbevf_configure_tx_ring(struct ixgbevf_adapter *adapter,
 	txdctl |= (1u << 8) |    /* HTHRESH = 1 */
 		   32;           /* PTHRESH = 32 */
 
+	/* reinitialize tx_buffer_info */
+	memset(ring->tx_buffer_info, 0,
+	       sizeof(struct ixgbevf_tx_buffer) * ring->count);
+
 	clear_bit(__IXGBEVF_HANG_CHECK_ARMED, &ring->state);
 
 	IXGBE_WRITE_REG(hw, IXGBE_VFTXDCTL(reg_idx), txdctl);
@@ -1721,6 +1725,7 @@ static void ixgbevf_configure_rx_ring(struct ixgbevf_adapter *adapter,
 				      struct ixgbevf_ring *ring)
 {
 	struct ixgbe_hw *hw = &adapter->hw;
+	union ixgbe_adv_rx_desc *rx_desc;
 	u64 rdba = ring->dma;
 	u32 rxdctl;
 	u8 reg_idx = ring->reg_idx;
@@ -1749,6 +1754,14 @@ static void ixgbevf_configure_rx_ring(struct ixgbevf_adapter *adapter,
 	IXGBE_WRITE_REG(hw, IXGBE_VFRDT(reg_idx), 0);
 	ring->tail = adapter->io_addr + IXGBE_VFRDT(reg_idx);
 
+	/* initialize rx_buffer_info */
+	memset(ring->rx_buffer_info, 0,
+	       sizeof(struct ixgbevf_rx_buffer) * ring->count);
+
+	/* initialize Rx descriptor 0 */
+	rx_desc = IXGBEVF_RX_DESC(ring, 0);
+	rx_desc->wb.upper.length = 0;
+
 	/* reset ntu and ntc to place SW in sync with hardwdare */
 	ring->next_to_clean = 0;
 	ring->next_to_use = 0;
@@ -2103,9 +2116,7 @@ void ixgbevf_up(struct ixgbevf_adapter *adapter)
  **/
 static void ixgbevf_clean_rx_ring(struct ixgbevf_ring *rx_ring)
 {
-	struct device *dev = rx_ring->dev;
-	unsigned long size;
-	unsigned int i;
+	u16 i = rx_ring->next_to_clean;
 
 	/* Free Rx ring sk_buff */
 	if (rx_ring->skb) {
@@ -2113,29 +2124,39 @@ static void ixgbevf_clean_rx_ring(struct ixgbevf_ring *rx_ring)
 		rx_ring->skb = NULL;
 	}
 
-	/* ring already cleared, nothing to do */
-	if (!rx_ring->rx_buffer_info)
-		return;
-
 	/* Free all the Rx ring pages */
-	for (i = 0; i < rx_ring->count; i++) {
+	while (i != rx_ring->next_to_alloc) {
 		struct ixgbevf_rx_buffer *rx_buffer;
 
 		rx_buffer = &rx_ring->rx_buffer_info[i];
-		if (rx_buffer->dma)
-			dma_unmap_page(dev, rx_buffer->dma,
-				       PAGE_SIZE, DMA_FROM_DEVICE);
-		rx_buffer->dma = 0;
-		if (rx_buffer->page)
-			__free_page(rx_buffer->page);
-		rx_buffer->page = NULL;
-	}
 
-	size = sizeof(struct ixgbevf_rx_buffer) * rx_ring->count;
-	memset(rx_ring->rx_buffer_info, 0, size);
+		/* Invalidate cache lines that may have been written to by
+		 * device so that we avoid corrupting memory.
+		 */
+		dma_sync_single_range_for_cpu(rx_ring->dev,
+					      rx_buffer->dma,
+					      rx_buffer->page_offset,
+					      IXGBEVF_RX_BUFSZ,
+					      DMA_FROM_DEVICE);
+
+		/* free resources associated with mapping */
+		dma_unmap_page_attrs(rx_ring->dev,
+				     rx_buffer->dma,
+				     PAGE_SIZE,
+				     DMA_FROM_DEVICE,
+				     IXGBEVF_RX_DMA_ATTR);
+
+		__page_frag_cache_drain(rx_buffer->page,
+					rx_buffer->pagecnt_bias);
 
-	/* Zero out the descriptor ring */
-	memset(rx_ring->desc, 0, rx_ring->size);
+		i++;
+		if (i == rx_ring->count)
+			i = 0;
+	}
+
+	rx_ring->next_to_alloc = 0;
+	rx_ring->next_to_clean = 0;
+	rx_ring->next_to_use = 0;
 }
 
 /**
@@ -2144,23 +2165,57 @@ static void ixgbevf_clean_rx_ring(struct ixgbevf_ring *rx_ring)
  **/
 static void ixgbevf_clean_tx_ring(struct ixgbevf_ring *tx_ring)
 {
-	struct ixgbevf_tx_buffer *tx_buffer_info;
-	unsigned long size;
-	unsigned int i;
+	u16 i = tx_ring->next_to_clean;
+	struct ixgbevf_tx_buffer *tx_buffer = &tx_ring->tx_buffer_info[i];
 
-	if (!tx_ring->tx_buffer_info)
-		return;
+	while (i != tx_ring->next_to_use) {
+		union ixgbe_adv_tx_desc *eop_desc, *tx_desc;
 
-	/* Free all the Tx ring sk_buffs */
-	for (i = 0; i < tx_ring->count; i++) {
-		tx_buffer_info = &tx_ring->tx_buffer_info[i];
-		ixgbevf_unmap_and_free_tx_resource(tx_ring, tx_buffer_info);
+		/* Free all the Tx ring sk_buffs */
+		dev_kfree_skb_any(tx_buffer->skb);
+
+		/* unmap skb header data */
+		dma_unmap_single(tx_ring->dev,
+				 dma_unmap_addr(tx_buffer, dma),
+				 dma_unmap_len(tx_buffer, len),
+				 DMA_TO_DEVICE);
+
+		/* check for eop_desc to determine the end of the packet */
+		eop_desc = tx_buffer->next_to_watch;
+		tx_desc = IXGBEVF_TX_DESC(tx_ring, i);
+
+		/* unmap remaining buffers */
+		while (tx_desc != eop_desc) {
+			tx_buffer++;
+			tx_desc++;
+			i++;
+			if (unlikely(i == tx_ring->count)) {
+				i = 0;
+				tx_buffer = tx_ring->tx_buffer_info;
+				tx_desc = IXGBEVF_TX_DESC(tx_ring, 0);
+			}
+
+			/* unmap any remaining paged data */
+			if (dma_unmap_len(tx_buffer, len))
+				dma_unmap_page(tx_ring->dev,
+					       dma_unmap_addr(tx_buffer, dma),
+					       dma_unmap_len(tx_buffer, len),
+					       DMA_TO_DEVICE);
+		}
+
+		/* move us one more past the eop_desc for start of next pkt */
+		tx_buffer++;
+		i++;
+		if (unlikely(i == tx_ring->count)) {
+			i = 0;
+			tx_buffer = tx_ring->tx_buffer_info;
+		}
 	}
 
-	size = sizeof(struct ixgbevf_tx_buffer) * tx_ring->count;
-	memset(tx_ring->tx_buffer_info, 0, size);
+	/* reset next_to_use and next_to_clean */
+	tx_ring->next_to_use = 0;
+	tx_ring->next_to_clean = 0;
 
-	memset(tx_ring->desc, 0, tx_ring->size);
 }
 
 /**
@@ -2712,6 +2767,8 @@ out:
 void ixgbevf_update_stats(struct ixgbevf_adapter *adapter)
 {
 	struct ixgbe_hw *hw = &adapter->hw;
+	u64 alloc_rx_page_failed = 0, alloc_rx_buff_failed = 0;
+	u64 alloc_rx_page = 0, hw_csum_rx_error = 0;
 	int i;
 
 	if (test_bit(__IXGBEVF_DOWN, &adapter->state) ||
@@ -2732,10 +2789,18 @@ void ixgbevf_update_stats(struct ixgbevf_adapter *adapter)
 				adapter->stats.vfmprc);
 
 	for (i = 0;  i  < adapter->num_rx_queues;  i++) {
-		adapter->hw_csum_rx_error +=
-			adapter->rx_ring[i]->hw_csum_rx_error;
-		adapter->rx_ring[i]->hw_csum_rx_error = 0;
+		struct ixgbevf_ring *rx_ring = adapter->rx_ring[i];
+
+		hw_csum_rx_error += rx_ring->rx_stats.csum_err;
+		alloc_rx_page_failed += rx_ring->rx_stats.alloc_rx_page_failed;
+		alloc_rx_buff_failed += rx_ring->rx_stats.alloc_rx_buff_failed;
+		alloc_rx_page += rx_ring->rx_stats.alloc_rx_page;
 	}
+
+	adapter->hw_csum_rx_error = hw_csum_rx_error;
+	adapter->alloc_rx_page_failed = alloc_rx_page_failed;
+	adapter->alloc_rx_buff_failed = alloc_rx_buff_failed;
+	adapter->alloc_rx_page = alloc_rx_page;
 }
 
 /**
@@ -2980,7 +3045,7 @@ int ixgbevf_setup_tx_resources(struct ixgbevf_ring *tx_ring)
 	int size;
 
 	size = sizeof(struct ixgbevf_tx_buffer) * tx_ring->count;
-	tx_ring->tx_buffer_info = vzalloc(size);
+	tx_ring->tx_buffer_info = vmalloc(size);
 	if (!tx_ring->tx_buffer_info)
 		goto err;
 
@@ -3040,7 +3105,7 @@ int ixgbevf_setup_rx_resources(struct ixgbevf_ring *rx_ring)
 	int size;
 
 	size = sizeof(struct ixgbevf_rx_buffer) * rx_ring->count;
-	rx_ring->rx_buffer_info = vzalloc(size);
+	rx_ring->rx_buffer_info = vmalloc(size);
 	if (!rx_ring->rx_buffer_info)
 		goto err;
 
@@ -3482,34 +3547,37 @@ static void ixgbevf_tx_map(struct ixgbevf_ring *tx_ring,
 			   struct ixgbevf_tx_buffer *first,
 			   const u8 hdr_len)
 {
-	dma_addr_t dma;
 	struct sk_buff *skb = first->skb;
 	struct ixgbevf_tx_buffer *tx_buffer;
 	union ixgbe_adv_tx_desc *tx_desc;
-	struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[0];
-	unsigned int data_len = skb->data_len;
-	unsigned int size = skb_headlen(skb);
-	unsigned int paylen = skb->len - hdr_len;
+	struct skb_frag_struct *frag;
+	dma_addr_t dma;
+	unsigned int data_len, size;
 	u32 tx_flags = first->tx_flags;
-	__le32 cmd_type;
+	__le32 cmd_type = ixgbevf_tx_cmd_type(tx_flags);
 	u16 i = tx_ring->next_to_use;
 
 	tx_desc = IXGBEVF_TX_DESC(tx_ring, i);
 
-	ixgbevf_tx_olinfo_status(tx_desc, tx_flags, paylen);
-	cmd_type = ixgbevf_tx_cmd_type(tx_flags);
+	ixgbevf_tx_olinfo_status(tx_desc, tx_flags, skb->len - hdr_len);
+
+	size = skb_headlen(skb);
+	data_len = skb->data_len;
 
 	dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);
-	if (dma_mapping_error(tx_ring->dev, dma))
-		goto dma_error;
 
-	/* record length, and DMA address */
-	dma_unmap_len_set(first, len, size);
-	dma_unmap_addr_set(first, dma, dma);
+	tx_buffer = first;
+
+	for (frag = &skb_shinfo(skb)->frags[0];; frag++) {
+		if (dma_mapping_error(tx_ring->dev, dma))
+			goto dma_error;
+
+		/* record length, and DMA address */
+		dma_unmap_len_set(tx_buffer, len, size);
+		dma_unmap_addr_set(tx_buffer, dma, dma);
 
-	tx_desc->read.buffer_addr = cpu_to_le64(dma);
+		tx_desc->read.buffer_addr = cpu_to_le64(dma);
 
-	for (;;) {
 		while (unlikely(size > IXGBE_MAX_DATA_PER_TXD)) {
 			tx_desc->read.cmd_type_len =
 				cmd_type | cpu_to_le32(IXGBE_MAX_DATA_PER_TXD);
@@ -3520,12 +3588,12 @@ static void ixgbevf_tx_map(struct ixgbevf_ring *tx_ring,
 				tx_desc = IXGBEVF_TX_DESC(tx_ring, 0);
 				i = 0;
 			}
+			tx_desc->read.olinfo_status = 0;
 
 			dma += IXGBE_MAX_DATA_PER_TXD;
 			size -= IXGBE_MAX_DATA_PER_TXD;
 
 			tx_desc->read.buffer_addr = cpu_to_le64(dma);
-			tx_desc->read.olinfo_status = 0;
 		}
 
 		if (likely(!data_len))
@@ -3539,23 +3607,15 @@ static void ixgbevf_tx_map(struct ixgbevf_ring *tx_ring,
 			tx_desc = IXGBEVF_TX_DESC(tx_ring, 0);
 			i = 0;
 		}
+		tx_desc->read.olinfo_status = 0;
 
 		size = skb_frag_size(frag);
 		data_len -= size;
 
 		dma = skb_frag_dma_map(tx_ring->dev, frag, 0, size,
 				       DMA_TO_DEVICE);
-		if (dma_mapping_error(tx_ring->dev, dma))
-			goto dma_error;
 
 		tx_buffer = &tx_ring->tx_buffer_info[i];
-		dma_unmap_len_set(tx_buffer, len, size);
-		dma_unmap_addr_set(tx_buffer, dma, dma);
-
-		tx_desc->read.buffer_addr = cpu_to_le64(dma);
-		tx_desc->read.olinfo_status = 0;
-
-		frag++;
 	}
 
 	/* write last descriptor with RS and EOP bits */
@@ -3589,18 +3649,32 @@ static void ixgbevf_tx_map(struct ixgbevf_ring *tx_ring,
 	return;
 dma_error:
 	dev_err(tx_ring->dev, "TX DMA map failed\n");
+	tx_buffer = &tx_ring->tx_buffer_info[i];
 
 	/* clear dma mappings for failed tx_buffer_info map */
-	for (;;) {
+	while (tx_buffer != first) {
+		if (dma_unmap_len(tx_buffer, len))
+			dma_unmap_page(tx_ring->dev,
+				       dma_unmap_addr(tx_buffer, dma),
+				       dma_unmap_len(tx_buffer, len),
+				       DMA_TO_DEVICE);
+		dma_unmap_len_set(tx_buffer, len, 0);
+
+		if (i-- == 0)
+			i += tx_ring->count;
 		tx_buffer = &tx_ring->tx_buffer_info[i];
-		ixgbevf_unmap_and_free_tx_resource(tx_ring, tx_buffer);
-		if (tx_buffer == first)
-			break;
-		if (i == 0)
-			i = tx_ring->count;
-		i--;
 	}
 
+	if (dma_unmap_len(tx_buffer, len))
+		dma_unmap_single(tx_ring->dev,
+				 dma_unmap_addr(tx_buffer, dma),
+				 dma_unmap_len(tx_buffer, len),
+				 DMA_TO_DEVICE);
+	dma_unmap_len_set(tx_buffer, len, 0);
+
+	dev_kfree_skb_any(tx_buffer->skb);
+	tx_buffer->skb = NULL;
+
 	tx_ring->next_to_use = i;
 }
 
diff --git a/drivers/net/ethernet/intel/ixgbevf/vf.c b/drivers/net/ethernet/intel/ixgbevf/vf.c
index 64c93e8becc6..38d3a327c1bc 100644
--- a/drivers/net/ethernet/intel/ixgbevf/vf.c
+++ b/drivers/net/ethernet/intel/ixgbevf/vf.c
@@ -286,7 +286,7 @@ static s32 ixgbevf_set_uc_addr_vf(struct ixgbe_hw *hw, u32 index, u8 *addr)
 		ether_addr_copy(msg_addr, addr);
 
 	ret_val = ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf,
-					     sizeof(msgbuf) / sizeof(u32));
+					     ARRAY_SIZE(msgbuf));
 	if (!ret_val) {
 		msgbuf[0] &= ~IXGBE_VT_MSGTYPE_CTS;
 
@@ -456,8 +456,7 @@ static s32 ixgbevf_set_rar_vf(struct ixgbe_hw *hw, u32 index, u8 *addr,
 	ether_addr_copy(msg_addr, addr);
 
 	ret_val = ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf,
-					     sizeof(msgbuf) / sizeof(u32));
-
+					     ARRAY_SIZE(msgbuf));
 	msgbuf[0] &= ~IXGBE_VT_MSGTYPE_CTS;
 
 	/* if nacked the address was rejected, use "perm_addr" */
@@ -574,7 +573,7 @@ static s32 ixgbevf_update_xcast_mode(struct ixgbe_hw *hw, int xcast_mode)
 	msgbuf[1] = xcast_mode;
 
 	err = ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf,
-					 sizeof(msgbuf) / sizeof(u32));
+					 ARRAY_SIZE(msgbuf));
 	if (err)
 		return err;
 
@@ -614,7 +613,7 @@ static s32 ixgbevf_set_vfta_vf(struct ixgbe_hw *hw, u32 vlan, u32 vind,
 	msgbuf[0] |= vlan_on << IXGBE_VT_MSGINFO_SHIFT;
 
 	err = ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf,
-					 sizeof(msgbuf) / sizeof(u32));
+					 ARRAY_SIZE(msgbuf));
 	if (err)
 		goto mbx_err;
 
@@ -826,7 +825,7 @@ static s32 ixgbevf_set_rlpml_vf(struct ixgbe_hw *hw, u16 max_size)
 	msgbuf[1] = max_size;
 
 	ret_val = ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf,
-					     sizeof(msgbuf) / sizeof(u32));
+					     ARRAY_SIZE(msgbuf));
 	if (ret_val)
 		return ret_val;
 	if ((msgbuf[0] & IXGBE_VF_SET_LPE) &&
@@ -872,8 +871,7 @@ static int ixgbevf_negotiate_api_version_vf(struct ixgbe_hw *hw, int api)
 	msg[1] = api;
 	msg[2] = 0;
 
-	err = ixgbevf_write_msg_read_ack(hw, msg, msg,
-					 sizeof(msg) / sizeof(u32));
+	err = ixgbevf_write_msg_read_ack(hw, msg, msg, ARRAY_SIZE(msg));
 	if (!err) {
 		msg[0] &= ~IXGBE_VT_MSGTYPE_CTS;
 
@@ -924,8 +922,7 @@ int ixgbevf_get_queues(struct ixgbe_hw *hw, unsigned int *num_tcs,
 	msg[0] = IXGBE_VF_GET_QUEUE;
 	msg[1] = msg[2] = msg[3] = msg[4] = 0;
 
-	err = ixgbevf_write_msg_read_ack(hw, msg, msg,
-					 sizeof(msg) / sizeof(u32));
+	err = ixgbevf_write_msg_read_ack(hw, msg, msg, ARRAY_SIZE(msg));
 	if (!err) {
 		msg[0] &= ~IXGBE_VT_MSGTYPE_CTS;
 
diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c
index 6d6fb8cf3e7c..6473cc68c2d5 100644
--- a/drivers/net/ethernet/rocker/rocker_ofdpa.c
+++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c
@@ -789,7 +789,6 @@ static int ofdpa_flow_tbl_add(struct ofdpa_port *ofdpa_port,
 			       ofdpa_flags_nowait(flags),
 			       ofdpa_cmd_flow_tbl_add,
 			       found, NULL, NULL);
-	return 0;
 }
 
 static int ofdpa_flow_tbl_del(struct ofdpa_port *ofdpa_port,
diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c
index 433d29d6bc95..f1cc2ed76029 100644
--- a/drivers/net/ethernet/sfc/ptp.c
+++ b/drivers/net/ethernet/sfc/ptp.c
@@ -643,7 +643,7 @@ static int efx_ptp_get_attributes(struct efx_nic *efx)
 	case MC_CMD_PTP_OUT_GET_ATTRIBUTES_SECONDS_QTR_NANOSECONDS:
 		ptp->ns_to_nic_time = efx_ptp_ns_to_s_qns;
 		ptp->nic_to_kernel_time = efx_ptp_s_qns_to_ktime_correction;
-		ptp->nic_time.minor_max = 4000000000;
+		ptp->nic_time.minor_max = 4000000000UL;
 		ptp->nic_time.sync_event_minor_shift = 24;
 		break;
 	default:
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 425056c7f96c..276932d75975 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -688,6 +688,8 @@ static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
 struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err);
 void bpf_prog_free(struct bpf_prog *fp);
 
+bool bpf_opcode_in_insntable(u8 code);
+
 struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags);
 struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
 				  gfp_t gfp_extra_flags);
@@ -1003,10 +1005,20 @@ struct bpf_sock_ops_kern {
 	struct	sock *sk;
 	u32	op;
 	union {
+		u32 args[4];
 		u32 reply;
 		u32 replylong[4];
 	};
 	u32	is_fullsock;
+	u64	temp;			/* temp and everything after is not
+					 * initialized to 0 before calling
+					 * the BPF program. New fields that
+					 * should be initialized to 0 should
+					 * be inserted before temp.
+					 * temp is scratch storage used by
+					 * sock_ops_convert_ctx_access
+					 * as temporary storage of a register.
+					 */
 };
 
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 24a62d590350..cd46d3d63aa0 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -851,6 +851,7 @@ struct xfrmdev_ops {
 	void	(*xdo_dev_state_free) (struct xfrm_state *x);
 	bool	(*xdo_dev_offload_ok) (struct sk_buff *skb,
 				       struct xfrm_state *x);
+	void	(*xdo_dev_state_advance_esn) (struct xfrm_state *x);
 };
 #endif
 
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 4f93f0953c41..8f4c54986f97 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -335,6 +335,17 @@ struct tcp_sock {
 
 	int			linger2;
 
+
+/* Sock_ops bpf program related variables */
+#ifdef CONFIG_BPF
+	u8	bpf_sock_ops_cb_flags;  /* Control calling BPF programs
+					 * values defined in uapi/linux/bpf.h
+					 */
+#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG)
+#else
+#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0
+#endif
+
 /* Receiver side RTT estimation */
 	struct {
 		u32	rtt_us;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5a1d26a18599..093e967a2960 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2006,12 +2006,12 @@ void tcp_cleanup_ulp(struct sock *sk);
  * program loaded).
  */
 #ifdef CONFIG_BPF
-static inline int tcp_call_bpf(struct sock *sk, int op)
+static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
 {
 	struct bpf_sock_ops_kern sock_ops;
 	int ret;
 
-	memset(&sock_ops, 0, sizeof(sock_ops));
+	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
 	if (sk_fullsock(sk)) {
 		sock_ops.is_fullsock = 1;
 		sock_owned_by_me(sk);
@@ -2019,6 +2019,8 @@ static inline int tcp_call_bpf(struct sock *sk, int op)
 
 	sock_ops.sk = sk;
 	sock_ops.op = op;
+	if (nargs > 0)
+		memcpy(sock_ops.args, args, nargs * sizeof(*args));
 
 	ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
 	if (ret == 0)
@@ -2027,18 +2029,46 @@ static inline int tcp_call_bpf(struct sock *sk, int op)
 		ret = -1;
 	return ret;
 }
+
+static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2)
+{
+	u32 args[2] = {arg1, arg2};
+
+	return tcp_call_bpf(sk, op, 2, args);
+}
+
+static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2,
+				    u32 arg3)
+{
+	u32 args[3] = {arg1, arg2, arg3};
+
+	return tcp_call_bpf(sk, op, 3, args);
+}
+
 #else
-static inline int tcp_call_bpf(struct sock *sk, int op)
+static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
 {
 	return -EPERM;
 }
+
+static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2)
+{
+	return -EPERM;
+}
+
+static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2,
+				    u32 arg3)
+{
+	return -EPERM;
+}
+
 #endif
 
 static inline u32 tcp_timeout_init(struct sock *sk)
 {
 	int timeout;
 
-	timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT);
+	timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT, 0, NULL);
 
 	if (timeout <= 0)
 		timeout = TCP_TIMEOUT_INIT;
@@ -2049,7 +2079,7 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
 {
 	int rwnd;
 
-	rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT);
+	rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT, 0, NULL);
 
 	if (rwnd < 0)
 		rwnd = 0;
@@ -2058,7 +2088,7 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
 
 static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
 {
-	return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1);
+	return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1);
 }
 
 #if IS_ENABLED(CONFIG_SMC)
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 2e6d4fe6b0ba..7d2077665c0b 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1904,6 +1904,14 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 		       struct xfrm_user_offload *xuo);
 bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x);
 
+static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x)
+{
+	struct xfrm_state_offload *xso = &x->xso;
+
+	if (xso->dev && xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn)
+		xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn(x);
+}
+
 static inline bool xfrm_dst_offload_ok(struct dst_entry *dst)
 {
 	struct xfrm_state *x = dst->xfrm;
@@ -1974,6 +1982,10 @@ static inline bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x
 	return false;
 }
 
+static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x)
+{
+}
+
 static inline bool xfrm_dst_offload_ok(struct dst_entry *dst)
 {
 	return false;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 406c19d6016b..db6bdc375126 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -642,6 +642,14 @@ union bpf_attr {
  *     @optlen: length of optval in bytes
  *     Return: 0 or negative error
  *
+ * int bpf_sock_ops_cb_flags_set(bpf_sock_ops, flags)
+ *     Set callback flags for sock_ops
+ *     @bpf_sock_ops: pointer to bpf_sock_ops_kern struct
+ *     @flags: flags value
+ *     Return: 0 for no error
+ *             -EINVAL if there is no full tcp socket
+ *             or the bits in flags that are not supported by current kernel
+ *
  * int bpf_skb_adjust_room(skb, len_diff, mode, flags)
  *     Grow or shrink room in sk_buff.
  *     @skb: pointer to skb
@@ -748,7 +756,8 @@ union bpf_attr {
 	FN(perf_event_read_value),	\
 	FN(perf_prog_read_value),	\
 	FN(getsockopt),			\
-	FN(override_return),
+	FN(override_return),		\
+	FN(sock_ops_cb_flags_set),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -952,8 +961,9 @@ struct bpf_map_info {
 struct bpf_sock_ops {
 	__u32 op;
 	union {
-		__u32 reply;
-		__u32 replylong[4];
+		__u32 args[4];		/* Optionally passed to bpf program */
+		__u32 reply;		/* Returned by bpf program	    */
+		__u32 replylong[4];	/* Optionally returned by bpf prog  */
 	};
 	__u32 family;
 	__u32 remote_ip4;	/* Stored in network byte order */
@@ -968,8 +978,39 @@ struct bpf_sock_ops {
 				 */
 	__u32 snd_cwnd;
 	__u32 srtt_us;		/* Averaged RTT << 3 in usecs */
+	__u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/bpf.h */
+	__u32 state;
+	__u32 rtt_min;
+	__u32 snd_ssthresh;
+	__u32 rcv_nxt;
+	__u32 snd_nxt;
+	__u32 snd_una;
+	__u32 mss_cache;
+	__u32 ecn_flags;
+	__u32 rate_delivered;
+	__u32 rate_interval_us;
+	__u32 packets_out;
+	__u32 retrans_out;
+	__u32 total_retrans;
+	__u32 segs_in;
+	__u32 data_segs_in;
+	__u32 segs_out;
+	__u32 data_segs_out;
+	__u32 lost_out;
+	__u32 sacked_out;
+	__u32 sk_txhash;
+	__u64 bytes_received;
+	__u64 bytes_acked;
 };
 
+/* Definitions for bpf_sock_ops_cb_flags */
+#define BPF_SOCK_OPS_RTO_CB_FLAG	(1<<0)
+#define BPF_SOCK_OPS_RETRANS_CB_FLAG	(1<<1)
+#define BPF_SOCK_OPS_STATE_CB_FLAG	(1<<2)
+#define BPF_SOCK_OPS_ALL_CB_FLAGS       0x7		/* Mask of all currently
+							 * supported cb flags
+							 */
+
 /* List of known BPF sock_ops operators.
  * New entries can only be added at the end
  */
@@ -1003,6 +1044,43 @@ enum {
 					 * a congestion threshold. RTTs above
 					 * this indicate congestion
 					 */
+	BPF_SOCK_OPS_RTO_CB,		/* Called when an RTO has triggered.
+					 * Arg1: value of icsk_retransmits
+					 * Arg2: value of icsk_rto
+					 * Arg3: whether RTO has expired
+					 */
+	BPF_SOCK_OPS_RETRANS_CB,	/* Called when skb is retransmitted.
+					 * Arg1: sequence number of 1st byte
+					 * Arg2: # segments
+					 * Arg3: return value of
+					 *       tcp_transmit_skb (0 => success)
+					 */
+	BPF_SOCK_OPS_STATE_CB,		/* Called when TCP changes state.
+					 * Arg1: old_state
+					 * Arg2: new_state
+					 */
+};
+
+/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
+ * changes between the TCP and BPF versions. Ideally this should never happen.
+ * If it does, we need to add code to convert them before calling
+ * the BPF sock_ops function.
+ */
+enum {
+	BPF_TCP_ESTABLISHED = 1,
+	BPF_TCP_SYN_SENT,
+	BPF_TCP_SYN_RECV,
+	BPF_TCP_FIN_WAIT1,
+	BPF_TCP_FIN_WAIT2,
+	BPF_TCP_TIME_WAIT,
+	BPF_TCP_CLOSE,
+	BPF_TCP_CLOSE_WAIT,
+	BPF_TCP_LAST_ACK,
+	BPF_TCP_LISTEN,
+	BPF_TCP_CLOSING,	/* Now a valid state */
+	BPF_TCP_NEW_SYN_RECV,
+
+	BPF_TCP_MAX_STATES	/* Leave at the end! */
 };
 
 #define TCP_BPF_IW		1001	/* Set TCP initial congestion window */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 3aa0658add76..5f35f93dcab2 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -782,6 +782,137 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 }
 EXPORT_SYMBOL_GPL(__bpf_call_base);
 
+/* All UAPI available opcodes. */
+#define BPF_INSN_MAP(INSN_2, INSN_3)		\
+	/* 32 bit ALU operations. */		\
+	/*   Register based. */			\
+	INSN_3(ALU, ADD, X),			\
+	INSN_3(ALU, SUB, X),			\
+	INSN_3(ALU, AND, X),			\
+	INSN_3(ALU, OR,  X),			\
+	INSN_3(ALU, LSH, X),			\
+	INSN_3(ALU, RSH, X),			\
+	INSN_3(ALU, XOR, X),			\
+	INSN_3(ALU, MUL, X),			\
+	INSN_3(ALU, MOV, X),			\
+	INSN_3(ALU, DIV, X),			\
+	INSN_3(ALU, MOD, X),			\
+	INSN_2(ALU, NEG),			\
+	INSN_3(ALU, END, TO_BE),		\
+	INSN_3(ALU, END, TO_LE),		\
+	/*   Immediate based. */		\
+	INSN_3(ALU, ADD, K),			\
+	INSN_3(ALU, SUB, K),			\
+	INSN_3(ALU, AND, K),			\
+	INSN_3(ALU, OR,  K),			\
+	INSN_3(ALU, LSH, K),			\
+	INSN_3(ALU, RSH, K),			\
+	INSN_3(ALU, XOR, K),			\
+	INSN_3(ALU, MUL, K),			\
+	INSN_3(ALU, MOV, K),			\
+	INSN_3(ALU, DIV, K),			\
+	INSN_3(ALU, MOD, K),			\
+	/* 64 bit ALU operations. */		\
+	/*   Register based. */			\
+	INSN_3(ALU64, ADD,  X),			\
+	INSN_3(ALU64, SUB,  X),			\
+	INSN_3(ALU64, AND,  X),			\
+	INSN_3(ALU64, OR,   X),			\
+	INSN_3(ALU64, LSH,  X),			\
+	INSN_3(ALU64, RSH,  X),			\
+	INSN_3(ALU64, XOR,  X),			\
+	INSN_3(ALU64, MUL,  X),			\
+	INSN_3(ALU64, MOV,  X),			\
+	INSN_3(ALU64, ARSH, X),			\
+	INSN_3(ALU64, DIV,  X),			\
+	INSN_3(ALU64, MOD,  X),			\
+	INSN_2(ALU64, NEG),			\
+	/*   Immediate based. */		\
+	INSN_3(ALU64, ADD,  K),			\
+	INSN_3(ALU64, SUB,  K),			\
+	INSN_3(ALU64, AND,  K),			\
+	INSN_3(ALU64, OR,   K),			\
+	INSN_3(ALU64, LSH,  K),			\
+	INSN_3(ALU64, RSH,  K),			\
+	INSN_3(ALU64, XOR,  K),			\
+	INSN_3(ALU64, MUL,  K),			\
+	INSN_3(ALU64, MOV,  K),			\
+	INSN_3(ALU64, ARSH, K),			\
+	INSN_3(ALU64, DIV,  K),			\
+	INSN_3(ALU64, MOD,  K),			\
+	/* Call instruction. */			\
+	INSN_2(JMP, CALL),			\
+	/* Exit instruction. */			\
+	INSN_2(JMP, EXIT),			\
+	/* Jump instructions. */		\
+	/*   Register based. */			\
+	INSN_3(JMP, JEQ,  X),			\
+	INSN_3(JMP, JNE,  X),			\
+	INSN_3(JMP, JGT,  X),			\
+	INSN_3(JMP, JLT,  X),			\
+	INSN_3(JMP, JGE,  X),			\
+	INSN_3(JMP, JLE,  X),			\
+	INSN_3(JMP, JSGT, X),			\
+	INSN_3(JMP, JSLT, X),			\
+	INSN_3(JMP, JSGE, X),			\
+	INSN_3(JMP, JSLE, X),			\
+	INSN_3(JMP, JSET, X),			\
+	/*   Immediate based. */		\
+	INSN_3(JMP, JEQ,  K),			\
+	INSN_3(JMP, JNE,  K),			\
+	INSN_3(JMP, JGT,  K),			\
+	INSN_3(JMP, JLT,  K),			\
+	INSN_3(JMP, JGE,  K),			\
+	INSN_3(JMP, JLE,  K),			\
+	INSN_3(JMP, JSGT, K),			\
+	INSN_3(JMP, JSLT, K),			\
+	INSN_3(JMP, JSGE, K),			\
+	INSN_3(JMP, JSLE, K),			\
+	INSN_3(JMP, JSET, K),			\
+	INSN_2(JMP, JA),			\
+	/* Store instructions. */		\
+	/*   Register based. */			\
+	INSN_3(STX, MEM,  B),			\
+	INSN_3(STX, MEM,  H),			\
+	INSN_3(STX, MEM,  W),			\
+	INSN_3(STX, MEM,  DW),			\
+	INSN_3(STX, XADD, W),			\
+	INSN_3(STX, XADD, DW),			\
+	/*   Immediate based. */		\
+	INSN_3(ST, MEM, B),			\
+	INSN_3(ST, MEM, H),			\
+	INSN_3(ST, MEM, W),			\
+	INSN_3(ST, MEM, DW),			\
+	/* Load instructions. */		\
+	/*   Register based. */			\
+	INSN_3(LDX, MEM, B),			\
+	INSN_3(LDX, MEM, H),			\
+	INSN_3(LDX, MEM, W),			\
+	INSN_3(LDX, MEM, DW),			\
+	/*   Immediate based. */		\
+	INSN_3(LD, IMM, DW),			\
+	/*   Misc (old cBPF carry-over). */	\
+	INSN_3(LD, ABS, B),			\
+	INSN_3(LD, ABS, H),			\
+	INSN_3(LD, ABS, W),			\
+	INSN_3(LD, IND, B),			\
+	INSN_3(LD, IND, H),			\
+	INSN_3(LD, IND, W)
+
+bool bpf_opcode_in_insntable(u8 code)
+{
+#define BPF_INSN_2_TBL(x, y)    [BPF_##x | BPF_##y] = true
+#define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true
+	static const bool public_insntable[256] = {
+		[0 ... 255] = false,
+		/* Now overwrite non-defaults ... */
+		BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL),
+	};
+#undef BPF_INSN_3_TBL
+#undef BPF_INSN_2_TBL
+	return public_insntable[code];
+}
+
 #ifndef CONFIG_BPF_JIT_ALWAYS_ON
 /**
  *	__bpf_prog_run - run eBPF program on a given context
@@ -793,115 +924,18 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
 static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
 {
 	u64 tmp;
+#define BPF_INSN_2_LBL(x, y)    [BPF_##x | BPF_##y] = &&x##_##y
+#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
 	static const void *jumptable[256] = {
 		[0 ... 255] = &&default_label,
 		/* Now overwrite non-defaults ... */
-		/* 32 bit ALU operations */
-		[BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X,
-		[BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K,
-		[BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X,
-		[BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K,
-		[BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X,
-		[BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K,
-		[BPF_ALU | BPF_OR | BPF_X]  = &&ALU_OR_X,
-		[BPF_ALU | BPF_OR | BPF_K]  = &&ALU_OR_K,
-		[BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X,
-		[BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K,
-		[BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X,
-		[BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K,
-		[BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X,
-		[BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K,
-		[BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X,
-		[BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K,
-		[BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X,
-		[BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K,
-		[BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X,
-		[BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K,
-		[BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X,
-		[BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K,
-		[BPF_ALU | BPF_NEG] = &&ALU_NEG,
-		[BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE,
-		[BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE,
-		/* 64 bit ALU operations */
-		[BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X,
-		[BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K,
-		[BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X,
-		[BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K,
-		[BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X,
-		[BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K,
-		[BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X,
-		[BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K,
-		[BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X,
-		[BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K,
-		[BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X,
-		[BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K,
-		[BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X,
-		[BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K,
-		[BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X,
-		[BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K,
-		[BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X,
-		[BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K,
-		[BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X,
-		[BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K,
-		[BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X,
-		[BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K,
-		[BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X,
-		[BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K,
-		[BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
-		/* Call instruction */
-		[BPF_JMP | BPF_CALL] = &&JMP_CALL,
+		BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL),
+		/* Non-UAPI available opcodes. */
 		[BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS,
 		[BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
-		/* Jumps */
-		[BPF_JMP | BPF_JA] = &&JMP_JA,
-		[BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
-		[BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K,
-		[BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X,
-		[BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K,
-		[BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X,
-		[BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K,
-		[BPF_JMP | BPF_JLT | BPF_X] = &&JMP_JLT_X,
-		[BPF_JMP | BPF_JLT | BPF_K] = &&JMP_JLT_K,
-		[BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X,
-		[BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K,
-		[BPF_JMP | BPF_JLE | BPF_X] = &&JMP_JLE_X,
-		[BPF_JMP | BPF_JLE | BPF_K] = &&JMP_JLE_K,
-		[BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X,
-		[BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K,
-		[BPF_JMP | BPF_JSLT | BPF_X] = &&JMP_JSLT_X,
-		[BPF_JMP | BPF_JSLT | BPF_K] = &&JMP_JSLT_K,
-		[BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X,
-		[BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K,
-		[BPF_JMP | BPF_JSLE | BPF_X] = &&JMP_JSLE_X,
-		[BPF_JMP | BPF_JSLE | BPF_K] = &&JMP_JSLE_K,
-		[BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X,
-		[BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K,
-		/* Program return */
-		[BPF_JMP | BPF_EXIT] = &&JMP_EXIT,
-		/* Store instructions */
-		[BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B,
-		[BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H,
-		[BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W,
-		[BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW,
-		[BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W,
-		[BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW,
-		[BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B,
-		[BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H,
-		[BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W,
-		[BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW,
-		/* Load instructions */
-		[BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B,
-		[BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H,
-		[BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W,
-		[BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW,
-		[BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W,
-		[BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H,
-		[BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B,
-		[BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W,
-		[BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H,
-		[BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
-		[BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW,
 	};
+#undef BPF_INSN_3_LBL
+#undef BPF_INSN_2_LBL
 	u32 tail_call_cnt = 0;
 	void *ptr;
 	int off;
@@ -965,14 +999,10 @@ select_insn:
 		(*(s64 *) &DST) >>= IMM;
 		CONT;
 	ALU64_MOD_X:
-		if (unlikely(SRC == 0))
-			return 0;
 		div64_u64_rem(DST, SRC, &tmp);
 		DST = tmp;
 		CONT;
 	ALU_MOD_X:
-		if (unlikely((u32)SRC == 0))
-			return 0;
 		tmp = (u32) DST;
 		DST = do_div(tmp, (u32) SRC);
 		CONT;
@@ -985,13 +1015,9 @@ select_insn:
 		DST = do_div(tmp, (u32) IMM);
 		CONT;
 	ALU64_DIV_X:
-		if (unlikely(SRC == 0))
-			return 0;
 		DST = div64_u64(DST, SRC);
 		CONT;
 	ALU_DIV_X:
-		if (unlikely((u32)SRC == 0))
-			return 0;
 		tmp = (u32) DST;
 		do_div(tmp, (u32) SRC);
 		DST = (u32) tmp;
@@ -1302,8 +1328,14 @@ load_byte:
 		goto load_byte;
 
 	default_label:
-		/* If we ever reach this, we have a bug somewhere. */
-		WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
+		/* If we ever reach this, we have a bug somewhere. Die hard here
+		 * instead of just returning 0; we could be somewhere in a subprog,
+		 * so execution could continue otherwise which we do /not/ want.
+		 *
+		 * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable().
+		 */
+		pr_warn("BPF interpreter: unknown opcode %02x\n", insn->code);
+		BUG_ON(1);
 		return 0;
 }
 STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index d7ea96218516..7b469d10d0e9 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -593,11 +593,10 @@ unlock:
 
 static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
 {
+	struct lpm_trie_node *node, *next_node = NULL, *parent, *search_root;
 	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
 	struct bpf_lpm_trie_key *key = _key, *next_key = _next_key;
-	struct lpm_trie_node *node, *next_node = NULL, *parent;
 	struct lpm_trie_node **node_stack = NULL;
-	struct lpm_trie_node __rcu **root;
 	int err = 0, stack_ptr = -1;
 	unsigned int next_bit;
 	size_t matchlen;
@@ -614,22 +613,21 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
 	 */
 
 	/* Empty trie */
-	if (!rcu_dereference(trie->root))
+	search_root = rcu_dereference(trie->root);
+	if (!search_root)
 		return -ENOENT;
 
 	/* For invalid key, find the leftmost node in the trie */
-	if (!key || key->prefixlen > trie->max_prefixlen) {
-		root = &trie->root;
+	if (!key || key->prefixlen > trie->max_prefixlen)
 		goto find_leftmost;
-	}
 
 	node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *),
-			     GFP_USER | __GFP_NOWARN);
+			     GFP_ATOMIC | __GFP_NOWARN);
 	if (!node_stack)
 		return -ENOMEM;
 
 	/* Try to find the exact node for the given key */
-	for (node = rcu_dereference(trie->root); node;) {
+	for (node = search_root; node;) {
 		node_stack[++stack_ptr] = node;
 		matchlen = longest_prefix_match(trie, node, key);
 		if (node->prefixlen != matchlen ||
@@ -640,10 +638,8 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
 		node = rcu_dereference(node->child[next_bit]);
 	}
 	if (!node || node->prefixlen != key->prefixlen ||
-	    (node->flags & LPM_TREE_NODE_FLAG_IM)) {
-		root = &trie->root;
+	    (node->flags & LPM_TREE_NODE_FLAG_IM))
 		goto find_leftmost;
-	}
 
 	/* The node with the exactly-matching key has been found,
 	 * find the first node in postorder after the matched node.
@@ -651,10 +647,10 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
 	node = node_stack[stack_ptr];
 	while (stack_ptr > 0) {
 		parent = node_stack[stack_ptr - 1];
-		if (rcu_dereference(parent->child[0]) == node &&
-		    rcu_dereference(parent->child[1])) {
-			root = &parent->child[1];
-			goto find_leftmost;
+		if (rcu_dereference(parent->child[0]) == node) {
+			search_root = rcu_dereference(parent->child[1]);
+			if (search_root)
+				goto find_leftmost;
 		}
 		if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) {
 			next_node = parent;
@@ -673,7 +669,7 @@ find_leftmost:
 	/* Find the leftmost non-intermediate node, all intermediate nodes
 	 * have exact two children, so this function will never return NULL.
 	 */
-	for (node = rcu_dereference(*root); node;) {
+	for (node = search_root; node;) {
 		if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
 			next_node = node;
 		node = rcu_dereference(node->child[0]);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5bdb0cc84ad2..e24aa3241387 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -709,10 +709,7 @@ static int map_update_elem(union bpf_attr *attr)
 		err = bpf_percpu_hash_update(map, key, value, attr->flags);
 	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
 		err = bpf_percpu_array_update(map, key, value, attr->flags);
-	} else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY ||
-		   map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
-		   map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY ||
-		   map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) {
+	} else if (IS_FD_ARRAY(map)) {
 		rcu_read_lock();
 		err = bpf_fd_array_map_update_elem(map, f.file, key, value,
 						   attr->flags);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index dfb138b46488..5fb69a85d967 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4981,6 +4981,13 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
 next_insn:
 			insn++;
 			i++;
+			continue;
+		}
+
+		/* Basic sanity check before we invest more work here. */
+		if (!bpf_opcode_in_insntable(insn->code)) {
+			verbose(env, "unknown opcode %02x\n", insn->code);
+			return -EINVAL;
 		}
 	}
 
@@ -5064,14 +5071,21 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
 	return new_prog;
 }
 
-/* The verifier does more data flow analysis than llvm and will not explore
- * branches that are dead at run time. Malicious programs can have dead code
- * too. Therefore replace all dead at-run-time code with nops.
+/* The verifier does more data flow analysis than llvm and will not
+ * explore branches that are dead at run time. Malicious programs can
+ * have dead code too. Therefore replace all dead at-run-time code
+ * with 'ja -1'.
+ *
+ * Just nops are not optimal, e.g. if they would sit at the end of the
+ * program and through another bug we would manage to jump there, then
+ * we'd execute beyond program memory otherwise. Returning exception
+ * code also wouldn't work since we can have subprogs where the dead
+ * code could be located.
  */
 static void sanitize_dead_code(struct bpf_verifier_env *env)
 {
 	struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
-	struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0);
+	struct bpf_insn trap = BPF_JMP_IMM(BPF_JA, 0, 0, -1);
 	struct bpf_insn *insn = env->prog->insnsi;
 	const int insn_cnt = env->prog->len;
 	int i;
@@ -5079,7 +5093,7 @@ static void sanitize_dead_code(struct bpf_verifier_env *env)
 	for (i = 0; i < insn_cnt; i++) {
 		if (aux_data[i].seen)
 			continue;
-		memcpy(insn + i, &nop, sizeof(nop));
+		memcpy(insn + i, &trap, sizeof(trap));
 	}
 }
 
@@ -5386,15 +5400,37 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 	int i, cnt, delta = 0;
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
-		if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
+		if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
+		    insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
+		    insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
 		    insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
-			/* due to JIT bugs clear upper 32-bits of src register
-			 * before div/mod operation
-			 */
-			insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg);
-			insn_buf[1] = *insn;
-			cnt = 2;
-			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
+			struct bpf_insn mask_and_div[] = {
+				BPF_MOV32_REG(insn->src_reg, insn->src_reg),
+				/* Rx div 0 -> 0 */
+				BPF_JMP_IMM(BPF_JNE, insn->src_reg, 0, 2),
+				BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg),
+				BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+				*insn,
+			};
+			struct bpf_insn mask_and_mod[] = {
+				BPF_MOV32_REG(insn->src_reg, insn->src_reg),
+				/* Rx mod 0 -> Rx */
+				BPF_JMP_IMM(BPF_JEQ, insn->src_reg, 0, 1),
+				*insn,
+			};
+			struct bpf_insn *patchlet;
+
+			if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
+			    insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
+				patchlet = mask_and_div + (is64 ? 1 : 0);
+				cnt = ARRAY_SIZE(mask_and_div) - (is64 ? 1 : 0);
+			} else {
+				patchlet = mask_and_mod + (is64 ? 1 : 0);
+				cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 1 : 0);
+			}
+
+			new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt);
 			if (!new_prog)
 				return -ENOMEM;
 
diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index e3938e395cba..4cd9ea9b3449 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -2003,10 +2003,14 @@ static struct bpf_test tests[] = {
 		{ { 4, 0 }, { 5, 10 } }
 	},
 	{
-		"INT: DIV by zero",
+		/* This one doesn't go through verifier, but is just raw insn
+		 * as opposed to cBPF tests from here. Thus div by 0 tests are
+		 * done in test_verifier in BPF kselftests.
+		 */
+		"INT: DIV by -1",
 		.u.insns_int = {
 			BPF_ALU64_REG(BPF_MOV, R6, R1),
-			BPF_ALU64_IMM(BPF_MOV, R7, 0),
+			BPF_ALU64_IMM(BPF_MOV, R7, -1),
 			BPF_LD_ABS(BPF_B, 3),
 			BPF_ALU32_REG(BPF_DIV, R0, R7),
 			BPF_EXIT_INSN(),
diff --git a/net/can/Kconfig b/net/can/Kconfig
index a15c0e0d1fc7..a4399be54ff4 100644
--- a/net/can/Kconfig
+++ b/net/can/Kconfig
@@ -11,7 +11,7 @@ menuconfig CAN
 	  1991, mainly for automotive, but now widely used in marine
 	  (NMEA2000), industrial, and medical applications.
 	  More information on the CAN network protocol family PF_CAN
-	  is contained in <Documentation/networking/can.txt>.
+	  is contained in <Documentation/networking/can.rst>.
 
 	  If you want CAN support you should say Y here and also to the
 	  specific driver for your controller(s) below.
diff --git a/net/core/filter.c b/net/core/filter.c
index 18da42a81d0c..08ab4c65a998 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -401,8 +401,8 @@ do_pass:
 		/* Classic BPF expects A and X to be reset first. These need
 		 * to be guaranteed to be the first two instructions.
 		 */
-		*new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
-		*new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);
+		*new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
+		*new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);
 
 		/* All programs must keep CTX in callee saved BPF_REG_CTX.
 		 * In eBPF case it's done by the compiler, here we need to
@@ -459,8 +459,15 @@ do_pass:
 				break;
 
 			if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) ||
-			    fp->code == (BPF_ALU | BPF_MOD | BPF_X))
+			    fp->code == (BPF_ALU | BPF_MOD | BPF_X)) {
 				*insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X);
+				/* Error with exception code on div/mod by 0.
+				 * For cBPF programs, this has always been "return 0".
+				 */
+				*insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2);
+				*insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
+				*insn++ = BPF_EXIT_INSN();
+			}
 
 			*insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
 			break;
@@ -3232,6 +3239,29 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 			ret = -EINVAL;
 		}
 #ifdef CONFIG_INET
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (level == SOL_IPV6) {
+		if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
+			return -EINVAL;
+
+		val = *((int *)optval);
+		/* Only some options are supported */
+		switch (optname) {
+		case IPV6_TCLASS:
+			if (val < -1 || val > 0xff) {
+				ret = -EINVAL;
+			} else {
+				struct ipv6_pinfo *np = inet6_sk(sk);
+
+				if (val == -1)
+					val = 0;
+				np->tclass = val;
+			}
+			break;
+		default:
+			ret = -EINVAL;
+		}
+#endif
 	} else if (level == SOL_TCP &&
 		   sk->sk_prot->setsockopt == tcp_setsockopt) {
 		if (optname == TCP_CONGESTION) {
@@ -3241,7 +3271,8 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 			strncpy(name, optval, min_t(long, optlen,
 						    TCP_CA_NAME_MAX-1));
 			name[TCP_CA_NAME_MAX-1] = 0;
-			ret = tcp_set_congestion_control(sk, name, false, reinit);
+			ret = tcp_set_congestion_control(sk, name, false,
+							 reinit);
 		} else {
 			struct tcp_sock *tp = tcp_sk(sk);
 
@@ -3307,6 +3338,22 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 		} else {
 			goto err_clear;
 		}
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (level == SOL_IPV6) {
+		struct ipv6_pinfo *np = inet6_sk(sk);
+
+		if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
+			goto err_clear;
+
+		/* Only some options are supported */
+		switch (optname) {
+		case IPV6_TCLASS:
+			*((int *)optval) = (int)np->tclass;
+			break;
+		default:
+			goto err_clear;
+		}
+#endif
 	} else {
 		goto err_clear;
 	}
@@ -3328,6 +3375,33 @@ static const struct bpf_func_proto bpf_getsockopt_proto = {
 	.arg5_type	= ARG_CONST_SIZE,
 };
 
+BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock,
+	   int, argval)
+{
+	struct sock *sk = bpf_sock->sk;
+	int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;
+
+	if (!sk_fullsock(sk))
+		return -EINVAL;
+
+#ifdef CONFIG_INET
+	if (val)
+		tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
+
+	return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS);
+#else
+	return -EINVAL;
+#endif
+}
+
+static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = {
+	.func		= bpf_sock_ops_cb_flags_set,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
 {
@@ -3510,6 +3584,8 @@ static const struct bpf_func_proto *
 		return &bpf_setsockopt_proto;
 	case BPF_FUNC_getsockopt:
 		return &bpf_getsockopt_proto;
+	case BPF_FUNC_sock_ops_cb_flags_set:
+		return &bpf_sock_ops_cb_flags_set_proto;
 	case BPF_FUNC_sock_map_update:
 		return &bpf_sock_map_update_proto;
 	default:
@@ -3826,34 +3902,44 @@ void bpf_warn_invalid_xdp_action(u32 act)
 }
 EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
 
-static bool __is_valid_sock_ops_access(int off, int size)
+static bool sock_ops_is_valid_access(int off, int size,
+				     enum bpf_access_type type,
+				     struct bpf_insn_access_aux *info)
 {
+	const int size_default = sizeof(__u32);
+
 	if (off < 0 || off >= sizeof(struct bpf_sock_ops))
 		return false;
+
 	/* The verifier guarantees that size > 0. */
 	if (off % size != 0)
 		return false;
-	if (size != sizeof(__u32))
-		return false;
-
-	return true;
-}
 
-static bool sock_ops_is_valid_access(int off, int size,
-				     enum bpf_access_type type,
-				     struct bpf_insn_access_aux *info)
-{
 	if (type == BPF_WRITE) {
 		switch (off) {
-		case offsetof(struct bpf_sock_ops, op) ...
-		     offsetof(struct bpf_sock_ops, replylong[3]):
+		case offsetof(struct bpf_sock_ops, reply):
+		case offsetof(struct bpf_sock_ops, sk_txhash):
+			if (size != size_default)
+				return false;
 			break;
 		default:
 			return false;
 		}
+	} else {
+		switch (off) {
+		case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received,
+					bytes_acked):
+			if (size != sizeof(__u64))
+				return false;
+			break;
+		default:
+			if (size != size_default)
+				return false;
+			break;
+		}
 	}
 
-	return __is_valid_sock_ops_access(off, size);
+	return true;
 }
 
 static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write,
@@ -4470,10 +4556,37 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 					       is_fullsock));
 		break;
 
-/* Helper macro for adding read access to tcp_sock fields. */
-#define SOCK_OPS_GET_TCP32(FIELD_NAME)					      \
+	case offsetof(struct bpf_sock_ops, state):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_state) != 1);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct bpf_sock_ops_kern, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern, sk));
+		*insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common, skc_state));
+		break;
+
+	case offsetof(struct bpf_sock_ops, rtt_min):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) !=
+			     sizeof(struct minmax));
+		BUILD_BUG_ON(sizeof(struct minmax) <
+			     sizeof(struct minmax_sample));
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct bpf_sock_ops_kern, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern, sk));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      offsetof(struct tcp_sock, rtt_min) +
+				      FIELD_SIZEOF(struct minmax_sample, t));
+		break;
+
+/* Helper macro for adding read access to tcp_sock or sock fields. */
+#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)			      \
 	do {								      \
-		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) != 4); \
+		BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) >		      \
+			     FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD));   \
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
 						struct bpf_sock_ops_kern,     \
 						is_fullsock),		      \
@@ -4485,17 +4598,159 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 						struct bpf_sock_ops_kern, sk),\
 				      si->dst_reg, si->src_reg,		      \
 				      offsetof(struct bpf_sock_ops_kern, sk));\
-		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,        \
-				      offsetof(struct tcp_sock, FIELD_NAME)); \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ,		      \
+						       OBJ_FIELD),	      \
+				      si->dst_reg, si->dst_reg,		      \
+				      offsetof(OBJ, OBJ_FIELD));	      \
+	} while (0)
+
+/* Helper macro for adding write access to tcp_sock or sock fields.
+ * The macro is called with two registers, dst_reg which contains a pointer
+ * to ctx (context) and src_reg which contains the value that should be
+ * stored. However, we need an additional register since we cannot overwrite
+ * dst_reg because it may be used later in the program.
+ * Instead we "borrow" one of the other registers. We first save its value
+ * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore
+ * it at the end of the macro.
+ */
+#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)			      \
+	do {								      \
+		int reg = BPF_REG_9;					      \
+		BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) >		      \
+			     FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD));   \
+		if (si->dst_reg == reg || si->src_reg == reg)		      \
+			reg--;						      \
+		if (si->dst_reg == reg || si->src_reg == reg)		      \
+			reg--;						      \
+		*insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg,		      \
+				      offsetof(struct bpf_sock_ops_kern,      \
+					       temp));			      \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
+						struct bpf_sock_ops_kern,     \
+						is_fullsock),		      \
+				      reg, si->dst_reg,			      \
+				      offsetof(struct bpf_sock_ops_kern,      \
+					       is_fullsock));		      \
+		*insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2);		      \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
+						struct bpf_sock_ops_kern, sk),\
+				      reg, si->dst_reg,			      \
+				      offsetof(struct bpf_sock_ops_kern, sk));\
+		*insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD),	      \
+				      reg, si->src_reg,			      \
+				      offsetof(OBJ, OBJ_FIELD));	      \
+		*insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg,		      \
+				      offsetof(struct bpf_sock_ops_kern,      \
+					       temp));			      \
+	} while (0)
+
+#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE)	      \
+	do {								      \
+		if (TYPE == BPF_WRITE)					      \
+			SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ);	      \
+		else							      \
+			SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ);	      \
 	} while (0)
 
 	case offsetof(struct bpf_sock_ops, snd_cwnd):
-		SOCK_OPS_GET_TCP32(snd_cwnd);
+		SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock);
 		break;
 
 	case offsetof(struct bpf_sock_ops, srtt_us):
-		SOCK_OPS_GET_TCP32(srtt_us);
+		SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock);
 		break;
+
+	case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags):
+		SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags,
+				   struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, snd_ssthresh):
+		SOCK_OPS_GET_FIELD(snd_ssthresh, snd_ssthresh, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, rcv_nxt):
+		SOCK_OPS_GET_FIELD(rcv_nxt, rcv_nxt, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, snd_nxt):
+		SOCK_OPS_GET_FIELD(snd_nxt, snd_nxt, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, snd_una):
+		SOCK_OPS_GET_FIELD(snd_una, snd_una, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, mss_cache):
+		SOCK_OPS_GET_FIELD(mss_cache, mss_cache, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, ecn_flags):
+		SOCK_OPS_GET_FIELD(ecn_flags, ecn_flags, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, rate_delivered):
+		SOCK_OPS_GET_FIELD(rate_delivered, rate_delivered,
+				   struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, rate_interval_us):
+		SOCK_OPS_GET_FIELD(rate_interval_us, rate_interval_us,
+				   struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, packets_out):
+		SOCK_OPS_GET_FIELD(packets_out, packets_out, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, retrans_out):
+		SOCK_OPS_GET_FIELD(retrans_out, retrans_out, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, total_retrans):
+		SOCK_OPS_GET_FIELD(total_retrans, total_retrans,
+				   struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, segs_in):
+		SOCK_OPS_GET_FIELD(segs_in, segs_in, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, data_segs_in):
+		SOCK_OPS_GET_FIELD(data_segs_in, data_segs_in, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, segs_out):
+		SOCK_OPS_GET_FIELD(segs_out, segs_out, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, data_segs_out):
+		SOCK_OPS_GET_FIELD(data_segs_out, data_segs_out,
+				   struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, lost_out):
+		SOCK_OPS_GET_FIELD(lost_out, lost_out, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, sacked_out):
+		SOCK_OPS_GET_FIELD(sacked_out, sacked_out, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, sk_txhash):
+		SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
+					  struct sock, type);
+		break;
+
+	case offsetof(struct bpf_sock_ops, bytes_received):
+		SOCK_OPS_GET_FIELD(bytes_received, bytes_received,
+				   struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, bytes_acked):
+		SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock);
+		break;
+
 	}
 	return insn - insn_buf;
 }
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d7cf861bf699..f013ddc191e0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -463,7 +463,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op)
 	tcp_mtup_init(sk);
 	icsk->icsk_af_ops->rebuild_header(sk);
 	tcp_init_metrics(sk);
-	tcp_call_bpf(sk, bpf_op);
+	tcp_call_bpf(sk, bpf_op, 0, NULL);
 	tcp_init_congestion_control(sk);
 	tcp_init_buffer_space(sk);
 }
@@ -2042,6 +2042,30 @@ void tcp_set_state(struct sock *sk, int state)
 {
 	int oldstate = sk->sk_state;
 
+	/* We defined a new enum for TCP states that are exported in BPF
+	 * so as not to force the internal TCP states to be frozen. The
+	 * following checks will detect if an internal state value ever
+	 * differs from the BPF value. If this ever happens, then we will
+	 * need to remap the internal value to the BPF value before calling
+	 * tcp_call_bpf_2arg.
+	 */
+	BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
+	BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
+	BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
+	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
+	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
+	BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
+	BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
+	BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
+	BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
+	BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
+	BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
+	BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
+	BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);
+
+	if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
+		tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);
+
 	switch (state) {
 	case TCP_ESTABLISHED:
 		if (oldstate != TCP_ESTABLISHED)
diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c
index 0b5a05bd82e3..ddbce73edae8 100644
--- a/net/ipv4/tcp_nv.c
+++ b/net/ipv4/tcp_nv.c
@@ -146,7 +146,7 @@ static void tcpnv_init(struct sock *sk)
 	 * within a datacenter, where we have reasonable estimates of
 	 * RTTs
 	 */
-	base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT);
+	base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT, 0, NULL);
 	if (base_rtt > 0) {
 		ca->nv_base_rtt = base_rtt;
 		ca->nv_lower_bound_rtt = (base_rtt * 205) >> 8; /* 80% */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 95461f02ac9a..e9f985e42405 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2905,6 +2905,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
 	}
 
+	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
+		tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
+				  TCP_SKB_CB(skb)->seq, segs, err);
+
 	if (likely(!err)) {
 		TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
 		trace_tcp_retransmit_skb(sk, skb);
@@ -3469,7 +3473,7 @@ int tcp_connect(struct sock *sk)
 	struct sk_buff *buff;
 	int err;
 
-	tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB);
+	tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);
 
 	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
 		return -EHOSTUNREACH; /* Routing failure or similar. */
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 6db3124cdbda..257abdde23b0 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -213,11 +213,18 @@ static int tcp_write_timeout(struct sock *sk)
 						icsk->icsk_user_timeout);
 	}
 	tcp_fastopen_active_detect_blackhole(sk, expired);
+
+	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG))
+		tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB,
+				  icsk->icsk_retransmits,
+				  icsk->icsk_rto, (int)expired);
+
 	if (expired) {
 		/* Has it gone just too far? */
 		tcp_write_err(sk);
 		return 1;
 	}
+
 	return 0;
 }
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index aa4411c81e7e..fe3966a9c999 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2440,7 +2440,8 @@ static int ip6_convert_metrics(struct mx6_config *mxc,
 
 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
 					    struct fib6_config *cfg,
-					    const struct in6_addr *gw_addr)
+					    const struct in6_addr *gw_addr,
+					    u32 tbid, int flags)
 {
 	struct flowi6 fl6 = {
 		.flowi6_oif = cfg->fc_ifindex,
@@ -2449,15 +2450,15 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net,
 	};
 	struct fib6_table *table;
 	struct rt6_info *rt;
-	int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
 
-	table = fib6_get_table(net, cfg->fc_table);
+	table = fib6_get_table(net, tbid);
 	if (!table)
 		return NULL;
 
 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
 		flags |= RT6_LOOKUP_F_HAS_SADDR;
 
+	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
 
 	/* if table lookup failed, fall back to full lookup */
@@ -2469,6 +2470,82 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net,
 	return rt;
 }
 
+static int ip6_route_check_nh_onlink(struct net *net,
+				     struct fib6_config *cfg,
+				     struct net_device *dev,
+				     struct netlink_ext_ack *extack)
+{
+	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
+	const struct in6_addr *gw_addr = &cfg->fc_gateway;
+	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
+	struct rt6_info *grt;
+	int err;
+
+	err = 0;
+	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
+	if (grt) {
+		if (grt->rt6i_flags & flags || dev != grt->dst.dev) {
+			NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
+			err = -EINVAL;
+		}
+
+		ip6_rt_put(grt);
+	}
+
+	return err;
+}
+
+static int ip6_route_check_nh(struct net *net,
+			      struct fib6_config *cfg,
+			      struct net_device **_dev,
+			      struct inet6_dev **idev)
+{
+	const struct in6_addr *gw_addr = &cfg->fc_gateway;
+	struct net_device *dev = _dev ? *_dev : NULL;
+	struct rt6_info *grt = NULL;
+	int err = -EHOSTUNREACH;
+
+	if (cfg->fc_table) {
+		int flags = RT6_LOOKUP_F_IFACE;
+
+		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
+					  cfg->fc_table, flags);
+		if (grt) {
+			if (grt->rt6i_flags & RTF_GATEWAY ||
+			    (dev && dev != grt->dst.dev)) {
+				ip6_rt_put(grt);
+				grt = NULL;
+			}
+		}
+	}
+
+	if (!grt)
+		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
+
+	if (!grt)
+		goto out;
+
+	if (dev) {
+		if (dev != grt->dst.dev) {
+			ip6_rt_put(grt);
+			goto out;
+		}
+	} else {
+		*_dev = dev = grt->dst.dev;
+		*idev = grt->rt6i_idev;
+		dev_hold(dev);
+		in6_dev_hold(grt->rt6i_idev);
+	}
+
+	if (!(grt->rt6i_flags & RTF_GATEWAY))
+		err = 0;
+
+	ip6_rt_put(grt);
+
+out:
+	return err;
+}
+
 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 					      struct netlink_ext_ack *extack)
 {
@@ -2520,6 +2597,21 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 	if (cfg->fc_metric == 0)
 		cfg->fc_metric = IP6_RT_PRIO_USER;
 
+	if (cfg->fc_flags & RTNH_F_ONLINK) {
+		if (!dev) {
+			NL_SET_ERR_MSG(extack,
+				       "Nexthop device required for onlink");
+			err = -ENODEV;
+			goto out;
+		}
+
+		if (!(dev->flags & IFF_UP)) {
+			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
+			err = -ENETDOWN;
+			goto out;
+		}
+	}
+
 	err = -ENOBUFS;
 	if (cfg->fc_nlinfo.nlh &&
 	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
@@ -2664,8 +2756,6 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 		rt->rt6i_gateway = *gw_addr;
 
 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
-			struct rt6_info *grt = NULL;
-
 			/* IPv6 strictly inhibits using not link-local
 			   addresses as nexthop address.
 			   Otherwise, router will not able to send redirects.
@@ -2682,40 +2772,12 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 				goto out;
 			}
 
-			if (cfg->fc_table) {
-				grt = ip6_nh_lookup_table(net, cfg, gw_addr);
-
-				if (grt) {
-					if (grt->rt6i_flags & RTF_GATEWAY ||
-					    (dev && dev != grt->dst.dev)) {
-						ip6_rt_put(grt);
-						grt = NULL;
-					}
-				}
-			}
-
-			if (!grt)
-				grt = rt6_lookup(net, gw_addr, NULL,
-						 cfg->fc_ifindex, 1);
-
-			err = -EHOSTUNREACH;
-			if (!grt)
-				goto out;
-			if (dev) {
-				if (dev != grt->dst.dev) {
-					ip6_rt_put(grt);
-					goto out;
-				}
+			if (cfg->fc_flags & RTNH_F_ONLINK) {
+				err = ip6_route_check_nh_onlink(net, cfg, dev,
+								extack);
 			} else {
-				dev = grt->dst.dev;
-				idev = grt->rt6i_idev;
-				dev_hold(dev);
-				in6_dev_hold(grt->rt6i_idev);
+				err = ip6_route_check_nh(net, cfg, &dev, &idev);
 			}
-			if (!(grt->rt6i_flags & RTF_GATEWAY))
-				err = 0;
-			ip6_rt_put(grt);
-
 			if (err)
 				goto out;
 		}
@@ -2757,6 +2819,7 @@ install_route:
 	if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
 	    !netif_carrier_ok(dev))
 		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
+	rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
 	rt->dst.dev = dev;
 	rt->rt6i_idev = idev;
 	rt->rt6i_table = table;
@@ -3826,6 +3889,8 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (rtm->rtm_flags & RTM_F_CLONED)
 		cfg->fc_flags |= RTF_CACHE;
 
+	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
+
 	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
 	cfg->fc_nlinfo.nlh = nlh;
 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
@@ -4231,6 +4296,7 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
 			goto nla_put_failure;
 	}
 
+	*flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK);
 	if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
 		*flags |= RTNH_F_OFFLOAD;
 
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index cf0e11978b66..267e68379110 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -115,7 +115,6 @@ static int smc_release(struct socket *sock)
 		goto out;
 
 	smc = smc_sk(sk);
-	sock_hold(sk);
 	if (sk->sk_state == SMC_LISTEN)
 		/* smc_close_non_accepted() is called and acquires
 		 * sock lock for child sockets again
@@ -124,10 +123,7 @@ static int smc_release(struct socket *sock)
 	else
 		lock_sock(sk);
 
-	if (smc->use_fallback) {
-		sk->sk_state = SMC_CLOSED;
-		sk->sk_state_change(sk);
-	} else {
+	if (!smc->use_fallback) {
 		rc = smc_close_active(smc);
 		sock_set_flag(sk, SOCK_DEAD);
 		sk->sk_shutdown |= SHUTDOWN_MASK;
@@ -136,20 +132,21 @@ static int smc_release(struct socket *sock)
 		sock_release(smc->clcsock);
 		smc->clcsock = NULL;
 	}
+	if (smc->use_fallback) {
+		sock_put(sk); /* passive closing */
+		sk->sk_state = SMC_CLOSED;
+		sk->sk_state_change(sk);
+	}
 
 	/* detach socket */
 	sock_orphan(sk);
 	sock->sk = NULL;
-	if (smc->use_fallback) {
-		schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
-	} else if (sk->sk_state == SMC_CLOSED) {
+	if (!smc->use_fallback && sk->sk_state == SMC_CLOSED)
 		smc_conn_free(&smc->conn);
-		schedule_delayed_work(&smc->sock_put_work,
-				      SMC_CLOSE_SOCK_PUT_DELAY);
-	}
 	release_sock(sk);
 
-	sock_put(sk);
+	sk->sk_prot->unhash(sk);
+	sock_put(sk); /* final sock_put */
 out:
 	return rc;
 }
@@ -181,7 +178,6 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
 	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
 	INIT_LIST_HEAD(&smc->accept_q);
 	spin_lock_init(&smc->accept_q_lock);
-	INIT_DELAYED_WORK(&smc->sock_put_work, smc_close_sock_put_work);
 	sk->sk_prot->hash(sk);
 	sk_refcnt_debug_inc(sk);
 
@@ -399,6 +395,8 @@ static int smc_connect_rdma(struct smc_sock *smc)
 	int rc = 0;
 	u8 ibport;
 
+	sock_hold(&smc->sk); /* sock put in passive closing */
+
 	if (!tcp_sk(smc->clcsock->sk)->syn_smc) {
 		/* peer has not signalled SMC-capability */
 		smc->use_fallback = true;
@@ -542,6 +540,8 @@ out_err_unlock:
 	mutex_unlock(&smc_create_lgr_pending);
 	smc_conn_free(&smc->conn);
 out_err:
+	if (smc->sk.sk_state == SMC_INIT)
+		sock_put(&smc->sk); /* passive closing */
 	return rc;
 }
 
@@ -620,7 +620,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
 		new_sk->sk_state = SMC_CLOSED;
 		sock_set_flag(new_sk, SOCK_DEAD);
 		new_sk->sk_prot->unhash(new_sk);
-		sock_put(new_sk);
+		sock_put(new_sk); /* final */
 		*new_smc = NULL;
 		goto out;
 	}
@@ -637,7 +637,7 @@ static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
 {
 	struct smc_sock *par = smc_sk(parent);
 
-	sock_hold(sk);
+	sock_hold(sk); /* sock_put in smc_accept_unlink () */
 	spin_lock(&par->accept_q_lock);
 	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
 	spin_unlock(&par->accept_q_lock);
@@ -653,7 +653,7 @@ static void smc_accept_unlink(struct sock *sk)
 	list_del_init(&smc_sk(sk)->accept_q);
 	spin_unlock(&par->accept_q_lock);
 	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
-	sock_put(sk);
+	sock_put(sk); /* sock_hold in smc_accept_enqueue */
 }
 
 /* remove a sock from the accept queue to bind it to a new socket created
@@ -670,8 +670,12 @@ struct sock *smc_accept_dequeue(struct sock *parent,
 
 		smc_accept_unlink(new_sk);
 		if (new_sk->sk_state == SMC_CLOSED) {
+			if (isk->clcsock) {
+				sock_release(isk->clcsock);
+				isk->clcsock = NULL;
+			}
 			new_sk->sk_prot->unhash(new_sk);
-			sock_put(new_sk);
+			sock_put(new_sk); /* final */
 			continue;
 		}
 		if (new_sock)
@@ -686,14 +690,11 @@ void smc_close_non_accepted(struct sock *sk)
 {
 	struct smc_sock *smc = smc_sk(sk);
 
-	sock_hold(sk);
 	lock_sock(sk);
 	if (!sk->sk_lingertime)
 		/* wait for peer closing */
 		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
-	if (smc->use_fallback) {
-		sk->sk_state = SMC_CLOSED;
-	} else {
+	if (!smc->use_fallback) {
 		smc_close_active(smc);
 		sock_set_flag(sk, SOCK_DEAD);
 		sk->sk_shutdown |= SHUTDOWN_MASK;
@@ -706,14 +707,15 @@ void smc_close_non_accepted(struct sock *sk)
 		sock_release(tcp);
 	}
 	if (smc->use_fallback) {
-		schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
-	} else if (sk->sk_state == SMC_CLOSED) {
-		smc_conn_free(&smc->conn);
-		schedule_delayed_work(&smc->sock_put_work,
-				      SMC_CLOSE_SOCK_PUT_DELAY);
+		sock_put(sk); /* passive closing */
+		sk->sk_state = SMC_CLOSED;
+	} else {
+		if (sk->sk_state == SMC_CLOSED)
+			smc_conn_free(&smc->conn);
 	}
 	release_sock(sk);
-	sock_put(sk);
+	sk->sk_prot->unhash(sk);
+	sock_put(sk); /* final sock_put */
 }
 
 static int smc_serv_conf_first_link(struct smc_sock *smc)
@@ -937,6 +939,8 @@ out_err_unlock:
 		smc_lgr_forget(new_smc->conn.lgr);
 	mutex_unlock(&smc_create_lgr_pending);
 out_err:
+	if (newsmcsk->sk_state == SMC_INIT)
+		sock_put(&new_smc->sk); /* passive closing */
 	newsmcsk->sk_state = SMC_CLOSED;
 	smc_conn_free(&new_smc->conn);
 	goto enqueue; /* queue new sock with sk_err set */
@@ -963,12 +967,22 @@ static void smc_tcp_listen_work(struct work_struct *work)
 		sock_hold(lsk); /* sock_put in smc_listen_work */
 		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
 		smc_copy_sock_settings_to_smc(new_smc);
-		schedule_work(&new_smc->smc_listen_work);
+		sock_hold(&new_smc->sk); /* sock_put in passive closing */
+		if (!schedule_work(&new_smc->smc_listen_work))
+			sock_put(&new_smc->sk);
 	}
 
 out:
+	if (lsmc->clcsock) {
+		sock_release(lsmc->clcsock);
+		lsmc->clcsock = NULL;
+	}
 	release_sock(lsk);
-	lsk->sk_data_ready(lsk); /* no more listening, wake accept */
+	/* no more listening, wake up smc_close_wait_listen_clcsock and
+	 * accept
+	 */
+	lsk->sk_state_change(lsk);
+	sock_put(&lsmc->sk); /* sock_hold in smc_listen */
 }
 
 static int smc_listen(struct socket *sock, int backlog)
@@ -1002,7 +1016,9 @@ static int smc_listen(struct socket *sock, int backlog)
 	sk->sk_ack_backlog = 0;
 	sk->sk_state = SMC_LISTEN;
 	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
-	schedule_work(&smc->tcp_listen_work);
+	sock_hold(sk); /* sock_put in smc_tcp_listen_work */
+	if (!schedule_work(&smc->tcp_listen_work))
+		sock_put(sk);
 
 out:
 	release_sock(sk);
@@ -1019,6 +1035,7 @@ static int smc_accept(struct socket *sock, struct socket *new_sock,
 	int rc = 0;
 
 	lsmc = smc_sk(sk);
+	sock_hold(sk); /* sock_put below */
 	lock_sock(sk);
 
 	if (lsmc->sk.sk_state != SMC_LISTEN) {
@@ -1053,6 +1070,7 @@ static int smc_accept(struct socket *sock, struct socket *new_sock,
 
 out:
 	release_sock(sk);
+	sock_put(sk); /* sock_hold above */
 	return rc;
 }
 
@@ -1122,21 +1140,15 @@ out:
 
 static unsigned int smc_accept_poll(struct sock *parent)
 {
-	struct smc_sock *isk;
-	struct sock *sk;
+	struct smc_sock *isk = smc_sk(parent);
+	int mask = 0;
 
-	lock_sock(parent);
-	list_for_each_entry(isk, &smc_sk(parent)->accept_q, accept_q) {
-		sk = (struct sock *)isk;
+	spin_lock(&isk->accept_q_lock);
+	if (!list_empty(&isk->accept_q))
+		mask = POLLIN | POLLRDNORM;
+	spin_unlock(&isk->accept_q_lock);
 
-		if (sk->sk_state == SMC_ACTIVE) {
-			release_sock(parent);
-			return POLLIN | POLLRDNORM;
-		}
-	}
-	release_sock(parent);
-
-	return 0;
+	return mask;
 }
 
 static unsigned int smc_poll(struct file *file, struct socket *sock,
@@ -1147,9 +1159,15 @@ static unsigned int smc_poll(struct file *file, struct socket *sock,
 	struct smc_sock *smc;
 	int rc;
 
+	if (!sk)
+		return POLLNVAL;
+
 	smc = smc_sk(sock->sk);
+	sock_hold(sk);
+	lock_sock(sk);
 	if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
 		/* delegate to CLC child sock */
+		release_sock(sk);
 		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
 		/* if non-blocking connect finished ... */
 		lock_sock(sk);
@@ -1161,37 +1179,43 @@ static unsigned int smc_poll(struct file *file, struct socket *sock,
 				rc = smc_connect_rdma(smc);
 				if (rc < 0)
 					mask |= POLLERR;
-				else
-					/* success cases including fallback */
-					mask |= POLLOUT | POLLWRNORM;
+				/* success cases including fallback */
+				mask |= POLLOUT | POLLWRNORM;
 			}
 		}
-		release_sock(sk);
 	} else {
-		sock_poll_wait(file, sk_sleep(sk), wait);
-		if (sk->sk_state == SMC_LISTEN)
-			/* woken up by sk_data_ready in smc_listen_work() */
-			mask |= smc_accept_poll(sk);
+		if (sk->sk_state != SMC_CLOSED) {
+			release_sock(sk);
+			sock_poll_wait(file, sk_sleep(sk), wait);
+			lock_sock(sk);
+		}
 		if (sk->sk_err)
 			mask |= POLLERR;
-		if (atomic_read(&smc->conn.sndbuf_space) ||
-		    (sk->sk_shutdown & SEND_SHUTDOWN)) {
-			mask |= POLLOUT | POLLWRNORM;
-		} else {
-			sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
-			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-		}
-		if (atomic_read(&smc->conn.bytes_to_rcv))
-			mask |= POLLIN | POLLRDNORM;
 		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
 		    (sk->sk_state == SMC_CLOSED))
 			mask |= POLLHUP;
-		if (sk->sk_shutdown & RCV_SHUTDOWN)
-			mask |= POLLIN | POLLRDNORM | POLLRDHUP;
-		if (sk->sk_state == SMC_APPCLOSEWAIT1)
-			mask |= POLLIN;
+		if (sk->sk_state == SMC_LISTEN) {
+			/* woken up by sk_data_ready in smc_listen_work() */
+			mask = smc_accept_poll(sk);
+		} else {
+			if (atomic_read(&smc->conn.sndbuf_space) ||
+			    sk->sk_shutdown & SEND_SHUTDOWN) {
+				mask |= POLLOUT | POLLWRNORM;
+			} else {
+				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+			}
+			if (atomic_read(&smc->conn.bytes_to_rcv))
+				mask |= POLLIN | POLLRDNORM;
+			if (sk->sk_shutdown & RCV_SHUTDOWN)
+				mask |= POLLIN | POLLRDNORM | POLLRDHUP;
+			if (sk->sk_state == SMC_APPCLOSEWAIT1)
+				mask |= POLLIN;
+		}
 
 	}
+	release_sock(sk);
+	sock_put(sk);
 
 	return mask;
 }
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 0bee9d16cf29..9518986c97b1 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -178,7 +178,6 @@ struct smc_sock {				/* smc sock container */
 	struct work_struct	smc_listen_work;/* prepare new accept socket */
 	struct list_head	accept_q;	/* sockets to be accepted */
 	spinlock_t		accept_q_lock;	/* protects accept_q */
-	struct delayed_work	sock_put_work;	/* final socket freeing */
 	bool			use_fallback;	/* fallback to tcp */
 	u8			wait_close_tx_prepared : 1;
 						/* shutdown wr or close
@@ -253,12 +252,12 @@ static inline int smc_uncompress_bufsize(u8 compressed)
 static inline bool using_ipsec(struct smc_sock *smc)
 {
 	return (smc->clcsock->sk->sk_policy[0] ||
-		smc->clcsock->sk->sk_policy[1]) ? 1 : 0;
+		smc->clcsock->sk->sk_policy[1]) ? true : false;
 }
 #else
 static inline bool using_ipsec(struct smc_sock *smc)
 {
-	return 0;
+	return false;
 }
 #endif
 
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index 6e8f5fbe0f09..3cd086e5bd28 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -212,6 +212,14 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
 		smc->sk.sk_data_ready(&smc->sk);
 	}
 
+	/* piggy backed tx info */
+	/* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */
+	if (diff_cons && smc_tx_prepared_sends(conn)) {
+		smc_tx_sndbuf_nonempty(conn);
+		/* trigger socket release if connection closed */
+		smc_close_wake_tx_prepared(smc);
+	}
+
 	if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
 		smc->sk.sk_err = ECONNRESET;
 		conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
@@ -221,15 +229,9 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
 		if (smc->clcsock && smc->clcsock->sk)
 			smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN;
 		sock_set_flag(&smc->sk, SOCK_DONE);
-		schedule_work(&conn->close_work);
-	}
-
-	/* piggy backed tx info */
-	/* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */
-	if (diff_cons && smc_tx_prepared_sends(conn)) {
-		smc_tx_sndbuf_nonempty(conn);
-		/* trigger socket release if connection closed */
-		smc_close_wake_tx_prepared(smc);
+		sock_hold(&smc->sk); /* sock_put in close_work */
+		if (!schedule_work(&conn->close_work))
+			sock_put(&smc->sk);
 	}
 }
 
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index babe05d385e7..e339c0186dcf 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -19,6 +19,8 @@
 #include "smc_cdc.h"
 #include "smc_close.h"
 
+#define SMC_CLOSE_WAIT_LISTEN_CLCSOCK_TIME	(5 * HZ)
+
 static void smc_close_cleanup_listen(struct sock *parent)
 {
 	struct sock *sk;
@@ -28,6 +30,27 @@ static void smc_close_cleanup_listen(struct sock *parent)
 		smc_close_non_accepted(sk);
 }
 
+static void smc_close_wait_listen_clcsock(struct smc_sock *smc)
+{	/* Loop until smc->clcsock is NULL or the timeout has run down to 0. */
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	struct sock *sk = &smc->sk;
+	signed long timeout;
+
+	timeout = SMC_CLOSE_WAIT_LISTEN_CLCSOCK_TIME;	/* 5 * HZ */
+	add_wait_queue(sk_sleep(sk), &wait);
+	do {
+		release_sock(sk);	/* sock lock is dropped while waiting */
+		if (smc->clcsock)
+			timeout = wait_woken(&wait, TASK_UNINTERRUPTIBLE,
+					     timeout);
+		sched_annotate_sleep();
+		lock_sock(sk);		/* retaken before the re-check below */
+		if (!smc->clcsock)
+			break;		/* clcsock was cleared: stop waiting */
+	} while (timeout);
+	remove_wait_queue(sk_sleep(sk), &wait);
+}
+
 /* wait for sndbuf data being transmitted */
 static void smc_close_stream_wait(struct smc_sock *smc, long timeout)
 {
@@ -110,10 +133,10 @@ static void smc_close_active_abort(struct smc_sock *smc)
 		release_sock(sk);
 		cancel_delayed_work_sync(&smc->conn.tx_work);
 		lock_sock(sk);
+		sock_put(sk); /* passive closing */
 		break;
 	case SMC_APPCLOSEWAIT1:
 	case SMC_APPCLOSEWAIT2:
-		sock_release(smc->clcsock);
 		if (!smc_cdc_rxed_any_close(&smc->conn))
 			sk->sk_state = SMC_PEERABORTWAIT;
 		else
@@ -125,19 +148,20 @@ static void smc_close_active_abort(struct smc_sock *smc)
 	case SMC_PEERCLOSEWAIT1:
 	case SMC_PEERCLOSEWAIT2:
 		if (!txflags->peer_conn_closed) {
+			/* just SHUTDOWN_SEND done */
 			sk->sk_state = SMC_PEERABORTWAIT;
-			sock_release(smc->clcsock);
 		} else {
 			sk->sk_state = SMC_CLOSED;
 		}
+		sock_put(sk); /* passive closing */
 		break;
 	case SMC_PROCESSABORT:
 	case SMC_APPFINCLOSEWAIT:
-		if (!txflags->peer_conn_closed)
-			sock_release(smc->clcsock);
 		sk->sk_state = SMC_CLOSED;
 		break;
 	case SMC_PEERFINCLOSEWAIT:
+		sock_put(sk); /* passive closing */
+		break;
 	case SMC_PEERABORTWAIT:
 	case SMC_CLOSED:
 		break;
@@ -172,8 +196,6 @@ again:
 	switch (sk->sk_state) {
 	case SMC_INIT:
 		sk->sk_state = SMC_CLOSED;
-		if (smc->smc_listen_work.func)
-			cancel_work_sync(&smc->smc_listen_work);
 		break;
 	case SMC_LISTEN:
 		sk->sk_state = SMC_CLOSED;
@@ -182,11 +204,9 @@ again:
 			rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
 			/* wake up kernel_accept of smc_tcp_listen_worker */
 			smc->clcsock->sk->sk_data_ready(smc->clcsock->sk);
+			smc_close_wait_listen_clcsock(smc);
 		}
-		release_sock(sk);
 		smc_close_cleanup_listen(sk);
-		cancel_work_sync(&smc->smc_listen_work);
-		lock_sock(sk);
 		break;
 	case SMC_ACTIVE:
 		smc_close_stream_wait(smc, timeout);
@@ -229,12 +249,14 @@ again:
 		rc = smc_close_final(conn);
 		if (rc)
 			break;
-		if (smc_cdc_rxed_any_close(conn))
+		if (smc_cdc_rxed_any_close(conn)) {
 			/* peer has closed the socket already */
 			sk->sk_state = SMC_CLOSED;
-		else
+			sock_put(sk); /* postponed passive closing */
+		} else {
 			/* peer has just issued a shutdown write */
 			sk->sk_state = SMC_PEERFINCLOSEWAIT;
+		}
 		break;
 	case SMC_PEERCLOSEWAIT1:
 	case SMC_PEERCLOSEWAIT2:
@@ -272,27 +294,33 @@ static void smc_close_passive_abort_received(struct smc_sock *smc)
 	struct sock *sk = &smc->sk;
 
 	switch (sk->sk_state) {
+	case SMC_INIT:
 	case SMC_ACTIVE:
-	case SMC_APPFINCLOSEWAIT:
 	case SMC_APPCLOSEWAIT1:
-	case SMC_APPCLOSEWAIT2:
+		sk->sk_state = SMC_PROCESSABORT;
+		sock_put(sk); /* passive closing */
+		break;
+	case SMC_APPFINCLOSEWAIT:
 		sk->sk_state = SMC_PROCESSABORT;
 		break;
 	case SMC_PEERCLOSEWAIT1:
 	case SMC_PEERCLOSEWAIT2:
 		if (txflags->peer_done_writing &&
-		    !smc_close_sent_any_close(&smc->conn)) {
+		    !smc_close_sent_any_close(&smc->conn))
 			/* just shutdown, but not yet closed locally */
 			sk->sk_state = SMC_PROCESSABORT;
-		} else {
+		else
 			sk->sk_state = SMC_CLOSED;
-		}
+		sock_put(sk); /* passive closing */
 		break;
+	case SMC_APPCLOSEWAIT2:
 	case SMC_PEERFINCLOSEWAIT:
+		sk->sk_state = SMC_CLOSED;
+		sock_put(sk); /* passive closing */
+		break;
 	case SMC_PEERABORTWAIT:
 		sk->sk_state = SMC_CLOSED;
 		break;
-	case SMC_INIT:
 	case SMC_PROCESSABORT:
 	/* nothing to do, add tracing in future patch */
 		break;
@@ -336,13 +364,18 @@ static void smc_close_passive_work(struct work_struct *work)
 	case SMC_INIT:
 		if (atomic_read(&conn->bytes_to_rcv) ||
 		    (rxflags->peer_done_writing &&
-		     !smc_cdc_rxed_any_close(conn)))
+		     !smc_cdc_rxed_any_close(conn))) {
 			sk->sk_state = SMC_APPCLOSEWAIT1;
-		else
+		} else {
 			sk->sk_state = SMC_CLOSED;
+			sock_put(sk); /* passive closing */
+		}
 		break;
 	case SMC_ACTIVE:
 		sk->sk_state = SMC_APPCLOSEWAIT1;
+		/* postpone sock_put() for passive closing to cover
+		 * received SEND_SHUTDOWN as well
+		 */
 		break;
 	case SMC_PEERCLOSEWAIT1:
 		if (rxflags->peer_done_writing)
@@ -360,13 +393,20 @@ static void smc_close_passive_work(struct work_struct *work)
 			/* just shutdown, but not yet closed locally */
 			sk->sk_state = SMC_APPFINCLOSEWAIT;
 		}
+		sock_put(sk); /* passive closing */
 		break;
 	case SMC_PEERFINCLOSEWAIT:
-		if (smc_cdc_rxed_any_close(conn))
+		if (smc_cdc_rxed_any_close(conn)) {
 			sk->sk_state = SMC_CLOSED;
+			sock_put(sk); /* passive closing */
+		}
 		break;
 	case SMC_APPCLOSEWAIT1:
 	case SMC_APPCLOSEWAIT2:
+		/* postpone sock_put() for passive closing to cover
+		 * received SEND_SHUTDOWN as well
+		 */
+		break;
 	case SMC_APPFINCLOSEWAIT:
 	case SMC_PEERABORTWAIT:
 	case SMC_PROCESSABORT:
@@ -382,23 +422,11 @@ wakeup:
 	if (old_state != sk->sk_state) {
 		sk->sk_state_change(sk);
 		if ((sk->sk_state == SMC_CLOSED) &&
-		    (sock_flag(sk, SOCK_DEAD) || !sk->sk_socket)) {
+		    (sock_flag(sk, SOCK_DEAD) || !sk->sk_socket))
 			smc_conn_free(conn);
-			schedule_delayed_work(&smc->sock_put_work,
-					      SMC_CLOSE_SOCK_PUT_DELAY);
-		}
 	}
 	release_sock(sk);
-}
-
-void smc_close_sock_put_work(struct work_struct *work)
-{
-	struct smc_sock *smc = container_of(to_delayed_work(work),
-					    struct smc_sock,
-					    sock_put_work);
-
-	smc->sk.sk_prot->unhash(&smc->sk);
-	sock_put(&smc->sk);
+	sock_put(sk); /* sock_hold done by schedulers of close_work */
 }
 
 int smc_close_shutdown_write(struct smc_sock *smc)
diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h
index 8c498885d758..19eb6a211c23 100644
--- a/net/smc/smc_close.h
+++ b/net/smc/smc_close.h
@@ -21,7 +21,6 @@
 
 void smc_close_wake_tx_prepared(struct smc_sock *smc);
 int smc_close_active(struct smc_sock *smc);
-void smc_close_sock_put_work(struct work_struct *work);
 int smc_close_shutdown_write(struct smc_sock *smc);
 void smc_close_init(struct smc_sock *smc);
 
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index ed5b46d1fe41..2424c7100aaf 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -328,13 +328,13 @@ void smc_lgr_terminate(struct smc_link_group *lgr)
 	while (node) {
 		conn = rb_entry(node, struct smc_connection, alert_node);
 		smc = container_of(conn, struct smc_sock, conn);
-		sock_hold(&smc->sk);
+		sock_hold(&smc->sk); /* sock_put in close work */
 		conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
 		__smc_lgr_unregister_conn(conn);
 		write_unlock_bh(&lgr->conns_lock);
-		schedule_work(&conn->close_work);
+		if (!schedule_work(&conn->close_work))
+			sock_put(&smc->sk);
 		write_lock_bh(&lgr->conns_lock);
-		sock_put(&smc->sk);
 		node = rb_first(&lgr->conns_all);
 	}
 	write_unlock_bh(&lgr->conns_lock);
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 90f1a7f9085c..2a8957bd6d38 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -141,6 +141,17 @@ out:
 	return rc;
 }
 
+static void smc_ib_port_terminate(struct smc_ib_device *smcibdev, u8 ibport)
+{	/* Terminate every link group whose single link uses smcibdev/ibport. */
+	struct smc_link_group *lgr, *l;	/* l: lookahead for the _safe walk */
+
+	list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
+		if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
+		    lgr->lnk[SMC_SINGLE_LINK].ibport == ibport)
+			smc_lgr_terminate(lgr);
+	}	/* NOTE(review): no list lock is taken here - confirm it is safe */
+}
+
 /* process context wrapper for might_sleep smc_ib_remember_port_attr */
 static void smc_ib_port_event_work(struct work_struct *work)
 {
@@ -151,6 +162,8 @@ static void smc_ib_port_event_work(struct work_struct *work)
 	for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) {
 		smc_ib_remember_port_attr(smcibdev, port_idx + 1);
 		clear_bit(port_idx, &smcibdev->port_event_mask);
+		if (!smc_ib_port_active(smcibdev, port_idx + 1))
+			smc_ib_port_terminate(smcibdev, port_idx + 1);
 	}
 }
 
@@ -165,15 +178,7 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler,
 
 	switch (ibevent->event) {
 	case IB_EVENT_PORT_ERR:
-		port_idx = ibevent->element.port_num - 1;
-		set_bit(port_idx, &smcibdev->port_event_mask);
-		schedule_work(&smcibdev->port_event_work);
-		/* fall through */
 	case IB_EVENT_DEVICE_FATAL:
-		/* tbd in follow-on patch:
-		 * abnormal close of corresponding connections
-		 */
-		break;
 	case IB_EVENT_PORT_ACTIVE:
 		port_idx = ibevent->element.port_num - 1;
 		set_bit(port_idx, &smcibdev->port_event_mask);
@@ -186,7 +191,8 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler,
 
 void smc_ib_dealloc_protection_domain(struct smc_link *lnk)
 {
-	ib_dealloc_pd(lnk->roce_pd);
+	if (lnk->roce_pd)
+		ib_dealloc_pd(lnk->roce_pd);
 	lnk->roce_pd = NULL;
 }
 
@@ -203,14 +209,18 @@ int smc_ib_create_protection_domain(struct smc_link *lnk)
 
 static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
 {
+	struct smc_ib_device *smcibdev =
+		(struct smc_ib_device *)ibevent->device;
+	u8 port_idx;
+
 	switch (ibevent->event) {
 	case IB_EVENT_DEVICE_FATAL:
 	case IB_EVENT_GID_CHANGE:
 	case IB_EVENT_PORT_ERR:
 	case IB_EVENT_QP_ACCESS_ERR:
-		/* tbd in follow-on patch:
-		 * abnormal close of corresponding connections
-		 */
+		port_idx = ibevent->element.port_num - 1;
+		set_bit(port_idx, &smcibdev->port_event_mask);
+		schedule_work(&smcibdev->port_event_work);
 		break;
 	default:
 		break;
@@ -219,7 +229,8 @@ static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
 
 void smc_ib_destroy_queue_pair(struct smc_link *lnk)
 {
-	ib_destroy_qp(lnk->roce_qp);
+	if (lnk->roce_qp)
+		ib_destroy_qp(lnk->roce_qp);
 	lnk->roce_qp = NULL;
 }
 
@@ -462,6 +473,7 @@ static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
 {
 	if (!smcibdev->initialized)
 		return;
+	smcibdev->initialized = 0;
 	smc_wr_remove_dev(smcibdev);
 	ib_unregister_event_handler(&smcibdev->event_handler);
 	ib_destroy_cq(smcibdev->roce_cq_recv);
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 92b4648e75ca..8e70291e586a 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -147,8 +147,8 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 	if (!x->type_offload)
 		return -EINVAL;
 
-	/* We don't yet support UDP encapsulation, TFC padding and ESN. */
-	if (x->encap || x->tfcpad || (x->props.flags & XFRM_STATE_ESN))
+	/* We don't yet support UDP encapsulation and TFC padding. */
+	if (x->encap || x->tfcpad)
 		return -EINVAL;
 
 	dev = dev_get_by_index(net, xuo->ifindex);
@@ -178,6 +178,13 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 		return 0;
 	}
 
+	if (x->props.flags & XFRM_STATE_ESN &&
+	    !dev->xfrmdev_ops->xdo_dev_state_advance_esn) {
+		xso->dev = NULL;
+		dev_put(dev);
+		return -EINVAL;
+	}
+
 	xso->dev = dev;
 	xso->num_exthdrs = 1;
 	xso->flags = xuo->flags;
diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c
index 02501817227b..1d38c6acf8af 100644
--- a/net/xfrm/xfrm_replay.c
+++ b/net/xfrm/xfrm_replay.c
@@ -551,6 +551,8 @@ static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq)
 			bitnr = replay_esn->replay_window - (diff - pos);
 	}
 
+	xfrm_dev_state_advance_esn(x);
+
 	nr = bitnr >> 5;
 	bitnr = bitnr & 0x1F;
 	replay_esn->bmp[nr] |= (1U << bitnr);
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 7f61a3d57fa7..64335bb94f9f 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -201,13 +201,16 @@ CLANG_ARCH_ARGS = -target $(ARCH)
 endif
 
 # Trick to allow make to be run from this directory
-all:
+all: $(LIBBPF)
 	$(MAKE) -C ../../ $(CURDIR)/
 
 clean:
 	$(MAKE) -C ../../ M=$(CURDIR) clean
 	@rm -f *~
 
+$(LIBBPF): FORCE
+	$(MAKE) -C $(dir $@) $(notdir $@)
+
 $(obj)/syscall_nrs.s:	$(src)/syscall_nrs.c
 	$(call if_changed_dep,cc_s_c)
 
diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c
index 7cc9d228216f..7c25c0c112bc 100644
--- a/samples/sockmap/sockmap_user.c
+++ b/samples/sockmap/sockmap_user.c
@@ -23,8 +23,11 @@
 #include <stdbool.h>
 #include <signal.h>
 #include <fcntl.h>
+#include <sys/wait.h>
+#include <time.h>
 
 #include <sys/time.h>
+#include <sys/resource.h>
 #include <sys/types.h>
 
 #include <linux/netlink.h>
@@ -35,6 +38,8 @@
 #include <assert.h>
 #include <libgen.h>
 
+#include <getopt.h>
+
 #include "../bpf/bpf_load.h"
 #include "../bpf/bpf_util.h"
 #include "../bpf/libbpf.h"
@@ -46,15 +51,42 @@ void running_handler(int a);
 #define S1_PORT 10000
 #define S2_PORT 10001
 
-static int sockmap_test_sockets(int rate, int dot)
+/* global sockets */
+int s1, s2, c1, c2, p1, p2;
+
+static const struct option long_options[] = {
+	{"help",	no_argument,		NULL, 'h' },
+	{"cgroup",	required_argument,	NULL, 'c' },
+	{"rate",	required_argument,	NULL, 'r' },
+	{"verbose",	no_argument,		NULL, 'v' },
+	{"iov_count",	required_argument,	NULL, 'i' },
+	{"length",	required_argument,	NULL, 'l' },
+	{"test",	required_argument,	NULL, 't' },
+	{0, 0, NULL, 0 }
+};
+
+/* Print the command-line synopsis followed by every entry of long_options. */
+static void usage(char *argv[])
+{
+	const struct option *opt;
+
+	printf(" Usage: %s --cgroup <cgroup_path>\n", argv[0]);
+	printf(" options:\n");
+	for (opt = long_options; opt->name; opt++) {
+		printf(" --%-12s", opt->name);
+		if (!opt->flag)		/* no flag pointer: val is the short option */
+			printf(" -%c\n", opt->val);
+		else
+			printf(" flag (internal value:%d)\n", *opt->flag);
+	}
+	printf("\n");
+}
+
+static int sockmap_init_sockets(void)
 {
-	int i, sc, err, max_fd, one = 1;
-	int s1, s2, c1, c2, p1, p2;
+	int i, err, one = 1;
 	struct sockaddr_in addr;
-	struct timeval timeout;
-	char buf[1024] = {0};
 	int *fds[4] = {&s1, &s2, &c1, &c2};
-	fd_set w;
 
 	s1 = s2 = p1 = p2 = c1 = c2 = 0;
 
@@ -63,8 +95,7 @@ static int sockmap_test_sockets(int rate, int dot)
 		*fds[i] = socket(AF_INET, SOCK_STREAM, 0);
 		if (*fds[i] < 0) {
 			perror("socket s1 failed()");
-			err = *fds[i];
-			goto out;
+			return errno;
 		}
 	}
 
@@ -74,16 +105,16 @@ static int sockmap_test_sockets(int rate, int dot)
 				 (char *)&one, sizeof(one));
 		if (err) {
 			perror("setsockopt failed()");
-			goto out;
+			return errno;
 		}
 	}
 
 	/* Non-blocking sockets */
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < 2; i++) {
 		err = ioctl(*fds[i], FIONBIO, (char *)&one);
 		if (err < 0) {
 			perror("ioctl s1 failed()");
-			goto out;
+			return errno;
 		}
 	}
 
@@ -96,14 +127,14 @@ static int sockmap_test_sockets(int rate, int dot)
 	err = bind(s1, (struct sockaddr *)&addr, sizeof(addr));
 	if (err < 0) {
 		perror("bind s1 failed()\n");
-		goto out;
+		return errno;
 	}
 
 	addr.sin_port = htons(S2_PORT);
 	err = bind(s2, (struct sockaddr *)&addr, sizeof(addr));
 	if (err < 0) {
 		perror("bind s2 failed()\n");
-		goto out;
+		return errno;
 	}
 
 	/* Listen server sockets */
@@ -111,14 +142,14 @@ static int sockmap_test_sockets(int rate, int dot)
 	err = listen(s1, 32);
 	if (err < 0) {
 		perror("listen s1 failed()\n");
-		goto out;
+		return errno;
 	}
 
 	addr.sin_port = htons(S2_PORT);
 	err = listen(s2, 32);
 	if (err < 0) {
 		perror("listen s1 failed()\n");
-		goto out;
+		return errno;
 	}
 
 	/* Initiate Connect */
@@ -126,46 +157,232 @@ static int sockmap_test_sockets(int rate, int dot)
 	err = connect(c1, (struct sockaddr *)&addr, sizeof(addr));
 	if (err < 0 && errno != EINPROGRESS) {
 		perror("connect c1 failed()\n");
-		goto out;
+		return errno;
 	}
 
 	addr.sin_port = htons(S2_PORT);
 	err = connect(c2, (struct sockaddr *)&addr, sizeof(addr));
 	if (err < 0 && errno != EINPROGRESS) {
 		perror("connect c2 failed()\n");
-		goto out;
+		return errno;
+	} else if (err < 0) {
+		err = 0;
 	}
 
 	/* Accept Connecrtions */
 	p1 = accept(s1, NULL, NULL);
 	if (p1 < 0) {
 		perror("accept s1 failed()\n");
-		goto out;
+		return errno;
 	}
 
 	p2 = accept(s2, NULL, NULL);
 	if (p2 < 0) {
 		perror("accept s1 failed()\n");
-		goto out;
+		return errno;
 	}
 
-	max_fd = p2;
-	timeout.tv_sec = 10;
-	timeout.tv_usec = 0;
-
 	printf("connected sockets: c1 <-> p1, c2 <-> p2\n");
 	printf("cgroups binding: c1(%i) <-> s1(%i) - - - c2(%i) <-> s2(%i)\n",
 		c1, s1, c2, s2);
+	return 0;
+}
+
+struct msg_stats {	/* counters and timestamps filled in by msg_loop() */
+	size_t bytes_sent;	/* sum of the sendmsg() return values */
+	size_t bytes_recvd;	/* sum of the recvmsg() return values */
+	struct timespec start;	/* CLOCK_MONOTONIC taken before the loop */
+	struct timespec end;	/* CLOCK_MONOTONIC taken after it, or on failure */
+};
+
+/* Send (tx) or receive (!tx) cnt messages of iov_count x iov_length bytes on
+ * fd, recording byte totals and start/end times in *s; returns 0 or errno.
+ */
+static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
+		    struct msg_stats *s, bool tx)
+{
+	struct msghdr msg = {0};
+	int err, i, flags = MSG_NOSIGNAL;
+	struct iovec *iov;
+
+	iov = calloc(iov_count, sizeof(struct iovec));
+	if (!iov)
+		return errno;
+
+	for (i = 0; i < iov_count; i++) {
+		char *d = calloc(iov_length, sizeof(char));
+
+		if (!d) {
+			fprintf(stderr, "iov_count %i/%i OOM\n", i, iov_count);
+			goto out_errno;
+		}
+		iov[i].iov_base = d;
+		iov[i].iov_len = iov_length;
+	}
+
+	msg.msg_iov = iov;
+	msg.msg_iovlen = iov_count;
+
+	if (tx) {
+		clock_gettime(CLOCK_MONOTONIC, &s->start);
+		for (i = 0; i < cnt; i++) {
+			int sent = sendmsg(fd, &msg, flags);
+
+			if (sent < 0) {
+				perror("send loop error:");
+				goto out_errno;
+			}
+			s->bytes_sent += sent;
+		}
+		clock_gettime(CLOCK_MONOTONIC, &s->end);
+	} else {
+		int slct, recv, max_fd = fd;
+		struct timeval timeout;
+		float total_bytes;
+		fd_set w;
+
+		total_bytes = (float)iov_count * (float)iov_length * (float)cnt;
+		err = clock_gettime(CLOCK_MONOTONIC, &s->start);
+		if (err < 0)
+			perror("recv start time: ");
+		while (s->bytes_recvd < total_bytes) {
+			timeout.tv_sec = 1;
+			timeout.tv_usec = 0;
+
+			/* FD sets */
+			FD_ZERO(&w);
+			FD_SET(fd, &w);
+
+			slct = select(max_fd + 1, &w, NULL, NULL, &timeout);
+			if (slct == -1) {
+				perror("select()");
+				clock_gettime(CLOCK_MONOTONIC, &s->end);
+				goto out_errno;
+			} else if (!slct) {
+				fprintf(stderr, "unexpected timeout\n");
+				errno = EIO;	/* errno values are positive */
+				clock_gettime(CLOCK_MONOTONIC, &s->end);
+				goto out_errno;
+			}
+
+			recv = recvmsg(fd, &msg, flags);
+			if (recv < 0 && errno != EWOULDBLOCK) {
+				clock_gettime(CLOCK_MONOTONIC, &s->end);
+				perror("recv failed()\n");
+				goto out_errno;
+			}
+			/* a would-block -1 must not be added to the size_t total */
+			if (recv > 0)
+				s->bytes_recvd += recv;
+		}
+		clock_gettime(CLOCK_MONOTONIC, &s->end);
+	}
+
+	errno = 0;	/* success: fall through to the shared cleanup */
+out_errno:
+	err = errno;	/* latch it before the free() calls below */
+	for (i = 0; i < iov_count; i++)
+		free(iov[i].iov_base);
+	free(iov);
+	return err;
+}
+
+static float giga = 1000000000;
+
+/* Bytes per whole elapsed second; callers must check the delta is non-zero. */
+static inline float sentBps(struct msg_stats s)
+{
+	return (float)s.bytes_sent / (s.end.tv_sec - s.start.tv_sec);
+}
+static inline float recvdBps(struct msg_stats s)
+{
+	return (float)s.bytes_recvd / (s.end.tv_sec - s.start.tv_sec);
+}
+
+/* Fork a receiver (p1 if base, else p2) and a sender (c1) that each run
+ * msg_loop() and print their rates; returns 0 only if both exit cleanly.
+ */
+static int sendmsg_test(int iov_count, int iov_buf, int cnt,
+			int verbose, bool base)
+{
+	float sent_Bps = 0, recvd_Bps = 0;
+	int rx_fd, txpid, rxpid, status, err = 0;
+	struct msg_stats s = {0};
+
+	errno = 0;
+	rx_fd = base ? p1 : p2;
+
+	rxpid = fork();
+	if (rxpid == 0) {
+		err = msg_loop(rx_fd, iov_count, iov_buf, cnt, &s, false);
+		if (err)
+			fprintf(stderr,
+				"msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n",
+				iov_count, iov_buf, cnt, err);
+		shutdown(p2, SHUT_RDWR);
+		shutdown(p1, SHUT_RDWR);
+		if (s.end.tv_sec - s.start.tv_sec) {
+			sent_Bps = sentBps(s);
+			recvd_Bps = recvdBps(s);
+		}
+		fprintf(stdout,
+			"rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s\n",
+			s.bytes_sent, sent_Bps, sent_Bps/giga,
+			s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
+		exit(err ? 1 : 0);	/* exit status carries the result */
+	} else if (rxpid == -1) {
+		perror("msg_loop_rx: ");
+		return errno;
+	}
+
+	txpid = fork();
+	if (txpid == 0) {
+		err = msg_loop(c1, iov_count, iov_buf, cnt, &s, true);
+		if (err)
+			fprintf(stderr,
+				"msg_loop_tx: iov_count %i iov_buf %i cnt %i err %i\n",
+				iov_count, iov_buf, cnt, err);
+		shutdown(c1, SHUT_RDWR);
+		if (s.end.tv_sec - s.start.tv_sec) {
+			sent_Bps = sentBps(s);
+			recvd_Bps = recvdBps(s);
+		}
+		fprintf(stdout,
+			"tx_sendmsg: TX: %zuB %fB/s %f GB/s RX: %zuB %fB/s %fGB/s\n",
+			s.bytes_sent, sent_Bps, sent_Bps/giga,
+			s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
+		exit(err ? 1 : 0);	/* exit status carries the result */
+	} else if (txpid == -1) {
+		perror("msg_loop_tx: ");
+		return errno;
+	}
+
+	if (waitpid(rxpid, &status, 0) != rxpid || status)
+		err = EIO;	/* waited outside assert(): survives NDEBUG */
+	if (waitpid(txpid, &status, 0) != txpid || status)
+		err = EIO;	/* status 0 only for a child that did exit(0) */
+	return err;
+}
+
+static int forever_ping_pong(int rate, int verbose)
+{
+	struct timeval timeout;
+	char buf[1024] = {0};
+	int sc;
+
+	timeout.tv_sec = 10;
+	timeout.tv_usec = 0;
 
 	/* Ping/Pong data from client to server */
 	sc = send(c1, buf, sizeof(buf), 0);
 	if (sc < 0) {
 		perror("send failed()\n");
-		goto out;
+		return sc;
 	}
 
 	do {
-		int s, rc, i;
+		int s, rc, i, max_fd = p2;
+		fd_set w;
 
 		/* FD sets */
 		FD_ZERO(&w);
@@ -193,7 +410,7 @@ static int sockmap_test_sockets(int rate, int dot)
 			if (rc < 0) {
 				if (errno != EWOULDBLOCK) {
 					perror("recv failed()\n");
-					break;
+					return rc;
 				}
 			}
 
@@ -205,35 +422,92 @@ static int sockmap_test_sockets(int rate, int dot)
 			sc = send(i, buf, rc, 0);
 			if (sc < 0) {
 				perror("send failed()\n");
-				break;
+				return sc;
 			}
 		}
-		sleep(rate);
-		if (dot) {
+
+		if (rate)
+			sleep(rate);
+
+		if (verbose) {
 			printf(".");
 			fflush(stdout);
 
 		}
 	} while (running);
 
-out:
-	close(s1);
-	close(s2);
-	close(p1);
-	close(p2);
-	close(c1);
-	close(c2);
-	return err;
+	return 0;
 }
 
+enum {
+	PING_PONG,
+	SENDMSG,
+	BASE,
+};
+
 int main(int argc, char **argv)
 {
-	int rate = 1, dot = 1;
+	int iov_count = 1, length = 1024, rate = 1, verbose = 0;
+	struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY};
+	int opt, longindex, err, cg_fd = 0;
+	int test = PING_PONG;
 	char filename[256];
-	int err, cg_fd;
-	char *cg_path;
 
-	cg_path = argv[argc - 1];
+	while ((opt = getopt_long(argc, argv, "hvc:r:i:l:t:",
+				  long_options, &longindex)) != -1) {
+		switch (opt) {
+		/* Cgroup configuration */
+		case 'c':
+			cg_fd = open(optarg, O_DIRECTORY, O_RDONLY);
+			if (cg_fd < 0) {
+				fprintf(stderr,
+					"ERROR: (%i) open cg path failed: %s\n",
+					cg_fd, optarg);
+				return cg_fd;
+			}
+			break;
+		case 'r':
+			rate = atoi(optarg);
+			break;
+		case 'v':
+			verbose = 1;
+			break;
+		case 'i':
+			iov_count = atoi(optarg);
+			break;
+		case 'l':
+			length = atoi(optarg);
+			break;
+		case 't':
+			if (strcmp(optarg, "ping") == 0) {
+				test = PING_PONG;
+			} else if (strcmp(optarg, "sendmsg") == 0) {
+				test = SENDMSG;
+			} else if (strcmp(optarg, "base") == 0) {
+				test = BASE;
+			} else {
+				usage(argv);
+				return -1;
+			}
+			break;
+		case 'h':
+		default:
+			usage(argv);
+			return -1;
+		}
+	}
+
+	if (!cg_fd) {
+		fprintf(stderr, "%s requires cgroup option: --cgroup <path>\n",
+			argv[0]);
+		return -1;
+	}
+
+	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+		perror("setrlimit(RLIMIT_MEMLOCK)");
+		return 1;
+	}
+
 	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
 
 	running = 1;
@@ -241,20 +515,16 @@ int main(int argc, char **argv)
 	/* catch SIGINT */
 	signal(SIGINT, running_handler);
 
+	/* If base test skip BPF setup */
+	if (test == BASE)
+		goto run;
+
 	if (load_bpf_file(filename)) {
 		fprintf(stderr, "load_bpf_file: (%s) %s\n",
 			filename, strerror(errno));
 		return 1;
 	}
 
-	/* Cgroup configuration */
-	cg_fd = open(cg_path, O_DIRECTORY, O_RDONLY);
-	if (cg_fd < 0) {
-		fprintf(stderr, "ERROR: (%i) open cg path failed: %s\n",
-			cg_fd, cg_path);
-		return cg_fd;
-	}
-
 	/* Attach programs to sockmap */
 	err = bpf_prog_attach(prog_fd[0], map_fd[0],
 				BPF_SK_SKB_STREAM_PARSER, 0);
@@ -280,12 +550,30 @@ int main(int argc, char **argv)
 		return err;
 	}
 
-	err = sockmap_test_sockets(rate, dot);
+run:
+	err = sockmap_init_sockets();
 	if (err) {
 		fprintf(stderr, "ERROR: test socket failed: %d\n", err);
-		return err;
+		goto out;
 	}
-	return 0;
+
+	if (test == PING_PONG)
+		err = forever_ping_pong(rate, verbose);
+	else if (test == SENDMSG)
+		err = sendmsg_test(iov_count, length, rate, verbose, false);
+	else if (test == BASE)
+		err = sendmsg_test(iov_count, length, rate, verbose, true);
+	else
+		fprintf(stderr, "unknown test\n");
+out:
+	close(s1);
+	close(s2);
+	close(p1);
+	close(p2);
+	close(c1);
+	close(c2);
+	close(cg_fd);
+	return err;
 }
 
 void running_handler(int a)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index af1f49ad8b88..db6bdc375126 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -17,7 +17,7 @@
 #define BPF_ALU64	0x07	/* alu mode in double word width */
 
 /* ld/ldx fields */
-#define BPF_DW		0x18	/* double word */
+#define BPF_DW		0x18	/* double word (64-bit) */
 #define BPF_XADD	0xc0	/* exclusive add */
 
 /* alu/jmp fields */
@@ -642,6 +642,14 @@ union bpf_attr {
  *     @optlen: length of optval in bytes
  *     Return: 0 or negative error
  *
+ * int bpf_sock_ops_cb_flags_set(bpf_sock_ops, flags)
+ *     Set callback flags for sock_ops
+ *     @bpf_sock_ops: pointer to bpf_sock_ops_kern struct
+ *     @flags: flags value
+ *     Return: 0 for no error
+ *             -EINVAL if there is no full tcp socket
+ *             bits in flags that are not supported by current kernel
+ *
  * int bpf_skb_adjust_room(skb, len_diff, mode, flags)
  *     Grow or shrink room in sk_buff.
  *     @skb: pointer to skb
@@ -748,7 +756,8 @@ union bpf_attr {
 	FN(perf_event_read_value),	\
 	FN(perf_prog_read_value),	\
 	FN(getsockopt),			\
-	FN(override_return),
+	FN(override_return),		\
+	FN(sock_ops_cb_flags_set),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -952,8 +961,9 @@ struct bpf_map_info {
 struct bpf_sock_ops {
 	__u32 op;
 	union {
-		__u32 reply;
-		__u32 replylong[4];
+		__u32 args[4];		/* Optionally passed to bpf program */
+		__u32 reply;		/* Returned by bpf program	    */
+		__u32 replylong[4];	/* Optionally returned by bpf prog  */
 	};
 	__u32 family;
 	__u32 remote_ip4;	/* Stored in network byte order */
@@ -968,8 +978,39 @@ struct bpf_sock_ops {
 				 */
 	__u32 snd_cwnd;
 	__u32 srtt_us;		/* Averaged RTT << 3 in usecs */
+	__u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */
+	__u32 state;
+	__u32 rtt_min;
+	__u32 snd_ssthresh;
+	__u32 rcv_nxt;
+	__u32 snd_nxt;
+	__u32 snd_una;
+	__u32 mss_cache;
+	__u32 ecn_flags;
+	__u32 rate_delivered;
+	__u32 rate_interval_us;
+	__u32 packets_out;
+	__u32 retrans_out;
+	__u32 total_retrans;
+	__u32 segs_in;
+	__u32 data_segs_in;
+	__u32 segs_out;
+	__u32 data_segs_out;
+	__u32 lost_out;
+	__u32 sacked_out;
+	__u32 sk_txhash;
+	__u64 bytes_received;
+	__u64 bytes_acked;
 };
 
+/* Definitions for bpf_sock_ops_cb_flags */
+#define BPF_SOCK_OPS_RTO_CB_FLAG	(1<<0)
+#define BPF_SOCK_OPS_RETRANS_CB_FLAG	(1<<1)
+#define BPF_SOCK_OPS_STATE_CB_FLAG	(1<<2)
+#define BPF_SOCK_OPS_ALL_CB_FLAGS       0x7		/* Mask of all currently
+							 * supported cb flags
+							 */
+
 /* List of known BPF sock_ops operators.
  * New entries can only be added at the end
  */
@@ -1003,6 +1044,43 @@ enum {
 					 * a congestion threshold. RTTs above
 					 * this indicate congestion
 					 */
+	BPF_SOCK_OPS_RTO_CB,		/* Called when an RTO has triggered.
+					 * Arg1: value of icsk_retransmits
+					 * Arg2: value of icsk_rto
+					 * Arg3: whether RTO has expired
+					 */
+	BPF_SOCK_OPS_RETRANS_CB,	/* Called when skb is retransmitted.
+					 * Arg1: sequence number of 1st byte
+					 * Arg2: # segments
+					 * Arg3: return value of
+					 *       tcp_transmit_skb (0 => success)
+					 */
+	BPF_SOCK_OPS_STATE_CB,		/* Called when TCP changes state.
+					 * Arg1: old_state
+					 * Arg2: new_state
+					 */
+};
+
+/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
+ * changes between the TCP and BPF versions. Ideally this should never happen.
+ * If it does, we need to add code to convert them before calling
+ * the BPF sock_ops function.
+ */
+enum {
+	BPF_TCP_ESTABLISHED = 1,
+	BPF_TCP_SYN_SENT,
+	BPF_TCP_SYN_RECV,
+	BPF_TCP_FIN_WAIT1,
+	BPF_TCP_FIN_WAIT2,
+	BPF_TCP_TIME_WAIT,
+	BPF_TCP_CLOSE,
+	BPF_TCP_CLOSE_WAIT,
+	BPF_TCP_LAST_ACK,
+	BPF_TCP_LISTEN,
+	BPF_TCP_CLOSING,	/* Now a valid state */
+	BPF_TCP_NEW_SYN_RECV,
+
+	BPF_TCP_MAX_STATES	/* Leave at the end! */
 };
 
 #define TCP_BPF_IW		1001	/* Set TCP initial congestion window */
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 3a44b655d852..bf05bc5e36e5 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -11,16 +11,16 @@ ifneq ($(wildcard $(GENHDR)),)
 endif
 
 CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) -I../../../include
-LDLIBS += -lcap -lelf -lrt
+LDLIBS += -lcap -lelf -lrt -lpthread
 
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
-	test_align test_verifier_log test_dev_cgroup
+	test_align test_verifier_log test_dev_cgroup test_tcpbpf_user
 
 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
 	test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
 	sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \
 	test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \
-	sample_map_ret0.o
+	sample_map_ret0.o test_tcpbpf_kern.o
 
 TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh \
 	test_offload.py
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 33cb00e46c49..dde2c11d7771 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -71,6 +71,8 @@ static int (*bpf_setsockopt)(void *ctx, int level, int optname, void *optval,
 static int (*bpf_getsockopt)(void *ctx, int level, int optname, void *optval,
 			     int optlen) =
 	(void *) BPF_FUNC_getsockopt;
+static int (*bpf_sock_ops_cb_flags_set)(void *ctx, int flags) =
+	(void *) BPF_FUNC_sock_ops_cb_flags_set;
 static int (*bpf_sk_redirect_map)(void *ctx, void *map, int key, int flags) =
 	(void *) BPF_FUNC_sk_redirect_map;
 static int (*bpf_sock_map_update)(void *map, void *key, void *value,
diff --git a/tools/testing/selftests/bpf/tcp_client.py b/tools/testing/selftests/bpf/tcp_client.py
new file mode 100755
index 000000000000..481dccdf140c
--- /dev/null
+++ b/tools/testing/selftests/bpf/tcp_client.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python2
+#
+# SPDX-License-Identifier: GPL-2.0
+#
+
+import sys, os, os.path, getopt
+import socket, time
+import subprocess
+import select
+
+def read(sock, n):  # read up to n bytes; '' on socket error, short on EOF
+    buf = ''
+    while len(buf) < n:
+        try: s = sock.recv(n - len(buf))
+        except (socket.error), e: return ''
+        if not s: break  # recv() gave '': peer closed, stop instead of spinning
+        buf += s
+    return buf
+
+def send(sock, s):  # send all of s; returns the number of bytes actually sent
+    total = len(s)
+    count = 0
+    while count < total:
+        try: n = sock.send(s[count:])  # only the tail that is still unsent
+        except (socket.error), e: n = 0
+        if n == 0:
+            return count
+        count += n
+    return count
+
+
+serverPort = int(sys.argv[1])
+HostName = socket.gethostname()
+
+# create active socket
+sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
+try:
+    sock.connect((HostName, serverPort))
+except socket.error as e:
+    sys.exit(1)
+
+buf = ''
+n = 0
+while n < 1000:
+    buf += '+'
+    n += 1
+
+sock.settimeout(1);
+n = send(sock, buf)
+n = read(sock, 500)
+sys.exit(0)
diff --git a/tools/testing/selftests/bpf/tcp_server.py b/tools/testing/selftests/bpf/tcp_server.py
new file mode 100755
index 000000000000..bc454d7d0be2
--- /dev/null
+++ b/tools/testing/selftests/bpf/tcp_server.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python2
+#
+# SPDX-License-Identifier: GPL-2.0
+#
+
+import sys, os, os.path, getopt
+import socket, time
+import subprocess
+import select
+
+def read(sock, n):  # read up to n bytes; '' on socket error, short on EOF
+    buf = ''
+    while len(buf) < n:
+        try: s = sock.recv(n - len(buf))
+        except (socket.error), e: return ''
+        if not s: break  # recv() gave '': peer closed, stop instead of spinning
+        buf += s
+    return buf
+
+def send(sock, s):  # send all of s; returns the number of bytes actually sent
+    total = len(s)
+    count = 0
+    while count < total:
+        try: n = sock.send(s[count:])  # only the tail that is still unsent
+        except (socket.error), e: n = 0
+        if n == 0:
+            return count
+        count += n
+    return count
+
+
+SERVER_PORT = 12877
+MAX_PORTS = 2
+
+serverPort = SERVER_PORT
+serverSocket = None
+
+HostName = socket.gethostname()
+
+# create passive socket
+serverSocket = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
+host = socket.gethostname()
+
+try: serverSocket.bind((host, 0))
+except socket.error as msg:
+    print 'bind fails: ', msg
+
+sn = serverSocket.getsockname()
+serverPort = sn[1]
+
+cmdStr = ("./tcp_client.py %d &") % (serverPort)
+os.system(cmdStr)
+
+buf = ''
+n = 0
+while n < 500:
+    buf += '.'
+    n += 1
+
+serverSocket.listen(MAX_PORTS)
+readList = [serverSocket]
+
+while True:
+    readyRead, readyWrite, inError = \
+        select.select(readList, [], [], 2)
+
+    if len(readyRead) > 0:
+        waitCount = 0
+        for sock in readyRead:
+            if sock == serverSocket:
+                (clientSocket, address) = serverSocket.accept()
+                address = str(address[0])
+                readList.append(clientSocket)
+            else:
+                sock.settimeout(1);
+                s = read(sock, 1000)
+                n = send(sock, buf)
+                sock.close()
+                serverSocket.close()
+                sys.exit(0)
+    else:
+        print 'Select timeout!'
+        sys.exit(1)
diff --git a/tools/testing/selftests/bpf/test_align.c b/tools/testing/selftests/bpf/test_align.c
index e19b410125eb..ff8bd7e3e50c 100644
--- a/tools/testing/selftests/bpf/test_align.c
+++ b/tools/testing/selftests/bpf/test_align.c
@@ -446,11 +446,9 @@ static struct bpf_align_test tests[] = {
 		.insns = {
 			PREP_PKT_POINTERS,
 			BPF_MOV64_IMM(BPF_REG_0, 0),
-			/* ptr & const => unknown & const */
-			BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
-			BPF_ALU64_IMM(BPF_AND, BPF_REG_5, 0x40),
-			/* ptr << const => unknown << const */
-			BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+			/* (ptr - ptr) << 2 */
+			BPF_MOV64_REG(BPF_REG_5, BPF_REG_3),
+			BPF_ALU64_REG(BPF_SUB, BPF_REG_5, BPF_REG_2),
 			BPF_ALU64_IMM(BPF_LSH, BPF_REG_5, 2),
 			/* We have a (4n) value.  Let's make a packet offset
 			 * out of it.  First add 14, to make it a (4n+2)
@@ -473,8 +471,26 @@ static struct bpf_align_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 		.result = REJECT,
 		.matches = {
-			{4, "R5_w=pkt(id=0,off=0,r=0,imm=0)"},
-			/* R5 bitwise operator &= on pointer prohibited */
+			{4, "R5_w=pkt_end(id=0,off=0,imm=0)"},
+			/* (ptr - ptr) << 2 == unknown, (4n) */
+			{6, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc))"},
+			/* (4n) + 14 == (4n+2).  We blow our bounds, because
+			 * the add could overflow.
+			 */
+			{7, "R5=inv(id=0,var_off=(0x2; 0xfffffffffffffffc))"},
+			/* Checked s>=0 */
+			{9, "R5=inv(id=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
+			/* packet pointer + nonnegative (4n+2) */
+			{11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
+			{13, "R4=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
+			/* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine.
+			 * We checked the bounds, but it might have been able
+			 * to overflow if the packet pointer started in the
+			 * upper half of the address space.
+			 * So we did not get a 'range' on R6, and the access
+			 * attempt will fail.
+			 */
+			{15, "R6=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
 		}
 	},
 	{
diff --git a/tools/testing/selftests/bpf/test_dev_cgroup.c b/tools/testing/selftests/bpf/test_dev_cgroup.c
index c1535b34f14f..3489cc283433 100644
--- a/tools/testing/selftests/bpf/test_dev_cgroup.c
+++ b/tools/testing/selftests/bpf/test_dev_cgroup.c
@@ -21,7 +21,7 @@
 
 #define DEV_CGROUP_PROG "./dev_cgroup.o"
 
-#define TEST_CGROUP "test-bpf-based-device-cgroup/"
+#define TEST_CGROUP "/test-bpf-based-device-cgroup/"
 
 int main(int argc, char **argv)
 {
diff --git a/tools/testing/selftests/bpf/test_lpm_map.c b/tools/testing/selftests/bpf/test_lpm_map.c
index 081510853c6d..2be87e9ee28d 100644
--- a/tools/testing/selftests/bpf/test_lpm_map.c
+++ b/tools/testing/selftests/bpf/test_lpm_map.c
@@ -14,6 +14,7 @@
 #include <errno.h>
 #include <inttypes.h>
 #include <linux/bpf.h>
+#include <pthread.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -641,6 +642,98 @@ static void test_lpm_get_next_key(void)
 	close(map_fd);
 }
 
+#define MAX_TEST_KEYS	4
+struct lpm_mt_test_info {
+	int cmd; /* 0: update, 1: delete, 2: lookup, 3: get_next_key */
+	int iter;
+	int map_fd;
+	struct {
+		__u32 prefixlen;
+		__u32 data;
+	} key[MAX_TEST_KEYS];
+};
+
+static void *lpm_test_command(void *arg)
+{
+	int i, j, ret, iter, key_size;
+	struct lpm_mt_test_info *info = arg;
+	struct bpf_lpm_trie_key *key_p;
+
+	key_size = sizeof(struct bpf_lpm_trie_key) + sizeof(__u32);
+	key_p = alloca(key_size);
+	for (iter = 0; iter < info->iter; iter++)
+		for (i = 0; i < MAX_TEST_KEYS; i++) {
+			/* first half of iterations in forward order,
+			 * and second half in backward order.
+			 */
+			j = (iter < (info->iter / 2)) ? i : MAX_TEST_KEYS - i - 1;
+			key_p->prefixlen = info->key[j].prefixlen;
+			memcpy(key_p->data, &info->key[j].data, sizeof(__u32));
+			if (info->cmd == 0) {
+				__u32 value = j;
+				/* update must succeed */
+				assert(bpf_map_update_elem(info->map_fd, key_p, &value, 0) == 0);
+			} else if (info->cmd == 1) {
+				ret = bpf_map_delete_elem(info->map_fd, key_p);
+				assert(ret == 0 || errno == ENOENT);
+			} else if (info->cmd == 2) {
+				__u32 value;
+				ret = bpf_map_lookup_elem(info->map_fd, key_p, &value);
+				assert(ret == 0 || errno == ENOENT);
+			} else {
+				struct bpf_lpm_trie_key *next_key_p = alloca(key_size);
+				ret = bpf_map_get_next_key(info->map_fd, key_p, next_key_p);
+				assert(ret == 0 || errno == ENOENT || errno == ENOMEM);
+			}
+		}
+
+	// Pass successful exit info back to the main thread
+	pthread_exit((void *)info);
+}
+
+static void setup_lpm_mt_test_info(struct lpm_mt_test_info *info, int map_fd)
+{
+	info->iter = 2000;
+	info->map_fd = map_fd;
+	info->key[0].prefixlen = 16;
+	inet_pton(AF_INET, "192.168.0.0", &info->key[0].data);
+	info->key[1].prefixlen = 24;
+	inet_pton(AF_INET, "192.168.0.0", &info->key[1].data);
+	info->key[2].prefixlen = 24;
+	inet_pton(AF_INET, "192.168.128.0", &info->key[2].data);
+	info->key[3].prefixlen = 24;
+	inet_pton(AF_INET, "192.168.1.0", &info->key[3].data);
+}
+
+static void test_lpm_multi_thread(void)
+{
+	struct lpm_mt_test_info info[4];
+	size_t key_size, value_size;
+	pthread_t thread_id[4];
+	int i, map_fd;
+	void *ret;
+
+	/* create a trie */
+	value_size = sizeof(__u32);
+	key_size = sizeof(struct bpf_lpm_trie_key) + value_size;
+	map_fd = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE, key_size, value_size,
+				100, BPF_F_NO_PREALLOC);
+
+	/* create 4 threads to test update, delete, lookup and get_next_key */
+	setup_lpm_mt_test_info(&info[0], map_fd);
+	for (i = 0; i < 4; i++) {
+		if (i != 0)
+			memcpy(&info[i], &info[0], sizeof(info[i]));
+		info[i].cmd = i;
+		assert(pthread_create(&thread_id[i], NULL, &lpm_test_command, &info[i]) == 0);
+	}
+
+	for (i = 0; i < 4; i++)
+		assert(pthread_join(thread_id[i], &ret) == 0 && ret == (void *)&info[i]);
+
+	close(map_fd);
+}
+
 int main(void)
 {
 	struct rlimit limit  = { RLIM_INFINITY, RLIM_INFINITY };
@@ -667,6 +760,8 @@ int main(void)
 
 	test_lpm_get_next_key();
 
+	test_lpm_multi_thread();
+
 	printf("test_lpm: OK\n");
 	return 0;
 }
diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
index 040356ecc862..436c4c72414f 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -242,7 +242,7 @@ static void test_hashmap_percpu(int task, void *data)
 
 static void test_hashmap_walk(int task, void *data)
 {
-	int fd, i, max_entries = 100000;
+	int fd, i, max_entries = 1000;
 	long long key, value, next_key;
 	bool next_key_valid = true;
 
@@ -463,7 +463,7 @@ static void test_devmap(int task, void *data)
 #define SOCKMAP_VERDICT_PROG "./sockmap_verdict_prog.o"
 static void test_sockmap(int tasks, void *data)
 {
-	int one = 1, map_fd_rx, map_fd_tx, map_fd_break, s, sc, rc;
+	int one = 1, map_fd_rx = 0, map_fd_tx = 0, map_fd_break, s, sc, rc;
 	struct bpf_map *bpf_map_rx, *bpf_map_tx, *bpf_map_break;
 	int ports[] = {50200, 50201, 50202, 50204};
 	int err, i, fd, udp, sfd[6] = {0xdeadbeef};
@@ -868,9 +868,12 @@ static void test_sockmap(int tasks, void *data)
 		goto out_sockmap;
 	}
 
-	/* Test map close sockets */
-	for (i = 0; i < 6; i++)
+	/* Test map close sockets and empty maps */
+	for (i = 0; i < 6; i++) {
+		bpf_map_delete_elem(map_fd_tx, &i);
+		bpf_map_delete_elem(map_fd_rx, &i);
 		close(sfd[i]);
+	}
 	close(fd);
 	close(map_fd_rx);
 	bpf_object__close(obj);
@@ -881,8 +884,13 @@ out:
 	printf("Failed to create sockmap '%i:%s'!\n", i, strerror(errno));
 	exit(1);
 out_sockmap:
-	for (i = 0; i < 6; i++)
+	for (i = 0; i < 6; i++) {
+		if (map_fd_tx)
+			bpf_map_delete_elem(map_fd_tx, &i);
+		if (map_fd_rx)
+			bpf_map_delete_elem(map_fd_rx, &i);
 		close(sfd[i]);
+	}
 	close(fd);
 	exit(1);
 }
@@ -931,8 +939,12 @@ static void test_map_large(void)
 	close(fd);
 }
 
-static void run_parallel(int tasks, void (*fn)(int task, void *data),
-			 void *data)
+#define run_parallel(N, FN, DATA) do { \
+	printf("Fork %d tasks to '" #FN "'\n", N); \
+	__run_parallel(N, FN, DATA); } while (0) /* one stmt: safe in bare if */
+
+static void __run_parallel(int tasks, void (*fn)(int task, void *data),
+			   void *data)
 {
 	pid_t pid[tasks];
 	int i;
@@ -972,7 +984,7 @@ static void test_map_stress(void)
 #define DO_UPDATE 1
 #define DO_DELETE 0
 
-static void do_work(int fn, void *data)
+static void test_update_delete(int fn, void *data)
 {
 	int do_update = ((int *)data)[1];
 	int fd = ((int *)data)[0];
@@ -1012,7 +1024,7 @@ static void test_map_parallel(void)
 	 */
 	data[0] = fd;
 	data[1] = DO_UPDATE;
-	run_parallel(TASKS, do_work, data);
+	run_parallel(TASKS, test_update_delete, data);
 
 	/* Check that key=0 is already there. */
 	assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == -1 &&
@@ -1035,7 +1047,7 @@ static void test_map_parallel(void)
 
 	/* Now let's delete all elemenets in parallel. */
 	data[1] = DO_DELETE;
-	run_parallel(TASKS, do_work, data);
+	run_parallel(TASKS, test_update_delete, data);
 
 	/* Nothing should be left. */
 	key = -1;
diff --git a/tools/testing/selftests/bpf/test_tcpbpf.h b/tools/testing/selftests/bpf/test_tcpbpf.h
new file mode 100644
index 000000000000..2fe43289943c
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tcpbpf.h
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifndef _TEST_TCPBPF_H
+#define _TEST_TCPBPF_H
+
+struct tcpbpf_globals {
+	__u32 event_map;
+	__u32 total_retrans;
+	__u32 data_segs_in;
+	__u32 data_segs_out;
+	__u32 bad_cb_test_rv;
+	__u32 good_cb_test_rv;
+	__u64 bytes_received;
+	__u64 bytes_acked;
+};
+#endif
diff --git a/tools/testing/selftests/bpf/test_tcpbpf_kern.c b/tools/testing/selftests/bpf/test_tcpbpf_kern.c
new file mode 100644
index 000000000000..57119ad57a3f
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tcpbpf_kern.c
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stddef.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ip.h>
+#include <linux/in6.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/tcp.h>
+#include <netinet/in.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+#include "test_tcpbpf.h"
+
+struct bpf_map_def SEC("maps") global_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(struct tcpbpf_globals),
+	.max_entries = 2,
+};
+
+static inline void update_event_map(int event)
+{
+	__u32 key = 0;
+	struct tcpbpf_globals g, *gp;
+
+	gp = bpf_map_lookup_elem(&global_map, &key);
+	if (gp == NULL) {
+		struct tcpbpf_globals g = {0};
+
+		g.event_map |= (1 << event);
+		bpf_map_update_elem(&global_map, &key, &g,
+			    BPF_ANY);
+	} else {
+		g = *gp;
+		g.event_map |= (1 << event);
+		bpf_map_update_elem(&global_map, &key, &g,
+			    BPF_ANY);
+	}
+}
+
+int _version SEC("version") = 1;
+
+SEC("sockops")
+int bpf_testcb(struct bpf_sock_ops *skops)
+{
+	int rv = -1;
+	int bad_call_rv = 0;
+	int good_call_rv = 0;
+	int op;
+	int v = 0;
+
+	op = (int) skops->op;
+
+	update_event_map(op);
+
+	switch (op) {
+	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+		/* Test failure to set largest cb flag (assumes not defined) */
+		bad_call_rv = bpf_sock_ops_cb_flags_set(skops, 0x80);
+		/* Set callback */
+		good_call_rv = bpf_sock_ops_cb_flags_set(skops,
+						 BPF_SOCK_OPS_STATE_CB_FLAG);
+		/* Update results */
+		{
+			__u32 key = 0;
+			struct tcpbpf_globals g, *gp;
+
+			gp = bpf_map_lookup_elem(&global_map, &key);
+			if (!gp)
+				break;
+			g = *gp;
+			g.bad_cb_test_rv = bad_call_rv;
+			g.good_cb_test_rv = good_call_rv;
+			bpf_map_update_elem(&global_map, &key, &g,
+					    BPF_ANY);
+		}
+		break;
+	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+		skops->sk_txhash = 0x12345f;
+		v = 0xff;
+		rv = bpf_setsockopt(skops, SOL_IPV6, IPV6_TCLASS, &v,
+				    sizeof(v));
+		break;
+	case BPF_SOCK_OPS_RTO_CB:
+		break;
+	case BPF_SOCK_OPS_RETRANS_CB:
+		break;
+	case BPF_SOCK_OPS_STATE_CB:
+		if (skops->args[1] == BPF_TCP_CLOSE) {
+			__u32 key = 0;
+			struct tcpbpf_globals g, *gp;
+
+			gp = bpf_map_lookup_elem(&global_map, &key);
+			if (!gp)
+				break;
+			g = *gp;
+			g.total_retrans = skops->total_retrans;
+			g.data_segs_in = skops->data_segs_in;
+			g.data_segs_out = skops->data_segs_out;
+			g.bytes_received = skops->bytes_received;
+			g.bytes_acked = skops->bytes_acked;
+			bpf_map_update_elem(&global_map, &key, &g,
+					    BPF_ANY);
+		}
+		break;
+	default:
+		rv = -1;
+	}
+	skops->reply = rv;
+	return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_tcpbpf_user.c b/tools/testing/selftests/bpf/test_tcpbpf_user.c
new file mode 100644
index 000000000000..95a370f3d378
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tcpbpf_user.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+#include <signal.h>
+#include <string.h>
+#include <assert.h>
+#include <linux/perf_event.h>
+#include <linux/ptrace.h>
+#include <linux/bpf.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#include "bpf_util.h"
+#include <linux/perf_event.h>
+#include "test_tcpbpf.h"
+
+static int bpf_find_map(const char *test, struct bpf_object *obj,
+			const char *name)
+{
+	struct bpf_map *map;
+
+	map = bpf_object__find_map_by_name(obj, name);
+	if (!map) {
+		printf("%s:FAIL:map '%s' not found\n", test, name);
+		return -1;
+	}
+	return bpf_map__fd(map);
+}
+
+#define SYSTEM(CMD)						\
+	do {							\
+		if (system(CMD)) {				\
+			printf("system(%s) FAILS!\n", CMD);	\
+		}						\
+	} while (0)
+
+int main(int argc, char **argv)
+{
+	const char *file = "test_tcpbpf_kern.o";
+	struct tcpbpf_globals g = {0};
+	int cg_fd, prog_fd, map_fd;
+	bool debug_flag = false;
+	int error = EXIT_FAILURE;	/* exit status; cleared only on PASS */
+	struct bpf_object *obj;
+	char cmd[100], *dir;
+	struct stat buffer;
+	__u32 key = 0;
+	int pid;
+	int rv;
+
+	if (argc > 1 && strcmp(argv[1], "-d") == 0)
+		debug_flag = true;	/* -d: dump every counter on failure */
+
+	dir = "/tmp/cgroupv2/foo";
+	/* Mount cgroup2 and create the test cgroup unless it already exists */
+	if (stat(dir, &buffer) != 0) {
+		SYSTEM("mkdir -p /tmp/cgroupv2");
+		SYSTEM("mount -t cgroup2 none /tmp/cgroupv2");
+		SYSTEM("mkdir -p /tmp/cgroupv2/foo");
+	}
+	pid = (int) getpid();
+	sprintf(cmd, "echo %d >> /tmp/cgroupv2/foo/cgroup.procs", pid);
+	SYSTEM(cmd);
+
+	cg_fd = open(dir, O_DIRECTORY | O_RDONLY);
+	if (bpf_prog_load(file, BPF_PROG_TYPE_SOCK_OPS, &obj, &prog_fd)) {
+		printf("FAILED: load_bpf_file failed for: %s\n", file);
+		goto err;
+	}
+
+	rv = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_SOCK_OPS, 0);
+	if (rv) {
+		printf("FAILED: bpf_prog_attach: %d (%s)\n",
+		       rv, strerror(errno));
+		goto err;
+	}
+	/* Generate the TCP traffic that the attached program observes */
+	SYSTEM("./tcp_server.py");
+
+	map_fd = bpf_find_map(__func__, obj, "global_map");
+	if (map_fd < 0)
+		goto err;
+
+	rv = bpf_map_lookup_elem(map_fd, &key, &g);
+	if (rv != 0) {
+		printf("FAILED: bpf_map_lookup_elem returns %d\n", rv);
+		goto err;
+	}
+	/* Every value recorded by the BPF program must match exactly */
+	if (g.bytes_received != 501 || g.bytes_acked != 1002 ||
+	    g.data_segs_in != 1 || g.data_segs_out != 1 ||
+	    (g.event_map ^ 0x47e) != 0 || g.bad_cb_test_rv != 0x80 ||
+	    g.good_cb_test_rv != 0) {
+		printf("FAILED: Wrong stats\n");
+		if (debug_flag) {
+			printf("\n");
+			printf("bytes_received: %d (expecting 501)\n",
+			       (int)g.bytes_received);
+			printf("bytes_acked:    %d (expecting 1002)\n",
+			       (int)g.bytes_acked);
+			printf("data_segs_in:   %d (expecting 1)\n",
+			       g.data_segs_in);
+			printf("data_segs_out:  %d (expecting 1)\n",
+			       g.data_segs_out);
+			printf("event_map:      0x%x (expecting 0x47e)\n",
+			       g.event_map);
+			printf("bad_cb_test_rv: 0x%x (expecting 0x80)\n",
+			       g.bad_cb_test_rv);
+			printf("good_cb_test_rv:0x%x (expecting 0)\n",
+			       g.good_cb_test_rv);
+		}
+		goto err;
+	}
+	printf("PASSED!\n");
+	error = 0;
+err:
+	bpf_prog_detach(cg_fd, BPF_CGROUP_SOCK_OPS);
+	return error;
+
+}
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index fb82d29ee863..697bd83de295 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -21,6 +21,7 @@
 #include <stddef.h>
 #include <stdbool.h>
 #include <sched.h>
+#include <limits.h>
 
 #include <sys/capability.h>
 #include <sys/resource.h>
@@ -111,7 +112,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.result = ACCEPT,
-		.retval = 0,
+		.retval = 42,
 	},
 	{
 		"DIV32 by 0, zero check 2",
@@ -123,7 +124,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.result = ACCEPT,
-		.retval = 0,
+		.retval = 42,
 	},
 	{
 		"DIV64 by 0, zero check",
@@ -135,7 +136,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.result = ACCEPT,
-		.retval = 0,
+		.retval = 42,
 	},
 	{
 		"MOD32 by 0, zero check 1",
@@ -147,7 +148,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.result = ACCEPT,
-		.retval = 0,
+		.retval = 42,
 	},
 	{
 		"MOD32 by 0, zero check 2",
@@ -159,7 +160,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.result = ACCEPT,
-		.retval = 0,
+		.retval = 42,
 	},
 	{
 		"MOD64 by 0, zero check",
@@ -171,13 +172,245 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.result = ACCEPT,
+		.retval = 42,
+	},
+	{
+		"DIV32 by 0, zero check ok, cls",
+		.insns = {
+			BPF_MOV32_IMM(BPF_REG_0, 42),
+			BPF_MOV32_IMM(BPF_REG_1, 2),
+			BPF_MOV32_IMM(BPF_REG_2, 16),
+			BPF_ALU32_REG(BPF_DIV, BPF_REG_2, BPF_REG_1),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 8,
+	},
+	{
+		"DIV32 by 0, zero check 1, cls",
+		.insns = {
+			BPF_MOV32_IMM(BPF_REG_1, 0),
+			BPF_MOV32_IMM(BPF_REG_0, 1),
+			BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_1),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 0,
+	},
+	{
+		"DIV32 by 0, zero check 2, cls",
+		.insns = {
+			BPF_LD_IMM64(BPF_REG_1, 0xffffffff00000000LL),
+			BPF_MOV32_IMM(BPF_REG_0, 1),
+			BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_1),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 0,
+	},
+	{
+		"DIV64 by 0, zero check, cls",
+		.insns = {
+			BPF_MOV32_IMM(BPF_REG_1, 0),
+			BPF_MOV32_IMM(BPF_REG_0, 1),
+			BPF_ALU64_REG(BPF_DIV, BPF_REG_0, BPF_REG_1),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
 		.retval = 0,
 	},
 	{
+		"MOD32 by 0, zero check ok, cls",
+		.insns = {
+			BPF_MOV32_IMM(BPF_REG_0, 42),
+			BPF_MOV32_IMM(BPF_REG_1, 3),
+			BPF_MOV32_IMM(BPF_REG_2, 5),
+			BPF_ALU32_REG(BPF_MOD, BPF_REG_2, BPF_REG_1),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 2,
+	},
+	{
+		"MOD32 by 0, zero check 1, cls",
+		.insns = {
+			BPF_MOV32_IMM(BPF_REG_1, 0),
+			BPF_MOV32_IMM(BPF_REG_0, 1),
+			BPF_ALU32_REG(BPF_MOD, BPF_REG_0, BPF_REG_1),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 1,
+	},
+	{
+		"MOD32 by 0, zero check 2, cls",
+		.insns = {
+			BPF_LD_IMM64(BPF_REG_1, 0xffffffff00000000LL),
+			BPF_MOV32_IMM(BPF_REG_0, 1),
+			BPF_ALU32_REG(BPF_MOD, BPF_REG_0, BPF_REG_1),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 1,
+	},
+	{
+		"MOD64 by 0, zero check 1, cls",
+		.insns = {
+			BPF_MOV32_IMM(BPF_REG_1, 0),
+			BPF_MOV32_IMM(BPF_REG_0, 2),
+			BPF_ALU64_REG(BPF_MOD, BPF_REG_0, BPF_REG_1),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 2,
+	},
+	{
+		"MOD64 by 0, zero check 2, cls",
+		.insns = {
+			BPF_MOV32_IMM(BPF_REG_1, 0),
+			BPF_MOV32_IMM(BPF_REG_0, -1),
+			BPF_ALU64_REG(BPF_MOD, BPF_REG_0, BPF_REG_1),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = -1,
+	},
+	/* Just make sure that JITs use udiv/umod, as otherwise we get
+	 * an exception from INT_MIN/-1 overflow, similar to a div
+	 * by zero.
+	 */
+	{
+		"DIV32 overflow, check 1",
+		.insns = {
+			BPF_MOV32_IMM(BPF_REG_1, -1),
+			BPF_MOV32_IMM(BPF_REG_0, INT_MIN),
+			BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_1),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 0,
+	},
+	{
+		"DIV32 overflow, check 2",
+		.insns = {
+			BPF_MOV32_IMM(BPF_REG_0, INT_MIN),
+			BPF_ALU32_IMM(BPF_DIV, BPF_REG_0, -1),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 0,
+	},
+	{
+		"DIV64 overflow, check 1",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_1, -1),
+			BPF_LD_IMM64(BPF_REG_0, LLONG_MIN),
+			BPF_ALU64_REG(BPF_DIV, BPF_REG_0, BPF_REG_1),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 0,
+	},
+	{
+		"DIV64 overflow, check 2",
+		.insns = {
+			BPF_LD_IMM64(BPF_REG_0, LLONG_MIN),
+			BPF_ALU64_IMM(BPF_DIV, BPF_REG_0, -1),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 0,
+	},
+	{
+		"MOD32 overflow, check 1",
+		.insns = {
+			BPF_MOV32_IMM(BPF_REG_1, -1),
+			BPF_MOV32_IMM(BPF_REG_0, INT_MIN),
+			BPF_ALU32_REG(BPF_MOD, BPF_REG_0, BPF_REG_1),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = INT_MIN,
+	},
+	{
+		"MOD32 overflow, check 2",
+		.insns = {
+			BPF_MOV32_IMM(BPF_REG_0, INT_MIN),
+			BPF_ALU32_IMM(BPF_MOD, BPF_REG_0, -1),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = INT_MIN,
+	},
+	{
+		"MOD64 overflow, check 1",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_1, -1),
+			BPF_LD_IMM64(BPF_REG_2, LLONG_MIN),
+			BPF_MOV64_REG(BPF_REG_3, BPF_REG_2),
+			BPF_ALU64_REG(BPF_MOD, BPF_REG_2, BPF_REG_1),
+			BPF_MOV32_IMM(BPF_REG_0, 0),
+			BPF_JMP_REG(BPF_JNE, BPF_REG_3, BPF_REG_2, 1),
+			BPF_MOV32_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 1,
+	},
+	{
+		"MOD64 overflow, check 2",
+		.insns = {
+			BPF_LD_IMM64(BPF_REG_2, LLONG_MIN),
+			BPF_MOV64_REG(BPF_REG_3, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_MOD, BPF_REG_2, -1),
+			BPF_MOV32_IMM(BPF_REG_0, 0),
+			BPF_JMP_REG(BPF_JNE, BPF_REG_3, BPF_REG_2, 1),
+			BPF_MOV32_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 1,
+	},
+	{
+		"xor32 zero extend check",
+		.insns = {
+			BPF_MOV32_IMM(BPF_REG_2, -1),
+			BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32),
+			BPF_ALU64_IMM(BPF_OR, BPF_REG_2, 0xffff),
+			BPF_ALU32_REG(BPF_XOR, BPF_REG_2, BPF_REG_2),
+			BPF_MOV32_IMM(BPF_REG_0, 2),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 0, 1),
+			BPF_MOV32_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 1,
+	},
+	{
 		"empty prog",
 		.insns = {
 		},
-		.errstr = "last insn is not an exit or jmp",
+		.errstr = "unknown opcode 00",
 		.result = REJECT,
 	},
 	{
@@ -374,7 +607,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.result = REJECT,
-		.errstr = "BPF_ARSH not supported for 32 bit ALU",
+		.errstr = "unknown opcode c4",
 	},
 	{
 		"arsh32 on reg",
@@ -385,7 +618,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.result = REJECT,
-		.errstr = "BPF_ARSH not supported for 32 bit ALU",
+		.errstr = "unknown opcode cc",
 	},
 	{
 		"arsh64 on imm",
@@ -501,7 +734,7 @@ static struct bpf_test tests[] = {
 			BPF_RAW_INSN(BPF_JMP | BPF_CALL | BPF_X, 0, 0, 0, 0),
 			BPF_EXIT_INSN(),
 		},
-		.errstr = "BPF_CALL uses reserved",
+		.errstr = "unknown opcode 8d",
 		.result = REJECT,
 	},
 	{
@@ -691,7 +924,7 @@ static struct bpf_test tests[] = {
 			BPF_RAW_INSN(0, 0, 0, 0, 0),
 			BPF_EXIT_INSN(),
 		},
-		.errstr = "invalid BPF_LD_IMM",
+		.errstr = "unknown opcode 00",
 		.result = REJECT,
 	},
 	{
@@ -709,7 +942,7 @@ static struct bpf_test tests[] = {
 			BPF_RAW_INSN(-1, 0, 0, 0, 0),
 			BPF_EXIT_INSN(),
 		},
-		.errstr = "invalid BPF_ALU opcode f0",
+		.errstr = "unknown opcode ff",
 		.result = REJECT,
 	},
 	{
@@ -718,7 +951,7 @@ static struct bpf_test tests[] = {
 			BPF_RAW_INSN(-1, -1, -1, -1, -1),
 			BPF_EXIT_INSN(),
 		},
-		.errstr = "invalid BPF_ALU opcode f0",
+		.errstr = "unknown opcode ff",
 		.result = REJECT,
 	},
 	{
@@ -7543,7 +7776,7 @@ static struct bpf_test tests[] = {
 			},
 			BPF_EXIT_INSN(),
 		},
-		.errstr = "BPF_END uses reserved fields",
+		.errstr = "unknown opcode d7",
 		.result = REJECT,
 	},
 	{
@@ -8766,6 +8999,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.result = ACCEPT,
+		.retval = 1,
 	},
 	{
 		"check deducing bounds from const, 3",
@@ -8963,6 +9197,90 @@ static struct bpf_test tests[] = {
 		.retval = 1,
 	},
 	{
+		"calls: div by 0 in subprog",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 8),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1,
+				    offsetof(struct __sk_buff, data_end)),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 8),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(BPF_REG_2, 0),
+			BPF_MOV32_IMM(BPF_REG_3, 1),
+			BPF_ALU32_REG(BPF_DIV, BPF_REG_3, BPF_REG_2),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, data)),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 1,
+	},
+	{
+		"calls: multiple ret types in subprog 1",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 8),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1,
+				    offsetof(struct __sk_buff, data_end)),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 8),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, data)),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+			BPF_MOV32_IMM(BPF_REG_0, 42),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = REJECT,
+		.errstr = "R0 invalid mem access 'inv'",
+	},
+	{
+		"calls: multiple ret types in subprog 2",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 8),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1,
+				    offsetof(struct __sk_buff, data_end)),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 8),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, data)),
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 9),
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6,
+				    offsetof(struct __sk_buff, data)),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 64),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.fixup_map1 = { 16 },
+		.result = REJECT,
+		.errstr = "R0 min value is outside of the array range",
+	},
+	{
 		"calls: overlapping caller/callee",
 		.insns = {
 			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 0),