diff options
75 files changed, 4067 insertions, 2057 deletions
diff --git a/Documentation/devicetree/bindings/net/can/rcar_can.txt b/Documentation/devicetree/bindings/net/can/rcar_can.txt index 06bb7cc334c8..94a7f33ac5e9 100644 --- a/Documentation/devicetree/bindings/net/can/rcar_can.txt +++ b/Documentation/devicetree/bindings/net/can/rcar_can.txt @@ -2,7 +2,9 @@ Renesas R-Car CAN controller Device Tree Bindings ------------------------------------------------- Required properties: -- compatible: "renesas,can-r8a7778" if CAN controller is a part of R8A7778 SoC. +- compatible: "renesas,can-r8a7743" if CAN controller is a part of R8A7743 SoC. + "renesas,can-r8a7745" if CAN controller is a part of R8A7745 SoC. + "renesas,can-r8a7778" if CAN controller is a part of R8A7778 SoC. "renesas,can-r8a7779" if CAN controller is a part of R8A7779 SoC. "renesas,can-r8a7790" if CAN controller is a part of R8A7790 SoC. "renesas,can-r8a7791" if CAN controller is a part of R8A7791 SoC. @@ -12,7 +14,8 @@ Required properties: "renesas,can-r8a7795" if CAN controller is a part of R8A7795 SoC. "renesas,can-r8a7796" if CAN controller is a part of R8A7796 SoC. "renesas,rcar-gen1-can" for a generic R-Car Gen1 compatible device. - "renesas,rcar-gen2-can" for a generic R-Car Gen2 compatible device. + "renesas,rcar-gen2-can" for a generic R-Car Gen2 or RZ/G1 + compatible device. "renesas,rcar-gen3-can" for a generic R-Car Gen3 compatible device. When compatible with the generic version, nodes must list the SoC-specific version corresponding to the platform first diff --git a/Documentation/devicetree/bindings/powerpc/fsl/mpc5200.txt b/Documentation/devicetree/bindings/powerpc/fsl/mpc5200.txt index 4ccb2cd5df94..d096cf461d81 100644 --- a/Documentation/devicetree/bindings/powerpc/fsl/mpc5200.txt +++ b/Documentation/devicetree/bindings/powerpc/fsl/mpc5200.txt @@ -195,4 +195,4 @@ External interrupts: fsl,mpc5200-mscan nodes ----------------------- -See file can.txt in this directory. 
+See file Documentation/devicetree/bindings/powerpc/fsl/mpc5200.txt diff --git a/Documentation/networking/00-INDEX b/Documentation/networking/00-INDEX index f5d642c01dd3..2b89d91b376f 100644 --- a/Documentation/networking/00-INDEX +++ b/Documentation/networking/00-INDEX @@ -36,8 +36,6 @@ bonding.txt - Linux Ethernet Bonding Driver HOWTO: link aggregation in Linux. bridge.txt - where to get user space programs for ethernet bridging with Linux. -can.txt - - documentation on CAN protocol family. cdc_mbim.txt - 3G/LTE USB modem (Mobile Broadband Interface Model) checksum-offloads.txt diff --git a/Documentation/networking/can.rst b/Documentation/networking/can.rst new file mode 100644 index 000000000000..d23c51abf8c6 --- /dev/null +++ b/Documentation/networking/can.rst @@ -0,0 +1,1437 @@ +=================================== +SocketCAN - Controller Area Network +=================================== + +Overview / What is SocketCAN +============================ + +The socketcan package is an implementation of CAN protocols +(Controller Area Network) for Linux. CAN is a networking technology +which has widespread use in automation, embedded devices, and +automotive fields. While there have been other CAN implementations +for Linux based on character devices, SocketCAN uses the Berkeley +socket API, the Linux network stack and implements the CAN device +drivers as network interfaces. The CAN socket API has been designed +as similar as possible to the TCP/IP protocols to allow programmers, +familiar with network programming, to easily learn how to use CAN +sockets. + + +.. _socketcan-motivation: + +Motivation / Why Using the Socket API +===================================== + +There have been CAN implementations for Linux before SocketCAN so the +question arises, why we have started another project. Most existing +implementations come as a device driver for some CAN hardware, they +are based on character devices and provide comparatively little +functionality. 
Usually, there is only a hardware-specific device +driver which provides a character device interface to send and +receive raw CAN frames, directly to/from the controller hardware. +Queueing of frames and higher-level transport protocols like ISO-TP +have to be implemented in user space applications. Also, most +character-device implementations support only one single process to +open the device at a time, similar to a serial interface. Exchanging +the CAN controller requires employment of another device driver and +often the need for adaption of large parts of the application to the +new driver's API. + +SocketCAN was designed to overcome all of these limitations. A new +protocol family has been implemented which provides a socket interface +to user space applications and which builds upon the Linux network +layer, enabling use of all of the provided queueing functionality. A device +driver for CAN controller hardware registers itself with the Linux +network layer as a network device, so that CAN frames from the +controller can be passed up to the network layer and on to the CAN +protocol family module and also vice-versa. Also, the protocol family +module provides an API for transport protocol modules to register, so +that any number of transport protocols can be loaded or unloaded +dynamically. In fact, the can core module alone does not provide any +protocol and cannot be used without loading at least one additional +protocol module. Multiple sockets can be opened at the same time, +on different or the same protocol module and they can listen/send +frames on different or the same CAN IDs. Several sockets listening on +the same interface for frames with the same CAN ID are all passed the +same received matching CAN frames. An application wishing to +communicate using a specific transport protocol, e.g. 
ISO-TP, just +selects that protocol when opening the socket, and then can read and +write application data byte streams, without having to deal with +CAN-IDs, frames, etc. + +Similar functionality visible from user-space could be provided by a +character device, too, but this would lead to a technically inelegant +solution for a couple of reasons: + +* **Intricate usage:** Instead of passing a protocol argument to + socket(2) and using bind(2) to select a CAN interface and CAN ID, an + application would have to do all these operations using ioctl(2)s. + +* **Code duplication:** A character device cannot make use of the Linux + network queueing code, so all that code would have to be duplicated + for CAN networking. + +* **Abstraction:** In most existing character-device implementations, the + hardware-specific device driver for a CAN controller directly + provides the character device for the application to work with. + This is at least very unusual in Unix systems for both char and + block devices. For example you don't have a character device for a + certain UART of a serial interface, a certain sound chip in your + computer, a SCSI or IDE controller providing access to your hard + disk or tape streamer device. Instead, you have abstraction layers + which provide a unified character or block device interface to the + application on the one hand, and an interface for hardware-specific + device drivers on the other hand. These abstractions are provided + by subsystems like the tty layer, the audio subsystem or the SCSI + and IDE subsystems for the devices mentioned above. + + The easiest way to implement a CAN device driver is as a character + device without such a (complete) abstraction layer, as is done by most + existing drivers. 
The right way, however, would be to add such a + layer with all the functionality like registering for certain CAN + IDs, supporting several open file descriptors and (de)multiplexing + CAN frames between them, (sophisticated) queueing of CAN frames, and + providing an API for device drivers to register with. However, then + it would be no more difficult, or may be even easier, to use the + networking framework provided by the Linux kernel, and this is what + SocketCAN does. + +The use of the networking framework of the Linux kernel is just the +natural and most appropriate way to implement CAN for Linux. + + +.. _socketcan-concept: + +SocketCAN Concept +================= + +As described in :ref:`socketcan-motivation` the main goal of SocketCAN is to +provide a socket interface to user space applications which builds +upon the Linux network layer. In contrast to the commonly known +TCP/IP and ethernet networking, the CAN bus is a broadcast-only(!) +medium that has no MAC-layer addressing like ethernet. The CAN-identifier +(can_id) is used for arbitration on the CAN-bus. Therefore the CAN-IDs +have to be chosen uniquely on the bus. When designing a CAN-ECU +network the CAN-IDs are mapped to be sent by a specific ECU. +For this reason a CAN-ID can be treated best as a kind of source address. + + +.. _socketcan-receive-lists: + +Receive Lists +------------- + +The network transparent access of multiple applications leads to the +problem that different applications may be interested in the same +CAN-IDs from the same CAN network interface. The SocketCAN core +module - which implements the protocol family CAN - provides several +high efficient receive lists for this reason. If e.g. a user space +application opens a CAN RAW socket, the raw protocol module itself +requests the (range of) CAN-IDs from the SocketCAN core that are +requested by the user. The subscription and unsubscription of +CAN-IDs can be done for specific CAN interfaces or for all(!) 
known +CAN interfaces with the can_rx_(un)register() functions provided to +CAN protocol modules by the SocketCAN core (see :ref:`socketcan-core-module`). +To optimize the CPU usage at runtime the receive lists are split up +into several specific lists per device that match the requested +filter complexity for a given use-case. + + +.. _socketcan-local-loopback1: + +Local Loopback of Sent Frames +----------------------------- + +As known from other networking concepts the data exchanging +applications may run on the same or different nodes without any +change (except for the according addressing information): + +.. code:: + + ___ ___ ___ _______ ___ + | _ | | _ | | _ | | _ _ | | _ | + ||A|| ||B|| ||C|| ||A| |B|| ||C|| + |___| |___| |___| |_______| |___| + | | | | | + -----------------(1)- CAN bus -(2)--------------- + +To ensure that application A receives the same information in the +example (2) as it would receive in example (1) there is a need for +some kind of local loopback of the sent CAN frames on the appropriate +node. + +The Linux network devices (by default) just can handle the +transmission and reception of media dependent frames. Due to the +arbitration on the CAN bus the transmission of a low prio CAN-ID +may be delayed by the reception of a high prio CAN frame. To +reflect the correct [*]_ traffic on the node the loopback of the sent +data has to be performed right after a successful transmission. If +the CAN network interface is not capable of performing the loopback for +some reason the SocketCAN core can do this task as a fallback solution. +See :ref:`socketcan-local-loopback2` for details (recommended). + +The loopback functionality is enabled by default to reflect standard +networking behaviour for CAN applications. Due to some requests from +the RT-SocketCAN group the loopback optionally may be disabled for each +separate socket. See sockopts from the CAN RAW sockets in :ref:`socketcan-raw-sockets`. + +.. 
[*] you really like to have this when you're running analyser + tools like 'candump' or 'cansniffer' on the (same) node. + + +.. _socketcan-network-problem-notifications: + +Network Problem Notifications +----------------------------- + +The use of the CAN bus may lead to several problems on the physical +and media access control layer. Detecting and logging of these lower +layer problems is a vital requirement for CAN users to identify +hardware issues on the physical transceiver layer as well as +arbitration problems and error frames caused by the different +ECUs. Occurrences of detected errors are important for diagnosis +and have to be logged together with the exact timestamp. For this +reason the CAN interface driver can generate so called Error Message +Frames that can optionally be passed to the user application in the +same way as other CAN frames. Whenever an error on the physical layer +or the MAC layer is detected (e.g. by the CAN controller) the driver +creates an appropriate error message frame. Error message frames can +be requested by the user application using the common CAN filter +mechanisms. Inside this filter definition the (interested) type of +errors may be selected. The reception of error messages is disabled +by default. The format of the CAN error message frame is briefly +described in the Linux header file "include/uapi/linux/can/error.h". + + +How to use SocketCAN +==================== + +Like TCP/IP, you first need to open a socket for communicating over a +CAN network. Since SocketCAN implements a new protocol family, you +need to pass PF_CAN as the first argument to the socket(2) system +call. Currently, there are two CAN protocols to choose from, the raw +socket protocol and the broadcast manager (BCM). So to open a socket, +you would write:: + + s = socket(PF_CAN, SOCK_RAW, CAN_RAW); + +and:: + + s = socket(PF_CAN, SOCK_DGRAM, CAN_BCM); + +respectively. 
After the successful creation of the socket, you would +normally use the bind(2) system call to bind the socket to a CAN +interface (which is different from TCP/IP due to different addressing +- see :ref:`socketcan-concept`). After binding (CAN_RAW) or connecting (CAN_BCM) +the socket, you can read(2) and write(2) from/to the socket or use +send(2), sendto(2), sendmsg(2) and the recv* counterpart operations +on the socket as usual. There are also CAN specific socket options +described below. + +The basic CAN frame structure and the sockaddr structure are defined +in include/linux/can.h: + +.. code-block:: C + + struct can_frame { + canid_t can_id; /* 32 bit CAN_ID + EFF/RTR/ERR flags */ + __u8 can_dlc; /* frame payload length in byte (0 .. 8) */ + __u8 __pad; /* padding */ + __u8 __res0; /* reserved / padding */ + __u8 __res1; /* reserved / padding */ + __u8 data[8] __attribute__((aligned(8))); + }; + +The alignment of the (linear) payload data[] to a 64bit boundary +allows the user to define their own structs and unions to easily access +the CAN payload. There is no given byteorder on the CAN bus by +default. A read(2) system call on a CAN_RAW socket transfers a +struct can_frame to the user space. + +The sockaddr_can structure has an interface index like the +PF_PACKET socket, that also binds to a specific interface: + +.. code-block:: C + + struct sockaddr_can { + sa_family_t can_family; + int can_ifindex; + union { + /* transport protocol class address info (e.g. ISOTP) */ + struct { canid_t rx_id, tx_id; } tp; + + /* reserved for future CAN protocols address information */ + } can_addr; + }; + +To determine the interface index an appropriate ioctl() has to +be used (example for CAN_RAW sockets without error checking): + +.. 
code-block:: C + + int s; + struct sockaddr_can addr; + struct ifreq ifr; + + s = socket(PF_CAN, SOCK_RAW, CAN_RAW); + + strcpy(ifr.ifr_name, "can0" ); + ioctl(s, SIOCGIFINDEX, &ifr); + + addr.can_family = AF_CAN; + addr.can_ifindex = ifr.ifr_ifindex; + + bind(s, (struct sockaddr *)&addr, sizeof(addr)); + + (..) + +To bind a socket to all(!) CAN interfaces the interface index must +be 0 (zero). In this case the socket receives CAN frames from every +enabled CAN interface. To determine the originating CAN interface +the system call recvfrom(2) may be used instead of read(2). To send +on a socket that is bound to 'any' interface sendto(2) is needed to +specify the outgoing interface. + +Reading CAN frames from a bound CAN_RAW socket (see above) consists +of reading a struct can_frame: + +.. code-block:: C + + struct can_frame frame; + + nbytes = read(s, &frame, sizeof(struct can_frame)); + + if (nbytes < 0) { + perror("can raw socket read"); + return 1; + } + + /* paranoid check ... */ + if (nbytes < sizeof(struct can_frame)) { + fprintf(stderr, "read: incomplete CAN frame\n"); + return 1; + } + + /* do something with the received CAN frame */ + +Writing CAN frames can be done similarly, with the write(2) system call:: + + nbytes = write(s, &frame, sizeof(struct can_frame)); + +When the socket is bound to 'any' existing CAN interface +(addr.can_ifindex = 0) it is recommended to use recvfrom(2) if the +information about the originating CAN interface is needed: + +.. 
code-block:: C + + struct sockaddr_can addr; + struct ifreq ifr; + socklen_t len = sizeof(addr); + struct can_frame frame; + + nbytes = recvfrom(s, &frame, sizeof(struct can_frame), + 0, (struct sockaddr*)&addr, &len); + + /* get interface name of the received CAN frame */ + ifr.ifr_ifindex = addr.can_ifindex; + ioctl(s, SIOCGIFNAME, &ifr); + printf("Received a CAN frame from interface %s", ifr.ifr_name); + +To write CAN frames on sockets bound to 'any' CAN interface the +outgoing interface has to be defined certainly: + +.. code-block:: C + + strcpy(ifr.ifr_name, "can0"); + ioctl(s, SIOCGIFINDEX, &ifr); + addr.can_ifindex = ifr.ifr_ifindex; + addr.can_family = AF_CAN; + + nbytes = sendto(s, &frame, sizeof(struct can_frame), + 0, (struct sockaddr*)&addr, sizeof(addr)); + +An accurate timestamp can be obtained with an ioctl(2) call after reading +a message from the socket: + +.. code-block:: C + + struct timeval tv; + ioctl(s, SIOCGSTAMP, &tv); + +The timestamp has a resolution of one microsecond and is set automatically +at the reception of a CAN frame. + +Remark about CAN FD (flexible data rate) support: + +Generally the handling of CAN FD is very similar to the formerly described +examples. The new CAN FD capable CAN controllers support two different +bitrates for the arbitration phase and the payload phase of the CAN FD frame +and up to 64 bytes of payload. This extended payload length breaks all the +kernel interfaces (ABI) which heavily rely on the CAN frame with fixed eight +bytes of payload (struct can_frame) like the CAN_RAW socket. Therefore e.g. +the CAN_RAW socket supports a new socket option CAN_RAW_FD_FRAMES that +switches the socket into a mode that allows the handling of CAN FD frames +and (legacy) CAN frames simultaneously (see :ref:`socketcan-rawfd`). + +The struct canfd_frame is defined in include/linux/can.h: + +.. 
code-block:: C + + struct canfd_frame { + canid_t can_id; /* 32 bit CAN_ID + EFF/RTR/ERR flags */ + __u8 len; /* frame payload length in byte (0 .. 64) */ + __u8 flags; /* additional flags for CAN FD */ + __u8 __res0; /* reserved / padding */ + __u8 __res1; /* reserved / padding */ + __u8 data[64] __attribute__((aligned(8))); + }; + +The struct canfd_frame and the existing struct can_frame have the can_id, +the payload length and the payload data at the same offset inside their +structures. This allows to handle the different structures very similar. +When the content of a struct can_frame is copied into a struct canfd_frame +all structure elements can be used as-is - only the data[] becomes extended. + +When introducing the struct canfd_frame it turned out that the data length +code (DLC) of the struct can_frame was used as a length information as the +length and the DLC has a 1:1 mapping in the range of 0 .. 8. To preserve +the easy handling of the length information the canfd_frame.len element +contains a plain length value from 0 .. 64. So both canfd_frame.len and +can_frame.can_dlc are equal and contain a length information and no DLC. +For details about the distinction of CAN and CAN FD capable devices and +the mapping to the bus-relevant data length code (DLC), see :ref:`socketcan-can-fd-driver`. + +The length of the two CAN(FD) frame structures define the maximum transfer +unit (MTU) of the CAN(FD) network interface and skbuff data length. Two +definitions are specified for CAN specific MTUs in include/linux/can.h: + +.. code-block:: C + + #define CAN_MTU (sizeof(struct can_frame)) == 16 => 'legacy' CAN frame + #define CANFD_MTU (sizeof(struct canfd_frame)) == 72 => CAN FD frame + + +.. _socketcan-raw-sockets: + +RAW Protocol Sockets with can_filters (SOCK_RAW) +------------------------------------------------ + +Using CAN_RAW sockets is extensively comparable to the commonly +known access to CAN character devices. 
To meet the new possibilities +provided by the multi user SocketCAN approach, some reasonable +defaults are set at RAW socket binding time: + +- The filters are set to exactly one filter receiving everything +- The socket only receives valid data frames (=> no error message frames) +- The loopback of sent CAN frames is enabled (see :ref:`socketcan-local-loopback2`) +- The socket does not receive its own sent frames (in loopback mode) + +These default settings may be changed before or after binding the socket. +To use the referenced definitions of the socket options for CAN_RAW +sockets, include <linux/can/raw.h>. + + +.. _socketcan-rawfilter: + +RAW socket option CAN_RAW_FILTER +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The reception of CAN frames using CAN_RAW sockets can be controlled +by defining 0 .. n filters with the CAN_RAW_FILTER socket option. + +The CAN filter structure is defined in include/linux/can.h: + +.. code-block:: C + + struct can_filter { + canid_t can_id; + canid_t can_mask; + }; + +A filter matches, when: + +.. code-block:: C + + <received_can_id> & mask == can_id & mask + +which is analogous to known CAN controllers hardware filter semantics. +The filter can be inverted in this semantic, when the CAN_INV_FILTER +bit is set in can_id element of the can_filter structure. In +contrast to CAN controller hardware filters the user may set 0 .. n +receive filters for each open socket separately: + +.. code-block:: C + + struct can_filter rfilter[2]; + + rfilter[0].can_id = 0x123; + rfilter[0].can_mask = CAN_SFF_MASK; + rfilter[1].can_id = 0x200; + rfilter[1].can_mask = 0x700; + + setsockopt(s, SOL_CAN_RAW, CAN_RAW_FILTER, &rfilter, sizeof(rfilter)); + +To disable the reception of CAN frames on the selected CAN_RAW socket: + +.. code-block:: C + + setsockopt(s, SOL_CAN_RAW, CAN_RAW_FILTER, NULL, 0); + +To set the filters to zero filters is quite obsolete as to not read +data causes the raw socket to discard the received CAN frames. 
But +having this 'send only' use-case we may remove the receive list in the +Kernel to save a little (really a very little!) CPU usage. + +CAN Filter Usage Optimisation +............................. + +The CAN filters are processed in per-device filter lists at CAN frame +reception time. To reduce the number of checks that need to be performed +while walking through the filter lists the CAN core provides an optimized +filter handling when the filter subscription focusses on a single CAN ID. + +For the possible 2048 SFF CAN identifiers the identifier is used as an index +to access the corresponding subscription list without any further checks. +For the 2^29 possible EFF CAN identifiers a 10 bit XOR folding is used as +hash function to retrieve the EFF table index. + +To benefit from the optimized filters for single CAN identifiers the +CAN_SFF_MASK or CAN_EFF_MASK have to be set into can_filter.can_mask together +with set CAN_EFF_FLAG and CAN_RTR_FLAG bits. A set CAN_EFF_FLAG bit in the +can_filter.can_mask makes clear that it matters whether a SFF or EFF CAN ID is +subscribed. E.g. in the example from above: + +.. code-block:: C + + rfilter[0].can_id = 0x123; + rfilter[0].can_mask = CAN_SFF_MASK; + +both SFF frames with CAN ID 0x123 and EFF frames with 0xXXXXX123 can pass. + +To filter for only 0x123 (SFF) and 0x12345678 (EFF) CAN identifiers the +filter has to be defined in this way to benefit from the optimized filters: + +.. 
code-block:: C + + struct can_filter rfilter[2]; + + rfilter[0].can_id = 0x123; + rfilter[0].can_mask = (CAN_EFF_FLAG | CAN_RTR_FLAG | CAN_SFF_MASK); + rfilter[1].can_id = 0x12345678 | CAN_EFF_FLAG; + rfilter[1].can_mask = (CAN_EFF_FLAG | CAN_RTR_FLAG | CAN_EFF_MASK); + + setsockopt(s, SOL_CAN_RAW, CAN_RAW_FILTER, &rfilter, sizeof(rfilter)); + + +RAW Socket Option CAN_RAW_ERR_FILTER +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +As described in :ref:`socketcan-network-problem-notifications` the CAN interface driver can generate so +called Error Message Frames that can optionally be passed to the user +application in the same way as other CAN frames. The possible +errors are divided into different error classes that may be filtered +using the appropriate error mask. To register for every possible +error condition CAN_ERR_MASK can be used as value for the error mask. +The values for the error mask are defined in linux/can/error.h: + +.. code-block:: C + + can_err_mask_t err_mask = ( CAN_ERR_TX_TIMEOUT | CAN_ERR_BUSOFF ); + + setsockopt(s, SOL_CAN_RAW, CAN_RAW_ERR_FILTER, + &err_mask, sizeof(err_mask)); + + +RAW Socket Option CAN_RAW_LOOPBACK +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To meet multi user needs the local loopback is enabled by default +(see :ref:`socketcan-local-loopback1` for details). But in some embedded use-cases +(e.g. when only one application uses the CAN bus) this loopback +functionality can be disabled (separately for each socket): + +.. code-block:: C + + int loopback = 0; /* 0 = disabled, 1 = enabled (default) */ + + setsockopt(s, SOL_CAN_RAW, CAN_RAW_LOOPBACK, &loopback, sizeof(loopback)); + + +RAW socket option CAN_RAW_RECV_OWN_MSGS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When the local loopback is enabled, all the sent CAN frames are +looped back to the open CAN sockets that registered for the CAN +frames' CAN-ID on this given interface to meet the multi user +needs. 
The reception of the CAN frames on the same socket that was +sending the CAN frame is assumed to be unwanted and therefore +disabled by default. This default behaviour may be changed on +demand: + +.. code-block:: C + + int recv_own_msgs = 1; /* 0 = disabled (default), 1 = enabled */ + + setsockopt(s, SOL_CAN_RAW, CAN_RAW_RECV_OWN_MSGS, + &recv_own_msgs, sizeof(recv_own_msgs)); + + +.. _socketcan-rawfd: + +RAW Socket Option CAN_RAW_FD_FRAMES +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +CAN FD support in CAN_RAW sockets can be enabled with a new socket option +CAN_RAW_FD_FRAMES which is off by default. When the new socket option is +not supported by the CAN_RAW socket (e.g. on older kernels), switching the +CAN_RAW_FD_FRAMES option returns the error -ENOPROTOOPT. + +Once CAN_RAW_FD_FRAMES is enabled the application can send both CAN frames +and CAN FD frames. OTOH the application has to handle CAN and CAN FD frames +when reading from the socket: + +.. code-block:: C + + CAN_RAW_FD_FRAMES enabled: CAN_MTU and CANFD_MTU are allowed + CAN_RAW_FD_FRAMES disabled: only CAN_MTU is allowed (default) + +Example: + +.. code-block:: C + + [ remember: CANFD_MTU == sizeof(struct canfd_frame) ] + + struct canfd_frame cfd; + + nbytes = read(s, &cfd, CANFD_MTU); + + if (nbytes == CANFD_MTU) { + printf("got CAN FD frame with length %d\n", cfd.len); + /* cfd.flags contains valid data */ + } else if (nbytes == CAN_MTU) { + printf("got legacy CAN frame with length %d\n", cfd.len); + /* cfd.flags is undefined */ + } else { + fprintf(stderr, "read: invalid CAN(FD) frame\n"); + return 1; + } + + /* the content can be handled independently from the received MTU size */ + + printf("can_id: %X data length: %d data: ", cfd.can_id, cfd.len); + for (i = 0; i < cfd.len; i++) + printf("%02X ", cfd.data[i]); + +When reading with size CANFD_MTU only returns CAN_MTU bytes that have +been received from the socket a legacy CAN frame has been read into the +provided CAN FD structure. 
Note that the canfd_frame.flags data field is +not specified in the struct can_frame and therefore it is only valid in +CANFD_MTU sized CAN FD frames. + +Implementation hint for new CAN applications: + +To build a CAN FD aware application use struct canfd_frame as basic CAN +data structure for CAN_RAW based applications. When the application is +executed on an older Linux kernel and switching the CAN_RAW_FD_FRAMES +socket option returns an error: No problem. You'll get legacy CAN frames +or CAN FD frames and can process them the same way. + +When sending to CAN devices make sure that the device is capable of handling +CAN FD frames by checking if the device maximum transfer unit is CANFD_MTU. +The CAN device MTU can be retrieved e.g. with a SIOCGIFMTU ioctl() syscall. + + +RAW socket option CAN_RAW_JOIN_FILTERS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The CAN_RAW socket can set multiple CAN identifier specific filters that +lead to multiple filters in the af_can.c filter processing. These filters +are independent from each other which leads to logical OR'ed filters when +applied (see :ref:`socketcan-rawfilter`). + +This socket option joins the given CAN filters in the way that only CAN +frames are passed to user space that matched *all* given CAN filters. The +semantic for the applied filters is therefore changed to a logical AND. + +This is useful especially when the filterset is a combination of filters +where the CAN_INV_FILTER flag is set in order to notch single CAN IDs or +CAN ID ranges from the incoming traffic. + + +RAW Socket Returned Message Flags +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When using the recvmsg() call, msg->msg_flags may contain the following flags: + +MSG_DONTROUTE: + set when the received frame was created on the local host. + +MSG_CONFIRM: + set when the frame was sent via the socket it is received on. 
+ This flag can be interpreted as a 'transmission confirmation' when the + CAN driver supports the echo of frames on driver level, see + :ref:`socketcan-local-loopback1` and :ref:`socketcan-local-loopback2`. + In order to receive such messages, CAN_RAW_RECV_OWN_MSGS must be set. + + +Broadcast Manager Protocol Sockets (SOCK_DGRAM) +----------------------------------------------- + +The Broadcast Manager protocol provides a command based configuration +interface to filter and send (e.g. cyclic) CAN messages in kernel space. + +Receive filters can be used to down sample frequent messages; detect events +such as message contents changes, packet length changes, and do time-out +monitoring of received messages. + +Periodic transmission tasks of CAN frames or a sequence of CAN frames can be +created and modified at runtime; both the message content and the two +possible transmit intervals can be altered. + +A BCM socket is not intended for sending individual CAN frames using the +struct can_frame as known from the CAN_RAW socket. Instead a special BCM +configuration message is defined. The basic BCM configuration message used +to communicate with the broadcast manager and the available operations are +defined in the linux/can/bcm.h include. The BCM message consists of a +message header with a command ('opcode') followed by zero or more CAN frames. +The broadcast manager sends responses to user space in the same form: + +.. code-block:: C + + struct bcm_msg_head { + __u32 opcode; /* command */ + __u32 flags; /* special flags */ + __u32 count; /* run 'count' times with ival1 */ + struct timeval ival1, ival2; /* count and subsequent interval */ + canid_t can_id; /* unique can_id for task */ + __u32 nframes; /* number of can_frames following */ + struct can_frame frames[0]; + }; + +The aligned payload 'frames' uses the same basic CAN frame structure defined +at the beginning of :ref:`socketcan-rawfd` and in the include/linux/can.h include. 
All +messages to the broadcast manager from user space have this structure. + +Note a CAN_BCM socket must be connected instead of bound after socket +creation (example without error checking): + +.. code-block:: C + + int s; + struct sockaddr_can addr; + struct ifreq ifr; + + s = socket(PF_CAN, SOCK_DGRAM, CAN_BCM); + + strcpy(ifr.ifr_name, "can0"); + ioctl(s, SIOCGIFINDEX, &ifr); + + addr.can_family = AF_CAN; + addr.can_ifindex = ifr.ifr_ifindex; + + connect(s, (struct sockaddr *)&addr, sizeof(addr)); + + (..) + +The broadcast manager socket is able to handle any number of in flight +transmissions or receive filters concurrently. The different RX/TX jobs are +distinguished by the unique can_id in each BCM message. However additional +CAN_BCM sockets are recommended to communicate on multiple CAN interfaces. +When the broadcast manager socket is bound to 'any' CAN interface (=> the +interface index is set to zero) the configured receive filters apply to any +CAN interface unless the sendto() syscall is used to overrule the 'any' CAN +interface index. When using recvfrom() instead of read() to retrieve BCM +socket messages the originating CAN interface is provided in can_ifindex. + + +Broadcast Manager Operations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The opcode defines the operation for the broadcast manager to carry out, +or details the broadcast managers response to several events, including +user requests. + +Transmit Operations (user space to broadcast manager): + +TX_SETUP: + Create (cyclic) transmission task. + +TX_DELETE: + Remove (cyclic) transmission task, requires only can_id. + +TX_READ: + Read properties of (cyclic) transmission task for can_id. + +TX_SEND: + Send one CAN frame. + +Transmit Responses (broadcast manager to user space): + +TX_STATUS: + Reply to TX_READ request (transmission task configuration). + +TX_EXPIRED: + Notification when counter finishes sending at initial interval + 'ival1'. Requires the TX_COUNTEVT flag to be set at TX_SETUP. 
+ +Receive Operations (user space to broadcast manager): + +RX_SETUP: + Create RX content filter subscription. + +RX_DELETE: + Remove RX content filter subscription, requires only can_id. + +RX_READ: + Read properties of RX content filter subscription for can_id. + +Receive Responses (broadcast manager to user space): + +RX_STATUS: + Reply to RX_READ request (filter task configuration). + +RX_TIMEOUT: + Cyclic message is detected to be absent (timer ival1 expired). + +RX_CHANGED: + BCM message with updated CAN frame (detected content change). + Sent on first message received or on receipt of revised CAN messages. + + +Broadcast Manager Message Flags +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When sending a message to the broadcast manager the 'flags' element may +contain the following flag definitions which influence the behaviour: + +SETTIMER: + Set the values of ival1, ival2 and count + +STARTTIMER: + Start the timer with the actual values of ival1, ival2 + and count. Starting the timer leads simultaneously to emit a CAN frame. + +TX_COUNTEVT: + Create the message TX_EXPIRED when count expires + +TX_ANNOUNCE: + A change of data by the process is emitted immediately. + +TX_CP_CAN_ID: + Copies the can_id from the message header to each + subsequent frame in frames. This is intended as usage simplification. For + TX tasks the unique can_id from the message header may differ from the + can_id(s) stored for transmission in the subsequent struct can_frame(s). + +RX_FILTER_ID: + Filter by can_id alone, no frames required (nframes=0). + +RX_CHECK_DLC: + A change of the DLC leads to an RX_CHANGED. + +RX_NO_AUTOTIMER: + Prevent automatically starting the timeout monitor. + +RX_ANNOUNCE_RESUME: + If passed at RX_SETUP and a receive timeout occurred, a + RX_CHANGED message will be generated when the (cyclic) receive restarts. + +TX_RESET_MULTI_IDX: + Reset the index for the multiple frame transmission. + +RX_RTR_FRAME: + Send reply for RTR-request (placed in op->frames[0]). 
+ + +Broadcast Manager Transmission Timers +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Periodic transmission configurations may use up to two interval timers. +In this case the BCM sends a number of messages ('count') at an interval +'ival1', then continuing to send at another given interval 'ival2'. When +only one timer is needed 'count' is set to zero and only 'ival2' is used. +When the SETTIMER and STARTTIMER flags are set the timers are activated. +The timer values can be altered at runtime when only SETTIMER is set. + + +Broadcast Manager message sequence transmission +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Up to 256 CAN frames can be transmitted in a sequence in the case of a cyclic +TX task configuration. The number of CAN frames is provided in the 'nframes' +element of the BCM message head. The defined number of CAN frames is added +as an array to the TX_SETUP BCM configuration message: + +.. code-block:: C + + /* create a struct to set up a sequence of four CAN frames */ + struct { + struct bcm_msg_head msg_head; + struct can_frame frame[4]; + } mytxmsg; + + (..) + mytxmsg.msg_head.nframes = 4; + (..) + + write(s, &mytxmsg, sizeof(mytxmsg)); + +With every transmission the index in the array of CAN frames is increased +and set to zero at index overflow. + + +Broadcast Manager Receive Filter Timers +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The timer values ival1 or ival2 may be set to non-zero values at RX_SETUP. +When the SETTIMER flag is set the timers are enabled: + +ival1: + Send RX_TIMEOUT when a received message is not received again within + the given time. When STARTTIMER is set at RX_SETUP the timeout detection + is activated directly - even without a former CAN frame reception. + +ival2: + Throttle the received message rate down to the value of ival2. This + is useful to reduce messages for the application when the signal inside the + CAN frame is stateless as state changes within the ival2 period may get + lost.
+ +Broadcast Manager Multiplex Message Receive Filter +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To filter for content changes in multiplex message sequences an array of more +than one CAN frame can be passed in a RX_SETUP configuration message. The +data bytes of the first CAN frame contain the mask of relevant bits that +have to match in the subsequent CAN frames with the received CAN frame. +If one of the subsequent CAN frames is matching the bits in that frame data +mark the relevant content to be compared with the previously received content. +Up to 257 CAN frames (multiplex filter bit mask CAN frame plus 256 CAN +filters) can be added as an array to the RX_SETUP BCM configuration message: + +.. code-block:: C + + /* usually used to clear CAN frame data[] - beware of endian problems! */ + #define U64_DATA(p) (*(unsigned long long*)(p)->data) + + struct { + struct bcm_msg_head msg_head; + struct can_frame frame[5]; + } msg; + + msg.msg_head.opcode = RX_SETUP; + msg.msg_head.can_id = 0x42; + msg.msg_head.flags = 0; + msg.msg_head.nframes = 5; + U64_DATA(&msg.frame[0]) = 0xFF00000000000000ULL; /* MUX mask */ + U64_DATA(&msg.frame[1]) = 0x01000000000000FFULL; /* data mask (MUX 0x01) */ + U64_DATA(&msg.frame[2]) = 0x0200FFFF000000FFULL; /* data mask (MUX 0x02) */ + U64_DATA(&msg.frame[3]) = 0x330000FFFFFF0003ULL; /* data mask (MUX 0x33) */ + U64_DATA(&msg.frame[4]) = 0x4F07FC0FF0000000ULL; /* data mask (MUX 0x4F) */ + + write(s, &msg, sizeof(msg)); + + +Broadcast Manager CAN FD Support +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The programming API of the CAN_BCM depends on struct can_frame which is +given as array directly behind the bcm_msg_head structure. To follow this +schema for the CAN FD frames a new flag 'CAN_FD_FRAME' in the bcm_msg_head +flags indicates that the concatenated CAN frame structures behind the +bcm_msg_head are defined as struct canfd_frame: + +..
code-block:: C + + struct { + struct bcm_msg_head msg_head; + struct canfd_frame frame[5]; + } msg; + + msg.msg_head.opcode = RX_SETUP; + msg.msg_head.can_id = 0x42; + msg.msg_head.flags = CAN_FD_FRAME; + msg.msg_head.nframes = 5; + (..) + +When using CAN FD frames for multiplex filtering the MUX mask is still +expected in the first 64 bits of the struct canfd_frame data section. + + +Connected Transport Protocols (SOCK_SEQPACKET) +---------------------------------------------- + +(to be written) + + +Unconnected Transport Protocols (SOCK_DGRAM) +-------------------------------------------- + +(to be written) + + +.. _socketcan-core-module: + +SocketCAN Core Module +===================== + +The SocketCAN core module implements the protocol family +PF_CAN. CAN protocol modules are loaded by the core module at +runtime. The core module provides an interface for CAN protocol +modules to subscribe needed CAN IDs (see :ref:`socketcan-receive-lists`). + + +can.ko Module Params +-------------------- + +- **stats_timer**: + To calculate the SocketCAN core statistics + (e.g. current/maximum frames per second) this 1 second timer is + invoked at can.ko module start time by default. This timer can be + disabled by using stats_timer=0 on the module command line. + +- **debug**: + (removed since SocketCAN SVN r546) + + +procfs content +-------------- + +As described in :ref:`socketcan-receive-lists` the SocketCAN core uses several filter +lists to deliver received CAN frames to CAN protocol modules. These +receive lists, their filters and the count of filter matches can be +checked in the appropriate receive list.
All entries contain the +device and a protocol module identifier:: + + foo@bar:~$ cat /proc/net/can/rcvlist_all + + receive list 'rx_all': + (vcan3: no entry) + (vcan2: no entry) + (vcan1: no entry) + device can_id can_mask function userdata matches ident + vcan0 000 00000000 f88e6370 f6c6f400 0 raw + (any: no entry) + +In this example an application requests any CAN traffic from vcan0:: + + rcvlist_all - list for unfiltered entries (no filter operations) + rcvlist_eff - list for single extended frame (EFF) entries + rcvlist_err - list for error message frames masks + rcvlist_fil - list for mask/value filters + rcvlist_inv - list for mask/value filters (inverse semantic) + rcvlist_sff - list for single standard frame (SFF) entries + +Additional procfs files in /proc/net/can:: + + stats - SocketCAN core statistics (rx/tx frames, match ratios, ...) + reset_stats - manual statistic reset + version - prints the SocketCAN core version and the ABI version + + +Writing Own CAN Protocol Modules +-------------------------------- + +To implement a new protocol in the protocol family PF_CAN a new +protocol has to be defined in include/linux/can.h . +The prototypes and definitions to use the SocketCAN core can be +accessed by including include/linux/can/core.h . +In addition to functions that register the CAN protocol and the +CAN device notifier chain there are functions to subscribe CAN +frames received by CAN interfaces and to send CAN frames:: + + can_rx_register - subscribe CAN frames from a specific interface + can_rx_unregister - unsubscribe CAN frames from a specific interface + can_send - transmit a CAN frame (optional with local loopback) + +For details see the kerneldoc documentation in net/can/af_can.c or +the source code of net/can/raw.c or net/can/bcm.c . + + +CAN Network Drivers +=================== + +Writing a CAN network device driver is much easier than writing a +CAN character device driver. 
Similar to other known network device +drivers you mainly have to deal with: + +- TX: Put the CAN frame from the socket buffer to the CAN controller. +- RX: Put the CAN frame from the CAN controller to the socket buffer. + +See e.g. at Documentation/networking/netdevices.txt . The differences +for writing CAN network device driver are described below: + + +General Settings +---------------- + +.. code-block:: C + + dev->type = ARPHRD_CAN; /* the netdevice hardware type */ + dev->flags = IFF_NOARP; /* CAN has no arp */ + + dev->mtu = CAN_MTU; /* sizeof(struct can_frame) -> legacy CAN interface */ + + or alternative, when the controller supports CAN with flexible data rate: + dev->mtu = CANFD_MTU; /* sizeof(struct canfd_frame) -> CAN FD interface */ + +The struct can_frame or struct canfd_frame is the payload of each socket +buffer (skbuff) in the protocol family PF_CAN. + + +.. _socketcan-local-loopback2: + +Local Loopback of Sent Frames +----------------------------- + +As described in :ref:`socketcan-local-loopback1` the CAN network device driver should +support a local loopback functionality similar to the local echo +e.g. of tty devices. In this case the driver flag IFF_ECHO has to be +set to prevent the PF_CAN core from locally echoing sent frames +(aka loopback) as fallback solution:: + + dev->flags = (IFF_NOARP | IFF_ECHO); + + +CAN Controller Hardware Filters +------------------------------- + +To reduce the interrupt load on deep embedded systems some CAN +controllers support the filtering of CAN IDs or ranges of CAN IDs. +These hardware filter capabilities vary from controller to +controller and have to be identified as not feasible in a multi-user +networking approach. The use of the very controller specific +hardware filters could make sense in a very dedicated use-case, as a +filter on driver level would affect all users in the multi-user +system. 
The highly efficient filter sets inside the PF_CAN core allow +to set different multiple filters for each socket separately. +Therefore the use of hardware filters goes to the category 'handmade +tuning on deep embedded systems'. The author is running an MPC603e +@133MHz with four SJA1000 CAN controllers from 2002 under heavy bus +load without any problems ... + + +The Virtual CAN Driver (vcan) +----------------------------- + +Similar to the network loopback devices, vcan offers a virtual local +CAN interface. A fully qualified address on CAN consists of + +- a unique CAN Identifier (CAN ID) +- the CAN bus this CAN ID is transmitted on (e.g. can0) + +so in common use cases more than one virtual CAN interface is needed. + +The virtual CAN interfaces allow the transmission and reception of CAN +frames without real CAN controller hardware. Virtual CAN network +devices are usually named 'vcanX', like vcan0 vcan1 vcan2 ... +When compiled as a module the virtual CAN driver module is called vcan.ko + +Since Linux Kernel version 2.6.24 the vcan driver supports the Kernel +netlink interface to create vcan network devices. The creation and +removal of vcan network devices can be managed with the ip(8) tool:: + + - Create a virtual CAN network interface: + $ ip link add type vcan + + - Create a virtual CAN network interface with a specific name 'vcan42': + $ ip link add dev vcan42 type vcan + + - Remove a (virtual CAN) network interface 'vcan42': + $ ip link del vcan42 + + +The CAN Network Device Driver Interface +--------------------------------------- + +The CAN network device driver interface provides a generic interface +to set up, configure and monitor CAN network devices. The user can then +configure the CAN device, like setting the bit-timing parameters, via +the netlink interface using the program "ip" from the "IPROUTE2" +utility suite. The following chapter describes briefly how to use it.
+Furthermore, the interface uses a common data structure and exports a +set of common functions, which all real CAN network device drivers +should use. Please have a look to the SJA1000 or MSCAN driver to +understand how to use them. The name of the module is can-dev.ko. + + +Netlink interface to set/get devices properties +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The CAN device must be configured via netlink interface. The supported +netlink message types are defined and briefly described in +"include/linux/can/netlink.h". CAN link support for the program "ip" +of the IPROUTE2 utility suite is available and it can be used as shown +below: + +Setting CAN device properties:: + + $ ip link set can0 type can help + Usage: ip link set DEVICE type can + [ bitrate BITRATE [ sample-point SAMPLE-POINT] ] | + [ tq TQ prop-seg PROP_SEG phase-seg1 PHASE-SEG1 + phase-seg2 PHASE-SEG2 [ sjw SJW ] ] + + [ dbitrate BITRATE [ dsample-point SAMPLE-POINT] ] | + [ dtq TQ dprop-seg PROP_SEG dphase-seg1 PHASE-SEG1 + dphase-seg2 PHASE-SEG2 [ dsjw SJW ] ] + + [ loopback { on | off } ] + [ listen-only { on | off } ] + [ triple-sampling { on | off } ] + [ one-shot { on | off } ] + [ berr-reporting { on | off } ] + [ fd { on | off } ] + [ fd-non-iso { on | off } ] + [ presume-ack { on | off } ] + + [ restart-ms TIME-MS ] + [ restart ] + + Where: BITRATE := { 1..1000000 } + SAMPLE-POINT := { 0.000..0.999 } + TQ := { NUMBER } + PROP-SEG := { 1..8 } + PHASE-SEG1 := { 1..8 } + PHASE-SEG2 := { 1..8 } + SJW := { 1..4 } + RESTART-MS := { 0 | NUMBER } + +Display CAN device details and statistics:: + + $ ip -details -statistics link show can0 + 2: can0: <NOARP,UP,LOWER_UP,ECHO> mtu 16 qdisc pfifo_fast state UP qlen 10 + link/can + can <TRIPLE-SAMPLING> state ERROR-ACTIVE restart-ms 100 + bitrate 125000 sample_point 0.875 + tq 125 prop-seg 6 phase-seg1 7 phase-seg2 2 sjw 1 + sja1000: tseg1 1..16 tseg2 1..8 sjw 1..4 brp 1..64 brp-inc 1 + clock 8000000 + re-started bus-errors arbit-lost 
error-warn error-pass bus-off + 41 17457 0 41 42 41 + RX: bytes packets errors dropped overrun mcast + 140859 17608 17457 0 0 0 + TX: bytes packets errors dropped carrier collsns + 861 112 0 41 0 0 + +More info to the above output: + +"<TRIPLE-SAMPLING>" + Shows the list of selected CAN controller modes: LOOPBACK, + LISTEN-ONLY, or TRIPLE-SAMPLING. + +"state ERROR-ACTIVE" + The current state of the CAN controller: "ERROR-ACTIVE", + "ERROR-WARNING", "ERROR-PASSIVE", "BUS-OFF" or "STOPPED" + +"restart-ms 100" + Automatic restart delay time. If set to a non-zero value, a + restart of the CAN controller will be triggered automatically + in case of a bus-off condition after the specified delay time + in milliseconds. By default it's off. + +"bitrate 125000 sample-point 0.875" + Shows the real bit-rate in bits/sec and the sample-point in the + range 0.000..0.999. If the calculation of bit-timing parameters + is enabled in the kernel (CONFIG_CAN_CALC_BITTIMING=y), the + bit-timing can be defined by setting the "bitrate" argument. + Optionally the "sample-point" can be specified. By default it's + 0.000 assuming CIA-recommended sample-points. + +"tq 125 prop-seg 6 phase-seg1 7 phase-seg2 2 sjw 1" + Shows the time quanta in ns, propagation segment, phase buffer + segment 1 and 2 and the synchronisation jump width in units of + tq. They allow to define the CAN bit-timing in a hardware + independent format as proposed by the Bosch CAN 2.0 spec (see + chapter 8 of http://www.semiconductors.bosch.de/pdf/can2spec.pdf). + +"sja1000: tseg1 1..16 tseg2 1..8 sjw 1..4 brp 1..64 brp-inc 1 clock 8000000" + Shows the bit-timing constants of the CAN controller, here the + "sja1000". The minimum and maximum values of the time segment 1 + and 2, the synchronisation jump width in units of tq, the + bitrate pre-scaler and the CAN system clock frequency in Hz. + These constants could be used for user-defined (non-standard) + bit-timing calculation algorithms in user-space. 
+ +"re-started bus-errors arbit-lost error-warn error-pass bus-off" + Shows the number of restarts, bus and arbitration lost errors, + and the state changes to the error-warning, error-passive and + bus-off state. RX overrun errors are listed in the "overrun" + field of the standard network statistics. + +Setting the CAN Bit-Timing +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The CAN bit-timing parameters can always be defined in a hardware +independent format as proposed in the Bosch CAN 2.0 specification +specifying the arguments "tq", "prop_seg", "phase_seg1", "phase_seg2" +and "sjw":: + + $ ip link set canX type can tq 125 prop-seg 6 \ + phase-seg1 7 phase-seg2 2 sjw 1 + +If the kernel option CONFIG_CAN_CALC_BITTIMING is enabled, CIA +recommended CAN bit-timing parameters will be calculated if the bit- +rate is specified with the argument "bitrate":: + + $ ip link set canX type can bitrate 125000 + +Note that this works fine for the most common CAN controllers with +standard bit-rates but may *fail* for exotic bit-rates or CAN system +clock frequencies. Disabling CONFIG_CAN_CALC_BITTIMING saves some +space and allows user-space tools to solely determine and set the +bit-timing parameters. The CAN controller specific bit-timing +constants can be used for that purpose. They are listed by the +following command:: + + $ ip -details link show can0 + ... + sja1000: clock 8000000 tseg1 1..16 tseg2 1..8 sjw 1..4 brp 1..64 brp-inc 1 + + +Starting and Stopping the CAN Network Device +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A CAN network device is started or stopped as usual with the command +"ifconfig canX up/down" or "ip link set canX up/down". Be aware that +you *must* define proper bit-timing parameters for real CAN devices +before you can start it to avoid error-prone default settings:: + + $ ip link set canX up type can bitrate 125000 + +A device may enter the "bus-off" state if too many errors occurred on +the CAN bus. Then no more messages are received or sent. 
An automatic +bus-off recovery can be enabled by setting the "restart-ms" to a +non-zero value, e.g.:: + + $ ip link set canX type can restart-ms 100 + +Alternatively, the application may realize the "bus-off" condition +by monitoring CAN error message frames and do a restart when +appropriate with the command:: + + $ ip link set canX type can restart + +Note that a restart will also create a CAN error message frame (see +also :ref:`socketcan-network-problem-notifications`). + + +.. _socketcan-can-fd-driver: + +CAN FD (Flexible Data Rate) Driver Support +------------------------------------------ + +CAN FD capable CAN controllers support two different bitrates for the +arbitration phase and the payload phase of the CAN FD frame. Therefore a +second bit timing has to be specified in order to enable the CAN FD bitrate. + +Additionally CAN FD capable CAN controllers support up to 64 bytes of +payload. The representation of this length in can_frame.can_dlc and +canfd_frame.len for userspace applications and inside the Linux network +layer is a plain value from 0 .. 64 instead of the CAN 'data length code'. +The data length code was a 1:1 mapping to the payload length in the legacy +CAN frames anyway. The payload length to the bus-relevant DLC mapping is +only performed inside the CAN drivers, preferably with the helper +functions can_dlc2len() and can_len2dlc(). + +The CAN netdevice driver capabilities can be distinguished by the network +devices maximum transfer unit (MTU):: + + MTU = 16 (CAN_MTU) => sizeof(struct can_frame) => 'legacy' CAN device + MTU = 72 (CANFD_MTU) => sizeof(struct canfd_frame) => CAN FD capable device + +The CAN device MTU can be retrieved e.g. with a SIOCGIFMTU ioctl() syscall. +N.B. CAN FD capable devices can also handle and send legacy CAN frames. + +When configuring CAN FD capable CAN controllers an additional 'data' bitrate +has to be set. 
This bitrate for the data phase of the CAN FD frame has to be +at least the bitrate which was configured for the arbitration phase. This +second bitrate is specified analogously to the first bitrate but the bitrate +setting keywords for the 'data' bitrate start with 'd' e.g. dbitrate, +dsample-point, dsjw or dtq and similar settings. When a data bitrate is set +within the configuration process the controller option "fd on" can be +specified to enable the CAN FD mode in the CAN controller. This controller +option also switches the device MTU to 72 (CANFD_MTU). + +The first CAN FD specification presented as a whitepaper at the International +CAN Conference 2012 needed to be improved for data integrity reasons. +Therefore two CAN FD implementations have to be distinguished today: + +- ISO compliant: The ISO 11898-1:2015 CAN FD implementation (default) +- non-ISO compliant: The CAN FD implementation following the 2012 whitepaper + +Finally there are three types of CAN FD controllers: + +1. ISO compliant (fixed) +2. non-ISO compliant (fixed, like the M_CAN IP core v3.0.1 in m_can.c) +3. ISO/non-ISO CAN FD controllers (switchable, like the PEAK PCAN-USB FD) + +The current ISO/non-ISO mode is announced by the CAN controller driver via +netlink and displayed by the 'ip' tool (controller option FD-NON-ISO). +The ISO/non-ISO-mode can be altered by setting 'fd-non-iso {on|off}' for +switchable CAN FD controllers only.
+ +Example configuring 500 kbit/s arbitration bitrate and 4 Mbit/s data bitrate:: + + $ ip link set can0 up type can bitrate 500000 sample-point 0.75 \ + dbitrate 4000000 dsample-point 0.8 fd on + $ ip -details link show can0 + 5: can0: <NOARP,UP,LOWER_UP,ECHO> mtu 72 qdisc pfifo_fast state UNKNOWN \ + mode DEFAULT group default qlen 10 + link/can promiscuity 0 + can <FD> state ERROR-ACTIVE (berr-counter tx 0 rx 0) restart-ms 0 + bitrate 500000 sample-point 0.750 + tq 50 prop-seg 14 phase-seg1 15 phase-seg2 10 sjw 1 + pcan_usb_pro_fd: tseg1 1..64 tseg2 1..16 sjw 1..16 brp 1..1024 \ + brp-inc 1 + dbitrate 4000000 dsample-point 0.800 + dtq 12 dprop-seg 7 dphase-seg1 8 dphase-seg2 4 dsjw 1 + pcan_usb_pro_fd: dtseg1 1..16 dtseg2 1..8 dsjw 1..4 dbrp 1..1024 \ + dbrp-inc 1 + clock 80000000 + +Example when 'fd-non-iso on' is added on this switchable CAN FD adapter:: + + can <FD,FD-NON-ISO> state ERROR-ACTIVE (berr-counter tx 0 rx 0) restart-ms 0 + + +Supported CAN Hardware +---------------------- + +Please check the "Kconfig" file in "drivers/net/can" to get an up-to-date +list of the supported CAN hardware. On the SocketCAN project website +(see :ref:`socketcan-resources`) there might be further drivers available, also for +older kernel versions. + + +.. _socketcan-resources: + +SocketCAN Resources +=================== + +The Linux CAN / SocketCAN project resources (project site / mailing list) +are referenced in the MAINTAINERS file in the Linux source tree. +Search for CAN NETWORK [LAYERS|DRIVERS].
+ +Credits +======= + +- Oliver Hartkopp (PF_CAN core, filters, drivers, bcm, SJA1000 driver) +- Urs Thuermann (PF_CAN core, kernel integration, socket interfaces, raw, vcan) +- Jan Kizka (RT-SocketCAN core, Socket-API reconciliation) +- Wolfgang Grandegger (RT-SocketCAN core & drivers, Raw Socket-API reviews, CAN device driver interface, MSCAN driver) +- Robert Schwebel (design reviews, PTXdist integration) +- Marc Kleine-Budde (design reviews, Kernel 2.6 cleanups, drivers) +- Benedikt Spranger (reviews) +- Thomas Gleixner (LKML reviews, coding style, posting hints) +- Andrey Volkov (kernel subtree structure, ioctls, MSCAN driver) +- Matthias Brukner (first SJA1000 CAN netdevice implementation Q2/2003) +- Klaus Hitschler (PEAK driver integration) +- Uwe Koppe (CAN netdevices with PF_PACKET approach) +- Michael Schulze (driver layer loopback requirement, RT CAN drivers review) +- Pavel Pisa (Bit-timing calculation) +- Sascha Hauer (SJA1000 platform driver) +- Sebastian Haas (SJA1000 EMS PCI driver) +- Markus Plessing (SJA1000 EMS PCI driver) +- Per Dalen (SJA1000 Kvaser PCI driver) +- Sam Ravnborg (reviews, coding style, kbuild help) diff --git a/Documentation/networking/can.txt b/Documentation/networking/can.txt deleted file mode 100644 index aa15b9ee2e70..000000000000 --- a/Documentation/networking/can.txt +++ /dev/null @@ -1,1308 +0,0 @@ -============================================================================ - -can.txt - -Readme file for the Controller Area Network Protocol Family (aka SocketCAN) - -This file contains - - 1 Overview / What is SocketCAN - - 2 Motivation / Why using the socket API - - 3 SocketCAN concept - 3.1 receive lists - 3.2 local loopback of sent frames - 3.3 network problem notifications - - 4 How to use SocketCAN - 4.1 RAW protocol sockets with can_filters (SOCK_RAW) - 4.1.1 RAW socket option CAN_RAW_FILTER - 4.1.2 RAW socket option CAN_RAW_ERR_FILTER - 4.1.3 RAW socket option CAN_RAW_LOOPBACK - 4.1.4 RAW socket option 
CAN_RAW_RECV_OWN_MSGS - 4.1.5 RAW socket option CAN_RAW_FD_FRAMES - 4.1.6 RAW socket option CAN_RAW_JOIN_FILTERS - 4.1.7 RAW socket returned message flags - 4.2 Broadcast Manager protocol sockets (SOCK_DGRAM) - 4.2.1 Broadcast Manager operations - 4.2.2 Broadcast Manager message flags - 4.2.3 Broadcast Manager transmission timers - 4.2.4 Broadcast Manager message sequence transmission - 4.2.5 Broadcast Manager receive filter timers - 4.2.6 Broadcast Manager multiplex message receive filter - 4.2.7 Broadcast Manager CAN FD support - 4.3 connected transport protocols (SOCK_SEQPACKET) - 4.4 unconnected transport protocols (SOCK_DGRAM) - - 5 SocketCAN core module - 5.1 can.ko module params - 5.2 procfs content - 5.3 writing own CAN protocol modules - - 6 CAN network drivers - 6.1 general settings - 6.2 local loopback of sent frames - 6.3 CAN controller hardware filters - 6.4 The virtual CAN driver (vcan) - 6.5 The CAN network device driver interface - 6.5.1 Netlink interface to set/get devices properties - 6.5.2 Setting the CAN bit-timing - 6.5.3 Starting and stopping the CAN network device - 6.6 CAN FD (flexible data rate) driver support - 6.7 supported CAN hardware - - 7 SocketCAN resources - - 8 Credits - -============================================================================ - -1. Overview / What is SocketCAN --------------------------------- - -The socketcan package is an implementation of CAN protocols -(Controller Area Network) for Linux. CAN is a networking technology -which has widespread use in automation, embedded devices, and -automotive fields. While there have been other CAN implementations -for Linux based on character devices, SocketCAN uses the Berkeley -socket API, the Linux network stack and implements the CAN device -drivers as network interfaces. The CAN socket API has been designed -as similar as possible to the TCP/IP protocols to allow programmers, -familiar with network programming, to easily learn how to use CAN -sockets. - -2. 
Motivation / Why using the socket API ----------------------------------------- - -There have been CAN implementations for Linux before SocketCAN so the -question arises, why we have started another project. Most existing -implementations come as a device driver for some CAN hardware, they -are based on character devices and provide comparatively little -functionality. Usually, there is only a hardware-specific device -driver which provides a character device interface to send and -receive raw CAN frames, directly to/from the controller hardware. -Queueing of frames and higher-level transport protocols like ISO-TP -have to be implemented in user space applications. Also, most -character-device implementations support only one single process to -open the device at a time, similar to a serial interface. Exchanging -the CAN controller requires employment of another device driver and -often the need for adaption of large parts of the application to the -new driver's API. - -SocketCAN was designed to overcome all of these limitations. A new -protocol family has been implemented which provides a socket interface -to user space applications and which builds upon the Linux network -layer, enabling use all of the provided queueing functionality. A device -driver for CAN controller hardware registers itself with the Linux -network layer as a network device, so that CAN frames from the -controller can be passed up to the network layer and on to the CAN -protocol family module and also vice-versa. Also, the protocol family -module provides an API for transport protocol modules to register, so -that any number of transport protocols can be loaded or unloaded -dynamically. In fact, the can core module alone does not provide any -protocol and cannot be used without loading at least one additional -protocol module. Multiple sockets can be opened at the same time, -on different or the same protocol module and they can listen/send -frames on different or the same CAN IDs. 
Several sockets listening on -the same interface for frames with the same CAN ID are all passed the -same received matching CAN frames. An application wishing to -communicate using a specific transport protocol, e.g. ISO-TP, just -selects that protocol when opening the socket, and then can read and -write application data byte streams, without having to deal with -CAN-IDs, frames, etc. - -Similar functionality visible from user-space could be provided by a -character device, too, but this would lead to a technically inelegant -solution for a couple of reasons: - -* Intricate usage. Instead of passing a protocol argument to - socket(2) and using bind(2) to select a CAN interface and CAN ID, an - application would have to do all these operations using ioctl(2)s. - -* Code duplication. A character device cannot make use of the Linux - network queueing code, so all that code would have to be duplicated - for CAN networking. - -* Abstraction. In most existing character-device implementations, the - hardware-specific device driver for a CAN controller directly - provides the character device for the application to work with. - This is at least very unusual in Unix systems for both, char and - block devices. For example you don't have a character device for a - certain UART of a serial interface, a certain sound chip in your - computer, a SCSI or IDE controller providing access to your hard - disk or tape streamer device. Instead, you have abstraction layers - which provide a unified character or block device interface to the - application on the one hand, and a interface for hardware-specific - device drivers on the other hand. These abstractions are provided - by subsystems like the tty layer, the audio subsystem or the SCSI - and IDE subsystems for the devices mentioned above. - - The easiest way to implement a CAN device driver is as a character - device without such a (complete) abstraction layer, as is done by most - existing drivers. 
The right way, however, would be to add such a - layer with all the functionality like registering for certain CAN - IDs, supporting several open file descriptors and (de)multiplexing - CAN frames between them, (sophisticated) queueing of CAN frames, and - providing an API for device drivers to register with. However, then - it would be no more difficult, or may be even easier, to use the - networking framework provided by the Linux kernel, and this is what - SocketCAN does. - - The use of the networking framework of the Linux kernel is just the - natural and most appropriate way to implement CAN for Linux. - -3. SocketCAN concept ---------------------- - - As described in chapter 2 it is the main goal of SocketCAN to - provide a socket interface to user space applications which builds - upon the Linux network layer. In contrast to the commonly known - TCP/IP and ethernet networking, the CAN bus is a broadcast-only(!) - medium that has no MAC-layer addressing like ethernet. The CAN-identifier - (can_id) is used for arbitration on the CAN-bus. Therefore the CAN-IDs - have to be chosen uniquely on the bus. When designing a CAN-ECU - network the CAN-IDs are mapped to be sent by a specific ECU. - For this reason a CAN-ID can be treated best as a kind of source address. - - 3.1 receive lists - - The network transparent access of multiple applications leads to the - problem that different applications may be interested in the same - CAN-IDs from the same CAN network interface. The SocketCAN core - module - which implements the protocol family CAN - provides several - high efficient receive lists for this reason. If e.g. a user space - application opens a CAN RAW socket, the raw protocol module itself - requests the (range of) CAN-IDs from the SocketCAN core that are - requested by the user. The subscription and unsubscription of - CAN-IDs can be done for specific CAN interfaces or for all(!) 
known - CAN interfaces with the can_rx_(un)register() functions provided to - CAN protocol modules by the SocketCAN core (see chapter 5). - To optimize the CPU usage at runtime the receive lists are split up - into several specific lists per device that match the requested - filter complexity for a given use-case. - - 3.2 local loopback of sent frames - - As known from other networking concepts the data exchanging - applications may run on the same or different nodes without any - change (except for the according addressing information): - - ___ ___ ___ _______ ___ - | _ | | _ | | _ | | _ _ | | _ | - ||A|| ||B|| ||C|| ||A| |B|| ||C|| - |___| |___| |___| |_______| |___| - | | | | | - -----------------(1)- CAN bus -(2)--------------- - - To ensure that application A receives the same information in the - example (2) as it would receive in example (1) there is need for - some kind of local loopback of the sent CAN frames on the appropriate - node. - - The Linux network devices (by default) just can handle the - transmission and reception of media dependent frames. Due to the - arbitration on the CAN bus the transmission of a low prio CAN-ID - may be delayed by the reception of a high prio CAN frame. To - reflect the correct* traffic on the node the loopback of the sent - data has to be performed right after a successful transmission. If - the CAN network interface is not capable of performing the loopback for - some reason the SocketCAN core can do this task as a fallback solution. - See chapter 6.2 for details (recommended). - - The loopback functionality is enabled by default to reflect standard - networking behaviour for CAN applications. Due to some requests from - the RT-SocketCAN group the loopback optionally may be disabled for each - separate socket. See sockopts from the CAN RAW sockets in chapter 4.1. - - * = you really like to have this when you're running analyser tools - like 'candump' or 'cansniffer' on the (same) node. 
- - 3.3 network problem notifications - - The use of the CAN bus may lead to several problems on the physical - and media access control layer. Detecting and logging of these lower - layer problems is a vital requirement for CAN users to identify - hardware issues on the physical transceiver layer as well as - arbitration problems and error frames caused by the different - ECUs. The occurrence of detected errors is important for diagnosis - and has to be logged together with the exact timestamp. For this - reason the CAN interface driver can generate so-called Error Message - Frames that can optionally be passed to the user application in the - same way as other CAN frames. Whenever an error on the physical layer - or the MAC layer is detected (e.g. by the CAN controller) the driver - creates an appropriate error message frame. Error message frames can - be requested by the user application using the common CAN filter - mechanisms. Inside this filter definition the (interested) type of - errors may be selected. The reception of error messages is disabled - by default. The format of the CAN error message frame is briefly - described in the Linux header file "include/uapi/linux/can/error.h". - -4. How to use SocketCAN ------------------------- - - Like TCP/IP, you first need to open a socket for communicating over a - CAN network. Since SocketCAN implements a new protocol family, you - need to pass PF_CAN as the first argument to the socket(2) system - call. Currently, there are two CAN protocols to choose from, the raw - socket protocol and the broadcast manager (BCM). So to open a socket, - you would write - - s = socket(PF_CAN, SOCK_RAW, CAN_RAW); - - and - - s = socket(PF_CAN, SOCK_DGRAM, CAN_BCM); - - respectively. After the successful creation of the socket, you would - normally use the bind(2) system call to bind the socket to a CAN - interface (which is different from TCP/IP due to different addressing - - see chapter 3).
After binding (CAN_RAW) or connecting (CAN_BCM) - the socket, you can read(2) and write(2) from/to the socket or use - send(2), sendto(2), sendmsg(2) and the recv* counterpart operations - on the socket as usual. There are also CAN specific socket options - described below. - - The basic CAN frame structure and the sockaddr structure are defined - in include/linux/can.h: - - struct can_frame { - canid_t can_id; /* 32 bit CAN_ID + EFF/RTR/ERR flags */ - __u8 can_dlc; /* frame payload length in byte (0 .. 8) */ - __u8 __pad; /* padding */ - __u8 __res0; /* reserved / padding */ - __u8 __res1; /* reserved / padding */ - __u8 data[8] __attribute__((aligned(8))); - }; - - The alignment of the (linear) payload data[] to a 64bit boundary - allows the user to define their own structs and unions to easily access - the CAN payload. There is no given byteorder on the CAN bus by - default. A read(2) system call on a CAN_RAW socket transfers a - struct can_frame to the user space. - - The sockaddr_can structure has an interface index like the - PF_PACKET socket, that also binds to a specific interface: - - struct sockaddr_can { - sa_family_t can_family; - int can_ifindex; - union { - /* transport protocol class address info (e.g. ISOTP) */ - struct { canid_t rx_id, tx_id; } tp; - - /* reserved for future CAN protocols address information */ - } can_addr; - }; - - To determine the interface index an appropriate ioctl() has to - be used (example for CAN_RAW sockets without error checking): - - int s; - struct sockaddr_can addr; - struct ifreq ifr; - - s = socket(PF_CAN, SOCK_RAW, CAN_RAW); - - strcpy(ifr.ifr_name, "can0" ); - ioctl(s, SIOCGIFINDEX, &ifr); - - addr.can_family = AF_CAN; - addr.can_ifindex = ifr.ifr_ifindex; - - bind(s, (struct sockaddr *)&addr, sizeof(addr)); - - (..) - - To bind a socket to all(!) CAN interfaces the interface index must - be 0 (zero). In this case the socket receives CAN frames from every - enabled CAN interface. 
To determine the originating CAN interface - the system call recvfrom(2) may be used instead of read(2). To send - on a socket that is bound to 'any' interface sendto(2) is needed to - specify the outgoing interface. - - Reading CAN frames from a bound CAN_RAW socket (see above) consists - of reading a struct can_frame: - - struct can_frame frame; - - nbytes = read(s, &frame, sizeof(struct can_frame)); - - if (nbytes < 0) { - perror("can raw socket read"); - return 1; - } - - /* paranoid check ... */ - if (nbytes < sizeof(struct can_frame)) { - fprintf(stderr, "read: incomplete CAN frame\n"); - return 1; - } - - /* do something with the received CAN frame */ - - Writing CAN frames can be done similarly, with the write(2) system call: - - nbytes = write(s, &frame, sizeof(struct can_frame)); - - When the socket is bound to 'any' existing CAN interface - (addr.can_ifindex = 0) it is recommended to use recvfrom(2) if the - information about the originating CAN interface is needed: - - struct sockaddr_can addr; - struct ifreq ifr; - socklen_t len = sizeof(addr); - struct can_frame frame; - - nbytes = recvfrom(s, &frame, sizeof(struct can_frame), - 0, (struct sockaddr*)&addr, &len); - - /* get interface name of the received CAN frame */ - ifr.ifr_ifindex = addr.can_ifindex; - ioctl(s, SIOCGIFNAME, &ifr); - printf("Received a CAN frame from interface %s", ifr.ifr_name); - - To write CAN frames on sockets bound to 'any' CAN interface the - outgoing interface has to be specified explicitly.
- - strcpy(ifr.ifr_name, "can0"); - ioctl(s, SIOCGIFINDEX, &ifr); - addr.can_ifindex = ifr.ifr_ifindex; - addr.can_family = AF_CAN; - - nbytes = sendto(s, &frame, sizeof(struct can_frame), - 0, (struct sockaddr*)&addr, sizeof(addr)); - - An accurate timestamp can be obtained with an ioctl(2) call after reading - a message from the socket: - - struct timeval tv; - ioctl(s, SIOCGSTAMP, &tv); - - The timestamp has a resolution of one microsecond and is set automatically - at the reception of a CAN frame. - - Remark about CAN FD (flexible data rate) support: - - Generally the handling of CAN FD is very similar to the formerly described - examples. The new CAN FD capable CAN controllers support two different - bitrates for the arbitration phase and the payload phase of the CAN FD frame - and up to 64 bytes of payload. This extended payload length breaks all the - kernel interfaces (ABI) which heavily rely on the CAN frame with fixed eight - bytes of payload (struct can_frame) like the CAN_RAW socket. Therefore e.g. - the CAN_RAW socket supports a new socket option CAN_RAW_FD_FRAMES that - switches the socket into a mode that allows the handling of CAN FD frames - and (legacy) CAN frames simultaneously (see section 4.1.5). - - The struct canfd_frame is defined in include/linux/can.h: - - struct canfd_frame { - canid_t can_id; /* 32 bit CAN_ID + EFF/RTR/ERR flags */ - __u8 len; /* frame payload length in byte (0 .. 64) */ - __u8 flags; /* additional flags for CAN FD */ - __u8 __res0; /* reserved / padding */ - __u8 __res1; /* reserved / padding */ - __u8 data[64] __attribute__((aligned(8))); - }; - - The struct canfd_frame and the existing struct can_frame have the can_id, - the payload length and the payload data at the same offset inside their - structures. This allows to handle the different structures very similar. - When the content of a struct can_frame is copied into a struct canfd_frame - all structure elements can be used as-is - only the data[] becomes extended. 
- - When introducing the struct canfd_frame it turned out that the data length - code (DLC) of the struct can_frame was used as length information as the - length and the DLC have a 1:1 mapping in the range of 0 .. 8. To preserve - the easy handling of the length information the canfd_frame.len element - contains a plain length value from 0 .. 64. So both canfd_frame.len and - can_frame.can_dlc are equal and contain length information and no DLC. - For details about the distinction of CAN and CAN FD capable devices and - the mapping to the bus-relevant data length code (DLC), see chapter 6.6. - - The lengths of the two CAN(FD) frame structures define the maximum transfer - unit (MTU) of the CAN(FD) network interface and skbuff data length. Two - definitions are specified for CAN specific MTUs in include/linux/can.h : - - #define CAN_MTU (sizeof(struct can_frame)) == 16 => 'legacy' CAN frame - #define CANFD_MTU (sizeof(struct canfd_frame)) == 72 => CAN FD frame - - 4.1 RAW protocol sockets with can_filters (SOCK_RAW) - - Using CAN_RAW sockets is extensively comparable to the commonly - known access to CAN character devices. To meet the new possibilities - provided by the multi user SocketCAN approach, some reasonable - defaults are set at RAW socket binding time: - - - The filters are set to exactly one filter receiving everything - - The socket only receives valid data frames (=> no error message frames) - - The loopback of sent CAN frames is enabled (see chapter 3.2) - - The socket does not receive its own sent frames (in loopback mode) - - These default settings may be changed before or after binding the socket. - To use the referenced definitions of the socket options for CAN_RAW - sockets, include <linux/can/raw.h>. - - 4.1.1 RAW socket option CAN_RAW_FILTER - - The reception of CAN frames using CAN_RAW sockets can be controlled - by defining 0 .. n filters with the CAN_RAW_FILTER socket option.
- - The CAN filter structure is defined in include/linux/can.h: - - struct can_filter { - canid_t can_id; - canid_t can_mask; - }; - - A filter matches, when - - <received_can_id> & mask == can_id & mask - - which is analogous to known CAN controllers hardware filter semantics. - The filter can be inverted in this semantic, when the CAN_INV_FILTER - bit is set in can_id element of the can_filter structure. In - contrast to CAN controller hardware filters the user may set 0 .. n - receive filters for each open socket separately: - - struct can_filter rfilter[2]; - - rfilter[0].can_id = 0x123; - rfilter[0].can_mask = CAN_SFF_MASK; - rfilter[1].can_id = 0x200; - rfilter[1].can_mask = 0x700; - - setsockopt(s, SOL_CAN_RAW, CAN_RAW_FILTER, &rfilter, sizeof(rfilter)); - - To disable the reception of CAN frames on the selected CAN_RAW socket: - - setsockopt(s, SOL_CAN_RAW, CAN_RAW_FILTER, NULL, 0); - - To set the filters to zero filters is quite obsolete as to not read - data causes the raw socket to discard the received CAN frames. But - having this 'send only' use-case we may remove the receive list in the - Kernel to save a little (really a very little!) CPU usage. - - 4.1.1.1 CAN filter usage optimisation - - The CAN filters are processed in per-device filter lists at CAN frame - reception time. To reduce the number of checks that need to be performed - while walking through the filter lists the CAN core provides an optimized - filter handling when the filter subscription focusses on a single CAN ID. - - For the possible 2048 SFF CAN identifiers the identifier is used as an index - to access the corresponding subscription list without any further checks. - For the 2^29 possible EFF CAN identifiers a 10 bit XOR folding is used as - hash function to retrieve the EFF table index. - - To benefit from the optimized filters for single CAN identifiers the - CAN_SFF_MASK or CAN_EFF_MASK have to be set into can_filter.mask together - with set CAN_EFF_FLAG and CAN_RTR_FLAG bits. 
A set CAN_EFF_FLAG bit in the - can_filter.mask makes clear that it matters whether a SFF or EFF CAN ID is - subscribed. E.g. in the example from above - - rfilter[0].can_id = 0x123; - rfilter[0].can_mask = CAN_SFF_MASK; - - both SFF frames with CAN ID 0x123 and EFF frames with 0xXXXXX123 can pass. - - To filter for only 0x123 (SFF) and 0x12345678 (EFF) CAN identifiers the - filter has to be defined in this way to benefit from the optimized filters: - - struct can_filter rfilter[2]; - - rfilter[0].can_id = 0x123; - rfilter[0].can_mask = (CAN_EFF_FLAG | CAN_RTR_FLAG | CAN_SFF_MASK); - rfilter[1].can_id = 0x12345678 | CAN_EFF_FLAG; - rfilter[1].can_mask = (CAN_EFF_FLAG | CAN_RTR_FLAG | CAN_EFF_MASK); - - setsockopt(s, SOL_CAN_RAW, CAN_RAW_FILTER, &rfilter, sizeof(rfilter)); - - 4.1.2 RAW socket option CAN_RAW_ERR_FILTER - - As described in chapter 3.3 the CAN interface driver can generate so - called Error Message Frames that can optionally be passed to the user - application in the same way as other CAN frames. The possible - errors are divided into different error classes that may be filtered - using the appropriate error mask. To register for every possible - error condition CAN_ERR_MASK can be used as value for the error mask. - The values for the error mask are defined in linux/can/error.h . - - can_err_mask_t err_mask = ( CAN_ERR_TX_TIMEOUT | CAN_ERR_BUSOFF ); - - setsockopt(s, SOL_CAN_RAW, CAN_RAW_ERR_FILTER, - &err_mask, sizeof(err_mask)); - - 4.1.3 RAW socket option CAN_RAW_LOOPBACK - - To meet multi user needs the local loopback is enabled by default - (see chapter 3.2 for details). But in some embedded use-cases - (e.g. 
when only one application uses the CAN bus) this loopback - functionality can be disabled (separately for each socket): - - int loopback = 0; /* 0 = disabled, 1 = enabled (default) */ - - setsockopt(s, SOL_CAN_RAW, CAN_RAW_LOOPBACK, &loopback, sizeof(loopback)); - - 4.1.4 RAW socket option CAN_RAW_RECV_OWN_MSGS - - When the local loopback is enabled, all the sent CAN frames are - looped back to the open CAN sockets that registered for the CAN - frames' CAN-ID on this given interface to meet the multi user - needs. The reception of the CAN frames on the same socket that was - sending the CAN frame is assumed to be unwanted and therefore - disabled by default. This default behaviour may be changed on - demand: - - int recv_own_msgs = 1; /* 0 = disabled (default), 1 = enabled */ - - setsockopt(s, SOL_CAN_RAW, CAN_RAW_RECV_OWN_MSGS, - &recv_own_msgs, sizeof(recv_own_msgs)); - - 4.1.5 RAW socket option CAN_RAW_FD_FRAMES - - CAN FD support in CAN_RAW sockets can be enabled with a new socket option - CAN_RAW_FD_FRAMES which is off by default. When the new socket option is - not supported by the CAN_RAW socket (e.g. on older kernels), switching the - CAN_RAW_FD_FRAMES option returns the error -ENOPROTOOPT. - - Once CAN_RAW_FD_FRAMES is enabled the application can send both CAN frames - and CAN FD frames. OTOH the application has to handle CAN and CAN FD frames - when reading from the socket. 
- - CAN_RAW_FD_FRAMES enabled: CAN_MTU and CANFD_MTU are allowed - CAN_RAW_FD_FRAMES disabled: only CAN_MTU is allowed (default) - - Example: - [ remember: CANFD_MTU == sizeof(struct canfd_frame) ] - - struct canfd_frame cfd; - - nbytes = read(s, &cfd, CANFD_MTU); - - if (nbytes == CANFD_MTU) { - printf("got CAN FD frame with length %d\n", cfd.len); - /* cfd.flags contains valid data */ - } else if (nbytes == CAN_MTU) { - printf("got legacy CAN frame with length %d\n", cfd.len); - /* cfd.flags is undefined */ - } else { - fprintf(stderr, "read: invalid CAN(FD) frame\n"); - return 1; - } - - /* the content can be handled independently from the received MTU size */ - - printf("can_id: %X data length: %d data: ", cfd.can_id, cfd.len); - for (i = 0; i < cfd.len; i++) - printf("%02X ", cfd.data[i]); - - When reading with size CANFD_MTU only returns CAN_MTU bytes that have - been received from the socket a legacy CAN frame has been read into the - provided CAN FD structure. Note that the canfd_frame.flags data field is - not specified in the struct can_frame and therefore it is only valid in - CANFD_MTU sized CAN FD frames. - - Implementation hint for new CAN applications: - - To build a CAN FD aware application use struct canfd_frame as basic CAN - data structure for CAN_RAW based applications. When the application is - executed on an older Linux kernel and switching the CAN_RAW_FD_FRAMES - socket option returns an error: No problem. You'll get legacy CAN frames - or CAN FD frames and can process them the same way. - - When sending to CAN devices make sure that the device is capable to handle - CAN FD frames by checking if the device maximum transfer unit is CANFD_MTU. - The CAN device MTU can be retrieved e.g. with a SIOCGIFMTU ioctl() syscall. - - 4.1.6 RAW socket option CAN_RAW_JOIN_FILTERS - - The CAN_RAW socket can set multiple CAN identifier specific filters that - lead to multiple filters in the af_can.c filter processing. 
These filters - are independent of each other which leads to logically OR'ed filters when - applied (see 4.1.1). - - This socket option joins the given CAN filters in the way that only CAN - frames are passed to user space that matched *all* given CAN filters. The - semantic for the applied filters is therefore changed to a logical AND. - - This is useful especially when the filterset is a combination of filters - where the CAN_INV_FILTER flag is set in order to notch single CAN IDs or - CAN ID ranges from the incoming traffic. - - 4.1.7 RAW socket returned message flags - - When using the recvmsg() call, msg->msg_flags may contain the following flags: - - MSG_DONTROUTE: set when the received frame was created on the local host. - - MSG_CONFIRM: set when the frame was sent via the socket it is received on. - This flag can be interpreted as a 'transmission confirmation' when the - CAN driver supports the echo of frames on driver level, see 3.2 and 6.2. - In order to receive such messages, CAN_RAW_RECV_OWN_MSGS must be set. - - 4.2 Broadcast Manager protocol sockets (SOCK_DGRAM) - - The Broadcast Manager protocol provides a command based configuration - interface to filter and send (e.g. cyclic) CAN messages in kernel space. - - Receive filters can be used to down sample frequent messages; detect events - such as message contents changes, packet length changes, and do time-out - monitoring of received messages. - - Periodic transmission tasks of CAN frames or a sequence of CAN frames can be - created and modified at runtime; both the message content and the two - possible transmit intervals can be altered. - - A BCM socket is not intended for sending individual CAN frames using the - struct can_frame as known from the CAN_RAW socket. Instead a special BCM - configuration message is defined. The basic BCM configuration message used - to communicate with the broadcast manager and the available operations are - defined in the linux/can/bcm.h include.
The BCM message consists of a - message header with a command ('opcode') followed by zero or more CAN frames. - The broadcast manager sends responses to user space in the same form: - - struct bcm_msg_head { - __u32 opcode; /* command */ - __u32 flags; /* special flags */ - __u32 count; /* run 'count' times with ival1 */ - struct timeval ival1, ival2; /* count and subsequent interval */ - canid_t can_id; /* unique can_id for task */ - __u32 nframes; /* number of can_frames following */ - struct can_frame frames[0]; - }; - - The aligned payload 'frames' uses the same basic CAN frame structure defined - at the beginning of section 4 and in the include/linux/can.h include. All - messages to the broadcast manager from user space have this structure. - - Note a CAN_BCM socket must be connected instead of bound after socket - creation (example without error checking): - - int s; - struct sockaddr_can addr; - struct ifreq ifr; - - s = socket(PF_CAN, SOCK_DGRAM, CAN_BCM); - - strcpy(ifr.ifr_name, "can0"); - ioctl(s, SIOCGIFINDEX, &ifr); - - addr.can_family = AF_CAN; - addr.can_ifindex = ifr.ifr_ifindex; - - connect(s, (struct sockaddr *)&addr, sizeof(addr)); - - (..) - - The broadcast manager socket is able to handle any number of in flight - transmissions or receive filters concurrently. The different RX/TX jobs are - distinguished by the unique can_id in each BCM message. However additional - CAN_BCM sockets are recommended to communicate on multiple CAN interfaces. - When the broadcast manager socket is bound to 'any' CAN interface (=> the - interface index is set to zero) the configured receive filters apply to any - CAN interface unless the sendto() syscall is used to overrule the 'any' CAN - interface index. When using recvfrom() instead of read() to retrieve BCM - socket messages the originating CAN interface is provided in can_ifindex. 
- - 4.2.1 Broadcast Manager operations - - The opcode defines the operation for the broadcast manager to carry out, - or details the broadcast managers response to several events, including - user requests. - - Transmit Operations (user space to broadcast manager): - - TX_SETUP: Create (cyclic) transmission task. - - TX_DELETE: Remove (cyclic) transmission task, requires only can_id. - - TX_READ: Read properties of (cyclic) transmission task for can_id. - - TX_SEND: Send one CAN frame. - - Transmit Responses (broadcast manager to user space): - - TX_STATUS: Reply to TX_READ request (transmission task configuration). - - TX_EXPIRED: Notification when counter finishes sending at initial interval - 'ival1'. Requires the TX_COUNTEVT flag to be set at TX_SETUP. - - Receive Operations (user space to broadcast manager): - - RX_SETUP: Create RX content filter subscription. - - RX_DELETE: Remove RX content filter subscription, requires only can_id. - - RX_READ: Read properties of RX content filter subscription for can_id. - - Receive Responses (broadcast manager to user space): - - RX_STATUS: Reply to RX_READ request (filter task configuration). - - RX_TIMEOUT: Cyclic message is detected to be absent (timer ival1 expired). - - RX_CHANGED: BCM message with updated CAN frame (detected content change). - Sent on first message received or on receipt of revised CAN messages. - - 4.2.2 Broadcast Manager message flags - - When sending a message to the broadcast manager the 'flags' element may - contain the following flag definitions which influence the behaviour: - - SETTIMER: Set the values of ival1, ival2 and count - - STARTTIMER: Start the timer with the actual values of ival1, ival2 - and count. Starting the timer leads simultaneously to emit a CAN frame. - - TX_COUNTEVT: Create the message TX_EXPIRED when count expires - - TX_ANNOUNCE: A change of data by the process is emitted immediately. 
- - TX_CP_CAN_ID: Copies the can_id from the message header to each - subsequent frame in frames. This is intended as usage simplification. For - TX tasks the unique can_id from the message header may differ from the - can_id(s) stored for transmission in the subsequent struct can_frame(s). - - RX_FILTER_ID: Filter by can_id alone, no frames required (nframes=0). - - RX_CHECK_DLC: A change of the DLC leads to an RX_CHANGED. - - RX_NO_AUTOTIMER: Prevent automatically starting the timeout monitor. - - RX_ANNOUNCE_RESUME: If passed at RX_SETUP and a receive timeout occurred, a - RX_CHANGED message will be generated when the (cyclic) receive restarts. - - TX_RESET_MULTI_IDX: Reset the index for the multiple frame transmission. - - RX_RTR_FRAME: Send reply for RTR-request (placed in op->frames[0]). - - 4.2.3 Broadcast Manager transmission timers - - Periodic transmission configurations may use up to two interval timers. - In this case the BCM sends a number of messages ('count') at an interval - 'ival1', then continues to send at another given interval 'ival2'. When - only one timer is needed 'count' is set to zero and only 'ival2' is used. - When the SETTIMER and STARTTIMER flags are set the timers are activated. - The timer values can be altered at runtime when only SETTIMER is set. - - 4.2.4 Broadcast Manager message sequence transmission - - Up to 256 CAN frames can be transmitted in a sequence in the case of a cyclic - TX task configuration. The number of CAN frames is provided in the 'nframes' - element of the BCM message head. The defined number of CAN frames is added - as an array to the TX_SETUP BCM configuration message. - - /* create a struct to set up a sequence of four CAN frames */ - struct { - struct bcm_msg_head msg_head; - struct can_frame frame[4]; - } mytxmsg; - - (..) - mytxmsg.msg_head.nframes = 4; - (..)
- - write(s, &mytxmsg, sizeof(mytxmsg)); - - With every transmission the index in the array of CAN frames is increased - and set to zero at index overflow. - - 4.2.5 Broadcast Manager receive filter timers - - The timer values ival1 or ival2 may be set to non-zero values at RX_SETUP. - When the SETTIMER flag is set the timers are enabled: - - ival1: Send RX_TIMEOUT when a received message is not received again within - the given time. When STARTTIMER is set at RX_SETUP the timeout detection - is activated directly - even without a former CAN frame reception. - - ival2: Throttle the received message rate down to the value of ival2. This - is useful to reduce messages for the application when the signal inside the - CAN frame is stateless as state changes within the ival2 period may get - lost. - - 4.2.6 Broadcast Manager multiplex message receive filter - - To filter for content changes in multiplex message sequences an array of more - than one CAN frame can be passed in an RX_SETUP configuration message. The - data bytes of the first CAN frame contain the mask of relevant bits that - have to match in the subsequent CAN frames with the received CAN frame. - If one of the subsequent CAN frames is matching the bits in that frame data - mark the relevant content to be compared with the previously received content. - Up to 257 CAN frames (multiplex filter bit mask CAN frame plus 256 CAN - filters) can be added as array to the RX_SETUP BCM configuration message. - - /* usually used to clear CAN frame data[] - beware of endian problems!
*/ - #define U64_DATA(p) (*(unsigned long long*)(p)->data) - - struct { - struct bcm_msg_head msg_head; - struct can_frame frame[5]; - } msg; - - msg.msg_head.opcode = RX_SETUP; - msg.msg_head.can_id = 0x42; - msg.msg_head.flags = 0; - msg.msg_head.nframes = 5; - U64_DATA(&msg.frame[0]) = 0xFF00000000000000ULL; /* MUX mask */ - U64_DATA(&msg.frame[1]) = 0x01000000000000FFULL; /* data mask (MUX 0x01) */ - U64_DATA(&msg.frame[2]) = 0x0200FFFF000000FFULL; /* data mask (MUX 0x02) */ - U64_DATA(&msg.frame[3]) = 0x330000FFFFFF0003ULL; /* data mask (MUX 0x33) */ - U64_DATA(&msg.frame[4]) = 0x4F07FC0FF0000000ULL; /* data mask (MUX 0x4F) */ - - write(s, &msg, sizeof(msg)); - - 4.2.7 Broadcast Manager CAN FD support - - The programming API of the CAN_BCM depends on struct can_frame which is - given as array directly behind the bcm_msg_head structure. To follow this - schema for the CAN FD frames a new flag 'CAN_FD_FRAME' in the bcm_msg_head - flags indicates that the concatenated CAN frame structures behind the - bcm_msg_head are defined as struct canfd_frame. - - struct { - struct bcm_msg_head msg_head; - struct canfd_frame frame[5]; - } msg; - - msg.msg_head.opcode = RX_SETUP; - msg.msg_head.can_id = 0x42; - msg.msg_head.flags = CAN_FD_FRAME; - msg.msg_head.nframes = 5; - (..) - - When using CAN FD frames for multiplex filtering the MUX mask is still - expected in the first 64 bit of the struct canfd_frame data section. - - 4.3 connected transport protocols (SOCK_SEQPACKET) - 4.4 unconnected transport protocols (SOCK_DGRAM) - - -5. SocketCAN core module -------------------------- - - The SocketCAN core module implements the protocol family - PF_CAN. CAN protocol modules are loaded by the core module at - runtime. The core module provides an interface for CAN protocol - modules to subscribe needed CAN IDs (see chapter 3.1). - - 5.1 can.ko module params - - - stats_timer: To calculate the SocketCAN core statistics - (e.g. 
current/maximum frames per second) this 1 second timer is - invoked at can.ko module start time by default. This timer can be - disabled by using stats_timer=0 on the module command line. - - - debug: (removed since SocketCAN SVN r546) - - 5.2 procfs content - - As described in chapter 3.1 the SocketCAN core uses several filter - lists to deliver received CAN frames to CAN protocol modules. These - receive lists, their filters and the count of filter matches can be - checked in the appropriate receive list. All entries contain the - device and a protocol module identifier: - - foo@bar:~$ cat /proc/net/can/rcvlist_all - - receive list 'rx_all': - (vcan3: no entry) - (vcan2: no entry) - (vcan1: no entry) - device can_id can_mask function userdata matches ident - vcan0 000 00000000 f88e6370 f6c6f400 0 raw - (any: no entry) - - In this example an application requests any CAN traffic from vcan0. - - rcvlist_all - list for unfiltered entries (no filter operations) - rcvlist_eff - list for single extended frame (EFF) entries - rcvlist_err - list for error message frame masks - rcvlist_fil - list for mask/value filters - rcvlist_inv - list for mask/value filters (inverse semantic) - rcvlist_sff - list for single standard frame (SFF) entries - - Additional procfs files in /proc/net/can - - stats - SocketCAN core statistics (rx/tx frames, match ratios, ...) - reset_stats - manual statistic reset - version - prints the SocketCAN core version and the ABI version - - 5.3 writing own CAN protocol modules - - To implement a new protocol in the protocol family PF_CAN a new - protocol has to be defined in include/linux/can.h . - The prototypes and definitions to use the SocketCAN core can be - accessed by including include/linux/can/core.h .
- In addition to functions that register the CAN protocol and the - CAN device notifier chain there are functions to subscribe CAN - frames received by CAN interfaces and to send CAN frames: - - can_rx_register - subscribe CAN frames from a specific interface - can_rx_unregister - unsubscribe CAN frames from a specific interface - can_send - transmit a CAN frame (optional with local loopback) - - For details see the kerneldoc documentation in net/can/af_can.c or - the source code of net/can/raw.c or net/can/bcm.c . - -6. CAN network drivers ----------------------- - - Writing a CAN network device driver is much easier than writing a - CAN character device driver. Similar to other known network device - drivers you mainly have to deal with: - - - TX: Put the CAN frame from the socket buffer to the CAN controller. - - RX: Put the CAN frame from the CAN controller to the socket buffer. - - See e.g. at Documentation/networking/netdevices.txt . The differences - for writing CAN network device driver are described below: - - 6.1 general settings - - dev->type = ARPHRD_CAN; /* the netdevice hardware type */ - dev->flags = IFF_NOARP; /* CAN has no arp */ - - dev->mtu = CAN_MTU; /* sizeof(struct can_frame) -> legacy CAN interface */ - - or alternative, when the controller supports CAN with flexible data rate: - dev->mtu = CANFD_MTU; /* sizeof(struct canfd_frame) -> CAN FD interface */ - - The struct can_frame or struct canfd_frame is the payload of each socket - buffer (skbuff) in the protocol family PF_CAN. - - 6.2 local loopback of sent frames - - As described in chapter 3.2 the CAN network device driver should - support a local loopback functionality similar to the local echo - e.g. of tty devices. 
In this case the driver flag IFF_ECHO has to be - set to prevent the PF_CAN core from locally echoing sent frames - (aka loopback) as fallback solution: - - dev->flags = (IFF_NOARP | IFF_ECHO); - - 6.3 CAN controller hardware filters - - To reduce the interrupt load on deep embedded systems some CAN - controllers support the filtering of CAN IDs or ranges of CAN IDs. - These hardware filter capabilities vary from controller to - controller and have to be identified as not feasible in a multi-user - networking approach. The use of the very controller specific - hardware filters could make sense in a very dedicated use-case, as a - filter on driver level would affect all users in the multi-user - system. The high efficient filter sets inside the PF_CAN core allow - to set different multiple filters for each socket separately. - Therefore the use of hardware filters goes to the category 'handmade - tuning on deep embedded systems'. The author is running a MPC603e - @133MHz with four SJA1000 CAN controllers from 2002 under heavy bus - load without any problems ... - - 6.4 The virtual CAN driver (vcan) - - Similar to the network loopback devices, vcan offers a virtual local - CAN interface. A full qualified address on CAN consists of - - - a unique CAN Identifier (CAN ID) - - the CAN bus this CAN ID is transmitted on (e.g. can0) - - so in common use cases more than one virtual CAN interface is needed. - - The virtual CAN interfaces allow the transmission and reception of CAN - frames without real CAN controller hardware. Virtual CAN network - devices are usually named 'vcanX', like vcan0 vcan1 vcan2 ... - When compiled as a module the virtual CAN driver module is called vcan.ko - - Since Linux Kernel version 2.6.24 the vcan driver supports the Kernel - netlink interface to create vcan network devices. 
The creation and - removal of vcan network devices can be managed with the ip(8) tool: - - - Create a virtual CAN network interface: - $ ip link add type vcan - - - Create a virtual CAN network interface with a specific name 'vcan42': - $ ip link add dev vcan42 type vcan - - - Remove a (virtual CAN) network interface 'vcan42': - $ ip link del vcan42 - - 6.5 The CAN network device driver interface - - The CAN network device driver interface provides a generic interface - to setup, configure and monitor CAN network devices. The user can then - configure the CAN device, like setting the bit-timing parameters, via - the netlink interface using the program "ip" from the "IPROUTE2" - utility suite. The following chapter describes briefly how to use it. - Furthermore, the interface uses a common data structure and exports a - set of common functions, which all real CAN network device drivers - should use. Please have a look to the SJA1000 or MSCAN driver to - understand how to use them. The name of the module is can-dev.ko. - - 6.5.1 Netlink interface to set/get devices properties - - The CAN device must be configured via netlink interface. The supported - netlink message types are defined and briefly described in - "include/linux/can/netlink.h". 
CAN link support for the program "ip" - of the IPROUTE2 utility suite is available and it can be used as shown - below: - - - Setting CAN device properties: - - $ ip link set can0 type can help - Usage: ip link set DEVICE type can - [ bitrate BITRATE [ sample-point SAMPLE-POINT] ] | - [ tq TQ prop-seg PROP_SEG phase-seg1 PHASE-SEG1 - phase-seg2 PHASE-SEG2 [ sjw SJW ] ] - - [ dbitrate BITRATE [ dsample-point SAMPLE-POINT] ] | - [ dtq TQ dprop-seg PROP_SEG dphase-seg1 PHASE-SEG1 - dphase-seg2 PHASE-SEG2 [ dsjw SJW ] ] - - [ loopback { on | off } ] - [ listen-only { on | off } ] - [ triple-sampling { on | off } ] - [ one-shot { on | off } ] - [ berr-reporting { on | off } ] - [ fd { on | off } ] - [ fd-non-iso { on | off } ] - [ presume-ack { on | off } ] - - [ restart-ms TIME-MS ] - [ restart ] - - Where: BITRATE := { 1..1000000 } - SAMPLE-POINT := { 0.000..0.999 } - TQ := { NUMBER } - PROP-SEG := { 1..8 } - PHASE-SEG1 := { 1..8 } - PHASE-SEG2 := { 1..8 } - SJW := { 1..4 } - RESTART-MS := { 0 | NUMBER } - - - Display CAN device details and statistics: - - $ ip -details -statistics link show can0 - 2: can0: <NOARP,UP,LOWER_UP,ECHO> mtu 16 qdisc pfifo_fast state UP qlen 10 - link/can - can <TRIPLE-SAMPLING> state ERROR-ACTIVE restart-ms 100 - bitrate 125000 sample_point 0.875 - tq 125 prop-seg 6 phase-seg1 7 phase-seg2 2 sjw 1 - sja1000: tseg1 1..16 tseg2 1..8 sjw 1..4 brp 1..64 brp-inc 1 - clock 8000000 - re-started bus-errors arbit-lost error-warn error-pass bus-off - 41 17457 0 41 42 41 - RX: bytes packets errors dropped overrun mcast - 140859 17608 17457 0 0 0 - TX: bytes packets errors dropped carrier collsns - 861 112 0 41 0 0 - - More info to the above output: - - "<TRIPLE-SAMPLING>" - Shows the list of selected CAN controller modes: LOOPBACK, - LISTEN-ONLY, or TRIPLE-SAMPLING. 
- - "state ERROR-ACTIVE" - The current state of the CAN controller: "ERROR-ACTIVE", - "ERROR-WARNING", "ERROR-PASSIVE", "BUS-OFF" or "STOPPED" - - "restart-ms 100" - Automatic restart delay time. If set to a non-zero value, a - restart of the CAN controller will be triggered automatically - in case of a bus-off condition after the specified delay time - in milliseconds. By default it's off. - - "bitrate 125000 sample-point 0.875" - Shows the real bit-rate in bits/sec and the sample-point in the - range 0.000..0.999. If the calculation of bit-timing parameters - is enabled in the kernel (CONFIG_CAN_CALC_BITTIMING=y), the - bit-timing can be defined by setting the "bitrate" argument. - Optionally the "sample-point" can be specified. By default it's - 0.000 assuming CIA-recommended sample-points. - - "tq 125 prop-seg 6 phase-seg1 7 phase-seg2 2 sjw 1" - Shows the time quanta in ns, propagation segment, phase buffer - segment 1 and 2 and the synchronisation jump width in units of - tq. They allow to define the CAN bit-timing in a hardware - independent format as proposed by the Bosch CAN 2.0 spec (see - chapter 8 of http://www.semiconductors.bosch.de/pdf/can2spec.pdf). - - "sja1000: tseg1 1..16 tseg2 1..8 sjw 1..4 brp 1..64 brp-inc 1 - clock 8000000" - Shows the bit-timing constants of the CAN controller, here the - "sja1000". The minimum and maximum values of the time segment 1 - and 2, the synchronisation jump width in units of tq, the - bitrate pre-scaler and the CAN system clock frequency in Hz. - These constants could be used for user-defined (non-standard) - bit-timing calculation algorithms in user-space. - - "re-started bus-errors arbit-lost error-warn error-pass bus-off" - Shows the number of restarts, bus and arbitration lost errors, - and the state changes to the error-warning, error-passive and - bus-off state. RX overrun errors are listed in the "overrun" - field of the standard network statistics. 
- - 6.5.2 Setting the CAN bit-timing - - The CAN bit-timing parameters can always be defined in a hardware - independent format as proposed in the Bosch CAN 2.0 specification - specifying the arguments "tq", "prop_seg", "phase_seg1", "phase_seg2" - and "sjw": - - $ ip link set canX type can tq 125 prop-seg 6 \ - phase-seg1 7 phase-seg2 2 sjw 1 - - If the kernel option CONFIG_CAN_CALC_BITTIMING is enabled, CIA - recommended CAN bit-timing parameters will be calculated if the bit- - rate is specified with the argument "bitrate": - - $ ip link set canX type can bitrate 125000 - - Note that this works fine for the most common CAN controllers with - standard bit-rates but may *fail* for exotic bit-rates or CAN system - clock frequencies. Disabling CONFIG_CAN_CALC_BITTIMING saves some - space and allows user-space tools to solely determine and set the - bit-timing parameters. The CAN controller specific bit-timing - constants can be used for that purpose. They are listed by the - following command: - - $ ip -details link show can0 - ... - sja1000: clock 8000000 tseg1 1..16 tseg2 1..8 sjw 1..4 brp 1..64 brp-inc 1 - - 6.5.3 Starting and stopping the CAN network device - - A CAN network device is started or stopped as usual with the command - "ifconfig canX up/down" or "ip link set canX up/down". Be aware that - you *must* define proper bit-timing parameters for real CAN devices - before you can start it to avoid error-prone default settings: - - $ ip link set canX up type can bitrate 125000 - - A device may enter the "bus-off" state if too many errors occurred on - the CAN bus. Then no more messages are received or sent. 
An automatic - bus-off recovery can be enabled by setting the "restart-ms" to a - non-zero value, e.g.: - - $ ip link set canX type can restart-ms 100 - - Alternatively, the application may realize the "bus-off" condition - by monitoring CAN error message frames and do a restart when - appropriate with the command: - - $ ip link set canX type can restart - - Note that a restart will also create a CAN error message frame (see - also chapter 3.3). - - 6.6 CAN FD (flexible data rate) driver support - - CAN FD capable CAN controllers support two different bitrates for the - arbitration phase and the payload phase of the CAN FD frame. Therefore a - second bit timing has to be specified in order to enable the CAN FD bitrate. - - Additionally CAN FD capable CAN controllers support up to 64 bytes of - payload. The representation of this length in can_frame.can_dlc and - canfd_frame.len for userspace applications and inside the Linux network - layer is a plain value from 0 .. 64 instead of the CAN 'data length code'. - The data length code was a 1:1 mapping to the payload length in the legacy - CAN frames anyway. The payload length to the bus-relevant DLC mapping is - only performed inside the CAN drivers, preferably with the helper - functions can_dlc2len() and can_len2dlc(). - - The CAN netdevice driver capabilities can be distinguished by the network - devices maximum transfer unit (MTU): - - MTU = 16 (CAN_MTU) => sizeof(struct can_frame) => 'legacy' CAN device - MTU = 72 (CANFD_MTU) => sizeof(struct canfd_frame) => CAN FD capable device - - The CAN device MTU can be retrieved e.g. with a SIOCGIFMTU ioctl() syscall. - N.B. CAN FD capable devices can also handle and send legacy CAN frames. - - When configuring CAN FD capable CAN controllers an additional 'data' bitrate - has to be set. This bitrate for the data phase of the CAN FD frame has to be - at least the bitrate which was configured for the arbitration phase. 
This - second bitrate is specified analogue to the first bitrate but the bitrate - setting keywords for the 'data' bitrate start with 'd' e.g. dbitrate, - dsample-point, dsjw or dtq and similar settings. When a data bitrate is set - within the configuration process the controller option "fd on" can be - specified to enable the CAN FD mode in the CAN controller. This controller - option also switches the device MTU to 72 (CANFD_MTU). - - The first CAN FD specification presented as whitepaper at the International - CAN Conference 2012 needed to be improved for data integrity reasons. - Therefore two CAN FD implementations have to be distinguished today: - - - ISO compliant: The ISO 11898-1:2015 CAN FD implementation (default) - - non-ISO compliant: The CAN FD implementation following the 2012 whitepaper - - Finally there are three types of CAN FD controllers: - - 1. ISO compliant (fixed) - 2. non-ISO compliant (fixed, like the M_CAN IP core v3.0.1 in m_can.c) - 3. ISO/non-ISO CAN FD controllers (switchable, like the PEAK PCAN-USB FD) - - The current ISO/non-ISO mode is announced by the CAN controller driver via - netlink and displayed by the 'ip' tool (controller option FD-NON-ISO). - The ISO/non-ISO-mode can be altered by setting 'fd-non-iso {on|off}' for - switchable CAN FD controllers only. 
- - Example configuring 500 kbit/s arbitration bitrate and 4 Mbit/s data bitrate: - - $ ip link set can0 up type can bitrate 500000 sample-point 0.75 \ - dbitrate 4000000 dsample-point 0.8 fd on - $ ip -details link show can0 - 5: can0: <NOARP,UP,LOWER_UP,ECHO> mtu 72 qdisc pfifo_fast state UNKNOWN \ - mode DEFAULT group default qlen 10 - link/can promiscuity 0 - can <FD> state ERROR-ACTIVE (berr-counter tx 0 rx 0) restart-ms 0 - bitrate 500000 sample-point 0.750 - tq 50 prop-seg 14 phase-seg1 15 phase-seg2 10 sjw 1 - pcan_usb_pro_fd: tseg1 1..64 tseg2 1..16 sjw 1..16 brp 1..1024 \ - brp-inc 1 - dbitrate 4000000 dsample-point 0.800 - dtq 12 dprop-seg 7 dphase-seg1 8 dphase-seg2 4 dsjw 1 - pcan_usb_pro_fd: dtseg1 1..16 dtseg2 1..8 dsjw 1..4 dbrp 1..1024 \ - dbrp-inc 1 - clock 80000000 - - Example when 'fd-non-iso on' is added on this switchable CAN FD adapter: - can <FD,FD-NON-ISO> state ERROR-ACTIVE (berr-counter tx 0 rx 0) restart-ms 0 - - 6.7 Supported CAN hardware - - Please check the "Kconfig" file in "drivers/net/can" to get an actual - list of the support CAN hardware. On the SocketCAN project website - (see chapter 7) there might be further drivers available, also for - older kernel versions. - -7. SocketCAN resources ------------------------ - - The Linux CAN / SocketCAN project resources (project site / mailing list) - are referenced in the MAINTAINERS file in the Linux source tree. - Search for CAN NETWORK [LAYERS|DRIVERS]. - -8. 
Credits ----------- - - Oliver Hartkopp (PF_CAN core, filters, drivers, bcm, SJA1000 driver) - Urs Thuermann (PF_CAN core, kernel integration, socket interfaces, raw, vcan) - Jan Kizka (RT-SocketCAN core, Socket-API reconciliation) - Wolfgang Grandegger (RT-SocketCAN core & drivers, Raw Socket-API reviews, - CAN device driver interface, MSCAN driver) - Robert Schwebel (design reviews, PTXdist integration) - Marc Kleine-Budde (design reviews, Kernel 2.6 cleanups, drivers) - Benedikt Spranger (reviews) - Thomas Gleixner (LKML reviews, coding style, posting hints) - Andrey Volkov (kernel subtree structure, ioctls, MSCAN driver) - Matthias Brukner (first SJA1000 CAN netdevice implementation Q2/2003) - Klaus Hitschler (PEAK driver integration) - Uwe Koppe (CAN netdevices with PF_PACKET approach) - Michael Schulze (driver layer loopback requirement, RT CAN drivers review) - Pavel Pisa (Bit-timing calculation) - Sascha Hauer (SJA1000 platform driver) - Sebastian Haas (SJA1000 EMS PCI driver) - Markus Plessing (SJA1000 EMS PCI driver) - Per Dalen (SJA1000 Kvaser PCI driver) - Sam Ravnborg (reviews, coding style, kbuild help) diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index 87814859cfc2..a4508ec1816b 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -1134,7 +1134,7 @@ The verifier's knowledge about the variable offset consists of: mask and value; no bit should ever be 1 in both. For example, if a byte is read into a register from memory, the register's top 56 bits are known zero, while the low 8 are unknown - which is represented as the tnum (0x0; 0xff). If we -then OR this with 0x40, we get (0x40; 0xcf), then if we add 1 we get (0x0; +then OR this with 0x40, we get (0x40; 0xbf), then if we add 1 we get (0x0; 0x1ff), because of potential carries. Besides arithmetic, the register state can also be updated by conditional branches. 
For instance, if a SCALAR_VALUE is compared > 8, in the 'true' branch diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 7d4b15977d61..90966c2692d8 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -7,6 +7,7 @@ Contents: :maxdepth: 2 batman-adv + can kapi z8530book msg_zerocopy diff --git a/Documentation/networking/xfrm_device.txt b/Documentation/networking/xfrm_device.txt index 2d9d588cd34b..50c34ca65efe 100644 --- a/Documentation/networking/xfrm_device.txt +++ b/Documentation/networking/xfrm_device.txt @@ -41,6 +41,7 @@ struct xfrmdev_ops { void (*xdo_dev_state_free) (struct xfrm_state *x); bool (*xdo_dev_offload_ok) (struct sk_buff *skb, struct xfrm_state *x); + void (*xdo_dev_state_advance_esn) (struct xfrm_state *x); }; The NIC driver offering ipsec offload will need to implement these @@ -117,6 +118,8 @@ the stack in xfrm_input(). hand the packet to napi_gro_receive() as usual +In ESN mode, xdo_dev_state_advance_esn() is called from xfrm_replay_advance_esn(). +Driver will check packet seq number and update HW ESN state machine if needed. When the SA is removed by the user, the driver's xdo_dev_state_delete() is asked to disable the offload. 
Later, xdo_dev_state_free() is called diff --git a/MAINTAINERS b/MAINTAINERS index 51e3a0d503dc..884ee9601707 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3206,7 +3206,7 @@ W: https://github.com/linux-can T: git git://git.kernel.org/pub/scm/linux/kernel/git/mkl/linux-can.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/mkl/linux-can-next.git S: Maintained -F: Documentation/networking/can.txt +F: Documentation/networking/can.rst F: net/can/ F: include/linux/can/core.h F: include/uapi/linux/can.h diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 41e2feb0cf4f..b5030e1a41d8 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -363,15 +363,7 @@ static inline int epilogue_offset(const struct jit_ctx *ctx) static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op) { const u8 *tmp = bpf2a32[TMP_REG_1]; - s32 jmp_offset; - /* checks if divisor is zero or not. If it is, then - * exit directly. - */ - emit(ARM_CMP_I(rn, 0), ctx); - _emit(ARM_COND_EQ, ARM_MOV_I(ARM_R0, 0), ctx); - jmp_offset = epilogue_offset(ctx); - _emit(ARM_COND_EQ, ARM_B(jmp_offset), ctx); #if __LINUX_ARM_ARCH__ == 7 if (elf_hwcap & HWCAP_IDIVA) { if (op == BPF_DIV) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 0775d5ab8ee9..1d4f1da7c58f 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -390,18 +390,6 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) case BPF_ALU64 | BPF_DIV | BPF_X: case BPF_ALU | BPF_MOD | BPF_X: case BPF_ALU64 | BPF_MOD | BPF_X: - { - const u8 r0 = bpf2a64[BPF_REG_0]; - - /* if (src == 0) return 0 */ - jmp_offset = 3; /* skip ahead to else path */ - check_imm19(jmp_offset); - emit(A64_CBNZ(is64, src, jmp_offset), ctx); - emit(A64_MOVZ(1, r0, 0, 0), ctx); - jmp_offset = epilogue_offset(ctx); - check_imm26(jmp_offset); - emit(A64_B(jmp_offset), ctx); - /* else */ switch (BPF_OP(code)) { case BPF_DIV: emit(A64_UDIV(is64, 
dst, dst, src), ctx); @@ -413,7 +401,6 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) break; } break; - } case BPF_ALU | BPF_LSH | BPF_X: case BPF_ALU64 | BPF_LSH | BPF_X: emit(A64_LSLV(is64, dst, dst, src), ctx); diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 4e347030ed2c..3e2798bfea4f 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -741,16 +741,11 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, break; case BPF_ALU | BPF_DIV | BPF_K: /* ALU_IMM */ case BPF_ALU | BPF_MOD | BPF_K: /* ALU_IMM */ + if (insn->imm == 0) + return -EINVAL; dst = ebpf_to_mips_reg(ctx, insn, dst_reg); if (dst < 0) return dst; - if (insn->imm == 0) { /* Div by zero */ - b_off = b_imm(exit_idx, ctx); - if (is_bad_offset(b_off)) - return -E2BIG; - emit_instr(ctx, beq, MIPS_R_ZERO, MIPS_R_ZERO, b_off); - emit_instr(ctx, addu, MIPS_R_V0, MIPS_R_ZERO, MIPS_R_ZERO); - } td = get_reg_val_type(ctx, this_idx, insn->dst_reg); if (td == REG_64BIT || td == REG_32BIT_ZERO_EX) /* sign extend */ @@ -770,19 +765,13 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, break; case BPF_ALU64 | BPF_DIV | BPF_K: /* ALU_IMM */ case BPF_ALU64 | BPF_MOD | BPF_K: /* ALU_IMM */ + if (insn->imm == 0) + return -EINVAL; dst = ebpf_to_mips_reg(ctx, insn, dst_reg); if (dst < 0) return dst; - if (insn->imm == 0) { /* Div by zero */ - b_off = b_imm(exit_idx, ctx); - if (is_bad_offset(b_off)) - return -E2BIG; - emit_instr(ctx, beq, MIPS_R_ZERO, MIPS_R_ZERO, b_off); - emit_instr(ctx, addu, MIPS_R_V0, MIPS_R_ZERO, MIPS_R_ZERO); - } if (get_reg_val_type(ctx, this_idx, insn->dst_reg) == REG_32BIT) emit_instr(ctx, dinsu, dst, MIPS_R_ZERO, 32, 32); - if (insn->imm == 1) { /* div by 1 is a nop, mod by 1 is zero */ if (bpf_op == BPF_MOD) @@ -860,11 +849,6 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, break; case BPF_DIV: case BPF_MOD: - b_off = b_imm(exit_idx, ctx); - 
if (is_bad_offset(b_off)) - return -E2BIG; - emit_instr(ctx, beq, src, MIPS_R_ZERO, b_off); - emit_instr(ctx, movz, MIPS_R_V0, MIPS_R_ZERO, src); emit_instr(ctx, ddivu, dst, src); if (bpf_op == BPF_DIV) emit_instr(ctx, mflo, dst); @@ -943,11 +927,6 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, break; case BPF_DIV: case BPF_MOD: - b_off = b_imm(exit_idx, ctx); - if (is_bad_offset(b_off)) - return -E2BIG; - emit_instr(ctx, beq, src, MIPS_R_ZERO, b_off); - emit_instr(ctx, movz, MIPS_R_V0, MIPS_R_ZERO, src); emit_instr(ctx, divu, dst, src); if (bpf_op == BPF_DIV) emit_instr(ctx, mflo, dst); diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 217a78e84865..0a34b0cec7b7 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -381,10 +381,6 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, goto bpf_alu32_trunc; case BPF_ALU | BPF_DIV | BPF_X: /* (u32) dst /= (u32) src */ case BPF_ALU | BPF_MOD | BPF_X: /* (u32) dst %= (u32) src */ - PPC_CMPWI(src_reg, 0); - PPC_BCC_SHORT(COND_NE, (ctx->idx * 4) + 12); - PPC_LI(b2p[BPF_REG_0], 0); - PPC_JMP(exit_addr); if (BPF_OP(code) == BPF_MOD) { PPC_DIVWU(b2p[TMP_REG_1], dst_reg, src_reg); PPC_MULW(b2p[TMP_REG_1], src_reg, @@ -395,10 +391,6 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, goto bpf_alu32_trunc; case BPF_ALU64 | BPF_DIV | BPF_X: /* dst /= src */ case BPF_ALU64 | BPF_MOD | BPF_X: /* dst %= src */ - PPC_CMPDI(src_reg, 0); - PPC_BCC_SHORT(COND_NE, (ctx->idx * 4) + 12); - PPC_LI(b2p[BPF_REG_0], 0); - PPC_JMP(exit_addr); if (BPF_OP(code) == BPF_MOD) { PPC_DIVD(b2p[TMP_REG_1], dst_reg, src_reg); PPC_MULD(b2p[TMP_REG_1], src_reg, diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index e50188773ff3..78a19c93b380 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -610,11 +610,6 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct 
bpf_prog *fp, int i { int rc_reg = BPF_OP(insn->code) == BPF_DIV ? REG_W1 : REG_W0; - jit->seen |= SEEN_RET0; - /* ltr %src,%src (if src == 0 goto fail) */ - EMIT2(0x1200, src_reg, src_reg); - /* jz <ret0> */ - EMIT4_PCREL(0xa7840000, jit->ret0_ip - jit->prg); /* lhi %w0,0 */ EMIT4_IMM(0xa7080000, REG_W0, 0); /* lr %w1,%dst */ @@ -630,11 +625,6 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i { int rc_reg = BPF_OP(insn->code) == BPF_DIV ? REG_W1 : REG_W0; - jit->seen |= SEEN_RET0; - /* ltgr %src,%src (if src == 0 goto fail) */ - EMIT4(0xb9020000, src_reg, src_reg); - /* jz <ret0> */ - EMIT4_PCREL(0xa7840000, jit->ret0_ip - jit->prg); /* lghi %w0,0 */ EMIT4_IMM(0xa7090000, REG_W0, 0); /* lgr %w1,%dst */ diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 50a24d7bd4c5..48a25869349b 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -967,31 +967,17 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) emit_alu(MULX, src, dst, ctx); break; case BPF_ALU | BPF_DIV | BPF_X: - emit_cmp(src, G0, ctx); - emit_branch(BE|ANNUL, ctx->idx, ctx->epilogue_offset, ctx); - emit_loadimm(0, bpf2sparc[BPF_REG_0], ctx); - emit_write_y(G0, ctx); emit_alu(DIV, src, dst, ctx); break; - case BPF_ALU64 | BPF_DIV | BPF_X: - emit_cmp(src, G0, ctx); - emit_branch(BE|ANNUL, ctx->idx, ctx->epilogue_offset, ctx); - emit_loadimm(0, bpf2sparc[BPF_REG_0], ctx); - emit_alu(UDIVX, src, dst, ctx); break; - case BPF_ALU | BPF_MOD | BPF_X: { const u8 tmp = bpf2sparc[TMP_REG_1]; ctx->tmp_1_used = true; - emit_cmp(src, G0, ctx); - emit_branch(BE|ANNUL, ctx->idx, ctx->epilogue_offset, ctx); - emit_loadimm(0, bpf2sparc[BPF_REG_0], ctx); - emit_write_y(G0, ctx); emit_alu3(DIV, dst, src, tmp, ctx); emit_alu3(MULX, tmp, src, tmp, ctx); @@ -1003,10 +989,6 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) ctx->tmp_1_used = true; - emit_cmp(src, G0, ctx); - 
emit_branch(BE|ANNUL, ctx->idx, ctx->epilogue_offset, ctx); - emit_loadimm(0, bpf2sparc[BPF_REG_0], ctx); - emit_alu3(UDIVX, dst, src, tmp, ctx); emit_alu3(MULX, tmp, src, tmp, ctx); emit_alu3(SUB, dst, tmp, dst, ctx); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 5acee5139e28..4923d92f918d 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -568,26 +568,6 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, */ EMIT2(0x31, 0xd2); - if (BPF_SRC(insn->code) == BPF_X) { - /* if (src_reg == 0) return 0 */ - - /* cmp r11, 0 */ - EMIT4(0x49, 0x83, 0xFB, 0x00); - - /* jne .+9 (skip over pop, pop, xor and jmp) */ - EMIT2(X86_JNE, 1 + 1 + 2 + 5); - EMIT1(0x5A); /* pop rdx */ - EMIT1(0x58); /* pop rax */ - EMIT2(0x31, 0xc0); /* xor eax, eax */ - - /* jmp cleanup_addr - * addrs[i] - 11, because there are 11 bytes - * after this insn: div, mov, pop, pop, mov - */ - jmp_offset = ctx->cleanup_addr - (addrs[i] - 11); - EMIT1_off32(0xE9, jmp_offset); - } - if (BPF_CLASS(insn->code) == BPF_ALU64) /* div r11 */ EMIT3(0x49, 0xF7, 0xF3); diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c index cc94604b23e0..b1779566c5bb 100644 --- a/drivers/net/can/dev.c +++ b/drivers/net/can/dev.c @@ -412,7 +412,7 @@ EXPORT_SYMBOL_GPL(can_change_state); * Local echo of CAN messages * * CAN network devices *should* support a local echo functionality - * (see Documentation/networking/can.txt). To test the handling of CAN + * (see Documentation/networking/can.rst). To test the handling of CAN * interfaces that do not support the local echo both driver types are * implemented. In the case that the driver does not support the echo * the IFF_ECHO remains clear in dev->flags. 
This causes the PF_CAN core diff --git a/drivers/net/can/vcan.c b/drivers/net/can/vcan.c index a8cb33264ff1..c2b04f505e16 100644 --- a/drivers/net/can/vcan.c +++ b/drivers/net/can/vcan.c @@ -61,7 +61,7 @@ MODULE_ALIAS_RTNL_LINK(DRV_NAME); /* * CAN test feature: * Enable the echo on driver level for testing the CAN core echo modes. - * See Documentation/networking/can.txt for details. + * See Documentation/networking/can.rst for details. */ static bool echo; /* echo testing. Default: 0 (Off) */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c index 8b95117c2923..557fd8bfd54e 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c @@ -1567,6 +1567,12 @@ int cudbg_collect_tid(struct cudbg_init *pdbg_init, tid1->ver_hdr.size = sizeof(struct cudbg_tid_info_region_rev1) - sizeof(struct cudbg_ver_hdr); + /* If firmware is not attached/alive, use backdoor register + * access to collect dump. 
+ */ + if (!is_fw_attached(pdbg_init)) + goto fill_tid; + #define FW_PARAM_PFVF_A(param) \ (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_PFVF) | \ FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_PFVF_##param) | \ @@ -1604,6 +1610,9 @@ int cudbg_collect_tid(struct cudbg_init *pdbg_init, tid->nhpftids = val[1] - val[0] + 1; } +#undef FW_PARAM_PFVF_A + +fill_tid: tid->ntids = padap->tids.ntids; tid->nstids = padap->tids.nstids; tid->stid_base = padap->tids.stid_base; @@ -1623,8 +1632,6 @@ int cudbg_collect_tid(struct cudbg_init *pdbg_init, tid->ip_users = t4_read_reg(padap, LE_DB_ACT_CNT_IPV4_A); tid->ipv6_users = t4_read_reg(padap, LE_DB_ACT_CNT_IPV6_A); -#undef FW_PARAM_PFVF_A - return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff); } @@ -1866,11 +1873,18 @@ int cudbg_collect_dump_context(struct cudbg_init *pdbg_init, max_ctx_size = region_info[i].end - region_info[i].start + 1; max_ctx_qid = max_ctx_size / SGE_CTXT_SIZE; - t4_sge_ctxt_flush(padap, padap->mbox, i); - rc = t4_memory_rw(padap, MEMWIN_NIC, mem_type[i], - region_info[i].start, max_ctx_size, - (__be32 *)ctx_buf, 1); - if (rc) { + /* If firmware is not attached/alive, use backdoor register + * access to collect dump. 
+ */ + if (is_fw_attached(pdbg_init)) { + t4_sge_ctxt_flush(padap, padap->mbox, i); + + rc = t4_memory_rw(padap, MEMWIN_NIC, mem_type[i], + region_info[i].start, max_ctx_size, + (__be32 *)ctx_buf, 1); + } + + if (rc || !is_fw_attached(pdbg_init)) { max_ctx_qid = CUDBG_LOWMEM_MAX_CTXT_QIDS; cudbg_get_sge_ctxt_fw(pdbg_init, max_ctx_qid, i, &buff); @@ -1946,9 +1960,10 @@ static void cudbg_mps_rpl_backdoor(struct adapter *padap, mps_rplc->rplc31_0 = htonl(t4_read_reg(padap, MPS_VF_RPLCT_MAP0_A)); } -static int cudbg_collect_tcam_index(struct adapter *padap, +static int cudbg_collect_tcam_index(struct cudbg_init *pdbg_init, struct cudbg_mps_tcam *tcam, u32 idx) { + struct adapter *padap = pdbg_init->adap; u64 tcamy, tcamx, val; u32 ctl, data2; int rc = 0; @@ -2033,12 +2048,22 @@ static int cudbg_collect_tcam_index(struct adapter *padap, htons(FW_LDST_CMD_FID_V(FW_LDST_MPS_RPLC) | FW_LDST_CMD_IDX_V(idx)); - rc = t4_wr_mbox(padap, padap->mbox, &ldst_cmd, sizeof(ldst_cmd), - &ldst_cmd); - if (rc) + /* If firmware is not attached/alive, use backdoor register + * access to collect dump. + */ + if (is_fw_attached(pdbg_init)) + rc = t4_wr_mbox(padap, padap->mbox, &ldst_cmd, + sizeof(ldst_cmd), &ldst_cmd); + + if (rc || !is_fw_attached(pdbg_init)) { cudbg_mps_rpl_backdoor(padap, &mps_rplc); - else + /* Ignore error since we collected directly from + * reading registers. 
+ */ + rc = 0; + } else { mps_rplc = ldst_cmd.u.mps.rplc; + } tcam->rplc[0] = ntohl(mps_rplc.rplc31_0); tcam->rplc[1] = ntohl(mps_rplc.rplc63_32); @@ -2075,7 +2100,7 @@ int cudbg_collect_mps_tcam(struct cudbg_init *pdbg_init, tcam = (struct cudbg_mps_tcam *)temp_buff.data; for (i = 0; i < n; i++) { - rc = cudbg_collect_tcam_index(padap, tcam, i); + rc = cudbg_collect_tcam_index(pdbg_init, tcam, i); if (rc) { cudbg_err->sys_err = rc; cudbg_put_buff(pdbg_init, &temp_buff); diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c index af27d2b0f79f..047609ef0515 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c @@ -195,9 +195,11 @@ static void t4_report_fw_error(struct adapter *adap) u32 pcie_fw; pcie_fw = t4_read_reg(adap, PCIE_FW_A); - if (pcie_fw & PCIE_FW_ERR_F) + if (pcie_fw & PCIE_FW_ERR_F) { dev_err(adap->pdev_dev, "Firmware reports adapter error: %s\n", reason[PCIE_FW_EVAL_G(pcie_fw)]); + adap->flags &= ~FW_OK; + } } /* @@ -5088,7 +5090,7 @@ int t4_read_rss(struct adapter *adapter, u16 *map) static unsigned int t4_use_ldst(struct adapter *adap) { - return (adap->flags & FW_OK) || !adap->use_bd; + return (adap->flags & FW_OK) && !adap->use_bd; } /** diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 741020534b16..b034c7f24eda 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -1109,6 +1109,8 @@ static const struct ethtool_ops hns3vf_ethtool_ops = { .set_rxfh = hns3_set_rss, .get_link_ksettings = hns3_get_link_ksettings, .get_channels = hns3_get_channels, + .get_coalesce = hns3_get_coalesce, + .set_coalesce = hns3_set_coalesce, }; static const struct ethtool_ops hns3_ethtool_ops = { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c index 
96f453ff84b5..f38fc5ce9f51 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c @@ -116,6 +116,9 @@ static int hclge_get_ring_chain_from_mbx( hnae_set_bit(ring_chain->flag, HNAE3_RING_TYPE_B, req->msg[3]); ring_chain->tqp_index = hclge_get_queue_id(vport->nic.kinfo.tqp[req->msg[4]]); + hnae_set_field(ring_chain->int_gl_idx, HCLGE_INT_GL_IDX_M, + HCLGE_INT_GL_IDX_S, + req->msg[5]); cur_chain = ring_chain; @@ -133,6 +136,11 @@ static int hclge_get_ring_chain_from_mbx( [req->msg[HCLGE_RING_NODE_VARIABLE_NUM * i + HCLGE_RING_MAP_MBX_BASIC_MSG_NUM + 1]]); + hnae_set_field(new_chain->int_gl_idx, HCLGE_INT_GL_IDX_M, + HCLGE_INT_GL_IDX_S, + req->msg[HCLGE_RING_NODE_VARIABLE_NUM * i + + HCLGE_RING_MAP_MBX_BASIC_MSG_NUM + 2]); + cur_chain->next = new_chain; cur_chain = new_chain; } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index 3d2bc9a971fa..0d89965f7928 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -565,6 +565,11 @@ static int hclgevf_bind_ring_to_vector(struct hnae3_handle *handle, bool en, hnae_get_bit(node->flag, HNAE3_RING_TYPE_B); req->msg[HCLGEVF_RING_NODE_VARIABLE_NUM * i + 1] = node->tqp_index; + req->msg[HCLGEVF_RING_NODE_VARIABLE_NUM * i + 2] = + hnae_get_field(node->int_gl_idx, + HNAE3_RING_GL_IDX_M, + HNAE3_RING_GL_IDX_S); + if (i == (HCLGE_MBX_VF_MSG_DATA_NUM - HCLGEVF_RING_MAP_MBX_BASIC_MSG_NUM) / HCLGEVF_RING_NODE_VARIABLE_NUM) { diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c index 7ac7ef9b37ff..61188f343955 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c @@ -4087,7 +4087,7 @@ void ixgbe_get_oem_prod_version(struct ixgbe_hw *hw, hw->eeprom.ops.read(hw, NVM_OEM_PROD_VER_PTR, 
&offset); /* Return is offset to OEM Product Version block is invalid */ - if (offset == 0x0 && offset == NVM_INVALID_PTR) + if (offset == 0x0 || offset == NVM_INVALID_PTR) return; /* Read product version block */ diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c index 317351025fd7..221f15803480 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c @@ -3085,26 +3085,9 @@ static int ixgbe_get_ts_info(struct net_device *dev, case ixgbe_mac_X550EM_x: case ixgbe_mac_x550em_a: info->rx_filters |= BIT(HWTSTAMP_FILTER_ALL); - /* fallthrough */ + break; case ixgbe_mac_X540: case ixgbe_mac_82599EB: - info->so_timestamping = - SOF_TIMESTAMPING_TX_SOFTWARE | - SOF_TIMESTAMPING_RX_SOFTWARE | - SOF_TIMESTAMPING_SOFTWARE | - SOF_TIMESTAMPING_TX_HARDWARE | - SOF_TIMESTAMPING_RX_HARDWARE | - SOF_TIMESTAMPING_RAW_HARDWARE; - - if (adapter->ptp_clock) - info->phc_index = ptp_clock_index(adapter->ptp_clock); - else - info->phc_index = -1; - - info->tx_types = - BIT(HWTSTAMP_TX_OFF) | - BIT(HWTSTAMP_TX_ON); - info->rx_filters |= BIT(HWTSTAMP_FILTER_PTP_V1_L4_SYNC) | BIT(HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ) | @@ -3113,6 +3096,24 @@ static int ixgbe_get_ts_info(struct net_device *dev, default: return ethtool_op_get_ts_info(dev, info); } + + info->so_timestamping = + SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_RX_SOFTWARE | + SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_TX_HARDWARE | + SOF_TIMESTAMPING_RX_HARDWARE | + SOF_TIMESTAMPING_RAW_HARDWARE; + + if (adapter->ptp_clock) + info->phc_index = ptp_clock_index(adapter->ptp_clock); + else + info->phc_index = -1; + + info->tx_types = + BIT(HWTSTAMP_TX_OFF) | + BIT(HWTSTAMP_TX_ON); + return 0; } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index bbb622f15a77..0da5aa2c8aba 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -4133,11 +4133,15 @@ void ixgbe_configure_rx_ring(struct ixgbe_adapter *adapter, rxdctl &= ~0x3FFFFF; rxdctl |= 0x080420; #if (PAGE_SIZE < 8192) - } else { + /* RXDCTL.RLPML does not work on 82599 */ + } else if (hw->mac.type != ixgbe_mac_82599EB) { rxdctl &= ~(IXGBE_RXDCTL_RLPMLMASK | IXGBE_RXDCTL_RLPML_EN); - /* Limit the maximum frame size so we don't overrun the skb */ + /* Limit the maximum frame size so we don't overrun the skb. + * This can happen in SRIOV mode when the MTU of the VF is + * higher than the MTU of the PF. + */ if (ring_uses_build_skb(ring) && !test_bit(__IXGBE_RX_3K_BUFFER, &ring->state)) rxdctl |= IXGBE_MAX_2K_FRAME_BUILD_SKB | @@ -7259,6 +7263,9 @@ static void ixgbe_watchdog_link_is_up(struct ixgbe_adapter *adapter) case IXGBE_LINK_SPEED_10GB_FULL: speed_str = "10 Gbps"; break; + case IXGBE_LINK_SPEED_5GB_FULL: + speed_str = "5 Gbps"; + break; case IXGBE_LINK_SPEED_2_5GB_FULL: speed_str = "2.5 Gbps"; break; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c index 3bce26e77090..f470d0204771 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c @@ -949,7 +949,7 @@ static s32 ixgbe_checksum_ptr_x550(struct ixgbe_hw *hw, u16 ptr, u16 length, bufsz, i, start; u16 *local_buffer; - bufsz = sizeof(buf) / sizeof(buf[0]); + bufsz = ARRAY_SIZE(buf); /* Read a chunk at the pointer location */ if (!buffer) { diff --git a/drivers/net/ethernet/intel/ixgbevf/ethtool.c b/drivers/net/ethernet/intel/ixgbevf/ethtool.c index ff9d05f308ee..4400e49090b4 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ethtool.c +++ b/drivers/net/ethernet/intel/ixgbevf/ethtool.c @@ -75,6 +75,9 @@ static struct ixgbe_stats ixgbevf_gstrings_stats[] = { IXGBEVF_STAT("tx_timeout_count", tx_timeout_count), IXGBEVF_NETDEV_STAT(multicast), IXGBEVF_STAT("rx_csum_offload_errors", hw_csum_rx_error), + IXGBEVF_STAT("alloc_rx_page", 
alloc_rx_page), + IXGBEVF_STAT("alloc_rx_page_failed", alloc_rx_page_failed), + IXGBEVF_STAT("alloc_rx_buff_failed", alloc_rx_buff_failed), }; #define IXGBEVF_QUEUE_STATS_LEN ( \ diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h index 581f44bbd7b3..f6952425c87d 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h @@ -62,7 +62,12 @@ struct ixgbevf_tx_buffer { struct ixgbevf_rx_buffer { dma_addr_t dma; struct page *page; - unsigned int page_offset; +#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536) + __u32 page_offset; +#else + __u16 page_offset; +#endif + __u16 pagecnt_bias; }; struct ixgbevf_stats { @@ -79,6 +84,7 @@ struct ixgbevf_tx_queue_stats { struct ixgbevf_rx_queue_stats { u64 alloc_rx_page_failed; u64 alloc_rx_buff_failed; + u64 alloc_rx_page; u64 csum_err; }; @@ -260,6 +266,9 @@ static inline void ixgbevf_write_tail(struct ixgbevf_ring *ring, u32 value) #define MIN_MSIX_Q_VECTORS 1 #define MIN_MSIX_COUNT (MIN_MSIX_Q_VECTORS + NON_Q_VECTORS) +#define IXGBEVF_RX_DMA_ATTR \ + (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) + /* board specific private data structure */ struct ixgbevf_adapter { /* this field must be first, see ixgbevf_process_skb_fields */ @@ -287,8 +296,9 @@ struct ixgbevf_adapter { u64 hw_csum_rx_error; u64 hw_rx_no_dma_resources; int num_msix_vectors; - u32 alloc_rx_page_failed; - u32 alloc_rx_buff_failed; + u64 alloc_rx_page_failed; + u64 alloc_rx_buff_failed; + u64 alloc_rx_page; struct msix_entry *msix_entries; diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index ed5c3aea7939..9b3d43d28106 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -206,28 +206,6 @@ static void ixgbevf_set_ivar(struct ixgbevf_adapter *adapter, s8 direction, } } -static void ixgbevf_unmap_and_free_tx_resource(struct 
ixgbevf_ring *tx_ring, - struct ixgbevf_tx_buffer *tx_buffer) -{ - if (tx_buffer->skb) { - dev_kfree_skb_any(tx_buffer->skb); - if (dma_unmap_len(tx_buffer, len)) - dma_unmap_single(tx_ring->dev, - dma_unmap_addr(tx_buffer, dma), - dma_unmap_len(tx_buffer, len), - DMA_TO_DEVICE); - } else if (dma_unmap_len(tx_buffer, len)) { - dma_unmap_page(tx_ring->dev, - dma_unmap_addr(tx_buffer, dma), - dma_unmap_len(tx_buffer, len), - DMA_TO_DEVICE); - } - tx_buffer->next_to_watch = NULL; - tx_buffer->skb = NULL; - dma_unmap_len_set(tx_buffer, len, 0); - /* tx_buffer must be completely set up in the transmit path */ -} - static u64 ixgbevf_get_tx_completed(struct ixgbevf_ring *ring) { return ring->stats.packets; @@ -349,7 +327,6 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector *q_vector, DMA_TO_DEVICE); /* clear tx_buffer data */ - tx_buffer->skb = NULL; dma_unmap_len_set(tx_buffer, len, 0); /* unmap remaining buffers */ @@ -595,8 +572,8 @@ static bool ixgbevf_alloc_mapped_page(struct ixgbevf_ring *rx_ring, } /* map page for use */ - dma = dma_map_page(rx_ring->dev, page, 0, - PAGE_SIZE, DMA_FROM_DEVICE); + dma = dma_map_page_attrs(rx_ring->dev, page, 0, PAGE_SIZE, + DMA_FROM_DEVICE, IXGBEVF_RX_DMA_ATTR); /* if mapping failed free memory back to system since * there isn't much point in holding memory we can't use @@ -604,13 +581,15 @@ static bool ixgbevf_alloc_mapped_page(struct ixgbevf_ring *rx_ring, if (dma_mapping_error(rx_ring->dev, dma)) { __free_page(page); - rx_ring->rx_stats.alloc_rx_buff_failed++; + rx_ring->rx_stats.alloc_rx_page_failed++; return false; } bi->dma = dma; bi->page = page; bi->page_offset = 0; + bi->pagecnt_bias = 1; + rx_ring->rx_stats.alloc_rx_page++; return true; } @@ -639,6 +618,12 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_ring *rx_ring, if (!ixgbevf_alloc_mapped_page(rx_ring, bi)) break; + /* sync the buffer for use by the device */ + dma_sync_single_range_for_device(rx_ring->dev, bi->dma, + bi->page_offset, + 
IXGBEVF_RX_BUFSZ, + DMA_FROM_DEVICE); + /* Refresh the desc even if pkt_addr didn't change * because each write-back erases this info. */ @@ -653,8 +638,8 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_ring *rx_ring, i -= rx_ring->count; } - /* clear the hdr_addr for the next_to_use descriptor */ - rx_desc->read.hdr_addr = 0; + /* clear the length for the next_to_use descriptor */ + rx_desc->wb.upper.length = 0; cleaned_count--; } while (cleaned_count); @@ -741,12 +726,7 @@ static void ixgbevf_reuse_rx_page(struct ixgbevf_ring *rx_ring, new_buff->page = old_buff->page; new_buff->dma = old_buff->dma; new_buff->page_offset = old_buff->page_offset; - - /* sync the buffer for use by the device */ - dma_sync_single_range_for_device(rx_ring->dev, new_buff->dma, - new_buff->page_offset, - IXGBEVF_RX_BUFSZ, - DMA_FROM_DEVICE); + new_buff->pagecnt_bias = old_buff->pagecnt_bias; } static inline bool ixgbevf_page_is_reserved(struct page *page) @@ -754,6 +734,45 @@ static inline bool ixgbevf_page_is_reserved(struct page *page) return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page); } +static bool ixgbevf_can_reuse_rx_page(struct ixgbevf_rx_buffer *rx_buffer, + struct page *page, + const unsigned int truesize) +{ + unsigned int pagecnt_bias = rx_buffer->pagecnt_bias--; + + /* avoid re-using remote pages */ + if (unlikely(ixgbevf_page_is_reserved(page))) + return false; + +#if (PAGE_SIZE < 8192) + /* if we are only owner of page we can reuse it */ + if (unlikely(page_ref_count(page) != pagecnt_bias)) + return false; + + /* flip page offset to other buffer */ + rx_buffer->page_offset ^= IXGBEVF_RX_BUFSZ; + +#else + /* move offset up to the next cache line */ + rx_buffer->page_offset += truesize; + + if (rx_buffer->page_offset > (PAGE_SIZE - IXGBEVF_RX_BUFSZ)) + return false; + +#endif + + /* If we have drained the page fragment pool we need to update + * the pagecnt_bias and page count so that we fully restock the + * number of references the driver 
holds. + */ + if (unlikely(pagecnt_bias == 1)) { + page_ref_add(page, USHRT_MAX); + rx_buffer->pagecnt_bias = USHRT_MAX; + } + + return true; +} + /** * ixgbevf_add_rx_frag - Add contents of Rx buffer to sk_buff * @rx_ring: rx descriptor ring to transact packets on @@ -771,12 +790,12 @@ static inline bool ixgbevf_page_is_reserved(struct page *page) **/ static bool ixgbevf_add_rx_frag(struct ixgbevf_ring *rx_ring, struct ixgbevf_rx_buffer *rx_buffer, + u16 size, union ixgbe_adv_rx_desc *rx_desc, struct sk_buff *skb) { struct page *page = rx_buffer->page; unsigned char *va = page_address(page) + rx_buffer->page_offset; - unsigned int size = le16_to_cpu(rx_desc->wb.upper.length); #if (PAGE_SIZE < 8192) unsigned int truesize = IXGBEVF_RX_BUFSZ; #else @@ -795,7 +814,6 @@ static bool ixgbevf_add_rx_frag(struct ixgbevf_ring *rx_ring, return true; /* this page cannot be reused so discard it */ - put_page(page); return false; } @@ -815,32 +833,7 @@ add_tail_frag: skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, (unsigned long)va & ~PAGE_MASK, size, truesize); - /* avoid re-using remote pages */ - if (unlikely(ixgbevf_page_is_reserved(page))) - return false; - -#if (PAGE_SIZE < 8192) - /* if we are only owner of page we can reuse it */ - if (unlikely(page_count(page) != 1)) - return false; - - /* flip page offset to other buffer */ - rx_buffer->page_offset ^= IXGBEVF_RX_BUFSZ; - -#else - /* move offset up to the next cache line */ - rx_buffer->page_offset += truesize; - - if (rx_buffer->page_offset > (PAGE_SIZE - IXGBEVF_RX_BUFSZ)) - return false; - -#endif - /* Even if we own the page, we are not allowed to use atomic_set() - * This would break get_page_unless_zero() users. 
- */ - page_ref_inc(page); - - return true; + return ixgbevf_can_reuse_rx_page(rx_buffer, page, truesize); } static struct sk_buff *ixgbevf_fetch_rx_buffer(struct ixgbevf_ring *rx_ring, @@ -849,11 +842,19 @@ static struct sk_buff *ixgbevf_fetch_rx_buffer(struct ixgbevf_ring *rx_ring, { struct ixgbevf_rx_buffer *rx_buffer; struct page *page; + u16 size = le16_to_cpu(rx_desc->wb.upper.length); rx_buffer = &rx_ring->rx_buffer_info[rx_ring->next_to_clean]; page = rx_buffer->page; prefetchw(page); + /* we are reusing so sync this buffer for CPU use */ + dma_sync_single_range_for_cpu(rx_ring->dev, + rx_buffer->dma, + rx_buffer->page_offset, + size, + DMA_FROM_DEVICE); + if (likely(!skb)) { void *page_addr = page_address(page) + rx_buffer->page_offset; @@ -879,21 +880,18 @@ static struct sk_buff *ixgbevf_fetch_rx_buffer(struct ixgbevf_ring *rx_ring, prefetchw(skb->data); } - /* we are reusing so sync this buffer for CPU use */ - dma_sync_single_range_for_cpu(rx_ring->dev, - rx_buffer->dma, - rx_buffer->page_offset, - IXGBEVF_RX_BUFSZ, - DMA_FROM_DEVICE); - /* pull page into skb */ - if (ixgbevf_add_rx_frag(rx_ring, rx_buffer, rx_desc, skb)) { + if (ixgbevf_add_rx_frag(rx_ring, rx_buffer, size, rx_desc, skb)) { /* hand second half of page back to the ring */ ixgbevf_reuse_rx_page(rx_ring, rx_buffer); } else { - /* we are not reusing the buffer so unmap it */ - dma_unmap_page(rx_ring->dev, rx_buffer->dma, - PAGE_SIZE, DMA_FROM_DEVICE); + /* We are not reusing the buffer so unmap it and free + * any references we are holding to it + */ + dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma, + PAGE_SIZE, DMA_FROM_DEVICE, + IXGBEVF_RX_DMA_ATTR); + __page_frag_cache_drain(page, rx_buffer->pagecnt_bias); } /* clear contents of buffer_info */ @@ -930,7 +928,7 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, rx_desc = IXGBEVF_RX_DESC(rx_ring, rx_ring->next_to_clean); - if (!ixgbevf_test_staterr(rx_desc, IXGBE_RXD_STAT_DD)) + if (!rx_desc->wb.upper.length) break; 
/* This memory barrier is needed to keep us from reading @@ -943,8 +941,10 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, skb = ixgbevf_fetch_rx_buffer(rx_ring, rx_desc, skb); /* exit if we failed to retrieve a buffer */ - if (!skb) + if (!skb) { + rx_ring->rx_stats.alloc_rx_buff_failed++; break; + } cleaned_count++; @@ -1553,6 +1553,10 @@ static void ixgbevf_configure_tx_ring(struct ixgbevf_adapter *adapter, txdctl |= (1u << 8) | /* HTHRESH = 1 */ 32; /* PTHRESH = 32 */ + /* reinitialize tx_buffer_info */ + memset(ring->tx_buffer_info, 0, + sizeof(struct ixgbevf_tx_buffer) * ring->count); + clear_bit(__IXGBEVF_HANG_CHECK_ARMED, &ring->state); IXGBE_WRITE_REG(hw, IXGBE_VFTXDCTL(reg_idx), txdctl); @@ -1721,6 +1725,7 @@ static void ixgbevf_configure_rx_ring(struct ixgbevf_adapter *adapter, struct ixgbevf_ring *ring) { struct ixgbe_hw *hw = &adapter->hw; + union ixgbe_adv_rx_desc *rx_desc; u64 rdba = ring->dma; u32 rxdctl; u8 reg_idx = ring->reg_idx; @@ -1749,6 +1754,14 @@ static void ixgbevf_configure_rx_ring(struct ixgbevf_adapter *adapter, IXGBE_WRITE_REG(hw, IXGBE_VFRDT(reg_idx), 0); ring->tail = adapter->io_addr + IXGBE_VFRDT(reg_idx); + /* initialize rx_buffer_info */ + memset(ring->rx_buffer_info, 0, + sizeof(struct ixgbevf_rx_buffer) * ring->count); + + /* initialize Rx descriptor 0 */ + rx_desc = IXGBEVF_RX_DESC(ring, 0); + rx_desc->wb.upper.length = 0; + /* reset ntu and ntc to place SW in sync with hardwdare */ ring->next_to_clean = 0; ring->next_to_use = 0; @@ -2103,9 +2116,7 @@ void ixgbevf_up(struct ixgbevf_adapter *adapter) **/ static void ixgbevf_clean_rx_ring(struct ixgbevf_ring *rx_ring) { - struct device *dev = rx_ring->dev; - unsigned long size; - unsigned int i; + u16 i = rx_ring->next_to_clean; /* Free Rx ring sk_buff */ if (rx_ring->skb) { @@ -2113,29 +2124,39 @@ static void ixgbevf_clean_rx_ring(struct ixgbevf_ring *rx_ring) rx_ring->skb = NULL; } - /* ring already cleared, nothing to do */ - if (!rx_ring->rx_buffer_info) 
- return; - /* Free all the Rx ring pages */ - for (i = 0; i < rx_ring->count; i++) { + while (i != rx_ring->next_to_alloc) { struct ixgbevf_rx_buffer *rx_buffer; rx_buffer = &rx_ring->rx_buffer_info[i]; - if (rx_buffer->dma) - dma_unmap_page(dev, rx_buffer->dma, - PAGE_SIZE, DMA_FROM_DEVICE); - rx_buffer->dma = 0; - if (rx_buffer->page) - __free_page(rx_buffer->page); - rx_buffer->page = NULL; - } - size = sizeof(struct ixgbevf_rx_buffer) * rx_ring->count; - memset(rx_ring->rx_buffer_info, 0, size); + /* Invalidate cache lines that may have been written to by + * device so that we avoid corrupting memory. + */ + dma_sync_single_range_for_cpu(rx_ring->dev, + rx_buffer->dma, + rx_buffer->page_offset, + IXGBEVF_RX_BUFSZ, + DMA_FROM_DEVICE); + + /* free resources associated with mapping */ + dma_unmap_page_attrs(rx_ring->dev, + rx_buffer->dma, + PAGE_SIZE, + DMA_FROM_DEVICE, + IXGBEVF_RX_DMA_ATTR); + + __page_frag_cache_drain(rx_buffer->page, + rx_buffer->pagecnt_bias); - /* Zero out the descriptor ring */ - memset(rx_ring->desc, 0, rx_ring->size); + i++; + if (i == rx_ring->count) + i = 0; + } + + rx_ring->next_to_alloc = 0; + rx_ring->next_to_clean = 0; + rx_ring->next_to_use = 0; } /** @@ -2144,23 +2165,57 @@ static void ixgbevf_clean_rx_ring(struct ixgbevf_ring *rx_ring) **/ static void ixgbevf_clean_tx_ring(struct ixgbevf_ring *tx_ring) { - struct ixgbevf_tx_buffer *tx_buffer_info; - unsigned long size; - unsigned int i; + u16 i = tx_ring->next_to_clean; + struct ixgbevf_tx_buffer *tx_buffer = &tx_ring->tx_buffer_info[i]; - if (!tx_ring->tx_buffer_info) - return; + while (i != tx_ring->next_to_use) { + union ixgbe_adv_tx_desc *eop_desc, *tx_desc; - /* Free all the Tx ring sk_buffs */ - for (i = 0; i < tx_ring->count; i++) { - tx_buffer_info = &tx_ring->tx_buffer_info[i]; - ixgbevf_unmap_and_free_tx_resource(tx_ring, tx_buffer_info); + /* Free all the Tx ring sk_buffs */ + dev_kfree_skb_any(tx_buffer->skb); + + /* unmap skb header data */ + 
dma_unmap_single(tx_ring->dev, + dma_unmap_addr(tx_buffer, dma), + dma_unmap_len(tx_buffer, len), + DMA_TO_DEVICE); + + /* check for eop_desc to determine the end of the packet */ + eop_desc = tx_buffer->next_to_watch; + tx_desc = IXGBEVF_TX_DESC(tx_ring, i); + + /* unmap remaining buffers */ + while (tx_desc != eop_desc) { + tx_buffer++; + tx_desc++; + i++; + if (unlikely(i == tx_ring->count)) { + i = 0; + tx_buffer = tx_ring->tx_buffer_info; + tx_desc = IXGBEVF_TX_DESC(tx_ring, 0); + } + + /* unmap any remaining paged data */ + if (dma_unmap_len(tx_buffer, len)) + dma_unmap_page(tx_ring->dev, + dma_unmap_addr(tx_buffer, dma), + dma_unmap_len(tx_buffer, len), + DMA_TO_DEVICE); + } + + /* move us one more past the eop_desc for start of next pkt */ + tx_buffer++; + i++; + if (unlikely(i == tx_ring->count)) { + i = 0; + tx_buffer = tx_ring->tx_buffer_info; + } } - size = sizeof(struct ixgbevf_tx_buffer) * tx_ring->count; - memset(tx_ring->tx_buffer_info, 0, size); + /* reset next_to_use and next_to_clean */ + tx_ring->next_to_use = 0; + tx_ring->next_to_clean = 0; - memset(tx_ring->desc, 0, tx_ring->size); } /** @@ -2712,6 +2767,8 @@ out: void ixgbevf_update_stats(struct ixgbevf_adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; + u64 alloc_rx_page_failed = 0, alloc_rx_buff_failed = 0; + u64 alloc_rx_page = 0, hw_csum_rx_error = 0; int i; if (test_bit(__IXGBEVF_DOWN, &adapter->state) || @@ -2732,10 +2789,18 @@ void ixgbevf_update_stats(struct ixgbevf_adapter *adapter) adapter->stats.vfmprc); for (i = 0; i < adapter->num_rx_queues; i++) { - adapter->hw_csum_rx_error += - adapter->rx_ring[i]->hw_csum_rx_error; - adapter->rx_ring[i]->hw_csum_rx_error = 0; + struct ixgbevf_ring *rx_ring = adapter->rx_ring[i]; + + hw_csum_rx_error += rx_ring->rx_stats.csum_err; + alloc_rx_page_failed += rx_ring->rx_stats.alloc_rx_page_failed; + alloc_rx_buff_failed += rx_ring->rx_stats.alloc_rx_buff_failed; + alloc_rx_page += rx_ring->rx_stats.alloc_rx_page; } + + 
adapter->hw_csum_rx_error = hw_csum_rx_error; + adapter->alloc_rx_page_failed = alloc_rx_page_failed; + adapter->alloc_rx_buff_failed = alloc_rx_buff_failed; + adapter->alloc_rx_page = alloc_rx_page; } /** @@ -2980,7 +3045,7 @@ int ixgbevf_setup_tx_resources(struct ixgbevf_ring *tx_ring) int size; size = sizeof(struct ixgbevf_tx_buffer) * tx_ring->count; - tx_ring->tx_buffer_info = vzalloc(size); + tx_ring->tx_buffer_info = vmalloc(size); if (!tx_ring->tx_buffer_info) goto err; @@ -3040,7 +3105,7 @@ int ixgbevf_setup_rx_resources(struct ixgbevf_ring *rx_ring) int size; size = sizeof(struct ixgbevf_rx_buffer) * rx_ring->count; - rx_ring->rx_buffer_info = vzalloc(size); + rx_ring->rx_buffer_info = vmalloc(size); if (!rx_ring->rx_buffer_info) goto err; @@ -3482,34 +3547,37 @@ static void ixgbevf_tx_map(struct ixgbevf_ring *tx_ring, struct ixgbevf_tx_buffer *first, const u8 hdr_len) { - dma_addr_t dma; struct sk_buff *skb = first->skb; struct ixgbevf_tx_buffer *tx_buffer; union ixgbe_adv_tx_desc *tx_desc; - struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[0]; - unsigned int data_len = skb->data_len; - unsigned int size = skb_headlen(skb); - unsigned int paylen = skb->len - hdr_len; + struct skb_frag_struct *frag; + dma_addr_t dma; + unsigned int data_len, size; u32 tx_flags = first->tx_flags; - __le32 cmd_type; + __le32 cmd_type = ixgbevf_tx_cmd_type(tx_flags); u16 i = tx_ring->next_to_use; tx_desc = IXGBEVF_TX_DESC(tx_ring, i); - ixgbevf_tx_olinfo_status(tx_desc, tx_flags, paylen); - cmd_type = ixgbevf_tx_cmd_type(tx_flags); + ixgbevf_tx_olinfo_status(tx_desc, tx_flags, skb->len - hdr_len); + + size = skb_headlen(skb); + data_len = skb->data_len; dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE); - if (dma_mapping_error(tx_ring->dev, dma)) - goto dma_error; - /* record length, and DMA address */ - dma_unmap_len_set(first, len, size); - dma_unmap_addr_set(first, dma, dma); + tx_buffer = first; + + for (frag = &skb_shinfo(skb)->frags[0];; frag++) 
{ + if (dma_mapping_error(tx_ring->dev, dma)) + goto dma_error; + + /* record length, and DMA address */ + dma_unmap_len_set(tx_buffer, len, size); + dma_unmap_addr_set(tx_buffer, dma, dma); - tx_desc->read.buffer_addr = cpu_to_le64(dma); + tx_desc->read.buffer_addr = cpu_to_le64(dma); - for (;;) { while (unlikely(size > IXGBE_MAX_DATA_PER_TXD)) { tx_desc->read.cmd_type_len = cmd_type | cpu_to_le32(IXGBE_MAX_DATA_PER_TXD); @@ -3520,12 +3588,12 @@ static void ixgbevf_tx_map(struct ixgbevf_ring *tx_ring, tx_desc = IXGBEVF_TX_DESC(tx_ring, 0); i = 0; } + tx_desc->read.olinfo_status = 0; dma += IXGBE_MAX_DATA_PER_TXD; size -= IXGBE_MAX_DATA_PER_TXD; tx_desc->read.buffer_addr = cpu_to_le64(dma); - tx_desc->read.olinfo_status = 0; } if (likely(!data_len)) @@ -3539,23 +3607,15 @@ static void ixgbevf_tx_map(struct ixgbevf_ring *tx_ring, tx_desc = IXGBEVF_TX_DESC(tx_ring, 0); i = 0; } + tx_desc->read.olinfo_status = 0; size = skb_frag_size(frag); data_len -= size; dma = skb_frag_dma_map(tx_ring->dev, frag, 0, size, DMA_TO_DEVICE); - if (dma_mapping_error(tx_ring->dev, dma)) - goto dma_error; tx_buffer = &tx_ring->tx_buffer_info[i]; - dma_unmap_len_set(tx_buffer, len, size); - dma_unmap_addr_set(tx_buffer, dma, dma); - - tx_desc->read.buffer_addr = cpu_to_le64(dma); - tx_desc->read.olinfo_status = 0; - - frag++; } /* write last descriptor with RS and EOP bits */ @@ -3589,18 +3649,32 @@ static void ixgbevf_tx_map(struct ixgbevf_ring *tx_ring, return; dma_error: dev_err(tx_ring->dev, "TX DMA map failed\n"); + tx_buffer = &tx_ring->tx_buffer_info[i]; /* clear dma mappings for failed tx_buffer_info map */ - for (;;) { + while (tx_buffer != first) { + if (dma_unmap_len(tx_buffer, len)) + dma_unmap_page(tx_ring->dev, + dma_unmap_addr(tx_buffer, dma), + dma_unmap_len(tx_buffer, len), + DMA_TO_DEVICE); + dma_unmap_len_set(tx_buffer, len, 0); + + if (i-- == 0) + i += tx_ring->count; tx_buffer = &tx_ring->tx_buffer_info[i]; - ixgbevf_unmap_and_free_tx_resource(tx_ring, tx_buffer); - 
if (tx_buffer == first) - break; - if (i == 0) - i = tx_ring->count; - i--; } + if (dma_unmap_len(tx_buffer, len)) + dma_unmap_single(tx_ring->dev, + dma_unmap_addr(tx_buffer, dma), + dma_unmap_len(tx_buffer, len), + DMA_TO_DEVICE); + dma_unmap_len_set(tx_buffer, len, 0); + + dev_kfree_skb_any(tx_buffer->skb); + tx_buffer->skb = NULL; + tx_ring->next_to_use = i; } diff --git a/drivers/net/ethernet/intel/ixgbevf/vf.c b/drivers/net/ethernet/intel/ixgbevf/vf.c index 64c93e8becc6..38d3a327c1bc 100644 --- a/drivers/net/ethernet/intel/ixgbevf/vf.c +++ b/drivers/net/ethernet/intel/ixgbevf/vf.c @@ -286,7 +286,7 @@ static s32 ixgbevf_set_uc_addr_vf(struct ixgbe_hw *hw, u32 index, u8 *addr) ether_addr_copy(msg_addr, addr); ret_val = ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf, - sizeof(msgbuf) / sizeof(u32)); + ARRAY_SIZE(msgbuf)); if (!ret_val) { msgbuf[0] &= ~IXGBE_VT_MSGTYPE_CTS; @@ -456,8 +456,7 @@ static s32 ixgbevf_set_rar_vf(struct ixgbe_hw *hw, u32 index, u8 *addr, ether_addr_copy(msg_addr, addr); ret_val = ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf, - sizeof(msgbuf) / sizeof(u32)); - + ARRAY_SIZE(msgbuf)); msgbuf[0] &= ~IXGBE_VT_MSGTYPE_CTS; /* if nacked the address was rejected, use "perm_addr" */ @@ -574,7 +573,7 @@ static s32 ixgbevf_update_xcast_mode(struct ixgbe_hw *hw, int xcast_mode) msgbuf[1] = xcast_mode; err = ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf, - sizeof(msgbuf) / sizeof(u32)); + ARRAY_SIZE(msgbuf)); if (err) return err; @@ -614,7 +613,7 @@ static s32 ixgbevf_set_vfta_vf(struct ixgbe_hw *hw, u32 vlan, u32 vind, msgbuf[0] |= vlan_on << IXGBE_VT_MSGINFO_SHIFT; err = ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf, - sizeof(msgbuf) / sizeof(u32)); + ARRAY_SIZE(msgbuf)); if (err) goto mbx_err; @@ -826,7 +825,7 @@ static s32 ixgbevf_set_rlpml_vf(struct ixgbe_hw *hw, u16 max_size) msgbuf[1] = max_size; ret_val = ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf, - sizeof(msgbuf) / sizeof(u32)); + ARRAY_SIZE(msgbuf)); if (ret_val) return ret_val; if 
((msgbuf[0] & IXGBE_VF_SET_LPE) && @@ -872,8 +871,7 @@ static int ixgbevf_negotiate_api_version_vf(struct ixgbe_hw *hw, int api) msg[1] = api; msg[2] = 0; - err = ixgbevf_write_msg_read_ack(hw, msg, msg, - sizeof(msg) / sizeof(u32)); + err = ixgbevf_write_msg_read_ack(hw, msg, msg, ARRAY_SIZE(msg)); if (!err) { msg[0] &= ~IXGBE_VT_MSGTYPE_CTS; @@ -924,8 +922,7 @@ int ixgbevf_get_queues(struct ixgbe_hw *hw, unsigned int *num_tcs, msg[0] = IXGBE_VF_GET_QUEUE; msg[1] = msg[2] = msg[3] = msg[4] = 0; - err = ixgbevf_write_msg_read_ack(hw, msg, msg, - sizeof(msg) / sizeof(u32)); + err = ixgbevf_write_msg_read_ack(hw, msg, msg, ARRAY_SIZE(msg)); if (!err) { msg[0] &= ~IXGBE_VT_MSGTYPE_CTS; diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c index 6d6fb8cf3e7c..6473cc68c2d5 100644 --- a/drivers/net/ethernet/rocker/rocker_ofdpa.c +++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c @@ -789,7 +789,6 @@ static int ofdpa_flow_tbl_add(struct ofdpa_port *ofdpa_port, ofdpa_flags_nowait(flags), ofdpa_cmd_flow_tbl_add, found, NULL, NULL); - return 0; } static int ofdpa_flow_tbl_del(struct ofdpa_port *ofdpa_port, diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c index 433d29d6bc95..f1cc2ed76029 100644 --- a/drivers/net/ethernet/sfc/ptp.c +++ b/drivers/net/ethernet/sfc/ptp.c @@ -643,7 +643,7 @@ static int efx_ptp_get_attributes(struct efx_nic *efx) case MC_CMD_PTP_OUT_GET_ATTRIBUTES_SECONDS_QTR_NANOSECONDS: ptp->ns_to_nic_time = efx_ptp_ns_to_s_qns; ptp->nic_to_kernel_time = efx_ptp_s_qns_to_ktime_correction; - ptp->nic_time.minor_max = 4000000000; + ptp->nic_time.minor_max = 4000000000UL; ptp->nic_time.sync_event_minor_shift = 24; break; default: diff --git a/include/linux/filter.h b/include/linux/filter.h index 425056c7f96c..276932d75975 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -688,6 +688,8 @@ static inline int sk_filter(struct sock *sk, struct sk_buff *skb) struct bpf_prog 
*bpf_prog_select_runtime(struct bpf_prog *fp, int *err); void bpf_prog_free(struct bpf_prog *fp); +bool bpf_opcode_in_insntable(u8 code); + struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags); @@ -1003,10 +1005,20 @@ struct bpf_sock_ops_kern { struct sock *sk; u32 op; union { + u32 args[4]; u32 reply; u32 replylong[4]; }; u32 is_fullsock; + u64 temp; /* temp and everything after is not + * initialized to 0 before calling + * the BPF program. New fields that + * should be initialized to 0 should + * be inserted before temp. + * temp is scratch storage used by + * sock_ops_convert_ctx_access + * as temporary storage of a register. + */ }; #endif /* __LINUX_FILTER_H__ */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 24a62d590350..cd46d3d63aa0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -851,6 +851,7 @@ struct xfrmdev_ops { void (*xdo_dev_state_free) (struct xfrm_state *x); bool (*xdo_dev_offload_ok) (struct sk_buff *skb, struct xfrm_state *x); + void (*xdo_dev_state_advance_esn) (struct xfrm_state *x); }; #endif diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 4f93f0953c41..8f4c54986f97 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -335,6 +335,17 @@ struct tcp_sock { int linger2; + +/* Sock_ops bpf program related variables */ +#ifdef CONFIG_BPF + u8 bpf_sock_ops_cb_flags; /* Control calling BPF programs + * values defined in uapi/linux/tcp.h + */ +#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG) +#else +#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0 +#endif + /* Receiver side RTT estimation */ struct { u32 rtt_us; diff --git a/include/net/tcp.h b/include/net/tcp.h index 5a1d26a18599..093e967a2960 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2006,12 +2006,12 @@ void tcp_cleanup_ulp(struct sock *sk); * program loaded). 
*/ #ifdef CONFIG_BPF -static inline int tcp_call_bpf(struct sock *sk, int op) +static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) { struct bpf_sock_ops_kern sock_ops; int ret; - memset(&sock_ops, 0, sizeof(sock_ops)); + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); if (sk_fullsock(sk)) { sock_ops.is_fullsock = 1; sock_owned_by_me(sk); @@ -2019,6 +2019,8 @@ static inline int tcp_call_bpf(struct sock *sk, int op) sock_ops.sk = sk; sock_ops.op = op; + if (nargs > 0) + memcpy(sock_ops.args, args, nargs * sizeof(*args)); ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); if (ret == 0) @@ -2027,18 +2029,46 @@ static inline int tcp_call_bpf(struct sock *sk, int op) ret = -1; return ret; } + +static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2) +{ + u32 args[2] = {arg1, arg2}; + + return tcp_call_bpf(sk, op, 2, args); +} + +static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2, + u32 arg3) +{ + u32 args[3] = {arg1, arg2, arg3}; + + return tcp_call_bpf(sk, op, 3, args); +} + #else -static inline int tcp_call_bpf(struct sock *sk, int op) +static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) { return -EPERM; } + +static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2) +{ + return -EPERM; +} + +static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2, + u32 arg3) +{ + return -EPERM; +} + #endif static inline u32 tcp_timeout_init(struct sock *sk) { int timeout; - timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT); + timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT, 0, NULL); if (timeout <= 0) timeout = TCP_TIMEOUT_INIT; @@ -2049,7 +2079,7 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk) { int rwnd; - rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT); + rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT, 0, NULL); if (rwnd < 0) rwnd = 0; @@ -2058,7 +2088,7 @@ static inline u32 tcp_rwnd_init_bpf(struct sock 
*sk) static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) { - return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1); + return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1); } #if IS_ENABLED(CONFIG_SMC) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 2e6d4fe6b0ba..7d2077665c0b 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1904,6 +1904,14 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo); bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x); +static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x) +{ + struct xfrm_state_offload *xso = &x->xso; + + if (xso->dev && xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn) + xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn(x); +} + static inline bool xfrm_dst_offload_ok(struct dst_entry *dst) { struct xfrm_state *x = dst->xfrm; @@ -1974,6 +1982,10 @@ static inline bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x return false; } +static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x) +{ +} + static inline bool xfrm_dst_offload_ok(struct dst_entry *dst) { return false; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 406c19d6016b..db6bdc375126 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -642,6 +642,14 @@ union bpf_attr { * @optlen: length of optval in bytes * Return: 0 or negative error * + * int bpf_sock_ops_cb_flags_set(bpf_sock_ops, flags) + * Set callback flags for sock_ops + * @bpf_sock_ops: pointer to bpf_sock_ops_kern struct + * @flags: flags value + * Return: 0 for no error + * -EINVAL if there is no full tcp socket + * bits in flags that are not supported by current kernel + * * int bpf_skb_adjust_room(skb, len_diff, mode, flags) * Grow or shrink room in sk_buff. 
* @skb: pointer to skb @@ -748,7 +756,8 @@ union bpf_attr { FN(perf_event_read_value), \ FN(perf_prog_read_value), \ FN(getsockopt), \ - FN(override_return), + FN(override_return), \ + FN(sock_ops_cb_flags_set), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -952,8 +961,9 @@ struct bpf_map_info { struct bpf_sock_ops { __u32 op; union { - __u32 reply; - __u32 replylong[4]; + __u32 args[4]; /* Optionally passed to bpf program */ + __u32 reply; /* Returned by bpf program */ + __u32 replylong[4]; /* Optionally returned by bpf prog */ }; __u32 family; __u32 remote_ip4; /* Stored in network byte order */ @@ -968,8 +978,39 @@ struct bpf_sock_ops { */ __u32 snd_cwnd; __u32 srtt_us; /* Averaged RTT << 3 in usecs */ + __u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */ + __u32 state; + __u32 rtt_min; + __u32 snd_ssthresh; + __u32 rcv_nxt; + __u32 snd_nxt; + __u32 snd_una; + __u32 mss_cache; + __u32 ecn_flags; + __u32 rate_delivered; + __u32 rate_interval_us; + __u32 packets_out; + __u32 retrans_out; + __u32 total_retrans; + __u32 segs_in; + __u32 data_segs_in; + __u32 segs_out; + __u32 data_segs_out; + __u32 lost_out; + __u32 sacked_out; + __u32 sk_txhash; + __u64 bytes_received; + __u64 bytes_acked; }; +/* Definitions for bpf_sock_ops_cb_flags */ +#define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0) +#define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1) +#define BPF_SOCK_OPS_STATE_CB_FLAG (1<<2) +#define BPF_SOCK_OPS_ALL_CB_FLAGS 0x7 /* Mask of all currently + * supported cb flags + */ + /* List of known BPF sock_ops operators. * New entries can only be added at the end */ @@ -1003,6 +1044,43 @@ enum { * a congestion threshold. RTTs above * this indicate congestion */ + BPF_SOCK_OPS_RTO_CB, /* Called when an RTO has triggered. + * Arg1: value of icsk_retransmits + * Arg2: value of icsk_rto + * Arg3: whether RTO has expired + */ + BPF_SOCK_OPS_RETRANS_CB, /* Called when skb is retransmitted. 
+ * Arg1: sequence number of 1st byte + * Arg2: # segments + * Arg3: return value of + * tcp_transmit_skb (0 => success) + */ + BPF_SOCK_OPS_STATE_CB, /* Called when TCP changes state. + * Arg1: old_state + * Arg2: new_state + */ +}; + +/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect + * changes between the TCP and BPF versions. Ideally this should never happen. + * If it does, we need to add code to convert them before calling + * the BPF sock_ops function. + */ +enum { + BPF_TCP_ESTABLISHED = 1, + BPF_TCP_SYN_SENT, + BPF_TCP_SYN_RECV, + BPF_TCP_FIN_WAIT1, + BPF_TCP_FIN_WAIT2, + BPF_TCP_TIME_WAIT, + BPF_TCP_CLOSE, + BPF_TCP_CLOSE_WAIT, + BPF_TCP_LAST_ACK, + BPF_TCP_LISTEN, + BPF_TCP_CLOSING, /* Now a valid state */ + BPF_TCP_NEW_SYN_RECV, + + BPF_TCP_MAX_STATES /* Leave at the end! */ }; #define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 3aa0658add76..5f35f93dcab2 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -782,6 +782,137 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) } EXPORT_SYMBOL_GPL(__bpf_call_base); +/* All UAPI available opcodes. */ +#define BPF_INSN_MAP(INSN_2, INSN_3) \ + /* 32 bit ALU operations. */ \ + /* Register based. */ \ + INSN_3(ALU, ADD, X), \ + INSN_3(ALU, SUB, X), \ + INSN_3(ALU, AND, X), \ + INSN_3(ALU, OR, X), \ + INSN_3(ALU, LSH, X), \ + INSN_3(ALU, RSH, X), \ + INSN_3(ALU, XOR, X), \ + INSN_3(ALU, MUL, X), \ + INSN_3(ALU, MOV, X), \ + INSN_3(ALU, DIV, X), \ + INSN_3(ALU, MOD, X), \ + INSN_2(ALU, NEG), \ + INSN_3(ALU, END, TO_BE), \ + INSN_3(ALU, END, TO_LE), \ + /* Immediate based. */ \ + INSN_3(ALU, ADD, K), \ + INSN_3(ALU, SUB, K), \ + INSN_3(ALU, AND, K), \ + INSN_3(ALU, OR, K), \ + INSN_3(ALU, LSH, K), \ + INSN_3(ALU, RSH, K), \ + INSN_3(ALU, XOR, K), \ + INSN_3(ALU, MUL, K), \ + INSN_3(ALU, MOV, K), \ + INSN_3(ALU, DIV, K), \ + INSN_3(ALU, MOD, K), \ + /* 64 bit ALU operations. 
*/ \ + /* Register based. */ \ + INSN_3(ALU64, ADD, X), \ + INSN_3(ALU64, SUB, X), \ + INSN_3(ALU64, AND, X), \ + INSN_3(ALU64, OR, X), \ + INSN_3(ALU64, LSH, X), \ + INSN_3(ALU64, RSH, X), \ + INSN_3(ALU64, XOR, X), \ + INSN_3(ALU64, MUL, X), \ + INSN_3(ALU64, MOV, X), \ + INSN_3(ALU64, ARSH, X), \ + INSN_3(ALU64, DIV, X), \ + INSN_3(ALU64, MOD, X), \ + INSN_2(ALU64, NEG), \ + /* Immediate based. */ \ + INSN_3(ALU64, ADD, K), \ + INSN_3(ALU64, SUB, K), \ + INSN_3(ALU64, AND, K), \ + INSN_3(ALU64, OR, K), \ + INSN_3(ALU64, LSH, K), \ + INSN_3(ALU64, RSH, K), \ + INSN_3(ALU64, XOR, K), \ + INSN_3(ALU64, MUL, K), \ + INSN_3(ALU64, MOV, K), \ + INSN_3(ALU64, ARSH, K), \ + INSN_3(ALU64, DIV, K), \ + INSN_3(ALU64, MOD, K), \ + /* Call instruction. */ \ + INSN_2(JMP, CALL), \ + /* Exit instruction. */ \ + INSN_2(JMP, EXIT), \ + /* Jump instructions. */ \ + /* Register based. */ \ + INSN_3(JMP, JEQ, X), \ + INSN_3(JMP, JNE, X), \ + INSN_3(JMP, JGT, X), \ + INSN_3(JMP, JLT, X), \ + INSN_3(JMP, JGE, X), \ + INSN_3(JMP, JLE, X), \ + INSN_3(JMP, JSGT, X), \ + INSN_3(JMP, JSLT, X), \ + INSN_3(JMP, JSGE, X), \ + INSN_3(JMP, JSLE, X), \ + INSN_3(JMP, JSET, X), \ + /* Immediate based. */ \ + INSN_3(JMP, JEQ, K), \ + INSN_3(JMP, JNE, K), \ + INSN_3(JMP, JGT, K), \ + INSN_3(JMP, JLT, K), \ + INSN_3(JMP, JGE, K), \ + INSN_3(JMP, JLE, K), \ + INSN_3(JMP, JSGT, K), \ + INSN_3(JMP, JSLT, K), \ + INSN_3(JMP, JSGE, K), \ + INSN_3(JMP, JSLE, K), \ + INSN_3(JMP, JSET, K), \ + INSN_2(JMP, JA), \ + /* Store instructions. */ \ + /* Register based. */ \ + INSN_3(STX, MEM, B), \ + INSN_3(STX, MEM, H), \ + INSN_3(STX, MEM, W), \ + INSN_3(STX, MEM, DW), \ + INSN_3(STX, XADD, W), \ + INSN_3(STX, XADD, DW), \ + /* Immediate based. */ \ + INSN_3(ST, MEM, B), \ + INSN_3(ST, MEM, H), \ + INSN_3(ST, MEM, W), \ + INSN_3(ST, MEM, DW), \ + /* Load instructions. */ \ + /* Register based. 
*/ \ + INSN_3(LDX, MEM, B), \ + INSN_3(LDX, MEM, H), \ + INSN_3(LDX, MEM, W), \ + INSN_3(LDX, MEM, DW), \ + /* Immediate based. */ \ + INSN_3(LD, IMM, DW), \ + /* Misc (old cBPF carry-over). */ \ + INSN_3(LD, ABS, B), \ + INSN_3(LD, ABS, H), \ + INSN_3(LD, ABS, W), \ + INSN_3(LD, IND, B), \ + INSN_3(LD, IND, H), \ + INSN_3(LD, IND, W) + +bool bpf_opcode_in_insntable(u8 code) +{ +#define BPF_INSN_2_TBL(x, y) [BPF_##x | BPF_##y] = true +#define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true + static const bool public_insntable[256] = { + [0 ... 255] = false, + /* Now overwrite non-defaults ... */ + BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL), + }; +#undef BPF_INSN_3_TBL +#undef BPF_INSN_2_TBL + return public_insntable[code]; +} + #ifndef CONFIG_BPF_JIT_ALWAYS_ON /** * __bpf_prog_run - run eBPF program on a given context @@ -793,115 +924,18 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { u64 tmp; +#define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y +#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z static const void *jumptable[256] = { [0 ... 255] = &&default_label, /* Now overwrite non-defaults ... 
*/ - /* 32 bit ALU operations */ - [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X, - [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K, - [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X, - [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K, - [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X, - [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K, - [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X, - [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K, - [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X, - [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K, - [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X, - [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K, - [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X, - [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K, - [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X, - [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K, - [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X, - [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K, - [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X, - [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K, - [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X, - [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K, - [BPF_ALU | BPF_NEG] = &&ALU_NEG, - [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE, - [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE, - /* 64 bit ALU operations */ - [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X, - [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K, - [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X, - [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K, - [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X, - [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K, - [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X, - [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K, - [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X, - [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K, - [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X, - [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K, - [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X, - [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K, - [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X, - [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K, - [BPF_ALU64 | BPF_MOV | 
BPF_X] = &&ALU64_MOV_X, - [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K, - [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X, - [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K, - [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X, - [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K, - [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X, - [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K, - [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, - /* Call instruction */ - [BPF_JMP | BPF_CALL] = &&JMP_CALL, + BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL), + /* Non-UAPI available opcodes. */ [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, - /* Jumps */ - [BPF_JMP | BPF_JA] = &&JMP_JA, - [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, - [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K, - [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X, - [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, - [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, - [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, - [BPF_JMP | BPF_JLT | BPF_X] = &&JMP_JLT_X, - [BPF_JMP | BPF_JLT | BPF_K] = &&JMP_JLT_K, - [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, - [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, - [BPF_JMP | BPF_JLE | BPF_X] = &&JMP_JLE_X, - [BPF_JMP | BPF_JLE | BPF_K] = &&JMP_JLE_K, - [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, - [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, - [BPF_JMP | BPF_JSLT | BPF_X] = &&JMP_JSLT_X, - [BPF_JMP | BPF_JSLT | BPF_K] = &&JMP_JSLT_K, - [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, - [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, - [BPF_JMP | BPF_JSLE | BPF_X] = &&JMP_JSLE_X, - [BPF_JMP | BPF_JSLE | BPF_K] = &&JMP_JSLE_K, - [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, - [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, - /* Program return */ - [BPF_JMP | BPF_EXIT] = &&JMP_EXIT, - /* Store instructions */ - [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B, - [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H, - [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W, - [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW, - [BPF_STX | BPF_XADD | 
BPF_W] = &&STX_XADD_W, - [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW, - [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B, - [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H, - [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W, - [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW, - /* Load instructions */ - [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B, - [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H, - [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W, - [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW, - [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W, - [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H, - [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B, - [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, - [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, - [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, - [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW, }; +#undef BPF_INSN_3_LBL +#undef BPF_INSN_2_LBL u32 tail_call_cnt = 0; void *ptr; int off; @@ -965,14 +999,10 @@ select_insn: (*(s64 *) &DST) >>= IMM; CONT; ALU64_MOD_X: - if (unlikely(SRC == 0)) - return 0; div64_u64_rem(DST, SRC, &tmp); DST = tmp; CONT; ALU_MOD_X: - if (unlikely((u32)SRC == 0)) - return 0; tmp = (u32) DST; DST = do_div(tmp, (u32) SRC); CONT; @@ -985,13 +1015,9 @@ select_insn: DST = do_div(tmp, (u32) IMM); CONT; ALU64_DIV_X: - if (unlikely(SRC == 0)) - return 0; DST = div64_u64(DST, SRC); CONT; ALU_DIV_X: - if (unlikely((u32)SRC == 0)) - return 0; tmp = (u32) DST; do_div(tmp, (u32) SRC); DST = (u32) tmp; @@ -1302,8 +1328,14 @@ load_byte: goto load_byte; default_label: - /* If we ever reach this, we have a bug somewhere. */ - WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); + /* If we ever reach this, we have a bug somewhere. Die hard here + * instead of just returning 0; we could be somewhere in a subprog, + * so execution could continue otherwise which we do /not/ want. + * + * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable(). 
+ */ + pr_warn("BPF interpreter: unknown opcode %02x\n", insn->code); + BUG_ON(1); return 0; } STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */ diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index d7ea96218516..7b469d10d0e9 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -593,11 +593,10 @@ unlock: static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) { + struct lpm_trie_node *node, *next_node = NULL, *parent, *search_root; struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct bpf_lpm_trie_key *key = _key, *next_key = _next_key; - struct lpm_trie_node *node, *next_node = NULL, *parent; struct lpm_trie_node **node_stack = NULL; - struct lpm_trie_node __rcu **root; int err = 0, stack_ptr = -1; unsigned int next_bit; size_t matchlen; @@ -614,22 +613,21 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) */ /* Empty trie */ - if (!rcu_dereference(trie->root)) + search_root = rcu_dereference(trie->root); + if (!search_root) return -ENOENT; /* For invalid key, find the leftmost node in the trie */ - if (!key || key->prefixlen > trie->max_prefixlen) { - root = &trie->root; + if (!key || key->prefixlen > trie->max_prefixlen) goto find_leftmost; - } node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *), - GFP_USER | __GFP_NOWARN); + GFP_ATOMIC | __GFP_NOWARN); if (!node_stack) return -ENOMEM; /* Try to find the exact node for the given key */ - for (node = rcu_dereference(trie->root); node;) { + for (node = search_root; node;) { node_stack[++stack_ptr] = node; matchlen = longest_prefix_match(trie, node, key); if (node->prefixlen != matchlen || @@ -640,10 +638,8 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) node = rcu_dereference(node->child[next_bit]); } if (!node || node->prefixlen != key->prefixlen || - (node->flags & LPM_TREE_NODE_FLAG_IM)) { - root = &trie->root; + (node->flags & LPM_TREE_NODE_FLAG_IM)) 
goto find_leftmost; - } /* The node with the exactly-matching key has been found, * find the first node in postorder after the matched node. @@ -651,10 +647,10 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) node = node_stack[stack_ptr]; while (stack_ptr > 0) { parent = node_stack[stack_ptr - 1]; - if (rcu_dereference(parent->child[0]) == node && - rcu_dereference(parent->child[1])) { - root = &parent->child[1]; - goto find_leftmost; + if (rcu_dereference(parent->child[0]) == node) { + search_root = rcu_dereference(parent->child[1]); + if (search_root) + goto find_leftmost; } if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) { next_node = parent; @@ -673,7 +669,7 @@ find_leftmost: /* Find the leftmost non-intermediate node, all intermediate nodes * have exact two children, so this function will never return NULL. */ - for (node = rcu_dereference(*root); node;) { + for (node = search_root; node;) { if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) next_node = node; node = rcu_dereference(node->child[0]); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5bdb0cc84ad2..e24aa3241387 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -709,10 +709,7 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_percpu_hash_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, attr->flags); - } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || - map->map_type == BPF_MAP_TYPE_PROG_ARRAY || - map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || - map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) { + } else if (IS_FD_ARRAY(map)) { rcu_read_lock(); err = bpf_fd_array_map_update_elem(map, f.file, key, value, attr->flags); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dfb138b46488..5fb69a85d967 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4981,6 +4981,13 @@ static int replace_map_fd_with_map_ptr(struct 
bpf_verifier_env *env) next_insn: insn++; i++; + continue; + } + + /* Basic sanity check before we invest more work here. */ + if (!bpf_opcode_in_insntable(insn->code)) { + verbose(env, "unknown opcode %02x\n", insn->code); + return -EINVAL; } } @@ -5064,14 +5071,21 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of return new_prog; } -/* The verifier does more data flow analysis than llvm and will not explore - * branches that are dead at run time. Malicious programs can have dead code - * too. Therefore replace all dead at-run-time code with nops. +/* The verifier does more data flow analysis than llvm and will not + * explore branches that are dead at run time. Malicious programs can + * have dead code too. Therefore replace all dead at-run-time code + * with 'ja -1'. + * + * Just nops are not optimal, e.g. if they would sit at the end of the + * program and through another bug we would manage to jump there, then + * we'd execute beyond program memory otherwise. Returning exception + * code also wouldn't work since we can have subprogs where the dead + * code could be located. 
*/ static void sanitize_dead_code(struct bpf_verifier_env *env) { struct bpf_insn_aux_data *aux_data = env->insn_aux_data; - struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0); + struct bpf_insn trap = BPF_JMP_IMM(BPF_JA, 0, 0, -1); struct bpf_insn *insn = env->prog->insnsi; const int insn_cnt = env->prog->len; int i; @@ -5079,7 +5093,7 @@ static void sanitize_dead_code(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++) { if (aux_data[i].seen) continue; - memcpy(insn + i, &nop, sizeof(nop)); + memcpy(insn + i, &trap, sizeof(trap)); } } @@ -5386,15 +5400,37 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) int i, cnt, delta = 0; for (i = 0; i < insn_cnt; i++, insn++) { - if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) || + if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) || + insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || + insn->code == (BPF_ALU | BPF_MOD | BPF_X) || insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { - /* due to JIT bugs clear upper 32-bits of src register - * before div/mod operation - */ - insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg); - insn_buf[1] = *insn; - cnt = 2; - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; + struct bpf_insn mask_and_div[] = { + BPF_MOV32_REG(insn->src_reg, insn->src_reg), + /* Rx div 0 -> 0 */ + BPF_JMP_IMM(BPF_JNE, insn->src_reg, 0, 2), + BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg), + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + *insn, + }; + struct bpf_insn mask_and_mod[] = { + BPF_MOV32_REG(insn->src_reg, insn->src_reg), + /* Rx mod 0 -> Rx */ + BPF_JMP_IMM(BPF_JEQ, insn->src_reg, 0, 1), + *insn, + }; + struct bpf_insn *patchlet; + + if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || + insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { + patchlet = mask_and_div + (is64 ? 1 : 0); + cnt = ARRAY_SIZE(mask_and_div) - (is64 ? 1 : 0); + } else { + patchlet = mask_and_mod + (is64 ? 1 : 0); + cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 
1 : 0); + } + + new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); if (!new_prog) return -ENOMEM; diff --git a/lib/test_bpf.c b/lib/test_bpf.c index e3938e395cba..4cd9ea9b3449 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -2003,10 +2003,14 @@ static struct bpf_test tests[] = { { { 4, 0 }, { 5, 10 } } }, { - "INT: DIV by zero", + /* This one doesn't go through verifier, but is just raw insn + * as opposed to cBPF tests from here. Thus div by 0 tests are + * done in test_verifier in BPF kselftests. + */ + "INT: DIV by -1", .u.insns_int = { BPF_ALU64_REG(BPF_MOV, R6, R1), - BPF_ALU64_IMM(BPF_MOV, R7, 0), + BPF_ALU64_IMM(BPF_MOV, R7, -1), BPF_LD_ABS(BPF_B, 3), BPF_ALU32_REG(BPF_DIV, R0, R7), BPF_EXIT_INSN(), diff --git a/net/can/Kconfig b/net/can/Kconfig index a15c0e0d1fc7..a4399be54ff4 100644 --- a/net/can/Kconfig +++ b/net/can/Kconfig @@ -11,7 +11,7 @@ menuconfig CAN 1991, mainly for automotive, but now widely used in marine (NMEA2000), industrial, and medical applications. More information on the CAN network protocol family PF_CAN - is contained in <Documentation/networking/can.txt>. + is contained in <Documentation/networking/can.rst>. If you want CAN support you should say Y here and also to the specific driver for your controller(s) below. diff --git a/net/core/filter.c b/net/core/filter.c index 18da42a81d0c..08ab4c65a998 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -401,8 +401,8 @@ do_pass: /* Classic BPF expects A and X to be reset first. These need * to be guaranteed to be the first two instructions. */ - *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); - *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); + *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); /* All programs must keep CTX in callee saved BPF_REG_CTX. 
* In eBPF case it's done by the compiler, here we need to @@ -459,8 +459,15 @@ do_pass: break; if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || - fp->code == (BPF_ALU | BPF_MOD | BPF_X)) + fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X); + /* Error with exception code on div/mod by 0. + * For cBPF programs, this was always return 0. + */ + *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2); + *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *insn++ = BPF_EXIT_INSN(); + } *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); break; @@ -3232,6 +3239,29 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, ret = -EINVAL; } #ifdef CONFIG_INET +#if IS_ENABLED(CONFIG_IPV6) + } else if (level == SOL_IPV6) { + if (optlen != sizeof(int) || sk->sk_family != AF_INET6) + return -EINVAL; + + val = *((int *)optval); + /* Only some options are supported */ + switch (optname) { + case IPV6_TCLASS: + if (val < -1 || val > 0xff) { + ret = -EINVAL; + } else { + struct ipv6_pinfo *np = inet6_sk(sk); + + if (val == -1) + val = 0; + np->tclass = val; + } + break; + default: + ret = -EINVAL; + } +#endif } else if (level == SOL_TCP && sk->sk_prot->setsockopt == tcp_setsockopt) { if (optname == TCP_CONGESTION) { @@ -3241,7 +3271,8 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, strncpy(name, optval, min_t(long, optlen, TCP_CA_NAME_MAX-1)); name[TCP_CA_NAME_MAX-1] = 0; - ret = tcp_set_congestion_control(sk, name, false, reinit); + ret = tcp_set_congestion_control(sk, name, false, + reinit); } else { struct tcp_sock *tp = tcp_sk(sk); @@ -3307,6 +3338,22 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, } else { goto err_clear; } +#if IS_ENABLED(CONFIG_IPV6) + } else if (level == SOL_IPV6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + if (optlen != sizeof(int) || sk->sk_family != AF_INET6) + goto err_clear; + + /* Only some options are supported */ + switch (optname) { + case 
IPV6_TCLASS: + *((int *)optval) = (int)np->tclass; + break; + default: + goto err_clear; + } +#endif } else { goto err_clear; } @@ -3328,6 +3375,33 @@ static const struct bpf_func_proto bpf_getsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, + int, argval) +{ + struct sock *sk = bpf_sock->sk; + int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS; + + if (!sk_fullsock(sk)) + return -EINVAL; + +#ifdef CONFIG_INET + if (val) + tcp_sk(sk)->bpf_sock_ops_cb_flags = val; + + return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS); +#else + return -EINVAL; +#endif +} + +static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = { + .func = bpf_sock_ops_cb_flags_set, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -3510,6 +3584,8 @@ static const struct bpf_func_proto * return &bpf_setsockopt_proto; case BPF_FUNC_getsockopt: return &bpf_getsockopt_proto; + case BPF_FUNC_sock_ops_cb_flags_set: + return &bpf_sock_ops_cb_flags_set_proto; case BPF_FUNC_sock_map_update: return &bpf_sock_map_update_proto; default: @@ -3826,34 +3902,44 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); -static bool __is_valid_sock_ops_access(int off, int size) +static bool sock_ops_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) { + const int size_default = sizeof(__u32); + if (off < 0 || off >= sizeof(struct bpf_sock_ops)) return false; + /* The verifier guarantees that size > 0. 
*/ if (off % size != 0) return false; - if (size != sizeof(__u32)) - return false; - - return true; -} -static bool sock_ops_is_valid_access(int off, int size, - enum bpf_access_type type, - struct bpf_insn_access_aux *info) -{ if (type == BPF_WRITE) { switch (off) { - case offsetof(struct bpf_sock_ops, op) ... - offsetof(struct bpf_sock_ops, replylong[3]): + case offsetof(struct bpf_sock_ops, reply): + case offsetof(struct bpf_sock_ops, sk_txhash): + if (size != size_default) + return false; break; default: return false; } + } else { + switch (off) { + case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received, + bytes_acked): + if (size != sizeof(__u64)) + return false; + break; + default: + if (size != size_default) + return false; + break; + } } - return __is_valid_sock_ops_access(off, size); + return true; } static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, @@ -4470,10 +4556,37 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, is_fullsock)); break; -/* Helper macro for adding read access to tcp_sock fields. 
*/ -#define SOCK_OPS_GET_TCP32(FIELD_NAME) \ + case offsetof(struct bpf_sock_ops, state): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_state) != 1); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_state)); + break; + + case offsetof(struct bpf_sock_ops, rtt_min): + BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) != + sizeof(struct minmax)); + BUILD_BUG_ON(sizeof(struct minmax) < + sizeof(struct minmax_sample)); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct tcp_sock, rtt_min) + + FIELD_SIZEOF(struct minmax_sample, t)); + break; + +/* Helper macro for adding read access to tcp_sock or sock fields. */ +#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ do { \ - BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) != 4); \ + BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ + FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_fullsock), \ @@ -4485,17 +4598,159 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, struct bpf_sock_ops_kern, sk),\ si->dst_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, \ - offsetof(struct tcp_sock, FIELD_NAME)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \ + OBJ_FIELD), \ + si->dst_reg, si->dst_reg, \ + offsetof(OBJ, OBJ_FIELD)); \ + } while (0) + +/* Helper macro for adding write access to tcp_sock or sock fields. + * The macro is called with two registers, dst_reg which contains a pointer + * to ctx (context) and src_reg which contains the value that should be + * stored. 
However, we need an additional register since we cannot overwrite + * dst_reg because it may be used later in the program. + * Instead we "borrow" one of the other registers. We first save its value + * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore + * it at the end of the macro. + */ +#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ + do { \ + int reg = BPF_REG_9; \ + BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ + FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, \ + is_fullsock), \ + reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + is_fullsock)); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, sk),\ + reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, sk));\ + *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD), \ + reg, si->src_reg, \ + offsetof(OBJ, OBJ_FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + } while (0) + +#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \ + do { \ + if (TYPE == BPF_WRITE) \ + SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ + else \ + SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ } while (0) case offsetof(struct bpf_sock_ops, snd_cwnd): - SOCK_OPS_GET_TCP32(snd_cwnd); + SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, srtt_us): - SOCK_OPS_GET_TCP32(srtt_us); + SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock); break; + + case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags): + SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags, + struct tcp_sock); + break; + + 
case offsetof(struct bpf_sock_ops, snd_ssthresh): + SOCK_OPS_GET_FIELD(snd_ssthresh, snd_ssthresh, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, rcv_nxt): + SOCK_OPS_GET_FIELD(rcv_nxt, rcv_nxt, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, snd_nxt): + SOCK_OPS_GET_FIELD(snd_nxt, snd_nxt, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, snd_una): + SOCK_OPS_GET_FIELD(snd_una, snd_una, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, mss_cache): + SOCK_OPS_GET_FIELD(mss_cache, mss_cache, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, ecn_flags): + SOCK_OPS_GET_FIELD(ecn_flags, ecn_flags, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, rate_delivered): + SOCK_OPS_GET_FIELD(rate_delivered, rate_delivered, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, rate_interval_us): + SOCK_OPS_GET_FIELD(rate_interval_us, rate_interval_us, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, packets_out): + SOCK_OPS_GET_FIELD(packets_out, packets_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, retrans_out): + SOCK_OPS_GET_FIELD(retrans_out, retrans_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, total_retrans): + SOCK_OPS_GET_FIELD(total_retrans, total_retrans, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, segs_in): + SOCK_OPS_GET_FIELD(segs_in, segs_in, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, data_segs_in): + SOCK_OPS_GET_FIELD(data_segs_in, data_segs_in, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, segs_out): + SOCK_OPS_GET_FIELD(segs_out, segs_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, data_segs_out): + SOCK_OPS_GET_FIELD(data_segs_out, data_segs_out, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, lost_out): + SOCK_OPS_GET_FIELD(lost_out, lost_out, struct tcp_sock); + break; 
+ + case offsetof(struct bpf_sock_ops, sacked_out): + SOCK_OPS_GET_FIELD(sacked_out, sacked_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, sk_txhash): + SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash, + struct sock, type); + break; + + case offsetof(struct bpf_sock_ops, bytes_received): + SOCK_OPS_GET_FIELD(bytes_received, bytes_received, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, bytes_acked): + SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock); + break; + } return insn - insn_buf; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d7cf861bf699..f013ddc191e0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -463,7 +463,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op) tcp_mtup_init(sk); icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); - tcp_call_bpf(sk, bpf_op); + tcp_call_bpf(sk, bpf_op, 0, NULL); tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); } @@ -2042,6 +2042,30 @@ void tcp_set_state(struct sock *sk, int state) { int oldstate = sk->sk_state; + /* We defined a new enum for TCP states that are exported in BPF + * so as not force the internal TCP states to be frozen. The + * following checks will detect if an internal state value ever + * differs from the BPF value. If this ever happens, then we will + * need to remap the internal value to the BPF value before calling + * tcp_call_bpf_2arg. 
+ */ + BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED); + BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT); + BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV); + BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1); + BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2); + BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT); + BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE); + BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT); + BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK); + BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN); + BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING); + BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV); + BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES); + + if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG)) + tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state); + switch (state) { case TCP_ESTABLISHED: if (oldstate != TCP_ESTABLISHED) diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c index 0b5a05bd82e3..ddbce73edae8 100644 --- a/net/ipv4/tcp_nv.c +++ b/net/ipv4/tcp_nv.c @@ -146,7 +146,7 @@ static void tcpnv_init(struct sock *sk) * within a datacenter, where we have reasonable estimates of * RTTs */ - base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT); + base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT, 0, NULL); if (base_rtt > 0) { ca->nv_base_rtt = base_rtt; ca->nv_lower_bound_rtt = (base_rtt * 205) >> 8; /* 80% */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 95461f02ac9a..e9f985e42405 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2905,6 +2905,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } + if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG)) + tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB, + TCP_SKB_CB(skb)->seq, segs, err); + if (likely(!err)) { 
TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; trace_tcp_retransmit_skb(sk, skb); @@ -3469,7 +3473,7 @@ int tcp_connect(struct sock *sk) struct sk_buff *buff; int err; - tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB); + tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL); if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) return -EHOSTUNREACH; /* Routing failure or similar. */ diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 6db3124cdbda..257abdde23b0 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -213,11 +213,18 @@ static int tcp_write_timeout(struct sock *sk) icsk->icsk_user_timeout); } tcp_fastopen_active_detect_blackhole(sk, expired); + + if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG)) + tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB, + icsk->icsk_retransmits, + icsk->icsk_rto, (int)expired); + if (expired) { /* Has it gone just too far? */ tcp_write_err(sk); return 1; } + return 0; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index aa4411c81e7e..fe3966a9c999 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2440,7 +2440,8 @@ static int ip6_convert_metrics(struct mx6_config *mxc, static struct rt6_info *ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, - const struct in6_addr *gw_addr) + const struct in6_addr *gw_addr, + u32 tbid, int flags) { struct flowi6 fl6 = { .flowi6_oif = cfg->fc_ifindex, @@ -2449,15 +2450,15 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net, }; struct fib6_table *table; struct rt6_info *rt; - int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE; - table = fib6_get_table(net, cfg->fc_table); + table = fib6_get_table(net, tbid); if (!table) return NULL; if (!ipv6_addr_any(&cfg->fc_prefsrc)) flags |= RT6_LOOKUP_F_HAS_SADDR; + flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags); /* if table lookup failed, fall back to full lookup */ @@ -2469,6 +2470,82 @@ static struct rt6_info *ip6_nh_lookup_table(struct 
net *net, return rt; } +static int ip6_route_check_nh_onlink(struct net *net, + struct fib6_config *cfg, + struct net_device *dev, + struct netlink_ext_ack *extack) +{ + u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL; + const struct in6_addr *gw_addr = &cfg->fc_gateway; + u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; + struct rt6_info *grt; + int err; + + err = 0; + grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); + if (grt) { + if (grt->rt6i_flags & flags || dev != grt->dst.dev) { + NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); + err = -EINVAL; + } + + ip6_rt_put(grt); + } + + return err; +} + +static int ip6_route_check_nh(struct net *net, + struct fib6_config *cfg, + struct net_device **_dev, + struct inet6_dev **idev) +{ + const struct in6_addr *gw_addr = &cfg->fc_gateway; + struct net_device *dev = _dev ? *_dev : NULL; + struct rt6_info *grt = NULL; + int err = -EHOSTUNREACH; + + if (cfg->fc_table) { + int flags = RT6_LOOKUP_F_IFACE; + + grt = ip6_nh_lookup_table(net, cfg, gw_addr, + cfg->fc_table, flags); + if (grt) { + if (grt->rt6i_flags & RTF_GATEWAY || + (dev && dev != grt->dst.dev)) { + ip6_rt_put(grt); + grt = NULL; + } + } + } + + if (!grt) + grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1); + + if (!grt) + goto out; + + if (dev) { + if (dev != grt->dst.dev) { + ip6_rt_put(grt); + goto out; + } + } else { + *_dev = dev = grt->dst.dev; + *idev = grt->rt6i_idev; + dev_hold(dev); + in6_dev_hold(grt->rt6i_idev); + } + + if (!(grt->rt6i_flags & RTF_GATEWAY)) + err = 0; + + ip6_rt_put(grt); + +out: + return err; +} + static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, struct netlink_ext_ack *extack) { @@ -2520,6 +2597,21 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, if (cfg->fc_metric == 0) cfg->fc_metric = IP6_RT_PRIO_USER; + if (cfg->fc_flags & RTNH_F_ONLINK) { + if (!dev) { + NL_SET_ERR_MSG(extack, + "Nexthop device required for onlink"); + err = -ENODEV; + goto out; + } + + if 
(!(dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Nexthop device is not up"); + err = -ENETDOWN; + goto out; + } + } + err = -ENOBUFS; if (cfg->fc_nlinfo.nlh && !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { @@ -2664,8 +2756,6 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, rt->rt6i_gateway = *gw_addr; if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { - struct rt6_info *grt = NULL; - /* IPv6 strictly inhibits using not link-local addresses as nexthop address. Otherwise, router will not able to send redirects. @@ -2682,40 +2772,12 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } - if (cfg->fc_table) { - grt = ip6_nh_lookup_table(net, cfg, gw_addr); - - if (grt) { - if (grt->rt6i_flags & RTF_GATEWAY || - (dev && dev != grt->dst.dev)) { - ip6_rt_put(grt); - grt = NULL; - } - } - } - - if (!grt) - grt = rt6_lookup(net, gw_addr, NULL, - cfg->fc_ifindex, 1); - - err = -EHOSTUNREACH; - if (!grt) - goto out; - if (dev) { - if (dev != grt->dst.dev) { - ip6_rt_put(grt); - goto out; - } + if (cfg->fc_flags & RTNH_F_ONLINK) { + err = ip6_route_check_nh_onlink(net, cfg, dev, + extack); } else { - dev = grt->dst.dev; - idev = grt->rt6i_idev; - dev_hold(dev); - in6_dev_hold(grt->rt6i_idev); + err = ip6_route_check_nh(net, cfg, &dev, &idev); } - if (!(grt->rt6i_flags & RTF_GATEWAY)) - err = 0; - ip6_rt_put(grt); - if (err) goto out; } @@ -2757,6 +2819,7 @@ install_route: if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) && !netif_carrier_ok(dev)) rt->rt6i_nh_flags |= RTNH_F_LINKDOWN; + rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); rt->dst.dev = dev; rt->rt6i_idev = idev; rt->rt6i_table = table; @@ -3826,6 +3889,8 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, if (rtm->rtm_flags & RTM_F_CLONED) cfg->fc_flags |= RTF_CACHE; + cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); + cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; cfg->fc_nlinfo.nlh = nlh; 
cfg->fc_nlinfo.nl_net = sock_net(skb->sk); @@ -4231,6 +4296,7 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, goto nla_put_failure; } + *flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK); if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD) *flags |= RTNH_F_OFFLOAD; diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index cf0e11978b66..267e68379110 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -115,7 +115,6 @@ static int smc_release(struct socket *sock) goto out; smc = smc_sk(sk); - sock_hold(sk); if (sk->sk_state == SMC_LISTEN) /* smc_close_non_accepted() is called and acquires * sock lock for child sockets again @@ -124,10 +123,7 @@ static int smc_release(struct socket *sock) else lock_sock(sk); - if (smc->use_fallback) { - sk->sk_state = SMC_CLOSED; - sk->sk_state_change(sk); - } else { + if (!smc->use_fallback) { rc = smc_close_active(smc); sock_set_flag(sk, SOCK_DEAD); sk->sk_shutdown |= SHUTDOWN_MASK; @@ -136,20 +132,21 @@ static int smc_release(struct socket *sock) sock_release(smc->clcsock); smc->clcsock = NULL; } + if (smc->use_fallback) { + sock_put(sk); /* passive closing */ + sk->sk_state = SMC_CLOSED; + sk->sk_state_change(sk); + } /* detach socket */ sock_orphan(sk); sock->sk = NULL; - if (smc->use_fallback) { - schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN); - } else if (sk->sk_state == SMC_CLOSED) { + if (!smc->use_fallback && sk->sk_state == SMC_CLOSED) smc_conn_free(&smc->conn); - schedule_delayed_work(&smc->sock_put_work, - SMC_CLOSE_SOCK_PUT_DELAY); - } release_sock(sk); - sock_put(sk); + sk->sk_prot->unhash(sk); + sock_put(sk); /* final sock_put */ out: return rc; } @@ -181,7 +178,6 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock) INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_LIST_HEAD(&smc->accept_q); spin_lock_init(&smc->accept_q_lock); - INIT_DELAYED_WORK(&smc->sock_put_work, smc_close_sock_put_work); sk->sk_prot->hash(sk); sk_refcnt_debug_inc(sk); @@ -399,6 +395,8 @@ 
static int smc_connect_rdma(struct smc_sock *smc) int rc = 0; u8 ibport; + sock_hold(&smc->sk); /* sock put in passive closing */ + if (!tcp_sk(smc->clcsock->sk)->syn_smc) { /* peer has not signalled SMC-capability */ smc->use_fallback = true; @@ -542,6 +540,8 @@ out_err_unlock: mutex_unlock(&smc_create_lgr_pending); smc_conn_free(&smc->conn); out_err: + if (smc->sk.sk_state == SMC_INIT) + sock_put(&smc->sk); /* passive closing */ return rc; } @@ -620,7 +620,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) new_sk->sk_state = SMC_CLOSED; sock_set_flag(new_sk, SOCK_DEAD); new_sk->sk_prot->unhash(new_sk); - sock_put(new_sk); + sock_put(new_sk); /* final */ *new_smc = NULL; goto out; } @@ -637,7 +637,7 @@ static void smc_accept_enqueue(struct sock *parent, struct sock *sk) { struct smc_sock *par = smc_sk(parent); - sock_hold(sk); + sock_hold(sk); /* sock_put in smc_accept_unlink () */ spin_lock(&par->accept_q_lock); list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q); spin_unlock(&par->accept_q_lock); @@ -653,7 +653,7 @@ static void smc_accept_unlink(struct sock *sk) list_del_init(&smc_sk(sk)->accept_q); spin_unlock(&par->accept_q_lock); sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk); - sock_put(sk); + sock_put(sk); /* sock_hold in smc_accept_enqueue */ } /* remove a sock from the accept queue to bind it to a new socket created @@ -670,8 +670,12 @@ struct sock *smc_accept_dequeue(struct sock *parent, smc_accept_unlink(new_sk); if (new_sk->sk_state == SMC_CLOSED) { + if (isk->clcsock) { + sock_release(isk->clcsock); + isk->clcsock = NULL; + } new_sk->sk_prot->unhash(new_sk); - sock_put(new_sk); + sock_put(new_sk); /* final */ continue; } if (new_sock) @@ -686,14 +690,11 @@ void smc_close_non_accepted(struct sock *sk) { struct smc_sock *smc = smc_sk(sk); - sock_hold(sk); lock_sock(sk); if (!sk->sk_lingertime) /* wait for peer closing */ sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT; - if (smc->use_fallback) { - sk->sk_state = 
SMC_CLOSED; - } else { + if (!smc->use_fallback) { smc_close_active(smc); sock_set_flag(sk, SOCK_DEAD); sk->sk_shutdown |= SHUTDOWN_MASK; @@ -706,14 +707,15 @@ void smc_close_non_accepted(struct sock *sk) sock_release(tcp); } if (smc->use_fallback) { - schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN); - } else if (sk->sk_state == SMC_CLOSED) { - smc_conn_free(&smc->conn); - schedule_delayed_work(&smc->sock_put_work, - SMC_CLOSE_SOCK_PUT_DELAY); + sock_put(sk); /* passive closing */ + sk->sk_state = SMC_CLOSED; + } else { + if (sk->sk_state == SMC_CLOSED) + smc_conn_free(&smc->conn); } release_sock(sk); - sock_put(sk); + sk->sk_prot->unhash(sk); + sock_put(sk); /* final sock_put */ } static int smc_serv_conf_first_link(struct smc_sock *smc) @@ -937,6 +939,8 @@ out_err_unlock: smc_lgr_forget(new_smc->conn.lgr); mutex_unlock(&smc_create_lgr_pending); out_err: + if (newsmcsk->sk_state == SMC_INIT) + sock_put(&new_smc->sk); /* passive closing */ newsmcsk->sk_state = SMC_CLOSED; smc_conn_free(&new_smc->conn); goto enqueue; /* queue new sock with sk_err set */ @@ -963,12 +967,22 @@ static void smc_tcp_listen_work(struct work_struct *work) sock_hold(lsk); /* sock_put in smc_listen_work */ INIT_WORK(&new_smc->smc_listen_work, smc_listen_work); smc_copy_sock_settings_to_smc(new_smc); - schedule_work(&new_smc->smc_listen_work); + sock_hold(&new_smc->sk); /* sock_put in passive closing */ + if (!schedule_work(&new_smc->smc_listen_work)) + sock_put(&new_smc->sk); } out: + if (lsmc->clcsock) { + sock_release(lsmc->clcsock); + lsmc->clcsock = NULL; + } release_sock(lsk); - lsk->sk_data_ready(lsk); /* no more listening, wake accept */ + /* no more listening, wake up smc_close_wait_listen_clcsock and + * accept + */ + lsk->sk_state_change(lsk); + sock_put(&lsmc->sk); /* sock_hold in smc_listen */ } static int smc_listen(struct socket *sock, int backlog) @@ -1002,7 +1016,9 @@ static int smc_listen(struct socket *sock, int backlog) sk->sk_ack_backlog = 0; sk->sk_state = 
SMC_LISTEN; INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); - schedule_work(&smc->tcp_listen_work); + sock_hold(sk); /* sock_hold in tcp_listen_worker */ + if (!schedule_work(&smc->tcp_listen_work)) + sock_put(sk); out: release_sock(sk); @@ -1019,6 +1035,7 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, int rc = 0; lsmc = smc_sk(sk); + sock_hold(sk); /* sock_put below */ lock_sock(sk); if (lsmc->sk.sk_state != SMC_LISTEN) { @@ -1053,6 +1070,7 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, out: release_sock(sk); + sock_put(sk); /* sock_hold above */ return rc; } @@ -1122,21 +1140,15 @@ out: static unsigned int smc_accept_poll(struct sock *parent) { - struct smc_sock *isk; - struct sock *sk; + struct smc_sock *isk = smc_sk(parent); + int mask = 0; - lock_sock(parent); - list_for_each_entry(isk, &smc_sk(parent)->accept_q, accept_q) { - sk = (struct sock *)isk; + spin_lock(&isk->accept_q_lock); + if (!list_empty(&isk->accept_q)) + mask = POLLIN | POLLRDNORM; + spin_unlock(&isk->accept_q_lock); - if (sk->sk_state == SMC_ACTIVE) { - release_sock(parent); - return POLLIN | POLLRDNORM; - } - } - release_sock(parent); - - return 0; + return mask; } static unsigned int smc_poll(struct file *file, struct socket *sock, @@ -1147,9 +1159,15 @@ static unsigned int smc_poll(struct file *file, struct socket *sock, struct smc_sock *smc; int rc; + if (!sk) + return POLLNVAL; + smc = smc_sk(sock->sk); + sock_hold(sk); + lock_sock(sk); if ((sk->sk_state == SMC_INIT) || smc->use_fallback) { /* delegate to CLC child sock */ + release_sock(sk); mask = smc->clcsock->ops->poll(file, smc->clcsock, wait); /* if non-blocking connect finished ... 
*/ lock_sock(sk); @@ -1161,37 +1179,43 @@ static unsigned int smc_poll(struct file *file, struct socket *sock, rc = smc_connect_rdma(smc); if (rc < 0) mask |= POLLERR; - else - /* success cases including fallback */ - mask |= POLLOUT | POLLWRNORM; + /* success cases including fallback */ + mask |= POLLOUT | POLLWRNORM; } } - release_sock(sk); } else { - sock_poll_wait(file, sk_sleep(sk), wait); - if (sk->sk_state == SMC_LISTEN) - /* woken up by sk_data_ready in smc_listen_work() */ - mask |= smc_accept_poll(sk); + if (sk->sk_state != SMC_CLOSED) { + release_sock(sk); + sock_poll_wait(file, sk_sleep(sk), wait); + lock_sock(sk); + } if (sk->sk_err) mask |= POLLERR; - if (atomic_read(&smc->conn.sndbuf_space) || - (sk->sk_shutdown & SEND_SHUTDOWN)) { - mask |= POLLOUT | POLLWRNORM; - } else { - sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - } - if (atomic_read(&smc->conn.bytes_to_rcv)) - mask |= POLLIN | POLLRDNORM; if ((sk->sk_shutdown == SHUTDOWN_MASK) || (sk->sk_state == SMC_CLOSED)) mask |= POLLHUP; - if (sk->sk_shutdown & RCV_SHUTDOWN) - mask |= POLLIN | POLLRDNORM | POLLRDHUP; - if (sk->sk_state == SMC_APPCLOSEWAIT1) - mask |= POLLIN; + if (sk->sk_state == SMC_LISTEN) { + /* woken up by sk_data_ready in smc_listen_work() */ + mask = smc_accept_poll(sk); + } else { + if (atomic_read(&smc->conn.sndbuf_space) || + sk->sk_shutdown & SEND_SHUTDOWN) { + mask |= POLLOUT | POLLWRNORM; + } else { + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + } + if (atomic_read(&smc->conn.bytes_to_rcv)) + mask |= POLLIN | POLLRDNORM; + if (sk->sk_shutdown & RCV_SHUTDOWN) + mask |= POLLIN | POLLRDNORM | POLLRDHUP; + if (sk->sk_state == SMC_APPCLOSEWAIT1) + mask |= POLLIN; + } } + release_sock(sk); + sock_put(sk); return mask; } diff --git a/net/smc/smc.h b/net/smc/smc.h index 0bee9d16cf29..9518986c97b1 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -178,7 +178,6 @@ struct smc_sock { /* smc sock 
container */ struct work_struct smc_listen_work;/* prepare new accept socket */ struct list_head accept_q; /* sockets to be accepted */ spinlock_t accept_q_lock; /* protects accept_q */ - struct delayed_work sock_put_work; /* final socket freeing */ bool use_fallback; /* fallback to tcp */ u8 wait_close_tx_prepared : 1; /* shutdown wr or close @@ -253,12 +252,12 @@ static inline int smc_uncompress_bufsize(u8 compressed) static inline bool using_ipsec(struct smc_sock *smc) { return (smc->clcsock->sk->sk_policy[0] || - smc->clcsock->sk->sk_policy[1]) ? 1 : 0; + smc->clcsock->sk->sk_policy[1]) ? true : false; } #else static inline bool using_ipsec(struct smc_sock *smc) { - return 0; + return false; } #endif diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 6e8f5fbe0f09..3cd086e5bd28 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -212,6 +212,14 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, smc->sk.sk_data_ready(&smc->sk); } + /* piggy backed tx info */ + /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */ + if (diff_cons && smc_tx_prepared_sends(conn)) { + smc_tx_sndbuf_nonempty(conn); + /* trigger socket release if connection closed */ + smc_close_wake_tx_prepared(smc); + } + if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) { smc->sk.sk_err = ECONNRESET; conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; @@ -221,15 +229,9 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, if (smc->clcsock && smc->clcsock->sk) smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN; sock_set_flag(&smc->sk, SOCK_DONE); - schedule_work(&conn->close_work); - } - - /* piggy backed tx info */ - /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */ - if (diff_cons && smc_tx_prepared_sends(conn)) { - smc_tx_sndbuf_nonempty(conn); - /* trigger socket release if connection closed */ - smc_close_wake_tx_prepared(smc); + sock_hold(&smc->sk); /* sock_put in close_work */ + if (!schedule_work(&conn->close_work)) + 
sock_put(&smc->sk); } } diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index babe05d385e7..e339c0186dcf 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -19,6 +19,8 @@ #include "smc_cdc.h" #include "smc_close.h" +#define SMC_CLOSE_WAIT_LISTEN_CLCSOCK_TIME (5 * HZ) + static void smc_close_cleanup_listen(struct sock *parent) { struct sock *sk; @@ -28,6 +30,27 @@ static void smc_close_cleanup_listen(struct sock *parent) smc_close_non_accepted(sk); } +static void smc_close_wait_listen_clcsock(struct smc_sock *smc) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + struct sock *sk = &smc->sk; + signed long timeout; + + timeout = SMC_CLOSE_WAIT_LISTEN_CLCSOCK_TIME; + add_wait_queue(sk_sleep(sk), &wait); + do { + release_sock(sk); + if (smc->clcsock) + timeout = wait_woken(&wait, TASK_UNINTERRUPTIBLE, + timeout); + sched_annotate_sleep(); + lock_sock(sk); + if (!smc->clcsock) + break; + } while (timeout); + remove_wait_queue(sk_sleep(sk), &wait); +} + /* wait for sndbuf data being transmitted */ static void smc_close_stream_wait(struct smc_sock *smc, long timeout) { @@ -110,10 +133,10 @@ static void smc_close_active_abort(struct smc_sock *smc) release_sock(sk); cancel_delayed_work_sync(&smc->conn.tx_work); lock_sock(sk); + sock_put(sk); /* passive closing */ break; case SMC_APPCLOSEWAIT1: case SMC_APPCLOSEWAIT2: - sock_release(smc->clcsock); if (!smc_cdc_rxed_any_close(&smc->conn)) sk->sk_state = SMC_PEERABORTWAIT; else @@ -125,19 +148,20 @@ static void smc_close_active_abort(struct smc_sock *smc) case SMC_PEERCLOSEWAIT1: case SMC_PEERCLOSEWAIT2: if (!txflags->peer_conn_closed) { + /* just SHUTDOWN_SEND done */ sk->sk_state = SMC_PEERABORTWAIT; - sock_release(smc->clcsock); } else { sk->sk_state = SMC_CLOSED; } + sock_put(sk); /* passive closing */ break; case SMC_PROCESSABORT: case SMC_APPFINCLOSEWAIT: - if (!txflags->peer_conn_closed) - sock_release(smc->clcsock); sk->sk_state = SMC_CLOSED; break; case SMC_PEERFINCLOSEWAIT: + sock_put(sk); /* 
passive closing */ + break; case SMC_PEERABORTWAIT: case SMC_CLOSED: break; @@ -172,8 +196,6 @@ again: switch (sk->sk_state) { case SMC_INIT: sk->sk_state = SMC_CLOSED; - if (smc->smc_listen_work.func) - cancel_work_sync(&smc->smc_listen_work); break; case SMC_LISTEN: sk->sk_state = SMC_CLOSED; @@ -182,11 +204,9 @@ again: rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); /* wake up kernel_accept of smc_tcp_listen_worker */ smc->clcsock->sk->sk_data_ready(smc->clcsock->sk); + smc_close_wait_listen_clcsock(smc); } - release_sock(sk); smc_close_cleanup_listen(sk); - cancel_work_sync(&smc->smc_listen_work); - lock_sock(sk); break; case SMC_ACTIVE: smc_close_stream_wait(smc, timeout); @@ -229,12 +249,14 @@ again: rc = smc_close_final(conn); if (rc) break; - if (smc_cdc_rxed_any_close(conn)) + if (smc_cdc_rxed_any_close(conn)) { /* peer has closed the socket already */ sk->sk_state = SMC_CLOSED; - else + sock_put(sk); /* postponed passive closing */ + } else { /* peer has just issued a shutdown write */ sk->sk_state = SMC_PEERFINCLOSEWAIT; + } break; case SMC_PEERCLOSEWAIT1: case SMC_PEERCLOSEWAIT2: @@ -272,27 +294,33 @@ static void smc_close_passive_abort_received(struct smc_sock *smc) struct sock *sk = &smc->sk; switch (sk->sk_state) { + case SMC_INIT: case SMC_ACTIVE: - case SMC_APPFINCLOSEWAIT: case SMC_APPCLOSEWAIT1: - case SMC_APPCLOSEWAIT2: + sk->sk_state = SMC_PROCESSABORT; + sock_put(sk); /* passive closing */ + break; + case SMC_APPFINCLOSEWAIT: sk->sk_state = SMC_PROCESSABORT; break; case SMC_PEERCLOSEWAIT1: case SMC_PEERCLOSEWAIT2: if (txflags->peer_done_writing && - !smc_close_sent_any_close(&smc->conn)) { + !smc_close_sent_any_close(&smc->conn)) /* just shutdown, but not yet closed locally */ sk->sk_state = SMC_PROCESSABORT; - } else { + else sk->sk_state = SMC_CLOSED; - } + sock_put(sk); /* passive closing */ break; + case SMC_APPCLOSEWAIT2: case SMC_PEERFINCLOSEWAIT: + sk->sk_state = SMC_CLOSED; + sock_put(sk); /* passive closing */ + break; case 
SMC_PEERABORTWAIT: sk->sk_state = SMC_CLOSED; break; - case SMC_INIT: case SMC_PROCESSABORT: /* nothing to do, add tracing in future patch */ break; @@ -336,13 +364,18 @@ static void smc_close_passive_work(struct work_struct *work) case SMC_INIT: if (atomic_read(&conn->bytes_to_rcv) || (rxflags->peer_done_writing && - !smc_cdc_rxed_any_close(conn))) + !smc_cdc_rxed_any_close(conn))) { sk->sk_state = SMC_APPCLOSEWAIT1; - else + } else { sk->sk_state = SMC_CLOSED; + sock_put(sk); /* passive closing */ + } break; case SMC_ACTIVE: sk->sk_state = SMC_APPCLOSEWAIT1; + /* postpone sock_put() for passive closing to cover + * received SEND_SHUTDOWN as well + */ break; case SMC_PEERCLOSEWAIT1: if (rxflags->peer_done_writing) @@ -360,13 +393,20 @@ static void smc_close_passive_work(struct work_struct *work) /* just shutdown, but not yet closed locally */ sk->sk_state = SMC_APPFINCLOSEWAIT; } + sock_put(sk); /* passive closing */ break; case SMC_PEERFINCLOSEWAIT: - if (smc_cdc_rxed_any_close(conn)) + if (smc_cdc_rxed_any_close(conn)) { sk->sk_state = SMC_CLOSED; + sock_put(sk); /* passive closing */ + } break; case SMC_APPCLOSEWAIT1: case SMC_APPCLOSEWAIT2: + /* postpone sock_put() for passive closing to cover + * received SEND_SHUTDOWN as well + */ + break; case SMC_APPFINCLOSEWAIT: case SMC_PEERABORTWAIT: case SMC_PROCESSABORT: @@ -382,23 +422,11 @@ wakeup: if (old_state != sk->sk_state) { sk->sk_state_change(sk); if ((sk->sk_state == SMC_CLOSED) && - (sock_flag(sk, SOCK_DEAD) || !sk->sk_socket)) { + (sock_flag(sk, SOCK_DEAD) || !sk->sk_socket)) smc_conn_free(conn); - schedule_delayed_work(&smc->sock_put_work, - SMC_CLOSE_SOCK_PUT_DELAY); - } } release_sock(sk); -} - -void smc_close_sock_put_work(struct work_struct *work) -{ - struct smc_sock *smc = container_of(to_delayed_work(work), - struct smc_sock, - sock_put_work); - - smc->sk.sk_prot->unhash(&smc->sk); - sock_put(&smc->sk); + sock_put(sk); /* sock_hold done by schedulers of close_work */ } int 
smc_close_shutdown_write(struct smc_sock *smc) diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h index 8c498885d758..19eb6a211c23 100644 --- a/net/smc/smc_close.h +++ b/net/smc/smc_close.h @@ -21,7 +21,6 @@ void smc_close_wake_tx_prepared(struct smc_sock *smc); int smc_close_active(struct smc_sock *smc); -void smc_close_sock_put_work(struct work_struct *work); int smc_close_shutdown_write(struct smc_sock *smc); void smc_close_init(struct smc_sock *smc); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index ed5b46d1fe41..2424c7100aaf 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -328,13 +328,13 @@ void smc_lgr_terminate(struct smc_link_group *lgr) while (node) { conn = rb_entry(node, struct smc_connection, alert_node); smc = container_of(conn, struct smc_sock, conn); - sock_hold(&smc->sk); + sock_hold(&smc->sk); /* sock_put in close work */ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; __smc_lgr_unregister_conn(conn); write_unlock_bh(&lgr->conns_lock); - schedule_work(&conn->close_work); + if (!schedule_work(&conn->close_work)) + sock_put(&smc->sk); write_lock_bh(&lgr->conns_lock); - sock_put(&smc->sk); node = rb_first(&lgr->conns_all); } write_unlock_bh(&lgr->conns_lock); diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 90f1a7f9085c..2a8957bd6d38 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -141,6 +141,17 @@ out: return rc; } +static void smc_ib_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) +{ + struct smc_link_group *lgr, *l; + + list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { + if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && + lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) + smc_lgr_terminate(lgr); + } +} + /* process context wrapper for might_sleep smc_ib_remember_port_attr */ static void smc_ib_port_event_work(struct work_struct *work) { @@ -151,6 +162,8 @@ static void smc_ib_port_event_work(struct work_struct *work) for_each_set_bit(port_idx, &smcibdev->port_event_mask, 
SMC_MAX_PORTS) { smc_ib_remember_port_attr(smcibdev, port_idx + 1); clear_bit(port_idx, &smcibdev->port_event_mask); + if (!smc_ib_port_active(smcibdev, port_idx + 1)) + smc_ib_port_terminate(smcibdev, port_idx + 1); } } @@ -165,15 +178,7 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler, switch (ibevent->event) { case IB_EVENT_PORT_ERR: - port_idx = ibevent->element.port_num - 1; - set_bit(port_idx, &smcibdev->port_event_mask); - schedule_work(&smcibdev->port_event_work); - /* fall through */ case IB_EVENT_DEVICE_FATAL: - /* tbd in follow-on patch: - * abnormal close of corresponding connections - */ - break; case IB_EVENT_PORT_ACTIVE: port_idx = ibevent->element.port_num - 1; set_bit(port_idx, &smcibdev->port_event_mask); @@ -186,7 +191,8 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler, void smc_ib_dealloc_protection_domain(struct smc_link *lnk) { - ib_dealloc_pd(lnk->roce_pd); + if (lnk->roce_pd) + ib_dealloc_pd(lnk->roce_pd); lnk->roce_pd = NULL; } @@ -203,14 +209,18 @@ int smc_ib_create_protection_domain(struct smc_link *lnk) static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) { + struct smc_ib_device *smcibdev = + (struct smc_ib_device *)ibevent->device; + u8 port_idx; + switch (ibevent->event) { case IB_EVENT_DEVICE_FATAL: case IB_EVENT_GID_CHANGE: case IB_EVENT_PORT_ERR: case IB_EVENT_QP_ACCESS_ERR: - /* tbd in follow-on patch: - * abnormal close of corresponding connections - */ + port_idx = ibevent->element.port_num - 1; + set_bit(port_idx, &smcibdev->port_event_mask); + schedule_work(&smcibdev->port_event_work); break; default: break; @@ -219,7 +229,8 @@ static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) void smc_ib_destroy_queue_pair(struct smc_link *lnk) { - ib_destroy_qp(lnk->roce_qp); + if (lnk->roce_qp) + ib_destroy_qp(lnk->roce_qp); lnk->roce_qp = NULL; } @@ -462,6 +473,7 @@ static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev) { if 
(!smcibdev->initialized) return; + smcibdev->initialized = 0; smc_wr_remove_dev(smcibdev); ib_unregister_event_handler(&smcibdev->event_handler); ib_destroy_cq(smcibdev->roce_cq_recv); diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 92b4648e75ca..8e70291e586a 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -147,8 +147,8 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, if (!x->type_offload) return -EINVAL; - /* We don't yet support UDP encapsulation, TFC padding and ESN. */ - if (x->encap || x->tfcpad || (x->props.flags & XFRM_STATE_ESN)) + /* We don't yet support UDP encapsulation and TFC padding. */ + if (x->encap || x->tfcpad) return -EINVAL; dev = dev_get_by_index(net, xuo->ifindex); @@ -178,6 +178,13 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, return 0; } + if (x->props.flags & XFRM_STATE_ESN && + !dev->xfrmdev_ops->xdo_dev_state_advance_esn) { + xso->dev = NULL; + dev_put(dev); + return -EINVAL; + } + xso->dev = dev; xso->num_exthdrs = 1; xso->flags = xuo->flags; diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 02501817227b..1d38c6acf8af 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -551,6 +551,8 @@ static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq) bitnr = replay_esn->replay_window - (diff - pos); } + xfrm_dev_state_advance_esn(x); + nr = bitnr >> 5; bitnr = bitnr & 0x1F; replay_esn->bmp[nr] |= (1U << bitnr); diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 7f61a3d57fa7..64335bb94f9f 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -201,13 +201,16 @@ CLANG_ARCH_ARGS = -target $(ARCH) endif # Trick to allow make to be run from this directory -all: +all: $(LIBBPF) $(MAKE) -C ../../ $(CURDIR)/ clean: $(MAKE) -C ../../ M=$(CURDIR) clean @rm -f *~ +$(LIBBPF): FORCE + $(MAKE) -C $(dir $@) $(notdir $@) + $(obj)/syscall_nrs.s: $(src)/syscall_nrs.c $(call if_changed_dep,cc_s_c) diff --git 
a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c index 7cc9d228216f..7c25c0c112bc 100644 --- a/samples/sockmap/sockmap_user.c +++ b/samples/sockmap/sockmap_user.c @@ -23,8 +23,11 @@ #include <stdbool.h> #include <signal.h> #include <fcntl.h> +#include <sys/wait.h> +#include <time.h> #include <sys/time.h> +#include <sys/resource.h> #include <sys/types.h> #include <linux/netlink.h> @@ -35,6 +38,8 @@ #include <assert.h> #include <libgen.h> +#include <getopt.h> + #include "../bpf/bpf_load.h" #include "../bpf/bpf_util.h" #include "../bpf/libbpf.h" @@ -46,15 +51,42 @@ void running_handler(int a); #define S1_PORT 10000 #define S2_PORT 10001 -static int sockmap_test_sockets(int rate, int dot) +/* global sockets */ +int s1, s2, c1, c2, p1, p2; + +static const struct option long_options[] = { + {"help", no_argument, NULL, 'h' }, + {"cgroup", required_argument, NULL, 'c' }, + {"rate", required_argument, NULL, 'r' }, + {"verbose", no_argument, NULL, 'v' }, + {"iov_count", required_argument, NULL, 'i' }, + {"length", required_argument, NULL, 'l' }, + {"test", required_argument, NULL, 't' }, + {0, 0, NULL, 0 } +}; + +static void usage(char *argv[]) +{ + int i; + + printf(" Usage: %s --cgroup <cgroup_path>\n", argv[0]); + printf(" options:\n"); + for (i = 0; long_options[i].name != 0; i++) { + printf(" --%-12s", long_options[i].name); + if (long_options[i].flag != NULL) + printf(" flag (internal value:%d)\n", + *long_options[i].flag); + else + printf(" -%c\n", long_options[i].val); + } + printf("\n"); +} + +static int sockmap_init_sockets(void) { - int i, sc, err, max_fd, one = 1; - int s1, s2, c1, c2, p1, p2; + int i, err, one = 1; struct sockaddr_in addr; - struct timeval timeout; - char buf[1024] = {0}; int *fds[4] = {&s1, &s2, &c1, &c2}; - fd_set w; s1 = s2 = p1 = p2 = c1 = c2 = 0; @@ -63,8 +95,7 @@ static int sockmap_test_sockets(int rate, int dot) *fds[i] = socket(AF_INET, SOCK_STREAM, 0); if (*fds[i] < 0) { perror("socket s1 failed()"); - err = *fds[i]; - 
goto out; + return errno; } } @@ -74,16 +105,16 @@ static int sockmap_test_sockets(int rate, int dot) (char *)&one, sizeof(one)); if (err) { perror("setsockopt failed()"); - goto out; + return errno; } } /* Non-blocking sockets */ - for (i = 0; i < 4; i++) { + for (i = 0; i < 2; i++) { err = ioctl(*fds[i], FIONBIO, (char *)&one); if (err < 0) { perror("ioctl s1 failed()"); - goto out; + return errno; } } @@ -96,14 +127,14 @@ static int sockmap_test_sockets(int rate, int dot) err = bind(s1, (struct sockaddr *)&addr, sizeof(addr)); if (err < 0) { perror("bind s1 failed()\n"); - goto out; + return errno; } addr.sin_port = htons(S2_PORT); err = bind(s2, (struct sockaddr *)&addr, sizeof(addr)); if (err < 0) { perror("bind s2 failed()\n"); - goto out; + return errno; } /* Listen server sockets */ @@ -111,14 +142,14 @@ static int sockmap_test_sockets(int rate, int dot) err = listen(s1, 32); if (err < 0) { perror("listen s1 failed()\n"); - goto out; + return errno; } addr.sin_port = htons(S2_PORT); err = listen(s2, 32); if (err < 0) { perror("listen s1 failed()\n"); - goto out; + return errno; } /* Initiate Connect */ @@ -126,46 +157,232 @@ static int sockmap_test_sockets(int rate, int dot) err = connect(c1, (struct sockaddr *)&addr, sizeof(addr)); if (err < 0 && errno != EINPROGRESS) { perror("connect c1 failed()\n"); - goto out; + return errno; } addr.sin_port = htons(S2_PORT); err = connect(c2, (struct sockaddr *)&addr, sizeof(addr)); if (err < 0 && errno != EINPROGRESS) { perror("connect c2 failed()\n"); - goto out; + return errno; + } else if (err < 0) { + err = 0; } /* Accept Connecrtions */ p1 = accept(s1, NULL, NULL); if (p1 < 0) { perror("accept s1 failed()\n"); - goto out; + return errno; } p2 = accept(s2, NULL, NULL); if (p2 < 0) { perror("accept s1 failed()\n"); - goto out; + return errno; } - max_fd = p2; - timeout.tv_sec = 10; - timeout.tv_usec = 0; - printf("connected sockets: c1 <-> p1, c2 <-> p2\n"); printf("cgroups binding: c1(%i) <-> s1(%i) - - - c2(%i) 
<-> s2(%i)\n", c1, s1, c2, s2); + return 0; +} + +struct msg_stats { + size_t bytes_sent; + size_t bytes_recvd; + struct timespec start; + struct timespec end; +}; + +static int msg_loop(int fd, int iov_count, int iov_length, int cnt, + struct msg_stats *s, bool tx) +{ + struct msghdr msg = {0}; + int err, i, flags = MSG_NOSIGNAL; + struct iovec *iov; + + iov = calloc(iov_count, sizeof(struct iovec)); + if (!iov) + return errno; + + for (i = 0; i < iov_count; i++) { + char *d = calloc(iov_length, sizeof(char)); + + if (!d) { + fprintf(stderr, "iov_count %i/%i OOM\n", i, iov_count); + goto out_errno; + } + iov[i].iov_base = d; + iov[i].iov_len = iov_length; + } + + msg.msg_iov = iov; + msg.msg_iovlen = iov_count; + + if (tx) { + clock_gettime(CLOCK_MONOTONIC, &s->start); + for (i = 0; i < cnt; i++) { + int sent = sendmsg(fd, &msg, flags); + + if (sent < 0) { + perror("send loop error:"); + goto out_errno; + } + s->bytes_sent += sent; + } + clock_gettime(CLOCK_MONOTONIC, &s->end); + } else { + int slct, recv, max_fd = fd; + struct timeval timeout; + float total_bytes; + fd_set w; + + total_bytes = (float)iov_count * (float)iov_length * (float)cnt; + err = clock_gettime(CLOCK_MONOTONIC, &s->start); + if (err < 0) + perror("recv start time: "); + while (s->bytes_recvd < total_bytes) { + timeout.tv_sec = 1; + timeout.tv_usec = 0; + + /* FD sets */ + FD_ZERO(&w); + FD_SET(fd, &w); + + slct = select(max_fd + 1, &w, NULL, NULL, &timeout); + if (slct == -1) { + perror("select()"); + clock_gettime(CLOCK_MONOTONIC, &s->end); + goto out_errno; + } else if (!slct) { + fprintf(stderr, "unexpected timeout\n"); + errno = -EIO; + clock_gettime(CLOCK_MONOTONIC, &s->end); + goto out_errno; + } + + recv = recvmsg(fd, &msg, flags); + if (recv < 0) { + if (errno != EWOULDBLOCK) { + clock_gettime(CLOCK_MONOTONIC, &s->end); + perror("recv failed()\n"); + goto out_errno; + } + } + + s->bytes_recvd += recv; + } + clock_gettime(CLOCK_MONOTONIC, &s->end); + } + + for (i = 0; i < iov_count; 
i++) + free(iov[i].iov_base); + free(iov); + return 0; +out_errno: + for (i = 0; i < iov_count; i++) + free(iov[i].iov_base); + free(iov); + return errno; +} + +static float giga = 1000000000; + +static inline float sentBps(struct msg_stats s) +{ + return s.bytes_sent / (s.end.tv_sec - s.start.tv_sec); +} + +static inline float recvdBps(struct msg_stats s) +{ + return s.bytes_recvd / (s.end.tv_sec - s.start.tv_sec); +} + +static int sendmsg_test(int iov_count, int iov_buf, int cnt, + int verbose, bool base) +{ + float sent_Bps = 0, recvd_Bps = 0; + int rx_fd, txpid, rxpid, err = 0; + struct msg_stats s = {0}; + int status; + + errno = 0; + + if (base) + rx_fd = p1; + else + rx_fd = p2; + + rxpid = fork(); + if (rxpid == 0) { + err = msg_loop(rx_fd, iov_count, iov_buf, cnt, &s, false); + if (err) + fprintf(stderr, + "msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n", + iov_count, iov_buf, cnt, err); + shutdown(p2, SHUT_RDWR); + shutdown(p1, SHUT_RDWR); + if (s.end.tv_sec - s.start.tv_sec) { + sent_Bps = sentBps(s); + recvd_Bps = recvdBps(s); + } + fprintf(stdout, + "rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s\n", + s.bytes_sent, sent_Bps, sent_Bps/giga, + s.bytes_recvd, recvd_Bps, recvd_Bps/giga); + exit(1); + } else if (rxpid == -1) { + perror("msg_loop_rx: "); + return errno; + } + + txpid = fork(); + if (txpid == 0) { + err = msg_loop(c1, iov_count, iov_buf, cnt, &s, true); + if (err) + fprintf(stderr, + "msg_loop_tx: iov_count %i iov_buf %i cnt %i err %i\n", + iov_count, iov_buf, cnt, err); + shutdown(c1, SHUT_RDWR); + if (s.end.tv_sec - s.start.tv_sec) { + sent_Bps = sentBps(s); + recvd_Bps = recvdBps(s); + } + fprintf(stdout, + "tx_sendmsg: TX: %zuB %fB/s %f GB/s RX: %zuB %fB/s %fGB/s\n", + s.bytes_sent, sent_Bps, sent_Bps/giga, + s.bytes_recvd, recvd_Bps, recvd_Bps/giga); + exit(1); + } else if (txpid == -1) { + perror("msg_loop_tx: "); + return errno; + } + + assert(waitpid(rxpid, &status, 0) == rxpid); + assert(waitpid(txpid, &status, 0) == 
txpid); + return err; +} + +static int forever_ping_pong(int rate, int verbose) +{ + struct timeval timeout; + char buf[1024] = {0}; + int sc; + + timeout.tv_sec = 10; + timeout.tv_usec = 0; /* Ping/Pong data from client to server */ sc = send(c1, buf, sizeof(buf), 0); if (sc < 0) { perror("send failed()\n"); - goto out; + return sc; } do { - int s, rc, i; + int s, rc, i, max_fd = p2; + fd_set w; /* FD sets */ FD_ZERO(&w); @@ -193,7 +410,7 @@ static int sockmap_test_sockets(int rate, int dot) if (rc < 0) { if (errno != EWOULDBLOCK) { perror("recv failed()\n"); - break; + return rc; } } @@ -205,35 +422,92 @@ static int sockmap_test_sockets(int rate, int dot) sc = send(i, buf, rc, 0); if (sc < 0) { perror("send failed()\n"); - break; + return sc; } } - sleep(rate); - if (dot) { + + if (rate) + sleep(rate); + + if (verbose) { printf("."); fflush(stdout); } } while (running); -out: - close(s1); - close(s2); - close(p1); - close(p2); - close(c1); - close(c2); - return err; + return 0; } +enum { + PING_PONG, + SENDMSG, + BASE, +}; + int main(int argc, char **argv) { - int rate = 1, dot = 1; + int iov_count = 1, length = 1024, rate = 1, verbose = 0; + struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY}; + int opt, longindex, err, cg_fd = 0; + int test = PING_PONG; char filename[256]; - int err, cg_fd; - char *cg_path; - cg_path = argv[argc - 1]; + while ((opt = getopt_long(argc, argv, "hvc:r:i:l:t:", + long_options, &longindex)) != -1) { + switch (opt) { + /* Cgroup configuration */ + case 'c': + cg_fd = open(optarg, O_DIRECTORY, O_RDONLY); + if (cg_fd < 0) { + fprintf(stderr, + "ERROR: (%i) open cg path failed: %s\n", + cg_fd, optarg); + return cg_fd; + } + break; + case 'r': + rate = atoi(optarg); + break; + case 'v': + verbose = 1; + break; + case 'i': + iov_count = atoi(optarg); + break; + case 'l': + length = atoi(optarg); + break; + case 't': + if (strcmp(optarg, "ping") == 0) { + test = PING_PONG; + } else if (strcmp(optarg, "sendmsg") == 0) { + test = SENDMSG; + } 
else if (strcmp(optarg, "base") == 0) { + test = BASE; + } else { + usage(argv); + return -1; + } + break; + case 'h': + default: + usage(argv); + return -1; + } + } + + if (!cg_fd) { + fprintf(stderr, "%s requires cgroup option: --cgroup <path>\n", + argv[0]); + return -1; + } + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); running = 1; @@ -241,20 +515,16 @@ int main(int argc, char **argv) /* catch SIGINT */ signal(SIGINT, running_handler); + /* If base test skip BPF setup */ + if (test == BASE) + goto run; + if (load_bpf_file(filename)) { fprintf(stderr, "load_bpf_file: (%s) %s\n", filename, strerror(errno)); return 1; } - /* Cgroup configuration */ - cg_fd = open(cg_path, O_DIRECTORY, O_RDONLY); - if (cg_fd < 0) { - fprintf(stderr, "ERROR: (%i) open cg path failed: %s\n", - cg_fd, cg_path); - return cg_fd; - } - /* Attach programs to sockmap */ err = bpf_prog_attach(prog_fd[0], map_fd[0], BPF_SK_SKB_STREAM_PARSER, 0); @@ -280,12 +550,30 @@ int main(int argc, char **argv) return err; } - err = sockmap_test_sockets(rate, dot); +run: + err = sockmap_init_sockets(); if (err) { fprintf(stderr, "ERROR: test socket failed: %d\n", err); - return err; + goto out; } - return 0; + + if (test == PING_PONG) + err = forever_ping_pong(rate, verbose); + else if (test == SENDMSG) + err = sendmsg_test(iov_count, length, rate, verbose, false); + else if (test == BASE) + err = sendmsg_test(iov_count, length, rate, verbose, true); + else + fprintf(stderr, "unknown test\n"); +out: + close(s1); + close(s2); + close(p1); + close(p2); + close(c1); + close(c2); + close(cg_fd); + return err; } void running_handler(int a) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index af1f49ad8b88..db6bdc375126 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -17,7 +17,7 @@ #define BPF_ALU64 0x07 /* alu mode in double word width 
*/ /* ld/ldx fields */ -#define BPF_DW 0x18 /* double word */ +#define BPF_DW 0x18 /* double word (64-bit) */ #define BPF_XADD 0xc0 /* exclusive add */ /* alu/jmp fields */ @@ -642,6 +642,14 @@ union bpf_attr { * @optlen: length of optval in bytes * Return: 0 or negative error * + * int bpf_sock_ops_cb_flags_set(bpf_sock_ops, flags) + * Set callback flags for sock_ops + * @bpf_sock_ops: pointer to bpf_sock_ops_kern struct + * @flags: flags value + * Return: 0 for no error + * -EINVAL if there is no full tcp socket + * bits in flags that are not supported by current kernel + * * int bpf_skb_adjust_room(skb, len_diff, mode, flags) * Grow or shrink room in sk_buff. * @skb: pointer to skb @@ -748,7 +756,8 @@ union bpf_attr { FN(perf_event_read_value), \ FN(perf_prog_read_value), \ FN(getsockopt), \ - FN(override_return), + FN(override_return), \ + FN(sock_ops_cb_flags_set), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -952,8 +961,9 @@ struct bpf_map_info { struct bpf_sock_ops { __u32 op; union { - __u32 reply; - __u32 replylong[4]; + __u32 args[4]; /* Optionally passed to bpf program */ + __u32 reply; /* Returned by bpf program */ + __u32 replylong[4]; /* Optionally returned by bpf prog */ }; __u32 family; __u32 remote_ip4; /* Stored in network byte order */ @@ -968,8 +978,39 @@ struct bpf_sock_ops { */ __u32 snd_cwnd; __u32 srtt_us; /* Averaged RTT << 3 in usecs */ + __u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */ + __u32 state; + __u32 rtt_min; + __u32 snd_ssthresh; + __u32 rcv_nxt; + __u32 snd_nxt; + __u32 snd_una; + __u32 mss_cache; + __u32 ecn_flags; + __u32 rate_delivered; + __u32 rate_interval_us; + __u32 packets_out; + __u32 retrans_out; + __u32 total_retrans; + __u32 segs_in; + __u32 data_segs_in; + __u32 segs_out; + __u32 data_segs_out; + __u32 lost_out; + __u32 sacked_out; + __u32 sk_txhash; + __u64 bytes_received; + __u64 bytes_acked; }; +/* Definitions for 
bpf_sock_ops_cb_flags */ +#define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0) +#define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1) +#define BPF_SOCK_OPS_STATE_CB_FLAG (1<<2) +#define BPF_SOCK_OPS_ALL_CB_FLAGS 0x7 /* Mask of all currently + * supported cb flags + */ + /* List of known BPF sock_ops operators. * New entries can only be added at the end */ @@ -1003,6 +1044,43 @@ enum { * a congestion threshold. RTTs above * this indicate congestion */ + BPF_SOCK_OPS_RTO_CB, /* Called when an RTO has triggered. + * Arg1: value of icsk_retransmits + * Arg2: value of icsk_rto + * Arg3: whether RTO has expired + */ + BPF_SOCK_OPS_RETRANS_CB, /* Called when skb is retransmitted. + * Arg1: sequence number of 1st byte + * Arg2: # segments + * Arg3: return value of + * tcp_transmit_skb (0 => success) + */ + BPF_SOCK_OPS_STATE_CB, /* Called when TCP changes state. + * Arg1: old_state + * Arg2: new_state + */ +}; + +/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect + * changes between the TCP and BPF versions. Ideally this should never happen. + * If it does, we need to add code to convert them before calling + * the BPF sock_ops function. + */ +enum { + BPF_TCP_ESTABLISHED = 1, + BPF_TCP_SYN_SENT, + BPF_TCP_SYN_RECV, + BPF_TCP_FIN_WAIT1, + BPF_TCP_FIN_WAIT2, + BPF_TCP_TIME_WAIT, + BPF_TCP_CLOSE, + BPF_TCP_CLOSE_WAIT, + BPF_TCP_LAST_ACK, + BPF_TCP_LISTEN, + BPF_TCP_CLOSING, /* Now a valid state */ + BPF_TCP_NEW_SYN_RECV, + + BPF_TCP_MAX_STATES /* Leave at the end! 
*/ }; #define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 3a44b655d852..bf05bc5e36e5 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -11,16 +11,16 @@ ifneq ($(wildcard $(GENHDR)),) endif CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) -I../../../include -LDLIBS += -lcap -lelf -lrt +LDLIBS += -lcap -lelf -lrt -lpthread TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ - test_align test_verifier_log test_dev_cgroup + test_align test_verifier_log test_dev_cgroup test_tcpbpf_user TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \ test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \ sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \ test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \ - sample_map_ret0.o + sample_map_ret0.o test_tcpbpf_kern.o TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh \ test_offload.py diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 33cb00e46c49..dde2c11d7771 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -71,6 +71,8 @@ static int (*bpf_setsockopt)(void *ctx, int level, int optname, void *optval, static int (*bpf_getsockopt)(void *ctx, int level, int optname, void *optval, int optlen) = (void *) BPF_FUNC_getsockopt; +static int (*bpf_sock_ops_cb_flags_set)(void *ctx, int flags) = + (void *) BPF_FUNC_sock_ops_cb_flags_set; static int (*bpf_sk_redirect_map)(void *ctx, void *map, int key, int flags) = (void *) BPF_FUNC_sk_redirect_map; static int (*bpf_sock_map_update)(void *map, void *key, void *value, diff --git a/tools/testing/selftests/bpf/tcp_client.py b/tools/testing/selftests/bpf/tcp_client.py new file mode 
100755 index 000000000000..481dccdf140c --- /dev/null +++ b/tools/testing/selftests/bpf/tcp_client.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python2 +# +# SPDX-License-Identifier: GPL-2.0 +# + +import sys, os, os.path, getopt +import socket, time +import subprocess +import select + +def read(sock, n): + buf = '' + while len(buf) < n: + rem = n - len(buf) + try: s = sock.recv(rem) + except (socket.error), e: return '' + buf += s + return buf + +def send(sock, s): + total = len(s) + count = 0 + while count < total: + try: n = sock.send(s) + except (socket.error), e: n = 0 + if n == 0: + return count; + count += n + return count + + +serverPort = int(sys.argv[1]) +HostName = socket.gethostname() + +# create active socket +sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM) +try: + sock.connect((HostName, serverPort)) +except socket.error as e: + sys.exit(1) + +buf = '' +n = 0 +while n < 1000: + buf += '+' + n += 1 + +sock.settimeout(1); +n = send(sock, buf) +n = read(sock, 500) +sys.exit(0) diff --git a/tools/testing/selftests/bpf/tcp_server.py b/tools/testing/selftests/bpf/tcp_server.py new file mode 100755 index 000000000000..bc454d7d0be2 --- /dev/null +++ b/tools/testing/selftests/bpf/tcp_server.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python2 +# +# SPDX-License-Identifier: GPL-2.0 +# + +import sys, os, os.path, getopt +import socket, time +import subprocess +import select + +def read(sock, n): + buf = '' + while len(buf) < n: + rem = n - len(buf) + try: s = sock.recv(rem) + except (socket.error), e: return '' + buf += s + return buf + +def send(sock, s): + total = len(s) + count = 0 + while count < total: + try: n = sock.send(s) + except (socket.error), e: n = 0 + if n == 0: + return count; + count += n + return count + + +SERVER_PORT = 12877 +MAX_PORTS = 2 + +serverPort = SERVER_PORT +serverSocket = None + +HostName = socket.gethostname() + +# create passive socket +serverSocket = socket.socket(socket.AF_INET6, socket.SOCK_STREAM) +host = socket.gethostname() + +try: 
serverSocket.bind((host, 0)) +except socket.error as msg: + print 'bind fails: ', msg + +sn = serverSocket.getsockname() +serverPort = sn[1] + +cmdStr = ("./tcp_client.py %d &") % (serverPort) +os.system(cmdStr) + +buf = '' +n = 0 +while n < 500: + buf += '.' + n += 1 + +serverSocket.listen(MAX_PORTS) +readList = [serverSocket] + +while True: + readyRead, readyWrite, inError = \ + select.select(readList, [], [], 2) + + if len(readyRead) > 0: + waitCount = 0 + for sock in readyRead: + if sock == serverSocket: + (clientSocket, address) = serverSocket.accept() + address = str(address[0]) + readList.append(clientSocket) + else: + sock.settimeout(1); + s = read(sock, 1000) + n = send(sock, buf) + sock.close() + serverSocket.close() + sys.exit(0) + else: + print 'Select timeout!' + sys.exit(1) diff --git a/tools/testing/selftests/bpf/test_align.c b/tools/testing/selftests/bpf/test_align.c index e19b410125eb..ff8bd7e3e50c 100644 --- a/tools/testing/selftests/bpf/test_align.c +++ b/tools/testing/selftests/bpf/test_align.c @@ -446,11 +446,9 @@ static struct bpf_align_test tests[] = { .insns = { PREP_PKT_POINTERS, BPF_MOV64_IMM(BPF_REG_0, 0), - /* ptr & const => unknown & const */ - BPF_MOV64_REG(BPF_REG_5, BPF_REG_2), - BPF_ALU64_IMM(BPF_AND, BPF_REG_5, 0x40), - /* ptr << const => unknown << const */ - BPF_MOV64_REG(BPF_REG_5, BPF_REG_2), + /* (ptr - ptr) << 2 */ + BPF_MOV64_REG(BPF_REG_5, BPF_REG_3), + BPF_ALU64_REG(BPF_SUB, BPF_REG_5, BPF_REG_2), BPF_ALU64_IMM(BPF_LSH, BPF_REG_5, 2), /* We have a (4n) value. Let's make a packet offset * out of it. 
First add 14, to make it a (4n+2) @@ -473,8 +471,26 @@ static struct bpf_align_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, .matches = { - {4, "R5_w=pkt(id=0,off=0,r=0,imm=0)"}, - /* R5 bitwise operator &= on pointer prohibited */ + {4, "R5_w=pkt_end(id=0,off=0,imm=0)"}, + /* (ptr - ptr) << 2 == unknown, (4n) */ + {6, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc))"}, + /* (4n) + 14 == (4n+2). We blow our bounds, because + * the add could overflow. + */ + {7, "R5=inv(id=0,var_off=(0x2; 0xfffffffffffffffc))"}, + /* Checked s>=0 */ + {9, "R5=inv(id=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, + /* packet pointer + nonnegative (4n+2) */ + {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, + {13, "R4=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, + /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine. + * We checked the bounds, but it might have been able + * to overflow if the packet pointer started in the + * upper half of the address space. + * So we did not get a 'range' on R6, and the access + * attempt will fail. 
+ */ + {15, "R6=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, } }, { diff --git a/tools/testing/selftests/bpf/test_dev_cgroup.c b/tools/testing/selftests/bpf/test_dev_cgroup.c index c1535b34f14f..3489cc283433 100644 --- a/tools/testing/selftests/bpf/test_dev_cgroup.c +++ b/tools/testing/selftests/bpf/test_dev_cgroup.c @@ -21,7 +21,7 @@ #define DEV_CGROUP_PROG "./dev_cgroup.o" -#define TEST_CGROUP "test-bpf-based-device-cgroup/" +#define TEST_CGROUP "/test-bpf-based-device-cgroup/" int main(int argc, char **argv) { diff --git a/tools/testing/selftests/bpf/test_lpm_map.c b/tools/testing/selftests/bpf/test_lpm_map.c index 081510853c6d..2be87e9ee28d 100644 --- a/tools/testing/selftests/bpf/test_lpm_map.c +++ b/tools/testing/selftests/bpf/test_lpm_map.c @@ -14,6 +14,7 @@ #include <errno.h> #include <inttypes.h> #include <linux/bpf.h> +#include <pthread.h> #include <stdio.h> #include <stdlib.h> #include <string.h> @@ -641,6 +642,98 @@ static void test_lpm_get_next_key(void) close(map_fd); } +#define MAX_TEST_KEYS 4 +struct lpm_mt_test_info { + int cmd; /* 0: update, 1: delete, 2: lookup, 3: get_next_key */ + int iter; + int map_fd; + struct { + __u32 prefixlen; + __u32 data; + } key[MAX_TEST_KEYS]; +}; + +static void *lpm_test_command(void *arg) +{ + int i, j, ret, iter, key_size; + struct lpm_mt_test_info *info = arg; + struct bpf_lpm_trie_key *key_p; + + key_size = sizeof(struct bpf_lpm_trie_key) + sizeof(__u32); + key_p = alloca(key_size); + for (iter = 0; iter < info->iter; iter++) + for (i = 0; i < MAX_TEST_KEYS; i++) { + /* first half of iterations in forward order, + * and second half in backward order. + */ + j = (iter < (info->iter / 2)) ? 
i : MAX_TEST_KEYS - i - 1; + key_p->prefixlen = info->key[j].prefixlen; + memcpy(key_p->data, &info->key[j].data, sizeof(__u32)); + if (info->cmd == 0) { + __u32 value = j; + /* update must succeed */ + assert(bpf_map_update_elem(info->map_fd, key_p, &value, 0) == 0); + } else if (info->cmd == 1) { + ret = bpf_map_delete_elem(info->map_fd, key_p); + assert(ret == 0 || errno == ENOENT); + } else if (info->cmd == 2) { + __u32 value; + ret = bpf_map_lookup_elem(info->map_fd, key_p, &value); + assert(ret == 0 || errno == ENOENT); + } else { + struct bpf_lpm_trie_key *next_key_p = alloca(key_size); + ret = bpf_map_get_next_key(info->map_fd, key_p, next_key_p); + assert(ret == 0 || errno == ENOENT || errno == ENOMEM); + } + } + + // Pass successful exit info back to the main thread + pthread_exit((void *)info); +} + +static void setup_lpm_mt_test_info(struct lpm_mt_test_info *info, int map_fd) +{ + info->iter = 2000; + info->map_fd = map_fd; + info->key[0].prefixlen = 16; + inet_pton(AF_INET, "192.168.0.0", &info->key[0].data); + info->key[1].prefixlen = 24; + inet_pton(AF_INET, "192.168.0.0", &info->key[1].data); + info->key[2].prefixlen = 24; + inet_pton(AF_INET, "192.168.128.0", &info->key[2].data); + info->key[3].prefixlen = 24; + inet_pton(AF_INET, "192.168.1.0", &info->key[3].data); +} + +static void test_lpm_multi_thread(void) +{ + struct lpm_mt_test_info info[4]; + size_t key_size, value_size; + pthread_t thread_id[4]; + int i, map_fd; + void *ret; + + /* create a trie */ + value_size = sizeof(__u32); + key_size = sizeof(struct bpf_lpm_trie_key) + value_size; + map_fd = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE, key_size, value_size, + 100, BPF_F_NO_PREALLOC); + + /* create 4 threads to test update, delete, lookup and get_next_key */ + setup_lpm_mt_test_info(&info[0], map_fd); + for (i = 0; i < 4; i++) { + if (i != 0) + memcpy(&info[i], &info[0], sizeof(info[i])); + info[i].cmd = i; + assert(pthread_create(&thread_id[i], NULL, &lpm_test_command, &info[i]) == 0); + } + 
+ for (i = 0; i < 4; i++) + assert(pthread_join(thread_id[i], &ret) == 0 && ret == (void *)&info[i]); + + close(map_fd); +} + int main(void) { struct rlimit limit = { RLIM_INFINITY, RLIM_INFINITY }; @@ -667,6 +760,8 @@ int main(void) test_lpm_get_next_key(); + test_lpm_multi_thread(); + printf("test_lpm: OK\n"); return 0; } diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 040356ecc862..436c4c72414f 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -242,7 +242,7 @@ static void test_hashmap_percpu(int task, void *data) static void test_hashmap_walk(int task, void *data) { - int fd, i, max_entries = 100000; + int fd, i, max_entries = 1000; long long key, value, next_key; bool next_key_valid = true; @@ -463,7 +463,7 @@ static void test_devmap(int task, void *data) #define SOCKMAP_VERDICT_PROG "./sockmap_verdict_prog.o" static void test_sockmap(int tasks, void *data) { - int one = 1, map_fd_rx, map_fd_tx, map_fd_break, s, sc, rc; + int one = 1, map_fd_rx = 0, map_fd_tx = 0, map_fd_break, s, sc, rc; struct bpf_map *bpf_map_rx, *bpf_map_tx, *bpf_map_break; int ports[] = {50200, 50201, 50202, 50204}; int err, i, fd, udp, sfd[6] = {0xdeadbeef}; @@ -868,9 +868,12 @@ static void test_sockmap(int tasks, void *data) goto out_sockmap; } - /* Test map close sockets */ - for (i = 0; i < 6; i++) + /* Test map close sockets and empty maps */ + for (i = 0; i < 6; i++) { + bpf_map_delete_elem(map_fd_tx, &i); + bpf_map_delete_elem(map_fd_rx, &i); close(sfd[i]); + } close(fd); close(map_fd_rx); bpf_object__close(obj); @@ -881,8 +884,13 @@ out: printf("Failed to create sockmap '%i:%s'!\n", i, strerror(errno)); exit(1); out_sockmap: - for (i = 0; i < 6; i++) + for (i = 0; i < 6; i++) { + if (map_fd_tx) + bpf_map_delete_elem(map_fd_tx, &i); + if (map_fd_rx) + bpf_map_delete_elem(map_fd_rx, &i); close(sfd[i]); + } close(fd); exit(1); } @@ -931,8 +939,12 @@ static void test_map_large(void) 
close(fd); } -static void run_parallel(int tasks, void (*fn)(int task, void *data), - void *data) +#define run_parallel(N, FN, DATA) \ + printf("Fork %d tasks to '" #FN "'\n", N); \ + __run_parallel(N, FN, DATA) + +static void __run_parallel(int tasks, void (*fn)(int task, void *data), + void *data) { pid_t pid[tasks]; int i; @@ -972,7 +984,7 @@ static void test_map_stress(void) #define DO_UPDATE 1 #define DO_DELETE 0 -static void do_work(int fn, void *data) +static void test_update_delete(int fn, void *data) { int do_update = ((int *)data)[1]; int fd = ((int *)data)[0]; @@ -1012,7 +1024,7 @@ static void test_map_parallel(void) */ data[0] = fd; data[1] = DO_UPDATE; - run_parallel(TASKS, do_work, data); + run_parallel(TASKS, test_update_delete, data); /* Check that key=0 is already there. */ assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == -1 && @@ -1035,7 +1047,7 @@ static void test_map_parallel(void) /* Now let's delete all elemenets in parallel. */ data[1] = DO_DELETE; - run_parallel(TASKS, do_work, data); + run_parallel(TASKS, test_update_delete, data); /* Nothing should be left. 
*/ key = -1; diff --git a/tools/testing/selftests/bpf/test_tcpbpf.h b/tools/testing/selftests/bpf/test_tcpbpf.h new file mode 100644 index 000000000000..2fe43289943c --- /dev/null +++ b/tools/testing/selftests/bpf/test_tcpbpf.h @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef _TEST_TCPBPF_H +#define _TEST_TCPBPF_H + +struct tcpbpf_globals { + __u32 event_map; + __u32 total_retrans; + __u32 data_segs_in; + __u32 data_segs_out; + __u32 bad_cb_test_rv; + __u32 good_cb_test_rv; + __u64 bytes_received; + __u64 bytes_acked; +}; +#endif diff --git a/tools/testing/selftests/bpf/test_tcpbpf_kern.c b/tools/testing/selftests/bpf/test_tcpbpf_kern.c new file mode 100644 index 000000000000..57119ad57a3f --- /dev/null +++ b/tools/testing/selftests/bpf/test_tcpbpf_kern.c @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <stddef.h> +#include <string.h> +#include <linux/bpf.h> +#include <linux/if_ether.h> +#include <linux/if_packet.h> +#include <linux/ip.h> +#include <linux/in6.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/tcp.h> +#include <netinet/in.h> +#include "bpf_helpers.h" +#include "bpf_endian.h" +#include "test_tcpbpf.h" + +struct bpf_map_def SEC("maps") global_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(struct tcpbpf_globals), + .max_entries = 2, +}; + +static inline void update_event_map(int event) +{ + __u32 key = 0; + struct tcpbpf_globals g, *gp; + + gp = bpf_map_lookup_elem(&global_map, &key); + if (gp == NULL) { + struct tcpbpf_globals g = {0}; + + g.event_map |= (1 << event); + bpf_map_update_elem(&global_map, &key, &g, + BPF_ANY); + } else { + g = *gp; + g.event_map |= (1 << event); + bpf_map_update_elem(&global_map, &key, &g, + BPF_ANY); + } +} + +int _version SEC("version") = 1; + +SEC("sockops") +int bpf_testcb(struct bpf_sock_ops *skops) +{ + int rv = -1; + int bad_call_rv = 0; + int good_call_rv = 0; + int op; + int v = 0; + + op = (int) skops->op; + + 
update_event_map(op); + + switch (op) { + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: + /* Test failure to set largest cb flag (assumes not defined) */ + bad_call_rv = bpf_sock_ops_cb_flags_set(skops, 0x80); + /* Set callback */ + good_call_rv = bpf_sock_ops_cb_flags_set(skops, + BPF_SOCK_OPS_STATE_CB_FLAG); + /* Update results */ + { + __u32 key = 0; + struct tcpbpf_globals g, *gp; + + gp = bpf_map_lookup_elem(&global_map, &key); + if (!gp) + break; + g = *gp; + g.bad_cb_test_rv = bad_call_rv; + g.good_cb_test_rv = good_call_rv; + bpf_map_update_elem(&global_map, &key, &g, + BPF_ANY); + } + break; + case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + skops->sk_txhash = 0x12345f; + v = 0xff; + rv = bpf_setsockopt(skops, SOL_IPV6, IPV6_TCLASS, &v, + sizeof(v)); + break; + case BPF_SOCK_OPS_RTO_CB: + break; + case BPF_SOCK_OPS_RETRANS_CB: + break; + case BPF_SOCK_OPS_STATE_CB: + if (skops->args[1] == BPF_TCP_CLOSE) { + __u32 key = 0; + struct tcpbpf_globals g, *gp; + + gp = bpf_map_lookup_elem(&global_map, &key); + if (!gp) + break; + g = *gp; + g.total_retrans = skops->total_retrans; + g.data_segs_in = skops->data_segs_in; + g.data_segs_out = skops->data_segs_out; + g.bytes_received = skops->bytes_received; + g.bytes_acked = skops->bytes_acked; + bpf_map_update_elem(&global_map, &key, &g, + BPF_ANY); + } + break; + default: + rv = -1; + } + skops->reply = rv; + return 1; +} +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_tcpbpf_user.c b/tools/testing/selftests/bpf/test_tcpbpf_user.c new file mode 100644 index 000000000000..95a370f3d378 --- /dev/null +++ b/tools/testing/selftests/bpf/test_tcpbpf_user.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <stdio.h> +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> +#include <errno.h> +#include <signal.h> +#include <string.h> +#include <assert.h> +#include <linux/perf_event.h> +#include <linux/ptrace.h> +#include <linux/bpf.h> +#include <sys/ioctl.h> +#include 
<sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> +#include "bpf_util.h" +#include <linux/perf_event.h> +#include "test_tcpbpf.h" + +static int bpf_find_map(const char *test, struct bpf_object *obj, + const char *name) +{ + struct bpf_map *map; + + map = bpf_object__find_map_by_name(obj, name); + if (!map) { + printf("%s:FAIL:map '%s' not found\n", test, name); + return -1; + } + return bpf_map__fd(map); +} + +#define SYSTEM(CMD) \ + do { \ + if (system(CMD)) { \ + printf("system(%s) FAILS!\n", CMD); \ + } \ + } while (0) + +int main(int argc, char **argv) +{ + const char *file = "test_tcpbpf_kern.o"; + struct tcpbpf_globals g = {0}; + int cg_fd, prog_fd, map_fd; + bool debug_flag = false; + int error = EXIT_FAILURE; + struct bpf_object *obj; + char cmd[100], *dir; + struct stat buffer; + __u32 key = 0; + int pid; + int rv; + + if (argc > 1 && strcmp(argv[1], "-d") == 0) + debug_flag = true; + + dir = "/tmp/cgroupv2/foo"; + + if (stat(dir, &buffer) != 0) { + SYSTEM("mkdir -p /tmp/cgroupv2"); + SYSTEM("mount -t cgroup2 none /tmp/cgroupv2"); + SYSTEM("mkdir -p /tmp/cgroupv2/foo"); + } + pid = (int) getpid(); + sprintf(cmd, "echo %d >> /tmp/cgroupv2/foo/cgroup.procs", pid); + SYSTEM(cmd); + + cg_fd = open(dir, O_DIRECTORY, O_RDONLY); + if (bpf_prog_load(file, BPF_PROG_TYPE_SOCK_OPS, &obj, &prog_fd)) { + printf("FAILED: load_bpf_file failed for: %s\n", file); + goto err; + } + + rv = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_SOCK_OPS, 0); + if (rv) { + printf("FAILED: bpf_prog_attach: %d (%s)\n", + error, strerror(errno)); + goto err; + } + + SYSTEM("./tcp_server.py"); + + map_fd = bpf_find_map(__func__, obj, "global_map"); + if (map_fd < 0) + goto err; + + rv = bpf_map_lookup_elem(map_fd, &key, &g); + if (rv != 0) { + printf("FAILED: bpf_map_lookup_elem returns %d\n", rv); + goto err; + } + + if (g.bytes_received != 501 || g.bytes_acked != 1002 || + g.data_segs_in != 1 || g.data_segs_out != 1 || + (g.event_map ^ 
0x47e) != 0 || g.bad_cb_test_rv != 0x80 || + g.good_cb_test_rv != 0) { + printf("FAILED: Wrong stats\n"); + if (debug_flag) { + printf("\n"); + printf("bytes_received: %d (expecting 501)\n", + (int)g.bytes_received); + printf("bytes_acked: %d (expecting 1002)\n", + (int)g.bytes_acked); + printf("data_segs_in: %d (expecting 1)\n", + g.data_segs_in); + printf("data_segs_out: %d (expecting 1)\n", + g.data_segs_out); + printf("event_map: 0x%x (at least 0x47e)\n", + g.event_map); + printf("bad_cb_test_rv: 0x%x (expecting 0x80)\n", + g.bad_cb_test_rv); + printf("good_cb_test_rv:0x%x (expecting 0)\n", + g.good_cb_test_rv); + } + goto err; + } + printf("PASSED!\n"); + error = 0; +err: + bpf_prog_detach(cg_fd, BPF_CGROUP_SOCK_OPS); + return error; + +} diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index fb82d29ee863..697bd83de295 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -21,6 +21,7 @@ #include <stddef.h> #include <stdbool.h> #include <sched.h> +#include <limits.h> #include <sys/capability.h> #include <sys/resource.h> @@ -111,7 +112,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .result = ACCEPT, - .retval = 0, + .retval = 42, }, { "DIV32 by 0, zero check 2", @@ -123,7 +124,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .result = ACCEPT, - .retval = 0, + .retval = 42, }, { "DIV64 by 0, zero check", @@ -135,7 +136,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .result = ACCEPT, - .retval = 0, + .retval = 42, }, { "MOD32 by 0, zero check 1", @@ -147,7 +148,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .result = ACCEPT, - .retval = 0, + .retval = 42, }, { "MOD32 by 0, zero check 2", @@ -159,7 +160,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .result = ACCEPT, - .retval = 0, + .retval = 42, }, { "MOD64 by 0, zero check", @@ -171,13 +172,245 @@ static struct bpf_test tests[] = { 
BPF_EXIT_INSN(), }, .result = ACCEPT, + .retval = 42, + }, + { + "DIV32 by 0, zero check ok, cls", + .insns = { + BPF_MOV32_IMM(BPF_REG_0, 42), + BPF_MOV32_IMM(BPF_REG_1, 2), + BPF_MOV32_IMM(BPF_REG_2, 16), + BPF_ALU32_REG(BPF_DIV, BPF_REG_2, BPF_REG_1), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 8, + }, + { + "DIV32 by 0, zero check 1, cls", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 0), + BPF_MOV32_IMM(BPF_REG_0, 1), + BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 0, + }, + { + "DIV32 by 0, zero check 2, cls", + .insns = { + BPF_LD_IMM64(BPF_REG_1, 0xffffffff00000000LL), + BPF_MOV32_IMM(BPF_REG_0, 1), + BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 0, + }, + { + "DIV64 by 0, zero check, cls", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 0), + BPF_MOV32_IMM(BPF_REG_0, 1), + BPF_ALU64_REG(BPF_DIV, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, .retval = 0, }, { + "MOD32 by 0, zero check ok, cls", + .insns = { + BPF_MOV32_IMM(BPF_REG_0, 42), + BPF_MOV32_IMM(BPF_REG_1, 3), + BPF_MOV32_IMM(BPF_REG_2, 5), + BPF_ALU32_REG(BPF_MOD, BPF_REG_2, BPF_REG_1), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 2, + }, + { + "MOD32 by 0, zero check 1, cls", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 0), + BPF_MOV32_IMM(BPF_REG_0, 1), + BPF_ALU32_REG(BPF_MOD, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 1, + }, + { + "MOD32 by 0, zero check 2, cls", + .insns = { + BPF_LD_IMM64(BPF_REG_1, 0xffffffff00000000LL), + BPF_MOV32_IMM(BPF_REG_0, 1), + BPF_ALU32_REG(BPF_MOD, BPF_REG_0, BPF_REG_1), + 
BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 1, + }, + { + "MOD64 by 0, zero check 1, cls", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 0), + BPF_MOV32_IMM(BPF_REG_0, 2), + BPF_ALU64_REG(BPF_MOD, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 2, + }, + { + "MOD64 by 0, zero check 2, cls", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 0), + BPF_MOV32_IMM(BPF_REG_0, -1), + BPF_ALU64_REG(BPF_MOD, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = -1, + }, + /* Just make sure that JITs used udiv/umod as otherwise we get + * an exception from INT_MIN/-1 overflow similarly as with div + * by zero. + */ + { + "DIV32 overflow, check 1", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, -1), + BPF_MOV32_IMM(BPF_REG_0, INT_MIN), + BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 0, + }, + { + "DIV32 overflow, check 2", + .insns = { + BPF_MOV32_IMM(BPF_REG_0, INT_MIN), + BPF_ALU32_IMM(BPF_DIV, BPF_REG_0, -1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 0, + }, + { + "DIV64 overflow, check 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, -1), + BPF_LD_IMM64(BPF_REG_0, LLONG_MIN), + BPF_ALU64_REG(BPF_DIV, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 0, + }, + { + "DIV64 overflow, check 2", + .insns = { + BPF_LD_IMM64(BPF_REG_0, LLONG_MIN), + BPF_ALU64_IMM(BPF_DIV, BPF_REG_0, -1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 0, + }, + { + "MOD32 overflow, check 1", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, -1), + BPF_MOV32_IMM(BPF_REG_0, INT_MIN), + BPF_ALU32_REG(BPF_MOD, BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .prog_type = 
BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = INT_MIN, + }, + { + "MOD32 overflow, check 2", + .insns = { + BPF_MOV32_IMM(BPF_REG_0, INT_MIN), + BPF_ALU32_IMM(BPF_MOD, BPF_REG_0, -1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = INT_MIN, + }, + { + "MOD64 overflow, check 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, -1), + BPF_LD_IMM64(BPF_REG_2, LLONG_MIN), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_2), + BPF_ALU64_REG(BPF_MOD, BPF_REG_2, BPF_REG_1), + BPF_MOV32_IMM(BPF_REG_0, 0), + BPF_JMP_REG(BPF_JNE, BPF_REG_3, BPF_REG_2, 1), + BPF_MOV32_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 1, + }, + { + "MOD64 overflow, check 2", + .insns = { + BPF_LD_IMM64(BPF_REG_2, LLONG_MIN), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_2), + BPF_ALU64_IMM(BPF_MOD, BPF_REG_2, -1), + BPF_MOV32_IMM(BPF_REG_0, 0), + BPF_JMP_REG(BPF_JNE, BPF_REG_3, BPF_REG_2, 1), + BPF_MOV32_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 1, + }, + { + "xor32 zero extend check", + .insns = { + BPF_MOV32_IMM(BPF_REG_2, -1), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32), + BPF_ALU64_IMM(BPF_OR, BPF_REG_2, 0xffff), + BPF_ALU32_REG(BPF_XOR, BPF_REG_2, BPF_REG_2), + BPF_MOV32_IMM(BPF_REG_0, 2), + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 0, 1), + BPF_MOV32_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 1, + }, + { "empty prog", .insns = { }, - .errstr = "last insn is not an exit or jmp", + .errstr = "unknown opcode 00", .result = REJECT, }, { @@ -374,7 +607,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .result = REJECT, - .errstr = "BPF_ARSH not supported for 32 bit ALU", + .errstr = "unknown opcode c4", }, { "arsh32 on reg", @@ -385,7 +618,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .result = REJECT, - .errstr = "BPF_ARSH not supported for 32 bit 
ALU", + .errstr = "unknown opcode cc", }, { "arsh64 on imm", @@ -501,7 +734,7 @@ static struct bpf_test tests[] = { BPF_RAW_INSN(BPF_JMP | BPF_CALL | BPF_X, 0, 0, 0, 0), BPF_EXIT_INSN(), }, - .errstr = "BPF_CALL uses reserved", + .errstr = "unknown opcode 8d", .result = REJECT, }, { @@ -691,7 +924,7 @@ static struct bpf_test tests[] = { BPF_RAW_INSN(0, 0, 0, 0, 0), BPF_EXIT_INSN(), }, - .errstr = "invalid BPF_LD_IMM", + .errstr = "unknown opcode 00", .result = REJECT, }, { @@ -709,7 +942,7 @@ static struct bpf_test tests[] = { BPF_RAW_INSN(-1, 0, 0, 0, 0), BPF_EXIT_INSN(), }, - .errstr = "invalid BPF_ALU opcode f0", + .errstr = "unknown opcode ff", .result = REJECT, }, { @@ -718,7 +951,7 @@ static struct bpf_test tests[] = { BPF_RAW_INSN(-1, -1, -1, -1, -1), BPF_EXIT_INSN(), }, - .errstr = "invalid BPF_ALU opcode f0", + .errstr = "unknown opcode ff", .result = REJECT, }, { @@ -7543,7 +7776,7 @@ static struct bpf_test tests[] = { }, BPF_EXIT_INSN(), }, - .errstr = "BPF_END uses reserved fields", + .errstr = "unknown opcode d7", .result = REJECT, }, { @@ -8766,6 +8999,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .result = ACCEPT, + .retval = 1, }, { "check deducing bounds from const, 3", @@ -8963,6 +9197,90 @@ static struct bpf_test tests[] = { .retval = 1, }, { + "calls: div by 0 in subprog", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 8), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 8), + BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_MOV32_IMM(BPF_REG_2, 0), + BPF_MOV32_IMM(BPF_REG_3, 1), + BPF_ALU32_REG(BPF_DIV, BPF_REG_3, BPF_REG_2), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_EXIT_INSN(), + }, + 
.prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 1, + }, + { + "calls: multiple ret types in subprog 1", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 8), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 8), + BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_MOV32_IMM(BPF_REG_0, 42), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = REJECT, + .errstr = "R0 invalid mem access 'inv'", + }, + { + "calls: multiple ret types in subprog 2", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 8), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 8), + BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 9), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, + offsetof(struct __sk_buff, data)), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 64), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .fixup_map1 = { 16 }, + 
.result = REJECT, + .errstr = "R0 min value is outside of the array range", + }, + { "calls: overlapping caller/callee", .insns = { BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 0), |