From 590ce401c207fd944827eb5aa5e87d834eddb149 Mon Sep 17 00:00:00 2001 From: Sergio Paracuellos Date: Sun, 13 Jan 2019 09:56:48 +0100 Subject: dt-bindings: net: dsa: ksz9477: fix indentation for switch spi bindings Switch bindings for spi managed mode are using spaces instead of tabs. Fix them to get a file with a proper kernel indentation style. Reviewed-by: Florian Fainelli Signed-off-by: Sergio Paracuellos Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/dsa/ksz.txt | 102 +++++++++++----------- 1 file changed, 51 insertions(+), 51 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/dsa/ksz.txt b/Documentation/devicetree/bindings/net/dsa/ksz.txt index 0f407fb371ce..8d58c2a7de39 100644 --- a/Documentation/devicetree/bindings/net/dsa/ksz.txt +++ b/Documentation/devicetree/bindings/net/dsa/ksz.txt @@ -19,58 +19,58 @@ Examples: Ethernet switch connected via SPI to the host, CPU port wired to eth0: - eth0: ethernet@10001000 { - fixed-link { - speed = <1000>; - full-duplex; - }; - }; + eth0: ethernet@10001000 { + fixed-link { + speed = <1000>; + full-duplex; + }; + }; - spi1: spi@f8008000 { - pinctrl-0 = <&pinctrl_spi_ksz>; - cs-gpios = <&pioC 25 0>; - id = <1>; + spi1: spi@f8008000 { + pinctrl-0 = <&pinctrl_spi_ksz>; + cs-gpios = <&pioC 25 0>; + id = <1>; - ksz9477: ksz9477@0 { - compatible = "microchip,ksz9477"; - reg = <0>; + ksz9477: ksz9477@0 { + compatible = "microchip,ksz9477"; + reg = <0>; - spi-max-frequency = <44000000>; - spi-cpha; - spi-cpol; + spi-max-frequency = <44000000>; + spi-cpha; + spi-cpol; - ports { - #address-cells = <1>; - #size-cells = <0>; - port@0 { - reg = <0>; - label = "lan1"; - }; - port@1 { - reg = <1>; - label = "lan2"; - }; - port@2 { - reg = <2>; - label = "lan3"; - }; - port@3 { - reg = <3>; - label = "lan4"; - }; - port@4 { - reg = <4>; - label = "lan5"; - }; - port@5 { - reg = <5>; - label = "cpu"; - ethernet = <&eth0>; - fixed-link { - speed = <1000>; - full-duplex; - 
}; - }; - }; - }; - }; + ports { + #address-cells = <1>; + #size-cells = <0>; + port@0 { + reg = <0>; + label = "lan1"; + }; + port@1 { + reg = <1>; + label = "lan2"; + }; + port@2 { + reg = <2>; + label = "lan3"; + }; + port@3 { + reg = <3>; + label = "lan4"; + }; + port@4 { + reg = <4>; + label = "lan5"; + }; + port@5 { + reg = <5>; + label = "cpu"; + ethernet = <&eth0>; + fixed-link { + speed = <1000>; + full-duplex; + }; + }; + }; + }; + }; -- cgit From ae5220c672180765615458ae54dbcff9abe6a01d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 13 Jan 2019 20:17:41 -0800 Subject: networking: Documentation: fix snmp_counters.rst Sphinx warnings Fix over 100 documentation warnings in snmp_counter.rst by extending the underline string lengths and inserting a blank line after bullet items. Examples: Documentation/networking/snmp_counter.rst:1: WARNING: Title overline too short. Documentation/networking/snmp_counter.rst:14: WARNING: Bullet list ends without a blank line; unexpected unindent. Fixes: 2b96547223e3 ("add document for TCP OFO, PAWS and skip ACK counters") Fixes: 8e2ea53a83df ("add snmp counters document") Fixes: 712ee16c230f ("add documents for snmp counters") Fixes: 80cc49507ba4 ("net: Add part of TCP counts explanations in snmp_counters.rst") Fixes: b08794a922c4 ("documentation of some IP/ICMP snmp counters") Signed-off-by: Randy Dunlap Cc: yupeng Signed-off-by: David S. Miller --- Documentation/networking/snmp_counter.rst | 113 +++++++++++++++++++++++------- 1 file changed, 86 insertions(+), 27 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/snmp_counter.rst b/Documentation/networking/snmp_counter.rst index b0dfdaaca512..486ab33acc3a 100644 --- a/Documentation/networking/snmp_counter.rst +++ b/Documentation/networking/snmp_counter.rst @@ -1,16 +1,17 @@ -=========== +============ SNMP counter -=========== +============ This document explains the meaning of SNMP counters. 
General IPv4 counters -==================== +===================== All layer 4 packets and ICMP packets will change these counters, but these counters won't be changed by layer 2 packets (such as STP) or ARP packets. * IpInReceives + Defined in `RFC1213 ipInReceives`_ .. _RFC1213 ipInReceives: https://tools.ietf.org/html/rfc1213#page-26 @@ -23,6 +24,7 @@ and so on). It indicates the number of aggregated segments after GRO/LRO. * IpInDelivers + Defined in `RFC1213 ipInDelivers`_ .. _RFC1213 ipInDelivers: https://tools.ietf.org/html/rfc1213#page-28 @@ -33,6 +35,7 @@ supported protocols will be delivered, if someone listens on the raw socket, all valid IP packets will be delivered. * IpOutRequests + Defined in `RFC1213 ipOutRequests`_ .. _RFC1213 ipOutRequests: https://tools.ietf.org/html/rfc1213#page-28 @@ -42,6 +45,7 @@ multicast packets, and would always be updated together with IpExtOutOctets. * IpExtInOctets and IpExtOutOctets + They are Linux kernel extensions, no RFC definitions. Please note, RFC1213 indeed defines ifInOctets and ifOutOctets, but they are different things. The ifInOctets and ifOutOctets include the MAC @@ -49,6 +53,7 @@ layer header size but IpExtInOctets and IpExtOutOctets don't, they only include the IP layer header and the IP layer data. * IpExtInNoECTPkts, IpExtInECT1Pkts, IpExtInECT0Pkts, IpExtInCEPkts + They indicate the number of four kinds of ECN IP packets, please refer `Explicit Congestion Notification`_ for more details. @@ -60,6 +65,7 @@ for the same packet, you might find that IpInReceives count 1, but IpExtInNoECTPkts counts 2 or more. * IpInHdrErrors + Defined in `RFC1213 ipInHdrErrors`_. It indicates the packet is dropped due to the IP header error. It might happen in both IP input and IP forward paths. @@ -67,6 +73,7 @@ and IP forward paths. .. _RFC1213 ipInHdrErrors: https://tools.ietf.org/html/rfc1213#page-27 * IpInAddrErrors + Defined in `RFC1213 ipInAddrErrors`_. 
It will be increased in two scenarios: (1) The IP address is invalid. (2) The destination IP address is not a local address and IP forwarding is not enabled @@ -74,6 +81,7 @@ address is not a local address and IP forwarding is not enabled .. _RFC1213 ipInAddrErrors: https://tools.ietf.org/html/rfc1213#page-27 * IpExtInNoRoutes + This counter means the packet is dropped when the IP stack receives a packet and can't find a route for it from the route table. It might happen when IP forwarding is enabled and the destination IP address is @@ -81,6 +89,7 @@ not a local address and there is no route for the destination IP address. * IpInUnknownProtos + Defined in `RFC1213 ipInUnknownProtos`_. It will be increased if the layer 4 protocol is unsupported by kernel. If an application is using raw socket, kernel will always deliver the packet to the raw socket @@ -89,10 +98,12 @@ and this counter won't be increased. .. _RFC1213 ipInUnknownProtos: https://tools.ietf.org/html/rfc1213#page-27 * IpExtInTruncatedPkts + For IPv4 packet, it means the actual data size is smaller than the "Total Length" field in the IPv4 header. * IpInDiscards + Defined in `RFC1213 ipInDiscards`_. It indicates the packet is dropped in the IP receiving path and due to kernel internal reasons (e.g. no enough memory). @@ -100,20 +111,23 @@ enough memory). .. _RFC1213 ipInDiscards: https://tools.ietf.org/html/rfc1213#page-28 * IpOutDiscards + Defined in `RFC1213 ipOutDiscards`_. It indicates the packet is dropped in the IP sending path and due to kernel internal reasons. .. _RFC1213 ipOutDiscards: https://tools.ietf.org/html/rfc1213#page-28 * IpOutNoRoutes + Defined in `RFC1213 ipOutNoRoutes`_. It indicates the packet is dropped in the IP sending path and no route is found for it. .. _RFC1213 ipOutNoRoutes: https://tools.ietf.org/html/rfc1213#page-29 ICMP counters -============ +============= * IcmpInMsgs and IcmpOutMsgs + Defined by `RFC1213 icmpInMsgs`_ and `RFC1213 icmpOutMsgs`_ .. 
_RFC1213 icmpInMsgs: https://tools.ietf.org/html/rfc1213#page-41 @@ -126,6 +140,7 @@ IcmpOutMsgs would still be updated if the IP header is constructed by a userspace program. * ICMP named types + | These counters include most of common ICMP types, they are: | IcmpInDestUnreachs: `RFC1213 icmpInDestUnreachs`_ | IcmpInTimeExcds: `RFC1213 icmpInTimeExcds`_ @@ -180,6 +195,7 @@ straightforward. The 'In' counter means kernel receives such a packet and the 'Out' counter means kernel sends such a packet. * ICMP numeric types + They are IcmpMsgInType[N] and IcmpMsgOutType[N], the [N] indicates the ICMP type number. These counters track all kinds of ICMP packets. The ICMP type number definition could be found in the `ICMP parameters`_ @@ -192,12 +208,14 @@ IcmpMsgOutType8 would increase 1. And if kernel gets an ICMP Echo Reply packet, IcmpMsgInType0 would increase 1. * IcmpInCsumErrors + This counter indicates the checksum of the ICMP packet is wrong. Kernel verifies the checksum after updating the IcmpInMsgs and before updating IcmpMsgInType[N]. If a packet has bad checksum, the IcmpInMsgs would be updated but none of IcmpMsgInType[N] would be updated. * IcmpInErrors and IcmpOutErrors + Defined by `RFC1213 icmpInErrors`_ and `RFC1213 icmpOutErrors`_ .. _RFC1213 icmpInErrors: https://tools.ietf.org/html/rfc1213#page-41 @@ -209,7 +227,7 @@ and the sending packet path use IcmpOutErrors. When IcmpInCsumErrors is increased, IcmpInErrors would always be increased too. relationship of the ICMP counters -------------------------------- +--------------------------------- The sum of IcmpMsgOutType[N] is always equal to IcmpOutMsgs, as they are updated at the same time. The sum of IcmpMsgInType[N] plus IcmpInErrors should be equal or larger than IcmpInMsgs. When kernel @@ -229,8 +247,9 @@ IcmpInMsgs should be less than the sum of IcmpMsgOutType[N] plus IcmpInErrors. General TCP counters -================== +==================== * TcpInSegs + Defined in `RFC1213 tcpInSegs`_ .. 
_RFC1213 tcpInSegs: https://tools.ietf.org/html/rfc1213#page-48 @@ -247,6 +266,7 @@ isn't aware of GRO. So if two packets are merged by GRO, the TcpInSegs counter would only increase 1. * TcpOutSegs + Defined in `RFC1213 tcpOutSegs`_ .. _RFC1213 tcpOutSegs: https://tools.ietf.org/html/rfc1213#page-48 @@ -258,6 +278,7 @@ GSO, so if a packet would be split to 2 by GSO, TcpOutSegs will increase 2. * TcpActiveOpens + Defined in `RFC1213 tcpActiveOpens`_ .. _RFC1213 tcpActiveOpens: https://tools.ietf.org/html/rfc1213#page-47 @@ -267,6 +288,7 @@ state. Every time TcpActiveOpens increases 1, TcpOutSegs should always increase 1. * TcpPassiveOpens + Defined in `RFC1213 tcpPassiveOpens`_ .. _RFC1213 tcpPassiveOpens: https://tools.ietf.org/html/rfc1213#page-47 @@ -275,6 +297,7 @@ It means the TCP layer receives a SYN, replies a SYN+ACK, come into the SYN-RCVD state. * TcpExtTCPRcvCoalesce + When packets are received by the TCP layer and are not be read by the application, the TCP layer will try to merge them. This counter indicate how many packets are merged in such situation. If GRO is @@ -282,12 +305,14 @@ enabled, lots of packets would be merged by GRO, these packets wouldn't be counted to TcpExtTCPRcvCoalesce. * TcpExtTCPAutoCorking + When sending packets, the TCP layer will try to merge small packets to a bigger one. This counter increase 1 for every packet merged in such situation. Please refer to the LWN article for more details: https://lwn.net/Articles/576263/ * TcpExtTCPOrigDataSent + This counter is explained by `kernel commit f19c29e3e391`_, I pasted the explaination below:: @@ -297,6 +322,7 @@ explaination below:: more useful to track the TCP retransmission rate. * TCPSynRetrans + This counter is explained by `kernel commit f19c29e3e391`_, I pasted the explaination below:: @@ -304,6 +330,7 @@ explaination below:: retransmissions into SYN, fast-retransmits, timeout retransmits, etc. 
* TCPFastOpenActiveFail + This counter is explained by `kernel commit f19c29e3e391`_, I pasted the explaination below:: @@ -313,6 +340,7 @@ explaination below:: .. _kernel commit f19c29e3e391: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f19c29e3e391a66a273e9afebaf01917245148cd * TcpExtListenOverflows and TcpExtListenDrops + When kernel receives a SYN from a client, and if the TCP accept queue is full, kernel will drop the SYN and add 1 to TcpExtListenOverflows. At the same time kernel will also add 1 to TcpExtListenDrops. When a @@ -337,7 +365,7 @@ to the accept queue. TCP Fast Open -============ +============= When kernel receives a TCP packet, it has two paths to handler the packet, one is fast path, another is slow path. The comment in kernel code provides a good explanation of them, I pasted them below:: @@ -370,22 +398,24 @@ will disable the fast path at first, and try to enable it after kernel receives packets. * TcpExtTCPPureAcks and TcpExtTCPHPAcks + If a packet set ACK flag and has no data, it is a pure ACK packet, if kernel handles it in the fast path, TcpExtTCPHPAcks will increase 1, if kernel handles it in the slow path, TcpExtTCPPureAcks will increase 1. * TcpExtTCPHPHits + If a TCP packet has data (which means it is not a pure ACK packet), and this packet is handled in the fast path, TcpExtTCPHPHits will increase 1. TCP abort -======== - +========= * TcpExtTCPAbortOnData + It means TCP layer has data in flight, but need to close the connection. So TCP layer sends a RST to the other side, indicate the connection is not closed very graceful. An easy way to increase this @@ -404,11 +434,13 @@ when the application closes a connection, kernel will send a RST immediately and increase the TcpExtTCPAbortOnData counter. * TcpExtTCPAbortOnClose + This counter means the application has unread data in the TCP layer when the application wants to close the TCP connection. 
In such a situation, kernel will send a RST to the other side of the TCP connection. * TcpExtTCPAbortOnMemory + When an application closes a TCP connection, kernel still need to track the connection, let it complete the TCP disconnect process. E.g. an app calls the close method of a socket, kernel sends fin to the other @@ -430,10 +462,12 @@ the tcp_mem. Please refer the tcp_mem section in the `TCP man page`_: * TcpExtTCPAbortOnTimeout + This counter will increase when any of the TCP timers expire. In such situation, kernel won't send RST, just give up the connection. * TcpExtTCPAbortOnLinger + When a TCP connection comes into FIN_WAIT_2 state, instead of waiting for the fin packet from the other side, kernel could send a RST and delete the socket immediately. This is not the default behavior of @@ -441,6 +475,7 @@ Linux kernel TCP stack. By configuring the TCP_LINGER2 socket option, you could let kernel follow this behavior. * TcpExtTCPAbortFailed + The kernel TCP layer will send RST if the `RFC2525 2.17 section`_ is satisfied. If an internal error occurs during this process, TcpExtTCPAbortFailed will be increased. @@ -448,7 +483,7 @@ TcpExtTCPAbortFailed will be increased. .. _RFC2525 2.17 section: https://tools.ietf.org/html/rfc2525#page-50 TCP Hybrid Slow Start -==================== +===================== The Hybrid Slow Start algorithm is an enhancement of the traditional TCP congestion window Slow Start algorithm. It uses two pieces of information to detect whether the max bandwidth of the TCP path is @@ -464,23 +499,27 @@ relate with the Hybrid Slow Start algorithm. .. _Hybrid Slow Start paper: https://pdfs.semanticscholar.org/25e9/ef3f03315782c7f1cbcd31b587857adae7d1.pdf * TcpExtTCPHystartTrainDetect + How many times the ACK train length threshold is detected * TcpExtTCPHystartTrainCwnd + The sum of CWND detected by ACK train length. Dividing this value by TcpExtTCPHystartTrainDetect is the average CWND which detected by the ACK train length. 
* TcpExtTCPHystartDelayDetect + How many times the packet delay threshold is detected. * TcpExtTCPHystartDelayCwnd + The sum of CWND detected by packet delay. Dividing this value by TcpExtTCPHystartDelayDetect is the average CWND which detected by the packet delay. TCP retransmission and congestion control -====================================== +========================================= The TCP protocol has two retransmission mechanisms: SACK and fast recovery. They are exclusive with each other. When SACK is enabled, the kernel TCP stack would use SACK, or kernel would use fast @@ -499,12 +538,14 @@ https://pdfs.semanticscholar.org/0e9c/968d09ab2e53e24c4dca5b2d67c7f7140f8e.pdf .. _RFC6582: https://tools.ietf.org/html/rfc6582 * TcpExtTCPRenoRecovery and TcpExtTCPSackRecovery + When the congestion control comes into Recovery state, if sack is used, TcpExtTCPSackRecovery increases 1, if sack is not used, TcpExtTCPRenoRecovery increases 1. These two counters mean the TCP stack begins to retransmit the lost packets. * TcpExtTCPSACKReneging + A packet was acknowledged by SACK, but the receiver has dropped this packet, so the sender needs to retransmit this packet. In this situation, the sender adds 1 to TcpExtTCPSACKReneging. A receiver @@ -515,6 +556,7 @@ the RTO expires for this packet, then the sender assumes this packet has been dropped by the receiver. * TcpExtTCPRenoReorder + The reorder packet is detected by fast recovery. It would only be used if SACK is disabled. The fast recovery algorithm detects recorder by the duplicate ACK number. E.g., if retransmission is triggered, and @@ -525,6 +567,7 @@ order packet. Thus the sender would find more ACks than its expectation, and the sender knows out of order occurs. * TcpExtTCPTSReorder + The reorder packet is detected when a hole is filled. E.g., assume the sender sends packet 1,2,3,4,5, and the receiving order is 1,2,4,5,3. 
When the sender receives the ACK of packet 3 (which will @@ -534,6 +577,7 @@ fill the hole), two conditions will let TcpExtTCPTSReorder increase than the retransmission timestamp. * TcpExtTCPSACKReorder + The reorder packet detected by SACK. The SACK has two methods to detect reorder: (1) DSACK is received by the sender. It means the sender sends the same packet more than one times. And the only reason @@ -558,39 +602,46 @@ sender side. .. _RFC2883 : https://tools.ietf.org/html/rfc2883 * TcpExtTCPDSACKOldSent + The TCP stack receives a duplicate packet which has been acked, so it sends a DSACK to the sender. * TcpExtTCPDSACKOfoSent + The TCP stack receives an out of order duplicate packet, so it sends a DSACK to the sender. * TcpExtTCPDSACKRecv + The TCP stack receives a DSACK, which indicate an acknowledged duplicate packet is received. * TcpExtTCPDSACKOfoRecv + The TCP stack receives a DSACK, which indicate an out of order duplicate packet is received. TCP out of order -=============== +================ * TcpExtTCPOFOQueue + The TCP layer receives an out of order packet and has enough memory to queue it. * TcpExtTCPOFODrop + The TCP layer receives an out of order packet but doesn't have enough memory, so drops it. Such packets won't be counted into TcpExtTCPOFOQueue. * TcpExtTCPOFOMerge + The received out of order packet has an overlay with the previous packet. the overlay part will be dropped. All of TcpExtTCPOFOMerge packets will also be counted into TcpExtTCPOFOQueue. TCP PAWS -======= +======== PAWS (Protection Against Wrapped Sequence numbers) is an algorithm which is used to drop old packets. It depends on the TCP timestamps. For detail information, please refer the `timestamp wiki`_ @@ -600,13 +651,15 @@ and the `RFC of PAWS`_. .. _timestamp wiki: https://en.wikipedia.org/wiki/Transmission_Control_Protocol#TCP_timestamps * TcpExtPAWSActive + Packets are dropped by PAWS in Syn-Sent status. 
* TcpExtPAWSEstab + Packets are dropped by PAWS in any status other than Syn-Sent. TCP ACK skip -=========== +============ In some scenarios, kernel would avoid sending duplicate ACKs too frequently. Please find more details in the tcp_invalid_ratelimit section of the `sysctl document`_. When kernel decides to skip an ACK @@ -618,6 +671,7 @@ it has no data. .. _sysctl document: https://www.kernel.org/doc/Documentation/networking/ip-sysctl.txt * TcpExtTCPACKSkippedSynRecv + The ACK is skipped in Syn-Recv status. The Syn-Recv status means the TCP stack receives a SYN and replies SYN+ACK. Now the TCP stack is waiting for an ACK. Generally, the TCP stack doesn't need to send ACK @@ -631,6 +685,7 @@ increase TcpExtTCPACKSkippedSynRecv. * TcpExtTCPACKSkippedPAWS + The ACK is skipped due to PAWS (Protect Against Wrapped Sequence numbers) check fails. If the PAWS check fails in Syn-Recv, Fin-Wait-2 or Time-Wait statuses, the skipped ACK would be counted to @@ -639,18 +694,22 @@ TcpExtTCPACKSkippedTimeWait. In all other statuses, the skipped ACK would be counted to TcpExtTCPACKSkippedPAWS. * TcpExtTCPACKSkippedSeq + The sequence number is out of window and the timestamp passes the PAWS check and the TCP status is not Syn-Recv, Fin-Wait-2, and Time-Wait. * TcpExtTCPACKSkippedFinWait2 + The ACK is skipped in Fin-Wait-2 status, the reason would be either PAWS check fails or the received sequence number is out of window. * TcpExtTCPACKSkippedTimeWait + Tha ACK is skipped in Time-Wait status, the reason would be either PAWS check failed or the received sequence number is out of window. * TcpExtTCPACKSkippedChallenge + The ACK is skipped if the ACK is a challenge ACK. The RFC 5961 defines 3 kind of challenge ACK, please refer `RFC 5961 section 3.2`_, `RFC 5961 section 4.2`_ and `RFC 5961 section 5.2`_. Besides these @@ -664,10 +723,10 @@ unacknowledged number (more strict than `RFC 5961 section 5.2`_). 
examples -======= +======== ping test --------- +--------- Run the ping command against the public dns server 8.8.8.8:: nstatuser@nstat-a:~$ ping 8.8.8.8 -c 1 @@ -711,7 +770,7 @@ and its corresponding Echo Reply packet are constructed by: So the IpExtInOctets and IpExtOutOctets are 20+16+48=84. tcp 3-way handshake ------------------- +------------------- On server side, we run:: nstatuser@nstat-b:~$ nc -lknv 0.0.0.0 9000 @@ -753,7 +812,7 @@ ACK, so client sent 2 packets, received 1 packet, TcpInSegs increased 1, TcpOutSegs increased 2. TCP normal traffic ------------------ +------------------ Run nc on server:: nstatuser@nstat-b:~$ nc -lkv 0.0.0.0 9000 @@ -876,7 +935,7 @@ and the packet received from client qualified for fast path, so it was counted into 'TcpExtTCPHPHits'. TcpExtTCPAbortOnClose --------------------- +--------------------- On the server side, we run below python script:: import socket @@ -910,7 +969,7 @@ If we run tcpdump on the server side, we could find the server sent a RST after we type Ctrl-C. TcpExtTCPAbortOnMemory and TcpExtTCPAbortOnTimeout ------------------------------------------------ +--------------------------------------------------- Below is an example which let the orphan socket count be higher than net.ipv4.tcp_max_orphans. Change tcp_max_orphans to a smaller value on client:: @@ -1032,7 +1091,7 @@ FIN_WAIT_1 state finally. So we wait for a few minutes, we could find TcpExtTCPAbortOnTimeout 10 0.0 TcpExtTCPAbortOnLinger ---------------------- +---------------------- The server side code:: nstatuser@nstat-b:~$ cat server_linger.py @@ -1077,7 +1136,7 @@ After run client_linger.py, check the output of nstat:: TcpExtTCPAbortOnLinger 1 0.0 TcpExtTCPRcvCoalesce -------------------- +-------------------- On the server, we run a program which listen on TCP port 9000, but doesn't read any data:: @@ -1137,7 +1196,7 @@ the receiving queue. So the TCP layer merged the two packets, and we could find the TcpExtTCPRcvCoalesce increased 1. 
TcpExtListenOverflows and TcpExtListenDrops ----------------------------------------- +------------------------------------------- On server, run the nc command, listen on port 9000:: nstatuser@nstat-b:~$ nc -lkv 0.0.0.0 9000 @@ -1185,7 +1244,7 @@ TcpExtListenOverflows and TcpExtListenDrops would be larger, because the SYN of the 4th nc was dropped, the client was retrying. IpInAddrErrors, IpExtInNoRoutes and IpOutNoRoutes ----------------------------------------------- +------------------------------------------------- server A IP address: 192.168.122.250 server B IP address: 192.168.122.251 Prepare on server A, add a route to server B:: @@ -1280,7 +1339,7 @@ a route for the 8.8.8.8 IP address, so server B increased IpOutNoRoutes. TcpExtTCPACKSkippedSynRecv ------------------------- +-------------------------- In this test, we send 3 same SYN packets from client to server. The first SYN will let server create a socket, set it to Syn-Recv status, and reply a SYN/ACK. The second SYN will let server reply the SYN/ACK @@ -1328,7 +1387,7 @@ Check snmp cunter on nstat-b:: As we expected, TcpExtTCPACKSkippedSynRecv is 1. TcpExtTCPACKSkippedPAWS ----------------------- +----------------------- To trigger PAWS, we could send an old SYN. On nstat-b, let nc listen on port 9000:: @@ -1365,7 +1424,7 @@ failed, the nstat-b replied an ACK for the first SYN, skipped the ACK for the second SYN, and updated TcpExtTCPACKSkippedPAWS. TcpExtTCPACKSkippedSeq --------------------- +---------------------- To trigger TcpExtTCPACKSkippedSeq, we send packets which have valid timestamp (to pass PAWS check) but the sequence number is out of window. 
The linux TCP stack would avoid to skip if the packet has -- cgit From 44543f1dd2a39d56c9afdc3778aa050b5a4725b4 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 15 Jan 2019 14:35:02 -0800 Subject: Documentation: networking: dsa: Update documentation Since 83c0afaec7b7 ("net: dsa: Add new binding implementation"), DSA is no longer a platform device exclusively and can support registering DSA switches from other bus drivers (PCI, USB, I2C, etc.). Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/networking/dsa/dsa.txt | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/dsa/dsa.txt b/Documentation/networking/dsa/dsa.txt index 25170ad7d25b..1000b821681c 100644 --- a/Documentation/networking/dsa/dsa.txt +++ b/Documentation/networking/dsa/dsa.txt @@ -236,19 +236,6 @@ description. Design limitations ================== -DSA is a platform device driver -------------------------------- - -DSA is implemented as a DSA platform device driver which is convenient because -it will register the entire DSA switch tree attached to a master network device -in one-shot, facilitating the device creation and simplifying the device driver -model a bit, this comes however with a number of limitations: - -- building DSA and its switch drivers as modules is currently not working -- the device driver parenting does not necessarily reflect the original - bus/device the switch can be created from -- supporting non-MDIO and non-MMIO (platform) switches is not possible - Limits on the number of devices and ports ----------------------------------------- -- cgit From 6685987c29582afc79b7fa3998dfbf36b4295791 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 16 Jan 2019 23:06:56 +0000 Subject: switchdev: Add extack argument to call_switchdev_notifiers() A follow-up patch will enable vetoing of FDB entries. 
Make it possible to communicate details of why an FDB entry is not acceptable back to the user. Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- Documentation/networking/switchdev.txt | 2 +- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 6 ++++-- drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 12 ++++++------ drivers/net/ethernet/rocker/rocker_main.c | 2 +- drivers/net/ethernet/rocker/rocker_ofdpa.c | 4 ++-- drivers/net/vxlan.c | 2 +- include/net/switchdev.h | 6 ++++-- net/bridge/br_switchdev.c | 2 +- net/dsa/slave.c | 2 +- net/switchdev/switchdev.c | 5 +++-- 10 files changed, 24 insertions(+), 19 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/switchdev.txt b/Documentation/networking/switchdev.txt index 82236a17b5e6..f3244d87512a 100644 --- a/Documentation/networking/switchdev.txt +++ b/Documentation/networking/switchdev.txt @@ -196,7 +196,7 @@ The switch device will learn/forget source MAC address/VLAN on ingress packets and notify the switch driver of the mac/vlan/port tuples. The switch driver, in turn, will notify the bridge driver using the switchdev notifier call: - err = call_switchdev_notifiers(val, dev, info); + err = call_switchdev_notifiers(val, dev, info, extack); Where val is SWITCHDEV_FDB_ADD when learning and SWITCHDEV_FDB_DEL when forgetting, and info points to a struct switchdev_notifier_fdb_info. 
On diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 98e5ffd71b91..042341c7f6b9 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -7294,7 +7294,8 @@ static void mlxsw_sp_rif_vlan_fdb_del(struct mlxsw_sp_rif *rif, const char *mac) info.addr = mac; info.vid = vid; - call_switchdev_notifiers(SWITCHDEV_FDB_DEL_TO_BRIDGE, dev, &info.info); + call_switchdev_notifiers(SWITCHDEV_FDB_DEL_TO_BRIDGE, dev, &info.info, + NULL); } static const struct mlxsw_sp_rif_ops mlxsw_sp_rif_vlan_ops = { @@ -7381,7 +7382,8 @@ static void mlxsw_sp_rif_fid_fdb_del(struct mlxsw_sp_rif *rif, const char *mac) info.addr = mac; info.vid = 0; - call_switchdev_notifiers(SWITCHDEV_FDB_DEL_TO_BRIDGE, dev, &info.info); + call_switchdev_notifiers(SWITCHDEV_FDB_DEL_TO_BRIDGE, dev, &info.info, + NULL); } static const struct mlxsw_sp_rif_ops mlxsw_sp_rif_fid_ops = { diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 0abbaa0fbf14..71aad9d4e160 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -2443,7 +2443,7 @@ static void mlxsw_sp_fdb_vxlan_call_notifiers(struct net_device *dev, ether_addr_copy(info.eth_addr, mac); info.vni = vni; info.offloaded = adding; - call_switchdev_notifiers(type, dev, &info.info); + call_switchdev_notifiers(type, dev, &info.info, NULL); } static void mlxsw_sp_fdb_nve_call_notifiers(struct net_device *dev, @@ -2468,7 +2468,7 @@ mlxsw_sp_fdb_call_notifiers(enum switchdev_notifier_type type, info.addr = mac; info.vid = vid; info.offloaded = offloaded; - call_switchdev_notifiers(type, dev, &info.info); + call_switchdev_notifiers(type, dev, &info.info, NULL); } static void mlxsw_sp_fdb_notify_mac_process(struct mlxsw_sp *mlxsw_sp, @@ -2819,7 +2819,7 @@ 
mlxsw_sp_switchdev_bridge_vxlan_fdb_event(struct mlxsw_sp *mlxsw_sp, return; vxlan_fdb_info.offloaded = true; call_switchdev_notifiers(SWITCHDEV_VXLAN_FDB_OFFLOADED, dev, - &vxlan_fdb_info.info); + &vxlan_fdb_info.info, NULL); mlxsw_sp_fdb_call_notifiers(SWITCHDEV_FDB_OFFLOADED, vxlan_fdb_info.eth_addr, fdb_info->vid, dev, true); @@ -2832,7 +2832,7 @@ mlxsw_sp_switchdev_bridge_vxlan_fdb_event(struct mlxsw_sp *mlxsw_sp, false); vxlan_fdb_info.offloaded = false; call_switchdev_notifiers(SWITCHDEV_VXLAN_FDB_OFFLOADED, dev, - &vxlan_fdb_info.info); + &vxlan_fdb_info.info, NULL); break; } } @@ -2977,7 +2977,7 @@ mlxsw_sp_switchdev_vxlan_fdb_add(struct mlxsw_sp *mlxsw_sp, } vxlan_fdb_info->offloaded = true; call_switchdev_notifiers(SWITCHDEV_VXLAN_FDB_OFFLOADED, dev, - &vxlan_fdb_info->info); + &vxlan_fdb_info->info, NULL); mlxsw_sp_fid_put(fid); return; } @@ -2998,7 +2998,7 @@ mlxsw_sp_switchdev_vxlan_fdb_add(struct mlxsw_sp *mlxsw_sp, goto err_fdb_tunnel_uc_op; vxlan_fdb_info->offloaded = true; call_switchdev_notifiers(SWITCHDEV_VXLAN_FDB_OFFLOADED, dev, - &vxlan_fdb_info->info); + &vxlan_fdb_info->info, NULL); mlxsw_sp_fdb_call_notifiers(SWITCHDEV_FDB_OFFLOADED, vxlan_fdb_info->eth_addr, vid, dev, true); diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c index 6213827e3956..62a205eba9f7 100644 --- a/drivers/net/ethernet/rocker/rocker_main.c +++ b/drivers/net/ethernet/rocker/rocker_main.c @@ -2725,7 +2725,7 @@ rocker_fdb_offload_notify(struct rocker_port *rocker_port, info.vid = recv_info->vid; info.offloaded = true; call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED, - rocker_port->dev, &info.info); + rocker_port->dev, &info.info, NULL); } static void rocker_switchdev_event_work(struct work_struct *work) diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c index 6473cc68c2d5..bea7895930f6 100644 --- a/drivers/net/ethernet/rocker/rocker_ofdpa.c +++ 
b/drivers/net/ethernet/rocker/rocker_ofdpa.c @@ -1833,10 +1833,10 @@ static void ofdpa_port_fdb_learn_work(struct work_struct *work) rtnl_lock(); if (learned && removing) call_switchdev_notifiers(SWITCHDEV_FDB_DEL_TO_BRIDGE, - lw->ofdpa_port->dev, &info.info); + lw->ofdpa_port->dev, &info.info, NULL); else if (learned && !removing) call_switchdev_notifiers(SWITCHDEV_FDB_ADD_TO_BRIDGE, - lw->ofdpa_port->dev, &info.info); + lw->ofdpa_port->dev, &info.info, NULL); rtnl_unlock(); kfree(work); diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index a9e90159bb61..ef45c3c925be 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -393,7 +393,7 @@ static int vxlan_fdb_switchdev_call_notifiers(struct vxlan_dev *vxlan, : SWITCHDEV_VXLAN_FDB_DEL_TO_DEVICE; vxlan_fdb_switchdev_notifier_info(vxlan, fdb, rd, NULL, &info); ret = call_switchdev_notifiers(notifier_type, vxlan->dev, - &info.info); + &info.info, extack); return notifier_to_errno(ret); } diff --git a/include/net/switchdev.h b/include/net/switchdev.h index a7fdab5ee6c3..63843ae5dc81 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -195,7 +195,8 @@ int switchdev_port_obj_del(struct net_device *dev, int register_switchdev_notifier(struct notifier_block *nb); int unregister_switchdev_notifier(struct notifier_block *nb); int call_switchdev_notifiers(unsigned long val, struct net_device *dev, - struct switchdev_notifier_info *info); + struct switchdev_notifier_info *info, + struct netlink_ext_ack *extack); int register_switchdev_blocking_notifier(struct notifier_block *nb); int unregister_switchdev_blocking_notifier(struct notifier_block *nb); @@ -267,7 +268,8 @@ static inline int unregister_switchdev_notifier(struct notifier_block *nb) static inline int call_switchdev_notifiers(unsigned long val, struct net_device *dev, - struct switchdev_notifier_info *info) + struct switchdev_notifier_info *info, + struct netlink_ext_ack *extack) { return NOTIFY_DONE; } diff --git 
a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 035ff59d9cbd..4d2b9eb7604a 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -113,7 +113,7 @@ br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac, info.added_by_user = added_by_user; info.offloaded = offloaded; notifier_type = adding ? SWITCHDEV_FDB_ADD_TO_DEVICE : SWITCHDEV_FDB_DEL_TO_DEVICE; - call_switchdev_notifiers(notifier_type, dev, &info.info); + call_switchdev_notifiers(notifier_type, dev, &info.info, NULL); } void diff --git a/net/dsa/slave.c b/net/dsa/slave.c index d5680a98a7f0..91de3a663226 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1451,7 +1451,7 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work) } fdb_info->offloaded = true; call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED, dev, - &fdb_info->info); + &fdb_info->info, NULL); break; case SWITCHDEV_FDB_DEL_TO_DEVICE: diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 5df9d1138ac9..cd78253de31d 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -556,10 +556,11 @@ EXPORT_SYMBOL_GPL(unregister_switchdev_notifier); * Call all network notifier blocks. */ int call_switchdev_notifiers(unsigned long val, struct net_device *dev, - struct switchdev_notifier_info *info) + struct switchdev_notifier_info *info, + struct netlink_ext_ack *extack) { info->dev = dev; - info->extack = NULL; + info->extack = extack; return atomic_notifier_call_chain(&switchdev_notif_chain, val, info); } EXPORT_SYMBOL_GPL(call_switchdev_notifiers); -- cgit From b8c45a033acc607201588f7665ba84207e5149e0 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Thu, 17 Jan 2019 23:59:20 +0200 Subject: devlink: Add Documentation/networking/devlink-health.txt This patch adds a new file to add information about devlink health mechanism. Signed-off-by: Aya Levin Signed-off-by: Eran Ben Elisha Signed-off-by: David S. 
Miller --- Documentation/networking/devlink-health.txt | 86 +++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 Documentation/networking/devlink-health.txt (limited to 'Documentation') diff --git a/Documentation/networking/devlink-health.txt b/Documentation/networking/devlink-health.txt new file mode 100644 index 000000000000..1db3fbea0831 --- /dev/null +++ b/Documentation/networking/devlink-health.txt @@ -0,0 +1,86 @@ +The health mechanism is targeted for Real Time Alerting, in order to know when +something bad has happened to a PCI device +- Provide alert debug information +- Self healing +- If problem needs vendor support, provide a way to gather all needed debugging + information. + +The main idea is to unify and centralize driver health reports in the +generic devlink instance and allow the user to set different +attributes of the health reporting and recovery procedures. + +The devlink health reporter: +Device driver creates a "health reporter" per each error/health type. +Error/Health type can be a known/generic (e.g. pci error, fw error, rx/tx error) +or unknown (driver specific). +For each registered health reporter a driver can issue error/health reports +asynchronously. All health reports handling is done by devlink. +Device driver can provide specific callbacks for each "health reporter", e.g. + - Recovery procedures + - Diagnostics and object dump procedures + - OOB initial parameters +Different parts of the driver can register different types of health reporters +with different handlers. + +Once an error is reported, devlink health will do the following actions: + * A log is being sent to the kernel trace events buffer + * Health status and statistics are being updated for the reporter instance + * Object dump is being taken and saved at the reporter instance (as long as + there is no other dump which is already stored) + * Auto recovery attempt is being done. Depends on: + - Auto-recovery configuration + - Grace period vs.
time passed since last recover + +The user interface: +User can access/change each reporter's parameters and driver specific callbacks +via devlink, e.g. per error type (per health reporter) + - Configure reporter's generic parameters (like: disable/enable auto recovery) + - Invoke recovery procedure + - Run diagnostics + - Object dump + +The devlink health interface (via netlink): +DEVLINK_CMD_HEALTH_REPORTER_GET + Retrieves status and configuration info per DEV and reporter. +DEVLINK_CMD_HEALTH_REPORTER_SET + Allows reporter-related configuration setting. +DEVLINK_CMD_HEALTH_REPORTER_RECOVER + Triggers a reporter's recovery procedure. +DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE + Retrieves diagnostics data from a reporter on a device. +DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET + Retrieves the last stored dump. Devlink health + saves a single dump. If a dump is not already stored by the devlink + for this reporter, devlink generates a new dump. + dump output is defined by the reporter. +DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR + Clears the last saved dump file for the specified reporter.
+ + + netlink + +--------------------------+ + | | + | + | + | | | + +--------------------------+ + |request for ops + |(diagnose, + mlx5_core devlink |recover, + |dump) ++--------+ +--------------------------+ +| | | reporter| | +| | | +---------v----------+ | +| | ops execution | | | | +| <----------------------------------+ | | +| | | | | | +| | | + ^------------------+ | +| | | | request for ops | +| | | | (recover, dump) | +| | | | | +| | | +-+------------------+ | +| | health report | | health handler | | +| +-------------------------------> | | +| | | +--------------------+ | +| | health reporter create | | +| +----------------------------> | ++--------+ +--------------------------+ -- cgit From 856c395cfa63b94a1d8215182f0243c222f6f927 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 17 Jan 2019 23:27:11 -0800 Subject: net: introduce a knob to control whether to inherit devconf config There have been many people complaining about the inconsistent behaviors of IPv4 and IPv6 devconf when creating new network namespaces. Currently, for IPv4, we inherit all current settings from init_net, but for IPv6 we reset all setting to default. This patch introduces a new /proc file /proc/sys/net/core/devconf_inherit_init_net to control the behavior of whether to inhert sysctl current settings from init_net. This file itself is only available in init_net. 
As demonstrated below: Initial setup in init_net: # cat /proc/sys/net/ipv4/conf/all/rp_filter 2 # cat /proc/sys/net/ipv6/conf/all/accept_dad 1 Default value 0 (current behavior): # ip netns del test # ip netns add test # ip netns exec test cat /proc/sys/net/ipv4/conf/all/rp_filter 2 # ip netns exec test cat /proc/sys/net/ipv6/conf/all/accept_dad 0 Set to 1 (inherit from init_net): # echo 1 > /proc/sys/net/core/devconf_inherit_init_net # ip netns del test # ip netns add test # ip netns exec test cat /proc/sys/net/ipv4/conf/all/rp_filter 2 # ip netns exec test cat /proc/sys/net/ipv6/conf/all/accept_dad 1 Set to 2 (reset to default): # echo 2 > /proc/sys/net/core/devconf_inherit_init_net # ip netns del test # ip netns add test # ip netns exec test cat /proc/sys/net/ipv4/conf/all/rp_filter 0 # ip netns exec test cat /proc/sys/net/ipv6/conf/all/accept_dad 0 Set to a value out of range (invalid): # echo 3 > /proc/sys/net/core/devconf_inherit_init_net -bash: echo: write error: Invalid argument # echo -1 > /proc/sys/net/core/devconf_inherit_init_net -bash: echo: write error: Invalid argument Reported-by: Zhu Yanjun Reported-by: Tonghao Zhang Cc: Nicolas Dichtel Signed-off-by: Cong Wang Acked-by: Nicolas Dichtel Acked-by: Tonghao Zhang Signed-off-by: David S. Miller --- Documentation/sysctl/net.txt | 14 ++++++++++++++ include/linux/netdevice.h | 1 + net/core/sysctl_net_core.c | 18 ++++++++++++++++++ net/ipv4/devinet.c | 43 ++++++++++++++++++++----------------------- net/ipv6/addrconf.c | 5 +++++ 5 files changed, 58 insertions(+), 23 deletions(-) (limited to 'Documentation') diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index 2793d4eac55f..bc0680706870 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -291,6 +291,20 @@ user space is responsible for creating them if needed. 
Default : 0 (for compatibility reasons) +devconf_inherit_init_net +---------------------------- + +Controls if a new network namespace should inherit all current +settings under /proc/sys/net/{ipv4,ipv6}/conf/{all,default}/. By +default, we keep the current behavior: for IPv4 we inherit all current +settings from init_net and for IPv6 we reset all settings to default. + +If set to 1, both IPv4 and IPv6 settings are forced to inherit from +current ones in init_net. If set to 2, both IPv4 and IPv6 settings are +forced to reset to their default values. + +Default : 0 (for compatibility reasons) + 2. /proc/sys/net/unix - Parameters for Unix domain sockets ------------------------------------------------------- diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a57b9a853aab..e675ef97a426 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -630,6 +630,7 @@ struct netdev_queue { } ____cacheline_aligned_in_smp; extern int sysctl_fb_tunnels_only_for_init_net; +extern int sysctl_devconf_inherit_init_net; static inline bool net_has_fallback_tunnels(const struct net *net) { diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index d67ec17f2cc8..84bf2861f45f 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -36,6 +36,15 @@ static int net_msg_warn; /* Unused, but still a sysctl */ int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0; EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net); +/* 0 - Keep current behavior: + * IPv4: inherit all current settings from init_net + * IPv6: reset all settings to default + * 1 - Both inherit all current settings from init_net + * 2 - Both reset all settings to default + */ +int sysctl_devconf_inherit_init_net __read_mostly; +EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); + #ifdef CONFIG_RPS static int rps_sock_flow_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -544,6 +553,15 @@ static struct ctl_table 
net_core_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + .procname = "devconf_inherit_init_net", + .data = &sysctl_devconf_inherit_init_net, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, + }, { } }; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index cd027639df2f..cd9033245b98 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2591,32 +2591,32 @@ static __net_init int devinet_init_net(struct net *net) int err; struct ipv4_devconf *all, *dflt; #ifdef CONFIG_SYSCTL - struct ctl_table *tbl = ctl_forward_entry; + struct ctl_table *tbl; struct ctl_table_header *forw_hdr; #endif err = -ENOMEM; - all = &ipv4_devconf; - dflt = &ipv4_devconf_dflt; - - if (!net_eq(net, &init_net)) { - all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL); - if (!all) - goto err_alloc_all; + all = kmemdup(&ipv4_devconf, sizeof(ipv4_devconf), GFP_KERNEL); + if (!all) + goto err_alloc_all; - dflt = kmemdup(dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL); - if (!dflt) - goto err_alloc_dflt; + dflt = kmemdup(&ipv4_devconf_dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL); + if (!dflt) + goto err_alloc_dflt; #ifdef CONFIG_SYSCTL - tbl = kmemdup(tbl, sizeof(ctl_forward_entry), GFP_KERNEL); - if (!tbl) - goto err_alloc_ctl; + tbl = kmemdup(ctl_forward_entry, sizeof(ctl_forward_entry), GFP_KERNEL); + if (!tbl) + goto err_alloc_ctl; - tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1]; - tbl[0].extra1 = all; - tbl[0].extra2 = net; + tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1]; + tbl[0].extra1 = all; + tbl[0].extra2 = net; #endif + + if (sysctl_devconf_inherit_init_net != 2 && !net_eq(net, &init_net)) { + memcpy(all, init_net.ipv4.devconf_all, sizeof(ipv4_devconf)); + memcpy(dflt, init_net.ipv4.devconf_dflt, sizeof(ipv4_devconf_dflt)); } #ifdef CONFIG_SYSCTL @@ -2646,15 +2646,12 @@ err_reg_ctl: err_reg_dflt: __devinet_sysctl_unregister(net, all, NETCONFA_IFINDEX_ALL); err_reg_all: - if (tbl != 
ctl_forward_entry) - kfree(tbl); + kfree(tbl); err_alloc_ctl: #endif - if (dflt != &ipv4_devconf_dflt) - kfree(dflt); + kfree(dflt); err_alloc_dflt: - if (all != &ipv4_devconf) - kfree(all); + kfree(all); err_alloc_all: return err; } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 57198b3c86da..48cd36311901 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6902,6 +6902,11 @@ static int __net_init addrconf_init_net(struct net *net) if (!dflt) goto err_alloc_dflt; + if (sysctl_devconf_inherit_init_net == 1 && !net_eq(net, &init_net)) { + memcpy(all, init_net.ipv6.devconf_all, sizeof(ipv6_devconf)); + memcpy(dflt, init_net.ipv6.devconf_dflt, sizeof(ipv6_devconf_dflt)); + } + /* these will be inherited by all namespaces */ dflt->autoconf = ipv6_defaults.autoconf; dflt->disable_ipv6 = ipv6_defaults.disable_ipv6; -- cgit From 00f1ee5361c3f644133ef2d19d2c340d2a730f1d Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Mon, 21 Jan 2019 14:43:14 +0530 Subject: dt-bindings: net: Add Qualcomm ethqos binding Add support for Qualcomm ethqos found in some SoCs like QCS404. Signed-off-by: Vinod Koul Reviewed-by: Rob Herring Signed-off-by: David S. Miller --- .../devicetree/bindings/net/qcom,ethqos.txt | 64 ++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 Documentation/devicetree/bindings/net/qcom,ethqos.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/qcom,ethqos.txt b/Documentation/devicetree/bindings/net/qcom,ethqos.txt new file mode 100644 index 000000000000..fcf5035810b5 --- /dev/null +++ b/Documentation/devicetree/bindings/net/qcom,ethqos.txt @@ -0,0 +1,64 @@ +Qualcomm Ethernet ETHQOS device + +This documents dwmmac based ethernet device which supports Gigabit +ethernet for version v2.3.0 onwards. 
+ +This device has the following properties: + +Required properties: + +- compatible: Should be "qcom,qcs404-ethqos" + +- reg: Address and length of the register set for the device + +- reg-names: Should contain register names "stmmaceth", "rgmii" + +- clocks: Should contain phandle to clocks + +- clock-names: Should contain clock names "stmmaceth", "pclk", + "ptp_ref", "rgmii" + +- interrupts: Should contain phandle to interrupts + +- interrupt-names: Should contain interrupt names "macirq", "eth_lpi" + +Rest of the properties are defined in stmmac.txt file in same directory + + +Example: + +ethernet: ethernet@7a80000 { + compatible = "qcom,qcs404-ethqos"; + reg = <0x07a80000 0x10000>, + <0x07a96000 0x100>; + reg-names = "stmmaceth", "rgmii"; + clock-names = "stmmaceth", "pclk", "ptp_ref", "rgmii"; + clocks = <&gcc GCC_ETH_AXI_CLK>, + <&gcc GCC_ETH_SLAVE_AHB_CLK>, + <&gcc GCC_ETH_PTP_CLK>, + <&gcc GCC_ETH_RGMII_CLK>; + interrupts = , + ; + interrupt-names = "macirq", "eth_lpi"; + snps,reset-gpio = <&tlmm 60 GPIO_ACTIVE_LOW>; + snps,reset-active-low; + + snps,txpbl = <8>; + snps,rxpbl = <2>; + snps,aal; + snps,tso; + + phy-handle = <&phy1>; + phy-mode = "rgmii"; + + mdio { + #address-cells = <0x1>; + #size-cells = <0x0>; + compatible = "snps,dwmac-mdio"; + phy1: phy@4 { + device_type = "ethernet-phy"; + reg = <0x4>; + }; + }; + +}; -- cgit From 5ff2698b3301c37246f1f79dc9bdcd378b000dbe Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Mon, 21 Jan 2019 18:41:40 +0800 Subject: dt-binding: ptp_qoriq: document "fsl,extts-fifo" property Documented "fsl,extts-fifo" property. Signed-off-by: Yangbo Lu Reviewed-by: Rob Herring Signed-off-by: David S.
Miller --- Documentation/devicetree/bindings/ptp/ptp-qoriq.txt | 2 ++ 1 file changed, 2 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt b/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt index c5d0e7998e2b..8e7f8551d190 100644 --- a/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt +++ b/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt @@ -17,6 +17,8 @@ Clock Properties: - fsl,tmr-fiper1 Fixed interval period pulse generator. - fsl,tmr-fiper2 Fixed interval period pulse generator. - fsl,max-adj Maximum frequency adjustment in parts per billion. + - fsl,extts-fifo The presence of this property indicates hardware + support for the external trigger stamp FIFO. These properties set the operational parameters for the PTP clock. You must choose these carefully for the clock to work right. -- cgit From ffcf7ce9332723cab5ae55575f3a55d1ce559bf3 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 18 Jan 2019 13:56:49 -0800 Subject: bpf: btf: add btf documentation This patch added documentation for BTF (BPF Debug Format). The document is placed under linux:Documentation/bpf directory. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- Documentation/bpf/btf.rst | 870 ++++++++++++++++++++++++++++++++++++++++++++ Documentation/bpf/index.rst | 7 + 2 files changed, 877 insertions(+) create mode 100644 Documentation/bpf/btf.rst (limited to 'Documentation') diff --git a/Documentation/bpf/btf.rst b/Documentation/bpf/btf.rst new file mode 100644 index 000000000000..1d434c3a268d --- /dev/null +++ b/Documentation/bpf/btf.rst @@ -0,0 +1,870 @@ +===================== +BPF Type Format (BTF) +===================== + +1. Introduction +*************** + +BTF (BPF Type Format) is the meta data format which +encodes the debug info related to BPF program/map. +The name BTF was used initially to describe +data types. 
The BTF was later extended to include +function info for defined subroutines, and line info +for source/line information. + +The debug info is used for map pretty print, function +signature, etc. The function signature enables better +bpf program/function kernel symbol. +The line info helps generate +source annotated translated byte code, jited code +and verifier log. + +The BTF specification contains two parts, + * BTF kernel API + * BTF ELF file format + +The kernel API is the contract between +user space and kernel. The kernel verifies +the BTF info before using it. +The ELF file format is a user space contract +between ELF file and libbpf loader. + +The type and string sections are part of the +BTF kernel API, describing the debug info +(mostly types related) referenced by the bpf program. +These two sections are discussed in +detail in :ref:`BTF_Type_String`. + +.. _BTF_Type_String: + +2. BTF Type and String Encoding +******************************* + +The file ``include/uapi/linux/btf.h`` provides high +level definition on how types/strings are encoded. + +The beginning of data blob must be:: + + struct btf_header { + __u16 magic; + __u8 version; + __u8 flags; + __u32 hdr_len; + + /* All offsets are in bytes relative to the end of this header */ + __u32 type_off; /* offset of type section */ + __u32 type_len; /* length of type section */ + __u32 str_off; /* offset of string section */ + __u32 str_len; /* length of string section */ + }; + +The magic is ``0xeB9F``, which has different encoding for big and little +endian system, and can be used to test whether BTF is generated for +big or little endian target. +The btf_header is designed to be extensible with hdr_len equal to +``sizeof(struct btf_header)`` when the data blob is generated. + +2.1 String Encoding +=================== + +The first string in the string section must be a null string. +The rest of string table is a concatenation of other null-terminated +strings.
+ +2.2 Type Encoding +================= + +The type id ``0`` is reserved for ``void`` type. +The type section is parsed sequentially and the type id is assigned to +each recognized type starting from id ``1``. +Currently, the following types are supported:: + + #define BTF_KIND_INT 1 /* Integer */ + #define BTF_KIND_PTR 2 /* Pointer */ + #define BTF_KIND_ARRAY 3 /* Array */ + #define BTF_KIND_STRUCT 4 /* Struct */ + #define BTF_KIND_UNION 5 /* Union */ + #define BTF_KIND_ENUM 6 /* Enumeration */ + #define BTF_KIND_FWD 7 /* Forward */ + #define BTF_KIND_TYPEDEF 8 /* Typedef */ + #define BTF_KIND_VOLATILE 9 /* Volatile */ + #define BTF_KIND_CONST 10 /* Const */ + #define BTF_KIND_RESTRICT 11 /* Restrict */ + #define BTF_KIND_FUNC 12 /* Function */ + #define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ + +Note that the type section encodes debug info, not just pure types. +``BTF_KIND_FUNC`` is not a type, and it represents a defined subprogram. + +Each type contains the following common data:: + + struct btf_type { + __u32 name_off; + /* "info" bits arrangement + * bits 0-15: vlen (e.g. # of struct's members) + * bits 16-23: unused + * bits 24-27: kind (e.g. int, ptr, array...etc) + * bits 28-30: unused + * bit 31: kind_flag, currently used by + * struct, union and fwd + */ + __u32 info; + /* "size" is used by INT, ENUM, STRUCT and UNION. + * "size" tells the size of the type it is describing. + * + * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, + * FUNC and FUNC_PROTO. + * "type" is a type_id referring to another type. + */ + union { + __u32 size; + __u32 type; + }; + }; + +For certain kinds, the common data are followed by kind specific data. +The ``name_off`` in ``struct btf_type`` specifies the offset in the string table. +The following details encoding of each kind. 
+ +2.2.1 BTF_KIND_INT +~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: any valid offset + * ``info.kind_flag``: 0 + * ``info.kind``: BTF_KIND_INT + * ``info.vlen``: 0 + * ``size``: the size of the int type in bytes. + +``btf_type`` is followed by a ``u32`` with following bits arrangement:: + + #define BTF_INT_ENCODING(VAL) (((VAL) & 0x0f000000) >> 24) + #define BTF_INT_OFFSET(VAL) (((VAL & 0x00ff0000)) >> 16) + #define BTF_INT_BITS(VAL) ((VAL) & 0x000000ff) + +The ``BTF_INT_ENCODING`` has the following attributes:: + + #define BTF_INT_SIGNED (1 << 0) + #define BTF_INT_CHAR (1 << 1) + #define BTF_INT_BOOL (1 << 2) + +The ``BTF_INT_ENCODING()`` provides extra information, signedness, +char, or bool, for the int type. The char and bool encoding +are mostly useful for pretty print. At most one encoding can +be specified for the int type. + +The ``BTF_INT_BITS()`` specifies the number of actual bits held by +this int type. For example, a 4-bit bitfield encodes +``BTF_INT_BITS()`` equals to 4. The ``btf_type.size * 8`` +must be equal to or greater than ``BTF_INT_BITS()`` for the type. +The maximum value of ``BTF_INT_BITS()`` is 128. + +The ``BTF_INT_OFFSET()`` specifies the starting bit offset to +calculate values for this int. For example, a bitfield struct +member has + + * btf member bit offset 100 from the start of the structure, + * btf member pointing to an int type, + * the int type has ``BTF_INT_OFFSET() = 2`` and ``BTF_INT_BITS() = 4`` + +Then in the struct memory layout, this member will occupy +``4`` bits starting from bits ``100 + 2 = 102``. + +Alternatively, the bitfield struct member can be the following to +access the same bits as the above: + + * btf member bit offset 102, + * btf member pointing to an int type, + * the int type has ``BTF_INT_OFFSET() = 0`` and ``BTF_INT_BITS() = 4`` + +The original intention of ``BTF_INT_OFFSET()`` is to provide +flexibility of bitfield encoding.
+Currently, both llvm and pahole generate ``BTF_INT_OFFSET() = 0`` +for all int types. + +2.2.2 BTF_KIND_PTR +~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: 0 + * ``info.kind_flag``: 0 + * ``info.kind``: BTF_KIND_PTR + * ``info.vlen``: 0 + * ``type``: the pointee type of the pointer + +No additional type data follow ``btf_type``. + +2.2.3 BTF_KIND_ARRAY +~~~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: 0 + * ``info.kind_flag``: 0 + * ``info.kind``: BTF_KIND_ARRAY + * ``info.vlen``: 0 + * ``size/type``: 0, not used + +btf_type is followed by one "struct btf_array":: + + struct btf_array { + __u32 type; + __u32 index_type; + __u32 nelems; + }; + +The ``struct btf_array`` encoding: + * ``type``: the element type + * ``index_type``: the index type + * ``nelems``: the number of elements for this array (``0`` is also allowed). + +The ``index_type`` can be any regular int types +(u8, u16, u32, u64, unsigned __int128). +The original design of including ``index_type`` follows dwarf +which has a ``index_type`` for its array type. +Currently in BTF, beyond type verification, the ``index_type`` is not used. + +The ``struct btf_array`` allows chaining through element type to represent +multiple dimensional arrays. For example, ``int a[5][6]``, the following +type system illustrates the chaining: + + * [1]: int + * [2]: array, ``btf_array.type = [1]``, ``btf_array.nelems = 6`` + * [3]: array, ``btf_array.type = [2]``, ``btf_array.nelems = 5`` + +Currently, both pahole and llvm collapse multiple dimensional array +into one dimensional array, e.g., ``a[5][6]``, the btf_array.nelems +equal to ``30``. This is because the original use case is map pretty +print where the whole array is dumped out so one dimensional array +is enough. As more BTF usage is explored, pahole and llvm can be +changed to generate proper chained representation for +multiple dimensional arrays.
+ +2.2.4 BTF_KIND_STRUCT +~~~~~~~~~~~~~~~~~~~~~ +2.2.5 BTF_KIND_UNION +~~~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: 0 or offset to a valid C identifier + * ``info.kind_flag``: 0 or 1 + * ``info.kind``: BTF_KIND_STRUCT or BTF_KIND_UNION + * ``info.vlen``: the number of struct/union members + * ``info.size``: the size of the struct/union in bytes + +``btf_type`` is followed by ``info.vlen`` number of ``struct btf_member``.:: + + struct btf_member { + __u32 name_off; + __u32 type; + __u32 offset; + }; + +``struct btf_member`` encoding: + * ``name_off``: offset to a valid C identifier + * ``type``: the member type + * ``offset``: + +If the type info ``kind_flag`` is not set, the offset contains +only bit offset of the member. Note that the base type of the +bitfield can only be int or enum type. If the bitfield size +is 32, the base type can be either int or enum type. +If the bitfield size is not 32, the base type must be int, +and int type ``BTF_INT_BITS()`` encodes the bitfield size. + +If the ``kind_flag`` is set, the ``btf_member.offset`` +contains both member bitfield size and bit offset. The +bitfield size and bit offset are calculated as below.:: + + #define BTF_MEMBER_BITFIELD_SIZE(val) ((val) >> 24) + #define BTF_MEMBER_BIT_OFFSET(val) ((val) & 0xffffff) + +In this case, if the base type is an int type, it must +be a regular int type: + + * ``BTF_INT_OFFSET()`` must be 0. + * ``BTF_INT_BITS()`` must be equal to ``{1,2,4,8,16} * 8``. 
+ +The following kernel patch introduced ``kind_flag`` and +explained why both modes exist: + + https://github.com/torvalds/linux/commit/9d5f9f701b1891466fb3dbb1806ad97716f95cc3#diff-fa650a64fdd3968396883d2fe8215ff3 + +2.2.6 BTF_KIND_ENUM +~~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: 0 or offset to a valid C identifier + * ``info.kind_flag``: 0 + * ``info.kind``: BTF_KIND_ENUM + * ``info.vlen``: number of enum values + * ``size``: 4 + +``btf_type`` is followed by ``info.vlen`` number of ``struct btf_enum``.:: + + struct btf_enum { + __u32 name_off; + __s32 val; + }; + +The ``btf_enum`` encoding: + * ``name_off``: offset to a valid C identifier + * ``val``: any value + +2.2.7 BTF_KIND_FWD +~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: offset to a valid C identifier + * ``info.kind_flag``: 0 for struct, 1 for union + * ``info.kind``: BTF_KIND_FWD + * ``info.vlen``: 0 + * ``type``: 0 + +No additional type data follow ``btf_type``. + +2.2.8 BTF_KIND_TYPEDEF +~~~~~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: offset to a valid C identifier + * ``info.kind_flag``: 0 + * ``info.kind``: BTF_KIND_TYPEDEF + * ``info.vlen``: 0 + * ``type``: the type which can be referred by name at ``name_off`` + +No additional type data follow ``btf_type``. + +2.2.9 BTF_KIND_VOLATILE +~~~~~~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: 0 + * ``info.kind_flag``: 0 + * ``info.kind``: BTF_KIND_VOLATILE + * ``info.vlen``: 0 + * ``type``: the type with ``volatile`` qualifier + +No additional type data follow ``btf_type``. + +2.2.10 BTF_KIND_CONST +~~~~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: 0 + * ``info.kind_flag``: 0 + * ``info.kind``: BTF_KIND_CONST + * ``info.vlen``: 0 + * ``type``: the type with ``const`` qualifier + +No additional type data follow ``btf_type``. 
+ +2.2.11 BTF_KIND_RESTRICT +~~~~~~~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: 0 + * ``info.kind_flag``: 0 + * ``info.kind``: BTF_KIND_RESTRICT + * ``info.vlen``: 0 + * ``type``: the type with ``restrict`` qualifier + +No additional type data follow ``btf_type``. + +2.2.12 BTF_KIND_FUNC +~~~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: offset to a valid C identifier + * ``info.kind_flag``: 0 + * ``info.kind``: BTF_KIND_FUNC + * ``info.vlen``: 0 + * ``type``: a BTF_KIND_FUNC_PROTO type + +No additional type data follow ``btf_type``. + +A BTF_KIND_FUNC defines, not a type, but a subprogram (function) whose +signature is defined by ``type``. The subprogram is thus an instance of +that type. The BTF_KIND_FUNC may in turn be referenced by a func_info in +the :ref:`BTF_Ext_Section` (ELF) or in the arguments to +:ref:`BPF_Prog_Load` (ABI). + +2.2.13 BTF_KIND_FUNC_PROTO +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: 0 + * ``info.kind_flag``: 0 + * ``info.kind``: BTF_KIND_FUNC_PROTO + * ``info.vlen``: # of parameters + * ``type``: the return type + +``btf_type`` is followed by ``info.vlen`` number of ``struct btf_param``.:: + + struct btf_param { + __u32 name_off; + __u32 type; + }; + +If a BTF_KIND_FUNC_PROTO type is referred by a BTF_KIND_FUNC type, +then ``btf_param.name_off`` must point to a valid C identifier +except for the possible last argument representing the variable +argument. The btf_param.type refers to parameter type. + +If the function has variable arguments, the last parameter +is encoded with ``name_off = 0`` and ``type = 0``. + +3. BTF Kernel API +***************** + +The following bpf syscall command involves BTF: + * BPF_BTF_LOAD: load a blob of BTF data into kernel + * BPF_MAP_CREATE: map creation with btf key and value type info. + * BPF_PROG_LOAD: prog load with btf function and line info. 
+ * BPF_BTF_GET_FD_BY_ID: get a btf fd + * BPF_OBJ_GET_INFO_BY_FD: btf, func_info, line_info + and other btf related info are returned. + +The workflow typically looks like: +:: + + Application: + BPF_BTF_LOAD + | + v + BPF_MAP_CREATE and BPF_PROG_LOAD + | + V + ...... + + Introspection tool: + ...... + BPF_{PROG,MAP}_GET_NEXT_ID (get prog/map id's) + | + V + BPF_{PROG,MAP}_GET_FD_BY_ID (get a prog/map fd) + | + V + BPF_OBJ_GET_INFO_BY_FD (get bpf_prog_info/bpf_map_info with btf_id) + | | + V | + BPF_BTF_GET_FD_BY_ID (get btf_fd) | + | | + V | + BPF_OBJ_GET_INFO_BY_FD (get btf) | + | | + V V + pretty print types, dump func signatures and line info, etc. + + +3.1 BPF_BTF_LOAD +================ + +Load a blob of BTF data into kernel. A blob of data +described in :ref:`BTF_Type_String` +can be directly loaded into the kernel. +A ``btf_fd`` is returned to userspace. + +3.2 BPF_MAP_CREATE +================== + +A map can be created with ``btf_fd`` and specified key/value type id.:: + + __u32 btf_fd; /* fd pointing to a BTF type data */ + __u32 btf_key_type_id; /* BTF type_id of the key */ + __u32 btf_value_type_id; /* BTF type_id of the value */ + +In libbpf, the map can be defined with extra annotation like below: +:: + + struct bpf_map_def SEC("maps") btf_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(struct ipv_counts), + .max_entries = 4, + }; + BPF_ANNOTATE_KV_PAIR(btf_map, int, struct ipv_counts); + +Here, the parameters for macro BPF_ANNOTATE_KV_PAIR are map name, +key and value types for the map. +During ELF parsing, libbpf is able to extract key/value type_id's +and assign them to BPF_MAP_CREATE attributes automatically. + +.. _BPF_Prog_Load: + +3.3 BPF_PROG_LOAD +================= + +During prog_load, func_info and line_info can be passed to kernel with +proper values for the following attributes: +:: + + __u32 insn_cnt; + __aligned_u64 insns; + ......
+ __u32 prog_btf_fd; /* fd pointing to BTF type data */ + __u32 func_info_rec_size; /* userspace bpf_func_info size */ + __aligned_u64 func_info; /* func info */ + __u32 func_info_cnt; /* number of bpf_func_info records */ + __u32 line_info_rec_size; /* userspace bpf_line_info size */ + __aligned_u64 line_info; /* line info */ + __u32 line_info_cnt; /* number of bpf_line_info records */ + +The func_info and line_info are an array of below, respectively.:: + + struct bpf_func_info { + __u32 insn_off; /* [0, insn_cnt - 1] */ + __u32 type_id; /* pointing to a BTF_KIND_FUNC type */ + }; + struct bpf_line_info { + __u32 insn_off; /* [0, insn_cnt - 1] */ + __u32 file_name_off; /* offset to string table for the filename */ + __u32 line_off; /* offset to string table for the source line */ + __u32 line_col; /* line number and column number */ + }; + +func_info_rec_size is the size of each func_info record, and line_info_rec_size +is the size of each line_info record. Passing the record size to kernel makes +it possible to extend the record itself in the future. + +Below are requirements for func_info: + * func_info[0].insn_off must be 0. + * the func_info insn_off is in strictly increasing order and matches + bpf func boundaries. + +Below are requirements for line_info: + * the first insn in each func must point to a line_info record. + * the line_info insn_off is in strictly increasing order. + +For line_info, the line number and column number are defined as below: +:: + + #define BPF_LINE_INFO_LINE_NUM(line_col) ((line_col) >> 10) + #define BPF_LINE_INFO_LINE_COL(line_col) ((line_col) & 0x3ff) + +3.4 BPF_{PROG,MAP}_GET_NEXT_ID + +In kernel, every loaded program, map or btf has a unique id. +The id won't change during the lifetime of the program, map or btf. + +The bpf syscall command BPF_{PROG,MAP}_GET_NEXT_ID +returns all id's, one for each command, to user space, for bpf +program or maps, +so the inspection tool can inspect all programs and maps.
+ +3.5 BPF_{PROG,MAP}_GET_FD_BY_ID + +The introspection tool cannot use id to get details about program or maps. +A file descriptor needs to be obtained first for reference counting purposes. + +3.6 BPF_OBJ_GET_INFO_BY_FD +========================== + +Once a program/map fd is acquired, the introspection tool can +get the detailed information from kernel about this fd, +some of which is btf related. For example, +``bpf_map_info`` returns ``btf_id``, key/value type id. +``bpf_prog_info`` returns ``btf_id``, func_info and line info +for translated bpf byte codes, and jited_line_info. + +3.7 BPF_BTF_GET_FD_BY_ID +======================== + +With ``btf_id`` obtained in ``bpf_map_info`` and ``bpf_prog_info``, +bpf syscall command BPF_BTF_GET_FD_BY_ID can retrieve a btf fd. +Then, with command BPF_OBJ_GET_INFO_BY_FD, the btf blob, originally +loaded into the kernel with BPF_BTF_LOAD, can be retrieved. + +With the btf blob, ``bpf_map_info`` and ``bpf_prog_info``, the introspection +tool has full btf knowledge and is able to pretty print map key/values, +dump func signatures, dump line info along with byte/jit codes. + +4. ELF File Format Interface +**************************** + +4.1 .BTF section +================ + +The .BTF section contains type and string data. The format of this section +is the same as the one described in :ref:`BTF_Type_String`. + +.. _BTF_Ext_Section: + +4.2 .BTF.ext section +==================== + +The .BTF.ext section encodes func_info and line_info which +need loader manipulation before loading into the kernel. + +The specification for .BTF.ext section is defined at +``tools/lib/bpf/btf.h`` and ``tools/lib/bpf/btf.c``.
+ +The current header of .BTF.ext section:: + + struct btf_ext_header { + __u16 magic; + __u8 version; + __u8 flags; + __u32 hdr_len; + + /* All offsets are in bytes relative to the end of this header */ + __u32 func_info_off; + __u32 func_info_len; + __u32 line_info_off; + __u32 line_info_len; + }; + +It is very similar to .BTF section. Instead of type/string section, +it contains func_info and line_info section. See :ref:`BPF_Prog_Load` +for details about func_info and line_info record format. + +The func_info is organized as below.:: + + func_info_rec_size + btf_ext_info_sec for section #1 /* func_info for section #1 */ + btf_ext_info_sec for section #2 /* func_info for section #2 */ + ... + +``func_info_rec_size`` specifies the size of ``bpf_func_info`` structure +when .BTF.ext is generated. btf_ext_info_sec, defined below, is +the func_info for each specific ELF section.:: + + struct btf_ext_info_sec { + __u32 sec_name_off; /* offset to section name */ + __u32 num_info; + /* Followed by num_info * record_size number of bytes */ + __u8 data[0]; + }; + +Here, num_info must be greater than 0. + +The line_info is organized as below.:: + + line_info_rec_size + btf_ext_info_sec for section #1 /* line_info for section #1 */ + btf_ext_info_sec for section #2 /* line_info for section #2 */ + ... + +``line_info_rec_size`` specifies the size of ``bpf_line_info`` structure +when .BTF.ext is generated. + +The interpretation of ``bpf_func_info->insn_off`` and +``bpf_line_info->insn_off`` is different between kernel API and ELF API. +For kernel API, the ``insn_off`` is the instruction offset in the unit +of ``struct bpf_insn``. For ELF API, the ``insn_off`` is the byte offset +from the beginning of section (``btf_ext_info_sec->sec_name_off``). + +5. Using BTF +************ + +5.1 bpftool map pretty print +============================ + +With BTF, the map key/value can be printed based on fields rather than +simply raw bytes. 
This is especially +valuable for large structures or if your data structure +has bitfields. For example, for the following map,:: + + enum A { A1, A2, A3, A4, A5 }; + typedef enum A ___A; + struct tmp_t { + char a1:4; + int a2:4; + int :4; + __u32 a3:4; + int b; + ___A b1:4; + enum A b2:4; + }; + struct bpf_map_def SEC("maps") tmpmap = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(struct tmp_t), + .max_entries = 1, + }; + BPF_ANNOTATE_KV_PAIR(tmpmap, int, struct tmp_t); + +bpftool is able to pretty print like below: +:: + + [{ + "key": 0, + "value": { + "a1": 0x2, + "a2": 0x4, + "a3": 0x6, + "b": 7, + "b1": 0x8, + "b2": 0xa + } + } + ] + +5.2 bpftool prog dump +===================== + +The following is an example to show func_info and line_info +can help prog dump with better kernel symbol name, function prototype +and line information.:: + + $ bpftool prog dump jited pinned /sys/fs/bpf/test_btf_haskv + [...] + int test_long_fname_2(struct dummy_tracepoint_args * arg): + bpf_prog_44a040bf25481309_test_long_fname_2: + ; static int test_long_fname_2(struct dummy_tracepoint_args *arg) + 0: push %rbp + 1: mov %rsp,%rbp + 4: sub $0x30,%rsp + b: sub $0x28,%rbp + f: mov %rbx,0x0(%rbp) + 13: mov %r13,0x8(%rbp) + 17: mov %r14,0x10(%rbp) + 1b: mov %r15,0x18(%rbp) + 1f: xor %eax,%eax + 21: mov %rax,0x20(%rbp) + 25: xor %esi,%esi + ; int key = 0; + 27: mov %esi,-0x4(%rbp) + ; if (!arg->sock) + 2a: mov 0x8(%rdi),%rdi + ; if (!arg->sock) + 2e: cmp $0x0,%rdi + 32: je 0x0000000000000070 + 34: mov %rbp,%rsi + ; counts = bpf_map_lookup_elem(&btf_map, &key); + [...] + +5.3 verifier log +================ + +The following is an example of how line_info can help debug verifier failures.:: + + /* The code at tools/testing/selftests/bpf/test_xdp_noinline.c + * is modified as below.
+ */ + data = (void *)(long)xdp->data; + data_end = (void *)(long)xdp->data_end; + /* + if (data + 4 > data_end) + return XDP_DROP; + */ + *(u32 *)data = dst->dst; + + $ bpftool prog load ./test_xdp_noinline.o /sys/fs/bpf/test_xdp_noinline type xdp + ; data = (void *)(long)xdp->data; + 224: (79) r2 = *(u64 *)(r10 -112) + 225: (61) r2 = *(u32 *)(r2 +0) + ; *(u32 *)data = dst->dst; + 226: (63) *(u32 *)(r2 +0) = r1 + invalid access to packet, off=0 size=4, R2(id=0,off=0,r=0) + R2 offset is outside of the packet + +6. BTF Generation +***************** + +You need latest pahole + + https://git.kernel.org/pub/scm/devel/pahole/pahole.git/ + +or llvm (8.0 or later). The pahole acts as a dwarf2btf converter. It doesn't support .BTF.ext +and btf BTF_KIND_FUNC type yet. For example,:: + + -bash-4.4$ cat t.c + struct t { + int a:2; + int b:3; + int c:2; + } g; + -bash-4.4$ gcc -c -O2 -g t.c + -bash-4.4$ pahole -JV t.o + File t.o: + [1] STRUCT t kind_flag=1 size=4 vlen=3 + a type_id=2 bitfield_size=2 bits_offset=0 + b type_id=2 bitfield_size=3 bits_offset=2 + c type_id=2 bitfield_size=2 bits_offset=5 + [2] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED + +The llvm is able to generate .BTF and .BTF.ext directly with -g for bpf target only. +The assembly code (-S) is able to show the BTF encoding in assembly format.:: + + -bash-4.4$ cat t2.c + typedef int __int32; + struct t2 { + int a2; + int (*f2)(char q1, __int32 q2, ...); + int (*f3)(); + } g2; + int main() { return 0; } + int test() { return 0; } + -bash-4.4$ clang -c -g -O2 -target bpf t2.c + -bash-4.4$ readelf -S t2.o + ...... + [ 8] .BTF PROGBITS 0000000000000000 00000247 + 000000000000016e 0000000000000000 0 0 1 + [ 9] .BTF.ext PROGBITS 0000000000000000 000003b5 + 0000000000000060 0000000000000000 0 0 1 + [10] .rel.BTF.ext REL 0000000000000000 000007e0 + 0000000000000040 0000000000000010 16 9 8 + ...... + -bash-4.4$ clang -S -g -O2 -target bpf t2.c + -bash-4.4$ cat t2.s + ...... 
+ .section .BTF,"",@progbits + .short 60319 # 0xeb9f + .byte 1 + .byte 0 + .long 24 + .long 0 + .long 220 + .long 220 + .long 122 + .long 0 # BTF_KIND_FUNC_PROTO(id = 1) + .long 218103808 # 0xd000000 + .long 2 + .long 83 # BTF_KIND_INT(id = 2) + .long 16777216 # 0x1000000 + .long 4 + .long 16777248 # 0x1000020 + ...... + .byte 0 # string offset=0 + .ascii ".text" # string offset=1 + .byte 0 + .ascii "/home/yhs/tmp-pahole/t2.c" # string offset=7 + .byte 0 + .ascii "int main() { return 0; }" # string offset=33 + .byte 0 + .ascii "int test() { return 0; }" # string offset=58 + .byte 0 + .ascii "int" # string offset=83 + ...... + .section .BTF.ext,"",@progbits + .short 60319 # 0xeb9f + .byte 1 + .byte 0 + .long 24 + .long 0 + .long 28 + .long 28 + .long 44 + .long 8 # FuncInfo + .long 1 # FuncInfo section string offset=1 + .long 2 + .long .Lfunc_begin0 + .long 3 + .long .Lfunc_begin1 + .long 5 + .long 16 # LineInfo + .long 1 # LineInfo section string offset=1 + .long 2 + .long .Ltmp0 + .long 7 + .long 33 + .long 7182 # Line 7 Col 14 + .long .Ltmp3 + .long 7 + .long 58 + .long 8206 # Line 8 Col 14 + +7. Testing +********** + +Kernel bpf selftest `test_btf.c` provides extensive set of BTF related tests. diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst index 00a8450a602f..4e77932959cc 100644 --- a/Documentation/bpf/index.rst +++ b/Documentation/bpf/index.rst @@ -15,6 +15,13 @@ that goes into great technical depth about the BPF Architecture. The primary info for the bpf syscall is available in the `man-pages`_ for `bpf(2)`_. +BPF Type Format (BTF) +===================== + +.. toctree:: + :maxdepth: 1 + + btf Frequently asked questions (FAQ) -- cgit From 30e5c2c6bf285d93dee4c45f23da95d7d50b125a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 25 Jan 2019 10:53:23 -0800 Subject: net: Revert devlink health changes. 
This reverts the devlink health changes from 9/17/2019, Jiri wants things to be designed differently and it was agreed that the easiest way to do this is start from the beginning again. Commits reverted: cb5ccfbe73b389470e1dc11061bb185ef4bc9aec 880ee82f0313453ec5a6cb122866ac057263066b c7af343b4e33578b7de91786a3f639c8cfa0d97b ff253fedab961b22117a73ab808fcfa9e6852b50 6f9d56132eb6d2603d4273cfc65bed914ec47acb fcd852c69d776c0f46c8f79e8e431e5cc6ddc7b7 8a66704a13d9713593342e29b4f0c19762f5746b 12bd0dcefe88782ac1c9fff632958dd1b71d27e5 aba25279c10094c5c97d09c3491ca86d00b4ad5e ce019faa70f81555fa17ebc1d5a03651f2e7e15a b8c45a033acc607201588f7665ba84207e5149e0 And the follow-on build fix: o33a0efa4baecd689da9474ce0e8b673eb6931c60 Signed-off-by: David S. Miller --- Documentation/networking/devlink-health.txt | 86 -- drivers/net/ethernet/mellanox/mlx5/core/Makefile | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en.h | 18 +- .../net/ethernet/mellanox/mlx5/core/en/reporter.h | 15 - .../ethernet/mellanox/mlx5/core/en/reporter_tx.c | 356 ------- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 186 +++- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 2 +- include/net/devlink.h | 144 --- include/trace/events/devlink.h | 62 -- include/uapi/linux/devlink.h | 25 - net/core/devlink.c | 1058 -------------------- 11 files changed, 169 insertions(+), 1785 deletions(-) delete mode 100644 Documentation/networking/devlink-health.txt delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c (limited to 'Documentation') diff --git a/Documentation/networking/devlink-health.txt b/Documentation/networking/devlink-health.txt deleted file mode 100644 index 1db3fbea0831..000000000000 --- a/Documentation/networking/devlink-health.txt +++ /dev/null @@ -1,86 +0,0 @@ -The health mechanism is targeted for Real Time Alerting, in order to know when -something bad had happened to a PCI device -- Provide alert debug 
information -- Self healing -- If problem needs vendor support, provide a way to gather all needed debugging - information. - -The main idea is to unify and centralize driver health reports in the -generic devlink instance and allow the user to set different -attributes of the health reporting and recovery procedures. - -The devlink health reporter: -Device driver creates a "health reporter" per each error/health type. -Error/Health type can be a known/generic (eg pci error, fw error, rx/tx error) -or unknown (driver specific). -For each registered health reporter a driver can issue error/health reports -asynchronously. All health reports handling is done by devlink. -Device driver can provide specific callbacks for each "health reporter", e.g. - - Recovery procedures - - Diagnostics and object dump procedures - - OOB initial parameters -Different parts of the driver can register different types of health reporters -with different handlers. - -Once an error is reported, devlink health will do the following actions: - * A log is being send to the kernel trace events buffer - * Health status and statistics are being updated for the reporter instance - * Object dump is being taken and saved at the reporter instance (as long as - there is no other dump which is already stored) - * Auto recovery attempt is being done. Depends on: - - Auto-recovery configuration - - Grace period vs. time passed since last recover - -The user interface: -User can access/change each reporter's parameters and driver specific callbacks -via devlink, e.g per error type (per health reporter) - - Configure reporter's generic parameters (like: disable/enable auto recovery) - - Invoke recovery procedure - - Run diagnostics - - Object dump - -The devlink health interface (via netlink): -DEVLINK_CMD_HEALTH_REPORTER_GET - Retrieves status and configuration info per DEV and reporter. -DEVLINK_CMD_HEALTH_REPORTER_SET - Allows reporter-related configuration setting. 
-DEVLINK_CMD_HEALTH_REPORTER_RECOVER - Triggers a reporter's recovery procedure. -DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE - Retrieves diagnostics data from a reporter on a device. -DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET - Retrieves the last stored dump. Devlink health - saves a single dump. If an dump is not already stored by the devlink - for this reporter, devlink generates a new dump. - dump output is defined by the reporter. -DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR - Clears the last saved dump file for the specified reporter. - - - netlink - +--------------------------+ - | | - | + | - | | | - +--------------------------+ - |request for ops - |(diagnose, - mlx5_core devlink |recover, - |dump) -+--------+ +--------------------------+ -| | | reporter| | -| | | +---------v----------+ | -| | ops execution | | | | -| <----------------------------------+ | | -| | | | | | -| | | + ^------------------+ | -| | | | request for ops | -| | | | (recover, dump) | -| | | | | -| | | +-+------------------+ | -| | health report | | health handler | | -| +-------------------------------> | | -| | | +--------------------+ | -| | health reporter create | | -| +----------------------------> | -+--------+ +--------------------------+ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 6bb2a860b15b..9de9abacf7f6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -22,7 +22,7 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \ # mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \ en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o \ - en_selftest.o en/port.o en/monitor_stats.o en/reporter_tx.o + en_selftest.o en/port.o en/monitor_stats.o # # Netdev extra diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 27e276c9bf84..8fa8fdd30b85 100644 --- 
a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -388,7 +388,10 @@ struct mlx5e_txqsq { struct mlx5e_channel *channel; int txq_ix; u32 rate_limit; - struct work_struct recover_work; + struct mlx5e_txqsq_recover { + struct work_struct recover_work; + u64 last_recover; + } recover; } ____cacheline_aligned_in_smp; struct mlx5e_dma_info { @@ -679,13 +682,6 @@ struct mlx5e_rss_params { u8 hfunc; }; -struct mlx5e_modify_sq_param { - int curr_state; - int next_state; - int rl_update; - int rl_index; -}; - struct mlx5e_priv { /* priv data path fields - start */ struct mlx5e_txqsq *txq2sq[MLX5E_MAX_NUM_CHANNELS * MLX5E_MAX_NUM_TC]; @@ -741,7 +737,6 @@ struct mlx5e_priv { #ifdef CONFIG_MLX5_EN_TLS struct mlx5e_tls *tls; #endif - struct devlink_health_reporter *tx_reporter; }; struct mlx5e_profile { @@ -871,11 +866,6 @@ void mlx5e_set_rq_type(struct mlx5_core_dev *mdev, struct mlx5e_params *params); void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev, struct mlx5e_params *params); -int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn, - struct mlx5e_modify_sq_param *p); -void mlx5e_activate_txqsq(struct mlx5e_txqsq *sq); -void mlx5e_tx_disable_queue(struct netdev_queue *txq); - static inline bool mlx5e_tunnel_inner_ft_supported(struct mlx5_core_dev *mdev) { return (MLX5_CAP_ETH(mdev, tunnel_stateless_gre) && diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h deleted file mode 100644 index 2335c5b48820..000000000000 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ -/* Copyright (c) 2018 Mellanox Technologies. 
*/ - -#ifndef __MLX5E_EN_REPORTER_H -#define __MLX5E_EN_REPORTER_H - -#include -#include "en.h" - -int mlx5e_tx_reporter_create(struct mlx5e_priv *priv); -void mlx5e_tx_reporter_destroy(struct mlx5e_priv *priv); -void mlx5e_tx_reporter_err_cqe(struct mlx5e_txqsq *sq); -void mlx5e_tx_reporter_timeout(struct mlx5e_txqsq *sq); - -#endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c deleted file mode 100644 index d9675afbb924..000000000000 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c +++ /dev/null @@ -1,356 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ -/* Copyright (c) 2018 Mellanox Technologies. */ - -#include -#include "reporter.h" -#include "lib/eq.h" - -#define MLX5E_TX_REPORTER_PER_SQ_MAX_LEN 256 - -struct mlx5e_tx_err_ctx { - int (*recover)(struct mlx5e_txqsq *sq); - struct mlx5e_txqsq *sq; -}; - -static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq) -{ - unsigned long exp_time = jiffies + msecs_to_jiffies(2000); - - while (time_before(jiffies, exp_time)) { - if (sq->cc == sq->pc) - return 0; - - msleep(20); - } - - netdev_err(sq->channel->netdev, - "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n", - sq->sqn, sq->cc, sq->pc); - - return -ETIMEDOUT; -} - -static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq) -{ - WARN_ONCE(sq->cc != sq->pc, - "SQ 0x%x: cc (0x%x) != pc (0x%x)\n", - sq->sqn, sq->cc, sq->pc); - sq->cc = 0; - sq->dma_fifo_cc = 0; - sq->pc = 0; -} - -static int mlx5e_sq_to_ready(struct mlx5e_txqsq *sq, int curr_state) -{ - struct mlx5_core_dev *mdev = sq->channel->mdev; - struct net_device *dev = sq->channel->netdev; - struct mlx5e_modify_sq_param msp = {0}; - int err; - - msp.curr_state = curr_state; - msp.next_state = MLX5_SQC_STATE_RST; - - err = mlx5e_modify_sq(mdev, sq->sqn, &msp); - if (err) { - netdev_err(dev, "Failed to move sq 0x%x to reset\n", sq->sqn); - return err; - } - - memset(&msp, 0, 
sizeof(msp)); - msp.curr_state = MLX5_SQC_STATE_RST; - msp.next_state = MLX5_SQC_STATE_RDY; - - err = mlx5e_modify_sq(mdev, sq->sqn, &msp); - if (err) { - netdev_err(dev, "Failed to move sq 0x%x to ready\n", sq->sqn); - return err; - } - - return 0; -} - -static int mlx5e_tx_reporter_err_cqe_recover(struct mlx5e_txqsq *sq) -{ - struct mlx5_core_dev *mdev = sq->channel->mdev; - struct net_device *dev = sq->channel->netdev; - u8 state; - int err; - - if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) - return 0; - - err = mlx5_core_query_sq_state(mdev, sq->sqn, &state); - if (err) { - netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n", - sq->sqn, err); - return err; - } - - if (state != MLX5_RQC_STATE_ERR) { - netdev_err(dev, "SQ 0x%x not in ERROR state\n", sq->sqn); - return -EINVAL; - } - - mlx5e_tx_disable_queue(sq->txq); - - err = mlx5e_wait_for_sq_flush(sq); - if (err) - return err; - - /* At this point, no new packets will arrive from the stack as TXQ is - * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all - * pending WQEs. SQ can safely reset the SQ. 
- */ - - err = mlx5e_sq_to_ready(sq, state); - if (err) - return err; - - mlx5e_reset_txqsq_cc_pc(sq); - sq->stats->recover++; - mlx5e_activate_txqsq(sq); - - return 0; -} - -void mlx5e_tx_reporter_err_cqe(struct mlx5e_txqsq *sq) -{ - char err_str[MLX5E_TX_REPORTER_PER_SQ_MAX_LEN]; - struct mlx5e_tx_err_ctx err_ctx = {0}; - - err_ctx.sq = sq; - err_ctx.recover = mlx5e_tx_reporter_err_cqe_recover; - sprintf(err_str, "ERR CQE on SQ: 0x%x", sq->sqn); - - devlink_health_report(sq->channel->priv->tx_reporter, err_str, - &err_ctx); -} - -static int mlx5e_tx_reporter_timeout_recover(struct mlx5e_txqsq *sq) -{ - struct mlx5_eq_comp *eq = sq->cq.mcq.eq; - u32 eqe_count; - - netdev_err(sq->channel->netdev, "EQ 0x%x: Cons = 0x%x, irqn = 0x%x\n", - eq->core.eqn, eq->core.cons_index, eq->core.irqn); - - eqe_count = mlx5_eq_poll_irq_disabled(eq); - if (!eqe_count) { - clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); - return 1; - } - - netdev_err(sq->channel->netdev, "Recover %d eqes on EQ 0x%x\n", - eqe_count, eq->core.eqn); - sq->channel->stats->eq_rearm++; - return 0; -} - -void mlx5e_tx_reporter_timeout(struct mlx5e_txqsq *sq) -{ - struct mlx5e_tx_err_ctx err_ctx; - char err_str[MLX5E_TX_REPORTER_PER_SQ_MAX_LEN]; - - err_ctx.sq = sq; - err_ctx.recover = mlx5e_tx_reporter_timeout_recover; - sprintf(err_str, - "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u\n", - sq->channel->ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc, - jiffies_to_usecs(jiffies - sq->txq->trans_start)); - devlink_health_report(sq->channel->priv->tx_reporter, err_str, - &err_ctx); -} - -/* state lock cannot be grabbed within this function. - * It can cause a dead lock or a read-after-free. 
- */ -int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_tx_err_ctx *err_ctx) -{ - return err_ctx->recover(err_ctx->sq); -} - -static int mlx5e_tx_reporter_recover_all(struct mlx5e_priv *priv) -{ - int err; - - mutex_lock(&priv->state_lock); - mlx5e_close_locked(priv->netdev); - err = mlx5e_open_locked(priv->netdev); - mutex_unlock(&priv->state_lock); - - return err; -} - -static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter, - void *context) -{ - struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); - struct mlx5e_tx_err_ctx *err_ctx = context; - - return err_ctx ? mlx5e_tx_reporter_recover_from_ctx(err_ctx) : - mlx5e_tx_reporter_recover_all(priv); -} - -static int -mlx5e_tx_reporter_build_diagnose_output(struct devlink_health_buffer *buffer, - u32 sqn, u8 state, u8 stopped) -{ - int err, i; - int nest = 0; - char name[20]; - - err = devlink_health_buffer_nest_start(buffer, - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT); - if (err) - goto buffer_error; - nest++; - - err = devlink_health_buffer_nest_start(buffer, - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR); - if (err) - goto buffer_error; - nest++; - - sprintf(name, "SQ 0x%x", sqn); - err = devlink_health_buffer_put_object_name(buffer, name); - if (err) - goto buffer_error; - - err = devlink_health_buffer_nest_start(buffer, - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE); - if (err) - goto buffer_error; - nest++; - - err = devlink_health_buffer_nest_start(buffer, - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT); - if (err) - goto buffer_error; - nest++; - - err = devlink_health_buffer_nest_start(buffer, - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR); - if (err) - goto buffer_error; - nest++; - - err = devlink_health_buffer_put_object_name(buffer, "HW state"); - if (err) - goto buffer_error; - - err = devlink_health_buffer_nest_start(buffer, - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE); - if (err) - goto buffer_error; - nest++; - - err = devlink_health_buffer_put_value_u8(buffer, state); - if (err) - goto 
buffer_error; - - devlink_health_buffer_nest_end(buffer); /* DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE */ - nest--; - - devlink_health_buffer_nest_end(buffer); /* DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR */ - nest--; - - err = devlink_health_buffer_nest_start(buffer, - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR); - if (err) - goto buffer_error; - nest++; - - err = devlink_health_buffer_put_object_name(buffer, "stopped"); - if (err) - goto buffer_error; - - err = devlink_health_buffer_nest_start(buffer, - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE); - if (err) - goto buffer_error; - nest++; - - err = devlink_health_buffer_put_value_u8(buffer, stopped); - if (err) - goto buffer_error; - - for (i = 0; i < nest; i++) - devlink_health_buffer_nest_end(buffer); - - return 0; - -buffer_error: - for (i = 0; i < nest; i++) - devlink_health_buffer_nest_cancel(buffer); - return err; -} - -static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter, - struct devlink_health_buffer **buffers_array, - unsigned int buffer_size, - unsigned int num_buffers) -{ - struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); - unsigned int buff = 0; - int i = 0, err = 0; - - if (buffer_size < MLX5E_TX_REPORTER_PER_SQ_MAX_LEN) - return -ENOMEM; - - mutex_lock(&priv->state_lock); - - if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { - mutex_unlock(&priv->state_lock); - return 0; - } - - while (i < priv->channels.num * priv->channels.params.num_tc) { - struct mlx5e_txqsq *sq = priv->txq2sq[i]; - u8 state; - - err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state); - if (err) - break; - - err = mlx5e_tx_reporter_build_diagnose_output(buffers_array[buff], - sq->sqn, state, - netif_xmit_stopped(sq->txq)); - if (err) { - if (++buff == num_buffers) - break; - } else { - i++; - } - } - - mutex_unlock(&priv->state_lock); - return err; -} - -static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = { - .name = "TX", - .recover = mlx5e_tx_reporter_recover, - 
.diagnose_size = MLX5E_MAX_NUM_CHANNELS * MLX5E_MAX_NUM_TC * - MLX5E_TX_REPORTER_PER_SQ_MAX_LEN, - .diagnose = mlx5e_tx_reporter_diagnose, - .dump_size = 0, - .dump = NULL, -}; - -#define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500 -int mlx5e_tx_reporter_create(struct mlx5e_priv *priv) -{ - struct mlx5_core_dev *mdev = priv->mdev; - struct devlink *devlink = priv_to_devlink(mdev); - - priv->tx_reporter = - devlink_health_reporter_create(devlink, &mlx5_tx_reporter_ops, - MLX5_REPORTER_TX_GRACEFUL_PERIOD, - true, priv); - return PTR_ERR_OR_ZERO(priv->tx_reporter); -} - -void mlx5e_tx_reporter_destroy(struct mlx5e_priv *priv) -{ - devlink_health_reporter_destroy(priv->tx_reporter); -} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index dee0c8f3d4e9..8cfd2ec7c0a2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -51,7 +51,6 @@ #include "en/xdp.h" #include "lib/eq.h" #include "en/monitor_stats.h" -#include "en/reporter.h" struct mlx5e_rq_param { u32 rqc[MLX5_ST_SZ_DW(rqc)]; @@ -1161,7 +1160,7 @@ static int mlx5e_alloc_txqsq_db(struct mlx5e_txqsq *sq, int numa) return 0; } -static void mlx5e_tx_err_cqe_work(struct work_struct *recover_work); +static void mlx5e_sq_recover(struct work_struct *work); static int mlx5e_alloc_txqsq(struct mlx5e_channel *c, int txq_ix, struct mlx5e_params *params, @@ -1183,7 +1182,7 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c, sq->uar_map = mdev->mlx5e_res.bfreg.map; sq->min_inline_mode = params->tx_min_inline_mode; sq->stats = &c->priv->channel_stats[c->ix].sq[tc]; - INIT_WORK(&sq->recover_work, mlx5e_tx_err_cqe_work); + INIT_WORK(&sq->recover.recover_work, mlx5e_sq_recover); if (MLX5_IPSEC_DEV(c->priv->mdev)) set_bit(MLX5E_SQ_STATE_IPSEC, &sq->state); if (mlx5_accel_is_tls_device(c->priv->mdev)) @@ -1271,8 +1270,15 @@ static int mlx5e_create_sq(struct mlx5_core_dev *mdev, return err; } -int 
mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn, - struct mlx5e_modify_sq_param *p) +struct mlx5e_modify_sq_param { + int curr_state; + int next_state; + bool rl_update; + int rl_index; +}; + +static int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn, + struct mlx5e_modify_sq_param *p) { void *in; void *sqc; @@ -1370,7 +1376,17 @@ err_free_txqsq: return err; } -void mlx5e_activate_txqsq(struct mlx5e_txqsq *sq) +static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq) +{ + WARN_ONCE(sq->cc != sq->pc, + "SQ 0x%x: cc (0x%x) != pc (0x%x)\n", + sq->sqn, sq->cc, sq->pc); + sq->cc = 0; + sq->dma_fifo_cc = 0; + sq->pc = 0; +} + +static void mlx5e_activate_txqsq(struct mlx5e_txqsq *sq) { sq->txq = netdev_get_tx_queue(sq->channel->netdev, sq->txq_ix); clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); @@ -1379,7 +1395,7 @@ void mlx5e_activate_txqsq(struct mlx5e_txqsq *sq) netif_tx_start_queue(sq->txq); } -void mlx5e_tx_disable_queue(struct netdev_queue *txq) +static inline void netif_tx_disable_queue(struct netdev_queue *txq) { __netif_tx_lock_bh(txq); netif_tx_stop_queue(txq); @@ -1395,7 +1411,7 @@ static void mlx5e_deactivate_txqsq(struct mlx5e_txqsq *sq) /* prevent netif_tx_wake_queue */ napi_synchronize(&c->napi); - mlx5e_tx_disable_queue(sq->txq); + netif_tx_disable_queue(sq->txq); /* last doorbell out, godspeed .. 
*/ if (mlx5e_wqc_has_room_for(wq, sq->cc, sq->pc, 1)) { @@ -1415,7 +1431,6 @@ static void mlx5e_close_txqsq(struct mlx5e_txqsq *sq) struct mlx5_rate_limit rl = {0}; cancel_work_sync(&sq->dim.work); - cancel_work_sync(&sq->recover_work); mlx5e_destroy_sq(mdev, sq->sqn); if (sq->rate_limit) { rl.rate = sq->rate_limit; @@ -1425,15 +1440,105 @@ static void mlx5e_close_txqsq(struct mlx5e_txqsq *sq) mlx5e_free_txqsq(sq); } -static void mlx5e_tx_err_cqe_work(struct work_struct *recover_work) +static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq) +{ + unsigned long exp_time = jiffies + msecs_to_jiffies(2000); + + while (time_before(jiffies, exp_time)) { + if (sq->cc == sq->pc) + return 0; + + msleep(20); + } + + netdev_err(sq->channel->netdev, + "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n", + sq->sqn, sq->cc, sq->pc); + + return -ETIMEDOUT; +} + +static int mlx5e_sq_to_ready(struct mlx5e_txqsq *sq, int curr_state) { - struct mlx5e_txqsq *sq = container_of(recover_work, struct mlx5e_txqsq, - recover_work); + struct mlx5_core_dev *mdev = sq->channel->mdev; + struct net_device *dev = sq->channel->netdev; + struct mlx5e_modify_sq_param msp = {0}; + int err; - if (!sq->channel->priv->tx_reporter) + msp.curr_state = curr_state; + msp.next_state = MLX5_SQC_STATE_RST; + + err = mlx5e_modify_sq(mdev, sq->sqn, &msp); + if (err) { + netdev_err(dev, "Failed to move sq 0x%x to reset\n", sq->sqn); + return err; + } + + memset(&msp, 0, sizeof(msp)); + msp.curr_state = MLX5_SQC_STATE_RST; + msp.next_state = MLX5_SQC_STATE_RDY; + + err = mlx5e_modify_sq(mdev, sq->sqn, &msp); + if (err) { + netdev_err(dev, "Failed to move sq 0x%x to ready\n", sq->sqn); + return err; + } + + return 0; +} + +static void mlx5e_sq_recover(struct work_struct *work) +{ + struct mlx5e_txqsq_recover *recover = + container_of(work, struct mlx5e_txqsq_recover, + recover_work); + struct mlx5e_txqsq *sq = container_of(recover, struct mlx5e_txqsq, + recover); + struct mlx5_core_dev *mdev = 
sq->channel->mdev; + struct net_device *dev = sq->channel->netdev; + u8 state; + int err; + + err = mlx5_core_query_sq_state(mdev, sq->sqn, &state); + if (err) { + netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n", + sq->sqn, err); + return; + } + + if (state != MLX5_RQC_STATE_ERR) { + netdev_err(dev, "SQ 0x%x not in ERROR state\n", sq->sqn); + return; + } + + netif_tx_disable_queue(sq->txq); + + if (mlx5e_wait_for_sq_flush(sq)) return; - mlx5e_tx_reporter_err_cqe(sq); + /* If the interval between two consecutive recovers per SQ is too + * short, don't recover to avoid infinite loop of ERR_CQE -> recover. + * If we reached this state, there is probably a bug that needs to be + * fixed. let's keep the queue close and let tx timeout cleanup. + */ + if (jiffies_to_msecs(jiffies - recover->last_recover) < + MLX5E_SQ_RECOVER_MIN_INTERVAL) { + netdev_err(dev, "Recover SQ 0x%x canceled, too many error CQEs\n", + sq->sqn); + return; + } + + /* At this point, no new packets will arrive from the stack as TXQ is + * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all + * pending WQEs. SQ can safely reset the SQ. 
+ */ + if (mlx5e_sq_to_ready(sq, state)) + return; + + mlx5e_reset_txqsq_cc_pc(sq); + sq->stats->recover++; + recover->last_recover = jiffies; + mlx5e_activate_txqsq(sq); } static int mlx5e_open_icosq(struct mlx5e_channel *c, @@ -3102,7 +3207,6 @@ static void mlx5e_cleanup_nic_tx(struct mlx5e_priv *priv) { int tc; - mlx5e_tx_reporter_destroy(priv); for (tc = 0; tc < priv->profile->max_tc; tc++) mlx5e_destroy_tis(priv->mdev, priv->tisn[tc]); } @@ -4074,14 +4178,31 @@ netdev_features_t mlx5e_features_check(struct sk_buff *skb, return features; } +static bool mlx5e_tx_timeout_eq_recover(struct net_device *dev, + struct mlx5e_txqsq *sq) +{ + struct mlx5_eq_comp *eq = sq->cq.mcq.eq; + u32 eqe_count; + + netdev_err(dev, "EQ 0x%x: Cons = 0x%x, irqn = 0x%x\n", + eq->core.eqn, eq->core.cons_index, eq->core.irqn); + + eqe_count = mlx5_eq_poll_irq_disabled(eq); + if (!eqe_count) + return false; + + netdev_err(dev, "Recover %d eqes on EQ 0x%x\n", eqe_count, eq->core.eqn); + sq->channel->stats->eq_rearm++; + return true; +} + static void mlx5e_tx_timeout_work(struct work_struct *work) { struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv, tx_timeout_work); - int i; - - if (!priv->tx_reporter) - return; + struct net_device *dev = priv->netdev; + bool reopen_channels = false; + int i, err; rtnl_lock(); mutex_lock(&priv->state_lock); @@ -4090,16 +4211,36 @@ static void mlx5e_tx_timeout_work(struct work_struct *work) goto unlock; for (i = 0; i < priv->channels.num * priv->channels.params.num_tc; i++) { - struct netdev_queue *dev_queue = - netdev_get_tx_queue(priv->netdev, i); + struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, i); struct mlx5e_txqsq *sq = priv->txq2sq[i]; if (!netif_xmit_stopped(dev_queue)) continue; - mlx5e_tx_reporter_timeout(sq); + netdev_err(dev, + "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u\n", + i, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc, + jiffies_to_usecs(jiffies - 
dev_queue->trans_start)); + + /* If we recover a lost interrupt, most likely TX timeout will + * be resolved, skip reopening channels + */ + if (!mlx5e_tx_timeout_eq_recover(dev, sq)) { + clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); + reopen_channels = true; + } } + if (!reopen_channels) + goto unlock; + + mlx5e_close_locked(dev); + err = mlx5e_open_locked(dev); + if (err) + netdev_err(priv->netdev, + "mlx5e_open_locked failed recovering from a tx_timeout, err(%d).\n", + err); + unlock: mutex_unlock(&priv->state_lock); rtnl_unlock(); @@ -4767,7 +4908,6 @@ static int mlx5e_init_nic_tx(struct mlx5e_priv *priv) #ifdef CONFIG_MLX5_CORE_EN_DCB mlx5e_dcbnl_initialize(priv); #endif - mlx5e_tx_reporter_create(priv); return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index a8e052a5ce36..598ad7e4d5c9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -514,7 +514,7 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget) mlx5e_dump_error_cqe(sq, (struct mlx5_err_cqe *)cqe); queue_work(cq->channel->priv->wq, - &sq->recover_work); + &sq->recover.recover_work); } stats->cqe_err++; } diff --git a/include/net/devlink.h b/include/net/devlink.h index a81a1b7a67d7..67f4293bc970 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -30,7 +30,6 @@ struct devlink { struct list_head param_list; struct list_head region_list; u32 snapshot_id; - struct list_head reporter_list; struct devlink_dpipe_headers *dpipe_headers; const struct devlink_ops *ops; struct device *dev; @@ -424,36 +423,6 @@ struct devlink_region; typedef void devlink_snapshot_data_dest_t(const void *data); -struct devlink_health_buffer; -struct devlink_health_reporter; - -/** - * struct devlink_health_reporter_ops - Reporter operations - * @name: reporter name - * dump_size: dump buffer size allocated by the devlink - * diagnose_size: diagnose buffer size allocated 
by the devlink - * recover: callback to recover from reported error - * if priv_ctx is NULL, run a full recover - * dump: callback to dump an object - * if priv_ctx is NULL, run a full dump - * diagnose: callback to diagnose the current status - */ - -struct devlink_health_reporter_ops { - char *name; - unsigned int dump_size; - unsigned int diagnose_size; - int (*recover)(struct devlink_health_reporter *reporter, - void *priv_ctx); - int (*dump)(struct devlink_health_reporter *reporter, - struct devlink_health_buffer **buffers_array, - unsigned int buffer_size, unsigned int num_buffers, - void *priv_ctx); - int (*diagnose)(struct devlink_health_reporter *reporter, - struct devlink_health_buffer **buffers_array, - unsigned int buffer_size, unsigned int num_buffers); -}; - struct devlink_ops { int (*reload)(struct devlink *devlink, struct netlink_ext_ack *extack); int (*port_type_set)(struct devlink_port *devlink_port, @@ -615,34 +584,6 @@ int devlink_region_snapshot_create(struct devlink_region *region, u64 data_len, u8 *data, u32 snapshot_id, devlink_snapshot_data_dest_t *data_destructor); -int devlink_health_buffer_nest_start(struct devlink_health_buffer *buffer, - int attrtype); -void devlink_health_buffer_nest_end(struct devlink_health_buffer *buffer); -void devlink_health_buffer_nest_cancel(struct devlink_health_buffer *buffer); -int devlink_health_buffer_put_object_name(struct devlink_health_buffer *buffer, - char *name); -int devlink_health_buffer_put_value_u8(struct devlink_health_buffer *buffer, - u8 value); -int devlink_health_buffer_put_value_u32(struct devlink_health_buffer *buffer, - u32 value); -int devlink_health_buffer_put_value_u64(struct devlink_health_buffer *buffer, - u64 value); -int devlink_health_buffer_put_value_string(struct devlink_health_buffer *buffer, - char *name); -int devlink_health_buffer_put_value_data(struct devlink_health_buffer *buffer, - void *data, int len); -struct devlink_health_reporter * 
-devlink_health_reporter_create(struct devlink *devlink, - const struct devlink_health_reporter_ops *ops, - u64 graceful_period, bool auto_recover, - void *priv); -void -devlink_health_reporter_destroy(struct devlink_health_reporter *reporter); - -void * -devlink_health_reporter_priv(struct devlink_health_reporter *reporter); -int devlink_health_report(struct devlink_health_reporter *reporter, - const char *msg, void *priv_ctx); #else static inline struct devlink *devlink_alloc(const struct devlink_ops *ops, @@ -903,91 +844,6 @@ devlink_region_snapshot_create(struct devlink_region *region, u64 data_len, return 0; } -static inline int -devlink_health_buffer_nest_start(struct devlink_health_buffer *buffer, - int attrtype) -{ - return 0; -} - -static inline void -devlink_health_buffer_nest_end(struct devlink_health_buffer *buffer) -{ -} - -static inline void -devlink_health_buffer_nest_cancel(struct devlink_health_buffer *buffer) -{ -} - -static inline int -devlink_health_buffer_put_object_name(struct devlink_health_buffer *buffer, - char *name) -{ - return 0; -} - -static inline int -devlink_health_buffer_put_value_u8(struct devlink_health_buffer *buffer, - u8 value) -{ - return 0; -} - -static inline int -devlink_health_buffer_put_value_u32(struct devlink_health_buffer *buffer, - u32 value) -{ - return 0; -} - -static inline int -devlink_health_buffer_put_value_u64(struct devlink_health_buffer *buffer, - u64 value) -{ - return 0; -} - -static inline int -devlink_health_buffer_put_value_string(struct devlink_health_buffer *buffer, - char *name) -{ - return 0; -} - -static inline int -devlink_health_buffer_put_value_data(struct devlink_health_buffer *buffer, - void *data, int len) -{ - return 0; -} - -static inline struct devlink_health_reporter * -devlink_health_reporter_create(struct devlink *devlink, - const struct devlink_health_reporter_ops *ops, - u64 graceful_period, bool auto_recover, - void *priv) -{ - return NULL; -} - -static inline void 
-devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) -{ -} - -static inline void * -devlink_health_reporter_priv(struct devlink_health_reporter *reporter) -{ - return NULL; -} - -static inline int -devlink_health_report(struct devlink_health_reporter *reporter, - const char *msg, void *priv_ctx) -{ - return 0; -} #endif #endif /* _NET_DEVLINK_H_ */ diff --git a/include/trace/events/devlink.h b/include/trace/events/devlink.h index 7e39d2fc7c75..44acfbca1266 100644 --- a/include/trace/events/devlink.h +++ b/include/trace/events/devlink.h @@ -46,65 +46,6 @@ TRACE_EVENT(devlink_hwmsg, (int) __entry->len, __get_dynamic_array(buf), __entry->len) ); -TRACE_EVENT(devlink_health_report, - TP_PROTO(const struct devlink *devlink, const char *reporter_name, - const char *msg), - - TP_ARGS(devlink, reporter_name, msg), - - TP_STRUCT__entry( - __string(bus_name, devlink->dev->bus->name) - __string(dev_name, dev_name(devlink->dev)) - __string(driver_name, devlink->dev->driver->name) - __string(reporter_name, msg) - __string(msg, msg) - ), - - TP_fast_assign( - __assign_str(bus_name, devlink->dev->bus->name); - __assign_str(dev_name, dev_name(devlink->dev)); - __assign_str(driver_name, devlink->dev->driver->name); - __assign_str(reporter_name, reporter_name); - __assign_str(msg, msg); - ), - - TP_printk("bus_name=%s dev_name=%s driver_name=%s reporter_name=%s: %s", - __get_str(bus_name), __get_str(dev_name), - __get_str(driver_name), __get_str(reporter_name), - __get_str(msg)) -); - -TRACE_EVENT(devlink_health_recover_aborted, - TP_PROTO(const struct devlink *devlink, const char *reporter_name, - bool health_state, u64 time_since_last_recover), - - TP_ARGS(devlink, reporter_name, health_state, time_since_last_recover), - - TP_STRUCT__entry( - __string(bus_name, devlink->dev->bus->name) - __string(dev_name, dev_name(devlink->dev)) - __string(driver_name, devlink->dev->driver->name) - __string(reporter_name, reporter_name) - __field(bool, health_state) - 
__field(u64, time_since_last_recover) - ), - - TP_fast_assign( - __assign_str(bus_name, devlink->dev->bus->name); - __assign_str(dev_name, dev_name(devlink->dev)); - __assign_str(driver_name, devlink->dev->driver->name); - __assign_str(reporter_name, reporter_name); - __entry->health_state = health_state; - __entry->time_since_last_recover = time_since_last_recover; - ), - - TP_printk("bus_name=%s dev_name=%s driver_name=%s reporter_name=%s: health_state=%d time_since_last_recover = %llu recover aborted", - __get_str(bus_name), __get_str(dev_name), - __get_str(driver_name), __get_str(reporter_name), - __entry->health_state, - __entry->time_since_last_recover) -); - #endif /* _TRACE_DEVLINK_H */ /* This part must be outside protection */ @@ -123,9 +64,6 @@ static inline void trace_devlink_hwmsg(const struct devlink *devlink, { } -static inline void trace_devlink_health(const char *msg) -{ -} #endif /* _TRACE_DEVLINK_H */ #endif diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 6b26bb2ce4dc..6e52d3660654 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -89,13 +89,6 @@ enum devlink_command { DEVLINK_CMD_REGION_DEL, DEVLINK_CMD_REGION_READ, - DEVLINK_CMD_HEALTH_REPORTER_GET, - DEVLINK_CMD_HEALTH_REPORTER_SET, - DEVLINK_CMD_HEALTH_REPORTER_RECOVER, - DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, - DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, - DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR, - /* add new commands above here */ __DEVLINK_CMD_MAX, DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1 @@ -292,24 +285,6 @@ enum devlink_attr { DEVLINK_ATTR_REGION_CHUNK_ADDR, /* u64 */ DEVLINK_ATTR_REGION_CHUNK_LEN, /* u64 */ - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT, /* nested */ - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR, /* nested */ - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_NAME, /* string */ - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE, /* nested */ - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_ARRAY, /* nested */ - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE, /* u8 */ 
- DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA, /* dynamic */ - - DEVLINK_ATTR_HEALTH_REPORTER, /* nested */ - DEVLINK_ATTR_HEALTH_REPORTER_NAME, /* string */ - DEVLINK_ATTR_HEALTH_REPORTER_STATE, /* u8 */ - DEVLINK_ATTR_HEALTH_REPORTER_ERR, /* u64 */ - DEVLINK_ATTR_HEALTH_REPORTER_RECOVER, /* u64 */ - DEVLINK_ATTR_HEALTH_REPORTER_DUMP_AVAIL, /* u8 */ - DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS, /* u64 */ - DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, /* u64 */ - DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, /* u8 */ - /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 24f266468ca5..abb0da9d7b4b 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3597,1015 +3597,6 @@ out: return 0; } -#define DEVLINK_HEALTH_BUFFER_SIZE (4096 - GENL_HDRLEN) -#define DEVLINK_HEALTH_BUFFER_DATA_SIZE (DEVLINK_HEALTH_BUFFER_SIZE / 2) -#define DEVLINK_HEALTH_SIZE_TO_BUFFERS(size) DIV_ROUND_UP_ULL(size, DEVLINK_HEALTH_BUFFER_DATA_SIZE) -#define DEVLINK_HEALTH_BUFFER_MAX_CHUNK 1024 - -struct devlink_health_buffer { - void *data; - u64 offset; - u64 bytes_left; - u64 bytes_left_metadata; - u64 max_nested_depth; - u64 curr_nest; -}; - -struct devlink_health_buffer_desc { - int attrtype; - u16 len; - u8 nla_type; - u8 nest_end; - int value[0]; -}; - -static void -devlink_health_buffers_reset(struct devlink_health_buffer **buffers_list, - u64 num_of_buffers) -{ - u64 i; - - for (i = 0; i < num_of_buffers; i++) { - memset(buffers_list[i]->data, 0, DEVLINK_HEALTH_BUFFER_SIZE); - buffers_list[i]->offset = 0; - buffers_list[i]->bytes_left = DEVLINK_HEALTH_BUFFER_DATA_SIZE; - buffers_list[i]->bytes_left_metadata = - DEVLINK_HEALTH_BUFFER_DATA_SIZE; - buffers_list[i]->max_nested_depth = 0; - buffers_list[i]->curr_nest = 0; - } -} - -static void -devlink_health_buffers_destroy(struct devlink_health_buffer **buffers_list, - u64 size); - -static struct devlink_health_buffer ** 
-devlink_health_buffers_create(u64 size) -{ - struct devlink_health_buffer **buffers_list; - u64 num_of_buffers = DEVLINK_HEALTH_SIZE_TO_BUFFERS(size); - u64 i; - - buffers_list = kcalloc(num_of_buffers, - sizeof(struct devlink_health_buffer *), - GFP_KERNEL); - if (!buffers_list) - return NULL; - - for (i = 0; i < num_of_buffers; i++) { - struct devlink_health_buffer *buffer; - void *data; - - buffer = kzalloc(sizeof(*buffer), GFP_KERNEL); - data = kzalloc(DEVLINK_HEALTH_BUFFER_SIZE, GFP_KERNEL); - if (!buffer || !data) { - kfree(buffer); - kfree(data); - goto buffers_cleanup; - } - buffers_list[i] = buffer; - buffer->data = data; - } - devlink_health_buffers_reset(buffers_list, num_of_buffers); - - return buffers_list; - -buffers_cleanup: - devlink_health_buffers_destroy(buffers_list, --i); - kfree(buffers_list); - return NULL; -} - -static void -devlink_health_buffers_destroy(struct devlink_health_buffer **buffers_list, - u64 num_of_buffers) -{ - u64 i; - - for (i = 0; i < num_of_buffers; i++) { - kfree(buffers_list[i]->data); - kfree(buffers_list[i]); - } -} - -void -devlink_health_buffer_offset_inc(struct devlink_health_buffer *buffer, - int len) -{ - buffer->offset += len; -} - -/* In order to store a nest, need two descriptors, for start and end */ -#define DEVLINK_HEALTH_BUFFER_NEST_SIZE (sizeof(struct devlink_health_buffer_desc) * 2) - -int devlink_health_buffer_verify_len(struct devlink_health_buffer *buffer, - int len, int metadata_len) -{ - if (len > DEVLINK_HEALTH_BUFFER_DATA_SIZE) - return -EINVAL; - - if (buffer->bytes_left < len || - buffer->bytes_left_metadata < metadata_len) - return -ENOMEM; - - return 0; -} - -static struct devlink_health_buffer_desc * -devlink_health_buffer_get_desc_from_offset(struct devlink_health_buffer *buffer) -{ - return buffer->data + buffer->offset; -} - -int -devlink_health_buffer_nest_start(struct devlink_health_buffer *buffer, - int attrtype) -{ - struct devlink_health_buffer_desc *desc; - int err; - - err = 
devlink_health_buffer_verify_len(buffer, 0, - DEVLINK_HEALTH_BUFFER_NEST_SIZE); - if (err) - return err; - - if (attrtype != DEVLINK_ATTR_HEALTH_BUFFER_OBJECT && - attrtype != DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR && - attrtype != DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE && - attrtype != DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_ARRAY) - return -EINVAL; - - desc = devlink_health_buffer_get_desc_from_offset(buffer); - - desc->attrtype = attrtype; - buffer->bytes_left_metadata -= DEVLINK_HEALTH_BUFFER_NEST_SIZE; - devlink_health_buffer_offset_inc(buffer, sizeof(*desc)); - - buffer->curr_nest++; - buffer->max_nested_depth = max(buffer->max_nested_depth, - buffer->curr_nest); - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_health_buffer_nest_start); - -enum devlink_health_buffer_nest_end_cancel { - DEVLINK_HEALTH_BUFFER_NEST_END = 1, - DEVLINK_HEALTH_BUFFER_NEST_CANCEL, -}; - -static void -devlink_health_buffer_nest_end_cancel(struct devlink_health_buffer *buffer, - enum devlink_health_buffer_nest_end_cancel nest) -{ - struct devlink_health_buffer_desc *desc; - - WARN_ON(!buffer->curr_nest); - buffer->curr_nest--; - - desc = devlink_health_buffer_get_desc_from_offset(buffer); - desc->nest_end = nest; - devlink_health_buffer_offset_inc(buffer, sizeof(*desc)); -} - -void devlink_health_buffer_nest_end(struct devlink_health_buffer *buffer) -{ - devlink_health_buffer_nest_end_cancel(buffer, - DEVLINK_HEALTH_BUFFER_NEST_END); -} -EXPORT_SYMBOL_GPL(devlink_health_buffer_nest_end); - -void devlink_health_buffer_nest_cancel(struct devlink_health_buffer *buffer) -{ - devlink_health_buffer_nest_end_cancel(buffer, - DEVLINK_HEALTH_BUFFER_NEST_CANCEL); -} -EXPORT_SYMBOL_GPL(devlink_health_buffer_nest_cancel); - -int -devlink_health_buffer_put_object_name(struct devlink_health_buffer *buffer, - char *name) -{ - struct devlink_health_buffer_desc *desc; - int err; - - err = devlink_health_buffer_verify_len(buffer, strlen(name) + 1, - sizeof(*desc)); - if (err) - return err; - - desc = 
devlink_health_buffer_get_desc_from_offset(buffer); - desc->attrtype = DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_NAME; - desc->nla_type = NLA_NUL_STRING; - desc->len = strlen(name) + 1; - memcpy(&desc->value, name, desc->len); - devlink_health_buffer_offset_inc(buffer, sizeof(*desc) + desc->len); - - buffer->bytes_left_metadata -= sizeof(*desc); - buffer->bytes_left -= (strlen(name) + 1); - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_health_buffer_put_object_name); - -static int -devlink_health_buffer_put_value(struct devlink_health_buffer *buffer, - u8 nla_type, void *value, int len) -{ - struct devlink_health_buffer_desc *desc; - int err; - - err = devlink_health_buffer_verify_len(buffer, len, sizeof(*desc)); - if (err) - return err; - - desc = devlink_health_buffer_get_desc_from_offset(buffer); - desc->attrtype = DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA; - desc->nla_type = nla_type; - desc->len = len; - memcpy(&desc->value, value, len); - devlink_health_buffer_offset_inc(buffer, sizeof(*desc) + desc->len); - - buffer->bytes_left_metadata -= sizeof(*desc); - buffer->bytes_left -= len; - - return 0; -} - -int -devlink_health_buffer_put_value_u8(struct devlink_health_buffer *buffer, - u8 value) -{ - int err; - - err = devlink_health_buffer_put_value(buffer, NLA_U8, &value, - sizeof(value)); - if (err) - return err; - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_u8); - -int -devlink_health_buffer_put_value_u32(struct devlink_health_buffer *buffer, - u32 value) -{ - int err; - - err = devlink_health_buffer_put_value(buffer, NLA_U32, &value, - sizeof(value)); - if (err) - return err; - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_u32); - -int -devlink_health_buffer_put_value_u64(struct devlink_health_buffer *buffer, - u64 value) -{ - int err; - - err = devlink_health_buffer_put_value(buffer, NLA_U64, &value, - sizeof(value)); - if (err) - return err; - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_u64); - -int 
-devlink_health_buffer_put_value_string(struct devlink_health_buffer *buffer, - char *name) -{ - int err; - - if (strlen(name) + 1 > DEVLINK_HEALTH_BUFFER_MAX_CHUNK) - return -EINVAL; - - err = devlink_health_buffer_put_value(buffer, NLA_NUL_STRING, name, - strlen(name) + 1); - if (err) - return err; - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_string); - -int -devlink_health_buffer_put_value_data(struct devlink_health_buffer *buffer, - void *data, int len) -{ - int err; - - if (len > DEVLINK_HEALTH_BUFFER_MAX_CHUNK) - return -EINVAL; - - err = devlink_health_buffer_put_value(buffer, NLA_BINARY, data, len); - if (err) - return err; - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_data); - -static int -devlink_health_buffer_fill_data(struct sk_buff *skb, - struct devlink_health_buffer_desc *desc) -{ - int err = -EINVAL; - - switch (desc->nla_type) { - case NLA_U8: - err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA, - *(u8 *)desc->value); - break; - case NLA_U32: - err = nla_put_u32(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA, - *(u32 *)desc->value); - break; - case NLA_U64: - err = nla_put_u64_64bit(skb, - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA, - *(u64 *)desc->value, DEVLINK_ATTR_PAD); - break; - case NLA_NUL_STRING: - err = nla_put_string(skb, - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA, - (char *)&desc->value); - break; - case NLA_BINARY: - err = nla_put(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA, - desc->len, (void *)&desc->value); - break; - } - - return err; -} - -static int -devlink_health_buffer_fill_type(struct sk_buff *skb, - struct devlink_health_buffer_desc *desc) -{ - int err = -EINVAL; - - switch (desc->nla_type) { - case NLA_U8: - err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE, - NLA_U8); - break; - case NLA_U32: - err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE, - NLA_U32); - break; - case NLA_U64: - err = nla_put_u8(skb, 
DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE, - NLA_U64); - break; - case NLA_NUL_STRING: - err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE, - NLA_NUL_STRING); - break; - case NLA_BINARY: - err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE, - NLA_BINARY); - break; - } - - return err; -} - -static inline struct devlink_health_buffer_desc * -devlink_health_buffer_get_next_desc(struct devlink_health_buffer_desc *desc) -{ - return (void *)&desc->value + desc->len; -} - -static int -devlink_health_buffer_prepare_skb(struct sk_buff *skb, - struct devlink_health_buffer *buffer) -{ - struct devlink_health_buffer_desc *last_desc, *desc; - struct nlattr **buffer_nlattr; - int err; - int i = 0; - - buffer_nlattr = kcalloc(buffer->max_nested_depth, - sizeof(*buffer_nlattr), GFP_KERNEL); - if (!buffer_nlattr) - return -EINVAL; - - last_desc = devlink_health_buffer_get_desc_from_offset(buffer); - desc = buffer->data; - while (desc != last_desc) { - switch (desc->attrtype) { - case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT: - case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR: - case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE: - case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_ARRAY: - buffer_nlattr[i] = nla_nest_start(skb, desc->attrtype); - if (!buffer_nlattr[i]) - goto nla_put_failure; - i++; - break; - case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA: - err = devlink_health_buffer_fill_data(skb, desc); - if (err) - goto nla_put_failure; - err = devlink_health_buffer_fill_type(skb, desc); - if (err) - goto nla_put_failure; - break; - case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_NAME: - err = nla_put_string(skb, desc->attrtype, - (char *)&desc->value); - if (err) - goto nla_put_failure; - break; - default: - WARN_ON(!desc->nest_end); - WARN_ON(i <= 0); - if (desc->nest_end == DEVLINK_HEALTH_BUFFER_NEST_END) - nla_nest_end(skb, buffer_nlattr[--i]); - else - nla_nest_cancel(skb, buffer_nlattr[--i]); - break; - } - desc = devlink_health_buffer_get_next_desc(desc); - } - - 
return 0; - -nla_put_failure: - kfree(buffer_nlattr); - return err; -} - -static int -devlink_health_buffer_snd(struct genl_info *info, - enum devlink_command cmd, int flags, - struct devlink_health_buffer **buffers_array, - u64 num_of_buffers) -{ - struct sk_buff *skb; - struct nlmsghdr *nlh; - void *hdr; - int err; - u64 i; - - for (i = 0; i < num_of_buffers; i++) { - /* Skip buffer if driver did not fill it up with any data */ - if (!buffers_array[i]->offset) - continue; - - skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!skb) - return -ENOMEM; - - hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, - &devlink_nl_family, NLM_F_MULTI, cmd); - if (!hdr) - goto nla_put_failure; - - err = devlink_health_buffer_prepare_skb(skb, buffers_array[i]); - if (err) - goto nla_put_failure; - - genlmsg_end(skb, hdr); - err = genlmsg_reply(skb, info); - if (err) - return err; - } - - skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!skb) - return -ENOMEM; - nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, - NLMSG_DONE, 0, flags | NLM_F_MULTI); - if (!nlh) - goto nla_put_failure; - - err = genlmsg_reply(skb, info); - if (err) - return err; - - return 0; - -nla_put_failure: - err = -EIO; - nlmsg_free(skb); - return err; -} - -struct devlink_health_reporter { - struct list_head list; - struct devlink_health_buffer **dump_buffers_array; - struct mutex dump_lock; /* lock parallel read/write from dump buffers */ - struct devlink_health_buffer **diagnose_buffers_array; - struct mutex diagnose_lock; /* lock parallel read/write from diagnose buffers */ - void *priv; - const struct devlink_health_reporter_ops *ops; - struct devlink *devlink; - u64 graceful_period; - bool auto_recover; - u8 health_state; - u8 dump_avail; - u64 dump_ts; - u64 error_count; - u64 recovery_count; - u64 last_recovery_ts; -}; - -enum devlink_health_reporter_state { - DEVLINK_HEALTH_REPORTER_STATE_HEALTHY, - DEVLINK_HEALTH_REPORTER_STATE_ERROR, -}; - -void * 
-devlink_health_reporter_priv(struct devlink_health_reporter *reporter) -{ - return reporter->priv; -} -EXPORT_SYMBOL_GPL(devlink_health_reporter_priv); - -static struct devlink_health_reporter * -devlink_health_reporter_find_by_name(struct devlink *devlink, - const char *reporter_name) -{ - struct devlink_health_reporter *reporter; - - list_for_each_entry(reporter, &devlink->reporter_list, list) - if (!strcmp(reporter->ops->name, reporter_name)) - return reporter; - return NULL; -} - -/** - * devlink_health_reporter_create - create devlink health reporter - * - * @devlink: devlink - * @ops: ops - * @graceful_period: to avoid recovery loops, in msecs - * @auto_recover: auto recover when error occurs - * @priv: priv - */ -struct devlink_health_reporter * -devlink_health_reporter_create(struct devlink *devlink, - const struct devlink_health_reporter_ops *ops, - u64 graceful_period, bool auto_recover, - void *priv) -{ - struct devlink_health_reporter *reporter; - - mutex_lock(&devlink->lock); - if (devlink_health_reporter_find_by_name(devlink, ops->name)) { - reporter = ERR_PTR(-EEXIST); - goto unlock; - } - - if (WARN_ON(ops->dump && !ops->dump_size) || - WARN_ON(ops->diagnose && !ops->diagnose_size) || - WARN_ON(auto_recover && !ops->recover) || - WARN_ON(graceful_period && !ops->recover)) { - reporter = ERR_PTR(-EINVAL); - goto unlock; - } - - reporter = kzalloc(sizeof(*reporter), GFP_KERNEL); - if (!reporter) { - reporter = ERR_PTR(-ENOMEM); - goto unlock; - } - - if (ops->dump) { - reporter->dump_buffers_array = - devlink_health_buffers_create(ops->dump_size); - if (!reporter->dump_buffers_array) { - kfree(reporter); - reporter = ERR_PTR(-ENOMEM); - goto unlock; - } - } - - if (ops->diagnose) { - reporter->diagnose_buffers_array = - devlink_health_buffers_create(ops->diagnose_size); - if (!reporter->diagnose_buffers_array) { - devlink_health_buffers_destroy(reporter->dump_buffers_array, - DEVLINK_HEALTH_SIZE_TO_BUFFERS(ops->dump_size)); - kfree(reporter); - 
reporter = ERR_PTR(-ENOMEM); - goto unlock; - } - } - - list_add_tail(&reporter->list, &devlink->reporter_list); - mutex_init(&reporter->dump_lock); - mutex_init(&reporter->diagnose_lock); - - reporter->priv = priv; - reporter->ops = ops; - reporter->devlink = devlink; - reporter->graceful_period = graceful_period; - reporter->auto_recover = auto_recover; -unlock: - mutex_unlock(&devlink->lock); - return reporter; -} -EXPORT_SYMBOL_GPL(devlink_health_reporter_create); - -/** - * devlink_health_reporter_destroy - destroy devlink health reporter - * - * @reporter: devlink health reporter to destroy - */ -void -devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) -{ - mutex_lock(&reporter->devlink->lock); - list_del(&reporter->list); - devlink_health_buffers_destroy(reporter->dump_buffers_array, - DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->dump_size)); - devlink_health_buffers_destroy(reporter->diagnose_buffers_array, - DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->diagnose_size)); - kfree(reporter); - mutex_unlock(&reporter->devlink->lock); -} -EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy); - -static int -devlink_health_reporter_recover(struct devlink_health_reporter *reporter, - void *priv_ctx) -{ - int err; - - if (!reporter->ops->recover) - return -EOPNOTSUPP; - - err = reporter->ops->recover(reporter, priv_ctx); - if (err) - return err; - - reporter->recovery_count++; - reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY; - reporter->last_recovery_ts = jiffies; - - return 0; -} - -static int devlink_health_do_dump(struct devlink_health_reporter *reporter, - void *priv_ctx) -{ - int err; - - if (!reporter->ops->dump) - return 0; - - if (reporter->dump_avail) - return 0; - - devlink_health_buffers_reset(reporter->dump_buffers_array, - DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->dump_size)); - err = reporter->ops->dump(reporter, reporter->dump_buffers_array, - DEVLINK_HEALTH_BUFFER_SIZE, - 
DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->dump_size), - priv_ctx); - if (!err) { - reporter->dump_avail = true; - reporter->dump_ts = jiffies; - } - - return err; -} - -int devlink_health_report(struct devlink_health_reporter *reporter, - const char *msg, void *priv_ctx) -{ - struct devlink *devlink = reporter->devlink; - int err = 0; - - /* write a log message of the current error */ - WARN_ON(!msg); - trace_devlink_health_report(devlink, reporter->ops->name, msg); - reporter->error_count++; - - /* abort if the previous error wasn't recovered */ - if (reporter->auto_recover && - (reporter->health_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY || - jiffies - reporter->last_recovery_ts < - msecs_to_jiffies(reporter->graceful_period))) { - trace_devlink_health_recover_aborted(devlink, - reporter->ops->name, - reporter->health_state, - jiffies - - reporter->last_recovery_ts); - return -ECANCELED; - } - - reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR; - - mutex_lock(&reporter->dump_lock); - /* store current dump of current error, for later analysis */ - devlink_health_do_dump(reporter, priv_ctx); - mutex_unlock(&reporter->dump_lock); - - if (reporter->auto_recover) - err = devlink_health_reporter_recover(reporter, priv_ctx); - - return err; -} -EXPORT_SYMBOL_GPL(devlink_health_report); - -static struct devlink_health_reporter * -devlink_health_reporter_get_from_info(struct devlink *devlink, - struct genl_info *info) -{ - char *reporter_name; - - if (!info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]) - return NULL; - - reporter_name = - nla_data(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]); - return devlink_health_reporter_find_by_name(devlink, reporter_name); -} - -static int -devlink_nl_health_reporter_fill(struct sk_buff *msg, - struct devlink *devlink, - struct devlink_health_reporter *reporter, - enum devlink_command cmd, u32 portid, - u32 seq, int flags) -{ - struct nlattr *reporter_attr; - void *hdr; - - hdr = genlmsg_put(msg, portid, seq, 
&devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto genlmsg_cancel; - - reporter_attr = nla_nest_start(msg, DEVLINK_ATTR_HEALTH_REPORTER); - if (!reporter_attr) - goto genlmsg_cancel; - if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME, - reporter->ops->name)) - goto reporter_nest_cancel; - if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE, - reporter->health_state)) - goto reporter_nest_cancel; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR, - reporter->error_count, DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER, - reporter->recovery_count, DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, - reporter->graceful_period, - DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, - reporter->auto_recover)) - goto reporter_nest_cancel; - if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_AVAIL, - reporter->dump_avail)) - goto reporter_nest_cancel; - if (reporter->dump_avail && - nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS, - jiffies_to_msecs(reporter->dump_ts), - DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - - nla_nest_end(msg, reporter_attr); - genlmsg_end(msg, hdr); - return 0; - -reporter_nest_cancel: - nla_nest_end(msg, reporter_attr); -genlmsg_cancel: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - -static int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_health_reporter *reporter; - struct sk_buff *msg; - int err; - - reporter = devlink_health_reporter_get_from_info(devlink, info); - if (!reporter) - return -EINVAL; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_health_reporter_fill(msg, 
devlink, reporter, - DEVLINK_CMD_HEALTH_REPORTER_GET, - info->snd_portid, info->snd_seq, - 0); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int -devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct devlink_health_reporter *reporter; - struct devlink *devlink; - int start = cb->args[0]; - int idx = 0; - int err; - - mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) - continue; - mutex_lock(&devlink->lock); - list_for_each_entry(reporter, &devlink->reporter_list, - list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_health_reporter_fill(msg, devlink, - reporter, - DEVLINK_CMD_HEALTH_REPORTER_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); - if (err) { - mutex_unlock(&devlink->lock); - goto out; - } - idx++; - } - mutex_unlock(&devlink->lock); - } -out: - mutex_unlock(&devlink_mutex); - - cb->args[0] = idx; - return msg->len; -} - -static int -devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_health_reporter *reporter; - - reporter = devlink_health_reporter_get_from_info(devlink, info); - if (!reporter) - return -EINVAL; - - if (!reporter->ops->recover && - (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] || - info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])) - return -EINVAL; - - if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) - reporter->graceful_period = - nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]); - - if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]) - reporter->auto_recover = - nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]); - - return 0; -} - -static int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, - struct 
genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_health_reporter *reporter; - - reporter = devlink_health_reporter_get_from_info(devlink, info); - if (!reporter) - return -EINVAL; - - return devlink_health_reporter_recover(reporter, NULL); -} - -static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_health_reporter *reporter; - u64 num_of_buffers; - int err; - - reporter = devlink_health_reporter_get_from_info(devlink, info); - if (!reporter) - return -EINVAL; - - if (!reporter->ops->diagnose) - return -EOPNOTSUPP; - - num_of_buffers = - DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->diagnose_size); - - mutex_lock(&reporter->diagnose_lock); - devlink_health_buffers_reset(reporter->diagnose_buffers_array, - num_of_buffers); - - err = reporter->ops->diagnose(reporter, - reporter->diagnose_buffers_array, - DEVLINK_HEALTH_BUFFER_SIZE, - num_of_buffers); - if (err) - goto out; - - err = devlink_health_buffer_snd(info, - DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, - 0, reporter->diagnose_buffers_array, - num_of_buffers); - if (err) - goto out; - - mutex_unlock(&reporter->diagnose_lock); - return 0; - -out: - mutex_unlock(&reporter->diagnose_lock); - return err; -} - -static void -devlink_health_dump_clear(struct devlink_health_reporter *reporter) -{ - reporter->dump_avail = false; - reporter->dump_ts = 0; - devlink_health_buffers_reset(reporter->dump_buffers_array, - DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->dump_size)); -} - -static int devlink_nl_cmd_health_reporter_dump_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_health_reporter *reporter; - u64 num_of_buffers; - int err; - - reporter = devlink_health_reporter_get_from_info(devlink, info); - if (!reporter) - return -EINVAL; - - if (!reporter->ops->dump) - return -EOPNOTSUPP; - - num_of_buffers = 
- DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->dump_size); - - mutex_lock(&reporter->dump_lock); - err = devlink_health_do_dump(reporter, NULL); - if (err) - goto out; - - err = devlink_health_buffer_snd(info, - DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, - 0, reporter->dump_buffers_array, - num_of_buffers); - -out: - mutex_unlock(&reporter->dump_lock); - return err; -} - -static int -devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_health_reporter *reporter; - - reporter = devlink_health_reporter_get_from_info(devlink, info); - if (!reporter) - return -EINVAL; - - mutex_lock(&reporter->dump_lock); - devlink_health_dump_clear(reporter); - mutex_unlock(&reporter->dump_lock); - return 0; -} - static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -4631,9 +3622,6 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 }, [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 }, - [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64 }, - [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -4854,51 +3842,6 @@ static const struct genl_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_GET, - .doit = devlink_nl_cmd_health_reporter_get_doit, - .dumpit = devlink_nl_cmd_health_reporter_get_dumpit, - .policy = devlink_nl_policy, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_SET, - 
.doit = devlink_nl_cmd_health_reporter_set_doit, - .policy = devlink_nl_policy, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_RECOVER, - .doit = devlink_nl_cmd_health_reporter_recover_doit, - .policy = devlink_nl_policy, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, - .doit = devlink_nl_cmd_health_reporter_diagnose_doit, - .policy = devlink_nl_policy, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, - .doit = devlink_nl_cmd_health_reporter_dump_get_doit, - .policy = devlink_nl_policy, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NO_LOCK, - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR, - .doit = devlink_nl_cmd_health_reporter_dump_clear_doit, - .policy = devlink_nl_policy, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NO_LOCK, - }, }; static struct genl_family devlink_nl_family __ro_after_init = { @@ -4939,7 +3882,6 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) INIT_LIST_HEAD(&devlink->resource_list); INIT_LIST_HEAD(&devlink->param_list); INIT_LIST_HEAD(&devlink->region_list); - INIT_LIST_HEAD(&devlink->reporter_list); mutex_init(&devlink->lock); return devlink; } -- cgit From a118b19d9145be15ce7bcc577bca343a246f8e6c Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 25 Jan 2019 21:08:24 +0100 Subject: Documentation: net: phy: reflect latest changes to phylib API Recent changes to the phylib API - removed phy_stop_interrupts - replaced phy_start_interrupts with phy_request_interrupt - moved some functionality from phy_connect() and phy_disconnect() to phy_start() and phy_stop() respectively. Reflect these changes in the documentation. 
Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- Documentation/networking/phy.txt | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/phy.txt b/Documentation/networking/phy.txt index bdec0f700bc1..7ecba4fd6cca 100644 --- a/Documentation/networking/phy.txt +++ b/Documentation/networking/phy.txt @@ -210,12 +210,16 @@ Letting the PHY Abstraction Layer do Everything Lastly, once the controller is ready to handle network traffic, you call phy_start(phydev). This tells the PAL that you are ready, and configures the - PHY to connect to the network. If you want to handle your own interrupts, - just set phydev->irq to PHY_IGNORE_INTERRUPT before you call phy_start. - Similarly, if you don't want to use interrupts, set phydev->irq to PHY_POLL. + PHY to connect to the network. If the MAC interrupt of your network driver + also handles PHY status changes, just set phydev->irq to PHY_IGNORE_INTERRUPT + before you call phy_start and use phy_mac_interrupt() from the network + driver. If you don't want to use interrupts, set phydev->irq to PHY_POLL. + phy_start() enables the PHY interrupts (if applicable) and starts the + phylib state machine. When you want to disconnect from the network (even if just briefly), you call - phy_stop(phydev). + phy_stop(phydev). This function also stops the phylib state machine and + disables PHY interrupts. Pause frames / flow control @@ -271,11 +275,9 @@ Doing it all yourself A convenience function to print out the PHY status neatly. - int phy_start_interrupts(struct phy_device *phydev); - int phy_stop_interrupts(struct phy_device *phydev); + void phy_request_interrupt(struct phy_device *phydev); - Requests the IRQ for the PHY interrupts, then enables them for - start, or disables then frees them for stop. + Requests the IRQ for the PHY interrupts. 
struct phy_device * phy_attach(struct net_device *dev, const char *phy_id, phy_interface_t interface); -- cgit From 25fe02d00a1e9468e0ae995beedb99867ec6701b Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 26 Jan 2019 11:25:37 +0100 Subject: Documentation: net: phy: switch documentation to rst format Switch phylib documentation to rst format. Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/phy.rst | 447 +++++++++++++++++++++++++++++++++++++ Documentation/networking/phy.txt | 429 ----------------------------------- 3 files changed, 448 insertions(+), 429 deletions(-) create mode 100644 Documentation/networking/phy.rst delete mode 100644 Documentation/networking/phy.txt (limited to 'Documentation') diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 59e86de662cd..f1627ca2a0ea 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -29,6 +29,7 @@ Contents: msg_zerocopy failover net_failover + phy alias bridge snmp_counter diff --git a/Documentation/networking/phy.rst b/Documentation/networking/phy.rst new file mode 100644 index 000000000000..0dd90d7df5ec --- /dev/null +++ b/Documentation/networking/phy.rst @@ -0,0 +1,447 @@ +===================== +PHY Abstraction Layer +===================== + +Purpose +======= + +Most network devices consist of set of registers which provide an interface +to a MAC layer, which communicates with the physical connection through a +PHY. The PHY concerns itself with negotiating link parameters with the link +partner on the other side of the network connection (typically, an ethernet +cable), and provides a register interface to allow drivers to determine what +settings were chosen, and to configure what settings are allowed. 
+ +While these devices are distinct from the network devices, and conform to a +standard layout for the registers, it has been common practice to integrate +the PHY management code with the network driver. This has resulted in large +amounts of redundant code. Also, on embedded systems with multiple (and +sometimes quite different) ethernet controllers connected to the same +management bus, it is difficult to ensure safe use of the bus. + +Since the PHYs are devices, and the management busses through which they are +accessed are, in fact, busses, the PHY Abstraction Layer treats them as such. +In doing so, it has these goals: + +#. Increase code-reuse +#. Increase overall code-maintainability +#. Speed development time for new network drivers, and for new systems + +Basically, this layer is meant to provide an interface to PHY devices which +allows network driver writers to write as little code as possible, while +still providing a full feature set. + +The MDIO bus +============ + +Most network devices are connected to a PHY by means of a management bus. +Different devices use different busses (though some share common interfaces). +In order to take advantage of the PAL, each bus interface needs to be +registered as a distinct device. + +#. read and write functions must be implemented. Their prototypes are:: + + int write(struct mii_bus *bus, int mii_id, int regnum, u16 value); + int read(struct mii_bus *bus, int mii_id, int regnum); + + mii_id is the address on the bus for the PHY, and regnum is the register + number. These functions are guaranteed not to be called from interrupt + time, so it is safe for them to block, waiting for an interrupt to signal + the operation is complete + +#. A reset function is optional. This is used to return the bus to an + initialized state. + +#. A probe function is needed. This function should set up anything the bus + driver needs, setup the mii_bus structure, and register with the PAL using + mdiobus_register. 
Similarly, there's a remove function to undo all of + that (use mdiobus_unregister). + +#. Like any driver, the device_driver structure must be configured, and init + exit functions are used to register the driver. + +#. The bus must also be declared somewhere as a device, and registered. + +As an example for how one driver implemented an mdio bus driver, see +drivers/net/ethernet/freescale/fsl_pq_mdio.c and an associated DTS file +for one of the users. (e.g. "git grep fsl,.*-mdio arch/powerpc/boot/dts/") + +(RG)MII/electrical interface considerations +=========================================== + +The Reduced Gigabit Medium Independent Interface (RGMII) is a 12-pin +electrical signal interface using a synchronous 125Mhz clock signal and several +data lines. Due to this design decision, a 1.5ns to 2ns delay must be added +between the clock line (RXC or TXC) and the data lines to let the PHY (clock +sink) have enough setup and hold times to sample the data lines correctly. The +PHY library offers different types of PHY_INTERFACE_MODE_RGMII* values to let +the PHY driver and optionally the MAC driver, implement the required delay. 
The +values of phy_interface_t must be understood from the perspective of the PHY +device itself, leading to the following: + +* PHY_INTERFACE_MODE_RGMII: the PHY is not responsible for inserting any + internal delay by itself, it assumes that either the Ethernet MAC (if capable) + or the PCB traces insert the correct 1.5-2ns delay + +* PHY_INTERFACE_MODE_RGMII_TXID: the PHY should insert an internal delay + for the transmit data lines (TXD[3:0]) processed by the PHY device + +* PHY_INTERFACE_MODE_RGMII_RXID: the PHY should insert an internal delay + for the receive data lines (RXD[3:0]) processed by the PHY device + +* PHY_INTERFACE_MODE_RGMII_ID: the PHY should insert internal delays for + both transmit AND receive data lines from/to the PHY device + +Whenever possible, use the PHY side RGMII delay for these reasons: + +* PHY devices may offer sub-nanosecond granularity in how they allow a + receiver/transmitter side delay (e.g: 0.5, 1.0, 1.5ns) to be specified. Such + precision may be required to account for differences in PCB trace lengths + +* PHY devices are typically qualified for a large range of applications + (industrial, medical, automotive...), and they provide a constant and + reliable delay across temperature/pressure/voltage ranges + +* PHY device drivers in PHYLIB being reusable by nature, being able to + configure correctly a specified delay enables more designs with similar delay + requirements to operate correctly + +For cases where the PHY is not capable of providing this delay, but the +Ethernet MAC driver is capable of doing so, the correct phy_interface_t value +should be PHY_INTERFACE_MODE_RGMII, and the Ethernet MAC driver should be +configured correctly in order to provide the required transmit and/or receive +side delay from the perspective of the PHY device. 
Conversely, if the Ethernet +MAC driver looks at the phy_interface_t value, for any other mode but +PHY_INTERFACE_MODE_RGMII, it should make sure that the MAC-level delays are +disabled. + +In case neither the Ethernet MAC, nor the PHY are capable of providing the +required delays, as defined per the RGMII standard, several options may be +available: + +* Some SoCs may offer a pin pad/mux/controller capable of configuring a given + set of pins' strength, delays, and voltage; and it may be a suitable + option to insert the expected 2ns RGMII delay. + +* Modifying the PCB design to include a fixed delay (e.g: using a specifically + designed serpentine), which may not require software configuration at all. + +Common problems with RGMII delay mismatch +----------------------------------------- + +When there is a RGMII delay mismatch between the Ethernet MAC and the PHY, this +will most likely result in the clock and data line signals being unstable when +the PHY or MAC take a snapshot of these signals to translate them into logical +1 or 0 states and reconstruct the data being transmitted/received. Typical +symptoms include: + +* Transmission/reception partially works, and there is frequent or occasional + packet loss observed + +* Ethernet MAC may report some or all packets ingressing with a FCS/CRC error, + or just discard them all + +* Switching to lower speeds such as 10/100Mbits/sec makes the problem go away + (since there is enough setup/hold time in that case) + +Connecting to a PHY +=================== + +Sometime during startup, the network driver needs to establish a connection +between the PHY device, and the network device. At this time, the PHY's bus +and drivers need to all have been loaded, so it is ready for the connection. +At this point, there are several ways to connect to the PHY: + +#. The PAL handles everything, and only calls the network driver when + the link state changes, so it can react. + +#. 
The PAL handles everything except interrupts (usually because the + controller has the interrupt registers). + +#. The PAL handles everything, but checks in with the driver every second, + allowing the network driver to react first to any changes before the PAL + does. + +#. The PAL serves only as a library of functions, with the network device + manually calling functions to update status, and configure the PHY + + +Letting the PHY Abstraction Layer do Everything +=============================================== + +If you choose option 1 (The hope is that every driver can, but to still be +useful to drivers that can't), connecting to the PHY is simple: + +First, you need a function to react to changes in the link state. This +function follows this protocol:: + + static void adjust_link(struct net_device *dev); + +Next, you need to know the device name of the PHY connected to this device. +The name will look something like, "0:00", where the first number is the +bus id, and the second is the PHY's address on that bus. Typically, +the bus is responsible for making its ID unique. + +Now, to connect, just call this function:: + + phydev = phy_connect(dev, phy_name, &adjust_link, interface); + +*phydev* is a pointer to the phy_device structure which represents the PHY. +If phy_connect is successful, it will return the pointer. dev, here, is the +pointer to your net_device. Once done, this function will have started the +PHY's software state machine, and registered for the PHY's interrupt, if it +has one. The phydev structure will be populated with information about the +current state, though the PHY will not yet be truly operational at this +point. + +PHY-specific flags should be set in phydev->dev_flags prior to the call +to phy_connect() such that the underlying PHY driver can check for flags +and perform specific operations based on them. +This is useful if the system has put hardware restrictions on +the PHY/controller, of which the PHY needs to be aware. 
+ +*interface* is a u32 which specifies the connection type used +between the controller and the PHY. Examples are GMII, MII, +RGMII, and SGMII. For a full list, see include/linux/phy.h + +Now just make sure that phydev->supported and phydev->advertising have any +values pruned from them which don't make sense for your controller (a 10/100 +controller may be connected to a gigabit capable PHY, so you would need to +mask off SUPPORTED_1000baseT*). See include/linux/ethtool.h for definitions +for these bitfields. Note that you should not SET any bits, except the +SUPPORTED_Pause and SUPPORTED_AsymPause bits (see below), or the PHY may get +put into an unsupported state. + +Lastly, once the controller is ready to handle network traffic, you call +phy_start(phydev). This tells the PAL that you are ready, and configures the +PHY to connect to the network. If the MAC interrupt of your network driver +also handles PHY status changes, just set phydev->irq to PHY_IGNORE_INTERRUPT +before you call phy_start and use phy_mac_interrupt() from the network +driver. If you don't want to use interrupts, set phydev->irq to PHY_POLL. +phy_start() enables the PHY interrupts (if applicable) and starts the +phylib state machine. + +When you want to disconnect from the network (even if just briefly), you call +phy_stop(phydev). This function also stops the phylib state machine and +disables PHY interrupts. + +Pause frames / flow control +=========================== + +The PHY does not participate directly in flow control/pause frames except by +making sure that the SUPPORTED_Pause and SUPPORTED_AsymPause bits are set in +MII_ADVERTISE to indicate towards the link partner that the Ethernet MAC +controller supports such a thing. Since flow control/pause frames generation +involves the Ethernet MAC driver, it is recommended that this driver takes care +of properly indicating advertisement and support for such features by setting +the SUPPORTED_Pause and SUPPORTED_AsymPause bits accordingly. 
This can be done +either before or after phy_connect() and/or as a result of implementing the +ethtool::set_pauseparam feature. + + +Keeping Close Tabs on the PAL +============================= + +It is possible that the PAL's built-in state machine needs a little help to +keep your network device and the PHY properly in sync. If so, you can +register a helper function when connecting to the PHY, which will be called +every second before the state machine reacts to any changes. To do this, you +need to manually call phy_attach() and phy_prepare_link(), and then call +phy_start_machine() with the second argument set to point to your special +handler. + +Currently there are no examples of how to use this functionality, and testing +on it has been limited because the author does not have any drivers which use +it (they all use option 1). So Caveat Emptor. + +Doing it all yourself +===================== + +There's a remote chance that the PAL's built-in state machine cannot track +the complex interactions between the PHY and your network device. If this is +so, you can simply call phy_attach(), and not call phy_start_machine or +phy_prepare_link(). This will mean that phydev->state is entirely yours to +handle (phy_start and phy_stop toggle between some of the states, so you +might need to avoid them). + +An effort has been made to make sure that useful functionality can be +accessed without the state-machine running, and most of these functions are +descended from functions which did not interact with a complex state-machine. +However, again, no effort has been made so far to test running without the +state machine, so tryer beware. + +Here is a brief rundown of the functions:: + + int phy_read(struct phy_device *phydev, u16 regnum); + int phy_write(struct phy_device *phydev, u16 regnum, u16 val); + +Simple read/write primitives. They invoke the bus's read/write function +pointers. 
+:: + + void phy_print_status(struct phy_device *phydev); + +A convenience function to print out the PHY status neatly. +:: + + void phy_request_interrupt(struct phy_device *phydev); + +Requests the IRQ for the PHY interrupts. +:: + + struct phy_device * phy_attach(struct net_device *dev, const char *phy_id, + phy_interface_t interface); + +Attaches a network device to a particular PHY, binding the PHY to a generic +driver if none was found during bus initialization. +:: + + int phy_start_aneg(struct phy_device *phydev); + +Using variables inside the phydev structure, either configures advertising +and resets autonegotiation, or disables autonegotiation, and configures +forced settings. +:: + + static inline int phy_read_status(struct phy_device *phydev); + +Fills the phydev structure with up-to-date information about the current +settings in the PHY. +:: + + int phy_ethtool_sset(struct phy_device *phydev, struct ethtool_cmd *cmd); + +Ethtool convenience functions. +:: + + int phy_mii_ioctl(struct phy_device *phydev, + struct mii_ioctl_data *mii_data, int cmd); + +The MII ioctl. Note that this function will completely screw up the state +machine if you write registers like BMCR, BMSR, ADVERTISE, etc. Best to +use this only to write registers which are not standard, and don't set off +a renegotiation. + +PHY Device Drivers +================== + +With the PHY Abstraction Layer, adding support for new PHYs is +quite easy. In some cases, no work is required at all! However, +many PHYs require a little hand-holding to get up-and-running. + +Generic PHY driver +------------------ + +If the desired PHY doesn't have any errata, quirks, or special +features you want to support, then it may be best to not add +support, and let the PHY Abstraction Layer's Generic PHY Driver +do all of the work. + +Writing a PHY driver +-------------------- + +If you do need to write a PHY driver, the first thing to do is +make sure it can be matched with an appropriate PHY device. 
+This is done during bus initialization by reading the device's +UID (stored in registers 2 and 3), then comparing it to each +driver's phy_id field by ANDing it with each driver's +phy_id_mask field. Also, it needs a name. Here's an example:: + + static struct phy_driver dm9161_driver = { + .phy_id = 0x0181b880, + .name = "Davicom DM9161E", + .phy_id_mask = 0x0ffffff0, + ... + } + +Next, you need to specify what features (speed, duplex, autoneg, +etc) your PHY device and driver support. Most PHYs support +PHY_BASIC_FEATURES, but you can look in include/mii.h for other +features. + +Each driver consists of a number of function pointers, documented +in include/linux/phy.h under the phy_driver structure. + +Of these, only config_aneg and read_status are required to be +assigned by the driver code. The rest are optional. Also, it is +preferred to use the generic phy driver's versions of these two +functions if at all possible: genphy_read_status and +genphy_config_aneg. If this is not possible, it is likely that +you only need to perform some actions before and after invoking +these functions, and so your functions will wrap the generic +ones. + +Feel free to look at the Marvell, Cicada, and Davicom drivers in +drivers/net/phy/ for examples (the lxt and qsemi drivers have +not been tested as of this writing). + +The PHY's MMD register accesses are handled by the PAL framework +by default, but can be overridden by a specific PHY driver if +required. This could be the case if a PHY was released for +manufacturing before the MMD PHY register definitions were +standardized by the IEEE. Most modern PHYs will be able to use +the generic PAL framework for accessing the PHY's MMD registers. +An example of such usage is for Energy Efficient Ethernet support, +implemented in the PAL. 
This support uses the PAL to access MMD +registers for EEE query and configuration if the PHY supports +the IEEE standard access mechanisms, or can use the PHY's specific +access interfaces if overridden by the specific PHY driver. See +the Micrel driver in drivers/net/phy/ for an example of how this +can be implemented. + +Board Fixups +============ + +Sometimes the specific interaction between the platform and the PHY requires +special handling. For instance, to change where the PHY's clock input is, +or to add a delay to account for latency issues in the data path. In order +to support such contingencies, the PHY Layer allows platform code to register +fixups to be run when the PHY is brought up (or subsequently reset). + +When the PHY Layer brings up a PHY it checks to see if there are any fixups +registered for it, matching based on UID (contained in the PHY device's phy_id +field) and the bus identifier (contained in phydev->dev.bus_id). Both must +match, however two constants, PHY_ANY_ID and PHY_ANY_UID, are provided as +wildcards for the bus ID and UID, respectively. + +When a match is found, the PHY layer will invoke the run function associated +with the fixup. This function is passed a pointer to the phy_device of +interest. It should therefore only operate on that PHY. + +The platform code can either register the fixup using phy_register_fixup():: + + int phy_register_fixup(const char *phy_id, + u32 phy_uid, u32 phy_uid_mask, + int (*run)(struct phy_device *)); + +Or using one of the two stubs, phy_register_fixup_for_uid() and +phy_register_fixup_for_id():: + + int phy_register_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask, + int (*run)(struct phy_device *)); + int phy_register_fixup_for_id(const char *phy_id, + int (*run)(struct phy_device *)); + +The stubs set one of the two matching criteria, and set the other one to +match anything. 
+ +When phy_register_fixup() or \*_for_uid()/\*_for_id() is called from a module, +the fixup must be unregistered and its allocated memory freed. + +Call one of the following functions before unloading the module:: + + int phy_unregister_fixup(const char *phy_id, u32 phy_uid, u32 phy_uid_mask); + int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask); + int phy_unregister_fixup_for_id(const char *phy_id); + +Standards +========= + +IEEE Standard 802.3: CSMA/CD Access Method and Physical Layer Specifications, Section Two: +http://standards.ieee.org/getieee802/download/802.3-2008_section2.pdf + +RGMII v1.3: +http://web.archive.org/web/20160303212629/http://www.hp.com/rnd/pdfs/RGMIIv1_3.pdf + +RGMII v2.0: +http://web.archive.org/web/20160303171328/http://www.hp.com/rnd/pdfs/RGMIIv2_0_final_hp.pdf diff --git a/Documentation/networking/phy.txt b/Documentation/networking/phy.txt deleted file mode 100644 index 7ecba4fd6cca..000000000000 --- a/Documentation/networking/phy.txt +++ /dev/null @@ -1,429 +0,0 @@ - -------- -PHY Abstraction Layer -(Updated 2008-04-08) - -Purpose - - Most network devices consist of set of registers which provide an interface - to a MAC layer, which communicates with the physical connection through a - PHY. The PHY concerns itself with negotiating link parameters with the link - partner on the other side of the network connection (typically, an ethernet - cable), and provides a register interface to allow drivers to determine what - settings were chosen, and to configure what settings are allowed. - - While these devices are distinct from the network devices, and conform to a - standard layout for the registers, it has been common practice to integrate - the PHY management code with the network driver. This has resulted in large - amounts of redundant code. Also, on embedded systems with multiple (and - sometimes quite different) ethernet controllers connected to the same - management bus, it is difficult to ensure safe use of the bus.
- - Since the PHYs are devices, and the management busses through which they are - accessed are, in fact, busses, the PHY Abstraction Layer treats them as such. - In doing so, it has these goals: - - 1) Increase code-reuse - 2) Increase overall code-maintainability - 3) Speed development time for new network drivers, and for new systems - - Basically, this layer is meant to provide an interface to PHY devices which - allows network driver writers to write as little code as possible, while - still providing a full feature set. - -The MDIO bus - - Most network devices are connected to a PHY by means of a management bus. - Different devices use different busses (though some share common interfaces). - In order to take advantage of the PAL, each bus interface needs to be - registered as a distinct device. - - 1) read and write functions must be implemented. Their prototypes are: - - int write(struct mii_bus *bus, int mii_id, int regnum, u16 value); - int read(struct mii_bus *bus, int mii_id, int regnum); - - mii_id is the address on the bus for the PHY, and regnum is the register - number. These functions are guaranteed not to be called from interrupt - time, so it is safe for them to block, waiting for an interrupt to signal - the operation is complete - - 2) A reset function is optional. This is used to return the bus to an - initialized state. - - 3) A probe function is needed. This function should set up anything the bus - driver needs, setup the mii_bus structure, and register with the PAL using - mdiobus_register. Similarly, there's a remove function to undo all of - that (use mdiobus_unregister). - - 4) Like any driver, the device_driver structure must be configured, and init - exit functions are used to register the driver. - - 5) The bus must also be declared somewhere as a device, and registered. 
- - As an example for how one driver implemented an mdio bus driver, see - drivers/net/ethernet/freescale/fsl_pq_mdio.c and an associated DTS file - for one of the users. (e.g. "git grep fsl,.*-mdio arch/powerpc/boot/dts/") - -(RG)MII/electrical interface considerations - - The Reduced Gigabit Medium Independent Interface (RGMII) is a 12-pin - electrical signal interface using a synchronous 125Mhz clock signal and several - data lines. Due to this design decision, a 1.5ns to 2ns delay must be added - between the clock line (RXC or TXC) and the data lines to let the PHY (clock - sink) have enough setup and hold times to sample the data lines correctly. The - PHY library offers different types of PHY_INTERFACE_MODE_RGMII* values to let - the PHY driver and optionally the MAC driver, implement the required delay. The - values of phy_interface_t must be understood from the perspective of the PHY - device itself, leading to the following: - - * PHY_INTERFACE_MODE_RGMII: the PHY is not responsible for inserting any - internal delay by itself, it assumes that either the Ethernet MAC (if capable - or the PCB traces) insert the correct 1.5-2ns delay - - * PHY_INTERFACE_MODE_RGMII_TXID: the PHY should insert an internal delay - for the transmit data lines (TXD[3:0]) processed by the PHY device - - * PHY_INTERFACE_MODE_RGMII_RXID: the PHY should insert an internal delay - for the receive data lines (RXD[3:0]) processed by the PHY device - - * PHY_INTERFACE_MODE_RGMII_ID: the PHY should insert internal delays for - both transmit AND receive data lines from/to the PHY device - - Whenever possible, use the PHY side RGMII delay for these reasons: - - * PHY devices may offer sub-nanosecond granularity in how they allow a - receiver/transmitter side delay (e.g: 0.5, 1.0, 1.5ns) to be specified. 
Such - precision may be required to account for differences in PCB trace lengths - - * PHY devices are typically qualified for a large range of applications - (industrial, medical, automotive...), and they provide a constant and - reliable delay across temperature/pressure/voltage ranges - - * PHY device drivers in PHYLIB being reusable by nature, being able to - configure correctly a specified delay enables more designs with similar delay - requirements to be operate correctly - - For cases where the PHY is not capable of providing this delay, but the - Ethernet MAC driver is capable of doing so, the correct phy_interface_t value - should be PHY_INTERFACE_MODE_RGMII, and the Ethernet MAC driver should be - configured correctly in order to provide the required transmit and/or receive - side delay from the perspective of the PHY device. Conversely, if the Ethernet - MAC driver looks at the phy_interface_t value, for any other mode but - PHY_INTERFACE_MODE_RGMII, it should make sure that the MAC-level delays are - disabled. - - In case neither the Ethernet MAC, nor the PHY are capable of providing the - required delays, as defined per the RGMII standard, several options may be - available: - - * Some SoCs may offer a pin pad/mux/controller capable of configuring a given - set of pins'strength, delays, and voltage; and it may be a suitable - option to insert the expected 2ns RGMII delay. - - * Modifying the PCB design to include a fixed delay (e.g: using a specifically - designed serpentine), which may not require software configuration at all. - -Common problems with RGMII delay mismatch - - When there is a RGMII delay mismatch between the Ethernet MAC and the PHY, this - will most likely result in the clock and data line signals to be unstable when - the PHY or MAC take a snapshot of these signals to translate them into logical - 1 or 0 states and reconstruct the data being transmitted/received. 
Typical - symptoms include: - - * Transmission/reception partially works, and there is frequent or occasional - packet loss observed - - * Ethernet MAC may report some or all packets ingressing with a FCS/CRC error, - or just discard them all - - * Switching to lower speeds such as 10/100Mbits/sec makes the problem go away - (since there is enough setup/hold time in that case) - - -Connecting to a PHY - - Sometime during startup, the network driver needs to establish a connection - between the PHY device, and the network device. At this time, the PHY's bus - and drivers need to all have been loaded, so it is ready for the connection. - At this point, there are several ways to connect to the PHY: - - 1) The PAL handles everything, and only calls the network driver when - the link state changes, so it can react. - - 2) The PAL handles everything except interrupts (usually because the - controller has the interrupt registers). - - 3) The PAL handles everything, but checks in with the driver every second, - allowing the network driver to react first to any changes before the PAL - does. - - 4) The PAL serves only as a library of functions, with the network device - manually calling functions to update status, and configure the PHY - - -Letting the PHY Abstraction Layer do Everything - - If you choose option 1 (The hope is that every driver can, but to still be - useful to drivers that can't), connecting to the PHY is simple: - - First, you need a function to react to changes in the link state. This - function follows this protocol: - - static void adjust_link(struct net_device *dev); - - Next, you need to know the device name of the PHY connected to this device. - The name will look something like, "0:00", where the first number is the - bus id, and the second is the PHY's address on that bus. Typically, - the bus is responsible for making its ID unique. 
- - Now, to connect, just call this function: - - phydev = phy_connect(dev, phy_name, &adjust_link, interface); - - phydev is a pointer to the phy_device structure which represents the PHY. If - phy_connect is successful, it will return the pointer. dev, here, is the - pointer to your net_device. Once done, this function will have started the - PHY's software state machine, and registered for the PHY's interrupt, if it - has one. The phydev structure will be populated with information about the - current state, though the PHY will not yet be truly operational at this - point. - - PHY-specific flags should be set in phydev->dev_flags prior to the call - to phy_connect() such that the underlying PHY driver can check for flags - and perform specific operations based on them. - This is useful if the system has put hardware restrictions on - the PHY/controller, of which the PHY needs to be aware. - - interface is a u32 which specifies the connection type used - between the controller and the PHY. Examples are GMII, MII, - RGMII, and SGMII. For a full list, see include/linux/phy.h - - Now just make sure that phydev->supported and phydev->advertising have any - values pruned from them which don't make sense for your controller (a 10/100 - controller may be connected to a gigabit capable PHY, so you would need to - mask off SUPPORTED_1000baseT*). See include/linux/ethtool.h for definitions - for these bitfields. Note that you should not SET any bits, except the - SUPPORTED_Pause and SUPPORTED_AsymPause bits (see below), or the PHY may get - put into an unsupported state. - - Lastly, once the controller is ready to handle network traffic, you call - phy_start(phydev). This tells the PAL that you are ready, and configures the - PHY to connect to the network. If the MAC interrupt of your network driver - also handles PHY status changes, just set phydev->irq to PHY_IGNORE_INTERRUPT - before you call phy_start and use phy_mac_interrupt() from the network - driver. 
If you don't want to use interrupts, set phydev->irq to PHY_POLL. - phy_start() enables the PHY interrupts (if applicable) and starts the - phylib state machine. - - When you want to disconnect from the network (even if just briefly), you call - phy_stop(phydev). This function also stops the phylib state machine and - disables PHY interrupts. - -Pause frames / flow control - - The PHY does not participate directly in flow control/pause frames except by - making sure that the SUPPORTED_Pause and SUPPORTED_AsymPause bits are set in - MII_ADVERTISE to indicate towards the link partner that the Ethernet MAC - controller supports such a thing. Since flow control/pause frames generation - involves the Ethernet MAC driver, it is recommended that this driver takes care - of properly indicating advertisement and support for such features by setting - the SUPPORTED_Pause and SUPPORTED_AsymPause bits accordingly. This can be done - either before or after phy_connect() and/or as a result of implementing the - ethtool::set_pauseparam feature. - - -Keeping Close Tabs on the PAL - - It is possible that the PAL's built-in state machine needs a little help to - keep your network device and the PHY properly in sync. If so, you can - register a helper function when connecting to the PHY, which will be called - every second before the state machine reacts to any changes. To do this, you - need to manually call phy_attach() and phy_prepare_link(), and then call - phy_start_machine() with the second argument set to point to your special - handler. - - Currently there are no examples of how to use this functionality, and testing - on it has been limited because the author does not have any drivers which use - it (they all use option 1). So Caveat Emptor. - -Doing it all yourself - - There's a remote chance that the PAL's built-in state machine cannot track - the complex interactions between the PHY and your network device. 
If this is - so, you can simply call phy_attach(), and not call phy_start_machine or - phy_prepare_link(). This will mean that phydev->state is entirely yours to - handle (phy_start and phy_stop toggle between some of the states, so you - might need to avoid them). - - An effort has been made to make sure that useful functionality can be - accessed without the state-machine running, and most of these functions are - descended from functions which did not interact with a complex state-machine. - However, again, no effort has been made so far to test running without the - state machine, so tryer beware. - - Here is a brief rundown of the functions: - - int phy_read(struct phy_device *phydev, u16 regnum); - int phy_write(struct phy_device *phydev, u16 regnum, u16 val); - - Simple read/write primitives. They invoke the bus's read/write function - pointers. - - void phy_print_status(struct phy_device *phydev); - - A convenience function to print out the PHY status neatly. - - void phy_request_interrupt(struct phy_device *phydev); - - Requests the IRQ for the PHY interrupts. - - struct phy_device * phy_attach(struct net_device *dev, const char *phy_id, - phy_interface_t interface); - - Attaches a network device to a particular PHY, binding the PHY to a generic - driver if none was found during bus initialization. - - int phy_start_aneg(struct phy_device *phydev); - - Using variables inside the phydev structure, either configures advertising - and resets autonegotiation, or disables autonegotiation, and configures - forced settings. - - static inline int phy_read_status(struct phy_device *phydev); - - Fills the phydev structure with up-to-date information about the current - settings in the PHY. - - int phy_ethtool_sset(struct phy_device *phydev, struct ethtool_cmd *cmd); - - Ethtool convenience functions. - - int phy_mii_ioctl(struct phy_device *phydev, - struct mii_ioctl_data *mii_data, int cmd); - - The MII ioctl. 
Note that this function will completely screw up the state - machine if you write registers like BMCR, BMSR, ADVERTISE, etc. Best to - use this only to write registers which are not standard, and don't set off - a renegotiation. - - -PHY Device Drivers - - With the PHY Abstraction Layer, adding support for new PHYs is - quite easy. In some cases, no work is required at all! However, - many PHYs require a little hand-holding to get up-and-running. - -Generic PHY driver - - If the desired PHY doesn't have any errata, quirks, or special - features you want to support, then it may be best to not add - support, and let the PHY Abstraction Layer's Generic PHY Driver - do all of the work. - -Writing a PHY driver - - If you do need to write a PHY driver, the first thing to do is - make sure it can be matched with an appropriate PHY device. - This is done during bus initialization by reading the device's - UID (stored in registers 2 and 3), then comparing it to each - driver's phy_id field by ANDing it with each driver's - phy_id_mask field. Also, it needs a name. Here's an example: - - static struct phy_driver dm9161_driver = { - .phy_id = 0x0181b880, - .name = "Davicom DM9161E", - .phy_id_mask = 0x0ffffff0, - ... - } - - Next, you need to specify what features (speed, duplex, autoneg, - etc) your PHY device and driver support. Most PHYs support - PHY_BASIC_FEATURES, but you can look in include/mii.h for other - features. - - Each driver consists of a number of function pointers, documented - in include/linux/phy.h under the phy_driver structure. - - Of these, only config_aneg and read_status are required to be - assigned by the driver code. The rest are optional. Also, it is - preferred to use the generic phy driver's versions of these two - functions if at all possible: genphy_read_status and - genphy_config_aneg. 
If this is not possible, it is likely that - you only need to perform some actions before and after invoking - these functions, and so your functions will wrap the generic - ones. - - Feel free to look at the Marvell, Cicada, and Davicom drivers in - drivers/net/phy/ for examples (the lxt and qsemi drivers have - not been tested as of this writing). - - The PHY's MMD register accesses are handled by the PAL framework - by default, but can be overridden by a specific PHY driver if - required. This could be the case if a PHY was released for - manufacturing before the MMD PHY register definitions were - standardized by the IEEE. Most modern PHYs will be able to use - the generic PAL framework for accessing the PHY's MMD registers. - An example of such usage is for Energy Efficient Ethernet support, - implemented in the PAL. This support uses the PAL to access MMD - registers for EEE query and configuration if the PHY supports - the IEEE standard access mechanisms, or can use the PHY's specific - access interfaces if overridden by the specific PHY driver. See - the Micrel driver in drivers/net/phy/ for an example of how this - can be implemented. - -Board Fixups - - Sometimes the specific interaction between the platform and the PHY requires - special handling. For instance, to change where the PHY's clock input is, - or to add a delay to account for latency issues in the data path. In order - to support such contingencies, the PHY Layer allows platform code to register - fixups to be run when the PHY is brought up (or subsequently reset). - - When the PHY Layer brings up a PHY it checks to see if there are any fixups - registered for it, matching based on UID (contained in the PHY device's phy_id - field) and the bus identifier (contained in phydev->dev.bus_id). Both must - match, however two constants, PHY_ANY_ID and PHY_ANY_UID, are provided as - wildcards for the bus ID and UID, respectively. 
- - When a match is found, the PHY layer will invoke the run function associated - with the fixup. This function is passed a pointer to the phy_device of - interest. It should therefore only operate on that PHY. - - The platform code can either register the fixup using phy_register_fixup(): - - int phy_register_fixup(const char *phy_id, - u32 phy_uid, u32 phy_uid_mask, - int (*run)(struct phy_device *)); - - Or using one of the two stubs, phy_register_fixup_for_uid() and - phy_register_fixup_for_id(): - - int phy_register_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask, - int (*run)(struct phy_device *)); - int phy_register_fixup_for_id(const char *phy_id, - int (*run)(struct phy_device *)); - - The stubs set one of the two matching criteria, and set the other one to - match anything. - - When phy_register_fixup() or *_for_uid()/*_for_id() is called at module, - unregister fixup and free allocate memory are required. - - Call one of following function before unloading module. - - int phy_unregister_fixup(const char *phy_id, u32 phy_uid, u32 phy_uid_mask); - int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask); - int phy_register_fixup_for_id(const char *phy_id); - -Standards - - IEEE Standard 802.3: CSMA/CD Access Method and Physical Layer Specifications, Section Two: - http://standards.ieee.org/getieee802/download/802.3-2008_section2.pdf - - RGMII v1.3: - http://web.archive.org/web/20160303212629/http://www.hp.com/rnd/pdfs/RGMIIv1_3.pdf - - RGMII v2.0: - http://web.archive.org/web/20160303171328/http://www.hp.com/rnd/pdfs/RGMIIv2_0_final_hp.pdf -- cgit From d405c7407a5468d4fc11724d76063e0647d80106 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Sat, 26 Jan 2019 12:25:59 -0500 Subject: bpf: allocate 0x06 to new eBPF instruction class JMP32 The new eBPF instruction class JMP32 uses the reserved class number 0x6. Kernel BPF ISA documentation updated accordingly. 
Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- Documentation/networking/filter.txt | 15 ++++++++------- include/uapi/linux/bpf.h | 1 + tools/include/uapi/linux/bpf.h | 1 + 3 files changed, 10 insertions(+), 7 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index 2196b824e96c..01603bc2eff1 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -865,7 +865,7 @@ Three LSB bits store instruction class which is one of: BPF_STX 0x03 BPF_STX 0x03 BPF_ALU 0x04 BPF_ALU 0x04 BPF_JMP 0x05 BPF_JMP 0x05 - BPF_RET 0x06 [ class 6 unused, for future if needed ] + BPF_RET 0x06 BPF_JMP32 0x06 BPF_MISC 0x07 BPF_ALU64 0x07 When BPF_CLASS(code) == BPF_ALU or BPF_JMP, 4th bit encodes source operand ... @@ -902,9 +902,9 @@ If BPF_CLASS(code) == BPF_ALU or BPF_ALU64 [ in eBPF ], BPF_OP(code) is one of: BPF_ARSH 0xc0 /* eBPF only: sign extending shift right */ BPF_END 0xd0 /* eBPF only: endianness conversion */ -If BPF_CLASS(code) == BPF_JMP, BPF_OP(code) is one of: +If BPF_CLASS(code) == BPF_JMP or BPF_JMP32 [ in eBPF ], BPF_OP(code) is one of: - BPF_JA 0x00 + BPF_JA 0x00 /* BPF_JMP only */ BPF_JEQ 0x10 BPF_JGT 0x20 BPF_JGE 0x30 @@ -912,8 +912,8 @@ If BPF_CLASS(code) == BPF_JMP, BPF_OP(code) is one of: BPF_JNE 0x50 /* eBPF only: jump != */ BPF_JSGT 0x60 /* eBPF only: signed '>' */ BPF_JSGE 0x70 /* eBPF only: signed '>=' */ - BPF_CALL 0x80 /* eBPF only: function call */ - BPF_EXIT 0x90 /* eBPF only: function return */ + BPF_CALL 0x80 /* eBPF BPF_JMP only: function call */ + BPF_EXIT 0x90 /* eBPF BPF_JMP only: function return */ BPF_JLT 0xa0 /* eBPF only: unsigned '<' */ BPF_JLE 0xb0 /* eBPF only: unsigned '<=' */ BPF_JSLT 0xc0 /* eBPF only: signed '<' */ @@ -936,8 +936,9 @@ Classic BPF wastes the whole BPF_RET class to represent a single 'ret' operation. 
Classic BPF_RET | BPF_K means copy imm32 into return register and perform function exit. eBPF is modeled to match CPU, so BPF_JMP | BPF_EXIT in eBPF means function exit only. The eBPF program needs to store return -value into register R0 before doing a BPF_EXIT. Class 6 in eBPF is currently -unused and reserved for future use. +value into register R0 before doing a BPF_EXIT. Class 6 in eBPF is used as +BPF_JMP32 to mean exactly the same operations as BPF_JMP, but with 32-bit wide +operands for the comparisons instead. For load and store instructions the 8-bit 'code' field is divided as: diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2940a9854f6d..60b99b730a41 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -14,6 +14,7 @@ /* Extended instruction set based on top of classic BPF */ /* instruction classes */ +#define BPF_JMP32 0x06 /* jmp mode in word width */ #define BPF_ALU64 0x07 /* alu mode in double word width */ /* ld/ldx fields */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2940a9854f6d..60b99b730a41 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -14,6 +14,7 @@ /* Extended instruction set based on top of classic BPF */ /* instruction classes */ +#define BPF_JMP32 0x06 /* jmp mode in word width */ #define BPF_ALU64 0x07 /* alu mode in double word width */ /* ld/ldx fields */ -- cgit From 39c6b53cc01fec4c7addd3e841fea5125639c0fd Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 28 Jan 2019 12:02:11 +0000 Subject: Documentation: add devlink param file for mlxsw driver Add initial documentation file for devlink params of mlxsw driver. Only "fw_load_policy" is now supported. Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- Documentation/networking/devlink-params-mlxsw.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Documentation/networking/devlink-params-mlxsw.txt (limited to 'Documentation') diff --git a/Documentation/networking/devlink-params-mlxsw.txt b/Documentation/networking/devlink-params-mlxsw.txt new file mode 100644 index 000000000000..2c5c67a920c9 --- /dev/null +++ b/Documentation/networking/devlink-params-mlxsw.txt @@ -0,0 +1,2 @@ +fw_load_policy [DEVICE, GENERIC] + Configuration mode: driverinit -- cgit From 9389b5e9467f71360d83310502de1ba0dc8960f1 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Wed, 30 Jan 2019 11:24:06 +1000 Subject: dt-bindings: net: dsa: add new MT7530 binding to support MT7621 Add devicetree binding to support the compatible mt7530 switch as used in the MediaTek MT7621 SoC. Signed-off-by: Greg Ungerer Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Acked-by: Sean Wang Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/dsa/mt7530.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/dsa/mt7530.txt b/Documentation/devicetree/bindings/net/dsa/mt7530.txt index aa3527f71fdc..47aa205ee0bd 100644 --- a/Documentation/devicetree/bindings/net/dsa/mt7530.txt +++ b/Documentation/devicetree/bindings/net/dsa/mt7530.txt @@ -3,12 +3,16 @@ Mediatek MT7530 Ethernet switch Required properties: -- compatible: Must be compatible = "mediatek,mt7530"; +- compatible: may be compatible = "mediatek,mt7530" + or compatible = "mediatek,mt7621" - #address-cells: Must be 1. - #size-cells: Must be 0. - mediatek,mcm: Boolean; if defined, indicates that either MT7530 is the part on multi-chip module belong to MT7623A has or the remotely standalone chip as the function MT7623N reference board provided for. 
+ +If compatible mediatek,mt7530 is set then the following properties are required + - core-supply: Phandle to the regulator node necessary for the core power. - io-supply: Phandle to the regulator node necessary for the I/O power. See Documentation/devicetree/bindings/regulator/mt6323-regulator.txt -- cgit From 7d4194633b29342d93501b53accebf638da134ad Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 1 Feb 2019 11:53:32 +0100 Subject: mac80211: fix missing/malformed documentation Fix the missing and malformed documentation that kernel-doc and sphinx warn about. While at it, also add some things to the docs to fix missing links. Sadly, the only way I could find to fix this was to add some trailing whitespace. Signed-off-by: Johannes Berg --- Documentation/driver-api/80211/mac80211.rst | 3 ++ include/net/mac80211.h | 44 +++++++++++++++++++++++------ net/mac80211/sta_info.h | 18 ++++++++++++ 3 files changed, 57 insertions(+), 8 deletions(-) (limited to 'Documentation') diff --git a/Documentation/driver-api/80211/mac80211.rst b/Documentation/driver-api/80211/mac80211.rst index 85a8335e80b6..eab40bcf3987 100644 --- a/Documentation/driver-api/80211/mac80211.rst +++ b/Documentation/driver-api/80211/mac80211.rst @@ -125,6 +125,9 @@ functions/definitions .. kernel-doc:: include/net/mac80211.h :functions: ieee80211_rx_status +.. kernel-doc:: include/net/mac80211.h + :functions: mac80211_rx_encoding_flags + .. 
kernel-doc:: include/net/mac80211.h :functions: mac80211_rx_flags diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 396b0c079c3b..de866a7253c9 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -942,8 +942,32 @@ ieee80211_rate_get_vht_nss(const struct ieee80211_tx_rate *rate) * @band: the band to transmit on (use for checking for races) * @hw_queue: HW queue to put the frame on, skb_get_queue_mapping() gives the AC * @ack_frame_id: internal frame ID for TX status, used internally - * @control: union for control data - * @status: union for status data + * @control: union part for control data + * @control.rates: TX rates array to try + * @control.rts_cts_rate_idx: rate for RTS or CTS + * @control.use_rts: use RTS + * @control.use_cts_prot: use RTS/CTS + * @control.short_preamble: use short preamble (CCK only) + * @control.skip_table: skip externally configured rate table + * @control.jiffies: timestamp for expiry on powersave clients + * @control.vif: virtual interface (may be NULL) + * @control.hw_key: key to encrypt with (may be NULL) + * @control.flags: control flags, see &enum mac80211_tx_control_flags + * @control.enqueue_time: enqueue time (for iTXQs) + * @driver_rates: alias to @control.rates to reserve space + * @pad: padding + * @rate_driver_data: driver use area if driver needs @control.rates + * @status: union part for status data + * @status.rates: attempted rates + * @status.ack_signal: ACK signal + * @status.ampdu_ack_len: AMPDU ack length + * @status.ampdu_len: AMPDU length + * @status.antenna: (legacy, kept only for iwlegacy) + * @status.tx_time: airtime consumed for transmission + * @status.is_valid_ack_signal: ACK signal is valid + * @status.status_driver_data: driver use area + * @ack: union part for pure ACK data + * @ack.cookie: cookie for the ACK * @driver_data: array of driver_data pointers * @ampdu_ack_len: number of acked aggregated frames. * relevant only if IEEE80211_TX_STAT_AMPDU was set. 
@@ -1163,6 +1187,7 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * @RX_FLAG_AMPDU_EOF_BIT_KNOWN: The EOF value is known * @RX_FLAG_RADIOTAP_HE: HE radiotap data is present * (&struct ieee80211_radiotap_he, mac80211 will fill in + * * - DATA3_DATA_MCS * - DATA3_DATA_DCM * - DATA3_CODING @@ -1170,6 +1195,7 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * - DATA5_DATA_BW_RU_ALLOC * - DATA6_NSTS * - DATA3_STBC + * * from the RX info data, so leave those zeroed when building this data) * @RX_FLAG_RADIOTAP_HE_MU: HE MU radiotap data is present * (&struct ieee80211_radiotap_he_mu) @@ -1220,7 +1246,7 @@ enum mac80211_rx_flags { * @RX_ENC_FLAG_HT_GF: This frame was received in a HT-greenfield transmission, * if the driver fills this value it should add * %IEEE80211_RADIOTAP_MCS_HAVE_FMT - * to hw.radiotap_mcs_details to advertise that fact + * to @hw.radiotap_mcs_details to advertise that fact. * @RX_ENC_FLAG_LDPC: LDPC was used * @RX_ENC_FLAG_STBC_MASK: STBC 2 bit bitmask. 1 - Nss=1, 2 - Nss=2, 3 - Nss=3 * @RX_ENC_FLAG_BF: packet was beamformed @@ -2333,12 +2359,14 @@ enum ieee80211_hw_flags { * @radiotap_he: HE radiotap validity flags * * @radiotap_timestamp: Information for the radiotap timestamp field; if the - * 'units_pos' member is set to a non-negative value it must be set to - * a combination of a IEEE80211_RADIOTAP_TIMESTAMP_UNIT_* and a - * IEEE80211_RADIOTAP_TIMESTAMP_SPOS_* value, and then the timestamp + * @units_pos member is set to a non-negative value then the timestamp * field will be added and populated from the &struct ieee80211_rx_status - * device_timestamp. If the 'accuracy' member is non-negative, it's put - * into the accuracy radiotap field and the accuracy known flag is set. + * device_timestamp. + * @radiotap_timestamp.units_pos: Must be set to a combination of a + * IEEE80211_RADIOTAP_TIMESTAMP_UNIT_* and a + * IEEE80211_RADIOTAP_TIMESTAMP_SPOS_* value. 
+ * @radiotap_timestamp.accuracy: If non-negative, fills the accuracy in the + * radiotap field and the accuracy known flag will be set. * * @netdev_features: netdev features to be set in each netdev created * from this HW. Note that not all features are usable with mac80211, diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 05647d835894..71f7e4973329 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -497,10 +497,28 @@ struct ieee80211_sta_rx_stats { * @tdls_chandef: a TDLS peer can have a wider chandef that is compatible to * the BSS one. * @tx_stats: TX statistics + * @tx_stats.packets: # of packets transmitted + * @tx_stats.bytes: # of bytes in all packets transmitted + * @tx_stats.last_rate: last TX rate + * @tx_stats.msdu: # of transmitted MSDUs per TID * @rx_stats: RX statistics + * @rx_stats_avg: averaged RX statistics + * @rx_stats_avg.signal: averaged signal + * @rx_stats_avg.chain_signal: averaged per-chain signal * @pcpu_rx_stats: per-CPU RX statistics, assigned only if the driver needs * this (by advertising the USES_RSS hw flag) * @status_stats: TX status statistics + * @status_stats.filtered: # of filtered frames + * @status_stats.retry_failed: # of frames that failed after retry + * @status_stats.retry_count: # of retries attempted + * @status_stats.lost_packets: # of lost packets + * @status_stats.last_tdls_pkt_time: timestamp of last TDLS packet + * @status_stats.msdu_retries: # of MSDU retries + * @status_stats.msdu_failed: # of failed MSDUs + * @status_stats.last_ack: last ack timestamp (jiffies) + * @status_stats.last_ack_signal: last ACK signal + * @status_stats.ack_signal_filled: last ACK signal validity + * @status_stats.avg_ack_signal: average ACK signal */ struct sta_info { /* General information, mostly static */ -- cgit From 3fc46fc9f68c71e21afde62984444b2ffdd1ea90 Mon Sep 17 00:00:00 2001 From: Martin Kepplinger Date: Thu, 31 Jan 2019 11:14:18 +0100 Subject: ipconfig: add carrier_timeout kernel 
parameter commit 3fb72f1e6e61 ("ipconfig wait for carrier") added a "wait for carrier" policy, with a fixed worst case maximum wait of two minutes. Now make the wait for carrier timeout configurable on the kernel commandline and use the 120s as the default. The timeout messages introduced with commit 5e404cd65860 ("ipconfig: add informative timeout messages while waiting for carrier") are done in a fixed interval of 20 seconds, just like they were before (240/12). Signed-off-by: Martin Kepplinger Signed-off-by: David S. Miller --- Documentation/admin-guide/kernel-parameters.txt | 5 +++++ net/ipv4/ipconfig.c | 27 ++++++++++++++++++++----- 2 files changed, 27 insertions(+), 5 deletions(-) (limited to 'Documentation') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index b799bcf67d7b..7afb2fedde0a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -461,6 +461,11 @@ possible to determine what the correct size should be. This option provides an override for these situations. + carrier_timeout= + [NET] Specifies amount of time (in seconds) that + the kernel should wait for a network carrier. By default + it waits 120 seconds. + ca_keys= [KEYS] This parameter identifies a specific key(s) on the system trusted keyring to be used for certificate trust validation. 
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index b9a9873c25c6..9bcca08efec9 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -85,7 +85,6 @@ /* Define the friendly delay before and after opening net devices */ #define CONF_POST_OPEN 10 /* After opening: 10 msecs */ -#define CONF_CARRIER_TIMEOUT 120000 /* Wait for carrier timeout */ /* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */ #define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */ @@ -101,6 +100,9 @@ #define NONE cpu_to_be32(INADDR_NONE) #define ANY cpu_to_be32(INADDR_ANY) +/* Wait for carrier timeout default in seconds */ +static unsigned int carrier_timeout = 120; + /* * Public IP configuration */ @@ -268,9 +270,9 @@ static int __init ic_open_devs(void) /* wait for a carrier on at least one device */ start = jiffies; - next_msg = start + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12); + next_msg = start + msecs_to_jiffies(20000); while (time_before(jiffies, start + - msecs_to_jiffies(CONF_CARRIER_TIMEOUT))) { + msecs_to_jiffies(carrier_timeout * 1000))) { int wait, elapsed; for_each_netdev(&init_net, dev) @@ -283,9 +285,9 @@ static int __init ic_open_devs(void) continue; elapsed = jiffies_to_msecs(jiffies - start); - wait = (CONF_CARRIER_TIMEOUT - elapsed + 500)/1000; + wait = (carrier_timeout * 1000 - elapsed + 500) / 1000; pr_info("Waiting up to %d more seconds for network.\n", wait); - next_msg = jiffies + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12); + next_msg = jiffies + msecs_to_jiffies(20000); } have_carrier: rtnl_unlock(); @@ -1780,3 +1782,18 @@ static int __init vendor_class_identifier_setup(char *addrs) return 1; } __setup("dhcpclass=", vendor_class_identifier_setup); + +static int __init set_carrier_timeout(char *str) +{ + ssize_t ret; + + if (!str) + return 0; + + ret = kstrtouint(str, 0, &carrier_timeout); + if (ret) + return 0; + + return 1; +} +__setup("carrier_timeout=", set_carrier_timeout); -- cgit From 785bd550c4fb0e12cb29dd24cf0a0be103f145a0 Mon Sep 17 
00:00:00 2001 From: Jakub Kicinski Date: Thu, 31 Jan 2019 10:50:42 -0800 Subject: devlink: add generic info version names Add defines and docs for generic info versions. v3: - add docs; - separate patch (Jiri). Signed-off-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink-info-versions.rst | 38 ++++++++++++++++++++++ Documentation/networking/index.rst | 1 + include/net/devlink.h | 14 ++++++++ 3 files changed, 53 insertions(+) create mode 100644 Documentation/networking/devlink-info-versions.rst (limited to 'Documentation') diff --git a/Documentation/networking/devlink-info-versions.rst b/Documentation/networking/devlink-info-versions.rst new file mode 100644 index 000000000000..7d4ecf6b6f34 --- /dev/null +++ b/Documentation/networking/devlink-info-versions.rst @@ -0,0 +1,38 @@ +.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + +===================== +Devlink info versions +===================== + +board.id +======== + +Unique identifier of the board design. + +board.rev +========= + +Board design revision. + +fw.mgmt +======= + +Control unit firmware version. This firmware is responsible for house +keeping tasks, PHY control etc. but not the packet-by-packet data path +operation. + +fw.app +====== + +Data path microcode controlling high-speed packet processing. + +fw.undi +======= + +UNDI software, may include the UEFI driver, firmware or both. + +fw.ncsi +======= + +Version of the software responsible for supporting/handling the +Network Controller Sideband Interface. 
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index f1627ca2a0ea..9a32451cd201 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -24,6 +24,7 @@ Contents: device_drivers/intel/i40e device_drivers/intel/iavf device_drivers/intel/ice + devlink-info-versions kapi z8530book msg_zerocopy diff --git a/include/net/devlink.h b/include/net/devlink.h index 6dc0ef964392..6b417f141fd6 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -428,6 +428,20 @@ enum devlink_param_wol_types { .validate = _validate, \ } +/* Part number, identifier of board design */ +#define DEVLINK_INFO_VERSION_GENERIC_BOARD_ID "board.id" +/* Revision of board design */ +#define DEVLINK_INFO_VERSION_GENERIC_BOARD_REV "board.rev" + +/* Control processor FW version */ +#define DEVLINK_INFO_VERSION_GENERIC_FW_MGMT "fw.mgmt" +/* Data path microcode controlling high-speed packet processing */ +#define DEVLINK_INFO_VERSION_GENERIC_FW_APP "fw.app" +/* UNDI software version */ +#define DEVLINK_INFO_VERSION_GENERIC_FW_UNDI "fw.undi" +/* NCSI support/handler version */ +#define DEVLINK_INFO_VERSION_GENERIC_FW_NCSI "fw.ncsi" + struct devlink_region; struct devlink_info_req; -- cgit From 0bb16830ebb6220f35ee9fc2c60be0958fc5be6f Mon Sep 17 00:00:00 2001 From: Alex Williams Date: Thu, 31 Jan 2019 13:33:28 -0800 Subject: net: nixge: Update device-tree bindings with v3.00 Now the DMA engine is free to float elsewhere in the system map. Signed-off-by: Alex Williams Signed-off-by: David S. 
Miller --- Documentation/devicetree/bindings/net/nixge.txt | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/nixge.txt b/Documentation/devicetree/bindings/net/nixge.txt index e55af7f0881a..44a7358b4399 100644 --- a/Documentation/devicetree/bindings/net/nixge.txt +++ b/Documentation/devicetree/bindings/net/nixge.txt @@ -1,8 +1,14 @@ * NI XGE Ethernet controller Required properties: -- compatible: Should be "ni,xge-enet-2.00" -- reg: Address and length of the register set for the device +- compatible: Should be "ni,xge-enet-3.00", but can be "ni,xge-enet-2.00" for + older device trees with DMA engines co-located in the address map, + with the one reg entry to describe the whole device. +- reg: Address and length of the register set for the device. It contains the + information of registers in the same order as described by reg-names. +- reg-names: Should contain the reg names + "dma": DMA engine control and status region + "ctrl": MDIO and PHY control and status region - interrupts: Should contain tx and rx interrupt - interrupt-names: Should be "rx" and "tx" - phy-mode: See ethernet.txt file in the same directory. @@ -12,8 +18,10 @@ Required properties: Examples (10G generic PHY): nixge0: ethernet@40000000 { - compatible = "ni,xge-enet-2.00"; - reg = <0x40000000 0x6000>; + compatible = "ni,xge-enet-3.00"; + reg = <0x40000000 0x4000 + 0x41002000 0x2000>; + reg-names = "dma", "ctrl"; nvmem-cells = <&eth1_addr>; nvmem-cell-names = "address"; -- cgit From 9dd49211b8ca85dfe7ee9054f2695fe790fa0874 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sat, 2 Feb 2019 07:34:52 -0800 Subject: socket: Update timestamping Documentation With the new y2038 safe timestamping options added, update the documentation to reflect the changes. Signed-off-by: Deepa Dinamani Acked-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- Documentation/networking/timestamping.txt | 43 +++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 5 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt index 9d1432e0aaa8..bbdaf8990031 100644 --- a/Documentation/networking/timestamping.txt +++ b/Documentation/networking/timestamping.txt @@ -6,11 +6,21 @@ The interfaces for receiving network packages timestamps are: * SO_TIMESTAMP Generates a timestamp for each incoming packet in (not necessarily monotonic) system time. Reports the timestamp via recvmsg() in a - control message as struct timeval (usec resolution). + control message in usec resolution. + SO_TIMESTAMP is defined as SO_TIMESTAMP_NEW or SO_TIMESTAMP_OLD + based on the architecture type and time_t representation of libc. + Control message format is in struct __kernel_old_timeval for + SO_TIMESTAMP_OLD and in struct __kernel_sock_timeval for + SO_TIMESTAMP_NEW options respectively. * SO_TIMESTAMPNS Same timestamping mechanism as SO_TIMESTAMP, but reports the - timestamp as struct timespec (nsec resolution). + timestamp as struct timespec in nsec resolution. + SO_TIMESTAMPNS is defined as SO_TIMESTAMPNS_NEW or SO_TIMESTAMPNS_OLD + based on the architecture type and time_t representation of libc. + Control message format is in struct timespec for SO_TIMESTAMPNS_OLD + and in struct __kernel_timespec for SO_TIMESTAMPNS_NEW options + respectively. * IP_MULTICAST_LOOP + SO_TIMESTAMP[NS] Only for multicast:approximate transmit timestamp obtained by @@ -22,7 +32,7 @@ The interfaces for receiving network packages timestamps are: timestamps for stream sockets. -1.1 SO_TIMESTAMP: +1.1 SO_TIMESTAMP (also SO_TIMESTAMP_OLD and SO_TIMESTAMP_NEW): This socket option enables timestamping of datagrams on the reception path. Because the destination socket, if any, is not known early in @@ -31,15 +41,25 @@ same is true for all early receive timestamp options. 
For interface details, see `man 7 socket`. +Always use SO_TIMESTAMP_NEW timestamp to always get timestamp in +struct __kernel_sock_timeval format. -1.2 SO_TIMESTAMPNS: +SO_TIMESTAMP_OLD returns incorrect timestamps after the year 2038 +on 32 bit machines. + +1.2 SO_TIMESTAMPNS (also SO_TIMESTAMPNS_OLD and SO_TIMESTAMPNS_NEW): This option is identical to SO_TIMESTAMP except for the returned data type. Its struct timespec allows for higher resolution (ns) timestamps than the timeval of SO_TIMESTAMP (ms). +Always use SO_TIMESTAMPNS_NEW timestamp to always get timestamp in +struct __kernel_timespec format. + +SO_TIMESTAMPNS_OLD returns incorrect timestamps after the year 2038 +on 32 bit machines. -1.3 SO_TIMESTAMPING: +1.3 SO_TIMESTAMPING (also SO_TIMESTAMPING_OLD and SO_TIMESTAMPING_NEW): Supports multiple types of timestamp requests. As a result, this socket option takes a bitmap of flags, not a boolean. In @@ -323,10 +343,23 @@ SO_TIMESTAMP and SO_TIMESTAMPNS records can be retrieved. These timestamps are returned in a control message with cmsg_level SOL_SOCKET, cmsg_type SCM_TIMESTAMPING, and payload of type +For SO_TIMESTAMPING_OLD: + struct scm_timestamping { struct timespec ts[3]; }; +For SO_TIMESTAMPING_NEW: + +struct scm_timestamping64 { + struct __kernel_timespec ts[3]; + +Always use SO_TIMESTAMPING_NEW timestamp to always get timestamp in +struct scm_timestamping64 format. + +SO_TIMESTAMPING_OLD returns incorrect timestamps after the year 2038 +on 32 bit machines. + The structure can return up to three timestamps. This is a legacy feature. At least one field is non-zero at any time. Most timestamps are passed in ts[0]. Hardware timestamps are passed in ts[2]. 
-- cgit From 5468e82f7034f0ae175a3ce075441356099bdaa3 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 4 Feb 2019 11:26:18 +0100 Subject: net: phy: fixed-phy: Drop GPIO from fixed_phy_add() All users of the fixed_phy_add() pass -1 as GPIO number to the fixed phy driver, and all users of fixed_phy_register() pass -1 as GPIO number as well, except for the device tree MDIO bus. Any new users should create a proper device and pass the GPIO as a descriptor associated with the device so delete the GPIO argument from the calls and drop the code looking requesting a GPIO in fixed_phy_add(). In fixed phy_register(), investigate the "fixed-link" node and pick the GPIO descriptor from "link-gpios" if this property exists. Move the corresponding code out of of_mdio.c as the fixed phy code anyways requires OF to be in use. Tested-by: Andrew Lunn Signed-off-by: Linus Walleij Signed-off-by: David S. Miller --- .../networking/device_drivers/stmicro/stmmac.txt | 2 +- arch/m68k/coldfire/m5272.c | 2 +- arch/mips/ar7/platform.c | 4 +- arch/mips/bcm47xx/setup.c | 2 +- drivers/net/dsa/dsa_loop.c | 2 +- drivers/net/ethernet/broadcom/bgmac.c | 2 +- drivers/net/ethernet/broadcom/genet/bcmmii.c | 2 +- drivers/net/phy/fixed_phy.c | 82 ++++++++++++++++------ drivers/net/usb/lan78xx.c | 3 +- drivers/of/of_mdio.c | 9 +-- include/linux/phy_fixed.h | 8 +-- 11 files changed, 72 insertions(+), 46 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/device_drivers/stmicro/stmmac.txt b/Documentation/networking/device_drivers/stmicro/stmmac.txt index 2bb07078f535..1ae979fd90d2 100644 --- a/Documentation/networking/device_drivers/stmicro/stmmac.txt +++ b/Documentation/networking/device_drivers/stmicro/stmmac.txt @@ -267,7 +267,7 @@ static struct fixed_phy_status stmmac0_fixed_phy_status = { During the board's device_init we can configure the first MAC for fixed_link by calling: - fixed_phy_add(PHY_POLL, 1, &stmmac0_fixed_phy_status, -1); + fixed_phy_add(PHY_POLL, 1, 
&stmmac0_fixed_phy_status); and the second one, with a real PHY device attached to the bus, by using the stmmac_mdio_bus_data structure (to provide the id, the reset procedure etc). diff --git a/arch/m68k/coldfire/m5272.c b/arch/m68k/coldfire/m5272.c index ad1185c68df7..6b3ab583c698 100644 --- a/arch/m68k/coldfire/m5272.c +++ b/arch/m68k/coldfire/m5272.c @@ -127,7 +127,7 @@ static struct fixed_phy_status nettel_fixed_phy_status __initdata = { static int __init init_BSP(void) { m5272_uarts_init(); - fixed_phy_add(PHY_POLL, 0, &nettel_fixed_phy_status, -1); + fixed_phy_add(PHY_POLL, 0, &nettel_fixed_phy_status); return 0; } diff --git a/arch/mips/ar7/platform.c b/arch/mips/ar7/platform.c index f09262e0a72f..10ff07b7721e 100644 --- a/arch/mips/ar7/platform.c +++ b/arch/mips/ar7/platform.c @@ -683,7 +683,7 @@ static int __init ar7_register_devices(void) if (ar7_has_high_cpmac()) { res = fixed_phy_add(PHY_POLL, cpmac_high.id, - &fixed_phy_status, -1); + &fixed_phy_status); if (!res) { cpmac_get_mac(1, cpmac_high_data.dev_addr); @@ -696,7 +696,7 @@ static int __init ar7_register_devices(void) } else cpmac_low_data.phy_mask = 0xffffffff; - res = fixed_phy_add(PHY_POLL, cpmac_low.id, &fixed_phy_status, -1); + res = fixed_phy_add(PHY_POLL, cpmac_low.id, &fixed_phy_status); if (!res) { cpmac_get_mac(0, cpmac_low_data.dev_addr); res = platform_device_register(&cpmac_low); diff --git a/arch/mips/bcm47xx/setup.c b/arch/mips/bcm47xx/setup.c index fe3773539eff..82627c264964 100644 --- a/arch/mips/bcm47xx/setup.c +++ b/arch/mips/bcm47xx/setup.c @@ -274,7 +274,7 @@ static int __init bcm47xx_register_bus_complete(void) bcm47xx_leds_register(); bcm47xx_workarounds(); - fixed_phy_add(PHY_POLL, 0, &bcm47xx_fixed_phy_status, -1); + fixed_phy_add(PHY_POLL, 0, &bcm47xx_fixed_phy_status); return 0; } device_initcall(bcm47xx_register_bus_complete); diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c index 816f34d64736..17482ae09aa5 100644 --- a/drivers/net/dsa/dsa_loop.c 
+++ b/drivers/net/dsa/dsa_loop.c @@ -343,7 +343,7 @@ static int __init dsa_loop_init(void) unsigned int i; for (i = 0; i < NUM_FIXED_PHYS; i++) - phydevs[i] = fixed_phy_register(PHY_POLL, &status, -1, NULL); + phydevs[i] = fixed_phy_register(PHY_POLL, &status, NULL); return mdio_driver_register(&dsa_loop_drv); } diff --git a/drivers/net/ethernet/broadcom/bgmac.c b/drivers/net/ethernet/broadcom/bgmac.c index 2d3a44c40221..4632dd5dbad1 100644 --- a/drivers/net/ethernet/broadcom/bgmac.c +++ b/drivers/net/ethernet/broadcom/bgmac.c @@ -1446,7 +1446,7 @@ int bgmac_phy_connect_direct(struct bgmac *bgmac) struct phy_device *phy_dev; int err; - phy_dev = fixed_phy_register(PHY_POLL, &fphy_status, -1, NULL); + phy_dev = fixed_phy_register(PHY_POLL, &fphy_status, NULL); if (!phy_dev || IS_ERR(phy_dev)) { dev_err(bgmac->dev, "Failed to register fixed PHY device\n"); return -ENODEV; diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c index aceb9b7b55bd..51880d83131a 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmmii.c +++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c @@ -525,7 +525,7 @@ static int bcmgenet_mii_pd_init(struct bcmgenet_priv *priv) .asym_pause = 0, }; - phydev = fixed_phy_register(PHY_POLL, &fphy_status, -1, NULL); + phydev = fixed_phy_register(PHY_POLL, &fphy_status, NULL); if (!phydev || IS_ERR(phydev)) { dev_err(kdev, "failed to register fixed PHY device\n"); return -ENODEV; diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c index 47a8cb574c45..f136a23c1a35 100644 --- a/drivers/net/phy/fixed_phy.c +++ b/drivers/net/phy/fixed_phy.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include @@ -38,7 +38,7 @@ struct fixed_phy { bool no_carrier; int (*link_update)(struct net_device *, struct fixed_phy_status *); struct list_head node; - int link_gpio; + struct gpio_desc *link_gpiod; }; static struct platform_device *pdev; @@ -67,8 +67,8 @@ 
EXPORT_SYMBOL_GPL(fixed_phy_change_carrier); static void fixed_phy_update(struct fixed_phy *fp) { - if (!fp->no_carrier && gpio_is_valid(fp->link_gpio)) - fp->status.link = !!gpio_get_value_cansleep(fp->link_gpio); + if (!fp->no_carrier && fp->link_gpiod) + fp->status.link = !!gpiod_get_value_cansleep(fp->link_gpiod); } static int fixed_mdio_read(struct mii_bus *bus, int phy_addr, int reg_num) @@ -133,9 +133,9 @@ int fixed_phy_set_link_update(struct phy_device *phydev, } EXPORT_SYMBOL_GPL(fixed_phy_set_link_update); -int fixed_phy_add(unsigned int irq, int phy_addr, - struct fixed_phy_status *status, - int link_gpio) +static int fixed_phy_add_gpiod(unsigned int irq, int phy_addr, + struct fixed_phy_status *status, + struct gpio_desc *gpiod) { int ret; struct fixed_mdio_bus *fmb = &platform_fmb; @@ -156,24 +156,19 @@ int fixed_phy_add(unsigned int irq, int phy_addr, fp->addr = phy_addr; fp->status = *status; - fp->link_gpio = link_gpio; - - if (gpio_is_valid(fp->link_gpio)) { - ret = gpio_request_one(fp->link_gpio, GPIOF_DIR_IN, - "fixed-link-gpio-link"); - if (ret) - goto err_regs; - } + fp->link_gpiod = gpiod; fixed_phy_update(fp); list_add_tail(&fp->node, &fmb->phys); return 0; +} -err_regs: - kfree(fp); - return ret; +int fixed_phy_add(unsigned int irq, int phy_addr, + struct fixed_phy_status *status) { + + return fixed_phy_add_gpiod(irq, phy_addr, status, NULL); } EXPORT_SYMBOL_GPL(fixed_phy_add); @@ -187,8 +182,8 @@ static void fixed_phy_del(int phy_addr) list_for_each_entry_safe(fp, tmp, &fmb->phys, node) { if (fp->addr == phy_addr) { list_del(&fp->node); - if (gpio_is_valid(fp->link_gpio)) - gpio_free(fp->link_gpio); + if (fp->link_gpiod) + gpiod_put(fp->link_gpiod); kfree(fp); ida_simple_remove(&phy_fixed_ida, phy_addr); return; @@ -196,12 +191,50 @@ static void fixed_phy_del(int phy_addr) } } +#ifdef CONFIG_OF_GPIO +static struct gpio_desc *fixed_phy_get_gpiod(struct device_node *np) +{ + struct device_node *fixed_link_node; + struct gpio_desc *gpiod; + + 
if (!np) + return NULL; + + fixed_link_node = of_get_child_by_name(np, "fixed-link"); + if (!fixed_link_node) + return NULL; + + /* + * As the fixed link is just a device tree node without any + * Linux device associated with it, we simply have obtain + * the GPIO descriptor from the device tree like this. + */ + gpiod = gpiod_get_from_of_node(fixed_link_node, "link-gpios", 0, + GPIOD_IN, "mdio"); + of_node_put(fixed_link_node); + if (IS_ERR(gpiod)) { + if (PTR_ERR(gpiod) == -EPROBE_DEFER) + return gpiod; + pr_err("error getting GPIO for fixed link %pOF, proceed without\n", + fixed_link_node); + gpiod = NULL; + } + + return gpiod; +} +#else +static struct gpio_desc *fixed_phy_get_gpiod(struct device_node *np) +{ + return NULL; +} +#endif + struct phy_device *fixed_phy_register(unsigned int irq, struct fixed_phy_status *status, - int link_gpio, struct device_node *np) { struct fixed_mdio_bus *fmb = &platform_fmb; + struct gpio_desc *gpiod = NULL; struct phy_device *phy; int phy_addr; int ret; @@ -209,12 +242,17 @@ struct phy_device *fixed_phy_register(unsigned int irq, if (!fmb->mii_bus || fmb->mii_bus->state != MDIOBUS_REGISTERED) return ERR_PTR(-EPROBE_DEFER); + /* Check if we have a GPIO associated with this fixed phy */ + gpiod = fixed_phy_get_gpiod(np); + if (IS_ERR(gpiod)) + return ERR_CAST(gpiod); + /* Get the next available PHY address, up to PHY_MAX_ADDR */ phy_addr = ida_simple_get(&phy_fixed_ida, 0, PHY_MAX_ADDR, GFP_KERNEL); if (phy_addr < 0) return ERR_PTR(phy_addr); - ret = fixed_phy_add(irq, phy_addr, status, link_gpio); + ret = fixed_phy_add_gpiod(irq, phy_addr, status, gpiod); if (ret < 0) { ida_simple_remove(&phy_fixed_ida, phy_addr); return ERR_PTR(ret); diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index e96bc0c6140f..3d92ea6fcc02 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -2051,8 +2051,7 @@ static struct phy_device *lan7801_phy_init(struct lan78xx_net *dev) phydev = 
phy_find_first(dev->mdiobus); if (!phydev) { netdev_dbg(dev->net, "PHY Not Found!! Registering Fixed PHY\n"); - phydev = fixed_phy_register(PHY_POLL, &fphy_status, -1, - NULL); + phydev = fixed_phy_register(PHY_POLL, &fphy_status, NULL); if (IS_ERR(phydev)) { netdev_err(dev->net, "No PHY/fixed_PHY found\n"); return NULL; diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c index 5ad1342f5682..de6157357e26 100644 --- a/drivers/of/of_mdio.c +++ b/drivers/of/of_mdio.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -463,7 +462,6 @@ int of_phy_register_fixed_link(struct device_node *np) struct device_node *fixed_link_node; u32 fixed_link_prop[5]; const char *managed; - int link_gpio = -1; if (of_property_read_string(np, "managed", &managed) == 0 && strcmp(managed, "in-band-status") == 0) { @@ -485,11 +483,7 @@ int of_phy_register_fixed_link(struct device_node *np) status.pause = of_property_read_bool(fixed_link_node, "pause"); status.asym_pause = of_property_read_bool(fixed_link_node, "asym-pause"); - link_gpio = of_get_named_gpio_flags(fixed_link_node, - "link-gpios", 0, NULL); of_node_put(fixed_link_node); - if (link_gpio == -EPROBE_DEFER) - return -EPROBE_DEFER; goto register_phy; } @@ -508,8 +502,7 @@ int of_phy_register_fixed_link(struct device_node *np) return -ENODEV; register_phy: - return PTR_ERR_OR_ZERO(fixed_phy_register(PHY_POLL, &status, link_gpio, - np)); + return PTR_ERR_OR_ZERO(fixed_phy_register(PHY_POLL, &status, np)); } EXPORT_SYMBOL(of_phy_register_fixed_link); diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index 9525567b1951..c78fc203db43 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -15,11 +15,9 @@ struct device_node; #if IS_ENABLED(CONFIG_FIXED_PHY) extern int fixed_phy_change_carrier(struct net_device *dev, bool new_carrier); extern int fixed_phy_add(unsigned int irq, int phy_id, - struct fixed_phy_status *status, - int link_gpio); + struct fixed_phy_status 
*status); extern struct phy_device *fixed_phy_register(unsigned int irq, struct fixed_phy_status *status, - int link_gpio, struct device_node *np); extern void fixed_phy_unregister(struct phy_device *phydev); extern int fixed_phy_set_link_update(struct phy_device *phydev, @@ -27,14 +25,12 @@ extern int fixed_phy_set_link_update(struct phy_device *phydev, struct fixed_phy_status *)); #else static inline int fixed_phy_add(unsigned int irq, int phy_id, - struct fixed_phy_status *status, - int link_gpio) + struct fixed_phy_status *status) { return -ENODEV; } static inline struct phy_device *fixed_phy_register(unsigned int irq, struct fixed_phy_status *status, - int gpio_link, struct device_node *np) { return ERR_PTR(-ENODEV); -- cgit From e8cb0167ae684ed2f73cd880c385b74ef2ae702e Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Tue, 5 Feb 2019 13:41:24 +0100 Subject: bpf, doc: add RISC-V JIT to BPF documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update Documentation/networking/filter.txt and Documentation/sysctl/net.txt to mention RISC-V. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- Documentation/networking/filter.txt | 16 +++++++++------- Documentation/sysctl/net.txt | 1 + 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index 01603bc2eff1..b5e060edfc38 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -464,10 +464,11 @@ breakpoints: 0 1 JIT compiler ------------ -The Linux kernel has a built-in BPF JIT compiler for x86_64, SPARC, PowerPC, -ARM, ARM64, MIPS and s390 and can be enabled through CONFIG_BPF_JIT. 
The JIT -compiler is transparently invoked for each attached filter from user space -or for internal kernel users if it has been previously enabled by root: +The Linux kernel has a built-in BPF JIT compiler for x86_64, SPARC, +PowerPC, ARM, ARM64, MIPS, RISC-V and s390 and can be enabled through +CONFIG_BPF_JIT. The JIT compiler is transparently invoked for each +attached filter from user space or for internal kernel users if it has +been previously enabled by root: echo 1 > /proc/sys/net/core/bpf_jit_enable @@ -603,9 +604,10 @@ got from bpf_prog_create(), and 'ctx' the given context (e.g. skb pointer). All constraints and restrictions from bpf_check_classic() apply before a conversion to the new layout is being done behind the scenes! -Currently, the classic BPF format is being used for JITing on most 32-bit -architectures, whereas x86-64, aarch64, s390x, powerpc64, sparc64, arm32 perform -JIT compilation from eBPF instruction set. +Currently, the classic BPF format is being used for JITing on most +32-bit architectures, whereas x86-64, aarch64, s390x, powerpc64, +sparc64, arm32, riscv (RV64G) perform JIT compilation from eBPF +instruction set. Some core changes of the new internal format: diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index bc0680706870..2ae91d3873bb 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -52,6 +52,7 @@ two flavors of JITs, the newer eBPF JIT currently supported on: - sparc64 - mips64 - s390x + - riscv And the older cBPF JIT supported on the following archs: - mips -- cgit From dd648818dad3ad5ab5151c7f5ca0bb55a0bf71de Mon Sep 17 00:00:00 2001 From: Moritz Fischer Date: Mon, 4 Feb 2019 09:30:38 -0800 Subject: net: nixge: Make mdio child node optional Make MDIO child optional and only instantiate the MDIO bus if the child is actually present. 
There are currently no (in-tree) users of this binding; all (out-of-tree) users use overlays that get shipped together with the FPGA images that contain the IP. This will significantly increase maintainability of future revisions of this IP. Reviewed-by: Andrew Lunn Signed-off-by: Moritz Fischer Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/nixge.txt | 27 ++++++++++++++++++++++--- drivers/net/ethernet/ni/nixge.c | 19 +++++++++++------ 2 files changed, 37 insertions(+), 9 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/nixge.txt b/Documentation/devicetree/bindings/net/nixge.txt index 44a7358b4399..bb2929f9c64f 100644 --- a/Documentation/devicetree/bindings/net/nixge.txt +++ b/Documentation/devicetree/bindings/net/nixge.txt @@ -16,6 +16,9 @@ Required properties: - nvmem-cells: Phandle of nvmem cell containing the MAC address - nvmem-cell-names: Should be "address" +Optional properties: +- mdio subnode to indicate presence of MDIO controller + Examples (10G generic PHY): nixge0: ethernet@40000000 { compatible = "ni,xge-enet-3.00"; @@ -33,8 +36,26 @@ Examples (10G generic PHY): phy-mode = "xgmii"; phy-handle = <&ethernet_phy1>; - ethernet_phy1: ethernet-phy@4 { - compatible = "ethernet-phy-ieee802.3-c45"; - reg = <4>; + mdio { + ethernet_phy1: ethernet-phy@4 { + compatible = "ethernet-phy-ieee802.3-c45"; + reg = <4>; + }; }; }; + +Examples (10G generic PHY, no MDIO): + nixge0: ethernet@40000000 { + compatible = "ni,xge-enet-2.00"; + reg = <0x40000000 0x6000>; + + nvmem-cells = <&eth1_addr>; + nvmem-cell-names = "address"; + + interrupts = <0 29 IRQ_TYPE_LEVEL_HIGH>, <0 30 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "rx", "tx"; + interrupt-parent = <&intc>; + + phy-mode = "xgmii"; + phy-handle = <&ethernet_phy1>; + }; diff --git a/drivers/net/ethernet/ni/nixge.c b/drivers/net/ethernet/ni/nixge.c index 73a98bd2fcd2..c8dd1e4c759d 100644 --- a/drivers/net/ethernet/ni/nixge.c +++ b/drivers/net/ethernet/ni/nixge.c 
@@ -1284,6 +1284,7 @@ static int nixge_probe(struct platform_device *pdev) { struct nixge_priv *priv; struct net_device *ndev; + struct device_node *mn; const u8 *mac_addr; int err; @@ -1335,10 +1336,14 @@ static int nixge_probe(struct platform_device *pdev) priv->coalesce_count_rx = XAXIDMA_DFT_RX_THRESHOLD; priv->coalesce_count_tx = XAXIDMA_DFT_TX_THRESHOLD; - err = nixge_mdio_setup(priv, pdev->dev.of_node); - if (err) { - netdev_err(ndev, "error registering mdio bus"); - goto free_netdev; + mn = of_get_child_by_name(pdev->dev.of_node, "mdio"); + if (mn) { + err = nixge_mdio_setup(priv, mn); + of_node_put(mn); + if (err) { + netdev_err(ndev, "error registering mdio bus"); + goto free_netdev; + } } priv->phy_mode = of_get_phy_mode(pdev->dev.of_node); @@ -1364,7 +1369,8 @@ static int nixge_probe(struct platform_device *pdev) return 0; unregister_mdio: - mdiobus_unregister(priv->mii_bus); + if (priv->mii_bus) + mdiobus_unregister(priv->mii_bus); free_netdev: free_netdev(ndev); @@ -1379,7 +1385,8 @@ static int nixge_remove(struct platform_device *pdev) unregister_netdev(ndev); - mdiobus_unregister(priv->mii_bus); + if (priv->mii_bus) + mdiobus_unregister(priv->mii_bus); free_netdev(ndev); -- cgit From baaac2fb0dda4ce2204e6650138cd578c4b11236 Mon Sep 17 00:00:00 2001 From: Moritz Fischer Date: Mon, 4 Feb 2019 09:30:40 -0800 Subject: dt-bindings: net: Add fixed-link support Update device-tree binding with fixed-link support. With fixed-link support the formerly required property 'phy-handle' is now optional if 'fixed-link' child is present. Signed-off-by: Moritz Fischer Signed-off-by: David S. 
Miller --- Documentation/devicetree/bindings/net/nixge.txt | 33 ++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/nixge.txt b/Documentation/devicetree/bindings/net/nixge.txt index bb2929f9c64f..85d7240a9b20 100644 --- a/Documentation/devicetree/bindings/net/nixge.txt +++ b/Documentation/devicetree/bindings/net/nixge.txt @@ -12,12 +12,14 @@ Required properties: - interrupts: Should contain tx and rx interrupt - interrupt-names: Should be "rx" and "tx" - phy-mode: See ethernet.txt file in the same directory. -- phy-handle: See ethernet.txt file in the same directory. - nvmem-cells: Phandle of nvmem cell containing the MAC address - nvmem-cell-names: Should be "address" Optional properties: - mdio subnode to indicate presence of MDIO controller +- fixed-link : Assume a fixed link. See fixed-link.txt in the same directory. + Use instead of phy-handle. +- phy-handle: See ethernet.txt file in the same directory. 
Examples (10G generic PHY): nixge0: ethernet@40000000 { @@ -59,3 +61,32 @@ Examples (10G generic PHY, no MDIO): phy-mode = "xgmii"; phy-handle = <&ethernet_phy1>; }; + +Examples (1G generic fixed-link + MDIO): + nixge0: ethernet@40000000 { + compatible = "ni,xge-enet-2.00"; + reg = <0x40000000 0x6000>; + + nvmem-cells = <&eth1_addr>; + nvmem-cell-names = "address"; + + interrupts = <0 29 IRQ_TYPE_LEVEL_HIGH>, <0 30 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "rx", "tx"; + interrupt-parent = <&intc>; + + phy-mode = "xgmii"; + + fixed-link { + speed = <1000>; + pause; + link-gpios = <&gpio0 63 GPIO_ACTIVE_HIGH>; + }; + + mdio { + ethernet_phy1: ethernet-phy@4 { + compatible = "ethernet-phy-ieee802.3-c22"; + reg = <4>; + }; + }; + + }; -- cgit From facd86390be23f0a956136a63d415d90f300fb88 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 12 Dec 2018 14:44:24 -0800 Subject: docs/networking: fix formatting of Intel drivers documentation The documentation of Intel drivers is missing the heading adornment for document titles. This causes the generated html to have TOC entries from these documents to appear as top level TOC entries: * Linux* Base Driver for Intel(R) Ethernet Network Connection * Contents * Identifying Your Adapter * Command Line Parameters * AutoNeg * Duplex ... Add overline heading adornment to document titles.
Signed-off-by: Mike Rapoport Signed-off-by: Jeff Kirsher --- Documentation/networking/device_drivers/intel/e100.rst | 1 + Documentation/networking/device_drivers/intel/e1000.rst | 1 + Documentation/networking/device_drivers/intel/e1000e.rst | 1 + Documentation/networking/device_drivers/intel/fm10k.rst | 1 + Documentation/networking/device_drivers/intel/i40e.rst | 1 + Documentation/networking/device_drivers/intel/iavf.rst | 1 + Documentation/networking/device_drivers/intel/ice.rst | 1 + Documentation/networking/device_drivers/intel/igb.rst | 1 + Documentation/networking/device_drivers/intel/igbvf.rst | 1 + Documentation/networking/device_drivers/intel/ixgb.rst | 1 + Documentation/networking/device_drivers/intel/ixgbe.rst | 1 + Documentation/networking/device_drivers/intel/ixgbevf.rst | 1 + 12 files changed, 12 insertions(+) (limited to 'Documentation') diff --git a/Documentation/networking/device_drivers/intel/e100.rst b/Documentation/networking/device_drivers/intel/e100.rst index 5e2839b4ec92..2b9f4887beda 100644 --- a/Documentation/networking/device_drivers/intel/e100.rst +++ b/Documentation/networking/device_drivers/intel/e100.rst @@ -1,5 +1,6 @@ .. SPDX-License-Identifier: GPL-2.0+ +============================================================== Linux* Base Driver for the Intel(R) PRO/100 Family of Adapters ============================================================== diff --git a/Documentation/networking/device_drivers/intel/e1000.rst b/Documentation/networking/device_drivers/intel/e1000.rst index 6379d4d20771..956560b6e745 100644 --- a/Documentation/networking/device_drivers/intel/e1000.rst +++ b/Documentation/networking/device_drivers/intel/e1000.rst @@ -1,5 +1,6 @@ .. 
SPDX-License-Identifier: GPL-2.0+ +=========================================================== Linux* Base Driver for Intel(R) Ethernet Network Connection =========================================================== diff --git a/Documentation/networking/device_drivers/intel/e1000e.rst b/Documentation/networking/device_drivers/intel/e1000e.rst index 33554e5416c5..01999f05509c 100644 --- a/Documentation/networking/device_drivers/intel/e1000e.rst +++ b/Documentation/networking/device_drivers/intel/e1000e.rst @@ -1,5 +1,6 @@ .. SPDX-License-Identifier: GPL-2.0+ +====================================================== Linux* Driver for Intel(R) Ethernet Network Connection ====================================================== diff --git a/Documentation/networking/device_drivers/intel/fm10k.rst b/Documentation/networking/device_drivers/intel/fm10k.rst index bf5e5942f28d..ac3269e34f55 100644 --- a/Documentation/networking/device_drivers/intel/fm10k.rst +++ b/Documentation/networking/device_drivers/intel/fm10k.rst @@ -1,5 +1,6 @@ .. SPDX-License-Identifier: GPL-2.0+ +============================================================== Linux* Base Driver for Intel(R) Ethernet Multi-host Controller ============================================================== diff --git a/Documentation/networking/device_drivers/intel/i40e.rst b/Documentation/networking/device_drivers/intel/i40e.rst index 0cc16c525d10..848fd388fa6e 100644 --- a/Documentation/networking/device_drivers/intel/i40e.rst +++ b/Documentation/networking/device_drivers/intel/i40e.rst @@ -1,5 +1,6 @@ .. 
SPDX-License-Identifier: GPL-2.0+ +================================================================== Linux* Base Driver for the Intel(R) Ethernet Controller 700 Series ================================================================== diff --git a/Documentation/networking/device_drivers/intel/iavf.rst b/Documentation/networking/device_drivers/intel/iavf.rst index f8b42b64eb28..2d0c3baa1752 100644 --- a/Documentation/networking/device_drivers/intel/iavf.rst +++ b/Documentation/networking/device_drivers/intel/iavf.rst @@ -1,5 +1,6 @@ .. SPDX-License-Identifier: GPL-2.0+ +================================================================== Linux* Base Driver for Intel(R) Ethernet Adaptive Virtual Function ================================================================== diff --git a/Documentation/networking/device_drivers/intel/ice.rst b/Documentation/networking/device_drivers/intel/ice.rst index 4d118b827bbb..c220aa2711c6 100644 --- a/Documentation/networking/device_drivers/intel/ice.rst +++ b/Documentation/networking/device_drivers/intel/ice.rst @@ -1,5 +1,6 @@ .. SPDX-License-Identifier: GPL-2.0+ +=================================================================== Linux* Base Driver for the Intel(R) Ethernet Connection E800 Series =================================================================== diff --git a/Documentation/networking/device_drivers/intel/igb.rst b/Documentation/networking/device_drivers/intel/igb.rst index e87a4a72ea2d..fc8cfaa5dcfa 100644 --- a/Documentation/networking/device_drivers/intel/igb.rst +++ b/Documentation/networking/device_drivers/intel/igb.rst @@ -1,5 +1,6 @@ .. 
SPDX-License-Identifier: GPL-2.0+ +=========================================================== Linux* Base Driver for Intel(R) Ethernet Network Connection =========================================================== diff --git a/Documentation/networking/device_drivers/intel/igbvf.rst b/Documentation/networking/device_drivers/intel/igbvf.rst index a8a9ffa4f8d3..9cddabe8108e 100644 --- a/Documentation/networking/device_drivers/intel/igbvf.rst +++ b/Documentation/networking/device_drivers/intel/igbvf.rst @@ -1,5 +1,6 @@ .. SPDX-License-Identifier: GPL-2.0+ +============================================================ Linux* Base Virtual Function Driver for Intel(R) 1G Ethernet ============================================================ diff --git a/Documentation/networking/device_drivers/intel/ixgb.rst b/Documentation/networking/device_drivers/intel/ixgb.rst index 8bd80e27843d..945018207a92 100644 --- a/Documentation/networking/device_drivers/intel/ixgb.rst +++ b/Documentation/networking/device_drivers/intel/ixgb.rst @@ -1,5 +1,6 @@ .. SPDX-License-Identifier: GPL-2.0+ +===================================================================== Linux Base Driver for 10 Gigabit Intel(R) Ethernet Network Connection ===================================================================== diff --git a/Documentation/networking/device_drivers/intel/ixgbe.rst b/Documentation/networking/device_drivers/intel/ixgbe.rst index 86d887a63606..c7d25483fedb 100644 --- a/Documentation/networking/device_drivers/intel/ixgbe.rst +++ b/Documentation/networking/device_drivers/intel/ixgbe.rst @@ -1,5 +1,6 @@ .. 
SPDX-License-Identifier: GPL-2.0+ +============================================================================= Linux* Base Driver for the Intel(R) Ethernet 10 Gigabit PCI Express Adapters ============================================================================= diff --git a/Documentation/networking/device_drivers/intel/ixgbevf.rst b/Documentation/networking/device_drivers/intel/ixgbevf.rst index 56cde6366c2f..5d4977360157 100644 --- a/Documentation/networking/device_drivers/intel/ixgbevf.rst +++ b/Documentation/networking/device_drivers/intel/ixgbevf.rst @@ -1,5 +1,6 @@ .. SPDX-License-Identifier: GPL-2.0+ +============================================================= Linux* Base Virtual Function Driver for Intel(R) 10G Ethernet ============================================================= -- cgit From db2ab7a08f06aa4dd784b87bafeb3169ee7f0d95 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Thu, 7 Feb 2019 11:36:42 +0200 Subject: devlink: Add Documentation/networking/devlink-health.txt This patch adds a new file to add information about devlink health mechanism. Signed-off-by: Aya Levin Signed-off-by: Eran Ben Elisha Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink-health.txt | 86 +++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 Documentation/networking/devlink-health.txt (limited to 'Documentation') diff --git a/Documentation/networking/devlink-health.txt b/Documentation/networking/devlink-health.txt new file mode 100644 index 000000000000..1db3fbea0831 --- /dev/null +++ b/Documentation/networking/devlink-health.txt @@ -0,0 +1,86 @@ +The health mechanism is targeted for Real Time Alerting, in order to know when +something bad had happened to a PCI device +- Provide alert debug information +- Self healing +- If problem needs vendor support, provide a way to gather all needed debugging + information. 
+ +The main idea is to unify and centralize driver health reports in the +generic devlink instance and allow the user to set different +attributes of the health reporting and recovery procedures. + +The devlink health reporter: +Device driver creates a "health reporter" per each error/health type. +Error/Health type can be a known/generic (e.g. pci error, fw error, rx/tx error) +or unknown (driver specific). +For each registered health reporter a driver can issue error/health reports +asynchronously. All health reports handling is done by devlink. +Device driver can provide specific callbacks for each "health reporter", e.g. + - Recovery procedures + - Diagnostics and object dump procedures + - OOB initial parameters +Different parts of the driver can register different types of health reporters +with different handlers. + +Once an error is reported, devlink health will do the following actions: + * A log is being sent to the kernel trace events buffer + * Health status and statistics are being updated for the reporter instance + * Object dump is being taken and saved at the reporter instance (as long as + there is no other dump which is already stored) + * Auto recovery attempt is being done. Depends on: + - Auto-recovery configuration + - Grace period vs. time passed since last recover + +The user interface: +User can access/change each reporter's parameters and driver specific callbacks +via devlink, e.g. per error type (per health reporter) + - Configure reporter's generic parameters (like: disable/enable auto recovery) + - Invoke recovery procedure + - Run diagnostics + - Object dump + +The devlink health interface (via netlink): +DEVLINK_CMD_HEALTH_REPORTER_GET + Retrieves status and configuration info per DEV and reporter. +DEVLINK_CMD_HEALTH_REPORTER_SET + Allows reporter-related configuration setting. +DEVLINK_CMD_HEALTH_REPORTER_RECOVER + Triggers a reporter's recovery procedure.
+DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE + Retrieves diagnostics data from a reporter on a device. +DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET + Retrieves the last stored dump. Devlink health + saves a single dump. If an dump is not already stored by the devlink + for this reporter, devlink generates a new dump. + dump output is defined by the reporter. +DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR + Clears the last saved dump file for the specified reporter. + + + netlink + +--------------------------+ + | | + | + | + | | | + +--------------------------+ + |request for ops + |(diagnose, + mlx5_core devlink |recover, + |dump) ++--------+ +--------------------------+ +| | | reporter| | +| | | +---------v----------+ | +| | ops execution | | | | +| <----------------------------------+ | | +| | | | | | +| | | + ^------------------+ | +| | | | request for ops | +| | | | (recover, dump) | +| | | | | +| | | +-+------------------+ | +| | health report | | health handler | | +| +-------------------------------> | | +| | | +--------------------+ | +| | health reporter create | | +| +----------------------------> | ++--------+ +--------------------------+ -- cgit From 120382714c0456037b23b6e0c12f04bf2736e5e4 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 7 Feb 2019 16:19:05 +0000 Subject: dt-bindings: phy: Armada 38x common phy bindings Add the Marvell Armada 38x common phy bindings. Signed-off-by: Russell King Signed-off-by: David S. 
Miller --- .../bindings/phy/phy-armada38x-comphy.txt | 40 ++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 Documentation/devicetree/bindings/phy/phy-armada38x-comphy.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/phy/phy-armada38x-comphy.txt b/Documentation/devicetree/bindings/phy/phy-armada38x-comphy.txt new file mode 100644 index 000000000000..ad49e5c01334 --- /dev/null +++ b/Documentation/devicetree/bindings/phy/phy-armada38x-comphy.txt @@ -0,0 +1,40 @@ +mvebu armada 38x comphy driver +------------------------------ + +This comphy controller can be found on Marvell Armada 38x. It provides a +number of shared PHYs used by various interfaces (network, sata, usb, +PCIe...). + +Required properties: + +- compatible: should be "marvell,armada-380-comphy" +- reg: should contain the comphy register location and length. +- #address-cells: should be 1. +- #size-cells: should be 0. + +A sub-node is required for each comphy lane provided by the comphy. + +Required properties (child nodes): + +- reg: comphy lane number. +- #phy-cells : from the generic phy bindings, must be 1. Defines the + input port to use for a given comphy lane. + +Example: + + comphy: phy@18300 { + compatible = "marvell,armada-380-comphy"; + reg = <0x18300 0x100>; + #address-cells = <1>; + #size-cells = <0>; + + cpm_comphy0: phy@0 { + reg = <0>; + #phy-cells = <1>; + }; + + cpm_comphy1: phy@1 { + reg = <1>; + #phy-cells = <1>; + }; + }; -- cgit From 4ca124f4d96d7c976f2753c874d095c0de83d280 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 7 Feb 2019 16:19:21 +0000 Subject: dt-bindings: net: mvneta: add phys property Add an optional phys property to the mvneta binding documentation for the common phy. Reviewed-by: Rob Herring Signed-off-by: Russell King Signed-off-by: David S. 
Miller --- Documentation/devicetree/bindings/net/marvell-armada-370-neta.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/marvell-armada-370-neta.txt b/Documentation/devicetree/bindings/net/marvell-armada-370-neta.txt index bedcfd5a52cd..691f886cfc4a 100644 --- a/Documentation/devicetree/bindings/net/marvell-armada-370-neta.txt +++ b/Documentation/devicetree/bindings/net/marvell-armada-370-neta.txt @@ -19,7 +19,7 @@ Optional properties: "marvell,armada-370-neta" and 9800B for others. - clock-names: List of names corresponding to clocks property; shall be "core" for core clock and "bus" for the optional bus clock. - +- phys: comphy for the ethernet port, see ../phy/phy-bindings.txt Optional properties (valid only for Armada XP/38x): -- cgit From 98bbf70c1c41fb9547c3a18c0f1b96f6ebb8eb1d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 7 Feb 2019 11:22:55 +0000 Subject: mlxsw: spectrum: add "acl_region_rehash_interval" devlink param Expose new driver-specific "acl_region_rehash_interval" devlink param which would allow user to alter default ACL region rehash interval. Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- Documentation/networking/devlink-params-mlxsw.txt | 8 +++ drivers/net/ethernet/mellanox/mlxsw/core.h | 5 ++ drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 69 +++++++++++++++++++++- drivers/net/ethernet/mellanox/mlxsw/spectrum.h | 2 + drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c | 16 +++++ .../ethernet/mellanox/mlxsw/spectrum_acl_tcam.c | 36 +++++++++++ .../ethernet/mellanox/mlxsw/spectrum_acl_tcam.h | 5 ++ 7 files changed, 139 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/devlink-params-mlxsw.txt b/Documentation/networking/devlink-params-mlxsw.txt index 2c5c67a920c9..c63ea9fc7009 100644 --- a/Documentation/networking/devlink-params-mlxsw.txt +++ b/Documentation/networking/devlink-params-mlxsw.txt @@ -1,2 +1,10 @@ fw_load_policy [DEVICE, GENERIC] Configuration mode: driverinit + +acl_region_rehash_interval [DEVICE, DRIVER-SPECIFIC] + Sets an interval for periodic ACL region rehashes. + The value is in milliseconds, minimal value is "3000". + Value "0" disables the periodic work. + The first rehash will be run right after value is set. 
+ Type: u32 + Configuration mode: runtime diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h index 4e114f35ee0d..c8e16a305969 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.h +++ b/drivers/net/ethernet/mellanox/mlxsw/core.h @@ -394,4 +394,9 @@ static inline void mlxsw_thermal_fini(struct mlxsw_thermal *thermal) #endif +enum mlxsw_devlink_param_id { + MLXSW_DEVLINK_PARAM_ID_BASE = DEVLINK_PARAM_GENERIC_ID_MAX, + MLXSW_DEVLINK_PARAM_ID_ACL_REGION_REHASH_INTERVAL, +}; + #endif diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 8dd808b7f931..7c9745cecbbd 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -4413,6 +4413,71 @@ static void mlxsw_sp_params_unregister(struct mlxsw_core *mlxsw_core) ARRAY_SIZE(mlxsw_sp_devlink_params)); } +static int +mlxsw_sp_params_acl_region_rehash_intrvl_get(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct mlxsw_core *mlxsw_core = devlink_priv(devlink); + struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core); + + ctx->val.vu32 = mlxsw_sp_acl_region_rehash_intrvl_get(mlxsw_sp); + return 0; +} + +static int +mlxsw_sp_params_acl_region_rehash_intrvl_set(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct mlxsw_core *mlxsw_core = devlink_priv(devlink); + struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core); + + return mlxsw_sp_acl_region_rehash_intrvl_set(mlxsw_sp, ctx->val.vu32); +} + +static const struct devlink_param mlxsw_sp2_devlink_params[] = { + DEVLINK_PARAM_DRIVER(MLXSW_DEVLINK_PARAM_ID_ACL_REGION_REHASH_INTERVAL, + "acl_region_rehash_interval", + DEVLINK_PARAM_TYPE_U32, + BIT(DEVLINK_PARAM_CMODE_RUNTIME), + mlxsw_sp_params_acl_region_rehash_intrvl_get, + mlxsw_sp_params_acl_region_rehash_intrvl_set, + NULL), +}; + +static int mlxsw_sp2_params_register(struct 
mlxsw_core *mlxsw_core) +{ + struct devlink *devlink = priv_to_devlink(mlxsw_core); + union devlink_param_value value; + int err; + + err = mlxsw_sp_params_register(mlxsw_core); + if (err) + return err; + + err = devlink_params_register(devlink, mlxsw_sp2_devlink_params, + ARRAY_SIZE(mlxsw_sp2_devlink_params)); + if (err) + goto err_devlink_params_register; + + value.vu32 = 0; + devlink_param_driverinit_value_set(devlink, + MLXSW_DEVLINK_PARAM_ID_ACL_REGION_REHASH_INTERVAL, + value); + return 0; + +err_devlink_params_register: + mlxsw_sp_params_unregister(mlxsw_core); + return err; +} + +static void mlxsw_sp2_params_unregister(struct mlxsw_core *mlxsw_core) +{ + devlink_params_unregister(priv_to_devlink(mlxsw_core), + mlxsw_sp2_devlink_params, + ARRAY_SIZE(mlxsw_sp2_devlink_params)); + mlxsw_sp_params_unregister(mlxsw_core); +} + static struct mlxsw_driver mlxsw_sp1_driver = { .kind = mlxsw_sp1_driver_name, .priv_size = sizeof(struct mlxsw_sp), @@ -4461,8 +4526,8 @@ static struct mlxsw_driver mlxsw_sp2_driver = { .sb_occ_tc_port_bind_get = mlxsw_sp_sb_occ_tc_port_bind_get, .txhdr_construct = mlxsw_sp_txhdr_construct, .resources_register = mlxsw_sp2_resources_register, - .params_register = mlxsw_sp_params_register, - .params_unregister = mlxsw_sp_params_unregister, + .params_register = mlxsw_sp2_params_register, + .params_unregister = mlxsw_sp2_params_unregister, .txhdr_len = MLXSW_TXHDR_LEN, .profile = &mlxsw_sp2_config_profile, .res_query_enabled = true, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index 3d17b4a368f4..ceebc91f4f1d 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -690,6 +690,8 @@ struct mlxsw_sp_fid *mlxsw_sp_acl_dummy_fid(struct mlxsw_sp *mlxsw_sp); int mlxsw_sp_acl_init(struct mlxsw_sp *mlxsw_sp); void mlxsw_sp_acl_fini(struct mlxsw_sp *mlxsw_sp); +u32 mlxsw_sp_acl_region_rehash_intrvl_get(struct mlxsw_sp *mlxsw_sp); 
+int mlxsw_sp_acl_region_rehash_intrvl_set(struct mlxsw_sp *mlxsw_sp, u32 val); /* spectrum_acl_tcam.c */ struct mlxsw_sp_acl_tcam; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c index 38e027815393..a146a44634e9 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c @@ -912,3 +912,19 @@ void mlxsw_sp_acl_fini(struct mlxsw_sp *mlxsw_sp) mlxsw_afk_destroy(acl->afk); kfree(acl); } + +u32 mlxsw_sp_acl_region_rehash_intrvl_get(struct mlxsw_sp *mlxsw_sp) +{ + struct mlxsw_sp_acl *acl = mlxsw_sp->acl; + + return mlxsw_sp_acl_tcam_vregion_rehash_intrvl_get(mlxsw_sp, + &acl->tcam); +} + +int mlxsw_sp_acl_region_rehash_intrvl_set(struct mlxsw_sp *mlxsw_sp, u32 val) +{ + struct mlxsw_sp_acl *acl = mlxsw_sp->acl; + + return mlxsw_sp_acl_tcam_vregion_rehash_intrvl_set(mlxsw_sp, + &acl->tcam, val); +} diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c index 9239ff4e94c4..f2cb37c0d300 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c @@ -24,6 +24,7 @@ size_t mlxsw_sp_acl_tcam_priv_size(struct mlxsw_sp *mlxsw_sp) } #define MLXSW_SP_ACL_TCAM_VREGION_REHASH_INTRVL_DFLT 5000 /* ms */ +#define MLXSW_SP_ACL_TCAM_VREGION_REHASH_INTRVL_MIN 3000 /* ms */ int mlxsw_sp_acl_tcam_init(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_acl_tcam *tcam) @@ -725,6 +726,41 @@ mlxsw_sp_acl_tcam_vregion_destroy(struct mlxsw_sp *mlxsw_sp, kfree(vregion); } +u32 mlxsw_sp_acl_tcam_vregion_rehash_intrvl_get(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_tcam *tcam) +{ + const struct mlxsw_sp_acl_tcam_ops *ops = mlxsw_sp->acl_tcam_ops; + u32 vregion_rehash_intrvl; + + if (WARN_ON(!ops->region_rehash_hints_get)) + return 0; + vregion_rehash_intrvl = tcam->vregion_rehash_intrvl; + return vregion_rehash_intrvl; +} + +int 
mlxsw_sp_acl_tcam_vregion_rehash_intrvl_set(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_tcam *tcam, + u32 val) +{ + const struct mlxsw_sp_acl_tcam_ops *ops = mlxsw_sp->acl_tcam_ops; + struct mlxsw_sp_acl_tcam_vregion *vregion; + + if (val < MLXSW_SP_ACL_TCAM_VREGION_REHASH_INTRVL_MIN && val) + return -EINVAL; + if (WARN_ON(!ops->region_rehash_hints_get)) + return -EOPNOTSUPP; + tcam->vregion_rehash_intrvl = val; + rtnl_lock(); + list_for_each_entry(vregion, &tcam->vregion_list, tlist) { + if (val) + mlxsw_core_schedule_dw(&vregion->rehash_dw, 0); + else + cancel_delayed_work_sync(&vregion->rehash_dw); + } + rtnl_unlock(); + return 0; +} + static int mlxsw_sp_acl_tcam_vchunk_assoc(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_acl_tcam_group *group, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.h index 440a3483ed7b..96bd42a9fbc3 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.h @@ -28,6 +28,11 @@ int mlxsw_sp_acl_tcam_init(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_acl_tcam *tcam); void mlxsw_sp_acl_tcam_fini(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_acl_tcam *tcam); +u32 mlxsw_sp_acl_tcam_vregion_rehash_intrvl_get(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_tcam *tcam); +int mlxsw_sp_acl_tcam_vregion_rehash_intrvl_set(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_acl_tcam *tcam, + u32 val); int mlxsw_sp_acl_tcam_priority_get(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_acl_rule_info *rulei, u32 *priority, bool fillup_priority); -- cgit From 83ef97d1d35c36bec37af6dea51858809c2af527 Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Wed, 6 Feb 2019 18:56:08 +0100 Subject: net/macb: bindings doc/trivial: fix documentation for sama5d3 10/100 interface This removes a line left while adding the correct compatibility string for sama5d3 10/100 interface. Now use the "atmel,sama5d3-macb" string. 
Signed-off-by: Nicolas Ferre Reviewed-by: Rob Herring Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/macb.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/macb.txt b/Documentation/devicetree/bindings/net/macb.txt index 3e17ac1d5d58..f5c414b10e27 100644 --- a/Documentation/devicetree/bindings/net/macb.txt +++ b/Documentation/devicetree/bindings/net/macb.txt @@ -3,8 +3,7 @@ Required properties: - compatible: Should be "cdns,[<chip>-]{macb|gem}" Use "cdns,at91rm9200-emac" Atmel at91rm9200 SoC. - Use "cdns,at91sam9260-macb" for Atmel at91sam9 SoCs or the 10/100Mbit IP - available on sama5d3 SoCs. + Use "cdns,at91sam9260-macb" for Atmel at91sam9 SoCs. Use "cdns,np4-macb" for NP4 SoC devices. Use "cdns,at32ap7000-macb" for other 10/100 usage or use the generic form: "cdns,macb". Use "cdns,pc302-gem" for Picochip picoXcell pc302 and later devices based on -- cgit From 4973a1276ca89c3f1a1fd32b4c955144e04afd5d Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Wed, 6 Feb 2019 18:56:09 +0100 Subject: net/macb: bindings doc: add sam9x60 binding Add the compatibility string documentation for sam9x60 10/100 interface. Signed-off-by: Nicolas Ferre Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/macb.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/macb.txt b/Documentation/devicetree/bindings/net/macb.txt index f5c414b10e27..174f292d8a3e 100644 --- a/Documentation/devicetree/bindings/net/macb.txt +++ b/Documentation/devicetree/bindings/net/macb.txt @@ -4,6 +4,7 @@ Required properties: - compatible: Should be "cdns,[<chip>-]{macb|gem}" Use "cdns,at91rm9200-emac" Atmel at91rm9200 SoC. Use "cdns,at91sam9260-macb" for Atmel at91sam9 SoCs. + Use "cdns,sam9x60-macb" for Microchip sam9x60 SoC. Use "cdns,np4-macb" for NP4 SoC devices.
Use "cdns,at32ap7000-macb" for other 10/100 usage or use the generic form: "cdns,macb". Use "cdns,pc302-gem" for Picochip picoXcell pc302 and later devices based on -- cgit From 132c4e9e6ac5d97185b03529a062bdf9d14dcb9d Mon Sep 17 00:00:00 2001 From: yupeng Date: Sat, 9 Feb 2019 14:46:18 -0800 Subject: add snmp counter document add document for tcp retransmission, tcp fast open, syn cookies, challenge ack, prune and several general counters Signed-off-by: yupeng Signed-off-by: David S. Miller --- Documentation/networking/snmp_counter.rst | 184 +++++++++++++++++++++++++++++- 1 file changed, 181 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/snmp_counter.rst b/Documentation/networking/snmp_counter.rst index c5642f430d2e..52b026be028f 100644 --- a/Documentation/networking/snmp_counter.rst +++ b/Documentation/networking/snmp_counter.rst @@ -367,16 +367,19 @@ to the accept queue. TCP Fast Open ============= * TcpEstabResets + Defined in `RFC1213 tcpEstabResets`_. .. _RFC1213 tcpEstabResets: https://tools.ietf.org/html/rfc1213#page-48 * TcpAttemptFails + Defined in `RFC1213 tcpAttemptFails`_. .. _RFC1213 tcpAttemptFails: https://tools.ietf.org/html/rfc1213#page-48 * TcpOutRsts + Defined in `RFC1213 tcpOutRsts`_. The RFC says this counter indicates the 'segments sent containing the RST flag', but in linux kernel, this couner indicates the segments kerenl tried to send. The sending @@ -384,6 +387,30 @@ process might be failed due to some errors (e.g. memory alloc failed). .. _RFC1213 tcpOutRsts: https://tools.ietf.org/html/rfc1213#page-52 +* TcpExtTCPSpuriousRtxHostQueues + +When the TCP stack wants to retransmit a packet, and finds that packet +is not lost in the network, but the packet is not sent yet, the TCP +stack would give up the retransmission and update this counter. It +might happen if a packet stays too long time in a qdisc or driver +queue. 
+ +* TcpEstabResets + +The socket receives a RST packet in Establish or CloseWait state. + +* TcpExtTCPKeepAlive + +This counter indicates how many keepalive packets were sent. The keepalive +won't be enabled by default. A userspace program could enable it by +setting the SO_KEEPALIVE socket option. + +* TcpExtTCPSpuriousRTOs + +The spurious retransmission timeout detected by the `F-RTO`_ +algorithm. + +.. _F-RTO: https://tools.ietf.org/html/rfc5682 TCP Fast Path ============ @@ -609,6 +636,29 @@ packet yet, the sender would know packet 4 is out of order. The TCP stack of kernel will increase TcpExtTCPSACKReorder for both of the above scenarios. +* TcpExtTCPSlowStartRetrans + +The TCP stack wants to retransmit a packet and the congestion control +state is 'Loss'. + +* TcpExtTCPFastRetrans + +The TCP stack wants to retransmit a packet and the congestion control +state is not 'Loss'. + +* TcpExtTCPLostRetransmit + +A SACK points out that a retransmission packet is lost again. + +* TcpExtTCPRetransFail + +The TCP stack tries to deliver a retransmission packet to lower layers +but the lower layers return an error. + +* TcpExtTCPSynRetrans + +The TCP stack retransmits a SYN packet. + DSACK ===== The DSACK is defined in `RFC2883`_. The receiver uses DSACK to report @@ -790,8 +840,9 @@ unacknowledged number (more strict than `RFC 5961 section 5.2`_). .. _RFC 5961 section 5.2: https://tools.ietf.org/html/rfc5961#page-11 TCP receive window -================= +================== * TcpExtTCPWantZeroWindowAdv + Depending on current memory usage, the TCP stack tries to set receive window to zero. But the receive window might still be a no-zero value. For example, if the previous window size is 10, and the TCP @@ -799,14 +850,16 @@ stack receives 3 bytes, the current window size would be 7 even if the window size calculated by the memory usage is zero. * TcpExtTCPToZeroWindowAdv + The TCP receive window is set to zero from a no-zero value.
* TcpExtTCPFromZeroWindowAdv + The TCP receive window is set to no-zero value from zero. Delayed ACK -========== +=========== The TCP Delayed ACK is a technique which is used for reducing the packet count in the network. For more details, please refer the `Delayed ACK wiki`_ @@ -814,10 +867,12 @@ packet count in the network. For more details, please refer the .. _Delayed ACK wiki: https://en.wikipedia.org/wiki/TCP_delayed_acknowledgment * TcpExtDelayedACKs + A delayed ACK timer expires. The TCP stack will send a pure ACK packet and exit the delayed ACK mode. * TcpExtDelayedACKLocked + A delayed ACK timer expires, but the TCP stack can't send an ACK immediately due to the socket is locked by a userspace program. The TCP stack will send a pure ACK later (after the userspace program @@ -826,24 +881,147 @@ TCP stack will also update TcpExtDelayedACKs and exit the delayed ACK mode. * TcpExtDelayedACKLost + It will be updated when the TCP stack receives a packet which has been ACKed. A Delayed ACK loss might cause this issue, but it would also be triggered by other reasons, such as a packet is duplicated in the network. Tail Loss Probe (TLP) -=================== +===================== TLP is an algorithm which is used to detect TCP packet loss. For more details, please refer the `TLP paper`_. .. _TLP paper: https://tools.ietf.org/html/draft-dukkipati-tcpm-tcp-loss-probe-01 * TcpExtTCPLossProbes + A TLP probe packet is sent. * TcpExtTCPLossProbeRecovery + A packet loss is detected and recovered by TLP. +TCP Fast Open +============= +TCP Fast Open is a technology which allows data transfer before the +3-way handshake complete. Please refer the `TCP Fast Open wiki`_ for a +general description. + +.. 
_TCP Fast Open wiki: https://en.wikipedia.org/wiki/TCP_Fast_Open + +* TcpExtTCPFastOpenActive + +When the TCP stack receives an ACK packet in the SYN-SENT status, and +the ACK packet acknowledges the data in the SYN packet, the TCP stack +understand the TFO cookie is accepted by the other side, then it +updates this counter. + +* TcpExtTCPFastOpenActiveFail + +This counter indicates that the TCP stack initiated a TCP Fast Open, +but it failed. This counter would be updated in three scenarios: (1) +the other side doesn't acknowledge the data in the SYN packet. (2) The +SYN packet which has the TFO cookie is timeout at least once. (3) +after the 3-way handshake, the retransmission timeout happens +net.ipv4.tcp_retries1 times, because some middle-boxes may black-hole +fast open after the handshake. + +* TcpExtTCPFastOpenPassive + +This counter indicates how many times the TCP stack accepts the fast +open request. + +* TcpExtTCPFastOpenPassiveFail + +This counter indicates how many times the TCP stack rejects the fast +open request. It is caused by either the TFO cookie is invalid or the +TCP stack finds an error during the socket creating process. + +* TcpExtTCPFastOpenListenOverflow + +When the pending fast open request number is larger than +fastopenq->max_qlen, the TCP stack will reject the fast open request +and update this counter. When this counter is updated, the TCP stack +won't update TcpExtTCPFastOpenPassive or +TcpExtTCPFastOpenPassiveFail. The fastopenq->max_qlen is set by the +TCP_FASTOPEN socket operation and it could not be larger than +net.core.somaxconn. For example: + +setsockopt(sfd, SOL_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)); + +* TcpExtTCPFastOpenCookieReqd + +This counter indicates how many times a client wants to request a TFO +cookie. + +SYN cookies +=========== +SYN cookies are used to mitigate SYN flood, for details, please refer +the `SYN cookies wiki`_. + +.. 
_SYN cookies wiki: https://en.wikipedia.org/wiki/SYN_cookies + +* TcpExtSyncookiesSent + +It indicates how many SYN cookies are sent. + +* TcpExtSyncookiesRecv + +How many reply packets of the SYN cookies the TCP stack receives. + +* TcpExtSyncookiesFailed + +The MSS decoded from the SYN cookie is invalid. When this counter is +updated, the received packet won't be treated as a SYN cookie and the +TcpExtSyncookiesRecv counter won't be updated. + +Challenge ACK +============= +For details of challenge ACK, please refer to the explanation of +TcpExtTCPACKSkippedChallenge. + +* TcpExtTCPChallengeACK + +The number of challenge acks sent. + +* TcpExtTCPSYNChallenge + +The number of challenge acks sent in response to SYN packets. After +updating this counter, the TCP stack might send a challenge ACK and +update the TcpExtTCPChallengeACK counter, or it might also skip to +send the challenge and update the TcpExtTCPACKSkippedChallenge. + +prune +===== +When a socket is under memory pressure, the TCP stack will try to +reclaim memory from the receiving queue and out of order queue. One of +the reclaiming methods is 'collapse', which means allocate a big skb, +copy the contiguous skbs to the single big skb, and free these +contiguous skbs. + +* TcpExtPruneCalled + +The TCP stack tries to reclaim memory for a socket. After updating this +counter, the TCP stack will try to collapse the out of order queue and +the receiving queue. If the memory is still not enough, the TCP stack +will try to discard packets from the out of order queue (and update the +TcpExtOfoPruned counter) + +* TcpExtOfoPruned + +The TCP stack tries to discard packet on the out of order queue. + +* TcpExtRcvPruned + +After 'collapse' and discard packets from the out of order queue, if +the actually used memory is still larger than the max allowed memory, +this counter will be updated. It means the 'prune' fails. + +* TcpExtTCPRcvCollapsed + +This counter indicates how many skbs are freed during 'collapse'.
+ examples ======== -- cgit From 14fd1901e718138b22ae7cbd8995bfdeb4df578f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 10 Feb 2019 19:35:29 -0800 Subject: devlink: add a generic board.manufacture version name At Jiri's suggestion add a generic "board.manufacture" version identifier. Signed-off-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink-info-versions.rst | 5 +++++ include/net/devlink.h | 2 ++ 2 files changed, 7 insertions(+) (limited to 'Documentation') diff --git a/Documentation/networking/devlink-info-versions.rst b/Documentation/networking/devlink-info-versions.rst index 7d4ecf6b6f34..c79ad8593383 100644 --- a/Documentation/networking/devlink-info-versions.rst +++ b/Documentation/networking/devlink-info-versions.rst @@ -14,6 +14,11 @@ board.rev Board design revision. +board.manufacture +================= + +An identifier of the company or the facility which produced the part. + fw.mgmt ======= diff --git a/include/net/devlink.h b/include/net/devlink.h index 2b384a38911b..07660fe4c0e3 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -435,6 +435,8 @@ enum devlink_param_wol_types { #define DEVLINK_INFO_VERSION_GENERIC_BOARD_ID "board.id" /* Revision of board design */ #define DEVLINK_INFO_VERSION_GENERIC_BOARD_REV "board.rev" +/* Maker of the board */ +#define DEVLINK_INFO_VERSION_GENERIC_BOARD_MANUFACTURE "board.manufacture" /* Control processor FW version */ #define DEVLINK_INFO_VERSION_GENERIC_FW_MGMT "fw.mgmt" -- cgit From bd37fdf5243ccf324e3cc6fe92bf88d9a2fc1b10 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 10 Feb 2019 22:32:42 -0800 Subject: Documentation: fix some freescale dpio-driver.rst warnings Fix markup warnings for one list by using correct list syntax. Fix markup warnings for another list by using blank lines before the list. Documentation/networking/device_drivers/freescale/dpaa2/dpio-driver.rst:30: WARNING: Unexpected indentation. 
Documentation/networking/device_drivers/freescale/dpaa2/dpio-driver.rst:143: WARNING: Unexpected indentation. Signed-off-by: Randy Dunlap Cc: Stuart Yoder Cc: Laurentiu Tudor Cc: Ioana Radulescu Cc: netdev@vger.kernel.org Cc: Madalin Bucur Signed-off-by: David S. Miller --- .../device_drivers/freescale/dpaa2/dpio-driver.rst | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/device_drivers/freescale/dpaa2/dpio-driver.rst b/Documentation/networking/device_drivers/freescale/dpaa2/dpio-driver.rst index a188466b6698..5045df990a4c 100644 --- a/Documentation/networking/device_drivers/freescale/dpaa2/dpio-driver.rst +++ b/Documentation/networking/device_drivers/freescale/dpaa2/dpio-driver.rst @@ -27,11 +27,12 @@ Driver Overview The DPIO driver is bound to DPIO objects discovered on the fsl-mc bus and provides services that: - A) allow other drivers, such as the Ethernet driver, to enqueue and dequeue + + A. allow other drivers, such as the Ethernet driver, to enqueue and dequeue frames for their respective objects - B) allow drivers to register callbacks for data availability notifications + B. allow drivers to register callbacks for data availability notifications when data becomes available on a queue or channel - C) allow drivers to manage hardware buffer pools + C. 
allow drivers to manage hardware buffer pools The Linux DPIO driver consists of 3 primary components-- DPIO object driver-- fsl-mc driver that manages the DPIO object @@ -140,11 +141,10 @@ QBman portal interface (qbman-portal.c) The qbman-portal component provides APIs to do the low level hardware bit twiddling for operations such as: - -initializing Qman software portals - - -building and sending portal commands - -portal interrupt configuration and processing + - initializing Qman software portals + - building and sending portal commands + - portal interrupt configuration and processing The qbman-portal APIs are not public to other drivers, and are only used by dpio-service. -- cgit From 2843bf518579e9fa357ba58708c7cff96946d084 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Tue, 12 Feb 2019 12:24:00 +0800 Subject: dt-binding: ptp_qoriq: add little-endian support Specify "little-endian" property if the 1588 timer IP block is little-endian mode. The default endian mode is big-endian. Signed-off-by: Yangbo Lu Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/ptp/ptp-qoriq.txt | 3 +++ 1 file changed, 3 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt b/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt index 8e7f8551d190..454c937076a2 100644 --- a/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt +++ b/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt @@ -19,6 +19,9 @@ Clock Properties: - fsl,max-adj Maximum frequency adjustment in parts per billion. - fsl,extts-fifo The presence of this property indicates hardware support for the external trigger stamp FIFO. + - little-endian The presence of this property indicates the 1588 timer + IP block is little-endian mode. The default endian mode + is big-endian. These properties set the operational parameters for the PTP clock. You must choose these carefully for the clock to work right. 
-- cgit From cc0c207a5d18333fbfecc964a47ddb182fbcb720 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 20 Feb 2019 16:58:25 -0800 Subject: net: Remove SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS_SUPPORT Now that we have converted the bridge code and the drivers to check for bridge port(s) flags at the time we try to set them, there is no need for a get() -> set() sequence anymore and SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS_SUPPORT therefore becomes unused. Reviewed-by: Ido Schimmel Signed-off-by: Florian Fainelli Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/switchdev.txt | 6 ++---- drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 11 +---------- drivers/net/ethernet/rocker/rocker_main.c | 14 +------------- drivers/staging/fsl-dpaa2/ethsw/ethsw.c | 10 +--------- include/net/switchdev.h | 2 -- net/dsa/slave.c | 16 +--------------- 6 files changed, 6 insertions(+), 53 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/switchdev.txt b/Documentation/networking/switchdev.txt index f3244d87512a..79c8b0f16aee 100644 --- a/Documentation/networking/switchdev.txt +++ b/Documentation/networking/switchdev.txt @@ -232,10 +232,8 @@ Learning_sync attribute enables syncing of the learned/forgotten FDB entry to the bridge's FDB. It's possible, but not optimal, to enable learning on the device port and on the bridge port, and disable learning_sync. -To support learning and learning_sync port attributes, the driver implements -switchdev op switchdev_port_attr_get/set for -SWITCHDEV_ATTR_PORT_ID_BRIDGE_FLAGS. The driver should initialize the attributes -to the hardware defaults. +To support learning, the driver implements switchdev op +switchdev_port_attr_get/set for SWITCHDEV_ATTR_PORT_ID_BRIDGE_FLAGS. 
FDB Ageing ^^^^^^^^^^ diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 9a8798f74d2b..bbb5a406232e 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -434,16 +434,7 @@ static void mlxsw_sp_bridge_vlan_put(struct mlxsw_sp_bridge_vlan *bridge_vlan) static int mlxsw_sp_port_attr_get(struct net_device *dev, struct switchdev_attr *attr) { - switch (attr->id) { - case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS_SUPPORT: - attr->u.brport_flags_support = BR_LEARNING | BR_FLOOD | - BR_MCAST_FLOOD; - break; - default: - return -EOPNOTSUPP; - } - - return 0; + return -EOPNOTSUPP; } static int diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c index 25129f7b5583..6b8273e2057d 100644 --- a/drivers/net/ethernet/rocker/rocker_main.c +++ b/drivers/net/ethernet/rocker/rocker_main.c @@ -2069,19 +2069,7 @@ static const struct net_device_ops rocker_port_netdev_ops = { static int rocker_port_attr_get(struct net_device *dev, struct switchdev_attr *attr) { - const struct rocker_port *rocker_port = netdev_priv(dev); - int err = 0; - - switch (attr->id) { - case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS_SUPPORT: - err = rocker_world_port_attr_bridge_flags_support_get(rocker_port, - &attr->u.brport_flags_support); - break; - default: - return -EOPNOTSUPP; - } - - return err; + return -EOPNOTSUPP; } static int rocker_port_attr_set(struct net_device *dev, diff --git a/drivers/staging/fsl-dpaa2/ethsw/ethsw.c b/drivers/staging/fsl-dpaa2/ethsw/ethsw.c index 331625137717..de4dcabbc29a 100644 --- a/drivers/staging/fsl-dpaa2/ethsw/ethsw.c +++ b/drivers/staging/fsl-dpaa2/ethsw/ethsw.c @@ -643,15 +643,7 @@ static void ethsw_teardown_irqs(struct fsl_mc_device *sw_dev) static int swdev_port_attr_get(struct net_device *netdev, struct switchdev_attr *attr) { - switch (attr->id) { - case 
SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS_SUPPORT: - attr->u.brport_flags_support = BR_LEARNING | BR_FLOOD; - break; - default: - return -EOPNOTSUPP; - } - - return 0; + return -EOPNOTSUPP; } static int port_attr_stp_state_set(struct net_device *netdev, diff --git a/include/net/switchdev.h b/include/net/switchdev.h index de72b0a3867f..0f352019ef99 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -45,7 +45,6 @@ enum switchdev_attr_id { SWITCHDEV_ATTR_ID_UNDEFINED, SWITCHDEV_ATTR_ID_PORT_STP_STATE, SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS, - SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS_SUPPORT, SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS, SWITCHDEV_ATTR_ID_PORT_MROUTER, SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME, @@ -63,7 +62,6 @@ struct switchdev_attr { union { u8 stp_state; /* PORT_STP_STATE */ unsigned long brport_flags; /* PORT_{PRE}_BRIDGE_FLAGS */ - unsigned long brport_flags_support; /* PORT_BRIDGE_FLAGS_SUPPORT */ bool mrouter; /* PORT_MROUTER */ clock_t ageing_time; /* BRIDGE_AGEING_TIME */ bool vlan_filtering; /* BRIDGE_VLAN_FILTERING */ diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 44cc4e50dd5a..db0a2651070f 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -388,21 +388,7 @@ static int dsa_slave_get_port_parent_id(struct net_device *dev, static int dsa_slave_port_attr_get(struct net_device *dev, struct switchdev_attr *attr) { - struct dsa_port *dp = dsa_slave_to_port(dev); - struct dsa_switch *ds = dp->ds; - - switch (attr->id) { - case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS_SUPPORT: - attr->u.brport_flags_support = 0; - if (ds->ops->port_egress_floods) - attr->u.brport_flags_support |= BR_FLOOD | - BR_MCAST_FLOOD; - break; - default: - return -EOPNOTSUPP; - } - - return 0; + return -EOPNOTSUPP; } static inline netdev_tx_t dsa_slave_netpoll_send_skb(struct net_device *dev, -- cgit From 010c8f01aa7fe18ea97d302e1c7e9ca83bc27433 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 20 Feb 2019 16:58:26 -0800 Subject: net: Get rid of 
switchdev_port_attr_get() With the bridge no longer calling switchdev_port_attr_get() to obtain the supported bridge port flags from a driver but instead trying to set the bridge port flags directly and relying on driver to reject unsupported configurations, we can effectively get rid of switchdev_port_attr_get() entirely since this was the only place where it was called. Signed-off-by: Florian Fainelli Reviewed-by: Ido Schimmel Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/switchdev.txt | 2 +- drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 7 ------- drivers/net/ethernet/rocker/rocker_main.c | 7 ------- drivers/staging/fsl-dpaa2/ethsw/ethsw.c | 7 ------- include/net/switchdev.h | 8 -------- net/dsa/slave.c | 7 ------- 6 files changed, 1 insertion(+), 37 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/switchdev.txt b/Documentation/networking/switchdev.txt index 79c8b0f16aee..413abbae952f 100644 --- a/Documentation/networking/switchdev.txt +++ b/Documentation/networking/switchdev.txt @@ -233,7 +233,7 @@ the bridge's FDB. It's possible, but not optimal, to enable learning on the device port and on the bridge port, and disable learning_sync. To support learning, the driver implements switchdev op -switchdev_port_attr_get/set for SWITCHDEV_ATTR_PORT_ID_BRIDGE_FLAGS. +switchdev_port_attr_set for SWITCHDEV_ATTR_PORT_ID_{PRE}_BRIDGE_FLAGS. 
FDB Ageing ^^^^^^^^^^ diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index bbb5a406232e..766f5b5f1cf5 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -431,12 +431,6 @@ static void mlxsw_sp_bridge_vlan_put(struct mlxsw_sp_bridge_vlan *bridge_vlan) mlxsw_sp_bridge_vlan_destroy(bridge_vlan); } -static int mlxsw_sp_port_attr_get(struct net_device *dev, - struct switchdev_attr *attr) -{ - return -EOPNOTSUPP; -} - static int mlxsw_sp_port_bridge_vlan_stp_set(struct mlxsw_sp_port *mlxsw_sp_port, struct mlxsw_sp_bridge_vlan *bridge_vlan, @@ -1945,7 +1939,6 @@ static struct mlxsw_sp_port *mlxsw_sp_lag_rep_port(struct mlxsw_sp *mlxsw_sp, } static const struct switchdev_ops mlxsw_sp_port_switchdev_ops = { - .switchdev_port_attr_get = mlxsw_sp_port_attr_get, .switchdev_port_attr_set = mlxsw_sp_port_attr_set, }; diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c index 6b8273e2057d..8200fbf91306 100644 --- a/drivers/net/ethernet/rocker/rocker_main.c +++ b/drivers/net/ethernet/rocker/rocker_main.c @@ -2066,12 +2066,6 @@ static const struct net_device_ops rocker_port_netdev_ops = { * swdev interface ********************/ -static int rocker_port_attr_get(struct net_device *dev, - struct switchdev_attr *attr) -{ - return -EOPNOTSUPP; -} - static int rocker_port_attr_set(struct net_device *dev, const struct switchdev_attr *attr, struct switchdev_trans *trans) @@ -2148,7 +2142,6 @@ static int rocker_port_obj_del(struct net_device *dev, } static const struct switchdev_ops rocker_port_switchdev_ops = { - .switchdev_port_attr_get = rocker_port_attr_get, .switchdev_port_attr_set = rocker_port_attr_set, }; diff --git a/drivers/staging/fsl-dpaa2/ethsw/ethsw.c b/drivers/staging/fsl-dpaa2/ethsw/ethsw.c index de4dcabbc29a..018399ee8731 100644 --- 
a/drivers/staging/fsl-dpaa2/ethsw/ethsw.c +++ b/drivers/staging/fsl-dpaa2/ethsw/ethsw.c @@ -640,12 +640,6 @@ static void ethsw_teardown_irqs(struct fsl_mc_device *sw_dev) fsl_mc_free_irqs(sw_dev); } -static int swdev_port_attr_get(struct net_device *netdev, - struct switchdev_attr *attr) -{ - return -EOPNOTSUPP; -} - static int port_attr_stp_state_set(struct net_device *netdev, struct switchdev_trans *trans, u8 state) @@ -932,7 +926,6 @@ static int swdev_port_obj_del(struct net_device *netdev, } static const struct switchdev_ops ethsw_port_switchdev_ops = { - .switchdev_port_attr_get = swdev_port_attr_get, .switchdev_port_attr_set = swdev_port_attr_set, }; diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 0f352019ef99..45310ddf2d7e 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -179,8 +179,6 @@ switchdev_notifier_info_to_extack(const struct switchdev_notifier_info *info) #ifdef CONFIG_NET_SWITCHDEV void switchdev_deferred_process(void); -int switchdev_port_attr_get(struct net_device *dev, - struct switchdev_attr *attr); int switchdev_port_attr_set(struct net_device *dev, const struct switchdev_attr *attr); int switchdev_port_obj_add(struct net_device *dev, @@ -225,12 +223,6 @@ static inline void switchdev_deferred_process(void) { } -static inline int switchdev_port_attr_get(struct net_device *dev, - struct switchdev_attr *attr) -{ - return -EOPNOTSUPP; -} - static inline int switchdev_port_attr_set(struct net_device *dev, const struct switchdev_attr *attr) { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index db0a2651070f..a78b2bba0332 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -385,12 +385,6 @@ static int dsa_slave_get_port_parent_id(struct net_device *dev, return 0; } -static int dsa_slave_port_attr_get(struct net_device *dev, - struct switchdev_attr *attr) -{ - return -EOPNOTSUPP; -} - static inline netdev_tx_t dsa_slave_netpoll_send_skb(struct net_device *dev, struct sk_buff *skb) { @@ -1057,7 +1051,6 @@ 
static const struct net_device_ops dsa_slave_netdev_ops = { }; static const struct switchdev_ops dsa_slave_switchdev_ops = { - .switchdev_port_attr_get = dsa_slave_port_attr_get, .switchdev_port_attr_set = dsa_slave_port_attr_set, }; -- cgit From 0a6c33e894a50e5483ca1f53d053985feabebbf3 Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 22 Feb 2019 11:31:46 +0000 Subject: doc: add phylink documentation to the networking book Add some phylink documentation to the networking book detailing how to convert network drivers from phylib to phylink. Signed-off-by: Russell King Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/sfp-phylink.rst | 268 +++++++++++++++++++++++++++++++ 2 files changed, 269 insertions(+) create mode 100644 Documentation/networking/sfp-phylink.rst (limited to 'Documentation') diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 9a32451cd201..b08cf145d5eb 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -31,6 +31,7 @@ Contents: failover net_failover phy + sfp-phylink alias bridge snmp_counter diff --git a/Documentation/networking/sfp-phylink.rst b/Documentation/networking/sfp-phylink.rst new file mode 100644 index 000000000000..5bd26cb07244 --- /dev/null +++ b/Documentation/networking/sfp-phylink.rst @@ -0,0 +1,268 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======= +phylink +======= + +Overview +======== + +phylink is a mechanism to support hot-pluggable networking modules +without needing to re-initialise the adapter on hot-plug events. + +phylink supports conventional phylib-based setups, fixed link setups +and SFP (Small Formfactor Pluggable) modules at present. + +Modes of operation +================== + +phylink has several modes of operation, which depend on the firmware +settings. + +1. PHY mode + + In PHY mode, we use phylib to read the current link settings from + the PHY, and pass them to the MAC driver. 
We expect the MAC driver + to configure exactly the modes that are specified without any + negotiation being enabled on the link. + +2. Fixed mode + + Fixed mode is the same as PHY mode as far as the MAC driver is + concerned. + +3. In-band mode + + In-band mode is used with 802.3z, SGMII and similar interface modes, + and we are expecting to use and honor the in-band negotiation or + control word sent across the serdes channel. + +By example, what this means is that: + +.. code-block:: none + + &eth { + phy = <&phy>; + phy-mode = "sgmii"; + }; + +does not use in-band SGMII signalling. The PHY is expected to follow +exactly the settings given to it in its :c:func:`mac_config` function. +The link should be forced up or down appropriately in the +:c:func:`mac_link_up` and :c:func:`mac_link_down` functions. + +.. code-block:: none + + &eth { + managed = "in-band-status"; + phy = <&phy>; + phy-mode = "sgmii"; + }; + +uses in-band mode, where results from the PHY's negotiation are passed +to the MAC through the SGMII control word, and the MAC is expected to +acknowledge the control word. The :c:func:`mac_link_up` and +:c:func:`mac_link_down` functions must not force the MAC side link +up and down. + +Rough guide to converting a network driver to sfp/phylink +========================================================= + +This guide briefly describes how to convert a network driver from +phylib to the sfp/phylink support. Please send patches to improve +this documentation. + +1. Optionally split the network driver's phylib update function into + three parts dealing with link-down, link-up and reconfiguring the + MAC settings. This can be done as a separate preparation commit. + + An example of this preparation can be found in git commit fc548b991fb0. + +2. Replace:: + + select FIXED_PHY + select PHYLIB + + with:: + + select PHYLINK + + in the driver's Kconfig stanza. + +3. Add:: + + #include <linux/phylink.h> + + to the driver's list of header files. + +4. 
Add:: + + struct phylink *phylink; + + to the driver's private data structure. We shall refer to the + driver's private data pointer as ``priv`` below, and the driver's + private data structure as ``struct foo_priv``. + +5. Replace the following functions: + + .. flat-table:: + :header-rows: 1 + :widths: 1 1 + :stub-columns: 0 + + * - Original function + - Replacement function + * - phy_start(phydev) + - phylink_start(priv->phylink) + * - phy_stop(phydev) + - phylink_stop(priv->phylink) + * - phy_mii_ioctl(phydev, ifr, cmd) + - phylink_mii_ioctl(priv->phylink, ifr, cmd) + * - phy_ethtool_get_wol(phydev, wol) + - phylink_ethtool_get_wol(priv->phylink, wol) + * - phy_ethtool_set_wol(phydev, wol) + - phylink_ethtool_set_wol(priv->phylink, wol) + * - phy_disconnect(phydev) + - phylink_disconnect_phy(priv->phylink) + + Please note that some of these functions must be called under the + rtnl lock, and will warn if not. This will normally be the case, + except if these are called from the driver suspend/resume paths. + +6. Add/replace ksettings get/set methods with: + + .. code-block:: c + + static int foo_ethtool_set_link_ksettings(struct net_device *dev, + const struct ethtool_link_ksettings *cmd) + { + struct foo_priv *priv = netdev_priv(dev); + + return phylink_ethtool_ksettings_set(priv->phylink, cmd); + } + + static int foo_ethtool_get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *cmd) + { + struct foo_priv *priv = netdev_priv(dev); + + return phylink_ethtool_ksettings_get(priv->phylink, cmd); + } + +7. Replace the call to: + + phy_dev = of_phy_connect(dev, node, link_func, flags, phy_interface); + + and associated code with a call to: + + err = phylink_of_phy_connect(priv->phylink, node, flags); + + For the most part, ``flags`` can be zero; these flags are passed to + the of_phy_attach() inside this function call if a PHY is specified + in the DT node ``node``. 
+ + ``node`` should be the DT node which contains the network phy property, + fixed link properties, and will also contain the sfp property. + + The setup of fixed links should also be removed; these are handled + internally by phylink. + + of_phy_connect() was also passed a function pointer for link updates. + This function is replaced by a different form of MAC updates + described below in (8). + + Manipulation of the PHY's supported/advertised happens within phylink + based on the validate callback, see below in (8). + + Note that the driver no longer needs to store the ``phy_interface``, + and also note that ``phy_interface`` becomes a dynamic property, + just like the speed, duplex etc. settings. + + Finally, note that the MAC driver has no direct access to the PHY + anymore; that is because in the phylink model, the PHY can be + dynamic. + +8. Add a :c:type:`struct phylink_mac_ops <phylink_mac_ops>` instance to + the driver, which is a table of function pointers, and implement + these functions. The old link update function for + :c:func:`of_phy_connect` becomes three methods: :c:func:`mac_link_up`, + :c:func:`mac_link_down`, and :c:func:`mac_config`. If step 1 was + performed, then the functionality will have been split there. + + It is important that if in-band negotiation is used, + :c:func:`mac_link_up` and :c:func:`mac_link_down` do not prevent the + in-band negotiation from completing, since these functions are called + when the in-band link state changes - otherwise the link will never + come up. + + The :c:func:`validate` method should mask the supplied supported mask, + and ``state->advertising`` with the supported ethtool link modes. + These are the new ethtool link modes, so bitmask operations must be + used. For an example, see drivers/net/ethernet/marvell/mvneta.c. + + The :c:func:`mac_link_state` method is used to read the link state + from the MAC, and report back the settings that the MAC is currently + using. 
This is particularly important for in-band negotiation + methods such as 1000base-X and SGMII. + + The :c:func:`mac_config` method is used to update the MAC with the + requested state, and must avoid unnecessarily taking the link down + when making changes to the MAC configuration. This means the + function should modify the state and only take the link down when + absolutely necessary to change the MAC configuration. An example + of how to do this can be found in :c:func:`mvneta_mac_config` in + drivers/net/ethernet/marvell/mvneta.c. + + For further information on these methods, please see the inline + documentation in :c:type:`struct phylink_mac_ops <phylink_mac_ops>`. + +9. Remove calls to of_parse_phandle() for the PHY, + of_phy_register_fixed_link() for fixed links etc. from the probe + function, and replace with: + + .. code-block:: c + + struct phylink *phylink; + + phylink = phylink_create(dev, node, phy_mode, &phylink_ops); + if (IS_ERR(phylink)) { + err = PTR_ERR(phylink); + fail probe; + } + + priv->phylink = phylink; + + and arrange to destroy the phylink in the probe failure path as + appropriate and the removal path too by calling: + + .. code-block:: c + + phylink_destroy(priv->phylink); + +10. Arrange for MAC link state interrupts to be forwarded into + phylink, via: + + .. code-block:: c + + phylink_mac_change(priv->phylink, link_is_up); + + where ``link_is_up`` is true if the link is currently up or false + otherwise. + +11. Verify that the driver does not call:: + + netif_carrier_on() + netif_carrier_off() + + as these will interfere with phylink's tracking of the link state, + and cause phylink to omit calls via the :c:func:`mac_link_up` and + :c:func:`mac_link_down` methods. + +Network drivers should call phylink_stop() and phylink_start() via their +suspend/resume paths, which ensures that the appropriate +:c:type:`struct phylink_mac_ops <phylink_mac_ops>` methods are called +as necessary. 
+ +For information describing the SFP cage in DT, please see the binding +documentation in the kernel source tree +``Documentation/devicetree/bindings/net/sff,sfp.txt`` -- cgit From 0f4a9b7d4ecbac191052cb80b84a46471fd30d80 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 21 Feb 2019 10:21:28 +0100 Subject: xsk: add FAQ to facilitate for first time users Added an FAQ section in Documentation/networking/af_xdp.rst to help first time users with common problems. As problems are getting identified, entries will be added to the FAQ. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann --- Documentation/networking/af_xdp.rst | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst index 4ae4f9d8f8fe..e14d7d40fc75 100644 --- a/Documentation/networking/af_xdp.rst +++ b/Documentation/networking/af_xdp.rst @@ -295,6 +295,41 @@ using:: For XDP_SKB mode, use the switch "-S" instead of "-N" and all options can be displayed with "-h", as usual. +FAQ +======= + +Q: I am not seeing any traffic on the socket. What am I doing wrong? + +A: When a netdev of a physical NIC is initialized, Linux usually + allocates one Rx and Tx queue pair per core. So on an 8 core system, + queue ids 0 to 7 will be allocated, one per core. In the AF_XDP + bind call or the xsk_socket__create libbpf function call, you + specify a specific queue id to bind to and it is only the traffic + towards that queue you are going to get on your socket. So in the + example above, if you bind to queue 0, you are NOT going to get any + traffic that is distributed to queues 1 through 7. If you are + lucky, you will see the traffic, but usually it will end up on one + of the queues you have not bound to. + + There are a number of ways to solve the problem of getting the + traffic you want to the queue id you bound to. 
If you want to see + all the traffic, you can force the netdev to only have 1 queue, queue + id 0, and then bind to queue 0. You can use ethtool to do this:: + + sudo ethtool -L <interface> combined 1 + + If you want to only see part of the traffic, you can program the + NIC through ethtool to filter out your traffic to a single queue id + that you can bind your XDP socket to. Here is one example in which + UDP traffic to and from port 4242 is sent to queue 2:: + + sudo ethtool -N <interface> rx-flow-hash udp4 fn + sudo ethtool -N <interface> flow-type udp4 src-port 4242 dst-port \ + 4242 action 2 + + A number of other ways are possible all up to the capabilities of + the NIC you have. + Credits ======= @@ -309,4 +344,3 @@ Credits - Michael S. Tsirkin - Qi Z Zhang - Willem de Bruijn - -- cgit From 782eff094922ab6e75e8568355c8487686b239f5 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 29 Dec 2018 13:22:11 +0100 Subject: dt-bindings: net: mt76: update binding for mt7603 driver In addition to MT7603E PCI devices, the driver supports the WLAN core on MT7628/MT7688, which needs to be defined in DT. Reviewed-by: Rob Herring Signed-off-by: Felix Fietkau --- .../bindings/net/wireless/mediatek,mt76.txt | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/wireless/mediatek,mt76.txt b/Documentation/devicetree/bindings/net/wireless/mediatek,mt76.txt index 0c17a0ec9b7b..7b9a776230c0 100644 --- a/Documentation/devicetree/bindings/net/wireless/mediatek,mt76.txt +++ b/Documentation/devicetree/bindings/net/wireless/mediatek,mt76.txt @@ -4,6 +4,13 @@ This node provides properties for configuring the MediaTek mt76xx wireless device. The node is expected to be specified as a child node of the PCI controller to which the wireless chip is connected. +Alternatively, it can specify the wireless part of the MT7628/MT7688 SoC.
+For SoC, use the compatible string "mediatek,mt7628-wmac" and the following +properties: + +- reg: Address and length of the register set for the device. +- interrupts: Main device interrupt + Optional properties: - mac-address: See ethernet.txt in the parent directory @@ -30,3 +37,15 @@ Optional nodes: }; }; }; + +MT7628 example: + +wmac: wmac@10300000 { + compatible = "mediatek,mt7628-wmac"; + reg = <0x10300000 0x100000>; + + interrupt-parent = <&cpuintc>; + interrupts = <6>; + + mediatek,mtd-eeprom = <&factory 0x0000>; +}; -- cgit From 7d19261bc0eb35080231f109687d119b183abab8 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Fri, 22 Feb 2019 14:53:44 -0800 Subject: dt-bindings: net: btusb: add QCA6174A IDs There are two USB PID/VID variations I've seen for this chip, and I want to utilize the 'interrupts' property defined here already. Signed-off-by: Brian Norris Reviewed-by: Matthias Kaehlcke Reviewed-by: Rob Herring Signed-off-by: Marcel Holtmann --- Documentation/devicetree/bindings/net/btusb.txt | 3 +++ 1 file changed, 3 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/btusb.txt b/Documentation/devicetree/bindings/net/btusb.txt index 37d67926dd6d..b1ad6ee68e90 100644 --- a/Documentation/devicetree/bindings/net/btusb.txt +++ b/Documentation/devicetree/bindings/net/btusb.txt @@ -9,6 +9,9 @@ Required properties: (more may be added later) are: "usb1286,204e" (Marvell 8997) + "usbcf3,e300" (Qualcomm QCA6174A) + "usb4ca,301a" (Qualcomm QCA6174A (Lite-On)) + Also, vendors that use btusb may have device additional properties, e.g: Documentation/devicetree/bindings/net/marvell-bt-8xxx.txt -- cgit From 43185c3b82c33df351bf6a605ca884c35099b6c6 Mon Sep 17 00:00:00 2001 From: Pankaj Bansal Date: Mon, 25 Feb 2019 06:16:53 +0000 Subject: dt-bindings: net: Add bindings for mdio mux consumers When we use the bindings defined in Documentation/devicetree/bindings/mux to define mdio mux in producer and consumer terms, it results in 
two devices. one is mux producer and other is mux consumer. Add the bindings needed for Mdio mux consumer devices. Signed-off-by: Pankaj Bansal Signed-off-by: David S. Miller --- .../bindings/net/mdio-mux-multiplexer.txt | 82 ++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 Documentation/devicetree/bindings/net/mdio-mux-multiplexer.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/mdio-mux-multiplexer.txt b/Documentation/devicetree/bindings/net/mdio-mux-multiplexer.txt new file mode 100644 index 000000000000..534e38058fe0 --- /dev/null +++ b/Documentation/devicetree/bindings/net/mdio-mux-multiplexer.txt @@ -0,0 +1,82 @@ +Properties for an MDIO bus multiplexer consumer device + +This is a special case of MDIO mux when MDIO mux is defined as a consumer +of a mux producer device. The mux producer can be of any type like mmio mux +producer, gpio mux producer or generic register based mux producer. + +Required properties in addition to the MDIO Bus multiplexer properties: + +- compatible : should be "mdio-mux-multiplexer" +- mux-controls : mux controller node to use for operating the mux +- mdio-parent-bus : phandle to the parent MDIO bus. + +each child node of mdio bus multiplexer consumer device represents an MDIO +bus. + +for more information please refer to +Documentation/devicetree/bindings/mux/mux-controller.txt +and Documentation/devicetree/bindings/net/mdio-mux.txt + +Example: +In below example the Mux producer and consumer are separate nodes.
+ +&i2c0 { + fpga@66 { // fpga connected to i2c + compatible = "fsl,lx2160aqds-fpga", "fsl,fpga-qixis-i2c", + "simple-mfd"; + reg = <0x66>; + + mux: mux-controller { // Mux Producer + compatible = "reg-mux"; + #mux-control-cells = <1>; + mux-reg-masks = <0x54 0xf8>, /* 0: reg 0x54, bits 7:3 */ + <0x54 0x07>; /* 1: reg 0x54, bits 2:0 */ + }; + }; +}; + +mdio-mux-1 { // Mux consumer + compatible = "mdio-mux-multiplexer"; + mux-controls = <&mux 0>; + mdio-parent-bus = <&emdio1>; + #address-cells = <1>; + #size-cells = <0>; + + mdio@0 { + reg = <0x0>; + #address-cells = <1>; + #size-cells = <0>; + }; + + mdio@8 { + reg = <0x8>; + #address-cells = <1>; + #size-cells = <0>; + }; + + .. + .. +}; + +mdio-mux-2 { // Mux consumer + compatible = "mdio-mux-multiplexer"; + mux-controls = <&mux 1>; + mdio-parent-bus = <&emdio2>; + #address-cells = <1>; + #size-cells = <0>; + + mdio@0 { + reg = <0x0>; + #address-cells = <1>; + #size-cells = <0>; + }; + + mdio@1 { + reg = <0x1>; + #address-cells = <1>; + #size-cells = <0>; + }; + + .. + .. +}; -- cgit From 20cc5ddecc42831cea5b9a3bed3254d04826bd55 Mon Sep 17 00:00:00 2001 From: Claudiu Manoil Date: Tue, 26 Feb 2019 15:42:23 +0200 Subject: dt-bindings: net: freescale: enetc: Add connection bindings for ENETC ethernet nodes Define connection bindings (external PHY connections and internal links) for the ENETC on-chip ethernet controllers. Signed-off-by: Claudiu Manoil Reviewed-by: Rob Herring Signed-off-by: David S. 
Miller --- .../devicetree/bindings/net/fsl-enetc.txt | 69 ++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 Documentation/devicetree/bindings/net/fsl-enetc.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/fsl-enetc.txt b/Documentation/devicetree/bindings/net/fsl-enetc.txt new file mode 100644 index 000000000000..c812e25ae90f --- /dev/null +++ b/Documentation/devicetree/bindings/net/fsl-enetc.txt @@ -0,0 +1,69 @@ +* ENETC ethernet device tree bindings + +Depending on board design and ENETC port type (internal or +external) there are two supported link modes specified by +below device tree bindings. + +Required properties: + +- reg : Specifies PCIe Device Number and Function + Number of the ENETC endpoint device, according + to parent node bindings. +- compatible : Should be "fsl,enetc". + +1) The ENETC external port is connected to a MDIO configurable phy: + +In this case, the ENETC node should include a "mdio" sub-node +that in turn should contain the "ethernet-phy" node describing the +external phy. Below properties are required, their bindings +already defined in ethernet.txt or phy.txt, under +Documentation/devicetree/bindings/net/*. + +Required: + +- phy-handle : Phandle to a PHY on the MDIO bus. + Defined in ethernet.txt. + +- phy-connection-type : Defined in ethernet.txt. + +- mdio : "mdio" node, defined in mdio.txt. + +- ethernet-phy : "ethernet-phy" node, defined in phy.txt. + +Example: + + ethernet@0,0 { + compatible = "fsl,enetc"; + reg = <0x000000 0 0 0 0>; + phy-handle = <&sgmii_phy0>; + phy-connection-type = "sgmii"; + + mdio { + #address-cells = <1>; + #size-cells = <0>; + sgmii_phy0: ethernet-phy@2 { + reg = <0x2>; + }; + }; + }; + +2) The ENETC port is an internal port or has a fixed-link external +connection: + +In this case, the ENETC port node defines a fixed link connection, +as specified by "fixed-link.txt", under +Documentation/devicetree/bindings/net/*. 
+ +Required: + +- fixed-link : "fixed-link" node, defined in "fixed-link.txt". + +Example: + ethernet@0,2 { + compatible = "fsl,enetc"; + reg = <0x000200 0 0 0 0>; + fixed-link { + speed = <1000>; + full-duplex; + }; + }; -- cgit From 5efc529fb428e042c08a598b9afc5c5e2c600d74 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 28 Feb 2019 17:12:19 -0800 Subject: docs/btf: fix typos, improve wording Fix various typos, some of the formatting and wording for Documentation/btf.rst. Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- Documentation/bpf/btf.rst | 108 +++++++++++++++++++++++----------------------- 1 file changed, 53 insertions(+), 55 deletions(-) (limited to 'Documentation') diff --git a/Documentation/bpf/btf.rst b/Documentation/bpf/btf.rst index 1d434c3a268d..1d761f1c5b2b 100644 --- a/Documentation/bpf/btf.rst +++ b/Documentation/bpf/btf.rst @@ -5,7 +5,7 @@ BPF Type Format (BTF) 1. Introduction *************** -BTF (BPF Type Format) is the meta data format which +BTF (BPF Type Format) is the metadata format which encodes the debug info related to BPF program/map. The name BTF was used initially to describe data types. The BTF was later extended to include @@ -40,8 +40,8 @@ details in :ref:`BTF_Type_String`. 2. BTF Type and String Encoding ******************************* -The file ``include/uapi/linux/btf.h`` provides high -level definition on how types/strings are encoded. +The file ``include/uapi/linux/btf.h`` provides high-level +definition of how types/strings are encoded. The beginning of data blob must be:: @@ -59,23 +59,23 @@ The beginning of data blob must be:: }; The magic is ``0xeB9F``, which has different encoding for big and little -endian system, and can be used to test whether BTF is generated for -big or little endian target. -The btf_header is designed to be extensible with hdr_len equal to -``sizeof(struct btf_header)`` when the data blob is generated. 
+endian systems, and can be used to test whether BTF is generated for +big- or little-endian target. +The ``btf_header`` is designed to be extensible with ``hdr_len`` equal to +``sizeof(struct btf_header)`` when a data blob is generated. 2.1 String Encoding =================== The first string in the string section must be a null string. -The rest of string table is a concatenation of other null-treminated +The rest of string table is a concatenation of other null-terminated strings. 2.2 Type Encoding ================= The type id ``0`` is reserved for ``void`` type. -The type section is parsed sequentially and the type id is assigned to +The type section is parsed sequentially and type id is assigned to each recognized type starting from id ``1``. Currently, the following types are supported:: @@ -122,9 +122,9 @@ Each type contains the following common data:: }; }; -For certain kinds, the common data are followed by kind specific data. -The ``name_off`` in ``struct btf_type`` specifies the offset in the string table. -The following details encoding of each kind. +For certain kinds, the common data are followed by kind-specific data. +The ``name_off`` in ``struct btf_type`` specifies the offset in the string +table. The following sections detail encoding of each kind. 2.2.1 BTF_KIND_INT ~~~~~~~~~~~~~~~~~~ @@ -136,7 +136,7 @@ The following details encoding of each kind. * ``info.vlen``: 0 * ``size``: the size of the int type in bytes. 
-``btf_type`` is followed by a ``u32`` with following bits arrangement:: +``btf_type`` is followed by a ``u32`` with the following bits arrangement:: #define BTF_INT_ENCODING(VAL) (((VAL) & 0x0f000000) >> 24) #define BTF_INT_OFFSET(VAL) (((VAL & 0x00ff0000)) >> 16) @@ -148,7 +148,7 @@ The ``BTF_INT_ENCODING`` has the following attributes:: #define BTF_INT_CHAR (1 << 1) #define BTF_INT_BOOL (1 << 2) -The ``BTF_INT_ENCODING()`` provides extra information, signness, +The ``BTF_INT_ENCODING()`` provides extra information: signedness, char, or bool, for the int type. The char and bool encoding are mostly useful for pretty print. At most one encoding can be specified for the int type. @@ -161,8 +161,7 @@ The maximum value of ``BTF_INT_BITS()`` is 128. The ``BTF_INT_OFFSET()`` specifies the starting bit offset to calculate values for this int. For example, a bitfield struct -member has - +member has: * btf member bit offset 100 from the start of the structure, * btf member pointing to an int type, * the int type has ``BTF_INT_OFFSET() = 2`` and ``BTF_INT_BITS() = 4`` @@ -179,7 +178,7 @@ access the same bits as the above: The original intention of ``BTF_INT_OFFSET()`` is to provide flexibility of bitfield encoding. -Currently, both llvm and pahole generates ``BTF_INT_OFFSET() = 0`` +Currently, both llvm and pahole generate ``BTF_INT_OFFSET() = 0`` for all int types. 2.2.2 BTF_KIND_PTR @@ -204,7 +203,7 @@ No additional type data follow ``btf_type``. * ``info.vlen``: 0 * ``size/type``: 0, not used -btf_type is followed by one "struct btf_array":: +``btf_type`` is followed by one ``struct btf_array``:: struct btf_array { __u32 type; @@ -217,27 +216,26 @@ The ``struct btf_array`` encoding: * ``index_type``: the index type * ``nelems``: the number of elements for this array (``0`` is also allowed). -The ``index_type`` can be any regular int types -(u8, u16, u32, u64, unsigned __int128). 
-The original design of including ``index_type`` follows dwarf -which has a ``index_type`` for its array type. +The ``index_type`` can be any regular int type +(``u8``, ``u16``, ``u32``, ``u64``, ``unsigned __int128``). +The original design of including ``index_type`` follows DWARF, +which has an ``index_type`` for its array type. Currently in BTF, beyond type verification, the ``index_type`` is not used. The ``struct btf_array`` allows chaining through element type to represent -multiple dimensional arrays. For example, ``int a[5][6]``, the following -type system illustrates the chaining: +multidimensional arrays. For example, for ``int a[5][6]``, the following +type information illustrates the chaining: * [1]: int * [2]: array, ``btf_array.type = [1]``, ``btf_array.nelems = 6`` * [3]: array, ``btf_array.type = [2]``, ``btf_array.nelems = 5`` -Currently, both pahole and llvm collapse multiple dimensional array -into one dimensional array, e.g., ``a[5][6]``, the btf_array.nelems -equal to ``30``. This is because the original use case is map pretty -print where the whole array is dumped out so one dimensional array +Currently, both pahole and llvm collapse multidimensional array +into one-dimensional array, e.g., for ``a[5][6]``, the ``btf_array.nelems`` +is equal to ``30``. This is because the original use case is map pretty +print where the whole array is dumped out so one-dimensional array is enough. As more BTF usage is explored, pahole and llvm can be -changed to generate proper chained representation for -multiple dimensional arrays. +changed to generate proper chained representation for multidimensional arrays. 2.2.4 BTF_KIND_STRUCT ~~~~~~~~~~~~~~~~~~~~~ @@ -382,7 +380,7 @@ No additional type data follow ``btf_type``. No additional type data follow ``btf_type``. -A BTF_KIND_FUNC defines, not a type, but a subprogram (function) whose +A BTF_KIND_FUNC defines not a type, but a subprogram (function) whose signature is defined by ``type``. 
The subprogram is thus an instance of that type. The BTF_KIND_FUNC may in turn be referenced by a func_info in the :ref:`BTF_Ext_Section` (ELF) or in the arguments to @@ -459,10 +457,10 @@ The workflow typically looks like: 3.1 BPF_BTF_LOAD ================ -Load a blob of BTF data into kernel. A blob of data -described in :ref:`BTF_Type_String` +Load a blob of BTF data into kernel. A blob of data, +described in :ref:`BTF_Type_String`, can be directly loaded into the kernel. -A ``btf_fd`` returns to userspace. +A ``btf_fd`` is returned to a userspace. 3.2 BPF_MAP_CREATE ================== @@ -487,7 +485,7 @@ In libbpf, the map can be defined with extra annotation like below: Here, the parameters for macro BPF_ANNOTATE_KV_PAIR are map name, key and value types for the map. During ELF parsing, libbpf is able to extract key/value type_id's -and assigned them to BPF_MAP_CREATE attributes automatically. +and assign them to BPF_MAP_CREATE attributes automatically. .. _BPF_Prog_Load: @@ -532,7 +530,7 @@ Below are requirements for func_info: bpf func boundaries. Below are requirements for line_info: - * the first insn in each func must points to a line_info record. + * the first insn in each func must have a line_info record pointing to it. * the line_info insn_off is in strictly increasing order. For line_info, the line number and column number are defined as below: @@ -544,26 +542,26 @@ For line_info, the line number and column number are defined as below: 3.4 BPF_{PROG,MAP}_GET_NEXT_ID In kernel, every loaded program, map or btf has a unique id. -The id won't change during the life time of the program, map or btf. +The id won't change during the lifetime of a program, map, or btf. The bpf syscall command BPF_{PROG,MAP}_GET_NEXT_ID returns all id's, one for each command, to user space, for bpf -program or maps, -so the inspection tool can inspect all programs and maps. +program or maps, respectively, +so an inspection tool can inspect all programs and maps. 
3.5 BPF_{PROG,MAP}_GET_FD_BY_ID -The introspection tool cannot use id to get details about program or maps. -A file descriptor needs to be obtained first for reference counting purpose. +An introspection tool cannot use id to get details about program or maps. +A file descriptor needs to be obtained first for reference-counting purpose. 3.6 BPF_OBJ_GET_INFO_BY_FD ========================== -Once a program/map fd is acquired, the introspection tool can +Once a program/map fd is acquired, an introspection tool can get the detailed information from kernel about this fd, -some of which is btf related. For example, -``bpf_map_info`` returns ``btf_id``, key/value type id. -``bpf_prog_info`` returns ``btf_id``, func_info and line info +some of which are BTF-related. For example, +``bpf_map_info`` returns ``btf_id`` and key/value type ids. +``bpf_prog_info`` returns ``btf_id``, func_info, and line info for translated bpf byte codes, and jited_line_info. 3.7 BPF_BTF_GET_FD_BY_ID @@ -574,9 +572,9 @@ bpf syscall command BPF_BTF_GET_FD_BY_ID can retrieve a btf fd. Then, with command BPF_OBJ_GET_INFO_BY_FD, the btf blob, originally loaded into the kernel with BPF_BTF_LOAD, can be retrieved. -With the btf blob, ``bpf_map_info`` and ``bpf_prog_info``, the introspection +With the btf blob, ``bpf_map_info``, and ``bpf_prog_info``, an introspection tool has full btf knowledge and is able to pretty print map key/values, -dump func signatures, dump line info along with byte/jit codes. +dump func signatures and line info, along with byte/jit codes. 4. ELF File Format Interface **************************** @@ -625,8 +623,8 @@ The func_info is organized as below.:: ... ``func_info_rec_size`` specifies the size of ``bpf_func_info`` structure -when .BTF.ext is generated. btf_ext_info_sec, defined below, is -the func_info for each specific ELF section.:: +when .BTF.ext is generated. 
``btf_ext_info_sec``, defined below, is +a collection of func_info for each specific ELF section.:: struct btf_ext_info_sec { __u32 sec_name_off; /* offset to section name */ @@ -661,7 +659,7 @@ from the beginning of section (``btf_ext_info_sec->sec_name_off``). With BTF, the map key/value can be printed based on fields rather than simply raw bytes. This is especially -valuable for large structure or if you data structure +valuable for large structure or if your data structure has bitfields. For example, for the following map,:: enum A { A1, A2, A3, A4, A5 }; @@ -702,8 +700,8 @@ bpftool is able to pretty print like below: 5.2 bpftool prog dump ===================== -The following is an example to show func_info and line_info -can help prog dump with better kernel symbol name, function prototype +The following is an example showing how func_info and line_info +can help prog dump with better kernel symbol names, function prototypes and line information.:: $ bpftool prog dump jited pinned /sys/fs/bpf/test_btf_haskv @@ -733,10 +731,10 @@ and line information.:: ; counts = bpf_map_lookup_elem(&btf_map, &key); [...] -5.3 verifier log +5.3 Verifier Log ================ -The following is an example how line_info can help verifier failure debug.:: +The following is an example of how line_info can help debugging verification failure.:: /* The code at tools/testing/selftests/bpf/test_xdp_noinline.c * is modified as below. @@ -867,4 +865,4 @@ The assembly code (-S) is able to show the BTF encoding in assembly format.:: 7. Testing ********** -Kernel bpf selftest `test_btf.c` provides extensive set of BTF related tests. +Kernel bpf selftest `test_btf.c` provides extensive set of BTF-related tests. -- cgit From 9ab5305dbe3ffcd146852e28aa76a917e45c7541 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 28 Feb 2019 17:12:20 -0800 Subject: docs/btf: reflow text to fill up to 78 characters Reflow paragraphs to more fully and evenly fill 78 character lines. 
Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- Documentation/bpf/btf.rst | 300 ++++++++++++++++++++++------------------------ 1 file changed, 140 insertions(+), 160 deletions(-) (limited to 'Documentation') diff --git a/Documentation/bpf/btf.rst b/Documentation/bpf/btf.rst index 1d761f1c5b2b..9a60a5d60e38 100644 --- a/Documentation/bpf/btf.rst +++ b/Documentation/bpf/btf.rst @@ -5,43 +5,35 @@ BPF Type Format (BTF) 1. Introduction *************** -BTF (BPF Type Format) is the metadata format which -encodes the debug info related to BPF program/map. -The name BTF was used initially to describe -data types. The BTF was later extended to include -function info for defined subroutines, and line info -for source/line information. - -The debug info is used for map pretty print, function -signature, etc. The function signature enables better -bpf program/function kernel symbol. -The line info helps generate -source annotated translated byte code, jited code -and verifier log. +BTF (BPF Type Format) is the metadata format which encodes the debug info +related to BPF program/map. The name BTF was used initially to describe data +types. The BTF was later extended to include function info for defined +subroutines, and line info for source/line information. + +The debug info is used for map pretty print, function signature, etc. The +function signature enables better bpf program/function kernel symbol. The line +info helps generate source annotated translated byte code, jited code and +verifier log. The BTF specification contains two parts, * BTF kernel API * BTF ELF file format -The kernel API is the contract between -user space and kernel. The kernel verifies -the BTF info before using it. -The ELF file format is a user space contract -between ELF file and libbpf loader. +The kernel API is the contract between user space and kernel. The kernel +verifies the BTF info before using it. 
The ELF file format is a user space +contract between ELF file and libbpf loader. -The type and string sections are part of the -BTF kernel API, describing the debug info -(mostly types related) referenced by the bpf program. -These two sections are discussed in -details in :ref:`BTF_Type_String`. +The type and string sections are part of the BTF kernel API, describing the +debug info (mostly types related) referenced by the bpf program. These two +sections are discussed in details in :ref:`BTF_Type_String`. .. _BTF_Type_String: 2. BTF Type and String Encoding ******************************* -The file ``include/uapi/linux/btf.h`` provides high-level -definition of how types/strings are encoded. +The file ``include/uapi/linux/btf.h`` provides high-level definition of how +types/strings are encoded. The beginning of data blob must be:: @@ -59,25 +51,23 @@ The beginning of data blob must be:: }; The magic is ``0xeB9F``, which has different encoding for big and little -endian systems, and can be used to test whether BTF is generated for -big- or little-endian target. -The ``btf_header`` is designed to be extensible with ``hdr_len`` equal to -``sizeof(struct btf_header)`` when a data blob is generated. +endian systems, and can be used to test whether BTF is generated for big- or +little-endian target. The ``btf_header`` is designed to be extensible with +``hdr_len`` equal to ``sizeof(struct btf_header)`` when a data blob is +generated. 2.1 String Encoding =================== -The first string in the string section must be a null string. -The rest of string table is a concatenation of other null-terminated -strings. +The first string in the string section must be a null string. The rest of +string table is a concatenation of other null-terminated strings. 2.2 Type Encoding ================= -The type id ``0`` is reserved for ``void`` type. -The type section is parsed sequentially and type id is assigned to -each recognized type starting from id ``1``. 
-Currently, the following types are supported:: +The type id ``0`` is reserved for ``void`` type. The type section is parsed +sequentially and type id is assigned to each recognized type starting from id +``1``. Currently, the following types are supported:: #define BTF_KIND_INT 1 /* Integer */ #define BTF_KIND_PTR 2 /* Pointer */ @@ -122,9 +112,9 @@ Each type contains the following common data:: }; }; -For certain kinds, the common data are followed by kind-specific data. -The ``name_off`` in ``struct btf_type`` specifies the offset in the string -table. The following sections detail encoding of each kind. +For certain kinds, the common data are followed by kind-specific data. The +``name_off`` in ``struct btf_type`` specifies the offset in the string table. +The following sections detail encoding of each kind. 2.2.1 BTF_KIND_INT ~~~~~~~~~~~~~~~~~~ @@ -148,38 +138,33 @@ The ``BTF_INT_ENCODING`` has the following attributes:: #define BTF_INT_CHAR (1 << 1) #define BTF_INT_BOOL (1 << 2) -The ``BTF_INT_ENCODING()`` provides extra information: signedness, -char, or bool, for the int type. The char and bool encoding -are mostly useful for pretty print. At most one encoding can -be specified for the int type. - -The ``BTF_INT_BITS()`` specifies the number of actual bits held by -this int type. For example, a 4-bit bitfield encodes -``BTF_INT_BITS()`` equals to 4. The ``btf_type.size * 8`` -must be equal to or greater than ``BTF_INT_BITS()`` for the type. -The maximum value of ``BTF_INT_BITS()`` is 128. - -The ``BTF_INT_OFFSET()`` specifies the starting bit offset to -calculate values for this int. For example, a bitfield struct -member has: - * btf member bit offset 100 from the start of the structure, - * btf member pointing to an int type, - * the int type has ``BTF_INT_OFFSET() = 2`` and ``BTF_INT_BITS() = 4`` +The ``BTF_INT_ENCODING()`` provides extra information: signedness, char, or +bool, for the int type. 
The char and bool encoding are mostly useful for +pretty print. At most one encoding can be specified for the int type. + +The ``BTF_INT_BITS()`` specifies the number of actual bits held by this int +type. For example, a 4-bit bitfield encodes ``BTF_INT_BITS()`` equals to 4. +The ``btf_type.size * 8`` must be equal to or greater than ``BTF_INT_BITS()`` +for the type. The maximum value of ``BTF_INT_BITS()`` is 128. + +The ``BTF_INT_OFFSET()`` specifies the starting bit offset to calculate values +for this int. For example, a bitfield struct member has: * btf member bit +offset 100 from the start of the structure, * btf member pointing to an int +type, * the int type has ``BTF_INT_OFFSET() = 2`` and ``BTF_INT_BITS() = 4`` -Then in the struct memory layout, this member will occupy -``4`` bits starting from bits ``100 + 2 = 102``. +Then in the struct memory layout, this member will occupy ``4`` bits starting +from bits ``100 + 2 = 102``. -Alternatively, the bitfield struct member can be the following to -access the same bits as the above: +Alternatively, the bitfield struct member can be the following to access the +same bits as the above: * btf member bit offset 102, * btf member pointing to an int type, * the int type has ``BTF_INT_OFFSET() = 0`` and ``BTF_INT_BITS() = 4`` -The original intention of ``BTF_INT_OFFSET()`` is to provide -flexibility of bitfield encoding. -Currently, both llvm and pahole generate ``BTF_INT_OFFSET() = 0`` -for all int types. +The original intention of ``BTF_INT_OFFSET()`` is to provide flexibility of +bitfield encoding. Currently, both llvm and pahole generate +``BTF_INT_OFFSET() = 0`` for all int types. 2.2.2 BTF_KIND_PTR ~~~~~~~~~~~~~~~~~~ @@ -216,26 +201,25 @@ The ``struct btf_array`` encoding: * ``index_type``: the index type * ``nelems``: the number of elements for this array (``0`` is also allowed). -The ``index_type`` can be any regular int type -(``u8``, ``u16``, ``u32``, ``u64``, ``unsigned __int128``). 
-The original design of including ``index_type`` follows DWARF, -which has an ``index_type`` for its array type. +The ``index_type`` can be any regular int type (``u8``, ``u16``, ``u32``, +``u64``, ``unsigned __int128``). The original design of including +``index_type`` follows DWARF, which has an ``index_type`` for its array type. Currently in BTF, beyond type verification, the ``index_type`` is not used. The ``struct btf_array`` allows chaining through element type to represent -multidimensional arrays. For example, for ``int a[5][6]``, the following -type information illustrates the chaining: +multidimensional arrays. For example, for ``int a[5][6]``, the following type +information illustrates the chaining: * [1]: int * [2]: array, ``btf_array.type = [1]``, ``btf_array.nelems = 6`` * [3]: array, ``btf_array.type = [2]``, ``btf_array.nelems = 5`` -Currently, both pahole and llvm collapse multidimensional array -into one-dimensional array, e.g., for ``a[5][6]``, the ``btf_array.nelems`` -is equal to ``30``. This is because the original use case is map pretty -print where the whole array is dumped out so one-dimensional array -is enough. As more BTF usage is explored, pahole and llvm can be -changed to generate proper chained representation for multidimensional arrays. +Currently, both pahole and llvm collapse multidimensional array into +one-dimensional array, e.g., for ``a[5][6]``, the ``btf_array.nelems`` is +equal to ``30``. This is because the original use case is map pretty print +where the whole array is dumped out so one-dimensional array is enough. As +more BTF usage is explored, pahole and llvm can be changed to generate proper +chained representation for multidimensional arrays. 2.2.4 BTF_KIND_STRUCT ~~~~~~~~~~~~~~~~~~~~~ @@ -262,28 +246,26 @@ changed to generate proper chained representation for multidimensional arrays. 
* ``type``: the member type * ``offset``: -If the type info ``kind_flag`` is not set, the offset contains -only bit offset of the member. Note that the base type of the -bitfield can only be int or enum type. If the bitfield size -is 32, the base type can be either int or enum type. -If the bitfield size is not 32, the base type must be int, -and int type ``BTF_INT_BITS()`` encodes the bitfield size. +If the type info ``kind_flag`` is not set, the offset contains only bit offset +of the member. Note that the base type of the bitfield can only be int or enum +type. If the bitfield size is 32, the base type can be either int or enum +type. If the bitfield size is not 32, the base type must be int, and int type +``BTF_INT_BITS()`` encodes the bitfield size. -If the ``kind_flag`` is set, the ``btf_member.offset`` -contains both member bitfield size and bit offset. The -bitfield size and bit offset are calculated as below.:: +If the ``kind_flag`` is set, the ``btf_member.offset`` contains both member +bitfield size and bit offset. The bitfield size and bit offset are calculated +as below.:: #define BTF_MEMBER_BITFIELD_SIZE(val) ((val) >> 24) #define BTF_MEMBER_BIT_OFFSET(val) ((val) & 0xffffff) -In this case, if the base type is an int type, it must -be a regular int type: +In this case, if the base type is an int type, it must be a regular int type: * ``BTF_INT_OFFSET()`` must be 0. * ``BTF_INT_BITS()`` must be equal to ``{1,2,4,8,16} * 8``. -The following kernel patch introduced ``kind_flag`` and -explained why both modes exist: +The following kernel patch introduced ``kind_flag`` and explained why both +modes exist: https://github.com/torvalds/linux/commit/9d5f9f701b1891466fb3dbb1806ad97716f95cc3#diff-fa650a64fdd3968396883d2fe8215ff3 @@ -381,10 +363,10 @@ No additional type data follow ``btf_type``. No additional type data follow ``btf_type``. A BTF_KIND_FUNC defines not a type, but a subprogram (function) whose -signature is defined by ``type``. 
The subprogram is thus an instance of -that type. The BTF_KIND_FUNC may in turn be referenced by a func_info in -the :ref:`BTF_Ext_Section` (ELF) or in the arguments to -:ref:`BPF_Prog_Load` (ABI). +signature is defined by ``type``. The subprogram is thus an instance of that +type. The BTF_KIND_FUNC may in turn be referenced by a func_info in the +:ref:`BTF_Ext_Section` (ELF) or in the arguments to :ref:`BPF_Prog_Load` +(ABI). 2.2.13 BTF_KIND_FUNC_PROTO ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -403,13 +385,13 @@ the :ref:`BTF_Ext_Section` (ELF) or in the arguments to __u32 type; }; -If a BTF_KIND_FUNC_PROTO type is referred by a BTF_KIND_FUNC type, -then ``btf_param.name_off`` must point to a valid C identifier -except for the possible last argument representing the variable -argument. The btf_param.type refers to parameter type. +If a BTF_KIND_FUNC_PROTO type is referred by a BTF_KIND_FUNC type, then +``btf_param.name_off`` must point to a valid C identifier except for the +possible last argument representing the variable argument. The btf_param.type +refers to parameter type. -If the function has variable arguments, the last parameter -is encoded with ``name_off = 0`` and ``type = 0``. +If the function has variable arguments, the last parameter is encoded with +``name_off = 0`` and ``type = 0``. 3. BTF Kernel API ***************** @@ -457,10 +439,9 @@ The workflow typically looks like: 3.1 BPF_BTF_LOAD ================ -Load a blob of BTF data into kernel. A blob of data, -described in :ref:`BTF_Type_String`, -can be directly loaded into the kernel. -A ``btf_fd`` is returned to a userspace. +Load a blob of BTF data into kernel. A blob of data, described in +:ref:`BTF_Type_String`, can be directly loaded into the kernel. A ``btf_fd`` +is returned to a userspace. 
3.2 BPF_MAP_CREATE ================== @@ -482,18 +463,18 @@ In libbpf, the map can be defined with extra annotation like below: }; BPF_ANNOTATE_KV_PAIR(btf_map, int, struct ipv_counts); -Here, the parameters for macro BPF_ANNOTATE_KV_PAIR are map name, -key and value types for the map. -During ELF parsing, libbpf is able to extract key/value type_id's -and assign them to BPF_MAP_CREATE attributes automatically. +Here, the parameters for macro BPF_ANNOTATE_KV_PAIR are map name, key and +value types for the map. During ELF parsing, libbpf is able to extract +key/value type_id's and assign them to BPF_MAP_CREATE attributes +automatically. .. _BPF_Prog_Load: 3.3 BPF_PROG_LOAD ================= -During prog_load, func_info and line_info can be passed to kernel with -proper values for the following attributes: +During prog_load, func_info and line_info can be passed to kernel with proper +values for the following attributes: :: __u32 insn_cnt; @@ -520,9 +501,9 @@ The func_info and line_info are an array of below, respectively.:: __u32 line_col; /* line number and column number */ }; -func_info_rec_size is the size of each func_info record, and line_info_rec_size -is the size of each line_info record. Passing the record size to kernel make -it possible to extend the record itself in the future. +func_info_rec_size is the size of each func_info record, and +line_info_rec_size is the size of each line_info record. Passing the record +size to kernel makes it possible to extend the record itself in the future. Below are requirements for func_info: * func_info[0].insn_off must be 0. @@ -541,13 +522,12 @@ For line_info, the line number and column number are defined as below: 3.4 BPF_{PROG,MAP}_GET_NEXT_ID -In kernel, every loaded program, map or btf has a unique id. -The id won't change during the lifetime of a program, map, or btf. +In kernel, every loaded program, map or btf has a unique id. The id won't +change during the lifetime of a program, map, or btf.
-The bpf syscall command BPF_{PROG,MAP}_GET_NEXT_ID -returns all id's, one for each command, to user space, for bpf -program or maps, respectively, -so an inspection tool can inspect all programs and maps. +The bpf syscall command BPF_{PROG,MAP}_GET_NEXT_ID returns all id's, one for +each command, to user space, for bpf program or maps, respectively, so an +inspection tool can inspect all programs and maps. 3.5 BPF_{PROG,MAP}_GET_FD_BY_ID @@ -557,24 +537,23 @@ A file descriptor needs to be obtained first for reference-counting purpose. 3.6 BPF_OBJ_GET_INFO_BY_FD ========================== -Once a program/map fd is acquired, an introspection tool can -get the detailed information from kernel about this fd, -some of which are BTF-related. For example, -``bpf_map_info`` returns ``btf_id`` and key/value type ids. -``bpf_prog_info`` returns ``btf_id``, func_info, and line info -for translated bpf byte codes, and jited_line_info. +Once a program/map fd is acquired, an introspection tool can get the detailed +information from kernel about this fd, some of which are BTF-related. For +example, ``bpf_map_info`` returns ``btf_id`` and key/value type ids. +``bpf_prog_info`` returns ``btf_id``, func_info, and line info for translated +bpf byte codes, and jited_line_info. 3.7 BPF_BTF_GET_FD_BY_ID ======================== -With ``btf_id`` obtained in ``bpf_map_info`` and ``bpf_prog_info``, -bpf syscall command BPF_BTF_GET_FD_BY_ID can retrieve a btf fd. -Then, with command BPF_OBJ_GET_INFO_BY_FD, the btf blob, originally -loaded into the kernel with BPF_BTF_LOAD, can be retrieved. +With ``btf_id`` obtained in ``bpf_map_info`` and ``bpf_prog_info``, bpf +syscall command BPF_BTF_GET_FD_BY_ID can retrieve a btf fd. Then, with +command BPF_OBJ_GET_INFO_BY_FD, the btf blob, originally loaded into the +kernel with BPF_BTF_LOAD, can be retrieved. 
With the btf blob, ``bpf_map_info``, and ``bpf_prog_info``, an introspection -tool has full btf knowledge and is able to pretty print map key/values, -dump func signatures and line info, along with byte/jit codes. +tool has full btf knowledge and is able to pretty print map key/values, dump +func signatures and line info, along with byte/jit codes. 4. ELF File Format Interface **************************** @@ -582,19 +561,19 @@ dump func signatures and line info, along with byte/jit codes. 4.1 .BTF section ================ -The .BTF section contains type and string data. The format of this section -is same as the one describe in :ref:`BTF_Type_String`. +The .BTF section contains type and string data. The format of this section is +same as the one described in :ref:`BTF_Type_String`. .. _BTF_Ext_Section: 4.2 .BTF.ext section ==================== -The .BTF.ext section encodes func_info and line_info which -needs loader manipulation before loading into the kernel. +The .BTF.ext section encodes func_info and line_info which need loader +manipulation before loading into the kernel. -The specification for .BTF.ext section is defined at -``tools/lib/bpf/btf.h`` and ``tools/lib/bpf/btf.c``. +The specification for .BTF.ext section is defined at ``tools/lib/bpf/btf.h`` +and ``tools/lib/bpf/btf.c``. The current header of .BTF.ext section:: @@ -611,9 +590,9 @@ The current header of .BTF.ext section:: __u32 line_info_len; }; -It is very similar to .BTF section. Instead of type/string section, -it contains func_info and line_info section. See :ref:`BPF_Prog_Load` -for details about func_info and line_info record format. +It is very similar to .BTF section. Instead of type/string section, it +contains func_info and line_info section. See :ref:`BPF_Prog_Load` for details +about func_info and line_info record format. The func_info is organized as below.:: @@ -622,9 +601,9 @@ The func_info is organized as below.:: btf_ext_info_sec for section #2 /* func_info for section #2 */ ...
-``func_info_rec_size`` specifies the size of ``bpf_func_info`` structure -when .BTF.ext is generated. ``btf_ext_info_sec``, defined below, is -a collection of func_info for each specific ELF section.:: +``func_info_rec_size`` specifies the size of ``bpf_func_info`` structure when +.BTF.ext is generated. ``btf_ext_info_sec``, defined below, is a collection of +func_info for each specific ELF section.:: struct btf_ext_info_sec { __u32 sec_name_off; /* offset to section name */ @@ -642,14 +621,14 @@ The line_info is organized as below.:: btf_ext_info_sec for section #2 /* line_info for section #2 */ ... -``line_info_rec_size`` specifies the size of ``bpf_line_info`` structure -when .BTF.ext is generated. +``line_info_rec_size`` specifies the size of ``bpf_line_info`` structure when +.BTF.ext is generated. The interpretation of ``bpf_func_info->insn_off`` and -``bpf_line_info->insn_off`` is different between kernel API and ELF API. -For kernel API, the ``insn_off`` is the instruction offset in the unit -of ``struct bpf_insn``. For ELF API, the ``insn_off`` is the byte offset -from the beginning of section (``btf_ext_info_sec->sec_name_off``). +``bpf_line_info->insn_off`` is different between kernel API and ELF API. For +kernel API, the ``insn_off`` is the instruction offset in the unit of ``struct +bpf_insn``. For ELF API, the ``insn_off`` is the byte offset from the +beginning of section (``btf_ext_info_sec->sec_name_off``). 5. Using BTF ************ @@ -657,10 +636,9 @@ from the beginning of section (``btf_ext_info_sec->sec_name_off``). 5.1 bpftool map pretty print ============================ -With BTF, the map key/value can be printed based on fields rather than -simply raw bytes. This is especially -valuable for large structure or if your data structure -has bitfields. For example, for the following map,:: +With BTF, the map key/value can be printed based on fields rather than simply +raw bytes. 
This is especially valuable for large structure or if your data +structure has bitfields. For example, for the following map,:: enum A { A1, A2, A3, A4, A5 }; typedef enum A ___A; @@ -700,9 +678,9 @@ bpftool is able to pretty print like below: 5.2 bpftool prog dump ===================== -The following is an example showing how func_info and line_info -can help prog dump with better kernel symbol names, function prototypes -and line information.:: +The following is an example showing how func_info and line_info can help prog +dump with better kernel symbol names, function prototypes and line +information.:: $ bpftool prog dump jited pinned /sys/fs/bpf/test_btf_haskv [...] @@ -734,7 +712,8 @@ and line information.:: 5.3 Verifier Log ================ -The following is an example of how line_info can help debugging verification failure.:: +The following is an example of how line_info can help debugging verification +failure.:: /* The code at tools/testing/selftests/bpf/test_xdp_noinline.c * is modified as below. @@ -763,8 +742,8 @@ You need latest pahole https://git.kernel.org/pub/scm/devel/pahole/pahole.git/ -or llvm (8.0 or later). The pahole acts as a dwarf2btf converter. It doesn't support .BTF.ext -and btf BTF_KIND_FUNC type yet. For example,:: +or llvm (8.0 or later). The pahole acts as a dwarf2btf converter. It doesn't +support .BTF.ext and btf BTF_KIND_FUNC type yet. For example,:: -bash-4.4$ cat t.c struct t { @@ -781,8 +760,9 @@ and btf BTF_KIND_FUNC type yet. For example,:: c type_id=2 bitfield_size=2 bits_offset=5 [2] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED -The llvm is able to generate .BTF and .BTF.ext directly with -g for bpf target only. -The assembly code (-S) is able to show the BTF encoding in assembly format.:: +The llvm is able to generate .BTF and .BTF.ext directly with -g for bpf target +only. 
The assembly code (-S) is able to show the BTF encoding in assembly +format.:: -bash-4.4$ cat t2.c typedef int __int32; -- cgit From 46604676c8c6c4c07649767d32ae66f4429ccd6f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 28 Feb 2019 17:12:21 -0800 Subject: docs/bpf: minor casing/punctuation fixes Fix few casing and punctuation glitches. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- Documentation/bpf/bpf_design_QA.rst | 24 ++++++++++++------------ Documentation/networking/filter.txt | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'Documentation') diff --git a/Documentation/bpf/bpf_design_QA.rst b/Documentation/bpf/bpf_design_QA.rst index 7cc9e368c1e9..10453c627135 100644 --- a/Documentation/bpf/bpf_design_QA.rst +++ b/Documentation/bpf/bpf_design_QA.rst @@ -36,27 +36,27 @@ consideration important quirks of other architectures) and defines calling convention that is compatible with C calling convention of the linux kernel on those architectures. -Q: can multiple return values be supported in the future? +Q: Can multiple return values be supported in the future? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ A: NO. BPF allows only register R0 to be used as return value. -Q: can more than 5 function arguments be supported in the future? +Q: Can more than 5 function arguments be supported in the future? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ A: NO. BPF calling convention only allows registers R1-R5 to be used as arguments. BPF is not a standalone instruction set. (unlike x64 ISA that allows msft, cdecl and other conventions) -Q: can BPF programs access instruction pointer or return address? +Q: Can BPF programs access instruction pointer or return address? ----------------------------------------------------------------- A: NO. -Q: can BPF programs access stack pointer ? +Q: Can BPF programs access stack pointer ? ------------------------------------------ A: NO. 
Only frame pointer (register R10) is accessible. From compiler point of view it's necessary to have stack pointer. -For example LLVM defines register R11 as stack pointer in its +For example, LLVM defines register R11 as stack pointer in its BPF backend, but it makes sure that generated code never uses it. Q: Does C-calling convention diminishes possible use cases? @@ -66,8 +66,8 @@ A: YES. BPF design forces addition of major functionality in the form of kernel helper functions and kernel objects like BPF maps with seamless interoperability between them. It lets kernel call into -BPF programs and programs call kernel helpers with zero overhead. -As all of them were native C code. That is particularly the case +BPF programs and programs call kernel helpers with zero overhead, +as all of them were native C code. That is particularly the case for JITed BPF programs that are indistinguishable from native kernel C code. @@ -75,9 +75,9 @@ Q: Does it mean that 'innovative' extensions to BPF code are disallowed? ------------------------------------------------------------------------ A: Soft yes. -At least for now until BPF core has support for +At least for now, until BPF core has support for bpf-to-bpf calls, indirect calls, loops, global variables, -jump tables, read only sections and all other normal constructs +jump tables, read-only sections, and all other normal constructs that C code can produce. Q: Can loops be supported in a safe way? @@ -109,16 +109,16 @@ For example why BPF_JNE and other compare and jumps are not cpu-like? A: This was necessary to avoid introducing flags into ISA which are impossible to make generic and efficient across CPU architectures. -Q: why BPF_DIV instruction doesn't map to x64 div? +Q: Why BPF_DIV instruction doesn't map to x64 div? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ A: Because if we picked one-to-one relationship to x64 it would have made it more complicated to support on arm64 and other archs. 
Also it needs div-by-zero runtime check. -Q: why there is no BPF_SDIV for signed divide operation? +Q: Why there is no BPF_SDIV for signed divide operation? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ A: Because it would be rarely used. llvm errors in such case and -prints a suggestion to use unsigned divide instead +prints a suggestion to use unsigned divide instead. Q: Why BPF has implicit prologue and epilogue? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index b5e060edfc38..319e5e041f38 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -829,7 +829,7 @@ tracing filters may do to maintain counters of events, for example. Register R9 is not used by socket filters either, but more complex filters may be running out of registers and would have to resort to spill/fill to stack. -Internal BPF can used as generic assembler for last step performance +Internal BPF can be used as a generic assembler for last step performance optimizations, socket filters and seccomp are using it as assembler. Tracing filters may use it as assembler to generate code from kernel. In kernel usage may not be bounded by security considerations, since generated internal BPF code -- cgit From 8e4a07405d9dae05aecb629d356ae0cb9821ea2c Mon Sep 17 00:00:00 2001 From: Stefan Schmidt Date: Wed, 27 Feb 2019 20:59:13 +0100 Subject: doc: net: ieee802154: introduce IEEE 802.15.4 subsystem doc in rst style Moving the ieee802154 docs from a plain text file into the new rst style. This commit only does the minimal needed change to bring the documentation over. Follow up patches will improve and extend on this. Signed-off-by: Stefan Schmidt Tested-by: Randy Dunlap Signed-off-by: David S. 
Miller --- Documentation/networking/ieee802154.rst | 180 ++++++++++++++++++++++++++++++++ Documentation/networking/index.rst | 1 + 2 files changed, 181 insertions(+) create mode 100644 Documentation/networking/ieee802154.rst (limited to 'Documentation') diff --git a/Documentation/networking/ieee802154.rst b/Documentation/networking/ieee802154.rst new file mode 100644 index 000000000000..36ca823a1122 --- /dev/null +++ b/Documentation/networking/ieee802154.rst @@ -0,0 +1,180 @@ +=============================== +IEEE 802.15.4 Developer's Guide +=============================== + +Introduction +============ +The IEEE 802.15.4 working group focuses on standardization of the bottom +two layers: Medium Access Control (MAC) and Physical access (PHY). And there +are mainly two options available for upper layers: + +- ZigBee - proprietary protocol from the ZigBee Alliance +- 6LoWPAN - IPv6 networking over low rate personal area networks + +The goal of the Linux-wpan is to provide a complete implementation +of the IEEE 802.15.4 and 6LoWPAN protocols. IEEE 802.15.4 is a stack +of protocols for organizing Low-Rate Wireless Personal Area Networks. + +The stack is composed of three main parts: + +- IEEE 802.15.4 layer; We have chosen to use plain Berkeley socket API, + the generic Linux networking stack to transfer IEEE 802.15.4 data + messages and a special protocol over netlink for configuration/management +- MAC - provides access to shared channel and reliable data delivery +- PHY - represents device drivers + +Socket API +========== + +.. c:function:: int sd = socket(PF_IEEE802154, SOCK_DGRAM, 0); + +The address family, socket addresses etc. are defined in the +include/net/af_ieee802154.h header or in the special header +in the userspace package (see either http://wpan.cakelab.org/ or the +git tree at https://github.com/linux-wpan/wpan-tools). 
+ +6LoWPAN Linux implementation +============================ + +The IEEE 802.15.4 standard specifies an MTU of 127 bytes, yielding about 80 +octets of actual MAC payload once security is turned on, on a wireless link +with a link throughput of 250 kbps or less. The 6LoWPAN adaptation format +[RFC4944] was specified to carry IPv6 datagrams over such constrained links, +taking into account limited bandwidth, memory, or energy resources that are +expected in applications such as wireless Sensor Networks. [RFC4944] defines +a Mesh Addressing header to support sub-IP forwarding, a Fragmentation header +to support the IPv6 minimum MTU requirement [RFC2460], and stateless header +compression for IPv6 datagrams (LOWPAN_HC1 and LOWPAN_HC2) to reduce the +relatively large IPv6 and UDP headers down to (in the best case) several bytes. + +In September 2011 the standard update was published - [RFC6282]. +It deprecates HC1 and HC2 compression and defines IPHC encoding format which is +used in this Linux implementation. + +All the code related to 6lowpan you may find in files: net/6lowpan/* +and net/ieee802154/6lowpan/* + +To set up a 6LoWPAN interface you need: +1. Add IEEE802.15.4 interface and set channel and PAN ID; +2. Add 6lowpan interface by command like: +# ip link add link wpan0 name lowpan0 type lowpan +3. Bring up 'lowpan0' interface + +Drivers +======= + +Like with WiFi, there are several types of devices implementing IEEE 802.15.4. +1) 'HardMAC'. The MAC layer is implemented in the device itself, the device +exports a management (e.g. MLME) and data API. +2) 'SoftMAC' or just radio. These types of devices are just radio transceivers +possibly with some kinds of acceleration like automatic CRC computation and +comparison, automagic ACK handling, address matching, etc. + +Those types of devices require a different approach to be hooked into the Linux kernel. + +HardMAC +------- + +See the header include/net/ieee802154_netdev.h.
You have to implement Linux +net_device, with .type = ARPHRD_IEEE802154. Data is exchanged with socket family +code via plain sk_buffs. On skb reception skb->cb must contain additional +info as described in the struct ieee802154_mac_cb. During packet transmission +the skb->cb is used to provide additional data to device's header_ops->create +function. Be aware that this data can be overridden later (when socket code +submits skb to qdisc), so if you need something from that cb later, you should +store info in the skb->data on your own. + +To hook the MLME interface you have to populate the ml_priv field of your +net_device with a pointer to struct ieee802154_mlme_ops instance. The fields +assoc_req, assoc_resp, disassoc_req, start_req, and scan_req are optional. +All other fields are required. + +SoftMAC +------- + +The MAC is the middle layer in the IEEE 802.15.4 Linux stack. At the moment it +provides an interface for driver registration and management of slave interfaces. + +NOTE: Currently only the monitor device type is supported - it's the IEEE 802.15.4 +stack interface for network sniffers (e.g. WireShark). + +This layer is going to be extended soon. + +See header include/net/mac802154.h and several drivers in +drivers/net/ieee802154/. + +Fake drivers +------------ + +In addition there is a driver available which simulates a real device with +SoftMAC (fakelb - IEEE 802.15.4 loopback driver) interface. This option +provides a possibility to test and debug the stack without usage of real hardware. + +Device drivers API +================== + +The include/net/mac802154.h defines the following functions: + +.. c:function:: struct ieee802154_hw *ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops) + +Allocation of IEEE 802.15.4 compatible hardware device. + +.. c:function:: void ieee802154_free_hw(struct ieee802154_hw *hw) + +Freeing allocated hardware device. + +.. c:function:: int ieee802154_register_hw(struct ieee802154_hw *hw) + +Register PHY, which is the allocated hardware device, in the system.
+ +.. c:function:: void ieee802154_unregister_hw(struct ieee802154_hw *hw) + +Freeing registered PHY. + +.. c:function:: void ieee802154_rx_irqsafe(struct ieee802154_hw *hw, struct sk_buff *skb, u8 lqi) + +Telling 802.15.4 module there is a new received frame in the skb with +the RF Link Quality Indicator (LQI) from the hardware device. + +.. c:function:: void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb, bool ifs_handling) + +Telling 802.15.4 module the frame in the skb is or is going to be +transmitted through the hardware device. + +The device driver must implement the following callbacks in the IEEE 802.15.4 +operations structure at least:: + + struct ieee802154_ops { + ... + int (*start)(struct ieee802154_hw *hw); + void (*stop)(struct ieee802154_hw *hw); + ... + int (*xmit_async)(struct ieee802154_hw *hw, struct sk_buff *skb); + int (*ed)(struct ieee802154_hw *hw, u8 *level); + int (*set_channel)(struct ieee802154_hw *hw, u8 page, u8 channel); + ... + }; + +.. c:function:: int start(struct ieee802154_hw *hw) + +Handler that 802.15.4 module calls for the hardware device initialization. + +.. c:function:: void stop(struct ieee802154_hw *hw) + +Handler that 802.15.4 module calls for the hardware device cleanup. + +.. c:function:: int xmit_async(struct ieee802154_hw *hw, struct sk_buff *skb) + +Handler that 802.15.4 module calls for each frame in the skb going to be +transmitted through the hardware device. + +.. c:function:: int ed(struct ieee802154_hw *hw, u8 *level) + +Handler that 802.15.4 module calls for Energy Detection from the hardware +device. + +.. c:function:: int set_channel(struct ieee802154_hw *hw, u8 page, u8 channel) + +Set radio for listening on specific channel of the hardware device. + +Moreover IEEE 802.15.4 device operations structure should be filled.
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index b08cf145d5eb..f0da1b001514 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -25,6 +25,7 @@ Contents: device_drivers/intel/iavf device_drivers/intel/ice devlink-info-versions + ieee802154 kapi z8530book msg_zerocopy -- cgit From 8a42eda2582ada073d7320ca703fb5fa49600e87 Mon Sep 17 00:00:00 2001 From: Stefan Schmidt Date: Wed, 27 Feb 2019 20:59:14 +0100 Subject: doc: net: ieee802154: remove old plain text docs after switching to rst The plain text docs are converted to rst now, which allows us to remove the old text file from the tree. Signed-off-by: Stefan Schmidt Signed-off-by: David S. Miller --- Documentation/networking/ieee802154.txt | 177 -------------------------------- 1 file changed, 177 deletions(-) delete mode 100644 Documentation/networking/ieee802154.txt (limited to 'Documentation') diff --git a/Documentation/networking/ieee802154.txt b/Documentation/networking/ieee802154.txt deleted file mode 100644 index e74d8e1da0e2..000000000000 --- a/Documentation/networking/ieee802154.txt +++ /dev/null @@ -1,177 +0,0 @@ - - Linux IEEE 802.15.4 implementation - - -Introduction -============ -The IEEE 802.15.4 working group focuses on standardization of the bottom -two layers: Medium Access Control (MAC) and Physical access (PHY). And there -are mainly two options available for upper layers: - - ZigBee - proprietary protocol from the ZigBee Alliance - - 6LoWPAN - IPv6 networking over low rate personal area networks - -The goal of the Linux-wpan is to provide a complete implementation -of the IEEE 802.15.4 and 6LoWPAN protocols. IEEE 802.15.4 is a stack -of protocols for organizing Low-Rate Wireless Personal Area Networks. 
- -The stack is composed of three main parts: - - IEEE 802.15.4 layer; We have chosen to use plain Berkeley socket API, - the generic Linux networking stack to transfer IEEE 802.15.4 data - messages and a special protocol over netlink for configuration/management - - MAC - provides access to shared channel and reliable data delivery - - PHY - represents device drivers - - -Socket API -========== - -int sd = socket(PF_IEEE802154, SOCK_DGRAM, 0); -..... - -The address family, socket addresses etc. are defined in the -include/net/af_ieee802154.h header or in the special header -in the userspace package (see either http://wpan.cakelab.org/ or the -git tree at https://github.com/linux-wpan/wpan-tools). - - -Kernel side -============= - -Like with WiFi, there are several types of devices implementing IEEE 802.15.4. -1) 'HardMAC'. The MAC layer is implemented in the device itself, the device - exports a management (e.g. MLME) and data API. -2) 'SoftMAC' or just radio. These types of devices are just radio transceivers - possibly with some kinds of acceleration like automatic CRC computation and - comparation, automagic ACK handling, address matching, etc. - -Those types of devices require different approach to be hooked into Linux kernel. - - -HardMAC -======= - -See the header include/net/ieee802154_netdev.h. You have to implement Linux -net_device, with .type = ARPHRD_IEEE802154. Data is exchanged with socket family -code via plain sk_buffs. On skb reception skb->cb must contain additional -info as described in the struct ieee802154_mac_cb. During packet transmission -the skb->cb is used to provide additional data to device's header_ops->create -function. Be aware that this data can be overridden later (when socket code -submits skb to qdisc), so if you need something from that cb later, you should -store info in the skb->data on your own. 
- -To hook the MLME interface you have to populate the ml_priv field of your -net_device with a pointer to struct ieee802154_mlme_ops instance. The fields -assoc_req, assoc_resp, disassoc_req, start_req, and scan_req are optional. -All other fields are required. - - -SoftMAC -======= - -The MAC is the middle layer in the IEEE 802.15.4 Linux stack. This moment it -provides interface for drivers registration and management of slave interfaces. - -NOTE: Currently the only monitor device type is supported - it's IEEE 802.15.4 -stack interface for network sniffers (e.g. WireShark). - -This layer is going to be extended soon. - -See header include/net/mac802154.h and several drivers in -drivers/net/ieee802154/. - - -Device drivers API -================== - -The include/net/mac802154.h defines following functions: - - struct ieee802154_hw * - ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops): - allocation of IEEE 802.15.4 compatible hardware device - - - void ieee802154_free_hw(struct ieee802154_hw *hw): - freeing allocated hardware device - - - int ieee802154_register_hw(struct ieee802154_hw *hw): - register PHY which is the allocated hardware device, in the system - - - void ieee802154_unregister_hw(struct ieee802154_hw *hw): - freeing registered PHY - - - void ieee802154_rx_irqsafe(struct ieee802154_hw *hw, struct sk_buff *skb, - u8 lqi): - telling 802.15.4 module there is a new received frame in the skb with - the RF Link Quality Indicator (LQI) from the hardware device - - - void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb, - bool ifs_handling): - telling 802.15.4 module the frame in the skb is or going to be - transmitted through the hardware device - -The device driver must implement the following callbacks in the IEEE 802.15.4 -operations structure at least: -struct ieee802154_ops { - ... - int (*start)(struct ieee802154_hw *hw); - void (*stop)(struct ieee802154_hw *hw); - ... 
- int (*xmit_async)(struct ieee802154_hw *hw, struct sk_buff *skb); - int (*ed)(struct ieee802154_hw *hw, u8 *level); - int (*set_channel)(struct ieee802154_hw *hw, u8 page, u8 channel); - ... -}; - - - int start(struct ieee802154_hw *hw): - handler that 802.15.4 module calls for the hardware device initialization. - - - void stop(struct ieee802154_hw *hw): - handler that 802.15.4 module calls for the hardware device cleanup. - - - int xmit_async(struct ieee802154_hw *hw, struct sk_buff *skb): - handler that 802.15.4 module calls for each frame in the skb going to be - transmitted through the hardware device. - - - int ed(struct ieee802154_hw *hw, u8 *level): - handler that 802.15.4 module calls for Energy Detection from the hardware - device. - - - int set_channel(struct ieee802154_hw *hw, u8 page, u8 channel): - set radio for listening on specific channel of the hardware device. - -Moreover IEEE 802.15.4 device operations structure should be filled. - -Fake drivers -============ - -In addition there is a driver available which simulates a real device with -SoftMAC (fakelb - IEEE 802.15.4 loopback driver) interface. This option -provides a possibility to test and debug the stack without usage of real hardware. - -See sources in drivers/net/ieee802154 folder for more details. - - -6LoWPAN Linux implementation -============================ - -The IEEE 802.15.4 standard specifies an MTU of 127 bytes, yielding about 80 -octets of actual MAC payload once security is turned on, on a wireless link -with a link throughput of 250 kbps or less. The 6LoWPAN adaptation format -[RFC4944] was specified to carry IPv6 datagrams over such constrained links, -taking into account limited bandwidth, memory, or energy resources that are -expected in applications such as wireless Sensor Networks. 
[RFC4944] defines -a Mesh Addressing header to support sub-IP forwarding, a Fragmentation header -to support the IPv6 minimum MTU requirement [RFC2460], and stateless header -compression for IPv6 datagrams (LOWPAN_HC1 and LOWPAN_HC2) to reduce the -relatively large IPv6 and UDP headers down to (in the best case) several bytes. - -In September 2011 the standard update was published - [RFC6282]. -It deprecates HC1 and HC2 compression and defines IPHC encoding format which is -used in this Linux implementation. - -All the code related to 6lowpan you may find in files: net/6lowpan/* -and net/ieee802154/6lowpan/* - -To setup a 6LoWPAN interface you need: -1. Add IEEE802.15.4 interface and set channel and PAN ID; -2. Add 6lowpan interface by command like: - # ip link add link wpan0 name lowpan0 type lowpan -3. Bring up 'lowpan0' interface -- cgit From 91cf8eceffc131d41f098351e1b290bdaab45ea7 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 27 Feb 2019 16:29:16 -0800 Subject: switchdev: Remove unused transaction item queue There are no more in tree users of the switchdev_trans_item_{dequeue,enqueue} or switchdev_trans_item structure in the kernel since commit 00fc0c51e35b ("rocker: Change world_ops API and implementation to be switchdev independant"). Remove this unused code and update the documentation accordingly since. Signed-off-by: Florian Fainelli Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- Documentation/networking/switchdev.txt | 19 ------- include/net/switchdev.h | 12 ---- net/switchdev/switchdev.c | 100 +-------------------------------- 3 files changed, 2 insertions(+), 129 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/switchdev.txt b/Documentation/networking/switchdev.txt index 633dd1fd81b7..86174ce8cd13 100644 --- a/Documentation/networking/switchdev.txt +++ b/Documentation/networking/switchdev.txt @@ -371,22 +371,3 @@ The driver can monitor for updates to arp_tbl using the netevent notifier NETEVENT_NEIGH_UPDATE. The device can be programmed with resolved nexthops for the routes as arp_tbl updates. The driver implements ndo_neigh_destroy to know when arp_tbl neighbor entries are purged from the port. - -Transaction item queue -^^^^^^^^^^^^^^^^^^^^^^ - -For switchdev ops attr_set and obj_add, there is a 2 phase transaction model -used. First phase is to "prepare" anything needed, including various checks, -memory allocation, etc. The goal is to handle the stuff that is not unlikely -to fail here. The second phase is to "commit" the actual changes. - -Switchdev provides an infrastructure for sharing items (for example memory -allocations) between the two phases. - -The object created by a driver in "prepare" phase and it is queued up by: -switchdev_trans_item_enqueue() -During the "commit" phase, the driver gets the object by: -switchdev_trans_item_dequeue() - -If a transaction is aborted during "prepare" phase, switchdev code will handle -cleanup of the queued-up objects. 
diff --git a/include/net/switchdev.h b/include/net/switchdev.h index e4f751e19ecf..0ebd67ae7012 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -20,14 +20,7 @@ #define SWITCHDEV_F_SKIP_EOPNOTSUPP BIT(1) #define SWITCHDEV_F_DEFER BIT(2) -struct switchdev_trans_item { - struct list_head list; - void *data; - void (*destructor)(const void *data); -}; - struct switchdev_trans { - struct list_head item_list; bool ph_prepare; }; @@ -105,11 +98,6 @@ struct switchdev_obj_port_mdb { #define SWITCHDEV_OBJ_PORT_MDB(OBJ) \ container_of((OBJ), struct switchdev_obj_port_mdb, obj) -void switchdev_trans_item_enqueue(struct switchdev_trans *trans, - void *data, void (*destructor)(void const *), - struct switchdev_trans_item *tritem); -void *switchdev_trans_item_dequeue(struct switchdev_trans *trans); - typedef int switchdev_obj_dump_cb_t(struct switchdev_obj *obj); enum switchdev_notifier_type { diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index d81cfcee9ad9..90ba4a1f0a6d 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -23,78 +23,6 @@ #include #include -/** - * switchdev_trans_item_enqueue - Enqueue data item to transaction queue - * - * @trans: transaction - * @data: pointer to data being queued - * @destructor: data destructor - * @tritem: transaction item being queued - * - * Enqeueue data item to transaction queue. tritem is typically placed in - * cointainter pointed at by data pointer. Destructor is called on - * transaction abort and after successful commit phase in case - * the caller did not dequeue the item before. 
- */ -void switchdev_trans_item_enqueue(struct switchdev_trans *trans, - void *data, void (*destructor)(void const *), - struct switchdev_trans_item *tritem) -{ - tritem->data = data; - tritem->destructor = destructor; - list_add_tail(&tritem->list, &trans->item_list); -} -EXPORT_SYMBOL_GPL(switchdev_trans_item_enqueue); - -static struct switchdev_trans_item * -__switchdev_trans_item_dequeue(struct switchdev_trans *trans) -{ - struct switchdev_trans_item *tritem; - - if (list_empty(&trans->item_list)) - return NULL; - tritem = list_first_entry(&trans->item_list, - struct switchdev_trans_item, list); - list_del(&tritem->list); - return tritem; -} - -/** - * switchdev_trans_item_dequeue - Dequeue data item from transaction queue - * - * @trans: transaction - */ -void *switchdev_trans_item_dequeue(struct switchdev_trans *trans) -{ - struct switchdev_trans_item *tritem; - - tritem = __switchdev_trans_item_dequeue(trans); - BUG_ON(!tritem); - return tritem->data; -} -EXPORT_SYMBOL_GPL(switchdev_trans_item_dequeue); - -static void switchdev_trans_init(struct switchdev_trans *trans) -{ - INIT_LIST_HEAD(&trans->item_list); -} - -static void switchdev_trans_items_destroy(struct switchdev_trans *trans) -{ - struct switchdev_trans_item *tritem; - - while ((tritem = __switchdev_trans_item_dequeue(trans))) - tritem->destructor(tritem->data); -} - -static void switchdev_trans_items_warn_destroy(struct net_device *dev, - struct switchdev_trans *trans) -{ - WARN(!list_empty(&trans->item_list), "%s: transaction item queue is not empty.\n", - dev->name); - switchdev_trans_items_destroy(trans); -} - static LIST_HEAD(deferred); static DEFINE_SPINLOCK(deferred_lock); @@ -208,8 +136,6 @@ static int switchdev_port_attr_set_now(struct net_device *dev, struct switchdev_trans trans; int err; - switchdev_trans_init(&trans); - /* Phase I: prepare for attr set. Driver/device should fail * here if there are going to be issues in the commit phase, * such as lack of resources or support. 
The driver/device @@ -220,17 +146,8 @@ static int switchdev_port_attr_set_now(struct net_device *dev, trans.ph_prepare = true; err = switchdev_port_attr_notify(SWITCHDEV_PORT_ATTR_SET, dev, attr, &trans); - if (err) { - /* Prepare phase failed: abort the transaction. Any - * resources reserved in the prepare phase are - * released. - */ - - if (err != -EOPNOTSUPP) - switchdev_trans_items_destroy(&trans); - + if (err) return err; - } /* Phase II: commit attr set. This cannot fail as a fault * of driver/device. If it does, it's a bug in the driver/device @@ -242,7 +159,6 @@ static int switchdev_port_attr_set_now(struct net_device *dev, &trans); WARN(err, "%s: Commit of attribute (id=%d) failed.\n", dev->name, attr->id); - switchdev_trans_items_warn_destroy(dev, &trans); return err; } @@ -341,8 +257,6 @@ static int switchdev_port_obj_add_now(struct net_device *dev, ASSERT_RTNL(); - switchdev_trans_init(&trans); - /* Phase I: prepare for obj add. Driver/device should fail * here if there are going to be issues in the commit phase, * such as lack of resources or support. The driver/device @@ -353,17 +267,8 @@ static int switchdev_port_obj_add_now(struct net_device *dev, trans.ph_prepare = true; err = switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_ADD, dev, obj, &trans, extack); - if (err) { - /* Prepare phase failed: abort the transaction. Any - * resources reserved in the prepare phase are - * released. - */ - - if (err != -EOPNOTSUPP) - switchdev_trans_items_destroy(&trans); - + if (err) return err; - } /* Phase II: commit obj add. This cannot fail as a fault * of driver/device. 
If it does, it's a bug in the driver/device @@ -374,7 +279,6 @@ static int switchdev_port_obj_add_now(struct net_device *dev, err = switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_ADD, dev, obj, &trans, extack); WARN(err, "%s: Commit of object (id=%d) failed.\n", dev->name, obj->id); - switchdev_trans_items_warn_destroy(dev, &trans); return err; } -- cgit From b805c403c859756175fefc213065125da16b808d Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Fri, 1 Mar 2019 10:14:07 +0800 Subject: dt-bindings: net: bluetooth: add support for MediaTek MT7663U and MT7668U UART devices Update binding document with adding support of MT7663U and MT7668U UART devices to mediatek-bluetooth. Reviewed-by: Rob Herring Signed-off-by: Sean Wang Signed-off-by: Marcel Holtmann --- .../devicetree/bindings/net/mediatek-bluetooth.txt | 64 ++++++++++++++++++++++ 1 file changed, 64 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/mediatek-bluetooth.txt b/Documentation/devicetree/bindings/net/mediatek-bluetooth.txt index 14ceb2a5b4e8..41a7dcc80f5b 100644 --- a/Documentation/devicetree/bindings/net/mediatek-bluetooth.txt +++ b/Documentation/devicetree/bindings/net/mediatek-bluetooth.txt @@ -33,3 +33,67 @@ Example: clock-names = "ref"; }; }; + +MediaTek UART based Bluetooth Devices +================================== + +This device is a serial attached device to UART device and thus it must be a +child node of the serial node with UART. + +Please refer to the following documents for generic properties: + + Documentation/devicetree/bindings/serial/slave-device.txt + +Required properties: + +- compatible: Must be + "mediatek,mt7663u-bluetooth": for MT7663U device + "mediatek,mt7668u-bluetooth": for MT7668U device +- vcc-supply: Main voltage regulator +- pinctrl-names: Should be "default", "runtime" +- pinctrl-0: Should contain UART RXD low when the device is powered up to + enter proper bootstrap mode. 
+- pinctrl-1: Should contain UART mode pin ctrl + +Optional properties: + +- reset-gpios: GPIO used to reset the device whose initial state keeps low, + if the GPIO is missing, then board-level design should be + guaranteed. +- current-speed: Current baud rate of the device whose defaults to 921600 + +Example: + + uart1_pins_boot: uart1-default { + pins-dat { + pinmux = ; + output-low; + }; + }; + + uart1_pins_runtime: uart1-runtime { + pins-dat { + pinmux = , + ; + }; + }; + + uart1: serial@11003000 { + compatible = "mediatek,mt7623-uart", + "mediatek,mt6577-uart"; + reg = <0 0x11003000 0 0x400>; + interrupts = ; + clocks = <&pericfg CLK_PERI_UART1_SEL>, + <&pericfg CLK_PERI_UART1>; + clock-names = "baud", "bus"; + + bluetooth { + compatible = "mediatek,mt7663u-bluetooth"; + vcc-supply = <&reg_5v>; + reset-gpios = <&pio 24 GPIO_ACTIVE_LOW>; + pinctrl-names = "default", "runtime"; + pinctrl-0 = <&uart1_pins_boot>; + pinctrl-1 = <&uart1_pins_runtime>; + current-speed = <921600>; + }; + }; -- cgit From a1c0ed24fe9babc53143a5452a0f85cae6e37ba7 Mon Sep 17 00:00:00 2001 From: Tristram Ha Date: Thu, 28 Feb 2019 19:57:22 -0800 Subject: dt-bindings: net: dsa: document additional Microchip KSZ9477 family switches Document additional Microchip KSZ9477 family switches. Show how KSZ8565 switch should be configured as the host port is port 7 instead of port 5. Signed-off-by: Tristram Ha Reviewed-by: Florian Fainelli Signed-off-by: David S.
Miller --- Documentation/devicetree/bindings/net/dsa/ksz.txt | 43 +++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/dsa/ksz.txt b/Documentation/devicetree/bindings/net/dsa/ksz.txt index 8d58c2a7de39..e7db7268fd0f 100644 --- a/Documentation/devicetree/bindings/net/dsa/ksz.txt +++ b/Documentation/devicetree/bindings/net/dsa/ksz.txt @@ -7,6 +7,11 @@ Required properties: of the following: - "microchip,ksz9477" - "microchip,ksz9897" + - "microchip,ksz9896" + - "microchip,ksz9567" + - "microchip,ksz8565" + - "microchip,ksz9893" + - "microchip,ksz9563" Optional properties: @@ -73,4 +78,42 @@ Ethernet switch connected via SPI to the host, CPU port wired to eth0: }; }; }; + ksz8565: ksz8565@0 { + compatible = "microchip,ksz8565"; + reg = <0>; + + spi-max-frequency = <44000000>; + spi-cpha; + spi-cpol; + + ports { + #address-cells = <1>; + #size-cells = <0>; + port@0 { + reg = <0>; + label = "lan1"; + }; + port@1 { + reg = <1>; + label = "lan2"; + }; + port@2 { + reg = <2>; + label = "lan3"; + }; + port@3 { + reg = <3>; + label = "lan4"; + }; + port@6 { + reg = <6>; + label = "cpu"; + ethernet = <&eth0>; + fixed-link { + speed = <1000>; + full-duplex; + }; + }; + }; + }; }; -- cgit