iPXE
ipoib.c
Go to the documentation of this file.
00001 /*
00002  * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
00003  *
00004  * This program is free software; you can redistribute it and/or
00005  * modify it under the terms of the GNU General Public License as
00006  * published by the Free Software Foundation; either version 2 of the
00007  * License, or any later version.
00008  *
00009  * This program is distributed in the hope that it will be useful, but
00010  * WITHOUT ANY WARRANTY; without even the implied warranty of
00011  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00012  * General Public License for more details.
00013  *
00014  * You should have received a copy of the GNU General Public License
00015  * along with this program; if not, write to the Free Software
00016  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
00017  * 02110-1301, USA.
00018  *
00019  * You can also choose to distribute this program under the terms of
00020  * the Unmodified Binary Distribution Licence (as given in the file
00021  * COPYING.UBDL), provided that you have satisfied its requirements.
00022  */
00023 
00024 FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
00025 
00026 #include <stdint.h>
00027 #include <stdlib.h>
00028 #include <stdio.h>
00029 #include <unistd.h>
00030 #include <string.h>
00031 #include <byteswap.h>
00032 #include <errno.h>
00033 #include <ipxe/errortab.h>
00034 #include <ipxe/malloc.h>
00035 #include <ipxe/if_arp.h>
00036 #include <ipxe/arp.h>
00037 #include <ipxe/if_ether.h>
00038 #include <ipxe/ethernet.h>
00039 #include <ipxe/ip.h>
00040 #include <ipxe/iobuf.h>
00041 #include <ipxe/netdevice.h>
00042 #include <ipxe/infiniband.h>
00043 #include <ipxe/ib_pathrec.h>
00044 #include <ipxe/ib_mcast.h>
00045 #include <ipxe/retry.h>
00046 #include <ipxe/ipoib.h>
00047 
00048 /** @file
00049  *
00050  * IP over Infiniband
00051  */
00052 
00053 /* Distinct ENXIO variants, one for each way the transmit path can fail to find a REMAC */
00054 #define ENXIO_ARP_REPLY __einfo_error ( EINFO_ENXIO_ARP_REPLY )
00055 #define EINFO_ENXIO_ARP_REPLY                                           \
00056         __einfo_uniqify ( EINFO_ENXIO, 0x01,                            \
00057                           "Missing REMAC for ARP reply target address" )
00058 #define ENXIO_NON_IPV4 __einfo_error ( EINFO_ENXIO_NON_IPV4 )
00059 #define EINFO_ENXIO_NON_IPV4                                            \
00060         __einfo_uniqify ( EINFO_ENXIO, 0x02,                            \
00061                           "Missing REMAC for non-IPv4 packet" )
00062 #define ENXIO_ARP_SENT __einfo_error ( EINFO_ENXIO_ARP_SENT )
00063 #define EINFO_ENXIO_ARP_SENT                                            \
00064         __einfo_uniqify ( EINFO_ENXIO, 0x03,                            \
00065                           "Missing REMAC for IPv4 packet (ARP sent)" )
00066 
00067 /** Number of IPoIB send work queue entries (NOTE(review): consumer is outside this view, presumably the queue pair setup) */
00068 #define IPOIB_NUM_SEND_WQES 8
00069 
00070 /** Number of IPoIB receive work queue entries (NOTE(review): consumer is outside this view, presumably the queue pair setup) */
00071 #define IPOIB_NUM_RECV_WQES 4
00072 
00073 /** Number of IPoIB completion entries (NOTE(review): consumer is outside this view, presumably the completion queue setup) */
00074 #define IPOIB_NUM_CQES 16
00075 
00076 /** An IPoIB broadcast address, together with the state needed to transmit to it */
00077 struct ipoib_broadcast {
00078         /** IPoIB MAC address of the broadcast group (returned for any multicast REMAC lookup) */
00079         struct ipoib_mac mac;
00080         /** Address vector used as the destination for broadcast and multicast transmissions */
00081         struct ib_address_vector av;
00082         /** Multicast group membership (handed to ib_mcast_join() and ib_mcast_leave()) */
00083         struct ib_mc_membership membership;
00084 };
00085 
00086 /** An IPoIB device: ties a network device to the Infiniband resources it transmits and receives through */
00087 struct ipoib_device {
00088         /** Network device */
00089         struct net_device *netdev;
00090         /** Underlying Infiniband device */
00091         struct ib_device *ibdev;
00092         /** List of IPoIB devices */
00093         struct list_head list;
00094         /** Completion queue */
00095         struct ib_completion_queue *cq;
00096         /** Queue pair (used for posting sends and for the broadcast group join) */
00097         struct ib_queue_pair *qp;
00098         /** Local IPoIB MAC (written as the sender hardware address of translated ARP packets) */
00099         struct ipoib_mac mac;
00100         /** Broadcast address */
00101         struct ipoib_broadcast broadcast;
00102         /** REMAC cache (list of struct ipoib_peer, most recently used entry first) */
00103         struct list_head peers;
00104 };
00105 
00106 /** Broadcast IPoIB address: the broadcast QPN (in network byte order) plus a fixed 16-byte GID */
00107 static struct ipoib_mac ipoib_broadcast = {
00108         .flags__qpn = htonl ( IB_QPN_BROADCAST ),
00109         .gid.bytes = { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
00110                        0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff },
00111 };
00112 
00113 /** Link status for "broadcast join in progress" (reported while the Infiniband link is up but the join has not yet completed) */
00114 #define EINPROGRESS_JOINING __einfo_error ( EINFO_EINPROGRESS_JOINING )
00115 #define EINFO_EINPROGRESS_JOINING __einfo_uniqify \
00116         ( EINFO_EINPROGRESS, 0x01, "Joining" )
00117 
00118 /** Error table entries supplying the human-readable message for the "Joining" link status */
00119 struct errortab ipoib_errors[] __errortab = {
00120         __einfo_errortab ( EINFO_EINPROGRESS_JOINING ),
00121 };
00122 
00123 /** List of all IPoIB devices (NOTE(review): linked via ipoib_device::list, presumably on creation outside this view) */
00124 static LIST_HEAD ( ipoib_devices );
00125 
00126 static struct net_device_operations ipoib_operations; /* Forward declaration: compared against netdev->op to recognise IPoIB devices */
00127 
00128 /****************************************************************************
00129  *
00130  * IPoIB REMAC cache
00131  *
00132  ****************************************************************************
00133  */
00134 
00135 /** An IPoIB REMAC cache entry: maps one remote Ethernet MAC to the peer's IPoIB MAC */
00136 struct ipoib_peer {
00137         /** List of REMAC cache entries */
00138         struct list_head list;
00139         /** Remote Ethernet MAC (the lookup key) */
00140         struct ipoib_remac remac;
00141         /** IPoIB MAC address of the peer */
00142         struct ipoib_mac mac;
00143 };
00144 
00145 /**
00146  * Find IPoIB MAC from REMAC
00147  *
00148  * @v ipoib             IPoIB device
00149  * @v remac             Remote Ethernet MAC
00150  * @ret mac             IPoIB MAC (or NULL if not found)
00151  */
00152 static struct ipoib_mac * ipoib_find_remac ( struct ipoib_device *ipoib,
00153                                              const struct ipoib_remac *remac ) {
00154         struct ipoib_peer *peer;
00155 
00156         /* Check for broadcast or multicast REMAC.  We transmit
00157          * multicasts as broadcasts for simplicity.
00158          */
00159         if ( is_multicast_ether_addr ( remac ) )
00160                 return &ipoib->broadcast.mac;
00161 
00162         /* Try to find via REMAC cache */
00163         list_for_each_entry ( peer, &ipoib->peers, list ) {
00164                 if ( memcmp ( remac, &peer->remac,
00165                               sizeof ( peer->remac ) ) == 0 ) {
00166                         /* Move peer to start of list */
00167                         list_del ( &peer->list );
00168                         list_add ( &peer->list, &ipoib->peers );
00169                         return &peer->mac;
00170                 }
00171         }
00172 
00173         DBGC ( ipoib, "IPoIB %p unknown REMAC %s\n",
00174                ipoib, eth_ntoa ( remac ) );
00175         return NULL;
00176 }
00177 
00178 /**
00179  * Add IPoIB MAC to REMAC cache
00180  *
00181  * @v ipoib             IPoIB device
00182  * @v remac             Remote Ethernet MAC
00183  * @v mac               IPoIB MAC
00184  * @ret rc              Return status code
00185  */
00186 static int ipoib_map_remac ( struct ipoib_device *ipoib,
00187                              const struct ipoib_remac *remac,
00188                              const struct ipoib_mac *mac ) {
00189         struct ipoib_peer *peer;
00190 
00191         /* Check for existing entry in REMAC cache */
00192         list_for_each_entry ( peer, &ipoib->peers, list ) {
00193                 if ( memcmp ( remac, &peer->remac,
00194                               sizeof ( peer->remac ) ) == 0 ) {
00195                         /* Move peer to start of list */
00196                         list_del ( &peer->list );
00197                         list_add ( &peer->list, &ipoib->peers );
00198                         /* Update MAC */
00199                         memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
00200                         return 0;
00201                 }
00202         }
00203 
00204         /* Create new entry */
00205         peer = malloc ( sizeof ( *peer ) );
00206         if ( ! peer )
00207                 return -ENOMEM;
00208         memcpy ( &peer->remac, remac, sizeof ( peer->remac ) );
00209         memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
00210         list_add ( &peer->list, &ipoib->peers );
00211 
00212         return 0;
00213 }
00214 
00215 /**
00216  * Flush REMAC cache
00217  *
00218  * @v ipoib             IPoIB device
00219  */
00220 static void ipoib_flush_remac ( struct ipoib_device *ipoib ) {
00221         struct ipoib_peer *peer;
00222         struct ipoib_peer *tmp;
00223 
00224         list_for_each_entry_safe ( peer, tmp, &ipoib->peers, list ) {
00225                 list_del ( &peer->list );
00226                 free ( peer );
00227         }
00228 }
00229 
00230 /**
00231  * Discard some entries from the REMAC cache
00232  *
00233  * @ret discarded       Number of cached items discarded
00234  */
00235 static unsigned int ipoib_discard_remac ( void ) {
00236         struct net_device *netdev;
00237         struct ipoib_device *ipoib;
00238         struct ipoib_peer *peer;
00239         unsigned int discarded = 0;
00240 
00241         /* Try to discard one cache entry for each IPoIB device */
00242         for_each_netdev ( netdev ) {
00243 
00244                 /* Skip non-IPoIB devices */
00245                 if ( netdev->op != &ipoib_operations )
00246                         continue;
00247                 ipoib = netdev->priv;
00248 
00249                 /* Discard least recently used cache entry (if any) */
00250                 list_for_each_entry_reverse ( peer, &ipoib->peers, list ) {
00251                         list_del ( &peer->list );
00252                         free ( peer );
00253                         discarded++;
00254                         break;
00255                 }
00256         }
00257 
00258         return discarded;
00259 }
00260 
00261 /** IPoIB cache discarder: registers ipoib_discard_remac() with the CACHE_EXPENSIVE cost class */
00262 struct cache_discarder ipoib_discarder __cache_discarder ( CACHE_EXPENSIVE ) = {
00263         .discard = ipoib_discard_remac,
00264 };
00265 
00266 /****************************************************************************
00267  *
00268  * IPoIB link layer
00269  *
00270  ****************************************************************************
00271  */
00272 
00273 /**
00274  * Initialise IPoIB link-layer address
00275  *
00276  * @v hw_addr           Hardware address
00277  * @v ll_addr           Link-layer address
00278  */
00279 static void ipoib_init_addr ( const void *hw_addr, void *ll_addr ) {
00280         const uint8_t *guid = hw_addr;
00281         uint8_t *eth_addr = ll_addr;
00282         uint8_t guid_mask = IPOIB_GUID_MASK;
00283         unsigned int i;
00284 
00285         /* Extract bytes from GUID according to mask */
00286         for ( i = 0 ; i < 8 ; i++, guid++, guid_mask <<= 1 ) {
00287                 if ( guid_mask & 0x80 )
00288                         *(eth_addr++) = *guid;
00289         }
00290 }
00291 
00292 /** IPoIB link-layer protocol: presents an Ethernet-style link layer by reusing the Ethernet header and address helpers */
00293 struct ll_protocol ipoib_protocol __ll_protocol = {
00294         .name           = "IPoIB",
00295         .ll_proto       = htons ( ARPHRD_ETHER ),
00296         .hw_addr_len    = sizeof ( union ib_guid ),
00297         .ll_addr_len    = ETH_ALEN,
00298         .ll_header_len  = ETH_HLEN,
00299         .push           = eth_push,
00300         .pull           = eth_pull,
00301         .init_addr      = ipoib_init_addr,
00302         .ntoa           = eth_ntoa,
00303         .mc_hash        = eth_mc_hash,
00304         .eth_addr       = eth_eth_addr,
00305         .eui64          = eth_eui64,
00306         .flags          = LL_NAME_ONLY,
00307 };
00308 
00309 /**
00310  * Allocate IPoIB device
00311  *
00312  * @v priv_size         Size of driver private data
00313  * @ret netdev          Network device, or NULL
00314  */
00315 struct net_device * alloc_ipoibdev ( size_t priv_size ) {
00316         struct net_device *netdev;
00317 
00318         netdev = alloc_netdev ( priv_size );
00319         if ( netdev ) {
00320                 netdev->ll_protocol = &ipoib_protocol;
00321                 netdev->ll_broadcast = eth_broadcast;
00322                 netdev->max_pkt_len = IB_MAX_PAYLOAD_SIZE;
00323         }
00324         return netdev;
00325 }
00326 
00327 /****************************************************************************
00328  *
00329  * IPoIB translation layer
00330  *
00331  ****************************************************************************
00332  */
00333 
00334 /**
00335  * Translate transmitted ARP packet
00336  *
00337  * @v netdev            Network device
00338  * @v iobuf             Packet to be transmitted (with no link-layer headers)
00339  * @ret rc              Return status code
00340  */
00341 static int ipoib_translate_tx_arp ( struct net_device *netdev,
00342                                     struct io_buffer *iobuf ) {
00343         struct ipoib_device *ipoib = netdev->priv;
00344         struct arphdr *arphdr = iobuf->data;
00345         struct ipoib_mac *target_ha = NULL;
00346         void *sender_pa;
00347         void *target_pa;
00348 
00349         /* Do nothing unless ARP contains eIPoIB link-layer addresses */
00350         if ( arphdr->ar_hln != ETH_ALEN )
00351                 return 0;
00352 
00353         /* Fail unless we have room to expand packet */
00354         if ( iob_tailroom ( iobuf ) < ( 2 * ( sizeof ( ipoib->mac ) -
00355                                               ETH_ALEN ) ) ) {
00356                 DBGC ( ipoib, "IPoIB %p insufficient space in TX ARP\n",
00357                        ipoib );
00358                 return -ENOBUFS;
00359         }
00360 
00361         /* Look up REMAC, if applicable */
00362         if ( arphdr->ar_op == ARPOP_REPLY ) {
00363                 target_ha = ipoib_find_remac ( ipoib, arp_target_pa ( arphdr ));
00364                 if ( ! target_ha ) {
00365                         DBGC ( ipoib, "IPoIB %p no REMAC for %s ARP reply\n",
00366                                ipoib, eth_ntoa ( arp_target_pa ( arphdr ) ) );
00367                         return -ENXIO_ARP_REPLY;
00368                 }
00369         }
00370 
00371         /* Construct new packet */
00372         iob_put ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );
00373         sender_pa = arp_sender_pa ( arphdr );
00374         target_pa = arp_target_pa ( arphdr );
00375         arphdr->ar_hrd = htons ( ARPHRD_INFINIBAND );
00376         arphdr->ar_hln = sizeof ( ipoib->mac );
00377         memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
00378         memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
00379         memcpy ( arp_sender_ha ( arphdr ), &ipoib->mac, sizeof ( ipoib->mac ) );
00380         memset ( arp_target_ha ( arphdr ), 0, sizeof ( ipoib->mac ) );
00381         if ( target_ha ) {
00382                 memcpy ( arp_target_ha ( arphdr ), target_ha,
00383                          sizeof ( *target_ha ) );
00384         }
00385 
00386         return 0;
00387 }
00388 
00389 /**
00390  * Translate transmitted packet
00391  *
00392  * @v netdev            Network device
00393  * @v iobuf             Packet to be transmitted (with no link-layer headers)
00394  * @v net_proto         Network-layer protocol (in network byte order)
00395  * @ret rc              Return status code
00396  */
00397 static int ipoib_translate_tx ( struct net_device *netdev,
00398                                 struct io_buffer *iobuf, uint16_t net_proto ) {
00399 
00400         switch ( net_proto ) {
00401         case htons ( ETH_P_ARP ) :
00402                 return ipoib_translate_tx_arp ( netdev, iobuf );
00403         case htons ( ETH_P_IP ) :
00404                 /* No translation needed */
00405                 return 0;
00406         default:
00407                 /* Cannot handle other traffic via eIPoIB */
00408                 return -ENOTSUP;
00409         }
00410 }
00411 
00412 /**
00413  * Translate received ARP packet
00414  *
00415  * @v netdev            Network device
00416  * @v iobuf             Received packet (with no link-layer headers)
00417  * @v remac             Constructed Remote Ethernet MAC
00418  * @ret rc              Return status code
00419  */
00420 static int ipoib_translate_rx_arp ( struct net_device *netdev,
00421                                     struct io_buffer *iobuf,
00422                                     struct ipoib_remac *remac ) {
00423         struct ipoib_device *ipoib = netdev->priv;
00424         struct arphdr *arphdr = iobuf->data;
00425         void *sender_pa;
00426         void *target_pa;
00427         int rc;
00428 
00429         /* Do nothing unless ARP contains IPoIB link-layer addresses */
00430         if ( arphdr->ar_hln != sizeof ( ipoib->mac ) )
00431                 return 0;
00432 
00433         /* Create REMAC cache entry */
00434         if ( ( rc = ipoib_map_remac ( ipoib, remac,
00435                                       arp_sender_ha ( arphdr ) ) ) != 0 ) {
00436                 DBGC ( ipoib, "IPoIB %p could not map REMAC: %s\n",
00437                        ipoib, strerror ( rc ) );
00438                 return rc;
00439         }
00440 
00441         /* Construct new packet */
00442         sender_pa = arp_sender_pa ( arphdr );
00443         target_pa = arp_target_pa ( arphdr );
00444         arphdr->ar_hrd = htons ( ARPHRD_ETHER );
00445         arphdr->ar_hln = ETH_ALEN;
00446         memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
00447         memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
00448         memcpy ( arp_sender_ha ( arphdr ), remac, ETH_ALEN );
00449         memset ( arp_target_ha ( arphdr ), 0, ETH_ALEN );
00450         if ( arphdr->ar_op == ARPOP_REPLY ) {
00451                 /* Assume received replies were directed to us */
00452                 memcpy ( arp_target_ha ( arphdr ), netdev->ll_addr, ETH_ALEN );
00453         }
00454         iob_unput ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );
00455 
00456         return 0;
00457 }
00458 
00459 /**
00460  * Translate received packet
00461  *
00462  * @v netdev            Network device
00463  * @v iobuf             Received packet (with no link-layer headers)
00464  * @v remac             Constructed Remote Ethernet MAC
00465  * @v net_proto         Network-layer protocol (in network byte order)
00466  * @ret rc              Return status code
00467  */
00468 static int ipoib_translate_rx ( struct net_device *netdev,
00469                                 struct io_buffer *iobuf,
00470                                 struct ipoib_remac *remac,
00471                                 uint16_t net_proto ) {
00472 
00473         switch ( net_proto ) {
00474         case htons ( ETH_P_ARP ) :
00475                 return ipoib_translate_rx_arp ( netdev, iobuf, remac );
00476         case htons ( ETH_P_IP ) :
00477                 /* No translation needed */
00478                 return 0;
00479         default:
00480                 /* Cannot handle other traffic via eIPoIB */
00481                 return -ENOTSUP;
00482         }
00483 }
00484 
00485 /****************************************************************************
00486  *
00487  * IPoIB network device
00488  *
00489  ****************************************************************************
00490  */
00491 
00492 /**
00493  * Transmit packet via IPoIB network device
00494  *
00495  * @v netdev            Network device
00496  * @v iobuf             I/O buffer
00497  * @ret rc              Return status code
00498  */
00499 static int ipoib_transmit ( struct net_device *netdev,
00500                             struct io_buffer *iobuf ) {
00501         struct ipoib_device *ipoib = netdev->priv;
00502         struct ib_device *ibdev = ipoib->ibdev;
00503         struct ethhdr *ethhdr;
00504         struct iphdr *iphdr;
00505         struct ipoib_hdr *ipoib_hdr;
00506         struct ipoib_remac *remac;
00507         struct ipoib_mac *mac;
00508         struct ib_address_vector *dest;
00509         struct ib_address_vector av;
00510         uint16_t net_proto;
00511         int rc;
00512 
00513         /* Sanity check */
00514         if ( iob_len ( iobuf ) < sizeof ( *ethhdr ) ) {
00515                 DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib );
00516                 return -EINVAL;
00517         }
00518 
00519         /* Attempting transmission while link is down will put the
00520          * queue pair into an error state, so don't try it.
00521          */
00522         if ( ! ib_link_ok ( ibdev ) )
00523                 return -ENETUNREACH;
00524 
00525         /* Strip eIPoIB header */
00526         ethhdr = iobuf->data;
00527         remac = ( ( struct ipoib_remac * ) ethhdr->h_dest );
00528         net_proto = ethhdr->h_protocol;
00529         iob_pull ( iobuf, sizeof ( *ethhdr ) );
00530 
00531         /* Identify destination address */
00532         if ( is_multicast_ether_addr ( remac ) ) {
00533 
00534                 /* Transmit multicasts as broadcasts, for simplicity */
00535                 dest = &ipoib->broadcast.av;
00536 
00537         } else if ( ( mac = ipoib_find_remac ( ipoib, remac ) ) ) {
00538 
00539                 /* Construct address vector from IPoIB MAC */
00540                 dest = &av;
00541                 memset ( dest, 0, sizeof ( *dest ) );
00542                 dest->qpn = ( ntohl ( mac->flags__qpn ) & IB_QPN_MASK );
00543                 dest->qkey = ipoib->broadcast.av.qkey;
00544                 dest->gid_present = 1;
00545                 memcpy ( &dest->gid, &mac->gid, sizeof ( dest->gid ) );
00546                 if ( ( rc = ib_resolve_path ( ibdev, dest ) ) != 0 ) {
00547                         /* Path not resolved yet */
00548                         return rc;
00549                 }
00550 
00551         } else {
00552 
00553                 /* Generate a new ARP request (if possible) to trigger
00554                  * population of the REMAC cache entry.
00555                  */
00556                 if ( ( net_proto != htons ( ETH_P_IP ) ) ||
00557                      ( iob_len ( iobuf ) < sizeof ( *iphdr ) ) ) {
00558                         DBGC ( ipoib, "IPoIB %p no REMAC for %s non-IPv4 "
00559                                "packet type %04x\n", ipoib,
00560                                eth_ntoa ( ethhdr->h_dest ),
00561                                ntohs ( net_proto ) );
00562                         return -ENXIO_NON_IPV4;
00563                 }
00564                 iphdr = iobuf->data;
00565                 if ( ( rc = arp_tx_request ( netdev, &ipv4_protocol,
00566                                              &iphdr->dest, &iphdr->src ) ) !=0){
00567                         DBGC ( ipoib, "IPoIB %p could not ARP for %s/%s/",
00568                                ipoib, eth_ntoa ( ethhdr->h_dest ),
00569                                inet_ntoa ( iphdr->dest ) );
00570                         DBGC ( ipoib, "%s: %s\n", inet_ntoa ( iphdr->src ),
00571                                strerror ( rc ) );
00572                         return rc;
00573                 }
00574                 DBGC ( ipoib, "IPoIB %p no REMAC for %s/%s/", ipoib,
00575                        eth_ntoa ( ethhdr->h_dest ), inet_ntoa ( iphdr->dest ) );
00576                 DBGC  ( ipoib, "%s\n", inet_ntoa ( iphdr->src ) );
00577                 return -ENXIO_ARP_SENT;
00578         }
00579 
00580         /* Translate packet if applicable */
00581         if ( ( rc = ipoib_translate_tx ( netdev, iobuf, net_proto ) ) != 0 )
00582                 return rc;
00583 
00584         /* Prepend real IPoIB header */
00585         ipoib_hdr = iob_push ( iobuf, sizeof ( *ipoib_hdr ) );
00586         ipoib_hdr->proto = net_proto;
00587         ipoib_hdr->reserved = 0;
00588 
00589         /* Transmit packet */
00590         return ib_post_send ( ibdev, ipoib->qp, dest, iobuf );
00591 }
00592 
00593 /**
00594  * Handle IPoIB send completion
00595  *
00596  * @v ibdev             Infiniband device
00597  * @v qp                Queue pair
00598  * @v iobuf             I/O buffer
00599  * @v rc                Completion status code
00600  */
00601 static void ipoib_complete_send ( struct ib_device *ibdev __unused,
00602                                   struct ib_queue_pair *qp,
00603                                   struct io_buffer *iobuf, int rc ) {
00604         struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
00605 
00606         netdev_tx_complete_err ( ipoib->netdev, iobuf, rc );
00607 }
00608 
00609 /**
00610  * Handle IPoIB receive completion
00611  *
00612  * @v ibdev             Infiniband device
00613  * @v qp                Queue pair
00614  * @v dest              Destination address vector, or NULL
00615  * @v source            Source address vector, or NULL
00616  * @v iobuf             I/O buffer
00617  * @v rc                Completion status code
00618  */
00619 static void ipoib_complete_recv ( struct ib_device *ibdev __unused,
00620                                   struct ib_queue_pair *qp,
00621                                   struct ib_address_vector *dest,
00622                                   struct ib_address_vector *source,
00623                                   struct io_buffer *iobuf, int rc ) {
00624         struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
00625         struct net_device *netdev = ipoib->netdev;
00626         struct ipoib_hdr *ipoib_hdr;
00627         struct ethhdr *ethhdr;
00628         struct ipoib_remac remac;
00629         uint16_t net_proto;
00630 
00631         /* Record errors */
00632         if ( rc != 0 ) {
00633                 netdev_rx_err ( netdev, iobuf, rc );
00634                 return;
00635         }
00636 
00637         /* Sanity check */
00638         if ( iob_len ( iobuf ) < sizeof ( struct ipoib_hdr ) ) {
00639                 DBGC ( ipoib, "IPoIB %p received packet too short to "
00640                        "contain IPoIB header\n", ipoib );
00641                 DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
00642                 netdev_rx_err ( netdev, iobuf, -EIO );
00643                 return;
00644         }
00645         if ( ! source ) {
00646                 DBGC ( ipoib, "IPoIB %p received packet without address "
00647                        "vector\n", ipoib );
00648                 netdev_rx_err ( netdev, iobuf, -ENOTTY );
00649                 return;
00650         }
00651 
00652         /* Strip real IPoIB header */
00653         ipoib_hdr = iobuf->data;
00654         net_proto = ipoib_hdr->proto;
00655         iob_pull ( iobuf, sizeof ( *ipoib_hdr ) );
00656 
00657         /* Construct source address from remote QPN and LID */
00658         remac.qpn = htonl ( source->qpn | EIPOIB_QPN_LA );
00659         remac.lid = htons ( source->lid );
00660 
00661         /* Translate packet if applicable */
00662         if ( ( rc = ipoib_translate_rx ( netdev, iobuf, &remac,
00663                                          net_proto ) ) != 0 ) {
00664                 netdev_rx_err ( netdev, iobuf, rc );
00665                 return;
00666         }
00667 
00668         /* Prepend eIPoIB header */
00669         ethhdr = iob_push ( iobuf, sizeof ( *ethhdr ) );
00670         memcpy ( &ethhdr->h_source, &remac, sizeof ( ethhdr->h_source ) );
00671         ethhdr->h_protocol = net_proto;
00672 
00673         /* Construct destination address */
00674         if ( dest->gid_present && IB_GID_MULTICAST ( &dest->gid ) ) {
00675                 /* Multicast GID: use the Ethernet broadcast address */
00676                 memcpy ( &ethhdr->h_dest, eth_broadcast,
00677                          sizeof ( ethhdr->h_dest ) );
00678         } else {
00679                 /* Assume destination address is local Ethernet MAC */
00680                 memcpy ( &ethhdr->h_dest, netdev->ll_addr,
00681                          sizeof ( ethhdr->h_dest ) );
00682         }
00683 
00684         /* Hand off to network layer */
00685         netdev_rx ( netdev, iobuf );
00686 }
00687 
00688 /** IPoIB completion operations: the send and receive completion handlers for the IPoIB completion queue */
00689 static struct ib_completion_queue_operations ipoib_cq_op = {
00690         .complete_send = ipoib_complete_send,
00691         .complete_recv = ipoib_complete_recv,
00692 };
00693 
00694 /**
00695  * Allocate IPoIB receive I/O buffer
00696  *
00697  * @v len               Length of buffer
00698  * @ret iobuf           I/O buffer, or NULL
00699  *
00700  * Some Infiniband hardware requires 2kB alignment of receive buffers
00701  * and provides no way to disable header separation.  The result is
00702  * that there are only four bytes of link-layer header (the real IPoIB
00703  * header) before the payload.  This is not sufficient space to insert
00704  * an eIPoIB link-layer pseudo-header.
00705  *
00706  * We therefore allocate I/O buffers offset to start slightly before
00707  * the natural alignment boundary, in order to allow sufficient space.
00708  */
00709 static struct io_buffer * ipoib_alloc_iob ( size_t len ) {
00710         struct io_buffer *iobuf;
00711         size_t reserve_len;
00712 
00713         /* Calculate additional length required at start of buffer */
00714         reserve_len = ( sizeof ( struct ethhdr ) -
00715                         sizeof ( struct ipoib_hdr ) );
00716 
00717         /* Allocate buffer */
00718         iobuf = alloc_iob_raw ( ( len + reserve_len ), len, -reserve_len );
00719         if ( iobuf ) {
00720                 iob_reserve ( iobuf, reserve_len );
00721         }
00722         return iobuf;
00723 }
00724 
00725 /** IPoIB queue pair operations: receive buffers are allocated via ipoib_alloc_iob() */
00726 static struct ib_queue_pair_operations ipoib_qp_op = {
00727         .alloc_iob = ipoib_alloc_iob,
00728 };
00729 
00730 /**
00731  * Poll IPoIB network device
00732  *
00733  * @v netdev            Network device
00734  */
00735 static void ipoib_poll ( struct net_device *netdev ) {
00736         struct ipoib_device *ipoib = netdev->priv;
00737         struct ib_device *ibdev = ipoib->ibdev;
00738 
00739         /* Poll Infiniband device */
00740         ib_poll_eq ( ibdev );
00741 
00742         /* Poll the retry timers (required for IPoIB multicast join) */
00743         retry_poll();
00744 }
00745 
00746 /**
00747  * Handle IPv4 broadcast multicast group join completion
00748  *
00749  * @v membership        Multicast group membership
00750  * @v rc                Status code
00751  */
00752 void ipoib_join_complete ( struct ib_mc_membership *membership, int rc ) {
00753         struct ipoib_device *ipoib = container_of ( membership,
00754                                                     struct ipoib_device,
00755                                                     broadcast.membership );
00756 
00757         /* Record join status as link status */
00758         netdev_link_err ( ipoib->netdev, rc );
00759 }
00760 
00761 /**
00762  * Join IPv4 broadcast multicast group
00763  *
00764  * @v ipoib             IPoIB device
00765  * @ret rc              Return status code
00766  */
00767 static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) {
00768         int rc;
00769 
00770         /* Join multicast group */
00771         if ( ( rc = ib_mcast_join ( ipoib->ibdev, ipoib->qp,
00772                                     &ipoib->broadcast.membership,
00773                                     &ipoib->broadcast.av, 0,
00774                                     ipoib_join_complete ) ) != 0 ) {
00775                 DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n",
00776                        ipoib, strerror ( rc ) );
00777                 return rc;
00778         }
00779 
00780         return 0;
00781 }
00782 
00783 /**
00784  * Leave IPv4 broadcast multicast group
00785  *
00786  * @v ipoib             IPoIB device
00787  */
00788 static void ipoib_leave_broadcast_group ( struct ipoib_device *ipoib ) {
00789 
00790         /* Leave multicast group */
00791         ib_mcast_leave ( ipoib->ibdev, ipoib->qp,
00792                          &ipoib->broadcast.membership );
00793 }
00794 
00795 /**
00796  * Handle link status change
00797  *
00798  * @v ipoib             IPoIB device
00799  */
00800 static void ipoib_link_state_changed ( struct ipoib_device *ipoib ) {
00801         struct ib_device *ibdev = ipoib->ibdev;
00802         struct net_device *netdev = ipoib->netdev;
00803         int rc;
00804 
00805         /* Leave existing broadcast group */
00806         if ( ipoib->qp )
00807                 ipoib_leave_broadcast_group ( ipoib );
00808 
00809         /* Update MAC address based on potentially-new GID prefix */
00810         memcpy ( &ipoib->mac.gid.s.prefix, &ibdev->gid.s.prefix,
00811                  sizeof ( ipoib->mac.gid.s.prefix ) );
00812 
00813         /* Update broadcast MAC GID based on potentially-new partition key */
00814         ipoib->broadcast.mac.gid.words[2] =
00815                 htons ( ibdev->pkey | IB_PKEY_FULL );
00816 
00817         /* Construct broadcast address vector from broadcast MAC address */
00818         memset ( &ipoib->broadcast.av, 0, sizeof ( ipoib->broadcast.av ) );
00819         ipoib->broadcast.av.qpn = IB_QPN_BROADCAST;
00820         ipoib->broadcast.av.gid_present = 1;
00821         memcpy ( &ipoib->broadcast.av.gid, &ipoib->broadcast.mac.gid,
00822                  sizeof ( ipoib->broadcast.av.gid ) );
00823 
00824         /* Set net device link state to reflect Infiniband link state */
00825         rc = ib_link_rc ( ibdev );
00826         netdev_link_err ( netdev, ( rc ? rc : -EINPROGRESS_JOINING ) );
00827 
00828         /* Join new broadcast group */
00829         if ( ib_is_open ( ibdev ) && ib_link_ok ( ibdev ) && ipoib->qp &&
00830              ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) ) {
00831                 DBGC ( ipoib, "IPoIB %p could not rejoin broadcast group: "
00832                        "%s\n", ipoib, strerror ( rc ) );
00833                 netdev_link_err ( netdev, rc );
00834                 return;
00835         }
00836 }
00837 
00838 /**
00839  * Open IPoIB network device
00840  *
00841  * @v netdev            Network device
00842  * @ret rc              Return status code
00843  */
00844 static int ipoib_open ( struct net_device *netdev ) {
00845         struct ipoib_device *ipoib = netdev->priv;
00846         struct ib_device *ibdev = ipoib->ibdev;
00847         int rc;
00848 
00849         /* Open IB device */
00850         if ( ( rc = ib_open ( ibdev ) ) != 0 ) {
00851                 DBGC ( ipoib, "IPoIB %p could not open device: %s\n",
00852                        ipoib, strerror ( rc ) );
00853                 goto err_ib_open;
00854         }
00855 
00856         /* Allocate completion queue */
00857         if ( ( rc = ib_create_cq ( ibdev, IPOIB_NUM_CQES, &ipoib_cq_op,
00858                                    &ipoib->cq ) ) != 0 ) {
00859                 DBGC ( ipoib, "IPoIB %p could not create completion queue: "
00860                        "%s\n", ipoib, strerror ( rc ) );
00861                 goto err_create_cq;
00862         }
00863 
00864         /* Allocate queue pair */
00865         if ( ( rc = ib_create_qp ( ibdev, IB_QPT_UD, IPOIB_NUM_SEND_WQES,
00866                                    ipoib->cq, IPOIB_NUM_RECV_WQES, ipoib->cq,
00867                                    &ipoib_qp_op, netdev->name,
00868                                    &ipoib->qp ) ) != 0 ) {
00869                 DBGC ( ipoib, "IPoIB %p could not create queue pair: %s\n",
00870                        ipoib, strerror ( rc ) );
00871                 goto err_create_qp;
00872         }
00873         ib_qp_set_ownerdata ( ipoib->qp, ipoib );
00874 
00875         /* Update MAC address with QPN */
00876         ipoib->mac.flags__qpn = htonl ( ipoib->qp->qpn );
00877 
00878         /* Fill receive rings */
00879         ib_refill_recv ( ibdev, ipoib->qp );
00880 
00881         /* Fake a link status change to join the broadcast group */
00882         ipoib_link_state_changed ( ipoib );
00883 
00884         return 0;
00885 
00886         ib_destroy_qp ( ibdev, ipoib->qp );
00887  err_create_qp:
00888         ib_destroy_cq ( ibdev, ipoib->cq );
00889  err_create_cq:
00890         ib_close ( ibdev );
00891  err_ib_open:
00892         return rc;
00893 }
00894 
00895 /**
00896  * Close IPoIB network device
00897  *
00898  * @v netdev            Network device
00899  */
00900 static void ipoib_close ( struct net_device *netdev ) {
00901         struct ipoib_device *ipoib = netdev->priv;
00902         struct ib_device *ibdev = ipoib->ibdev;
00903 
00904         /* Flush REMAC cache */
00905         ipoib_flush_remac ( ipoib );
00906 
00907         /* Leave broadcast group */
00908         ipoib_leave_broadcast_group ( ipoib );
00909 
00910         /* Remove QPN from MAC address */
00911         ipoib->mac.flags__qpn = 0;
00912 
00913         /* Tear down the queues */
00914         ib_destroy_qp ( ibdev, ipoib->qp );
00915         ipoib->qp = NULL;
00916         ib_destroy_cq ( ibdev, ipoib->cq );
00917         ipoib->cq = NULL;
00918 
00919         /* Close IB device */
00920         ib_close ( ibdev );
00921 }
00922 
00923 /** IPoIB network device operations */
00924 static struct net_device_operations ipoib_operations = {
00925         .open           = ipoib_open,
00926         .close          = ipoib_close,
00927         .transmit       = ipoib_transmit,
00928         .poll           = ipoib_poll,
00929 };
00930 
00931 /**
00932  * Probe IPoIB device
00933  *
00934  * @v ibdev             Infiniband device
00935  * @ret rc              Return status code
00936  */
00937 static int ipoib_probe ( struct ib_device *ibdev ) {
00938         struct net_device *netdev;
00939         struct ipoib_device *ipoib;
00940         int rc;
00941 
00942         /* Allocate network device */
00943         netdev = alloc_ipoibdev ( sizeof ( *ipoib ) );
00944         if ( ! netdev )
00945                 return -ENOMEM;
00946         netdev_init ( netdev, &ipoib_operations );
00947         ipoib = netdev->priv;
00948         netdev->dev = ibdev->dev;
00949         memset ( ipoib, 0, sizeof ( *ipoib ) );
00950         ipoib->netdev = netdev;
00951         ipoib->ibdev = ibdev;
00952         INIT_LIST_HEAD ( &ipoib->peers );
00953 
00954         /* Extract hardware address */
00955         memcpy ( netdev->hw_addr, &ibdev->gid.s.guid,
00956                  sizeof ( ibdev->gid.s.guid ) );
00957         memcpy ( netdev->ll_addr, ibdev->lemac, ETH_ALEN );
00958 
00959         /* Set local MAC address */
00960         memcpy ( &ipoib->mac.gid.s.guid, &ibdev->gid.s.guid,
00961                  sizeof ( ipoib->mac.gid.s.guid ) );
00962 
00963         /* Set default broadcast MAC address */
00964         memcpy ( &ipoib->broadcast.mac, &ipoib_broadcast,
00965                  sizeof ( ipoib->broadcast.mac ) );
00966 
00967         /* Add to list of IPoIB devices */
00968         list_add_tail ( &ipoib->list, &ipoib_devices );
00969 
00970         /* Register network device */
00971         if ( ( rc = register_netdev ( netdev ) ) != 0 )
00972                 goto err_register_netdev;
00973 
00974         return 0;
00975 
00976         unregister_netdev ( netdev );
00977  err_register_netdev:
00978         list_del ( &ipoib->list );
00979         netdev_nullify ( netdev );
00980         netdev_put ( netdev );
00981         return rc;
00982 }
00983 
00984 /**
00985  * Handle device or link status change
00986  *
00987  * @v ibdev             Infiniband device
00988  */
00989 static void ipoib_notify ( struct ib_device *ibdev ) {
00990         struct ipoib_device *ipoib;
00991 
00992         /* Handle link status change for any attached IPoIB devices */
00993         list_for_each_entry ( ipoib, &ipoib_devices, list ) {
00994                 if ( ipoib->ibdev != ibdev )
00995                         continue;
00996                 ipoib_link_state_changed ( ipoib );
00997         }
00998 }
00999 
01000 /**
01001  * Remove IPoIB device
01002  *
01003  * @v ibdev             Infiniband device
01004  */
01005 static void ipoib_remove ( struct ib_device *ibdev ) {
01006         struct ipoib_device *ipoib;
01007         struct ipoib_device *tmp;
01008         struct net_device *netdev;
01009 
01010         /* Remove any attached IPoIB devices */
01011         list_for_each_entry_safe ( ipoib, tmp, &ipoib_devices, list ) {
01012                 if ( ipoib->ibdev != ibdev )
01013                         continue;
01014                 netdev = ipoib->netdev;
01015                 unregister_netdev ( netdev );
01016                 list_del ( &ipoib->list );
01017                 netdev_nullify ( netdev );
01018                 netdev_put ( netdev );
01019         }
01020 }
01021 
01022 /** IPoIB driver */
01023 struct ib_driver ipoib_driver __ib_driver = {
01024         .name = "IPoIB",
01025         .probe = ipoib_probe,
01026         .notify = ipoib_notify,
01027         .remove = ipoib_remove,
01028 };
01029 
01030 /**
01031  * Find IPoIB network device
01032  *
01033  * @v ibdev             Infiniband device
01034  * @ret netdev          IPoIB network device, or NULL if not found
01035  */
01036 struct net_device * ipoib_netdev ( struct ib_device *ibdev ) {
01037         struct ipoib_device *ipoib;
01038 
01039         /* Find matching IPoIB device */
01040         list_for_each_entry ( ipoib, &ipoib_devices, list ) {
01041                 if ( ipoib->ibdev != ibdev )
01042                         continue;
01043                 return ipoib->netdev;
01044         }
01045         return NULL;
01046 }