iPXE
tcp.c
Go to the documentation of this file.
00001 #include <string.h>
00002 #include <stdlib.h>
00003 #include <stdio.h>
00004 #include <assert.h>
00005 #include <errno.h>
00006 #include <byteswap.h>
00007 #include <ipxe/timer.h>
00008 #include <ipxe/iobuf.h>
00009 #include <ipxe/malloc.h>
00010 #include <ipxe/init.h>
00011 #include <ipxe/retry.h>
00012 #include <ipxe/refcnt.h>
00013 #include <ipxe/pending.h>
00014 #include <ipxe/xfer.h>
00015 #include <ipxe/open.h>
00016 #include <ipxe/uri.h>
00017 #include <ipxe/netdevice.h>
00018 #include <ipxe/profile.h>
00019 #include <ipxe/process.h>
00020 #include <ipxe/tcpip.h>
00021 #include <ipxe/tcp.h>
00022 
00023 /** @file
00024  *
00025  * TCP protocol
00026  *
00027  */
00028 
00029 FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
00030 
/** A TCP connection
 *
 * One instance exists per connection opened via tcp_open().  The
 * structure is reference counted; all embedded objects (interface,
 * process and timers) are initialised against @c refcnt.
 */
struct tcp_connection {
        /** Reference counter */
        struct refcnt refcnt;
        /** List of TCP connections
         *
         * Linked into @c tcp_conns by tcp_open() and unlinked by
         * tcp_close() when the connection is deleted.
         */
        struct list_head list;

        /** Flags
         *
         * Bitmask of @c enum @c tcp_flags values.
         */
        unsigned int flags;

        /** Data transfer interface
         *
         * Plugged into the caller's interface by tcp_open() and shut
         * down by tcp_close().
         */
        struct interface xfer;

        /** Remote socket address
         *
         * Copied from the caller's peer address by tcp_open().  The
         * port is written to outgoing headers without byte swapping.
         */
        struct sockaddr_tcpip peer;
        /** Local port
         *
         * Held in host-endian order (converted with htons() when a
         * header is built).
         */
        unsigned int local_port;
        /** Maximum segment size
         *
         * Set by tcp_open() to the route MTU minus the size of a TCP
         * header, and advertised in the MSS option of the SYN.
         */
        size_t mss;

        /** Current TCP state
         *
         * Bitmask manipulated via the TCP_STATE_SENT(),
         * TCP_STATE_RCVD() and TCP_STATE_ACKED() macros.
         */
        unsigned int tcp_state;
        /** Previous TCP state
         *
         * Maintained only for debug messages
         */
        unsigned int prev_tcp_state;
        /** Current sequence number
         *
         * Equivalent to SND.UNA in RFC 793 terminology.
         *
         * Initialised to a random value by tcp_open().
         */
        uint32_t snd_seq;
        /** Unacknowledged sequence count
         *
         * Equivalent to (SND.NXT-SND.UNA) in RFC 793 terminology.
         *
         * Recomputed by tcp_xmit_sack() on every transmission
         * attempt.
         */
        uint32_t snd_sent;
        /** Send window
         *
         * Equivalent to SND.WND in RFC 793 terminology
         */
        uint32_t snd_win;
        /** Current acknowledgement number
         *
         * Equivalent to RCV.NXT in RFC 793 terminology.
         */
        uint32_t rcv_ack;
        /** Receive window
         *
         * Equivalent to RCV.WND in RFC 793 terminology.
         *
         * tcp_xmit_sack() only ever grows this value; it is shifted
         * right by @c rcv_win_scale when placed in a header.
         */
        uint32_t rcv_win;
        /** Received timestamp value
         *
         * Updated when a packet is received; copied to ts_recent when
         * the window is advanced.
         */
        uint32_t ts_val;
        /** Most recent received timestamp that advanced the window
         *
         * Equivalent to TS.Recent in RFC 1323 terminology.
         *
         * Echoed back to the peer in the timestamp option.
         */
        uint32_t ts_recent;
        /** Send window scale
         *
         * Equivalent to Snd.Wind.Scale in RFC 1323 terminology
         */
        uint8_t snd_win_scale;
        /** Receive window scale
         *
         * Equivalent to Rcv.Wind.Scale in RFC 1323 terminology
         */
        uint8_t rcv_win_scale;

        /** Selective acknowledgement list (in host-endian order)
         *
         * Rebuilt by tcp_sack(); unused entries are zeroed, so an
         * entry with equal left and right edges is empty.
         */
        struct tcp_sack_block sack[TCP_SACK_MAX];

        /** Transmit queue
         *
         * Each buffer released from this queue also drops one
         * @c pending_data operation.
         */
        struct list_head tx_queue;
        /** Receive queue
         *
         * Buffers on this queue start with a
         * @c struct @c tcp_rx_queued_header.
         */
        struct list_head rx_queue;
        /** Transmission process (runs tcp_xmit()) */
        struct process process;
        /** Retransmission timer (handled by tcp_expired()) */
        struct retry_timer timer;
        /** Keepalive timer (handled by tcp_keepalive_expired()) */
        struct retry_timer keepalive;
        /** Shutdown (TIME_WAIT) timer (handled by tcp_wait_expired()) */
        struct retry_timer wait;

        /** Pending operations for SYN and FIN
         *
         * One operation is taken by tcp_open() for the SYN and one
         * by tcp_close() when a FIN is queued.
         */
        struct pending_operation pending_flags;
        /** Pending operations for transmit queue */
        struct pending_operation pending_data;
};
00126 
/** TCP connection flags
 *
 * Independent bit values, combined in the @c flags field of a TCP
 * connection.
 */
enum tcp_flags {
        /** The data transfer interface has been closed */
        TCP_XFER_CLOSED = ( 1 << 0 ),
        /** Timestamps are in use on this connection */
        TCP_TS_ENABLED = ( 1 << 1 ),
        /** An acknowledgement still needs to be sent */
        TCP_ACK_PENDING = ( 1 << 2 ),
        /** Selective acknowledgement is in use on this connection */
        TCP_SACK_ENABLED = ( 1 << 3 ),
};
00138 
/** TCP internal header
 *
 * This is the header that replaces the TCP header for packets
 * enqueued on the receive queue.  tcp_sack_block() reads it from the
 * start of each receive-queue buffer's data.
 */
struct tcp_rx_queued_header {
        /** SEQ value, in host-endian order
         *
         * This represents the SEQ value at the time the packet is
         * enqueued, and so excludes the SYN, if present.
         */
        uint32_t seq;
        /** Next SEQ value, in host-endian order
         *
         * Used by tcp_sack_block() as the right-hand edge of the
         * sequence range held in the buffer.
         */
        uint32_t nxt;
        /** Flags
         *
         * Only FIN is valid within this flags byte; all other flags
         * have already been processed by the time the packet is
         * enqueued.
         */
        uint8_t flags;
        /** Reserved (rounds the flags byte up to four bytes) */
        uint8_t reserved[3];
};
00163 
/**
 * List of registered TCP connections
 *
 * tcp_open() adds each new connection to this list; tcp_close()
 * removes it when the connection is deleted.
 */
static LIST_HEAD ( tcp_conns );

/** Transmit profiler
 *
 * Started at the top of tcp_xmit_sack() and stopped only after a
 * packet has been handed to tcpip_tx() successfully.
 */
static struct profiler tcp_tx_profiler __profiler = { .name = "tcp.tx" };

/** Receive profiler */
static struct profiler tcp_rx_profiler __profiler = { .name = "tcp.rx" };

/** Data transfer profiler */
static struct profiler tcp_xfer_profiler __profiler = { .name = "tcp.xfer" };

/* Forward declarations
 *
 * tcp_open() references the descriptors and timer handlers,
 * tcp_port_available() calls tcp_demux() and tcp_close() calls
 * tcp_rx_ack(), all ahead of the corresponding definitions.
 */
static struct process_descriptor tcp_process_desc;
static struct interface_descriptor tcp_xfer_desc;
static void tcp_expired ( struct retry_timer *timer, int over );
static void tcp_keepalive_expired ( struct retry_timer *timer, int over );
static void tcp_wait_expired ( struct retry_timer *timer, int over );
static struct tcp_connection * tcp_demux ( unsigned int local_port );
static int tcp_rx_ack ( struct tcp_connection *tcp, uint32_t ack,
                        uint32_t win );
00187 
00188 /**
00189  * Name TCP state
00190  *
00191  * @v state             TCP state
00192  * @ret name            Name of TCP state
00193  */
00194 static inline __attribute__ (( always_inline )) const char *
00195 tcp_state ( int state ) {
00196         switch ( state ) {
00197         case TCP_CLOSED:                return "CLOSED";
00198         case TCP_LISTEN:                return "LISTEN";
00199         case TCP_SYN_SENT:              return "SYN_SENT";
00200         case TCP_SYN_RCVD:              return "SYN_RCVD";
00201         case TCP_ESTABLISHED:           return "ESTABLISHED";
00202         case TCP_FIN_WAIT_1:            return "FIN_WAIT_1";
00203         case TCP_FIN_WAIT_2:            return "FIN_WAIT_2";
00204         case TCP_CLOSING_OR_LAST_ACK:   return "CLOSING/LAST_ACK";
00205         case TCP_TIME_WAIT:             return "TIME_WAIT";
00206         case TCP_CLOSE_WAIT:            return "CLOSE_WAIT";
00207         default:                        return "INVALID";
00208         }
00209 }
00210 
00211 /**
00212  * Dump TCP state transition
00213  *
00214  * @v tcp               TCP connection
00215  */
00216 static inline __attribute__ (( always_inline )) void
00217 tcp_dump_state ( struct tcp_connection *tcp ) {
00218 
00219         if ( tcp->tcp_state != tcp->prev_tcp_state ) {
00220                 DBGC ( tcp, "TCP %p transitioned from %s to %s\n", tcp,
00221                        tcp_state ( tcp->prev_tcp_state ),
00222                        tcp_state ( tcp->tcp_state ) );
00223         }
00224         tcp->prev_tcp_state = tcp->tcp_state;
00225 }
00226 
00227 /**
00228  * Dump TCP flags
00229  *
00230  * @v flags             TCP flags
00231  */
00232 static inline __attribute__ (( always_inline )) void
00233 tcp_dump_flags ( struct tcp_connection *tcp, unsigned int flags ) {
00234         if ( flags & TCP_RST )
00235                 DBGC2 ( tcp, " RST" );
00236         if ( flags & TCP_SYN )
00237                 DBGC2 ( tcp, " SYN" );
00238         if ( flags & TCP_PSH )
00239                 DBGC2 ( tcp, " PSH" );
00240         if ( flags & TCP_FIN )
00241                 DBGC2 ( tcp, " FIN" );
00242         if ( flags & TCP_ACK )
00243                 DBGC2 ( tcp, " ACK" );
00244 }
00245 
00246 /***************************************************************************
00247  *
00248  * Open and close
00249  *
00250  ***************************************************************************
00251  */
00252 
/**
 * Check whether a local TCP port is free
 *
 * @v port              Local port number
 * @ret port            Local port number, or negative error
 *
 * The port is considered busy if tcp_demux() finds a connection for
 * it, in which case -EADDRINUSE is returned.
 */
static int tcp_port_available ( int port ) {

        /* Busy if an existing connection already owns this port */
        if ( tcp_demux ( port ) )
                return -EADDRINUSE;

        return port;
}
00263 
00264 /**
00265  * Open a TCP connection
00266  *
00267  * @v xfer              Data transfer interface
00268  * @v peer              Peer socket address
00269  * @v local             Local socket address, or NULL
00270  * @ret rc              Return status code
00271  */
00272 static int tcp_open ( struct interface *xfer, struct sockaddr *peer,
00273                       struct sockaddr *local ) {
00274         struct sockaddr_tcpip *st_peer = ( struct sockaddr_tcpip * ) peer;
00275         struct sockaddr_tcpip *st_local = ( struct sockaddr_tcpip * ) local;
00276         struct tcp_connection *tcp;
00277         size_t mtu;
00278         int port;
00279         int rc;
00280 
00281         /* Allocate and initialise structure */
00282         tcp = zalloc ( sizeof ( *tcp ) );
00283         if ( ! tcp )
00284                 return -ENOMEM;
00285         DBGC ( tcp, "TCP %p allocated\n", tcp );
00286         ref_init ( &tcp->refcnt, NULL );
00287         intf_init ( &tcp->xfer, &tcp_xfer_desc, &tcp->refcnt );
00288         process_init_stopped ( &tcp->process, &tcp_process_desc, &tcp->refcnt );
00289         timer_init ( &tcp->timer, tcp_expired, &tcp->refcnt );
00290         timer_init ( &tcp->keepalive, tcp_keepalive_expired, &tcp->refcnt );
00291         timer_init ( &tcp->wait, tcp_wait_expired, &tcp->refcnt );
00292         tcp->prev_tcp_state = TCP_CLOSED;
00293         tcp->tcp_state = TCP_STATE_SENT ( TCP_SYN );
00294         tcp_dump_state ( tcp );
00295         tcp->snd_seq = random();
00296         INIT_LIST_HEAD ( &tcp->tx_queue );
00297         INIT_LIST_HEAD ( &tcp->rx_queue );
00298         memcpy ( &tcp->peer, st_peer, sizeof ( tcp->peer ) );
00299 
00300         /* Calculate MSS */
00301         mtu = tcpip_mtu ( &tcp->peer );
00302         if ( ! mtu ) {
00303                 DBGC ( tcp, "TCP %p has no route to %s\n",
00304                        tcp, sock_ntoa ( peer ) );
00305                 rc = -ENETUNREACH;
00306                 goto err;
00307         }
00308         tcp->mss = ( mtu - sizeof ( struct tcp_header ) );
00309 
00310         /* Bind to local port */
00311         port = tcpip_bind ( st_local, tcp_port_available );
00312         if ( port < 0 ) {
00313                 rc = port;
00314                 DBGC ( tcp, "TCP %p could not bind: %s\n",
00315                        tcp, strerror ( rc ) );
00316                 goto err;
00317         }
00318         tcp->local_port = port;
00319         DBGC ( tcp, "TCP %p bound to port %d\n", tcp, tcp->local_port );
00320 
00321         /* Start timer to initiate SYN */
00322         start_timer_nodelay ( &tcp->timer );
00323 
00324         /* Add a pending operation for the SYN */
00325         pending_get ( &tcp->pending_flags );
00326 
00327         /* Attach parent interface, transfer reference to connection
00328          * list and return
00329          */
00330         intf_plug_plug ( &tcp->xfer, xfer );
00331         list_add ( &tcp->list, &tcp_conns );
00332         return 0;
00333 
00334  err:
00335         ref_put ( &tcp->refcnt );
00336         return rc;
00337 }
00338 
00339 /**
00340  * Close TCP connection
00341  *
00342  * @v tcp               TCP connection
00343  * @v rc                Reason for close
00344  *
00345  * Closes the data transfer interface.  If the TCP state machine is in
00346  * a suitable state, the connection will be deleted.
00347  */
00348 static void tcp_close ( struct tcp_connection *tcp, int rc ) {
00349         struct io_buffer *iobuf;
00350         struct io_buffer *tmp;
00351 
00352         /* Close data transfer interface */
00353         intf_shutdown ( &tcp->xfer, rc );
00354         tcp->flags |= TCP_XFER_CLOSED;
00355 
00356         /* If we are in CLOSED, or have otherwise not yet received a
00357          * SYN (i.e. we are in LISTEN or SYN_SENT), just delete the
00358          * connection.
00359          */
00360         if ( ! ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) ) {
00361 
00362                 /* Transition to CLOSED for the sake of debugging messages */
00363                 tcp->tcp_state = TCP_CLOSED;
00364                 tcp_dump_state ( tcp );
00365 
00366                 /* Free any unprocessed I/O buffers */
00367                 list_for_each_entry_safe ( iobuf, tmp, &tcp->rx_queue, list ) {
00368                         list_del ( &iobuf->list );
00369                         free_iob ( iobuf );
00370                 }
00371 
00372                 /* Free any unsent I/O buffers */
00373                 list_for_each_entry_safe ( iobuf, tmp, &tcp->tx_queue, list ) {
00374                         list_del ( &iobuf->list );
00375                         free_iob ( iobuf );
00376                         pending_put ( &tcp->pending_data );
00377                 }
00378                 assert ( ! is_pending ( &tcp->pending_data ) );
00379 
00380                 /* Remove pending operations for SYN and FIN, if applicable */
00381                 pending_put ( &tcp->pending_flags );
00382                 pending_put ( &tcp->pending_flags );
00383 
00384                 /* Remove from list and drop reference */
00385                 process_del ( &tcp->process );
00386                 stop_timer ( &tcp->timer );
00387                 stop_timer ( &tcp->keepalive );
00388                 stop_timer ( &tcp->wait );
00389                 list_del ( &tcp->list );
00390                 ref_put ( &tcp->refcnt );
00391                 DBGC ( tcp, "TCP %p connection deleted\n", tcp );
00392                 return;
00393         }
00394 
00395         /* If we have not had our SYN acknowledged (i.e. we are in
00396          * SYN_RCVD), pretend that it has been acknowledged so that we
00397          * can send a FIN without breaking things.
00398          */
00399         if ( ! ( tcp->tcp_state & TCP_STATE_ACKED ( TCP_SYN ) ) )
00400                 tcp_rx_ack ( tcp, ( tcp->snd_seq + 1 ), 0 );
00401 
00402         /* Stop keepalive timer */
00403         stop_timer ( &tcp->keepalive );
00404 
00405         /* If we have no data remaining to send, start sending FIN */
00406         if ( list_empty ( &tcp->tx_queue ) &&
00407              ! ( tcp->tcp_state & TCP_STATE_SENT ( TCP_FIN ) ) ) {
00408 
00409                 tcp->tcp_state |= TCP_STATE_SENT ( TCP_FIN );
00410                 tcp_dump_state ( tcp );
00411                 process_add ( &tcp->process );
00412 
00413                 /* Add a pending operation for the FIN */
00414                 pending_get ( &tcp->pending_flags );
00415         }
00416 }
00417 
00418 /***************************************************************************
00419  *
00420  * Transmit data path
00421  *
00422  ***************************************************************************
00423  */
00424 
00425 /**
00426  * Calculate transmission window
00427  *
00428  * @v tcp               TCP connection
00429  * @ret len             Maximum length that can be sent in a single packet
00430  */
00431 static size_t tcp_xmit_win ( struct tcp_connection *tcp ) {
00432         size_t len;
00433 
00434         /* Not ready if we're not in a suitable connection state */
00435         if ( ! TCP_CAN_SEND_DATA ( tcp->tcp_state ) )
00436                 return 0;
00437 
00438         /* Length is the minimum of the receiver's window and the path MTU */
00439         len = tcp->snd_win;
00440         if ( len > TCP_PATH_MTU )
00441                 len = TCP_PATH_MTU;
00442 
00443         return len;
00444 }
00445 
00446 /**
00447  * Check data-transfer flow control window
00448  *
00449  * @v tcp               TCP connection
00450  * @ret len             Length of window
00451  */
00452 static size_t tcp_xfer_window ( struct tcp_connection *tcp ) {
00453 
00454         /* Not ready if data queue is non-empty.  This imposes a limit
00455          * of only one unACKed packet in the TX queue at any time; we
00456          * do this to conserve memory usage.
00457          */
00458         if ( ! list_empty ( &tcp->tx_queue ) )
00459                 return 0;
00460 
00461         /* Return TCP window length */
00462         return tcp_xmit_win ( tcp );
00463 }
00464 
00465 /**
00466  * Find selective acknowledgement block
00467  *
00468  * @v tcp               TCP connection
00469  * @v seq               SEQ value in SACK block (in host-endian order)
00470  * @v sack              SACK block to fill in (in host-endian order)
00471  * @ret len             Length of SACK block
00472  */
00473 static uint32_t tcp_sack_block ( struct tcp_connection *tcp, uint32_t seq,
00474                                  struct tcp_sack_block *sack ) {
00475         struct io_buffer *iobuf;
00476         struct tcp_rx_queued_header *tcpqhdr;
00477         uint32_t left = tcp->rcv_ack;
00478         uint32_t right = left;
00479 
00480         /* Find highest block which does not start after SEQ */
00481         list_for_each_entry ( iobuf, &tcp->rx_queue, list ) {
00482                 tcpqhdr = iobuf->data;
00483                 if ( tcp_cmp ( tcpqhdr->seq, right ) > 0 ) {
00484                         if ( tcp_cmp ( tcpqhdr->seq, seq ) > 0 )
00485                                 break;
00486                         left = tcpqhdr->seq;
00487                 }
00488                 if ( tcp_cmp ( tcpqhdr->nxt, right ) > 0 )
00489                         right = tcpqhdr->nxt;
00490         }
00491 
00492         /* Fail if this block does not contain SEQ */
00493         if ( tcp_cmp ( right, seq ) < 0 )
00494                 return 0;
00495 
00496         /* Populate SACK block */
00497         sack->left = left;
00498         sack->right = right;
00499         return ( right - left );
00500 }
00501 
00502 /**
00503  * Update TCP selective acknowledgement list
00504  *
00505  * @v tcp               TCP connection
00506  * @v seq               SEQ value in first SACK block (in host-endian order)
00507  * @ret count           Number of SACK blocks
00508  */
00509 static unsigned int tcp_sack ( struct tcp_connection *tcp, uint32_t seq ) {
00510         struct tcp_sack_block sack[TCP_SACK_MAX];
00511         unsigned int old = 0;
00512         unsigned int new = 0;
00513         unsigned int i;
00514         uint32_t len;
00515 
00516         /* Populate first new SACK block */
00517         len = tcp_sack_block ( tcp, seq, &sack[0] );
00518         if ( len )
00519                 new++;
00520 
00521         /* Populate remaining new SACK blocks based on old SACK blocks */
00522         for ( old = 0 ; old < TCP_SACK_MAX ; old++ ) {
00523 
00524                 /* Stop if we run out of space in the new list */
00525                 if ( new == TCP_SACK_MAX )
00526                         break;
00527 
00528                 /* Skip empty old SACK blocks */
00529                 if ( tcp->sack[old].left == tcp->sack[old].right )
00530                         continue;
00531 
00532                 /* Populate new SACK block */
00533                 len = tcp_sack_block ( tcp, tcp->sack[old].left, &sack[new] );
00534                 if ( len == 0 )
00535                         continue;
00536 
00537                 /* Eliminate duplicates */
00538                 for ( i = 0 ; i < new ; i++ ) {
00539                         if ( sack[i].left == sack[new].left ) {
00540                                 new--;
00541                                 break;
00542                         }
00543                 }
00544                 new++;
00545         }
00546 
00547         /* Update SACK list */
00548         memset ( tcp->sack, 0, sizeof ( tcp->sack ) );
00549         memcpy ( tcp->sack, sack, ( new * sizeof ( tcp->sack[0] ) ) );
00550         return new;
00551 }
00552 
00553 /**
00554  * Process TCP transmit queue
00555  *
00556  * @v tcp               TCP connection
00557  * @v max_len           Maximum length to process
00558  * @v dest              I/O buffer to fill with data, or NULL
00559  * @v remove            Remove data from queue
00560  * @ret len             Length of data processed
00561  *
00562  * This processes at most @c max_len bytes from the TCP connection's
00563  * transmit queue.  Data will be copied into the @c dest I/O buffer
00564  * (if provided) and, if @c remove is true, removed from the transmit
00565  * queue.
00566  */
00567 static size_t tcp_process_tx_queue ( struct tcp_connection *tcp, size_t max_len,
00568                                      struct io_buffer *dest, int remove ) {
00569         struct io_buffer *iobuf;
00570         struct io_buffer *tmp;
00571         size_t frag_len;
00572         size_t len = 0;
00573 
00574         list_for_each_entry_safe ( iobuf, tmp, &tcp->tx_queue, list ) {
00575                 frag_len = iob_len ( iobuf );
00576                 if ( frag_len > max_len )
00577                         frag_len = max_len;
00578                 if ( dest ) {
00579                         memcpy ( iob_put ( dest, frag_len ), iobuf->data,
00580                                  frag_len );
00581                 }
00582                 if ( remove ) {
00583                         iob_pull ( iobuf, frag_len );
00584                         if ( ! iob_len ( iobuf ) ) {
00585                                 list_del ( &iobuf->list );
00586                                 free_iob ( iobuf );
00587                                 pending_put ( &tcp->pending_data );
00588                         }
00589                 }
00590                 len += frag_len;
00591                 max_len -= frag_len;
00592         }
00593         return len;
00594 }
00595 
/**
 * Transmit any outstanding data (with selective acknowledgement)
 *
 * @v tcp               TCP connection
 * @v sack_seq          SEQ for first selective acknowledgement (if any)
 *
 * Transmits any outstanding data on the connection: at most one
 * segment carrying queued payload and/or a SYN or FIN, or a bare
 * segment if only an ACK is pending.  Nothing is sent while the
 * retransmission timer is running.
 *
 * This function returns no status.  Note that even if allocation or
 * transmission fails, the retransmission timer will have been
 * started if necessary, and so the stack will eventually attempt to
 * retransmit the failed packet.
 */
static void tcp_xmit_sack ( struct tcp_connection *tcp, uint32_t sack_seq ) {
        struct io_buffer *iobuf;
        struct tcp_header *tcphdr;
        struct tcp_mss_option *mssopt;
        struct tcp_window_scale_padded_option *wsopt;
        struct tcp_timestamp_padded_option *tsopt;
        struct tcp_sack_permitted_padded_option *spopt;
        struct tcp_sack_padded_option *sackopt;
        struct tcp_sack_block *sack;
        void *payload;
        unsigned int flags;
        unsigned int sack_count;
        unsigned int i;
        size_t len = 0;
        size_t sack_len;
        uint32_t seq_len;
        uint32_t max_rcv_win;
        uint32_t max_representable_win;
        int rc;

        /* Start profiling
         *
         * NOTE(review): the matching profile_stop() is reached only
         * when a packet is transmitted successfully; every early
         * return below leaves this sample unfinished.
         */
        profile_start ( &tcp_tx_profiler );

        /* If retransmission timer is already running, do nothing */
        if ( timer_running ( &tcp->timer ) )
                return;

        /* Calculate both the actual (payload) and sequence space
         * lengths that we wish to transmit.  The queue is only
         * measured here (no destination, no removal).
         */
        if ( TCP_CAN_SEND_DATA ( tcp->tcp_state ) ) {
                len = tcp_process_tx_queue ( tcp, tcp_xmit_win ( tcp ),
                                             NULL, 0 );
        }
        seq_len = len;
        flags = TCP_FLAGS_SENDING ( tcp->tcp_state );
        if ( flags & ( TCP_SYN | TCP_FIN ) ) {
                /* SYN or FIN consume one byte, and we can never send both */
                assert ( ! ( ( flags & TCP_SYN ) && ( flags & TCP_FIN ) ) );
                seq_len++;
        }
        tcp->snd_sent = seq_len;

        /* If we have nothing to transmit, stop now */
        if ( ( seq_len == 0 ) && ! ( tcp->flags & TCP_ACK_PENDING ) )
                return;

        /* If we are transmitting anything that requires
         * acknowledgement (i.e. consumes sequence space), start the
         * retransmission timer.  Do this before attempting to
         * allocate the I/O buffer, in case allocation itself fails.
         */
        if ( seq_len )
                start_timer ( &tcp->timer );

        /* Allocate I/O buffer */
        iobuf = alloc_iob ( len + TCP_MAX_HEADER_LEN );
        if ( ! iobuf ) {
                DBGC ( tcp, "TCP %p could not allocate iobuf for %08x..%08x "
                       "%08x\n", tcp, tcp->snd_seq, ( tcp->snd_seq + seq_len ),
                       tcp->rcv_ack );
                return;
        }
        /* Leave headroom so that options and header can be pushed in
         * front of the payload.
         */
        iob_reserve ( iobuf, TCP_MAX_HEADER_LEN );

        /* Fill data payload from transmit queue (copied, not removed,
         * so the data stays queued for possible retransmission)
         */
        tcp_process_tx_queue ( tcp, len, iobuf, 0 );

        /* Expand receive window if possible.  The candidate window is
         * capped by TCP_MAX_WINDOW_SIZE and by what a 16-bit window
         * field can express at the current scale; the window is never
         * shrunk here.
         */
        max_rcv_win = xfer_window ( &tcp->xfer );
        if ( max_rcv_win > TCP_MAX_WINDOW_SIZE )
                max_rcv_win = TCP_MAX_WINDOW_SIZE;
        max_representable_win = ( 0xffff << tcp->rcv_win_scale );
        if ( max_rcv_win > max_representable_win )
                max_rcv_win = max_representable_win;
        max_rcv_win &= ~0x03; /* Keep everything dword-aligned */
        if ( tcp->rcv_win < max_rcv_win )
                tcp->rcv_win = max_rcv_win;

        /* Fill up the TCP header.  Remember where the payload starts:
         * each iob_push() below moves the start of the buffer further
         * in front of it, so the distance back to this point is the
         * total header length.
         */
        payload = iobuf->data;
        if ( flags & TCP_SYN ) {
                /* MSS, window scale and SACK-permitted go on SYNs only */
                mssopt = iob_push ( iobuf, sizeof ( *mssopt ) );
                mssopt->kind = TCP_OPTION_MSS;
                mssopt->length = sizeof ( *mssopt );
                mssopt->mss = htons ( tcp->mss );
                wsopt = iob_push ( iobuf, sizeof ( *wsopt ) );
                wsopt->nop = TCP_OPTION_NOP;
                wsopt->wsopt.kind = TCP_OPTION_WS;
                wsopt->wsopt.length = sizeof ( wsopt->wsopt );
                wsopt->wsopt.scale = TCP_RX_WINDOW_SCALE;
                spopt = iob_push ( iobuf, sizeof ( *spopt ) );
                memset ( spopt->nop, TCP_OPTION_NOP, sizeof ( spopt->nop ) );
                spopt->spopt.kind = TCP_OPTION_SACK_PERMITTED;
                spopt->spopt.length = sizeof ( spopt->spopt );
        }
        if ( ( flags & TCP_SYN ) || ( tcp->flags & TCP_TS_ENABLED ) ) {
                /* Timestamp: offered on every SYN, and thereafter only
                 * if timestamps are enabled for the connection.
                 */
                tsopt = iob_push ( iobuf, sizeof ( *tsopt ) );
                memset ( tsopt->nop, TCP_OPTION_NOP, sizeof ( tsopt->nop ) );
                tsopt->tsopt.kind = TCP_OPTION_TS;
                tsopt->tsopt.length = sizeof ( tsopt->tsopt );
                tsopt->tsopt.tsval = htonl ( currticks() );
                tsopt->tsopt.tsecr = htonl ( tcp->ts_recent );
        }
        if ( ( tcp->flags & TCP_SACK_ENABLED ) &&
             ( ! list_empty ( &tcp->rx_queue ) ) &&
             ( ( sack_count = tcp_sack ( tcp, sack_seq ) ) != 0 ) ) {
                /* SACK: only when enabled, when the receive queue is
                 * non-empty, and when tcp_sack() produced at least one
                 * block.  The blocks follow the padded option header
                 * and are converted to network-endian order.
                 */
                sack_len = ( sack_count * sizeof ( *sack ) );
                sackopt = iob_push ( iobuf, ( sizeof ( *sackopt ) + sack_len ));
                memset ( sackopt->nop, TCP_OPTION_NOP, sizeof ( sackopt->nop ));
                sackopt->sackopt.kind = TCP_OPTION_SACK;
                sackopt->sackopt.length =
                        ( sizeof ( sackopt->sackopt ) + sack_len );
                sack = ( ( ( void * ) sackopt ) + sizeof ( *sackopt ) );
                for ( i = 0 ; i < sack_count ; i++, sack++ ) {
                        sack->left = htonl ( tcp->sack[i].left );
                        sack->right = htonl ( tcp->sack[i].right );
                }
        }
        /* Any segment carrying payload is pushed */
        if ( len != 0 )
                flags |= TCP_PSH;
        tcphdr = iob_push ( iobuf, sizeof ( *tcphdr ) );
        memset ( tcphdr, 0, sizeof ( *tcphdr ) );
        tcphdr->src = htons ( tcp->local_port );
        tcphdr->dest = tcp->peer.st_port;
        tcphdr->seq = htonl ( tcp->snd_seq );
        tcphdr->ack = htonl ( tcp->rcv_ack );
        /* Header length in bytes shifted left by two, i.e. the length
         * in 32-bit words shifted left by four.
         */
        tcphdr->hlen = ( ( payload - iobuf->data ) << 2 );
        tcphdr->flags = flags;
        tcphdr->win = htons ( tcp->rcv_win >> tcp->rcv_win_scale );
        /* Checksum over header and payload; tcpip_tx() is also given
         * a pointer to this field (presumably to finish the checksum
         * with network-layer data - confirm in tcpip_tx()).
         */
        tcphdr->csum = tcpip_chksum ( iobuf->data, iob_len ( iobuf ) );

        /* Dump header */
        DBGC2 ( tcp, "TCP %p TX %d->%d %08x..%08x           %08x %4zd",
                tcp, ntohs ( tcphdr->src ), ntohs ( tcphdr->dest ),
                ntohl ( tcphdr->seq ), ( ntohl ( tcphdr->seq ) + seq_len ),
                ntohl ( tcphdr->ack ), len );
        tcp_dump_flags ( tcp, tcphdr->flags );
        DBGC2 ( tcp, "\n" );

        /* Transmit packet.  On failure the ACK-pending flag is left
         * set, so the acknowledgement is still owed.
         */
        if ( ( rc = tcpip_tx ( iobuf, &tcp_protocol, NULL, &tcp->peer, NULL,
                               &tcphdr->csum ) ) != 0 ) {
                DBGC ( tcp, "TCP %p could not transmit %08x..%08x %08x: %s\n",
                       tcp, tcp->snd_seq, ( tcp->snd_seq + tcp->snd_sent ),
                       tcp->rcv_ack, strerror ( rc ) );
                return;
        }

        /* Clear ACK-pending flag */
        tcp->flags &= ~TCP_ACK_PENDING;

        profile_stop ( &tcp_tx_profiler );
}
00762 
00763 /**
00764  * Transmit any outstanding data
00765  *
00766  * @v tcp               TCP connection
00767  */
00768 static void tcp_xmit ( struct tcp_connection *tcp ) {
00769 
00770         /* Transmit without an explicit first SACK */
00771         tcp_xmit_sack ( tcp, tcp->rcv_ack );
00772 }
00773 
/** TCP process descriptor
 *
 * Associates the @c process member of a TCP connection with
 * tcp_xmit(), so that scheduling the process (e.g. via process_add()
 * in tcp_close()) results in a transmission attempt.
 */
static struct process_descriptor tcp_process_desc =
        PROC_DESC_ONCE ( struct tcp_connection, process, tcp_xmit );
00777 
00778 /**
00779  * Retransmission timer expired
00780  *
00781  * @v timer             Retransmission timer
00782  * @v over              Failure indicator
00783  */
00784 static void tcp_expired ( struct retry_timer *timer, int over ) {
00785         struct tcp_connection *tcp =
00786                 container_of ( timer, struct tcp_connection, timer );
00787 
00788         DBGC ( tcp, "TCP %p timer %s in %s for %08x..%08x %08x\n", tcp,
00789                ( over ? "expired" : "fired" ), tcp_state ( tcp->tcp_state ),
00790                tcp->snd_seq, ( tcp->snd_seq + tcp->snd_sent ), tcp->rcv_ack );
00791 
00792         assert ( ( tcp->tcp_state == TCP_SYN_SENT ) ||
00793                  ( tcp->tcp_state == TCP_SYN_RCVD ) ||
00794                  ( tcp->tcp_state == TCP_ESTABLISHED ) ||
00795                  ( tcp->tcp_state == TCP_FIN_WAIT_1 ) ||
00796                  ( tcp->tcp_state == TCP_CLOSE_WAIT ) ||
00797                  ( tcp->tcp_state == TCP_CLOSING_OR_LAST_ACK ) );
00798 
00799         if ( over ) {
00800                 /* If we have finally timed out and given up,
00801                  * terminate the connection
00802                  */
00803                 tcp->tcp_state = TCP_CLOSED;
00804                 tcp_dump_state ( tcp );
00805                 tcp_close ( tcp, -ETIMEDOUT );
00806         } else {
00807                 /* Otherwise, retransmit the packet */
00808                 tcp_xmit ( tcp );
00809         }
00810 }
00811 
00812 /**
00813  * Keepalive timer expired
00814  *
00815  * @v timer             Keepalive timer
00816  * @v over              Failure indicator
00817  */
00818 static void tcp_keepalive_expired ( struct retry_timer *timer,
00819                                     int over __unused ) {
00820         struct tcp_connection *tcp =
00821                 container_of ( timer, struct tcp_connection, keepalive );
00822 
00823         DBGC ( tcp, "TCP %p sending keepalive\n", tcp );
00824 
00825         /* Reset keepalive timer */
00826         start_timer_fixed ( &tcp->keepalive, TCP_KEEPALIVE_DELAY );
00827 
00828         /* Send keepalive.  We do this only to preserve or restore
00829          * state in intermediate devices (e.g. firewall NAT tables);
00830          * we don't actually care about eliciting a response to verify
00831          * that the peer is still alive.  We therefore send just a
00832          * pure ACK, to keep our transmit path simple.
00833          */
00834         tcp->flags |= TCP_ACK_PENDING;
00835         tcp_xmit ( tcp );
00836 }
00837 
00838 /**
00839  * Shutdown timer expired
00840  *
00841  * @v timer             Shutdown timer
00842  * @v over              Failure indicator
00843  */
00844 static void tcp_wait_expired ( struct retry_timer *timer, int over __unused ) {
00845         struct tcp_connection *tcp =
00846                 container_of ( timer, struct tcp_connection, wait );
00847 
00848         assert ( tcp->tcp_state == TCP_TIME_WAIT );
00849 
00850         DBGC ( tcp, "TCP %p wait complete in %s for %08x..%08x %08x\n", tcp,
00851                tcp_state ( tcp->tcp_state ), tcp->snd_seq,
00852                ( tcp->snd_seq + tcp->snd_sent ), tcp->rcv_ack );
00853 
00854         tcp->tcp_state = TCP_CLOSED;
00855         tcp_dump_state ( tcp );
00856         tcp_close ( tcp, 0 );
00857 }
00858 
00859 /**
00860  * Send RST response to incoming packet
00861  *
00862  * @v in_tcphdr         TCP header of incoming packet
00863  * @ret rc              Return status code
00864  */
00865 static int tcp_xmit_reset ( struct tcp_connection *tcp,
00866                             struct sockaddr_tcpip *st_dest,
00867                             struct tcp_header *in_tcphdr ) {
00868         struct io_buffer *iobuf;
00869         struct tcp_header *tcphdr;
00870         int rc;
00871 
00872         /* Allocate space for dataless TX buffer */
00873         iobuf = alloc_iob ( TCP_MAX_HEADER_LEN );
00874         if ( ! iobuf ) {
00875                 DBGC ( tcp, "TCP %p could not allocate iobuf for RST "
00876                        "%08x..%08x %08x\n", tcp, ntohl ( in_tcphdr->ack ),
00877                        ntohl ( in_tcphdr->ack ), ntohl ( in_tcphdr->seq ) );
00878                 return -ENOMEM;
00879         }
00880         iob_reserve ( iobuf, TCP_MAX_HEADER_LEN );
00881 
00882         /* Construct RST response */
00883         tcphdr = iob_push ( iobuf, sizeof ( *tcphdr ) );
00884         memset ( tcphdr, 0, sizeof ( *tcphdr ) );
00885         tcphdr->src = in_tcphdr->dest;
00886         tcphdr->dest = in_tcphdr->src;
00887         tcphdr->seq = in_tcphdr->ack;
00888         tcphdr->ack = in_tcphdr->seq;
00889         tcphdr->hlen = ( ( sizeof ( *tcphdr ) / 4 ) << 4 );
00890         tcphdr->flags = ( TCP_RST | TCP_ACK );
00891         tcphdr->win = htons ( 0 );
00892         tcphdr->csum = tcpip_chksum ( iobuf->data, iob_len ( iobuf ) );
00893 
00894         /* Dump header */
00895         DBGC2 ( tcp, "TCP %p TX %d->%d %08x..%08x           %08x %4d",
00896                 tcp, ntohs ( tcphdr->src ), ntohs ( tcphdr->dest ),
00897                 ntohl ( tcphdr->seq ), ( ntohl ( tcphdr->seq ) ),
00898                 ntohl ( tcphdr->ack ), 0 );
00899         tcp_dump_flags ( tcp, tcphdr->flags );
00900         DBGC2 ( tcp, "\n" );
00901 
00902         /* Transmit packet */
00903         if ( ( rc = tcpip_tx ( iobuf, &tcp_protocol, NULL, st_dest,
00904                                NULL, &tcphdr->csum ) ) != 0 ) {
00905                 DBGC ( tcp, "TCP %p could not transmit RST %08x..%08x %08x: "
00906                        "%s\n", tcp, ntohl ( in_tcphdr->ack ),
00907                        ntohl ( in_tcphdr->ack ), ntohl ( in_tcphdr->seq ),
00908                        strerror ( rc ) );
00909                 return rc;
00910         }
00911 
00912         return 0;
00913 }
00914 
00915 /***************************************************************************
00916  *
00917  * Receive data path
00918  *
00919  ***************************************************************************
00920  */
00921 
00922 /**
00923  * Identify TCP connection by local port number
00924  *
00925  * @v local_port        Local port
00926  * @ret tcp             TCP connection, or NULL
00927  */
00928 static struct tcp_connection * tcp_demux ( unsigned int local_port ) {
00929         struct tcp_connection *tcp;
00930 
00931         list_for_each_entry ( tcp, &tcp_conns, list ) {
00932                 if ( tcp->local_port == local_port )
00933                         return tcp;
00934         }
00935         return NULL;
00936 }
00937 
00938 /**
00939  * Parse TCP received options
00940  *
00941  * @v tcp               TCP connection (may be NULL)
00942  * @v tcphdr            TCP header
00943  * @v hlen              TCP header length
00944  * @v options           Options structure to fill in
00945  * @ret rc              Return status code
00946  */
00947 static int tcp_rx_opts ( struct tcp_connection *tcp,
00948                          const struct tcp_header *tcphdr, size_t hlen,
00949                          struct tcp_options *options ) {
00950         const void *data = ( ( ( void * ) tcphdr ) + sizeof ( *tcphdr ) );
00951         const void *end = ( ( ( void * ) tcphdr ) + hlen );
00952         const struct tcp_option *option;
00953         unsigned int kind;
00954         size_t remaining;
00955         size_t min;
00956 
00957         /* Sanity check */
00958         assert ( hlen >= sizeof ( *tcphdr ) );
00959 
00960         /* Parse options */
00961         memset ( options, 0, sizeof ( *options ) );
00962         while ( ( remaining = ( end - data ) ) ) {
00963 
00964                 /* Extract option code */
00965                 option = data;
00966                 kind = option->kind;
00967 
00968                 /* Handle single-byte options */
00969                 if ( kind == TCP_OPTION_END )
00970                         break;
00971                 if ( kind == TCP_OPTION_NOP ) {
00972                         data++;
00973                         continue;
00974                 }
00975 
00976                 /* Handle multi-byte options */
00977                 min = sizeof ( *option );
00978                 switch ( kind ) {
00979                 case TCP_OPTION_MSS:
00980                         /* Ignore received MSS */
00981                         break;
00982                 case TCP_OPTION_WS:
00983                         options->wsopt = data;
00984                         min = sizeof ( *options->wsopt );
00985                         break;
00986                 case TCP_OPTION_SACK_PERMITTED:
00987                         options->spopt = data;
00988                         min = sizeof ( *options->spopt );
00989                         break;
00990                 case TCP_OPTION_SACK:
00991                         /* Ignore received SACKs */
00992                         break;
00993                 case TCP_OPTION_TS:
00994                         options->tsopt = data;
00995                         min = sizeof ( *options->tsopt );
00996                         break;
00997                 default:
00998                         DBGC ( tcp, "TCP %p received unknown option %d\n",
00999                                tcp, kind );
01000                         break;
01001                 }
01002                 if ( remaining < min ) {
01003                         DBGC ( tcp, "TCP %p received truncated option %d\n",
01004                                tcp, kind );
01005                         return -EINVAL;
01006                 }
01007                 if ( option->length < min ) {
01008                         DBGC ( tcp, "TCP %p received underlength option %d\n",
01009                                tcp, kind );
01010                         return -EINVAL;
01011                 }
01012                 if ( option->length > remaining ) {
01013                         DBGC ( tcp, "TCP %p received overlength option %d\n",
01014                                tcp, kind );
01015                         return -EINVAL;
01016                 }
01017                 data += option->length;
01018         }
01019 
01020         return 0;
01021 }
01022 
01023 /**
01024  * Consume received sequence space
01025  *
01026  * @v tcp               TCP connection
01027  * @v seq_len           Sequence space length to consume
01028  */
01029 static void tcp_rx_seq ( struct tcp_connection *tcp, uint32_t seq_len ) {
01030         unsigned int sack;
01031 
01032         /* Sanity check */
01033         assert ( seq_len > 0 );
01034 
01035         /* Update acknowledgement number */
01036         tcp->rcv_ack += seq_len;
01037 
01038         /* Update window */
01039         if ( tcp->rcv_win > seq_len ) {
01040                 tcp->rcv_win -= seq_len;
01041         } else {
01042                 tcp->rcv_win = 0;
01043         }
01044 
01045         /* Update timestamp */
01046         tcp->ts_recent = tcp->ts_val;
01047 
01048         /* Update SACK list */
01049         for ( sack = 0 ; sack < TCP_SACK_MAX ; sack++ ) {
01050                 if ( tcp->sack[sack].left == tcp->sack[sack].right )
01051                         continue;
01052                 if ( tcp_cmp ( tcp->sack[sack].left, tcp->rcv_ack ) < 0 )
01053                         tcp->sack[sack].left = tcp->rcv_ack;
01054                 if ( tcp_cmp ( tcp->sack[sack].right, tcp->rcv_ack ) < 0 )
01055                         tcp->sack[sack].right = tcp->rcv_ack;
01056         }
01057 
01058         /* Mark ACK as pending */
01059         tcp->flags |= TCP_ACK_PENDING;
01060 }
01061 
01062 /**
01063  * Handle TCP received SYN
01064  *
01065  * @v tcp               TCP connection
01066  * @v seq               SEQ value (in host-endian order)
01067  * @v options           TCP options
01068  * @ret rc              Return status code
01069  */
01070 static int tcp_rx_syn ( struct tcp_connection *tcp, uint32_t seq,
01071                         struct tcp_options *options ) {
01072 
01073         /* Synchronise sequence numbers on first SYN */
01074         if ( ! ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) ) {
01075                 tcp->rcv_ack = seq;
01076                 if ( options->tsopt )
01077                         tcp->flags |= TCP_TS_ENABLED;
01078                 if ( options->spopt )
01079                         tcp->flags |= TCP_SACK_ENABLED;
01080                 if ( options->wsopt ) {
01081                         tcp->snd_win_scale = options->wsopt->scale;
01082                         tcp->rcv_win_scale = TCP_RX_WINDOW_SCALE;
01083                 }
01084                 DBGC ( tcp, "TCP %p using %stimestamps, %sSACK, TX window "
01085                        "x%d, RX window x%d\n", tcp,
01086                        ( ( tcp->flags & TCP_TS_ENABLED ) ? "" : "no " ),
01087                        ( ( tcp->flags & TCP_SACK_ENABLED ) ? "" : "no " ),
01088                        ( 1 << tcp->snd_win_scale ),
01089                        ( 1 << tcp->rcv_win_scale ) );
01090         }
01091 
01092         /* Ignore duplicate SYN */
01093         if ( seq != tcp->rcv_ack )
01094                 return 0;
01095 
01096         /* Acknowledge SYN */
01097         tcp_rx_seq ( tcp, 1 );
01098 
01099         /* Mark SYN as received and start sending ACKs with each packet */
01100         tcp->tcp_state |= ( TCP_STATE_SENT ( TCP_ACK ) |
01101                             TCP_STATE_RCVD ( TCP_SYN ) );
01102 
01103         return 0;
01104 }
01105 
01106 /**
01107  * Handle TCP received ACK
01108  *
01109  * @v tcp               TCP connection
01110  * @v ack               ACK value (in host-endian order)
01111  * @v win               WIN value (in host-endian order)
01112  * @ret rc              Return status code
01113  */
01114 static int tcp_rx_ack ( struct tcp_connection *tcp, uint32_t ack,
01115                         uint32_t win ) {
01116         uint32_t ack_len = ( ack - tcp->snd_seq );
01117         size_t len;
01118         unsigned int acked_flags;
01119 
01120         /* Check for out-of-range or old duplicate ACKs */
01121         if ( ack_len > tcp->snd_sent ) {
01122                 DBGC ( tcp, "TCP %p received ACK for %08x..%08x, "
01123                        "sent only %08x..%08x\n", tcp, tcp->snd_seq,
01124                        ( tcp->snd_seq + ack_len ), tcp->snd_seq,
01125                        ( tcp->snd_seq + tcp->snd_sent ) );
01126 
01127                 if ( TCP_HAS_BEEN_ESTABLISHED ( tcp->tcp_state ) ) {
01128                         /* Just ignore what might be old duplicate ACKs */
01129                         return 0;
01130                 } else {
01131                         /* Send RST if an out-of-range ACK is received
01132                          * on a not-yet-established connection, as per
01133                          * RFC 793.
01134                          */
01135                         return -EINVAL;
01136                 }
01137         }
01138 
01139         /* Update window size */
01140         tcp->snd_win = win;
01141 
01142         /* Hold off (or start) the keepalive timer, if applicable */
01143         if ( ! ( tcp->tcp_state & TCP_STATE_SENT ( TCP_FIN ) ) )
01144                 start_timer_fixed ( &tcp->keepalive, TCP_KEEPALIVE_DELAY );
01145 
01146         /* Ignore ACKs that don't actually acknowledge any new data.
01147          * (In particular, do not stop the retransmission timer; this
01148          * avoids creating a sorceror's apprentice syndrome when a
01149          * duplicate ACK is received and we still have data in our
01150          * transmit queue.)
01151          */
01152         if ( ack_len == 0 )
01153                 return 0;
01154 
01155         /* Stop the retransmission timer */
01156         stop_timer ( &tcp->timer );
01157 
01158         /* Determine acknowledged flags and data length */
01159         len = ack_len;
01160         acked_flags = ( TCP_FLAGS_SENDING ( tcp->tcp_state ) &
01161                         ( TCP_SYN | TCP_FIN ) );
01162         if ( acked_flags ) {
01163                 len--;
01164                 pending_put ( &tcp->pending_flags );
01165         }
01166 
01167         /* Update SEQ and sent counters */
01168         tcp->snd_seq = ack;
01169         tcp->snd_sent = 0;
01170 
01171         /* Remove any acknowledged data from transmit queue */
01172         tcp_process_tx_queue ( tcp, len, NULL, 1 );
01173                 
01174         /* Mark SYN/FIN as acknowledged if applicable. */
01175         if ( acked_flags )
01176                 tcp->tcp_state |= TCP_STATE_ACKED ( acked_flags );
01177 
01178         /* Start sending FIN if we've had all possible data ACKed */
01179         if ( list_empty ( &tcp->tx_queue ) &&
01180              ( tcp->flags & TCP_XFER_CLOSED ) &&
01181              ! ( tcp->tcp_state & TCP_STATE_SENT ( TCP_FIN ) ) ) {
01182                 tcp->tcp_state |= TCP_STATE_SENT ( TCP_FIN );
01183                 pending_get ( &tcp->pending_flags );
01184         }
01185 
01186         return 0;
01187 }
01188 
01189 /**
01190  * Handle TCP received data
01191  *
01192  * @v tcp               TCP connection
01193  * @v seq               SEQ value (in host-endian order)
01194  * @v iobuf             I/O buffer
01195  * @ret rc              Return status code
01196  *
01197  * This function takes ownership of the I/O buffer.
01198  */
01199 static int tcp_rx_data ( struct tcp_connection *tcp, uint32_t seq,
01200                          struct io_buffer *iobuf ) {
01201         uint32_t already_rcvd;
01202         uint32_t len;
01203         int rc;
01204 
01205         /* Ignore duplicate or out-of-order data */
01206         already_rcvd = ( tcp->rcv_ack - seq );
01207         len = iob_len ( iobuf );
01208         if ( already_rcvd >= len ) {
01209                 free_iob ( iobuf );
01210                 return 0;
01211         }
01212         iob_pull ( iobuf, already_rcvd );
01213         len -= already_rcvd;
01214 
01215         /* Acknowledge new data */
01216         tcp_rx_seq ( tcp, len );
01217 
01218         /* Deliver data to application */
01219         profile_start ( &tcp_xfer_profiler );
01220         if ( ( rc = xfer_deliver_iob ( &tcp->xfer, iobuf ) ) != 0 ) {
01221                 DBGC ( tcp, "TCP %p could not deliver %08x..%08x: %s\n",
01222                        tcp, seq, ( seq + len ), strerror ( rc ) );
01223                 return rc;
01224         }
01225         profile_stop ( &tcp_xfer_profiler );
01226 
01227         return 0;
01228 }
01229 
01230 /**
01231  * Handle TCP received FIN
01232  *
01233  * @v tcp               TCP connection
01234  * @v seq               SEQ value (in host-endian order)
01235  * @ret rc              Return status code
01236  */
01237 static int tcp_rx_fin ( struct tcp_connection *tcp, uint32_t seq ) {
01238 
01239         /* Ignore duplicate or out-of-order FIN */
01240         if ( seq != tcp->rcv_ack )
01241                 return 0;
01242 
01243         /* Acknowledge FIN */
01244         tcp_rx_seq ( tcp, 1 );
01245 
01246         /* Mark FIN as received */
01247         tcp->tcp_state |= TCP_STATE_RCVD ( TCP_FIN );
01248 
01249         /* Close connection */
01250         tcp_close ( tcp, 0 );
01251 
01252         return 0;
01253 }
01254 
01255 /**
01256  * Handle TCP received RST
01257  *
01258  * @v tcp               TCP connection
01259  * @v seq               SEQ value (in host-endian order)
01260  * @ret rc              Return status code
01261  */
01262 static int tcp_rx_rst ( struct tcp_connection *tcp, uint32_t seq ) {
01263 
01264         /* Accept RST only if it falls within the window.  If we have
01265          * not yet received a SYN, then we have no window to test
01266          * against, so fall back to checking that our SYN has been
01267          * ACKed.
01268          */
01269         if ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) {
01270                 if ( ! tcp_in_window ( seq, tcp->rcv_ack, tcp->rcv_win ) )
01271                         return 0;
01272         } else {
01273                 if ( ! ( tcp->tcp_state & TCP_STATE_ACKED ( TCP_SYN ) ) )
01274                         return 0;
01275         }
01276 
01277         /* Abort connection */
01278         tcp->tcp_state = TCP_CLOSED;
01279         tcp_dump_state ( tcp );
01280         tcp_close ( tcp, -ECONNRESET );
01281 
01282         DBGC ( tcp, "TCP %p connection reset by peer\n", tcp );
01283         return -ECONNRESET;
01284 }
01285 
01286 /**
01287  * Enqueue received TCP packet
01288  *
01289  * @v tcp               TCP connection
01290  * @v seq               SEQ value (in host-endian order)
01291  * @v flags             TCP flags
01292  * @v iobuf             I/O buffer
01293  */
01294 static void tcp_rx_enqueue ( struct tcp_connection *tcp, uint32_t seq,
01295                              uint8_t flags, struct io_buffer *iobuf ) {
01296         struct tcp_rx_queued_header *tcpqhdr;
01297         struct io_buffer *queued;
01298         size_t len;
01299         uint32_t seq_len;
01300         uint32_t nxt;
01301 
01302         /* Calculate remaining flags and sequence length.  Note that
01303          * SYN, if present, has already been processed by this point.
01304          */
01305         flags &= TCP_FIN;
01306         len = iob_len ( iobuf );
01307         seq_len = ( len + ( flags ? 1 : 0 ) );
01308         nxt = ( seq + seq_len );
01309 
01310         /* Discard immediately (to save memory) if:
01311          *
01312          * a) we have not yet received a SYN (and so have no defined
01313          *    receive window), or
01314          * b) the packet lies entirely outside the receive window, or
01315          * c) there is no further content to process.
01316          */
01317         if ( ( ! ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) ) ||
01318              ( tcp_cmp ( seq, tcp->rcv_ack + tcp->rcv_win ) >= 0 ) ||
01319              ( tcp_cmp ( nxt, tcp->rcv_ack ) < 0 ) ||
01320              ( seq_len == 0 ) ) {
01321                 free_iob ( iobuf );
01322                 return;
01323         }
01324 
01325         /* Add internal header */
01326         tcpqhdr = iob_push ( iobuf, sizeof ( *tcpqhdr ) );
01327         tcpqhdr->seq = seq;
01328         tcpqhdr->nxt = nxt;
01329         tcpqhdr->flags = flags;
01330 
01331         /* Add to RX queue */
01332         list_for_each_entry ( queued, &tcp->rx_queue, list ) {
01333                 tcpqhdr = queued->data;
01334                 if ( tcp_cmp ( seq, tcpqhdr->seq ) < 0 )
01335                         break;
01336         }
01337         list_add_tail ( &iobuf->list, &queued->list );
01338 }
01339 
01340 /**
01341  * Process receive queue
01342  *
01343  * @v tcp               TCP connection
01344  */
01345 static void tcp_process_rx_queue ( struct tcp_connection *tcp ) {
01346         struct io_buffer *iobuf;
01347         struct tcp_rx_queued_header *tcpqhdr;
01348         uint32_t seq;
01349         unsigned int flags;
01350         size_t len;
01351 
01352         /* Process all applicable received buffers.  Note that we
01353          * cannot use list_for_each_entry() to iterate over the RX
01354          * queue, since tcp_discard() may remove packets from the RX
01355          * queue while we are processing.
01356          */
01357         while ( ( iobuf = list_first_entry ( &tcp->rx_queue, struct io_buffer,
01358                                              list ) ) ) {
01359 
01360                 /* Stop processing when we hit the first gap */
01361                 tcpqhdr = iobuf->data;
01362                 if ( tcp_cmp ( tcpqhdr->seq, tcp->rcv_ack ) > 0 )
01363                         break;
01364 
01365                 /* Strip internal header and remove from RX queue */
01366                 list_del ( &iobuf->list );
01367                 seq = tcpqhdr->seq;
01368                 flags = tcpqhdr->flags;
01369                 iob_pull ( iobuf, sizeof ( *tcpqhdr ) );
01370                 len = iob_len ( iobuf );
01371 
01372                 /* Handle new data, if any */
01373                 tcp_rx_data ( tcp, seq, iob_disown ( iobuf ) );
01374                 seq += len;
01375 
01376                 /* Handle FIN, if present */
01377                 if ( flags & TCP_FIN ) {
01378                         tcp_rx_fin ( tcp, seq );
01379                         seq++;
01380                 }
01381         }
01382 }
01383 
01384 /**
01385  * Process received packet
01386  *
01387  * @v iobuf             I/O buffer
01388  * @v netdev            Network device
01389  * @v st_src            Partially-filled source address
01390  * @v st_dest           Partially-filled destination address
01391  * @v pshdr_csum        Pseudo-header checksum
01392  * @ret rc              Return status code
01393   */
01394 static int tcp_rx ( struct io_buffer *iobuf,
01395                     struct net_device *netdev __unused,
01396                     struct sockaddr_tcpip *st_src,
01397                     struct sockaddr_tcpip *st_dest __unused,
01398                     uint16_t pshdr_csum ) {
01399         struct tcp_header *tcphdr = iobuf->data;
01400         struct tcp_connection *tcp;
01401         struct tcp_options options;
01402         size_t hlen;
01403         uint16_t csum;
01404         uint32_t seq;
01405         uint32_t ack;
01406         uint16_t raw_win;
01407         uint32_t win;
01408         unsigned int flags;
01409         size_t len;
01410         uint32_t seq_len;
01411         size_t old_xfer_window;
01412         int rc;
01413 
01414         /* Start profiling */
01415         profile_start ( &tcp_rx_profiler );
01416 
01417         /* Sanity check packet */
01418         if ( iob_len ( iobuf ) < sizeof ( *tcphdr ) ) {
01419                 DBG ( "TCP packet too short at %zd bytes (min %zd bytes)\n",
01420                       iob_len ( iobuf ), sizeof ( *tcphdr ) );
01421                 rc = -EINVAL;
01422                 goto discard;
01423         }
01424         hlen = ( ( tcphdr->hlen & TCP_MASK_HLEN ) / 16 ) * 4;
01425         if ( hlen < sizeof ( *tcphdr ) ) {
01426                 DBG ( "TCP header too short at %zd bytes (min %zd bytes)\n",
01427                       hlen, sizeof ( *tcphdr ) );
01428                 rc = -EINVAL;
01429                 goto discard;
01430         }
01431         if ( hlen > iob_len ( iobuf ) ) {
01432                 DBG ( "TCP header too long at %zd bytes (max %zd bytes)\n",
01433                       hlen, iob_len ( iobuf ) );
01434                 rc = -EINVAL;
01435                 goto discard;
01436         }
01437         csum = tcpip_continue_chksum ( pshdr_csum, iobuf->data,
01438                                        iob_len ( iobuf ) );
01439         if ( csum != 0 ) {
01440                 DBG ( "TCP checksum incorrect (is %04x including checksum "
01441                       "field, should be 0000)\n", csum );
01442                 rc = -EINVAL;
01443                 goto discard;
01444         }
01445         
01446         /* Parse parameters from header and strip header */
01447         tcp = tcp_demux ( ntohs ( tcphdr->dest ) );
01448         seq = ntohl ( tcphdr->seq );
01449         ack = ntohl ( tcphdr->ack );
01450         raw_win = ntohs ( tcphdr->win );
01451         flags = tcphdr->flags;
01452         if ( ( rc = tcp_rx_opts ( tcp, tcphdr, hlen, &options ) ) != 0 )
01453                 goto discard;
01454         if ( tcp && options.tsopt )
01455                 tcp->ts_val = ntohl ( options.tsopt->tsval );
01456         iob_pull ( iobuf, hlen );
01457         len = iob_len ( iobuf );
01458         seq_len = ( len + ( ( flags & TCP_SYN ) ? 1 : 0 ) +
01459                     ( ( flags & TCP_FIN ) ? 1 : 0 ) );
01460 
01461         /* Dump header */
01462         DBGC2 ( tcp, "TCP %p RX %d<-%d           %08x %08x..%08x %4zd",
01463                 tcp, ntohs ( tcphdr->dest ), ntohs ( tcphdr->src ),
01464                 ntohl ( tcphdr->ack ), ntohl ( tcphdr->seq ),
01465                 ( ntohl ( tcphdr->seq ) + seq_len ), len );
01466         tcp_dump_flags ( tcp, tcphdr->flags );
01467         DBGC2 ( tcp, "\n" );
01468 
01469         /* If no connection was found, silently drop packet */
01470         if ( ! tcp ) {
01471                 rc = -ENOTCONN;
01472                 goto discard;
01473         }
01474 
01475         /* Record old data-transfer window */
01476         old_xfer_window = tcp_xfer_window ( tcp );
01477 
01478         /* Handle ACK, if present */
01479         if ( flags & TCP_ACK ) {
01480                 win = ( raw_win << tcp->snd_win_scale );
01481                 if ( ( rc = tcp_rx_ack ( tcp, ack, win ) ) != 0 ) {
01482                         tcp_xmit_reset ( tcp, st_src, tcphdr );
01483                         goto discard;
01484                 }
01485         }
01486 
01487         /* Force an ACK if this packet is out of order */
01488         if ( ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) &&
01489              ( seq != tcp->rcv_ack ) ) {
01490                 tcp->flags |= TCP_ACK_PENDING;
01491         }
01492 
01493         /* Handle SYN, if present */
01494         if ( flags & TCP_SYN ) {
01495                 tcp_rx_syn ( tcp, seq, &options );
01496                 seq++;
01497         }
01498 
01499         /* Handle RST, if present */
01500         if ( flags & TCP_RST ) {
01501                 if ( ( rc = tcp_rx_rst ( tcp, seq ) ) != 0 )
01502                         goto discard;
01503         }
01504 
01505         /* Enqueue received data */
01506         tcp_rx_enqueue ( tcp, seq, flags, iob_disown ( iobuf ) );
01507 
01508         /* Process receive queue */
01509         tcp_process_rx_queue ( tcp );
01510 
01511         /* Dump out any state change as a result of the received packet */
01512         tcp_dump_state ( tcp );
01513 
01514         /* Schedule transmission of ACK (and any pending data).  If we
01515          * have received any out-of-order packets (i.e. if the receive
01516          * queue remains non-empty after processing) then send the ACK
01517          * immediately in order to trigger Fast Retransmission.
01518          */
01519         if ( list_empty ( &tcp->rx_queue ) ) {
01520                 process_add ( &tcp->process );
01521         } else {
01522                 tcp_xmit_sack ( tcp, seq );
01523         }
01524 
01525         /* If this packet was the last we expect to receive, set up
01526          * timer to expire and cause the connection to be freed.
01527          */
01528         if ( TCP_CLOSED_GRACEFULLY ( tcp->tcp_state ) ) {
01529                 stop_timer ( &tcp->wait );
01530                 start_timer_fixed ( &tcp->wait, ( 2 * TCP_MSL ) );
01531         }
01532 
01533         /* Notify application if window has changed */
01534         if ( tcp_xfer_window ( tcp ) != old_xfer_window )
01535                 xfer_window_changed ( &tcp->xfer );
01536 
01537         profile_stop ( &tcp_rx_profiler );
01538         return 0;
01539 
01540  discard:
01541         /* Free received packet */
01542         free_iob ( iobuf );
01543         return rc;
01544 }
01545 
01546 /** TCP protocol */
01547 struct tcpip_protocol tcp_protocol __tcpip_protocol = {
01548         .name = "TCP",
01549         .rx = tcp_rx,
01550         .tcpip_proto = IP_TCP,
01551 };
01552 
01553 /**
01554  * Discard some cached TCP data
01555  *
01556  * @ret discarded       Number of cached items discarded
01557  */
01558 static unsigned int tcp_discard ( void ) {
01559         struct tcp_connection *tcp;
01560         struct io_buffer *iobuf;
01561         unsigned int discarded = 0;
01562 
01563         /* Try to drop one queued RX packet from each connection */
01564         list_for_each_entry ( tcp, &tcp_conns, list ) {
01565                 list_for_each_entry_reverse ( iobuf, &tcp->rx_queue, list ) {
01566 
01567                         /* Remove packet from queue */
01568                         list_del ( &iobuf->list );
01569                         free_iob ( iobuf );
01570 
01571                         /* Report discard */
01572                         discarded++;
01573                         break;
01574                 }
01575         }
01576 
01577         return discarded;
01578 }
01579 
/** TCP cache discarder
 *
 * Exposes tcp_discard() as the discard routine for this file.
 *
 * NOTE(review): CACHE_NORMAL is presumably the priority/cost class
 * under which this discarder is registered; confirm against the
 * definition of __cache_discarder.
 */
struct cache_discarder tcp_discarder __cache_discarder ( CACHE_NORMAL ) = {
        .discard = tcp_discard,
};
01584 
01585 /**
01586  * Find first TCP connection that has not yet been closed
01587  *
01588  * @ret tcp             First unclosed connection, or NULL
01589  */
01590 static struct tcp_connection * tcp_first_unclosed ( void ) {
01591         struct tcp_connection *tcp;
01592 
01593         /* Find first connection which has not yet been closed */
01594         list_for_each_entry ( tcp, &tcp_conns, list ) {
01595                 if ( ! ( tcp->flags & TCP_XFER_CLOSED ) )
01596                         return tcp;
01597         }
01598         return NULL;
01599 }
01600 
01601 /**
01602  * Find first TCP connection that has not yet finished all operations
01603  *
01604  * @ret tcp             First unfinished connection, or NULL
01605  */
01606 static struct tcp_connection * tcp_first_unfinished ( void ) {
01607         struct tcp_connection *tcp;
01608 
01609         /* Find first connection which has not yet closed gracefully,
01610          * or which still has a pending transmission (e.g. to ACK the
01611          * received FIN).
01612          */
01613         list_for_each_entry ( tcp, &tcp_conns, list ) {
01614                 if ( ( ! TCP_CLOSED_GRACEFULLY ( tcp->tcp_state ) ) ||
01615                      process_running ( &tcp->process ) ) {
01616                         return tcp;
01617                 }
01618         }
01619         return NULL;
01620 }
01621 
01622 /**
01623  * Shut down all TCP connections
01624  *
01625  */
01626 static void tcp_shutdown ( int booting __unused ) {
01627         struct tcp_connection *tcp;
01628         unsigned long start;
01629         unsigned long elapsed;
01630 
01631         /* Initiate a graceful close of all connections, allowing for
01632          * the fact that the connection list may change as we do so.
01633          */
01634         while ( ( tcp = tcp_first_unclosed() ) ) {
01635                 DBGC ( tcp, "TCP %p closing for shutdown\n", tcp );
01636                 tcp_close ( tcp, -ECANCELED );
01637         }
01638 
01639         /* Wait for all connections to finish closing gracefully */
01640         start = currticks();
01641         while ( ( tcp = tcp_first_unfinished() ) &&
01642                 ( ( elapsed = ( currticks() - start ) ) < TCP_FINISH_TIMEOUT )){
01643                 step();
01644         }
01645 
01646         /* Forcibly close any remaining connections */
01647         while ( ( tcp = list_first_entry ( &tcp_conns, struct tcp_connection,
01648                                            list ) ) != NULL ) {
01649                 tcp->tcp_state = TCP_CLOSED;
01650                 tcp_dump_state ( tcp );
01651                 tcp_close ( tcp, -ECANCELED );
01652         }
01653 }
01654 
/** TCP shutdown function
 *
 * Only the shutdown handler is set here; no other member of the
 * structure is initialised by this definition.
 *
 * NOTE(review): STARTUP_LATE presumably selects where this entry is
 * ordered relative to other startup functions; confirm against the
 * definition of __startup_fn.
 */
struct startup_fn tcp_startup_fn __startup_fn ( STARTUP_LATE ) = {
        .shutdown = tcp_shutdown,
};
01659 
01660 /***************************************************************************
01661  *
01662  * Data transfer interface
01663  *
01664  ***************************************************************************
01665  */
01666 
/**
 * Close interface
 *
 * Installed as the intf_close operation of the data transfer
 * interface (see tcp_xfer_operations).  Closes the connection with
 * the given reason and then immediately attempts a transmission.
 *
 * @v tcp               TCP connection
 * @v rc                Reason for close
 */
static void tcp_xfer_close ( struct tcp_connection *tcp, int rc ) {

        /* Close data transfer interface */
        tcp_close ( tcp, rc );

        /* Transmit FIN, if possible.  This must follow tcp_close(),
         * since the transmission is meant to reflect the state left
         * behind by the close.
         */
        tcp_xmit ( tcp );
}
01681 
/**
 * Deliver datagram as I/O buffer
 *
 * Installed as the xfer_deliver operation of the data transfer
 * interface (see tcp_xfer_operations).  Appends the buffer to the
 * tail of the connection's transmit queue, records one more pending
 * data operation, and then attempts a transmission.
 *
 * @v tcp               TCP connection
 * @v iobuf             Datagram I/O buffer
 * @v meta              Data transfer metadata (unused)
 * @ret rc              Return status code (always zero)
 */
static int tcp_xfer_deliver ( struct tcp_connection *tcp,
                              struct io_buffer *iobuf,
                              struct xfer_metadata *meta __unused ) {

        /* Enqueue packet.
         *
         * NOTE(review): the buffer is presumably owned by the
         * transmit queue from this point onwards; confirm against
         * the code that drains tx_queue.
         */
        list_add_tail ( &iobuf->list, &tcp->tx_queue );

        /* Each enqueued packet is a pending operation */
        pending_get ( &tcp->pending_data );

        /* Transmit data, if possible */
        tcp_xmit ( tcp );

        return 0;
}
01705 
/** TCP data transfer interface operations
 *
 * Maps each supported interface operation onto its handler taking a
 * struct tcp_connection pointer: delivery of data for transmission,
 * window query, and interface close.
 */
static struct interface_operation tcp_xfer_operations[] = {
        INTF_OP ( xfer_deliver, struct tcp_connection *, tcp_xfer_deliver ),
        INTF_OP ( xfer_window, struct tcp_connection *, tcp_xfer_window ),
        INTF_OP ( intf_close, struct tcp_connection *, tcp_xfer_close ),
};
01712 
/** TCP data transfer interface descriptor
 *
 * Describes the "xfer" member of struct tcp_connection and attaches
 * the tcp_xfer_operations table to it.
 */
static struct interface_descriptor tcp_xfer_desc =
        INTF_DESC ( struct tcp_connection, xfer, tcp_xfer_operations );
01716 
01717 /***************************************************************************
01718  *
01719  * Openers
01720  *
01721  ***************************************************************************
01722  */
01723 
/** TCP IPv4 socket opener
 *
 * Selects tcp_open() for sockets with TCP_SOCK_STREAM semantics in
 * the AF_INET address family.
 */
struct socket_opener tcp_ipv4_socket_opener __socket_opener = {
        .semantics      = TCP_SOCK_STREAM,
        .family         = AF_INET,
        .open           = tcp_open,
};
01730 
/** TCP IPv6 socket opener
 *
 * Selects tcp_open() for sockets with TCP_SOCK_STREAM semantics in
 * the AF_INET6 address family.
 */
struct socket_opener tcp_ipv6_socket_opener __socket_opener = {
        .semantics      = TCP_SOCK_STREAM,
        .family         = AF_INET6,
        .open           = tcp_open,
};
01737 
/** Linkage hack
 *
 * Global integer holding the value of TCP_SOCK_STREAM.
 *
 * NOTE(review): presumably exists so that code referring to this
 * symbol causes this object file to be linked in; confirm against
 * the header that declares tcp_sock_stream.
 */
int tcp_sock_stream = TCP_SOCK_STREAM;
01740 
01741 /**
01742  * Open TCP URI
01743  *
01744  * @v xfer              Data transfer interface
01745  * @v uri               URI
01746  * @ret rc              Return status code
01747  */
01748 static int tcp_open_uri ( struct interface *xfer, struct uri *uri ) {
01749         struct sockaddr_tcpip peer;
01750 
01751         /* Sanity check */
01752         if ( ! uri->host )
01753                 return -EINVAL;
01754 
01755         memset ( &peer, 0, sizeof ( peer ) );
01756         peer.st_port = htons ( uri_port ( uri, 0 ) );
01757         return xfer_open_named_socket ( xfer, SOCK_STREAM,
01758                                         ( struct sockaddr * ) &peer,
01759                                         uri->host, NULL );
01760 }
01761 
/** TCP URI opener
 *
 * Associates the "tcp" URI scheme with tcp_open_uri().
 */
struct uri_opener tcp_uri_opener __uri_opener = {
        .scheme         = "tcp",
        .open           = tcp_open_uri,
};
01767