iPXE
tcp.c
Go to the documentation of this file.
00001 #include <string.h>
00002 #include <stdlib.h>
00003 #include <stdio.h>
00004 #include <assert.h>
00005 #include <errno.h>
00006 #include <byteswap.h>
00007 #include <ipxe/timer.h>
00008 #include <ipxe/iobuf.h>
00009 #include <ipxe/malloc.h>
00010 #include <ipxe/init.h>
00011 #include <ipxe/retry.h>
00012 #include <ipxe/refcnt.h>
00013 #include <ipxe/pending.h>
00014 #include <ipxe/xfer.h>
00015 #include <ipxe/open.h>
00016 #include <ipxe/uri.h>
00017 #include <ipxe/netdevice.h>
00018 #include <ipxe/profile.h>
00019 #include <ipxe/process.h>
00020 #include <ipxe/job.h>
00021 #include <ipxe/tcpip.h>
00022 #include <ipxe/tcp.h>
00023 
00024 /** @file
00025  *
00026  * TCP protocol
00027  *
00028  */
00029 
00030 FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
00031 
00032 /** A TCP connection */
00033 struct tcp_connection {
00034         /** Reference counter */
00035         struct refcnt refcnt;
00036         /** List of TCP connections */
00037         struct list_head list;
00038 
00039         /** Flags */
00040         unsigned int flags;
00041 
00042         /** Data transfer interface */
00043         struct interface xfer;
00044 
00045         /** Remote socket address */
00046         struct sockaddr_tcpip peer;
00047         /** Local port */
00048         unsigned int local_port;
00049         /** Maximum segment size */
00050         size_t mss;
00051 
00052         /** Current TCP state */
00053         unsigned int tcp_state;
00054         /** Previous TCP state
00055          *
00056          * Maintained only for debug messages
00057          */
00058         unsigned int prev_tcp_state;
00059         /** Current sequence number
00060          *
00061          * Equivalent to SND.UNA in RFC 793 terminology.
00062          */
00063         uint32_t snd_seq;
00064         /** Unacknowledged sequence count
00065          *
00066          * Equivalent to (SND.NXT-SND.UNA) in RFC 793 terminology.
00067          */
00068         uint32_t snd_sent;
00069         /** Send window
00070          *
00071          * Equivalent to SND.WND in RFC 793 terminology
00072          */
00073         uint32_t snd_win;
00074         /** Current acknowledgement number
00075          *
00076          * Equivalent to RCV.NXT in RFC 793 terminology.
00077          */
00078         uint32_t rcv_ack;
00079         /** Receive window
00080          *
00081          * Equivalent to RCV.WND in RFC 793 terminology.
00082          */
00083         uint32_t rcv_win;
00084         /** Received timestamp value
00085          *
00086          * Updated when a packet is received; copied to ts_recent when
00087          * the window is advanced.
00088          */
00089         uint32_t ts_val;
00090         /** Most recent received timestamp that advanced the window
00091          *
00092          * Equivalent to TS.Recent in RFC 1323 terminology.
00093          */
00094         uint32_t ts_recent;
00095         /** Send window scale
00096          *
00097          * Equivalent to Snd.Wind.Scale in RFC 1323 terminology
00098          */
00099         uint8_t snd_win_scale;
00100         /** Receive window scale
00101          *
00102          * Equivalent to Rcv.Wind.Scale in RFC 1323 terminology
00103          */
00104         uint8_t rcv_win_scale;
00105 
00106         /** Selective acknowledgement list (in host-endian order) */
00107         struct tcp_sack_block sack[TCP_SACK_MAX];
00108 
00109         /** Transmit queue */
00110         struct list_head tx_queue;
00111         /** Receive queue */
00112         struct list_head rx_queue;
00113         /** Transmission process */
00114         struct process process;
00115         /** Retransmission timer */
00116         struct retry_timer timer;
00117         /** Keepalive timer */
00118         struct retry_timer keepalive;
00119         /** Shutdown (TIME_WAIT) timer */
00120         struct retry_timer wait;
00121 
00122         /** Pending operations for SYN and FIN */
00123         struct pending_operation pending_flags;
00124         /** Pending operations for transmit queue */
00125         struct pending_operation pending_data;
00126 };
00127 
/** Per-connection flag bits (stored in tcp_connection::flags) */
enum tcp_flags {
        /** The data transfer interface has been closed */
        TCP_XFER_CLOSED = ( 1 << 0 ),
        /** TCP timestamps are in use on this connection */
        TCP_TS_ENABLED = ( 1 << 1 ),
        /** An acknowledgement still needs to be sent */
        TCP_ACK_PENDING = ( 1 << 2 ),
        /** TCP selective acknowledgement is in use on this connection */
        TCP_SACK_ENABLED = ( 1 << 3 ),
};
00139 
/** TCP internal header
 *
 * Packets held on the receive queue carry this header in place of
 * their original TCP header.
 */
struct tcp_rx_queued_header {
        /** SEQ value (host-endian)
         *
         * This is the SEQ value as of the moment the packet was
         * enqueued; any SYN has therefore already been excluded.
         */
        uint32_t seq;
        /** SEQ value following this packet (host-endian) */
        uint32_t nxt;
        /** Flags
         *
         * FIN is the only flag that is meaningful here; every other
         * flag has been dealt with before the packet is enqueued.
         */
        uint8_t flags;
        /** Reserved */
        uint8_t reserved[3];
};
00164 
00165 /**
00166  * List of registered TCP connections
00167  */
00168 static LIST_HEAD ( tcp_conns );
00169 
00170 /** Transmit profiler */
00171 static struct profiler tcp_tx_profiler __profiler = { .name = "tcp.tx" };
00172 
00173 /** Receive profiler */
00174 static struct profiler tcp_rx_profiler __profiler = { .name = "tcp.rx" };
00175 
00176 /** Data transfer profiler */
00177 static struct profiler tcp_xfer_profiler __profiler = { .name = "tcp.xfer" };
00178 
00179 /* Forward declarations */
00180 static struct process_descriptor tcp_process_desc;
00181 static struct interface_descriptor tcp_xfer_desc;
00182 static void tcp_expired ( struct retry_timer *timer, int over );
00183 static void tcp_keepalive_expired ( struct retry_timer *timer, int over );
00184 static void tcp_wait_expired ( struct retry_timer *timer, int over );
00185 static struct tcp_connection * tcp_demux ( unsigned int local_port );
00186 static int tcp_rx_ack ( struct tcp_connection *tcp, uint32_t ack,
00187                         uint32_t win );
00188 
00189 /**
00190  * Name TCP state
00191  *
00192  * @v state             TCP state
00193  * @ret name            Name of TCP state
00194  */
00195 static inline __attribute__ (( always_inline )) const char *
00196 tcp_state ( int state ) {
00197         switch ( state ) {
00198         case TCP_CLOSED:                return "CLOSED";
00199         case TCP_LISTEN:                return "LISTEN";
00200         case TCP_SYN_SENT:              return "SYN_SENT";
00201         case TCP_SYN_RCVD:              return "SYN_RCVD";
00202         case TCP_ESTABLISHED:           return "ESTABLISHED";
00203         case TCP_FIN_WAIT_1:            return "FIN_WAIT_1";
00204         case TCP_FIN_WAIT_2:            return "FIN_WAIT_2";
00205         case TCP_CLOSING_OR_LAST_ACK:   return "CLOSING/LAST_ACK";
00206         case TCP_TIME_WAIT:             return "TIME_WAIT";
00207         case TCP_CLOSE_WAIT:            return "CLOSE_WAIT";
00208         default:                        return "INVALID";
00209         }
00210 }
00211 
00212 /**
00213  * Dump TCP state transition
00214  *
00215  * @v tcp               TCP connection
00216  */
00217 static inline __attribute__ (( always_inline )) void
00218 tcp_dump_state ( struct tcp_connection *tcp ) {
00219 
00220         if ( tcp->tcp_state != tcp->prev_tcp_state ) {
00221                 DBGC ( tcp, "TCP %p transitioned from %s to %s\n", tcp,
00222                        tcp_state ( tcp->prev_tcp_state ),
00223                        tcp_state ( tcp->tcp_state ) );
00224         }
00225         tcp->prev_tcp_state = tcp->tcp_state;
00226 }
00227 
00228 /**
00229  * Dump TCP flags
00230  *
00231  * @v flags             TCP flags
00232  */
00233 static inline __attribute__ (( always_inline )) void
00234 tcp_dump_flags ( struct tcp_connection *tcp, unsigned int flags ) {
00235         if ( flags & TCP_RST )
00236                 DBGC2 ( tcp, " RST" );
00237         if ( flags & TCP_SYN )
00238                 DBGC2 ( tcp, " SYN" );
00239         if ( flags & TCP_PSH )
00240                 DBGC2 ( tcp, " PSH" );
00241         if ( flags & TCP_FIN )
00242                 DBGC2 ( tcp, " FIN" );
00243         if ( flags & TCP_ACK )
00244                 DBGC2 ( tcp, " ACK" );
00245 }
00246 
00247 /***************************************************************************
00248  *
00249  * Open and close
00250  *
00251  ***************************************************************************
00252  */
00253 
/**
 * Check whether a local TCP port is free
 *
 * @v port              Local port number
 * @ret port            Local port number, or negative error
 *
 * The port is considered to be in use if tcp_demux() finds an
 * existing connection for it.
 */
static int tcp_port_available ( int port ) {

        if ( tcp_demux ( port ) )
                return -EADDRINUSE;

        return port;
}
00264 
00265 /**
00266  * Open a TCP connection
00267  *
00268  * @v xfer              Data transfer interface
00269  * @v peer              Peer socket address
00270  * @v local             Local socket address, or NULL
00271  * @ret rc              Return status code
00272  */
00273 static int tcp_open ( struct interface *xfer, struct sockaddr *peer,
00274                       struct sockaddr *local ) {
00275         struct sockaddr_tcpip *st_peer = ( struct sockaddr_tcpip * ) peer;
00276         struct sockaddr_tcpip *st_local = ( struct sockaddr_tcpip * ) local;
00277         struct tcp_connection *tcp;
00278         size_t mtu;
00279         int port;
00280         int rc;
00281 
00282         /* Allocate and initialise structure */
00283         tcp = zalloc ( sizeof ( *tcp ) );
00284         if ( ! tcp )
00285                 return -ENOMEM;
00286         DBGC ( tcp, "TCP %p allocated\n", tcp );
00287         ref_init ( &tcp->refcnt, NULL );
00288         intf_init ( &tcp->xfer, &tcp_xfer_desc, &tcp->refcnt );
00289         process_init_stopped ( &tcp->process, &tcp_process_desc, &tcp->refcnt );
00290         timer_init ( &tcp->timer, tcp_expired, &tcp->refcnt );
00291         timer_init ( &tcp->keepalive, tcp_keepalive_expired, &tcp->refcnt );
00292         timer_init ( &tcp->wait, tcp_wait_expired, &tcp->refcnt );
00293         tcp->prev_tcp_state = TCP_CLOSED;
00294         tcp->tcp_state = TCP_STATE_SENT ( TCP_SYN );
00295         tcp_dump_state ( tcp );
00296         tcp->snd_seq = random();
00297         INIT_LIST_HEAD ( &tcp->tx_queue );
00298         INIT_LIST_HEAD ( &tcp->rx_queue );
00299         memcpy ( &tcp->peer, st_peer, sizeof ( tcp->peer ) );
00300 
00301         /* Calculate MSS */
00302         mtu = tcpip_mtu ( &tcp->peer );
00303         if ( ! mtu ) {
00304                 DBGC ( tcp, "TCP %p has no route to %s\n",
00305                        tcp, sock_ntoa ( peer ) );
00306                 rc = -ENETUNREACH;
00307                 goto err;
00308         }
00309         tcp->mss = ( mtu - sizeof ( struct tcp_header ) );
00310 
00311         /* Bind to local port */
00312         port = tcpip_bind ( st_local, tcp_port_available );
00313         if ( port < 0 ) {
00314                 rc = port;
00315                 DBGC ( tcp, "TCP %p could not bind: %s\n",
00316                        tcp, strerror ( rc ) );
00317                 goto err;
00318         }
00319         tcp->local_port = port;
00320         DBGC ( tcp, "TCP %p bound to port %d\n", tcp, tcp->local_port );
00321 
00322         /* Start timer to initiate SYN */
00323         start_timer_nodelay ( &tcp->timer );
00324 
00325         /* Add a pending operation for the SYN */
00326         pending_get ( &tcp->pending_flags );
00327 
00328         /* Attach parent interface, transfer reference to connection
00329          * list and return
00330          */
00331         intf_plug_plug ( &tcp->xfer, xfer );
00332         list_add ( &tcp->list, &tcp_conns );
00333         return 0;
00334 
00335  err:
00336         ref_put ( &tcp->refcnt );
00337         return rc;
00338 }
00339 
00340 /**
00341  * Close TCP connection
00342  *
00343  * @v tcp               TCP connection
00344  * @v rc                Reason for close
00345  *
00346  * Closes the data transfer interface.  If the TCP state machine is in
00347  * a suitable state, the connection will be deleted.
00348  */
00349 static void tcp_close ( struct tcp_connection *tcp, int rc ) {
00350         struct io_buffer *iobuf;
00351         struct io_buffer *tmp;
00352 
00353         /* Close data transfer interface */
00354         intf_shutdown ( &tcp->xfer, rc );
00355         tcp->flags |= TCP_XFER_CLOSED;
00356 
00357         /* If we are in CLOSED, or have otherwise not yet received a
00358          * SYN (i.e. we are in LISTEN or SYN_SENT), just delete the
00359          * connection.
00360          */
00361         if ( ! ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) ) {
00362 
00363                 /* Transition to CLOSED for the sake of debugging messages */
00364                 tcp->tcp_state = TCP_CLOSED;
00365                 tcp_dump_state ( tcp );
00366 
00367                 /* Free any unprocessed I/O buffers */
00368                 list_for_each_entry_safe ( iobuf, tmp, &tcp->rx_queue, list ) {
00369                         list_del ( &iobuf->list );
00370                         free_iob ( iobuf );
00371                 }
00372 
00373                 /* Free any unsent I/O buffers */
00374                 list_for_each_entry_safe ( iobuf, tmp, &tcp->tx_queue, list ) {
00375                         list_del ( &iobuf->list );
00376                         free_iob ( iobuf );
00377                         pending_put ( &tcp->pending_data );
00378                 }
00379                 assert ( ! is_pending ( &tcp->pending_data ) );
00380 
00381                 /* Remove pending operations for SYN and FIN, if applicable */
00382                 pending_put ( &tcp->pending_flags );
00383                 pending_put ( &tcp->pending_flags );
00384 
00385                 /* Remove from list and drop reference */
00386                 process_del ( &tcp->process );
00387                 stop_timer ( &tcp->timer );
00388                 stop_timer ( &tcp->keepalive );
00389                 stop_timer ( &tcp->wait );
00390                 list_del ( &tcp->list );
00391                 ref_put ( &tcp->refcnt );
00392                 DBGC ( tcp, "TCP %p connection deleted\n", tcp );
00393                 return;
00394         }
00395 
00396         /* If we have not had our SYN acknowledged (i.e. we are in
00397          * SYN_RCVD), pretend that it has been acknowledged so that we
00398          * can send a FIN without breaking things.
00399          */
00400         if ( ! ( tcp->tcp_state & TCP_STATE_ACKED ( TCP_SYN ) ) )
00401                 tcp_rx_ack ( tcp, ( tcp->snd_seq + 1 ), 0 );
00402 
00403         /* Stop keepalive timer */
00404         stop_timer ( &tcp->keepalive );
00405 
00406         /* If we have no data remaining to send, start sending FIN */
00407         if ( list_empty ( &tcp->tx_queue ) &&
00408              ! ( tcp->tcp_state & TCP_STATE_SENT ( TCP_FIN ) ) ) {
00409 
00410                 tcp->tcp_state |= TCP_STATE_SENT ( TCP_FIN );
00411                 tcp_dump_state ( tcp );
00412                 process_add ( &tcp->process );
00413 
00414                 /* Add a pending operation for the FIN */
00415                 pending_get ( &tcp->pending_flags );
00416         }
00417 }
00418 
00419 /***************************************************************************
00420  *
00421  * Transmit data path
00422  *
00423  ***************************************************************************
00424  */
00425 
00426 /**
00427  * Calculate transmission window
00428  *
00429  * @v tcp               TCP connection
00430  * @ret len             Maximum length that can be sent in a single packet
00431  */
00432 static size_t tcp_xmit_win ( struct tcp_connection *tcp ) {
00433         size_t len;
00434 
00435         /* Not ready if we're not in a suitable connection state */
00436         if ( ! TCP_CAN_SEND_DATA ( tcp->tcp_state ) )
00437                 return 0;
00438 
00439         /* Length is the minimum of the receiver's window and the path MTU */
00440         len = tcp->snd_win;
00441         if ( len > TCP_PATH_MTU )
00442                 len = TCP_PATH_MTU;
00443 
00444         return len;
00445 }
00446 
00447 /**
00448  * Check data-transfer flow control window
00449  *
00450  * @v tcp               TCP connection
00451  * @ret len             Length of window
00452  */
00453 static size_t tcp_xfer_window ( struct tcp_connection *tcp ) {
00454 
00455         /* Not ready if data queue is non-empty.  This imposes a limit
00456          * of only one unACKed packet in the TX queue at any time; we
00457          * do this to conserve memory usage.
00458          */
00459         if ( ! list_empty ( &tcp->tx_queue ) )
00460                 return 0;
00461 
00462         /* Return TCP window length */
00463         return tcp_xmit_win ( tcp );
00464 }
00465 
00466 /**
00467  * Find selective acknowledgement block
00468  *
00469  * @v tcp               TCP connection
00470  * @v seq               SEQ value in SACK block (in host-endian order)
00471  * @v sack              SACK block to fill in (in host-endian order)
00472  * @ret len             Length of SACK block
00473  */
00474 static uint32_t tcp_sack_block ( struct tcp_connection *tcp, uint32_t seq,
00475                                  struct tcp_sack_block *sack ) {
00476         struct io_buffer *iobuf;
00477         struct tcp_rx_queued_header *tcpqhdr;
00478         uint32_t left = tcp->rcv_ack;
00479         uint32_t right = left;
00480 
00481         /* Find highest block which does not start after SEQ */
00482         list_for_each_entry ( iobuf, &tcp->rx_queue, list ) {
00483                 tcpqhdr = iobuf->data;
00484                 if ( tcp_cmp ( tcpqhdr->seq, right ) > 0 ) {
00485                         if ( tcp_cmp ( tcpqhdr->seq, seq ) > 0 )
00486                                 break;
00487                         left = tcpqhdr->seq;
00488                 }
00489                 if ( tcp_cmp ( tcpqhdr->nxt, right ) > 0 )
00490                         right = tcpqhdr->nxt;
00491         }
00492 
00493         /* Fail if this block does not contain SEQ */
00494         if ( tcp_cmp ( right, seq ) < 0 )
00495                 return 0;
00496 
00497         /* Populate SACK block */
00498         sack->left = left;
00499         sack->right = right;
00500         return ( right - left );
00501 }
00502 
00503 /**
00504  * Update TCP selective acknowledgement list
00505  *
00506  * @v tcp               TCP connection
00507  * @v seq               SEQ value in first SACK block (in host-endian order)
00508  * @ret count           Number of SACK blocks
00509  */
00510 static unsigned int tcp_sack ( struct tcp_connection *tcp, uint32_t seq ) {
00511         struct tcp_sack_block sack[TCP_SACK_MAX];
00512         unsigned int old = 0;
00513         unsigned int new = 0;
00514         unsigned int i;
00515         uint32_t len;
00516 
00517         /* Populate first new SACK block */
00518         len = tcp_sack_block ( tcp, seq, &sack[0] );
00519         if ( len )
00520                 new++;
00521 
00522         /* Populate remaining new SACK blocks based on old SACK blocks */
00523         for ( old = 0 ; old < TCP_SACK_MAX ; old++ ) {
00524 
00525                 /* Stop if we run out of space in the new list */
00526                 if ( new == TCP_SACK_MAX )
00527                         break;
00528 
00529                 /* Skip empty old SACK blocks */
00530                 if ( tcp->sack[old].left == tcp->sack[old].right )
00531                         continue;
00532 
00533                 /* Populate new SACK block */
00534                 len = tcp_sack_block ( tcp, tcp->sack[old].left, &sack[new] );
00535                 if ( len == 0 )
00536                         continue;
00537 
00538                 /* Eliminate duplicates */
00539                 for ( i = 0 ; i < new ; i++ ) {
00540                         if ( sack[i].left == sack[new].left ) {
00541                                 new--;
00542                                 break;
00543                         }
00544                 }
00545                 new++;
00546         }
00547 
00548         /* Update SACK list */
00549         memset ( tcp->sack, 0, sizeof ( tcp->sack ) );
00550         memcpy ( tcp->sack, sack, ( new * sizeof ( tcp->sack[0] ) ) );
00551         return new;
00552 }
00553 
00554 /**
00555  * Process TCP transmit queue
00556  *
00557  * @v tcp               TCP connection
00558  * @v max_len           Maximum length to process
00559  * @v dest              I/O buffer to fill with data, or NULL
00560  * @v remove            Remove data from queue
00561  * @ret len             Length of data processed
00562  *
00563  * This processes at most @c max_len bytes from the TCP connection's
00564  * transmit queue.  Data will be copied into the @c dest I/O buffer
00565  * (if provided) and, if @c remove is true, removed from the transmit
00566  * queue.
00567  */
00568 static size_t tcp_process_tx_queue ( struct tcp_connection *tcp, size_t max_len,
00569                                      struct io_buffer *dest, int remove ) {
00570         struct io_buffer *iobuf;
00571         struct io_buffer *tmp;
00572         size_t frag_len;
00573         size_t len = 0;
00574 
00575         list_for_each_entry_safe ( iobuf, tmp, &tcp->tx_queue, list ) {
00576                 frag_len = iob_len ( iobuf );
00577                 if ( frag_len > max_len )
00578                         frag_len = max_len;
00579                 if ( dest ) {
00580                         memcpy ( iob_put ( dest, frag_len ), iobuf->data,
00581                                  frag_len );
00582                 }
00583                 if ( remove ) {
00584                         iob_pull ( iobuf, frag_len );
00585                         if ( ! iob_len ( iobuf ) ) {
00586                                 list_del ( &iobuf->list );
00587                                 free_iob ( iobuf );
00588                                 pending_put ( &tcp->pending_data );
00589                         }
00590                 }
00591                 len += frag_len;
00592                 max_len -= frag_len;
00593         }
00594         return len;
00595 }
00596 
00597 /**
00598  * Transmit any outstanding data (with selective acknowledgement)
00599  *
00600  * @v tcp               TCP connection
00601  * @v sack_seq          SEQ for first selective acknowledgement (if any)
00602  * 
00603  * Transmits any outstanding data on the connection.
00604  *
00605  * Note that even if an error is returned, the retransmission timer
00606  * will have been started if necessary, and so the stack will
00607  * eventually attempt to retransmit the failed packet.
00608  */
00609 static void tcp_xmit_sack ( struct tcp_connection *tcp, uint32_t sack_seq ) {
00610         struct io_buffer *iobuf;
00611         struct tcp_header *tcphdr;
00612         struct tcp_mss_option *mssopt;
00613         struct tcp_window_scale_padded_option *wsopt;
00614         struct tcp_timestamp_padded_option *tsopt;
00615         struct tcp_sack_permitted_padded_option *spopt;
00616         struct tcp_sack_padded_option *sackopt;
00617         struct tcp_sack_block *sack;
00618         void *payload;
00619         unsigned int flags;
00620         unsigned int sack_count;
00621         unsigned int i;
00622         size_t len = 0;
00623         size_t sack_len;
00624         uint32_t seq_len;
00625         uint32_t max_rcv_win;
00626         uint32_t max_representable_win;
00627         int rc;
00628 
00629         /* Start profiling */
00630         profile_start ( &tcp_tx_profiler );
00631 
00632         /* If retransmission timer is already running, do nothing */
00633         if ( timer_running ( &tcp->timer ) )
00634                 return;
00635 
00636         /* Calculate both the actual (payload) and sequence space
00637          * lengths that we wish to transmit.
00638          */
00639         if ( TCP_CAN_SEND_DATA ( tcp->tcp_state ) ) {
00640                 len = tcp_process_tx_queue ( tcp, tcp_xmit_win ( tcp ),
00641                                              NULL, 0 );
00642         }
00643         seq_len = len;
00644         flags = TCP_FLAGS_SENDING ( tcp->tcp_state );
00645         if ( flags & ( TCP_SYN | TCP_FIN ) ) {
00646                 /* SYN or FIN consume one byte, and we can never send both */
00647                 assert ( ! ( ( flags & TCP_SYN ) && ( flags & TCP_FIN ) ) );
00648                 seq_len++;
00649         }
00650         tcp->snd_sent = seq_len;
00651 
00652         /* If we have nothing to transmit, stop now */
00653         if ( ( seq_len == 0 ) && ! ( tcp->flags & TCP_ACK_PENDING ) )
00654                 return;
00655 
00656         /* If we are transmitting anything that requires
00657          * acknowledgement (i.e. consumes sequence space), start the
00658          * retransmission timer.  Do this before attempting to
00659          * allocate the I/O buffer, in case allocation itself fails.
00660          */
00661         if ( seq_len )
00662                 start_timer ( &tcp->timer );
00663 
00664         /* Allocate I/O buffer */
00665         iobuf = alloc_iob ( len + TCP_MAX_HEADER_LEN );
00666         if ( ! iobuf ) {
00667                 DBGC ( tcp, "TCP %p could not allocate iobuf for %08x..%08x "
00668                        "%08x\n", tcp, tcp->snd_seq, ( tcp->snd_seq + seq_len ),
00669                        tcp->rcv_ack );
00670                 return;
00671         }
00672         iob_reserve ( iobuf, TCP_MAX_HEADER_LEN );
00673 
00674         /* Fill data payload from transmit queue */
00675         tcp_process_tx_queue ( tcp, len, iobuf, 0 );
00676 
00677         /* Expand receive window if possible */
00678         max_rcv_win = xfer_window ( &tcp->xfer );
00679         if ( max_rcv_win > TCP_MAX_WINDOW_SIZE )
00680                 max_rcv_win = TCP_MAX_WINDOW_SIZE;
00681         max_representable_win = ( 0xffff << tcp->rcv_win_scale );
00682         if ( max_rcv_win > max_representable_win )
00683                 max_rcv_win = max_representable_win;
00684         max_rcv_win &= ~0x03; /* Keep everything dword-aligned */
00685         if ( tcp->rcv_win < max_rcv_win )
00686                 tcp->rcv_win = max_rcv_win;
00687 
00688         /* Fill up the TCP header */
00689         payload = iobuf->data;
00690         if ( flags & TCP_SYN ) {
00691                 mssopt = iob_push ( iobuf, sizeof ( *mssopt ) );
00692                 mssopt->kind = TCP_OPTION_MSS;
00693                 mssopt->length = sizeof ( *mssopt );
00694                 mssopt->mss = htons ( tcp->mss );
00695                 wsopt = iob_push ( iobuf, sizeof ( *wsopt ) );
00696                 wsopt->nop = TCP_OPTION_NOP;
00697                 wsopt->wsopt.kind = TCP_OPTION_WS;
00698                 wsopt->wsopt.length = sizeof ( wsopt->wsopt );
00699                 wsopt->wsopt.scale = TCP_RX_WINDOW_SCALE;
00700                 spopt = iob_push ( iobuf, sizeof ( *spopt ) );
00701                 memset ( spopt->nop, TCP_OPTION_NOP, sizeof ( spopt->nop ) );
00702                 spopt->spopt.kind = TCP_OPTION_SACK_PERMITTED;
00703                 spopt->spopt.length = sizeof ( spopt->spopt );
00704         }
00705         if ( ( flags & TCP_SYN ) || ( tcp->flags & TCP_TS_ENABLED ) ) {
00706                 tsopt = iob_push ( iobuf, sizeof ( *tsopt ) );
00707                 memset ( tsopt->nop, TCP_OPTION_NOP, sizeof ( tsopt->nop ) );
00708                 tsopt->tsopt.kind = TCP_OPTION_TS;
00709                 tsopt->tsopt.length = sizeof ( tsopt->tsopt );
00710                 tsopt->tsopt.tsval = htonl ( currticks() );
00711                 tsopt->tsopt.tsecr = htonl ( tcp->ts_recent );
00712         }
00713         if ( ( tcp->flags & TCP_SACK_ENABLED ) &&
00714              ( ! list_empty ( &tcp->rx_queue ) ) &&
00715              ( ( sack_count = tcp_sack ( tcp, sack_seq ) ) != 0 ) ) {
00716                 sack_len = ( sack_count * sizeof ( *sack ) );
00717                 sackopt = iob_push ( iobuf, ( sizeof ( *sackopt ) + sack_len ));
00718                 memset ( sackopt->nop, TCP_OPTION_NOP, sizeof ( sackopt->nop ));
00719                 sackopt->sackopt.kind = TCP_OPTION_SACK;
00720                 sackopt->sackopt.length =
00721                         ( sizeof ( sackopt->sackopt ) + sack_len );
00722                 sack = ( ( ( void * ) sackopt ) + sizeof ( *sackopt ) );
00723                 for ( i = 0 ; i < sack_count ; i++, sack++ ) {
00724                         sack->left = htonl ( tcp->sack[i].left );
00725                         sack->right = htonl ( tcp->sack[i].right );
00726                 }
00727         }
00728         if ( len != 0 )
00729                 flags |= TCP_PSH;
00730         tcphdr = iob_push ( iobuf, sizeof ( *tcphdr ) );
00731         memset ( tcphdr, 0, sizeof ( *tcphdr ) );
00732         tcphdr->src = htons ( tcp->local_port );
00733         tcphdr->dest = tcp->peer.st_port;
00734         tcphdr->seq = htonl ( tcp->snd_seq );
00735         tcphdr->ack = htonl ( tcp->rcv_ack );
00736         tcphdr->hlen = ( ( payload - iobuf->data ) << 2 );
00737         tcphdr->flags = flags;
00738         tcphdr->win = htons ( tcp->rcv_win >> tcp->rcv_win_scale );
00739         tcphdr->csum = tcpip_chksum ( iobuf->data, iob_len ( iobuf ) );
00740 
00741         /* Dump header */
00742         DBGC2 ( tcp, "TCP %p TX %d->%d %08x..%08x           %08x %4zd",
00743                 tcp, ntohs ( tcphdr->src ), ntohs ( tcphdr->dest ),
00744                 ntohl ( tcphdr->seq ), ( ntohl ( tcphdr->seq ) + seq_len ),
00745                 ntohl ( tcphdr->ack ), len );
00746         tcp_dump_flags ( tcp, tcphdr->flags );
00747         DBGC2 ( tcp, "\n" );
00748 
00749         /* Transmit packet */
00750         if ( ( rc = tcpip_tx ( iobuf, &tcp_protocol, NULL, &tcp->peer, NULL,
00751                                &tcphdr->csum ) ) != 0 ) {
00752                 DBGC ( tcp, "TCP %p could not transmit %08x..%08x %08x: %s\n",
00753                        tcp, tcp->snd_seq, ( tcp->snd_seq + tcp->snd_sent ),
00754                        tcp->rcv_ack, strerror ( rc ) );
00755                 return;
00756         }
00757 
00758         /* Clear ACK-pending flag */
00759         tcp->flags &= ~TCP_ACK_PENDING;
00760 
00761         profile_stop ( &tcp_tx_profiler );
00762 }
00763 
00764 /**
00765  * Transmit any outstanding data
00766  *
00767  * @v tcp               TCP connection
00768  */
00769 static void tcp_xmit ( struct tcp_connection *tcp ) {
00770 
00771         /* Transmit without an explicit first SACK */
00772         tcp_xmit_sack ( tcp, tcp->rcv_ack );
00773 }
00774 
00775 /** TCP process descriptor */
00776 static struct process_descriptor tcp_process_desc =
00777         PROC_DESC_ONCE ( struct tcp_connection, process, tcp_xmit );
00778 
00779 /**
00780  * Retransmission timer expired
00781  *
00782  * @v timer             Retransmission timer
00783  * @v over              Failure indicator
00784  */
00785 static void tcp_expired ( struct retry_timer *timer, int over ) {
00786         struct tcp_connection *tcp =
00787                 container_of ( timer, struct tcp_connection, timer );
00788 
00789         DBGC ( tcp, "TCP %p timer %s in %s for %08x..%08x %08x\n", tcp,
00790                ( over ? "expired" : "fired" ), tcp_state ( tcp->tcp_state ),
00791                tcp->snd_seq, ( tcp->snd_seq + tcp->snd_sent ), tcp->rcv_ack );
00792 
00793         assert ( ( tcp->tcp_state == TCP_SYN_SENT ) ||
00794                  ( tcp->tcp_state == TCP_SYN_RCVD ) ||
00795                  ( tcp->tcp_state == TCP_ESTABLISHED ) ||
00796                  ( tcp->tcp_state == TCP_FIN_WAIT_1 ) ||
00797                  ( tcp->tcp_state == TCP_CLOSE_WAIT ) ||
00798                  ( tcp->tcp_state == TCP_CLOSING_OR_LAST_ACK ) );
00799 
00800         if ( over ) {
00801                 /* If we have finally timed out and given up,
00802                  * terminate the connection
00803                  */
00804                 tcp->tcp_state = TCP_CLOSED;
00805                 tcp_dump_state ( tcp );
00806                 tcp_close ( tcp, -ETIMEDOUT );
00807         } else {
00808                 /* Otherwise, retransmit the packet */
00809                 tcp_xmit ( tcp );
00810         }
00811 }
00812 
00813 /**
00814  * Keepalive timer expired
00815  *
00816  * @v timer             Keepalive timer
00817  * @v over              Failure indicator
00818  */
00819 static void tcp_keepalive_expired ( struct retry_timer *timer,
00820                                     int over __unused ) {
00821         struct tcp_connection *tcp =
00822                 container_of ( timer, struct tcp_connection, keepalive );
00823 
00824         DBGC ( tcp, "TCP %p sending keepalive\n", tcp );
00825 
00826         /* Reset keepalive timer */
00827         start_timer_fixed ( &tcp->keepalive, TCP_KEEPALIVE_DELAY );
00828 
00829         /* Send keepalive.  We do this only to preserve or restore
00830          * state in intermediate devices (e.g. firewall NAT tables);
00831          * we don't actually care about eliciting a response to verify
00832          * that the peer is still alive.  We therefore send just a
00833          * pure ACK, to keep our transmit path simple.
00834          */
00835         tcp->flags |= TCP_ACK_PENDING;
00836         tcp_xmit ( tcp );
00837 }
00838 
00839 /**
00840  * Shutdown timer expired
00841  *
00842  * @v timer             Shutdown timer
00843  * @v over              Failure indicator
00844  */
00845 static void tcp_wait_expired ( struct retry_timer *timer, int over __unused ) {
00846         struct tcp_connection *tcp =
00847                 container_of ( timer, struct tcp_connection, wait );
00848 
00849         assert ( tcp->tcp_state == TCP_TIME_WAIT );
00850 
00851         DBGC ( tcp, "TCP %p wait complete in %s for %08x..%08x %08x\n", tcp,
00852                tcp_state ( tcp->tcp_state ), tcp->snd_seq,
00853                ( tcp->snd_seq + tcp->snd_sent ), tcp->rcv_ack );
00854 
00855         tcp->tcp_state = TCP_CLOSED;
00856         tcp_dump_state ( tcp );
00857         tcp_close ( tcp, 0 );
00858 }
00859 
00860 /**
00861  * Send RST response to incoming packet
00862  *
00863  * @v in_tcphdr         TCP header of incoming packet
00864  * @ret rc              Return status code
00865  */
00866 static int tcp_xmit_reset ( struct tcp_connection *tcp,
00867                             struct sockaddr_tcpip *st_dest,
00868                             struct tcp_header *in_tcphdr ) {
00869         struct io_buffer *iobuf;
00870         struct tcp_header *tcphdr;
00871         int rc;
00872 
00873         /* Allocate space for dataless TX buffer */
00874         iobuf = alloc_iob ( TCP_MAX_HEADER_LEN );
00875         if ( ! iobuf ) {
00876                 DBGC ( tcp, "TCP %p could not allocate iobuf for RST "
00877                        "%08x..%08x %08x\n", tcp, ntohl ( in_tcphdr->ack ),
00878                        ntohl ( in_tcphdr->ack ), ntohl ( in_tcphdr->seq ) );
00879                 return -ENOMEM;
00880         }
00881         iob_reserve ( iobuf, TCP_MAX_HEADER_LEN );
00882 
00883         /* Construct RST response */
00884         tcphdr = iob_push ( iobuf, sizeof ( *tcphdr ) );
00885         memset ( tcphdr, 0, sizeof ( *tcphdr ) );
00886         tcphdr->src = in_tcphdr->dest;
00887         tcphdr->dest = in_tcphdr->src;
00888         tcphdr->seq = in_tcphdr->ack;
00889         tcphdr->ack = in_tcphdr->seq;
00890         tcphdr->hlen = ( ( sizeof ( *tcphdr ) / 4 ) << 4 );
00891         tcphdr->flags = ( TCP_RST | TCP_ACK );
00892         tcphdr->win = htons ( 0 );
00893         tcphdr->csum = tcpip_chksum ( iobuf->data, iob_len ( iobuf ) );
00894 
00895         /* Dump header */
00896         DBGC2 ( tcp, "TCP %p TX %d->%d %08x..%08x           %08x %4d",
00897                 tcp, ntohs ( tcphdr->src ), ntohs ( tcphdr->dest ),
00898                 ntohl ( tcphdr->seq ), ( ntohl ( tcphdr->seq ) ),
00899                 ntohl ( tcphdr->ack ), 0 );
00900         tcp_dump_flags ( tcp, tcphdr->flags );
00901         DBGC2 ( tcp, "\n" );
00902 
00903         /* Transmit packet */
00904         if ( ( rc = tcpip_tx ( iobuf, &tcp_protocol, NULL, st_dest,
00905                                NULL, &tcphdr->csum ) ) != 0 ) {
00906                 DBGC ( tcp, "TCP %p could not transmit RST %08x..%08x %08x: "
00907                        "%s\n", tcp, ntohl ( in_tcphdr->ack ),
00908                        ntohl ( in_tcphdr->ack ), ntohl ( in_tcphdr->seq ),
00909                        strerror ( rc ) );
00910                 return rc;
00911         }
00912 
00913         return 0;
00914 }
00915 
00916 /***************************************************************************
00917  *
00918  * Receive data path
00919  *
00920  ***************************************************************************
00921  */
00922 
00923 /**
00924  * Identify TCP connection by local port number
00925  *
00926  * @v local_port        Local port
00927  * @ret tcp             TCP connection, or NULL
00928  */
00929 static struct tcp_connection * tcp_demux ( unsigned int local_port ) {
00930         struct tcp_connection *tcp;
00931 
00932         list_for_each_entry ( tcp, &tcp_conns, list ) {
00933                 if ( tcp->local_port == local_port )
00934                         return tcp;
00935         }
00936         return NULL;
00937 }
00938 
00939 /**
00940  * Parse TCP received options
00941  *
00942  * @v tcp               TCP connection (may be NULL)
00943  * @v tcphdr            TCP header
00944  * @v hlen              TCP header length
00945  * @v options           Options structure to fill in
00946  * @ret rc              Return status code
00947  */
00948 static int tcp_rx_opts ( struct tcp_connection *tcp,
00949                          const struct tcp_header *tcphdr, size_t hlen,
00950                          struct tcp_options *options ) {
00951         const void *data = ( ( ( void * ) tcphdr ) + sizeof ( *tcphdr ) );
00952         const void *end = ( ( ( void * ) tcphdr ) + hlen );
00953         const struct tcp_option *option;
00954         unsigned int kind;
00955         size_t remaining;
00956         size_t min;
00957 
00958         /* Sanity check */
00959         assert ( hlen >= sizeof ( *tcphdr ) );
00960 
00961         /* Parse options */
00962         memset ( options, 0, sizeof ( *options ) );
00963         while ( ( remaining = ( end - data ) ) ) {
00964 
00965                 /* Extract option code */
00966                 option = data;
00967                 kind = option->kind;
00968 
00969                 /* Handle single-byte options */
00970                 if ( kind == TCP_OPTION_END )
00971                         break;
00972                 if ( kind == TCP_OPTION_NOP ) {
00973                         data++;
00974                         continue;
00975                 }
00976 
00977                 /* Handle multi-byte options */
00978                 min = sizeof ( *option );
00979                 switch ( kind ) {
00980                 case TCP_OPTION_MSS:
00981                         /* Ignore received MSS */
00982                         break;
00983                 case TCP_OPTION_WS:
00984                         options->wsopt = data;
00985                         min = sizeof ( *options->wsopt );
00986                         break;
00987                 case TCP_OPTION_SACK_PERMITTED:
00988                         options->spopt = data;
00989                         min = sizeof ( *options->spopt );
00990                         break;
00991                 case TCP_OPTION_SACK:
00992                         /* Ignore received SACKs */
00993                         break;
00994                 case TCP_OPTION_TS:
00995                         options->tsopt = data;
00996                         min = sizeof ( *options->tsopt );
00997                         break;
00998                 default:
00999                         DBGC ( tcp, "TCP %p received unknown option %d\n",
01000                                tcp, kind );
01001                         break;
01002                 }
01003                 if ( remaining < min ) {
01004                         DBGC ( tcp, "TCP %p received truncated option %d\n",
01005                                tcp, kind );
01006                         return -EINVAL;
01007                 }
01008                 if ( option->length < min ) {
01009                         DBGC ( tcp, "TCP %p received underlength option %d\n",
01010                                tcp, kind );
01011                         return -EINVAL;
01012                 }
01013                 if ( option->length > remaining ) {
01014                         DBGC ( tcp, "TCP %p received overlength option %d\n",
01015                                tcp, kind );
01016                         return -EINVAL;
01017                 }
01018                 data += option->length;
01019         }
01020 
01021         return 0;
01022 }
01023 
01024 /**
01025  * Consume received sequence space
01026  *
01027  * @v tcp               TCP connection
01028  * @v seq_len           Sequence space length to consume
01029  */
01030 static void tcp_rx_seq ( struct tcp_connection *tcp, uint32_t seq_len ) {
01031         unsigned int sack;
01032 
01033         /* Sanity check */
01034         assert ( seq_len > 0 );
01035 
01036         /* Update acknowledgement number */
01037         tcp->rcv_ack += seq_len;
01038 
01039         /* Update window */
01040         if ( tcp->rcv_win > seq_len ) {
01041                 tcp->rcv_win -= seq_len;
01042         } else {
01043                 tcp->rcv_win = 0;
01044         }
01045 
01046         /* Update timestamp */
01047         tcp->ts_recent = tcp->ts_val;
01048 
01049         /* Update SACK list */
01050         for ( sack = 0 ; sack < TCP_SACK_MAX ; sack++ ) {
01051                 if ( tcp->sack[sack].left == tcp->sack[sack].right )
01052                         continue;
01053                 if ( tcp_cmp ( tcp->sack[sack].left, tcp->rcv_ack ) < 0 )
01054                         tcp->sack[sack].left = tcp->rcv_ack;
01055                 if ( tcp_cmp ( tcp->sack[sack].right, tcp->rcv_ack ) < 0 )
01056                         tcp->sack[sack].right = tcp->rcv_ack;
01057         }
01058 
01059         /* Mark ACK as pending */
01060         tcp->flags |= TCP_ACK_PENDING;
01061 }
01062 
01063 /**
01064  * Handle TCP received SYN
01065  *
01066  * @v tcp               TCP connection
01067  * @v seq               SEQ value (in host-endian order)
01068  * @v options           TCP options
01069  * @ret rc              Return status code
01070  */
01071 static int tcp_rx_syn ( struct tcp_connection *tcp, uint32_t seq,
01072                         struct tcp_options *options ) {
01073 
01074         /* Synchronise sequence numbers on first SYN */
01075         if ( ! ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) ) {
01076                 tcp->rcv_ack = seq;
01077                 if ( options->tsopt )
01078                         tcp->flags |= TCP_TS_ENABLED;
01079                 if ( options->spopt )
01080                         tcp->flags |= TCP_SACK_ENABLED;
01081                 if ( options->wsopt ) {
01082                         tcp->snd_win_scale = options->wsopt->scale;
01083                         tcp->rcv_win_scale = TCP_RX_WINDOW_SCALE;
01084                 }
01085                 DBGC ( tcp, "TCP %p using %stimestamps, %sSACK, TX window "
01086                        "x%d, RX window x%d\n", tcp,
01087                        ( ( tcp->flags & TCP_TS_ENABLED ) ? "" : "no " ),
01088                        ( ( tcp->flags & TCP_SACK_ENABLED ) ? "" : "no " ),
01089                        ( 1 << tcp->snd_win_scale ),
01090                        ( 1 << tcp->rcv_win_scale ) );
01091         }
01092 
01093         /* Ignore duplicate SYN */
01094         if ( seq != tcp->rcv_ack )
01095                 return 0;
01096 
01097         /* Acknowledge SYN */
01098         tcp_rx_seq ( tcp, 1 );
01099 
01100         /* Mark SYN as received and start sending ACKs with each packet */
01101         tcp->tcp_state |= ( TCP_STATE_SENT ( TCP_ACK ) |
01102                             TCP_STATE_RCVD ( TCP_SYN ) );
01103 
01104         return 0;
01105 }
01106 
01107 /**
01108  * Handle TCP received ACK
01109  *
01110  * @v tcp               TCP connection
01111  * @v ack               ACK value (in host-endian order)
01112  * @v win               WIN value (in host-endian order)
01113  * @ret rc              Return status code
01114  */
01115 static int tcp_rx_ack ( struct tcp_connection *tcp, uint32_t ack,
01116                         uint32_t win ) {
01117         uint32_t ack_len = ( ack - tcp->snd_seq );
01118         size_t len;
01119         unsigned int acked_flags;
01120 
01121         /* Check for out-of-range or old duplicate ACKs */
01122         if ( ack_len > tcp->snd_sent ) {
01123                 DBGC ( tcp, "TCP %p received ACK for %08x..%08x, "
01124                        "sent only %08x..%08x\n", tcp, tcp->snd_seq,
01125                        ( tcp->snd_seq + ack_len ), tcp->snd_seq,
01126                        ( tcp->snd_seq + tcp->snd_sent ) );
01127 
01128                 if ( TCP_HAS_BEEN_ESTABLISHED ( tcp->tcp_state ) ) {
01129                         /* Just ignore what might be old duplicate ACKs */
01130                         return 0;
01131                 } else {
01132                         /* Send RST if an out-of-range ACK is received
01133                          * on a not-yet-established connection, as per
01134                          * RFC 793.
01135                          */
01136                         return -EINVAL;
01137                 }
01138         }
01139 
01140         /* Update window size */
01141         tcp->snd_win = win;
01142 
01143         /* Hold off (or start) the keepalive timer, if applicable */
01144         if ( ! ( tcp->tcp_state & TCP_STATE_SENT ( TCP_FIN ) ) )
01145                 start_timer_fixed ( &tcp->keepalive, TCP_KEEPALIVE_DELAY );
01146 
01147         /* Ignore ACKs that don't actually acknowledge any new data.
01148          * (In particular, do not stop the retransmission timer; this
01149          * avoids creating a sorceror's apprentice syndrome when a
01150          * duplicate ACK is received and we still have data in our
01151          * transmit queue.)
01152          */
01153         if ( ack_len == 0 )
01154                 return 0;
01155 
01156         /* Stop the retransmission timer */
01157         stop_timer ( &tcp->timer );
01158 
01159         /* Determine acknowledged flags and data length */
01160         len = ack_len;
01161         acked_flags = ( TCP_FLAGS_SENDING ( tcp->tcp_state ) &
01162                         ( TCP_SYN | TCP_FIN ) );
01163         if ( acked_flags ) {
01164                 len--;
01165                 pending_put ( &tcp->pending_flags );
01166         }
01167 
01168         /* Update SEQ and sent counters */
01169         tcp->snd_seq = ack;
01170         tcp->snd_sent = 0;
01171 
01172         /* Remove any acknowledged data from transmit queue */
01173         tcp_process_tx_queue ( tcp, len, NULL, 1 );
01174                 
01175         /* Mark SYN/FIN as acknowledged if applicable. */
01176         if ( acked_flags )
01177                 tcp->tcp_state |= TCP_STATE_ACKED ( acked_flags );
01178 
01179         /* Start sending FIN if we've had all possible data ACKed */
01180         if ( list_empty ( &tcp->tx_queue ) &&
01181              ( tcp->flags & TCP_XFER_CLOSED ) &&
01182              ! ( tcp->tcp_state & TCP_STATE_SENT ( TCP_FIN ) ) ) {
01183                 tcp->tcp_state |= TCP_STATE_SENT ( TCP_FIN );
01184                 pending_get ( &tcp->pending_flags );
01185         }
01186 
01187         return 0;
01188 }
01189 
01190 /**
01191  * Handle TCP received data
01192  *
01193  * @v tcp               TCP connection
01194  * @v seq               SEQ value (in host-endian order)
01195  * @v iobuf             I/O buffer
01196  * @ret rc              Return status code
01197  *
01198  * This function takes ownership of the I/O buffer.
01199  */
01200 static int tcp_rx_data ( struct tcp_connection *tcp, uint32_t seq,
01201                          struct io_buffer *iobuf ) {
01202         uint32_t already_rcvd;
01203         uint32_t len;
01204         int rc;
01205 
01206         /* Ignore duplicate or out-of-order data */
01207         already_rcvd = ( tcp->rcv_ack - seq );
01208         len = iob_len ( iobuf );
01209         if ( already_rcvd >= len ) {
01210                 free_iob ( iobuf );
01211                 return 0;
01212         }
01213         iob_pull ( iobuf, already_rcvd );
01214         len -= already_rcvd;
01215 
01216         /* Acknowledge new data */
01217         tcp_rx_seq ( tcp, len );
01218 
01219         /* Deliver data to application */
01220         profile_start ( &tcp_xfer_profiler );
01221         if ( ( rc = xfer_deliver_iob ( &tcp->xfer, iobuf ) ) != 0 ) {
01222                 DBGC ( tcp, "TCP %p could not deliver %08x..%08x: %s\n",
01223                        tcp, seq, ( seq + len ), strerror ( rc ) );
01224                 return rc;
01225         }
01226         profile_stop ( &tcp_xfer_profiler );
01227 
01228         return 0;
01229 }
01230 
01231 /**
01232  * Handle TCP received FIN
01233  *
01234  * @v tcp               TCP connection
01235  * @v seq               SEQ value (in host-endian order)
01236  * @ret rc              Return status code
01237  */
01238 static int tcp_rx_fin ( struct tcp_connection *tcp, uint32_t seq ) {
01239 
01240         /* Ignore duplicate or out-of-order FIN */
01241         if ( seq != tcp->rcv_ack )
01242                 return 0;
01243 
01244         /* Acknowledge FIN */
01245         tcp_rx_seq ( tcp, 1 );
01246 
01247         /* Mark FIN as received */
01248         tcp->tcp_state |= TCP_STATE_RCVD ( TCP_FIN );
01249 
01250         /* Close connection */
01251         tcp_close ( tcp, 0 );
01252 
01253         return 0;
01254 }
01255 
01256 /**
01257  * Handle TCP received RST
01258  *
01259  * @v tcp               TCP connection
01260  * @v seq               SEQ value (in host-endian order)
01261  * @ret rc              Return status code
01262  */
01263 static int tcp_rx_rst ( struct tcp_connection *tcp, uint32_t seq ) {
01264 
01265         /* Accept RST only if it falls within the window.  If we have
01266          * not yet received a SYN, then we have no window to test
01267          * against, so fall back to checking that our SYN has been
01268          * ACKed.
01269          */
01270         if ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) {
01271                 if ( ! tcp_in_window ( seq, tcp->rcv_ack, tcp->rcv_win ) )
01272                         return 0;
01273         } else {
01274                 if ( ! ( tcp->tcp_state & TCP_STATE_ACKED ( TCP_SYN ) ) )
01275                         return 0;
01276         }
01277 
01278         /* Abort connection */
01279         tcp->tcp_state = TCP_CLOSED;
01280         tcp_dump_state ( tcp );
01281         tcp_close ( tcp, -ECONNRESET );
01282 
01283         DBGC ( tcp, "TCP %p connection reset by peer\n", tcp );
01284         return -ECONNRESET;
01285 }
01286 
01287 /**
01288  * Enqueue received TCP packet
01289  *
01290  * @v tcp               TCP connection
01291  * @v seq               SEQ value (in host-endian order)
01292  * @v flags             TCP flags
01293  * @v iobuf             I/O buffer
01294  */
01295 static void tcp_rx_enqueue ( struct tcp_connection *tcp, uint32_t seq,
01296                              uint8_t flags, struct io_buffer *iobuf ) {
01297         struct tcp_rx_queued_header *tcpqhdr;
01298         struct io_buffer *queued;
01299         size_t len;
01300         uint32_t seq_len;
01301         uint32_t nxt;
01302 
01303         /* Calculate remaining flags and sequence length.  Note that
01304          * SYN, if present, has already been processed by this point.
01305          */
01306         flags &= TCP_FIN;
01307         len = iob_len ( iobuf );
01308         seq_len = ( len + ( flags ? 1 : 0 ) );
01309         nxt = ( seq + seq_len );
01310 
01311         /* Discard immediately (to save memory) if:
01312          *
01313          * a) we have not yet received a SYN (and so have no defined
01314          *    receive window), or
01315          * b) the packet lies entirely outside the receive window, or
01316          * c) there is no further content to process.
01317          */
01318         if ( ( ! ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) ) ||
01319              ( tcp_cmp ( seq, tcp->rcv_ack + tcp->rcv_win ) >= 0 ) ||
01320              ( tcp_cmp ( nxt, tcp->rcv_ack ) < 0 ) ||
01321              ( seq_len == 0 ) ) {
01322                 free_iob ( iobuf );
01323                 return;
01324         }
01325 
01326         /* Add internal header */
01327         tcpqhdr = iob_push ( iobuf, sizeof ( *tcpqhdr ) );
01328         tcpqhdr->seq = seq;
01329         tcpqhdr->nxt = nxt;
01330         tcpqhdr->flags = flags;
01331 
01332         /* Add to RX queue */
01333         list_for_each_entry ( queued, &tcp->rx_queue, list ) {
01334                 tcpqhdr = queued->data;
01335                 if ( tcp_cmp ( seq, tcpqhdr->seq ) < 0 )
01336                         break;
01337         }
01338         list_add_tail ( &iobuf->list, &queued->list );
01339 }
01340 
01341 /**
01342  * Process receive queue
01343  *
01344  * @v tcp               TCP connection
01345  */
01346 static void tcp_process_rx_queue ( struct tcp_connection *tcp ) {
01347         struct io_buffer *iobuf;
01348         struct tcp_rx_queued_header *tcpqhdr;
01349         uint32_t seq;
01350         unsigned int flags;
01351         size_t len;
01352 
01353         /* Process all applicable received buffers.  Note that we
01354          * cannot use list_for_each_entry() to iterate over the RX
01355          * queue, since tcp_discard() may remove packets from the RX
01356          * queue while we are processing.
01357          */
01358         while ( ( iobuf = list_first_entry ( &tcp->rx_queue, struct io_buffer,
01359                                              list ) ) ) {
01360 
01361                 /* Stop processing when we hit the first gap */
01362                 tcpqhdr = iobuf->data;
01363                 if ( tcp_cmp ( tcpqhdr->seq, tcp->rcv_ack ) > 0 )
01364                         break;
01365 
01366                 /* Strip internal header and remove from RX queue */
01367                 list_del ( &iobuf->list );
01368                 seq = tcpqhdr->seq;
01369                 flags = tcpqhdr->flags;
01370                 iob_pull ( iobuf, sizeof ( *tcpqhdr ) );
01371                 len = iob_len ( iobuf );
01372 
01373                 /* Handle new data, if any */
01374                 tcp_rx_data ( tcp, seq, iob_disown ( iobuf ) );
01375                 seq += len;
01376 
01377                 /* Handle FIN, if present */
01378                 if ( flags & TCP_FIN ) {
01379                         tcp_rx_fin ( tcp, seq );
01380                         seq++;
01381                 }
01382         }
01383 }
01384 
01385 /**
01386  * Process received packet
01387  *
01388  * @v iobuf             I/O buffer
01389  * @v netdev            Network device
01390  * @v st_src            Partially-filled source address
01391  * @v st_dest           Partially-filled destination address
01392  * @v pshdr_csum        Pseudo-header checksum
01393  * @ret rc              Return status code
01394   */
01395 static int tcp_rx ( struct io_buffer *iobuf,
01396                     struct net_device *netdev __unused,
01397                     struct sockaddr_tcpip *st_src,
01398                     struct sockaddr_tcpip *st_dest __unused,
01399                     uint16_t pshdr_csum ) {
01400         struct tcp_header *tcphdr = iobuf->data;
01401         struct tcp_connection *tcp;
01402         struct tcp_options options;
01403         size_t hlen;
01404         uint16_t csum;
01405         uint32_t seq;
01406         uint32_t ack;
01407         uint16_t raw_win;
01408         uint32_t win;
01409         unsigned int flags;
01410         size_t len;
01411         uint32_t seq_len;
01412         size_t old_xfer_window;
01413         int rc;
01414 
01415         /* Start profiling */
01416         profile_start ( &tcp_rx_profiler );
01417 
01418         /* Sanity check packet */
01419         if ( iob_len ( iobuf ) < sizeof ( *tcphdr ) ) {
01420                 DBG ( "TCP packet too short at %zd bytes (min %zd bytes)\n",
01421                       iob_len ( iobuf ), sizeof ( *tcphdr ) );
01422                 rc = -EINVAL;
01423                 goto discard;
01424         }
01425         hlen = ( ( tcphdr->hlen & TCP_MASK_HLEN ) / 16 ) * 4;
01426         if ( hlen < sizeof ( *tcphdr ) ) {
01427                 DBG ( "TCP header too short at %zd bytes (min %zd bytes)\n",
01428                       hlen, sizeof ( *tcphdr ) );
01429                 rc = -EINVAL;
01430                 goto discard;
01431         }
01432         if ( hlen > iob_len ( iobuf ) ) {
01433                 DBG ( "TCP header too long at %zd bytes (max %zd bytes)\n",
01434                       hlen, iob_len ( iobuf ) );
01435                 rc = -EINVAL;
01436                 goto discard;
01437         }
01438         csum = tcpip_continue_chksum ( pshdr_csum, iobuf->data,
01439                                        iob_len ( iobuf ) );
01440         if ( csum != 0 ) {
01441                 DBG ( "TCP checksum incorrect (is %04x including checksum "
01442                       "field, should be 0000)\n", csum );
01443                 rc = -EINVAL;
01444                 goto discard;
01445         }
01446         
01447         /* Parse parameters from header and strip header */
01448         tcp = tcp_demux ( ntohs ( tcphdr->dest ) );
01449         seq = ntohl ( tcphdr->seq );
01450         ack = ntohl ( tcphdr->ack );
01451         raw_win = ntohs ( tcphdr->win );
01452         flags = tcphdr->flags;
01453         if ( ( rc = tcp_rx_opts ( tcp, tcphdr, hlen, &options ) ) != 0 )
01454                 goto discard;
01455         if ( tcp && options.tsopt )
01456                 tcp->ts_val = ntohl ( options.tsopt->tsval );
01457         iob_pull ( iobuf, hlen );
01458         len = iob_len ( iobuf );
01459         seq_len = ( len + ( ( flags & TCP_SYN ) ? 1 : 0 ) +
01460                     ( ( flags & TCP_FIN ) ? 1 : 0 ) );
01461 
01462         /* Dump header */
01463         DBGC2 ( tcp, "TCP %p RX %d<-%d           %08x %08x..%08x %4zd",
01464                 tcp, ntohs ( tcphdr->dest ), ntohs ( tcphdr->src ),
01465                 ntohl ( tcphdr->ack ), ntohl ( tcphdr->seq ),
01466                 ( ntohl ( tcphdr->seq ) + seq_len ), len );
01467         tcp_dump_flags ( tcp, tcphdr->flags );
01468         DBGC2 ( tcp, "\n" );
01469 
01470         /* If no connection was found, silently drop packet */
01471         if ( ! tcp ) {
01472                 rc = -ENOTCONN;
01473                 goto discard;
01474         }
01475 
01476         /* Record old data-transfer window */
01477         old_xfer_window = tcp_xfer_window ( tcp );
01478 
01479         /* Handle ACK, if present */
01480         if ( flags & TCP_ACK ) {
01481                 win = ( raw_win << tcp->snd_win_scale );
01482                 if ( ( rc = tcp_rx_ack ( tcp, ack, win ) ) != 0 ) {
01483                         tcp_xmit_reset ( tcp, st_src, tcphdr );
01484                         goto discard;
01485                 }
01486         }
01487 
01488         /* Force an ACK if this packet is out of order */
01489         if ( ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) &&
01490              ( seq != tcp->rcv_ack ) ) {
01491                 tcp->flags |= TCP_ACK_PENDING;
01492         }
01493 
01494         /* Handle SYN, if present */
01495         if ( flags & TCP_SYN ) {
01496                 tcp_rx_syn ( tcp, seq, &options );
01497                 seq++;
01498         }
01499 
01500         /* Handle RST, if present */
01501         if ( flags & TCP_RST ) {
01502                 if ( ( rc = tcp_rx_rst ( tcp, seq ) ) != 0 )
01503                         goto discard;
01504         }
01505 
01506         /* Enqueue received data */
01507         tcp_rx_enqueue ( tcp, seq, flags, iob_disown ( iobuf ) );
01508 
01509         /* Process receive queue */
01510         tcp_process_rx_queue ( tcp );
01511 
01512         /* Dump out any state change as a result of the received packet */
01513         tcp_dump_state ( tcp );
01514 
01515         /* Schedule transmission of ACK (and any pending data).  If we
01516          * have received any out-of-order packets (i.e. if the receive
01517          * queue remains non-empty after processing) then send the ACK
01518          * immediately in order to trigger Fast Retransmission.
01519          */
01520         if ( list_empty ( &tcp->rx_queue ) ) {
01521                 process_add ( &tcp->process );
01522         } else {
01523                 tcp_xmit_sack ( tcp, seq );
01524         }
01525 
01526         /* If this packet was the last we expect to receive, set up
01527          * timer to expire and cause the connection to be freed.
01528          */
01529         if ( TCP_CLOSED_GRACEFULLY ( tcp->tcp_state ) ) {
01530                 stop_timer ( &tcp->wait );
01531                 start_timer_fixed ( &tcp->wait, ( 2 * TCP_MSL ) );
01532         }
01533 
01534         /* Notify application if window has changed */
01535         if ( tcp_xfer_window ( tcp ) != old_xfer_window )
01536                 xfer_window_changed ( &tcp->xfer );
01537 
01538         profile_stop ( &tcp_rx_profiler );
01539         return 0;
01540 
01541  discard:
01542         /* Free received packet */
01543         free_iob ( iobuf );
01544         return rc;
01545 }
01546 
01547 /** TCP protocol */
01548 struct tcpip_protocol tcp_protocol __tcpip_protocol = {
01549         .name = "TCP",
01550         .rx = tcp_rx,
01551         .tcpip_proto = IP_TCP,
01552 };
01553 
01554 /**
01555  * Discard some cached TCP data
01556  *
01557  * @ret discarded       Number of cached items discarded
01558  */
01559 static unsigned int tcp_discard ( void ) {
01560         struct tcp_connection *tcp;
01561         struct io_buffer *iobuf;
01562         unsigned int discarded = 0;
01563 
01564         /* Try to drop one queued RX packet from each connection */
01565         list_for_each_entry ( tcp, &tcp_conns, list ) {
01566                 list_for_each_entry_reverse ( iobuf, &tcp->rx_queue, list ) {
01567 
01568                         /* Remove packet from queue */
01569                         list_del ( &iobuf->list );
01570                         free_iob ( iobuf );
01571 
01572                         /* Report discard */
01573                         discarded++;
01574                         break;
01575                 }
01576         }
01577 
01578         return discarded;
01579 }
01580 
01581 /** TCP cache discarder */
01582 struct cache_discarder tcp_discarder __cache_discarder ( CACHE_NORMAL ) = {
01583         .discard = tcp_discard,
01584 };
01585 
01586 /**
01587  * Find first TCP connection that has not yet been closed
01588  *
01589  * @ret tcp             First unclosed connection, or NULL
01590  */
01591 static struct tcp_connection * tcp_first_unclosed ( void ) {
01592         struct tcp_connection *tcp;
01593 
01594         /* Find first connection which has not yet been closed */
01595         list_for_each_entry ( tcp, &tcp_conns, list ) {
01596                 if ( ! ( tcp->flags & TCP_XFER_CLOSED ) )
01597                         return tcp;
01598         }
01599         return NULL;
01600 }
01601 
01602 /**
01603  * Find first TCP connection that has not yet finished all operations
01604  *
01605  * @ret tcp             First unfinished connection, or NULL
01606  */
01607 static struct tcp_connection * tcp_first_unfinished ( void ) {
01608         struct tcp_connection *tcp;
01609 
01610         /* Find first connection which has not yet closed gracefully,
01611          * or which still has a pending transmission (e.g. to ACK the
01612          * received FIN).
01613          */
01614         list_for_each_entry ( tcp, &tcp_conns, list ) {
01615                 if ( ( ! TCP_CLOSED_GRACEFULLY ( tcp->tcp_state ) ) ||
01616                      process_running ( &tcp->process ) ) {
01617                         return tcp;
01618                 }
01619         }
01620         return NULL;
01621 }
01622 
01623 /**
01624  * Shut down all TCP connections
01625  *
01626  */
01627 static void tcp_shutdown ( int booting __unused ) {
01628         struct tcp_connection *tcp;
01629         unsigned long start;
01630         unsigned long elapsed;
01631 
01632         /* Initiate a graceful close of all connections, allowing for
01633          * the fact that the connection list may change as we do so.
01634          */
01635         while ( ( tcp = tcp_first_unclosed() ) ) {
01636                 DBGC ( tcp, "TCP %p closing for shutdown\n", tcp );
01637                 tcp_close ( tcp, -ECANCELED );
01638         }
01639 
01640         /* Wait for all connections to finish closing gracefully */
01641         start = currticks();
01642         while ( ( tcp = tcp_first_unfinished() ) &&
01643                 ( ( elapsed = ( currticks() - start ) ) < TCP_FINISH_TIMEOUT )){
01644                 step();
01645         }
01646 
01647         /* Forcibly close any remaining connections */
01648         while ( ( tcp = list_first_entry ( &tcp_conns, struct tcp_connection,
01649                                            list ) ) != NULL ) {
01650                 tcp->tcp_state = TCP_CLOSED;
01651                 tcp_dump_state ( tcp );
01652                 tcp_close ( tcp, -ECANCELED );
01653         }
01654 }
01655 
01656 /** TCP shutdown function */
01657 struct startup_fn tcp_startup_fn __startup_fn ( STARTUP_LATE ) = {
01658         .name = "tcp",
01659         .shutdown = tcp_shutdown,
01660 };
01661 
01662 /***************************************************************************
01663  *
01664  * Data transfer interface
01665  *
01666  ***************************************************************************
01667  */
01668 
/**
 * Close interface
 *
 * Closes the connection via tcp_close() and then immediately
 * attempts a transmission.
 *
 * @v tcp               TCP connection
 * @v rc                Reason for close
 */
static void tcp_xfer_close ( struct tcp_connection *tcp, int rc ) {

        /* Shut the data transfer interface with the given reason */
        tcp_close ( tcp, rc );

        /* Try to send the FIN straight away, if possible */
        tcp_xmit ( tcp );
}
01683 
01684 /**
01685  * Deliver datagram as I/O buffer
01686  *
01687  * @v tcp               TCP connection
01688  * @v iobuf             Datagram I/O buffer
01689  * @v meta              Data transfer metadata
01690  * @ret rc              Return status code
01691  */
01692 static int tcp_xfer_deliver ( struct tcp_connection *tcp,
01693                               struct io_buffer *iobuf,
01694                               struct xfer_metadata *meta __unused ) {
01695 
01696         /* Enqueue packet */
01697         list_add_tail ( &iobuf->list, &tcp->tx_queue );
01698 
01699         /* Each enqueued packet is a pending operation */
01700         pending_get ( &tcp->pending_data );
01701 
01702         /* Transmit data, if possible */
01703         tcp_xmit ( tcp );
01704 
01705         return 0;
01706 }
01707 
01708 /**
01709  * Report job progress
01710  *
01711  * @v tcp               TCP connection
01712  * @v progress          Progress report to fill in
01713  * @ret ongoing_rc      Ongoing job status code (if known)
01714  */
01715 static int tcp_progress ( struct tcp_connection *tcp,
01716                           struct job_progress *progress ) {
01717 
01718         /* Report connection in progress if applicable */
01719         if ( ! TCP_HAS_BEEN_ESTABLISHED ( tcp->tcp_state ) ) {
01720                 snprintf ( progress->message, sizeof ( progress->message ),
01721                            "connecting" );
01722         }
01723 
01724         return 0;
01725 }
01726 
01727 /** TCP data transfer interface operations */
01728 static struct interface_operation tcp_xfer_operations[] = {
01729         INTF_OP ( xfer_deliver, struct tcp_connection *, tcp_xfer_deliver ),
01730         INTF_OP ( xfer_window, struct tcp_connection *, tcp_xfer_window ),
01731         INTF_OP ( job_progress, struct tcp_connection *, tcp_progress ),
01732         INTF_OP ( intf_close, struct tcp_connection *, tcp_xfer_close ),
01733 };
01734 
01735 /** TCP data transfer interface descriptor */
01736 static struct interface_descriptor tcp_xfer_desc =
01737         INTF_DESC ( struct tcp_connection, xfer, tcp_xfer_operations );
01738 
01739 /***************************************************************************
01740  *
01741  * Openers
01742  *
01743  ***************************************************************************
01744  */
01745 
01746 /** TCP IPv4 socket opener */
01747 struct socket_opener tcp_ipv4_socket_opener __socket_opener = {
01748         .semantics      = TCP_SOCK_STREAM,
01749         .family         = AF_INET,
01750         .open           = tcp_open,
01751 };
01752 
01753 /** TCP IPv6 socket opener */
01754 struct socket_opener tcp_ipv6_socket_opener __socket_opener = {
01755         .semantics      = TCP_SOCK_STREAM,
01756         .family         = AF_INET6,
01757         .open           = tcp_open,
01758 };
01759 
01760 /** Linkage hack */
01761 int tcp_sock_stream = TCP_SOCK_STREAM;
01762 
01763 /**
01764  * Open TCP URI
01765  *
01766  * @v xfer              Data transfer interface
01767  * @v uri               URI
01768  * @ret rc              Return status code
01769  */
01770 static int tcp_open_uri ( struct interface *xfer, struct uri *uri ) {
01771         struct sockaddr_tcpip peer;
01772 
01773         /* Sanity check */
01774         if ( ! uri->host )
01775                 return -EINVAL;
01776 
01777         memset ( &peer, 0, sizeof ( peer ) );
01778         peer.st_port = htons ( uri_port ( uri, 0 ) );
01779         return xfer_open_named_socket ( xfer, SOCK_STREAM,
01780                                         ( struct sockaddr * ) &peer,
01781                                         uri->host, NULL );
01782 }
01783 
01784 /** TCP URI opener */
01785 struct uri_opener tcp_uri_opener __uri_opener = {
01786         .scheme         = "tcp",
01787         .open           = tcp_open_uri,
01788 };
01789