iPXE
tcp.h
Go to the documentation of this file.
00001 #ifndef _IPXE_TCP_H
00002 #define _IPXE_TCP_H
00003 
00004 /** @file
00005  *
00006  * TCP protocol
00007  *
00008  * This file defines the iPXE TCP API.
00009  *
00010  */
00011 
00012 FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
00013 
00014 #include <ipxe/tcpip.h>
00015 
/**
 * A TCP header
 *
 * The structure is packed, so its fields occupy 20 contiguous bytes
 * with no padding.
 */
struct tcp_header {
        /** Source port */
        uint16_t src;
        /** Destination port */
        uint16_t dest;
        /** Sequence number */
        uint32_t seq;
        /** Acknowledgement number */
        uint32_t ack;
        /** Header length (upper 4 bits), reserved (lower 4 bits) */
        uint8_t hlen;
        /** Reserved (upper 2 bits), flags (lower 6 bits) */
        uint8_t flags;
        /** Advertised window */
        uint16_t win;
        /** Checksum */
        uint16_t csum;
        /** Urgent pointer */
        uint16_t urg;
} __attribute__ (( packed ));
00030 
00031 /** @defgroup tcpopts TCP options
00032  * @{
00033  */
00034 
/** Option kind: end of TCP options list */
#define TCP_OPTION_END 0

/** Option kind: TCP option pad */
#define TCP_OPTION_NOP 1

/** Generic TCP option
 *
 * Two-byte kind/length prefix; the other option structures in this
 * file begin with the same two fields.
 */
struct tcp_option {
        /** Option kind */
        uint8_t kind;
        /** Option length */
        uint8_t length;
} __attribute__ (( packed ));
00046 
/** Code for the TCP MSS option */
#define TCP_OPTION_MSS 2

/** TCP MSS option
 *
 * Packed: 4 bytes in total.
 */
struct tcp_mss_option {
        /** Option kind */
        uint8_t kind;
        /** Option length */
        uint8_t length;
        /** Maximum segment size */
        uint16_t mss;
} __attribute__ (( packed ));
00056 
/** TCP window scale option
 *
 * Packed: 3 bytes in total.
 */
struct tcp_window_scale_option {
        /** Option kind */
        uint8_t kind;
        /** Option length */
        uint8_t length;
        /** Window scale value */
        uint8_t scale;
} __attribute__ (( packed ));

/** Padded TCP window scale option (used for sending)
 *
 * A single pad byte followed by the window scale option, giving a
 * 4-byte structure.
 */
struct tcp_window_scale_padded_option {
        /** Pad byte (presumably set to TCP_OPTION_NOP by the sender) */
        uint8_t nop;
        /** Window scale option */
        struct tcp_window_scale_option wsopt;
} __attribute__ (( packed ));
00069 
/** Code for the TCP window scale option */
#define TCP_OPTION_WS 3

/** Advertised TCP window scale
 *
 * A scale factor of 2**9 allows a maximum window of 32MB, which is
 * enough for Gigabit-speed transfers with a 200ms RTT.  The smallest
 * window that can then be advertised is 512 bytes, which is still
 * less than a single packet.
 */
#define TCP_RX_WINDOW_SCALE 9
00081 
/** Code for the TCP selective acknowledgement permitted option */
#define TCP_OPTION_SACK_PERMITTED 4

/** TCP selective acknowledgement permitted option
 *
 * Consists only of the kind and length bytes.
 */
struct tcp_sack_permitted_option {
        /** Option kind */
        uint8_t kind;
        /** Option length */
        uint8_t length;
} __attribute__ (( packed ));

/** Padded TCP selective acknowledgement permitted option (used for sending)
 *
 * Two pad bytes followed by the option itself, giving a 4-byte
 * structure.
 */
struct tcp_sack_permitted_padded_option {
        /** Pad bytes (presumably set to TCP_OPTION_NOP by the sender) */
        uint8_t nop[2];
        /** SACK permitted option */
        struct tcp_sack_permitted_option spopt;
} __attribute__ (( packed ));
00096 
/** Code for the TCP selective acknowledgement option */
#define TCP_OPTION_SACK 5

/** Maximum number of selective acknowledgement blocks
 *
 * This allows for the presence of the TCP timestamp option.
 */
#define TCP_SACK_MAX 3

/** TCP selective acknowledgement option
 *
 * Kind and length bytes only; the acknowledgement blocks are
 * described separately by struct tcp_sack_block.
 */
struct tcp_sack_option {
        /** Option kind */
        uint8_t kind;
        /** Option length */
        uint8_t length;
} __attribute__ (( packed ));

/** TCP selective acknowledgement block
 *
 * A pair of 32-bit values, 8 bytes in total.
 */
struct tcp_sack_block {
        /** Left edge */
        uint32_t left;
        /** Right edge */
        uint32_t right;
} __attribute__ (( packed ));

/** Padded TCP selective acknowledgement option (used for sending)
 *
 * Two pad bytes followed by the option header, giving a 4-byte
 * structure.
 */
struct tcp_sack_padded_option {
        /** Pad bytes (presumably set to TCP_OPTION_NOP by the sender) */
        uint8_t nop[2];
        /** SACK option header */
        struct tcp_sack_option sackopt;
} __attribute__ (( packed ));
00123 
/** Code for the TCP timestamp option */
#define TCP_OPTION_TS 8

/** TCP timestamp option
 *
 * Packed: 10 bytes in total.
 */
struct tcp_timestamp_option {
        /** Option kind */
        uint8_t kind;
        /** Option length */
        uint8_t length;
        /** Timestamp value */
        uint32_t tsval;
        /** Timestamp echo reply */
        uint32_t tsecr;
} __attribute__ (( packed ));

/** Padded TCP timestamp option (used for sending)
 *
 * Two pad bytes followed by the timestamp option, giving a 12-byte
 * structure.
 */
struct tcp_timestamp_padded_option {
        /** Pad bytes (presumably set to TCP_OPTION_NOP by the sender) */
        uint8_t nop[2];
        /** Timestamp option */
        struct tcp_timestamp_option tsopt;
} __attribute__ (( packed ));
00140 
/** Parsed TCP options
 *
 * Holds one read-only pointer per recognised option.  Each member is
 * documented as "if present"; presumably it is NULL when the option
 * is absent (confirm against the parser).
 */
struct tcp_options {
        /** Window scale option, if present */
        const struct tcp_window_scale_option *wsopt;

        /** SACK permitted option, if present */
        const struct tcp_sack_permitted_option *spopt;

        /** Timestamp option, if present */
        const struct tcp_timestamp_option *tsopt;
};
00150 
00151 /** @} */
00152 
/*
 * TCP flags
 *
 * One bit per flag, listed from the least significant bit upwards.
 */

/** FIN: sender has finished sending */
#define TCP_FIN         0x01
/** SYN: synchronise sequence numbers */
#define TCP_SYN         0x02
/** RST: reset the connection */
#define TCP_RST         0x04
/** PSH: push function */
#define TCP_PSH         0x08
/** ACK: acknowledgement field is significant */
#define TCP_ACK         0x10
/** URG: urgent pointer field is significant */
#define TCP_URG         0x20
/** ECE: ECN echo */
#define TCP_ECE         0x40
/** CWR: congestion window reduced */
#define TCP_CWR         0x80
00164 
00165 /**
00166 * @defgroup tcpstates TCP states
00167 *
00168 * The TCP state is defined by a combination of the flags that have
00169 * been sent to the peer, the flags that have been acknowledged by the
00170 * peer, and the flags that have been received from the peer.
00171 *
00172 * @{
00173 */
00174 
/** TCP flags that have been sent in outgoing packets
 *
 * Stored in bits 0-7 of the state value.
 */
#define TCP_STATE_SENT(flags) ( (flags) << 0 )
#define TCP_FLAGS_SENT(state) ( ( (state) >> 0 ) & 0xff )

/** TCP flags that have been acknowledged by the peer
 *
 * Stored in bits 8-15 of the state value.  Note that this applies
 * only to SYN and FIN.
 */
#define TCP_STATE_ACKED(flags) ( (flags) << 8 )
#define TCP_FLAGS_ACKED(state) ( ( (state) >> 8 ) & 0xff )

/** TCP flags that have been received from the peer
 *
 * Stored in bits 16-23 of the state value.  Note that this applies
 * only to SYN and FIN, and that once SYN has been received, we should
 * always be sending ACK.
 */
#define TCP_STATE_RCVD(flags) ( (flags) << 16 )
#define TCP_FLAGS_RCVD(state) ( ( (state) >> 16 ) & 0xff )

/** TCP flags that are currently being sent in outgoing packets
 *
 * These are the flags that have been sent but not yet acknowledged.
 */
#define TCP_FLAGS_SENDING(state) \
        ( TCP_FLAGS_SENT ( state ) & ~TCP_FLAGS_ACKED ( state ) )
00197 
/** CLOSED
 *
 * The connection has not yet been used for anything.
 */
#define TCP_CLOSED TCP_RST

/** LISTEN
 *
 * Not currently used as a state, since listening connections are not
 * supported.  Given a unique value to avoid compiler warnings.
 */
#define TCP_LISTEN 0

/** SYN_SENT
 *
 * SYN has been sent; nothing has yet been received or acknowledged.
 */
#define TCP_SYN_SENT    ( TCP_STATE_SENT ( TCP_SYN ) )

/** SYN_RCVD
 *
 * SYN has been sent but not acknowledged, and SYN has been received.
 */
#define TCP_SYN_RCVD    ( TCP_STATE_RCVD ( TCP_SYN ) |                      \
                          TCP_STATE_SENT ( TCP_SYN | TCP_ACK ) )

/** ESTABLISHED
 *
 * SYN has been sent and acknowledged, and SYN has been received;
 * i.e. SYN_RCVD plus acknowledgement of our SYN.
 */
#define TCP_ESTABLISHED ( TCP_SYN_RCVD | TCP_STATE_ACKED ( TCP_SYN ) )

/** FIN_WAIT_1
 *
 * SYN has been sent and acknowledged, SYN has been received, FIN has
 * been sent but not acknowledged, and FIN has not been received;
 * i.e. ESTABLISHED plus our FIN having been sent.
 *
 * RFC 793 shows that we can enter FIN_WAIT_1 without having had SYN
 * acknowledged, i.e. if the application closes the connection after
 * sending and receiving SYN, but before having had SYN acknowledged.
 * However, we have to *pretend* that SYN has been acknowledged
 * anyway, otherwise we end up sending SYN and FIN in the same
 * sequence number slot.  Therefore, when we transition from SYN_RCVD
 * to FIN_WAIT_1, we have to remember to set TCP_STATE_ACKED(TCP_SYN)
 * and increment our sequence number.
 */
#define TCP_FIN_WAIT_1  ( TCP_ESTABLISHED | TCP_STATE_SENT ( TCP_FIN ) )

/** FIN_WAIT_2
 *
 * SYN has been sent and acknowledged, SYN has been received, FIN has
 * been sent and acknowledged, and FIN has not been received;
 * i.e. FIN_WAIT_1 plus acknowledgement of our FIN.
 */
#define TCP_FIN_WAIT_2  ( TCP_FIN_WAIT_1 | TCP_STATE_ACKED ( TCP_FIN ) )

/** CLOSING / LAST_ACK
 *
 * SYN has been sent and acknowledged, SYN has been received, FIN has
 * been sent but not acknowledged, and FIN has been received;
 * i.e. FIN_WAIT_1 plus receipt of the peer's FIN.
 *
 * This state actually encompasses both CLOSING and LAST_ACK; they are
 * identical with the definition of state that we use.  I don't
 * *believe* that they need to be distinguished.
 */
#define TCP_CLOSING_OR_LAST_ACK                                             \
                        ( TCP_FIN_WAIT_1 | TCP_STATE_RCVD ( TCP_FIN ) )

/** TIME_WAIT
 *
 * SYN has been sent and acknowledged, SYN has been received, FIN has
 * been sent and acknowledged, and FIN has been received;
 * i.e. FIN_WAIT_2 plus receipt of the peer's FIN.
 */
#define TCP_TIME_WAIT   ( TCP_FIN_WAIT_2 | TCP_STATE_RCVD ( TCP_FIN ) )

/** CLOSE_WAIT
 *
 * SYN has been sent and acknowledged, SYN has been received, and FIN
 * has been received; i.e. ESTABLISHED plus receipt of the peer's FIN.
 */
#define TCP_CLOSE_WAIT  ( TCP_ESTABLISHED | TCP_STATE_RCVD ( TCP_FIN ) )
00290 
/** Can send data in current state
 *
 * Data may be sent if and only if our SYN has been acknowledged and
 * our FIN has not yet been sent.
 */
#define TCP_CAN_SEND_DATA(state)                                            \
        ( ( (state) & ( TCP_STATE_SENT ( TCP_FIN ) |                        \
                        TCP_STATE_ACKED ( TCP_SYN ) ) )                     \
          == TCP_STATE_ACKED ( TCP_SYN ) )

/** Have ever been fully established
 *
 * The connection has been fully established once the peer's SYN has
 * been received and our own SYN has been acknowledged.
 */
#define TCP_HAS_BEEN_ESTABLISHED(state)                                     \
        ( ( (state) & ( TCP_STATE_RCVD ( TCP_SYN ) |                        \
                        TCP_STATE_ACKED ( TCP_SYN ) ) )                     \
          == ( TCP_STATE_RCVD ( TCP_SYN ) | TCP_STATE_ACKED ( TCP_SYN ) ) )

/** Have closed gracefully
 *
 * The connection has closed gracefully once the peer's FIN has been
 * received and our own FIN has been acknowledged.
 */
#define TCP_CLOSED_GRACEFULLY(state)                                        \
        ( ( (state) & ( TCP_STATE_RCVD ( TCP_FIN ) |                        \
                        TCP_STATE_ACKED ( TCP_FIN ) ) )                     \
          == ( TCP_STATE_RCVD ( TCP_FIN ) | TCP_STATE_ACKED ( TCP_FIN ) ) )
00320 
00321 /** @} */
00322 
/** Mask for TCP header length field
 *
 * Selects the upper four bits of a byte.
 */
#define TCP_MASK_HLEN   0xf0

/** Smallest port number on which a TCP connection can listen */
#define TCP_MIN_PORT 1
00328 
/**
 * Maximum advertised TCP window size
 *
 * The maximum bandwidth on any link is limited by
 *
 *    max_bandwidth * round_trip_time = tcp_window
 *
 * Rough expectations for the bandwidth achievable over various links:
 *
 *    a) Gigabit LAN: expected bandwidth 125MB/s, typical RTT 0.5ms,
 *       minimum required window 64kB
 *
 *    b) Home Internet connection: expected bandwidth 10MB/s, typical
 *       RTT 25ms, minimum required window 256kB
 *
 *    c) WAN: expected bandwidth 2MB/s, typical RTT 100ms, minimum
 *       required window 200kB.
 *
 * The largest possible TCP window size is 1GB (using the maximum
 * window scale of 2**14).  However, the window should be kept as
 * small as possible without limiting bandwidth, because after a lost
 * packet the window size is the maximum amount of data that will need
 * to be retransmitted.
 *
 * We therefore choose a maximum window size of 256kB.
 */
#define TCP_MAX_WINDOW_SIZE     ( 256 * 1024 )
00357 
/**
 * Path MTU
 *
 * IPv6 requires every data link layer to support a datagram size of
 * 1280 bytes.  We use this as our maximum transmitted datagram size,
 * assuming that any practical link layer we encounter will allow it.
 * This is a very conservative assumption in practice, but its cost is
 * insignificant because the amount of data that we transmit (rather
 * than receive) is negligible.
 *
 * Space is reserved within these 1280 bytes for an IPv6 header, a TCP
 * header, and a (padded) TCP timestamp option, leaving 1208 bytes.
 */
#define TCP_PATH_MTU                                                    \
        ( 1280 - 40 /* IPv6 */ - 20 /* TCP */ - 12 /* TCP timestamp */ )
00374 
/** TCP maximum segment lifetime
 *
 * Currently two minutes, as per RFC 793, expressed in timer ticks.
 */
#define TCP_MSL ( 2 * 60 * TICKS_PER_SEC )

/**
 * TCP keepalive period
 *
 * Once an established connection has been inactive for this long
 * (15 seconds, expressed in timer ticks), we send a keepalive ACK.
 */
#define TCP_KEEPALIVE_DELAY ( 15 * TICKS_PER_SEC )
00388 
/**
 * TCP maximum header length
 *
 * Sum of the maximum link-layer and network-layer header length, the
 * TCP header, and the MSS, padded window scale and padded timestamp
 * options.  The SACK option structures are not included in this
 * total.
 */
#define TCP_MAX_HEADER_LEN                                      \
        ( MAX_LL_NET_HEADER_LEN +                               \
          sizeof ( struct tcp_header ) +                        \
          sizeof ( struct tcp_mss_option ) +                    \
          sizeof ( struct tcp_window_scale_padded_option ) +    \
          sizeof ( struct tcp_timestamp_padded_option ) )
00399 
/**
 * Compare TCP sequence numbers
 *
 * @v seq1              Sequence number 1
 * @v seq2              Sequence number 2
 * @ret diff            Sequence difference
 *
 * Behaves like memcmp(): the result is negative, zero, or positive
 * when @c seq1 is respectively before, equal to, or after @c seq2.
 * The subtraction is done in unsigned 32-bit arithmetic and then
 * reinterpreted as signed, so the comparison remains correct when the
 * sequence space wraps.
 */
static inline __attribute__ (( always_inline )) int32_t
tcp_cmp ( uint32_t seq1, uint32_t seq2 ) {
        uint32_t delta = ( seq1 - seq2 );

        return ( ( int32_t ) delta );
}
00415 
/**
 * Check if TCP sequence number lies within window
 *
 * @v seq               Sequence number
 * @v start             Start of window
 * @v len               Length of window
 * @ret in_window       Sequence number is within window
 *
 * The offset of @c seq from @c start is computed in unsigned 32-bit
 * arithmetic, so windows that straddle the sequence-number wrap are
 * handled.  A zero-length window contains nothing.
 */
static inline int tcp_in_window ( uint32_t seq, uint32_t start,
                                  uint32_t len ) {
        uint32_t offset = ( seq - start );

        return ( offset < len );
}
00428 
/** TCP finish wait time
 *
 * Currently one second (expressed in timer ticks), so that a slowly
 * responding server cannot substantially delay a call to shutdown().
 */
#define TCP_FINISH_TIMEOUT ( 1 * TICKS_PER_SEC )
00435 
00436 extern struct tcpip_protocol tcp_protocol __tcpip_protocol;
00437 
00438 #endif /* _IPXE_TCP_H */