iPXE
string.h
Go to the documentation of this file.
00001 #ifndef X86_BITS_STRING_H
00002 #define X86_BITS_STRING_H
00003 
00004 /*
00005  * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
00006  *
00007  * This program is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU General Public License as
00009  * published by the Free Software Foundation; either version 2 of the
00010  * License, or any later version.
00011  *
00012  * This program is distributed in the hope that it will be useful, but
00013  * WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015  * General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU General Public License
00018  * along with this program; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
00020  * 02110-1301, USA.
00021  *
00022  * You can also choose to distribute this program under the terms of
00023  * the Unmodified Binary Distribution Licence (as given in the file
00024  * COPYING.UBDL), provided that you have satisfied its requirements.
00025  */
00026 
00027 FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
00028 
00029 /** @file
00030  *
00031  * Optimised string operations
00032  *
00033  */
00034 
00035 extern void * __memcpy ( void *dest, const void *src, size_t len );
00036 extern void * __memcpy_reverse ( void *dest, const void *src, size_t len );
00037 
/**
 * Copy memory area (where length is a compile-time constant)
 *
 * @v dest		Destination address
 * @v src		Source address
 * @v len		Length
 * @ret dest		Destination address
 *
 * Selects, at compile time, between plain register-sized moves, an
 * unrolled sequence of string instructions, and a call to __memcpy(),
 * depending on the (constant) length.  The byte counts in the case
 * comments below are approximate sizes of the generated code for each
 * alternative.
 */
static inline __attribute__ (( always_inline )) void *
__constant_memcpy ( void *dest, const void *src, size_t len ) {
	/* The __may_alias__ unions allow the buffers to be accessed
	 * as 8-, 16- or 32-bit quantities without violating the
	 * strict-aliasing rules.
	 */
	union {
		uint32_t u32[2];
		uint16_t u16[4];
		uint8_t  u8[8];
	} __attribute__ (( __may_alias__ )) *dest_u = dest;
	const union {
		uint32_t u32[2];
		uint16_t u16[4];
		uint8_t  u8[8];
	} __attribute__ (( __may_alias__ )) *src_u = src;
	const void *esi;
	void *edi;

	switch ( len ) {
	case 0 : /* 0 bytes */
		return dest;
	/*
	 * Single-register moves; these are always better than a
	 * string operation.  We can clobber an arbitrary two
	 * registers (data, source, dest can re-use source register)
	 * instead of being restricted to esi and edi.  There's also a
	 * much greater potential for optimising with nearby code.
	 *
	 */
	case 1 : /* 4 bytes */
		dest_u->u8[0]  = src_u->u8[0];
		return dest;
	case 2 : /* 6 bytes */
		dest_u->u16[0] = src_u->u16[0];
		return dest;
	case 4 : /* 4 bytes */
		dest_u->u32[0] = src_u->u32[0];
		return dest;
	/*
	 * Double-register moves; these are probably still a win.
	 *
	 */
	case 3 : /* 12 bytes */
		dest_u->u16[0] = src_u->u16[0];
		dest_u->u8[2]  = src_u->u8[2];
		return dest;
	case 5 : /* 10 bytes */
		dest_u->u32[0] = src_u->u32[0];
		dest_u->u8[4]  = src_u->u8[4];
		return dest;
	case 6 : /* 12 bytes */
		dest_u->u32[0] = src_u->u32[0];
		dest_u->u16[2] = src_u->u16[2];
		return dest;
	case 8 : /* 10 bytes */
		dest_u->u32[0] = src_u->u32[0];
		dest_u->u32[1] = src_u->u32[1];
		return dest;
	}

	/* Even if we have to load up esi and edi ready for a string
	 * operation, we can sometimes save space by using multiple
	 * single-byte "movs" operations instead of loading up ecx and
	 * using "rep movsb".
	 *
	 * "load ecx, rep movsb" is 7 bytes, plus an average of 1 byte
	 * to allow for saving/restoring ecx 50% of the time.
	 *
	 * "movsl" and "movsb" are 1 byte each, "movsw" is two bytes.
	 * (In 16-bit mode, "movsl" is 2 bytes and "movsw" is 1 byte,
	 * but "movsl" moves twice as much data, so it balances out).
	 *
	 * The cutoff point therefore occurs around 26 bytes; the byte
	 * requirements for each method are:
	 *
	 * len             16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
	 * #bytes (ecx)     8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8
	 * #bytes (no ecx)  4  5  6  7  5  6  7  8  6  7  8  9  7  8  9 10
	 */

	esi = src;
	edi = dest;
	
	/* At or above the cutoff, defer to __memcpy(); the esi/edi
	 * loads above are dead in this path and will be optimised
	 * away.
	 */
	if ( len >= 26 )
		return __memcpy ( dest, src, len );
	
	/* Unrolled chain: one "movsl" per remaining whole dword
	 * (len < 26, so at most six), then a "movsw" and/or "movsb"
	 * for the trailing 0-3 bytes.  Each statement is a separate
	 * asm so that the compiler emits only those whose guard is
	 * true for this (constant) len.
	 */
	if ( len >= 6*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 5*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 4*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 3*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 2*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 1*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( ( len % 4 ) >= 2 )
		__asm__ __volatile__ ( "movsw" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( ( len % 2 ) >= 1 )
		__asm__ __volatile__ ( "movsb" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );

	return dest;
}
00156 
/**
 * Copy memory area
 *
 * @v dest		Destination address
 * @v src		Source address
 * @v len		Length
 * @ret dest		Destination address
 *
 * A compile-time-constant length allows the copy to be open-coded by
 * __constant_memcpy(); any other length goes via the generic
 * string-operation implementation.
 */
static inline __attribute__ (( always_inline )) void *
memcpy ( void *dest, const void *src, size_t len ) {
	if ( ! __builtin_constant_p ( len ) )
		return __memcpy ( dest, src, len );
	return __constant_memcpy ( dest, src, len );
}
00173 
00174 extern void * __memmove ( void *dest, const void *src, size_t len );
00175 
/**
 * Copy (possibly overlapping) memory area
 *
 * @v dest		Destination address
 * @v src		Source address
 * @v len		Length
 * @ret dest		Destination address
 */
static inline __attribute__ (( always_inline )) void *
memmove ( void *dest, const void *src, size_t len ) {
	ssize_t dir = ( dest - src );

	/* If the relative placement of the two areas is not known at
	 * compile time, fall back to the generic __memmove().
	 */
	if ( ! __builtin_constant_p ( dir ) )
		return __memmove ( dest, src, len );

	/* Destination above source: copy backwards so that source
	 * bytes are read before being overwritten.  Otherwise a plain
	 * forwards copy is safe.
	 */
	return ( ( dir > 0 ) ?
		 __memcpy_reverse ( dest, src, len ) :
		 memcpy ( dest, src, len ) );
}
00198 
/**
 * Fill memory region
 *
 * @v dest		Destination address
 * @v fill		Fill pattern (only the low byte, in %al, is used)
 * @v len		Length
 * @ret dest		Destination address
 *
 * Implemented as a single "rep stosb": %al holds the fill byte, %ecx
 * the count and %edi the destination.
 */
static inline __attribute__ (( always_inline )) void *
__memset ( void *dest, int fill, size_t len ) {
	/* "rep stosb" advances %edi and decrements %ecx; declaring
	 * them as (discarded) outputs tells the compiler that their
	 * values are clobbered.
	 */
	void *discard_D;
	size_t discard_c;

	__asm__ __volatile__ ( "rep stosb"
			       : "=&D" ( discard_D ), "=&c" ( discard_c )
			       : "0" ( dest ), "1" ( len ), "a" ( fill )
			       : "memory" );
	return dest;
}
00218 
/**
 * Fill memory region with zero (where length is a compile-time constant)
 *
 * @v dest		Destination address
 * @v len		Length
 * @ret dest		Destination address
 *
 * Mirrors __constant_memcpy(): selects, at compile time, between
 * plain register-sized stores, an unrolled sequence of string
 * instructions, and a call to __memset().  The byte counts in the
 * case comments below are approximate sizes of the generated code.
 */
static inline __attribute__ (( always_inline )) void *
__constant_memset_zero ( void *dest, size_t len ) {
	/* __may_alias__ union permits 8/16/32-bit stores into the
	 * same buffer without violating strict aliasing.
	 */
	union {
		uint32_t u32[2];
		uint16_t u16[4];
		uint8_t  u8[8];
	} __attribute__ (( __may_alias__ )) *dest_u = dest;
	void *edi;
	uint32_t eax;

	switch ( len ) {
	case 0 : /* 0 bytes */
		return dest;

	/* Single-register moves.  Almost certainly better than a
	 * string operation.  We can avoid clobbering any registers,
	 * we can reuse a zero that happens to already be in a
	 * register, and we can optimise away the code entirely if the
	 * memset() is used to clear a region which then gets
	 * immediately overwritten.
	 */
	case 1 : /* 3 bytes */
		dest_u->u8[0] = 0;
		return dest;
	case 2: /* 5 bytes */
		dest_u->u16[0] = 0;
		return dest;
	case 4: /* 6 bytes */
		dest_u->u32[0] = 0;
		return dest;

	/* Double-register moves.  Very probably better than a string
	 * operation.
	 */
	case 3 : /* 9 bytes */
		dest_u->u16[0] = 0;
		dest_u->u8[2]  = 0;
		return dest;
	case 5 : /* 10 bytes */
		dest_u->u32[0] = 0;
		dest_u->u8[4]  = 0;
		return dest;
	case 6 : /* 12 bytes */
		dest_u->u32[0] = 0;
		dest_u->u16[2] = 0;
		return dest;
	case 8 : /* 13 bytes */
		dest_u->u32[0] = 0;
		dest_u->u32[1] = 0;
		return dest;
	}

	/* As with memcpy(), we can potentially save space by using
	 * multiple single-byte "stos" instructions instead of loading
	 * up ecx and using "rep stosb".
	 *
	 * "load ecx, rep stosb" is 7 bytes, plus an average of 1 byte
	 * to allow for saving/restoring ecx 50% of the time.
	 *
	 * "stosl" and "stosb" are 1 byte each, "stosw" is two bytes.
	 *
	 * The calculations are therefore the same as for memcpy(),
	 * giving a cutoff point of around 26 bytes.
	 */

	edi = dest;
	eax = 0;

	/* At or above the cutoff, defer to __memset(); the edi/eax
	 * loads above are dead in this path and will be optimised
	 * away.
	 */
	if ( len >= 26 )
		return __memset ( dest, 0, len );

	/* Unrolled chain: one "stosl" per remaining whole dword
	 * (len < 26, so at most six), then a "stosw" and/or "stosb"
	 * for the trailing 0-3 bytes.  Only the statements whose
	 * guard holds for this (constant) len are emitted.
	 */
	if ( len >= 6*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 5*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 4*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 3*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 2*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 1*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( ( len % 4 ) >= 2 )
		__asm__ __volatile__ ( "stosw" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( ( len % 2 ) >= 1 )
		__asm__ __volatile__ ( "stosb" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );

	return dest;
}
00324 
/**
 * Fill memory region
 *
 * @v dest		Destination address
 * @v fill		Fill pattern
 * @v len		Length
 * @ret dest		Destination address
 *
 * The specialised zero-fill path is usable only when both the fill
 * byte (zero) and the length are known at compile time; every other
 * combination goes via the generic "rep stosb" implementation.
 */
static inline __attribute__ (( always_inline )) void *
memset ( void *dest, int fill, size_t len ) {

	if ( ! ( __builtin_constant_p ( fill ) && ( fill == 0 ) &&
		 __builtin_constant_p ( len ) ) ) {
		return __memset ( dest, fill, len );
	}
	return __constant_memset_zero ( dest, len );
}
00343 
00344 #endif /* X86_BITS_STRING_H */