// This file is part of the uSTL library, an STL implementation. // // Copyright (c) 2005 by Mike Sharov // This file is free software, distributed under the MIT License. #ifndef NDEBUG // Optimized code here. asserts slow it down, and are checked elsewhere. #define NDEBUG #endif #include "ualgo.h" namespace ustl { // Generic version for implementing fill_nX_fast on non-i386 architectures. template static inline void stosv (T*& p, size_t n, T v) { while (n--) *p++ = v; } #if __x86__ //---------------------------------------------------------------------- // Copy functions //---------------------------------------------------------------------- static inline void movsb_dir_up (void) { asm volatile ("cld"); } static inline void movsb_dir_down (void) { asm volatile ("std"); } static inline void movsb (const void*& src, size_t nBytes, void*& dest) { asm volatile ("rep;\n\tmovsb" : "=&S"(src), "=&D"(dest), "=&c"(nBytes) : "0"(src), "1"(dest), "2"(nBytes) : "memory"); } static inline void movsd (const void*& src, size_t nWords, void*& dest) { asm volatile ("rep;\n\tmovsl" : "=&S"(src), "=&D"(dest), "=&c"(nWords) : "0"(src), "1"(dest), "2"(nWords) : "memory"); } #if __MMX__ template <> inline void stosv (uint8_t*& p, size_t n, uint8_t v) { asm volatile ("rep;\n\tstosb" : "=&D"(p), "=c"(n) : "0"(p), "1"(n), "a"(v) : "memory"); } #endif template <> inline void stosv (uint16_t*& p, size_t n, uint16_t v) { asm volatile ("rep;\n\tstosw" : "=&D"(p), "=c"(n) : "0"(p), "1"(n), "a"(v) : "memory"); } template <> inline void stosv (uint32_t*& p, size_t n, uint32_t v) { asm volatile ("rep;\n\tstosl" : "=&D"(p), "=c"(n) : "0"(p), "1"(n), "a"(v) : "memory"); } #if __MMX__ #define MMX_ALIGN 16U // Data must be aligned on this grain #define MMX_BS 32U // Assembly routines copy data this many bytes at a time. static inline void simd_block_copy (const void* src, void* dest) { const char* csrc = static_cast(src); char* cdest = static_cast(dest); #if __SSE__ asm ( "movaps\t%2, %%xmm0 \n\t" "movaps\t%3, %%xmm1 \n\t" "movntps\t%%xmm0, %0 \n\t" "movntps\t%%xmm1, %1" : "=m"(cdest[0]), "=m"(cdest[16]) : "m"(csrc[0]), "m"(csrc[16]) : "xmm0", "xmm1", "memory"); #else asm ( "movq %4, %%mm0 \n\t" "movq %5, %%mm1 \n\t" "movq %6, %%mm2 \n\t" "movq %7, %%mm3 \n\t" "movq %%mm0, %0 \n\t" "movq %%mm1, %1 \n\t" "movq %%mm2, %2 \n\t" "movq %%mm3, %3" : "=m"(cdest[0]), "=m"(cdest[8]), "=m"(cdest[16]), "=m"(cdest[24]) : "m"(csrc[0]), "m"(csrc[8]), "m"(csrc[16]), "m"(csrc[24]) : "mm0", "mm1", "mm2", "mm3", "st", "st(1)", "st(2)", "st(3)", "memory"); #endif } static inline void simd_block_store (uint8_t* dest) { #if __SSE__ asm volatile ( "movntq %%mm0, %0\n\t" "movntq %%mm0, %1\n\t" "movntq %%mm0, %2\n\t" "movntq %%mm0, %3" : "=m"(dest[0]), "=m"(dest[8]), "=m"(dest[16]), "=m"(dest[24]) :: "memory"); #else asm volatile ( "movq %%mm0, %0 \n\t" "movq %%mm0, %1 \n\t" "movq %%mm0, %2 \n\t" "movq %%mm0, %3" : "=m"(dest[0]), "=m"(dest[8]), "=m"(dest[16]), "=m"(dest[24]) :: "memory"); #endif } static inline void simd_block_cleanup (void) { #if !__SSE__ simd::reset_mmx(); #endif asm volatile ("sfence"); } /// The fastest optimized raw memory copy. void copy_n_fast (const void* src, size_t nBytes, void* dest) noexcept { movsb_dir_up(); size_t nHeadBytes = Align(uintptr_t(src), MMX_ALIGN) - uintptr_t(src); nHeadBytes = min (nHeadBytes, nBytes); movsb (src, nHeadBytes, dest); nBytes -= nHeadBytes; if (!(uintptr_t(dest) % MMX_ALIGN)) { const size_t nMiddleBlocks = nBytes / MMX_BS; for (uoff_t i = 0; i < nMiddleBlocks; ++ i) { prefetch (advance (src, 512), 0, 0); simd_block_copy (src, dest); src = advance (src, MMX_BS); dest = advance (dest, MMX_BS); } simd_block_cleanup(); nBytes %= MMX_BS; } movsb (src, nBytes, dest); } #endif // __MMX__ /// The fastest optimized backwards raw memory copy. void copy_backward_fast (const void* first, const void* last, void* result) noexcept { prefetch (first, 0, 0); prefetch (result, 1, 0); size_t nBytes (distance (first, last)); movsb_dir_down(); size_t nHeadBytes = uintptr_t(last) % 4; last = advance (last, -1); result = advance (result, -1); movsb (last, nHeadBytes, result); nBytes -= nHeadBytes; if (uintptr_t(result) % 4 == 3) { const size_t nMiddleBlocks = nBytes / 4; last = advance (last, -3); result = advance (result, -3); movsd (last, nMiddleBlocks, result); nBytes %= 4; } movsb (last, nBytes, result); movsb_dir_up(); } #endif // __x86__ //---------------------------------------------------------------------- // Fill functions //---------------------------------------------------------------------- #if __MMX__ template static inline void build_block (T) {} template <> inline void build_block (uint8_t v) { asm volatile ( "movd %0, %%mm0\n\tpunpcklbw %%mm0, %%mm0\n\tpshufw $0, %%mm0, %%mm0" : : "g"(uint32_t(v)) : "mm0"); } template <> inline void build_block (uint16_t v) { asm volatile ( "movd %0, %%mm0\n\tpshufw $0, %%mm0, %%mm0" : : "g"(uint32_t(v)) : "mm0"); } template <> inline void build_block (uint32_t v) { asm volatile ( "movd %0, %%mm0\n\tpunpckldq %%mm0, %%mm0" : : "g"(uint32_t(v)) : "mm0"); } static inline void simd_block_fill_loop (uint8_t*& dest, size_t count) { prefetch (advance (dest, 512), 1, 0); for (const uint8_t* destEnd = dest + count * MMX_BS; dest < destEnd; dest += MMX_BS) simd_block_store (dest); simd_block_cleanup(); simd::reset_mmx(); } template static inline void fill_n_fast (T* dest, size_t count, T v) { size_t nHead = Align(uintptr_t(dest), MMX_ALIGN) - uintptr_t(dest) / sizeof(T); nHead = min (nHead, count); stosv (dest, nHead, v); count -= nHead; build_block (v); uint8_t* bdest = reinterpret_cast(dest); simd_block_fill_loop (bdest, count * sizeof(T) / MMX_BS); count %= MMX_BS; dest = reinterpret_cast(bdest); stosv (dest, count, v); } void fill_n8_fast (uint8_t* dest, size_t count, uint8_t v) noexcept { fill_n_fast (dest, count, v); } void fill_n16_fast (uint16_t* dest, size_t count, uint16_t v) noexcept { fill_n_fast (dest, count, v); } void fill_n32_fast (uint32_t* dest, size_t count, uint32_t v) noexcept { fill_n_fast (dest, count, v); } #else void fill_n8_fast (uint8_t* dest, size_t count, uint8_t v) noexcept { memset (dest, v, count); } void fill_n16_fast (uint16_t* dest, size_t count, uint16_t v) noexcept { stosv (dest, count, v); } void fill_n32_fast (uint32_t* dest, size_t count, uint32_t v) noexcept { stosv (dest, count, v); } #endif // __MMX__ /// Exchanges ranges [first, middle) and [middle, last) void rotate_fast (void* first, void* middle, void* last) noexcept { #if HAVE_ALLOCA_H const size_t half1 (distance (first, middle)), half2 (distance (middle, last)); const size_t hmin (min (half1, half2)); if (!hmin) return; void* buf = alloca (hmin); if (buf) { if (half2 < half1) { copy_n_fast (middle, half2, buf); copy_backward_fast (first, middle, last); copy_n_fast (buf, half2, first); } else { copy_n_fast (first, half1, buf); copy_n_fast (middle, half2, first); copy_n_fast (buf, half1, advance (first, half2)); } } else #else if (first == middle || middle == last) return; #endif { char* f = static_cast(first); char* m = static_cast(middle); char* l = static_cast(last); reverse (f, m); reverse (m, l); while (f != m && m != l) iter_swap (f++, --l); reverse (f, (f == m ? l : m)); } } #if __GNUC__ < 4 size_t popcount (uint32_t v) noexcept { const uint32_t w = v - ((v >> 1) & 0x55555555); // Algorithm from AMD optimization guide const uint32_t x = (w & 0x33333333) + ((w >> 2) & 0x33333333); return ((x + (x >> 4) & 0x0F0F0F0F) * 0x01010101) >> 24; } #if HAVE_INT64_T /// \brief Returns the number of 1s in \p v in binary. size_t popcount (uint64_t v) noexcept { v -= (v >> 1) & UINT64_C(0x5555555555555555); // Algorithm from Wikipedia v = (v & UINT64_C(0x3333333333333333)) + ((v >> 2) & UINT64_C(0x3333333333333333)); v = (v + (v >> 4)) & UINT64_C(0x0F0F0F0F0F0F0F0F); return (v * UINT64_C(0x0101010101010101)) >> 56; } #endif // HAVE_INT64_T #endif // !__GNUC__ //---------------------------------------------------------------------- // Miscellaneous instantiated stuff from headers which don't have enough // to warrant creation of a separate file.cc //---------------------------------------------------------------------- // Used in uspecial to print printable characters const char _FmtPrtChr[2][8]={"'%c'","%d"}; } // namespace ustl