gctf2023/pwn/flipper/dist/common/source/ustl/ualgobase.cpp

287 lines
8.7 KiB
C++
Raw Normal View History

2023-11-24 18:11:34 +00:00
// This file is part of the uSTL library, an STL implementation.
//
// Copyright (c) 2005 by Mike Sharov <msharov@users.sourceforge.net>
// This file is free software, distributed under the MIT License.
#ifndef NDEBUG // Optimized code here. asserts slow it down, and are checked elsewhere.
#define NDEBUG
#endif
#include "ualgo.h"
namespace ustl {
// Generic version for implementing fill_nX_fast on non-i386 architectures.
template <typename T> static inline void stosv (T*& p, size_t n, T v)
{ while (n--) *p++ = v; }
#if __x86__
//----------------------------------------------------------------------
// Copy functions
//----------------------------------------------------------------------
static inline void movsb_dir_up (void) { asm volatile ("cld"); }
static inline void movsb_dir_down (void) { asm volatile ("std"); }
static inline void movsb (const void*& src, size_t nBytes, void*& dest)
{
asm volatile ("rep;\n\tmovsb"
: "=&S"(src), "=&D"(dest), "=&c"(nBytes)
: "0"(src), "1"(dest), "2"(nBytes)
: "memory");
}
static inline void movsd (const void*& src, size_t nWords, void*& dest)
{
asm volatile ("rep;\n\tmovsl"
: "=&S"(src), "=&D"(dest), "=&c"(nWords)
: "0"(src), "1"(dest), "2"(nWords)
: "memory");
}
#if __MMX__
template <> inline void stosv (uint8_t*& p, size_t n, uint8_t v)
{ asm volatile ("rep;\n\tstosb" : "=&D"(p), "=c"(n) : "0"(p), "1"(n), "a"(v) : "memory"); }
#endif
template <> inline void stosv (uint16_t*& p, size_t n, uint16_t v)
{ asm volatile ("rep;\n\tstosw" : "=&D"(p), "=c"(n) : "0"(p), "1"(n), "a"(v) : "memory"); }
template <> inline void stosv (uint32_t*& p, size_t n, uint32_t v)
{ asm volatile ("rep;\n\tstosl" : "=&D"(p), "=c"(n) : "0"(p), "1"(n), "a"(v) : "memory"); }
#if __MMX__
#define MMX_ALIGN 16U // Data must be aligned on this grain
#define MMX_BS 32U // Assembly routines copy data this many bytes at a time.
static inline void simd_block_copy (const void* src, void* dest)
{
const char* csrc = static_cast<const char*>(src);
char* cdest = static_cast<char*>(dest);
#if __SSE__
asm (
"movaps\t%2, %%xmm0 \n\t"
"movaps\t%3, %%xmm1 \n\t"
"movntps\t%%xmm0, %0 \n\t"
"movntps\t%%xmm1, %1"
: "=m"(cdest[0]), "=m"(cdest[16])
: "m"(csrc[0]), "m"(csrc[16])
: "xmm0", "xmm1", "memory");
#else
asm (
"movq %4, %%mm0 \n\t"
"movq %5, %%mm1 \n\t"
"movq %6, %%mm2 \n\t"
"movq %7, %%mm3 \n\t"
"movq %%mm0, %0 \n\t"
"movq %%mm1, %1 \n\t"
"movq %%mm2, %2 \n\t"
"movq %%mm3, %3"
: "=m"(cdest[0]), "=m"(cdest[8]), "=m"(cdest[16]), "=m"(cdest[24])
: "m"(csrc[0]), "m"(csrc[8]), "m"(csrc[16]), "m"(csrc[24])
: "mm0", "mm1", "mm2", "mm3", "st", "st(1)", "st(2)", "st(3)", "memory");
#endif
}
static inline void simd_block_store (uint8_t* dest)
{
#if __SSE__
asm volatile (
"movntq %%mm0, %0\n\t"
"movntq %%mm0, %1\n\t"
"movntq %%mm0, %2\n\t"
"movntq %%mm0, %3"
: "=m"(dest[0]), "=m"(dest[8]), "=m"(dest[16]), "=m"(dest[24])
:: "memory");
#else
asm volatile (
"movq %%mm0, %0 \n\t"
"movq %%mm0, %1 \n\t"
"movq %%mm0, %2 \n\t"
"movq %%mm0, %3"
: "=m"(dest[0]), "=m"(dest[8]), "=m"(dest[16]), "=m"(dest[24])
:: "memory");
#endif
}
static inline void simd_block_cleanup (void)
{
#if !__SSE__
simd::reset_mmx();
#endif
asm volatile ("sfence");
}
/// The fastest optimized raw memory copy.
void copy_n_fast (const void* src, size_t nBytes, void* dest) noexcept
{
movsb_dir_up();
size_t nHeadBytes = Align(uintptr_t(src), MMX_ALIGN) - uintptr_t(src);
nHeadBytes = min (nHeadBytes, nBytes);
movsb (src, nHeadBytes, dest);
nBytes -= nHeadBytes;
if (!(uintptr_t(dest) % MMX_ALIGN)) {
const size_t nMiddleBlocks = nBytes / MMX_BS;
for (uoff_t i = 0; i < nMiddleBlocks; ++ i) {
prefetch (advance (src, 512), 0, 0);
simd_block_copy (src, dest);
src = advance (src, MMX_BS);
dest = advance (dest, MMX_BS);
}
simd_block_cleanup();
nBytes %= MMX_BS;
}
movsb (src, nBytes, dest);
}
#endif // __MMX__
/// The fastest optimized backwards raw memory copy.
void copy_backward_fast (const void* first, const void* last, void* result) noexcept
{
prefetch (first, 0, 0);
prefetch (result, 1, 0);
size_t nBytes (distance (first, last));
movsb_dir_down();
size_t nHeadBytes = uintptr_t(last) % 4;
last = advance (last, -1);
result = advance (result, -1);
movsb (last, nHeadBytes, result);
nBytes -= nHeadBytes;
if (uintptr_t(result) % 4 == 3) {
const size_t nMiddleBlocks = nBytes / 4;
last = advance (last, -3);
result = advance (result, -3);
movsd (last, nMiddleBlocks, result);
nBytes %= 4;
}
movsb (last, nBytes, result);
movsb_dir_up();
}
#endif // __x86__
//----------------------------------------------------------------------
// Fill functions
//----------------------------------------------------------------------
#if __MMX__
template <typename T> static inline void build_block (T) {}
template <> inline void build_block (uint8_t v)
{
asm volatile (
"movd %0, %%mm0\n\tpunpcklbw %%mm0, %%mm0\n\tpshufw $0, %%mm0, %%mm0"
: : "g"(uint32_t(v)) : "mm0");
}
template <> inline void build_block (uint16_t v)
{
asm volatile (
"movd %0, %%mm0\n\tpshufw $0, %%mm0, %%mm0"
: : "g"(uint32_t(v)) : "mm0");
}
template <> inline void build_block (uint32_t v)
{
asm volatile (
"movd %0, %%mm0\n\tpunpckldq %%mm0, %%mm0"
: : "g"(uint32_t(v)) : "mm0");
}
static inline void simd_block_fill_loop (uint8_t*& dest, size_t count)
{
prefetch (advance (dest, 512), 1, 0);
for (const uint8_t* destEnd = dest + count * MMX_BS; dest < destEnd; dest += MMX_BS)
simd_block_store (dest);
simd_block_cleanup();
simd::reset_mmx();
}
template <typename T>
static inline void fill_n_fast (T* dest, size_t count, T v)
{
size_t nHead = Align(uintptr_t(dest), MMX_ALIGN) - uintptr_t(dest) / sizeof(T);
nHead = min (nHead, count);
stosv (dest, nHead, v);
count -= nHead;
build_block (v);
uint8_t* bdest = reinterpret_cast<uint8_t*>(dest);
simd_block_fill_loop (bdest, count * sizeof(T) / MMX_BS);
count %= MMX_BS;
dest = reinterpret_cast<T*>(bdest);
stosv (dest, count, v);
}
void fill_n8_fast (uint8_t* dest, size_t count, uint8_t v) noexcept
{ fill_n_fast (dest, count, v); }
void fill_n16_fast (uint16_t* dest, size_t count, uint16_t v) noexcept
{ fill_n_fast (dest, count, v); }
void fill_n32_fast (uint32_t* dest, size_t count, uint32_t v) noexcept
{ fill_n_fast (dest, count, v); }
#else
void fill_n8_fast (uint8_t* dest, size_t count, uint8_t v) noexcept { memset (dest, v, count); }
void fill_n16_fast (uint16_t* dest, size_t count, uint16_t v) noexcept { stosv (dest, count, v); }
void fill_n32_fast (uint32_t* dest, size_t count, uint32_t v) noexcept { stosv (dest, count, v); }
#endif // __MMX__
/// Exchanges ranges [first, middle) and [middle, last)
void rotate_fast (void* first, void* middle, void* last) noexcept
{
#if HAVE_ALLOCA_H
const size_t half1 (distance (first, middle)), half2 (distance (middle, last));
const size_t hmin (min (half1, half2));
if (!hmin)
return;
void* buf = alloca (hmin);
if (buf) {
if (half2 < half1) {
copy_n_fast (middle, half2, buf);
copy_backward_fast (first, middle, last);
copy_n_fast (buf, half2, first);
} else {
copy_n_fast (first, half1, buf);
copy_n_fast (middle, half2, first);
copy_n_fast (buf, half1, advance (first, half2));
}
} else
#else
if (first == middle || middle == last)
return;
#endif
{
char* f = static_cast<char*>(first);
char* m = static_cast<char*>(middle);
char* l = static_cast<char*>(last);
reverse (f, m);
reverse (m, l);
while (f != m && m != l)
iter_swap (f++, --l);
reverse (f, (f == m ? l : m));
}
}
#if __GNUC__ < 4
size_t popcount (uint32_t v) noexcept
{
const uint32_t w = v - ((v >> 1) & 0x55555555); // Algorithm from AMD optimization guide
const uint32_t x = (w & 0x33333333) + ((w >> 2) & 0x33333333);
return ((x + (x >> 4) & 0x0F0F0F0F) * 0x01010101) >> 24;
}
#if HAVE_INT64_T
/// \brief Returns the number of 1s in \p v in binary.
size_t popcount (uint64_t v) noexcept
{
v -= (v >> 1) & UINT64_C(0x5555555555555555); // Algorithm from Wikipedia
v = (v & UINT64_C(0x3333333333333333)) + ((v >> 2) & UINT64_C(0x3333333333333333));
v = (v + (v >> 4)) & UINT64_C(0x0F0F0F0F0F0F0F0F);
return (v * UINT64_C(0x0101010101010101)) >> 56;
}
#endif // HAVE_INT64_T
#endif // !__GNUC__
//----------------------------------------------------------------------
// Miscellaneous instantiated stuff from headers which don't have enough
// to warrant creation of a separate file.cc
//----------------------------------------------------------------------
// Used in uspecial to print printable characters
const char _FmtPrtChr[2][8]={"'%c'","%d"};
} // namespace ustl