/*
 * Copyright (C) 2014 Cloudius Systems, Ltd.
 */

//
// Seastar memory allocator
//
// This is a share-nothing allocator (memory allocated on one cpu must
// be freed on the same cpu).
//
// Inspired by gperftools' tcmalloc.
//
// Memory map:
//
// 0x0000'sccc'vvvv'vvvv
//
// 0000 - required by architecture (only 48 bits of address space)
// s    - chosen to satisfy system allocator (1-7)
// ccc  - cpu number (0-12 bits allocated vary according to system)
// v    - virtual address within cpu (32-44 bits, according to how much ccc
//        leaves us)
//
// Each page has a page structure that describes it.  Within a cpu's
// memory pool, the page array starts at offset 0, describing all pages
// within that pool.  Page 0 does not describe a valid page.
//
// Each pool can contain at most 2^32 pages (or 44 address bits), so we can
// use a 32-bit integer to identify a page.
//
// Runs of pages are organized into spans.  Free spans are organized into lists,
// by size.  When spans are broken up or coalesced, they may move into new lists.
#include "memory.hh"

#ifndef DEFAULT_ALLOCATOR

#include "bitops.hh"
#include "align.hh"
#include <new>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <cerrno>
#include <algorithm>
#include <limits>
#include <cassert>
#include <atomic>
#include <mutex>
#include <functional>
#include <vector>
#include <boost/intrusive/list.hpp>
#include <sys/mman.h>

#ifdef HAVE_NUMA
#include <numaif.h>
#endif

namespace memory {

static constexpr const size_t page_bits = 12;
static constexpr const size_t page_size = 1 << page_bits;       // 4 KiB pages
static constexpr const unsigned cpu_id_shift = 36; // FIXME: make dynamic

// Pages are identified by index into the per-cpu page array, so 32 bits
// suffice (see the memory-map comment at the top of this file).
using pageidx = uint32_t;

struct page;
class page_list;

namespace bi = boost::intrusive;

// Intrusive doubly-linked list link embedded in struct page.  Links are
// page indices rather than pointers, so the page array can be relocated
// (see cpu_pages::do_resize()) without fixing up every link.
class page_list_link {
    uint32_t _prev;
    uint32_t _next;
    friend class page_list;
};

// Reserve (once, process-wide) a 2^44-byte-aligned region of address space
// that all cpus carve their pools out of.  We map twice the needed size,
// then trim both ends so the surviving region is aligned.
static char* mem_base() {
    static char* known;
    static std::once_flag flag;
    std::call_once(flag, [] {
        size_t alloc = size_t(1) << 44;
        auto r = ::mmap(NULL, 2 * alloc,
                PROT_NONE,
                MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE,
                -1, 0);
        if (r == MAP_FAILED) {
            abort();
        }
        ::madvise(r, 2 * alloc, MADV_DONTDUMP);
        auto cr = reinterpret_cast<char*>(r);
        known = align_up(cr, alloc);
        ::munmap(cr, known - cr);
        ::munmap(known + alloc, cr + 2 * alloc - (known + alloc));
    });
    return known;
}

class small_pool;

// Free objects are linked through their own storage.
struct free_object {
    free_object* next;
};

// Per-page metadata.  For a span, only the head and tail pages carry a
// valid span_size; interior pages' fields are meaningless.
struct page {
    bool free;
    uint8_t offset_in_span;      // distance (in pages) from span head
    uint16_t nr_small_alloc;     // live small objects carved from this span
    uint32_t span_size;          // in pages, if we're the head or the tail
    page_list_link link;
    small_pool* pool;            // if used in a small_pool, else nullptr
    free_object* freelist;       // per-span freelist (small pools only)
};

// Index-based intrusive list of spans (heads).  Index 0 is the reserved
// page and doubles as the null link, hence empty() == !_front.
class page_list {
    uint32_t _front = 0;
    uint32_t _back = 0;
public:
    page& front(page* ary) { return ary[_front]; }
    page& back(page* ary) { return ary[_back]; }
    bool empty() const { return !_front; }
    void erase(page* ary, page& span) {
        if (span.link._next) {
            ary[span.link._next].link._prev = span.link._prev;
        } else {
            _back = span.link._prev;
        }
        if (span.link._prev) {
            ary[span.link._prev].link._next = span.link._next;
        } else {
            _front = span.link._next;
        }
    }
    void push_front(page* ary, page& span) {
        auto idx = &span - ary;
        if (_front) {
            ary[_front].link._prev = idx;
        } else {
            _back = idx;
        }
        span.link._next = _front;
        span.link._prev = 0;
        _front = idx;
    }
    void pop_front(page* ary) {
        if (ary[_front].link._next) {
            ary[ary[_front].link._next].link._prev = 0;
        } else {
            _back = 0;
        }
        _front = ary[_front].link._next;
    }
};

// Pool of fixed-size objects.  Objects are carved out of multi-page spans;
// a central freelist (_free) serves allocation, and overflow is returned
// to the owning spans by trim_free_list().
class small_pool {
    unsigned _object_size;
    unsigned _span_size;         // in pages
    free_object* _free = nullptr;
    size_t _free_count = 0;
    unsigned _min_free;
    unsigned _max_free;
    page_list _span_list;        // spans with a non-empty per-span freelist
    static constexpr unsigned idx_frac_bits = 2;
private:
    size_t span_bytes() const { return _span_size * page_size; }
public:
    explicit small_pool(unsigned object_size) noexcept;
    ~small_pool();
    void* allocate();
    void deallocate(void* object);
    unsigned object_size() const { return _object_size; }
    static constexpr unsigned size_to_idx(unsigned size);
    static constexpr unsigned idx_to_size(unsigned idx);
private:
    void add_more_objects();
    void trim_free_list();
    float waste();
};

// Size classes are log-spaced with 2^idx_frac_bits subdivisions per octave:
// index 0b0001'1100 -> size (1 << 4) + 0b11 << (4 - 2)
constexpr unsigned
small_pool::idx_to_size(unsigned idx) {
    return (((1 << idx_frac_bits) | (idx & ((1 << idx_frac_bits) - 1)))
              << (idx >> idx_frac_bits))
                  >> idx_frac_bits;
}

// ceil(log2(n)); count_leading_zeros() comes from bitops.hh.
static constexpr unsigned log2(unsigned n) {
    return std::numeric_limits<unsigned>::digits - count_leading_zeros(n - 1);
}

constexpr unsigned
small_pool::size_to_idx(unsigned size) {
    return ((log2(size) << idx_frac_bits) - ((1 << idx_frac_bits) - 1))
            + ((size - 1) >> (log2(size) - idx_frac_bits));
}

// All small pools, constructed in place via placement new so we control
// construction order and can skip destruction (see ~u()).
class small_pool_array {
public:
    static constexpr const unsigned nr_small_pools = small_pool::size_to_idx(4 * page_size) + 1;
private:
    union u {
        small_pool a[nr_small_pools];
        u() {
            for (unsigned i = 0; i < nr_small_pools; ++i) {
                new (&a[i]) small_pool(small_pool::idx_to_size(i));
            }
        }
        ~u() {
            // cannot really call destructor, since other
            // objects may be freed after we are gone.
        }
    } _u;
public:
    small_pool& operator[](unsigned idx) { return _u.a[idx]; }
};

static constexpr const size_t max_small_allocation
    = small_pool::idx_to_size(small_pool_array::nr_small_pools - 1);

// Per-cpu state: the pool's memory, its page array, and the free-span
// lists.  Thread-local; see cpu_mem below.
struct cpu_pages {
    static constexpr unsigned min_free_pages = 20000000 / page_size;
    char* memory;
    page* pages;
    uint32_t nr_pages;
    uint32_t nr_free_pages;
    uint32_t current_min_free_pages = 0;    // 0 until a reclaim hook is set
    unsigned cpu_id = -1U;
    std::function<void (std::function<void ()>)> reclaim_hook;
    std::vector<reclaimer*> reclaimers;
    static constexpr const unsigned nr_span_lists = 32;
    union pla {
        pla() {
            for (auto&& e : free_spans) {
                new (&e) page_list;
            }
        }
        ~pla() {
            // no destructor -- might be freeing after we die
        }
        page_list free_spans[nr_span_lists];  // contains spans with span_size >= 2^idx
    } fsu;
    small_pool_array small_pools;
    static std::atomic<unsigned> cpu_id_gen;
    char* mem() { return memory; }

    void link(page_list& list, page* span);
    void unlink(page_list& list, page* span);
    struct trim {
        unsigned offset;    // pages to shave off the span's head
        unsigned nr_pages;  // pages to keep after the offset
    };
    template <typename Trimmer>
    void* allocate_large_and_trim(unsigned nr_pages, Trimmer trimmer);
    void* allocate_large(unsigned nr_pages);
    void* allocate_large_aligned(unsigned align_pages, unsigned nr_pages);
    void free_large(void* ptr);
    void free_span(pageidx start, uint32_t nr_pages);
    void free_span_no_merge(pageidx start, uint32_t nr_pages);
    void* allocate_small(unsigned size);
    void free(void* ptr);
    void free(void* ptr, size_t size);
    size_t object_size(void* ptr);
    page* to_page(void* p) {
        return &pages[(reinterpret_cast<char*>(p) - mem()) / page_size];
    }

    bool initialize();
    void reclaim();
    void set_reclaim_hook(std::function<void (std::function<void ()>)> hook);
    void resize(size_t new_size);
    void do_resize(size_t new_size);
};

static thread_local cpu_pages cpu_mem;
std::atomic<unsigned> cpu_pages::cpu_id_gen;

// floor(log2(pages)): which free-span list a span of this size belongs to.
static inline
unsigned index_of(unsigned pages) {
    return std::numeric_limits<unsigned>::digits - count_leading_zeros(pages) - 1;
}

void
cpu_pages::unlink(page_list& list, page* span) {
    list.erase(pages, *span);
}

void
cpu_pages::link(page_list& list, page* span) {
    list.push_front(pages, *span);
}

// Return a span to the free lists without attempting to coalesce with
// its neighbors.  Marks head and tail pages and records the size in both.
void cpu_pages::free_span_no_merge(uint32_t span_start, uint32_t nr_pages) {
    assert(nr_pages);
    nr_free_pages += nr_pages;
    auto span = &pages[span_start];
    auto span_end = &pages[span_start + nr_pages - 1];
    span->free = span_end->free = true;
    span->span_size = span_end->span_size = nr_pages;
    auto idx = index_of(nr_pages);
    link(fsu.free_spans[idx], span);
}

// Return a span to the free lists, coalescing with free neighbors.
// Page 0 and the one-past-last sentinel are always marked in-use, so
// the boundary probes below are safe.
void cpu_pages::free_span(uint32_t span_start, uint32_t nr_pages) {
    page* before = &pages[span_start - 1];
    if (before->free) {
        auto b_size = before->span_size;   // 'before' is the tail of its span
        assert(b_size);
        span_start -= b_size;
        nr_pages += b_size;
        nr_free_pages -= b_size;
        unlink(fsu.free_spans[index_of(b_size)], before - (b_size - 1));
    }
    page* after = &pages[span_start + nr_pages];
    if (after->free) {
        auto a_size = after->span_size;    // 'after' is the head of its span
        assert(a_size);
        nr_pages += a_size;
        nr_free_pages -= a_size;
        unlink(fsu.free_spans[index_of(a_size)], after);
    }
    free_span_no_merge(span_start, nr_pages);
}

// Core large-allocation path.  Finds a big-enough free span, lets the
// trimmer decide which sub-range to keep (to implement alignment), and
// returns the unused head/tail back to the free lists.
template <typename Trimmer>
void*
cpu_pages::allocate_large_and_trim(unsigned n_pages, Trimmer trimmer) {
    auto idx = index_of(n_pages);
    assert(n_pages >= (1u << idx));
    assert(n_pages < (2u << idx));
    while (idx < nr_span_lists && fsu.free_spans[idx].empty()) {
        ++idx;
    }
    if (idx == nr_span_lists) {
        if (initialize()) {
            return allocate_large_and_trim(n_pages, trimmer);
        }
        // FIXME: request application to free memory
        throw std::bad_alloc();
    }
    auto& list = fsu.free_spans[idx];
    page* span = &list.front(pages);
    auto span_size = span->span_size;
    auto span_idx = span - pages;
    unlink(list, span);
    nr_free_pages -= span->span_size;
    trim t = trimmer(span_idx, nr_pages);
    if (t.offset) {
        free_span_no_merge(span_idx, t.offset);
        span_idx += t.offset;
        span_size -= t.offset;
        span = &pages[span_idx];
    }
    // Use the tracked local span_size here: after a head trim, 'span'
    // points at a former interior page whose span_size field is stale.
    if (t.nr_pages < span_size) {
        free_span_no_merge(span_idx + t.nr_pages, span_size - t.nr_pages);
        span_size = t.nr_pages;
    }
    auto span_end = &pages[span_idx + n_pages - 1];
    span->free = span_end->free = false;
    span->span_size = span_end->span_size = n_pages;
    span->pool = nullptr;
    if (nr_free_pages < current_min_free_pages) {
        reclaim();
    }
    return mem() + span_idx * page_size;
}

void*
cpu_pages::allocate_large(unsigned n_pages) {
    return allocate_large_and_trim(n_pages, [n_pages] (unsigned idx, unsigned n) {
        return trim{0, std::min(n, n_pages)};
    });
}

void*
cpu_pages::allocate_large_aligned(unsigned align_pages, unsigned n_pages) {
    // Over-allocate so an aligned sub-span of n_pages always exists, then
    // trim the misaligned head off.
    return allocate_large_and_trim(n_pages + align_pages - 1, [=] (unsigned idx, unsigned n) {
        return trim{align_up(idx, align_pages) - idx, n_pages};
    });
}

void* cpu_pages::allocate_small(unsigned size) {
    auto idx = small_pool::size_to_idx(size);
    auto& pool = small_pools[idx];
    assert(size <= pool.object_size());
    return pool.allocate();
}

void cpu_pages::free_large(void* ptr) {
    pageidx idx = (reinterpret_cast<char*>(ptr) - mem()) / page_size;
    page* span = &pages[idx];
    free_span(idx, span->span_size);
}

size_t cpu_pages::object_size(void* ptr) {
    pageidx idx = (reinterpret_cast<char*>(ptr) - mem()) / page_size;
    page* span = &pages[idx];
    if (span->pool) {
        return span->pool->object_size();
    } else {
        return size_t(span->span_size) * page_size;
    }
}

void cpu_pages::free(void* ptr) {
    // share-nothing: the pointer must have been allocated on this cpu
    assert(((reinterpret_cast<uintptr_t>(ptr) >> cpu_id_shift) & 0xff) == cpu_id);
    page* span = to_page(ptr);
    if (span->pool) {
        span->pool->deallocate(ptr);
    } else {
        free_large(ptr);
    }
}

void cpu_pages::free(void* ptr, size_t size) {
    // Sized free: skip the page lookup when the size tells us the pool.
    assert(((reinterpret_cast<uintptr_t>(ptr) >> cpu_id_shift) & 0xff) == cpu_id);
    if (size <= max_small_allocation) {
        auto pool = &small_pools[small_pool::size_to_idx(size)];
        pool->deallocate(ptr);
    } else {
        free_large(ptr);
    }
}

// Lazily set up this cpu's pool on first use.  Returns false if already
// initialized.
bool cpu_pages::initialize() {
    if (nr_pages) {
        return false;
    }
    cpu_id = cpu_id_gen.fetch_add(1, std::memory_order_relaxed);
    auto base = mem_base() + (size_t(cpu_id) << cpu_id_shift);
    auto size = 32 << 20;  // Small size for bootstrap
    auto r = ::mmap(base, size,
            PROT_READ | PROT_WRITE,
            MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
            -1, 0);
    if (r == MAP_FAILED) {
        abort();
    }
    ::madvise(base, size, MADV_HUGEPAGE);
    pages = reinterpret_cast<page*>(base);
    memory = base;
    nr_pages = size / page_size;
    // we reserve the end page so we don't have to special case
    // the last span.
    auto reserved = align_up(sizeof(page) * (nr_pages + 1), page_size) / page_size;
    for (pageidx i = 0; i < reserved; ++i) {
        pages[i].free = false;
    }
    pages[nr_pages].free = false;   // sentinel
    free_span_no_merge(reserved, nr_pages - reserved);
    return true;
}

// Grow the pool to new_size bytes.  The page array must grow too, so a
// new one is allocated from the (enlarged) pool and the old one's pages
// are returned as a free span.
void cpu_pages::do_resize(size_t new_size) {
    auto new_pages = new_size / page_size;
    if (new_pages <= nr_pages) {
        return;
    }
    auto old_size = nr_pages * page_size;
    auto mmap_start = memory + old_size;
    auto mmap_size = new_size - old_size;
    auto r = ::mmap(mmap_start, mmap_size,
            PROT_READ | PROT_WRITE,
            MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
            -1, 0);
    if (r == MAP_FAILED) {
        abort();
    }
    ::madvise(mmap_start, mmap_size, MADV_HUGEPAGE);
    // one past last page structure is a sentinel
    auto new_page_array_pages = align_up(sizeof(page) * (new_pages + 1), page_size) / page_size;
    auto new_page_array = reinterpret_cast<page*>(allocate_large(new_page_array_pages));
    std::copy(pages, pages + nr_pages, new_page_array);
    // mark new one-past-last page as taken to avoid boundary conditions
    new_page_array[new_pages].free = false;
    auto old_pages = reinterpret_cast<char*>(pages);
    auto old_nr_pages = nr_pages;
    auto old_pages_size = align_up(sizeof(page) * (nr_pages + 1), page_size);
    pages = new_page_array;
    nr_pages = new_pages;
    auto old_pages_start = (old_pages - memory) / page_size;
    if (old_pages_start == 0) {
        // keep page 0 allocated
        old_pages_start = 1;
        old_pages_size -= page_size;
    }
    free_span(old_pages_start, old_pages_size / page_size);
    free_span(old_nr_pages, new_pages - old_nr_pages);
}

void cpu_pages::resize(size_t new_size) {
    new_size = align_down(new_size, page_size);
    while (nr_pages * page_size < new_size) {
        // don't reallocate all at once, since there might not
        // be enough free memory available to relocate the pages array
        auto tmp_size = std::min(new_size, 4 * nr_pages * page_size);
        do_resize(tmp_size);
    }
}

// Ask the application (via the hook) to run the registered reclaimers.
// current_min_free_pages is zeroed while reclaiming to avoid recursion.
void cpu_pages::reclaim() {
    current_min_free_pages = 0;
    reclaim_hook([this] {
        for (auto&& r : reclaimers) {
            r->do_reclaim();
        }
        current_min_free_pages = min_free_pages;
    });
}

void cpu_pages::set_reclaim_hook(std::function<void (std::function<void ()>)> hook) {
    reclaim_hook = hook;
    current_min_free_pages = min_free_pages;
}

// Choose a span size balancing internal fragmentation (waste()), span
// overhead, and objects-per-span; derive freelist hysteresis thresholds.
small_pool::small_pool(unsigned object_size) noexcept
    : _object_size(object_size), _span_size(1) {
    while (_object_size > span_bytes()
            || (_span_size < 32 && waste() > 0.05)
            || (span_bytes() / object_size < 32)) {
        _span_size *= 2;
    }
    _max_free = std::max<unsigned>(100, span_bytes() * 2 / _object_size);
    _min_free = _max_free / 2;
}

small_pool::~small_pool() {
    _min_free = _max_free = 0;
    trim_free_list();
}

void*
small_pool::allocate() {
    if (!_free) {
        add_more_objects();
    }
    auto* obj = _free;
    _free = _free->next;
    --_free_count;
    return obj;
}

void
small_pool::deallocate(void* object) {
    auto o = reinterpret_cast<free_object*>(object);
    o->next = _free;
    _free = o;
    ++_free_count;
    if (_free_count >= _max_free) {
        trim_free_list();
    }
}

// Refill the central freelist up to the hysteresis midpoint: first drain
// per-span freelists, then carve fresh spans from the page allocator.
void
small_pool::add_more_objects() {
    auto goal = (_min_free + _max_free) / 2;
    while (!_span_list.empty() && _free_count < goal) {
        page& span = _span_list.front(cpu_mem.pages);
        _span_list.pop_front(cpu_mem.pages);
        while (span.freelist) {
            auto obj = span.freelist;
            span.freelist = span.freelist->next;
            obj->next = _free;
            _free = obj;
            ++_free_count;
            ++span.nr_small_alloc;
        }
    }
    while (_free_count < goal) {
        auto data = reinterpret_cast<char*>(cpu_mem.allocate_large(_span_size));
        auto span = cpu_mem.to_page(data);
        for (unsigned i = 0; i < _span_size; ++i) {
            span[i].offset_in_span = i;
            span[i].pool = this;
        }
        span->nr_small_alloc = 0;
        span->freelist = nullptr;
        for (unsigned offset = 0; offset <= span_bytes() - _object_size; offset += _object_size) {
            auto h = reinterpret_cast<free_object*>(data + offset);
            h->next = _free;
            _free = h;
            ++_free_count;
            ++span->nr_small_alloc;
        }
    }
}

// Return objects from the central freelist to their spans; fully-free
// spans are given back to the page allocator.
void
small_pool::trim_free_list() {
    auto goal = (_min_free + _max_free) / 2;
    while (_free && _free_count > goal) {
        auto obj = _free;
        _free = _free->next;
        --_free_count;
        page* span = cpu_mem.to_page(obj);
        span -= span->offset_in_span;
        if (!span->freelist) {
            new (&span->link) page_list_link();
            _span_list.push_front(cpu_mem.pages, *span);
        }
        obj->next = span->freelist;
        span->freelist = obj;
        if (--span->nr_small_alloc == 0) {
            _span_list.erase(cpu_mem.pages, *span);
            cpu_mem.free_span(span - cpu_mem.pages, span->span_size);
        }
    }
}

// Fraction of a span lost to internal fragmentation.
float small_pool::waste() {
    return (span_bytes() % _object_size) / (1.0 * span_bytes());
}

void* allocate_large(size_t size) {
    unsigned size_in_pages = (size + page_size - 1) >> page_bits;
    assert((size_t(size_in_pages) << page_bits) >= size);  // no overflow
    return cpu_mem.allocate_large(size_in_pages);
}

void* allocate_large_aligned(size_t align, size_t size) {
    unsigned size_in_pages = (size + page_size - 1) >> page_bits;
    unsigned align_in_pages = std::max(align, page_size) >> page_bits;
    return cpu_mem.allocate_large_aligned(align_in_pages, size_in_pages);
}

void free_large(void* ptr) {
    return cpu_mem.free_large(ptr);
}

size_t object_size(void* ptr) {
    return cpu_mem.object_size(ptr);
}

void* allocate(size_t size) {
    // every object must be able to hold a freelist link
    if (size <= sizeof(free_object)) {
        size = sizeof(free_object);
    }
    if (size <= max_small_allocation) {
        return cpu_mem.allocate_small(size);
    } else {
        return allocate_large(size);
    }
}

void* allocate_aligned(size_t align, size_t size) {
    // small-pool classes are naturally aligned to their (rounded) size
    size = std::max(size, align);
    if (size <= sizeof(free_object)) {
        size = sizeof(free_object);
    }
    if (size <= max_small_allocation) {
        return cpu_mem.allocate_small(size);
    } else {
        return allocate_large_aligned(align, size);
    }
}

void free(void* obj) {
    cpu_mem.free(obj);
}

void free(void* obj, size_t size) {
    cpu_mem.free(obj, size);
}

void set_reclaim_hook(std::function<void (std::function<void ()>)> hook) {
    cpu_mem.set_reclaim_hook(hook);
}

reclaimer::reclaimer(std::function<void ()> reclaim)
    : _reclaim(std::move(reclaim)) {
    cpu_mem.reclaimers.push_back(this);
}

reclaimer::~reclaimer() {
    auto& r = cpu_mem.reclaimers;
    r.erase(std::find(r.begin(), r.end(), this));
}

// Size this cpu's pool and (optionally) bind its pieces to NUMA nodes.
void configure(std::vector<resource::memory> m) {
    size_t total = 0;
    for (auto&& x : m) {
        total += x.bytes;
    }
    cpu_mem.resize(total);
    size_t pos = 0;
    for (auto&& x : m) {
#ifdef HAVE_NUMA
        unsigned long nodemask = 1UL << x.nodeid;
        auto r = ::mbind(cpu_mem.mem() + pos, x.bytes,
                        MPOL_BIND,
                        &nodemask, std::numeric_limits<unsigned long>::digits,
                        MPOL_MF_MOVE);
        assert(r == 0);
#endif
        pos += x.bytes;
    }
}

}

using namespace memory;

extern "C"
void* malloc(size_t n) throw () {
    if (n == 0) {
        return nullptr;
    }
    try {
        return allocate(n);
    } catch (std::bad_alloc& ba) {
        return nullptr;
    }
}

extern "C"
[[gnu::alias("malloc")]]
void* __libc_malloc(size_t n) throw ();

extern "C"
void free(void* ptr) {
    if (ptr) {
        memory::free(ptr);
    }
}

extern "C"
[[gnu::alias("free")]]
void __libc_free(void* obj) throw ();

extern "C"
void* calloc(size_t nmemb, size_t size) {
    // detect nmemb * size overflow via 128-bit arithmetic
    auto s1 = __int128(nmemb) * __int128(size);
    assert(s1 == size_t(s1));
    size_t s = s1;
    auto p = malloc(s);
    if (p) {
        std::memset(p, 0, s);
    }
    return p;
}

extern "C"
[[gnu::alias("calloc")]]
void* __libc_calloc(size_t n, size_t m) throw ();

extern "C"
void* realloc(void* ptr, size_t size) {
    auto old_size = ptr ? object_size(ptr) : 0;
    auto nptr = malloc(size);
    if (!nptr) {
        return nptr;
    }
    if (ptr) {
        std::memcpy(nptr, ptr, std::min(size, old_size));
        ::free(ptr);
    }
    return nptr;
}

extern "C"
[[gnu::alias("realloc")]]
void* __libc_realloc(void* obj, size_t size) throw ();

extern "C"
int posix_memalign(void** ptr, size_t align, size_t size) {
    try {
        *ptr = allocate_aligned(align, size);
        return 0;
    } catch (std::bad_alloc&) {
        return ENOMEM;
    }
}

extern "C"
[[gnu::alias("posix_memalign")]]
int __libc_posix_memalign(void** ptr, size_t align, size_t size) throw ();

extern "C"
void* memalign(size_t align, size_t size) {
    try {
        return allocate_aligned(align, size);
    } catch (std::bad_alloc&) {
        return NULL;
    }
}

extern "C"
[[gnu::alias("memalign")]]
void* __libc_memalign(size_t align, size_t size);

extern "C"
void cfree(void* obj) {
    return ::free(obj);
}

extern "C"
[[gnu::alias("cfree")]]
void __libc_cfree(void* obj);

extern "C"
size_t malloc_usable_size(void* obj) {
    return object_size(obj);
}

extern "C"
int malloc_trim(size_t pad) {
    return 0;
}

void* operator new(size_t size) {
    if (size == 0) {
        size = 1;
    }
    return allocate(size);
}

void* operator new[](size_t size) {
    if (size == 0) {
        size = 1;
    }
    return allocate(size);
}

void operator delete(void* ptr) throw () {
    if (ptr) {
        memory::free(ptr);
    }
}

void operator delete[](void* ptr) throw () {
    if (ptr) {
        memory::free(ptr);
    }
}

void operator delete(void* ptr, size_t size) {
    if (ptr) {
        memory::free(ptr, size);
    }
}

void operator delete[](void* ptr, size_t size) {
    if (ptr) {
        memory::free(ptr, size);
    }
}

void* operator new(size_t size, std::nothrow_t) throw () {
    if (size == 0) {
        size = 1;
    }
    try {
        return allocate(size);
    } catch (...) {
        return nullptr;
    }
}

void* operator new[](size_t size, std::nothrow_t) throw () {
    if (size == 0) {
        size = 1;
    }
    try {
        return allocate(size);
    } catch (...) {
        return nullptr;
    }
}

void operator delete(void* ptr, std::nothrow_t) throw () {
    if (ptr) {
        memory::free(ptr);
    }
}

void operator delete[](void* ptr, std::nothrow_t) throw () {
    if (ptr) {
        memory::free(ptr);
    }
}

void operator delete(void* ptr, size_t size, std::nothrow_t) throw () {
    if (ptr) {
        memory::free(ptr, size);
    }
}

void operator delete[](void* ptr, size_t size, std::nothrow_t) throw () {
    if (ptr) {
        memory::free(ptr, size);
    }
}

#else

namespace memory {

reclaimer::reclaimer(std::function<void ()> reclaim) {
}

reclaimer::~reclaimer() {
}

void set_reclaim_hook(std::function<void (std::function<void ()>)> hook) {
}

void configure(std::vector<resource::memory> m) {
}

}

#endif