small_pools, which are responsible for allocations up to 16kB, cannot provide a buffer with alignment stricter than a page. As a result, aligned allocations were broken for buffers in the 4kB-16kB range. This patch makes sure that if the alignment requirement is too strict for a small_pool to handle, allocate_large_aligned() is used instead.

Fixes #36.

Signed-off-by: Paweł Dziepak <pdziepak@quarnos.org>
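
A minimal reproducer (illustrative, not part of the patch; it assumes the program is linked against this allocator):

    #include <assert.h>
    #include <stdint.h>
    #include <stdlib.h>

    int main(void) {
        // 8kB with 8kB alignment: the size fits a small_pool (<= 16kB), but
        // the alignment is stricter than a page, so the request must take
        // the allocate_large_aligned() path added by this patch.
        void* p = NULL;
        int r = posix_memalign(&p, 8192, 8192);
        assert(r == 0);
        assert(((uintptr_t)p & 8191) == 0);
        free(p);
        return 0;
    }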

/*
 * This file is open source software, licensed to you under the terms
 * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
 * distributed with this work for additional information regarding copyright
 * ownership. You may not use this file except in compliance with the License.
 *
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*
 * Copyright (C) 2014 Cloudius Systems, Ltd.
 */

//
// Seastar memory allocator
//
// This is a share-nothing allocator (memory allocated on one cpu must
// be freed on the same cpu).
//
// Inspired by gperftools' tcmalloc.
//
// Memory map:
//
// 0x0000'sccc'vvvv'vvvv
//
// 0000 - required by architecture (only 48 bits of address space)
// s    - chosen to satisfy system allocator (1-7)
// ccc  - cpu number (0-12 bits allocated vary according to system)
// v    - virtual address within cpu (32-44 bits, according to how much ccc
//        leaves us)
//
// Each page has a page structure that describes it. Within a cpu's
// memory pool, the page array starts at offset 0, describing all pages
// within that pool. Page 0 does not describe a valid page.
//
// Each pool can contain at most 2^32 pages (or 44 address bits), so we can
// use a 32-bit integer to identify a page.
//
// Runs of pages are organized into spans. Free spans are organized into lists,
// by size. When spans are broken up or coalesced, they may move into new lists.
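
// Worked example of the address layout above (illustrative): mem_base()
// below is aligned to 1 << 44, so bits 0-43 of the base are zero. Cpu 5's
// pool then starts at mem_base() + (size_t(5) << cpu_id_shift), and
// object_cpu_id() recovers the owner from any pointer into that pool:
//
//     char* p = mem_base() + (size_t(5) << cpu_id_shift) + 0x1000;
//     assert(object_cpu_id(p) == 5); // (uintptr_t(p) >> 36) & 0xff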

#include "memory.hh"

#ifndef DEFAULT_ALLOCATOR

#include "bitops.hh"
#include "align.hh"
#include "posix.hh"
#include "shared_ptr.hh"
#include <new>
#include <cstdint>
#include <algorithm>
#include <limits>
#include <cassert>
#include <atomic>
#include <mutex>
#include <experimental/optional>
#include <functional>
#include <cstring>
#include <boost/intrusive/list.hpp>
#include <sys/mman.h>
#ifdef HAVE_NUMA
#include <numaif.h>
#endif

namespace memory {

static constexpr const unsigned cpu_id_shift = 36; // FIXME: make dynamic
static constexpr const unsigned max_cpus = 256;
static constexpr const size_t cache_line_size = 64;

using pageidx = uint32_t;

struct page;
class page_list;

static thread_local uint64_t g_allocs;
static thread_local uint64_t g_frees;
static thread_local uint64_t g_cross_cpu_frees;

using std::experimental::optional;

using allocate_system_memory_fn
    = std::function<mmap_area (optional<void*> where, size_t how_much)>;

namespace bi = boost::intrusive;

inline
unsigned object_cpu_id(const void* ptr) {
    return (reinterpret_cast<uintptr_t>(ptr) >> cpu_id_shift) & 0xff;
}

class page_list_link {
    uint32_t _prev;
    uint32_t _next;
    friend class page_list;
};
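
// mem_base() obtains a (1 << 44)-aligned region by over-reserving: mmap()
// only guarantees page alignment, so twice the needed size is reserved and
// the misaligned head and tail are unmapped. A sketch of the arithmetic,
// where b is whatever address mmap() returned and known = align_up(b, 2^44):
//
//     [ b .. known )                -> unmapped head
//     [ known .. known + 2^44 )     -> kept, 2^44-aligned
//     [ known + 2^44 .. b + 2^45 )  -> unmapped tail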

static char* mem_base() {
    static char* known;
    static std::once_flag flag;
    std::call_once(flag, [] {
        size_t alloc = size_t(1) << 44;
        auto r = ::mmap(NULL, 2 * alloc,
                PROT_NONE,
                MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE,
                -1, 0);
        if (r == MAP_FAILED) {
            abort();
        }
        ::madvise(r, 2 * alloc, MADV_DONTDUMP);
        auto cr = reinterpret_cast<char*>(r);
        known = align_up(cr, alloc);
        ::munmap(cr, known - cr);
        ::munmap(known + alloc, cr + 2 * alloc - (known + alloc));
    });
    return known;
}

class small_pool;

struct free_object {
    free_object* next;
};

struct page {
    bool free;
    uint8_t offset_in_span;
    uint16_t nr_small_alloc;
    uint32_t span_size; // in pages, if we're the head or the tail
    page_list_link link;
    small_pool* pool;  // if used in a small_pool
    free_object* freelist;
};

class page_list {
    uint32_t _front = 0;
    uint32_t _back = 0;
public:
    page& front(page* ary) { return ary[_front]; }
    page& back(page* ary) { return ary[_back]; }
    bool empty() const { return !_front; }
    void erase(page* ary, page& span) {
        if (span.link._next) {
            ary[span.link._next].link._prev = span.link._prev;
        } else {
            _back = span.link._prev;
        }
        if (span.link._prev) {
            ary[span.link._prev].link._next = span.link._next;
        } else {
            _front = span.link._next;
        }
    }
    void push_front(page* ary, page& span) {
        auto idx = &span - ary;
        if (_front) {
            ary[_front].link._prev = idx;
        } else {
            _back = idx;
        }
        span.link._next = _front;
        span.link._prev = 0;
        _front = idx;
    }
    void pop_front(page* ary) {
        if (ary[_front].link._next) {
            ary[ary[_front].link._next].link._prev = 0;
        } else {
            _back = 0;
        }
        _front = ary[_front].link._next;
    }
};

class small_pool {
    unsigned _object_size;
    unsigned _span_size;
    free_object* _free = nullptr;
    size_t _free_count = 0;
    unsigned _min_free;
    unsigned _max_free;
    page_list _span_list;
    static constexpr unsigned idx_frac_bits = 2;
private:
    size_t span_bytes() const { return _span_size * page_size; }
public:
    explicit small_pool(unsigned object_size) noexcept;
    ~small_pool();
    void* allocate();
    void deallocate(void* object);
    unsigned object_size() const { return _object_size; }
    static constexpr unsigned size_to_idx(unsigned size);
    static constexpr unsigned idx_to_size(unsigned idx);
private:
    void add_more_objects();
    void trim_free_list();
    float waste();
};

// index 0b0001'1100 -> size (1 << 4) + 0b11 << (4 - 2)

constexpr unsigned
small_pool::idx_to_size(unsigned idx) {
    return (((1 << idx_frac_bits) | (idx & ((1 << idx_frac_bits) - 1)))
                    << (idx >> idx_frac_bits))
            >> idx_frac_bits;
}

static constexpr unsigned log2(unsigned n) {
    return std::numeric_limits<unsigned>::digits - count_leading_zeros(n - 1);
}

constexpr unsigned
small_pool::size_to_idx(unsigned size) {
    return ((log2(size) << idx_frac_bits) - ((1 << idx_frac_bits) - 1))
            + ((size - 1) >> (log2(size) - idx_frac_bits));
}
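
// Illustrative sanity checks for the size-class mapping (they assume
// count_leading_zeros() from bitops.hh is constexpr): power-of-two sizes
// round-trip to a class of exactly that size.
static_assert(small_pool::idx_to_size(small_pool::size_to_idx(8)) == 8,
        "8-byte size class should be exact");
static_assert(small_pool::idx_to_size(small_pool::size_to_idx(4096)) == 4096,
        "one-page size class should be exact");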

class small_pool_array {
public:
    static constexpr const unsigned nr_small_pools = small_pool::size_to_idx(4 * page_size) + 1;
private:
    union u {
        small_pool a[nr_small_pools];
        u() {
            for (unsigned i = 0; i < nr_small_pools; ++i) {
                new (&a[i]) small_pool(small_pool::idx_to_size(i));
            }
        }
        ~u() {
            // cannot really call destructor, since other
            // objects may be freed after we are gone.
        }
    } _u;
public:
    small_pool& operator[](unsigned idx) { return _u.a[idx]; }
};

static constexpr const size_t max_small_allocation
    = small_pool::idx_to_size(small_pool_array::nr_small_pools - 1);

struct cross_cpu_free_item {
    cross_cpu_free_item* next;
};

struct cpu_pages {
    static constexpr unsigned min_free_pages = 20000000 / page_size;
    char* memory;
    page* pages;
    uint32_t nr_pages;
    uint32_t nr_free_pages;
    uint32_t current_min_free_pages = 0;
    unsigned cpu_id = -1U;
    std::function<void (std::function<void ()>)> reclaim_hook;
    std::vector<reclaimer*> reclaimers;
    static constexpr const unsigned nr_span_lists = 32;
    union pla {
        pla() {
            for (auto&& e : free_spans) {
                new (&e) page_list;
            }
        }
        ~pla() {
            // no destructor -- might be freeing after we die
        }
        page_list free_spans[nr_span_lists];  // contains spans with span_size >= 2^idx
    } fsu;
    small_pool_array small_pools;
    alignas(cache_line_size) std::atomic<cross_cpu_free_item*> xcpu_freelist;
    alignas(cache_line_size) std::vector<physical_address> virt_to_phys_map;
    static std::atomic<unsigned> cpu_id_gen;
    static cpu_pages* all_cpus[max_cpus];
    char* mem() { return memory; }

    void link(page_list& list, page* span);
    void unlink(page_list& list, page* span);
    struct trim {
        unsigned offset;
        unsigned nr_pages;
    };
    template <typename Trimmer>
    void* allocate_large_and_trim(unsigned nr_pages, Trimmer trimmer);
    void* allocate_large(unsigned nr_pages);
    void* allocate_large_aligned(unsigned align_pages, unsigned nr_pages);
    void free_large(void* ptr);
    void free_span(pageidx start, uint32_t nr_pages);
    void free_span_no_merge(pageidx start, uint32_t nr_pages);
    void* allocate_small(unsigned size);
    void free(void* ptr);
    void free(void* ptr, size_t size);
    void free_cross_cpu(unsigned cpu_id, void* ptr);
    bool drain_cross_cpu_freelist();
    size_t object_size(void* ptr);
    page* to_page(void* p) {
        return &pages[(reinterpret_cast<char*>(p) - mem()) / page_size];
    }

    bool initialize();
    void reclaim();
    void set_reclaim_hook(std::function<void (std::function<void ()>)> hook);
    void resize(size_t new_size, allocate_system_memory_fn alloc_sys_mem);
    void do_resize(size_t new_size, allocate_system_memory_fn alloc_sys_mem);
    void replace_memory_backing(allocate_system_memory_fn alloc_sys_mem);
    void init_virt_to_phys_map();
    translation translate(const void* addr, size_t size);
};

static thread_local cpu_pages cpu_mem;
std::atomic<unsigned> cpu_pages::cpu_id_gen;
cpu_pages* cpu_pages::all_cpus[max_cpus];

// Free spans are stored in the largest index i such that nr_pages >= 1 << i.
static inline
unsigned index_of(unsigned pages) {
    return std::numeric_limits<unsigned>::digits - count_leading_zeros(pages) - 1;
}

// Smallest index i such that all spans stored in the index are >= pages.
static inline
unsigned index_of_conservative(unsigned pages) {
    if (pages == 1) {
        return 0;
    }
    return std::numeric_limits<unsigned>::digits - count_leading_zeros(pages - 1);
}
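
// For example (illustrative): a free 12-page span lives in list
// index_of(12) == 3, together with every other span of 8-15 pages, while a
// request for 12 pages must start searching at index_of_conservative(12) == 4,
// the first list whose spans are all guaranteed to hold at least 12 pages.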

void
cpu_pages::unlink(page_list& list, page* span) {
    list.erase(pages, *span);
}

void
cpu_pages::link(page_list& list, page* span) {
    list.push_front(pages, *span);
}

void cpu_pages::free_span_no_merge(uint32_t span_start, uint32_t nr_pages) {
    assert(nr_pages);
    nr_free_pages += nr_pages;
    auto span = &pages[span_start];
    auto span_end = &pages[span_start + nr_pages - 1];
    span->free = span_end->free = true;
    span->span_size = span_end->span_size = nr_pages;
    auto idx = index_of(nr_pages);
    link(fsu.free_spans[idx], span);
}
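
// free_span() coalesces a freed span with its free neighbours before
// reinserting it. The neighbour lookups rely on the boundary tags kept by
// free_span_no_merge() above: both the head and the tail page of a free span
// store its size, so pages[span_start - 1] is the tail of the span just
// before us and pages[span_start + nr_pages] the head of the one just after.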

void cpu_pages::free_span(uint32_t span_start, uint32_t nr_pages) {
    page* before = &pages[span_start - 1];
    if (before->free) {
        auto b_size = before->span_size;
        assert(b_size);
        span_start -= b_size;
        nr_pages += b_size;
        nr_free_pages -= b_size;
        unlink(fsu.free_spans[index_of(b_size)], before - (b_size - 1));
    }
    page* after = &pages[span_start + nr_pages];
    if (after->free) {
        auto a_size = after->span_size;
        assert(a_size);
        nr_pages += a_size;
        nr_free_pages -= a_size;
        unlink(fsu.free_spans[index_of(a_size)], after);
    }
    free_span_no_merge(span_start, nr_pages);
}

template <typename Trimmer>
void*
cpu_pages::allocate_large_and_trim(unsigned n_pages, Trimmer trimmer) {
    auto idx = index_of_conservative(n_pages);
    assert(n_pages < (2u << idx));
    while (idx < nr_span_lists && fsu.free_spans[idx].empty()) {
        ++idx;
    }
    if (idx == nr_span_lists) {
        if (initialize()) {
            return allocate_large_and_trim(n_pages, trimmer);
        }
        // FIXME: request application to free memory
        throw std::bad_alloc();
    }
    auto& list = fsu.free_spans[idx];
    page* span = &list.front(pages);
    auto span_size = span->span_size;
    auto span_idx = span - pages;
    unlink(list, span);
    nr_free_pages -= span->span_size;
    trim t = trimmer(span_idx, span_size);
    if (t.offset) {
        free_span_no_merge(span_idx, t.offset);
        span_idx += t.offset;
        span_size -= t.offset;
        span = &pages[span_idx];
    }
    if (t.nr_pages < span_size) {
        free_span_no_merge(span_idx + t.nr_pages, span_size - t.nr_pages);
        span_size = t.nr_pages;
    }
    auto span_end = &pages[span_idx + t.nr_pages - 1];
    span->free = span_end->free = false;
    span->span_size = span_end->span_size = t.nr_pages;
    span->pool = nullptr;
    if (nr_free_pages < current_min_free_pages) {
        drain_cross_cpu_freelist();
        reclaim();
    }
    return mem() + span_idx * page_size;
}

void*
cpu_pages::allocate_large(unsigned n_pages) {
    return allocate_large_and_trim(n_pages, [n_pages] (unsigned idx, unsigned n) {
        return trim{0, std::min(n, n_pages)};
    });
}

void*
cpu_pages::allocate_large_aligned(unsigned align_pages, unsigned n_pages) {
    return allocate_large_and_trim(n_pages + align_pages - 1, [=] (unsigned idx, unsigned n) {
        return trim{align_up(idx, align_pages) - idx, n_pages};
    });
}
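
// A worked example of the aligned trim above (illustrative):
// allocate_large_aligned(4, n) asks allocate_large_and_trim() for n + 3
// pages; if the chosen span happens to start at page 13, the trimmer returns
// trim{align_up(13u, 4u) - 13, n} == trim{3, n}, so pages 13-15 go back to
// the free lists and the allocation starts at page 16, which is 4-page
// aligned. Page-index alignment implies address alignment here because each
// cpu's pool base is aligned to 1 << cpu_id_shift.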

void*
cpu_pages::allocate_small(unsigned size) {
    auto idx = small_pool::size_to_idx(size);
    auto& pool = small_pools[idx];
    assert(size <= pool.object_size());
    return pool.allocate();
}

void cpu_pages::free_large(void* ptr) {
    pageidx idx = (reinterpret_cast<char*>(ptr) - mem()) / page_size;
    page* span = &pages[idx];
    free_span(idx, span->span_size);
}

size_t cpu_pages::object_size(void* ptr) {
    pageidx idx = (reinterpret_cast<char*>(ptr) - mem()) / page_size;
    page* span = &pages[idx];
    if (span->pool) {
        return span->pool->object_size();
    } else {
        return size_t(span->span_size) * page_size;
    }
}

void cpu_pages::free_cross_cpu(unsigned cpu_id, void* ptr) {
    auto p = reinterpret_cast<cross_cpu_free_item*>(ptr);
    auto& list = all_cpus[cpu_id]->xcpu_freelist;
    auto old = list.load(std::memory_order_relaxed);
    do {
        p->next = old;
    } while (!list.compare_exchange_weak(old, p, std::memory_order_release, std::memory_order_relaxed));
    ++g_cross_cpu_frees;
}

bool cpu_pages::drain_cross_cpu_freelist() {
    if (!xcpu_freelist.load(std::memory_order_relaxed)) {
        return false;
    }
    auto p = xcpu_freelist.exchange(nullptr, std::memory_order_acquire);
    while (p) {
        auto n = p->next;
        free(p);
        p = n;
    }
    return true;
}
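
// The cross-cpu freelist is a lock-free stack: free_cross_cpu() pushes with
// a release compare-exchange and drain_cross_cpu_freelist() takes the whole
// list with an acquire exchange, so the owning cpu observes fully written
// free_object headers. Only the owner ever removes nodes, which is what
// makes the unconditional exchange(nullptr) above safe.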

void cpu_pages::free(void* ptr) {
    auto obj_cpu = object_cpu_id(ptr);
    if (obj_cpu != cpu_id) {
        return free_cross_cpu(obj_cpu, ptr);
    }
    page* span = to_page(ptr);
    if (span->pool) {
        span->pool->deallocate(ptr);
    } else {
        free_large(ptr);
    }
}

void cpu_pages::free(void* ptr, size_t size) {
    auto obj_cpu = object_cpu_id(ptr);
    if (obj_cpu != cpu_id) {
        return free_cross_cpu(obj_cpu, ptr);
    }
    if (size <= max_small_allocation) {
        auto pool = &small_pools[small_pool::size_to_idx(size)];
        pool->deallocate(ptr);
    } else {
        free_large(ptr);
    }
}

bool cpu_pages::initialize() {
    if (nr_pages) {
        return false;
    }
    cpu_id = cpu_id_gen.fetch_add(1, std::memory_order_relaxed);
    assert(cpu_id < max_cpus);
    all_cpus[cpu_id] = this;
    auto base = mem_base() + (size_t(cpu_id) << cpu_id_shift);
    auto size = 32 << 20;  // Small size for bootstrap
    auto r = ::mmap(base, size,
            PROT_READ | PROT_WRITE,
            MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
            -1, 0);
    if (r == MAP_FAILED) {
        abort();
    }
    ::madvise(base, size, MADV_HUGEPAGE);
    pages = reinterpret_cast<page*>(base);
    memory = base;
    nr_pages = size / page_size;
    // we reserve the end page so we don't have to special case
    // the last span.
    auto reserved = align_up(sizeof(page) * (nr_pages + 1), page_size) / page_size;
    for (pageidx i = 0; i < reserved; ++i) {
        pages[i].free = false;
    }
    pages[nr_pages].free = false;
    free_span_no_merge(reserved, nr_pages - reserved);
    return true;
}

mmap_area
allocate_anonymous_memory(optional<void*> where, size_t how_much) {
    return mmap_anonymous(where.value_or(nullptr),
            how_much,
            PROT_READ | PROT_WRITE,
            MAP_PRIVATE | (where ? MAP_FIXED : 0));
}

mmap_area
allocate_hugetlbfs_memory(file_desc& fd, optional<void*> where, size_t how_much) {
    auto pos = fd.size();
    fd.truncate(pos + how_much);
    auto ret = fd.map(
            how_much,
            PROT_READ | PROT_WRITE,
            MAP_SHARED | MAP_POPULATE | (where ? MAP_FIXED : 0),
            pos,
            where.value_or(nullptr));
    return ret;
}

void cpu_pages::replace_memory_backing(allocate_system_memory_fn alloc_sys_mem) {
    // We would like to use ::mremap() to atomically replace the old anonymous
    // memory with hugetlbfs backed memory, but mremap() does not support hugetlbfs
    // (for no reason at all). So we must copy the anonymous memory to some other
    // place, map hugetlbfs in place, and copy it back, without modifying it during
    // the operation.
    auto bytes = nr_pages * page_size;
    auto old_mem = mem();
    auto relocated_old_mem = mmap_anonymous(nullptr, bytes, PROT_READ|PROT_WRITE, MAP_PRIVATE);
    std::memcpy(relocated_old_mem.get(), old_mem, bytes);
    alloc_sys_mem({old_mem}, bytes).release();
    std::memcpy(old_mem, relocated_old_mem.get(), bytes);
}

void cpu_pages::init_virt_to_phys_map() {
    auto nr_entries = nr_pages / (huge_page_size / page_size);
    virt_to_phys_map.resize(nr_entries);
    auto fd = file_desc::open("/proc/self/pagemap", O_RDONLY | O_CLOEXEC);
    for (size_t i = 0; i != nr_entries; ++i) {
        uint64_t entry = 0;
        auto phys = std::numeric_limits<physical_address>::max();
        auto pfn = reinterpret_cast<uintptr_t>(mem() + i * huge_page_size) / page_size;
        fd.pread(&entry, 8, pfn * 8);
        assert(entry & 0x8000'0000'0000'0000);
        phys = (entry & 0x007f'ffff'ffff'ffff) << page_bits;
        virt_to_phys_map[i] = phys;
    }
}

translation
cpu_pages::translate(const void* addr, size_t size) {
    auto a = reinterpret_cast<uintptr_t>(addr) - reinterpret_cast<uintptr_t>(mem());
    auto pfn = a / huge_page_size;
    if (pfn >= virt_to_phys_map.size()) {
        return {};
    }
    auto phys = virt_to_phys_map[pfn];
    if (phys == std::numeric_limits<physical_address>::max()) {
        return {};
    }
    auto translation_size = align_up(a + 1, huge_page_size) - a;
    size = std::min(size, translation_size);
    phys += a & (huge_page_size - 1);
    return translation{phys, size};
}

void cpu_pages::do_resize(size_t new_size, allocate_system_memory_fn alloc_sys_mem) {
    auto new_pages = new_size / page_size;
    if (new_pages <= nr_pages) {
        return;
    }
    auto old_size = nr_pages * page_size;
    auto mmap_start = memory + old_size;
    auto mmap_size = new_size - old_size;
    auto mem = alloc_sys_mem({mmap_start}, mmap_size);
    mem.release();
    ::madvise(mmap_start, mmap_size, MADV_HUGEPAGE);
    // one past last page structure is a sentinel
    auto new_page_array_pages = align_up(sizeof(page[new_pages + 1]), page_size) / page_size;
    auto new_page_array
        = reinterpret_cast<page*>(allocate_large(new_page_array_pages));
    std::copy(pages, pages + nr_pages, new_page_array);
    // mark new one-past-last page as taken to avoid boundary conditions
    new_page_array[new_pages].free = false;
    auto old_pages = reinterpret_cast<char*>(pages);
    auto old_nr_pages = nr_pages;
    auto old_pages_size = align_up(sizeof(page[nr_pages + 1]), page_size);
    pages = new_page_array;
    nr_pages = new_pages;
    auto old_pages_start = (old_pages - memory) / page_size;
    if (old_pages_start == 0) {
        // keep page 0 allocated
        old_pages_start = 1;
        old_pages_size -= page_size;
    }
    free_span(old_pages_start, old_pages_size / page_size);
    free_span(old_nr_pages, new_pages - old_nr_pages);
}

void cpu_pages::resize(size_t new_size, allocate_system_memory_fn alloc_memory) {
    new_size = align_down(new_size, huge_page_size);
    while (nr_pages * page_size < new_size) {
        // don't reallocate all at once, since there might not
        // be enough free memory available to relocate the pages array
        auto tmp_size = std::min(new_size, 4 * nr_pages * page_size);
        do_resize(tmp_size, alloc_memory);
    }
}

void cpu_pages::reclaim() {
    current_min_free_pages = 0;
    reclaim_hook([this] {
        for (auto&& r : reclaimers) {
            r->do_reclaim();
        }
        current_min_free_pages = min_free_pages;
    });
}

void cpu_pages::set_reclaim_hook(std::function<void (std::function<void ()>)> hook) {
    reclaim_hook = hook;
    current_min_free_pages = min_free_pages;
}

small_pool::small_pool(unsigned object_size) noexcept
    : _object_size(object_size), _span_size(1) {
    while (_object_size > span_bytes()
            || (_span_size < 32 && waste() > 0.05)
            || (span_bytes() / object_size < 32)) {
        _span_size *= 2;
    }
    _max_free = std::max<unsigned>(100, span_bytes() * 2 / _object_size);
    _min_free = _max_free / 2;
}
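
// An example of the sizing loop above (illustrative, assuming a 4096-byte
// page): for a 1024-byte pool, span_bytes() doubles 4kB -> 8kB -> 16kB ->
// 32kB and stops once a span holds 32768 / 1024 == 32 objects; _max_free is
// then std::max(100u, 2u * 32768 / 1024) == 100.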

small_pool::~small_pool() {
    _min_free = _max_free = 0;
    trim_free_list();
}

void*
small_pool::allocate() {
    if (!_free) {
        add_more_objects();
    }
    auto* obj = _free;
    _free = _free->next;
    --_free_count;
    return obj;
}

void
small_pool::deallocate(void* object) {
    auto o = reinterpret_cast<free_object*>(object);
    o->next = _free;
    _free = o;
    ++_free_count;
    if (_free_count >= _max_free) {
        trim_free_list();
    }
}

void
small_pool::add_more_objects() {
    auto goal = (_min_free + _max_free) / 2;
    while (!_span_list.empty() && _free_count < goal) {
        page& span = _span_list.front(cpu_mem.pages);
        _span_list.pop_front(cpu_mem.pages);
        while (span.freelist) {
            auto obj = span.freelist;
            span.freelist = span.freelist->next;
            obj->next = _free;
            _free = obj;
            ++_free_count;
            ++span.nr_small_alloc;
        }
    }
    while (_free_count < goal) {
        auto data = reinterpret_cast<char*>(cpu_mem.allocate_large(_span_size));
        auto span = cpu_mem.to_page(data);
        for (unsigned i = 0; i < _span_size; ++i) {
            span[i].offset_in_span = i;
            span[i].pool = this;
        }
        span->nr_small_alloc = 0;
        span->freelist = nullptr;
        for (unsigned offset = 0; offset <= span_bytes() - _object_size; offset += _object_size) {
            auto h = reinterpret_cast<free_object*>(data + offset);
            h->next = _free;
            _free = h;
            ++_free_count;
            ++span->nr_small_alloc;
        }
    }
}

void
small_pool::trim_free_list() {
    auto goal = (_min_free + _max_free) / 2;
    while (_free && _free_count > goal) {
        auto obj = _free;
        _free = _free->next;
        --_free_count;
        page* span = cpu_mem.to_page(obj);
        span -= span->offset_in_span;
        if (!span->freelist) {
            new (&span->link) page_list_link();
            _span_list.push_front(cpu_mem.pages, *span);
        }
        obj->next = span->freelist;
        span->freelist = obj;
        if (--span->nr_small_alloc == 0) {
            _span_list.erase(cpu_mem.pages, *span);
            cpu_mem.free_span(span - cpu_mem.pages, span->span_size);
        }
    }
}

float small_pool::waste() {
    return (span_bytes() % _object_size) / (1.0 * span_bytes());
}

void* allocate_large(size_t size) {
    unsigned size_in_pages = (size + page_size - 1) >> page_bits;
    assert((size_t(size_in_pages) << page_bits) >= size);
    return cpu_mem.allocate_large(size_in_pages);
}

void* allocate_large_aligned(size_t align, size_t size) {
    unsigned size_in_pages = (size + page_size - 1) >> page_bits;
    unsigned align_in_pages = std::max(align, page_size) >> page_bits;
    return cpu_mem.allocate_large_aligned(align_in_pages, size_in_pages);
}

void free_large(void* ptr) {
    return cpu_mem.free_large(ptr);
}

size_t object_size(void* ptr) {
    return cpu_pages::all_cpus[object_cpu_id(ptr)]->object_size(ptr);
}

void* allocate(size_t size) {
    ++g_allocs;
    if (size <= sizeof(free_object)) {
        size = sizeof(free_object);
    }
    if (size <= max_small_allocation) {
        return cpu_mem.allocate_small(size);
    } else {
        return allocate_large(size);
    }
}

void* allocate_aligned(size_t align, size_t size) {
    ++g_allocs;
    size = std::max(size, align);
    if (size <= sizeof(free_object)) {
        size = sizeof(free_object);
    }
    if (size <= max_small_allocation && align <= page_size) {
        // Our small allocator only guarantees alignment for power-of-two
        // allocations which are not larger than a page.
        size = 1 << log2(size);
        return cpu_mem.allocate_small(size);
    } else {
        return allocate_large_aligned(align, size);
    }
}
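
// Both branches above preserve the alignment contract (illustrative,
// assuming a 4096-byte page): allocate_aligned(1024, 48) rounds the request
// up to a 1024-byte object, which a small pool lays out at 1024-byte offsets
// inside page-aligned spans; allocate_aligned(8192, 8192) fails the
// align <= page_size test and is routed to allocate_large_aligned(), which
// is the path this patch fixes.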

void free(void* obj) {
    ++g_frees;
    cpu_mem.free(obj);
}

void free(void* obj, size_t size) {
    ++g_frees;
    cpu_mem.free(obj, size);
}

void set_reclaim_hook(std::function<void (std::function<void ()>)> hook) {
    cpu_mem.set_reclaim_hook(hook);
}

reclaimer::reclaimer(std::function<void ()> reclaim)
    : _reclaim(std::move(reclaim)) {
    cpu_mem.reclaimers.push_back(this);
}

reclaimer::~reclaimer() {
    auto& r = cpu_mem.reclaimers;
    r.erase(std::find(r.begin(), r.end(), this));
}

void configure(std::vector<resource::memory> m,
        optional<std::string> hugetlbfs_path) {
    size_t total = 0;
    for (auto&& x : m) {
        total += x.bytes;
    }
    allocate_system_memory_fn sys_alloc = allocate_anonymous_memory;
    if (hugetlbfs_path) {
        // std::function is copyable, but file_desc is not, so we must use
        // a shared_ptr to allow sys_alloc to be copied around
        auto fdp = make_lw_shared<file_desc>(file_desc::temporary(*hugetlbfs_path));
        sys_alloc = [fdp] (optional<void*> where, size_t how_much) {
            return allocate_hugetlbfs_memory(*fdp, where, how_much);
        };
        cpu_mem.replace_memory_backing(sys_alloc);
    }
    cpu_mem.resize(total, sys_alloc);
    size_t pos = 0;
    for (auto&& x : m) {
#ifdef HAVE_NUMA
        unsigned long nodemask = 1UL << x.nodeid;
        auto r = ::mbind(cpu_mem.mem() + pos, x.bytes,
                MPOL_BIND,
                &nodemask, std::numeric_limits<unsigned long>::digits,
                MPOL_MF_MOVE);
        assert(r == 0);
#endif
        pos += x.bytes;
    }
    if (hugetlbfs_path) {
        cpu_mem.init_virt_to_phys_map();
    }
}

statistics stats() {
    return statistics{g_allocs, g_frees, g_cross_cpu_frees};
}

bool drain_cross_cpu_freelist() {
    return cpu_mem.drain_cross_cpu_freelist();
}

translation
translate(const void* addr, size_t size) {
    auto cpu_id = object_cpu_id(addr);
    if (cpu_id >= max_cpus) {
        return {};
    }
    auto cp = cpu_pages::all_cpus[cpu_id];
    if (!cp) {
        return {};
    }
    return cp->translate(addr, size);
}

}

using namespace memory;

extern "C"
[[gnu::visibility("default")]]
[[gnu::externally_visible]]
void* malloc(size_t n) throw () {
    if (n == 0) {
        return nullptr;
    }
    try {
        return allocate(n);
    } catch (std::bad_alloc& ba) {
        return nullptr;
    }
}

extern "C"
[[gnu::alias("malloc")]]
[[gnu::visibility("default")]]
void* __libc_malloc(size_t n) throw ();
extern "C"
|
|
[[gnu::visibility("default")]]
|
|
[[gnu::externally_visible]]
|
|
void free(void* ptr) {
|
|
if (ptr) {
|
|
memory::free(ptr);
|
|
}
|
|
}
|
|
|
|
extern "C"
|
|
[[gnu::alias("free")]]
|
|
[[gnu::visibility("default")]]
|
|
void* __libc_free(void* obj) throw ();
|
|
|
|
extern "C"
|
|
[[gnu::visibility("default")]]
|
|
void* calloc(size_t nmemb, size_t size) {
|
|
auto s1 = __int128(nmemb) * __int128(size);
|
|
assert(s1 == size_t(s1));
|
|
size_t s = s1;
|
|
auto p = malloc(s);
|
|
if (p) {
|
|
std::memset(p, 0, s);
|
|
}
|
|
return p;
|
|
}
|
|
|
|
extern "C"
|
|
[[gnu::alias("calloc")]]
|
|
[[gnu::visibility("default")]]
|
|
void* __libc_calloc(size_t n, size_t m) throw ();
|
|
|
|
extern "C"
|
|
[[gnu::visibility("default")]]
|
|
void* realloc(void* ptr, size_t size) {
|
|
auto old_size = ptr ? object_size(ptr) : 0;
|
|
auto nptr = malloc(size);
|
|
if (!nptr) {
|
|
return nptr;
|
|
}
|
|
if (ptr) {
|
|
std::memcpy(nptr, ptr, std::min(size, old_size));
|
|
::free(ptr);
|
|
}
|
|
return nptr;
|
|
}
|
|
|
|
extern "C"
|
|
[[gnu::alias("realloc")]]
|
|
[[gnu::visibility("default")]]
|
|
void* __libc_realloc(void* obj, size_t size) throw ();
|
|
|
|
extern "C"
|
|
[[gnu::visibility("default")]]
|
|
[[gnu::externally_visible]]
|
|
int posix_memalign(void** ptr, size_t align, size_t size) {
|
|
try {
|
|
*ptr = allocate_aligned(align, size);
|
|
return 0;
|
|
} catch (std::bad_alloc&) {
|
|
return ENOMEM;
|
|
}
|
|
}
|
|
|
|
extern "C"
|
|
[[gnu::alias("posix_memalign")]]
|
|
[[gnu::visibility("default")]]
|
|
int __libc_posix_memalign(void** ptr, size_t align, size_t size) throw ();
|
|
|
|
extern "C"
|
|
[[gnu::visibility("default")]]
|
|
void* memalign(size_t align, size_t size) {
|
|
try {
|
|
return allocate_aligned(align, size);
|
|
} catch (std::bad_alloc&) {
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
extern "C"
|
|
[[gnu::visibility("default")]]
|
|
void *aligned_alloc(size_t align, size_t size) {
|
|
try {
|
|
return allocate_aligned(align, size);
|
|
} catch (std::bad_alloc&) {
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
extern "C"
|
|
[[gnu::alias("memalign")]]
|
|
[[gnu::visibility("default")]]
|
|
void* __libc_memalign(size_t align, size_t size);
|
|
|
|
extern "C"
|
|
[[gnu::visibility("default")]]
|
|
void cfree(void* obj) {
|
|
return ::free(obj);
|
|
}
|
|
|
|
extern "C"
|
|
[[gnu::alias("cfree")]]
|
|
[[gnu::visibility("default")]]
|
|
void __libc_cfree(void* obj);
|
|
|
|
extern "C"
|
|
[[gnu::visibility("default")]]
|
|
size_t malloc_usable_size(void* obj) {
|
|
return object_size(obj);
|
|
}
|
|
|
|
extern "C"
|
|
[[gnu::visibility("default")]]
|
|
int malloc_trim(size_t pad) {
|
|
return 0;
|
|
}
|
|
|
|

[[gnu::visibility("default")]]
void* operator new(size_t size) {
    if (size == 0) {
        size = 1;
    }
    return allocate(size);
}

[[gnu::visibility("default")]]
void* operator new[](size_t size) {
    if (size == 0) {
        size = 1;
    }
    return allocate(size);
}

[[gnu::visibility("default")]]
void operator delete(void* ptr) throw () {
    if (ptr) {
        memory::free(ptr);
    }
}

[[gnu::visibility("default")]]
void operator delete[](void* ptr) throw () {
    if (ptr) {
        memory::free(ptr);
    }
}

[[gnu::visibility("default")]]
void operator delete(void* ptr, size_t size) throw () {
    if (ptr) {
        memory::free(ptr, size);
    }
}

[[gnu::visibility("default")]]
void operator delete[](void* ptr, size_t size) throw () {
    if (ptr) {
        memory::free(ptr, size);
    }
}

[[gnu::visibility("default")]]
void* operator new(size_t size, std::nothrow_t) throw () {
    if (size == 0) {
        size = 1;
    }
    try {
        return allocate(size);
    } catch (...) {
        return nullptr;
    }
}

[[gnu::visibility("default")]]
void* operator new[](size_t size, std::nothrow_t) throw () {
    if (size == 0) {
        size = 1;
    }
    try {
        return allocate(size);
    } catch (...) {
        return nullptr;
    }
}

[[gnu::visibility("default")]]
void operator delete(void* ptr, std::nothrow_t) throw () {
    if (ptr) {
        memory::free(ptr);
    }
}

[[gnu::visibility("default")]]
void operator delete[](void* ptr, std::nothrow_t) throw () {
    if (ptr) {
        memory::free(ptr);
    }
}

[[gnu::visibility("default")]]
void operator delete(void* ptr, size_t size, std::nothrow_t) throw () {
    if (ptr) {
        memory::free(ptr, size);
    }
}

[[gnu::visibility("default")]]
void operator delete[](void* ptr, size_t size, std::nothrow_t) throw () {
    if (ptr) {
        memory::free(ptr, size);
    }
}

void* operator new(size_t size, with_alignment wa) {
    return allocate_aligned(wa.alignment(), size);
}

void* operator new[](size_t size, with_alignment wa) {
    return allocate_aligned(wa.alignment(), size);
}

void operator delete(void* ptr, with_alignment wa) {
    // only called for matching operator new, so we know ptr != nullptr
    return memory::free(ptr);
}

void operator delete[](void* ptr, with_alignment wa) {
    // only called for matching operator new, so we know ptr != nullptr
    return memory::free(ptr);
}

#else

namespace memory {

reclaimer::reclaimer(std::function<void ()> reclaim) {
}

reclaimer::~reclaimer() {
}

void set_reclaim_hook(std::function<void (std::function<void ()>)> hook) {
}

void configure(std::vector<resource::memory> m, std::experimental::optional<std::string> hugepages_path) {
}

statistics stats() {
    return statistics{0, 0, 0};
}

bool drain_cross_cpu_freelist() {
    return false;
}

translation
translate(const void* addr, size_t size) {
    return {};
}

}

void* operator new(size_t size, with_alignment wa) {
    void* ret;
    if (posix_memalign(&ret, wa.alignment(), size) != 0) {
        throw std::bad_alloc();
    }
    return ret;
}

void* operator new[](size_t size, with_alignment wa) {
    void* ret;
    if (posix_memalign(&ret, wa.alignment(), size) != 0) {
        throw std::bad_alloc();
    }
    return ret;
}

void operator delete(void* ptr, with_alignment wa) {
    return ::free(ptr);
}

void operator delete[](void* ptr, with_alignment wa) {
    return ::free(ptr);
}

#endif