mem_block.cpp

/* -*- mode:C++; c-basic-offset:4 -*-
     Shore-MT -- Multi-threaded port of the SHORE storage manager

                       Copyright (c) 2007-2009
      Data Intensive Applications and Systems Laboratory (DIAS)
               Ecole Polytechnique Federale de Lausanne

                         All Rights Reserved.

   Permission to use, copy, modify and distribute this software and
   its documentation is hereby granted, provided that both the
   copyright notice and this permission notice appear in all copies of
   the software, derivative works or modified versions, and any
   portions thereof, and that both notices appear in supporting
   documentation.

   This code is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. THE AUTHORS
   DISCLAIM ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER
   RESULTING FROM THE USE OF THIS SOFTWARE.
*/
/*<std-header orig-src='shore' incl-file-exclusion='MEM_BLOCK_CPP'>

 $Id: mem_block.cpp,v 1.7 2010/08/23 23:01:07 nhall Exp $

SHORE -- Scalable Heterogeneous Object REpository

Copyright (c) 1994-99 Computer Sciences Department, University of
                      Wisconsin -- Madison
All Rights Reserved.

Permission to use, copy, modify and distribute this software and its
documentation is hereby granted, provided that both the copyright
notice and this permission notice appear in all copies of the
software, derivative works or modified versions, and any portions
thereof, and that both notices appear in supporting documentation.

THE AUTHORS AND THE COMPUTER SCIENCES DEPARTMENT OF THE UNIVERSITY
OF WISCONSIN - MADISON ALLOW FREE USE OF THIS SOFTWARE IN ITS
"AS IS" CONDITION, AND THEY DISCLAIM ANY LIABILITY OF ANY KIND
FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.

This software was developed with support by the Advanced Research
Project Agency, ARPA order number 018 (formerly 8230), monitored by
the U.S. Army Research Laboratory under contract DAAB07-91-C-Q518.
Further funding for this work was provided by DARPA through
Rome Research Laboratory Contract No. F30602-97-2-0247.

*/

/**\cond skip */

#include "w.h"
#include "mem_block.h"
#include "atomic_ops.h"
#include <cstdlib>
#include <stdio.h>
#include <algorithm>
#ifdef __linux
#include <malloc.h>
#endif

// #include <cassert>
#undef assert
void assert_failed(const char *desc, const char *f, int l) {
    fprintf(stdout, "Assertion failed: %s at line %d, file %s\n", desc, l, f);
    w_assert0(0);
}
// wrapped in do/while(0) so the macro behaves like a single statement,
// even in an unbraced if/else
#define assert(x)   do { if (!(x)) assert_failed(#x, __FILE__, __LINE__); } while (0)

#define TEMPLATE_ARGS chip_size, chip_count, block_size

#ifdef __GNUC__
#if defined(__x86_64) || defined(i386) || defined(__i386__)
#define NEED_POPC64 0
#elif defined(__sparcv9)
#define NEED_POPC64 0
#else
#define NEED_POPC64 1
#endif
#else // !defined(__GNUC__)
#define NEED_POPC64 1
#endif

namespace memory_block {
#if 0
} // keep emacs happy...
#endif

#if NEED_POPC64==1
// adapted from http://chessprogramming.wikispaces.com/Population+Count#SWAR-Popcount
typedef unsigned long long u64;
static inline
long popc64(u64 x) {
    u64 k1 = 0x5555555555555555ull;
    u64 k2 = 0x3333333333333333ull;
    u64 k4 = 0x0f0f0f0f0f0f0f0full;
    u64 kf = 0x0101010101010101ull;
    x =  x       - ((x >> 1)  & k1); // put count of each 2 bits into those 2 bits
    x = (x & k2) + ((x >> 2)  & k2); // put count of each 4 bits into those 4 bits
    x = (x       +  (x >> 4)) & k4 ; // put count of each 8 bits into those 8 bits
    x = (x * kf) >> 56; // returns 8 most significant bits of x + (x<<8) + (x<<16) + (x<<24) + ...
    return x;
}
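// Worked example (illustrative only, not part of the original source), tracking
// just the low byte for x = 0xF0 (four bits set):
//   after step 1: 0xF0 - (0x78 & 0x55) = 0xF0 - 0x50 = 0xA0   (2-bit fields: 2,2,0,0)
//   after step 2: (0xA0 & 0x33) + (0x28 & 0x33)      = 0x40   (4-bit fields: 4,0)
//   after step 3: (0x40 + 0x04) & 0x0f               = 0x04   (byte holds 4)
//   after step 4: multiplying by 0x0101...01 sums the per-byte counts into the
//                 top byte, and >> 56 extracts it, so popc64(0xF0) == 4.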
#endif

size_t        block_bits::_popc(bitmap bm) {
#if NEED_POPC64==1
#warning "using home-brew popc routine"
    return popc64(bm);
#elif defined(__x86_64) || defined(i386) || defined(__i386__)
// #warning "Using __builtin_popcountll"
    return __builtin_popcountll(bm);
#elif defined(__sparcv9)
#warning "Using gcc inline asm to access sparcv9 'popc' instruction"
    long rval;
    __asm__("popc    %[in], %[out]" : [out] "=r"(rval) : [in] "r"(bm));
    return rval;
#else
#error "NEED_POPC64: Platform not supported"
#endif
}

block_bits::block_bits(size_t chip_count)
    : _usable_chips(create_mask(chip_count))
    , _zombie_chips(0)
{
    assert(chip_count <= 8*sizeof(bitmap));
}

size_t block_bits::acquire_contiguous(size_t chip_count) {
    (void) chip_count; // make gcc happy...

    /* find the rightmost set bit.

       If the map is smaller than the word size but logically full, we
       will compute an index equal to the capacity. If the map is
       physically full (_usable_chips == 0) then we'll still compute an
       index equal to capacity because 0-1 will underflow to all set
       bits. Therefore the check for full is the same whether the
       bitmap can use all its bits or not.
     */
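    /* Illustrative sketch (not in the original source): with
       _usable_chips = ...10101000b,
         one_bit = _usable_chips & -_usable_chips = ...00001000b  (lowest set bit)
         one_bit - 1                              = ...00000111b
         _popc(one_bit - 1)                       = 3             (the chip index)
       If _usable_chips == 0, one_bit is 0 and one_bit - 1 underflows to all
       ones, so _popc() returns 8*sizeof(bitmap), the "full" signal. */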
    bitmap one_bit = _usable_chips & -_usable_chips;
    size_t index = _popc(one_bit-1);
    if(index < 8*sizeof(bitmap)) {
        // common case: we have space
        assert(index < chip_count);
        _usable_chips ^= one_bit;
    }
    else {
        // oops... full
        assert(index == 8*sizeof(bitmap));
    }

    return index;
}

void block_bits::release_contiguous(size_t index, size_t chip_count) {
    // assign this chip to the zombie set for later recycling
    (void) chip_count; // keep gcc happy
    assert(index < chip_count);
    bitmap to_free = bitmap(1) << index;
    assert(! (to_free & *usable_chips()));
#ifdef I_WISH
    bitmap was_free = __sync_fetch_and_or(&_zombie_chips, to_free);
#else
    membar_exit();
    bitmap volatile* ptr = &_zombie_chips;
    bitmap ov = *ptr;
    while(1) {
        bitmap nv = ov | to_free;
        bitmap cv = atomic_cas_64(ptr, ov, nv);
        if(cv == ov)
            break;
        ov = cv;
    }
    bitmap was_free = ov;
#endif
    (void) was_free; // keep gcc happy

    assert( ! (was_free & to_free));
}

block_bits::bitmap block_bits::create_mask(size_t bits_set) {
    // doing it this way allows us to create a bitmap of all ones if need be
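    // Illustrative examples (not in the original source), with a 64-bit bitmap:
    //   create_mask(4)  == ~bitmap(0) >> 60 == 0xf       (4 low bits set)
    //   create_mask(64) == ~bitmap(0) >> 0  == all ones,
    // which a naive (bitmap(1) << bits_set) - 1 could not produce without
    // shifting by the full word width (undefined behavior).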
    return ~bitmap(0) >> (8*sizeof(bitmap) - bits_set);
}

void block_bits::recycle() {
    /* recycle the block.

       Whatever bits have gone zombie since we last recycled become
       the new set of usable bits. We also XOR them atomically back
       into the zombie set to clear them out there. That way we don't
       leak bits if a releasing thread races us and adds more bits to the
       zombie set after we read it.
    */
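    /* Illustrative scenario (not in the original source): suppose we read
       _zombie_chips == 0b0101 into newly_usable. A racing release then ORs
       in 0b0010, making _zombie_chips == 0b0111. XORing newly_usable back
       clears only the bits we consumed: 0b0111 ^ 0b0101 == 0b0010, so the
       late-arriving zombie bit survives until the next recycle() instead of
       being lost, as a plain store of zero would have caused. */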
    bitmap newly_usable = *&_zombie_chips;
    _usable_chips |= newly_usable;
#ifdef I_WISH
    __sync_xor_and_fetch(&_zombie_chips, newly_usable);
#else
    membar_exit();
    bitmap volatile* ptr = &_zombie_chips;
    bitmap ov = *ptr;
    while(1) {
        bitmap nv = ov ^ newly_usable;
        bitmap cv = atomic_cas_64(ptr, ov, nv);
        if(cv == ov)
            break;
        ov = cv;
    }
#endif
}

void* block::acquire_chip(size_t chip_size, size_t chip_count, size_t /*block_size*/) {
    size_t index = _bits.acquire_contiguous(chip_count);
    return (index < chip_count)? _get(index, chip_size) : 0;
}

void block::release_chip(void* ptr, size_t chip_size, size_t chip_count, size_t block_size)
{
    /* use pointer arithmetic to find the beginning of our block,
       where the block* lives.

       Our caller is responsible for ensuring that this address
       actually falls inside a memory block.
     */
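    /* Illustrative sketch (not in the original source), assuming block_size is
       a power of two and blocks are allocated block_size-aligned (the
       block_pool checks alignment): with block_size == 0x2000 (8 KiB),
       -block_size == ...fffe000, so for a hypothetical ptr == 0x7f35a8e43b70
           u.n &= -block_size   -->  0x7f35a8e42000   (start of the block)
       and (ptr - _data) / chip_size then recovers the chip index within it. */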
    union { void* v; size_t n; block* b; char* c; } u = {ptr}, v=u;
    u.n &= -block_size;
    size_t offset = v.c - u.b->_data;
    size_t idx = offset/chip_size;
    assert(u.b->_data + idx*chip_size == ptr);
    u.b->_bits.release_contiguous(idx, chip_count);
}

char* block::_get(size_t index, size_t chip_size) {
    return _data + index*chip_size;
}

block::block(size_t chip_size, size_t chip_count, size_t block_size)
    : _bits(chip_count)
    , _owner(0)
    , _next(0)
{
    // make sure all the chips actually fit in this block
    char* end_of_block = _get(0, chip_size)-sizeof(this)+block_size;
    char* end_of_chips = _get(chip_count, chip_size);
    (void) end_of_block; // keep gcc happy
    (void) end_of_chips; // keep gcc happy
    assert(end_of_chips <= end_of_block);

    /* We purposefully don't check alignment here because some parts
       of the impl cheat for blocks which will never be used to
       allocate anything (the fake_block being the main culprit).
       The block_pool does check alignment, though.
     */
}

void* block_list::acquire(size_t chip_size, size_t chip_count, size_t block_size)
{
    if(void* ptr = _tail->acquire_chip(TEMPLATE_ARGS)) {
        _hit_count++;
        return ptr;
    }

    // darn... gotta do it the hard way
    return _slow_acquire(TEMPLATE_ARGS);
}


block_list::block_list(block_pool* pool, size_t chip_size, size_t chip_count, size_t block_size)
    : _fake_block(TEMPLATE_ARGS)
    , _tail(&_fake_block)
    , _pool(pool)
    , _hit_count(0)
    , _avg_hit_rate(0)
{
    /* make the fake block advertise that it has nothing to give

       The first time the user tries to /acquire/, the fast case will
       detect that the fake block is "full" and fall back to the slow
       case. The slow case knows about the fake block and will remove
       it from the list.

       This trick lets us minimize the number of branches required by
       the fast path acquire.
     */
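    /* Illustrative walk-through (not in the original source): with
       _usable_chips == 0, acquire_contiguous() isolates no bit, computes
       index == 8*sizeof(bitmap), and acquire_chip() therefore returns NULL;
       acquire() then drops into _slow_acquire(), whose _change_blocks() call
       recognizes _tail == &_fake_block and swaps in a real block. */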
    _fake_block._bits._usable_chips = 0;
    _fake_block._bits._zombie_chips = 0;
}


void* block_list::_slow_acquire(size_t chip_size,
        size_t chip_count, size_t block_size)
{
    _change_blocks(TEMPLATE_ARGS);
    return acquire(TEMPLATE_ARGS);
}

block* block_list::acquire_block(size_t block_size)
{
    union { block* b; uintptr_t n; } u = {_pool->acquire_block(this)};
    (void) block_size; // keep gcc happy
    assert((u.n & -block_size) == u.n);
    return u.b;
}

void block_list::_change_blocks(size_t chip_size,
        size_t chip_count, size_t block_size)
{
    (void) chip_size; // keep gcc happy

    // first time through?
    if(_tail == &_fake_block) {
        _tail = acquire_block(block_size);
        _tail->_next = _tail;
        return;
    }

    /* Check whether we're chewing through our blocks too fast for the
       current ring size

       If we consistently recycle blocks while they're still more than
       half full then we need a bigger ring so old blocks have more
       time to cool off between reuses.

       To respond to end-of-spike gracefully, we're going to be pretty
       aggressive about returning blocks to the global pool: when
       recycling blocks we check for runs of blocks which do not meet
       some minimum utilization threshold, discarding all but one of
       them (keep the last for buffering purposes).
    */
    static double const    decay_rate = 1./5; // consider (roughly) the last 5 blocks
    // a block with more usable (free) chips than this is nearly empty,
    // which suggests we should unload some extra blocks
    size_t const max_available = chip_count - std::max((int)(.1*chip_count), 1);
    // more than 50% of a block still in use when we move on suggests
    // we've got too few blocks
    size_t const min_allocated = (chip_count+1)/2;
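    /* Illustrative numbers (not in the original source): for chip_count == 64,
         max_available = 64 - max((int)(.1*64), 1) = 64 - 6 = 58
         min_allocated = (64+1)/2                  = 32
       so a block becomes a release candidate once more than 58 of its 64 chips
       are free (usable_count > max_available, checked in the loop below), and
       the ring grows when fewer than 32 chips were handed out per block change. */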

    _avg_hit_rate = _hit_count*(1-decay_rate) + _avg_hit_rate*decay_rate;
    if(_hit_count < min_allocated && _avg_hit_rate < min_allocated) {
        // too fast.. grow the ring
        block* new_block = acquire_block(block_size);
        new_block->_next = _tail->_next;
        _tail = _tail->_next = new_block;
    }
    else {
        // compress the run, if any
        block* prev = 0;
        block* cur = _tail;
        block* next;
        while(1) {
            next = cur->_next;
            next->recycle();
            if(next->_bits.usable_count() <= max_available)
                break;

            // compression possible?
            if(prev) {
                assert(prev != cur);
                assert(cur->_bits.usable_count() > max_available);
                assert(next->_bits.usable_count() > max_available);
                prev->_next = next;
                _pool->release_block(cur);
                cur = prev; // reset
            }

            // avoid the endless loop
            if(next == _tail) break;

            prev = cur;
            cur = next;
        }

        // recycle, repair the ring, and advance
        _tail = cur;
    }

    _hit_count = 0;
}

block_list::~block_list() {
    // don't free the fake block if we went unused!
    if(_tail == &_fake_block) return;

    // break the cycle so the loop terminates
    block* cur = _tail->_next;
    _tail->_next = 0;

    // release blocks until we hit the NULL
    while( (cur=_pool->release_block(cur)) ) ;
}

/**\endcond skip */
} // namespace memory_block
