lsn.h

00001 /* -*- mode:C++; c-basic-offset:4 -*-
00002      Shore-MT -- Multi-threaded port of the SHORE storage manager
00003    
00004                        Copyright (c) 2007-2009
00005       Data Intensive Applications and Systems Labaratory (DIAS)
00006                Ecole Polytechnique Federale de Lausanne
00007    
00008                          All Rights Reserved.
00009    
00010    Permission to use, copy, modify and distribute this software and
00011    its documentation is hereby granted, provided that both the
00012    copyright notice and this permission notice appear in all copies of
00013    the software, derivative works or modified versions, and any
00014    portions thereof, and that both notices appear in supporting
00015    documentation.
00016    
00017    This code is distributed in the hope that it will be useful, but
00018    WITHOUT ANY WARRANTY; without even the implied warranty of
00019    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. THE AUTHORS
00020    DISCLAIM ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER
00021    RESULTING FROM THE USE OF THIS SOFTWARE.
00022 */
00023 
00024 /*<std-header orig-src='shore' incl-file-exclusion='SM_S_H'>
00025 
00026  $Id: lsn.h,v 1.4 2010/08/23 14:28:14 nhall Exp $
00027 
00028 SHORE -- Scalable Heterogeneous Object REpository
00029 
00030 Copyright (c) 1994-99 Computer Sciences Department, University of
00031                       Wisconsin -- Madison
00032 All Rights Reserved.
00033 
00034 Permission to use, copy, modify and distribute this software and its
00035 documentation is hereby granted, provided that both the copyright
00036 notice and this permission notice appear in all copies of the
00037 software, derivative works or modified versions, and any portions
00038 thereof, and that both notices appear in supporting documentation.
00039 
00040 THE AUTHORS AND THE COMPUTER SCIENCES DEPARTMENT OF THE UNIVERSITY
00041 OF WISCONSIN - MADISON ALLOW FREE USE OF THIS SOFTWARE IN ITS
00042 "AS IS" CONDITION, AND THEY DISCLAIM ANY LIABILITY OF ANY KIND
00043 FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
00044 
00045 This software was developed with support by the Advanced Research
00046 Project Agency, ARPA order number 018 (formerly 8230), monitored by
00047 the U.S. Army Research Laboratory under contract DAAB07-91-C-Q518.
00048 Further funding for this work was provided by DARPA through
00049 Rome Research Laboratory Contract No. F30602-97-2-0247.
00050 
00051 */
00052 
00053 #ifndef LSN_H
00054 #define LSN_H
00055 
00056 #include "w_defines.h"
00057 
00058 /*  -- do not edit anything above this line --   </std-header>*/
00059 
00060 #include "w_base.h"
00061 
00062 #define LSN_T
00063 /* FRJ: Major changes to lsn_t
00064  * Once the database runs long enough we will run out of
00065  * partition numbers (only 64k possible). 
00066  * Fortunately, this is a log, so lsn_t don't last forever. 
00067  * Eventually things become durable and the log partition 
00068  * file gets reclaimed (deleted). 
00069  * As long as the first partition is gone before 
00070  * the last one fills, we can simply wrap and change the 
00071  * sense of lsn_t comparisions.
00072  * as follows:
00073  *
00074    Suppose we use unsigned 8 bit partition numbers. If the current
00075    global lsn has pnum >= 128 (MSB set), any lsn we encounter with
00076    pnum < 128 (MSB clear) must be older:
00077                                                                                
00078    0        1        ...        126        127        128        129        ...         254        255
00079        
00080 
00081    On the other hand, if the current global lsn has pnum < 128
00082    (MSB clear), any lsn we encounter with pnum >= 128 (MSB set)
00083    must be *older* (ie, the pnum recently wrapped from 255 back to
00084    0):
00085 
00086    128        129        ...         254        255        0        1        ...        126        127
00087 
00088    The first situation is easy: regular comparisons provide the
00089    ordering we want; The second is trickier due to the wrapping of
00090    partition numbers. Fortunately, there's an easy way around the
00091    problem!  Since the MSB of the pnum is also the MSB of the
00092    64-bit value holding the lsn, if comparisons interpret the
00093    64-bit value as signed we get the proper ordering:
00094 
00095    -128        -127        ...        -2        -1        0        1        ...        126        127
00096 
00097    We assume there are enough lsn_t that less than half are in use at
00098    a time. So, we divide the universe of lsn's into four regions,
00099    marked by the most significant two bits of the file.
00100 
00101    00+01 - don't care
00102    01+10 - unsigned comparisons
00103    10+11 - unsigned comparisons
00104    11+00 - signed comparisons
00105 
00106    For now, though, we'll just assume overflow doesn't happen ;)
00107 */
00108 
00109 typedef w_base_t::int8_t sm_diskaddr_t;
00110 
00111 /**\addtogroup LSNS 
00112  *
00113  * \section LLR Locates Log Records 
00114  * A log sequence number generally points to a record in the log.
00115  * It consists of two parts:
00116  * - hi(), a.k.a., file(). This is a number that matches a log partition
00117  *                         file, e.g., "log.<file>"
00118  * - lo(), a.k.a., rba(). This is byte-offset into the log partition, and is
00119  *                        the first byte of a log record, or is the first
00120  *                        byte after the last log record in the file (where
00121  *                        the next log record could be written).
00122  *
00123  * \note Once the database runs long enough we will run out of
00124  * partition numbers (only 64k possible). 
00125  * Fortunately, this is a log, so lsn_t don't last forever. 
00126  * Eventually things become durable and the log partition 
00127  * file gets reclaimed (deleted). 
00128  * As long as the first partition is gone before 
00129  * the last one fills, we can simply wrap and change the 
00130  * sense of lsn_t comparisions.
00131  *
00132  * \subsection WORK Identifying Limit of Partial Rollback
00133  *
00134  * A savepoint (sm_save_point_t) is an lsn_t. It tells how far back
00135  * a transaction should undo its actions when ss_m::rollback_work is called.
00136  *
00137  * \section PTS Page Timestamps
00138  * Each page has an lsn_t that acts as a timestamp; it is the
00139  * lsn of the last log record that describes an update to the page.
00140  *
00141  * In recovery, when a page is read in, all log records with sequence
00142  * numbers less than the page lsn have been applied, and redo of these
00143  * log records is not necessary.
00144  *
00145  * The storage manager has other special cases: lsn_t(0,1) -- this is
00146  * in page.cpp, page_p::_format(), and indicates a freshly formatted page
00147  * with no further updates.
00148  * \subsection NPCD Nominal Page Corruption-Detection
00149  * Pages have two copies of their page lsn; one at the head and one at the
00150  * end of the page. Presumably if the two match, the page is uncorrupted,
00151  * though that is no guarantee.  Certainly if they do not match, 
00152  * something is wrong.
00153  *
00154  * \section BPFS Buffer Pool Frame Status 
00155  * The buffer pool's control blocks (bfcb_t) contain an lsn_t,
00156  * the "recovery lsn" or rec_lsn.  This is a timestamp that can
00157  * be compared with the page lsns to determine if the copy in the
00158  * buffer pool is up-to-date or not.
00159  *
00160  * A recovery lsn is a lower bound on the lsn of a log record
00161  * that updated the page in the frame. 
00162  * A clean page in the buffer pool has a rec_lsn of lsn_t::null.
00163  * Each time a page is fixed in EX mode, the buffer control block 
00164  * ensures that the rec_lsn is not lsn_t::null, thereby indicating that
00165  * this page is probably dirty and needs flushing or, possibly, 
00166  * is being flushed. 
00167  * The rec_lsn is set to the tail of the log at the time the fix is done; this
00168  * ensures that any log record written for an update to the page has at
00169  * least the rec_lsn sequence number. There might be several updates to
00170  * the page before the page is cleaned, so the rec_lsn is indeed a lower
00171  * bound, and that's all we can know about it.
00172  *
00173  * Special cases of log sequence numbers:
00174  * - null: not a valid lsn_t
00175  * - max:  soon to overflow
00176  * - lsn(0,1) : used when some pages are formatted.
00177  *
00178  */
00179 
00180 /*\bug GNATS 136: TODO make thread-safe for 32-bit platform */
00181 /**\brief Log Sequence Number. See \ref LSNS.
00182  *
00183  * \ingroup LSNS
00184  * \details
00185  *
00186  * A log sequence number points to a record in the log.
00187  * It consists of two parts:
00188  * - hi(), a.k.a., file(). This is a number that matches a log partition
00189  *                         file, e.g., "log.<file>"
00190  * - lo(), a.k.a., rba(). This is byte-offset into the log partition, and is
00191  *                        the first byte of a log record, or is the first
00192  *                        byte after the last log record in the file (where
00193  *                        the next log record could be written).
00194  *
00195  * All state is  stored in a single 64-bit value. 
00196  * This reading or setting is atomic on 
00197  * 64-bit platforms (though updates still need protection). 
00198  * \warning This is NOT atomic on 32-bit platforms.
00199  *
00200  * Because all state fits in 64 bits, 
00201  * there is a trade-off between maximum supported log partition size 
00202  * and number of partitions. Two reasonable choices are:
00203  *
00204  * - 16-bit partition numbers, up to 256TB per partition
00205  * - 32-bit partition numbers, up to 4GB per partition
00206  *
00207  * 48-bit offsets are larger, but (slightly) more expensive and likely 
00208  * to wrap sooner. 
00209  * 32-bit offsets are still pretty big, and the chance of wrapping 
00210  * is *much* smaller (though a production system could theoretically 
00211  * hit the limit, since the count persists as long as the database 
00212  * exists. 
00213  * For now we go with the 32-32 split.
00214  *
00215  * lsn_t no longer cares whether the disk can handle the full range
00216  * it supports. 
00217  * If you support 48-bit partition sizes and the disk can
00218  * only handle 32-bit offsets, the largest file will just happen to be
00219  * smaller than lsn_t technically supports.
00220  *
00221  * lsn_t does not cater to unaligned accesses. 
00222  * Log writes, in particular, 
00223  * are expected to be 8-byte aligned. 
00224  * The extra wasted bytes just aren't worth the performance hit of allowing
00225  * misalignment.
00226  *
00227  * \note Once the database runs long enough we will run out of
00228  * partition numbers (only 64k possible). 
00229  * Fortunately, this is a log, so lsn_t don't last forever. 
00230  * Eventually things become durable and the log partition 
00231  * file gets reclaimed (deleted). 
00232  * As long as the first partition is gone before 
00233  * the last one fills, we can simply wrap and change the 
00234  * sense of lsn_t comparisions.
00235  *
00236  */
00237 class lsn_t {
00238     enum { file_hwm  =    0xffff }; 
00239 public:
00240     enum { PARTITION_BITS=16 };
00241     enum { PARTITION_SHIFT=(64-PARTITION_BITS) };
00242 
00243     static w_base_t::uint8_t mask() {
00244         static w_base_t::uint8_t const ONE = 1;
00245         return (ONE << PARTITION_SHIFT)-1;
00246     }
00247     
00248     lsn_t() : _data(0) { }
00249 
00250     lsn_t(w_base_t::uint4_t f, sm_diskaddr_t r) : 
00251                 _data(from_file(f) | from_rba(r)) { }
00252 
00253     // copy operator
00254     lsn_t(const lsn_t & other) : _data(other._data) { }
00255 
00256     bool valid()             const { 
00257                                     // valid is essentially iff file != 0
00258 #if W_DEBUG_LEVEL > 1
00259                                     w_base_t::uint8_t  copy_of_data =  _data;
00260                                     w_base_t::uint8_t  m =  mask();
00261                                     bool first = copy_of_data > m;
00262                                     w_base_t::uint8_t  f = 
00263                                                     to_file(copy_of_data);
00264                                     bool second = (f != 0);
00265                                     w_assert2(first == second);
00266 #endif
00267                                    return (_data > mask()); 
00268                             }
00269 
00270     w_base_t::uint4_t hi()   const  { return file(); }
00271     w_base_t::uint4_t file() const { return to_file(_data); }
00272 
00273     sm_diskaddr_t     lo()   const  { return rba(); }
00274     sm_diskaddr_t     rba()  const { return to_rba(_data); }
00275 
00276     // WARNING: non-atomic read-modify-write operations!
00277     void copy_rba(const lsn_t &other) { 
00278                 _data = get_file(_data) | get_rba(other._data); }
00279     void set_rba(sm_diskaddr_t &other) { 
00280                 _data = get_file(_data) | get_rba(other); }
00281     
00282     // WARNING: non-atomic read-modify-write operations!
00283     lsn_t& advance(int amt) { _data += amt; return *this; }
00284 
00285     lsn_t &operator+=(long delta) { return advance(delta); }
00286     lsn_t operator+(long delta) const { return lsn_t(*this).advance(delta); }
00287 
00288     bool operator>(const lsn_t& l) const { return l < *this; }
00289     bool operator<(const lsn_t& l) const { return _data < l._data; }
00290     bool operator>=(const lsn_t& l) const { return !(*this < l); }
00291     bool operator<=(const lsn_t& l) const { return !(*this > l); }
00292     bool operator==(const lsn_t& l) const { return _data == l._data; }
00293     bool operator!=(const lsn_t& l) const { return !(*this == l); }
00294 
00295 
00296 /*
00297  * This is the SM's idea of on-disk and in-structure 
00298  * things that reflect the size of a "disk".  It
00299  * is different from fileoff_t because the two are
00300  * orthogonal.  A system may be constructed that
00301  * has big addresses, but is only running on 
00302  * a "small file" environment.
00303  */
00304     static const int sm_diskaddr_max;
00305 
00306     static const lsn_t null;
00307     static const lsn_t max;
00308     
00309 private:
00310     w_base_t::uint8_t        _data;
00311 
00312     static w_base_t::uint4_t to_file(w_base_t::uint8_t f) { 
00313                 return (w_base_t::uint4_t) (f >> PARTITION_SHIFT); }
00314 
00315     static w_base_t::uint8_t get_file(w_base_t::uint8_t data) { 
00316                 return data &~ mask(); }
00317 
00318     static w_base_t::uint8_t from_file(w_base_t::uint4_t data) { 
00319                 return ((w_base_t::uint8_t) data) << PARTITION_SHIFT; }
00320     
00321     static sm_diskaddr_t to_rba(w_base_t::uint8_t r) { 
00322                 return (sm_diskaddr_t) (r & mask()); }
00323 
00324     static w_base_t::uint8_t get_rba(w_base_t::uint8_t data) { 
00325                 return to_rba(data); }
00326 
00327     static w_base_t::uint8_t from_rba(sm_diskaddr_t data) { 
00328                 return to_rba(data); }
00329 };
00330 #endif

Generated on Mon Nov 8 11:12:36 2010 for Shore Storage Manager by  doxygen 1.4.7