00001 /* -*- mode:C++; c-basic-offset:4 -*- 00002 Shore-MT -- Multi-threaded port of the SHORE storage manager 00003 00004 Copyright (c) 2007-2009 00005 Data Intensive Applications and Systems Labaratory (DIAS) 00006 Ecole Polytechnique Federale de Lausanne 00007 00008 All Rights Reserved. 00009 00010 Permission to use, copy, modify and distribute this software and 00011 its documentation is hereby granted, provided that both the 00012 copyright notice and this permission notice appear in all copies of 00013 the software, derivative works or modified versions, and any 00014 portions thereof, and that both notices appear in supporting 00015 documentation. 00016 00017 This code is distributed in the hope that it will be useful, but 00018 WITHOUT ANY WARRANTY; without even the implied warranty of 00019 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. THE AUTHORS 00020 DISCLAIM ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER 00021 RESULTING FROM THE USE OF THIS SOFTWARE. 00022 */ 00023 00024 /*<std-header orig-src='shore' incl-file-exclusion='SM_S_H'> 00025 00026 $Id: lsn.h,v 1.4 2010/08/23 14:28:14 nhall Exp $ 00027 00028 SHORE -- Scalable Heterogeneous Object REpository 00029 00030 Copyright (c) 1994-99 Computer Sciences Department, University of 00031 Wisconsin -- Madison 00032 All Rights Reserved. 00033 00034 Permission to use, copy, modify and distribute this software and its 00035 documentation is hereby granted, provided that both the copyright 00036 notice and this permission notice appear in all copies of the 00037 software, derivative works or modified versions, and any portions 00038 thereof, and that both notices appear in supporting documentation. 00039 00040 THE AUTHORS AND THE COMPUTER SCIENCES DEPARTMENT OF THE UNIVERSITY 00041 OF WISCONSIN - MADISON ALLOW FREE USE OF THIS SOFTWARE IN ITS 00042 "AS IS" CONDITION, AND THEY DISCLAIM ANY LIABILITY OF ANY KIND 00043 FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 00044 00045 This software was developed with support by the Advanced Research 00046 Project Agency, ARPA order number 018 (formerly 8230), monitored by 00047 the U.S. Army Research Laboratory under contract DAAB07-91-C-Q518. 00048 Further funding for this work was provided by DARPA through 00049 Rome Research Laboratory Contract No. F30602-97-2-0247. 00050 00051 */ 00052 00053 #ifndef LSN_H 00054 #define LSN_H 00055 00056 #include "w_defines.h" 00057 00058 /* -- do not edit anything above this line -- </std-header>*/ 00059 00060 #include "w_base.h" 00061 00062 #define LSN_T 00063 /* FRJ: Major changes to lsn_t 00064 * Once the database runs long enough we will run out of 00065 * partition numbers (only 64k possible). 00066 * Fortunately, this is a log, so lsn_t don't last forever. 00067 * Eventually things become durable and the log partition 00068 * file gets reclaimed (deleted). 00069 * As long as the first partition is gone before 00070 * the last one fills, we can simply wrap and change the 00071 * sense of lsn_t comparisions. 00072 * as follows: 00073 * 00074 Suppose we use unsigned 8 bit partition numbers. If the current 00075 global lsn has pnum >= 128 (MSB set), any lsn we encounter with 00076 pnum < 128 (MSB clear) must be older: 00077 00078 0 1 ... 126 127 128 129 ... 254 255 00079 00080 00081 On the other hand, if the current global lsn has pnum < 128 00082 (MSB clear), any lsn we encounter with pnum >= 128 (MSB set) 00083 must be *older* (ie, the pnum recently wrapped from 255 back to 00084 0): 00085 00086 128 129 ... 254 255 0 1 ... 126 127 00087 00088 The first situation is easy: regular comparisons provide the 00089 ordering we want; The second is trickier due to the wrapping of 00090 partition numbers. Fortunately, there's an easy way around the 00091 problem! Since the MSB of the pnum is also the MSB of the 00092 64-bit value holding the lsn, if comparisons interpret the 00093 64-bit value as signed we get the proper ordering: 00094 00095 -128 -127 ... -2 -1 0 1 ... 126 127 00096 00097 We assume there are enough lsn_t that less than half are in use at 00098 a time. So, we divide the universe of lsn's into four regions, 00099 marked by the most significant two bits of the file. 00100 00101 00+01 - don't care 00102 01+10 - unsigned comparisons 00103 10+11 - unsigned comparisons 00104 11+00 - signed comparisons 00105 00106 For now, though, we'll just assume overflow doesn't happen ;) 00107 */ 00108 00109 typedef w_base_t::int8_t sm_diskaddr_t; 00110 00111 /**\addtogroup LSNS 00112 * 00113 * \section LLR Locates Log Records 00114 * A log sequence number generally points to a record in the log. 00115 * It consists of two parts: 00116 * - hi(), a.k.a., file(). This is a number that matches a log partition 00117 * file, e.g., "log.<file>" 00118 * - lo(), a.k.a., rba(). This is byte-offset into the log partition, and is 00119 * the first byte of a log record, or is the first 00120 * byte after the last log record in the file (where 00121 * the next log record could be written). 00122 * 00123 * \note Once the database runs long enough we will run out of 00124 * partition numbers (only 64k possible). 00125 * Fortunately, this is a log, so lsn_t don't last forever. 00126 * Eventually things become durable and the log partition 00127 * file gets reclaimed (deleted). 00128 * As long as the first partition is gone before 00129 * the last one fills, we can simply wrap and change the 00130 * sense of lsn_t comparisions. 00131 * 00132 * \subsection WORK Identifying Limit of Partial Rollback 00133 * 00134 * A savepoint (sm_save_point_t) is an lsn_t. It tells how far back 00135 * a transaction should undo its actions when ss_m::rollback_work is called. 00136 * 00137 * \section PTS Page Timestamps 00138 * Each page has an lsn_t that acts as a timestamp; it is the 00139 * lsn of the last log record that describes an update to the page. 00140 * 00141 * In recovery, when a page is read in, all log records with sequence 00142 * numbers less than the page lsn have been applied, and redo of these 00143 * log records is not necessary. 00144 * 00145 * The storage manager has other special cases: lsn_t(0,1) -- this is 00146 * in page.cpp, page_p::_format(), and indicates a freshly formatted page 00147 * with no further updates. 00148 * \subsection NPCD Nominal Page Corruption-Detection 00149 * Pages have two copies of their page lsn; one at the head and one at the 00150 * end of the page. Presumably if the two match, the page is uncorrupted, 00151 * though that is no guarantee. Certainly if they do not match, 00152 * something is wrong. 00153 * 00154 * \section BPFS Buffer Pool Frame Status 00155 * The buffer pool's control blocks (bfcb_t) contain an lsn_t, 00156 * the "recovery lsn" or rec_lsn. This is a timestamp that can 00157 * be compared with the page lsns to determine if the copy in the 00158 * buffer pool is up-to-date or not. 00159 * 00160 * A recovery lsn is a lower bound on the lsn of a log record 00161 * that updated the page in the frame. 00162 * A clean page in the buffer pool has a rec_lsn of lsn_t::null. 00163 * Each time a page is fixed in EX mode, the buffer control block 00164 * ensures that the rec_lsn is not lsn_t::null, thereby indicating that 00165 * this page is probably dirty and needs flushing or, possibly, 00166 * is being flushed. 00167 * The rec_lsn is set to the tail of the log at the time the fix is done; this 00168 * ensures that any log record written for an update to the page has at 00169 * least the rec_lsn sequence number. There might be several updates to 00170 * the page before the page is cleaned, so the rec_lsn is indeed a lower 00171 * bound, and that's all we can know about it. 00172 * 00173 * Special cases of log sequence numbers: 00174 * - null: not a valid lsn_t 00175 * - max: soon to overflow 00176 * - lsn(0,1) : used when some pages are formatted. 00177 * 00178 */ 00179 00180 /*\bug GNATS 136: TODO make thread-safe for 32-bit platform */ 00181 /**\brief Log Sequence Number. See \ref LSNS. 00182 * 00183 * \ingroup LSNS 00184 * \details 00185 * 00186 * A log sequence number points to a record in the log. 00187 * It consists of two parts: 00188 * - hi(), a.k.a., file(). This is a number that matches a log partition 00189 * file, e.g., "log.<file>" 00190 * - lo(), a.k.a., rba(). This is byte-offset into the log partition, and is 00191 * the first byte of a log record, or is the first 00192 * byte after the last log record in the file (where 00193 * the next log record could be written). 00194 * 00195 * All state is stored in a single 64-bit value. 00196 * This reading or setting is atomic on 00197 * 64-bit platforms (though updates still need protection). 00198 * \warning This is NOT atomic on 32-bit platforms. 00199 * 00200 * Because all state fits in 64 bits, 00201 * there is a trade-off between maximum supported log partition size 00202 * and number of partitions. Two reasonable choices are: 00203 * 00204 * - 16-bit partition numbers, up to 256TB per partition 00205 * - 32-bit partition numbers, up to 4GB per partition 00206 * 00207 * 48-bit offsets are larger, but (slightly) more expensive and likely 00208 * to wrap sooner. 00209 * 32-bit offsets are still pretty big, and the chance of wrapping 00210 * is *much* smaller (though a production system could theoretically 00211 * hit the limit, since the count persists as long as the database 00212 * exists. 00213 * For now we go with the 32-32 split. 00214 * 00215 * lsn_t no longer cares whether the disk can handle the full range 00216 * it supports. 00217 * If you support 48-bit partition sizes and the disk can 00218 * only handle 32-bit offsets, the largest file will just happen to be 00219 * smaller than lsn_t technically supports. 00220 * 00221 * lsn_t does not cater to unaligned accesses. 00222 * Log writes, in particular, 00223 * are expected to be 8-byte aligned. 00224 * The extra wasted bytes just aren't worth the performance hit of allowing 00225 * misalignment. 00226 * 00227 * \note Once the database runs long enough we will run out of 00228 * partition numbers (only 64k possible). 00229 * Fortunately, this is a log, so lsn_t don't last forever. 00230 * Eventually things become durable and the log partition 00231 * file gets reclaimed (deleted). 00232 * As long as the first partition is gone before 00233 * the last one fills, we can simply wrap and change the 00234 * sense of lsn_t comparisions. 00235 * 00236 */ 00237 class lsn_t { 00238 enum { file_hwm = 0xffff }; 00239 public: 00240 enum { PARTITION_BITS=16 }; 00241 enum { PARTITION_SHIFT=(64-PARTITION_BITS) }; 00242 00243 static w_base_t::uint8_t mask() { 00244 static w_base_t::uint8_t const ONE = 1; 00245 return (ONE << PARTITION_SHIFT)-1; 00246 } 00247 00248 lsn_t() : _data(0) { } 00249 00250 lsn_t(w_base_t::uint4_t f, sm_diskaddr_t r) : 00251 _data(from_file(f) | from_rba(r)) { } 00252 00253 // copy operator 00254 lsn_t(const lsn_t & other) : _data(other._data) { } 00255 00256 bool valid() const { 00257 // valid is essentially iff file != 0 00258 #if W_DEBUG_LEVEL > 1 00259 w_base_t::uint8_t copy_of_data = _data; 00260 w_base_t::uint8_t m = mask(); 00261 bool first = copy_of_data > m; 00262 w_base_t::uint8_t f = 00263 to_file(copy_of_data); 00264 bool second = (f != 0); 00265 w_assert2(first == second); 00266 #endif 00267 return (_data > mask()); 00268 } 00269 00270 w_base_t::uint4_t hi() const { return file(); } 00271 w_base_t::uint4_t file() const { return to_file(_data); } 00272 00273 sm_diskaddr_t lo() const { return rba(); } 00274 sm_diskaddr_t rba() const { return to_rba(_data); } 00275 00276 // WARNING: non-atomic read-modify-write operations! 00277 void copy_rba(const lsn_t &other) { 00278 _data = get_file(_data) | get_rba(other._data); } 00279 void set_rba(sm_diskaddr_t &other) { 00280 _data = get_file(_data) | get_rba(other); } 00281 00282 // WARNING: non-atomic read-modify-write operations! 00283 lsn_t& advance(int amt) { _data += amt; return *this; } 00284 00285 lsn_t &operator+=(long delta) { return advance(delta); } 00286 lsn_t operator+(long delta) const { return lsn_t(*this).advance(delta); } 00287 00288 bool operator>(const lsn_t& l) const { return l < *this; } 00289 bool operator<(const lsn_t& l) const { return _data < l._data; } 00290 bool operator>=(const lsn_t& l) const { return !(*this < l); } 00291 bool operator<=(const lsn_t& l) const { return !(*this > l); } 00292 bool operator==(const lsn_t& l) const { return _data == l._data; } 00293 bool operator!=(const lsn_t& l) const { return !(*this == l); } 00294 00295 00296 /* 00297 * This is the SM's idea of on-disk and in-structure 00298 * things that reflect the size of a "disk". It 00299 * is different from fileoff_t because the two are 00300 * orthogonal. A system may be constructed that 00301 * has big addresses, but is only running on 00302 * a "small file" environment. 00303 */ 00304 static const int sm_diskaddr_max; 00305 00306 static const lsn_t null; 00307 static const lsn_t max; 00308 00309 private: 00310 w_base_t::uint8_t _data; 00311 00312 static w_base_t::uint4_t to_file(w_base_t::uint8_t f) { 00313 return (w_base_t::uint4_t) (f >> PARTITION_SHIFT); } 00314 00315 static w_base_t::uint8_t get_file(w_base_t::uint8_t data) { 00316 return data &~ mask(); } 00317 00318 static w_base_t::uint8_t from_file(w_base_t::uint4_t data) { 00319 return ((w_base_t::uint8_t) data) << PARTITION_SHIFT; } 00320 00321 static sm_diskaddr_t to_rba(w_base_t::uint8_t r) { 00322 return (sm_diskaddr_t) (r & mask()); } 00323 00324 static w_base_t::uint8_t get_rba(w_base_t::uint8_t data) { 00325 return to_rba(data); } 00326 00327 static w_base_t::uint8_t from_rba(sm_diskaddr_t data) { 00328 return to_rba(data); } 00329 }; 00330 #endif