00001 /* -*- mode:C++; c-basic-offset:4 -*- 00002 Shore-MT -- Multi-threaded port of the SHORE storage manager 00003 00004 Copyright (c) 2007-2009 00005 Data Intensive Applications and Systems Labaratory (DIAS) 00006 Ecole Polytechnique Federale de Lausanne 00007 00008 All Rights Reserved. 00009 00010 Permission to use, copy, modify and distribute this software and 00011 its documentation is hereby granted, provided that both the 00012 copyright notice and this permission notice appear in all copies of 00013 the software, derivative works or modified versions, and any 00014 portions thereof, and that both notices appear in supporting 00015 documentation. 00016 00017 This code is distributed in the hope that it will be useful, but 00018 WITHOUT ANY WARRANTY; without even the implied warranty of 00019 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. THE AUTHORS 00020 DISCLAIM ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER 00021 RESULTING FROM THE USE OF THIS SOFTWARE. 00022 */ 00023 00024 /*<std-header orig-src='shore' incl-file-exclusion='SM_BASE_H'> 00025 00026 $Id: sm_base.h,v 1.157 2010/10/27 17:04:23 nhall Exp $ 00027 00028 SHORE -- Scalable Heterogeneous Object REpository 00029 00030 Copyright (c) 1994-99 Computer Sciences Department, University of 00031 Wisconsin -- Madison 00032 All Rights Reserved. 00033 00034 Permission to use, copy, modify and distribute this software and its 00035 documentation is hereby granted, provided that both the copyright 00036 notice and this permission notice appear in all copies of the 00037 software, derivative works or modified versions, and any portions 00038 thereof, and that both notices appear in supporting documentation. 00039 00040 THE AUTHORS AND THE COMPUTER SCIENCES DEPARTMENT OF THE UNIVERSITY 00041 OF WISCONSIN - MADISON ALLOW FREE USE OF THIS SOFTWARE IN ITS 00042 "AS IS" CONDITION, AND THEY DISCLAIM ANY LIABILITY OF ANY KIND 00043 FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 00044 00045 This software was developed with support by the Advanced Research 00046 Project Agency, ARPA order number 018 (formerly 8230), monitored by 00047 the U.S. Army Research Laboratory under contract DAAB07-91-C-Q518. 00048 Further funding for this work was provided by DARPA through 00049 Rome Research Laboratory Contract No. F30602-97-2-0247. 00050 00051 */ 00052 00053 #ifndef SM_BASE_H 00054 #define SM_BASE_H 00055 00056 #include "w_defines.h" 00057 00058 /* -- do not edit anything above this line -- </std-header>*/ 00059 00060 /**\file sm_base.h 00061 * \ingroup Macros 00062 */ 00063 00064 #ifdef __GNUG__ 00065 #pragma interface 00066 #endif 00067 00068 #include <climits> 00069 #ifndef OPTION_H 00070 #include "option.h" 00071 #endif 00072 #ifndef __opt_error_def_gen_h__ 00073 #include "opt_error_def_gen.h" 00074 #endif 00075 00076 00077 class ErrLog; 00078 class sm_stats_info_t; 00079 class xct_t; 00080 class xct_i; 00081 00082 class device_m; 00083 class io_m; 00084 class bf_m; 00085 class comm_m; 00086 class log_m; 00087 class lock_m; 00088 00089 class tid_t; 00090 class option_t; 00091 00092 #ifndef SM_EXTENTSIZE 00093 #define SM_EXTENTSIZE 8 00094 #endif 00095 #ifndef SM_LOG_PARTITIONS 00096 #define SM_LOG_PARTITIONS 8 00097 #endif 00098 00099 typedef w_rc_t rc_t; 00100 00101 00102 /**\cond skip 00103 * This structure collects the depth on construction 00104 * and checks that it matches the depth on destruction; this 00105 * is to ensure that we haven't forgotten to release 00106 * an anchor somewhere. 00107 * It's been extended to check the # times 00108 * we have acquired the 1thread_log_mutex. 00109 * 00110 * We're defining the CHECK_NESTING_VARIABLES macro b/c 00111 * this work is spread out and we want to have 1 place to 00112 * determine whether it's turned on or off; don't want to 00113 * make the mistake of changing the debug level (on which 00114 * it depends) in only one of several places. 00115 * 00116 * NOTE: this doesn't work in a multi-threaded xct context. 00117 * That's b/c the check is too late -- once the count goes 00118 * to zero, another thread can change it and throw off all the 00119 * counts. To be sure, we'd have to use a TLS copy as well 00120 * as the common copy of these counts. 00121 */ 00122 #if W_DEBUG_LEVEL > 0 00123 #define CHECK_NESTING_VARIABLES 1 00124 #else 00125 #define CHECK_NESTING_VARIABLES 0 00126 #endif 00127 struct check_compensated_op_nesting { 00128 #if CHECK_NESTING_VARIABLES 00129 xct_t* _xd; 00130 int _depth; 00131 int _depth_of_acquires; 00132 int _line; 00133 const char *const _file; 00134 // static methods are so we can avoid having to 00135 // include xct.h here. 00136 static int compensated_op_depth(xct_t* xd, int dflt); 00137 static int acquire_1thread_log_depth(xct_t* xd, int dflt); 00138 00139 check_compensated_op_nesting(xct_t* xd, int line, const char *const file) 00140 : _xd(xd), 00141 _depth(_xd? compensated_op_depth(_xd, 0) : 0), 00142 _depth_of_acquires(_xd? acquire_1thread_log_depth(_xd, 0) : 0), 00143 _line(line), 00144 _file(file) 00145 { 00146 } 00147 00148 ~check_compensated_op_nesting() { 00149 if(_xd) { 00150 if( _depth != compensated_op_depth(_xd, _depth) ) { 00151 fprintf(stderr, 00152 "th.%d check_compensated_op_nesting(%d,%s) depth was %d is %d\n", 00153 sthread_t::me()->id, 00154 _line, _file, _depth, compensated_op_depth(_xd, _depth)); 00155 } 00156 00157 if(_depth_of_acquires != acquire_1thread_log_depth(_xd, _depth)) { 00158 fprintf(stderr, 00159 "th.%d check_acquire_1thread_log_depth (%d,%s) depth was %d is %d\n", 00160 sthread_t::me()->id, 00161 _line, _file, _depth_of_acquires, 00162 acquire_1thread_log_depth(_xd, _depth)); 00163 } 00164 00165 w_assert0(_depth == compensated_op_depth(_xd, _depth)); 00166 w_assert0(_depth_of_acquires == acquire_1thread_log_depth(_xd, _depth)); 00167 } 00168 } 00169 #else 00170 check_compensated_op_nesting(xct_t*, int, const char *const) { } 00171 #endif 00172 }; 00173 00174 00175 /**\brief Encapsulates a few types uses in the API */ 00176 class smlevel_0 : public w_base_t { 00177 public: 00178 // Give these enums names for doxygen purposes: 00179 enum error_constant_t { eNOERROR = 0, eFAILURE = -1 }; 00180 enum sm_constant_t { 00181 page_sz = SM_PAGESIZE, // page size (SM_PAGESIZE is set by makemake) 00182 ext_sz = SM_EXTENTSIZE, // extent size 00183 max_exts = max_int4, // max no. extents, must fit extnum_t 00184 #if defined(_POSIX_PATH_MAX) 00185 max_devname = _POSIX_PATH_MAX, // max length of unix path name 00186 // BEWARE: this might be larger than you want. Array sizes depend on it. 00187 // The default might be small enough, e.g., 256; getconf() yields the upper 00188 // bound on this value. 00189 #elif defined(MAXPATHLEN) 00190 max_devname = MAXPATHLEN, 00191 #else 00192 max_devname = 1024, 00193 #endif 00194 max_vols = 20, // max mounted volumes 00195 max_xct_thread = 20, // max threads in a xct 00196 max_servers = 15, // max servers to be connected with 00197 max_keycomp = 20, // max key component (for btree) 00198 max_openlog = SM_LOG_PARTITIONS, // max # log partitions 00199 max_dir_cache = max_vols * 10, 00200 00201 /* XXX I want to propogate sthread_t::iovec_max here, but 00202 it doesn't work because of sm_app.h not including 00203 the thread package. */ 00204 max_many_pages = 8, 00205 00206 srvid_map_sz = (max_servers - 1) / 8 + 1, 00207 ext_map_sz_in_bytes = ((ext_sz + 7) / 8), 00208 00209 dummy = 0 00210 }; 00211 00212 enum { 00213 max_rec_len = max_uint4 00214 }; 00215 00216 typedef sthread_base_t::fileoff_t fileoff_t; 00217 /* 00218 * Sizes-in-Kbytes for for things like volumes and devices. 00219 * A KB is assumes to be 1024 bytes. 00220 * Note: a different type was used for added type checking. 00221 */ 00222 typedef sthread_t::fileoff_t smksize_t; 00223 typedef w_base_t::base_stat_t base_stat_t; 00224 00225 /**\endcond skip */ 00226 00227 /* 00228 * rather than automatically aborting the transaction, when the 00229 * _log_warn_percent is exceeded, this callback is made, with a 00230 * pointer to the xct that did the writing, and with the 00231 * expectation that the result will be one of: 00232 * - return value == RCOK --> proceed 00233 * - return value == eUSERABORT --> victim to abort is given in the argument 00234 * 00235 * The server has the responsibility for choosing a victim and 00236 * for aborting the victim transaction. 00237 * 00238 */ 00239 00240 /**\brief Log space warning callback function type. 00241 * 00242 * For more details of how this is used, see the constructor ss_m::ss_m(). 00243 * 00244 * Storage manager methods check the available log space. 00245 * If the log is in danger of filling to the point that it will be 00246 * impossible to abort a transaction, a 00247 * callback is made to the server. The callback function is of this type. 00248 * The danger point is a threshold determined by the option sm_log_warn. 00249 * 00250 * The callback 00251 * function is meant to choose a victim xct and 00252 * tell if the xct should be 00253 * aborted by returning RC(eUSERABORT). 00254 * 00255 * Any other RC value is returned to the server through the call stack. 00256 * 00257 * The arguments: 00258 * @param[in] iter Pointer to an iterator over all xcts. 00259 * @param[out] victim Victim will be returned here. This is an in/out 00260 * paramter and is initially populated with the transaction that is 00261 * attached to the running thread. 00262 * @param[in] curr Bytes of log consumed by active transactions. 00263 * @param[in] thresh Threshhold just exceeded. 00264 * @param[in] logfile Character string name of oldest file to archive. 00265 * 00266 * This function must be careful not to return the same victim more 00267 * than once, even though the callback may be called many 00268 * times before the victim is completely aborted. 00269 * 00270 * When this function has archived the given log file, it needs 00271 * to notify the storage manager of that fact by calling 00272 * ss_m::log_file_was_archived(logfile) 00273 */ 00274 typedef w_rc_t (*LOG_WARN_CALLBACK_FUNC) ( 00275 xct_i* iter, 00276 xct_t *& victim, 00277 fileoff_t curr, 00278 fileoff_t thresh, 00279 const char *logfile 00280 ); 00281 /**\brief Callback function type for restoring an archived log file. 00282 * 00283 * @param[in] fname Original file name (with path). 00284 * @param[in] needed Partition number of the file needed. 00285 * 00286 * An alternative to aborting a transaction (when the log fills) 00287 * is to archive log files. 00288 * The server can use the log directory name to locate these files, 00289 * and may use the iterator and the static methods of xct_t to 00290 * determine which log file(s) to archive. 00291 * 00292 * Archiving and removing the older log files will work only if 00293 * the server also provides a LOG_ARCHIVED_CALLBACK_FUNCTION 00294 * to restore the 00295 * archived log files when the storage manager needs them for 00296 * rollback. 00297 * This is the function type used for that purpose. 00298 * 00299 * The function must locate the archived log file containing for the 00300 * partition number \a num, which was a suffix of the original log file's 00301 * name. 00302 * The log file must be restored with its original name. 00303 */ 00304 typedef w_base_t::uint4_t partition_number_t; 00305 typedef w_rc_t (*LOG_ARCHIVED_CALLBACK_FUNC) ( 00306 const char *fname, 00307 partition_number_t num 00308 ); 00309 00310 /**\cond skip */ 00311 enum switch_t { 00312 ON = 1, 00313 OFF = 0 00314 }; 00315 /**\endcond skip */ 00316 00317 /**\brief Comparison types used in scan_index_i 00318 * \enum cmp_t 00319 * Shorthand for CompareOp. 00320 */ 00321 enum cmp_t { bad_cmp_t=badOp, eq=eqOp, 00322 gt=gtOp, ge=geOp, lt=ltOp, le=leOp }; 00323 00324 00325 /* used by lock escalation routines */ 00326 enum escalation_options { 00327 dontEscalate = max_int4_minus1, 00328 dontEscalateDontPassOn, 00329 dontModifyThreshold = -1 00330 }; 00331 00332 /**\brief Types of stores. 00333 * \enum store_t 00334 */ 00335 enum store_t { 00336 t_bad_store_t, 00337 /// a b-tree or r-tree index 00338 t_index, 00339 /// a file of records 00340 t_file, 00341 /// t_lgrec is used for storing large record pages 00342 /// and is always associated with some t_file store 00343 t_lgrec 00344 }; 00345 00346 // types of indexes 00347 00348 /**\brief Index types */ 00349 enum ndx_t { 00350 t_bad_ndx_t, // illegal value 00351 t_btree, // B+tree with duplicates 00352 t_uni_btree, // Unique-key btree 00353 t_rtree // R*tree 00354 }; 00355 00356 /**\enum concurrency_t 00357 * \brief 00358 * Lock granularities 00359 * \details 00360 * - t_cc_bad Illegal 00361 * - t_cc_none No locking 00362 * - t_cc_record Record-level locking for files & records 00363 * - t_cc_page Page-level locking for files & records 00364 * - t_cc_file File-level locking for files & records 00365 * - t_cc_vol Volume-level locking for files and indexes 00366 * - t_cc_kvl Key-value locking for B+-Tree indexes 00367 * - t_cc_im Aries IM locking for B+-Tree indexes : experimental 00368 * - t_cc_modkvl Modified key-value locking: experimental 00369 * - t_cc_append Used internally \todo true? 00370 */ 00371 enum concurrency_t { 00372 t_cc_bad, // this is an illegal value 00373 t_cc_none, // no locking 00374 t_cc_record, // record-level 00375 t_cc_page, // page-level 00376 t_cc_file, // file-level 00377 t_cc_vol, 00378 t_cc_kvl, // key-value 00379 t_cc_im, // ARIES IM, not supported yet 00380 t_cc_modkvl, // modified ARIES KVL, for paradise use 00381 t_cc_append // append-only with scan_file_i 00382 }; 00383 00384 /**\enum pg_policy_t 00385 * \brief 00386 * File-compaction policy for creating records. 00387 * \details 00388 * - t_append : append new record to file (preserve order) 00389 * - t_cache : look in cache for pages with space for new record (does 00390 * not preserve order) 00391 * - t_compact: keep file compact even if it means searching the file 00392 * for space in which to create the file (does not preserve 00393 * order) 00394 * 00395 * These are masks - the following combinations are sensible: 00396 * 00397 * - t_append -- preserve sort order 00398 * - t_cache | t_append -- check the cache first, 00399 * append if no luck 00400 * - t_cache | t_compact | t_append -- append to file as a last resort 00401 */ 00402 enum pg_policy_t { 00403 t_append = 0x01, // retain sort order (cache 0 pages) 00404 t_cache = 0x02, // look in n cached pgs 00405 t_compact = 0x04 // scan file for space in pages 00406 00407 }; 00408 00409 /**\cond skip */ 00410 00411 /* 00412 * smlevel_0::operating_mode is always set to 00413 * ONE of these, but the function in_recovery() tests for 00414 * any of them, so we'll give them bit-mask values 00415 */ 00416 enum operating_mode_t { 00417 t_not_started = 0, 00418 t_in_analysis = 0x1, 00419 t_in_redo = 0x2, 00420 t_in_undo = 0x4, 00421 t_forward_processing = 0x8 00422 }; 00423 00424 static concurrency_t cc_alg; // concurrency control algorithm 00425 static bool cc_adaptive; // is PS-AA (adaptive) algorithm used? 00426 00427 #include "e_error_enum_gen.h" 00428 00429 static const w_error_info_t error_info[]; 00430 static void init_errorcodes(); 00431 00432 static void add_to_global_stats(const sm_stats_info_t &from); 00433 static void add_from_global_stats(sm_stats_info_t &to); 00434 00435 static device_m* dev; 00436 static io_m* io; 00437 static bf_m* bf; 00438 static lock_m* lm; 00439 00440 static log_m* log; 00441 static tid_t* redo_tid; 00442 00443 static LOG_WARN_CALLBACK_FUNC log_warn_callback; 00444 static LOG_ARCHIVED_CALLBACK_FUNC log_archived_callback; 00445 static fileoff_t log_warn_trigger; 00446 static int log_warn_exceed_percent; 00447 00448 static int dcommit_timeout; // to convey option to coordinator, 00449 // if it is created by VAS 00450 00451 static ErrLog* errlog; 00452 00453 static bool shutdown_clean; 00454 static bool shutting_down; 00455 static bool logging_enabled; 00456 static bool lock_caching_default; 00457 static bool do_prefetch; 00458 00459 static operating_mode_t operating_mode; 00460 static bool in_recovery() { 00461 return ((operating_mode & 00462 (t_in_redo | t_in_undo | t_in_analysis)) !=0); } 00463 static bool in_recovery_analysis() { 00464 return ((operating_mode & t_in_analysis) !=0); } 00465 static bool in_recovery_undo() { 00466 return ((operating_mode & t_in_undo ) !=0); } 00467 static bool in_recovery_redo() { 00468 return ((operating_mode & t_in_redo ) !=0); } 00469 00470 // these variable are the default values for lock escalation counts 00471 static w_base_t::int4_t defaultLockEscalateToPageThreshold; 00472 static w_base_t::int4_t defaultLockEscalateToStoreThreshold; 00473 static w_base_t::int4_t defaultLockEscalateToVolumeThreshold; 00474 00475 // These variables control the size of the log. 00476 static fileoff_t max_logsz; // max log file size 00477 00478 // This variable controls checkpoint frequency. 00479 // Checkpoints are taken every chkpt_displacement bytes 00480 // written to the log. 00481 static fileoff_t chkpt_displacement; 00482 00483 // The volume_format_version is used to test compatability 00484 // of software with a volume. Whenever a change is made 00485 // to the SM software that makes it incompatible with 00486 // previouly formatted volumes, this volume number should 00487 // be incremented. The value is set in sm.cpp. 00488 static w_base_t::uint4_t volume_format_version; 00489 00490 // This is a zeroed page for use wherever initialized memory 00491 // is needed. 00492 static char zero_page[page_sz]; 00493 00494 // option for controlling background buffer flush thread 00495 static option_t* _backgroundflush; 00496 00497 00498 /* 00499 * Pre-defined store IDs -- see also vol.h 00500 * 0 -- is reserved for the extent map and the store map 00501 * 1 -- directory (see dir.cpp) 00502 * 2 -- root index (see sm.cpp) 00503 */ 00504 enum { 00505 store_id_extentmap = 0, 00506 store_id_directory = 1, 00507 store_id_root_index = 2 00508 }; 00509 00510 enum { 00511 eINTERNAL = fcINTERNAL, 00512 eOS = fcOS, 00513 eOUTOFMEMORY = fcOUTOFMEMORY, 00514 eNOTFOUND = fcNOTFOUND, 00515 eNOTIMPLEMENTED = fcNOTIMPLEMENTED 00516 }; 00517 00518 enum store_flag_t { 00519 // NB: this had better match sm_store_property_t (sm_int_3.h) !!! 00520 // or at least be convted properly every time we come through the API 00521 st_bad = 0x0, 00522 st_regular = 0x01, // fully logged 00523 st_tmp = 0x02, // space logging only, 00524 // file destroy on dismount/restart 00525 st_load_file = 0x04, // not stored in the stnode_t, 00526 // only passed down to 00527 // io_m and then converted to tmp and added to the 00528 // list of load files for the xct. 00529 // no longer needed 00530 st_insert_file = 0x08, // stored in stnode, but not on page. 00531 // new pages are saved as tmp, old pages as regular. 00532 st_empty = 0x100 // store might be empty - used ONLY 00533 // as a function argument, NOT stored 00534 // persistently. Nevertheless, it's 00535 // defined here to be sure that if other 00536 // store flags are added, this doesn't 00537 // conflict with them. 00538 }; 00539 00540 /* 00541 * for use by set_store_deleting_log; 00542 * type of operation to perform on the stnode 00543 */ 00544 enum store_operation_t { 00545 t_delete_store, 00546 t_create_store, 00547 t_set_deleting, 00548 t_set_store_flags, 00549 t_set_first_ext}; 00550 00551 enum store_deleting_t { 00552 t_not_deleting_store = 0, // must be 0: code assumes it 00553 t_deleting_store, 00554 t_store_freeing_exts, 00555 t_unknown_deleting}; 00556 /**\endcond skip */ 00557 }; 00558 00559 /**\cond skip */ 00560 ostream& 00561 operator<<(ostream& o, smlevel_0::store_flag_t flag); 00562 00563 ostream& 00564 operator<<(ostream& o, const smlevel_0::store_operation_t op); 00565 00566 ostream& 00567 operator<<(ostream& o, const smlevel_0::store_deleting_t value); 00568 00569 /**\endcond skip */ 00570 00571 /*<std-footer incl-file-exclusion='SM_BASE_H'> -- do not edit anything below this line -- */ 00572 00573 #endif /*</std-footer>*/