00001 /* 00002 * libfid - Full-text Index Data structure library 00003 * Copyright (C) 2006, 2007, 2008, 2009 Robert Homann 00004 * 00005 * This library is free software; you can redistribute it and/or 00006 * modify it under the terms of the GNU Lesser General Public 00007 * License as published by the Free Software Foundation; either 00008 * version 2.1 of the License, or (at your option) any later version. 00009 * 00010 * This library is distributed in the hope that it will be useful, 00011 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00013 * Lesser General Public License for more details. 00014 * 00015 * You should have received a copy of the GNU Lesser General Public 00016 * License along with this library; if not, write to the Free Software 00017 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 00018 * MA 02110-1301 USA 00019 */ 00020 00021 #ifndef SUFFIXARRAY_H 00022 #define SUFFIXARRAY_H 00023 /*!\addtogroup suffixarrays Enhanced Suffix Arrays 00024 * 00025 * The enhanced suffix array data types and accompanying functions are 00026 * defined in this group. The integral data type is fid_Suffixarray. 00027 * Suffix-intervals are represented by the fid_Suffixinterval data 00028 * structure. 00029 */ 00030 /*@{*/ 00031 /*! 00032 * \brief Representation of an enhanced suffix array. 00033 * 00034 * This data structure gathers all data required to operate on the enhanced 00035 * suffix array. 00036 */ 00037 typedef struct 00038 { 00039 fid_Uintsize uisize; /*!<\brief Integer size. */ 00040 fid_Sequences sequences; /*!<\brief Reference to sequence data. */ 00041 fid_Alphabet alpha; /*!<\brief The alphabet the sequence data is encoded 00042 * with. */ 00043 fid_Uint48constptr suftab; /*!<\brief Plain suffix array, which is 00044 * basically a correctly typed shortcut to the 00045 * content of fid_Suffixarray::suffile. */ 00046 fid_Uint48constptr skiptab; /*!<\brief Skip table, basically the correctly 00047 * typed content of fid_Suffixarray::skpfile. */ 00048 fid_Uint48constptr stitab; /*!<\brief Inverse suffix array, basically the 00049 * correctly typed content of 00050 * fid_Suffixarray::stifile. */ 00051 fid_Uint48 num_of_large_lcps; /*!<\brief The number of large lcp-values stored 00052 * in file fid_Suffixarray::llvfile. */ 00053 00054 fid_Mappedfile suffile; /*!<\brief File containing the plain suffix array. */ 00055 fid_Mappedfile lcpfile; /*!<\brief File containing the lcp-table. */ 00056 fid_Mappedfile llvfile; /*!<\brief File containing the large lcp-values. */ 00057 fid_Mappedfile skpfile; /*!<\brief File containing the skip table. */ 00058 fid_Mappedfile stifile; /*!<\brief File containing the inverse suffix array. */ 00059 } fid_Suffixarray; 00060 00061 /*! 00062 * \brief Representation of a suffix-interval in 32 bit enhanced suffix array. 00063 * 00064 * Suffix-intervals are used to represent locations in an enhanced suffix 00065 * array. They basically consist of an interval in the suffix array, i.e., a 00066 * left and a right boundary (or lower and upper, depending on your 00067 * imagination), and a depth. A suffix-interval with depth \p d and boundaries 00068 * \p left and \p right is valid iff all suffixes within its boundaries share a 00069 * common prefix of length at least \p d. A suffix-interval is called an 00070 * lcp-interval if the longest common prefix of all suffixes in the 00071 * suffix-interval has length \p d. Suffix-intervals with left and right 00072 * boundaries being equal are called singletons and have no defined depth (see 00073 * #fid_SUFFIXINTERVAL_SINGLETON_32 and #fid_SUFFIXINTERVAL_SINGLETON_64). 00074 * 00075 * Use fid_suffixinterval_init() or fid_suffixinterval_init_root() (i.e., 00076 * fid_suffixinterval_init_root_32() and fid_suffixinterval_init_root_64()) to 00077 * initialize this data structure. Function fid_suffixinterval_to_lcpinterval() 00078 * can be used to turn a valid suffix-interval into an lcp-interval. 00079 * 00080 * \see #fid_Suffixinterval_64 00081 */ 00082 typedef struct 00083 { 00084 fid_Uint32 depth; /*!< Depth of the suffix-interval. */ 00085 fid_Uint32 left, /*!< Left boundary. */ 00086 right; /*!< Right boundary. */ 00087 } fid_Suffixinterval_32; 00088 00089 /*! 00090 * \brief Representation of a suffix-interval in 64 bit enhanced suffix array. 00091 * 00092 * \see #fid_Suffixinterval_32 00093 */ 00094 typedef struct 00095 { 00096 fid_Uint64 depth; /*!< Depth of the suffix-interval. */ 00097 fid_Uint64 left, /*!< Left boundary. */ 00098 right; /*!< Right boundary. */ 00099 } fid_Suffixinterval_64; 00100 00101 /*! 00102 * \brief Special value for #fid_Suffixinterval_32::depth to indicate a 00103 * singleton. 00104 * \see #fid_SUFFIXINTERVAL_SINGLETON_64 00105 */ 00106 #define fid_SUFFIXINTERVAL_SINGLETON_32 (~(fid_Uint32)0) 00107 00108 /*! 00109 * \brief Special value for #fid_Suffixinterval_64::depth to indicate a 00110 * singleton. 00111 * \see #fid_SUFFIXINTERVAL_SINGLETON_32 00112 */ 00113 #define fid_SUFFIXINTERVAL_SINGLETON_64 (~(fid_Uint64)0) 00114 00115 /*!\brief Table request: suffix array. */ 00116 #define fid_TABLE_SUF ((fid_Tablerequest)0x00000100) 00117 00118 /*!\brief Table request: lcp table. */ 00119 #define fid_TABLE_LCP ((fid_Tablerequest)0x00000200) 00120 00121 /*!\brief Table request: skip table. */ 00122 #define fid_TABLE_SKP ((fid_Tablerequest)0x00000400) 00123 00124 /*!\brief Table request: inverse suffix array. */ 00125 #define fid_TABLE_STI ((fid_Tablerequest)0x00000800) 00126 00127 /*!\brief Table request: all tables related to offline matching. */ 00128 #define fid_TABLES_OFFLINE (fid_TABLE_SUF|fid_TABLE_LCP|fid_TABLE_SKP|fid_TABLE_STI) 00129 00130 /*!\brief Table request bit mask: bits related to offline matching. */ 00131 #define fid_TABLES_OFFLINE_MASK ((fid_Tablerequest)0xffffff00) 00132 00133 /*!\brief Table request: all tables. */ 00134 #define fid_TABLES_ALL (fid_TABLES_ONLINE|fid_TABLES_OFFLINE) 00135 00136 /*! 00137 * \brief Size of the buffer for fid_suffixarray_get_intervals(). 00138 * 00139 * The size is determined as the maximum number of regular symbols, plus 1 00140 * for the left boundary for the special interval boundary, plus 1 for the 00141 * (non-existent) left boundary of the interval following the special 00142 * interval. 00143 */ 00144 #define fid_INTERVALBUFFERSIZE ((size_t)fid_SYMBOLMAX+2) 00145 00146 /*! 00147 * \brief Determine lcp-value of a 32 bit suffix. 00148 * 00149 * This is implemented as a macro since it would be a huge performance penality 00150 * to call a function for something that is a simple assignment in the vast 00151 * majority of all cases. A function call occurs if the lcp-value stored in the 00152 * main lcp-table is equal to \c UCHAR_MAX (255), however, since then the real, 00153 * potentially larger, lcp-value must be looked up inside an exception table. 00154 * 00155 * \param L The lcp-value is assigned to this variable, which should be of type 00156 * #fid_Uint32 or #fid_Uint64. 00157 * \param ESA An enhanced suffix array. 00158 * \param I The suffix index for which the lcp-value should be determined. 00159 * 00160 * \see #fid_LCP_64() 00161 */ 00162 #define fid_LCP_32(L,ESA,I)\ 00163 if(((L)=(ESA)->lcpfile.content[I]) == (unsigned char)UCHAR_MAX)\ 00164 {\ 00165 (L)=fid_suffixarray_find_large_lcp_32(ESA,I);\ 00166 } 00167 00168 /*! 00169 * \brief Determine lcp-value of a 64 bit suffix. 00170 * \see #fid_LCP_32() 00171 */ 00172 #define fid_LCP_64(L,ESA,I)\ 00173 if(((L)=(ESA)->lcpfile.content[I]) == (unsigned char)UCHAR_MAX)\ 00174 {\ 00175 (L)=fid_suffixarray_find_large_lcp_64(ESA,I);\ 00176 } 00177 00178 /*! 00179 * \brief Initialize a fid_Suffixinterval structure. 00180 * 00181 * This has been implemented as a macro mainly for speed since it is used all 00182 * the time. We don't want to rely on the compiler to inline this 00183 * automatically, do we? 00184 * 00185 * \param SI The fid_Suffixinterval structure to be initialized. 00186 * \param D Depth. 00187 * \param L,R Left and right boundaries. 00188 */ 00189 #define fid_suffixinterval_init(SI,D,L,R)\ 00190 (SI)->depth=(D);\ 00191 (SI)->left=(L);\ 00192 (SI)->right=(R) 00193 00194 /*! 00195 * \brief Initialize a #fid_Suffixinterval_32 structure corresponding to the 00196 * root interval of a given enhanced suffix array. 00197 * 00198 * \param SI The #fid_Suffixinterval_32 structure to be initialized. 00199 * \param ESA A 32 bit enhanced suffix array. 00200 * 00201 * \see #fid_suffixinterval_init_root_64() 00202 */ 00203 #define fid_suffixinterval_init_root_32(SI,ESA)\ 00204 fid_suffixinterval_init(SI,0,0,(ESA)->sequences.total_length.v_uint32-1) 00205 00206 /*! 00207 * \brief Initialize a #fid_Suffixinterval_64 structure corresponding to the 00208 * root interval of a given enhanced suffix array. 00209 * 00210 * \param SI The #fid_Suffixinterval_64 structure to be initialized. 00211 * \param ESA A 64 bit enhanced suffix array. 00212 * 00213 * \see #fid_suffixinterval_init_root_32() 00214 */ 00215 #define fid_suffixinterval_init_root_64(SI,ESA)\ 00216 fid_suffixinterval_init(SI,0,0,(ESA)->sequences.total_length.v_uint64-1) 00217 00218 /*!\brief Special return value for #fid_Esatraversecallback_32 and 00219 * #fid_Esatraversecallback_32 functions. */ 00220 #define fid_TRAVERSE_CONTINUE ((int)0) 00221 00222 /*!\brief Special return value for #fid_Esatraversecallback_32 and 00223 * #fid_Esatraversecallback_32 functions. */ 00224 #define fid_TRAVERSE_SKIP ((int)-1) 00225 00226 /*! 00227 * \brief Callback function prototype for enhanced suffix array traversals. 00228 * 00229 * For each lcp-interval in an enhanced suffix array passed to 00230 * fid_suffixarray_traverse(), a callback function is called. That callback 00231 * function must follow the signature of this type. 00232 * 00233 * \param esa The traversed enhanced suffix array. 00234 * \param si An lcp-interval. This is really an lcp-interval, not just a 00235 * suffix-interval, so there is no need to turn this into an lcp-interval 00236 * inside the function. 00237 * \param pdepth The depth of the parent lcp-interval. 00238 * \param error Error messages go here. 00239 * \param user_data Pointer to any data, may be used freely within the callback 00240 * function. 00241 * 00242 * \returns A positive integer for stopping the traversal, or one of the 00243 * following special return codes. If the function returns a positive integer, 00244 * then that exact value will be the return value of 00245 * fid_suffixarray_traverse(). 00246 * 00247 * \retval #fid_TRAVERSE_CONTINUE Continue traversal. 00248 * \retval #fid_TRAVERSE_SKIP Do not traverse deeper into current subtree, 00249 * continue with next sibling. 00250 * 00251 * \see #fid_Esatraversecallback_64 00252 */ 00253 typedef int (*fid_Esatraversecallback_32)(const fid_Suffixarray *esa, 00254 const fid_Suffixinterval_32 *si, 00255 fid_Uint32 pdepth, fid_Error *error, 00256 void *user_data); 00257 00258 /*! 00259 * \brief Callback function prototype for enhanced suffix array traversals. 00260 * \see #fid_Esatraversecallback_32 00261 */ 00262 typedef int (*fid_Esatraversecallback_64)(const fid_Suffixarray *esa, 00263 const fid_Suffixinterval_64 *si, 00264 fid_Uint64 pdepth, fid_Error *error, 00265 void *user_data); 00266 00267 #ifdef __cplusplus 00268 extern "C" { 00269 #endif 00270 void fid_suffixarray_init(fid_Suffixarray *esa, fid_Uintsize uisize); 00271 int fid_suffixarray_load_from_files(fid_Suffixarray *esa, 00272 const char *basefilename, 00273 fid_Tablerequest tables, 00274 fid_Uintsize uisize, fid_Error *error); 00275 int fid_suffixarray_realize(fid_Suffixarray *esa, fid_Tablerequest tables, 00276 fid_Error *error); 00277 /*!\copydoc #fid_suffixarray_traverse 00278 * \brief 32 bit version of #fid_suffixarray_traverse(). */ 00279 #line 278 00280 int fid_suffixarray_traverse_32(const fid_Suffixarray *esa, 00281 fid_Esatraversecallback_32 callback, 00282 void *user_data, fid_Error *error); 00283 /*!\copydoc #fid_suffixarray_traverse 00284 * \brief 64 bit version of #fid_suffixarray_traverse(). */ 00285 #line 278 00286 int fid_suffixarray_traverse_64(const fid_Suffixarray *esa, 00287 fid_Esatraversecallback_64 callback, 00288 void *user_data, fid_Error *error); 00289 /*!\copydoc #fid_suffixarray_get_intervals 00290 * \brief 32 bit version of #fid_suffixarray_get_intervals(). */ 00291 #line 281 00292 fid_Uint32 fid_suffixarray_get_intervals_32(const fid_Suffixarray *esa, 00293 const fid_Suffixinterval_32 *si, 00294 fid_Uint32 *intervals); 00295 /*!\copydoc #fid_suffixarray_get_intervals 00296 * \brief 64 bit version of #fid_suffixarray_get_intervals(). */ 00297 #line 281 00298 fid_Uint64 fid_suffixarray_get_intervals_64(const fid_Suffixarray *esa, 00299 const fid_Suffixinterval_64 *si, 00300 fid_Uint64 *intervals); 00301 /*!\copydoc #fid_suffixarray_find_embedded_interval 00302 * \brief 32 bit version of #fid_suffixarray_find_embedded_interval(). */ 00303 #line 284 00304 int fid_suffixarray_find_embedded_interval_32(const fid_Suffixarray *esa, 00305 fid_Suffixinterval_32 *si, 00306 fid_Symbol symbol); 00307 /*!\copydoc #fid_suffixarray_find_embedded_interval 00308 * \brief 64 bit version of #fid_suffixarray_find_embedded_interval(). */ 00309 #line 284 00310 int fid_suffixarray_find_embedded_interval_64(const fid_Suffixarray *esa, 00311 fid_Suffixinterval_64 *si, 00312 fid_Symbol symbol); 00313 /*!\copydoc #fid_suffixarray_extend_interval 00314 * \brief 32 bit version of #fid_suffixarray_extend_interval(). */ 00315 #line 287 00316 int fid_suffixarray_extend_interval_32(const fid_Suffixarray *esa, 00317 const fid_Symbol *pattern, fid_Uint32 plen, 00318 fid_Suffixinterval_32 *si); 00319 /*!\copydoc #fid_suffixarray_extend_interval 00320 * \brief 64 bit version of #fid_suffixarray_extend_interval(). */ 00321 #line 287 00322 int fid_suffixarray_extend_interval_64(const fid_Suffixarray *esa, 00323 const fid_Symbol *pattern, fid_Uint64 plen, 00324 fid_Suffixinterval_64 *si); 00325 /*!\copydoc #fid_suffixarray_find_interval 00326 * \brief 32 bit version of #fid_suffixarray_find_interval(). */ 00327 #line 290 00328 int fid_suffixarray_find_interval_32(const fid_Suffixarray *esa, 00329 const fid_Symbol *pattern, fid_Uint32 plen, 00330 fid_Suffixinterval_32 *si); 00331 /*!\copydoc #fid_suffixarray_find_interval 00332 * \brief 64 bit version of #fid_suffixarray_find_interval(). */ 00333 #line 290 00334 int fid_suffixarray_find_interval_64(const fid_Suffixarray *esa, 00335 const fid_Symbol *pattern, fid_Uint64 plen, 00336 fid_Suffixinterval_64 *si); 00337 /*!\copydoc #fid_suffixarray_find_large_lcp 00338 * \brief 32 bit version of #fid_suffixarray_find_large_lcp(). */ 00339 #line 293 00340 fid_Uint32 fid_suffixarray_find_large_lcp_32(const fid_Suffixarray *esa, 00341 fid_Uint32 suffix); 00342 /*!\copydoc #fid_suffixarray_find_large_lcp 00343 * \brief 64 bit version of #fid_suffixarray_find_large_lcp(). */ 00344 #line 293 00345 fid_Uint64 fid_suffixarray_find_large_lcp_64(const fid_Suffixarray *esa, 00346 fid_Uint64 suffix); 00347 /*!\copydoc #fid_suffixarray_suffix_length 00348 * \brief 32 bit version of #fid_suffixarray_suffix_length(). */ 00349 #line 295 00350 fid_Uint32 fid_suffixarray_suffix_length_32(const fid_Suffixarray *esa, 00351 fid_Uint32 suffix); 00352 /*!\copydoc #fid_suffixarray_suffix_length 00353 * \brief 64 bit version of #fid_suffixarray_suffix_length(). */ 00354 #line 295 00355 fid_Uint64 fid_suffixarray_suffix_length_64(const fid_Suffixarray *esa, 00356 fid_Uint64 suffix); 00357 /*!\copydoc #fid_suffixarray_compute_distribution 00358 * \brief 32 bit version of #fid_suffixarray_compute_distribution(). */ 00359 #line 297 00360 void fid_suffixarray_compute_distribution_32(fid_Suffixarray *esa); 00361 /*!\copydoc #fid_suffixarray_compute_distribution 00362 * \brief 64 bit version of #fid_suffixarray_compute_distribution(). */ 00363 #line 297 00364 void fid_suffixarray_compute_distribution_64(fid_Suffixarray *esa); 00365 #line 299 00366 void fid_suffixarray_free(fid_Suffixarray *esa); 00367 void fid_suffixarray_dump(const fid_Suffixarray *esa, FILE *stream); 00368 /*!\copydoc #fid_suffixarray_dump_intervals 00369 * \brief 32 bit version of #fid_suffixarray_dump_intervals(). */ 00370 #line 302 00371 void fid_suffixarray_dump_intervals_32(const fid_Suffixarray *esa, 00372 const fid_Uint32 intervals[fid_INTERVALBUFFERSIZE], 00373 FILE *stream); 00374 /*!\copydoc #fid_suffixarray_dump_intervals 00375 * \brief 64 bit version of #fid_suffixarray_dump_intervals(). */ 00376 #line 302 00377 void fid_suffixarray_dump_intervals_64(const fid_Suffixarray *esa, 00378 const fid_Uint64 intervals[fid_INTERVALBUFFERSIZE], 00379 FILE *stream); 00380 /*!\copydoc #fid_suffixarray_dump_suffix 00381 * \brief 32 bit version of #fid_suffixarray_dump_suffix(). */ 00382 #line 305 00383 void fid_suffixarray_dump_suffix_32(const fid_Suffixarray *esa, fid_Uint32 suffix, 00384 fid_Uint32 length, FILE *stream); 00385 /*!\copydoc #fid_suffixarray_dump_suffix 00386 * \brief 64 bit version of #fid_suffixarray_dump_suffix(). */ 00387 #line 305 00388 void fid_suffixarray_dump_suffix_64(const fid_Suffixarray *esa, fid_Uint64 suffix, 00389 fid_Uint64 length, FILE *stream); 00390 /*!\copydoc #fid_suffixarray_print 00391 * \brief 32 bit version of #fid_suffixarray_print(). */ 00392 #line 307 00393 void fid_suffixarray_print_32(const fid_Suffixarray *esa, 00394 fid_Tablerequest request, FILE *stream); 00395 /*!\copydoc #fid_suffixarray_print 00396 * \brief 64 bit version of #fid_suffixarray_print(). */ 00397 #line 307 00398 void fid_suffixarray_print_64(const fid_Suffixarray *esa, 00399 fid_Tablerequest request, FILE *stream); 00400 /*!\copydoc #fid_suffixinterval_lcpvalue 00401 * \brief 32 bit version of #fid_suffixinterval_lcpvalue(). */ 00402 #line 309 00403 fid_Uint32 fid_suffixinterval_lcpvalue_32(const fid_Suffixinterval_32 *si, 00404 const fid_Suffixarray *esa); 00405 /*!\copydoc #fid_suffixinterval_lcpvalue 00406 * \brief 64 bit version of #fid_suffixinterval_lcpvalue(). */ 00407 #line 309 00408 fid_Uint64 fid_suffixinterval_lcpvalue_64(const fid_Suffixinterval_64 *si, 00409 const fid_Suffixarray *esa); 00410 /*!\copydoc #fid_suffixinterval_to_lcpinterval 00411 * \brief 32 bit version of #fid_suffixinterval_to_lcpinterval(). */ 00412 #line 311 00413 void fid_suffixinterval_to_lcpinterval_32(fid_Suffixinterval_32 *si, 00414 const fid_Suffixarray *esa); 00415 /*!\copydoc #fid_suffixinterval_to_lcpinterval 00416 * \brief 64 bit version of #fid_suffixinterval_to_lcpinterval(). */ 00417 #line 311 00418 void fid_suffixinterval_to_lcpinterval_64(fid_Suffixinterval_64 *si, 00419 const fid_Suffixarray *esa); 00420 /*!\copydoc #fid_suffixinterval_find_right 00421 * \brief 32 bit version of #fid_suffixinterval_find_right(). */ 00422 #line 313 00423 void fid_suffixinterval_find_right_32(fid_Suffixinterval_32 *si, 00424 const fid_Suffixarray *esa); 00425 /*!\copydoc #fid_suffixinterval_find_right 00426 * \brief 64 bit version of #fid_suffixinterval_find_right(). */ 00427 #line 313 00428 void fid_suffixinterval_find_right_64(fid_Suffixinterval_64 *si, 00429 const fid_Suffixarray *esa); 00430 /*!\copydoc #fid_suffixinterval_homepos 00431 * \brief 32 bit version of #fid_suffixinterval_homepos(). */ 00432 #line 315 00433 fid_Uint32 fid_suffixinterval_homepos_32(const fid_Suffixinterval_32 *si, 00434 const fid_Suffixarray *esa); 00435 /*!\copydoc #fid_suffixinterval_homepos 00436 * \brief 64 bit version of #fid_suffixinterval_homepos(). */ 00437 #line 315 00438 fid_Uint64 fid_suffixinterval_homepos_64(const fid_Suffixinterval_64 *si, 00439 const fid_Suffixarray *esa); 00440 /*!\copydoc #fid_suffixinterval_dump 00441 * \brief 32 bit version of #fid_suffixinterval_dump(). */ 00442 #line 317 00443 void fid_suffixinterval_dump_32(const fid_Suffixinterval_32 *si, 00444 const fid_Suffixarray *esa, FILE *stream); 00445 /*!\copydoc #fid_suffixinterval_dump 00446 * \brief 64 bit version of #fid_suffixinterval_dump(). */ 00447 #line 317 00448 void fid_suffixinterval_dump_64(const fid_Suffixinterval_64 *si, 00449 const fid_Suffixarray *esa, FILE *stream); 00450 #line 320 00451 char *fid_tablerequest_to_string(fid_Tablerequest request, const char *sep); 00452 #ifdef __cplusplus 00453 } 00454 #endif 00455 /*@}*/ 00456 00457 #endif /*! SUFFIXARRAY_H */