00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include "arrays.h"
00022
00023 typedef struct
00024 {
00025 UINT sufindex;
00026 UINT lcp;
00027 } Largelcpentry;
00028
00029 typedef struct
00030 {
00031 UINT intervals[fid_INTERVALBUFFERSIZE];
00032 fid_Symbol sym;
00033 UINT depth;
00034 } Stackelem;
00035
00036 fid_DYNARRAY_DECLARE(Stack,Stackelem);
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055 static int suffixarray_realize(fid_Suffixarray *esa, fid_Tablerequest tables,
00056 fid_Error *error)
00057 {
00058 if((tables&fid_TABLE_SUF) != 0)
00059 {
00060 esa->suftab.VU=fid_CAST_POINTER(esa->suffile.content,const UINT);
00061 }
00062
00063 if((tables&fid_TABLE_LCP) != 0)
00064 {
00065 if(esa->llvfile.content == NULL)
00066 {
00067 esa->num_of_large_lcps.VU=0;
00068 }
00069 else
00070 {
00071 TO_UINT(esa->num_of_large_lcps.VU,
00072 (size_t)(esa->llvfile.occupied/(2*sizeof(UINT))),error,
00073 fid_error_throw(error,
00074 "Size of file \"%s\" too large to be "
00075 "represented in 32 bits.\n",
00076 esa->llvfile.filename);
00077 return -1;);
00078 }
00079 }
00080
00081 if((tables&fid_TABLE_SKP) != 0)
00082 {
00083 esa->skiptab.VU=fid_CAST_POINTER(esa->skpfile.content,const UINT);
00084 }
00085
00086 if((tables&fid_TABLE_STI) != 0)
00087 {
00088 esa->stitab.VU=fid_CAST_POINTER(esa->stifile.content,const UINT);
00089 }
00090
00091 return 0;
00092 }
00093
00094
00095
00096
00097
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107
00108
00109 static int init_stack(Stack *stack, const fid_Suffixarray *esa,
00110 fid_Error *error)
00111 {
00112 fid_Suffixinterval si;
00113
00114 fid_DYNARRAY_INIT(stack,Stackelem);
00115 fid_DYNARRAY_PUSH(stack,Stackelem,128,
00116 fid_DYNARRAY_FREE(stack,Stackelem);
00117 fid_OUTOFMEM(error);
00118 return -1;);
00119 assert(stack->dyndata != NULL);
00120 fid_suffixinterval_init_root(&si,esa);
00121 if(fid_suffixarray_get_intervals(esa,&si,
00122 fid_DYNARRAY_TOPELEM(stack,
00123 Stackelem).intervals) > 0)
00124 {
00125 fid_DYNARRAY_TOPELEM(stack,Stackelem).sym=0;
00126 fid_DYNARRAY_TOPELEM(stack,Stackelem).depth=0;
00127 }
00128
00129 return 0;
00130 }
00131
00132
00133
00134
00135
00136
00137
00138
00139
00140
00141
00142
00143
00144
00145
00146
00147
00148
00149
00150
00151
00152
00153
00154
00155
00156 int fid_suffixarray_traverse(const fid_Suffixarray *esa,
00157 fid_Esatraversecallback callback,
00158 void *user_data, fid_Error *error)
00159 {
00160 fid_Suffixinterval si;
00161 Stack stack;
00162 Stackelem *elem;
00163 fid_Symbol wcsym;
00164 int retval=0;
00165
00166 assert(esa != NULL);
00167 assert(callback != NULL);
00168
00169
00170 if(init_stack(&stack,esa,error) == -1)
00171 {
00172 return -1;
00173 }
00174
00175 wcsym=esa->alpha.num_of_syms-1;
00176
00177 while(stack.occupied > 0)
00178 {
00179 elem=&fid_DYNARRAY_TOPELEM(&stack,Stackelem);
00180 while(elem->sym < wcsym)
00181 {
00182 assert(fid_REGULARSYMBOL(elem->sym));
00183 fid_suffixinterval_init(&si,elem->depth+1,
00184 elem->intervals[(size_t)elem->sym],
00185 elem->intervals[elem->sym+1]-1);
00186 ++elem->sym;
00187 if(si.left+1 < elem->intervals[(size_t)elem->sym])
00188 {
00189
00190
00191 fid_suffixinterval_to_lcpinterval(&si,esa);
00192 if((retval=callback(esa,&si,elem->depth,error,user_data)) > 0)
00193 {
00194 goto stop_traversal;
00195 }
00196 else if(retval == fid_TRAVERSE_SKIP)
00197 {
00198 continue;
00199 }
00200
00201 fid_DYNARRAY_PUSH(&stack,Stackelem,128,
00202 fid_DYNARRAY_FREE(&stack,Stackelem);
00203 fid_OUTOFMEM(error);
00204 return -1;);
00205 if(fid_suffixarray_get_intervals(esa,&si,
00206 fid_DYNARRAY_TOPELEM(&stack,
00207 Stackelem).intervals) > 0)
00208 {
00209
00210 fid_DYNARRAY_TOPELEM(&stack,Stackelem).sym=0;
00211 fid_DYNARRAY_TOPELEM(&stack,Stackelem).depth=si.depth;
00212 break;
00213 }
00214 else
00215 {
00216
00217 fid_DYNARRAY_POP(&stack,Stackelem);
00218 }
00219 }
00220 else if(si.left == si.right)
00221 {
00222
00223 si.depth=fid_SUFFIXINTERVAL_SINGLETON;
00224 if((retval=callback(esa,&si,elem->depth,error,user_data)) > 0)
00225 {
00226 goto stop_traversal;
00227 }
00228 }
00229 }
00230
00231 if(fid_DYNARRAY_TOPELEM(&stack,Stackelem).sym >= wcsym)
00232 {
00233
00234 fid_DYNARRAY_POP(&stack,Stackelem);
00235 }
00236 }
00237
00238 stop_traversal:
00239 fid_DYNARRAY_FREE(&stack,Stackelem);
00240 return (retval <= 0)?0:retval;
00241 }
00242
00243
00244 #ifdef DEBUG
00245
00246
00247
00248
00249
00250
00251
00252
00253
00254
00255
00256
00257
00258
00259
00260
00261
00262
00263
00264
00265 static void check_interval(const fid_Suffixarray *esa, const fid_Suffixinterval *si,
00266 int notspecial)
00267 {
00268 assert(si->left <= si->right);
00269 assert(si->left < esa->sequences.total_length.VU);
00270 assert(si->right < esa->sequences.total_length.VU);
00271
00272 if(notspecial)
00273 {
00274
00275 if(esa->suftab.VU[si->left]+si->depth >= esa->sequences.total_length.VU)
00276 {
00277 fprintf(stderr,"Suffix " UINTFMT " too short in " UINTFMT
00278 "-[" UINTFMT "," UINTFMT "].\n",
00279 si->left,si->depth,si->left,si->right);
00280 abort();
00281 }
00282 if(esa->suftab.VU[si->right]+si->depth >= esa->sequences.total_length.VU)
00283 {
00284 fprintf(stderr,"Suffix " UINTFMT " too short in " UINTFMT
00285 "-[" UINTFMT "," UINTFMT "].\n",
00286 si->right,si->depth,si->left,si->right);
00287 abort();
00288 }
00289
00290
00291 if(memcmp(&esa->sequences.tisfile.content[esa->suftab.VU[si->left]],
00292 &esa->sequences.tisfile.content[esa->suftab.VU[si->right]],
00293 (size_t)(si->depth+1)) != 0)
00294 {
00295 fprintf(stderr,"Prefixes differ within " UINTFMT "-[" UINTFMT
00296 "," UINTFMT "].\n",si->depth,si->left,si->right);
00297 fid_sequences_dump_range(&esa->sequences.tisfile.content[esa->suftab.VU[si->left]],
00298 si->depth+1,&esa->alpha,"String 1: ",0,stderr);
00299 fid_sequences_dump_range(&esa->sequences.tisfile.content[esa->suftab.VU[si->right]],
00300 si->depth+1,&esa->alpha,"String 2: ",0,stderr);
00301 abort();
00302 }
00303 }
00304 else
00305 {
00306
00307
00308 if((esa->suftab.VU[si->left]+si->depth < esa->sequences.total_length.VU &&
00309 fid_REGULARSYMBOL(fid_READ_SYMBOL(&esa->sequences,esa->suftab.VU[si->left]+si->depth))) ||
00310 (si->left < si->right-1 &&
00311 fid_REGULARSYMBOL(fid_READ_SYMBOL(&esa->sequences,esa->suftab.VU[si->right-1]+si->depth))))
00312 {
00313 fprintf(stderr,"Special interval " UINTFMT "-[" UINTFMT
00314 "," UINTFMT "] contains non-special prefixes.\n",
00315 si->depth,si->left,si->right);
00316 fid_sequences_dump_range(&esa->sequences.tisfile.content[esa->suftab.VU[si->left]],
00317 si->depth+1,&esa->alpha,"String 1: ",0,stderr);
00318 if(si->left < si->right-1)
00319 {
00320 fid_sequences_dump_range(&esa->sequences.tisfile.content[esa->suftab.VU[si->right-1]],
00321 si->depth+1,&esa->alpha,"String 2: ",0,stderr);
00322 }
00323 abort();
00324 }
00325 }
00326
00327
00328 if(si->left > 0)
00329 {
00330 if(esa->sequences.total_length.VU-esa->suftab.VU[si->left-1] > si->depth &&
00331 memcmp(&esa->sequences.tisfile.content[esa->suftab.VU[si->left-1]],
00332 &esa->sequences.tisfile.content[esa->suftab.VU[si->left]],
00333 (size_t)si->depth+1) == 0)
00334 {
00335 fprintf(stderr,"Suffix boundary " UINTFMT " is not left-maximal "
00336 "in " UINTFMT "-[" UINTFMT "," UINTFMT "].\n",
00337 si->left,si->depth,si->left,si->right);
00338 abort();
00339 }
00340 }
00341
00342
00343 if(si->right < esa->sequences.total_length.VU)
00344 {
00345 if(esa->sequences.total_length.VU-esa->suftab.VU[si->right+1] > si->depth &&
00346 memcmp(&esa->sequences.tisfile.content[esa->suftab.VU[si->right]],
00347 &esa->sequences.tisfile.content[esa->suftab.VU[si->right+1]],
00348 (size_t)si->depth+1) == 0)
00349 {
00350 fprintf(stderr,"Suffix boundary " UINTFMT " is not right-maximal "
00351 "in " UINTFMT "-[" UINTFMT "," UINTFMT "].\n",
00352 si->right,si->depth,si->left,si->right);
00353 abort();
00354 }
00355 }
00356 }
00357
00358
00359
00360
00361
00362
00363
00364
00365
00366
00367
00368
00369
00370
00371
00372
00373
00374
00375
00376 static void check_intervals(const fid_Suffixarray *esa, UINT depth,
00377 UINT *intervals, UINT count,
00378 int dump_intervals)
00379 {
00380 fid_Suffixinterval si;
00381 UINT counted;
00382 fid_Symbol sym;
00383
00384 for(sym=0; sym <= esa->alpha.num_of_syms; ++sym)
00385 {
00386 if(dump_intervals && sym < esa->alpha.num_of_syms)
00387 {
00388 fprintf(stderr,"%c -> [" UINTFMT "," UINTFMT "] %c\n",
00389 fid_PRINT_SYMBOL(&esa->alpha,sym),
00390 intervals[(size_t)sym],intervals[sym+1]-1,
00391 intervals[(size_t)sym] < intervals[sym+1]?' ':'*');
00392 }
00393 assert(intervals[(size_t)sym] != ~(UINT)0);
00394 }
00395 for(; sym < fid_INTERVALBUFFERSIZE; ++sym)
00396 {
00397 assert(intervals[(size_t)sym] == ~(UINT)0);
00398 }
00399
00400 counted=0;
00401 si.depth=depth;
00402 for(sym=0; sym < esa->alpha.num_of_syms; ++sym)
00403 {
00404 if(intervals[(size_t)sym] < intervals[sym+1])
00405 {
00406 si.left=intervals[(size_t)sym];
00407 si.right=intervals[sym+1]-1;
00408 check_interval(esa,&si,(sym+(UINT)1 < esa->alpha.num_of_syms));
00409 ++counted;
00410 }
00411 }
00412 assert(counted == count);
00413 }
00414
00415
00416 #endif
00417
00418
00419
00420
00421
00422
00423
00424
00425
00426
00427
00428
00429
00430
00431
00432
00433 static UINT binary_search(const fid_Suffixarray *esa, fid_Symbol sym,
00434 const fid_Suffixinterval *si)
00435 {
00436 UINT left, right, pivot;
00437
00438 assert(si->left < si->right);
00439 assert(si->left < esa->sequences.total_length.VU);
00440 assert(si->right < esa->sequences.total_length.VU);
00441 assert(fid_REGULARSYMBOL(sym));
00442
00443 left=si->left;
00444 right=si->right;
00445
00446 while(left+1 < right)
00447 {
00448 pivot=left+((right-left) >> 1);
00449 if(fid_READ_SYMBOL(&esa->sequences,esa->suftab.VU[pivot]+si->depth) <= sym)
00450 {
00451 left=pivot;
00452 }
00453 else
00454 {
00455 right=pivot-1;
00456 }
00457 }
00458
00459
00460
00461 if(esa->suftab.VU[right]+si->depth < esa->sequences.total_length.VU &&
00462 fid_READ_SYMBOL(&esa->sequences,esa->suftab.VU[right]+si->depth) == sym)
00463 {
00464 return right;
00465 }
00466 else
00467 {
00468 assert(left+1 == right);
00469 assert(fid_READ_SYMBOL(&esa->sequences,esa->suftab.VU[left]+si->depth) == sym);
00470 return left;
00471 }
00472 }
00473
00474
00475
00476
00477
00478
00479
00480
00481
00482
00483
00484
00485
00486
00487
00488
00489
00490
00491
00492 UINT fid_suffixarray_get_intervals(const fid_Suffixarray *esa,
00493 const fid_Suffixinterval *si,
00494 UINT *intervals)
00495 {
00496 fid_Symbol sym, nextsym, wcsym;
00497 fid_Suffixinterval current_interval;
00498 UINT count=0;
00499 #ifdef DEBUG
00500 UINT debugindex;
00501 #endif
00502
00503 assert(esa != NULL);
00504 assert(esa->sequences.tisfile.content != NULL && esa->suffile.content != NULL && esa->suftab.VU != NULL);
00505 assert(esa->alpha.num_of_syms > 0);
00506 assert(si != NULL);
00507 assert(si->left <= si->right);
00508 assert(si->left < esa->sequences.total_length.VU);
00509 assert(si->right < esa->sequences.total_length.VU);
00510 assert(intervals != NULL);
00511
00512 #ifdef DEBUG
00513 memset(intervals,0xff,fid_INTERVALBUFFERSIZE*sizeof(UINT));
00514 for(debugindex=si->left; debugindex < si->right; ++debugindex)
00515 {
00516 if(esa->suftab.VU[debugindex]+si->depth >= esa->sequences.total_length.VU)
00517 {
00518 abort();
00519 }
00520 }
00521 #endif
00522
00523 wcsym=(fid_Symbol)(esa->alpha.num_of_syms-1);
00524 current_interval=*si;
00525 intervals[0]=current_interval.left;
00526 nextsym=fid_READ_SYMBOL(&esa->sequences,
00527 esa->suftab.VU[current_interval.left]+si->depth);
00528
00529 for(sym=0; current_interval.left <= current_interval.right; ++sym)
00530 {
00531 assert(esa->suftab.VU[current_interval.left]+si->depth <= esa->sequences.total_length.VU);
00532 if(esa->suftab.VU[current_interval.left]+si->depth == esa->sequences.total_length.VU)
00533 {
00534
00535 nextsym=(fid_Symbol)(wcsym+1);
00536 break;
00537 }
00538
00539 nextsym=fid_READ_SYMBOL(&esa->sequences,
00540 esa->suftab.VU[current_interval.left]+si->depth);
00541
00542 if(fid_SPECIALSYMBOL(nextsym))
00543 {
00544
00545 nextsym=wcsym;
00546 break;
00547 }
00548
00549 for(; sym < nextsym; ++sym)
00550 {
00551
00552 intervals[sym+1]=current_interval.left;
00553 }
00554
00555 assert(fid_REGULARSYMBOL(sym));
00556
00557 if(current_interval.left < current_interval.right)
00558 {
00559 intervals[sym+1]=binary_search(esa,sym,¤t_interval)+1;
00560 }
00561 else
00562 {
00563 intervals[sym+1]=current_interval.left+1;
00564 }
00565 assert(intervals[sym+1] <= esa->sequences.total_length.VU);
00566 assert(intervals[sym+1] >= current_interval.left);
00567 current_interval.left=intervals[sym+1];
00568 ++count;
00569 }
00570
00571 for(; sym < wcsym; ++sym)
00572 {
00573
00574 intervals[sym+1]=intervals[(size_t)sym];
00575 }
00576 if(nextsym <= wcsym)
00577 {
00578 intervals[wcsym+1]=si->right+1;
00579 if(nextsym == wcsym)
00580 {
00581 ++count;
00582 }
00583 }
00584 else
00585 {
00586 intervals[wcsym+1]=si->right;
00587 }
00588
00589 assert(count > 0);
00590
00591 #ifdef DEBUG
00592 check_intervals(esa,si->depth,intervals,count,0);
00593 #endif
00594
00595 return count;
00596 }
00597
00598
00599
00600
00601
00602
00603
00604
00605
00606
00607
00608
00609
00610
00611
00612
00613
00614 int fid_suffixarray_find_embedded_interval(const fid_Suffixarray *esa,
00615 fid_Suffixinterval *si,
00616 fid_Symbol symbol)
00617 {
00618 UINT left, right, pivot;
00619 #ifdef DEBUG
00620 UINT i;
00621 #endif
00622
00623 assert(esa != NULL);
00624 assert(esa->sequences.tisfile.content != NULL && esa->suftab.VU != NULL);
00625 assert(si != NULL);
00626 assert(si->left <= si->right);
00627 assert(si->left < esa->sequences.total_length.VU);
00628 assert(si->right < esa->sequences.total_length.VU);
00629 assert(symbol < esa->alpha.num_of_syms-1 || fid_SPECIALSYMBOL(symbol));
00630
00631 #ifdef DEBUG
00632 for(i=si->left; i < si->right; ++i)
00633 {
00634 if(esa->suftab.VU[i]+si->depth >= esa->sequences.total_length.VU)
00635 {
00636 abort();
00637 }
00638 }
00639 #endif
00640
00641 if(fid_REGULARSYMBOL(symbol))
00642 {
00643
00644 left=si->left;
00645 right=si->right;
00646 while(left+1 < right)
00647 {
00648 pivot=left+((right-left) >> 1);
00649 if(esa->suftab.VU[pivot]+si->depth < esa->sequences.total_length.VU &&
00650 fid_READ_SYMBOL(&esa->sequences,esa->suftab.VU[pivot]+si->depth) < symbol)
00651 {
00652 left=pivot+1;
00653 }
00654 else
00655 {
00656 right=pivot;
00657 }
00658 }
00659
00660 if(esa->suftab.VU[left]+si->depth < esa->sequences.total_length.VU &&
00661 fid_READ_SYMBOL(&esa->sequences,esa->suftab.VU[left]+si->depth) == symbol)
00662 {
00663 assert(left < esa->sequences.total_length.VU);
00664 si->left=left;
00665 }
00666 else
00667 {
00668 if(left == right ||
00669 esa->suftab.VU[right]+si->depth == esa->sequences.total_length.VU ||
00670 fid_READ_SYMBOL(&esa->sequences,esa->suftab.VU[right]+si->depth) != symbol)
00671 {
00672 return -1;
00673 }
00674 assert(right < esa->sequences.total_length.VU);
00675 si->left=right;
00676 }
00677
00678
00679 left=si->left;
00680 right=si->right;
00681 while(left+1 < right)
00682 {
00683 pivot=left+((right-left) >> 1);
00684 if(esa->suftab.VU[pivot]+si->depth < esa->sequences.total_length.VU &&
00685 fid_READ_SYMBOL(&esa->sequences,esa->suftab.VU[pivot]+si->depth) <= symbol)
00686 {
00687 left=pivot;
00688 }
00689 else
00690 {
00691 right=pivot-1;
00692 }
00693 }
00694
00695 if(right < esa->sequences.total_length.VU &&
00696 esa->suftab.VU[right]+si->depth < esa->sequences.total_length.VU &&
00697 fid_READ_SYMBOL(&esa->sequences,esa->suftab.VU[right]+si->depth) == symbol)
00698 {
00699 si->right=right;
00700 }
00701 else
00702 {
00703 assert(left+1 == right);
00704 assert(left < esa->sequences.total_length.VU);
00705 assert(fid_READ_SYMBOL(&esa->sequences,esa->suftab.VU[left]+si->depth) == symbol);
00706 si->right=left;
00707 }
00708 }
00709 else
00710 {
00711
00712
00713 left=si->left;
00714 right=si->right;
00715 while(left+1 < right)
00716 {
00717 pivot=left+((right-left) >> 1);
00718 if(fid_REGULARSYMBOL(fid_READ_SYMBOL(&esa->sequences,esa->suftab.VU[pivot]+si->depth)))
00719 {
00720 left=pivot+1;
00721 }
00722 else
00723 {
00724 right=pivot;
00725 }
00726 }
00727
00728 if(esa->suftab.VU[left]+si->depth < esa->sequences.total_length.VU &&
00729 fid_SPECIALSYMBOL(fid_READ_SYMBOL(&esa->sequences,esa->suftab.VU[left]+si->depth)))
00730 {
00731 assert(esa->suftab.VU[left-1]+si->depth == esa->sequences.total_length.VU || left == si->left || fid_REGULARSYMBOL(fid_READ_SYMBOL(&esa->sequences,esa->suftab.VU[left-1]+si->depth)));
00732 si->left=left;
00733 }
00734 else
00735 {
00736 if(left == right ||
00737 esa->suftab.VU[right]+si->depth == esa->sequences.total_length.VU ||
00738 fid_REGULARSYMBOL(fid_READ_SYMBOL(&esa->sequences,esa->suftab.VU[right]+si->depth)))
00739 {
00740 return -1;
00741 }
00742 assert(fid_REGULARSYMBOL(fid_READ_SYMBOL(&esa->sequences,esa->suftab.VU[right-1]+si->depth)));
00743 assert(esa->suftab.VU[right]+si->depth == esa->sequences.total_length.VU || fid_SPECIALSYMBOL(fid_READ_SYMBOL(&esa->sequences,esa->suftab.VU[right]+si->depth)));
00744 si->left=right;
00745 }
00746 }
00747
00748 #ifdef DEBUG
00749 check_interval(esa,si,fid_REGULARSYMBOL(symbol));
00750 #endif
00751
00752 ++si->depth;
00753
00754 return 0;
00755 }
00756
00757
00758
00759
00760
00761
00762
00763
00764
00765
00766
00767
00768
00769
00770
00771
00772
00773
00774
00775
00776
00777 int fid_suffixarray_extend_interval(const fid_Suffixarray *esa,
00778 const fid_Symbol *pattern, UINT plen,
00779 fid_Suffixinterval *si)
00780 {
00781 assert(esa != NULL);
00782 assert(si != NULL);
00783 assert(si->left <= si->right);
00784 assert(si->left < esa->sequences.total_length.VU);
00785 assert(si->right < esa->sequences.total_length.VU);
00786 assert(pattern != NULL);
00787 assert(plen > 0);
00788
00789 while(si->depth < plen)
00790 {
00791 if(fid_suffixarray_find_embedded_interval(esa,si,pattern[si->depth]) != 0)
00792 {
00793 return -1;
00794 }
00795 }
00796
00797 return 0;
00798 }
00799
00800
00801
00802
00803
00804
00805
00806
00807
00808
00809
00810
00811
00812
00813
00814
00815 int fid_suffixarray_find_interval(const fid_Suffixarray *esa,
00816 const fid_Symbol *pattern, UINT plen,
00817 fid_Suffixinterval *si)
00818 {
00819 assert(esa != NULL);
00820 assert(esa->sequences.total_length.VU > 0);
00821 assert(si != NULL);
00822
00823 fid_suffixinterval_init_root(si,esa);
00824 return fid_suffixarray_extend_interval(esa,pattern,plen,si);
00825 }
00826
00827
00828
00829
00830
00831
00832
00833
00834
00835
00836 UINT fid_suffixarray_find_large_lcp(const fid_Suffixarray *esa,
00837 UINT suffix)
00838 {
00839 UINT left, right, pivot;
00840
00841 assert(esa != NULL);
00842 assert(esa->llvfile.content != NULL);
00843 assert(esa->num_of_large_lcps.VU > 0);
00844
00845 left=0;
00846 right=esa->num_of_large_lcps.VU-1;
00847 while(left+1 < right)
00848 {
00849 pivot=left+((right-left) >> 1);
00850 if(fid_CAST_POINTER(esa->llvfile.content,Largelcpentry)[pivot].sufindex < suffix)
00851 {
00852 left=pivot;
00853 }
00854 else
00855 {
00856 right=pivot;
00857 }
00858 }
00859
00860 if(left == right ||
00861 fid_CAST_POINTER(esa->llvfile.content,Largelcpentry)[left].sufindex == suffix)
00862 {
00863 assert(fid_CAST_POINTER(esa->llvfile.content,Largelcpentry)[left].lcp >= (UINT)UCHAR_MAX);
00864 return fid_CAST_POINTER(esa->llvfile.content,Largelcpentry)[left].lcp;
00865 }
00866 else
00867 {
00868 assert(fid_CAST_POINTER(esa->llvfile.content,Largelcpentry)[right].lcp >= (UINT)UCHAR_MAX);
00869 return fid_CAST_POINTER(esa->llvfile.content,Largelcpentry)[right].lcp;
00870 }
00871 }
00872
00873
00874
00875
00876
00877
00878
00879
00880
00881
00882
00883
00884
00885
00886
00887
00888 UINT fid_suffixarray_suffix_length(const fid_Suffixarray *esa,
00889 UINT suffix)
00890 {
00891 UINT offset, left, right;
00892
00893 assert(esa != NULL);
00894 assert(esa->sequences.tisfile.content != NULL);
00895 assert(esa->suftab.VU != NULL);
00896 assert(suffix <= esa->sequences.total_length.VU);
00897
00898 offset=esa->suftab.VU[suffix];
00899 fid_sequences_offset_to_boundaries(&esa->sequences,offset,&left,&right);
00900
00901 return right-offset;
00902 }
00903
00904
00905
00906
00907
00908
00909
00910
00911
00912
00913
00914
00915
00916
00917
00918
00919
00920
00921
00922
00923
00924
00925 void fid_suffixarray_compute_distribution(fid_Suffixarray *esa)
00926 {
00927 fid_Suffixinterval root;
00928 UINT intervals[fid_INTERVALBUFFERSIZE];
00929 UINT sym, wcsym;
00930 double dbsize;
00931
00932 assert(esa != NULL);
00933 assert(esa->suftab.VU != NULL);
00934 assert(esa->sequences.alpha != NULL);
00935 assert(esa->sequences.tisfile.content != NULL);
00936 assert(esa->sequences.tisfile.occupied > 0);
00937
00938 fid_suffixinterval_init_root(&root,esa);
00939 (void)fid_suffixarray_get_intervals(esa,&root,intervals);
00940
00941 memset(esa->sequences.distribution,0,sizeof(esa->sequences.distribution));
00942 dbsize=(double)(esa->sequences.tisfile.occupied+1-
00943 esa->sequences.num_of_sequences.VU);
00944 wcsym=esa->alpha.num_of_syms-1;
00945 for(sym=0; sym < wcsym; ++sym)
00946 {
00947 esa->sequences.distribution[sym]=
00948 (double)(intervals[sym+1]-intervals[sym])/dbsize;
00949 }
00950 esa->sequences.distribution[wcsym]=
00951 (double)(intervals[wcsym+1]-intervals[wcsym]+1-
00952 esa->sequences.num_of_sequences.VU)/dbsize;
00953 esa->sequences.distribution[(size_t)fid_WILDCARD]=
00954 esa->sequences.distribution[wcsym];
00955 }
00956
00957
00958
00959
00960
00961
00962
00963
00964
00965 static void suffixarray_dump(const fid_Suffixarray *esa, FILE *stream)
00966 {
00967 UINT suffixes;
00968 int show_suffixes=1;
00969
00970 if(esa->sequences.tisfile.content != NULL)
00971 {
00972 suffixes=(UINT)esa->sequences.tisfile.occupied;
00973 }
00974 else if(esa->suffile.content != NULL)
00975 {
00976 suffixes=(UINT)(esa->suffile.occupied/sizeof(UINT));
00977 }
00978 else if(esa->suffile.content != NULL)
00979 {
00980 suffixes=(UINT)(esa->suffile.occupied/sizeof(UINT));
00981 }
00982 else
00983 {
00984 suffixes=0;
00985 show_suffixes=0;
00986 }
00987
00988 if(show_suffixes)
00989 {
00990 fprintf(stream,"Suffixes: " UINTFMT "\n",suffixes);
00991 }
00992
00993 if(esa->lcpfile.content != NULL)
00994 {
00995 fprintf(stream,"Large LCPs: " UINTFMT "\n",esa->num_of_large_lcps.VU);
00996 }
00997 }
00998
00999
01000
01001
01002
01003
01004
01005
01006
01007
01008
01009
01010
01011
01012
01013
01014 void fid_suffixarray_dump_intervals(const fid_Suffixarray *esa,
01015 const UINT intervals[fid_INTERVALBUFFERSIZE],
01016 FILE *stream)
01017 {
01018 fid_Suffixinterval si;
01019 fid_Symbol sym;
01020 int have_empty_ivs=0;
01021
01022 assert(intervals != NULL);
01023
01024 if(stream == NULL)
01025 {
01026 return;
01027 }
01028
01029 for(sym=0; sym < esa->alpha.num_of_syms; ++sym)
01030 {
01031 fid_suffixinterval_init(&si,1,intervals[(size_t)sym],intervals[sym+1]-1);
01032 if(si.left < si.right)
01033 {
01034 fid_suffixinterval_to_lcpinterval(&si,esa);
01035 fprintf(stream,"%c: " UINTFMT "-[" UINTFMT ","
01036 UINTFMT "]\n",fid_PRINT_SYMBOL(&esa->alpha,sym),
01037 si.depth,si.left,si.right);
01038 }
01039 else if(si.left == si.right)
01040 {
01041
01042 fprintf(stream,"%c: --[" UINTFMT "," UINTFMT "]\n",
01043 fid_PRINT_SYMBOL(&esa->alpha,sym),si.left,si.right);
01044 }
01045 else
01046 {
01047 have_empty_ivs=1;
01048 }
01049 }
01050
01051 if(have_empty_ivs)
01052 {
01053 fprintf(stream,"No suffix-intervals for:");
01054 for(sym=0; sym < esa->alpha.num_of_syms; ++sym)
01055 {
01056 if(intervals[(size_t)sym] == intervals[sym+1])
01057 {
01058 fprintf(stream," %c",fid_PRINT_SYMBOL(&esa->alpha,sym));
01059 }
01060 }
01061 (void)fputc('\n',stream);
01062 }
01063 }
01064
01065
01066
01067
01068
01069
01070
01071
01072
01073
01074
01075
01076
01077
01078
01079 void fid_suffixarray_dump_suffix(const fid_Suffixarray *esa, UINT suffix,
01080 UINT length, FILE *stream)
01081 {
01082 assert(esa != NULL);
01083 assert(esa->sequences.tisfile.content != NULL);
01084 assert(esa->suftab.VU != NULL);
01085
01086 if(stream == NULL)
01087 {
01088 return;
01089 }
01090
01091 if(esa->suftab.VU[suffix]+length > esa->sequences.total_length.VU)
01092 {
01093 length=esa->sequences.total_length.VU-esa->suftab.VU[suffix];
01094 }
01095
01096 fid_sequences_dump_range(&esa->sequences.tisfile.content[esa->suftab.VU[suffix]],
01097 length,&esa->alpha,NULL,1,stream);
01098 }
01099
01100
01101
01102
01103
01104
01105
01106
01107
01108
01109
01110
01111
01112
01113
01114
01115
01116 void fid_suffixarray_print(const fid_Suffixarray *esa,
01117 fid_Tablerequest request, FILE *stream)
01118 {
01119 UINT lcp=0, i;
01120
01121 if(stream == NULL || (request&fid_TABLES_ALL) == 0)
01122 {
01123 return;
01124 }
01125
01126 fprintf(stream," i ");
01127 if((request&fid_TABLE_SUF) != 0) fprintf(stream," SUF[i]");
01128 if((request&fid_TABLE_LCP) != 0) fprintf(stream," LCP[i]");
01129 if((request&fid_TABLE_STI) != 0) fprintf(stream," STI[i]");
01130 if((request&fid_TABLE_SKP) != 0) fprintf(stream," SKP[i]");
01131 if((request&fid_TABLE_TIS) != 0) fprintf(stream," Suffixes");
01132 (void)fputc('\n',stream);
01133 for(i=0; i < esa->sequences.total_length.VU; ++i)
01134 {
01135 fprintf(stream,TABVALFMT ":",(fid_Uint64)i);
01136 if((request&fid_TABLE_SUF) != 0)
01137 fprintf(stream," " TABVALFMT,(fid_Uint64)esa->suftab.VU[i]);
01138 if((request&fid_TABLE_LCP) != 0)
01139 fprintf(stream," " TABVALFMT,(fid_Uint64)lcp);
01140 if((request&fid_TABLE_STI) != 0)
01141 fprintf(stream," " TABVALFMT,(fid_Uint64)esa->stitab.VU[i]);
01142 if((request&fid_TABLE_SKP) != 0)
01143 fprintf(stream," " TABVALFMT,(fid_Uint64)esa->skiptab.VU[i]);
01144 if((request&fid_TABLE_TIS) != 0)
01145 {
01146 fprintf(stream," ");
01147 fid_suffixarray_dump_suffix(esa,i,lcp+1,stream);
01148 }
01149 else
01150 {
01151 (void)fputc('\n',stream);
01152 }
01153 if((request&(fid_TABLE_LCP|fid_TABLE_TIS)) != 0)
01154 {
01155 fid_LCP(lcp,esa,i+1);
01156 }
01157 }
01158 }
01159
01160
01161
01162
01163
01164
01165
01166
01167
01168
01169
01170
01171
01172
01173
01174
01175
01176
01177 UINT fid_suffixinterval_lcpvalue(const fid_Suffixinterval *si,
01178 const fid_Suffixarray *esa)
01179 {
01180 UINT lcpvalue, maxlcp;
01181 UINT left_suffix, right_suffix;
01182 fid_Symbol symbol;
01183
01184 assert(si != NULL);
01185 assert(si->depth > 0);
01186 assert(si->left < esa->sequences.total_length.VU);
01187 assert(si->right < esa->sequences.total_length.VU);
01188 assert(esa != NULL);
01189
01190 if(si->left == si->right)
01191 {
01192 return fid_SUFFIXINTERVAL_SINGLETON;
01193 }
01194
01195 left_suffix=esa->suftab.VU[si->left];
01196 if(fid_SPECIALSYMBOL(fid_READ_SYMBOL(&esa->sequences,left_suffix+si->depth-1)))
01197 {
01198 return si->depth;
01199 }
01200 right_suffix=esa->suftab.VU[si->right];
01201 maxlcp=esa->sequences.total_length.VU-
01202 ((left_suffix < right_suffix)?right_suffix:left_suffix);
01203
01204 assert(maxlcp >= si->depth);
01205 assert(fid_READ_SYMBOL(&esa->sequences,left_suffix+si->depth-1) == fid_READ_SYMBOL(&esa->sequences,right_suffix+si->depth-1));
01206
01207 for(lcpvalue=si->depth;
01208 lcpvalue < maxlcp &&
01209 (symbol=fid_READ_SYMBOL(&esa->sequences,left_suffix+lcpvalue)) ==
01210 fid_READ_SYMBOL(&esa->sequences,right_suffix+lcpvalue) &&
01211 fid_REGULARSYMBOL(symbol);
01212 ++lcpvalue)
01213 {
01214
01215 }
01216
01217 return lcpvalue;
01218 }
01219
01220
01221
01222
01223
01224
01225
01226
01227
01228
01229
01230
01231 void fid_suffixinterval_to_lcpinterval(fid_Suffixinterval *si,
01232 const fid_Suffixarray *esa)
01233 {
01234 si->depth=fid_suffixinterval_lcpvalue(si,esa);
01235 }
01236
01237
01238
01239
01240
01241
01242
01243
01244
01245
01246
01247
01248
01249
01250
01251 void fid_suffixinterval_find_right(fid_Suffixinterval *si,
01252 const fid_Suffixarray *esa)
01253 {
01254 UINT suffix, lcp;
01255
01256 assert(si != NULL);
01257 assert(si->left < esa->sequences.total_length.VU);
01258 assert(esa != NULL);
01259
01260 suffix=si->left;
01261
01262 do
01263 {
01264 ++suffix;
01265 fid_LCP(lcp,esa,suffix);
01266 } while(lcp >= si->depth);
01267
01268 si->right=suffix-1;
01269 }
01270
01271
01272
01273
01274
01275
01276
01277
01278
01279
01280
01281
01282
01283 UINT fid_suffixinterval_homepos(const fid_Suffixinterval *si,
01284 const fid_Suffixarray *esa)
01285 {
01286 UINT leftlcp, rightlcp;
01287
01288 assert(si != NULL);
01289 assert(si->left < esa->sequences.total_length.VU);
01290 assert(si->right < esa->sequences.total_length.VU);
01291 assert(si->left < si->right);
01292 assert(esa != NULL);
01293 assert(esa->lcpfile.content != NULL);
01294
01295 fid_LCP(leftlcp,esa,si->left);
01296 fid_LCP(rightlcp,esa,si->right+1);
01297 if(leftlcp >= rightlcp)
01298 {
01299 return si->left;
01300 }
01301 else
01302 {
01303 return si->right;
01304 }
01305 }
01306
01307
01308
01309
01310
01311
01312
01313
01314
01315
01316
01317 void fid_suffixinterval_dump(const fid_Suffixinterval *si,
01318 const fid_Suffixarray *esa, FILE *stream)
01319 {
01320 const fid_Symbol *seq;
01321 #ifdef DEBUG
01322 const fid_Symbol *seqright;
01323 #endif
01324 UINT i;
01325
01326 assert(esa != NULL);
01327 assert(esa->sequences.tisfile.content != NULL);
01328 assert(esa->suftab.VU != NULL);
01329
01330 if(stream == NULL)
01331 {
01332 return;
01333 }
01334
01335 seq=&esa->sequences.tisfile.content[esa->suftab.VU[si->left]];
01336 #ifdef DEBUG
01337 seqright=&esa->sequences.tisfile.content[esa->suftab.VU[si->right]];
01338 #endif
01339
01340 for(i=0; i < si->depth; ++i)
01341 {
01342 #ifdef DEBUG
01343 assert(seq[i] == seqright[i]);
01344 #endif
01345 (void)fputc(fid_PRINT_SYMBOL(&esa->alpha,seq[i]),stream);
01346 }
01347
01348 fprintf(stream,"-[" UINTFMT "," UINTFMT "]\n",si->left,si->right);
01349 }
01350
01351