00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #ifdef HAVE_CONFIG_H
00022 #include "config.h"
00023 #endif
00024
00025 #include <stdlib.h>
00026 #include <string.h>
00027 #include <errno.h>
00028 #include <assert.h>
00029
00030 #include "libdefs.h"
00031 #include "error.h"
00032 #include "arrays.h"
00033 #include "alphabet.h"
00034 #include "fileutils.h"
00035 #include "sequences.h"
00036 #include "dbfiles.h"
00037 #include "utilities.h"
00038 #include "projectfile.h"
00039 #include "projectfileparser.h"
00040
00041 #define CANNOTPARSE "Cannot parse project file \"%s\", "
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054 int fid_projectfile_init(fid_Projectfile *project, const char *prjbasename,
00055 fid_Error *error)
00056 {
00057 assert(project != NULL);
00058 assert(prjbasename != NULL);
00059 assert(fid_WORDSIZE == 32 || fid_WORDSIZE == 64);
00060
00061 project->integer_size_in_bits=fid_WORDSIZE;
00062 project->integer_size_in_bytes=(project->integer_size_in_bits >> 3);
00063 if(project->integer_size_in_bytes == 4)
00064 {
00065 project->uisize=fid_UINTSIZE_32;
00066 }
00067 else if(project->integer_size_in_bytes == 8)
00068 {
00069 project->uisize=fid_UINTSIZE_64;
00070 }
00071 else
00072 {
00073 abort();
00074 }
00075
00076 if(*prjbasename == '\0')
00077 {
00078 fid_error_throw(error,"Project name empty.");
00079 return -1;
00080 }
00081
00082 if((project->filename=fid_filename_create(prjbasename,"prj",error)) == NULL)
00083 {
00084 return -1;
00085 }
00086
00087 if((project->prjbasename=strdup(prjbasename)) == NULL)
00088 {
00089 free(project->filename);
00090 project->filename=NULL;
00091 fid_OUTOFMEM(error);
00092 return -1;
00093 }
00094
00095 fid_DYNARRAY_INIT(&project->dbfiles,fid_DBfileinfo);
00096
00097 project->endianess_known=1;
00098 #ifdef WORDS_BIGENDIAN
00099 project->littleendian=0;
00100 #else
00101 project->littleendian=1;
00102 #endif
00103 project->totallength=0;
00104 project->num_of_sequences=0;
00105 project->num_of_query_sequences=0;
00106 project->prefixlength=0;
00107 project->large_lcp_values=0;
00108 project->max_branchdepth=0;
00109 project->num_of_specials=0;
00110 project->num_of_special_ranges=0;
00111 project->longest=0;
00112 project->len_of_special_prefix=0;
00113 project->len_of_special_suffix=0;
00114
00115 return 0;
00116 }
00117
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136
00137
00138
00139
00140
00141
00142 static int get_next_token_pair(const char *buffer, size_t bufsize,
00143 size_t *lineno, size_t *offset,
00144 size_t *tokenoffset, size_t *tokenlen,
00145 size_t *valueoffset, size_t *valuelen,
00146 const char *filename, fid_Error *error)
00147 {
00148 char c;
00149
00150 if(*offset >= bufsize)
00151 {
00152 return 1;
00153 }
00154
00155
00156 c='\n';
00157 while(c == '\n')
00158 {
00159 while(*offset < bufsize && (c=buffer[*offset]) == '\n')
00160 {
00161 ++*offset;
00162 ++*lineno;
00163 }
00164 if(*offset < bufsize)
00165 {
00166 assert(c != '\n');
00167 if(c == '#')
00168 {
00169 while(*offset < bufsize && (c=buffer[*offset]) != '\n')
00170 {
00171 ++*offset;
00172 }
00173 if(*offset >= bufsize)
00174 {
00175 return 1;
00176 }
00177 assert(c == '\n');
00178 ++*offset;
00179 ++*lineno;
00180 }
00181 }
00182 else
00183 {
00184 return 1;
00185 }
00186 }
00187
00188
00189 *tokenoffset=*offset;
00190 while(*offset < bufsize && (c=buffer[*offset]) != '\n')
00191 {
00192 if(c == '=')
00193 {
00194 if(*tokenoffset < *offset)
00195 {
00196
00197 *tokenlen=*offset-*tokenoffset;
00198 *valueoffset=++*offset;
00199 while(*offset < bufsize && buffer[*offset] != '\n')
00200 {
00201 ++*offset;
00202 }
00203 if(*valueoffset < *offset)
00204 {
00205 *valuelen=*offset-*valueoffset;
00206
00207 ++*offset;
00208 return 0;
00209 }
00210 else
00211 {
00212 fid_error_throw(error,CANNOTPARSE "empty assignment in line %lu.",
00213 filename,(unsigned long)*lineno);
00214 return -1;
00215 }
00216 }
00217 else
00218 {
00219 fid_error_throw(error,CANNOTPARSE "line %lu begins with '=' instead "
00220 "of token.",filename,(unsigned long)*lineno);
00221 return -1;
00222 }
00223 }
00224 ++*offset;
00225 }
00226
00227 return 1;
00228 }
00229
00230
00231
00232
00233
00234
00235
00236
00237
00238 static void reset_tokentable(Tokentableentry *tokens, size_t size)
00239 {
00240 size_t i;
00241
00242 for(i=0; i < size; ++i)
00243 {
00244 tokens[i].set=0;
00245 }
00246 }
00247
00248
00249
00250
00251
00252
00253
00254
00255
00256
00257
00258
00259
00260
00261
00262
00263
00264
00265
00266
00267
00268 static TokenID get_token_id(Tokentableentry *tokens, size_t size,
00269 const char *token, size_t len)
00270 {
00271 size_t i;
00272
00273 for(i=0; i < size; ++i)
00274 {
00275 if(tokens[i].length == len && memcmp(token,tokens[i].name,len) == 0)
00276 {
00277 if(tokens[i].set && !tokens[i].allow_duplicate)
00278 {
00279 return TOKEN_ALREADYSET;
00280 }
00281 else
00282 {
00283 tokens[i].set=1;
00284 return (TokenID)i;
00285 }
00286 }
00287 }
00288 return TOKEN_UNKNOWN;
00289 }
00290
00291
00292
00293 #ifndef HAVE_MEMRCHR
00294
00295
00296
00297
00298
00299
00300
00301
00302
00303
00304
00305
00306
00307
00308
00309
00310
00311
/**
 * Fallback implementation of memrchr() for systems that lack it.
 *
 * Searches the first \p n bytes of \p s backwards for the byte \p c.
 *
 * \param s Memory area to search.
 * \param c Byte to look for (converted to \c char for the comparison).
 * \param n Number of bytes to search.
 *
 * \returns Pointer to the last occurrence of \p c within the area, or NULL
 *          if the byte does not occur or \p n is 0.
 */
static const void *memrchr(const void *s, int c, size_t n)
{
  const char *const base = (const char *)s;
  const char wanted = (char)c;
  size_t remaining;

  for(remaining = n; remaining > 0; --remaining)
  {
    if(base[remaining - 1] == wanted)
    {
      return base + (remaining - 1);
    }
  }

  return NULL;
}
00330
00331
00332 #endif
00333
00334
00335
00336
00337
00338
00339
00340
00341
00342
00343
00344
00345 static void to_parsebuf(char *parsebuf, const char *str, size_t length)
00346 {
00347 assert(length < PARSEBUFFERSIZE);
00348 memcpy(parsebuf,str,length);
00349 parsebuf[length]='\0';
00350 }
00351
00352
00353
00354
00355
00356
00357
00358
00359
00360
00361
00362
00363
00364
00365
00366
00367
00368 static int parse_dbfile(fid_Projectfile *project, const char *value,
00369 size_t valuelen, char *parsebuf, fid_Error *error)
00370 {
00371 const char *inputfilelen=NULL, *seqlen;
00372 size_t dbfile_length=0, inputfilelen_length=0, seqlen_length=0;
00373 fid_Uint64 parsed_seqlength, parsed_inputlength;
00374 char *parsed_name;
00375 int ok=0;
00376
00377
00378 if((seqlen=(const char *)memrchr(value,' ',valuelen)) != NULL &&
00379 seqlen > value+3)
00380 {
00381 seqlen_length=valuelen-1-(seqlen-value);
00382 if((inputfilelen=(const char *)memrchr(value,' ',valuelen-seqlen_length-1)) != NULL &&
00383 inputfilelen > value+1)
00384 {
00385 inputfilelen_length=valuelen-seqlen_length-2-(inputfilelen-value);
00386 dbfile_length=valuelen-inputfilelen_length-seqlen_length-2;
00387 ++seqlen;
00388 ++inputfilelen;
00389 if(inputfilelen_length >= PARSEBUFFERSIZE || seqlen_length >= PARSEBUFFERSIZE)
00390 {
00391 fid_error_throw(error,"Value too large.");
00392 }
00393 else
00394 {
00395 ok=1;
00396 }
00397 }
00398 }
00399
00400 if(ok)
00401 {
00402
00403
00404 to_parsebuf(parsebuf,seqlen,seqlen_length);
00405 if(fid_utils_parse_uint_64(parsebuf,&parsed_seqlength,error) == 0)
00406 {
00407 to_parsebuf(parsebuf,inputfilelen,inputfilelen_length);
00408 if(fid_utils_parse_uint_64(parsebuf,&parsed_inputlength,error) == 0)
00409 {
00410 if((parsed_name=(char *)malloc((dbfile_length+1)*sizeof(char))) != NULL)
00411 {
00412
00413 memcpy(parsed_name,value,dbfile_length);
00414 parsed_name[dbfile_length]='\0';
00415 if(fid_dbfiles_append(&project->dbfiles,parsed_name,
00416 parsed_inputlength,parsed_seqlength,error) == 0)
00417 {
00418 return 0;
00419 }
00420 free(parsed_name);
00421 }
00422 else
00423 {
00424 fid_OUTOFMEM(error);
00425 }
00426 }
00427 }
00428 }
00429
00430 return -1;
00431 }
00432
00433
00434
00435
00436
00437
00438
00439
00440
00441
00442
00443
00444
00445
00446 static int projectfile_parse(fid_Projectfile *project, const char *buffer,
00447 size_t bufsize, fid_Error *error)
00448 {
00449 size_t lineno=(size_t)1, offset=0;
00450 size_t tokenoffset=0, tokenlen=0, valueoffset=0, valuelen=0;
00451 fid_Uint64 temp;
00452 fid_Uint64 num_of_db_sequences=0;
00453 int retcode;
00454 TokenID tid;
00455 char parsebuf[PARSEBUFFERSIZE];
00456 static Tokentableentry tokens[TOKEN_NUMOFTOKENS]=
00457 {
00458 TOKEN_INSERT("dbfile",1),
00459 TOKEN_INSERT("totallength",0),
00460 TOKEN_INSERT("numofsequences",0),
00461 TOKEN_INSERT("numofdbsequences",0),
00462 TOKEN_INSERT("numofquerysequences",0),
00463 TOKEN_INSERT("prefixlength",0),
00464 TOKEN_INSERT("largelcpvalues",0),
00465 TOKEN_INSERT("maxbranchdepth",0),
00466 TOKEN_INSERT("integersize",0),
00467 TOKEN_INSERT("littleendian",0),
00468 TOKEN_INSERT("specialcharacters",0),
00469 TOKEN_INSERT("specialranges",0),
00470 TOKEN_INSERT("longest",0),
00471 TOKEN_INSERT("lengthofspecialprefix",0),
00472 TOKEN_INSERT("lengthofspecialsuffix",0)
00473 };
00474
00475 assert(project != NULL);
00476 assert(project->filename != NULL);
00477 assert(buffer != NULL);
00478 assert(bufsize > 0);
00479
00480 reset_tokentable(tokens,(size_t)TOKEN_NUMOFTOKENS);
00481 project->endianess_known=0;
00482 while((retcode=get_next_token_pair(buffer,bufsize,&lineno,&offset,
00483 &tokenoffset,&tokenlen,&valueoffset,
00484 &valuelen,project->filename,error)) == 0)
00485 {
00486 assert(tokenlen > 0);
00487 assert(valuelen > 0);
00488
00489 tid=get_token_id(tokens,(size_t)TOKEN_NUMOFTOKENS,buffer+tokenoffset,
00490 tokenlen);
00491 if(tid != TOKEN_UNKNOWN)
00492 {
00493 if(tid == TOKEN_ALREADYSET)
00494 {
00495 fid_error_throw(error,CANNOTPARSE "duplicate specification of token in "
00496 "line %lu.",project->filename,(unsigned long)lineno);
00497 retcode=-1;
00498 break;
00499 }
00500 else if(tid != TOKEN_DBFILE)
00501 {
00502 to_parsebuf(parsebuf,buffer+valueoffset,valuelen);
00503 }
00504 }
00505
00506 switch(tid)
00507 {
00508 case TOKEN_DBFILE:
00509 retcode=parse_dbfile(project,buffer+valueoffset,valuelen,parsebuf,error);
00510 break;
00511 case TOKEN_TOTALLENGTH:
00512 retcode=fid_utils_parse_uint_64(parsebuf,&project->totallength,error);
00513 break;
00514 case TOKEN_NUMOFSEQUENCES:
00515 retcode=fid_utils_parse_uint_64(parsebuf,&project->num_of_sequences,
00516 error);
00517 break;
00518 case TOKEN_NUMOFDBSEQUENCES:
00519 retcode=fid_utils_parse_uint_64(parsebuf,&num_of_db_sequences,error);
00520 break;
00521 case TOKEN_NUMOFQUERYSEQUENCES:
00522 retcode=fid_utils_parse_uint_64(parsebuf,
00523 &project->num_of_query_sequences,error);
00524 break;
00525 case TOKEN_PREFIXLENGTH:
00526 retcode=fid_utils_parse_uint_64(parsebuf,&project->prefixlength,error);
00527 break;
00528 case TOKEN_LARGELCPVALUES:
00529 retcode=fid_utils_parse_uint_64(parsebuf,&project->large_lcp_values,
00530 error);
00531 break;
00532 case TOKEN_MAXBRANCHDEPTH:
00533 retcode=fid_utils_parse_uint_64(parsebuf,&project->max_branchdepth,error);
00534 break;
00535 case TOKEN_INTEGERSIZE:
00536 if((retcode=fid_utils_parse_uint_64(parsebuf,&temp,error)) == 0)
00537 {
00538 if(temp != (fid_Uint64)32 && temp != (fid_Uint64)64)
00539 {
00540 fid_error_throw(error,CANNOTPARSE "unsupported integer size.",
00541 project->filename);
00542 retcode=-1;
00543 }
00544 else
00545 {
00546 project->integer_size_in_bits=(unsigned char)temp;
00547 project->integer_size_in_bytes=
00548 (unsigned char)(project->integer_size_in_bits >> 3);
00549 if(temp == 32)
00550 {
00551 project->uisize=fid_UINTSIZE_32;
00552 }
00553 else
00554 {
00555 project->uisize=fid_UINTSIZE_64;
00556 }
00557 }
00558 }
00559 break;
00560 case TOKEN_LITTLEENDIAN:
00561 if((retcode=fid_utils_parse_uint_64(parsebuf,&temp,error)) == 0)
00562 {
00563 if(temp == 0 || temp == 1)
00564 {
00565 project->littleendian=(char)temp;
00566 project->endianess_known=1;
00567 }
00568 else
00569 {
00570 fid_error_throw(error,CANNOTPARSE "token \"%s\" can only be set "
00571 "to 0 or 1.",project->filename,tokens[tid].name);
00572 retcode=-1;
00573 }
00574 }
00575 break;
00576 case TOKEN_SPECIALCHARS:
00577 retcode=fid_utils_parse_uint_64(parsebuf,&project->num_of_specials,error);
00578 break;
00579 case TOKEN_SPECIALRANGES:
00580 retcode=
00581 fid_utils_parse_uint_64(parsebuf,&project->num_of_special_ranges,error);
00582 break;
00583 case TOKEN_LONGEST:
00584 retcode=fid_utils_parse_uint_64(parsebuf,&project->longest,error);
00585 break;
00586 case TOKEN_LENOFSPECIALPFX:
00587 retcode=
00588 fid_utils_parse_uint_64(parsebuf,&project->len_of_special_prefix,error);
00589 break;
00590 case TOKEN_LENOFSPECIALSUF:
00591 retcode=
00592 fid_utils_parse_uint_64(parsebuf,&project->len_of_special_suffix,error);
00593 break;
00594 case TOKEN_UNKNOWN:
00595 fprintf(stderr,"ignoring unknown token in line %lu of %s\n",
00596 (unsigned long)lineno,project->filename);
00597 break;
00598 default:
00599 abort();
00600 }
00601 if(retcode != 0)
00602 {
00603 fid_error_throw(error,CANNOTPARSE "invalid value for \"%s\" in "
00604 "line %lu.",project->filename,tokens[tid].name,
00605 (unsigned long)lineno);
00606 break;
00607 }
00608 ++lineno;
00609 }
00610
00611 assert(retcode != 0);
00612
00613 if(retcode > 0)
00614 {
00615 if(!tokens[TOKEN_INTEGERSIZE].set)
00616 {
00617 fid_error_throw(error,CANNOTPARSE "integer size not specified "
00618 "(need \"%s=<32|64>\").",project->filename,
00619 tokens[TOKEN_INTEGERSIZE].name);
00620 retcode=-1;
00621 }
00622 }
00623
00624 if(retcode > 0)
00625 {
00626 return 0;
00627 }
00628 else
00629 {
00630 fid_projectfile_free(project);
00631 return retcode;
00632 }
00633 }
00634
00635
00636
00637
00638
00639
00640
00641
00642
00643
00644
00645
00646
00647
00648
00649 int fid_projectfile_parse_from_file(fid_Projectfile *project,
00650 const char *prjbasename, fid_Error *error)
00651 {
00652 fid_Mappedfile prjfile;
00653 int retcode;
00654
00655 assert(project != NULL);
00656 assert(prjbasename != NULL);
00657
00658 if((retcode=fid_projectfile_init(project,prjbasename,error)) == 0)
00659 {
00660 if((retcode=fid_file_map(&prjfile,project->filename,0,0,error)) == 0)
00661 {
00662 retcode=projectfile_parse(project,(const char *)prjfile.content,
00663 prjfile.occupied,error);
00664 fid_file_unmap(&prjfile);
00665 }
00666 else
00667 {
00668 fid_projectfile_free(project);
00669 }
00670 }
00671 return retcode;
00672 }
00673
00674
00675
00676
00677
00678
00679
00680
00681
00682
00683
00684
00685
00686
00687
00688 int fid_projectfile_write(const fid_Projectfile *project, fid_Error *error)
00689 {
00690 fid_Mappedfile prjfile;
00691 const fid_DBfileinfo *entry;
00692 size_t i;
00693 int retcode;
00694
00695 assert(project != NULL);
00696 assert(project->filename != NULL);
00697
00698 if((retcode=fid_file_new(&prjfile,project->filename,error)) == 0)
00699 {
00700 for(i=0; retcode == 0 && i < project->dbfiles.occupied; ++i)
00701 {
00702 entry=&project->dbfiles.dyndata[i];
00703 retcode=
00704 fid_file_write(&prjfile,error,
00705 "dbfile=%s " fid_U64FMT " " fid_U64FMT "\n",
00706 entry->name,entry->inputfile_length,
00707 entry->sequence_length);
00708 }
00709
00710 if(retcode == 0)
00711 {
00712 retcode=
00713 fid_file_write(&prjfile,error,
00714 "totallength=" fid_U64FMT "\n"
00715 "specialcharacters=" fid_U64FMT "\n"
00716 "specialranges=" fid_U64FMT "\n"
00717 "lengthofspecialprefix=" fid_U64FMT "\n"
00718 "lengthofspecialsuffix=" fid_U64FMT "\n"
00719 "numofsequences=" fid_U64FMT "\n"
00720 "numofdbsequences=" fid_U64FMT "\n"
00721 "numofquerysequences=" fid_U64FMT "\n"
00722 "longest=" fid_U64FMT "\n"
00723 "prefixlength=" fid_U64FMT "\n"
00724 "largelcpvalues=" fid_U64FMT "\n"
00725 "maxbranchdepth=" fid_U64FMT "\n"
00726 "integersize=%hhu\n",
00727 project->totallength,project->num_of_specials,
00728 project->num_of_special_ranges,
00729 project->len_of_special_prefix,
00730 project->len_of_special_suffix,
00731 project->num_of_sequences,
00732 project->num_of_sequences-project->num_of_query_sequences,
00733 project->num_of_query_sequences,project->longest,
00734 project->prefixlength,project->large_lcp_values,
00735 project->max_branchdepth,project->integer_size_in_bits);
00736 if(retcode == 0 && project->endianess_known)
00737 {
00738 retcode=fid_file_write(&prjfile,error,"littleendian=%d\n",
00739 project->littleendian?1:0);
00740 }
00741 }
00742
00743 fid_file_unmap(&prjfile);
00744 }
00745 return retcode;
00746 }
00747
00748
00749
00750
00751
00752
00753
00754
00755 void fid_projectfile_free(fid_Projectfile *project)
00756 {
00757 assert(project != NULL);
00758 free(project->filename);
00759 free(project->prjbasename);
00760 fid_dbfiles_free(&project->dbfiles);
00761 #ifdef DEBUG
00762 memset(project,0,sizeof(fid_Projectfile));
00763 #endif
00764 }
00765
00766
00767
00768
00769
00770
00771
00772
00773
00774
00775 void fid_projectfile_dump(const fid_Projectfile *project, FILE *stream)
00776 {
00777 size_t i;
00778 const fid_DBfileinfo *entry;
00779
00780 assert(project != NULL);
00781 assert(project->filename != NULL);
00782
00783 if(stream == NULL)
00784 {
00785 return;
00786 }
00787
00788 fprintf(stream,"----------\nProject file \"%s\"\n",project->filename);
00789 fprintf(stream,"base name \"%s\"\n"
00790 "input databases %lu\n",project->prjbasename,
00791 (unsigned long)project->dbfiles.occupied);
00792 for(i=0; i < project->dbfiles.occupied; ++i)
00793 {
00794 entry=&project->dbfiles.dyndata[i];
00795 fprintf(stream," \"%s\"\n"
00796 " input file size %lu, sequence size %lu\n",
00797 entry->name,
00798 (unsigned long)entry->inputfile_length,
00799 (unsigned long)entry->sequence_length);
00800 }
00801 fprintf(stream,"integer size %hhu (%hhu bytes)\n"
00802 "total length " fid_U64FMT "\n"
00803 "number of sequences " fid_U64FMT "\n"
00804 "number of database sequences %lu\n"
00805 "number of query sequences " fid_U64FMT "\n"
00806 "bucket table prefix length " fid_U64FMT "\n"
00807 "number of large LCP values " fid_U64FMT "\n"
00808 "maximum branching depth " fid_U64FMT "\n",
00809 project->integer_size_in_bits,project->integer_size_in_bytes,
00810 project->totallength,project->num_of_sequences,
00811 (unsigned long)project->dbfiles.occupied,
00812 project->num_of_query_sequences,project->prefixlength,
00813 project->large_lcp_values,project->max_branchdepth);
00814 fprintf(stream,"----------\n");
00815 }
00816
00817