00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #ifdef HAVE_CONFIG_H
00022 #include "config.h"
00023 #endif
00024
00025 #include <stdlib.h>
00026 #include <string.h>
00027 #include <ctype.h>
00028 #include <assert.h>
00029
00030 #include "libdefs.h"
00031 #include "error.h"
00032 #include "arrays.h"
00033 #include "alphabet.h"
00034 #include "fileutils.h"
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054 int fid_alphabet_init_from_speclines(fid_Alphabet *alpha, const char *str,
00055 size_t len, fid_Error *error)
00056 {
00057 size_t pos=0, cindex;
00058 char c;
00059
00060 assert(alpha != NULL);
00061 assert(str != NULL);
00062
00063 if(len == 0)
00064 {
00065 len=strlen((const char *)str);
00066 }
00067
00068 while(len > 0 && str[len-1] == '\n')
00069 {
00070 --len;
00071 }
00072 if(len == 0)
00073 {
00074
00075 fid_error_throw(error,"Empty alphabet specification.");
00076 return -1;
00077 }
00078
00079 alpha->num_of_chars=0;
00080 alpha->num_of_syms=0;
00081 memset(alpha->char_to_sym,(int)fid_UNDEF,sizeof(alpha->char_to_sym));
00082 memset(alpha->sym_to_char,'\0',sizeof(alpha->sym_to_char));
00083
00084 while(pos < len)
00085 {
00086 c=str[pos];
00087 if(c == '#')
00088 {
00089
00090 for(++pos; pos < len && str[pos] != '\n'; ++pos)
00091 {
00092
00093 }
00094 }
00095 else if(c == '\n')
00096 {
00097
00098 ++pos;
00099 }
00100 else
00101 {
00102
00103 alpha->sym_to_char[alpha->num_of_syms]=c;
00104 for(; pos < len && str[pos] != '\n'; ++pos)
00105 {
00106 c=str[pos];
00107 cindex=fid_CHAR_AS_INDEX(c);
00108 if(alpha->char_to_sym[cindex] != fid_UNDEF)
00109 {
00110
00111 if(isprint((int)c))
00112 {
00113 fid_error_throw(error,"Duplicate character '%c' in alphabet "
00114 "specification.",c);
00115 }
00116 else
00117 {
00118 fid_error_throw(error,"Duplicate character in alphabet "
00119 "specification (non-printable 0x%02hhx).",
00120 (unsigned char)c);
00121 }
00122 return -1;
00123 }
00124 ++alpha->num_of_chars;
00125 assert(alpha->num_of_syms <= UCHAR_MAX);
00126 alpha->char_to_sym[cindex]=(fid_Symbol)alpha->num_of_syms;
00127 }
00128 ++alpha->num_of_syms;
00129 ++pos;
00130 }
00131 }
00132
00133 if(alpha->num_of_syms == (fid_Uint16)1)
00134 {
00135
00136 fid_error_throw(error,"Alphabet specification contains only one "
00137 "line. Unary alphabets are not supported.");
00138 return -1;
00139 }
00140
00141
00142 for(pos=0; pos < sizeof(alpha->char_to_sym); ++pos)
00143 {
00144 if(alpha->char_to_sym[pos] == (fid_Symbol)(alpha->num_of_syms-1))
00145 {
00146 alpha->char_to_sym[pos]=fid_WILDCARD;
00147 }
00148 }
00149 alpha->sym_to_char[(size_t)fid_WILDCARD]=
00150 alpha->sym_to_char[alpha->num_of_syms-1];
00151
00152 return 0;
00153 }
00154
00155
00156
00157
00158
00159
00160
00161
00162
00163
00164
00165
00166
00167
00168
00169
00170 int fid_alphabet_init_from_specfile(fid_Alphabet *alpha, const char *filename,
00171 fid_Error *error)
00172 {
00173 fid_Mappedfile file;
00174 int retcode;
00175
00176 if((retcode=fid_file_map(&file,filename,0,0,error)) == 0)
00177 {
00178 retcode=fid_alphabet_init_from_speclines(alpha,(const char *)file.content,
00179 file.occupied,error);
00180 fid_file_unmap(&file);
00181 }
00182 if(retcode != 0)
00183 {
00184 fid_error_throw(error,"Could not read alphabet specification file.");
00185 }
00186 return retcode;
00187 }
00188
00189
00190
00191
00192
00193
00194
00195
00196
00197
00198
00199
00200
00201
00202
00203
00204
00205
00206 int fid_alphabet_init_from_string(fid_Alphabet *alpha, const char *str,
00207 size_t len, fid_Error *error)
00208 {
00209 size_t pos;
00210 char c;
00211
00212 assert(alpha != NULL);
00213 assert(str != NULL);
00214
00215 if(len == 0)
00216 {
00217 len=strlen((const char *)str);
00218 }
00219 if(len == 0)
00220 {
00221
00222 fid_error_throw(error,"Cannot generate alphabet from empty string.");
00223 return -1;
00224 }
00225
00226 alpha->num_of_chars=0;
00227 alpha->num_of_syms=0;
00228 memset(alpha->char_to_sym,(int)fid_UNDEF,sizeof(alpha->char_to_sym));
00229 memset(alpha->sym_to_char,'\0',sizeof(alpha->sym_to_char));
00230
00231 for(pos=0; pos < len; ++pos)
00232 {
00233 c=str[pos];
00234 if(alpha->char_to_sym[fid_CHAR_AS_INDEX(c)] == fid_UNDEF)
00235 {
00236 if(alpha->num_of_syms > fid_SYMBOLMAX)
00237 {
00238 fid_error_throw(error,"Too many distinct symbols in string, "
00239 "cannot generate alphabets with more than "
00240 fid_SYMFMT " symbols.",fid_SYMBOLMAX);
00241 return -1;
00242 }
00243 alpha->sym_to_char[alpha->num_of_syms]=c;
00244 alpha->char_to_sym[fid_CHAR_AS_INDEX(c)]=
00245 (fid_Symbol)alpha->num_of_syms;
00246 ++alpha->num_of_syms;
00247 ++alpha->num_of_chars;
00248 }
00249 }
00250
00251 return 0;
00252 }
00253
00254
00255
00256
00257
00258
00259
00260
00261
00262
00263
00264
00265 void fid_alphabet_init_standard(fid_Alphabet *alpha, fid_Alphabettype type)
00266 {
00267 static const char *alphas[]=
00268 {
00269 "aA\ncC\ngG\ntT\nnsywrkvbdhmNSYWRKVBDHM",
00270 "aA\ncC\ngG\nuU\nnsywrkvbdhmNSYWRKVBDHM",
00271 "aA\ncC\ngG\ntTuU\nnsywrkvbdhmNSYWRKVBDHM",
00272 "L\nV\nI\nF\nK\nR\nE\nD\nA\nG\nS\nT\nN\nQ\nY\nW\nP\nH\nM\nC\nXUBZO*-"
00273 };
00274
00275 assert(type == fid_ALPHABET_DNA || type == fid_ALPHABET_RNA || type == fid_ALPHABET_DNARNA || type == fid_ALPHABET_PROTEIN);
00276 (void)fid_alphabet_init_from_speclines(alpha,alphas[type],0,NULL);
00277 }
00278
00279
00280
00281
00282
00283
00284
00285
00286
00287
00288
00289
00290
00291
00292
00293
00294
00295
00296
00297 int fid_alphabet_add_wildcard(fid_Alphabet *alpha, char wcchar,
00298 fid_Error *error)
00299 {
00300 fid_Symbol sym;
00301 size_t wccharindex=fid_CHAR_AS_INDEX(wcchar);
00302 static const char *cannot_message="Cannot add wildcard character ";
00303
00304 assert(alpha != NULL);
00305
00306 if(wcchar == '\0')
00307 {
00308 fid_error_throw(error,"%s'\\0' to alphabet, invalid character.",
00309 cannot_message);
00310 return -1;
00311 }
00312 if(alpha->num_of_chars >= UCHAR_MAX)
00313 {
00314 fid_error_throw(error,"%s'%c' to alphabet, too many characters in "
00315 "alphabet.",cannot_message,wcchar);
00316 return -1;
00317 }
00318 else if((sym=alpha->char_to_sym[wccharindex]) == fid_WILDCARD)
00319 {
00320
00321 return 0;
00322 }
00323 else if(sym != fid_UNDEF)
00324 {
00325 fid_error_throw(error,"%s'%c' to alphabet, character already mapped to "
00326 "symbol " fid_SYMFMT ".",
00327 cannot_message,wcchar,alpha->char_to_sym[wccharindex]);
00328 return -1;
00329 }
00330
00331 if(alpha->sym_to_char[(size_t)fid_WILDCARD] == '\0')
00332 {
00333
00334 if(alpha->num_of_syms > fid_SYMBOLMAX)
00335 {
00336 fid_error_throw(error,"%s'%c' to alphabet, too many symbols in alphabet.",
00337 cannot_message,wcchar);
00338 return -1;
00339 }
00340
00341 alpha->sym_to_char[(size_t)fid_WILDCARD]=wcchar;
00342 alpha->sym_to_char[alpha->num_of_syms]=wcchar;
00343 ++alpha->num_of_syms;
00344 }
00345
00346 alpha->char_to_sym[wccharindex]=fid_WILDCARD;
00347 ++alpha->num_of_chars;
00348
00349 return 0;
00350 }
00351
00352
00353
00354
00355
00356
00357
00358
00359
00360
00361
00362
00363
00364
00365
00366
00367
00368
00369
00370
00371
00372 size_t fid_alphabet_transform_string(const fid_Alphabet *alpha,
00373 const char *string, size_t length,
00374 fid_Symbol *transformed,
00375 int no_special_symbols)
00376 {
00377 size_t i;
00378
00379 assert(alpha != NULL);
00380 assert(string != NULL);
00381 assert(transformed != NULL);
00382
00383 if(length == 0)
00384 {
00385 if((length=strlen(string)) == 0)
00386 {
00387 return 0;
00388 }
00389 }
00390
00391 if(no_special_symbols)
00392 {
00393 for(i=0; i < length; ++i)
00394 {
00395 if((transformed[i]=
00396 alpha->char_to_sym[fid_CHAR_AS_INDEX(string[i])]) > fid_SYMBOLMAX)
00397 {
00398 return i+1;
00399 }
00400 }
00401 }
00402 else
00403 {
00404 for(i=0; i < length; ++i)
00405 {
00406 transformed[i]=alpha->char_to_sym[fid_CHAR_AS_INDEX(string[i])];
00407 }
00408 }
00409
00410 return 0;
00411 }
00412
00413
00414
00415
00416
00417
00418
00419
00420
00421
00422
00423
00424
00425
00426
00427
00428
00429
00430
00431
00432
00433
00434 size_t fid_alphabet_transform_string_inplace(const fid_Alphabet *alpha,
00435 char *string, size_t length,
00436 int no_special_symbols)
00437 {
00438 return fid_alphabet_transform_string(alpha,string,length,(fid_Symbol *)string,
00439 no_special_symbols);
00440 }
00441
00442
00443
00444
00445
00446
00447
00448
00449
00450
00451
00452
00453
00454
00455
00456
00457
00458
00459
00460
00461
00462
00463
00464
00465
00466
00467
00468
00469 fid_Symbol *fid_alphabet_transform_string_new(const fid_Alphabet *alpha,
00470 const char *string,
00471 size_t length,
00472 int no_special_symbols,
00473 fid_Error *error)
00474 {
00475 fid_Symbol *transformed;
00476 size_t errpos;
00477
00478 assert(alpha != NULL);
00479 assert(string != NULL);
00480
00481 if(length == 0)
00482 {
00483 if((length=strlen(string)) == 0)
00484 {
00485 return NULL;
00486 }
00487 }
00488
00489 if((transformed=(fid_Symbol *)malloc(length*sizeof(fid_Symbol))) != NULL)
00490 {
00491 if((errpos=fid_alphabet_transform_string(alpha,string,length,transformed,
00492 no_special_symbols)) > 0)
00493 {
00494 free(transformed);
00495 fid_error_throw(error,"Cannot transform string, character '%c' is "
00496 "mapped to a non-regular symbol.",string[errpos-1]);
00497 return NULL;
00498 }
00499 }
00500 else
00501 {
00502 fid_OUTOFMEM(error);
00503 }
00504
00505 return transformed;
00506 }
00507
00508
00509
00510
00511
00512
00513
00514
00515
00516
00517
00518
00519
00520
00521
00522
00523 int fid_alphabet_write_to_file(const fid_Alphabet *alpha,
00524 const char *basefilename, fid_Error *error)
00525 {
00526 char *filename;
00527 fid_Mappedfile file;
00528 fid_Symbol sym;
00529 unsigned int c;
00530
00531 assert(alpha != NULL);
00532 assert(alpha->num_of_syms > 0);
00533 assert(alpha->num_of_chars > 0);
00534 assert(alpha->num_of_chars >= alpha->num_of_syms);
00535 assert(basefilename != NULL);
00536
00537 if((filename=fid_filename_create(basefilename,"al1",error)) == NULL)
00538 {
00539 return -1;
00540 }
00541
00542 if(fid_file_allocate(&file,filename,(size_t)alpha->num_of_chars*2,error) == 0)
00543 {
00544 assert(alpha->num_of_syms <= fid_SYMBOLMAX);
00545 for(sym=0; sym < alpha->num_of_syms-1; ++sym)
00546 {
00547 for(c=0; c <= UCHAR_MAX; ++c)
00548 {
00549 if(alpha->char_to_sym[c] == sym)
00550 {
00551 file.content[file.occupied++]=(unsigned char)c;
00552 assert(file.occupied <= file.allocated);
00553 }
00554 }
00555 file.content[file.occupied++]='\n';
00556 assert(file.occupied <= file.allocated);
00557 }
00558
00559 for(c=0; c <= UCHAR_MAX; ++c)
00560 {
00561 if(alpha->char_to_sym[c] == fid_WILDCARD)
00562 {
00563 file.content[file.occupied++]=(unsigned char)c;
00564 assert(file.occupied <= file.allocated);
00565 }
00566 }
00567 file.content[file.occupied++]='\n';
00568 assert(file.occupied <= file.allocated);
00569
00570 fid_file_unmap(&file);
00571 }
00572
00573 free(filename);
00574 return 0;
00575 }
00576
00577
00578
00579
00580
00581
00582
00583
00584
00585
00586 void fid_alphabet_dump(const fid_Alphabet *alpha, FILE *stream)
00587 {
00588 int i;
00589 fid_Symbol cs;
00590
00591 assert(alpha != NULL);
00592 assert(alpha->num_of_chars <= UCHAR_MAX);
00593 assert(alpha->num_of_syms <= fid_SYMBOLMAX);
00594
00595 if(stream == NULL)
00596 {
00597 return;
00598 }
00599
00600 fprintf(stream,"----------\nAlphabet characters (%hu):\n",
00601 alpha->num_of_chars);
00602 for(i=0; i < (int)(UCHAR_MAX+1); ++i)
00603 {
00604 cs=alpha->char_to_sym[i];
00605 assert(cs < fid_SEPARATOR);
00606 if(cs != fid_UNDEF && cs != fid_WILDCARD)
00607 {
00608 if(isprint(i))
00609 {
00610 fprintf(stream," %c",(char)i);
00611 }
00612 else
00613 {
00614 fprintf(stream," [%c]",i);
00615 }
00616 fprintf(stream," -> " fid_SYMFMT "\n",cs);
00617 }
00618 }
00619 fprintf(stream," {");
00620 for(i=0; i < (int)sizeof(alpha->char_to_sym); ++i)
00621 {
00622 if(alpha->char_to_sym[i] == fid_WILDCARD)
00623 {
00624 if(isprint(i))
00625 {
00626 fprintf(stream,"%c",(char)i);
00627 }
00628 else
00629 {
00630 fprintf(stream,"[%d]",i);
00631 }
00632 }
00633 }
00634 fprintf(stream,"}\n");
00635
00636 fprintf(stream,"Alphabet symbols (%hu):\n",alpha->num_of_syms);
00637 for(i=0; i < (int)sizeof(alpha->sym_to_char); ++i)
00638 {
00639 cs=alpha->sym_to_char[i];
00640 if(cs != '\0')
00641 {
00642 fprintf(stream," %d -> ",i);
00643 if(isprint(cs))
00644 {
00645 fprintf(stream," %c\n",(char)cs);
00646 }
00647 else
00648 {
00649 fprintf(stream," [" fid_SYMFMT "]\n",cs);
00650 }
00651 }
00652 }
00653 fprintf(stream,"----------\n");
00654 }
00655
00656