00001 /* 00002 * libfid - Full-text Index Data structure library 00003 * Copyright (C) 2006, 2007, 2009 Robert Homann 00004 * 00005 * This library is free software; you can redistribute it and/or 00006 * modify it under the terms of the GNU Lesser General Public 00007 * License as published by the Free Software Foundation; either 00008 * version 2.1 of the License, or (at your option) any later version. 00009 * 00010 * This library is distributed in the hope that it will be useful, 00011 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00013 * Lesser General Public License for more details. 00014 * 00015 * You should have received a copy of the GNU Lesser General Public 00016 * License along with this library; if not, write to the Free Software 00017 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 00018 * MA 02110-1301 USA 00019 */ 00020 00021 #ifndef ALPHABET_H 00022 #define ALPHABET_H 00023 /*!\addtogroup alphabets Alphabet handling 00024 * Conversion between printable characters and binary symbols. 00025 */ 00026 /*@{*/ 00027 #ifdef __cplusplus 00028 #include <climits> 00029 #else /* !__cplusplus */ 00030 #include <limits.h> 00031 #endif /* __cplusplus */ 00032 #include "arrays.h" 00033 00034 /*! 00035 * \brief Use this type to denote a binary transformed sequence symbol. 00036 * 00037 * This type has been introduced for pure documentary reasons. There is no, 00038 * and probably never will be, any form of wide character or unicode support. 00039 * It would be safe to use <code>unsigned char</code> all the time, but using 00040 * \c fid_Symbol instead makes the code much more readable and 00041 * understandable. 00042 */ 00043 typedef unsigned char fid_Symbol; 00044 00045 /*! 00046 * \brief Format string for printing the numeric value of a #fid_Symbol. 00047 */ 00048 #define fid_SYMFMT "%hhu" 00049 00050 /*! 00051 * \brief An array of symbols, i.e., a sequence of dynamic size. 00052 */ 00053 fid_DYNARRAY_DECLARE(fid_ArraySymbol,fid_Symbol); 00054 00055 /*! 00056 * \brief Special symbol: sequence separator. 00057 */ 00058 #define fid_SEPARATOR ((fid_Symbol)UCHAR_MAX) 00059 00060 /*! 00061 * \brief Special symbol: wildcard character. 00062 */ 00063 #define fid_WILDCARD ((fid_Symbol)(UCHAR_MAX-1)) 00064 00065 /*! 00066 * \brief Special symbol: undefined symbol. 00067 */ 00068 #define fid_UNDEF ((fid_Symbol)(UCHAR_MAX-2)) 00069 00070 /*! 00071 * \brief Maximum allowed value for a symbol. 00072 * 00073 * Note that this is not the maximum \em number of symbols, but the maximum 00074 * allowed \em value of a symbol. 00075 */ 00076 #define fid_SYMBOLMAX ((fid_Symbol)(UCHAR_MAX-3)) 00077 00078 /*! 00079 * \brief Check whether symbol \p S is a sequence symbol or not. 00080 * 00081 * Note that undefined characters are considered regular symbols, but wildcards 00082 * and sequence separators are not. 00083 * 00084 * \param S A symbol of type #fid_Symbol. 00085 */ 00086 #define fid_REGULARSYMBOL(S) ((S) <= fid_UNDEF) 00087 00088 /*! 00089 * \brief The opposite of fid_REGULARSYMBOL(). 00090 */ 00091 #define fid_SPECIALSYMBOL(S) ((S) > fid_UNDEF) 00092 00093 /*! 00094 * \brief Definition of an alphabet. 00095 * 00096 * The structure basically consists of two arrays, one for mapping 00097 * characters to symbols, and one for mapping symbols to characters. Its 00098 * purpose is to map printable characters to binary symbols, which are 00099 * consecutive numbers. There are always as many or more characters than 00100 * there are symbols. At each position in fid_Alphabet::char_to_sym the 00101 * corresponding symbol is stored, and at each position in 00102 * fid_Alphabet::sym_to_char one of the corresponding character is stored. 00103 * The set of characters that are mapped to the same symbol are called a 00104 * character class. The latter is only useful for printing out sequences 00105 * to present them to the user, all algorithms should work on the binary 00106 * representation. 00107 * 00108 * Note that there is no entry for fid_SEPARATOR in the array of symbols. 00109 * Also note that up to fid_SYMBOLMAX symbols can be supported. The reason 00110 * to have UCHAR_MAX+1 entries in the array of characters is that there are 00111 * only 256 characters in the ASCII character set. It could easily be 00112 * extended to contain much more entries to support larger character sets, 00113 * but the array of symbols cannot grow as easily unless one would accept 00114 * that a symbol occupies more than one byte. A larger range of characters 00115 * would hence imply that they form up to fid_SYMBOLMAX character classes. 00116 */ 00117 typedef struct 00118 { 00119 fid_Uint16 num_of_chars; /*!<\brief Number of printable characters defined by 00120 * this alphabet. */ 00121 fid_Uint16 num_of_syms; /*!<\brief Number of symbols defined by this 00122 * alphabet. */ 00123 fid_Symbol char_to_sym[UCHAR_MAX+1]; /*!<\brief Mapping from printable 00124 * characters to binary symbols. */ 00125 char sym_to_char[fid_WILDCARD+1]; /*!<\brief Mapping from binary 00126 * symbols to printable characters. */ 00127 } fid_Alphabet; 00128 00129 /*! 00130 * \brief Transform binary symbol into its printable form, honoring specials. 00131 * 00132 * Use this macro whenver presenting alphabet encoded sequences to human 00133 * beings. The given symbol is transformed into its printable form, so that 00134 * undefined symbols and sequence separators are also printed correctly. 00135 * 00136 * \param ALPHA A pointer to a fid_Alphabet structure. 00137 * \param S A symbol encoded by and to be decoded via alphabet \p ALPHA. 00138 * 00139 * \returns A printable character. 00140 */ 00141 #define fid_PRINT_SYMBOL(ALPHA,S)\ 00142 ((S) == fid_UNDEF\ 00143 ?'~'\ 00144 :((S) == fid_SEPARATOR\ 00145 ?'|'\ 00146 :(ALPHA)->sym_to_char[(size_t)(S)])) 00147 00148 /*! 00149 * \brief Type cast printable character into unsigned array index. 00150 * 00151 * Converting a signed char into some bigger unsigned type can go very wrong 00152 * if not done carefully. This macro is careful. Use it for accessing 00153 * fid_Alphabet::char_to_sym. 00154 * 00155 * \param C A printable character, type \c char. 00156 * 00157 * \returns An array index. 00158 */ 00159 #define fid_CHAR_AS_INDEX(C) ((size_t)((unsigned char)(C))) 00160 00161 /*! 00162 * \brief Identifiers of built-in alphabets. 00163 * 00164 * An alphabet structure can be initialized by a library function to 00165 * define one of the standard alphabets supported by the library. 00166 */ 00167 typedef enum 00168 { 00169 fid_ALPHABET_DNA, /*!<\brief Standard DNA alphabet with common wildcards. */ 00170 fid_ALPHABET_RNA, /*!<\brief Standard RNA alphabet with common wildcards. */ 00171 fid_ALPHABET_DNARNA, /*!<\brief Mixed DNA and RNA alphabet, with \e T and 00172 * \e U being defined equivalent. */ 00173 fid_ALPHABET_PROTEIN /*!<\brief Standard amino acid alphabet. */ 00174 } fid_Alphabettype; 00175 00176 #ifdef __cplusplus 00177 extern "C" { 00178 #endif 00179 int fid_alphabet_init_from_speclines(fid_Alphabet *alpha, const char *str, 00180 size_t len, fid_Error *error); 00181 int fid_alphabet_init_from_specfile(fid_Alphabet *alpha, const char *filename, 00182 fid_Error *error); 00183 int fid_alphabet_init_from_string(fid_Alphabet *alpha, const char *string, 00184 size_t length, fid_Error *error); 00185 void fid_alphabet_init_standard(fid_Alphabet *alpha, fid_Alphabettype type); 00186 int fid_alphabet_add_wildcard(fid_Alphabet *alpha, char wcchar, 00187 fid_Error *error); 00188 size_t fid_alphabet_transform_string(const fid_Alphabet *alpha, 00189 const char *string, size_t length, 00190 fid_Symbol *transformed, 00191 int no_special_symbols); 00192 size_t fid_alphabet_transform_string_inplace(const fid_Alphabet *alpha, 00193 char *string, size_t length, 00194 int no_special_symbols); 00195 /*@null@*/ 00196 fid_Symbol *fid_alphabet_transform_string_new(const fid_Alphabet *alpha, 00197 const char *string, 00198 size_t length, 00199 int no_special_symbols, 00200 fid_Error *error); 00201 int fid_alphabet_write_to_file(const fid_Alphabet *alpha, 00202 const char *basefilename, 00203 fid_Error *error); 00204 void fid_alphabet_dump(const fid_Alphabet *alpha, FILE *stream); 00205 #ifdef __cplusplus 00206 } 00207 #endif 00208 /*@}*/ 00209 00210 #endif /* !ALPHABET_H */