Data Structures | |
struct | fid_Sequences |
A structure that holds references to persistent suffix array tables. More... | |
struct | fid_Sequencefileinfo |
Length information about sequence file. More... | |
Defines | |
#define | fid_TABLE_NONE ((fid_Tablerequest)0x00000000) |
Table request: nothing. | |
#define | fid_TABLE_TIS ((fid_Tablerequest)0x00000001) |
Table request: transformed input sequence. | |
#define | fid_TABLE_OIS ((fid_Tablerequest)0x00000002) |
Table request: original input sequence. | |
#define | fid_TABLE_DES ((fid_Tablerequest)0x00000004) |
Table request: sequence descriptions. | |
#define | fid_TABLES_ONLINE (fid_TABLE_TIS|fid_TABLE_OIS|fid_TABLE_DES) |
Table request: all tables related to online matching. | |
#define | fid_TABLES_ONLINE_MASK ((fid_Tablerequest)0x000000ff) |
Table request bit mask: bits related to online matching. | |
#define | fid_READ_SYMBOL(SEQS, I) (SEQS)->tisfile.content[I] |
Read symbol from transformed input sequences of a fid_Sequences structure. | |
#define | fid_SEQUENCE_DESCRIPTION_32(S, I) ((const char *)((S)->desfile.content+(S)->descriptions.v_uint32[I])) |
Return pointer to description of sequence I . | |
#define | fid_SEQUENCE_DESCRIPTION_64(S, I) ((const char *)((S)->desfile.content+(S)->descriptions.v_uint64[I])) |
#define | fid_SEQUENCE_DESCRIPTION_LENGTH_32(S, I) ((S)->descriptions.v_uint32[(I)+1]-(S)->descriptions.v_uint32[I]) |
Return the length of the description of sequence I . | |
#define | fid_SEQUENCE_DESCRIPTION_LENGTH_64(S, I) ((S)->descriptions.v_uint64[(I)+1]-(S)->descriptions.v_uint64[I]) |
Typedefs | |
typedef fid_Uint32 | fid_Tablerequest |
Type for encoding enhanced suffix array table requests. | |
typedef int(* | fid_Sequenceiterfun_32 )(const fid_Sequences *seqs, fid_Uint32 seqnum, const fid_Symbol *sequence, fid_Uint32 seqlen, void *user_data) |
Type of callback function expected by fid_sequences_iterate_32(). | |
typedef int(* | fid_Sequenceiterfun_64 )(const fid_Sequences *seqs, fid_Uint64 seqnum, const fid_Symbol *sequence, fid_Uint64 seqlen, void *user_data) |
Type of callback function expected by fid_sequences_iterate_64(). | |
Enumerations | |
enum | fid_Fileformat { fid_FORMAT_UNDEF, fid_FORMAT_FASTA, fid_FORMAT_UNIPROT } |
Enumeration of supported biological sequence file formats. More... | |
Functions | |
void | fid_sequences_init (fid_Sequences *seqs, fid_Uintsize uisize, const fid_Alphabet *alpha) |
Initialize an fid_Sequences structure. | |
int | fid_sequences_map (fid_Sequences *seqs, const char *basefilename, fid_Tablerequest tables, fid_Filenamebuffer *fnamebuf, fid_Error *error) |
Map online tables of enhanced suffix array into memory. | |
int | fid_sequences_realize (fid_Sequences *seqs, fid_Tablerequest tables, fid_Error *error) |
Update data fields in fid_Sequences structure according to mapped files. | |
int | fid_sequences_parse_from_file_to_file (const char *infilename, fid_Fileformat format, fid_Tablerequest tables, const char *basefilename, const fid_Alphabet *alpha, fid_Uintsize uisize, size_t *input_file_size, fid_Error *error) |
Parse sequence set from the given file, write output to files. | |
int | fid_sequences_parse_from_file_to_memory (const char *infilename, fid_Fileformat format, fid_Tablerequest tables, fid_Sequences *result, const fid_Alphabet *alpha, size_t padding, fid_Uintsize uisize, size_t *input_file_size, fid_Error *error) |
Parse sequence set from the given file to memory. | |
int | fid_sequences_parse_from_file_to_lengths (const char *infilename, fid_Fileformat format, fid_Sequencefileinfo *seqinfo, const fid_Alphabet *alpha, fid_Error *error) |
Parse sequence set from given file, but only gather statistics. | |
int | fid_sequences_parse_from_memory_to_file (const char *infilename, const char *buffer, size_t bufsize, fid_Fileformat format, fid_Tablerequest tables, const char *basefilename, const fid_Alphabet *alpha, fid_Uintsize uisize, fid_Error *error) |
Parse sequence set from the provided buffer, write output to files. | |
int | fid_sequences_parse_from_memory_to_memory (const char *infilename, const char *buffer, size_t bufsize, fid_Fileformat format, fid_Tablerequest tables, fid_Sequences *result, const fid_Alphabet *alpha, size_t padding, fid_Uintsize uisize, fid_Error *error) |
Parse sequence set from the provided buffer to memory. | |
int | fid_sequences_parse_from_memory_to_lengths (const char *infilename, const char *buffer, size_t bufsize, fid_Fileformat format, fid_Sequencefileinfo *seqinfo, const fid_Alphabet *alpha, fid_Error *error) |
Parse sequence set from the provided buffer, but only gather statistics. | |
fid_Uint32 | fid_sequences_offset_to_index_32 (const fid_Sequences *seqs, fid_Uint32 offset) |
32 bit version of fid_sequences_offset_to_index(). | |
fid_Uint64 | fid_sequences_offset_to_index_64 (const fid_Sequences *seqs, fid_Uint64 offset) |
64 bit version of fid_sequences_offset_to_index(). | |
void | fid_sequences_index_to_boundaries_32 (const fid_Sequences *seqs, fid_Uint32 seqindex, fid_Uint32 *left, fid_Uint32 *right) |
32 bit version of fid_sequences_index_to_boundaries(). | |
void | fid_sequences_index_to_boundaries_64 (const fid_Sequences *seqs, fid_Uint64 seqindex, fid_Uint64 *left, fid_Uint64 *right) |
64 bit version of fid_sequences_index_to_boundaries(). | |
void | fid_sequences_offset_to_boundaries_32 (const fid_Sequences *seqs, fid_Uint32 offset, fid_Uint32 *left, fid_Uint32 *right) |
32 bit version of fid_sequences_offset_to_boundaries(). | |
void | fid_sequences_offset_to_boundaries_64 (const fid_Sequences *seqs, fid_Uint64 offset, fid_Uint64 *left, fid_Uint64 *right) |
64 bit version of fid_sequences_offset_to_boundaries(). | |
int | fid_sequences_iterate_range_32 (const fid_Sequences *seqs, fid_Uint32 from, fid_Uint32 to, const fid_Sequenceiterfun_32 iterfun, void *user_data) |
32 bit version of fid_sequences_iterate_range(). | |
int | fid_sequences_iterate_range_64 (const fid_Sequences *seqs, fid_Uint64 from, fid_Uint64 to, const fid_Sequenceiterfun_64 iterfun, void *user_data) |
64 bit version of fid_sequences_iterate_range(). | |
int | fid_sequences_iterate_32 (const fid_Sequences *seqs, const fid_Sequenceiterfun_32 iterfun, void *user_data) |
32 bit version of fid_sequences_iterate(). | |
int | fid_sequences_iterate_64 (const fid_Sequences *seqs, const fid_Sequenceiterfun_64 iterfun, void *user_data) |
64 bit version of fid_sequences_iterate(). | |
void | fid_sequences_compute_distribution (fid_Sequences *seqs) |
Compute character distribution of given sequences. | |
void | fid_sequences_free (fid_Sequences *seqs) |
Unmap sequences and free memory. | |
void | fid_sequences_dump_range_32 (const fid_Symbol *seq, fid_Uint32 length, const fid_Alphabet *alpha, const char *str, int stop_at_separator, FILE *stream) |
32 bit version of fid_sequences_dump_range(). | |
void | fid_sequences_dump_range_64 (const fid_Symbol *seq, fid_Uint64 length, const fid_Alphabet *alpha, const char *str, int stop_at_separator, FILE *stream) |
64 bit version of fid_sequences_dump_range(). |
#define fid_TABLE_NONE ((fid_Tablerequest)0x00000000) |
Table request: nothing.
Definition at line 104 of file sequences.h.
Referenced by fid_create_online_files().
#define fid_TABLE_TIS ((fid_Tablerequest)0x00000001) |
Table request: transformed input sequence.
Definition at line 107 of file sequences.h.
Referenced by fid_create_online_files(), fid_sequences_map(), fid_sequences_realize(), fid_suffixarray_print(), and fid_tablerequest_to_string().
#define fid_TABLE_OIS ((fid_Tablerequest)0x00000002) |
Table request: original input sequence.
Definition at line 110 of file sequences.h.
Referenced by fid_create_online_files(), fid_sequences_map(), and fid_sequences_realize().
#define fid_TABLE_DES ((fid_Tablerequest)0x00000004) |
Table request: sequence descriptions.
Definition at line 113 of file sequences.h.
Referenced by fid_create_online_files(), fid_sequences_map(), and fid_sequences_parse_from_memory_to_file().
#define fid_TABLES_ONLINE (fid_TABLE_TIS|fid_TABLE_OIS|fid_TABLE_DES) |
Table request: all tables related to online matching.
Definition at line 116 of file sequences.h.
Referenced by fid_sequences_free(), and fid_tablerequest_to_string().
#define fid_TABLES_ONLINE_MASK ((fid_Tablerequest)0x000000ff) |
Table request bit mask: bits related to online matching.
Definition at line 119 of file sequences.h.
Referenced by fid_suffixarray_load_from_files(), and fid_suffixarray_realize().
#define fid_READ_SYMBOL | ( | SEQS, | |||
I | ) | (SEQS)->tisfile.content[I] |
Read symbol from transformed input sequences of a fid_Sequences structure.
SEQS | Pointer to a fid_Sequences structure. | |
I | Absolute sequence position. |
Returns the symbol at position I.
Definition at line 142 of file sequences.h.
Referenced by fid_suffixarray_find_embedded_interval(), fid_suffixarray_get_intervals(), and fid_suffixinterval_lcpvalue().
#define fid_SEQUENCE_DESCRIPTION_32 | ( | S, | |||
I | ) | ((const char *)((S)->desfile.content+(S)->descriptions.v_uint32[I])) |
Return pointer to description of sequence I.
S | Pointer to an fid_Sequences structure. | |
I | Sequence index. |
Returns a pointer to the description of sequence I.
Definition at line 153 of file sequences.h.
#define fid_SEQUENCE_DESCRIPTION_64 | ( | S, | |||
I | ) | ((const char *)((S)->desfile.content+(S)->descriptions.v_uint64[I])) |
#define fid_SEQUENCE_DESCRIPTION_LENGTH_32 | ( | S, | |||
I | ) | ((S)->descriptions.v_uint32[(I)+1]-(S)->descriptions.v_uint32[I]) |
Return the length of the description of sequence I.
S | Pointer to an fid_Sequences structure. | |
I | Sequence index. |
Returns the length of the description of sequence I.
Definition at line 168 of file sequences.h.
#define fid_SEQUENCE_DESCRIPTION_LENGTH_64 | ( | S, | |||
I | ) | ((S)->descriptions.v_uint64[(I)+1]-(S)->descriptions.v_uint64[I]) |
typedef fid_Uint32 fid_Tablerequest |
Type for encoding enhanced suffix array table requests.
When mapping an enhanced suffix array into memory, pass a value of this type to the corresponding function to tell it which tables it should read. This is a bit vector that must be initialized by OR'ing predefined values such as fid_TABLE_TIS or fid_TABLE_SUF. Always use this type, because it has a fixed size on all architectures.
Definition at line 101 of file sequences.h.
typedef int(* fid_Sequenceiterfun_32)(const fid_Sequences *seqs, fid_Uint32 seqnum, const fid_Symbol *sequence, fid_Uint32 seqlen, void *user_data) |
Type of callback function expected by fid_sequences_iterate_32().
To use the sequence iterator functions, the caller must declare a callback function corresponding to this type and pass the function pointer to the iterator function. For each sequence in a fid_Sequences structure, that callback function is called.
seqs | The structure that contains the processed sequences. | |
seqnum | Current sequence index. | |
sequence | Direct pointer to the current sequence. | |
seqlen | Length of the current sequence. | |
user_data | Arbitrary pointer passed through by the iterator functions to omit the need for global variables. |
Definition at line 194 of file sequences.h.
typedef int(* fid_Sequenceiterfun_64)(const fid_Sequences *seqs, fid_Uint64 seqnum, const fid_Symbol *sequence, fid_Uint64 seqlen, void *user_data) |
Type of callback function expected by fid_sequences_iterate_64().
Definition at line 203 of file sequences.h.
enum fid_Fileformat |
Enumeration of supported biological sequence file formats.
These values can be used to force the sequence parser to try a specific format instead of leaving the decision on how to read the data to the parser. However, enabling auto-detection by choosing fid_Fileformat::fid_FORMAT_UNDEF should be good enough in virtually any case.
fid_FORMAT_UNDEF | No specific file format. Use for auto-detection. |
fid_FORMAT_FASTA | FASTA file format. |
fid_FORMAT_UNIPROT | UniProt file format. |
Definition at line 67 of file sequences.h.
void fid_sequences_init | ( | fid_Sequences * | seqs, | |
fid_Uintsize | uisize, | |||
const fid_Alphabet * | alpha | |||
) |
Initialize an fid_Sequences structure.
seqs | The structure to be initialized. | |
uisize | Required word size. Some fields in fid_Sequences, especially the tables of sequence separator positions and sequence description separator positions, depend on how these data are stored on file. | |
alpha | The alphabet to be associated with this structure. It is legal to pass a NULL pointer. |
Definition at line 246 of file sequences.c.
References fid_Sequences::alpha, fid_Mappedfile::content, fid_Sequences::descriptions, fid_Sequences::desfile, fid_SWITCH48, fid_Sequences::num_of_sequences, fid_Sequences::oisfile, fid_Sequences::sdsfile, fid_Sequences::separators, fid_Sequences::sspfile, fid_Sequences::tisfile, fid_Sequences::total_length, fid_Sequences::uisize, fid_Uint48constptr::v_uint32, fid_Uint48::v_uint32, fid_Uint48constptr::v_uint64, and fid_Uint48::v_uint64.
Referenced by fid_create_online_files(), and fid_suffixarray_init().
int fid_sequences_map | ( | fid_Sequences * | seqs, | |
const char * | basefilename, | |||
fid_Tablerequest | tables, | |||
fid_Filenamebuffer * | fnamebuf, | |||
fid_Error * | error | |||
) |
Map online tables of enhanced suffix array into memory.
seqs | This structure will be associated with the mapped file. | |
basefilename | Basename from which filenames corresponding to the mapped files are generated. | |
tables | requested tables, online bits only. | |
fnamebuf | Filename buffer for generating filenames. If NULL is passed, then a local buffer will be used. | |
error | Error messages go here. |
Definition at line 313 of file sequences.c.
References fid_Mappedfile::content, fid_Sequences::desfile, fid_file_unmap(), fid_filenamebuffer_free(), fid_filenamebuffer_init_local(), fid_sequences_realize(), fid_TABLE_DES, fid_TABLE_OIS, fid_TABLE_TIS, fid_Sequences::oisfile, fid_Sequences::sdsfile, fid_Sequences::sspfile, and fid_Sequences::tisfile.
Referenced by fid_suffixarray_load_from_files().
int fid_sequences_realize | ( | fid_Sequences * | seqs, | |
fid_Tablerequest | tables, | |||
fid_Error * | error | |||
) |
Update data fields in fid_Sequences structure according to mapped files.
All data fields whose correct values can be derived from the sizes of the associated mapped files are updated by this function (e.g., fid_Sequences::num_of_sequences can be derived from the size of the file associated with fid_Sequences::sspfile). No function should set these fields manually, just call this function instead. A table request can be defined to ask the function to look at only some of the files and update only values derived from these.
seqs | The structure to be updated. | |
tables | Update only values derivable from the tables specified in this request. To update all values, pass fid_TABLES_ONLINE. | |
error | Error messages go here. |
Definition at line 454 of file sequences.c.
References fid_Mappedfile::content, fid_error_throw(), fid_SWITCH48, fid_TABLE_OIS, fid_TABLE_TIS, fid_Mappedfile::occupied, fid_Sequences::oisfile, fid_Sequences::tisfile, and fid_Sequences::uisize.
Referenced by fid_sequences_free(), fid_sequences_map(), and fid_suffixarray_realize().
int fid_sequences_parse_from_file_to_file | ( | const char * | infilename, | |
fid_Fileformat | format, | |||
fid_Tablerequest | tables, | |||
const char * | basefilename, | |||
const fid_Alphabet * | alpha, | |||
fid_Uintsize | uisize, | |||
size_t * | input_file_size, | |||
fid_Error * | error | |||
) |
Parse sequence set from the given file, write output to files.
For a more detailed description of this function, see fid_sequences_parse_from_memory_to_file() since this function depends on that function.
infilename | The name of a file that contains biological data in some textual format such as FASTA. | |
format | The predetermined file format. If auto-detection should be tried, pass fid_Fileformat::fid_FORMAT_UNDEF. | |
tables | Bitvector representing a request which tables are to be generated. This value should be the result of OR'ing one or more of fid_TABLE_TIS, fid_TABLE_OIS, and fid_TABLE_DES. Other requests are ignored and do not generate an error. | |
basefilename | The base name of the generated tables. This should be a simple string suitable to generate a file name from. Suffixes ".tis", ".ois", ".sds", ".des", and ".ssp" are appended to this string to generate the final file names of the tables. | |
alpha | The alphabet that is used to transform the textual data into binary. | |
uisize | Sequence separator positions and description separator positions are stored as integers of this size. | |
input_file_size | If not NULL , then the total size of the input file infilename is returned in this argument. | |
error | Error messages go here. |
Definition at line 611 of file sequences.c.
References fid_sequences_parse_from_memory_to_file().
int fid_sequences_parse_from_file_to_memory | ( | const char * | infilename, | |
fid_Fileformat | format, | |||
fid_Tablerequest | tables, | |||
fid_Sequences * | seqs, | |||
const fid_Alphabet * | alpha, | |||
size_t | padding, | |||
fid_Uintsize | uisize, | |||
size_t * | input_file_size, | |||
fid_Error * | error | |||
) |
Parse sequence set from the given file to memory.
For a more detailed description of this function, see fid_sequences_parse_from_memory_to_memory() since this function depends on that function.
infilename | The name of a file that contains biological data in some textual format such as FASTA. | |
format | The predetermined file format. If auto-detection should be tried, pass fid_Fileformat::fid_FORMAT_UNDEF. | |
tables | Bitvector representing a request which tables are to be generated. This value should be the result of OR'ing one or more of fid_TABLE_TIS, fid_TABLE_OIS, and fid_TABLE_DES. Other requests are ignored and do not generate an error. | |
seqs | The data structure that stores the parsed tables. | |
alpha | The alphabet that is used to transform the textual data into binary. | |
padding | Number of symbols to append to the transformed sequence data. This is useful for algorithms that can be simplified under the assumption that the sequence data extends a certain number of symbols beyond the actual end of sequence. The symbols appended to the end are fid_SEPARATOR symbols. The value of padding will not be reflected in the lengths stored in seqs. | |
uisize | Sequence separator positions and description separator positions are stored as integers of this size. | |
input_file_size | If not NULL , then the total size of the input file infilename is returned in this argument. | |
error | Error messages go here. |
Definition at line 686 of file sequences.c.
References fid_sequences_parse_from_memory_to_memory().
int fid_sequences_parse_from_file_to_lengths | ( | const char * | infilename, | |
fid_Fileformat | format, | |||
fid_Sequencefileinfo * | seqinfo, | |||
const fid_Alphabet * | alpha, | |||
fid_Error * | error | |||
) |
Parse sequence set from given file, but only gather statistics.
For a more detailed description of this function, see fid_sequences_parse_from_memory_to_lengths() since this function depends on that function.
infilename | The name of a file that contains biological data in some textual format such as FASTA. | |
format | The predetermined file format. If auto-detection should be tried, pass fid_Fileformat::fid_FORMAT_UNDEF. | |
seqinfo | The sizes of the transformed sequence data, descriptions, etc., are returned here. | |
alpha | The alphabet that is used to transform the textual data into binary. | |
error | Error messages go here. |
Definition at line 747 of file sequences.c.
References fid_sequences_parse_from_memory_to_lengths().
int fid_sequences_parse_from_memory_to_file | ( | const char * | infilename, | |
const char * | buffer, | |||
size_t | bufsize, | |||
fid_Fileformat | format, | |||
fid_Tablerequest | tables, | |||
const char * | basefilename, | |||
const fid_Alphabet * | alpha, | |||
fid_Uintsize | uisize, | |||
fid_Error * | error | |||
) |
Parse sequence set from the provided buffer, write output to files.
This function reads the content of buffer, which contains a set of biological sequences in some supported format, and writes it into a more convenient, binary form as a set of tables. No index generation takes place here; this is only the preprocessing of online data. Auto-detection of the input file format is attempted, but the format can also be predetermined. A memory mapped file provides sufficient input to this function. The output is a set of files on disk corresponding to the entities parsed from the input.
Internally, the set of sequences is parsed into an fid_Sequences structure, whose file members fid_Sequences::tisfile, fid_Sequences::oisfile, fid_Sequences::sspfile, fid_Sequences::desfile, and fid_Sequences::sdsfile (depending on the tables request) are used to construct the corresponding table. That is, this function does not allocate RAM, but solely relies on memory mapped files. It creates growing files directly on disk rather than taking the risk to be swapped out by the OS. When the function returns, all files will be closed again. To operate on the preprocessed data (e.g., to construct an enhanced suffix array), map it into memory again.
infilename | The name of the input file (for error messages only). | |
buffer | The buffer that contains biological data in some textual format such as FASTA. | |
bufsize | The size of buffer in bytes. By this, buffer is not required to be zero-terminated. This value should be positive. | |
format | The predetermined file format. If auto-detection should be tried, pass fid_Fileformat::fid_FORMAT_UNDEF. | |
tables | Bitvector representing a request which tables are to be generated. This value should be the result of OR'ing one or more of fid_TABLE_TIS, fid_TABLE_OIS, and fid_TABLE_DES. Other requests are ignored and do not generate an error. | |
basefilename | The base name of the generated tables. This should be a simple string suitable to generate a file name from. Suffixes ".tis", ".ois", ".ssp", ".des", and ".sds" are appended to this string to generate the final file names of the tables. | |
alpha | The alphabet that is used to transform the textual data into binary. | |
uisize | Sequence separator positions and description separator positions are stored as integers of this size. | |
error | Error messages go here. |
Definition at line 874 of file sequences.c.
References fid_Sequences::desfile, fid_create_online_files(), fid_file_unmap(), fid_SWITCH48, fid_TABLE_DES, fid_Sequences::num_of_sequences, fid_Sequences::oisfile, fid_Sequences::sdsfile, fid_Sequences::sspfile, fid_Sequences::tisfile, fid_Uint48::v_uint32, and fid_Uint48::v_uint64.
Referenced by fid_sequences_parse_from_file_to_file().
int fid_sequences_parse_from_memory_to_memory | ( | const char * | infilename, | |
const char * | buffer, | |||
size_t | bufsize, | |||
fid_Fileformat | format, | |||
fid_Tablerequest | tables, | |||
fid_Sequences * | seqs, | |||
const fid_Alphabet * | alpha, | |||
size_t | padding, | |||
fid_Uintsize | uisize, | |||
fid_Error * | error | |||
) |
Parse sequence set from the provided buffer to memory.
This function reads the content of buffer, which contains a set of biological sequences in some supported format, and writes it into a more convenient, binary form as a set of tables. No index generation takes place here; this is only the preprocessing of online data. Auto-detection of the input file format is attempted, but the format can also be predetermined. A memory mapped file provides sufficient input to this function. The output of this function is binary identical to the output of fid_sequences_parse_from_memory_to_file(), only that it is not written to files directly.
The output is a set of dynamic arrays corresponding to the entities parsed from the input, stored in a fid_Sequences structure, seqs. The fid_Mappedfile::content pointers of the files represented by seqs are pointers to allocated memory, so they are not to be unmapped directly. The sequences can be flushed to files after parsing, and must be freed via fid_sequences_free() when they are not needed anymore.
infilename | The name of the input file (for error messages only). | |
buffer | The buffer that contains biological data in some textual format such as FASTA. | |
bufsize | The size of buffer in bytes. By this, buffer is not required to be zero-terminated. This value should be positive. | |
format | The predetermined file format. If auto-detection should be tried, pass fid_Fileformat::fid_FORMAT_UNDEF. | |
tables | Bitvector representing a request which tables are to be generated. This value should be the result of OR'ing one or more of fid_TABLE_TIS, fid_TABLE_OIS, and fid_TABLE_DES. Other requests are ignored and do not generate an error. | |
seqs | The parsed sequences. | |
alpha | The alphabet that is used to transform the textual data into binary. | |
padding | Number of symbols to append to the transformed sequence data. This is useful for algorithms that can be simplified under the assumption that the sequence data extends a certain number of symbols beyond the actual end of sequence. The symbols appended to the end are fid_SEPARATOR symbols. The value of padding will not be reflected in the lengths stored in seqs. | |
uisize | Sequence separator positions and description separator positions are stored as integers of this size. | |
error | Error messages go here. |
Definition at line 977 of file sequences.c.
References fid_SWITCH48.
Referenced by fid_sequences_parse_from_file_to_memory().
int fid_sequences_parse_from_memory_to_lengths | ( | const char * | infilename, | |
const char * | buffer, | |||
size_t | bufsize, | |||
fid_Fileformat | format, | |||
fid_Sequencefileinfo * | seqinfo, | |||
const fid_Alphabet * | alpha, | |||
fid_Error * | error | |||
) |
Parse sequence set from the provided buffer, but only gather statistics.
This function just parses the given buffer, but does not store the parsed sequences anywhere. Instead, the function counts the number of sequences, determines its total length, etc. (see fid_Sequencefileinfo), and returns these data to the caller. The main purpose of this function is to determine if the length of the raw sequence data as stored in enhanced suffix arrays can be represented in 32 bit integers or not.
infilename | The name of the input file (for error messages only). | |
buffer | The buffer that contains biological data in some textual format such as FASTA. | |
bufsize | The size of buffer in bytes. By this, buffer is not required to be zero-terminated. This value should be positive. | |
format | The predetermined file format. If auto-detection should be tried, pass fid_Fileformat::fid_FORMAT_UNDEF. | |
seqinfo | The information about the parsed sequences. | |
alpha | The alphabet that is used to transform the textual data into binary. | |
error | Error messages go here. |
Definition at line 1079 of file sequences.c.
References fid_Sequencefileinfo::input_file_size, and fid_Sequencefileinfo::num_of_sequences.
Referenced by fid_sequences_parse_from_file_to_lengths().
fid_Uint32 fid_sequences_offset_to_index_32 | ( | const fid_Sequences * | seqs, | |
fid_Uint32 | offset | |||
) |
32 bit version of fid_sequences_offset_to_index().
Determine sequence index from sequence offset.
It is an error to pass the position of a sequence separator as offset.
seqs | Set of sequences. | |
offset | Offset into the concatenated set of sequences. |
fid_Uint64 fid_sequences_offset_to_index_64 | ( | const fid_Sequences * | seqs, | |
fid_Uint64 | offset | |||
) |
64 bit version of fid_sequences_offset_to_index().
Determine sequence index from sequence offset.
It is an error to pass the position of a sequence separator as offset.
seqs | Set of sequences. | |
offset | Offset into the concatenated set of sequences. |
void fid_sequences_index_to_boundaries_32 | ( | const fid_Sequences * | seqs, | |
fid_Uint32 | seqindex, | |||
fid_Uint32 * | left, | |||
fid_Uint32 * | right | |||
) |
32 bit version of fid_sequences_index_to_boundaries().
Determine sequence boundaries from sequence index.
seqs | Set of sequences. | |
seqindex | Sequence index; the first sequence is identified by 0. | |
left,right | The sequence boundaries will be written to these integers. Note that sequence separators are excluded from the returned sequence. |
void fid_sequences_index_to_boundaries_64 | ( | const fid_Sequences * | seqs, | |
fid_Uint64 | seqindex, | |||
fid_Uint64 * | left, | |||
fid_Uint64 * | right | |||
) |
64 bit version of fid_sequences_index_to_boundaries().
Determine sequence boundaries from sequence index.
seqs | Set of sequences. | |
seqindex | Sequence index; the first sequence is identified by 0. | |
left,right | The sequence boundaries will be written to these integers. Note that sequence separators are excluded from the returned sequence. |
void fid_sequences_offset_to_boundaries_32 | ( | const fid_Sequences * | seqs, | |
fid_Uint32 | offset, | |||
fid_Uint32 * | left, | |||
fid_Uint32 * | right | |||
) |
32 bit version of fid_sequences_offset_to_boundaries().
Determine sequence boundaries from sequence offset.
This function performs a binary search to find the sequence index first, so use fid_sequences_index_to_boundaries() instead if you already know the sequence index. It is an error to pass the position of a sequence separator as offset.
seqs | Set of sequences. | |
offset | Offset into the concatenated set of sequences. | |
left,right | The sequence boundaries will be written to these integers. Note that sequence separators are excluded from the returned sequence. |
void fid_sequences_offset_to_boundaries_64 | ( | const fid_Sequences * | seqs, | |
fid_Uint64 | offset, | |||
fid_Uint64 * | left, | |||
fid_Uint64 * | right | |||
) |
64 bit version of fid_sequences_offset_to_boundaries().
Determine sequence boundaries from sequence offset.
This function performs a binary search to find the sequence index first, so use fid_sequences_index_to_boundaries() instead if you already know the sequence index. It is an error to pass the position of a sequence separator as offset.
seqs | Set of sequences. | |
offset | Offset into the concatenated set of sequences. | |
left,right | The sequence boundaries will be written to these integers. Note that sequence separators are excluded from the returned sequence. |
int fid_sequences_iterate_range_32 | ( | const fid_Sequences * | seqs, | |
fid_Uint32 | from, | |||
fid_Uint32 | to, | |||
const fid_Sequenceiterfun_32 | iterfun, | |||
void * | user_data | |||
) |
32 bit version of fid_sequences_iterate_range().
Call a callback function for each sequence in given range.
For each sequence stored in seqs in the range specified by sequence indices from and to, call the callback function iterfun. The iteration can be interrupted by the callback function returning a non-zero value.
seqs | The sequences to be iterated over. | |
from,to | The range of sequences to be processed. These values must be valid, i.e., from must not be greater than to , and none of them must be greater than fid_Sequences::num_of_sequences-1, behavior is undefined in these cases. | |
iterfun | The callback function. | |
user_data | Arbitrary pointer passed through to iterfun . |
Returns 0 if all sequences in the range were processed, or the value returned by iterfun otherwise.
int fid_sequences_iterate_range_64 | ( | const fid_Sequences * | seqs, | |
fid_Uint64 | from, | |||
fid_Uint64 | to, | |||
const fid_Sequenceiterfun_64 | iterfun, | |||
void * | user_data | |||
) |
64 bit version of fid_sequences_iterate_range().
Call a callback function for each sequence in given range.
For each sequence stored in seqs in the range specified by sequence indices from and to, call the callback function iterfun. The iteration can be interrupted by the callback function returning a non-zero value.
seqs | The sequences to be iterated over. | |
from,to | The range of sequences to be processed. These values must be valid, i.e., from must not be greater than to , and none of them must be greater than fid_Sequences::num_of_sequences-1, behavior is undefined in these cases. | |
iterfun | The callback function. | |
user_data | Arbitrary pointer passed through to iterfun . |
Returns 0 if all sequences in the range were processed, or the value returned by iterfun otherwise.
int fid_sequences_iterate_32 | ( | const fid_Sequences * | seqs, | |
const fid_Sequenceiterfun_32 | iterfun, | |||
void * | user_data | |||
) |
32 bit version of fid_sequences_iterate().
Call a callback function for each sequence.
For each sequence stored in seqs, call the callback function iterfun. The iteration can be interrupted by the callback function returning a non-zero value.
seqs | The sequences to be iterated over. | |
iterfun | The callback function. | |
user_data | Arbitrary pointer passed through to iterfun. |
Returns 0 if the iteration was not interrupted, or the non-zero value returned by iterfun otherwise.
int fid_sequences_iterate_64 | ( | const fid_Sequences * | seqs, | |
const fid_Sequenceiterfun_64 | iterfun, | |||
void * | user_data | |||
) |
64 bit version of fid_sequences_iterate().
Call a callback function for each sequence.
For each sequence stored in seqs, call the callback function iterfun. The iteration can be interrupted by the callback function returning a non-zero value.
seqs | The sequences to be iterated over. | |
iterfun | The callback function. | |
user_data | Arbitrary pointer passed through to iterfun. |
Returns 0 if the iteration was not interrupted, or the non-zero value returned by iterfun otherwise.
void fid_sequences_compute_distribution | ( | fid_Sequences * | seqs | ) |
Compute character distribution of given sequences.
The fid_Sequences::distribution will be filled by this function, such that each entry s contains the relative frequency of symbol s.
The length of the sequence is corrected by the number of sequence separators present in the input sequence, so these will not be taken into account. Note that the entry for wildcards will be filled twice, once at index fid_WILDCARD, and once at the first index after the last normal symbol. The entries for separators and undefined characters will both be set to 0.0.
Note that fid_suffixarray_compute_distribution() is faster than this function, especially on large datasets.
seqs | The sequences whose character distribution should be determined. |
Definition at line 529 of file sequences.c.
References fid_Sequences::alpha, fid_Mappedfile::content, fid_Sequences::distribution, fid_SWITCH48, fid_WILDCARD, fid_Sequences::num_of_sequences, fid_Alphabet::num_of_syms, fid_Mappedfile::occupied, fid_Sequences::tisfile, fid_Sequences::uisize, fid_Uint48::v_uint32, and fid_Uint48::v_uint64.
void fid_sequences_free | ( | fid_Sequences * | seqs | ) |
Unmap sequences and free memory.
seqs | The structure to be freed. |
Definition at line 570 of file sequences.c.
References fid_Sequences::desfile, fid_file_unmap(), fid_sequences_realize(), fid_TABLES_ONLINE, fid_Sequences::oisfile, fid_Sequences::sdsfile, fid_Sequences::sspfile, and fid_Sequences::tisfile.
Referenced by fid_suffixarray_free(), and fid_suffixarray_load_from_files().
void fid_sequences_dump_range_32 | ( | const fid_Symbol * | seq, | |
fid_Uint32 | length, | |||
const fid_Alphabet * | alpha, | |||
const char * | str, | |||
int | stop_at_separator, | |||
FILE * | stream | |||
) |
32 bit version of fid_sequences_dump_range().
Print piece of sequence to stream.
seq | Some binary encoded sequence of symbols. | |
length | The number of symbols to be read from seq and printed to stream. | |
alpha | Alphabet used to transform symbols to printable characters. | |
str | If not NULL, then print this ASCII string in front of the transformed sequence (useful to distinguish multiple sequences). | |
stop_at_separator | If true, then stop printing when a sequence separator is encountered. If false, print some special character for each sequence separator encountered and continue. | |
stream | An output stream to which the sequence is printed. If NULL, nothing will be printed. |
void fid_sequences_dump_range_64 | ( | const fid_Symbol * | seq, | |
fid_Uint64 | length, | |||
const fid_Alphabet * | alpha, | |||
const char * | str, | |||
int | stop_at_separator, | |||
FILE * | stream | |||
) |
64 bit version of fid_sequences_dump_range().
Print piece of sequence to stream.
seq | Some binary encoded sequence of symbols. | |
length | The number of symbols to be read from seq and printed to stream. | |
alpha | Alphabet used to transform symbols to printable characters. | |
str | If not NULL, then print this ASCII string in front of the transformed sequence (useful to distinguish multiple sequences). | |
stop_at_separator | If true, then stop printing when a sequence separator is encountered. If false, print some special character for each sequence separator encountered and continue. | |
stream | An output stream to which the sequence is printed. If NULL, nothing will be printed. |