Binary encoded multiple sequences


Detailed Description

This group contains data structures and functions for handling sequences.

Data Structures

struct  fid_Sequences
 A structure that holds references to persistent suffix array tables. More...
struct  fid_Sequencefileinfo
 Length information about sequence file. More...

Defines

#define fid_TABLE_NONE   ((fid_Tablerequest)0x00000000)
 Table request: nothing.
#define fid_TABLE_TIS   ((fid_Tablerequest)0x00000001)
 Table request: transformed input sequence.
#define fid_TABLE_OIS   ((fid_Tablerequest)0x00000002)
 Table request: original input sequence.
#define fid_TABLE_DES   ((fid_Tablerequest)0x00000004)
 Table request: sequence descriptions.
#define fid_TABLES_ONLINE   (fid_TABLE_TIS|fid_TABLE_OIS|fid_TABLE_DES)
 Table request: all tables related to online matching.
#define fid_TABLES_ONLINE_MASK   ((fid_Tablerequest)0x000000ff)
 Table request bit mask: bits related to online matching.
#define fid_READ_SYMBOL(SEQS, I)   (SEQS)->tisfile.content[I]
 Read symbol from transformed input sequences of a fid_Sequences structure.
#define fid_SEQUENCE_DESCRIPTION_32(S, I)   ((const char *)((S)->desfile.content+(S)->descriptions.v_uint32[I]))
 Return pointer to description of sequence I.
#define fid_SEQUENCE_DESCRIPTION_64(S, I)   ((const char *)((S)->desfile.content+(S)->descriptions.v_uint64[I]))
#define fid_SEQUENCE_DESCRIPTION_LENGTH_32(S, I)   ((S)->descriptions.v_uint32[(I)+1]-(S)->descriptions.v_uint32[I])
 Return the length of the description of sequence I.
#define fid_SEQUENCE_DESCRIPTION_LENGTH_64(S, I)   ((S)->descriptions.v_uint64[(I)+1]-(S)->descriptions.v_uint64[I])

Typedefs

typedef fid_Uint32 fid_Tablerequest
 Type for encoding enhanced suffix array table requests.
typedef int(* fid_Sequenceiterfun_32 )(const fid_Sequences *seqs, fid_Uint32 seqnum, const fid_Symbol *sequence, fid_Uint32 seqlen, void *user_data)
 Type of callback function expected by fid_sequences_iterate_32().
typedef int(* fid_Sequenceiterfun_64 )(const fid_Sequences *seqs, fid_Uint64 seqnum, const fid_Symbol *sequence, fid_Uint64 seqlen, void *user_data)
 Type of callback function expected by fid_sequences_iterate_64().

Enumerations

enum  fid_Fileformat { fid_FORMAT_UNDEF, fid_FORMAT_FASTA, fid_FORMAT_UNIPROT }
 Enumeration of supported biological sequence file formats. More...

Functions

void fid_sequences_init (fid_Sequences *seqs, fid_Uintsize uisize, const fid_Alphabet *alpha)
 Initialize an fid_Sequences structure.
int fid_sequences_map (fid_Sequences *seqs, const char *basefilename, fid_Tablerequest tables, fid_Filenamebuffer *fnamebuf, fid_Error *error)
 Map online tables of enhanced suffix array into memory.
int fid_sequences_realize (fid_Sequences *seqs, fid_Tablerequest tables, fid_Error *error)
 Update data fields in fid_Sequences structure according to mapped files.
int fid_sequences_parse_from_file_to_file (const char *infilename, fid_Fileformat format, fid_Tablerequest tables, const char *basefilename, const fid_Alphabet *alpha, fid_Uintsize uisize, size_t *input_file_size, fid_Error *error)
 Parse sequence set from the given file, write output to files.
int fid_sequences_parse_from_file_to_memory (const char *infilename, fid_Fileformat format, fid_Tablerequest tables, fid_Sequences *result, const fid_Alphabet *alpha, size_t padding, fid_Uintsize uisize, size_t *input_file_size, fid_Error *error)
 Parse sequence set from the given file to memory.
int fid_sequences_parse_from_file_to_lengths (const char *infilename, fid_Fileformat format, fid_Sequencefileinfo *seqinfo, const fid_Alphabet *alpha, fid_Error *error)
 Parse sequence set from given file, but only gather statistics.
int fid_sequences_parse_from_memory_to_file (const char *infilename, const char *buffer, size_t bufsize, fid_Fileformat format, fid_Tablerequest tables, const char *basefilename, const fid_Alphabet *alpha, fid_Uintsize uisize, fid_Error *error)
 Parse sequence set from the provided buffer, write output to files.
int fid_sequences_parse_from_memory_to_memory (const char *infilename, const char *buffer, size_t bufsize, fid_Fileformat format, fid_Tablerequest tables, fid_Sequences *result, const fid_Alphabet *alpha, size_t padding, fid_Uintsize uisize, fid_Error *error)
 Parse sequence set from the provided buffer to memory.
int fid_sequences_parse_from_memory_to_lengths (const char *infilename, const char *buffer, size_t bufsize, fid_Fileformat format, fid_Sequencefileinfo *seqinfo, const fid_Alphabet *alpha, fid_Error *error)
 Parse sequence set from the provided buffer, but only gather statistics.
fid_Uint32 fid_sequences_offset_to_index_32 (const fid_Sequences *seqs, fid_Uint32 offset)
 32 bit version of fid_sequences_offset_to_index().
fid_Uint64 fid_sequences_offset_to_index_64 (const fid_Sequences *seqs, fid_Uint64 offset)
 64 bit version of fid_sequences_offset_to_index().
void fid_sequences_index_to_boundaries_32 (const fid_Sequences *seqs, fid_Uint32 seqindex, fid_Uint32 *left, fid_Uint32 *right)
 32 bit version of fid_sequences_index_to_boundaries().
void fid_sequences_index_to_boundaries_64 (const fid_Sequences *seqs, fid_Uint64 seqindex, fid_Uint64 *left, fid_Uint64 *right)
 64 bit version of fid_sequences_index_to_boundaries().
void fid_sequences_offset_to_boundaries_32 (const fid_Sequences *seqs, fid_Uint32 offset, fid_Uint32 *left, fid_Uint32 *right)
 32 bit version of fid_sequences_offset_to_boundaries().
void fid_sequences_offset_to_boundaries_64 (const fid_Sequences *seqs, fid_Uint64 offset, fid_Uint64 *left, fid_Uint64 *right)
 64 bit version of fid_sequences_offset_to_boundaries().
int fid_sequences_iterate_range_32 (const fid_Sequences *seqs, fid_Uint32 from, fid_Uint32 to, const fid_Sequenceiterfun_32 iterfun, void *user_data)
 32 bit version of fid_sequences_iterate_range().
int fid_sequences_iterate_range_64 (const fid_Sequences *seqs, fid_Uint64 from, fid_Uint64 to, const fid_Sequenceiterfun_64 iterfun, void *user_data)
 64 bit version of fid_sequences_iterate_range().
int fid_sequences_iterate_32 (const fid_Sequences *seqs, const fid_Sequenceiterfun_32 iterfun, void *user_data)
 32 bit version of fid_sequences_iterate().
int fid_sequences_iterate_64 (const fid_Sequences *seqs, const fid_Sequenceiterfun_64 iterfun, void *user_data)
 64 bit version of fid_sequences_iterate().
void fid_sequences_compute_distribution (fid_Sequences *seqs)
 Compute character distribution of given sequences.
void fid_sequences_free (fid_Sequences *seqs)
 Unmap sequences and free memory.
void fid_sequences_dump_range_32 (const fid_Symbol *seq, fid_Uint32 length, const fid_Alphabet *alpha, const char *str, int stop_at_separator, FILE *stream)
 32 bit version of fid_sequences_dump_range().
void fid_sequences_dump_range_64 (const fid_Symbol *seq, fid_Uint64 length, const fid_Alphabet *alpha, const char *str, int stop_at_separator, FILE *stream)
 64 bit version of fid_sequences_dump_range().

Define Documentation

#define fid_TABLE_NONE   ((fid_Tablerequest)0x00000000)

Table request: nothing.

Definition at line 104 of file sequences.h.

Referenced by fid_create_online_files().

#define fid_TABLE_TIS   ((fid_Tablerequest)0x00000001)

Table request: transformed input sequence.

Definition at line 107 of file sequences.h.

Referenced by fid_create_online_files(), fid_sequences_map(), fid_sequences_realize(), fid_suffixarray_print(), and fid_tablerequest_to_string().

#define fid_TABLE_OIS   ((fid_Tablerequest)0x00000002)

Table request: original input sequence.

Definition at line 110 of file sequences.h.

Referenced by fid_create_online_files(), fid_sequences_map(), and fid_sequences_realize().

#define fid_TABLE_DES   ((fid_Tablerequest)0x00000004)

Table request: sequence descriptions.

Definition at line 113 of file sequences.h.

Referenced by fid_create_online_files(), fid_sequences_map(), and fid_sequences_parse_from_memory_to_file().

#define fid_TABLES_ONLINE   (fid_TABLE_TIS|fid_TABLE_OIS|fid_TABLE_DES)

Table request: all tables related to online matching.

Definition at line 116 of file sequences.h.

Referenced by fid_sequences_free(), and fid_tablerequest_to_string().

#define fid_TABLES_ONLINE_MASK   ((fid_Tablerequest)0x000000ff)

Table request bit mask: bits related to online matching.

Definition at line 119 of file sequences.h.

Referenced by fid_suffixarray_load_from_files(), and fid_suffixarray_realize().

#define fid_READ_SYMBOL ( SEQS,
 I )     (SEQS)->tisfile.content[I]

Read symbol from transformed input sequences of a fid_Sequences structure.

Parameters:
SEQS Pointer to a fid_Sequences structure.
I Absolute sequence position.
Returns:
The symbol at position I.

Definition at line 142 of file sequences.h.

Referenced by fid_suffixarray_find_embedded_interval(), fid_suffixarray_get_intervals(), and fid_suffixinterval_lcpvalue().

#define fid_SEQUENCE_DESCRIPTION_32 ( S,
 I )     ((const char *)((S)->desfile.content+(S)->descriptions.v_uint32[I]))

Return pointer to description of sequence I.

Parameters:
S Pointer to an fid_Sequences structure.
I Sequence index.
Returns:
Pointer to the description of sequence I.

Definition at line 153 of file sequences.h.

#define fid_SEQUENCE_DESCRIPTION_64 ( S,
 I )     ((const char *)((S)->desfile.content+(S)->descriptions.v_uint64[I]))

See also:
fid_SEQUENCE_DESCRIPTION_32()

Definition at line 157 of file sequences.h.

#define fid_SEQUENCE_DESCRIPTION_LENGTH_32 ( S,
 I )     ((S)->descriptions.v_uint32[(I)+1]-(S)->descriptions.v_uint32[I])

Return the length of the description of sequence I.

Parameters:
S Pointer to an fid_Sequences structure.
I Sequence index.
Returns:
Length of the description of sequence I.

Definition at line 168 of file sequences.h.

#define fid_SEQUENCE_DESCRIPTION_LENGTH_64 ( S,
 I )     ((S)->descriptions.v_uint64[(I)+1]-(S)->descriptions.v_uint64[I])

See also:
fid_SEQUENCE_DESCRIPTION_LENGTH_32()

Definition at line 172 of file sequences.h.


Typedef Documentation

typedef fid_Uint32 fid_Tablerequest

Type for encoding enhanced suffix array table requests.

When mapping an enhanced suffix array into memory, pass a value of this type to the corresponding function to tell it which tables it should read. This is a bit vector that must be initialized by OR'ing predefined values such as fid_TABLE_TIS or fid_TABLE_SUF. Always use this type, because it has a fixed size on all architectures.

Definition at line 101 of file sequences.h.

typedef int(* fid_Sequenceiterfun_32)(const fid_Sequences *seqs, fid_Uint32 seqnum, const fid_Symbol *sequence, fid_Uint32 seqlen, void *user_data)

Type of callback function expected by fid_sequences_iterate_32().

To use the sequence iterator functions, the caller must declare a callback function corresponding to this type and pass the function pointer to the iterator function. That callback function is called for each sequence in a fid_Sequences structure.

Parameters:
seqs The structure that contains the processed sequences.
seqnum Current sequence index.
sequence Direct pointer to the current sequence.
seqlen Length of the current sequence.
user_data Arbitrary pointer passed through by the iterator functions to avoid the need for global variables.
Returns:
0 to continue iterating over sequences, any other value to stop.
See also:
fid_Sequenceiterfun_64

Definition at line 194 of file sequences.h.

typedef int(* fid_Sequenceiterfun_64)(const fid_Sequences *seqs, fid_Uint64 seqnum, const fid_Symbol *sequence, fid_Uint64 seqlen, void *user_data)

Type of callback function expected by fid_sequences_iterate_64().

See also:
fid_Sequenceiterfun_32

Definition at line 203 of file sequences.h.


Enumeration Type Documentation

enum fid_Fileformat

Enumeration of supported biological sequence file formats.

These values can be used to force the sequence parser to try a specific format instead of leaving the decision on how to read the data to the parser. However, enabling auto-detection by choosing fid_Fileformat::fid_FORMAT_UNDEF should be good enough in virtually any case.

Enumerator:
fid_FORMAT_UNDEF  No specific file format. Use for auto-detection.
fid_FORMAT_FASTA  FASTA file format.
fid_FORMAT_UNIPROT  UniProt file format.

Definition at line 67 of file sequences.h.


Function Documentation

void fid_sequences_init ( fid_Sequences *  seqs,
fid_Uintsize  uisize,
const fid_Alphabet *  alpha 
)

Initialize an fid_Sequences structure.

Parameters:
seqs The structure to be initialized.
uisize Required word size. Some fields in fid_Sequences, especially the tables of sequence separator positions and sequence description separator positions, depend on how these data are stored on file.
alpha The alphabet to be associated with this structure. It is legal to pass a NULL pointer.

Definition at line 246 of file sequences.c.

References fid_Sequences::alpha, fid_Mappedfile::content, fid_Sequences::descriptions, fid_Sequences::desfile, fid_SWITCH48, fid_Sequences::num_of_sequences, fid_Sequences::oisfile, fid_Sequences::sdsfile, fid_Sequences::separators, fid_Sequences::sspfile, fid_Sequences::tisfile, fid_Sequences::total_length, fid_Sequences::uisize, fid_Uint48constptr::v_uint32, fid_Uint48::v_uint32, fid_Uint48constptr::v_uint64, and fid_Uint48::v_uint64.

Referenced by fid_create_online_files(), and fid_suffixarray_init().

int fid_sequences_map ( fid_Sequences *  seqs,
const char *  basefilename,
fid_Tablerequest  tables,
fid_Filenamebuffer *  fnamebuf,
fid_Error *  error 
)

Map online tables of enhanced suffix array into memory.

Parameters:
seqs This structure will be associated with the mapped file.
basefilename Basename from which filenames corresponding to the mapped files are generated.
tables requested tables, online bits only.
fnamebuf Filename buffer for generating filenames. If NULL is passed, then a local buffer will be used.
error Error messages go here.
Returns:
0 on success, -1 on error.

Definition at line 313 of file sequences.c.

References fid_Mappedfile::content, fid_Sequences::desfile, fid_file_unmap(), fid_filenamebuffer_free(), fid_filenamebuffer_init_local(), fid_sequences_realize(), fid_TABLE_DES, fid_TABLE_OIS, fid_TABLE_TIS, fid_Sequences::oisfile, fid_Sequences::sdsfile, fid_Sequences::sspfile, and fid_Sequences::tisfile.

Referenced by fid_suffixarray_load_from_files().

int fid_sequences_realize ( fid_Sequences *  seqs,
fid_Tablerequest  tables,
fid_Error *  error 
)

Update data fields in fid_Sequences structure according to mapped files.

All data fields whose correct values can be derived from the sizes of the associated mapped files are updated by this function (e.g., fid_Sequences::num_of_sequences can be derived from the size of the file associated with fid_Sequences::sspfile). No function should set these fields manually, just call this function instead. A table request can be defined to ask the function to look at only some of the files and update only values derived from these.

Parameters:
seqs The structure to be updated.
tables Update only values derivable from the tables specified in this request. To update all values, pass fid_TABLES_ONLINE.
error Error messages go here.
Returns:
0 on success, -1 on error.

Definition at line 454 of file sequences.c.

References fid_Mappedfile::content, fid_error_throw(), fid_SWITCH48, fid_TABLE_OIS, fid_TABLE_TIS, fid_Mappedfile::occupied, fid_Sequences::oisfile, fid_Sequences::tisfile, and fid_Sequences::uisize.

Referenced by fid_sequences_free(), fid_sequences_map(), and fid_suffixarray_realize().

int fid_sequences_parse_from_file_to_file ( const char *  infilename,
fid_Fileformat  format,
fid_Tablerequest  tables,
const char *  basefilename,
const fid_Alphabet *  alpha,
fid_Uintsize  uisize,
size_t *  input_file_size,
fid_Error *  error 
)

Parse sequence set from the given file, write output to files.

For a more detailed description of this function, see fid_sequences_parse_from_memory_to_file(), since this function depends on that function.

Parameters:
infilename The name of a file that contains biological data in some textual format such as FASTA.
format The predetermined file format. If auto-detection should be tried, pass fid_Fileformat::fid_FORMAT_UNDEF.
tables Bitvector representing a request which tables are to be generated. This value should be the result of OR'ing one or more of fid_TABLE_TIS, fid_TABLE_OIS, and fid_TABLE_DES. Other requests are ignored and do not generate an error.
basefilename The base name of the generated tables. This should be a simple string suitable to generate a file name from. Suffixes ".tis", ".ois", ".sds", ".des", and ".ssp" are appended to this string to generate the final file names of the tables.
alpha The alphabet that is used to transform the textual data into binary.
uisize Sequence separator positions and description separator positions are stored as integers of this size.
input_file_size If not NULL, then the total size of the input file infilename is returned in this argument.
error Error messages go here.
Returns:
0 on success, -1 on error.

Definition at line 611 of file sequences.c.

References fid_sequences_parse_from_memory_to_file().

int fid_sequences_parse_from_file_to_memory ( const char *  infilename,
fid_Fileformat  format,
fid_Tablerequest  tables,
fid_Sequences *  seqs,
const fid_Alphabet *  alpha,
size_t  padding,
fid_Uintsize  uisize,
size_t *  input_file_size,
fid_Error *  error 
)

Parse sequence set from the given file to memory.

For a more detailed description of this function, see fid_sequences_parse_from_memory_to_memory(), since this function depends on that function.

Parameters:
infilename The name of a file that contains biological data in some textual format such as FASTA.
format The predetermined file format. If auto-detection should be tried, pass fid_Fileformat::fid_FORMAT_UNDEF.
tables Bitvector representing a request which tables are to be generated. This value should be the result of OR'ing one or more of fid_TABLE_TIS, fid_TABLE_OIS, and fid_TABLE_DES. Other requests are ignored and do not generate an error.
seqs The data structure that stores the parsed tables.
alpha The alphabet that is used to transform the textual data into binary.
padding Number of symbols to append to the transformed sequence data. This is useful for algorithms that can be simplified under the assumption that the sequence data extends a certain number of symbols beyond the actual end of sequence. The symbols appended to the end are fid_SEPARATOR symbols. The value of padding will not be reflected in the lengths stored in seqs.
uisize Sequence separator positions and description separator positions are stored as integers of this size.
input_file_size If not NULL, then the total size of the input file infilename is returned in this argument.
error Error messages go here.
Returns:
0 on success, -1 on error.
Todo:
This function is at least 95% identical to fid_sequences_parse_from_file_to_file(), so we should refactor this.

Definition at line 686 of file sequences.c.

References fid_sequences_parse_from_memory_to_memory().

int fid_sequences_parse_from_file_to_lengths ( const char *  infilename,
fid_Fileformat  format,
fid_Sequencefileinfo *  seqinfo,
const fid_Alphabet *  alpha,
fid_Error *  error 
)

Parse sequence set from given file, but only gather statistics.

For a more detailed description of this function, see fid_sequences_parse_from_memory_to_lengths(), since this function depends on that function.

Parameters:
infilename The name of a file that contains biological data in some textual format such as FASTA.
format The predetermined file format. If auto-detection should be tried, pass fid_Fileformat::fid_FORMAT_UNDEF.
seqinfo The sizes of the transformed sequence data, descriptions, etc., are returned here.
alpha The alphabet that is used to transform the textual data into binary.
error Error messages go here.
Returns:
0 on success, -1 on error.
Todo:
This function is at least 95% identical to fid_sequences_parse_from_file_to_memory(), so we should refactor this.

Definition at line 747 of file sequences.c.

References fid_sequences_parse_from_memory_to_lengths().

int fid_sequences_parse_from_memory_to_file ( const char *  infilename,
const char *  buffer,
size_t  bufsize,
fid_Fileformat  format,
fid_Tablerequest  tables,
const char *  basefilename,
const fid_Alphabet *  alpha,
fid_Uintsize  uisize,
fid_Error *  error 
)

Parse sequence set from the provided buffer, write output to files.

This function reads the content of buffer that contains a set of biological sequences in some supported format, and writes it into a more convenient, binary form as a set of tables. No index generation takes place here, this is only the preprocessing of online data. The input file format is attempted to be auto-detected, but can also be predetermined. A memory mapped file provides sufficient input to this function. The output is a set of files on disk corresponding to the entities parsed from the input.

Internally, the set of sequences is parsed into an fid_Sequences structure, whose file members fid_Sequences::tisfile, fid_Sequences::oisfile, fid_Sequences::sspfile, fid_Sequences::desfile, and fid_Sequences::sdsfile (depending on the tables request) are used to construct the corresponding table. That is, this function does not allocate RAM, but solely relies on memory mapped files. It creates growing files directly on disk rather than taking the risk of being swapped out by the OS. When the function returns, all files will be closed again. To operate on the preprocessed data (e.g., to construct an enhanced suffix array), map it into memory again.

Note:
Resizing of files on NFS-mounted volumes can be slow, depending on the NFS implementation, so parsing to memory first via fid_sequences_parse_from_file_to_memory() or fid_sequences_parse_from_memory_to_memory(), and then writing the result to file via fid_file_dump_to_file() might be more advantageous under certain circumstances.
Parameters:
infilename The name of the input file (for error messages only).
buffer The buffer that contains biological data in some textual format such as FASTA.
bufsize The size of buffer in bytes. Because the size is passed explicitly, buffer is not required to be zero-terminated. This value should be positive.
format The predetermined file format. If auto-detection should be tried, pass fid_Fileformat::fid_FORMAT_UNDEF.
tables Bitvector representing a request which tables are to be generated. This value should be the result of OR'ing one or more of fid_TABLE_TIS, fid_TABLE_OIS, and fid_TABLE_DES. Other requests are ignored and do not generate an error.
basefilename The base name of the generated tables. This should be a simple string suitable to generate a file name from. Suffixes ".tis", ".ois", ".ssp", ".des", and ".sds" are appended to this string to generate the final file names of the tables.
alpha The alphabet that is used to transform the textual data into binary.
uisize Sequence separator positions and description separator positions are stored as integers of this size.
error Error messages go here.
Returns:
0 on success, -1 on error.

Definition at line 874 of file sequences.c.

References fid_Sequences::desfile, fid_create_online_files(), fid_file_unmap(), fid_SWITCH48, fid_TABLE_DES, fid_Sequences::num_of_sequences, fid_Sequences::oisfile, fid_Sequences::sdsfile, fid_Sequences::sspfile, fid_Sequences::tisfile, fid_Uint48::v_uint32, and fid_Uint48::v_uint64.

Referenced by fid_sequences_parse_from_file_to_file().

int fid_sequences_parse_from_memory_to_memory ( const char *  infilename,
const char *  buffer,
size_t  bufsize,
fid_Fileformat  format,
fid_Tablerequest  tables,
fid_Sequences *  seqs,
const fid_Alphabet *  alpha,
size_t  padding,
fid_Uintsize  uisize,
fid_Error *  error 
)

Parse sequence set from the provided buffer to memory.

This function reads the content of buffer that contains a set of biological sequences in some supported format, and writes it into a more convenient, binary form as a set of tables. No index generation takes place here, this is only the preprocessing of online data. The input file format is attempted to be auto-detected, but can also be predetermined. A memory mapped file provides sufficient input to this function. The output of this function is binary identical to the output of fid_sequences_parse_from_memory_to_file(), only that it is not written to files directly.

The output is a set of dynamic arrays corresponding to the entities parsed from the input, stored in a fid_Sequences structure, seqs. The fid_Mappedfile::content pointers of the files represented by seqs are pointers to allocated memory, so they are not to be unmapped directly. The sequences can be flushed to files after parsing, and must be freed via fid_sequences_free() when they are not needed anymore.

Parameters:
infilename The name of the input file (for error messages only).
buffer The buffer that contains biological data in some textual format such as FASTA.
bufsize The size of buffer in bytes. Because the size is passed explicitly, buffer is not required to be zero-terminated. This value should be positive.
format The predetermined file format. If auto-detection should be tried, pass fid_Fileformat::fid_FORMAT_UNDEF.
tables Bitvector representing a request which tables are to be generated. This value should be the result of OR'ing one or more of fid_TABLE_TIS, fid_TABLE_OIS, and fid_TABLE_DES. Other requests are ignored and do not generate an error.
seqs The parsed sequences.
alpha The alphabet that is used to transform the textual data into binary.
padding Number of symbols to append to the transformed sequence data. This is useful for algorithms that can be simplified under the assumption that the sequence data extends a certain number of symbols beyond the actual end of sequence. The symbols appended to the end are fid_SEPARATOR symbols. The value of padding will not be reflected in the lengths stored in seqs.
uisize Sequence separator positions and description separator positions are stored as integers of this size.
error Error messages go here.
Returns:
0 on success, -1 on error.

Definition at line 977 of file sequences.c.

References fid_SWITCH48.

Referenced by fid_sequences_parse_from_file_to_memory().

int fid_sequences_parse_from_memory_to_lengths ( const char *  infilename,
const char *  buffer,
size_t  bufsize,
fid_Fileformat  format,
fid_Sequencefileinfo *  seqinfo,
const fid_Alphabet *  alpha,
fid_Error *  error 
)

Parse sequence set from the provided buffer, but only gather statistics.

This function just parses the given buffer, but does not store the parsed sequences anywhere. Instead, the function counts the number of sequences, determines their total length, etc. (see fid_Sequencefileinfo), and returns these data to the caller. The main purpose of this function is to determine whether the length of the raw sequence data as stored in enhanced suffix arrays can be represented in 32 bit integers or not.

Parameters:
infilename The name of the input file (for error messages only).
buffer The buffer that contains biological data in some textual format such as FASTA.
bufsize The size of buffer in bytes. Because the size is passed explicitly, buffer is not required to be zero-terminated. This value should be positive.
format The predetermined file format. If auto-detection should be tried, pass fid_Fileformat::fid_FORMAT_UNDEF.
seqinfo The information about the parsed sequences.
alpha The alphabet that is used to transform the textual data into binary.
error Error messages go here.
Returns:
0 on success, -1 on error.

Definition at line 1079 of file sequences.c.

References fid_Sequencefileinfo::input_file_size, and fid_Sequencefileinfo::num_of_sequences.

Referenced by fid_sequences_parse_from_file_to_lengths().

fid_Uint32 fid_sequences_offset_to_index_32 ( const fid_Sequences *  seqs,
fid_Uint32  offset 
)

32 bit version of fid_sequences_offset_to_index().

Determine sequence index from sequence offset.

It is an error to pass the position of a sequence separator as offset.

Parameters:
seqs Set of sequences.
offset Offset into the concatenated set of sequences.
Returns:
The sequence index corresponding to the given offset.

fid_Uint64 fid_sequences_offset_to_index_64 ( const fid_Sequences *  seqs,
fid_Uint64  offset 
)

64 bit version of fid_sequences_offset_to_index().

Determine sequence index from sequence offset.

It is an error to pass the position of a sequence separator as offset.

Parameters:
seqs Set of sequences.
offset Offset into the concatenated set of sequences.
Returns:
The sequence index corresponding to the given offset.

void fid_sequences_index_to_boundaries_32 ( const fid_Sequences *  seqs,
fid_Uint32  seqindex,
fid_Uint32 *  left,
fid_Uint32 *  right 
)

32 bit version of fid_sequences_index_to_boundaries().

Determine sequence boundaries from sequence index.

Parameters:
seqs Set of sequences.
seqindex Sequence index; the first sequence is identified by 0.
left,right The sequence boundaries will be written to these integers. Note that sequence separators are excluded from the returned sequence.

void fid_sequences_index_to_boundaries_64 ( const fid_Sequences *  seqs,
fid_Uint64  seqindex,
fid_Uint64 *  left,
fid_Uint64 *  right 
)

64 bit version of fid_sequences_index_to_boundaries().

Determine sequence boundaries from sequence index.

Parameters:
seqs Set of sequences.
seqindex Sequence index; the first sequence is identified by 0.
left,right The sequence boundaries will be written to these integers. Note that sequence separators are excluded from the returned sequence.

void fid_sequences_offset_to_boundaries_32 ( const fid_Sequences *  seqs,
fid_Uint32  offset,
fid_Uint32 *  left,
fid_Uint32 *  right 
)

32 bit version of fid_sequences_offset_to_boundaries().

Determine sequence boundaries from sequence offset.

This function performs a binary search to find the sequence index first, so use fid_sequences_index_to_boundaries() instead if you already know the sequence index. It is an error to pass the position of a sequence separator as offset.

Parameters:
seqs Set of sequences.
offset Offset into the concatenated set of sequences.
left,right The sequence boundaries will be written to these integers. Note that sequence separators are excluded from the returned sequence.

void fid_sequences_offset_to_boundaries_64 ( const fid_Sequences *  seqs,
fid_Uint64  offset,
fid_Uint64 *  left,
fid_Uint64 *  right 
)

64 bit version of fid_sequences_offset_to_boundaries().

Determine sequence boundaries from sequence offset.

This function performs a binary search to find the sequence index first, so use fid_sequences_index_to_boundaries() instead if you already know the sequence index. It is an error to pass the position of a sequence separator as offset.

Parameters:
seqs Set of sequences.
offset Offset into the concatenated set of sequences.
left,right The sequence boundaries will be written to these integers. Note that sequence separators are excluded from the returned sequence.

int fid_sequences_iterate_range_32 ( const fid_Sequences *  seqs,
fid_Uint32  from,
fid_Uint32  to,
const fid_Sequenceiterfun_32  iterfun,
void *  user_data 
)

32 bit version of fid_sequences_iterate_range().

Call a callback function for each sequence in given range.

For each sequence stored in seqs in the range specified by sequence indices from and to, call the callback function iterfun. The iteration can be interrupted by the callback function returning a non-zero value.

Parameters:
seqs The sequences to be iterated over.
from,to The range of sequences to be processed. These values must be valid, i.e., from must not be greater than to, and neither of them may be greater than fid_Sequences::num_of_sequences-1; otherwise, behavior is undefined.
iterfun The callback function.
user_data Arbitrary pointer passed through to iterfun.
Returns:
0 on success, or the return value of iterfun otherwise.

int fid_sequences_iterate_range_64 ( const fid_Sequences *  seqs,
fid_Uint64  from,
fid_Uint64  to,
const fid_Sequenceiterfun_64  iterfun,
void *  user_data 
)

64 bit version of fid_sequences_iterate_range().

Call a callback function for each sequence in given range.

For each sequence stored in seqs in the range specified by the sequence indices from and to, call the callback function iterfun. The iteration can be interrupted by the callback function returning a non-zero value.

Parameters:
seqs The sequences to be iterated over.
from,to The range of sequences to be processed. These values must be valid, i.e., from must not be greater than to, and neither may be greater than fid_Sequences::num_of_sequences-1; behavior is undefined otherwise.
iterfun The callback function.
user_data Arbitrary pointer passed through to iterfun.
Returns:
0 on success, or the return value of iterfun otherwise.

int fid_sequences_iterate_32 ( const fid_Sequences *  seqs,
const fid_Sequenceiterfun_32  iterfun,
void *  user_data 
)

32 bit version of fid_sequences_iterate().

Call a callback function for each sequence.

For each sequence stored in seqs, call the callback function iterfun. The iteration can be interrupted by the callback function returning a non-zero value.

Parameters:
seqs The sequences to be iterated over.
iterfun The callback function.
user_data Arbitrary pointer passed through to iterfun.
Returns:
0 on success, or the return value of iterfun otherwise.

int fid_sequences_iterate_64 ( const fid_Sequences *  seqs,
const fid_Sequenceiterfun_64  iterfun,
void *  user_data 
)

64 bit version of fid_sequences_iterate().

Call a callback function for each sequence.

For each sequence stored in seqs, call the callback function iterfun. The iteration can be interrupted by the callback function returning a non-zero value.

Parameters:
seqs The sequences to be iterated over.
iterfun The callback function.
user_data Arbitrary pointer passed through to iterfun.
Returns:
0 on success, or the return value of iterfun otherwise.

void fid_sequences_compute_distribution ( fid_Sequences *  seqs  ) 

Compute character distribution of given sequences.

The fid_Sequences::distribution will be filled by this function, such that each entry s contains the relative frequency of symbol s.

The length of the sequence is corrected by the number of sequence separators present in the input sequence, so these will not be taken into account. Note that the entry for wildcards will be filled twice, once at index fid_WILDCARD, and once at the first index after the last normal symbol. The entries for separators and undefined characters will both be set to 0.0.

Note that fid_suffixarray_compute_distribution() is faster than this function, especially on large datasets.

Parameters:
seqs The sequences whose character distribution should be determined.

Definition at line 529 of file sequences.c.

References fid_Sequences::alpha, fid_Mappedfile::content, fid_Sequences::distribution, fid_SWITCH48, fid_WILDCARD, fid_Sequences::num_of_sequences, fid_Alphabet::num_of_syms, fid_Mappedfile::occupied, fid_Sequences::tisfile, fid_Sequences::uisize, fid_Uint48::v_uint32, and fid_Uint48::v_uint64.

void fid_sequences_free ( fid_Sequences *  seqs  ) 

Unmap sequences and free memory.

Parameters:
seqs The structure to be freed.

Definition at line 570 of file sequences.c.

References fid_Sequences::desfile, fid_file_unmap(), fid_sequences_realize(), fid_TABLES_ONLINE, fid_Sequences::oisfile, fid_Sequences::sdsfile, fid_Sequences::sspfile, and fid_Sequences::tisfile.

Referenced by fid_suffixarray_free(), and fid_suffixarray_load_from_files().

void fid_sequences_dump_range_32 ( const fid_Symbol *  seq,
fid_Uint32  length,
const fid_Alphabet *  alpha,
const char *  str,
int  stop_at_separator,
FILE *  stream 
)

32 bit version of fid_sequences_dump_range().

Print piece of sequence to stream.

Parameters:
seq Some binary encoded sequence of symbols.
length The number of symbols to be read from seq and printed to stream.
alpha Alphabet used to transform symbols to printable characters.
str If not NULL, then print this ASCII string in front of the transformed sequence (useful to distinguish multiple sequences).
stop_at_separator If true, then stop printing when a sequence separator is encountered. If false, print some special character for each sequence separator encountered and continue.
stream An output stream to which the sequence is printed. If NULL, nothing will be printed.

void fid_sequences_dump_range_64 ( const fid_Symbol *  seq,
fid_Uint64  length,
const fid_Alphabet *  alpha,
const char *  str,
int  stop_at_separator,
FILE *  stream 
)

64 bit version of fid_sequences_dump_range().

Print piece of sequence to stream.

Parameters:
seq Some binary encoded sequence of symbols.
length The number of symbols to be read from seq and printed to stream.
alpha Alphabet used to transform symbols to printable characters.
str If not NULL, then print this ASCII string in front of the transformed sequence (useful to distinguish multiple sequences).
stop_at_separator If true, then stop printing when a sequence separator is encountered. If false, print some special character for each sequence separator encountered and continue.
stream An output stream to which the sequence is printed. If NULL, nothing will be printed.


Generated on Wed Jul 8 17:21:16 2009 for Full-text Index Data structure library by  doxygen 1.5.9