Documentation
¶
Index ¶
- Constants
- Variables
- func BuildIndex(outdir string, infiles []string, opt *IndexBuildingOptions) error
- func CheckIndexBuildingOptions(opt *IndexBuildingOptions) error
- func CheckIndexSearchingOptions(opt *IndexSearchingOptions) error
- func ClearSubstrPairs(poolSub *sync.Pool, subs *[]*SubstrPair, k int)
- func Combinations2(set []uint64) [][2]uint64
- func Execute()
- func IntSlice2StringSlice(vals []int) []string
- func MeanStdev(values []float64) (float64, float64)
- func ParseByteSize(val string) (int64, error)
- func RC(s []byte) []byte
- func RecycleChaining2Result(chains *[]*Chain2Result)
- func RecycleChainingResult(chains *[]*[]int32)
- func RecycleSearchResultOfAGenome(r *SearchResultOfAGenome)
- func RecycleSeqComparatorResult(r *SeqComparatorResult)
- func RecycleSubstrPairs(poolSub *sync.Pool, poolSubs *sync.Pool, subs *[]*SubstrPair)
- func TrimSubStrPairs(poolSub *sync.Pool, subs *[]*SubstrPair, k int, minDist float64)
- type Chain2Result
- type Chain3Result
- type Chainer
- type Chainer2
- type Chainer3
- type Chaining2Options
- type Chaining3Options
- type ChainingOptions
- type Index
- func (idx *Index) Close() error
- func (idx *Index) RecycleSearchResult(r *SearchResult)
- func (idx *Index) RecycleSearchResults(sr *[]*SearchResult)
- func (idx *Index) RecycleSimilarityDetails(sds *[]*SimilarityDetail)
- func (idx *Index) Search(query *Query) (*[]*SearchResult, error)
- func (idx *Index) SetSeqCompareOptions(sco *SeqComparatorOptions)
- type IndexBuildingOptions
- type IndexInfo
- type IndexSearchingOptions
- type Options
- type Query
- type SearchResult
- type SearchResultOfAGenome
- type SearchResultOfASequence
- type SearchResultReader
- type SearchResultsHeap
- type SeqComparator
- type SeqComparatorOptions
- type SeqComparatorResult
- type SimilarityDetail
- type SubstrPair
- type Uint64Slice
Constants ¶
const BITS_BATCH_IDX = 17
BITS_BATCH_IDX is the number of bits to store the genome batch index.
const BITS_FLAGS = BITS_STRAND + BITS_REVERSE
BITS_FLAGS is the number of bits to store two bits
const BITS_GENOME_IDX = 17
BITS_GENOME_IDX is the number of bits to store the genome index.
const BITS_IDX = BITS_BATCH_IDX + BITS_BATCH_IDX
BITS_IDX is the number of bits to strore batch index and genome index.
const BITS_IDX_FLAGS = BITS_IDX + BITS_FLAGS
BITS_IDX_FLAGS is the sum of BITS_IDX and BITS_FLAGS
const BITS_NONE_IDX = 64 - BITS_BATCH_IDX - BITS_GENOME_IDX
BITS_NONE_IDX is the number of bits to store data except for batch index and genome index.
const BITS_NONE_POS = 64 - BITS_POSITION
BITS_NONE_POS is the number of bits except for position
const BITS_POSITION = 28
BITS_POSITION is the number of bits to store the k-mer position/coordinate.
const BITS_REVERSE = 1
BITS_SUFFIX_IDX is the flag to indicate if the k-mer is reversed.
const BITS_STRAND = 1
BITS_STRAND is the flag to indicate if the k-mer is from the reverse complement strand.
const DirGenomes = "genomes"
DirGenomes is the directory of genomes datas
const DirSeeds = "seeds"
DirSeeds is the directory of k-mer-value data files
const ExtSeeds = ".bin"
ExtSeeds is file extention of k-mer-value data files
const ExtTmpDir = ".tmp"
ExtTmpDir is the path extension for temporary files
const FileGenomeChunks = "genomes.chunks.bin"
FileGenomeChunks store lists of batch+genome index of genome chunks
const FileGenomeIndex = "genomes.map.bin"
FileGenomeIndex maps genome id to genome batch id and index in the batch
const FileGenomes = "genomes.bin"
FileGenomes is the name of each genome file
const FileInfo = "info.toml"
FileInfo is the summary file
const FileMasks = "masks.bin"
FileMasks is the file for storing lexichash mask
const FileSeedPositions = "seed_positions.bin"
FileSeedPositions is the name of seed position file
const MASK_GENOME_IDX = (1 << BITS_GENOME_IDX) - 1
MASK_GENOME_IDX is the mask of genome index.
const MASK_NONE_IDX = (1 << BITS_NONE_IDX) - 1
MASK_NONE_IDX is the mask of non-index data
const MASK_REVERSE = 1
MASK_REVERSE is the mask of reversed flag
const MAX_GENOME_SIZE = 1 << BITS_POSITION
MAX_GENOME_SIZE is the maximum genome size, 268435456
const NO_VALID_SEQS = "no_valid_seqs"
NO_VALID_SEQS means there are no valid sequences in a genome file.
const OpD = uint64('D')
const OpH = uint64('H')
const OpI = uint64('I')
const OpM = uint64('M')
const OpX = uint64('X')
const TOO_LARGE_GENOME = "too_large_genome"
TOO_LARGE_GENOME means the genome is too big to index.
const TOO_MANY_SEQS = "too_many_seqs"
TOO_MANY_SEQS means there are too many sequences, as we require: $total_bases + ($num_contigs - 1) * $interval_size <= 268,435,456
Variables ¶
var BufferSize = 65536 // os.Getpagesize()
BufferSize is size of buffer
var COMMIT = ""
can pass from from command line: commit=$(git rev-parse --short HEAD) go build -trimpath -o=lexicmap -ldflags="-s -w -X github.com/shenwei356/LexicMap/lexicmap/cmd.COMMIT=$commit" -tags netgo
var DefaultChaining2Options = Chaining2Options{
MaxGap: 50,
MinScore: 50,
MinAlignLen: 50,
MaxDistance: 100,
BandCount: 50,
BandBase: 100,
}
DefaultChaining2Options is the defalt vaule of Chaining2Option.
var DefaultChaining3Options = Chaining3Options{
MaxGap: 5,
MinScore: 1,
MinAlignLen: 2,
MaxDistance: 10,
BandCount: 20,
BandBase: 10,
}
DefaultChaining3Options is the defalt vaule of Chaining2Option.
var DefaultChainingOptions = ChainingOptions{
MaxGap: 5000,
MinScore: 40,
MaxDistance: 10000,
}
DefaultChainingOptions is the defalt vaule of ChainingOption.
var DefaultIndexSearchingOptions = IndexSearchingOptions{ NumCPUs: runtime.NumCPU(), MaxOpenFiles: 512, MinPrefix: 15, MinSinglePrefix: 17, TopN: 500, MaxGap: 5000, MaxDistance: 10000, ExtendLength: 2000, ExtendLength2: 50, MinQueryAlignedFractionInAGenome: 70, MaxEvalue: 10, }
var DefaultSeqComparatorOptions = SeqComparatorOptions{ K: 32, MinPrefix: 11, Chaining2Options: Chaining2Options{ MaxGap: 50, MinScore: 50, MaxDistance: 50, BandBase: 100, }, MinAlignedFraction: 0, }
DefaultSeqComparatorOptions contains the default options for SeqComparatorOptions.
var MainVersion uint8 = 3
MainVersion is use for checking compatibility
var MinorVersion uint8 = 4
MinorVersion is less important
var RootCmd = &cobra.Command{ Use: "lexicmap", Short: "efficient sequence alignment against millions of prokaryotic genomes", Long: fmt.Sprintf(` LexicMap: efficient sequence alignment against millions of prokaryotic genomes Version: v%s Documents: https://bioinf.shenwei.me/LexicMap Source code: https://github.com/shenwei356/LexicMap Please cite: https://doi.org/10.1038/s41587-025-02812-8 Nature Biotechnology (2025) `, VERSION), }
RootCmd represents the base command when called without any subcommands
var Strands = [2]byte{'+', '-'}
Strands could be used to output strand for a reverse complement flag
var VERSION = "0.8.1"
VERSION is the version
Functions ¶
func BuildIndex ¶ added in v0.2.0
func BuildIndex(outdir string, infiles []string, opt *IndexBuildingOptions) error
BuildIndex builds index from a list of input files
func CheckIndexBuildingOptions ¶ added in v0.2.0
func CheckIndexBuildingOptions(opt *IndexBuildingOptions) error
CheckIndexBuildingOptions checks some important options
func CheckIndexSearchingOptions ¶ added in v0.2.0
func CheckIndexSearchingOptions(opt *IndexSearchingOptions) error
func ClearSubstrPairs ¶ added in v0.2.0
func ClearSubstrPairs(poolSub *sync.Pool, subs *[]*SubstrPair, k int)
ClearSubstrPairs removes nested/embedded and same anchors. k is the largest k-mer size.
func Combinations2 ¶
Note: set should not have duplicates
func Execute ¶
func Execute()
Execute adds all child commands to the root command sets flags appropriately. This is called by main.main(). It only needs to happen once to the rootCmd.
func IntSlice2StringSlice ¶
func ParseByteSize ¶ added in v0.4.0
ParseByteSize parses byte size from string
func RecycleChaining2Result ¶ added in v0.2.0
func RecycleChaining2Result(chains *[]*Chain2Result)
RecycleChaining2Result reycles the chaining paths. Please remember to call this after using the results.
func RecycleChainingResult ¶ added in v0.2.0
func RecycleChainingResult(chains *[]*[]int32)
RecycleChainingResult reycles the chaining results. Please remember to call this after using the results.
func RecycleSearchResultOfAGenome ¶ added in v0.8.0
func RecycleSearchResultOfAGenome(r *SearchResultOfAGenome)
func RecycleSeqComparatorResult ¶ added in v0.2.0
func RecycleSeqComparatorResult(r *SeqComparatorResult)
RecycleSeqComparatorResult recycles a SeqComparatorResult
func RecycleSubstrPairs ¶ added in v0.2.0
func RecycleSubstrPairs(poolSub *sync.Pool, poolSubs *sync.Pool, subs *[]*SubstrPair)
RecycleSubstrPairs recycles a list of SubstrPairs
func TrimSubStrPairs ¶ added in v0.5.0
func TrimSubStrPairs(poolSub *sync.Pool, subs *[]*SubstrPair, k int, minDist float64)
TrimSubStrPairs trims anchors for query/subjects with tandem repeats in either end.
case 1: embeded anchor in query/target
61: 156-186 (+) vs 1163-1193 (+), len:31 62: 157-187 (-) vs 1164-1194 (-), len:31 63: 158-188 (+) vs 1165-1195 (+), len:31 64: 168-195 (-) vs 1168-1195 (-), len:28 65: 175-202 (-) vs 1168-1195 (-), len:28 <--- 66: 182-209 (-) vs 1168-1195 (-), len:28 <--- 67: 189-216 (-) vs 1168-1195 (-), len:28 <--- 68: 196-223 (+) vs 1168-1195 (+), len:28 <--- 69: 203-230 (+) vs 1168-1195 (+), len:28 <--- 70: 210-237 (-) vs 1168-1195 (-), len:28 <--- 71: 217-244 (-) vs 1168-1195 (-), len:28 <--- gap=7, overlap=28 (28/28)
case 2: big overlap + big gap
727: 789-819 (-) vs 789-819 (-), len:31 728: 790-820 (-) vs 790-820 (-), len:31 729: 804-821 (-) vs 821-838 (-), len:18 <--- gap=17, overlap=17 (17/18)
Types ¶
type Chain2Result ¶ added in v0.3.0
type Chain2Result struct {
NAnchors int // The number of substrings
AlignedFraction float64
MatchedBases int // The number of matched bases.
AlignedBasesQ int // The number of aligned bases in Query sequence
AlignedBasesT int // The number of aligned bases in Subject sequence
PIdent float64 // percentage of identity
AlignedLength int // Aligned length, might be longer than AlignedBasesQ or AlignedBasesT
Gaps int // The number of gaps
QBegin, QEnd int // Query begin/end position (0-based)
TBegin, TEnd int // Target begin/end position (0-based)
MaxExtLen int // max extend length
// for output
CIGAR []byte // cigar string
QSeq []byte // query seq
TSeq []byte // target seq
Alignment []byte // alignment text
// statistic
Score int
BitScore int
Evalue float64
// contains filtered or unexported fields
}
Chain2Result represents a result of a chain
func (*Chain2Result) Reset ¶ added in v0.3.0
func (r *Chain2Result) Reset()
Reset resets a Chain2Result
type Chain3Result ¶ added in v0.6.0
type Chain3Result struct {
NAnchors int // The number of substrings
AlignedFraction float64
MatchedBases int // The number of matched bases.
AlignedBasesQ int // The number of aligned bases in Query sequence
AlignedBasesT int // The number of aligned bases in Subject sequence
PIdent float64 // percentage of identity
AlignedLength int // Aligned length, might be longer than AlignedBasesQ or AlignedBasesT
Gaps int // The number of gaps
QBegin, QEnd int // Query begin/end position (0-based)
TBegin, TEnd int // Target begin/end position (0-based)
}
Chain3Result represents a result of a chain
func (*Chain3Result) Reset ¶ added in v0.6.0
func (r *Chain3Result) Reset()
Reset resets a Chain3Result
type Chainer ¶ added in v0.2.0
type Chainer struct {
// contains filtered or unexported fields
}
Chainer is an object for chaining the lexichash substrings between query and reference sequences.
func NewChainer ¶ added in v0.2.0
func NewChainer(options *ChainingOptions) *Chainer
NewChainer creates a new chainer.
type Chainer2 ¶ added in v0.2.0
type Chainer2 struct {
// contains filtered or unexported fields
}
Chainer2 is an object for chaining the anchors in two similar sequences. Anchors/seeds/substrings in Chainer2 is denser than those in Chainer, and the chaining score function is also much simpler, only considering the lengths of anchors and gaps between them.
func NewChainer2 ¶ added in v0.2.0
func NewChainer2(options *Chaining2Options) *Chainer2
NewChainer creates a new chainer.
func (*Chainer2) Chain ¶ added in v0.2.0
func (ce *Chainer2) Chain(subs *[]*SubstrPair) (*[]*Chain2Result, int, int, int, int, int, int, int)
Chain finds the possible chain paths. Please remember to call RecycleChaining2Result after using the results. Returned results:
- Chain2Results.
- The number of matched bases.
- The number of aligned bases.
- QBegin.
- QEnd.
- TBegin.
- TEnd.
type Chainer3 ¶ added in v0.6.0
type Chainer3 struct {
// contains filtered or unexported fields
}
Chainer3 is an object for chaining the anchors in two similar sequences. Anchors/seeds/substrings in Chainer3 is denser than those in Chainer, and the chaining score function is also much simpler, only considering the lengths of anchors and gaps between them.
func NewChainer3 ¶ added in v0.6.0
func NewChainer3(options *Chaining3Options) *Chainer3
NewChainer creates a new chainer.
func (*Chainer3) Chain ¶ added in v0.6.0
func (ce *Chainer3) Chain(subs *[]*SubstrPair) *Chain3Result
Chain finds the possible chain path. Please remember to call RecycleChaining3Result after using the results. Returned results:
- Chain3Result.
- The number of matched bases.
- The number of aligned bases.
- QBegin.
- QEnd.
- TBegin.
- TEnd.
type Chaining2Options ¶ added in v0.2.0
type Chaining2Options struct {
MaxGap int
MinScore int // minimum score of a chain
MinAlignLen int
MinIdentity float64
MaxDistance int
BandCount int // only check i in range of i − A < j < i
BandBase int // only check i where i.Qstart+i.Len + A < j.Qstart
}
Chaining2Options contains all options in chaining.
type Chaining3Options ¶ added in v0.6.0
type Chaining3Options struct {
MaxGap int
MinScore int // minimum score of a chain
MinAlignLen int
MinIdentity float64
MaxDistance int
BandCount int // only check i in range of i − A < j < i
BandBase int // only check i where i.Qstart+i.Len + A < j.Qstart
}
Chaining3Options contains all options in chaining.
type ChainingOptions ¶ added in v0.2.0
ChainingOptions contains all options in chaining.
type Index ¶ added in v0.2.0
type Index struct {
// k-mer-value searchers
Searchers []*kv.Searcher
InMemorySearchers []*kv.InMemorySearcher
BatchGenomeIndex2GenomeID map[uint64][]byte
Taxonomy *taxdump.Taxonomy
// contains filtered or unexported fields
}
Index creates a LexicMap index from a path and supports searching with query sequences.
func NewIndexSearcher ¶ added in v0.2.0
func NewIndexSearcher(outDir string, opt *IndexSearchingOptions) (*Index, error)
NewIndexSearcher creates a new searcher
func (*Index) RecycleSearchResult ¶ added in v0.2.0
func (idx *Index) RecycleSearchResult(r *SearchResult)
RecycleSearchResults recycles a search result object
func (*Index) RecycleSearchResults ¶ added in v0.2.0
func (idx *Index) RecycleSearchResults(sr *[]*SearchResult)
RecycleSearchResults recycles search results objects
func (*Index) RecycleSimilarityDetails ¶ added in v0.3.0
func (idx *Index) RecycleSimilarityDetails(sds *[]*SimilarityDetail)
RecycleSimilarityDetails recycles a list of SimilarityDetails
func (*Index) Search ¶ added in v0.2.0
func (idx *Index) Search(query *Query) (*[]*SearchResult, error)
Search queries the index with a sequence. After using the result, do not forget to call RecycleSearchResult().
func (*Index) SetSeqCompareOptions ¶ added in v0.2.0
func (idx *Index) SetSeqCompareOptions(sco *SeqComparatorOptions)
SetSeqCompareOptions sets the sequence comparing options
type IndexBuildingOptions ¶ added in v0.2.0
type IndexBuildingOptions struct {
// general
NumCPUs int
Verbose bool // show log
Log2File bool // log file
Force bool // force overwrite existed index
MaxOpenFiles int // maximum opened files, used in merging indexes
MergeThreads int // Maximum Concurrent Merge Jobs
MinSeqLen int // minimum sequence length, should be >= k
// skipping extremely large genome
MaxGenomeSize int // Maximum genome size. Extremely large genomes (non-isolate assemblies) will be skipped
BigGenomeFile string // Out file of skipped files with genomes
// LexicHash
MaskFile string // file of custom masks
K int // k-mer size
Masks int // number of masks
RandSeed int64 // random seed
// filling sketching deserts
DisableDesertFilling bool // disable desert filling (just for analysis index)
DesertMaxLen uint32 // maxi length of sketching deserts
DesertExpectedSeedDist int // expected distance between seeds
DesertSeedPosRange int // the upstream and down stream region for adding a seeds
// generate mask from the top N biggest genomes
TopN int // Select the the top N largest genomes for generating masks
PrefixExt int // Extension length of prefixes
Chunks int // the number of chunks for storing k-mer data
Partitions int // the number of partitions for indexing k-mer data
GenomeBatchSize int // the maximum number of genomes of a batch
ReRefName *regexp.Regexp // for extracting genome id from the file name
ReSeqExclude []*regexp.Regexp // for excluding sequences according to name pattern
ContigInterval int // the length of N's between contigs
SaveSeedPositions bool
Debug bool
}
IndexBuildingOptions contains all options for building an LexicMap index.
type IndexInfo ¶ added in v0.2.0
type IndexInfo struct {
MainVersion uint8 `toml:"main-version" comment:"Index format"`
MinorVersion uint8 `toml:"minor-version"`
K uint8 `toml:"max-K" comment:"LexicHash"`
Masks int `toml:"masks"`
RandSeed int64 `toml:"rand-seed"`
MaxDesert int `toml:"max-seed-dist" comment:"Seed distance"`
SeedDistInDesert int `toml:"seed-dist-in-desert"`
Chunks int `toml:"chunks" comment:"Seeds (k-mer-value data) files"`
Partitions int `toml:"index-partitions"`
InputGenomes int `toml:"input-genomes" comment:"Input genomes"`
InputBases int64 `toml:"input-bases" comment:"Input bases"`
Genomes int `` /* 239-byte string literal not displayed */
GenomeBatchSize int `toml:"genome-batch-size"`
GenomeBatches int `toml:"genome-batches"`
ContigInterval int `toml:"contig-interval"`
}
IndexInfo contains summary of the index
type IndexSearchingOptions ¶ added in v0.2.0
type IndexSearchingOptions struct {
// general
NumCPUs int
Verbose bool // show log
Log2File bool // log file
MaxOpenFiles int // maximum opened files, used in merging indexes
InMemorySearch bool // load the seed/kv data into memory
MinPrefix uint8 // minimum prefix length, e.g., 15
// MaxMismatch int // maximum mismatch, e.g., 3
MinSinglePrefix uint8 // minimum prefix length of the single seed, e.g., 20
// MinMatchedBases uint8 // the total matched bases
TopN int // keep the topN scores, e.g, 10
// seeds chaining
MaxGap float64 // e.g., 5000
MaxDistance float64 // e.g., 20k
// alignment
ExtendLength int // the length of extra sequence on the flanking of seeds.
ExtendLength2 int // the length of extra sequence on the flanking of pseudo-alignment region.
// alignment filtering
MinQueryAlignedFractionInAGenome float64 // minimum query aligned fraction in the target genome
MaxEvalue float64
// Output
OutputSeq bool
// debug
Debug bool
// filter results by taxid
TaxdumpDir string
Genome2TaxIdFile string
TaxIds []uint32
NegativeTaxIds []uint32
KeepGenomesWithoutTaxId bool
}
IndexSearchingOptions contains all options for searching
type Options ¶
type Options struct {
NumCPUs int
Verbose bool
LogFile string
Log2File bool
Compress bool
CompressionLevel int
}
Options contains the global flags
type Query ¶
type Query struct {
// contains filtered or unexported fields
}
Query is an object for each query sequence, it also contains the query result.
type SearchResult ¶ added in v0.2.0
type SearchResult struct {
Subs *[]*SubstrPair // matched substring pairs (query,target)
BatchGenomeIndex uint64 // just for finding genome chunks of the same genome
GenomeBatch int
GenomeIndex int
// ID []byte
GenomeSize int
Score float64 // score for soring
Chains *[]*[]int32
// more about the alignment detail
SimilarityDetails *[]*SimilarityDetail // sequence comparing
AlignedFraction float64 // query coverage per genome
}
SearchResult stores a search result in a genome for the given query sequence.
func (*SearchResult) Reset ¶ added in v0.2.0
func (r *SearchResult) Reset()
func (*SearchResult) SortBySeqID ¶ added in v0.4.0
func (sr *SearchResult) SortBySeqID()
type SearchResultOfAGenome ¶ added in v0.8.0
type SearchResultOfASequence ¶ added in v0.8.0
type SearchResultOfASequence struct {
Sseqid string
Cls string // should be smaller than int
Hsp string // should be smaller than int
QcovHSP string
AlenHSP string
Pident string
Gaps string
Qstart string
Qend string
Sstart string
Send string
Sstr string
Slen string
Evalue string
Bitscore string
Extra string
}
type SearchResultReader ¶ added in v0.8.0
type SearchResultReader struct {
// contains filtered or unexported fields
}
func NewSearchResultReader ¶ added in v0.8.0
func NewSearchResultReader(file string, query string, bufferSize int64) (*SearchResultReader, error)
func (*SearchResultReader) Next ¶ added in v0.8.0
func (r *SearchResultReader) Next() *SearchResultOfAGenome
type SearchResultsHeap ¶ added in v0.8.0
type SearchResultsHeap struct {
// contains filtered or unexported fields
}
func (SearchResultsHeap) Len ¶ added in v0.8.0
func (h SearchResultsHeap) Len() int
func (SearchResultsHeap) Less ¶ added in v0.8.0
func (h SearchResultsHeap) Less(i, j int) bool
func (SearchResultsHeap) Pop ¶ added in v0.8.0
func (h SearchResultsHeap) Pop() interface{}
func (SearchResultsHeap) Push ¶ added in v0.8.0
func (h SearchResultsHeap) Push(x interface{})
func (SearchResultsHeap) Swap ¶ added in v0.8.0
func (h SearchResultsHeap) Swap(i, j int)
type SeqComparator ¶ added in v0.2.0
type SeqComparator struct {
// contains filtered or unexported fields
}
SeqComparator is for fast and accurate similarity estimation of two sequences, which are in the same strand (important).
func NewSeqComparator ¶ added in v0.2.0
func NewSeqComparator(options *SeqComparatorOptions, poolChainers *sync.Pool) *SeqComparator
NewSeqComparator creates a new SeqComparator with given options. No options checking now.
func (*SeqComparator) Compare ¶ added in v0.2.0
func (cpr *SeqComparator) Compare(begin, end uint32, s []byte, queryLen int) (*SeqComparatorResult, error)
Compare matchs k-mers for the query sequence (begin: end), chains them up, and computes the similarity. Please remember to call RecycleSeqComparatorResult() to recycle the result.
func (*SeqComparator) Index ¶ added in v0.2.0
func (cpr *SeqComparator) Index(s []byte) error
Index initializes the SeqComparator with the query sequence.
func (*SeqComparator) RecycleIndex ¶ added in v0.2.0
func (cpr *SeqComparator) RecycleIndex()
RecycleIndex recycles the Index (tree data). Please call this if you do not need the comparator anymore.
type SeqComparatorOptions ¶ added in v0.2.0
type SeqComparatorOptions struct {
// indexing
K uint8
MinPrefix uint8
// chaining
Chaining2Options
// seq similarity
MinAlignedFraction float64 // minimum query aligned fraction in a HSP
MinIdentity float64
}
SeqComparatorOptions contains options for comparing two sequences.
type SeqComparatorResult ¶ added in v0.2.0
type SeqComparatorResult struct {
AlignedBases int // The number of aligned bases.
AlignedFraction float64 // query (original query) coverage per HSP
MatchedBases int
PIdent float64
QueryLen int // length of the original query, used to compute/update AlignedFraction
QBegin int
QEnd int
TBegin int
TEnd int
TSeq []byte // target seq
Chains *[]*Chain2Result
}
SeqComparatorResult contains the details of a seq comparison result.
func (*SeqComparatorResult) Update ¶ added in v0.3.0
func (r *SeqComparatorResult) Update(chains *[]*Chain2Result, queryLen int)
Update updates the data with new chains. However it does not considerate gaps.
func (*SeqComparatorResult) Update2 ¶ added in v0.4.0
func (r *SeqComparatorResult) Update2(chains *[]*Chain2Result, queryLen int)
Update2 only compute the aligned fraction for all chains
type SimilarityDetail ¶ added in v0.2.0
type SimilarityDetail struct {
// QBegin int
// QEnd int
// TBegin int
// TEnd int
RC bool
SimilarityScore float64
Similarity *SeqComparatorResult
// Chain *[]int
NSeeds int
// sequence details
SeqLen int
SeqID []byte // seqid of the region
}
SimilarityDetail is the similarity detail of one reference sequence
type SubstrPair ¶ added in v0.2.0
type SubstrPair struct {
QBegin int32 // start position of the substring (0-based) in query
TBegin int32 // start position of the substring (0-based) in reference
Len uint8 // prefix length
TRC bool // is the substring from the reference seq on the negative strand.
QRC bool // is the substring from the query seq on the negative strand.
}
SubstrPair represents a pair of found substrings/seeds, it's also called an anchor.
func (SubstrPair) String ¶ added in v0.2.0
func (s SubstrPair) String() string
type Uint64Slice ¶
type Uint64Slice []uint64
func (Uint64Slice) Len ¶
func (s Uint64Slice) Len() int
func (Uint64Slice) Less ¶
func (s Uint64Slice) Less(i, j int) bool
func (*Uint64Slice) Pop ¶
func (s *Uint64Slice) Pop() interface{}
func (*Uint64Slice) Push ¶
func (s *Uint64Slice) Push(x interface{})
func (Uint64Slice) Swap ¶
func (s Uint64Slice) Swap(i, j int)
Source Files
¶
- 2blast.go
- autocomplete.go
- edit-genome-ids.go
- genomes.go
- index.go
- kmers.go
- lib-chaining.go
- lib-chaining2.go
- lib-chaining3.go
- lib-index-build.go
- lib-index-merge.go
- lib-index-search-util.go
- lib-index-search.go
- lib-seq_compare.go
- masks.go
- merge-search-results.go
- re-merge.go
- recount-bases.go
- reindex-seeds.go
- root.go
- search.go
- seed-pos.go
- subseq.go
- util-cli.go
- util-io.go
- util-logging.go
- util.go
- utils.go
- version.go