cmd

package

v0.8.1 Latest Latest Go to latest Published: Dec 19, 2025 License: MIT Imports: 59 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/shenwei356/LexicMap

Links

Open Source Insights

Documentation ¶

Index ¶

Constants
Variables
func BuildIndex(outdir string, infiles []string, opt *IndexBuildingOptions) error
func CheckIndexBuildingOptions(opt *IndexBuildingOptions) error
func CheckIndexSearchingOptions(opt *IndexSearchingOptions) error
func ClearSubstrPairs(poolSub *sync.Pool, subs *[]*SubstrPair, k int)
func Combinations2(set []uint64) [][2]uint64
func Execute()
func IntSlice2StringSlice(vals []int) []string
func MeanStdev(values []float64) (float64, float64)
func ParseByteSize(val string) (int64, error)
func RC(s []byte) []byte
func RecycleChaining2Result(chains *[]*Chain2Result)
func RecycleChainingResult(chains *[]*[]int32)
func RecycleSearchResultOfAGenome(r *SearchResultOfAGenome)
func RecycleSeqComparatorResult(r *SeqComparatorResult)
func RecycleSubstrPairs(poolSub *sync.Pool, poolSubs *sync.Pool, subs *[]*SubstrPair)
func TrimSubStrPairs(poolSub *sync.Pool, subs *[]*SubstrPair, k int, minDist float64)
type Chain2Result
- func (r *Chain2Result) Reset()
type Chain3Result
- func (r *Chain3Result) Reset()
type Chainer
- func NewChainer(options *ChainingOptions) *Chainer
- func (ce *Chainer) Chain(subs *[]*SubstrPair) (*[]*[]int32, float64)
type Chainer2
- func NewChainer2(options *Chaining2Options) *Chainer2
- func (ce *Chainer2) Chain(subs *[]*SubstrPair) (*[]*Chain2Result, int, int, int, int, int, int, int)
type Chainer3
- func NewChainer3(options *Chaining3Options) *Chainer3
- func (ce *Chainer3) Chain(subs *[]*SubstrPair) *Chain3Result
type Chaining2Options
type Chaining3Options
type ChainingOptions
type Index
- func NewIndexSearcher(outDir string, opt *IndexSearchingOptions) (*Index, error)
- func (idx *Index) Close() error
- func (idx *Index) RecycleSearchResult(r *SearchResult)
- func (idx *Index) RecycleSearchResults(sr *[]*SearchResult)
- func (idx *Index) RecycleSimilarityDetails(sds *[]*SimilarityDetail)
- func (idx *Index) Search(query *Query) (*[]*SearchResult, error)
- func (idx *Index) SetSeqCompareOptions(sco *SeqComparatorOptions)
type IndexBuildingOptions
type IndexInfo
type IndexSearchingOptions
type Options
type Query
- func (q *Query) Reset()
type SearchResult
- func (r *SearchResult) Reset()
- func (sr *SearchResult) SortBySeqID()
type SearchResultOfAGenome
type SearchResultOfASequence
type SearchResultReader
- func NewSearchResultReader(file string, query string, bufferSize int64) (*SearchResultReader, error)
- func (r *SearchResultReader) Next() *SearchResultOfAGenome
type SearchResultsHeap
- func (h SearchResultsHeap) Len() int
- func (h SearchResultsHeap) Less(i, j int) bool
- func (h SearchResultsHeap) Pop() interface{}
- func (h SearchResultsHeap) Push(x interface{})
- func (h SearchResultsHeap) Swap(i, j int)
type SeqComparator
- func NewSeqComparator(options *SeqComparatorOptions, poolChainers *sync.Pool) *SeqComparator
- func (cpr *SeqComparator) Compare(begin, end uint32, s []byte, queryLen int) (*SeqComparatorResult, error)
- func (cpr *SeqComparator) Index(s []byte) error
- func (cpr *SeqComparator) RecycleIndex()
type SeqComparatorOptions
type SeqComparatorResult
- func (r *SeqComparatorResult) Update(chains *[]*Chain2Result, queryLen int)
- func (r *SeqComparatorResult) Update2(chains *[]*Chain2Result, queryLen int)
type SimilarityDetail
type SubstrPair
- func (s SubstrPair) String() string
type Uint64Slice
- func (s Uint64Slice) Len() int
- func (s Uint64Slice) Less(i, j int) bool
- func (s *Uint64Slice) Pop() interface{}
- func (s *Uint64Slice) Push(x interface{})
- func (s Uint64Slice) Swap(i, j int)

Constants ¶

View Source

const BITS_BATCH_IDX = 17

BITS_BATCH_IDX is the number of bits to store the genome batch index.

View Source

const BITS_FLAGS = BITS_STRAND + BITS_REVERSE

BITS_FLAGS is the number of bits to store the two flags (strand and reverse).

View Source

const BITS_GENOME_IDX = 17

BITS_GENOME_IDX is the number of bits to store the genome index.

View Source

const BITS_IDX = BITS_BATCH_IDX + BITS_BATCH_IDX

BITS_IDX is the number of bits to store batch index and genome index.

View Source

const BITS_IDX_FLAGS = BITS_IDX + BITS_FLAGS

BITS_IDX_FLAGS is the sum of BITS_IDX and BITS_FLAGS

View Source

const BITS_NONE_IDX = 64 - BITS_BATCH_IDX - BITS_GENOME_IDX

BITS_NONE_IDX is the number of bits to store data except for batch index and genome index.

View Source

const BITS_NONE_POS = 64 - BITS_POSITION

BITS_NONE_POS is the number of bits except for position

View Source

const BITS_POSITION = 28

BITS_POSITION is the number of bits to store the k-mer position/coordinate.

View Source

const BITS_REVERSE = 1

BITS_REVERSE is the number of bits for the flag indicating whether the k-mer is reversed.

View Source

const BITS_STRAND = 1

BITS_STRAND is the flag to indicate if the k-mer is from the reverse complement strand.

View Source

const DirGenomes = "genomes"

DirGenomes is the directory of genome data

View Source

const DirSeeds = "seeds"

DirSeeds is the directory of k-mer-value data files

View Source

const ExtSeeds = ".bin"

ExtSeeds is the file extension of k-mer-value data files

View Source

const ExtTmpDir = ".tmp"

ExtTmpDir is the path extension for temporary files

View Source

const FileGenomeChunks = "genomes.chunks.bin"

FileGenomeChunks stores lists of batch+genome index of genome chunks

View Source

const FileGenomeIndex = "genomes.map.bin"

FileGenomeIndex maps genome id to genome batch id and index in the batch

View Source

const FileGenomes = "genomes.bin"

FileGenomes is the name of each genome file

View Source

const FileInfo = "info.toml"

FileInfo is the summary file

View Source

const FileMasks = "masks.bin"

FileMasks is the file for storing lexichash mask

View Source

const FileSeedPositions = "seed_positions.bin"

FileSeedPositions is the name of seed position file

View Source

const MASK_GENOME_IDX = (1 << BITS_GENOME_IDX) - 1

MASK_GENOME_IDX is the mask of genome index.

View Source

const MASK_NONE_IDX = (1 << BITS_NONE_IDX) - 1

MASK_NONE_IDX is the mask of non-index data

View Source

const MASK_REVERSE = 1

MASK_REVERSE is the mask of reversed flag

View Source

const MAX_GENOME_SIZE = 1 << BITS_POSITION

MAX_GENOME_SIZE is the maximum genome size, 268435456

View Source

const NO_VALID_SEQS = "no_valid_seqs"

NO_VALID_SEQS means there are no valid sequences in a genome file.

View Source

const OpD = uint64('D')

View Source

const OpH = uint64('H')

View Source

const OpI = uint64('I')

View Source

const OpM = uint64('M')

View Source

const OpX = uint64('X')

View Source

const TOO_LARGE_GENOME = "too_large_genome"

TOO_LARGE_GENOME means the genome is too big to index.

View Source

const TOO_MANY_SEQS = "too_many_seqs"

TOO_MANY_SEQS means there are too many sequences, as we require: $total_bases + ($num_contigs - 1) * $interval_size <= 268,435,456

Variables ¶

View Source

var BufferSize = 65536 // os.Getpagesize()

BufferSize is size of buffer

View Source

var COMMIT = ""

COMMIT can be passed from the command line: commit=$(git rev-parse --short HEAD) go build -trimpath -o=lexicmap -ldflags="-s -w -X github.com/shenwei356/LexicMap/lexicmap/cmd.COMMIT=$commit" -tags netgo

View Source

var DefaultChaining2Options = Chaining2Options{
	MaxGap:      50,
	MinScore:    50,
	MinAlignLen: 50,

	MaxDistance: 100,
	BandCount:   50,
	BandBase:    100,
}

DefaultChaining2Options is the default value of Chaining2Options.

View Source

var DefaultChaining3Options = Chaining3Options{
	MaxGap:      5,
	MinScore:    1,
	MinAlignLen: 2,

	MaxDistance: 10,
	BandCount:   20,
	BandBase:    10,
}

DefaultChaining3Options is the default value of Chaining3Options.

View Source

var DefaultChainingOptions = ChainingOptions{
	MaxGap:      5000,
	MinScore:    40,
	MaxDistance: 10000,
}

DefaultChainingOptions is the default value of ChainingOptions.

View Source

var DefaultIndexSearchingOptions = IndexSearchingOptions{
	NumCPUs:      runtime.NumCPU(),
	MaxOpenFiles: 512,

	MinPrefix: 15,

	MinSinglePrefix: 17,

	TopN: 500,

	MaxGap:      5000,
	MaxDistance: 10000,

	ExtendLength:                     2000,
	ExtendLength2:                    50,
	MinQueryAlignedFractionInAGenome: 70,
	MaxEvalue:                        10,
}

View Source

var DefaultSeqComparatorOptions = SeqComparatorOptions{
	K:         32,
	MinPrefix: 11,

	Chaining2Options: Chaining2Options{

		MaxGap: 50,

		MinScore: 50,

		MaxDistance: 50,

		BandBase: 100,
	},

	MinAlignedFraction: 0,
}

DefaultSeqComparatorOptions contains the default options for SeqComparatorOptions.

View Source

var MainVersion uint8 = 3

MainVersion is used for checking compatibility

View Source

var MinorVersion uint8 = 4

MinorVersion is less important

View Source

var RootCmd = &cobra.Command{
	Use:   "lexicmap",
	Short: "efficient sequence alignment against millions of prokaryotic genomes",
	Long: fmt.Sprintf(`
   LexicMap: efficient sequence alignment against millions of prokaryotic genomes

    Version: v%s
  Documents: https://bioinf.shenwei.me/LexicMap
Source code: https://github.com/shenwei356/LexicMap
Please cite: https://doi.org/10.1038/s41587-025-02812-8 Nature Biotechnology (2025)

`, VERSION),
}

RootCmd represents the base command when called without any subcommands

View Source

var Strands = [2]byte{'+', '-'}

Strands could be used to output strand for a reverse complement flag

View Source

var VERSION = "0.8.1"

VERSION is the version

Functions ¶

func BuildIndex ¶ added in v0.2.0

func BuildIndex(outdir string, infiles []string, opt *IndexBuildingOptions) error

BuildIndex builds index from a list of input files

func CheckIndexBuildingOptions ¶ added in v0.2.0

func CheckIndexBuildingOptions(opt *IndexBuildingOptions) error

CheckIndexBuildingOptions checks some important options

func CheckIndexSearchingOptions ¶ added in v0.2.0

func CheckIndexSearchingOptions(opt *IndexSearchingOptions) error

func ClearSubstrPairs ¶ added in v0.2.0

func ClearSubstrPairs(poolSub *sync.Pool, subs *[]*SubstrPair, k int)

ClearSubstrPairs removes nested/embedded and same anchors. k is the largest k-mer size.

func Execute ¶

func Execute()

Execute adds all child commands to the root command and sets flags appropriately. This is called by main.main(). It only needs to happen once to the rootCmd.

func IntSlice2StringSlice ¶

func IntSlice2StringSlice(vals []int) []string

func MeanStdev ¶

func MeanStdev(values []float64) (float64, float64)

func ParseByteSize ¶ added in v0.4.0

func ParseByteSize(val string) (int64, error)

ParseByteSize parses byte size from string

func RC ¶ added in v0.2.0

func RC(s []byte) []byte

RC computes the reverse complement sequence

func RecycleChaining2Result ¶ added in v0.2.0

func RecycleChaining2Result(chains *[]*Chain2Result)

RecycleChaining2Result recycles the chaining paths. Please remember to call this after using the results.

func RecycleChainingResult ¶ added in v0.2.0

func RecycleChainingResult(chains *[]*[]int32)

RecycleChainingResult recycles the chaining results. Please remember to call this after using the results.

func RecycleSearchResultOfAGenome ¶ added in v0.8.0

func RecycleSearchResultOfAGenome(r *SearchResultOfAGenome)

func RecycleSeqComparatorResult ¶ added in v0.2.0

func RecycleSeqComparatorResult(r *SeqComparatorResult)

RecycleSeqComparatorResult recycles a SeqComparatorResult

func RecycleSubstrPairs ¶ added in v0.2.0

func RecycleSubstrPairs(poolSub *sync.Pool, poolSubs *sync.Pool, subs *[]*SubstrPair)

RecycleSubstrPairs recycles a list of SubstrPairs

func TrimSubStrPairs ¶ added in v0.5.0

func TrimSubStrPairs(poolSub *sync.Pool, subs *[]*SubstrPair, k int, minDist float64)

TrimSubStrPairs trims anchors for query/subjects with tandem repeats in either end.

case 1: embedded anchor in query/target

61: 156-186 (+) vs 1163-1193 (+), len:31
62: 157-187 (-) vs 1164-1194 (-), len:31
63: 158-188 (+) vs 1165-1195 (+), len:31
64: 168-195 (-) vs 1168-1195 (-), len:28
65: 175-202 (-) vs 1168-1195 (-), len:28 <---
66: 182-209 (-) vs 1168-1195 (-), len:28 <---
67: 189-216 (-) vs 1168-1195 (-), len:28 <---
68: 196-223 (+) vs 1168-1195 (+), len:28 <---
69: 203-230 (+) vs 1168-1195 (+), len:28 <---
70: 210-237 (-) vs 1168-1195 (-), len:28 <---
71: 217-244 (-) vs 1168-1195 (-), len:28 <--- gap=7, overlap=28 (28/28)

case 2: big overlap + big gap

727: 789-819 (-) vs 789-819 (-), len:31
728: 790-820 (-) vs 790-820 (-), len:31
729: 804-821 (-) vs 821-838 (-), len:18 <--- gap=17, overlap=17 (17/18)

Types ¶

type Chain2Result ¶ added in v0.3.0

type Chain2Result struct {
	NAnchors int // The number of substrings

	AlignedFraction float64

	MatchedBases  int     // The number of matched bases.
	AlignedBasesQ int     // The number of aligned bases in Query sequence
	AlignedBasesT int     // The number of aligned bases in Subject sequence
	PIdent        float64 // percentage of identity
	AlignedLength int     // Aligned length, might be longer than AlignedBasesQ or AlignedBasesT
	Gaps          int     // The number of gaps

	QBegin, QEnd int // Query begin/end position (0-based)
	TBegin, TEnd int // Target begin/end position (0-based)

	MaxExtLen int // max extend length

	// for output
	CIGAR     []byte // cigar string
	QSeq      []byte // query seq
	TSeq      []byte // target seq
	Alignment []byte // alignment text

	// statistic
	Score    int
	BitScore int
	Evalue   float64
	// contains filtered or unexported fields
}

Chain2Result represents a result of a chain

func (*Chain2Result) Reset ¶ added in v0.3.0

func (r *Chain2Result) Reset()

Reset resets a Chain2Result

type Chain3Result ¶ added in v0.6.0

type Chain3Result struct {
	NAnchors int // The number of substrings

	AlignedFraction float64

	MatchedBases  int     // The number of matched bases.
	AlignedBasesQ int     // The number of aligned bases in Query sequence
	AlignedBasesT int     // The number of aligned bases in Subject sequence
	PIdent        float64 // percentage of identity
	AlignedLength int     // Aligned length, might be longer than AlignedBasesQ or AlignedBasesT
	Gaps          int     // The number of gaps

	QBegin, QEnd int // Query begin/end position (0-based)
	TBegin, TEnd int // Target begin/end position (0-based)
}

Chain3Result represents a result of a chain

func (*Chain3Result) Reset ¶ added in v0.6.0

func (r *Chain3Result) Reset()

Reset resets a Chain3Result

type Chainer ¶ added in v0.2.0

type Chainer struct {
	// contains filtered or unexported fields
}

Chainer is an object for chaining the lexichash substrings between query and reference sequences.

func NewChainer ¶ added in v0.2.0

func NewChainer(options *ChainingOptions) *Chainer

NewChainer creates a new chainer.

func (*Chainer) Chain ¶ added in v0.2.0

func (ce *Chainer) Chain(subs *[]*SubstrPair) (*[]*[]int32, float64)

Chain finds the possible seed paths. Please remember to call RecycleChainingResult after using the results.

type Chainer2 ¶ added in v0.2.0

type Chainer2 struct {
	// contains filtered or unexported fields
}

Chainer2 is an object for chaining the anchors in two similar sequences. Anchors/seeds/substrings in Chainer2 are denser than those in Chainer, and the chaining score function is also much simpler, only considering the lengths of anchors and gaps between them.

func NewChainer2 ¶ added in v0.2.0

func NewChainer2(options *Chaining2Options) *Chainer2

NewChainer2 creates a new Chainer2.

func (*Chainer2) Chain ¶ added in v0.2.0

func (ce *Chainer2) Chain(subs *[]*SubstrPair) (*[]*Chain2Result, int, int, int, int, int, int, int)

Chain finds the possible chain paths. Please remember to call RecycleChaining2Result after using the results. Returned results:

Chain2Results.
The number of matched bases.
The number of aligned bases.
QBegin.
QEnd.
TBegin.
TEnd.

type Chainer3 ¶ added in v0.6.0

type Chainer3 struct {
	// contains filtered or unexported fields
}

Chainer3 is an object for chaining the anchors in two similar sequences. Anchors/seeds/substrings in Chainer3 are denser than those in Chainer, and the chaining score function is also much simpler, only considering the lengths of anchors and gaps between them.

func NewChainer3 ¶ added in v0.6.0

func NewChainer3(options *Chaining3Options) *Chainer3

NewChainer3 creates a new Chainer3.

func (*Chainer3) Chain ¶ added in v0.6.0

func (ce *Chainer3) Chain(subs *[]*SubstrPair) *Chain3Result

Chain finds the possible chain path. Please remember to call RecycleChaining3Result after using the results. Returned results:

Chain3Result.
The number of matched bases.
The number of aligned bases.
QBegin.
QEnd.
TBegin.
TEnd.

type Chaining2Options ¶ added in v0.2.0

type Chaining2Options struct {
	MaxGap      int
	MinScore    int // minimum score of a chain
	MinAlignLen int
	MinIdentity float64
	MaxDistance int
	BandCount   int // only check i in range of  i − A < j < i
	BandBase    int // only check i where i.Qstart+i.Len + A < j.Qstart
}

Chaining2Options contains all options in chaining.

type Chaining3Options ¶ added in v0.6.0

type Chaining3Options struct {
	MaxGap      int
	MinScore    int // minimum score of a chain
	MinAlignLen int
	MinIdentity float64
	MaxDistance int
	BandCount   int // only check i in range of  i − A < j < i
	BandBase    int // only check i where i.Qstart+i.Len + A < j.Qstart
}

Chaining3Options contains all options in chaining.

type ChainingOptions ¶ added in v0.2.0

type ChainingOptions struct {
	MaxGap      float64
	MinLen      uint8
	MinScore    float64
	MaxDistance float64
}

ChainingOptions contains all options in chaining.

type Index ¶ added in v0.2.0

type Index struct {

	// k-mer-value searchers
	Searchers         []*kv.Searcher
	InMemorySearchers []*kv.InMemorySearcher

	BatchGenomeIndex2GenomeID map[uint64][]byte

	Taxonomy *taxdump.Taxonomy
	// contains filtered or unexported fields
}

Index creates a LexicMap index from a path and supports searching with query sequences.

func NewIndexSearcher ¶ added in v0.2.0

func NewIndexSearcher(outDir string, opt *IndexSearchingOptions) (*Index, error)

NewIndexSearcher creates a new searcher

func (*Index) Close ¶ added in v0.2.0

func (idx *Index) Close() error

Close closes the searcher.

func (*Index) RecycleSearchResult ¶ added in v0.2.0

func (idx *Index) RecycleSearchResult(r *SearchResult)

RecycleSearchResult recycles a search result object

func (*Index) RecycleSearchResults ¶ added in v0.2.0

func (idx *Index) RecycleSearchResults(sr *[]*SearchResult)

RecycleSearchResults recycles search results objects

func (*Index) RecycleSimilarityDetails ¶ added in v0.3.0

func (idx *Index) RecycleSimilarityDetails(sds *[]*SimilarityDetail)

RecycleSimilarityDetails recycles a list of SimilarityDetails

func (*Index) Search ¶ added in v0.2.0

func (idx *Index) Search(query *Query) (*[]*SearchResult, error)

Search queries the index with a sequence. After using the results, do not forget to call RecycleSearchResults().

func (*Index) SetSeqCompareOptions ¶ added in v0.2.0

func (idx *Index) SetSeqCompareOptions(sco *SeqComparatorOptions)

SetSeqCompareOptions sets the sequence comparing options

type IndexBuildingOptions ¶ added in v0.2.0

type IndexBuildingOptions struct {
	// general
	NumCPUs      int
	Verbose      bool // show log
	Log2File     bool // log file
	Force        bool // force overwrite existed index
	MaxOpenFiles int  // maximum opened files, used in merging indexes
	MergeThreads int  // Maximum Concurrent Merge Jobs

	MinSeqLen int // minimum sequence length, should be >= k

	// skipping extremely large genome
	MaxGenomeSize int    // Maximum genome size. Extremely large genomes (non-isolate assemblies) will be skipped
	BigGenomeFile string // Out file of skipped files with genomes

	// LexicHash
	MaskFile string // file of custom masks

	K        int   // k-mer size
	Masks    int   // number of masks
	RandSeed int64 // random seed

	// filling sketching deserts
	DisableDesertFilling   bool   // disable desert filling (just for analysis index)
	DesertMaxLen           uint32 // maxi length of sketching deserts
	DesertExpectedSeedDist int    // expected distance between seeds
	DesertSeedPosRange     int    // the upstream and down stream region for adding a seeds

	// generate mask from the top N biggest genomes
	TopN      int // Select the the top N largest genomes for generating masks
	PrefixExt int // Extension length of prefixes

	Chunks     int // the number of chunks for storing k-mer data
	Partitions int // the number of partitions for indexing k-mer data

	GenomeBatchSize int // the maximum number of genomes of a batch

	ReRefName    *regexp.Regexp   // for extracting genome id from the file name
	ReSeqExclude []*regexp.Regexp // for excluding sequences according to name pattern

	ContigInterval int // the length of N's between contigs

	SaveSeedPositions bool

	Debug bool
}

IndexBuildingOptions contains all options for building a LexicMap index.

type IndexInfo ¶ added in v0.2.0

type IndexInfo struct {
	MainVersion      uint8 `toml:"main-version" comment:"Index format"`
	MinorVersion     uint8 `toml:"minor-version"`
	K                uint8 `toml:"max-K" comment:"LexicHash"`
	Masks            int   `toml:"masks"`
	RandSeed         int64 `toml:"rand-seed"`
	MaxDesert        int   `toml:"max-seed-dist" comment:"Seed distance"`
	SeedDistInDesert int   `toml:"seed-dist-in-desert"`
	Chunks           int   `toml:"chunks" comment:"Seeds (k-mer-value data) files"`
	Partitions       int   `toml:"index-partitions"`
	InputGenomes     int   `toml:"input-genomes" comment:"Input genomes"`
	InputBases       int64 `toml:"input-bases" comment:"Input bases"`
	Genomes          int   `` /* 239-byte string literal not displayed */
	GenomeBatchSize  int   `toml:"genome-batch-size"`
	GenomeBatches    int   `toml:"genome-batches"`
	ContigInterval   int   `toml:"contig-interval"`
}

IndexInfo contains summary of the index

type IndexSearchingOptions ¶ added in v0.2.0

type IndexSearchingOptions struct {
	// general
	NumCPUs      int
	Verbose      bool // show log
	Log2File     bool // log file
	MaxOpenFiles int  // maximum opened files, used in merging indexes

	InMemorySearch bool  // load the seed/kv data into memory
	MinPrefix      uint8 // minimum prefix length, e.g., 15
	// MaxMismatch     int   // maximum mismatch, e.g., 3
	MinSinglePrefix uint8 // minimum prefix length of the single seed, e.g., 20
	// MinMatchedBases uint8 // the total matched bases
	TopN int // keep the topN scores, e.g, 10

	// seeds chaining
	MaxGap      float64 // e.g., 5000
	MaxDistance float64 // e.g., 20k

	// alignment
	ExtendLength  int // the length of extra sequence on the flanking of seeds.
	ExtendLength2 int // the length of extra sequence on the flanking of pseudo-alignment region.

	// alignment filtering
	MinQueryAlignedFractionInAGenome float64 // minimum query aligned fraction in the target genome
	MaxEvalue                        float64

	// Output
	OutputSeq bool

	// debug
	Debug bool

	// filter results by taxid
	TaxdumpDir              string
	Genome2TaxIdFile        string
	TaxIds                  []uint32
	NegativeTaxIds          []uint32
	KeepGenomesWithoutTaxId bool
}

IndexSearchingOptions contains all options for searching

type Options ¶

type Options struct {
	NumCPUs int
	Verbose bool

	LogFile  string
	Log2File bool

	Compress         bool
	CompressionLevel int
}

Options contains the global flags

type Query ¶

type Query struct {
	// contains filtered or unexported fields
}

Query is an object for each query sequence, it also contains the query result.

func (*Query) Reset ¶

func (q *Query) Reset()

Reset resets the data for the next round of use

type SearchResult ¶ added in v0.2.0

type SearchResult struct {
	Subs *[]*SubstrPair // matched substring pairs (query,target)

	BatchGenomeIndex uint64 // just for finding genome chunks of the same genome

	GenomeBatch int
	GenomeIndex int
	// ID          []byte
	GenomeSize int

	Score  float64 //  score for soring
	Chains *[]*[]int32

	// more about the alignment detail
	SimilarityDetails *[]*SimilarityDetail // sequence comparing
	AlignedFraction   float64              // query coverage per genome
}

SearchResult stores a search result in a genome for the given query sequence.

func (*SearchResult) Reset ¶ added in v0.2.0

func (r *SearchResult) Reset()

func (*SearchResult) SortBySeqID ¶ added in v0.4.0

func (sr *SearchResult) SortBySeqID()

type SearchResultOfAGenome ¶ added in v0.8.0

type SearchResultOfAGenome struct {
	Hits    int
	Query   string
	Qlen    string
	Sgenome string
	QcovGnm string

	Score float64 // float64(BitScore) * PIdent

	Records []*SearchResultOfASequence
	// contains filtered or unexported fields
}

type SearchResultOfASequence ¶ added in v0.8.0

type SearchResultOfASequence struct {
	Sseqid   string
	Cls      string // should be smaller than int
	Hsp      string // should be smaller than int
	QcovHSP  string
	AlenHSP  string
	Pident   string
	Gaps     string
	Qstart   string
	Qend     string
	Sstart   string
	Send     string
	Sstr     string
	Slen     string
	Evalue   string
	Bitscore string

	Extra string
}

type SearchResultReader ¶ added in v0.8.0

type SearchResultReader struct {
	// contains filtered or unexported fields
}

func NewSearchResultReader ¶ added in v0.8.0

func NewSearchResultReader(file string, query string, bufferSize int64) (*SearchResultReader, error)

func (*SearchResultReader) Next ¶ added in v0.8.0

func (r *SearchResultReader) Next() *SearchResultOfAGenome

type SearchResultsHeap ¶ added in v0.8.0

type SearchResultsHeap struct {
	// contains filtered or unexported fields
}

func (SearchResultsHeap) Len ¶ added in v0.8.0

func (h SearchResultsHeap) Len() int

func (SearchResultsHeap) Less ¶ added in v0.8.0

func (h SearchResultsHeap) Less(i, j int) bool

func (SearchResultsHeap) Pop ¶ added in v0.8.0

func (h SearchResultsHeap) Pop() interface{}

func (SearchResultsHeap) Push ¶ added in v0.8.0

func (h SearchResultsHeap) Push(x interface{})

func (SearchResultsHeap) Swap ¶ added in v0.8.0

func (h SearchResultsHeap) Swap(i, j int)

type SeqComparator ¶ added in v0.2.0

type SeqComparator struct {
	// contains filtered or unexported fields
}

SeqComparator is for fast and accurate similarity estimation of two sequences, which are in the same strand (important).

func NewSeqComparator ¶ added in v0.2.0

func NewSeqComparator(options *SeqComparatorOptions, poolChainers *sync.Pool) *SeqComparator

NewSeqComparator creates a new SeqComparator with given options. No options checking now.

func (*SeqComparator) Compare ¶ added in v0.2.0

func (cpr *SeqComparator) Compare(begin, end uint32, s []byte, queryLen int) (*SeqComparatorResult, error)

Compare matches k-mers for the query sequence (begin: end), chains them up, and computes the similarity. Please remember to call RecycleSeqComparatorResult() to recycle the result.

func (*SeqComparator) Index ¶ added in v0.2.0

func (cpr *SeqComparator) Index(s []byte) error

Index initializes the SeqComparator with the query sequence.

func (*SeqComparator) RecycleIndex ¶ added in v0.2.0

func (cpr *SeqComparator) RecycleIndex()

RecycleIndex recycles the Index (tree data). Please call this if you do not need the comparator anymore.

type SeqComparatorOptions ¶ added in v0.2.0

type SeqComparatorOptions struct {
	// indexing
	K         uint8
	MinPrefix uint8

	// chaining
	Chaining2Options

	// seq similarity
	MinAlignedFraction float64 // minimum query aligned fraction in a HSP

	MinIdentity float64
}

SeqComparatorOptions contains options for comparing two sequences.

type SeqComparatorResult ¶ added in v0.2.0

type SeqComparatorResult struct {
	AlignedBases int // The number of aligned bases.

	AlignedFraction float64 // query (original query) coverage per HSP

	MatchedBases int
	PIdent       float64

	QueryLen int // length of the original query, used to compute/update AlignedFraction

	QBegin int
	QEnd   int
	TBegin int
	TEnd   int

	TSeq []byte // target seq

	Chains *[]*Chain2Result
}

SeqComparatorResult contains the details of a seq comparison result.

func (*SeqComparatorResult) Update ¶ added in v0.3.0

func (r *SeqComparatorResult) Update(chains *[]*Chain2Result, queryLen int)

Update updates the data with new chains. However, it does not consider gaps.

func (*SeqComparatorResult) Update2 ¶ added in v0.4.0

func (r *SeqComparatorResult) Update2(chains *[]*Chain2Result, queryLen int)

Update2 only computes the aligned fraction for all chains

type SimilarityDetail ¶ added in v0.2.0

type SimilarityDetail struct {
	// QBegin int
	// QEnd   int
	// TBegin int
	// TEnd   int
	RC bool

	SimilarityScore float64
	Similarity      *SeqComparatorResult
	// Chain           *[]int
	NSeeds int

	// sequence details
	SeqLen int
	SeqID  []byte // seqid of the region
}

SimilarityDetail is the similarity detail of one reference sequence

type SubstrPair ¶ added in v0.2.0

type SubstrPair struct {
	QBegin int32 // start position of the substring (0-based) in query
	TBegin int32 // start position of the substring (0-based) in reference

	Len uint8 // prefix length

	TRC bool // is the substring from the reference seq on the negative strand.
	QRC bool // is the substring from the query seq on the negative strand.

}

SubstrPair represents a pair of found substrings/seeds, it's also called an anchor.

func (SubstrPair) String ¶ added in v0.2.0

func (s SubstrPair) String() string

type Uint64Slice ¶

type Uint64Slice []uint64

func (Uint64Slice) Len ¶

func (s Uint64Slice) Len() int

func (Uint64Slice) Less ¶

func (s Uint64Slice) Less(i, j int) bool

func (*Uint64Slice) Pop ¶

func (s *Uint64Slice) Pop() interface{}

func (*Uint64Slice) Push ¶

func (s *Uint64Slice) Push(x interface{})

func (Uint64Slice) Swap ¶

func (s Uint64Slice) Swap(i, j int)

Directories ¶

Path	Synopsis
genome
kv
seedposition
tree
util

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL

Documentation ¶

Index ¶

Constants ¶

Variables ¶

Functions ¶

func BuildIndex ¶ added in v0.2.0

func CheckIndexBuildingOptions ¶ added in v0.2.0

func CheckIndexSearchingOptions ¶ added in v0.2.0

func ClearSubstrPairs ¶ added in v0.2.0

func Combinations2 ¶

func Execute ¶

func IntSlice2StringSlice ¶

func MeanStdev ¶

func ParseByteSize ¶ added in v0.4.0

func RC ¶ added in v0.2.0

func RecycleChaining2Result ¶ added in v0.2.0

func RecycleChainingResult ¶ added in v0.2.0

func RecycleSearchResultOfAGenome ¶ added in v0.8.0

func RecycleSeqComparatorResult ¶ added in v0.2.0

func RecycleSubstrPairs ¶ added in v0.2.0

func TrimSubStrPairs ¶ added in v0.5.0

Types ¶

type Chain2Result ¶ added in v0.3.0

func (*Chain2Result) Reset ¶ added in v0.3.0

type Chain3Result ¶ added in v0.6.0

func (*Chain3Result) Reset ¶ added in v0.6.0

type Chainer ¶ added in v0.2.0

func NewChainer ¶ added in v0.2.0

func (*Chainer) Chain ¶ added in v0.2.0

type Chainer2 ¶ added in v0.2.0

func NewChainer2 ¶ added in v0.2.0

func (*Chainer2) Chain ¶ added in v0.2.0

type Chainer3 ¶ added in v0.6.0

func NewChainer3 ¶ added in v0.6.0

func (*Chainer3) Chain ¶ added in v0.6.0

type Chaining2Options ¶ added in v0.2.0

type Chaining3Options ¶ added in v0.6.0

type ChainingOptions ¶ added in v0.2.0

type Index ¶ added in v0.2.0

func NewIndexSearcher ¶ added in v0.2.0

func (*Index) Close ¶ added in v0.2.0

func (*Index) RecycleSearchResult ¶ added in v0.2.0

func (*Index) RecycleSearchResults ¶ added in v0.2.0

func (*Index) RecycleSimilarityDetails ¶ added in v0.3.0

func (*Index) Search ¶ added in v0.2.0

func (*Index) SetSeqCompareOptions ¶ added in v0.2.0

type IndexBuildingOptions ¶ added in v0.2.0

type IndexInfo ¶ added in v0.2.0

type IndexSearchingOptions ¶ added in v0.2.0

type Options ¶

type Query ¶

func (*Query) Reset ¶

type SearchResult ¶ added in v0.2.0

func (*SearchResult) Reset ¶ added in v0.2.0

func (*SearchResult) SortBySeqID ¶ added in v0.4.0

type SearchResultOfAGenome ¶ added in v0.8.0

type SearchResultOfASequence ¶ added in v0.8.0

type SearchResultReader ¶ added in v0.8.0

func NewSearchResultReader ¶ added in v0.8.0

func (*SearchResultReader) Next ¶ added in v0.8.0

type SearchResultsHeap ¶ added in v0.8.0

func (SearchResultsHeap) Len ¶ added in v0.8.0

func (SearchResultsHeap) Less ¶ added in v0.8.0

func (SearchResultsHeap) Pop ¶ added in v0.8.0

func (SearchResultsHeap) Push ¶ added in v0.8.0

func (SearchResultsHeap) Swap ¶ added in v0.8.0

type SeqComparator ¶ added in v0.2.0

func NewSeqComparator ¶ added in v0.2.0

func (*SeqComparator) Compare ¶ added in v0.2.0

func (*SeqComparator) Index ¶ added in v0.2.0

func (*SeqComparator) RecycleIndex ¶ added in v0.2.0

type SeqComparatorOptions ¶ added in v0.2.0

type SeqComparatorResult ¶ added in v0.2.0

func (*SeqComparatorResult) Update ¶ added in v0.3.0

func (*SeqComparatorResult) Update2 ¶ added in v0.4.0

type SimilarityDetail ¶ added in v0.2.0

type SubstrPair ¶ added in v0.2.0

func (SubstrPair) String ¶ added in v0.2.0

type Uint64Slice ¶

func (Uint64Slice) Len ¶