diff --git a/pkg/custom_slice.go b/pkg/custom_slice.go index 4704b95..9b6b685 100644 --- a/pkg/custom_slice.go +++ b/pkg/custom_slice.go @@ -19,7 +19,7 @@ func (a *PositiveSlice[T]) Get(idx int) T { } func (a *PositiveSlice[T]) Set(idx int, value T) { - if idx < 0 || idx >= len(a.valid) { // idx is outside the slice + if idx >= len(a.valid) { // idx is outside the slice // expand data array to 2*idx newData := make([]T, 2*idx+1) copy(newData, a.data) diff --git a/pkg/types.go b/pkg/types.go index e23c34b..1211736 100644 --- a/pkg/types.go +++ b/pkg/types.go @@ -40,25 +40,22 @@ func UnpackWavefrontLoHi(lohi WavefrontLoHi) (int, int) { return loBM, hiBM } -// bitpacked wavefront values with 1 valid bit, 3 traceback bits, and 28 bits for the diag distance -// technically this restricts to alignments with less than 268 million characters but that should be sufficient for most cases -type WavefrontValue uint32 - -// TODO: add 64 bit packed value in case more than 268 million characters are needed +// bitpacked wavefront values with 1 valid bit, 3 traceback bits, and 60 bits for the diag distance +type WavefrontValue uint64 // PackWavefrontValue: packs a diag value and traceback into a WavefrontValue -func PackWavefrontValue(value uint32, traceback Traceback) WavefrontValue { - validBM := uint32(0x8000_0000) - tracebackBM := uint32(traceback&0x0000_0007) << 28 - valueBM := value & 0x0FFF_FFFF +func PackWavefrontValue(value uint64, traceback Traceback) WavefrontValue { + validBM := uint64(0x8000_0000_0000_0000) + tracebackBM := uint64(traceback&0x0000_0007) << 60 + valueBM := uint64(value) & 0x0FFF_FFFF_FFFF_FFFF return WavefrontValue(validBM | tracebackBM | valueBM) } // UnpackWavefrontValue: opens a WavefrontValue into a valid bool, diag value and traceback -func UnpackWavefrontValue(wfv WavefrontValue) (bool, uint32, Traceback) { - validBM := wfv&0x8000_0000 != 0 - tracebackBM := uint8(wfv & 0x7000_0000 >> 28) - valueBM := uint32(wfv & 0x0FFF_FFFF) +func 
UnpackWavefrontValue(wfv WavefrontValue) (bool, uint64, Traceback) { + validBM := wfv&0x8000_0000_0000_0000 != 0 + tracebackBM := uint8(wfv & 0x7000_0000_0000_0000 >> 60) + valueBM := uint64(wfv & 0x0FFF_FFFF_FFFF_FFFF) // low 60 bits, same mask as PackWavefrontValue return validBM, valueBM, Traceback(tracebackBM) } @@ -131,12 +128,12 @@ func NewWavefrontComponent() *WavefrontComponent { } // GetVal: get value for wavefront=score, diag=k => returns ok, value, traceback -func (w *WavefrontComponent) GetVal(score int, k int) (bool, uint32, Traceback) { +func (w *WavefrontComponent) GetVal(score int, k int) (bool, uint64, Traceback) { return UnpackWavefrontValue(w.W.Get(score).Get(k)) } // SetVal: set value, traceback for wavefront=score, diag=k -func (w *WavefrontComponent) SetVal(score int, k int, val uint32, tb Traceback) { +func (w *WavefrontComponent) SetVal(score int, k int, val uint64, tb Traceback) { w.W.Get(score).Set(k, PackWavefrontValue(val, tb)) } diff --git a/pkg/utils.go b/pkg/utils.go index 94db843..fa5840b 100644 --- a/pkg/utils.go +++ b/pkg/utils.go @@ -7,6 +7,7 @@ import ( "golang.org/x/exp/constraints" ) +// convert an unsigned int to string func UIntToString(num uint) string { // num assumed to be positive var builder strings.Builder @@ -25,6 +26,7 @@ func UIntToString(num uint) string { // num assumed to be positive return string(str) } +// decode runlength encoded string such as CIGARs func RunLengthDecode(encoded string) string { decoded := strings.Builder{} length := len(encoded) @@ -51,28 +53,17 @@ func RunLengthDecode(encoded string) string { return decoded.String() } +// given the min index, return the item in values at that index func SafeMin[T constraints.Integer](values []T, idx int) T { return values[idx] } +// given the max index, return the item in values at that index func SafeMax[T constraints.Integer](values []T, idx int) T { return values[idx] } -func SafeArgMax[T constraints.Integer](valids []bool, values []T) (bool, int) { - hasValid := false - maxIndex := 0 - maxValue := 
math.MinInt - for i := range valids { - if valids[i] && int(values[i]) > maxValue { - hasValid = true - maxIndex = i - maxValue = int(values[i]) - } - } - return hasValid, maxIndex -} - +// given array of values and corresponding array of valid flags, find the min of value which is valid or return false if there does not exist any func SafeArgMin[T constraints.Integer](valids []bool, values []T) (bool, int) { hasValid := false minIndex := 0 @@ -91,6 +82,22 @@ func SafeArgMin[T constraints.Integer](valids []bool, values []T) (bool, int) { } } +// given array of values and corresponding array of valid flags, find the max of value which is valid or return false if there does not exist any +func SafeArgMax[T constraints.Integer](valids []bool, values []T) (bool, int) { + hasValid := false + maxIndex := 0 + maxValue := math.MinInt + for i := range valids { + if valids[i] && int(values[i]) > maxValue { + hasValid = true + maxIndex = i + maxValue = int(values[i]) + } + } + return hasValid, maxIndex +} + +// set the next lo and hi bounds for wavefronts M, I, D func NextLoHi(M *WavefrontComponent, I *WavefrontComponent, D *WavefrontComponent, score int, penalties Penalty) (int, int) { x := penalties.X o := penalties.O @@ -121,6 +128,7 @@ func NextLoHi(M *WavefrontComponent, I *WavefrontComponent, D *WavefrontComponen return lo, hi } +// set the traceback and diag value for the next I wavefront func NextI(M *WavefrontComponent, I *WavefrontComponent, score int, k int, penalties Penalty) { o := penalties.O e := penalties.E @@ -128,13 +136,14 @@ func NextI(M *WavefrontComponent, I *WavefrontComponent, score int, k int, penal a_ok, a, _ := M.GetVal(score-o-e, k-1) b_ok, b, _ := I.GetVal(score-e, k-1) - ok, nextITraceback := SafeArgMax([]bool{a_ok, b_ok}, []uint32{a, b}) - nextIVal := SafeMax([]uint32{a, b}, nextITraceback) + 1 // important that the +1 is here + ok, nextITraceback := SafeArgMax([]bool{a_ok, b_ok}, []uint64{a, b}) + nextIVal := SafeMax([]uint64{a, b}, 
nextITraceback) + 1 // important that the +1 is here if ok { I.SetVal(score, k, nextIVal, []Traceback{OpenIns, ExtdIns}[nextITraceback]) } } +// set the traceback and diag value for the next D wavefront func NextD(M *WavefrontComponent, D *WavefrontComponent, score int, k int, penalties Penalty) { o := penalties.O e := penalties.E @@ -142,13 +151,14 @@ func NextD(M *WavefrontComponent, D *WavefrontComponent, score int, k int, penal a_ok, a, _ := M.GetVal(score-o-e, k+1) b_ok, b, _ := D.GetVal(score-e, k+1) - ok, nextDTraceback := SafeArgMax([]bool{a_ok, b_ok}, []uint32{a, b}) - nextDVal := SafeMax([]uint32{a, b}, nextDTraceback) + ok, nextDTraceback := SafeArgMax([]bool{a_ok, b_ok}, []uint64{a, b}) + nextDVal := SafeMax([]uint64{a, b}, nextDTraceback) if ok { D.SetVal(score, k, nextDVal, []Traceback{OpenDel, ExtdDel}[nextDTraceback]) } } +// set the traceback and diag value for the next M wavefront func NextM(M *WavefrontComponent, I *WavefrontComponent, D *WavefrontComponent, score int, k int, penalties Penalty) { x := penalties.X @@ -157,8 +167,8 @@ func NextM(M *WavefrontComponent, I *WavefrontComponent, D *WavefrontComponent, b_ok, b, _ := I.GetVal(score, k) c_ok, c, _ := D.GetVal(score, k) - ok, nextMTraceback := SafeArgMax([]bool{a_ok, b_ok, c_ok}, []uint32{a, b, c}) - nextMVal := SafeMax([]uint32{a, b, c}, nextMTraceback) + ok, nextMTraceback := SafeArgMax([]bool{a_ok, b_ok, c_ok}, []uint64{a, b, c}) + nextMVal := SafeMax([]uint64{a, b, c}, nextMTraceback) if ok { M.SetVal(score, k, nextMVal, []Traceback{Sub, Ins, Del}[nextMTraceback]) } diff --git a/pkg/wfa.go b/pkg/wfa.go index 56e2b48..a6ca9f2 100644 --- a/pkg/wfa.go +++ b/pkg/wfa.go @@ -4,11 +4,12 @@ import ( "strings" ) +// WFAlign takes strings s1, s2, penalties, and returns the score and CIGAR if doCIGAR is true func WFAlign(s1 string, s2 string, penalties Penalty, doCIGAR bool) Result { n := len(s1) m := len(s2) - A_k := m - n - A_offset := uint32(m) + A_k := m - n // diagonal where both sequences 
end + A_offset := uint64(m) // offset along a_k diagonal corresponding to end score := 0 M := NewWavefrontComponent() M.SetLoHi(0, 0, 0) @@ -19,7 +20,7 @@ func WFAlign(s1 string, s2 string, penalties Penalty, doCIGAR bool) Result { for { WFExtend(M, s1, n, s2, m, score) ok, val, _ := M.GetVal(score, A_k) - if ok && val >= A_offset { + if ok && val >= A_offset { // exit when M_(s,a_k) >= A_offset, ie the wavefront has reached the end break } score = score + 1 @@ -27,7 +28,7 @@ func WFAlign(s1 string, s2 string, penalties Penalty, doCIGAR bool) Result { } CIGAR := "" - if doCIGAR { + if doCIGAR { // if doCIGAR, then perform backtrace, otherwise just return the score CIGAR = WFBacktrace(M, I, D, score, penalties, A_k, A_offset, s1, s2) } @@ -39,23 +40,24 @@ func WFAlign(s1 string, s2 string, penalties Penalty, doCIGAR bool) Result { func WFExtend(M *WavefrontComponent, s1 string, n int, s2 string, m int, score int) { _, lo, hi := M.GetLoHi(score) - for k := lo; k <= hi; k++ { + for k := lo; k <= hi; k++ { // for each diagonal in current wavefront // v = M[score][k] - k // h = M[score][k] - ok, hu, _ := M.GetVal(score, k) - h := int(hu) - v := h - k - - // exit early if v or h are invalid + ok, uh, tb := M.GetVal(score, k) + // exit early if M_(s,k) is invalid if !ok { continue } - for v < n && h < m && s1[v] == s2[h] { - _, val, tb := M.GetVal(score, k) - M.SetVal(score, k, val+1, tb) + h := int(uh) + v := h - k + // in the paper, we do v++, h++, M_(s,k)++ + // however, note that h = M_(s,k) so instead we just do v++, h++ and set M_(s,k) at the end + // this saves some memory reads and writes + for v < n && h < m && s1[v] == s2[h] { // extend diagonal for the next set of matches v++ h++ } + M.SetVal(score, k, uint64(h), tb) } } @@ -63,14 +65,14 @@ func WFNext(M *WavefrontComponent, I *WavefrontComponent, D *WavefrontComponent, // get this score's lo, hi lo, hi := NextLoHi(M, I, D, score, penalties) - for k := lo; k <= hi; k++ { + for k := lo; k <= hi; k++ { // for 
each diagonal, extend the matrices for the next wavefronts NextI(M, I, score, k, penalties) NextD(M, D, score, k, penalties) NextM(M, I, D, score, k, penalties) } } -func WFBacktrace(M *WavefrontComponent, I *WavefrontComponent, D *WavefrontComponent, score int, penalties Penalty, A_k int, A_offset uint32, s1 string, s2 string) string { +func WFBacktrace(M *WavefrontComponent, I *WavefrontComponent, D *WavefrontComponent, score int, penalties Penalty, A_k int, A_offset uint64, s1 string, s2 string) string { x := penalties.X o := penalties.O e := penalties.E diff --git a/test/wfa_test.go b/test/wfa_test.go index 6a7cdb3..2574fec 100644 --- a/test/wfa_test.go +++ b/test/wfa_test.go @@ -36,8 +36,8 @@ func randRange[T constraints.Integer](min, max int) T { func TestWavefrontPacking(t *testing.T) { for range 1000 { - val := randRange[uint32](0, 1000) - tb := wfa.Traceback(randRange[uint32](0, 7)) + val := randRange[uint64](0, 1000) + tb := wfa.Traceback(randRange[uint64](0, 7)) v := wfa.PackWavefrontValue(val, tb) valid, gotVal, gotTB := wfa.UnpackWavefrontValue(v)