3 Commits

Author SHA1 Message Date
63a375995c minor code cleanup, use uint64 for wavefront values 2025-05-07 21:20:13 +00:00
2351faf2d7 cleanup code 2025-05-01 18:04:42 +00:00
a446bbd923 fix readme,
update go mod
2025-03-12 00:09:18 +00:00
8 changed files with 80 additions and 81 deletions

View File

@@ -1,4 +1,4 @@
.PHONY: build clean test .PHONY: build clean test dev-init
build: clean build: clean
@echo "======================== Building Binary =======================" @echo "======================== Building Binary ======================="
@@ -23,4 +23,8 @@ test:
go tool pprof -top mem.prof go tool pprof -top mem.prof
@rm -f mem.prof @rm -f mem.prof
@rm -f test.test @rm -f test.test
dev-init:
apt install minify
go get -t wfa/test

View File

@@ -1,6 +1,6 @@
# Using WFA-JS # Using WFA-JS
Download `wfa.js` and `wfa.wasm`from [releases](https://git.tronnet.net/tronnet/WFA-JS/releases) to your project. Add to your script: Download `wfa.js` and `wfa.wasm`from [releases](https://git.tronnet.net/alu/WFA-JS/releases) to your project. Add to your script:
``` ```
import wfa from "./wfa.js" import wfa from "./wfa.js"

2
go.mod
View File

@@ -1,6 +1,6 @@
module wfa module wfa
go 1.23.2 go 1.23.6
require ( require (
github.com/schollz/progressbar/v3 v3.17.1 github.com/schollz/progressbar/v3 v3.17.1

View File

@@ -6,38 +6,31 @@ type PositiveSlice[T any] struct {
defaultValue T defaultValue T
} }
func (a *PositiveSlice[T]) TranslateIndex(idx int) int {
return idx
}
func (a *PositiveSlice[T]) Valid(idx int) bool { func (a *PositiveSlice[T]) Valid(idx int) bool {
actualIdx := a.TranslateIndex(idx) return 0 <= idx && idx < len(a.valid) && a.valid[idx]
return 0 <= actualIdx && actualIdx < len(a.valid) && a.valid[actualIdx]
} }
func (a *PositiveSlice[T]) Get(idx int) T { func (a *PositiveSlice[T]) Get(idx int) T {
actualIdx := a.TranslateIndex(idx) if 0 <= idx && idx < len(a.valid) && a.valid[idx] { // idx is in the slice
if 0 <= actualIdx && actualIdx < len(a.valid) && a.valid[actualIdx] { // idx is in the slice return a.data[idx]
return a.data[actualIdx]
} else { // idx is out of the slice } else { // idx is out of the slice
return a.defaultValue return a.defaultValue
} }
} }
func (a *PositiveSlice[T]) Set(idx int, value T) { func (a *PositiveSlice[T]) Set(idx int, value T) {
actualIdx := a.TranslateIndex(idx) if idx >= len(a.valid) { // idx is outside the slice
if actualIdx < 0 || actualIdx >= len(a.valid) { // idx is outside the slice // expand data array to 2*idx
// expand data array to actualIdx newData := make([]T, 2*idx+1)
newData := make([]T, 2*actualIdx+1)
copy(newData, a.data) copy(newData, a.data)
a.data = newData a.data = newData
// expand valid array to actualIdx // expand valid array to 2*idx
newValid := make([]bool, 2*actualIdx+1) newValid := make([]bool, 2*idx+1)
copy(newValid, a.valid) copy(newValid, a.valid)
a.valid = newValid a.valid = newValid
} }
a.data[actualIdx] = value a.data[idx] = value
a.valid[actualIdx] = true a.valid[idx] = true
} }

View File

@@ -40,25 +40,22 @@ func UnpackWavefrontLoHi(lohi WavefrontLoHi) (int, int) {
return loBM, hiBM return loBM, hiBM
} }
// bitpacked wavefront values with 1 valid bit, 3 traceback bits, and 28 bits for the diag distance // bitpacked wavefront values with 1 valid bit, 3 traceback bits, and 60 bits for the diag distance
// technically this restricts to alignments with less than 268 million characters but that should be sufficient for most cases type WavefrontValue uint64
type WavefrontValue uint32
// TODO: add 64 bit packed value in case more than 268 million characters are needed
// PackWavefrontValue: packs a diag value and traceback into a WavefrontValue // PackWavefrontValue: packs a diag value and traceback into a WavefrontValue
func PackWavefrontValue(value uint32, traceback Traceback) WavefrontValue { func PackWavefrontValue(value uint64, traceback Traceback) WavefrontValue {
validBM := uint32(0x8000_0000) validBM := uint64(0x8000_0000_0000_0000)
tracebackBM := uint32(traceback&0x0000_0007) << 28 tracebackBM := uint64(traceback&0x0000_0007) << 60
valueBM := value & 0x0FFF_FFFF valueBM := uint64(value) & 0x0FFF_FFFF_FFFF_FFFF
return WavefrontValue(validBM | tracebackBM | valueBM) return WavefrontValue(validBM | tracebackBM | valueBM)
} }
// UnpackWavefrontValue: opens a WavefrontValue into a valid bool, diag value and traceback // UnpackWavefrontValue: opens a WavefrontValue into a valid bool, diag value and traceback
func UnpackWavefrontValue(wfv WavefrontValue) (bool, uint32, Traceback) { func UnpackWavefrontValue(wfv WavefrontValue) (bool, uint64, Traceback) {
validBM := wfv&0x8000_0000 != 0 validBM := wfv&0x8000_0000_0000_0000 != 0
tracebackBM := uint8(wfv & 0x7000_0000 >> 28) tracebackBM := uint8(wfv & 0x7000_0000_0000_0000 >> 60)
valueBM := uint32(wfv & 0x0FFF_FFFF) valueBM := uint64(wfv & 0x0000_0000_FFFF_FFFF)
return validBM, valueBM, Traceback(tracebackBM) return validBM, valueBM, Traceback(tracebackBM)
} }
@@ -71,12 +68,9 @@ type Wavefront struct { // since wavefronts store diag distance, they should nev
// NewWavefront: returns a new wavefront with size accomodating lo and hi (inclusive) // NewWavefront: returns a new wavefront with size accomodating lo and hi (inclusive)
func NewWavefront(lo int, hi int) *Wavefront { func NewWavefront(lo int, hi int) *Wavefront {
a := &Wavefront{} a := &Wavefront{}
a.lohi = PackWavefrontLoHi(lo, hi) a.lohi = PackWavefrontLoHi(lo, hi)
size := hi - lo size := hi - lo
a.data = make([]WavefrontValue, size+1)
newData := make([]WavefrontValue, size+1)
a.data = newData
return a return a
} }
@@ -134,12 +128,12 @@ func NewWavefrontComponent() *WavefrontComponent {
} }
// GetVal: get value for wavefront=score, diag=k => returns ok, value, traceback // GetVal: get value for wavefront=score, diag=k => returns ok, value, traceback
func (w *WavefrontComponent) GetVal(score int, k int) (bool, uint32, Traceback) { func (w *WavefrontComponent) GetVal(score int, k int) (bool, uint64, Traceback) {
return UnpackWavefrontValue(w.W.Get(score).Get(k)) return UnpackWavefrontValue(w.W.Get(score).Get(k))
} }
// SetVal: set value, traceback for wavefront=score, diag=k // SetVal: set value, traceback for wavefront=score, diag=k
func (w *WavefrontComponent) SetVal(score int, k int, val uint32, tb Traceback) { func (w *WavefrontComponent) SetVal(score int, k int, val uint64, tb Traceback) {
w.W.Get(score).Set(k, PackWavefrontValue(val, tb)) w.W.Get(score).Set(k, PackWavefrontValue(val, tb))
} }

View File

@@ -7,6 +7,7 @@ import (
"golang.org/x/exp/constraints" "golang.org/x/exp/constraints"
) )
// convert an unsigned into to string
func UIntToString(num uint) string { // num assumed to be positive func UIntToString(num uint) string { // num assumed to be positive
var builder strings.Builder var builder strings.Builder
@@ -25,6 +26,7 @@ func UIntToString(num uint) string { // num assumed to be positive
return string(str) return string(str)
} }
// decode runlength encoded string such as CIGARs
func RunLengthDecode(encoded string) string { func RunLengthDecode(encoded string) string {
decoded := strings.Builder{} decoded := strings.Builder{}
length := len(encoded) length := len(encoded)
@@ -51,32 +53,17 @@ func RunLengthDecode(encoded string) string {
return decoded.String() return decoded.String()
} }
// given the min index, return the item in values at that index
func SafeMin[T constraints.Integer](values []T, idx int) T { func SafeMin[T constraints.Integer](values []T, idx int) T {
return values[idx] return values[idx]
} }
// given the max index, return the item in values at that index
func SafeMax[T constraints.Integer](values []T, idx int) T { func SafeMax[T constraints.Integer](values []T, idx int) T {
return values[idx] return values[idx]
} }
func SafeArgMax[T constraints.Integer](valids []bool, values []T) (bool, int) { // given array of values and corresponding array of valid flags, find the min of value which is valid or return false if there does not exist any
hasValid := false
maxIndex := 0
maxValue := math.MinInt
for i := 0; i < len(valids); i++ {
if valids[i] && int(values[i]) > maxValue {
hasValid = true
maxIndex = i
maxValue = int(values[i])
}
}
if hasValid {
return true, maxIndex
} else {
return false, 0
}
}
func SafeArgMin[T constraints.Integer](valids []bool, values []T) (bool, int) { func SafeArgMin[T constraints.Integer](valids []bool, values []T) (bool, int) {
hasValid := false hasValid := false
minIndex := 0 minIndex := 0
@@ -95,6 +82,22 @@ func SafeArgMin[T constraints.Integer](valids []bool, values []T) (bool, int) {
} }
} }
// given array of values and corresponding array of valid flags, find the max of value which is valid or return false if there does not exist any
func SafeArgMax[T constraints.Integer](valids []bool, values []T) (bool, int) {
hasValid := false
maxIndex := 0
maxValue := math.MinInt
for i := range valids {
if valids[i] && int(values[i]) > maxValue {
hasValid = true
maxIndex = i
maxValue = int(values[i])
}
}
return hasValid, maxIndex
}
// set the lext lo and hi bounds for wavefronts M, I, D
func NextLoHi(M *WavefrontComponent, I *WavefrontComponent, D *WavefrontComponent, score int, penalties Penalty) (int, int) { func NextLoHi(M *WavefrontComponent, I *WavefrontComponent, D *WavefrontComponent, score int, penalties Penalty) (int, int) {
x := penalties.X x := penalties.X
o := penalties.O o := penalties.O
@@ -125,6 +128,7 @@ func NextLoHi(M *WavefrontComponent, I *WavefrontComponent, D *WavefrontComponen
return lo, hi return lo, hi
} }
// set the traceback and diag value for the next I wavefront
func NextI(M *WavefrontComponent, I *WavefrontComponent, score int, k int, penalties Penalty) { func NextI(M *WavefrontComponent, I *WavefrontComponent, score int, k int, penalties Penalty) {
o := penalties.O o := penalties.O
e := penalties.E e := penalties.E
@@ -132,13 +136,14 @@ func NextI(M *WavefrontComponent, I *WavefrontComponent, score int, k int, penal
a_ok, a, _ := M.GetVal(score-o-e, k-1) a_ok, a, _ := M.GetVal(score-o-e, k-1)
b_ok, b, _ := I.GetVal(score-e, k-1) b_ok, b, _ := I.GetVal(score-e, k-1)
ok, nextITraceback := SafeArgMax([]bool{a_ok, b_ok}, []uint32{a, b}) ok, nextITraceback := SafeArgMax([]bool{a_ok, b_ok}, []uint64{a, b})
nextIVal := SafeMax([]uint32{a, b}, nextITraceback) + 1 // important that the +1 is here nextIVal := SafeMax([]uint64{a, b}, nextITraceback) + 1 // important that the +1 is here
if ok { if ok {
I.SetVal(score, k, nextIVal, []Traceback{OpenIns, ExtdIns}[nextITraceback]) I.SetVal(score, k, nextIVal, []Traceback{OpenIns, ExtdIns}[nextITraceback])
} }
} }
// set the traceback and diag value for the next D wavefront
func NextD(M *WavefrontComponent, D *WavefrontComponent, score int, k int, penalties Penalty) { func NextD(M *WavefrontComponent, D *WavefrontComponent, score int, k int, penalties Penalty) {
o := penalties.O o := penalties.O
e := penalties.E e := penalties.E
@@ -146,13 +151,14 @@ func NextD(M *WavefrontComponent, D *WavefrontComponent, score int, k int, penal
a_ok, a, _ := M.GetVal(score-o-e, k+1) a_ok, a, _ := M.GetVal(score-o-e, k+1)
b_ok, b, _ := D.GetVal(score-e, k+1) b_ok, b, _ := D.GetVal(score-e, k+1)
ok, nextDTraceback := SafeArgMax([]bool{a_ok, b_ok}, []uint32{a, b}) ok, nextDTraceback := SafeArgMax([]bool{a_ok, b_ok}, []uint64{a, b})
nextDVal := SafeMax([]uint32{a, b}, nextDTraceback) nextDVal := SafeMax([]uint64{a, b}, nextDTraceback)
if ok { if ok {
D.SetVal(score, k, nextDVal, []Traceback{OpenDel, ExtdDel}[nextDTraceback]) D.SetVal(score, k, nextDVal, []Traceback{OpenDel, ExtdDel}[nextDTraceback])
} }
} }
// set the traceback and diag value for the next M wavefront
func NextM(M *WavefrontComponent, I *WavefrontComponent, D *WavefrontComponent, score int, k int, penalties Penalty) { func NextM(M *WavefrontComponent, I *WavefrontComponent, D *WavefrontComponent, score int, k int, penalties Penalty) {
x := penalties.X x := penalties.X
@@ -161,8 +167,8 @@ func NextM(M *WavefrontComponent, I *WavefrontComponent, D *WavefrontComponent,
b_ok, b, _ := I.GetVal(score, k) b_ok, b, _ := I.GetVal(score, k)
c_ok, c, _ := D.GetVal(score, k) c_ok, c, _ := D.GetVal(score, k)
ok, nextMTraceback := SafeArgMax([]bool{a_ok, b_ok, c_ok}, []uint32{a, b, c}) ok, nextMTraceback := SafeArgMax([]bool{a_ok, b_ok, c_ok}, []uint64{a, b, c})
nextMVal := SafeMax([]uint32{a, b, c}, nextMTraceback) nextMVal := SafeMax([]uint64{a, b, c}, nextMTraceback)
if ok { if ok {
M.SetVal(score, k, nextMVal, []Traceback{Sub, Ins, Del}[nextMTraceback]) M.SetVal(score, k, nextMVal, []Traceback{Sub, Ins, Del}[nextMTraceback])
} }

View File

@@ -4,11 +4,12 @@ import (
"strings" "strings"
) )
// WFAlign takes strings s1, s2, penalties, and returns the score and CIGAR if doCIGAR is true
func WFAlign(s1 string, s2 string, penalties Penalty, doCIGAR bool) Result { func WFAlign(s1 string, s2 string, penalties Penalty, doCIGAR bool) Result {
n := len(s1) n := len(s1)
m := len(s2) m := len(s2)
A_k := m - n A_k := m - n // diagonal where both sequences end
A_offset := uint32(m) A_offset := uint64(m) // offset along a_k diagonal corresponding to end
score := 0 score := 0
M := NewWavefrontComponent() M := NewWavefrontComponent()
M.SetLoHi(0, 0, 0) M.SetLoHi(0, 0, 0)
@@ -19,7 +20,7 @@ func WFAlign(s1 string, s2 string, penalties Penalty, doCIGAR bool) Result {
for { for {
WFExtend(M, s1, n, s2, m, score) WFExtend(M, s1, n, s2, m, score)
ok, val, _ := M.GetVal(score, A_k) ok, val, _ := M.GetVal(score, A_k)
if ok && val >= A_offset { if ok && val >= A_offset { // exit when M_(s,a_k) >= A_offset, ie the wavefront has reached the end
break break
} }
score = score + 1 score = score + 1
@@ -27,7 +28,7 @@ func WFAlign(s1 string, s2 string, penalties Penalty, doCIGAR bool) Result {
} }
CIGAR := "" CIGAR := ""
if doCIGAR { if doCIGAR { // if doCIGAR, then perform backtrace, otherwise just return the score
CIGAR = WFBacktrace(M, I, D, score, penalties, A_k, A_offset, s1, s2) CIGAR = WFBacktrace(M, I, D, score, penalties, A_k, A_offset, s1, s2)
} }
@@ -39,23 +40,24 @@ func WFAlign(s1 string, s2 string, penalties Penalty, doCIGAR bool) Result {
func WFExtend(M *WavefrontComponent, s1 string, n int, s2 string, m int, score int) { func WFExtend(M *WavefrontComponent, s1 string, n int, s2 string, m int, score int) {
_, lo, hi := M.GetLoHi(score) _, lo, hi := M.GetLoHi(score)
for k := lo; k <= hi; k++ { for k := lo; k <= hi; k++ { // for each diagonal in current wavefront
// v = M[score][k] - k // v = M[score][k] - k
// h = M[score][k] // h = M[score][k]
ok, hu, _ := M.GetVal(score, k) ok, uh, tb := M.GetVal(score, k)
h := int(hu) // exit early if M_(s,l) is invalid
v := h - k
// exit early if v or h are invalid
if !ok { if !ok {
continue continue
} }
for v < n && h < m && s1[v] == s2[h] { h := int(uh)
_, val, tb := M.GetVal(score, k) v := h - k
M.SetVal(score, k, val+1, tb) // in the paper, we do v++, h++, M_(s,k)++
// however, note that h = M_(s,k) so instead we just do v++, h++ and set M_(s,k) at the end
// this saves a some memory reads and writes
for v < n && h < m && s1[v] == s2[h] { // extend diagonal for the next set of matches
v++ v++
h++ h++
} }
M.SetVal(score, k, uint64(h), tb)
} }
} }
@@ -63,14 +65,14 @@ func WFNext(M *WavefrontComponent, I *WavefrontComponent, D *WavefrontComponent,
// get this score's lo, hi // get this score's lo, hi
lo, hi := NextLoHi(M, I, D, score, penalties) lo, hi := NextLoHi(M, I, D, score, penalties)
for k := lo; k <= hi; k++ { for k := lo; k <= hi; k++ { // for each diagonal, extend the matrices for the next wavefronts
NextI(M, I, score, k, penalties) NextI(M, I, score, k, penalties)
NextD(M, D, score, k, penalties) NextD(M, D, score, k, penalties)
NextM(M, I, D, score, k, penalties) NextM(M, I, D, score, k, penalties)
} }
} }
func WFBacktrace(M *WavefrontComponent, I *WavefrontComponent, D *WavefrontComponent, score int, penalties Penalty, A_k int, A_offset uint32, s1 string, s2 string) string { func WFBacktrace(M *WavefrontComponent, I *WavefrontComponent, D *WavefrontComponent, score int, penalties Penalty, A_k int, A_offset uint64, s1 string, s2 string) string {
x := penalties.X x := penalties.X
o := penalties.O o := penalties.O
e := penalties.E e := penalties.E

View File

@@ -36,8 +36,8 @@ func randRange[T constraints.Integer](min, max int) T {
func TestWavefrontPacking(t *testing.T) { func TestWavefrontPacking(t *testing.T) {
for range 1000 { for range 1000 {
val := randRange[uint32](0, 1000) val := randRange[uint64](0, 1000)
tb := wfa.Traceback(randRange[uint32](0, 7)) tb := wfa.Traceback(randRange[uint64](0, 7))
v := wfa.PackWavefrontValue(val, tb) v := wfa.PackWavefrontValue(val, tb)
valid, gotVal, gotTB := wfa.UnpackWavefrontValue(v) valid, gotVal, gotTB := wfa.UnpackWavefrontValue(v)