3 Commits

Author SHA1 Message Date
63a375995c minor code cleanup, use uint64 for wavefront values 2025-05-07 21:20:13 +00:00
2351faf2d7 cleanup code 2025-05-01 18:04:42 +00:00
a446bbd923 fix readme,
update go mod
2025-03-12 00:09:18 +00:00
8 changed files with 80 additions and 81 deletions

View File

@@ -1,4 +1,4 @@
.PHONY: build clean test
.PHONY: build clean test dev-init
build: clean
@echo "======================== Building Binary ======================="
@@ -24,3 +24,7 @@ test:
@rm -f mem.prof
@rm -f test.test
dev-init:
apt install minify
go get -t wfa/test

View File

@@ -1,6 +1,6 @@
# Using WFA-JS
Download `wfa.js` and `wfa.wasm`from [releases](https://git.tronnet.net/tronnet/WFA-JS/releases) to your project. Add to your script:
Download `wfa.js` and `wfa.wasm`from [releases](https://git.tronnet.net/alu/WFA-JS/releases) to your project. Add to your script:
```
import wfa from "./wfa.js"

2
go.mod
View File

@@ -1,6 +1,6 @@
module wfa
go 1.23.2
go 1.23.6
require (
github.com/schollz/progressbar/v3 v3.17.1

View File

@@ -6,38 +6,31 @@ type PositiveSlice[T any] struct {
defaultValue T
}
func (a *PositiveSlice[T]) TranslateIndex(idx int) int {
return idx
}
func (a *PositiveSlice[T]) Valid(idx int) bool {
actualIdx := a.TranslateIndex(idx)
return 0 <= actualIdx && actualIdx < len(a.valid) && a.valid[actualIdx]
return 0 <= idx && idx < len(a.valid) && a.valid[idx]
}
func (a *PositiveSlice[T]) Get(idx int) T {
actualIdx := a.TranslateIndex(idx)
if 0 <= actualIdx && actualIdx < len(a.valid) && a.valid[actualIdx] { // idx is in the slice
return a.data[actualIdx]
if 0 <= idx && idx < len(a.valid) && a.valid[idx] { // idx is in the slice
return a.data[idx]
} else { // idx is out of the slice
return a.defaultValue
}
}
func (a *PositiveSlice[T]) Set(idx int, value T) {
actualIdx := a.TranslateIndex(idx)
if actualIdx < 0 || actualIdx >= len(a.valid) { // idx is outside the slice
// expand data array to actualIdx
newData := make([]T, 2*actualIdx+1)
if idx >= len(a.valid) { // idx is outside the slice
// expand data array to 2*idx
newData := make([]T, 2*idx+1)
copy(newData, a.data)
a.data = newData
// expand valid array to actualIdx
newValid := make([]bool, 2*actualIdx+1)
// expand valid array to 2*idx
newValid := make([]bool, 2*idx+1)
copy(newValid, a.valid)
a.valid = newValid
}
a.data[actualIdx] = value
a.valid[actualIdx] = true
a.data[idx] = value
a.valid[idx] = true
}

View File

@@ -40,25 +40,22 @@ func UnpackWavefrontLoHi(lohi WavefrontLoHi) (int, int) {
return loBM, hiBM
}
// bitpacked wavefront values with 1 valid bit, 3 traceback bits, and 28 bits for the diag distance
// technically this restricts to alignments with less than 268 million characters but that should be sufficient for most cases
type WavefrontValue uint32
// TODO: add 64 bit packed value in case more than 268 million characters are needed
// bitpacked wavefront values with 1 valid bit, 3 traceback bits, and 60 bits for the diag distance
type WavefrontValue uint64
// PackWavefrontValue: packs a diag value and traceback into a WavefrontValue
func PackWavefrontValue(value uint32, traceback Traceback) WavefrontValue {
validBM := uint32(0x8000_0000)
tracebackBM := uint32(traceback&0x0000_0007) << 28
valueBM := value & 0x0FFF_FFFF
func PackWavefrontValue(value uint64, traceback Traceback) WavefrontValue {
validBM := uint64(0x8000_0000_0000_0000)
tracebackBM := uint64(traceback&0x0000_0007) << 60
valueBM := uint64(value) & 0x0FFF_FFFF_FFFF_FFFF
return WavefrontValue(validBM | tracebackBM | valueBM)
}
// UnpackWavefrontValue: opens a WavefrontValue into a valid bool, diag value and traceback
func UnpackWavefrontValue(wfv WavefrontValue) (bool, uint32, Traceback) {
validBM := wfv&0x8000_0000 != 0
tracebackBM := uint8(wfv & 0x7000_0000 >> 28)
valueBM := uint32(wfv & 0x0FFF_FFFF)
func UnpackWavefrontValue(wfv WavefrontValue) (bool, uint64, Traceback) {
validBM := wfv&0x8000_0000_0000_0000 != 0
tracebackBM := uint8(wfv & 0x7000_0000_0000_0000 >> 60)
valueBM := uint64(wfv & 0x0000_0000_FFFF_FFFF)
return validBM, valueBM, Traceback(tracebackBM)
}
@@ -71,12 +68,9 @@ type Wavefront struct { // since wavefronts store diag distance, they should nev
// NewWavefront: returns a new wavefront with size accomodating lo and hi (inclusive)
func NewWavefront(lo int, hi int) *Wavefront {
a := &Wavefront{}
a.lohi = PackWavefrontLoHi(lo, hi)
size := hi - lo
newData := make([]WavefrontValue, size+1)
a.data = newData
a.data = make([]WavefrontValue, size+1)
return a
}
@@ -134,12 +128,12 @@ func NewWavefrontComponent() *WavefrontComponent {
}
// GetVal: get value for wavefront=score, diag=k => returns ok, value, traceback
func (w *WavefrontComponent) GetVal(score int, k int) (bool, uint32, Traceback) {
func (w *WavefrontComponent) GetVal(score int, k int) (bool, uint64, Traceback) {
return UnpackWavefrontValue(w.W.Get(score).Get(k))
}
// SetVal: set value, traceback for wavefront=score, diag=k
func (w *WavefrontComponent) SetVal(score int, k int, val uint32, tb Traceback) {
func (w *WavefrontComponent) SetVal(score int, k int, val uint64, tb Traceback) {
w.W.Get(score).Set(k, PackWavefrontValue(val, tb))
}

View File

@@ -7,6 +7,7 @@ import (
"golang.org/x/exp/constraints"
)
// convert an unsigned into to string
func UIntToString(num uint) string { // num assumed to be positive
var builder strings.Builder
@@ -25,6 +26,7 @@ func UIntToString(num uint) string { // num assumed to be positive
return string(str)
}
// decode runlength encoded string such as CIGARs
func RunLengthDecode(encoded string) string {
decoded := strings.Builder{}
length := len(encoded)
@@ -51,32 +53,17 @@ func RunLengthDecode(encoded string) string {
return decoded.String()
}
// given the min index, return the item in values at that index
func SafeMin[T constraints.Integer](values []T, idx int) T {
return values[idx]
}
// given the max index, return the item in values at that index
func SafeMax[T constraints.Integer](values []T, idx int) T {
return values[idx]
}
func SafeArgMax[T constraints.Integer](valids []bool, values []T) (bool, int) {
hasValid := false
maxIndex := 0
maxValue := math.MinInt
for i := 0; i < len(valids); i++ {
if valids[i] && int(values[i]) > maxValue {
hasValid = true
maxIndex = i
maxValue = int(values[i])
}
}
if hasValid {
return true, maxIndex
} else {
return false, 0
}
}
// given array of values and corresponding array of valid flags, find the min of value which is valid or return false if there does not exist any
func SafeArgMin[T constraints.Integer](valids []bool, values []T) (bool, int) {
hasValid := false
minIndex := 0
@@ -95,6 +82,22 @@ func SafeArgMin[T constraints.Integer](valids []bool, values []T) (bool, int) {
}
}
// given array of values and corresponding array of valid flags, find the max of value which is valid or return false if there does not exist any
func SafeArgMax[T constraints.Integer](valids []bool, values []T) (bool, int) {
hasValid := false
maxIndex := 0
maxValue := math.MinInt
for i := range valids {
if valids[i] && int(values[i]) > maxValue {
hasValid = true
maxIndex = i
maxValue = int(values[i])
}
}
return hasValid, maxIndex
}
// set the lext lo and hi bounds for wavefronts M, I, D
func NextLoHi(M *WavefrontComponent, I *WavefrontComponent, D *WavefrontComponent, score int, penalties Penalty) (int, int) {
x := penalties.X
o := penalties.O
@@ -125,6 +128,7 @@ func NextLoHi(M *WavefrontComponent, I *WavefrontComponent, D *WavefrontComponen
return lo, hi
}
// set the traceback and diag value for the next I wavefront
func NextI(M *WavefrontComponent, I *WavefrontComponent, score int, k int, penalties Penalty) {
o := penalties.O
e := penalties.E
@@ -132,13 +136,14 @@ func NextI(M *WavefrontComponent, I *WavefrontComponent, score int, k int, penal
a_ok, a, _ := M.GetVal(score-o-e, k-1)
b_ok, b, _ := I.GetVal(score-e, k-1)
ok, nextITraceback := SafeArgMax([]bool{a_ok, b_ok}, []uint32{a, b})
nextIVal := SafeMax([]uint32{a, b}, nextITraceback) + 1 // important that the +1 is here
ok, nextITraceback := SafeArgMax([]bool{a_ok, b_ok}, []uint64{a, b})
nextIVal := SafeMax([]uint64{a, b}, nextITraceback) + 1 // important that the +1 is here
if ok {
I.SetVal(score, k, nextIVal, []Traceback{OpenIns, ExtdIns}[nextITraceback])
}
}
// set the traceback and diag value for the next D wavefront
func NextD(M *WavefrontComponent, D *WavefrontComponent, score int, k int, penalties Penalty) {
o := penalties.O
e := penalties.E
@@ -146,13 +151,14 @@ func NextD(M *WavefrontComponent, D *WavefrontComponent, score int, k int, penal
a_ok, a, _ := M.GetVal(score-o-e, k+1)
b_ok, b, _ := D.GetVal(score-e, k+1)
ok, nextDTraceback := SafeArgMax([]bool{a_ok, b_ok}, []uint32{a, b})
nextDVal := SafeMax([]uint32{a, b}, nextDTraceback)
ok, nextDTraceback := SafeArgMax([]bool{a_ok, b_ok}, []uint64{a, b})
nextDVal := SafeMax([]uint64{a, b}, nextDTraceback)
if ok {
D.SetVal(score, k, nextDVal, []Traceback{OpenDel, ExtdDel}[nextDTraceback])
}
}
// set the traceback and diag value for the next M wavefront
func NextM(M *WavefrontComponent, I *WavefrontComponent, D *WavefrontComponent, score int, k int, penalties Penalty) {
x := penalties.X
@@ -161,8 +167,8 @@ func NextM(M *WavefrontComponent, I *WavefrontComponent, D *WavefrontComponent,
b_ok, b, _ := I.GetVal(score, k)
c_ok, c, _ := D.GetVal(score, k)
ok, nextMTraceback := SafeArgMax([]bool{a_ok, b_ok, c_ok}, []uint32{a, b, c})
nextMVal := SafeMax([]uint32{a, b, c}, nextMTraceback)
ok, nextMTraceback := SafeArgMax([]bool{a_ok, b_ok, c_ok}, []uint64{a, b, c})
nextMVal := SafeMax([]uint64{a, b, c}, nextMTraceback)
if ok {
M.SetVal(score, k, nextMVal, []Traceback{Sub, Ins, Del}[nextMTraceback])
}

View File

@@ -4,11 +4,12 @@ import (
"strings"
)
// WFAlign takes strings s1, s2, penalties, and returns the score and CIGAR if doCIGAR is true
func WFAlign(s1 string, s2 string, penalties Penalty, doCIGAR bool) Result {
n := len(s1)
m := len(s2)
A_k := m - n
A_offset := uint32(m)
A_k := m - n // diagonal where both sequences end
A_offset := uint64(m) // offset along a_k diagonal corresponding to end
score := 0
M := NewWavefrontComponent()
M.SetLoHi(0, 0, 0)
@@ -19,7 +20,7 @@ func WFAlign(s1 string, s2 string, penalties Penalty, doCIGAR bool) Result {
for {
WFExtend(M, s1, n, s2, m, score)
ok, val, _ := M.GetVal(score, A_k)
if ok && val >= A_offset {
if ok && val >= A_offset { // exit when M_(s,a_k) >= A_offset, ie the wavefront has reached the end
break
}
score = score + 1
@@ -27,7 +28,7 @@ func WFAlign(s1 string, s2 string, penalties Penalty, doCIGAR bool) Result {
}
CIGAR := ""
if doCIGAR {
if doCIGAR { // if doCIGAR, then perform backtrace, otherwise just return the score
CIGAR = WFBacktrace(M, I, D, score, penalties, A_k, A_offset, s1, s2)
}
@@ -39,23 +40,24 @@ func WFAlign(s1 string, s2 string, penalties Penalty, doCIGAR bool) Result {
func WFExtend(M *WavefrontComponent, s1 string, n int, s2 string, m int, score int) {
_, lo, hi := M.GetLoHi(score)
for k := lo; k <= hi; k++ {
for k := lo; k <= hi; k++ { // for each diagonal in current wavefront
// v = M[score][k] - k
// h = M[score][k]
ok, hu, _ := M.GetVal(score, k)
h := int(hu)
v := h - k
// exit early if v or h are invalid
ok, uh, tb := M.GetVal(score, k)
// exit early if M_(s,l) is invalid
if !ok {
continue
}
for v < n && h < m && s1[v] == s2[h] {
_, val, tb := M.GetVal(score, k)
M.SetVal(score, k, val+1, tb)
h := int(uh)
v := h - k
// in the paper, we do v++, h++, M_(s,k)++
// however, note that h = M_(s,k) so instead we just do v++, h++ and set M_(s,k) at the end
// this saves a some memory reads and writes
for v < n && h < m && s1[v] == s2[h] { // extend diagonal for the next set of matches
v++
h++
}
M.SetVal(score, k, uint64(h), tb)
}
}
@@ -63,14 +65,14 @@ func WFNext(M *WavefrontComponent, I *WavefrontComponent, D *WavefrontComponent,
// get this score's lo, hi
lo, hi := NextLoHi(M, I, D, score, penalties)
for k := lo; k <= hi; k++ {
for k := lo; k <= hi; k++ { // for each diagonal, extend the matrices for the next wavefronts
NextI(M, I, score, k, penalties)
NextD(M, D, score, k, penalties)
NextM(M, I, D, score, k, penalties)
}
}
func WFBacktrace(M *WavefrontComponent, I *WavefrontComponent, D *WavefrontComponent, score int, penalties Penalty, A_k int, A_offset uint32, s1 string, s2 string) string {
func WFBacktrace(M *WavefrontComponent, I *WavefrontComponent, D *WavefrontComponent, score int, penalties Penalty, A_k int, A_offset uint64, s1 string, s2 string) string {
x := penalties.X
o := penalties.O
e := penalties.E

View File

@@ -36,8 +36,8 @@ func randRange[T constraints.Integer](min, max int) T {
func TestWavefrontPacking(t *testing.T) {
for range 1000 {
val := randRange[uint32](0, 1000)
tb := wfa.Traceback(randRange[uint32](0, 7))
val := randRange[uint64](0, 1000)
tb := wfa.Traceback(randRange[uint64](0, 7))
v := wfa.PackWavefrontValue(val, tb)
valid, gotVal, gotTB := wfa.UnpackWavefrontValue(v)