Fix issue in WFBacktrace and change its output format to a proper run-length encoded CIGAR.

Add a test to ensure CIGAR correctness in the case of different (but equivalent) traceback results.
Add the DecodeCIGAR function to the exports.
This commit is contained in:
Arthur Lu 2024-11-07 19:01:01 +00:00
parent 3da3ddf10c
commit cde429cb80
6 changed files with 248 additions and 64 deletions

View File

@ -3,7 +3,7 @@
build: clean build: clean
@echo "======================== Building Binary =======================" @echo "======================== Building Binary ======================="
minify wfa.js > dist/wfa.js minify wfa.js > dist/wfa.js
GOOS=js GOARCH=wasm CGO_ENABLED=0 tinygo build -no-debug -opt=2 -target=wasm -o dist/wfa.wasm . GOOS=js GOARCH=wasm CGO_ENABLED=0 tinygo build -panic=trap -no-debug -opt=2 -target=wasm -o dist/wfa.wasm .
clean: clean:
@echo "======================== Cleaning Project ======================" @echo "======================== Cleaning Project ======================"

19
main.go
View File

@ -9,6 +9,7 @@ import (
func main() { func main() {
c := make(chan bool) c := make(chan bool)
js.Global().Set("wfAlign", js.FuncOf(wfAlign)) js.Global().Set("wfAlign", js.FuncOf(wfAlign))
js.Global().Set("DecodeCIGAR", js.FuncOf(DecodeCIGAR))
<-c <-c
} }
@ -70,3 +71,21 @@ func wfAlign(this js.Value, args []js.Value) interface{} {
return js.ValueOf(resultMap) return js.ValueOf(resultMap)
} }
func DecodeCIGAR(this js.Value, args []js.Value) interface{} {
if len(args) != 1 {
fmt.Println("invalid number of args, requires 1: CIGAR")
return nil
}
if args[0].Type() != js.TypeString {
fmt.Println("s1 should be a string")
return nil
}
CIGAR := args[0].String()
decoded := wfa.RunLengthDecode(CIGAR)
return js.ValueOf(decoded)
}

View File

@ -39,10 +39,10 @@ func PackWavefrontValue(value uint32, traceback Traceback) WavefrontValue {
} }
// UnpackWavefrontValue: opens a WavefrontValue into a valid bool, diag value and traceback // UnpackWavefrontValue: opens a WavefrontValue into a valid bool, diag value and traceback
func UnpackWavefrontValue(wf WavefrontValue) (bool, uint32, Traceback) { func UnpackWavefrontValue(wfv WavefrontValue) (bool, uint32, Traceback) {
valueBM := uint32(wf & 0x0FFF_FFFF) valueBM := uint32(wfv & 0x0FFF_FFFF)
tracebackBM := uint8(wf & 0x7000_0000 >> 28) tracebackBM := uint8(wfv & 0x7000_0000 >> 28)
validBM := wf&0x8000_0000 != 0 validBM := wfv&0x8000_0000 != 0
return validBM, valueBM, Traceback(tracebackBM) return validBM, valueBM, Traceback(tracebackBM)
} }

View File

@ -2,11 +2,55 @@ package wfa
import ( import (
"math" "math"
"unicode/utf8" "strings"
"golang.org/x/exp/constraints" "golang.org/x/exp/constraints"
) )
// UIntToString returns the base-10 representation of num.
//
// Zero is rendered as "0"; previously it produced an empty string, which
// made a zero-length run come out as a bare op character with no count.
func UIntToString(num uint) string {
	if num == 0 {
		return "0"
	}

	// 20 digits is enough for the largest 64-bit unsigned value.
	var digits [20]byte
	pos := len(digits)

	// Fill the buffer from the right so no reversal pass is needed.
	for num > 0 {
		pos--
		digits[pos] = byte('0' + num%10)
		num /= 10
	}

	return string(digits[pos:])
}
// RunLengthDecode expands a run-length encoded string such as "3M1X" into
// its repeated form ("MMMX").
//
// Each run is a sequence of ASCII digits followed by one byte to repeat.
// A byte that is not preceded by any digits is repeated zero times, and
// digits at the very end with no following byte are ignored.
func RunLengthDecode(encoded string) string {
	var out strings.Builder
	count := 0

	for pos := 0; pos < len(encoded); pos++ {
		c := encoded[pos]

		// Digits accumulate into the pending repeat count.
		if c >= '0' && c <= '9' {
			count = count*10 + int(c-'0')
			continue
		}

		// Any other byte closes the run: emit it count times.
		for ; count > 0; count-- {
			out.WriteByte(c)
		}
		// Start the next run from zero.
		count = 0
	}

	return out.String()
}
func SafeMin[T constraints.Integer](values []T, idx int) T { func SafeMin[T constraints.Integer](values []T, idx int) T {
return values[idx] return values[idx]
} }
@ -51,21 +95,6 @@ func SafeArgMin[T constraints.Integer](valids []bool, values []T) (bool, int) {
} }
} }
func Reverse(s string) string {
size := len(s)
buf := make([]byte, size)
for start := 0; start < size; {
r, n := utf8.DecodeRuneInString(s[start:])
start += n
utf8.EncodeRune(buf[size-start:], r)
}
return string(buf)
}
func Splice(s string, c rune, idx int) string {
return s[:idx] + string(c) + s[idx:]
}
func NextLoHi(M WavefrontComponent, I WavefrontComponent, D WavefrontComponent, score int, penalties Penalty) (int, int) { func NextLoHi(M WavefrontComponent, I WavefrontComponent, D WavefrontComponent, score int, penalties Penalty) (int, int) {
x := penalties.X x := penalties.X
o := penalties.O o := penalties.O
@ -117,11 +146,8 @@ func NextD(M WavefrontComponent, D WavefrontComponent, score int, k int, penalti
a_ok, a, _ := M.GetVal(score-o-e, k+1) a_ok, a, _ := M.GetVal(score-o-e, k+1)
b_ok, b, _ := D.GetVal(score-e, k+1) b_ok, b, _ := D.GetVal(score-e, k+1)
ok, nextDTraceback := SafeArgMax( ok, nextDTraceback := SafeArgMax([]bool{a_ok, b_ok}, []uint32{a, b})
[]bool{a_ok, b_ok}, nextDVal := SafeMax([]uint32{a, b}, nextDTraceback)
[]uint32{a, b},
)
nextDVal := SafeMax([]uint32{a, b}, nextDTraceback) // nothing special
if ok { if ok {
D.SetVal(score, k, nextDVal, []Traceback{OpenDel, ExtdDel}[nextDTraceback]) D.SetVal(score, k, nextDVal, []Traceback{OpenDel, ExtdDel}[nextDTraceback])
} }
@ -137,7 +163,6 @@ func NextM(M WavefrontComponent, I WavefrontComponent, D WavefrontComponent, sco
ok, nextMTraceback := SafeArgMax([]bool{a_ok, b_ok, c_ok}, []uint32{a, b, c}) ok, nextMTraceback := SafeArgMax([]bool{a_ok, b_ok, c_ok}, []uint32{a, b, c})
nextMVal := SafeMax([]uint32{a, b, c}, nextMTraceback) nextMVal := SafeMax([]uint32{a, b, c}, nextMTraceback)
if ok { if ok {
M.SetVal(score, k, nextMVal, []Traceback{Sub, Ins, Del}[nextMTraceback]) M.SetVal(score, k, nextMVal, []Traceback{Sub, Ins, Del}[nextMTraceback])
} }

View File

@ -1,5 +1,9 @@
package wfa package wfa
import (
"strings"
)
func WFAlign(s1 string, s2 string, penalties Penalty, doCIGAR bool) Result { func WFAlign(s1 string, s2 string, penalties Penalty, doCIGAR bool) Result {
n := len(s1) n := len(s1)
m := len(s2) m := len(s2)
@ -25,7 +29,7 @@ func WFAlign(s1 string, s2 string, penalties Penalty, doCIGAR bool) Result {
CIGAR := "" CIGAR := ""
if doCIGAR { if doCIGAR {
CIGAR = WFBacktrace(M, I, D, score, penalties, A_k, s1, s2) CIGAR = WFBacktrace(M, I, D, score, penalties, A_k, A_offset, s1, s2)
} }
return Result{ return Result{
@ -67,76 +71,128 @@ func WFNext(M WavefrontComponent, I WavefrontComponent, D WavefrontComponent, sc
} }
} }
func WFBacktrace(M WavefrontComponent, I WavefrontComponent, D WavefrontComponent, score int, penalties Penalty, A_k int, s1 string, s2 string) string { func WFBacktrace(M WavefrontComponent, I WavefrontComponent, D WavefrontComponent, score int, penalties Penalty, A_k int, A_offset uint32, s1 string, s2 string) string {
traceback_CIGAR := []string{"I", "I", "D", "D", "X", "", "", ""}
x := penalties.X x := penalties.X
o := penalties.O o := penalties.O
e := penalties.E e := penalties.E
CIGAR_rev := ""
tb_s := score tb_s := score
tb_k := A_k tb_k := A_k
_, _, current_traceback := M.GetVal(tb_s, tb_k)
done := false done := false
_, current_dist, current_traceback := M.GetVal(tb_s, tb_k)
Ops := []rune{'~'}
Counts := []uint{0}
idx := 0
for !done { for !done {
CIGAR_rev = CIGAR_rev + traceback_CIGAR[current_traceback]
switch current_traceback { switch current_traceback {
case OpenIns: case OpenIns:
if Ops[idx] == 'I' {
Counts[idx]++
} else {
Ops = append(Ops, 'I')
Counts = append(Counts, 1)
idx++
}
tb_s = tb_s - o - e tb_s = tb_s - o - e
tb_k = tb_k - 1 tb_k = tb_k - 1
_, _, current_traceback = M.GetVal(tb_s, tb_k) _, current_dist, current_traceback = M.GetVal(tb_s, tb_k)
case ExtdIns: case ExtdIns:
if Ops[idx] == 'I' {
Counts[idx]++
} else {
Ops = append(Ops, 'I')
Counts = append(Counts, 1)
idx++
}
tb_s = tb_s - e tb_s = tb_s - e
tb_k = tb_k - 1 tb_k = tb_k - 1
_, _, current_traceback = I.GetVal(tb_s, tb_k) _, current_dist, current_traceback = I.GetVal(tb_s, tb_k)
case OpenDel: case OpenDel:
if Ops[idx] == 'D' {
Counts[idx]++
} else {
Ops = append(Ops, 'D')
Counts = append(Counts, 1)
idx++
}
tb_s = tb_s - o - e tb_s = tb_s - o - e
tb_k = tb_k + 1 tb_k = tb_k + 1
_, _, current_traceback = M.GetVal(tb_s, tb_k) _, current_dist, current_traceback = M.GetVal(tb_s, tb_k)
case ExtdDel: case ExtdDel:
if Ops[idx] == 'D' {
Counts[idx]++
} else {
Ops = append(Ops, 'D')
Counts = append(Counts, 1)
idx++
}
tb_s = tb_s - e tb_s = tb_s - e
tb_k = tb_k + 1 tb_k = tb_k + 1
_, _, current_traceback = D.GetVal(tb_s, tb_k) _, current_dist, current_traceback = D.GetVal(tb_s, tb_k)
case Sub: case Sub:
tb_s = tb_s - x tb_s = tb_s - x
// tb_k = tb_k; // tb_k = tb_k;
_, _, current_traceback = M.GetVal(tb_s, tb_k) _, next_dist, next_traceback := M.GetVal(tb_s, tb_k)
if int(current_dist-next_dist)-1 > 0 {
Ops = append(Ops, 'M')
Counts = append(Counts, uint(current_dist-next_dist)-1)
idx++
}
if Ops[idx] == 'X' {
Counts[idx]++
} else {
Ops = append(Ops, 'X')
Counts = append(Counts, 1)
idx++
}
current_dist = next_dist
current_traceback = next_traceback
case Ins: case Ins:
// tb_s = tb_s; // tb_s = tb_s;
// tb_k = tb_k; // tb_k = tb_k;
_, _, current_traceback = I.GetVal(tb_s, tb_k) _, next_dist, next_traceback := I.GetVal(tb_s, tb_k)
Ops = append(Ops, 'M')
Counts = append(Counts, uint(current_dist-next_dist))
idx++
current_dist = next_dist
current_traceback = next_traceback
case Del: case Del:
// tb_s = tb_s; // tb_s = tb_s;
// tb_k = tb_k; // tb_k = tb_k;
_, _, current_traceback = D.GetVal(tb_s, tb_k) _, next_dist, next_traceback := D.GetVal(tb_s, tb_k)
Ops = append(Ops, 'M')
Counts = append(Counts, uint(current_dist-next_dist))
idx++
current_dist = next_dist
current_traceback = next_traceback
case End: case End:
Ops = append(Ops, 'M')
Counts = append(Counts, uint(current_dist))
idx++
done = true done = true
} }
} }
CIGAR_part := Reverse(CIGAR_rev) CIGAR := strings.Builder{}
c := 0 for i := len(Ops) - 1; i > 0; i-- {
i := 0 CIGAR.WriteString(UIntToString(Counts[i]))
j := 0 CIGAR.WriteRune(Ops[i])
for i < len(s1) && j < len(s2) {
if s1[i] == s2[j] {
//CIGAR_part.splice(c, 0, "M")
CIGAR_part = Splice(CIGAR_part, 'M', c)
c++
i++
j++
} else if CIGAR_part[c] == 'X' {
c++
i++
j++
} else if CIGAR_part[c] == 'I' {
c++
j++
} else if CIGAR_part[c] == 'D' {
c++
i++
}
} }
return CIGAR_part return CIGAR.String()
} }

View File

@ -3,6 +3,7 @@ package tests
import ( import (
"bufio" "bufio"
"encoding/json" "encoding/json"
"log"
"math/rand/v2" "math/rand/v2"
"os" "os"
"strconv" "strconv"
@ -46,6 +47,71 @@ func TestWavefrontPacking(t *testing.T) {
} }
} }
// GetScoreFromCIGAR recomputes an alignment score from a run-length encoded
// CIGAR string using the given penalties.
//
// Every 'M' adds penalties.M and every 'X' adds penalties.X. The first op of
// an 'I' or 'D' run adds penalties.O + penalties.E; each further op of the
// same kind directly after it adds penalties.E. Other characters add nothing.
func GetScoreFromCIGAR(CIGAR string, penalties wfa.Penalty) int {
	total := 0
	last := '~' // sentinel that never equals a real op

	for _, op := range wfa.RunLengthDecode(CIGAR) {
		switch op {
		case 'M':
			total = total + penalties.M
		case 'X':
			total = total + penalties.X
		case 'I', 'D':
			if op == last {
				// Continuing an existing gap of the same kind.
				total = total + penalties.E
			} else {
				// Opening a new gap.
				total = total + penalties.O + penalties.E
			}
		}
		last = op
	}

	return total
}
// CheckCIGARCorrectness reports whether a run-length encoded CIGAR consumes
// exactly all of s1 and all of s2.
//
// 'M' and 'X' consume one byte from each sequence, 'I' consumes one byte of
// s2 only, and 'D' consumes one byte of s1 only. Other characters are
// skipped. If the CIGAR would read past the end of either sequence, or does
// not end exactly at both ends, the partial alignment is logged and false is
// returned. Previously an over-long CIGAR caused an index-out-of-range panic.
func CheckCIGARCorrectness(s1 string, s2 string, CIGAR string) bool {
	unpackedCIGAR := wfa.RunLengthDecode(CIGAR)
	i, j := 0, 0

	var s1Aligned, alignment, s2Aligned strings.Builder

	inBounds := true
walk:
	for c := 0; c < len(unpackedCIGAR); c++ {
		switch op := unpackedCIGAR[c]; op {
		case 'M', 'X':
			if i >= len(s1) || j >= len(s2) {
				inBounds = false
				break walk
			}
			s1Aligned.WriteByte(s1[i])
			if op == 'M' {
				alignment.WriteRune('|')
			} else {
				alignment.WriteRune(' ')
			}
			s2Aligned.WriteByte(s2[j])
			i++
			j++
		case 'I':
			if j >= len(s2) {
				inBounds = false
				break walk
			}
			s1Aligned.WriteRune('-')
			alignment.WriteRune(' ')
			s2Aligned.WriteByte(s2[j])
			j++
		case 'D':
			if i >= len(s1) {
				inBounds = false
				break walk
			}
			s1Aligned.WriteByte(s1[i])
			// A gap column is not a match, so it gets a blank marker
			// (this used to print '|', which was misleading in the log).
			alignment.WriteRune(' ')
			s2Aligned.WriteRune('-')
			i++
		}
	}

	if inBounds && i == len(s1) && j == len(s2) {
		return true
	}

	log.Printf("\n%s\n%s\n%s\n i=%d, j=%d, |s1|=%d, |s2|=%d\n", s1Aligned.String(), alignment.String(), s2Aligned.String(), i, j, len(s1), len(s2))
	return false
}
func TestWFA(t *testing.T) { func TestWFA(t *testing.T) {
content, _ := os.ReadFile(testJsonPath) content, _ := os.ReadFile(testJsonPath)
@ -73,7 +139,9 @@ func TestWFA(t *testing.T) {
for solutions.Scan() { for solutions.Scan() {
solution := solutions.Text() solution := solutions.Text()
expectedScore, _ := strconv.Atoi(strings.Split(solution, "\t")[0]) expectedScore, _ := strconv.Atoi(strings.Split(solution, "\t")[0])
expectedCIGAR := strings.Split(solution, "\t")[1]
sequences.Scan() sequences.Scan()
s1 := sequences.Text() s1 := sequences.Text()
@ -85,9 +153,25 @@ func TestWFA(t *testing.T) {
x := wfa.WFAlign(s1, s2, testPenalties, true) x := wfa.WFAlign(s1, s2, testPenalties, true)
gotScore := x.Score gotScore := x.Score
gotCIGAR := x.CIGAR
if gotScore != -1*expectedScore { if gotScore != -1*expectedScore {
t.Errorf(`test: %s#%d, s1: %s, s2: %s, got: %d, expected: %d\n`, testName, idx, s1, s2, gotScore, expectedScore) t.Errorf(`test: %s#%d, s1: %s, s2: %s, got: %d, expected: %d`, testName, idx, s1, s2, gotScore, expectedScore)
os.Exit(1)
}
if gotCIGAR != expectedCIGAR {
checkScore := GetScoreFromCIGAR(gotCIGAR, testPenalties)
CIGARCorrectness := CheckCIGARCorrectness(s1, s2, gotCIGAR)
if checkScore != gotScore && checkScore != -1*expectedScore { // nonequivalent alignment
t.Errorf(`test: %s#%d, s1: %s, s2: %s, got: [%s], expected: [%s]`, testName, idx, s1, s2, gotCIGAR, expectedCIGAR)
t.Errorf(`test: %s#%d, recalculated score: %d`, testName, idx, checkScore)
os.Exit(1)
}
if !CIGARCorrectness {
t.Errorf(`test: %s#%d, s1: %s, s2: %s, got: [%s], expected: [%s]`, testName, idx, s1, s2, gotCIGAR, expectedCIGAR)
os.Exit(1)
}
} }
idx++ idx++