fix issue in WFBacktrace and change format to proper CIGAR,

add test to ensure CIGAR correctness in the case of different traceback results,
add DecodeCIGAR function to exports
This commit is contained in:
Arthur Lu 2024-11-07 19:01:01 +00:00
parent 3da3ddf10c
commit cde429cb80
6 changed files with 248 additions and 64 deletions

View File

@ -3,7 +3,7 @@
build: clean
@echo "======================== Building Binary ======================="
minify wfa.js > dist/wfa.js
GOOS=js GOARCH=wasm CGO_ENABLED=0 tinygo build -no-debug -opt=2 -target=wasm -o dist/wfa.wasm .
GOOS=js GOARCH=wasm CGO_ENABLED=0 tinygo build -panic=trap -no-debug -opt=2 -target=wasm -o dist/wfa.wasm .
clean:
@echo "======================== Cleaning Project ======================"

19
main.go
View File

@ -9,6 +9,7 @@ import (
func main() {
c := make(chan bool)
js.Global().Set("wfAlign", js.FuncOf(wfAlign))
js.Global().Set("DecodeCIGAR", js.FuncOf(DecodeCIGAR))
<-c
}
@ -70,3 +71,21 @@ func wfAlign(this js.Value, args []js.Value) interface{} {
return js.ValueOf(resultMap)
}
func DecodeCIGAR(this js.Value, args []js.Value) interface{} {
if len(args) != 1 {
fmt.Println("invalid number of args, requires 1: CIGAR")
return nil
}
if args[0].Type() != js.TypeString {
fmt.Println("s1 should be a string")
return nil
}
CIGAR := args[0].String()
decoded := wfa.RunLengthDecode(CIGAR)
return js.ValueOf(decoded)
}

View File

@ -39,10 +39,10 @@ func PackWavefrontValue(value uint32, traceback Traceback) WavefrontValue {
}
// UnpackWavefrontValue: opens a WavefrontValue into a valid bool, diag value and traceback
func UnpackWavefrontValue(wf WavefrontValue) (bool, uint32, Traceback) {
valueBM := uint32(wf & 0x0FFF_FFFF)
tracebackBM := uint8(wf & 0x7000_0000 >> 28)
validBM := wf&0x8000_0000 != 0
func UnpackWavefrontValue(wfv WavefrontValue) (bool, uint32, Traceback) {
valueBM := uint32(wfv & 0x0FFF_FFFF)
tracebackBM := uint8(wfv & 0x7000_0000 >> 28)
validBM := wfv&0x8000_0000 != 0
return validBM, valueBM, Traceback(tracebackBM)
}

View File

@ -2,11 +2,55 @@ package wfa
import (
"math"
"unicode/utf8"
"strings"
"golang.org/x/exp/constraints"
)
func UIntToString(num uint) string { // num assumed to be positive
var builder strings.Builder
for num > 0 {
digit := num % 10
builder.WriteRune(rune('0' + digit))
num /= 10
}
// Reverse the string as we built it in reverse order
str := []rune(builder.String())
for i, j := 0, len(str)-1; i < j; i, j = i+1, j-1 {
str[i], str[j] = str[j], str[i]
}
return string(str)
}
func RunLengthDecode(encoded string) string {
decoded := strings.Builder{}
length := len(encoded)
i := 0
for i < length {
// If the current character is a digit, we need to extract the run length
runLength := 0
for i < length && encoded[i] >= '0' && encoded[i] <= '9' {
runLength = runLength*10 + int(encoded[i]-'0')
i++
}
// The next character will be the character to repeat
if i < length {
char := encoded[i]
for j := 0; j < runLength; j++ {
decoded.WriteByte(char)
}
i++ // Move past the character
}
}
return decoded.String()
}
func SafeMin[T constraints.Integer](values []T, idx int) T {
return values[idx]
}
@ -51,21 +95,6 @@ func SafeArgMin[T constraints.Integer](valids []bool, values []T) (bool, int) {
}
}
func Reverse(s string) string {
size := len(s)
buf := make([]byte, size)
for start := 0; start < size; {
r, n := utf8.DecodeRuneInString(s[start:])
start += n
utf8.EncodeRune(buf[size-start:], r)
}
return string(buf)
}
func Splice(s string, c rune, idx int) string {
return s[:idx] + string(c) + s[idx:]
}
func NextLoHi(M WavefrontComponent, I WavefrontComponent, D WavefrontComponent, score int, penalties Penalty) (int, int) {
x := penalties.X
o := penalties.O
@ -117,11 +146,8 @@ func NextD(M WavefrontComponent, D WavefrontComponent, score int, k int, penalti
a_ok, a, _ := M.GetVal(score-o-e, k+1)
b_ok, b, _ := D.GetVal(score-e, k+1)
ok, nextDTraceback := SafeArgMax(
[]bool{a_ok, b_ok},
[]uint32{a, b},
)
nextDVal := SafeMax([]uint32{a, b}, nextDTraceback) // nothing special
ok, nextDTraceback := SafeArgMax([]bool{a_ok, b_ok}, []uint32{a, b})
nextDVal := SafeMax([]uint32{a, b}, nextDTraceback)
if ok {
D.SetVal(score, k, nextDVal, []Traceback{OpenDel, ExtdDel}[nextDTraceback])
}
@ -137,7 +163,6 @@ func NextM(M WavefrontComponent, I WavefrontComponent, D WavefrontComponent, sco
ok, nextMTraceback := SafeArgMax([]bool{a_ok, b_ok, c_ok}, []uint32{a, b, c})
nextMVal := SafeMax([]uint32{a, b, c}, nextMTraceback)
if ok {
M.SetVal(score, k, nextMVal, []Traceback{Sub, Ins, Del}[nextMTraceback])
}

View File

@ -1,5 +1,9 @@
package wfa
import (
"strings"
)
func WFAlign(s1 string, s2 string, penalties Penalty, doCIGAR bool) Result {
n := len(s1)
m := len(s2)
@ -25,7 +29,7 @@ func WFAlign(s1 string, s2 string, penalties Penalty, doCIGAR bool) Result {
CIGAR := ""
if doCIGAR {
CIGAR = WFBacktrace(M, I, D, score, penalties, A_k, s1, s2)
CIGAR = WFBacktrace(M, I, D, score, penalties, A_k, A_offset, s1, s2)
}
return Result{
@ -67,76 +71,128 @@ func WFNext(M WavefrontComponent, I WavefrontComponent, D WavefrontComponent, sc
}
}
func WFBacktrace(M WavefrontComponent, I WavefrontComponent, D WavefrontComponent, score int, penalties Penalty, A_k int, s1 string, s2 string) string {
traceback_CIGAR := []string{"I", "I", "D", "D", "X", "", "", ""}
func WFBacktrace(M WavefrontComponent, I WavefrontComponent, D WavefrontComponent, score int, penalties Penalty, A_k int, A_offset uint32, s1 string, s2 string) string {
x := penalties.X
o := penalties.O
e := penalties.E
CIGAR_rev := ""
tb_s := score
tb_k := A_k
_, _, current_traceback := M.GetVal(tb_s, tb_k)
done := false
_, current_dist, current_traceback := M.GetVal(tb_s, tb_k)
Ops := []rune{'~'}
Counts := []uint{0}
idx := 0
for !done {
CIGAR_rev = CIGAR_rev + traceback_CIGAR[current_traceback]
switch current_traceback {
case OpenIns:
if Ops[idx] == 'I' {
Counts[idx]++
} else {
Ops = append(Ops, 'I')
Counts = append(Counts, 1)
idx++
}
tb_s = tb_s - o - e
tb_k = tb_k - 1
_, _, current_traceback = M.GetVal(tb_s, tb_k)
_, current_dist, current_traceback = M.GetVal(tb_s, tb_k)
case ExtdIns:
if Ops[idx] == 'I' {
Counts[idx]++
} else {
Ops = append(Ops, 'I')
Counts = append(Counts, 1)
idx++
}
tb_s = tb_s - e
tb_k = tb_k - 1
_, _, current_traceback = I.GetVal(tb_s, tb_k)
_, current_dist, current_traceback = I.GetVal(tb_s, tb_k)
case OpenDel:
if Ops[idx] == 'D' {
Counts[idx]++
} else {
Ops = append(Ops, 'D')
Counts = append(Counts, 1)
idx++
}
tb_s = tb_s - o - e
tb_k = tb_k + 1
_, _, current_traceback = M.GetVal(tb_s, tb_k)
_, current_dist, current_traceback = M.GetVal(tb_s, tb_k)
case ExtdDel:
if Ops[idx] == 'D' {
Counts[idx]++
} else {
Ops = append(Ops, 'D')
Counts = append(Counts, 1)
idx++
}
tb_s = tb_s - e
tb_k = tb_k + 1
_, _, current_traceback = D.GetVal(tb_s, tb_k)
_, current_dist, current_traceback = D.GetVal(tb_s, tb_k)
case Sub:
tb_s = tb_s - x
// tb_k = tb_k;
_, _, current_traceback = M.GetVal(tb_s, tb_k)
_, next_dist, next_traceback := M.GetVal(tb_s, tb_k)
if int(current_dist-next_dist)-1 > 0 {
Ops = append(Ops, 'M')
Counts = append(Counts, uint(current_dist-next_dist)-1)
idx++
}
if Ops[idx] == 'X' {
Counts[idx]++
} else {
Ops = append(Ops, 'X')
Counts = append(Counts, 1)
idx++
}
current_dist = next_dist
current_traceback = next_traceback
case Ins:
// tb_s = tb_s;
// tb_k = tb_k;
_, _, current_traceback = I.GetVal(tb_s, tb_k)
_, next_dist, next_traceback := I.GetVal(tb_s, tb_k)
Ops = append(Ops, 'M')
Counts = append(Counts, uint(current_dist-next_dist))
idx++
current_dist = next_dist
current_traceback = next_traceback
case Del:
// tb_s = tb_s;
// tb_k = tb_k;
_, _, current_traceback = D.GetVal(tb_s, tb_k)
_, next_dist, next_traceback := D.GetVal(tb_s, tb_k)
Ops = append(Ops, 'M')
Counts = append(Counts, uint(current_dist-next_dist))
idx++
current_dist = next_dist
current_traceback = next_traceback
case End:
Ops = append(Ops, 'M')
Counts = append(Counts, uint(current_dist))
idx++
done = true
}
}
CIGAR_part := Reverse(CIGAR_rev)
c := 0
i := 0
j := 0
for i < len(s1) && j < len(s2) {
if s1[i] == s2[j] {
//CIGAR_part.splice(c, 0, "M")
CIGAR_part = Splice(CIGAR_part, 'M', c)
c++
i++
j++
} else if CIGAR_part[c] == 'X' {
c++
i++
j++
} else if CIGAR_part[c] == 'I' {
c++
j++
} else if CIGAR_part[c] == 'D' {
c++
i++
}
CIGAR := strings.Builder{}
for i := len(Ops) - 1; i > 0; i-- {
CIGAR.WriteString(UIntToString(Counts[i]))
CIGAR.WriteRune(Ops[i])
}
return CIGAR_part
return CIGAR.String()
}

View File

@ -3,6 +3,7 @@ package tests
import (
"bufio"
"encoding/json"
"log"
"math/rand/v2"
"os"
"strconv"
@ -46,6 +47,71 @@ func TestWavefrontPacking(t *testing.T) {
}
}
func GetScoreFromCIGAR(CIGAR string, penalties wfa.Penalty) int {
unpackedCIGAR := wfa.RunLengthDecode(CIGAR)
previousOp := '~'
score := 0
for _, Op := range unpackedCIGAR {
if Op == 'M' {
score = score + penalties.M
} else if Op == 'X' {
score = score + penalties.X
} else if (Op == 'I' && previousOp != 'I') || (Op == 'D' && previousOp != 'D') {
score = score + penalties.O + penalties.E
} else if (Op == 'I' && previousOp == 'I') || (Op == 'D' && previousOp == 'D') {
score = score + penalties.E
}
previousOp = Op
}
return score
}
func CheckCIGARCorrectness(s1 string, s2 string, CIGAR string) bool {
unpackedCIGAR := wfa.RunLengthDecode(CIGAR)
i := 0
j := 0
s1Aligned := strings.Builder{}
alignment := strings.Builder{}
s2Aligned := strings.Builder{}
for c := 0; c < len(unpackedCIGAR); c++ {
Op := unpackedCIGAR[c]
if Op == 'M' {
s1Aligned.WriteByte(s1[i])
alignment.WriteRune('|')
s2Aligned.WriteByte(s2[j])
i++
j++
} else if Op == 'X' {
s1Aligned.WriteByte(s1[i])
alignment.WriteRune(' ')
s2Aligned.WriteByte(s2[j])
i++
j++
} else if Op == 'I' {
s1Aligned.WriteRune('-')
alignment.WriteRune(' ')
s2Aligned.WriteByte(s2[j])
j++
} else if Op == 'D' {
s1Aligned.WriteByte(s1[i])
alignment.WriteRune('|')
s2Aligned.WriteRune('-')
i++
}
}
if i == len(s1) && j == len(s2) {
return true
} else {
log.Printf("\n%s\n%s\n%s\n i=%d, j=%d, |s1|=%d, |s2|=%d\n", s1Aligned.String(), alignment.String(), s2Aligned.String(), i, j, len(s1), len(s2))
return false
}
}
func TestWFA(t *testing.T) {
content, _ := os.ReadFile(testJsonPath)
@ -73,7 +139,9 @@ func TestWFA(t *testing.T) {
for solutions.Scan() {
solution := solutions.Text()
expectedScore, _ := strconv.Atoi(strings.Split(solution, "\t")[0])
expectedCIGAR := strings.Split(solution, "\t")[1]
sequences.Scan()
s1 := sequences.Text()
@ -85,9 +153,25 @@ func TestWFA(t *testing.T) {
x := wfa.WFAlign(s1, s2, testPenalties, true)
gotScore := x.Score
gotCIGAR := x.CIGAR
if gotScore != -1*expectedScore {
t.Errorf(`test: %s#%d, s1: %s, s2: %s, got: %d, expected: %d\n`, testName, idx, s1, s2, gotScore, expectedScore)
t.Errorf(`test: %s#%d, s1: %s, s2: %s, got: %d, expected: %d`, testName, idx, s1, s2, gotScore, expectedScore)
os.Exit(1)
}
if gotCIGAR != expectedCIGAR {
checkScore := GetScoreFromCIGAR(gotCIGAR, testPenalties)
CIGARCorrectness := CheckCIGARCorrectness(s1, s2, gotCIGAR)
if checkScore != gotScore && checkScore != -1*expectedScore { // nonequivalent alignment
t.Errorf(`test: %s#%d, s1: %s, s2: %s, got: [%s], expected: [%s]`, testName, idx, s1, s2, gotCIGAR, expectedCIGAR)
t.Errorf(`test: %s#%d, recalculated score: %d`, testName, idx, checkScore)
os.Exit(1)
}
if !CIGARCorrectness {
t.Errorf(`test: %s#%d, s1: %s, s2: %s, got: [%s], expected: [%s]`, testName, idx, s1, s2, gotCIGAR, expectedCIGAR)
os.Exit(1)
}
}
idx++