diff --git a/Makefile b/Makefile index 9666920..d311783 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ build: clean @echo "======================== Building Binary =======================" minify wfa.js > dist/wfa.js - GOOS=js GOARCH=wasm CGO_ENABLED=0 tinygo build -no-debug -opt=2 -target=wasm -o dist/wfa.wasm . + GOOS=js GOARCH=wasm CGO_ENABLED=0 tinygo build -panic=trap -no-debug -opt=2 -target=wasm -o dist/wfa.wasm . clean: @echo "======================== Cleaning Project ======================" diff --git a/main.go b/main.go index 35cd236..a552e0f 100644 --- a/main.go +++ b/main.go @@ -9,6 +9,7 @@ import ( func main() { c := make(chan bool) js.Global().Set("wfAlign", js.FuncOf(wfAlign)) + js.Global().Set("DecodeCIGAR", js.FuncOf(DecodeCIGAR)) <-c } @@ -70,3 +71,21 @@ func wfAlign(this js.Value, args []js.Value) interface{} { return js.ValueOf(resultMap) } + +func DecodeCIGAR(this js.Value, args []js.Value) interface{} { + if len(args) != 1 { + fmt.Println("invalid number of args, requires 1: CIGAR") + return nil + } + + if args[0].Type() != js.TypeString { + fmt.Println("s1 should be a string") + return nil + } + + CIGAR := args[0].String() + + decoded := wfa.RunLengthDecode(CIGAR) + + return js.ValueOf(decoded) +} diff --git a/pkg/types.go b/pkg/types.go index e7733b5..431c152 100644 --- a/pkg/types.go +++ b/pkg/types.go @@ -39,10 +39,10 @@ func PackWavefrontValue(value uint32, traceback Traceback) WavefrontValue { } // UnpackWavefrontValue: opens a WavefrontValue into a valid bool, diag value and traceback -func UnpackWavefrontValue(wf WavefrontValue) (bool, uint32, Traceback) { - valueBM := uint32(wf & 0x0FFF_FFFF) - tracebackBM := uint8(wf & 0x7000_0000 >> 28) - validBM := wf&0x8000_0000 != 0 +func UnpackWavefrontValue(wfv WavefrontValue) (bool, uint32, Traceback) { + valueBM := uint32(wfv & 0x0FFF_FFFF) + tracebackBM := uint8(wfv & 0x7000_0000 >> 28) + validBM := wfv&0x8000_0000 != 0 return validBM, valueBM, Traceback(tracebackBM) } diff --git a/pkg/utils.go b/pkg/utils.go index a806159..2971f0a 100644 --- a/pkg/utils.go +++ b/pkg/utils.go @@ -2,11 +2,55 @@ package wfa import ( "math" - "unicode/utf8" + "strings" "golang.org/x/exp/constraints" ) +func UIntToString(num uint) string { // num assumed to be positive + var builder strings.Builder + + for num > 0 { + digit := num % 10 + builder.WriteRune(rune('0' + digit)) + num /= 10 + } + + // Reverse the string as we built it in reverse order + str := []rune(builder.String()) + for i, j := 0, len(str)-1; i < j; i, j = i+1, j-1 { + str[i], str[j] = str[j], str[i] + } + + return string(str) +} + +func RunLengthDecode(encoded string) string { + decoded := strings.Builder{} + length := len(encoded) + i := 0 + + for i < length { + // If the current character is a digit, we need to extract the run length + runLength := 0 + for i < length && encoded[i] >= '0' && encoded[i] <= '9' { + runLength = runLength*10 + int(encoded[i]-'0') + i++ + } + + // The next character will be the character to repeat + if i < length { + char := encoded[i] + for j := 0; j < runLength; j++ { + decoded.WriteByte(char) + } + i++ // Move past the character + } + } + + return decoded.String() +} + func SafeMin[T constraints.Integer](values []T, idx int) T { return values[idx] } @@ -51,21 +95,6 @@ func SafeArgMin[T constraints.Integer](valids []bool, values []T) (bool, int) { } } -func Reverse(s string) string { - size := len(s) - buf := make([]byte, size) - for start := 0; start < size; { - r, n := utf8.DecodeRuneInString(s[start:]) - start += n - utf8.EncodeRune(buf[size-start:], r) - } - return string(buf) -} - -func Splice(s string, c rune, idx int) string { - return s[:idx] + string(c) + s[idx:] -} - func NextLoHi(M WavefrontComponent, I WavefrontComponent, D WavefrontComponent, score int, penalties Penalty) (int, int) { x := penalties.X o := penalties.O @@ -117,11 +146,8 @@ func NextD(M WavefrontComponent, D WavefrontComponent, score int, k int, penalti a_ok, a, _ := M.GetVal(score-o-e, k+1) b_ok, b, _ := D.GetVal(score-e, k+1) - ok, nextDTraceback := SafeArgMax( - []bool{a_ok, b_ok}, - []uint32{a, b}, - ) - nextDVal := SafeMax([]uint32{a, b}, nextDTraceback) // nothing special + ok, nextDTraceback := SafeArgMax([]bool{a_ok, b_ok}, []uint32{a, b}) + nextDVal := SafeMax([]uint32{a, b}, nextDTraceback) if ok { D.SetVal(score, k, nextDVal, []Traceback{OpenDel, ExtdDel}[nextDTraceback]) } @@ -137,7 +163,6 @@ func NextM(M WavefrontComponent, I WavefrontComponent, D WavefrontComponent, sco ok, nextMTraceback := SafeArgMax([]bool{a_ok, b_ok, c_ok}, []uint32{a, b, c}) nextMVal := SafeMax([]uint32{a, b, c}, nextMTraceback) - if ok { M.SetVal(score, k, nextMVal, []Traceback{Sub, Ins, Del}[nextMTraceback]) } diff --git a/pkg/wfa.go b/pkg/wfa.go index 1bc52d5..b716cbb 100644 --- a/pkg/wfa.go +++ b/pkg/wfa.go @@ -1,5 +1,9 @@ package wfa +import ( + "strings" +) + func WFAlign(s1 string, s2 string, penalties Penalty, doCIGAR bool) Result { n := len(s1) m := len(s2) @@ -25,7 +29,7 @@ func WFAlign(s1 string, s2 string, penalties Penalty, doCIGAR bool) Result { CIGAR := "" if doCIGAR { - CIGAR = WFBacktrace(M, I, D, score, penalties, A_k, s1, s2) + CIGAR = WFBacktrace(M, I, D, score, penalties, A_k, A_offset, s1, s2) } return Result{ @@ -67,76 +71,128 @@ func WFNext(M WavefrontComponent, I WavefrontComponent, D WavefrontComponent, sc } } -func WFBacktrace(M WavefrontComponent, I WavefrontComponent, D WavefrontComponent, score int, penalties Penalty, A_k int, s1 string, s2 string) string { - traceback_CIGAR := []string{"I", "I", "D", "D", "X", "", "", ""} +func WFBacktrace(M WavefrontComponent, I WavefrontComponent, D WavefrontComponent, score int, penalties Penalty, A_k int, A_offset uint32, s1 string, s2 string) string { x := penalties.X o := penalties.O e := penalties.E - CIGAR_rev := "" + tb_s := score tb_k := A_k - _, _, current_traceback := M.GetVal(tb_s, tb_k) done := false + _, current_dist, current_traceback := M.GetVal(tb_s, tb_k) + + Ops := []rune{'~'} + Counts := []uint{0} + idx := 0 + for !done { - CIGAR_rev = CIGAR_rev + traceback_CIGAR[current_traceback] switch current_traceback { case OpenIns: + if Ops[idx] == 'I' { + Counts[idx]++ + } else { + Ops = append(Ops, 'I') + Counts = append(Counts, 1) + idx++ + } + tb_s = tb_s - o - e tb_k = tb_k - 1 - _, _, current_traceback = M.GetVal(tb_s, tb_k) + _, current_dist, current_traceback = M.GetVal(tb_s, tb_k) case ExtdIns: + if Ops[idx] == 'I' { + Counts[idx]++ + } else { + Ops = append(Ops, 'I') + Counts = append(Counts, 1) + idx++ + } + tb_s = tb_s - e tb_k = tb_k - 1 - _, _, current_traceback = I.GetVal(tb_s, tb_k) + _, current_dist, current_traceback = I.GetVal(tb_s, tb_k) case OpenDel: + if Ops[idx] == 'D' { + Counts[idx]++ + } else { + Ops = append(Ops, 'D') + Counts = append(Counts, 1) + idx++ + } + tb_s = tb_s - o - e tb_k = tb_k + 1 - _, _, current_traceback = M.GetVal(tb_s, tb_k) + _, current_dist, current_traceback = M.GetVal(tb_s, tb_k) case ExtdDel: + if Ops[idx] == 'D' { + Counts[idx]++ + } else { + Ops = append(Ops, 'D') + Counts = append(Counts, 1) + idx++ + } + tb_s = tb_s - e tb_k = tb_k + 1 - _, _, current_traceback = D.GetVal(tb_s, tb_k) + _, current_dist, current_traceback = D.GetVal(tb_s, tb_k) case Sub: tb_s = tb_s - x // tb_k = tb_k; - _, _, current_traceback = M.GetVal(tb_s, tb_k) + _, next_dist, next_traceback := M.GetVal(tb_s, tb_k) + + if int(current_dist-next_dist)-1 > 0 { + Ops = append(Ops, 'M') + Counts = append(Counts, uint(current_dist-next_dist)-1) + idx++ + } + + if Ops[idx] == 'X' { + Counts[idx]++ + } else { + Ops = append(Ops, 'X') + Counts = append(Counts, 1) + idx++ + } + + current_dist = next_dist + current_traceback = next_traceback case Ins: // tb_s = tb_s; // tb_k = tb_k; - _, _, current_traceback = I.GetVal(tb_s, tb_k) + _, next_dist, next_traceback := I.GetVal(tb_s, tb_k) + + Ops = append(Ops, 'M') + Counts = append(Counts, uint(current_dist-next_dist)) + idx++ + + current_dist = next_dist + current_traceback = next_traceback case Del: // tb_s = tb_s; // tb_k = tb_k; - _, _, current_traceback = D.GetVal(tb_s, tb_k) + _, next_dist, next_traceback := D.GetVal(tb_s, tb_k) + + Ops = append(Ops, 'M') + Counts = append(Counts, uint(current_dist-next_dist)) + idx++ + + current_dist = next_dist + current_traceback = next_traceback case End: + Ops = append(Ops, 'M') + Counts = append(Counts, uint(current_dist)) + idx++ + done = true } } - CIGAR_part := Reverse(CIGAR_rev) - c := 0 - i := 0 - j := 0 - for i < len(s1) && j < len(s2) { - if s1[i] == s2[j] { - //CIGAR_part.splice(c, 0, "M") - CIGAR_part = Splice(CIGAR_part, 'M', c) - c++ - i++ - j++ - } else if CIGAR_part[c] == 'X' { - c++ - i++ - j++ - } else if CIGAR_part[c] == 'I' { - c++ - j++ - } else if CIGAR_part[c] == 'D' { - c++ - i++ - } + CIGAR := strings.Builder{} + for i := len(Ops) - 1; i > 0; i-- { + CIGAR.WriteString(UIntToString(Counts[i])) + CIGAR.WriteRune(Ops[i]) } - return CIGAR_part + return CIGAR.String() } diff --git a/test/wfa_test.go b/test/wfa_test.go index 5107b66..7fdee7d 100644 --- a/test/wfa_test.go +++ b/test/wfa_test.go @@ -3,6 +3,7 @@ package tests import ( "bufio" "encoding/json" + "log" "math/rand/v2" "os" "strconv" @@ -46,6 +47,71 @@ func TestWavefrontPacking(t *testing.T) { } } +func GetScoreFromCIGAR(CIGAR string, penalties wfa.Penalty) int { + unpackedCIGAR := wfa.RunLengthDecode(CIGAR) + previousOp := '~' + score := 0 + for _, Op := range unpackedCIGAR { + if Op == 'M' { + score = score + penalties.M + } else if Op == 'X' { + score = score + penalties.X + } else if (Op == 'I' && previousOp != 'I') || (Op == 'D' && previousOp != 'D') { + score = score + penalties.O + penalties.E + } else if (Op == 'I' && previousOp == 'I') || (Op == 'D' && previousOp == 'D') { + score = score + penalties.E + } + previousOp = Op + } + return score +} + +func CheckCIGARCorrectness(s1 string, s2 string, CIGAR string) bool { + unpackedCIGAR := wfa.RunLengthDecode(CIGAR) + i := 0 + j := 0 + + s1Aligned := strings.Builder{} + alignment := strings.Builder{} + s2Aligned := strings.Builder{} + + for c := 0; c < len(unpackedCIGAR); c++ { + Op := unpackedCIGAR[c] + if Op == 'M' { + s1Aligned.WriteByte(s1[i]) + alignment.WriteRune('|') + s2Aligned.WriteByte(s2[j]) + i++ + j++ + } else if Op == 'X' { + s1Aligned.WriteByte(s1[i]) + alignment.WriteRune(' ') + s2Aligned.WriteByte(s2[j]) + i++ + j++ + } else if Op == 'I' { + + s1Aligned.WriteRune('-') + alignment.WriteRune(' ') + s2Aligned.WriteByte(s2[j]) + + j++ + } else if Op == 'D' { + s1Aligned.WriteByte(s1[i]) + alignment.WriteRune('|') + s2Aligned.WriteRune('-') + i++ + } + } + + if i == len(s1) && j == len(s2) { + return true + } else { + log.Printf("\n%s\n%s\n%s\n i=%d, j=%d, |s1|=%d, |s2|=%d\n", s1Aligned.String(), alignment.String(), s2Aligned.String(), i, j, len(s1), len(s2)) + return false + } +} + func TestWFA(t *testing.T) { content, _ := os.ReadFile(testJsonPath) @@ -73,7 +139,9 @@ func TestWFA(t *testing.T) { for solutions.Scan() { solution := solutions.Text() + expectedScore, _ := strconv.Atoi(strings.Split(solution, "\t")[0]) + expectedCIGAR := strings.Split(solution, "\t")[1] sequences.Scan() s1 := sequences.Text() @@ -85,9 +153,25 @@ func TestWFA(t *testing.T) { x := wfa.WFAlign(s1, s2, testPenalties, true) gotScore := x.Score + gotCIGAR := x.CIGAR if gotScore != -1*expectedScore { - t.Errorf(`test: %s#%d, s1: %s, s2: %s, got: %d, expected: %d\n`, testName, idx, s1, s2, gotScore, expectedScore) + t.Errorf(`test: %s#%d, s1: %s, s2: %s, got: %d, expected: %d`, testName, idx, s1, s2, gotScore, expectedScore) + os.Exit(1) + } + + if gotCIGAR != expectedCIGAR { + checkScore := GetScoreFromCIGAR(gotCIGAR, testPenalties) + CIGARCorrectness := CheckCIGARCorrectness(s1, s2, gotCIGAR) + if checkScore != gotScore && checkScore != -1*expectedScore { // nonequivalent alignment + t.Errorf(`test: %s#%d, s1: %s, s2: %s, got: [%s], expected: [%s]`, testName, idx, s1, s2, gotCIGAR, expectedCIGAR) + t.Errorf(`test: %s#%d, recalculated score: %d`, testName, idx, checkScore) + os.Exit(1) + } + if !CIGARCorrectness { + t.Errorf(`test: %s#%d, s1: %s, s2: %s, got: [%s], expected: [%s]`, testName, idx, s1, s2, gotCIGAR, expectedCIGAR) + os.Exit(1) + } } idx++