fix issue in WFBacktrace and change format to proper CIGAR,
add test to ensure CIGAR correctness in the case of different traceback results, add DecodeCIGAR function to exports
This commit is contained in:
parent
3da3ddf10c
commit
cde429cb80
2
Makefile
2
Makefile
@ -3,7 +3,7 @@
|
||||
build: clean
|
||||
@echo "======================== Building Binary ======================="
|
||||
minify wfa.js > dist/wfa.js
|
||||
GOOS=js GOARCH=wasm CGO_ENABLED=0 tinygo build -no-debug -opt=2 -target=wasm -o dist/wfa.wasm .
|
||||
GOOS=js GOARCH=wasm CGO_ENABLED=0 tinygo build -panic=trap -no-debug -opt=2 -target=wasm -o dist/wfa.wasm .
|
||||
|
||||
clean:
|
||||
@echo "======================== Cleaning Project ======================"
|
||||
|
19
main.go
19
main.go
@ -9,6 +9,7 @@ import (
|
||||
func main() {
|
||||
c := make(chan bool)
|
||||
js.Global().Set("wfAlign", js.FuncOf(wfAlign))
|
||||
js.Global().Set("DecodeCIGAR", js.FuncOf(DecodeCIGAR))
|
||||
<-c
|
||||
}
|
||||
|
||||
@ -70,3 +71,21 @@ func wfAlign(this js.Value, args []js.Value) interface{} {
|
||||
|
||||
return js.ValueOf(resultMap)
|
||||
}
|
||||
|
||||
func DecodeCIGAR(this js.Value, args []js.Value) interface{} {
|
||||
if len(args) != 1 {
|
||||
fmt.Println("invalid number of args, requires 1: CIGAR")
|
||||
return nil
|
||||
}
|
||||
|
||||
if args[0].Type() != js.TypeString {
|
||||
fmt.Println("s1 should be a string")
|
||||
return nil
|
||||
}
|
||||
|
||||
CIGAR := args[0].String()
|
||||
|
||||
decoded := wfa.RunLengthDecode(CIGAR)
|
||||
|
||||
return js.ValueOf(decoded)
|
||||
}
|
||||
|
@ -39,10 +39,10 @@ func PackWavefrontValue(value uint32, traceback Traceback) WavefrontValue {
|
||||
}
|
||||
|
||||
// UnpackWavefrontValue: opens a WavefrontValue into a valid bool, diag value and traceback
|
||||
func UnpackWavefrontValue(wf WavefrontValue) (bool, uint32, Traceback) {
|
||||
valueBM := uint32(wf & 0x0FFF_FFFF)
|
||||
tracebackBM := uint8(wf & 0x7000_0000 >> 28)
|
||||
validBM := wf&0x8000_0000 != 0
|
||||
func UnpackWavefrontValue(wfv WavefrontValue) (bool, uint32, Traceback) {
|
||||
valueBM := uint32(wfv & 0x0FFF_FFFF)
|
||||
tracebackBM := uint8(wfv & 0x7000_0000 >> 28)
|
||||
validBM := wfv&0x8000_0000 != 0
|
||||
return validBM, valueBM, Traceback(tracebackBM)
|
||||
}
|
||||
|
||||
|
69
pkg/utils.go
69
pkg/utils.go
@ -2,11 +2,55 @@ package wfa
|
||||
|
||||
import (
|
||||
"math"
|
||||
"unicode/utf8"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/exp/constraints"
|
||||
)
|
||||
|
||||
func UIntToString(num uint) string { // num assumed to be positive
|
||||
var builder strings.Builder
|
||||
|
||||
for num > 0 {
|
||||
digit := num % 10
|
||||
builder.WriteRune(rune('0' + digit))
|
||||
num /= 10
|
||||
}
|
||||
|
||||
// Reverse the string as we built it in reverse order
|
||||
str := []rune(builder.String())
|
||||
for i, j := 0, len(str)-1; i < j; i, j = i+1, j-1 {
|
||||
str[i], str[j] = str[j], str[i]
|
||||
}
|
||||
|
||||
return string(str)
|
||||
}
|
||||
|
||||
func RunLengthDecode(encoded string) string {
|
||||
decoded := strings.Builder{}
|
||||
length := len(encoded)
|
||||
i := 0
|
||||
|
||||
for i < length {
|
||||
// If the current character is a digit, we need to extract the run length
|
||||
runLength := 0
|
||||
for i < length && encoded[i] >= '0' && encoded[i] <= '9' {
|
||||
runLength = runLength*10 + int(encoded[i]-'0')
|
||||
i++
|
||||
}
|
||||
|
||||
// The next character will be the character to repeat
|
||||
if i < length {
|
||||
char := encoded[i]
|
||||
for j := 0; j < runLength; j++ {
|
||||
decoded.WriteByte(char)
|
||||
}
|
||||
i++ // Move past the character
|
||||
}
|
||||
}
|
||||
|
||||
return decoded.String()
|
||||
}
|
||||
|
||||
func SafeMin[T constraints.Integer](values []T, idx int) T {
|
||||
return values[idx]
|
||||
}
|
||||
@ -51,21 +95,6 @@ func SafeArgMin[T constraints.Integer](valids []bool, values []T) (bool, int) {
|
||||
}
|
||||
}
|
||||
|
||||
func Reverse(s string) string {
|
||||
size := len(s)
|
||||
buf := make([]byte, size)
|
||||
for start := 0; start < size; {
|
||||
r, n := utf8.DecodeRuneInString(s[start:])
|
||||
start += n
|
||||
utf8.EncodeRune(buf[size-start:], r)
|
||||
}
|
||||
return string(buf)
|
||||
}
|
||||
|
||||
func Splice(s string, c rune, idx int) string {
|
||||
return s[:idx] + string(c) + s[idx:]
|
||||
}
|
||||
|
||||
func NextLoHi(M WavefrontComponent, I WavefrontComponent, D WavefrontComponent, score int, penalties Penalty) (int, int) {
|
||||
x := penalties.X
|
||||
o := penalties.O
|
||||
@ -117,11 +146,8 @@ func NextD(M WavefrontComponent, D WavefrontComponent, score int, k int, penalti
|
||||
a_ok, a, _ := M.GetVal(score-o-e, k+1)
|
||||
b_ok, b, _ := D.GetVal(score-e, k+1)
|
||||
|
||||
ok, nextDTraceback := SafeArgMax(
|
||||
[]bool{a_ok, b_ok},
|
||||
[]uint32{a, b},
|
||||
)
|
||||
nextDVal := SafeMax([]uint32{a, b}, nextDTraceback) // nothing special
|
||||
ok, nextDTraceback := SafeArgMax([]bool{a_ok, b_ok}, []uint32{a, b})
|
||||
nextDVal := SafeMax([]uint32{a, b}, nextDTraceback)
|
||||
if ok {
|
||||
D.SetVal(score, k, nextDVal, []Traceback{OpenDel, ExtdDel}[nextDTraceback])
|
||||
}
|
||||
@ -137,7 +163,6 @@ func NextM(M WavefrontComponent, I WavefrontComponent, D WavefrontComponent, sco
|
||||
|
||||
ok, nextMTraceback := SafeArgMax([]bool{a_ok, b_ok, c_ok}, []uint32{a, b, c})
|
||||
nextMVal := SafeMax([]uint32{a, b, c}, nextMTraceback)
|
||||
|
||||
if ok {
|
||||
M.SetVal(score, k, nextMVal, []Traceback{Sub, Ins, Del}[nextMTraceback])
|
||||
}
|
||||
|
128
pkg/wfa.go
128
pkg/wfa.go
@ -1,5 +1,9 @@
|
||||
package wfa
|
||||
|
||||
import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
func WFAlign(s1 string, s2 string, penalties Penalty, doCIGAR bool) Result {
|
||||
n := len(s1)
|
||||
m := len(s2)
|
||||
@ -25,7 +29,7 @@ func WFAlign(s1 string, s2 string, penalties Penalty, doCIGAR bool) Result {
|
||||
|
||||
CIGAR := ""
|
||||
if doCIGAR {
|
||||
CIGAR = WFBacktrace(M, I, D, score, penalties, A_k, s1, s2)
|
||||
CIGAR = WFBacktrace(M, I, D, score, penalties, A_k, A_offset, s1, s2)
|
||||
}
|
||||
|
||||
return Result{
|
||||
@ -67,76 +71,128 @@ func WFNext(M WavefrontComponent, I WavefrontComponent, D WavefrontComponent, sc
|
||||
}
|
||||
}
|
||||
|
||||
func WFBacktrace(M WavefrontComponent, I WavefrontComponent, D WavefrontComponent, score int, penalties Penalty, A_k int, s1 string, s2 string) string {
|
||||
traceback_CIGAR := []string{"I", "I", "D", "D", "X", "", "", ""}
|
||||
func WFBacktrace(M WavefrontComponent, I WavefrontComponent, D WavefrontComponent, score int, penalties Penalty, A_k int, A_offset uint32, s1 string, s2 string) string {
|
||||
x := penalties.X
|
||||
o := penalties.O
|
||||
e := penalties.E
|
||||
CIGAR_rev := ""
|
||||
|
||||
tb_s := score
|
||||
tb_k := A_k
|
||||
_, _, current_traceback := M.GetVal(tb_s, tb_k)
|
||||
done := false
|
||||
|
||||
_, current_dist, current_traceback := M.GetVal(tb_s, tb_k)
|
||||
|
||||
Ops := []rune{'~'}
|
||||
Counts := []uint{0}
|
||||
idx := 0
|
||||
|
||||
for !done {
|
||||
CIGAR_rev = CIGAR_rev + traceback_CIGAR[current_traceback]
|
||||
switch current_traceback {
|
||||
case OpenIns:
|
||||
if Ops[idx] == 'I' {
|
||||
Counts[idx]++
|
||||
} else {
|
||||
Ops = append(Ops, 'I')
|
||||
Counts = append(Counts, 1)
|
||||
idx++
|
||||
}
|
||||
|
||||
tb_s = tb_s - o - e
|
||||
tb_k = tb_k - 1
|
||||
_, _, current_traceback = M.GetVal(tb_s, tb_k)
|
||||
_, current_dist, current_traceback = M.GetVal(tb_s, tb_k)
|
||||
case ExtdIns:
|
||||
if Ops[idx] == 'I' {
|
||||
Counts[idx]++
|
||||
} else {
|
||||
Ops = append(Ops, 'I')
|
||||
Counts = append(Counts, 1)
|
||||
idx++
|
||||
}
|
||||
|
||||
tb_s = tb_s - e
|
||||
tb_k = tb_k - 1
|
||||
_, _, current_traceback = I.GetVal(tb_s, tb_k)
|
||||
_, current_dist, current_traceback = I.GetVal(tb_s, tb_k)
|
||||
case OpenDel:
|
||||
if Ops[idx] == 'D' {
|
||||
Counts[idx]++
|
||||
} else {
|
||||
Ops = append(Ops, 'D')
|
||||
Counts = append(Counts, 1)
|
||||
idx++
|
||||
}
|
||||
|
||||
tb_s = tb_s - o - e
|
||||
tb_k = tb_k + 1
|
||||
_, _, current_traceback = M.GetVal(tb_s, tb_k)
|
||||
_, current_dist, current_traceback = M.GetVal(tb_s, tb_k)
|
||||
case ExtdDel:
|
||||
if Ops[idx] == 'D' {
|
||||
Counts[idx]++
|
||||
} else {
|
||||
Ops = append(Ops, 'D')
|
||||
Counts = append(Counts, 1)
|
||||
idx++
|
||||
}
|
||||
|
||||
tb_s = tb_s - e
|
||||
tb_k = tb_k + 1
|
||||
_, _, current_traceback = D.GetVal(tb_s, tb_k)
|
||||
_, current_dist, current_traceback = D.GetVal(tb_s, tb_k)
|
||||
case Sub:
|
||||
tb_s = tb_s - x
|
||||
// tb_k = tb_k;
|
||||
_, _, current_traceback = M.GetVal(tb_s, tb_k)
|
||||
_, next_dist, next_traceback := M.GetVal(tb_s, tb_k)
|
||||
|
||||
if int(current_dist-next_dist)-1 > 0 {
|
||||
Ops = append(Ops, 'M')
|
||||
Counts = append(Counts, uint(current_dist-next_dist)-1)
|
||||
idx++
|
||||
}
|
||||
|
||||
if Ops[idx] == 'X' {
|
||||
Counts[idx]++
|
||||
} else {
|
||||
Ops = append(Ops, 'X')
|
||||
Counts = append(Counts, 1)
|
||||
idx++
|
||||
}
|
||||
|
||||
current_dist = next_dist
|
||||
current_traceback = next_traceback
|
||||
case Ins:
|
||||
// tb_s = tb_s;
|
||||
// tb_k = tb_k;
|
||||
_, _, current_traceback = I.GetVal(tb_s, tb_k)
|
||||
_, next_dist, next_traceback := I.GetVal(tb_s, tb_k)
|
||||
|
||||
Ops = append(Ops, 'M')
|
||||
Counts = append(Counts, uint(current_dist-next_dist))
|
||||
idx++
|
||||
|
||||
current_dist = next_dist
|
||||
current_traceback = next_traceback
|
||||
case Del:
|
||||
// tb_s = tb_s;
|
||||
// tb_k = tb_k;
|
||||
_, _, current_traceback = D.GetVal(tb_s, tb_k)
|
||||
_, next_dist, next_traceback := D.GetVal(tb_s, tb_k)
|
||||
|
||||
Ops = append(Ops, 'M')
|
||||
Counts = append(Counts, uint(current_dist-next_dist))
|
||||
idx++
|
||||
|
||||
current_dist = next_dist
|
||||
current_traceback = next_traceback
|
||||
case End:
|
||||
Ops = append(Ops, 'M')
|
||||
Counts = append(Counts, uint(current_dist))
|
||||
idx++
|
||||
|
||||
done = true
|
||||
}
|
||||
}
|
||||
|
||||
CIGAR_part := Reverse(CIGAR_rev)
|
||||
c := 0
|
||||
i := 0
|
||||
j := 0
|
||||
for i < len(s1) && j < len(s2) {
|
||||
if s1[i] == s2[j] {
|
||||
//CIGAR_part.splice(c, 0, "M")
|
||||
CIGAR_part = Splice(CIGAR_part, 'M', c)
|
||||
c++
|
||||
i++
|
||||
j++
|
||||
} else if CIGAR_part[c] == 'X' {
|
||||
c++
|
||||
i++
|
||||
j++
|
||||
} else if CIGAR_part[c] == 'I' {
|
||||
c++
|
||||
j++
|
||||
} else if CIGAR_part[c] == 'D' {
|
||||
c++
|
||||
i++
|
||||
}
|
||||
CIGAR := strings.Builder{}
|
||||
for i := len(Ops) - 1; i > 0; i-- {
|
||||
CIGAR.WriteString(UIntToString(Counts[i]))
|
||||
CIGAR.WriteRune(Ops[i])
|
||||
}
|
||||
|
||||
return CIGAR_part
|
||||
return CIGAR.String()
|
||||
}
|
||||
|
@ -3,6 +3,7 @@ package tests
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/json"
|
||||
"log"
|
||||
"math/rand/v2"
|
||||
"os"
|
||||
"strconv"
|
||||
@ -46,6 +47,71 @@ func TestWavefrontPacking(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func GetScoreFromCIGAR(CIGAR string, penalties wfa.Penalty) int {
|
||||
unpackedCIGAR := wfa.RunLengthDecode(CIGAR)
|
||||
previousOp := '~'
|
||||
score := 0
|
||||
for _, Op := range unpackedCIGAR {
|
||||
if Op == 'M' {
|
||||
score = score + penalties.M
|
||||
} else if Op == 'X' {
|
||||
score = score + penalties.X
|
||||
} else if (Op == 'I' && previousOp != 'I') || (Op == 'D' && previousOp != 'D') {
|
||||
score = score + penalties.O + penalties.E
|
||||
} else if (Op == 'I' && previousOp == 'I') || (Op == 'D' && previousOp == 'D') {
|
||||
score = score + penalties.E
|
||||
}
|
||||
previousOp = Op
|
||||
}
|
||||
return score
|
||||
}
|
||||
|
||||
func CheckCIGARCorrectness(s1 string, s2 string, CIGAR string) bool {
|
||||
unpackedCIGAR := wfa.RunLengthDecode(CIGAR)
|
||||
i := 0
|
||||
j := 0
|
||||
|
||||
s1Aligned := strings.Builder{}
|
||||
alignment := strings.Builder{}
|
||||
s2Aligned := strings.Builder{}
|
||||
|
||||
for c := 0; c < len(unpackedCIGAR); c++ {
|
||||
Op := unpackedCIGAR[c]
|
||||
if Op == 'M' {
|
||||
s1Aligned.WriteByte(s1[i])
|
||||
alignment.WriteRune('|')
|
||||
s2Aligned.WriteByte(s2[j])
|
||||
i++
|
||||
j++
|
||||
} else if Op == 'X' {
|
||||
s1Aligned.WriteByte(s1[i])
|
||||
alignment.WriteRune(' ')
|
||||
s2Aligned.WriteByte(s2[j])
|
||||
i++
|
||||
j++
|
||||
} else if Op == 'I' {
|
||||
|
||||
s1Aligned.WriteRune('-')
|
||||
alignment.WriteRune(' ')
|
||||
s2Aligned.WriteByte(s2[j])
|
||||
|
||||
j++
|
||||
} else if Op == 'D' {
|
||||
s1Aligned.WriteByte(s1[i])
|
||||
alignment.WriteRune('|')
|
||||
s2Aligned.WriteRune('-')
|
||||
i++
|
||||
}
|
||||
}
|
||||
|
||||
if i == len(s1) && j == len(s2) {
|
||||
return true
|
||||
} else {
|
||||
log.Printf("\n%s\n%s\n%s\n i=%d, j=%d, |s1|=%d, |s2|=%d\n", s1Aligned.String(), alignment.String(), s2Aligned.String(), i, j, len(s1), len(s2))
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func TestWFA(t *testing.T) {
|
||||
content, _ := os.ReadFile(testJsonPath)
|
||||
|
||||
@ -73,7 +139,9 @@ func TestWFA(t *testing.T) {
|
||||
|
||||
for solutions.Scan() {
|
||||
solution := solutions.Text()
|
||||
|
||||
expectedScore, _ := strconv.Atoi(strings.Split(solution, "\t")[0])
|
||||
expectedCIGAR := strings.Split(solution, "\t")[1]
|
||||
|
||||
sequences.Scan()
|
||||
s1 := sequences.Text()
|
||||
@ -85,9 +153,25 @@ func TestWFA(t *testing.T) {
|
||||
|
||||
x := wfa.WFAlign(s1, s2, testPenalties, true)
|
||||
gotScore := x.Score
|
||||
gotCIGAR := x.CIGAR
|
||||
|
||||
if gotScore != -1*expectedScore {
|
||||
t.Errorf(`test: %s#%d, s1: %s, s2: %s, got: %d, expected: %d\n`, testName, idx, s1, s2, gotScore, expectedScore)
|
||||
t.Errorf(`test: %s#%d, s1: %s, s2: %s, got: %d, expected: %d`, testName, idx, s1, s2, gotScore, expectedScore)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
if gotCIGAR != expectedCIGAR {
|
||||
checkScore := GetScoreFromCIGAR(gotCIGAR, testPenalties)
|
||||
CIGARCorrectness := CheckCIGARCorrectness(s1, s2, gotCIGAR)
|
||||
if checkScore != gotScore && checkScore != -1*expectedScore { // nonequivalent alignment
|
||||
t.Errorf(`test: %s#%d, s1: %s, s2: %s, got: [%s], expected: [%s]`, testName, idx, s1, s2, gotCIGAR, expectedCIGAR)
|
||||
t.Errorf(`test: %s#%d, recalculated score: %d`, testName, idx, checkScore)
|
||||
os.Exit(1)
|
||||
}
|
||||
if !CIGARCorrectness {
|
||||
t.Errorf(`test: %s#%d, s1: %s, s2: %s, got: [%s], expected: [%s]`, testName, idx, s1, s2, gotCIGAR, expectedCIGAR)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
idx++
|
||||
|
Loading…
Reference in New Issue
Block a user