Skip to content

Commit f83c74b

Browse files
yihuangmmsqe
andauthored
Problem: memiavl doesn't recover from a corrupted wal tail (#1073)
* Problem: memiavl doesn't recover from a corrupted wal tail Solution: - fix in wal and update dependencies * Update CHANGELOG.md Signed-off-by: yihuang <huang@crypto.com> * fix truncate wal index * don't patch upstream * Update CHANGELOG.md Signed-off-by: yihuang <huang@crypto.com> * no dep change * fix lint * fix lint * fix lint --------- Signed-off-by: yihuang <huang@crypto.com> Co-authored-by: mmsqe <mavis@crypto.com>
1 parent 06a0b5d commit f83c74b

File tree

4 files changed

+148
-2
lines changed

4 files changed

+148
-2
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
- [#1058](https://github.com/crypto-org-chain/cronos/pull/1058) Fix decode log for multi topics in websocket subscribe ([ethermint commit](https://github.com/crypto-org-chain/ethermint/commit/2136ad029860c819942ad1836dd3f42585002233)).
2929
- [#1062](https://github.com/crypto-org-chain/cronos/pull/1062) Update cometbft `v0.34.29` with several minor bug fixes and low-severity security-fixes.
3030
- [#1075](https://github.com/crypto-org-chain/cronos/pull/1075) Add missing close in memiavl to avoid resource leaks.
31+
- [#1073](https://github.com/crypto-org-chain/cronos/pull/1073) memiavl automatically truncates the corrupted tail of the WAL.
3132

3233
### Features
3334

memiavl/db.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ func Load(dir string, opts Options) (*DB, error) {
129129
}
130130
}
131131

132-
wal, err := wal.Open(walPath(dir), &wal.Options{NoCopy: true, NoSync: true})
132+
wal, err := OpenWAL(walPath(dir), &wal.Options{NoCopy: true, NoSync: true})
133133
if err != nil {
134134
return nil, err
135135
}
@@ -316,7 +316,7 @@ func (db *DB) pruneSnapshots() {
316316
db.logger.Error("failed to find first snapshot", "err", err)
317317
}
318318

319-
if err := db.wal.TruncateFront(uint64(earliestVersion + 1)); err != nil {
319+
if err := db.wal.TruncateFront(walIndex(earliestVersion+1, db.initialVersion)); err != nil {
320320
db.logger.Error("failed to truncate wal", "err", err, "version", earliestVersion+1)
321321
}
322322
}()

memiavl/wal.go

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
package memiavl
2+
3+
import (
4+
"bytes"
5+
"encoding/binary"
6+
"fmt"
7+
"os"
8+
"path/filepath"
9+
"unsafe"
10+
11+
"github.com/tidwall/gjson"
12+
"github.com/tidwall/wal"
13+
)
14+
15+
// OpenWAL opens the write ahead log, try to truncate the corrupted tail if there's any
16+
// TODO fix in upstream: https://github.com/tidwall/wal/pull/22
17+
func OpenWAL(dir string, opts *wal.Options) (*wal.Log, error) {
18+
log, err := wal.Open(dir, opts)
19+
if err == wal.ErrCorrupt {
20+
// try to truncate corrupted tail
21+
var fis []os.DirEntry
22+
fis, err = os.ReadDir(dir)
23+
if err != nil {
24+
return nil, fmt.Errorf("read wal dir fail: %w", err)
25+
}
26+
var lastSeg string
27+
for _, fi := range fis {
28+
if fi.IsDir() || len(fi.Name()) < 20 {
29+
continue
30+
}
31+
lastSeg = fi.Name()
32+
}
33+
34+
if len(lastSeg) == 0 {
35+
return nil, err
36+
}
37+
if err = truncateCorruptedTail(filepath.Join(dir, lastSeg), opts.LogFormat); err != nil {
38+
return nil, fmt.Errorf("truncate corrupted tail fail: %w", err)
39+
}
40+
41+
// try again
42+
return wal.Open(dir, opts)
43+
}
44+
45+
return log, err
46+
}
47+
48+
func truncateCorruptedTail(path string, format wal.LogFormat) error {
49+
data, err := os.ReadFile(path)
50+
if err != nil {
51+
return err
52+
}
53+
var pos int
54+
for len(data) > 0 {
55+
var n int
56+
if format == wal.JSON {
57+
n, err = loadNextJSONEntry(data)
58+
} else {
59+
n, err = loadNextBinaryEntry(data)
60+
}
61+
if err == wal.ErrCorrupt {
62+
break
63+
}
64+
if err != nil {
65+
return err
66+
}
67+
data = data[n:]
68+
pos += n
69+
}
70+
if pos != len(data) {
71+
return os.Truncate(path, int64(pos))
72+
}
73+
return nil
74+
}
75+
76+
func loadNextJSONEntry(data []byte) (n int, err error) {
77+
// {"index":number,"data":string}
78+
idx := bytes.IndexByte(data, '\n')
79+
if idx == -1 {
80+
return 0, wal.ErrCorrupt
81+
}
82+
line := data[:idx]
83+
dres := gjson.Get(*(*string)(unsafe.Pointer(&line)), "data")
84+
if dres.Type != gjson.String {
85+
return 0, wal.ErrCorrupt
86+
}
87+
return idx + 1, nil
88+
}
89+
90+
func loadNextBinaryEntry(data []byte) (n int, err error) {
91+
// data_size + data
92+
size, n := binary.Uvarint(data)
93+
if n <= 0 {
94+
return 0, wal.ErrCorrupt
95+
}
96+
if uint64(len(data)-n) < size {
97+
return 0, wal.ErrCorrupt
98+
}
99+
return n + int(size), nil
100+
}

memiavl/wal_test.go

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
package memiavl
2+
3+
import (
4+
"os"
5+
"path/filepath"
6+
"testing"
7+
8+
"github.com/stretchr/testify/require"
9+
"github.com/tidwall/wal"
10+
)
11+
12+
func TestCorruptedTail(t *testing.T) {
13+
opts := &wal.Options{
14+
LogFormat: wal.JSON,
15+
}
16+
dir := t.TempDir()
17+
18+
testCases := []struct {
19+
name string
20+
logs []byte
21+
lastIndex uint64
22+
}{
23+
{"failure-1", []byte("\n"), 0},
24+
{"failure-2", []byte(`{}` + "\n"), 0},
25+
{"failure-3", []byte(`{"index":"1"}` + "\n"), 0},
26+
{"failure-4", []byte(`{"index":"1","data":"?"}`), 0},
27+
{"failure-5", []byte(`{"index":1,"data":"?"}` + "\n" + `{"index":"1","data":"?"}`), 1},
28+
}
29+
30+
for _, tc := range testCases {
31+
t.Run(tc.name, func(t *testing.T) {
32+
os.WriteFile(filepath.Join(dir, "00000000000000000001"), tc.logs, 0o600)
33+
34+
_, err := wal.Open(dir, opts)
35+
require.Equal(t, wal.ErrCorrupt, err)
36+
37+
log, err := OpenWAL(dir, opts)
38+
require.NoError(t, err)
39+
40+
lastIndex, err := log.LastIndex()
41+
require.NoError(t, err)
42+
require.Equal(t, tc.lastIndex, lastIndex)
43+
})
44+
}
45+
}

0 commit comments

Comments
 (0)