Skip to content

Commit e8453da

Browse files
committed
Change extension filter to list of filters and stops
1 parent 30737b1 commit e8453da

File tree

4 files changed

+263
-72
lines changed

4 files changed

+263
-72
lines changed

cmd/src/snapshot_upload.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ func copyDumpToBucket(ctx context.Context, src io.ReadSeeker, stat fs.FileInfo,
154154

155155
// Do a partial copy that trims out unwanted statements
156156
if trimExtensions {
157-
written, err := pgdump.PartialCopyWithoutExtensions(object, src, progressFn)
157+
written, err := pgdump.CommentOutInvalidLines(object, src, progressFn)
158158
if err != nil {
159159
return errors.Wrap(err, "trim extensions and upload")
160160
}

internal/pgdump/extensions.go

Lines changed: 85 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -8,45 +8,105 @@ import (
88
"github.com/sourcegraph/sourcegraph/lib/errors"
99
)
1010

11-
// PartialCopyWithoutExtensions will perform a partial copy of a SQL database dump from
12-
// src to dst while commenting out EXTENSIONs-related statements. When it determines there
13-
// are no more EXTENSIONs-related statements, it will return, resetting src to the position
14-
// of the last contents written to dst.
11+
// CommentOutInvalidLines will comment out lines in the customer's SQL database dump file
12+
// which gcloud sql import errors out on
1513
//
16-
// This is needed for import to Google Cloud Storage, which does not like many EXTENSION
17-
// statements. For more details, see https://cloud.google.com/sql/docs/postgres/import-export/import-export-dmp
14+
// It performs a partial copy of a SQL database dump from
15+
// src to dst while commenting out the problematic lines.
16+
// When it determines there are no more EXTENSIONs-related statements,
17+
// it will return, resetting src to the position of the last contents written to dst.
18+
//
19+
// This is needed for import to Google Cloud Storage, which does not like many statements which pg_dump may insert
20+
// For more details, see https://cloud.google.com/sql/docs/postgres/import-export/import-export-dmp
1821
//
1922
// Filtering requires reading entire lines into memory - this can be a very expensive
20-
// operation, so when filtering is complete the more efficient io.Copy should be used
23+
// operation, so when filtering is complete, the more efficient io.Copy should be used
2124
// to perform the remainder of the copy from src to dst.
22-
func PartialCopyWithoutExtensions(dst io.Writer, src io.ReadSeeker, progressFn func(int64)) (int64, error) {
25+
func CommentOutInvalidLines(dst io.Writer, src io.ReadSeeker, progressFn func(int64)) (int64, error) {
2326
var (
2427
reader = bufio.NewReader(src)
25-
// position we have consumed up to, track separately because bufio.Reader may have
26-
// read ahead on src. This allows us to reset src later.
28+
29+
// Position we have consumed up to
30+
// Tracked separately because bufio.Reader may have read ahead on src
31+
// This allows us to reset src later
2732
consumed int64
28-
// number of bytes we have actually written to dst - it should always be returned.
33+
34+
// Number of bytes we have actually written to dst
35+
// It should always be returned
2936
written int64
30-
// set to true when we have done all our filtering
31-
noMoreExtensions bool
37+
38+
// Set to true when we start to hit lines which indicate that we may be finished filtering
39+
noMoreLinesToFilter bool
40+
41+
filterEndMarkers = []string{
42+
"CREATE TABLE",
43+
"INSERT INTO",
44+
}
45+
46+
linesToFilter = []string{
47+
48+
"DROP DATABASE",
49+
"CREATE DATABASE",
50+
"COMMENT ON DATABASE",
51+
52+
"DROP SCHEMA",
53+
"CREATE SCHEMA",
54+
"COMMENT ON SCHEMA",
55+
56+
"DROP EXTENSION",
57+
"CREATE EXTENSION",
58+
"COMMENT ON EXTENSION",
59+
60+
"SET transaction_timeout", // pg_dump v17, importing to Postgres 16
61+
62+
"\\connect",
63+
// "\\restrict",
64+
// "\\unrestrict",
65+
}
3266
)
3367

34-
for !noMoreExtensions {
68+
for !noMoreLinesToFilter {
69+
3570
// Read up to a line, keeping track of our position in src
3671
line, err := reader.ReadBytes('\n')
3772
consumed += int64(len(line))
38-
if err != nil {
73+
74+
// If this function has read through the whole file,
75+
// then hand the last line
76+
if err == io.EOF {
77+
noMoreLinesToFilter = true
78+
79+
// If the reader has found a different error,
80+
// then return what we've processed so far
81+
} else if err != nil {
3982
return written, err
4083
}
4184

42-
// Once we start seeing table creations, we are definitely done with extensions,
43-
// so we can hand off the rest to the superior io.Copy implementation.
44-
if bytes.HasPrefix(line, []byte("CREATE TABLE")) {
45-
// we are done with extensions
46-
noMoreExtensions = true
47-
} else if bytes.HasPrefix(line, []byte("COMMENT ON EXTENSION")) {
48-
// comment out this line
49-
line = append([]byte("-- "), line...)
85+
// Once we start seeing these lines,
86+
// we are probably done with the invalid statements,
87+
// so we can hand off the rest to the more efficient io.Copy implementation
88+
for _, filterEndMarker := range filterEndMarkers {
89+
90+
if bytes.HasPrefix(line, []byte(filterEndMarker)) {
91+
92+
// We are probably done with the invalid statements
93+
noMoreLinesToFilter = true
94+
break
95+
96+
}
97+
}
98+
99+
if !noMoreLinesToFilter {
100+
101+
for _, lineToFilter := range linesToFilter {
102+
103+
if bytes.HasPrefix(line, []byte(lineToFilter)) {
104+
105+
line = append([]byte("-- "), line...)
106+
break
107+
108+
}
109+
}
50110
}
51111

52112
// Write this line and update our progress before returning on error
@@ -58,7 +118,8 @@ func PartialCopyWithoutExtensions(dst io.Writer, src io.ReadSeeker, progressFn f
58118
}
59119
}
60120

61-
// No more extensions - reset src to the last actual consumed position
121+
// No more lines to filter
122+
// Reset src to the last actual consumed position
62123
_, err := src.Seek(consumed, io.SeekStart)
63124
if err != nil {
64125
return written, errors.Wrap(err, "reset src position")

internal/pgdump/extensions_test.go

Lines changed: 176 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -4,67 +4,197 @@ import (
44
"bytes"
55
"io"
66
"os"
7-
"path/filepath"
87
"runtime"
98
"testing"
109

1110
"github.com/hexops/autogold"
12-
"github.com/stretchr/testify/assert"
1311
"github.com/stretchr/testify/require"
1412
)
1513

16-
func TestPartialCopyWithoutExtensions(t *testing.T) {
17-
if runtime.GOOS == "windows" {
18-
t.Skip("Test doesn't work on Windows of weirdness with t.TempDir() handling")
19-
}
20-
21-
// Create test data - there is no stdlib in-memory io.ReadSeeker implementation
22-
src, err := os.Create(filepath.Join(t.TempDir(), t.Name()))
14+
// createTestFile creates a temporary file with the given content for testing
15+
func createTestFile(t *testing.T, content string) *os.File {
16+
src, err := os.CreateTemp(t.TempDir(), "test-*.sql")
2317
require.NoError(t, err)
24-
_, err = src.WriteString(`-- Some comment
25-
26-
CREATE EXTENSION foobar
27-
28-
COMMENT ON EXTENSION barbaz
29-
30-
CREATE TYPE asdf
31-
32-
CREATE TABLE robert (
33-
...
34-
)
35-
36-
CREATE TABLE bobhead (
37-
...
38-
)`)
18+
_, err = src.WriteString(content)
3919
require.NoError(t, err)
4020
_, err = src.Seek(0, io.SeekStart)
4121
require.NoError(t, err)
22+
return src
23+
}
4224

43-
// Set up target to assert against
44-
var dst bytes.Buffer
45-
46-
// Perform partial copy
47-
_, err = PartialCopyWithoutExtensions(&dst, src, func(i int64) {})
48-
assert.NoError(t, err)
49-
50-
// Copy rest of contents
51-
_, err = io.Copy(&dst, src)
52-
assert.NoError(t, err)
53-
54-
// Assert contents (update with -update)
55-
autogold.Want("partial-copy-without-extensions", `-- Some comment
25+
func TestCommentOutInvalidLines(t *testing.T) {
26+
if runtime.GOOS == "windows" {
27+
t.Skip("Test doesn't work on Windows of weirdness with t.TempDir() handling")
28+
}
5629

57-
CREATE EXTENSION foobar
30+
tests := []struct {
31+
name string
32+
input string
33+
want string
34+
}{
35+
36+
{
37+
name: "EOF - input file doesn't contain any filterEndMarkers, or linesToFilter",
38+
input: `
39+
--
40+
-- PostgreSQL database dump
41+
--
42+
`,
43+
want: `
44+
--
45+
-- PostgreSQL database dump
46+
--
47+
`,
48+
},
49+
{
50+
name: "EOF - input file doesn't contain any filterEndMarkers, but does contain linesToFilter",
51+
input: `
52+
DROP DATABASE pgsql;
53+
`,
54+
want: `
55+
-- DROP DATABASE pgsql;
56+
`,
57+
},
58+
{
59+
name: "Customer-realistic dump, with extensions and schemas",
60+
input: `
61+
--
62+
-- PostgreSQL database dump
63+
--
64+
65+
\restrict 1e9XN4yltwkS6RMoyhkFC6hmzrkbz4fZIVvJSYP3h5B1Qvii1WnhlslcPAzK8Tb
66+
67+
-- Dumped from database version 12.22
68+
-- Dumped by pg_dump version 14.19 (Homebrew)
69+
70+
-- Started on 2025-10-29 20:49:56 IST
71+
72+
SET transaction_timeout = 10;
73+
SET statement_timeout = 0;
74+
SET lock_timeout = 0;
75+
SET idle_in_transaction_session_timeout = 0;
76+
SET client_encoding = 'UTF8';
77+
SET standard_conforming_strings = on;
78+
SELECT pg_catalog.set_config('search_path', '', false);
79+
SET check_function_bodies = false;
80+
SET xmloption = content;
81+
SET client_min_messages = warning;
82+
SET row_security = off;
83+
84+
DROP DATABASE pgsql IF EXISTS;
85+
CREATE DATABASE pgsql;
86+
COMMENT ON DATABASE pgsql IS 'database';
87+
88+
\connect pgsql
89+
90+
DROP SCHEMA public IF EXISTS;
91+
CREATE SCHEMA public;
92+
COMMENT ON SCHEMA public IS 'schema';
93+
94+
DROP EXTENSION IF EXISTS pg_stat_statements;
95+
CREATE EXTENSION pg_stat_statements;
96+
COMMENT ON EXTENSION pg_stat_statements IS 'extension';
97+
98+
ALTER TABLE IF EXISTS ONLY "public"."webhooks" DROP CONSTRAINT IF EXISTS "webhooks_updated_by_user_id_fkey";
99+
DROP TRIGGER IF EXISTS "versions_insert" ON "public"."versions";
100+
DROP INDEX IF EXISTS "public"."webhook_logs_status_code_idx";
101+
DROP SEQUENCE IF EXISTS "public"."webhooks_id_seq";
102+
DROP TABLE IF EXISTS "public"."webhooks";
103+
SET default_tablespace = '';
104+
SET default_table_access_method = "heap";
105+
106+
CREATE TABLE "public"."access_tokens" (
107+
"id" bigint NOT NULL,
108+
"subject_user_id" integer NOT NULL,
109+
"value_sha256" "bytea" NOT NULL,
110+
"note" "text" NOT NULL,
111+
"created_at" timestamp with time zone DEFAULT "now"() NOT NULL,
112+
"last_used_at" timestamp with time zone,
113+
"deleted_at" timestamp with time zone,
114+
"creator_user_id" integer NOT NULL,
115+
"scopes" "text"[] NOT NULL,
116+
"internal" boolean DEFAULT false
117+
);
118+
119+
\unrestrict 1e9XN4yltwkS6RMoyhkFC6hmzrkbz4fZIVvJSYP3h5B1Qvii1WnhlslcPAzK8Tb
120+
`,
121+
want: `
122+
--
123+
-- PostgreSQL database dump
124+
--
125+
126+
\restrict 1e9XN4yltwkS6RMoyhkFC6hmzrkbz4fZIVvJSYP3h5B1Qvii1WnhlslcPAzK8Tb
127+
128+
-- Dumped from database version 12.22
129+
-- Dumped by pg_dump version 14.19 (Homebrew)
130+
131+
-- Started on 2025-10-29 20:49:56 IST
132+
133+
-- SET transaction_timeout = 10;
134+
SET statement_timeout = 0;
135+
SET lock_timeout = 0;
136+
SET idle_in_transaction_session_timeout = 0;
137+
SET client_encoding = 'UTF8';
138+
SET standard_conforming_strings = on;
139+
SELECT pg_catalog.set_config('search_path', '', false);
140+
SET check_function_bodies = false;
141+
SET xmloption = content;
142+
SET client_min_messages = warning;
143+
SET row_security = off;
144+
145+
-- DROP DATABASE pgsql IF EXISTS;
146+
-- CREATE DATABASE pgsql;
147+
-- COMMENT ON DATABASE pgsql IS 'database';
148+
149+
-- \connect pgsql
150+
151+
-- DROP SCHEMA public IF EXISTS;
152+
-- CREATE SCHEMA public;
153+
-- COMMENT ON SCHEMA public IS 'schema';
154+
155+
-- DROP EXTENSION IF EXISTS pg_stat_statements;
156+
-- CREATE EXTENSION pg_stat_statements;
157+
-- COMMENT ON EXTENSION pg_stat_statements IS 'extension';
158+
159+
ALTER TABLE IF EXISTS ONLY "public"."webhooks" DROP CONSTRAINT IF EXISTS "webhooks_updated_by_user_id_fkey";
160+
DROP TRIGGER IF EXISTS "versions_insert" ON "public"."versions";
161+
DROP INDEX IF EXISTS "public"."webhook_logs_status_code_idx";
162+
DROP SEQUENCE IF EXISTS "public"."webhooks_id_seq";
163+
DROP TABLE IF EXISTS "public"."webhooks";
164+
SET default_tablespace = '';
165+
SET default_table_access_method = "heap";
166+
167+
CREATE TABLE "public"."access_tokens" (
168+
"id" bigint NOT NULL,
169+
"subject_user_id" integer NOT NULL,
170+
"value_sha256" "bytea" NOT NULL,
171+
"note" "text" NOT NULL,
172+
"created_at" timestamp with time zone DEFAULT "now"() NOT NULL,
173+
"last_used_at" timestamp with time zone,
174+
"deleted_at" timestamp with time zone,
175+
"creator_user_id" integer NOT NULL,
176+
"scopes" "text"[] NOT NULL,
177+
"internal" boolean DEFAULT false
178+
);
179+
180+
\unrestrict 1e9XN4yltwkS6RMoyhkFC6hmzrkbz4fZIVvJSYP3h5B1Qvii1WnhlslcPAzK8Tb
181+
`,
182+
},
183+
}
58184

59-
-- COMMENT ON EXTENSION barbaz
185+
for _, tt := range tests {
186+
t.Run(tt.name, func(t *testing.T) {
187+
src := createTestFile(t, tt.input)
188+
var dst bytes.Buffer
60189

61-
CREATE TYPE asdf
190+
_, err := CommentOutInvalidLines(&dst, src, func(i int64) {})
191+
require.NoError(t, err)
62192

63-
CREATE TABLE robert (
64-
...
65-
)
193+
// Copy rest of contents
194+
_, err = io.Copy(&dst, src)
195+
require.NoError(t, err)
66196

67-
CREATE TABLE bobhead (
68-
...
69-
)`).Equal(t, dst.String())
197+
autogold.Want(tt.name, tt.want).Equal(t, dst.String())
198+
})
199+
}
70200
}

0 commit comments

Comments
 (0)