Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add migration type for modifying columns #49

Merged
merged 8 commits into from
Aug 2, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions src/Database/PostgreSQL/PQTypes/Checks.hs
Original file line number Diff line number Diff line change
Expand Up @@ -829,6 +829,48 @@ checkDBConsistency options domains tablesWithVersions migrations = do
runSQL_ "COMMIT"
runQuery_ (sqlDropIndexConcurrently tname idx) `finally` begin
updateTableVersion

-- Batched, cursor-driven column modification: a server-side cursor
-- streams primary keys, and the user-supplied action updates the
-- table one batch at a time, committing after each batch and
-- vacuuming periodically to keep table bloat in check.
ModifyColumnMigration tableName cursorSql updateSql batchSize -> do
  logMigration
  -- Tiny batches make per-batch overhead (fetch, commit, vacuum
  -- bookkeeping) dominate, so enforce a lower bound up front.
  when (batchSize < 1000) $ do
    error "Batch size cannot be less than 1000"
  withCursorSQL "migration_cursor" NoScroll Hold cursorSql $ \cursor -> do
    -- Vacuum should be done approximately once every 5% of the table
    -- has been updated, or every 1000 rows as a minimum.
    --
    -- In PostgreSQL, when a record is updated, a new version of this
    -- record is created. The old one is destroyed by the "vacuum"
    -- command when no transaction needs it anymore. So there's an
    -- autovacuum daemon whose purpose is to do this cleanup, and that
    -- is sufficient most of the time. We assume that it's tuned to try
    -- to keep the "bloat" (dead records) at around 10% of the table
    -- size in the environment, and it's also tuned to not saturate the
    -- server with IO operations while doing the vacuum - vacuuming is
    -- IO intensive as there are a lot of reads and rewrites, which
    -- makes it slow and costly. So, autovacuum wouldn't be able to keep
    -- up with the aggressive batch update. Therefore we need to run
    -- vacuum ourselves, to keep things in check. The 5% limit is
    -- arbitrary, but a reasonable ballpark estimate: it more or less
    -- makes sure we keep dead records in the 10% envelope and the table
    -- doesn't grow too much during the operation.
    --
    -- `div` 20 == 5% of the estimated row count; never vacuum more
    -- often than every 1000 rows.
    vacuumThreshold <- max 1000 . fromIntegral . (`div` 20) <$> getRowEstimate tableName
    let cursorLoop processed = do
          -- Pull the next batch of keys; an empty fetch means the
          -- cursor is exhausted and the loop ends.
          cursorFetch_ cursor (CD_Forward batchSize)
          primaryKeys <- fetchMany id
          unless (null primaryKeys) $ do
            updateSql primaryKeys
            if processed + batchSize >= vacuumThreshold
              then do
                -- VACUUM cannot run inside a transaction block, so
                -- commit first; bracket_ reopens a transaction even
                -- if the VACUUM itself fails.
                bracket_ (runSQL_ "COMMIT")
                         (runSQL_ "BEGIN")
                         (runQuery_ $ "VACUUM" <+> tableName)
                cursorLoop 0
              else do
                -- Commit per batch so dead row versions become
                -- reclaimable as we go.
                commit
                cursorLoop (processed + batchSize)
    cursorLoop 0
  updateTableVersion

where
logMigration = do
logInfo_ $ arrListTable mgrTableName
Expand All @@ -839,6 +881,16 @@ checkDBConsistency options domains tablesWithVersions migrations = do
sqlSet "version" (succ mgrFrom)
sqlWhereEq "name" (T.unpack . unRawSQL $ mgrTableName)

-- Estimate the number of rows in the given table using the
-- reltuples statistic from the pg_class catalog. The estimate can be
-- wrong if the same table name exists in more than one schema.
-- See https://wiki.postgresql.org/wiki/Count_estimate.
getRowEstimate :: MonadDB m => RawSQL () -> m Int32
getRowEstimate tbl =
  (runQuery_ . sqlSelect "pg_class" $ do
     sqlResult "reltuples::integer"
     sqlWhereEq "relname" $ unRawSQL tbl)
    >> fetchOne runIdentity

runMigrations :: [(Text, Int32)] -> m ()
runMigrations dbTablesWithVersions = do
let migrationsToRun = findMigrationsToRun dbTablesWithVersions
Expand Down
24 changes: 24 additions & 0 deletions src/Database/PostgreSQL/PQTypes/Model/Migration.hs
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,10 @@ module Database.PostgreSQL.PQTypes.Model.Migration (

import Data.Int

import Database.PostgreSQL.PQTypes.FromRow (FromRow)
import Database.PostgreSQL.PQTypes.Model.Index
import Database.PostgreSQL.PQTypes.Model.Table
import Database.PostgreSQL.PQTypes.SQL (SQL)
import Database.PostgreSQL.PQTypes.SQL.Raw

-- | Migration action to run, either an arbitrary 'MonadDB' action, or
Expand All @@ -57,6 +59,26 @@ data MigrationAction m =
(RawSQL ()) -- ^ Table name
TableIndex -- ^ Index

-- | Migration for modifying columns. Parameters are:
--
-- Name of the table that the cursor is associated with. It has to be the same as in the
-- cursor SQL, see the second parameter.
--
-- SQL that will be used for the cursor.
--
-- Function that takes a list of primary keys provided by the cursor SQL and
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Technically, it doesn't have to be primary keys right? As long as you can uniquely identify a column.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

unique not null, i suppose (just to be pedantic, but that's important)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After re-reading my comment I have to make a correction: I mean ROW, not COLUMN.

But I'm not sure how not null applies here. You either uniquely identify a record or don't, there's no space for NULLs in that. Maybe you meant that column that one would/could reference has UNIQUE NOT NULL constraint, but I believe that is what uniquely identify a row/record implies in that case.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

unique not null is just the sql constraint. unique is a dangerous term when talking about sql databases, because null = null returns null, which is neither true nor false, except if you really are in a predicate context, where it finally becomes false. so i supposed you were talking about the constraint, which has to be unique not null if you really want to enforce the column to be unique (in the usual sense of unique). So as I said, that's me being pedantic, but i just wanted to be sure the constraint, in the end, is unique not null

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this what people mean when they say a candidate key

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah more or less, i didn't want to confuse people. that could also work with superkeys, but those are relational algebra terms, like candidate keys. Anything that can identify uniquely a record. I just said unique not null, just because "unique" is tricky in SQL, I wanted to be sure that everybody was on the same page.

-- runs an arbitrary computation within MonadDB. The function might be called
-- repeatedly depending on the number of primary keys. See the last argument.
--
-- Number of primary keys fetched at once by the cursor SQL.
-- To handle multi-column primary keys, the following needs to be done:
--
-- 1. Get the list of tuples from PostgreSQL.
-- 2. Unzip them into a tuple of lists in Haskell.
-- 3. Pass the lists to PostgreSQL as separate parameters and zip them back in the SQL,
-- see https://stackoverflow.com/questions/12414750/is-there-something-like-a-zip-function-in-postgresql-that-combines-two-arrays for more details.
| forall t . FromRow t => ModifyColumnMigration (RawSQL ()) SQL ([t] -> m ()) Int

-- | Migration object.
data Migration m =
Migration {
Expand All @@ -78,6 +100,7 @@ isStandardMigration Migration{..} =
DropTableMigration{} -> False
CreateIndexConcurrentlyMigration{} -> False
DropIndexConcurrentlyMigration{} -> False
ModifyColumnMigration{} -> False

isDropTableMigration :: Migration m -> Bool
isDropTableMigration Migration{..} =
Expand All @@ -86,3 +109,4 @@ isDropTableMigration Migration{..} =
DropTableMigration{} -> True
CreateIndexConcurrentlyMigration{} -> False
DropIndexConcurrentlyMigration{} -> False
ModifyColumnMigration{} -> False
143 changes: 143 additions & 0 deletions test/Main.hs
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
module Main where

import Control.Monad.Catch
import Control.Monad (forM_)
import Control.Monad.IO.Class
import Data.Either
import Data.List (zip4)
import Data.Monoid
import Prelude
import Data.Typeable
import Data.UUID.Types
import qualified Data.Set as Set
Expand Down Expand Up @@ -1280,6 +1284,144 @@ triggerTests connSource =
-- Run the action, then dispatch: exceptions go to the first handler,
-- successful results to the second.
eitherExc :: MonadCatch m => (SomeException -> m ()) -> (a -> m ()) -> m a -> m ()
eitherExc onExc onResult action = either onExc onResult =<< try action

-- | Migration test 5: exercise 'ModifyColumnMigration' end to end.
-- Creates a one-table schema, populates it, then runs four migrations
-- (two standard column additions, two batched column modifications)
-- and checks the table contents after each step.
migrationTest5 :: ConnectionSourceM (LogT IO) -> TestTree
migrationTest5 connSource =
  testCaseSteps' "Migration test 5" connSource $ \step -> do
    freshTestDB step

    step "Creating the database (schema version 1)..."
    migrateDatabase defaultExtrasOptions ["pgcrypto"] [] [] [table1] [createTableMigration table1]
    checkDatabase defaultExtrasOptions [] [] [table1]

    step "Populating the 'bank' table..."
    runQuery_ . sqlInsert "bank" $ do
      sqlSetList "name" $ (\i -> "bank" <> show i) <$> numbers
      sqlSetList "location" $ (\i -> "location" <> show i) <$> numbers

    -- Explicitly vacuum to update the catalog so that getting the row number estimates
    -- works. The bracket_ trick is here because vacuum can't run inside a transaction
    -- block, which every test runs in.
    bracket_ (runSQL_ "COMMIT")
             (runSQL_ "BEGIN")
             (runSQL_ "VACUUM bank")

    -- Run each migration in turn, check the schema, then run the
    -- data-level assertion for that step.
    forM_ (zip4 tables migrations steps assertions) $
      \(table, migration, step', assertion) -> do
        step step'
        migrateDatabase defaultExtrasOptions ["pgcrypto"] [] [] [table] [migration]
        checkDatabase defaultExtrasOptions [] [] [table]
        uncurry assertNoException assertion

    freshTestDB step

  where
    -- Chosen by a fair dice roll.
    numbers = [1..101] :: [Int]
    table1 = tableBankSchema1
    -- Expected table definitions after each migration, in order: each
    -- entry pairs with the migration of the same index below.
    tables = [ table1 { tblVersion = 2
                      , tblColumns = tblColumns table1 ++ [stringColumn]
                      }
             , table1 { tblVersion = 3
                      , tblColumns = tblColumns table1 ++ [stringColumn]
                      }
             , table1 { tblVersion = 4
                      , tblColumns = tblColumns table1 ++ [stringColumn, boolColumn]
                      }
             , table1 { tblVersion = 5
                      , tblColumns = tblColumns table1 ++ [stringColumn, boolColumn]
                      }
             ]

    migrations = [ addStringColumnMigration
                 , copyStringColumnMigration
                 , addBoolColumnMigration
                 , modifyBoolColumnMigration
                 ]

    steps = [ "Adding string column (version 1 -> version 2)..."
            , "Copying string column (version 2 -> version 3)..."
            , "Adding bool column (version 3 -> version 4)..."
            , "Modifying bool column (version 4 -> version 5)..."
            ]

    -- Per-step post-conditions, each paired with a label used in the
    -- failure message.
    assertions =
      [ ("Check that the string column has been added" :: String, checkAddStringColumn)
      , ("Check that the string data has been copied", checkCopyStringColumn)
      , ("Check that the bool column has been added", checkAddBoolColumn)
      , ("Check that the bool column has been modified", checkModifyBoolColumn)
      ]

    stringColumn = tblColumn { colName = "name_new"
                             , colType = TextT
                             }

    boolColumn = tblColumn { colName = "name_is_true"
                           , colType = BoolT
                           , colNullable = False
                           , colDefault = Just "false"
                           }

    -- Cursor SQL feeding primary keys to the ModifyColumnMigration
    -- batches; must select from the same table the migration names.
    cursorSql = "SELECT id FROM bank" :: SQL

    addStringColumnMigration = Migration
      { mgrTableName = "bank"
      , mgrFrom = 1
      , mgrAction = StandardMigration $
          runQuery_ $ sqlAlterTable "bank" [ sqlAddColumn stringColumn ]
      }

    copyStringColumnMigration = Migration
      { mgrTableName = "bank"
      , mgrFrom = 2
      , mgrAction = ModifyColumnMigration "bank" cursorSql copyColumnSql 1000
      }
    -- Batch body for the copy migration: copy "name" into "name_new"
    -- for the rows whose ids are in the current batch.
    copyColumnSql :: MonadDB m => [Identity UUID] -> m ()
    copyColumnSql primaryKeys =
      runQuery_ . sqlUpdate "bank" $ do
        sqlSetCmd "name_new" "bank.name"
        sqlWhereIn "bank.id" $ runIdentity <$> primaryKeys

    addBoolColumnMigration = Migration
      { mgrTableName = "bank"
      , mgrFrom = 3
      , mgrAction = StandardMigration $
          runQuery_ $ sqlAlterTable "bank" [ sqlAddColumn boolColumn ]
      }

    modifyBoolColumnMigration = Migration
      { mgrTableName = "bank"
      , mgrFrom = 4
      , mgrAction = ModifyColumnMigration "bank" cursorSql modifyColumnSql 1000
      }
    -- Batch body for the modify migration: flip "name_is_true" to
    -- True for the rows whose ids are in the current batch.
    modifyColumnSql :: MonadDB m => [Identity UUID] -> m ()
    modifyColumnSql primaryKeys =
      runQuery_ . sqlUpdate "bank" $ do
        sqlSet "name_is_true" True
        sqlWhereIn "bank.id" $ runIdentity <$> primaryKeys

    -- After the add: the new column exists and is all NULL.
    checkAddStringColumn = do
      runQuery_ . sqlSelect "bank" $ sqlResult "name_new"
      rows :: [Maybe T.Text] <- fetchMany runIdentity
      liftIO . assertEqual "No name_new in empty column" True $ all (== Nothing) rows

    -- After the copy: name_new matches name row for row.
    checkCopyStringColumn = do
      runQuery_ . sqlSelect "bank" $ sqlResult "name_new"
      rows_new :: [Maybe T.Text] <- fetchMany runIdentity
      runQuery_ . sqlSelect "bank" $ sqlResult "name"
      rows_old :: [Maybe T.Text] <- fetchMany runIdentity
      liftIO . assertEqual "All name_new are equal name" True $
        all (uncurry (==)) $ zip rows_new rows_old

    -- After the add: the bool column exists with its default (false).
    checkAddBoolColumn = do
      runQuery_ . sqlSelect "bank" $ sqlResult "name_is_true"
      rows :: [Maybe Bool] <- fetchMany runIdentity
      liftIO . assertEqual "All name_is_true default to false" True $ all (== Just False) rows

    -- After the modify: every row has been flipped to true.
    checkModifyBoolColumn = do
      runQuery_ . sqlSelect "bank" $ sqlResult "name_is_true"
      rows :: [Maybe Bool] <- fetchMany runIdentity
      liftIO . assertEqual "All name_is_true are true" True $ all (== Just True) rows

assertNoException :: String -> TestM () -> TestM ()
assertNoException t c = eitherExc
(const $ liftIO $ assertFailure ("Exception thrown for: " ++ t))
Expand Down Expand Up @@ -1319,6 +1461,7 @@ main = do
, migrationTest2 connSource
, migrationTest3 connSource
, migrationTest4 connSource
, migrationTest5 connSource
, triggerTests connSource
]
where
Expand Down