scrive · arybczak · Aug 2, 2022 · Apr 26, 2021 · Apr 27, 2021 · Apr 28, 2021
diff --git a/src/Database/PostgreSQL/PQTypes/Checks.hs b/src/Database/PostgreSQL/PQTypes/Checks.hs
@@ -829,6 +829,45 @@ checkDBConsistency options domains tablesWithVersions migrations = do
           runSQL_ "COMMIT"
           runQuery_ (sqlDropIndexConcurrently tname idx) `finally` begin
           updateTableVersion
+
+        ModifyColumnMigration tableName cursorSql updateSql batchSize -> do
+          logMigration
+          withCursorSQL "migration_cursor" NoScroll Hold cursorSql $ \cursor -> do
+            -- Vacuum should be done approximately once every 5% of the table has been
+            -- updated, or every 50 rows as a minimum.
+            --
+            -- In PostgreSQL, when a record is updated, a new version of this record is
+            -- created. The old one is destroyed by the "vacuum" command when no
+            -- transaction needs it anymore. So there's an autovacuum daemon whose purpose
+            -- is to do this cleanup, and that is sufficient most of the time. We assume
+            -- that it's tuned to try to keep the "bloat" (dead records) at around 10% of
+            -- the table size in the environment, and it's also tuned to not saturate the
+            -- server with IO operations while doing the vacuum - vacuuming is IO
+            -- intensive as there are a lot of reads and rewrites, which makes it slow and
+            -- costly. So, autovacuum wouldn't be able to keep up with the aggressive
+            -- batch update. Therefore we need to run vacuum ourselves, to keep things in
+            -- check. The 5% limit is arbitrary, but a reasonable ballpark estimate: it
+            -- more or less makes sure we keep dead records in the 10% envelope and the
+            -- table doesn't grow too much during the operation.
+            vacuumThreshold <- max 50 . fromIntegral . (`div` 20) <$> getRowEstimate tableName
+            let cursorLoop processed = do
+                  cursorFetch_ cursor (CD_Forward batchSize)
+                  primaryKeys <- fetchMany id
+                  logInfo_ $ T.pack $ "PKS: " ++ show (length primaryKeys)
+                  unless (null primaryKeys) $ do
+                    updateSql primaryKeys
+                    commit
+                    if processed + batchSize >= vacuumThreshold
+                    then do
+                      bracket_ (runSQL_ "COMMIT")
+                               (runSQL_ "BEGIN")
+                               (runQuery_ $ "VACUUM" <+> tableName)
+                      cursorLoop 0
+                    else
+                      cursorLoop (processed + batchSize)
+            cursorLoop 0
+          updateTableVersion
+
       where
         logMigration = do
           logInfo_ $ arrListTable mgrTableName
@@ -839,6 +878,16 @@ checkDBConsistency options domains tablesWithVersions migrations = do
             sqlSet "version"  (succ mgrFrom)
             sqlWhereEq "name" (T.unpack . unRawSQL $ mgrTableName)
 
+        -- Get the estimated number of rows of the given table. It might not work properly
+        -- if the table is present in multiple database schemas.
+        -- See https://wiki.postgresql.org/wiki/Count_estimate.
+        getRowEstimate :: MonadDB m => RawSQL () -> m Int32
+        getRowEstimate tableName = do
+          runQuery_ . sqlSelect "pg_class" $ do
+            sqlResult "reltuples::integer"
+            sqlWhereEq "relname" $ unRawSQL tableName
+          fetchOne runIdentity
+
     runMigrations :: [(Text, Int32)] -> m ()
     runMigrations dbTablesWithVersions = do
       let migrationsToRun = findMigrationsToRun dbTablesWithVersions

diff --git a/src/Database/PostgreSQL/PQTypes/Model/Migration.hs b/src/Database/PostgreSQL/PQTypes/Model/Migration.hs
@@ -31,8 +31,10 @@ module Database.PostgreSQL.PQTypes.Model.Migration (
 
 import Data.Int
 
+import Database.PostgreSQL.PQTypes.FromRow (FromRow)
 import Database.PostgreSQL.PQTypes.Model.Index
 import Database.PostgreSQL.PQTypes.Model.Table
+import Database.PostgreSQL.PQTypes.SQL (SQL)
 import Database.PostgreSQL.PQTypes.SQL.Raw
 
 -- | Migration action to run, either an arbitrary 'MonadDB' action, or
@@ -57,6 +59,26 @@ data MigrationAction m =
       (RawSQL ()) -- ^ Table name
       TableIndex  -- ^ Index
 
+  -- | Migration for modifying columns. Parameters are:
+  --
+  -- Name of the table that the cursor is associated with. It has to be the same as in the
+  -- cursor SQL, see the second parameter.
+  --
+  -- SQL that will be used for the cursor.
+  --
+  -- Function that takes a list of primary keys provided by the cursor SQL and
+  -- runs an arbitrary computation within MonadDB. The function might be called
+  -- repeatedly depending on the number of primary keys. See the last argument.
+  --
+  -- Number of primary keys fetched at once by the cursor SQL.
+  -- To handle multi-column primary keys, the following needs to be done:
+  --
+  --   1. Get the list of tuples from PostgreSQL.
+  --   2. Unzip them into a tuple of lists in Haskell.
+  --   3. Pass the lists to PostgreSQL as separate parameters and zip them back in the SQL,
+  --      see https://stackoverflow.com/questions/12414750/is-there-something-like-a-zip-function-in-postgresql-that-combines-two-arrays for more details.
+  | forall t . FromRow t => ModifyColumnMigration (RawSQL ()) SQL ([t] -> m ()) Int
+
 -- | Migration object.
 data Migration m =
   Migration {
@@ -78,6 +100,7 @@ isStandardMigration Migration{..} =
     DropTableMigration{}               -> False
     CreateIndexConcurrentlyMigration{} -> False
     DropIndexConcurrentlyMigration{}   -> False
+    ModifyColumnMigration{}            -> False
 
 isDropTableMigration :: Migration m -> Bool
 isDropTableMigration Migration{..} =
@@ -86,3 +109,4 @@ isDropTableMigration Migration{..} =
     DropTableMigration{}               -> True
     CreateIndexConcurrentlyMigration{} -> False
     DropIndexConcurrentlyMigration{}   -> False
+    ModifyColumnMigration{}            -> False
diff --git a/test/Main.hs b/test/Main.hs
@@ -1,8 +1,12 @@
 module Main where
 
 import Control.Monad.Catch
+import Control.Monad (forM_)
 import Control.Monad.IO.Class
 import Data.Either
+import Data.List (zip4)
+import Data.Monoid
+import Prelude
 import Data.Typeable
 import Data.UUID.Types
 import qualified Data.Set as Set
@@ -1280,6 +1284,144 @@ triggerTests connSource =
 eitherExc :: MonadCatch m => (SomeException -> m ()) -> (a -> m ()) -> m a -> m ()
 eitherExc left right c = try c >>= either left right
 
+migrationTest5 :: ConnectionSourceM (LogT IO) -> TestTree
+migrationTest5 connSource =
+  testCaseSteps' "Migration test 5" connSource $ \step -> do
+    freshTestDB step
+
+    step "Creating the database (schema version 1)..."
+    migrateDatabase defaultExtrasOptions ["pgcrypto"] [] [] [table1] [createTableMigration table1]
+    checkDatabase defaultExtrasOptions [] [] [table1]
+
+    step "Populating the 'bank' table..."
+    runQuery_ . sqlInsert "bank" $ do
+      sqlSetList "name" $ (\i -> "bank" <> show i) <$> numbers
+      sqlSetList "location" $ (\i -> "location" <> show i) <$> numbers
+
+    -- Explicitly vacuum to update the catalog so that getting the row number estimates
+    -- works. The bracket_ trick is here because vacuum can't run inside a transaction
+    -- block, which every test runs in.
+    bracket_ (runSQL_ "COMMIT")
+             (runSQL_ "BEGIN")
+             (runSQL_ "VACUUM bank")
+
+    forM_ (zip4 tables migrations steps assertions) $
+      \(table, migration, step', assertion) -> do
+        step step'
+        migrateDatabase defaultExtrasOptions ["pgcrypto"] [] [] [table] [migration]
+        checkDatabase defaultExtrasOptions [] [] [table]
+        uncurry assertNoException assertion
+
+    freshTestDB step
+
+  where
+    -- Chosen by a fair dice roll.
+    numbers = [1..101] :: [Int]
+    table1 = tableBankSchema1
+    tables = [ table1 { tblVersion = 2
+                      , tblColumns = tblColumns table1 ++ [stringColumn]
+                      }
+             , table1 { tblVersion = 3
+                      , tblColumns = tblColumns table1 ++ [stringColumn]
+                      }
+             , table1 { tblVersion = 4
+                      , tblColumns = tblColumns table1 ++ [stringColumn, boolColumn]
+                      }
+             , table1 { tblVersion = 5
+                      , tblColumns = tblColumns table1 ++ [stringColumn, boolColumn]
+                      }
+             ]
+
+    migrations = [ addStringColumnMigration
+                 , copyStringColumnMigration
+                 , addBoolColumnMigration
+                 , modifyBoolColumnMigration
+                 ]
+
+    steps = [ "Adding string column (version 1 -> version 2)..."
+            , "Copying string column (version 2 -> version 3)..."
+            , "Adding bool column (version 3 -> version 4)..."
+            , "Modifying bool column (version 4 -> version 5)..."
+            ]
+
+    assertions =
+      [ ("Check that the string column has been added" :: String, checkAddStringColumn)
+      , ("Check that the string data has been copied", checkCopyStringColumn)
+      , ("Check that the bool column has been added", checkAddBoolColumn)
+      , ("Check that the bool column has been modified", checkModifyBoolColumn)
+      ]
+
+    stringColumn = tblColumn { colName = "name_new"
+                             , colType = TextT
+                             }
+
+    boolColumn = tblColumn { colName = "name_is_true"
+                            , colType = BoolT
+                            , colNullable = False
+                            , colDefault = Just "false"
+                            }
+
+    cursorSql = "SELECT id FROM bank" :: SQL
+
+    addStringColumnMigration = Migration
+      { mgrTableName = "bank"
+      , mgrFrom = 1
+      , mgrAction = StandardMigration $
+        runQuery_ $ sqlAlterTable "bank" [ sqlAddColumn stringColumn ]
+      }
+
+    copyStringColumnMigration = Migration
+      { mgrTableName = "bank"
+      , mgrFrom = 2
+      , mgrAction = ModifyColumnMigration "bank" cursorSql copyColumnSql 5
+      }
+    copyColumnSql :: MonadDB m => [Identity UUID] -> m ()
+    copyColumnSql primaryKeys =
+      runQuery_ . sqlUpdate "bank" $ do
+        sqlSetCmd "name_new" "bank.name"
+        sqlWhereIn "bank.id" $ runIdentity <$> primaryKeys
+
+    addBoolColumnMigration = Migration
+      { mgrTableName = "bank"
+      , mgrFrom = 3
+      , mgrAction = StandardMigration $
+        runQuery_ $ sqlAlterTable "bank" [ sqlAddColumn boolColumn ]
+      }
+
+    modifyBoolColumnMigration = Migration
+      { mgrTableName = "bank"
+      , mgrFrom = 4
+      , mgrAction = ModifyColumnMigration "bank" cursorSql modifyColumnSql 120
+      }
+    modifyColumnSql :: MonadDB m => [Identity UUID] -> m ()
+    modifyColumnSql primaryKeys =
+      runQuery_ . sqlUpdate "bank" $ do
+        sqlSet "name_is_true" True
+        sqlWhereIn "bank.id" $ runIdentity <$> primaryKeys
+
+    checkAddStringColumn = do
+      runQuery_ . sqlSelect "bank" $ sqlResult "name_new"
+      rows :: [Maybe T.Text] <- fetchMany runIdentity
+      liftIO . assertEqual "No name_new in empty column" True $ all (== Nothing) rows
+
+    checkCopyStringColumn = do
+      runQuery_ . sqlSelect "bank" $ sqlResult "name_new"
+      rows_new :: [Maybe T.Text] <- fetchMany runIdentity
+      runQuery_ . sqlSelect "bank" $ sqlResult "name"
+      rows_old :: [Maybe T.Text] <- fetchMany runIdentity
+      liftIO . assertEqual "All name_new are equal name" True $
+        all (uncurry (==)) $ zip rows_new rows_old
+
+    checkAddBoolColumn = do
+      runQuery_ . sqlSelect "bank" $ sqlResult "name_is_true"
+      rows :: [Maybe Bool] <- fetchMany runIdentity
+      liftIO . assertEqual "All name_is_true default to false" True $ all (== Just False) rows
+
+    checkModifyBoolColumn = do
+      runQuery_ . sqlSelect "bank" $ sqlResult "name_is_true"
+      rows :: [Maybe Bool] <- fetchMany runIdentity
+      liftIO . assertEqual "All name_is_true are true" True $ all (== Just True) rows
+
 assertNoException :: String -> TestM () -> TestM ()
 assertNoException t c = eitherExc
   (const $ liftIO $ assertFailure ("Exception thrown for: " ++ t))
@@ -1319,6 +1461,7 @@ main = do
                          , migrationTest2 connSource
                          , migrationTest3 connSource
                          , migrationTest4 connSource
+                         , migrationTest5 connSource
                          , triggerTests connSource
                          ]
   where