[Tarantool-patches] [PATCH vshard 1/2] storage: introduce upgrade strategy

Vladislav Shpilevoy v.shpilevoy at tarantool.org
Sun Mar 22 22:12:55 MSK 2020


Hi! Thanks for the review!

> On 21/03/2020 21:59, Vladislav Shpilevoy wrote:
> 
>> +
>> +local function schema_upgrade_to_0_1_16_0(username)
>> +    -- Since 0.1.16.0 the old versioning by
>> +    -- 'oncevshard:storage:<number>' is dropped because it is not
>> +    -- really extendible nor understandable.
>> +    log.info('Insert vshard_version into _schema')
>> +    box.space._schema:replace({'vshard_version', 0, 1, 16, 0})
>> +    box.space._schema:delete({'oncevshard:storage:1'})
>> +end
> 
> I think it's better to wrap all upgrades into transaction or may be the whole upgrade procedure below.
Ok, see my comment in the end of the email.

>> +local function schema_current_version()
>> +    local version = box.space._schema:get({'vshard_version'})
>> +    if version == nil then
>> +        version = {0, 1, 15, 0}
>> +    else
>> +        version = version:totable()
>> +        version = {version[2], version[3], version[4], version[5]}
> 
> You could use {version:unpack(2)} here. However feel free to ignore this comment.

Cool, didn't know unpack/totable have parameters.

====================
 local function schema_current_version()
     local version = box.space._schema:get({'vshard_version'})
     if version == nil then
-        version = {0, 1, 15, 0}
+        return {0, 1, 15, 0}
     else
-        version = version:totable()
-        version = {version[2], version[3], version[4], version[5]}
+        return version:totable(2)
     end
-    return version
 end
====================

>> +    end
>> +    return version
>> +end
>> +
>> +local schema_latest_version = {0, 1, 16, 0}
>> +
>> +local function schema_upgrade_replica()
>> +    local version = schema_current_version()
>> +    -- Replica can't do upgrade - it is read-only. And it
>> +    -- shouldn't anyway - that would conflict with master doing
>> +    -- the same. So the upgrade is either non-critical, and the
>> +    -- replica can work with the new code but old schema. Or
>> +    -- it is critical, and needs to wait for the schema upgrade from
>> +    -- the master.
>> +    -- Or it may happen, that the upgrade just is not possible.
>> +    -- For example, when an auto-upgrade tries to change a too old
>> +    -- schema to the newest, skipping some intermediate versions.
>> +    -- For example, from 1.2.3.4 to 1.7.8.9, when it is assumed
>> +    -- that a safe upgrade should go 1.2.3.4 -> 1.2.4.1 ->
>> +    -- 1.3.1.1 and so on step by step.
>> +    if schema_compare_versions(version, schema_latest_version) == 0 then
>> +        log.info('Replica\'s vshard schema version is up to date - %s',
>> +                 table.concat(version, '.'))
> 
> You don't have such messages on a master (that current schema is the latest). I think the log level here should be "verbose".

I better drop this message. Not to spam into the log.

>> +    else
>> +        log.info('Replica\'s vshard schema version is outdated - current '..
>> +                 '%s vs latest %s, but the replica still can work',
>> +                 table.concat(version, '.'),
>> +                 table.concat(schema_latest_version, '.'))
>> +    end
>> +    -- In future for hard changes the replica may be suspended
>> +    -- until its schema is synced with master. Or it may
>> +    -- reject to upgrade in case of incompatible changes. Now
>> +    -- there are too few versions so as such problems could
>> +    -- appear.
>> +end
>> +
>> +local function schema_upgrade_master(username, password)
>> +    local _schema = box.space._schema
>> +    local is_old_versioning = _schema:get({'oncevshard:storage:1'}) ~= nil
>> +    local version = schema_current_version()
>> +    local is_bootstrap = not box.space._bucket
>> +
>> +    if is_bootstrap then
>> +        schema_init_0_1_15_0(username, password)
>> +    elseif is_old_versioning then
>> +        log.info('The instance does not have vshard_version record. '..
>> +                 'It is 0.1.15.0.')
>> +    end
>> +    local handlers = {
>> +        {version = {0, 1, 16, 0}, func = schema_upgrade_to_0_1_16_0},
>> +    }
>> +    assert(schema_compare_versions(handlers[#handlers].version,
>> +                                   schema_latest_version) == 0)
>> +    for _, handler in ipairs(handlers) do
>> +        local next_version = handler.version
>> +        if schema_compare_versions(next_version, version) > 0 then
>> +            local next_version_str = table.concat(next_version, '.')
>> +            log.info("Upgrade vshard to {%s}", next_version_str)
>> +            handler.func(username, password)
> 
> I think it will be good if you wrap it into pcall and add possible error handling. I see that all changes quite trivial but there is no guarantee that it will be in future. Ideally, if you rollback all already applied changes. This is an opportunity for user to downgrade module to previous version and report about a bug without any inconsistencies after upgrade.

I tried. See diff below.

Absolutely atomic non-yield upgrade is not really possible. Some
future upgrade may require change of _bucket involving thousands
of tuples, or both DDL + DML change.

But for 0.1.16.0 the atomicity is possible, I added
box.begin/commit/rollback.

Also as you can see, I left begin/commit inside upgrade handler, so
each version upgrade may decide in its handler when start a
transaction, how many tuples to commit at once, how to rollback in
case the upgrade consists of several transactions.

Besides, upgrade is atomic only in scope of one version. If you upgrade
from ver1 to ver2, and then fail to upgrade from ver2 to ver3, only
ver3 is rolled back. Otherwise it would be too complicated to add
'downgrade' handler for each version, and support it forever. With current
way you can reload/restart with ver2's code, fix problems, and try ver3
again.

However this is hard to tell, since formally we have now only ver1 and ver2.

Besides,
1) I moved handler array to a global variable so as not to create the array
on each storage.cfg();
2) I added os.exit() in case upgrade succeeded, but version bump in _schema
failed. Because downgrade can't be done already, and on the other hand it is
not acceptable to work with the newer schema, but an older version. I didn't
trust version bump to handlers since it will be forgotten someday for sure.
(Bump happens in 0.1.16.0 handler, but only because it is a part of schema
change).

====================
diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
index a20bb21..c0e768a 100644
--- a/vshard/storage/init.lua
+++ b/vshard/storage/init.lua
@@ -323,8 +323,10 @@ local function schema_upgrade_to_0_1_16_0(username)
     -- 'oncevshard:storage:<number>' is dropped because it is not
     -- really extendible nor understandable.
     log.info('Insert vshard_version into _schema')
+    box.begin()
     box.space._schema:replace({'vshard_version', 0, 1, 16, 0})
     box.space._schema:delete({'oncevshard:storage:1'})
+    box.commit()
 end
 
 local function schema_current_version()
@@ -365,6 +367,16 @@ local function schema_upgrade_replica()
     -- appear.
 end
 
+-- Every handler should be atomic. It is either applied whole, or
+-- not applied at all. Atomic upgrade helps to downgrade in case
+-- something goes wrong. At least by doing restart with the latest
+-- successfully applied version. However, atomicity does not
+-- prohibit yields, in case the upgrade, for example, affects huge
+-- number of tuples (_bucket records, maybe).
+local schema_upgrade_handlers = {
+    {version = {0, 1, 16, 0}, func = schema_upgrade_to_0_1_16_0},
+}
+
 local function schema_upgrade_master(username, password)
     local _schema = box.space._schema
     local is_old_versioning = _schema:get({'oncevshard:storage:1'}) ~= nil
@@ -377,19 +389,28 @@ local function schema_upgrade_master(username, password)
         log.info('The instance does not have vshard_version record. '..
                  'It is 0.1.15.0.')
     end
-    local handlers = {
-        {version = {0, 1, 16, 0}, func = schema_upgrade_to_0_1_16_0},
-    }
-    assert(schema_compare_versions(handlers[#handlers].version,
+    local handler_latest_version =
+        schema_upgrade_handlers[#schema_upgrade_handlers].version
+    assert(schema_compare_versions(handler_latest_version,
                                    schema_latest_version) == 0)
-    for _, handler in ipairs(handlers) do
+    for _, handler in ipairs(schema_upgrade_handlers) do
         local next_version = handler.version
         if schema_compare_versions(next_version, version) > 0 then
             local next_version_str = table.concat(next_version, '.')
             log.info("Upgrade vshard to {%s}", next_version_str)
-            handler.func(username, password)
+            local ok, err = pcall(handler.func, username, password)
+            if not ok then
+                log.info("Couldn't upgrade to {%s}: %s", next_version_str, err)
+                error(err)
+            end
             log.info("Successful vshard upgrade to {%s}", next_version_str)
-            _schema:replace({'vshard_version', unpack(next_version)})
+            ok, err = pcall(_schema.replace, _schema,
+                            {'vshard_version', unpack(next_version)})
+            if not ok then
+                log.info("Upgraded to {%s} but couldn't update _schema "..
+                         "'vshard_version' - fatal error: %s",
+                         next_version_str, err)
+                os.exit(-1)
+            end
         end
     end
 end
====================


More information about the Tarantool-patches mailing list