From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from smtpng3.m.smailru.net (smtpng3.m.smailru.net [94.100.177.149]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by dev.tarantool.org (Postfix) with ESMTPS id 464E54696C3 for ; Sun, 22 Mar 2020 22:12:57 +0300 (MSK) References: <364ea64d7bf6beef8cea83ff2fbb432ba967cd80.1584817081.git.v.shpilevoy@tarantool.org> <41dcbae6-b009-5f2e-ca28-a71ea4541488@tarantool.org> From: Vladislav Shpilevoy Message-ID: Date: Sun, 22 Mar 2020 20:12:55 +0100 MIME-Version: 1.0 In-Reply-To: <41dcbae6-b009-5f2e-ca28-a71ea4541488@tarantool.org> Content-Type: text/plain; charset="utf-8" Content-Language: en-US Content-Transfer-Encoding: 8bit Subject: Re: [Tarantool-patches] [PATCH vshard 1/2] storage: introduce upgrade strategy List-Id: Tarantool development patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Oleg Babin , tarantool-patches@dev.tarantool.org, yaroslav.dynnikov@tarantool.org Hi! Thanks for the review! > On 21/03/2020 21:59, Vladislav Shpilevoy wrote: > >> + >> +local function schema_upgrade_to_0_1_16_0(username) >> +    -- Since 0.1.16.0 the old versioning by >> +    -- 'oncevshard:storage:' is dropped because it is not >> +    -- really extendible nor understandable. >> +    log.info('Insert vshard_version into _schema') >> +    box.space._schema:replace({'vshard_version', 0, 1, 16, 0}) >> +    box.space._schema:delete({'oncevshard:storage:1'}) >> +end > > I think it's better to wrap all upgrades into transaction or may be the whole upgrade procedure below. Ok, see my comment in the end of the email. >> +local function schema_current_version() >> +    local version = box.space._schema:get({'vshard_version'}) >> +    if version == nil then >> +        version = {0, 1, 15, 0} >> +    else >> +        version = version:totable() >> +        version = {version[2], version[3], version[4], version[5]} > > You could use {version:unpack(2)} here. However feel free to ignore this comment. Cool, didn't know unpack/totable has parameters. ==================== local function schema_current_version() local version = box.space._schema:get({'vshard_version'}) if version == nil then - version = {0, 1, 15, 0} + return {0, 1, 15, 0} else - version = version:totable() - version = {version[2], version[3], version[4], version[5]} + return version:totable(2) end - return version end ==================== >> +    end >> +    return version >> +end >> + >> +local schema_latest_version = {0, 1, 16, 0} >> + >> +local function schema_upgrade_replica() >> +    local version = schema_current_version() >> +    -- Replica can't do upgrade - it is read-only. And it >> +    -- shouldn't anyway - that would conflict with master doing >> +    -- the same. So the upgrade is either non-critical, and the >> +    -- replica can work with the new code but old schema. Or it >> +    -- it is critical, and need to wait the schema upgrade from >> +    -- the master. >> +    -- Or it may happen, that the upgrade just is not possible. >> +    -- For example, when an auto-upgrade tries to change a too old >> +    -- schema to the newest, skipping some intermediate versions. >> +    -- For example, from 1.2.3.4 to 1.7.8.9, when it is assumed >> +    -- that a safe upgrade should go 1.2.3.4 -> 1.2.4.1 -> >> +    -- 1.3.1.1 and so on step by step. >> +    if schema_compare_versions(version, schema_latest_version) == 0 then >> +        log.info('Replica\'s vshard schema version is up to date - %s', >> +                 table.concat(version, '.')) > > You don't have such messages on a master (that current schema is the latest). I think the log level here should be "verbose". I better drop this message. Not to spam into the log. >> +    else >> +        log.info('Replica\' vshard schema version is outdated - current '.. >> +                 '%s vs latest %s, but the replica still can work', >> +                 table.concat(version, '.'), >> +                 table.concat(schema_latest_version, '.')) >> +    end >> +    -- In future for hard changes the replica may be suspended >> +    -- until its schema is synced with master. Or it may >> +    -- reject to upgrade in case of incompatible changes. Now >> +    -- there are too few versions so as such problems could >> +    -- appear. >> +end >> + >> +local function schema_upgrade_master(username, password) >> +    local _schema = box.space._schema >> +    local is_old_versioning = _schema:get({'oncevshard:storage:1'}) ~= nil >> +    local version = schema_current_version() >> +    local is_bootstrap = not box.space._bucket >> + >> +    if is_bootstrap then >> +        schema_init_0_1_15_0(username, password) >> +    elseif is_old_versioning then >> +        log.info('The instance does not have vshard_version record. '.. >> +                 'It is 0.1.15.0.') >> +    end >> +    local handlers = { >> +        {version = {0, 1, 16, 0}, func = schema_upgrade_to_0_1_16_0}, >> +    } >> +    assert(schema_compare_versions(handlers[#handlers].version, >> +                                   schema_latest_version) == 0) >> +    for _, handler in ipairs(handlers) do >> +        local next_version = handler.version >> +        if schema_compare_versions(next_version, version) > 0 then >> +            local next_version_str = table.concat(next_version, '.') >> +            log.info("Upgrade vshard to {%s}", next_version_str) >> +            handler.func(username, password) > > I think it will be good if you wrap it into pcall and add possible error handling. I see that all changes quite trivial but there is no guarantee that it will be in future. Ideally, if you rollback all already applied changes. This is an opportunity for user to downgrade module to previous version and report about a bug without any inconsistencies after upgrade. I tried. See diff below. Absolutely atomic non-yield upgrade is not really possible. Some future upgrade may require change of _bucket involving thousands of tuples, or both DDL + DML change. But for 0.1.16.0 the atomicity is possible, I added box.begin/commit/rollback. Also as you can see, I left begin/commit inside upgrade handler, so each version upgrade may decide in its handler when start a transaction, how many tuples to commit at once, how to rollback in case the upgrade consists of several transactions. Besides, upgrade is atomic only in scope of one version. If you upgrade from ver1 to ver2, and then fail to upgrade from ver2 to ver3, only ver3 is rolled back. Otherwise it would be too complicated to add 'downgrade' handler for each version, and support it forever. With current way you can reload/restart with ver2's code, fix problems, and try ver3 again. However this is hard to tell, since formally we have now only ver1 and ver2. Besides, 1) I moved handler array to a global variable so as not to create the array on each storage.cfg(); 2) I added os.exit() in case upgrade succeeded, but version bump in _schema failed. Because downgrade can't be done already, and on the other hand it is not acceptable to work with the newer schema, but an older version. I didn't trust version bump to handlers since it will be forgotten someday for sure. (Bump happens in 0.1.16.0 handler, but only because it is a part of schema change). ==================== diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua index a20bb21..c0e768a 100644 --- a/vshard/storage/init.lua +++ b/vshard/storage/init.lua @@ -323,8 +323,10 @@ local function schema_upgrade_to_0_1_16_0(username) -- 'oncevshard:storage:' is dropped because it is not -- really extendible nor understandable. log.info('Insert vshard_version into _schema') + box.begin() box.space._schema:replace({'vshard_version', 0, 1, 16, 0}) box.space._schema:delete({'oncevshard:storage:1'}) + box.commit() end local function schema_current_version() @@ -365,6 +367,16 @@ local function schema_upgrade_replica() -- appear. end +-- Every handler should be atomic. It is either applied whole, or +-- not applied at all. Atomic upgrade helps to downgrade in case +-- something goes wrong. At least by doing restart with the latest +-- successfully applied version. However, atomicity does not +-- prohibit yields, in case the upgrade, for example, affects huge +-- number of tuples (_bucket records, maybe). +local schema_upgrade_handlers = { + {version = {0, 1, 16, 0}, func = schema_upgrade_to_0_1_16_0}, +} + local function schema_upgrade_master(username, password) local _schema = box.space._schema local is_old_versioning = _schema:get({'oncevshard:storage:1'}) ~= nil @@ -377,19 +389,28 @@ local function schema_upgrade_master(username, password) log.info('The instance does not have vshard_version record. '.. 'It is 0.1.15.0.') end - local handlers = { - {version = {0, 1, 16, 0}, func = schema_upgrade_to_0_1_16_0}, - } - assert(schema_compare_versions(handlers[#handlers].version, + local handler_latest_version = + schema_upgrade_handlers[#schema_upgrade_handlers].version + assert(schema_compare_versions(handler_latest_version, schema_latest_version) == 0) - for _, handler in ipairs(handlers) do + for _, handler in ipairs(schema_upgrade_handlers) do local next_version = handler.version if schema_compare_versions(next_version, version) > 0 then local next_version_str = table.concat(next_version, '.') log.info("Upgrade vshard to {%s}", next_version_str) - handler.func(username, password) + local ok, err = pcall(handler.func, username, password) + if not ok then + log.info("Couldn't upgrade to {%s}: %s", next_version_str, err) + error(err) + end log.info("Successful vshard upgrade to {%s}", next_version_str) - _schema:replace({'vshard_version', unpack(next_version)}) + ok, err = pcall(_schema.replace, _schema, + {'vshard_version', unpack(next_version)}) + if not ok then + log.info("Upgraded to {%s} but couldn't update _schema ".. + "'vshard_version' - fatal error: %s", err) + os.exit(-1) + end end end end ====================