diff --git a/0020-src-os-bluestore-BlueFS.cc.patch b/0020-src-os-bluestore-BlueFS.cc.patch new file mode 100644 index 0000000..49ed09a --- /dev/null +++ b/0020-src-os-bluestore-BlueFS.cc.patch @@ -0,0 +1,438 @@ +diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in +index 7a4e581fbec..3a277418f73 100644 +--- a/src/common/options/global.yaml.in ++++ b/src/common/options/global.yaml.in +@@ -3260,6 +3260,13 @@ options: + slow shutdown is primarilyy useful for doing memory leak checking with valgrind. + default: true + with_legacy: true ++- name: osd_fast_shutdown_timeout ++ type: int ++ level: advanced ++ desc: timeout in seconds for osd fast-shutdown (0 is unlimited) ++ default: 15 ++ with_legacy: true ++ min: 0 + - name: osd_fast_shutdown_notify_mon + type: bool + level: advanced +@@ -4931,6 +4938,12 @@ options: + This setting is used only when OSD is doing ``--mkfs``. + Next runs of OSD retrieve sharding from disk. + default: m(3) p(3,0-12) O(3,0-13)=block_cache={type=binned_lru} L P ++- name: bluestore_qfsck_on_mount ++ type: bool ++ level: dev ++ desc: Run quick-fsck at mount comparing allocation-file to RocksDB allocation state ++ default: true ++ with_legacy: true + - name: bluestore_fsck_on_mount + type: bool + level: dev +diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h +index d934d092919..44d67c26e88 100644 +--- a/src/os/ObjectStore.h ++++ b/src/os/ObjectStore.h +@@ -288,7 +288,8 @@ public: + virtual bool needs_journal() = 0; //< requires a journal + virtual bool wants_journal() = 0; //< prefers a journal + virtual bool allows_journal() = 0; //< allows a journal +- ++ virtual void prepare_for_fast_shutdown() {} ++ virtual bool has_null_manager() { return false; } + // return store min allocation size, if applicable + virtual uint64_t get_min_alloc_size() const { + return 0; +diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc +index 0b9bb0bba8e..baae7c5ab2b 100644 +--- a/src/os/bluestore/BlueFS.cc ++++ b/src/os/bluestore/BlueFS.cc +@@ -2507,6 +2507,9 @@ void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback, + } + #endif + _flush_bdev(); ++ ++log.seq_live; ++ dirty.seq_live = log.seq_live; ++ log.t.seq = log.seq_live; + + super.memorized_layout = layout; + super.log_fnode = log_file->fnode; +diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc +index d1a0fe4897c..86062f290f0 100644 +--- a/src/os/bluestore/BlueStore.cc ++++ b/src/os/bluestore/BlueStore.cc +@@ -7565,9 +7565,16 @@ void BlueStore::set_cache_shards(unsigned num) + } + } + ++//--------------------------------------------- ++bool BlueStore::has_null_manager() ++{ ++ return (fm && fm->is_null_manager()); ++} ++ + int BlueStore::_mount() + { + dout(5) << __func__ << "NCB:: path " << path << dendl; ++ + _kv_only = false; + if (cct->_conf->bluestore_fsck_on_mount) { + dout(5) << __func__ << "::NCB::calling fsck()" << dendl; +@@ -7681,12 +7688,15 @@ int BlueStore::umount() + #endif + dout(20) << __func__ << " stopping kv thread" << dendl; + _kv_stop(); +- _shutdown_cache(); ++ // skip cache cleanup step on fast shutdown ++ if (likely(!m_fast_shutdown)) { ++ _shutdown_cache(); ++ } + dout(20) << __func__ << " closing" << dendl; + } +- + _close_db_and_around(); +- if (cct->_conf->bluestore_fsck_on_umount) { ++ // disable fsck on fast-shutdown ++ if (cct->_conf->bluestore_fsck_on_umount && !m_fast_shutdown) { + int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep); + if (rc < 0) + return rc; +@@ -10305,6 +10315,11 @@ int BlueStore::get_numa_node( + return 0; + } + ++void BlueStore::prepare_for_fast_shutdown() ++{ ++ m_fast_shutdown = true; ++} ++ + int BlueStore::get_devices(set *ls) + { + if (bdev) { +@@ -10432,7 +10447,8 @@ int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf, + string key_prefix; + _key_encode_u64(pool_id, &key_prefix); + *out_per_pool_omap = per_pool_omap != OMAP_BULK; +- if (*out_per_pool_omap) { ++ // stop calls after db was closed ++ if (*out_per_pool_omap && db) { + auto prefix = per_pool_omap == OMAP_PER_POOL ? + PREFIX_PERPOOL_OMAP : + PREFIX_PERPG_OMAP; +@@ -18344,11 +18360,10 @@ int BlueStore::store_allocator(Allocator* src_allocator) + return -1; + } + } +- ++ bluefs->compact_log(); + // reuse previous file-allocation if exists + ret = bluefs->stat(allocator_dir, allocator_file, nullptr, nullptr); + bool overwrite_file = (ret == 0); +- //derr << __func__ << "bluefs->open_for_write(" << overwrite_file << ")" << dendl; + BlueFS::FileWriter *p_handle = nullptr; + ret = bluefs->open_for_write(allocator_dir, allocator_file, &p_handle, overwrite_file); + if (ret != 0) { +@@ -18358,8 +18373,9 @@ int BlueStore::store_allocator(Allocator* src_allocator) + + uint64_t file_size = p_handle->file->fnode.size; + uint64_t allocated = p_handle->file->fnode.get_allocated(); +- dout(5) << "file_size=" << file_size << ", allocated=" << allocated << dendl; ++ dout(10) << "file_size=" << file_size << ", allocated=" << allocated << dendl; + ++ bluefs->sync_metadata(false); + unique_ptr allocator(clone_allocator_without_bluefs(src_allocator)); + if (!allocator) { + bluefs->close_writer(p_handle); +@@ -18431,12 +18447,11 @@ int BlueStore::store_allocator(Allocator* src_allocator) + bluefs->fsync(p_handle); + + utime_t duration = ceph_clock_now() - start_time; +- dout(5) <<"WRITE-extent_count=" << extent_count << ", file_size=" << p_handle->file->fnode.size << dendl; ++ dout(5) <<"WRITE-extent_count=" << extent_count << ", allocation_size=" << allocation_size << ", serial=" << s_serial << dendl; + dout(5) <<"p_handle->pos=" << p_handle->pos << " WRITE-duration=" << duration << " seconds" << dendl; + + bluefs->close_writer(p_handle); + need_to_destage_allocation_file = false; +- dout(10) << "need_to_destage_allocation_file was clear" << dendl; + return 0; + } + +@@ -18628,7 +18643,7 @@ int BlueStore::__restore_allocator(Allocator* allocator, uint64_t *num, uint64_t + utime_t duration = ceph_clock_now() - start_time; + dout(5) << "READ--extent_count=" << extent_count << ", read_alloc_size= " + << read_alloc_size << ", file_size=" << file_size << dendl; +- dout(5) << "READ duration=" << duration << " seconds, s_serial=" << s_serial << dendl; ++ dout(5) << "READ duration=" << duration << " seconds, s_serial=" << header.serial << dendl; + *num = extent_count; + *bytes = read_alloc_size; + return 0; +@@ -18923,7 +18938,7 @@ int BlueStore::read_allocation_from_drive_on_startup() + + utime_t start = ceph_clock_now(); + read_alloc_stats_t stats = {}; +- SimpleBitmap sbmap(cct, div_round_up(bdev->get_size(), min_alloc_size)); ++ SimpleBitmap sbmap(cct, (bdev->get_size()/ min_alloc_size)); + ret = reconstruct_allocations(&sbmap, stats); + if (ret != 0) { + return ret; +@@ -19025,15 +19040,6 @@ int BlueStore::compare_allocators(Allocator* alloc1, Allocator* alloc2, uint64_t + return 0; + } else { + derr << "mismatch:: idx1=" << idx1 << " idx2=" << idx2 << dendl; +- std::cout << "===================================================================" << std::endl; +- for (uint64_t i = 0; i < idx1; i++) { +- std::cout << "arr1[" << i << "]<" << arr1[i].offset << "," << arr1[i].length << "> " << std::endl; +- } +- +- std::cout << "===================================================================" << std::endl; +- for (uint64_t i = 0; i < idx2; i++) { +- std::cout << "arr2[" << i << "]<" << arr2[i].offset << "," << arr2[i].length << "> " << std::endl; +- } + return -1; + } + } +@@ -19081,9 +19087,9 @@ int BlueStore::read_allocation_from_drive_for_bluestore_tool() + utime_t start = ceph_clock_now(); + + auto shutdown_cache = make_scope_guard([&] { +- std::cout << "Allocation Recovery was completed in " << duration +- << " seconds; insert_count=" << stats.insert_count +- << "; extent_count=" << stats.extent_count << std::endl; ++ dout(1) << "Allocation Recovery was completed in " << duration ++ << " seconds; insert_count=" << stats.insert_count ++ << "; extent_count=" << stats.extent_count << dendl; + _shutdown_cache(); + _close_db_and_around(); + }); +@@ -19092,7 +19098,7 @@ int BlueStore::read_allocation_from_drive_for_bluestore_tool() + auto allocator = unique_ptr(create_bitmap_allocator(bdev->get_size())); + //reconstruct allocations into a temp simple-bitmap and copy into allocator + { +- SimpleBitmap sbmap(cct, div_round_up(bdev->get_size(), min_alloc_size)); ++ SimpleBitmap sbmap(cct, (bdev->get_size()/ min_alloc_size)); + ret = reconstruct_allocations(&sbmap, stats); + if (ret != 0) { + return ret; +@@ -19113,14 +19119,14 @@ int BlueStore::read_allocation_from_drive_for_bluestore_tool() + }; + allocator->dump(count_entries); + ret = compare_allocators(allocator.get(), alloc, stats.insert_count, memory_target); +- if (ret != 0) { ++ if (ret == 0) { + dout(5) << "Allocator drive - file integrity check OK" << dendl; + } else { + derr << "FAILURE. Allocator from file and allocator from metadata differ::ret=" << ret << dendl; + } + } + +- std::cout << stats << std::endl; ++ dout(1) << stats << dendl; + return ret; + } + +diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h +index 72cfc2d076b..0f804595ebb 100644 +--- a/src/os/bluestore/BlueStore.h ++++ b/src/os/bluestore/BlueStore.h +@@ -2764,7 +2764,7 @@ public: + + private: + int32_t ondisk_format = 0; ///< value detected on mount +- ++ bool m_fast_shutdown = false; + int _upgrade_super(); ///< upgrade (called during open_super) + uint64_t _get_ondisk_reserved() const; + void _prepare_ondisk_format_super(KeyValueDB::Transaction& t); +@@ -2783,6 +2783,9 @@ public: + bool wants_journal() override { return false; }; + bool allows_journal() override { return false; }; + ++ void prepare_for_fast_shutdown() override; ++ virtual bool has_null_manager(); ++ + uint64_t get_min_alloc_size() const override { + return min_alloc_size; + } +diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc +index 7658fb59911..6def6621c1e 100644 +--- a/src/osd/OSD.cc ++++ b/src/osd/OSD.cc +@@ -4245,27 +4245,44 @@ PerfCounters* OSD::create_recoverystate_perf() + + int OSD::shutdown() + { ++ // vstart overwrites osd_fast_shutdown value in the conf file -> force the value here! ++ //cct->_conf->osd_fast_shutdown = true; ++ ++ dout(0) << "Fast Shutdown: - cct->_conf->osd_fast_shutdown = " ++ << cct->_conf->osd_fast_shutdown ++ << ", null-fm = " << store->has_null_manager() << dendl; ++ ++ utime_t start_time_func = ceph_clock_now(); ++ + if (cct->_conf->osd_fast_shutdown) { + derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl; + if (cct->_conf->osd_fast_shutdown_notify_mon) + service.prepare_to_stop(); +- cct->_log->flush(); +- _exit(0); +- } + +- if (!service.prepare_to_stop()) ++ // There is no state we need to keep wehn running in NULL-FM moode ++ if (!store->has_null_manager()) { ++ cct->_log->flush(); ++ _exit(0); ++ } ++ } else if (!service.prepare_to_stop()) { + return 0; // already shutting down ++ } ++ + osd_lock.lock(); + if (is_stopping()) { + osd_lock.unlock(); + return 0; + } +- dout(0) << "shutdown" << dendl; + ++ if (!cct->_conf->osd_fast_shutdown) { ++ dout(0) << "shutdown" << dendl; ++ } ++ ++ // don't accept new task for this OSD + set_state(STATE_STOPPING); + +- // Debugging +- if (cct->_conf.get_val("osd_debug_shutdown")) { ++ // Disabled debugging during fast-shutdown ++ if (!cct->_conf->osd_fast_shutdown && cct->_conf.get_val("osd_debug_shutdown")) { + cct->_conf.set_val("debug_osd", "100"); + cct->_conf.set_val("debug_journal", "100"); + cct->_conf.set_val("debug_filestore", "100"); +@@ -4274,6 +4291,45 @@ int OSD::shutdown() + cct->_conf.apply_changes(nullptr); + } + ++ if (cct->_conf->osd_fast_shutdown) { ++ // first, stop new task from being taken from op_shardedwq ++ // and clear all pending tasks ++ op_shardedwq.stop_for_fast_shutdown(); ++ ++ utime_t start_time_timer = ceph_clock_now(); ++ tick_timer.shutdown(); ++ { ++ std::lock_guard l(tick_timer_lock); ++ tick_timer_without_osd_lock.shutdown(); ++ } ++ ++ osd_lock.unlock(); ++ utime_t start_time_osd_drain = ceph_clock_now(); ++ ++ // then, wait on osd_op_tp to drain (TBD: should probably add a timeout) ++ osd_op_tp.drain(); ++ osd_op_tp.stop(); ++ ++ utime_t start_time_umount = ceph_clock_now(); ++ store->prepare_for_fast_shutdown(); ++ std::lock_guard lock(osd_lock); ++ // TBD: assert in allocator that nothing is being add ++ store->umount(); ++ ++ utime_t end_time = ceph_clock_now(); ++ if (cct->_conf->osd_fast_shutdown_timeout) { ++ ceph_assert(end_time - start_time_func < cct->_conf->osd_fast_shutdown_timeout); ++ } ++ dout(0) <<"Fast Shutdown duration total :" << end_time - start_time_func << " seconds" << dendl; ++ dout(0) <<"Fast Shutdown duration osd_drain :" << start_time_umount - start_time_osd_drain << " seconds" << dendl; ++ dout(0) <<"Fast Shutdown duration umount :" << end_time - start_time_umount << " seconds" << dendl; ++ dout(0) <<"Fast Shutdown duration timer :" << start_time_osd_drain - start_time_timer << " seconds" << dendl; ++ cct->_log->flush(); ++ ++ // now it is safe to exit ++ _exit(0); ++ } ++ + // stop MgrClient earlier as it's more like an internal consumer of OSD + mgrc.shutdown(); + +@@ -4435,6 +4491,9 @@ int OSD::shutdown() + hb_front_server_messenger->shutdown(); + hb_back_server_messenger->shutdown(); + ++ utime_t duration = ceph_clock_now() - start_time_func; ++ dout(0) <<"Slow Shutdown duration:" << duration << " seconds" << dendl; ++ + tracing::osd::tracer.shutdown(); + + return r; +@@ -11058,6 +11117,11 @@ void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb) + } + + void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) { ++ if (unlikely(m_fast_shutdown) ) { ++ // stop enqueing when we are in the middle of a fast shutdown ++ return; ++ } ++ + uint32_t shard_index = + item.get_ordering_token().hash_to_shard(osd->shards.size()); + +@@ -11088,6 +11152,11 @@ void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) { + + void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item) + { ++ if (unlikely(m_fast_shutdown) ) { ++ // stop enqueing when we are in the middle of a fast shutdown ++ return; ++ } ++ + auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size()); + auto& sdata = osd->shards[shard_index]; + ceph_assert(sdata); +@@ -11114,6 +11183,24 @@ void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item) + sdata->sdata_cond.notify_one(); + } + ++void OSD::ShardedOpWQ::stop_for_fast_shutdown() ++{ ++ uint32_t shard_index = 0; ++ m_fast_shutdown = true; ++ ++ for (; shard_index < osd->num_shards; shard_index++) { ++ auto& sdata = osd->shards[shard_index]; ++ ceph_assert(sdata); ++ sdata->shard_lock.lock(); ++ int work_count = 0; ++ while(! sdata->scheduler->empty() ) { ++ auto work_item = sdata->scheduler->dequeue(); ++ work_count++; ++ } ++ sdata->shard_lock.unlock(); ++ } ++} ++ + namespace ceph::osd_cmds { + + int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f, +diff --git a/src/osd/OSD.h b/src/osd/OSD.h +index 30d0b0b4aef..2da5de10aa6 100644 +--- a/src/osd/OSD.h ++++ b/src/osd/OSD.h +@@ -1592,7 +1592,7 @@ protected: + : public ShardedThreadPool::ShardedWQ + { + OSD *osd; +- ++ bool m_fast_shutdown = false; + public: + ShardedOpWQ(OSD *o, + ceph::timespan ti, +@@ -1610,6 +1610,8 @@ protected: + /// try to do some work + void _process(uint32_t thread_index, ceph::heartbeat_handle_d *hb) override; + ++ void stop_for_fast_shutdown(); ++ + /// enqueue a new item + void _enqueue(OpSchedulerItem&& item) override; + diff --git a/ceph.spec b/ceph.spec index 04a8657..d8ef8c6 100644 --- a/ceph.spec +++ b/ceph.spec @@ -151,7 +151,7 @@ ################################################################################# Name: ceph Version: 17.1.0 -Release: 0.3.28.g1b309fef%{?dist} +Release: 0.4.31.g1ccf6db7%{?dist} %if 0%{?fedora} || 0%{?rhel} Epoch: 2 %endif @@ -169,7 +169,7 @@ Group: System/Filesystems URL: http://ceph.com/ #Source0: https://download.ceph.com/tarballs/ceph-%{version}.tar.gz #Source0: https://1.chacra.ceph.com/r/ceph/quincy/... -Source0: ceph-17.1.0-28-g1b309fef.tar.bz2 +Source0: ceph-17.1.0-31-g1ccf6db7.tar.bz2 Patch0001: 0001-src-common-crc32c_intel_fast.patch Patch0003: 0003-src-common-bitstr.h.patch Patch0008: 0008-cmake-modules-Finduring.cmake.patch @@ -180,6 +180,7 @@ Patch0016: 0016-src-tracing-patch Patch0017: 0017-gcc-12-omnibus.patch Patch0018: 0018-src-rgw-store-dbstore-CMakeLists.txt.patch Patch0019: 0019-cmake-modules-CheckCxxAtomic.cmake.patch +Patch0020: 0020-src-os-bluestore-BlueFS.cc.patch # ceph 14.0.1 does not support 32-bit architectures, bugs #1727788, #1727787 ExcludeArch: i686 armv7hl %if 0%{?suse_version} @@ -1255,7 +1256,7 @@ This package provides Ceph default alerts for Prometheus. # common ################################################################################# %prep -%autosetup -p1 -n ceph-17.1.0-28-g1b309fef +%autosetup -p1 -n ceph-17.1.0-31-g1ccf6db7 %build # Disable lto on systems that do not support symver attribute @@ -2547,9 +2548,15 @@ exit 0 %config %{_sysconfdir}/prometheus/ceph/ceph_default_alerts.yml %changelog -* Thu Mar 10 2022 Kaleb S. KEITHLEY - 2:17.1.0-0.2.28-g77b78287 +* Thu Mar 17 2022 Kaleb S. KEITHLEY - 2:17.1.0-0.4.31-g1ccf6db7 +- 17.1.0 snapshot 31 plus rhbz#2064219 (ceph #53266, #54561) + +* Wed Mar 16 2022 Kaleb S. KEITHLEY - 2:17.1.0-0.3.28-g77b78287 - 17.1.0 snapshot 28 +* Sat Mar 12 2022 Kaleb S. KEITHLEY - 2:17.1.0-0.2.rc1 +- 17.1.0 RC1 + * Mon Feb 28 2022 Kaleb S. KEITHLEY - 2:17.1.0-0.1.rc1 - 17.1.0 RC1 diff --git a/sources b/sources index 1826439..c6c9df3 100644 --- a/sources +++ b/sources @@ -1 +1 @@ -SHA512 (ceph-17.1.0-28-g1b309fef.tar.bz2) = 6074b6f74fe8cfd776c4e7a93f409b6a27f00d1663e9f49452544b344155a96b3facb49024c5eafccaa656a15702b901fc5e924f9c1556350a42c2b807d77832 +SHA512 (ceph-17.1.0-31-g1ccf6db7.tar.bz2) = 008f7c58639c2a2f074a5971ba5b84ca3b3397d6799691c8587c96277cf352218f405a147a5b90c037af54c0a9b3eaec60053a051fe2d345f9a1de0c46538959