Skip to content

Commit 20009cc

Browse files
Add tests, small fixes
1 parent df6d4e2 commit 20009cc

File tree

7 files changed

+513
-72
lines changed

7 files changed

+513
-72
lines changed

src/vt/objgroup/proxy/proxy_objgroup.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -460,8 +460,10 @@ struct Proxy {
460460

461461
if constexpr(has_user_traits_v<Serializer, CheckpointTrait>){
462462
vtAssert(old_proxy != no_obj_group, "ObjGroups must be pre-instantiated to be checkpointed or restored");
463-
vtAssert(old_proxy == proxy_, "The proxy ID bits of this ObjGroup do not match the ID found in the checkpoint!" \
464-
" Varying IDs is not yet supported.");
463+
vtWarnIf(old_proxy != proxy_,
464+
"Checkpointed ObjGroup and deserialize target do not match!");
465+
proxy_ = old_proxy;
466+
465467
auto objPtr = get();
466468

467469
bool null = objPtr == nullptr;

src/vt/vrt/collection/manager.h

Lines changed: 9 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -352,7 +352,7 @@ struct CollectionManager
352352
* \param[in] is_collective whether the collection is collective
353353
* \param[in] is_migratable whether the collection is migratable
354354
* \param[in] request_match an input proxy which we would like to use, if
355-
* there are no existing conflicts. no_vrt_proxy indicates no
355+
* there are no existing conflicts. no_vrt_proxy indicates no
356356
* request.
357357
*
358358
* \return the collection proxy bits
@@ -1562,7 +1562,7 @@ struct CollectionManager
15621562
*/
15631563
template <typename ColT, typename IndexT = typename ColT::IndexType>
15641564
bool getDynamicMembership(VirtualProxyType proxy);
1565-
1565+
15661566
/**
15671567
* \brief Get the local indices that are currently on this node
15681568
*
@@ -1635,28 +1635,16 @@ struct CollectionManager
16351635
);
16361636

16371637
/**
1638-
* \internal \struct MigrateRequestMsg
1638+
* \internal \brief Migrate element to restore location from checkpoint
16391639
*
1640-
* \brief Migrate local element, potentially requested by remote location
1641-
*/
1642-
1643-
/**
1644-
* \brief Migrate a remote proxy element to a node, by messaging that
1645-
* node to initiate a migration. Immediately returns a rooted epoch
1646-
* containing the request message.
1647-
*/
1648-
template <typename ColT>
1649-
EpochType requestMigrateDeferred(
1650-
VrtElmProxy<ColT, typename ColT::IndexType> proxy_elem, NodeType destination
1651-
);
1652-
1653-
/**
1654-
* \brief Migrate a remote proxy element to a node, by messaging that
1655-
* node to initiate a migration. Returns after migration is complete.
1640+
* \param[in] node the node
1641+
* \param[in] idx the element index
1642+
* \param[in] proxy the collection proxy
16561643
*/
16571644
template <typename ColT>
1658-
void requestMigrate(
1659-
VrtElmProxy<ColT, typename ColT::IndexType> proxy_elem, NodeType destination
1645+
static void migrateToRestoreLocation(
1646+
NodeType node, typename ColT::IndexType idx,
1647+
CollectionProxyWrapType<ColT> proxy
16601648
);
16611649

16621650
/**

src/vt/vrt/collection/manager.impl.h

Lines changed: 25 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -2249,40 +2249,28 @@ void CollectionManager::checkpointToFile(
22492249

22502250
namespace detail {
22512251
template <typename ColT>
2252-
inline void MigrateRequestHandler (
2253-
ColT*, VrtElmProxy<ColT, typename ColT::IndexType> proxy_elm, NodeType dest
2252+
inline void restoreOffHomeElement(
2253+
ColT*, NodeType node, typename ColT::IndexType idx,
2254+
CollectionProxy<ColT> proxy
22542255
) {
2255-
theCollection()->migrate(proxy_elm, dest);
2256+
theCollection()->migrate(proxy(idx), node);
22562257
}
22572258
} /* end namespace detail */
22582259

22592260
template <typename ColT>
2260-
EpochType CollectionManager::requestMigrateDeferred(
2261-
VrtElmProxy<ColT, typename ColT::IndexType> proxy_elm, NodeType destination
2262-
) {
2263-
auto ep = theTerm()->makeEpochRooted(
2264-
"Request element migration", term::UseDS{true}
2265-
);
2266-
theMsg()->pushEpoch(ep);
2267-
2268-
proxy_elm.template send<detail::MigrateRequestHandler<ColT>>(
2269-
proxy_elm, destination
2270-
);
2271-
2272-
theMsg()->popEpoch(ep);
2273-
theTerm()->finishedEpoch(ep);
2274-
return ep;
2275-
}
2276-
2277-
template <typename ColT>
2278-
void CollectionManager::requestMigrate(
2279-
VrtElmProxy<ColT, typename ColT::IndexType> proxy_elem, NodeType destination
2261+
/*static*/ void CollectionManager::migrateToRestoreLocation(
2262+
NodeType node, typename ColT::IndexType idx,
2263+
CollectionProxyWrapType<ColT> proxy
22802264
) {
2281-
auto ep = requestMigrateDeferred(proxy_elem, destination);
2282-
vt::runSchedulerThrough(ep);
2265+
if (proxy(idx).tryGetLocalPtr() != nullptr) {
2266+
theCollection()->migrate(proxy(idx), node);
2267+
} else {
2268+
proxy(idx).template send<detail::restoreOffHomeElement<ColT>>(
2269+
node, idx, proxy
2270+
);
2271+
}
22832272
}
22842273

2285-
22862274
template <typename ColT>
22872275
void CollectionManager::restoreFromFileInPlace(
22882276
CollectionProxyWrapType<ColT> proxy,
@@ -2309,14 +2297,23 @@ void CollectionManager::restoreFromFileInPlace(
23092297
metadata_file_name
23102298
);
23112299

2312-
//Everyone shuffles any elements not where their data is
23132300
runInEpochCollective([&]{
23142301
for (auto&& elm : directory->elements_) {
23152302
auto idx = elm.idx_;
23162303
auto file_name = elm.file_name_;
23172304

23182305
if (proxy(idx).tryGetLocalPtr() == nullptr) {
2319-
requestMigrateDeferred(proxy(idx), theContext()->getNode());
2306+
auto mapped_node = getMappedNode<ColT>(proxy, idx);
2307+
vtAssertExpr(mapped_node != uninitialized_destination);
2308+
auto this_node = theContext()->getNode();
2309+
2310+
if (mapped_node != this_node) {
2311+
theMsg()->send<migrateToRestoreLocation<ColT>>(
2312+
vt::Node{mapped_node}, this_node, idx, proxy
2313+
);
2314+
} else {
2315+
migrateToRestoreLocation<ColT>(this_node, idx, proxy);
2316+
}
23202317
}
23212318
}
23222319
});

src/vt/vrt/proxy/collection_elm_proxy.impl.h

Lines changed: 46 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,8 @@ void VrtElmProxy<ColT, IndexT>::serialize(SerT& s) {
5858
//Only serialize the ColT object if checkpointing.
5959
if constexpr(checkpoint::has_user_traits_v<SerT, CheckpointTrait>){
6060
ColT* local_elm_ptr = this->tryGetLocalPtr();
61-
vtAssert(local_elm_ptr != nullptr || s.isUnpacking(), "Must serialize/size elements from the node they are at");
61+
vtAssert(local_elm_ptr != nullptr || s.isUnpacking(),
62+
"Must serialize/size elements from the node they are at");
6263

6364
//Traits for nested serialize/deserialize
6465
using checkpoint::serializerUserTraits::CopyTraits;
@@ -67,43 +68,73 @@ void VrtElmProxy<ColT, IndexT>::serialize(SerT& s) {
6768
::template With<CheckpointInternalTrait>
6869
>;
6970

70-
//Weird nested serialization to enable asynchronous deserializing w/o changing semantics.
71+
//Weird nested serialization to enable asynchronous deserializing w/o
72+
//changing semantics.
7173
if(!(s.isPacking() || s.isUnpacking())){
72-
int size = checkpoint::getSize<ColT, CheckpointlessTraits>(*local_elm_ptr);
74+
int size = checkpoint::getSize<ColT, CheckpointlessTraits>(
75+
*local_elm_ptr
76+
);
7377
s | size;
7478
//Don't use nullptr to avoid warning
7579
s.contiguousBytes(&size, 1, size);
7680
} else if(s.isPacking()){
77-
auto serialized_elm = checkpoint::serialize<ColT, CheckpointlessTraits>(*local_elm_ptr);
81+
auto serialized_elm = checkpoint::serialize<ColT, CheckpointlessTraits>(
82+
*local_elm_ptr
83+
);
7884
int size = serialized_elm->getSize();
7985
s | size;
8086
s.contiguousBytes(serialized_elm->getBuffer(), 1, size);
8187
} else if(s.isUnpacking()){
8288
int size = 0;
8389
s | size;
8490

85-
auto buf = std::make_unique<char[]>(size);
86-
s.contiguousBytes(buf.get(), 1, size);
91+
auto buf = std::make_shared<std::vector<char>>(size);
92+
s.contiguousBytes(buf->data(), 1, size);
8793

8894
if(local_elm_ptr != nullptr){
89-
checkpoint::deserializeInPlace<ColT, CheckpointlessTraits>(buf.get(), local_elm_ptr);
95+
checkpoint::deserializeInPlace<ColT, CheckpointlessTraits>(
96+
buf->data(), local_elm_ptr
97+
);
9098
} else {
91-
//The element is somewhere else so we'll need to request a migration to here.
92-
auto ep = theCollection()->requestMigrateDeferred(*this, theContext()->getNode());
99+
//TODO: Investigate skipping the actual full migration and instead
100+
// simply deleting remote node's copy, building a fresh one here,
101+
// and updating system as if migration happened. Would probably be
102+
// noticeably faster on large systems and w/ large elements.
103+
CollectionProxy<ColT, IndexT> col(this->getCollectionProxy());
104+
std::shared_ptr<int> listener_id = std::make_shared<int>(-1);
93105

94-
theTerm()->addActionUnique(ep, std::move([elm_proxy = *this, buffer = std::move(buf)]{
95-
auto elm_ptr = elm_proxy.tryGetLocalPtr();
96-
assert(elm_ptr != nullptr);
97-
checkpoint::deserializeInPlace<ColT, CheckpointlessTraits>(buffer.get(), elm_ptr);
98-
}));
106+
//Listen for this element to migrate in,
107+
//then immediately deserialize
108+
using listener::ElementEventEnum;
109+
listener::ListenFnType<IndexT> m_listener =
110+
[*this, buf, listener_id]
111+
(ElementEventEnum event, IndexT idx, NodeType) mutable {
112+
if(!(idx == getIndex())) return;
113+
if(event != ElementEventEnum::ElementMigratedIn) return;
114+
115+
auto elm_ptr = this->tryGetLocalPtr();
116+
checkpoint::deserializeInPlace<ColT, CheckpointlessTraits>(
117+
buf->data(), elm_ptr
118+
);
119+
theCollection()->unregisterElementListener<ColT>(
120+
this->getCollectionProxy(), *listener_id
121+
);
122+
};
123+
*listener_id = theCollection()->registerElementListener<ColT>(
124+
this->getCollectionProxy(), m_listener
125+
);
126+
127+
theCollection()->migrateToRestoreLocation(
128+
theContext()->getNode(), getIndex(), col
129+
);
99130
}
100131
}
101132
}
102133
}
103134

104135
template <typename ColT, typename IndexT>
105136
template <typename SerT>
106-
std::unique_ptr<ColT>
137+
std::unique_ptr<ColT>
107138
VrtElmProxy<ColT, IndexT>::deserializeToElm(SerT& s) {
108139
//Still have to hit data in the same order.
109140
ProxyCollectionElmTraits<ColT, IndexT>::serialize(s);

src/vt/vrt/proxy/collection_proxy.impl.h

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
#define INCLUDED_VT_VRT_PROXY_COLLECTION_PROXY_IMPL_H
4646

4747
#include "vt/config.h"
48+
#include "vt/configs/error/soft_error.h"
4849
#include "vt/vrt/proxy/collection_proxy.h"
4950
#include "vt/vrt/proxy/base_elm_proxy.h"
5051
#include "vt/vrt/collection/proxy_traits/proxy_col_traits.h"
@@ -152,12 +153,9 @@ void CollectionProxy<ColT, IndexT>::serialize(
152153
//TODO: magistrate's virtualized serialization support may enable checkpointing
153154
//mapper objects. Pre-registered functions could be doable as well.
154155

155-
//TODO: chkpt location manager so we don't have to message back and forth so much?
156-
//auto lm = theLocMan()->getCollectionLM<IndexType>(proxy.getProxy());
157-
158156
//If unpacking, we may need to make the collection to unpack into.
159157
if(s.isUnpacking() && oldProxy != proxy){
160-
vtAssert(oldProxy == no_vrt_proxy,
158+
vtWarnIf(oldProxy != no_vrt_proxy,
161159
"Checkpointed proxy and deserialize target do not match!");
162160

163161
//The checkpointed proxy doesn't exist, we need to create it

0 commit comments

Comments
 (0)