Skip to content

Commit 2a44dc7

Browse files
laramiel authored and copybara-github committed
Rework "file" kvstore write paths for better network file support.
* When writing, stat before rename. Stat after rename was done to reduce spurious failures on some Windows paths; we may need to watch this case more closely.
* Remove fallback stat-by-filenames; these should not be necessary when doing operations in the new order.
* Merge RAII FileLock class with WriteLockHelper class.
* Add verbose logging for Lock, Unlock, and Delete.

I ran this on a WSL instance, and it appears to resolve the WSL network filesystem issues in: #160 (comment)

PiperOrigin-RevId: 665436157
Change-Id: Ia8048936a636f181e53edd247c3847280a0d66ca
1 parent d5edeac commit 2a44dc7

File tree

5 files changed

+207
-196
lines changed

5 files changed

+207
-196
lines changed

tensorstore/internal/os/file_util.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ using FileDescriptor = HANDLE; // HANDLE
5353
/// File descriptor traits for use with `UniqueHandle`.
5454
struct FileDescriptorTraits {
5555
static FileDescriptor Invalid() { return ((FileDescriptor)-1); }
56-
static void Close(FileDescriptor handle);
56+
static void Close(FileDescriptor fd);
5757
};
5858

5959
/// Representation of file metadata.

tensorstore/internal/os/file_util_posix.cc

Lines changed: 30 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ void UnlockFcntlLock(FileDescriptor fd) {
105105
// This is not strictly necessary as the posix/linux locks will be released
106106
// when the fd is closed, but it allows easier reasoning by making locking
107107
// behave similarly across platforms.
108+
TS_DETAIL_LOG_BEGIN << " fd=" << fd;
108109
while (true) {
109110
struct ::flock lock;
110111
lock.l_type = F_UNLCK;
@@ -119,15 +120,15 @@ void UnlockFcntlLock(FileDescriptor fd) {
119120
}
120121
}
121122
if (errno == EINTR) continue;
122-
ABSL_LOG_FIRST_N(INFO, 1)
123-
<< StatusFromOsError(errno, "Failed to release lock");
123+
TS_DETAIL_LOG_ERROR << " fd=" << fd;
124124
return;
125125
}
126126
ABSL_UNREACHABLE();
127127
}
128128
#endif
129129

130130
void UnlockFlockLock(FileDescriptor fd) {
131+
TS_DETAIL_LOG_BEGIN << " fd=" << fd;
131132
while (true) {
132133
{
133134
PotentiallyBlockingRegion region;
@@ -136,8 +137,7 @@ void UnlockFlockLock(FileDescriptor fd) {
136137
}
137138
}
138139
if (errno == EINTR) continue;
139-
ABSL_LOG_FIRST_N(INFO, 1)
140-
<< StatusFromOsError(errno, "Failed to release lock");
140+
TS_DETAIL_LOG_ERROR << " fd=" << fd;
141141
return;
142142
}
143143
ABSL_UNREACHABLE();
@@ -146,6 +146,7 @@ void UnlockFlockLock(FileDescriptor fd) {
146146
} // namespace
147147

148148
Result<UnlockFn> AcquireFdLock(FileDescriptor fd) {
149+
TS_DETAIL_LOG_BEGIN << " fd=" << fd;
149150
#if defined(F_OFD_SETLKW)
150151
while (true) {
151152
// This blocks until the lock is acquired (SETLKW). If any signal is
@@ -165,21 +166,25 @@ Result<UnlockFn> AcquireFdLock(FileDescriptor fd) {
165166
{
166167
PotentiallyBlockingRegion region;
167168
if (::fcntl(fd, F_OFD_SETLKW, &lock) != -1) {
169+
TS_DETAIL_LOG_END << " fd=" << fd;
168170
return UnlockFcntlLock;
169171
}
170172
}
171173
if (errno == EINTR) continue;
172174
if (errno == EINVAL || errno == ENOTSUP) break;
175+
TS_DETAIL_LOG_ERROR << " fd=" << fd;
173176
return StatusFromOsError(errno, "Failed to lock file");
174177
}
175178
#endif
176179
while (true) {
177180
{
178181
PotentiallyBlockingRegion region;
179182
if (::flock(fd, LOCK_EX) != -1) {
183+
TS_DETAIL_LOG_END << " fd=" << fd;
180184
return UnlockFlockLock;
181185
}
182186
if (errno == EINTR) continue;
187+
TS_DETAIL_LOG_ERROR << " fd=" << fd;
183188
return StatusFromOsError(errno, "Failed to lock file");
184189
}
185190
}
@@ -188,23 +193,22 @@ Result<UnlockFn> AcquireFdLock(FileDescriptor fd) {
188193

189194
Result<UniqueFileDescriptor> OpenExistingFileForReading(
190195
const std::string& path) {
191-
TS_DETAIL_LOG_BEGIN << " path=" << tensorstore::QuoteString(path);
196+
TS_DETAIL_LOG_BEGIN << " path=" << QuoteString(path);
192197
FileDescriptor fd;
193198
{
194199
PotentiallyBlockingRegion region;
195200
fd = ::open(path.c_str(), O_RDONLY | O_CLOEXEC);
196201
}
197202
if (fd == FileDescriptorTraits::Invalid()) {
198-
TS_DETAIL_LOG_ERROR << " path=" << tensorstore::QuoteString(path);
203+
TS_DETAIL_LOG_ERROR << " path=" << QuoteString(path);
199204
return StatusFromOsError(errno, "Failed to open: ", QuoteString(path));
200205
}
201-
TS_DETAIL_LOG_END << " path=" << tensorstore::QuoteString(path)
202-
<< ", fd=" << fd;
206+
TS_DETAIL_LOG_END << " path=" << QuoteString(path) << ", fd=" << fd;
203207
return UniqueFileDescriptor(fd);
204208
}
205209

206210
Result<UniqueFileDescriptor> OpenFileForWriting(const std::string& path) {
207-
TS_DETAIL_LOG_BEGIN << " path=" << tensorstore::QuoteString(path);
211+
TS_DETAIL_LOG_BEGIN << " path=" << QuoteString(path);
208212
FileDescriptor fd = FileDescriptorTraits::Invalid();
209213
const auto attempt_open = [&] {
210214
PotentiallyBlockingRegion region;
@@ -227,11 +231,10 @@ Result<UniqueFileDescriptor> OpenFileForWriting(const std::string& path) {
227231
}
228232
#endif
229233
if (fd == FileDescriptorTraits::Invalid()) {
230-
TS_DETAIL_LOG_ERROR << " path=" << tensorstore::QuoteString(path);
234+
TS_DETAIL_LOG_ERROR << " path=" << QuoteString(path);
231235
return StatusFromOsError(errno, "Failed to create: ", QuoteString(path));
232236
}
233-
TS_DETAIL_LOG_END << " path=" << tensorstore::QuoteString(path)
234-
<< ", fd=" << fd;
237+
TS_DETAIL_LOG_END << " path=" << QuoteString(path) << ", fd=" << fd;
235238
return UniqueFileDescriptor(fd);
236239
}
237240

@@ -306,27 +309,39 @@ absl::Status TruncateFile(FileDescriptor fd) {
306309

307310
absl::Status RenameOpenFile(FileDescriptor fd, const std::string& old_name,
308311
const std::string& new_name) {
312+
TS_DETAIL_LOG_BEGIN << " fd=" << fd << ", old_name=" << QuoteString(old_name)
313+
<< ", new_name=" << QuoteString(new_name);
309314
PotentiallyBlockingRegion region;
310315
if (::rename(old_name.c_str(), new_name.c_str()) == 0) {
316+
TS_DETAIL_LOG_END << " fd=" << fd << ", old_name=" << QuoteString(old_name)
317+
<< ", new_name=" << QuoteString(new_name);
311318
return absl::OkStatus();
312319
}
320+
TS_DETAIL_LOG_ERROR << " fd=" << fd << ", old_name=" << QuoteString(old_name)
321+
<< ", new_name=" << QuoteString(new_name);
313322
return StatusFromOsError(errno, "Failed to rename: ", QuoteString(old_name),
314323
" to: ", QuoteString(new_name));
315324
}
316325

317326
absl::Status DeleteOpenFile(FileDescriptor fd, const std::string& path) {
327+
TS_DETAIL_LOG_BEGIN << " fd=" << fd << ", path=" << QuoteString(path);
318328
PotentiallyBlockingRegion region;
319329
if (::unlink(path.c_str()) == 0) {
330+
TS_DETAIL_LOG_END << " fd=" << fd;
320331
return absl::OkStatus();
321332
}
333+
TS_DETAIL_LOG_ERROR << " fd=" << fd;
322334
return StatusFromOsError(errno, "Failed to delete: ", QuoteString(path));
323335
}
324336

325337
absl::Status DeleteFile(const std::string& path) {
338+
TS_DETAIL_LOG_BEGIN << " path=" << QuoteString(path);
326339
PotentiallyBlockingRegion region;
327340
if (::unlink(path.c_str()) == 0) {
341+
TS_DETAIL_LOG_END << " path=" << QuoteString(path);
328342
return absl::OkStatus();
329343
}
344+
TS_DETAIL_LOG_ERROR << " path=" << QuoteString(path);
330345
return StatusFromOsError(errno, "Failed to delete: ", QuoteString(path));
331346
}
332347

@@ -353,13 +368,13 @@ absl::Status GetFileInfo(FileDescriptor fd, FileInfo* info) {
353368
}
354369

355370
absl::Status GetFileInfo(const std::string& path, FileInfo* info) {
356-
TS_DETAIL_LOG_BEGIN << " path=" << tensorstore::QuoteString(path);
371+
TS_DETAIL_LOG_BEGIN << " path=" << QuoteString(path);
357372
PotentiallyBlockingRegion region;
358373
if (::stat(path.c_str(), info) == 0) {
359-
TS_DETAIL_LOG_END << " path=" << tensorstore::QuoteString(path);
374+
TS_DETAIL_LOG_END << " path=" << QuoteString(path);
360375
return absl::OkStatus();
361376
}
362-
TS_DETAIL_LOG_ERROR << " path=" << tensorstore::QuoteString(path);
377+
TS_DETAIL_LOG_ERROR << " path=" << QuoteString(path);
363378
return StatusFromOsError(errno);
364379
}
365380

tensorstore/internal/os/file_util_win.cc

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -173,36 +173,41 @@ Result<DWORD> GetFileAttributes(const std::wstring& filename) {
173173
#endif
174174

175175
void UnlockWin32Lock(FileDescriptor fd) {
176+
TS_DETAIL_LOG_BEGIN << " handle=" << fd;
176177
auto lock_offset = GetLockOverlapped();
177178
// Ignore any errors.
178179
::UnlockFileEx(fd, /*dwReserved=*/0, /*nNumberOfBytesToUnlockLow=*/1,
179180
/*nNumberOfBytesToUnlockHigh=*/0,
180181
/*lpOverlapped=*/&lock_offset);
182+
TS_DETAIL_LOG_END << " handle=" << fd;
181183
}
182184

183185
} // namespace
184186

185-
void FileDescriptorTraits::Close(FileDescriptor handle) {
186-
TS_DETAIL_LOG_BEGIN << " handle=" << handle;
187-
::CloseHandle(handle);
188-
TS_DETAIL_LOG_END << " handle=" << handle;
187+
void FileDescriptorTraits::Close(FileDescriptor fd) {
188+
TS_DETAIL_LOG_BEGIN << " handle=" << fd;
189+
::CloseHandle(fd);
190+
TS_DETAIL_LOG_END << " handle=" << fd;
189191
}
190192

191193
Result<UnlockFn> AcquireFdLock(FileDescriptor fd) {
194+
TS_DETAIL_LOG_BEGIN << " handle=" << fd;
192195
auto lock_offset = GetLockOverlapped();
193196
if (::LockFileEx(fd, /*dwFlags=*/LOCKFILE_EXCLUSIVE_LOCK,
194197
/*dwReserved=*/0,
195198
/*nNumberOfBytesToLockLow=*/1,
196199
/*nNumberOfBytesToLockHigh=*/0,
197200
/*lpOverlapped=*/&lock_offset)) {
201+
TS_DETAIL_LOG_END << " handle=" << fd;
198202
return UnlockWin32Lock;
199203
}
204+
TS_DETAIL_LOG_ERROR << " handle=" << fd;
200205
return StatusFromOsError(::GetLastError(), "Failed to lock file");
201206
}
202207

203208
Result<UniqueFileDescriptor> OpenExistingFileForReading(
204209
const std::string& path) {
205-
TS_DETAIL_LOG_BEGIN << " path=" << tensorstore::QuoteString(path);
210+
TS_DETAIL_LOG_BEGIN << " path=" << QuoteString(path);
206211
std::wstring wpath;
207212
TENSORSTORE_RETURN_IF_ERROR(ConvertUTF8ToWindowsWide(path, wpath));
208213

@@ -215,17 +220,16 @@ Result<UniqueFileDescriptor> OpenExistingFileForReading(
215220
/*hTemplateFile=*/nullptr);
216221

217222
if (fd == FileDescriptorTraits::Invalid()) {
218-
TS_DETAIL_LOG_ERROR << " path=" << tensorstore::QuoteString(path);
223+
TS_DETAIL_LOG_ERROR << " path=" << QuoteString(path);
219224
return StatusFromOsError(::GetLastError(),
220225
"Failed to open: ", QuoteString(path));
221226
}
222-
TS_DETAIL_LOG_END << " path=" << tensorstore::QuoteString(path)
223-
<< ", handle=" << fd;
227+
TS_DETAIL_LOG_END << " path=" << QuoteString(path) << ", handle=" << fd;
224228
return UniqueFileDescriptor(fd);
225229
}
226230

227231
Result<UniqueFileDescriptor> OpenFileForWriting(const std::string& path) {
228-
TS_DETAIL_LOG_BEGIN << " path=" << tensorstore::QuoteString(path);
232+
TS_DETAIL_LOG_BEGIN << " path=" << QuoteString(path);
229233
std::wstring wpath;
230234
TENSORSTORE_RETURN_IF_ERROR(ConvertUTF8ToWindowsWide(path, wpath));
231235

@@ -242,12 +246,11 @@ Result<UniqueFileDescriptor> OpenFileForWriting(const std::string& path) {
242246
/*hTemplateFile=*/nullptr);
243247

244248
if (fd == FileDescriptorTraits::Invalid()) {
245-
TS_DETAIL_LOG_ERROR << " path=" << tensorstore::QuoteString(path);
249+
TS_DETAIL_LOG_ERROR << " path=" << QuoteString(path);
246250
return StatusFromOsError(::GetLastError(),
247251
"Failed to create: ", QuoteString(path));
248252
}
249-
TS_DETAIL_LOG_END << " path=" << tensorstore::QuoteString(path)
250-
<< ", handle=" << fd;
253+
TS_DETAIL_LOG_END << " path=" << QuoteString(path) << ", handle=" << fd;
251254
return UniqueFileDescriptor(fd);
252255
}
253256

@@ -310,11 +313,15 @@ absl::Status TruncateFile(FileDescriptor fd) {
310313

311314
absl::Status RenameOpenFile(FileDescriptor fd, const std::string& old_name,
312315
const std::string& new_name) {
316+
TS_DETAIL_LOG_BEGIN << " handle=" << fd
317+
<< ", old_name=" << QuoteString(old_name)
318+
<< ", new_name=" << QuoteString(new_name);
313319
std::wstring wpath_new;
314320
TENSORSTORE_RETURN_IF_ERROR(ConvertUTF8ToWindowsWide(new_name, wpath_new));
315321

316322
// Try using Posix semantics.
317323
if (RenameFilePosix(fd, wpath_new)) {
324+
TS_DETAIL_LOG_END << " handle=" << fd;
318325
return absl::OkStatus();
319326
}
320327

@@ -330,9 +337,11 @@ absl::Status RenameOpenFile(FileDescriptor fd, const std::string& old_name,
330337
// Try using MoveFileEx, which may not be atomic.
331338
if (::MoveFileExW(wpath_old.c_str(), wpath_new.c_str(),
332339
MOVEFILE_REPLACE_EXISTING | MOVEFILE_WRITE_THROUGH)) {
340+
TS_DETAIL_LOG_END << " handle=" << fd;
333341
return absl::OkStatus();
334342
}
335343
344+
TS_DETAIL_LOG_ERROR << " handle=" << fd;
336345
return StatusFromOsError(::GetLastError(),
337346
"Failed to rename: ", QuoteString(old_name),
338347
" to: ", QuoteString(new_name));
@@ -347,6 +356,7 @@ absl::Status DeleteOpenFile(FileDescriptor fd, const std::string& path) {
347356
// result in the normal read/write paths failing with an error. To avoid
348357
// that problem, we first rename the file to a random name, with a suffix of
349358
// `kLockSuffix` to prevent it from being included in List results.
359+
TS_DETAIL_LOG_BEGIN << " handle=" << fd << ", path=" << QuoteString(path);
350360
unsigned int buf[5];
351361
for (int i = 0; i < 5; ++i) {
352362
::rand_s(&buf[i]);
@@ -374,6 +384,7 @@ absl::Status DeleteOpenFile(FileDescriptor fd, const std::string& path) {
374384
}
375385
// Attempt to delete the open handle using posix semantics?
376386
if (DeleteFilePosix(fd)) {
387+
TS_DETAIL_LOG_END << " handle=" << fd;
377388
return absl::OkStatus();
378389
}
379390
#ifndef NDEBUG
@@ -383,8 +394,10 @@ absl::Status DeleteOpenFile(FileDescriptor fd, const std::string& path) {
383394
#endif
384395
// The file has been renamed, so delete the renamed file.
385396
if (::DeleteFileW(wpath_temp.c_str())) {
397+
TS_DETAIL_LOG_END << " handle=" << fd;
386398
return absl::OkStatus();
387399
}
400+
TS_DETAIL_LOG_ERROR << " handle=" << fd;
388401
return StatusFromOsError(::GetLastError(),
389402
"Failed to delete: ", QuoteString(path));
390403
}
@@ -431,7 +444,7 @@ absl::Status GetFileInfo(FileDescriptor fd, FileInfo* info) {
431444
}
432445
433446
absl::Status GetFileInfo(const std::string& path, FileInfo* info) {
434-
TS_DETAIL_LOG_BEGIN << " path=" << tensorstore::QuoteString(path);
447+
TS_DETAIL_LOG_BEGIN << " path=" << QuoteString(path);
435448
436449
// The typedef uses BY_HANDLE_FILE_INFO, which includes device and index
437450
// metadata, and requires an open handle.
@@ -446,11 +459,11 @@ absl::Status GetFileInfo(const std::string& path, FileInfo* info) {
446459
/*hTemplateFile=*/nullptr));
447460
if (stat_fd.valid()) {
448461
if (::GetFileInformationByHandle(stat_fd.get(), info)) {
449-
TS_DETAIL_LOG_END << " path=" << tensorstore::QuoteString(path);
462+
TS_DETAIL_LOG_END << " path=" << QuoteString(path);
450463
return absl::OkStatus();
451464
}
452465
}
453-
TS_DETAIL_LOG_ERROR << " path=" << tensorstore::QuoteString(path);
466+
TS_DETAIL_LOG_ERROR << " path=" << QuoteString(path);
454467
return StatusFromOsError(::GetLastError());
455468
}
456469

tensorstore/kvstore/file/BUILD

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,11 @@ tensorstore_cc_library(
5151
"//tensorstore/util:result",
5252
"//tensorstore/util:span",
5353
"//tensorstore/util:status",
54-
"//tensorstore/util:str_cat",
5554
"//tensorstore/util/execution",
5655
"//tensorstore/util/garbage_collection",
5756
"@com_google_absl//absl/base:core_headers",
5857
"@com_google_absl//absl/functional:function_ref",
58+
"@com_google_absl//absl/log:absl_check",
5959
"@com_google_absl//absl/log:absl_log",
6060
"@com_google_absl//absl/status",
6161
"@com_google_absl//absl/strings",

0 commit comments

Comments
 (0)