Skip to content

Commit 3cbf3fc

Browse files
committed
fix(web): prevent unnecessary index.json timestamp cascade updates
Preserve unchanged FileEntry and DirectoryEntry objects when regenerating static data indexes to prevent lastUpdated timestamp cascades through parent indexes. Changes: - processJsonFiles: Reuse existing FileEntry if contentHash unchanged - processSubdirectories: Reuse existing DirectoryEntry if subdirectory's lastUpdated unchanged - Add unit tests for change detection logic Problem: Previously, every index regeneration created fresh FileEntry objects with current lastRetrieved timestamps, causing parent indexes to detect changes via JSON string comparison and update their lastUpdated timestamps even when no actual content changed. Solution: By preserving existing entry objects for unchanged files/directories, the JSON comparison in hasIndexContentChanged() correctly detects that the structure is unchanged, preventing unnecessary timestamp updates throughout the index hierarchy. Related: Prevents cascade updates when only individual files are re-downloaded with identical content.
1 parent bac9f3d commit 3cbf3fc

File tree

2 files changed

+316
-30
lines changed

2 files changed

+316
-30
lines changed

apps/web/src/lib/utils/static-data-index-generator.ts

Lines changed: 53 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -200,16 +200,29 @@ export const generateIndexForEntityType = async (entityDir: string, entityType:
200200
);
201201
}
202202

203-
// Process each file to extract metadata
204-
const files = await processJsonFiles({ entityDir, jsonFiles, entityType });
203+
// Read existing index to preserve unchanged entries
204+
const indexPath = path.join(entityDir, INDEX_FILENAME);
205+
let existingIndex: DirectoryIndex | null = null;
206+
207+
try {
208+
if (await fileExists(indexPath)) {
209+
const existingContent = await fs.readFile(indexPath, "utf8");
210+
existingIndex = JSON.parse(existingContent);
211+
}
212+
} catch (error) {
213+
console.warn(`⚠️ Failed to read existing index: ${error}`);
214+
}
215+
216+
// Process each file to extract metadata, preserving unchanged entries
217+
const files = await processJsonFiles({ entityDir, jsonFiles, entityType, existingIndex });
205218

206219
// Process subdirectories if recursive
207220
let directories: Record<string, DirectoryEntry> = {};
208221
let maxLastUpdated = new Date().toISOString();
209222

210223
if (recursive) {
211224
const { directories: subDirs, maxLastUpdated: subMaxUpdated } =
212-
await processSubdirectories({ entityDir, entityType, recursive });
225+
await processSubdirectories({ entityDir, entityType, recursive, existingIndex });
213226
directories = subDirs;
214227
maxLastUpdated = subMaxUpdated;
215228
}
@@ -218,19 +231,6 @@ export const generateIndexForEntityType = async (entityDir: string, entityType:
218231
// ISO strings are lexicographically comparable - use string comparison
219232
const overallLastUpdated = maxLastUpdated > currentIsoString ? maxLastUpdated : currentIsoString;
220233

221-
// Read existing index to check if content has changed
222-
const indexPath = path.join(entityDir, INDEX_FILENAME);
223-
let existingIndex: DirectoryIndex | null = null;
224-
225-
try {
226-
if (await fileExists(indexPath)) {
227-
const existingContent = await fs.readFile(indexPath, "utf8");
228-
existingIndex = JSON.parse(existingContent);
229-
}
230-
} catch (error) {
231-
console.warn(`⚠️ Failed to read existing index: ${error}`);
232-
}
233-
234234
// Check if content has actually changed (excluding lastUpdated field)
235235
const contentChanged = hasIndexContentChanged({
236236
existingIndex,
@@ -354,21 +354,26 @@ const ensureDirectoryExists = async (dirPath: string): Promise<void> => {
354354

355355
/**
356356
* Process JSON files in a directory and extract metadata
357+
* Preserves existing FileEntry objects for unchanged files to prevent unnecessary timestamp updates
357358
* @param root0
358359
* @param root0.entityDir
359360
* @param root0.jsonFiles
360361
* @param root0.entityType
362+
* @param root0.existingIndex
361363
*/
362364
const processJsonFiles = async ({
363365
entityDir,
364366
jsonFiles,
365367
entityType,
368+
existingIndex,
366369
}: {
367370
entityDir: string;
368371
jsonFiles: string[];
369372
entityType: StaticEntityType;
373+
existingIndex?: DirectoryIndex | null;
370374
}): Promise<Record<string, FileEntry>> => {
371375
const files: Record<string, FileEntry> = {};
376+
const existingFiles = existingIndex?.files || {};
372377

373378
for (const fileName of jsonFiles) {
374379
const filePath = path.join(entityDir, fileName);
@@ -386,15 +391,22 @@ const processJsonFiles = async ({
386391
);
387392
}
388393

389-
// Create FileEntry with reconstructed URL
390-
const reconstructedUrl = `https://api.openalex.org/${entityType}/${entityId}`;
391-
392-
files[entityId] = {
393-
$ref: `./${fileName}`,
394-
contentHash: await generateContentHash(data),
395-
lastRetrieved: fileStats.mtime.toISOString(),
396-
url: reconstructedUrl,
397-
};
394+
const currentHash = await generateContentHash(data);
395+
const existingEntry = existingFiles[entityId];
396+
397+
// Preserve existing entry if content hash unchanged (prevents timestamp cascade)
398+
if (existingEntry && existingEntry.contentHash === currentHash) {
399+
files[entityId] = existingEntry;
400+
} else {
401+
// New or changed file - create fresh entry with current timestamp
402+
const reconstructedUrl = `https://api.openalex.org/${entityType}/${entityId}`;
403+
files[entityId] = {
404+
$ref: `./${fileName}`,
405+
contentHash: currentHash,
406+
lastRetrieved: fileStats.mtime.toISOString(),
407+
url: reconstructedUrl,
408+
};
409+
}
398410
} catch (error) {
399411
console.warn(`⚠️ Failed to validate file ${fileName}:`, error);
400412
// Skip invalid files rather than adding them
@@ -406,23 +418,28 @@ const processJsonFiles = async ({
406418

407419
/**
408420
* Process subdirectories and generate their indexes
421+
* Preserves existing DirectoryEntry objects for unchanged subdirectories to prevent unnecessary timestamp updates
409422
* @param root0
410423
* @param root0.entityDir
411424
* @param root0.entityType
412425
* @param root0.recursive
426+
* @param root0.existingIndex
413427
*/
414428
const processSubdirectories = async ({
415429
entityDir,
416430
entityType,
431+
existingIndex,
417432
}: {
418433
entityDir: string;
419434
entityType: StaticEntityType;
420435
recursive?: boolean;
436+
existingIndex?: DirectoryIndex | null;
421437
}): Promise<{
422438
directories: Record<string, DirectoryEntry>;
423439
maxLastUpdated: string;
424440
}> => {
425441
const directories: Record<string, DirectoryEntry> = {};
442+
const existingDirs = existingIndex?.directories || {};
426443
let maxLastUpdated = new Date().toISOString();
427444

428445
try {
@@ -451,17 +468,23 @@ const processSubdirectories = async ({
451468
if (await fileExists(subIndexPath)) {
452469
const subContent = await fs.readFile(subIndexPath, "utf8");
453470
const subIndex: DirectoryIndex = JSON.parse(subContent);
471+
const existingEntry = existingDirs[subdir];
454472

455473
// Track the maximum lastUpdated timestamp
456474
if (subIndex.lastUpdated > maxLastUpdated) {
457475
maxLastUpdated = subIndex.lastUpdated;
458476
}
459477

460-
// Build directory entry
461-
directories[subdir] = {
462-
$ref: `./${subdir}`,
463-
lastModified: subIndex.lastUpdated,
464-
};
478+
// Preserve existing directory entry if subdirectory's lastUpdated unchanged
479+
if (existingEntry && existingEntry.lastModified === subIndex.lastUpdated) {
480+
directories[subdir] = existingEntry;
481+
} else {
482+
// Subdirectory changed - create fresh directory entry
483+
directories[subdir] = {
484+
$ref: `./${subdir}`,
485+
lastModified: subIndex.lastUpdated,
486+
};
487+
}
465488
} else {
466489
console.warn(`⚠️ No index found for subdirectory: ${subPath}`);
467490
}

0 commit comments

Comments (0)