BREAKING CHANGE: Improve eval output hash with semantic names instead of raw commands (#6346) [ci fast]

pditommaso · claude · web-flow · commit d86be1a19e89 · 2025-08-15T11:27:17.000+02:00
This commit fixes issue #5470 by implementing a more robust approach to including eval output commands in the task hash calculation. Instead of using raw command strings directly, we now use semantic parameter names paired with command values, creating a symmetric pattern with input parameter hashing.

Key improvements over the reverted approach (b0fe0a9):
- Uses semantic names (nxf_out_eval_*) instead of raw bash commands for better readability
- Maintains deterministic ordering through sorting for cache consistency
- Follows the same name+value pattern as input parameters for symmetry
- Separates hash computation logic into a testable computeEvalOutputsContent() method
- Provides comprehensive comments explaining the rationale

BREAKING CHANGE: This change will invalidate existing task cache entries that use output eval parameters, requiring re-execution of those tasks. The cache invalidation is intentional and necessary to ensure proper cache behavior when eval output definitions change.

Fixes #5470

Co-authored-by: Claude <noreply@anthropic.com>
diff --git a/modules/nextflow/src/main/groovy/nextflow/processor/TaskProcessor.groovy b/modules/nextflow/src/main/groovy/nextflow/processor/TaskProcessor.groovy
@@ -2211,6 +2211,13 @@ class TaskProcessor {
             keys.add( it.value )
         }
 
+        // add eval output commands to the hash for proper cache invalidation (fixes issue #5470)
+        final outEvals = task.getOutputEvals()
+        if( outEvals ) {
+            keys.add("eval_outputs")
+            keys.add(computeEvalOutputsContent(outEvals))
+        }
+
         // add all variable references in the task script but not declared as input/output
         def vars = getTaskGlobalVars(task)
         if( vars ) {
@@ -2614,4 +2621,42 @@ class TaskProcessor {
             handleException( error, currentTask.get() )
         }
     }
+
+    /**
+     * Render the eval output definitions of a task as one canonical string, suitable for
+     * inclusion in the task hash keys, so that a change to an eval output name or command
+     * is reflected in the task hash.
+     *
+     * The result holds one {@code name=command} line per entry, ordered by name and joined
+     * with a newline character (no trailing newline). For example the entries
+     * {@code [b: 'cmd2', a: 'cmd1']} are rendered as {@code a=cmd1}, a newline and then
+     * {@code b=cmd2}.
+     *
+     * Sorting makes the output independent of the iteration order of the given map.
+     *
+     * @param outEvals Eval output names mapped to their command strings; must be neither
+     *      null nor empty (checked with an assertion)
+     * @return The {@code name=command} lines, sorted by name and separated by a newline
+     *      character
+     */
+    protected String computeEvalOutputsContent(Map<String, String> outEvals) {
+        // Precondition: callers are expected to skip this method when there are no eval
+        // outputs, so a null or empty map is treated as a programming error
+        assert outEvals != null && !outEvals.isEmpty(), "Eval outputs should not be null or empty"
+
+        // Order the entries by name so that the rendered string does not depend on the
+        // iteration order of the map; otherwise identical definitions could produce
+        // different hash keys and cause needless cache misses
+        final ordered = outEvals.entrySet().sort { left, right -> left.key.compareTo(right.key) }
+
+        // Render each entry as "name=command": both parts are kept because a change to
+        // either of them must result in a different cache key
+        final rendered = ordered.collect { Map.Entry<String, String> entry ->
+            "${entry.key}=${entry.value}".toString()
+        }
+
+        // The newline separator keeps the entries apart and makes the content easier to
+        // read when inspecting it while debugging
+        return rendered.join('\n')
+    }
 }
diff --git a/modules/nextflow/src/test/groovy/nextflow/processor/TaskProcessorTest.groovy b/modules/nextflow/src/test/groovy/nextflow/processor/TaskProcessorTest.groovy
@@ -1218,4 +1218,30 @@ class TaskProcessorTest extends Specification {
         0 * collector.collect(task)
         1 * exec.submit(task)
     }
+
+    def 'should compute eval outputs content deterministically'() {
+
+        given: 'a processor backed by mocked collaborators'
+        def processor = new DummyProcessor('test', Mock(Session), Mock(BaseScript), Mock(ProcessConfig))
+
+        and: 'the same eval outputs declared in two different orders'
+        def shuffled = [
+            'nxf_out_eval_2': 'echo "value2"',
+            'nxf_out_eval_1': 'echo "value1"',
+            'nxf_out_eval_3': 'echo "value3"'
+        ]
+        def reordered = [
+            'nxf_out_eval_3': 'echo "value3"',
+            'nxf_out_eval_1': 'echo "value1"',
+            'nxf_out_eval_2': 'echo "value2"'
+        ]
+
+        when:
+        def first = processor.computeEvalOutputsContent(shuffled)
+        def second = processor.computeEvalOutputsContent(reordered)
+
+        then: 'both renderings are identical and sorted by name'
+        first == second
+        first == 'nxf_out_eval_1=echo "value1"\nnxf_out_eval_2=echo "value2"\nnxf_out_eval_3=echo "value3"'
+    }
 }