From 7ef544416a13b64400e9b2237f17c9da9c7d3c5b Mon Sep 17 00:00:00 2001
From: Christian Kehl <c.kehl@uu.nl>
Date: Mon, 28 Dec 2020 17:02:08 +0100
Subject: [PATCH 1/5] benchmarking - after seeing that memory trends between
 sync and async memory measures are equivalent, we disable the async mode to
 reduce compute times

---
 parcels/particleset_benchmark.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/parcels/particleset_benchmark.py b/parcels/particleset_benchmark.py
index fb9a55ce7..5bc903acb 100644
--- a/parcels/particleset_benchmark.py
+++ b/parcels/particleset_benchmark.py
@@ -48,7 +48,7 @@ def measure_mem_usage():
         return rsc.ru_maxrss*1024
     return rsc.ru_maxrss
 
-USE_ASYNC_MEMLOG = True
+USE_ASYNC_MEMLOG = False
 USE_RUSE_SYNC_MEMLOG = False  # can be faulty
 
 class ParticleSet_Benchmark(ParticleSet):
@@ -206,6 +206,7 @@ def execute(self, pyfunc=AdvectionRK4, endtime=None, runtime=None, dt=1.,
         if verbose_progress:
             pbar = self._create_progressbar_(_starttime, endtime)
 
+        mem_used_start = 0
         if USE_ASYNC_MEMLOG:
             self.async_mem_log.measure_func = measure_mem
             mem_used_start = measure_mem()

From 7d50c0a4743fb84a40c4b1f4a0775ea6fd415e46 Mon Sep 17 00:00:00 2001
From: Christian Kehl <c.kehl@uu.nl>
Date: Tue, 12 May 2020 12:04:45 +0200
Subject: [PATCH 2/5] resolved the two main list-comprehension slow-downs

---
 parcels/kernel.py | 48 +++++++++++++++++++++++++++--------------------
 1 file changed, 28 insertions(+), 20 deletions(-)

diff --git a/parcels/kernel.py b/parcels/kernel.py
index 95a785e4f..304c2d9c6 100644
--- a/parcels/kernel.py
+++ b/parcels/kernel.py
@@ -343,12 +343,15 @@ def execute_python(self, pset, endtime, dt):
                     break
 
     def remove_deleted(self, pset, output_file, endtime):
-        """Utility to remove all particles that signalled deletion"""
-        indices = [i for i, p in enumerate(pset.particles)
-                   if p.state in [ErrorCode.Delete]]
-        if len(indices) > 0 and output_file is not None:
-            output_file.write(pset[indices], endtime, deleted_only=True)
-        pset.remove(indices)
+            """Utility to remove all particles that signalled deletion"""
+            states = np.array([p.state for p in pset.particles])
+            delstates = states==int(ErrorCode.Delete)
+            indices = np.nonzero(delstates)
+            n_deleted = np.count_nonzero(delstates)
+            if n_deleted > 0 and output_file is not None:
+                output_file.write(pset[indices], endtime, deleted_only=True)
+            pset.remove(indices)
+
 
     def execute(self, pset, endtime, dt, recovery=None, output_file=None, execute_once=False):
         """Execute this Kernel over a ParticleSet for several timesteps"""
@@ -358,14 +361,6 @@ def execute(self, pset, endtime, dt, recovery=None, output_file=None, execute_on
         if abs(dt) < 1e-6 and not execute_once:
             logger.warning_once("'dt' is too small, causing numerical accuracy limit problems. Please chose a higher 'dt' and rather scale the 'time' axis of the field accordingly. (related issue #762)")
 
-        # def remove_deleted(pset):
-        #     """Utility to remove all particles that signalled deletion"""
-        #     indices = [i for i, p in enumerate(pset.particles)
-        #                if p.state in [ErrorCode.Delete]]
-        #     if len(indices) > 0 and output_file is not None:
-        #         output_file.write(pset[indices], endtime, deleted_only=True)
-        #     pset.remove(indices)
-
         if recovery is None:
             recovery = {}
         elif ErrorCode.ErrorOutOfBounds in recovery and ErrorCode.ErrorThroughSurface not in recovery:
@@ -387,9 +382,16 @@ def execute(self, pset, endtime, dt, recovery=None, output_file=None, execute_on
         self.remove_deleted(pset, output_file=output_file, endtime=endtime)
 
         # Identify particles that threw errors
-        error_particles = [p for p in pset.particles if p.state not in [ErrorCode.Success, ErrorCode.Evaluate]]
-
-        while len(error_particles) > 0:
+        states = np.array([p.state for p in pset.particles])
+        estates = np.zeros(states.shape, dtype=np.bool)
+        for ecval in {ErrorCode.Success, ErrorCode.Evaluate}:
+            estates |= states==int(ecval)
+        estates = np.invert(estates)
+        error_indices = np.nonzero(estates)
+        n_errors = np.count_nonzero(estates)
+        error_particles = pset[error_indices]
+
+        while n_errors > 0:
             # Apply recovery kernel
             for p in error_particles:
                 if p.state == ErrorCode.StopExecution:
@@ -397,7 +399,6 @@ def execute(self, pset, endtime, dt, recovery=None, output_file=None, execute_on
                 if p.state == ErrorCode.Repeat:
                     p.reset_state()
                 elif p.state == ErrorCode.Delete:
-                    # p.delete()
                     pass
                 elif p.state in recovery_map:
                     recovery_kernel = recovery_map[p.state]
@@ -407,7 +408,6 @@ def execute(self, pset, endtime, dt, recovery=None, output_file=None, execute_on
                         p.reset_state()
                 else:
                     logger.warning_once('Deleting particle {} because of non-recoverable error'.format(p.id))
-                    # logger.warning('Deleting particle because of bug in #749 and #737 - particle state: {}'.format(ErrorCode.toString(p.state)))
                     p.delete()
 
             # Remove all particles that signalled deletion
@@ -419,7 +419,15 @@ def execute(self, pset, endtime, dt, recovery=None, output_file=None, execute_on
             else:
                 self.execute_python(pset, endtime, dt)
 
-            error_particles = [p for p in pset.particles if p.state not in [ErrorCode.Success, ErrorCode.Evaluate]]
+            # error_particles = [p for p in pset.particles if p.state not in [ErrorCode.Success, ErrorCode.Evaluate]]
+            states = np.array([p.state for p in pset.particles])
+            estates = np.zeros(states.shape, dtype=np.bool)
+            for ecval in {ErrorCode.Success, ErrorCode.Evaluate}:
+                estates |= states == int(ecval)
+            estates = np.invert(estates)
+            error_indices = np.nonzero(estates)
+            n_errors = np.count_nonzero(estates)
+            error_particles = pset[error_indices]
 
     def merge(self, kernel):
         funcname = self.funcname + kernel.funcname

From e9a9c357305523a178c8cdeeb644ee6378cf8ccd Mon Sep 17 00:00:00 2001
From: Christian Kehl <c.kehl@uu.nl>
Date: Tue, 12 May 2020 13:34:17 +0200
Subject: [PATCH 3/5] avoid unnecessary ndarray copies by initialising them in
 C-contiguous order - to be tested

---
 parcels/field.py  | 19 ++++++++++++-------
 parcels/grid.py   | 30 ++++++++++++++++++++++--------
 parcels/kernel.py | 12 ++++++++----
 3 files changed, 42 insertions(+), 19 deletions(-)

diff --git a/parcels/field.py b/parcels/field.py
index 339497833..fce2ba86a 100644
--- a/parcels/field.py
+++ b/parcels/field.py
@@ -179,8 +179,8 @@ def __init__(self, name, data, lon=None, lat=None, depth=None, time=None, grid=N
         # (data_full_zdim = grid.zdim if no indices are used, for A- and C-grids and for some B-grids). It is used for the B-grid,
         # since some datasets do not provide the deeper level of data (which is ignored by the interpolation).
         self.data_full_zdim = kwargs.pop('data_full_zdim', None)
-        self.data_chunks = []
-        self.c_data_chunks = []
+        self.data_chunks = []   # the data buffer of the FileBuffer raw loaded data - shall be a list of C-contiguous arrays
+        self.c_data_chunks = []  # C-pointers to the data_chunks array
         self.nchunks = []
         self.chunk_set = False
         self.filebuffers = [None] * 3
@@ -990,7 +990,8 @@ def chunk_setup(self):
 
         self.data_chunks = [None] * npartitions
         self.c_data_chunks = [None] * npartitions
-        self.grid.load_chunk = np.zeros(npartitions, dtype=c_int)
+        # self.grid.load_chunk = np.zeros(npartitions, dtype=c_int)
+        self.grid.load_chunk = np.zeros(npartitions, dtype=c_int, order='C')
         # self.grid.chunk_info format: number of dimensions (without tdim); number of chunks per dimensions;
         #      chunksizes (the 0th dim sizes for all chunk of dim[0], then so on for next dims
         self.grid.chunk_info = [[len(self.nchunks)-1], list(self.nchunks[1:]), sum(list(list(ci) for ci in chunks[1:]), [])]
@@ -1009,7 +1010,8 @@ def chunk_data(self):
             for block_id in range(len(self.grid.load_chunk)):
                 if self.grid.load_chunk[block_id] == 1 or self.grid.load_chunk[block_id] > 1 and self.data_chunks[block_id] is None:
                     block = self.get_block(block_id)
-                    self.data_chunks[block_id] = np.array(self.data.blocks[(slice(self.grid.tdim),) + block])
+                    # self.data_chunks[block_id] = np.array(self.data.blocks[(slice(self.grid.tdim),) + block])
+                    self.data_chunks[block_id] = np.array(self.data.blocks[(slice(self.grid.tdim),) + block], order='C')
                 elif self.grid.load_chunk[block_id] == 0:
                     if isinstance(self.data_chunks, list):
                         self.data_chunks[block_id] = None
@@ -1022,8 +1024,10 @@ def chunk_data(self):
             else:
                 self.data_chunks[0, :] = None
             self.c_data_chunks[0] = None
+
             self.grid.load_chunk[0] = 2
-            self.data_chunks[0] = np.array(self.data)
+            # self.data_chunks[0] = np.array(self.data)
+            self.data_chunks[0] = np.array(self.data, order='C')
 
     @property
     def ctypes_struct(self):
@@ -1046,8 +1050,9 @@ class CField(Structure):
             if self.grid.load_chunk[i] == 1:
                 raise ValueError('data_chunks should have been loaded by now if requested. grid.load_chunk[bid] cannot be 1')
             if self.grid.load_chunk[i] > 1:
-                if not self.data_chunks[i].flags.c_contiguous:
-                    self.data_chunks[i] = self.data_chunks[i].copy()
+                if not self.data_chunks[i].flags['C_CONTIGUOUS']:
+                    # self.data_chunks[i] = self.data_chunks[i].copy()
+                    self.data_chunks[i] = np.array(self.data_chunks[i], order='C')
                 self.c_data_chunks[i] = self.data_chunks[i].ctypes.data_as(POINTER(POINTER(c_float)))
             else:
                 self.c_data_chunks[i] = None
diff --git a/parcels/grid.py b/parcels/grid.py
index 1ae31fa77..a4afd63d6 100644
--- a/parcels/grid.py
+++ b/parcels/grid.py
@@ -35,9 +35,15 @@ class Grid(object):
     """
 
     def __init__(self, lon, lat, time, time_origin, mesh):
-        self.lon = lon
-        self.lat = lat
-        self.time = np.zeros(1, dtype=np.float64) if time is None else time
+        self.lon = lon  # should be a C-contiguous array of floats
+        if not self.lon.flags['C_CONTIGUOUS']:
+            self.lon = np.array(self.lon, order='C')
+        self.lat = lat  # should be a C-contiguous array of floats
+        if not self.lat.flags['C_CONTIGUOUS']:
+            self.lat = np.array(self.lat, order='C')
+        self.time = np.zeros(1, dtype=np.float64) if time is None else time # should be a C-contiguous array of float64
+        if not self.time.flags['C_CONTIGUOUS']:
+            self.time = np.array(self.time, order='C')
         if not self.lon.dtype == np.float32:
             logger.warning_once("Casting lon data to np.float32")
             self.lon = self.lon.astype(np.float32)
@@ -61,7 +67,7 @@ def __init__(self, lon, lat, time, time_origin, mesh):
         self.defer_load = False
         self.lonlat_minmax = np.array([np.nanmin(lon), np.nanmax(lon), np.nanmin(lat), np.nanmax(lat)], dtype=np.float32)
         self.periods = 0
-        self.load_chunk = []
+        self.load_chunk = []    # should be a C-contiguous array of ints
         self.chunk_info = None
         self.master_chunksize = None
         self._add_last_periodic_data_timestep = False
@@ -303,7 +309,9 @@ def __init__(self, lon, lat, depth=None, time=None, time_origin=None, mesh='flat
             assert(len(depth.shape) == 1), 'depth is not a vector'
 
         self.gtype = GridCode.RectilinearZGrid
-        self.depth = np.zeros(1, dtype=np.float32) if depth is None else depth
+        self.depth = np.zeros(1, dtype=np.float32) if depth is None else depth  # should be a C-contiguous array of floats
+        if not self.depth.flags['C_CONTIGUOUS']:
+            self.depth = np.array(self.depth, order='C')
         self.zdim = self.depth.size
         self.z4d = -1  # only used in RectilinearSGrid
         if not self.depth.dtype == np.float32:
@@ -339,7 +347,9 @@ def __init__(self, lon, lat, depth, time=None, time_origin=None, mesh='flat'):
         assert(isinstance(depth, np.ndarray) and len(depth.shape) in [3, 4]), 'depth is not a 3D or 4D numpy array'
 
         self.gtype = GridCode.RectilinearSGrid
-        self.depth = depth
+        self.depth = depth  # should be a C-contiguous array of floats
+        if not self.depth.flags['C_CONTIGUOUS']:
+            self.depth = np.array(self.depth, order='C')
         self.zdim = self.depth.shape[-3]
         self.z4d = len(self.depth.shape) == 4
         if self.z4d:
@@ -435,7 +445,9 @@ def __init__(self, lon, lat, depth=None, time=None, time_origin=None, mesh='flat
             assert(len(depth.shape) == 1), 'depth is not a vector'
 
         self.gtype = GridCode.CurvilinearZGrid
-        self.depth = np.zeros(1, dtype=np.float32) if depth is None else depth
+        self.depth = np.zeros(1, dtype=np.float32) if depth is None else depth  # should be a C-contiguous array of floats
+        if not self.depth.flags['C_CONTIGUOUS']:
+            self.depth = np.array(self.depth, order='C')
         self.zdim = self.depth.size
         self.z4d = -1  # only for SGrid
         if not self.depth.dtype == np.float32:
@@ -470,7 +482,9 @@ def __init__(self, lon, lat, depth, time=None, time_origin=None, mesh='flat'):
         assert(isinstance(depth, np.ndarray) and len(depth.shape) in [3, 4]), 'depth is not a 4D numpy array'
 
         self.gtype = GridCode.CurvilinearSGrid
-        self.depth = depth
+        self.depth = depth  # should be a C-contiguous array of floats
+        if not self.depth.flags['C_CONTIGUOUS']:
+            self.depth = np.array(self.depth, order='C')
         self.zdim = self.depth.shape[-3]
         self.z4d = len(self.depth.shape) == 4
         if self.z4d:
diff --git a/parcels/kernel.py b/parcels/kernel.py
index 304c2d9c6..e8fc121ee 100644
--- a/parcels/kernel.py
+++ b/parcels/kernel.py
@@ -240,13 +240,17 @@ def execute_jit(self, pset, endtime, dt):
             g.load_chunk = np.where(g.load_chunk == 1, 2, g.load_chunk)
             if len(g.load_chunk) > 0:  # not the case if a field in not called in the kernel
                 if not g.load_chunk.flags.c_contiguous:
-                    g.load_chunk = g.load_chunk.copy()
+                    # g.load_chunk = g.load_chunk.copy()
+                    g.load_chunk = np.array(g.load_chunk, order='C')
             if not g.depth.flags.c_contiguous:
-                g.depth = g.depth.copy()
+                # g.depth = g.depth.copy()
+                g.depth = np.array(g.depth, order='C')
             if not g.lon.flags.c_contiguous:
-                g.lon = g.lon.copy()
+                # g.lon = g.lon.copy()
+                g.lon = np.array(g.lon, order='C')
             if not g.lat.flags.c_contiguous:
-                g.lat = g.lat.copy()
+                # g.lat = g.lat.copy()
+                g.lat = np.array(g.lat, order='C')
 
         fargs = [byref(f.ctypes_struct) for f in self.field_args.values()]
         fargs += [c_double(f) for f in self.const_args.values()]

From b103fcac854f4aee42cc245b0bc45aab152fa4d0 Mon Sep 17 00:00:00 2001
From: Christian Kehl <c.kehl@uu.nl>
Date: Fri, 23 Oct 2020 12:53:44 +0200
Subject: [PATCH 4/5] merged conflicts in the stommel benchmark

---
 performance/benchmark_stommel.py | 25 -------------------------
 1 file changed, 25 deletions(-)

diff --git a/performance/benchmark_stommel.py b/performance/benchmark_stommel.py
index 93e0cf958..a20dc60bd 100644
--- a/performance/benchmark_stommel.py
+++ b/performance/benchmark_stommel.py
@@ -418,31 +418,6 @@ def Age(particle, fieldset, time):
         #endtime = ostime.time()
         endtime = ostime.process_time()
 
-    # if MPI:
-    #     mpi_comm = MPI.COMM_WORLD
-    #     if mpi_comm.Get_rank() == 0:
-    #         dt_time = []
-    #         for i in range(len(perflog.times_steps)):
-    #             if i==0:
-    #                 dt_time.append( (perflog.times_steps[i]-global_t_0) )
-    #             else:
-    #                 dt_time.append( (perflog.times_steps[i]-perflog.times_steps[i-1]) )
-    #         sys.stdout.write("final # particles: {}\n".format(perflog.Nparticles_step[len(perflog.Nparticles_step)-1]))
-    #         sys.stdout.write("Time of pset.execute(): {} sec.\n".format(endtime-starttime))
-    #         avg_time = np.mean(np.array(dt_time, dtype=np.float64))
-    #         sys.stdout.write("Avg. kernel update time: {} msec.\n".format(avg_time*1000.0))
-    # else:
-    #     dt_time = []
-    #     for i in range(len(perflog.times_steps)):
-    #         if i == 0:
-    #             dt_time.append((perflog.times_steps[i] - global_t_0))
-    #         else:
-    #             dt_time.append((perflog.times_steps[i] - perflog.times_steps[i - 1]))
-    #     sys.stdout.write("final # particles: {}\n".format(perflog.Nparticles_step[len(perflog.Nparticles_step)-1]))
-    #     sys.stdout.write("Time of pset.execute(): {} sec.\n".format(endtime - starttime))
-    #     avg_time = np.mean(np.array(dt_time, dtype=np.float64))
-    #     sys.stdout.write("Avg. kernel update time: {} msec.\n".format(avg_time * 1000.0))
-
     if MPI:
         mpi_comm = MPI.COMM_WORLD
         mpi_comm.Barrier()

From 77730ef69855d6f95648652b851c01a8e8b8faf4 Mon Sep 17 00:00:00 2001
From: Christian Kehl <c.kehl@uu.nl>
Date: Fri, 23 Oct 2020 13:20:46 +0200
Subject: [PATCH 5/5] Merged with benchmark rebase

---
 parcels/kernel_benchmark.py                      | 12 ++++++++----
 parcels/particleset_benchmark.py                 |  8 --------
 performance/benchmark_CMEMS.py                   |  1 -
 performance/benchmark_deep_migration_NPacific.py |  3 +--
 performance/benchmark_galapagos_backwards.py     |  1 -
 performance/benchmark_palaeo_Y2K.py              |  1 -
 performance/benchmark_perlin.py                  | 16 ----------------
 performance/benchmark_stommel.py                 |  9 +--------
 8 files changed, 10 insertions(+), 41 deletions(-)

diff --git a/parcels/kernel_benchmark.py b/parcels/kernel_benchmark.py
index 52ea86ff2..efdabfa03 100644
--- a/parcels/kernel_benchmark.py
+++ b/parcels/kernel_benchmark.py
@@ -108,13 +108,17 @@ def execute_jit(self, pset, endtime, dt):
             g.load_chunk = np.where(g.load_chunk == 1, 2, g.load_chunk)
             if len(g.load_chunk) > 0:  # not the case if a field in not called in the kernel
                 if not g.load_chunk.flags.c_contiguous:
-                    g.load_chunk = g.load_chunk.copy()
+                    # g.load_chunk = g.load_chunk.copy()
+                    g.load_chunk = np.array(g.load_chunk, order='C')
             if not g.depth.flags.c_contiguous:
-                g.depth = g.depth.copy()
+                # g.depth = g.depth.copy()
+                g.depth = np.array(g.depth, order='C')
             if not g.lon.flags.c_contiguous:
-                g.lon = g.lon.copy()
+                # g.lon = g.lon.copy()
+                g.lon = np.array(g.lon, order='C')
             if not g.lat.flags.c_contiguous:
-                g.lat = g.lat.copy()
+                # g.lat = g.lat.copy()
+                g.lat = np.array(g.lat, order='C')
         self._mem_io_timings.stop_timing()
         self._mem_io_timings.accumulate_timing()
 
diff --git a/parcels/particleset_benchmark.py b/parcels/particleset_benchmark.py
index 5bc903acb..573bc7a0e 100644
--- a/parcels/particleset_benchmark.py
+++ b/parcels/particleset_benchmark.py
@@ -302,14 +302,6 @@ def execute(self, pyfunc=AdvectionRK4, endtime=None, runtime=None, dt=1.,
             self.total_log.stop_timing()
             self.total_log.accumulate_timing()
             mem_B_used_total = 0
-            # if MPI:
-            #     mpi_comm = MPI.COMM_WORLD
-            #     mem_B_used = self.process.memory_info().rss
-            #     mem_B_used_total = mpi_comm.reduce(mem_B_used, op=MPI.SUM, root=0)
-            # else:
-            #     mem_B_used_total = self.process.memory_info().rss
-            # mem_B_used_total = self.process.memory_info().rss
-            mem_B_used_total = 0
             if USE_RUSE_SYNC_MEMLOG:
                 mem_B_used_total = measure_mem_usage()
             else:
diff --git a/performance/benchmark_CMEMS.py b/performance/benchmark_CMEMS.py
index e31bb4057..39f0276dd 100644
--- a/performance/benchmark_CMEMS.py
+++ b/performance/benchmark_CMEMS.py
@@ -347,7 +347,6 @@ def perIterGC():
 
     if MPI:
         mpi_comm = MPI.COMM_WORLD
-        # mpi_comm.Barrier()
         Nparticles = mpi_comm.reduce(np.array(pset.nparticle_log.get_params()), op=MPI.SUM, root=0)
         Nmem = mpi_comm.reduce(np.array(pset.mem_log.get_params()), op=MPI.SUM, root=0)
         if mpi_comm.Get_rank() == 0:
diff --git a/performance/benchmark_deep_migration_NPacific.py b/performance/benchmark_deep_migration_NPacific.py
index 22b9c5681..c872973c9 100644
--- a/performance/benchmark_deep_migration_NPacific.py
+++ b/performance/benchmark_deep_migration_NPacific.py
@@ -189,7 +189,7 @@ def DeleteParticle(particle, fieldset, time):
 def perIterGC():
     gc.collect()
 
-def getclosest_ij(lats,lons,latpt,lonpt):     
+def getclosest_ij(lats,lons,latpt,lonpt):
     """Function to find the index of the closest point to a certain lon/lat value."""
     dist_sq = (lats-latpt)**2 + (lons-lonpt)**2                 # find squared distance of every point on grid
     minindex_flattened = dist_sq.argmin()                       # 1D index of minimum dist_sq element
@@ -436,7 +436,6 @@ def Profiles(particle, fieldset, time):
 
     if MPI:
         mpi_comm = MPI.COMM_WORLD
-        # mpi_comm.Barrier()
         Nparticles = mpi_comm.reduce(np.array(pset.nparticle_log.get_params()), op=MPI.SUM, root=0)
         Nmem = mpi_comm.reduce(np.array(pset.mem_log.get_params()), op=MPI.SUM, root=0)
         if mpi_comm.Get_rank() == 0:
diff --git a/performance/benchmark_galapagos_backwards.py b/performance/benchmark_galapagos_backwards.py
index ab55302d6..c30452080 100644
--- a/performance/benchmark_galapagos_backwards.py
+++ b/performance/benchmark_galapagos_backwards.py
@@ -200,7 +200,6 @@ def periodicBC(particle, fieldSet, time):
 
     if MPI:
         mpi_comm = MPI.COMM_WORLD
-        # mpi_comm.Barrier()
         Nparticles = mpi_comm.reduce(np.array(pset.nparticle_log.get_params()), op=MPI.SUM, root=0)
         Nmem = mpi_comm.reduce(np.array(pset.mem_log.get_params()), op=MPI.SUM, root=0)
         if mpi_comm.Get_rank() == 0:
diff --git a/performance/benchmark_palaeo_Y2K.py b/performance/benchmark_palaeo_Y2K.py
index 415ff69bc..32e3a93e6 100644
--- a/performance/benchmark_palaeo_Y2K.py
+++ b/performance/benchmark_palaeo_Y2K.py
@@ -361,7 +361,6 @@ class DinoParticle(JITParticle):
 
     if MPI:
         mpi_comm = MPI.COMM_WORLD
-        # mpi_comm.Barrier()
         Nparticles = mpi_comm.reduce(np.array(pset.nparticle_log.get_params()), op=MPI.SUM, root=0)
         Nmem = mpi_comm.reduce(np.array(pset.mem_log.get_params()), op=MPI.SUM, root=0)
         if mpi_comm.Get_rank() == 0:
diff --git a/performance/benchmark_perlin.py b/performance/benchmark_perlin.py
index cb1089d73..ec3be8804 100644
--- a/performance/benchmark_perlin.py
+++ b/performance/benchmark_perlin.py
@@ -88,30 +88,15 @@ def perlin_fieldset_from_numpy(periodic_wrap=False):
 
     # Coordinates of the test fieldset (on A-grid in deg)
     lon = np.linspace(-a*0.5, a*0.5, img_shape[0], dtype=np.float32)
-    # sys.stdout.write("lon field: {}\n".format(lon.size))
     lat = np.linspace(-b*0.5, b*0.5, img_shape[1], dtype=np.float32)
-    # sys.stdout.write("lat field: {}\n".format(lat.size))
     totime = tsteps*tscale*24.0*60.0*60.0
     time = np.linspace(0., totime, tsteps, dtype=np.float64)
-    # sys.stdout.write("time field: {}\n".format(time.size))
 
     # Define arrays U (zonal), V (meridional)
     U = perlin2d.generate_fractal_noise_temporal2d(img_shape, tsteps, (perlinres[1], perlinres[2]), noctaves, perlin_persistence, max_shift=((-1, 2), (-1, 2)))
     U = np.transpose(U, (0,2,1))
-    # U = np.swapaxes(U, 1, 2)
-    # print("U-statistics - min: {:10.7f}; max: {:10.7f}; avg. {:10.7f}; std_dev: {:10.7f}".format(U.min(initial=0), U.max(initial=0), U.mean(), U.std()))
     V = perlin2d.generate_fractal_noise_temporal2d(img_shape, tsteps, (perlinres[1], perlinres[2]), noctaves, perlin_persistence, max_shift=((-1, 2), (-1, 2)))
     V = np.transpose(V, (0,2,1))
-    # V = np.swapaxes(V, 1, 2)
-    # print("V-statistics - min: {:10.7f}; max: {:10.7f}; avg. {:10.7f}; std_dev: {:10.7f}".format(V.min(initial=0), V.max(initial=0), V.mean(), V.std()))
-
-    # U = perlin3d.generate_fractal_noise_3d(img_shape, perlinres, noctaves, perlin_persistence) * scalefac
-    # U = np.transpose(U, (0,2,1))
-    # sys.stdout.write("U field shape: {} - [tdim][ydim][xdim]=[{}][{}][{}]\n".format(U.shape, time.shape[0], lat.shape[0], lon.shape[0]))
-    # V = perlin3d.generate_fractal_noise_3d(img_shape, perlinres, noctaves, perlin_persistence) * scalefac
-    # V = np.transpose(V, (0,2,1))
-    # sys.stdout.write("V field shape: {} - [tdim][ydim][xdim]=[{}][{}][{}]\n".format(V.shape, time.shape[0], lat.shape[0], lon.shape[0]))
-
     data = {'U': U, 'V': V}
     dimensions = {'time': time, 'lon': lon, 'lat': lat}
     if periodic_wrap:
@@ -424,7 +409,6 @@ def Age(particle, fieldset, time):
 
     if MPI:
         mpi_comm = MPI.COMM_WORLD
-        # mpi_comm.Barrier()
         Nparticles = mpi_comm.reduce(np.array(pset.nparticle_log.get_params()), op=MPI.SUM, root=0)
         Nmem = mpi_comm.reduce(np.array(pset.mem_log.get_params()), op=MPI.SUM, root=0)
         if mpi_comm.Get_rank() == 0:
diff --git a/performance/benchmark_stommel.py b/performance/benchmark_stommel.py
index a20dc60bd..c1c3f14a6 100644
--- a/performance/benchmark_stommel.py
+++ b/performance/benchmark_stommel.py
@@ -6,8 +6,8 @@
 from parcels import AdvectionEE, AdvectionRK45, AdvectionRK4
 from parcels import FieldSet, ScipyParticle, JITParticle, Variable, AdvectionRK4, ErrorCode
 from parcels.particleset_benchmark import ParticleSet_Benchmark as ParticleSet
-# from parcels.kernel_benchmark import Kernel_Benchmark as Kernel
 from parcels.field import VectorField, NestedField, SummedField
+# from parcels.kernel_benchmark import Kernel_Benchmark as Kernel
 # from parcels import plotTrajectoriesFile_loadedField
 from datetime import timedelta as delta
 import math
@@ -383,11 +383,8 @@ def Age(particle, fieldset, time):
         mpi_comm = MPI.COMM_WORLD
         mpi_rank = mpi_comm.Get_rank()
         if mpi_rank==0:
-            # starttime = ostime.time()
-            # starttime = MPI.Wtime()
             starttime = ostime.process_time()
     else:
-        # starttime = ostime.time()
         starttime = ostime.process_time()
     kernels = pset.Kernel(AdvectionRK4,delete_cfiles=True)
     if agingParticles:
@@ -411,11 +408,8 @@ def Age(particle, fieldset, time):
         mpi_comm = MPI.COMM_WORLD
         mpi_rank = mpi_comm.Get_rank()
         if mpi_rank==0:
-            # endtime = ostime.time()
-            # endtime = MPI.Wtime()
             endtime = ostime.process_time()
     else:
-        #endtime = ostime.time()
         endtime = ostime.process_time()
 
     if MPI:
@@ -451,7 +445,6 @@ def Age(particle, fieldset, time):
 
     if MPI:
         mpi_comm = MPI.COMM_WORLD
-        # mpi_comm.Barrier()
         Nparticles = mpi_comm.reduce(np.array(pset.nparticle_log.get_params()), op=MPI.SUM, root=0)
         Nmem = mpi_comm.reduce(np.array(pset.mem_log.get_params()), op=MPI.SUM, root=0)
         if mpi_comm.Get_rank() == 0: