🍱 Regridding with bounds nicely rounded to multiples of 250
Creating new groundtruth NetCDF grids using GMT surface, replacing the ones last created in b90bd74 in #112. Besides updating to the GMT 6.0.0rc1 tagged release, the main change here is using nicely rounded bounds (multiples of 250 units in EPSG:3031) instead of arbitrary decimal values. This should really help resolve some of the problems with points near the grid's edges not being included in our RMSE_test calculations (see #152), and integer coordinates are just nicer to debug, won't you say?
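For the curious, the rounding is basically just snapping the bounding box outward to the nearest multiple of 250. A plain-Python sketch of the idea (the coordinates here are made up, and this skips the extra grid-dimension optimization that `gmt info`'s s modifier applies for `surface`):

```python
import math

def round_region(xmin, xmax, ymin, ymax, inc=250):
    """Hypothetical helper: snap a bounding box outward to multiples of inc."""
    return (
        math.floor(xmin / inc) * inc,  # round minimums down
        math.ceil(xmax / inc) * inc,  # round maximums up
        math.floor(ymin / inc) * inc,
        math.ceil(ymax / inc) * inc,
    )

# arbitrary decimal EPSG:3031 metre coordinates snap onto the 250 m grid
print(round_region(-1610984.9, 823829.3, -516653.2, 2089692.8))
# (-1611000, 824000, -516750, 2089750)
```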

Specifically, data_prep.get_region was refactored to use `gmt info -I xxx.csv` instead of pure pandas, returning an `xmin/xmax/ymin/ymax` string with an extended region optimized for `gmt surface` (see the sketch below). There is a "surface [WARNING]: Your grid dimensions are mutually prime.  Convergence is very unlikely" message which I'm just going to ignore for now. Note that data_prep.ascii_to_xyz was one-line patched to drop NaNs, as there were some points in the WISE_ISODYN_RadarByFlight.XYZ file with missing elevation (z) values (since #112...) that were messing up gmt.info in the refactored data_prep.get_region. Unit tests have been modified accordingly, and the grids in the integration tests are now downloaded/created under /tmp to avoid messing with the actual files in highres. Matplotlib plots of the grids in data_prep.ipynb have been updated, and the new grids will be released in v0.9.2.
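In other words, get_region now does roughly the equivalent of this minimal sketch (same `gmt` Python package calls as in the data_prep.py diff below; `tmp_points.csv` is just a placeholder filename, and the dropna line mirrors the ascii_to_xyz patch):

```python
import gmt  # the GMT Python bindings used in this repo
import numpy as np
import pandas as pd

xyz_data = pd.DataFrame(
    10000 * np.random.RandomState(seed=42).rand(30).reshape(10, 3),
    columns=["x", "y", "z"],
)
xyz_data.dropna(axis="index", inplace=True)  # NaN z values were messing up gmt.info
xyz_data.to_csv("tmp_points.csv", header=False, index=False)

# I="s250" rounds the bounds outward to multiples of 250 and extends the
# region so the grid dimensions suit `gmt surface`; the raw output looks
# like '-R-250/9500/0/9750', hence stripping the leading '-R'
region = gmt.info(fname="tmp_points.csv", I="s250").strip()[2:]
print(region)  # -250/9500/0/9750

# the "mutually prime" warning refers to the grid dimensions implied by
# the region and spacing, e.g. at 250 unit spacing:
xmin, xmax, ymin, ymax = map(float, region.split("/"))
n_columns = int((xmax - xmin) / 250) + 1  # 40
n_rows = int((ymax - ymin) / 250) + 1  # 40
```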
weiji14 committed Jun 21, 2019
1 parent 1edb16e commit 0681bf3
Showing 5 changed files with 110 additions and 72 deletions.
84 changes: 50 additions & 34 deletions data_prep.ipynb

Large diffs are not rendered by default.

56 changes: 36 additions & 20 deletions data_prep.py
@@ -30,6 +30,7 @@
import shutil
import sys
import tarfile
import tempfile
import urllib
import yaml
import zipfile
@@ -306,6 +307,7 @@ def ascii_to_xyz(pipeline_file: str) -> pd.DataFrame:
)
for f in files
)
df.dropna(axis="index", inplace=True) # drop rows with NaN values
df.reset_index(drop=True, inplace=True) # reset index after concatenation

## Advanced table read with conversions
@@ -354,19 +356,32 @@ def ascii_to_xyz(pipeline_file: str) -> pd.DataFrame:
# ![Clean XYZ Table to Raster Grid via interpolation function](https://yuml.me/diagram/scruffy;dir:LR/class/[Clean-XYZ-Table|*.xyz]->[Interpolation-Function],[Interpolation-Function]->[Raster-Grid|*.tif/*.nc])

# %%
def get_region(xyz_data: pd.DataFrame) -> str:
def get_region(xyz_data: pd.DataFrame, round_increment: int = 250) -> str:
"""
Gets the bounding box region of an xyz pandas.DataFrame in string
format xmin/xmax/ymin/ymax rounded to 5 decimal places.
Used for the -R 'region of interest' parameter in GMT.
>>> xyz_data = pd.DataFrame(np.random.RandomState(seed=42).rand(30).reshape(10, 3))
Gets an extended bounding box region for points in an xyz pandas.DataFrame with
columns x, y, and z. The coordinates will be rounded to values specified by the
round_increment parameter. Implementation uses gmt.info with the -I (increment)
setting, see also https://gmt.soest.hawaii.edu/doc/latest/gmtinfo.html#i
The output region is returned in a string format 'xmin/xmax/ymin/ymax' directly
usable as the -R 'region of interest' parameter in GMT. Indeed, the rounding is
specifically optimized to give grid dimensions for fastest results in programs like
GMT surface.
>>> xyz_data = pd.DataFrame(
... 10000 * np.random.RandomState(seed=42).rand(30).reshape(10, 3),
... columns=["x", "y", "z"],
... )
>>> get_region(xyz_data=xyz_data)
'0.05808/0.83244/0.02058/0.95071'
'-250/9500/0/9750'
"""
xmin, ymin, _ = xyz_data.min(axis="rows")
xmax, ymax, _ = xyz_data.max(axis="rows")
return f"{xmin:.5f}/{xmax:.5f}/{ymin:.5f}/{ymax:.5f}"
assert (xyz_data.columns == pd.Index(data=["x", "y", "z"], dtype="object")).all()

with tempfile.NamedTemporaryFile(suffix=".csv") as tmpfile:
xyz_data.to_csv(tmpfile.name, header=False, index=False)
region = gmt.info(fname=tmpfile.name, I=f"s{round_increment}").strip()[2:]

return region


# %%
@@ -381,18 +396,19 @@ def xyz_to_grid(
"""
Performs interpolation of x, y, z point data to a raster grid.
>>> xyz_data = 1000*pd.DataFrame(np.random.RandomState(seed=42).rand(60).reshape(20, 3))
>>> xyz_data = pd.DataFrame(
... 600 * np.random.RandomState(seed=42).rand(60).reshape(20, 3),
... columns=["x", "y", "z"],
... )
>>> region = get_region(xyz_data=xyz_data)
>>> grid = xyz_to_grid(xyz_data=xyz_data, region=region, spacing=250)
>>> grid.to_array().shape
(1, 5, 5)
(1, 4, 4)
>>> grid.to_array().values
array([[[403.17618 , 544.92535 , 670.7824 , 980.75055 , 961.47723 ],
[379.0757 , 459.26407 , 314.38297 , 377.78555 , 546.0469 ],
[450.67664 , 343.26 , 88.391594, 260.10492 , 452.3337 ],
[586.09906 , 469.74008 , 216.8168 , 486.9802 , 642.2116 ],
[451.4794 , 652.7244 , 325.77896 , 879.8973 , 916.7921 ]]],
dtype=float32)
array([[[135.95927, 294.8152 , 615.32513, 706.4135 ],
[224.46773, 191.75693, 298.87057, 521.85254],
[242.41916, 149.34332, 430.84137, 603.4236 ],
[154.60516, 194.24237, 447.6185 , 645.13574]]], dtype=float32)
"""
## Preprocessing with blockmedian
with gmt.helpers.GMTTempFile(suffix=".txt") as tmpfile:
@@ -414,7 +430,7 @@ def xyz_to_grid(
region=region,
spacing=f"{spacing}+e",
T=tension,
V="",
V="n", # normal verbosity: produce only fatal error messages
M=f"{mask_cell_radius}c",
)

@@ -435,7 +451,7 @@ def xyz_to_grid(
grid_dict[name] = xyz_to_grid(
xyz_data=xyz_data, region=region, outfile=f"highres/{name}.nc"
)
print(f"done! {grid_dict[name].to_array().shape}")
print(f"done! {grid_dict[name].z.shape}")

# %% [markdown]
# ### 2.3 Plot raster grids
6 changes: 4 additions & 2 deletions features/steps/test_data_prep.py
@@ -45,7 +45,8 @@ def process_data_through_pipeline_and_get_output(context, pipeline_file):
@when("interpolate the xyz data table to {output_file}")
def interpolate_xyz_data_to_grid(context, output_file):
region = context.data_prep.get_region(context.xyz_data)
context.outfile = os.path.join("highres", output_file)
context.outfile = os.path.join("/tmp/highres", output_file)
os.makedirs(os.path.dirname(context.outfile), exist_ok=True)
context.data_prep.xyz_to_grid(
xyz_data=context.xyz_data, region=region, outfile=context.outfile
)
@@ -60,7 +61,8 @@ def open_raster_grid_to_check(context):
@given("a big {dataset_type} raster grid {raster_grid}")
def get_a_raster_grid(context, dataset_type, raster_grid):
context.raster_grid = raster_grid
context.filepath = os.path.join(dataset_type, raster_grid)
context.filepath = os.path.join("/tmp", dataset_type, raster_grid)
os.makedirs(os.path.dirname(context.filepath), exist_ok=True)
url = (
f"https://github.com/weiji14/deepbedmap/releases/download/v0.7.0/{raster_grid}"
)
2 changes: 1 addition & 1 deletion misc/README.md
@@ -6,6 +6,6 @@ Filename|Location|Resolution|Literature Citation|Data Citation
---|---|---|---|---
Arthern_accumulation_bedmap2_grid1.tif|Antarctica|1000m|[Arthern2006Accumulation](https://doi.org/10.1029/2004JD005667)|
lisa750_2013182_2017120_0000_0400_vv_v1.tif|Antarctica|750m|[Fahnestock2019LISA](https://doi.org/10.1016/j.rse.2015.11.023)|[DOI](https://doi.org/10.7265/nxpc-e997)
3 *.dbf files|Antarctica|nan|[Mouginot2017MEASURES](https://doi.org/10.1126/science.1235798)|[DOI](https://doi.org/10.5067/AXE4121732AD)
3 *.shx files|Antarctica|nan|[Mouginot2017MEASURES](https://doi.org/10.1126/science.1235798)|[DOI](https://doi.org/10.5067/AXE4121732AD)
2 *.tif files|Antarctica|100m|[Noh2018REMA](https://doi.org/10.1016/j.isprsjprs.2017.12.008)|[DOI](https://doi.org/10.7910/DVN/SAIK8B)
MEaSUREs_IceFlowSpeed_450m.tif|Antarctica|450m|[Rignot2011MEASURES](https://doi.org/10.1126/science.1208336)|[DOI](https://doi.org/10.5067/D7GK8F5J8M8R)
34 changes: 19 additions & 15 deletions test_ipynb.ipynb
@@ -143,13 +143,16 @@
"Expecting nothing\n",
"ok\n",
"Trying:\n",
" xyz_data = pd.DataFrame(np.random.RandomState(seed=42).rand(30).reshape(10, 3))\n",
" xyz_data = pd.DataFrame(\n",
" 10000 * np.random.RandomState(seed=42).rand(30).reshape(10, 3),\n",
" columns=[\"x\", \"y\", \"z\"],\n",
" )\n",
"Expecting nothing\n",
"ok\n",
"Trying:\n",
" get_region(xyz_data=xyz_data)\n",
"Expecting:\n",
" '0.05808/0.83244/0.02058/0.95071'\n",
" '-250/9500/0/9750'\n",
"ok\n",
"Trying:\n",
" xr.DataArray(\n",
@@ -196,7 +199,10 @@
"Expecting nothing\n",
"ok\n",
"Trying:\n",
" xyz_data = 1000*pd.DataFrame(np.random.RandomState(seed=42).rand(60).reshape(20, 3))\n",
" xyz_data = pd.DataFrame(\n",
" 600 * np.random.RandomState(seed=42).rand(60).reshape(20, 3),\n",
" columns=[\"x\", \"y\", \"z\"],\n",
" )\n",
"Expecting nothing\n",
"ok\n",
"Trying:\n",
@@ -210,17 +216,15 @@
"Trying:\n",
" grid.to_array().shape\n",
"Expecting:\n",
" (1, 5, 5)\n",
" (1, 4, 4)\n",
"ok\n",
"Trying:\n",
" grid.to_array().values\n",
"Expecting:\n",
" array([[[403.17618 , 544.92535 , 670.7824 , 980.75055 , 961.47723 ],\n",
" [379.0757 , 459.26407 , 314.38297 , 377.78555 , 546.0469 ],\n",
" [450.67664 , 343.26 , 88.391594, 260.10492 , 452.3337 ],\n",
" [586.09906 , 469.74008 , 216.8168 , 486.9802 , 642.2116 ],\n",
" [451.4794 , 652.7244 , 325.77896 , 879.8973 , 916.7921 ]]],\n",
" dtype=float32)\n",
" array([[[135.95927, 294.8152 , 615.32513, 706.4135 ],\n",
" [224.46773, 191.75693, 298.87057, 521.85254],\n",
" [242.41916, 149.34332, 430.84137, 603.4236 ],\n",
" [154.60516, 194.24237, 447.6185 , 645.13574]]], dtype=float32)\n",
"ok\n",
"2 items had no tests:\n",
" data_prep\n",
@@ -511,13 +515,13 @@
" Given a collection of raw high resolution datasets bed_WGS84_grid.txt # features/steps/test_data_prep.py:25\n",
" When we process the data through bed_WGS84_grid.json # features/steps/test_data_prep.py:38\n",
" And interpolate the xyz data table to bed_WGS84_grid.nc # features/steps/test_data_prep.py:45\n",
" Then a high resolution raster grid is returned # features/steps/test_data_prep.py:54\n",
" Then a high resolution raster grid is returned # features/steps/test_data_prep.py:55\n",
"\n",
" Scenario Outline: Tile datasets -- @1.1 Raster grids to tile # features/data_prep.feature:36\n",
" Given a big highres raster grid 2010tr.nc # features/steps/test_data_prep.py:60\n",
" And a collection of square bounding boxes \"model/train/tiles_3031.geojson\" # features/steps/test_data_prep.py:70\n",
" When we crop the big raster grid using those bounding boxes # features/steps/test_data_prep.py:80\n",
" Then a stack of small raster tiles is returned # features/steps/test_data_prep.py:87\n",
" Given a big highres raster grid 2010tr.nc # features/steps/test_data_prep.py:61\n",
" And a collection of square bounding boxes \"model/train/tiles_3031.geojson\" # features/steps/test_data_prep.py:72\n",
" When we crop the big raster grid using those bounding boxes # features/steps/test_data_prep.py:82\n",
" Then a stack of small raster tiles is returned # features/steps/test_data_prep.py:89\n",
"\n"
]
}