🍱 Regridding with bounds nicely rounded to multiples of 250
Creating new groundtruth NetCDF grids using GMT surface, replacing the ones last created in b90bd74 in #112. Besides updating to the GMT 6.0.0rc1 tagged release, the main change here is using nicely rounded bounds (multiples of 250 units in EPSG:3031) instead of arbitrary decimal values. This should really help resolve some of the problems with points near the grid's edges not being included in our RMSE_test calculations (see #152), and integer coordinates are just nicer to debug, won't you say?
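For the curious, the rounding is basically just snapping the bounding box outward to the nearest multiple of 250. A plain-Python sketch of the idea (the coordinates here are made up, and this skips the extra grid-dimension optimization that `gmt info`'s s modifier applies for `surface`):

```python
import math

def round_region(xmin, xmax, ymin, ymax, inc=250):
    """Hypothetical helper: snap a bounding box outward to multiples of inc."""
    return (
        math.floor(xmin / inc) * inc,  # round minimums down
        math.ceil(xmax / inc) * inc,  # round maximums up
        math.floor(ymin / inc) * inc,
        math.ceil(ymax / inc) * inc,
    )

# arbitrary decimal EPSG:3031 metre coordinates snap onto the 250 m grid
print(round_region(-1610984.9, 823829.3, -516653.2, 2089692.8))
# (-1611000, 824000, -516750, 2089750)
```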

Specifically, data_prep.get_region was refactored to use `gmt info -I xxx.csv` instead of pure pandas, returning an `xmin/xmax/ymin/ymax` string with an extended region optimized for `gmt surface` (see the sketch below). There is a "surface [WARNING]: Your grid dimensions are mutually prime.  Convergence is very unlikely" message which I'm just going to ignore for now. Note that data_prep.ascii_to_xyz was one-line patched to drop NaNs, as there were some points in the WISE_ISODYN_RadarByFlight.XYZ file with missing elevation (z) values (since #112...) that were messing up gmt.info in the refactored data_prep.get_region. Unit tests have been modified accordingly, and the grids in the integration tests are now downloaded/created under /tmp to avoid messing with the actual files in highres. Matplotlib plots of the grids in data_prep.ipynb have been updated, and the new grids will be released in v0.9.2.
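In other words, get_region now does roughly the equivalent of this minimal sketch (same `gmt` Python package calls as in the data_prep.py diff below; `tmp_points.csv` is just a placeholder filename, and the dropna line mirrors the ascii_to_xyz patch):

```python
import gmt  # the GMT Python bindings used in this repo
import numpy as np
import pandas as pd

xyz_data = pd.DataFrame(
    10000 * np.random.RandomState(seed=42).rand(30).reshape(10, 3),
    columns=["x", "y", "z"],
)
xyz_data.dropna(axis="index", inplace=True)  # NaN z values were messing up gmt.info
xyz_data.to_csv("tmp_points.csv", header=False, index=False)

# I="s250" rounds the bounds outward to multiples of 250 and extends the
# region so the grid dimensions suit `gmt surface`; the raw output looks
# like '-R-250/9500/0/9750', hence stripping the leading '-R'
region = gmt.info(fname="tmp_points.csv", I="s250").strip()[2:]
print(region)  # -250/9500/0/9750

# the "mutually prime" warning refers to the grid dimensions implied by
# the region and spacing, e.g. at 250 unit spacing:
xmin, xmax, ymin, ymax = map(float, region.split("/"))
n_columns = int((xmax - xmin) / 250) + 1  # 40
n_rows = int((ymax - ymin) / 250) + 1  # 40
```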
weiji14 committed Jun 21, 2019
1 parent 1edb16e commit 0681bf3
Showing 5 changed files with 110 additions and 72 deletions.
84 changes: 50 additions & 34 deletions data_prep.ipynb

Large diffs are not rendered by default.

56 changes: 36 additions & 20 deletions data_prep.py
@@ -30,6 +30,7 @@
import shutil
import sys
import tarfile
import tempfile
import urllib
import yaml
import zipfile
@@ -306,6 +307,7 @@ def ascii_to_xyz(pipeline_file: str) -> pd.DataFrame:
)
for f in files
)
df.dropna(axis="index", inplace=True) # drop rows with NaN values
df.reset_index(drop=True, inplace=True) # reset index after concatenation

## Advanced table read with conversions
@@ -354,19 +356,32 @@ def ascii_to_xyz(pipeline_file: str) -> pd.DataFrame:
# ![Clean XYZ Table to Raster Grid via interpolation function](https://yuml.me/diagram/scruffy;dir:LR/class/[Clean-XYZ-Table|*.xyz]->[Interpolation-Function],[Interpolation-Function]->[Raster-Grid|*.tif/*.nc])

# %%
def get_region(xyz_data: pd.DataFrame) -> str:
def get_region(xyz_data: pd.DataFrame, round_increment: int = 250) -> str:
"""
Gets the bounding box region of an xyz pandas.DataFrame in string
format xmin/xmax/ymin/ymax rounded to 5 decimal places.
Used for the -R 'region of interest' parameter in GMT.
>>> xyz_data = pd.DataFrame(np.random.RandomState(seed=42).rand(30).reshape(10, 3))
Gets an extended bounding box region for points in an xyz pandas.DataFrame with
columns x, y, and z. The coordinates will be rounded to values specified by the
round_increment parameter. Implementation uses gmt.info with the -I (increment)
setting, see also https://gmt.soest.hawaii.edu/doc/latest/gmtinfo.html#i
The output region is returned in a string format 'xmin/xmax/ymin/ymax' directly
usable as the -R 'region of interest' parameter in GMT. Indeed, the rounding is
specifically optimized to give grid dimensions for fastest results in programs like
GMT surface.
>>> xyz_data = pd.DataFrame(
... 10000 * np.random.RandomState(seed=42).rand(30).reshape(10, 3),
... columns=["x", "y", "z"],
... )
>>> get_region(xyz_data=xyz_data)
'0.05808/0.83244/0.02058/0.95071'
'-250/9500/0/9750'
"""
xmin, ymin, _ = xyz_data.min(axis="rows")
xmax, ymax, _ = xyz_data.max(axis="rows")
return f"{xmin:.5f}/{xmax:.5f}/{ymin:.5f}/{ymax:.5f}"
assert (xyz_data.columns == pd.Index(data=["x", "y", "z"], dtype="object")).all()

with tempfile.NamedTemporaryFile(suffix=".csv") as tmpfile:
xyz_data.to_csv(tmpfile.name, header=False, index=False)
region = gmt.info(fname=tmpfile.name, I=f"s{round_increment}").strip()[2:]

return region


# %%
@@ -381,18 +396,19 @@ def xyz_to_grid(
"""
Performs interpolation of x, y, z point data to a raster grid.
>>> xyz_data = 1000*pd.DataFrame(np.random.RandomState(seed=42).rand(60).reshape(20, 3))
>>> xyz_data = pd.DataFrame(
... 600 * np.random.RandomState(seed=42).rand(60).reshape(20, 3),
... columns=["x", "y", "z"],
... )
>>> region = get_region(xyz_data=xyz_data)
>>> grid = xyz_to_grid(xyz_data=xyz_data, region=region, spacing=250)
>>> grid.to_array().shape
(1, 5, 5)
(1, 4, 4)
>>> grid.to_array().values
array([[[403.17618 , 544.92535 , 670.7824 , 980.75055 , 961.47723 ],
[379.0757 , 459.26407 , 314.38297 , 377.78555 , 546.0469 ],
[450.67664 , 343.26 , 88.391594, 260.10492 , 452.3337 ],
[586.09906 , 469.74008 , 216.8168 , 486.9802 , 642.2116 ],
[451.4794 , 652.7244 , 325.77896 , 879.8973 , 916.7921 ]]],
dtype=float32)
array([[[135.95927, 294.8152 , 615.32513, 706.4135 ],
[224.46773, 191.75693, 298.87057, 521.85254],
[242.41916, 149.34332, 430.84137, 603.4236 ],
[154.60516, 194.24237, 447.6185 , 645.13574]]], dtype=float32)
"""
## Preprocessing with blockmedian
with gmt.helpers.GMTTempFile(suffix=".txt") as tmpfile:
@@ -414,7 +430,7 @@ def xyz_to_grid(
region=region,
spacing=f"{spacing}+e",
T=tension,
V="",
V="n", # normal verbosity: produce only fatal error messages
M=f"{mask_cell_radius}c",
)

@@ -435,7 +451,7 @@ def xyz_to_grid(
grid_dict[name] = xyz_to_grid(
xyz_data=xyz_data, region=region, outfile=f"highres/{name}.nc"
)
print(f"done! {grid_dict[name].to_array().shape}")
print(f"done! {grid_dict[name].z.shape}")

# %% [markdown]
# ### 2.3 Plot raster grids
6 changes: 4 additions & 2 deletions features/steps/test_data_prep.py
@@ -45,7 +45,8 @@ def process_data_through_pipeline_and_get_output(context, pipeline_file):
@when("interpolate the xyz data table to {output_file}")
def interpolate_xyz_data_to_grid(context, output_file):
region = context.data_prep.get_region(context.xyz_data)
context.outfile = os.path.join("highres", output_file)
context.outfile = os.path.join("/tmp/highres", output_file)
os.makedirs(os.path.dirname(context.outfile), exist_ok=True)
context.data_prep.xyz_to_grid(
xyz_data=context.xyz_data, region=region, outfile=context.outfile
)
@@ -60,7 +61,8 @@ def open_raster_grid_to_check(context):
@given("a big {dataset_type} raster grid {raster_grid}")
def get_a_raster_grid(context, dataset_type, raster_grid):
context.raster_grid = raster_grid
context.filepath = os.path.join(dataset_type, raster_grid)
context.filepath = os.path.join("/tmp", dataset_type, raster_grid)
os.makedirs(os.path.dirname(context.filepath), exist_ok=True)
url = (
f"https://github.com/weiji14/deepbedmap/releases/download/v0.7.0/{raster_grid}"
)
2 changes: 1 addition & 1 deletion misc/README.md
@@ -6,6 +6,6 @@ Filename|Location|Resolution|Literature Citation|Data Citation
---|---|---|---|---
Arthern_accumulation_bedmap2_grid1.tif|Antarctica|1000m|[Arthern2006Accumulation](https://doi.org/10.1029/2004JD005667)|
lisa750_2013182_2017120_0000_0400_vv_v1.tif|Antarctica|750m|[Fahnestock2019LISA](https://doi.org/10.1016/j.rse.2015.11.023)|[DOI](https://doi.org/10.7265/nxpc-e997)
3 *.dbf files|Antarctica|nan|[Mouginot2017MEASURES](https://doi.org/10.1126/science.1235798)|[DOI](https://doi.org/10.5067/AXE4121732AD)
3 *.shx files|Antarctica|nan|[Mouginot2017MEASURES](https://doi.org/10.1126/science.1235798)|[DOI](https://doi.org/10.5067/AXE4121732AD)
2 *.tif files|Antarctica|100m|[Noh2018REMA](https://doi.org/10.1016/j.isprsjprs.2017.12.008)|[DOI](https://doi.org/10.7910/DVN/SAIK8B)
MEaSUREs_IceFlowSpeed_450m.tif|Antarctica|450m|[Rignot2011MEASURES](https://doi.org/10.1126/science.1208336)|[DOI](https://doi.org/10.5067/D7GK8F5J8M8R)
34 changes: 19 additions & 15 deletions test_ipynb.ipynb
@@ -143,13 +143,16 @@
"Expecting nothing\n",
"ok\n",
"Trying:\n",
" xyz_data = pd.DataFrame(np.random.RandomState(seed=42).rand(30).reshape(10, 3))\n",
" xyz_data = pd.DataFrame(\n",
" 10000 * np.random.RandomState(seed=42).rand(30).reshape(10, 3),\n",
" columns=[\"x\", \"y\", \"z\"],\n",
" )\n",
"Expecting nothing\n",
"ok\n",
"Trying:\n",
" get_region(xyz_data=xyz_data)\n",
"Expecting:\n",
" '0.05808/0.83244/0.02058/0.95071'\n",
" '-250/9500/0/9750'\n",
"ok\n",
"Trying:\n",
" xr.DataArray(\n",
@@ -196,7 +199,10 @@
"Expecting nothing\n",
"ok\n",
"Trying:\n",
" xyz_data = 1000*pd.DataFrame(np.random.RandomState(seed=42).rand(60).reshape(20, 3))\n",
" xyz_data = pd.DataFrame(\n",
" 600 * np.random.RandomState(seed=42).rand(60).reshape(20, 3),\n",
" columns=[\"x\", \"y\", \"z\"],\n",
" )\n",
"Expecting nothing\n",
"ok\n",
"Trying:\n",
@@ -210,17 +216,15 @@
"Trying:\n",
" grid.to_array().shape\n",
"Expecting:\n",
" (1, 5, 5)\n",
" (1, 4, 4)\n",
"ok\n",
"Trying:\n",
" grid.to_array().values\n",
"Expecting:\n",
" array([[[403.17618 , 544.92535 , 670.7824 , 980.75055 , 961.47723 ],\n",
" [379.0757 , 459.26407 , 314.38297 , 377.78555 , 546.0469 ],\n",
" [450.67664 , 343.26 , 88.391594, 260.10492 , 452.3337 ],\n",
" [586.09906 , 469.74008 , 216.8168 , 486.9802 , 642.2116 ],\n",
" [451.4794 , 652.7244 , 325.77896 , 879.8973 , 916.7921 ]]],\n",
" dtype=float32)\n",
" array([[[135.95927, 294.8152 , 615.32513, 706.4135 ],\n",
" [224.46773, 191.75693, 298.87057, 521.85254],\n",
" [242.41916, 149.34332, 430.84137, 603.4236 ],\n",
" [154.60516, 194.24237, 447.6185 , 645.13574]]], dtype=float32)\n",
"ok\n",
"2 items had no tests:\n",
" data_prep\n",
@@ -511,13 +515,13 @@
" Given a collection of raw high resolution datasets bed_WGS84_grid.txt # features/steps/test_data_prep.py:25\n",
" When we process the data through bed_WGS84_grid.json # features/steps/test_data_prep.py:38\n",
" And interpolate the xyz data table to bed_WGS84_grid.nc # features/steps/test_data_prep.py:45\n",
" Then a high resolution raster grid is returned # features/steps/test_data_prep.py:54\n",
" Then a high resolution raster grid is returned # features/steps/test_data_prep.py:55\n",
"\n",
" Scenario Outline: Tile datasets -- @1.1 Raster grids to tile # features/data_prep.feature:36\n",
" Given a big highres raster grid 2010tr.nc # features/steps/test_data_prep.py:60\n",
" And a collection of square bounding boxes \"model/train/tiles_3031.geojson\" # features/steps/test_data_prep.py:70\n",
" When we crop the big raster grid using those bounding boxes # features/steps/test_data_prep.py:80\n",
" Then a stack of small raster tiles is returned # features/steps/test_data_prep.py:87\n",
" Given a big highres raster grid 2010tr.nc # features/steps/test_data_prep.py:61\n",
" And a collection of square bounding boxes \"model/train/tiles_3031.geojson\" # features/steps/test_data_prep.py:72\n",
" When we crop the big raster grid using those bounding boxes # features/steps/test_data_prep.py:82\n",
" Then a stack of small raster tiles is returned # features/steps/test_data_prep.py:89\n",
"\n"
]
}