inbo · florisvdh · Sep 22, 2025 · Sep 29, 2025 · Sep 29, 2025 · Sep 29, 2025
diff --git a/src/update_watersurfaces_refpoints/.Rprofile b/src/update_watersurfaces_refpoints/.Rprofile
@@ -0,0 +1 @@
+source("renv/activate.R")
diff --git a/src/update_watersurfaces_refpoints/10_update_watersurfaces_refpoints.Rmd b/src/update_watersurfaces_refpoints/10_update_watersurfaces_refpoints.Rmd
@@ -0,0 +1,297 @@
+# Update the _interim_ version of the watersurfaces_refpoints data source
+
+Setting version specific information, defining what the 'previous' and 'current' version mean.
+The current version refers to the current _interim_ version of watersurfaces_hab to be used to derive new points for watersurfaces_refpoints.
+
+```{r}
+previous_version <- "v6"
+# in future versions, we should also add:
+# previous_interim_version <- "v..."
+# This is because we will need to take into account already chosen points for 
+# both the regular and the interim versions.
+current_version <- "v6.1_interim"
+xxh64sum_refpoints_previous_tsv <- "b075fbc80c0b55a4"
+xxh64sum_refpoints_previous_yml <- "4ec27b705950e01b"
+xxh64sum_wsh_previous <- "e2920c4932008387"
+xxh64sum_wsh_current <- "d35532db5c4b41ff"
+```
+
+Locating `n2khab_data`.
+
+```{r}
+n2khab_data_path <- locate_n2khab_data()
+```
+
+Note that the code requires a setup of `n2khab_data` that supports concurrent versions, by implementing approach \[3] of <https://github.com/inbo/n2khab/issues/113>.
+
+## Reading the previous versions of watersurfaces_refpoints and watersurfaces_hab
+
+Verifying and reading the previous version of watersurfaces_refpoints.
+
+```{r paged.print = FALSE}
+refpoints_path_previous <- file.path(
+  n2khab_data_path,
+  str_c(
+    "20_processed/_versions/watersurfaces_refpoints/watersurfaces_refpoints_",
+    previous_version
+  )
+)
+# file version verification
+stopifnot(
+  xxh64sum(file.path(refpoints_path_previous, "watersurfaces_refpoints.tsv")) ==
+    xxh64sum_refpoints_previous_tsv
+)
+stopifnot(
+  xxh64sum(file.path(refpoints_path_previous, "watersurfaces_refpoints.yml")) ==
+    xxh64sum_refpoints_previous_yml
+)
+# read as tibble
+refpoints_previous <- 
+  read_vc("watersurfaces_refpoints", root = refpoints_path_previous) %>% 
+  as_tibble()
+refpoints_previous
+```
+
+Note that we consider `in_polygon` to never be overwritten, i.e. it is tied to a specific `polygon_id` at the time of its first arrival in watersurfaces_refpoints.
+It is relevant to keep this attribute since older polygons may be involved which don't occur in recent versions of watersurfaces_hab.
+The column `in_polygon` tells whether the point lies in the polygon or not; with current approaches the point should always be inside the polygon.
+
+Verifying and reading the previous version of watersurfaces_hab.
+
+```{r}
+wsh_path_previous <- file.path(
+  n2khab_data_path,
+  str_c(
+    "20_processed/_versions/watersurfaces_hab/watersurfaces_hab_",
+    previous_version
+  ),
+  "watersurfaces_hab.gpkg"
+)
+# file version verification
+stopifnot(xxh64sum(wsh_path_previous) == xxh64sum_wsh_previous)
+# read
+wsh_previous <- 
+  read_watersurfaces_hab(
+    file = wsh_path_previous,
+    version = str_c("watersurfaces_hab_", previous_version),
+    interpreted = TRUE
+  )
+```
+
+
+
+## Searching for new watersurface polygons in the current interim version of watersurfaces_hab
+
+Verifying and reading the current _interim_ version of watersurfaces_hab.
+
+```{r}
+wsh_path_current <- file.path(
+  n2khab_data_path,
+  str_c(
+    "20_processed/_versions/watersurfaces_hab/watersurfaces_hab_",
+    current_version
+  ),
+  "watersurfaces_hab.gpkg"
+)
+# file version verification
+stopifnot(xxh64sum(wsh_path_current) == xxh64sum_wsh_current)
+# read
+wsh <- 
+  read_watersurfaces_hab(
+    file = wsh_path_current,
+    version = str_c("watersurfaces_hab_", current_version),
+    interpreted = TRUE
+  )
+```
+
+Checking that all polygons of the previous version of watersurfaces_hab are present in the previous version of watersurfaces_refpoints.
+
+```{r}
+wsh_previous$watersurfaces_polygons %>% 
+  anti_join(refpoints_previous, join_by(polygon_id)) %>% 
+  nrow() == 0
+```
+
+Listing new polygons in watersurfaces_hab according to `polygon_id`:
+
+```{r paged.print = FALSE}
+wsh_new <- 
+  wsh$watersurfaces_polygons %>% 
+  select(polygon_id) %>% 
+  anti_join(refpoints_previous, join_by(polygon_id))
+wsh_new
+```
+
+However, it is possible and expected that new `polygon_id`s may pop up in new versions of watersurfaces_hab that at least overlap previous polygons, if not be equal to them.
+First of all, this happens because `polygon_id` values may be different in interim versions of watersurfaces_hab compared to regular versions.
+Evolution of the watersurfaces data source is another reason, which we already had for the regular releases: some polygons previously not distinguished in watersurfaces may already have been distinguished in previous watersurfaces_hab using the `polygon_id` coming from the habitatmap.
+The other way around (disappearing from watersurfaces but still present in habitatmap) is also not excluded and might also be a cause.
+Another reason may be the unstable presence of habitat types and ribs in watersurfaces, causing it to disappear and reappear later in consecutive versions, in which case we also want to recycle the old reference point.
+
+For this reason, we need to check which 'new polygons by `polygon_id`' spatially still match an existing reference point.
+Those polygons should adopt that point instead of receiving a new one.
+
+Note however that we only accept points that are marked to be inside a polygon at the time of their creation.
+This is done to prevent that points outside the polygon that they represent can also be used to represent another (new) polygon that contains it.
+
+Polygons with new `polygon_id` but existing reference point:
+
+```{r paged.print = FALSE}
+refpoints_previous_sf <- 
+  refpoints_previous %>%
+  st_as_sf(coords = c("x", "y"), crs = 31370, remove = FALSE)
+wsh_new_point_exists <- 
+  wsh_new %>% 
+  st_join(
+    refpoints_previous_sf %>% 
+      filter(in_polygon) %>% 
+      select(polygon_id_previous = polygon_id),
+    join = st_contains,
+    left = FALSE
+  ) %>% 
+  st_drop_geometry()
+wsh_new_point_exists
+```
+
+Checking that the point to be adopted is unique.
+
+```{r}
+wsh_new_point_exists %>% 
+  count(polygon_id) %>% 
+  filter(n > 1) %>% 
+  nrow() == 0
+```
+
+We can see that some polygons match multiple reference points, i.e. correspond to multiple polygons of the previous version of watersurfaces_hab:
+
+```{r paged.print = FALSE}
+wsh_new_point_exists %>% 
+  mutate(
+    nr_points = n(),
+    .by = polygon_id
+  ) %>% 
+  filter(nr_points > 1) %>% 
+  print(n = Inf)
+```
+
-
+```{r paged.print = FALSE}
+
+check_overlap <- wsh_new_point_exists %>% 
+  filter(!is.na(polygon_id_previous)) %>%
+  mutate(
+    nr_points = n(),
+    .by = polygon_id
+  )
+
+wsh_pt <- wsh$watersurfaces_polygons %>%
+  filter(polygon_id %in% check_overlap_1$polygon_id) %>%
+  st_point_on_surface()
+
+compare_refpoint <- wsh_pt %>%
+  st_drop_geometry() %>%
+  mutate(x_new = st_coordinates(wsh_pt)[,1],
+         y_new = st_coordinates(wsh_pt)[,2]) %>%
+  left_join(check_overlap, by = "polygon_id") %>%
+  left_join(refpoints_previous %>%
+              rename(polygon_id_previous = polygon_id), 
+            by = "polygon_id_previous") %>%
+  mutate(distance = round(sqrt((x_new - x) ^ 2 + (y_new - y) ^ 2), 3))
-
+```{r paged.print = FALSE}
+
+check_overlap <- wsh_new_point_exists %>% 
+  filter(!is.na(polygon_id_previous)) %>%
+  mutate(
+    nr_points = n(),
+    .by = polygon_id
+  )
+
+wsh_pt <- wsh$watersurfaces_polygons %>%
+  filter(polygon_id %in% check_overlap_1$polygon_id) %>%
+  st_point_on_surface()
+
+compare_refpoint <- wsh_pt %>%
+  st_drop_geometry() %>%
+  mutate(x_new = st_coordinates(wsh_pt)[,1],
+         y_new = st_coordinates(wsh_pt)[,2]) %>%
+  left_join(check_overlap, by = "polygon_id") %>%
+  left_join(refpoints_previous %>%
+              rename(polygon_id_previous = polygon_id), 
+            by = "polygon_id_previous") %>%
+  mutate(distance = round(sqrt((x_new - x) ^ 2 + (y_new - y) ^ 2), 3))
+In these cases, we will choose the point where `polygon_id_previous` is from the watersurfaces data source, or else the one that includes the oldest year (if year can be extracted), and then the alphabetically first one.
+
+```{r}
+wsh_new_point_exists <- 
+  wsh_new_point_exists %>% 
+  mutate(
+    year_hms_previous = str_match(polygon_id_previous, "_v(\\d+)$")[, 2] %>% 
+      as.integer(),
+    polygon_id = as.character(polygon_id)
+    ) %>% 
+  mutate(
+    year_hms_previous_resolved = min(year_hms_previous),
+    .by = polygon_id
+  ) %>% 
+  filter(
+    (is.na(year_hms_previous) & is.na(year_hms_previous_resolved)) |
+      (!is.na(year_hms_previous) & year_hms_previous == year_hms_previous_resolved)
+  ) %>% 
+  arrange(polygon_id, polygon_id_previous) %>% 
+  filter(
+    polygon_id_previous == first(polygon_id_previous),
+    .by = polygon_id
+  ) %>% 
+  select(polygon_id, polygon_id_previous)
+```
+
+Rechecking that the point to be adopted is unique.
+
+```{r}
+stopifnot(
+  wsh_new_point_exists %>% 
+    count(polygon_id) %>% 
+    filter(n > 1) %>% 
+    nrow() == 0
+)
+```
+
+
+
+## Defining the additional list of reference points
+
+Generating the points to be added to the previous version of watersurfaces_refpoints.
+We make a distinction between:
+
+- polygons with a new `polygon_id` that overlap existing reference points: they adopt those points;
+- and totally new polygons, which receive a new point.
+
+```{r paged.print = FALSE}
+refpoints_new <- 
+  wsh_new %>% 
+  st_join(
+    refpoints_previous_sf %>% 
+      filter(in_polygon) %>% 
+      # defining which point to take if multiple match:
+      semi_join(
+        wsh_new_point_exists,
+        join_by(polygon_id == polygon_id_previous)
+      ) %>% 
+      select(x, y),
+    join = st_contains
+  ) %>% 
+  st_sf(agr = "identity") %>% 
+  st_point_on_surface() %>% 
+  mutate(
+    x = ifelse(is.na(x), round(st_coordinates(.)[, 1], 2), x),
+    y = ifelse(is.na(y), round(st_coordinates(.)[, 2], 2), y)
+  ) %>%
+  st_drop_geometry() %>% 
+  mutate(in_polygon = TRUE) %>% 
+  relocate(in_polygon, .after = polygon_id)
+refpoints_new
+```
+
+
+
+## Writing new version of watersurfaces_refpoints
+
+Setting target directory.
+
+```{r}
+refpoints_path_current <- file.path(
+  n2khab_data_path,
+  str_c(
+    "20_processed/_versions/watersurfaces_refpoints/watersurfaces_refpoints_",
+    current_version
+  )
+)
+if (!dir.exists(refpoints_path_current)) {
+  dir.create(refpoints_path_current, recursive = TRUE)
+}
+```
+
+Copying previous version into target directory, so that overwriting it with git2rdata will minimize diffs between versions.
+
+```{r}
+file.copy(
+  file.path(
+    refpoints_path_previous,
+    str_c("watersurfaces_refpoints",c(".tsv", ".yml"))),
+  refpoints_path_current,
+  overwrite = TRUE
+) %>% 
+  invisible()
+```
+
+Combining previous and new refpoints and writing as the new version of watersurfaces_refpoints.
+We only use `polygon_id`, `in_polygon`, `x` and `y` columns as these are the ones unique to this data object.
+Spatial attributes can always be added as needed, provided that the CRS is set correctly (EPSG:31370).
+
+```{r}
+refpoints_previous %>% 
+  bind_rows(refpoints_new) %>% 
+  arrange(polygon_id) %>% 
+  write_vc(
+    "watersurfaces_refpoints",
+    root = refpoints_path_current,
+    sorting = "polygon_id",
+    strict = FALSE,
+    digits = 10
+  )
+```
diff --git a/src/update_watersurfaces_refpoints/20_check_result.Rmd b/src/update_watersurfaces_refpoints/20_check_result.Rmd
@@ -0,0 +1,78 @@
+# Check results
+
+```{r}
+watersurfaces_refpoints <- 
+  read_vc("watersurfaces_refpoints", root = refpoints_path_current) %>% 
+  as_tibble()
+```
+
+## Checksums
+
+```{r}
+tibble(algorithm = c("xxh64", "md5", "sha256")) %>% 
+  crossing(
+    tibble(
+      filepath = file.path(
+        refpoints_path_current,
+        str_c("watersurfaces_refpoints",c(".tsv", ".yml")))
+    )
+  ) %>% 
+  mutate(
+    file = basename(filepath),
+    checksum = checksum(filepath, algorithm),
+    .by = c(filepath, algorithm)
+  ) %>% 
+  select(-filepath) %>% 
+  kable()
+```
+
+## Contents
+
+Printing the object.
+
+```{r paged.print = FALSE}
+watersurfaces_refpoints
+```
+
+Are all `polygon_id` values unique?
+
+```{r}
+n_distinct(watersurfaces_refpoints$polygon_id) == nrow(watersurfaces_refpoints)
+```
+
+Are all polygons of the current version of watersurfaces_hab represented?
+
+```{r}
+wsh$watersurfaces_polygons %>% 
+  semi_join(watersurfaces_refpoints, join_by(polygon_id)) %>% 
+  nrow() == nrow(wsh$watersurfaces_polygons)
+```
+
+Are all coordinates unique for the polygons in the current version of watersurfaces_hab?
+
+```{r}
+watersurfaces_refpoints %>% 
+  semi_join(wsh$watersurfaces_polygons, join_by(polygon_id)) %>% 
+  count(x, y) %>% 
+  filter(n > 1) %>% 
+  nrow() == 0
+```
+
+No missing values?
+
+```{r}
+watersurfaces_refpoints %>% 
+  summarize(across(everything(), \(x) all(!is.na(x)))) %>% 
+  as.matrix() %>% 
+  all()
+```
+
+Are all coordinates situated within (or only marginally outside) Flanders?
+
+```{r}
+watersurfaces_refpoints %>% 
+  st_as_sf(coords = c("x", "y"), crs = 31370) %>% 
+  .[read_admin_areas(dsn = "flanders") %>% st_buffer(50), ] %>% 
+  nrow() == nrow(watersurfaces_refpoints)
+```
+