
Fix/gfs preprocessing #83

Open · wants to merge 5 commits into main

50 changes: 50 additions & 0 deletions docs/bug_fixes/gfs_preprocessing_fix.md
@@ -0,0 +1,50 @@
# GFS Preprocessing Data Format Fix

## Issues
1. Dimension naming conflict between 'variable' and 'channel'
2. Longitude range mismatch (-180 to 180 vs 0 to 360)
3. Data structure incompatibility with model expectations
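
For illustration, the conversion used in the fix below maps longitudes from the [-180, 180] convention into [0, 360) (a minimal sketch with made-up sample values):

```python
# Hypothetical sample longitudes in the [-180, 180] convention
lons = [-11.0, -5.0, 0.0, 2.0, 179.75]

# Same formula as the preprocessing fix below
converted = [(lon + 360) % 360 for lon in lons]
print(converted)  # [349.0, 355.0, 0.0, 2.0, 179.75]
```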

## Solution

### 1. Correct Preprocessing Script
```python
import xarray as xr


def preprocess_gfs(year: int) -> xr.DataArray:
    """Load a year of GFS data and return a channel-stacked DataArray in the expected format."""
    gfs = xr.open_mfdataset(f"/mnt/storage_b/nwp/gfs/global/{year}*.zarr.zip", engine="zarr")

    # Fix longitude range: map [-180, 180] onto [0, 360), then sort so that
    # label-based slicing works on a monotonic coordinate
    gfs["longitude"] = (gfs["longitude"] + 360) % 360
    gfs = gfs.sortby("longitude")

    # Select the UK latitude band (longitude is kept over the full [0, 360) range here)
    gfs = gfs.sel(
        latitude=slice(65, 45),
        longitude=slice(0, 360),
    )

    # Stack variables into a channel dimension (use "channel" instead of "variable")
    gfs = gfs.to_array(dim="channel")

    # Optimize chunking
    gfs = gfs.chunk({
        "init_time_utc": len(gfs.init_time_utc),
        "step": 10,
        "latitude": 1,
        "longitude": 1,
    })

    return gfs
```
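
A usage sketch, inspecting the result before writing it out:

```python
gfs_2023 = preprocess_gfs(2023)
print(gfs_2023.sizes)           # dimension lengths, including the new "channel" dimension
print(gfs_2023.channel.values)  # names of the stacked variables
```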

### 2. Expected Data Structure
- Dimensions: (init_time_utc, step, channel, latitude, longitude)
- Longitude range: [0, 360)
- Single stacked DataArray with channel dimension
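
For illustration, a minimal sketch (toy data, hypothetical variable names) of how `to_array` stacks a Dataset's variables into a single DataArray with a `channel` dimension:

```python
import numpy as np
import xarray as xr

# Toy dataset with two hypothetical variables on a shared grid
ds = xr.Dataset(
    {
        "t2m": (("latitude", "longitude"), np.zeros((3, 4))),
        "dswrf": (("latitude", "longitude"), np.ones((3, 4))),
    },
    coords={"latitude": [52.0, 51.0, 50.0], "longitude": [355.0, 356.0, 357.0, 358.0]},
)

da = ds.to_array(dim="channel")
print(da.dims)            # ('channel', 'latitude', 'longitude')
print(da.channel.values)  # ['t2m' 'dswrf']

# to_array puts the new dimension first; reorder with .transpose(...) if the
# model expects a different dimension order.
```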

### 3. Verification
```python
import xarray as xr

ds = xr.open_zarr("path/to/gfs.zarr")
assert "channel" in ds.dims
assert 0 <= ds.longitude.min() < ds.longitude.max() < 360, "longitude must lie in [0, 360)"
```
12 changes: 12 additions & 0 deletions docs/getting_started.md
@@ -646,6 +646,18 @@ open-data-pvnet metoffice load --year 2023 --month 12 --day 1 --region uk
open-data-pvnet metoffice load --year 2023 --month 1 --day 16 --region uk --remote
```

#### GFS Data Processing
```bash
# Process and archive GFS data for a specific year
open-data-pvnet gfs archive --year 2023

# Process specific month with upload to HuggingFace
open-data-pvnet gfs archive --year 2023 --month 12 --overwrite

# Process without uploading to HuggingFace
open-data-pvnet gfs archive --year 2023 --skip-upload
```
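
These commands dispatch to `process_gfs_data` in `src/open_data_pvnet/nwp/gfs.py`; a rough sketch of the equivalent direct call from Python:

```python
from open_data_pvnet.nwp.gfs import process_gfs_data

# Roughly equivalent to: open-data-pvnet gfs archive --year 2023 --month 12 --overwrite
process_gfs_data(year=2023, month=12, overwrite=True)
```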

### Error Handling
Common error messages and their solutions:
- "No datasets found": Check if the specified date has available data
34 changes: 32 additions & 2 deletions src/open_data_pvnet/main.py
@@ -15,6 +15,7 @@
from open_data_pvnet.scripts.archive import handle_archive
from open_data_pvnet.nwp.met_office import CONFIG_PATHS
from open_data_pvnet.nwp.dwd import process_dwd_data
from open_data_pvnet.nwp.gfs import process_gfs_data

logger = logging.getLogger(__name__)

@@ -203,6 +204,24 @@ def configure_parser():
consolidate_parser = operation_subparsers.add_parser("consolidate", help="Consolidate data")
_add_common_arguments(consolidate_parser, provider)

    # Add GFS parser
    gfs_parser = subparsers.add_parser("gfs", help="Commands for GFS data")
    operation_subparsers = gfs_parser.add_subparsers(dest="operation", help="Operation to perform")

    # Archive operation parser for GFS
    archive_parser = operation_subparsers.add_parser("archive", help="Archive GFS data")
    archive_parser.add_argument("--year", type=int, required=True, help="Year to process")
    archive_parser.add_argument("--month", type=int, help="Month to process")
    archive_parser.add_argument("--day", type=int, help="Day to process")
    archive_parser.add_argument("--skip-upload", action="store_true", help="Skip uploading to HuggingFace")
    archive_parser.add_argument("--overwrite", "-o", action="store_true", help="Overwrite existing files")
    archive_parser.add_argument(
        "--archive-type",
        choices=["zarr.zip", "tar"],
        default="zarr.zip",
        help="Type of archive to create",
    )

    return parser


@@ -419,7 +438,8 @@ def main():
open-data-pvnet metoffice consolidate --year 2023 --month 12 --day 1

GFS Data:
# Archive GFS data for a specific day
open-data-pvnet gfs archive --year 2023 --month 1 --day 1 --skip-upload

DWD Data:
# Archive DWD data for a specific day
@@ -496,7 +516,17 @@ def main():
"overwrite": args.overwrite,
"archive_type": getattr(args, "archive_type", "zarr.zip"),
}
        if args.command == "gfs":
            process_gfs_data(
                year=args.year,
                month=args.month,
                day=args.day,
                skip_upload=args.skip_upload,
                overwrite=args.overwrite,
                archive_type=args.archive_type,
            )
        else:
            archive_to_hf(**archive_kwargs)

return 0

101 changes: 98 additions & 3 deletions src/open_data_pvnet/nwp/gfs.py
@@ -1,8 +1,103 @@
import xarray as xr
import logging
from pathlib import Path
from open_data_pvnet.utils.data_uploader import upload_to_huggingface

logger = logging.getLogger(__name__)

def process_gfs_data(
    year: int,
    month: int = None,
    day: int = None,
    skip_upload: bool = False,
    overwrite: bool = False,
    archive_type: str = "zarr.zip",
) -> None:
    """
    Process GFS data for a given time period and optionally upload to HuggingFace.

    Args:
        year (int): Year to process
        month (int, optional): Month to process. If None, processes entire year
        day (int, optional): Day to process. If None, processes entire month/year
        skip_upload (bool): If True, skips uploading to HuggingFace
        overwrite (bool): If True, overwrites existing files
        archive_type (str): Type of archive to create ("zarr.zip" or "tar")
    """
    try:
        # Load GFS data
        gfs = xr.open_mfdataset(
            f"/mnt/storage_b/nwp/gfs/global/{year}*.zarr.zip",
            engine="zarr",
        )
        logger.info(f"Loaded GFS data for {year}")

        # Fix longitude range, sort the wrapped coordinate so slicing works,
        # then select the UK region
        gfs["longitude"] = (gfs["longitude"] + 360) % 360
        gfs = gfs.sortby("longitude")
        gfs = gfs.sel(
            latitude=slice(65, 45),
            longitude=slice(350, 362),  # UK region in [0, 360) range
        )

        # Stack variables into channel dimension
        gfs = gfs.to_array(dim="channel")

        # Optimize chunking
        chunk_sizes = {
            "init_time_utc": 1,
            "step": 4,
            "channel": -1,  # Keep all channels together
            "latitude": 1,
            "longitude": 1,
        }
        gfs = gfs.chunk(chunk_sizes)

        # Save locally
        output_dir = Path(f"data/gfs/uk/gfs_uk_{year}.zarr")
        output_dir.parent.mkdir(parents=True, exist_ok=True)
        gfs.to_zarr(output_dir, mode="w")
        logger.info(f"Saved processed data to {output_dir}")

        # Upload to HuggingFace if requested
        if not skip_upload:
            upload_to_huggingface(
                config_path=Path("config.yaml"),
                folder_name=str(year),
                year=year,
                month=month if month else 1,
                day=day if day else 1,
                overwrite=overwrite,
                archive_type=archive_type,
            )
            logger.info("Upload to HuggingFace completed")

    except Exception as e:
        logger.error(f"Error processing GFS data: {e}")
        raise

def verify_gfs_data(zarr_path: str) -> bool:
    """
    Verify that the processed GFS data meets the expected format.

    Args:
        zarr_path (str): Path to the zarr file to verify

    Returns:
        bool: True if verification passes
    """
    try:
        ds = xr.open_zarr(zarr_path)

        # Verify dimensions
        required_dims = {"init_time_utc", "latitude", "longitude", "channel"}
        if not all(dim in ds.dims for dim in required_dims):
            raise ValueError(f"Dataset missing required dimensions: {required_dims}")

        # Verify longitude range
        assert 0 <= ds.longitude.min() < ds.longitude.max() < 360, \
            "Longitude range must be [0, 360)"

        return True
    except Exception as e:
        logger.error(f"Verification failed: {e}")
        return False
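
A minimal usage sketch for the verification helper added above (the path mirrors the output location used by `process_gfs_data`, but is only an example):

```python
from open_data_pvnet.nwp.gfs import verify_gfs_data

if verify_gfs_data("data/gfs/uk/gfs_uk_2023.zarr"):
    print("GFS store matches the expected format")
```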