|
7 | 7 | import numcodecs |
8 | 8 | import numpy as np |
9 | 9 | import pandas as pd |
10 | | -import xarray as xr |
| 10 | +import zarr |
11 | 11 |
|
12 | | -from . import core |
| 12 | +from . import core, provenance |
13 | 13 |
|
14 | 14 | logger = logging.getLogger(__name__) |
15 | 15 |
|
16 | 16 | DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7) |
| 17 | +BED_ZARR_VERSION = 0.1 |
17 | 18 |
|
18 | 19 |
|
19 | 20 | class BedType(Enum): |
@@ -200,18 +201,29 @@ def bed2zarr( |
200 | 201 | fields = update_field_bounds(data, bed_type) |
201 | 202 | dtypes = {f.name: f.smallest_dtype() for f in fields} |
202 | 203 | data.index.name = "records" |
203 | | - ds = xr.Dataset.from_dataframe(data) |
204 | | - for k, v in dtypes.items(): |
205 | | - ds[k] = ds[k].astype(v) |
206 | | - if records_chunk_size is None: |
207 | | - records_chunk_size = len(data) |
208 | | - chunks = { |
209 | | - "records": records_chunk_size, |
210 | | - "contigs": len(contig_id), |
211 | | - } |
212 | | - ds["contig_id"] = xr.DataArray(contig_id, dims=["contigs"]) |
| 204 | + data = data.astype(dtypes) |
| 205 | + store = zarr.DirectoryStore(zarr_path) |
| 206 | + root = zarr.group(store=store) |
| 207 | + root.attrs.update( |
| 208 | + { |
| 209 | + "bed_zarr_version": f"{BED_ZARR_VERSION}", |
| 210 | + "source": f"bio2zarr-{provenance.__version__}", |
| 211 | + } |
| 212 | + ) |
| 213 | + for field in fields[0 : bed_type.value]: |
| 214 | + if field.name == "strand": |
| 215 | + root.array( |
| 216 | + field.name, |
| 217 | + data[field.name].values, |
| 218 | + chunks=(records_chunk_size,), |
| 219 | + dtype="<U1", |
| 220 | + ) |
| 221 | + else: |
| 222 | + root.array( |
| 223 | + field.name, |
| 224 | + data[field.name].values, |
| 225 | + chunks=(records_chunk_size,), |
| 226 | + ) |
| 227 | + root.array("contig_id", contig_id, chunks=(len(contig_id),)) |
213 | 228 | if bed_type.value >= BedType.BED4.value: |
214 | | - ds["name_id"] = xr.DataArray(name_id, dims=["names"]) |
215 | | - chunks["names"] = len(name_id) |
216 | | - ds = ds.chunk(chunks) |
217 | | - ds.to_zarr(zarr_path, mode="w") |
| 229 | + root.array("name_id", name_id, chunks=(len(name_id),)) |
0 commit comments