1
1
import functools
2
2
import itertools
3
+ import warnings
3
4
from contextlib import contextmanager
4
5
from dataclasses import dataclass
5
6
from pathlib import Path
20
21
import numpy as np
21
22
import xarray as xr
22
23
from cyvcf2 import VCF , Variant
23
- from numcodecs import Blosc , PackBits
24
+ from numcodecs import PackBits
24
25
25
26
from sgkit import variables
26
27
from sgkit .io .utils import zarrs_to_dataset
40
41
3 # equivalent to DEFAULT_ALT_NUMBER in vcf_read.py in scikit_allel
41
42
)
42
43
44
+ try :
45
+ from numcodecs import Blosc
46
+
47
+ DEFAULT_COMPRESSOR = Blosc (cname = "zstd" , clevel = 7 , shuffle = Blosc .AUTOSHUFFLE )
48
+ except ImportError : # pragma: no cover
49
+ warnings .warn ("Cannot import Blosc, falling back to no compression" , RuntimeWarning )
50
+ DEFAULT_COMPRESSOR = None
51
+
43
52
44
53
@contextmanager
45
54
def open_vcf (path : PathType ) -> Iterator [VCF ]:
@@ -332,9 +341,7 @@ def vcf_to_zarr_sequential(
332
341
region : Optional [str ] = None ,
333
342
chunk_length : int = 10_000 ,
334
343
chunk_width : int = 1_000 ,
335
- compressor : Optional [Any ] = Blosc (
336
- cname = "zstd" , clevel = 7 , shuffle = Blosc .AUTOSHUFFLE
337
- ),
344
+ compressor : Optional [Any ] = DEFAULT_COMPRESSOR ,
338
345
encoding : Optional [Any ] = None ,
339
346
ploidy : int = 2 ,
340
347
mixed_ploidy : bool = False ,
@@ -478,7 +485,6 @@ def get_chunk_size(dim: Hashable, size: int) -> int:
478
485
# values from function args (encoding) take precedence over default_encoding
479
486
encoding = encoding or {}
480
487
merged_encoding = {** default_encoding , ** encoding }
481
- print (merged_encoding )
482
488
483
489
ds .to_zarr (output , mode = "w" , encoding = merged_encoding )
484
490
first_variants_chunk = False
@@ -493,9 +499,7 @@ def vcf_to_zarr_parallel(
493
499
regions : Union [None , Sequence [str ], Sequence [Optional [Sequence [str ]]]],
494
500
chunk_length : int = 10_000 ,
495
501
chunk_width : int = 1_000 ,
496
- compressor : Optional [Any ] = Blosc (
497
- cname = "zstd" , clevel = 7 , shuffle = Blosc .AUTOSHUFFLE
498
- ),
502
+ compressor : Optional [Any ] = DEFAULT_COMPRESSOR ,
499
503
encoding : Optional [Any ] = None ,
500
504
temp_chunk_length : Optional [int ] = None ,
501
505
tempdir : Optional [PathType ] = None ,
@@ -549,9 +553,7 @@ def vcf_to_zarrs(
549
553
regions : Union [None , Sequence [str ], Sequence [Optional [Sequence [str ]]]],
550
554
chunk_length : int = 10_000 ,
551
555
chunk_width : int = 1_000 ,
552
- compressor : Optional [Any ] = Blosc (
553
- cname = "zstd" , clevel = 7 , shuffle = Blosc .AUTOSHUFFLE
554
- ),
556
+ compressor : Optional [Any ] = DEFAULT_COMPRESSOR ,
555
557
encoding : Optional [Any ] = None ,
556
558
output_storage_options : Optional [Dict [str , str ]] = None ,
557
559
ploidy : int = 2 ,
@@ -690,9 +692,7 @@ def vcf_to_zarr(
690
692
regions : Union [None , Sequence [str ], Sequence [Optional [Sequence [str ]]]] = None ,
691
693
chunk_length : int = 10_000 ,
692
694
chunk_width : int = 1_000 ,
693
- compressor : Optional [Any ] = Blosc (
694
- cname = "zstd" , clevel = 7 , shuffle = Blosc .AUTOSHUFFLE
695
- ),
695
+ compressor : Optional [Any ] = DEFAULT_COMPRESSOR ,
696
696
encoding : Optional [Any ] = None ,
697
697
temp_chunk_length : Optional [int ] = None ,
698
698
tempdir : Optional [PathType ] = None ,
0 commit comments