initializeHdf5Write and writeDictToHdf5Chunk modified to be able to write several groups in a file (#38)

* initializeHdf5Write and writeDictToHdf5Chunk modified to be able to write several groups in a file

* initializeHdf5Write and writeDictToHdf5Chunk modified to be able to write dictionaries of dictionaries

* Modifying unit test, and adding explanation to initializeHdf5Write

* Fixing the tests for the new input formats
joselotl authored May 16, 2022
1 parent a107f0d commit f73b06f
Showing 3 changed files with 40 additions and 27 deletions.
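Before the per-file diffs, a minimal sketch of the new calling convention introduced by this commit (assumptions: the `tables_io.io` alias used in the notebook below; the file name and shapes are illustrative):

```python
import tables_io

# Each keyword now names an HDF5 group; its value maps dataset names to ((shape), dtype).
groups, fout = tables_io.io.initializeHdf5Write(
    'test_multi_write.hdf5',
    data=dict(scalar=((10000,), 'f4'), vect=((10000, 3), 'f4')),
)
# 'groups' maps each group name to its h5py group; 'fout' is the open h5py.File.
```

Previously the function took a single `groupname` argument and a flat set of dataset keywords; the diffs below update the example notebook, the library, and the unit test to the nested form.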
20 changes: 15 additions & 5 deletions nb/multipleWriteHdf5_example.ipynb
@@ -85,7 +85,17 @@
"metadata": {},
"outputs": [],
"source": [
"dout = set_lengths(get_shapes_and_type(data['data']), 10000)"
"dout = {'data':set_lengths(get_shapes_and_type(data['data']), 10000)}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "71602bd8",
"metadata": {},
"outputs": [],
"source": [
"print(dout)"
]
},
{
@@ -104,7 +114,7 @@
"outputs": [],
"source": [
"os.unlink('test_multi_write.hdf5')\n",
"group, fout = tables_io.io.initializeHdf5Write('test_multi_write.hdf5', 'data', **dout)"
"groups, fout = tables_io.io.initializeHdf5Write('test_multi_write.hdf5', **dout)"
]
},
{
@@ -126,7 +136,7 @@
" data = make_test_data()\n",
" start = i*1000\n",
" end = (i+1)*1000\n",
" tables_io.io.writeDictToHdf5Chunk(group, data['data'], start, end)"
" tables_io.io.writeDictToHdf5Chunk(groups, data, start, end)"
]
},
{
@@ -186,7 +196,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -200,7 +210,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
"version": "3.9.7"
}
},
"nbformat": 4,
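The notebook cells above allocate 10000 rows and fill them in ten 1000-row chunks via writeDictToHdf5Chunk. A self-contained sketch of the same pattern, with random arrays standing in for the notebook's make_test_data helper (names and values are illustrative):

```python
import numpy as np
import tables_io

nrows, chunk = 10000, 1000
groups, fout = tables_io.io.initializeHdf5Write(
    'test_multi_write.hdf5',
    data=dict(scalar=((nrows,), 'f4'), vect=((nrows, 3), 'f4')),
)

for i in range(nrows // chunk):
    start, end = i * chunk, (i + 1) * chunk
    # The chunk dictionary mirrors the group layout: {group_name: {dataset_name: array}}.
    data = {'data': dict(scalar=np.random.uniform(size=chunk).astype('f4'),
                         vect=np.random.uniform(size=(chunk, 3)).astype('f4'))}
    tables_io.io.writeDictToHdf5Chunk(groups, data, start, end)

fout.close()
```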
41 changes: 22 additions & 19 deletions tables_io/ioUtils.py
@@ -79,15 +79,13 @@ def getInputDataLengthHdf5(filepath, groupname=None):
return nrow


def initializeHdf5Write(filepath, groupname=None, **kwds):
def initializeHdf5Write(filepath, **kwds):
""" Prepares an hdf5 file for output
Parameters
----------
filepath : `str`
The output file name
groupname : `str` or `None`
The output group name
Returns
-------
@@ -98,34 +96,38 @@ def initializeHdf5Write(filepath, groupname=None, **kwds):
Notes
-----
The keywords should be used to create_datasets within the hdf5 file.
Each keyword should provide a tuple of ( (shape), (dtype) )
The keywords should be used to create groups within the hdf5 file.
Each keyword should provide a dictionary with the data set information of the form:
group = {'data1' : ( (shape1), (dtype1) ), 'data2' : ( (shape2), (dtype2) )}
group : `str`
Name of the Hdf5 group
data : `str`
Name of the column to be written
shape : `tuple` ( `int` )
The shape of the data for this dataset
dtype : `str`
The data type for this dataset
For example
`initialize_writeout('test.hdf5', scalar=((100000,), 'f4'), vect=((100000, 3), 'f4'))`
`initializeHdf5Write('test.hdf5', data = dict(scalar=((100000,), 'f4'), vect=((100000, 3), 'f4')))`
Would initialize an hdf5 file with two datasets, with shapes and data types as given
Would initialize an hdf5 file with one group and two datasets, with shapes and data types as given
"""
outdir = os.path.dirname(os.path.abspath(filepath))
if not os.path.exists(outdir): #pragma: no cover
os.makedirs(outdir, exist_ok=True)
outf = h5py.File(filepath, "w")
if groupname is None: #pragma: no cover
group = outf
else:
group = outf.create_group(groupname)

groups = {}
for k, v in kwds.items():
group.create_dataset(k, v[0], v[1])
return group, outf
group = outf.create_group(k)
groups[k] = group
for key, shape in v.items():
group.create_dataset(key, shape[0], shape[1])
return groups, outf


def writeDictToHdf5Chunk(fout, odict, start, end, **kwds):
def writeDictToHdf5Chunk(groups, odict, start, end, **kwds):
""" Writes a data chunk to an hdf5 file
Parameters
@@ -156,9 +158,10 @@ def writeDictToHdf5Chunk(fout, odict, start, end, **kwds):
I.e., if `key` is present in kwds in will override the name.
"""
for key, val in odict.items():
k_out = kwds.get(key, key)
fout[k_out][start:end] = val
for group_name, group in groups.items():
for key, val in odict[group_name].items():
k_out = kwds.get(key, key)
group[k_out][start:end] = val


def finalizeHdf5Write(fout, groupname=None, **kwds):
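As the nested loop above shows, writeDictToHdf5Chunk now expects a dictionary of dictionaries keyed by group name, and the extra keyword arguments still rename in-memory columns to on-disk dataset names. A hedged sketch mirroring the updated test below (the file name, array sizes, and zero-filled values are illustrative):

```python
import numpy as np
import tables_io

npdf, nbins = 40, 21
groups, fout = tables_io.io.initializeHdf5Write(
    'test_out.hdf5',
    data=dict(photoz_mode=((npdf,), 'f4'), photoz_pdf=((npdf, nbins), 'f4')),
)

# In-memory columns 'zmode' and 'pz_pdf' are redirected to the datasets
# 'photoz_mode' and 'photoz_pdf' inside the 'data' group by the keywords below.
data_dict = {'data': dict(zmode=np.zeros(npdf, dtype='f4'),
                          pz_pdf=np.zeros((npdf, nbins), dtype='f4'))}
tables_io.io.writeDictToHdf5Chunk(groups, data_dict, 0, npdf,
                                  zmode='photoz_mode', pz_pdf='photoz_pdf')
tables_io.io.finalizeHdf5Write(fout, 'md', zgrid=np.linspace(0, 4, nbins))
```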
6 changes: 3 additions & 3 deletions tests/test_fileIO.py
@@ -52,10 +52,10 @@ def test_write_output_file():
zgrid = np.linspace(0, 4, nbins)
zmode = zgrid[np.argmax(pz_pdf, axis=1)]

data_dict = dict(zmode=zmode, pz_pdf=pz_pdf)
data_dict = {'data': dict(zmode=zmode, pz_pdf=pz_pdf)}

group, outf = io.initializeHdf5Write(test_outfile, 'data', photoz_mode=((npdf,), 'f4'), photoz_pdf=((npdf, nbins), 'f4'))
io.writeDictToHdf5Chunk(group, data_dict, 0, npdf, zmode='photoz_mode', pz_pdf='photoz_pdf')
groups, outf = io.initializeHdf5Write(test_outfile, data = dict(photoz_mode=((npdf,), 'f4'), photoz_pdf=((npdf, nbins), 'f4')))
io.writeDictToHdf5Chunk(groups, data_dict, 0, npdf, zmode='photoz_mode', pz_pdf='photoz_pdf')
io.finalizeHdf5Write(outf, 'md', zgrid=zgrid)

os.unlink(test_outfile)
