Skip to content

Commit 4f19e7d

Browse files
authored
Adding Unicode support (#21)
* Adding Unicode
* All tests passing
* Bumping the version
* Adding C# + adding tests
* Adding to release notes
1 parent 8609cd2 commit 4f19e7d

21 files changed

+287
-51
lines changed

CHANGELOG.md

+8
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,13 @@
11
# Changelog
22

3+
## [2021-02-10 - Version 1.3.0](https://github.com/matajoh/libnpy/releases/tag/v1.3.0)
4+
5+
New Features:
6+
- Support for Unicode string tensors (npy type 'U')
7+
8+
Breaking change:
9+
- The `CopyFrom` interface for C# Tensors has been changed to take `*Buffer` objects (e.g. `Float32Buffer`) instead of raw arrays
10+
311
## [2021-02-09 - Version 1.2.2](https://github.com/matajoh/libnpy/releases/tag/v1.2.2)
412

513
Improvements:

CSharpWrapper/NumpyIONative.i

+18-14
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
%include "std_vector.i"
1010
%include "std_string.i"
11+
%include "std_wstring.i"
1112
%include "stdint.i"
1213
%include "arrays_csharp.i"
1314
%include "typemaps.i"
@@ -25,7 +26,8 @@ enum class data_type_t : std::uint8_t {
2526
INT64,
2627
UINT64,
2728
FLOAT32,
28-
FLOAT64
29+
FLOAT64,
30+
UNICODE_STRING
2931
};
3032

3133
%rename(Endian) endian_t;
@@ -43,6 +45,13 @@ enum class compression_method_t : std::uint16_t {
4345
DEFLATED = 8
4446
};
4547

48+
%typemap(ctype, out="void *") const wstring * "wchar_t *"
49+
%typemap(imtype,
50+
inattributes="[global::System.Runtime.InteropServices.MarshalAs(UnmanagedType.LPArray, ArraySubType=UnmanagedType.LPStr)]",
51+
outattributes="[return: global::System.Runtime.InteropServices.MarshalAs(UnmanagedType.LPArray, ArraySubType=UnmanagedType.LPStr)]"
52+
) const wstring * "string[]"
53+
%typemap(cstype) const wstring * "string[]"
54+
4655
%template(UInt8Buffer) std::vector<unsigned char>;
4756
%template(Int8Buffer) std::vector<signed char>;
4857
%template(UInt16Buffer) std::vector<unsigned short>;
@@ -53,6 +62,8 @@ enum class compression_method_t : std::uint16_t {
5362
%template(Int64Buffer) std::vector<long long>;
5463
%template(Float32Buffer) std::vector<float>;
5564
%template(Float64Buffer) std::vector<double>;
65+
%apply const std::wstring & {std::wstring &};
66+
%template(UnicodeStringBuffer) std::vector<std::wstring>;
5667

5768
%template(Shape) std::vector<size_t>;
5869

@@ -96,17 +107,6 @@ header_info peek(const std::string& path);
96107
template <typename T>
97108
class tensor {
98109
public:
99-
%apply unsigned char FIXED[] {const unsigned char *source};
100-
%apply signed char FIXED[] {const signed char *source};
101-
%apply unsigned short FIXED[] {const unsigned short *source};
102-
%apply short FIXED[] {const short *source};
103-
%apply unsigned int FIXED[] {const unsigned int *source};
104-
%apply int FIXED[] {const int *source};
105-
%apply unsigned long long FIXED[] {const unsigned long long *source};
106-
%apply long long FIXED[] {const long long *source};
107-
%apply float FIXED[] {const float *source};
108-
%apply double FIXED[] {const double *source};
109-
110110
%exception tensor(const std::string& path) %{
111111
try{
112112
$action
@@ -139,7 +139,7 @@ public:
139139
%rename(Save) save;
140140
void save(const std::string& path, endian_t endian = endian_t::NATIVE);
141141

142-
%exception copy_from(const T* source, size_t nitems) %{
142+
%exception copy_from(const std::vector<T>& source) %{
143143
try{
144144
$action
145145
} catch (std::invalid_argument& e){
@@ -150,7 +150,7 @@ public:
150150

151151
%csmethodmodifiers copy_from "public unsafe override";
152152
%rename(CopyFrom) copy_from;
153-
void copy_from(const T* source, size_t itemCount);
153+
void copy_from(const std::vector<T>& source);
154154

155155
%csmethodmodifiers values "protected override"
156156
%rename(getValues) values;
@@ -223,6 +223,8 @@ public:
223223
%template(Float32Tensor) tensor<float>;
224224
%typemap(csbase) SWIGTYPE "Tensor<double, Float64Buffer>";
225225
%template(Float64Tensor) tensor<double>;
226+
%typemap(csbase) SWIGTYPE "Tensor<string, UnicodeStringBuffer>";
227+
%template(UnicodeStringTensor) tensor<std::wstring>;
226228

227229
%typemap(csbase) SWIGTYPE ""
228230

@@ -261,6 +263,7 @@ public:
261263
%template(Write) write<long long>;
262264
%template(Write) write<float>;
263265
%template(Write) write<double>;
266+
%template(Write) write<std::wstring>;
264267
};
265268

266269
%rename(NPZInputStream) inpzstream;
@@ -329,4 +332,5 @@ public:
329332
%template(ReadInt64) read<long long>;
330333
%template(ReadFloat32) read<float>;
331334
%template(ReadFloat64) read<double>;
335+
%template(ReadUnicodeString) read<std::wstring>;
332336
};

CSharpWrapper/Tensor.cs

+3-4
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,11 @@ namespace NumpyIO
2222
public abstract class Tensor<T, B> where B : IList<T>
2323
{
2424
/// <summary>
25-
/// Copy the data from the provided array. These values will
25+
/// Copy the data from the provided buffer. These values will
2626
/// be copied into the underlying C++ type.
2727
/// </summary>
28-
/// <param name="source">The source array</param>
29-
/// <param name="nitems">The number of items to copy</param>
30-
public abstract void CopyFrom(T[] source, uint nitems);
28+
/// <param name="source">The source buffer</param>
29+
public abstract void CopyFrom(B source);
3130

3231
/// <summary>
3332
/// Save the tensor to the provided location on the disk.

RELEASE_NOTES

+5-2
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,5 @@
1-
Improvements:
2-
- Bug fix for a missing comma on 1d shape
1+
New Features:
2+
- Support for Unicode string tensors (npy type 'U')
3+
4+
Breaking change:
5+
- The `CopyFrom` interface for C# Tensors has been changed to take `*Buffer` objects (e.g. `Float32Buffer`) instead of raw arrays

VERSION

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1.2.2
1+
1.3.0

assets/test/test.npz

626 Bytes
Binary file not shown.

assets/test/test_compressed.npz

283 Bytes
Binary file not shown.

assets/test/unicode.npy

528 Bytes
Binary file not shown.

include/npy/core.h

+3-1
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,9 @@ enum class data_type_t : std::uint8_t
6666
/** 32-bit floating point value (float) */
6767
FLOAT32,
6868
/** 64-bit floating point value (double) */
69-
FLOAT64
69+
FLOAT64,
70+
/** Unicode string (std::wstring) */
71+
UNICODE_STRING
7072
};
7173

7274
/** Convert a data type and endianness to a NPY dtype string.

include/npy/npy.h

+130-22
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,9 @@ struct header_info
5656

5757
/** A vector of values indicating the shape of each dimension of the tensor. */
5858
std::vector<size_t> shape;
59+
60+
/** Value used to indicate the maximum length of an element (used by Unicode strings) */
61+
std::size_t max_element_length;
5962
};
6063

6164
/** Writes an NPY header to the provided stream.
@@ -110,6 +113,25 @@ void write_npy_header(std::basic_ostream<CHAR> &output,
110113
output.write(reinterpret_cast<const CHAR *>(end.data()), end.length());
111114
}
112115

116+
template<typename T, typename CHAR>
117+
void copy_to(const T* data_ptr, std::size_t num_elements, std::basic_ostream<CHAR>& output, npy::endian_t endianness)
118+
{
119+
if (endianness == npy::endian_t::NATIVE || endianness == native_endian())
120+
{
121+
output.write(reinterpret_cast<const CHAR *>(data_ptr), num_elements * sizeof(T));
122+
}
123+
else
124+
{
125+
CHAR buffer[sizeof(T)];
126+
for (auto curr = data_ptr; curr < data_ptr + num_elements; ++curr)
127+
{
128+
const CHAR *start = reinterpret_cast<const CHAR *>(curr);
129+
std::reverse_copy(start, start + sizeof(T), buffer);
130+
output.write(buffer, sizeof(T));
131+
}
132+
}
133+
}
134+
113135
/** Saves a tensor to the provided stream.
114136
* \tparam T the data type
115137
* \tparam TENSOR the tensor type.
@@ -120,32 +142,72 @@ void write_npy_header(std::basic_ostream<CHAR> &output,
120142
*/
121143
template <typename T,
122144
template <typename> class TENSOR,
123-
typename CHAR>
145+
typename CHAR,
146+
std::enable_if_t<!std::is_same<std::wstring, T>::value, int> = 42>
124147
void save(std::basic_ostream<CHAR> &output,
125148
const TENSOR<T> &tensor,
126149
endian_t endianness = npy::endian_t::NATIVE)
127150
{
128151
auto dtype = to_dtype(tensor.dtype(), endianness);
129152
write_npy_header(output, dtype, tensor.fortran_order(), tensor.shape());
153+
copy_to(tensor.data(), tensor.size(), output, endianness);
154+
};
130155

131-
if (endianness == npy::endian_t::NATIVE ||
132-
endianness == native_endian() ||
133-
dtype[0] == '|')
156+
/** Saves a Unicode string tensor to the provided stream.
157+
* \tparam TENSOR the tensor type.
158+
* \param output the output stream
159+
* \param tensor the tensor
160+
* \param endianness the endianness to use in saving the tensor
161+
* \sa npy::tensor
162+
*/
163+
template <typename T,
164+
template <typename> class TENSOR,
165+
typename CHAR,
166+
std::enable_if_t<std::is_same<std::wstring, T>::value, int> = 42>
167+
void save(std::basic_ostream<CHAR> &output,
168+
const TENSOR<std::wstring> &tensor,
169+
endian_t endianness = npy::endian_t::NATIVE)
170+
{
171+
std::size_t max_length = 0;
172+
for(const auto& element : tensor)
134173
{
135-
output.write(reinterpret_cast<const CHAR *>(tensor.data()), tensor.size() * sizeof(T));
174+
if(element.size() > max_length)
175+
{
176+
max_length = element.size();
177+
}
136178
}
137-
else
179+
180+
if(endianness == npy::endian_t::NATIVE)
138181
{
139-
CHAR buffer[sizeof(T)];
140-
for (auto curr = tensor.data(); curr < tensor.data() + tensor.size(); ++curr)
182+
endianness = native_endian();
183+
}
184+
185+
std::string dtype = ">U" + std::to_string(max_length);
186+
if(endianness == npy::endian_t::LITTLE)
187+
{
188+
dtype = "<U" + std::to_string(max_length);
189+
}
190+
191+
write_npy_header(output, dtype, tensor.fortran_order(), tensor.shape());
192+
193+
std::vector<std::int32_t> unicode(tensor.size() * max_length, 0);
194+
auto word_start = unicode.begin();
195+
for(const auto& element : tensor)
196+
{
197+
auto char_it = word_start;
198+
for(const auto& wchar : element)
141199
{
142-
const CHAR *start = reinterpret_cast<const CHAR *>(curr);
143-
std::reverse_copy(start, start + sizeof(T), buffer);
144-
output.write(buffer, sizeof(T));
200+
*char_it = static_cast<std::int32_t>(wchar);
201+
char_it += 1;
145202
}
203+
204+
word_start += max_length;
146205
}
206+
207+
copy_to(unicode.data(), unicode.size(), output, endianness);
147208
};
148209

210+
149211
/** Saves a tensor to the provided location on disk.
150212
* \tparam T the data type
151213
* \tparam TENSOR the tensor type.
@@ -166,7 +228,7 @@ void save(const std::string &path,
166228
throw std::invalid_argument("path");
167229
}
168230

169-
save(output, tensor, endianness);
231+
save<T, TENSOR, char>(output, tensor, endianness);
170232
};
171233

172234
/** Read an NPY header from the provided stream.
@@ -202,6 +264,26 @@ header_info read_npy_header(std::basic_istream<CHAR> &input)
202264
return header_info(dictionary);
203265
}
204266

267+
template <typename T, typename CHAR>
268+
void copy_to(std::basic_istream<CHAR> &input, T* data_ptr, std::size_t num_elements, npy::endian_t endianness)
269+
{
270+
if (endianness == npy::endian_t::NATIVE || endianness == native_endian())
271+
{
272+
CHAR *start = reinterpret_cast<CHAR *>(data_ptr);
273+
input.read(start, num_elements * sizeof(T));
274+
}
275+
else
276+
{
277+
CHAR buffer[sizeof(T)];
278+
for (auto curr = data_ptr; curr < data_ptr + num_elements; ++curr)
279+
{
280+
input.read(buffer, sizeof(T));
281+
CHAR *start = reinterpret_cast<CHAR *>(curr);
282+
std::reverse_copy(buffer, buffer + sizeof(T), start);
283+
}
284+
}
285+
}
286+
205287
/** Loads a tensor in NPY format from the provided stream. The type of the tensor
206288
* must match the data to be read.
207289
* \tparam T the data type
@@ -212,7 +294,8 @@ header_info read_npy_header(std::basic_istream<CHAR> &input)
212294
*/
213295
template <typename T,
214296
template <typename> class TENSOR,
215-
typename CHAR>
297+
typename CHAR,
298+
std::enable_if_t<!std::is_same<std::wstring, T>::value, int> = 42>
216299
TENSOR<T> load(std::basic_istream<CHAR> &input)
217300
{
218301
header_info info = read_npy_header(input);
@@ -222,20 +305,45 @@ TENSOR<T> load(std::basic_istream<CHAR> &input)
222305
throw std::logic_error("requested dtype does not match stream's dtype");
223306
}
224307

225-
if (info.endianness == npy::endian_t::NATIVE || info.endianness == native_endian())
308+
copy_to(input, tensor.data(), tensor.size(), info.endianness);
309+
return tensor;
310+
}
311+
312+
313+
/** Loads a Unicode string tensor in NPY format from the provided stream. The type of the tensor
314+
* must match the data to be read.
315+
* \tparam T the data type
316+
* \tparam TENSOR the tensor type
317+
* \param input the input stream
318+
* \return an object of type TENSOR<T> read from the stream
319+
* \sa npy::tensor
320+
*/
321+
template <typename T,
322+
template <typename> class TENSOR,
323+
typename CHAR,
324+
std::enable_if_t<std::is_same<std::wstring, T>::value, int> = 42>
325+
TENSOR<T> load(std::basic_istream<CHAR> &input)
326+
{
327+
header_info info = read_npy_header(input);
328+
TENSOR<T> tensor(info.shape, info.fortran_order);
329+
if (info.dtype != tensor.dtype())
226330
{
227-
CHAR *start = reinterpret_cast<CHAR *>(tensor.data());
228-
input.read(start, tensor.size() * sizeof(T));
331+
throw std::logic_error("requested dtype does not match stream's dtype");
229332
}
230-
else
333+
334+
std::vector<std::int32_t> unicode(tensor.size() * info.max_element_length, 0);
335+
copy_to(input, unicode.data(), unicode.size(), info.endianness);
336+
337+
auto word_start = unicode.begin();
338+
for(auto& element : tensor)
231339
{
232-
CHAR buffer[sizeof(T)];
233-
for (auto curr = tensor.data(); curr < tensor.data() + tensor.size(); ++curr)
340+
auto char_it = word_start;
341+
for(std::size_t i=0; i<info.max_element_length && *char_it > 0; ++i, ++char_it)
234342
{
235-
input.read(buffer, sizeof(T));
236-
CHAR *start = reinterpret_cast<CHAR *>(curr);
237-
std::reverse_copy(buffer, buffer + sizeof(T), start);
343+
element.push_back(static_cast<wchar_t>(*char_it));
238344
}
345+
346+
word_start += info.max_element_length;
239347
}
240348

241349
return tensor;

include/npy/npz.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ class onpzstream
9595
}
9696

9797
omemstream output;
98-
save(output, tensor);
98+
save<T, TENSOR, omemstream::char_type>(output, tensor);
9999

100100
std::string suffix = ".npy";
101101
std::string name = filename;

0 commit comments

Comments
 (0)