-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathnatsort.m
332 lines (331 loc) · 11.9 KB
/
natsort.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
function [X,ndx,dbg] = natsort(X,rgx,varargin)
% Alphanumeric / Natural-Order sort the strings in a cell array of strings (1xN char).
%
% (c) 2012-2019 Stephen Cobeldick
%
% Alphanumeric sort a cell array of strings: sorts by character order and
% also by the values of any number substrings. Default: match all integer
% number substrings and perform a case-insensitive ascending sort.
%
%%% Example:
% >> X = {'x2', 'x10', 'x1'};
% >> sort(X)
% ans = 'x1' 'x10' 'x2'
% >> natsort(X)
% ans = 'x1' 'x2' 'x10'
%
%%% Syntax:
% Y = natsort(X)
% Y = natsort(X,rgx)
% Y = natsort(X,rgx,<options>)
% [Y,ndx,dbg] = natsort(X,...)
%
% To sort filenames or filepaths use NATSORTFILES (FEX 47434).
% To sort the rows of a cell array of strings use NATSORTROWS (FEX 47433).
%
%% Number Substrings %%
%
% By default consecutive digit characters are interpreted as an integer.
% Specifying the optional regular expression pattern allows the numbers to
% include a +/- sign, decimal digits, exponent E-notation, quantifiers,
% or look-around matching. For information on defining regular expressions:
% http://www.mathworks.com/help/matlab/matlab_prog/regular-expressions.html
%
% The number substrings are parsed by SSCANF into numeric values, using
% either the *default format '%f' or the user-supplied format specifier.
%
% This table shows examples of regular expression patterns for some common
% notations and ways of writing numbers, with suitable SSCANF formats:
%
% Regular       | Number Substring | Number Substring              | SSCANF
% Expression:   | Match Examples:  | Match Description:            | Format Specifier:
% ==============|==================|===============================|==================
% * \d+         | 0, 123, 4, 56789 | unsigned integer              | %f %i %u %lu
% --------------|------------------|-------------------------------|------------------
%   [-+]?\d+    | +1, 23, -45, 678 | integer with optional +/- sign| %f %i %d %ld
% --------------|------------------|-------------------------------|------------------
%   \d+\.?\d*   | 012, 3.45, 678.9 | integer or decimal            | %f
% (\d+|Inf|NaN) | 123, 4, NaN, Inf | integer, Inf, or NaN          | %f
%   \d+\.\d+e\d+| 0.123e4, 5.67e08 | exponential notation          | %f
% --------------|------------------|-------------------------------|------------------
%   0[0-7]+     | 012, 03456, 0700 | octal notation & prefix       | %o %i
%   [0-7]+      | 12, 3456, 700    | octal notation                | %o
% --------------|------------------|-------------------------------|------------------
%   0X[0-9A-F]+ | 0X0, 0X3E7, 0XFF | hexadecimal notation & prefix | %x %i
%   [0-9A-F]+   | 0, 3E7, FF       | hexadecimal notation          | %x
% --------------|------------------|-------------------------------|------------------
%   0B[01]+     | 0B1, 0B101, 0B10 | binary notation & prefix      | %b (not SSCANF)
%   [01]+       | 1, 101, 10       | binary notation               | %b (not SSCANF)
% --------------|------------------|-------------------------------|------------------
%
%% Debugging Output Array %%
%
% The third output is a cell array <dbg>, to check if the numbers have
% been matched by the regular expression <rgx> and converted to numeric
% by the SSCANF format. The rows of <dbg> are linearly indexed from <X>,
% even columns contain numbers, odd columns contain split substrings:
%
% >> [~,~,dbg] = natsort(X)
% dbg =
%    'x'  [ 2]
%    'x'  [10]
%    'x'  [ 1]
%
%% Examples %%
%
%%% Multiple integers (e.g. release version numbers):
% >> A = {'v10.6', 'v9.10', 'v9.5', 'v10.10', 'v9.10.20', 'v9.10.8'};
% >> sort(A)
% ans = 'v10.10' 'v10.6' 'v9.10' 'v9.10.20' 'v9.10.8' 'v9.5'
% >> natsort(A)
% ans = 'v9.5' 'v9.10' 'v9.10.8' 'v9.10.20' 'v10.6' 'v10.10'
%
%%% Integer, decimal, NaN, or Inf numbers, possibly with +/- signs:
% >> B = {'test+NaN', 'test11.5', 'test-1.4', 'test', 'test-Inf', 'test+0.3'};
% >> sort(B)
% ans = 'test' 'test+0.3' 'test+NaN' 'test-1.4' 'test-Inf' 'test11.5'
% >> natsort(B, '[-+]?(NaN|Inf|\d+\.?\d*)')
% ans = 'test' 'test-Inf' 'test-1.4' 'test+0.3' 'test11.5' 'test+NaN'
%
%%% Integer or decimal numbers, possibly with an exponent:
% >> C = {'0.56e007', '', '43E-2', '10000', '9.8'};
% >> sort(C)
% ans = '' '0.56e007' '10000' '43E-2' '9.8'
% >> natsort(C, '\d+\.?\d*([eE][-+]?\d+)?')
% ans = '' '43E-2' '9.8' '10000' '0.56e007'
%
%%% Hexadecimal numbers (with '0X' prefix):
% >> D = {'a0X7C4z', 'a0X5z', 'a0X18z', 'a0XFz'};
% >> sort(D)
% ans = 'a0X18z' 'a0X5z' 'a0X7C4z' 'a0XFz'
% >> natsort(D, '0X[0-9A-F]+', '%i')
% ans = 'a0X5z' 'a0XFz' 'a0X18z' 'a0X7C4z'
%
%%% Binary numbers:
% >> E = {'a11111000100z', 'a101z', 'a000000000011000z', 'a1111z'};
% >> sort(E)
% ans = 'a000000000011000z' 'a101z' 'a11111000100z' 'a1111z'
% >> natsort(E, '[01]+', '%b')
% ans = 'a101z' 'a1111z' 'a000000000011000z' 'a11111000100z'
%
%%% Case sensitivity:
% >> F = {'a2', 'A20', 'A1', 'a10', 'A2', 'a1'};
% >> natsort(F, [], 'ignorecase') % default
% ans = 'A1' 'a1' 'a2' 'A2' 'a10' 'A20'
% >> natsort(F, [], 'matchcase')
% ans = 'A1' 'A2' 'A20' 'a1' 'a2' 'a10'
%
%%% Sort order:
% >> G = {'2', 'a', '', '3', 'B', '1'};
% >> natsort(G, [], 'ascend') % default
% ans = '' '1' '2' '3' 'a' 'B'
% >> natsort(G, [], 'descend')
% ans = 'B' 'a' '3' '2' '1' ''
% >> natsort(G, [], 'num<char') % default
% ans = '' '1' '2' '3' 'a' 'B'
% >> natsort(G, [], 'char<num')
% ans = '' 'a' 'B' '1' '2' '3'
%
%%% UINT64 numbers (with full precision):
% >> natsort({'a18446744073709551615z', 'a18446744073709551614z'}, [], '%lu')
% ans = 'a18446744073709551614z' 'a18446744073709551615z'
%
%% Input and Output Arguments %%
%
%%% Inputs (*==default):
% X   = CellArrayOfCharRowVectors, to be sorted into natural-order.
% rgx = Regular expression to match number substrings, '\d+'*
%     = [] uses the default regular expression, which matches integers.
% <options> can be entered in any order, as many as required.
%     Each option group below may be specified at most once, otherwise
%     an error is thrown listing the conflicting arguments:
%     = Sort direction: 'descend'/'ascend'*
%     = NaN/number order: 'NaN<num'/'num<NaN'*
%     = Character/number order: 'char<num'/'num<char'*
%     = Character case handling: 'matchcase'/'ignorecase'*
%     = SSCANF number conversion format, e.g.: '%f'*, '%x', '%li', '%b', etc.
%
%%% Outputs:
% Y   = CellArrayOfCharRowVectors, <X> sorted into natural-order.
% ndx = NumericArray, such that Y = X(ndx). The same size as <X>.
% dbg = CellArray of the parsed characters and number values.
%       Each row is one input char vector, linear-indexed from <X>.
%
% See also SORT NATSORTFILES NATSORTROWS CELLSTR REGEXP IREGEXP SSCANF

%% Input Wrangling %%
%
assert(iscell(X),'First input <X> must be a cell array.')
% Every cell must hold a char array with fewer than two rows and fewer than three dimensions:
tmp = cellfun('isclass',X,'char') & cellfun('size',X,1)<2 & cellfun('ndims',X)<3;
assert(all(tmp(:)),'First input <X> must be a cell array of char row vectors (1xN char).')
%
% An omitted <rgx>, or an empty numeric one, selects the default integer pattern:
if nargin<2 || isnumeric(rgx)&&isempty(rgx)
    rgx = '\d+';
else
    assert(ischar(rgx)&&ndims(rgx)<3&&size(rgx,1)==1,...
        'Second input <rgx> must be a regular expression (char row vector).') %#ok<ISMAT>
end
%
% Optional arguments:
tmp = cellfun('isclass',varargin,'char') & cellfun('size',varargin,1)<2 & cellfun('ndims',varargin)<3;
assert(all(tmp(:)),'All optional arguments must be char row vectors (1xN char).')
% Each pair below is a logical mask over VARARGIN: the first mask flags the
% non-default keyword, the second flags either keyword of that option group.
% Character case:
ccm = strcmpi(varargin,'matchcase');
ccx = strcmpi(varargin,'ignorecase')|ccm;
% Sort direction:
sdd = strcmpi(varargin,'descend');
sdx = strcmpi(varargin,'ascend')|sdd;
% Char/num order:
chb = strcmpi(varargin,'char<num');
chx = strcmpi(varargin,'num<char')|chb;
% NaN/num order:
nab = strcmpi(varargin,'NaN<num');
nax = strcmpi(varargin,'num<NaN')|nab;
% SSCANF format:
sfx = ~cellfun('isempty',regexp(varargin,'^%([bdiuoxfeg]|l[diuox])$'));
%
% Reject any option group that is specified more than once. The case group
% is checked too: otherwise 'matchcase' and 'ignorecase' given together would
% both be forwarded to REGEXPI while the text sort below used 'matchcase'.
nsAssert(1,varargin,ccx,'Character case')
nsAssert(1,varargin,sdx,'Sort direction')
nsAssert(1,varargin,chx,'Char<->num')
nsAssert(1,varargin,nax,'NaN<->num')
nsAssert(1,varargin,sfx,'SSCANF format')
% Reject every argument that matched none of the option groups:
nsAssert(0,varargin,~(ccx|sdx|chx|nax|sfx))
%
% SSCANF format:
if nnz(sfx)
    fmt = varargin{sfx};
    if strcmpi(fmt,'%b')
        % '%b' is parsed by this function, not by SSCANF, and yields doubles.
        cls = 'double';
    else
        % Trial conversion to find which numeric class SSCANF returns for <fmt>.
        cls = class(sscanf('0',fmt));
    end
else
    fmt = '%f';
    cls = 'double';
end
%
%% Identify Numbers %%
%
% mat{j} holds the number substrings of X{j}, spl{j} the text pieces around them.
% The case keyword (if any) is passed straight through to REGEXPI.
[mat,spl] = regexpi(X(:),rgx,'match','split',varargin{ccx});
%
% Determine lengths:
nmx = numel(X);
nmn = cellfun('length',mat); % count of numbers per string
nms = cellfun('length',spl); % count of text pieces per string
mxs = max(nms);
%
% Preallocate arrays (one column per input string, one row per piece):
bon = bsxfun(@le,1:mxs,nmn).'; % true where a number exists
bos = bsxfun(@le,1:mxs,nms).'; % true where a text piece exists
arn = zeros(mxs,nmx,cls);
ars = cell(mxs,nmx);
ars(:) = {''};
ars(bos) = [spl{:}];
%
%% Convert Numbers to Numeric %%
%
if nmx
    tmp = [mat{:}];
    if strcmp(fmt,'%b')
        % Binary: drop an optional '0B'/'0b' prefix, then sum the powers of two.
        tmp = regexprep(tmp,'^0[Bb]','');
        vec = cellfun(@(s)sum(pow2(s-'0',numel(s)-1:-1:0)),tmp);
    else
        % Convert all number substrings with a single SSCANF call.
        vec = sscanf(sprintf(' %s',tmp{:}),fmt);
    end
    assert(numel(vec)==numel(tmp),'The %s format must return one value for each input number.',fmt)
else
    vec = [];
end
%
%% Debugging Array %%
%
if nmx && nargout>2
    dbg = cell(mxs,nmx);
    dbg(:) = {''};
    dbg(bon) = num2cell(vec);
    % Interleave text pieces and numbers, giving one row per input string
    % with text in the odd columns and numbers in the even columns:
    dbg = reshape(permute(cat(3,ars,dbg),[3,1,2]),[],nmx).';
    % Trim trailing columns that are empty in every row; the appended 1 is
    % the fallback column count used when FIND returns nothing.
    idf = [find(~all(cellfun('isempty',dbg),1),1,'last'),1];
    dbg = dbg(:,1:idf(1));
else
    dbg = {};
end
%
%% Sort Columns %%
%
% The index <ndx> is built by repeated sorting, from the last row of pieces
% to the first. Within each row the key applied last takes precedence,
% assuming SORT keeps the existing order of equal elements.
% In the descending branches the text keys are first converted to group
% numbers by nsGroup and those numbers are sorted with 'descend'.
%
if ~any(ccm) % ignorecase
    ars = lower(ars);
end
%
if nmx && any(chb) % char<num
    % boe is true where the text piece directly before a number is non-empty.
    boe = ~cellfun('isempty',ars(bon));
    % Append the character with code 65535 to every text piece that is
    % followed by a number (presumably so that it sorts after the same text
    % followed by further characters).
    for k = reshape(find(bon),1,[])
        ars{k}(end+1) = char(65535);
    end
    % Move each number down one row when its preceding text is non-empty:
    [idr,idc] = find(bon);
    idn = sub2ind(size(bon),boe(:)+idr(:),idc(:));
    bon(:) = false;
    bon(idn) = true;
    arn(idn) = vec;
    % bon is a sort key below: NaN positions become false for 'NaN<num', true otherwise.
    bon(isnan(arn)) = ~any(nab);
    ndx = 1:nmx;
    if any(sdd) % descending
        for k = mxs:-1:1
            [~,idx] = sort(nsGroup(ars(k,ndx)),'descend');
            ndx = ndx(idx);
            [~,idx] = sort(arn(k,ndx),'descend');
            ndx = ndx(idx);
            [~,idx] = sort(bon(k,ndx),'descend');
            ndx = ndx(idx);
        end
    else % ascending
        for k = mxs:-1:1
            [~,idx] = sort(ars(k,ndx));
            ndx = ndx(idx);
            [~,idx] = sort(arn(k,ndx),'ascend');
            ndx = ndx(idx);
            [~,idx] = sort(bon(k,ndx),'ascend');
            ndx = ndx(idx);
        end
    end
else % num<char
    arn(bon) = vec;
    % bon is a sort key below: NaN positions become false for 'NaN<num', true otherwise.
    bon(isnan(arn)) = ~any(nab);
    if any(sdd) % descending
        % Start from the last row of text pieces, then work towards the first row.
        [~,ndx] = sort(nsGroup(ars(mxs,:)),'descend');
        for k = mxs-1:-1:1
            [~,idx] = sort(arn(k,ndx),'descend');
            ndx = ndx(idx);
            [~,idx] = sort(bon(k,ndx),'descend');
            ndx = ndx(idx);
            [~,idx] = sort(nsGroup(ars(k,ndx)),'descend');
            ndx = ndx(idx);
        end
    else % ascending
        % Start from the last row of text pieces, then work towards the first row.
        [~,ndx] = sort(ars(mxs,:));
        for k = mxs-1:-1:1
            [~,idx] = sort(arn(k,ndx),'ascend');
            ndx = ndx(idx);
            [~,idx] = sort(bon(k,ndx),'ascend');
            ndx = ndx(idx);
            [~,idx] = sort(ars(k,ndx));
            ndx = ndx(idx);
        end
    end
end
%
% Return the index in the same shape as the input, and the sorted input:
ndx = reshape(ndx,size(X));
X = X(ndx);
%
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%natsort
function nsAssert(val,inp,idx,varargin)
% Throw an error when more than <val> of the arguments <inp> are flagged by <idx>.
%
% val      = 0 to report unknown arguments, 1 to report a repeated option.
% inp      = cell array of the optional arguments supplied by the user.
% idx      = logical mask over <inp> selecting the arguments to be counted.
% varargin = option name, used as the start of the message when val is 1.
if nnz(idx)<=val
    return
end
endings = {'Unknown input arguments',' option may only be specified once. Provided inputs'};
headline = [varargin{:},endings{1+val}];
% List the offending arguments, each quoted on its own line:
listing = sprintf('\n''%s''',inp{idx});
error('%s:%s',headline,listing)
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%nsAssert
function grp = nsGroup(vec)
% Assign a group number to each string of a cell array row vector, so that
% equal strings share a number and the numbers follow the sorted order of
% the strings. Equivalent to [~,~,grp]=unique(vec);
[ordered,order] = sort(vec);
% A new group starts at the first element and wherever neighbours differ:
differs = ~strcmp(ordered(1:end-1),ordered(2:end));
runId = cumsum([true,differs]);
% Scatter the group numbers back to the original positions:
grp = runId;
grp(order) = runId;
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%nsGroup