forked from mtex-toolbox/mtex
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtxt2mat.m
2540 lines (2243 loc) · 93.2 KB
/
txt2mat.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
function [A,ffn,num_header,sr_input_ca,hl,fpos] = txt2mat(varargin)
% TXT2MAT read an ascii file and convert a data table to matrix
%% Description
% Syntax:
% A = txt2mat
% A = txt2mat(fn)
% [A,ffn,nh,SR,hl,fpos] = txt2mat(fn,nh,nc,cstr,SR,SX)
% [A,ffn,nh,SR,hl,fpos] = txt2mat(fn,... 'param',value,...)
%
% with
%
% A output data matrix
% ffn full file name
% nh number of header lines
% hl header lines (as a string)
% fpos file position of last character read and converted from ascii file
%
% fn file or path name
% nh number of header lines
% nc number of data columns
% cstr conversion string
% SR cell array of replacement strings sr<i>, SR = {sr1,sr2,...}
% SX cell array of invalid line strings sx<i>, SX = {sx1,sx2,...}
%
% All input arguments are optional. See below for param/value-pairs.
%
% TXT2MAT reads the ascii file <fn> and extracts the values found in a
% data table with <nc> columns to a matrix, skipping <nh> header lines.
% When extracting the data, <cstr> is used as conversion type specifier for
% each line (see sscanf online doc for conversion specifiers).
%
% If <fn> points to an existing directory rather than a file or is an
% empty string, a file selection dialogue is displayed.
%
% Additional strings <sr1>,<sr2>,.. can be supplied within a cell array
% <SR> to perform single character substitution before the data is
% converted: each of the first n-1 characters of an <n> character string is
% replaced by the <n>-th character.
% A further optional input argument is a cell array <SX> containing strings
% <sx1>,<sx2>,.. that mark lines containing invalid data. If every line
% containing invalid data can be caught by the <SX>, TXT2MAT will speed up
% significantly (see EXAMPLE 3). Any lines that are recognized to be
% invalid are completely ignored (and there is no corresponding row in A).
%
% If the number of header lines <nh> or the number of data columns <nc> are
% not provided, TXT2MAT performs some automatic analysis of the file format.
% This will need the numbers in the file to be decimals (with decimal point
% or comma) and the data arrangement to be more or less regular (see also
% remark 1).
% If <nc> is negative, TXT2MAT internally initializes the output matrix <A>
% with |<nc>| columns, but allows for expanding <A> if more numeric values
% are found in any line of the file. To this end, TXT2MAT is forced to
% switch to line by line conversion.
%
% If some lines of the data table can not be (fully) converted, the
% corresponding rows in A are padded with NaNs.
%
% For further options and to facilitate the argument assignment, the
% param/value-notation can be used instead of the single argument syntax
% txt2mat(ffn,nh,nc,cstr,SR,SX)
% The following table lists the param/value-pairs and their corresponding
% single argument, if existing:
%
% Param-string Value type Example value single arg.
% 'NumHeaderLines' Scalar 13 nh
% 'NumColumns' Scalar 9 nc
% 'ConvString' String ['%d.%d.%d' repmat('%f',1,6)] cstr
% 'ReplaceChar' Cell {')Rx ',';: '} SR
% 'BadLineString' Cell {'Warng', 'Bad'} SX
% 'ReplaceExpr' Cell {{'True','1'},{'#NaN','#Inf','NaN'}} -
% 'ReplaceRegExpr' Cell {{';\s*(?=;)','; NaN'}} -
% 'DialogString' String 'Now choose a log file' -
% 'InfoLevel' Scalar 1 -
% 'ReadMode' String 'auto' -
% 'NumericType' String 'single' -
% 'RowRange' 2x1-Vector [2501 5000] -
% 'FilePos' Scalar 0 -
% 'MemPar' Scalar 2^17 -
%
% The param/value-pairs may follow the usual arguments in any order, e.g.
% txt2mat('file.txt',13,9,'BadLineString',{'Bad'},'ConvString','%f'). Only
% the single file name argument must be given as the first input.
%
% Param/value-pairs with additional functionality:
%
% - The 'ReplaceExpr' argument works similarly to the 'ReplaceChar' argument.
% It just replaces whole expressions instead of single characters. A cell
% array containing at least one cell array of strings must be provided.
% Such a cell array of strings consists of <n> strings; each of the first
% <n-1> strings is replaced by the <n>-th string. For example, with
% {{'R1a','R1b', 'S1'}, {'R2a','R2b','R2c', 'S2'}}
% all the 'R<n>'-strings are replaced by the corresponding 'S<n>' string.
% In general, replacing whole strings takes more time than 'ReplaceChar',
% especially if the strings differ in size.
% Expression replacements are performed before character replacements.
%
% - With the help of the 'ReplaceRegExpr' argument, regular expressions can be
% replaced. The usage is analogous to 'ReplaceExpr'. Regular expression
% replacements are carried out before any other replacement (see
% EXAMPLE 5).
%
% - The 'DialogString' argument provides the text shown in the title bar of
% the file selection dialogue that may appear.
%
% - The 'InfoLevel' argument controls the verbosity of TXT2MAT's outputs in
% the command window and the message boxes. Currently known values are:
% 0, 1, 2 (default)
%
% - 'ReadMode' is one of 'matrix', 'line', 'auto' (default), or 'block'.
% 'matrix': Read and convert sections of multiple lines simultaneously,
% requiring each line to contain the same number of values.
% Finding an improper number of values in such a section will
% cause an error (see also remark 2).
% 'line': Read and convert text line by line, allowing different
% numbers of values per line (slower than 'matrix' mode).
% 'auto': Try 'matrix' first, continue with 'line' if an error occurs.
% 'block': Read and convert sections of multiple lines simultaneously
% and fill up the data matrix regardless of how many values
% occur in each text line. Only a warning is issued if a
% section's number of values is not a multiple of the number of
% columns of the output data matrix. This is the fastest mode.
%
% - 'NumericType' is one of 'int8', 'int16', 'int32', 'int64', 'uint8',
% 'uint16', 'uint32', 'uint64', 'single', or 'double' (default),
% determining the numeric class of the output matrix A. If the numeric
% class does not support NaNs, missing elements are padded with zeros
% instead. Reduce memory consumption by choosing an appropriate numeric
% class, if needed.
%
% - The 'RowRange' value is a sorted positive integer two element vector
% defining an interval of data rows to be converted (header lines do not
% count, but lines that will be recognized as invalid - see above - do).
% If the vector's second element exceeds the number of valid data rows in
% the file, the data is extracted up to the end of the file (Inf is
% allowed as second argument). It may save memory and computation time if
% only a small part of data has to be extracted from a huge text file.
%
% - The 'FilePos' value <fp> is a nonnegative integer scalar. <fp>
% characters from the beginning of the file will be ignored, i.e. not be
% read. If you run TXT2MAT with a 'RowRange' argument, you may
% use the <fpos> output as a 'FilePos' input during the next run in
% order to continue from where you stopped. By that you can split up the
% conversion process e.g. when the file is too big to be read as a whole
% (see EXAMPLE 6).
%
% - The 'MemPar' argument provides the minimum number of characters TXT2MAT
% will process simultaneously as an internal text section (= a set of
% text lines). It must be a positive integer. The value does not affect
% the outputs, but computation time and memory usage. The roughly
% optimized default is 65536; usually there is no need to change it.
%
% -------------------------------------------------------------------------
%
% REMARKS
%
% 1) prerequisites for the automatic file format analysis (if the number of
% header lines and data columns is not given):
% - header lines can be detected by either non-numeric characters or
% a strongly deviating number of numeric items in relation to the
% data section (<10%)
% - tab, space, slash, comma, colon, and semicolon are accepted as
% delimiters (e.g. "10/11/2006 08:30 1; 3.3; 0.52" is ok)
% - after the optional user supplied replacements have been carried out,
% the data section must contain the delimiters and the decimal numbers
% only (point or comma are accepted as decimal character).
% Note: if you do not trigger the internal file format analysis, i.e.
% you do provide both the number of header lines and the number of data
% columns, you also have to take care of any decimal _comma_ and
% non-whitespace delimiters yourself. Such a comma can be replaced with a
% '.', and the delimiters can either be included in a suitable conversion
% string or be replaced with whitespace (see e.g. the 'ReplaceChar'
% argument).
%
% 2) In matrix mode, txt2mat checks that the conversion string is suitable
% and that the number of values read from a section of the file is the
% product of the number of text lines and the number of columns. This
% may be true even if the number of values per line is not uniform and
% txt2mat may be misled. So using matrix mode you should be sure that
% all lines that can't be sorted out by a bad line marker string contain
% the same number of values.
%
% 3) Since txt2mat.m is a comparatively large file, generating a preparsed
% file txt2mat.p once will speed up the first call during a matlab
% session. Set the current directory to where you saved txt2mat.m and
% type
% >> pcode txt2mat
% For further information, see the 'pcode' documentation.
%
% -------------------------------------------------------------------------
%
% EXAMPLE 1:
%
% A = txt2mat; % choose a file and let TXT2MAT analyse its format
%
% -------------------------------------------------------------------------
%
% EXAMPLE 2:
%
% Suppose your ascii file C:\mydata.log contains the following lines:
% �
% 10 11 2006 08 35.225 1 3.3 0.52
% 31 05 2008 12 12 0 0.0 0.00
% 7 01 2010 15 23.5 -1 3.3 0.535
% �
% type
%
% A = txt2mat('C:\mydata.log',0,8);
%
% or just
%
% A = txt2mat('C:\mydata.log');
%
% Here, TXT2MAT uses its automatic file layout detection as the header line
% and column number is not given. With the file looking like this:
% �
% some example data
% plus another header line
% 10/11/2006 08:35,225 1; 3,3; 0,52
% 31/05/2008 12:12 0; 0,0; 0,00
% 7/01/2010 15:23,5 -1; 3,3; 0,535
% �
% txt2mat('C:\mydata.log') returns the same output data matrix as above.
%
% -------------------------------------------------------------------------
%
% EXAMPLE 3:
%
% Suppose your ascii file C:\mydata.log starts as follows:
% �
% ;$FILEVERSION=1.1
% ;$STARTTIME=38546.6741619815
% ;---+-- ----+---- --+-- ----+--- + -+ -- -- --
% 3) 7,2 Rx 0300 8 01 A3 58 4D
% 4) 7,3 Rx 0310 8 06 6E 2B 9F
% 5) 9,5 Warng FFFFFFFF 4 00 00 00 08 BUSHEAVY
% 6) 12,9 Rx 0320 8 02 E1 F6 EF
% �
% you may specify
% nh = 3 % header lines,
% nc = 12 % data columns,
% cstr = '%f %f %x %x %x %x %x %x' % as conversion string for floats and
% % hexadecimals,
% sr1 = ')Rx ' % as first replacement string to blank the characters
% ')','R', and 'x' (if you don't want to include them
% in the conversion string), and
% sr2 = ',.' % to replace the decimal comma with a dot, and
%
% sx1 = 'Warng' % as a marker for invalid lines
%
% A = txt2mat('C:\mydata.log', nh, nc, cstr, {sr1,sr2}, {'Warng'});
%
% A =
% 3 7.2 768 8 1 163 88 77
% 4 7.3 784 8 6 110 43 159
% 6 12.9 800 8 2 225 246 239
% ...
%
% If you make use of the param/value-pairs, you can also write more clearly
%
% t2mOpts = {'NumHeaderLines', 3 , ...
% 'NumColumns' , 12 , ...
% 'ReplaceChar' , {')Rx ',',.'} , ...
% 'ConvString' , '%f %f %x %x %x %x %x %x' , ...
% 'BadLineString' , {'Warng'} };
%
% A = txt2mat('C:\mydata.log', t2mOpts{:});
%
% Without the {'Warng'} argument, A would have been
%
% 3 7.2 768 8 1 163 88 77
% 4 7.3 784 8 6 110 43 159
% 5 9.5 NaN NaN NaN NaN NaN NaN
% 6 12.9 800 8 2 225 246 239
% ...
%
% -------------------------------------------------------------------------
%
% EXAMPLE 4:
%
% Suppose your ascii file C:\mydata.log begins with the following lines:
% �
% datetime % ppm % ppm Nm
% datetime real8 real8 real8
% 30.10.2006 14:24:06,131 6,4459 478,519 6,5343
% 30.10.2006 14:24:17,400 6,4093 484,959 6,5343
% 30.10.2006 14:24:17,499 6,4093 484,959 6,5343
% �
% you might specify
% nh = 2 % header lines,
% nc = 9 % data columns,
% cstr = ['%d.%d.%d' repmat('%f',1,6)] % as conversion string for
% % integers and floats,
% sr1 = ': ' % as first replacement string to blank the ':'
% sr2 = ',.' % to replace the decimal comma with a dot, and
%
% A = txt2mat('C:\mydata.log', nh, nc, cstr, {sr1,sr2});
%
% A =
% 30 10 2006 14 24 6.131 6.4459 478.519 6.5343
% 30 10 2006 14 24 17.4 6.4093 484.959 6.5343
% 30 10 2006 14 24 17.499 6.4093 484.959 6.5343
% ...
%
%
% A = txt2mat('C:\mydata.log','ReplaceRegExpr',{{'\.(\d+)\.',' $1 '}});
%
% yields the same result, but uses the built-in file layout analysis to
% determine the number of header lines, the number of columns, the
% delimiters, and the decimal character. You only help TXT2MAT by
% telling it to replace dots surrounding the month number with spaces via
% the regular expression replacement. So you can use the latter command on
% similar files which have a different or previously unknown number of
% header lines etc., too.
%
% -------------------------------------------------------------------------
%
% EXAMPLE 5:
%
% If the data table of your file contains some gaps that can be identified
% by some repeated delimiters (here ;)
% �
% ; 02; 03; 04; 05;
% 11; ; 13; 14; 15;
% 21; ; 23; ;;
% ; 32; 33; 34; 35;
% �
% you can fill them with NaNs with the help of 'ReplaceRegExpr':
%
% A = txt2mat('C:\mydata.log','ReplaceRegExpr',...
% {{'((?<=;\s*);)|(^\s*;)','NaN;'}});
%
% A =
% NaN 2 3 4 5
% 11 NaN 13 14 15
% 21 NaN 23 NaN NaN
% NaN 32 33 34 35
%
%
% -------------------------------------------------------------------------
%
% EXAMPLE 6:
%
% If you want to process the contents of mydata.log step by step,
% converting one million lines at a time:
%
% fp = 0; % File position to start with (beginning of file)
% A = NaN; % initialize output matrix
% nhl = 12; % number of header lines for the first call
%
% while numel(A)>0
% [A,ffn,nh,SR,hl,fp] = txt2mat('C:\mydata.log','RowRange',[1,1e6], ...
% 'FilePos',fp,'NumHeaderLines',nhl);
% nhl = 0; % there are no further header lines
%
% % process intermediate results...
% end
%
% -------------------------------------------------------------------------
%
% EXAMPLE 7:
%
% You can use the read mode 'block' on very large files with a constant
% number of values per line to save some import time compared to the
% 'matrix' mode. Besides, as TXT2MAT does not check for line breaks within
% the (internally processed) sections of a file, you can use the block mode
% to fill up any output matrix with a fixed number of columns.
% �
% 1 2 3 4 5
% 6 7 8 9 10
%
% 11 12 13 14 15
% 16 17 18 19 20
% 21 22
% 23 24 25
% 26 27 28 29 30
%
% �
%
% A = txt2mat('C:\mydata.txt',0,5,'ReadMode','block')
%
% A =
% 1 2 3 4 5
% 6 7 8 9 10
% 11 12 13 14 15
% 16 17 18 19 20
% 21 22 23 24 25
% 26 27 28 29 30
%
%
% Instead, if you want to preserve the line break information, use read
% mode 'line':
%
% A = txt2mat('C:\mydata.txt',0,5,'ReadMode','line')
%
% or
%
% A = txt2mat('C:\mydata.txt',0,-1)
%
% A =
% 1 2 3 4 5
% 6 7 8 9 10
% NaN NaN NaN NaN NaN
% 11 12 13 14 15
% 16 17 18 19 20
% 21 22 NaN NaN NaN
% 23 24 25 NaN NaN
% 26 27 28 29 30
%
% The first command reads up to 5 elements per line, starting from the
% first, and puts them to a Nx5 matrix, whereas the second one
% automatically expands the column size of the output to fit in the maximum
% number of elements occurring in a line. This is effected by the negative
% column number argument that also implies read mode 'line' here.
%
% -------------------------------------------------------------------------
%
% See also SSCANF
% --- Author: -------------------------------------------------------------
% Copyright 2005-2008 A.Tönnesmann
% $Revision: 6.01 $ $Date: 2008/10/25 21:25:09 $
% --- E-Mail: -------------------------------------------------------------
% x=-2:3;
% disp(char(round([-0.32*x.^5+0.43*x.^4+1.75*x.^3-5.90*x.^2-0.95*x+116,...
% -4.44*x.^5+9.12*x.^4+29.8*x.^3-33.6*x.^2-52.9*x+ 98])))
% --- History -------------------------------------------------------------
% 05.61
% - fixed bug: possible wrong headerlines output when using 'FilePos'
% - fixed bug: produced an error if a bad line marker string was already
% found in the first data line
% - corrected user information if sscanf fails in matrix mode
% - added some more help lines
% 05.62
% - allow negative NumColumns argument to capture a priori unknown
% numbers of values per line
% 05.82 beta
% - support regular expression replacements ('ReplaceRegExpr' argument)
% - consider user supplied replacements when analysing the file layout
% 05.86 beta
% - some code clean-up (argincheck subfunction, ...)
% 05.86.1
% - fixed bug: possibly wrong numeric matlab version number detection
% 05.90
% - consider skippable lines when analysing the file layout
% - code rearrangements (subfun for line termination detection, ...)
% 05.96
% - subfuns to find line breaks / bad-line pos and to initialize output A
% - better handling of errors and 'degenerate' files, e.g. exit without
% an error if the file selection dialogue was cancelled
% 05.97
% - fixed bug: error in file analysis if first line contains bad line
% marker
% - fixed bug: a bad line marker is ignored if the string is split up by
% two consecutive internal sections
% - better code readability in FindLineBreaks subfunction
% 05.97.1
% - simplifications by skipping the header when reading from the file;
% the header is now read separately and is not affected by any
% replacements
% - corrected handling of bad line markers that already appear in header
% 05.98
% - corrected search for long bad line marker strings that could exceed
% text dimensions
% - speed-up by improved finding of line break positions
% 06.00
% - introduction of 'high speed' read mode "block" requiring less line
% break information
% - 'MemPar' buffer value changed to scalar
% - antipodal memory demand by translating smaller text portions to char
% - modified help
% 06.01
% - fixed bug: possible error message in file analysis when only header
% line number is given
% --- Wish list -----------------------------------------------------------
%% Definitions
spuint = uint8(32); % Space (= ascii whitespace limit) as uint8
% find out matlab version as a decimal, up to the second dot:
if (isOctave ())
vn = 7; %% use the "modern" Matlab constructs in this function.
else
v = ver('matlab');
vs= v.Version;
vsDotPos = [strfind(vs,'.'), Inf, Inf];
vn= str2double(vs(1:min(numel(vs),vsDotPos(2)-1)));
end
%% Get input arguments
% check the arguments in the (still amendable) subfunction 'argincheck':
ia = argincheck(varargin);
if ~isempty(ia.errmsg)
error(ia.errmsg)
end
% unwrap input argument information
is_argin_num_header = ia.is_argin_num_header;
num_header = ia.num_header;
is_argin_num_colon = ia.is_argin_num_colon;
num_colon = ia.num_colon;
conv_str = ia.conv_str;
sr_input_ca = ia.sr_input_ca;
num_sr = ia.num_sr;
kl_input_ca = ia.kl_input_ca;
num_kl = ia.num_kl;
replace_expr = ia.replace_expr;
num_er = ia.num_er;
idx_rng = ia.idx_rng;
% ldx_rng = ia.ldx_rng; % has become obsolete since v6.00
infolvl = ia.infolvl;
is_argin_readmode = ia.is_argin_readmode;
readmode = ia.readmode;
numerictype = ia.numerictype;
is_argin_rowrange = ia.is_argin_rowrange;
rowrange = ia.rowrange;
filepos = ia.filepos;
is_argin_filepos = ia.is_argin_filepos;
replace_regex = ia.replace_regex;
num_rr = ia.num_rr;
ffn = ia.ffn;
ffn_short = ia.ffn_short;
if exist(ffn,'file')~=2 % check again (e.g. after ESC in open file dialogue)
[A,ffn,num_header,sr_input_ca,hl,fpos] = deal([]);
if infolvl>=1
disp('Exiting txt2mat: No existing file given.')
end
return
end
clear varargin ia
%% Analyse data format
% try some automatic data format analysis if needed (by function anatxt)
doAnalyzeFile = ~all([is_argin_num_header, is_argin_num_colon]); %, is_argin_conv_str]); % commented out as so far anatxt's conv_str is only '%f'
if doAnalyzeFile
% call subfunction anatxt:
[ffn, ana_num_header, ana_num_colon, ana_conv_str, ana_sr_input_ca,...
ana_rm, num_ali, ana_hl, ferrmsg, aerrmsg] = anatxt(ffn,filepos,sr_input_ca,replace_expr,replace_regex,kl_input_ca,num_header,vn);
% quit if errors occurred
if ~isempty(aerrmsg)
[A,sr_input_ca,fpos] = deal([]);
num_header = ana_num_header;
hl = ana_hl;
if infolvl>=1
disp(['Exiting txt2mat: file analysis: ' aerrmsg])
end
return
end
% accept required results from anatxt:
if ~is_argin_num_header
num_header = ana_num_header;
end
if ~is_argin_num_colon
num_colon = ana_num_colon;
end
%if ~is_argin_conv_str
% conv_str = ana_conv_str;
%end
if ~is_argin_readmode
readmode = ana_rm;
end
% add new replacement strings from anatxt:
is_new_sr = ~ismember(ana_sr_input_ca, sr_input_ca);
num_sr = num_sr + sum(is_new_sr);
sr_input_ca = [sr_input_ca,ana_sr_input_ca(is_new_sr)];
% display information:
if infolvl >= 2
disp(repmat('*',1,length(ffn)+2));
disp(['* ' ffn]);
if numel(ferrmsg)==0
sr_display_str = '';
for idx = 1:num_sr
sr_display_str = [sr_display_str ' �' sr_input_ca{idx} '�']; %#ok<AGROW>
end
disp(['* read mode: ' readmode]);
disp(['* ' num2str(num_ali) ' data lines analysed' ]);
disp(['* ' num2str(num_header) ' header line(s)']);
disp(['* ' num2str(abs(num_colon)) ' data column(s)']);
disp(['* ' num2str(num_sr) ' string replacement(s)' sr_display_str]);
else
disp(['* fread error: ' ferrmsg '.']);
end
disp(repmat('*',1,length(ffn)+2));
end % if
% return if anatxt did not detect valid data
if ana_num_colon==0
A = [];
hl = '';
fpos = filepos;
return
end
end
%% Detect line termination character
if infolvl >= 1
hw = waitbar(0,'detect line termination character ...');
set(hw,'Name',[mfilename ' - ' ffn_short]);
hasWaitbar = true;
else
hasWaitbar = false;
end
lbfull = detect_line_break_characters(ffn);
% DETECT_LINE_BREAK_CHARACTERS find out type of line termination of a file
%
% lb = detect_line_break_characters(ffn)
%
% with
% ffn ascii file name
% lb line break character(s) as uint8, i.e.
% [13 10] (cr+lf) for standard DOS / Windows files
% [10] (lf) for Unix files
% [13] (cr) for Mac files
%
% The DOS style values are returned as defaults if no such line breaks are
% found.
lbuint = lbfull(end);
lbchar = char(lbuint);
num_lbfull = numel(lbfull);
%% Open file and set position indicator to end of header
% ... and extract header separately if not already done
logfid = fopen(ffn);
if num_header > 0
if doAnalyzeFile % header lines have already been extracted
hl = ana_hl;
lenHeader = numel(hl);
fseek(logfid,filepos+lenHeader,'bof');
else
if is_argin_filepos
fseek(logfid,filepos,'bof');
end
read_len = 65536; % (quite small) size of text sections just for header line extraction
do_read = true;
num_lb_curr = 0;
countLoop = 0;
while do_read
[f8p,lenf8p] = fread(logfid,read_len,'*uint8'); % current text section
ldcp_curr = find(f8p==lbuint); % line break positions in current text section
num_lb_curr = num_lb_curr + numel(ldcp_curr); % number of line breaks so far
do_read = (lenf8p == read_len) && (num_lb_curr < num_header);
countLoop = countLoop + 1;
end
if num_lb_curr >= num_header
lenHeader = ldcp_curr(end-(num_lb_curr-num_header)) + (countLoop-1)*read_len;
if countLoop == 1
% take the complete header from the first section
hl = char(f8p(1:lenHeader)).';
fseek(logfid,filepos+lenHeader,'bof');
else
% the header did not fit into a single section, so re-read
% it as a whole
fseek(logfid,filepos,'bof');
hl = char(fread(logfid,lenHeader).');
end
else
% exit here as we have found less line breaks than the given
% number of header lines!
fseek(logfid,filepos,'bof');
hl = char(fread(logfid).');
fpos = ftell(logfid);
fclose(logfid);
[A,sr_input_ca] = deal([]);
if infolvl>=1
% disp(['Exiting txt2mat: ' num2str(num_header) ' header lines expected, but only ' num2str(num_lb_curr) ' line breaks found.'])
close(hw)
end
return
end
end
else
lenHeader = 0;
hl = '';
if is_argin_filepos
fseek(logfid,filepos,'bof');
end
end
%% Read in ASCII file - case 1: portions only, as RowRange is given.
% RowRange should be given if the file is too huge to be read at once by
% fread. In this case multiple freads are used to read in consecutive
% sections of the text. By counting the line breaks those rows of the text
% that match the RowRange argument are added to the 'core' variable f8 that
% is later used for the numeric conversion.
% By definition, a line begins with its first character and ends with its
% last termination character.
if hasWaitbar
waitbar(0.01,hw,'reading file ...');
end
% numHeader = 0; % auxiliary variable replacing "num_header" during the code reconstruction
if is_argin_rowrange
do_read = true; % loop condition
num_lb_prev = 0;
read_len = idx_rng;
f8 = [];
while do_read
[f8p,lenf8p] = fread(logfid,read_len,'*uint8'); % current text section
ldcp_curr = find(f8p==lbuint); % line break positions in current text section
num_lb_curr = numel(ldcp_curr);
% add lines of interest to f8
if (rowrange(1) <= num_lb_prev+num_lb_curr+1) && (num_lb_prev < rowrange(2))
if rowrange(1) <= num_lb_prev + 1 % lines of interest started before current section
sdx = 1; % start index is beginning of section => the part of the section to be added to f8 includes the start of the section
else % lines of interest start within current section
num_lines_to_omit = rowrange(1)-1-num_lb_prev; % how many lines not to add
sdx = ldcp_curr(num_lines_to_omit)+1; % start right after the omitted lines
end
if rowrange(2) > num_lb_curr+num_lb_prev % lines of interest end beyond current section
edx = lenf8p; % end index is length of section => the part of the section to be added to f8 includes the end of the section
else % lines of interest end within current section
num_lines_to_add = rowrange(2)-num_lb_prev; % how many lines to add
edx = ldcp_curr(num_lines_to_add); % corresponding end index
end
f8 = [f8; f8p(sdx:edx)]; %#ok<AGROW>
fpos = ftell(logfid)-lenf8p+edx; % position of the latest added character
end
% quit loop if all rows of interest are read or if end of file is reached
if num_lb_prev >= rowrange(2) || lenf8p<read_len
do_read = false;
end
num_lb_prev = num_lb_prev + num_lb_curr; % absolute number of dectected line breaks
end
end
%% Read in ASCII file - case 2: full file. Then close file.
if ~is_argin_rowrange
[f8,fcount] = fread(logfid,Inf,'*uint8');
fpos = fcount + filepos + lenHeader;
end
if ftell(logfid) == -1
error(ferror(fid, 'clear'));
end
fclose(logfid);
if numel(f8)==0
A = [];
if infolvl>=1
% disp('Exiting txt2mat: no numeric data found.')
close(hw)
end
return
end
%% Clean up trailing whitespaces
% replace all trailing whitespaces by spaces and a final line break
% (quick&dirty)
if hasWaitbar
waitbar(0.05,hw,'cleaning up whitespaces ...');
end
cnt_trail_white = 0;
is_ws_at_end = true;
while is_ws_at_end % step through the endmost characters
if f8(end-cnt_trail_white) <= spuint % is it a whitespace?
cnt_trail_white = cnt_trail_white + 1;
else
f8(end-cnt_trail_white+1:end) = spuint; % fill with spaces
if cnt_trail_white >= num_lbfull
f8(end-num_lbfull+(1:num_lbfull)) = lbfull; %#ok<AGROW> % replace endmost space(s) by a line break
else
f8(end+(1:num_lbfull)) = lbfull; %#ok<AGROW> % append a final line break
end
is_ws_at_end = false;
end
end % while
%% Find linebreak indices and bad line positions
% Finding the line breaks is time-critical. "LbAwareness" therefore keeps
% track of how much is currently known about the line break positions:
%   0: nothing
%   1: the positions of the final line break in every section
%   2: the above + the number of lines up to each of those line breaks
%   3: all line break positions
% Minimum LbAwareness required for the numeric conversion, depending on
% the read mode (compared case-insensitively):
if strcmpi(readmode,'block')
    MinLbAwareness = 1;
elseif any(strcmpi(readmode,{'matrix','auto'}))
    MinLbAwareness = 2;
elseif strcmpi(readmode,'line')
    MinLbAwareness = 3;
end
% Defaults: no indices of rows to be deleted, nothing known about line
% breaks. Both are overwritten below if bad line markers were requested.
kl_idc      = [];
LbAwareness = 0;
if num_kl > 0
    if hasWaitbar
        waitbar(0.10,hw,'finding line breaks ...');
    end
    % search for line breaks and for the bad line markers in one pass
    [lf_idc, cntLb, secLbIdc, kl_idc] = FindLineBreaks(f8, lbuint, ...
        idx_rng, true, false, num_kl, kl_input_ca);
    LbAwareness = 3;
end
%% Delete rows marked as bad
% Only act if bad line markers have been found.
if ~isempty(kl_idc)
    if hasWaitbar
        waitbar(0.15,hw,'deleting rows ...');
    end
    % indices of the line breaks bordering each marker (left and right)
    [L,R] = neighbours(kl_idc, lf_idc);
    % several markers within a single row yield repeated borders; keep
    % each border only once
    if any(diff(L) <= 0)
        if any(diff(R) <= 0)
            L = unique(L);
            R = unique(R);
        end
    end
    % remove the bad rows from the text
    f8 = cutvec(f8,L+1,R,false,vn);
    % the text has been modified, so the line break knowledge is reset
    LbAwareness = 0;
end % if
clear('L','R','kl_idc')
%% Find line break positions
% Only required if nothing is known about the line breaks (LbAwareness 0).
if LbAwareness == 0
    if hasWaitbar
        waitbar(0.20,hw,'updating line break positions ...');
    end
    % Do we have to expect text length changes due to the replacements?
    % Regular expression replacements are always expected to change the
    % length. Plain string replacements do so if the strings within one
    % replacement argument differ in length.
    doExpectLengthChange = num_rr > 0;
    if ~doExpectLengthChange
        for edx = 1:num_er
            if any(diff(cellfun('length', replace_expr{edx})))
                doExpectLengthChange = true;
                break
            end
        end
    end
    % Choose the level of line break knowledge to establish:
    if doExpectLengthChange || strcmpi(readmode,'block')
        LbAwareness = 1;    % K1
    elseif strcmpi(readmode,'line')
        LbAwareness = 3;    % K3
    else % readmode is 'auto' or 'matrix'
        LbAwareness = 2;    % K2
    end
    % Derive the search flags from the chosen level: all positions are
    % searched for level 3 only, lines are counted from level 2 on.
    doFindAll = (LbAwareness == 3);
    doCount   = (LbAwareness >= 2);
    [lf_idc,cntLb,secLbIdc] = FindLineBreaks(f8, lbuint, idx_rng, doFindAll, doCount, 0, {});
end
%% Replace (regular) expressions and characters
%f8=char(f8); % quicker with strrep, required by sscanf
if num_rr > 0
has_length_changed = true;
else
has_length_changed = false; % flag for changes of length of f8 by replacements
end
if any([num_sr,num_er,num_rr] > 0 )
if hasWaitbar
waitbar(0.20,hw,'replacing strings ...');
end
numSectionLb = numel(secLbIdc);
% If a ReplaceExpr begins with a line break character, such a character
% will temporarily be prepended to each replacement section to apply
% the replacement to the _first_ line of a section, too. Check this
% case here:
doPrependLb = false; % default
numPrepend = 0; % default
if num_er>0
% put all the characters from the ReplaceExpr strings into an
% uint8-array:
uint8Replace = uint8(char([replace_expr{:}]));
% check if any row starts with a line break:
if any(uint8Replace(:,1)==lbuint)
doPrependLb = true;
numPrepend = 1;
end
end
for sdx = 2:numSectionLb
if doPrependLb
f8_akt = char([lbuint, f8(lf_idc(secLbIdc(sdx-1))+1 : lf_idc(secLbIdc(sdx))).']);
else
f8_akt = char(f8(lf_idc(secLbIdc(sdx-1))+1 : lf_idc(secLbIdc(sdx))).');
end
if num_er > 0 || num_rr > 0
    % Length of the current section within f8 before replacements. This
    % does NOT include the line break that may have been prepended
    % temporarily to f8_akt (numPrepend characters).
    len_f8_akt = lf_idc(secLbIdc(sdx)) - lf_idc(secLbIdc(sdx-1));
    % Replacements, e.g. {'odd','one','1'} replaces 'odd' and 'one' by '1'
    % Regular Expression Replacements: ============================
    for vdx = 1:num_rr % step through replacements arguments
        srarg = replace_regex{vdx}; % pick a single argument...
        for xdx = 1:(numel(srarg)-1)
            f8_akt = regexprep(f8_akt, srarg{xdx}, srarg{end}); % ... and perform replacements
        end % for
    end % for
    % Expression Replacements: ====================================
    for vdx = 1:num_er % step through replacements arguments
        srarg = replace_expr{vdx}; % pick a single argument...
        for xdx = 1:(numel(srarg)-1)
            f8_akt = strrep(f8_akt, srarg{xdx}, srarg{end}); % ... and perform replacements
            % Compare like with like: f8_akt carries numPrepend extra
            % characters that are not part of the section in f8.
            if ~has_length_changed && (len_f8_akt ~= numel(f8_akt) - numPrepend)
                has_length_changed = true; % detect a change of length of f8
            end
        end % for
    end % for
    % update f8-sections by f8_akt ================================
    % Extension by replacements. The prepended line break is stripped
    % again before writing back, so it must not be counted here; otherwise
    % the line break indices below would be shifted by one too many.
    exten = numel(f8_akt) - numPrepend - len_f8_akt;
    if exten == 0
        % same length: overwrite the section in place
        % (numPrepend is 0 if no line break was prepended)
        f8( lf_idc(secLbIdc(sdx-1))+1 : lf_idc(secLbIdc(sdx)) ) = uint8(f8_akt(1+numPrepend:end)).';
    else
        % length changed: rebuild f8 around the modified section
        f8 = [f8(1:lf_idc(secLbIdc(sdx-1))); uint8(f8_akt(1+numPrepend:end)).'; f8(lf_idc(secLbIdc(sdx))+1:end)];
        % update linebreak indices of the following sections
        % (but we don't know the lb indices of the current one anymore):
        lf_idc(secLbIdc(sdx:end)) = lf_idc(secLbIdc(sdx:end)) + exten;
    end
end % if num_er > 0 || num_rr > 0
% Character Replacements: =========================================
for vdx = 1:num_sr % step through replacement arguments
srarg = sr_input_ca{vdx}; % pick a single argument