forked from gigablast/open-source-search-engine
-
Notifications
You must be signed in to change notification settings - Fork 0
/
BigFile.cpp
1552 lines (1463 loc) · 53.1 KB
/
BigFile.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// JAB: this is required for pwrite() in this module
#undef _XOPEN_SOURCE
#define _XOPEN_SOURCE 500
#include "gb-include.h"
#include "BigFile.h"
#include "Dir.h"
#include "Threads.h"
#include "Stats.h"
#include "Statsdb.h"
#include "DiskPageCache.h"
// main.cpp will wait for this to be zero before exiting so all unlink/renames
// can complete
long g_unlinkRenameThreads = 0;
// . disk read bookkeeping; both are zeroed again by the BigFile constructor
// . NOTE(review): the code that updates them is not in this part of the
//   file; presumably millisecond timestamps -- confirm
long long g_lastDiskReadStarted = 0LL;
long long g_lastDiskReadCompleted = 0LL;
// . while this is true BigFile::readwrite() refuses niceness 0 reads that
//   would hit the disk and returns with g_errno set to EDISKSTUCK
// . also cleared by the BigFile constructor
bool g_diskIsStuck = false;
// completion callback BigFile::readwrite() hands to g_threads.call()
static void doneWrapper ( void *state , ThreadEntry *t ) ;
// . does the actual disk i/o described by "fstate"
// . called directly by BigFile::readwrite() when blocking (with t == NULL)
//   and by readwriteWrapper_r() when running in a thread
// . returns false and sets errno on error
static bool readwrite_r ( FileState *fstate , ThreadEntry *t ) ;
// . destructor
// . all the tear down work is delegated to close()
BigFile::~BigFile () {
	this->close ( );
}
//#define O_DIRECT 040000
// . construct an empty BigFile: no part files, no page cache, no virtual fd
// . the cached file size and last modified time start out as -1 ("not yet
//   computed")
// . call set() to bind it to a directory and base filename
// . NOTE: this also resets the global disk-stuck bookkeeping
//   (g_lastDiskReadStarted, g_lastDiskReadCompleted, g_diskIsStuck)
BigFile::BigFile () {
	m_permissions = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH ;
	m_flags = O_RDWR ; // | O_DIRECT;
	// NULLify all ptrs to files
	for ( long i = 0 ; i < MAX_PART_FILES ; i++ ) m_files[i] = NULL;
	// . start with empty names so that reset(), addParts() and
	//   makeFilename_r() never read uninitialized memory if they are
	//   called before set()
	// . addParts() treats an empty directory name as "nothing to scan"
	m_dir         [0] = '\0';
	m_stripeDir   [0] = '\0';
	m_baseFilename[0] = '\0';
	m_maxParts = 0;
	m_numParts = 0;
	m_pc = NULL;
	m_vfd = -1;
	m_vfdAllowed = false;
	m_fileSize = -1;
	m_lastModified = -1;
	m_numThreads = 0;
	m_isClosing = false;
	g_lastDiskReadStarted = 0;
	g_lastDiskReadCompleted = 0;
	g_diskIsStuck = false;
}
// we alternate parts into "dirname" and "stripeDir"
bool BigFile::set ( char *dir , char *baseFilename , char *stripeDir ) {
// reset filsize
m_fileSize = -1;
m_lastModified = -1;
// m_baseFilename contains the "dir" in it
//sprintf(m_baseFilename ,"%s/%s", dirname , baseFilename );
strcpy ( m_baseFilename , baseFilename );
strcpy ( m_dir , dir );
if ( stripeDir ) strcpy ( m_stripeDir , stripeDir );
else m_stripeDir[0] = '\0';
// reset # of parts
m_numParts = 0;
m_maxParts = 0;
// now add parts from both directories
if ( ! addParts ( m_dir ) ) return false;
if ( ! addParts ( m_stripeDir ) ) return false;
return true;
}
// . re-scan the directories given to the last set() call
// . forgets the cached file size and last modified time, zeroes the part
//   counters and then re-registers every part file found on disk
// . returns false if scanning either directory fails
// . NOTE(review): m_files[] is not cleared here and addPart() overwrites
//   slots without looking at them -- confirm old File objects are released
//   elsewhere
bool BigFile::reset ( ) {
	// part bookkeeping starts over
	m_maxParts = 0;
	m_numParts = 0;
	// cached values are no longer trustworthy
	m_lastModified = -1;
	m_fileSize     = -1;
	// primary directory first, then the stripe directory
	return addParts ( m_dir ) && addParts ( m_stripeDir );
}
bool BigFile::addParts ( char *dirname ) {
// if dirname is NULL return true
if ( ! dirname[0] ) return true;
// . now set the names of all the Files that we consist of
// . get the directory entry and find out what parts we have
Dir dir;
dir.set ( dirname );
// set our directory class
if (!dir.open()) return log("disk: openDir (\"%s\") failed",dirname);
// match files with this pattern in the directory
char pattern[256];
sprintf(pattern,"%s*", m_baseFilename );
// length of the base filename
long blen = gbstrlen ( m_baseFilename );
// . set our m_files array
// . addFile() will return false on problems
// . the lower the fileId the older the file (w/ exception of #0)
char *filename;
while ( ( filename = dir.getNextFilename ( pattern ) ) ) {
// if filename len is exactly blen it's part 0
long flen = gbstrlen(filename);
long part = -1;
if ( flen == blen ) part = 0;
// some files have the same first X chars, like
// indexdb.store-info-bak but are not part files
else if ( flen > blen && strncmp(filename+blen,".part",5)!=0)
continue;
// otherwise must end in .part%i
else if (flen - blen < 6 ) {
log ("disk: Part extension too small for \"%s\". "
"Must end in .partN to be valid.",
filename);
continue;
}
else part = atoi ( filename + blen + 5 );
// ensure not too big
if ( part >= MAX_PART_FILES ) {
log ("disk: Part number of %li is too big for "
"\"%s\". Should be less than %li.",
(long)part,filename,(long)MAX_PART_FILES);
continue;
}
// make this part file
if ( ! addPart ( part ) ) return false;
}
// now set the names of all our files
//for ( long n = 0 ; n < MAX_PART_FILES ; n++ )
//m_files[n].set ( makeFilename ( n, m_baseFilename ) );
return true;
}
// . create the File object for part #n and store it in m_files[n]
// . the File is only given its filename here, it is not opened
// . bumps m_numParts and raises m_maxParts to n+1 if needed
// . returns false if n is out of range or the File cannot be allocated
//   (g_errno is set to ENOMEM in the latter case)
// . NOTE(review): an existing m_files[n] is overwritten without being
//   freed and is counted again in m_numParts -- confirm callers never add
//   the same part twice
bool BigFile::addPart ( long n ) {
	// negative part numbers would write in front of m_files[]
	if ( n < 0 || n >= MAX_PART_FILES )
		return log("disk: Part number %li is out of range "
			   "[0,%li).",n,(long)MAX_PART_FILES);
	File *f ;
	try { f = new (File); }
	catch ( ... ) {
		g_errno = ENOMEM;
		// sizeof() is a size_t, cast it to match the format
		return log("BigFile: new(%li): %s",(long)sizeof(File),
			   mstrerror(g_errno));
	}
	// tell the memory accounting about it
	mnew ( f , sizeof(File) , "BigFile" );
	// build the full path of this part and hand it to the File
	char buf[1024];
	makeFilename_r ( m_baseFilename , NULL, n , buf );
	f->set ( buf );
	m_files [ n ] = f;
	m_numParts++;
	// m_maxParts is one more than the highest part number we have
	if ( n+1 > m_maxParts ) m_maxParts = n+1;
	return true;
}
// . true if at least one part file has been registered with addPart()
// . does not touch the disk
bool BigFile::doesExist ( ) {
	return m_numParts != 0;
}
// . true if part #n has a File object in m_files[]
// . this only checks our table, it does not open or stat anything
// . out of range part numbers (including negative ones) do not exist
bool BigFile::doesPartExist ( long n ) {
	if ( n < 0 || n >= MAX_PART_FILES ) return false;
	return m_files[n] != NULL;
}
// . remember the flags, permissions and page cache to use for this file
// . no part file is opened here; getfd() opens them on demand with the
//   flags and permissions we store
// . if a DiskPageCache is given we ask it for a "virtual fd" (m_vfd, not
//   the same thing as File::m_vfd) which is later passed to its
//   getPages() and addPages()
// . pass a maxFileSize of -1 to have us use getFileSize() instead
// . always returns true; g_errno is cleared after the getVfd() call
bool BigFile::open ( int flags , class DiskPageCache *pc ,
		     long long maxFileSize ,
		     int permissions ) {
	m_isClosing   = false;
	m_permissions = permissions;
	m_pc          = pc;
	m_flags       = flags;
	// without a page cache there is nothing else to set up
	if ( ! m_pc ) return true;
	// how big can the file get as far as the cache is concerned?
	long long cacheSize = maxFileSize;
	if ( cacheSize == -1 ) cacheSize = getFileSize();
	m_vfd   = m_pc->getVfd ( cacheSize, m_vfdAllowed );
	g_errno = 0;
	return true;
}
// get the filename of the nth file using m_dir/m_stripeDir & m_baseFilename
void BigFile::makeFilename_r ( char *baseFilename ,
char *baseFilenameDir ,
long n ,
char *buf ) {
char *dir = m_dir;
if ( baseFilenameDir && baseFilenameDir[0] ) dir = baseFilenameDir;
//static char s[1024];
if ( (n % 2) == 0 || ! m_stripeDir[0] )
sprintf ( buf, "%s/%s", dir , baseFilename );
else sprintf ( buf, "%s/%s", m_stripeDir, baseFilename );
if ( n == 0 ) return ;
sprintf ( buf + gbstrlen(buf) , ".part%li", n );
}
//int BigFile::getfdByOffset ( long long offset ) {
//	return getfd ( offset / MAX_PART_SIZE , true /*forReading?*/ );
//}
// . get the fd of the nth part file
// . will try to open the file if it hasn't yet been opened
// . if the part does not exist yet it is created with addPart(), but only
//   when "forReading" is false
// . if "vfd" is not NULL it receives the File's virtual fd (File::m_vfd)
// . returns -1 if n is out of range, the part is missing and we are
//   reading, the part could not be added or the open failed
// . also returns -1, with g_errno set to EBADENGINEER, if the File hands
//   back an fd below -1
int BigFile::getfd ( long n , bool forReading , long *vfd ) {
	// boundary check. a negative n would read in front of m_files[]
	if ( n < 0 || n >= MAX_PART_FILES ) {
		log("disk: Part number %li is out of range [0,%li). "
		    "fd not available.",n,(long)MAX_PART_FILES);
		return -1;
	}
	// get the File ptr from the table
	File *f = m_files[n];
	// if part does not exist then create it!
	if ( ! f ) {
		// don't create File if we're getting it for reading
		if ( forReading ) return -1;
		if ( ! addPart (n) ) return -1;
		f = m_files[n];
	}
	// open it if not opened
	if ( ! f->isOpen() ) {
		if ( ! f->open ( m_flags , m_permissions ) ) {
			log("disk: Failed to open file part #%li.",n);
			return -1;
		}
	}
	// set its virtual fd, too
	if ( vfd ) *vfd = f->m_vfd;
	// get its file descriptor
	int fd = f->getfd ( ) ;
	if ( fd >= -1 ) return fd;
	// otherwise, fd is -2 and it's never been opened?!?!
	g_errno = EBADENGINEER;
	log(LOG_LOGIC,"disk: fd is -2.");
	return -1;
}
// . return -2 on error
// . return -1 if does not exist
// . otherwise return the big file's complete file size (can be well over 2gb)
long long BigFile::getFileSize ( ) {
// return if already computed
if ( m_fileSize >= 0 ) return m_fileSize;
// add up the sizes of each file
long long totalSize = 0;
for ( long n = 0 ; n < m_maxParts ; n++ ) {
// we can have headless big files... count the heads
if ( ! m_files[n] ) { totalSize += MAX_PART_SIZE; continue; }
// . returns -2 on error, -1 if does not exist
// . TODO: it returns 0 if does not exist! FIX...
long size = m_files[n]->getFileSize();
if ( size == -2 ) return -2;
if ( size == -1 ) break;
totalSize += size;
}
// save time
m_fileSize = totalSize;
return totalSize;
}
// . return -2 on error
// . return -1 if does not exist
// . otherwise returns the oldest of the last mod dates of all the part files
time_t BigFile::getLastModifiedTime ( ) {
// return if already computed
if ( m_lastModified >= 0 ) return m_lastModified;
// add up the sizes of each file
time_t min = -1;
for ( long n = 0 ; n < m_maxParts ; n++ ) {
// we can have headless big files... count the heads
if ( ! m_files[n] ) continue;
// returns -1 on error, 0 if file does not exist
time_t date = m_files[n]->getLastModifiedTime();
if ( date == -1 ) return -2;
if ( date == 0 ) break;
// check min
if ( date < min || min == -1 ) min = date;
}
// save time
m_lastModified = min;
return m_lastModified;
}
// . read "size" bytes starting at "offset" into "buf"
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . just clears g_errno and forwards everything to readwrite() with
//   doWrite set to false, see there for what the arguments mean
bool BigFile::read ( void       *buf    , 
		     long        size   ,
		     long long   offset , 
		     FileState  *fs     , 
		     void       *state  , 
		     void      (* callback)(void *state) ,
		     long        niceness ,
		     bool        allowPageCache ,
		     bool        hitDisk        ,
		     long        allocOff       ) {
	const bool doWrite = false;
	g_errno = 0;
	return readwrite ( buf , size , offset , doWrite ,
			   fs , state , callback , niceness ,
			   allowPageCache , hitDisk , allocOff );
}
// . write "size" bytes from "buf" at "offset"
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . in read only mode (g_conf.m_readOnlyMode) nothing is written: we log
//   a debug message and return true without touching g_errno
// . otherwise clears g_errno and forwards to readwrite() with doWrite and
//   hitDisk set to true and an allocOff of 0
bool BigFile::write ( void       *buf    , 
		      long        size   ,
		      long long   offset , 
		      FileState  *fs     , 
		      void       *state  , 
		      void      (* callback)(void *state) ,
		      long        niceness ,
		      bool        allowPageCache ) {
	if ( ! g_conf.m_readOnlyMode ) {
		g_errno = 0;
		return readwrite ( buf , size , offset , true /*doWrite?*/ ,
				   fs , state , callback , niceness ,
				   allowPageCache ,
				   true /*hitDisk?*/ , 0 /*allocOff*/ );
	}
	// refuse to write in read only mode
	logf(LOG_DEBUG,"disk: BigFile: Trying to write while in "
	     "read only mode.");
	return true;
}
// . the work horse behind read() and write()
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . "fstate" holds everything the i/o needs. it is required if the file was
//   opened with O_NONBLOCK; otherwise a FileState on our stack is used when
//   it is NULL
// . the i/o may span 2 part files, so we track two fds
//   (m_filenum1/m_fd1 and m_filenum2/m_fd2)
// . for reads with a page cache and "allowPageCache" set we first take what
//   we can from the cache and only go to disk for the rest
// . "hitDisk" false means: only check the page cache, never touch the disk.
//   it is a fatal error to pass that together with allowPageCache false
// . "allocOff" is the padding to leave at the front of any read buffer that
//   gets allocated on behalf of the caller (RdbScan needs room to expand
//   the first key)
// . non-blocking: the i/o is handed to a DISK_THREAD via g_threads.call()
//   and "callback(state)" is called from doneWrapper() on completion. if no
//   thread could be launched we either return the error (reads, unless the
//   error is ENOTHREADSLOTS) or fall back to doing the i/o blocking
// . blocking: the i/o is done right here with readwrite_r()
bool BigFile::readwrite ( void       *buf      , 
			  long        size     ,
			  long long   offset   , 
			  bool        doWrite  ,
			  FileState  *fstate   ,
			  void       *state    ,
			  void      (* callback) ( void *state ) ,
			  long        niceness ,
			  bool        allowPageCache ,
			  bool        hitDisk        ,
			  long        allocOff       ) {
	// are we blocking?
	bool isNonBlocking = m_flags & O_NONBLOCK;
	// if we're non blocking and caller didn't supply an "fstate"
	if ( isNonBlocking && ! fstate ) {
		g_errno = EBADENGINEER;
		log(LOG_LOGIC,"disk: readwrite() call is "
		    "specified as non-blocking, but no state provided.");
		return true;
	}
	// reset file size in case we change it here
	if ( doWrite ) {
		m_fileSize     = -1;
		m_lastModified = getTimeLocal();
	}
	// . sanity check
	// . when our offset was just a long 2gig+ files, when dumped,
	//   had negative offsets, bad engineer
	if ( offset < 0 ) {
		log(LOG_LOGIC,"disk: readwrite() offset is %lli "
		    "< 0. dumping core.",offset);
		char *xx = NULL; *xx = 0; 
	}
	// if we're not blocking use a fake fstate
	FileState tmp;
	if ( ! fstate ) fstate = &tmp;
	// offset to read into "buf"
	long bufOff = 0;
	// . point to start of space allocated to hold what we read. "buf"
	//   should be >= allocBuf + allocOff, depending on value of bufOff
	// . allocSize must start out as 0: it is copied into fstate below
	//   even when the page cache (the only thing that sets it) is
	//   bypassed
	char *allocBuf  = NULL;
	long  allocSize = 0;
	// . no error yet
	// . need this up here in case it is a cache hit
	fstate->m_errno = 0;
	// . try to get as much as we can from page cache first
	// . the vfd of the big file will be the vfd of its last File class
	if ( ! doWrite && m_pc && allowPageCache ) {
		// keep the full 64 bits, offsets can be well over 2gb
		long long oldOff = offset;
		// we have to set these so RdbScan doesn't freak out if we
		// have it all cached and return without hitting disk
		fstate->m_bytesDone = size;
		fstate->m_bytesToGo = size;
		// now we pass in a ptr to the buf ptr, because if buf is NULL
		// this will allocate one for us if it has some pages in the
		// cache that we can use. "size" and "offset" are updated to
		// describe what is still left to read from disk.
		m_pc->getPages (m_vfd,(char **)&buf,size,offset,&size,&offset,
				&allocBuf,&allocSize,allocOff);
		// how far the disk read starts into the caller's buffer
		bufOff = (long)(offset - oldOff);
		// all of it was in the cache?
		if ( size == 0 ) {
			// let caller/RdbScan know about the newly alloc'd buf
			fstate->m_buf       = (char *)buf;
			fstate->m_allocBuf  = allocBuf;
			fstate->m_allocSize = allocSize;
			fstate->m_allocOff  = allocOff;
			return true;
		}
	}
	// sanity check. if you set hitDisk to false, you must allow
	// us to check the page cache! silly bean!
	if ( ! allowPageCache && ! hitDisk ) { char*xx=NULL;*xx=0; }
	// set up fstate
	fstate->m_this       = this;
	// buf may be NULL if caller passed in a NULL "buf" and it did not hit
	// the disk page cache. Threads.cpp will have to allocate it right
	// before it launches the thread.
	fstate->m_buf        = (char *)buf + bufOff;
	// if getPages() allocates a buf, this will point to it
	fstate->m_allocBuf   = allocBuf;
	fstate->m_allocSize  = allocSize;
	// when buf is passed in as NULL we allocate it in Threads.cpp right
	// before we launch it to save memory. it may also be allocated in
	// DiskPageCache.cpp. we have to know where to start storing
	// the read into it for RdbScan, it is not immediately at the
	// beginning of the allocated buffer because RdbScan may have to
	// turn the first key from a 6 byte half key into a 12 byte key so it
	// needs some initial padding. this is because RdbLists should never
	// start with a 6 byte half key.
	fstate->m_allocOff   = allocOff;
	fstate->m_bytesToGo  = size;
	fstate->m_offset     = offset;
	fstate->m_doWrite    = doWrite;
	fstate->m_bytesDone  = 0;
	fstate->m_state      = state;
	fstate->m_callback   = callback;
	fstate->m_niceness   = niceness;
	fstate->m_flags      = m_flags;
	// . set our part numbers before entering the thread in case RdbMerge
	//   calls our unlinkPart()
	// . it's thread-UNsafe to call getfd() from within the thread
	// . what if we get unlinked and another file gets this fd!!
	// . now we do do unlinks in a thread in File.cpp, but since we
	//   employ the getCloseCount_r() scheme we can detect when this
	//   situation occurs and pass a g_errno back to the caller.
	fstate->m_filenum1   = offset / MAX_PART_SIZE;
	fstate->m_filenum2   = (offset + size ) / MAX_PART_SIZE;
	// . for reads the fds and close counts are set in Threads.cpp since
	//   we only want to do the open right before we actually launch
	//   the thread
	// . -3 marks them as "not set yet"
	fstate->m_fd1  = -3;
	fstate->m_fd2  = -3;
	fstate->m_vfd1 = -3;
	fstate->m_vfd2 = -3;
	// . if we are writing, prevent these fds from being closed on us
	//   by File::closedLeastUsed(), because the fd could then be re-opened
	//   by someone else doing a write and we end up writing to THAT FILE!
	// . the closeCount mechanism helps us DETECT when something like this
	//   happens, but it will not prevent the write from going through
	if ( doWrite ) {
		// actually have to do the open here for writing so it
		// can prevent the fds from being closed on us
		fstate->m_fd1 = getfd ( fstate->m_filenum1 , !doWrite,
					&fstate->m_vfd1);
		fstate->m_fd2 = getfd ( fstate->m_filenum2 , !doWrite,
					&fstate->m_vfd2);
		enterWriteMode( fstate->m_vfd1 );
		enterWriteMode( fstate->m_vfd2 );
		// get the close counts after calling getfd() since if
		// getfd() calls File::open() that will inc the counts
		fstate->m_closeCount1 = getCloseCount_r ( fstate->m_fd1 );
		fstate->m_closeCount2 = getCloseCount_r ( fstate->m_fd2 );
	}
	fstate->m_errno      = 0;
	fstate->m_errno2     = 0;
	fstate->m_startTime  = gettimeofdayInMilliseconds();
	// fstate gets its own page cache ptr, NULL if caching is not allowed
	fstate->m_pc         = m_pc;
	if ( ! allowPageCache )
		fstate->m_pc = NULL;
	fstate->m_vfd        = m_vfd;
	// if hitDisk was false we only check the page cache!
	if ( ! hitDisk ) return true;
	// if disk stuck, forget about it! but make the spider disk reads
	// wait until it is unstuck. just don't want to screw up the queries..
	if ( g_diskIsStuck && niceness == 0 && ! doWrite ) {
		g_errno = fstate->m_errno = EDISKSTUCK;
		return true;
	}
	// . if we're non-blocking try to spawn a thread to do this i/o
	// . if we're blocking skip right to doing it ourselves below
	if ( isNonBlocking ) {
		// . this returns false and sets g_errno on error, true on
		//   success
		// . we should return false cuz we blocked
		// . doneWrapper() is called when the thread completes
		if ( g_threads.call ( DISK_THREAD/*threadType*/, niceness ,
				      fstate , doneWrapper ,
				      readwriteWrapper_r) ) return false;
		// note it, but not more than once every few seconds
		if ( g_errno ) {
			static time_t s_time  = 0;
			time_t now = getTime();
			if ( now - s_time > 5 ) {
				log (LOG_INFO,"disk: Thread call failed: %s.",
				     mstrerror(g_errno));
				s_time = now;
			}
		}
		// sanity check
		if ( ! callback ) { char *xx = NULL; *xx = 0; }
		// NOW we return on error because if we already have 5000
		// disk threads queued up, what is the point in blocking
		// ourselves off? that makes us look like a dead host and
		// very unresponsive. As long as this request originated
		// through Multicast, then multicast will sleep and retry.
		// Msg3 could retry, the multicast thing should be more
		// for running out of udp slots though...
		if ( g_errno && ! doWrite && g_errno != ENOTHREADSLOTS ) {
			log (LOG_INFO,"disk: May retry later.");
			return true;
		}
		// otherwise, thread spawn failed, do it blocking then
		g_errno = 0;
		if ( ! g_threads.m_disabled ) {
			static long s_lastTime = 0;
			long now = getTime();
			if ( now - s_lastTime >= 1 ) {
				s_lastTime = now;
				log (LOG_INFO,
				     "disk: Doing blocking disk access. "
				     "This will hurt "
				     "performance. isWrite=%li.",
				     (long)doWrite);
			}
		}
	}
	// . come here if we haven't spawned a thread
	// . Threads.cpp did not get the chance to open the fds for us so we
	//   must do it here
	fstate->m_fd1  = getfd ( fstate->m_filenum1 , !doWrite , 
				 &fstate->m_vfd1);
	fstate->m_fd2  = getfd ( fstate->m_filenum2 , !doWrite , 
				 &fstate->m_vfd2);
	fstate->m_closeCount1 = getCloseCount_r ( fstate->m_fd1 );
	fstate->m_closeCount2 = getCloseCount_r ( fstate->m_fd2 );
	// clear g_errno from the failed thread spawn
	g_errno = 0;
	// since Threads.cpp usually allocs the buffer before launching,
	// we must do it here now
	FileState *fs = fstate;
	if ( ! fs->m_doWrite && ! fs->m_buf && fs->m_bytesToGo > 0 ) {
		long need = fs->m_bytesToGo + fs->m_allocOff;
		char *p = (char *) mmalloc ( need , "ThreadReadBuf" );
		if ( p ) {
			fs->m_buf       = p + fs->m_allocOff;
			fs->m_allocBuf  = p;
			fs->m_allocSize = need;
		}
		// NOTE(review): we still call readwrite_r() below with a
		// NULL buffer in this case -- confirm it copes with that
		else
			log("disk: read buf alloc failed for %li "
			    "bytes.",need);
	}
	// . this returns false and sets errno on error
	// . set g_errno to the errno
	if ( ! readwrite_r ( fstate , NULL ) ) g_errno = errno;
	// exit write mode
	if ( doWrite ) {
		exitWriteMode( fstate->m_vfd1 );
		exitWriteMode( fstate->m_vfd2 );
	}
	// set this up here
	fstate->m_bytesDone = fstate->m_bytesToGo;
	// and this too
	fstate->m_doneTime = gettimeofdayInMilliseconds();
	// . complain if it was slow and not niced
	// . "rate" is in bytes per millisecond and is only computed if the
	//   i/o took more than half a second
	long long now  = gettimeofdayInMilliseconds() ;
	long long took = now - fstate->m_startTime ;
	long      rate = 100000;
	if ( took  > 500 ) rate = fstate->m_bytesDone / took ;
	if ( rate < 8000 && fstate->m_niceness <= 0 ) {
		log(LOG_INFO,"disk: Read %li bytes in %lli ms (%liMB/s).",
		     fstate->m_bytesDone,took,rate);
		g_stats.m_slowDiskReads++;
	}
	// default graph color is black
	int color = 0x00000000;
	// use red for writes, though
	if ( fstate->m_doWrite ) color = 0x00ff0000;
	// but gray for low priority reads
	else if ( fstate->m_niceness > 0 ) color = 0x00808080;
	// add the stat
	g_stats.addStat_r ( fstate->m_bytesDone , 
			    fstate->m_startTime , 
			    now ,
			    color );
	// store read/written pages into page cache
	if ( ! g_errno && fstate->m_pc )
		fstate->m_pc->addPages ( fstate->m_vfd     ,
					 fstate->m_buf     ,
					 fstate->m_bytesDone ,
					 fstate->m_offset    ,
					 fstate->m_niceness  );
	// . now log our stuff here
	// . EBADENGINEER can happen right after a merge if the file is
	//   renamed because the fd may have changed from under us, so we
	//   keep quiet about that one
	if ( g_errno && g_errno != EBADENGINEER )
		log("disk: readwrite: %s", mstrerror(g_errno));
	// we didn't block so return true
	return true;
}
// . completion callback for the disk threads launched by
//   BigFile::readwrite() (it is the callback given to g_threads.call())
// . "state" is the caller's FileState, "t" is the ThreadEntry of the thread
//   that did the i/o
// . the thread worked on a copy of the FileState on its own stack, so the
//   results (bytes done, finish time, errno) are copied into the real
//   FileState here
// . then we leave write mode, update the slow read counter, feed the page
//   cache and the stats, log any error and finally call the caller's
//   callback with g_errno set if there was an error
// . never touches fstate->m_this because the BigFile could have been
//   deleted by now
void doneWrapper ( void *state , ThreadEntry *t ) {
	FileState *fstate = (FileState *)state;
	// any writes we did in the disk read thread were done to the
	// "tmp" FileState class on the stack, so now we have the real deal
	// we can update all this junk.
	fstate->m_bytesDone = fstate->m_bytesToGo;
	fstate->m_doneTime  = t->m_exitTime; // set in Threads.cpp
	fstate->m_errno     = t->m_errno;
	// exit write mode
	if ( fstate->m_doWrite ) {
		exitWriteMode( fstate->m_vfd1 );
		exitWriteMode( fstate->m_vfd2 );
	}
	// . complain if it was slow and not niced
	// . "rate" is in bytes per millisecond and is only computed if the
	//   i/o took more than half a second
	long long took = fstate->m_doneTime - fstate->m_startTime;
	long      rate = 100000;
	if ( took  > 500 ) rate = fstate->m_bytesDone / took ;
	bool slow = false;
	if ( rate < 8000 ) slow = true;
	// a stuck disk counts as slow, too
	if ( fstate->m_errno == EDISKSTUCK ) slow = true;
	if ( slow && fstate->m_niceness <= 0 ) {
		if ( fstate->m_errno != EDISKSTUCK )
			log(LOG_INFO, "disk: Read %li bytes in %lli ms (%liMB/s).",
			    fstate->m_bytesDone,took,rate);
		g_stats.m_slowDiskReads++;
	}
	// recall g_errno from state's m_errno
	g_errno = fstate->m_errno;
	// might have had the file renamed/unlinked from under us
	if ( ! g_errno ) g_errno = fstate->m_errno2;
	// fstate has his own m_pc in case BigFile got deleted, we cannot
	// reference it...
	if ( ! g_errno && fstate->m_pc )
		fstate->m_pc->addPages ( fstate->m_vfd       ,
					 fstate->m_buf       ,
					 fstate->m_bytesDone ,
					 fstate->m_offset    ,
					 fstate->m_niceness  );
	// add the stat
	if ( ! g_errno ) {
		// default graph color is black
		int color = 0x00000000;
		// use red for writes, though
		if ( fstate->m_doWrite ) color = 0x00ff0000;
		// but gray for low priority reads
		else if ( fstate->m_niceness > 0 ) color = 0x00808080;
		// add it
		g_stats.addStat_r ( fstate->m_bytesDone , 
				    fstate->m_startTime , 
				    fstate->m_doneTime  ,
				    color               );
	}
	// . now log our stuff here
	// . a closed file is only worth an info message
	long tt = LOG_WARN;
	if ( g_errno == EFILECLOSED ) tt = LOG_INFO;
	if ( g_errno && g_errno != EDISKSTUCK ) 
		log (tt,"disk: %s. fd1=%li vfd=%li "
		     "off=%lli toread=%li.", 
		     mstrerror(g_errno),
		     (long)fstate->m_fd1,(long)fstate->m_vfd,
		     (long long)fstate->m_offset , 
		     (long)fstate->m_bytesToGo );
	// someone is closing our fd without setting File::s_vfds[fd] to -1
	if ( g_errno && g_errno != EDISKSTUCK ) {
		int vfd1 = fstate->m_vfd1;
		int vfd2 = fstate->m_vfd2;
		int ofd1 = getfdFromVfd(vfd1);
		int ofd2 = getfdFromVfd(vfd2);
		log(tt,"disk: vfd1=%i s_fds[%i]=%i.",vfd1,vfd1,ofd1);
		log(tt,"disk: vfd2=%i s_fds[%i]=%i.",vfd2,vfd2,ofd2);
	}
	// call the callback, with g_errno set if there was an error
	fstate->m_callback ( fstate->m_state );
}
// . thread entry point that runs one BigFile read/write via readwrite_r()
// . "state" is the caller's FileState, "t" is the ThreadEntry we run under
// . all work is done on a stack copy of the FileState; the resulting error
//   code (0 if none was recorded) is handed back through t->m_errno
// . retries the whole readwrite_r() call when it was interrupted (EINTR)
// . fails reads with EFILECLOSED if either File's m_closeCount no longer
//   matches the value saved in the FileState
// . always returns NULL
// . NOTE(review): the ohcrap checks tolerate a NULL "t" but the stores to
//   t->m_readyForBail and t->m_errno below do not -- confirm that "t" is
//   never NULL when this wrapper is invoked
void *readwriteWrapper_r ( void *state , ThreadEntry *t ) {
	// debug msg
	//log("disk: this thread id = %li",(long)pthread_self());
	// if we were queued and now we are launching stuck, just return now
	//if ( g_diskIsStuck ) {
	//	t->m_errno = EDISKSTUCK;
	//	return NULL;
	//}
	// if we got hit before we set m_readyForBail to true we must have
	// been hit pre-launch... so bail quickly in that case...
	if ( t && t->m_callback == ohcrap ) {
		t->m_errno = EDISKSTUCK;
		return NULL;
	}
	// extract our class
	FileState *orig = (FileState *)state;
	// save a copy on the stack in case fstate gets pulled from under us
	FileState tmp;
	memcpy ( &tmp , orig , sizeof(FileState ));
	FileState *fstate = &tmp;
	// let Threads::bailOnReads() know we can be bailed on now since
	// we have copied over all the data we can from fstate, which can
	// be pulled out from under us now
	t->m_readyForBail = true;
	// get THIS
	//BigFile *THIS = fstate->m_this;
	// clear thread's errno
	errno = 0;
	// . make it so we go away immediately upon receiving a cancellation
	//   signal rather than queuing the signal until we call
	//   pthread_testcancel()
	// . this allows us to immediately halt disk reads/writes that are
	//   lower priority than i/o's we're about to do
	// . this is so merging won't affect queries per second so much
	//int err = pthread_setcanceltype ( PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
	//if ( err != 0 ) log("readwriteWrapper: pthread_setcanceltype: %s",
	//		      mstrerror(err) );
	// . do the readwrite_r() since we're a thread now
	// . this SHOULD NOT set g_errno, we're a thread!
	// . it does have its own errno however
	// . if this gets a cancel signal in the read() it will stop blocking
	//   and errno will be EINTR
 again:
	bool status = readwrite_r ( fstate , t ) ;
	// . snapshot errno right away: the log() calls further down may
	//   overwrite errno before we get to test it for EINTR
	// . everything below uses this snapshot instead of re-reading errno
	const int rwErrno = errno;
	// did our callback get pre-called by Process.cpp/Threads.cpp?
	// fstate is probably invalid then, so watch out!
	if ( t && t->m_callback == ohcrap ) return NULL;
	// set errno
	if ( ! status ) fstate->m_errno = rwErrno;
	// test again here
	//pthread_testcancel();
	// get the two files
	File *f1 = NULL;
	File *f2 = NULL;
	// when we exit, m_this is invalid!!!
	if ( fstate->m_filenum1 < fstate->m_this->m_maxParts )
		f1 = fstate->m_this->m_files[fstate->m_filenum1];
	if ( fstate->m_filenum2 < fstate->m_this->m_maxParts )
		f2 = fstate->m_this->m_files[fstate->m_filenum2];
	// . if open count changed on us our file got unlinked from under us
	//   and another file was opened with that same fd!!!
	// . just fail the read so caller knows it is bad
	// . do not do this for writes because RdbDump can fail when writing!
	// . in that case hopefully write will fail if the fd was re-opened
	//   for another file in RDONLY mode, but, if per chance it opens
	//   a different file for dumping or merging with this same fd then
	//   we may be seriously screwing things up!! TODO: investigate
	// . f1 and f2 can be non-null and invalid here now on the ssds
	//   i saw this happen on gk153... i preserved the core/gb on there
	//if ( (getCloseCount_r (fstate->m_fd1) != fstate->m_closeCount1 ||
	//      getCloseCount_r (fstate->m_fd2) != fstate->m_closeCount2 )) {
	if ( ! f1 ||
	     ! f2 ||
	     f1->m_closeCount != fstate->m_closeCount1 ||
	     f2->m_closeCount != fstate->m_closeCount2 ) {
		// -1 is logged for a File we could not look up
		long cc1 = -1;
		long cc2 = -1;
		if ( f1 ) cc1 = f1->m_closeCount;
		if ( f2 ) cc2 = f2->m_closeCount;
		log("file: c1a=%li c1b=%li c2a=%li c2b=%li",
		    cc1,fstate->m_closeCount1,
		    cc2,fstate->m_closeCount2);
		if ( ! fstate->m_doWrite ) fstate->m_errno = EFILECLOSED;
		// we use s_writing[] locks in File.cpp to prevent a write
		// operation's fd from being closed under him
		else log("PANIC: fd closed on us while writing. This should "
			 "never happen!! Simultaneous writes?");
	}
	// . if it wasn't cancelled, just interrupted, try again
	// . test the snapshot, not errno itself, see above
	if ( rwErrno == EINTR ) {
		errno = 0;
		fstate->m_errno = 0;
		goto again;
	}
	// turn off the cancel-ability of this thread
	//pthread_setcancelstate ( PTHREAD_CANCEL_DISABLE , NULL );
	// set done time even if errno set
	// - mdw, can't set this here now because fstate might be invalid...
	//long long now = gettimeofdayInMilliseconds() ;
	//fstate->m_doneTime = now;
	/*
	// add the stat
	if ( ! errno ) {
		// default graph color is black
		int color = 0x00000000;
		char *label = "disk_read";
		// use red for writes, though
		if ( fstate->m_doWrite ) {
			color = 0x00ff0000;
			label = "disk_write";
		}
		// but gray for low priority reads
		else if ( fstate->m_niceness > 0 ) color = 0x00808080;
		// add it
		g_stats.addStat_r ( fstate->m_bytesDone ,
				    fstate->m_startTime ,
				    now ,
				    label ,
				    color );
	}
	*/
	// debug msg
	//fprintf(stderr,"BigFile exiting thread, state=%lu\n",(long)fstate);
	// . we're all done, tell g_threads
	// . this never returns
	// . the state must be unique per thread so we know what thread this is
	// . i tried using pthread_self() but we'd have to store it in
	//   g_thread's ThreadEntry ourselves, as a thread
	// . the thread's cleanUp handler should call g_threads.exit(fstate)
	//g_threads.exit ( fstate );
	//pthread_exit ( NULL );
	// update this since our updates were done to the FileState "tmp"
	// which is just on the stack
	t->m_errno = fstate->m_errno;
	// bogus return
	return NULL;
}
// . returns false and sets errno on error, true on success
// . do not log from inside a thread anymore
// . if we receive a cancel signal while in pread/pwrite it will return -1
// and set errno to EINTR
bool readwrite_r ( FileState *fstate , ThreadEntry *t ) {
// if no buffer to read into the alloc in Threads.cpp failed
if ( ! fstate->m_buf ) {
errno = EBUFTOOSMALL;
return log( "disk: read buf is NULL. malloc failed?");
}
// how many total bytes to write?
long bytesToGo = fstate->m_bytesToGo; //- fstate->m_bytesDone;
// how many bytes we've written so far
long bytesDone = fstate->m_bytesDone;
// get current offset