-
Notifications
You must be signed in to change notification settings - Fork 8
/
wander.c
1757 lines (1389 loc) · 48.2 KB
/
wander.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
* reiser4/README */
/* Reiser4 Wandering Log */
/* You should read http://www.namesys.com/txn-doc.html
That describes how filesystem operations are performed as atomic
transactions, and how we try to arrange it so that we can write most of the
data only once while performing the operation atomically.
For the purposes of this code, it is enough for it to understand that it
has been told a given block should be written either once, or twice (if
twice then once to the wandered location and once to the real location).
This code guarantees that those blocks that are defined to be part of an
atom either all take effect or none of them take effect.
The "relocate set" of nodes are submitted to write by the jnode_flush()
routine, and the "overwrite set" is submitted by reiser4_write_log().
This is because with the overwrite set we seek to optimize writes, and
with the relocate set we seek to cause disk order to correlate with the
"parent first order" (preorder).
reiser4_write_log() allocates and writes wandered blocks and maintains
additional on-disk structures of the atom as wander records (each wander
record occupies one block) for storing of the "wandered map" (a table which
contains a relation between wandered and real block numbers) and other
information which might be needed at transaction recovery time.
The wander records are unidirectionally linked into a circle: each wander
record contains a block number of the next wander record, the last wander
record points to the first one.
One wander record (named "tx head" in this file) has a format which is
different from the other wander records. The "tx head" has a reference to the
"tx head" block of the previously committed atom. Also, "tx head" contains
fs information (the free blocks counter, and the oid allocator state) which
is logged in a special way.
There are two journal control blocks, named journal header and journal
footer which have fixed on-disk locations. The journal header has a
reference to the "tx head" block of the last committed atom. The journal
footer points to the "tx head" of the last flushed atom. The atom is
"played" when all blocks from its overwrite set are written to disk the
second time (i.e. written to their real locations).
NOTE: People who know reiserfs internals and its journal structure might be
confused with these terms journal footer and journal header. There is a table
with terms of similar semantics in reiserfs (reiser3) and reiser4:
REISER3 TERM | REISER4 TERM | DESCRIPTION
--------------------+-----------------------+----------------------------
commit record | journal header | atomic write of this record
| | ends transaction commit
--------------------+-----------------------+----------------------------
journal header | journal footer | atomic write of this record
| | ends post-commit writes.
| | After successful
| | writing of this journal
| | blocks (in reiser3) or
| | wandered blocks/records are
| | free for re-use.
--------------------+-----------------------+----------------------------
The atom commit process is the following:
1. The overwrite set is taken from atom's clean list, and its size is
counted.
2. The number of necessary wander records (including tx head) is calculated,
and the wander record blocks are allocated.
3. Allocate wandered blocks and populate wander records by wandered map.
4. submit write requests for wander records and wandered blocks.
5. wait until submitted write requests complete.
6. update journal header: change the pointer to the block number of just
written tx head, submit an i/o for modified journal header block and wait
for i/o completion.
NOTE: The special logging for bitmap blocks and some reiser4 super block
fields makes processes of atom commit, flush and recovering a bit more
complex (see comments in the source code for details).
The atom playing process is the following:
1. Write atom's overwrite set in-place.
2. Wait on i/o.
3. Update journal footer: change the pointer to block number of tx head
block of the atom we currently flushing, submit an i/o, wait on i/o
completion.
4. Free disk space which was used for wandered blocks and wander records.
After the freeing of wandered blocks and wander records we have that journal
footer points to the on-disk structure which might be overwritten soon.
Neither the log writer nor the journal recovery procedure use that pointer
for accessing the data. When the journal recovery procedure finds the oldest
transaction it compares the journal footer pointer value with the "prev_tx"
pointer value in tx head, if values are equal the oldest not flushed
transaction is found.
NOTE on disk space leakage: the information about which blocks and how many
blocks are allocated for wandered blocks and wander records is not written to
the disk, because of the special logging for bitmaps and some super block
counters. After a system crash reiser4 does not remember those allocations,
so there is no disk space leakage of this kind.
*/
/* Special logging of reiser4 super block fields. */
/* There are some reiser4 super block fields (free block count and OID allocator
state (number of files and next free OID) which are logged separately from
super block to avoid unnecessary atom fusion.
So, the reiser4 super block need not be captured by a transaction which
allocates/deallocates disk blocks or creates/deletes file objects. Moreover,
the reiser4 on-disk super block is not touched when such a transaction is
committed and flushed. Those "counters logged specially" are logged in "tx
head" blocks and in the journal footer block.
A step-by-step description of special logging:
0. The per-atom information about deleted or created files and allocated or
freed blocks is collected during the transaction. The atom's
->nr_objects_created and ->nr_objects_deleted are for object
deletion/creation tracking, the numbers of allocated and freed blocks are
calculated using atom's delete set and atom's capture list -- all new and
relocated nodes should be on atom's clean list and should have JNODE_RELOC
bit set.
1. The "logged specially" reiser4 super block fields have their "committed"
versions in the reiser4 in-memory super block. They get modified only at
atom commit time. The atom's commit thread has an exclusive access to those
"committed" fields because the log writer implementation supports only one
atom commit a time (there is a per-fs "commit" mutex). At
that time "committed" counters are modified using per-atom information
collected during the transaction. These counters are stored on disk as a
part of tx head block when atom is committed.
2. When the atom is flushed the value of the free block counter and the OID
allocator state get written to the journal footer block. A special journal
procedure (journal_recover_sb_data()) takes those values from the journal
footer and updates the reiser4 in-memory super block.
NOTE: That means free block count and OID allocator state are logged
separately from the reiser4 super block regardless of the fact that the
reiser4 super block has fields to store both the free block counter and the
OID allocator.
Writing the whole super block at commit time requires knowing true values of
all its fields without changes made by not yet committed transactions. It is
possible by having their "committed" version of the super block like the
reiser4 bitmap blocks have "committed" and "working" versions. However,
another scheme was implemented which stores special logged values in the
unused free space inside transaction head block. In my opinion it has an
advantage of not writing whole super block when only part of it was
modified. */
#include "debug.h"
#include "dformat.h"
#include "txnmgr.h"
#include "jnode.h"
#include "znode.h"
#include "block_alloc.h"
#include "page_cache.h"
#include "wander.h"
#include "reiser4.h"
#include "super.h"
#include "vfs_ops.h"
#include "writeout.h"
#include "inode.h"
#include "entd.h"
#include <linux/types.h>
#include <linux/fs.h> /* for struct super_block */
#include <linux/mm.h> /* for struct page */
#include <linux/pagemap.h>
#include <linux/bio.h> /* for struct bio */
#include <linux/blkdev.h>
static int write_jnodes_to_disk_extent(
jnode *, int, const reiser4_block_nr *, flush_queue_t *, int);
/* The commit_handle is a container for objects needed at atom commit time.
   One commit_handle is built per committing atom (the log writer supports
   only one atom commit at a time, see the design comment above). */
struct commit_handle {
	/* A pointer to atom's list of OVRWR (overwrite set) nodes */
	struct list_head *overwrite_set;
	/* atom's overwrite set size (number of jnodes on the list above) */
	int overwrite_set_size;
	/* jnodes for wander record blocks; the first element is the tx head */
	struct list_head tx_list;
	/* number of wander records, including the tx head block */
	__u32 tx_size;
	/* 'committed' sb counters are saved here until atom is completely
	   flushed; they are written into the tx head and journal footer
	   (special logging, see the comment at the top of this file) */
	__u64 free_blocks;
	__u64 nr_files;
	__u64 next_oid;
	/* A pointer to the atom which is being committed */
	txn_atom *atom;
	/* A pointer to current super block */
	struct super_block *super;
	/* The counter of modified bitmaps (used for statistics) */
	reiser4_block_nr nr_bitmap;
};
/* Initialize a commit_handle: zero every field, set up the (initially empty)
   list of wander record jnodes, and bind the handle to @atom and to the
   current super block. */
static void init_commit_handle(struct commit_handle *ch, txn_atom *atom)
{
	memset(ch, 0, sizeof *ch);
	INIT_LIST_HEAD(&ch->tx_list);

	ch->super = reiser4_get_current_sb();
	ch->atom = atom;
}
/* Finish with a commit_handle.  By this point every wander record jnode must
   have been removed from the tx_list (see dealloc_tx_list()). */
static void done_commit_handle(struct commit_handle *ch)
{
	assert("zam-690", list_empty(&ch->tx_list));
}
/* Fill the journal header block with the block number of the tx head of the
   transaction being committed.  The journal header is the fixed-location
   control block whose atomic update ends the commit (see the table in the
   design comment above). */
static void format_journal_header(struct commit_handle *ch)
{
	struct reiser4_super_info_data *sbinfo;
	struct journal_header *header;
	jnode *txhead;

	sbinfo = get_super_private(ch->super);
	assert("zam-479", sbinfo != NULL);
	assert("zam-480", sbinfo->journal_header != NULL);

	/* the tx head is always the first jnode on the tx_list */
	txhead = list_entry(ch->tx_list.next, jnode, capture_link);

	/* pin journal header data into memory; check the result with
	   check_me() as format_journal_footer() does, instead of silently
	   ignoring the return value */
	check_me("zam-480a", jload(sbinfo->journal_header) == 0);
	header = (struct journal_header *)jdata(sbinfo->journal_header);
	assert("zam-484", header != NULL);

	/* on-disk field is little-endian and potentially unaligned */
	put_unaligned(cpu_to_le64(*jnode_get_block(txhead)),
		      &header->last_committed_tx);

	jrelse(sbinfo->journal_header);
}
/* Fill the journal footer block: record the tx head of the atom whose
   overwrite set has just been written in place, together with the specially
   logged super block counters (free blocks, file count, next OID).  Writing
   the footer ends the post-commit phase; afterwards the atom's wandered
   blocks and wander records may be reused. */
static void format_journal_footer(struct commit_handle *ch)
{
	struct reiser4_super_info_data *sbinfo;
	struct journal_footer *footer;
	jnode *tx_head;

	sbinfo = get_super_private(ch->super);

	/* tx head is the first jnode on the commit handle's tx_list */
	tx_head = list_entry(ch->tx_list.next, jnode, capture_link);

	assert("zam-493", sbinfo != NULL);
	/* this function loads and formats the journal *footer*; the original
	   assertion checked journal_header instead -- a copy-paste slip */
	assert("zam-494", sbinfo->journal_footer != NULL);

	check_me("zam-691", jload(sbinfo->journal_footer) == 0);
	footer = (struct journal_footer *)jdata(sbinfo->journal_footer);
	assert("zam-495", footer != NULL);

	/* on-disk fields are little-endian and potentially unaligned */
	put_unaligned(cpu_to_le64(*jnode_get_block(tx_head)),
		      &footer->last_flushed_tx);
	put_unaligned(cpu_to_le64(ch->free_blocks), &footer->free_blocks);
	put_unaligned(cpu_to_le64(ch->nr_files), &footer->nr_files);
	put_unaligned(cpu_to_le64(ch->next_oid), &footer->next_oid);

	jrelse(sbinfo->journal_footer);
}
/* How many wandered-map entries fit into one wander record block; this
   depends on the filesystem block size, minus the record header. */
static int wander_record_capacity(const struct super_block *super)
{
	size_t payload = super->s_blocksize -
		sizeof(struct wander_record_header);

	return payload / sizeof(struct wander_entry);
}
/* Fill the first wander record (the "tx head") with the data collected in
   the commit handle: record count, link to the previous committed tx, link
   to the next wander record, and the specially logged sb counters. */
static void format_tx_head(struct commit_handle *ch)
{
	jnode *tx_head;
	jnode *next;
	struct tx_header *header;

	/* tx head is the first jnode on the tx_list */
	tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
	assert("zam-692", &ch->tx_list != &tx_head->capture_link);

	/* wander records form a circular chain: the tx head links to the
	   following record, or to itself when it is the only record */
	next = list_entry(tx_head->capture_link.next, jnode, capture_link);
	if (&ch->tx_list == &next->capture_link)
		next = tx_head;

	header = (struct tx_header *)jdata(tx_head);

	assert("zam-460", header != NULL);
	assert("zam-462", ch->super->s_blocksize >= sizeof(struct tx_header));

	/* zero the whole block, then stamp magic and fields */
	memset(jdata(tx_head), 0, (size_t) ch->super->s_blocksize);
	memcpy(jdata(tx_head), TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE);

	/* all on-disk fields are little-endian, potentially unaligned */
	put_unaligned(cpu_to_le32(ch->tx_size), &header->total);
	/* link to the previously committed atom's tx head (used by
	   recovery to walk transactions backwards) */
	put_unaligned(cpu_to_le64(get_super_private(ch->super)->last_committed_tx),
		      &header->prev_tx);
	put_unaligned(cpu_to_le64(*jnode_get_block(next)), &header->next_block);

	/* specially logged super block counters (see top-of-file comment) */
	put_unaligned(cpu_to_le64(ch->free_blocks), &header->free_blocks);
	put_unaligned(cpu_to_le64(ch->nr_files), &header->nr_files);
	put_unaligned(cpu_to_le64(ch->next_oid), &header->next_oid);
}
/* Prepare an ordinary (non tx head) wander record block: fill all service
   fields -- magic, total record count, serial number and the link to the
   next record in the circular chain. */
static void
format_wander_record(struct commit_handle *ch, jnode *node, __u32 serial)
{
	struct wander_record_header *LRH;
	jnode *next;

	assert("zam-464", node != NULL);
	LRH = (struct wander_record_header *)jdata(node);

	/* the chain is circular: after the last record it wraps back to the
	   tx head (the first jnode on the tx_list) */
	next = list_entry(node->capture_link.next, jnode, capture_link);
	if (&ch->tx_list == &next->capture_link)
		next = list_entry(ch->tx_list.next, jnode, capture_link);

	assert("zam-465", LRH != NULL);
	assert("zam-463",
	       ch->super->s_blocksize > sizeof(struct wander_record_header));

	/* zero the whole block, then stamp magic and service fields */
	memset(jdata(node), 0, (size_t) ch->super->s_blocksize);
	memcpy(jdata(node), WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE);

	/* on-disk fields are little-endian, potentially unaligned */
	put_unaligned(cpu_to_le32(ch->tx_size), &LRH->total);
	put_unaligned(cpu_to_le32(serial), &LRH->serial);
	put_unaligned(cpu_to_le64(*jnode_get_block(next)), &LRH->next_block);
}
/* Store one wandered-map pair (@a = original location, @b = wandered
   location) at slot @index of the wander record held by @node. */
static void
store_entry(jnode * node, int index, const reiser4_block_nr * a,
	    const reiser4_block_nr * b)
{
	struct wander_entry *table;
	char *data = jdata(node);

	assert("zam-451", data != NULL);

	/* the entry table begins immediately after the record header */
	table = (struct wander_entry *)(data +
					sizeof(struct wander_record_header));
	put_unaligned(cpu_to_le64(*a), &table[index].original);
	put_unaligned(cpu_to_le64(*b), &table[index].wandered);
}
/* Currently wander records contain only the wandered map, so the number of
   wander records depends solely on the overwrite set size. */
static void get_tx_size(struct commit_handle *ch)
{
	assert("zam-440", ch->overwrite_set_size != 0);
	assert("zam-695", ch->tx_size == 0);

	/* count all ordinary wander records:
	   (<overwrite_set_size> - 1) / <wander_record_capacity> + 1, and add
	   one more for the tx head block (hence the "+ 2") */
	ch->tx_size =
	    (ch->overwrite_set_size - 1) / wander_record_capacity(ch->super) +
	    2;
}
/* A special structure used by store_wmap_actor() to carry its state
   between successive callback invocations. */
struct store_wmap_params {
	jnode *cur;	/* jnode of current wander record to fill */
	int idx;	/* next free entry index in the current record */
	int capacity;	/* entries per wander record (wander_record_capacity) */
#if REISER4_DEBUG
	struct list_head *tx_list;	/* head of tx_list, for bounds checks */
#endif
};
/* An actor for use in the blocknr_set_iterator() routine which populates the
   list of pre-formatted wander records with wandered map info.  @a is the
   original block number, @b its wandered location; @data carries the
   store_wmap_params state between calls. */
static int
store_wmap_actor(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
		 const reiser4_block_nr * b, void *data)
{
	struct store_wmap_params *params = data;

	if (params->idx >= params->capacity) {
		/* current wander record is full: take the next one from the
		   tx_list */
		params->cur = list_entry(params->cur->capture_link.next, jnode, capture_link);

		/* fixed a scraping artifact here: "&params" had been mangled
		   into the "pilcrow" entity; get_tx_size() guarantees enough
		   records exist for the whole map */
		assert("zam-454",
		       params->tx_list != &params->cur->capture_link);

		params->idx = 0;
	}

	store_entry(params->cur, params->idx, a, b);
	params->idx++;

	return 0;
}
/* This function is called after the relocate set has been written to disk,
   the overwrite set has been written to wandered locations, and all wander
   records have been written as well.  The updated journal header then holds
   a pointer (block number) to the tx head of the just written transaction;
   writing it atomically completes the commit. */
static int update_journal_header(struct commit_handle *ch)
{
	struct reiser4_super_info_data *sbinfo = get_super_private(ch->super);
	jnode *jh = sbinfo->journal_header;
	jnode *head = list_entry(ch->tx_list.next, jnode, capture_link);
	int ret;

	format_journal_header(ch);

	/* single-block write with preflush/FUA so the commit record reaches
	   stable storage */
	ret = write_jnodes_to_disk_extent(jh, 1, jnode_get_block(jh), NULL,
					  WRITEOUT_FLUSH_FUA);
	if (ret)
		return ret;

	/* blk_run_address_space(sbinfo->fake->i_mapping);
	 * blk_run_queues(); */

	ret = jwait_io(jh, WRITE);

	if (ret)
		return ret;

	/* remember the committed tx head in memory: it becomes "prev_tx"
	   of the next transaction's tx head (see format_tx_head()) */
	sbinfo->last_committed_tx = *jnode_get_block(head);

	return 0;
}
/* Called after write-back of the overwrite set has finished: bring the
   on-disk journal footer up to date.  Once the footer is written, the
   caller may free the blocks that were occupied by wandered blocks and by
   the transaction's wander records. */
static int update_journal_footer(struct commit_handle *ch)
{
	reiser4_super_info_data *sbinfo = get_super_private(ch->super);
	jnode *jf = sbinfo->journal_footer;
	int ret;

	format_journal_footer(ch);

	/* single-block write with preflush/FUA so the footer reaches stable
	   storage */
	ret = write_jnodes_to_disk_extent(jf, 1, jnode_get_block(jf), NULL,
					  WRITEOUT_FLUSH_FUA);
	if (ret != 0)
		return ret;

	/* blk_run_address_space(sbinfo->fake->i_mapping);
	 * blk_run_queue(); */

	return jwait_io(jf, WRITE);
}
/* Free the block numbers used by the wander records of a transaction that
   has already been written in place, and drop the record jnodes. */
static void dealloc_tx_list(struct commit_handle *ch)
{
	while (!list_empty(&ch->tx_list)) {
		jnode *cur = list_entry(ch->tx_list.next, jnode, capture_link);

		list_del(&cur->capture_link);
		ON_DEBUG(INIT_LIST_HEAD(&cur->capture_link));

		/* deferred deallocation: the block becomes reusable only once
		   the corresponding bitmap change is committed */
		reiser4_dealloc_block(jnode_get_block(cur), 0,
				      BA_DEFER | BA_FORMATTED);
		unpin_jnode_data(cur);
		reiser4_drop_io_head(cur);
	}
}
/* An actor for use in the blocknr_set_iterator() routine which frees the
   wandered copies (@b) of the atom's overwrite-set blocks; the original
   locations (@a) are left untouched. */
static int
dealloc_wmap_actor(txn_atom * atom UNUSED_ARG,
		   const reiser4_block_nr * a UNUSED_ARG,
		   const reiser4_block_nr * b, void *data UNUSED_ARG)
{
	assert("zam-499", b != NULL);
	assert("zam-500", *b != 0);
	assert("zam-501", !reiser4_blocknr_is_fake(b));

	/* deferred deallocation, same policy as dealloc_tx_list() */
	reiser4_dealloc_block(b, 0, BA_DEFER | BA_FORMATTED);
	return 0;
}
/* Free the wandered block locations of a transaction that has already been
   written in place, by iterating over the atom's wandered map. */
static void dealloc_wmap(struct commit_handle *ch)
{
	assert("zam-696", ch->atom != NULL);

	/* the trailing argument (1) presumably tells the iterator to consume
	   entries as it goes -- NOTE(review): confirm against
	   blocknr_set_iterator() */
	blocknr_set_iterator(ch->atom, &ch->atom->wandered_map,
			     dealloc_wmap_actor, NULL, 1);
}
/* Helper for alloc_wandered_blocks(): allocate the next extent of disk
   blocks to hold wandered copies.  At most @count blocks are requested; the
   extent's start is returned through *@start and its actual length through
   *@len.  Returns 0 on success or the block allocator's error code. */
static int
get_more_wandered_blocks(int count, reiser4_block_nr * start, int *len)
{
	reiser4_blocknr_hint hint;
	reiser4_block_nr wide_len = count;
	int ret;

	/* FIXME-ZAM: A special policy needed for allocation of wandered blocks
	   ZAM-FIXME-HANS: yes, what happened to our discussion of using a fixed
	   reserved allocation area so as to get the best qualities of fixed
	   journals? */
	reiser4_blocknr_hint_init(&hint);
	hint.block_stage = BLOCK_GRABBED;

	ret = reiser4_alloc_blocks(&hint, start, &wide_len,
				   BA_FORMATTED | BA_USE_DEFAULT_SEARCH_START);
	*len = (int)wide_len;

	return ret;
}
/*
 * Roll back the page/jnode state changes made before issuing a BIO, in the
 * case of an IO error (or when the fs turns out to be read-only): end the
 * writeback state and re-dirty each jnode so it gets written later.
 */
static void undo_bio(struct bio *bio)
{
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	bio_for_each_segment_all(bvec, bio, iter_all) {
		struct page *pg;
		jnode *node;

		pg = bvec->bv_page;
		end_page_writeback(pg);
		node = jprivate(pg);

		/* reverse what write_jnodes_to_disk_extent() did: it set
		   JNODE_WRITEBACK and cleared JNODE_DIRTY under the jnode
		   spin lock */
		spin_lock_jnode(node);
		JF_CLR(node, JNODE_WRITEBACK);
		JF_SET(node, JNODE_DIRTY);
		spin_unlock_jnode(node);
	}
	bio_put(bio);
}
/* put overwrite set back to atom's clean list */
static void put_overwrite_set(struct commit_handle *ch)
{
jnode *cur;
list_for_each_entry(cur, ch->overwrite_set, capture_link)
jrelse_tail(cur);
}
/* Count overwrite set size and grab disk space for wandered block
   allocation.  Since we have a separate list for the atom's overwrite set we
   just scan the list, counting bitmap and other non-leaf nodes for which we
   still have to grab space (leaf nodes were already counted as "flush
   reserved").  Returns the overwrite set size, or a negative error code. */
static int get_overwrite_set(struct commit_handle *ch)
{
	int ret;
	jnode *cur;
	__u64 nr_not_leaves = 0;
#if REISER4_DEBUG
	__u64 nr_formatted_leaves = 0;
	__u64 nr_unformatted_leaves = 0;
#endif

	assert("zam-697", ch->overwrite_set_size == 0);

	ch->overwrite_set = ATOM_OVRWR_LIST(ch->atom);
	cur = list_entry(ch->overwrite_set->next, jnode, capture_link);

	while (ch->overwrite_set != &cur->capture_link) {
		/* take @next now: @cur may be uncaptured/dropped below */
		jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);

		/* Count bitmap locks for getting correct statistics what number
		 * of blocks were cleared by the transaction commit. */
		if (jnode_get_type(cur) == JNODE_BITMAP)
			ch->nr_bitmap++;

		assert("zam-939", JF_ISSET(cur, JNODE_OVRWR)
		       || jnode_get_type(cur) == JNODE_BITMAP);

		if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) {
			/* we replace fake znode by another (real)
			   znode which is suggested by disk_layout
			   plugin */

			/* FIXME: it looks like fake znode should be
			   replaced by jnode supplied by
			   disk_layout. */

			struct super_block *s = reiser4_get_current_sb();
			reiser4_super_info_data *sbinfo =
			    get_current_super_private();

			if (sbinfo->df_plug->log_super) {
				jnode *sj = sbinfo->df_plug->log_super(s);

				assert("zam-593", sj != NULL);

				if (IS_ERR(sj))
					return PTR_ERR(sj);

				spin_lock_jnode(sj);
				JF_SET(sj, JNODE_OVRWR);
				insert_into_atom_ovrwr_list(ch->atom, sj);
				spin_unlock_jnode(sj);

				/* jload it as the rest of overwrite set;
				   NOTE(review): return value is ignored here
				   but panics on failure below -- confirm this
				   asymmetry is intended */
				jload_gfp(sj, reiser4_ctx_gfp_mask_get(), 0);

				ch->overwrite_set_size++;
			}
			/* drop the fake znode from the atom */
			spin_lock_jnode(cur);
			reiser4_uncapture_block(cur);
			jput(cur);

		} else {
			int ret;
			ch->overwrite_set_size++;
			ret = jload_gfp(cur, reiser4_ctx_gfp_mask_get(), 0);
			if (ret)
				reiser4_panic("zam-783",
					      "cannot load e-flushed jnode back (ret = %d)\n",
					      ret);
		}

		/* Count not leaves here because we have to grab disk space
		 * for wandered blocks. They were not counted as "flush
		 * reserved". Counting should be done _after_ nodes are pinned
		 * into memory by jload(). */
		if (!jnode_is_leaf(cur))
			nr_not_leaves++;
		else {
#if REISER4_DEBUG
			/* at this point @cur either has JNODE_FLUSH_RESERVED
			 * or is eflushed. Locking is not strong enough to
			 * write an assertion checking for this. */
			if (jnode_is_znode(cur))
				nr_formatted_leaves++;
			else
				nr_unformatted_leaves++;
#endif
			JF_CLR(cur, JNODE_FLUSH_RESERVED);
		}

		cur = next;
	}

	/* Grab space for writing (wandered blocks) of not leaves found in
	 * overwrite set. */
	ret = reiser4_grab_space_force(nr_not_leaves, BA_RESERVED);
	if (ret)
		return ret;

	/* Disk space for allocation of wandered blocks of leaf nodes already
	 * reserved as "flush reserved", move it to grabbed space counter. */
	spin_lock_atom(ch->atom);
	assert("zam-940",
	       nr_formatted_leaves + nr_unformatted_leaves <=
	       ch->atom->flush_reserved);
	flush_reserved2grabbed(ch->atom, ch->atom->flush_reserved);
	spin_unlock_atom(ch->atom);

	return ch->overwrite_set_size;
}
/**
 * write_jnodes_to_disk_extent - submit write request
 * @first: first jnode of the list
 * @nr: number of jnodes on the list
 * @block_p: starting disk block number of the target extent
 * @fq: flush queue used for i/o completion accounting, may be NULL
 * @flags: WRITEOUT_* flags; used to decide whether page is to get PG_reclaim
 *	   flag and whether to issue preflush/FUA
 *
 * Submits a write request for @nr jnodes beginning from the @first, other
 * jnodes are after the @first on the double-linked "capture" list. All jnodes
 * will be written to the disk region of @nr blocks starting with @block_p block
 * number. If @fq is not NULL it means that waiting for i/o completion will be
 * done more efficiently by using flush_queue_t objects.
 * This function is the one which writes list of jnodes in batch mode. It does
 * all low-level things as bio construction and page states manipulation.
 *
 * Returns 0 on success or a negative error code (-ENOMEM if a bio cannot be
 * allocated).
 *
 * ZAM-FIXME-HANS: brief me on why this function exists, and why bios are
 * aggregated in this function instead of being left to the layers below
 *
 * FIXME: ZAM->HANS: What layer are you talking about? Can you point me to that?
 * Why that layer needed? Why BIOs cannot be constructed here?
 */
static int write_jnodes_to_disk_extent(
	jnode *first, int nr, const reiser4_block_nr *block_p,
	flush_queue_t *fq, int flags)
{
	struct super_block *super = reiser4_get_current_sb();
	int op_flags = (flags & WRITEOUT_FLUSH_FUA) ? REQ_PREFLUSH | REQ_FUA : 0;
	jnode *cur = first;
	reiser4_block_nr block;

	assert("zam-571", first != NULL);
	assert("zam-572", block_p != NULL);
	assert("zam-570", nr > 0);

	block = *block_p;

	/* the extent may need several bios; each iteration builds and
	   submits one */
	while (nr > 0) {
		struct bio *bio;
		int nr_blocks = bio_max_segs(nr);
		int i;
		int nr_used;

		bio = bio_alloc(GFP_NOIO, nr_blocks);
		if (!bio)
			return RETERR(-ENOMEM);

		bio_set_dev(bio, super->s_bdev);
		/* convert fs block number to a 512-byte sector number */
		bio->bi_iter.bi_sector = block * (super->s_blocksize >> 9);
		for (nr_used = 0, i = 0; i < nr_blocks; i++) {
			struct page *pg;

			pg = jnode_page(cur);
			assert("zam-573", pg != NULL);

			get_page(pg);
			lock_and_wait_page_writeback(pg);

			if (!bio_add_page(bio, pg, super->s_blocksize, 0)) {
				/*
				 * underlying device is satiated. Stop adding
				 * pages to the bio.
				 */
				unlock_page(pg);
				put_page(pg);
				break;
			}

			/* mark the jnode as under writeback and clean; on
			   error undo_bio() reverses exactly this */
			spin_lock_jnode(cur);
			assert("nikita-3166",
			       pg->mapping == jnode_get_mapping(cur));
			assert("zam-912", !JF_ISSET(cur, JNODE_WRITEBACK));
#if REISER4_DEBUG
			spin_lock(&cur->load);
			assert("nikita-3165", !jnode_is_releasable(cur));
			spin_unlock(&cur->load);
#endif
			JF_SET(cur, JNODE_WRITEBACK);
			JF_CLR(cur, JNODE_DIRTY);
			ON_DEBUG(cur->written++);

			assert("edward-1647",
			       ergo(jnode_is_znode(cur), JF_ISSET(cur, JNODE_PARSED)));
			spin_unlock_jnode(cur);
			/*
			 * update checksum
			 */
			if (jnode_is_znode(cur)) {
				zload(JZNODE(cur));
				if (node_plugin_by_node(JZNODE(cur))->csum)
					node_plugin_by_node(JZNODE(cur))->csum(JZNODE(cur), 0);
				zrelse(JZNODE(cur));
			}
			ClearPageError(pg);
			set_page_writeback(pg);

			if (get_current_context()->entd) {
				/* this is ent thread */
				entd_context *ent = get_entd_context(super);
				struct wbq *rq, *next;

				spin_lock(&ent->guard);

				if (pg == ent->cur_request->page) {
					/*
					 * entd is called for this page. This
					 * request is not in the todo list
					 */
					ent->cur_request->written = 1;
				} else {
					/*
					 * if we have written a page for which writepage
					 * is called for - move request to another list.
					 */
					list_for_each_entry_safe(rq, next, &ent->todo_list, link) {
						assert("", rq->magic == WBQ_MAGIC);
						if (pg == rq->page) {
							/*
							 * remove request from
							 * entd's queue, but do
							 * not wake up a thread
							 * which put this
							 * request
							 */
							list_del_init(&rq->link);
							ent->nr_todo_reqs --;
							list_add_tail(&rq->link, &ent->done_list);
							ent->nr_done_reqs ++;
							rq->written = 1;
							break;
						}
					}
				}
				spin_unlock(&ent->guard);
			}

			clear_page_dirty_for_io(pg);

			unlock_page(pg);

			cur = list_entry(cur->capture_link.next, jnode, capture_link);
			nr_used++;
		}
		if (nr_used > 0) {
			assert("nikita-3453",
			       bio->bi_iter.bi_size == super->s_blocksize * nr_used);

			/* Check if we are allowed to write at all */
			if (sb_rdonly(super))
				undo_bio(bio);
			else {
				add_fq_to_bio(fq, bio);
				bio_get(bio);
				bio_set_op_attrs(bio, WRITE, op_flags);
				submit_bio(bio);
				bio_put(bio);
			}

			/* remember the end of the written extent as the
			   default block allocation hint */
			block += nr_used - 1;
			update_blocknr_hint_default(super, &block);
			block += 1;
		} else {
			bio_put(bio);
		}
		nr -= nr_used;
	}

	return 0;
}
/* This is a procedure which recovers contiguous sequences of disk block
   numbers in the given list of jnodes and submits one write request per
   contiguous extent.  @nr_submitted, if non-NULL, is incremented by the
   number of jnodes submitted. */
int
write_jnode_list(struct list_head *head, flush_queue_t *fq,
		 long *nr_submitted, int flags)
{
	int ret;
	jnode *beg = list_entry(head->next, jnode, capture_link);

	while (head != &beg->capture_link) {
		int nr = 1;
		jnode *cur = list_entry(beg->capture_link.next, jnode, capture_link);

		/* grow the run while the next jnode's block number is exactly
		   one past the current extent's end */
		while (head != &cur->capture_link) {
			if (*jnode_get_block(cur) != *jnode_get_block(beg) + nr)
				break;
			++nr;
			cur = list_entry(cur->capture_link.next, jnode, capture_link);
		}

		ret = write_jnodes_to_disk_extent(
			beg, nr, jnode_get_block(beg), fq, flags);
		if (ret)
			return ret;

		if (nr_submitted)
			*nr_submitted += nr;

		beg = cur;
	}

	return 0;
}
/* Add the wandered mappings for a contiguous region to the atom's wandered
   map: jnodes starting at @cur (following capture links) are mapped to @len
   consecutive wandered blocks beginning at *@block_p. */
static int
add_region_to_wmap(jnode * cur, int len, const reiser4_block_nr * block_p)
{
	int ret;
	blocknr_set_entry *new_bsep = NULL;
	reiser4_block_nr block;

	txn_atom *atom;

	assert("zam-568", block_p != NULL);
	block = *block_p;
	assert("zam-569", len > 0);

	while ((len--) > 0) {
		/* -E_REPEAT means the atom lock was dropped (e.g. to allocate
		   a new blocknr_set_entry); re-take the atom and retry */
		do {
			atom = get_current_atom_locked();
			assert("zam-536",
			       !reiser4_blocknr_is_fake(jnode_get_block(cur)));
			ret =
			    blocknr_set_add_pair(atom, &atom->wandered_map,
						 &new_bsep,
						 jnode_get_block(cur), &block);
		} while (ret == -E_REPEAT);

		if (ret) {
			/* deallocate blocks which were not added to wandered
			   map */
			reiser4_block_nr wide_len = len;

			reiser4_dealloc_blocks(&block, &wide_len,
					       BLOCK_NOT_COUNTED,
					       BA_FORMATTED
					       /* formatted, without defer */ );
			return ret;
		}

		spin_unlock_atom(atom);

		cur = list_entry(cur->capture_link.next, jnode, capture_link);
		++block;
	}

	return 0;
}
/* Allocate wandered blocks for current atom's OVERWRITE SET and immediately
   submit IO for allocated blocks. We assume that current atom is in a stage
   when any atom fusion is impossible and atom is unlocked and it is safe. */
static int alloc_wandered_blocks(struct commit_handle *ch, flush_queue_t *fq)
{
	reiser4_block_nr block;

	int rest;
	int len;
	int ret;

	jnode *cur;

	assert("zam-534", ch->overwrite_set_size > 0);

	rest = ch->overwrite_set_size;

	cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
	while (ch->overwrite_set != &cur->capture_link) {
		assert("zam-567", JF_ISSET(cur, JNODE_OVRWR));

		/* grab the next contiguous extent of wandered blocks; @len
		   comes back as the number actually allocated */
		ret = get_more_wandered_blocks(rest, &block, &len);
		if (ret)
			return ret;

		rest -= len;

		/* record original -> wandered mapping for the next @len
		   jnodes */
		ret = add_region_to_wmap(cur, len, &block);
		if (ret)
			return ret;

		/* submit writes of those jnodes to the wandered extent */
		ret = write_jnodes_to_disk_extent(cur, len, &block, fq, 0);
		if (ret)
			return ret;

		/* advance @cur past the jnodes just handled */
		while ((len--) > 0) {
			assert("zam-604",
			       ch->overwrite_set != &cur->capture_link);
			cur = list_entry(cur->capture_link.next, jnode, capture_link);
		}
	}

	return 0;
}
/* allocate given number of nodes over the journal area and link them into a
list, return pointer to the first jnode in the list */
static int alloc_tx(struct commit_handle *ch, flush_queue_t * fq)
{
reiser4_blocknr_hint hint;
reiser4_block_nr allocated = 0;
reiser4_block_nr first, len;
jnode *cur;
jnode *txhead;
int ret;
reiser4_context *ctx;
reiser4_super_info_data *sbinfo;
assert("zam-698", ch->tx_size > 0);
assert("zam-699", list_empty_careful(&ch->tx_list));
ctx = get_current_context();
sbinfo = get_super_private(ctx->super);