@@ -849,6 +849,98 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
	return ret;
}

+static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages,
+				struct io_imu_folio_data *data, int nr_folios)
+{
+	struct page **page_array = *pages, **new_array = NULL;
+	int nr_pages_left = *nr_pages, i, j;
+
+	/* Store head pages only */
+	new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
+					GFP_KERNEL);
+	if (!new_array)
+		return false;
+
+	new_array[0] = compound_head(page_array[0]);
+	/*
+	 * The pages are bound to the folio, it doesn't
+	 * actually unpin them but drops all but one reference,
+	 * which is usually put down by io_buffer_unmap().
+	 * Note, needs a better helper.
+	 */
+	if (data->nr_pages_head > 1)
+		unpin_user_pages(&page_array[1], data->nr_pages_head - 1);
+
+	j = data->nr_pages_head;
+	nr_pages_left -= data->nr_pages_head;
+	for (i = 1; i < nr_folios; i++) {
+		unsigned int nr_unpin;
+
+		new_array[i] = page_array[j];
+		nr_unpin = min_t(unsigned int, nr_pages_left - 1,
+					data->nr_pages_mid - 1);
+		if (nr_unpin)
+			unpin_user_pages(&page_array[j + 1], nr_unpin);
+		j += data->nr_pages_mid;
+		nr_pages_left -= data->nr_pages_mid;
+	}
+	kvfree(page_array);
+	*pages = new_array;
+	*nr_pages = nr_folios;
+	return true;
+}
+
+static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
+				struct io_imu_folio_data *data)
+{
+	struct page **page_array = *pages;
+	struct folio *folio = page_folio(page_array[0]);
+	unsigned int count = 1, nr_folios = 1;
+	int i;
+
+	if (*nr_pages <= 1)
+		return false;
+
+	data->nr_pages_mid = folio_nr_pages(folio);
+	if (data->nr_pages_mid == 1)
+		return false;
+
+	data->folio_shift = folio_shift(folio);
+	/*
+	 * Check if pages are contiguous inside a folio, and all folios have
+	 * the same page count except for the head and tail.
+	 */
+	for (i = 1; i < *nr_pages; i++) {
+		if (page_folio(page_array[i]) == folio &&
+		    page_array[i] == page_array[i - 1] + 1) {
+			count++;
+			continue;
+		}
+
+		if (nr_folios == 1) {
+			if (folio_page_idx(folio, page_array[i - 1]) !=
+			    data->nr_pages_mid - 1)
+				return false;
+
+			data->nr_pages_head = count;
+		} else if (count != data->nr_pages_mid) {
+			return false;
+		}
+
+		folio = page_folio(page_array[i]);
+		if (folio_size(folio) != (1UL << data->folio_shift) ||
+		    folio_page_idx(folio, page_array[i]) != 0)
+			return false;
+
+		count = 1;
+		nr_folios++;
+	}
+	if (nr_folios == 1)
+		data->nr_pages_head = count;
+
+	return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios);
+}
+

static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage)
@@ -858,7 +950,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
-	struct folio *folio = NULL;
+	struct io_imu_folio_data data;
+	bool coalesced;

	*pimu = (struct io_mapped_ubuf *)&dummy_ubuf;
	if (!iov->iov_base)
@@ -873,31 +966,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
		goto done;
	}

-	/* If it's a huge page, try to coalesce them into a single bvec entry */
-	if (nr_pages > 1) {
-		folio = page_folio(pages[0]);
-		for (i = 1; i < nr_pages; i++) {
-			/*
-			 * Pages must be consecutive and on the same folio for
-			 * this to work
-			 */
-			if (page_folio(pages[i]) != folio ||
-			    pages[i] != pages[i - 1] + 1) {
-				folio = NULL;
-				break;
-			}
-		}
-		if (folio) {
-			/*
-			 * The pages are bound to the folio, it doesn't
-			 * actually unpin them but drops all but one reference,
-			 * which is usually put down by io_buffer_unmap().
-			 * Note, needs a better helper.
-			 */
-			unpin_user_pages(&pages[1], nr_pages - 1);
-			nr_pages = 1;
-		}
-	}
+	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
+	coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data);

	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
	if (!imu)
@@ -909,25 +979,25 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
		goto done;
	}

-	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->ubuf_end = imu->ubuf + iov->iov_len;
	imu->nr_bvecs = nr_pages;
	imu->folio_shift = PAGE_SHIFT;
	imu->folio_mask = PAGE_MASK;
+	if (coalesced) {
+		imu->folio_shift = data.folio_shift;
+		imu->folio_mask = ~((1UL << data.folio_shift) - 1);
+	}
+	off = (unsigned long) iov->iov_base & ~imu->folio_mask;
	*pimu = imu;
	ret = 0;

-	if (folio) {
-		bvec_set_page(&imu->bvec[0], pages[0], size, off);
-		goto done;
-	}
	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

-		vec_len = min_t(size_t, size, PAGE_SIZE - off);
+		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
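Note: both new helpers take a struct io_imu_folio_data, which is defined outside the hunks shown here. As a rough sketch only, inferred from how the fields are used in io_try_coalesce_buffer() and io_do_coalesce_buffer() (the comments and exact layout below are assumptions, not taken from the patch itself):

/* Sketch, inferred from usage in the hunks above; not the patch's definition. */
struct io_imu_folio_data {
	/* head folio may be only partially covered by the registered buffer */
	unsigned int	nr_pages_head;
	/* pages per non-head folio; these folios must be fully covered */
	unsigned int	nr_pages_mid;
	/* log2 of the folio size, copied into imu->folio_shift on success */
	unsigned int	folio_shift;
};

With a layout along those lines, the register path derives imu->folio_mask as ~((1UL << data.folio_shift) - 1) when coalescing succeeds, so the initial offset and each bvec length are computed per folio rather than per PAGE_SIZE page.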