
Commit 225f81b

puranjaymohan authored and Kernel Patches Daemon committed
bpf: arena: use kmalloc_nolock() in place of kvcalloc()
To make arena_alloc_pages() safe to call from any context, replace kvcalloc() with kmalloc_nolock(), which neither sleeps nor takes any locks. kmalloc_nolock() returns NULL for allocations larger than KMALLOC_MAX_CACHE_SIZE, which is (PAGE_SIZE * 2) = 8KB on systems with 4KB pages. So, cap the allocation done by kmalloc_nolock() at 1024 * 8 bytes and reuse the array in a loop.

Signed-off-by: Puranjay Mohan <[email protected]>
1 parent 6204c37 commit 225f81b
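For reference, the cap works out to KMALLOC_MAX_CACHE_SIZE / sizeof(struct page *) = 8192 / 8 = 1024 page pointers per batch on a 64-bit kernel with 4KB pages. The userspace sketch below only illustrates that cap-and-reuse pattern under those assumptions; it is not kernel code, and the names MAX_ALLOC_BYTES and process_batch, as well as the value of total, are hypothetical.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Illustrative stand-in for KMALLOC_MAX_CACHE_SIZE on a 4KB-page system. */
#define MAX_ALLOC_BYTES 8192UL

/* Hypothetical per-batch step; the real code allocates and maps this_batch pages here. */
static void process_batch(void **slots, long n)
{
	(void)slots;
	printf("processing a batch of %ld entries\n", n);
}

int main(void)
{
	long total = 5000;                             /* stand-in for page_cnt */
	long cap = MAX_ALLOC_BYTES / sizeof(void *);   /* 1024 pointers on 64-bit */
	long batch = total < cap ? total : cap;
	void **slots = malloc((size_t)batch * sizeof(void *));
	long remaining = total;

	if (!slots)
		return 1;

	/* Reuse one bounded array for every batch, zeroing it each time,
	 * mirroring the memset() + loop structure in the patch. */
	while (remaining) {
		long this_batch = remaining < batch ? remaining : batch;

		memset(slots, 0, (size_t)this_batch * sizeof(void *));
		process_batch(slots, this_batch);
		remaining -= this_batch;
	}

	free(slots);
	return 0;
}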

File tree

1 file changed: +57 -26 lines changed


kernel/bpf/arena.c

Lines changed: 57 additions & 26 deletions
@@ -43,6 +43,8 @@
 #define GUARD_SZ round_up(1ull << sizeof_field(struct bpf_insn, off) * 8, PAGE_SIZE << 1)
 #define KERN_VM_SZ (SZ_4G + GUARD_SZ)
 
+static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt);
+
 struct bpf_arena {
 	struct bpf_map map;
 	u64 user_vm_start;
@@ -492,7 +494,10 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
 	/* user_vm_end/start are fixed before bpf prog runs */
 	long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
 	u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena);
+	struct apply_range_data data;
 	struct page **pages = NULL;
+	long remaining, mapped = 0;
+	long alloc_pages;
 	long pgoff = 0;
 	u32 uaddr32;
 	int ret, i;
@@ -509,52 +514,78 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
 		return 0;
 	}
 
-	/* zeroing is needed, since alloc_pages_bulk() only fills in non-zero entries */
-	pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL);
+	/*
+	 * Cap allocation size to KMALLOC_MAX_CACHE_SIZE so kmalloc_nolock() can succeed.
+	 */
+	alloc_pages = min(page_cnt, KMALLOC_MAX_CACHE_SIZE / sizeof(struct page *));
+	pages = kmalloc_nolock(alloc_pages * sizeof(struct page *), 0, NUMA_NO_NODE);
 	if (!pages)
 		return 0;
+	data.pages = pages;
 
-	guard(mutex)(&arena->lock);
+	mutex_lock(&arena->lock);
 
 	if (uaddr) {
 		ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
 		if (ret)
-			goto out_free_pages;
+			goto out_unlock_free_pages;
 		ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
 	} else {
 		ret = pgoff = range_tree_find(&arena->rt, page_cnt);
 		if (pgoff >= 0)
 			ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
 	}
 	if (ret)
-		goto out_free_pages;
-
-	struct apply_range_data data = { .pages = pages, .i = 0 };
-	ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages);
-	if (ret)
-		goto out;
+		goto out_unlock_free_pages;
 
+	remaining = page_cnt;
 	uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE);
-	/* Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1
-	 * will not overflow 32-bit. Lower 32-bit need to represent
-	 * contiguous user address range.
-	 * Map these pages at kern_vm_start base.
-	 * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow
-	 * lower 32-bit and it's ok.
-	 */
-	ret = apply_to_page_range(&init_mm, kern_vm_start + uaddr32,
-				  page_cnt << PAGE_SHIFT, apply_range_set_cb, &data);
-	if (ret) {
-		for (i = 0; i < page_cnt; i++)
-			__free_page(pages[i]);
-		goto out;
+
+	while (remaining) {
+		long this_batch = min(remaining, alloc_pages);
+
+		/* zeroing is needed, since alloc_pages_bulk() only fills in non-zero entries */
+		memset(pages, 0, this_batch * sizeof(struct page *));
+		data.i = 0;
+
+		ret = bpf_map_alloc_pages(&arena->map, node_id, this_batch, pages);
+		if (ret)
+			goto out;
+
+		/* Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1
+		 * will not overflow 32-bit. Lower 32-bit need to represent
+		 * contiguous user address range.
+		 * Map these pages at kern_vm_start base.
+		 * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow
+		 * lower 32-bit and it's ok.
+		 */
+		ret = apply_to_page_range(&init_mm,
+					  kern_vm_start + uaddr32 + (mapped << PAGE_SHIFT),
+					  this_batch << PAGE_SHIFT, apply_range_set_cb, &data);
+		if (ret) {
+			/* data.i pages were mapped, account them and free the remaining */
+			mapped += data.i;
+			for (i = data.i; i < this_batch; i++)
+				__free_page(pages[i]);
+			goto out;
+		}
+
+		mapped += this_batch;
+		remaining -= this_batch;
 	}
-	kvfree(pages);
+	mutex_unlock(&arena->lock);
+	kfree_nolock(pages);
 	return clear_lo32(arena->user_vm_start) + uaddr32;
 out:
-	range_tree_set(&arena->rt, pgoff, page_cnt);
+	range_tree_set(&arena->rt, pgoff + mapped, page_cnt - mapped);
+	mutex_unlock(&arena->lock);
+	if (mapped)
+		arena_free_pages(arena, clear_lo32(arena->user_vm_start) + uaddr32, mapped);
+	goto out_free_pages;
+out_unlock_free_pages:
+	mutex_unlock(&arena->lock);
 out_free_pages:
-	kvfree(pages);
+	kfree_nolock(pages);
 	return 0;
 }
 
