From 3b28147404b8a0242f1c676de68e5863195fef67 Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:42 -0600 Subject: [PATCH 01/53] ntsync: Return the fd from NTSYNC_IOC_CREATE_SEM. Simplify the user API a bit by returning the fd as return value from the ioctl instead of through the argument pointer. Signed-off-by: Elizabeth Figura --- drivers/misc/ntsync.c | 7 ++----- include/uapi/linux/ntsync.h | 3 +-- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c index 4954553b7baa6f..2e7f698268c1f9 100644 --- a/drivers/misc/ntsync.c +++ b/drivers/misc/ntsync.c @@ -165,7 +165,6 @@ static int ntsync_obj_get_fd(struct ntsync_obj *obj) static int ntsync_create_sem(struct ntsync_device *dev, void __user *argp) { - struct ntsync_sem_args __user *user_args = argp; struct ntsync_sem_args args; struct ntsync_obj *sem; int fd; @@ -182,12 +181,10 @@ static int ntsync_create_sem(struct ntsync_device *dev, void __user *argp) sem->u.sem.count = args.count; sem->u.sem.max = args.max; fd = ntsync_obj_get_fd(sem); - if (fd < 0) { + if (fd < 0) kfree(sem); - return fd; - } - return put_user(fd, &user_args->sem); + return fd; } static int ntsync_char_open(struct inode *inode, struct file *file) diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index dcfa38fdc93c61..27d8cb3dd5b7c8 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -11,12 +11,11 @@ #include <linux/types.h> struct ntsync_sem_args { - __u32 sem; __u32 count; __u32 max; }; -#define NTSYNC_IOC_CREATE_SEM _IOWR('N', 0x80, struct ntsync_sem_args) +#define NTSYNC_IOC_CREATE_SEM _IOW ('N', 0x80, struct ntsync_sem_args) #define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32) From 2a00cf5a5608f84a9e9ab1a3b6182d9e111765c2 Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:43 -0600 Subject: [PATCH 02/53] ntsync: Rename NTSYNC_IOC_SEM_POST to NTSYNC_IOC_SEM_RELEASE. Use the more common "release" terminology, which is also the term used by NT, instead of "post" (which is used by POSIX). Signed-off-by: Elizabeth Figura --- drivers/misc/ntsync.c | 10 +++++----- include/uapi/linux/ntsync.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c index 2e7f698268c1f9..cb3a3bd97ba0cb 100644 --- a/drivers/misc/ntsync.c +++ b/drivers/misc/ntsync.c @@ -57,7 +57,7 @@ struct ntsync_device { * Actually change the semaphore state, returning -EOVERFLOW if it is made * invalid.
*/ -static int post_sem_state(struct ntsync_obj *sem, __u32 count) +static int release_sem_state(struct ntsync_obj *sem, __u32 count) { __u32 sum; @@ -71,7 +71,7 @@ static int post_sem_state(struct ntsync_obj *sem, __u32 count) return 0; } -static int ntsync_sem_post(struct ntsync_obj *sem, void __user *argp) +static int ntsync_sem_release(struct ntsync_obj *sem, void __user *argp) { __u32 __user *user_args = argp; __u32 prev_count; @@ -87,7 +87,7 @@ static int ntsync_sem_post(struct ntsync_obj *sem, void __user *argp) spin_lock(&sem->lock); prev_count = sem->u.sem.count; - ret = post_sem_state(sem, args); + ret = release_sem_state(sem, args); spin_unlock(&sem->lock); @@ -114,8 +114,8 @@ static long ntsync_obj_ioctl(struct file *file, unsigned int cmd, void __user *argp = (void __user *)parm; switch (cmd) { - case NTSYNC_IOC_SEM_POST: - return ntsync_sem_post(obj, argp); + case NTSYNC_IOC_SEM_RELEASE: + return ntsync_sem_release(obj, argp); default: return -ENOIOCTLCMD; } diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index 27d8cb3dd5b7c8..9af9d8125553d3 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -17,6 +17,6 @@ struct ntsync_sem_args { #define NTSYNC_IOC_CREATE_SEM _IOW ('N', 0x80, struct ntsync_sem_args) -#define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32) +#define NTSYNC_IOC_SEM_RELEASE _IOWR('N', 0x81, __u32) #endif From 7b1d918f56c792335f24d155a05a2ce93adb756e Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:44 -0600 Subject: [PATCH 03/53] ntsync: Introduce NTSYNC_IOC_WAIT_ANY. This corresponds to part of the functionality of the NT syscall NtWaitForMultipleObjects(). Specifically, it implements the behaviour where the third argument (wait_any) is TRUE, and it does not handle alertable waits. Those features have been split out into separate patches to ease review. This patch therefore implements the wait/wake infrastructure which comprises the core of ntsync's functionality. NTSYNC_IOC_WAIT_ANY is a vectored wait function similar to poll(). Unlike poll(), it "consumes" objects when they are signaled. For semaphores, this means subtracting one from the internal counter. At most one object can be consumed by this function. This wait/wake model is fundamentally different from that used anywhere else in the kernel, and for that reason ntsync does not use any existing infrastructure, such as futexes, kernel mutexes or semaphores, or wait_event(). Up to 64 objects can be waited on at once. As soon as one is signaled, the object with the lowest index is consumed, and that index is returned via the "index" field. A timeout is supported. The timeout is passed as a u64 nanosecond value, which represents absolute time measured against either the MONOTONIC or REALTIME clock (controlled by the flags argument). If U64_MAX is passed, the ioctl waits indefinitely. This ioctl validates that all objects belong to the relevant device. This is not necessary for any technical reason related to NTSYNC_IOC_WAIT_ANY, but will be necessary for NTSYNC_IOC_WAIT_ALL introduced in the following patch. Some padding fields are added for alignment and for fields which will be added in future patches (split out to ease review).
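For illustration, user space might drive this interface roughly as follows. This is a minimal sketch against the uapi as of this patch, assuming the driver's device node appears as /dev/ntsync (the node name is not shown in this excerpt); error handling is elided:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/ntsync.h>

int main(void)
{
	int dev = open("/dev/ntsync", O_RDONLY);
	struct ntsync_sem_args sem_args = { .count = 1, .max = 2 };
	/* Since patch 01, the object fd is the ioctl's return value. */
	int sem = ioctl(dev, NTSYNC_IOC_CREATE_SEM, &sem_args);
	int fds[1] = { sem };
	struct ntsync_wait_args wait_args = {
		.timeout = UINT64_MAX,		/* wait indefinitely */
		.objs = (uintptr_t)fds,		/* u64-encoded pointer to the fd array */
		.count = 1,
		.flags = 0,			/* absolute timeout against CLOCK_MONOTONIC */
	};

	/* On success, one count is consumed from the semaphore and the
	 * index of the first signaled object is written to wait_args.index. */
	if (ioctl(dev, NTSYNC_IOC_WAIT_ANY, &wait_args) == 0)
		printf("consumed object %u\n", wait_args.index);
	return 0;
}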
Signed-off-by: Elizabeth Figura --- drivers/misc/ntsync.c | 247 +++++++++++++++++++++++++++++++++++- include/uapi/linux/ntsync.h | 14 ++ 2 files changed, 260 insertions(+), 1 deletion(-) diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c index cb3a3bd97ba0cb..900cc5ce5761f3 100644 --- a/drivers/misc/ntsync.c +++ b/drivers/misc/ntsync.c @@ -6,11 +6,16 @@ */ #include <linux/anon_inodes.h> +#include <linux/atomic.h> #include <linux/file.h> #include <linux/fs.h> +#include <linux/hrtimer.h> +#include <linux/ktime.h> #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/overflow.h> +#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <uapi/linux/ntsync.h> @@ -30,6 +35,8 @@ enum ntsync_type { * * Both rely on struct file for reference counting. Individual * ntsync_obj objects take a reference to the device when created. + * Wait operations take a reference to each object being waited on for + * the duration of the wait. */ struct ntsync_obj { @@ -47,12 +54,55 @@ struct ntsync_obj { __u32 max; } sem; } u; + + struct list_head any_waiters; +}; + +struct ntsync_q_entry { + struct list_head node; + struct ntsync_q *q; + struct ntsync_obj *obj; + __u32 index; +}; + +struct ntsync_q { + struct task_struct *task; + + /* + * Protected via atomic_try_cmpxchg(). Only the thread that wins the + * compare-and-swap may actually change object states and wake this + * task. + */ + atomic_t signaled; + + __u32 count; + struct ntsync_q_entry entries[]; }; struct ntsync_device { struct file *file; }; +static void try_wake_any_sem(struct ntsync_obj *sem) +{ + struct ntsync_q_entry *entry; + + lockdep_assert_held(&sem->lock); + + list_for_each_entry(entry, &sem->any_waiters, node) { + struct ntsync_q *q = entry->q; + int signaled = -1; + + if (!sem->u.sem.count) + break; + + if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) { + sem->u.sem.count--; + wake_up_process(q->task); + } + } +} + /* * Actually change the semaphore state, returning -EOVERFLOW if it is made * invalid. @@ -87,7 +137,9 @@ static int ntsync_sem_release(struct ntsync_obj *sem, void __user *argp) spin_lock(&sem->lock); prev_count = sem->u.sem.count; ret = release_sem_state(sem, args); + if (!ret) + try_wake_any_sem(sem); spin_unlock(&sem->lock); @@ -140,6 +192,7 @@ static struct ntsync_obj *ntsync_alloc_obj(struct ntsync_device *dev, obj->dev = dev; get_file(dev->file); spin_lock_init(&obj->lock); + INIT_LIST_HEAD(&obj->any_waiters); return obj; } @@ -187,6 +240,196 @@ static int ntsync_create_sem(struct ntsync_device *dev, void __user *argp) return fd; } +static struct ntsync_obj *get_obj(struct ntsync_device *dev, int fd) +{ + struct file *file = fget(fd); + struct ntsync_obj *obj; + + if (!file) + return NULL; + + if (file->f_op != &ntsync_obj_fops) { + fput(file); + return NULL; + } + + obj = file->private_data; + if (obj->dev != dev) { + fput(file); + return NULL; + } + + return obj; +} + +static void put_obj(struct ntsync_obj *obj) +{ + fput(obj->file); +} + +static int ntsync_schedule(const struct ntsync_q *q, const struct ntsync_wait_args *args) +{ + ktime_t timeout = ns_to_ktime(args->timeout); + clockid_t clock = CLOCK_MONOTONIC; + ktime_t *timeout_ptr; + int ret = 0; + + timeout_ptr = (args->timeout == U64_MAX ?
NULL : &timeout); + + if (args->flags & NTSYNC_WAIT_REALTIME) + clock = CLOCK_REALTIME; + + do { + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + set_current_state(TASK_INTERRUPTIBLE); + if (atomic_read(&q->signaled) != -1) { + ret = 0; + break; + } + ret = schedule_hrtimeout_range_clock(timeout_ptr, 0, HRTIMER_MODE_ABS, clock); + } while (ret < 0); + __set_current_state(TASK_RUNNING); + + return ret; +} + +/* + * Allocate and initialize the ntsync_q structure, but do not queue us yet. + */ +static int setup_wait(struct ntsync_device *dev, + const struct ntsync_wait_args *args, + struct ntsync_q **ret_q) +{ + const __u32 count = args->count; + int fds[NTSYNC_MAX_WAIT_COUNT]; + struct ntsync_q *q; + __u32 i, j; + + if (args->pad[0] || args->pad[1] || args->pad[2] || (args->flags & ~NTSYNC_WAIT_REALTIME)) + return -EINVAL; + + if (args->count > NTSYNC_MAX_WAIT_COUNT) + return -EINVAL; + + if (copy_from_user(fds, u64_to_user_ptr(args->objs), + array_size(count, sizeof(*fds)))) + return -EFAULT; + + q = kmalloc(struct_size(q, entries, count), GFP_KERNEL); + if (!q) + return -ENOMEM; + q->task = current; + atomic_set(&q->signaled, -1); + q->count = count; + + for (i = 0; i < count; i++) { + struct ntsync_q_entry *entry = &q->entries[i]; + struct ntsync_obj *obj = get_obj(dev, fds[i]); + + if (!obj) + goto err; + + entry->obj = obj; + entry->q = q; + entry->index = i; + } + + *ret_q = q; + return 0; + +err: + for (j = 0; j < i; j++) + put_obj(q->entries[j].obj); + kfree(q); + return -EINVAL; +} + +static void try_wake_any_obj(struct ntsync_obj *obj) +{ + switch (obj->type) { + case NTSYNC_TYPE_SEM: + try_wake_any_sem(obj); + break; + } +} + +static int ntsync_wait_any(struct ntsync_device *dev, void __user *argp) +{ + struct ntsync_wait_args args; + struct ntsync_q *q; + int signaled; + __u32 i; + int ret; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + ret = setup_wait(dev, &args, &q); + if (ret < 0) + return ret; + + /* queue ourselves */ + + for (i = 0; i < args.count; i++) { + struct ntsync_q_entry *entry = &q->entries[i]; + struct ntsync_obj *obj = entry->obj; + + spin_lock(&obj->lock); + list_add_tail(&entry->node, &obj->any_waiters); + spin_unlock(&obj->lock); + } + + /* check if we are already signaled */ + + for (i = 0; i < args.count; i++) { + struct ntsync_obj *obj = q->entries[i].obj; + + if (atomic_read(&q->signaled) != -1) + break; + + spin_lock(&obj->lock); + try_wake_any_obj(obj); + spin_unlock(&obj->lock); + } + + /* sleep */ + + ret = ntsync_schedule(q, &args); + + /* and finally, unqueue */ + + for (i = 0; i < args.count; i++) { + struct ntsync_q_entry *entry = &q->entries[i]; + struct ntsync_obj *obj = entry->obj; + + spin_lock(&obj->lock); + list_del(&entry->node); + spin_unlock(&obj->lock); + + put_obj(obj); + } + + signaled = atomic_read(&q->signaled); + if (signaled != -1) { + struct ntsync_wait_args __user *user_args = argp; + + /* even if we caught a signal, we need to communicate success */ + ret = 0; + + if (put_user(signaled, &user_args->index)) + ret = -EFAULT; + } else if (!ret) { + ret = -ETIMEDOUT; + } + + kfree(q); + return ret; +} + static int ntsync_char_open(struct inode *inode, struct file *file) { struct ntsync_device *dev; @@ -218,6 +461,8 @@ static long ntsync_char_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case NTSYNC_IOC_CREATE_SEM: return ntsync_create_sem(dev, argp); + case NTSYNC_IOC_WAIT_ANY: + return ntsync_wait_any(dev, argp); default: return -ENOIOCTLCMD; } diff --git 
a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index 9af9d8125553d3..40ffdc41d5bb31 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -15,7 +15,21 @@ struct ntsync_sem_args { __u32 max; }; +#define NTSYNC_WAIT_REALTIME 0x1 + +struct ntsync_wait_args { + __u64 timeout; + __u64 objs; + __u32 count; + __u32 index; + __u32 flags; + __u32 pad[3]; +}; + +#define NTSYNC_MAX_WAIT_COUNT 64 + #define NTSYNC_IOC_CREATE_SEM _IOW ('N', 0x80, struct ntsync_sem_args) +#define NTSYNC_IOC_WAIT_ANY _IOWR('N', 0x82, struct ntsync_wait_args) #define NTSYNC_IOC_SEM_RELEASE _IOWR('N', 0x81, __u32) From 1a3a482e7d90e60f5848b6997368b3ca711f7213 Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:45 -0600 Subject: [PATCH 04/53] ntsync: Introduce NTSYNC_IOC_WAIT_ALL. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is similar to NTSYNC_IOC_WAIT_ANY, but waits until all of the objects are simultaneously signaled, and then acquires all of them as a single atomic operation. Because acquisition of multiple objects is atomic, some complex locking is required. We cannot simply spin-lock multiple objects simultaneously, as that may disable preemption for a problematically long time. Instead, modifying any object which may be involved in a wait-all operation takes a device-wide sleeping mutex, "wait_all_lock", instead of the normal object spinlock. Because wait-for-all is a rare operation, in order to optimize wait-for-any, this lock is only taken when necessary. "all_hint" is used to mark objects which are involved in a wait-for-all operation, and if an object is not, only its spinlock is taken. The locking scheme used here was written by Peter Zijlstra. Signed-off-by: Elizabeth Figura --- drivers/misc/ntsync.c | 336 ++++++++++++++++++++++++++++++++++-- include/uapi/linux/ntsync.h | 1 + 2 files changed, 323 insertions(+), 14 deletions(-) diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c index 900cc5ce5761f3..c8beb5504bccbc 100644 --- a/drivers/misc/ntsync.c +++ b/drivers/misc/ntsync.c @@ -13,6 +13,7 @@ #include <linux/ktime.h> #include <linux/miscdevice.h> #include <linux/module.h> +#include <linux/mutex.h> #include <linux/overflow.h> #include <linux/sched.h> #include <linux/sched/signal.h> @@ -41,6 +42,7 @@ enum ntsync_type { struct ntsync_obj { spinlock_t lock; + int dev_locked; enum ntsync_type type; @@ -55,7 +57,30 @@ struct ntsync_obj { } sem; } u; + /* + * any_waiters is protected by the object lock, but all_waiters is + * protected by the device wait_all_lock. + */ struct list_head any_waiters; + struct list_head all_waiters; + + /* + * Hint describing how many tasks are queued on this object in a + * wait-all operation. + * + * Any time we do a wake, we may need to wake "all" waiters as well as + * "any" waiters. In order to atomically wake "all" waiters, we must + * lock all of the objects, and that means grabbing the wait_all_lock + * below (and, due to lock ordering rules, before locking this object). + * However, wait-all is a rare operation, and grabbing the wait-all + * lock for every wake would create unnecessary contention. + * Therefore we first check whether all_hint is zero, and, if it is, + * we skip trying to wake "all" waiters. + * + * Since wait requests must originate from user-space threads, we're + * limited here by PID_MAX_LIMIT, so there's no risk of overflow.
+ */ + atomic_t all_hint; }; struct ntsync_q_entry { @@ -75,19 +100,198 @@ struct ntsync_q { */ atomic_t signaled; + bool all; __u32 count; struct ntsync_q_entry entries[]; }; struct ntsync_device { + /* + * Wait-all operations must atomically grab all objects, and be totally + * ordered with respect to each other and wait-any operations. + * If one thread is trying to acquire several objects, another thread + * cannot touch the object at the same time. + * + * This device-wide lock is used to serialize wait-for-all + * operations, and operations on an object that is involved in a + * wait-for-all. + */ + struct mutex wait_all_lock; + struct file *file; }; +/* + * Single objects are locked using obj->lock. + * + * Multiple objects are 'locked' while holding dev->wait_all_lock. + * In this case however, individual objects are not locked by holding + * obj->lock, but by setting obj->dev_locked. + * + * This means that in order to lock a single object, the sequence is slightly + * more complicated than usual. Specifically it needs to check obj->dev_locked + * after acquiring obj->lock, if set, it needs to drop the lock and acquire + * dev->wait_all_lock in order to serialize against the multi-object operation. + */ + +static void dev_lock_obj(struct ntsync_device *dev, struct ntsync_obj *obj) +{ + lockdep_assert_held(&dev->wait_all_lock); + lockdep_assert(obj->dev == dev); + spin_lock(&obj->lock); + /* + * By setting obj->dev_locked inside obj->lock, it is ensured that + * anyone holding obj->lock must see the value. + */ + obj->dev_locked = 1; + spin_unlock(&obj->lock); +} + +static void dev_unlock_obj(struct ntsync_device *dev, struct ntsync_obj *obj) +{ + lockdep_assert_held(&dev->wait_all_lock); + lockdep_assert(obj->dev == dev); + spin_lock(&obj->lock); + obj->dev_locked = 0; + spin_unlock(&obj->lock); +} + +static void obj_lock(struct ntsync_obj *obj) +{ + struct ntsync_device *dev = obj->dev; + + for (;;) { + spin_lock(&obj->lock); + if (likely(!obj->dev_locked)) + break; + + spin_unlock(&obj->lock); + mutex_lock(&dev->wait_all_lock); + spin_lock(&obj->lock); + /* + * obj->dev_locked should be set and released under the same + * wait_all_lock section, since we now own this lock, it should + * be clear. + */ + lockdep_assert(!obj->dev_locked); + spin_unlock(&obj->lock); + mutex_unlock(&dev->wait_all_lock); + } +} + +static void obj_unlock(struct ntsync_obj *obj) +{ + spin_unlock(&obj->lock); +} + +static bool ntsync_lock_obj(struct ntsync_device *dev, struct ntsync_obj *obj) +{ + bool all; + + obj_lock(obj); + all = atomic_read(&obj->all_hint); + if (unlikely(all)) { + obj_unlock(obj); + mutex_lock(&dev->wait_all_lock); + dev_lock_obj(dev, obj); + } + + return all; +} + +static void ntsync_unlock_obj(struct ntsync_device *dev, struct ntsync_obj *obj, bool all) +{ + if (all) { + dev_unlock_obj(dev, obj); + mutex_unlock(&dev->wait_all_lock); + } else { + obj_unlock(obj); + } +} + +#define ntsync_assert_held(obj) \ + lockdep_assert((lockdep_is_held(&(obj)->lock) != LOCK_STATE_NOT_HELD) || \ + ((lockdep_is_held(&(obj)->dev->wait_all_lock) != LOCK_STATE_NOT_HELD) && \ + (obj)->dev_locked)) + +static bool is_signaled(struct ntsync_obj *obj) +{ + ntsync_assert_held(obj); + + switch (obj->type) { + case NTSYNC_TYPE_SEM: + return !!obj->u.sem.count; + } + + WARN(1, "bad object type %#x\n", obj->type); + return false; +} + +/* + * "locked_obj" is an optional pointer to an object which is already locked and + * should not be locked again. 
This is necessary so that changing an object's + * state and waking it can be a single atomic operation. + */ +static void try_wake_all(struct ntsync_device *dev, struct ntsync_q *q, + struct ntsync_obj *locked_obj) +{ + __u32 count = q->count; + bool can_wake = true; + int signaled = -1; + __u32 i; + + lockdep_assert_held(&dev->wait_all_lock); + if (locked_obj) + lockdep_assert(locked_obj->dev_locked); + + for (i = 0; i < count; i++) { + if (q->entries[i].obj != locked_obj) + dev_lock_obj(dev, q->entries[i].obj); + } + + for (i = 0; i < count; i++) { + if (!is_signaled(q->entries[i].obj)) { + can_wake = false; + break; + } + } + + if (can_wake && atomic_try_cmpxchg(&q->signaled, &signaled, 0)) { + for (i = 0; i < count; i++) { + struct ntsync_obj *obj = q->entries[i].obj; + + switch (obj->type) { + case NTSYNC_TYPE_SEM: + obj->u.sem.count--; + break; + } + } + wake_up_process(q->task); + } + + for (i = 0; i < count; i++) { + if (q->entries[i].obj != locked_obj) + dev_unlock_obj(dev, q->entries[i].obj); + } +} + +static void try_wake_all_obj(struct ntsync_device *dev, struct ntsync_obj *obj) +{ + struct ntsync_q_entry *entry; + + lockdep_assert_held(&dev->wait_all_lock); + lockdep_assert(obj->dev_locked); + + list_for_each_entry(entry, &obj->all_waiters, node) + try_wake_all(dev, entry->q, obj); +} + static void try_wake_any_sem(struct ntsync_obj *sem) { struct ntsync_q_entry *entry; - lockdep_assert_held(&sem->lock); + ntsync_assert_held(sem); + lockdep_assert(sem->type == NTSYNC_TYPE_SEM); list_for_each_entry(entry, &sem->any_waiters, node) { struct ntsync_q *q = entry->q; int signaled = -1; @@ -111,7 +315,7 @@ static int release_sem_state(struct ntsync_obj *sem, __u32 count) { __u32 sum; - lockdep_assert_held(&sem->lock); + ntsync_assert_held(sem); if (check_add_overflow(sem->u.sem.count, count, &sum) || sum > sem->u.sem.max) @@ -123,9 +327,11 @@ static int release_sem_state(struct ntsync_obj *sem, __u32 count) static int ntsync_sem_release(struct ntsync_obj *sem, void __user *argp) { + struct ntsync_device *dev = sem->dev; __u32 __user *user_args = argp; __u32 prev_count; __u32 args; + bool all; int ret; if (copy_from_user(&args, argp, sizeof(args))) @@ -134,14 +340,17 @@ static int ntsync_sem_release(struct ntsync_obj *sem, void __user *argp) if (sem->type != NTSYNC_TYPE_SEM) return -EINVAL; - spin_lock(&sem->lock); + all = ntsync_lock_obj(dev, sem); prev_count = sem->u.sem.count; ret = release_sem_state(sem, args); - if (!ret) + if (!ret) { + if (all) + try_wake_all_obj(dev, sem); try_wake_any_sem(sem); + } - spin_unlock(&sem->lock); + ntsync_unlock_obj(dev, sem, all); if (!ret && put_user(prev_count, user_args)) ret = -EFAULT; @@ -193,6 +402,8 @@ static struct ntsync_obj *ntsync_alloc_obj(struct ntsync_device *dev, get_file(dev->file); spin_lock_init(&obj->lock); INIT_LIST_HEAD(&obj->any_waiters); + INIT_LIST_HEAD(&obj->all_waiters); + atomic_set(&obj->all_hint, 0); return obj; } @@ -301,7 +512,7 @@ static int ntsync_schedule(const struct ntsync_q *q, const struct ntsync_wait_ar * Allocate and initialize the ntsync_q structure, but do not queue us yet.
*/ static int setup_wait(struct ntsync_device *dev, - const struct ntsync_wait_args *args, + const struct ntsync_wait_args *args, bool all, struct ntsync_q **ret_q) { const __u32 count = args->count; @@ -324,6 +535,7 @@ static int setup_wait(struct ntsync_device *dev, return -ENOMEM; q->task = current; atomic_set(&q->signaled, -1); + q->all = all; q->count = count; for (i = 0; i < count; i++) { @@ -333,6 +545,16 @@ static int setup_wait(struct ntsync_device *dev, if (!obj) goto err; + if (all) { + /* Check that the objects are all distinct. */ + for (j = 0; j < i; j++) { + if (obj == q->entries[j].obj) { + put_obj(obj); + goto err; + } + } + } + entry->obj = obj; entry->q = q; entry->index = i; @@ -362,13 +584,14 @@ static int ntsync_wait_any(struct ntsync_device *dev, void __user *argp) struct ntsync_wait_args args; struct ntsync_q *q; int signaled; + bool all; __u32 i; int ret; if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; - ret = setup_wait(dev, &args, &q); + ret = setup_wait(dev, &args, false, &q); if (ret < 0) return ret; @@ -378,9 +601,9 @@ static int ntsync_wait_any(struct ntsync_device *dev, void __user *argp) struct ntsync_q_entry *entry = &q->entries[i]; struct ntsync_obj *obj = entry->obj; - spin_lock(&obj->lock); + all = ntsync_lock_obj(dev, obj); list_add_tail(&entry->node, &obj->any_waiters); - spin_unlock(&obj->lock); + ntsync_unlock_obj(dev, obj, all); } /* check if we are already signaled */ @@ -391,9 +614,9 @@ static int ntsync_wait_any(struct ntsync_device *dev, void __user *argp) if (atomic_read(&q->signaled) != -1) break; - spin_lock(&obj->lock); + all = ntsync_lock_obj(dev, obj); try_wake_any_obj(obj); - spin_unlock(&obj->lock); + ntsync_unlock_obj(dev, obj, all); } /* sleep */ @@ -406,13 +629,94 @@ static int ntsync_wait_any(struct ntsync_device *dev, void __user *argp) struct ntsync_q_entry *entry = &q->entries[i]; struct ntsync_obj *obj = entry->obj; - spin_lock(&obj->lock); + all = ntsync_lock_obj(dev, obj); list_del(&entry->node); - spin_unlock(&obj->lock); + ntsync_unlock_obj(dev, obj, all); + + put_obj(obj); + } + + signaled = atomic_read(&q->signaled); + if (signaled != -1) { + struct ntsync_wait_args __user *user_args = argp; + + /* even if we caught a signal, we need to communicate success */ + ret = 0; + + if (put_user(signaled, &user_args->index)) + ret = -EFAULT; + } else if (!ret) { + ret = -ETIMEDOUT; + } + + kfree(q); + return ret; +} + +static int ntsync_wait_all(struct ntsync_device *dev, void __user *argp) +{ + struct ntsync_wait_args args; + struct ntsync_q *q; + int signaled; + __u32 i; + int ret; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + ret = setup_wait(dev, &args, true, &q); + if (ret < 0) + return ret; + + /* queue ourselves */ + + mutex_lock(&dev->wait_all_lock); + + for (i = 0; i < args.count; i++) { + struct ntsync_q_entry *entry = &q->entries[i]; + struct ntsync_obj *obj = entry->obj; + + atomic_inc(&obj->all_hint); + + /* + * obj->all_waiters is protected by dev->wait_all_lock rather + * than obj->lock, so there is no need to acquire obj->lock + * here. 
+ */ + list_add_tail(&entry->node, &obj->all_waiters); + } + + /* check if we are already signaled */ + + try_wake_all(dev, q, NULL); + + mutex_unlock(&dev->wait_all_lock); + + /* sleep */ + + ret = ntsync_schedule(q, &args); + + /* and finally, unqueue */ + + mutex_lock(&dev->wait_all_lock); + + for (i = 0; i < args.count; i++) { + struct ntsync_q_entry *entry = &q->entries[i]; + struct ntsync_obj *obj = entry->obj; + + /* + * obj->all_waiters is protected by dev->wait_all_lock rather + * than obj->lock, so there is no need to acquire it here. + */ + list_del(&entry->node); + + atomic_dec(&obj->all_hint); put_obj(obj); } + mutex_unlock(&dev->wait_all_lock); + signaled = atomic_read(&q->signaled); if (signaled != -1) { struct ntsync_wait_args __user *user_args = argp; @@ -438,6 +742,8 @@ static int ntsync_char_open(struct inode *inode, struct file *file) if (!dev) return -ENOMEM; + mutex_init(&dev->wait_all_lock); + file->private_data = dev; dev->file = file; return nonseekable_open(inode, file); @@ -461,6 +767,8 @@ switch (cmd) { case NTSYNC_IOC_CREATE_SEM: return ntsync_create_sem(dev, argp); + case NTSYNC_IOC_WAIT_ALL: + return ntsync_wait_all(dev, argp); case NTSYNC_IOC_WAIT_ANY: return ntsync_wait_any(dev, argp); default: diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index 40ffdc41d5bb31..d64420ffbcd71c 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -30,6 +30,7 @@ struct ntsync_wait_args { #define NTSYNC_IOC_CREATE_SEM _IOW ('N', 0x80, struct ntsync_sem_args) #define NTSYNC_IOC_WAIT_ANY _IOWR('N', 0x82, struct ntsync_wait_args) +#define NTSYNC_IOC_WAIT_ALL _IOWR('N', 0x83, struct ntsync_wait_args) #define NTSYNC_IOC_SEM_RELEASE _IOWR('N', 0x81, __u32) From 688808d01208b2f1a23b2c016aa6a437d9120da5 Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:46 -0600 Subject: [PATCH 05/53] ntsync: Introduce NTSYNC_IOC_CREATE_MUTEX. This corresponds to the NT syscall NtCreateMutant(). An NT mutex is recursive, with a 32-bit recursion counter. When acquired via NtWaitForMultipleObjects(), the recursion counter is incremented by one. The OS records the thread which acquired it. However, in order to keep this driver self-contained, the owning thread ID is managed by user-space, and passed as a parameter to all relevant ioctls. The initial owner and recursion count, if any, are specified when the mutex is created. Signed-off-by: Elizabeth Figura --- drivers/misc/ntsync.c | 74 +++++++++++++++++++++++++++++++++++-- include/uapi/linux/ntsync.h | 9 ++++- 2 files changed, 79 insertions(+), 4 deletions(-) diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c index c8beb5504bccbc..a2826dbff2f46c 100644 --- a/drivers/misc/ntsync.c +++ b/drivers/misc/ntsync.c @@ -25,6 +25,7 @@ enum ntsync_type { NTSYNC_TYPE_SEM, + NTSYNC_TYPE_MUTEX, }; /* @@ -55,6 +56,10 @@ struct ntsync_obj { __u32 count; __u32 max; } sem; + struct { + __u32 count; + pid_t owner; + } mutex; } u; /* @@ -92,6 +97,7 @@ struct ntsync_q_entry { struct ntsync_q { struct task_struct *task; + __u32 owner; /* * Protected via atomic_try_cmpxchg().
Only the thread that wins the @@ -214,13 +220,17 @@ static void ntsync_unlock_obj(struct ntsync_device *dev, struct ntsync_obj *obj, ((lockdep_is_held(&(obj)->dev->wait_all_lock) != LOCK_STATE_NOT_HELD) && \ (obj)->dev_locked)) -static bool is_signaled(struct ntsync_obj *obj) +static bool is_signaled(struct ntsync_obj *obj, __u32 owner) { ntsync_assert_held(obj); switch (obj->type) { case NTSYNC_TYPE_SEM: return !!obj->u.sem.count; + case NTSYNC_TYPE_MUTEX: + if (obj->u.mutex.owner && obj->u.mutex.owner != owner) + return false; + return obj->u.mutex.count < UINT_MAX; } WARN(1, "bad object type %#x\n", obj->type); @@ -250,7 +260,7 @@ static void try_wake_all(struct ntsync_device *dev, struct ntsync_q *q, } for (i = 0; i < count; i++) { - if (!is_signaled(q->entries[i].obj)) { + if (!is_signaled(q->entries[i].obj, q->owner)) { can_wake = false; break; } @@ -264,6 +274,10 @@ static void try_wake_all(struct ntsync_device *dev, struct ntsync_q *q, case NTSYNC_TYPE_SEM: obj->u.sem.count--; break; + case NTSYNC_TYPE_MUTEX: + obj->u.mutex.count++; + obj->u.mutex.owner = q->owner; + break; } } wake_up_process(q->task); @@ -307,6 +321,30 @@ static void try_wake_any_sem(struct ntsync_obj *sem) } } +static void try_wake_any_mutex(struct ntsync_obj *mutex) +{ + struct ntsync_q_entry *entry; + + ntsync_assert_held(mutex); + lockdep_assert(mutex->type == NTSYNC_TYPE_MUTEX); + + list_for_each_entry(entry, &mutex->any_waiters, node) { + struct ntsync_q *q = entry->q; + int signaled = -1; + + if (mutex->u.mutex.count == UINT_MAX) + break; + if (mutex->u.mutex.owner && mutex->u.mutex.owner != q->owner) + continue; + + if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) { + mutex->u.mutex.count++; + mutex->u.mutex.owner = q->owner; + wake_up_process(q->task); + } + } +} + /* * Actually change the semaphore state, returning -EOVERFLOW if it is made * invalid. 
@@ -451,6 +489,30 @@ static int ntsync_create_sem(struct ntsync_device *dev, void __user *argp) return fd; } +static int ntsync_create_mutex(struct ntsync_device *dev, void __user *argp) +{ + struct ntsync_mutex_args args; + struct ntsync_obj *mutex; + int fd; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + if (!args.owner != !args.count) + return -EINVAL; + + mutex = ntsync_alloc_obj(dev, NTSYNC_TYPE_MUTEX); + if (!mutex) + return -ENOMEM; + mutex->u.mutex.count = args.count; + mutex->u.mutex.owner = args.owner; + fd = ntsync_obj_get_fd(mutex); + if (fd < 0) + kfree(mutex); + + return fd; +} + static struct ntsync_obj *get_obj(struct ntsync_device *dev, int fd) { struct file *file = fget(fd); @@ -520,7 +582,7 @@ static int setup_wait(struct ntsync_device *dev, struct ntsync_q *q; __u32 i, j; - if (args->pad[0] || args->pad[1] || args->pad[2] || (args->flags & ~NTSYNC_WAIT_REALTIME)) + if (args->pad[0] || args->pad[1] || (args->flags & ~NTSYNC_WAIT_REALTIME)) return -EINVAL; if (args->count > NTSYNC_MAX_WAIT_COUNT) return -EINVAL; @@ -534,6 +596,7 @@ static int setup_wait(struct ntsync_device *dev, if (!q) return -ENOMEM; q->task = current; + q->owner = args->owner; atomic_set(&q->signaled, -1); q->all = all; q->count = count; @@ -576,6 +639,9 @@ static void try_wake_any_obj(struct ntsync_obj *obj) case NTSYNC_TYPE_SEM: try_wake_any_sem(obj); break; + case NTSYNC_TYPE_MUTEX: + try_wake_any_mutex(obj); + break; } } @@ -765,6 +831,8 @@ static long ntsync_char_ioctl(struct file *file, unsigned int cmd, void __user *argp = (void __user *)parm; switch (cmd) { + case NTSYNC_IOC_CREATE_MUTEX: + return ntsync_create_mutex(dev, argp); case NTSYNC_IOC_CREATE_SEM: return ntsync_create_sem(dev, argp); case NTSYNC_IOC_WAIT_ALL: diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index d64420ffbcd71c..bb7fb94f585619 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -15,6 +15,11 @@ struct ntsync_sem_args { __u32 max; }; +struct ntsync_mutex_args { + __u32 owner; + __u32 count; +}; + #define NTSYNC_WAIT_REALTIME 0x1 struct ntsync_wait_args { @@ -23,7 +28,8 @@ struct ntsync_wait_args { __u32 count; __u32 index; __u32 flags; - __u32 pad[3]; + __u32 owner; + __u32 pad[2]; }; #define NTSYNC_MAX_WAIT_COUNT 64 @@ -31,6 +37,7 @@ struct ntsync_wait_args { #define NTSYNC_IOC_CREATE_SEM _IOW ('N', 0x80, struct ntsync_sem_args) #define NTSYNC_IOC_WAIT_ANY _IOWR('N', 0x82, struct ntsync_wait_args) #define NTSYNC_IOC_WAIT_ALL _IOWR('N', 0x83, struct ntsync_wait_args) +#define NTSYNC_IOC_CREATE_MUTEX _IOW ('N', 0x84, struct ntsync_mutex_args) #define NTSYNC_IOC_SEM_RELEASE _IOWR('N', 0x81, __u32) From c230c60f3f3a06c58d052ef981c84e3de4788760 Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:47 -0600 Subject: [PATCH 06/53] ntsync: Introduce NTSYNC_IOC_MUTEX_UNLOCK. This corresponds to the NT syscall NtReleaseMutant(). This syscall decrements the mutex's recursion count by one, and returns the previous value. If the mutex is not owned by the given owner, the function instead fails and returns -EPERM.
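A minimal user-space sketch of this call (illustrative only: "mutex" would be an object fd from NTSYNC_IOC_CREATE_MUTEX, and "tid" the owner identifier that user space assigned to the current thread; error handling elided):

#include <errno.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/ntsync.h>

static void release_nt_mutex(int mutex, __u32 tid)
{
	struct ntsync_mutex_args args = { .owner = tid, .count = 0 };

	/* On success, args.count is updated to the recursion count as it
	 * was before this release. */
	if (ioctl(mutex, NTSYNC_IOC_MUTEX_UNLOCK, &args) == 0)
		printf("previous recursion count: %u\n", args.count);
	else if (errno == EPERM)
		printf("mutex not owned by %u\n", tid);
}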
Signed-off-by: Elizabeth Figura --- drivers/misc/ntsync.c | 53 +++++++++++++++++++++++++++++++++++++ include/uapi/linux/ntsync.h | 1 + 2 files changed, 54 insertions(+) diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c index a2826dbff2f46c..33e26240d9e7c9 100644 --- a/drivers/misc/ntsync.c +++ b/drivers/misc/ntsync.c @@ -396,6 +396,57 @@ static int ntsync_sem_release(struct ntsync_obj *sem, void __user *argp) return ret; } +/* + * Actually change the mutex state, returning -EPERM if not the owner. + */ +static int unlock_mutex_state(struct ntsync_obj *mutex, + const struct ntsync_mutex_args *args) +{ + ntsync_assert_held(mutex); + + if (mutex->u.mutex.owner != args->owner) + return -EPERM; + + if (!--mutex->u.mutex.count) + mutex->u.mutex.owner = 0; + return 0; +} + +static int ntsync_mutex_unlock(struct ntsync_obj *mutex, void __user *argp) +{ + struct ntsync_mutex_args __user *user_args = argp; + struct ntsync_device *dev = mutex->dev; + struct ntsync_mutex_args args; + __u32 prev_count; + bool all; + int ret; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + if (!args.owner) + return -EINVAL; + + if (mutex->type != NTSYNC_TYPE_MUTEX) + return -EINVAL; + + all = ntsync_lock_obj(dev, mutex); + + prev_count = mutex->u.mutex.count; + ret = unlock_mutex_state(mutex, &args); + if (!ret) { + if (all) + try_wake_all_obj(dev, mutex); + try_wake_any_mutex(mutex); + } + + ntsync_unlock_obj(dev, mutex, all); + + if (!ret && put_user(prev_count, &user_args->count)) + ret = -EFAULT; + + return ret; +} + static int ntsync_obj_release(struct inode *inode, struct file *file) { struct ntsync_obj *obj = file->private_data; @@ -415,6 +466,8 @@ static long ntsync_obj_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case NTSYNC_IOC_SEM_RELEASE: return ntsync_sem_release(obj, argp); + case NTSYNC_IOC_MUTEX_UNLOCK: + return ntsync_mutex_unlock(obj, argp); default: return -ENOIOCTLCMD; } diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index bb7fb94f585619..9186304b253c64 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -40,5 +40,6 @@ struct ntsync_wait_args { #define NTSYNC_IOC_CREATE_MUTEX _IOW ('N', 0x84, struct ntsync_mutex_args) #define NTSYNC_IOC_SEM_RELEASE _IOWR('N', 0x81, __u32) +#define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args) #endif From a64343d9ec44e64c495195c67e47fe8ba9217928 Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:48 -0600 Subject: [PATCH 07/53] ntsync: Introduce NTSYNC_IOC_MUTEX_KILL. This does not correspond to any NT syscall. Rather, when a thread dies, it should be called by the NT emulator for each mutex, with the TID of the dying thread. NT mutexes are robust (in the pthread sense). When an NT thread dies, any mutexes it owned are immediately released. Acquisition of those mutexes by other threads will return a special value indicating that the mutex was abandoned, like EOWNERDEAD returned from pthread_mutex_lock(), and EOWNERDEAD is indeed used here for that purpose. 
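As a sketch of the intended call site (hypothetical: an emulator's thread-exit path would iterate over every mutex fd the dying thread could own):

#include <sys/ioctl.h>
#include <linux/ntsync.h>

/* Called by the NT emulator when the thread identified by dead_tid
 * exits; any mutex it owned becomes "abandoned". */
static void kill_owned_mutex(int mutex, __u32 dead_tid)
{
	__u32 owner = dead_tid;

	/* Fails with EPERM if dead_tid was not the owner, which a
	 * thread-exit sweep can simply ignore. */
	ioctl(mutex, NTSYNC_IOC_MUTEX_KILL, &owner);
}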
Signed-off-by: Elizabeth Figura --- drivers/misc/ntsync.c | 61 +++++++++++++++++++++++++++++++++++-- include/uapi/linux/ntsync.h | 1 + 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c index 33e26240d9e7c9..03768ac254256f 100644 --- a/drivers/misc/ntsync.c +++ b/drivers/misc/ntsync.c @@ -59,6 +59,7 @@ struct ntsync_obj { struct { __u32 count; pid_t owner; + bool ownerdead; } mutex; } u; @@ -107,6 +108,7 @@ struct ntsync_q { atomic_t signaled; bool all; + bool ownerdead; __u32 count; struct ntsync_q_entry entries[]; }; @@ -275,6 +277,9 @@ static void try_wake_all(struct ntsync_device *dev, struct ntsync_q *q, obj->u.sem.count--; break; case NTSYNC_TYPE_MUTEX: + if (obj->u.mutex.ownerdead) + q->ownerdead = true; + obj->u.mutex.ownerdead = false; obj->u.mutex.count++; obj->u.mutex.owner = q->owner; break; @@ -338,6 +343,9 @@ static void try_wake_any_mutex(struct ntsync_obj *mutex) continue; if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) { + if (mutex->u.mutex.ownerdead) + q->ownerdead = true; + mutex->u.mutex.ownerdead = false; mutex->u.mutex.count++; mutex->u.mutex.owner = q->owner; wake_up_process(q->task); @@ -447,6 +455,52 @@ static int ntsync_mutex_unlock(struct ntsync_obj *mutex, void __user *argp) return ret; } +/* + * Actually change the mutex state to mark its owner as dead, + * returning -EPERM if not the owner. + */ +static int kill_mutex_state(struct ntsync_obj *mutex, __u32 owner) +{ + ntsync_assert_held(mutex); + + if (mutex->u.mutex.owner != owner) + return -EPERM; + + mutex->u.mutex.ownerdead = true; + mutex->u.mutex.owner = 0; + mutex->u.mutex.count = 0; + return 0; +} + +static int ntsync_mutex_kill(struct ntsync_obj *mutex, void __user *argp) +{ + struct ntsync_device *dev = mutex->dev; + __u32 owner; + bool all; + int ret; + + if (get_user(owner, (__u32 __user *)argp)) + return -EFAULT; + if (!owner) + return -EINVAL; + + if (mutex->type != NTSYNC_TYPE_MUTEX) + return -EINVAL; + + all = ntsync_lock_obj(dev, mutex); + + ret = kill_mutex_state(mutex, owner); + if (!ret) { + if (all) + try_wake_all_obj(dev, mutex); + try_wake_any_mutex(mutex); + } + + ntsync_unlock_obj(dev, mutex, all); + + return ret; +} + static int ntsync_obj_release(struct inode *inode, struct file *file) { struct ntsync_obj *obj = file->private_data; @@ -468,6 +522,8 @@ static long ntsync_obj_ioctl(struct file *file, unsigned int cmd, return ntsync_sem_release(obj, argp); case NTSYNC_IOC_MUTEX_UNLOCK: return ntsync_mutex_unlock(obj, argp); + case NTSYNC_IOC_MUTEX_KILL: + return ntsync_mutex_kill(obj, argp); default: return -ENOIOCTLCMD; } @@ -652,6 +708,7 @@ static int setup_wait(struct ntsync_device *dev, q->owner = args->owner; atomic_set(&q->signaled, -1); q->all = all; + q->ownerdead = false; q->count = count; for (i = 0; i < count; i++) { @@ -760,7 +817,7 @@ static int ntsync_wait_any(struct ntsync_device *dev, void __user *argp) struct ntsync_wait_args __user *user_args = argp; /* even if we caught a signal, we need to communicate success */ - ret = 0; + ret = q->ownerdead ? -EOWNERDEAD : 0; if (put_user(signaled, &user_args->index)) ret = -EFAULT; @@ -841,7 +898,7 @@ static int ntsync_wait_all(struct ntsync_device *dev, void __user *argp) struct ntsync_wait_args __user *user_args = argp; /* even if we caught a signal, we need to communicate success */ - ret = 0; + ret = q->ownerdead ? 
-EOWNERDEAD : 0; if (put_user(signaled, &user_args->index)) ret = -EFAULT; diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index 9186304b253c64..633958d90be319 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -41,5 +41,6 @@ struct ntsync_wait_args { #define NTSYNC_IOC_SEM_RELEASE _IOWR('N', 0x81, __u32) #define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args) +#define NTSYNC_IOC_MUTEX_KILL _IOW ('N', 0x86, __u32) #endif From 1f22f586ccefacfa7a6e095a1ac0466eae1c133c Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:49 -0600 Subject: [PATCH 08/53] ntsync: Introduce NTSYNC_IOC_CREATE_EVENT. This corresponds to the NT syscall NtCreateEvent(). An NT event holds a single bit of state denoting whether it is signaled or unsignaled. There are two types of events: manual-reset and automatic-reset. When an automatic-reset event is acquired via a wait function, its state is reset to unsignaled. Manual-reset events are not affected by wait functions. Whether the event is manual-reset, and its initial state, are specified at creation time. Signed-off-by: Elizabeth Figura --- drivers/misc/ntsync.c | 59 +++++++++++++++++++++++++++++++++++++ include/uapi/linux/ntsync.h | 6 ++++ 2 files changed, 65 insertions(+) diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c index 03768ac254256f..3e8827b6f48046 100644 --- a/drivers/misc/ntsync.c +++ b/drivers/misc/ntsync.c @@ -26,6 +26,7 @@ enum ntsync_type { NTSYNC_TYPE_SEM, NTSYNC_TYPE_MUTEX, + NTSYNC_TYPE_EVENT, }; /* @@ -61,6 +62,10 @@ struct ntsync_obj { pid_t owner; bool ownerdead; } mutex; + struct { + bool manual; + bool signaled; + } event; } u; /* @@ -233,6 +238,8 @@ static bool is_signaled(struct ntsync_obj *obj, __u32 owner) if (obj->u.mutex.owner && obj->u.mutex.owner != owner) return false; return obj->u.mutex.count < UINT_MAX; + case NTSYNC_TYPE_EVENT: + return obj->u.event.signaled; } WARN(1, "bad object type %#x\n", obj->type); @@ -283,6 +290,10 @@ static void try_wake_all(struct ntsync_device *dev, struct ntsync_q *q, obj->u.mutex.count++; obj->u.mutex.owner = q->owner; break; + case NTSYNC_TYPE_EVENT: + if (!obj->u.event.manual) + obj->u.event.signaled = false; + break; } } wake_up_process(q->task); @@ -353,6 +364,28 @@ static void try_wake_any_mutex(struct ntsync_obj *mutex) } } +static void try_wake_any_event(struct ntsync_obj *event) +{ + struct ntsync_q_entry *entry; + + ntsync_assert_held(event); + lockdep_assert(event->type == NTSYNC_TYPE_EVENT); + + list_for_each_entry(entry, &event->any_waiters, node) { + struct ntsync_q *q = entry->q; + int signaled = -1; + + if (!event->u.event.signaled) + break; + + if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) { + if (!event->u.event.manual) + event->u.event.signaled = false; + wake_up_process(q->task); + } + } +} + /* * Actually change the semaphore state, returning -EOVERFLOW if it is made * invalid.
@@ -622,6 +655,27 @@ static int ntsync_create_mutex(struct ntsync_device *dev, void __user *argp) return fd; } +static int ntsync_create_event(struct ntsync_device *dev, void __user *argp) +{ + struct ntsync_event_args args; + struct ntsync_obj *event; + int fd; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + event = ntsync_alloc_obj(dev, NTSYNC_TYPE_EVENT); + if (!event) + return -ENOMEM; + event->u.event.manual = args.manual; + event->u.event.signaled = args.signaled; + fd = ntsync_obj_get_fd(event); + if (fd < 0) + kfree(event); + + return fd; +} + static struct ntsync_obj *get_obj(struct ntsync_device *dev, int fd) { struct file *file = fget(fd); @@ -752,6 +806,9 @@ static void try_wake_any_obj(struct ntsync_obj *obj) case NTSYNC_TYPE_MUTEX: try_wake_any_mutex(obj); break; + case NTSYNC_TYPE_EVENT: + try_wake_any_event(obj); + break; } } @@ -941,6 +998,8 @@ static long ntsync_char_ioctl(struct file *file, unsigned int cmd, void __user *argp = (void __user *)parm; switch (cmd) { + case NTSYNC_IOC_CREATE_EVENT: + return ntsync_create_event(dev, argp); case NTSYNC_IOC_CREATE_MUTEX: return ntsync_create_mutex(dev, argp); case NTSYNC_IOC_CREATE_SEM: diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index 633958d90be319..e08aa02f4dccac 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -20,6 +20,11 @@ struct ntsync_mutex_args { __u32 count; }; +struct ntsync_event_args { + __u32 manual; + __u32 signaled; +}; + #define NTSYNC_WAIT_REALTIME 0x1 struct ntsync_wait_args { @@ -38,6 +43,7 @@ struct ntsync_wait_args { #define NTSYNC_IOC_WAIT_ANY _IOWR('N', 0x82, struct ntsync_wait_args) #define NTSYNC_IOC_WAIT_ALL _IOWR('N', 0x83, struct ntsync_wait_args) #define NTSYNC_IOC_CREATE_MUTEX _IOW ('N', 0x84, struct ntsync_mutex_args) +#define NTSYNC_IOC_CREATE_EVENT _IOW ('N', 0x87, struct ntsync_event_args) #define NTSYNC_IOC_SEM_RELEASE _IOWR('N', 0x81, __u32) #define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args) From ed84944a527c763a5e368a331ff6df5a2cb59827 Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:50 -0600 Subject: [PATCH 09/53] ntsync: Introduce NTSYNC_IOC_EVENT_SET. This corresponds to the NT syscall NtSetEvent(). This sets the event to the signaled state, and returns its previous state. 
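Sketch of the corresponding user-space call ("event" being an object fd from NTSYNC_IOC_CREATE_EVENT; illustrative only, error handling elided):

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/ntsync.h>

static void set_event(int event)
{
	__u32 prev;

	/* Wakes eligible waiters; prev receives the pre-call state. */
	if (ioctl(event, NTSYNC_IOC_EVENT_SET, &prev) == 0)
		printf("event was %s signaled\n", prev ? "already" : "not");
}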
Signed-off-by: Elizabeth Figura --- drivers/misc/ntsync.c | 27 +++++++++++++++++++++++++++ include/uapi/linux/ntsync.h | 1 + 2 files changed, 28 insertions(+) diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c index 3e8827b6f48046..0a87f8ad599326 100644 --- a/drivers/misc/ntsync.c +++ b/drivers/misc/ntsync.c @@ -534,6 +534,31 @@ static int ntsync_mutex_kill(struct ntsync_obj *mutex, void __user *argp) return ret; } +static int ntsync_event_set(struct ntsync_obj *event, void __user *argp) +{ + struct ntsync_device *dev = event->dev; + __u32 prev_state; + bool all; + + if (event->type != NTSYNC_TYPE_EVENT) + return -EINVAL; + + all = ntsync_lock_obj(dev, event); + + prev_state = event->u.event.signaled; + event->u.event.signaled = true; + if (all) + try_wake_all_obj(dev, event); + try_wake_any_event(event); + + ntsync_unlock_obj(dev, event, all); + + if (put_user(prev_state, (__u32 __user *)argp)) + return -EFAULT; + + return 0; +} + static int ntsync_obj_release(struct inode *inode, struct file *file) { struct ntsync_obj *obj = file->private_data; @@ -557,6 +582,8 @@ static long ntsync_obj_ioctl(struct file *file, unsigned int cmd, return ntsync_mutex_unlock(obj, argp); case NTSYNC_IOC_MUTEX_KILL: return ntsync_mutex_kill(obj, argp); + case NTSYNC_IOC_EVENT_SET: + return ntsync_event_set(obj, argp); default: return -ENOIOCTLCMD; } diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index e08aa02f4dccac..db2512f6e3f42c 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -48,5 +48,6 @@ struct ntsync_wait_args { #define NTSYNC_IOC_SEM_RELEASE _IOWR('N', 0x81, __u32) #define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args) #define NTSYNC_IOC_MUTEX_KILL _IOW ('N', 0x86, __u32) +#define NTSYNC_IOC_EVENT_SET _IOR ('N', 0x88, __u32) #endif From d2c534d99649cf653f1be3e57a75e42b0900164d Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:51 -0600 Subject: [PATCH 10/53] ntsync: Introduce NTSYNC_IOC_EVENT_RESET. This corresponds to the NT syscall NtResetEvent(). This sets the event to the unsignaled state, and returns its previous state. 
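Sketch, mirroring the set call above (note that a reset never wakes waiters):

#include <sys/ioctl.h>
#include <linux/ntsync.h>

static int reset_event(int event)
{
	__u32 prev;

	if (ioctl(event, NTSYNC_IOC_EVENT_RESET, &prev) < 0)
		return -1;
	return prev;	/* 1 if the event was signaled before the reset */
}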
Signed-off-by: Elizabeth Figura --- drivers/misc/ntsync.c | 24 ++++++++++++++++++++++++ include/uapi/linux/ntsync.h | 1 + 2 files changed, 25 insertions(+) diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c index 0a87f8ad599326..b31443aa9692b3 100644 --- a/drivers/misc/ntsync.c +++ b/drivers/misc/ntsync.c @@ -559,6 +559,28 @@ static int ntsync_event_set(struct ntsync_obj *event, void __user *argp) return 0; } +static int ntsync_event_reset(struct ntsync_obj *event, void __user *argp) +{ + struct ntsync_device *dev = event->dev; + __u32 prev_state; + bool all; + + if (event->type != NTSYNC_TYPE_EVENT) + return -EINVAL; + + all = ntsync_lock_obj(dev, event); + + prev_state = event->u.event.signaled; + event->u.event.signaled = false; + + ntsync_unlock_obj(dev, event, all); + + if (put_user(prev_state, (__u32 __user *)argp)) + return -EFAULT; + + return 0; +} + static int ntsync_obj_release(struct inode *inode, struct file *file) { struct ntsync_obj *obj = file->private_data; @@ -584,6 +606,8 @@ static long ntsync_obj_ioctl(struct file *file, unsigned int cmd, return ntsync_mutex_kill(obj, argp); case NTSYNC_IOC_EVENT_SET: return ntsync_event_set(obj, argp); + case NTSYNC_IOC_EVENT_RESET: + return ntsync_event_reset(obj, argp); default: return -ENOIOCTLCMD; } diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index db2512f6e3f42c..d74c4e4f93d811 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -49,5 +49,6 @@ struct ntsync_wait_args { #define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args) #define NTSYNC_IOC_MUTEX_KILL _IOW ('N', 0x86, __u32) #define NTSYNC_IOC_EVENT_SET _IOR ('N', 0x88, __u32) +#define NTSYNC_IOC_EVENT_RESET _IOR ('N', 0x89, __u32) #endif From ea57fd7522c8dad5ad010795f96b77a96885f159 Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:52 -0600 Subject: [PATCH 11/53] ntsync: Introduce NTSYNC_IOC_EVENT_PULSE. This corresponds to the NT syscall NtPulseEvent(). This wakes up any waiters as if the event had been set, but does not set the event, instead resetting it if it had been signalled. Thus, for a manual-reset event, all waiters are woken, whereas for an auto-reset event, at most one waiter is woken. 
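A sketch to make the wake-without-set semantics concrete (hypothetical caller; "event" as in the earlier examples):

#include <sys/ioctl.h>
#include <linux/ntsync.h>

static void pulse_event(int event)
{
	__u32 prev;

	/* Waiters present at this instant are woken as if the event were
	 * set, but the event is left unsignaled: a wait issued afterward
	 * will block. prev receives the pre-pulse state. */
	ioctl(event, NTSYNC_IOC_EVENT_PULSE, &prev);
}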
Signed-off-by: Elizabeth Figura --- drivers/misc/ntsync.c | 8 ++++++-- include/uapi/linux/ntsync.h | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c index b31443aa9692b3..141ebe8a7d4f0f 100644 --- a/drivers/misc/ntsync.c +++ b/drivers/misc/ntsync.c @@ -534,7 +534,7 @@ static int ntsync_mutex_kill(struct ntsync_obj *mutex, void __user *argp) return ret; } -static int ntsync_event_set(struct ntsync_obj *event, void __user *argp) +static int ntsync_event_set(struct ntsync_obj *event, void __user *argp, bool pulse) { struct ntsync_device *dev = event->dev; __u32 prev_state; @@ -550,6 +550,8 @@ static int ntsync_event_set(struct ntsync_obj *event, void __user *argp) if (all) try_wake_all_obj(dev, event); try_wake_any_event(event); + if (pulse) + event->u.event.signaled = false; ntsync_unlock_obj(dev, event, all); @@ -605,9 +607,11 @@ static long ntsync_obj_ioctl(struct file *file, unsigned int cmd, case NTSYNC_IOC_MUTEX_KILL: return ntsync_mutex_kill(obj, argp); case NTSYNC_IOC_EVENT_SET: - return ntsync_event_set(obj, argp); + return ntsync_event_set(obj, argp, false); case NTSYNC_IOC_EVENT_RESET: return ntsync_event_reset(obj, argp); + case NTSYNC_IOC_EVENT_PULSE: + return ntsync_event_set(obj, argp, true); default: return -ENOIOCTLCMD; } diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index d74c4e4f93d811..9eab7666b8b99a 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -50,5 +50,6 @@ struct ntsync_wait_args { #define NTSYNC_IOC_MUTEX_KILL _IOW ('N', 0x86, __u32) #define NTSYNC_IOC_EVENT_SET _IOR ('N', 0x88, __u32) #define NTSYNC_IOC_EVENT_RESET _IOR ('N', 0x89, __u32) +#define NTSYNC_IOC_EVENT_PULSE _IOR ('N', 0x8a, __u32) #endif From 1558746d46f1d7e0e7e4b71f93737d93cc3e655e Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:53 -0600 Subject: [PATCH 12/53] ntsync: Introduce NTSYNC_IOC_SEM_READ. This corresponds to the NT syscall NtQuerySemaphore(). This returns the current count and maximum count of the semaphore. 
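Sketch of reading semaphore state ("sem" being an object fd; since the driver takes the snapshot under the object lock, count and max are mutually consistent):

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/ntsync.h>

static void read_sem(int sem)
{
	struct ntsync_sem_args state;

	if (ioctl(sem, NTSYNC_IOC_SEM_READ, &state) == 0)
		printf("count %u of max %u\n", state.count, state.max);
}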
Signed-off-by: Elizabeth Figura --- drivers/misc/ntsync.c | 24 ++++++++++++++++++++++++ include/uapi/linux/ntsync.h | 1 + 2 files changed, 25 insertions(+) diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c index 141ebe8a7d4f0f..b2043412c5cd3a 100644 --- a/drivers/misc/ntsync.c +++ b/drivers/misc/ntsync.c @@ -583,6 +583,28 @@ static int ntsync_event_reset(struct ntsync_obj *event, void __user *argp) return 0; } +static int ntsync_sem_read(struct ntsync_obj *sem, void __user *argp) +{ + struct ntsync_sem_args __user *user_args = argp; + struct ntsync_device *dev = sem->dev; + struct ntsync_sem_args args; + bool all; + + if (sem->type != NTSYNC_TYPE_SEM) + return -EINVAL; + + all = ntsync_lock_obj(dev, sem); + + args.count = sem->u.sem.count; + args.max = sem->u.sem.max; + + ntsync_unlock_obj(dev, sem, all); + + if (copy_to_user(user_args, &args, sizeof(args))) + return -EFAULT; + return 0; +} + static int ntsync_obj_release(struct inode *inode, struct file *file) { struct ntsync_obj *obj = file->private_data; @@ -602,6 +624,8 @@ static long ntsync_obj_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case NTSYNC_IOC_SEM_RELEASE: return ntsync_sem_release(obj, argp); + case NTSYNC_IOC_SEM_READ: + return ntsync_sem_read(obj, argp); case NTSYNC_IOC_MUTEX_UNLOCK: return ntsync_mutex_unlock(obj, argp); case NTSYNC_IOC_MUTEX_KILL: diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index 9eab7666b8b99a..e36b4cfb7d3469 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -51,5 +51,6 @@ struct ntsync_wait_args { #define NTSYNC_IOC_EVENT_SET _IOR ('N', 0x88, __u32) #define NTSYNC_IOC_EVENT_RESET _IOR ('N', 0x89, __u32) #define NTSYNC_IOC_EVENT_PULSE _IOR ('N', 0x8a, __u32) +#define NTSYNC_IOC_SEM_READ _IOR ('N', 0x8b, struct ntsync_sem_args) #endif From acec3dca56b016576c2de892a9d76ea3611567ff Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:54 -0600 Subject: [PATCH 13/53] ntsync: Introduce NTSYNC_IOC_MUTEX_READ. This corresponds to the NT syscall NtQueryMutant(). This returns the recursion count, owner, and abandoned state of the mutex. Signed-off-by: Elizabeth Figura --- drivers/misc/ntsync.c | 26 ++++++++++++++++++++++++++ include/uapi/linux/ntsync.h | 1 + 2 files changed, 27 insertions(+) diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c index b2043412c5cd3a..0047b13b6ebd4c 100644 --- a/drivers/misc/ntsync.c +++ b/drivers/misc/ntsync.c @@ -605,6 +605,30 @@ static int ntsync_sem_read(struct ntsync_obj *sem, void __user *argp) return 0; } +static int ntsync_mutex_read(struct ntsync_obj *mutex, void __user *argp) +{ + struct ntsync_mutex_args __user *user_args = argp; + struct ntsync_device *dev = mutex->dev; + struct ntsync_mutex_args args; + bool all; + int ret; + + if (mutex->type != NTSYNC_TYPE_MUTEX) + return -EINVAL; + + all = ntsync_lock_obj(dev, mutex); + + args.count = mutex->u.mutex.count; + args.owner = mutex->u.mutex.owner; + ret = mutex->u.mutex.ownerdead ? 
-EOWNERDEAD : 0; + + ntsync_unlock_obj(dev, mutex, all); + + if (copy_to_user(user_args, &args, sizeof(args))) + return -EFAULT; + return ret; +} + static int ntsync_obj_release(struct inode *inode, struct file *file) { struct ntsync_obj *obj = file->private_data; @@ -630,6 +654,8 @@ static long ntsync_obj_ioctl(struct file *file, unsigned int cmd, return ntsync_mutex_unlock(obj, argp); case NTSYNC_IOC_MUTEX_KILL: return ntsync_mutex_kill(obj, argp); + case NTSYNC_IOC_MUTEX_READ: + return ntsync_mutex_read(obj, argp); case NTSYNC_IOC_EVENT_SET: return ntsync_event_set(obj, argp, false); case NTSYNC_IOC_EVENT_RESET: diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index e36b4cfb7d3469..207646e63ef3b0 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -52,5 +52,6 @@ struct ntsync_wait_args { #define NTSYNC_IOC_EVENT_RESET _IOR ('N', 0x89, __u32) #define NTSYNC_IOC_EVENT_PULSE _IOR ('N', 0x8a, __u32) #define NTSYNC_IOC_SEM_READ _IOR ('N', 0x8b, struct ntsync_sem_args) +#define NTSYNC_IOC_MUTEX_READ _IOR ('N', 0x8c, struct ntsync_mutex_args) #endif From cea2c04f920b05f6fa3e0931e413909abcb57201 Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:55 -0600 Subject: [PATCH 14/53] ntsync: Introduce NTSYNC_IOC_EVENT_READ. This corresponds to the NT syscall NtQueryEvent(). This returns the signaled state of the event and whether it is manual-reset. Signed-off-by: Elizabeth Figura --- drivers/misc/ntsync.c | 24 ++++++++++++++++++++++++ include/uapi/linux/ntsync.h | 1 + 2 files changed, 25 insertions(+) diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c index 0047b13b6ebd4c..78dc405bb75906 100644 --- a/drivers/misc/ntsync.c +++ b/drivers/misc/ntsync.c @@ -629,6 +629,28 @@ static int ntsync_mutex_read(struct ntsync_obj *mutex, void __user *argp) return ret; } +static int ntsync_event_read(struct ntsync_obj *event, void __user *argp) +{ + struct ntsync_event_args __user *user_args = argp; + struct ntsync_device *dev = event->dev; + struct ntsync_event_args args; + bool all; + + if (event->type != NTSYNC_TYPE_EVENT) + return -EINVAL; + + all = ntsync_lock_obj(dev, event); + + args.manual = event->u.event.manual; + args.signaled = event->u.event.signaled; + + ntsync_unlock_obj(dev, event, all); + + if (copy_to_user(user_args, &args, sizeof(args))) + return -EFAULT; + return 0; +} + static int ntsync_obj_release(struct inode *inode, struct file *file) { struct ntsync_obj *obj = file->private_data; @@ -662,6 +684,8 @@ static long ntsync_obj_ioctl(struct file *file, unsigned int cmd, return ntsync_event_reset(obj, argp); case NTSYNC_IOC_EVENT_PULSE: return ntsync_event_set(obj, argp, true); + case NTSYNC_IOC_EVENT_READ: + return ntsync_event_read(obj, argp); default: return -ENOIOCTLCMD; } diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index 207646e63ef3b0..b9d208a8c00fe9 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -53,5 +53,6 @@ struct ntsync_wait_args { #define NTSYNC_IOC_EVENT_PULSE _IOR ('N', 0x8a, __u32) #define NTSYNC_IOC_SEM_READ _IOR ('N', 0x8b, struct ntsync_sem_args) #define NTSYNC_IOC_MUTEX_READ _IOR ('N', 0x8c, struct ntsync_mutex_args) +#define NTSYNC_IOC_EVENT_READ _IOR ('N', 0x8d, struct ntsync_event_args) #endif From 51f7688f6aa19aa61773d7ec0b795d0079bbdde3 Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:56 -0600 Subject: [PATCH 15/53] ntsync: Introduce alertable waits. NT waits can optionally be made "alertable". 
This is a special channel for thread wakeup that is mildly similar to SIGIO. A thread has an internal single bit of "alerted" state, and if a thread is alerted while an alertable wait, the wait will return a special value, consume the "alerted" state, and will not consume any of its objects. Alerts are implemented using events; the user-space NT emulator is expected to create an internal ntsync event for each thread and pass that event to wait functions. Signed-off-by: Elizabeth Figura --- drivers/misc/ntsync.c | 70 ++++++++++++++++++++++++++++++++----- include/uapi/linux/ntsync.h | 3 +- 2 files changed, 63 insertions(+), 10 deletions(-) diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c index 78dc405bb75906..457ff28b789f75 100644 --- a/drivers/misc/ntsync.c +++ b/drivers/misc/ntsync.c @@ -869,22 +869,29 @@ static int setup_wait(struct ntsync_device *dev, const struct ntsync_wait_args *args, bool all, struct ntsync_q **ret_q) { + int fds[NTSYNC_MAX_WAIT_COUNT + 1]; const __u32 count = args->count; - int fds[NTSYNC_MAX_WAIT_COUNT]; struct ntsync_q *q; + __u32 total_count; __u32 i, j; - if (args->pad[0] || args->pad[1] || (args->flags & ~NTSYNC_WAIT_REALTIME)) + if (args->pad || (args->flags & ~NTSYNC_WAIT_REALTIME)) return -EINVAL; if (args->count > NTSYNC_MAX_WAIT_COUNT) return -EINVAL; + total_count = count; + if (args->alert) + total_count++; + if (copy_from_user(fds, u64_to_user_ptr(args->objs), array_size(count, sizeof(*fds)))) return -EFAULT; + if (args->alert) + fds[count] = args->alert; - q = kmalloc(struct_size(q, entries, count), GFP_KERNEL); + q = kmalloc(struct_size(q, entries, total_count), GFP_KERNEL); if (!q) return -ENOMEM; q->task = current; @@ -894,7 +901,7 @@ static int setup_wait(struct ntsync_device *dev, q->ownerdead = false; q->count = count; - for (i = 0; i < count; i++) { + for (i = 0; i < total_count; i++) { struct ntsync_q_entry *entry = &q->entries[i]; struct ntsync_obj *obj = get_obj(dev, fds[i]); @@ -944,10 +951,10 @@ static void try_wake_any_obj(struct ntsync_obj *obj) static int ntsync_wait_any(struct ntsync_device *dev, void __user *argp) { struct ntsync_wait_args args; + __u32 i, total_count; struct ntsync_q *q; int signaled; bool all; - __u32 i; int ret; if (copy_from_user(&args, argp, sizeof(args))) @@ -957,9 +964,13 @@ static int ntsync_wait_any(struct ntsync_device *dev, void __user *argp) if (ret < 0) return ret; + total_count = args.count; + if (args.alert) + total_count++; + /* queue ourselves */ - for (i = 0; i < args.count; i++) { + for (i = 0; i < total_count; i++) { struct ntsync_q_entry *entry = &q->entries[i]; struct ntsync_obj *obj = entry->obj; @@ -968,9 +979,15 @@ static int ntsync_wait_any(struct ntsync_device *dev, void __user *argp) ntsync_unlock_obj(dev, obj, all); } - /* check if we are already signaled */ + /* + * Check if we are already signaled. + * + * Note that the API requires that normal objects are checked before + * the alert event. Hence we queue the alert event last, and check + * objects in order. 
+ */ - for (i = 0; i < args.count; i++) { + for (i = 0; i < total_count; i++) { struct ntsync_obj *obj = q->entries[i].obj; if (atomic_read(&q->signaled) != -1) @@ -987,7 +1004,7 @@ static int ntsync_wait_any(struct ntsync_device *dev, void __user *argp) /* and finally, unqueue */ - for (i = 0; i < args.count; i++) { + for (i = 0; i < total_count; i++) { struct ntsync_q_entry *entry = &q->entries[i]; struct ntsync_obj *obj = entry->obj; @@ -1047,6 +1064,14 @@ static int ntsync_wait_all(struct ntsync_device *dev, void __user *argp) */ list_add_tail(&entry->node, &obj->all_waiters); } + if (args.alert) { + struct ntsync_q_entry *entry = &q->entries[args.count]; + struct ntsync_obj *obj = entry->obj; + + dev_lock_obj(dev, obj); + list_add_tail(&entry->node, &obj->any_waiters); + dev_unlock_obj(dev, obj); + } /* check if we are already signaled */ @@ -1054,6 +1079,21 @@ static int ntsync_wait_all(struct ntsync_device *dev, void __user *argp) mutex_unlock(&dev->wait_all_lock); + /* + * Check if the alert event is signaled, making sure to do so only + * after checking if the other objects are signaled. + */ + + if (args.alert) { + struct ntsync_obj *obj = q->entries[args.count].obj; + + if (atomic_read(&q->signaled) == -1) { + bool all = ntsync_lock_obj(dev, obj); + try_wake_any_obj(obj); + ntsync_unlock_obj(dev, obj, all); + } + } + /* sleep */ ret = ntsync_schedule(q, &args); @@ -1079,6 +1119,18 @@ static int ntsync_wait_all(struct ntsync_device *dev, void __user *argp) mutex_unlock(&dev->wait_all_lock); + if (args.alert) { + struct ntsync_q_entry *entry = &q->entries[args.count]; + struct ntsync_obj *obj = entry->obj; + bool all; + + all = ntsync_lock_obj(dev, obj); + list_del(&entry->node); + ntsync_unlock_obj(dev, obj, all); + + put_obj(obj); + } + signaled = atomic_read(&q->signaled); if (signaled != -1) { struct ntsync_wait_args __user *user_args = argp; diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index b9d208a8c00fe9..6d06793512b150 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -34,7 +34,8 @@ struct ntsync_wait_args { __u32 index; __u32 flags; __u32 owner; - __u32 pad[2]; + __u32 alert; + __u32 pad; }; #define NTSYNC_MAX_WAIT_COUNT 64 From 10e6ec3fed3e148ec562d36f963505f3f85edd00 Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:56 -0600 Subject: [PATCH 16/53] selftests: ntsync: Add some tests for semaphore state. Wine has tests for its synchronization primitives, but these are more accessible to kernel developers, and also allow us to test some edge cases that Wine does not care about. This patch adds tests for the semaphore-specific ioctls NTSYNC_IOC_SEM_RELEASE and NTSYNC_IOC_SEM_READ, and for waiting on semaphores.
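For reference, the user-space pattern these tests exercise looks roughly like the sketch below (illustrative only, not part of the patch; it assumes the linux/ntsync.h uAPI header from this series is installed, and most error handling is elided):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/ntsync.h>

	int main(void)
	{
		struct ntsync_sem_args sem_args = { .count = 1, .max = 2 };
		__u32 release_count = 1;
		int dev, sem;

		/* Each open file description is an independent ntsync instance. */
		dev = open("/dev/ntsync", O_CLOEXEC | O_RDONLY);
		if (dev < 0)
			return 1;

		/* The ioctl returns a new fd representing the semaphore. */
		sem = ioctl(dev, NTSYNC_IOC_CREATE_SEM, &sem_args);
		if (sem < 0)
			return 1;

		/* Add 1 to the counter; the previous count is written back. */
		if (ioctl(sem, NTSYNC_IOC_SEM_RELEASE, &release_count) < 0)
			perror("NTSYNC_IOC_SEM_RELEASE"); /* EOVERFLOW if max exceeded */
		else
			printf("previous count: %u\n", release_count);

		close(sem);
		close(dev);
		return 0;
	}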
Signed-off-by: Elizabeth Figura --- tools/testing/selftests/Makefile | 1 + .../selftests/drivers/ntsync/.gitignore | 1 + .../testing/selftests/drivers/ntsync/Makefile | 7 + tools/testing/selftests/drivers/ntsync/config | 1 + .../testing/selftests/drivers/ntsync/ntsync.c | 145 ++++++++++++++++++ 5 files changed, 155 insertions(+) create mode 100644 tools/testing/selftests/drivers/ntsync/.gitignore create mode 100644 tools/testing/selftests/drivers/ntsync/Makefile create mode 100644 tools/testing/selftests/drivers/ntsync/config create mode 100644 tools/testing/selftests/drivers/ntsync/ntsync.c diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 9cf769d415687d..d2ae8d2db3c6bc 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -18,6 +18,7 @@ TARGETS += devices/error_logs TARGETS += devices/probe TARGETS += dmabuf-heaps TARGETS += drivers/dma-buf +TARGETS += drivers/ntsync TARGETS += drivers/s390x/uvdevice TARGETS += drivers/net TARGETS += drivers/net/bonding diff --git a/tools/testing/selftests/drivers/ntsync/.gitignore b/tools/testing/selftests/drivers/ntsync/.gitignore new file mode 100644 index 00000000000000..848573a3d3eafc --- /dev/null +++ b/tools/testing/selftests/drivers/ntsync/.gitignore @@ -0,0 +1 @@ +ntsync diff --git a/tools/testing/selftests/drivers/ntsync/Makefile b/tools/testing/selftests/drivers/ntsync/Makefile new file mode 100644 index 00000000000000..dbf2b055c0b287 --- /dev/null +++ b/tools/testing/selftests/drivers/ntsync/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +TEST_GEN_PROGS := ntsync + +CFLAGS += $(KHDR_INCLUDES) +LDLIBS += -lpthread + +include ../../lib.mk diff --git a/tools/testing/selftests/drivers/ntsync/config b/tools/testing/selftests/drivers/ntsync/config new file mode 100644 index 00000000000000..60539c826d0624 --- /dev/null +++ b/tools/testing/selftests/drivers/ntsync/config @@ -0,0 +1 @@ +CONFIG_NTSYNC=y diff --git a/tools/testing/selftests/drivers/ntsync/ntsync.c b/tools/testing/selftests/drivers/ntsync/ntsync.c new file mode 100644 index 00000000000000..cc72da0a91de54 --- /dev/null +++ b/tools/testing/selftests/drivers/ntsync/ntsync.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Various unit tests for the "ntsync" synchronization primitive driver.
+ * + * Copyright (C) 2021-2022 Elizabeth Figura + */ + +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <pthread.h> +#include <stdint.h> +#include <string.h> +#include <time.h> +#include <unistd.h> +#include <sys/ioctl.h> +#include <linux/ntsync.h> +#include "../../kselftest_harness.h" + +static int read_sem_state(int sem, __u32 *count, __u32 *max) +{ + struct ntsync_sem_args args; + int ret; + + memset(&args, 0xcc, sizeof(args)); + ret = ioctl(sem, NTSYNC_IOC_SEM_READ, &args); + *count = args.count; + *max = args.max; + return ret; +} + +#define check_sem_state(sem, count, max) \ + ({ \ + __u32 __count, __max; \ + int ret = read_sem_state((sem), &__count, &__max); \ + EXPECT_EQ(0, ret); \ + EXPECT_EQ((count), __count); \ + EXPECT_EQ((max), __max); \ + }) + +static int release_sem(int sem, __u32 *count) +{ + return ioctl(sem, NTSYNC_IOC_SEM_RELEASE, count); + } + +static int wait_any(int fd, __u32 count, const int *objs, __u32 owner, __u32 *index) +{ + struct ntsync_wait_args args = {0}; + struct timespec timeout; + int ret; + + clock_gettime(CLOCK_MONOTONIC, &timeout); + + args.timeout = timeout.tv_sec * 1000000000 + timeout.tv_nsec; + args.count = count; + args.objs = (uintptr_t)objs; + args.owner = owner; + args.index = 0xdeadbeef; + ret = ioctl(fd, NTSYNC_IOC_WAIT_ANY, &args); + *index = args.index; + return ret; +} + +TEST(semaphore_state) +{ + struct ntsync_sem_args sem_args; + struct timespec timeout; + __u32 count, index; + int fd, ret, sem; + + clock_gettime(CLOCK_MONOTONIC, &timeout); + + fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + sem_args.count = 3; + sem_args.max = 2; + sem = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_EQ(-1, sem); + EXPECT_EQ(EINVAL, errno); + + sem_args.count = 2; + sem_args.max = 2; + sem = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_LE(0, sem); + check_sem_state(sem, 2, 2); + + count = 0; + ret = release_sem(sem, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, count); + check_sem_state(sem, 2, 2); + + count = 1; + ret = release_sem(sem, &count); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOVERFLOW, errno); + check_sem_state(sem, 2, 2); + + ret = wait_any(fd, 1, &sem, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_sem_state(sem, 1, 2); + + ret = wait_any(fd, 1, &sem, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_sem_state(sem, 0, 2); + + ret = wait_any(fd, 1, &sem, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + count = 3; + ret = release_sem(sem, &count); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOVERFLOW, errno); + check_sem_state(sem, 0, 2); + + count = 2; + ret = release_sem(sem, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, count); + check_sem_state(sem, 2, 2); + + ret = wait_any(fd, 1, &sem, 123, &index); + EXPECT_EQ(0, ret); + ret = wait_any(fd, 1, &sem, 123, &index); + EXPECT_EQ(0, ret); + + count = 1; + ret = release_sem(sem, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, count); + check_sem_state(sem, 1, 2); + + count = ~0u; + ret = release_sem(sem, &count); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOVERFLOW, errno); + check_sem_state(sem, 1, 2); + + close(sem); + + close(fd); +} + +TEST_HARNESS_MAIN From 99125a9905c270a1b589c32726ab64cee8cb6344 Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:58 -0600 Subject: [PATCH 17/53] selftests: ntsync: Add some tests for mutex state. Test mutex-specific ioctls NTSYNC_IOC_MUTEX_UNLOCK and NTSYNC_IOC_MUTEX_READ, and waiting on mutexes.
Signed-off-by: Elizabeth Figura --- .../testing/selftests/drivers/ntsync/ntsync.c | 187 ++++++++++++++++++ 1 file changed, 187 insertions(+) diff --git a/tools/testing/selftests/drivers/ntsync/ntsync.c b/tools/testing/selftests/drivers/ntsync/ntsync.c index cc72da0a91de54..4db65490b6a125 100644 --- a/tools/testing/selftests/drivers/ntsync/ntsync.c +++ b/tools/testing/selftests/drivers/ntsync/ntsync.c @@ -40,6 +40,39 @@ static int release_sem(int sem, __u32 *count) return ioctl(sem, NTSYNC_IOC_SEM_RELEASE, count); } +static int read_mutex_state(int mutex, __u32 *count, __u32 *owner) +{ + struct ntsync_mutex_args args; + int ret; + + memset(&args, 0xcc, sizeof(args)); + ret = ioctl(mutex, NTSYNC_IOC_MUTEX_READ, &args); + *count = args.count; + *owner = args.owner; + return ret; +} + +#define check_mutex_state(mutex, count, owner) \ + ({ \ + __u32 __count, __owner; \ + int ret = read_mutex_state((mutex), &__count, &__owner); \ + EXPECT_EQ(0, ret); \ + EXPECT_EQ((count), __count); \ + EXPECT_EQ((owner), __owner); \ + }) + +static int unlock_mutex(int mutex, __u32 owner, __u32 *count) +{ + struct ntsync_mutex_args args; + int ret; + + args.owner = owner; + args.count = 0xdeadbeef; + ret = ioctl(mutex, NTSYNC_IOC_MUTEX_UNLOCK, &args); + *count = args.count; + return ret; +} + static int wait_any(int fd, __u32 count, const int *objs, __u32 owner, __u32 *index) { struct ntsync_wait_args args = {0}; @@ -142,4 +175,158 @@ TEST(semaphore_state) close(fd); } +TEST(mutex_state) +{ + struct ntsync_mutex_args mutex_args; + __u32 owner, count, index; + struct timespec timeout; + int fd, ret, mutex; + + clock_gettime(CLOCK_MONOTONIC, &timeout); + + fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + mutex_args.owner = 123; + mutex_args.count = 0; + mutex = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); + EXPECT_EQ(-1, mutex); + EXPECT_EQ(EINVAL, errno); + + mutex_args.owner = 0; + mutex_args.count = 2; + mutex = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); + EXPECT_EQ(-1, mutex); + EXPECT_EQ(EINVAL, errno); + + mutex_args.owner = 123; + mutex_args.count = 2; + mutex = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); + EXPECT_LE(0, mutex); + check_mutex_state(mutex, 2, 123); + + ret = unlock_mutex(mutex, 0, &count); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = unlock_mutex(mutex, 456, &count); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EPERM, errno); + check_mutex_state(mutex, 2, 123); + + ret = unlock_mutex(mutex, 123, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, count); + check_mutex_state(mutex, 1, 123); + + ret = unlock_mutex(mutex, 123, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, count); + check_mutex_state(mutex, 0, 0); + + ret = unlock_mutex(mutex, 123, &count); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EPERM, errno); + + ret = wait_any(fd, 1, &mutex, 456, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_mutex_state(mutex, 1, 456); + + ret = wait_any(fd, 1, &mutex, 456, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_mutex_state(mutex, 2, 456); + + ret = unlock_mutex(mutex, 456, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, count); + check_mutex_state(mutex, 1, 456); + + ret = wait_any(fd, 1, &mutex, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + owner = 0; + ret = ioctl(mutex, NTSYNC_IOC_MUTEX_KILL, &owner); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + owner = 123; + ret = ioctl(mutex, NTSYNC_IOC_MUTEX_KILL, &owner); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EPERM, errno); + check_mutex_state(mutex, 1, 456); + + owner = 456; + 
ret = ioctl(mutex, NTSYNC_IOC_MUTEX_KILL, &owner); + EXPECT_EQ(0, ret); + + memset(&mutex_args, 0xcc, sizeof(mutex_args)); + ret = ioctl(mutex, NTSYNC_IOC_MUTEX_READ, &mutex_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOWNERDEAD, errno); + EXPECT_EQ(0, mutex_args.count); + EXPECT_EQ(0, mutex_args.owner); + + memset(&mutex_args, 0xcc, sizeof(mutex_args)); + ret = ioctl(mutex, NTSYNC_IOC_MUTEX_READ, &mutex_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOWNERDEAD, errno); + EXPECT_EQ(0, mutex_args.count); + EXPECT_EQ(0, mutex_args.owner); + + ret = wait_any(fd, 1, &mutex, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOWNERDEAD, errno); + EXPECT_EQ(0, index); + check_mutex_state(mutex, 1, 123); + + owner = 123; + ret = ioctl(mutex, NTSYNC_IOC_MUTEX_KILL, &owner); + EXPECT_EQ(0, ret); + + memset(&mutex_args, 0xcc, sizeof(mutex_args)); + ret = ioctl(mutex, NTSYNC_IOC_MUTEX_READ, &mutex_args); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOWNERDEAD, errno); + EXPECT_EQ(0, mutex_args.count); + EXPECT_EQ(0, mutex_args.owner); + + ret = wait_any(fd, 1, &mutex, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOWNERDEAD, errno); + EXPECT_EQ(0, index); + check_mutex_state(mutex, 1, 123); + + close(mutex); + + mutex_args.owner = 0; + mutex_args.count = 0; + mutex = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); + EXPECT_LE(0, mutex); + check_mutex_state(mutex, 0, 0); + + ret = wait_any(fd, 1, &mutex, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_mutex_state(mutex, 1, 123); + + close(mutex); + + mutex_args.owner = 123; + mutex_args.count = ~0u; + mutex = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); + EXPECT_LE(0, mutex); + check_mutex_state(mutex, ~0u, 123); + + ret = wait_any(fd, 1, &mutex, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + close(mutex); + + close(fd); +} + TEST_HARNESS_MAIN From 6d43b01f6e2015dd08717395c09612a3743e1527 Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:59 -0600 Subject: [PATCH 18/53] selftests: ntsync: Add some tests for NTSYNC_IOC_WAIT_ANY. Test basic synchronous functionality of NTSYNC_IOC_WAIT_ANY, when objects are considered signaled or not signaled, and how they are affected by a successful wait. 
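As a rough illustration of the call under test (not part of the patch; the helper name and the 100ms budget are arbitrary), a single NTSYNC_IOC_WAIT_ANY invocation looks like this:

	#include <stdint.h>
	#include <time.h>
	#include <sys/ioctl.h>
	#include <linux/ntsync.h>

	/* "dev", "sem" and "mutex" are assumed to be fds on one ntsync instance. */
	static int wait_any_example(int dev, int sem, int mutex, __u32 owner)
	{
		struct ntsync_wait_args args = {0};
		int objs[2] = { sem, mutex };
		struct timespec now;

		clock_gettime(CLOCK_MONOTONIC, &now);

		/* Timeouts are absolute; MONOTONIC unless NTSYNC_WAIT_REALTIME. */
		args.timeout = now.tv_sec * 1000000000ULL + now.tv_nsec + 100000000;
		args.objs = (uintptr_t)objs;
		args.count = 2;
		args.owner = owner;

		/*
		 * On success args.index names the acquired object; when several
		 * objects are signaled, the lowest index wins. On timeout the
		 * ioctl fails with ETIMEDOUT.
		 */
		return ioctl(dev, NTSYNC_IOC_WAIT_ANY, &args);
	}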
Signed-off-by: Elizabeth Figura --- .../testing/selftests/drivers/ntsync/ntsync.c | 114 ++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/tools/testing/selftests/drivers/ntsync/ntsync.c b/tools/testing/selftests/drivers/ntsync/ntsync.c index 4db65490b6a125..9781a74253eebc 100644 --- a/tools/testing/selftests/drivers/ntsync/ntsync.c +++ b/tools/testing/selftests/drivers/ntsync/ntsync.c @@ -329,4 +329,118 @@ TEST(mutex_state) close(fd); } +TEST(test_wait_any) +{ + int objs[NTSYNC_MAX_WAIT_COUNT + 1], fd, ret; + struct ntsync_mutex_args mutex_args = {0}; + struct ntsync_sem_args sem_args = {0}; + __u32 owner, index, count, i; + struct timespec timeout; + + clock_gettime(CLOCK_MONOTONIC, &timeout); + + fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + sem_args.count = 2; + sem_args.max = 3; + objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_LE(0, objs[0]); + + mutex_args.owner = 0; + mutex_args.count = 0; + objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); + EXPECT_LE(0, objs[1]); + + ret = wait_any(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_sem_state(objs[0], 1, 3); + check_mutex_state(objs[1], 0, 0); + + ret = wait_any(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_sem_state(objs[0], 0, 3); + check_mutex_state(objs[1], 0, 0); + + ret = wait_any(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, index); + check_sem_state(objs[0], 0, 3); + check_mutex_state(objs[1], 1, 123); + + count = 1; + ret = release_sem(objs[0], &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, count); + + ret = wait_any(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_sem_state(objs[0], 0, 3); + check_mutex_state(objs[1], 1, 123); + + ret = wait_any(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, index); + check_sem_state(objs[0], 0, 3); + check_mutex_state(objs[1], 2, 123); + + ret = wait_any(fd, 2, objs, 456, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + owner = 123; + ret = ioctl(objs[1], NTSYNC_IOC_MUTEX_KILL, &owner); + EXPECT_EQ(0, ret); + + ret = wait_any(fd, 2, objs, 456, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOWNERDEAD, errno); + EXPECT_EQ(1, index); + + ret = wait_any(fd, 2, objs, 456, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, index); + + close(objs[1]); + + /* test waiting on the same object twice */ + + count = 2; + ret = release_sem(objs[0], &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, count); + + objs[1] = objs[0]; + ret = wait_any(fd, 2, objs, 456, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_sem_state(objs[0], 1, 3); + + ret = wait_any(fd, 0, NULL, 456, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + for (i = 1; i < NTSYNC_MAX_WAIT_COUNT + 1; ++i) + objs[i] = objs[0]; + + ret = wait_any(fd, NTSYNC_MAX_WAIT_COUNT, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + + ret = wait_any(fd, NTSYNC_MAX_WAIT_COUNT + 1, objs, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + ret = wait_any(fd, -1, objs, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + close(objs[0]); + + close(fd); +} + TEST_HARNESS_MAIN From a1f9c7498306667ef5b29f59dba180e4b59e4fc4 Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:35:00 -0600 Subject: [PATCH 19/53] selftests: ntsync: Add some tests for NTSYNC_IOC_WAIT_ALL. 
Test basic synchronous functionality of NTSYNC_IOC_WAIT_ALL, and when objects are considered simultaneously signaled. Signed-off-by: Elizabeth Figura --- .../testing/selftests/drivers/ntsync/ntsync.c | 93 ++++++++++++++++++- 1 file changed, 91 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/drivers/ntsync/ntsync.c b/tools/testing/selftests/drivers/ntsync/ntsync.c index 9781a74253eebc..aba11eaf9a7cb4 100644 --- a/tools/testing/selftests/drivers/ntsync/ntsync.c +++ b/tools/testing/selftests/drivers/ntsync/ntsync.c @@ -73,7 +73,8 @@ static int unlock_mutex(int mutex, __u32 owner, __u32 *count) return ret; } -static int wait_any(int fd, __u32 count, const int *objs, __u32 owner, __u32 *index) +static int wait_objs(int fd, unsigned long request, __u32 count, + const int *objs, __u32 owner, __u32 *index) { struct ntsync_wait_args args = {0}; struct timespec timeout; @@ -86,11 +87,21 @@ static int wait_any(int fd, __u32 count, const int *objs, __u32 owner, __u32 *in args.objs = (uintptr_t)objs; args.owner = owner; args.index = 0xdeadbeef; - ret = ioctl(fd, NTSYNC_IOC_WAIT_ANY, &args); + ret = ioctl(fd, request, &args); *index = args.index; return ret; } +static int wait_any(int fd, __u32 count, const int *objs, __u32 owner, __u32 *index) +{ + return wait_objs(fd, NTSYNC_IOC_WAIT_ANY, count, objs, owner, index); +} + +static int wait_all(int fd, __u32 count, const int *objs, __u32 owner, __u32 *index) +{ + return wait_objs(fd, NTSYNC_IOC_WAIT_ALL, count, objs, owner, index); +} + TEST(semaphore_state) { struct ntsync_sem_args sem_args; @@ -443,4 +454,82 @@ TEST(test_wait_any) close(fd); } +TEST(test_wait_all) +{ + struct ntsync_mutex_args mutex_args = {0}; + struct ntsync_sem_args sem_args = {0}; + __u32 owner, index, count; + int objs[2], fd, ret; + + fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + sem_args.count = 2; + sem_args.max = 3; + objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_LE(0, objs[0]); + + mutex_args.owner = 0; + mutex_args.count = 0; + objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); + EXPECT_LE(0, objs[1]); + + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_sem_state(objs[0], 1, 3); + check_mutex_state(objs[1], 1, 123); + + ret = wait_all(fd, 2, objs, 456, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + check_sem_state(objs[0], 1, 3); + check_mutex_state(objs[1], 1, 123); + + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_sem_state(objs[0], 0, 3); + check_mutex_state(objs[1], 2, 123); + + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + check_sem_state(objs[0], 0, 3); + check_mutex_state(objs[1], 2, 123); + + count = 3; + ret = release_sem(objs[0], &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, count); + + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_sem_state(objs[0], 2, 3); + check_mutex_state(objs[1], 3, 123); + + owner = 123; + ret = ioctl(objs[1], NTSYNC_IOC_MUTEX_KILL, &owner); + EXPECT_EQ(0, ret); + + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOWNERDEAD, errno); + check_sem_state(objs[0], 1, 3); + check_mutex_state(objs[1], 1, 123); + + close(objs[1]); + + /* test waiting on the same object twice */ + objs[1] = objs[0]; + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + close(objs[0]); + + close(fd); +} + TEST_HARNESS_MAIN 
From 23b8a6c56a0bd81f9245df500026cb5fbb76198e Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:35:01 -0600 Subject: [PATCH 20/53] selftests: ntsync: Add some tests for wakeup signaling with NTSYNC_IOC_WAIT_ANY. Test contended "wait-for-any" waits, to make sure that scheduling and wakeup logic works correctly. Signed-off-by: Elizabeth Figura --- .../testing/selftests/drivers/ntsync/ntsync.c | 143 ++++++++++++++++++ 1 file changed, 143 insertions(+) diff --git a/tools/testing/selftests/drivers/ntsync/ntsync.c b/tools/testing/selftests/drivers/ntsync/ntsync.c index aba11eaf9a7cb4..ad3f031c0b9267 100644 --- a/tools/testing/selftests/drivers/ntsync/ntsync.c +++ b/tools/testing/selftests/drivers/ntsync/ntsync.c @@ -532,4 +532,147 @@ TEST(test_wait_all) close(fd); } +struct wake_args { + int fd; + int obj; +}; + +struct wait_args { + int fd; + unsigned long request; + struct ntsync_wait_args *args; + int ret; + int err; +}; + +static void *wait_thread(void *arg) +{ + struct wait_args *args = arg; + + args->ret = ioctl(args->fd, args->request, args->args); + args->err = errno; + return NULL; +} + +static __u64 get_abs_timeout(unsigned int ms) +{ + struct timespec timeout; + clock_gettime(CLOCK_MONOTONIC, &timeout); + return (timeout.tv_sec * 1000000000) + timeout.tv_nsec + (ms * 1000000); +} + +static int wait_for_thread(pthread_t thread, unsigned int ms) +{ + struct timespec timeout; + + clock_gettime(CLOCK_REALTIME, &timeout); + timeout.tv_nsec += ms * 1000000; + timeout.tv_sec += (timeout.tv_nsec / 1000000000); + timeout.tv_nsec %= 1000000000; + return pthread_timedjoin_np(thread, NULL, &timeout); + } + +TEST(wake_any) +{ + struct ntsync_mutex_args mutex_args = {0}; + struct ntsync_wait_args wait_args = {0}; + struct ntsync_sem_args sem_args = {0}; + struct wait_args thread_args; + int objs[2], fd, ret; + __u32 count, index; + pthread_t thread; + + fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + sem_args.count = 0; + sem_args.max = 3; + objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_LE(0, objs[0]); + + mutex_args.owner = 123; + mutex_args.count = 1; + objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); + EXPECT_LE(0, objs[1]); + + /* test waking the semaphore */ + + wait_args.timeout = get_abs_timeout(1000); + wait_args.objs = (uintptr_t)objs; + wait_args.count = 2; + wait_args.owner = 456; + wait_args.index = 0xdeadbeef; + thread_args.fd = fd; + thread_args.args = &wait_args; + thread_args.request = NTSYNC_IOC_WAIT_ANY; + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + count = 1; + ret = release_sem(objs[0], &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, count); + check_sem_state(objs[0], 0, 3); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(0, wait_args.index); + + /* test waking the mutex */ + + /* first grab it again for owner 123 */ + ret = wait_any(fd, 1, &objs[1], 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + + wait_args.timeout = get_abs_timeout(1000); + wait_args.owner = 456; + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + ret = unlock_mutex(objs[1], 123, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, count); + + ret = pthread_tryjoin_np(thread, NULL); + EXPECT_EQ(EBUSY, ret); + + ret = unlock_mutex(objs[1], 123, &count); + 
EXPECT_EQ(0, ret); + EXPECT_EQ(1, count); + check_mutex_state(objs[1], 1, 456); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(1, wait_args.index); + + /* delete an object while it's being waited on */ + + wait_args.timeout = get_abs_timeout(200); + wait_args.owner = 123; + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + close(objs[0]); + close(objs[1]); + + ret = wait_for_thread(thread, 200); + EXPECT_EQ(0, ret); + EXPECT_EQ(-1, thread_args.ret); + EXPECT_EQ(ETIMEDOUT, thread_args.err); + + close(fd); +} + TEST_HARNESS_MAIN From 9afa397c9d9d549f4c77b553975269bec5e3e7fe Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:35:02 -0600 Subject: [PATCH 21/53] selftests: ntsync: Add some tests for wakeup signaling with NTSYNC_IOC_WAIT_ALL. Test contended "wait-for-all" waits, to make sure that scheduling and wakeup logic works correctly, and that the wait only exits once objects are all simultaneously signaled. Signed-off-by: Elizabeth Figura --- .../testing/selftests/drivers/ntsync/ntsync.c | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/tools/testing/selftests/drivers/ntsync/ntsync.c b/tools/testing/selftests/drivers/ntsync/ntsync.c index ad3f031c0b9267..6bf0f10679d8aa 100644 --- a/tools/testing/selftests/drivers/ntsync/ntsync.c +++ b/tools/testing/selftests/drivers/ntsync/ntsync.c @@ -675,4 +675,95 @@ TEST(wake_any) close(fd); } +TEST(wake_all) +{ + struct ntsync_mutex_args mutex_args = {0}; + struct ntsync_wait_args wait_args = {0}; + struct ntsync_sem_args sem_args = {0}; + struct wait_args thread_args; + int objs[2], fd, ret; + __u32 count, index; + pthread_t thread; + + fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + sem_args.count = 0; + sem_args.max = 3; + objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_LE(0, objs[0]); + + mutex_args.owner = 123; + mutex_args.count = 1; + objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); + EXPECT_LE(0, objs[1]); + + wait_args.timeout = get_abs_timeout(1000); + wait_args.objs = (uintptr_t)objs; + wait_args.count = 2; + wait_args.owner = 456; + thread_args.fd = fd; + thread_args.args = &wait_args; + thread_args.request = NTSYNC_IOC_WAIT_ALL; + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + count = 1; + ret = release_sem(objs[0], &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, count); + + ret = pthread_tryjoin_np(thread, NULL); + EXPECT_EQ(EBUSY, ret); + + check_sem_state(objs[0], 1, 3); + + ret = wait_any(fd, 1, &objs[0], 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + + ret = unlock_mutex(objs[1], 123, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, count); + + ret = pthread_tryjoin_np(thread, NULL); + EXPECT_EQ(EBUSY, ret); + + check_mutex_state(objs[1], 0, 0); + + count = 2; + ret = release_sem(objs[0], &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, count); + check_sem_state(objs[0], 1, 3); + check_mutex_state(objs[1], 1, 456); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + + /* delete an object while it's being waited on */ + + wait_args.timeout = get_abs_timeout(200); + wait_args.owner = 123; + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100);
+ EXPECT_EQ(ETIMEDOUT, ret); + + close(objs[0]); + close(objs[1]); + + ret = wait_for_thread(thread, 200); + EXPECT_EQ(0, ret); + EXPECT_EQ(-1, thread_args.ret); + EXPECT_EQ(ETIMEDOUT, thread_args.err); + + close(fd); +} + TEST_HARNESS_MAIN From 15ba4142a1aca7e05769a36add44a8f8e1f771dc Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:35:03 -0600 Subject: [PATCH 22/53] selftests: ntsync: Add some tests for manual-reset event state. Test event-specific ioctls NTSYNC_IOC_EVENT_SET, NTSYNC_IOC_EVENT_RESET, NTSYNC_IOC_EVENT_PULSE, NTSYNC_IOC_EVENT_READ for manual-reset events, and waiting on manual-reset events. Signed-off-by: Elizabeth Figura --- .../testing/selftests/drivers/ntsync/ntsync.c | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/tools/testing/selftests/drivers/ntsync/ntsync.c b/tools/testing/selftests/drivers/ntsync/ntsync.c index 6bf0f10679d8aa..4024a5c48bbb6a 100644 --- a/tools/testing/selftests/drivers/ntsync/ntsync.c +++ b/tools/testing/selftests/drivers/ntsync/ntsync.c @@ -73,6 +73,27 @@ static int unlock_mutex(int mutex, __u32 owner, __u32 *count) return ret; } +static int read_event_state(int event, __u32 *signaled, __u32 *manual) +{ + struct ntsync_event_args args; + int ret; + + memset(&args, 0xcc, sizeof(args)); + ret = ioctl(event, NTSYNC_IOC_EVENT_READ, &args); + *signaled = args.signaled; + *manual = args.manual; + return ret; +} + +#define check_event_state(event, signaled, manual) \ + ({ \ + __u32 __signaled, __manual; \ + int ret = read_event_state((event), &__signaled, &__manual); \ + EXPECT_EQ(0, ret); \ + EXPECT_EQ((signaled), __signaled); \ + EXPECT_EQ((manual), __manual); \ + }) + static int wait_objs(int fd, unsigned long request, __u32 count, const int *objs, __u32 owner, __u32 *index) { @@ -340,6 +361,71 @@ TEST(mutex_state) close(fd); } +TEST(manual_event_state) +{ + struct ntsync_event_args event_args; + __u32 index, signaled; + int fd, event, ret; + + fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + event_args.manual = 1; + event_args.signaled = 0; + event = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); + EXPECT_LE(0, event); + check_event_state(event, 0, 1); + + signaled = 0xdeadbeef; + ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, signaled); + check_event_state(event, 1, 1); + + ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, signaled); + check_event_state(event, 1, 1); + + ret = wait_any(fd, 1, &event, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_event_state(event, 1, 1); + + signaled = 0xdeadbeef; + ret = ioctl(event, NTSYNC_IOC_EVENT_RESET, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, signaled); + check_event_state(event, 0, 1); + + ret = ioctl(event, NTSYNC_IOC_EVENT_RESET, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, signaled); + check_event_state(event, 0, 1); + + ret = wait_any(fd, 1, &event, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, signaled); + + ret = ioctl(event, NTSYNC_IOC_EVENT_PULSE, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, signaled); + check_event_state(event, 0, 1); + + ret = ioctl(event, NTSYNC_IOC_EVENT_PULSE, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, signaled); + check_event_state(event, 0, 1); + + close(event); + + close(fd); +} + TEST(test_wait_any) { int objs[NTSYNC_MAX_WAIT_COUNT + 1], fd, ret; From 
ef0f422e0717b560410ffeb4db82d778e14b756f Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:35:04 -0600 Subject: [PATCH 23/53] selftests: ntsync: Add some tests for auto-reset event state. Test event-specific ioctls NTSYNC_IOC_EVENT_SET, NTSYNC_IOC_EVENT_RESET, NTSYNC_IOC_EVENT_PULSE, NTSYNC_IOC_EVENT_READ for auto-reset events, and waiting on auto-reset events. Signed-off-by: Elizabeth Figura --- .../testing/selftests/drivers/ntsync/ntsync.c | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/tools/testing/selftests/drivers/ntsync/ntsync.c b/tools/testing/selftests/drivers/ntsync/ntsync.c index 4024a5c48bbb6a..66a096cb204565 100644 --- a/tools/testing/selftests/drivers/ntsync/ntsync.c +++ b/tools/testing/selftests/drivers/ntsync/ntsync.c @@ -426,6 +426,62 @@ TEST(manual_event_state) close(fd); } +TEST(auto_event_state) +{ + struct ntsync_event_args event_args; + __u32 index, signaled; + int fd, event, ret; + + fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + event_args.manual = 0; + event_args.signaled = 1; + event = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); + EXPECT_LE(0, event); + + check_event_state(event, 1, 0); + + signaled = 0xdeadbeef; + ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, signaled); + check_event_state(event, 1, 0); + + ret = wait_any(fd, 1, &event, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_event_state(event, 0, 0); + + signaled = 0xdeadbeef; + ret = ioctl(event, NTSYNC_IOC_EVENT_RESET, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, signaled); + check_event_state(event, 0, 0); + + ret = wait_any(fd, 1, &event, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, signaled); + + ret = ioctl(event, NTSYNC_IOC_EVENT_PULSE, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, signaled); + check_event_state(event, 0, 0); + + ret = ioctl(event, NTSYNC_IOC_EVENT_PULSE, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, signaled); + check_event_state(event, 0, 0); + + close(event); + + close(fd); +} + TEST(test_wait_any) { int objs[NTSYNC_MAX_WAIT_COUNT + 1], fd, ret; From 31634f5c7fcd42a7581db73774943fe727495549 Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:35:05 -0600 Subject: [PATCH 24/53] selftests: ntsync: Add some tests for wakeup signaling with events. Expand the contended wait tests, which previously only covered semaphores and mutexes, to cover events as well.
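For illustration (not part of the patch), the wakeup behaviour that distinguishes auto-reset from manual-reset events can be seen from the waker's side in a helper like the following sketch, where "event" is assumed to be an ntsync event fd:

	#include <sys/ioctl.h>
	#include <linux/ntsync.h>

	/* Returns the event's pre-set state (0 or 1), or -1 on error. */
	static int signal_event(int event)
	{
		__u32 prev;

		/*
		 * NTSYNC_IOC_EVENT_SET wakes eligible waiters. An auto-reset
		 * event is designaled again as soon as it satisfies one waiter,
		 * so a subsequent read may already report it as unsignaled.
		 */
		if (ioctl(event, NTSYNC_IOC_EVENT_SET, &prev) < 0)
			return -1;
		return prev;
	}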
Signed-off-by: Elizabeth Figura --- .../testing/selftests/drivers/ntsync/ntsync.c | 145 +++++++++++++++++- 1 file changed, 141 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/drivers/ntsync/ntsync.c b/tools/testing/selftests/drivers/ntsync/ntsync.c index 66a096cb204565..33348f0b621f29 100644 --- a/tools/testing/selftests/drivers/ntsync/ntsync.c +++ b/tools/testing/selftests/drivers/ntsync/ntsync.c @@ -598,6 +598,7 @@ TEST(test_wait_any) TEST(test_wait_all) { + struct ntsync_event_args event_args = {0}; struct ntsync_mutex_args mutex_args = {0}; struct ntsync_sem_args sem_args = {0}; __u32 owner, index, count; @@ -663,6 +664,19 @@ TEST(test_wait_all) close(objs[1]); + event_args.manual = true; + event_args.signaled = true; + objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); + EXPECT_LE(0, objs[1]); + + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + check_sem_state(objs[0], 0, 3); + check_event_state(objs[1], 1, 1); + + close(objs[1]); + /* test waiting on the same object twice */ objs[1] = objs[0]; ret = wait_all(fd, 2, objs, 123, &index); @@ -716,12 +730,13 @@ static int wait_for_thread(pthread_t thread, unsigned int ms) TEST(wake_any) { + struct ntsync_event_args event_args = {0}; struct ntsync_mutex_args mutex_args = {0}; struct ntsync_wait_args wait_args = {0}; struct ntsync_sem_args sem_args = {0}; struct wait_args thread_args; + __u32 count, index, signaled; int objs[2], fd, ret; - __u32 count, index; pthread_t thread; fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); @@ -796,6 +811,94 @@ TEST(wake_any) EXPECT_EQ(0, thread_args.ret); EXPECT_EQ(1, wait_args.index); + close(objs[1]); + + /* test waking events */ + + event_args.manual = false; + event_args.signaled = false; + objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); + EXPECT_LE(0, objs[1]); + + wait_args.timeout = get_abs_timeout(1000); + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + ret = ioctl(objs[1], NTSYNC_IOC_EVENT_SET, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, signaled); + check_event_state(objs[1], 0, 0); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(1, wait_args.index); + + wait_args.timeout = get_abs_timeout(1000); + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + ret = ioctl(objs[1], NTSYNC_IOC_EVENT_PULSE, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, signaled); + check_event_state(objs[1], 0, 0); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(1, wait_args.index); + + close(objs[1]); + + event_args.manual = true; + event_args.signaled = false; + objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); + EXPECT_LE(0, objs[1]); + + wait_args.timeout = get_abs_timeout(1000); + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + ret = ioctl(objs[1], NTSYNC_IOC_EVENT_SET, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, signaled); + check_event_state(objs[1], 1, 1); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(1, wait_args.index); + + ret = ioctl(objs[1], NTSYNC_IOC_EVENT_RESET, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, signaled); + + 
wait_args.timeout = get_abs_timeout(1000); + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + ret = ioctl(objs[1], NTSYNC_IOC_EVENT_PULSE, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, signaled); + check_event_state(objs[1], 0, 1); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(1, wait_args.index); + /* delete an object while it's being waited on */ wait_args.timeout = get_abs_timeout(200); @@ -819,12 +922,14 @@ TEST(wake_any) TEST(wake_all) { + struct ntsync_event_args manual_event_args = {0}; + struct ntsync_event_args auto_event_args = {0}; struct ntsync_mutex_args mutex_args = {0}; struct ntsync_wait_args wait_args = {0}; struct ntsync_sem_args sem_args = {0}; struct wait_args thread_args; - int objs[2], fd, ret; - __u32 count, index; + __u32 count, index, signaled; + int objs[4], fd, ret; pthread_t thread; fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); @@ -840,9 +945,19 @@ TEST(wake_all) objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); EXPECT_LE(0, objs[1]); + manual_event_args.manual = true; + manual_event_args.signaled = true; + objs[2] = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &manual_event_args); + EXPECT_LE(0, objs[2]); + + auto_event_args.manual = false; + auto_event_args.signaled = true; + objs[3] = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &auto_event_args); + EXPECT_LE(0, objs[3]); + wait_args.timeout = get_abs_timeout(1000); wait_args.objs = (uintptr_t)objs; - wait_args.count = 2; + wait_args.count = 4; wait_args.owner = 456; thread_args.fd = fd; thread_args.args = &wait_args; @@ -876,12 +991,32 @@ TEST(wake_all) check_mutex_state(objs[1], 0, 0); + ret = ioctl(objs[2], NTSYNC_IOC_EVENT_RESET, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, signaled); + count = 2; ret = release_sem(objs[0], &count); EXPECT_EQ(0, ret); EXPECT_EQ(0, count); + check_sem_state(objs[0], 2, 3); + + ret = ioctl(objs[3], NTSYNC_IOC_EVENT_RESET, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, signaled); + + ret = ioctl(objs[2], NTSYNC_IOC_EVENT_SET, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, signaled); + + ret = ioctl(objs[3], NTSYNC_IOC_EVENT_SET, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, signaled); + check_sem_state(objs[0], 1, 3); check_mutex_state(objs[1], 1, 456); + check_event_state(objs[2], 1, 1); + check_event_state(objs[3], 0, 0); ret = wait_for_thread(thread, 100); EXPECT_EQ(0, ret); @@ -899,6 +1034,8 @@ close(objs[0]); close(objs[1]); + close(objs[2]); + close(objs[3]); ret = wait_for_thread(thread, 200); EXPECT_EQ(0, ret); From bc74faaeaa636c396b93787984a8f36b581ced1f Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:35:06 -0600 Subject: [PATCH 25/53] selftests: ntsync: Add tests for alertable waits. Test the "alert" functionality of NTSYNC_IOC_WAIT_ALL and NTSYNC_IOC_WAIT_ANY, when a wait is woken with an alert and when it is woken by an object.
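A sketch of the alert mechanism from the waiter's side (illustrative only, not part of the patch; "dev", "objs" and "alert_event" are assumed to be fds on one instance):

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/ntsync.h>

	static int alertable_wait(int dev, const int *objs, __u32 count,
				  __u32 owner, int alert_event, __u32 *index)
	{
		struct ntsync_wait_args args = {0};

		args.timeout = UINT64_MAX;	/* wait indefinitely */
		args.objs = (uintptr_t)objs;
		args.count = count;
		args.owner = owner;
		args.alert = alert_event;	/* event fd used as the alert */

		if (ioctl(dev, NTSYNC_IOC_WAIT_ANY, &args) < 0)
			return -1;

		/*
		 * The alert is checked only after the regular objects, so an
		 * index equal to count means the wait was woken by the alert
		 * rather than by one of the objects.
		 */
		*index = args.index;
		return 0;
	}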
Signed-off-by: Elizabeth Figura --- .../testing/selftests/drivers/ntsync/ntsync.c | 167 +++++++++++++++++- 1 file changed, 164 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/drivers/ntsync/ntsync.c b/tools/testing/selftests/drivers/ntsync/ntsync.c index 33348f0b621f29..72f078dde444be 100644 --- a/tools/testing/selftests/drivers/ntsync/ntsync.c +++ b/tools/testing/selftests/drivers/ntsync/ntsync.c @@ -95,7 +95,7 @@ static int read_event_state(int event, __u32 *signaled, __u32 *manual) }) static int wait_objs(int fd, unsigned long request, __u32 count, - const int *objs, __u32 owner, __u32 *index) + const int *objs, __u32 owner, int alert, __u32 *index) { struct ntsync_wait_args args = {0}; struct timespec timeout; @@ -108,6 +108,7 @@ static int wait_objs(int fd, unsigned long request, __u32 count, args.objs = (uintptr_t)objs; args.owner = owner; args.index = 0xdeadbeef; + args.alert = alert; ret = ioctl(fd, request, &args); *index = args.index; return ret; @@ -115,12 +116,26 @@ static int wait_objs(int fd, unsigned long request, __u32 count, static int wait_any(int fd, __u32 count, const int *objs, __u32 owner, __u32 *index) { - return wait_objs(fd, NTSYNC_IOC_WAIT_ANY, count, objs, owner, index); + return wait_objs(fd, NTSYNC_IOC_WAIT_ANY, count, objs, owner, 0, index); } static int wait_all(int fd, __u32 count, const int *objs, __u32 owner, __u32 *index) { - return wait_objs(fd, NTSYNC_IOC_WAIT_ALL, count, objs, owner, index); + return wait_objs(fd, NTSYNC_IOC_WAIT_ALL, count, objs, owner, 0, index); +} + +static int wait_any_alert(int fd, __u32 count, const int *objs, + __u32 owner, int alert, __u32 *index) +{ + return wait_objs(fd, NTSYNC_IOC_WAIT_ANY, + count, objs, owner, alert, index); +} + +static int wait_all_alert(int fd, __u32 count, const int *objs, + __u32 owner, int alert, __u32 *index) +{ + return wait_objs(fd, NTSYNC_IOC_WAIT_ALL, + count, objs, owner, alert, index); } TEST(semaphore_state) @@ -1045,4 +1060,150 @@ TEST(wake_all) close(fd); } +TEST(alert_any) +{ + struct ntsync_event_args event_args = {0}; + struct ntsync_sem_args sem_args = {0}; + __u32 index, count, signaled; + int objs[2], event, fd, ret; + + fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + sem_args.count = 0; + sem_args.max = 2; + objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_LE(0, objs[0]); + + sem_args.count = 1; + sem_args.max = 2; + objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_LE(0, objs[1]); + + event_args.manual = true; + event_args.signaled = true; + event = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); + EXPECT_LE(0, event); + + ret = wait_any_alert(fd, 0, NULL, 123, event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + + ret = ioctl(event, NTSYNC_IOC_EVENT_RESET, &signaled); + EXPECT_EQ(0, ret); + + ret = wait_any_alert(fd, 0, NULL, 123, event, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled); + EXPECT_EQ(0, ret); + + ret = wait_any_alert(fd, 2, objs, 123, event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, index); + + ret = wait_any_alert(fd, 2, objs, 123, event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, index); + + close(event); + + /* test with an auto-reset event */ + + event_args.manual = false; + event_args.signaled = true; + event = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); + EXPECT_LE(0, event); + + count = 1; + ret = release_sem(objs[0], &count); + EXPECT_EQ(0, ret); + + ret = wait_any_alert(fd, 2, objs, 123, event, 
&index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + + ret = wait_any_alert(fd, 2, objs, 123, event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, index); + + ret = wait_any_alert(fd, 2, objs, 123, event, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + close(event); + + close(objs[0]); + close(objs[1]); + + close(fd); +} + +TEST(alert_all) +{ + struct ntsync_event_args event_args = {0}; + struct ntsync_sem_args sem_args = {0}; + __u32 index, count, signaled; + int objs[2], event, fd, ret; + + fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, fd); + + sem_args.count = 2; + sem_args.max = 2; + objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_LE(0, objs[0]); + + sem_args.count = 1; + sem_args.max = 2; + objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); + EXPECT_LE(0, objs[1]); + + event_args.manual = true; + event_args.signaled = true; + event = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); + EXPECT_LE(0, event); + + ret = wait_all_alert(fd, 2, objs, 123, event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + + ret = wait_all_alert(fd, 2, objs, 123, event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, index); + + close(event); + + /* test with an auto-reset event */ + + event_args.manual = false; + event_args.signaled = true; + event = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); + EXPECT_LE(0, event); + + count = 2; + ret = release_sem(objs[1], &count); + EXPECT_EQ(0, ret); + + ret = wait_all_alert(fd, 2, objs, 123, event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + + ret = wait_all_alert(fd, 2, objs, 123, event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, index); + + ret = wait_all_alert(fd, 2, objs, 123, event, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + close(event); + + close(objs[0]); + close(objs[1]); + + close(fd); +} + TEST_HARNESS_MAIN From 89ff6f981f965eb55d983f8b81702f40ce281208 Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:35:07 -0600 Subject: [PATCH 26/53] selftests: ntsync: Add some tests for wakeup signaling via alerts. Expand the alert tests to cover alerting a thread mid-wait, to test that the relevant scheduling logic works correctly. 
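The mid-wait scenario itself reduces to the following sketch (hypothetical helper names, not part of the patch): one thread blocks in NTSYNC_IOC_WAIT_ANY with the alert field already set, and another thread signals the alert event:

	#include <pthread.h>
	#include <sys/ioctl.h>
	#include <linux/ntsync.h>

	struct waiter {
		int dev;
		struct ntsync_wait_args *args;	/* args->alert set by caller */
		int ret;
	};

	static void *waiter_thread(void *arg)
	{
		struct waiter *w = arg;

		/* Blocks until an object is acquired, the alert fires, or timeout. */
		w->ret = ioctl(w->dev, NTSYNC_IOC_WAIT_ANY, w->args);
		return NULL;
	}

	static void alert_mid_wait(int dev, struct ntsync_wait_args *args, int event)
	{
		struct waiter w = { .dev = dev, .args = args };
		__u32 prev;
		pthread_t t;

		pthread_create(&t, NULL, waiter_thread, &w);
		ioctl(event, NTSYNC_IOC_EVENT_SET, &prev);	/* wakes the waiter */
		pthread_join(t, NULL);
		/* On an alert wakeup, args->index == args->count. */
	}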
Signed-off-by: Elizabeth Figura --- .../testing/selftests/drivers/ntsync/ntsync.c | 62 +++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/tools/testing/selftests/drivers/ntsync/ntsync.c b/tools/testing/selftests/drivers/ntsync/ntsync.c index 72f078dde444be..298b279729aa86 100644 --- a/tools/testing/selftests/drivers/ntsync/ntsync.c +++ b/tools/testing/selftests/drivers/ntsync/ntsync.c @@ -1063,9 +1063,12 @@ TEST(wake_all) TEST(alert_any) { struct ntsync_event_args event_args = {0}; + struct ntsync_wait_args wait_args = {0}; struct ntsync_sem_args sem_args = {0}; __u32 index, count, signaled; + struct wait_args thread_args; int objs[2], event, fd, ret; + pthread_t thread; fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); ASSERT_LE(0, fd); @@ -1107,6 +1110,34 @@ TEST(alert_any) EXPECT_EQ(0, ret); EXPECT_EQ(2, index); + /* test wakeup via alert */ + + ret = ioctl(event, NTSYNC_IOC_EVENT_RESET, &signaled); + EXPECT_EQ(0, ret); + + wait_args.timeout = get_abs_timeout(1000); + wait_args.objs = (uintptr_t)objs; + wait_args.count = 2; + wait_args.owner = 123; + wait_args.index = 0xdeadbeef; + wait_args.alert = event; + thread_args.fd = fd; + thread_args.args = &wait_args; + thread_args.request = NTSYNC_IOC_WAIT_ANY; + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(2, wait_args.index); + close(event); /* test with an auto-reset event */ @@ -1143,9 +1174,12 @@ TEST(alert_any) TEST(alert_all) { struct ntsync_event_args event_args = {0}; + struct ntsync_wait_args wait_args = {0}; struct ntsync_sem_args sem_args = {0}; + struct wait_args thread_args; __u32 index, count, signaled; int objs[2], event, fd, ret; + pthread_t thread; fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); ASSERT_LE(0, fd); @@ -1173,6 +1207,34 @@ TEST(alert_all) EXPECT_EQ(0, ret); EXPECT_EQ(2, index); + /* test wakeup via alert */ + + ret = ioctl(event, NTSYNC_IOC_EVENT_RESET, &signaled); + EXPECT_EQ(0, ret); + + wait_args.timeout = get_abs_timeout(1000); + wait_args.objs = (uintptr_t)objs; + wait_args.count = 2; + wait_args.owner = 123; + wait_args.index = 0xdeadbeef; + wait_args.alert = event; + thread_args.fd = fd; + thread_args.args = &wait_args; + thread_args.request = NTSYNC_IOC_WAIT_ALL; + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + + ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(2, wait_args.index); + close(event); /* test with an auto-reset event */ From c43b4e751eeebf013ba91fb4120ef3ff0ca9c781 Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:35:08 -0600 Subject: [PATCH 27/53] selftests: ntsync: Add a stress test for contended waits. Test a more realistic usage pattern, and one with heavy contention, in order to actually exercise ntsync's internal synchronization. This test has several threads in a tight loop acquiring a mutex, modifying some shared data, and then releasing the mutex. At the end we check if the data is consistent. 
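The per-thread loop body amounts to the following sketch (illustrative only, not part of the patch; gettid() here assumes _GNU_SOURCE and glibc 2.30 or later):

	#define _GNU_SOURCE
	#include <stdint.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/ntsync.h>

	/* "dev" and "mutex" are assumed to be fds on one ntsync instance. */
	static void locked_increment(int dev, int mutex, unsigned int *counter)
	{
		struct ntsync_wait_args wait_args = {0};
		struct ntsync_mutex_args unlock_args = {0};

		wait_args.timeout = UINT64_MAX;
		wait_args.objs = (uintptr_t)&mutex;
		wait_args.count = 1;
		wait_args.owner = gettid();	/* nonzero, per-thread identifier */

		ioctl(dev, NTSYNC_IOC_WAIT_ANY, &wait_args);	/* acquire */

		++*counter;	/* critical section */

		unlock_args.owner = wait_args.owner;
		ioctl(mutex, NTSYNC_IOC_MUTEX_UNLOCK, &unlock_args);	/* release */
	}

With 4 threads each running 10000 such iterations, the counter must end at exactly 40000 if and only if the mutex provides mutual exclusion.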
Signed-off-by: Elizabeth Figura --- .../testing/selftests/drivers/ntsync/ntsync.c | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/tools/testing/selftests/drivers/ntsync/ntsync.c b/tools/testing/selftests/drivers/ntsync/ntsync.c index 298b279729aa86..3aad311574c44e 100644 --- a/tools/testing/selftests/drivers/ntsync/ntsync.c +++ b/tools/testing/selftests/drivers/ntsync/ntsync.c @@ -1268,4 +1268,76 @@ TEST(alert_all) close(fd); } +#define STRESS_LOOPS 10000 +#define STRESS_THREADS 4 + +static unsigned int stress_counter; +static int stress_device, stress_start_event, stress_mutex; + +static void *stress_thread(void *arg) +{ + struct ntsync_wait_args wait_args = {0}; + __u32 index, count, i; + int ret; + + wait_args.timeout = UINT64_MAX; + wait_args.count = 1; + wait_args.objs = (uintptr_t)&stress_start_event; + wait_args.owner = gettid(); + wait_args.index = 0xdeadbeef; + + ioctl(stress_device, NTSYNC_IOC_WAIT_ANY, &wait_args); + + wait_args.objs = (uintptr_t)&stress_mutex; + + for (i = 0; i < STRESS_LOOPS; ++i) { + ioctl(stress_device, NTSYNC_IOC_WAIT_ANY, &wait_args); + + ++stress_counter; + + unlock_mutex(stress_mutex, wait_args.owner, &count); + } + + return NULL; +} + +TEST(stress_wait) +{ + struct ntsync_event_args event_args; + struct ntsync_mutex_args mutex_args; + pthread_t threads[STRESS_THREADS]; + __u32 signaled, i; + int ret; + + stress_device = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); + ASSERT_LE(0, stress_device); + + mutex_args.owner = 0; + mutex_args.count = 0; + stress_mutex = ioctl(stress_device, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); + EXPECT_LE(0, stress_mutex); + + event_args.manual = 1; + event_args.signaled = 0; + stress_start_event = ioctl(stress_device, NTSYNC_IOC_CREATE_EVENT, &event_args); + EXPECT_LE(0, stress_start_event); + + for (i = 0; i < STRESS_THREADS; ++i) + pthread_create(&threads[i], NULL, stress_thread, NULL); + + ret = ioctl(stress_start_event, NTSYNC_IOC_EVENT_SET, &signaled); + EXPECT_EQ(0, ret); + + for (i = 0; i < STRESS_THREADS; ++i) { + ret = pthread_join(threads[i], NULL); + EXPECT_EQ(0, ret); + } + + EXPECT_EQ(STRESS_LOOPS * STRESS_THREADS, stress_counter); + + close(stress_start_event); + close(stress_mutex); + close(stress_device); +} + TEST_HARNESS_MAIN From ebdcd1d3b5f142787a8193ac731562e918d5df3e Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:35:09 -0600 Subject: [PATCH 28/53] maintainers: Add an entry for ntsync. Add myself as maintainer, supported by CodeWeavers. Signed-off-by: Elizabeth Figura --- MAINTAINERS | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index d0f18fdba068b0..13fb284b2feae8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16486,6 +16486,15 @@ T: git https://github.com/Paragon-Software-Group/linux-ntfs3.git F: Documentation/filesystems/ntfs3.rst F: fs/ntfs3/ +NTSYNC SYNCHRONIZATION PRIMITIVE DRIVER +M: Elizabeth Figura +L: wine-devel@winehq.org +S: Supported +F: Documentation/userspace-api/ntsync.rst +F: drivers/misc/ntsync.c +F: include/uapi/linux/ntsync.h +F: tools/testing/selftests/drivers/ntsync/ + NUBUS SUBSYSTEM M: Finn Thain L: linux-m68k@lists.linux-m68k.org From 6b32e214a723b20584cca544234670e09acd471b Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:35:10 -0600 Subject: [PATCH 29/53] docs: ntsync: Add documentation for the ntsync uAPI. Add an overall explanation of the driver architecture, and a complete and precise specification for its intended behaviour.
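As one worked example of the behaviour specified in the new document (illustrative only, not part of the patch), the owner-death protocol ties NTSYNC_IOC_MUTEX_KILL to the EOWNERDEAD result on a later wait:

	#include <errno.h>
	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/ntsync.h>

	/* "dev" and "mutex" are assumed to be fds on one ntsync instance. */
	static int adopt_abandoned_mutex(int dev, int mutex, __u32 dead_tid,
					 __u32 new_tid)
	{
		struct ntsync_wait_args args = {0};
		int objs[1] = { mutex };

		/* Mark the mutex abandoned; it becomes unowned and signaled. */
		if (ioctl(mutex, NTSYNC_IOC_MUTEX_KILL, &dead_tid) < 0)
			return -1;

		args.timeout = UINT64_MAX;
		args.objs = (uintptr_t)objs;
		args.count = 1;
		args.owner = new_tid;

		/*
		 * The next satisfied wait still acquires the mutex, but fails
		 * with EOWNERDEAD to report the abandonment to the new owner.
		 */
		if (ioctl(dev, NTSYNC_IOC_WAIT_ANY, &args) < 0 && errno == EOWNERDEAD)
			return 0;	/* acquired; previous owner died */
		return -1;
	}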
Signed-off-by: Elizabeth Figura --- Documentation/userspace-api/index.rst | 1 + Documentation/userspace-api/ntsync.rst | 385 +++++++++++++++++++++++++ 2 files changed, 386 insertions(+) create mode 100644 Documentation/userspace-api/ntsync.rst diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst index 274cc7546efc2a..9c1b15cd89ab0d 100644 --- a/Documentation/userspace-api/index.rst +++ b/Documentation/userspace-api/index.rst @@ -63,6 +63,7 @@ Everything else vduse futex2 perf_ring_buffer + ntsync .. only:: subproject and html diff --git a/Documentation/userspace-api/ntsync.rst b/Documentation/userspace-api/ntsync.rst new file mode 100644 index 00000000000000..25e7c4aef96813 --- /dev/null +++ b/Documentation/userspace-api/ntsync.rst @@ -0,0 +1,385 @@ +=================================== +NT synchronization primitive driver +=================================== + +This page documents the user-space API for the ntsync driver. + +ntsync is a support driver for emulation of NT synchronization +primitives by user-space NT emulators. It exists because implementation +in user-space, using existing tools, cannot match Windows performance +while offering accurate semantics. It is implemented entirely in +software, and does not drive any hardware device. + +This interface is meant as a compatibility tool only, and should not +be used for general synchronization. Instead use generic, versatile +interfaces such as futex(2) and poll(2). + +Synchronization primitives +========================== + +The ntsync driver exposes three types of synchronization primitives: +semaphores, mutexes, and events. + +A semaphore holds a single volatile 32-bit counter, and a static 32-bit +integer denoting the maximum value. It is considered signaled (that is, +can be acquired without contention, or will wake up a waiting thread) +when the counter is nonzero. The counter is decremented by one when a +wait is satisfied. Both the initial and maximum count are established +when the semaphore is created. + +A mutex holds a volatile 32-bit recursion count, and a volatile 32-bit +identifier denoting its owner. A mutex is considered signaled when its +owner is zero (indicating that it is not owned). The recursion count is +incremented when a wait is satisfied, and ownership is set to the given +identifier. + +A mutex also holds an internal flag denoting whether its previous owner +has died; such a mutex is said to be abandoned. Owner death is not +tracked automatically based on thread death, but rather must be +communicated using ``NTSYNC_IOC_MUTEX_KILL``. An abandoned mutex is +inherently considered unowned. + +Except for the "unowned" semantics of zero, the actual value of the +owner identifier is not interpreted by the ntsync driver at all. The +intended use is to store a thread identifier; however, the ntsync +driver does not actually validate that a calling thread provides +consistent or unique identifiers. + +An event is similar to a semaphore with a maximum count of one. It holds +a volatile boolean state denoting whether it is signaled or not. There +are two types of events, auto-reset and manual-reset. An auto-reset +event is designaled when a wait is satisfied; a manual-reset event is +not. The event type is specified when the event is created. + +Unless specified otherwise, all operations on an object are atomic and +totally ordered with respect to other operations on the same object. + +Objects are represented by files. 
When all file descriptors to an
+object are closed, that object is deleted.
+
+Char device
+===========
+
+The ntsync driver creates a single char device /dev/ntsync. Each file
+description opened on the device represents a unique instance intended
+to back an individual NT virtual machine. Objects created by one ntsync
+instance may only be used with other objects created by the same
+instance.
+
+ioctl reference
+===============
+
+All operations on the device are done through ioctls. There are four
+structures used in ioctl calls::
+
+   struct ntsync_sem_args {
+   	__u32 count;
+   	__u32 max;
+   };
+
+   struct ntsync_mutex_args {
+   	__u32 owner;
+   	__u32 count;
+   };
+
+   struct ntsync_event_args {
+   	__u32 signaled;
+   	__u32 manual;
+   };
+
+   struct ntsync_wait_args {
+   	__u64 timeout;
+   	__u64 objs;
+   	__u32 count;
+   	__u32 owner;
+   	__u32 index;
+   	__u32 alert;
+   	__u32 flags;
+   	__u32 pad;
+   };
+
+Depending on the ioctl, members of the structure may be used as input,
+output, or not at all.
+
+The ioctls on the device file are as follows:
+
+.. c:macro:: NTSYNC_IOC_CREATE_SEM
+
+  Create a semaphore object. Takes a pointer to struct
+  :c:type:`ntsync_sem_args`, which is used as follows:
+
+  .. list-table::
+
+     * - ``count``
+       - Initial count of the semaphore.
+     * - ``max``
+       - Maximum count of the semaphore.
+
+  Fails with ``EINVAL`` if ``count`` is greater than ``max``.
+  On success, returns a file descriptor for the created semaphore.
+
+.. c:macro:: NTSYNC_IOC_CREATE_MUTEX
+
+  Create a mutex object. Takes a pointer to struct
+  :c:type:`ntsync_mutex_args`, which is used as follows:
+
+  .. list-table::
+
+     * - ``count``
+       - Initial recursion count of the mutex.
+     * - ``owner``
+       - Initial owner of the mutex.
+
+  If ``owner`` is nonzero and ``count`` is zero, or if ``owner`` is
+  zero and ``count`` is nonzero, the function fails with ``EINVAL``.
+  On success, returns a file descriptor for the created mutex.
+
+.. c:macro:: NTSYNC_IOC_CREATE_EVENT
+
+  Create an event object. Takes a pointer to struct
+  :c:type:`ntsync_event_args`, which is used as follows:
+
+  .. list-table::
+
+     * - ``signaled``
+       - If nonzero, the event is initially signaled, otherwise
+         nonsignaled.
+     * - ``manual``
+       - If nonzero, the event is a manual-reset event, otherwise
+         auto-reset.
+
+  On success, returns a file descriptor for the created event.
+
+The ioctls on the individual objects are as follows:
+
+.. c:macro:: NTSYNC_IOC_SEM_RELEASE
+
+  Release a semaphore object. Takes a pointer to a 32-bit integer,
+  which on input holds the count to be added to the semaphore, and on
+  output contains its previous count.
+
+  If adding to the semaphore's current count would raise the latter
+  past the semaphore's maximum count, the ioctl fails with
+  ``EOVERFLOW`` and the semaphore is not affected. If raising the
+  semaphore's count causes it to become signaled, eligible threads
+  waiting on this semaphore will be woken and the semaphore's count
+  decremented appropriately.
+
+.. c:macro:: NTSYNC_IOC_MUTEX_UNLOCK
+
+  Release a mutex object. Takes a pointer to struct
+  :c:type:`ntsync_mutex_args`, which is used as follows:
+
+  .. list-table::
+
+     * - ``owner``
+       - Specifies the owner trying to release this mutex.
+     * - ``count``
+       - On output, contains the previous recursion count.
+
+  If ``owner`` is zero, the ioctl fails with ``EINVAL``. If ``owner``
+  is not the current owner of the mutex, the ioctl fails with
+  ``EPERM``.
+
+  The mutex's count will be decremented by one.
If decrementing the
+  mutex's count causes it to become zero, the mutex is marked as
+  unowned and signaled, and eligible threads waiting on it will be
+  woken as appropriate.
+
+.. c:macro:: NTSYNC_IOC_EVENT_SET
+
+  Signal an event object. Takes a pointer to a 32-bit integer, which on
+  output contains the previous state of the event.
+
+  Eligible threads will be woken, and auto-reset events will be
+  designaled appropriately.
+
+.. c:macro:: NTSYNC_IOC_EVENT_RESET
+
+  Designal an event object. Takes a pointer to a 32-bit integer, which
+  on output contains the previous state of the event.
+
+.. c:macro:: NTSYNC_IOC_EVENT_PULSE
+
+  Wake threads waiting on an event object while leaving it in an
+  unsignaled state. Takes a pointer to a 32-bit integer, which on
+  output contains the previous state of the event.
+
+  A pulse operation can be thought of as a set followed by a reset,
+  performed as a single atomic operation. If two threads are waiting on
+  an auto-reset event which is pulsed, only one will be woken. If two
+  threads are waiting on a manual-reset event which is pulsed, both will
+  be woken. However, in both cases, the event will be unsignaled
+  afterwards, and a simultaneous read operation will always report the
+  event as unsignaled.
+
+.. c:macro:: NTSYNC_IOC_SEM_READ
+
+  Read the current state of a semaphore object. Takes a pointer to
+  struct :c:type:`ntsync_sem_args`, which is used as follows:
+
+  .. list-table::
+
+     * - ``count``
+       - On output, contains the current count of the semaphore.
+     * - ``max``
+       - On output, contains the maximum count of the semaphore.
+
+.. c:macro:: NTSYNC_IOC_MUTEX_READ
+
+  Read the current state of a mutex object. Takes a pointer to struct
+  :c:type:`ntsync_mutex_args`, which is used as follows:
+
+  .. list-table::
+
+     * - ``owner``
+       - On output, contains the current owner of the mutex, or zero
+         if the mutex is not currently owned.
+     * - ``count``
+       - On output, contains the current recursion count of the mutex.
+
+  If the mutex is marked as abandoned, the function fails with
+  ``EOWNERDEAD``. In this case, ``count`` and ``owner`` are set to
+  zero.
+
+.. c:macro:: NTSYNC_IOC_EVENT_READ
+
+  Read the current state of an event object. Takes a pointer to struct
+  :c:type:`ntsync_event_args`, which is used as follows:
+
+  .. list-table::
+
+     * - ``signaled``
+       - On output, contains the current state of the event.
+     * - ``manual``
+       - On output, contains 1 if the event is a manual-reset event,
+         and 0 otherwise.
+
+.. c:macro:: NTSYNC_IOC_MUTEX_KILL
+
+  Mark a mutex as unowned and abandoned if it is owned by the given
+  owner. Takes an input-only pointer to a 32-bit integer denoting the
+  owner. If the owner is zero, the ioctl fails with ``EINVAL``. If the
+  owner does not own the mutex, the function fails with ``EPERM``.
+
+  Eligible threads waiting on the mutex will be woken as appropriate
+  (and such waits will fail with ``EOWNERDEAD``, as described below).
+
+.. c:macro:: NTSYNC_IOC_WAIT_ANY
+
+  Poll on any of a list of objects, atomically acquiring at most one.
+  Takes a pointer to struct :c:type:`ntsync_wait_args`, which is
+  used as follows:
+
+  .. list-table::
+
+     * - ``timeout``
+       - Absolute timeout in nanoseconds. If ``NTSYNC_WAIT_REALTIME``
+         is set, the timeout is measured against the REALTIME clock;
+         otherwise it is measured against the MONOTONIC clock. If the
+         timeout is equal to or earlier than the current time, the
+         function returns immediately without sleeping.
If ``timeout`` + is U64_MAX, the function will sleep until an object is + signaled, and will not fail with ``ETIMEDOUT``. + * - ``objs`` + - Pointer to an array of ``count`` file descriptors + (specified as an integer so that the structure has the same + size regardless of architecture). If any object is + invalid, the function fails with ``EINVAL``. + * - ``count`` + - Number of objects specified in the ``objs`` array. + If greater than ``NTSYNC_MAX_WAIT_COUNT``, the function fails + with ``EINVAL``. + * - ``owner`` + - Mutex owner identifier. If any object in ``objs`` is a mutex, + the ioctl will attempt to acquire that mutex on behalf of + ``owner``. If ``owner`` is zero, the ioctl fails with + ``EINVAL``. + * - ``index`` + - On success, contains the index (into ``objs``) of the object + which was signaled. If ``alert`` was signaled instead, + this contains ``count``. + * - ``alert`` + - Optional event object file descriptor. If nonzero, this + specifies an "alert" event object which, if signaled, will + terminate the wait. If nonzero, the identifier must point to a + valid event. + * - ``flags`` + - Zero or more flags. Currently the only flag is + ``NTSYNC_WAIT_REALTIME``, which causes the timeout to be + measured against the REALTIME clock instead of MONOTONIC. + * - ``pad`` + - Unused, must be set to zero. + + This function attempts to acquire one of the given objects. If unable + to do so, it sleeps until an object becomes signaled, subsequently + acquiring it, or the timeout expires. In the latter case the ioctl + fails with ``ETIMEDOUT``. The function only acquires one object, even + if multiple objects are signaled. + + A semaphore is considered to be signaled if its count is nonzero, and + is acquired by decrementing its count by one. A mutex is considered + to be signaled if it is unowned or if its owner matches the ``owner`` + argument, and is acquired by incrementing its recursion count by one + and setting its owner to the ``owner`` argument. An auto-reset event + is acquired by designaling it; a manual-reset event is not affected + by acquisition. + + Acquisition is atomic and totally ordered with respect to other + operations on the same object. If two wait operations (with different + ``owner`` identifiers) are queued on the same mutex, only one is + signaled. If two wait operations are queued on the same semaphore, + and a value of one is posted to it, only one is signaled. + + If an abandoned mutex is acquired, the ioctl fails with + ``EOWNERDEAD``. Although this is a failure return, the function may + otherwise be considered successful. The mutex is marked as owned by + the given owner (with a recursion count of 1) and as no longer + abandoned, and ``index`` is still set to the index of the mutex. + + The ``alert`` argument is an "extra" event which can terminate the + wait, independently of all other objects. + + It is valid to pass the same object more than once, including by + passing the same event in the ``objs`` array and in ``alert``. If a + wakeup occurs due to that object being signaled, ``index`` is set to + the lowest index corresponding to that object. + + The function may fail with ``EINTR`` if a signal is received. + +.. c:macro:: NTSYNC_IOC_WAIT_ALL + + Poll on a list of objects, atomically acquiring all of them. Takes a + pointer to struct :c:type:`ntsync_wait_args`, which is used + identically to ``NTSYNC_IOC_WAIT_ANY``, except that ``index`` is + always filled with zero on success if not woken via alert. 
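+
+  For illustration, a minimal user-space sketch of such a wait follows
+  (either wait ioctl is invoked the same way). This example is not part
+  of the specification; error handling is omitted, and the ``dev`` and
+  ``sem`` file descriptors are assumed to have been created as
+  described above::
+
+      struct ntsync_wait_args wait_args = {0};
+
+      wait_args.timeout = UINT64_MAX;       /* sleep until signaled */
+      wait_args.objs = (uintptr_t)&sem;     /* array of one object fd */
+      wait_args.count = 1;
+      wait_args.owner = gettid();           /* must be nonzero */
+
+      ret = ioctl(dev, NTSYNC_IOC_WAIT_ANY, &wait_args);
+      /* on success, wait_args.index is 0, the index of the semaphore */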
+ + This function attempts to simultaneously acquire all of the given + objects. If unable to do so, it sleeps until all objects become + simultaneously signaled, subsequently acquiring them, or the timeout + expires. In the latter case the ioctl fails with ``ETIMEDOUT`` and no + objects are modified. + + Objects may become signaled and subsequently designaled (through + acquisition by other threads) while this thread is sleeping. Only + once all objects are simultaneously signaled does the ioctl acquire + them and return. The entire acquisition is atomic and totally ordered + with respect to other operations on any of the given objects. + + If an abandoned mutex is acquired, the ioctl fails with + ``EOWNERDEAD``. Similarly to ``NTSYNC_IOC_WAIT_ANY``, all objects are + nevertheless marked as acquired. Note that if multiple mutex objects + are specified, there is no way to know which were marked as + abandoned. + + As with "any" waits, the ``alert`` argument is an "extra" event which + can terminate the wait. Critically, however, an "all" wait will + succeed if all members in ``objs`` are signaled, *or* if ``alert`` is + signaled. In the latter case ``index`` will be set to ``count``. As + with "any" waits, if both conditions are filled, the former takes + priority, and objects in ``objs`` will be acquired. + + Unlike ``NTSYNC_IOC_WAIT_ANY``, it is not valid to pass the same + object more than once, nor is it valid to pass the same object in + ``objs`` and in ``alert``. If this is attempted, the function fails + with ``EINVAL``. From 0618f376ba4677529a34d85124da5b8124cbe4a6 Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:35:11 -0600 Subject: [PATCH 30/53] ntsync: No longer depend on BROKEN. f5b335dc025cfee90957efa90dc72fada0d5abb4 ("misc: ntsync: mark driver as "broken" to prevent from building") was committed to avoid the driver being used while only part of its functionality was released. Since the rest of the functionality has now been committed, revert this. Signed-off-by: Elizabeth Figura --- drivers/misc/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 3fe7e2a9bd294d..6c8b999a5e0853 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -517,7 +517,6 @@ config OPEN_DICE config NTSYNC tristate "NT synchronization primitive emulation" - depends on BROKEN help This module provides kernel support for emulation of Windows NT synchronization primitives. It is not a hardware driver. From 9a742d7ca2cdec4776625b70e4938e6b3527be1e Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Wed, 7 Dec 2016 21:13:16 +1100 Subject: [PATCH 31/53] Make threaded IRQs optionally the default which can be disabled. 
Signed-off-by: Kai Krakow --- include/linux/interrupt.h | 3 +++ kernel/irq/Kconfig | 17 +++++++++++++++++ kernel/irq/manage.c | 11 +++++++++++ 3 files changed, 31 insertions(+) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index b378fbf885ce37..98ddeddddca004 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -516,6 +516,9 @@ extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, #ifdef CONFIG_IRQ_FORCED_THREADING # ifdef CONFIG_PREEMPT_RT # define force_irqthreads() (true) +# elif defined(CONFIG_FORCE_IRQ_THREADING) +DECLARE_STATIC_KEY_TRUE(force_irqthreads_key); +# define force_irqthreads() (static_branch_likely(&force_irqthreads_key)) # else DECLARE_STATIC_KEY_FALSE(force_irqthreads_key); # define force_irqthreads() (static_branch_unlikely(&force_irqthreads_key)) diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 529adb1f58593c..72aae01fc431b4 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -116,6 +116,23 @@ config GENERIC_IRQ_STAT_SNAPSHOT config IRQ_FORCED_THREADING bool +config FORCE_IRQ_THREADING + bool "Make IRQ threading compulsory" + depends on IRQ_FORCED_THREADING + default n + help + + Make IRQ threading mandatory for any IRQ handlers that support it + instead of being optional and requiring the threadirqs kernel + parameter. Instead they can be optionally disabled with the + nothreadirqs kernel parameter. + + Enabling this may make some architectures not boot with runqueue + sharing and MuQSS. + + Enable if you are building for a desktop or low latency system, + otherwise say N. + config SPARSE_IRQ bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ help diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f0803d6bd29698..f2dc23a9ece95b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -25,7 +25,18 @@ #include "internals.h" #if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT) +#ifdef CONFIG_FORCE_IRQ_THREADING +DEFINE_STATIC_KEY_TRUE(force_irqthreads_key); +#else DEFINE_STATIC_KEY_FALSE(force_irqthreads_key); +#endif + +static int __init setup_noforced_irqthreads(char *arg) +{ + static_branch_disable(&force_irqthreads_key); + return 0; +} +early_param("nothreadirqs", setup_noforced_irqthreads); static int __init setup_forced_irqthreads(char *arg) { From 6c7919e36ba840a56e1d8b206e98055a289c512f Mon Sep 17 00:00:00 2001 From: Paul Gofman Date: Wed, 6 May 2020 14:37:44 +0300 Subject: [PATCH 32/53] mm: Support soft dirty flag reset for VA range. 
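A hedged sketch of the user-space side this enables (the helper name is
hypothetical; the 17-byte format matches the parsing added below):

	#include <string.h>
	#include <unistd.h>

	/* Write '6' plus two native unsigned longs to /proc/<pid>/clear_refs
	 * to clear soft-dirty bits for the range [start, end) only.
	 */
	static int clear_soft_dirty_range(int clear_refs_fd,
					  unsigned long start,
					  unsigned long end)
	{
		char buf[17];

		buf[0] = '6';
		memcpy(buf + 1, &start, sizeof(start));
		memcpy(buf + 9, &end, sizeof(end));
		return write(clear_refs_fd, buf, sizeof(buf)) == sizeof(buf) ? 0 : -1;
	}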
v2: ported from 6.1 to 6.6 Signed-off-by: Kai Krakow --- fs/proc/task_mmu.c | 129 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 103 insertions(+), 26 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 536b7dc4538182..6c115f041d0147 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1363,6 +1363,8 @@ enum clear_refs_types { struct clear_refs_private { enum clear_refs_types type; + unsigned long start, end; + bool clear_range; }; #ifdef CONFIG_MEM_SOFT_DIRTY @@ -1454,6 +1456,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, spinlock_t *ptl; struct folio *folio; + BUG_ON(addr < cp->start || end > cp->end); + ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { if (cp->type == CLEAR_REFS_SOFT_DIRTY) { @@ -1511,9 +1515,11 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end, struct clear_refs_private *cp = walk->private; struct vm_area_struct *vma = walk->vma; - if (vma->vm_flags & VM_PFNMAP) + if (!cp->clear_range && (vma->vm_flags & VM_PFNMAP)) return 1; + BUG_ON(start < cp->start || end > cp->end); + /* * Writing 1 to /proc/pid/clear_refs affects all pages. * Writing 2 to /proc/pid/clear_refs only affects anonymous pages. @@ -1537,10 +1543,12 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; - char buffer[PROC_NUMBUF] = {}; + char buffer[18] = {}; struct mm_struct *mm; struct vm_area_struct *vma; enum clear_refs_types type; + unsigned long start, end; + bool clear_range; int itype; int rv; @@ -1548,12 +1556,34 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; - rv = kstrtoint(strstrip(buffer), 10, &itype); - if (rv < 0) - return rv; - type = (enum clear_refs_types)itype; - if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) - return -EINVAL; + + if (buffer[0] == '6') + { + static int once; + + if (!once++) + printk(KERN_DEBUG "task_mmu: Using POC clear refs range implementation.\n"); + + if (count != 17) + return -EINVAL; + + type = CLEAR_REFS_SOFT_DIRTY; + start = *(unsigned long *)(buffer + 1); + end = *(unsigned long *)(buffer + 1 + 8); + } + else + { + rv = kstrtoint(strstrip(buffer), 10, &itype); + if (rv < 0) + return rv; + type = (enum clear_refs_types)itype; + + if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) + return -EINVAL; + + start = 0; + end = -1UL; + } task = get_proc_task(file_inode(file)); if (!task) @@ -1566,40 +1596,86 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, .type = type, }; - if (mmap_write_lock_killable(mm)) { - count = -EINTR; - goto out_mm; + if (start || end != -1UL) + { + start = min(start, -1UL) & PAGE_MASK; + end = min(end, -1UL) & PAGE_MASK; + + if (start >= end) + { + count = -EINVAL; + goto out_mm; + } + clear_range = true; } + else + { + clear_range = false; + } + + cp.start = start; + cp.end = end; + cp.clear_range = clear_range; + if (type == CLEAR_REFS_MM_HIWATER_RSS) { + if (mmap_write_lock_killable(mm)) { + count = -EINTR; + goto out_mm; + } + /* * Writing 5 to /proc/pid/clear_refs resets the peak * resident set size to this mm's current rss value. 
*/ reset_mm_hiwater_rss(mm); - goto out_unlock; + mmap_write_unlock(mm); + goto out_mm; } if (type == CLEAR_REFS_SOFT_DIRTY) { - for_each_vma(vmi, vma) { - if (!(vma->vm_flags & VM_SOFTDIRTY)) - continue; - vm_flags_clear(vma, VM_SOFTDIRTY); - vma_set_page_prot(vma); + if (mmap_read_lock_killable(mm)) { + count = -EINTR; + goto out_mm; } - + if (!clear_range) + for_each_vma(vmi, vma) { + if (!(vma->vm_flags & VM_SOFTDIRTY)) + continue; + mmap_read_unlock(mm); + if (mmap_write_lock_killable(mm)) { + count = -EINTR; + goto out_mm; + } + for_each_vma(vmi, vma) { + vm_flags_clear(vma, VM_SOFTDIRTY); + vma_set_page_prot(vma); + } + mmap_write_downgrade(mm); + break; + } inc_tlb_flush_pending(mm); mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, - 0, mm, 0, -1UL); + 0, mm, start, end); mmu_notifier_invalidate_range_start(&range); } - walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp); + else + { + if (mmap_write_lock_killable(mm)) { + count = -EINTR; + goto out_mm; + } + } + walk_page_range(mm, start, end == -1UL ? -1 : end, &clear_refs_walk_ops, &cp); if (type == CLEAR_REFS_SOFT_DIRTY) { mmu_notifier_invalidate_range_end(&range); flush_tlb_mm(mm); dec_tlb_flush_pending(mm); + mmap_read_unlock(mm); + } + else + { + mmap_write_unlock(mm); } -out_unlock: - mmap_write_unlock(mm); out_mm: mmput(mm); } @@ -1631,6 +1707,7 @@ struct pagemapread { #define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0) #define PM_SOFT_DIRTY BIT_ULL(55) #define PM_MMAP_EXCLUSIVE BIT_ULL(56) +#define PM_SOFT_DIRTY_PAGE BIT_ULL(57) #define PM_UFFD_WP BIT_ULL(57) #define PM_FILE BIT_ULL(61) #define PM_SWAP BIT_ULL(62) @@ -1704,13 +1781,13 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, flags |= PM_PRESENT; page = vm_normal_page(vma, addr, pte); if (pte_soft_dirty(pte)) - flags |= PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; if (pte_uffd_wp(pte)) flags |= PM_UFFD_WP; } else if (is_swap_pte(pte)) { swp_entry_t entry; if (pte_swp_soft_dirty(pte)) - flags |= PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; if (pte_swp_uffd_wp(pte)) flags |= PM_UFFD_WP; entry = pte_to_swp_entry(pte); @@ -1774,7 +1851,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, flags |= PM_PRESENT; if (pmd_soft_dirty(pmd)) - flags |= PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; if (pmd_uffd_wp(pmd)) flags |= PM_UFFD_WP; if (pm->show_pfn) @@ -1795,7 +1872,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, } flags |= PM_SWAP; if (pmd_swp_soft_dirty(pmd)) - flags |= PM_SOFT_DIRTY; + flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; if (pmd_swp_uffd_wp(pmd)) flags |= PM_UFFD_WP; VM_BUG_ON(!is_pmd_migration_entry(pmd)); From d5b947eb2035ebe0ffb8ce6ce9687844b3e260c7 Mon Sep 17 00:00:00 2001 From: Paul Gofman Date: Thu, 7 May 2020 14:05:31 +0300 Subject: [PATCH 33/53] mm: Support soft dirty flag read with reset. 
v2: ported from 6.1 to 6.6 Signed-off-by: Kai Krakow --- fs/proc/base.c | 3 + fs/proc/internal.h | 1 + fs/proc/task_mmu.c | 139 +++++++++++++++++++++++++++++++++++++++------ 3 files changed, 127 insertions(+), 16 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index a2541f5204af06..5cb8e097ce1fc3 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3349,6 +3349,9 @@ static const struct pid_entry tgid_base_stuff[] = { REG("smaps", S_IRUGO, proc_pid_smaps_operations), REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), +#ifdef CONFIG_MEM_SOFT_DIRTY + REG("pagemap_reset", S_IRUSR, proc_pagemap_reset_operations), +#endif #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 4e0c5b57ffdbb8..7676322518cfb9 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -349,6 +349,7 @@ extern const struct file_operations proc_pid_smaps_operations; extern const struct file_operations proc_pid_smaps_rollup_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; +extern const struct file_operations proc_pagemap_reset_operations; extern unsigned long task_vsize(struct mm_struct *); extern unsigned long task_statm(struct mm_struct *, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6c115f041d0147..af4fe0b299840b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1385,7 +1385,7 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, return folio_maybe_dma_pinned(folio); } -static inline void clear_soft_dirty(struct vm_area_struct *vma, +static inline bool clear_soft_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { /* @@ -1395,37 +1395,46 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, * of how soft-dirty works. 
*/ pte_t ptent = ptep_get(pte); + bool ret = false; if (pte_present(ptent)) { pte_t old_pte; if (pte_is_pinned(vma, addr, ptent)) - return; + return ret; old_pte = ptep_modify_prot_start(vma, addr, pte); + ret = pte_soft_dirty(old_pte); ptent = pte_wrprotect(old_pte); ptent = pte_clear_soft_dirty(ptent); ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); } else if (is_swap_pte(ptent)) { + ret = pte_swp_soft_dirty(ptent); ptent = pte_swp_clear_soft_dirty(ptent); set_pte_at(vma->vm_mm, addr, pte, ptent); } + return ret; } #else -static inline void clear_soft_dirty(struct vm_area_struct *vma, +static inline bool clear_soft_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { + return false; } #endif #if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE) -static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, +static inline bool clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { pmd_t old, pmd = *pmdp; + bool ret = false; if (pmd_present(pmd)) { /* See comment in change_huge_pmd() */ old = pmdp_invalidate(vma, addr, pmdp); + + ret = pmd_soft_dirty(old); + if (pmd_dirty(old)) pmd = pmd_mkdirty(pmd); if (pmd_young(old)) @@ -1436,14 +1445,17 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { + ret = pmd_swp_soft_dirty(pmd); pmd = pmd_swp_clear_soft_dirty(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } + return ret; } #else -static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, +static inline bool clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { + return false; } #endif @@ -1697,6 +1709,7 @@ struct pagemapread { int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ pagemap_entry_t *buffer; bool show_pfn; + bool reset; }; #define PAGEMAP_WALK_SIZE (PMD_SIZE) @@ -1728,6 +1741,14 @@ static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm) return 0; } +static int add_addr_to_pagemap(unsigned long addr, struct pagemapread *pm) +{ + ((unsigned long *)pm->buffer)[pm->pos++] = addr; + if (pm->pos >= pm->len) + return PM_END_OF_BUFFER; + return 0; +} + static int pagemap_pte_hole(unsigned long start, unsigned long end, __always_unused int depth, struct mm_walk *walk) { @@ -1735,6 +1756,9 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, unsigned long addr = start; int err = 0; + if (pm->reset) + goto out; + while (addr < end) { struct vm_area_struct *vma = find_vma(walk->mm, addr); pagemap_entry_t pme = make_pme(0, 0); @@ -1843,6 +1867,20 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, struct page *page = NULL; struct folio *folio = NULL; + if (pm->reset) + { + if (clear_soft_dirty_pmd(vma, addr, pmdp)) + { + for (; addr != end; addr += PAGE_SIZE) + { + err = add_addr_to_pagemap(addr, pm); + if (err) + break; + } + } + goto trans_huge_done; + } + if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; @@ -1905,6 +1943,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, frame += (1 << MAX_SWAPFILES_SHIFT); } } +trans_huge_done: spin_unlock(ptl); return err; } @@ -1920,10 +1959,18 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, return err; } for (; addr < end; pte++, addr += PAGE_SIZE) { - pagemap_entry_t pme; + if (pm->reset) + { + if (clear_soft_dirty(vma, addr, pte)) + err = add_addr_to_pagemap(addr, pm); + } + else 
+ { + pagemap_entry_t pme; - pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); - err = add_to_pagemap(&pme, pm); + pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); + err = add_to_pagemap(&pme, pm); + } if (err) break; } @@ -2023,8 +2070,8 @@ static const struct mm_walk_ops pagemap_ops = { * determine which areas of memory are actually mapped and llseek to * skip over unmapped regions. */ -static ssize_t pagemap_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) +static ssize_t do_pagemap_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos, bool reset) { struct mm_struct *mm = file->private_data; struct pagemapread pm; @@ -2033,6 +2080,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, unsigned long start_vaddr; unsigned long end_vaddr; int ret = 0, copied = 0; + struct mmu_notifier_range range; + size_t buffer_len; if (!mm || !mmget_not_zero(mm)) goto out; @@ -2048,19 +2097,38 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, /* do not disclose physical addresses: attack vector */ pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN); + pm.reset = reset; + + buffer_len = min(PAGEMAP_WALK_SIZE >> PAGE_SHIFT, count / PM_ENTRY_BYTES); - pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); - pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL); + pm.buffer = kmalloc_array(buffer_len, PM_ENTRY_BYTES, GFP_KERNEL); ret = -ENOMEM; if (!pm.buffer) goto out_mm; src = *ppos; svpfn = src / PM_ENTRY_BYTES; - end_vaddr = mm->task_size; + + start_vaddr = svpfn << PAGE_SHIFT; + + if (reset) + { + if (count < sizeof(end_vaddr)) + { + ret = -EINVAL; + goto out_mm; + } + if (copy_from_user(&end_vaddr, buf, sizeof(end_vaddr))) + return -EFAULT; + end_vaddr = min(end_vaddr, mm->task_size); + } + else + { + end_vaddr = mm->task_size; + start_vaddr = end_vaddr; + } /* watch out for wraparound */ - start_vaddr = end_vaddr; if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) { unsigned long end; @@ -2085,18 +2153,35 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, unsigned long end; pm.pos = 0; - end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; + pm.len = min(buffer_len, count / PM_ENTRY_BYTES); + + end = reset ? end_vaddr : (start_vaddr + (pm.len << PAGE_SHIFT)); /* overflow ? */ if (end < start_vaddr || end > end_vaddr) end = end_vaddr; + ret = mmap_read_lock_killable(mm); if (ret) goto out_free; + + if (reset) + { + inc_tlb_flush_pending(mm); + mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, + 0, mm, start_vaddr, end); + mmu_notifier_invalidate_range_start(&range); + } ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm); + if (reset) + { + mmu_notifier_invalidate_range_end(&range); + flush_tlb_mm(mm); + dec_tlb_flush_pending(mm); + } mmap_read_unlock(mm); - start_vaddr = end; len = min(count, PM_ENTRY_BYTES * pm.pos); + BUG_ON(ret && ret != PM_END_OF_BUFFER); if (copy_to_user(buf, pm.buffer, len)) { ret = -EFAULT; goto out_free; @@ -2104,6 +2189,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, copied += len; buf += len; count -= len; + + start_vaddr = reset && pm.pos == pm.len ? 
((unsigned long *)pm.buffer)[pm.pos - 1] + PAGE_SIZE : end;
 	}
 	*ppos += copied;
 	if (!ret || ret == PM_END_OF_BUFFER)
@@ -2117,6 +2204,18 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 	return ret;
 }
 
+static ssize_t pagemap_read(struct file *file, char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	return do_pagemap_read(file, buf, count, ppos, false);
+}
+
+static ssize_t pagemap_reset_read(struct file *file, char __user *buf,
+				  size_t count, loff_t *ppos)
+{
+	return do_pagemap_read(file, buf, count, ppos, true);
+}
+
 static int pagemap_open(struct inode *inode, struct file *file)
 {
 	struct mm_struct *mm;
@@ -2908,6 +3007,14 @@ const struct file_operations proc_pagemap_operations = {
 	.unlocked_ioctl	= do_pagemap_cmd,
 	.compat_ioctl	= do_pagemap_cmd,
 };
+
+const struct file_operations proc_pagemap_reset_operations = {
+	.llseek		= mem_lseek, /* borrow this */
+	.read		= pagemap_reset_read,
+	.open		= pagemap_open,
+	.release	= pagemap_release,
+};
+
 #endif /* CONFIG_PROC_PAGE_MONITOR */
 
 #ifdef CONFIG_NUMA

From 8a2d3bb1222d1c62f3a44ccd38c8d4137533b5df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Almeida?=
Date: Mon, 25 Oct 2021 09:49:42 -0300
Subject: [PATCH 34/53] futex: Add entry point for FUTEX_WAIT_MULTIPLE (opcode
 31)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add an option to wait on multiple futexes using the old interface, which
uses opcode 31 through the futex() syscall. Do that by just translating
the old interface to use the new code. This allows old and stable
versions of Proton to still use fsync in new kernel releases.

Signed-off-by: André Almeida
---
 include/uapi/linux/futex.h | 13 +++++++
 kernel/futex/syscalls.c    | 75 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 87 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
index d2ee625ea18900..45b7b27f4c137b 100644
--- a/include/uapi/linux/futex.h
+++ b/include/uapi/linux/futex.h
@@ -22,6 +22,7 @@
 #define FUTEX_WAIT_REQUEUE_PI	11
 #define FUTEX_CMP_REQUEUE_PI	12
 #define FUTEX_LOCK_PI2		13
+#define FUTEX_WAIT_MULTIPLE	31
 
 #define FUTEX_PRIVATE_FLAG	128
 #define FUTEX_CLOCK_REALTIME	256
@@ -93,6 +94,18 @@ struct futex_waitv {
 	__u32 __reserved;
 };
 
+/**
+ * struct futex_wait_block - Block of futexes to be waited for
+ * @uaddr: User address of the futex
+ * @val: Futex value expected by userspace
+ * @bitset: Bitset for the optional bitmasked wakeup
+ */
+struct futex_wait_block {
+	__u32 __user *uaddr;
+	__u32 val;
+	__u32 bitset;
+};
+
 /*
  * Support for robust futexes: the kernel cleans up held futexes at
  * thread exit time.
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index 4b6da9116aa6c3..12beb7e0ad0f34 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -138,6 +138,7 @@ static __always_inline bool futex_cmd_has_timeout(u32 cmd) case FUTEX_LOCK_PI2: case FUTEX_WAIT_BITSET: case FUTEX_WAIT_REQUEUE_PI: + case FUTEX_WAIT_MULTIPLE: return true; } return false; @@ -150,13 +151,79 @@ futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t) return -EINVAL; *t = timespec64_to_ktime(*ts); - if (cmd == FUTEX_WAIT) + if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) *t = ktime_add_safe(ktime_get(), *t); else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t); return 0; } +/** + * futex_read_wait_block - Read an array of futex_wait_block from userspace + * @uaddr: Userspace address of the block + * @count: Number of blocks to be read + * + * This function creates and allocate an array of futex_q (we zero it to + * initialize the fields) and then, for each futex_wait_block element from + * userspace, fill a futex_q element with proper values. + */ +inline struct futex_vector *futex_read_wait_block(u32 __user *uaddr, u32 count) +{ + unsigned int i; + struct futex_vector *futexv; + struct futex_wait_block fwb; + struct futex_wait_block __user *entry = + (struct futex_wait_block __user *)uaddr; + + if (!count || count > FUTEX_WAITV_MAX) + return ERR_PTR(-EINVAL); + + futexv = kcalloc(count, sizeof(*futexv), GFP_KERNEL); + if (!futexv) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < count; i++) { + if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { + kfree(futexv); + return ERR_PTR(-EFAULT); + } + + futexv[i].w.flags = FUTEX_32; + futexv[i].w.val = fwb.val; + futexv[i].w.uaddr = (uintptr_t) (fwb.uaddr); + futexv[i].q = futex_q_init; + } + + return futexv; +} + +int futex_wait_multiple(struct futex_vector *vs, unsigned int count, + struct hrtimer_sleeper *to); + +int futex_opcode_31(ktime_t *abs_time, u32 __user *uaddr, int count) +{ + int ret; + struct futex_vector *vs; + struct hrtimer_sleeper *to = NULL, timeout; + + to = futex_setup_timer(abs_time, &timeout, 0, 0); + + vs = futex_read_wait_block(uaddr, count); + + if (IS_ERR(vs)) + return PTR_ERR(vs); + + ret = futex_wait_multiple(vs, count, abs_time ? to : NULL); + kfree(vs); + + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } + + return ret; +} + SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, const struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, u32, val3) @@ -176,6 +243,9 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, tp = &t; } + if (cmd == FUTEX_WAIT_MULTIPLE) + return futex_opcode_31(tp, uaddr, val); + return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } @@ -506,6 +576,9 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, tp = &t; } + if (cmd == FUTEX_WAIT_MULTIPLE) + return futex_opcode_31(tp, uaddr, val); + return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } #endif /* CONFIG_COMPAT_32BIT_TIME */ From 08267306b53108aa010bce6dd0d0c212f650f961 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Sun, 8 Mar 2020 00:31:35 -0800 Subject: [PATCH 35/53] ZEN: Disable stack conservation for GCC There's plenty of room on the stack for a few more inlined bytes here and there. The measured stack usage at runtime is still safe without this, and performance is surely improved at a microscopic level, so remove it. 
Signed-off-by: Sultan Alsawaf
---
 Makefile | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/Makefile b/Makefile
index 18c2a7cf9e9134..44bdb0c03c1737 100644
--- a/Makefile
+++ b/Makefile
@@ -1003,11 +1003,6 @@ KBUILD_CFLAGS += -fno-strict-overflow
 # Make sure -fstack-check isn't enabled (like gentoo apparently did)
 KBUILD_CFLAGS += -fno-stack-check
 
-# conserve stack if available
-ifdef CONFIG_CC_IS_GCC
-KBUILD_CFLAGS += -fconserve-stack
-endif
-
 # Ensure compilers do not transform certain loops into calls to wcslen()
 KBUILD_CFLAGS += -fno-builtin-wcslen
 
From 9f5126d691ba1d10efd3ca5063aac5c374913ec7 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Thu, 2 Jun 2016 23:36:32 -0500
Subject: [PATCH 36/53] ZEN: Initialize ata before graphics

ATA init is the long pole in the boot process, and it's asynchronous.
Move the graphics init after it so that ata and graphics initialize
in parallel.

Signed-off-by: Kai Krakow
---
 drivers/Makefile | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/Makefile b/drivers/Makefile
index 45d1c3e630f754..4f5ab2429a7f07 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -64,14 +64,8 @@ obj-y += char/
 # iommu/ comes before gpu as gpu are using iommu controllers
 obj-y += iommu/
 
-# gpu/ comes after char for AGP vs DRM startup and after iommu
-obj-y += gpu/
-
 obj-$(CONFIG_CONNECTOR) += connector/
 
-# i810fb depends on char/agp/
-obj-$(CONFIG_FB_I810) += video/fbdev/i810/
-
 obj-$(CONFIG_PARPORT) += parport/
 obj-y += base/ block/ misc/ mfd/ nfc/
 obj-$(CONFIG_LIBNVDIMM) += nvdimm/
@@ -83,6 +77,13 @@ obj-y += macintosh/
 obj-y += scsi/
 obj-y += nvme/
 obj-$(CONFIG_ATA) += ata/
+
+# gpu/ comes after char for AGP vs DRM startup and after iommu
+obj-y += gpu/
+
+# i810fb depends on char/agp/
+obj-$(CONFIG_FB_I810) += video/fbdev/i810/
+
 obj-$(CONFIG_TARGET_CORE) += target/
 obj-$(CONFIG_MTD) += mtd/
 obj-$(CONFIG_SPI) += spi/

From 6b5e7b6f36c3f1b19a644fab91ca0323a1412dfe Mon Sep 17 00:00:00 2001
From: Kenny Levinsen
Date: Sun, 27 Dec 2020 14:43:13 +0000
Subject: [PATCH 37/53] ZEN: Input: evdev - use call_rcu when detaching client

Significant time was spent on synchronize_rcu in evdev_detach_client
when applications closed evdev devices. Switching VT away from a
graphical environment commonly leads to mass input device closures,
which could lead to noticeable delays on systems with many input
devices.

Replace synchronize_rcu with call_rcu, deferring reclaim of the evdev
client struct till after the RCU grace period instead of blocking the
calling application.
While this does not solve all slow evdev fd closures, it takes care of
a good portion of them, including this simple test:

	#include <fcntl.h>
	#include <unistd.h>

	int main(int argc, char *argv[])
	{
		int idx, fd;
		const char *path = "/dev/input/event0";
		for (idx = 0; idx < 1000; idx++) {
			if ((fd = open(path, O_RDWR)) == -1) {
				return -1;
			}
			close(fd);
		}
		return 0;
	}

Time to completion of above test when run locally:

	Before: 0m27.111s
	After:  0m0.018s

Signed-off-by: Kenny Levinsen
---
 drivers/input/evdev.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c
index b5cbb57ee5f600..a0f7fa1518c660 100644
--- a/drivers/input/evdev.c
+++ b/drivers/input/evdev.c
@@ -46,6 +46,7 @@ struct evdev_client {
 	struct fasync_struct *fasync;
 	struct evdev *evdev;
 	struct list_head node;
+	struct rcu_head rcu;
 	enum input_clock_type clk_type;
 	bool revoked;
 	unsigned long *evmasks[EV_CNT];
@@ -368,13 +369,22 @@ static void evdev_attach_client(struct evdev *evdev,
 	spin_unlock(&evdev->client_lock);
 }
 
+static void evdev_reclaim_client(struct rcu_head *rp)
+{
+	struct evdev_client *client = container_of(rp, struct evdev_client, rcu);
+	unsigned int i;
+	for (i = 0; i < EV_CNT; ++i)
+		bitmap_free(client->evmasks[i]);
+	kvfree(client);
+}
+
 static void evdev_detach_client(struct evdev *evdev,
 				struct evdev_client *client)
 {
 	spin_lock(&evdev->client_lock);
 	list_del_rcu(&client->node);
 	spin_unlock(&evdev->client_lock);
-	synchronize_rcu();
+	call_rcu(&client->rcu, evdev_reclaim_client);
 }
 
 static int evdev_open_device(struct evdev *evdev)
@@ -427,7 +437,6 @@ static int evdev_release(struct inode *inode, struct file *file)
 {
 	struct evdev_client *client = file->private_data;
 	struct evdev *evdev = client->evdev;
-	unsigned int i;
 
 	mutex_lock(&evdev->mutex);
 
@@ -439,11 +448,6 @@ static int evdev_release(struct inode *inode, struct file *file)
 
 	evdev_detach_client(evdev, client);
 
-	for (i = 0; i < EV_CNT; ++i)
-		bitmap_free(client->evmasks[i]);
-
-	kvfree(client);
-
 	evdev_close_device(evdev);
 
 	return 0;
@@ -486,7 +490,6 @@ static int evdev_open(struct inode *inode, struct file *file)
 
  err_free_client:
 	evdev_detach_client(evdev, client);
-	kvfree(client);
 	return error;
 }
 
From 072c94330c55444d2e8e54f17d4f9c02f116e719 Mon Sep 17 00:00:00 2001
From: Steven Barrett
Date: Thu, 27 Apr 2023 14:43:57 -0500
Subject: [PATCH 38/53] ZEN: Set default max map count to (INT_MAX - 5)

Per [Fedora][1], they intend to change the default max map count for
their distribution to improve OOTB compatibility with games played
through Steam/Proton. The value they picked comes from the Steam Deck,
which defaults to INT_MAX - MAPCOUNT_ELF_CORE_MARGIN.

Since most ZEN and Liquorix users probably play games, follow Valve's
lead and raise this value to their default.

[1]: https://fedoraproject.org/wiki/Changes/IncreaseVmMaxMapCount

Signed-off-by: Kai Krakow
---
 include/linux/mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8617adc6becd1f..98a00ed2cdf9a3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -201,7 +201,7 @@ static inline void __mm_zero_struct_page(struct page *page)
 * that.
*/ #define MAPCOUNT_ELF_CORE_MARGIN (5) -#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) +#define DEFAULT_MAX_MAP_COUNT (INT_MAX - MAPCOUNT_ELF_CORE_MARGIN) extern int sysctl_max_map_count; From df06925f7d416f8b2088cfbc8dfa20f512e07e04 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Sun, 19 Apr 2020 19:59:18 -0700 Subject: [PATCH 39/53] ZEN: mm: Stop kswapd early when nothing's waiting for it to free pages Contains: - mm: Stop kswapd early when nothing's waiting for it to free pages Keeping kswapd running when all the failed allocations that invoked it are satisfied incurs a high overhead due to unnecessary page eviction and writeback, as well as spurious VM pressure events to various registered shrinkers. When kswapd doesn't need to work to make an allocation succeed anymore, stop it prematurely to save resources. Signed-off-by: Sultan Alsawaf - mm: Don't stop kswapd on a per-node basis when there are no waiters The page allocator wakes all kswapds in an allocation context's allowed nodemask in the slow path, so it doesn't make sense to have the kswapd- waiter count per each NUMA node. Instead, it should be a global counter to stop all kswapds when there are no failed allocation requests. Signed-off-by: Sultan Alsawaf - mm: Increment kswapd_waiters for throttled direct reclaimers Throttled direct reclaimers will wake up kswapd and wait for kswapd to satisfy their page allocation request, even when the failed allocation lacks the __GFP_KSWAPD_RECLAIM flag in its gfp mask. As a result, kswapd may think that there are no waiters and thus exit prematurely, causing throttled direct reclaimers lacking __GFP_KSWAPD_RECLAIM to stall on waiting for kswapd to wake them up. Incrementing the kswapd_waiters counter when such direct reclaimers become throttled fixes the problem. 
Signed-off-by: Sultan Alsawaf Signed-off-by: Kai Krakow --- mm/internal.h | 1 + mm/page_alloc.c | 17 ++++++++++++++--- mm/vmscan.c | 19 +++++++++++++------ 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 9e0577413087c2..6df892978f899c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -734,6 +734,7 @@ extern void post_alloc_hook(struct page *page, unsigned int order, extern bool free_pages_prepare(struct page *page, unsigned int order); extern int user_min_free_kbytes; +extern atomic_long_t kswapd_waiters; void free_unref_page(struct page *page, unsigned int order); void free_unref_folios(struct folio_batch *fbatch); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 882903f42300b8..3b3980ff29c1c5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -88,6 +88,8 @@ typedef int __bitwise fpi_t; */ #define FPI_TO_TAIL ((__force fpi_t)BIT(1)) +atomic_long_t kswapd_waiters = ATOMIC_LONG_INIT(0); + /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8) @@ -4255,6 +4257,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned int cpuset_mems_cookie; unsigned int zonelist_iter_cookie; int reserve_flags; + bool woke_kswapd = false; if (unlikely(nofail)) { /* @@ -4314,8 +4317,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; } - if (alloc_flags & ALLOC_KSWAPD) + if (alloc_flags & ALLOC_KSWAPD) { + if (!woke_kswapd) { + atomic_long_inc(&kswapd_waiters); + woke_kswapd = true; + } wake_all_kswapds(order, gfp_mask, ac); + } /* * The adjusted alloc_flags might result in immediate success, so try @@ -4525,9 +4533,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto retry; } fail: - warn_alloc(gfp_mask, ac->nodemask, - "page allocation failure: order:%u", order); got_pg: + if (woke_kswapd) + atomic_long_dec(&kswapd_waiters); + if (!page) + warn_alloc(gfp_mask, ac->nodemask, + "page allocation failure: order:%u", order); return page; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 0eb5d510d4f6b6..05c993c3c6f987 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6364,7 +6364,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, return 0; } -static bool allow_direct_reclaim(pg_data_t *pgdat) +static bool allow_direct_reclaim(pg_data_t *pgdat, bool using_kswapd) { struct zone *zone; unsigned long pfmemalloc_reserve = 0; @@ -6393,6 +6393,10 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) wmark_ok = free_pages > pfmemalloc_reserve / 2; + /* The throttled direct reclaimer is now a kswapd waiter */ + if (unlikely(!using_kswapd && !wmark_ok)) + atomic_long_inc(&kswapd_waiters); + /* kswapd must be awake if processes are being throttled */ if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL) @@ -6458,7 +6462,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, /* Throttle based on the first usable node */ pgdat = zone->zone_pgdat; - if (allow_direct_reclaim(pgdat)) + if (allow_direct_reclaim(pgdat, gfp_mask & __GFP_KSWAPD_RECLAIM)) goto out; break; } @@ -6480,11 +6484,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, */ if (!(gfp_mask & __GFP_FS)) wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, - allow_direct_reclaim(pgdat), HZ); + allow_direct_reclaim(pgdat, true), HZ); else /* Throttle until kswapd wakes the process */ 
wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, - allow_direct_reclaim(pgdat)); + allow_direct_reclaim(pgdat, true)); + + if (unlikely(!(gfp_mask & __GFP_KSWAPD_RECLAIM))) + atomic_long_dec(&kswapd_waiters); if (fatal_signal_pending(current)) return true; @@ -6987,14 +6994,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) * able to safely make forward progress. Wake them */ if (waitqueue_active(&pgdat->pfmemalloc_wait) && - allow_direct_reclaim(pgdat)) + allow_direct_reclaim(pgdat, true)) wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ __fs_reclaim_release(_THIS_IP_); ret = kthread_freezable_should_stop(&was_frozen); __fs_reclaim_acquire(_THIS_IP_); - if (was_frozen || ret) + if (was_frozen || ret || !atomic_long_read(&kswapd_waiters)) break; /* From 430129d0042f4c6db7e73d2a9aba05811529d0a9 Mon Sep 17 00:00:00 2001 From: "Jan Alexander Steffens (heftig)" Date: Mon, 27 Jan 2020 18:10:06 +0100 Subject: [PATCH 40/53] ZEN: INTERACTIVE: Base config item Signed-off-by: Kai Krakow --- init/Kconfig | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index d3755b2264bdfb..33f6fb4788ea36 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -162,6 +162,12 @@ config THREAD_INFO_IN_TASK menu "General setup" +config ZEN_INTERACTIVE + bool "Tune kernel for interactivity" + default y + help + Tunes the kernel for responsiveness at the cost of throughput and power usage. + config BROKEN bool From de351d3d98cbffd834f07c6b29c48270e0519d5a Mon Sep 17 00:00:00 2001 From: "Jan Alexander Steffens (heftig)" Date: Mon, 27 Jan 2020 18:11:05 +0100 Subject: [PATCH 41/53] ZEN: INTERACTIVE: Use BFQ as the elevator for SQ devices Signed-off-by: Kai Krakow --- block/elevator.c | 4 ++++ init/Kconfig | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/block/elevator.c b/block/elevator.c index 43ba4ab1ada7fd..8cb9e2fc13eb8e 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -568,7 +568,11 @@ static struct elevator_type *elevator_get_default(struct request_queue *q) !blk_mq_is_shared_tags(q->tag_set->flags)) return NULL; +#if defined(CONFIG_ZEN_INTERACTIVE) && defined(CONFIG_IOSCHED_BFQ) + return elevator_find_get("bfq"); +#else return elevator_find_get("mq-deadline"); +#endif } /* diff --git a/init/Kconfig b/init/Kconfig index 33f6fb4788ea36..47db00e783f33c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -168,6 +168,10 @@ config ZEN_INTERACTIVE help Tunes the kernel for responsiveness at the cost of throughput and power usage. 
+	  --- Block Layer ----------------------------------------
+
+	  Default scheduler for SQ..: mq-deadline -> bfq
+
 config BROKEN
 	bool

From 9fcc435caec2c50859e5678d5edb8b79c2c53b6f Mon Sep 17 00:00:00 2001
From: "Jan Alexander Steffens (heftig)"
Date: Mon, 12 Dec 2022 00:03:03 +0100
Subject: [PATCH 42/53] ZEN: INTERACTIVE: Use Kyber as the elevator for MQ
 devices

Signed-off-by: Kai Krakow
---
 block/elevator.c | 6 ++++++
 init/Kconfig     | 1 +
 2 files changed, 7 insertions(+)

diff --git a/block/elevator.c b/block/elevator.c
index 8cb9e2fc13eb8e..3e9e41359d84aa 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -566,7 +566,13 @@ static struct elevator_type *elevator_get_default(struct request_queue *q)
 
 	if (q->nr_hw_queues != 1 &&
 	    !blk_mq_is_shared_tags(q->tag_set->flags))
+#if defined(CONFIG_ZEN_INTERACTIVE) && defined(CONFIG_MQ_IOSCHED_KYBER)
+		return elevator_find_get("kyber");
+#elif defined(CONFIG_ZEN_INTERACTIVE)
+		return elevator_find_get("mq-deadline");
+#else
 		return NULL;
+#endif
 
 #if defined(CONFIG_ZEN_INTERACTIVE) && defined(CONFIG_IOSCHED_BFQ)
 	return elevator_find_get("bfq");

diff --git a/init/Kconfig b/init/Kconfig
index 47db00e783f33c..b5840fd0af47fc 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -171,6 +171,7 @@ config ZEN_INTERACTIVE
 	  --- Block Layer ----------------------------------------
 
 	  Default scheduler for SQ..: mq-deadline -> bfq
+	  Default scheduler for MQ..: none -> kyber
 
 config BROKEN
 	bool

From c423a6315492cf56072ff13191e1ce29e6f4d5d6 Mon Sep 17 00:00:00 2001
From: "Jan Alexander Steffens (heftig)"
Date: Mon, 27 Jan 2020 18:21:09 +0100
Subject: [PATCH 43/53] ZEN: INTERACTIVE: Enable background reclaim of
 hugepages

Use [defer+madvise] as default khugepaged defrag strategy:

For some reason, the default strategy to respond to THP fault fallbacks
is still just madvise, meaning stall if the program wants transparent
hugepages, but don't trigger a background reclaim / compaction if THP
begins to fail allocations. This creates a snowball effect where we
still use the THP code paths, but we almost always fail once a system
has been active and busy for a while.

The option "defer" was created for interactive systems where THP can
still improve performance. If we have to fall back to a regular page
due to an allocation failure or anything else, we will trigger a
background reclaim and compaction so future THP attempts succeed and
previous attempts eventually have their smaller pages combined without
stalling running applications.

We still want madvise to stall applications that explicitly want THP,
so defer+madvise _does_ make a ton of sense. Make it the default for
interactive systems, especially if the kernel maintainer left
transparent hugepages on "always".
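For reference, the runtime equivalent of this new default is selecting
"defer+madvise" through the documented THP sysfs knob; a minimal
sketch, not part of this patch:

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/sys/kernel/mm/transparent_hugepage/defrag", "w");

		if (!f)
			return 1;
		fputs("defer+madvise", f);	/* matches the new default */
		return fclose(f) ? 1 : 0;
	}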
Reasoning and details in the original patch:
https://lwn.net/Articles/711248/

Signed-off-by: Kai Krakow
---
 init/Kconfig     | 4 ++++
 mm/huge_memory.c | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/init/Kconfig b/init/Kconfig
index b5840fd0af47fc..03f0b886522be4 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -173,6 +173,10 @@ config ZEN_INTERACTIVE
 	  Default scheduler for SQ..: mq-deadline -> bfq
 	  Default scheduler for MQ..: none -> kyber
 
+	  --- Virtual Memory Subsystem ---------------------------
+
+	  Background-reclaim hugepages...: no -> yes
+
 config BROKEN
 	bool

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f94a9d41358555..5aa8937ae70f63 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -65,7 +65,11 @@ unsigned long transparent_hugepage_flags __read_mostly =
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
 	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
 #endif
+#ifdef CONFIG_ZEN_INTERACTIVE
+	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG)|
+#else
 	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
+#endif
 	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);

Date: Wed, 11 Aug 2021 18:47:46 -0500
Subject: [PATCH 44/53] ZEN: INTERACTIVE: Tune mgLRU to protect cache used in
 the last second

Although not identical to the le9 patches that protect a byte-amount of
cache through tunables, multigenerational LRU now supports protecting
cache accessed in the last X milliseconds.

In #218, Yu recommends starting with 1000ms and tuning as needed. This
looks like a safe default and turning on this feature should help users
that don't know they need it.

Signed-off-by: Kai Krakow
---
 init/Kconfig | 1 +
 mm/vmscan.c  | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/init/Kconfig b/init/Kconfig
index 03f0b886522be4..73bbb1eb36d9de 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -176,6 +176,7 @@ config ZEN_INTERACTIVE
 	  --- Virtual Memory Subsystem ---------------------------
 
 	  Background-reclaim hugepages...: no -> yes
+	  MG-LRU minimum cache TTL.......: 0 -> 1000 ms
 
 config BROKEN
 	bool
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 05c993c3c6f987..7bea4c6e1f520a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4007,7 +4007,11 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc
 }
 
 /* to protect the working set of the last N jiffies */
+#ifdef CONFIG_ZEN_INTERACTIVE
+static unsigned long lru_gen_min_ttl __read_mostly = HZ;
+#else
 static unsigned long lru_gen_min_ttl __read_mostly;
+#endif
 
 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
 {

From cb4521043ea760885313e37f49b9aabae5a3327f Mon Sep 17 00:00:00 2001
From: "Jan Alexander Steffens (heftig)"
Date: Tue, 31 Oct 2023 19:03:10 +0100
Subject: [PATCH 45/53] ZEN: INTERACTIVE: Tune EEVDF for interactivity

5.7:
Take the "sysctl_sched_nr_migrate" tune of 128 from early XanMod
builds. As of 5.7, XanMod uses 256 but that may affect applications
that require timely response to IRQs.

5.15:
Per [a comment][1] on our ZEN INTERACTIVE commit, reducing the cost of
migration makes the system less responsive under high load. Most
likely the combination of reduced migration cost + the higher number of
tasks that can be migrated at once contributes to this.

To better handle this situation, restore the mainline migration cost
value and also reduce the max number of tasks that can be migrated in
batch from 128 to 64.

If this doesn't help, we'll restore the reduced migration cost and keep
the total number of tasks that can be migrated at once at 32.

[1]: https://github.com/zen-kernel/zen-kernel/commit/be5ba234ca0a5aabe74bfc7e1f636f085bd3823c#commitcomment-63159674

6.6:
Port the tuning to EEVDF, which removed a couple of settings.
6.7: Instead of increasing the number of tasks that migrate at once, migrate the amount acceptable for PREEMPT_RT, but reduce the cost so migrations occur more often. This should make CFS/EEVDF behave more like out-of-tree schedulers that aggressively use idle cores to reduce latency, but without the jank caused by rebalancing too many tasks at once. Signed-off-by: Kai Krakow --- init/Kconfig | 7 +++++++ kernel/sched/fair.c | 13 +++++++++++++ kernel/sched/sched.h | 2 +- 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index 73bbb1eb36d9de..63ada8f2abce5e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -178,6 +178,13 @@ config ZEN_INTERACTIVE Background-reclaim hugepages...: no -> yes MG-LRU minimum cache TTL.......: 0 -> 1000 ms + --- EEVDF CPU Scheduler -------------------------------- + + Minimal granularity............: 0.75 -> 0.4 ms + Migration cost.................: 0.5 -> 0.25 ms + Bandwidth slice size...........: 5 -> 3 ms + Task rebalancing threshold.....: 32 -> 8 + config BROKEN bool diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 443f6a9ef3f8f6..7f183a4f5f1697 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -73,10 +73,19 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; * * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds) */ +#ifdef CONFIG_ZEN_INTERACTIVE +unsigned int sysctl_sched_base_slice = 400000ULL; +static unsigned int normalized_sysctl_sched_base_slice = 400000ULL; +#else unsigned int sysctl_sched_base_slice = 700000ULL; static unsigned int normalized_sysctl_sched_base_slice = 700000ULL; +#endif +#ifdef CONFIG_ZEN_INTERACTIVE +const_debug unsigned int sysctl_sched_migration_cost = 250000UL; +#else const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +#endif static int __init setup_sched_thermal_decay_shift(char *str) { @@ -121,8 +130,12 @@ int __weak arch_asym_cpu_priority(int cpu) * * (default: 5 msec, units: microseconds) */ +#ifdef CONFIG_ZEN_INTERACTIVE +static unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; +#else static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +#endif #ifdef CONFIG_NUMA_BALANCING /* Restrict the NUMA promotion throughput (MB/s) for each target node. */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d79de755c1c269..f8aa96de60dad8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2797,7 +2797,7 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); -#ifdef CONFIG_PREEMPT_RT +#if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_ZEN_INTERACTIVE) # define SCHED_NR_MIGRATE_BREAK 8 #else # define SCHED_NR_MIGRATE_BREAK 32 From cb1b0a95cb75b7cd11a72456c0645eca3d8fc7e0 Mon Sep 17 00:00:00 2001 From: "Jan Alexander Steffens (heftig)" Date: Mon, 27 Jan 2020 18:27:16 +0100 Subject: [PATCH 46/53] ZEN: INTERACTIVE: Tune ondemand governor for interactivity 4.10: During some personal testing with the Dolphin emulator, MuQSS has serious problems scaling its frequencies causing poor performance where boosting the CPU frequencies would have fixed them. Reducing the up_threshold to 45 with MuQSS appears to fix the issue, letting the introduction to "Star Wars: Rogue Leader" run at 100% speed versus about 80% on my test system. Also, lets refactor the definitions and include some indentation to help the reader discern what the scope of all the macros are. 
5.4: On the last custom kernel benchmark from Phoronix with Xanmod, Michael configured all the kernels to run using ondemand instead of the kernel's [default selection][1]. This reminded me that another option outside of the kernel's control is the user's choice to change the cpufreq governor, for better or for worse.

In Liquorix, performance is the default governor whether you're running acpi-cpufreq or intel-pstate. I expect laptop users to install TLP or LMT to control the power balance on their system, especially when they're plugged in or on battery. However, it's pretty clear to me that a lot of people would choose ondemand over performance, since it's not obvious it has huge performance ramifications with MuQSS, and ondemand otherwise is "good enough" for most people.

Let's codify lower up thresholds for MuQSS to more closely synergize with its aggressive thread migration behavior. This way, when ondemand is configured, you get a sort of "performance-lite" result, but with the power savings you expect when leaving the running system idle.

[1]: https://www.phoronix.com/scan.php?page=article&item=xanmod-2020-kernel

5.14: Although CFS and similar schedulers (BMQ, PDS, and CacULE) reuse a lot more of mainline scheduling and do a good job of pinning single-threaded tasks to their respective core, there are still applications that confusingly run steady near 50% and benefit from going full speed or turbo when they need to run (emulators for more recent consoles come to mind).

Drop the up threshold for all non-MuQSS schedulers from 80/95 to 55/60.

5.15: Remove MuQSS cpufreq configuration.

Signed-off-by: Kai Krakow
---
 drivers/cpufreq/cpufreq_ondemand.c | 8 +++++++-
 init/Kconfig                       | 6 ++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 0e65d37c923113..be91dfe6b3f044 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -18,10 +18,16 @@
 #include "cpufreq_ondemand.h"
 
 /* On-demand governor macros */
+#if defined(CONFIG_ZEN_INTERACTIVE)
+#define DEF_FREQUENCY_UP_THRESHOLD	(55)
+#define MICRO_FREQUENCY_UP_THRESHOLD	(60)
+#define DEF_SAMPLING_DOWN_FACTOR	(5)
+#else
 #define DEF_FREQUENCY_UP_THRESHOLD	(80)
+#define MICRO_FREQUENCY_UP_THRESHOLD	(95)
 #define DEF_SAMPLING_DOWN_FACTOR	(1)
+#endif
 #define MAX_SAMPLING_DOWN_FACTOR	(100000)
-#define MICRO_FREQUENCY_UP_THRESHOLD	(95)
 #define MIN_FREQUENCY_UP_THRESHOLD	(1)
 #define MAX_FREQUENCY_UP_THRESHOLD	(100)

diff --git a/init/Kconfig b/init/Kconfig
index 63ada8f2abce5e..19e177cb855116 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -185,6 +185,12 @@ config ZEN_INTERACTIVE
 	  Bandwidth slice size...........: 5 -> 3 ms
 	  Task rebalancing threshold.....: 32 -> 8
 
+	  --- CPUFreq Settings -----------------------------------
+
+	  Ondemand sampling down factor..: 1 -> 5
+	  Ondemand default up threshold..: 80 -> 55
+	  Ondemand micro up threshold....: 95 -> 60
+
 config BROKEN
 	bool

From 6b2a4e3875a3d3461754a000579e314b9cd0dcbb Mon Sep 17 00:00:00 2001
From: Steven Barrett
Date: Sat, 5 Mar 2022 11:37:14 -0600
Subject: [PATCH 47/53] ZEN: INTERACTIVE: mm: Disable unevictable compaction

This option is already disabled when CONFIG_PREEMPT_RT is enabled; let's turn it off when CONFIG_ZEN_INTERACTIVE is set as well.
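For reference, the Kconfig default flows into the runtime sysctl roughly as in this sketch (modelled on mm/compaction.c; the exact variable wiring is an assumption and may differ across kernel versions), so the change only flips the boot-time default of /proc/sys/vm/compact_unevictable_allowed and the knob stays adjustable at runtime:

#include <stdio.h>

/* Sketch: how the Kconfig symbol patched below seeds the sysctl default.
 * With ZEN_INTERACTIVE (or PREEMPT_RT) it now boots as 0. */
#ifndef CONFIG_COMPACT_UNEVICTABLE_DEFAULT
#define CONFIG_COMPACT_UNEVICTABLE_DEFAULT 0 /* as selected by this patch */
#endif

static int sysctl_compact_unevictable_allowed =
	CONFIG_COMPACT_UNEVICTABLE_DEFAULT;

int main(void)
{
	printf("compact_unevictable_allowed boots as %d\n",
	       sysctl_compact_unevictable_allowed);
	return 0;
}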
Signed-off-by: Kai Krakow --- init/Kconfig | 1 + mm/Kconfig | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index 19e177cb855116..b308a28668c959 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -177,6 +177,7 @@ config ZEN_INTERACTIVE Background-reclaim hugepages...: no -> yes MG-LRU minimum cache TTL.......: 0 -> 1000 ms + Compact unevictable............: yes -> no --- EEVDF CPU Scheduler -------------------------------- diff --git a/mm/Kconfig b/mm/Kconfig index 33fa51d608dc51..2c9579015f885f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -648,7 +648,7 @@ config COMPACTION config COMPACT_UNEVICTABLE_DEFAULT int depends on COMPACTION - default 0 if PREEMPT_RT + default 0 if PREEMPT_RT || ZEN_INTERACTIVE default 1 # From 6dd3eb7836b6b2995291998275e8da0aead28a8b Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Sat, 28 Mar 2020 13:06:28 -0700 Subject: [PATCH 48/53] ZEN: INTERACTIVE: mm: Disable watermark boosting by default What watermark boosting does is preemptively fire up kswapd to free memory when there hasn't been an allocation failure. It does this by increasing kswapd's high watermark goal and then firing up kswapd. The reason why this causes freezes is because, with the increased high watermark goal, kswapd will steal memory from processes that need it in order to make forward progress. These processes will, in turn, try to allocate memory again, which will cause kswapd to steal necessary pages from those processes again, in a positive feedback loop known as page thrashing. When page thrashing occurs, your system is essentially livelocked until the necessary forward progress can be made to stop processes from trying to continuously allocate memory and trigger kswapd to steal it back. This problem already occurs with kswapd *without* watermark boosting, but it's usually only encountered on machines with a small amount of memory and/or a slow CPU. Watermark boosting just makes the existing problem worse enough to notice on higher spec'd machines. Disable watermark boosting by default since it's a total dumpster fire. I can't imagine why anyone would want to explicitly enable it, but the option is there in case someone does. 
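To put a number on what "1.5 -> 0" means, here is the boost cap reduced to plain arithmetic (modelled on boost_watermark() in mm/page_alloc.c, which computes max_boost as high watermark * factor / 10000; the example watermark is made up):

#include <stdio.h>

/* Sketch: the cap on how far watermark boosting may raise kswapd's
 * reclaim goal. A factor of 15000 allows up to 150% of the high
 * watermark; 0 disables boosting outright. */
static unsigned long max_boost(unsigned long high_wmark_pages, unsigned int factor)
{
	return high_wmark_pages * factor / 10000;
}

int main(void)
{
	unsigned long high = 128000; /* example high watermark, in pages */

	printf("mainline (15000): boost up to %lu extra pages\n", max_boost(high, 15000));
	printf("zen      (    0): boost up to %lu extra pages\n", max_boost(high, 0));
	return 0;
}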
Signed-off-by: Sultan Alsawaf --- init/Kconfig | 1 + mm/page_alloc.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index b308a28668c959..c4a204c86d47ae 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -178,6 +178,7 @@ config ZEN_INTERACTIVE Background-reclaim hugepages...: no -> yes MG-LRU minimum cache TTL.......: 0 -> 1000 ms Compact unevictable............: yes -> no + Watermark boost factor.........: 1.5 -> 0 --- EEVDF CPU Scheduler -------------------------------- diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3b3980ff29c1c5..064a480def7d85 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -273,7 +273,11 @@ const char * const migratetype_names[MIGRATE_TYPES] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; +#ifdef CONFIG_ZEN_INTERACTIVE +static int watermark_boost_factor __read_mostly; +#else static int watermark_boost_factor __read_mostly = 15000; +#endif static int watermark_scale_factor = 10; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ From afb3999f2d088552b072e9656f3176e3c7c266d4 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Wed, 20 Oct 2021 20:50:11 -0700 Subject: [PATCH 49/53] ZEN: INTERACTIVE: mm: Lower the non-hugetlbpage pageblock size to reduce scheduling delays The page allocator processes free pages in groups of pageblocks, where the size of a pageblock is typically quite large (1024 pages without hugetlbpage support). Pageblocks are processed atomically with the zone lock held, which can cause severe scheduling delays on both the CPU going through the pageblock and any other CPUs waiting to acquire the zone lock. A frequent offender is move_freepages_block(), which is used by rmqueue() for page allocation. As it turns out, there's no requirement for pageblocks to be so large, so the pageblock order can simply be reduced to ease the scheduling delays and zone lock contention. PAGE_ALLOC_COSTLY_ORDER is used as a reasonable setting to ensure non-costly page allocation requests can still be serviced without always needing to free up more than one pageblock's worth of pages at a time. This has a noticeable effect on overall system latency when memory pressure is elevated. The various mm functions which operate on pageblocks no longer appear in the preemptoff tracer, where previously they would spend up to 100 ms on a mobile arm64 CPU processing a pageblock with preemption disabled and the zone lock held. 
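For a feel of the magnitudes involved, a quick sketch of how much memory one pageblock covers at each order, assuming 4 KiB pages (order 10 corresponds to MAX_PAGE_ORDER on typical configs, order 3 to PAGE_ALLOC_COSTLY_ORDER):

#include <stdio.h>

/* Sketch: pages and bytes spanned by one pageblock at each order;
 * this is the unit of work processed under the zone lock by
 * operations like move_freepages_block(). */
int main(void)
{
	const unsigned long page_kib = 4;
	const int orders[] = { 10, 3 };

	for (int i = 0; i < 2; i++) {
		unsigned long pages = 1UL << orders[i];

		printf("order %2d: %4lu pages = %5lu KiB per pageblock\n",
		       orders[i], pages, pages * page_kib);
	}
	return 0;
}

That drops the unit of work done with the zone lock held from 4 MiB to 32 KiB worth of pages at a time.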
Signed-off-by: Sultan Alsawaf
---
 include/linux/pageblock-flags.h | 4 ++++
 init/Kconfig                    | 1 +
 2 files changed, 5 insertions(+)

diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index fc6b9c87cb0a8c..6ce05520553073 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -52,7 +52,11 @@ extern unsigned int pageblock_order;
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 /* If huge pages are not used, group by MAX_ORDER_NR_PAGES */
+#ifdef CONFIG_ZEN_INTERACTIVE
+#define pageblock_order		PAGE_ALLOC_COSTLY_ORDER
+#else
 #define pageblock_order		MAX_PAGE_ORDER
+#endif
 
 #endif /* CONFIG_HUGETLB_PAGE */

diff --git a/init/Kconfig b/init/Kconfig
index c4a204c86d47ae..c72edde4c6652a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -179,6 +179,7 @@ config ZEN_INTERACTIVE
 	  MG-LRU minimum cache TTL.......: 0 -> 1000 ms
 	  Compact unevictable............: yes -> no
 	  Watermark boost factor.........: 1.5 -> 0
+	  Pageblock order................: 10 -> 3
 
 	  --- EEVDF CPU Scheduler --------------------------------

From 7681da0097145ba88b7a4b66e33d73892ca998f4 Mon Sep 17 00:00:00 2001
From: Steven Barrett
Date: Sat, 21 May 2022 15:15:09 -0500
Subject: [PATCH 50/53] ZEN: INTERACTIVE: dm-crypt: Disable workqueues for crypto ops

Queueing in dm-crypt for crypto operations reduces performance on modern systems. As Cloudflare discovered and discussed in an article, queueing was introduced because the crypto subsystem used to be synchronous. Since it's now asynchronous, we get double queueing when using the subsystem through dm-crypt, which is obviously undesirable: it reduces throughput and increases latency.

Disable queueing when using our Zen Interactive configuration.

Fixes: https://github.com/zen-kernel/zen-kernel/issues/282
Signed-off-by: Kai Krakow
---
 drivers/md/dm-crypt.c | 5 +++++
 init/Kconfig          | 1 +
 2 files changed, 6 insertions(+)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 78c975d7cd5f42..a123ad7b471f95 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -3308,6 +3308,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad;
 	}
 
+#ifdef CONFIG_ZEN_INTERACTIVE
+	set_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags);
+	set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags);
+#endif
+
 	ret = crypt_ctr_cipher(ti, argv[0], argv[1]);
 	if (ret < 0)
 		goto bad;

diff --git a/init/Kconfig b/init/Kconfig
index c72edde4c6652a..1c707c02043991 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -172,6 +172,7 @@ config ZEN_INTERACTIVE
 
 	  Default scheduler for SQ..: mq-deadline -> bfq
 	  Default scheduler for MQ..: none -> kyber
+	  DM-Crypt workqueues.......: yes -> no
 
 	  --- Virtual Memory Subsystem ---------------------------

From 83ee719a439752ac6429aabeb0cae684853b9e26 Mon Sep 17 00:00:00 2001
From: Steven Barrett
Date: Mon, 5 Sep 2022 11:35:20 -0500
Subject: [PATCH 51/53] ZEN: INTERACTIVE: mm/swap: Disable swap-in readahead

Per an [issue][1] on the Chromium project, swap-in readahead causes more jank than not. This might be caused by poor optimization in the swapping code, or by the fact that, under memory pressure, we're pulling in pages we don't need, causing more swapping.

Either way, this change is mainline/upstream in Chromium, and ChromeOS developers care a lot about system responsiveness. Let's implement the same change so Zen Kernel users benefit.
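A quick sketch of what the knob controls (the window size follows swapin_nr_pages() in mm/swap_state.c, which caps the readahead window at 1 << page_cluster; treat the exact function as an assumption for this kernel version):

#include <stdio.h>

/* Sketch: the swap-in readahead window at each page_cluster value.
 * The mainline default of 3 can pull in up to 8 pages per fault;
 * pinning page_cluster to 0 reads only the page that faulted. */
int main(void)
{
	for (int page_cluster = 0; page_cluster <= 3; page_cluster++)
		printf("page_cluster=%d -> up to %2d page(s) per swap-in\n",
		       page_cluster, 1 << page_cluster);
	return 0;
}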
[1]: https://bugs.chromium.org/p/chromium/issues/detail?id=263561 Signed-off-by: Kai Krakow --- init/Kconfig | 1 + mm/swap.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index 1c707c02043991..b7b6702b43ee40 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -181,6 +181,7 @@ config ZEN_INTERACTIVE Compact unevictable............: yes -> no Watermark boost factor.........: 1.5 -> 0 Pageblock order................: 10 -> 3 + Swap-in readahead..............: 3 -> 0 --- EEVDF CPU Scheduler -------------------------------- diff --git a/mm/swap.c b/mm/swap.c index 59f30a981c6f96..bbe49ea7b80edd 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1080,6 +1080,10 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch) */ void __init swap_setup(void) { +#ifdef CONFIG_ZEN_INTERACTIVE + /* Only swap-in pages requested, avoid readahead */ + page_cluster = 0; +#else unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); /* Use a smaller cluster for small-memory machines */ @@ -1091,4 +1095,5 @@ void __init swap_setup(void) * Right now other parts of the system means that we * _really_ don't want to cluster much more */ +#endif } From c6a5e6887b3332b393b537341bea6c564448ed12 Mon Sep 17 00:00:00 2001 From: Masahito S Date: Wed, 27 Aug 2025 08:07:40 +0900 Subject: [PATCH 52/53] linux6.12.37-bore-6.5.2 --- include/linux/sched.h | 29 +++ include/linux/sched/bore.h | 39 ++++ init/Kconfig | 17 ++ kernel/Kconfig.hz | 17 ++ kernel/fork.c | 8 + kernel/futex/waitwake.c | 11 ++ kernel/sched/Makefile | 1 + kernel/sched/bore.c | 393 +++++++++++++++++++++++++++++++++++++ kernel/sched/core.c | 12 ++ kernel/sched/debug.c | 61 +++++- kernel/sched/fair.c | 118 +++++++++-- kernel/sched/features.h | 3 + kernel/sched/sched.h | 9 + 13 files changed, 703 insertions(+), 15 deletions(-) create mode 100644 include/linux/sched/bore.h create mode 100644 kernel/sched/bore.c diff --git a/include/linux/sched.h b/include/linux/sched.h index 0d1d70aded38f6..2a0aba074796b8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -782,6 +782,32 @@ struct kmap_ctrl { #endif }; +#ifdef CONFIG_SCHED_BORE +#define BORE_BC_TIMESTAMP_SHIFT 16 + +struct bore_bc { + u64 timestamp: 48; + u64 penalty: 16; +}; + +struct bore_ctx { + struct bore_bc subtree; + struct bore_bc group; + u64 burst_time; + u16 prev_penalty; + u16 curr_penalty; + union { + u16 penalty; + struct { + u8 _; + u8 score; + }; + }; + bool stop_update; + bool futex_waiting; +}; +#endif /* CONFIG_SCHED_BORE */ + struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* @@ -842,6 +868,9 @@ struct task_struct { #ifdef CONFIG_SCHED_CLASS_EXT struct sched_ext_entity scx; #endif +#ifdef CONFIG_SCHED_BORE + struct bore_ctx bore; +#endif /* CONFIG_SCHED_BORE */ const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE diff --git a/include/linux/sched/bore.h b/include/linux/sched/bore.h new file mode 100644 index 00000000000000..5afb5fed4e102a --- /dev/null +++ b/include/linux/sched/bore.h @@ -0,0 +1,39 @@ +#ifndef _KERNEL_SCHED_BORE_H +#define _KERNEL_SCHED_BORE_H + +#include +#include +#include +#include +#include + +#define SCHED_BORE_AUTHOR "Masahito Suzuki" +#define SCHED_BORE_PROGNAME "BORE CPU Scheduler modification" + +#define SCHED_BORE_VERSION "6.5.2" + +extern u8 __read_mostly sched_bore; +extern u8 __read_mostly sched_burst_inherit_type; +extern u8 __read_mostly sched_burst_smoothness; +extern u8 __read_mostly sched_burst_penalty_offset; +extern uint __read_mostly sched_burst_penalty_scale; +extern uint __read_mostly 
sched_burst_cache_lifetime; + +extern u8 effective_prio_bore(struct task_struct *p); +extern void update_curr_bore(struct task_struct *p, u64 delta_exec); +extern void restart_burst_bore(struct task_struct *p); +extern void restart_burst_rescale_deadline_bore(struct task_struct *p); +extern void task_fork_bore(struct task_struct *p, struct task_struct *parent, + u64 clone_flags, u64 now); +extern void sched_init_bore(void); +extern void reset_task_bore(struct task_struct *p); + +extern int sched_bore_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sched_burst_inherit_type_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos); + +extern void reweight_entity( + struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight); + +#endif /* _KERNEL_SCHED_BORE_H */ diff --git a/init/Kconfig b/init/Kconfig index b7b6702b43ee40..c702425c758c84 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1383,6 +1383,23 @@ config CHECKPOINT_RESTORE If unsure, say N here. +config SCHED_BORE + bool "Burst-Oriented Response Enhancer" + default y + help + In Desktop and Mobile computing, one might prefer interactive + tasks to keep responsive no matter what they run in the background. + + Enabling this kernel feature modifies the scheduler to discriminate + tasks by their burst time (runtime since it last went sleeping or + yielding state) and prioritize those that run less bursty. + Such tasks usually include window compositor, widgets backend, + terminal emulator, video playback, games and so on. + With a little impact to scheduling fairness, it may improve + responsiveness especially under heavy background workload. + + If unsure, say Y here. + config SCHED_AUTOGROUP bool "Automatic process group scheduling" select CGROUPS diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 38ef6d06888ef1..4ee584ce93dfe7 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -57,3 +57,20 @@ config HZ config SCHED_HRTICK def_bool HIGH_RES_TIMERS + +config MIN_BASE_SLICE_NS + int "Default value for min_base_slice_ns" + default 2000000 + help + The BORE Scheduler automatically calculates the optimal base + slice for the configured HZ using the following equation: + + base_slice_ns = + 1000000000/HZ * DIV_ROUNDUP(min_base_slice_ns, 1000000000/HZ) + + This option sets the default lower bound limit of the base slice + to prevent the loss of task throughput due to overscheduling. + + Setting this value too high can cause the system to boot with + an unnecessarily large base slice, resulting in high scheduling + latency and poor system responsiveness. diff --git a/kernel/fork.c b/kernel/fork.c index 97c9afe3efc38d..e14b29847c858a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -113,6 +113,10 @@ #include #include +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ + #include #define CREATE_TRACE_POINTS @@ -2524,6 +2528,10 @@ __latent_entropy struct task_struct *copy_process( * Need tasklist lock for parent etc handling! 
*/ write_lock_irq(&tasklist_lock); +#ifdef CONFIG_SCHED_BORE + if (likely(p->pid)) + task_fork_bore(p, current, clone_flags, p->start_time); +#endif /* CONFIG_SCHED_BORE */ /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index a9056acb75eef9..e6042508651c4b 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -4,6 +4,9 @@ #include #include #include +#ifdef CONFIG_SCHED_BORE +#include +#endif // CONFIG_SCHED_BORE #include "futex.h" @@ -367,7 +370,15 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, * is no timeout, or if it has yet to expire. */ if (!timeout || timeout->task) +#ifdef CONFIG_SCHED_BORE + { + current->bore.futex_waiting = true; +#endif // CONFIG_SCHED_BORE schedule(); +#ifdef CONFIG_SCHED_BORE + current->bore.futex_waiting = false; + } +#endif // CONFIG_SCHED_BORE } __set_current_state(TASK_RUNNING); } diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 976092b7bd4520..83cbd093b8b586 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -32,3 +32,4 @@ obj-y += core.o obj-y += fair.o obj-y += build_policy.o obj-y += build_utility.o +obj-$(CONFIG_SCHED_BORE) += bore.o diff --git a/kernel/sched/bore.c b/kernel/sched/bore.c new file mode 100644 index 00000000000000..c9e76eda35c82d --- /dev/null +++ b/kernel/sched/bore.c @@ -0,0 +1,393 @@ +/* + * Burst-Oriented Response Enhancer (BORE) CPU Scheduler + * Copyright (C) 2021-2025 Masahito Suzuki + */ +#include +#include +#include +#include "sched.h" + +#ifdef CONFIG_SCHED_BORE +u8 __read_mostly sched_bore = 1; +u8 __read_mostly sched_burst_inherit_type = 2; +u8 __read_mostly sched_burst_smoothness = 1; +u8 __read_mostly sched_burst_penalty_offset = 24; +uint __read_mostly sched_burst_penalty_scale = 1536; +uint __read_mostly sched_burst_cache_lifetime = 75000000; +static int __maybe_unused maxval_prio = 39; +static int __maybe_unused maxval_6_bits = 63; +static int __maybe_unused maxval_8_bits = 255; +static int __maybe_unused maxval_12_bits = 4095; + +#define MAX_BURST_PENALTY ((40U << 8) - 1) +#define BURST_CACHE_STOP_COUNT 63 + +static u32 (*inherit_penalty_fn)(struct task_struct *, u64, u64); + +static inline u32 log2p1_u64_u32fp(u64 v, u8 fp) { + if (!v) return 0; + u32 exponent = fls64(v), + mantissa = (u32)(v << (64 - exponent) << 1 >> (64 - fp)); + return exponent << fp | mantissa; +} + +static inline u32 calc_burst_penalty(u64 burst_time) { + u32 greed = log2p1_u64_u32fp(burst_time, 8), + tolerance = sched_burst_penalty_offset << 8, + penalty = max(0, (s32)(greed - tolerance)), + scaled_penalty = penalty * sched_burst_penalty_scale >> 10; + return min(MAX_BURST_PENALTY, scaled_penalty); +} + +static inline u64 rescale_slice(u64 delta, u8 old_prio, u8 new_prio) { + u64 unscaled, rescaled; + unscaled = mul_u64_u32_shr(delta , sched_prio_to_weight[old_prio], 10); + rescaled = mul_u64_u32_shr(unscaled, sched_prio_to_wmult [new_prio], 22); + return rescaled; +} + +static inline u32 binary_smooth(u32 new, u32 old) { + if (new <= old) return new; + + u32 increment = new - old, + shift = sched_burst_smoothness, + divisor = 1U << shift; + + return old + ((increment + divisor - 1) >> shift); +} + +static void reweight_task_by_prio(struct task_struct *p, int prio) { + if (task_has_idle_policy(p)) return; + + struct sched_entity *se = &p->se; + unsigned long weight = scale_load(sched_prio_to_weight[prio]); + + if (se->on_rq) { + p->bore.stop_update = true; + 
reweight_entity(cfs_rq_of(se), se, weight); + p->bore.stop_update = false; + } else + se->load.weight = weight; + se->load.inv_weight = sched_prio_to_wmult[prio]; +} + +u8 effective_prio_bore(struct task_struct *p) { + int prio = p->static_prio - MAX_RT_PRIO; + if (likely(sched_bore)) + prio += p->bore.score; + return (u8)clamp(prio, 0, maxval_prio); +} + +static void update_penalty(struct task_struct *p) { + struct bore_ctx *ctx = &p->bore; + + u8 prev_prio = effective_prio_bore(p); + u32 penalty = 0; + + if (!(p->flags & PF_KTHREAD)) { + u32 curr_penalty = ctx->curr_penalty; + penalty = ctx->prev_penalty; + if (penalty < curr_penalty) + penalty = curr_penalty; + } + ctx->penalty = penalty; + + u8 new_prio = effective_prio_bore(p); + if (new_prio != prev_prio) + reweight_task_by_prio(p, new_prio); +} + +void update_curr_bore(struct task_struct *p, u64 delta_exec) { + struct bore_ctx *ctx = &p->bore; + if (ctx->stop_update) return; + + ctx->burst_time += delta_exec; + u32 curr_penalty = ctx->curr_penalty = calc_burst_penalty(ctx->burst_time); + + if (curr_penalty <= ctx->prev_penalty) return; + update_penalty(p); +} + +void restart_burst_bore(struct task_struct *p) { + struct bore_ctx *ctx = &p->bore; + u32 new_penalty = binary_smooth(ctx->curr_penalty, ctx->prev_penalty); + ctx->prev_penalty = new_penalty; + ctx->curr_penalty = 0; + ctx->burst_time = 0; + update_penalty(p); +} + +void restart_burst_rescale_deadline_bore(struct task_struct *p) { + struct sched_entity *se = &p->se; + s64 vscaled, vremain = se->deadline - se->vruntime; + + u8 old_prio = effective_prio_bore(p); + restart_burst_bore(p); + u8 new_prio = effective_prio_bore(p); + + if (old_prio > new_prio) { + vscaled = rescale_slice(abs(vremain), old_prio, new_prio); + if (unlikely(vremain < 0)) + vscaled = -vscaled; + se->deadline = se->vruntime + vscaled; + } +} + +static inline bool task_is_bore_eligible(struct task_struct *p) +{return p && p->sched_class == &fair_sched_class && !p->exit_state;} + +#ifndef for_each_child_task +#define for_each_child_task(p, t) \ + list_for_each_entry(t, &(p)->children, sibling) +#endif + +static inline u32 count_children_upto2(struct task_struct *p) { + struct list_head *head = &p->children; + struct list_head *next = head->next; + return (next != head) + (next->next != head); +} + +static inline bool burst_cache_expired(struct bore_bc *bc, u64 now) { + u64 timestamp = bc->timestamp << BORE_BC_TIMESTAMP_SHIFT; + return now - timestamp > sched_burst_cache_lifetime; +} + +static void update_burst_cache(struct bore_bc *bc, + struct task_struct *p, u32 count, u32 total, u64 now) { + u32 average = count ? 
total / count : 0; + bc->penalty = max(average, p->bore.penalty); + bc->timestamp = now >> BORE_BC_TIMESTAMP_SHIFT; +} + +static u32 inherit_none(struct task_struct *parent, + u64 clone_flags, u64 now) +{ return 0; } + +static u32 inherit_from_parent(struct task_struct *parent, + u64 clone_flags, u64 now) { + if (clone_flags & CLONE_PARENT) + parent = parent->real_parent; + + struct bore_bc *bc = &parent->bore.subtree; + + if (burst_cache_expired(bc, now)) { + struct task_struct *child; + u32 count = 0, total = 0; + for_each_child_task(parent, child) { + if (count >= BURST_CACHE_STOP_COUNT) break; + + if (!task_is_bore_eligible(child)) continue; + count++; + total += child->bore.penalty; + } + + update_burst_cache(bc, parent, count, total, now); + } + + return bc->penalty; +} + +static u32 inherit_from_ancestor_hub(struct task_struct *parent, + u64 clone_flags, u64 now) { + struct task_struct *ancestor = parent; + u32 sole_child_count = 0; + + if (clone_flags & CLONE_PARENT) { + ancestor = ancestor->real_parent; + sole_child_count = 1; + } + + for (struct task_struct *next; + (next = ancestor->real_parent) != ancestor && + count_children_upto2(ancestor) <= sole_child_count; + ancestor = next, sole_child_count = 1) {} + + struct bore_bc *bc = &ancestor->bore.subtree; + + if (burst_cache_expired(bc, now)) { + struct task_struct *direct_child; + u32 count = 0, total = 0; + for_each_child_task(ancestor, direct_child) { + if (count >= BURST_CACHE_STOP_COUNT) break; + + struct task_struct *descendant = direct_child; + while (count_children_upto2(descendant) == 1) + descendant = list_first_entry(&descendant->children, + struct task_struct, sibling); + + if (!task_is_bore_eligible(descendant)) continue; + count++; + total += descendant->bore.penalty; + } + + update_burst_cache(bc, ancestor, count, total, now); + } + + return bc->penalty; +} + +static u32 inherit_from_thread_group(struct task_struct *p, u64 now) { + struct task_struct *leader = p->group_leader; + struct bore_bc *bc = &leader->bore.group; + + if (burst_cache_expired(bc, now)) { + struct task_struct *sibling; + u32 count = 0, total = 0; + + for_each_thread(leader, sibling) { + if (count >= BURST_CACHE_STOP_COUNT) break; + + if (!task_is_bore_eligible(sibling)) continue; + count++; + total += sibling->bore.penalty; + } + + update_burst_cache(bc, leader, count, total, now); + } + + return bc->penalty; +} + +void task_fork_bore(struct task_struct *p, + struct task_struct *parent, u64 clone_flags, u64 now) { + if (!task_is_bore_eligible(p) || unlikely(!sched_bore)) return; + + struct bore_ctx *ctx = &p->bore; + u32 inherited_penalty = (clone_flags & CLONE_THREAD)? 
+ inherit_from_thread_group(parent, now): + inherit_penalty_fn(parent, clone_flags, now); + + if (ctx->prev_penalty < inherited_penalty) + ctx->prev_penalty = inherited_penalty; + ctx->curr_penalty = 0; + ctx->burst_time = 0; + ctx->stop_update = false; + ctx->futex_waiting = false; + update_penalty(p); +} + +void reset_task_bore(struct task_struct *p) +{ memset(&p->bore, 0, sizeof(struct bore_ctx)); } + +static void update_inherit_type(void) { + switch(sched_burst_inherit_type) { + case 1: + inherit_penalty_fn = inherit_from_parent; + break; + case 2: + inherit_penalty_fn = inherit_from_ancestor_hub; + break; + default: + inherit_penalty_fn = inherit_none; + } +} + +void __init sched_init_bore(void) { + printk(KERN_INFO "%s %s by %s\n", + SCHED_BORE_PROGNAME, SCHED_BORE_VERSION, SCHED_BORE_AUTHOR); + + reset_task_bore(&init_task); + update_inherit_type(); +} + +static void readjust_all_task_weights(void) { + struct task_struct *task; + struct rq *rq; + struct rq_flags rf; + + scoped_guard(write_lock_irq, &tasklist_lock) + for_each_process(task) { + if (!task_is_bore_eligible(task)) continue; + rq = task_rq_lock(task, &rf); + update_rq_clock(rq); + reweight_task_by_prio(task, effective_prio_bore(task)); + task_rq_unlock(rq, task, &rf); + } +} + +int sched_bore_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + readjust_all_task_weights(); + + return 0; +} + +int sched_burst_inherit_type_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + update_inherit_type(); + + return 0; +} + +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_bore_sysctls[] = { + { + .procname = "sched_bore", + .data = &sched_bore, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = sched_bore_update_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "sched_burst_inherit_type", + .data = &sched_burst_inherit_type, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = sched_burst_inherit_type_update_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "sched_burst_smoothness", + .data = &sched_burst_smoothness, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_THREE, + }, + { + .procname = "sched_burst_penalty_offset", + .data = &sched_burst_penalty_offset, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &maxval_6_bits, + }, + { + .procname = "sched_burst_penalty_scale", + .data = &sched_burst_penalty_scale, + .maxlen = sizeof(uint), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &maxval_12_bits, + }, + { + .procname = "sched_burst_cache_lifetime", + .data = &sched_burst_cache_lifetime, + .maxlen = sizeof(uint), + .mode = 0644, + .proc_handler = proc_douintvec, + }, +}; + +static int __init sched_bore_sysctl_init(void) { + register_sysctl_init("kernel", sched_bore_sysctls); + return 0; +} +late_initcall(sched_bore_sysctl_init); + +#endif // CONFIG_SYSCTL +#endif /* CONFIG_SCHED_BORE */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e9bb1b4c58421f..83b743b823a7eb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -97,6 
+97,10 @@ #include "../../io_uring/io-wq.h" #include "../smpboot.h" +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ + EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu); EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask); @@ -1370,7 +1374,11 @@ int tg_nop(struct task_group *tg, void *data) void set_load_weight(struct task_struct *p, bool update_load) { +#ifdef CONFIG_SCHED_BORE + int prio = effective_prio_bore(p); +#else /* !CONFIG_SCHED_BORE */ int prio = p->static_prio - MAX_RT_PRIO; +#endif /* CONFIG_SCHED_BORE */ struct load_weight lw; if (task_has_idle_policy(p)) { @@ -8393,6 +8401,10 @@ void __init sched_init(void) BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); #endif +#ifdef CONFIG_SCHED_BORE + sched_init_bore(); +#endif /* CONFIG_SCHED_BORE */ + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 1e3bc0774efd51..71fe8719a9bf28 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -167,7 +167,53 @@ static const struct file_operations sched_feat_fops = { }; #ifdef CONFIG_SMP +#ifdef CONFIG_SCHED_BORE +#define DEFINE_SYSCTL_SCHED_FUNC(name, update_func) \ +static ssize_t sched_##name##_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) \ +{ \ + char buf[16]; \ + unsigned int value; \ +\ + if (cnt > 15) \ + cnt = 15; \ +\ + if (copy_from_user(&buf, ubuf, cnt)) \ + return -EFAULT; \ + buf[cnt] = '\0'; \ +\ + if (kstrtouint(buf, 10, &value)) \ + return -EINVAL; \ +\ + sysctl_sched_##name = value; \ + sched_update_##update_func(); \ +\ + *ppos += cnt; \ + return cnt; \ +} \ +\ +static int sched_##name##_show(struct seq_file *m, void *v) \ +{ \ + seq_printf(m, "%d\n", sysctl_sched_##name); \ + return 0; \ +} \ +\ +static int sched_##name##_open(struct inode *inode, struct file *filp) \ +{ \ + return single_open(filp, sched_##name##_show, NULL); \ +} \ +\ +static const struct file_operations sched_##name##_fops = { \ + .open = sched_##name##_open, \ + .write = sched_##name##_write, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +}; + +DEFINE_SYSCTL_SCHED_FUNC(min_base_slice, min_base_slice) +#undef DEFINE_SYSCTL_SCHED_FUNC +#else /* !CONFIG_SCHED_BORE */ static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -213,7 +259,7 @@ static const struct file_operations sched_scaling_fops = { .llseek = seq_lseek, .release = single_release, }; - +#endif /* CONFIG_SCHED_BORE */ #endif /* SMP */ #ifdef CONFIG_PREEMPT_DYNAMIC @@ -504,13 +550,20 @@ static __init int sched_init_debug(void) debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); #endif +#ifdef CONFIG_SCHED_BORE + debugfs_create_file("min_base_slice_ns", 0644, debugfs_sched, NULL, &sched_min_base_slice_fops); + debugfs_create_u32("base_slice_ns", 0444, debugfs_sched, &sysctl_sched_base_slice); +#else /* !CONFIG_SCHED_BORE */ debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); +#endif /* CONFIG_SCHED_BORE */ debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); #ifdef CONFIG_SMP +#if !defined(CONFIG_SCHED_BORE) debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); +#endif /* CONFIG_SCHED_BORE */ debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); 
debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); @@ -755,6 +808,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); +#ifdef CONFIG_SCHED_BORE + SEQ_printf(m, " %2d", p->bore.score); +#endif /* CONFIG_SCHED_BORE */ #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif @@ -1244,6 +1300,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(se.load.weight); #ifdef CONFIG_SMP +#ifdef CONFIG_SCHED_BORE + P(bore.score); +#endif /* CONFIG_SCHED_BORE */ P(se.avg.load_sum); P(se.avg.runnable_sum); P(se.avg.util_sum); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7f183a4f5f1697..a4550327427bf1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -55,6 +55,10 @@ #include "stats.h" #include "autogroup.h" +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ + /* * The initial- and re-scaling of tunables is configurable * @@ -64,22 +68,36 @@ * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus) * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus * - * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) + * BORE : default SCHED_TUNABLESCALING_NONE = *1 constant + * EEVDF: default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) */ +#ifdef CONFIG_SCHED_BORE +unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; +#else /* !CONFIG_SCHED_BORE */ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; +#endif /* CONFIG_SCHED_BORE */ /* * Minimal preemption granularity for CPU-bound tasks: * - * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds) + * BORE : base_slice = minimum multiple of nsecs_per_tick >= min_base_slice + * (default min_base_slice = 2000000 constant, units: nanoseconds) + * EEVDF: default 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds */ -#ifdef CONFIG_ZEN_INTERACTIVE + +#ifdef CONFIG_SCHED_BORE +static const unsigned int nsecs_per_tick = 1000000000ULL / HZ; +unsigned int sysctl_sched_min_base_slice = CONFIG_MIN_BASE_SLICE_NS; +__read_mostly uint sysctl_sched_base_slice = nsecs_per_tick; +#else /* !CONFIG_SCHED_BORE */ +# ifdef CONFIG_ZEN_INTERACTIVE unsigned int sysctl_sched_base_slice = 400000ULL; static unsigned int normalized_sysctl_sched_base_slice = 400000ULL; -#else +# else /* !CONFIG_ZEN_INTERACTIVE */ unsigned int sysctl_sched_base_slice = 700000ULL; static unsigned int normalized_sysctl_sched_base_slice = 700000ULL; -#endif +# endif /* CONFIG_ZEN_INTERACTIVE */ +#endif /* CONFIG_SCHED_BORE */ #ifdef CONFIG_ZEN_INTERACTIVE const_debug unsigned int sysctl_sched_migration_cost = 250000UL; @@ -201,6 +219,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) * * This idea comes from the SD scheduler of Con Kolivas: */ +#ifdef CONFIG_SCHED_BORE +static void update_sysctl(void) { + sysctl_sched_base_slice = nsecs_per_tick * + max(1UL, DIV_ROUND_UP(sysctl_sched_min_base_slice, nsecs_per_tick)); +} +void sched_update_min_base_slice(void) { update_sysctl(); } +#else /* !CONFIG_SCHED_BORE */ static unsigned int get_update_sysctl_factor(void) { unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); @@ -231,6 +256,7 @@ static void update_sysctl(void) SET_SYSCTL(sched_base_slice); #undef SET_SYSCTL } +#endif /* CONFIG_SCHED_BORE */ void __init sched_init_granularity(void) { @@ -708,6 +734,9 @@ static s64 entity_lag(u64 
avruntime, struct sched_entity *se) vlag = avruntime - se->vruntime; limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); +#ifdef CONFIG_SCHED_BORE + limit >>= !!sched_bore; +#endif /* CONFIG_SCHED_BORE */ return clamp(vlag, -limit, limit); } @@ -954,7 +983,16 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) curr = NULL; +#if !defined(CONFIG_SCHED_BORE) if (sched_feat(RUN_TO_PARITY) && curr && protect_slice(curr)) +#else /* CONFIG_SCHED_BORE */ + bool run_to_parity = likely(sched_bore) ? + sched_feat(RUN_TO_PARITY_BORE) : sched_feat(RUN_TO_PARITY); + if (run_to_parity && curr && protect_slice(curr) && + (!entity_is_task(curr) || + !task_of(curr)->bore.futex_waiting || + unlikely(!sched_bore))) +#endif /* CONFIG_SCHED_BORE */ return curr; /* Pick the leftmost entity if it's eligible */ @@ -1013,6 +1051,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) * Scheduling class statistics methods: */ #ifdef CONFIG_SMP +#if !defined(CONFIG_SCHED_BORE) int sched_update_scaling(void) { unsigned int factor = get_update_sysctl_factor(); @@ -1024,6 +1063,7 @@ int sched_update_scaling(void) return 0; } +#endif /* CONFIG_SCHED_BORE */ #endif #endif @@ -1261,6 +1301,9 @@ static void update_curr(struct cfs_rq *cfs_rq) if (entity_is_task(curr)) { struct task_struct *p = task_of(curr); +#ifdef CONFIG_SCHED_BORE + update_curr_bore(p, delta_exec); +#endif /* CONFIG_SCHED_BORE */ update_curr_task(p, delta_exec); /* @@ -3909,7 +3952,7 @@ static void reweight_eevdf(struct sched_entity *se, u64 avruntime, se->deadline = avruntime + vslice; } -static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, +void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { bool curr = cfs_rq->curr == se; @@ -5309,12 +5352,11 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - u64 vslice, vruntime = avg_vruntime(cfs_rq); + u64 vslice = 0, vruntime = avg_vruntime(cfs_rq); s64 lag = 0; if (!se->custom_slice) se->slice = sysctl_sched_base_slice; - vslice = calc_delta_fair(se->slice, se); /* * Due to how V is constructed as the weighted average of entities, @@ -5399,7 +5441,18 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) se->rel_deadline = 0; return; } - +#ifdef CONFIG_SCHED_BORE + if (entity_is_task(se) && + likely(sched_bore) && + task_of(se)->bore.futex_waiting) + goto vslice_found; +#endif /* !CONFIG_SCHED_BORE */ + vslice = calc_delta_fair(se->slice, se); +#ifdef CONFIG_SCHED_BORE + if (likely(sched_bore)) + vslice >>= !!(flags & (ENQUEUE_INITIAL | ENQUEUE_WAKEUP)); + else +#endif /* CONFIG_SCHED_BORE */ /* * When joining the competition; the existing tasks will be, * on average, halfway through their slice, as such start tasks @@ -5408,6 +5461,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) vslice /= 2; +#ifdef CONFIG_SCHED_BORE +vslice_found: +#endif /* CONFIG_SCHED_BORE */ /* * EEVDF: vd_i = ve_i + r_i/w_i */ @@ -5420,7 +5476,7 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); static inline bool cfs_bandwidth_used(void); static void -requeue_delayed_entity(struct sched_entity *se); +requeue_delayed_entity(struct sched_entity *se, int flags); static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, 
int flags) @@ -5583,6 +5639,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (sched_feat(DELAY_DEQUEUE) && delay && !entity_eligible(cfs_rq, se)) { update_load_avg(cfs_rq, se, 0); +#ifdef CONFIG_SCHED_BORE + if (sched_feat(DELAY_ZERO) && likely(sched_bore)) + update_entity_lag(cfs_rq, se); +#endif /* CONFIG_SCHED_BORE */ set_delayed(se); return false; } @@ -7003,7 +7063,7 @@ static int sched_idle_cpu(int cpu) #endif static void -requeue_delayed_entity(struct sched_entity *se) +requeue_delayed_entity(struct sched_entity *se, int flags) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -7016,13 +7076,22 @@ requeue_delayed_entity(struct sched_entity *se) SCHED_WARN_ON(!se->on_rq); if (sched_feat(DELAY_ZERO)) { +#ifdef CONFIG_SCHED_BORE + if (likely(sched_bore)) + flags |= ENQUEUE_WAKEUP; + else { +#endif /* CONFIG_SCHED_BORE */ + flags = 0; update_entity_lag(cfs_rq, se); +#ifdef CONFIG_SCHED_BORE + } +#endif /* CONFIG_SCHED_BORE */ if (se->vlag > 0) { cfs_rq->nr_running--; if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->vlag = 0; - place_entity(cfs_rq, se, 0); + place_entity(cfs_rq, se, flags); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); cfs_rq->nr_running++; @@ -7059,7 +7128,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) util_est_enqueue(&rq->cfs, p); if (flags & ENQUEUE_DELAYED) { - requeue_delayed_entity(se); + requeue_delayed_entity(se, flags); return; } @@ -7077,7 +7146,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { if (se->on_rq) { if (se->sched_delayed) - requeue_delayed_entity(se); + requeue_delayed_entity(se, flags); break; } cfs_rq = cfs_rq_of(se); @@ -7299,6 +7368,15 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) util_est_dequeue(&rq->cfs, p); util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); +#ifdef CONFIG_SCHED_BORE + struct cfs_rq *cfs_rq = &rq->cfs; + struct sched_entity *se = &p->se; + if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { + if (cfs_rq->curr == se) + update_curr(cfs_rq_of(&p->se)); + restart_burst_bore(p); + } +#endif /* CONFIG_SCHED_BORE */ if (dequeue_entities(rq, &p->se, flags) < 0) return false; @@ -9119,16 +9197,25 @@ static void yield_task_fair(struct rq *rq) /* * Are we the only task in the tree? */ +#if !defined(CONFIG_SCHED_BORE) if (unlikely(rq->nr_running == 1)) return; clear_buddies(cfs_rq, se); +#endif /* CONFIG_SCHED_BORE */ update_rq_clock(rq); /* * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); +#ifdef CONFIG_SCHED_BORE + restart_burst_rescale_deadline_bore(curr); + if (unlikely(rq->nr_running == 1)) + return; + + clear_buddies(cfs_rq, se); +#endif /* CONFIG_SCHED_BORE */ /* * Tell update_rq_clock() that we've just updated, * so we don't do microscopic update in schedule() @@ -13307,6 +13394,9 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) SCHED_WARN_ON(p->se.sched_delayed); attach_task_cfs_rq(p); +#ifdef CONFIG_SCHED_BORE + reset_task_bore(p); +#endif /* CONFIG_SCHED_BORE */ set_task_max_allowed_capacity(p); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 050d7503064e3a..b81b5a14750e16 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -18,6 +18,9 @@ SCHED_FEAT(PLACE_REL_DEADLINE, true) * 0-lag point or until is has exhausted it's slice. 
*/ SCHED_FEAT(RUN_TO_PARITY, true) +#ifdef CONFIG_SCHED_BORE +SCHED_FEAT(RUN_TO_PARITY_BORE, false) +#endif /* CONFIG_SCHED_BORE */ /* * Allow wakeup of tasks with a shorter slice to cancel RESPECT_SLICE for * current. diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f8aa96de60dad8..0610b03a69040f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2084,7 +2084,11 @@ static inline void update_sched_domain_debugfs(void) { } static inline void dirty_sched_domain_sysctl(int cpu) { } #endif +#ifdef CONFIG_SCHED_BORE +extern void sched_update_min_base_slice(void); +#else /* !CONFIG_SCHED_BORE */ extern int sched_update_scaling(void); +#endif /* CONFIG_SCHED_BORE */ static inline const struct cpumask *task_user_cpus(struct task_struct *p) { @@ -2806,7 +2810,12 @@ extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; +#ifdef CONFIG_SCHED_BORE +extern unsigned int sysctl_sched_min_base_slice; +extern __read_mostly uint sysctl_sched_base_slice; +#else /* !CONFIG_SCHED_BORE */ extern unsigned int sysctl_sched_base_slice; +#endif /* CONFIG_SCHED_BORE */ #ifdef CONFIG_SCHED_DEBUG extern int sysctl_resched_latency_warn_ms; From 5f3b5c8ea9fb04955ebc91712e48ad14e3751444 Mon Sep 17 00:00:00 2001 From: Masahito S Date: Sat, 10 May 2025 13:19:56 +0900 Subject: [PATCH 53/53] sched/fair: Prefer full-idle SMT cores When selecting an idle CPU for a task, always try to prioritize full-idle SMT cores (CPUs belonging to an SMT core where all its sibling are idle) over partially-idle cores. Signed-off-by: Andrea Righi Original patch at: https://web.git.kernel.org/pub/scm/linux/kernel/git/arighi/linux.git/commit/?h=sched-smt-idle --- kernel/sched/fair.c | 69 ++++++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 38 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a4550327427bf1..0624f23cf0547e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7737,9 +7737,14 @@ static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct tas return new_cpu; } +static inline bool is_idle_cpu(int cpu) +{ + return available_idle_cpu(cpu) || sched_idle_cpu(cpu); +} + static inline int __select_idle_cpu(int cpu, struct task_struct *p) { - if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) && + if (is_idle_cpu(cpu) && sched_cpu_cookie_match(cpu_rq(cpu), p)) return cpu; @@ -7750,6 +7755,24 @@ static inline int __select_idle_cpu(int cpu, struct task_struct *p) DEFINE_STATIC_KEY_FALSE(sched_smt_present); EXPORT_SYMBOL_GPL(sched_smt_present); +/* + * Return true if all the CPUs in the SMT core where @cpu belongs are idle, + * false otherwise. + */ +static bool is_idle_core(int cpu) +{ + int sibling; + + if (!sched_smt_active()) + return is_idle_cpu(cpu); + + for_each_cpu(sibling, cpu_smt_mask(cpu)) + if (!is_idle_cpu(sibling)) + return false; + + return true; +} + static inline void set_idle_cores(int cpu, int val) { struct sched_domain_shared *sds; @@ -7832,29 +7855,6 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu return -1; } -/* - * Scan the local SMT mask for idle CPUs. - */ -static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) -{ - int cpu; - - for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) { - if (cpu == target) - continue; - /* - * Check if the CPU is in the LLC scheduling domain of @target. 
- * Due to isolcpus, there is no guarantee that all the siblings are in the domain. - */ - if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) - continue; - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) - return cpu; - } - - return -1; -} - #else /* CONFIG_SCHED_SMT */ static inline void set_idle_cores(int cpu, int val) @@ -7871,9 +7871,9 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma return __select_idle_cpu(core, p); } -static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +static inline bool is_idle_core(int cpu) { - return -1; + return is_idle_cpu(cpu); } #endif /* CONFIG_SCHED_SMT */ @@ -7970,7 +7970,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) for_each_cpu_wrap(cpu, cpus, target) { unsigned long cpu_cap = capacity_of(cpu); - if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) + if (!is_idle_cpu(cpu)) continue; fits = util_fits_cpu(task_util, util_min, util_max, cpu); @@ -8041,7 +8041,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ lockdep_assert_irqs_disabled(); - if ((available_idle_cpu(target) || sched_idle_cpu(target)) && + if (is_idle_core(target) && asym_fits_cpu(task_util, util_min, util_max, target)) return target; @@ -8049,7 +8049,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * If the previous CPU is cache affine and idle, don't be stupid: */ if (prev != target && cpus_share_cache(prev, target) && - (available_idle_cpu(prev) || sched_idle_cpu(prev)) && + is_idle_core(prev) && asym_fits_cpu(task_util, util_min, util_max, prev)) { if (!static_branch_unlikely(&sched_cluster_active) || @@ -8081,7 +8081,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (recent_used_cpu != prev && recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && - (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && + is_idle_core(recent_used_cpu) && cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { @@ -8117,16 +8117,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (!sd) return target; - if (sched_smt_active()) { + if (sched_smt_active()) has_idle_core = test_idle_cores(target); - if (!has_idle_core && cpus_share_cache(prev, target)) { - i = select_idle_smt(p, sd, prev); - if ((unsigned int)i < nr_cpumask_bits) - return i; - } - } - i = select_idle_cpu(p, sd, has_idle_core, target); if ((unsigned)i < nr_cpumask_bits) return i;