From e25fc8b5118fba87acd5bc677641926a913bb3a5 Mon Sep 17 00:00:00 2001
From: Shell
Date: Thu, 16 May 2024 15:45:12 +0800
Subject: [PATCH] [libcpu/arm64] add C11 atomic ticket spinlock (#8882)

* [libcpu/arm64] add C11 atomic ticket spinlock

Replace the former flag-based spinlock implementation, which is unfair.

Besides, the C11 atomic implementation is more readable (it is plain C) and
easier to maintain, because the toolchain can use its builtin optimizations
and tune them for different micro-architectures. For example, Armv8.5
introduces a better instruction; the compiler can take advantage of it when
it knows the target platform supports it.
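
For reviewers who have not worked with ticket locks before, here is a
minimal, self-contained sketch of the idea in portable C11 (illustrative
only, not part of the diff; the names ticket_lock_t, ticket_lock and
ticket_unlock are invented for this example). Each acquirer atomically takes
a ticket from the "next" counter and spins until the "owner" counter reaches
that ticket, so the lock is granted in FIFO order, which is what makes it
fair:

    #include <stdatomic.h>
    #include <stdint.h>

    /* Illustrative C11 ticket lock, not the kernel implementation below. */
    typedef struct
    {
        _Atomic(uint16_t) owner; /* ticket currently being served */
        _Atomic(uint16_t) next;  /* next ticket to hand out */
    } ticket_lock_t;

    static void ticket_lock(ticket_lock_t *lock)
    {
        /* take a ticket; the counter increment itself needs no ordering */
        uint16_t ticket =
            atomic_fetch_add_explicit(&lock->next, 1, memory_order_relaxed);

        /* wait until our ticket is served; this acquire load pairs with
         * the release increment in ticket_unlock() */
        while (atomic_load_explicit(&lock->owner, memory_order_acquire) != ticket)
        {
            /* spin; the real patch additionally parks the core with wfe */
        }
    }

    static void ticket_unlock(ticket_lock_t *lock)
    {
        /* publish the critical section and serve the next ticket */
        atomic_fetch_add_explicit(&lock->owner, 1, memory_order_release);
    }

The patch applies the same scheme to rt_hw_spinlock_t: rt_hw_spin_lock takes
a ticket with a relaxed fetch_add on next, waits on owner with acquire
semantics (using wfe plus an exclusive load of the owner halfword), and
rt_hw_spin_unlock increments owner with release semantics.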

Signed-off-by: Shell

* fixup: RT_CPUS_NR

---------

Signed-off-by: Shell
---
 .../utest/testcases/kernel/sched_sem_tc.c |   4 +-
 libcpu/Kconfig                            |   3 +
 libcpu/aarch64/common/context_gcc.S       |   4 +
 libcpu/aarch64/common/cpu.c               | 109 ++++++----
 libcpu/aarch64/common/cpu_gcc.S           | 196 ++++++++++++------
 libcpu/aarch64/common/cpuport.h           |  38 +++-
 src/cpu_mp.c                              |   2 +-
 src/scheduler_mp.c                        |   1 +
 src/thread.c                              |   3 +-
 9 files changed, 257 insertions(+), 103 deletions(-)

diff --git a/examples/utest/testcases/kernel/sched_sem_tc.c b/examples/utest/testcases/kernel/sched_sem_tc.c
index a6302daba2..f071e143fe 100644
--- a/examples/utest/testcases/kernel/sched_sem_tc.c
+++ b/examples/utest/testcases/kernel/sched_sem_tc.c
@@ -29,7 +29,7 @@
 #error the thread priority should at least be greater than idle
 #endif
 
-static rt_atomic_t _star_counter = 1;
+static rt_atomic_t _star_counter;
 static struct rt_semaphore _thr_exit_sem;
 static struct rt_semaphore _level_waiting[TEST_LEVEL_COUNTS];
 static rt_thread_t _thread_matrix[TEST_LEVEL_COUNTS][KERN_TEST_CONCURRENT_THREADS];
@@ -157,6 +157,8 @@ static void scheduler_tc(void)
 static rt_err_t utest_tc_init(void)
 {
     LOG_I("Setup environment...");
+    _star_counter = 1;
+    rt_memset(_load_average, 0, sizeof(_load_average));
     rt_sem_init(&_thr_exit_sem, "test", 0, RT_IPC_FLAG_PRIO);
 
     for (size_t i = 0; i < TEST_LEVEL_COUNTS; i++)
diff --git a/libcpu/Kconfig b/libcpu/Kconfig
index 200c2f6295..540794c43f 100644
--- a/libcpu/Kconfig
+++ b/libcpu/Kconfig
@@ -12,6 +12,9 @@ if ARCH_ARMV8 && ARCH_CPU_64BIT
         config ARCH_HAVE_EFFICIENT_UNALIGNED_ACCESS
             bool
             default y
+        config ARCH_USING_GENERIC_CPUID
+            bool "Using generic cpuid implementation"
+            default n
     endmenu
 endif
 
diff --git a/libcpu/aarch64/common/context_gcc.S b/libcpu/aarch64/common/context_gcc.S
index 9feebd0a89..f27144b764 100644
--- a/libcpu/aarch64/common/context_gcc.S
+++ b/libcpu/aarch64/common/context_gcc.S
@@ -44,7 +44,11 @@ int rt_hw_cpu_id(void)
 .weak rt_hw_cpu_id
 .type rt_hw_cpu_id, @function
 rt_hw_cpu_id:
+#if RT_CPUS_NR > 1
     mrs x0, tpidr_el1
+#else
+    mov x0, xzr
+#endif
     ret
 
 /*
diff --git a/libcpu/aarch64/common/cpu.c b/libcpu/aarch64/common/cpu.c
index 75781abdae..72ccccac24 100644
--- a/libcpu/aarch64/common/cpu.c
+++ b/libcpu/aarch64/common/cpu.c
@@ -8,6 +8,7 @@
  * 2011-09-15     Bernard      first version
  * 2019-07-28     zdzn         add smp support
  * 2023-02-21     GuEe-GUI     mov cpu ofw init to setup
+ * 2024-04-29     Shell        Add generic ticket spinlock using C11 atomic
  */
 
 #include
@@ -55,65 +56,101 @@ rt_weak rt_uint64_t rt_cpu_mpidr_early[] =
 };
 #endif /* RT_USING_SMART */
 
-static inline void arch_spin_lock(arch_spinlock_t *lock)
-{
-    unsigned int tmp;
+/* in support of C11 atomic */
+#if __STDC_VERSION__ >= 201112L
+#include <stdatomic.h>
 
-    asm volatile(
-        "   sevl\n"
-        "1: wfe\n"
-        "2: ldaxr   %w0, %1\n"
-        "   cbnz    %w0, 1b\n"
-        "   stxr    %w0, %w2, %1\n"
-        "   cbnz    %w0, 2b\n"
-        : "=&r" (tmp), "+Q" (lock->lock)
-        : "r" (1)
-        : "cc", "memory");
+union _spinlock
+{
+    _Atomic(rt_uint32_t) _value;
+    struct
+    {
+        _Atomic(rt_uint16_t) owner;
+        _Atomic(rt_uint16_t) next;
+    } ticket;
+};
+
+void rt_hw_spin_lock_init(rt_hw_spinlock_t *_lock)
+{
+    union _spinlock *lock = (void *)_lock;
+
+    /**
+     * just a note that this is an atomic operation, though it always is
+     * on arm64 even without using the atomic API
+     */
+    atomic_store_explicit(&lock->_value, 0, memory_order_relaxed);
 }
 
-static inline int arch_spin_trylock(arch_spinlock_t *lock)
+rt_bool_t rt_hw_spin_trylock(rt_hw_spinlock_t *_lock)
 {
-    unsigned int tmp;
+    rt_bool_t rc;
+    rt_uint32_t readonce;
+    union _spinlock temp;
+    union _spinlock *lock = (void *)_lock;
 
-    asm volatile(
-        "   ldaxr   %w0, %1\n"
-        "   cbnz    %w0, 1f\n"
-        "   stxr    %w0, %w2, %1\n"
-        "1:\n"
-        : "=&r" (tmp), "+Q" (lock->lock)
-        : "r" (1)
-        : "cc", "memory");
+    readonce = atomic_load_explicit(&lock->_value, memory_order_acquire);
+    temp._value = readonce;
 
-    return !tmp;
+    if (temp.ticket.owner != temp.ticket.next)
+    {
+        rc = RT_FALSE;
+    }
+    else
+    {
+        temp.ticket.next += 1;
+        rc = atomic_compare_exchange_strong_explicit(
+            &lock->_value, &readonce, temp._value,
+            memory_order_acquire, memory_order_relaxed);
+    }
+    return rc;
 }
 
-static inline void arch_spin_unlock(arch_spinlock_t *lock)
+rt_inline rt_base_t _load_acq_exclusive(_Atomic(rt_uint16_t) *halfword)
 {
-    asm volatile(
-        "   stlr    %w1, %0\n"
-        : "=Q" (lock->lock) : "r" (0) : "memory");
+    rt_uint32_t old;
+    __asm__ volatile("ldaxrh %w0, [%1]"
+                     : "=&r"(old)
+                     : "r"(halfword)
+                     : "memory");
+    return old;
 }
 
-void rt_hw_spin_lock_init(arch_spinlock_t *lock)
+rt_inline void _send_event_local(void)
 {
-    lock->lock = 0;
+    __asm__ volatile("sevl");
 }
 
-void rt_hw_spin_lock(rt_hw_spinlock_t *lock)
+rt_inline void _wait_for_event(void)
 {
-    arch_spin_lock(lock);
+    __asm__ volatile("wfe" ::: "memory");
 }
 
-void rt_hw_spin_unlock(rt_hw_spinlock_t *lock)
+void rt_hw_spin_lock(rt_hw_spinlock_t *_lock)
 {
-    arch_spin_unlock(lock);
+    union _spinlock *lock = (void *)_lock;
+    rt_uint16_t ticket =
+        atomic_fetch_add_explicit(&lock->ticket.next, 1, memory_order_relaxed);
+
+    if (atomic_load_explicit(&lock->ticket.owner, memory_order_acquire) !=
+        ticket)
+    {
+        _send_event_local();
+        do
+        {
+            _wait_for_event();
+        }
+        while (_load_acq_exclusive(&lock->ticket.owner) != ticket);
+    }
 }
 
-rt_bool_t rt_hw_spin_trylock(rt_hw_spinlock_t *lock)
+void rt_hw_spin_unlock(rt_hw_spinlock_t *_lock)
 {
-    return arch_spin_trylock(lock);
+    union _spinlock *lock = (void *)_lock;
+    atomic_fetch_add_explicit(&lock->ticket.owner, 1, memory_order_release);
 }
+#endif
+
 static int _cpus_init_data_hardcoded(int num_cpus, rt_uint64_t *cpu_hw_ids, struct cpu_ops_t *cpu_ops[])
 {
     // load in cpu_hw_ids in cpuid_to_hwid,
diff --git a/libcpu/aarch64/common/cpu_gcc.S b/libcpu/aarch64/common/cpu_gcc.S
index 445c104dce..e0e6544eb9 100644
--- a/libcpu/aarch64/common/cpu_gcc.S
+++ b/libcpu/aarch64/common/cpu_gcc.S
@@ -1,103 +1,177 @@
 /*
- * Copyright (c) 2006-2020, RT-Thread Development Team
+ * Copyright (c) 2006-2024, RT-Thread Development Team
  *
  * SPDX-License-Identifier: Apache-2.0
  *
  * Date           Author       Notes
  * 2018-10-06     ZhaoXiaowei  the first version
+ * 2024-04-28     Shell        add generic spinlock implementation
  */
-
+
 .text
 .globl rt_hw_get_current_el
 rt_hw_get_current_el:
-        MRS X0, CurrentEL
-        CMP X0, 0xc
-        B.EQ 3f
-        CMP X0, 0x8
-        B.EQ 2f
-        CMP X0, 0x4
-        B.EQ 1f
-
-        LDR X0, =0
-        B 0f
+    MRS X0, CurrentEL
+    CMP X0, 0xc
+    B.EQ 3f
+    CMP X0, 0x8
+    B.EQ 2f
+    CMP X0, 0x4
+    B.EQ 1f
+
+    LDR X0, =0
+    B 0f
 3:
-        LDR X0, =3
-        B 0f
+    LDR X0, =3
+    B 0f
 2:
-        LDR X0, =2
-        B 0f
+    LDR X0, =2
+    B 0f
 1:
-        LDR X0, =1
-        B 0f
+    LDR X0, =1
+    B 0f
 0:
-        RET
+    RET
 
 .globl rt_hw_set_current_vbar
 rt_hw_set_current_vbar:
-        MRS X1, CurrentEL
-        CMP X1, 0xc
-        B.EQ 3f
-        CMP X1, 0x8
-        B.EQ 2f
-        CMP X1, 0x4
-        B.EQ 1f
-        B 0f
+    MRS X1, CurrentEL
+    CMP X1, 0xc
+    B.EQ 3f
+    CMP X1, 0x8
+    B.EQ 2f
+    CMP X1, 0x4
+    B.EQ 1f
+    B 0f
 3:
-        MSR VBAR_EL3,X0
-        B 0f
+    MSR VBAR_EL3,X0
+    B 0f
 2:
-        MSR VBAR_EL2,X0
-        B 0f
+    MSR VBAR_EL2,X0
+    B 0f
 1:
-        MSR VBAR_EL1,X0
-        B 0f
+    MSR VBAR_EL1,X0
+    B 0f
 0:
-        RET
+    RET
 
 .globl rt_hw_set_elx_env
 rt_hw_set_elx_env:
-        MRS X1, CurrentEL
-        CMP X1, 0xc
-        B.EQ 3f
-        CMP X1, 0x8
-        B.EQ 2f
-        CMP X1, 0x4
-        B.EQ 1f
-        B 0f
+    MRS X1, CurrentEL
+    CMP X1, 0xc
+    B.EQ 3f
+    CMP X1, 0x8
+    B.EQ 2f
+    CMP X1, 0x4
+    B.EQ 1f
+    B 0f
 3:
-        MRS X0, SCR_EL3
-        ORR X0, X0, #0xF    /* SCR_EL3.NS|IRQ|FIQ|EA */
-        MSR SCR_EL3, X0
-        B 0f
+    MRS X0, SCR_EL3
+    ORR X0, X0, #0xF    /* SCR_EL3.NS|IRQ|FIQ|EA */
+    MSR SCR_EL3, X0
+    B 0f
 2:
-        MRS X0, HCR_EL2
-        ORR X0, X0, #0x38
-        MSR HCR_EL2, X0
-        B 0f
+    MRS X0, HCR_EL2
+    ORR X0, X0, #0x38
+    MSR HCR_EL2, X0
+    B 0f
 1:
-        B 0f
+    B 0f
 0:
-        RET
+    RET
 
-.global rt_cpu_vector_set_base
+.globl rt_cpu_vector_set_base
 rt_cpu_vector_set_base:
-        MSR VBAR_EL1,X0
+    MSR VBAR_EL1,X0
     RET
 
 /**
  * unsigned long rt_hw_ffz(unsigned long x)
  */
-.global rt_hw_ffz
+.globl rt_hw_ffz
 rt_hw_ffz:
-        mvn x1, x0
-        clz x0, x1
-        mov x1, #0x3f
-        sub x0, x1, x0
+    mvn x1, x0
+    clz x0, x1
+    mov x1, #0x3f
+    sub x0, x1, x0
     ret
 
-.global rt_hw_clz
+.globl rt_hw_clz
 rt_hw_clz:
-        clz x0, x0
+    clz x0, x0
+    ret
+
+/**
+ * Spinlock (fallback implementation)
+ */
+
+rt_hw_spin_lock_init:
+    .weak rt_hw_spin_lock_init
+    stlr wzr, [x0]
+    ret
+
+rt_hw_spin_trylock:
+    .weak rt_hw_spin_trylock
+    sub sp, sp, #16
+    ldar w2, [x0]
+    add x1, sp, 8
+    stlr w2, [x1]
+    ldarh w1, [x1]
+    and w1, w1, 65535
+    add x3, sp, 10
+    ldarh w3, [x3]
+    cmp w1, w3, uxth
+    beq 1f
+    mov w0, 0
+    add sp, sp, 16
+    ret
+1:
+    add x1, sp, 10
+2:
+    ldaxrh w3, [x1]
+    add w3, w3, 1
+    stlxrh w4, w3, [x1]
+    cbnz w4, 2b
+    add x1, sp, 8
+    ldar w1, [x1]
+3:
+    ldaxr w3, [x0]
+    cmp w3, w2
+    bne 4f
+    stxr w4, w1, [x0]
+    cbnz w4, 3b
+4:
+    cset w0, eq
+    add sp, sp, 16
+    ret
+
+rt_hw_spin_lock:
+    .weak rt_hw_spin_lock
+    add x1, x0, 2
+1:
+    ldxrh w2, [x1]
+    add w3, w2, 1
+    stxrh w4, w3, [x1]
+    cbnz w4, 1b
+    and w2, w2, 65535
+    ldarh w1, [x0]
+    cmp w2, w1, uxth
+    beq 3f
+    sevl
+2:
+    wfe
+    ldaxrh w1, [x0]
+    cmp w2, w1
+    bne 2b
+3:
+    ret
+
+rt_hw_spin_unlock:
+    .weak rt_hw_spin_unlock
+    ldxrh w1, [x0]
+    add w1, w1, 1
+    stlxrh w2, w1, [x0]
+    cbnz w2, rt_hw_spin_unlock
     ret
diff --git a/libcpu/aarch64/common/cpuport.h b/libcpu/aarch64/common/cpuport.h
index 5329b20933..15500894db 100644
--- a/libcpu/aarch64/common/cpuport.h
+++ b/libcpu/aarch64/common/cpuport.h
@@ -17,10 +17,42 @@
 #include
 
 #ifdef RT_USING_SMP
-typedef struct {
-    volatile unsigned int lock;
+
+/**
+ * Spinlock
+ */
+
+typedef struct
+{
+    rt_uint32_t value;
 } rt_hw_spinlock_t;
-#endif
+
+/**
+ * Generic hw-cpu-id
+ */
+#ifdef ARCH_USING_GENERIC_CPUID
+
+#if RT_CPUS_NR > 1
+
+rt_inline int rt_hw_cpu_id(void)
+{
+    long cpuid;
+    __asm__ volatile("mrs %0, tpidr_el1":"=r"(cpuid));
+    return cpuid;
+}
+
+#else
+
+rt_inline int rt_hw_cpu_id(void)
+{
+    return 0;
+}
+
+#endif /* RT_CPUS_NR > 1 */
+
+#endif /* ARCH_USING_GENERIC_CPUID */
+
+#endif /* RT_USING_SMP */
 
 #define rt_hw_barrier(cmd, ...) \
     __asm__ volatile (RT_STRINGIFY(cmd) " "RT_STRINGIFY(__VA_ARGS__):::"memory")
diff --git a/src/cpu_mp.c b/src/cpu_mp.c
index 238ddb420b..a10e35f479 100644
--- a/src/cpu_mp.c
+++ b/src/cpu_mp.c
@@ -146,7 +146,7 @@ rt_base_t rt_cpus_lock(void)
     pcpu = rt_cpu_self();
     if (pcpu->current_thread != RT_NULL)
     {
-        register rt_ubase_t lock_nest = rt_atomic_load(&(pcpu->current_thread->cpus_lock_nest));
+        rt_ubase_t lock_nest = rt_atomic_load(&(pcpu->current_thread->cpus_lock_nest));
         rt_atomic_add(&(pcpu->current_thread->cpus_lock_nest), 1);
 
         if (lock_nest == 0)
diff --git a/src/scheduler_mp.c b/src/scheduler_mp.c
index ee32d9bc60..c1626c5f99 100644
--- a/src/scheduler_mp.c
+++ b/src/scheduler_mp.c
@@ -1089,6 +1089,7 @@ void rt_exit_critical_safe(rt_base_t critical_level)
 
 void rt_exit_critical_safe(rt_base_t critical_level)
 {
+    RT_UNUSED(critical_level);
     return rt_exit_critical();
 }
 
diff --git a/src/thread.c b/src/thread.c
index 77f65bc68b..8d4752409d 100644
--- a/src/thread.c
+++ b/src/thread.c
@@ -363,7 +363,8 @@ rt_thread_t rt_thread_self(void)
     self = rt_cpu_self()->current_thread;
     rt_hw_local_irq_enable(lock);
     return self;
-#else
+
+#else /* !RT_USING_SMP */
     extern rt_thread_t rt_current_thread;
 
     return rt_current_thread;