[libcpu/arm64] add C11 atomic ticket spinlock (#8882)
* [libcpu/arm64] add C11 atomic ticket spinlock

Replace the former flag-based spinlock implementation, which is unfair. Besides, the C11 atomic implementation is more readable (it is plain C) and more maintainable: the toolchain can apply its built-in optimizations and tune for different micro-architectures. For example, Armv8.5 introduces a better instruction for this pattern, and the compiler can take advantage of it when it knows the target platform supports it.

Signed-off-by: Shell <smokewood@qq.com>

* fixup: RT_CPUS_NR

---------

Signed-off-by: Shell <smokewood@qq.com>
parent e46333496f
commit e25fc8b511
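Why a ticket lock is fair, in a nutshell: every contender atomically draws a ticket from a "next" counter and spins until an "owner" counter reaches that ticket, so the lock is granted strictly in arrival order and no CPU can be starved. A conceptual C11 sketch follows (generic and simplified, not the diff's code verbatim; the arm64 implementation below adds WFE/SEV-based waiting and RT-Thread types):

/* Conceptual sketch of a C11 ticket lock (simplified; no WFE/SEV power saving). */
#include <stdatomic.h>
#include <stdint.h>

typedef struct {
    _Atomic(uint16_t) owner;  /* ticket currently being served */
    _Atomic(uint16_t) next;   /* next ticket to hand out */
} ticket_lock_t;

static void ticket_lock(ticket_lock_t *l)
{
    /* take a ticket; FIFO order is what makes the lock fair */
    uint16_t t = atomic_fetch_add_explicit(&l->next, 1, memory_order_relaxed);
    while (atomic_load_explicit(&l->owner, memory_order_acquire) != t)
        ;  /* spin until our ticket is served */
}

static void ticket_unlock(ticket_lock_t *l)
{
    /* serve the next waiter; release pairs with the acquire above */
    atomic_fetch_add_explicit(&l->owner, 1, memory_order_release);
}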
@@ -29,7 +29,7 @@
 #error the thread priority should at least be greater than idle
 #endif

-static rt_atomic_t _star_counter = 1;
+static rt_atomic_t _star_counter;
 static struct rt_semaphore _thr_exit_sem;
 static struct rt_semaphore _level_waiting[TEST_LEVEL_COUNTS];
 static rt_thread_t _thread_matrix[TEST_LEVEL_COUNTS][KERN_TEST_CONCURRENT_THREADS];
@@ -157,6 +157,8 @@ static void scheduler_tc(void)
 static rt_err_t utest_tc_init(void)
 {
     LOG_I("Setup environment...");
+    _star_counter = 1;
+    rt_memset(_load_average, 0, sizeof(_load_average));
     rt_sem_init(&_thr_exit_sem, "test", 0, RT_IPC_FLAG_PRIO);

     for (size_t i = 0; i < TEST_LEVEL_COUNTS; i++)
@@ -12,6 +12,9 @@ if ARCH_ARMV8 && ARCH_CPU_64BIT
     config ARCH_HAVE_EFFICIENT_UNALIGNED_ACCESS
         bool
         default y
+    config ARCH_USING_GENERIC_CPUID
+        bool "Using generic cpuid implemenation"
+        default n
 endmenu
 endif
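The new ARCH_USING_GENERIC_CPUID option defaults to n; a BSP that wants the header-inline rt_hw_cpu_id() (reading TPIDR_EL1) instead of the assembly routine can select it. A hypothetical example (the board symbol name is illustrative, not from this diff):

    config BSP_USING_MY_ARMV8_BOARD        # hypothetical board symbol
        bool "My ARMv8 board"
        select ARCH_USING_GENERIC_CPUID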
@@ -44,7 +44,11 @@ int rt_hw_cpu_id(void)
 .weak rt_hw_cpu_id
 .type rt_hw_cpu_id, @function
 rt_hw_cpu_id:
+#if RT_CPUS_NR > 1
     mrs x0, tpidr_el1
+#else
+    mov x0, xzr
+#endif
     ret

 /*
@@ -8,6 +8,7 @@
  * 2011-09-15     Bernard      first version
  * 2019-07-28     zdzn         add smp support
  * 2023-02-21     GuEe-GUI     mov cpu ofw init to setup
+ * 2024-04-29     Shell        Add generic ticket spinlock using C11 atomic
  */

 #include <rthw.h>
@@ -55,65 +56,101 @@ rt_weak rt_uint64_t rt_cpu_mpidr_early[] =
 };
 #endif /* RT_USING_SMART */

-static inline void arch_spin_lock(arch_spinlock_t *lock)
-{
-    unsigned int tmp;
+/* in support of C11 atomic */
+#if __STDC_VERSION__ >= 201112L
+#include <stdatomic.h>

-    asm volatile(
-        " sevl\n"
-        "1: wfe\n"
-        "2: ldaxr %w0, %1\n"
-        " cbnz %w0, 1b\n"
-        " stxr %w0, %w2, %1\n"
-        " cbnz %w0, 2b\n"
-        : "=&r" (tmp), "+Q" (lock->lock)
-        : "r" (1)
-        : "cc", "memory");
+union _spinlock
+{
+    _Atomic(rt_uint32_t) _value;
+    struct
+    {
+        _Atomic(rt_uint16_t) owner;
+        _Atomic(rt_uint16_t) next;
+    } ticket;
+};
+
+void rt_hw_spin_lock_init(rt_hw_spinlock_t *_lock)
+{
+    union _spinlock *lock = (void *)_lock;
+
+    /**
+     * just a dummy note that this is an atomic operation, though it alway is
+     * even without usage of atomic API in arm64
+     */
+    atomic_store_explicit(&lock->_value, 0, memory_order_relaxed);
 }

-static inline int arch_spin_trylock(arch_spinlock_t *lock)
+rt_bool_t rt_hw_spin_trylock(rt_hw_spinlock_t *_lock)
 {
-    unsigned int tmp;
+    rt_bool_t rc;
+    rt_uint32_t readonce;
+    union _spinlock temp;
+    union _spinlock *lock = (void *)_lock;

-    asm volatile(
-        " ldaxr %w0, %1\n"
-        " cbnz %w0, 1f\n"
-        " stxr %w0, %w2, %1\n"
-        "1:\n"
-        : "=&r" (tmp), "+Q" (lock->lock)
-        : "r" (1)
-        : "cc", "memory");
+    readonce = atomic_load_explicit(&lock->_value, memory_order_acquire);
+    temp._value = readonce;

-    return !tmp;
+    if (temp.ticket.owner != temp.ticket.next)
+    {
+        rc = RT_FALSE;
+    }
+    else
+    {
+        temp.ticket.next += 1;
+        rc = atomic_compare_exchange_strong_explicit(
+            &lock->_value, &readonce, temp._value,
+            memory_order_acquire, memory_order_relaxed);
+    }
+    return rc;
 }

-static inline void arch_spin_unlock(arch_spinlock_t *lock)
+rt_inline rt_base_t _load_acq_exclusive(_Atomic(rt_uint16_t) *halfword)
 {
-    asm volatile(
-        " stlr %w1, %0\n"
-        : "=Q" (lock->lock) : "r" (0) : "memory");
+    rt_uint32_t old;
+    __asm__ volatile("ldaxrh %w0, [%1]"
+                     : "=&r"(old)
+                     : "r"(halfword)
+                     : "memory");
+    return old;
 }

-void rt_hw_spin_lock_init(arch_spinlock_t *lock)
+rt_inline void _send_event_local(void)
 {
-    lock->lock = 0;
+    __asm__ volatile("sevl");
 }

-void rt_hw_spin_lock(rt_hw_spinlock_t *lock)
+rt_inline void _wait_for_event(void)
 {
-    arch_spin_lock(lock);
+    __asm__ volatile("wfe" ::: "memory");
 }

-void rt_hw_spin_unlock(rt_hw_spinlock_t *lock)
+void rt_hw_spin_lock(rt_hw_spinlock_t *_lock)
 {
-    arch_spin_unlock(lock);
+    union _spinlock *lock = (void *)_lock;
+    rt_uint16_t ticket =
+        atomic_fetch_add_explicit(&lock->ticket.next, 1, memory_order_relaxed);
+
+    if (atomic_load_explicit(&lock->ticket.owner, memory_order_acquire) !=
+        ticket)
+    {
+        _send_event_local();
+        do
+        {
+            _wait_for_event();
+        }
+        while (_load_acq_exclusive(&lock->ticket.owner) != ticket);
+    }
 }

-rt_bool_t rt_hw_spin_trylock(rt_hw_spinlock_t *lock)
+void rt_hw_spin_unlock(rt_hw_spinlock_t *_lock)
 {
-    return arch_spin_trylock(lock);
+    union _spinlock *lock = (void *)_lock;
+    atomic_fetch_add_explicit(&lock->ticket.owner, 1, memory_order_release);
 }
+
+#endif

 static int _cpus_init_data_hardcoded(int num_cpus, rt_uint64_t *cpu_hw_ids, struct cpu_ops_t *cpu_ops[])
 {
     // load in cpu_hw_ids in cpuid_to_hwid,
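For context, a minimal caller-side sketch of the rt_hw_* primitives this hunk implements (illustrative names only; real kernel code normally goes through the rt_spin_lock()/rt_spin_unlock() wrappers, which also handle interrupts and scheduling around the lock, and this sketch assumes the usual RT-Thread headers are available):

/* Illustrative only: a shared counter guarded by the low-level ticket lock. */
#include <rthw.h>
#include <rtthread.h>

static rt_hw_spinlock_t _counter_lock;   /* hypothetical lock instance */
static rt_uint32_t _shared_counter;      /* hypothetical shared data */

void counter_init(void)
{
    rt_hw_spin_lock_init(&_counter_lock);
}

void counter_add(void)
{
    rt_hw_spin_lock(&_counter_lock);     /* spin until our ticket is served */
    _shared_counter++;
    rt_hw_spin_unlock(&_counter_lock);   /* hand the lock to the next ticket */
}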
@@ -1,10 +1,11 @@
 /*
- * Copyright (c) 2006-2020, RT-Thread Development Team
+ * Copyright (c) 2006-2024, RT-Thread Development Team
  *
  * SPDX-License-Identifier: Apache-2.0
  *
  * Date           Author       Notes
  * 2018-10-06     ZhaoXiaowei  the first version
+ * 2024-04-28     Shell        add generic spinlock implementation
  */

 .text
@@ -80,7 +81,7 @@ rt_hw_set_elx_env:
 0:
     RET

-.global rt_cpu_vector_set_base
+.globl rt_cpu_vector_set_base
 rt_cpu_vector_set_base:
     MSR VBAR_EL1,X0
     RET
@@ -89,7 +90,7 @@ rt_cpu_vector_set_base:
 /**
  * unsigned long rt_hw_ffz(unsigned long x)
  */
-.global rt_hw_ffz
+.globl rt_hw_ffz
 rt_hw_ffz:
     mvn x1, x0
     clz x0, x1
@@ -97,7 +98,80 @@ rt_hw_ffz:
     sub x0, x1, x0
     ret

-.global rt_hw_clz
+.globl rt_hw_clz
 rt_hw_clz:
     clz x0, x0
     ret
+
+/**
+ * Spinlock (fallback implementation)
+ */
+
+rt_hw_spin_lock_init:
+    .weak   rt_hw_spin_lock_init
+    stlr    wzr, [x0]
+    ret
+
+rt_hw_spin_trylock:
+    .weak   rt_hw_spin_trylock
+    sub     sp, sp, #16
+    ldar    w2, [x0]
+    add     x1, sp, 8
+    stlr    w2, [x1]
+    ldarh   w1, [x1]
+    and     w1, w1, 65535
+    add     x3, sp, 10
+    ldarh   w3, [x3]
+    cmp     w1, w3, uxth
+    beq     1f
+    mov     w0, 0
+    add     sp, sp, 16
+    ret
+1:
+    add     x1, sp, 10
+2:
+    ldaxrh  w3, [x1]
+    add     w3, w3, 1
+    stlxrh  w4, w3, [x1]
+    cbnz    w4, 2b
+    add     x1, sp, 8
+    ldar    w1, [x1]
+3:
+    ldaxr   w3, [x0]
+    cmp     w3, w2
+    bne     4f
+    stxr    w4, w1, [x0]
+    cbnz    w4, 3b
+4:
+    cset    w0, eq
+    add     sp, sp, 16
+    ret
+
+rt_hw_spin_lock:
+    .weak   rt_hw_spin_lock
+    add     x1, x0, 2
+1:
+    ldxrh   w2, [x1]
+    add     w3, w2, 1
+    stxrh   w4, w3, [x1]
+    cbnz    w4, 1b
+    and     w2, w2, 65535
+    ldarh   w1, [x0]
+    cmp     w2, w1, uxth
+    beq     3f
+    sevl
+2:
+    wfe
+    ldaxrh  w1, [x0]
+    cmp     w2, w1
+    bne     2b
+3:
+    ret
+
+rt_hw_spin_unlock:
+    .weak   rt_hw_spin_unlock
+    ldxrh   w1, [x0]
+    add     w1, w1, 1
+    stlxrh  w2, w1, [x0]
+    cbnz    w2, rt_hw_spin_unlock
+    ret
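The assembly routines in this hunk are all declared .weak, so they only act as a fallback: when cpu.c is compiled by a toolchain with C11 atomics (__STDC_VERSION__ >= 201112L), its strong definitions of the same symbols win at link time and the assembly versions are dropped. A minimal sketch of that mechanism (the function name is hypothetical, and this assumes GCC/Clang weak-symbol semantics):

/* fallback.c -- weak default, analogous to the .weak asm routines above */
__attribute__((weak)) int impl_selected(void) { return 0; /* fallback path */ }

/* strong.c -- strong definition, analogous to the C11 code in cpu.c;
 * when both objects are linked, the linker keeps the strong symbol */
int impl_selected(void) { return 1; /* preferred path */ }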
@@ -17,10 +17,42 @@
 #include <rtdef.h>

 #ifdef RT_USING_SMP
-typedef struct {
-    volatile unsigned int lock;
+
+/**
+ * Spinlock
+ */
+
+typedef struct
+{
+    rt_uint32_t value;
 } rt_hw_spinlock_t;
-#endif
+
+/**
+ * Generic hw-cpu-id
+ */
+#ifdef ARCH_USING_GENERIC_CPUID
+
+#if RT_CPUS_NR > 1
+
+rt_inline int rt_hw_cpu_id(void)
+{
+    long cpuid;
+    __asm__ volatile("mrs %0, tpidr_el1":"=r"(cpuid));
+    return cpuid;
+}
+
+#else
+
+rt_inline int rt_hw_cpu_id(void)
+{
+    return 0;
+}
+
+#endif /* RT_CPUS_NR > 1 */
+
+#endif /* ARCH_USING_GENERIC_CPUID */
+
+#endif /* RT_USING_SMP */

 #define rt_hw_barrier(cmd, ...) \
     __asm__ volatile (RT_STRINGIFY(cmd) " "RT_STRINGIFY(__VA_ARGS__):::"memory")
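For reference, the rt_hw_barrier() helper stringifies its arguments into a single inline-asm statement, and the generic rt_hw_cpu_id() reads TPIDR_EL1 on SMP builds. Illustrative uses (assuming RT_STRINGIFY stringizes its argument as used in the macro above; the barrier operands are ordinary AArch64 choices, not mandated by this diff):

rt_hw_barrier(dsb, ish);   /* expands roughly to __asm__ volatile ("dsb" " " "ish" ::: "memory") */
int cpu = rt_hw_cpu_id();  /* reads TPIDR_EL1 when RT_CPUS_NR > 1, otherwise returns 0 */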
@@ -146,7 +146,7 @@ rt_base_t rt_cpus_lock(void)
     pcpu = rt_cpu_self();
     if (pcpu->current_thread != RT_NULL)
     {
-        register rt_ubase_t lock_nest = rt_atomic_load(&(pcpu->current_thread->cpus_lock_nest));
+        rt_ubase_t lock_nest = rt_atomic_load(&(pcpu->current_thread->cpus_lock_nest));

         rt_atomic_add(&(pcpu->current_thread->cpus_lock_nest), 1);
         if (lock_nest == 0)
@@ -1089,6 +1089,7 @@ void rt_exit_critical_safe(rt_base_t critical_level)

 void rt_exit_critical_safe(rt_base_t critical_level)
 {
+    RT_UNUSED(critical_level);
     return rt_exit_critical();
 }

@@ -363,7 +363,8 @@ rt_thread_t rt_thread_self(void)
     self = rt_cpu_self()->current_thread;
     rt_hw_local_irq_enable(lock);
     return self;
-#else
+
+#else /* !RT_USING_SMP */
     extern rt_thread_t rt_current_thread;

     return rt_current_thread;