[libcpu/arm64] add C11 atomic ticket spinlock (#8882)

* [libcpu/arm64] add C11 atomic ticket spinlock

Replace the former flag-based spinlock implementation, which is unfair.

Besides, the C11 atomic implementation is more readable (it is plain C) and
more maintainable, because the toolchain can apply its built-in optimizations
and tune for different micro-architectures. For example, armv8.5 introduces a
better instruction; the compiler can take advantage of it when it knows the
target platform supports it.
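
As a sketch of the idea (illustrative code, not the patch itself; the real
implementation, including the WFE-based wait loop, is in cpu.c below): each
waiter takes a ticket with a fetch-and-add and spins until the owner counter
reaches its ticket, so the lock is granted in arrival order. Being plain C11,
a compiler targeting Armv8.1-A or later can typically lower the fetch-and-add
to a single LSE atomic instead of an LL/SC loop.

/* Minimal, generic C11 ticket lock -- an illustrative sketch, not the patch code. */
#include <stdatomic.h>
#include <stdint.h>

struct ticket_lock
{
    _Atomic(uint16_t) owner; /* ticket currently being served */
    _Atomic(uint16_t) next;  /* next ticket to hand out */
};

static void ticket_lock_acquire(struct ticket_lock *l)
{
    /* take a ticket; arrival order decides lock order, which is the fairness */
    uint16_t t = atomic_fetch_add_explicit(&l->next, 1, memory_order_relaxed);
    while (atomic_load_explicit(&l->owner, memory_order_acquire) != t)
        ; /* spin until our ticket is served */
}

static void ticket_lock_release(struct ticket_lock *l)
{
    /* pass the lock to the next waiter in FIFO order */
    atomic_fetch_add_explicit(&l->owner, 1, memory_order_release);
}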

Signed-off-by: Shell <smokewood@qq.com>

* fixup: RT_CPUS_NR

---------

Signed-off-by: Shell <smokewood@qq.com>
Shell 2024-05-16 15:45:12 +08:00 committed by GitHub
parent e46333496f
commit e25fc8b511
9 changed files with 257 additions and 103 deletions

View File

@ -29,7 +29,7 @@
#error the thread priority should at least be greater than idle
#endif
static rt_atomic_t _star_counter = 1;
static rt_atomic_t _star_counter;
static struct rt_semaphore _thr_exit_sem;
static struct rt_semaphore _level_waiting[TEST_LEVEL_COUNTS];
static rt_thread_t _thread_matrix[TEST_LEVEL_COUNTS][KERN_TEST_CONCURRENT_THREADS];
@ -157,6 +157,8 @@ static void scheduler_tc(void)
static rt_err_t utest_tc_init(void)
{
LOG_I("Setup environment...");
_star_counter = 1;
rt_memset(_load_average, 0, sizeof(_load_average));
rt_sem_init(&_thr_exit_sem, "test", 0, RT_IPC_FLAG_PRIO);
for (size_t i = 0; i < TEST_LEVEL_COUNTS; i++)

View File

@ -12,6 +12,9 @@ if ARCH_ARMV8 && ARCH_CPU_64BIT
config ARCH_HAVE_EFFICIENT_UNALIGNED_ACCESS
bool
default y
config ARCH_USING_GENERIC_CPUID
bool "Using generic cpuid implemenation"
default n
endmenu
endif

View File

@ -44,7 +44,11 @@ int rt_hw_cpu_id(void)
.weak rt_hw_cpu_id
.type rt_hw_cpu_id, @function
rt_hw_cpu_id:
#if RT_CPUS_NR > 1
mrs x0, tpidr_el1
#else
mov x0, xzr
#endif
ret
/*

View File

@ -8,6 +8,7 @@
* 2011-09-15 Bernard first version
* 2019-07-28 zdzn add smp support
* 2023-02-21 GuEe-GUI mov cpu ofw init to setup
* 2024-04-29 Shell Add generic ticket spinlock using C11 atomic
*/
#include <rthw.h>
@ -55,65 +56,101 @@ rt_weak rt_uint64_t rt_cpu_mpidr_early[] =
};
#endif /* RT_USING_SMART */
static inline void arch_spin_lock(arch_spinlock_t *lock)
{
unsigned int tmp;
/* in support of C11 atomic */
#if __STDC_VERSION__ >= 201112L
#include <stdatomic.h>
asm volatile(
" sevl\n"
"1: wfe\n"
"2: ldaxr %w0, %1\n"
" cbnz %w0, 1b\n"
" stxr %w0, %w2, %1\n"
" cbnz %w0, 2b\n"
: "=&r" (tmp), "+Q" (lock->lock)
: "r" (1)
: "cc", "memory");
union _spinlock
{
_Atomic(rt_uint32_t) _value;
struct
{
_Atomic(rt_uint16_t) owner;
_Atomic(rt_uint16_t) next;
} ticket;
};
void rt_hw_spin_lock_init(rt_hw_spinlock_t *_lock)
{
union _spinlock *lock = (void *)_lock;
/**
* just a note that this is an atomic operation, though on arm64 it always
* is, even without using the atomic API
*/
atomic_store_explicit(&lock->_value, 0, memory_order_relaxed);
}
static inline int arch_spin_trylock(arch_spinlock_t *lock)
rt_bool_t rt_hw_spin_trylock(rt_hw_spinlock_t *_lock)
{
unsigned int tmp;
rt_bool_t rc;
rt_uint32_t readonce;
union _spinlock temp;
union _spinlock *lock = (void *)_lock;
asm volatile(
" ldaxr %w0, %1\n"
" cbnz %w0, 1f\n"
" stxr %w0, %w2, %1\n"
"1:\n"
: "=&r" (tmp), "+Q" (lock->lock)
: "r" (1)
: "cc", "memory");
readonce = atomic_load_explicit(&lock->_value, memory_order_acquire);
temp._value = readonce;
return !tmp;
if (temp.ticket.owner != temp.ticket.next)
{
rc = RT_FALSE;
}
else
{
temp.ticket.next += 1;
rc = atomic_compare_exchange_strong_explicit(
&lock->_value, &readonce, temp._value,
memory_order_acquire, memory_order_relaxed);
}
return rc;
}
static inline void arch_spin_unlock(arch_spinlock_t *lock)
rt_inline rt_base_t _load_acq_exclusive(_Atomic(rt_uint16_t) *halfword)
{
asm volatile(
" stlr %w1, %0\n"
: "=Q" (lock->lock) : "r" (0) : "memory");
rt_uint32_t old;
__asm__ volatile("ldaxrh %w0, [%1]"
: "=&r"(old)
: "r"(halfword)
: "memory");
return old;
}
void rt_hw_spin_lock_init(arch_spinlock_t *lock)
rt_inline void _send_event_local(void)
{
lock->lock = 0;
__asm__ volatile("sevl");
}
void rt_hw_spin_lock(rt_hw_spinlock_t *lock)
rt_inline void _wait_for_event(void)
{
arch_spin_lock(lock);
__asm__ volatile("wfe" ::: "memory");
}
void rt_hw_spin_unlock(rt_hw_spinlock_t *lock)
void rt_hw_spin_lock(rt_hw_spinlock_t *_lock)
{
arch_spin_unlock(lock);
union _spinlock *lock = (void *)_lock;
rt_uint16_t ticket =
atomic_fetch_add_explicit(&lock->ticket.next, 1, memory_order_relaxed);
if (atomic_load_explicit(&lock->ticket.owner, memory_order_acquire) !=
ticket)
{
_send_event_local();
do
{
_wait_for_event();
}
while (_load_acq_exclusive(&lock->ticket.owner) != ticket);
}
}
rt_bool_t rt_hw_spin_trylock(rt_hw_spinlock_t *lock)
void rt_hw_spin_unlock(rt_hw_spinlock_t *_lock)
{
return arch_spin_trylock(lock);
union _spinlock *lock = (void *)_lock;
atomic_fetch_add_explicit(&lock->ticket.owner, 1, memory_order_release);
}
#endif
static int _cpus_init_data_hardcoded(int num_cpus, rt_uint64_t *cpu_hw_ids, struct cpu_ops_t *cpu_ops[])
{
// load in cpu_hw_ids in cpuid_to_hwid,
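
As a usage note, the functions above keep the existing rt_hw_spin_* API, so
callers do not change; a rough sketch of typical use (the demo_* names are
illustrative, not from this patch):

#include <rthw.h>

static rt_hw_spinlock_t demo_lock;     /* illustrative lock instance */
static rt_uint32_t demo_shared;        /* data the lock protects */

void demo_init(void)
{
    rt_hw_spin_lock_init(&demo_lock);  /* owner = next = 0 */
}

void demo_update(void)
{
    rt_hw_spin_lock(&demo_lock);       /* waiters are served in ticket order */
    demo_shared++;
    rt_hw_spin_unlock(&demo_lock);     /* hand over to the next ticket */
}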

View File

@ -1,10 +1,11 @@
/*
* Copyright (c) 2006-2020, RT-Thread Development Team
* Copyright (c) 2006-2024, RT-Thread Development Team
*
* SPDX-License-Identifier: Apache-2.0
*
* Date Author Notes
* 2018-10-06 ZhaoXiaowei the first version
* 2024-04-28 Shell add generic spinlock implementation
*/
.text
@ -80,7 +81,7 @@ rt_hw_set_elx_env:
0:
RET
.global rt_cpu_vector_set_base
.globl rt_cpu_vector_set_base
rt_cpu_vector_set_base:
MSR VBAR_EL1,X0
RET
@ -89,7 +90,7 @@ rt_cpu_vector_set_base:
/**
* unsigned long rt_hw_ffz(unsigned long x)
*/
.global rt_hw_ffz
.globl rt_hw_ffz
rt_hw_ffz:
mvn x1, x0
clz x0, x1
@ -97,7 +98,80 @@ rt_hw_ffz:
sub x0, x1, x0
ret
.global rt_hw_clz
.globl rt_hw_clz
rt_hw_clz:
clz x0, x0
ret
/**
* Spinlock (fallback implementation)
*/
rt_hw_spin_lock_init:
.weak rt_hw_spin_lock_init
stlr wzr, [x0]
ret
rt_hw_spin_trylock:
.weak rt_hw_spin_trylock
sub sp, sp, #16
ldar w2, [x0]
add x1, sp, 8
stlr w2, [x1]
ldarh w1, [x1]
and w1, w1, 65535
add x3, sp, 10
ldarh w3, [x3]
cmp w1, w3, uxth
beq 1f
mov w0, 0
add sp, sp, 16
ret
1:
add x1, sp, 10
2:
ldaxrh w3, [x1]
add w3, w3, 1
stlxrh w4, w3, [x1]
cbnz w4, 2b
add x1, sp, 8
ldar w1, [x1]
3:
ldaxr w3, [x0]
cmp w3, w2
bne 4f
stxr w4, w1, [x0]
cbnz w4, 3b
4:
cset w0, eq
add sp, sp, 16
ret
rt_hw_spin_lock:
.weak rt_hw_spin_lock
add x1, x0, 2
1:
ldxrh w2, [x1]
add w3, w2, 1
stxrh w4, w3, [x1]
cbnz w4, 1b
and w2, w2, 65535
ldarh w1, [x0]
cmp w2, w1, uxth
beq 3f
sevl
2:
wfe
ldaxrh w1, [x0]
cmp w2, w1
bne 2b
3:
ret
rt_hw_spin_unlock:
.weak rt_hw_spin_unlock
ldxrh w1, [x0]
add w1, w1, 1
stlxrh w2, w1, [x0]
cbnz w2, rt_hw_spin_unlock
ret
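
These weak assembly routines are the fallback when the toolchain has no C11
atomics; they encode the same ticket protocol, with the owner halfword at
offset 0 and the next halfword at offset 2 of the 32-bit lock word on
little-endian AArch64 (hence the add x1, x0, 2 in rt_hw_spin_lock). A
compile-time sketch of that layout assumption (illustrative, not part of the
patch):

/* Sketch only: mirrors union _spinlock from cpu.c to document the layout
 * the assembly fallback relies on. */
#include <stddef.h>
#include <stdint.h>

union spinlock_layout
{
    uint32_t value;
    struct { uint16_t owner; uint16_t next; } ticket;
};

_Static_assert(sizeof(union spinlock_layout) == 4, "lock word is 32-bit");
_Static_assert(offsetof(union spinlock_layout, ticket.next) == 2, "next halfword at offset 2");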

View File

@ -17,10 +17,42 @@
#include <rtdef.h>
#ifdef RT_USING_SMP
typedef struct {
volatile unsigned int lock;
/**
* Spinlock
*/
typedef struct
{
rt_uint32_t value;
} rt_hw_spinlock_t;
#endif
/**
* Generic hw-cpu-id
*/
#ifdef ARCH_USING_GENERIC_CPUID
#if RT_CPUS_NR > 1
rt_inline int rt_hw_cpu_id(void)
{
long cpuid;
__asm__ volatile("mrs %0, tpidr_el1":"=r"(cpuid));
return cpuid;
}
#else
rt_inline int rt_hw_cpu_id(void)
{
return 0;
}
#endif /* RT_CPUS_NR > 1 */
#endif /* ARCH_USING_GENERIC_CPUID */
#endif /* RT_USING_SMP */
#define rt_hw_barrier(cmd, ...) \
__asm__ volatile (RT_STRINGIFY(cmd) " "RT_STRINGIFY(__VA_ARGS__):::"memory")
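
For reference, the variadic macro above stringifies its arguments into the
instruction text (assuming RT_STRINGIFY is the usual stringify helper from
rtdef.h); an illustrative use:

/* Illustrative use of the rt_hw_barrier() macro defined above; assumes this
 * translation unit already pulls in the AArch64 cpuport.h (e.g. via rthw.h). */
#include <rthw.h>

void demo_barriers(void)
{
    rt_hw_barrier(dmb, ish);  /* -> __asm__ volatile ("dmb" " " "ish" ::: "memory") */
    rt_hw_barrier(dsb, sy);   /* -> __asm__ volatile ("dsb" " " "sy" ::: "memory") */
}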

View File

@ -146,7 +146,7 @@ rt_base_t rt_cpus_lock(void)
pcpu = rt_cpu_self();
if (pcpu->current_thread != RT_NULL)
{
register rt_ubase_t lock_nest = rt_atomic_load(&(pcpu->current_thread->cpus_lock_nest));
rt_ubase_t lock_nest = rt_atomic_load(&(pcpu->current_thread->cpus_lock_nest));
rt_atomic_add(&(pcpu->current_thread->cpus_lock_nest), 1);
if (lock_nest == 0)

View File

@ -1089,6 +1089,7 @@ void rt_exit_critical_safe(rt_base_t critical_level)
void rt_exit_critical_safe(rt_base_t critical_level)
{
RT_UNUSED(critical_level);
return rt_exit_critical();
}

View File

@ -363,7 +363,8 @@ rt_thread_t rt_thread_self(void)
self = rt_cpu_self()->current_thread;
rt_hw_local_irq_enable(lock);
return self;
#else
#else /* !RT_USING_SMP */
extern rt_thread_t rt_current_thread;
return rt_current_thread;