[libcpu/arm64] add C11 atomic ticket spinlock (#8882)

* [libcpu/arm64] add C11 atomic ticket spinlock

Replace the former flag-based spinlock implementation, which is unfair.

Besides, the C11 atomic implementation is more readable (it is plain C, after all)
and more maintainable, because the toolchain can apply its built-in optimizations
and tune for different micro-architectures. For example, Armv8.5 introduces a
better-suited instruction, and the compiler can take advantage of it when it
knows that the target platform supports it.

Signed-off-by: Shell <smokewood@qq.com>

* fixup: RT_CPUS_NR

---------

Signed-off-by: Shell <smokewood@qq.com>
Shell, 2024-05-16 15:45:12 +08:00, committed by GitHub
parent e46333496f
commit e25fc8b511
GPG Key ID: B5690EEEBB952194
9 changed files with 257 additions and 103 deletions
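
For context, the fairness argument in the commit message comes from the ticket discipline: each acquirer takes a monotonically increasing ticket, and the lock is granted strictly in ticket order. Below is a minimal, self-contained sketch of that idea in portable C11. It is illustrative only and is not the RT-Thread code in this commit (which additionally parks waiting cores with WFE/SEV); all names here are made up for the example.

    #include <stdatomic.h>
    #include <stdint.h>

    /* Minimal C11 ticket lock sketch: FIFO hand-off, no architecture-specific
     * wait instructions. Names are illustrative, not RT-Thread APIs. */
    typedef struct
    {
        _Atomic uint16_t next;   /* ticket handed to the next arriving CPU    */
        _Atomic uint16_t owner;  /* ticket currently allowed to hold the lock */
    } ticket_lock_t;

    static void ticket_lock(ticket_lock_t *l)
    {
        /* grab a ticket; relaxed is enough, the acquire happens on the owner load */
        uint16_t t = atomic_fetch_add_explicit(&l->next, 1, memory_order_relaxed);
        while (atomic_load_explicit(&l->owner, memory_order_acquire) != t)
            ;                    /* spin until our number is called */
    }

    static void ticket_unlock(ticket_lock_t *l)
    {
        /* publish the critical section, then call the next ticket */
        atomic_fetch_add_explicit(&l->owner, 1, memory_order_release);
    }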

View File

@@ -29,7 +29,7 @@
 #error the thread priority should at least be greater than idle
 #endif

-static rt_atomic_t _star_counter = 1;
+static rt_atomic_t _star_counter;
 static struct rt_semaphore _thr_exit_sem;
 static struct rt_semaphore _level_waiting[TEST_LEVEL_COUNTS];
 static rt_thread_t _thread_matrix[TEST_LEVEL_COUNTS][KERN_TEST_CONCURRENT_THREADS];

@@ -157,6 +157,8 @@ static void scheduler_tc(void)
 static rt_err_t utest_tc_init(void)
 {
     LOG_I("Setup environment...");
+    _star_counter = 1;
+    rt_memset(_load_average, 0, sizeof(_load_average));
     rt_sem_init(&_thr_exit_sem, "test", 0, RT_IPC_FLAG_PRIO);
     for (size_t i = 0; i < TEST_LEVEL_COUNTS; i++)

View File

@@ -12,6 +12,9 @@ if ARCH_ARMV8 && ARCH_CPU_64BIT
     config ARCH_HAVE_EFFICIENT_UNALIGNED_ACCESS
         bool
        default y
+
+    config ARCH_USING_GENERIC_CPUID
+        bool "Using generic cpuid implementation"
+        default n
 endmenu
 endif
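
For readers unfamiliar with the RT-Thread Kconfig flow: selecting this option is what makes the inline C rt_hw_cpu_id() added to cpuport.h (further down in this commit) take effect instead of the weak assembly routine. A sketch of how the enabled option would typically surface in the generated configuration follows; it is illustrative only, and the exact prefixes and file names depend on the board's build setup.

    # .config fragment (illustrative)
    CONFIG_ARCH_USING_GENERIC_CPUID=y

    /* rtconfig.h fragment (illustrative) */
    #define ARCH_USING_GENERIC_CPUID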

View File

@@ -44,7 +44,11 @@ int rt_hw_cpu_id(void)
 .weak rt_hw_cpu_id
 .type rt_hw_cpu_id, @function
 rt_hw_cpu_id:
+#if RT_CPUS_NR > 1
     mrs x0, tpidr_el1
+#else
+    mov x0, xzr
+#endif
     ret

 /*

View File

@@ -8,6 +8,7 @@
  * 2011-09-15     Bernard      first version
  * 2019-07-28     zdzn         add smp support
  * 2023-02-21     GuEe-GUI     mov cpu ofw init to setup
+ * 2024-04-29     Shell        Add generic ticket spinlock using C11 atomic
  */

 #include <rthw.h>

@@ -55,65 +56,101 @@ rt_weak rt_uint64_t rt_cpu_mpidr_early[] =
 };
 #endif /* RT_USING_SMART */

-static inline void arch_spin_lock(arch_spinlock_t *lock)
-{
-    unsigned int tmp;
-
-    asm volatile(
-        "   sevl\n"
-        "1: wfe\n"
-        "2: ldaxr   %w0, %1\n"
-        "   cbnz    %w0, 1b\n"
-        "   stxr    %w0, %w2, %1\n"
-        "   cbnz    %w0, 2b\n"
-        : "=&r" (tmp), "+Q" (lock->lock)
-        : "r" (1)
-        : "cc", "memory");
-}
-
-static inline int arch_spin_trylock(arch_spinlock_t *lock)
-{
-    unsigned int tmp;
-
-    asm volatile(
-        "   ldaxr   %w0, %1\n"
-        "   cbnz    %w0, 1f\n"
-        "   stxr    %w0, %w2, %1\n"
-        "1:\n"
-        : "=&r" (tmp), "+Q" (lock->lock)
-        : "r" (1)
-        : "cc", "memory");
-
-    return !tmp;
-}
-
-static inline void arch_spin_unlock(arch_spinlock_t *lock)
-{
-    asm volatile(
-        "   stlr    %w1, %0\n"
-        : "=Q" (lock->lock) : "r" (0) : "memory");
-}
-
-void rt_hw_spin_lock_init(arch_spinlock_t *lock)
-{
-    lock->lock = 0;
-}
-
-void rt_hw_spin_lock(rt_hw_spinlock_t *lock)
-{
-    arch_spin_lock(lock);
-}
-
-void rt_hw_spin_unlock(rt_hw_spinlock_t *lock)
-{
-    arch_spin_unlock(lock);
-}
-
-rt_bool_t rt_hw_spin_trylock(rt_hw_spinlock_t *lock)
-{
-    return arch_spin_trylock(lock);
-}
+/* in support of C11 atomic */
+#if __STDC_VERSION__ >= 201112L
+#include <stdatomic.h>
+
+union _spinlock
+{
+    _Atomic(rt_uint32_t) _value;
+    struct
+    {
+        _Atomic(rt_uint16_t) owner;
+        _Atomic(rt_uint16_t) next;
+    } ticket;
+};
+
+void rt_hw_spin_lock_init(rt_hw_spinlock_t *_lock)
+{
+    union _spinlock *lock = (void *)_lock;
+
+    /**
+     * just a dummy note that this is an atomic operation, though it always is
+     * even without usage of atomic API in arm64
+     */
+    atomic_store_explicit(&lock->_value, 0, memory_order_relaxed);
+}
+
+rt_bool_t rt_hw_spin_trylock(rt_hw_spinlock_t *_lock)
+{
+    rt_bool_t rc;
+    rt_uint32_t readonce;
+    union _spinlock temp;
+    union _spinlock *lock = (void *)_lock;
+
+    readonce = atomic_load_explicit(&lock->_value, memory_order_acquire);
+    temp._value = readonce;
+
+    if (temp.ticket.owner != temp.ticket.next)
+    {
+        rc = RT_FALSE;
+    }
+    else
+    {
+        temp.ticket.next += 1;
+        rc = atomic_compare_exchange_strong_explicit(
+            &lock->_value, &readonce, temp._value,
+            memory_order_acquire, memory_order_relaxed);
+    }
+    return rc;
+}
+
+rt_inline rt_base_t _load_acq_exclusive(_Atomic(rt_uint16_t) *halfword)
+{
+    rt_uint32_t old;
+    __asm__ volatile("ldaxrh %w0, [%1]"
+                     : "=&r"(old)
+                     : "r"(halfword)
+                     : "memory");
+    return old;
+}
+
+rt_inline void _send_event_local(void)
+{
+    __asm__ volatile("sevl");
+}
+
+rt_inline void _wait_for_event(void)
+{
+    __asm__ volatile("wfe" ::: "memory");
+}
+
+void rt_hw_spin_lock(rt_hw_spinlock_t *_lock)
+{
+    union _spinlock *lock = (void *)_lock;
+    rt_uint16_t ticket =
+        atomic_fetch_add_explicit(&lock->ticket.next, 1, memory_order_relaxed);
+
+    if (atomic_load_explicit(&lock->ticket.owner, memory_order_acquire) !=
+        ticket)
+    {
+        _send_event_local();
+        do
+        {
+            _wait_for_event();
+        }
+        while (_load_acq_exclusive(&lock->ticket.owner) != ticket);
+    }
+}
+
+void rt_hw_spin_unlock(rt_hw_spinlock_t *_lock)
+{
+    union _spinlock *lock = (void *)_lock;
+    atomic_fetch_add_explicit(&lock->ticket.owner, 1, memory_order_release);
+}
+
+#endif

 static int _cpus_init_data_hardcoded(int num_cpus, rt_uint64_t *cpu_hw_ids, struct cpu_ops_t *cpu_ops[])
 {
     // load in cpu_hw_ids in cpuid_to_hwid,
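
A short usage sketch of the API defined above (hypothetical caller, not part of the patch). Note the ordering contract visible in the new code: the ticket is taken with a relaxed fetch_add, acquire semantics come from the load of ticket.owner, and the unlock publishes the critical section with a release increment.

    /* Hypothetical caller built only on the functions defined above. */
    static rt_hw_spinlock_t _demo_lock;
    static rt_uint32_t _shared_counter;

    static void demo_init(void)
    {
        rt_hw_spin_lock_init(&_demo_lock);    /* owner == next == 0 */
    }

    static void demo_cpu_worker(void)
    {
        if (!rt_hw_spin_trylock(&_demo_lock)) /* RT_FALSE when contended */
        {
            rt_hw_spin_lock(&_demo_lock);     /* queue up in FIFO ticket order */
        }
        _shared_counter++;                    /* critical section */
        rt_hw_spin_unlock(&_demo_lock);       /* release: hand off to next ticket */
    }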

View File

@@ -1,103 +1,177 @@
 /*
- * Copyright (c) 2006-2020, RT-Thread Development Team
+ * Copyright (c) 2006-2024, RT-Thread Development Team
  *
  * SPDX-License-Identifier: Apache-2.0
  *
  * Date           Author       Notes
  * 2018-10-06     ZhaoXiaowei  the first version
+ * 2024-04-28     Shell        add generic spinlock implementation
  */

 .text
 .globl rt_hw_get_current_el
 rt_hw_get_current_el:
     MRS X0, CurrentEL
     CMP X0, 0xc
     B.EQ 3f
     CMP X0, 0x8
     B.EQ 2f
     CMP X0, 0x4
     B.EQ 1f
     LDR X0, =0
     B 0f
 3:
     LDR X0, =3
     B 0f
 2:
     LDR X0, =2
     B 0f
 1:
     LDR X0, =1
     B 0f
 0:
     RET

 .globl rt_hw_set_current_vbar
 rt_hw_set_current_vbar:
     MRS X1, CurrentEL
     CMP X1, 0xc
     B.EQ 3f
     CMP X1, 0x8
     B.EQ 2f
     CMP X1, 0x4
     B.EQ 1f
     B 0f
 3:
     MSR VBAR_EL3,X0
     B 0f
 2:
     MSR VBAR_EL2,X0
     B 0f
 1:
     MSR VBAR_EL1,X0
     B 0f
 0:
     RET

 .globl rt_hw_set_elx_env
 rt_hw_set_elx_env:
     MRS X1, CurrentEL
     CMP X1, 0xc
     B.EQ 3f
     CMP X1, 0x8
     B.EQ 2f
     CMP X1, 0x4
     B.EQ 1f
     B 0f
 3:
     MRS X0, SCR_EL3
     ORR X0, X0, #0xF    /* SCR_EL3.NS|IRQ|FIQ|EA */
     MSR SCR_EL3, X0
     B 0f
 2:
     MRS X0, HCR_EL2
     ORR X0, X0, #0x38
     MSR HCR_EL2, X0
     B 0f
 1:
     B 0f
 0:
     RET

-.global rt_cpu_vector_set_base
+.globl rt_cpu_vector_set_base
 rt_cpu_vector_set_base:
     MSR VBAR_EL1,X0
     RET

 /**
  * unsigned long rt_hw_ffz(unsigned long x)
  */
-.global rt_hw_ffz
+.globl rt_hw_ffz
 rt_hw_ffz:
     mvn x1, x0
     clz x0, x1
     mov x1, #0x3f
     sub x0, x1, x0
     ret

-.global rt_hw_clz
+.globl rt_hw_clz
 rt_hw_clz:
     clz x0, x0
     ret
+
+/**
+ * Spinlock (fallback implementation)
+ */
+rt_hw_spin_lock_init:
+.weak rt_hw_spin_lock_init
+    stlr wzr, [x0]
+    ret
+
+rt_hw_spin_trylock:
+.weak rt_hw_spin_trylock
+    sub sp, sp, #16
+    ldar w2, [x0]
+    add x1, sp, 8
+    stlr w2, [x1]
+    ldarh w1, [x1]
+    and w1, w1, 65535
+    add x3, sp, 10
+    ldarh w3, [x3]
+    cmp w1, w3, uxth
+    beq 1f
+    mov w0, 0
+    add sp, sp, 16
+    ret
+1:
+    add x1, sp, 10
+2:
+    ldaxrh w3, [x1]
+    add w3, w3, 1
+    stlxrh w4, w3, [x1]
+    cbnz w4, 2b
+    add x1, sp, 8
+    ldar w1, [x1]
+3:
+    ldaxr w3, [x0]
+    cmp w3, w2
+    bne 4f
+    stxr w4, w1, [x0]
+    cbnz w4, 3b
+4:
+    cset w0, eq
+    add sp, sp, 16
+    ret
+
+rt_hw_spin_lock:
+.weak rt_hw_spin_lock
+    add x1, x0, 2
+1:
+    ldxrh w2, [x1]
+    add w3, w2, 1
+    stxrh w4, w3, [x1]
+    cbnz w4, 1b
+    and w2, w2, 65535
+    ldarh w1, [x0]
+    cmp w2, w1, uxth
+    beq 3f
+    sevl
+2:
+    wfe
+    ldaxrh w1, [x0]
+    cmp w2, w1
+    bne 2b
+3:
+    ret
+
+rt_hw_spin_unlock:
+.weak rt_hw_spin_unlock
+    ldxrh w1, [x0]
+    add w1, w1, 1
+    stlxrh w2, w1, [x0]
+    cbnz w2, rt_hw_spin_unlock
+    ret
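
A note on how the two implementations coexist (an inference from the .weak directives above, not stated in the patch): the assembly routines are weak symbols, so when cpu.c is built with a C11 toolchain its strong definitions take precedence at link time, and the assembly fallback only becomes the effective implementation on pre-C11 toolchains. The mechanism, reduced to a generic two-file sketch with made-up names:

    /* fallback.c -- weak default, used when no strong definition is linked */
    __attribute__((weak)) int backend(void) { return 0; }

    /* optimized.c -- strong definition; if linked, it overrides the weak one */
    int backend(void) { return 1; }

    /* main.c */
    #include <stdio.h>
    int backend(void);
    int main(void) { printf("backend = %d\n", backend()); return 0; } /* prints 1 */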

View File

@@ -17,10 +17,42 @@
 #include <rtdef.h>

 #ifdef RT_USING_SMP
-typedef struct {
-    volatile unsigned int lock;
+
+/**
+ * Spinlock
+ */
+typedef struct
+{
+    rt_uint32_t value;
 } rt_hw_spinlock_t;
-#endif
+
+/**
+ * Generic hw-cpu-id
+ */
+#ifdef ARCH_USING_GENERIC_CPUID
+
+#if RT_CPUS_NR > 1
+
+rt_inline int rt_hw_cpu_id(void)
+{
+    long cpuid;
+    __asm__ volatile("mrs %0, tpidr_el1":"=r"(cpuid));
+    return cpuid;
+}
+
+#else
+
+rt_inline int rt_hw_cpu_id(void)
+{
+    return 0;
+}
+
+#endif /* RT_CPUS_NR > 1 */
+
+#endif /* ARCH_USING_GENERIC_CPUID */
+
+#endif /* RT_USING_SMP */

 #define rt_hw_barrier(cmd, ...) \
     __asm__ volatile (RT_STRINGIFY(cmd) " "RT_STRINGIFY(__VA_ARGS__):::"memory")
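
Both the weak assembly routine and this generic C version read the CPU index from TPIDR_EL1, so each core is expected to have stashed its logical id there during early bring-up. The patch does not show that code; the sketch below is a hypothetical illustration of the convention, not RT-Thread's actual boot path.

    /* Hypothetical secondary-core bring-up step: store the logical CPU index
     * in TPIDR_EL1 so rt_hw_cpu_id() can read it back cheaply later. */
    static inline void cpu_id_register_init(unsigned long cpuid)
    {
        __asm__ volatile("msr tpidr_el1, %0" :: "r"(cpuid));
    }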

View File

@@ -146,7 +146,7 @@ rt_base_t rt_cpus_lock(void)
     pcpu = rt_cpu_self();
     if (pcpu->current_thread != RT_NULL)
     {
-        register rt_ubase_t lock_nest = rt_atomic_load(&(pcpu->current_thread->cpus_lock_nest));
+        rt_ubase_t lock_nest = rt_atomic_load(&(pcpu->current_thread->cpus_lock_nest));
         rt_atomic_add(&(pcpu->current_thread->cpus_lock_nest), 1);
         if (lock_nest == 0)

View File

@@ -1089,6 +1089,7 @@ void rt_exit_critical_safe(rt_base_t critical_level)
 void rt_exit_critical_safe(rt_base_t critical_level)
 {
+    RT_UNUSED(critical_level);
     return rt_exit_critical();
 }

View File

@@ -363,7 +363,8 @@ rt_thread_t rt_thread_self(void)
     self = rt_cpu_self()->current_thread;
     rt_hw_local_irq_enable(lock);
     return self;
-#else
+
+#else /* !RT_USING_SMP */
     extern rt_thread_t rt_current_thread;
     return rt_current_thread;