diff --git a/components/drivers/pic/pic-gic-common.c b/components/drivers/pic/pic-gic-common.c
index 07e5f0b18d..5c6dc030cf 100644
--- a/components/drivers/pic/pic-gic-common.c
+++ b/components/drivers/pic/pic-gic-common.c
@@ -71,8 +71,9 @@ void gic_common_sgi_config(void *base, void *data, int irq_base)
         pirq = rt_pic_find_ipi(data, ipi);          \
         pirq->mode = RT_IRQ_MODE_EDGE_RISING;       \
 
-    DECLARE_GIC_IPI(RT_SCHEDULE_IPI, 0);
-    DECLARE_GIC_IPI(RT_STOP_IPI, 1);
+    DECLARE_GIC_IPI(RT_SCHEDULE_IPI, RT_SCHEDULE_IPI);
+    DECLARE_GIC_IPI(RT_STOP_IPI, RT_STOP_IPI);
+    DECLARE_GIC_IPI(RT_SMP_CALL_IPI, RT_SMP_CALL_IPI);
 
 #undef DECLARE_GIC_IPI
 }
diff --git a/components/drivers/pic/pic.c b/components/drivers/pic/pic.c
index 9f8f6f3cf7..eebe3fd6fe 100644
--- a/components/drivers/pic/pic.c
+++ b/components/drivers/pic/pic.c
@@ -31,6 +31,7 @@ static int _ipi_hash[] =
 #ifdef RT_USING_SMP
     [RT_SCHEDULE_IPI] = RT_SCHEDULE_IPI,
     [RT_STOP_IPI] = RT_STOP_IPI,
+    [RT_SMP_CALL_IPI] = RT_SMP_CALL_IPI,
 #endif
 };
 
diff --git a/components/drivers/smp/smp.c b/components/drivers/smp/smp.c
index da098717e4..fe37d2069d 100644
--- a/components/drivers/smp/smp.c
+++ b/components/drivers/smp/smp.c
@@ -6,6 +6,8 @@
  * Change Logs:
  * Date           Author       Notes
  * 2024/9/12      zhujiale     the first version
+ * 2024/10/24     Shell        added non-blocking IPI calling method;
+ *                             fixup data racing
  */
 
 #include "smp.h"
@@ -14,37 +16,299 @@
 #define DBG_LVL DBG_INFO
 #include <rtdbg.h>
 
-static struct rt_smp_call rt_smp_work[RT_CPUS_NR];
-static rt_atomic_t rt_smp_wait;
-
-static rt_err_t smp_call_handler(struct rt_smp_event *event)
+static struct smp_data
 {
-    switch (event->event_id)
+    /* call request data for each core */
+    struct rt_smp_call_req call_req_cores[RT_CPUS_NR];
+
+    /* call queue of this core */
+    rt_ll_slist_t call_queue;
+} _smp_data_cores[RT_CPUS_NR];
+
+#define _CALL_REQ_USAGE_FREED 0
+#define _CALL_REQ_USAGE_BUSY 1
+static void _call_req_take(struct rt_smp_call_req *req)
+{
+    rt_base_t exp;
+    do
     {
-    case SMP_CALL_EVENT_FUNC:
-        event->func(event->data);
-        rt_atomic_add(&rt_smp_wait, 1);
+        exp = _CALL_REQ_USAGE_FREED;
+    }
+    while (!rt_atomic_compare_exchange_strong(&req->event.typed.usage_tracer, &exp, _CALL_REQ_USAGE_BUSY));
+}
+
+static void _call_req_release(struct rt_smp_call_req *req)
+{
+    rt_atomic_store(&req->event.typed.usage_tracer, _CALL_REQ_USAGE_FREED);
+}
+
+void rt_smp_request_wait_freed(struct rt_smp_call_req *req)
+{
+    rt_base_t usage_tracer;
+
+    RT_DEBUG_IN_THREAD_CONTEXT;
+
+    usage_tracer = rt_atomic_load(&req->event.typed.usage_tracer);
+    while (usage_tracer != _CALL_REQ_USAGE_FREED)
+    {
+        rt_thread_yield();
+        usage_tracer = rt_atomic_load(&req->event.typed.usage_tracer);
+    }
+}
+
+static void _mask_out_cpu(struct rt_smp_event *event, int oncpu)
+{
+    rt_base_t new_mask, old_mask;
+    rt_atomic_t *maskp = event->typed.calling_cpu_mask;
+    do
+    {
+        old_mask = rt_atomic_load(maskp);
+        new_mask = old_mask & ~(1ul << oncpu);
+    } while (!rt_atomic_compare_exchange_strong(maskp, &old_mask, new_mask));
+}
+
+static void _do_glob_request(struct rt_smp_call_req *req_global,
+                             struct rt_smp_call_req *req_local)
+{
+    struct rt_smp_event *event;
+
+    /* release the global request data */
+    rt_memcpy(req_local, req_global, sizeof(struct rt_smp_call_req));
+    rt_hw_spin_unlock(&req_global->freed_lock);
+
+    event = &req_local->event;
+    RT_ASSERT(!!event->func);
+    event->func(event->data);
+
+    return ;
+}
+
+static void _do_request(struct rt_smp_call_req *req)
+{
+    struct rt_smp_event *event;
+
+    event = &req->event;
+    RT_ASSERT(!!event->func);
+    event->func(event->data);
+
+    _call_req_release(req);
+    return ;
+}
+
+static rt_err_t _smp_call_handler(struct rt_smp_call_req *req, int oncpu)
+{
+    switch (req->event.event_id)
+    {
+    case SMP_CALL_EVENT_GLOB_SYNC:
+    {
+        struct rt_smp_call_req req_local;
+        _do_glob_request(req, &req_local);
+        _mask_out_cpu(&req_local.event, oncpu);
         break;
+    }
+    case SMP_CALL_EVENT_GLOB_ASYNC:
+    {
+        struct rt_smp_call_req req_local;
+        _do_glob_request(req, &req_local);
+        break;
+    }
+    case SMP_CALL_EVENT_REQUEST:
+    {
+        _do_request(req);
+        break;
+    }
     default:
         LOG_E("error event id\n");
         return -RT_ERROR;
     }
     return RT_EOK;
 }
+
 void rt_smp_call_ipi_handler(int vector, void *param)
 {
-    int cur_cpu = rt_hw_cpu_id();
+    int oncpu = rt_hw_cpu_id();
+    struct rt_smp_call_req *request;
 
-    rt_spin_lock(&rt_smp_work[cur_cpu].lock);
-    if (rt_smp_work[cur_cpu].event.event_id)
+    RT_ASSERT(rt_interrupt_get_nest());
+
+    while (1)
     {
-        if (smp_call_handler(&rt_smp_work[cur_cpu].event) != RT_EOK)
+        rt_ll_slist_t *node = rt_ll_slist_dequeue(&_smp_data_cores[oncpu].call_queue);
+        if (node)
         {
-            LOG_E("Have no event\n");
+            request = rt_list_entry(node, struct rt_smp_call_req, slist_node);
+
+            _smp_call_handler(request, oncpu);
+        }
+        else
+        {
+            break;
         }
-        rt_memset(&rt_smp_work[cur_cpu].event, 0, sizeof(struct rt_smp_event));
     }
-    rt_spin_unlock(&rt_smp_work[cur_cpu].lock);
+}
+
+static void _smp_call_remote_request(int callcpu, rt_smp_call_cb_t func,
+                                     void *data, rt_uint8_t flags,
+                                     struct rt_smp_call_req *call_req)
+{
+    rt_base_t cpu_mask = 1ul << callcpu;
+
+    _call_req_take(call_req);
+
+    rt_ll_slist_enqueue(&_smp_data_cores[callcpu].call_queue, &call_req->slist_node);
+
+    rt_hw_ipi_send(RT_SMP_CALL_IPI, cpu_mask);
+}
+
+/**
+ * @brief SMP call request with a user-provided @call_req. Compared to the
+ *        rt_smp_call_func* family, this one can be called from an ISR or an
+ *        IRQ-masked environment.
+ *
+ * @param callcpu the logical core id of the target
+ * @param flags control flags of your request
+ * @param call_req the pre-initialized request data
+ * @return rt_err_t RT_EOK on success, otherwise an error code on failure
+ */
+rt_err_t rt_smp_call_request(int callcpu, rt_uint8_t flags, struct rt_smp_call_req *call_req)
+{
+    rt_ubase_t clvl;
+    int oncpu;
+
+    if (rt_atomic_load(&call_req->event.typed.usage_tracer) ==
+        _CALL_REQ_USAGE_BUSY)
+    {
+        return -RT_EBUSY;
+    }
+
+    if (flags & SMP_CALL_WAIT_ALL)
+    {
+        return -RT_EINVAL;
+    }
+
+    clvl = rt_enter_critical();
+    oncpu = rt_hw_cpu_id();
+
+    if (oncpu == callcpu && !(flags & SMP_CALL_NO_LOCAL))
+    {
+        rt_ubase_t level;
+
+        /* handle the call locally, in an IRQ-masked environment */
+        level = rt_hw_local_irq_disable();
+        call_req->event.func(call_req->event.data);
+        rt_hw_local_irq_enable(level);
+    }
+    else if (callcpu < RT_CPUS_NR)
+    {
+        _smp_call_remote_request(callcpu, call_req->event.func, call_req->event.data, flags, call_req);
+    }
+
+    rt_exit_critical_safe(clvl);
+
+    return RT_EOK;
+}
+
+void rt_smp_call_req_init(struct rt_smp_call_req *call_req,
+                          rt_smp_call_cb_t func, void *data)
+{
+    call_req->event.typed.usage_tracer = 0;
+    call_req->event.data = data;
+    call_req->event.func = func;
+    call_req->event.event_id = SMP_CALL_EVENT_REQUEST;
+}
+
+static void _smp_call_func_cond(int oncpu, rt_ubase_t cpu_mask,
+                                rt_smp_call_cb_t func, void *data,
+                                rt_uint8_t flags, rt_smp_cond_t cond)
+{
+    rt_ubase_t tmp_mask;
+    rt_bool_t sync_call = RT_FALSE;
+    rt_ubase_t oncpu_mask = 1 << oncpu;
+    rt_atomic_t calling_cpu_mask, *maskp;
+    int tmp_id = 0, rcpu_cnt = 0, event_id, call_local;
+
+    if (!(flags & SMP_CALL_NO_LOCAL) && (oncpu_mask & cpu_mask))
+    {
+        call_local = RT_TRUE;
+        cpu_mask = cpu_mask & (~oncpu_mask);
+    }
+    else
+    {
+        call_local = RT_FALSE;
+    }
+
+    if (cpu_mask)
+    {
+        tmp_mask = cpu_mask;
+
+        if (flags & SMP_CALL_WAIT_ALL)
+        {
+            sync_call = RT_TRUE;
+            maskp = &calling_cpu_mask;
+            event_id = SMP_CALL_EVENT_GLOB_SYNC;
+            rt_atomic_store(maskp, cpu_mask);
+        }
+        else
+        {
+            event_id = SMP_CALL_EVENT_GLOB_ASYNC;
+            maskp = RT_NULL;
+        }
+
+        while (tmp_mask)
+        {
+            struct rt_smp_call_req *call_req;
+            struct rt_smp_event *event;
+            int lz_bit = __rt_ffsl(tmp_mask);
+
+            tmp_id = lz_bit - 1;
+            tmp_mask &= ~(1ul << tmp_id);
+
+            if (cond && !cond(tmp_id, data))
+            {
+                cpu_mask &= ~(1ul << tmp_id);
+                continue;
+            }
+
+            /* need to wait one more */
+            rcpu_cnt++;
+
+            call_req = &_smp_data_cores[oncpu].call_req_cores[tmp_id];
+
+            /* be very careful here: spin to wait out any previous occupation */
+            rt_hw_spin_lock(&call_req->freed_lock);
+
+            event = &call_req->event;
+            event->event_id = event_id;
+            event->func = func;
+            event->data = data;
+            event->typed.calling_cpu_mask = maskp;
+
+            rt_ll_slist_enqueue(&_smp_data_cores[tmp_id].call_queue, &call_req->slist_node);
+        }
+
+        if (cpu_mask)
+        {
+            RT_ASSERT(rcpu_cnt);
+
+            rt_hw_ipi_send(RT_SMP_CALL_IPI, cpu_mask);
+        }
+    }
+
+    if (call_local && (!cond || cond(tmp_id, data)))
+    {
+        rt_ubase_t level;
+
+        /* callback on the local cpu, with interrupts masked as in an ISR */
+        level = rt_hw_local_irq_disable();
+        func(data);
+        rt_hw_local_irq_enable(level);
+    }
+
+    if (sync_call && rcpu_cnt)
+    {
+        while (rt_atomic_load(maskp) & cpu_mask)
+            ;
+    }
 }
 
 /**
@@ -58,86 +322,53 @@ void rt_smp_call_ipi_handler(int vector, void *param)
  * else it will call function on specified CPU and return immediately
  * @param cond the condition function pointer,if you set it then it will call function only when cond return true
  */
-void rt_smp_call_func_cond(int cpu_mask, rt_smp_call_func_back func, void *data, rt_uint8_t flag, rt_smp_cond cond)
+void rt_smp_call_func_cond(rt_ubase_t cpu_mask, rt_smp_call_cb_t func, void *data, rt_uint8_t flag, rt_smp_cond_t cond)
 {
-    RT_DEBUG_NOT_IN_INTERRUPT;
-    struct rt_smp_event event;
-    rt_bool_t need_call = RT_TRUE, need_wait = RT_FALSE;
-    int cur_cpu = rt_hw_cpu_id();
-    int cpuid = 1 << cur_cpu;
-    int tmp_id = 0, cpu_nr = 0;
-    int tmp_mask;
-    int irq_flag;
+    int oncpu;
+    rt_ubase_t clvl;
 
-    if (flag == SMP_CALL_WAIT_ALL)
+    RT_ASSERT(!rt_hw_interrupt_is_disabled());
+
+    clvl = rt_enter_critical();
+    oncpu = rt_hw_cpu_id();
+
+    if (cpu_mask <= RT_ALL_CPU)
     {
-        need_wait = RT_TRUE;
-        rt_atomic_store(&rt_smp_wait, 0);
+        _smp_call_func_cond(oncpu, cpu_mask, func, data, flag, cond);
     }
 
-    if (cpuid & cpu_mask)
-    {
-        func(data);
-        cpu_mask = cpu_mask & (~cpuid);
-    }
-
-    if (!cpu_mask)
-        need_call = RT_FALSE;
-
-    tmp_mask = cpu_mask;
-    if (need_call)
-    {
-        while (tmp_mask)
-        {
-            if ((tmp_mask & 1) && (tmp_id < RT_CPUS_NR))
-            {
-                if (cond && !cond(tmp_id, data))
-                    continue;
-                cpu_nr++;
-                event.event_id = SMP_CALL_EVENT_FUNC;
-                event.func = func;
-                event.data = data;
-                event.cpu_mask = cpu_mask;
-                irq_flag = rt_spin_lock_irqsave(&rt_smp_work[tmp_id].lock);
-                rt_smp_work[tmp_id].event = event;
-                rt_spin_unlock_irqrestore(&rt_smp_work[tmp_id].lock,irq_flag);
-            }
-            tmp_id++;
-            tmp_mask = tmp_mask >> 1;
-        }
-        rt_hw_ipi_send(RT_FUNC_IPI, cpu_mask);
-    }
-
-    if (need_wait)
-    {
-        while (rt_atomic_load(&rt_smp_wait) != cpu_nr);
-    }
+    rt_exit_critical_safe(clvl);
 }
 
-void rt_smp_call_each_cpu(rt_smp_call_func_back func, void *data, rt_uint8_t flag)
+void rt_smp_call_each_cpu(rt_smp_call_cb_t func, void *data, rt_uint8_t flag)
 {
     rt_smp_call_func_cond(RT_ALL_CPU, func, data, flag, RT_NULL);
 }
 
-void rt_smp_call_each_cpu_cond(rt_smp_call_func_back func, void *data, rt_uint8_t flag, rt_smp_cond cond_func)
+void rt_smp_call_each_cpu_cond(rt_smp_call_cb_t func, void *data, rt_uint8_t flag, rt_smp_cond_t cond_func)
 {
     rt_smp_call_func_cond(RT_ALL_CPU, func, data, flag, cond_func);
 }
-void rt_smp_call_any_cpu(int cpu_mask, rt_smp_call_func_back func, void *data, rt_uint8_t flag)
+
+void rt_smp_call_cpu_mask(rt_ubase_t cpu_mask, rt_smp_call_cb_t func, void *data, rt_uint8_t flag)
 {
     rt_smp_call_func_cond(cpu_mask, func, data, flag, RT_NULL);
 }
 
-void rt_smp_call_any_cpu_cond(int cpu_mask, rt_smp_call_func_back func, void *data, rt_uint8_t flag, rt_smp_cond cond_func)
+void rt_smp_call_cpu_mask_cond(rt_ubase_t cpu_mask, rt_smp_call_cb_t func, void *data, rt_uint8_t flag, rt_smp_cond_t cond_func)
 {
     rt_smp_call_func_cond(cpu_mask, func, data, flag, cond_func);
 }
 
-void rt_smp_init(void)
+void rt_smp_call_init(void)
 {
+    rt_memset(&_smp_data_cores, 0, sizeof(_smp_data_cores));
+
     for (int i = 0; i < RT_CPUS_NR; i++)
     {
-        rt_memset(&rt_smp_work[i], 0, sizeof(struct rt_smp_call));
-        rt_spin_lock_init(&rt_smp_work[i].lock);
+        for (int j = 0; j < RT_CPUS_NR; j++)
+        {
+            rt_hw_spin_lock_init(&_smp_data_cores[i].call_req_cores[j].freed_lock);
+        }
     }
 }
diff --git a/components/drivers/smp/smp.h b/components/drivers/smp/smp.h
index 6932fd1a7f..18198b39f7 100644
--- a/components/drivers/smp/smp.h
+++ b/components/drivers/smp/smp.h
@@ -1,34 +1,69 @@
+/*
+ * Copyright (c) 2006-2024 RT-Thread Development Team
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Change Logs:
+ * Date           Author       Notes
+ * 2024/9/12      zhujiale     the first version
+ * 2024/10/24     Shell        added non-blocking IPI calling method
+ */
+
 #ifndef __SMP_IPI_H__
 #define __SMP_IPI_H__
 #include <rtthread.h>
 
-typedef void (*rt_smp_call_func_back)(void *data);
-typedef rt_bool_t (*rt_smp_cond)(int cpu, void *info);
-#define SMP_CALL_EVENT_FUNC 0x1
+/* callback of smp call */
+typedef void (*rt_smp_call_cb_t)(void *data);
+typedef rt_bool_t (*rt_smp_cond_t)(int cpu, void *info);
 
-#define SMP_CALL_WAIT_ALL (1 << 0)
-#define SMP_CALL_NO_WAIT (1 << 1)
+#define SMP_CALL_EVENT_GLOB_ASYNC 0x1
+#define SMP_CALL_EVENT_GLOB_SYNC 0x2
+#define SMP_CALL_EVENT_REQUEST 0x4
+
+#define SMP_CALL_WAIT_ALL (1ul << 0)
+#define SMP_CALL_NO_LOCAL (1ul << 1)
+#define SMP_CALL_SIGNAL (1ul << 2)
 
 #define RT_ALL_CPU ((1 << RT_CPUS_NR) - 1)
 struct rt_smp_event
 {
-    int cpu_mask;
     int event_id;
     void *data;
-    rt_smp_call_func_back func;
-};
-struct rt_smp_call
-{
-    struct rt_spinlock lock;
-    struct rt_smp_event event;
+    rt_smp_call_cb_t func;
+
+    union
+    {
+        rt_atomic_t *calling_cpu_mask;
+        rt_atomic_t usage_tracer;
+    } typed;
 };
+
+struct rt_smp_call_req
+{
+    /* handle the busy status synchronization */
+    rt_hw_spinlock_t freed_lock;
+    struct rt_smp_event event;
+    rt_ll_slist_t slist_node;
+};
 
 void rt_smp_call_ipi_handler(int vector, void *param);
-void rt_smp_call_each_cpu(rt_smp_call_func_back func, void *data, rt_uint8_t flag);
-void rt_smp_call_each_cpu_cond(rt_smp_call_func_back func, void *data, rt_uint8_t flag, rt_smp_cond cond_func);
-void rt_smp_call_any_cpu(int cpu_mask, rt_smp_call_func_back func, void *data, rt_uint8_t flag);
-void rt_smp_call_any_cpu_cond(int cpu_mask, rt_smp_call_func_back func, void *data, rt_uint8_t flag, rt_smp_cond cond_func);
-void rt_smp_init(void);
+void rt_smp_call_each_cpu(rt_smp_call_cb_t func, void *data, rt_uint8_t flags);
+void rt_smp_call_each_cpu_cond(rt_smp_call_cb_t func, void *data, rt_uint8_t flag, rt_smp_cond_t cond_func);
+void rt_smp_call_cpu_mask(rt_ubase_t cpu_mask, rt_smp_call_cb_t func, void *data, rt_uint8_t flags);
+void rt_smp_call_cpu_mask_cond(rt_ubase_t cpu_mask, rt_smp_call_cb_t func, void *data, rt_uint8_t flag, rt_smp_cond_t cond_func);
+void rt_smp_call_init(void);
+rt_err_t rt_smp_call_request(int callcpu, rt_uint8_t flags, struct rt_smp_call_req *call_req);
+void rt_smp_call_req_init(struct rt_smp_call_req *call_req,
+                          rt_smp_call_cb_t func, void *data);
+void rt_smp_request_wait_freed(struct rt_smp_call_req *req);
+
+#define rt_smp_for_each_cpu(_iter) for (_iter = 0; (_iter) < RT_CPUS_NR; (_iter)++)
+rt_inline size_t rt_smp_get_next_remote(size_t iter, size_t cpuid)
+{
+    iter++;
+    return iter == cpuid ? iter + 1 : iter;
+}
+#define rt_smp_for_each_remote_cpu(_iter, _cpuid) for (_iter = rt_smp_get_next_remote(-1, _cpuid); (_iter) < RT_CPUS_NR; _iter=rt_smp_get_next_remote(_iter, _cpuid))
 
 #endif
diff --git a/include/rtdef.h b/include/rtdef.h
index 1e7641ec56..d392f84050 100644
--- a/include/rtdef.h
+++ b/include/rtdef.h
@@ -674,8 +674,8 @@ typedef struct rt_cpu_usage_stats *rt_cpu_usage_stats_t;
 #define RT_STOP_IPI 1
 #endif /* RT_STOP_IPI */
 
-#ifndef RT_FUNC_IPI
-#define RT_FUNC_IPI 2
+#ifndef RT_SMP_CALL_IPI
+#define RT_SMP_CALL_IPI 2
 #endif
 
 #define RT_MAX_IPI 3
diff --git a/libcpu/aarch64/common/setup.c b/libcpu/aarch64/common/setup.c
index 6c16801a10..4ea111d06a 100644
--- a/libcpu/aarch64/common/setup.c
+++ b/libcpu/aarch64/common/setup.c
@@ -302,14 +302,14 @@ void rt_hw_common_setup(void)
     rt_thread_idle_sethook(rt_hw_idle_wfi);
 
 #ifdef RT_USING_SMP
-    rt_smp_init();
+    rt_smp_call_init();
     /* Install the IPI handle */
     rt_hw_ipi_handler_install(RT_SCHEDULE_IPI, rt_scheduler_ipi_handler);
     rt_hw_ipi_handler_install(RT_STOP_IPI, rt_scheduler_ipi_handler);
-    rt_hw_ipi_handler_install(RT_FUNC_IPI, rt_smp_call_ipi_handler);
+    rt_hw_ipi_handler_install(RT_SMP_CALL_IPI, rt_smp_call_ipi_handler);
     rt_hw_interrupt_umask(RT_SCHEDULE_IPI);
     rt_hw_interrupt_umask(RT_STOP_IPI);
-    rt_hw_interrupt_umask(RT_FUNC_IPI);
+    rt_hw_interrupt_umask(RT_SMP_CALL_IPI);
 #endif
 }
 
@@ -391,6 +391,7 @@ rt_weak void rt_hw_secondary_cpu_bsp_start(void)
 
     rt_hw_interrupt_umask(RT_SCHEDULE_IPI);
     rt_hw_interrupt_umask(RT_STOP_IPI);
+    rt_hw_interrupt_umask(RT_SMP_CALL_IPI);
 
     LOG_I("Call cpu %d on %s", cpu_id, "success");
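
Not part of the patch: a minimal usage sketch of the API added above, for reviewers. The callback, the demo function name and the hard-coded target core 1 are illustrative assumptions (a configuration with RT_CPUS_NR >= 2 is assumed); only functions introduced by this change set plus standard RT-Thread calls are used.

#include <rtthread.h>
#include "smp.h"

/* illustrative callback: report which core ran it */
static void _say_hello(void *data)
{
    rt_kprintf("smp call on cpu %d, arg=%d\n", rt_hw_cpu_id(), *(int *)data);
}

/* one request object per in-flight non-blocking call; reusable once freed */
static struct rt_smp_call_req _req;

void smp_call_demo(void)
{
    int arg = 42;

    /* blocking broadcast: run the callback on every core and wait for all of them */
    rt_smp_call_each_cpu(_say_hello, &arg, SMP_CALL_WAIT_ALL);

    /* non-blocking request targeted at core 1; also legal from ISR or IRQ-masked code */
    rt_smp_call_req_init(&_req, _say_hello, &arg);
    if (rt_smp_call_request(1, 0, &_req) == RT_EOK)
    {
        /* in thread context, optionally wait until the request slot is freed again */
        rt_smp_request_wait_freed(&_req);
    }
}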