AMD GCN: Implement circular buffering.

The GCN port outputs stdout and stderr via a shared-memory interface. Previously the buffer was limited to 1000 write operations, which was enough for testing purposes, but easy to exhaust. This patch implements a new circular buffering system allowing a greater amount of output. The interface must allow hundreds of hardware threads to output simultaneously. The new limit is UINT32_MAX write operations. Unfortunately, there's no way to tell if the host side has also been updated. This code will misbehave unless the gcn-run from GCC is also updated (although it's fine the other way around), but that patch has already been committed. OK? Andrew Stubbs Mentor Graphics / CodeSourcery
2019-03-18 16:18:09 +00:00 · 2019-03-18 16:18:09 +00:00 · 62c66a39bd
parent 38322b9bf6
commit 62c66a39bd
1 changed files with 38 additions and 17 deletions
--- a/newlib/libc/sys/amdgcn/write.c
+++ b/newlib/libc/sys/amdgcn/write.c
@ -26,10 +26,14 @@
 
   The next_output counter must be atomically incremented for each
   print output.  Only when the print data is fully written can the
-   "written" flag be set.  */
+   "written" flag be set.
+
+   The buffer is circular; the host increments the consumed counter
+   and clears the written flag as it goes, opening up slots for reuse.
+   The counters always use absolute numbers.  */
 struct output {
  int return_value;
-  int next_output;
+  unsigned int next_output;
  struct printf_data {
    int written;
    char msg[128];
@ -39,7 +43,8 @@ struct output {
      double dvalue;
      char text[128];
    };
-  } queue[1000];
+  } queue[1024];
+  unsigned int consumed;
 };

 _READ_WRITE_RETURN_TYPE write (int fd, const void *buf, size_t count)
@ -55,33 +60,49 @@ _READ_WRITE_RETURN_TYPE write (int fd, const void *buf, size_t count)
  struct output *data = (struct output *)kernargs[2];

  /* Each output slot allows 256 bytes, so reserve as many as we need. */
-  int slot_count = ((count+1)/256)+1;
-  int index = __atomic_fetch_add (&data->next_output, slot_count,
-				  __ATOMIC_ACQUIRE);
+  unsigned int slot_count = ((count+1)/256)+1;
+  unsigned int index = __atomic_fetch_add (&data->next_output, slot_count,
+					   __ATOMIC_ACQUIRE);
+
+  if ((unsigned int)(index + slot_count) < data->consumed)
+    {
+      /* Overflow.  */
+      errno = EFBIG;
+      return 0;
+    }
+
  for (int c = count;
-       c >= 0 && index < 1000;
+       c >= 0;
       buf += 256, c -= 256, index++)
    {
+      unsigned int slot = index % 1024;
+
+      /* Spinlock while the host catches up.  */
+      if (index >= 1024)
+	while (__atomic_load_n (&data->consumed, __ATOMIC_ACQUIRE)
+	       <= (index - 1024))
+	  asm ("s_sleep 64");
+
      if (c < 128)
 	{
-	  memcpy (data->queue[index].msg, buf, c);
-	  data->queue[index].msg[c] = '\0';
-	  data->queue[index].text[0] = '\0';
+	  memcpy (data->queue[slot].msg, buf, c);
+	  data->queue[slot].msg[c] = '\0';
+	  data->queue[slot].text[0] = '\0';
 	}
      else if (c < 256)
 	{
-	  memcpy (data->queue[index].msg, buf, 128);
-	  memcpy (data->queue[index].text, buf+128, c-128);
-	  data->queue[index].text[c-128] = '\0';
+	  memcpy (data->queue[slot].msg, buf, 128);
+	  memcpy (data->queue[slot].text, buf+128, c-128);
+	  data->queue[slot].text[c-128] = '\0';
 	}
      else
 	{
-	  memcpy (data->queue[index].msg, buf, 128);
-	  memcpy (data->queue[index].text, buf+128, 128);
+	  memcpy (data->queue[slot].msg, buf, 128);
+	  memcpy (data->queue[slot].text, buf+128, 128);
 	}

-      data->queue[index].type = 3; /* Raw.  */
-      __atomic_store_n (&data->queue[index].written, 1, __ATOMIC_RELEASE);
+      data->queue[slot].type = 3; /* Raw.  */
+      __atomic_store_n (&data->queue[slot].written, 1, __ATOMIC_RELEASE);
    }

  return count;