AMD GCN: Implement circular buffering.

The GCN port outputs stdout and stderr via a shared-memory interface.
Previously the buffer was limited to 1000 write operations, which was enough
for testing purposes, but easy to exhaust.

This patch implements a new circular buffering system allowing a greater
amount of output.  The interface must allow hundreds of hardware threads to
output simultaneously.  The new limit is UINT32_MAX write operations.

Unfortunately, there's no way to tell if the host side has also been updated.
This code will misbehave unless the gcn-run from GCC is also updated (although
it's fine the other way around), but that patch has already been committed.

OK?

Andrew Stubbs
Mentor Graphics / CodeSourcery
This commit is contained in:
Andrew Stubbs 2019-03-18 16:18:09 +00:00 committed by Corinna Vinschen
parent 38322b9bf6
commit 62c66a39bd
1 changed file with 38 additions and 17 deletions

View File

@@ -26,10 +26,14 @@
    The next_output counter must be atomically incremented for each
    print output.  Only when the print data is fully written can the
-   "written" flag be set.  */
+   "written" flag be set.
+
+   The buffer is circular; the host increments the consumed counter
+   and clears the written flag as it goes, opening up slots for reuse.
+   The counters always use absolute numbers.  */
 
 struct output {
   int return_value;
-  int next_output;
+  unsigned int next_output;
   struct printf_data {
     int written;
     char msg[128];
@@ -39,7 +43,8 @@ struct output {
       double dvalue;
       char text[128];
     };
-  } queue[1000];
+  } queue[1024];
+  unsigned int consumed;
 };
@@ -55,33 +60,49 @@ _READ_WRITE_RETURN_TYPE write (int fd, const void *buf, size_t count)
   struct output *data = (struct output *)kernargs[2];
 
   /* Each output slot allows 256 bytes, so reserve as many as we need.  */
-  int slot_count = ((count+1)/256)+1;
-  int index = __atomic_fetch_add (&data->next_output, slot_count,
-				  __ATOMIC_ACQUIRE);
+  unsigned int slot_count = ((count+1)/256)+1;
+  unsigned int index = __atomic_fetch_add (&data->next_output, slot_count,
+					   __ATOMIC_ACQUIRE);
+
+  if ((unsigned int)(index + slot_count) < data->consumed)
+    {
+      /* Overflow.  */
+      errno = EFBIG;
+      return 0;
+    }
 
   for (int c = count;
-       c >= 0 && index < 1000;
+       c >= 0;
        buf += 256, c -= 256, index++)
     {
+      unsigned int slot = index % 1024;
+
+      /* Spinlock while the host catches up.  */
+      if (index >= 1024)
+	while (__atomic_load_n (&data->consumed, __ATOMIC_ACQUIRE)
+	       <= (index - 1024))
+	  asm ("s_sleep 64");
+
       if (c < 128)
 	{
-	  memcpy (data->queue[index].msg, buf, c);
-	  data->queue[index].msg[c] = '\0';
-	  data->queue[index].text[0] = '\0';
+	  memcpy (data->queue[slot].msg, buf, c);
+	  data->queue[slot].msg[c] = '\0';
+	  data->queue[slot].text[0] = '\0';
 	}
       else if (c < 256)
 	{
-	  memcpy (data->queue[index].msg, buf, 128);
-	  memcpy (data->queue[index].text, buf+128, c-128);
-	  data->queue[index].text[c-128] = '\0';
+	  memcpy (data->queue[slot].msg, buf, 128);
+	  memcpy (data->queue[slot].text, buf+128, c-128);
+	  data->queue[slot].text[c-128] = '\0';
 	}
       else
 	{
-	  memcpy (data->queue[index].msg, buf, 128);
-	  memcpy (data->queue[index].text, buf+128, 128);
+	  memcpy (data->queue[slot].msg, buf, 128);
+	  memcpy (data->queue[slot].text, buf+128, 128);
 	}
 
-      data->queue[index].type = 3; /* Raw.  */
-      __atomic_store_n (&data->queue[index].written, 1, __ATOMIC_RELEASE);
+      data->queue[slot].type = 3; /* Raw.  */
+      __atomic_store_n (&data->queue[slot].written, 1, __ATOMIC_RELEASE);
     }
 
   return count;