297 lines
10 KiB
C
297 lines
10 KiB
C
#include <rtgui/rtgui_xml.h>
|
|
|
|
/* Internal states that the parser can be in at any given time. */
|
|
enum {
|
|
STAT_START = 0, /* starting base state, default state */
|
|
STAT_TEXT, /* text state */
|
|
STAT_START_TAG, /* start tag state */
|
|
STAT_START_TAGNAME, /* start tagname state */
|
|
STAT_START_TAGNAME_END, /* start tagname ending state */
|
|
STAT_END_TAG, /* end tag state */
|
|
STAT_END_TAGNAME, /* end tag tagname state */
|
|
STAT_END_TAGNAME_END, /* end tag tagname ending */
|
|
STAT_EMPTY_TAG, /* empty tag state */
|
|
STAT_SPACE, /* linear whitespace state */
|
|
STAT_ATTR_NAME, /* attribute name state */
|
|
STAT_ATTR_NAME_END, /* attribute name ending state */
|
|
STAT_ATTR_VAL, /* attribute value starting state */
|
|
STAT_ATTR_VAL2, /* attribute value state */
|
|
STAT_ERROR /* error state */
|
|
};
|
|
|
|
/* character classes that we will match against; This could be expanded if
|
|
need be, however, we are aiming for simple. */
|
|
enum {
|
|
CLASS_TYPE_NONE = 0, /* matches nothing, a base state */
|
|
CLASS_TYPE_LEFT_ANGLE, /* matches start tag '<' */
|
|
CLASS_TYPE_SLASH, /* matches forward slash */
|
|
CLASS_TYPE_RIGHT_ANGLE, /* matches end tag '>' */
|
|
CLASS_TYPE_EQUALS, /* matches equals sign */
|
|
CLASS_TYPE_QUOTE, /* matches double-quotes */
|
|
CLASS_TYPE_LETTERS, /* matches a-zA-Z letters and digits 0-9 */
|
|
CLASS_TYPE_SPACE, /* matches whitespace */
|
|
CLASS_TYPE_ANY /* matches any ASCII character; will match all
|
|
above classes */
|
|
};
|
|
|
|
/* xml state transition table */
|
|
struct rtgui_xml_state
|
|
{
|
|
rt_uint8_t state;
|
|
rt_uint8_t class_type;
|
|
rt_uint8_t next_state;
|
|
rt_uint8_t event;
|
|
};
|
|
|
|
/* Note: States must be grouped in match order AND grouped together! */
|
|
static const struct rtgui_xml_state RTGUI_XML_STATES [] = {
|
|
/* [0-2] starting state, which also serves as the default state in case
|
|
of error */
|
|
{ STAT_START, CLASS_TYPE_SPACE, STAT_SPACE, EVENT_NONE },
|
|
{ STAT_START, CLASS_TYPE_LEFT_ANGLE, STAT_START_TAG, EVENT_NONE },
|
|
{ STAT_START, CLASS_TYPE_ANY, STAT_TEXT, EVENT_COPY },
|
|
|
|
/* [3-5] space state handles linear white space */
|
|
{ STAT_SPACE, CLASS_TYPE_SPACE, STAT_SPACE, EVENT_NONE },
|
|
{ STAT_SPACE, CLASS_TYPE_LEFT_ANGLE, STAT_START_TAG, EVENT_TEXT },
|
|
{ STAT_SPACE, CLASS_TYPE_ANY, STAT_TEXT, EVENT_COPY },
|
|
|
|
/* [6-8] handle start tag */
|
|
{ STAT_START_TAG, CLASS_TYPE_LETTERS, STAT_START_TAGNAME, EVENT_COPY },
|
|
{ STAT_START_TAG, CLASS_TYPE_SLASH, STAT_END_TAG, EVENT_COPY },
|
|
/* below added since some individuals get a little carried away with
|
|
spacing around tag names, e.g. < tag > */
|
|
{ STAT_START_TAG, CLASS_TYPE_SPACE, STAT_START_TAG, EVENT_NONE },
|
|
|
|
/* [9-12] handle start tag name */
|
|
{ STAT_START_TAGNAME, CLASS_TYPE_LETTERS, STAT_START_TAGNAME, EVENT_NONE },
|
|
{ STAT_START_TAGNAME, CLASS_TYPE_SPACE, STAT_START_TAGNAME_END, EVENT_START },
|
|
/* below added for tags without any space between tag and ending
|
|
slash, e.g., <br/> */
|
|
{ STAT_START_TAGNAME, CLASS_TYPE_SLASH, STAT_EMPTY_TAG, EVENT_END },
|
|
{ STAT_START_TAGNAME, CLASS_TYPE_RIGHT_ANGLE, STAT_START, EVENT_START },
|
|
|
|
/* [13-16] handle start tag name end */
|
|
{ STAT_START_TAGNAME_END, CLASS_TYPE_LETTERS, STAT_ATTR_NAME, EVENT_COPY },
|
|
/* below added to handle additional space in between attribute value
|
|
pairs in start tags, e.g., <tag attr="2" attr2="test" > */
|
|
{ STAT_START_TAGNAME_END, CLASS_TYPE_SPACE, STAT_START_TAGNAME_END, EVENT_NONE },
|
|
{ STAT_START_TAGNAME_END, CLASS_TYPE_RIGHT_ANGLE, STAT_START, EVENT_START },
|
|
/* below supports tags that are self-closing, e.g., <br /> */
|
|
{ STAT_START_TAGNAME_END, CLASS_TYPE_SLASH, STAT_EMPTY_TAG, EVENT_COPY },
|
|
|
|
/* [17] handle empty tags, e.g., <br /> */
|
|
{ STAT_EMPTY_TAG, CLASS_TYPE_RIGHT_ANGLE, STAT_START, EVENT_END },
|
|
|
|
/* [18] handle end tag, e.g., <tag /> */
|
|
{ STAT_END_TAG, CLASS_TYPE_LETTERS, STAT_END_TAGNAME, EVENT_NONE },
|
|
|
|
/* [19-21] handle end tag name */
|
|
{ STAT_END_TAGNAME, CLASS_TYPE_LETTERS, STAT_END_TAGNAME, EVENT_NONE },
|
|
{ STAT_END_TAGNAME, CLASS_TYPE_RIGHT_ANGLE, STAT_START, EVENT_END },
|
|
/* below adds support for spaces at the end of an end tag (before
|
|
closing bracket) */
|
|
{ STAT_END_TAGNAME, CLASS_TYPE_SPACE, STAT_END_TAGNAME_END, EVENT_END },
|
|
|
|
/* [22] handle ending of end tag name */
|
|
{ STAT_END_TAGNAME_END, CLASS_TYPE_SPACE, STAT_END_TAGNAME_END, EVENT_NONE },
|
|
{ STAT_END_TAGNAME_END, CLASS_TYPE_RIGHT_ANGLE,STAT_START, EVENT_NONE },
|
|
|
|
/* [23-25] handle text */
|
|
{ STAT_TEXT, CLASS_TYPE_SPACE, STAT_SPACE, EVENT_NONE },
|
|
{ STAT_TEXT, CLASS_TYPE_LEFT_ANGLE, STAT_START_TAG, EVENT_TEXT },
|
|
{ STAT_TEXT, CLASS_TYPE_ANY, STAT_TEXT, EVENT_NONE },
|
|
|
|
/* [26-30] handle attribute names */
|
|
{ STAT_ATTR_NAME, CLASS_TYPE_LETTERS, STAT_ATTR_NAME, EVENT_COPY },
|
|
/* below add support for space before the equals sign, e.g, <tag
|
|
attr ="2"> */
|
|
{ STAT_ATTR_NAME, CLASS_TYPE_SPACE, STAT_ATTR_NAME_END, EVENT_NAME },
|
|
{ STAT_ATTR_NAME, CLASS_TYPE_EQUALS, STAT_ATTR_VAL, EVENT_NAME },
|
|
|
|
/* [31-33] attribute name end */
|
|
{ STAT_ATTR_NAME_END, CLASS_TYPE_SPACE, STAT_ATTR_NAME_END, EVENT_NONE },
|
|
{ STAT_ATTR_NAME_END, CLASS_TYPE_LETTERS, STAT_ATTR_NAME, EVENT_COPY },
|
|
{ STAT_ATTR_NAME_END, CLASS_TYPE_EQUALS, STAT_ATTR_VAL, EVENT_NONE },
|
|
|
|
/* [34-35] handle attribute values, initial quote and spaces */
|
|
{ STAT_ATTR_VAL, CLASS_TYPE_QUOTE, STAT_ATTR_VAL2, EVENT_NONE },
|
|
/* below handles initial spaces before quoted attribute value */
|
|
{ STAT_ATTR_VAL, CLASS_TYPE_SPACE, STAT_ATTR_VAL, EVENT_NONE },
|
|
|
|
/* [36-37] handle actual attribute values */
|
|
{ STAT_ATTR_VAL2, CLASS_TYPE_QUOTE, STAT_START_TAGNAME_END, EVENT_VAL },
|
|
{ STAT_ATTR_VAL2, CLASS_TYPE_LETTERS, STAT_ATTR_VAL2, EVENT_COPY },
|
|
{ STAT_ATTR_VAL2, CLASS_TYPE_SLASH, STAT_ATTR_VAL2, EVENT_NONE },
|
|
|
|
/* End of table marker */
|
|
{ STAT_ERROR, CLASS_TYPE_NONE, STAT_ERROR, EVENT_NONE }
|
|
};
|
|
|
|
struct rtgui_xml
|
|
{
|
|
/* event handler */
|
|
rtgui_xml_event_handler_t event_handler;
|
|
void* user;
|
|
|
|
char* buffer; /* xml buffer */
|
|
rt_size_t buffer_size; /* buffer size */
|
|
rt_size_t position; /* current position in buffer */
|
|
rt_uint16_t state, event; /* current state and event */
|
|
|
|
rt_bool_t copy; /* copy text into tmp buffer */
|
|
rt_bool_t halt; /* halt parsing of document */
|
|
};
|
|
|
|
rtgui_xml_t* rtgui_xml_create(rt_size_t buffer_size, rtgui_xml_event_handler_t handler,
|
|
void* user)
|
|
{
|
|
rtgui_xml_t* xml = (rtgui_xml_t*) rtgui_malloc(sizeof(struct rtgui_xml));
|
|
rt_memset(xml, 0, sizeof(rtgui_xml_t));
|
|
|
|
xml->event_handler = handler;
|
|
xml->user = user;
|
|
|
|
/* create buffer */
|
|
xml->buffer_size = buffer_size;
|
|
xml->buffer = (char*)rtgui_malloc(xml->buffer_size);
|
|
return xml;
|
|
}
|
|
|
|
void rtgui_xml_destroy(rtgui_xml_t* xml)
|
|
{
|
|
if(xml)
|
|
{
|
|
rtgui_free(xml->buffer);
|
|
rtgui_free(xml);
|
|
}
|
|
}
|
|
|
|
const char* rtgui_xml_event_str(rt_uint8_t event)
|
|
{
|
|
switch(event)
|
|
{
|
|
case EVENT_START:
|
|
return "start tag";
|
|
break;
|
|
case EVENT_END:
|
|
return "end tag";
|
|
break;
|
|
case EVENT_TEXT:
|
|
return "text";
|
|
break;
|
|
case EVENT_NAME:
|
|
return "attr name";
|
|
break;
|
|
case EVENT_VAL:
|
|
return "attr val";
|
|
break;
|
|
case EVENT_END_DOC:
|
|
return "end document";
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
return "err";
|
|
}
|
|
|
|
int rtgui_xml_parse(rtgui_xml_t* xml, const char* buf, rt_size_t len)
|
|
{
|
|
int i, j, c, match;
|
|
|
|
#define is_space(ch) \
|
|
((rt_uint32_t)(ch - 9) < 5u || ch == ' ')
|
|
#define is_alpha(ch) \
|
|
((rt_uint32_t)((ch | 0x20) - 'a') < 26u)
|
|
#define is_digit(ch) \
|
|
((rt_uint32_t)(ch - '0') < 10u)
|
|
#define is_letters(ch) \
|
|
(is_alpha(ch) || is_digit(ch) || (ch == '.'))
|
|
|
|
for(i=0; i<len; i++)
|
|
{
|
|
if(xml->halt) break;
|
|
|
|
c = buf[i] & 0xff;
|
|
|
|
/* search in state table */
|
|
for(j=0, match = 0; RTGUI_XML_STATES[j].state != STAT_ERROR; j++)
|
|
{
|
|
if(RTGUI_XML_STATES[j].state != xml->state)
|
|
continue;
|
|
|
|
switch(RTGUI_XML_STATES[j].class_type)
|
|
{
|
|
case CLASS_TYPE_LETTERS:
|
|
match = is_letters(c);
|
|
break;
|
|
case CLASS_TYPE_LEFT_ANGLE:
|
|
match = (c == '<');
|
|
break;
|
|
case CLASS_TYPE_SLASH:
|
|
match = (c == '/');
|
|
break;
|
|
case CLASS_TYPE_RIGHT_ANGLE:
|
|
match = (c == '>');
|
|
break;
|
|
case CLASS_TYPE_EQUALS:
|
|
match = (c == '=');
|
|
break;
|
|
case CLASS_TYPE_QUOTE:
|
|
match = (c == '"');
|
|
break;
|
|
case CLASS_TYPE_SPACE:
|
|
match = is_space(c);
|
|
break;
|
|
case CLASS_TYPE_ANY:
|
|
match = 1;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
/* we matched a character class */
|
|
if(match)
|
|
{
|
|
if(RTGUI_XML_STATES[j].event == EVENT_COPY)
|
|
{
|
|
xml->copy = RT_TRUE;
|
|
}
|
|
else if(RTGUI_XML_STATES[j].event != EVENT_NONE)
|
|
{
|
|
if(xml->copy == RT_TRUE)
|
|
{
|
|
/* basically we are guaranteed never to have an event of
|
|
type EVENT_COPY or EVENT_NONE here. */
|
|
xml->event = RTGUI_XML_STATES[j].event;
|
|
xml->buffer[xml->position] = 0; /* make a string */
|
|
|
|
if(!xml->event_handler(RTGUI_XML_STATES[j].event,
|
|
xml->buffer, xml->position ,
|
|
xml->user))
|
|
{
|
|
xml->halt = 1; /* stop parsing from here out */
|
|
}
|
|
xml->position = 0;
|
|
xml->copy = RT_FALSE;
|
|
}
|
|
}
|
|
if(xml->copy == RT_TRUE)
|
|
{
|
|
/* check to see if we have room; one less for trailing
|
|
nul */
|
|
if(xml->position < xml->buffer_size-1)
|
|
{
|
|
xml->buffer[xml->position] = buf[i];
|
|
xml->position++;
|
|
}
|
|
}
|
|
xml->state = RTGUI_XML_STATES[j].next_state; /* change state */
|
|
break; /* break out of loop though state search */
|
|
}
|
|
}
|
|
}
|
|
|
|
return !xml->halt;
|
|
}
|