The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
// States of the character-driven state machine in parserc::parse().
// The parser starts in QK_VAL_1.
#define QK_OUTSIDE   0  // waiting for '<' (handled by the switch; parse() never selects it itself)
#define QK_VAL_1     1  // between tags, first character of a text run not yet consumed
#define QK_COMMENT   2  // inside <!-- ... -->
#define QK_PI        3  // inside <? ... ?>
#define QK_BANG      4  // inside an unrecognized <! ... > section
#define QK_CDATA     5  // inside a <![CDATA[ ... ]]> section
#define QK_NAME_1    6  // just after '<', expecting the first character of a tag name (or '/')
#define QK_NAME_X    7  // inside an opening tag name
#define QK_NAME_GAP  8  // whitespace after the tag name or after an attribute value
#define QK_ATT_SPACE 9  // whitespace after an attribute name
#define QK_ATT_NAME  10 // inside an attribute name
#define QK_ATT_EQ1   11 // after '=', before the first character of the attribute value
#define QK_ATT_EQX   12 // inside an unquoted attribute value
#define QK_ATT_QUOT  13 // inside a double-quoted attribute value
#define QK_ENAME_X   14 // inside a closing tag name
#define QK_ERROR     15 // malformed input seen; remaining characters are consumed and ignored
#define QK_VAL_X     16 // between tags, inside a text run after its first character


// Fallback definition of NULL, used only when nothing included earlier has defined it.
#ifndef NULL
  #define NULL 0x00
#endif

class attc; // defined after nodec; only pointers to it are needed here

// One element of the parsed document tree.
//
// Children and attributes are kept as singly linked lists (first/last pointers
// plus a count). The name and value pointers are not owned by the node: the
// parser stores addresses inside the buffer handed to parserc::parse(), together
// with explicit lengths, so the strings are not NUL-terminated by the node itself.
class nodec {
  public:
    // --- tree links ---
    nodec *curchild;    // set by parserc::parse() on the root to its first child
    nodec *parent;      // enclosing element, NULL for a root created with nodec()
    nodec *next;        // next sibling in the parent's child list
    nodec *firstchild;  // head of the child list
    nodec *lastchild;   // tail of the child list (append point for addchild)

    // --- attribute list ---
    attc  *firstatt;    // head of the attribute list
    attc  *lastatt;     // tail of the attribute list (append point for addatt)

    // --- counters ---
    int   numchildren;  // number of nodes appended through addchild
    int   numatt;       // number of attributes appended through addatt

    // --- payload ---
    char  *name;        // start of the tag name
    int   namelen;      // length of the tag name in characters
    char  *value;       // start of the text recorded for this element
    int   vallen;       // length of that text in characters
    int   type;         // cdata or normal
    int   numvals;      // text/CDATA runs counted by the parser for this element

    // Creates a node attached (by pointer only) to newparent.
    nodec( nodec *newparent );

    // Creates a parentless node.
    nodec();

    // Appends a new child with the given name, value and type and returns it.
    nodec *addchild( char *newname, int newnamelen, char *newval, int newvallen, int newtype );

    // Convenience overload: child with no value and type 0.
    nodec *addchild( char *newname, int newnamelen ) {
      return addchild( newname, newnamelen, 0, 0, 0 );
    }

    // Appends a new attribute with the given name and value and returns it.
    attc  *addatt( char *newname, int newnamelen, char *newval, int newvallen );

    // Convenience overload: attribute with no value yet.
    attc  *addatt( char *newname, int newnamelen ) {
      return addatt( newname, newnamelen, 0, 0 );
    }
};

// One attribute of an element.
// name/value are pointer + length pairs; the parser stores addresses inside the
// buffer passed to parserc::parse(), so the attribute does not own the text.
class attc {
  public:
  nodec *parent;   // element this attribute was added to
  attc  *next;     // next attribute of the same element
  char  *name;     // start of the attribute name
  int   namelen;   // length of the name in characters
  char  *value;    // start of the attribute value (0 when no value was recorded)
  int   vallen;    // length of the value in characters
  // Creates an empty attribute that points back at newparent.
  attc( nodec *newparent );
};

class parserc {
  nodec *pcurnode;
  attc  *curatt;
  public:
  nodec *parse( char *newbuf );
  void  descend    () { pcurnode = pcurnode->firstchild;  };
  void  ascend     () { pcurnode = pcurnode->parent;      };
  int   num_nodes  () { return     pcurnode->numchildren; };
  int   num_att    () { return     pcurnode->numatt;      };
  void  next_node  () { pcurnode = pcurnode->next;        };
  void  next_att   () { curatt   = curatt  ->next;        };
  void  first_att  () { curatt   = pcurnode->firstatt;    };
  int   node_type  () { return     pcurnode->type;        };
  char  *node_name ();
  char  *node_value();
  char  *att_name  ();
  char  *att_value ();
};

// Returns the name of the node under the cursor as a NUL-terminated string.
//
// The name is stored as pointer + length, so the terminator is written in
// place, overwriting the character that follows the name.
// Returns an empty string when there is no current node or it has no name.
// The empty string is a private static buffer (not a string literal, which must
// not be exposed through a non-const char*); callers must not modify it.
char *parserc::node_name() {
  static char empty[ 1 ] = { 0 };
  if( !pcurnode || !pcurnode->name ) return empty;
  int len = pcurnode->namelen;
  if( len <= 0 ) return empty;
  pcurnode->name[ len ] = 0x00; // terminate in place
  return pcurnode->name;
}

// Returns the text value of the node under the cursor as a NUL-terminated string.
//
// The value is stored as pointer + length, so the terminator is written in
// place, overwriting the character that follows the value.
// Returns an empty string when there is no current node or it has no value.
// The empty string is a private static buffer (not a string literal, which must
// not be exposed through a non-const char*); callers must not modify it.
char *parserc::node_value() {
  static char empty[ 1 ] = { 0 };
  if( !pcurnode || !pcurnode->value ) return empty;
  int len = pcurnode->vallen;
  if( len <= 0 ) return empty;
  pcurnode->value[ len ] = 0x00; // terminate in place
  return pcurnode->value;
}

// Returns the name of the attribute under the attribute cursor as a
// NUL-terminated string.
//
// The terminator is written in place, overwriting the character that follows
// the name. Returns an empty string when there is no current attribute or it
// has no name. The empty string is a private static buffer (not a string
// literal, which must not be exposed through a non-const char*); callers must
// not modify it.
char *parserc::att_name() {
  static char empty[ 1 ] = { 0 };
  if( !curatt || !curatt->name ) return empty;
  int len = curatt->namelen;
  if( len <= 0 ) return empty;
  curatt->name[ len ] = 0x00; // terminate in place
  return curatt->name;
}

// Returns the value of the attribute under the attribute cursor as a
// NUL-terminated string.
//
// The terminator is written in place, overwriting the character that follows
// the value. Returns an empty string when there is no current attribute or it
// has no value. The empty string is a private static buffer (not a string
// literal, which must not be exposed through a non-const char*); callers must
// not modify it.
char *parserc::att_value() {
  static char empty[ 1 ] = { 0 };
  if( !curatt || !curatt->value ) return empty;
  int len = curatt->vallen;
  if( len <= 0 ) return empty;
  curatt->value[ len ] = 0x00; // terminate in place
  return curatt->value;
}

// Creates an empty node whose parent pointer is newparent.
// The node is NOT linked into newparent's child list; nodec::addchild does that.
// Every member is given a defined value (listed in declaration order) so that
// readers such as parserc::first_att() or node_name() never see an
// indeterminate pointer on a node without attributes, name or value.
nodec::nodec( nodec *newparent )
  : curchild   ( NULL ),
    parent     ( newparent ),
    next       ( NULL ),
    firstchild ( NULL ),
    lastchild  ( NULL ),
    firstatt   ( NULL ),
    lastatt    ( NULL ),
    numchildren( 0 ),
    numatt     ( 0 ),
    name       ( NULL ),
    namelen    ( 0 ),
    value      ( NULL ),
    vallen     ( 0 ),
    type       ( 0 ),
    numvals    ( 0 ) {
}

// Creates an empty, parentless node (used for the root of a parse tree).
// Every member is given a defined value (listed in declaration order). In
// particular numvals must start at 0: parserc::parse() tests it on the root
// before recording any text.
nodec::nodec()
  : curchild   ( NULL ),
    parent     ( NULL ),
    next       ( NULL ),
    firstchild ( NULL ),
    lastchild  ( NULL ),
    firstatt   ( NULL ),
    lastatt    ( NULL ),
    numchildren( 0 ),
    numatt     ( 0 ),
    name       ( NULL ),
    namelen    ( 0 ),
    value      ( NULL ),
    vallen     ( 0 ),
    type       ( 0 ),
    numvals    ( 0 ) {
}

// Creates an attribute with no name and no value that points back at newparent.
// The attribute is not linked into the element's list here; nodec::addatt does that.
attc::attc( nodec *newparent )
  : parent ( newparent ),
    next   ( NULL ),
    name   ( NULL ),
    namelen( 0 ),
    value  ( NULL ),
    vallen ( 0 ) {
}

// Helper for parserc::parse(): moves pos forward by up to `count` characters
// but never onto the terminating NUL. buf[pos] must be non-NUL on entry. The
// parse loop always adds one more pos++ afterwards, so stopping early here
// makes that increment land on the NUL instead of stepping over it.
static int qk_skip( const char *buf, int pos, int count ) {
    while( count-- > 0 && buf[ pos + 1 ] ) pos++;
    return pos;
}

// Parses the NUL-terminated text in xmlin into a tree of nodec/attc objects.
//
// Returns a newly allocated root node whose children are the top-level
// elements; the node cursor of this parser is left on that root. A NULL xmlin
// yields an empty root.
//
// The tree does not copy any text: names and values are pointer + length pairs
// into xmlin, so the buffer must stay alive (and writable, because the
// accessors terminate strings in place) for as long as the tree is used.
// Nodes and attributes are allocated with new; nothing in this function
// releases them.
//
// The parser is a single pass, one character per iteration, driven by the
// QK_* states. Parsing stops at the NUL terminator or as soon as a closing tag
// pops above the root.
nodec* parserc::parse( char *xmlin ) {
    nodec *root = new nodec;

    char  *tagname     = NULL;  // start of the tag name being collected
    char  *attname     = NULL;  // start of the attribute name being collected
    char  *attval      = NULL;  // start of the attribute value being collected
    int   pos         = 0;
    int   state       = QK_VAL_1;
    int   tagname_len = 0;
    int   attname_len = 0;
    int   attval_len  = 0;
    nodec *curnode    = root;   // element currently open
    attc  *att        = NULL;   // attribute most recently added (receives the next value)
    char  let;

    while( xmlin != NULL && curnode != NULL ) {
        let = xmlin[ pos ];
        if( !let ) break;

        switch( state ) {

            case QK_OUTSIDE: // outside, waiting for <
                if( let == '<' ) {
                    state       = QK_NAME_1;
                    tagname_len = 0; // for safety
                }
                break;

            case QK_VAL_1:
            case QK_VAL_X: // outside grabbing value
                if( let == '<' ) {
                    // Each lookahead below is only evaluated after the previous
                    // character matched, so it never reads past the terminator.
                    if( xmlin[ pos + 1 ] == '!' ) {
                        if( xmlin[ pos + 2 ] == '[' ) { // <![
                            if( xmlin[ pos + 3 ] == 'C' &&
                                xmlin[ pos + 4 ] == 'D' &&
                                xmlin[ pos + 5 ] == 'A' &&
                                xmlin[ pos + 6 ] == 'T' &&
                                xmlin[ pos + 7 ] == 'A'    ) {
                                    // Skip "<![CDATA" plus the following '[';
                                    // that ninth character has not been checked,
                                    // so do not step over a terminator there.
                                    pos = qk_skip( xmlin, pos, 8 );
                                    state = QK_CDATA;
                                    break;
                            }
                            else {
                                state = QK_ERROR; // unrecognized <![ section
                                break;
                            }
                        }
                        else if( xmlin[ pos + 2 ] == '-' && // <!--
                            xmlin[ pos + 3 ] == '-' ) {
                                pos += 3;
                                state = QK_COMMENT;
                                break;
                            }
                        else {
                            state = QK_BANG; // unrecognized <! section
                            break;
                        }
                    }
                    else if( xmlin[ pos + 1 ] == '?' ) { // <?
                        pos++;
                        state = QK_PI;
                        break;
                    }
                    else {
                        state           = QK_NAME_1;
                        tagname_len     = 0; // for safety
                        break;
                    }
                }
                if( state == QK_VAL_1 ) {
                  // First character of a text run. Only the first run of an
                  // element is recorded; later runs are merely counted.
                  if( !curnode->numvals ) {
                    curnode->value = &xmlin[ pos ];
                    curnode->vallen = 1;
                  }
                  curnode->numvals++;
                  state = QK_VAL_X;
                }
                else {
                  if( curnode->numvals == 1 ) {
                    curnode->vallen++;
                  }
                }
                break;

            case QK_COMMENT:
                if( let == '-' &&
                    xmlin[ pos + 1 ] == '-' &&
                    xmlin[ pos + 2 ] == '>' ) {
                        state = QK_VAL_1;
                        pos += 2;
                    }
                break;

            case QK_PI:
                if( let == '?' &&
                    xmlin[ pos + 1 ] == '>' ) {
                        state = QK_VAL_1;
                        pos ++;
                    }
                break;

            case QK_BANG:
                if( let == '>' ) {
                  state = QK_VAL_1;
                }
                break;

            case QK_CDATA: // cdata section
                if( let == ']' &&
                    xmlin[ pos + 1 ] == ']' &&
                    xmlin[ pos + 2 ] == '>' ) {
                        state = QK_VAL_1;
                        pos += 2;
                        break;
                }
                if( !curnode->numvals ) {
                  curnode->value = &xmlin[ pos ];
                  curnode->vallen = 0;
                  curnode->numvals = 1;
                }
                if( curnode->numvals == 1 ) curnode->vallen++;

                break;

            case QK_NAME_1: // grabbing first letter after <
                if( let == ' ' || let == 0x0d || let == 0x0a ) break;
                if( let == '/' ) {
                    state       = QK_ENAME_X;
                    tagname_len = 0; // needed to reset
                    break;
                }
                tagname       = &xmlin[ pos ];
                tagname_len   = 1;
                state         = QK_NAME_X;
                break;

            case QK_NAME_X: // grabbing tag name
                if( let == ' ' || let == 0x0d || let == 0x0a ) {
                    curnode                = curnode->addchild( tagname, tagname_len );
                    state                  = QK_NAME_GAP;
                    attname_len            = 0;
                }
                else if( let == '>' ) {
                    curnode                = curnode->addchild( tagname, tagname_len );
                    state                  = QK_VAL_1;
                }
                else if( let == '/' ) {
                    // self-closing tag: add the child but stay on the current element
                    state                  = QK_VAL_1;
                    curnode->addchild( tagname, tagname_len );
                    tagname_len            = 0;
                    pos = qk_skip( xmlin, pos, 1 ); // skip the char after '/' (assumed '>')
                }
                else {
                    tagname_len++;
                }
                break;

            case QK_NAME_GAP: // space after tag name was found
                if( let == ' ' || let == 0x0d || let == 0x0a ) break;
                if( let == '>' ) {
                    state = QK_VAL_1;
                    break;
                }
                if( let == '/' ) {
                    state = QK_VAL_1;
                    curnode = curnode->parent;
                    pos = qk_skip( xmlin, pos, 1 ); // am assuming next char is >
                    break;
                }
                if( let == '=' ) {
                    state = QK_ERROR; // error state
                    break;
                }

                state   = QK_ATT_NAME;
                attname = &xmlin[ pos ];
                attname_len = 1; // this really is needed... don't remove it

                break;

            case QK_ATT_SPACE:
                // we already grabbed the att name, now either go to name gap or att eq
                if( let == ' ' || let == 0x0d || let == 0x0a ) break;
                if( let == '=' ) {
                    state = QK_ATT_EQ1;
                    break;
                }
                // anything else ends the tag or starts another attribute name
                // fall through

            case QK_ATT_NAME: // grabbing attribute name
                if( let == ' ' && xmlin[ pos + 1 ] == '=' ) {
                    break; // "name =": ignore the blank, the '=' finishes the name
                }
                if( let != '/' && let != ' ' && let != '>' && let != '=' ) {
                    if( !attname_len ) attname = &xmlin[pos];
                    attname_len++;
                    // When reached from QK_ATT_SPACE this starts a new name; switch
                    // state so its terminator is handled here and the name is stored.
                    state = QK_ATT_NAME;
                    break;
                }

                // The name is complete. Store it on the element it belongs to
                // BEFORE a '/' moves curnode up, and only if there is a name
                // (the fall-through from QK_ATT_SPACE arrives with none pending).
                if( attname_len ) {
                    att         = curnode->addatt( attname, attname_len );
                    attname_len = 0;
                }

                if( let == '/' ) {
                    state = QK_VAL_1;
                    curnode = curnode->parent;
                    pos = qk_skip( xmlin, pos, 1 ); // skip the char after '/' (assumed '>')
                }
                else if( let == ' ' ) state = QK_ATT_SPACE;
                else if( let == '>' ) state = QK_VAL_1;
                else { // '='
                    state      = QK_ATT_EQ1;
                    attval_len = 0;
                }
                break;

            case QK_ATT_EQ1: // grabbing attribute value (after eq)
                if( let == '/' && xmlin[ pos + 1 ] == '>' ) {
                    state = QK_VAL_1;
                    curnode = curnode->parent;
                    pos++;
                }
                else if( let == '"' ) state = QK_ATT_QUOT;
                else if( let == '>' ) state = QK_VAL_1;
                else if( let == ' ' ) break; //state = QK_NAME_GAP;
                else {
                    if( !attval_len ) attval = &xmlin[ pos ];
                    attval_len++;
                    state = QK_ATT_EQX;
                }
                break;

            case QK_ATT_EQX: // unquoted attribute value
                if( let == '/' && xmlin[ pos + 1 ] == '>' ) {
                    state = QK_VAL_1;
                    curnode = curnode->parent;
                    pos++;
                }
                else if( let == '>' ) state = QK_VAL_1;
                else if( let == ' ' ) state = QK_NAME_GAP;
                else {
                    if( !attval_len ) attval = &xmlin[ pos ];
                    attval_len++;
                    break;
                }

                // value finished: attach it to the attribute added last
                att->value  = attval;
                att->vallen = attval_len;
                attval_len  = 0;
                break;

            case QK_ATT_QUOT: // double-quoted attribute value
                if( let == '"' ) state = QK_NAME_GAP;
                else {
                    if( !attval_len ) attval = &xmlin[ pos ];
                    attval_len++;
                    break;
                }

                // closing quote: an empty value keeps the attribute's value pointer unset
                if( attval_len ) {
                    att->value = attval;
                }
                att->vallen = attval_len;

                attval_len  = 0;
                break;

            case QK_ENAME_X: // grabbing close tag name
                if( let == '>' ) {
                    state = QK_VAL_1;
                    // the element's name length is replaced by the length of the closing tag name
                    curnode->namelen = tagname_len;
                    curnode = curnode->parent; // NULL here ends the loop (close tag above the root)
                }
                tagname_len++;
                break;

            case QK_ERROR:
                // if we ever get here it is a bad thing
                break;
        }
        pos++;
    }
    pcurnode = root;
    pcurnode->curchild = pcurnode->firstchild;
    return root;
}

// Allocates a child of this node, fills in its name, value and type, appends it
// to the end of the child list and returns it. The strings are stored by
// pointer; nothing is copied.
nodec *nodec::addchild( char *newname, int newnamelen, char *newval, int newvallen, int newtype ) {
  nodec *child = new nodec( this );

  child->name    = newname;
  child->namelen = newnamelen;
  child->value   = newval;
  child->vallen  = newvallen;
  child->type    = newtype;

  // The first child becomes the head of the list; every later one is chained
  // behind the current tail. Either way the new node is the tail afterwards.
  if( numchildren++ == 0 ) firstchild      = child;
  else                     lastchild->next = child;
  lastchild = child;

  return child;
}

// Allocates an attribute for this node, fills in its name and value, appends it
// to the end of the attribute list and returns it. The strings are stored by
// pointer; nothing is copied.
attc *nodec::addatt( char *newname, int newnamelen, char *newval, int newvallen ) {
  attc *attr = new attc( this );

  attr->name    = newname;
  attr->namelen = newnamelen;
  attr->value   = newval;
  attr->vallen  = newvallen;

  // The first attribute becomes the head of the list; every later one is
  // chained behind the current tail. Either way the new one is the tail afterwards.
  if( numatt++ == 0 ) firstatt      = attr;
  else                lastatt->next = attr;
  lastatt = attr;

  return attr;
}