libStreamedXML/src/libStreamedXML/Parser.cpp

602 lines
17 KiB
C++

/* libStreamedXML/src/lib/Core/Parser.cpp
*
* (c)2005, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
//#define DEBUG_STATE_MACHINE
#ifdef DEBUG_STATE_MACHINE
#include <iostream>
#endif
namespace lsx {
// Opening delimiter of a CDATA section. feedChar() has already consumed
// "<![" by the time it starts comparing against this string, so matching
// resumes at index 3 and ends when the terminating NUL is reached.
const wchar_t* Parser::xmlCdataMarker = L"<![CDATA[";
// Opening delimiter of a stream restart marker (11 characters). feedChar()
// tracks how many of these characters have been matched in restartCount and
// calls streamRestart() once all 11 have been seen.
const wchar_t* Parser::xmlRestartMarker = L"<![RESTART[";
/* Parser::Parser()
 *
 *  Stores the callback object that receives parse events and the flag that
 *  controls whether `&' introduces an entity reference, then puts the state
 *  machine into its initial state via reset().
 */
Parser::Parser(Callback* callback, bool expandEntities)
    : callback(callback),
      expandEntities(expandEntities)
{
    reset();
}
/* Parser::reset()
 *
 *  Returns the parser to its initial condition: no open elements, empty
 *  working buffers, state machine idle, restart-marker match progress and
 *  line/column counters back at zero.
 */
void Parser::reset()
{
    // state machine and bookkeeping scalars
    state = StateNone;
    elementDepth = 0;
    restartCount = 0;
    skipNextNewline = false;

    // position used when reporting errors
    line = 0;
    col = 0;

    // text and element bookkeeping accumulated so far
    buffer.clear();
    elemName.clear();
    elemAttrs.clear();
    elemStack.clear();
}
/* Parser::streamRestart()
 *
 *  Called once a complete restart marker opening has been seen. Discards
 *  all parse state and then switches to StateRestartMarker, in which
 *  feedChar() collects the marker's body up to the closing "]]>".
 */
void Parser::streamRestart()
{
    reset();

    // reset() leaves us in StateNone; the marker body follows immediately.
    state = StateRestartMarker;
}
/* Parser::feedChar()
 *
 *  Feeds a single character into the parser. The character is first checked
 *  against the restart marker, then classified into a TokenClass (with line
 *  endings normalised to '\n'), and finally dispatched on the current state.
 *  Completed constructs are reported through the callback object.
 *
 *  ch: the next character of the stream.
 *
 *  Throws NotWellFormed (after moving to StateError) when the input violates
 *  the grammar accepted here; once in StateError, characters are ignored
 *  until a restart marker is recognised. Expanding a named entity goes
 *  through entityRef(), which throws UnknownEntity for names it cannot
 *  resolve.
 *
 *  Note: the classification switch relies on the GNU case-range extension
 *  (`case a ... b:').
 */
void Parser::feedChar(wchar_t ch)
{
#ifdef DEBUG_STATE_MACHINE
    std::wcout << L"Character: ``" << ch << L"'' (Unicode 0x"
        << std::hex << ch << std::dec
        << L"), state " << state << L".\n";
#endif

#define ERROR(_reason) do { \
    state = StateError; \
    throw NotWellFormed(_reason, line, col); \
}while(0)

    // Restart marker detection runs regardless of state, so that a marker
    // is recognised even after an error. 11 is the length of
    // xmlRestartMarker.
    if(ch == xmlRestartMarker[restartCount]) {
        if(++restartCount == 11) {
            streamRestart();
            return;
        }
    } else {
        // A mismatching '<' may itself be the first character of a marker
        // (e.g. "<<![RESTART["), so it must count as one matched character
        // rather than discarding the match entirely. '<' occurs only at
        // index 0 of the marker, so no other fallback position is possible.
        restartCount = (ch == xmlRestartMarker[0]) ? 1 : 0;
    }

    // Classify the character, normalising line endings to '\n'.
    TokenClass c;
    switch(ch) {
    case L'&':
        c = ClassEntity;
        break;

    case L'<':
        c = ClassOpenTag;
        break;

    case L'\r':
        // CR counts as a newline; a directly following LF/NEL is swallowed.
        skipNextNewline = true;
        ch = L'\n';
        c = ClassWhitespace;
        ++line;
        col = 0;
        goto doBuffer; // skip the skipNextNewline reset below

    case 0x2028:
        ch = L'\n';
        c = ClassWhitespace;
        ++line;
        col = 0;
        break;

    case L' ':
    case L'\t':
        c = ClassWhitespace;
        break;

    case 0x85:
    case L'\n':
        if(skipNextNewline) {
            // Second half of a CR LF / CR NEL pair: already reported as a
            // newline. Clear the flag so that only this one character is
            // swallowed and later newlines are not lost.
            skipNextNewline = false;
            return;
        }
        ch = L'\n';
        c = ClassWhitespace;
        ++line;
        col = 0;
        break;

    case L':':
    case L'_':
    case L'a' ... L'z':
    case L'A' ... L'Z':
    case 0xC0 ... 0xD6:
    case 0xD8 ... 0xF6:
    case 0xF8 ... 0x2FF:
    case 0x370 ... 0x37D:
    case 0x37F ... 0x1FFF:
    case 0x200C ... 0x200D:
    case 0x2070 ... 0x218F:
    case 0x2C00 ... 0x2FEF:
    case 0x3001 ... 0xD7FF:
    case 0xF900 ... 0xFDCF:
    case 0xFDF0 ... 0xFFFD:
    case 0x10000 ... 0xEFFFF:
        c = ClassNameStartChar;
        break;

    case L'-':
    case L'.':
    case L'0' ... L'9':
    case 0xB7:
    case 0x300 ... 0x36F:
    case 0x203F ... 0x2040:
        c = ClassNameChar;
        break;

    default:
        if((ch >= 0x00 && ch <= 0x08) ||
            (ch >= 0x0B && ch <= 0x1F) ||
            (ch >= 0x7F && ch <= 0x9F)
        ) {
            ERROR(L"Restricted character encountered.");
        }
        c = ClassOther;
    }
    skipNextNewline = false;

doBuffer:
    // deal with char appropriately, according to state
    switch(state) {
    case StateError:
        // ignore everything until a restart marker resets the parser
        return;

    case StateNone:
        switch(c) {
        case ClassWhitespace:
            buffer += ch;
            break;

        case ClassOpenTag:
            if(!buffer.empty()) callback->whiteSpace(buffer);
            state = StateOpen;
            buffer.clear();
            break;

        case ClassEntity:
            if(expandEntities) {
                if(!elementDepth) ERROR(L"Entities cannot appear at stream level.");
                if(!buffer.empty()) callback->whiteSpace(buffer);
                buffer.clear();
                parsingAttr = false;
                state = StateEntity;
                break;
            }
            // fall through

        default:
            if(!elementDepth) ERROR(L"Content cannot appear at stream level.");
            if(!buffer.empty()) callback->whiteSpace(buffer);
            state = StateData;
            buffer = ch;
            break;
        }
        break;

    case StateData:
        switch(c) {
        case ClassOpenTag:
            callback->content(buffer);
            buffer.clear();
            state = StateOpen;
            break;

        case ClassEntity:
            // Only start entity parsing when expansion is enabled, as
            // StateNone and attribute values do; otherwise '&' is data.
            if(expandEntities) {
                callback->content(buffer);
                buffer.clear();
                parsingAttr = false;
                state = StateEntity;
                break;
            }
            // fall through

        default:
            buffer += ch;
            break;
        }
        break;

    case StateCDATA:
        if(ch == L']') state = StateCDATA1;
        else buffer += ch;
        break;

    case StateCDATA1:
        if(ch == L']') state = StateCDATA2;
        else {
            buffer += L']';
            buffer += ch;
            state = StateCDATA;
        }
        break;

    case StateCDATA2:
        if(ch == L'>') {
            callback->cdata(buffer);
            buffer.clear();
            state = StateNone;
        } else if(ch == L']') {
            // "]]]": the oldest ']' is data, the last two may still end it
            buffer += ch;
        } else {
            buffer += L"]]";
            buffer += ch;
            state = StateCDATA;
        }
        break;

    case StateRestartMarker:
        if(ch == L']') state = StateRestartMarker1;
        else buffer += ch;
        break;

    case StateRestartMarker1:
        if(ch == L']') state = StateRestartMarker2;
        else {
            buffer += L']';
            buffer += ch;
            state = StateRestartMarker;
        }
        break;

    case StateRestartMarker2:
        if(ch == L'>') {
            callback->streamRestart(buffer);
            buffer.clear();
            reset();
        } else if(ch == L']') {
            buffer += L']';
            break;
        } else {
            buffer += L"]]";
            buffer += ch;
            state = StateRestartMarker;
        }
        break;

    case StateOpen:
        switch(c) {
        case ClassNameStartChar:
            state = StateElemName;
            elemAttrs.clear();
            buffer = ch;
            break;

        default:
            if(ch == L'!') state = StateOpenBang;
            else if(ch == L'?') {
                state = StatePI;
                buffer2.clear();
            } else if(ch == L'/') {
                if(!elementDepth) ERROR(L"Encountered a close tag at stream level.");
                state = StateClose;
            } else ERROR(L"Invalid start character for element.");
            break;
        }
        break;

    case StatePI:
        // buffer2 collects the PI target, buffer the PI data
        if(ch == L'?') state = StatePI2;
        else if(c == ClassWhitespace) {
            state = StatePIData;
            buffer.clear();
        } else buffer2 += ch;
        break;

    case StatePI2:
        if(ch != L'>') ERROR(L"Invalid target for PI");
        else {
            callback->PI(buffer2, L"");
            buffer.clear();
            state = StateNone;
        }
        break;

    case StatePIData:
        if(ch == L'?') state = StatePI3;
        else buffer += ch;
        break;

    case StatePI3:
        if(ch == L'>') {
            callback->PI(buffer2, buffer);
            buffer.clear();
            state = StateNone;
        } else if(ch == L'?') {
            // "??": the earlier '?' was data; this one may still begin "?>"
            buffer += L'?';
        } else {
            // The '?' was ordinary data: emit it and go back to collecting
            // PI data, otherwise every later character would be prefixed
            // with a spurious '?'.
            buffer += L'?';
            buffer += ch;
            state = StatePIData;
        }
        break;

    case StateOpenBang:
        if(ch == L'[') {
            // restart markers handled by lower layer
            state = StateOpenCdataMarker;
            xmlCount = 3;
            if(!elementDepth) ERROR(L"CDATA sections not valid at stream level.");
        } else if(ch == L'-') state = StateOpenComment;
        else ERROR(L"Invalid special tag.");
        break;

    case StateOpenCdataMarker:
        // NOTE(review): the characters of "<![RESTART[" also pass through
        // this state machine, so the 'R' of a restart marker raises
        // "Invalid marked section." here (and at stream level the '['
        // already raised in StateOpenBang) before the marker is complete.
        // The restart itself still happens, because marker detection runs
        // ahead of the StateError early return. Confirm whether callers are
        // expected to swallow that exception.
        if(ch != xmlCdataMarker[xmlCount]) ERROR(L"Invalid marked section.");
        if(!xmlCdataMarker[++xmlCount]) state = StateCDATA;
        break;

    case StateOpenComment:
        if(ch != L'-') ERROR(L"Invalid special tag.");
        state = StateComment;
        buffer.clear();
        break;

    case StateComment:
        if(ch == L'-') state = StateComment2;
        else buffer += ch;
        break;

    case StateComment2:
        if(ch == L'-') state = StateComment3;
        else {
            // A single '-' is ordinary comment text: emit it and return to
            // StateComment, otherwise every later character would be
            // prefixed with a spurious '-'.
            buffer += L'-';
            buffer += ch;
            state = StateComment;
        }
        break;

    case StateComment3:
        if(ch != L'>') ERROR(L"`--' not valid in comments");
        callback->comment(buffer);
        buffer.clear();
        state = StateNone;
        break;

    case StateElemName:
        switch(c) {
        case ClassWhitespace:
            state = StateElemTag;
            elemName = buffer;
            buffer.clear();
            break;

        case ClassNameStartChar:
        case ClassNameChar:
            buffer += ch;
            break;

        default:
            switch(ch) {
            case L'>':
                elemStack.push_back(buffer);
                callback->element(buffer, elemAttrs);
                state = StateNone;
                ++elementDepth;
                buffer.clear();
                break;

            case L'/':
                // StateNeedClose reports elemName, so record the name now;
                // without this "<name/>" would report a stale name.
                elemName = buffer;
                state = StateNeedClose;
                break;

            default:
                ERROR(L"Invalid character in tag name.");
            }
        }
        break;

    case StateElemTag:
        switch(c) {
        case ClassWhitespace:
            break;

        case ClassNameStartChar:
            state = StateElemAttrName;
            buffer = ch;
            break;

        default:
            switch(ch) {
            case L'>':
                elemStack.push_back(elemName);
                callback->element(elemName, elemAttrs);
                buffer.clear();
                state = StateNone;
                ++elementDepth;
                break;

            case L'/':
                state = StateNeedClose;
                break;

            default:
                ERROR(L"Invalid character in tag.");
            }
        }
        break;

    case StateElemAttrName:
        switch(c) {
        case ClassNameStartChar:
        case ClassNameChar:
            buffer += ch;
            break;

        default:
            if(ch != L'=') ERROR(L"Invalid character in attribute name.");
            state = StateElemAttrEq;
            buffer2 = buffer; // attribute name is kept in buffer2
            buffer.clear();
            break;
        }
        break;

    case StateElemAttrEq:
        if(ch == L'\'') singleQuote = true;
        else if(ch == L'"') singleQuote = false;
        else ERROR(L"Invalid character in attribute.");
        state = StateElemAttrVal;
        break;

    case StateElemAttrVal:
        if((singleQuote && ch == L'\'') || (!singleQuote && ch == L'"')) {
            elemAttrs[buffer2] = buffer;
            buffer.clear();
            state = StateElemAttrDone;
        } else if(expandEntities && ch == L'&') {
            // park the value collected so far; the entity states reuse buffer
            buffer3 = buffer;
            buffer.clear();
            parsingAttr = true;
            state = StateEntity;
        } else buffer += ch;
        break;

    case StateElemAttrDone:
        switch(c) {
        case ClassWhitespace:
            state = StateElemTag;
            break;

        default:
            if(ch == L'/') {
                state = StateNeedClose;
            } else if(ch == L'>') {
                callback->element(elemName, elemAttrs);
                elemStack.push_back(elemName);
                buffer.clear();
                state = StateNone;
                ++elementDepth;
            } else ERROR(L"Invalid character after attribute.");
            break;
        }
        break;

    case StateNeedClose:
        // empty-element tag: report open and close, depth is unchanged
        if(ch != L'>') ERROR(L"Stray `/' in open tag.");
        callback->element(elemName, elemAttrs);
        callback->closeTag(elemName);
        buffer.clear();
        state = StateNone;
        break;

    case StateClose:
        if(c != ClassNameStartChar) ERROR(L"Invalid character in close tag name.");
        buffer = ch;
        state = StateClosing;
        break;

    case StateClosing:
        switch(c) {
        case ClassNameStartChar:
        case ClassNameChar:
            buffer += ch;
            break;

        case ClassWhitespace:
            state = StateNeedClose2;
            break;

        default:
            if(ch != L'>') ERROR(L"Invalid character in close tag name.");
            if(elemStack.back() != buffer) ERROR(L"Mismatched close tag.");
            elemStack.pop_back();
            callback->closeTag(buffer);
            buffer.clear();
            state = StateNone;
            --elementDepth;
        }
        break;

    case StateNeedClose2:
        if(c == ClassWhitespace) break;
        if(ch != L'>') ERROR(L"Invalid data in close tag.");
        // The close tag's name was collected in buffer by StateClosing;
        // elemName belongs to whichever open tag last set it and must not
        // be used for the comparison or the callback.
        if(elemStack.back() != buffer) ERROR(L"Mismatched close tag.");
        elemStack.pop_back();
        callback->closeTag(buffer);
        buffer.clear();
        state = StateNone;
        --elementDepth;
        break;

    case StateEntity:
        if(ch == L'#') {
            state = StateCharEntity;
            entityChar = 0;
        } else if(c == ClassNameStartChar) {
            buffer = ch;
            state = StateEntityName;
        } else ERROR(L"Invalid entity name.");
        break;

    case StateCharEntity:
        // decimal character reference; accumulated in entityChar
        if(ch == L';') {
            if(parsingAttr) {
                buffer = buffer3 + entityChar;
                state = StateElemAttrVal;
                break;
            }
            buffer = entityChar;
            state = StateData;
            break;
        } else if(ch >= L'0' && ch <= L'9') {
            // refuse digits that would take the value past 2147483647
            if(entityChar > 214748364 || (entityChar == 214748364 && ch >= L'8'))
                ERROR(L"Character code too large in character entity.");
            entityChar *= 10;
            entityChar += (ch - L'0');
        } else ERROR(L"Invalid character in character entity.");
        break;

    case StateEntityName:
        if(ch == L';') {
            if(parsingAttr) {
                buffer = buffer3 + entityRef(buffer);
                state = StateElemAttrVal;
                break;
            }
            buffer = entityRef(buffer);
            state = StateData;
            break;
        }
        if(c != ClassNameChar && c != ClassNameStartChar) ERROR(L"Invalid entity name.");
        buffer += ch;
        break;
    }

#undef ERROR
    ++col;
}
/* Parser::entityRef()
 *
 *  Resolves a named entity reference to its replacement text. The five
 *  built-in names (quot, amp, apos, lt, gt) are handled here; any other
 *  name is passed to the callback's entityRef().
 *
 *  ent: the entity name, without the leading `&' and trailing `;'.
 *
 *  Returns the replacement text. Throws UnknownEntity, carrying the current
 *  line and column, if the name is not built in and the callback does not
 *  resolve it.
 */
std::wstring Parser::entityRef(const std::wstring& ent)
{
    static const struct {
        const wchar_t* name;
        const wchar_t* text;
    } builtin[] = {
        { L"quot", L"\"" },
        { L"amp",  L"&"  },
        { L"apos", L"'"  },
        { L"lt",   L"<"  },
        { L"gt",   L">"  }
    };

    for(unsigned int i = 0; i < sizeof(builtin) / sizeof(builtin[0]); ++i) {
        if(ent == builtin[i].name) return builtin[i].text;
    }

    // not built in: let the application supply the expansion
    std::wstring expansion;
    if(!callback->entityRef(ent, expansion)) throw UnknownEntity(ent, line, col);
    return expansion;
}
}
/* options for text editors
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
vim: expandtab:ts=4:sw=4
*/