602 lines
17 KiB
C++
602 lines
17 KiB
C++
/* libStreamedXML/src/lib/Core/Parser.cpp
 *
 *  (c)2005, Laurence Withers. Released under the GNU GPL. See file
 *  COPYING for more information / terms of license.
*/
|
|
|
|
//#define DEBUG_STATE_MACHINE
|
|
|
|
#ifdef DEBUG_STATE_MACHINE
|
|
#include <iostream>
|
|
#endif
|
|
|
|
namespace lsx {
|
|
|
|
|
|
|
|
// Literal text that opens a CDATA section; feedChar() matches input against
// it one character at a time once `<![' has been seen.
const wchar_t* Parser::xmlCdataMarker = L"<![CDATA[";

// Literal text that opens a stream restart marker; feedChar() tracks how much
// of it has been matched in restartCount.
const wchar_t* Parser::xmlRestartMarker = L"<![RESTART[";
|
|
|
|
|
|
|
|
/* Parser::Parser()
 *
 *  Stores the callback pointer and the entity expansion flag, then puts the
 *  state machine into its initial state by calling reset().
 */
Parser::Parser(Callback* callback, bool expandEntities)
    : callback(callback),
      expandEntities(expandEntities)
{
    reset();
}
|
|
|
|
|
|
|
|
void Parser::reset()
|
|
{
|
|
elemStack.clear();
|
|
elemName.clear();
|
|
elemAttrs.clear();
|
|
buffer.clear();
|
|
elementDepth = 0;
|
|
skipNextNewline = false;
|
|
state = StateNone;
|
|
restartCount = 0;
|
|
line = 0;
|
|
col = 0;
|
|
}
|
|
|
|
|
|
|
|
void Parser::streamRestart()
|
|
{
|
|
reset();
|
|
state = StateRestartMarker;
|
|
}
|
|
|
|
|
|
|
|
void Parser::feedChar(wchar_t ch)
|
|
{
|
|
#ifdef DEBUG_STATE_MACHINE
|
|
std::wcout << L"Character: ``" << ch << L"'' (Unicode 0x"
|
|
<< std::hex << ch << std::dec
|
|
<< L"), state " << state << L".\n";
|
|
#endif
|
|
|
|
#define ERROR(_reason) do { \
|
|
state = StateError; \
|
|
throw NotWellFormed(_reason, line, col); \
|
|
}while(0)
|
|
if(ch == xmlRestartMarker[restartCount]) {
|
|
if(++restartCount == 11) {
|
|
streamRestart();
|
|
return;
|
|
}
|
|
} else {
|
|
restartCount = 0;
|
|
}
|
|
|
|
TokenClass c;
|
|
switch(ch) {
|
|
case L'&':
|
|
c = ClassEntity;
|
|
break;
|
|
|
|
case L'<':
|
|
c = ClassOpenTag;
|
|
break;
|
|
|
|
case L'\r':
|
|
skipNextNewline = true;
|
|
ch = L'\n';
|
|
c = ClassWhitespace;
|
|
++line;
|
|
col = 0;
|
|
goto doBuffer;
|
|
|
|
case 0x2028:
|
|
ch = L'\n';
|
|
c = ClassWhitespace;
|
|
++line;
|
|
col = 0;
|
|
break;
|
|
|
|
case L' ':
|
|
case L'\t':
|
|
c = ClassWhitespace;
|
|
break;
|
|
|
|
case 0x85:
|
|
case L'\n':
|
|
if(skipNextNewline) return;
|
|
ch = L'\n';
|
|
c = ClassWhitespace;
|
|
++line;
|
|
col = 0;
|
|
break;
|
|
|
|
case L':':
|
|
case L'_':
|
|
case L'a' ... L'z':
|
|
case L'A' ... L'Z':
|
|
case 0xC0 ... 0xD6:
|
|
case 0xD8 ... 0xF6:
|
|
case 0xF8 ... 0x2FF:
|
|
case 0x370 ... 0x37D:
|
|
case 0x37F ... 0x1FFF:
|
|
case 0x200C ... 0x200D:
|
|
case 0x2070 ... 0x218F:
|
|
case 0x2C00 ... 0x2FEF:
|
|
case 0x3001 ... 0xD7FF:
|
|
case 0xF900 ... 0xFDCF:
|
|
case 0xFDF0 ... 0xFFFD:
|
|
case 0x10000 ... 0xEFFFF:
|
|
c = ClassNameStartChar;
|
|
break;
|
|
|
|
case L'-':
|
|
case L'.':
|
|
case L'0' ... L'9':
|
|
case 0xB7:
|
|
case 0x300 ... 0x36F:
|
|
case 0x203F ... 0x2040:
|
|
c = ClassNameChar;
|
|
break;
|
|
|
|
default:
|
|
if((ch >= 0x00 && ch <= 0x08) ||
|
|
(ch >= 0x0B && ch <= 0x1F) ||
|
|
(ch >= 0x7F && ch <= 0x9F)
|
|
) {
|
|
ERROR(L"Restricted character encountered.");
|
|
}
|
|
c = ClassOther;
|
|
}
|
|
|
|
skipNextNewline = false;
|
|
doBuffer:
|
|
// deal with char appropriately, according to state
|
|
switch(state) {
|
|
case StateError:
|
|
return;
|
|
|
|
case StateNone:
|
|
switch(c) {
|
|
case ClassWhitespace:
|
|
buffer += ch;
|
|
break;
|
|
|
|
case ClassOpenTag:
|
|
if(!buffer.empty()) callback->whiteSpace(buffer);
|
|
state = StateOpen;
|
|
buffer.clear();
|
|
break;
|
|
|
|
case ClassEntity:
|
|
if(expandEntities) {
|
|
if(!elementDepth) ERROR(L"Entities cannot appear at stream level.");
|
|
if(!buffer.empty()) callback->whiteSpace(buffer);
|
|
buffer.clear();
|
|
parsingAttr = false;
|
|
state = StateEntity;
|
|
break;
|
|
}
|
|
|
|
// fall through
|
|
default:
|
|
if(!elementDepth) ERROR(L"Content cannot appear at stream level.");
|
|
if(!buffer.empty()) callback->whiteSpace(buffer);
|
|
state = StateData;
|
|
buffer = ch;
|
|
break;
|
|
}
|
|
break;
|
|
|
|
case StateData:
|
|
switch(c) {
|
|
case ClassOpenTag:
|
|
callback->content(buffer);
|
|
buffer.clear();
|
|
state = StateOpen;
|
|
break;
|
|
|
|
case ClassEntity:
|
|
callback->content(buffer);
|
|
buffer.clear();
|
|
parsingAttr = false;
|
|
state = StateEntity;
|
|
break;
|
|
|
|
default:
|
|
buffer += ch;
|
|
break;
|
|
}
|
|
break;
|
|
|
|
case StateCDATA:
|
|
if(ch == L']') state = StateCDATA1;
|
|
else buffer += ch;
|
|
break;
|
|
|
|
case StateCDATA1:
|
|
if(ch == L']') state = StateCDATA2;
|
|
else {
|
|
buffer += L']';
|
|
buffer += ch;
|
|
state = StateCDATA;
|
|
}
|
|
break;
|
|
|
|
case StateCDATA2:
|
|
if(ch == L'>') {
|
|
callback->cdata(buffer);
|
|
buffer.clear();
|
|
state = StateNone;
|
|
} else if(ch == L']') {
|
|
buffer += ch;
|
|
} else {
|
|
buffer += L"]]";
|
|
buffer += ch;
|
|
state = StateCDATA;
|
|
}
|
|
break;
|
|
|
|
case StateRestartMarker:
|
|
if(ch == L']') state = StateRestartMarker1;
|
|
else buffer += ch;
|
|
break;
|
|
|
|
case StateRestartMarker1:
|
|
if(ch == L']') state = StateRestartMarker2;
|
|
else {
|
|
buffer += L']';
|
|
buffer += ch;
|
|
state = StateRestartMarker;
|
|
}
|
|
break;
|
|
|
|
case StateRestartMarker2:
|
|
if(ch == L'>') {
|
|
callback->streamRestart(buffer);
|
|
buffer.clear();
|
|
reset();
|
|
} else if(ch == L']') {
|
|
buffer += L']';
|
|
break;
|
|
} else {
|
|
buffer += L"]]";
|
|
buffer += ch;
|
|
state = StateRestartMarker;
|
|
}
|
|
break;
|
|
|
|
case StateOpen:
|
|
switch(c) {
|
|
case ClassNameStartChar:
|
|
state = StateElemName;
|
|
elemAttrs.clear();
|
|
buffer = ch;
|
|
break;
|
|
|
|
default:
|
|
if(ch == L'!') state = StateOpenBang;
|
|
else if(ch == L'?') {
|
|
state = StatePI;
|
|
buffer2.clear();
|
|
} else if(ch == L'/') {
|
|
if(!elementDepth) ERROR(L"Encountered a close tag at stream level.");
|
|
state = StateClose;
|
|
} else ERROR(L"Invalid start character for element.");
|
|
break;
|
|
}
|
|
break;
|
|
|
|
case StatePI:
|
|
if(ch == L'?') state = StatePI2;
|
|
else if(c == ClassWhitespace) {
|
|
state = StatePIData;
|
|
buffer.clear();
|
|
} else buffer2 += ch;
|
|
break;
|
|
|
|
case StatePI2:
|
|
if(ch != L'>') ERROR(L"Invalid target for PI");
|
|
else {
|
|
callback->PI(buffer2, L"");
|
|
buffer.clear();
|
|
state = StateNone;
|
|
}
|
|
break;
|
|
|
|
case StatePIData:
|
|
if(ch == L'?') state = StatePI3;
|
|
else buffer += ch;
|
|
break;
|
|
|
|
case StatePI3:
|
|
if(ch == L'>') {
|
|
callback->PI(buffer2, buffer);
|
|
buffer.clear();
|
|
state = StateNone;
|
|
} else {
|
|
buffer += L'?';
|
|
buffer += ch;
|
|
}
|
|
break;
|
|
|
|
case StateOpenBang:
|
|
if(ch == L'[') {
|
|
// restart markers handled by lower layer
|
|
state = StateOpenCdataMarker;
|
|
xmlCount = 3;
|
|
if(!elementDepth) ERROR(L"CDATA sections not valid at stream level.");
|
|
} else if(ch == L'-') state = StateOpenComment;
|
|
else ERROR(L"Invalid special tag.");
|
|
break;
|
|
|
|
case StateOpenCdataMarker:
|
|
if(ch != xmlCdataMarker[xmlCount]) ERROR(L"Invalid marked section.");
|
|
if(!xmlCdataMarker[++xmlCount]) state = StateCDATA;
|
|
break;
|
|
|
|
case StateOpenComment:
|
|
if(ch != L'-') ERROR(L"Invalid special tag.");
|
|
state = StateComment;
|
|
buffer.clear();
|
|
break;
|
|
|
|
case StateComment:
|
|
if(ch == L'-') state = StateComment2;
|
|
else buffer += ch;
|
|
break;
|
|
|
|
case StateComment2:
|
|
if(ch == L'-') state = StateComment3;
|
|
else {
|
|
buffer += L'-';
|
|
buffer += ch;
|
|
}
|
|
break;
|
|
|
|
case StateComment3:
|
|
if(ch != L'>') ERROR(L"`--' not valid in comments");
|
|
callback->comment(buffer);
|
|
buffer.clear();
|
|
state = StateNone;
|
|
break;
|
|
|
|
case StateElemName:
|
|
switch(c) {
|
|
case ClassWhitespace:
|
|
state = StateElemTag;
|
|
elemName = buffer;
|
|
buffer.clear();
|
|
break;
|
|
|
|
case ClassNameStartChar:
|
|
case ClassNameChar:
|
|
buffer += ch;
|
|
break;
|
|
|
|
default:
|
|
switch(ch) {
|
|
case L'>':
|
|
elemStack.push_back(buffer);
|
|
callback->element(buffer, elemAttrs);
|
|
state = StateNone;
|
|
++elementDepth;
|
|
buffer.clear();
|
|
break;
|
|
|
|
case L'/':
|
|
state = StateNeedClose;
|
|
break;
|
|
|
|
default:
|
|
ERROR(L"Invalid character in tag name.");
|
|
}
|
|
}
|
|
break;
|
|
|
|
case StateElemTag:
|
|
switch(c) {
|
|
case ClassWhitespace:
|
|
break;
|
|
|
|
case ClassNameStartChar:
|
|
state = StateElemAttrName;
|
|
buffer = ch;
|
|
break;
|
|
|
|
default:
|
|
switch(ch) {
|
|
case L'>':
|
|
elemStack.push_back(elemName);
|
|
callback->element(elemName, elemAttrs);
|
|
buffer.clear();
|
|
state = StateNone;
|
|
++elementDepth;
|
|
break;
|
|
|
|
case L'/':
|
|
state = StateNeedClose;
|
|
break;
|
|
|
|
default:
|
|
ERROR(L"Invalid character in tag.");
|
|
}
|
|
}
|
|
break;
|
|
|
|
case StateElemAttrName:
|
|
switch(c) {
|
|
case ClassNameStartChar:
|
|
case ClassNameChar:
|
|
buffer += ch;
|
|
break;
|
|
|
|
default:
|
|
if(ch != L'=') ERROR(L"Invalid character in attribute name.");
|
|
state = StateElemAttrEq;
|
|
buffer2 = buffer;
|
|
buffer.clear();
|
|
break;
|
|
}
|
|
break;
|
|
|
|
case StateElemAttrEq:
|
|
if(ch == L'\'') singleQuote = true;
|
|
else if(ch == L'"') singleQuote = false;
|
|
else ERROR(L"Invalid character in attribute.");
|
|
state = StateElemAttrVal;
|
|
break;
|
|
|
|
case StateElemAttrVal:
|
|
if((singleQuote && ch == L'\'') || (!singleQuote && ch == L'"')) {
|
|
elemAttrs[buffer2] = buffer;
|
|
buffer.clear();
|
|
state = StateElemAttrDone;
|
|
} else if(expandEntities && ch == L'&') {
|
|
buffer3 = buffer;
|
|
buffer.clear();
|
|
parsingAttr = true;
|
|
state = StateEntity;
|
|
} else buffer += ch;
|
|
break;
|
|
|
|
case StateElemAttrDone:
|
|
switch(c) {
|
|
case ClassWhitespace:
|
|
state = StateElemTag;
|
|
break;
|
|
|
|
default:
|
|
if(ch == L'/') {
|
|
state = StateNeedClose;
|
|
} else if(ch == L'>') {
|
|
callback->element(elemName, elemAttrs);
|
|
elemStack.push_back(elemName);
|
|
buffer.clear();
|
|
state = StateNone;
|
|
++elementDepth;
|
|
} else ERROR(L"Invalid character after attribute.");
|
|
break;
|
|
}
|
|
break;
|
|
|
|
case StateNeedClose:
|
|
if(ch != L'>') ERROR(L"Stray `/' in open tag.");
|
|
callback->element(elemName, elemAttrs);
|
|
callback->closeTag(elemName);
|
|
buffer.clear();
|
|
state = StateNone;
|
|
break;
|
|
|
|
case StateClose:
|
|
if(c != ClassNameStartChar) ERROR(L"Invalid character in close tag name.");
|
|
buffer = ch;
|
|
state = StateClosing;
|
|
break;
|
|
|
|
case StateClosing:
|
|
switch(c) {
|
|
case ClassNameStartChar:
|
|
case ClassNameChar:
|
|
buffer += ch;
|
|
break;
|
|
|
|
case ClassWhitespace:
|
|
state = StateNeedClose2;
|
|
break;
|
|
|
|
default:
|
|
if(ch != L'>') ERROR(L"Invalid character in close tag name.");
|
|
if(elemStack.back() != buffer) ERROR(L"Mismatched close tag.");
|
|
elemStack.pop_back();
|
|
callback->closeTag(buffer);
|
|
buffer.clear();
|
|
state = StateNone;
|
|
--elementDepth;
|
|
}
|
|
break;
|
|
|
|
case StateNeedClose2:
|
|
if(c == ClassWhitespace) break;
|
|
if(ch != L'>') ERROR(L"Invalid data in close tag.");
|
|
if(elemStack.back() != elemName) ERROR(L"Mismatched close tag.");
|
|
elemStack.pop_back();
|
|
callback->closeTag(elemName);
|
|
buffer.clear();
|
|
state = StateNone;
|
|
--elementDepth;
|
|
break;
|
|
|
|
case StateEntity:
|
|
if(ch == L'#') {
|
|
state = StateCharEntity;
|
|
entityChar = 0;
|
|
} else if(c == ClassNameStartChar) {
|
|
buffer = ch;
|
|
state = StateEntityName;
|
|
} else ERROR(L"Invalid entity name.");
|
|
break;
|
|
|
|
case StateCharEntity:
|
|
if(ch == ';') {
|
|
if(parsingAttr) {
|
|
buffer = buffer3 + entityChar;
|
|
state = StateElemAttrVal;
|
|
break;
|
|
}
|
|
|
|
buffer = entityChar;
|
|
state = StateData;
|
|
break;
|
|
|
|
} else if(ch >= L'0' && ch <= L'9') {
|
|
if(entityChar > 214748364 || (entityChar == 214748364 && ch >= L'8'))
|
|
ERROR(L"Character code too large in character entity.");
|
|
entityChar *= 10;
|
|
entityChar += (ch - L'0');
|
|
} else ERROR(L"Invalid character in character entity.");
|
|
break;
|
|
|
|
case StateEntityName:
|
|
if(ch == L';') {
|
|
if(parsingAttr) {
|
|
buffer = buffer3 + entityRef(buffer);
|
|
state = StateElemAttrVal;
|
|
break;
|
|
}
|
|
|
|
buffer = entityRef(buffer);
|
|
state = StateData;
|
|
break;
|
|
}
|
|
if(c != ClassNameChar && c != ClassNameStartChar) ERROR(L"Invalid entity name.");
|
|
buffer += ch;
|
|
break;
|
|
}
|
|
#undef ERROR
|
|
++col;
|
|
}
|
|
|
|
|
|
|
|
std::wstring Parser::entityRef(const std::wstring& ent)
|
|
{
|
|
if(ent == L"quot") return L"\"";
|
|
if(ent == L"amp") return L"&";
|
|
if(ent == L"apos") return L"'";
|
|
if(ent == L"lt") return L"<";
|
|
if(ent == L"gt") return L">";
|
|
|
|
std::wstring result;
|
|
if(callback->entityRef(ent, result)) return result;
|
|
throw UnknownEntity(ent, line, col);
|
|
}
|
|
|
|
|
|
|
|
}
|
|
/* options for text editors
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
vim: expandtab:ts=4:sw=4
*/
|