diff --git a/CMakeLists.txt b/CMakeLists.txt index 3e902bc..9b1856f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,8 @@ cmake_minimum_required(VERSION 2.6) project (YAML_CPP) +SET(CMAKE_CXX_FLAGS "-O2") + enable_testing() if(WIN32) @@ -36,4 +38,4 @@ install( ) add_subdirectory (yaml-reader) - +add_subdirectory (util) diff --git a/src/emitterutils.cpp b/src/emitterutils.cpp index 1779a9b..2fc6ad7 100644 --- a/src/emitterutils.cpp +++ b/src/emitterutils.cpp @@ -3,6 +3,7 @@ #include "indentation.h" #include "exceptions.h" #include +#include "stringsource.h" namespace YAML { @@ -29,8 +30,8 @@ namespace YAML || (!Exp::Printable) || Exp::Break || Exp::Tab; - Buffer buffer(&str[0], str.size()); - while(buffer.size) { + StringCharSource buffer(str.c_str(), str.size()); + while(buffer) { if(disallowed.Matches(buffer)) return false; ++buffer; diff --git a/src/regex.cpp b/src/regex.cpp index b48036e..d26cdb6 100644 --- a/src/regex.cpp +++ b/src/regex.cpp @@ -1,139 +1,39 @@ #include "crt.h" #include "regex.h" -#include "stream.h" -#include namespace YAML { - RegEx::RegEx(REGEX_OP op): m_op(op), m_pOp(0) + // constructors + RegEx::RegEx(): m_op(REGEX_EMPTY) { - SetOp(); } - - RegEx::RegEx(const RegEx& rhs): m_pOp(0) + + RegEx::RegEx(REGEX_OP op): m_op(op) { - m_op = rhs.m_op; - m_a = rhs.m_a; - m_z = rhs.m_z; - m_params = rhs.m_params; - - SetOp(); } - - RegEx::RegEx(): m_op(REGEX_EMPTY), m_pOp(0) + + RegEx::RegEx(char ch): m_op(REGEX_MATCH), m_a(ch) { - SetOp(); } - - RegEx::RegEx(char ch): m_op(REGEX_MATCH), m_pOp(0), m_a(ch) + + RegEx::RegEx(char a, char z): m_op(REGEX_RANGE), m_a(a), m_z(z) { - SetOp(); } - - RegEx::RegEx(char a, char z): m_op(REGEX_RANGE), m_pOp(0), m_a(a), m_z(z) - { - SetOp(); - } - - RegEx::RegEx(const std::string& str, REGEX_OP op): m_op(op), m_pOp(0) + + RegEx::RegEx(const std::string& str, REGEX_OP op): m_op(op) { for(unsigned i=0;i= 0; - } - - bool RegEx::Matches(const Buffer& buffer) const - { - return Match(buffer) >= 0; } - bool 
RegEx::Matches(const Stream& in) const - { - return Match(in) >= 0; - } - - // Match - // . Matches the given string against this regular expression. - // . Returns the number of characters matched. - // . Returns -1 if no characters were matched (the reason for - // not returning zero is that we may have an empty regex - // which is ALWAYS successful at matching zero characters). - // . REMEMBER that we only match from the start of the buffer! - int RegEx::Match(const Buffer& buffer) const - { - if(!m_pOp) - return !buffer ? 0 : -1; // the empty regex only is successful on the empty string - - return m_pOp->Match(buffer, *this); - } - - int RegEx::Match(const std::string& str) const - { - Buffer buffer(str.c_str(), str.size()); - return Match(buffer); - } - - // Match - int RegEx::Match(const Stream& in) const - { - return Match(in.current()); - } - + // combination constructors RegEx operator ! (const RegEx& ex) { RegEx ret(REGEX_NOT); ret.m_params.push_back(ex); return ret; } - + RegEx operator || (const RegEx& ex1, const RegEx& ex2) { RegEx ret(REGEX_OR); @@ -141,7 +41,7 @@ namespace YAML ret.m_params.push_back(ex2); return ret; } - + RegEx operator && (const RegEx& ex1, const RegEx& ex2) { RegEx ret(REGEX_AND); @@ -149,85 +49,13 @@ namespace YAML ret.m_params.push_back(ex2); return ret; } - + RegEx operator + (const RegEx& ex1, const RegEx& ex2) { RegEx ret(REGEX_SEQ); ret.m_params.push_back(ex1); ret.m_params.push_back(ex2); return ret; - } - - ////////////////////////////////////////////////////////////////////////////// - // Operators - - // MatchOperator - int RegEx::MatchOperator::Match(const Buffer& buffer, const RegEx& regex) const - { - if(!buffer || buffer[0] != regex.m_a) - return -1; - return 1; - } - - // RangeOperator - int RegEx::RangeOperator::Match(const Buffer& buffer, const RegEx& regex) const - { - if(!buffer || regex.m_a > buffer[0] || regex.m_z < buffer[0]) - return -1; - return 1; - } - - // OrOperator - int RegEx::OrOperator::Match(const 
Buffer& buffer, const RegEx& regex) const - { - for(unsigned i=0;i= 0) - return n; - } - return -1; - } - - // AndOperator - // Note: 'AND' is a little funny, since we may be required to match things - // of different lengths. If we find a match, we return the length of - // the FIRST entry on the list. - int RegEx::AndOperator::Match(const Buffer& buffer, const RegEx& regex) const - { - int first = -1; - for(unsigned i=0;i= 0) - return -1; - return 1; - } - - // SeqOperator - int RegEx::SeqOperator::Match(const Buffer& buffer, const RegEx& regex) const - { - int offset = 0; - for(unsigned i=0;i bool Matches(const Source& source) const; + + int Match(const std::string& str) const; + int Match(const Stream& in) const; private: RegEx(REGEX_OP op); - void SetOp(); + + template bool IsValidSource(const Source& source) const; + template int Match(const Source& source) const; + template int MatchUnchecked(const Source& source) const; + + template int MatchOpEmpty(const Source& source) const; + template int MatchOpMatch(const Source& source) const; + template int MatchOpRange(const Source& source) const; + template int MatchOpOr(const Source& source) const; + template int MatchOpAnd(const Source& source) const; + template int MatchOpNot(const Source& source) const; + template int MatchOpSeq(const Source& source) const; private: REGEX_OP m_op; - Operator *m_pOp; char m_a, m_z; std::vector m_params; }; } + +#include "regeximpl.h" diff --git a/src/regeximpl.h b/src/regeximpl.h new file mode 100644 index 0000000..f2daa43 --- /dev/null +++ b/src/regeximpl.h @@ -0,0 +1,172 @@ +#pragma once + +#include "stream.h" +#include "stringsource.h" +#include "streamcharsource.h" + +namespace YAML +{ + // query matches + inline bool RegEx::Matches(char ch) const { + std::string str; + str += ch; + return Matches(str); + } + + inline bool RegEx::Matches(const std::string& str) const { + return Match(str) >= 0; + } + + inline bool RegEx::Matches(const Stream& in) const { + return Match(in) 
>= 0; + } + + template + inline bool RegEx::Matches(const Source& source) const { + return Match(source) >= 0; + } + + // Match + // . Matches the given string against this regular expression. + // . Returns the number of characters matched. + // . Returns -1 if no characters were matched (the reason for + // not returning zero is that we may have an empty regex + // which is ALWAYS successful at matching zero characters). + // . REMEMBER that we only match from the start of the buffer! + inline int RegEx::Match(const std::string& str) const + { + StringCharSource source(str.c_str(), str.size()); + return Match(source); + } + + inline int RegEx::Match(const Stream& in) const + { + StreamCharSource source(in); + return Match(source); + } + + template + inline bool RegEx::IsValidSource(const Source& source) const + { + return source; + } + + template<> + inline bool RegEx::IsValidSource(const StringCharSource&source) const + { + return source || m_op == REGEX_EMPTY; + } + + template + inline int RegEx::Match(const Source& source) const + { + return IsValidSource(source) ? MatchUnchecked(source) : -1; + } + + template + inline int RegEx::MatchUnchecked(const Source& source) const + { + switch(m_op) { + case REGEX_EMPTY: + return MatchOpEmpty(source); + case REGEX_MATCH: + return MatchOpMatch(source); + case REGEX_RANGE: + return MatchOpRange(source); + case REGEX_OR: + return MatchOpOr(source); + case REGEX_AND: + return MatchOpAnd(source); + case REGEX_NOT: + return MatchOpNot(source); + case REGEX_SEQ: + return MatchOpSeq(source); + } + + return -1; + } + + ////////////////////////////////////////////////////////////////////////////// + // Operators + // Note: the convention MatchOp* is that we can assume IsSourceValid(source). + // So we do all our checks *before* we call these functions + + // EmptyOperator + template + inline int RegEx::MatchOpEmpty(const Source& source) const { + return source[0] == Stream::eof() ? 
0 : -1; + } + + template <> + inline int RegEx::MatchOpEmpty(const StringCharSource& source) const { + return !source ? 0 : -1; // the empty regex only is successful on the empty string + } + + // MatchOperator + template + inline int RegEx::MatchOpMatch(const Source& source) const { + if(source[0] != m_a) + return -1; + return 1; + } + + // RangeOperator + template + inline int RegEx::MatchOpRange(const Source& source) const { + if(m_a > source[0] || m_z < source[0]) + return -1; + return 1; + } + + // OrOperator + template + inline int RegEx::MatchOpOr(const Source& source) const { + for(unsigned i=0;i= 0) + return n; + } + return -1; + } + + // AndOperator + // Note: 'AND' is a little funny, since we may be required to match things + // of different lengths. If we find a match, we return the length of + // the FIRST entry on the list. + template + inline int RegEx::MatchOpAnd(const Source& source) const { + int first = -1; + for(unsigned i=0;i + inline int RegEx::MatchOpNot(const Source& source) const { + if(m_params.empty()) + return -1; + if(m_params[0].MatchUnchecked(source) >= 0) + return -1; + return 1; + } + + // SeqOperator + template + inline int RegEx::MatchOpSeq(const Source& source) const { + int offset = 0; + for(unsigned i=0;i +#include "exp.h" + +#ifndef YAML_PREFETCH_SIZE +#define YAML_PREFETCH_SIZE 2048 +#endif + +#define S_ARRAY_SIZE( A ) (sizeof(A)/sizeof(*(A))) +#define S_ARRAY_END( A ) ((A) + S_ARRAY_SIZE(A)) + +#define CP_REPLACEMENT_CHARACTER (0xFFFD) namespace YAML { - Stream::Stream(std::istream& input): pos(0), line(0), column(0), size(0), buffer(0) + enum UtfIntroState { + uis_start, + uis_utfbe_b1, + uis_utf32be_b2, + uis_utf32be_bom3, + uis_utf32be, + uis_utf16be, + uis_utf16be_bom1, + uis_utfle_bom1, + uis_utf16le_bom2, + uis_utf32le_bom3, + uis_utf16le, + uis_utf32le, + uis_utf8_imp, + uis_utf16le_imp, + uis_utf32le_imp3, + uis_utf8_bom1, + uis_utf8_bom2, + uis_utf8, + uis_error + }; + + enum UtfIntroCharType { + uict00, + uictBB, + 
uictBF, + uictEF, + uictFE, + uictFF, + uictAscii, + uictOther, + uictMax + }; + + static bool s_introFinalState[] = { + false, //uis_start + false, //uis_utfbe_b1 + false, //uis_utf32be_b2 + false, //uis_utf32be_bom3 + true, //uis_utf32be + true, //uis_utf16be + false, //uis_utf16be_bom1 + false, //uis_utfle_bom1 + false, //uis_utf16le_bom2 + false, //uis_utf32le_bom3 + true, //uis_utf16le + true, //uis_utf32le + false, //uis_utf8_imp + false, //uis_utf16le_imp + false, //uis_utf32le_imp3 + false, //uis_utf8_bom1 + false, //uis_utf8_bom2 + true, //uis_utf8 + true, //uis_error + }; + + static UtfIntroState s_introTransitions[][uictMax] = { + // uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther + {uis_utfbe_b1, uis_utf8, uis_utf8, uis_utf8_bom1, uis_utf16be_bom1, uis_utfle_bom1, uis_utf8_imp, uis_utf8}, + {uis_utf32be_b2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16be, uis_utf8}, + {uis_utf32be, uis_utf8, uis_utf8, uis_utf8, uis_utf32be_bom3, uis_utf8, uis_utf8, uis_utf8}, + {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf32be, uis_utf8, uis_utf8}, + {uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be}, + {uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be}, + {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16be, uis_utf8, uis_utf8}, + {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16le_bom2, uis_utf8, uis_utf8, uis_utf8}, + {uis_utf32le_bom3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le}, + {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le}, + {uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le}, + {uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le}, + {uis_utf16le_imp, uis_utf8, uis_utf8, uis_utf8, uis_utf8, 
uis_utf8, uis_utf8, uis_utf8}, + {uis_utf32le_imp3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le}, + {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le}, + {uis_utf8, uis_utf8_bom2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8}, + {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8}, + {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8}, + }; + + static char s_introUngetCount[][uictMax] = { + // uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther + {0, 1, 1, 0, 0, 0, 0, 1}, + {0, 2, 2, 2, 2, 2, 2, 2}, + {3, 3, 3, 3, 0, 3, 3, 3}, + {4, 4, 4, 4, 4, 0, 4, 4}, + {1, 1, 1, 1, 1, 1, 1, 1}, + {1, 1, 1, 1, 1, 1, 1, 1}, + {2, 2, 2, 2, 2, 0, 2, 2}, + {2, 2, 2, 2, 0, 2, 2, 2}, + {0, 1, 1, 1, 1, 1, 1, 1}, + {0, 2, 2, 2, 2, 2, 2, 2}, + {1, 1, 1, 1, 1, 1, 1, 1}, + {1, 1, 1, 1, 1, 1, 1, 1}, + {0, 2, 2, 2, 2, 2, 2, 2}, + {0, 3, 3, 3, 3, 3, 3, 3}, + {4, 4, 4, 4, 4, 4, 4, 4}, + {2, 0, 2, 2, 2, 2, 2, 2}, + {3, 3, 0, 3, 3, 3, 3, 3}, + {1, 1, 1, 1, 1, 1, 1, 1}, + }; + + inline UtfIntroCharType IntroCharTypeOf(std::istream::int_type ch) { + if (std::istream::traits_type::eof() == ch) { + return uictOther; + } + + switch (ch) { + case 0: return uict00; + case 0xBB: return uictBB; + case 0xBF: return uictBF; + case 0xEF: return uictEF; + case 0xFE: return uictFE; + case 0xFF: return uictFF; + } + + if ((ch > 0) && (ch < 0xFF)) { + return uictAscii; + } + + return uictOther; + } + + inline char Utf8Adjust(unsigned long ch, unsigned char lead_bits, unsigned char rshift) + { + const unsigned char header = ((1 << lead_bits) - 1) << (8 - lead_bits); + const unsigned char mask = (0xFF >> (lead_bits + 1)); + return static_cast(static_cast( + header | ((ch >> rshift) & mask) + )); + } + + inline void QueueUnicodeCodepoint(std::deque& q, unsigned long ch) + { + // We are not allowed to queue the Stream::eof() 
codepoint, so + // replace it with CP_REPLACEMENT_CHARACTER + if (static_cast(Stream::eof()) == ch) + { + ch = CP_REPLACEMENT_CHARACTER; + } + + if (ch < 0x80) + { + q.push_back(Utf8Adjust(ch, 0, 0)); + } + else if (ch < 0x800) + { + q.push_back(Utf8Adjust(ch, 2, 6)); + q.push_back(Utf8Adjust(ch, 1, 0)); + } + else if (ch < 0x10000) + { + q.push_back(Utf8Adjust(ch, 3, 12)); + q.push_back(Utf8Adjust(ch, 1, 6)); + q.push_back(Utf8Adjust(ch, 1, 0)); + } + else + { + q.push_back(Utf8Adjust(ch, 4, 18)); + q.push_back(Utf8Adjust(ch, 1, 12)); + q.push_back(Utf8Adjust(ch, 1, 6)); + q.push_back(Utf8Adjust(ch, 1, 0)); + } + } + + Stream::Stream(std::istream& input) + : pos(0), line(0), column(0), m_input(input), m_nPushedBack(0), + m_pPrefetched(new unsigned char[YAML_PREFETCH_SIZE]), + m_nPrefetchedAvailable(0), m_nPrefetchedUsed(0) + { + typedef std::istream::traits_type char_traits; + if(!input) return; - std::streambuf *pBuf = input.rdbuf(); + // Determine (or guess) the character-set by reading the BOM, if any. See + // the YAML specification for the determination algorithm. 
+ char_traits::int_type intro[4]; + int nIntroUsed = 0; + UtfIntroState state = uis_start; + for (; !s_introFinalState[state]; ) { + std::istream::int_type ch = input.get(); + intro[nIntroUsed++] = ch; + UtfIntroCharType charType = IntroCharTypeOf(ch); + UtfIntroState newState = s_introTransitions[state][charType]; + int nUngets = s_introUngetCount[state][charType]; + if (nUngets > 0) { + for (; nUngets > 0; --nUngets) { + if (char_traits::eof() != intro[--nIntroUsed]) { + m_bufPushback[m_nPushedBack++] = + char_traits::to_char_type(intro[nIntroUsed]); + } + } + } + state = newState; + } - // store entire file in buffer - size = pBuf->pubseekoff(0, std::ios::end, std::ios::in); - pBuf->pubseekpos(0, std::ios::in); - buffer = new char[size]; - size = pBuf->sgetn(buffer, size); // Note: when reading a Windows CR/LF file, - // pubseekoff() counts CR/LF as two characters, - // setgn() reads CR/LF as a single LF character! + switch (state) { + case uis_utf8: m_charSet = utf8; break; + case uis_utf16le: m_charSet = utf16le; break; + case uis_utf16be: m_charSet = utf16be; break; + case uis_utf32le: m_charSet = utf32le; break; + case uis_utf32be: m_charSet = utf32be; break; + default: m_charSet = utf8; break; + } + + ReadAheadTo(0); } Stream::~Stream() { - delete [] buffer; + delete[] m_pPrefetched; } - - char Stream::peek() + char Stream::peek() const { - return buffer[pos]; + if (m_readahead.empty()) + { + return Stream::eof(); + } + + return m_readahead[0]; } Stream::operator bool() const { - return pos < size; + return m_input.good() || (!m_readahead.empty() && m_readahead[0] != Stream::eof()); } // get // . 
Extracts a character from the stream and updates our position char Stream::get() { - char ch = buffer[pos]; - pos++; + char ch = peek(); + AdvanceCurrent(); column++; + if(ch == '\n') { column = 0; line++; } + return ch; } @@ -69,4 +277,183 @@ namespace YAML get(); } + void Stream::AdvanceCurrent() + { + if (!m_readahead.empty()) + { + m_readahead.pop_front(); + ++pos; + } + + ReadAheadTo(0); + } + + bool Stream::_ReadAheadTo(size_t i) const + { + while (m_input.good() && (m_readahead.size() <= i)) + { + switch (m_charSet) + { + case utf8: StreamInUtf8(); break; + case utf16le: StreamInUtf16(); break; + case utf16be: StreamInUtf16(); break; + case utf32le: StreamInUtf32(); break; + case utf32be: StreamInUtf32(); break; + } + } + + // signal end of stream + if(!m_input.good()) + m_readahead.push_back(Stream::eof()); + + return m_readahead.size() > i; + } + + void Stream::StreamInUtf8() const + { + unsigned char b = GetNextByte(); + if (m_input.good()) + { + m_readahead.push_back(b); + } + } + + // StreamInUtf16 + // . Decodes one code point (a single unit or a surrogate pair) and queues it as UTF-8. + // . Unpaired surrogates are replaced by CP_REPLACEMENT_CHARACTER. + void Stream::StreamInUtf16() const + { + unsigned long ch = 0; + unsigned char bytes[2]; + int nBigEnd = (m_charSet == utf16be) ? 0 : 1; + + bytes[0] = GetNextByte(); + bytes[1] = GetNextByte(); + if (!m_input.good()) + { + return; + } + ch = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) | + static_cast<unsigned long>(bytes[1 ^ nBigEnd]); + + if (ch >= 0xDC00 && ch < 0xE000) + { + // Trailing (low) surrogate...ugh, wrong order + QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER); + return; + } + else if (ch >= 0xD800 && ch < 0xDC00) + { + // ch is a leading (high) surrogate + + // Four byte UTF-8 code point + + // Read the trailing (low) surrogate + for (;;) + { + bytes[0] = GetNextByte(); + bytes[1] = GetNextByte(); + if (!m_input.good()) + { + QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER); + return; + } + unsigned long chLow = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) | + static_cast<unsigned long>(bytes[1 ^ nBigEnd]); + if (chLow < 0xDC00 || chLow >= 0xE000) + { + // Trouble...not a low surrogate. 
Dump a REPLACEMENT CHARACTER into the stream. + QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER); + + // Deal with the next UTF-16 unit + if (chLow < 0xD800 || chLow >= 0xE000) + { + // Easiest case: chLow is an ordinary code unit, so queue it and return + QueueUnicodeCodepoint(m_readahead, chLow); + return; + } + else + { + // Start the loop over with the new high surrogate + ch = chLow; + continue; + } + } + + // Select the payload bits from the high surrogate + ch &= 0x3FF; + ch <<= 10; + + // Include bits from low surrogate + ch |= (chLow & 0x3FF); + + // Add the surrogacy offset; the pair is complete, so stop reading units + ch += 0x10000; + break; + } + } + + QueueUnicodeCodepoint(m_readahead, ch); + } + + inline char* ReadBuffer(unsigned char* pBuffer) + { + return reinterpret_cast<char*>(pBuffer); + } + + unsigned char Stream::GetNextByte() const + { + if (m_nPushedBack) + { + return m_bufPushback[--m_nPushedBack]; + } + + if (m_nPrefetchedUsed >= m_nPrefetchedAvailable) + { + std::streambuf *pBuf = m_input.rdbuf(); + m_nPrefetchedAvailable = pBuf->sgetn(ReadBuffer(m_pPrefetched), + YAML_PREFETCH_SIZE); + m_nPrefetchedUsed = 0; + if (!m_nPrefetchedAvailable) + { + m_input.setstate(std::ios_base::eofbit); + } + + if (0 == m_nPrefetchedAvailable) + { + return 0; + } + } + + return m_pPrefetched[m_nPrefetchedUsed++]; + } + + void Stream::StreamInUtf32() const + { + static int indexes[2][4] = { + {3, 2, 1, 0}, + {0, 1, 2, 3} + }; + + unsigned long ch = 0; + unsigned char bytes[4]; + int* pIndexes = (m_charSet == utf32be) ? 
indexes[1] : indexes[0]; + + bytes[0] = GetNextByte(); + bytes[1] = GetNextByte(); + bytes[2] = GetNextByte(); + bytes[3] = GetNextByte(); + if (!m_input.good()) + { + return; + } + + for (int i = 0; i < 4; ++i) + { + ch <<= 8; + ch |= bytes[pIndexes[i]]; + } + + QueueUnicodeCodepoint(m_readahead, ch); + } } diff --git a/src/stream.h b/src/stream.h index 5bc2c11..05fb29c 100644 --- a/src/stream.h +++ b/src/stream.h @@ -1,42 +1,66 @@ #pragma once +#include #include #include +#include +#include namespace YAML { - // a simple buffer wrapper that knows how big it is - struct Buffer { - Buffer(const char *b, int s): buffer(b), size(s) {} - - operator bool() const { return size > 0; } - bool operator !() const { return !static_cast (*this); } - char operator [] (int i) const { return buffer[i]; } - const Buffer operator + (int offset) const { return Buffer(buffer + offset, size - offset); } - Buffer& operator ++ () { ++buffer; --size; return *this; } - - const char *buffer; - int size; - }; + static const size_t MAX_PARSER_PUSHBACK = 8; class Stream { public: + friend class StreamCharSource; + Stream(std::istream& input); ~Stream(); operator bool() const; bool operator !() const { return !static_cast (*this); } - const Buffer current() const { return Buffer(buffer + pos, size - pos); } - char peek(); + char peek() const; char get(); std::string get(int n); void eat(int n = 1); - int pos, line, column, size; + static char eof() { return 0x04; } + + int pos, line, column; private: - char *buffer; + enum CharacterSet {utf8, utf16le, utf16be, utf32le, utf32be}; + + std::istream& m_input; + CharacterSet m_charSet; + unsigned char m_bufPushback[MAX_PARSER_PUSHBACK]; + mutable size_t m_nPushedBack; + mutable std::deque m_readahead; + unsigned char* const m_pPrefetched; + mutable size_t m_nPrefetchedAvailable; + mutable size_t m_nPrefetchedUsed; + + void AdvanceCurrent(); + char CharAt(size_t i) const; + bool ReadAheadTo(size_t i) const; + bool _ReadAheadTo(size_t i) const; + 
void StreamInUtf8() const; + void StreamInUtf16() const; + void StreamInUtf32() const; + unsigned char GetNextByte() const; }; + + // CharAt + // . Unchecked access + inline char Stream::CharAt(size_t i) const { + return m_readahead[i]; + } + + inline bool Stream::ReadAheadTo(size_t i) const { + if(m_readahead.size() > i) + return true; + return _ReadAheadTo(i); + } } diff --git a/src/streamcharsource.h b/src/streamcharsource.h new file mode 100644 index 0000000..4a1122c --- /dev/null +++ b/src/streamcharsource.h @@ -0,0 +1,39 @@ +#pragma once + +#include + +namespace YAML +{ + class StreamCharSource + { + public: + StreamCharSource(const Stream& stream); + ~StreamCharSource() {} + + operator bool() const; + char operator [] (std::size_t i) const { return m_stream.CharAt(m_offset + i); } + bool operator !() const { return !static_cast(*this); } + + const StreamCharSource operator + (int i) const; + + private: + std::size_t m_offset; + const Stream& m_stream; + }; + + inline StreamCharSource::StreamCharSource(const Stream& stream): m_offset(0), m_stream(stream) { + } + + inline StreamCharSource::operator bool() const { + return m_stream.ReadAheadTo(m_offset); + } + + inline const StreamCharSource StreamCharSource::operator + (int i) const { + StreamCharSource source(*this); + if(static_cast (source.m_offset) + i >= 0) + source.m_offset += i; + else + source.m_offset = 0; + return source; + } +} diff --git a/src/stringsource.h b/src/stringsource.h new file mode 100644 index 0000000..4b892fe --- /dev/null +++ b/src/stringsource.h @@ -0,0 +1,34 @@ +#pragma once + +#include + +namespace YAML +{ + class StringCharSource + { + public: + StringCharSource(const char *str, std::size_t size): m_str(str), m_size(size), m_offset(0) {} + + operator bool() const { return m_offset < m_size; } + char operator [] (std::size_t i) const { return m_str[m_offset + i]; } + bool operator !() const { return !static_cast(*this); } + + const StringCharSource operator + (int i) const { + 
StringCharSource source(*this); + if(static_cast (source.m_offset) + i >= 0) + source.m_offset += i; + else + source.m_offset = 0; + return source; + } + + StringCharSource& operator ++ () { + ++m_offset; + return *this; + } + private: + const char *m_str; + std::size_t m_size; + std::size_t m_offset; + }; +} diff --git a/util/CMakeLists.txt b/util/CMakeLists.txt new file mode 100644 index 0000000..22339f0 --- /dev/null +++ b/util/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(parse parse.cpp) +target_link_libraries(parse yaml-cpp) diff --git a/util/parse.cpp b/util/parse.cpp new file mode 100644 index 0000000..1632041 --- /dev/null +++ b/util/parse.cpp @@ -0,0 +1,21 @@ +#include "yaml.h" +#include +#include + +int main(int argc, char **argv) +{ + if(argc != 2) { + std::cout << "Usage: " << argv[0] << " input-file\n"; + return 0; + } + + std::ifstream fin(argv[1]); + try { + YAML::Parser parser(fin); + YAML::Node doc; + parser.GetNextDocument(doc); + } catch(const YAML::Exception& e) { + std::cerr << "Error at line " << e.line << ", col " << e.column << ": " << e.msg << "\n"; + } + return 0; +} diff --git a/yaml-reader/tests.cpp b/yaml-reader/tests.cpp index 74a2dd2..c4d222e 100644 --- a/yaml-reader/tests.cpp +++ b/yaml-reader/tests.cpp @@ -72,6 +72,174 @@ namespace Test std::cout << "Caught exception: " << error << "\n"; } } + + typedef void (*EncodingFn)(std::ostream&, int); + + inline char Byte(int ch) + { + return static_cast(static_cast(static_cast(ch))); + } + + void EncodeToUtf8(std::ostream& stream, int ch) + { + if (ch <= 0x7F) + { + stream << Byte(ch); + } + else if (ch <= 0x7FF) + { + stream << Byte(0xC0 | (ch >> 6)); + stream << Byte(0x80 | (ch & 0x3F)); + } + else if (ch <= 0xFFFF) + { + stream << Byte(0xE0 | (ch >> 12)); + stream << Byte(0x80 | ((ch >> 6) & 0x3F)); + stream << Byte(0x80 | (ch & 0x3F)); + } + else if (ch <= 0x1FFFFF) + { + stream << Byte(0xF0 | (ch >> 18)); + stream << Byte(0x80 | ((ch >> 12) & 0x3F)); + stream << Byte(0x80 | ((ch >> 
6) & 0x3F)); + stream << Byte(0x80 | (ch & 0x3F)); + } + } + + bool SplitUtf16HighChar(std::ostream& stream, EncodingFn encoding, int ch) + { + int biasedValue = ch - 0x10000; + if (biasedValue < 0) + { + return false; + } + int high = 0xD800 | (biasedValue >> 10); + int low = 0xDC00 | (biasedValue & 0x3FF); + encoding(stream, high); + encoding(stream, low); + return true; + } + + void EncodeToUtf16LE(std::ostream& stream, int ch) + { + if (!SplitUtf16HighChar(stream, &EncodeToUtf16LE, ch)) + { + stream << Byte(ch & 0xFF) << Byte(ch >> 8); + } + } + + void EncodeToUtf16BE(std::ostream& stream, int ch) + { + if (!SplitUtf16HighChar(stream, &EncodeToUtf16BE, ch)) + { + stream << Byte(ch >> 8) << Byte(ch & 0xFF); + } + } + + void EncodeToUtf32LE(std::ostream& stream, int ch) + { + stream << Byte(ch & 0xFF) << Byte((ch >> 8) & 0xFF) + << Byte((ch >> 16) & 0xFF) << Byte((ch >> 24) & 0xFF); + } + + void EncodeToUtf32BE(std::ostream& stream, int ch) + { + stream << Byte((ch >> 24) & 0xFF) << Byte((ch >> 16) & 0xFF) + << Byte((ch >> 8) & 0xFF) << Byte(ch & 0xFF); + } + + class EncodingTester + { + public: + EncodingTester(EncodingFn encoding, bool declareEncoding) + { + if (declareEncoding) + { + encoding(m_yaml, 0xFEFF); + } + + AddEntry(encoding, 0x0021, 0x007E); // Basic Latin + AddEntry(encoding, 0x00A1, 0x00FF); // Latin-1 Supplement + AddEntry(encoding, 0x0660, 0x06FF); // Arabic (largest contiguous block) + + // CJK unified ideographs (multiple lines) + AddEntry(encoding, 0x4E00, 0x4EFF); + AddEntry(encoding, 0x4F00, 0x4FFF); + AddEntry(encoding, 0x5000, 0x51FF); // 512 character line + AddEntry(encoding, 0x5200, 0x54FF); // 768 character line + AddEntry(encoding, 0x5500, 0x58FF); // 1024 character line + + AddEntry(encoding, 0x103A0, 0x103C3); // Old Persian + + m_yaml.seekg(0, std::ios::beg); + } + + std::istream& stream() {return m_yaml;} + const std::vector& entries() {return m_entries;} + + private: + std::stringstream m_yaml; + std::vector m_entries; + + void 
AddEntry(EncodingFn encoding, int startCh, int endCh) + { + encoding(m_yaml, '-'); + encoding(m_yaml, ' '); + encoding(m_yaml, '|'); + encoding(m_yaml, '\n'); + encoding(m_yaml, ' '); + encoding(m_yaml, ' '); + + std::stringstream entry; + for (int ch = startCh; ch <= endCh; ++ch) + { + encoding(m_yaml, ch); + EncodeToUtf8(entry, ch); + } + encoding(m_yaml, '\n'); + EncodeToUtf8(entry, '\n'); // NOTE(review): assumes the parser keeps the final line break of a literal block scalar (YAML clip chomping) - confirm + m_entries.push_back(entry.str()); + } + }; + // Parses the YAML built by EncodingTester and checks every sequence node against its expected UTF-8 entry; clears 'passed' on any failure. + void RunEncodingTest(EncodingFn encoding, bool declareEncoding, const std::string& name, bool& passed) + { + EncodingTester tester(encoding, declareEncoding); + std::string error; + bool ok = true; + try { + YAML::Parser parser(tester.stream()); + YAML::Node doc; + parser.GetNextDocument(doc); + + YAML::Iterator itNode = doc.begin(); + std::vector<std::string>::const_iterator itEntry = tester.entries().begin(); + for (; (itNode != doc.end()) && (itEntry != tester.entries().end()); ++itNode, ++itEntry) + { + std::string stScalarValue; + if (!itNode->GetScalar(stScalarValue) || (stScalarValue != *itEntry)) + { + break; // stops short of end(), which the check below reports as a failure + } + } + + if ((itNode != doc.end()) || (itEntry != tester.entries().end())) + { + ok = false; + } + } catch(const YAML::Exception& e) { + ok = false; + error = e.msg; + } + if(ok) { + std::cout << "Parser test passed: " << name << "\n"; + } else { + passed = false; + std::cout << "Parser test failed: " << name << "\n"; + if(error != "") + std::cout << "Caught exception: " << error << "\n"; + } + } } bool RunParserTests() @@ -94,6 +262,17 @@ namespace Test RunParserTest(&Parser::SimpleMap, "simple map", passed); RunParserTest(&Parser::FlowSeq, "flow seq", passed); RunParserTest(&Parser::FlowMap, "flow map", passed); + + RunEncodingTest(&EncodeToUtf8, false, "UTF-8, no BOM", passed); + RunEncodingTest(&EncodeToUtf8, true, "UTF-8 with BOM", passed); + RunEncodingTest(&EncodeToUtf16LE, false, "UTF-16LE, no BOM", passed); + RunEncodingTest(&EncodeToUtf16LE, true, "UTF-16LE with BOM", passed); + RunEncodingTest(&EncodeToUtf16BE, false, "UTF-16BE, no BOM", 
passed); + RunEncodingTest(&EncodeToUtf16BE, true, "UTF-16BE with BOM", passed); + RunEncodingTest(&EncodeToUtf32LE, false, "UTF-32LE, no BOM", passed); + RunEncodingTest(&EncodeToUtf32LE, true, "UTF-32LE with BOM", passed); + RunEncodingTest(&EncodeToUtf32BE, false, "UTF-32BE, no BOM", passed); + RunEncodingTest(&EncodeToUtf32BE, true, "UTF-32BE with BOM", passed); return passed; } diff --git a/yamlcpp.vcproj b/yamlcpp.vcproj index 53d6f37..218c524 100644 --- a/yamlcpp.vcproj +++ b/yamlcpp.vcproj @@ -231,6 +231,10 @@ RelativePath=".\src\stream.cpp" > + + + +