Merged utf branch changes r178:187 into the trunk

This commit is contained in:
Jesse Beder
2009-07-10 03:10:03 +00:00
parent a7b8879494
commit b929eb94a0
13 changed files with 939 additions and 271 deletions

View File

@@ -3,6 +3,7 @@
#include "indentation.h"
#include "exceptions.h"
#include <sstream>
#include "stringsource.h"
namespace YAML
{
@@ -29,8 +30,8 @@ namespace YAML
|| (!Exp::Printable)
|| Exp::Break
|| Exp::Tab;
Buffer buffer(&str[0], str.size());
while(buffer.size) {
StringCharSource buffer(str.c_str(), str.size());
while(buffer) {
if(disallowed.Matches(buffer))
return false;
++buffer;

View File

@@ -1,139 +1,39 @@
#include "crt.h"
#include "regex.h"
#include "stream.h"
#include <iostream>
namespace YAML
{
RegEx::RegEx(REGEX_OP op): m_op(op), m_pOp(0)
// constructors
RegEx::RegEx(): m_op(REGEX_EMPTY)
{
SetOp();
}
RegEx::RegEx(const RegEx& rhs): m_pOp(0)
RegEx::RegEx(REGEX_OP op): m_op(op)
{
m_op = rhs.m_op;
m_a = rhs.m_a;
m_z = rhs.m_z;
m_params = rhs.m_params;
SetOp();
}
RegEx::RegEx(): m_op(REGEX_EMPTY), m_pOp(0)
RegEx::RegEx(char ch): m_op(REGEX_MATCH), m_a(ch)
{
SetOp();
}
RegEx::RegEx(char ch): m_op(REGEX_MATCH), m_pOp(0), m_a(ch)
RegEx::RegEx(char a, char z): m_op(REGEX_RANGE), m_a(a), m_z(z)
{
SetOp();
}
RegEx::RegEx(char a, char z): m_op(REGEX_RANGE), m_pOp(0), m_a(a), m_z(z)
{
SetOp();
}
RegEx::RegEx(const std::string& str, REGEX_OP op): m_op(op), m_pOp(0)
RegEx::RegEx(const std::string& str, REGEX_OP op): m_op(op)
{
for(unsigned i=0;i<str.size();i++)
m_params.push_back(RegEx(str[i]));
SetOp();
}
RegEx::~RegEx()
{
delete m_pOp;
}
RegEx& RegEx::operator = (const RegEx& rhs)
{
delete m_pOp;
m_pOp = 0;
m_op = rhs.m_op;
m_a = rhs.m_a;
m_z = rhs.m_z;
m_params = rhs.m_params;
SetOp();
return *this;
}
void RegEx::SetOp()
{
delete m_pOp;
m_pOp = 0;
switch(m_op) {
case REGEX_MATCH: m_pOp = new MatchOperator; break;
case REGEX_RANGE: m_pOp = new RangeOperator; break;
case REGEX_OR: m_pOp = new OrOperator; break;
case REGEX_AND: m_pOp = new AndOperator; break;
case REGEX_NOT: m_pOp = new NotOperator; break;
case REGEX_SEQ: m_pOp = new SeqOperator; break;
default: break;
}
}
bool RegEx::Matches(char ch) const
{
std::string str;
str += ch;
return Matches(str);
}
bool RegEx::Matches(const std::string& str) const
{
return Match(str) >= 0;
}
bool RegEx::Matches(const Buffer& buffer) const
{
return Match(buffer) >= 0;
}
bool RegEx::Matches(const Stream& in) const
{
return Match(in) >= 0;
}
// Match
// . Matches the given string against this regular expression.
// . Returns the number of characters matched.
// . Returns -1 if no characters were matched (the reason for
// not returning zero is that we may have an empty regex
// which is ALWAYS successful at matching zero characters).
// . REMEMBER that we only match from the start of the buffer!
int RegEx::Match(const Buffer& buffer) const
{
if(!m_pOp)
return !buffer ? 0 : -1; // the empty regex only is successful on the empty string
return m_pOp->Match(buffer, *this);
}
int RegEx::Match(const std::string& str) const
{
Buffer buffer(str.c_str(), str.size());
return Match(buffer);
}
// Match
int RegEx::Match(const Stream& in) const
{
return Match(in.current());
}
// combination constructors
RegEx operator ! (const RegEx& ex)
{
RegEx ret(REGEX_NOT);
ret.m_params.push_back(ex);
return ret;
}
RegEx operator || (const RegEx& ex1, const RegEx& ex2)
{
RegEx ret(REGEX_OR);
@@ -141,7 +41,7 @@ namespace YAML
ret.m_params.push_back(ex2);
return ret;
}
RegEx operator && (const RegEx& ex1, const RegEx& ex2)
{
RegEx ret(REGEX_AND);
@@ -149,85 +49,13 @@ namespace YAML
ret.m_params.push_back(ex2);
return ret;
}
RegEx operator + (const RegEx& ex1, const RegEx& ex2)
{
RegEx ret(REGEX_SEQ);
ret.m_params.push_back(ex1);
ret.m_params.push_back(ex2);
return ret;
}
//////////////////////////////////////////////////////////////////////////////
// Operators
// MatchOperator
int RegEx::MatchOperator::Match(const Buffer& buffer, const RegEx& regex) const
{
if(!buffer || buffer[0] != regex.m_a)
return -1;
return 1;
}
// RangeOperator
int RegEx::RangeOperator::Match(const Buffer& buffer, const RegEx& regex) const
{
if(!buffer || regex.m_a > buffer[0] || regex.m_z < buffer[0])
return -1;
return 1;
}
// OrOperator
int RegEx::OrOperator::Match(const Buffer& buffer, const RegEx& regex) const
{
for(unsigned i=0;i<regex.m_params.size();i++) {
int n = regex.m_params[i].Match(buffer);
if(n >= 0)
return n;
}
return -1;
}
// AndOperator
// Note: 'AND' is a little funny, since we may be required to match things
// of different lengths. If we find a match, we return the length of
// the FIRST entry on the list.
int RegEx::AndOperator::Match(const Buffer& buffer, const RegEx& regex) const
{
int first = -1;
for(unsigned i=0;i<regex.m_params.size();i++) {
int n = regex.m_params[i].Match(buffer);
if(n == -1)
return -1;
if(i == 0)
first = n;
}
return first;
}
// NotOperator
int RegEx::NotOperator::Match(const Buffer& buffer, const RegEx& regex) const
{
if(regex.m_params.empty())
return -1;
if(regex.m_params[0].Match(buffer) >= 0)
return -1;
return 1;
}
// SeqOperator
int RegEx::SeqOperator::Match(const Buffer& buffer, const RegEx& regex) const
{
int offset = 0;
for(unsigned i=0;i<regex.m_params.size();i++) {
int n = regex.m_params[i].Match(buffer + offset);
if(n == -1)
return -1;
offset += n;
}
return offset;
}
}
}

View File

@@ -6,7 +6,6 @@
namespace YAML
{
class Stream;
struct Buffer;
enum REGEX_OP { REGEX_EMPTY, REGEX_MATCH, REGEX_RANGE, REGEX_OR, REGEX_AND, REGEX_NOT, REGEX_SEQ };
@@ -15,70 +14,46 @@ namespace YAML
// . Only matches from start of string
class RegEx
{
private:
// the operators
struct Operator {
virtual ~Operator() {}
virtual int Match(const Buffer& buffer, const RegEx& regex) const = 0;
};
struct MatchOperator: public Operator {
virtual int Match(const Buffer& buffer, const RegEx& regex) const;
};
struct RangeOperator: public Operator {
virtual int Match(const Buffer& buffer, const RegEx& regex) const;
};
struct OrOperator: public Operator {
virtual int Match(const Buffer& buffer, const RegEx& regex) const;
};
struct AndOperator: public Operator {
virtual int Match(const Buffer& buffer, const RegEx& regex) const;
};
struct NotOperator: public Operator {
virtual int Match(const Buffer& buffer, const RegEx& regex) const;
};
struct SeqOperator: public Operator {
virtual int Match(const Buffer& buffer, const RegEx& regex) const;
};
public:
friend struct Operator;
RegEx();
RegEx(char ch);
RegEx(char a, char z);
RegEx(const std::string& str, REGEX_OP op = REGEX_SEQ);
RegEx(const RegEx& rhs);
~RegEx();
RegEx& operator = (const RegEx& rhs);
bool Matches(char ch) const;
bool Matches(const std::string& str) const;
bool Matches(const Buffer& buffer) const;
bool Matches(const Stream& in) const;
int Match(const std::string& str) const;
int Match(const Buffer& buffer) const;
int Match(const Stream& in) const;
~RegEx() {}
friend RegEx operator ! (const RegEx& ex);
friend RegEx operator || (const RegEx& ex1, const RegEx& ex2);
friend RegEx operator && (const RegEx& ex1, const RegEx& ex2);
friend RegEx operator + (const RegEx& ex1, const RegEx& ex2);
bool Matches(char ch) const;
bool Matches(const std::string& str) const;
bool Matches(const Stream& in) const;
template <typename Source> bool Matches(const Source& source) const;
int Match(const std::string& str) const;
int Match(const Stream& in) const;
private:
RegEx(REGEX_OP op);
void SetOp();
template <typename Source> bool IsValidSource(const Source& source) const;
template <typename Source> int Match(const Source& source) const;
template <typename Source> int MatchUnchecked(const Source& source) const;
template <typename Source> int MatchOpEmpty(const Source& source) const;
template <typename Source> int MatchOpMatch(const Source& source) const;
template <typename Source> int MatchOpRange(const Source& source) const;
template <typename Source> int MatchOpOr(const Source& source) const;
template <typename Source> int MatchOpAnd(const Source& source) const;
template <typename Source> int MatchOpNot(const Source& source) const;
template <typename Source> int MatchOpSeq(const Source& source) const;
private:
REGEX_OP m_op;
Operator *m_pOp;
char m_a, m_z;
std::vector <RegEx> m_params;
};
}
#include "regeximpl.h"

172
src/regeximpl.h Normal file
View File

@@ -0,0 +1,172 @@
#pragma once
#include "stream.h"
#include "stringsource.h"
#include "streamcharsource.h"
namespace YAML
{
// query matches
inline bool RegEx::Matches(char ch) const {
std::string str;
str += ch;
return Matches(str);
}
inline bool RegEx::Matches(const std::string& str) const {
return Match(str) >= 0;
}
inline bool RegEx::Matches(const Stream& in) const {
return Match(in) >= 0;
}
template <typename Source>
inline bool RegEx::Matches(const Source& source) const {
return Match(source) >= 0;
}
// Match
// . Matches the given string against this regular expression.
// . Returns the number of characters matched.
// . Returns -1 if no characters were matched (the reason for
// not returning zero is that we may have an empty regex
// which is ALWAYS successful at matching zero characters).
// . REMEMBER that we only match from the start of the buffer!
inline int RegEx::Match(const std::string& str) const
{
StringCharSource source(str.c_str(), str.size());
return Match(source);
}
inline int RegEx::Match(const Stream& in) const
{
StreamCharSource source(in);
return Match(source);
}
template <typename Source>
inline bool RegEx::IsValidSource(const Source& source) const
{
return source;
}
template<>
inline bool RegEx::IsValidSource<StringCharSource>(const StringCharSource&source) const
{
return source || m_op == REGEX_EMPTY;
}
template <typename Source>
inline int RegEx::Match(const Source& source) const
{
return IsValidSource(source) ? MatchUnchecked(source) : -1;
}
template <typename Source>
inline int RegEx::MatchUnchecked(const Source& source) const
{
switch(m_op) {
case REGEX_EMPTY:
return MatchOpEmpty(source);
case REGEX_MATCH:
return MatchOpMatch(source);
case REGEX_RANGE:
return MatchOpRange(source);
case REGEX_OR:
return MatchOpOr(source);
case REGEX_AND:
return MatchOpAnd(source);
case REGEX_NOT:
return MatchOpNot(source);
case REGEX_SEQ:
return MatchOpSeq(source);
}
return -1;
}
//////////////////////////////////////////////////////////////////////////////
// Operators
// Note: the convention MatchOp*<Source> is that we can assume IsSourceValid(source).
// So we do all our checks *before* we call these functions
// EmptyOperator
template <typename Source>
inline int RegEx::MatchOpEmpty(const Source& source) const {
return source[0] == Stream::eof() ? 0 : -1;
}
template <>
inline int RegEx::MatchOpEmpty<StringCharSource>(const StringCharSource& source) const {
return !source ? 0 : -1; // the empty regex only is successful on the empty string
}
// MatchOperator
template <typename Source>
inline int RegEx::MatchOpMatch(const Source& source) const {
if(source[0] != m_a)
return -1;
return 1;
}
// RangeOperator
template <typename Source>
inline int RegEx::MatchOpRange(const Source& source) const {
if(m_a > source[0] || m_z < source[0])
return -1;
return 1;
}
// OrOperator
template <typename Source>
inline int RegEx::MatchOpOr(const Source& source) const {
for(unsigned i=0;i<m_params.size();i++) {
int n = m_params[i].MatchUnchecked(source);
if(n >= 0)
return n;
}
return -1;
}
// AndOperator
// Note: 'AND' is a little funny, since we may be required to match things
// of different lengths. If we find a match, we return the length of
// the FIRST entry on the list.
template <typename Source>
inline int RegEx::MatchOpAnd(const Source& source) const {
int first = -1;
for(unsigned i=0;i<m_params.size();i++) {
int n = m_params[i].MatchUnchecked(source);
if(n == -1)
return -1;
if(i == 0)
first = n;
}
return first;
}
// NotOperator
template <typename Source>
inline int RegEx::MatchOpNot(const Source& source) const {
if(m_params.empty())
return -1;
if(m_params[0].MatchUnchecked(source) >= 0)
return -1;
return 1;
}
// SeqOperator
template <typename Source>
inline int RegEx::MatchOpSeq(const Source& source) const {
int offset = 0;
for(unsigned i=0;i<m_params.size();i++) {
int n = m_params[i].Match(source + offset); // note Match, not MatchUnchecked because we need to check validity after the offset
if(n == -1)
return -1;
offset += n;
}
return offset;
}
}

View File

@@ -1,52 +1,260 @@
#include "crt.h"
#include "stream.h"
#include <iostream>
#include "exp.h"
#ifndef YAML_PREFETCH_SIZE
#define YAML_PREFETCH_SIZE 2048
#endif
#define S_ARRAY_SIZE( A ) (sizeof(A)/sizeof(*(A)))
#define S_ARRAY_END( A ) ((A) + S_ARRAY_SIZE(A))
#define CP_REPLACEMENT_CHARACTER (0xFFFD)
namespace YAML
{
Stream::Stream(std::istream& input): pos(0), line(0), column(0), size(0), buffer(0)
enum UtfIntroState {
uis_start,
uis_utfbe_b1,
uis_utf32be_b2,
uis_utf32be_bom3,
uis_utf32be,
uis_utf16be,
uis_utf16be_bom1,
uis_utfle_bom1,
uis_utf16le_bom2,
uis_utf32le_bom3,
uis_utf16le,
uis_utf32le,
uis_utf8_imp,
uis_utf16le_imp,
uis_utf32le_imp3,
uis_utf8_bom1,
uis_utf8_bom2,
uis_utf8,
uis_error
};
enum UtfIntroCharType {
uict00,
uictBB,
uictBF,
uictEF,
uictFE,
uictFF,
uictAscii,
uictOther,
uictMax
};
static bool s_introFinalState[] = {
false, //uis_start
false, //uis_utfbe_b1
false, //uis_utf32be_b2
false, //uis_utf32be_bom3
true, //uis_utf32be
true, //uis_utf16be
false, //uis_utf16be_bom1
false, //uis_utfle_bom1
false, //uis_utf16le_bom2
false, //uis_utf32le_bom3
true, //uis_utf16le
true, //uis_utf32le
false, //uis_utf8_imp
false, //uis_utf16le_imp
false, //uis_utf32le_imp3
false, //uis_utf8_bom1
false, //uis_utf8_bom2
true, //uis_utf8
true, //uis_error
};
static UtfIntroState s_introTransitions[][uictMax] = {
// uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther
{uis_utfbe_b1, uis_utf8, uis_utf8, uis_utf8_bom1, uis_utf16be_bom1, uis_utfle_bom1, uis_utf8_imp, uis_utf8},
{uis_utf32be_b2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16be, uis_utf8},
{uis_utf32be, uis_utf8, uis_utf8, uis_utf8, uis_utf32be_bom3, uis_utf8, uis_utf8, uis_utf8},
{uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf32be, uis_utf8, uis_utf8},
{uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be},
{uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be},
{uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16be, uis_utf8, uis_utf8},
{uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16le_bom2, uis_utf8, uis_utf8, uis_utf8},
{uis_utf32le_bom3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le},
{uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le},
{uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le},
{uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le},
{uis_utf16le_imp, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8},
{uis_utf32le_imp3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le},
{uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le},
{uis_utf8, uis_utf8_bom2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8},
{uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8},
{uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8},
};
static char s_introUngetCount[][uictMax] = {
// uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther
{0, 1, 1, 0, 0, 0, 0, 1},
{0, 2, 2, 2, 2, 2, 2, 2},
{3, 3, 3, 3, 0, 3, 3, 3},
{4, 4, 4, 4, 4, 0, 4, 4},
{1, 1, 1, 1, 1, 1, 1, 1},
{1, 1, 1, 1, 1, 1, 1, 1},
{2, 2, 2, 2, 2, 0, 2, 2},
{2, 2, 2, 2, 0, 2, 2, 2},
{0, 1, 1, 1, 1, 1, 1, 1},
{0, 2, 2, 2, 2, 2, 2, 2},
{1, 1, 1, 1, 1, 1, 1, 1},
{1, 1, 1, 1, 1, 1, 1, 1},
{0, 2, 2, 2, 2, 2, 2, 2},
{0, 3, 3, 3, 3, 3, 3, 3},
{4, 4, 4, 4, 4, 4, 4, 4},
{2, 0, 2, 2, 2, 2, 2, 2},
{3, 3, 0, 3, 3, 3, 3, 3},
{1, 1, 1, 1, 1, 1, 1, 1},
};
inline UtfIntroCharType IntroCharTypeOf(std::istream::int_type ch)
{
if (std::istream::traits_type::eof() == ch) {
return uictOther;
}
switch (ch) {
case 0: return uict00;
case 0xBB: return uictBB;
case 0xBF: return uictBF;
case 0xEF: return uictEF;
case 0xFE: return uictFE;
case 0xFF: return uictFF;
}
if ((ch > 0) && (ch < 0xFF)) {
return uictAscii;
}
return uictOther;
}
inline char Utf8Adjust(unsigned long ch, unsigned char lead_bits, unsigned char rshift)
{
const unsigned char header = ((1 << lead_bits) - 1) << (8 - lead_bits);
const unsigned char mask = (0xFF >> (lead_bits + 1));
return static_cast<char>(static_cast<unsigned char>(
header | ((ch >> rshift) & mask)
));
}
inline void QueueUnicodeCodepoint(std::deque<char>& q, unsigned long ch)
{
// We are not allowed to queue the Stream::eof() codepoint, so
// replace it with CP_REPLACEMENT_CHARACTER
if (static_cast<unsigned long>(Stream::eof()) == ch)
{
ch = CP_REPLACEMENT_CHARACTER;
}
if (ch < 0x80)
{
q.push_back(Utf8Adjust(ch, 0, 0));
}
else if (ch < 0x800)
{
q.push_back(Utf8Adjust(ch, 2, 6));
q.push_back(Utf8Adjust(ch, 1, 0));
}
else if (ch < 0x10000)
{
q.push_back(Utf8Adjust(ch, 3, 12));
q.push_back(Utf8Adjust(ch, 1, 6));
q.push_back(Utf8Adjust(ch, 1, 0));
}
else
{
q.push_back(Utf8Adjust(ch, 4, 18));
q.push_back(Utf8Adjust(ch, 1, 12));
q.push_back(Utf8Adjust(ch, 1, 6));
q.push_back(Utf8Adjust(ch, 1, 0));
}
}
Stream::Stream(std::istream& input)
: pos(0), line(0), column(0), m_input(input), m_nPushedBack(0),
m_pPrefetched(new unsigned char[YAML_PREFETCH_SIZE]),
m_nPrefetchedAvailable(0), m_nPrefetchedUsed(0)
{
typedef std::istream::traits_type char_traits;
if(!input)
return;
std::streambuf *pBuf = input.rdbuf();
// Determine (or guess) the character-set by reading the BOM, if any. See
// the YAML specification for the determination algorithm.
char_traits::int_type intro[4];
int nIntroUsed = 0;
UtfIntroState state = uis_start;
for (; !s_introFinalState[state]; ) {
std::istream::int_type ch = input.get();
intro[nIntroUsed++] = ch;
UtfIntroCharType charType = IntroCharTypeOf(ch);
UtfIntroState newState = s_introTransitions[state][charType];
int nUngets = s_introUngetCount[state][charType];
if (nUngets > 0) {
for (; nUngets > 0; --nUngets) {
if (char_traits::eof() != intro[--nIntroUsed]) {
m_bufPushback[m_nPushedBack++] =
char_traits::to_char_type(intro[nIntroUsed]);
}
}
}
state = newState;
}
// store entire file in buffer
size = pBuf->pubseekoff(0, std::ios::end, std::ios::in);
pBuf->pubseekpos(0, std::ios::in);
buffer = new char[size];
size = pBuf->sgetn(buffer, size); // Note: when reading a Windows CR/LF file,
// pubseekoff() counts CR/LF as two characters,
// setgn() reads CR/LF as a single LF character!
switch (state) {
case uis_utf8: m_charSet = utf8; break;
case uis_utf16le: m_charSet = utf16le; break;
case uis_utf16be: m_charSet = utf16be; break;
case uis_utf32le: m_charSet = utf32le; break;
case uis_utf32be: m_charSet = utf32be; break;
default: m_charSet = utf8; break;
}
ReadAheadTo(0);
}
Stream::~Stream()
{
delete [] buffer;
delete[] m_pPrefetched;
}
char Stream::peek()
char Stream::peek() const
{
return buffer[pos];
if (m_readahead.empty())
{
return Stream::eof();
}
return m_readahead[0];
}
Stream::operator bool() const
{
return pos < size;
return m_input.good() || (!m_readahead.empty() && m_readahead[0] != Stream::eof());
}
// get
// . Extracts a character from the stream and updates our position
char Stream::get()
{
char ch = buffer[pos];
pos++;
char ch = peek();
AdvanceCurrent();
column++;
if(ch == '\n') {
column = 0;
line++;
}
return ch;
}
@@ -69,4 +277,179 @@ namespace YAML
get();
}
void Stream::AdvanceCurrent()
{
if (!m_readahead.empty())
{
m_readahead.pop_front();
++pos;
}
ReadAheadTo(0);
}
bool Stream::_ReadAheadTo(size_t i) const
{
while (m_input.good() && (m_readahead.size() <= i))
{
switch (m_charSet)
{
case utf8: StreamInUtf8(); break;
case utf16le: StreamInUtf16(); break;
case utf16be: StreamInUtf16(); break;
case utf32le: StreamInUtf32(); break;
case utf32be: StreamInUtf32(); break;
}
}
// signal end of stream
if(!m_input.good())
m_readahead.push_back(Stream::eof());
return m_readahead.size() > i;
}
void Stream::StreamInUtf8() const
{
unsigned char b = GetNextByte();
if (m_input.good())
{
m_readahead.push_back(b);
}
}
void Stream::StreamInUtf16() const
{
unsigned long ch = 0;
unsigned char bytes[2];
int nBigEnd = (m_charSet == utf16be) ? 0 : 1;
bytes[0] = GetNextByte();
bytes[1] = GetNextByte();
if (!m_input.good())
{
return;
}
ch = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
if (ch >= 0xDC00 && ch < 0xE000)
{
// Trailing (low) surrogate...ugh, wrong order
QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
return;
}
else if (ch >= 0xD800 && ch < 0xDC00)
{
// ch is a leading (high) surrogate
// Four byte UTF-8 code point
// Read the trailing (low) surrogate
for (;;)
{
bytes[0] = GetNextByte();
bytes[1] = GetNextByte();
if (!m_input.good())
{
QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
return;
}
unsigned long chLow = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
if (chLow < 0xDC00 || ch >= 0xE000)
{
// Trouble...not a low surrogate. Dump a REPLACEMENT CHARACTER into the stream.
QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
// Deal with the next UTF-16 unit
if (chLow < 0xD800 || ch >= 0xE000)
{
// Easiest case: queue the codepoint and return
QueueUnicodeCodepoint(m_readahead, ch);
return;
}
else
{
// Start the loop over with the new high surrogate
ch = chLow;
continue;
}
}
// Select the payload bits from the high surrogate
ch &= 0x3FF;
ch <<= 10;
// Include bits from low surrogate
ch |= (chLow & 0x3FF);
// Add the surrogacy offset
ch += 0x10000;
}
}
QueueUnicodeCodepoint(m_readahead, ch);
}
inline char* ReadBuffer(unsigned char* pBuffer)
{
return reinterpret_cast<char*>(pBuffer);
}
unsigned char Stream::GetNextByte() const
{
if (m_nPushedBack)
{
return m_bufPushback[--m_nPushedBack];
}
if (m_nPrefetchedUsed >= m_nPrefetchedAvailable)
{
std::streambuf *pBuf = m_input.rdbuf();
m_nPrefetchedAvailable = pBuf->sgetn(ReadBuffer(m_pPrefetched),
YAML_PREFETCH_SIZE);
m_nPrefetchedUsed = 0;
if (!m_nPrefetchedAvailable)
{
m_input.setstate(std::ios_base::eofbit);
}
if (0 == m_nPrefetchedAvailable)
{
return 0;
}
}
return m_pPrefetched[m_nPrefetchedUsed++];
}
void Stream::StreamInUtf32() const
{
static int indexes[2][4] = {
{3, 2, 1, 0},
{0, 1, 2, 3}
};
unsigned long ch = 0;
unsigned char bytes[4];
int* pIndexes = (m_charSet == utf32be) ? indexes[1] : indexes[0];
bytes[0] = GetNextByte();
bytes[1] = GetNextByte();
bytes[2] = GetNextByte();
bytes[3] = GetNextByte();
if (!m_input.good())
{
return;
}
for (int i = 0; i < 4; ++i)
{
ch <<= 8;
ch |= bytes[pIndexes[i]];
}
QueueUnicodeCodepoint(m_readahead, ch);
}
}

View File

@@ -1,42 +1,66 @@
#pragma once
#include <deque>
#include <ios>
#include <string>
#include <iostream>
#include <set>
namespace YAML
{
// a simple buffer wrapper that knows how big it is
struct Buffer {
Buffer(const char *b, int s): buffer(b), size(s) {}
operator bool() const { return size > 0; }
bool operator !() const { return !static_cast <bool> (*this); }
char operator [] (int i) const { return buffer[i]; }
const Buffer operator + (int offset) const { return Buffer(buffer + offset, size - offset); }
Buffer& operator ++ () { ++buffer; --size; return *this; }
const char *buffer;
int size;
};
static const size_t MAX_PARSER_PUSHBACK = 8;
class Stream
{
public:
friend class StreamCharSource;
Stream(std::istream& input);
~Stream();
operator bool() const;
bool operator !() const { return !static_cast <bool>(*this); }
const Buffer current() const { return Buffer(buffer + pos, size - pos); }
char peek();
char peek() const;
char get();
std::string get(int n);
void eat(int n = 1);
int pos, line, column, size;
static char eof() { return 0x04; }
int pos, line, column;
private:
char *buffer;
enum CharacterSet {utf8, utf16le, utf16be, utf32le, utf32be};
std::istream& m_input;
CharacterSet m_charSet;
unsigned char m_bufPushback[MAX_PARSER_PUSHBACK];
mutable size_t m_nPushedBack;
mutable std::deque<char> m_readahead;
unsigned char* const m_pPrefetched;
mutable size_t m_nPrefetchedAvailable;
mutable size_t m_nPrefetchedUsed;
void AdvanceCurrent();
char CharAt(size_t i) const;
bool ReadAheadTo(size_t i) const;
bool _ReadAheadTo(size_t i) const;
void StreamInUtf8() const;
void StreamInUtf16() const;
void StreamInUtf32() const;
unsigned char GetNextByte() const;
};
// CharAt
// . Unchecked access
inline char Stream::CharAt(size_t i) const {
return m_readahead[i];
}
inline bool Stream::ReadAheadTo(size_t i) const {
if(m_readahead.size() > i)
return true;
return _ReadAheadTo(i);
}
}

39
src/streamcharsource.h Normal file
View File

@@ -0,0 +1,39 @@
#pragma once
#include <cstddef>
namespace YAML
{
class StreamCharSource
{
public:
StreamCharSource(const Stream& stream);
~StreamCharSource() {}
operator bool() const;
char operator [] (std::size_t i) const { return m_stream.CharAt(m_offset + i); }
bool operator !() const { return !static_cast<bool>(*this); }
const StreamCharSource operator + (int i) const;
private:
std::size_t m_offset;
const Stream& m_stream;
};
inline StreamCharSource::StreamCharSource(const Stream& stream): m_offset(0), m_stream(stream) {
}
inline StreamCharSource::operator bool() const {
return m_stream.ReadAheadTo(m_offset);
}
inline const StreamCharSource StreamCharSource::operator + (int i) const {
StreamCharSource source(*this);
if(static_cast<int> (source.m_offset) + i >= 0)
source.m_offset += i;
else
source.m_offset = 0;
return source;
}
}

34
src/stringsource.h Normal file
View File

@@ -0,0 +1,34 @@
#pragma once
#include <cstddef>
namespace YAML
{
class StringCharSource
{
public:
StringCharSource(const char *str, std::size_t size): m_str(str), m_size(size), m_offset(0) {}
operator bool() const { return m_offset < m_size; }
char operator [] (std::size_t i) const { return m_str[m_offset + i]; }
bool operator !() const { return !static_cast<bool>(*this); }
const StringCharSource operator + (int i) const {
StringCharSource source(*this);
if(static_cast<int> (source.m_offset) + i >= 0)
source.m_offset += i;
else
source.m_offset = 0;
return source;
}
StringCharSource& operator ++ () {
++m_offset;
return *this;
}
private:
const char *m_str;
std::size_t m_size;
std::size_t m_offset;
};
}