mirror of
https://github.com/jbeder/yaml-cpp.git
synced 2025-09-09 04:41:16 +00:00
Merged utf branch changes r178:187 into the trunk
This commit is contained in:
@@ -2,6 +2,8 @@ cmake_minimum_required(VERSION 2.6)
|
||||
|
||||
project (YAML_CPP)
|
||||
|
||||
SET(CMAKE_CXX_FLAGS "-O2")
|
||||
|
||||
enable_testing()
|
||||
|
||||
if(WIN32)
|
||||
@@ -36,4 +38,4 @@ install(
|
||||
)
|
||||
|
||||
add_subdirectory (yaml-reader)
|
||||
|
||||
add_subdirectory (util)
|
||||
|
@@ -3,6 +3,7 @@
|
||||
#include "indentation.h"
|
||||
#include "exceptions.h"
|
||||
#include <sstream>
|
||||
#include "stringsource.h"
|
||||
|
||||
namespace YAML
|
||||
{
|
||||
@@ -29,8 +30,8 @@ namespace YAML
|
||||
|| (!Exp::Printable)
|
||||
|| Exp::Break
|
||||
|| Exp::Tab;
|
||||
Buffer buffer(&str[0], str.size());
|
||||
while(buffer.size) {
|
||||
StringCharSource buffer(str.c_str(), str.size());
|
||||
while(buffer) {
|
||||
if(disallowed.Matches(buffer))
|
||||
return false;
|
||||
++buffer;
|
||||
|
186
src/regex.cpp
186
src/regex.cpp
@@ -1,132 +1,32 @@
|
||||
#include "crt.h"
|
||||
#include "regex.h"
|
||||
#include "stream.h"
|
||||
#include <iostream>
|
||||
|
||||
namespace YAML
|
||||
{
|
||||
RegEx::RegEx(REGEX_OP op): m_op(op), m_pOp(0)
|
||||
// constructors
|
||||
RegEx::RegEx(): m_op(REGEX_EMPTY)
|
||||
{
|
||||
SetOp();
|
||||
}
|
||||
|
||||
RegEx::RegEx(const RegEx& rhs): m_pOp(0)
|
||||
RegEx::RegEx(REGEX_OP op): m_op(op)
|
||||
{
|
||||
m_op = rhs.m_op;
|
||||
m_a = rhs.m_a;
|
||||
m_z = rhs.m_z;
|
||||
m_params = rhs.m_params;
|
||||
|
||||
SetOp();
|
||||
}
|
||||
|
||||
RegEx::RegEx(): m_op(REGEX_EMPTY), m_pOp(0)
|
||||
RegEx::RegEx(char ch): m_op(REGEX_MATCH), m_a(ch)
|
||||
{
|
||||
SetOp();
|
||||
}
|
||||
|
||||
RegEx::RegEx(char ch): m_op(REGEX_MATCH), m_pOp(0), m_a(ch)
|
||||
RegEx::RegEx(char a, char z): m_op(REGEX_RANGE), m_a(a), m_z(z)
|
||||
{
|
||||
SetOp();
|
||||
}
|
||||
|
||||
RegEx::RegEx(char a, char z): m_op(REGEX_RANGE), m_pOp(0), m_a(a), m_z(z)
|
||||
{
|
||||
SetOp();
|
||||
}
|
||||
|
||||
RegEx::RegEx(const std::string& str, REGEX_OP op): m_op(op), m_pOp(0)
|
||||
RegEx::RegEx(const std::string& str, REGEX_OP op): m_op(op)
|
||||
{
|
||||
for(unsigned i=0;i<str.size();i++)
|
||||
m_params.push_back(RegEx(str[i]));
|
||||
|
||||
SetOp();
|
||||
}
|
||||
|
||||
RegEx::~RegEx()
|
||||
{
|
||||
delete m_pOp;
|
||||
}
|
||||
|
||||
RegEx& RegEx::operator = (const RegEx& rhs)
|
||||
{
|
||||
delete m_pOp;
|
||||
m_pOp = 0;
|
||||
|
||||
m_op = rhs.m_op;
|
||||
m_a = rhs.m_a;
|
||||
m_z = rhs.m_z;
|
||||
m_params = rhs.m_params;
|
||||
|
||||
SetOp();
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
void RegEx::SetOp()
|
||||
{
|
||||
delete m_pOp;
|
||||
m_pOp = 0;
|
||||
switch(m_op) {
|
||||
case REGEX_MATCH: m_pOp = new MatchOperator; break;
|
||||
case REGEX_RANGE: m_pOp = new RangeOperator; break;
|
||||
case REGEX_OR: m_pOp = new OrOperator; break;
|
||||
case REGEX_AND: m_pOp = new AndOperator; break;
|
||||
case REGEX_NOT: m_pOp = new NotOperator; break;
|
||||
case REGEX_SEQ: m_pOp = new SeqOperator; break;
|
||||
default: break;
|
||||
}
|
||||
}
|
||||
|
||||
bool RegEx::Matches(char ch) const
|
||||
{
|
||||
std::string str;
|
||||
str += ch;
|
||||
return Matches(str);
|
||||
}
|
||||
|
||||
bool RegEx::Matches(const std::string& str) const
|
||||
{
|
||||
return Match(str) >= 0;
|
||||
}
|
||||
|
||||
bool RegEx::Matches(const Buffer& buffer) const
|
||||
{
|
||||
return Match(buffer) >= 0;
|
||||
}
|
||||
|
||||
bool RegEx::Matches(const Stream& in) const
|
||||
{
|
||||
return Match(in) >= 0;
|
||||
}
|
||||
|
||||
// Match
|
||||
// . Matches the given string against this regular expression.
|
||||
// . Returns the number of characters matched.
|
||||
// . Returns -1 if no characters were matched (the reason for
|
||||
// not returning zero is that we may have an empty regex
|
||||
// which is ALWAYS successful at matching zero characters).
|
||||
// . REMEMBER that we only match from the start of the buffer!
|
||||
int RegEx::Match(const Buffer& buffer) const
|
||||
{
|
||||
if(!m_pOp)
|
||||
return !buffer ? 0 : -1; // the empty regex only is successful on the empty string
|
||||
|
||||
return m_pOp->Match(buffer, *this);
|
||||
}
|
||||
|
||||
int RegEx::Match(const std::string& str) const
|
||||
{
|
||||
Buffer buffer(str.c_str(), str.size());
|
||||
return Match(buffer);
|
||||
}
|
||||
|
||||
// Match
|
||||
int RegEx::Match(const Stream& in) const
|
||||
{
|
||||
return Match(in.current());
|
||||
}
|
||||
|
||||
// combination constructors
|
||||
RegEx operator ! (const RegEx& ex)
|
||||
{
|
||||
RegEx ret(REGEX_NOT);
|
||||
@@ -157,77 +57,5 @@ namespace YAML
|
||||
ret.m_params.push_back(ex2);
|
||||
return ret;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Operators
|
||||
|
||||
// MatchOperator
|
||||
int RegEx::MatchOperator::Match(const Buffer& buffer, const RegEx& regex) const
|
||||
{
|
||||
if(!buffer || buffer[0] != regex.m_a)
|
||||
return -1;
|
||||
return 1;
|
||||
}
|
||||
|
||||
// RangeOperator
|
||||
int RegEx::RangeOperator::Match(const Buffer& buffer, const RegEx& regex) const
|
||||
{
|
||||
if(!buffer || regex.m_a > buffer[0] || regex.m_z < buffer[0])
|
||||
return -1;
|
||||
return 1;
|
||||
}
|
||||
|
||||
// OrOperator
|
||||
int RegEx::OrOperator::Match(const Buffer& buffer, const RegEx& regex) const
|
||||
{
|
||||
for(unsigned i=0;i<regex.m_params.size();i++) {
|
||||
int n = regex.m_params[i].Match(buffer);
|
||||
if(n >= 0)
|
||||
return n;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
// AndOperator
|
||||
// Note: 'AND' is a little funny, since we may be required to match things
|
||||
// of different lengths. If we find a match, we return the length of
|
||||
// the FIRST entry on the list.
|
||||
int RegEx::AndOperator::Match(const Buffer& buffer, const RegEx& regex) const
|
||||
{
|
||||
int first = -1;
|
||||
for(unsigned i=0;i<regex.m_params.size();i++) {
|
||||
int n = regex.m_params[i].Match(buffer);
|
||||
if(n == -1)
|
||||
return -1;
|
||||
if(i == 0)
|
||||
first = n;
|
||||
}
|
||||
return first;
|
||||
}
|
||||
|
||||
// NotOperator
|
||||
int RegEx::NotOperator::Match(const Buffer& buffer, const RegEx& regex) const
|
||||
{
|
||||
if(regex.m_params.empty())
|
||||
return -1;
|
||||
if(regex.m_params[0].Match(buffer) >= 0)
|
||||
return -1;
|
||||
return 1;
|
||||
}
|
||||
|
||||
// SeqOperator
|
||||
int RegEx::SeqOperator::Match(const Buffer& buffer, const RegEx& regex) const
|
||||
{
|
||||
int offset = 0;
|
||||
for(unsigned i=0;i<regex.m_params.size();i++) {
|
||||
int n = regex.m_params[i].Match(buffer + offset);
|
||||
if(n == -1)
|
||||
return -1;
|
||||
|
||||
offset += n;
|
||||
}
|
||||
|
||||
return offset;
|
||||
}
|
||||
}
|
||||
|
||||
|
71
src/regex.h
71
src/regex.h
@@ -6,7 +6,6 @@
|
||||
namespace YAML
|
||||
{
|
||||
class Stream;
|
||||
struct Buffer;
|
||||
|
||||
enum REGEX_OP { REGEX_EMPTY, REGEX_MATCH, REGEX_RANGE, REGEX_OR, REGEX_AND, REGEX_NOT, REGEX_SEQ };
|
||||
|
||||
@@ -15,70 +14,46 @@ namespace YAML
|
||||
// . Only matches from start of string
|
||||
class RegEx
|
||||
{
|
||||
private:
|
||||
// the operators
|
||||
struct Operator {
|
||||
virtual ~Operator() {}
|
||||
virtual int Match(const Buffer& buffer, const RegEx& regex) const = 0;
|
||||
};
|
||||
|
||||
struct MatchOperator: public Operator {
|
||||
virtual int Match(const Buffer& buffer, const RegEx& regex) const;
|
||||
};
|
||||
|
||||
struct RangeOperator: public Operator {
|
||||
virtual int Match(const Buffer& buffer, const RegEx& regex) const;
|
||||
};
|
||||
|
||||
struct OrOperator: public Operator {
|
||||
virtual int Match(const Buffer& buffer, const RegEx& regex) const;
|
||||
};
|
||||
|
||||
struct AndOperator: public Operator {
|
||||
virtual int Match(const Buffer& buffer, const RegEx& regex) const;
|
||||
};
|
||||
|
||||
struct NotOperator: public Operator {
|
||||
virtual int Match(const Buffer& buffer, const RegEx& regex) const;
|
||||
};
|
||||
|
||||
struct SeqOperator: public Operator {
|
||||
virtual int Match(const Buffer& buffer, const RegEx& regex) const;
|
||||
};
|
||||
|
||||
public:
|
||||
friend struct Operator;
|
||||
|
||||
RegEx();
|
||||
RegEx(char ch);
|
||||
RegEx(char a, char z);
|
||||
RegEx(const std::string& str, REGEX_OP op = REGEX_SEQ);
|
||||
RegEx(const RegEx& rhs);
|
||||
~RegEx();
|
||||
|
||||
RegEx& operator = (const RegEx& rhs);
|
||||
|
||||
bool Matches(char ch) const;
|
||||
bool Matches(const std::string& str) const;
|
||||
bool Matches(const Buffer& buffer) const;
|
||||
bool Matches(const Stream& in) const;
|
||||
int Match(const std::string& str) const;
|
||||
int Match(const Buffer& buffer) const;
|
||||
int Match(const Stream& in) const;
|
||||
~RegEx() {}
|
||||
|
||||
friend RegEx operator ! (const RegEx& ex);
|
||||
friend RegEx operator || (const RegEx& ex1, const RegEx& ex2);
|
||||
friend RegEx operator && (const RegEx& ex1, const RegEx& ex2);
|
||||
friend RegEx operator + (const RegEx& ex1, const RegEx& ex2);
|
||||
|
||||
bool Matches(char ch) const;
|
||||
bool Matches(const std::string& str) const;
|
||||
bool Matches(const Stream& in) const;
|
||||
template <typename Source> bool Matches(const Source& source) const;
|
||||
|
||||
int Match(const std::string& str) const;
|
||||
int Match(const Stream& in) const;
|
||||
|
||||
private:
|
||||
RegEx(REGEX_OP op);
|
||||
void SetOp();
|
||||
|
||||
template <typename Source> bool IsValidSource(const Source& source) const;
|
||||
template <typename Source> int Match(const Source& source) const;
|
||||
template <typename Source> int MatchUnchecked(const Source& source) const;
|
||||
|
||||
template <typename Source> int MatchOpEmpty(const Source& source) const;
|
||||
template <typename Source> int MatchOpMatch(const Source& source) const;
|
||||
template <typename Source> int MatchOpRange(const Source& source) const;
|
||||
template <typename Source> int MatchOpOr(const Source& source) const;
|
||||
template <typename Source> int MatchOpAnd(const Source& source) const;
|
||||
template <typename Source> int MatchOpNot(const Source& source) const;
|
||||
template <typename Source> int MatchOpSeq(const Source& source) const;
|
||||
|
||||
private:
|
||||
REGEX_OP m_op;
|
||||
Operator *m_pOp;
|
||||
char m_a, m_z;
|
||||
std::vector <RegEx> m_params;
|
||||
};
|
||||
}
|
||||
|
||||
#include "regeximpl.h"
|
||||
|
172
src/regeximpl.h
Normal file
172
src/regeximpl.h
Normal file
@@ -0,0 +1,172 @@
|
||||
#pragma once
|
||||
|
||||
#include "stream.h"
|
||||
#include "stringsource.h"
|
||||
#include "streamcharsource.h"
|
||||
|
||||
namespace YAML
|
||||
{
|
||||
// query matches
|
||||
inline bool RegEx::Matches(char ch) const {
|
||||
std::string str;
|
||||
str += ch;
|
||||
return Matches(str);
|
||||
}
|
||||
|
||||
inline bool RegEx::Matches(const std::string& str) const {
|
||||
return Match(str) >= 0;
|
||||
}
|
||||
|
||||
inline bool RegEx::Matches(const Stream& in) const {
|
||||
return Match(in) >= 0;
|
||||
}
|
||||
|
||||
template <typename Source>
|
||||
inline bool RegEx::Matches(const Source& source) const {
|
||||
return Match(source) >= 0;
|
||||
}
|
||||
|
||||
// Match
|
||||
// . Matches the given string against this regular expression.
|
||||
// . Returns the number of characters matched.
|
||||
// . Returns -1 if no characters were matched (the reason for
|
||||
// not returning zero is that we may have an empty regex
|
||||
// which is ALWAYS successful at matching zero characters).
|
||||
// . REMEMBER that we only match from the start of the buffer!
|
||||
inline int RegEx::Match(const std::string& str) const
|
||||
{
|
||||
StringCharSource source(str.c_str(), str.size());
|
||||
return Match(source);
|
||||
}
|
||||
|
||||
inline int RegEx::Match(const Stream& in) const
|
||||
{
|
||||
StreamCharSource source(in);
|
||||
return Match(source);
|
||||
}
|
||||
|
||||
template <typename Source>
|
||||
inline bool RegEx::IsValidSource(const Source& source) const
|
||||
{
|
||||
return source;
|
||||
}
|
||||
|
||||
template<>
|
||||
inline bool RegEx::IsValidSource<StringCharSource>(const StringCharSource&source) const
|
||||
{
|
||||
return source || m_op == REGEX_EMPTY;
|
||||
}
|
||||
|
||||
template <typename Source>
|
||||
inline int RegEx::Match(const Source& source) const
|
||||
{
|
||||
return IsValidSource(source) ? MatchUnchecked(source) : -1;
|
||||
}
|
||||
|
||||
template <typename Source>
|
||||
inline int RegEx::MatchUnchecked(const Source& source) const
|
||||
{
|
||||
switch(m_op) {
|
||||
case REGEX_EMPTY:
|
||||
return MatchOpEmpty(source);
|
||||
case REGEX_MATCH:
|
||||
return MatchOpMatch(source);
|
||||
case REGEX_RANGE:
|
||||
return MatchOpRange(source);
|
||||
case REGEX_OR:
|
||||
return MatchOpOr(source);
|
||||
case REGEX_AND:
|
||||
return MatchOpAnd(source);
|
||||
case REGEX_NOT:
|
||||
return MatchOpNot(source);
|
||||
case REGEX_SEQ:
|
||||
return MatchOpSeq(source);
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Operators
|
||||
// Note: the convention MatchOp*<Source> is that we can assume IsValidSource(source).
|
||||
// So we do all our checks *before* we call these functions
|
||||
|
||||
// EmptyOperator
|
||||
template <typename Source>
|
||||
inline int RegEx::MatchOpEmpty(const Source& source) const {
|
||||
return source[0] == Stream::eof() ? 0 : -1;
|
||||
}
|
||||
|
||||
template <>
|
||||
inline int RegEx::MatchOpEmpty<StringCharSource>(const StringCharSource& source) const {
|
||||
return !source ? 0 : -1; // the empty regex only is successful on the empty string
|
||||
}
|
||||
|
||||
// MatchOperator
|
||||
template <typename Source>
|
||||
inline int RegEx::MatchOpMatch(const Source& source) const {
|
||||
if(source[0] != m_a)
|
||||
return -1;
|
||||
return 1;
|
||||
}
|
||||
|
||||
// RangeOperator
|
||||
template <typename Source>
|
||||
inline int RegEx::MatchOpRange(const Source& source) const {
|
||||
if(m_a > source[0] || m_z < source[0])
|
||||
return -1;
|
||||
return 1;
|
||||
}
|
||||
|
||||
// OrOperator
|
||||
template <typename Source>
|
||||
inline int RegEx::MatchOpOr(const Source& source) const {
|
||||
for(unsigned i=0;i<m_params.size();i++) {
|
||||
int n = m_params[i].MatchUnchecked(source);
|
||||
if(n >= 0)
|
||||
return n;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
// AndOperator
|
||||
// Note: 'AND' is a little funny, since we may be required to match things
|
||||
// of different lengths. If we find a match, we return the length of
|
||||
// the FIRST entry on the list.
|
||||
template <typename Source>
|
||||
inline int RegEx::MatchOpAnd(const Source& source) const {
|
||||
int first = -1;
|
||||
for(unsigned i=0;i<m_params.size();i++) {
|
||||
int n = m_params[i].MatchUnchecked(source);
|
||||
if(n == -1)
|
||||
return -1;
|
||||
if(i == 0)
|
||||
first = n;
|
||||
}
|
||||
return first;
|
||||
}
|
||||
|
||||
// NotOperator
|
||||
template <typename Source>
|
||||
inline int RegEx::MatchOpNot(const Source& source) const {
|
||||
if(m_params.empty())
|
||||
return -1;
|
||||
if(m_params[0].MatchUnchecked(source) >= 0)
|
||||
return -1;
|
||||
return 1;
|
||||
}
|
||||
|
||||
// SeqOperator
|
||||
template <typename Source>
|
||||
inline int RegEx::MatchOpSeq(const Source& source) const {
|
||||
int offset = 0;
|
||||
for(unsigned i=0;i<m_params.size();i++) {
|
||||
int n = m_params[i].Match(source + offset); // note Match, not MatchUnchecked because we need to check validity after the offset
|
||||
if(n == -1)
|
||||
return -1;
|
||||
offset += n;
|
||||
}
|
||||
|
||||
return offset;
|
||||
}
|
||||
}
|
415
src/stream.cpp
415
src/stream.cpp
@@ -1,52 +1,260 @@
|
||||
#include "crt.h"
|
||||
#include "stream.h"
|
||||
#include <iostream>
|
||||
#include "exp.h"
|
||||
|
||||
#ifndef YAML_PREFETCH_SIZE
|
||||
#define YAML_PREFETCH_SIZE 2048
|
||||
#endif
|
||||
|
||||
#define S_ARRAY_SIZE( A ) (sizeof(A)/sizeof(*(A)))
|
||||
#define S_ARRAY_END( A ) ((A) + S_ARRAY_SIZE(A))
|
||||
|
||||
#define CP_REPLACEMENT_CHARACTER (0xFFFD)
|
||||
|
||||
namespace YAML
|
||||
{
|
||||
Stream::Stream(std::istream& input): pos(0), line(0), column(0), size(0), buffer(0)
|
||||
// States of the automaton that inspects the first bytes of the input to
// pick a character set. The numeric values index the rows of
// s_introFinalState, s_introTransitions and s_introUngetCount, so the
// order of the enumerators must not change.
enum UtfIntroState {
	uis_start,          //  0
	uis_utfbe_b1,       //  1
	uis_utf32be_b2,     //  2
	uis_utf32be_bom3,   //  3
	uis_utf32be,        //  4
	uis_utf16be,        //  5
	uis_utf16be_bom1,   //  6
	uis_utfle_bom1,     //  7
	uis_utf16le_bom2,   //  8
	uis_utf32le_bom3,   //  9
	uis_utf16le,        // 10
	uis_utf32le,        // 11
	uis_utf8_imp,       // 12
	uis_utf16le_imp,    // 13
	uis_utf32le_imp3,   // 14
	uis_utf8_bom1,      // 15
	uis_utf8_bom2,      // 16
	uis_utf8,           // 17
	uis_error           // 18
};
|
||||
|
||||
// Byte classes fed to the intro automaton. The numeric values index the
// columns of s_introTransitions and s_introUngetCount; uictMax is the
// number of classes, not a class itself.
enum UtfIntroCharType {
	uict00,      // 0
	uictBB,      // 1
	uictBF,      // 2
	uictEF,      // 3
	uictFE,      // 4
	uictFF,      // 5
	uictAscii,   // 6
	uictOther,   // 7
	uictMax      // 8 (column count)
};
|
||||
|
||||
// For each UtfIntroState (in enumerator order): true when the intro
// automaton stops in that state, false when it must read another byte.
static bool s_introFinalState[] = {
	false,   // uis_start
	false,   // uis_utfbe_b1
	false,   // uis_utf32be_b2
	false,   // uis_utf32be_bom3
	true,    // uis_utf32be
	true,    // uis_utf16be
	false,   // uis_utf16be_bom1
	false,   // uis_utfle_bom1
	false,   // uis_utf16le_bom2
	false,   // uis_utf32le_bom3
	true,    // uis_utf16le
	true,    // uis_utf32le
	false,   // uis_utf8_imp
	false,   // uis_utf16le_imp
	false,   // uis_utf32le_imp3
	false,   // uis_utf8_bom1
	false,   // uis_utf8_bom2
	true,    // uis_utf8
	true,    // uis_error
};
|
||||
|
||||
// Transition table of the intro automaton: s_introTransitions[state][charType]
// is the state entered after reading a byte of class charType in `state`.
// Rows follow the UtfIntroState order; there is no row for uis_error.
static UtfIntroState s_introTransitions[][uictMax] = {
	// uict00,           uictBB,        uictBF,      uictEF,        uictFE,           uictFF,         uictAscii,    uictOther
	{uis_utfbe_b1,     uis_utf8,      uis_utf8,    uis_utf8_bom1, uis_utf16be_bom1, uis_utfle_bom1, uis_utf8_imp, uis_utf8},     // uis_start
	{uis_utf32be_b2,   uis_utf8,      uis_utf8,    uis_utf8,      uis_utf8,         uis_utf8,       uis_utf16be,  uis_utf8},     // uis_utfbe_b1
	{uis_utf32be,      uis_utf8,      uis_utf8,    uis_utf8,      uis_utf32be_bom3, uis_utf8,       uis_utf8,     uis_utf8},     // uis_utf32be_b2
	{uis_utf8,         uis_utf8,      uis_utf8,    uis_utf8,      uis_utf8,         uis_utf32be,    uis_utf8,     uis_utf8},     // uis_utf32be_bom3
	{uis_utf32be,      uis_utf32be,   uis_utf32be, uis_utf32be,   uis_utf32be,      uis_utf32be,    uis_utf32be,  uis_utf32be},  // uis_utf32be
	{uis_utf16be,      uis_utf16be,   uis_utf16be, uis_utf16be,   uis_utf16be,      uis_utf16be,    uis_utf16be,  uis_utf16be},  // uis_utf16be
	{uis_utf8,         uis_utf8,      uis_utf8,    uis_utf8,      uis_utf8,         uis_utf16be,    uis_utf8,     uis_utf8},     // uis_utf16be_bom1
	{uis_utf8,         uis_utf8,      uis_utf8,    uis_utf8,      uis_utf16le_bom2, uis_utf8,       uis_utf8,     uis_utf8},     // uis_utfle_bom1
	{uis_utf32le_bom3, uis_utf16le,   uis_utf16le, uis_utf16le,   uis_utf16le,      uis_utf16le,    uis_utf16le,  uis_utf16le},  // uis_utf16le_bom2
	{uis_utf32le,      uis_utf16le,   uis_utf16le, uis_utf16le,   uis_utf16le,      uis_utf16le,    uis_utf16le,  uis_utf16le},  // uis_utf32le_bom3
	{uis_utf16le,      uis_utf16le,   uis_utf16le, uis_utf16le,   uis_utf16le,      uis_utf16le,    uis_utf16le,  uis_utf16le},  // uis_utf16le
	{uis_utf32le,      uis_utf32le,   uis_utf32le, uis_utf32le,   uis_utf32le,      uis_utf32le,    uis_utf32le,  uis_utf32le},  // uis_utf32le
	{uis_utf16le_imp,  uis_utf8,      uis_utf8,    uis_utf8,      uis_utf8,         uis_utf8,       uis_utf8,     uis_utf8},     // uis_utf8_imp
	{uis_utf32le_imp3, uis_utf16le,   uis_utf16le, uis_utf16le,   uis_utf16le,      uis_utf16le,    uis_utf16le,  uis_utf16le},  // uis_utf16le_imp
	{uis_utf32le,      uis_utf16le,   uis_utf16le, uis_utf16le,   uis_utf16le,      uis_utf16le,    uis_utf16le,  uis_utf16le},  // uis_utf32le_imp3
	{uis_utf8,         uis_utf8_bom2, uis_utf8,    uis_utf8,      uis_utf8,         uis_utf8,       uis_utf8,     uis_utf8},     // uis_utf8_bom1
	{uis_utf8,         uis_utf8,      uis_utf8,    uis_utf8,      uis_utf8,         uis_utf8,       uis_utf8,     uis_utf8},     // uis_utf8_bom2
	{uis_utf8,         uis_utf8,      uis_utf8,    uis_utf8,      uis_utf8,         uis_utf8,       uis_utf8,     uis_utf8},     // uis_utf8
};
|
||||
|
||||
// s_introUngetCount[state][charType] is the number of already-read intro
// bytes that are handed back to the input when a byte of class charType
// is read in `state`. Rows follow the UtfIntroState order; there is no
// row for uis_error.
static char s_introUngetCount[][uictMax] = {
	// uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther
	{0, 1, 1, 0, 0, 0, 0, 1},   // uis_start
	{0, 2, 2, 2, 2, 2, 2, 2},   // uis_utfbe_b1
	{3, 3, 3, 3, 0, 3, 3, 3},   // uis_utf32be_b2
	{4, 4, 4, 4, 4, 0, 4, 4},   // uis_utf32be_bom3
	{1, 1, 1, 1, 1, 1, 1, 1},   // uis_utf32be
	{1, 1, 1, 1, 1, 1, 1, 1},   // uis_utf16be
	{2, 2, 2, 2, 2, 0, 2, 2},   // uis_utf16be_bom1
	{2, 2, 2, 2, 0, 2, 2, 2},   // uis_utfle_bom1
	{0, 1, 1, 1, 1, 1, 1, 1},   // uis_utf16le_bom2
	{0, 2, 2, 2, 2, 2, 2, 2},   // uis_utf32le_bom3
	{1, 1, 1, 1, 1, 1, 1, 1},   // uis_utf16le
	{1, 1, 1, 1, 1, 1, 1, 1},   // uis_utf32le
	{0, 2, 2, 2, 2, 2, 2, 2},   // uis_utf8_imp
	{0, 3, 3, 3, 3, 3, 3, 3},   // uis_utf16le_imp
	{4, 4, 4, 4, 4, 4, 4, 4},   // uis_utf32le_imp3
	{2, 0, 2, 2, 2, 2, 2, 2},   // uis_utf8_bom1
	{3, 3, 0, 3, 3, 3, 3, 3},   // uis_utf8_bom2
	{1, 1, 1, 1, 1, 1, 1, 1},   // uis_utf8
};
|
||||
|
||||
// IntroCharTypeOf
// . Classifies one value returned by istream::get() into the byte class
//   used as column index of the intro tables.
// . End-of-file and anything outside 0..0xFF is uictOther; the six marker
//   bytes get their own class; every remaining value in 1..0xFE is
//   reported as uictAscii.
inline UtfIntroCharType IntroCharTypeOf(std::istream::int_type ch)
{
	if (ch == std::istream::traits_type::eof())
		return uictOther;

	switch (ch) {
		case 0x00: return uict00;
		case 0xBB: return uictBB;
		case 0xBF: return uictBF;
		case 0xEF: return uictEF;
		case 0xFE: return uictFE;
		case 0xFF: return uictFF;
		default:   break;
	}

	const bool inByteRange = (0 < ch) && (ch < 0xFF);
	return inByteRange ? uictAscii : uictOther;
}
|
||||
|
||||
// Utf8Adjust
// . Builds one byte of a UTF-8 sequence from code point `ch`.
// . lead_bits: number of leading 1 bits in the byte's marker (0 for a
//   single-byte character, 1 for a continuation byte, 2..4 for the first
//   byte of a 2..4 byte sequence).
// . rshift: how far `ch` is shifted right to bring this byte's payload
//   bits into the low positions.
inline char Utf8Adjust(unsigned long ch, unsigned char lead_bits, unsigned char rshift)
{
	// lead_bits ones at the top of the byte; the next bit stays zero
	const unsigned char marker = ((1 << lead_bits) - 1) << (8 - lead_bits);
	// payload bits are whatever is left below the marker and its zero bit
	const unsigned char payloadMask = (0xFF >> (lead_bits + 1));

	const unsigned long payload = (ch >> rshift) & payloadMask;
	const unsigned char byte = static_cast<unsigned char>(marker | payload);
	return static_cast<char>(byte);
}
|
||||
|
||||
inline void QueueUnicodeCodepoint(std::deque<char>& q, unsigned long ch)
|
||||
{
|
||||
// We are not allowed to queue the Stream::eof() codepoint, so
|
||||
// replace it with CP_REPLACEMENT_CHARACTER
|
||||
if (static_cast<unsigned long>(Stream::eof()) == ch)
|
||||
{
|
||||
ch = CP_REPLACEMENT_CHARACTER;
|
||||
}
|
||||
|
||||
if (ch < 0x80)
|
||||
{
|
||||
q.push_back(Utf8Adjust(ch, 0, 0));
|
||||
}
|
||||
else if (ch < 0x800)
|
||||
{
|
||||
q.push_back(Utf8Adjust(ch, 2, 6));
|
||||
q.push_back(Utf8Adjust(ch, 1, 0));
|
||||
}
|
||||
else if (ch < 0x10000)
|
||||
{
|
||||
q.push_back(Utf8Adjust(ch, 3, 12));
|
||||
q.push_back(Utf8Adjust(ch, 1, 6));
|
||||
q.push_back(Utf8Adjust(ch, 1, 0));
|
||||
}
|
||||
else
|
||||
{
|
||||
q.push_back(Utf8Adjust(ch, 4, 18));
|
||||
q.push_back(Utf8Adjust(ch, 1, 12));
|
||||
q.push_back(Utf8Adjust(ch, 1, 6));
|
||||
q.push_back(Utf8Adjust(ch, 1, 0));
|
||||
}
|
||||
}
|
||||
|
||||
Stream::Stream(std::istream& input)
|
||||
: pos(0), line(0), column(0), m_input(input), m_nPushedBack(0),
|
||||
m_pPrefetched(new unsigned char[YAML_PREFETCH_SIZE]),
|
||||
m_nPrefetchedAvailable(0), m_nPrefetchedUsed(0)
|
||||
{
|
||||
typedef std::istream::traits_type char_traits;
|
||||
|
||||
if(!input)
|
||||
return;
|
||||
|
||||
std::streambuf *pBuf = input.rdbuf();
|
||||
// Determine (or guess) the character-set by reading the BOM, if any. See
|
||||
// the YAML specification for the determination algorithm.
|
||||
char_traits::int_type intro[4];
|
||||
int nIntroUsed = 0;
|
||||
UtfIntroState state = uis_start;
|
||||
for (; !s_introFinalState[state]; ) {
|
||||
std::istream::int_type ch = input.get();
|
||||
intro[nIntroUsed++] = ch;
|
||||
UtfIntroCharType charType = IntroCharTypeOf(ch);
|
||||
UtfIntroState newState = s_introTransitions[state][charType];
|
||||
int nUngets = s_introUngetCount[state][charType];
|
||||
if (nUngets > 0) {
|
||||
for (; nUngets > 0; --nUngets) {
|
||||
if (char_traits::eof() != intro[--nIntroUsed]) {
|
||||
m_bufPushback[m_nPushedBack++] =
|
||||
char_traits::to_char_type(intro[nIntroUsed]);
|
||||
}
|
||||
}
|
||||
}
|
||||
state = newState;
|
||||
}
|
||||
|
||||
// store entire file in buffer
|
||||
size = pBuf->pubseekoff(0, std::ios::end, std::ios::in);
|
||||
pBuf->pubseekpos(0, std::ios::in);
|
||||
buffer = new char[size];
|
||||
size = pBuf->sgetn(buffer, size); // Note: when reading a Windows CR/LF file,
|
||||
// pubseekoff() counts CR/LF as two characters,
|
||||
// setgn() reads CR/LF as a single LF character!
|
||||
switch (state) {
|
||||
case uis_utf8: m_charSet = utf8; break;
|
||||
case uis_utf16le: m_charSet = utf16le; break;
|
||||
case uis_utf16be: m_charSet = utf16be; break;
|
||||
case uis_utf32le: m_charSet = utf32le; break;
|
||||
case uis_utf32be: m_charSet = utf32be; break;
|
||||
default: m_charSet = utf8; break;
|
||||
}
|
||||
|
||||
ReadAheadTo(0);
|
||||
}
|
||||
|
||||
Stream::~Stream()
|
||||
{
|
||||
delete [] buffer;
|
||||
delete[] m_pPrefetched;
|
||||
}
|
||||
|
||||
|
||||
char Stream::peek()
|
||||
char Stream::peek() const
|
||||
{
|
||||
return buffer[pos];
|
||||
if (m_readahead.empty())
|
||||
{
|
||||
return Stream::eof();
|
||||
}
|
||||
|
||||
return m_readahead[0];
|
||||
}
|
||||
|
||||
Stream::operator bool() const
|
||||
{
|
||||
return pos < size;
|
||||
return m_input.good() || (!m_readahead.empty() && m_readahead[0] != Stream::eof());
|
||||
}
|
||||
|
||||
// get
|
||||
// . Extracts a character from the stream and updates our position
|
||||
char Stream::get()
|
||||
{
|
||||
char ch = buffer[pos];
|
||||
pos++;
|
||||
char ch = peek();
|
||||
AdvanceCurrent();
|
||||
column++;
|
||||
|
||||
if(ch == '\n') {
|
||||
column = 0;
|
||||
line++;
|
||||
}
|
||||
|
||||
return ch;
|
||||
}
|
||||
|
||||
@@ -69,4 +277,179 @@ namespace YAML
|
||||
get();
|
||||
}
|
||||
|
||||
void Stream::AdvanceCurrent()
|
||||
{
|
||||
if (!m_readahead.empty())
|
||||
{
|
||||
m_readahead.pop_front();
|
||||
++pos;
|
||||
}
|
||||
|
||||
ReadAheadTo(0);
|
||||
}
|
||||
|
||||
bool Stream::_ReadAheadTo(size_t i) const
|
||||
{
|
||||
while (m_input.good() && (m_readahead.size() <= i))
|
||||
{
|
||||
switch (m_charSet)
|
||||
{
|
||||
case utf8: StreamInUtf8(); break;
|
||||
case utf16le: StreamInUtf16(); break;
|
||||
case utf16be: StreamInUtf16(); break;
|
||||
case utf32le: StreamInUtf32(); break;
|
||||
case utf32be: StreamInUtf32(); break;
|
||||
}
|
||||
}
|
||||
|
||||
// signal end of stream
|
||||
if(!m_input.good())
|
||||
m_readahead.push_back(Stream::eof());
|
||||
|
||||
return m_readahead.size() > i;
|
||||
}
|
||||
|
||||
void Stream::StreamInUtf8() const
|
||||
{
|
||||
unsigned char b = GetNextByte();
|
||||
if (m_input.good())
|
||||
{
|
||||
m_readahead.push_back(b);
|
||||
}
|
||||
}
|
||||
|
||||
void Stream::StreamInUtf16() const
|
||||
{
|
||||
unsigned long ch = 0;
|
||||
unsigned char bytes[2];
|
||||
int nBigEnd = (m_charSet == utf16be) ? 0 : 1;
|
||||
|
||||
bytes[0] = GetNextByte();
|
||||
bytes[1] = GetNextByte();
|
||||
if (!m_input.good())
|
||||
{
|
||||
return;
|
||||
}
|
||||
ch = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
|
||||
static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
|
||||
|
||||
if (ch >= 0xDC00 && ch < 0xE000)
|
||||
{
|
||||
// Trailing (low) surrogate...ugh, wrong order
|
||||
QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
|
||||
return;
|
||||
}
|
||||
else if (ch >= 0xD800 && ch < 0xDC00)
|
||||
{
|
||||
// ch is a leading (high) surrogate
|
||||
|
||||
// Four byte UTF-8 code point
|
||||
|
||||
// Read the trailing (low) surrogate
|
||||
for (;;)
|
||||
{
|
||||
bytes[0] = GetNextByte();
|
||||
bytes[1] = GetNextByte();
|
||||
if (!m_input.good())
|
||||
{
|
||||
QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
|
||||
return;
|
||||
}
|
||||
unsigned long chLow = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
|
||||
static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
|
||||
if (chLow < 0xDC00 || ch >= 0xE000)
|
||||
{
|
||||
// Trouble...not a low surrogate. Dump a REPLACEMENT CHARACTER into the stream.
|
||||
QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
|
||||
|
||||
// Deal with the next UTF-16 unit
|
||||
if (chLow < 0xD800 || ch >= 0xE000)
|
||||
{
|
||||
// Easiest case: queue the codepoint and return
|
||||
QueueUnicodeCodepoint(m_readahead, ch);
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Start the loop over with the new high surrogate
|
||||
ch = chLow;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Select the payload bits from the high surrogate
|
||||
ch &= 0x3FF;
|
||||
ch <<= 10;
|
||||
|
||||
// Include bits from low surrogate
|
||||
ch |= (chLow & 0x3FF);
|
||||
|
||||
// Add the surrogacy offset
|
||||
ch += 0x10000;
|
||||
}
|
||||
}
|
||||
|
||||
QueueUnicodeCodepoint(m_readahead, ch);
|
||||
}
|
||||
|
||||
// ReadBuffer
// . Reinterprets an unsigned char buffer as a char buffer, e.g. for
//   passing to streambuf::sgetn. No bytes are copied.
inline char* ReadBuffer(unsigned char* pBuffer)
{
	char* const asChars = reinterpret_cast<char*>(pBuffer);
	return asChars;
}
|
||||
|
||||
unsigned char Stream::GetNextByte() const
|
||||
{
|
||||
if (m_nPushedBack)
|
||||
{
|
||||
return m_bufPushback[--m_nPushedBack];
|
||||
}
|
||||
|
||||
if (m_nPrefetchedUsed >= m_nPrefetchedAvailable)
|
||||
{
|
||||
std::streambuf *pBuf = m_input.rdbuf();
|
||||
m_nPrefetchedAvailable = pBuf->sgetn(ReadBuffer(m_pPrefetched),
|
||||
YAML_PREFETCH_SIZE);
|
||||
m_nPrefetchedUsed = 0;
|
||||
if (!m_nPrefetchedAvailable)
|
||||
{
|
||||
m_input.setstate(std::ios_base::eofbit);
|
||||
}
|
||||
|
||||
if (0 == m_nPrefetchedAvailable)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
return m_pPrefetched[m_nPrefetchedUsed++];
|
||||
}
|
||||
|
||||
void Stream::StreamInUtf32() const
|
||||
{
|
||||
static int indexes[2][4] = {
|
||||
{3, 2, 1, 0},
|
||||
{0, 1, 2, 3}
|
||||
};
|
||||
|
||||
unsigned long ch = 0;
|
||||
unsigned char bytes[4];
|
||||
int* pIndexes = (m_charSet == utf32be) ? indexes[1] : indexes[0];
|
||||
|
||||
bytes[0] = GetNextByte();
|
||||
bytes[1] = GetNextByte();
|
||||
bytes[2] = GetNextByte();
|
||||
bytes[3] = GetNextByte();
|
||||
if (!m_input.good())
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
for (int i = 0; i < 4; ++i)
|
||||
{
|
||||
ch <<= 8;
|
||||
ch |= bytes[pIndexes[i]];
|
||||
}
|
||||
|
||||
QueueUnicodeCodepoint(m_readahead, ch);
|
||||
}
|
||||
}
|
||||
|
58
src/stream.h
58
src/stream.h
@@ -1,42 +1,66 @@
|
||||
#pragma once
|
||||
|
||||
#include <deque>
|
||||
#include <ios>
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
#include <set>
|
||||
|
||||
namespace YAML
|
||||
{
|
||||
// a simple buffer wrapper that knows how big it is
|
||||
struct Buffer {
|
||||
Buffer(const char *b, int s): buffer(b), size(s) {}
|
||||
|
||||
operator bool() const { return size > 0; }
|
||||
bool operator !() const { return !static_cast <bool> (*this); }
|
||||
char operator [] (int i) const { return buffer[i]; }
|
||||
const Buffer operator + (int offset) const { return Buffer(buffer + offset, size - offset); }
|
||||
Buffer& operator ++ () { ++buffer; --size; return *this; }
|
||||
|
||||
const char *buffer;
|
||||
int size;
|
||||
};
|
||||
static const size_t MAX_PARSER_PUSHBACK = 8;
|
||||
|
||||
class Stream
|
||||
{
|
||||
public:
|
||||
friend class StreamCharSource;
|
||||
|
||||
Stream(std::istream& input);
|
||||
~Stream();
|
||||
|
||||
operator bool() const;
|
||||
bool operator !() const { return !static_cast <bool>(*this); }
|
||||
|
||||
const Buffer current() const { return Buffer(buffer + pos, size - pos); }
|
||||
char peek();
|
||||
char peek() const;
|
||||
char get();
|
||||
std::string get(int n);
|
||||
void eat(int n = 1);
|
||||
|
||||
int pos, line, column, size;
|
||||
static char eof() { return 0x04; }
|
||||
|
||||
int pos, line, column;
|
||||
|
||||
private:
|
||||
char *buffer;
|
||||
enum CharacterSet {utf8, utf16le, utf16be, utf32le, utf32be};
|
||||
|
||||
std::istream& m_input;
|
||||
CharacterSet m_charSet;
|
||||
unsigned char m_bufPushback[MAX_PARSER_PUSHBACK];
|
||||
mutable size_t m_nPushedBack;
|
||||
mutable std::deque<char> m_readahead;
|
||||
unsigned char* const m_pPrefetched;
|
||||
mutable size_t m_nPrefetchedAvailable;
|
||||
mutable size_t m_nPrefetchedUsed;
|
||||
|
||||
void AdvanceCurrent();
|
||||
char CharAt(size_t i) const;
|
||||
bool ReadAheadTo(size_t i) const;
|
||||
bool _ReadAheadTo(size_t i) const;
|
||||
void StreamInUtf8() const;
|
||||
void StreamInUtf16() const;
|
||||
void StreamInUtf32() const;
|
||||
unsigned char GetNextByte() const;
|
||||
};
|
||||
|
||||
// CharAt
|
||||
// . Unchecked access
|
||||
inline char Stream::CharAt(size_t i) const {
|
||||
return m_readahead[i];
|
||||
}
|
||||
|
||||
inline bool Stream::ReadAheadTo(size_t i) const {
|
||||
if(m_readahead.size() > i)
|
||||
return true;
|
||||
return _ReadAheadTo(i);
|
||||
}
|
||||
}
|
||||
|
39
src/streamcharsource.h
Normal file
39
src/streamcharsource.h
Normal file
@@ -0,0 +1,39 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
namespace YAML
|
||||
{
|
||||
class StreamCharSource
|
||||
{
|
||||
public:
|
||||
StreamCharSource(const Stream& stream);
|
||||
~StreamCharSource() {}
|
||||
|
||||
operator bool() const;
|
||||
char operator [] (std::size_t i) const { return m_stream.CharAt(m_offset + i); }
|
||||
bool operator !() const { return !static_cast<bool>(*this); }
|
||||
|
||||
const StreamCharSource operator + (int i) const;
|
||||
|
||||
private:
|
||||
std::size_t m_offset;
|
||||
const Stream& m_stream;
|
||||
};
|
||||
|
||||
// Starts at offset 0 of the given stream; only a reference to the stream is kept.
inline StreamCharSource::StreamCharSource(const Stream& stream)
	: m_offset(0)
	, m_stream(stream)
{
}
|
||||
|
||||
// Valid exactly when the stream reports it can read ahead to m_offset.
inline StreamCharSource::operator bool() const {
	const bool available = m_stream.ReadAheadTo(m_offset);
	return available;
}
|
||||
|
||||
// Returns a copy whose offset is moved by i; a move that would land before
// the start resets the offset to 0 instead.
inline const StreamCharSource StreamCharSource::operator + (int i) const {
	StreamCharSource shifted(*this);
	const bool staysInRange = (static_cast<int> (shifted.m_offset) + i) >= 0;
	shifted.m_offset = staysInRange ? shifted.m_offset + i : 0;
	return shifted;
}
|
||||
}
|
34
src/stringsource.h
Normal file
34
src/stringsource.h
Normal file
@@ -0,0 +1,34 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
namespace YAML
|
||||
{
|
||||
// Character source over a caller-supplied character array of known length,
// read through a movable offset. Only the pointer and sizes are stored.
class StringCharSource
{
public:
	StringCharSource(const char *str, std::size_t size): m_str(str), m_size(size), m_offset(0) {}

	// true while the offset is still inside the array
	operator bool() const { return m_size > m_offset; }
	// unchecked read of the character i positions past the offset
	char operator [] (std::size_t i) const { return *(m_str + m_offset + i); }
	bool operator !() const { return m_offset >= m_size; }

	// copy of this source whose offset is moved by i; a move that would land
	// before the start resets the offset to 0 instead
	const StringCharSource operator + (int i) const {
		StringCharSource shifted(*this);
		const bool staysInRange = (static_cast<int> (shifted.m_offset) + i) >= 0;
		shifted.m_offset = staysInRange ? shifted.m_offset + i : 0;
		return shifted;
	}

	// advance by one character in place
	StringCharSource& operator ++ () {
		m_offset += 1;
		return *this;
	}
private:
	const char *m_str;
	std::size_t m_size;
	std::size_t m_offset;
};
|
||||
}
|
2
util/CMakeLists.txt
Normal file
2
util/CMakeLists.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
# Build the `parse` executable from parse.cpp.
add_executable(parse parse.cpp)
|
||||
# Link it against the yaml-cpp target.
target_link_libraries(parse yaml-cpp)
|
21
util/parse.cpp
Normal file
21
util/parse.cpp
Normal file
@@ -0,0 +1,21 @@
|
||||
#include "yaml.h"
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
if(argc != 2) {
|
||||
std::cout << "Usage: " << argv[0] << " input-file\n";
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::ifstream fin(argv[1]);
|
||||
try {
|
||||
YAML::Parser parser(fin);
|
||||
YAML::Node doc;
|
||||
parser.GetNextDocument(doc);
|
||||
} catch(const YAML::Exception& e) {
|
||||
std::cerr << "Error at line " << e.line << ", col " << e.column << ": " << e.msg << "\n";
|
||||
}
|
||||
return 0;
|
||||
}
|
@@ -72,6 +72,174 @@ namespace Test
|
||||
std::cout << "Caught exception: " << error << "\n";
|
||||
}
|
||||
}
|
||||
|
||||
// Signature shared by the EncodeTo* helpers: write the code point given as
// the int argument to the stream in one particular encoding.
typedef void (*EncodingFn)(std::ostream&, int);
|
||||
|
||||
// Narrows an int to a char by way of unsigned int and unsigned char.
inline char Byte(int ch)
{
	const unsigned int bits = static_cast<unsigned int>(ch);
	const unsigned char lowByte = static_cast<unsigned char>(bits);
	return static_cast<char>(lowByte);
}
|
||||
|
||||
void EncodeToUtf8(std::ostream& stream, int ch)
|
||||
{
|
||||
if (ch <= 0x7F)
|
||||
{
|
||||
stream << Byte(ch);
|
||||
}
|
||||
else if (ch <= 0x7FF)
|
||||
{
|
||||
stream << Byte(0xC0 | (ch >> 6));
|
||||
stream << Byte(0x80 | (ch & 0x3F));
|
||||
}
|
||||
else if (ch <= 0xFFFF)
|
||||
{
|
||||
stream << Byte(0xE0 | (ch >> 12));
|
||||
stream << Byte(0x80 | ((ch >> 6) & 0x3F));
|
||||
stream << Byte(0x80 | (ch & 0x3F));
|
||||
}
|
||||
else if (ch <= 0x1FFFFF)
|
||||
{
|
||||
stream << Byte(0xF0 | (ch >> 18));
|
||||
stream << Byte(0x80 | ((ch >> 12) & 0x3F));
|
||||
stream << Byte(0x80 | ((ch >> 6) & 0x3F));
|
||||
stream << Byte(0x80 | (ch & 0x3F));
|
||||
}
|
||||
}
|
||||
|
||||
bool SplitUtf16HighChar(std::ostream& stream, EncodingFn encoding, int ch)
|
||||
{
|
||||
int biasedValue = ch - 0x10000;
|
||||
if (biasedValue < 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
int high = 0xD800 | (biasedValue >> 10);
|
||||
int low = 0xDC00 | (biasedValue & 0x3FF);
|
||||
encoding(stream, high);
|
||||
encoding(stream, low);
|
||||
return true;
|
||||
}
|
||||
|
||||
void EncodeToUtf16LE(std::ostream& stream, int ch)
|
||||
{
|
||||
if (!SplitUtf16HighChar(stream, &EncodeToUtf16LE, ch))
|
||||
{
|
||||
stream << Byte(ch & 0xFF) << Byte(ch >> 8);
|
||||
}
|
||||
}
|
||||
|
||||
void EncodeToUtf16BE(std::ostream& stream, int ch)
|
||||
{
|
||||
if (!SplitUtf16HighChar(stream, &EncodeToUtf16BE, ch))
|
||||
{
|
||||
stream << Byte(ch >> 8) << Byte(ch & 0xFF);
|
||||
}
|
||||
}
|
||||
|
||||
void EncodeToUtf32LE(std::ostream& stream, int ch)
|
||||
{
|
||||
stream << Byte(ch & 0xFF) << Byte((ch >> 8) & 0xFF)
|
||||
<< Byte((ch >> 16) & 0xFF) << Byte((ch >> 24) & 0xFF);
|
||||
}
|
||||
|
||||
void EncodeToUtf32BE(std::ostream& stream, int ch)
|
||||
{
|
||||
stream << Byte((ch >> 24) & 0xFF) << Byte((ch >> 16) & 0xFF)
|
||||
<< Byte((ch >> 8) & 0xFF) << Byte(ch & 0xFF);
|
||||
}
|
||||
|
||||
class EncodingTester
|
||||
{
|
||||
public:
|
||||
EncodingTester(EncodingFn encoding, bool declareEncoding)
|
||||
{
|
||||
if (declareEncoding)
|
||||
{
|
||||
encoding(m_yaml, 0xFEFF);
|
||||
}
|
||||
|
||||
AddEntry(encoding, 0x0021, 0x007E); // Basic Latin
|
||||
AddEntry(encoding, 0x00A1, 0x00FF); // Latin-1 Supplement
|
||||
AddEntry(encoding, 0x0660, 0x06FF); // Arabic (largest contiguous block)
|
||||
|
||||
// CJK unified ideographs (multiple lines)
|
||||
AddEntry(encoding, 0x4E00, 0x4EFF);
|
||||
AddEntry(encoding, 0x4F00, 0x4FFF);
|
||||
AddEntry(encoding, 0x5000, 0x51FF); // 512 character line
|
||||
AddEntry(encoding, 0x5200, 0x54FF); // 768 character line
|
||||
AddEntry(encoding, 0x5500, 0x58FF); // 1024 character line
|
||||
|
||||
AddEntry(encoding, 0x103A0, 0x103C3); // Old Persian
|
||||
|
||||
m_yaml.seekg(0, std::ios::beg);
|
||||
}
|
||||
|
||||
std::istream& stream() {return m_yaml;}
|
||||
const std::vector<std::string>& entries() {return m_entries;}
|
||||
|
||||
private:
|
||||
std::stringstream m_yaml;
|
||||
std::vector<std::string> m_entries;
|
||||
|
||||
void AddEntry(EncodingFn encoding, int startCh, int endCh)
|
||||
{
|
||||
encoding(m_yaml, '-');
|
||||
encoding(m_yaml, ' ');
|
||||
encoding(m_yaml, '|');
|
||||
encoding(m_yaml, '\n');
|
||||
encoding(m_yaml, ' ');
|
||||
encoding(m_yaml, ' ');
|
||||
|
||||
std::stringstream entry;
|
||||
for (int ch = startCh; ch <= endCh; ++ch)
|
||||
{
|
||||
encoding(m_yaml, ch);
|
||||
EncodeToUtf8(entry, ch);
|
||||
}
|
||||
encoding(m_yaml, '\n');
|
||||
|
||||
m_entries.push_back(entry.str());
|
||||
}
|
||||
};
|
||||
|
||||
// Parses the document produced by an EncodingTester for the given encoding
// and checks that each sequence item matches the recorded UTF-8 entry.
//
// encoding:        encoder used to write the test document
// declareEncoding: whether the document starts with U+FEFF
// name:            label printed with the pass/fail message
// passed:          set to false on failure; left untouched on success
void RunEncodingTest(EncodingFn encoding, bool declareEncoding, const std::string& name, bool& passed)
{
	EncodingTester tester(encoding, declareEncoding);
	std::string error;
	bool ok = true;
	try {
		YAML::Parser parser(tester.stream());
		YAML::Node doc;
		parser.GetNextDocument(doc);

		YAML::Iterator itNode = doc.begin();
		std::vector<std::string>::const_iterator itEntry = tester.entries().begin();
		for (; (itNode != doc.end()) && (itEntry != tester.entries().end()); ++itNode, ++itEntry)
		{
			std::string stScalarValue;
			// Stop at the first item that is not a scalar ...
			if (!itNode->GetScalar(stScalarValue))
			{
				break;
			}
			// ... or whose text differs from the expected entry. The entries
			// are recorded without the line break that ends the literal
			// block, so a value carrying that one trailing "\n" is accepted
			// as well.
			if ((stScalarValue != *itEntry) && (stScalarValue != *itEntry + "\n"))
			{
				break;
			}
		}

		// Leaving the loop early, or having a different number of items than
		// entries, leaves at least one iterator short of its end.
		if ((itNode != doc.end()) || (itEntry != tester.entries().end()))
		{
			ok = false;
		}
	} catch(const YAML::Exception& e) {
		ok = false;
		error = e.msg;
	}
	if(ok) {
		std::cout << "Parser test passed: " << name << "\n";
	} else {
		passed = false;
		std::cout << "Parser test failed: " << name << "\n";
		if(!error.empty())
			std::cout << "Caught exception: " << error << "\n";
	}
}
|
||||
}
|
||||
|
||||
bool RunParserTests()
|
||||
@@ -94,6 +262,17 @@ namespace Test
|
||||
RunParserTest(&Parser::SimpleMap, "simple map", passed);
|
||||
RunParserTest(&Parser::FlowSeq, "flow seq", passed);
|
||||
RunParserTest(&Parser::FlowMap, "flow map", passed);
|
||||
|
||||
RunEncodingTest(&EncodeToUtf8, false, "UTF-8, no BOM", passed);
|
||||
RunEncodingTest(&EncodeToUtf8, true, "UTF-8 with BOM", passed);
|
||||
RunEncodingTest(&EncodeToUtf16LE, false, "UTF-16LE, no BOM", passed);
|
||||
RunEncodingTest(&EncodeToUtf16LE, true, "UTF-16LE with BOM", passed);
|
||||
RunEncodingTest(&EncodeToUtf16BE, false, "UTF-16BE, no BOM", passed);
|
||||
RunEncodingTest(&EncodeToUtf16BE, true, "UTF-16BE with BOM", passed);
|
||||
RunEncodingTest(&EncodeToUtf32LE, false, "UTF-32LE, no BOM", passed);
|
||||
RunEncodingTest(&EncodeToUtf32LE, true, "UTF-32LE with BOM", passed);
|
||||
RunEncodingTest(&EncodeToUtf32BE, false, "UTF-32BE, no BOM", passed);
|
||||
RunEncodingTest(&EncodeToUtf32BE, true, "UTF-32BE with BOM", passed);
|
||||
return passed;
|
||||
}
|
||||
|
||||
|
@@ -231,6 +231,10 @@
|
||||
RelativePath=".\src\stream.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\src\stringsource.cpp"
|
||||
>
|
||||
</File>
|
||||
</Filter>
|
||||
<Filter
|
||||
Name="Emitter"
|
||||
@@ -357,6 +361,10 @@
|
||||
RelativePath=".\src\stream.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\src\stringsource.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\src\token.h"
|
||||
>
|
||||
|
Reference in New Issue
Block a user