Merged utf branch changes r178:187 into the trunk

2026-02-04 08:16:59 +00:00 · 2009-07-10 03:10:03 +00:00
parent a7b8879494
commit b929eb94a0
13 changed files with 939 additions and 271 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,6 +2,8 @@ cmake_minimum_required(VERSION 2.6)

 project (YAML_CPP)

+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")  # append, so flags supplied by the user/toolchain are kept
+
 enable_testing()

 if(WIN32)
@@ -36,4 +38,4 @@ install(
 )

 add_subdirectory (yaml-reader)
-
+add_subdirectory (util)
--- a/src/emitterutils.cpp
+++ b/src/emitterutils.cpp
@@ -3,6 +3,7 @@
 #include "indentation.h"
 #include "exceptions.h"
 #include <sstream>
+#include "stringsource.h"

 namespace YAML
 {
@@ -29,8 +30,8 @@ namespace YAML
 				                          || (!Exp::Printable)
 				                          || Exp::Break
 				                          || Exp::Tab;
-				Buffer buffer(&str[0], str.size());
-				while(buffer.size) {
+				StringCharSource buffer(str.c_str(), str.size());
+				while(buffer) {
 					if(disallowed.Matches(buffer))
 						return false;
 					++buffer;
--- a/src/regex.cpp
+++ b/src/regex.cpp
@@ -1,132 +1,32 @@
 #include "crt.h"
 #include "regex.h"
-#include "stream.h"
-#include <iostream>

 namespace YAML
 {
-	RegEx::RegEx(REGEX_OP op): m_op(op), m_pOp(0)
+	// constructors
+	RegEx::RegEx(): m_op(REGEX_EMPTY)
 	{
-		SetOp();
 	}
 	
-	RegEx::RegEx(const RegEx& rhs): m_pOp(0)
+	RegEx::RegEx(REGEX_OP op): m_op(op)
 	{
-		m_op = rhs.m_op;
-		m_a = rhs.m_a;
-		m_z = rhs.m_z;
-		m_params = rhs.m_params;
-
-		SetOp();
 	}
 	
-	RegEx::RegEx(): m_op(REGEX_EMPTY), m_pOp(0)
+	RegEx::RegEx(char ch): m_op(REGEX_MATCH), m_a(ch)
 	{
-		SetOp();
 	}
 	
-	RegEx::RegEx(char ch): m_op(REGEX_MATCH), m_pOp(0), m_a(ch)
+	RegEx::RegEx(char a, char z): m_op(REGEX_RANGE), m_a(a), m_z(z)
 	{
-		SetOp();
 	}
 	
-	RegEx::RegEx(char a, char z): m_op(REGEX_RANGE), m_pOp(0), m_a(a), m_z(z)
-	{
-		SetOp();
-	}
-
-	RegEx::RegEx(const std::string& str, REGEX_OP op): m_op(op), m_pOp(0)
+	RegEx::RegEx(const std::string& str, REGEX_OP op): m_op(op)
 	{
 		for(unsigned i=0;i<str.size();i++)
 			m_params.push_back(RegEx(str[i]));
-
-		SetOp();
-	}
-
-	RegEx::~RegEx()
-	{
-		delete m_pOp;
-	}
-
-	RegEx& RegEx::operator = (const RegEx& rhs)
-	{
-		delete m_pOp;
-		m_pOp = 0;
-
-		m_op = rhs.m_op;
-		m_a = rhs.m_a;
-		m_z = rhs.m_z;
-		m_params = rhs.m_params;
-
-		SetOp();
-
-		return *this;
-	}
-
-	void RegEx::SetOp()
-	{
-		delete m_pOp;
-		m_pOp = 0;
-		switch(m_op) {
-			case REGEX_MATCH: m_pOp = new MatchOperator; break;
-			case REGEX_RANGE: m_pOp = new RangeOperator; break;
-			case REGEX_OR: m_pOp = new OrOperator; break;
-			case REGEX_AND: m_pOp = new AndOperator; break;
-			case REGEX_NOT: m_pOp = new NotOperator; break;
-			case REGEX_SEQ: m_pOp = new SeqOperator; break;
-			default: break;
-		}
-	}
-
-	bool RegEx::Matches(char ch) const
-	{
-		std::string str;
-		str += ch;
-		return Matches(str);
-	}
-
-	bool RegEx::Matches(const std::string& str) const
-	{
-		return Match(str) >= 0;
-	}
-
-	bool RegEx::Matches(const Buffer& buffer) const
-	{
-		return Match(buffer) >= 0;
-	}
-	
-	bool RegEx::Matches(const Stream& in) const
-	{
-		return Match(in) >= 0;
-	}
-
-	// Match
-	// . Matches the given string against this regular expression.
-	// . Returns the number of characters matched.
-	// . Returns -1 if no characters were matched (the reason for
-	//   not returning zero is that we may have an empty regex
-	//   which is ALWAYS successful at matching zero characters).
-	// . REMEMBER that we only match from the start of the buffer!
-	int RegEx::Match(const Buffer& buffer) const
-	{
-		if(!m_pOp)
-			return !buffer ? 0 : -1;  // the empty regex only is successful on the empty string
-
-		return m_pOp->Match(buffer, *this);
-	}
-
-	int RegEx::Match(const std::string& str) const
-	{
-		Buffer buffer(str.c_str(), str.size());
-		return Match(buffer);
-	}
-
-	// Match
-	int RegEx::Match(const Stream& in) const
-	{
-		return Match(in.current());
 	}
 	
+	// combination constructors
 	RegEx operator ! (const RegEx& ex)
 	{
 		RegEx ret(REGEX_NOT);
@@ -157,77 +57,5 @@ namespace YAML
 		ret.m_params.push_back(ex2);
 		return ret;
 	}	
-
-	//////////////////////////////////////////////////////////////////////////////
-	// Operators
-
-	// MatchOperator
-	int RegEx::MatchOperator::Match(const Buffer& buffer, const RegEx& regex) const
-	{
-		if(!buffer || buffer[0] != regex.m_a)
-			return -1;
-		return 1;
-	}
-
-	// RangeOperator
-	int RegEx::RangeOperator::Match(const Buffer& buffer, const RegEx& regex) const
-	{
-		if(!buffer || regex.m_a > buffer[0] || regex.m_z < buffer[0])
-			return -1;
-		return 1;
-	}
-
-	// OrOperator
-	int RegEx::OrOperator::Match(const Buffer& buffer, const RegEx& regex) const
-	{
-		for(unsigned i=0;i<regex.m_params.size();i++) {
-			int n = regex.m_params[i].Match(buffer);
-			if(n >= 0)
-				return n;
-		}
-		return -1;
-	}
-
-	// AndOperator
-	// Note: 'AND' is a little funny, since we may be required to match things
-	//       of different lengths. If we find a match, we return the length of
-	//       the FIRST entry on the list.
-	int RegEx::AndOperator::Match(const Buffer& buffer, const RegEx& regex) const
-	{
-		int first = -1;
-		for(unsigned i=0;i<regex.m_params.size();i++) {
-			int n = regex.m_params[i].Match(buffer);
-			if(n == -1)
-				return -1;
-			if(i == 0)
-				first = n;
-		}
-		return first;
-	}
-
-	// NotOperator
-	int RegEx::NotOperator::Match(const Buffer& buffer, const RegEx& regex) const
-	{
-		if(regex.m_params.empty())
-			return -1;
-		if(regex.m_params[0].Match(buffer) >= 0)
-			return -1;
-		return 1;
-	}
-
-	// SeqOperator
-	int RegEx::SeqOperator::Match(const Buffer& buffer, const RegEx& regex) const
-	{
-		int offset = 0;
-		for(unsigned i=0;i<regex.m_params.size();i++) {
-			int n = regex.m_params[i].Match(buffer + offset);
-			if(n == -1)
-				return -1;
-
-			offset += n;
-		}
-
-		return offset;
-	}
 }

--- a/src/regex.h
+++ b/src/regex.h
@@ -6,7 +6,6 @@
 namespace YAML
 {
 	class Stream;
-	struct Buffer;

 	enum REGEX_OP { REGEX_EMPTY, REGEX_MATCH, REGEX_RANGE, REGEX_OR, REGEX_AND, REGEX_NOT, REGEX_SEQ };

@@ -15,70 +14,46 @@ namespace YAML
 	// . Only matches from start of string
 	class RegEx
 	{
-	private:
-		// the operators
-		struct Operator {
-			virtual ~Operator() {}
-			virtual int Match(const Buffer& buffer, const RegEx& regex) const = 0;
-		};
-
-		struct MatchOperator: public Operator {
-			virtual int Match(const Buffer& buffer, const RegEx& regex) const;
-		};
-
-		struct RangeOperator: public Operator {
-			virtual int Match(const Buffer& buffer, const RegEx& regex) const;
-		};
-
-		struct OrOperator: public Operator {
-			virtual int Match(const Buffer& buffer, const RegEx& regex) const;
-		};
-
-		struct AndOperator: public Operator {
-			virtual int Match(const Buffer& buffer, const RegEx& regex) const;
-		};
-
-		struct NotOperator: public Operator {
-			virtual int Match(const Buffer& buffer, const RegEx& regex) const;
-		};
-
-		struct SeqOperator: public Operator {
-			virtual int Match(const Buffer& buffer, const RegEx& regex) const;
-		};
-
 	public:
-		friend struct Operator;
-
 		RegEx();
 		RegEx(char ch);
 		RegEx(char a, char z);
 		RegEx(const std::string& str, REGEX_OP op = REGEX_SEQ);
-		RegEx(const RegEx& rhs);
-		~RegEx();
-
-		RegEx& operator = (const RegEx& rhs);
-
-		bool Matches(char ch) const;
-		bool Matches(const std::string& str) const;
-		bool Matches(const Buffer& buffer) const;
-		bool Matches(const Stream& in) const;
-		int Match(const std::string& str) const;
-		int Match(const Buffer& buffer) const;
-		int Match(const Stream& in) const;
+		~RegEx() {}

 		friend RegEx operator ! (const RegEx& ex);
 		friend RegEx operator || (const RegEx& ex1, const RegEx& ex2);
 		friend RegEx operator && (const RegEx& ex1, const RegEx& ex2);
 		friend RegEx operator + (const RegEx& ex1, const RegEx& ex2);
 		
+		bool Matches(char ch) const;
+		bool Matches(const std::string& str) const;
+		bool Matches(const Stream& in) const;
+		template <typename Source> bool Matches(const Source& source) const;
+
+		int Match(const std::string& str) const;
+		int Match(const Stream& in) const;
+
 	private:
 		RegEx(REGEX_OP op);
-		void SetOp();
+		
+		template <typename Source> bool IsValidSource(const Source& source) const;
+		template <typename Source> int Match(const Source& source) const;
+		template <typename Source> int MatchUnchecked(const Source& source) const;
+
+		template <typename Source> int MatchOpEmpty(const Source& source) const;
+		template <typename Source> int MatchOpMatch(const Source& source) const;
+		template <typename Source> int MatchOpRange(const Source& source) const;
+		template <typename Source> int MatchOpOr(const Source& source) const;
+		template <typename Source> int MatchOpAnd(const Source& source) const;
+		template <typename Source> int MatchOpNot(const Source& source) const;
+		template <typename Source> int MatchOpSeq(const Source& source) const;

 	private:
 		REGEX_OP m_op;
-		Operator *m_pOp;
 		char m_a, m_z;
 		std::vector <RegEx> m_params;
 	};
 }
+
+#include "regeximpl.h"
--- a/src/regeximpl.h
+++ b/src/regeximpl.h
@@ -0,0 +1,172 @@
+#pragma once
+
+#include "stream.h"
+#include "stringsource.h"
+#include "streamcharsource.h"
+
+namespace YAML
+{
+	// query matches
+	inline bool RegEx::Matches(char ch) const {
+		std::string str;
+		str += ch;
+		return Matches(str);
+	}
+	
+	inline bool RegEx::Matches(const std::string& str) const {
+		return Match(str) >= 0;
+	}
+	
+	inline bool RegEx::Matches(const Stream& in) const {
+		return Match(in) >= 0;
+	}
+
+	template <typename Source>
+	inline bool RegEx::Matches(const Source& source) const {
+		return Match(source) >= 0;
+	}
+
+	// Match
+	// . Matches the given string against this regular expression.
+	// . Returns the number of characters matched.
+	// . Returns -1 if no characters were matched (the reason for
+	//   not returning zero is that we may have an empty regex
+	//   which is ALWAYS successful at matching zero characters).
+	// . REMEMBER that we only match from the start of the buffer!	
+	inline int RegEx::Match(const std::string& str) const
+	{
+		StringCharSource source(str.c_str(), str.size());
+		return Match(source);
+	}
+	
+	inline int RegEx::Match(const Stream& in) const
+	{
+		StreamCharSource source(in);
+		return Match(source);
+	}
+
+	template <typename Source>
+	inline bool RegEx::IsValidSource(const Source& source) const
+	{
+		return source;
+	}
+	
+	template<>
+	inline bool RegEx::IsValidSource<StringCharSource>(const StringCharSource&source) const
+	{
+		return source || m_op == REGEX_EMPTY;
+	}
+
+	template <typename Source>
+	inline int RegEx::Match(const Source& source) const
+	{
+		return IsValidSource(source) ? MatchUnchecked(source) : -1;
+	}
+	
+	template <typename Source>
+	inline int RegEx::MatchUnchecked(const Source& source) const
+	{
+		switch(m_op) {
+			case REGEX_EMPTY:
+				return MatchOpEmpty(source);
+			case REGEX_MATCH:
+				return MatchOpMatch(source);
+			case REGEX_RANGE:
+				return MatchOpRange(source);
+			case REGEX_OR:
+				return MatchOpOr(source);
+			case REGEX_AND:
+				return MatchOpAnd(source);
+			case REGEX_NOT:
+				return MatchOpNot(source);
+			case REGEX_SEQ:
+				return MatchOpSeq(source);
+		}
+		
+		return -1;
+	}
+
+	//////////////////////////////////////////////////////////////////////////////
+	// Operators
+	// Note: the convention MatchOp*<Source> is that we can assume IsValidSource(source).
+	//       So we do all our checks *before* we call these functions
+	
+	// EmptyOperator
+	template <typename Source>
+	inline int RegEx::MatchOpEmpty(const Source& source) const {
+		return source[0] == Stream::eof() ? 0 : -1;
+	}
+	
+	template <>
+	inline int RegEx::MatchOpEmpty<StringCharSource>(const StringCharSource& source) const {
+		return !source ? 0 : -1;  // the empty regex only is successful on the empty string
+	}
+
+	// MatchOperator
+	template <typename Source>
+	inline int RegEx::MatchOpMatch(const Source& source) const {
+		if(source[0] != m_a)
+			return -1;
+		return 1;
+	}
+	
+	// RangeOperator
+	template <typename Source>
+	inline int RegEx::MatchOpRange(const Source& source) const {
+		if(m_a > source[0] || m_z < source[0])
+			return -1;
+		return 1;
+	}
+	
+	// OrOperator
+	template <typename Source>
+	inline int RegEx::MatchOpOr(const Source& source) const {
+		for(unsigned i=0;i<m_params.size();i++) {
+			int n = m_params[i].MatchUnchecked(source);
+			if(n >= 0)
+				return n;
+		}
+		return -1;
+	}
+	
+	// AndOperator
+	// Note: 'AND' is a little funny, since we may be required to match things
+	//       of different lengths. If we find a match, we return the length of
+	//       the FIRST entry on the list.
+	template <typename Source>
+	inline int RegEx::MatchOpAnd(const Source& source) const {
+		int first = -1;
+		for(unsigned i=0;i<m_params.size();i++) {
+			int n = m_params[i].MatchUnchecked(source);
+			if(n == -1)
+				return -1;
+			if(i == 0)
+				first = n;
+		}
+		return first;
+	}
+	
+	// NotOperator
+	template <typename Source>
+	inline int RegEx::MatchOpNot(const Source& source) const {
+		if(m_params.empty())
+			return -1;
+		if(m_params[0].MatchUnchecked(source) >= 0)
+			return -1;
+		return 1;
+	}
+	
+	// SeqOperator
+	template <typename Source>
+	inline int RegEx::MatchOpSeq(const Source& source) const {
+		int offset = 0;
+		for(unsigned i=0;i<m_params.size();i++) {
+			int n = m_params[i].Match(source + offset); // note Match, not MatchUnchecked because we need to check validity after the offset
+			if(n == -1)
+				return -1;
+			offset += n;
+		}
+		
+		return offset;
+	}
+}
--- a/src/stream.cpp
+++ b/src/stream.cpp
@@ -1,52 +1,260 @@
 #include "crt.h"
 #include "stream.h"
 #include <iostream>
+#include "exp.h"
+
+#ifndef YAML_PREFETCH_SIZE
+#define YAML_PREFETCH_SIZE 2048
+#endif
+
+#define S_ARRAY_SIZE( A ) (sizeof(A)/sizeof(*(A)))
+#define S_ARRAY_END( A ) ((A) + S_ARRAY_SIZE(A))
+
+#define CP_REPLACEMENT_CHARACTER (0xFFFD)

 namespace YAML
 {
-	Stream::Stream(std::istream& input): pos(0), line(0), column(0), size(0), buffer(0)
+	enum UtfIntroState {
+		uis_start,
+		uis_utfbe_b1,
+		uis_utf32be_b2,
+		uis_utf32be_bom3,
+		uis_utf32be,
+		uis_utf16be,
+		uis_utf16be_bom1,
+		uis_utfle_bom1,
+		uis_utf16le_bom2,
+		uis_utf32le_bom3,
+		uis_utf16le,
+		uis_utf32le,
+		uis_utf8_imp,
+		uis_utf16le_imp,
+		uis_utf32le_imp3,
+		uis_utf8_bom1,
+		uis_utf8_bom2,
+		uis_utf8,
+		uis_error
+	};
+
+	enum UtfIntroCharType {
+		uict00,
+		uictBB,
+		uictBF,
+		uictEF,
+		uictFE,
+		uictFF,
+		uictAscii,
+		uictOther,
+		uictMax
+	};
+
+	static bool s_introFinalState[] = {
+		false, //uis_start
+		false, //uis_utfbe_b1
+		false, //uis_utf32be_b2
+		false, //uis_utf32be_bom3
+		true,  //uis_utf32be
+		true,  //uis_utf16be
+		false, //uis_utf16be_bom1
+		false, //uis_utfle_bom1
+		false, //uis_utf16le_bom2
+		false, //uis_utf32le_bom3
+		true,  //uis_utf16le
+		true,  //uis_utf32le
+		false, //uis_utf8_imp
+		false, //uis_utf16le_imp
+		false, //uis_utf32le_imp3
+		false, //uis_utf8_bom1
+		false, //uis_utf8_bom2
+		true,  //uis_utf8
+		true,  //uis_error
+	};
+
+	static UtfIntroState s_introTransitions[][uictMax] = {
+		// uict00,           uictBB,           uictBF,           uictEF,           uictFE,           uictFF,           uictAscii,        uictOther
+		  {uis_utfbe_b1,     uis_utf8,         uis_utf8,         uis_utf8_bom1,    uis_utf16be_bom1, uis_utfle_bom1,   uis_utf8_imp,     uis_utf8},
+		  {uis_utf32be_b2,   uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf16be,      uis_utf8},
+		  {uis_utf32be,      uis_utf8,         uis_utf8,         uis_utf8,         uis_utf32be_bom3, uis_utf8,         uis_utf8,         uis_utf8},
+		  {uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf32be,      uis_utf8,         uis_utf8},
+		  {uis_utf32be,      uis_utf32be,      uis_utf32be,      uis_utf32be,      uis_utf32be,      uis_utf32be,      uis_utf32be,      uis_utf32be},
+		  {uis_utf16be,      uis_utf16be,      uis_utf16be,      uis_utf16be,      uis_utf16be,      uis_utf16be,      uis_utf16be,      uis_utf16be},
+		  {uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf16be,      uis_utf8,         uis_utf8},
+		  {uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf16le_bom2, uis_utf8,         uis_utf8,         uis_utf8},
+		  {uis_utf32le_bom3, uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le},
+		  {uis_utf32le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le},
+		  {uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le},
+		  {uis_utf32le,      uis_utf32le,      uis_utf32le,      uis_utf32le,      uis_utf32le,      uis_utf32le,      uis_utf32le,      uis_utf32le},
+		  {uis_utf16le_imp,  uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8},
+		  {uis_utf32le_imp3, uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le},
+		  {uis_utf32le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le},
+		  {uis_utf8,         uis_utf8_bom2,    uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8},
+		  {uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8},
+		  {uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8},
+	};
+
+	static char s_introUngetCount[][uictMax] = {
+		// uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther
+		  {0,      1,      1,      0,      0,      0,      0,         1},
+		  {0,      2,      2,      2,      2,      2,      2,         2},
+		  {3,      3,      3,      3,      0,      3,      3,         3},
+		  {4,      4,      4,      4,      4,      0,      4,         4},
+		  {1,      1,      1,      1,      1,      1,      1,         1},
+		  {1,      1,      1,      1,      1,      1,      1,         1},
+		  {2,      2,      2,      2,      2,      0,      2,         2},
+		  {2,      2,      2,      2,      0,      2,      2,         2},
+		  {0,      1,      1,      1,      1,      1,      1,         1},
+		  {0,      2,      2,      2,      2,      2,      2,         2},
+		  {1,      1,      1,      1,      1,      1,      1,         1},
+		  {1,      1,      1,      1,      1,      1,      1,         1},
+		  {0,      2,      2,      2,      2,      2,      2,         2},
+		  {0,      3,      3,      3,      3,      3,      3,         3},
+		  {4,      4,      4,      4,      4,      4,      4,         4},
+		  {2,      0,      2,      2,      2,      2,      2,         2},
+		  {3,      3,      0,      3,      3,      3,      3,         3},
+		  {1,      1,      1,      1,      1,      1,      1,         1},
+	};
+
+	inline UtfIntroCharType IntroCharTypeOf(std::istream::int_type ch)
 	{
+		if (std::istream::traits_type::eof() == ch) {
+			return uictOther;
+		}
+
+		switch (ch) {
+		case 0: return uict00;
+		case 0xBB: return uictBB;
+		case 0xBF: return uictBF;
+		case 0xEF: return uictEF;
+		case 0xFE: return uictFE;
+		case 0xFF: return uictFF;
+		}
+
+		if ((ch > 0) && (ch < 0xFF)) {
+			return uictAscii;
+		}
+
+		return uictOther;
+	}
+
+	inline char Utf8Adjust(unsigned long ch, unsigned char lead_bits, unsigned char rshift)
+	{
+		const unsigned char header = ((1 << lead_bits) - 1) << (8 - lead_bits);
+		const unsigned char mask = (0xFF >> (lead_bits + 1));
+		return static_cast<char>(static_cast<unsigned char>(
+			header | ((ch >> rshift) & mask)
+			));
+	}
+
+	inline void QueueUnicodeCodepoint(std::deque<char>& q, unsigned long ch)
+	{
+		// We are not allowed to queue the Stream::eof() codepoint, so
+		// replace it with CP_REPLACEMENT_CHARACTER
+		if (static_cast<unsigned long>(Stream::eof()) == ch)
+		{
+			ch = CP_REPLACEMENT_CHARACTER;
+		}
+
+		if (ch < 0x80)
+		{
+			q.push_back(Utf8Adjust(ch, 0, 0));
+		}
+		else if (ch < 0x800)
+		{
+			q.push_back(Utf8Adjust(ch, 2, 6));
+			q.push_back(Utf8Adjust(ch, 1, 0));
+		}
+		else if (ch < 0x10000)
+		{
+			q.push_back(Utf8Adjust(ch, 3, 12));
+			q.push_back(Utf8Adjust(ch, 1, 6));
+			q.push_back(Utf8Adjust(ch, 1, 0));
+		}
+		else
+		{
+			q.push_back(Utf8Adjust(ch, 4, 18));
+			q.push_back(Utf8Adjust(ch, 1, 12));
+			q.push_back(Utf8Adjust(ch, 1, 6));
+			q.push_back(Utf8Adjust(ch, 1, 0));
+		}
+	}
+
+	Stream::Stream(std::istream& input)
+		: pos(0), line(0), column(0), m_input(input), m_charSet(utf8), m_nPushedBack(0),
+		m_pPrefetched(new unsigned char[YAML_PREFETCH_SIZE]),
+		m_nPrefetchedAvailable(0), m_nPrefetchedUsed(0)
+	{
+		// Detects the input encoding from its leading bytes, then primes the read-ahead queue.
+		typedef std::istream::traits_type char_traits;
 		if(!input)
 			return;

-		std::streambuf *pBuf = input.rdbuf();
+		// Determine (or guess) the character-set by reading the BOM, if any.  See
+		// the YAML specification for the determination algorithm.
+		char_traits::int_type intro[4];
+		int nIntroUsed = 0;
+		UtfIntroState state = uis_start;
+		for (; !s_introFinalState[state]; ) {
+			std::istream::int_type ch = input.get();
+			intro[nIntroUsed++] = ch;
+			UtfIntroCharType charType = IntroCharTypeOf(ch);
+			UtfIntroState newState = s_introTransitions[state][charType];
+			int nUngets = s_introUngetCount[state][charType];
+			for (; nUngets > 0; --nUngets) {
+				if (char_traits::eof() != intro[--nIntroUsed]) {
+					m_bufPushback[m_nPushedBack++] = char_traits::to_char_type(intro[nIntroUsed]);
+				} else {
+					// Input ended while sniffing: drop eof/fail so the pushed-back bytes still get decoded
+					input.clear(input.rdstate() & ~(std::ios_base::eofbit | std::ios_base::failbit));
+				}
+			}
+			state = newState;
+		}

-		// store entire file in buffer
-		size = pBuf->pubseekoff(0, std::ios::end, std::ios::in);
-		pBuf->pubseekpos(0, std::ios::in);
-		buffer = new char[size];
-		size = pBuf->sgetn(buffer, size);  // Note: when reading a Windows CR/LF file,
-		                                   // pubseekoff() counts CR/LF as two characters,
-		                                   // setgn() reads CR/LF as a single LF character!
+		switch (state) {
+		case uis_utf8: m_charSet = utf8; break;
+		case uis_utf16le: m_charSet = utf16le; break;
+		case uis_utf16be: m_charSet = utf16be; break;
+		case uis_utf32le: m_charSet = utf32le; break;
+		case uis_utf32be: m_charSet = utf32be; break;
+		default: m_charSet = utf8; break;
+		}
+
+		ReadAheadTo(0);
 	}

 	Stream::~Stream()
 	{
-		delete [] buffer;
+		delete[] m_pPrefetched;
 	}

-
-	char Stream::peek()
+	char Stream::peek() const
 	{
-		return buffer[pos];
+		if (m_readahead.empty())
+		{
+			return Stream::eof();
+		}
+
+		return m_readahead[0];
 	}
 	
 	Stream::operator bool() const
 	{
-		return pos < size;
+		return m_input.good() || (!m_readahead.empty() && m_readahead[0] != Stream::eof());
 	}

 	// get
 	// . Extracts a character from the stream and updates our position
 	char Stream::get()
 	{
-		char ch = buffer[pos];
-		pos++;
+		char ch = peek();
+		AdvanceCurrent();
 		column++;
+		
 		if(ch == '\n') {
 			column = 0;
 			line++;
 		}
+		
 		return ch;
 	}

@@ -69,4 +277,179 @@ namespace YAML
 			get();
 	}

+	void Stream::AdvanceCurrent()
+	{
+		if (!m_readahead.empty())
+		{
+			m_readahead.pop_front();
+			++pos;
+		}
+
+		ReadAheadTo(0);
+	}
+
+	bool Stream::_ReadAheadTo(size_t i) const
+	{
+		while (m_input.good() && (m_readahead.size() <= i))
+		{
+			switch (m_charSet)
+			{
+			case utf8: StreamInUtf8(); break;
+			case utf16le: StreamInUtf16(); break;
+			case utf16be: StreamInUtf16(); break;
+			case utf32le: StreamInUtf32(); break;
+			case utf32be: StreamInUtf32(); break;
+			}
+		}
+		
+		// signal end of stream
+		if(!m_input.good())
+			m_readahead.push_back(Stream::eof());
+
+		return m_readahead.size() > i;
+	}
+
+	void Stream::StreamInUtf8() const
+	{
+		unsigned char b = GetNextByte();
+		if (m_input.good())
+		{
+			m_readahead.push_back(b);
+		}
+	}
+
+	void Stream::StreamInUtf16() const
+	{
+		// Decodes the next UTF-16 code point (one unit or a surrogate pair, byte order
+		// taken from m_charSet) and queues it on m_readahead as UTF-8.  Unpaired surrogates
+		// become CP_REPLACEMENT_CHARACTER; nothing is queued if the first unit is incomplete.
+		unsigned long ch = 0;
+		unsigned char bytes[2];
+		int nBigEnd = (m_charSet == utf16be) ? 0 : 1;
+
+		bytes[0] = GetNextByte();
+		bytes[1] = GetNextByte();
+		if (!m_input.good())
+		{
+			return;
+		}
+		ch = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
+			static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
+
+		if (ch >= 0xDC00 && ch < 0xE000)
+		{
+			// Trailing (low) surrogate...ugh, wrong order
+			QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
+			return;
+		}
+		else if (ch >= 0xD800 && ch < 0xDC00)
+		{
+			// ch is a leading (high) surrogate; a complete pair becomes a
+			// four byte UTF-8 code point.  Read the trailing (low) surrogate
+			for (;;)
+			{
+				bytes[0] = GetNextByte();
+				bytes[1] = GetNextByte();
+				if (!m_input.good())
+				{
+					QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
+					return;
+				}
+				unsigned long chLow = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
+					static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
+				if (chLow < 0xDC00 || chLow >= 0xE000)
+				{
+					// Trouble...not a low surrogate.  Dump a REPLACEMENT CHARACTER into the stream.
+					QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
+					// Deal with the next UTF-16 unit
+					if (chLow < 0xD800 || chLow >= 0xE000)
+					{
+						// Easiest case: an ordinary unit; queue it (not the orphaned surrogate) and return
+						QueueUnicodeCodepoint(m_readahead, chLow);
+						return;
+					}
+					else
+					{
+						// Start the loop over with the new high surrogate
+						ch = chLow;
+						continue;
+					}
+				}
+
+				// Select the payload bits from the high surrogate
+				ch &= 0x3FF;
+				ch <<= 10;
+
+				// Include bits from low surrogate
+				ch |= (chLow & 0x3FF);
+
+				// Add the surrogacy offset; the pair is complete, so leave the loop
+				ch += 0x10000;
+				break;
+			}
+		}
+
+		QueueUnicodeCodepoint(m_readahead, ch);
+	}
+
+	inline char* ReadBuffer(unsigned char* pBuffer)
+	{
+		return reinterpret_cast<char*>(pBuffer);
+	}
+
+	unsigned char Stream::GetNextByte() const
+	{
+		if (m_nPushedBack)
+		{
+			return m_bufPushback[--m_nPushedBack];
+		}
+
+		if (m_nPrefetchedUsed >= m_nPrefetchedAvailable)
+		{
+			std::streambuf *pBuf = m_input.rdbuf();
+			m_nPrefetchedAvailable = pBuf->sgetn(ReadBuffer(m_pPrefetched), 
+				YAML_PREFETCH_SIZE);
+			m_nPrefetchedUsed = 0;
+			if (!m_nPrefetchedAvailable)
+			{
+				m_input.setstate(std::ios_base::eofbit);
+			}
+
+			if (0 == m_nPrefetchedAvailable)
+			{
+				return 0;
+			}
+		}
+
+		return m_pPrefetched[m_nPrefetchedUsed++];
+	}
+
+	void Stream::StreamInUtf32() const
+	{
+		static int indexes[2][4] = {
+			{3, 2, 1, 0},
+			{0, 1, 2, 3}
+		};
+
+		unsigned long ch = 0;
+		unsigned char bytes[4];
+		int* pIndexes = (m_charSet == utf32be) ? indexes[1] : indexes[0];
+
+		bytes[0] = GetNextByte();
+		bytes[1] = GetNextByte();
+		bytes[2] = GetNextByte();
+		bytes[3] = GetNextByte();
+		if (!m_input.good())
+		{
+			return;
+		}
+
+		for (int i = 0; i < 4; ++i)
+		{
+			ch <<= 8;
+			ch |= bytes[pIndexes[i]];
+		}
+
+		QueueUnicodeCodepoint(m_readahead, ch);
+	}
 }
--- a/src/stream.h
+++ b/src/stream.h
@@ -1,42 +1,66 @@
 #pragma once

+#include <deque>
 #include <ios>
 #include <string>
+#include <iostream>
+#include <set>

 namespace YAML
 {
-	// a simple buffer wrapper that knows how big it is
-	struct Buffer {
-		Buffer(const char *b, int s): buffer(b), size(s) {}
-
-		operator bool() const { return size > 0; }
-		bool operator !() const { return !static_cast <bool> (*this); }
-		char operator [] (int i) const { return buffer[i]; }
-		const Buffer operator + (int offset) const { return Buffer(buffer + offset, size - offset); }
-		Buffer& operator ++ () { ++buffer; --size; return *this; }
-
-		const char *buffer;
-		int size;
-	};
+	static const size_t MAX_PARSER_PUSHBACK = 8;

 	class Stream
 	{
 	public:
+		friend class StreamCharSource;
+		
 		Stream(std::istream& input);
 		~Stream();

 		operator bool() const;
 		bool operator !() const { return !static_cast <bool>(*this); }

-		const Buffer current() const { return Buffer(buffer + pos, size - pos); }
-		char peek();
+		char peek() const;
 		char get();
 		std::string get(int n);
 		void eat(int n = 1);

-		int pos, line, column, size;
+		static char eof() { return 0x04; }
+
+		int pos, line, column;
 	
 	private:
-		char *buffer;
+		enum CharacterSet {utf8, utf16le, utf16be, utf32le, utf32be};
+
+		std::istream& m_input;
+		CharacterSet m_charSet;
+		unsigned char m_bufPushback[MAX_PARSER_PUSHBACK];
+		mutable size_t m_nPushedBack;
+		mutable std::deque<char> m_readahead;
+		unsigned char* const m_pPrefetched;
+		mutable size_t m_nPrefetchedAvailable;
+		mutable size_t m_nPrefetchedUsed;
+		
+		void AdvanceCurrent();
+		char CharAt(size_t i) const;
+		bool ReadAheadTo(size_t i) const;
+		bool _ReadAheadTo(size_t i) const;
+		void StreamInUtf8() const;
+		void StreamInUtf16() const;
+		void StreamInUtf32() const;
+		unsigned char GetNextByte() const;
 	};
+
+	// CharAt
+	// . Unchecked access
+	inline char Stream::CharAt(size_t i) const {
+		return m_readahead[i];
+	}
+	
+	inline bool Stream::ReadAheadTo(size_t i) const {
+		if(m_readahead.size() > i)
+			return true;
+		return _ReadAheadTo(i);
+	}	
 }
--- a/src/streamcharsource.h
+++ b/src/streamcharsource.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include <cstddef>
+
+namespace YAML
+{
+	class StreamCharSource
+	{
+	public:
+		StreamCharSource(const Stream& stream);
+		~StreamCharSource() {}
+			
+		operator bool() const;
+		char operator [] (std::size_t i) const { return m_stream.CharAt(m_offset + i); }
+		bool operator !() const { return !static_cast<bool>(*this); }
+
+		const StreamCharSource operator + (int i) const;
+			
+	private:
+		std::size_t m_offset;
+		const Stream& m_stream;
+	};
+	
+	inline StreamCharSource::StreamCharSource(const Stream& stream): m_offset(0), m_stream(stream) {
+	}
+	
+	inline StreamCharSource::operator bool() const {
+		return m_stream.ReadAheadTo(m_offset);
+	}
+	
+	inline const StreamCharSource StreamCharSource::operator + (int i) const {
+		StreamCharSource source(*this);
+		if(static_cast<int> (source.m_offset) + i >= 0)
+			source.m_offset += i;
+		else
+			source.m_offset = 0;
+		return source;
+	}
+}
--- a/src/stringsource.h
+++ b/src/stringsource.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include <cstddef>
+
+namespace YAML
+{
+	class StringCharSource
+	{
+	public:
+		StringCharSource(const char *str, std::size_t size): m_str(str), m_size(size), m_offset(0) {}
+
+		operator bool() const { return m_offset < m_size; }
+		char operator [] (std::size_t i) const { return m_str[m_offset + i]; }
+		bool operator !() const { return !static_cast<bool>(*this); }
+
+		const StringCharSource operator + (int i) const {
+			StringCharSource source(*this);
+			if(static_cast<int> (source.m_offset) + i >= 0)
+				source.m_offset += i;
+			else
+				source.m_offset = 0;
+			return source;
+		}
+			
+		StringCharSource& operator ++ () {
+			++m_offset;
+			return *this;
+		}
+	private:
+		const char *m_str;
+		std::size_t m_size;
+		std::size_t m_offset;
+	};
+}
--- a/util/CMakeLists.txt
+++ b/util/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_executable(parse parse.cpp)
+target_link_libraries(parse yaml-cpp)
--- a/util/parse.cpp
+++ b/util/parse.cpp
@@ -0,0 +1,21 @@
+#include "yaml.h"
+#include <fstream>
+#include <iostream>
+
+int main(int argc, char **argv)
+{
+	if(argc != 2) {
+		std::cout << "Usage: " << argv[0] << " input-file\n";
+		return 0;
+	}
+
+	std::ifstream fin(argv[1]);
+	try {
+		YAML::Parser parser(fin);
+		YAML::Node doc;
+		parser.GetNextDocument(doc);
+	} catch(const YAML::Exception& e) {
+		std::cerr << "Error at line " << e.line << ", col " << e.column << ": " << e.msg << "\n";
+	}
+	return 0;
+}
--- a/yaml-reader/tests.cpp
+++ b/yaml-reader/tests.cpp
@@ -72,6 +72,174 @@ namespace Test
 					std::cout << "Caught exception: " << error << "\n";
 			}
 		}
+
+		typedef void (*EncodingFn)(std::ostream&, int);
+
+		// Reduces ch to the value range of unsigned char and returns it as a char.
+		inline char Byte(int ch)
+		{
+			const unsigned int bits = static_cast<unsigned int>(ch);
+			const unsigned char lowByte = static_cast<unsigned char>(bits);
+			return static_cast<char>(lowByte);
+		}
+
+		// Writes ch to stream as a UTF-8 byte sequence of one to four bytes.
+		// Values above 0x1FFFFF produce no output.
+		void EncodeToUtf8(std::ostream& stream, int ch)
+		{
+			// single byte: written as is
+			if (ch <= 0x7F) {
+				stream << Byte(ch);
+				return;
+			}
+
+			// lead-byte marker and number of continuation bytes, chosen by magnitude
+			int marker = 0;
+			int trailing = 0;
+			if (ch <= 0x7FF) {
+				marker = 0xC0;
+				trailing = 1;
+			} else if (ch <= 0xFFFF) {
+				marker = 0xE0;
+				trailing = 2;
+			} else if (ch <= 0x1FFFFF) {
+				marker = 0xF0;
+				trailing = 3;
+			} else {
+				return;
+			}
+
+			// lead byte holds the top bits; each continuation byte holds six more
+			stream << Byte(marker | (ch >> (6 * trailing)));
+			for (int shift = 6 * (trailing - 1); shift >= 0; shift -= 6)
+				stream << Byte(0x80 | ((ch >> shift) & 0x3F));
+		}
+
+		// For ch at or above 0x10000, writes the high and low surrogate through
+		// `encoding` and returns true; otherwise writes nothing and returns false.
+		bool SplitUtf16HighChar(std::ostream& stream, EncodingFn encoding, int ch)
+		{
+			const int offset = ch - 0x10000;
+			if (offset < 0)
+				return false;
+
+			// upper bits go into the high surrogate, the low ten bits into the low one
+			encoding(stream, 0xD800 | (offset >> 10));
+			encoding(stream, 0xDC00 | (offset & 0x3FF));
+			return true;
+		}
+
+		// Writes ch as UTF-16, low byte first; values at or above 0x10000 are
+		// written as a surrogate pair via SplitUtf16HighChar.
+		void EncodeToUtf16LE(std::ostream& stream, int ch)
+		{
+			if (SplitUtf16HighChar(stream, &EncodeToUtf16LE, ch))
+				return;
+
+			stream << Byte(ch & 0xFF);
+			stream << Byte(ch >> 8);
+		}
+
+		// Writes ch as UTF-16, high byte first; values at or above 0x10000 are
+		// written as a surrogate pair via SplitUtf16HighChar.
+		void EncodeToUtf16BE(std::ostream& stream, int ch)
+		{
+			if (SplitUtf16HighChar(stream, &EncodeToUtf16BE, ch))
+				return;
+
+			stream << Byte(ch >> 8);
+			stream << Byte(ch & 0xFF);
+		}
+
+		// Writes ch as four bytes, least significant byte first.
+		void EncodeToUtf32LE(std::ostream& stream, int ch)
+		{
+			for (int shift = 0; shift <= 24; shift += 8)
+				stream << Byte((ch >> shift) & 0xFF);
+		}
+
+		// Writes ch as four bytes, most significant byte first.
+		void EncodeToUtf32BE(std::ostream& stream, int ch)
+		{
+			for (int shift = 24; shift >= 0; shift -= 8)
+				stream << Byte((ch >> shift) & 0xFF);
+		}
+
+		// Builds a YAML sequence of literal block scalars written with a given
+		// encoding function, and keeps the UTF-8 text of each scalar's characters.
+		class EncodingTester
+		{
+		public:
+			// encoding:        writes one code point to a stream
+			// declareEncoding: when true, U+FEFF is written before any entry
+			EncodingTester(EncodingFn encoding, bool declareEncoding)
+			{
+				// inclusive code point ranges, one sequence entry per row
+				static const int ranges[][2] = {
+					{0x0021, 0x007E},   // Basic Latin
+					{0x00A1, 0x00FF},   // Latin-1 Supplement
+					{0x0660, 0x06FF},   // Arabic (largest contiguous block)
+					// CJK unified ideographs (multiple lines)
+					{0x4E00, 0x4EFF},
+					{0x4F00, 0x4FFF},
+					{0x5000, 0x51FF},   // 512 character line
+					{0x5200, 0x54FF},   // 768 character line
+					{0x5500, 0x58FF},   // 1024 character line
+					{0x103A0, 0x103C3}  // Old Persian
+				};
+				const int rangeCount = static_cast<int>(sizeof(ranges) / sizeof(ranges[0]));
+
+				if (declareEncoding)
+					encoding(m_yaml, 0xFEFF);
+
+				for (int k = 0; k < rangeCount; ++k)
+					AddEntry(encoding, ranges[k][0], ranges[k][1]);
+
+				// rewind so the document is read from its first byte
+				m_yaml.seekg(0, std::ios::beg);
+			}
+
+			// the encoded document
+			std::istream& stream() {return m_yaml;}
+			// UTF-8 text of each entry, in the order the entries were added
+			const std::vector<std::string>& entries() {return m_entries;}
+
+		private:
+			std::stringstream m_yaml;
+			std::vector<std::string> m_entries;
+
+			// Appends one "- |" entry holding every code point from startCh to endCh
+			// (inclusive) to the document, and records the same code points as UTF-8.
+			void AddEntry(EncodingFn encoding, int startCh, int endCh)
+			{
+				// entry header: "- |", a line break, then two spaces of indentation
+				for (const char *p = "- |\n  "; *p; ++p)
+					encoding(m_yaml, *p);
+
+				std::stringstream expected;
+				for (int codePoint = startCh; codePoint <= endCh; ++codePoint)
+				{
+					encoding(m_yaml, codePoint);
+					EncodeToUtf8(expected, codePoint);
+				}
+				encoding(m_yaml, '\n');
+
+				m_entries.push_back(expected.str());
+			}
+		};
+
+		// Builds the EncodingTester document with `encoding` (with U+FEFF first when
+		// `declareEncoding` is set), parses it, and compares every node of the parsed
+		// document with the tester's expected UTF-8 entry. The outcome is printed
+		// under `name`; `passed` is set to false on failure and left untouched on
+		// success.
+		void RunEncodingTest(EncodingFn encoding, bool declareEncoding, const std::string& name, bool& passed)
+		{
+			EncodingTester tester(encoding, declareEncoding);
+			std::string error;
+			bool ok = true;
+			try {
+				YAML::Parser parser(tester.stream());
+				YAML::Node doc;
+				parser.GetNextDocument(doc);
+
+				YAML::Iterator itNode = doc.begin();
+				std::vector<std::string>::const_iterator itEntry = tester.entries().begin();
+				for (; (itNode != doc.end()) && (itEntry != tester.entries().end()); ++itNode, ++itEntry)
+				{
+					std::string stScalarValue;
+					// Stop at the first node whose scalar cannot be read or differs from
+					// the expected entry; the iterators then stop short of the end and
+					// the check below reports the failure.
+					// NOTE(review): entries are recorded without a trailing line break -
+					// confirm that matches what the parser yields for a "|" scalar.
+					if (!itNode->GetScalar(stScalarValue) || (stScalarValue != *itEntry))
+					{
+						break;
+					}
+				}
+
+				// success requires both sequences to be exhausted together
+				if ((itNode != doc.end()) || (itEntry != tester.entries().end()))
+				{
+					ok = false;
+				}
+			} catch(const YAML::Exception& e) {
+				ok = false;
+				error = e.msg;
+			}
+			if(ok) {
+				std::cout << "Parser test passed: " << name << "\n";
+			} else {
+				passed = false;
+				std::cout << "Parser test failed: " << name << "\n";
+				if(!error.empty())
+					std::cout << "Caught exception: " << error << "\n";
+			}
+		}
 	}

 	bool RunParserTests()
@@ -94,6 +262,17 @@ namespace Test
 		RunParserTest(&Parser::SimpleMap, "simple map", passed);
 		RunParserTest(&Parser::FlowSeq, "flow seq", passed);
 		RunParserTest(&Parser::FlowMap, "flow map", passed);
+
+		RunEncodingTest(&EncodeToUtf8, false, "UTF-8, no BOM", passed);
+		RunEncodingTest(&EncodeToUtf8, true, "UTF-8 with BOM", passed);
+		RunEncodingTest(&EncodeToUtf16LE, false, "UTF-16LE, no BOM", passed);
+		RunEncodingTest(&EncodeToUtf16LE, true, "UTF-16LE with BOM", passed);
+		RunEncodingTest(&EncodeToUtf16BE, false, "UTF-16BE, no BOM", passed);
+		RunEncodingTest(&EncodeToUtf16BE, true, "UTF-16BE with BOM", passed);
+		RunEncodingTest(&EncodeToUtf32LE, false, "UTF-32LE, no BOM", passed);
+		RunEncodingTest(&EncodeToUtf32LE, true, "UTF-32LE with BOM", passed);
+		RunEncodingTest(&EncodeToUtf32BE, false, "UTF-32BE, no BOM", passed);
+		RunEncodingTest(&EncodeToUtf32BE, true, "UTF-32BE with BOM", passed);
 		return passed;
 	}

--- a/yamlcpp.vcproj
+++ b/yamlcpp.vcproj
@@ -231,6 +231,10 @@
 					RelativePath=".\src\stream.cpp"
 					>
 				</File>
+				<File
+					RelativePath=".\src\stringsource.cpp"
+					>
+				</File>
 			</Filter>
 			<Filter
 				Name="Emitter"
@@ -357,6 +361,10 @@
 					RelativePath=".\src\stream.h"
 					>
 				</File>
+				<File
+					RelativePath=".\src\stringsource.h"
+					>
+				</File>
 				<File
 					RelativePath=".\src\token.h"
 					>