From 9a28c9178e17a1c35301eb15abaf44481853bb12 Mon Sep 17 00:00:00 2001 From: jbeder Date: Mon, 19 Oct 2009 23:31:11 +0000 Subject: [PATCH] Merged r270:HEAD of the emitting-unicode branch --- include/emitter.h | 1 + include/emittermanip.h | 4 + src/emitter.cpp | 12 +- src/emitterstate.cpp | 14 ++ src/emitterstate.h | 4 + src/emitterutils.cpp | 357 +++++++++++++++++++---------------- src/emitterutils.h | 4 +- src/exp.cpp | 4 +- src/exp.h | 7 +- yaml-reader/emittertests.cpp | 19 +- 10 files changed, 257 insertions(+), 169 deletions(-) diff --git a/include/emitter.h b/include/emitter.h index 055c9c7..d512bb8 100644 --- a/include/emitter.h +++ b/include/emitter.h @@ -29,6 +29,7 @@ namespace YAML const std::string GetLastError() const; // global setters + bool SetOutputCharset(EMITTER_MANIP value); bool SetStringFormat(EMITTER_MANIP value); bool SetBoolFormat(EMITTER_MANIP value); bool SetIntBase(EMITTER_MANIP value); diff --git a/include/emittermanip.h b/include/emittermanip.h index 2dd83b5..9284768 100644 --- a/include/emittermanip.h +++ b/include/emittermanip.h @@ -11,6 +11,10 @@ namespace YAML enum EMITTER_MANIP { // general manipulators Auto, + + // output character set + EmitNonAscii, + EscapeNonAscii, // string manipulators // Auto, // duplicate diff --git a/src/emitter.cpp b/src/emitter.cpp index ac305b9..92d76df 100644 --- a/src/emitter.cpp +++ b/src/emitter.cpp @@ -37,6 +37,11 @@ namespace YAML } // global setters + bool Emitter::SetOutputCharset(EMITTER_MANIP value) + { + return m_pState->SetOutputCharset(value, GLOBAL); + } + bool Emitter::SetStringFormat(EMITTER_MANIP value) { return m_pState->SetStringFormat(value, GLOBAL); @@ -485,13 +490,14 @@ namespace YAML PreAtomicWrite(); EmitSeparationIfNecessary(); + bool escapeNonAscii = m_pState->GetOutputCharset() == EscapeNonAscii; EMITTER_MANIP strFmt = m_pState->GetStringFormat(); FLOW_TYPE flowType = m_pState->GetCurGroupFlowType(); unsigned curIndent = m_pState->GetCurIndent(); switch(strFmt) { case Auto: - Utils::WriteString(m_stream, str, flowType == FT_FLOW); + Utils::WriteString(m_stream, str, flowType == FT_FLOW, escapeNonAscii); break; case SingleQuoted: if(!Utils::WriteSingleQuotedString(m_stream, str)) { @@ -500,11 +506,11 @@ namespace YAML } break; case DoubleQuoted: - Utils::WriteDoubleQuotedString(m_stream, str); + Utils::WriteDoubleQuotedString(m_stream, str, escapeNonAscii); break; case Literal: if(flowType == FT_FLOW) - Utils::WriteString(m_stream, str, flowType == FT_FLOW); + Utils::WriteString(m_stream, str, flowType == FT_FLOW, escapeNonAscii); else Utils::WriteLiteralString(m_stream, str, curIndent + m_pState->GetIndent()); break; diff --git a/src/emitterstate.cpp b/src/emitterstate.cpp index 5a2d8bf..2906679 100644 --- a/src/emitterstate.cpp +++ b/src/emitterstate.cpp @@ -9,6 +9,7 @@ namespace YAML m_stateStack.push(ES_WAITING_FOR_DOC); // set default global manipulators + m_charset.set(EmitNonAscii); m_strFmt.set(Auto); m_boolFmt.set(TrueFalseBool); m_boolLengthFmt.set(LongBool); @@ -43,6 +44,7 @@ namespace YAML // . Only the ones that make sense will be accepted void EmitterState::SetLocalValue(EMITTER_MANIP value) { + SetOutputCharset(value, LOCAL); SetStringFormat(value, LOCAL); SetBoolFormat(value, LOCAL); SetBoolCaseFormat(value, LOCAL); @@ -132,6 +134,18 @@ namespace YAML { m_modifiedSettings.clear(); } + + bool EmitterState::SetOutputCharset(EMITTER_MANIP value, FMT_SCOPE scope) + { + switch(value) { + case EmitNonAscii: + case EscapeNonAscii: + _Set(m_charset, value, scope); + return true; + default: + return false; + } + } bool EmitterState::SetStringFormat(EMITTER_MANIP value, FMT_SCOPE scope) { diff --git a/src/emitterstate.h b/src/emitterstate.h index fbabe81..b5be269 100644 --- a/src/emitterstate.h +++ b/src/emitterstate.h @@ -108,6 +108,9 @@ namespace YAML void ClearModifiedSettings(); // formatters + bool SetOutputCharset(EMITTER_MANIP value, FMT_SCOPE scope); + EMITTER_MANIP GetOutputCharset() const { return m_charset.get(); } + bool SetStringFormat(EMITTER_MANIP value, FMT_SCOPE scope); EMITTER_MANIP GetStringFormat() const { return m_strFmt.get(); } @@ -149,6 +152,7 @@ namespace YAML // other state std::stack m_stateStack; + Setting m_charset; Setting m_strFmt; Setting m_boolFmt; Setting m_boolLengthFmt; diff --git a/src/emitterutils.cpp b/src/emitterutils.cpp index f4b7a57..837ae20 100644 --- a/src/emitterutils.cpp +++ b/src/emitterutils.cpp @@ -5,18 +5,129 @@ #include "stringsource.h" #include #include -#include namespace YAML { namespace Utils { namespace { - bool IsPrintable(char ch) { - return (0x20 <= ch && ch <= 0x7E); + enum {REPLACEMENT_CHARACTER = 0xFFFD}; + + bool IsAnchorChar(int ch) { // test for ns-anchor-char + switch (ch) { + case ',': case '[': case ']': case '{': case '}': // c-flow-indicator + case ' ': case '\t': // s-white + case 0xFEFF: // c-byte-order-mark + case 0xA: case 0xD: // b-char + return false; + case 0x85: + return true; + } + + if (ch < 0x20) + return false; + + if (ch < 0x7E) + return true; + + if (ch < 0xA0) + return false; + if (ch >= 0xD800 && ch <= 0xDFFF) + return false; + if ((ch & 0xFFFE) == 0xFFFE) + return false; + if ((ch >= 0xFDD0) && (ch <= 0xFDEF)) + return false; + if (ch > 0x10FFFF) + return false; + + return true; } - bool IsValidPlainScalar(const std::string& str, bool inFlow) { + int Utf8BytesIndicated(char ch) { + int byteVal = static_cast(ch); + switch (byteVal >> 4) { + case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7: + return 1; + case 12: case 13: + return 2; + case 14: + return 3; + case 15: + return 4; + default: + return -1; + } + } + + bool IsTrailingByte(char ch) { + return (ch & 0xC0) == 0x80; + } + + bool GetNextCodePointAndAdvance(int& codePoint, std::string::const_iterator& first, std::string::const_iterator last) { + if (first == last) + return false; + + int nBytes = Utf8BytesIndicated(*first); + if (nBytes < 1) { + // Bad lead byte + ++first; + codePoint = REPLACEMENT_CHARACTER; + return true; + } + + if (nBytes == 1) { + codePoint = *first++; + return true; + } + + // Gather bits from trailing bytes + codePoint = static_cast(*first) & ~(0xFF << (7 - nBytes)); + ++first; + --nBytes; + for (; nBytes > 0; ++first, --nBytes) { + if ((first == last) || !IsTrailingByte(*first)) { + codePoint = REPLACEMENT_CHARACTER; + break; + } + codePoint <<= 6; + codePoint |= *first & 0x3F; + } + + // Check for illegal code points + if (codePoint > 0x10FFFF) + codePoint = REPLACEMENT_CHARACTER; + else if (codePoint >= 0xD800 && codePoint <= 0xDFFF) + codePoint = REPLACEMENT_CHARACTER; + else if ((codePoint & 0xFFFE) == 0xFFFE) + codePoint = REPLACEMENT_CHARACTER; + else if (codePoint >= 0xFDD0 && codePoint <= 0xFDEF) + codePoint = REPLACEMENT_CHARACTER; + return true; + } + + void WriteCodePoint(ostream& out, int codePoint) { + if (codePoint < 0 || codePoint > 0x10FFFF) { + codePoint = REPLACEMENT_CHARACTER; + } + if (codePoint < 0x7F) { + out << static_cast(codePoint); + } else if (codePoint < 0x7FF) { + out << static_cast(0xC0 | (codePoint >> 6)) + << static_cast(0x80 | (codePoint & 0x3F)); + } else if (codePoint < 0xFFFF) { + out << static_cast(0xE0 | (codePoint >> 12)) + << static_cast(0x80 | ((codePoint >> 6) & 0x3F)) + << static_cast(0x80 | (codePoint & 0x3F)); + } else { + out << static_cast(0xF0 | (codePoint >> 18)) + << static_cast(0x80 | ((codePoint >> 12) & 0x3F)) + << static_cast(0x80 | ((codePoint >> 6) & 0x3F)) + << static_cast(0x80 | (codePoint & 0x3F)); + } + } + + bool IsValidPlainScalar(const std::string& str, bool inFlow, bool allowOnlyAscii) { // first check the start const RegEx& start = (inFlow ? Exp::PlainScalarInFlow : Exp::PlainScalar); if(!start.Matches(str)) @@ -29,177 +140,109 @@ namespace YAML // then check until something is disallowed const RegEx& disallowed = (inFlow ? Exp::EndScalarInFlow : Exp::EndScalar) || (Exp::BlankOrBreak + Exp::Comment) - || (!Exp::Printable) + || Exp::NotPrintable + || Exp::Utf8_ByteOrderMark || Exp::Break || Exp::Tab; StringCharSource buffer(str.c_str(), str.size()); while(buffer) { if(disallowed.Matches(buffer)) return false; + if(allowOnlyAscii && (0x7F < static_cast(buffer[0]))) + return false; ++buffer; } return true; } - - typedef unsigned char byte; - byte ToByte(char ch) { return static_cast(ch); } - - typedef std::string::const_iterator StrIter; - std::string WriteUnicode(unsigned value) { - std::stringstream str; - // TODO: for the common escaped characters, give their usual symbol - if(value <= 0xFF) - str << "\\x" << std::hex << std::setfill('0') << std::setw(2) << value; - else if(value <= 0xFFFF) - str << "\\u" << std::hex << std::setfill('0') << std::setw(4) << value; - else - str << "\\U" << std::hex << std::setfill('0') << std::setw(8) << value; - return str.str(); - } - - // GetBytesToRead - // . Returns the length of the UTF-8 sequence starting with 'signal' - int GetBytesToRead(byte signal) { - if(signal <= 0x7F) // ASCII - return 1; - else if(signal <= 0xBF) // invalid first characters - return 0; - else if(signal <= 0xDF) // Note: this allows "overlong" UTF8 (0xC0 - 0xC1) to pass unscathed. OK? - return 2; - else if(signal <= 0xEF) - return 3; - else - return 4; - } - - // ReadBytes - // . Reads the next 'bytesToRead', if we can. - // . Returns zero if we fail, otherwise fills the byte buffer with - // the data and returns the number of bytes read. - int ReadBytes(byte bytes[4], StrIter start, StrIter end, int bytesToRead) { - for(int i=0;i 0; --digits, ++i) { + escSeq[i] = hexDigits[(codePoint >> (4 * (digits - 1))) & 0xF]; + } + + escSeq[i] = 0; // terminate with NUL character + out << escSeq; } - - // IsValidUTF8 - // . Assumes bytes[0] is a valid signal byte with the right size passed - bool IsValidUTF8(byte bytes[4], int size) { - for(int i=1;i= 1) - it += (bytesRead - 1); - } + int codePoint; + for(std::string::const_iterator i = str.begin(); + GetNextCodePointAndAdvance(codePoint, i, str.end()); + ) + { + if (codePoint == '\"') + out << "\\\""; + else if (codePoint == '\\') + out << "\\\\"; + else if (codePoint < 0x20 || (codePoint >= 0x80 && codePoint <= 0xA0)) // Control characters and non-breaking space + WriteDoubleQuoteEscapeSequence(out, codePoint); + else if (codePoint == 0xFEFF) // Byte order marks (ZWNS) should be escaped (YAML 1.2, sec. 5.2) + WriteDoubleQuoteEscapeSequence(out, codePoint); + else if (escapeNonAscii && codePoint > 0x7E) + WriteDoubleQuoteEscapeSequence(out, codePoint); + else + WriteCodePoint(out, codePoint); } out << "\""; return true; @@ -209,11 +252,15 @@ namespace YAML { out << "|\n"; out << IndentTo(indent); - for(std::size_t i=0;i(ch); } // Escape diff --git a/src/exp.h b/src/exp.h index e7a96b5..126c2c9 100644 --- a/src/exp.h +++ b/src/exp.h @@ -26,7 +26,12 @@ namespace YAML const RegEx Alpha = RegEx('a', 'z') || RegEx('A', 'Z'); const RegEx AlphaNumeric = Alpha || Digit; const RegEx Hex = Digit || RegEx('A', 'F') || RegEx('a', 'f'); - const RegEx Printable = RegEx(0x20, 0x7E); + // Valid Unicode code points that are not part of c-printable (YAML 1.2, sec. 5.1) + const RegEx NotPrintable = RegEx(0) || + RegEx("\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x7F", REGEX_OR) || + RegEx(0x0E, 0x1F) || + (RegEx('\xC2') + (RegEx('\x80', '\x84') || RegEx('\x86', '\x9F'))); + const RegEx Utf8_ByteOrderMark = RegEx("\xEF\xBB\xBF"); // actual tags diff --git a/yaml-reader/emittertests.cpp b/yaml-reader/emittertests.cpp index c3209b2..f85af8d 100644 --- a/yaml-reader/emittertests.cpp +++ b/yaml-reader/emittertests.cpp @@ -448,12 +448,25 @@ namespace Test desiredOutput = "- ~\n-\n null value: ~\n ~: null key"; } - void Unicode(YAML::Emitter& out, std::string& desiredOutput) + void EscapedUnicode(YAML::Emitter& out, std::string& desiredOutput) { - out << "\x24 \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2"; + out << YAML::EscapeNonAscii << "\x24 \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2"; desiredOutput = "\"$ \\xa2 \\u20ac \\U00024b62\""; } + + void Unicode(YAML::Emitter& out, std::string& desiredOutput) + { + out << "\x24 \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2"; + desiredOutput = "\x24 \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2"; + } + + void DoubleQuotedUnicode(YAML::Emitter& out, std::string& desiredOutput) + { + out << YAML::DoubleQuoted << "\x24 \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2"; + desiredOutput = "\"\x24 \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2\""; + } + //////////////////////////////////////////////////////////////////////////////////////////////////////// // incorrect emitting @@ -616,7 +629,9 @@ namespace Test RunEmitterTest(&Emitter::SimpleGlobalSettings, "simple global settings", passed, total); RunEmitterTest(&Emitter::ComplexGlobalSettings, "complex global settings", passed, total); RunEmitterTest(&Emitter::Null, "null", passed, total); + RunEmitterTest(&Emitter::EscapedUnicode, "escaped unicode", passed, total); RunEmitterTest(&Emitter::Unicode, "unicode", passed, total); + RunEmitterTest(&Emitter::DoubleQuotedUnicode, "double quoted unicode", passed, total); RunEmitterErrorTest(&Emitter::ExtraEndSeq, "extra EndSeq", passed, total); RunEmitterErrorTest(&Emitter::ExtraEndMap, "extra EndMap", passed, total);