// Unicode escaping helpers used when emitting double-quoted scalars.
//
// Provenance: commit 15d5b2b53365c0c0f69bd9802ad6e78e4630892a by jbeder,
// 2009-10-07, "Fixed the emitter unicode output". The routines below are the
// additions to src/emitterutils.cpp (hunk context: namespace YAML).
//
// Call site added by the same commit in WriteDoubleQuotedString, for a
// character that is not printable:
//     it = WriteNonPrintable(out, it, str.end());
//
// Regression case added by the same commit (yaml-reader/emittertests.cpp,
// test function Unicode, registered under the name "unicode"):
//     emitted string : "\x24 \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2"
//     desired output : "$ \xa2 \u20ac \U00024b62"   (including the quotes)
//
// NOTE(review): `ostream` is spelled unqualified exactly as in the commit;
// which stream type that names depends on declarations not shown here.

// Returns the value of `ch` as a byte in the range 0..255.
// Going through unsigned char first keeps a negative char from turning into
// a huge unsigned value.
unsigned ToUnsigned(char ch) {
    return static_cast<unsigned>(static_cast<unsigned char>(ch));
}

// Steps `it` forward by one and returns the byte it then refers to.
// If `it` already refers to the last element before `end`, `it` is left
// untouched and 0 is returned. A real NUL byte is therefore indistinguishable
// from "no more input"; callers that care must check the input beforehand.
// Precondition: it != end.
unsigned AdvanceAndGetNextChar(std::string::const_iterator& it, std::string::const_iterator end) {
    std::string::const_iterator next = it;
    ++next;
    if(next == end)
        return 0;

    ++it;
    return ToUnsigned(*it);
}

// Formats `value` as a lowercase, zero-padded hex escape:
//   value <= 0xFF    ->  \xNN
//   value <= 0xFFFF  ->  \uNNNN
//   otherwise        ->  \UNNNNNNNN
std::string WriteUnicode(unsigned value) {
    std::stringstream str;
    // TODO: for the common escaped characters, give their usual symbol
    if(value <= 0xFF)
        str << "\\x" << std::hex << std::setfill('0') << std::setw(2) << value;
    else if(value <= 0xFFFF)
        str << "\\u" << std::hex << std::setfill('0') << std::setw(4) << value;
    else
        str << "\\U" << std::hex << std::setfill('0') << std::setw(8) << value;
    return str.str();
}

// Escapes a single byte value.
std::string WriteSingleByte(unsigned ch) {
    return WriteUnicode(ch);
}

// Decodes the two-byte UTF-8 sequence (ch, ch1) and escapes the code point.
// ch1 == 0 means "no second byte was available"; in that case only `ch` is
// escaped, as a single byte. The other WriteNBytes functions treat a zero
// trailing byte the same way and escape the bytes before it one by one.
// The trailing byte is assumed to be a continuation byte (0x80..0xBF);
// WriteNonPrintable verifies this before calling.
std::string WriteTwoBytes(unsigned ch, unsigned ch1) {
    if(ch1 == 0)
        return WriteSingleByte(ch);

    unsigned value = ((ch - 0xC0) << 6) + (ch1 - 0x80);
    return WriteUnicode(value);
}

// Decodes the three-byte UTF-8 sequence (ch, ch1, ch2) and escapes the code
// point. See WriteTwoBytes for the meaning of a zero trailing byte.
std::string WriteThreeBytes(unsigned ch, unsigned ch1, unsigned ch2) {
    if(ch1 == 0)
        return WriteSingleByte(ch);
    if(ch2 == 0)
        return WriteSingleByte(ch) + WriteSingleByte(ch1);

    unsigned value = ((ch - 0xE0) << 12) + ((ch1 - 0x80) << 6) + (ch2 - 0x80);
    return WriteUnicode(value);
}

// Decodes the four-byte UTF-8 sequence (ch, ch1, ch2, ch3) and escapes the
// code point. See WriteTwoBytes for the meaning of a zero trailing byte.
std::string WriteFourBytes(unsigned ch, unsigned ch1, unsigned ch2, unsigned ch3) {
    if(ch1 == 0)
        return WriteSingleByte(ch);
    if(ch2 == 0)
        return WriteSingleByte(ch) + WriteSingleByte(ch1);
    if(ch3 == 0)
        return WriteSingleByte(ch) + WriteSingleByte(ch1) + WriteSingleByte(ch2);

    unsigned value = ((ch - 0xF0) << 18) + ((ch1 - 0x80) << 12) + ((ch2 - 0x80) << 6) + (ch3 - 0x80);
    return WriteUnicode(value);
}

// Number of continuation bytes announced by a UTF-8 lead byte, or 0 when the
// byte cannot start a multi-byte sequence.
unsigned TrailingByteCount(unsigned lead) {
    if(lead < 0xC2)
        return 0;  // single byte, stray continuation (0x80-0xBF) or overlong lead (0xC0-0xC1)
    if(lead <= 0xDF)
        return 1;
    if(lead <= 0xEF)
        return 2;
    if(lead <= 0xF4)
        return 3;
    return 0;      // 0xF5-0xFF would encode values above U+10FFFF
}

// Returns true when the `trailing` bytes after `leadPos` all exist before
// `end` and lie in the ranges allowed for well-formed UTF-8. Nothing is
// consumed; the iterators are only inspected.
bool HasWellFormedTail(unsigned lead, unsigned trailing, std::string::const_iterator leadPos, std::string::const_iterator end) {
    // Only the first trailing byte has a lead-dependent range; it is what
    // rules out overlong forms, surrogates and values above U+10FFFF.
    unsigned lowest = 0x80;
    unsigned highest = 0xBF;
    switch(lead) {
        case 0xE0: lowest = 0xA0; break;   // below this the sequence is overlong
        case 0xED: highest = 0x9F; break;  // above this it encodes a surrogate
        case 0xF0: lowest = 0x90; break;   // below this the sequence is overlong
        case 0xF4: highest = 0x8F; break;  // above this it exceeds U+10FFFF
        default: break;
    }

    std::string::const_iterator pos = leadPos;
    for(unsigned i = 0; i < trailing; ++i) {
        ++pos;
        if(pos == end)
            return false;  // truncated sequence

        const unsigned byte = ToUnsigned(*pos);
        if(byte < lowest || byte > highest)
            return false;

        lowest = 0x80;
        highest = 0xBF;
    }
    return true;
}

// WriteNonPrintable
// . Writes the escape for the next UTF-8 code point starting at `start`
//   to the stream.
// . Returns an iterator to the LAST byte that was consumed (not one past
//   it), so a caller that increments the result lands on the next
//   unprocessed byte.
// . If the bytes at `start` are not a well-formed UTF-8 sequence (invalid
//   lead byte, missing or out-of-range continuation byte, overlong form,
//   surrogate, value above U+10FFFF), only the byte at `start` is escaped,
//   as \xNN, and `start` is returned. The following bytes are left for the
//   caller, so nothing is dropped or folded into a bogus code point.
// . Precondition: start != end.
std::string::const_iterator WriteNonPrintable(ostream& out, std::string::const_iterator start, std::string::const_iterator end) {
    const unsigned lead = ToUnsigned(*start);
    const unsigned trailing = TrailingByteCount(lead);

    if(trailing == 0 || !HasWellFormedTail(lead, trailing, start, end)) {
        out << WriteSingleByte(lead);
        return start;
    }

    // The tail was checked above, so every advance below succeeds and no
    // trailing byte can be 0 (the "missing byte" marker of the writers).
    std::string::const_iterator it = start;
    const unsigned ch1 = AdvanceAndGetNextChar(it, end);
    if(trailing == 1) {
        out << WriteTwoBytes(lead, ch1);
        return it;
    }

    const unsigned ch2 = AdvanceAndGetNextChar(it, end);
    if(trailing == 2) {
        out << WriteThreeBytes(lead, ch1, ch2);
        return it;
    }

    const unsigned ch3 = AdvanceAndGetNextChar(it, end);
    out << WriteFourBytes(lead, ch1, ch2, ch3);
    return it;
}