Add support for JSON-compatible string escapes (#485)

For completeness I've implemented escaping for characters outside the
basic multilingual plane, but it doesn't get used (as there's no
EscapeAsAsciiJson emitter option implemented).
This commit is contained in:
Oliver Hamlet
2020-07-14 03:16:34 +01:00
committed by GitHub
parent 370aceeaf8
commit c82d3129dd
6 changed files with 97 additions and 18 deletions

View File

@@ -19,6 +19,7 @@ enum EMITTER_MANIP {
// output character set // output character set
EmitNonAscii, EmitNonAscii,
EscapeNonAscii, EscapeNonAscii,
EscapeAsJson,
// string manipulators // string manipulators
// Auto, // duplicate // Auto, // duplicate

View File

@@ -686,14 +686,27 @@ void Emitter::StartedScalar() { m_pState->StartedScalar(); }
// ******************************************************************************************* // *******************************************************************************************
// overloads of Write // overloads of Write
// Translates an output-charset manipulator into the escaping style handed to
// the scalar writers. EscapeNonAscii and EscapeAsJson select their matching
// styles; every other manipulator yields StringEscaping::None.
StringEscaping::value GetStringEscapingStyle(const EMITTER_MANIP emitterManip) {
  if (emitterManip == EscapeNonAscii) {
    return StringEscaping::NonAscii;
  }
  if (emitterManip == EscapeAsJson) {
    return StringEscaping::JSON;
  }
  return StringEscaping::None;
}
Emitter& Emitter::Write(const std::string& str) { Emitter& Emitter::Write(const std::string& str) {
if (!good()) if (!good())
return *this; return *this;
const bool escapeNonAscii = m_pState->GetOutputCharset() == EscapeNonAscii; StringEscaping::value stringEscaping = GetStringEscapingStyle(m_pState->GetOutputCharset());
const StringFormat::value strFormat = const StringFormat::value strFormat =
Utils::ComputeStringFormat(str, m_pState->GetStringFormat(), Utils::ComputeStringFormat(str, m_pState->GetStringFormat(),
m_pState->CurGroupFlowType(), escapeNonAscii); m_pState->CurGroupFlowType(), stringEscaping == StringEscaping::NonAscii);
if (strFormat == StringFormat::Literal) if (strFormat == StringFormat::Literal)
m_pState->SetMapKeyFormat(YAML::LongKey, FmtScope::Local); m_pState->SetMapKeyFormat(YAML::LongKey, FmtScope::Local);
@@ -708,7 +721,7 @@ Emitter& Emitter::Write(const std::string& str) {
Utils::WriteSingleQuotedString(m_stream, str); Utils::WriteSingleQuotedString(m_stream, str);
break; break;
case StringFormat::DoubleQuoted: case StringFormat::DoubleQuoted:
Utils::WriteDoubleQuotedString(m_stream, str, escapeNonAscii); Utils::WriteDoubleQuotedString(m_stream, str, stringEscaping);
break; break;
case StringFormat::Literal: case StringFormat::Literal:
Utils::WriteLiteralString(m_stream, str, Utils::WriteLiteralString(m_stream, str,
@@ -814,8 +827,10 @@ Emitter& Emitter::Write(char ch) {
if (!good()) if (!good())
return *this; return *this;
PrepareNode(EmitterNodeType::Scalar); PrepareNode(EmitterNodeType::Scalar);
Utils::WriteChar(m_stream, ch); Utils::WriteChar(m_stream, ch, GetStringEscapingStyle(m_pState->GetOutputCharset()));
StartedScalar(); StartedScalar();
return *this; return *this;

View File

@@ -231,6 +231,7 @@ bool EmitterState::SetOutputCharset(EMITTER_MANIP value,
switch (value) { switch (value) {
case EmitNonAscii: case EmitNonAscii:
case EscapeNonAscii: case EscapeNonAscii:
case EscapeAsJson:
_Set(m_charset, value, scope); _Set(m_charset, value, scope);
return true; return true;
default: default:

View File

@@ -218,20 +218,34 @@ bool IsValidLiteralScalar(const std::string& str, FlowType::value flowType,
}); });
} }
// Splits a supplementary-plane code point (U+10000..U+10FFFF) into its UTF-16
// surrogate pair, returned as {lead, trail}.
//
// The lead surrogate is 0xD800 + ((codePoint - 0x10000) >> 10). The 0x10000
// bias is folded into a single pre-shifted offset (0xD7C0), which must be
// ADDED: its low bits overlap (codePoint >> 10), so combining the two with a
// bitwise OR yields a wrong lead surrogate (0xD7D2 instead of 0xD852 for
// U+24B62).
std::pair<uint16_t, uint16_t> EncodeUTF16SurrogatePair(int codePoint) {
  const uint32_t leadOffset = 0xD800 - (0x10000 >> 10);
  const uint32_t cp = static_cast<uint32_t>(codePoint);

  return {
      static_cast<uint16_t>(leadOffset + (cp >> 10)),
      // Subtracting 0x10000 leaves the low ten bits untouched, and 0xDC00 has
      // none of them set, so OR is safe for the trail surrogate.
      static_cast<uint16_t>(0xDC00 | (cp & 0x3FF)),
  };
}
void WriteDoubleQuoteEscapeSequence(ostream_wrapper& out, int codePoint, StringEscaping::value stringEscapingStyle) {
static const char hexDigits[] = "0123456789abcdef"; static const char hexDigits[] = "0123456789abcdef";
out << "\\"; out << "\\";
int digits = 8; int digits = 8;
if (codePoint < 0xFF) { if (codePoint < 0xFF && stringEscapingStyle != StringEscaping::JSON) {
out << "x"; out << "x";
digits = 2; digits = 2;
} else if (codePoint < 0xFFFF) { } else if (codePoint < 0xFFFF) {
out << "u"; out << "u";
digits = 4; digits = 4;
} else { } else if (stringEscapingStyle != StringEscaping::JSON) {
out << "U"; out << "U";
digits = 8; digits = 8;
} else {
auto surrogatePair = EncodeUTF16SurrogatePair(codePoint);
WriteDoubleQuoteEscapeSequence(out, surrogatePair.first, stringEscapingStyle);
WriteDoubleQuoteEscapeSequence(out, surrogatePair.second, stringEscapingStyle);
return;
} }
// Write digits into the escape sequence // Write digits into the escape sequence
@@ -303,7 +317,7 @@ bool WriteSingleQuotedString(ostream_wrapper& out, const std::string& str) {
} }
bool WriteDoubleQuotedString(ostream_wrapper& out, const std::string& str, bool WriteDoubleQuotedString(ostream_wrapper& out, const std::string& str,
bool escapeNonAscii) { StringEscaping::value stringEscaping) {
out << "\""; out << "\"";
int codePoint; int codePoint;
for (std::string::const_iterator i = str.begin(); for (std::string::const_iterator i = str.begin();
@@ -327,16 +341,19 @@ bool WriteDoubleQuotedString(ostream_wrapper& out, const std::string& str,
case '\b': case '\b':
out << "\\b"; out << "\\b";
break; break;
case '\f':
out << "\\f";
break;
default: default:
if (codePoint < 0x20 || if (codePoint < 0x20 ||
(codePoint >= 0x80 && (codePoint >= 0x80 &&
codePoint <= 0xA0)) { // Control characters and non-breaking space codePoint <= 0xA0)) { // Control characters and non-breaking space
WriteDoubleQuoteEscapeSequence(out, codePoint); WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping);
} else if (codePoint == 0xFEFF) { // Byte order marks (ZWNS) should be } else if (codePoint == 0xFEFF) { // Byte order marks (ZWNS) should be
// escaped (YAML 1.2, sec. 5.2) // escaped (YAML 1.2, sec. 5.2)
WriteDoubleQuoteEscapeSequence(out, codePoint); WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping);
} else if (escapeNonAscii && codePoint > 0x7E) { } else if (stringEscaping == StringEscaping::NonAscii && codePoint > 0x7E) {
WriteDoubleQuoteEscapeSequence(out, codePoint); WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping);
} else { } else {
WriteCodePoint(out, codePoint); WriteCodePoint(out, codePoint);
} }
@@ -362,7 +379,7 @@ bool WriteLiteralString(ostream_wrapper& out, const std::string& str,
return true; return true;
} }
bool WriteChar(ostream_wrapper& out, char ch) { bool WriteChar(ostream_wrapper& out, char ch, StringEscaping::value stringEscapingStyle) {
if (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z')) { if (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z')) {
out << ch; out << ch;
} else if (ch == '\"') { } else if (ch == '\"') {
@@ -373,13 +390,17 @@ bool WriteChar(ostream_wrapper& out, char ch) {
out << R"("\n")"; out << R"("\n")";
} else if (ch == '\b') { } else if (ch == '\b') {
out << R"("\b")"; out << R"("\b")";
} else if (ch == '\r') {
out << R"("\r")";
} else if (ch == '\f') {
out << R"("\f")";
} else if (ch == '\\') { } else if (ch == '\\') {
out << R"("\\")"; out << R"("\\")";
} else if (0x20 <= ch && ch <= 0x7e) { } else if (0x20 <= ch && ch <= 0x7e) {
out << "\"" << ch << "\""; out << "\"" << ch << "\"";
} else { } else {
out << "\""; out << "\"";
WriteDoubleQuoteEscapeSequence(out, ch); WriteDoubleQuoteEscapeSequence(out, ch, stringEscapingStyle);
out << "\""; out << "\"";
} }
return true; return true;
@@ -469,7 +490,7 @@ bool WriteTagWithPrefix(ostream_wrapper& out, const std::string& prefix,
bool WriteBinary(ostream_wrapper& out, const Binary& binary) { bool WriteBinary(ostream_wrapper& out, const Binary& binary) {
WriteDoubleQuotedString(out, EncodeBase64(binary.data(), binary.size()), WriteDoubleQuotedString(out, EncodeBase64(binary.data(), binary.size()),
false); StringEscaping::None);
return true; return true;
} }
} // namespace Utils } // namespace Utils

View File

@@ -24,6 +24,10 @@ struct StringFormat {
enum value { Plain, SingleQuoted, DoubleQuoted, Literal }; enum value { Plain, SingleQuoted, DoubleQuoted, Literal };
}; };
// Escaping style passed to the string and character writers.
struct StringEscaping {
  enum value {
    None,      // used for any output charset other than the two below
    NonAscii,  // selected by the EscapeNonAscii manipulator
    JSON       // selected by the EscapeAsJson manipulator
  };
};
namespace Utils { namespace Utils {
StringFormat::value ComputeStringFormat(const std::string& str, StringFormat::value ComputeStringFormat(const std::string& str,
EMITTER_MANIP strFormat, EMITTER_MANIP strFormat,
@@ -32,10 +36,11 @@ StringFormat::value ComputeStringFormat(const std::string& str,
bool WriteSingleQuotedString(ostream_wrapper& out, const std::string& str); bool WriteSingleQuotedString(ostream_wrapper& out, const std::string& str);
bool WriteDoubleQuotedString(ostream_wrapper& out, const std::string& str, bool WriteDoubleQuotedString(ostream_wrapper& out, const std::string& str,
bool escapeNonAscii); StringEscaping::value stringEscaping);
bool WriteLiteralString(ostream_wrapper& out, const std::string& str, bool WriteLiteralString(ostream_wrapper& out, const std::string& str,
std::size_t indent); std::size_t indent);
bool WriteChar(ostream_wrapper& out, char ch); bool WriteChar(ostream_wrapper& out, char ch,
StringEscaping::value stringEscapingStyle);
bool WriteComment(ostream_wrapper& out, const std::string& str, bool WriteComment(ostream_wrapper& out, const std::string& str,
std::size_t postCommentIndent); std::size_t postCommentIndent);
bool WriteAlias(ostream_wrapper& out, const std::string& str); bool WriteAlias(ostream_wrapper& out, const std::string& str);

View File

@@ -816,6 +816,42 @@ TEST_F(EmitterTest, DoubleQuotedUnicode) {
ExpectEmit("\"\x24 \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2\""); ExpectEmit("\"\x24 \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2\"");
} }
// Emits one double-quoted string under the EscapeAsJson output charset.
// The input holds a quote, a backslash, every byte from 0x01 to 0x1F, and a
// few UTF-8 encoded characters (including one 4-byte sequence).
// Expected output: short escapes for quote, backslash, b, t, n, f and r;
// four-hex-digit "u" escapes for the remaining control bytes; and the UTF-8
// sequences written through unchanged.
TEST_F(EmitterTest, EscapedJsonString) {
out.SetStringFormat(DoubleQuoted);
out.SetOutputCharset(EscapeAsJson);
// Byte 0x00 is not part of the input (the literal starts at 0x01).
out << "\" \\ "
"\x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0A \x0B \x0C \x0D \x0E \x0F "
"\x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1A \x1B \x1C \x1D \x1E \x1F "
"\x24 \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2";
// Raw literals keep the expected backslashes readable; the last piece is a
// normal literal because it carries raw UTF-8 bytes and the closing quote.
ExpectEmit(R"("\" \\ \u0001 \u0002 \u0003 \u0004 \u0005 \u0006 \u0007 \b \t )"
R"(\n \u000b \f \r \u000e \u000f \u0010 \u0011 \u0012 \u0013 )"
R"(\u0014 \u0015 \u0016 \u0017 \u0018 \u0019 \u001a \u001b )"
R"(\u001c \u001d \u001e \u001f )"
"$ \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2\"");
}
// Writes three single chars (NUL, form feed, carriage return) as sequence
// entries without setting an output charset. NUL is expected as a two-digit
// hex escape; the other two are expected as their short escapes.
TEST_F(EmitterTest, EscapedCharacters) {
  out << BeginSeq;
  out << '\x00';
  out << '\x0C';
  out << '\x0D';
  out << EndSeq;

  ExpectEmit("- \"\\x00\"\n- \"\\f\"\n- \"\\r\"");
}
// Same three chars as sequence entries, but with the EscapeAsJson output
// charset. NUL is now expected as a four-hex-digit "u" escape; form feed and
// carriage return keep their short escapes.
TEST_F(EmitterTest, CharactersEscapedAsJson) {
  out.SetOutputCharset(EscapeAsJson);

  out << BeginSeq;
  out << '\x00';
  out << '\x0C';
  out << '\x0D';
  out << EndSeq;

  ExpectEmit("- \"\\u0000\"\n- \"\\f\"\n- \"\\r\"");
}
TEST_F(EmitterTest, DoubleQuotedString) { TEST_F(EmitterTest, DoubleQuotedString) {
out << DoubleQuoted << "\" \\ \n \t \r \b \x15 \xEF\xBB\xBF \x24"; out << DoubleQuoted << "\" \\ \n \t \r \b \x15 \xEF\xBB\xBF \x24";
ExpectEmit("\"\\\" \\\\ \\n \\t \\r \\b \\x15 \\ufeff $\""); ExpectEmit("\"\\\" \\\\ \\n \\t \\r \\b \\x15 \\ufeff $\"");