Add support for JSON-compatible string escapes (#485)

For completeness I've implemented escaping for characters outside the
basic multilingual plane, but it doesn't get used (as there's no
EscapeAsAsciiJson emitter option implemented).
This commit is contained in:
Oliver Hamlet
2020-07-14 03:16:34 +01:00
committed by GitHub
parent 370aceeaf8
commit c82d3129dd
6 changed files with 97 additions and 18 deletions

View File

@@ -19,6 +19,7 @@ enum EMITTER_MANIP {
// output character set
EmitNonAscii,
EscapeNonAscii,
EscapeAsJson,
// string manipulators
// Auto, // duplicate

View File

@@ -686,14 +686,27 @@ void Emitter::StartedScalar() { m_pState->StartedScalar(); }
// *******************************************************************************************
// overloads of Write
StringEscaping::value GetStringEscapingStyle(const EMITTER_MANIP emitterManip) {
switch (emitterManip) {
case EscapeNonAscii:
return StringEscaping::NonAscii;
case EscapeAsJson:
return StringEscaping::JSON;
default:
return StringEscaping::None;
break;
}
}
Emitter& Emitter::Write(const std::string& str) {
if (!good())
return *this;
const bool escapeNonAscii = m_pState->GetOutputCharset() == EscapeNonAscii;
StringEscaping::value stringEscaping = GetStringEscapingStyle(m_pState->GetOutputCharset());
const StringFormat::value strFormat =
Utils::ComputeStringFormat(str, m_pState->GetStringFormat(),
m_pState->CurGroupFlowType(), escapeNonAscii);
m_pState->CurGroupFlowType(), stringEscaping == StringEscaping::NonAscii);
if (strFormat == StringFormat::Literal)
m_pState->SetMapKeyFormat(YAML::LongKey, FmtScope::Local);
@@ -708,7 +721,7 @@ Emitter& Emitter::Write(const std::string& str) {
Utils::WriteSingleQuotedString(m_stream, str);
break;
case StringFormat::DoubleQuoted:
Utils::WriteDoubleQuotedString(m_stream, str, escapeNonAscii);
Utils::WriteDoubleQuotedString(m_stream, str, stringEscaping);
break;
case StringFormat::Literal:
Utils::WriteLiteralString(m_stream, str,
@@ -814,8 +827,10 @@ Emitter& Emitter::Write(char ch) {
if (!good())
return *this;
PrepareNode(EmitterNodeType::Scalar);
Utils::WriteChar(m_stream, ch);
Utils::WriteChar(m_stream, ch, GetStringEscapingStyle(m_pState->GetOutputCharset()));
StartedScalar();
return *this;

View File

@@ -231,6 +231,7 @@ bool EmitterState::SetOutputCharset(EMITTER_MANIP value,
switch (value) {
case EmitNonAscii:
case EscapeNonAscii:
case EscapeAsJson:
_Set(m_charset, value, scope);
return true;
default:

View File

@@ -218,20 +218,34 @@ bool IsValidLiteralScalar(const std::string& str, FlowType::value flowType,
});
}
void WriteDoubleQuoteEscapeSequence(ostream_wrapper& out, int codePoint) {
std::pair<uint16_t, uint16_t> EncodeUTF16SurrogatePair(int codePoint) {
const uint32_t leadOffset = 0xD800 - (0x10000 >> 10);
return {
leadOffset | (codePoint >> 10),
0xDC00 | (codePoint & 0x3FF),
};
}
void WriteDoubleQuoteEscapeSequence(ostream_wrapper& out, int codePoint, StringEscaping::value stringEscapingStyle) {
static const char hexDigits[] = "0123456789abcdef";
out << "\\";
int digits = 8;
if (codePoint < 0xFF) {
if (codePoint < 0xFF && stringEscapingStyle != StringEscaping::JSON) {
out << "x";
digits = 2;
} else if (codePoint < 0xFFFF) {
out << "u";
digits = 4;
} else {
} else if (stringEscapingStyle != StringEscaping::JSON) {
out << "U";
digits = 8;
} else {
auto surrogatePair = EncodeUTF16SurrogatePair(codePoint);
WriteDoubleQuoteEscapeSequence(out, surrogatePair.first, stringEscapingStyle);
WriteDoubleQuoteEscapeSequence(out, surrogatePair.second, stringEscapingStyle);
return;
}
// Write digits into the escape sequence
@@ -303,7 +317,7 @@ bool WriteSingleQuotedString(ostream_wrapper& out, const std::string& str) {
}
bool WriteDoubleQuotedString(ostream_wrapper& out, const std::string& str,
bool escapeNonAscii) {
StringEscaping::value stringEscaping) {
out << "\"";
int codePoint;
for (std::string::const_iterator i = str.begin();
@@ -327,16 +341,19 @@ bool WriteDoubleQuotedString(ostream_wrapper& out, const std::string& str,
case '\b':
out << "\\b";
break;
case '\f':
out << "\\f";
break;
default:
if (codePoint < 0x20 ||
(codePoint >= 0x80 &&
codePoint <= 0xA0)) { // Control characters and non-breaking space
WriteDoubleQuoteEscapeSequence(out, codePoint);
WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping);
} else if (codePoint == 0xFEFF) { // Byte order marks (ZWNS) should be
// escaped (YAML 1.2, sec. 5.2)
WriteDoubleQuoteEscapeSequence(out, codePoint);
} else if (escapeNonAscii && codePoint > 0x7E) {
WriteDoubleQuoteEscapeSequence(out, codePoint);
WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping);
} else if (stringEscaping == StringEscaping::NonAscii && codePoint > 0x7E) {
WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping);
} else {
WriteCodePoint(out, codePoint);
}
@@ -362,7 +379,7 @@ bool WriteLiteralString(ostream_wrapper& out, const std::string& str,
return true;
}
bool WriteChar(ostream_wrapper& out, char ch) {
bool WriteChar(ostream_wrapper& out, char ch, StringEscaping::value stringEscapingStyle) {
if (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z')) {
out << ch;
} else if (ch == '\"') {
@@ -373,13 +390,17 @@ bool WriteChar(ostream_wrapper& out, char ch) {
out << R"("\n")";
} else if (ch == '\b') {
out << R"("\b")";
} else if (ch == '\r') {
out << R"("\r")";
} else if (ch == '\f') {
out << R"("\f")";
} else if (ch == '\\') {
out << R"("\\")";
} else if (0x20 <= ch && ch <= 0x7e) {
out << "\"" << ch << "\"";
} else {
out << "\"";
WriteDoubleQuoteEscapeSequence(out, ch);
WriteDoubleQuoteEscapeSequence(out, ch, stringEscapingStyle);
out << "\"";
}
return true;
@@ -469,7 +490,7 @@ bool WriteTagWithPrefix(ostream_wrapper& out, const std::string& prefix,
bool WriteBinary(ostream_wrapper& out, const Binary& binary) {
WriteDoubleQuotedString(out, EncodeBase64(binary.data(), binary.size()),
false);
StringEscaping::None);
return true;
}
} // namespace Utils

View File

@@ -24,6 +24,10 @@ struct StringFormat {
enum value { Plain, SingleQuoted, DoubleQuoted, Literal };
};
struct StringEscaping {
enum value { None, NonAscii, JSON };
};
namespace Utils {
StringFormat::value ComputeStringFormat(const std::string& str,
EMITTER_MANIP strFormat,
@@ -32,10 +36,11 @@ StringFormat::value ComputeStringFormat(const std::string& str,
bool WriteSingleQuotedString(ostream_wrapper& out, const std::string& str);
bool WriteDoubleQuotedString(ostream_wrapper& out, const std::string& str,
bool escapeNonAscii);
StringEscaping::value stringEscaping);
bool WriteLiteralString(ostream_wrapper& out, const std::string& str,
std::size_t indent);
bool WriteChar(ostream_wrapper& out, char ch);
bool WriteChar(ostream_wrapper& out, char ch,
StringEscaping::value stringEscapingStyle);
bool WriteComment(ostream_wrapper& out, const std::string& str,
std::size_t postCommentIndent);
bool WriteAlias(ostream_wrapper& out, const std::string& str);

View File

@@ -813,7 +813,43 @@ TEST_F(EmitterTest, Unicode) {
TEST_F(EmitterTest, DoubleQuotedUnicode) {
out << DoubleQuoted << "\x24 \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2";
ExpectEmit("\"\x24 \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2\"");
ExpectEmit("\"\x24 \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2\"");
}
TEST_F(EmitterTest, EscapedJsonString) {
out.SetStringFormat(DoubleQuoted);
out.SetOutputCharset(EscapeAsJson);
out << "\" \\ "
"\x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0A \x0B \x0C \x0D \x0E \x0F "
"\x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1A \x1B \x1C \x1D \x1E \x1F "
"\x24 \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2";
ExpectEmit(R"("\" \\ \u0001 \u0002 \u0003 \u0004 \u0005 \u0006 \u0007 \b \t )"
R"(\n \u000b \f \r \u000e \u000f \u0010 \u0011 \u0012 \u0013 )"
R"(\u0014 \u0015 \u0016 \u0017 \u0018 \u0019 \u001a \u001b )"
R"(\u001c \u001d \u001e \u001f )"
"$ \xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2\"");
}
TEST_F(EmitterTest, EscapedCharacters) {
out << BeginSeq
<< '\x00'
<< '\x0C'
<< '\x0D'
<< EndSeq;
ExpectEmit("- \"\\x00\"\n- \"\\f\"\n- \"\\r\"");
}
TEST_F(EmitterTest, CharactersEscapedAsJson) {
out.SetOutputCharset(EscapeAsJson);
out << BeginSeq
<< '\x00'
<< '\x0C'
<< '\x0D'
<< EndSeq;
ExpectEmit("- \"\\u0000\"\n- \"\\f\"\n- \"\\r\"");
}
TEST_F(EmitterTest, DoubleQuotedString) {