mirror of
https://github.com/jbeder/yaml-cpp.git
synced 2025-09-09 12:41:17 +00:00
Refactored the UTF-8 emitting
This commit is contained in:
@@ -5,6 +5,7 @@
|
|||||||
#include "stringsource.h"
|
#include "stringsource.h"
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <iomanip>
|
#include <iomanip>
|
||||||
|
#include <cassert>
|
||||||
|
|
||||||
namespace YAML
|
namespace YAML
|
||||||
{
|
{
|
||||||
@@ -41,17 +42,11 @@ namespace YAML
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// byte
// . Raw UTF-8 code unit; always unsigned so that comparisons against
//   0x80 and above behave the same whether plain 'char' is signed or not.
typedef unsigned char byte;

// ToByte
// . Reinterprets a (possibly signed) char as an unsigned byte
byte ToByte(char ch) { return static_cast<byte>(ch); }

// StrIter
// . Shorthand for the iterator type used to walk the input string
typedef std::string::const_iterator StrIter;
|
||||||
|
|
||||||
std::string WriteUnicode(unsigned value) {
|
std::string WriteUnicode(unsigned value) {
|
||||||
std::stringstream str;
|
std::stringstream str;
|
||||||
// TODO: for the common escaped characters, give their usual symbol
|
// TODO: for the common escaped characters, give their usual symbol
|
||||||
@@ -64,74 +59,101 @@ namespace YAML
|
|||||||
return str.str();
|
return str.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string WriteSingleByte(unsigned ch) {
|
// GetBytesToRead
|
||||||
return WriteUnicode(ch);
|
// . Returns the length of the UTF-8 sequence starting with 'signal'
|
||||||
|
int GetBytesToRead(byte signal) {
|
||||||
|
if(signal <= 0x7F) // ASCII
|
||||||
|
return 1;
|
||||||
|
else if(signal <= 0xBF) // invalid first characters
|
||||||
|
return 0;
|
||||||
|
else if(signal <= 0xDF) // Note: this allows "overlong" UTF8 (0xC0 - 0xC1) to pass unscathed. OK?
|
||||||
|
return 2;
|
||||||
|
else if(signal <= 0xEF)
|
||||||
|
return 3;
|
||||||
|
else
|
||||||
|
return 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string WriteTwoBytes(unsigned ch, unsigned ch1) {
|
// ReadBytes
|
||||||
// Note: if no second byte is provided (signalled by ch1 == 0)
|
// . Reads the next 'bytesToRead', if we can.
|
||||||
// then we just write the first one as a single byte.
|
// . Returns zero if we fail, otherwise fills the byte buffer with
|
||||||
// Should we throw an error instead? Or write something else?
|
// the data and returns the number of bytes read.
|
||||||
// (The same question goes for the other WriteNBytes functions)
|
int ReadBytes(byte bytes[4], StrIter start, StrIter end, int bytesToRead) {
|
||||||
if(ch1 == 0)
|
for(int i=0;i<bytesToRead;i++) {
|
||||||
return WriteSingleByte(ch);
|
if(start == end)
|
||||||
|
return 0;
|
||||||
unsigned value = ((ch - 0xC0) << 6) + (ch1 - 0x80);
|
bytes[i] = ToByte(*start);
|
||||||
return WriteUnicode(value);
|
++start;
|
||||||
|
}
|
||||||
|
return bytesToRead;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string WriteThreeBytes(unsigned ch, unsigned ch1, unsigned ch2) {
|
// IsValidUTF8
|
||||||
if(ch1 == 0)
|
// . Assumes bytes[0] is a valid signal byte with the right size passed
|
||||||
return WriteSingleByte(ch);
|
bool IsValidUTF8(byte bytes[4], int size) {
|
||||||
if(ch2 == 0)
|
for(int i=1;i<size;i++)
|
||||||
return WriteSingleByte(ch) + WriteSingleByte(ch1);
|
if(bytes[i] & 0x80 != 0x80)
|
||||||
|
return false;
|
||||||
unsigned value = ((ch - 0xE0) << 12) + ((ch1 - 0x80) << 6) + (ch2 - 0x80);
|
return true;
|
||||||
return WriteUnicode(value);
|
}
|
||||||
|
|
||||||
|
byte UTF8SignalPrefix(int size) {
|
||||||
|
switch(size) {
|
||||||
|
case 1: return 0;
|
||||||
|
case 2: return 0xC0;
|
||||||
|
case 3: return 0xE0;
|
||||||
|
case 4: return 0xF0;
|
||||||
|
}
|
||||||
|
assert(false);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned UTF8ToUnicode(byte bytes[4], int size) {
|
||||||
|
unsigned value = bytes[0] - UTF8SignalPrefix(size);
|
||||||
|
for(int i=1;i<size;i++)
|
||||||
|
value = (value << 6) + (bytes[i] - 0x80);
|
||||||
|
return value;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string WriteFourBytes(unsigned ch, unsigned ch1, unsigned ch2, unsigned ch3) {
|
// ReadUTF8
|
||||||
if(ch1 == 0)
|
// . Returns the Unicode code point starting at 'start',
|
||||||
return WriteSingleByte(ch);
|
// and sets 'bytesRead' to the length of the UTF-8 Sequence
|
||||||
if(ch2 == 0)
|
// . If it's invalid UTF8, we set 'bytesRead' to zero.
|
||||||
return WriteSingleByte(ch) + WriteSingleByte(ch1);
|
unsigned ReadUTF8(StrIter start, StrIter end, int& bytesRead) {
|
||||||
if(ch3 == 0)
|
int bytesToRead = GetBytesToRead(ToByte(*start));
|
||||||
return WriteSingleByte(ch) + WriteSingleByte(ch1) + WriteSingleByte(ch2);
|
if(!bytesToRead) {
|
||||||
|
bytesRead = 0;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
byte bytes[4];
|
||||||
|
bytesRead = ReadBytes(bytes, start, end, bytesToRead);
|
||||||
|
if(!bytesRead)
|
||||||
|
return 0;
|
||||||
|
|
||||||
unsigned value = ((ch - 0xF0) << 18) + ((ch1 - 0x80) << 12) + ((ch2 - 0x80) << 6) + (ch3 - 0x80);
|
if(!IsValidUTF8(bytes, bytesRead)) {
|
||||||
return WriteUnicode(value);
|
bytesRead = 0;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
return UTF8ToUnicode(bytes, bytesRead);
|
||||||
}
|
}
|
||||||
|
|
||||||
// WriteNonPrintable
|
// WriteNonPrintable
|
||||||
// . Writes the next UTF-8 code point to the stream
|
// . Writes the next UTF-8 code point to the stream
|
||||||
std::string::const_iterator WriteNonPrintable(ostream& out, std::string::const_iterator start, std::string::const_iterator end) {
|
int WriteNonPrintable(ostream& out, StrIter start, StrIter end) {
|
||||||
std::string::const_iterator it = start;
|
int bytesRead = 0;
|
||||||
unsigned ch = ToUnsigned(*it);
|
unsigned value = ReadUTF8(start, end, bytesRead);
|
||||||
if(ch <= 0xC1) {
|
|
||||||
// this may include invalid first characters (0x80 - 0xBF)
|
if(bytesRead == 0) {
|
||||||
// or "overlong" UTF-8 (0xC0 - 0xC1)
|
// TODO: is it ok to just write the replacement character here,
|
||||||
// We just copy them as bytes
|
// or should we instead write the invalid byte (as \xNN)?
|
||||||
// TODO: should we do something else? throw an error?
|
out << WriteUnicode(0xFFFD);
|
||||||
out << WriteSingleByte(ch);
|
return 1;
|
||||||
return start;
|
|
||||||
} else if(ch <= 0xDF) {
|
|
||||||
unsigned ch1 = AdvanceAndGetNextChar(it, end);
|
|
||||||
out << WriteTwoBytes(ch, ch1);
|
|
||||||
return it;
|
|
||||||
} else if(ch <= 0xEF) {
|
|
||||||
unsigned ch1 = AdvanceAndGetNextChar(it, end);
|
|
||||||
unsigned ch2 = AdvanceAndGetNextChar(it, end);
|
|
||||||
out << WriteThreeBytes(ch, ch1, ch2);
|
|
||||||
return it;
|
|
||||||
} else {
|
|
||||||
unsigned ch1 = AdvanceAndGetNextChar(it, end);
|
|
||||||
unsigned ch2 = AdvanceAndGetNextChar(it, end);
|
|
||||||
unsigned ch3 = AdvanceAndGetNextChar(it, end);
|
|
||||||
out << WriteFourBytes(ch, ch1, ch2, ch3);
|
|
||||||
return it;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return start;
|
out << WriteUnicode(value);
|
||||||
|
return bytesRead;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -164,7 +186,7 @@ namespace YAML
|
|||||||
bool WriteDoubleQuotedString(ostream& out, const std::string& str)
|
bool WriteDoubleQuotedString(ostream& out, const std::string& str)
|
||||||
{
|
{
|
||||||
out << "\"";
|
out << "\"";
|
||||||
for(std::string::const_iterator it=str.begin();it!=str.end();++it) {
|
for(StrIter it=str.begin();it!=str.end();++it) {
|
||||||
char ch = *it;
|
char ch = *it;
|
||||||
if(IsPrintable(ch)) {
|
if(IsPrintable(ch)) {
|
||||||
if(ch == '\"')
|
if(ch == '\"')
|
||||||
@@ -174,7 +196,9 @@ namespace YAML
|
|||||||
else
|
else
|
||||||
out << ch;
|
out << ch;
|
||||||
} else {
|
} else {
|
||||||
it = WriteNonPrintable(out, it, str.end());
|
int bytesRead = WriteNonPrintable(out, it, str.end());
|
||||||
|
if(bytesRead >= 1)
|
||||||
|
it += (bytesRead - 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
out << "\"";
|
out << "\"";
|
||||||
|
Reference in New Issue
Block a user