Refactored the UTF-8 emitting

This commit is contained in:
Jesse Beder
2009-10-08 21:05:56 +00:00
parent d0b5bf4b7b
commit 3e1ba0f3b4

View File

@@ -5,6 +5,7 @@
#include "stringsource.h" #include "stringsource.h"
#include <sstream> #include <sstream>
#include <iomanip> #include <iomanip>
#include <cassert>
namespace YAML namespace YAML
{ {
@@ -41,16 +42,10 @@ namespace YAML
return true; return true;
} }
// byte
// . Raw octet type used by the UTF-8 helpers
typedef unsigned char byte;

// ToByte
// . Reinterprets a (possibly signed) char as its unsigned octet value
byte ToByte(char ch) {
	const byte octet = static_cast<byte>(ch);
	return octet;
}

// StrIter
// . Shorthand for a read-only iterator over a std::string
typedef std::string::const_iterator StrIter;
std::string WriteUnicode(unsigned value) { std::string WriteUnicode(unsigned value) {
std::stringstream str; std::stringstream str;
@@ -64,74 +59,101 @@ namespace YAML
return str.str(); return str.str();
} }
std::string WriteSingleByte(unsigned ch) { // GetBytesToRead
return WriteUnicode(ch); // . Returns the length of the UTF-8 sequence starting with 'signal'
int GetBytesToRead(byte signal) {
if(signal <= 0x7F) // ASCII
return 1;
else if(signal <= 0xBF) // invalid first characters
return 0;
else if(signal <= 0xDF) // Note: this allows "overlong" UTF8 (0xC0 - 0xC1) to pass unscathed. OK?
return 2;
else if(signal <= 0xEF)
return 3;
else
return 4;
} }
std::string WriteTwoBytes(unsigned ch, unsigned ch1) { // ReadBytes
// Note: if no second byte is provided (signalled by ch1 == 0) // . Reads the next 'bytesToRead', if we can.
// then we just write the first one as a single byte. // . Returns zero if we fail, otherwise fills the byte buffer with
// Should we throw an error instead? Or write something else? // the data and returns the number of bytes read.
// (The same question goes for the other WriteNBytes functions) int ReadBytes(byte bytes[4], StrIter start, StrIter end, int bytesToRead) {
if(ch1 == 0) for(int i=0;i<bytesToRead;i++) {
return WriteSingleByte(ch); if(start == end)
return 0;
unsigned value = ((ch - 0xC0) << 6) + (ch1 - 0x80); bytes[i] = ToByte(*start);
return WriteUnicode(value); ++start;
}
return bytesToRead;
} }
std::string WriteThreeBytes(unsigned ch, unsigned ch1, unsigned ch2) { // IsValidUTF8
if(ch1 == 0) // . Assumes bytes[0] is a valid signal byte with the right size passed
return WriteSingleByte(ch); bool IsValidUTF8(byte bytes[4], int size) {
if(ch2 == 0) for(int i=1;i<size;i++)
return WriteSingleByte(ch) + WriteSingleByte(ch1); if(bytes[i] & 0x80 != 0x80)
return false;
unsigned value = ((ch - 0xE0) << 12) + ((ch1 - 0x80) << 6) + (ch2 - 0x80); return true;
return WriteUnicode(value);
} }
std::string WriteFourBytes(unsigned ch, unsigned ch1, unsigned ch2, unsigned ch3) { byte UTF8SignalPrefix(int size) {
if(ch1 == 0) switch(size) {
return WriteSingleByte(ch); case 1: return 0;
if(ch2 == 0) case 2: return 0xC0;
return WriteSingleByte(ch) + WriteSingleByte(ch1); case 3: return 0xE0;
if(ch3 == 0) case 4: return 0xF0;
return WriteSingleByte(ch) + WriteSingleByte(ch1) + WriteSingleByte(ch2); }
assert(false);
return 0;
}
// UTF8ToUnicode
// . Combines the 'size' bytes in 'bytes' into a single value: the signal
//   prefix is subtracted from the first byte, and each following byte
//   contributes its value minus 0x80 as the next six bits.
unsigned UTF8ToUnicode(byte bytes[4], int size) {
	unsigned codePoint = bytes[0] - UTF8SignalPrefix(size);

	int i = 1;
	while(i < size) {
		codePoint <<= 6;
		codePoint += bytes[i] - 0x80;
		++i;
	}

	return codePoint;
}
// ReadUTF8
// . Returns the Unicode code point starting at 'start',
//   and sets 'bytesRead' to the length of the UTF-8 Sequence
// . If it's invalid UTF8 (bad first byte, sequence cut short by 'end',
//   or a bad continuation byte), we set 'bytesRead' to zero and return zero.
// . An empty range (start == end) is treated as invalid as well.
unsigned ReadUTF8(StrIter start, StrIter end, int& bytesRead) {
	// assume failure until the whole sequence has been checked
	bytesRead = 0;

	// never dereference 'start' when there is nothing to read
	if(start == end)
		return 0;

	const int bytesToRead = GetBytesToRead(ToByte(*start));
	if(!bytesToRead)
		return 0;

	byte bytes[4];
	if(!ReadBytes(bytes, start, end, bytesToRead))
		return 0;

	if(!IsValidUTF8(bytes, bytesToRead))
		return 0;

	bytesRead = bytesToRead;
	return UTF8ToUnicode(bytes, bytesToRead);
}
// WriteNonPrintable // WriteNonPrintable
// . Writes the next UTF-8 code point to the stream // . Writes the next UTF-8 code point to the stream
std::string::const_iterator WriteNonPrintable(ostream& out, std::string::const_iterator start, std::string::const_iterator end) { int WriteNonPrintable(ostream& out, StrIter start, StrIter end) {
std::string::const_iterator it = start; int bytesRead = 0;
unsigned ch = ToUnsigned(*it); unsigned value = ReadUTF8(start, end, bytesRead);
if(ch <= 0xC1) {
// this may include invalid first characters (0x80 - 0xBF) if(bytesRead == 0) {
// or "overlong" UTF-8 (0xC0 - 0xC1) // TODO: is it ok to just write the replacement character here,
// We just copy them as bytes // or should we instead write the invalid byte (as \xNN)?
// TODO: should we do something else? throw an error? out << WriteUnicode(0xFFFD);
out << WriteSingleByte(ch); return 1;
return start;
} else if(ch <= 0xDF) {
unsigned ch1 = AdvanceAndGetNextChar(it, end);
out << WriteTwoBytes(ch, ch1);
return it;
} else if(ch <= 0xEF) {
unsigned ch1 = AdvanceAndGetNextChar(it, end);
unsigned ch2 = AdvanceAndGetNextChar(it, end);
out << WriteThreeBytes(ch, ch1, ch2);
return it;
} else {
unsigned ch1 = AdvanceAndGetNextChar(it, end);
unsigned ch2 = AdvanceAndGetNextChar(it, end);
unsigned ch3 = AdvanceAndGetNextChar(it, end);
out << WriteFourBytes(ch, ch1, ch2, ch3);
return it;
} }
return start; out << WriteUnicode(value);
return bytesRead;
} }
} }
@@ -164,7 +186,7 @@ namespace YAML
bool WriteDoubleQuotedString(ostream& out, const std::string& str) bool WriteDoubleQuotedString(ostream& out, const std::string& str)
{ {
out << "\""; out << "\"";
for(std::string::const_iterator it=str.begin();it!=str.end();++it) { for(StrIter it=str.begin();it!=str.end();++it) {
char ch = *it; char ch = *it;
if(IsPrintable(ch)) { if(IsPrintable(ch)) {
if(ch == '\"') if(ch == '\"')
@@ -174,7 +196,9 @@ namespace YAML
else else
out << ch; out << ch;
} else { } else {
it = WriteNonPrintable(out, it, str.end()); int bytesRead = WriteNonPrintable(out, it, str.end());
if(bytesRead >= 1)
it += (bytesRead - 1);
} }
} }
out << "\""; out << "\"";