From 114db22335e9e394d5f36e23413d0cc2cd2773a0 Mon Sep 17 00:00:00 2001
From: Jesse Beder
Date: Sun, 23 Mar 2014 20:24:36 -0500
Subject: [PATCH] Fix UTF16 parsing for multi-byte characters

---
 src/stream.cpp                     | 5 +++--
 test/integration/encoding_test.cpp | 8 ++++----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/stream.cpp b/src/stream.cpp
index 4caad85..bba66f0 100644
--- a/src/stream.cpp
+++ b/src/stream.cpp
@@ -365,13 +365,13 @@ void Stream::StreamInUtf16() const {
       }
       unsigned long chLow = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
                             static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
-      if (chLow < 0xDC00 || ch >= 0xE000) {
+      if (chLow < 0xDC00 || chLow >= 0xE000) {
         // Trouble...not a low surrogate. Dump a REPLACEMENT CHARACTER into the
         // stream.
         QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
 
         // Deal with the next UTF-16 unit
-        if (chLow < 0xD800 || ch >= 0xE000) {
+        if (chLow < 0xD800 || chLow >= 0xE000) {
           // Easiest case: queue the codepoint and return
           QueueUnicodeCodepoint(m_readahead, ch);
           return;
@@ -391,6 +391,7 @@ void Stream::StreamInUtf16() const {
 
       // Add the surrogacy offset
       ch += 0x10000;
+      break;
     }
   }
 
diff --git a/test/integration/encoding_test.cpp b/test/integration/encoding_test.cpp
index 1e5e724..46392eb 100644
--- a/test/integration/encoding_test.cpp
+++ b/test/integration/encoding_test.cpp
@@ -139,22 +139,22 @@ TEST_F(EncodingTest, UTF8_BOM) {
   Run();
 }
 
-TEST_F(EncodingTest, DISABLED_UTF16LE_noBOM) {
+TEST_F(EncodingTest, UTF16LE_noBOM) {
   SetUpEncoding(&EncodeToUtf16LE, false);
   Run();
 }
 
-TEST_F(EncodingTest, DISABLED_UTF16LE_BOM) {
+TEST_F(EncodingTest, UTF16LE_BOM) {
   SetUpEncoding(&EncodeToUtf16LE, true);
   Run();
 }
 
-TEST_F(EncodingTest, DISABLED_UTF16BE_noBOM) {
+TEST_F(EncodingTest, UTF16BE_noBOM) {
   SetUpEncoding(&EncodeToUtf16BE, false);
   Run();
 }
 
-TEST_F(EncodingTest, DISABLED_UTF16BE_BOM) {
+TEST_F(EncodingTest, UTF16BE_BOM) {
   SetUpEncoding(&EncodeToUtf16BE, true);
   Run();
 }