10#include "lcf/encoder.h"
11#include "lcf/reader_util.h"
12#include "lcf/scope_guard.h"
17#if LCF_SUPPORT_ICU == 1
18# include <unicode/ucsdet.h>
19# include <unicode/ucnv.h>
20#elif LCF_SUPPORT_ICU == 2
22# error "icu.h only supported on Windows"
// Two alternative tests for "enc names UTF-8". What each branch does when the
// test succeeds is not visible in this excerpt.
// ICU variant: ucnv_compareNames() returns 0 when the two converter names are
// considered equal under ICU's alias-name comparison rules.
39 if (ucnv_compareNames(enc.c_str(),
"UTF-8") == 0) {
// Plain-string variant: only these three exact spellings are recognised
// ("65001" matches the numeric value stored in _conv_runtime elsewhere in this file).
// NOTE(review): presumably the build without ICU name matching - confirm.
44 if (enc ==
"utf-8" || enc ==
"UTF-8" || enc ==
"65001") {
51Encoder::Encoder(std::string encoding)
// Reports whether the encoder can be used: true when no encoding name is set
// (_encoding is empty), or when both _conv_storage and _conv_runtime evaluate
// to true (i.e. both converters have been set up).
61bool Encoder::IsOk()
const {
62 return _encoding.empty() || (_conv_storage && _conv_runtime);
// Converts str in place from the storage converter's encoding to the runtime
// converter's encoding.
65void Encoder::Encode(std::string& str) {
// Guard for "no encoding configured" or "nothing to convert". The guarded
// statement is not visible in this excerpt (presumably an early return - confirm).
66 if (_encoding.empty() || str.empty()) {
// Convert() takes (str, destination converter, source converter).
69 Convert(str, _conv_runtime, _conv_storage);
// Converts str in place from the runtime converter's encoding to the storage
// converter's encoding; the mirror image of Encode().
72void Encoder::Decode(std::string& str) {
// Guard for "no encoding configured" or "nothing to convert". The guarded
// statement is not visible in this excerpt (presumably an early return - confirm).
73 if (_encoding.empty() || str.empty()) {
// Convert() takes (str, destination converter, source converter).
76 Convert(str, _conv_storage, _conv_runtime);
// Converter set-up. The enclosing function header and several interior lines
// are not visible in this excerpt, so the notes below describe only what is shown.
80 if (_encoding.empty()) {
// atoi() yields 0 for a non-numeric name, so only names that parse as a
// positive integer are treated as a code page number and mapped through
// ReaderUtil::CodepageToEncoding(). The alternative operand of the conditional
// is not visible here.
84 auto code_page = atoi(_encoding.c_str());
85 const auto& storage_encoding = code_page > 0
86 ? ReaderUtil::CodepageToEncoding(code_page)
// ICU functions that take a UErrorCode* do nothing if it already holds a
// failure, so it must start out as U_ZERO_ERROR.
90 auto status = U_ZERO_ERROR;
// The runtime side is fixed to UTF-8.
91 constexpr auto runtime_encoding =
"UTF-8";
92 auto conv_runtime = ucnv_open(runtime_encoding, &status);
// A null handle means ucnv_open() failed; log the ICU error name.
94 if (conv_runtime ==
nullptr) {
95 Log::Error(
"ucnv_open() error for encoding \"%s\": %s", runtime_encoding, u_errorName(status));
// Clear the status again before the second ucnv_open() call.
98 status = U_ZERO_ERROR;
// Guard whose callback closes conv_runtime (presumably run at scope exit unless
// dismissed - confirm against lcf/scope_guard.h).
// NOTE(review): the lines between the storage failure check and the member
// assignments below are not visible here. Confirm the guard is dismissed before
// conv_runtime is stored in _conv_runtime; otherwise the member would be left
// holding a closed converter.
99 auto sg = makeScopeGuard([&]() { ucnv_close(conv_runtime); });
// Open the storage-side converter using the resolved encoding name.
101 auto conv_storage = ucnv_open(storage_encoding.c_str(), &status);
// Same failure handling as above; the message calls this the "dest" encoding.
103 if (conv_storage ==
nullptr) {
104 Log::Error(
"ucnv_open() error for dest encoding \"%s\": %s", storage_encoding.c_str(), u_errorName(status));
// Both converters opened: store the handles in the members.
110 _conv_runtime = conv_runtime;
111 _conv_storage = conv_storage;
// The body of this check is not visible in this excerpt.
113 if (storage_encoding !=
"windows-1252") {
// Here the members hold integer code page identifiers rather than ICU handles:
// 65001 for the runtime side and 1252 for the storage side.
// NOTE(review): presumably the build configuration without ICU, given the
// LCF_SUPPORT_ICU conditionals at the top of the file - confirm.
117 _conv_runtime = 65001;
118 _conv_storage = 1252;
// Releases both ICU converters held by this object.
// Lines between the statements below are not visible in this excerpt, so any
// surrounding null checks or preprocessor conditionals are not described here.
123void Encoder::Reset() {
// Close the runtime-side converter, then null the member so the closed handle
// cannot be used or closed a second time.
125 ucnv_close(_conv_runtime);
126 _conv_runtime =
nullptr;
// Same for the storage-side converter.
130 ucnv_close(_conv_storage);
131 _conv_storage =
nullptr;
// ICU overload: converts str in place from conv_src's encoding to conv_dst's
// encoding, staging the output in the member _buffer.
//   str      - text to convert; overwritten with the converted bytes.
//   conv_dst - converter for the target encoding.
//   conv_src - converter for the encoding str is currently in.
135void Encoder::Convert(std::string& str, UConverter* conv_dst, UConverter* conv_src) {
// src is just a read-only alias of str; str itself is only overwritten by the
// assign() at the end, after the conversion has consumed it.
136 const auto& src = str;
138 auto status = U_ZERO_ERROR;
// Size the output buffer at four bytes per input byte.
// NOTE(review): presumably meant as a worst-case expansion bound - confirm it
// holds for every storage encoding that can be configured.
139 _buffer.resize(src.size() * 4);
141 const auto* src_p = src.c_str();
142 auto* dst_p = _buffer.data();
// ucnv_convertEx() advances dst_p and src_p as it writes and consumes bytes.
// The four null arguments are the pivot-buffer parameters (pivotStart,
// pivotSource, pivotTarget, pivotLimit). The remaining arguments are not
// visible in this excerpt.
144 ucnv_convertEx(conv_dst, conv_src,
145 &dst_p, dst_p + _buffer.size(),
146 &src_p, src_p + src.size(),
147 nullptr,
nullptr,
nullptr,
nullptr,
// On failure the input text and the ICU error name are logged. Note that the
// message says "encoding" regardless of the conversion direction. Anything
// else done in this branch is not visible in this excerpt.
151 if (U_FAILURE(status)) {
152 Log::Error(
"ucnv_convertEx() error when encoding \"%s\": %s", src.c_str(), u_errorName(status));
// dst_p now points one past the last byte written, so the range
// [_buffer.data(), dst_p) is exactly the converted text.
156 str.assign(_buffer.data(), dst_p);
// Integer code-page overload: hand-written conversion between a single-byte
// encoding and UTF-8, staged in the member _buffer and copied back into str.
//   str      - text to convert; overwritten with the result.
//   conv_dst - target code page; 65001 selects the widening (to UTF-8) path.
//   (third)  - unnamed and therefore unused.
// Several lines (declarations of buf_idx and codepoint, index increments, some
// branch conditions) are not visible in this excerpt.
159void Encoder::Convert(std::string& str,
int conv_dst,
int) {
// Target is UTF-8: widen each input byte.
166 if (conv_dst == 65001) {
// Each input byte yields at most two output bytes; one extra byte is reserved.
169 _buffer.resize(str.size() * 2 + 1);
// Iterating as unsigned char keeps the shifts and masks below free of
// sign extension.
171 for (
unsigned char ch: str) {
// Visible stores: either the byte unchanged, or a two-byte UTF-8 sequence
// (lead byte 0xC0 | top two bits, continuation byte 0x80 | low six bits). The
// condition choosing between them is not visible (presumably ch < 0x80 - confirm).
// NOTE(review): this encodes the code point equal to the byte value, i.e. a
// Latin-1 mapping. Real Windows-1252 assigns different code points to bytes
// 0x80-0x9F, so those bytes will not round-trip as Windows-1252 text.
173 _buffer[buf_idx] =
static_cast<char>(ch);
175 _buffer[buf_idx] =
static_cast<char>(0xC0 | (ch >> 6));
177 _buffer[buf_idx] =
static_cast<char>(0x80 | (ch & 0x3F));
// Other direction: narrow UTF-8 to one byte per code point, so the output is
// never longer than the input (plus one spare byte).
185 _buffer.resize(str.size() + 1);
188 for (
size_t str_idx = 0; str_idx < str.size(); ++str_idx) {
189 unsigned char ch = str[str_idx];
// Classify the byte by value range and update the code point accumulator. The
// first branch of this chain is not visible in this excerpt.
192 }
else if (ch <= 0xBF) {
// 0x80-0xBF: continuation byte, append its six payload bits.
193 codepoint = (codepoint << 6) | (ch & 0x3F);
194 }
else if (ch <= 0xDF) {
// 0xC0-0xDF: lead byte of a two-byte sequence, five payload bits.
195 codepoint = ch & 0x1F;
196 }
else if (ch <= 0xEF) {
// 0xE0-0xEF: lead byte of a three-byte sequence, four payload bits.
197 codepoint = ch & 0x0F;
// Remaining values: treated as the lead byte of a four-byte sequence, three payload bits.
199 codepoint = ch & 0x07;
// Emit only when the tested byte is not a continuation byte and the code point
// is within the Unicode range.
// NOTE(review): the lines just before this check are not visible; confirm
// which byte ch holds here (current byte or a look-ahead at the next one).
203 if (((ch & 0xC0) != 0x80) && (codepoint <= 0x10ffff)) {
// Code points up to 255 fit in a single output byte; anything larger has no
// representation and is replaced by '?'.
204 if (codepoint <= 255) {
205 _buffer[buf_idx] =
static_cast<char>(codepoint);
207 _buffer[buf_idx] =
'?';
// Copy the first buf_idx bytes of the staging buffer back into str.
214 str.assign(_buffer.data(), buf_idx);
static std::string filterUtf8Compatible(std::string enc)