UCommon
unicode.h
Go to the documentation of this file.
1// Copyright (C) 2009-2014 David Sugar, Tycho Softworks.
2// Copyright (C) 2015 Cherokees of Idaho.
3//
4// This file is part of GNU uCommon C++.
5//
6// GNU uCommon C++ is free software: you can redistribute it and/or modify
7// it under the terms of the GNU Lesser General Public License as published
8// by the Free Software Foundation, either version 3 of the License, or
9// (at your option) any later version.
10//
11// GNU uCommon C++ is distributed in the hope that it will be useful,
12// but WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14// GNU Lesser General Public License for more details.
15//
16// You should have received a copy of the GNU Lesser General Public License
17// along with GNU uCommon C++. If not, see <http://www.gnu.org/licenses/>.
18
33#ifndef _UCOMMON_UNICODE_H_
34#define _UCOMMON_UNICODE_H_
35
36#ifndef _UCOMMON_STRING_H_
37#include <ucommon/string.h>
38#endif
39
40#ifdef nil
41#undef nil
42#endif
43
44namespace ucommon {
45
/**
 * 32 bit unicode character code.
 */
typedef int32_t ucs4_t;

/**
 * 16 bit unicode character code.
 */
typedef int16_t ucs2_t;

/**
 * Untyped pointer used for unicode data.  Resolves issues where wchar_t
 * is not defined.
 */
typedef void *unicode_t;
61
67class __EXPORT utf8
68{
69protected:
70 inline utf8() {};
71
72 inline utf8(const utf8& copy) {};
73
74public:
78 static const unsigned ucsize;
79
83 static const char *nil;
84
90 static unsigned size(const char *codepoint);
91
97 static size_t count(const char *string);
98
105 static char *offset(char *string, ssize_t position);
106
112 static ucs4_t codepoint(const char *encoded);
113
119 static size_t chars(const unicode_t string);
120
126 static size_t chars(ucs4_t character);
127
134 static size_t unpack(const unicode_t string, char *text, size_t size);
135
143 static size_t pack(unicode_t unicode, const char *cp, size_t len);
144
148 static ucs4_t *udup(const char *string);
149
153 static ucs2_t *wdup(const char *string);
154
162 static const char *find(const char *string, ucs4_t character, size_t start = 0);
163
171 static const char *rfind(const char *string, ucs4_t character, size_t end = (size_t)-1l);
172
179 static unsigned ccount(const char *string, ucs4_t character);
180
186 static ucs4_t get(const char *cp);
187
194 static void put(ucs4_t character, char *buf);
195};
196
203class __EXPORT UString : public String, public utf8
204{
205protected:
210
215 UString(size_t size);
216
221 UString(const unicode_t text);
222
229 UString(const char *text, size_t size);
230
237 UString(const unicode_t *text, const unicode_t *end);
238
244 UString(const UString& existing);
245
250 virtual ~UString();
251
258 UString get(size_t codepoint, size_t size = 0) const;
259
266 size_t get(unicode_t unicode, size_t size) const;
267
272 void set(const unicode_t unicode);
273
278 void add(const unicode_t unicode);
279
285 ucs4_t at(int position) const;
286
293 inline size_t operator()(unicode_t unicode, size_t size) const {
294 return get(unicode, size);
295 }
296
303 UString operator()(int codepoint, size_t size) const;
304
310 inline UString left(size_t size) const {
311 return operator()(0, size);
312 }
313
319 inline UString right(size_t offset) const {
320 return operator()(-((int)offset), 0);
321 }
322
329 inline UString copy(size_t offset, size_t size) const {
330 return operator()((int)offset, size);
331 }
332
338 void cut(size_t offset, size_t size = 0);
339
346 void paste(size_t offset, const char *text, size_t size = 0);
347
355 const char *operator()(int offset) const;
356
362 inline ucs4_t operator[](int position) const {
363 return UString::at(position);
364 }
365
370 inline size_t count(void) const {
371 return (size_t)utf8::count(str->text);
372 }
373
379 unsigned ccount(ucs4_t character) const;
380
387 const char *find(ucs4_t character, size_t start = 0) const;
388
395 const char *rfind(ucs4_t character, size_t end = npos) const;
396};
397
403class __EXPORT utf8_pointer
404{
405protected:
406 uint8_t *text;
407
408public:
413
418 utf8_pointer(const char *string);
419
425
430 utf8_pointer& operator ++();
431
436 utf8_pointer& operator --();
437
443 utf8_pointer& operator +=(long offset);
444
450 utf8_pointer& operator -=(long offset);
451
457 utf8_pointer operator+(long offset) const;
458
464 utf8_pointer operator-(long offset) const;
465
470 inline operator bool() const {
471 return text != NULL;
472 }
473
478 inline bool operator!() const {
479 return text == NULL;
480 }
481
487 ucs4_t operator[](long codepoint) const;
488
494 utf8_pointer& operator=(const char *string);
495
499 void inc(void);
500
504 void dec(void);
505
511 inline bool operator==(const char *string) const {
512 return (const char *)text == string;
513 }
514
520 inline bool operator!=(const char *string) const {
521 return (const char *)text != string;
522 }
523
528 inline ucs4_t operator*() const {
529 return utf8::codepoint((const char *)text);
530 }
531
536 inline char *c_str(void) const {
537 return (char *)text;
538 }
539
544 inline operator char*() const {
545 return (char *)text;
546 }
547
552 inline size_t len(void) const {
553 return utf8::count((const char *)text);
554 }
555};
556
557inline ucs4_t *strudup(const char *string) {
558 return utf8::udup(string);
559}
560
561inline ucs2_t *strwdup(const char *string) {
562 return utf8::wdup(string);
563}
564
565__EXPORT unicode_t unidup(const char *string);
566
567template<>
568inline void dupfree<ucs2_t*>(ucs2_t *string) {
569 ::free(string);
570}
571
572template<>
573inline void dupfree<ucs4_t*>(ucs4_t *string) {
574 ::free(string);
575}
576
577template<>
578inline void dupfree<unicode_t>(unicode_t string) {
579 ::free(string);
580}
581
586
591
592} // namespace ucommon
593
594#endif
Common namespace for all ucommon objects.
Definition access.h:47
UString ustring_t
Convenience type for utf8 encoded strings.
Definition unicode.h:585
utf8_pointer utf8_t
Convenience type for utf8_pointer strings.
Definition unicode.h:590
T copy(const T &src)
Convenience function to copy objects.
Definition generics.h:395
int32_t ucs4_t
32 bit unicode character code.
Definition unicode.h:50
int16_t ucs2_t
16 bit unicode character code.
Definition unicode.h:55
void * unicode_t
Resolves issues where wchar_t is not defined.
Definition unicode.h:60
A copy-on-write string class that operates by reference count.
Definition string.h:79
A core class of utf8 encoded string functions.
Definition unicode.h:68
static ucs4_t get(const char *cp)
Get a unicode character from a character protocol.
static size_t chars(ucs4_t character)
How many chars are required to encode a given unicode character.
static void put(ucs4_t character, char *buf)
Push a unicode character to a character protocol.
static size_t count(const char *string)
Count utf8 encoded ucs4 codepoints in string.
static const char * nil
A convenient NULL pointer value.
Definition unicode.h:83
static char * offset(char *string, ssize_t position)
Get codepoint offset in a string.
static ucs2_t * wdup(const char *string)
Dup a utf8 string into a ucs2_t representation.
static const unsigned ucsize
Size of "unicode_t" character codes, may not be ucs4_t size.
Definition unicode.h:78
static unsigned ccount(const char *string, ucs4_t character)
Count occurrences of a unicode character in string.
static unsigned size(const char *codepoint)
Compute character size of utf8 string codepoint.
static const char * rfind(const char *string, ucs4_t character, size_t end=(size_t) -1l)
Find last occurrence of character in string.
static ucs4_t codepoint(const char *encoded)
Convert a utf8 encoded codepoint to a ucs4 character value.
static const char * find(const char *string, ucs4_t character, size_t start=0)
Find first occurrence of character in string.
static size_t chars(const unicode_t string)
How many chars are required to encode a given wchar string.
static size_t unpack(const unicode_t string, char *text, size_t size)
Convert a unicode string into utf8.
static ucs4_t * udup(const char *string)
Dup a utf8 string into a ucs4_t string.
static size_t pack(unicode_t unicode, const char *cp, size_t len)
Convert a utf8 string into a unicode data buffer.
A copy-on-write utf8 string class that operates by reference count.
Definition unicode.h:204
const char * find(ucs4_t character, size_t start=0) const
Find first occurrence of character in string.
UString(size_t size)
Create an empty string with a buffer pre-allocated to a specified size.
void set(const unicode_t unicode)
Set a utf8 encoded string based on unicode data.
UString operator()(int codepoint, size_t size) const
Get a new substring through object expression.
UString(const unicode_t *text, const unicode_t *end)
Create a string for a substring.
virtual ~UString()
Destroy string.
ucs4_t at(int position) const
Return unicode character found at a specific codepoint in the string.
size_t get(unicode_t unicode, size_t size) const
Extract a unicode byte sequence from utf8 object.
UString(const unicode_t text)
Create a utf8 aware string for a null terminated unicode string.
void cut(size_t offset, size_t size=0)
Cut (remove) text from string using codepoint offsets.
UString left(size_t size) const
Convenience method for left of string.
Definition unicode.h:310
size_t count(void) const
Count codepoints in current string.
Definition unicode.h:370
size_t operator()(unicode_t unicode, size_t size) const
Extract a unicode byte sequence from utf8 object.
Definition unicode.h:293
UString(const char *text, size_t size)
Create a string from null terminated text up to a maximum specified size.
UString(const UString &existing)
Construct a copy of a string object.
const char * operator()(int offset) const
Reference a string in the object by codepoint offset.
ucs4_t operator[](int position) const
Reference a unicode character in string object by array offset.
Definition unicode.h:362
UString copy(size_t offset, size_t size) const
Convenience method for substring extraction.
Definition unicode.h:329
UString get(size_t codepoint, size_t size=0) const
Get a new string object as a substring of the current object.
void add(const unicode_t unicode)
Add (append) unicode to a utf8 encoded string.
void paste(size_t offset, const char *text, size_t size=0)
Insert (paste) text into string using codepoint offsets.
const char * rfind(ucs4_t character, size_t end=npos) const
Find last occurrence of character in string.
UString right(size_t offset) const
Convenience method for right of string.
Definition unicode.h:319
unsigned ccount(ucs4_t character) const
Count occurrences of a unicode character in string.
UString()
Create a new empty utf8 aware string object.
Pointer to utf8 encoded character data.
Definition unicode.h:404
utf8_pointer operator-(long offset) const
Get new utf8 string after subtracting a codepoint offset.
void inc(void)
Iterative increment of a utf8 pointer to next codepoint.
ucs4_t operator[](long codepoint) const
Extract a unicode character from a specified codepoint.
size_t len(void) const
Get length of null terminated utf8 string in codepoints.
Definition unicode.h:552
bool operator==(const char *string) const
Check if pointer equals another string.
Definition unicode.h:511
char * c_str(void) const
Get c string we point to.
Definition unicode.h:536
utf8_pointer operator+(long offset) const
Get new utf8 string after adding a codepoint offset.
utf8_pointer & operator=(const char *string)
Assign a utf8 string to point to.
bool operator!() const
Check if text is an invalid pointer.
Definition unicode.h:478
utf8_pointer(const utf8_pointer &copy)
Create a utf8 pointer as a copy of existing utf8 pointer.
ucs4_t operator*() const
Get unicode character pointed to by pointer.
Definition unicode.h:528
bool operator!=(const char *string) const
Check if pointer does not equal another string.
Definition unicode.h:520
void dec(void)
Iterative decrement of a utf8 pointer to prior codepoint.
utf8_pointer(const char *string)
Create a utf8 pointer for an existing char pointer.
utf8_pointer()
Create a utf8 pointer set to NULL.
A common string class and character string support functions.