diff --git a/src/docs/char_class_whitespace.dox b/src/docs/char_class_whitespace.dox new file mode 100644 index 0000000..78dbb64 --- /dev/null +++ b/src/docs/char_class_whitespace.dox @@ -0,0 +1,30 @@ +/* libutf8/src/docs/char_class_whitespace.dox + * + * (c)2006-2009, Laurence Withers, . + * Released under the GNU GPLv3. See file COPYING or + * http://www.gnu.org/copyleft/gpl.html for details. +*/ + +/*! \page char_class_whitespace Character classification: whitespace + +From PropList-4.1.0.txt: + +
0009..000D    ; White_Space # Cc   [5] <control-0009>..<control-000D>
+0020          ; White_Space # Zs       SPACE
+0085          ; White_Space # Cc       <control-0085>
+00A0          ; White_Space # Zs       NO-BREAK SPACE
+1680          ; White_Space # Zs       OGHAM SPACE MARK
+180E          ; White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
+2000..200A    ; White_Space # Zs  [11] EN QUAD..HAIR SPACE
+2028          ; White_Space # Zl       LINE SEPARATOR
+2029          ; White_Space # Zp       PARAGRAPH SEPARATOR
+202F          ; White_Space # Zs       NARROW NO-BREAK SPACE
+205F          ; White_Space # Zs       MEDIUM MATHEMATICAL SPACE
+3000          ; White_Space # Zs       IDEOGRAPHIC SPACE
+ +*/ + +/* options for text editors +kate: replace-trailing-space-save true; space-indent true; tab-width 4; +vim: expandtab:ts=4:sw=4:syntax=doxygen +*/ diff --git a/src/libutf8/100_ctype.c b/src/libutf8/100_ctype.c index 7b15428..d2b19bf 100644 --- a/src/libutf8/100_ctype.c +++ b/src/libutf8/100_ctype.c @@ -5,6 +5,8 @@ * http://www.gnu.org/copyleft/gpl.html for details. */ + + int utf8_isascii(wchar_t ch) { return !(ch & ~0x7F); @@ -12,22 +14,6 @@ int utf8_isascii(wchar_t ch) -/* From PropList-4.1.0.txt (http://www.unicode.org/Public/UNIDATA/) - -0009..000D ; White_Space # Cc [5] .. -0020 ; White_Space # Zs SPACE -0085 ; White_Space # Cc -00A0 ; White_Space # Zs NO-BREAK SPACE -1680 ; White_Space # Zs OGHAM SPACE MARK -180E ; White_Space # Zs MONGOLIAN VOWEL SEPARATOR -2000..200A ; White_Space # Zs [11] EN QUAD..HAIR SPACE -2028 ; White_Space # Zl LINE SEPARATOR -2029 ; White_Space # Zp PARAGRAPH SEPARATOR -202F ; White_Space # Zs NARROW NO-BREAK SPACE -205F ; White_Space # Zs MEDIUM MATHEMATICAL SPACE -3000 ; White_Space # Zs IDEOGRAPHIC SPACE -*/ - int utf8_isspace(wchar_t ch) { return((ch >= 0x0009 && ch <= 0x000D) @@ -46,11 +32,32 @@ int utf8_isspace(wchar_t ch) -int utf8_isvalid(wchar_t ch) +int utf8_isucs4(wchar_t ch) { - return !(ch & (~((wchar_t)0x7FFFFFFF))) && (ch < 0xD800 || ch > 0xDFFF) && (ch != 0xFFFE) && (ch != 0xFFFF); + return !(ch & (~((wchar_t)0x7FFFFFFF))) + && (ch < 0xD800 || ch > 0xDFFF) + && (ch != 0xFFFE) && (ch != 0xFFFF); } + + +int utf8_isutf32(wchar_t ch) +{ + return ch >= 0 && ch <= 0x10FFFF + && (ch < 0xD800 || ch > 0xDFFF) + && (ch != 0xFFFE) && (ch != 0xFFFF); +} + + + +int utf8_isutf16(wchar_t ch) +{ + return ch >= 0 && ch <= 0xFFFD + && (ch < 0xD800 || ch > 0xDFFF); +} + + + /* options for text editors kate: replace-trailing-space-save true; space-indent true; tab-width 4; vim: expandtab:ts=4:sw=4:syntax=c.doxygen diff --git a/src/libutf8/100_ctype.h b/src/libutf8/100_ctype.h index f72d233..61c3624 100644 --- 
a/src/libutf8/100_ctype.h +++ b/src/libutf8/100_ctype.h @@ -5,43 +5,103 @@ * http://www.gnu.org/copyleft/gpl.html for details. */ + + /*! \defgroup ctype Character classification -This module contains functions for character classification. These are basically an extension of the -\c is* functions defined in \c <ctype.h>. +This module contains functions for character classification. These are +semantically equivalent to the \c is* functions defined in \c <ctype.h>, +except that they work on \c wchar_t UCS chars and are independent of the +system's current locale setting. -\todo There are many char classification functions that haven't been implemented yet. These won't be - implemented until they can be done in a proper, Unicode-safe fashion. +\todo There are many char classification functions that haven't been + implemented yet. These will be added on demand. - */ +*/ /*!@{*/ -/*! \biref Returns \c true if \a ch can be represented in ASCII. */ +/*! \brief Test if character is ASCII. + +\param ch Character to test. +\retval nonzero if \a ch is ASCII. +\retval 0 if \a ch is not ASCII. + +This function tests a UCS char to see if it lies within the range of characters +that can be represented by ASCII (i.e. that the value of \a ch lies between 0 +and 127, inclusive). + +*/ int utf8_isascii(wchar_t ch); -/*! \brief Returns \c true if \a ch is whitespace. */ + + +/*! \brief Test if character is whitespace. + +\param ch Character to test. +\retval nonzero if \a ch is whitespace. +\retval 0 if \a ch is not whitespace. + +This function tests a UCS char to see if it should be classified as +\ref char_class_whitespace "whitespace". + +*/ int utf8_isspace(wchar_t ch); -/*! \brief Returns \c true if \a ch is a valid UCS-4 character. + + +/*! \brief Test if character is valid UCS-4 codepoint. \param ch The character to classify. -\retval true If \a ch is a valid UCS-4 character. -\retval false If \a ch is not a valid UCS-4 character. 
+\retval nonzero If \a ch is a valid UCS-4 character. +\retval 0 If \a ch is not a valid UCS-4 character. -This function will examine a \c wchar_t value and determine whether or not it is a valid UCS-4 -character. Valid characters lie in the range 0–0x7FFFFFFF but exclude: +This function will examine a \c wchar_t value and determine whether or not it +is a valid UCS-4 character. Valid characters lie in the range +0–0x7FFFFFFF but exclude: \li the UTF-16 surrogate code points (U+D800–U+DFFF, inclusive) \li the invalid code points U+FFFE and U+FFFF */ -int utf8_isvalid(wchar_t ch); +int utf8_isucs4(wchar_t ch); + + + +/*! \brief Test if character is valid UTF-32 (Unicode) codepoint. + +\param ch The character to classify. +\retval nonzero If \a ch is a valid Unicode character. +\retval 0 If \a ch is not a valid Unicode character. + +This function will examine a \c wchar_t value and determine whether or not it +is a valid Unicode character. Valid characters lie in the range +0–0x10FFFF but exclude: +\li the UTF-16 surrogate code points (U+D800–U+DFFF, inclusive) +\li the invalid code points U+FFFE and U+FFFF + +*/ +int utf8_isutf32(wchar_t ch); + + + +/*! \brief Test if character is valid UTF-16 (Unicode) codepoint. + +\param ch The character to classify. +\retval nonzero If \a ch is a valid Unicode character. +\retval 0 If \a ch is not a valid Unicode character. + +This function will examine a \c wchar_t value and determine whether or not it +is a valid Unicode character that can be represented by a single UTF-16 +codepoint. 
Valid characters lie in the range 0–0xFFFD but exclude: +\li the UTF-16 surrogate code points (U+D800–U+DFFF, inclusive) + +*/ +int utf8_isutf16(wchar_t ch); /*!@}*/ - /* options for text editors kate: replace-trailing-space-save true; space-indent true; tab-width 4; vim: expandtab:ts=4:sw=4:syntax=c.doxygen diff --git a/src/libutf8/300_encode.c b/src/libutf8/300_encode.c index ed0097b..811ab30 100644 --- a/src/libutf8/300_encode.c +++ b/src/libutf8/300_encode.c @@ -11,7 +11,7 @@ char* utf8_encode_char(char* dest, size_t amt, wchar_t ch) errno = EINVAL; return 0; } - if(!utf8_isvalid(ch)) { + if(!utf8_isucs4(ch)) { errno = EILSEQ; return 0; } @@ -78,12 +78,12 @@ char* utf8_encode_char(char* dest, size_t amt, wchar_t ch) char* utf8_encode_char_force(char* dest, size_t amt, wchar_t ch, wchar_t ilseq) { - if(!utf8_isvalid(ilseq)) { + if(!utf8_isucs4(ilseq)) { errno = EILSEQ; return 0; } - return utf8_encode_char(dest, amt, utf8_isvalid(ch) ? ch : ilseq); + return utf8_encode_char(dest, amt, utf8_isucs4(ch) ? ch : ilseq); } diff --git a/src/libutf8/400_decode_state.c b/src/libutf8/400_decode_state.c index 15d331a..4aab0cc 100644 --- a/src/libutf8/400_decode_state.c +++ b/src/libutf8/400_decode_state.c @@ -108,7 +108,7 @@ loop: goto error; } else { // validate codepoint - if(!utf8_isvalid(ctx->statech)) { + if(!utf8_isucs4(ctx->statech)) { error_type = utf8_decode_error_illegal_cp; goto error; }