From e98cbe5cc52ddd59e9c5cb3b3fe94a2394d5bccc Mon Sep 17 00:00:00 2001 From: Laurence Withers Date: Tue, 13 Oct 2009 11:20:33 +0000 Subject: [PATCH] Tidy up utf_is*() functions (ABI change) Tidy up the formatting of the character classification functions, including improving the documentation. Furthermore, change the "isvalid" function into more general "isucs4", "isutf32" and "isutf16" functions, which is another ABI change. --- src/docs/char_class_whitespace.dox | 30 ++++++++++ src/libutf8/100_ctype.c | 43 +++++++++------ src/libutf8/100_ctype.h | 88 +++++++++++++++++++++++++----- src/libutf8/300_encode.c | 6 +- src/libutf8/400_decode_state.c | 2 +- 5 files changed, 133 insertions(+), 36 deletions(-) create mode 100644 src/docs/char_class_whitespace.dox diff --git a/src/docs/char_class_whitespace.dox b/src/docs/char_class_whitespace.dox new file mode 100644 index 0000000..78dbb64 --- /dev/null +++ b/src/docs/char_class_whitespace.dox @@ -0,0 +1,30 @@ +/* libutf8/src/docs/MainPage.dox + * + * (c)2006-2009, Laurence Withers, . + * Released under the GNU GPLv3. See file COPYING or + * http://www.gnu.org/copyleft/gpl.html for details. +*/ + +/*! \page char_class_whitespace Character classification: whitespace + +From PropList-4.1.0.txt: + +
0009..000D    ; White_Space # Cc   [5] <control-0009>..<control-000D>
+0020          ; White_Space # Zs       SPACE
+0085          ; White_Space # Cc       <control-0085>
+00A0          ; White_Space # Zs       NO-BREAK SPACE
+1680          ; White_Space # Zs       OGHAM SPACE MARK
+180E          ; White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
+2000..200A    ; White_Space # Zs  [11] EN QUAD..HAIR SPACE
+2028          ; White_Space # Zl       LINE SEPARATOR
+2029          ; White_Space # Zp       PARAGRAPH SEPARATOR
+202F          ; White_Space # Zs       NARROW NO-BREAK SPACE
+205F          ; White_Space # Zs       MEDIUM MATHEMATICAL SPACE
+3000          ; White_Space # Zs       IDEOGRAPHIC SPACE
+ +*/ + +/* options for text editors +kate: replace-trailing-space-save true; space-indent true; tab-width 4; +vim: expandtab:ts=4:sw=4:syntax=doxygen +*/ diff --git a/src/libutf8/100_ctype.c b/src/libutf8/100_ctype.c index 7b15428..d2b19bf 100644 --- a/src/libutf8/100_ctype.c +++ b/src/libutf8/100_ctype.c @@ -5,6 +5,8 @@ * http://www.gnu.org/copyleft/gpl.html for details. */ + + int utf8_isascii(wchar_t ch) { return !(ch & ~0x7F); @@ -12,22 +14,6 @@ int utf8_isascii(wchar_t ch) -/* From PropList-4.1.0.txt (http://www.unicode.org/Public/UNIDATA/) - -0009..000D ; White_Space # Cc [5] .. -0020 ; White_Space # Zs SPACE -0085 ; White_Space # Cc -00A0 ; White_Space # Zs NO-BREAK SPACE -1680 ; White_Space # Zs OGHAM SPACE MARK -180E ; White_Space # Zs MONGOLIAN VOWEL SEPARATOR -2000..200A ; White_Space # Zs [11] EN QUAD..HAIR SPACE -2028 ; White_Space # Zl LINE SEPARATOR -2029 ; White_Space # Zp PARAGRAPH SEPARATOR -202F ; White_Space # Zs NARROW NO-BREAK SPACE -205F ; White_Space # Zs MEDIUM MATHEMATICAL SPACE -3000 ; White_Space # Zs IDEOGRAPHIC SPACE -*/ - int utf8_isspace(wchar_t ch) { return((ch >= 0x0009 && ch <= 0x000D) @@ -46,11 +32,32 @@ int utf8_isspace(wchar_t ch) -int utf8_isvalid(wchar_t ch) +int utf8_isucs4(wchar_t ch) { - return !(ch & (~((wchar_t)0x7FFFFFFF))) && (ch < 0xD800 || ch > 0xDFFF) && (ch != 0xFFFE) && (ch != 0xFFFF); + return !(ch & (~((wchar_t)0x7FFFFFFF))) + && (ch < 0xD800 || ch > 0xDFFF) + && (ch != 0xFFFE) && (ch != 0xFFFF); } + + +int utf8_isutf32(wchar_t ch) +{ + return ch >= 0 && ch <= 0x10FFFF + && (ch < 0xD800 || ch > 0xDFFF) + && (ch != 0xFFFE) && (ch != 0xFFFF); +} + + + +int utf8_isutf16(wchar_t ch) +{ + return ch >= 0 && ch <= 0xFFFD + && (ch < 0xD800 || ch > 0xDFFF); +} + + + /* options for text editors kate: replace-trailing-space-save true; space-indent true; tab-width 4; vim: expandtab:ts=4:sw=4:syntax=c.doxygen diff --git a/src/libutf8/100_ctype.h b/src/libutf8/100_ctype.h index f72d233..61c3624 100644 --- 
a/src/libutf8/100_ctype.h +++ b/src/libutf8/100_ctype.h @@ -5,43 +5,103 @@ * http://www.gnu.org/copyleft/gpl.html for details. */ + + /*! \defgroup ctype Character classification -This module contains functions for character classification. These are basically an extension of the -\c is* functions defined in \c <ctype.h>. +This module contains functions for character classification. These are +semantically equivalent to the \c is* functions defined in \c <ctype.h>, +except that they work on \c wchar_t UCS chars and are independent of the +system's current locale setting. -\todo There are many char classification functions that haven't been implemented yet. These won't be - implemented until they can be done in a proper, Unicode-safe fashion. +\todo There are many char classification functions that haven't been + implemented yet. These will be added on demand. - */ +*/ /*!@{*/ -/*! \biref Returns \c true if \a ch can be represented in ASCII. */ +/*! \brief Test if character is ASCII. + +\param ch Character to test. +\retval nonzero if \a ch is ASCII. +\retval 0 if \a ch is not ASCII. + +This function tests a UCS char to see if it lies within the range of characters +that can be represented by ASCII (i.e. that the value of \a ch lies between 0 +and 127, inclusive). + +*/ int utf8_isascii(wchar_t ch); -/*! \brief Returns \c true if \a ch is whitespace. */ + + +/*! \brief Test if character is whitespace. + +\param ch Character to test. +\retval nonzero if \a ch is whitespace. +\retval 0 if \a ch is not whitespace. + +This function tests a UCS char to see if it should be classified as +\ref char_class_whitespace "whitespace". + +*/ int utf8_isspace(wchar_t ch); -/*! \brief Returns \c true if \a ch is a valid UCS-4 character. + + +/*! \brief Test if character is valid UCS-4 codepoint. \param ch The character to classify. -\retval true If \a ch is a valid UCS-4 character. -\retval false If \a ch is not a valid UCS-4 character. 
+\retval nonzero If \a ch is a valid UCS-4 character. +\retval 0 If \a ch is not a valid UCS-4 character. -This function will examine a \c wchar_t value and determine whether or not it is a valid UCS-4 -character. Valid characters lie in the range 0–0x7FFFFFFF but exclude: +This function will examine a \c wchar_t value and determine whether or not it +is a valid UCS-4 character. Valid characters lie in the range +0–0x7FFFFFFF but exclude: \li the UTF-16 surrogate code points (U+D800–U+DFFF, inclusive) \li the invalid code points U+FFFE and U+FFFF */ -int utf8_isvalid(wchar_t ch); +int utf8_isucs4(wchar_t ch); + + + +/*! \brief Test if character is valid UTF-32 (Unicode) codepoint. + +\param ch The character to classify. +\retval nonzero If \a ch is a valid Unicode character. +\retval 0 If \a ch is not a valid Unicode character. + +This function will examine a \c wchar_t value and determine whether or not it +is a valid Unicode character. Valid characters lie in the range +0–0x10FFFF but exclude: +\li the UTF-16 surrogate code points (U+D800–U+DFFF, inclusive) +\li the invalid code points U+FFFE and U+FFFF + +*/ +int utf8_isutf32(wchar_t ch); + + + +/*! \brief Test if character is valid UTF-16 (Unicode) codepoint. + +\param ch The character to classify. +\retval nonzero If \a ch is a valid Unicode character. +\retval 0 If \a ch is not a valid Unicode character. + +This function will examine a \c wchar_t value and determine whether or not it +is a valid Unicode character that can be represented by a single UTF-16 +codepoint. 
Valid characters lie in the range 0–0xFFFD but exclude: +\li the UTF-16 surrogate code points (U+D800–U+DFFF, inclusive) + +*/ +int utf8_isutf16(wchar_t ch); /*!@}*/ - /* options for text editors kate: replace-trailing-space-save true; space-indent true; tab-width 4; vim: expandtab:ts=4:sw=4:syntax=c.doxygen diff --git a/src/libutf8/300_encode.c b/src/libutf8/300_encode.c index ed0097b..811ab30 100644 --- a/src/libutf8/300_encode.c +++ b/src/libutf8/300_encode.c @@ -11,7 +11,7 @@ char* utf8_encode_char(char* dest, size_t amt, wchar_t ch) errno = EINVAL; return 0; } - if(!utf8_isvalid(ch)) { + if(!utf8_isucs4(ch)) { errno = EILSEQ; return 0; } @@ -78,12 +78,12 @@ char* utf8_encode_char(char* dest, size_t amt, wchar_t ch) char* utf8_encode_char_force(char* dest, size_t amt, wchar_t ch, wchar_t ilseq) { - if(!utf8_isvalid(ilseq)) { + if(!utf8_isucs4(ilseq)) { errno = EILSEQ; return 0; } - return utf8_encode_char(dest, amt, utf8_isvalid(ch) ? ch : ilseq); + return utf8_encode_char(dest, amt, utf8_isucs4(ch) ? ch : ilseq); } diff --git a/src/libutf8/400_decode_state.c b/src/libutf8/400_decode_state.c index 15d331a..4aab0cc 100644 --- a/src/libutf8/400_decode_state.c +++ b/src/libutf8/400_decode_state.c @@ -108,7 +108,7 @@ loop: goto error; } else { // validate codepoint - if(!utf8_isvalid(ctx->statech)) { + if(!utf8_isucs4(ctx->statech)) { error_type = utf8_decode_error_illegal_cp; goto error; }