Tidy up utf_is*() functions (ABI change)

Tidy up the formatting of the character classification functions, including improving the documentation. Furthermore, change the "isvalid" function into more general "isucs4", "isutf32" and "isutf16" functions, which is another ABI change.
2009-10-13 11:20:33 +00:00 · 2009-10-13 11:20:33 +00:00 · e98cbe5cc5
commit e98cbe5cc5
parent dc60dcb2e6
5 changed files with 133 additions and 36 deletions
--- a/src/docs/char_class_whitespace.dox
+++ b/src/docs/char_class_whitespace.dox
@ -0,0 +1,30 @@
 /* libutf8/src/docs/char_class_whitespace.dox
 *
 *  (c)2006-2009, Laurence Withers, <l@lwithers.me.uk>.
 *  Released under the GNU GPLv3. See file COPYING or
 *  http://www.gnu.org/copyleft/gpl.html for details.
 */
 /*! \page char_class_whitespace Character classification: whitespace
 From <a href='http://www.unicode.org/Public/UNIDATA/'>PropList-4.1.0.txt</a>:
 <pre>0009..000D    ; White_Space # Cc   [5] <control-0009>..<control-000D>
 0020          ; White_Space # Zs       SPACE
 0085          ; White_Space # Cc       <control-0085>
 00A0          ; White_Space # Zs       NO-BREAK SPACE
 1680          ; White_Space # Zs       OGHAM SPACE MARK
 180E          ; White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
 2000..200A    ; White_Space # Zs  [11] EN QUAD..HAIR SPACE
 2028          ; White_Space # Zl       LINE SEPARATOR
 2029          ; White_Space # Zp       PARAGRAPH SEPARATOR
 202F          ; White_Space # Zs       NARROW NO-BREAK SPACE
 205F          ; White_Space # Zs       MEDIUM MATHEMATICAL SPACE
 3000          ; White_Space # Zs       IDEOGRAPHIC SPACE</pre>
 */
 /* options for text editors
 kate: replace-trailing-space-save true; space-indent true; tab-width 4;
 vim: expandtab:ts=4:sw=4:syntax=doxygen
 */
--- a/src/libutf8/100_ctype.c
+++ b/src/libutf8/100_ctype.c
@ -5,6 +5,8 @@
 *  http://www.gnu.org/copyleft/gpl.html for details.
 */
 int utf8_isascii(wchar_t ch)
 {
    return !(ch & ~0x7F);
@ -12,22 +14,6 @@ int utf8_isascii(wchar_t ch)
 /* From PropList-4.1.0.txt (http://www.unicode.org/Public/UNIDATA/)
 0009..000D    ; White_Space # Cc   [5] <control-0009>..<control-000D>
 0020          ; White_Space # Zs       SPACE
 0085          ; White_Space # Cc       <control-0085>
 00A0          ; White_Space # Zs       NO-BREAK SPACE
 1680          ; White_Space # Zs       OGHAM SPACE MARK
 180E          ; White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
 2000..200A    ; White_Space # Zs  [11] EN QUAD..HAIR SPACE
 2028          ; White_Space # Zl       LINE SEPARATOR
 2029          ; White_Space # Zp       PARAGRAPH SEPARATOR
 202F          ; White_Space # Zs       NARROW NO-BREAK SPACE
 205F          ; White_Space # Zs       MEDIUM MATHEMATICAL SPACE
 3000          ; White_Space # Zs       IDEOGRAPHIC SPACE
 */
 int utf8_isspace(wchar_t ch)
 {
    return((ch >= 0x0009 && ch <= 0x000D)
@ -46,11 +32,32 @@ int utf8_isspace(wchar_t ch)
-int utf8_isvalid(wchar_t ch)
+int utf8_isucs4(wchar_t ch)
 {
-    return !(ch & (~((wchar_t)0x7FFFFFFF))) && (ch < 0xD800 || ch > 0xDFFF) && (ch != 0xFFFE) && (ch != 0xFFFF);
+    return !(ch & (~((wchar_t)0x7FFFFFFF)))
        && (ch < 0xD800 || ch > 0xDFFF)
        && (ch != 0xFFFE) && (ch != 0xFFFF);
 }
 /* Classify ch as a UTF-32 codepoint: returns 1 when ch lies in
  * 0..0x10FFFF and is neither a UTF-16 surrogate (0xD800..0xDFFF) nor one
  * of the values 0xFFFE / 0xFFFF; returns 0 otherwise.
  */
 int utf8_isutf32(wchar_t ch)
 {
    /* outside the overall 0..0x10FFFF range */
    if(ch < 0 || ch > 0x10FFFF) {
        return 0;
    }

    /* surrogate range is reserved for UTF-16 pairs */
    if(ch >= 0xD800 && ch <= 0xDFFF) {
        return 0;
    }

    /* the two explicitly rejected values */
    if(ch == 0xFFFE || ch == 0xFFFF) {
        return 0;
    }

    return 1;
 }
 /* Classify ch as a value that fits in one UTF-16 unit: returns 1 when ch
  * lies in 0..0xFFFD and is not in the surrogate range 0xD800..0xDFFF;
  * returns 0 otherwise.
  */
 int utf8_isutf16(wchar_t ch)
 {
    /* outside 0..0xFFFD (the upper bound also rejects 0xFFFE and 0xFFFF) */
    if(ch < 0 || ch > 0xFFFD) {
        return 0;
    }

    /* surrogates never stand for a character on their own */
    if(ch >= 0xD800 && ch <= 0xDFFF) {
        return 0;
    }

    return 1;
 }
 /* options for text editors
 kate: replace-trailing-space-save true; space-indent true; tab-width 4;
 vim: expandtab:ts=4:sw=4:syntax=c.doxygen
--- a/src/libutf8/100_ctype.h
+++ b/src/libutf8/100_ctype.h
@ -5,43 +5,103 @@
 *  http://www.gnu.org/copyleft/gpl.html for details.
 */
 /*! \defgroup ctype Character classification
-This module contains functions for character classification. These are basically an extension of the
+This module contains functions for character classification. These are
-\c is* functions defined in \c &lt;ctype.h&gt;.
+semantically equivalent to the \c is* functions defined in \c &lt;ctype.h&gt;,
 except that they work on \c wchar_t UCS chars and are independent of the
 system's current locale setting.
-\todo There are many char classification functions that haven't been implemented yet. These won't be
+\todo There are many char classification functions that haven't been
-    implemented until they can be done in a proper, Unicode-safe fashion.
+      implemented yet. These will be added on demand.
- */
+*/
 /*!@{*/
-/*! \biref Returns \c true if \a ch can be represented in ASCII. */
+/*! \brief Test if character is ASCII.
 \param ch Character to test.
 \retval nonzero if \a ch is ASCII.
 \retval 0 if \a ch is not ASCII.
 This function tests a UCS char to see if it lies within the range of characters
 that can be represented by ASCII (i.e. that the value of \a ch lies between 0
 and 127, inclusive).
 */
 int utf8_isascii(wchar_t ch);
-/*! \brief Returns \c true if \a ch is whitespace. */
+
 /*! \brief Test if character is whitespace.
 \param ch Character to test.
 \retval nonzero if \a ch is whitespace.
 \retval 0 if \a ch is not whitespace.
 This function tests a UCS char to see if it should be classified as 
 \ref char_class_whitespace "whitespace".
 */
 int utf8_isspace(wchar_t ch);
-/*! \brief Returns \c true if \a ch is a valid UCS-4 character.
+
 /*! \brief Test if character is valid UCS-4 codepoint.
 \param ch The character to classify.
-\retval true If \a ch is a valid UCS-4 character.
+\retval nonzero If \a ch is a valid UCS-4 character.
-\retval false If \a ch is not a valid UCS-4 character.
+\retval 0 If \a ch is not a valid UCS-4 character.
-This function will examine a \c wchar_t value and determine whether or not it is a valid UCS-4
+This function will examine a \c wchar_t value and determine whether or not it
-character. Valid characters lie in the range 0&ndash;0x7FFFFFFF but exclude:
+is a valid UCS-4 character. Valid characters lie in the range
 0&ndash;0x7FFFFFFF but exclude:
 \li the UTF-16 surrogate code points (U+D800&ndash;U+DFFF, inclusive)
 \li the invalid code points U+FFFE and U+FFFF
 */
-int utf8_isvalid(wchar_t ch);
+int utf8_isucs4(wchar_t ch);
 /*! \brief Test if character is valid UTF-32 (Unicode) codepoint.
 \param ch The character to classify.
 \retval nonzero If \a ch is a valid Unicode character.
 \retval 0 If \a ch is not a valid Unicode character.
 This function will examine a \c wchar_t value and determine whether or not it
 is a valid Unicode character. Valid characters lie in the range
 0&ndash;0x10FFFF but exclude:
 \li the UTF-16 surrogate code points (U+D800&ndash;U+DFFF, inclusive)
 \li the invalid code points U+FFFE and U+FFFF
 */
 int utf8_isutf32(wchar_t ch);
 /*! \brief Test if character is representable as a single UTF-16 code unit.
 \param ch The character to classify.
 \retval nonzero If \a ch is a valid Unicode character that fits in a single
    UTF-16 code unit.
 \retval 0 If \a ch is not a valid Unicode character, or cannot be represented
    by a single UTF-16 code unit.
 This function will examine a \c wchar_t value and determine whether or not it
 is a valid Unicode character that can be represented by a single UTF-16
 code unit. Valid characters lie in the range 0&ndash;0xFFFD (so the invalid
 code points U+FFFE and U+FFFF are rejected) but exclude:
 \li the UTF-16 surrogate code points (U+D800&ndash;U+DFFF, inclusive)
 */
 int utf8_isutf16(wchar_t ch);
 /*!@}*/
 /* options for text editors
 kate: replace-trailing-space-save true; space-indent true; tab-width 4;
 vim: expandtab:ts=4:sw=4:syntax=c.doxygen
--- a/src/libutf8/300_encode.c
+++ b/src/libutf8/300_encode.c
@ -11,7 +11,7 @@ char* utf8_encode_char(char* dest, size_t amt, wchar_t ch)
        errno = EINVAL;
        return 0;
    }
-    if(!utf8_isvalid(ch)) {
+    if(!utf8_isucs4(ch)) {
        errno = EILSEQ;
        return 0;
    }
@ -78,12 +78,12 @@ char* utf8_encode_char(char* dest, size_t amt, wchar_t ch)
 char* utf8_encode_char_force(char* dest, size_t amt, wchar_t ch, wchar_t ilseq)
 {
-    if(!utf8_isvalid(ilseq)) {
+    if(!utf8_isucs4(ilseq)) {
        errno = EILSEQ;
        return 0;
    }
-    return utf8_encode_char(dest, amt, utf8_isvalid(ch) ? ch : ilseq);
+    return utf8_encode_char(dest, amt, utf8_isucs4(ch) ? ch : ilseq);
 }
--- a/src/libutf8/400_decode_state.c
+++ b/src/libutf8/400_decode_state.c
@ -108,7 +108,7 @@ loop:
                    goto error;
                } else {
                    // validate codepoint
-                    if(!utf8_isvalid(ctx->statech)) {
+                    if(!utf8_isucs4(ctx->statech)) {
                        error_type = utf8_decode_error_illegal_cp;
                        goto error;
                    }