Tidy up utf_is*() functions (ABI change)

Tidy up the formatting of the character classification functions, including
improving the documentation. Furthermore, change the "isvalid" function into
more general "isucs4", "isutf32" and "isutf16" functions, which is another
ABI change.
This commit is contained in:
Laurence Withers 2009-10-13 11:20:33 +00:00
parent dc60dcb2e6
commit e98cbe5cc5
5 changed files with 133 additions and 36 deletions

View File

@ -0,0 +1,30 @@
/* libutf8/src/docs/MainPage.dox
*
* (c)2006-2009, Laurence Withers, <l@lwithers.me.uk>.
* Released under the GNU GPLv3. See file COPYING or
* http://www.gnu.org/copyleft/gpl.html for details.
*/
/*! \page char_class_whitespace Character classification: whitespace
From <a href='http://www.unicode.org/Public/UNIDATA/'>PropList-4.1.0.txt</a>:
<pre>0009..000D ; White_Space # Cc [5] <control-0009>..<control-000D>
0020 ; White_Space # Zs SPACE
0085 ; White_Space # Cc <control-0085>
00A0 ; White_Space # Zs NO-BREAK SPACE
1680 ; White_Space # Zs OGHAM SPACE MARK
180E ; White_Space # Zs MONGOLIAN VOWEL SEPARATOR
2000..200A ; White_Space # Zs [11] EN QUAD..HAIR SPACE
2028 ; White_Space # Zl LINE SEPARATOR
2029 ; White_Space # Zp PARAGRAPH SEPARATOR
202F ; White_Space # Zs NARROW NO-BREAK SPACE
205F ; White_Space # Zs MEDIUM MATHEMATICAL SPACE
3000 ; White_Space # Zs IDEOGRAPHIC SPACE</pre>
*/
/* options for text editors
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
vim: expandtab:ts=4:sw=4:syntax=doxygen
*/

View File

@ -5,6 +5,8 @@
* http://www.gnu.org/copyleft/gpl.html for details.
*/
int utf8_isascii(wchar_t ch)
{
return !(ch & ~0x7F);
@ -12,22 +14,6 @@ int utf8_isascii(wchar_t ch)
/* From PropList-4.1.0.txt (http://www.unicode.org/Public/UNIDATA/)
0009..000D ; White_Space # Cc [5] <control-0009>..<control-000D>
0020 ; White_Space # Zs SPACE
0085 ; White_Space # Cc <control-0085>
00A0 ; White_Space # Zs NO-BREAK SPACE
1680 ; White_Space # Zs OGHAM SPACE MARK
180E ; White_Space # Zs MONGOLIAN VOWEL SEPARATOR
2000..200A ; White_Space # Zs [11] EN QUAD..HAIR SPACE
2028 ; White_Space # Zl LINE SEPARATOR
2029 ; White_Space # Zp PARAGRAPH SEPARATOR
202F ; White_Space # Zs NARROW NO-BREAK SPACE
205F ; White_Space # Zs MEDIUM MATHEMATICAL SPACE
3000 ; White_Space # Zs IDEOGRAPHIC SPACE
*/
int utf8_isspace(wchar_t ch)
{
return((ch >= 0x0009 && ch <= 0x000D)
@ -46,11 +32,32 @@ int utf8_isspace(wchar_t ch)
int utf8_isvalid(wchar_t ch)
int utf8_isucs4(wchar_t ch)
{
return !(ch & (~((wchar_t)0x7FFFFFFF))) && (ch < 0xD800 || ch > 0xDFFF) && (ch != 0xFFFE) && (ch != 0xFFFF);
return !(ch & (~((wchar_t)0x7FFFFFFF)))
&& (ch < 0xD800 || ch > 0xDFFF)
&& (ch != 0xFFFE) && (ch != 0xFFFF);
}
/* Classify ch as a valid UTF-32 value: returns 1 when ch lies in
 * 0..0x10FFFF, is not a UTF-16 surrogate (0xD800..0xDFFF) and is neither
 * 0xFFFE nor 0xFFFF; returns 0 otherwise.
 */
int utf8_isutf32(wchar_t ch)
{
    /* outside the 0..0x10FFFF range */
    if(ch < 0 || ch > 0x10FFFF) {
        return 0;
    }

    /* surrogate range */
    if(ch >= 0xD800 && ch <= 0xDFFF) {
        return 0;
    }

    /* the two explicitly rejected values */
    if(ch == 0xFFFE || ch == 0xFFFF) {
        return 0;
    }

    return 1;
}
/* Classify ch as a value that fits in one UTF-16 unit: returns 1 when ch
 * lies in 0..0xFFFD and is not a surrogate (0xD800..0xDFFF); returns 0
 * otherwise.
 */
int utf8_isutf16(wchar_t ch)
{
    /* outside the 0..0xFFFD range */
    if(ch < 0 || ch > 0xFFFD) {
        return 0;
    }

    /* everything left is accepted unless it is a surrogate */
    return !(ch >= 0xD800 && ch <= 0xDFFF);
}
/* options for text editors
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
vim: expandtab:ts=4:sw=4:syntax=c.doxygen

View File

@ -5,43 +5,103 @@
* http://www.gnu.org/copyleft/gpl.html for details.
*/
/*! \defgroup ctype Character classification
This module contains functions for character classification. These are basically an extension of the
\c is* functions defined in \c &lt;ctype.h&gt;.
This module contains functions for character classification. These are
semantically equivalent to the \c is* functions defined in \c &lt;ctype.h&gt;,
except that they work on \c wchar_t UCS chars and are independent of the
system's current locale setting.
\todo There are many char classification functions that haven't been implemented yet. These won't be
implemented until they can be done in a proper, Unicode-safe fashion.
\todo There are many char classification functions that haven't been
implemented yet. These will be added on demand.
*/
*/
/*!@{*/
/*! \biref Returns \c true if \a ch can be represented in ASCII. */
/*! \brief Test if character is ASCII.
\param ch Character to test.
\retval nonzero if \a ch is ASCII.
\retval 0 if \a ch is not ASCII.
This function tests a UCS char to see if it lies within the range of characters
that can be represented by ASCII (i.e. that the value of \a ch lies between 0
and 127, inclusive).
*/
int utf8_isascii(wchar_t ch);
/*! \brief Returns \c true if \a ch is whitespace. */
/*! \brief Test if character is whitespace.
\param ch Character to test.
\retval nonzero if \a ch is whitespace.
\retval 0 if \a ch is not whitespace.
This function tests a UCS char to see if it should be classified as
\ref char_class_whitespace "whitespace".
*/
int utf8_isspace(wchar_t ch);
/*! \brief Returns \c true if \a ch is a valid UCS-4 character.
/*! \brief Test if character is valid UCS-4 codepoint.
\param ch The character to classify.
\retval true If \a ch is a valid UCS-4 character.
\retval false If \a ch is not a valid UCS-4 character.
\retval nonzero If \a ch is a valid UCS-4 character.
\retval 0 If \a ch is not a valid UCS-4 character.
This function will examine a \c wchar_t value and determine whether or not it is a valid UCS-4
character. Valid characters lie in the range 0&ndash;0x7FFFFFFF but exclude:
This function will examine a \c wchar_t value and determine whether or not it
is a valid UCS-4 character. Valid characters lie in the range
0&ndash;0x7FFFFFFF but exclude:
\li the UTF-16 surrogate code points (U+D800&ndash;U+DFFF, inclusive)
\li the invalid code points U+FFFE and U+FFFF
*/
int utf8_isvalid(wchar_t ch);
int utf8_isucs4(wchar_t ch);
/*! \brief Test if character is valid UTF-32 (Unicode) codepoint.
\param ch The character to classify.
\retval nonzero If \a ch is a valid Unicode character.
\retval 0 If \a ch is not a valid Unicode character.
This function will examine a \c wchar_t value and determine whether or not it
is a valid Unicode character. Valid characters lie in the range
0&ndash;0x10FFFF but exclude:
\li the UTF-16 surrogate code points (U+D800&ndash;U+DFFF, inclusive)
\li the invalid code points U+FFFE and U+FFFF
*/
int utf8_isutf32(wchar_t ch);
/*! \brief Test if character is a valid Unicode codepoint representable as a single UTF-16 code unit.
\param ch The character to classify.
\retval nonzero If \a ch is a valid Unicode character.
\retval 0 If \a ch is not a valid Unicode character.
This function will examine a \c wchar_t value and determine whether or not it
is a valid Unicode character that can be represented by a single UTF-16
code unit. Valid characters lie in the range 0&ndash;0xFFFD but exclude:
\li the UTF-16 surrogate code points (U+D800&ndash;U+DFFF, inclusive)
*/
int utf8_isutf16(wchar_t ch);
/*!@}*/
/* options for text editors
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
vim: expandtab:ts=4:sw=4:syntax=c.doxygen

View File

@ -11,7 +11,7 @@ char* utf8_encode_char(char* dest, size_t amt, wchar_t ch)
errno = EINVAL;
return 0;
}
if(!utf8_isvalid(ch)) {
if(!utf8_isucs4(ch)) {
errno = EILSEQ;
return 0;
}
@ -78,12 +78,12 @@ char* utf8_encode_char(char* dest, size_t amt, wchar_t ch)
char* utf8_encode_char_force(char* dest, size_t amt, wchar_t ch, wchar_t ilseq)
{
if(!utf8_isvalid(ilseq)) {
if(!utf8_isucs4(ilseq)) {
errno = EILSEQ;
return 0;
}
return utf8_encode_char(dest, amt, utf8_isvalid(ch) ? ch : ilseq);
return utf8_encode_char(dest, amt, utf8_isucs4(ch) ? ch : ilseq);
}

View File

@ -108,7 +108,7 @@ loop:
goto error;
} else {
// validate codepoint
if(!utf8_isvalid(ctx->statech)) {
if(!utf8_isucs4(ctx->statech)) {
error_type = utf8_decode_error_illegal_cp;
goto error;
}