Tidy up utf_is*() functions (ABI change)
Tidy up the formatting of the character classification functions, including improving the documentation. Furthermore, change the "isvalid" function into more general "isucs4", "isutf32" and "isutf16" functions, which is another ABI change.
This commit is contained in:
parent
dc60dcb2e6
commit
e98cbe5cc5
|
@ -0,0 +1,30 @@
|
||||||
|
/* libutf8/src/docs/MainPage.dox
|
||||||
|
*
|
||||||
|
* (c)2006-2009, Laurence Withers, <l@lwithers.me.uk>.
|
||||||
|
* Released under the GNU GPLv3. See file COPYING or
|
||||||
|
* http://www.gnu.org/copyleft/gpl.html for details.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*! \page char_class_whitespace Character classification: whitespace
|
||||||
|
|
||||||
|
From <a href='http://www.unicode.org/Public/UNIDATA/'>PropList-4.1.0.txt</a>:
|
||||||
|
|
||||||
|
<pre>0009..000D ; White_Space # Cc [5] <control-0009>..<control-000D>
|
||||||
|
0020 ; White_Space # Zs SPACE
|
||||||
|
0085 ; White_Space # Cc <control-0085>
|
||||||
|
00A0 ; White_Space # Zs NO-BREAK SPACE
|
||||||
|
1680 ; White_Space # Zs OGHAM SPACE MARK
|
||||||
|
180E ; White_Space # Zs MONGOLIAN VOWEL SEPARATOR
|
||||||
|
2000..200A ; White_Space # Zs [11] EN QUAD..HAIR SPACE
|
||||||
|
2028 ; White_Space # Zl LINE SEPARATOR
|
||||||
|
2029 ; White_Space # Zp PARAGRAPH SEPARATOR
|
||||||
|
202F ; White_Space # Zs NARROW NO-BREAK SPACE
|
||||||
|
205F ; White_Space # Zs MEDIUM MATHEMATICAL SPACE
|
||||||
|
3000 ; White_Space # Zs IDEOGRAPHIC SPACE</pre>
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* options for text editors
|
||||||
|
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||||
|
vim: expandtab:ts=4:sw=4:syntax=doxygen
|
||||||
|
*/
|
|
@ -5,6 +5,8 @@
|
||||||
* http://www.gnu.org/copyleft/gpl.html for details.
|
* http://www.gnu.org/copyleft/gpl.html for details.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
int utf8_isascii(wchar_t ch)
|
int utf8_isascii(wchar_t ch)
|
||||||
{
|
{
|
||||||
return !(ch & ~0x7F);
|
return !(ch & ~0x7F);
|
||||||
|
@ -12,22 +14,6 @@ int utf8_isascii(wchar_t ch)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/* From PropList-4.1.0.txt (http://www.unicode.org/Public/UNIDATA/)
|
|
||||||
|
|
||||||
0009..000D ; White_Space # Cc [5] <control-0009>..<control-000D>
|
|
||||||
0020 ; White_Space # Zs SPACE
|
|
||||||
0085 ; White_Space # Cc <control-0085>
|
|
||||||
00A0 ; White_Space # Zs NO-BREAK SPACE
|
|
||||||
1680 ; White_Space # Zs OGHAM SPACE MARK
|
|
||||||
180E ; White_Space # Zs MONGOLIAN VOWEL SEPARATOR
|
|
||||||
2000..200A ; White_Space # Zs [11] EN QUAD..HAIR SPACE
|
|
||||||
2028 ; White_Space # Zl LINE SEPARATOR
|
|
||||||
2029 ; White_Space # Zp PARAGRAPH SEPARATOR
|
|
||||||
202F ; White_Space # Zs NARROW NO-BREAK SPACE
|
|
||||||
205F ; White_Space # Zs MEDIUM MATHEMATICAL SPACE
|
|
||||||
3000 ; White_Space # Zs IDEOGRAPHIC SPACE
|
|
||||||
*/
|
|
||||||
|
|
||||||
int utf8_isspace(wchar_t ch)
|
int utf8_isspace(wchar_t ch)
|
||||||
{
|
{
|
||||||
return((ch >= 0x0009 && ch <= 0x000D)
|
return((ch >= 0x0009 && ch <= 0x000D)
|
||||||
|
@ -46,11 +32,32 @@ int utf8_isspace(wchar_t ch)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
int utf8_isvalid(wchar_t ch)
|
int utf8_isucs4(wchar_t ch)
|
||||||
{
|
{
|
||||||
return !(ch & (~((wchar_t)0x7FFFFFFF))) && (ch < 0xD800 || ch > 0xDFFF) && (ch != 0xFFFE) && (ch != 0xFFFF);
|
return !(ch & (~((wchar_t)0x7FFFFFFF)))
|
||||||
|
&& (ch < 0xD800 || ch > 0xDFFF)
|
||||||
|
&& (ch != 0xFFFE) && (ch != 0xFFFF);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
int utf8_isutf32(wchar_t ch)
|
||||||
|
{
|
||||||
|
return ch >= 0 && ch <= 0x10FFFF
|
||||||
|
&& (ch < 0xD800 || ch > 0xDFFF)
|
||||||
|
&& (ch != 0xFFFE) && (ch != 0xFFFF);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
int utf8_isutf16(wchar_t ch)
|
||||||
|
{
|
||||||
|
return ch >= 0 && ch <= 0xFFFD
|
||||||
|
&& (ch < 0xD800 || ch > 0xDFFF);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/* options for text editors
|
/* options for text editors
|
||||||
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||||
vim: expandtab:ts=4:sw=4:syntax=c.doxygen
|
vim: expandtab:ts=4:sw=4:syntax=c.doxygen
|
||||||
|
|
|
@ -5,43 +5,103 @@
|
||||||
* http://www.gnu.org/copyleft/gpl.html for details.
|
* http://www.gnu.org/copyleft/gpl.html for details.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*! \defgroup ctype Character classification
|
/*! \defgroup ctype Character classification
|
||||||
|
|
||||||
This module contains functions for character classification. These are basically an extension of the
|
This module contains functions for character classification. These are
|
||||||
\c is* functions defined in \c <ctype.h>.
|
semantically equivalent to the \c is* functions defined in \c <ctype.h>,
|
||||||
|
except that they work on \c wchar_t UCS chars and are independent of the
|
||||||
|
system's current locale setting.
|
||||||
|
|
||||||
\todo There are many char classification functions that haven't been implemented yet. These won't be
|
\todo There are many char classification functions that haven't been
|
||||||
implemented until they can be done in a proper, Unicode-safe fashion.
|
implemented yet. These will be added on demand.
|
||||||
|
|
||||||
*/
|
*/
|
||||||
/*!@{*/
|
/*!@{*/
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*! \biref Returns \c true if \a ch can be represented in ASCII. */
|
/*! \brief Test if character is ASCII.
|
||||||
|
|
||||||
|
\param ch Character to test.
|
||||||
|
\retval nonzero if \a ch is ASCII.
|
||||||
|
\retval 0 if \a ch is not ASCII.
|
||||||
|
|
||||||
|
This function tests a UCS char to see if it lies within the range of characters
|
||||||
|
that can be represented by ASCII (i.e. that the value of \a ch lies between 0
|
||||||
|
and 127, inclusive).
|
||||||
|
|
||||||
|
*/
|
||||||
int utf8_isascii(wchar_t ch);
|
int utf8_isascii(wchar_t ch);
|
||||||
|
|
||||||
/*! \brief Returns \c true if \a ch is whitespace. */
|
|
||||||
|
|
||||||
|
/*! \brief Test if character is whitespace.
|
||||||
|
|
||||||
|
\param ch Character to test.
|
||||||
|
\retval nonzero if \a ch is whitespace.
|
||||||
|
\retval 0 if \a ch is not whitespace.
|
||||||
|
|
||||||
|
This function tests a UCS char to see if it should be classified as
|
||||||
|
\ref char_class_whitespace "whitespace".
|
||||||
|
|
||||||
|
*/
|
||||||
int utf8_isspace(wchar_t ch);
|
int utf8_isspace(wchar_t ch);
|
||||||
|
|
||||||
/*! \brief Returns \c true if \a ch is a valid UCS-4 character.
|
|
||||||
|
|
||||||
|
/*! \brief Test if character is valid UCS-4 codepoint.
|
||||||
|
|
||||||
\param ch The character to classify.
|
\param ch The character to classify.
|
||||||
\retval true If \a ch is a valid UCS-4 character.
|
\retval nonzero If \a ch is a valid UCS-4 character.
|
||||||
\retval false If \a ch is not a valid UCS-4 character.
|
\retval 0 If \a ch is not a valid UCS-4 character.
|
||||||
|
|
||||||
This function will examine a \c wchar_t value and determine whether or not it is a valid UCS-4
|
This function will examine a \c wchar_t value and determine whether or not it
|
||||||
character. Valid characters lie in the range 0–0x7FFFFFFF but exclude:
|
is a valid UCS-4 character. Valid characters lie in the range
|
||||||
|
0–0x7FFFFFFF but exclude:
|
||||||
\li the UTF-16 surrogate code points (U+D800–U+DFFF, inclusive)
|
\li the UTF-16 surrogate code points (U+D800–U+DFFF, inclusive)
|
||||||
\li the invalid code points U+FFFE and U+FFFF
|
\li the invalid code points U+FFFE and U+FFFF
|
||||||
|
|
||||||
*/
|
*/
|
||||||
int utf8_isvalid(wchar_t ch);
|
int utf8_isucs4(wchar_t ch);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*! \brief Test if character is valid UTF-32 (Unicode) codepoint.
|
||||||
|
|
||||||
|
\param ch The character to classify.
|
||||||
|
\retval nonzero If \a ch is a valid Unicode character.
|
||||||
|
\retval 0 If \a ch is not a valid Unicode character.
|
||||||
|
|
||||||
|
This function will examine a \c wchar_t value and determine whether or not it
|
||||||
|
is a valid Unicode character. Valid characters lie in the range
|
||||||
|
0–0x10FFFF but exclude:
|
||||||
|
\li the UTF-16 surrogate code points (U+D800–U+DFFF, inclusive)
|
||||||
|
\li the invalid code points U+FFFE and U+FFFF
|
||||||
|
|
||||||
|
*/
|
||||||
|
int utf8_isutf32(wchar_t ch);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*! \brief Test if character is a Unicode codepoint that fits in a single UTF-16 code unit.
|
||||||
|
|
||||||
|
\param ch The character to classify.
|
||||||
|
\retval nonzero If \a ch is a valid Unicode character that fits in a single UTF-16 code unit.
|
||||||
|
\retval 0 If \a ch is not a valid Unicode character, or does not fit in a single UTF-16 code unit.
|
||||||
|
|
||||||
|
This function will examine a \c wchar_t value and determine whether or not it
|
||||||
|
is a valid Unicode character that can be represented by a single UTF-16
|
||||||
|
code unit. Valid characters lie in the range 0–0xFFFD but exclude:
|
||||||
|
\li the UTF-16 surrogate code points (U+D800–U+DFFF, inclusive)
|
||||||
|
|
||||||
|
*/
|
||||||
|
int utf8_isutf16(wchar_t ch);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!@}*/
|
/*!@}*/
|
||||||
|
|
||||||
/* options for text editors
|
/* options for text editors
|
||||||
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||||
vim: expandtab:ts=4:sw=4:syntax=c.doxygen
|
vim: expandtab:ts=4:sw=4:syntax=c.doxygen
|
||||||
|
|
|
@ -11,7 +11,7 @@ char* utf8_encode_char(char* dest, size_t amt, wchar_t ch)
|
||||||
errno = EINVAL;
|
errno = EINVAL;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
if(!utf8_isvalid(ch)) {
|
if(!utf8_isucs4(ch)) {
|
||||||
errno = EILSEQ;
|
errno = EILSEQ;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -78,12 +78,12 @@ char* utf8_encode_char(char* dest, size_t amt, wchar_t ch)
|
||||||
|
|
||||||
char* utf8_encode_char_force(char* dest, size_t amt, wchar_t ch, wchar_t ilseq)
|
char* utf8_encode_char_force(char* dest, size_t amt, wchar_t ch, wchar_t ilseq)
|
||||||
{
|
{
|
||||||
if(!utf8_isvalid(ilseq)) {
|
if(!utf8_isucs4(ilseq)) {
|
||||||
errno = EILSEQ;
|
errno = EILSEQ;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
return utf8_encode_char(dest, amt, utf8_isvalid(ch) ? ch : ilseq);
|
return utf8_encode_char(dest, amt, utf8_isucs4(ch) ? ch : ilseq);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -108,7 +108,7 @@ loop:
|
||||||
goto error;
|
goto error;
|
||||||
} else {
|
} else {
|
||||||
// validate codepoint
|
// validate codepoint
|
||||||
if(!utf8_isvalid(ctx->statech)) {
|
if(!utf8_isucs4(ctx->statech)) {
|
||||||
error_type = utf8_decode_error_illegal_cp;
|
error_type = utf8_decode_error_illegal_cp;
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue