Import library from old svn repository.

This commit is contained in:
Laurence Withers 2006-07-31 15:34:21 +01:00
parent f9c0e57470
commit ea1c53e43f
36 changed files with 2248 additions and 1 deletions

6
README
View File

@ -10,5 +10,9 @@ Really Quick Instructions
To build: ./make.sh
To install: ./make.sh install
(you might want to set PREFIX, by default it's /usr/local)
Documentation is automatically built using doxygen.
@TODO@
Project Homepage
----------------
http://www.lwithers.me.uk/projects/libutf8/

1
src/docs/.params Normal file
View File

@ -0,0 +1 @@
doxygen docs docs

146
src/docs/Doxyfile.in Normal file
View File

@ -0,0 +1,146 @@
# libutf8/src/docs/Doxyfile.in
#
# (c)2006, Laurence Withers, <l@lwithers.me.uk>.
# Released under the GNU GPLv2. See file COPYING or
# http://www.gnu.org/copyleft/gpl.html for details.
#
PROJECT_NAME = libutf8
OUTPUT_DIRECTORY =
CREATE_SUBDIRS = NO
OUTPUT_LANGUAGE = English
USE_WINDOWS_ENCODING = NO
BRIEF_MEMBER_DESC = YES
REPEAT_BRIEF = YES
ABBREVIATE_BRIEF =
ALWAYS_DETAILED_SEC = NO
INLINE_INHERITED_MEMB = YES
FULL_PATH_NAMES = NO
STRIP_FROM_PATH =
STRIP_FROM_INC_PATH =
SHORT_NAMES = NO
JAVADOC_AUTOBRIEF = NO
MULTILINE_CPP_IS_BRIEF = YES
DETAILS_AT_TOP = YES
INHERIT_DOCS = YES
DISTRIBUTE_GROUP_DOC = NO
TAB_SIZE = 4
ALIASES =
OPTIMIZE_OUTPUT_FOR_C = NO
OPTIMIZE_OUTPUT_JAVA = NO
SUBGROUPING = YES
EXTRACT_ALL = NO
EXTRACT_PRIVATE = NO
EXTRACT_STATIC = NO
EXTRACT_LOCAL_CLASSES = NO
EXTRACT_LOCAL_METHODS = NO
HIDE_UNDOC_MEMBERS = NO
HIDE_UNDOC_CLASSES = NO
HIDE_FRIEND_COMPOUNDS = YES
HIDE_IN_BODY_DOCS = NO
INTERNAL_DOCS = NO
CASE_SENSE_NAMES = YES
HIDE_SCOPE_NAMES = NO
SHOW_INCLUDE_FILES = NO
INLINE_INFO = YES
SORT_MEMBER_DOCS = YES
SORT_BRIEF_DOCS = NO
SORT_BY_SCOPE_NAME = NO
GENERATE_TODOLIST = YES
GENERATE_TESTLIST = YES
GENERATE_BUGLIST = YES
GENERATE_DEPRECATEDLIST= YES
ENABLED_SECTIONS =
MAX_INITIALIZER_LINES = 30
SHOW_USED_FILES = NO
SHOW_DIRECTORIES = NO
FILE_VERSION_FILTER =
QUIET = YES
WARNINGS = YES
WARN_IF_UNDOCUMENTED = YES
WARN_IF_DOC_ERROR = YES
WARN_NO_PARAMDOC = YES
WARN_FORMAT = "$file:$line: $text"
WARN_LOGFILE =
FILE_PATTERNS =
RECURSIVE = NO
EXCLUDE =
EXCLUDE_SYMLINKS = NO
EXCLUDE_PATTERNS =
EXAMPLE_PATH =
EXAMPLE_PATTERNS =
EXAMPLE_RECURSIVE = NO
IMAGE_PATH = src/docs
INPUT_FILTER =
FILTER_PATTERNS =
FILTER_SOURCE_FILES = NO
SOURCE_BROWSER = NO
INLINE_SOURCES = NO
STRIP_CODE_COMMENTS = YES
REFERENCED_BY_RELATION = YES
REFERENCES_RELATION = YES
VERBATIM_HEADERS = NO
ALPHABETICAL_INDEX = YES
COLS_IN_ALPHA_INDEX = 5
IGNORE_PREFIX =
GENERATE_HTML = YES
HTML_OUTPUT = html
HTML_FILE_EXTENSION = .html
HTML_HEADER =
HTML_FOOTER =
HTML_STYLESHEET =
HTML_ALIGN_MEMBERS = YES
GENERATE_HTMLHELP = NO
CHM_FILE =
HHC_LOCATION =
GENERATE_CHI = NO
BINARY_TOC = NO
TOC_EXPAND = NO
DISABLE_INDEX = NO
ENUM_VALUES_PER_LINE = 4
GENERATE_TREEVIEW = NO
TREEVIEW_WIDTH = 250
GENERATE_LATEX = NO
GENERATE_RTF = NO
GENERATE_MAN = NO
GENERATE_XML = NO
GENERATE_AUTOGEN_DEF = NO
GENERATE_PERLMOD = NO
ENABLE_PREPROCESSING = YES
MACRO_EXPANSION = NO
EXPAND_ONLY_PREDEF = NO
SEARCH_INCLUDES = YES
INCLUDE_PATH =
INCLUDE_FILE_PATTERNS =
PREDEFINED = DOXYGEN
EXPAND_AS_DEFINED =
SKIP_FUNCTION_MACROS = YES
TAGFILES =
GENERATE_TAGFILE =
ALLEXTERNALS = NO
EXTERNAL_GROUPS = YES
PERL_PATH = /usr/bin/perl
CLASS_DIAGRAMS = YES
HIDE_UNDOC_RELATIONS = YES
HAVE_DOT = YES
CLASS_GRAPH = YES
COLLABORATION_GRAPH = YES
GROUP_GRAPHS = NO
UML_LOOK = NO
TEMPLATE_RELATIONS = NO
INCLUDE_GRAPH = NO
INCLUDED_BY_GRAPH = NO
CALL_GRAPH = NO
GRAPHICAL_HIERARCHY = YES
DIRECTORY_GRAPH = NO
DOT_IMAGE_FORMAT = png
DOT_PATH =
DOTFILE_DIRS =
MAX_DOT_GRAPH_WIDTH = 1024
MAX_DOT_GRAPH_HEIGHT = 1024
MAX_DOT_GRAPH_DEPTH = 0
DOT_TRANSPARENT = YES
DOT_MULTI_TARGETS = YES
GENERATE_LEGEND = YES
DOT_CLEANUP = YES
SEARCHENGINE = NO

19
src/docs/MainPage.dox Normal file
View File

@ -0,0 +1,19 @@
/* libutf8/src/docs/MainPage.dox
*
* (c)2006, Laurence Withers, <l@lwithers.me.uk>.
* Released under the GNU GPLv2. See file COPYING or
* http://www.gnu.org/copyleft/gpl.html for details.
*/
/*! \mainpage
\c libutf8 provides a C API for encoding and decoding UTF-8. It uses the C type \c wchar_t as its
internal character representation. \c libutf8 is a "safe" decoder &mdash; it will not accept
overlong byte sequences.
*/
/* options for text editors
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
vim: expandtab:ts=4:sw=4
*/

1
src/docs/build.default Normal file
View File

@ -0,0 +1 @@
source src/docs/build.docs

43
src/docs/build.docs Normal file
View File

@ -0,0 +1,43 @@
# Build the Doxygen documentation (sourced by the build system).
#
# These are external variables, and shouldn't clash with anything else
#  docs_BUILT
#

# Add the hand-written .dox pages to whatever the library fragments queued up.
MONOLITHIC_DOC="${MONOLITHIC_DOC} $(echo src/docs/*.dox)"
build_target monolithic

# Quoted so the test is well-formed even when docs_BUILT is unset/empty.
if [ -z "${docs_BUILT}" ]
then
    echo "Building documentation with Doxygen..."

    # The Doxyfile is generated once; INPUT and PROJECT_NUMBER are appended to
    # the template at that point and are not refreshed on later runs.
    DOXYFILE=obj/Doxyfile.docs
    if [ ! -e "${DOXYFILE}" ]
    then
        do_cmd cp src/docs/Doxyfile.in "${DOXYFILE}" || return 1
        echo "INPUT = ${MONOLITHIC_DOC}" >> "${DOXYFILE}"
        echo "PROJECT_NUMBER = ${VERSION}" >> "${DOXYFILE}"
    fi

    # Rebuild only if some input is newer than the generated index page.
    # MONOLITHIC_DOC is deliberately unquoted: it is a space-separated list.
    MODIFIED=0
    for file in ${MONOLITHIC_DOC}
    do
        if [ "${file}" -nt html/index.html ]
        then
            MODIFIED=1
            break
        fi
    done

    if [ "${MODIFIED}" -ne 0 ]
    then
        do_cmd doxygen "${DOXYFILE}" || return 1
        print_success "Documentation built"
    else
        print_success "Documentation is up to date"
    fi

    docs_BUILT=1
fi
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
# vim: expandtab:ts=4:sw=4

1
src/docs/build.install Normal file
View File

@ -0,0 +1 @@
source src/docs/build.install-docs

View File

@ -0,0 +1,21 @@
# Install the generated HTML documentation and the generic text files into
# ${DOCSDIR} (sourced by the build system).
build_target docs

# create documentation directories
echo "Installing documentation into ${DOCSDIR}"
build_dir_tree "${DOCSDIR}/html" || return 1

# copy across the Doxygen-generated documentation
# (paths are quoted consistently so a DOCSDIR containing spaces still works)
for file in html/*
do
    install_file "${file}" "${DOCSDIR}/html" 0644 || return 1
done

# copy across the generic files
for file in COPYING README
do
    install_file "${file}" "${DOCSDIR}" 0644 || return 1
done

print_success "Documentation installed"
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
# vim: expandtab:ts=4:sw=4

1
src/libutf8/.params Normal file
View File

@ -0,0 +1 @@
c lib libutf8 utf8.h

View File

@ -0,0 +1,11 @@
/* libutf8/src/lib/BottomHeader.h
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,14 @@
/* libutf8/src/lib/ForwardDeclare.h
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
// This file simply contains forward declarations of all libutf8
// classes, to facilitate header ordering, etc.
// encode_state.h
struct utf8_encode_state;
// decode_state.h
struct utf8_decode_state;

16
src/libutf8/TopHeader.h Normal file
View File

@ -0,0 +1,16 @@
/* libutf8/src/lib/TopHeader.h
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
#ifndef HEADER_LIBUTF8
#define HEADER_LIBUTF8
// standard includes, or includes needed for type declarations
#include <stdbool.h>
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif

13
src/libutf8/TopSource.c Normal file
View File

@ -0,0 +1,13 @@
/* libutf8/src/libutf8/TopSource.c
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
#include "utf8.h"
// Below are all the includes used throughout the library.
#include <errno.h>
#include <stdint.h>
#include <string.h>

View File

@ -0,0 +1 @@
source src/libutf8/build.lib

View File

@ -0,0 +1 @@
source src/libutf8/build.install-lib

View File

@ -0,0 +1,36 @@
# Install the shared library, its version symlinks, the public header and the
# pkg-config file (sourced by the build system).
build_target libutf8

# make paths (this is for Gentoo in particular)
build_dir_tree "${LIBDIR}" || return 1
build_dir_tree "${PKGCONFDIR}" || return 1
build_dir_tree "${INCLUDEDIR}" || return 1

# install library
echo "Installing libraries into '${LIBDIR}'"
install_file "${libutf8}" "${LIBDIR}" 0755 || return 1

# Names of the symlink chain, from least to most specific.
BASE="${libutf8_BASE}.so"
MAJOR="${BASE}.${SOMAJOR}"
MINOR="${MAJOR}.${SOMINOR}"
MICRO="${MINOR}.${SOMICRO}"
# NOTE(review): the results of install_symlink are not checked, unlike the
# other install steps -- confirm whether that is intentional.
install_symlink "${MINOR}" "${MICRO}" "${LIBDIR}"
install_symlink "${MAJOR}" "${MINOR}" "${LIBDIR}"
install_symlink "${BASE}" "${MAJOR}" "${LIBDIR}"

# install header
echo "Installing header file '${libutf8_HEADER}' into ${INCLUDEDIR}"
install_header "${libutf8_HEADER}" "${INCLUDEDIR}" 0644 || return 1

# install pkgconfig file
echo "Installing package config file into ${PKGCONFDIR}"
PKGCONFFILE="${PKGCONFDIR}/libutf8.pc"
do_cmd rm -f "${PKGCONFFILE}"
# Fill in the template placeholders; ',' is the sed delimiter so that the
# substituted paths may contain '/'.
do_cmd_redir "${PKGCONFFILE}" sed \
    -e "s,@VERSION@,${VERSION}," \
    -e "s,@LIBDIR@,${FINALLIBDIR}," \
    -e "s,@INCLUDEDIR@,${FINALINCLUDEDIR}," \
    src/libutf8/pkgconf.in
do_cmd chmod 0644 "${PKGCONFFILE}"

print_success "Done"
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
# vim: expandtab:ts=4:sw=4

51
src/libutf8/build.lib Normal file
View File

@ -0,0 +1,51 @@
# Build the libutf8 shared library from the monolithic source
# (sourced by the build system).
#
# These are external variables, and shouldn't clash with anything else
#  libutf8
#  libutf8_BUILT
#  libutf8_HEADER
#  libutf8_BASE

# Quoted so the test is well-formed even when libutf8_BUILT is unset/empty.
if [ -z "${libutf8_BUILT}" ]
then
    libutf8_BASE=libutf8
    source src/libutf8/soversion

    libutf8="obj/${libutf8_BASE}.so.${SOMAJOR}.${SOMINOR}.${SOMICRO}"
    SO_EXTRA="-lc"

    echo "Building library ${libutf8}..."

    # Sets SRC, HDR and MONOLITHIC_TESTS, and generates the monolithic files.
    do_cmd source src/libutf8/build.monolithic || return 1

    # Relink only if a build script or a generated file is newer than the
    # library. The list is deliberately unquoted: it is space-separated.
    MODIFIED=0
    for test in ${MONOLITHIC_TESTS} ${HDR} ${SRC}
    do
        if [ "${test}" -nt "${libutf8}" ]
        then
            MODIFIED=1
            break
        fi
    done

    if [ "${MODIFIED}" -ne 0 ]
    then
        echo " Compiling"

        SONAME="${libutf8_BASE}.so.${SOMAJOR}.${SOMINOR}"
        # CC, CFLAGS, SRC and SO_EXTRA stay unquoted so they split into words.
        do_cmd ${CC} ${CFLAGS} -shared -fpic -o "${libutf8}" \
            "-Wl,-soname,${SONAME}" \
            ${SRC} ${SO_EXTRA} || return 1

        # make tests work
        do_cmd ln -sf "$(basename "${libutf8}")" "obj/${SONAME}" || return 1

        print_success "Library built"
    else
        print_success "Library up to date"
    fi

    libutf8_BUILT=1
    libutf8_HEADER=${HDR}
fi
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
# vim: expandtab:ts=4:sw=4

View File

@ -0,0 +1,21 @@
# Generate the single-file header and source that the library is built from
# (sourced by build.lib).
#
# These are external variables, and shouldn't clash with anything else
#  libutf8_MONOLITHIC

SRC="obj/libutf8.c"
HDR="obj/utf8.h"
MONOLITHIC_TESTS="src/libutf8/build.lib src/libutf8/build.monolithic"

# Only generate once per build run.
if test -z "${libutf8_MONOLITHIC}"
then
    # Header fragments, in concatenation order (TopHeader opens the include
    # guard, BottomHeader closes it).
    MONOLITHIC_SOURCE="src/libutf8/TopHeader.h src/libutf8/ForwardDeclare.h src/libutf8/ctype.h src/libutf8/decode.h src/libutf8/decode_state.h src/libutf8/encode.h src/libutf8/encode_state.h src/libutf8/BottomHeader.h"
    make_monolithic ${HDR} C || return 1

    # Source fragments, in concatenation order (TopSource carries the includes).
    MONOLITHIC_SOURCE="src/libutf8/TopSource.c src/libutf8/ctype.c src/libutf8/decode.c src/libutf8/decode_state.c src/libutf8/encode.c src/libutf8/encode_state.c"
    make_monolithic ${SRC} C || return 1

    libutf8_MONOLITHIC=1
    # Queue the generated header as an input for the documentation build.
    MONOLITHIC_DOC="${MONOLITHIC_DOC} ${HDR}"
fi
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
# vim: expandtab:ts=4:sw=4

55
src/libutf8/ctype.c Normal file
View File

@ -0,0 +1,55 @@
/* libutf8/src/lib/ctype.c
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
/* Report whether ch fits in 7-bit ASCII, i.e. has no bits set above 0x7F. */
bool utf8_isascii(wchar_t ch)
{
    const wchar_t high_bits = ch & ~0x7F;

    return high_bits == 0;
}
/* From PropList-4.1.0.txt (http://www.unicode.org/Public/UNIDATA/)
0009..000D ; White_Space # Cc [5] <control-0009>..<control-000D>
0020 ; White_Space # Zs SPACE
0085 ; White_Space # Cc <control-0085>
00A0 ; White_Space # Zs NO-BREAK SPACE
1680 ; White_Space # Zs OGHAM SPACE MARK
180E ; White_Space # Zs MONGOLIAN VOWEL SEPARATOR
2000..200A ; White_Space # Zs [11] EN QUAD..HAIR SPACE
2028 ; White_Space # Zl LINE SEPARATOR
2029 ; White_Space # Zp PARAGRAPH SEPARATOR
202F ; White_Space # Zs NARROW NO-BREAK SPACE
205F ; White_Space # Zs MEDIUM MATHEMATICAL SPACE
3000 ; White_Space # Zs IDEOGRAPHIC SPACE
*/
/* Report whether ch is one of the White_Space code points listed in the
 * PropList-4.1.0 table quoted above. */
bool utf8_isspace(wchar_t ch)
{
    switch(ch) {
    /* 0009..000D: control characters */
    case 0x0009: case 0x000A: case 0x000B: case 0x000C: case 0x000D:
    case 0x0020: /* SPACE */
    case 0x0085: /* <control-0085> */
    case 0x00A0: /* NO-BREAK SPACE */
    case 0x1680: /* OGHAM SPACE MARK */
    case 0x180E: /* MONGOLIAN VOWEL SEPARATOR */
    /* 2000..200A: EN QUAD..HAIR SPACE */
    case 0x2000: case 0x2001: case 0x2002: case 0x2003: case 0x2004:
    case 0x2005: case 0x2006: case 0x2007: case 0x2008: case 0x2009:
    case 0x200A:
    case 0x2028: /* LINE SEPARATOR */
    case 0x2029: /* PARAGRAPH SEPARATOR */
    case 0x202F: /* NARROW NO-BREAK SPACE */
    case 0x205F: /* MEDIUM MATHEMATICAL SPACE */
    case 0x3000: /* IDEOGRAPHIC SPACE */
        return true;

    default:
        return false;
    }
}
/* Report whether ch is an acceptable UCS-4 value: within 0..0x7FFFFFFF and
 * neither a UTF-16 surrogate nor one of the code points U+FFFE / U+FFFF. */
bool utf8_isvalid(wchar_t ch)
{
    /* any bit set outside the low 31 bits */
    if(ch & (~((wchar_t)0x7FFFFFFF))) {
        return false;
    }

    /* UTF-16 surrogate range */
    if(ch >= 0xD800 && ch <= 0xDFFF) {
        return false;
    }

    if(ch == 0xFFFE || ch == 0xFFFF) {
        return false;
    }

    return true;
}
/* options for text editors
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
*/

46
src/libutf8/ctype.h Normal file
View File

@ -0,0 +1,46 @@
/* libutf8/src/lib/ctype.h
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
/*! \defgroup ctype Character classification
This module contains functions for character classification. These are basically an extension of the
\c is* functions defined in \c &lt;ctype.h&gt;.
\todo There are many char classification functions that haven't been implemented yet. These won't be
implemented until they can be done in a proper, Unicode-safe fashion.
*/
/*!@{*/
/// Returns \c true if \a ch can be represented in ASCII.
bool utf8_isascii(wchar_t ch);
/// Returns \c true if \a ch is whitespace.
bool utf8_isspace(wchar_t ch);
/*! \brief Returns \c true if \a ch is a valid UCS-4 character.
\param ch The character to classify.
\retval true If \a ch is a valid UCS-4 character.
\retval false If \a ch is not a valid UCS-4 character.
This function will examine a \c wchar_t value and determine whether or not it is a valid UCS-4
character. Valid characters lie in the range 0&ndash;0x7FFFFFFF but exclude:
\li the UTF-16 surrogate code points (U+D800&ndash;U+DFFF, inclusive)
\li the invalid code points U+FFFE and U+FFFF
*/
bool utf8_isvalid(wchar_t ch);
/*!@}*/
/* options for text editors
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
*/

234
src/libutf8/decode.c Normal file
View File

@ -0,0 +1,234 @@
/* libutf8/src/lib/decode.c
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
/* Decode one character from src without a caller-supplied size limit.
 * Delegates to utf8_decode_char2() with a limit of 6 bytes, the longest
 * sequence form that function recognises. */
wchar_t utf8_decode_char(const char* src, size_t* used)
{
    const size_t longest_sequence = 6;

    return utf8_decode_char2(src, longest_sequence, used);
}
/* Decode a single UTF-8 sequence from a buffer of known size.
 *
 *  src   Start of the encoded data.
 *  size  Number of bytes available at src.
 *  used  If not null, receives the length implied by the lead byte (1 for
 *        ASCII or for an unrecognised lead byte). It is written before the
 *        continuation bytes are validated.
 *
 * Returns the decoded character. On failure returns (wchar_t)(-1), as
 * documented in the header, and sets errno:
 *  EINVAL  src is null or size is 0.
 *  EILSEQ  bad lead byte, missing/invalid continuation byte, input truncated
 *          by size, or an overlong encoding.
 */
wchar_t utf8_decode_char2(const char* src, size_t size, size_t* used)
{
    uint8_t ch;
    wchar_t ret, min;
    int remain;

    if(!src || !size) {
        errno = EINVAL;
        return (wchar_t)(-1);
    }

    if(used) *used = 1;
    ch = *src++;
    if(!(ch & 0x80)) {
        /* plain ASCII */
        return ch;
    }

    /* Classify the lead byte: number of continuation bytes to follow, the
     * payload bits it carries, and the smallest value that legitimately
     * needs a sequence of this length. */
    if((ch & 0xE0) == 0xC0) {
        min = 0x80;
        remain = 1;
        if(used) *used = 2;
        ret = ch & 0x1F;
    } else if((ch & 0xF0) == 0xE0) {
        min = 0x800;
        remain = 2;
        if(used) *used = 3;
        ret = ch & 0x0F;
    } else if((ch & 0xF8) == 0xF0) {
        min = 0x10000;
        remain = 3;
        if(used) *used = 4;
        ret = ch & 0x07;
    } else if((ch & 0xFC) == 0xF8) {
        min = 0x200000;
        remain = 4;
        if(used) *used = 5;
        ret = ch & 0x03;
    } else if((ch & 0xFE) == 0xFC) {
        min = 0x4000000;
        remain = 5;
        if(used) *used = 6;
        ret = ch & 0x01;
    } else {
        /* lone continuation byte, or 0xFE / 0xFF */
        errno = EILSEQ;
        return (wchar_t)(-1);
    }

    while(remain--) {
        if(!--size) {
            /* sequence runs past the end of the buffer */
            errno = EILSEQ;
            return (wchar_t)(-1);
        }

        ch = *src++;
        if((ch & 0xC0) != 0x80) {
            errno = EILSEQ;
            return (wchar_t)(-1);
        }

        ret <<= 6;
        ret |= ch & 0x3F;
    }

    /* Overlong check: compare the decoded value (not the last byte read)
     * against the minimum for this sequence length. */
    if(ret < min) {
        errno = EILSEQ;
        return (wchar_t)(-1);
    }

    return ret;
}
/* Decode one character from src without a caller-supplied size limit,
 * substituting ilseq for illegal sequences. Delegates to
 * utf8_decode_char2_force() with a limit of 6 bytes, the longest sequence
 * form that function recognises. */
wchar_t utf8_decode_char_force(const char* src, size_t* used, wchar_t ilseq)
{
    const size_t longest_sequence = 6;

    return utf8_decode_char2_force(src, longest_sequence, used, ilseq);
}
/* Decode a single UTF-8 sequence from a buffer of known size, substituting
 * ilseq for anything that is not a well-formed, shortest-form sequence.
 *
 *  src    Start of the encoded data.
 *  size   Number of bytes available at src.
 *  used   If not null, receives the number of bytes consumed.
 *  ilseq  Value returned in place of an illegal sequence.
 *
 * Returns the decoded character, or ilseq for an illegal sequence. After an
 * illegal sequence the consumed count also covers any continuation bytes
 * that immediately follow, so the next call starts on a byte that is not a
 * continuation byte. A byte that interrupts a sequence (i.e. is not a
 * continuation byte) is left unconsumed. No byte at or beyond src + size is
 * read.
 *
 * Fails only for a null src or a size of 0: returns (wchar_t)(-1), as
 * documented in the header, with errno set to EINVAL; *used is not written.
 */
wchar_t utf8_decode_char2_force(const char* src, size_t size, size_t* used, wchar_t ilseq)
{
    const unsigned char* bytes = (const unsigned char*)src;
    size_t consumed = 1; /* the lead byte is always consumed */
    uint8_t ch;
    wchar_t ret, min;
    int remain;

    if(!src || !size) {
        errno = EINVAL;
        return (wchar_t)(-1);
    }

    ch = bytes[0];
    if(!(ch & 0x80)) {
        /* plain ASCII */
        if(used) *used = 1;
        return ch;
    }

    /* Classify the lead byte: number of continuation bytes to follow, the
     * payload bits it carries, and the smallest value that legitimately
     * needs a sequence of this length. */
    if((ch & 0xE0) == 0xC0) {
        min = 0x80;
        remain = 1;
        ret = ch & 0x1F;
    } else if((ch & 0xF0) == 0xE0) {
        min = 0x800;
        remain = 2;
        ret = ch & 0x0F;
    } else if((ch & 0xF8) == 0xF0) {
        min = 0x10000;
        remain = 3;
        ret = ch & 0x07;
    } else if((ch & 0xFC) == 0xF8) {
        min = 0x200000;
        remain = 4;
        ret = ch & 0x03;
    } else if((ch & 0xFE) == 0xFC) {
        min = 0x4000000;
        remain = 5;
        ret = ch & 0x01;
    } else {
        /* lone continuation byte, or 0xFE / 0xFF */
        goto ILSEQ;
    }

    while(remain--) {
        /* sequence runs past the end of the buffer */
        if(consumed == size) goto ILSEQ;

        /* peek first: a non-continuation byte belongs to the next character */
        ch = bytes[consumed];
        if((ch & 0xC0) != 0x80) goto ILSEQ;
        ++consumed;

        ret <<= 6;
        ret |= ch & 0x3F;
    }

    /* Overlong check: compare the decoded value (not the last byte read)
     * against the minimum for this sequence length. */
    if(ret < min) goto ILSEQ;

    if(used) *used = consumed;
    return ret;

ILSEQ:
    /* Advance to the next character boundary by swallowing stray
     * continuation bytes. The size test comes first so the buffer is never
     * over-read; a NUL byte is not a continuation byte and also stops the
     * scan. */
    while(consumed < size && (bytes[consumed] & 0xC0) == 0x80) {
        ++consumed;
    }
    if(used) *used = consumed;
    return ilseq;
}
wchar_t* utf8_decode(wchar_t* dest, size_t size, const char* src)
{
struct utf8_decode_state ctx;
memset(&ctx, 0, sizeof(ctx));
ctx.rd = src;
ctx.rd_remain = -1;
ctx.wr = dest;
ctx.wr_size = size;
if(!utf8_decoder(&ctx)) return 0;
if(*ctx.rd) {
errno = ENOMEM;
return 0;
}
return dest;
}
/* Decode exactly amt bytes of UTF-8 from src into dest, which has room for
 * size wide characters (including the terminating NUL the decoder appends).
 *
 *  written  If not null, receives the number of characters written,
 *           excluding the terminating NUL (set on success only).
 *
 * Returns dest on success, including for amt == 0 (empty output). Returns 0
 * on failure:
 *  - utf8_decoder() failed (errno as set by it);
 *  - ENOMEM: the output buffer filled before all input was consumed;
 *  - EILSEQ: the input ends part-way through a multi-byte sequence.
 *
 * NOTE(review): amt is stored in an int field; a value above INT_MAX would
 * not be represented correctly -- confirm callers never pass one.
 */
wchar_t* utf8_decode2(wchar_t* dest, size_t size, size_t* written, const char* src, size_t amt)
{
    struct utf8_decode_state ctx;

    memset(&ctx, 0, sizeof(ctx));
    /* Nothing is pending before any input is read; without this an empty
     * input would be misreported as an incomplete character. The decoder
     * clears the flag itself when it starts a multi-byte sequence. */
    ctx.complete = true;
    ctx.rd = src;
    ctx.rd_remain = amt;
    ctx.wr = dest;
    ctx.wr_size = size;

    if(!utf8_decoder(&ctx)) return 0;

    if(ctx.rd_remain) {
        /* unread input: the destination buffer is too small */
        errno = ENOMEM;
        return 0;
    }
    if(!ctx.complete) {
        /* all input consumed but the last character is truncated */
        errno = EILSEQ;
        return 0;
    }

    if(written) *written = ctx.written;
    return dest;
}
/* Decode the null-terminated UTF-8 string src into dest (room for size wide
 * characters), replacing illegal sequences via
 * utf8_decode_error_callback_replace.
 *
 * Returns dest, or 0 if utf8_decoder() fails (errno as set by it). As
 * documented in the header, and like utf8_decode_force2(), output that does
 * not fit is truncated rather than reported as an error.
 */
wchar_t* utf8_decode_force(wchar_t* dest, size_t size, const char* src)
{
    struct utf8_decode_state ctx;

    memset(&ctx, 0, sizeof(ctx));
    ctx.rd = src;
    ctx.rd_remain = -1; /* negative: input is null-terminated */
    ctx.wr = dest;
    ctx.wr_size = size;
    ctx.error_callback = utf8_decode_error_callback_replace;

    if(!utf8_decoder(&ctx)) return 0;

    /* Unread input here only means the buffer filled up; the decoder has
     * already null-terminated what fitted, so this is not a failure. */
    return dest;
}
wchar_t* utf8_decode_force2(wchar_t* dest, size_t size, size_t* written, const char* src, size_t amt)
{
struct utf8_decode_state ctx;
memset(&ctx, 0, sizeof(ctx));
ctx.rd = src;
ctx.rd_remain = amt;
ctx.wr = dest;
ctx.wr_size = size;
ctx.error_callback = utf8_decode_error_callback_replace;
if(!utf8_decoder(&ctx)) return 0;
if(written) *written = ctx.written;
return dest;
}

187
src/libutf8/decode.h Normal file
View File

@ -0,0 +1,187 @@
/* libutf8/src/lib/decode.h
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
/*! \defgroup decode UTF-8 decoding routines.
These routines decode UTF-8 data into C's wide character type \c wchar_t. Errors are reported
through \c errno, with the following errors being of particular interest:
\li \c EINVAL - invalid argument to function
\li \c EILSEQ - illegal encoding (i.e. not UTF-8 or encoding error)
\li \c ENOMEM - not enough space in destination buffer
As a special case, functions which return a character may return the \c wchar_t representation of
-1 to signify an error. This wording is used to take into account the fact that the \c wchar_t type
could be unsigned.
*/
/*!@{*/
/*! \brief Decode a character.
\param src Pointer to start of source data.
\param used If not null, set to the number of bytes used.
\retval (wchar_t)(-1) on error (see \c errno).
\returns Decoded character.
Decodes a single character, returning the \c wchar_t representation of -1 on error and setting
\c errno appropriately. If \a used is not NULL, it is set to the number of bytes used. Passing
a null pointer for \a src will result in \c EINVAL. If the UTF-8 byte sequence is corrupt, \c errno
will be set to \c EILSEQ.
\warning Only use this function if you are sure it cannot read past the end of your buffer. See
utf8_decode_char2() for a safe version.
*/
wchar_t utf8_decode_char(const char* src, size_t* used);
/*! \brief Decode a character, discarding illegal sequences.
\param src Pointer to start of source data.
\param used If not null, set to the number of bytes used.
\param ilseq This value is returned if the UTF-8 byte sequence is invalid. Recommended is the
Unicode replacement character, \c 0xFFFD.
\retval (wchar_t)(-1) on error (see \c errno).
\retval ilseq If an illegal sequence is encountered.
\returns Decoded character.
\post \a *used will be set to the number of bytes consumed.
Decodes a single character, returning the \c wchar_t representation of -1 on error and setting
\c errno appropriately. If \a used is not NULL, it is set to the number of bytes used. Passing
a null pointer for \a src will result in \c EINVAL. If the UTF-8 byte sequence is corrupt, \a ilseq
will be returned and the buffer advanced to the next valid character. This means the function can
only fail if you pass it an invalid \a src pointer.
\warning Only use this function if you are sure it cannot read past the end of your buffer. See
utf8_decode_char2_force() for a safe version.
*/
wchar_t utf8_decode_char_force(const char* src, size_t* used, wchar_t ilseq);
/*! \brief Decode a character, given source buffer size.
\param src Pointer to start of source data.
\param size Size of source data in bytes.
\param used If not null, set to the number of bytes used.
\retval (wchar_t)(-1) on error (see \c errno).
\returns Decoded character.
Decodes a single character, returning the \c wchar_t representation of -1 on error and setting
\c errno appropriately. If \a used is not NULL, it is set to the number of bytes used.
*/
wchar_t utf8_decode_char2(const char* src, size_t size, size_t* used);
/*! \brief Decode a character, discarding illegal sequences and given source buffer size.
\param src Pointer to start of source data.
\param size Size of source data in bytes.
\param used If not null, set to the number of bytes used.
\param ilseq This value is returned if the UTF-8 byte sequence is invalid. Recommended is the
Unicode replacement character, \c 0xFFFD.
\retval (wchar_t)(-1) on error (see \c errno).
\retval ilseq If an illegal sequence is encountered.
\returns Decoded character.
\post \a *used will be set to the number of bytes consumed.
Decodes a single character, returning the \c wchar_t representation of -1 on error and setting
\c errno appropriately. If \a used is not NULL, it is set to the number of bytes used. Passing
a null pointer for \a src will result in \c EINVAL. If the UTF-8 byte sequence is corrupt, \a ilseq
will be returned and the buffer advanced to the next valid character. This means the function can
only fail if you pass it an invalid \a src pointer, or a \a size of 0.
*/
wchar_t utf8_decode_char2_force(const char* src, size_t size, size_t* used, wchar_t ilseq);
/*! \brief Decode a null-terminated string.
\param dest The output destination.
\param size The number of characters that can be stored in \a dest.
\param src Pointer to the null-terminated source data.
\returns Pointer to the output destination.
\retval 0 on error (see \c errno).
This function will attempt to decode a null-terminated UTF-8 string. It returns 0 on error and sets
\c errno appropriately.
*/
wchar_t* utf8_decode(wchar_t* dest, size_t size, const char* src);
/*! \brief Decode a fixed-size string.
\param dest The output destination.
\param size The number of characters that can be stored in \a dest.
\param written If not null, set to the number of characters written (excluding NUL).
\param src Pointer to the source data (need not be null-terminated).
\param amt Number of bytes to decode.
\returns Pointer to the output destination.
\retval 0 on error (see \c errno).
This function will attempt to decode a fixed-size UTF-8 string. It returns 0 on error and sets
\c errno appropriately. It will happily transcode ASCII NUL characters. If \a written is not null,
it is set to the number of characters written excluding the terminating NUL. This function always
produces null-terminated strings.
*/
wchar_t* utf8_decode2(wchar_t* dest, size_t size, size_t* written, const char* src, size_t amt);
/*! \brief Decode a null-terminated string, ignoring errors.
\param dest The output destination.
\param size The number of characters that can be stored in \a dest.
\param src Pointer to the null-terminated source data.
\returns Pointer to the output destination.
\retval 0 on error (see \c errno).
This function will attempt to decode a null-terminated UTF-8 string. It returns 0 on error and sets
\c errno appropriately.
This function will truncate the output if there is not enough space and will skip characters it
cannot decode. It can only fail if you pass it invalid parameters.
*/
wchar_t* utf8_decode_force(wchar_t* dest, size_t size, const char* src);
/*! \brief Decode a fixed-size string, ignoring errors.
\param dest The output destination.
\param size The number of characters that can be stored in \a dest.
\param written If not null, set to the number of characters written (excluding NUL).
\param src Pointer to the source data (need not be null-terminated).
\param amt Number of bytes to decode.
\returns Pointer to the output destination.
\retval 0 on error (see \c errno).
This function will attempt to decode a fixed-size UTF-8 string. It returns 0 on error and sets
\c errno appropriately. It will happily transcode ASCII NUL characters. If \a written is not null,
it is set to the number of characters written excluding the terminating NUL. This function always
produces null-terminated strings.
This function will truncate the output if there is not enough space and will skip characters it
cannot decode. It can only fail if you pass it invalid parameters.
*/
wchar_t* utf8_decode_force2(wchar_t* dest, size_t size, size_t* written, const char* src, size_t amt);
/*!@}*/

204
src/libutf8/decode_state.c Normal file
View File

@ -0,0 +1,204 @@
/* libutf8/src/libutf8/decode_state.c
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
/* Internal states of utf8_decoder(), stored in utf8_decode_state::state.
 *
 * The numeric order is significant: the decoder decrements the state after
 * each continuation byte, so utf8_state_multibyteN means N continuation
 * bytes are still expected and the count-down ends at utf8_state_none (0,
 * which is also what a zero-initialised structure starts in).
 */
enum utf8_decoder_state {
// Between characters; the next byte must be ASCII or a lead byte.
utf8_state_none,
// Inside a multi-byte sequence with 1..5 continuation bytes outstanding.
utf8_state_multibyte1,
utf8_state_multibyte2,
utf8_state_multibyte3,
utf8_state_multibyte4,
utf8_state_multibyte5,
// utf8_decoder() refuses to run (EINVAL) while the state has this value.
utf8_state_error,
// Set after an error callback asks to skip or replace: the offending byte is
// examined again and silently dropped if it cannot start a character.
utf8_state_skip
};
struct utf8_decode_state* utf8_decoder(struct utf8_decode_state* ctx)
{
wchar_t* wr;
size_t avail;
enum utf8_decode_error error_type;
if(!ctx || !ctx->rd || !ctx->wr || ctx->wr_size < 2 || ctx->state == utf8_state_error) {
errno = EINVAL;
return 0;
}
wr = ctx->wr;
ctx->written = 0;
avail = ctx->wr_size;
loop:
while(ctx->rd_remain) {
uint8_t in = *ctx->rd;
switch(ctx->state) {
case utf8_state_skip:
case utf8_state_none:
if(!in && ctx->rd_remain < 0) {
*wr = 0;
ctx->complete = true;
++ctx->byte_offset;
return ctx;
}
if(!(in & 0x80)) {
*wr++ = in;
++ctx->written;
--avail;
++ctx->char_offset;
ctx->complete = true;
if(in == 0x0A) {
++ctx->line;
ctx->col = 0;
} else {
++ctx->col;
}
ctx->state = utf8_state_none;
break;
}
ctx->complete = false;
if((in & 0xE0) == 0xC0) {
ctx->minch = 0x80;
ctx->state = utf8_state_multibyte1;
ctx->statech = in & 0x1F;
} else if((in & 0xF0) == 0xE0) {
ctx->minch = 0x800;
ctx->state = utf8_state_multibyte2;
ctx->statech = in & 0x0F;
} else if((in & 0xF8) == 0xF0) {
ctx->minch = 0x10000;
ctx->state = utf8_state_multibyte3;
ctx->statech = in & 0x07;
} else if((in & 0xFC) == 0xF8) {
ctx->minch = 0x200000;
ctx->state = utf8_state_multibyte4;
ctx->statech = in & 0x03;
} else if((in & 0xFE) == 0xFC) {
ctx->minch = 0x4000000;
ctx->state = utf8_state_multibyte5;
ctx->statech = in & 0x01;
} else if(ctx->state != utf8_state_none) {
ctx->state = utf8_state_none;
} else {
error_type = ((in & 0xC0) == 0x80) ? utf8_decode_error_lone_cchar
: utf8_decode_error_not_schar;
goto error;
}
break;
case utf8_state_multibyte1:
case utf8_state_multibyte2:
case utf8_state_multibyte3:
case utf8_state_multibyte4:
case utf8_state_multibyte5:
if((in & 0xC0) != 0x80) {
error_type = utf8_decode_error_not_cchar;
goto error;
}
ctx->statech <<= 6;
ctx->statech |= in & 0x3F;
if(!--ctx->state) {
if(ctx->statech < ctx->minch) {
error_type = utf8_decode_error_overlong;
goto error;
} else {
// validate codepoint
if(!utf8_isvalid(ctx->statech)) {
error_type = utf8_decode_error_illegal_cp;
goto error;
}
// add to output string
*wr++ = ctx->statech;
++ctx->written;
--avail;
++ctx->char_offset;
ctx->complete = true;
if(ctx->statech == 0x0A || ctx->statech == 0x2028) {
++ctx->line;
ctx->col = 0;
} else {
++ctx->col;
}
}
}
break;
default:
errno = EINVAL;
return 0;
}
++ctx->byte_offset;
++ctx->rd;
if(ctx->rd_remain > 0) --ctx->rd_remain;
if(avail == 1) break;
}
*wr = 0;
return ctx;
error:
if(!ctx->error_callback) {
errno = EILSEQ;
return 0;
}
switch(ctx->error_callback(ctx, error_type, wr)) {
case utf8_decode_error_action_abort:
errno = EILSEQ;
return 0;
case utf8_decode_error_action_skip:
ctx->state = utf8_state_skip;
goto loop;
case utf8_decode_error_action_replace:
ctx->state = utf8_state_skip;
++ctx->written;
if(*wr == 0x0A || *wr == 0x2028) {
++ctx->line;
ctx->col = 0;
} else {
++ctx->col;
}
++wr;
if(--avail == 1) {
*wr = 0;
return ctx;
}
goto loop;
}
// shouldn't reach here
errno = EILSEQ;
return 0;
}
/* Standard error callback: whatever the error, store U+FFFD in *newch and
 * ask the decoder to use it as a replacement. */
enum utf8_decode_error_action utf8_decode_error_callback_replace(
    const struct utf8_decode_state* ctx, enum utf8_decode_error error, wchar_t* newch)
{
    const wchar_t replacement = 0xFFFD;

    /* neither the decoder state nor the error type influences the result */
    (void)error;
    (void)ctx;

    *newch = replacement;
    return utf8_decode_error_action_replace;
}
/* Standard error callback: whatever the error, ask the decoder to skip the
 * illegal byte sequence. *newch is left untouched. */
enum utf8_decode_error_action utf8_decode_error_callback_skip(
    const struct utf8_decode_state* ctx, enum utf8_decode_error error, wchar_t* newch)
{
    /* none of the arguments influences the result */
    (void)newch;
    (void)error;
    (void)ctx;

    return utf8_decode_error_action_skip;
}
/* options for text editors
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
*/

197
src/libutf8/decode_state.h Normal file
View File

@ -0,0 +1,197 @@
/* libutf8/src/libutf8/decode_state.h
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
/*! \defgroup decode_ctx UTF-8 stateful decoder.
This UTF-8 decoder uses a structure to maintain state information between calls. This means that
you can feed it a stream of data as it comes in without needing to store the entire document in a
buffer. It correctly copes with the currently-available data ending on a non-character boundary.
Errors are handled by providing a callback function (several of which are provided by the library).
The callback function has the option of aborting the conversion, substituting a replacement
character, or simply skipping the illegal byte sequence.
*/
/*!@{*/
/*! \brief Types of decoder error.
These are the types of error that can be encountered by the decoder. This allows slightly more
information than is provided by setting \a errno to \c EILSEQ. The type of error will be passed
to the callback function.
*/
enum utf8_decode_error {
/// Lone continuation char encountered when start char expected
/// (a byte of the form 10xxxxxx seen between characters).
utf8_decode_error_lone_cchar,
/// Non-continuation char encountered within multibyte sequence.
utf8_decode_error_not_cchar,
/// Invalid start char (not ASCII): a byte between characters that is
/// neither ASCII, a recognised lead byte, nor a continuation byte.
utf8_decode_error_not_schar,
/// Overlong byte sequence (decoded value is below the minimum for the
/// sequence length used).
utf8_decode_error_overlong,
/// Illegal code positions (UTF-16 surrogates or 0xFFFE,0xFFFF), i.e. a
/// decoded value rejected by utf8_isvalid().
utf8_decode_error_illegal_cp
};
/*! \brief Action to be taken after error callback.
These are the possible actions that can be undertaken after a stateful decode has encountered an
error. These actions are specified by the error callback function's return value.
*/
enum utf8_decode_error_action {
/// Abort the conversion, returning EILSEQ (utf8_decoder() returns 0).
utf8_decode_error_action_abort,
/// Skip the illegal byte sequence; nothing is written to the output.
utf8_decode_error_action_skip,
/// Discard the illegal byte sequence and enter a replacement char
/// (the value the callback stored through its \a newch argument).
utf8_decode_error_action_replace
};
/*! \brief Error callback type.
\param state The state-storage structure.
\param error The error type.
\param[out] newch If utf8_decode_error_action_replace is returned, then set this to the value of
the character you wish to replace with (\c 0xFFFD is recommended).
\returns A value specifying what action to undertake as a result of the callback.
This callback determines the action of the UTF-8 stateful decoder on encountering an illegal byte
sequence. It can choose to abort the conversion, skip the illegal sequence, or replace the illegal
sequence with an arbitrary character.
*/
// Function-pointer type stored in utf8_decode_state::error_callback.
// utf8_decoder() passes its current output position as newch, so a
// replacement character is written directly into the output buffer.
typedef enum utf8_decode_error_action(*utf8_decode_error_callback)(
const struct utf8_decode_state* state, enum utf8_decode_error error, wchar_t* newch);
/*! \brief State structure used to decode UTF-8 into Unicode.
This structure is used to decode arbitrary chunks of UTF-8 data into Unicode. It can deal with
partial data streams (even if they are cut-off mid-character).
Before calling utf8_decoder, you must set up the object appropriately. The first step is to use
\a memset to initialise everything to 0. Then you need to fill out the read and write pointers, and
possibly set up the error callback.
To use it, you set \a rd to point to your input data and \a rd_remain to the amount you have. If
\a rd_remain is negative, the input data is assumed to be null-terminated; otherwise, it is taken
as the number of bytes remaining at the input. These are updated after each call, so simply check
if \a rd_remain is 0 (or \a *rd is 0 in the case of a null-terminated string).
You must also set \a wr (pointer to destination buffer) and \a wr_size (number of characters that
can be written there), and \a written is set for you (it is the number of characters written per
call but excluding the terminating NUL). This implies that the buffer must have space for at least
two characters. You can change \a wr and \a wr_size at any time, but if you leave them the same the
data will be overwritten on each call.
If you wish to do error recovery, set \a error_callback and possibly \a data.
You can examine the \a line and \a col variables to get the line / column of the input data at which
the decoder is currently operating. \a char_offset and \a byte_offset represent the offset, in
complete characters or bytes, from the start of the stream. With the exception of \a byte_offset,
these variables aren't perfect, as they can be affected by errors and limitations (only 0x0A and
0x2028 are recognised as line end chars, and the effect of tabs is ignored).
*/
struct utf8_decode_state {
/// \c false if we are part-way through a multi-byte character, i.e. the input consumed so far
/// ended in the middle of a character; \c true otherwise.
bool complete;
/// Data to read (current read position). Updated by each call to utf8_decoder().
const char* rd;
/// Number of bytes remaining (current). A negative value means the input at \a rd is
/// null-terminated. Updated by each call to utf8_decoder().
int rd_remain;
/// Internal state; initialise to 0, don't change.
int state;
/// Error callback (may be 0). If it is 0, a decoding error aborts the conversion.
utf8_decode_error_callback error_callback;
/// Pointer to output buffer. Left alone between calls, so unless the caller advances it the
/// previous output is overwritten.
wchar_t* wr;
/// Number of characters that can be written at \a wr. A terminating NUL is written in addition
/// to the decoded characters, hence there must be room for at least two characters.
size_t wr_size;
/// Number of characters written on last call (excluding the terminating NUL).
size_t written;
/// Arbitrary data pointer for \a error_callback.
void* data;
/// Current line (starting from 0). Only 0x0A and 0x2028 are recognised as line ends.
int line;
/// Current column (starting from 0). The effect of tabs is ignored.
int col;
/// Character offset from start of data (starting from 0), counted in complete characters.
int char_offset;
/// Byte offset from start of data (starting from 0).
int byte_offset;
/// Don't use this. NOTE(review): presumably decoder-internal bookkeeping like \a state, in
/// which case it must also start out zeroed by the initial memset -- confirm against the decoder.
wchar_t statech;
/// Don't use this. NOTE(review): presumably decoder-internal bookkeeping like \a state, in
/// which case it must also start out zeroed by the initial memset -- confirm against the decoder.
wchar_t minch;
};
/*! \brief Decode an arbitrary chunk of a UTF-8 byte stream.
\param state The state-storage structure.
\retval state on success.
\retval 0 on error (see \a errno).
This function is used to do multi-pass decoding of arbitrary UTF-8 byte streams. Each call will
update \a state.rd, \a state.rd_remain and \a state.written. \a state.complete is \c true if, on consumption
of all the data, we are not inside a multibyte character.
Should an error occur, \a state.error_callback is called (if it is not 0). If it is 0, or it returns
utf8_decode_error_action_abort, then the conversion will be aborted and the object set into
an error state. \a errno will be set to \c EILSEQ. Once the object is in an error state, there is
no way to recover short of completely clearing it and starting with fresh data. Continuing to call
this function with an invalid object will result in \c EINVAL.
*/
struct utf8_decode_state* utf8_decoder(struct utf8_decode_state* state);
/// Standard error callback: use replacement char 0xFFFD.
enum utf8_decode_error_action utf8_decode_error_callback_replace(
const struct utf8_decode_state* ctx, enum utf8_decode_error error, wchar_t* newch);
/// Standard error callback: skip invalid chars.
enum utf8_decode_error_action utf8_decode_error_callback_skip(
const struct utf8_decode_state* ctx, enum utf8_decode_error error, wchar_t* newch);
/*!@}*/
/* options for text editors
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
*/

141
src/libutf8/encode.c Normal file
View File

@ -0,0 +1,141 @@
/* libutf8/src/lib/encode.c
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
/* Encode the single character `ch` at `dest`, which has room for `amt` bytes.
 * Returns the position just past the last byte written. On failure returns 0 with errno set:
 * EINVAL (no buffer), EILSEQ (utf8_isvalid() rejected `ch`) or ENOMEM (`amt` too small).
 * No terminating NUL is written.
 */
char* utf8_encode_char(char* dest, size_t amt, wchar_t ch)
{
    /* Lead-byte marker bits and payload mask, indexed by sequence length in bytes (2..6). */
    static const unsigned char lead_mark[7] = { 0, 0, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
    static const unsigned char lead_mask[7] = { 0, 0, 0x1F, 0x0F, 0x07, 0x03, 0x01 };
    size_t len, shift;

    if(!dest || !amt) {
        errno = EINVAL;
        return 0;
    }

    if(!utf8_isvalid(ch)) {
        errno = EILSEQ;
        return 0;
    }

    /* Single-byte case: amt >= 1 is already guaranteed by the check above. */
    if(ch < 0x80) {
        *dest++ = ch;
        return dest;
    }

    /* Pick the sequence length from the magnitude of the code point. */
    if(ch < 0x800) len = 2;
    else if(ch < 0x10000) len = 3;
    else if(ch < 0x200000) len = 4;
    else if(ch < 0x4000000) len = 5;
    else len = 6;

    if(amt < len) {
        errno = ENOMEM;
        return 0;
    }

    /* The lead byte carries the topmost bits; each continuation byte carries the next 6. */
    shift = 6 * (len - 1);
    *dest++ = lead_mark[len] | ((ch >> shift) & lead_mask[len]);
    while(shift) {
        shift -= 6;
        *dest++ = 0x80 | ((ch >> shift) & 0x3F);
    }

    return dest;
}
/* Like utf8_encode_char(), but an invalid `ch` is replaced by `ilseq` rather than failing.
 * `ilseq` must itself be valid, otherwise 0 is returned with errno set to EILSEQ.
 */
char* utf8_encode_char_force(char* dest, size_t amt, wchar_t ch, wchar_t ilseq)
{
    wchar_t chosen;

    if(!utf8_isvalid(ilseq)) {
        errno = EILSEQ;
        return 0;
    }

    chosen = ch;
    if(!utf8_isvalid(ch)) chosen = ilseq;

    return utf8_encode_char(dest, amt, chosen);
}
/* Encode a null-terminated string: thin wrapper around utf8_encode2(). */
char* utf8_encode(char* dest, size_t amt, const wchar_t* src)
{
    /* All-ones input length ends up as a negative rd_remain inside utf8_encode2(), which the
     * encoder treats as "stop at the null character". */
    const size_t until_nul = (size_t)-1;

    return utf8_encode2(dest, amt, 0, src, until_nul);
}
char* utf8_encode2(char* dest, size_t amt, size_t* written, const wchar_t* src, size_t inamt)
{
struct utf8_encode_state ctx;
memset(&ctx, 0, sizeof(ctx));
ctx.rd = src;
ctx.rd_remain = inamt;
ctx.wr = dest;
ctx.wr_size = amt;
if(!utf8_encoder(&ctx)) return 0;
if(ctx.rd_remain > 0 || (ctx.rd_remain < 0 && *ctx.rd)) {
errno = ENOMEM;
return 0;
}
if(written) *written = ctx.written;
return dest;
}
/* Best-effort encode of a null-terminated string: thin wrapper around utf8_encode_force2(). */
char* utf8_encode_force(char* dest, size_t amt, const wchar_t* src)
{
    /* All-ones input length ends up as a negative rd_remain inside utf8_encode_force2(), which
     * the encoder treats as "stop at the null character". */
    const size_t until_nul = (size_t)-1;

    return utf8_encode_force2(dest, amt, 0, src, until_nul);
}
char* utf8_encode_force2(char* dest, size_t amt, size_t* written, const wchar_t* src, size_t inamt)
{
struct utf8_encode_state ctx;
memset(&ctx, 0, sizeof(ctx));
ctx.rd = src;
ctx.rd_remain = inamt;
ctx.wr = dest;
ctx.wr_size = amt;
ctx.error_callback = utf8_encode_error_callback_replace;
if(!utf8_encoder(&ctx)) return 0;
if(written) *written = ctx.written;
return dest;
}
/* options for text editors
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
*/

144
src/libutf8/encode.h Normal file
View File

@ -0,0 +1,144 @@
/* libutf8/src/lib/encode.h
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
/*! \defgroup encode UTF-8 encoding routines.
The functions in this module allow encoding of UTF-8 characters. Errors are reported through
\c errno, with the following errors being of particular interest:
\li \c EINVAL - invalid argument to function
\li \c EILSEQ - illegal source character (see utf8_isvalid())
\li \c ENOMEM - not enough space in destination buffer
*/
/*!@{*/
/*! \brief Encode a single character into UTF-8.
\param dest The destination buffer.
\param amt Number of bytes in destination buffer.
\param ch Character to encode.
\returns Pointer to next byte of buffer to use.
\retval 0 on error (see \c errno).
This function will encode a single character into UTF-8. It returns a pointer to the end of the
character (i.e. the next position in the buffer you want to write to).
On error, it sets \c errno (to \c EINVAL, if \a dest is null or \a amt is less than 1; \c EILSEQ
if \a ch is not valid; or \c ENOMEM if the result would not fit into
\a amt bytes) and returns 0.
*/
char* utf8_encode_char(char* dest, size_t amt, wchar_t ch);
/*! \brief Encode a single character into UTF-8, forcing replacement of invalid characters.
\param dest The destination buffer.
\param amt Number of bytes in destination buffer.
\param ch Character to encode.
\param ilseq If \a ch is not a legal character, then this is encoded instead.
\returns Pointer to next byte of buffer to use.
\retval 0 on error (see \c errno).
This function will encode a single character into UTF-8. It returns a pointer to the end of the
character (i.e. the next position in the buffer you want to write to). If the source character \a ch
is not a valid code point, it will instead encode the character \a ilseq.
On error, it sets \c errno (to \c EINVAL, if \a dest is null or \a amt is less than 1; \c EILSEQ
if \a ilseq is not valid; or \c ENOMEM if the result would not fit into
\a amt bytes) and returns 0.
*/
char* utf8_encode_char_force(char* dest, size_t amt, wchar_t ch, wchar_t ilseq);
/*! \brief Encode a null-terminated string into UTF-8.
\param dest The destination buffer.
\param amt Number of bytes in the destination buffer.
\param src Null-terminated source string.
\returns Pointer to destination buffer.
\retval 0 on error (see \c errno).
This function encodes a null-terminated Unicode string into the destination buffer. It returns a
pointer to the destination buffer on success, and 0 on error. If there is not enough space in the
buffer, or an illegal character is encountered somewhere in the sequence, it will fail.
*/
char* utf8_encode(char* dest, size_t amt, const wchar_t* src);
/*! \brief Encode a fixed-size string into UTF-8.
\param dest The destination buffer.
\param amt Number of bytes in the destination buffer.
\param written Set to number of bytes written on success (excluding NUL).
\param src Pointer to source string.
\param inamt Number of characters to encode.
\returns Pointer to destination buffer.
\retval 0 on error (see \c errno).
This function encodes a Unicode string (possibly containing ASCII NUL) into the destination buffer.
It returns a pointer to the destination buffer on success, and 0 on error. If there is not enough
space in the buffer, or an illegal character is encountered somewhere in the sequence, it will fail.
The destination will be null-terminated.
*/
char* utf8_encode2(char* dest, size_t amt, size_t* written, const wchar_t* src, size_t inamt);
/*! \brief Encode a null-terminated string into UTF-8, ignoring errors.
\param dest The destination buffer.
\param amt Number of bytes in the destination buffer.
\param src Null-terminated source string.
\returns Pointer to destination buffer.
\returns 0 if arguments are invalid.
This function will encode a null-terminated Unicode string into the destination buffer, making a
best-effort in the case of failures. If there is not enough memory, the destination string will be
truncated (but still null-terminated). If an illegal source character is encountered, it is replaced
with the Unicode replacement character U+FFFD. The function can only fail if one of the arguments is
invalid.
*/
char* utf8_encode_force(char* dest, size_t amt, const wchar_t* src);
/*! \brief Encode a fixed-size string into UTF-8, ignoring errors.
\param dest The destination buffer.
\param amt Number of bytes in the destination buffer.
\param written Set to number of bytes written on success (excluding NUL).
\param src Pointer to source string.
\param inamt Number of characters to encode.
\returns Pointer to destination buffer.
\returns 0 if arguments are invalid.
This function will encode a Unicode string (possibly containing ASCII NUL) into the destination
buffer, making a best-effort in the case of failures. If there is not enough memory, the destination
string will be truncated (but still null-terminated). If an illegal source character is encountered,
it is replaced with the Unicode replacement character U+FFFD. The function can only fail if one of
the arguments is invalid.
*/
char* utf8_encode_force2(char* dest, size_t amt, size_t* written, const wchar_t* src, size_t inamt);
/*!@}*/
/* options for text editors
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
*/

View File

@ -0,0 +1,88 @@
/* libutf8/src/lib/encode_state.c
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
/* Run one encoding pass over the input described by `state`.
 *
 * Characters are read from state->rd and written as UTF-8 to state->wr until the input is
 * exhausted, the null character is reached (negative rd_remain) or the output buffer is full.
 * The output is always NUL-terminated on success; state->written excludes that terminator.
 *
 * Returns `state` on success. Returns 0 with errno set to EINVAL if `state` is null or not
 * filled out properly (no rd, no wr, or wr_size below 7), and 0 with errno set to EILSEQ if an
 * illegal source character could not be resolved through the error callback.
 */
struct utf8_encode_state* utf8_encoder(struct utf8_encode_state* state)
{
    char* wr, * ret, * endp;
    wchar_t ch;
    enum utf8_encode_error_action error_action;
    bool reencoding;

    /* Validate before touching any member: `state` itself may be null. */
    if(!state || !state->rd || !state->wr || state->wr_size < 7) {
        errno = EINVAL;
        return 0;
    }

    wr = state->wr;
    endp = wr + state->wr_size - 1; /* last byte is reserved for the terminating NUL */
    state->written = 0;

    while(state->rd_remain) {
        ch = *state->rd;
        if(!ch && state->rd_remain < 0) break; /* end of null-terminated input */
        reencoding = false;

    reencode:
        ret = utf8_encode_char(wr, endp - wr, ch);
        if(!ret) {
            /* Out of space: stop here, the caller resumes with a fresh buffer. */
            if(errno == ENOMEM) break;

            /* Illegal character. Give up if nobody can fix it, or if the replacement
             * supplied by the callback was itself illegal. */
            if(!state->error_callback || reencoding) {
                errno = EILSEQ;
                return 0;
            }

            error_action = state->error_callback(state, &ch);
            switch(error_action) {
            case utf8_encode_error_action_replace:
                reencoding = true;
                goto reencode;

            case utf8_encode_error_action_skip:
                ret = wr; /* consume the character without writing anything */
                break;

            case utf8_encode_error_action_abort:
            default:
                /* An unrecognised action is treated as abort; falling through with
                 * ret == 0 would corrupt the write pointer. */
                errno = EILSEQ;
                return 0;
            }
        }

        /* Character consumed: advance the read side and the position counters. */
        if(state->rd_remain > 0) state->rd_remain--;
        ++state->rd;
        ++state->char_offset;
        if(ch == 0x0A || ch == 0x2028) {
            ++state->line;
            state->col = 0;
        } else {
            ++state->col;
        }

        state->written += ret - wr;
        wr = ret;
        if(wr == endp) break; /* buffer full */
    }

    *wr = 0;
    return state;
}
/* Stock error callback: substitute U+FFFD for the illegal source character. */
enum utf8_encode_error_action utf8_encode_error_callback_replace(
    const struct utf8_encode_state* state, wchar_t* newch)
{
    const wchar_t replacement = 0xFFFD;

    (void)state; /* the decision does not depend on the encoder state */
    *newch = replacement;
    return utf8_encode_error_action_replace;
}
/* Stock error callback: drop the illegal source character. */
enum utf8_encode_error_action utf8_encode_error_callback_skip(
    const struct utf8_encode_state* state, wchar_t* newch)
{
    /* Neither argument is needed; *newch is left untouched. */
    (void)newch;
    (void)state;

    return utf8_encode_error_action_skip;
}
/* options for text editors
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
*/

158
src/libutf8/encode_state.h Normal file
View File

@ -0,0 +1,158 @@
/* libutf8/src/lib/encode_state.h
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
/*! \defgroup encode_state UTF-8 stateful encoder.
This UTF-8 encoder uses a structure to maintain state information between calls. This means that
you can feed it a stream of data as it comes in without needing to store the entire source in a
buffer.
Errors (i.e. illegal source chars; see utf8_isvalid()) are handled by providing a callback function
(several of which are provided by the library). The callback function has the option of aborting
the conversion, substituting a replacement character, or simply skipping the illegal source
character.
*/
/*!@{*/
/*! \brief Action to be taken after error callback.
These are the possible actions that can be undertaken after a stateful encoding operation has
encountered an error (illegal source char). These actions are specified by the error callback
function's return value.
*/
enum utf8_encode_error_action {
/// Abort the conversion; utf8_encoder() returns 0 with \c errno set to \c EILSEQ.
utf8_encode_error_action_abort,
/// Skip the illegal source character: it is consumed, but nothing is written for it.
utf8_encode_error_action_skip,
/// Discard the illegal source character and encode the replacement char stored through the
/// callback's \a newch argument instead. If that replacement is itself illegal, the conversion
/// fails with \c EILSEQ.
utf8_encode_error_action_replace
};
/*! \brief Error callback type.
\param state The encoder state information.
\param[out] newch If \a utf8_encode_error_action_replace is returned, this is set to the
character that should be substituted instead of the illegal source character.
This function is called whenever an error occurs. It can examine \a state (and specifically
\a *state.rd) to determine the illegal source character. It can choose to skip the character, replace
it with something else, or abort the conversion entirely.
*/
typedef enum utf8_encode_error_action (*utf8_encode_error_callback)(
const struct utf8_encode_state* state, wchar_t* newch);
/// Standard error callback: use replacement char 0xFFFD.
enum utf8_encode_error_action utf8_encode_error_callback_replace(
const struct utf8_encode_state* state, wchar_t* newch);
/// Standard error callback: skip invalid chars.
enum utf8_encode_error_action utf8_encode_error_callback_skip(
const struct utf8_encode_state* state, wchar_t* newch);
/*! \brief State structure used to encode Unicode into UTF-8.
This structure is used to encode an arbitrary Unicode string into UTF-8. To set it up, first call
\a memset to clear the structure to zero. You will then
want to set \a rd to point to your input string, with \a rd_remain the number of characters to encode
(you can set it to a negative number if \a rd is null-terminated and you want to encode the whole
thing). You will also want to tell it where to write to (\a wr) and how much space there is in that
buffer (\a wr_size).
To deal with errors (illegal input chars), you can provide a callback function \a error_callback.
An arbitrary \a data pointer is provided in case you wish to associate some object with the encode
operation. Passing a null pointer for \a error_callback is a valid way of indicating you do not
wish to attempt to correct errors.
You can examine the \a line and \a col variables to get the line / column of the input data at which
the encoder is currently operating. These variables aren't perfect, as they can be
affected by errors and limitations (only 0x0A and 0x2028 are recognised as line end chars, and the
effect of tabs is ignored). \a char_offset represents the offset, in complete characters, from the
start of the stream, and should always be accurate.
*/
struct utf8_encode_state {
/// Current read position. Advanced by utf8_encoder() for every source character it consumes.
const wchar_t* rd;
/// Number of chars remaining (-ve means to scan for null char). A positive count is
/// decremented for every character consumed; a negative value is left as it is.
int rd_remain;
/// Callback function used to handle illegal source characters. May be 0, in which case an
/// illegal character makes utf8_encoder() fail with \c EILSEQ.
utf8_encode_error_callback error_callback;
/// Output buffer. Not modified by utf8_encoder(); advance it yourself to keep earlier output.
char* wr;
/// Output buffer size in bytes. Must be at least 7; one byte is reserved for the terminating
/// null.
size_t wr_size;
/// Number of bytes written during last call (excluding the terminating null).
size_t written;
/// Arbitrary pointer (useful for \a error_callback).
void* data;
/// Current line (starting from 0). Incremented when 0x0A or 0x2028 is consumed.
int line;
/// Current column (starting from 0). Reset to 0 at each line end.
int col;
/// Character offset from start of data (starting from 0).
int char_offset;
};
/*! \brief Encode an arbitrary Unicode string.
\param state The encoder state information.
\retval state on success.
\retval 0 on error (see \c errno).
This function is used to encode some arbitrary Unicode string into UTF-8. It uses a state-storage
structure which allows you to perform the encoding in multiple passes (e.g. if you are encoding
an arbitrary string and outputting it, you will want to use a fixed size buffer and this might
be smaller than required).
In each pass of the function, \a rd and \a rd_remain will be updated to record the current reading
position and the number of characters left to encode. If the function completes this pass, \a rd_remain
will be zero (but if you are converting a null-terminated string, you will need to check for \a *rd
to be zero instead).
After each call, \a wr will be unchanged but \a written will contain the number of bytes written
(excluding a terminating null, which is always written). If you do not want to overwrite this data
on the next call, you will have to update \a wr and \a wr_size.
If \a state is null, or not filled out properly (no source data or destination buffer not at least 7
bytes large), then no conversion will be performed and \a EINVAL will be stored in \a errno. If an
illegal source character is encountered, and the error callback is 0, aborts the process or tries
to replace the char with another illegal code point, then \a EILSEQ will be stored in \a errno. On
error, 0 will be returned.
*/
struct utf8_encode_state* utf8_encoder(struct utf8_encode_state* state);
/*!@}*/
/* options for text editors
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
*/

21
src/libutf8/pkgconf.in Normal file
View File

@ -0,0 +1,21 @@
# libutf8/src/lib/clib/pkgconf.in
#
# Metadata file for pkg-config
# ( http://www.freedesktop.org/software/pkgconfig/ )
#
# (c)2006, Laurence Withers, <l@lwithers.me.uk>.
# Released under the GNU GPLv2. See file COPYING or
# http://www.gnu.org/copyleft/gpl.html for details.
#
# Name, description
Name: libutf8
Description: Library for encoding and decoding UTF-8
Version: @VERSION@
# Requirements
Requires:
# Compilation information
Libs: -L@LIBDIR@ -lutf8
Cflags: -I@INCLUDEDIR@

17
src/libutf8/soversion Normal file
View File

@ -0,0 +1,17 @@
# libutf8/src/libutf8/soversion
#
# (c)2006, Laurence Withers, <l@lwithers.me.uk>.
# Released under the GNU GPLv2. See file COPYING or
# http://www.gnu.org/copyleft/gpl.html for details.
#
# SOMAJOR and SOMINOR are included in the library's soname. They need to
# be bumped on a binary-incompatible release. They are both single
# integers.
SOMAJOR=0
SOMINOR=0
# SOMICRO is bumped every time there is a binary-compatible release.
SOMICRO=0

1
src/tests/.params Normal file
View File

@ -0,0 +1 @@
c tests tests libutf8

3
src/tests/build.default Normal file
View File

@ -0,0 +1,3 @@
source src/tests/build.tests
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
# vim: expandtab:ts=4:sw=4

43
src/tests/build.tests Normal file
View File

@ -0,0 +1,43 @@
# Builds every src/tests/*.c into obj/tests/<name>, linking against libutf8.
# A test is rebuilt when the library, its own source or this script is newer
# than the existing binary.
#
# These are external variables, and shouldn't clash with anything else
#  tests_BUILT
#

build_target libutf8 || return 1

if [ -z "${tests_BUILT}" ]
then
    LIBS="${libutf8} "
    EXTRAS=""

    echo "Building test programs..."
    do_cmd mkdir -p obj/tests || return 1

    for SRC in src/tests/*.c
    do
        # basename strips the literal ".c" suffix (no regex involved)
        TEST="obj/tests/$(basename "${SRC}" .c)"
        MODIFIED=0
        # LIBS is left unquoted on purpose: it is a whitespace-separated list
        for file in ${LIBS} "${SRC}" src/tests/build.tests
        do
            if [ "${file}" -nt "${TEST}" ]
            then
                MODIFIED=1
                break
            fi
        done

        if [ "${MODIFIED}" -ne 0 ]
        then
            # CC, CFLAGS, LIBS and EXTRAS rely on word splitting; paths are quoted
            do_cmd ${CC} -Iobj ${CFLAGS} -o "${TEST}" "${SRC}" ${LIBS} ${EXTRAS} || return 1
            print_success "Built ${TEST}"
        else
            print_success "${TEST} is up to date"
        fi
    done

    print_success "All tests built"
    tests_BUILT=1
fi
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
# vim: expandtab:ts=4:sw=4

107
src/tests/decode.c Normal file
View File

@ -0,0 +1,107 @@
/* libutf8/src/tests/decode.c
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
#include "utf8.h"
#include <stdio.h>
#include <string.h>
// Dumps amt wide characters from x to stdout as raw wchar_t values.
// The result of fwrite() is not inspected.
void writeout(const wchar_t* x, int amt)
{
    size_t count = amt;

    fwrite(x, sizeof *x, count, stdout);
}
enum utf8_decode_error_action error_callback(
const struct utf8_decode_state* ctx, enum utf8_decode_error error, wchar_t* newch)
{
fprintf(stderr, "Line %d, col %d (char %d, byte %d): ",
ctx->line + 1, ctx->col + 1, ctx->char_offset, ctx->byte_offset);
switch(error) {
case utf8_decode_error_lone_cchar:
fprintf(stderr, "a lone continuation char was encountered.\n");
break;
case utf8_decode_error_not_cchar:
fprintf(stderr, "a continuation char was expected, but not encountered.\n");
break;
case utf8_decode_error_not_schar:
fprintf(stderr, "an invalid character was encountered (not start char).\n");
break;
case utf8_decode_error_overlong:
fprintf(stderr, "an overlong character sequence was encountered.\n");
break;
case utf8_decode_error_illegal_cp:
fprintf(stderr, "an illegal code point was encountered.\n");
break;
}
*newch = 0xFFFD;
return utf8_decode_error_action_replace;
}
// Decodes UTF-8 read from stdin and writes the resulting wide characters to stdout.
// Decoding errors are reported on stderr through error_callback and replaced with U+FFFD.
// Returns 0 on success, 1 on bad usage, read failure or decoder failure.
int main(int argc, char* argv[])
{
    char inbuf[1024];
    wchar_t outbuf[1024];
    struct utf8_decode_state ctx;

    if(argc == 2 && !strcmp(argv[1], "--print-summary")) {
        printf("Decodes UTF-8 on stdin to UCS-4 on stdout.\n");
        return 0;
    }

    if(argc != 1) {
        fprintf(stderr, "No parameters expected. This program decodes UTF-8 presented on stdin\n"
            "and transforms it to UCS-4 on stdout.\n");
        return 1;
    }

    // set up ctx structure
    memset(&ctx, 0, sizeof(ctx));
    ctx.wr = outbuf;
    ctx.wr_size = sizeof(outbuf) / sizeof(wchar_t);
    ctx.error_callback = error_callback;

    // loop over input
    while(!feof(stdin)) {
        // read input
        ctx.rd_remain = fread(inbuf, 1, sizeof(inbuf), stdin);
        if(ferror(stdin)) {
            // a read error never sets the EOF flag, so without this check we would spin forever
            perror("fread");
            return 1;
        }
        ctx.rd = inbuf;

        // decode it; each pass reuses outbuf, so flush after every call
        while(ctx.rd_remain) {
            if(!utf8_decoder(&ctx)) {
                perror("utf8_decoder");
                fprintf(stderr, "(at line %d, col %d, char %d, byte %d)\n",
                    ctx.line + 1, ctx.col + 1, ctx.char_offset, ctx.byte_offset);
                return 1;
            }

            // write output
            writeout(outbuf, ctx.written);
        }
    }

    if(!ctx.complete) {
        fprintf(stderr, "Input did not end on a character boundary.\n");
    }

    return 0;
}
/* options for text editors
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
*/

165
src/tests/random.c Normal file
View File

@ -0,0 +1,165 @@
/* libutf8/src/tests/random.c
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
#include "utf8.h"
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdlib.h>
// Fills buf with ch characters read from /dev/urandom, clearing the top bit of each one.
// Exits the program if the device cannot be opened or read in full.
void make_rand(wchar_t* buf, int ch)
{
    int nbytes, left;
    int fd = open("/dev/urandom", O_RDONLY);

    if(fd < 0) {
        perror("open(\"/dev/urandom\")");
        exit(1);
    }

    nbytes = ch * sizeof(wchar_t);
    if(read(fd, (char*)buf, nbytes) != nbytes) {
        perror("read(\"/dev/urandom\")");
        exit(1);
    }
    close(fd);

    // walk backwards over the buffer, keeping only the low 31 bits of each character
    left = nbytes / sizeof(wchar_t);
    while(left--) {
        buf[left] &= 0x7FFFFFFF;
    }
}
// Encodes amt characters from src into dest (size bytes) using 20-byte encoder passes.
// Returns the number of bytes produced. Exits on encoder failure, or when fewer than 20 bytes
// of dest remain after a pass.
int do_encode(char* dest, size_t size, wchar_t* src, size_t amt)
{
    struct utf8_encode_state ctx;
    const char* limit = dest + size;

    memset(&ctx, 0, sizeof(ctx));
    ctx.wr = dest;
    ctx.wr_size = 20;
    ctx.rd = src;
    ctx.rd_remain = amt;

    for(; ctx.rd_remain != 0; ) {
        if(utf8_encoder(&ctx) == 0) {
            perror("utf8_encoder");
            exit(1);
        }

        // the encoder leaves wr alone, so step past what it just produced
        ctx.wr += ctx.written;
        if(ctx.wr + ctx.wr_size > limit) {
            fprintf(stderr, "do_encode: we're going to run out of memory\n");
            exit(1);
        }
    }

    return ctx.wr - dest;
}
// Returns the smaller of the two arguments.
int MIN(int x, int y)
{
    if(x < y) {
        return x;
    }
    return y;
}
// Decodes all amt bytes of src into dest (size characters) with a single decoder call.
// Exits if the decoder fails, leaves input unread, or stops in the middle of a character.
void do_decode_easy(wchar_t* dest, size_t size, const char* src, size_t amt)
{
    struct utf8_decode_state ctx;

    memset(&ctx, 0, sizeof(ctx));
    ctx.wr = dest;
    ctx.wr_size = size;
    ctx.rd = src;
    ctx.rd_remain = amt;

    if(utf8_decoder(&ctx) == 0) {
        perror("[easy] utf8_decoder");
        exit(1);
    }

    if(ctx.rd_remain != 0) {
        fprintf(stderr, "do_decode_easy: %d bytes left in buffer\n", ctx.rd_remain);
        exit(1);
    }

    if(ctx.complete == 0) {
        fprintf(stderr, "do_decode_easy: incomplete character at end of data\n");
        exit(1);
    }
}
// Decodes amt bytes of src into dest (size characters) in small steps: the decoder is fed at
// most 20 input bytes at a time and given an output window of at most 20 characters, which
// exercises its ability to resume across arbitrary chunk boundaries.
// Exits if the decoder reports an error.
void do_decode(wchar_t* dest, size_t size, const char* src, size_t amt)
{
    struct utf8_decode_state ctx;
    size_t space;

    memset(&ctx, 0, sizeof(ctx));
    ctx.rd = src;
    ctx.rd_remain = MIN(20, amt);
    amt -= ctx.rd_remain;
    ctx.wr = dest;
    ctx.wr_size = (size < 20) ? size : 20;

    while(ctx.rd_remain) {
        if(!utf8_decoder(&ctx)) {
            perror("utf8_decoder");
            exit(1);
        }

        // input chunk used up: hand over the next one (0 bytes once src is exhausted)
        if(!ctx.rd_remain) {
            ctx.rd_remain = MIN(20, amt);
            amt -= ctx.rd_remain;
        }

        // keep what was just decoded and shrink the output window to the space that is
        // actually left in dest, so the decoder can never write past dest + size
        ctx.wr += ctx.written;
        space = size - (ctx.wr - dest);
        if(ctx.wr_size > space) {
            ctx.wr_size = space;
        }
    }
}
// Compares the first count characters of got against expect. On a mismatch, prints the name of
// the failing pass and every differing position to stderr. Returns non-zero if they match.
static int buffers_match(const char* pass, const wchar_t* expect, const wchar_t* got, int count)
{
    int i;

    if(!memcmp(expect, got, count * sizeof(wchar_t))) return 1;

    fprintf(stderr, "[%s] ", pass);
    fprintf(stderr, "Output doesn't match input!\n");
    for(i = 0; i < count; ++i) {
        if(expect[i] != got[i]) {
            // %X expects unsigned int, which wchar_t is not guaranteed to be
            fprintf(stderr, "%4d: %08X != %08X\n", i,
                (unsigned int)expect[i], (unsigned int)got[i]);
        }
    }
    return 0;
}

// Round-trip test: encodes 1024 random characters to UTF-8, decodes them again both in a single
// pass and in small chunks, and checks that each decode reproduces the original data.
int main(int argc, char* argv[])
{
    wchar_t wbuf[1024], wbuf2[1025];
    char cbuf[8192];
    int amt;

    if(argc == 2 && !strcmp(argv[1], "--print-summary")) {
        printf("Encodes and decodes random well-formed strings.\n");
        return 0;
    }

    make_rand(wbuf, 1024);
    amt = do_encode(cbuf, 8192, wbuf, 1024);

    // check the single-pass result before the chunked decode overwrites wbuf2
    do_decode_easy(wbuf2, 1025, cbuf, amt);
    if(!buffers_match("easy", wbuf, wbuf2, 1024)) return 1;

    // poison the buffer so stale data from the first pass cannot satisfy the second check
    // (make_rand() clears the top bit, so all-ones never equals an input character)
    memset(wbuf2, 0xFF, sizeof(wbuf2));
    do_decode(wbuf2, 1025, cbuf, amt);
    if(!buffers_match("chunked", wbuf, wbuf2, 1024)) return 1;

    printf("Success.\n");
    return 0;
}
/* options for text editors
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
*/

35
src/tests/template Normal file
View File

@ -0,0 +1,35 @@
/* libutf8/src/tests/???.c
*
* (c)2006, Laurence Withers, <l@lwithers.me.uk>.
* Released under the GNU GPLv2. See file COPYING or
* http://www.gnu.org/copyleft/gpl.html for details.
*/
#include "utf8.h"
#include <stdio.h>
#include <string.h>

// Skeleton for a new test program: copy this file and fill in the TODO below.
// Every test must answer --print-summary with a one-line description and exit with 0.
int main(int argc, char* argv[])
{
    if(argc == 2 && !strcmp(argv[1], "--print-summary")) {
        printf("One line summary.\n");
        return 0;
    }

    if(argc == 1) {
        // empty argument list
    }

    int ret = 0;
    // TODO
    return ret;
}
/* options for text editors
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
vim: expandtab:ts=4:sw=4
*/