diff --git a/README b/README index 9d65ace..01fb08b 100644 --- a/README +++ b/README @@ -10,5 +10,9 @@ Really Quick Instructions To build: ./make.sh To install: ./make.sh install (you might want to set PREFIX, by default it's /usr/local) +Documentation is automatically built using doxygen. -@TODO@ +Project Homepage +---------------- + +http://www.lwithers.me.uk/projects/libutf8/ diff --git a/src/docs/.params b/src/docs/.params new file mode 100644 index 0000000..efd9ae0 --- /dev/null +++ b/src/docs/.params @@ -0,0 +1 @@ +doxygen docs docs diff --git a/src/docs/Doxyfile.in b/src/docs/Doxyfile.in new file mode 100644 index 0000000..c85bf84 --- /dev/null +++ b/src/docs/Doxyfile.in @@ -0,0 +1,146 @@ +# libutf8/src/docs/Doxyfile.in +# +# (c)2006, Laurence Withers, . +# Released under the GNU GPLv2. See file COPYING or +# http://www.gnu.org/copyleft/gpl.html for details. +# + +PROJECT_NAME = libutf8 +OUTPUT_DIRECTORY = +CREATE_SUBDIRS = NO +OUTPUT_LANGUAGE = English +USE_WINDOWS_ENCODING = NO +BRIEF_MEMBER_DESC = YES +REPEAT_BRIEF = YES +ABBREVIATE_BRIEF = +ALWAYS_DETAILED_SEC = NO +INLINE_INHERITED_MEMB = YES +FULL_PATH_NAMES = NO +STRIP_FROM_PATH = +STRIP_FROM_INC_PATH = +SHORT_NAMES = NO +JAVADOC_AUTOBRIEF = NO +MULTILINE_CPP_IS_BRIEF = YES +DETAILS_AT_TOP = YES +INHERIT_DOCS = YES +DISTRIBUTE_GROUP_DOC = NO +TAB_SIZE = 4 +ALIASES = +OPTIMIZE_OUTPUT_FOR_C = NO +OPTIMIZE_OUTPUT_JAVA = NO +SUBGROUPING = YES +EXTRACT_ALL = NO +EXTRACT_PRIVATE = NO +EXTRACT_STATIC = NO +EXTRACT_LOCAL_CLASSES = NO +EXTRACT_LOCAL_METHODS = NO +HIDE_UNDOC_MEMBERS = NO +HIDE_UNDOC_CLASSES = NO +HIDE_FRIEND_COMPOUNDS = YES +HIDE_IN_BODY_DOCS = NO +INTERNAL_DOCS = NO +CASE_SENSE_NAMES = YES +HIDE_SCOPE_NAMES = NO +SHOW_INCLUDE_FILES = NO +INLINE_INFO = YES +SORT_MEMBER_DOCS = YES +SORT_BRIEF_DOCS = NO +SORT_BY_SCOPE_NAME = NO +GENERATE_TODOLIST = YES +GENERATE_TESTLIST = YES +GENERATE_BUGLIST = YES +GENERATE_DEPRECATEDLIST= YES +ENABLED_SECTIONS = +MAX_INITIALIZER_LINES = 30 +SHOW_USED_FILES = 
NO +SHOW_DIRECTORIES = NO +FILE_VERSION_FILTER = +QUIET = YES +WARNINGS = YES +WARN_IF_UNDOCUMENTED = YES +WARN_IF_DOC_ERROR = YES +WARN_NO_PARAMDOC = YES +WARN_FORMAT = "$file:$line: $text" +WARN_LOGFILE = +FILE_PATTERNS = +RECURSIVE = NO +EXCLUDE = +EXCLUDE_SYMLINKS = NO +EXCLUDE_PATTERNS = +EXAMPLE_PATH = +EXAMPLE_PATTERNS = +EXAMPLE_RECURSIVE = NO +IMAGE_PATH = src/docs +INPUT_FILTER = +FILTER_PATTERNS = +FILTER_SOURCE_FILES = NO +SOURCE_BROWSER = NO +INLINE_SOURCES = NO +STRIP_CODE_COMMENTS = YES +REFERENCED_BY_RELATION = YES +REFERENCES_RELATION = YES +VERBATIM_HEADERS = NO +ALPHABETICAL_INDEX = YES +COLS_IN_ALPHA_INDEX = 5 +IGNORE_PREFIX = +GENERATE_HTML = YES +HTML_OUTPUT = html +HTML_FILE_EXTENSION = .html +HTML_HEADER = +HTML_FOOTER = +HTML_STYLESHEET = +HTML_ALIGN_MEMBERS = YES +GENERATE_HTMLHELP = NO +CHM_FILE = +HHC_LOCATION = +GENERATE_CHI = NO +BINARY_TOC = NO +TOC_EXPAND = NO +DISABLE_INDEX = NO +ENUM_VALUES_PER_LINE = 4 +GENERATE_TREEVIEW = NO +TREEVIEW_WIDTH = 250 +GENERATE_LATEX = NO +GENERATE_RTF = NO +GENERATE_MAN = NO +GENERATE_XML = NO +GENERATE_AUTOGEN_DEF = NO +GENERATE_PERLMOD = NO +ENABLE_PREPROCESSING = YES +MACRO_EXPANSION = NO +EXPAND_ONLY_PREDEF = NO +SEARCH_INCLUDES = YES +INCLUDE_PATH = +INCLUDE_FILE_PATTERNS = +PREDEFINED = DOXYGEN +EXPAND_AS_DEFINED = +SKIP_FUNCTION_MACROS = YES +TAGFILES = +GENERATE_TAGFILE = +ALLEXTERNALS = NO +EXTERNAL_GROUPS = YES +PERL_PATH = /usr/bin/perl +CLASS_DIAGRAMS = YES +HIDE_UNDOC_RELATIONS = YES +HAVE_DOT = YES +CLASS_GRAPH = YES +COLLABORATION_GRAPH = YES +GROUP_GRAPHS = NO +UML_LOOK = NO +TEMPLATE_RELATIONS = NO +INCLUDE_GRAPH = NO +INCLUDED_BY_GRAPH = NO +CALL_GRAPH = NO +GRAPHICAL_HIERARCHY = YES +DIRECTORY_GRAPH = NO +DOT_IMAGE_FORMAT = png +DOT_PATH = +DOTFILE_DIRS = +MAX_DOT_GRAPH_WIDTH = 1024 +MAX_DOT_GRAPH_HEIGHT = 1024 +MAX_DOT_GRAPH_DEPTH = 0 +DOT_TRANSPARENT = YES +DOT_MULTI_TARGETS = YES +GENERATE_LEGEND = YES +DOT_CLEANUP = YES +SEARCHENGINE = NO diff --git a/src/docs/MainPage.dox 
b/src/docs/MainPage.dox new file mode 100644 index 0000000..dbc156f --- /dev/null +++ b/src/docs/MainPage.dox @@ -0,0 +1,19 @@ +/* libutf8/src/docs/MainPage.dox + * + * (c)2006, Laurence Withers, . + * Released under the GNU GPLv2. See file COPYING or + * http://www.gnu.org/copyleft/gpl.html for details. +*/ + +/*! \mainpage + +\c libutf8 provides a C API for encoding and decoding UTF-8. It uses the C type \c wchar_t as its +internal character representation. \c libutf8 is a "safe" decoder — it will not accept +overlong byte sequences. + +*/ + +/* options for text editors +kate: replace-trailing-space-save true; space-indent true; tab-width 4; +vim: expandtab:ts=4:sw=4 +*/ diff --git a/src/docs/build.default b/src/docs/build.default new file mode 100644 index 0000000..ca22639 --- /dev/null +++ b/src/docs/build.default @@ -0,0 +1 @@ +source src/docs/build.docs diff --git a/src/docs/build.docs b/src/docs/build.docs new file mode 100644 index 0000000..653c323 --- /dev/null +++ b/src/docs/build.docs @@ -0,0 +1,43 @@ +# These are external variables, and shouldn't clash with anything else +# docs_BUILT +# + +MONOLITHIC_DOC="${MONOLITHIC_DOC} $(echo src/docs/*.dox)" +build_target monolithic + +if [ -z ${docs_BUILT} ] +then + echo "Building documentation with Doxygen..." + + DOXYFILE=obj/Doxyfile.docs + + if [ ! 
-e ${DOXYFILE} ] + then + do_cmd cp src/docs/Doxyfile.in ${DOXYFILE} || return 1 + echo "INPUT = ${MONOLITHIC_DOC}" >> ${DOXYFILE} + echo "PROJECT_NUMBER = ${VERSION}" >> ${DOXYFILE} + fi + + MODIFIED=0 + for file in ${MONOLITHIC_DOC} + do + if [ ${file} -nt html/index.html ] + then + MODIFIED=1 + break + fi + done + + if [ ${MODIFIED} -ne 0 ] + then + do_cmd doxygen ${DOXYFILE} || return 1 + print_success "Documentation built" + else + print_success "Documentation is up to date" + fi + + docs_BUILT=1 +fi + +# kate: replace-trailing-space-save true; space-indent true; tab-width 4; +# vim: expandtab:ts=4:sw=4 diff --git a/src/docs/build.install b/src/docs/build.install new file mode 100644 index 0000000..016c75c --- /dev/null +++ b/src/docs/build.install @@ -0,0 +1 @@ +source src/docs/build.install-docs diff --git a/src/docs/build.install-docs b/src/docs/build.install-docs new file mode 100644 index 0000000..66167d3 --- /dev/null +++ b/src/docs/build.install-docs @@ -0,0 +1,21 @@ +build_target docs + +# create documentation directories +echo "Installing documentation into ${DOCSDIR}" +build_dir_tree "${DOCSDIR}/html" || return 1 + +# copy across the Doxygen-generated documentation +for file in html/* +do + install_file ${file} ${DOCSDIR}/html 0644 || return 1 +done + +# copy across the generic files +for file in COPYING README +do + install_file ${file} ${DOCSDIR} 0644 || return 1 +done + +print_success "Documentation installed" +# kate: replace-trailing-space-save true; space-indent true; tab-width 4; +# vim: expandtab:ts=4:sw=4 diff --git a/src/libutf8/.params b/src/libutf8/.params new file mode 100644 index 0000000..88c82f2 --- /dev/null +++ b/src/libutf8/.params @@ -0,0 +1 @@ +c lib libutf8 utf8.h diff --git a/src/libutf8/BottomHeader.h b/src/libutf8/BottomHeader.h new file mode 100644 index 0000000..fef2ec3 --- /dev/null +++ b/src/libutf8/BottomHeader.h @@ -0,0 +1,11 @@ +/* libutf8/src/lib/BottomHeader.h + * + * (c)2006, Laurence Withers. 
Released under the GNU GPL. See file + * COPYING for more information / terms of license. +*/ + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/libutf8/ForwardDeclare.h b/src/libutf8/ForwardDeclare.h new file mode 100644 index 0000000..90f92c3 --- /dev/null +++ b/src/libutf8/ForwardDeclare.h @@ -0,0 +1,14 @@ +/* libutf8/src/lib/ForwardDeclare.h + * + * (c)2006, Laurence Withers. Released under the GNU GPL. See file + * COPYING for more information / terms of license. +*/ + +// This file simply contains forward declarations of all libutf8 +// classes, to facilitate header ordering, etc. + +// encode_state.h +struct utf8_encode_state; + +// decode_state.h +struct utf8_decode_state; diff --git a/src/libutf8/TopHeader.h b/src/libutf8/TopHeader.h new file mode 100644 index 0000000..7c621f0 --- /dev/null +++ b/src/libutf8/TopHeader.h @@ -0,0 +1,16 @@ +/* libutf8/src/lib/TopHeader.h + * + * (c)2006, Laurence Withers. Released under the GNU GPL. See file + * COPYING for more information / terms of license. +*/ + +#ifndef HEADER_LIBUTF8 +#define HEADER_LIBUTF8 + +// standard includes, or includes needed for type declarations +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif diff --git a/src/libutf8/TopSource.c b/src/libutf8/TopSource.c new file mode 100644 index 0000000..91ab1b9 --- /dev/null +++ b/src/libutf8/TopSource.c @@ -0,0 +1,13 @@ +/* libutf8/src/lib/TopSource.cpp + * + * (c)2006, Laurence Withers. Released under the GNU GPL. See file + * COPYING for more information / terms of license. +*/ + +#include "utf8.h" + +// Below are all the includes used throughout the library. 
+ +#include +#include +#include diff --git a/src/libutf8/build.default b/src/libutf8/build.default new file mode 100644 index 0000000..1a8d960 --- /dev/null +++ b/src/libutf8/build.default @@ -0,0 +1 @@ +source src/libutf8/build.lib diff --git a/src/libutf8/build.install b/src/libutf8/build.install new file mode 100644 index 0000000..441fcb0 --- /dev/null +++ b/src/libutf8/build.install @@ -0,0 +1 @@ +source src/libutf8/build.install-lib diff --git a/src/libutf8/build.install-lib b/src/libutf8/build.install-lib new file mode 100644 index 0000000..c6c6365 --- /dev/null +++ b/src/libutf8/build.install-lib @@ -0,0 +1,36 @@ +build_target libutf8 + +# make paths (this is for Gentoo in particular) +build_dir_tree "${LIBDIR}" || return 1 +build_dir_tree "${PKGCONFDIR}" || return 1 +build_dir_tree "${INCLUDEDIR}" || return 1 + +# install library +echo "Installing libraries into '${LIBDIR}'" +install_file ${libutf8} ${LIBDIR} 0755 || return 1 +BASE="${libutf8_BASE}.so" +MAJOR="${BASE}.${SOMAJOR}" +MINOR="${MAJOR}.${SOMINOR}" +MICRO="${MINOR}.${SOMICRO}" +install_symlink "${MINOR}" "${MICRO}" "${LIBDIR}" +install_symlink "${MAJOR}" "${MINOR}" "${LIBDIR}" +install_symlink "${BASE}" "${MAJOR}" "${LIBDIR}" + +# install header +echo "Installing header file '${libutf8_HEADER}' into ${INCLUDEDIR}" +install_header ${libutf8_HEADER} ${INCLUDEDIR} 0644 || return 1 + +# install pkgconfig file +echo "Installing package config file into ${PKGCONFDIR}" +PKGCONFFILE=${PKGCONFDIR}/libutf8.pc +do_cmd rm -f ${PKGCONFFILE} +do_cmd_redir ${PKGCONFFILE} sed \ + -e "s,@VERSION@,${VERSION}," \ + -e "s,@LIBDIR@,${FINALLIBDIR}," \ + -e "s,@INCLUDEDIR@,${FINALINCLUDEDIR}," \ + src/libutf8/pkgconf.in +do_cmd chmod 0644 ${PKGCONFFILE} +print_success "Done" + +# kate: replace-trailing-space-save true; space-indent true; tab-width 4; +# vim: expandtab:ts=4:sw=4 diff --git a/src/libutf8/build.lib b/src/libutf8/build.lib new file mode 100644 index 0000000..7eaecf6 --- /dev/null +++ b/src/libutf8/build.lib 
@@ -0,0 +1,51 @@ +# These are external variables, and shouldn't clash with anything else +# libutf8 +# libutf8_BUILT +# libutf8_HEADER +# libutf8_BASE + +if [ -z ${libutf8_BUILT} ] +then + libutf8_BASE=libutf8 + source src/libutf8/soversion + + libutf8="obj/${libutf8_BASE}.so.${SOMAJOR}.${SOMINOR}.${SOMICRO}" + SO_EXTRA="-lc" + + echo "Building library ${libutf8}..." + + do_cmd source src/libutf8/build.monolithic || return 1 + + MODIFIED=0 + for test in ${MONOLITHIC_TESTS} ${HDR} ${SRC} + do + if [ ${test} -nt ${libutf8} ] + then + MODIFIED=1 + break + fi + done + + if [ ${MODIFIED} -ne 0 ] + then + echo " Compiling" + + SONAME="${libutf8_BASE}.so.${SOMAJOR}.${SOMINOR}" + do_cmd ${CC} ${CFLAGS} -shared -fpic -o "${libutf8}" \ + -Wl,-soname,${SONAME} \ + ${SRC} ${SO_EXTRA} || return 1 + + # make tests work + do_cmd ln -sf $(basename ${libutf8}) obj/${SONAME} || return 1 + + print_success "Library built" + else + print_success "Library up to date" + fi + + libutf8_BUILT=1 + libutf8_HEADER=${HDR} + +fi +# kate: replace-trailing-space-save true; space-indent true; tab-width 4; +# vim: expandtab:ts=4:sw=4 diff --git a/src/libutf8/build.monolithic b/src/libutf8/build.monolithic new file mode 100644 index 0000000..85fb157 --- /dev/null +++ b/src/libutf8/build.monolithic @@ -0,0 +1,21 @@ +# These are external variables, and shouldn't clash with anything else +# libutf8_MONOLITHIC + +SRC="obj/libutf8.c" +HDR="obj/utf8.h" + +MONOLITHIC_TESTS="src/libutf8/build.lib src/libutf8/build.monolithic" + +if [ -z "${libutf8_MONOLITHIC}" ] +then + MONOLITHIC_SOURCE="$(echo src/libutf8/{TopHeader,ForwardDeclare,ctype,{de,en}code{,_state},BottomHeader}.h)" + make_monolithic ${HDR} C || return 1 + + MONOLITHIC_SOURCE="$(echo src/libutf8/{TopSource,ctype,{de,en}code{,_state}}.c)" + make_monolithic ${SRC} C || return 1 + + libutf8_MONOLITHIC=1 + MONOLITHIC_DOC="${MONOLITHIC_DOC} ${HDR}" +fi +# kate: replace-trailing-space-save true; space-indent true; tab-width 4; +# vim: 
expandtab:ts=4:sw=4
diff --git a/src/libutf8/ctype.c b/src/libutf8/ctype.c
new file mode 100644
index 0000000..d51532d
--- /dev/null
+++ b/src/libutf8/ctype.c
@@ -0,0 +1,55 @@
+/* libutf8/src/lib/ctype.c
+ *
+ * (c)2006, Laurence Withers. Released under the GNU GPL. See file
+ * COPYING for more information / terms of license.
+*/
+
+bool utf8_isascii(wchar_t ch)
+{
+    return (ch & ~0x7F) == 0; // ASCII means no bit outside the low seven is set
+}
+
+
+
+/* From PropList-4.1.0.txt (http://www.unicode.org/Public/UNIDATA/)
+
+0009..000D    ; White_Space # Cc   [5] <control-0009>..<control-000D>
+0020          ; White_Space # Zs       SPACE
+0085          ; White_Space # Cc       <control-0085>
+00A0          ; White_Space # Zs       NO-BREAK SPACE
+1680          ; White_Space # Zs       OGHAM SPACE MARK
+180E          ; White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
+2000..200A    ; White_Space # Zs  [11] EN QUAD..HAIR SPACE
+2028          ; White_Space # Zl       LINE SEPARATOR
+2029          ; White_Space # Zp       PARAGRAPH SEPARATOR
+202F          ; White_Space # Zs       NARROW NO-BREAK SPACE
+205F          ; White_Space # Zs       MEDIUM MATHEMATICAL SPACE
+3000          ; White_Space # Zs       IDEOGRAPHIC SPACE
+*/
+
+bool utf8_isspace(wchar_t ch)
+{
+    // the two contiguous ranges of the table above first, then its isolated code points
+    if(ch >= 0x0009 && ch <= 0x000D) return true;
+    if(ch >= 0x2000 && ch <= 0x200A) return true;
+    switch(ch) {
+    case 0x0020: case 0x0085: case 0x00A0: case 0x1680: case 0x180E:
+    case 0x2028: case 0x2029: case 0x202F: case 0x205F: case 0x3000:
+        return true;
+    default:
+        return false;
+    }
+}
+
+
+
+bool utf8_isvalid(wchar_t ch)
+{
+    if(ch & ~((wchar_t)0x7FFFFFFF)) return false; // a bit above the low 31 is set
+    if(ch >= 0xD800 && ch <= 0xDFFF) return false; // UTF-16 surrogate range
+    return ch != 0xFFFE && ch != 0xFFFF; // the two remaining excluded code points
+}
+
+/* options for text editors
+kate: replace-trailing-space-save true; space-indent true; tab-width 4;
+*/
diff --git a/src/libutf8/ctype.h b/src/libutf8/ctype.h
new file mode 100644
index 0000000..2cf7f62
--- /dev/null
+++ b/src/libutf8/ctype.h
@@ -0,0 +1,46 @@
+/* libutf8/src/lib/ctype.h
+ *
+ * (c)2006, Laurence Withers. Released under the GNU GPL. See file
+ * COPYING for more information / terms of license.
+*/
+
+/*! \defgroup ctype Character classification
+
+This module contains functions for character classification.
These are basically an extension of the +\c is* functions defined in \c <ctype.h>. + +\todo There are many char classification functions that haven't been implemented yet. These won't be + implemented until they can be done in a proper, Unicode-safe fashion. + + */ +/*!@{*/ + + + +/// Returns \c true if \a ch can be represented in ASCII. +bool utf8_isascii(wchar_t ch); + +/// Returns \c true if \a ch is whitespace. +bool utf8_isspace(wchar_t ch); + +/*! \brief Returns \c true if \a ch is a valid UCS-4 character. + +\param ch The character to classify. +\retval true If \a ch is a valid UCS-4 character. +\retval false If \a ch is not a valid UCS-4 character. + +This function will examine a \c wchar_t value and determine whether or not it is a valid UCS-4 +character. Valid characters lie in the range 0–0x7FFFFFFF but exclude: +\li the UTF-16 surrogate code points (U+D800–U+DFFF, inclusive) +\li the invalid code points U+FFFE and U+FFFF + +*/ +bool utf8_isvalid(wchar_t ch); + + + +/*!@}*/ + +/* options for text editors +kate: replace-trailing-space-save true; space-indent true; tab-width 4; +*/ diff --git a/src/libutf8/decode.c b/src/libutf8/decode.c new file mode 100644 index 0000000..2a25c51 --- /dev/null +++ b/src/libutf8/decode.c @@ -0,0 +1,234 @@ +/* libutf8/src/lib/decode.c + * + * (c)2006, Laurence Withers. Released under the GNU GPL. See file + * COPYING for more information / terms of license. 
+*/
+// Decode one character; utf8_decode_char2() is allowed to examine up to 6 bytes of src.
+wchar_t utf8_decode_char(const char* src, size_t* used)
+{
+    return utf8_decode_char2(src, 6, used);
+}
+
+// Decode one character from at most size bytes. On failure returns 0 with errno set to EINVAL
+// (null src or zero size) or EILSEQ. NOTE(review): decode.h documents (wchar_t)(-1) here; confirm.
+wchar_t utf8_decode_char2(const char* src, size_t size, size_t* used)
+{
+    uint8_t ch;
+    wchar_t ret, min; // min: smallest code point that is allowed to use this sequence length
+    int remain; // continuation bytes still expected after the lead byte
+
+    if(!src || !size) {
+        errno = EINVAL;
+        return 0;
+    }
+    if(used) *used = 1;
+    ch = *src++;
+
+    if(ch & 0x80) {
+        if((ch & 0xE0) == 0xC0) {
+            min = 0x80;
+            remain = 1;
+            if(used) *used = 2;
+            ret = ch & 0x1F;
+        } else if((ch & 0xF0) == 0xE0) {
+            min = 0x800;
+            remain = 2;
+            if(used) *used = 3;
+            ret = ch & 0x0F;
+        } else if((ch & 0xF8) == 0xF0) {
+            min = 0x10000;
+            remain = 3;
+            if(used) *used = 4;
+            ret = ch & 0x07;
+        } else if((ch & 0xFC) == 0xF8) {
+            min = 0x200000;
+            remain = 4;
+            if(used) *used = 5;
+            ret = ch & 0x03;
+        } else if((ch & 0xFE) == 0xFC) {
+            min = 0x4000000;
+            remain = 5;
+            if(used) *used = 6;
+            ret = ch & 0x01;
+        } else {
+            errno = EILSEQ; // lone continuation byte, or 0xFE / 0xFF
+            return 0;
+        }
+
+        while(remain--) {
+            if(!--size) { // the source buffer ends inside the sequence
+                errno = EILSEQ;
+                return 0;
+            }
+            ch = *src++;
+            if((ch & 0xC0) != 0x80) {
+                errno = EILSEQ;
+                return 0;
+            }
+            ret <<= 6;
+            ret |= ch & 0x3F;
+        }
+
+        if(ret < min) { // overlong form: test the decoded value (ch is only the last byte read)
+            errno = EILSEQ;
+            return 0;
+        }
+
+        return ret;
+    }
+    return ch;
+}
+
+// As utf8_decode_char(), but passes ilseq on to utf8_decode_char2_force() for malformed input;
+// up to 6 bytes of src may be examined.
+wchar_t utf8_decode_char_force(const char* src, size_t* used, wchar_t ilseq)
+{
+    return utf8_decode_char2_force(src, 6, used, ilseq);
+}
+
+// Decode one character from at most size bytes, returning ilseq for a malformed sequence.
+// Returns 0 with errno = EINVAL for a null src or a zero size.
+wchar_t utf8_decode_char2_force(const char* src, size_t size, size_t* used, wchar_t ilseq)
+{
+    uint8_t ch;
+    wchar_t ret, min;
+    int remain;
+
+    if(!src || !size) {
+        errno = EINVAL;
+        return 0;
+    }
+    if(used) *used = 1;
+    ch = *src++;
+
+    if(ch & 0x80) {
+        if((ch & 0xE0) == 0xC0) {
+            min = 0x80;
+            remain = 1;
+            ret = ch & 0x1F;
+        } else if((ch & 0xF0) == 0xE0) {
+            min = 0x800;
+            remain = 2;
+            ret = ch & 0x0F;
+        } else if((ch & 0xF8) == 0xF0) {
+            min = 0x10000;
+            remain = 3;
+            ret = ch & 0x07;
+        } else if((ch & 0xFC) == 0xF8) {
+            min = 0x200000;
+            remain = 4;
+            ret = ch & 0x03;
+        } else if((ch & 0xFE) == 0xFC) {
+            min = 0x4000000;
+            
remain = 5;
+            ret = ch & 0x01;
+        } else {
+            goto ILSEQ;
+        }
+
+        while(remain--) {
+            if(size < 2) goto ILSEQ; // size still counts the lead byte, so < 2 means nothing is unread
+            if((*src & 0xC0) != 0x80) goto ILSEQ; // left unconsumed: it may begin the next character
+            ch = *src++;
+            --size;
+            if(used) (*used)++;
+            ret = (ret << 6) | (ch & 0x3F);
+        }
+
+        if(ret < min) goto ILSEQ; // overlong form: test the decoded value, not the last byte read
+
+        return ret;
+    }
+    return ch;
+
+ILSEQ:
+    // advance pointer to next valid char boundary
+    while(1) {
+        if(size < 2) break; // checked before *src so the buffer end is never read past
+        if((*src & 0xC0) != 0x80) break; // lead byte, ASCII or NUL: a character boundary
+        ++src;
+        --size;
+        if(used) (*used)++;
+    }
+
+    return ilseq;
+}
+
+// Decode a NUL-terminated string (rd_remain = -1) into dest, which holds size characters.
+// Returns dest, or 0 with errno set; ENOMEM means input was left over when the decoder stopped.
+wchar_t* utf8_decode(wchar_t* dest, size_t size, const char* src)
+{
+    struct utf8_decode_state ctx;
+    memset(&ctx, 0, sizeof(ctx));
+    ctx.rd = src;
+    ctx.rd_remain = -1;
+    ctx.wr = dest;
+    ctx.wr_size = size;
+
+    if(!utf8_decoder(&ctx)) return 0;
+    if(*ctx.rd) {
+        errno = ENOMEM;
+        return 0;
+    }
+
+    return dest;
+}
+
+// Decode exactly amt bytes of src. Returns dest, or 0 with errno set; ENOMEM is reported when
+// bytes remain unread or the input ends part-way through a character.
+wchar_t* utf8_decode2(wchar_t* dest, size_t size, size_t* written, const char* src, size_t amt)
+{
+    struct utf8_decode_state ctx;
+    memset(&ctx, 0, sizeof(ctx));
+    ctx.rd = src;
+    ctx.rd_remain = amt;
+    ctx.wr = dest;
+    ctx.wr_size = size;
+
+    if(!utf8_decoder(&ctx)) return 0;
+    if(ctx.rd_remain || !ctx.complete) {
+        errno = ENOMEM;
+        return 0;
+    }
+    if(written) *written = ctx.written;
+
+    return dest;
+}
+
+// Decode a NUL-terminated string, substituting U+FFFD for malformed input through
+// utf8_decode_error_callback_replace. Output that does not fit in dest is truncated.
+wchar_t* utf8_decode_force(wchar_t* dest, size_t size, const char* src)
+{
+    struct utf8_decode_state ctx;
+    memset(&ctx, 0, sizeof(ctx));
+    ctx.rd = src;
+    ctx.rd_remain = -1;
+    ctx.wr = dest;
+    ctx.wr_size = size;
+    ctx.error_callback = utf8_decode_error_callback_replace;
+
+    if(!utf8_decoder(&ctx)) return 0;
+    // Unlike utf8_decode(), input left unread in ctx.rd is not an error here:
+    // decode.h documents that this variant truncates the output when dest is
+    // too small, and utf8_decode_force2() performs no such check either.
+    // The decoder has already NUL-terminated whatever was written.
+
+    return dest;
+}
+
+// Fixed-size counterpart of utf8_decode_force(): decodes amt bytes of src with the same
+// U+FFFD replacement callback.
+wchar_t* utf8_decode_force2(wchar_t* dest, size_t size, size_t* written, const char* src, size_t amt)
+{
+    struct utf8_decode_state ctx;
+    memset(&ctx, 0, sizeof(ctx));
+    ctx.rd = src;
+    ctx.rd_remain = amt;
+    ctx.wr = dest;
+    ctx.wr_size = size;
+    ctx.error_callback = utf8_decode_error_callback_replace;
+
+    if(!utf8_decoder(&ctx)) return 0;
+    if(written) *written = 
ctx.written; + return dest; +} diff --git a/src/libutf8/decode.h b/src/libutf8/decode.h new file mode 100644 index 0000000..bdd10b4 --- /dev/null +++ b/src/libutf8/decode.h @@ -0,0 +1,187 @@ +/* libutf8/src/lib/decode.h + * + * (c)2006, Laurence Withers. Released under the GNU GPL. See file + * COPYING for more information / terms of license. +*/ + +/*! \defgroup decode UTF-8 decoding routines. + +These routines decode UTF-8 data into C's wide character type \c wchar_t. Errors are reported +through \c errno, with the following errors being of particular interest: + +\li \c EINVAL - invalid argument to function +\li \c EILSEQ - illegal encoding (i.e. not UTF-8 or encoding error) +\li \c ENOMEM - not enough space in destination buffer + +As a special case, functions which return a character may return the \c wchar_t representation of +-1 to signify an error. This wording is used to take into account the fact that the \c wchar_t type +could be unsigned. + +*/ +/*!@{*/ + + + +/*! \brief Decode a character. + +\param src Pointer to start of source data. +\param used If not null, set to the number of bytes used. +\retval (wchar_t)(-1) on error (see \c errno). +\returns Decoded character. + +Decodes a single character, returning the \c wchar_t representation of -1 on error and setting +\c errno appropriately. If \a used is not NULL, it is set to the number of characters used. Passing +a null pointer for \a src will result in \c EINVAL. If the UTF-8 byte sequence is corrupt, \a errno +will be set to \c EILSEQ. + +\warning Only use this function if you are sure it cannot read past the end of your buffer. See + utf8_decode_char2() for a safe version. + +*/ +wchar_t utf8_decode_char(const char* src, size_t* used); + + + +/*! \brief Decode a character, discarding illegal sequences. + +\param src Pointer to start of source data. +\param used If not null, set to the number of bytes used. +\param ilseq This value is returned if the UTF-8 byte sequence is invalid. 
Recommended is the
+    Unicode replacement character, \c 0xFFFD.
+\retval (wchar_t)(-1) on error (see \c errno).
+\retval ilseq If an illegal sequence is encountered.
+\returns Decoded character.
+\post \a *used will be set to the number of bytes consumed.
+
+Decodes a single character, returning the \c wchar_t representation of -1 on error and setting
+\c errno appropriately. If \a used is not NULL, it is set to the number of bytes used. Passing
+a null pointer for \a src will result in \c EINVAL. If the UTF-8 byte sequence is corrupt, \a ilseq
+will be returned and the buffer advanced to the next valid character. This means the function can
+only fail if you pass it an invalid \a src pointer.
+
+\warning Only use this function if you are sure it cannot read past the end of your buffer. See
+    utf8_decode_char2_force() for a safe version.
+
+*/
+wchar_t utf8_decode_char_force(const char* src, size_t* used, wchar_t ilseq);
+
+
+
+/*! \brief Decode a character, given source buffer size.
+
+\param src Pointer to start of source data.
+\param size Size of source data in bytes.
+\param used If not null, set to the number of bytes used.
+\retval (wchar_t)(-1) on error (see \c errno).
+\returns Decoded character.
+
+Decodes a single character, returning the \c wchar_t representation of -1 on error and setting
+\c errno appropriately. If \a used is not NULL, it is set to the number of bytes used.
+
+*/
+wchar_t utf8_decode_char2(const char* src, size_t size, size_t* used);
+
+
+
+/*! \brief Decode a character, discarding illegal sequences and given source buffer size.
+
+\param src Pointer to start of source data.
+\param size Size of source data in bytes.
+\param used If not null, set to the number of bytes used.
+\param ilseq This value is returned if the UTF-8 byte sequence is invalid. Recommended is the
+    Unicode replacement character, \c 0xFFFD.
+\retval (wchar_t)(-1) on error (see \c errno).
+\retval ilseq If an illegal sequence is encountered.
+\returns Decoded character.
+\post \a *used will be set to the number of bytes consumed.
+
+Decodes a single character, returning the \c wchar_t representation of -1 on error and setting
+\c errno appropriately. If \a used is not NULL, it is set to the number of bytes used. Passing
+a null pointer for \a src will result in \c EINVAL. If the UTF-8 byte sequence is corrupt, \a ilseq
+will be returned and the buffer advanced to the next valid character. This means the function can
+only fail if you pass it an invalid \a src pointer, or a \a size of 0.
+
+*/
+wchar_t utf8_decode_char2_force(const char* src, size_t size, size_t* used, wchar_t ilseq);
+
+
+
+/*! \brief Decode a null-terminated string.
+
+\param dest The output destination.
+\param size The number of characters that can be stored in \a dest.
+\param src Pointer to the null-terminated source data.
+\returns Pointer to the output destination.
+\retval 0 on error (see \c errno).
+
+This function will attempt to decode a null-terminated UTF-8 string. It returns 0 on error and sets
+\c errno appropriately.
+
+*/
+wchar_t* utf8_decode(wchar_t* dest, size_t size, const char* src);
+
+
+
+/*! \brief Decode a fixed-size string.
+
+\param dest The output destination.
+\param size The number of characters that can be stored in \a dest.
+\param written Set to the number of characters written (excluding NUL).
+\param src Pointer to the source data (\a amt bytes; need not be null-terminated).
+\param amt Number of bytes to decode.
+\returns Pointer to the output destination.
+\retval 0 on error (see \c errno).
+
+This function will attempt to decode a fixed-size UTF-8 string. It returns 0 on error and sets
+\c errno appropriately. It will happily transcode ASCII NUL characters. If \a written is not null,
+it is set to the number of characters written excluding the terminating NUL. This function always
+produces null-terminated strings.
+
+*/
+wchar_t* utf8_decode2(wchar_t* dest, size_t size, size_t* written, const char* src, size_t amt);
+
+
+
+/*! \brief Decode a null-terminated string, ignoring errors.
+
+\param dest The output destination.
+\param size The number of characters that can be stored in \a dest.
+\param src Pointer to the null-terminated source data.
+\returns Pointer to the output destination.
+\retval 0 on error (see \c errno).
+
+This function will attempt to decode a null-terminated UTF-8 string. It returns 0 on error and sets
+\c errno appropriately.
+
+This function will truncate the output if there is not enough space and will substitute U+FFFD for
+byte sequences it cannot decode. It can only fail if you pass it invalid parameters.
+
+*/
+wchar_t* utf8_decode_force(wchar_t* dest, size_t size, const char* src);
+
+
+
+/*! \brief Decode a fixed-size string, ignoring errors.
+
+\param dest The output destination.
+\param size The number of characters that can be stored in \a dest.
+\param written Set to the number of characters written (excluding NUL).
+\param src Pointer to the source data (\a amt bytes; need not be null-terminated).
+\param amt Number of bytes to decode.
+\returns Pointer to the output destination.
+\retval 0 on error (see \c errno).
+
+This function will attempt to decode a fixed-size UTF-8 string. It returns 0 on error and sets
+\c errno appropriately. It will happily transcode ASCII NUL characters. If \a written is not null,
+it is set to the number of characters written excluding the terminating NUL. This function always
+produces null-terminated strings.
+
+This function will truncate the output if there is not enough space and will substitute U+FFFD for
+byte sequences it cannot decode. It can only fail if you pass it invalid parameters.
+
+*/
+wchar_t* utf8_decode_force2(wchar_t* dest, size_t size, size_t* written, const char* src, size_t amt);
+
+
+
+/*!@}*/
diff --git a/src/libutf8/decode_state.c b/src/libutf8/decode_state.c
new file mode 100644
index 0000000..04a28f9
--- /dev/null
+++ b/src/libutf8/decode_state.c
@@ -0,0 +1,204 @@
+/* libutf8/src/lib/decode_ctx.c
+ *
+ * (c)2006, Laurence Withers. Released under the GNU GPL. See file
+ * COPYING for more information / terms of license.
+*/
+
+enum utf8_decoder_state { // order matters: utf8_decoder() counts state down to 0 per continuation byte
+    utf8_state_none, // 0: between characters
+    utf8_state_multibyte1, // N continuation bytes are still expected in utf8_state_multibyteN
+    utf8_state_multibyte2,
+    utf8_state_multibyte3,
+    utf8_state_multibyte4,
+    utf8_state_multibyte5,
+    utf8_state_error, // utf8_decoder() rejects a context in this state with EINVAL
+    utf8_state_skip // set after the error callback chose skip or replace
+};
+
+// Decode bytes from ctx->rd into wchar_t characters at ctx->wr, always leaving room for a terminating 0.
+// Returns ctx, or 0 with errno set: EINVAL for a bad context, EILSEQ for unhandled malformed input.
+struct utf8_decode_state* utf8_decoder(struct utf8_decode_state* ctx)
+{
+    wchar_t* wr;
+    size_t avail;
+    enum utf8_decode_error error_type;
+
+    // wr_size < 2 is rejected: the output needs one character plus the terminator
+    if(!ctx || !ctx->rd || !ctx->wr || ctx->wr_size < 2 || ctx->state == utf8_state_error) {
+        errno = EINVAL;
+        return 0;
+    }
+
+    wr = ctx->wr;
+    ctx->written = 0; // counted per call
+    avail = ctx->wr_size;
+
+loop:
+    while(ctx->rd_remain) {
+        uint8_t in = *ctx->rd;
+
+        switch(ctx->state) {
+        case utf8_state_skip:
+        case utf8_state_none:
+            // a negative rd_remain means NUL-terminated input; rd is left pointing at the NUL
+            if(!in && ctx->rd_remain < 0) {
+                *wr = 0;
+                ctx->complete = true;
+                ++ctx->byte_offset;
+                return ctx;
+            }
+            if(!(in & 0x80)) { // single-byte (ASCII) character
+                *wr++ = in;
+                ++ctx->written;
+                --avail;
+                ++ctx->char_offset;
+                ctx->complete = true;
+                if(in == 0x0A) {
+                    ++ctx->line;
+                    ctx->col = 0;
+                } else {
+                    ++ctx->col;
+                }
+                ctx->state = utf8_state_none;
+                break;
+            }
+            ctx->complete = false;
+            // lead byte: remember the smallest value this length may encode and how many bytes follow
+            if((in & 0xE0) == 0xC0) {
+                ctx->minch = 0x80;
+                ctx->state = utf8_state_multibyte1;
+                ctx->statech = in & 0x1F;
+            } else if((in & 0xF0) == 0xE0) {
+                ctx->minch = 0x800;
+                ctx->state = utf8_state_multibyte2;
+                ctx->statech = in & 0x0F;
+            } else if((in & 0xF8) == 0xF0) {
+                ctx->minch = 0x10000;
+                ctx->state = utf8_state_multibyte3;
+                ctx->statech = in & 0x07;
+            } else if((in & 0xFC) == 0xF8) {
+                ctx->minch = 0x200000;
+                ctx->state = utf8_state_multibyte4;
+                ctx->statech = in & 0x03;
+            } else if((in & 0xFE) == 0xFC) {
+                ctx->minch = 0x4000000;
+                ctx->state = utf8_state_multibyte5;
+                ctx->statech = in & 0x01;
+            } else if(ctx->state != utf8_state_none) {
+                // skip state: the byte already reported to the callback is dropped without a second error
+                ctx->state = utf8_state_none;
+            } else {
+                error_type = ((in & 0xC0) == 0x80) ? utf8_decode_error_lone_cchar
+                    : utf8_decode_error_not_schar;
+                goto error;
+            }
+            break;
+
+        case utf8_state_multibyte1:
+        case utf8_state_multibyte2:
+        case utf8_state_multibyte3:
+        case utf8_state_multibyte4:
+        case utf8_state_multibyte5:
+            if((in & 0xC0) != 0x80) {
+                error_type = utf8_decode_error_not_cchar;
+                goto error;
+            }
+            ctx->statech <<= 6;
+            ctx->statech |= in & 0x3F;
+            if(!--ctx->state) { // reaching 0 (utf8_state_none) means the sequence is complete
+                if(ctx->statech < ctx->minch) {
+                    error_type = utf8_decode_error_overlong;
+                    goto error;
+                } else {
+                    // validate codepoint
+                    if(!utf8_isvalid(ctx->statech)) {
+                        error_type = utf8_decode_error_illegal_cp;
+                        goto error;
+                    }
+
+                    // add to output string
+                    *wr++ = ctx->statech;
+                    ++ctx->written;
+                    --avail;
+                    ++ctx->char_offset;
+                    ctx->complete = true;
+                    if(ctx->statech == 0x0A || ctx->statech == 0x2028) {
+                        ++ctx->line;
+                        ctx->col = 0;
+                    } else {
+                        ++ctx->col;
+                    }
+                }
+            }
+            break;
+
+        default:
+            errno = EINVAL;
+            return 0;
+        }
+
+        ++ctx->byte_offset;
+        ++ctx->rd;
+        if(ctx->rd_remain > 0) --ctx->rd_remain; // a negative count (NUL-terminated mode) is left alone
+        if(avail == 1) break; // keep the last slot for the terminating 0
+    }
+    *wr = 0;
+    return ctx;
+
+error:
+    // reached before rd is advanced, so ctx->rd still points at the offending byte
+    if(!ctx->error_callback) {
+        errno = EILSEQ;
+        return 0;
+    }
+    switch(ctx->error_callback(ctx, error_type, wr)) { // the callback may store a replacement at *wr
+    case utf8_decode_error_action_abort:
+        errno = EILSEQ;
+        return 0;
+
+    case utf8_decode_error_action_skip:
+        ctx->state = utf8_state_skip;
+        goto loop;
+
+    case utf8_decode_error_action_replace:
+        ctx->state = utf8_state_skip;
+        ++ctx->written;
+        if(*wr == 0x0A || *wr == 0x2028) {
+            ++ctx->line;
+            ctx->col = 0;
+        } else {
+            ++ctx->col;
+        }
+        ++wr;
+        if(--avail == 1) { // output full: terminate and return with the state left at skip
+            *wr = 0;
+            return ctx;
+        }
+        goto loop;
+    }
+
+    // shouldn't reach here
+    errno = EILSEQ;
+    return 0;
+}
+
+
+
+enum utf8_decode_error_action 
utf8_decode_error_callback_replace( + const struct utf8_decode_state* ctx, enum utf8_decode_error error, wchar_t* newch) +{ + (void)ctx; + (void)error; + *newch = 0xFFFD; + return utf8_decode_error_action_replace; +} + + + +enum utf8_decode_error_action utf8_decode_error_callback_skip( + const struct utf8_decode_state* ctx, enum utf8_decode_error error, wchar_t* newch) +{ + (void)ctx; + (void)error; + (void)newch; + return utf8_decode_error_action_skip; +} + +/* options for text editors +kate: replace-trailing-space-save true; space-indent true; tab-width 4; +*/ diff --git a/src/libutf8/decode_state.h b/src/libutf8/decode_state.h new file mode 100644 index 0000000..6c15ee1 --- /dev/null +++ b/src/libutf8/decode_state.h @@ -0,0 +1,197 @@ +/* libutf8/src/lib/decode_ctx.h + * + * (c)2006, Laurence Withers. Released under the GNU GPL. See file + * COPYING for more information / terms of license. +*/ + +/*! \defgroup decode_ctx UTF-8 stateful decoder. + +This UTF-8 decoder uses a structure to maintain state information between calls. This means that +you can feed it a stream of data as it comes in without needing to store the entire document in a +buffer. It correctly copes with the currently-available data ending on a non-character boundary. + +Errors are handled by providing a callback function (several of which are provided by the library). +The callback function has the option of aborting the conversion, substituting a replacement +character, or simply skipping the illegal byte sequence. + +*/ +/*!@{*/ + + + +/*! \brief Types of decoder error. + +These are the types of error that can be encountered by the decoder. This allows slightly more +information than is provided by setting \a errno to \c EILSEQ. The type of error will be passed +to the callback function. + +*/ +enum utf8_decode_error { + /// Lone continuation char encountered when start char expected. + utf8_decode_error_lone_cchar, + + /// Non-continuation char encountered within multibyte sequence. 
+ utf8_decode_error_not_cchar, + + /// Invalid start char (not ASCII). + utf8_decode_error_not_schar, + + /// Overlong byte sequence. + utf8_decode_error_overlong, + + /// Illegal code positions (UTF-16 surrogates or 0xFFFE,0xFFFF). + utf8_decode_error_illegal_cp +}; + + + +/*! \brief Action to be taken after error callback. + +These are the possible actions that can be undertaken after a stateful decode has encountered an +error. These actions are specified by the error callback function's return value. + +*/ +enum utf8_decode_error_action { + /// Abort the conversion, returning EILSEQ. + utf8_decode_error_action_abort, + + /// Skip the illegal byte sequence. + utf8_decode_error_action_skip, + + /// Discard the illegal byte sequence and enter a replacement char. + utf8_decode_error_action_replace +}; + + + +/*! \brief Error callback type. + +\param state The state-storage structure. +\param error The error type. +\param[out] newch If utf8_decode_error_action_replace is returned, then set this to the value of + the character you wish to replace with (\c 0xFFFD is recommended). +\returns A value specifying what action to undertake as a result of the callback. + +This callback determines the action of the UTF-8 stateful decoder on encountering an illegal byte +sequence. It can choose to abort the conversion, skip the illegal sequence, or replace the illegal +sequence with an arbitrary character. + +*/ +typedef enum utf8_decode_error_action(*utf8_decode_error_callback)( + const struct utf8_decode_state* state, enum utf8_decode_error error, wchar_t* newch); + + + +/*! \brief State structure used to decode UTF-8 into Unicode. + +This structure is used to decode arbitrary chunks of UTF-8 data into Unicode. It can deal with +partial data streams (even if they are cut-off mid-character). + +Before calling utf8_decoder, you must set up the object appropriately. The first step is to use +\a memset to initialise everything to 0. 
Then you need to fill out the read and write pointers, and +possibly set up the error callback. + +To use it, you set \a rd to point to your input data and \a rd_remain to the amount you have. If +\a rd_remain is negative, the input data is assumed to be null-terminated; otherwise, it is taken +as the number of bytes remaining at the input. These are updated after each call, so simply check +if \a rd_remain is 0 (or \a *rd is 0 in the case of a null-terminated string). + +You must also set \a wr (pointer to destination buffer) and \a wr_size (number of characters that +can be written there), and \a written is set for you (it is the number of characters written per +call but excluding the terminating NUL). This implies that the buffer must have space for at least +two characters. You can change \a wr and \a wr_size at any time, but if you leave them the same the +data will be overwritten on each call. + +If you wish to do error recovery, set \a error_callback and possibly \a data. + +You can examine the \a line and \a col variables to get the line / column of the input data at which +the decoder is currently operating. \a char_offset and \a byte_offset represent the offset, in +complete characters or bytes, from the start of the stream. With the exception of \a byte_offset, +these variables aren't perfect, as they can be affected by errors and limitations (only 0x0A and +0x2028 are recognised as line end chars, and the effect of tabs is ignored). + +*/ +struct utf8_decode_state { + /// \c false if we are part-way through a multi-byte character. + bool complete; + + /// Data to read (current read position). + const char* rd; + + /// Number of bytes remaining (current). + int rd_remain; + + /// Internal state; initialise to 0, don't change. + int state; + + /// Error callback (may be 0). + utf8_decode_error_callback error_callback; + + /// Pointer to output buffer. + wchar_t* wr; + + /// Number of characters that can be written. 
+ size_t wr_size; + + /// Number of characters written on last call. + size_t written; + + /// Arbitrary data pointer for \a error_callback. + void* data; + + /// Current line (starting from 0). + int line; + + /// Current column (starting from 0). + int col; + + /// Character offset from start of data (starting from 0). + int char_offset; + + /// Byte offset from start of data (starting from 0). + int byte_offset; + + /// Don't use this. + wchar_t statech; + /// Don't use this. + wchar_t minch; +}; + + + +/*! \brief Decode an arbitrary chunk of a UTF-8 byte stream. + +\param state The state-storage structure. +\retval state on success. +\retval 0 on error (see \a errno). + +This function is used to do multi-pass decoding of arbitrary UTF-8 byte streams. Each call will +update \a state.rd, \a state.rd_remain and \a state.written. \a state.complete is \c true if, on consumption +of all the data, we are not inside a multibyte character. + +Should an error occur, \a state.error_callback is called (if it is not 0). If it is 0, or it returns +utf8_decode_error_action_abort, then the conversion will be aborted and the object set into +an error state. \a errno will be set to \c EILSEQ. Once the object is in an error state, there is +no way to recover short of completely clearing it and starting with fresh data. Continuing to call +this function with an invalid object will result in \c EINVAL. + +*/ +struct utf8_decode_state* utf8_decoder(struct utf8_decode_state* state); + + + +/// Standard error callback: use replacement char 0xFFFD. +enum utf8_decode_error_action utf8_decode_error_callback_replace( + const struct utf8_decode_state* ctx, enum utf8_decode_error error, wchar_t* newch); + +/// Standard error callback: skip invalid chars. 
+enum utf8_decode_error_action utf8_decode_error_callback_skip( + const struct utf8_decode_state* ctx, enum utf8_decode_error error, wchar_t* newch); + + + +/*!@}*/ + +/* options for text editors +kate: replace-trailing-space-save true; space-indent true; tab-width 4; +*/ diff --git a/src/libutf8/encode.c b/src/libutf8/encode.c new file mode 100644 index 0000000..35930af --- /dev/null +++ b/src/libutf8/encode.c @@ -0,0 +1,141 @@ +/* libutf8/src/lib/encode.c + * + * (c)2006, Laurence Withers. Released under the GNU GPL. See file + * COPYING for more information / terms of license. +*/ + +char* utf8_encode_char(char* dest, size_t amt, wchar_t ch) +{ + if(!dest || !amt) { + errno = EINVAL; + return 0; + } + if(!utf8_isvalid(ch)) { + errno = EILSEQ; + return 0; + } + + if(ch < 0x80) { + *dest++ = ch; + + } else if(ch < 0x800) { + if(amt < 2) { + errno = ENOMEM; + return 0; + } + *dest++ = 0xC0 | ((ch >> 6) & 0x1F); + *dest++ = 0x80 | (ch & 0x3F); + + } else if(ch < 0x10000) { + if(amt < 3) { + errno = ENOMEM; + return 0; + } + *dest++ = 0xE0 | ((ch >> 12) & 0xF); + *dest++ = 0x80 | ((ch >> 6) & 0x3F); + *dest++ = 0x80 | (ch & 0x3F); + + } else if(ch < 0x200000) { + if(amt < 4) { + errno = ENOMEM; + return 0; + } + *dest++ = 0xF0 | ((ch >> 18) & 0x7); + *dest++ = 0x80 | ((ch >> 12) & 0x3F); + *dest++ = 0x80 | ((ch >> 6) & 0x3F); + *dest++ = 0x80 | (ch & 0x3F); + + } else if(ch < 0x4000000) { + if(amt < 5) { + errno = ENOMEM; + return 0; + } + *dest++ = 0xF8 | ((ch >> 24) & 0x3); + *dest++ = 0x80 | ((ch >> 18) & 0x3F); + *dest++ = 0x80 | ((ch >> 12) & 0x3F); + *dest++ = 0x80 | ((ch >> 6) & 0x3F); + *dest++ = 0x80 | (ch & 0x3F); + + } else { + if(amt < 6) { + errno = ENOMEM; + return 0; + } + *dest++ = 0xFC | ((ch >> 30) & 0x1); + *dest++ = 0x80 | ((ch >> 24) & 0x3F); + *dest++ = 0x80 | ((ch >> 18) & 0x3F); + *dest++ = 0x80 | ((ch >> 12) & 0x3F); + *dest++ = 0x80 | ((ch >> 6) & 0x3F); + *dest++ = 0x80 | (ch & 0x3F); + + } + + return dest; +} + + + +char* 
utf8_encode_char_force(char* dest, size_t amt, wchar_t ch, wchar_t ilseq) +{ + if(!utf8_isvalid(ilseq)) { + errno = EILSEQ; + return 0; + } + + return utf8_encode_char(dest, amt, utf8_isvalid(ch) ? ch : ilseq); +} + + + +char* utf8_encode(char* dest, size_t amt, const wchar_t* src) +{ + return utf8_encode2(dest, amt, 0, src, -1); +} + + + +char* utf8_encode2(char* dest, size_t amt, size_t* written, const wchar_t* src, size_t inamt) +{ + struct utf8_encode_state ctx; + memset(&ctx, 0, sizeof(ctx)); + ctx.rd = src; + ctx.rd_remain = inamt; + ctx.wr = dest; + ctx.wr_size = amt; + + if(!utf8_encoder(&ctx)) return 0; + if(ctx.rd_remain > 0 || (ctx.rd_remain < 0 && *ctx.rd)) { + errno = ENOMEM; + return 0; + } + if(written) *written = ctx.written; + return dest; +} + + + +char* utf8_encode_force(char* dest, size_t amt, const wchar_t* src) +{ + return utf8_encode_force2(dest, amt, 0, src, -1); +} + + + +char* utf8_encode_force2(char* dest, size_t amt, size_t* written, const wchar_t* src, size_t inamt) +{ + struct utf8_encode_state ctx; + memset(&ctx, 0, sizeof(ctx)); + ctx.rd = src; + ctx.rd_remain = inamt; + ctx.wr = dest; + ctx.wr_size = amt; + ctx.error_callback = utf8_encode_error_callback_replace; + + if(!utf8_encoder(&ctx)) return 0; + if(written) *written = ctx.written; + return dest; +} + +/* options for text editors +kate: replace-trailing-space-save true; space-indent true; tab-width 4; +*/ diff --git a/src/libutf8/encode.h b/src/libutf8/encode.h new file mode 100644 index 0000000..136358f --- /dev/null +++ b/src/libutf8/encode.h @@ -0,0 +1,144 @@ +/* libutf8/src/lib/encode.h + * + * (c)2006, Laurence Withers. Released under the GNU GPL. See file + * COPYING for more information / terms of license. +*/ + +/*! \defgroup encode UTF-8 encoding routines. + +The functions in this module allow encoding of UTF-8 characters. 
Errors are reported through +\c errno, with the following errors being of particular interest: + +\li \c EINVAL - invalid argument to function +\li \c EILSEQ - illegal source character (see utf8_isvalid()) +\li \c ENOMEM - not enough space in destination buffer + +*/ +/*!@{*/ + + + +/*! \brief Encode a single character into UTF-8. + +\param dest The destination buffer. +\param amt Number of bytes in destination buffer. +\param ch Character to encode. +\returns Pointer to next byte of buffer to use. +\retval 0 on error (see \c errno). + +This function will encode a single character into UTF-8. It returns a pointer to the end of the +character (i.e. the next position in the buffer you want to write to). + +On error, it sets \c errno (to \c EINVAL, if \a dest is null or \a amt is less than 1, \c EILSEQ +if \a ch is not valid; or \c ENOMEM if the result would not fit into +\a amt bytes) and returns 0. + +*/ +char* utf8_encode_char(char* dest, size_t amt, wchar_t ch); + + + +/*! \brief Encode a single character into UTF-8, forcing replacement of invalid characters. + +\param dest The destination buffer. +\param amt Number of bytes in destination buffer. +\param ch Character to encode. +\param ilseq If \a ch is not a legal character, then this is encoded instead. +\returns Pointer to next byte of buffer to use. +\retval 0 on error (see \c errno). + +This function will encode a single character into UTF-8. It returns a pointer to the end of the +character (i.e. the next position in the buffer you want to write to). If the source character \a ch +is not a valid code point, it will instead encode the character \a ilseq. + +On error, it sets \c errno (to \c EINVAL, if \a dest is null or \a amt is less than 1; \c EILSEQ +if \a ilseq is not valid; or \c ENOMEM if the result would not fit into +\a amt bytes) and returns 0. + +*/ +char* utf8_encode_char_force(char* dest, size_t amt, wchar_t ch, wchar_t ilseq); + + + +/*! \brief Encode a null-terminated string into UTF-8. 
+ +\param dest The destination buffer. +\param amt Number of bytes in the destination buffer. +\param src Null-terminated source string. +\returns Pointer to destination buffer. +\retval 0 on error (see \c errno). + +This function encodes a null-terminated Unicode string into the destination buffer. It returns a +pointer to the destination buffer on success, and 0 on error. If there is not enough space in the +buffer, or an illegal character is encountered somewhere in the sequence, it will fail. + +*/ +char* utf8_encode(char* dest, size_t amt, const wchar_t* src); + + + +/*! \brief Encode a fixed-size string into UTF-8. + +\param dest The destination buffer. +\param amt Number of bytes in the destination buffer. +\param written Set to number of bytes written on success (excluding NUL). +\param src Pointer to source string. +\param inamt Number of characters to encode. +\returns Pointer to destination buffer. +\retval 0 on error (see \c errno). + +This function encodes a Unicode string (possibly containing ASCII NUL) into the destination buffer. +It returns a pointer to the destination buffer on success, and 0 on error. If there is not enough +space in the buffer, or an illegal character is encountered somewhere in the sequence, it will fail. +The destination will be null-terminated. + +*/ +char* utf8_encode2(char* dest, size_t amt, size_t* written, const wchar_t* src, size_t inamt); + + + +/*! \brief Encode a null-terminated string into UTF-8, ignoring errors. + +\param dest The destination buffer. +\param amt Number of bytes in the destination buffer. +\param src Null-terminated source string. +\returns Pointer to destination buffer. +\returns 0 if arguments are invalid. + +This function will encode a null-terminated Unicode string into the destination buffer, making a +best-effort in the case of failures. If there is not enough memory, the destination string will be +truncated (but still null-terminated). 
If an illegal source character is encountered, it is replaced +with the Unicode replacement character U+FFFD. The function can only fail if one of the arguments is +invalid. + +*/ +char* utf8_encode_force(char* dest, size_t amt, const wchar_t* src); + + + +/*! \brief Encode a fixed-size string into UTF-8, ignoring errors. + +\param dest The destination buffer. +\param amt Number of bytes in the destination buffer. +\param written Set to number of bytes written on success (excluding NUL). +\param src Pointer to source string. +\param inamt Number of characters to encode. +\returns Pointer to destination buffer. +\returns 0 if arguments are invalid. + +This function will encode a Unicode string (possibly containing ASCII NUL) into the destination +buffer, making a best-effort in the case of failures. If there is not enough memory, the destination +string will be truncated (but still null-terminated). If an illegal source character is encountered, +it is replaced with the Unicode replacement character U+FFFD. The function can only fail if one of +the arguments is invalid. + +*/ +char* utf8_encode_force2(char* dest, size_t amt, size_t* written, const wchar_t* src, size_t inamt); + + + +/*!@}*/ + +/* options for text editors +kate: replace-trailing-space-save true; space-indent true; tab-width 4; +*/ diff --git a/src/libutf8/encode_state.c b/src/libutf8/encode_state.c new file mode 100644 index 0000000..1838285 --- /dev/null +++ b/src/libutf8/encode_state.c @@ -0,0 +1,88 @@ +/* libutf8/src/lib/encode_state.c + * + * (c)2006, Laurence Withers. Released under the GNU GPL. See file + * COPYING for more information / terms of license. 
+*/ + +struct utf8_encode_state* utf8_encoder(struct utf8_encode_state* state) +{ + char* wr, * ret, * endp; + wchar_t ch; + enum utf8_encode_error_action error_action; + bool reencoding; + + if(!state || !state->rd || !state->wr || state->wr_size < 7) { + errno = EINVAL; + return 0; + } + wr = state->wr; // state is only dereferenced once it is known to be non-null + endp = wr + state->wr_size - 1; // last byte is kept back for the terminating NUL + state->written = 0; + while(state->rd_remain) { + ch = *state->rd; + if(!ch && state->rd_remain < 0) break; + + reencoding = false; + reencode: + ret = utf8_encode_char(wr, endp - wr, ch); + if(!ret) { + if(errno == ENOMEM) break; + if(!state->error_callback || reencoding) { + errno = EILSEQ; + return 0; + } + error_action = state->error_callback(state, &ch); + switch(error_action) { + case utf8_encode_error_action_abort: + errno = EILSEQ; + return 0; + + case utf8_encode_error_action_replace: + reencoding = true; + goto reencode; + + case utf8_encode_error_action_skip: + ret = wr; + break; + } + } + if(state->rd_remain > 0) state->rd_remain--; + ++state->rd; + ++state->char_offset; + if(ch == 0x0A || ch == 0x2028) { + ++state->line; + state->col = 0; + } else { + ++state->col; + } + state->written += ret - wr; + wr = ret; + if(wr == endp) break; + } + *wr = 0; + return state; +} + + + +enum utf8_encode_error_action utf8_encode_error_callback_replace( + const struct utf8_encode_state* state, wchar_t* newch) +{ + (void)state; + *newch = 0xFFFD; + return utf8_encode_error_action_replace; +} + + + +enum utf8_encode_error_action utf8_encode_error_callback_skip( + const struct utf8_encode_state* state, wchar_t* newch) +{ + (void)state; + (void)newch; + return utf8_encode_error_action_skip; +} + +/* options for text editors +kate: replace-trailing-space-save true; space-indent true; tab-width 4; +*/ diff --git a/src/libutf8/encode_state.h b/src/libutf8/encode_state.h new file mode 100644 index 0000000..88958cf --- /dev/null +++ b/src/libutf8/encode_state.h @@ -0,0 +1,158 @@ +/* libutf8/src/lib/encode_state.h + * + * (c)2006, Laurence 
Withers. Released under the GNU GPL. See file + * COPYING for more information / terms of license. +*/ + +/*! \defgroup encode_state UTF-8 stateful encoder. + +This UTF-8 encoder uses a structure to maintain state information between calls. This means that +you can feed it a stream of data as it comes in without needing to store the entire source in a +buffer. + +Errors (i.e. illegal source chars; see utf8_isvalid()) are handled by providing a callback function +(several of which are provided by the library). The callback function has the option of aborting +the conversion, substituting a replacement character, or simply skipping the illegal source +character. + +*/ +/*!@{*/ + + + +/*! \brief Action to be taken after error callback. + +These are the possible actions that can be undertaken after a stateful encoding operation has +encountered an error (illegal source char). These actions are specified by the error callback +function's return value. + +*/ +enum utf8_encode_error_action { + /// Abort the conversion, returning EILSEQ. + utf8_encode_error_action_abort, + + /// Skip the illegal byte sequence. + utf8_encode_error_action_skip, + + /// Discard the illegal byte sequence and enter a replacement char. + utf8_encode_error_action_replace +}; + + + + +/*! \brief Error callback type. + +\param state The encoder state information. +\param[out] newch If \a utf8_encode_error_action_replace is returned, this is set to the + character that should be substituted instead of the illegal source character. + +This function is called whenever an error occurs. It can examine \a state (and specifically +\a *state.rd) to determine the illegal source character. It can choose to skip the character, replace +it with something else, or abort the conversion entirely. + +*/ +typedef enum utf8_encode_error_action (*utf8_encode_error_callback)( + const struct utf8_encode_state* state, wchar_t* newch); + +/// Standard error callback: use replacement char 0xFFFD. 
+enum utf8_encode_error_action utf8_encode_error_callback_replace( + const struct utf8_encode_state* state, wchar_t* newch); + +/// Standard error callback: skip invalid chars. +enum utf8_encode_error_action utf8_encode_error_callback_skip( + const struct utf8_encode_state* state, wchar_t* newch); + + + +/*! \brief State structure used to encode Unicode into UTF-8. + +This structure is used to encode an arbitrary Unicode string into UTF-8. To set it up, first call +\a memset to clear the structure to zero. You will then +want to set \a rd to point to your input string, with \a rd_remain the number of characters to encode +(you can set it to a negative number if \a rd is null-terminated and you want to encode the whole +thing). You will also want to tell it where to write to (\a wr) and how much space there is in that +buffer (\a wr_size). + +To deal with errors (illegal input chars), you can provide a callback function \a error_callback. +An arbitrary \a data pointer is provided in case you wish to associate some object with the encode +operation. Passing a null pointer for \a error_callback is a valid way of indicating you do not +wish to attempt to correct errors. + +You can examine the \a line and \a col variables to get the line / column of the input data at which +the encoder is currently operating. These variables aren't perfect, as they can be +affected by errors and limitations (only 0x0A and 0x2028 are recognised as line end chars, and the +effect of tabs is ignored). \a char_offset represents the offset, in complete characters, from the +start of the stream, and should always be accurate. + +*/ +struct utf8_encode_state { + /// Current read position. + const wchar_t* rd; + + /// Number of chars remaining (-ve means to scan for null char). + int rd_remain; + + /// Callback function used to handle illegal source characters. + utf8_encode_error_callback error_callback; + + /// Output buffer. + char* wr; + + /// Output buffer size. 
+ size_t wr_size; + + /// Number of bytes written during last call. + size_t written; + + /// Arbitrary pointer (useful for \a error_callback). + void* data; + + /// Current line (starting from 0). + int line; + + /// Current column (starting from 0). + int col; + + /// Character offset from start of data (starting from 0). + int char_offset; +}; + + + +/*! \brief Encode an arbitrary Unicode string. + +\param state The encoder state information. +\retval state on success. +\retval 0 on error (see \c errno). + +This function is used to encode some arbitrary Unicode string into UTF-8. It uses a state-storage +structure which allows you to perform the encoding in multiple passes (e.g. if you are encoding +an arbitrary string and outputting it, you will want to use a fixed size buffer and this might +be smaller than required). + +In each pass of the function, \a rd and \a rd_remain will be updated to record the current reading +position and the number of characters left to encode. If the function completes this pass, \a rd_remain +will be zero (but if you are converting a null-terminated string, you will need to check for \a *rd +to be zero instead). + +After each call, \a wr will be unchanged but \a written will contain the number of bytes written +(excluding a terminating null, which is always written). If you do not want to overwrite this data +on the next call, you will have to update \a wr and \a wr_size. + +If \a state is null, or not filled out properly (no source data or destination buffer not at least 7 +bytes large), then no conversion will be performed and \a EINVAL will be stored in \a errno. If an +illegal source character is encountered, and the error callback is 0, aborts the process or tries +to replace the char with another illegal code point, then \a EILSEQ will be stored in \a errno. On +error, 0 will be returned. 
+ +*/ +struct utf8_encode_state* utf8_encoder(struct utf8_encode_state* state); + + + +/*!@}*/ + +/* options for text editors +kate: replace-trailing-space-save true; space-indent true; tab-width 4; +*/ diff --git a/src/libutf8/pkgconf.in b/src/libutf8/pkgconf.in new file mode 100644 index 0000000..28a0741 --- /dev/null +++ b/src/libutf8/pkgconf.in @@ -0,0 +1,21 @@ +# libutf8/src/lib/clib/pkgconf.in +# +# Metadata file for pkg-config +# ( http://www.freedesktop.org/software/pkgconfig/ ) +# +# (c)2006, Laurence Withers, . +# Released under the GNU GPLv2. See file COPYING or +# http://www.gnu.org/copyleft/gpl.html for details. +# + +# Name, description +Name: libutf8 +Description: Library for encoding and decoding UTF-8 +Version: @VERSION@ + +# Requirements +Requires: + +# Compilation information +Libs: -L@LIBDIR@ -lutf8 +Cflags: -I@INCLUDEDIR@ diff --git a/src/libutf8/soversion b/src/libutf8/soversion new file mode 100644 index 0000000..c539b11 --- /dev/null +++ b/src/libutf8/soversion @@ -0,0 +1,17 @@ +# libutf8/src/libutf8/soversion +# +# (c)2006, Laurence Withers, . +# Released under the GNU GPLv2. See file COPYING or +# http://www.gnu.org/copyleft/gpl.html for details. +# + + + +# SOMAJOR and SOMINOR are included in the library's soname. They need to +# be bumped on a binary-incompatible release. They are both single +# integers. +SOMAJOR=0 +SOMINOR=0 + +# SOMICRO is bumped every time there is a binary-compatible release. 
+SOMICRO=0 diff --git a/src/tests/.params b/src/tests/.params new file mode 100644 index 0000000..7e00419 --- /dev/null +++ b/src/tests/.params @@ -0,0 +1 @@ +c tests tests libutf8 diff --git a/src/tests/build.default b/src/tests/build.default new file mode 100644 index 0000000..2d979e2 --- /dev/null +++ b/src/tests/build.default @@ -0,0 +1,3 @@ +source src/tests/build.tests +# kate: replace-trailing-space-save true; space-indent true; tab-width 4; +# vim: expandtab:ts=4:sw=4 diff --git a/src/tests/build.tests b/src/tests/build.tests new file mode 100644 index 0000000..f484dab --- /dev/null +++ b/src/tests/build.tests @@ -0,0 +1,43 @@ +# These are external variables, and shouldn't clash with anything else +# tests_BUILT +# + +build_target libutf8 || return 1 + +if [ -z ${tests_BUILT} ] +then + LIBS="${libutf8} " + EXTRAS="" + + echo "Building test programs..." + do_cmd mkdir -p obj/tests || return 1 + + for SRC in src/tests/*.c + do + TEST="obj/tests/$(basename ${SRC} | sed -e 's,.c$,,')" + MODIFIED=0 + for file in ${LIBS} ${SRC} src/tests/build.tests + do + if [ ${file} -nt ${TEST} ] + then + MODIFIED=1 + break + fi + done + + if [ ${MODIFIED} -ne 0 ] + then + do_cmd ${CC} -Iobj ${CFLAGS} -o ${TEST} ${SRC} ${LIBS} ${EXTRAS} || return 1 + print_success "Built ${TEST}" + else + print_success "${TEST} is up to date" + fi + done + + print_success "All tests built" + + tests_BUILT=1 +fi + +# kate: replace-trailing-space-save true; space-indent true; tab-width 4; +# vim: expandtab:ts=4:sw=4 diff --git a/src/tests/decode.c b/src/tests/decode.c new file mode 100644 index 0000000..436e1d6 --- /dev/null +++ b/src/tests/decode.c @@ -0,0 +1,107 @@ +/* libutf8/src/tests/decode.c + * + * (c)2006, Laurence Withers. Released under the GNU GPL. See file + * COPYING for more information / terms of license. 
+*/ + +#include "utf8.h" + +#include <stdio.h> +#include <string.h> + + + +void writeout(const wchar_t* x, int amt) +{ + fwrite(x, sizeof(wchar_t), amt, stdout); +} + + + +enum utf8_decode_error_action error_callback( + const struct utf8_decode_state* ctx, enum utf8_decode_error error, wchar_t* newch) +{ + fprintf(stderr, "Line %d, col %d (char %d, byte %d): ", + ctx->line + 1, ctx->col + 1, ctx->char_offset, ctx->byte_offset); + switch(error) { + case utf8_decode_error_lone_cchar: + fprintf(stderr, "a lone continuation char was encountered.\n"); + break; + + case utf8_decode_error_not_cchar: + fprintf(stderr, "a continuation char was expected, but not encountered.\n"); + break; + + case utf8_decode_error_not_schar: + fprintf(stderr, "an invalid character was encountered (not start char).\n"); + break; + + case utf8_decode_error_overlong: + fprintf(stderr, "an overlong character sequence was encountered.\n"); + break; + + case utf8_decode_error_illegal_cp: + fprintf(stderr, "an illegal code point was encountered.\n"); + break; + } + + *newch = 0xFFFD; + return utf8_decode_error_action_replace; +} + + + +int main(int argc, char* argv[]) +{ + char inbuf[1024]; + wchar_t outbuf[1024]; + struct utf8_decode_state ctx; + + if(argc == 2 && !strcmp(argv[1], "--print-summary")) { + printf("Decodes UTF-8 on stdin to UCS-4 on stdout.\n"); + return 0; + } + + if(argc != 1) { + fprintf(stderr, "No parameters expected. 
This program decodes UTF-8 presented on stdin\n" + "and transforms it to UCS-4 on stdout.\n"); + return 1; + } + + // set up ctx structure + memset(&ctx, 0, sizeof(ctx)); + ctx.wr = outbuf; + ctx.wr_size = sizeof(outbuf) / sizeof(wchar_t); + ctx.error_callback = error_callback; + + // loop over input until EOF or a read error (feof alone never becomes true on error) + while(!feof(stdin) && !ferror(stdin)) { + // read input + ctx.rd_remain = fread(inbuf, 1, sizeof(inbuf), stdin); + ctx.rd = inbuf; + + // decode it + while(ctx.rd_remain) { + if(!utf8_decoder(&ctx)) { + perror("utf8_decoder"); + fprintf(stderr, "(at line %d, col %d, char %d, byte %d)\n", + ctx.line + 1, ctx.col + 1, ctx.char_offset, ctx.byte_offset); + return 1; + } + + // write output + writeout(outbuf, ctx.written); + } + } + + if(!ctx.complete) { + fprintf(stderr, "Input did not end on a character boundary.\n"); + } + + return 0; +} + + +/* options for text editors +kate: replace-trailing-space-save true; space-indent true; tab-width 4; +*/ diff --git a/src/tests/random.c b/src/tests/random.c new file mode 100644 index 0000000..d341284 --- /dev/null +++ b/src/tests/random.c @@ -0,0 +1,165 @@ +/* libutf8/src/tests/random.c + * + * (c)2006, Laurence Withers. Released under the GNU GPL. See file + * COPYING for more information / terms of license. 
+*/ + +#include "utf8.h" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <fcntl.h> +#include <unistd.h> + + + +void make_rand(wchar_t* buf, int ch) +{ + int fd = open("/dev/urandom", O_RDONLY); + if(fd < 0) { + perror("open(\"/dev/urandom\")"); + exit(1); + } + ch *= sizeof(wchar_t); + if(read(fd, (char*)buf, ch) != ch) { + perror("read(\"/dev/urandom\")"); + exit(1); + } + close(fd); + + ch /= sizeof(wchar_t); + while(ch--) { + buf[ch] &= 0x7FFFFFFF; + } +} + + + +int do_encode(char* dest, size_t size, wchar_t* src, size_t amt) +{ + struct utf8_encode_state ctx; + memset(&ctx, 0, sizeof(ctx)); + + ctx.rd = src; + ctx.rd_remain = amt; + ctx.wr = dest; + ctx.wr_size = 20; + + while(ctx.rd_remain) { + if(!utf8_encoder(&ctx)) { + perror("utf8_encoder"); + exit(1); + } + + ctx.wr += ctx.written; + if(ctx.wr + ctx.wr_size > dest + size) { + fprintf(stderr, "do_encode: we're going to run out of memory\n"); + exit(1); + } + } + + return ctx.wr - dest; +} + + + +int MIN(int x, int y) +{ + return (x < y) ? x : y; +} + + + +void do_decode_easy(wchar_t* dest, size_t size, const char* src, size_t amt) +{ + struct utf8_decode_state ctx; + memset(&ctx, 0, sizeof(ctx)); + + ctx.rd = src; + ctx.rd_remain = amt; + ctx.wr = dest; + ctx.wr_size = size; + + if(!utf8_decoder(&ctx)) { + perror("[easy] utf8_decoder"); + exit(1); + } + + if(ctx.rd_remain) { + fprintf(stderr, "do_decode_easy: %d bytes left in buffer\n", ctx.rd_remain); + exit(1); + } + + if(!ctx.complete) { + fprintf(stderr, "do_decode_easy: incomplete character at end of data\n"); + exit(1); + } +} + + + +void do_decode(wchar_t* dest, size_t size, const char* src, size_t amt) +{ + struct utf8_decode_state ctx; + memset(&ctx, 0, sizeof(ctx)); + + ctx.rd = src; + ctx.rd_remain = MIN(20, amt); + amt -= ctx.rd_remain; + ctx.wr = dest; + ctx.wr_size = 20; + + while(ctx.rd_remain) { + if(!utf8_decoder(&ctx)) { + perror("utf8_decoder"); + exit(1); + } + + if(!ctx.rd_remain) { + ctx.rd_remain = MIN(20, amt); + amt -= ctx.rd_remain; + } + + ctx.wr += 
ctx.written; + if(ctx.wr + ctx.wr_size > dest + size) { + ctx.wr_size = dest + size - ctx.wr; // clamp to the space actually left in dest + } + } +} + + + +int main(int argc, char* argv[]) +{ + wchar_t wbuf[1024], wbuf2[1025]; + char cbuf[8192]; + int amt; + + if(argc == 2 && !strcmp(argv[1], "--print-summary")) { + printf("Encodes and decodes random well-formed strings.\n"); + return 0; + } + + make_rand(wbuf, 1024); + amt = do_encode(cbuf, 8192, wbuf, 1024); + do_decode_easy(wbuf2, 1025, cbuf, amt); + do_decode(wbuf2, 1025, cbuf, amt); + + if(memcmp(wbuf, wbuf2, 1024 * sizeof(wchar_t))) { + fprintf(stderr, "Output doesn't match input!\n"); + for(amt = 0; amt < 1024; ++amt) { + if(wbuf[amt] != wbuf2[amt]) + fprintf(stderr, "%4d: %08X != %08X\n", amt, wbuf[amt], wbuf2[amt]); + } + return 1; + } + + printf("Success.\n"); + return 0; +} + + +/* options for text editors +kate: replace-trailing-space-save true; space-indent true; tab-width 4; +*/ diff --git a/src/tests/template b/src/tests/template new file mode 100644 index 0000000..7e835b8 --- /dev/null +++ b/src/tests/template @@ -0,0 +1,35 @@ +/* libutf8/src/tests/???.c + * + * (c)2006, Laurence Withers, . + * Released under the GNU GPLv2. See file COPYING or + * http://www.gnu.org/copyleft/gpl.html for details. +*/ + +#include "utf8.h" + +#include + + + +int main(int argc, char* argv[]) +{ + if(argc == 2 && !strcmp(argv[1], "--print-summary")) { + printf("One line summary.\n"); + return 0; + } + + if(argc == 1) { + // empty argument list + } + + int ret = 0; + + // TODO + + return ret; +} + +/* options for text editors +kate: replace-trailing-space-save true; space-indent true; tab-width 4; +vim: expandtab:ts=4:sw=4 +*/