Import library from old svn repository.

2006-07-31 15:34:21 +01:00 · 2006-07-31 15:34:21 +01:00 · ea1c53e43f
parent f9c0e57470
commit ea1c53e43f
36 changed files with 2248 additions and 1 deletions
--- a/6
+++ b/6
@ -10,5 +10,9 @@ Really Quick Instructions
 To build: ./make.sh
 To install: ./make.sh install
    (you might want to set PREFIX, by default it's /usr/local)
+Documentation is automatically built using doxygen.

-@TODO@
+Project Homepage
+----------------
+
+http://www.lwithers.me.uk/projects/libutf8/
--- a/src/docs/.params
+++ b/src/docs/.params
@ -0,0 +1 @@
+doxygen docs docs
--- a/src/docs/Doxyfile.in
+++ b/src/docs/Doxyfile.in
@ -0,0 +1,146 @@
+# libutf8/src/docs/Doxyfile.in
+#
+#  (c)2006, Laurence Withers, <l@lwithers.me.uk>.
+#  Released under the GNU GPLv2. See file COPYING or
+#  http://www.gnu.org/copyleft/gpl.html for details.
+#
+
+PROJECT_NAME           = libutf8
+OUTPUT_DIRECTORY       =
+CREATE_SUBDIRS         = NO
+OUTPUT_LANGUAGE        = English
+USE_WINDOWS_ENCODING   = NO
+BRIEF_MEMBER_DESC      = YES
+REPEAT_BRIEF           = YES
+ABBREVIATE_BRIEF       =
+ALWAYS_DETAILED_SEC    = NO
+INLINE_INHERITED_MEMB  = YES
+FULL_PATH_NAMES        = NO
+STRIP_FROM_PATH        =
+STRIP_FROM_INC_PATH    =
+SHORT_NAMES            = NO
+JAVADOC_AUTOBRIEF      = NO
+MULTILINE_CPP_IS_BRIEF = YES
+DETAILS_AT_TOP         = YES
+INHERIT_DOCS           = YES
+DISTRIBUTE_GROUP_DOC   = NO
+TAB_SIZE               = 4
+ALIASES                =
+OPTIMIZE_OUTPUT_FOR_C  = NO
+OPTIMIZE_OUTPUT_JAVA   = NO
+SUBGROUPING            = YES
+EXTRACT_ALL            = NO
+EXTRACT_PRIVATE        = NO
+EXTRACT_STATIC         = NO
+EXTRACT_LOCAL_CLASSES  = NO
+EXTRACT_LOCAL_METHODS  = NO
+HIDE_UNDOC_MEMBERS     = NO
+HIDE_UNDOC_CLASSES     = NO
+HIDE_FRIEND_COMPOUNDS  = YES
+HIDE_IN_BODY_DOCS      = NO
+INTERNAL_DOCS          = NO
+CASE_SENSE_NAMES       = YES
+HIDE_SCOPE_NAMES       = NO
+SHOW_INCLUDE_FILES     = NO
+INLINE_INFO            = YES
+SORT_MEMBER_DOCS       = YES
+SORT_BRIEF_DOCS        = NO
+SORT_BY_SCOPE_NAME     = NO
+GENERATE_TODOLIST      = YES
+GENERATE_TESTLIST      = YES
+GENERATE_BUGLIST       = YES
+GENERATE_DEPRECATEDLIST= YES
+ENABLED_SECTIONS       =
+MAX_INITIALIZER_LINES  = 30
+SHOW_USED_FILES        = NO
+SHOW_DIRECTORIES       = NO
+FILE_VERSION_FILTER    =
+QUIET                  = YES
+WARNINGS               = YES
+WARN_IF_UNDOCUMENTED   = YES
+WARN_IF_DOC_ERROR      = YES
+WARN_NO_PARAMDOC       = YES
+WARN_FORMAT            = "$file:$line: $text"
+WARN_LOGFILE           =
+FILE_PATTERNS          =
+RECURSIVE              = NO
+EXCLUDE                =
+EXCLUDE_SYMLINKS       = NO
+EXCLUDE_PATTERNS       =
+EXAMPLE_PATH           =
+EXAMPLE_PATTERNS       =
+EXAMPLE_RECURSIVE      = NO
+IMAGE_PATH             = src/docs
+INPUT_FILTER           =
+FILTER_PATTERNS        =
+FILTER_SOURCE_FILES    = NO
+SOURCE_BROWSER         = NO
+INLINE_SOURCES         = NO
+STRIP_CODE_COMMENTS    = YES
+REFERENCED_BY_RELATION = YES
+REFERENCES_RELATION    = YES
+VERBATIM_HEADERS       = NO
+ALPHABETICAL_INDEX     = YES
+COLS_IN_ALPHA_INDEX    = 5
+IGNORE_PREFIX          =
+GENERATE_HTML          = YES
+HTML_OUTPUT            = html
+HTML_FILE_EXTENSION    = .html
+HTML_HEADER            =
+HTML_FOOTER            =
+HTML_STYLESHEET        =
+HTML_ALIGN_MEMBERS     = YES
+GENERATE_HTMLHELP      = NO
+CHM_FILE               =
+HHC_LOCATION           =
+GENERATE_CHI           = NO
+BINARY_TOC             = NO
+TOC_EXPAND             = NO
+DISABLE_INDEX          = NO
+ENUM_VALUES_PER_LINE   = 4
+GENERATE_TREEVIEW      = NO
+TREEVIEW_WIDTH         = 250
+GENERATE_LATEX         = NO
+GENERATE_RTF           = NO
+GENERATE_MAN           = NO
+GENERATE_XML           = NO
+GENERATE_AUTOGEN_DEF   = NO
+GENERATE_PERLMOD       = NO
+ENABLE_PREPROCESSING   = YES
+MACRO_EXPANSION        = NO
+EXPAND_ONLY_PREDEF     = NO
+SEARCH_INCLUDES        = YES
+INCLUDE_PATH           =
+INCLUDE_FILE_PATTERNS  =
+PREDEFINED             = DOXYGEN
+EXPAND_AS_DEFINED      =
+SKIP_FUNCTION_MACROS   = YES
+TAGFILES               =
+GENERATE_TAGFILE       =
+ALLEXTERNALS           = NO
+EXTERNAL_GROUPS        = YES
+PERL_PATH              = /usr/bin/perl
+CLASS_DIAGRAMS         = YES
+HIDE_UNDOC_RELATIONS   = YES
+HAVE_DOT               = YES
+CLASS_GRAPH            = YES
+COLLABORATION_GRAPH    = YES
+GROUP_GRAPHS           = NO
+UML_LOOK               = NO
+TEMPLATE_RELATIONS     = NO
+INCLUDE_GRAPH          = NO
+INCLUDED_BY_GRAPH      = NO
+CALL_GRAPH             = NO
+GRAPHICAL_HIERARCHY    = YES
+DIRECTORY_GRAPH        = NO
+DOT_IMAGE_FORMAT       = png
+DOT_PATH               =
+DOTFILE_DIRS           =
+MAX_DOT_GRAPH_WIDTH    = 1024
+MAX_DOT_GRAPH_HEIGHT   = 1024
+MAX_DOT_GRAPH_DEPTH    = 0
+DOT_TRANSPARENT        = YES
+DOT_MULTI_TARGETS      = YES
+GENERATE_LEGEND        = YES
+DOT_CLEANUP            = YES
+SEARCHENGINE           = NO
--- a/src/docs/MainPage.dox
+++ b/src/docs/MainPage.dox
@ -0,0 +1,19 @@
+/* libutf8/src/docs/MainPage.dox
+ *
+ *  (c)2006, Laurence Withers, <l@lwithers.me.uk>.
+ *  Released under the GNU GPLv2. See file COPYING or
+ *  http://www.gnu.org/copyleft/gpl.html for details.
+*/
+
+/*! \mainpage
+
+\c libutf8 provides a C API for encoding and decoding UTF-8. It uses the C type \c wchar_t as its
+internal character representation. \c libutf8 is a "safe" decoder &mdash; it will not accept
+overlong byte sequences.
+
+*/
+
+/* options for text editors
+kate: replace-trailing-space-save true; space-indent true; tab-width 4;
+vim: expandtab:ts=4:sw=4
+*/
--- a/src/docs/build.default
+++ b/src/docs/build.default
@ -0,0 +1 @@
+source src/docs/build.docs
--- a/src/docs/build.docs
+++ b/src/docs/build.docs
@ -0,0 +1,43 @@
+# These are external variables, and shouldn't clash with anything else
+#  docs_BUILT
+#
+
+# Add the hand-written .dox pages to the list of documentation inputs, then request the
+# "monolithic" target (presumably so the generated headers named in MONOLITHIC_DOC exist;
+# build_target is defined outside this fragment).
+MONOLITHIC_DOC="${MONOLITHIC_DOC} $(echo src/docs/*.dox)"
+build_target monolithic
+
+# docs_BUILT is set at the end of this fragment, so the body runs at most once per build.
+if [ -z "${docs_BUILT}" ]
+then
+    echo "Building documentation with Doxygen..."
+
+    DOXYFILE=obj/Doxyfile.docs
+
+    # Create the working Doxyfile from the template only if it does not exist yet, then
+    # append the settings that depend on this build (input list and version).
+    if [ ! -e "${DOXYFILE}" ]
+    then
+        do_cmd cp src/docs/Doxyfile.in "${DOXYFILE}" || return 1
+        echo "INPUT = ${MONOLITHIC_DOC}" >> "${DOXYFILE}"
+        echo "PROJECT_NUMBER = ${VERSION}" >> "${DOXYFILE}"
+    fi
+
+    # Rebuild only when some input is newer than the generated index page.
+    # MONOLITHIC_DOC is a space-separated list, so it is deliberately left unquoted here.
+    MODIFIED=0
+    for file in ${MONOLITHIC_DOC}
+    do
+        if [ "${file}" -nt html/index.html ]
+        then
+            MODIFIED=1
+            break
+        fi
+    done
+
+    if [ "${MODIFIED}" -ne 0 ]
+    then
+        do_cmd doxygen "${DOXYFILE}" || return 1
+        print_success "Documentation built"
+    else
+        print_success "Documentation is up to date"
+    fi
+
+    docs_BUILT=1
+fi
+
+# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
+# vim: expandtab:ts=4:sw=4
--- a/src/docs/build.install
+++ b/src/docs/build.install
@ -0,0 +1 @@
+source src/docs/build.install-docs
--- a/src/docs/build.install-docs
+++ b/src/docs/build.install-docs
@ -0,0 +1,21 @@
+# Install the generated HTML documentation plus COPYING and README under DOCSDIR.
+# Any failing step aborts the fragment with status 1.
+build_target docs
+
+# create documentation directories
+echo "Installing documentation into ${DOCSDIR}"
+build_dir_tree "${DOCSDIR}/html" || return 1
+
+# copy across the Doxygen-generated documentation
+for file in html/*
+do
+    install_file "${file}" "${DOCSDIR}/html" 0644 || return 1
+done
+
+# copy across the generic files
+for file in COPYING README
+do
+    install_file "${file}" "${DOCSDIR}" 0644 || return 1
+done
+
+print_success "Documentation installed"
+# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
+# vim: expandtab:ts=4:sw=4
--- a/src/libutf8/.params
+++ b/src/libutf8/.params
@ -0,0 +1 @@
+c lib libutf8 utf8.h
--- a/src/libutf8/BottomHeader.h
+++ b/src/libutf8/BottomHeader.h
@ -0,0 +1,11 @@
+/* libutf8/src/lib/BottomHeader.h
+ *
+ *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
+ *  COPYING for more information / terms of license.
+*/
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/src/libutf8/ForwardDeclare.h
+++ b/src/libutf8/ForwardDeclare.h
@ -0,0 +1,14 @@
+/* libutf8/src/lib/ForwardDeclare.h
+ *
+ *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
+ *  COPYING for more information / terms of license.
+*/
+
+// This file simply contains forward declarations of all libutf8
+// structures, to facilitate header ordering, etc.
+
+// encode_state.h
+struct utf8_encode_state;
+
+// decode_state.h
+struct utf8_decode_state;
--- a/src/libutf8/TopHeader.h
+++ b/src/libutf8/TopHeader.h
@ -0,0 +1,16 @@
+/* libutf8/src/lib/TopHeader.h
+ *
+ *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
+ *  COPYING for more information / terms of license.
+*/
+
+#ifndef HEADER_LIBUTF8
+#define HEADER_LIBUTF8
+
+// standard includes, or includes needed for type declarations
+#include <stdbool.h>
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
--- a/src/libutf8/TopSource.c
+++ b/src/libutf8/TopSource.c
@ -0,0 +1,13 @@
+/* libutf8/src/lib/TopSource.c
+ *
+ *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
+ *  COPYING for more information / terms of license.
+*/
+
+#include "utf8.h"
+
+// Below are all the includes used throughout the library.
+
+#include <errno.h>
+#include <stdint.h>
+#include <string.h>
--- a/src/libutf8/build.default
+++ b/src/libutf8/build.default
@ -0,0 +1 @@
+source src/libutf8/build.lib
--- a/src/libutf8/build.install
+++ b/src/libutf8/build.install
@ -0,0 +1 @@
+source src/libutf8/build.install-lib
--- a/src/libutf8/build.install-lib
+++ b/src/libutf8/build.install-lib
@ -0,0 +1,36 @@
+# Install the shared library, its version symlinks, the public header and the pkg-config
+# file. Relies on variables set by build.lib (libutf8, libutf8_BASE, libutf8_HEADER) and
+# on SOMAJOR/SOMINOR/SOMICRO.
+build_target libutf8
+
+# make paths (this is for Gentoo in particular)
+build_dir_tree "${LIBDIR}" || return 1
+build_dir_tree "${PKGCONFDIR}" || return 1
+build_dir_tree "${INCLUDEDIR}" || return 1
+
+# install library
+echo "Installing libraries into '${LIBDIR}'"
+install_file "${libutf8}" "${LIBDIR}" 0755 || return 1
+BASE="${libutf8_BASE}.so"
+MAJOR="${BASE}.${SOMAJOR}"
+MINOR="${MAJOR}.${SOMINOR}"
+MICRO="${MINOR}.${SOMICRO}"
+# Symlink chain: lib.so -> lib.so.MAJOR -> lib.so.MAJOR.MINOR -> lib.so.MAJOR.MINOR.MICRO
+# NOTE(review): the results of install_symlink and of the pkg-config steps below are not
+# checked, unlike the other install steps -- confirm whether that is intentional.
+install_symlink "${MINOR}" "${MICRO}" "${LIBDIR}"
+install_symlink "${MAJOR}" "${MINOR}" "${LIBDIR}"
+install_symlink "${BASE}" "${MAJOR}" "${LIBDIR}"
+
+# install header
+echo "Installing header file '${libutf8_HEADER}' into ${INCLUDEDIR}"
+install_header "${libutf8_HEADER}" "${INCLUDEDIR}" 0644 || return 1
+
+# install pkgconfig file, substituting the @...@ placeholders of the template
+echo "Installing package config file into ${PKGCONFDIR}"
+PKGCONFFILE="${PKGCONFDIR}/libutf8.pc"
+do_cmd rm -f "${PKGCONFFILE}"
+do_cmd_redir "${PKGCONFFILE}" sed \
+    -e "s,@VERSION@,${VERSION}," \
+    -e "s,@LIBDIR@,${FINALLIBDIR}," \
+    -e "s,@INCLUDEDIR@,${FINALINCLUDEDIR}," \
+    src/libutf8/pkgconf.in
+do_cmd chmod 0644 "${PKGCONFFILE}"
+print_success "Done"
+
+# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
+# vim: expandtab:ts=4:sw=4
--- a/src/libutf8/build.lib
+++ b/src/libutf8/build.lib
@ -0,0 +1,51 @@
+# These are external variables, and shouldn't clash with anything else
+#  libutf8
+#  libutf8_BUILT
+#  libutf8_HEADER
+#  libutf8_BASE
+
+# Build the shared library from the monolithic source, at most once per build
+# (libutf8_BUILT is set at the end).
+if [ -z "${libutf8_BUILT}" ]
+then
+    libutf8_BASE=libutf8
+    source src/libutf8/soversion
+
+    libutf8="obj/${libutf8_BASE}.so.${SOMAJOR}.${SOMINOR}.${SOMICRO}"
+    SO_EXTRA="-lc"
+
+    echo "Building library ${libutf8}..."
+
+    # Sets SRC, HDR and MONOLITHIC_TESTS, and generates the monolithic files if needed.
+    do_cmd source src/libutf8/build.monolithic || return 1
+
+    # Recompile when any build script or generated file is newer than the library.
+    # The list variables are space-separated and therefore deliberately unquoted.
+    MODIFIED=0
+    for test in ${MONOLITHIC_TESTS} ${HDR} ${SRC}
+    do
+        if [ "${test}" -nt "${libutf8}" ]
+        then
+            MODIFIED=1
+            break
+        fi
+    done
+
+    if [ "${MODIFIED}" -ne 0 ]
+    then
+        echo " Compiling"
+
+        SONAME="${libutf8_BASE}.so.${SOMAJOR}.${SOMINOR}"
+        # CC, CFLAGS, SRC and SO_EXTRA may each hold several words; keep them unquoted.
+        do_cmd ${CC} ${CFLAGS} -shared -fpic -o "${libutf8}" \
+            -Wl,-soname,"${SONAME}" \
+            ${SRC} ${SO_EXTRA} || return 1
+
+        # make tests work
+        do_cmd ln -sf "$(basename "${libutf8}")" "obj/${SONAME}" || return 1
+
+        print_success "Library built"
+    else
+        print_success "Library up to date"
+    fi
+
+    libutf8_BUILT=1
+    libutf8_HEADER=${HDR}
+
+fi
+# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
+# vim: expandtab:ts=4:sw=4
--- a/src/libutf8/build.monolithic
+++ b/src/libutf8/build.monolithic
@ -0,0 +1,21 @@
+# These are external variables, and shouldn't clash with anything else
+#  libutf8_MONOLITHIC
+
+# Output paths of the generated single-file source and public header.
+SRC="obj/libutf8.c"
+HDR="obj/utf8.h"
+
+# Build scripts listed for build.lib's "is anything newer than the library" check.
+MONOLITHIC_TESTS="src/libutf8/build.lib src/libutf8/build.monolithic"
+
+# libutf8_MONOLITHIC is set below, so the generation step runs at most once per build.
+if [ -z "${libutf8_MONOLITHIC}" ]
+then
+    # Brace expansion yields, in order: TopHeader, ForwardDeclare, ctype, decode,
+    # decode_state, encode, encode_state, BottomHeader (each with a .h suffix).
+    # make_monolithic is defined elsewhere; presumably it combines the files named in
+    # MONOLITHIC_SOURCE into the given output file -- TODO confirm.
+    MONOLITHIC_SOURCE="$(echo src/libutf8/{TopHeader,ForwardDeclare,ctype,{de,en}code{,_state},BottomHeader}.h)"
+    make_monolithic ${HDR} C || return 1
+
+    # Same pattern for the implementation files, starting with TopSource.c.
+    MONOLITHIC_SOURCE="$(echo src/libutf8/{TopSource,ctype,{de,en}code{,_state}}.c)"
+    make_monolithic ${SRC} C || return 1
+
+    libutf8_MONOLITHIC=1
+    # The generated header is also appended to the documentation input list.
+    MONOLITHIC_DOC="${MONOLITHIC_DOC} ${HDR}"
+fi
+# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
+# vim: expandtab:ts=4:sw=4
--- a/src/libutf8/ctype.c
+++ b/src/libutf8/ctype.c
@ -0,0 +1,55 @@
+/* libutf8/src/lib/ctype.c
+ *
+ *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
+ *  COPYING for more information / terms of license.
+*/
+
+/* Reports whether ch has no bits set outside the low seven, i.e. lies in 0..0x7F. */
+bool utf8_isascii(wchar_t ch)
+{
+    return (ch & ~0x7F) == 0;
+}
+
+
+
+/* From PropList-4.1.0.txt (http://www.unicode.org/Public/UNIDATA/)
+
+0009..000D    ; White_Space # Cc   [5] <control-0009>..<control-000D>
+0020          ; White_Space # Zs       SPACE
+0085          ; White_Space # Cc       <control-0085>
+00A0          ; White_Space # Zs       NO-BREAK SPACE
+1680          ; White_Space # Zs       OGHAM SPACE MARK
+180E          ; White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
+2000..200A    ; White_Space # Zs  [11] EN QUAD..HAIR SPACE
+2028          ; White_Space # Zl       LINE SEPARATOR
+2029          ; White_Space # Zp       PARAGRAPH SEPARATOR
+202F          ; White_Space # Zs       NARROW NO-BREAK SPACE
+205F          ; White_Space # Zs       MEDIUM MATHEMATICAL SPACE
+3000          ; White_Space # Zs       IDEOGRAPHIC SPACE
+*/
+
+/* Returns true when ch is one of the White_Space code points listed in the table above. */
+bool utf8_isspace(wchar_t ch)
+{
+    /* the two contiguous ranges */
+    if(ch >= 0x0009 && ch <= 0x000D) return true;
+    if(ch >= 0x2000 && ch <= 0x200A) return true;
+
+    /* the isolated code points */
+    switch(ch) {
+    case 0x0020:
+    case 0x0085:
+    case 0x00A0:
+    case 0x1680:
+    case 0x180E:
+    case 0x2028:
+    case 0x2029:
+    case 0x202F:
+    case 0x205F:
+    case 0x3000:
+        return true;
+    default:
+        return false;
+    }
+}
+
+
+
+/* Returns true when ch is within 0..0x7FFFFFFF and is neither a UTF-16 surrogate
+ * (U+D800..U+DFFF) nor one of U+FFFE / U+FFFF. */
+bool utf8_isvalid(wchar_t ch)
+{
+    /* any bit set above bit 30 puts the value outside the permitted range */
+    if(ch & (~((wchar_t)0x7FFFFFFF))) return false;
+
+    /* surrogate block */
+    if(ch >= 0xD800 && ch <= 0xDFFF) return false;
+
+    /* the two rejected code points at the top of the BMP */
+    if(ch == 0xFFFE || ch == 0xFFFF) return false;
+
+    return true;
+}
+
+/* options for text editors
+kate: replace-trailing-space-save true; space-indent true; tab-width 4;
+*/
--- a/src/libutf8/ctype.h
+++ b/src/libutf8/ctype.h
@ -0,0 +1,46 @@
+/* libutf8/src/lib/ctype.h
+ *
+ *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
+ *  COPYING for more information / terms of license.
+*/
+
+/*! \defgroup ctype Character classification
+
+This module contains functions for character classification. These are basically an extension of the
+\c is* functions defined in \c &lt;ctype.h&gt;.
+
+\todo There are many char classification functions that haven't been implemented yet. These won't be
+    implemented until they can be done in a proper, Unicode-safe fashion.
+
+ */
+/*!@{*/
+
+
+
+/// Returns \c true if \a ch can be represented in ASCII.
+bool utf8_isascii(wchar_t ch);
+
+/// Returns \c true if \a ch is whitespace.
+bool utf8_isspace(wchar_t ch);
+
+/*! \brief Returns \c true if \a ch is a valid UCS-4 character.
+
+\param ch The character to classify.
+\retval true If \a ch is a valid UCS-4 character.
+\retval false If \a ch is not a valid UCS-4 character.
+
+This function will examine a \c wchar_t value and determine whether or not it is a valid UCS-4
+character. Valid characters lie in the range 0&ndash;0x7FFFFFFF but exclude:
+\li the UTF-16 surrogate code points (U+D800&ndash;U+DFFF, inclusive)
+\li the invalid code points U+FFFE and U+FFFF
+
+*/
+bool utf8_isvalid(wchar_t ch);
+
+
+
+/*!@}*/
+
+/* options for text editors
+kate: replace-trailing-space-save true; space-indent true; tab-width 4;
+*/
--- a/src/libutf8/decode.c
+++ b/src/libutf8/decode.c
@ -0,0 +1,234 @@
+/* libutf8/src/lib/decode.c
+ *
+ *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
+ *  COPYING for more information / terms of license.
+*/
+
+/* Decode one character from src when the caller has no buffer length to hand: forwards to
+ * utf8_decode_char2() with a limit of 6 bytes, the longest sequence that decoder handles. */
+wchar_t utf8_decode_char(const char* src, size_t* used)
+{
+    const size_t longest_sequence = 6;
+
+    return utf8_decode_char2(src, longest_sequence, used);
+}
+
+
+
+/* Decode a single UTF-8 sequence from a buffer of known size.
+ *
+ *  src   Start of the encoded data.
+ *  size  Number of readable bytes at src.
+ *  used  If not null, receives the sequence length implied by the lead byte (1-6); it is
+ *        written before the continuation bytes are validated, so it is also set when the
+ *        sequence later proves truncated or malformed.
+ *
+ * Returns the decoded character. On failure returns (wchar_t)(-1), the error value the
+ * header documents, and sets errno to EINVAL (null src or zero size) or EILSEQ (invalid
+ * lead byte, truncated sequence, invalid continuation byte, or overlong encoding).
+ */
+wchar_t utf8_decode_char2(const char* src, size_t size, size_t* used)
+{
+    uint8_t ch;
+    wchar_t ret, min;
+    int remain;
+
+    if(!src || !size) {
+        errno = EINVAL;
+        return (wchar_t)(-1);
+    }
+    if(used) *used = 1;
+    ch = (uint8_t)*src++;
+
+    /* single byte: plain ASCII */
+    if(!(ch & 0x80)) return ch;
+
+    /* The lead byte gives the number of continuation bytes (remain), the payload bits it
+     * carries itself (ret) and the smallest value that legitimately needs this length (min). */
+    if((ch & 0xE0) == 0xC0) {
+        min = 0x80;
+        remain = 1;
+        if(used) *used = 2;
+        ret = ch & 0x1F;
+    } else if((ch & 0xF0) == 0xE0) {
+        min = 0x800;
+        remain = 2;
+        if(used) *used = 3;
+        ret = ch & 0x0F;
+    } else if((ch & 0xF8) == 0xF0) {
+        min = 0x10000;
+        remain = 3;
+        if(used) *used = 4;
+        ret = ch & 0x07;
+    } else if((ch & 0xFC) == 0xF8) {
+        min = 0x200000;
+        remain = 4;
+        if(used) *used = 5;
+        ret = ch & 0x03;
+    } else if((ch & 0xFE) == 0xFC) {
+        min = 0x4000000;
+        remain = 5;
+        if(used) *used = 6;
+        ret = ch & 0x01;
+    } else {
+        /* stray continuation byte, or 0xFE / 0xFF */
+        errno = EILSEQ;
+        return (wchar_t)(-1);
+    }
+
+    while(remain--) {
+        /* ran out of input in the middle of the sequence */
+        if(!--size) {
+            errno = EILSEQ;
+            return (wchar_t)(-1);
+        }
+        ch = (uint8_t)*src++;
+        if((ch & 0xC0) != 0x80) {
+            errno = EILSEQ;
+            return (wchar_t)(-1);
+        }
+        ret <<= 6;
+        ret |= ch & 0x3F;
+    }
+
+    /* Overlong check: the *decoded value* must be large enough to need this many bytes.
+     * (Comparing the last byte read instead would reject every 3+ byte sequence.) */
+    if(ret < min) {
+        errno = EILSEQ;
+        return (wchar_t)(-1);
+    }
+
+    return ret;
+}
+
+
+
+/* Length-less variant of utf8_decode_char2_force(): forwards src, used and ilseq with a
+ * limit of 6 bytes, the longest sequence that decoder handles. */
+wchar_t utf8_decode_char_force(const char* src, size_t* used, wchar_t ilseq)
+{
+    const size_t longest_sequence = 6;
+
+    return utf8_decode_char2_force(src, longest_sequence, used, ilseq);
+}
+
+
+
+/* Decode a single UTF-8 sequence from a buffer of known size, substituting ilseq for
+ * malformed input instead of failing.
+ *
+ *  src    Start of the encoded data.
+ *  size   Number of readable bytes at src.
+ *  used   If not null, receives the number of bytes consumed.
+ *  ilseq  Value returned when the input is not valid UTF-8.
+ *
+ * Returns the decoded character, or ilseq for an invalid lead byte, a truncated sequence,
+ * an invalid continuation byte or an overlong encoding. In the ilseq case the consumed
+ * count covers the bad bytes plus any continuation bytes that follow, so the next call
+ * starts on a byte that can begin a character; a non-continuation byte that cuts a
+ * sequence short is left unconsumed. At most size bytes are ever read.
+ *
+ * Returns (wchar_t)(-1) with errno set to EINVAL, leaving *used untouched, when src is
+ * null or size is 0.
+ */
+wchar_t utf8_decode_char2_force(const char* src, size_t size, size_t* used, wchar_t ilseq)
+{
+    uint8_t ch;
+    wchar_t ret, min;
+    int remain;
+    size_t consumed;
+
+    if(!src || !size) {
+        errno = EINVAL;
+        return (wchar_t)(-1);
+    }
+
+    ch = (uint8_t)src[0];
+    consumed = 1;
+
+    /* single byte: plain ASCII */
+    if(!(ch & 0x80)) {
+        if(used) *used = consumed;
+        return ch;
+    }
+
+    /* The lead byte gives the number of continuation bytes (remain), the payload bits it
+     * carries itself (ret) and the smallest value that legitimately needs this length (min). */
+    if((ch & 0xE0) == 0xC0) {
+        min = 0x80;
+        remain = 1;
+        ret = ch & 0x1F;
+    } else if((ch & 0xF0) == 0xE0) {
+        min = 0x800;
+        remain = 2;
+        ret = ch & 0x0F;
+    } else if((ch & 0xF8) == 0xF0) {
+        min = 0x10000;
+        remain = 3;
+        ret = ch & 0x07;
+    } else if((ch & 0xFC) == 0xF8) {
+        min = 0x200000;
+        remain = 4;
+        ret = ch & 0x03;
+    } else if((ch & 0xFE) == 0xFC) {
+        min = 0x4000000;
+        remain = 5;
+        ret = ch & 0x01;
+    } else {
+        /* stray continuation byte, or 0xFE / 0xFF */
+        goto ILSEQ;
+    }
+
+    while(remain--) {
+        /* ran out of input in the middle of the sequence */
+        if(consumed == size) goto ILSEQ;
+
+        ch = (uint8_t)src[consumed];
+        /* not a continuation byte: leave it in place, it may start the next character */
+        if((ch & 0xC0) != 0x80) goto ILSEQ;
+
+        ++consumed;
+        ret <<= 6;
+        ret |= ch & 0x3F;
+    }
+
+    /* Overlong check on the decoded value (not on the last byte read). */
+    if(ret < min) goto ILSEQ;
+
+    if(used) *used = consumed;
+    return ret;
+
+ILSEQ:
+    /* Advance to the next possible character boundary: skip continuation bytes only, and
+     * check the remaining length before every read. A NUL byte is not a continuation byte,
+     * so the scan also stops at a string terminator. */
+    while(consumed < size && ((uint8_t)src[consumed] & 0xC0) == 0x80) {
+        ++consumed;
+    }
+
+    if(used) *used = consumed;
+    return ilseq;
+}
+
+
+
+/* Decode the NUL-terminated UTF-8 string at src into dest, which has room for size wide
+ * characters. Returns dest on success. Returns 0 if utf8_decoder() fails (errno as set by
+ * it), or with errno = ENOMEM if the decoder returned while the read pointer was still on
+ * a non-NUL byte. */
+wchar_t* utf8_decode(wchar_t* dest, size_t size, const char* src)
+{
+    struct utf8_decode_state state;
+
+    memset(&state, 0, sizeof(state));
+    state.wr = dest;
+    state.wr_size = size;
+    state.rd = src;
+    state.rd_remain = -1;   /* negative: the decoder stops at the NUL byte instead of counting */
+
+    if(utf8_decoder(&state) == 0) return 0;
+
+    if(*state.rd != 0) {
+        errno = ENOMEM;
+        return 0;
+    }
+    return dest;
+}
+
+
+
+/* Decode exactly amt bytes of UTF-8 from src into dest, which has room for size wide
+ * characters. On success stores the decoder's written count through written (when not
+ * null) and returns dest. Returns 0 if utf8_decoder() fails (errno as set by it), or with
+ * errno = ENOMEM if input bytes remain unread or the decoder's complete flag is not set. */
+wchar_t* utf8_decode2(wchar_t* dest, size_t size, size_t* written, const char* src, size_t amt)
+{
+    struct utf8_decode_state state;
+
+    memset(&state, 0, sizeof(state));
+    state.wr = dest;
+    state.wr_size = size;
+    state.rd = src;
+    state.rd_remain = amt;
+
+    if(utf8_decoder(&state) == 0) return 0;
+
+    if(state.rd_remain != 0 || !state.complete) {
+        errno = ENOMEM;
+        return 0;
+    }
+
+    if(written != 0) *written = state.written;
+    return dest;
+}
+
+
+
+/* Decode the NUL-terminated UTF-8 string at src into dest (room for size wide characters)
+ * with utf8_decode_error_callback_replace installed as the decoder's error callback.
+ * Returns dest on success. Returns 0 if utf8_decoder() fails (errno as set by it), or with
+ * errno = ENOMEM if the decoder returned while the read pointer was still on a non-NUL
+ * byte. */
+wchar_t* utf8_decode_force(wchar_t* dest, size_t size, const char* src)
+{
+    struct utf8_decode_state state;
+
+    memset(&state, 0, sizeof(state));
+    state.error_callback = utf8_decode_error_callback_replace;
+    state.wr = dest;
+    state.wr_size = size;
+    state.rd = src;
+    state.rd_remain = -1;   /* negative: the decoder stops at the NUL byte instead of counting */
+
+    if(utf8_decoder(&state) == 0) return 0;
+
+    if(*state.rd != 0) {
+        errno = ENOMEM;
+        return 0;
+    }
+    return dest;
+}
+
+
+
+/* Decode amt bytes of UTF-8 from src into dest (room for size wide characters) with
+ * utf8_decode_error_callback_replace installed as the decoder's error callback. Returns 0
+ * only if utf8_decoder() fails; otherwise stores the decoder's written count through
+ * written (when not null) and returns dest, without checking for unread input. */
+wchar_t* utf8_decode_force2(wchar_t* dest, size_t size, size_t* written, const char* src, size_t amt)
+{
+    struct utf8_decode_state state;
+
+    memset(&state, 0, sizeof(state));
+    state.error_callback = utf8_decode_error_callback_replace;
+    state.wr = dest;
+    state.wr_size = size;
+    state.rd = src;
+    state.rd_remain = amt;
+
+    if(utf8_decoder(&state) == 0) return 0;
+
+    if(written != 0) *written = state.written;
+    return dest;
+}
--- a/src/libutf8/decode.h
+++ b/src/libutf8/decode.h
@ -0,0 +1,187 @@
+/* libutf8/src/lib/decode.h
+ *
+ *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
+ *  COPYING for more information / terms of license.
+*/
+
+/*! \defgroup decode UTF-8 decoding routines.
+
+These routines decode UTF-8 data into C's wide character type \c wchar_t. Errors are reported
+through \c errno, with the following errors being of particular interest:
+
+\li \c EINVAL - invalid argument to function
+\li \c EILSEQ - illegal encoding (i.e. not UTF-8 or encoding error)
+\li \c ENOMEM - not enough space in destination buffer
+
+As a special case, functions which return a character may return the \c wchar_t representation of
+-1 to signify an error. This wording is used to take into account the fact that the \c wchar_t type
+could be unsigned.
+
+*/
+/*!@{*/
+
+
+
+/*! \brief Decode a character.
+
+\param src Pointer to start of source data.
+\param used If not null, set to the number of bytes used.
+\retval (wchar_t)(-1) on error (see \c errno).
+\returns Decoded character.
+
+Decodes a single character, returning the \c wchar_t representation of -1 on error and setting
+\c errno appropriately. If \a used is not NULL, it is set to the number of bytes used. Passing
+a null pointer for \a src will result in \c EINVAL. If the UTF-8 byte sequence is corrupt, \c errno
+will be set to \c EILSEQ.
+
+\warning Only use this function if you are sure it cannot read past the end of your buffer. See
+    utf8_decode_char2() for a safe version.
+
+*/
+wchar_t utf8_decode_char(const char* src, size_t* used);
+
+
+
+/*! \brief Decode a character, discarding illegal sequences.
+
+\param src Pointer to start of source data.
+\param used If not null, set to the number of bytes used.
+\param ilseq This value is returned if the UTF-8 byte sequence is invalid. Recommended is the
+    Unicode replacement character, \c 0xFFFD.
+\retval (wchar_t)(-1) on error (see \c errno).
+\retval ilseq If an illegal sequence is encountered.
+\returns Decoded character.
+\post \a *used will be set to the number of bytes consumed.
+
+Decodes a single character, returning the \c wchar_t representation of -1 on error and setting
+\c errno appropriately. If \a used is not NULL, it is set to the number of bytes used. Passing
+a null pointer for \a src will result in \c EINVAL. If the UTF-8 byte sequence is corrupt, \a ilseq
+will be returned and the buffer advanced to the next valid character. This means the function can
+only fail if you pass it an invalid \a src pointer.
+
+\warning Only use this function if you are sure it cannot read past the end of your buffer. See
+    utf8_decode_char2_force() for a safe version.
+
+*/
+wchar_t utf8_decode_char_force(const char* src, size_t* used, wchar_t ilseq);
+
+
+
+/*! \brief Decode a character, given source buffer size.
+
+\param src Pointer to start of source data.
+\param size Size of source data in bytes.
+\param used If not null, set to the number of bytes used.
+\retval (wchar_t)(-1) on error (see \c errno).
+\returns Decoded character.
+
+Decodes a single character, returning the \c wchar_t representation of -1 on error and setting
+\c errno appropriately. If \a used is not NULL, it is set to the number of bytes used.
+
+*/
+wchar_t utf8_decode_char2(const char* src, size_t size, size_t* used);
+
+
+
+/*! \brief Decode a character, discarding illegal sequences and given source buffer size.
+
+\param src Pointer to start of source data.
+\param size Size of source data in bytes.
+\param used If not null, set to the number of bytes used.
+\param ilseq This value is returned if the UTF-8 byte sequence is invalid. Recommended is the
+    Unicode replacement character, \c 0xFFFD.
+\retval (wchar_t)(-1) on error (see \c errno).
+\retval ilseq If an illegal sequence is encountered.
+\returns Decoded character.
+\post \a *used will be set to the number of bytes consumed.
+
+Decodes a single character, returning the \c wchar_t representation of -1 on error and setting
+\c errno appropriately. If \a used is not NULL, it is set to the number of bytes used. Passing
+a null pointer for \a src will result in \c EINVAL. If the UTF-8 byte sequence is corrupt, \a ilseq
+will be returned and the buffer advanced to the next valid character. This means the function can
+only fail if you pass it an invalid \a src pointer, or a \a size of 0.
+
+*/
+wchar_t utf8_decode_char2_force(const char* src, size_t size, size_t* used, wchar_t ilseq);
+
+
+
+/*! \brief Decode a null-terminated string.
+
+\param dest The output destination.
+\param size The number of characters that can be stored in \a dest.
+\param src Pointer to the null-terminated source data.
+\returns Pointer to the output destination.
+\retval 0 on error (see \c errno).
+
+This function will attempt to decode a null-terminated UTF-8 string. It returns 0 on error and sets
+\c errno appropriately.
+
+*/
+wchar_t* utf8_decode(wchar_t* dest, size_t size, const char* src);
+
+
+
+/*! \brief Decode a fixed-size string.
+
+\param dest The output destination.
+\param size The number of characters that can be stored in \a dest.
+\param written If not null, set to the number of characters written (excluding NUL).
+\param src Pointer to the source data (need not be null-terminated).
+\param amt Number of bytes to decode.
+\returns Pointer to the output destination.
+\retval 0 on error (see \c errno).
+
+This function will attempt to decode a fixed-size UTF-8 string. It returns 0 on error and sets
+\c errno appropriately. It will happily transcode ASCII NUL characters. If \a written is not null,
+it is set to the number of characters written excluding the terminating NUL. This function always
+produces null-terminated strings.
+
+*/
+wchar_t* utf8_decode2(wchar_t* dest, size_t size, size_t* written, const char* src, size_t amt);
+
+
+
+/*! \brief Decode a null-terminated string, ignoring errors.
+
+\param dest The output destination.
+\param size The number of characters that can be stored in \a dest.
+\param src Pointer to the null-terminated source data.
+\returns Pointer to the output destination.
+\retval 0 on error (see \c errno).
+
+This function will attempt to decode a null-terminated UTF-8 string. It returns 0 on error and sets
+\c errno appropriately.
+
+This function will truncate the output if there is not enough space and will skip characters it
+cannot decode. It can only fail if you pass it invalid parameters.
+
+*/
+wchar_t* utf8_decode_force(wchar_t* dest, size_t size, const char* src);
+
+
+
+/*! \brief Decode a fixed-size string, ignoring errors.
+
+\param dest The output destination.
+\param size The number of characters that can be stored in \a dest.
+\param written If not null, set to the number of characters written (excluding NUL).
+\param src Pointer to the source data (need not be null-terminated).
+\param amt Number of bytes to decode.
+\returns Pointer to the output destination.
+\retval 0 on error (see \c errno).
+
+This function will attempt to decode a fixed-size UTF-8 string. It returns 0 on error and sets
+\c errno appropriately. It will happily transcode ASCII NUL characters. If \a written is not null,
+it is set to the number of characters written excluding the terminating NUL. This function always
+produces null-terminated strings.
+
+This function will truncate the output if there is not enough space and will skip characters it
+cannot decode. It can only fail if you pass it invalid parameters.
+
+*/
+wchar_t* utf8_decode_force2(wchar_t* dest, size_t size, size_t* written, const char* src, size_t amt);
+
+
+
+/*!@}*/
--- a/src/libutf8/decode_state.c
+++ b/src/libutf8/decode_state.c
@ -0,0 +1,204 @@
+/* libutf8/src/lib/decode_ctx.c
+ *
+ *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
+ *  COPYING for more information / terms of license.
+*/
+
+enum utf8_decoder_state {       // internal decoder states, kept in utf8_decode_state.state
+    utf8_state_none,            // between characters; must be 0 (utf8_decoder() tests `!--state`)
+    utf8_state_multibyte1,      // multibyteN: N continuation bytes are still expected; the value
+    utf8_state_multibyte2,      //   must equal N because utf8_decoder() decrements the state once
+    utf8_state_multibyte3,      //   per continuation byte and treats reaching 0 as "char done"
+    utf8_state_multibyte4,
+    utf8_state_multibyte5,
+    utf8_state_error,           // utf8_decoder() refuses such an object with EINVAL
+    utf8_state_skip             // set after the error callback chose skip/replace; bad byte is re-read
+};
+
+
+
+/* Stateful UTF-8 decoder: consume input from ctx->rd and write wide characters to ctx->wr.
+ *
+ * Decoding stops when the input is exhausted (rd_remain reaches 0, or the terminating NUL is seen
+ * when rd_remain is negative) or when only the slot reserved for the terminating NUL is left in
+ * the output. The output is always NUL-terminated; ctx->written excludes that terminator.
+ *
+ * Returns ctx on success. Returns 0 with errno set to EINVAL for an unusable (or already failed)
+ * ctx, or to EILSEQ when an illegal sequence is met and the error callback does not recover from
+ * it. In the EILSEQ case ctx is put into the error state, so further calls fail with EINVAL.
+ */
+struct utf8_decode_state* utf8_decoder(struct utf8_decode_state* ctx)
+{
+    wchar_t* wr;
+    size_t avail;
+    enum utf8_decode_error error_type;
+
+    if(!ctx || !ctx->rd || !ctx->wr || ctx->wr_size < 2 || ctx->state == utf8_state_error) {
+        errno = EINVAL;
+        return 0;
+    }
+
+    wr = ctx->wr;
+    ctx->written = 0;
+    avail = ctx->wr_size;
+
+loop:
+    while(ctx->rd_remain) {
+        uint8_t in = *ctx->rd;
+
+        switch(ctx->state) {
+        case utf8_state_skip:
+        case utf8_state_none:
+            // terminating NUL of a null-terminated input: stop here without consuming it
+            if(!in && ctx->rd_remain < 0) {
+                *wr = 0;
+                ctx->complete = true;
+                ++ctx->byte_offset;
+                return ctx;
+            }
+
+            // plain ASCII
+            if(!(in & 0x80)) {
+                *wr++ = in;
+                ++ctx->written;
+                --avail;
+                ++ctx->char_offset;
+                ctx->complete = true;
+                if(in == 0x0A) {
+                    ++ctx->line;
+                    ctx->col = 0;
+                } else {
+                    ++ctx->col;
+                }
+                ctx->state = utf8_state_none;
+                break;
+            }
+
+            // start byte of a multibyte sequence: remember the payload bits and the smallest
+            // value this sequence length may legally encode (used to reject overlong forms)
+            ctx->complete = false;
+            if((in & 0xE0) == 0xC0) {
+                ctx->minch = 0x80;
+                ctx->state = utf8_state_multibyte1;
+                ctx->statech = in & 0x1F;
+            } else if((in & 0xF0) == 0xE0) {
+                ctx->minch = 0x800;
+                ctx->state = utf8_state_multibyte2;
+                ctx->statech = in & 0x0F;
+            } else if((in & 0xF8) == 0xF0) {
+                ctx->minch = 0x10000;
+                ctx->state = utf8_state_multibyte3;
+                ctx->statech = in & 0x07;
+            } else if((in & 0xFC) == 0xF8) {
+                ctx->minch = 0x200000;
+                ctx->state = utf8_state_multibyte4;
+                ctx->statech = in & 0x03;
+            } else if((in & 0xFE) == 0xFC) {
+                ctx->minch = 0x4000000;
+                ctx->state = utf8_state_multibyte5;
+                ctx->statech = in & 0x01;
+            } else if(ctx->state != utf8_state_none) {
+                // utf8_state_skip: this is the byte the error callback told us to drop. We are
+                // between characters again, so the stream is complete at this point.
+                ctx->state = utf8_state_none;
+                ctx->complete = true;
+            } else {
+                error_type = ((in & 0xC0) == 0x80) ? utf8_decode_error_lone_cchar
+                    : utf8_decode_error_not_schar;
+                goto error;
+            }
+            break;
+
+        case utf8_state_multibyte1:
+        case utf8_state_multibyte2:
+        case utf8_state_multibyte3:
+        case utf8_state_multibyte4:
+        case utf8_state_multibyte5:
+            if((in & 0xC0) != 0x80) {
+                error_type = utf8_decode_error_not_cchar;
+                goto error;
+            }
+            ctx->statech <<= 6;
+            ctx->statech |= in & 0x3F;
+
+            // the state value is the number of continuation bytes outstanding; 0 == char done
+            if(!--ctx->state) {
+                if(ctx->statech < ctx->minch) {
+                    error_type = utf8_decode_error_overlong;
+                    goto error;
+                } else {
+                    // validate codepoint
+                    if(!utf8_isvalid(ctx->statech)) {
+                        error_type = utf8_decode_error_illegal_cp;
+                        goto error;
+                    }
+
+                    // add to output string
+                    *wr++ = ctx->statech;
+                    ++ctx->written;
+                    --avail;
+                    ++ctx->char_offset;
+                    ctx->complete = true;
+                    if(ctx->statech == 0x0A || ctx->statech == 0x2028) {
+                        ++ctx->line;
+                        ctx->col = 0;
+                    } else {
+                        ++ctx->col;
+                    }
+                }
+            }
+            break;
+
+        default:
+            errno = EINVAL;
+            return 0;
+        }
+
+        ++ctx->byte_offset;
+        ++ctx->rd;
+        if(ctx->rd_remain > 0) --ctx->rd_remain;
+        if(avail == 1) break; // keep the last slot for the terminating NUL
+    }
+    *wr = 0;
+    return ctx;
+
+error:
+    // Note that the offending byte has NOT been consumed: after a skip/replace decision it is
+    // looked at again in utf8_state_skip, where it is either dropped or starts a new character.
+    if(!ctx->error_callback) goto fail;
+    switch(ctx->error_callback(ctx, error_type, wr)) {
+    case utf8_decode_error_action_abort:
+        break;
+
+    case utf8_decode_error_action_skip:
+        ctx->state = utf8_state_skip;
+        goto loop;
+
+    case utf8_decode_error_action_replace:
+        // the callback has stored the replacement character in *wr
+        ctx->state = utf8_state_skip;
+        ++ctx->written;
+        if(*wr == 0x0A || *wr == 0x2028) {
+            ++ctx->line;
+            ctx->col = 0;
+        } else {
+            ++ctx->col;
+        }
+        ++wr;
+        if(--avail == 1) {
+            *wr = 0;
+            return ctx;
+        }
+        goto loop;
+    }
+
+    // abort requested, or the callback returned an action we do not know
+fail:
+    ctx->state = utf8_state_error; // documented contract: object is unusable from now on
+    errno = EILSEQ;
+    return 0;
+}
+
+
+
+enum utf8_decode_error_action utf8_decode_error_callback_replace(
+    const struct utf8_decode_state* ctx, enum utf8_decode_error error, wchar_t* newch)
+{
+    (void)ctx;
+    (void)error;
+    *newch = 0xFFFD;
+    return utf8_decode_error_action_replace;
+}
+
+
+
+enum utf8_decode_error_action utf8_decode_error_callback_skip(
+    const struct utf8_decode_state* ctx, enum utf8_decode_error error, wchar_t* newch)
+{
+    (void)ctx;
+    (void)error;
+    (void)newch;
+    return utf8_decode_error_action_skip;
+}
+
+/* options for text editors
+kate: replace-trailing-space-save true; space-indent true; tab-width 4;
+*/
--- a/src/libutf8/decode_state.h
+++ b/src/libutf8/decode_state.h
@ -0,0 +1,197 @@
+/* libutf8/src/lib/decode_ctx.h
+ *
+ *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
+ *  COPYING for more information / terms of license.
+*/
+
+/*! \defgroup decode_ctx UTF-8 stateful decoder.
+
+This UTF-8 decoder uses a structure to maintain state information between calls. This means that
+you can feed it a stream of data as it comes in without needing to store the entire document in a
+buffer. It correctly copes with the currently-available data ending on a non-character boundary.
+
+Errors are handled by providing a callback function (several of which are provided by the library).
+The callback function has the option of aborting the conversion, substituting a replacement
+character, or simply skipping the illegal byte sequence.
+
+*/
+/*!@{*/
+
+
+
+/*! \brief Types of decoder error.
+
+These are the types of error that can be encountered by the decoder. This allows slightly more
+information than is provided by setting \a errno to \c EILSEQ. The type of error will be passed
+to the callback function.
+
+*/
+enum utf8_decode_error {
+    /// Lone continuation byte (10xxxxxx) encountered where a start byte was expected.
+    utf8_decode_error_lone_cchar,
+
+    /// Byte within a multibyte sequence that is not a continuation byte (10xxxxxx).
+    utf8_decode_error_not_cchar,
+
+    /// Byte that is neither ASCII, a continuation byte nor a start byte (i.e. 0xFE or 0xFF).
+    utf8_decode_error_not_schar,
+
+    /// Overlong byte sequence (decoded value is below the minimum for the sequence length).
+    utf8_decode_error_overlong,
+
+    /// Code point rejected by utf8_isvalid() (UTF-16 surrogates or 0xFFFE,0xFFFF).
+    utf8_decode_error_illegal_cp
+};
+
+
+
+/*! \brief Action to be taken after error callback.
+
+These are the possible actions that can be undertaken after a stateful decode has encountered an
+error. These actions are specified by the error callback function's return value.
+
+*/
+enum utf8_decode_error_action {
+    /// Abort the conversion: utf8_decoder() returns 0 with \c errno set to \c EILSEQ.
+    utf8_decode_error_action_abort,
+
+    /// Discard the illegal byte sequence and carry on; nothing is written to the output for it.
+    utf8_decode_error_action_skip,
+
+    /// Discard the illegal byte sequence and emit the character the callback stored in \a newch.
+    utf8_decode_error_action_replace
+};
+
+
+
+/*! \brief Error callback type.
+
+\param state The state-storage structure.
+\param error The error type.
+\param[out] newch If utf8_decode_error_action_replace is returned, then set this to the value of
+    the character you wish to replace with (\c 0xFFFD is recommended).
+\returns A value specifying what action to undertake as a result of the callback.
+
+This callback determines the action of the UTF-8 stateful decoder on encountering an illegal byte
+sequence. It can choose to abort the conversion, skip the illegal sequence, or replace the illegal
+sequence with an arbitrary character.
+
+The decoder passes the next free position of its output buffer as \a newch, so a replacement
+character is written straight into the output. \a state is read-only: the callback may inspect it
+(e.g. the line / column counters) but cannot modify the decoder through it.
+
+*/
+typedef enum utf8_decode_error_action(*utf8_decode_error_callback)(
+    const struct utf8_decode_state* state, enum utf8_decode_error error, wchar_t* newch);
+
+
+
+/*! \brief State structure used to decode UTF-8 into Unicode.
+
+This structure is used to decode arbitrary chunks of UTF-8 data into Unicode. It can deal with
+partial data streams (even if they are cut-off mid-character).
+
+Before calling utf8_decoder, you must set up the object appropriately. The first step is to use
+\a memset to initialise everything to 0. Then you need to fill out the read and write pointers, and
+possibly set up the error callback.
+
+To use it, you set \a rd to point to your input data and \a rd_remain to the amount you have. If
+\a rd_remain is negative, the input data is assumed to be null-terminated; otherwise, it is taken
+as the number of bytes remaining at the input. These are updated after each call, so simply check
+if \a rd_remain is 0 (or \a *rd is 0 in the case of a null-terminated string).
+
+You must also set \a wr (pointer to destination buffer) and \a wr_size (number of characters that
+can be written there), and \a written is set for you (it is the number of characters written per
+call but excluding the terminating NUL). This implies that the buffer must have space for at least
+two characters. You can change \a wr and \a wr_size at any time, but if you leave them the same the
+data will be overwritten on each call.
+
+If you wish to do error recovery, set \a error_callback and possibly \a data.
+
+You can examine the \a line and \a col variables to get the line / column of the input data at which
+the decoder is currently operating. \a char_offset and \a byte_offset represent the offset, in
+complete characters or bytes, from the start of the stream. With the exception of \a byte_offset,
+these variables aren't perfect, as they can be affected by errors and limitations (only 0x0A and
+0x2028 are recognised as line end chars, and the effect of tabs is ignored).
+
+*/
+struct utf8_decode_state {
+    /// \c false while part-way through a multi-byte character, \c true after a whole character.
+    bool complete;
+
+    /// Data to read (current read position); advanced by utf8_decoder() as bytes are consumed.
+    const char* rd;
+
+    /// Number of bytes remaining; negative means \a rd is null-terminated (never counted down).
+    int rd_remain;
+
+    /// Internal state (an \c enum \c utf8_decoder_state value); initialise to 0, don't change.
+    int state;
+
+    /// Error callback (may be 0, in which case any decoding error aborts with \c EILSEQ).
+    utf8_decode_error_callback error_callback;
+
+    /// Pointer to output buffer (utf8_decoder() writes from here on each call, never moves it).
+    wchar_t* wr;
+
+    /// Number of characters that can be written, including the terminating NUL (minimum 2).
+    size_t wr_size;
+
+    /// Number of characters written on last call, excluding the terminating NUL.
+    size_t written;
+
+    /// Arbitrary data pointer for \a error_callback; not used by utf8_decoder() itself.
+    void* data;
+
+    /// Current line (starting from 0); incremented on 0x0A and 0x2028.
+    int line;
+
+    /// Current column (starting from 0); reset to 0 whenever \a line is incremented.
+    int col;
+
+    /// Character offset from start of data (starting from 0).
+    int char_offset;
+
+    /// Byte offset from start of data (starting from 0).
+    int byte_offset;
+
+    /// Internal: bits of the multi-byte character accumulated so far. Don't use this.
+    wchar_t statech;
+    /// Internal: smallest value the current sequence length may encode. Don't use this.
+    wchar_t minch;
+};
+
+
+
+/*! \brief Decode an arbitrary chunk of a UTF-8 byte stream.
+
+\param state The state-storage structure.
+\retval state on success.
+\retval 0 on error (see \a errno).
+
+This function is used to do multi-pass decoding of arbitrary UTF-8 byte streams. Each call will
+update \a state.rd, \a state.rd_remain and \a state.written. \a state.complete is \c true if, on consumption
+of all the data, we are not inside a multibyte character.
+
+Should an error occur, \a state.error_callback is called (if it is not 0). If it is 0, or it returns
+utf8_decode_error_action_abort, then the conversion will be aborted and the object set into
+an error state. \a errno will be set to \c EILSEQ. Once the object is in an error state, there is
+no way to recover short of completely clearing it and starting with fresh data. Continuing to call
+this function with an invalid object will result in \c EINVAL.
+
+*/
+struct utf8_decode_state* utf8_decoder(struct utf8_decode_state* state);
+
+
+
+/// Standard error callback: use replacement char 0xFFFD.
+enum utf8_decode_error_action utf8_decode_error_callback_replace(
+    const struct utf8_decode_state* ctx, enum utf8_decode_error error, wchar_t* newch);
+
+/// Standard error callback: skip invalid chars.
+enum utf8_decode_error_action utf8_decode_error_callback_skip(
+    const struct utf8_decode_state* ctx, enum utf8_decode_error error, wchar_t* newch);
+
+
+
+/*!@}*/
+
+/* options for text editors
+kate: replace-trailing-space-save true; space-indent true; tab-width 4;
+*/
--- a/src/libutf8/encode.c
+++ b/src/libutf8/encode.c
@ -0,0 +1,141 @@
+/* libutf8/src/lib/encode.c
+ *
+ *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
+ *  COPYING for more information / terms of license.
+*/
+
+/* Encode the single character `ch` as UTF-8 into `dest`, which has room for `amt` bytes.
+ *
+ * Returns the position just past the bytes written (no NUL is appended). Returns 0 with errno set
+ * to EINVAL (no buffer), EILSEQ (utf8_isvalid() rejects `ch`) or ENOMEM (the sequence needs more
+ * than `amt` bytes).
+ */
+char* utf8_encode_char(char* dest, size_t amt, wchar_t ch)
+{
+    size_t len;     /* total bytes in the encoded sequence */
+    int lead;       /* marker bits of the first byte */
+    int mask;       /* payload bits carried by the first byte */
+    int shift;
+
+    if(!dest || !amt) {
+        errno = EINVAL;
+        return 0;
+    }
+    if(!utf8_isvalid(ch)) {
+        errno = EILSEQ;
+        return 0;
+    }
+
+    /* single byte: always fits, since amt is at least 1 here */
+    if(ch < 0x80) {
+        *dest = ch;
+        return dest + 1;
+    }
+
+    /* pick the sequence layout from the magnitude of the character */
+    if(ch < 0x800) {
+        len = 2; lead = 0xC0; mask = 0x1F;
+    } else if(ch < 0x10000) {
+        len = 3; lead = 0xE0; mask = 0xF;
+    } else if(ch < 0x200000) {
+        len = 4; lead = 0xF0; mask = 0x7;
+    } else if(ch < 0x4000000) {
+        len = 5; lead = 0xF8; mask = 0x3;
+    } else {
+        len = 6; lead = 0xFC; mask = 0x1;
+    }
+
+    if(amt < len) {
+        errno = ENOMEM;
+        return 0;
+    }
+
+    /* first byte takes the topmost bits, then every continuation byte carries six more */
+    shift = 6 * ((int)len - 1);
+    *dest++ = lead | ((ch >> shift) & mask);
+    while(shift > 0) {
+        shift -= 6;
+        *dest++ = 0x80 | ((ch >> shift) & 0x3F);
+    }
+
+    return dest;
+}
+
+
+
+/* Like utf8_encode_char(), but a `ch` that utf8_isvalid() rejects is replaced by `ilseq`.
+ * Fails with EILSEQ straight away if the replacement itself is not valid.
+ */
+char* utf8_encode_char_force(char* dest, size_t amt, wchar_t ch, wchar_t ilseq)
+{
+    wchar_t chosen;
+
+    if(utf8_isvalid(ilseq)) {
+        chosen = ilseq;
+        if(utf8_isvalid(ch)) {
+            chosen = ch;
+        }
+        return utf8_encode_char(dest, amt, chosen);
+    }
+
+    errno = EILSEQ;
+    return 0;
+}
+
+
+
+/* Strict encoding of a null-terminated string: utf8_encode2() without a `written` result. */
+char* utf8_encode(char* dest, size_t amt, const wchar_t* src)
+{
+    /* -1 as the input length: utf8_encode2() stores it into the encoder's int rd_remain, where
+     * a negative count means "scan for the null char" */
+    const size_t until_nul = -1;
+
+    return utf8_encode2(dest, amt, 0, src, until_nul);
+}
+
+
+
+/* Strict one-shot encoding of `inamt` characters from `src` into the `amt`-byte buffer `dest`.
+ *
+ * Runs utf8_encoder() once with no error callback. Returns `dest` on success (storing the byte
+ * count in `*written` if that is not null), or 0 if the encoder failed or if input was left over
+ * after the single pass (errno is then ENOMEM).
+ */
+char* utf8_encode2(char* dest, size_t amt, size_t* written, const wchar_t* src, size_t inamt)
+{
+    struct utf8_encode_state enc;
+    int input_left;
+
+    memset(&enc, 0, sizeof(enc));
+    enc.wr = dest;
+    enc.wr_size = amt;
+    enc.rd = src;
+    enc.rd_remain = inamt;
+
+    if(!utf8_encoder(&enc)) {
+        return 0;
+    }
+
+    /* a negative count means null-terminated input: done only once the NUL has been reached */
+    if(enc.rd_remain < 0) {
+        input_left = (*enc.rd != 0);
+    } else {
+        input_left = (enc.rd_remain != 0);
+    }
+    if(input_left) {
+        errno = ENOMEM;
+        return 0;
+    }
+
+    if(written) {
+        *written = enc.written;
+    }
+    return dest;
+}
+
+
+
+/* Best-effort encoding of a null-terminated string: utf8_encode_force2() without `written`. */
+char* utf8_encode_force(char* dest, size_t amt, const wchar_t* src)
+{
+    /* -1 as the input length: utf8_encode_force2() stores it into the encoder's int rd_remain,
+     * where a negative count means "scan for the null char" */
+    const size_t until_nul = -1;
+
+    return utf8_encode_force2(dest, amt, 0, src, until_nul);
+}
+
+
+
+/* Best-effort one-shot encoding of `inamt` characters from `src` into the `amt`-byte `dest`.
+ *
+ * Runs utf8_encoder() once with utf8_encode_error_callback_replace installed. Input that did not
+ * fit is silently dropped. Returns `dest` (storing the byte count in `*written` if that is not
+ * null), or 0 if the encoder itself failed.
+ */
+char* utf8_encode_force2(char* dest, size_t amt, size_t* written, const wchar_t* src, size_t inamt)
+{
+    struct utf8_encode_state enc;
+
+    memset(&enc, 0, sizeof(enc));
+    enc.error_callback = utf8_encode_error_callback_replace;
+    enc.wr = dest;
+    enc.wr_size = amt;
+    enc.rd = src;
+    enc.rd_remain = inamt;
+
+    if(utf8_encoder(&enc)) {
+        if(written) {
+            *written = enc.written;
+        }
+        return dest;
+    }
+    return 0;
+}
+
+/* options for text editors
+kate: replace-trailing-space-save true; space-indent true; tab-width 4;
+*/
--- a/src/libutf8/encode.h
+++ b/src/libutf8/encode.h
@ -0,0 +1,144 @@
+/* libutf8/src/lib/encode.h
+ *
+ *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
+ *  COPYING for more information / terms of license.
+*/
+
+/*! \defgroup encode UTF-8 encoding routines.
+
+The functions in this module allow encoding of UTF-8 characters. Errors are reported through
+\c errno, with the following errors being of particular interest:
+
+\li \c EINVAL - invalid argument to function
+\li \c EILSEQ - illegal source character (see utf8_isvalid())
+\li \c ENOMEM - not enough space in destination buffer
+
+*/
+/*!@{*/
+
+
+
+/*! \brief Encode a single character into UTF-8.
+
+\param dest The destination buffer.
+\param amt Number of bytes in destination buffer.
+\param ch Character to encode.
+\returns Pointer to next byte of buffer to use.
+\retval 0 on error (see \c errno).
+
+This function will encode a single character into UTF-8. It returns a pointer to the end of the
+character (i.e. the next position in the buffer you want to write to).
+
+On error, it sets \c errno (to \c EINVAL if \a dest is null or \a amt is less than 1; \c EILSEQ
+if \a ch is not valid; or \c ENOMEM if the result would not fit into
+\a amt bytes) and returns 0.
+
+*/
+char* utf8_encode_char(char* dest, size_t amt, wchar_t ch);
+
+
+
+/*! \brief Encode a single character into UTF-8, forcing replacement of invalid characters.
+
+\param dest The destination buffer.
+\param amt Number of bytes in destination buffer.
+\param ch Character to encode.
+\param ilseq If \a ch is not a legal character, then this is encoded instead.
+\returns Pointer to next byte of buffer to use.
+\retval 0 on error (see \c errno).
+
+This function will encode a single character into UTF-8. It returns a pointer to the end of the
+character (i.e. the next position in the buffer you want to write to). If the source character \a ch
+is not a valid code point, it will instead encode the character \a ilseq.
+
+On error, it sets \c errno (to \c EINVAL, if \a dest is null or \a amt is less than 1; \c EILSEQ
+if \a ilseq is not valid; or \c ENOMEM if the result would not fit into
+\a amt bytes) and returns 0.
+
+*/
+char* utf8_encode_char_force(char* dest, size_t amt, wchar_t ch, wchar_t ilseq);
+
+
+
+/*! \brief Encode a null-terminated string into UTF-8.
+
+\param dest The destination buffer.
+\param amt Number of bytes in the destination buffer.
+\param src Null-terminated source string.
+\returns Pointer to destination buffer.
+\retval 0 on error (see \c errno).
+
+This function encodes a null-terminated Unicode string into the destination buffer. It returns a
+pointer to the destination buffer on success, and 0 on error. If there is not enough space in the
+buffer, or an illegal character is encountered somewhere in the sequence, it will fail. The buffer
+must be at least 7 bytes long, otherwise the call fails with \c EINVAL.
+
+*/
+char* utf8_encode(char* dest, size_t amt, const wchar_t* src);
+
+
+
+/*! \brief Encode a fixed-size string into UTF-8.
+
+\param dest The destination buffer.
+\param amt Number of bytes in the destination buffer.
+\param written Set to number of bytes written on success (excluding NUL).
+\param src Pointer to source string.
+\param inamt Number of characters to encode.
+\returns Pointer to destination buffer.
+\retval 0 on error (see \c errno).
+
+This function encodes a Unicode string (possibly containing ASCII NUL) into the destination buffer.
+It returns a pointer to the destination buffer on success, and 0 on error. If there is not enough
+space in the buffer, or an illegal character is encountered somewhere in the sequence, it will fail.
+The destination will be null-terminated.
+
+*/
+char* utf8_encode2(char* dest, size_t amt, size_t* written, const wchar_t* src, size_t inamt);
+
+
+
+/*! \brief Encode a null-terminated string into UTF-8, ignoring errors.
+
+\param dest The destination buffer.
+\param amt Number of bytes in the destination buffer.
+\param src Null-terminated source string.
+\returns Pointer to destination buffer.
+\returns 0 if arguments are invalid.
+
+This function will encode a null-terminated Unicode string into the destination buffer, making a
+best-effort in the case of failures. If there is not enough memory, the destination string will be
+truncated (but still null-terminated). If an illegal source character is encountered, it is replaced
+with the Unicode replacement character U+FFFD. The function can only fail if one of the arguments is
+invalid.
+
+*/
+char* utf8_encode_force(char* dest, size_t amt, const wchar_t* src);
+
+
+
+/*! \brief Encode a fixed-size string into UTF-8, ignoring errors.
+
+\param dest The destination buffer.
+\param amt Number of bytes in the destination buffer.
+\param written Set to number of bytes written on success (excluding NUL).
+\param src Pointer to source string (need not be null-terminated).
+\param inamt Number of characters to encode.
+\returns Pointer to destination buffer.
+\returns 0 if arguments are invalid.
+
+This function will encode a Unicode string (possibly containing ASCII NUL) into the destination
+buffer, making a best-effort in the case of failures. If there is not enough memory, the destination
+string will be truncated (but still null-terminated). If an illegal source character is encountered,
+it is replaced with the Unicode replacement character U+FFFD. The function can only fail if one of
+the arguments is invalid.
+
+*/
+char* utf8_encode_force2(char* dest, size_t amt, size_t* written, const wchar_t* src, size_t inamt);
+
+
+
+/*!@}*/
+
+/* options for text editors
+kate: replace-trailing-space-save true; space-indent true; tab-width 4;
+*/
--- a/src/libutf8/encode_state.c
+++ b/src/libutf8/encode_state.c
@ -0,0 +1,88 @@
+/* libutf8/src/lib/encode_state.c
+ *
+ *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
+ *  COPYING for more information / terms of license.
+*/
+
+/* Stateful UTF-8 encoder: read characters from state->rd and write UTF-8 to state->wr.
+ *
+ * Encoding stops when the input is exhausted (rd_remain reaches 0, or the null char is seen when
+ * rd_remain is negative) or when the next character no longer fits in front of the terminating
+ * NUL. The output is always NUL-terminated; state->written excludes that terminator.
+ *
+ * Returns state on success. Returns 0 with errno set to EINVAL if state is null or incomplete
+ * (no source, no destination, or a destination of fewer than 7 bytes), or to EILSEQ if an illegal
+ * source character is met and there is no callback, the callback aborts, the callback returns an
+ * unknown action, or its replacement is illegal as well.
+ */
+struct utf8_encode_state* utf8_encoder(struct utf8_encode_state* state)
+{
+    char* wr, * ret, * endp;
+    wchar_t ch;
+    enum utf8_encode_error_action error_action;
+    bool reencoding;
+
+    // validate before touching any member: state itself may be null
+    if(!state || !state->rd || !state->wr || state->wr_size < 7) {
+        errno = EINVAL;
+        return 0;
+    }
+
+    wr = state->wr;
+    endp = wr + state->wr_size - 1; // last byte is reserved for the terminating NUL
+
+    state->written = 0;
+    while(state->rd_remain) {
+        ch = *state->rd;
+        if(!ch && state->rd_remain < 0) break;
+
+        reencoding = false;
+    reencode:
+        ret = utf8_encode_char(wr, endp - wr, ch);
+        if(!ret) {
+            // out of space: leave this character for the next call
+            if(errno == ENOMEM) break;
+
+            // illegal character; a replacement that is illegal too is not retried
+            if(!state->error_callback || reencoding) {
+                errno = EILSEQ;
+                return 0;
+            }
+            error_action = state->error_callback(state, &ch);
+            switch(error_action) {
+            case utf8_encode_error_action_abort:
+                errno = EILSEQ;
+                return 0;
+
+            case utf8_encode_error_action_replace:
+                reencoding = true;
+                goto reencode;
+
+            case utf8_encode_error_action_skip:
+                ret = wr; // nothing written for this character
+                break;
+
+            default:
+                // unknown action: treat as abort rather than carrying on with a null `ret`
+                errno = EILSEQ;
+                return 0;
+            }
+        }
+        if(state->rd_remain > 0) state->rd_remain--;
+        ++state->rd;
+        ++state->char_offset;
+        if(ch == 0x0A || ch == 0x2028) {
+            ++state->line;
+            state->col = 0;
+        } else {
+            ++state->col;
+        }
+        state->written += ret - wr;
+        wr = ret;
+        if(wr == endp) break;
+    }
+    *wr = 0;
+    return state;
+}
+
+
+
+enum utf8_encode_error_action utf8_encode_error_callback_replace(
+    const struct utf8_encode_state* state, wchar_t* newch)
+{
+    (void)state;
+    *newch = 0xFFFD;
+    return utf8_encode_error_action_replace;
+}
+
+
+
+enum utf8_encode_error_action utf8_encode_error_callback_skip(
+    const struct utf8_encode_state* state, wchar_t* newch)
+{
+    (void)state;
+    (void)newch;
+    return utf8_encode_error_action_skip;
+}
+
+/* options for text editors
+kate: replace-trailing-space-save true; space-indent true; tab-width 4;
+*/
--- a/src/libutf8/encode_state.h
+++ b/src/libutf8/encode_state.h
@ -0,0 +1,158 @@
+/* libutf8/src/lib/encode_state.h
+ *
+ *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
+ *  COPYING for more information / terms of license.
+*/
+
+/*! \defgroup encode_state UTF-8 stateful encoder.
+
+This UTF-8 encoder uses a structure to maintain state information between calls. This means that
+you can feed it a stream of data as it comes in without needing to store the entire source in a
+buffer.
+
+Errors (i.e. illegal source chars; see utf8_isvalid()) are handled by providing a callback function
+(several of which are provided by the library). The callback function has the option of aborting
+the conversion, substituting a replacement character, or simply skipping the illegal source
+character.
+
+*/
+/*!@{*/
+
+
+
+/*! \brief Action to be taken after error callback.
+
+These are the possible actions that can be undertaken after a stateful encoding operation has
+encountered an error (illegal source char). These actions are specified by the error callback
+function's return value.
+
+*/
+enum utf8_encode_error_action {
+    /// Abort the conversion: utf8_encoder() returns 0 with \c errno set to \c EILSEQ.
+    utf8_encode_error_action_abort,
+
+    /// Skip the illegal source character; nothing is written to the output for it.
+    utf8_encode_error_action_skip,
+
+    /// Encode the character the callback stored in \a newch instead of the illegal one.
+    utf8_encode_error_action_replace
+};
+
+
+
+
+/*! \brief Error callback type.
+
+\param state The encoder state information.
+\param[in,out] newch On entry, points at a copy of the illegal source character. If
+    \a utf8_encode_error_action_replace is returned, this is set to the
+    character that should be substituted instead of the illegal source character.
+\returns A value specifying what action the encoder should take (abort, skip or replace).
+
+This function is called whenever an error occurs. It can examine \a state (and specifically
+\a *state.rd) to determine the illegal source character. It can choose to skip the character, replace
+it with something else, or abort the conversion entirely.
+
+If the replacement character is itself illegal, the callback is not asked again: the conversion
+is aborted with \c EILSEQ.
+
+*/
+typedef enum utf8_encode_error_action (*utf8_encode_error_callback)(
+    const struct utf8_encode_state* state, wchar_t* newch);
+
+/// Standard error callback: use replacement char 0xFFFD.
+enum utf8_encode_error_action utf8_encode_error_callback_replace(
+    const struct utf8_encode_state* state, wchar_t* newch);
+
+/// Standard error callback: skip invalid chars.
+enum utf8_encode_error_action utf8_encode_error_callback_skip(
+    const struct utf8_encode_state* state, wchar_t* newch);
+
+
+
+/*! \brief State structure used to encode Unicode into UTF-8.
+
+This structure is used to encode an arbitrary Unicode string into UTF-8. To set it up, first call
+\a memset to clear the structure to zero. You will then
+want to set \a rd to point to your input string, with \a rd_remain the number of characters to encode
+(you can set it to a negative number if \a rd is null-terminated and you want to encode the whole
+thing). You will also want to tell it where to write to (\a wr) and how much space there is in that
+buffer (\a wr_size).
+
+To deal with errors (illegal input chars), you can provide a callback function \a error_callback.
+An arbitrary \a data pointer is provided in case you wish to associate some object with the encode
+operation. Passing a null pointer for \a error_callback is a valid way of indicating you do not
+wish to attempt to correct errors.
+
+You can examine the \a line and \a col variables to get the line / column of the input data at which
+the encoder is currently operating. These variables aren't perfect, as they can be
+affected by errors and limitations (only 0x0A and 0x2028 are recognised as line end chars, and the
+effect of tabs is ignored). \a char_offset represents the offset, in complete characters, from the
+start of the stream, and should always be accurate.
+
+*/
+struct utf8_encode_state {
+    /// Current read position; advanced by utf8_encoder() past every character it consumes.
+    const wchar_t* rd;
+
+    /// Number of chars remaining (-ve means to scan for null char; it is then never counted down).
+    int rd_remain;
+
+    /// Callback function used to handle illegal source characters (0: abort with \c EILSEQ).
+    utf8_encode_error_callback error_callback;
+
+    /// Output buffer (utf8_encoder() writes from here on each call, never moves it).
+    char* wr;
+
+    /// Output buffer size in bytes, including the terminating NUL (minimum 7).
+    size_t wr_size;
+
+    /// Number of bytes written during last call, excluding the terminating NUL.
+    size_t written;
+
+    /// Arbitrary pointer (useful for \a error_callback); not used by utf8_encoder() itself.
+    void* data;
+
+    /// Current line (starting from 0); incremented on 0x0A and 0x2028.
+    int line;
+
+    /// Current column (starting from 0); reset to 0 whenever \a line is incremented.
+    int col;
+
+    /// Character offset from start of data (starting from 0).
+    int char_offset;
+};
+
+
+
+/*! \brief Encode an arbitrary Unicode string.
+
+\param state The encoder state information.
+\retval state on success.
+\retval 0 on error (see \c errno).
+
+This function is used to encode some arbitrary Unicode string into UTF-8. It uses a state-storage
+structure which allows you to perform the encoding in multiple passes (e.g. if you are encoding
+an arbitrary string and outputting it, you will want to use a fixed size buffer and this might
+be smaller than required).
+
+In each pass of the function, \a rd and \a rd_remain will be updated to record the current reading
+position and the number of characters left to encode. If the function completes this pass, \a rd_remain
+will be zero (but if you are converting a null-terminated string, you will need to check for \a *rd
+to be zero instead).
+
+After each call, \a wr will be unchanged but \a written will contain the number of bytes written
+(excluding a terminating null, which is always written). If you do not want to overwrite this data
+on the next call, you will have to update \a wr and \a wr_size.
+
+If \a state is null, or not filled out properly (no source data or destination buffer not at least 7
+bytes large), then no conversion will be performed and \a EINVAL will be stored in \a errno. If an
+illegal source character is encountered, and the error callback is 0, aborts the process or tries
+to replace the char with another illegal code point, then \a EILSEQ will be stored in \a errno. On
+error, 0 will be returned.
+
+*/
+struct utf8_encode_state* utf8_encoder(struct utf8_encode_state* state);
+
+
+
+/*!@}*/
+
+/* options for text editors
+kate: replace-trailing-space-save true; space-indent true; tab-width 4;
+*/
--- a/src/libutf8/pkgconf.in
+++ b/src/libutf8/pkgconf.in
@ -0,0 +1,21 @@
+# libutf8/src/lib/clib/pkgconf.in
+#
+#  Metadata file for pkg-config
+#  ( http://www.freedesktop.org/software/pkgconfig/ )
+#
+#  (c)2006, Laurence Withers, <l@lwithers.me.uk>.
+#  Released under the GNU GPLv2. See file COPYING or
+#  http://www.gnu.org/copyleft/gpl.html for details.
+#
+
+# Name, description
+Name: libutf8
+Description: Library for encoding and decoding UTF-8
+Version: @VERSION@
+
+# Requirements
+Requires:
+
+# Compilation information
+Libs: -L@LIBDIR@ -lutf8
+Cflags: -I@INCLUDEDIR@
--- a/src/libutf8/soversion
+++ b/src/libutf8/soversion
@ -0,0 +1,17 @@
+# libutf8/src/libutf8/soversion
+#
+#  (c)2006, Laurence Withers, <l@lwithers.me.uk>.
+#  Released under the GNU GPLv2. See file COPYING or
+#  http://www.gnu.org/copyleft/gpl.html for details.
+#
+
+
+
+# SOMAJOR and SOMINOR are included in the library's soname. They need to
+# be bumped on a binary-incompatible release. They are both single
+# integers.
+SOMAJOR=0
+SOMINOR=0
+
+# SOMICRO is bumped every time there is a binary-compatible release.
+SOMICRO=0
--- a/src/tests/.params
+++ b/src/tests/.params
@ -0,0 +1 @@
+c tests tests libutf8
--- a/src/tests/build.default
+++ b/src/tests/build.default
@ -0,0 +1,3 @@
+# Delegate to the script that builds the test programs (src/tests/build.tests).
+source src/tests/build.tests
+# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
+# vim: expandtab:ts=4:sw=4
--- a/src/tests/build.tests
+++ b/src/tests/build.tests
@ -0,0 +1,43 @@
+# Builds every src/tests/*.c file into a test program under obj/tests/,
+# passing ${libutf8} to the compiler alongside the source. A program is
+# rebuilt only when one of its inputs (the library, its source file or this
+# script) is newer than the existing binary.
+#
+# These are external variables, and shouldn't clash with anything else
+#  tests_BUILT
+#
+
+build_target libutf8 || return 1
+
+# tests_BUILT guards against building twice; it is quoted so that the test
+# stays well-formed while the variable is still unset.
+if [ -z "${tests_BUILT}" ]
+then
+    LIBS="${libutf8} "
+    EXTRAS=""
+
+    echo "Building test programs..."
+    do_cmd mkdir -p obj/tests || return 1
+
+    for SRC in src/tests/*.c
+    do
+        # an unmatched glob is left as a literal pattern; skip it
+        [ -e "${SRC}" ] || continue
+
+        # strip the directory and the literal ".c" suffix
+        TEST="obj/tests/$(basename "${SRC}" .c)"
+        MODIFIED=0
+        # LIBS is left unquoted so that it word-splits if it holds several entries
+        for file in ${LIBS} "${SRC}" src/tests/build.tests
+        do
+            if [ "${file}" -nt "${TEST}" ]
+            then
+                MODIFIED=1
+                break
+            fi
+        done
+
+        if [ "${MODIFIED}" -ne 0 ]
+        then
+            do_cmd ${CC} -Iobj ${CFLAGS} -o "${TEST}" "${SRC}" ${LIBS} ${EXTRAS} || return 1
+            print_success "Built ${TEST}"
+        else
+            print_success "${TEST} is up to date"
+        fi
+    done
+
+    print_success "All tests built"
+
+    tests_BUILT=1
+fi
+
+# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
+# vim: expandtab:ts=4:sw=4
--- a/src/tests/decode.c
+++ b/src/tests/decode.c
@ -0,0 +1,107 @@
+/* libutf8/src/tests/decode.c
+ *
+ *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
+ *  COPYING for more information / terms of license.
+*/
+
+#include "utf8.h"
+
+#include <stdio.h>
+#include <string.h>
+
+
+
+/* Writes amt wide characters from x to stdout in their raw in-memory
+ * (wchar_t) representation. A non-positive count writes nothing; a short or
+ * failed write is reported on stderr.
+ */
+void writeout(const wchar_t* x, int amt)
+{
+    size_t count;
+
+    // a negative count would otherwise be converted to a huge size_t
+    if(amt <= 0) return;
+
+    count = (size_t)amt;
+    if(fwrite(x, sizeof(wchar_t), count, stdout) != count) {
+        perror("fwrite");
+    }
+}
+
+
+
+/* Decoder error callback: reports where the error occurred and what kind of
+ * error it was on stderr, then requests that the offending input be replaced.
+ *
+ *  ctx    decoder state; only its position counters are read
+ *  error  the kind of error that was encountered
+ *  newch  receives the replacement character (always U+FFFD)
+ *
+ * Always returns utf8_decode_error_action_replace.
+ */
+enum utf8_decode_error_action error_callback(
+    const struct utf8_decode_state* ctx, enum utf8_decode_error error, wchar_t* newch)
+{
+    // line and column are shown 1-based
+    fprintf(stderr, "Line %d, col %d (char %d, byte %d): ",
+            ctx->line + 1, ctx->col + 1, ctx->char_offset, ctx->byte_offset);
+    switch(error) {
+    case utf8_decode_error_lone_cchar:
+        fprintf(stderr, "a lone continuation char was encountered.\n");
+        break;
+
+    case utf8_decode_error_not_cchar:
+        fprintf(stderr, "a continuation char was expected, but not encountered.\n");
+        break;
+
+    case utf8_decode_error_not_schar:
+        fprintf(stderr, "an invalid character was encountered (not start char).\n");
+        break;
+
+    case utf8_decode_error_overlong:
+        fprintf(stderr, "an overlong character sequence was encountered.\n");
+        break;
+
+    case utf8_decode_error_illegal_cp:
+        fprintf(stderr, "an illegal code point was encountered.\n");
+        break;
+
+    default:
+        // always finish the line, even for an error code this test doesn't know
+        fprintf(stderr, "an unrecognised error was encountered.\n");
+        break;
+    }
+
+    *newch = 0xFFFD;
+    return utf8_decode_error_action_replace;
+}
+
+
+
+/* Decodes UTF-8 read from stdin and writes the resulting wide characters to
+ * stdout. With the single argument "--print-summary", prints a one-line
+ * description instead.
+ *
+ * Returns 0 on success (including when the input stops part-way through a
+ * character, which is only reported), and 1 on bad usage, a read error or a
+ * decoder failure.
+ */
+int main(int argc, char* argv[])
+{
+    char inbuf[1024];
+    wchar_t outbuf[1024];
+    struct utf8_decode_state ctx;
+    size_t amt;
+
+    if(argc == 2 && !strcmp(argv[1], "--print-summary")) {
+        printf("Decodes UTF-8 on stdin to UCS-4 on stdout.\n");
+        return 0;
+    }
+
+    if(argc != 1) {
+        fprintf(stderr, "No parameters expected. This program decodes UTF-8 presented on stdin\n"
+                "and transforms it to UCS-4 on stdout.\n");
+        return 1;
+    }
+
+    // set up ctx structure
+    memset(&ctx, 0, sizeof(ctx));
+    ctx.wr = outbuf;
+    ctx.wr_size = sizeof(outbuf) / sizeof(wchar_t);
+    ctx.error_callback = error_callback;
+
+    // loop over input; fread() returns 0 both at end of file and on a read
+    // error, so testing its result (rather than feof()) always terminates
+    while((amt = fread(inbuf, 1, sizeof(inbuf), stdin)) > 0) {
+        ctx.rd = inbuf;
+        ctx.rd_remain = amt;
+
+        // decode it; one block of input may need several passes
+        while(ctx.rd_remain) {
+            if(!utf8_decoder(&ctx)) {
+                perror("utf8_decoder");
+                fprintf(stderr, "(at line %d, col %d, char %d, byte %d)\n",
+                        ctx.line + 1, ctx.col + 1, ctx.char_offset, ctx.byte_offset);
+                return 1;
+            }
+
+            // write output
+            writeout(outbuf, ctx.written);
+        }
+    }
+
+    if(ferror(stdin)) {
+        perror("fread");
+        return 1;
+    }
+
+    if(!ctx.complete) {
+        fprintf(stderr, "Input did not end on a character boundary.\n");
+    }
+
+    return 0;
+}
+
+
+/* options for text editors
+kate: replace-trailing-space-save true; space-indent true; tab-width 4;
+*/
--- a/src/tests/random.c
+++ b/src/tests/random.c
@ -0,0 +1,165 @@
+/* libutf8/src/tests/random.c
+ *
+ *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
+ *  COPYING for more information / terms of license.
+*/
+
+#include "utf8.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+
+
+
+/* Fills buf with ch random wide characters read from /dev/urandom, each
+ * masked to 31 bits (the top bit is cleared). A non-positive count leaves buf
+ * untouched. Exits the process with status 1 if the device cannot be opened
+ * or does not supply enough data.
+ */
+void make_rand(wchar_t* buf, int ch)
+{
+    char* pos = (char*)buf;
+    size_t remain;
+    ssize_t got;
+    int fd;
+
+    if(ch <= 0) return;
+    remain = (size_t)ch * sizeof(wchar_t);
+
+    fd = open("/dev/urandom", O_RDONLY);
+    if(fd < 0) {
+        perror("open(\"/dev/urandom\")");
+        exit(1);
+    }
+
+    // read() may legitimately return less than was asked for, so keep going
+    // until the buffer is full
+    while(remain) {
+        got = read(fd, pos, remain);
+        if(got < 0) {
+            perror("read(\"/dev/urandom\")");
+            exit(1);
+        }
+        if(!got) {
+            // errno is not set at end of file, so perror() would mislead
+            fprintf(stderr, "read(\"/dev/urandom\"): unexpected end of file\n");
+            exit(1);
+        }
+        pos += got;
+        remain -= (size_t)got;
+    }
+    close(fd);
+
+    while(ch--) {
+        buf[ch] &= 0x7FFFFFFF;
+    }
+}
+
+
+
+/* Encodes amt wide characters from src into dest (capacity size bytes),
+ * handing the encoder a fixed 20-byte output window on each pass so that the
+ * multi-pass interface is exercised.
+ *
+ * Returns the number of bytes written to dest. Exits the process with status
+ * 1 if the encoder fails or if a full window no longer fits in dest.
+ */
+int do_encode(char* dest, size_t size, wchar_t* src, size_t amt)
+{
+    enum { window = 20 };
+    struct utf8_encode_state ctx;
+    memset(&ctx, 0, sizeof(ctx));
+
+    ctx.rd = src;
+    ctx.rd_remain = amt;
+    ctx.wr = dest;
+    ctx.wr_size = window;
+
+    while(ctx.rd_remain) {
+        // check before every pass (including the first) that the window lies
+        // inside dest; compare offsets rather than forming a pointer past the end
+        if((size_t)(ctx.wr - dest) + window > size) {
+            fprintf(stderr, "do_encode: we're going to run out of memory\n");
+            exit(1);
+        }
+
+        if(!utf8_encoder(&ctx)) {
+            perror("utf8_encoder");
+            exit(1);
+        }
+
+        // advance so that the next pass does not overwrite this output
+        ctx.wr += ctx.written;
+    }
+
+    return ctx.wr - dest;
+}
+
+
+
+/* Returns the smaller of x and y. */
+int MIN(int x, int y)
+{
+    if(y <= x) {
+        return y;
+    }
+    return x;
+}
+
+
+
+/* Decodes amt bytes from src into dest (capacity size wide characters) with a
+ * single decoder call. Exits the process with status 1 if the decoder fails,
+ * leaves input unconsumed, or stops part-way through a character.
+ */
+void do_decode_easy(wchar_t* dest, size_t size, const char* src, size_t amt)
+{
+    struct utf8_decode_state dec;
+
+    // start from an all-zero state, then describe the two buffers
+    memset(&dec, 0, sizeof(dec));
+    dec.wr = dest;
+    dec.wr_size = size;
+    dec.rd = src;
+    dec.rd_remain = amt;
+
+    // one pass is expected to consume everything
+    if(!utf8_decoder(&dec)) {
+        perror("[easy] utf8_decoder");
+        exit(1);
+    }
+
+    if(dec.rd_remain != 0) {
+        fprintf(stderr, "do_decode_easy: %d bytes left in buffer\n", dec.rd_remain);
+        exit(1);
+    }
+
+    if(!dec.complete) {
+        fprintf(stderr, "do_decode_easy: incomplete character at end of data\n");
+        exit(1);
+    }
+}
+
+
+
+/* Decodes amt bytes from src into dest (capacity size wide characters) in
+ * small pieces: the decoder is fed at most 20 input bytes and offered at most
+ * 20 output characters at a time, so that its multi-pass interface is
+ * exercised. Exits the process with status 1 if the decoder fails.
+ */
+void do_decode(wchar_t* dest, size_t size, const char* src, size_t amt)
+{
+    enum { chunk = 20 };
+    size_t space;
+    struct utf8_decode_state ctx;
+    memset(&ctx, 0, sizeof(ctx));
+
+    ctx.rd = src;
+    ctx.rd_remain = MIN(chunk, amt);
+    amt -= ctx.rd_remain;
+    ctx.wr = dest;
+    ctx.wr_size = MIN(chunk, size);
+
+    while(ctx.rd_remain) {
+        if(!utf8_decoder(&ctx)) {
+            perror("utf8_decoder");
+            exit(1);
+        }
+
+        // input chunk used up: hand over the next one (the read position in
+        // ctx.rd is not reset here)
+        if(!ctx.rd_remain) {
+            ctx.rd_remain = MIN(chunk, amt);
+            amt -= ctx.rd_remain;
+        }
+
+        // move past what was just written, then shrink the output window to
+        // the space that is really left so it never extends beyond dest
+        ctx.wr += ctx.written;
+        space = size - (size_t)(ctx.wr - dest);
+        if(ctx.wr_size > space) {
+            ctx.wr_size = space;
+        }
+    }
+}
+
+
+
+/* Compares count wide characters of got against expect. On a mismatch,
+ * reports the stage named by what and every differing position on stderr.
+ * Returns 0 when the buffers match and 1 otherwise.
+ */
+static int check_output(const char* what, const wchar_t* expect, const wchar_t* got, int count)
+{
+    int i;
+
+    if(!memcmp(expect, got, count * sizeof(wchar_t))) {
+        return 0;
+    }
+
+    fprintf(stderr, "Output doesn't match input!\n");
+    fprintf(stderr, "(after %s)\n", what);
+    for(i = 0; i < count; ++i) {
+        if(expect[i] != got[i]) {
+            // %X requires an unsigned int argument
+            fprintf(stderr, "%4d: %08X != %08X\n", i,
+                    (unsigned int)expect[i], (unsigned int)got[i]);
+        }
+    }
+    return 1;
+}
+
+
+
+/* Round-trip test: encodes a buffer of random wide characters, then decodes
+ * it twice (in a single pass and in small chunks) and checks that each decode
+ * reproduces the input. With the single argument "--print-summary", prints a
+ * one-line description instead.
+ *
+ * Returns 0 on success and 1 if either decode differs from the input.
+ */
+int main(int argc, char* argv[])
+{
+    enum { nchars = 1024 };
+    wchar_t wbuf[nchars], wbuf2[nchars + 1];
+    char cbuf[8192];
+    int amt;
+
+    if(argc == 2 && !strcmp(argv[1], "--print-summary")) {
+        printf("Encodes and decodes random well-formed strings.\n");
+        return 0;
+    }
+
+    make_rand(wbuf, nchars);
+    amt = do_encode(cbuf, sizeof(cbuf), wbuf, nchars);
+
+    // Before each decode the output buffer is filled with 0xFF bytes: that
+    // pattern has the top bit set, which make_rand() always clears, so
+    // anything the decoder fails to write shows up as a mismatch.
+    memset(wbuf2, 0xFF, sizeof(wbuf2));
+    do_decode_easy(wbuf2, nchars + 1, cbuf, amt);
+    if(check_output("do_decode_easy", wbuf, wbuf2, nchars)) {
+        return 1;
+    }
+
+    memset(wbuf2, 0xFF, sizeof(wbuf2));
+    do_decode(wbuf2, nchars + 1, cbuf, amt);
+    if(check_output("do_decode", wbuf, wbuf2, nchars)) {
+        return 1;
+    }
+
+    printf("Success.\n");
+    return 0;
+}
+
+
+/* options for text editors
+kate: replace-trailing-space-save true; space-indent true; tab-width 4;
+*/
--- a/src/tests/template
+++ b/src/tests/template
@ -0,0 +1,35 @@
+/* libutf8/src/tests/???.c
+ *
+ *  (c)2006, Laurence Withers, <l@lwithers.me.uk>.
+ *  Released under the GNU GPLv2. See file COPYING or
+ *  http://www.gnu.org/copyleft/gpl.html for details.
+*/
+
+#include "utf8.h"
+
+#include <stdio.h>
+#include <string.h>
+
+
+
+/* Skeleton entry point for a new test program: handles the "--print-summary"
+ * argument, then returns ret (0 unless the test body changes it).
+ */
+int main(int argc, char* argv[])
+{
+    int ret = 0;
+
+    // summary mode: print the one-line description and stop
+    if(argc == 2 && strcmp(argv[1], "--print-summary") == 0) {
+        printf("One line summary.\n");
+        return 0;
+    }
+
+    if(argc == 1) {
+        // empty argument list
+    }
+
+    // TODO
+
+    return ret;
+}
+
+/* options for text editors
+kate: replace-trailing-space-save true; space-indent true; tab-width 4;
+vim: expandtab:ts=4:sw=4
+*/