Copy from svn repository.

2006-07-31 15:40:56 +01:00 · 2006-07-31 15:40:56 +01:00 · ac22dabfe6
parent 73d6e6fbd0
commit ac22dabfe6
34 changed files with 1557 additions and 1 deletions
--- a/11
+++ b/11
@ -10,5 +10,14 @@ Really Quick Instructions
 To build: ./make.sh
 To install: ./make.sh install
    (you might want to set PREFIX, by default it's /usr/local)
 Documentation is automatically built using doxygen.
-@TODO@
+Dependencies
 ------------
 libutf8, http://www.lwithers.me.uk/projects/libutf8/
 Project Homepage
 ----------------
 http://www.lwithers.me.uk/projects/libutf8++/
--- a/src/docs/.params
+++ b/src/docs/.params
@ -0,0 +1 @@
 doxygen docs docs
--- a/src/docs/Doxyfile.in
+++ b/src/docs/Doxyfile.in
@ -0,0 +1,146 @@
 # libutf8++/src/docs/Doxyfile.in
 #
 #  (c)2006, Laurence Withers, <l@lwithers.me.uk>.
 #  Released under the GNU GPLv2. See file COPYING or
 #  http://www.gnu.org/copyleft/gpl.html for details.
 #
 PROJECT_NAME           = libutf8++
 OUTPUT_DIRECTORY       =
 CREATE_SUBDIRS         = NO
 OUTPUT_LANGUAGE        = English
 USE_WINDOWS_ENCODING   = NO
 BRIEF_MEMBER_DESC      = YES
 REPEAT_BRIEF           = YES
 ABBREVIATE_BRIEF       =
 ALWAYS_DETAILED_SEC    = NO
 INLINE_INHERITED_MEMB  = YES
 FULL_PATH_NAMES        = NO
 STRIP_FROM_PATH        =
 STRIP_FROM_INC_PATH    =
 SHORT_NAMES            = NO
 JAVADOC_AUTOBRIEF      = NO
 MULTILINE_CPP_IS_BRIEF = YES
 DETAILS_AT_TOP         = YES
 INHERIT_DOCS           = YES
 DISTRIBUTE_GROUP_DOC   = NO
 TAB_SIZE               = 4
 ALIASES                =
 OPTIMIZE_OUTPUT_FOR_C  = NO
 OPTIMIZE_OUTPUT_JAVA   = NO
 SUBGROUPING            = YES
 EXTRACT_ALL            = NO
 EXTRACT_PRIVATE        = NO
 EXTRACT_STATIC         = NO
 EXTRACT_LOCAL_CLASSES  = NO
 EXTRACT_LOCAL_METHODS  = NO
 HIDE_UNDOC_MEMBERS     = NO
 HIDE_UNDOC_CLASSES     = NO
 HIDE_FRIEND_COMPOUNDS  = YES
 HIDE_IN_BODY_DOCS      = NO
 INTERNAL_DOCS          = NO
 CASE_SENSE_NAMES       = YES
 HIDE_SCOPE_NAMES       = NO
 SHOW_INCLUDE_FILES     = NO
 INLINE_INFO            = YES
 SORT_MEMBER_DOCS       = YES
 SORT_BRIEF_DOCS        = NO
 SORT_BY_SCOPE_NAME     = NO
 GENERATE_TODOLIST      = YES
 GENERATE_TESTLIST      = YES
 GENERATE_BUGLIST       = YES
 GENERATE_DEPRECATEDLIST= YES
 ENABLED_SECTIONS       =
 MAX_INITIALIZER_LINES  = 30
 SHOW_USED_FILES        = NO
 SHOW_DIRECTORIES       = NO
 FILE_VERSION_FILTER    =
 QUIET                  = YES
 WARNINGS               = YES
 WARN_IF_UNDOCUMENTED   = YES
 WARN_IF_DOC_ERROR      = YES
 WARN_NO_PARAMDOC       = YES
 WARN_FORMAT            = "$file:$line: $text"
 WARN_LOGFILE           =
 FILE_PATTERNS          =
 RECURSIVE              = NO
 EXCLUDE                =
 EXCLUDE_SYMLINKS       = NO
 EXCLUDE_PATTERNS       =
 EXAMPLE_PATH           =
 EXAMPLE_PATTERNS       =
 EXAMPLE_RECURSIVE      = NO
 IMAGE_PATH             = src/docs
 INPUT_FILTER           =
 FILTER_PATTERNS        =
 FILTER_SOURCE_FILES    = NO
 SOURCE_BROWSER         = NO
 INLINE_SOURCES         = NO
 STRIP_CODE_COMMENTS    = YES
 REFERENCED_BY_RELATION = YES
 REFERENCES_RELATION    = YES
 VERBATIM_HEADERS       = NO
 ALPHABETICAL_INDEX     = YES
 COLS_IN_ALPHA_INDEX    = 5
 IGNORE_PREFIX          =
 GENERATE_HTML          = YES
 HTML_OUTPUT            = html
 HTML_FILE_EXTENSION    = .html
 HTML_HEADER            =
 HTML_FOOTER            =
 HTML_STYLESHEET        =
 HTML_ALIGN_MEMBERS     = YES
 GENERATE_HTMLHELP      = NO
 CHM_FILE               =
 HHC_LOCATION           =
 GENERATE_CHI           = NO
 BINARY_TOC             = NO
 TOC_EXPAND             = NO
 DISABLE_INDEX          = NO
 ENUM_VALUES_PER_LINE   = 4
 GENERATE_TREEVIEW      = NO
 TREEVIEW_WIDTH         = 250
 GENERATE_LATEX         = NO
 GENERATE_RTF           = NO
 GENERATE_MAN           = NO
 GENERATE_XML           = NO
 GENERATE_AUTOGEN_DEF   = NO
 GENERATE_PERLMOD       = NO
 ENABLE_PREPROCESSING   = YES
 MACRO_EXPANSION        = NO
 EXPAND_ONLY_PREDEF     = NO
 SEARCH_INCLUDES        = YES
 INCLUDE_PATH           =
 INCLUDE_FILE_PATTERNS  =
 PREDEFINED             = DOXYGEN
 EXPAND_AS_DEFINED      =
 SKIP_FUNCTION_MACROS   = YES
 TAGFILES               =
 GENERATE_TAGFILE       =
 ALLEXTERNALS           = NO
 EXTERNAL_GROUPS        = YES
 PERL_PATH              = /usr/bin/perl
 CLASS_DIAGRAMS         = YES
 HIDE_UNDOC_RELATIONS   = YES
 HAVE_DOT               = YES
 CLASS_GRAPH            = YES
 COLLABORATION_GRAPH    = YES
 GROUP_GRAPHS           = NO
 UML_LOOK               = NO
 TEMPLATE_RELATIONS     = NO
 INCLUDE_GRAPH          = NO
 INCLUDED_BY_GRAPH      = NO
 CALL_GRAPH             = NO
 GRAPHICAL_HIERARCHY    = YES
 DIRECTORY_GRAPH        = NO
 DOT_IMAGE_FORMAT       = png
 DOT_PATH               =
 DOTFILE_DIRS           =
 MAX_DOT_GRAPH_WIDTH    = 1024
 MAX_DOT_GRAPH_HEIGHT   = 1024
 MAX_DOT_GRAPH_DEPTH    = 0
 DOT_TRANSPARENT        = YES
 DOT_MULTI_TARGETS      = YES
 GENERATE_LEGEND        = YES
 DOT_CLEANUP            = YES
 SEARCHENGINE           = NO
--- a/src/docs/MainPage.dox
+++ b/src/docs/MainPage.dox
@ -0,0 +1,15 @@
 /* libutf8++/src/docs/MainPage.dox
 *
 *  (c)2006, Laurence Withers, <l@lwithers.me.uk>.
 *  Released under the GNU GPLv2. See file COPYING or
 *  http://www.gnu.org/copyleft/gpl.html for details.
 */
 /*! \mainpage
 */
 /* options for text editors
 kate: replace-trailing-space-save true; space-indent true; tab-width 4;
 vim: expandtab:ts=4:sw=4
 */
--- a/src/docs/build.default
+++ b/src/docs/build.default
@ -0,0 +1 @@
 source src/docs/build.docs
--- a/src/docs/build.docs
+++ b/src/docs/build.docs
@ -0,0 +1,43 @@
 # Build fragment (sourced): generates the HTML documentation with Doxygen.
 #
 # These are external variables, and shouldn't clash with anything else
 #  docs_BUILT
 #
 # Add the hand-written .dox pages to the list of documentation inputs.
 MONOLITHIC_DOC="${MONOLITHIC_DOC} $(echo src/docs/*.dox)"
 build_target monolithic
 # docs_BUILT guards against running the build twice in one invocation.
 if [ -z "${docs_BUILT}" ]
 then
    echo "Building documentation with Doxygen..."
    DOXYFILE=obj/Doxyfile.docs
    # The Doxyfile is only generated when it does not exist yet: the template
    # is copied and the input list and version number are appended to it.
    if [ ! -e "${DOXYFILE}" ]
    then
        do_cmd cp src/docs/Doxyfile.in "${DOXYFILE}" || return 1
        echo "INPUT = ${MONOLITHIC_DOC}" >> "${DOXYFILE}"
        echo "PROJECT_NUMBER = ${VERSION}" >> "${DOXYFILE}"
    fi
    # Rebuild only if some input is newer than the generated index page.
    MODIFIED=0
    # (deliberately unquoted: MONOLITHIC_DOC is a space-separated list)
    for file in ${MONOLITHIC_DOC}
    do
        if [ "${file}" -nt html/index.html ]
        then
            MODIFIED=1
            break
        fi
    done
    if [ "${MODIFIED}" -ne 0 ]
    then
        do_cmd doxygen "${DOXYFILE}" || return 1
        print_success "Documentation built"
    else
        print_success "Documentation is up to date"
    fi
    docs_BUILT=1
 fi
 # kate: replace-trailing-space-save true; space-indent true; tab-width 4;
 # vim: expandtab:ts=4:sw=4
--- a/src/docs/build.install
+++ b/src/docs/build.install
@ -0,0 +1 @@
 source src/docs/build.install-docs
--- a/src/docs/build.install-docs
+++ b/src/docs/build.install-docs
@ -0,0 +1,21 @@
 # Build fragment (sourced): installs the generated documentation and the
 # generic text files into ${DOCSDIR}.
 build_target docs
 # create documentation directories
 echo "Installing documentation into ${DOCSDIR}"
 build_dir_tree "${DOCSDIR}/html" || return 1
 # copy across the Doxygen-generated documentation
 # (paths are quoted so that a DOCSDIR containing spaces still works)
 for file in html/*
 do
    install_file "${file}" "${DOCSDIR}/html" 0644 || return 1
 done
 # copy across the generic files
 for file in COPYING README
 do
    install_file "${file}" "${DOCSDIR}" 0644 || return 1
 done
 print_success "Documentation installed"
 # kate: replace-trailing-space-save true; space-indent true; tab-width 4;
 # vim: expandtab:ts=4:sw=4
--- a/src/libutf8++/.params
+++ b/src/libutf8++/.params
@ -0,0 +1 @@
 c++ lib libutf8++ utf8
--- a/src/libutf8++/BottomHeader.h
+++ b/src/libutf8++/BottomHeader.h
@ -0,0 +1,9 @@
 /* libutf8++/src/lib/BottomHeader.h
 *
 *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
 *  COPYING for more information / terms of license.
 */
 }
 #endif
--- a/src/libutf8++/ForwardDeclare.h
+++ b/src/libutf8++/ForwardDeclare.h
@ -0,0 +1,9 @@
 /* libutf8++/src/lib/ForwardDeclare.h
 *
 *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
 *  COPYING for more information / terms of license.
 */
 // This file simply contains forward declarations of all libutf8++
 // classes, to facilitate header ordering, etc.
--- a/src/libutf8++/TopHeader.h
+++ b/src/libutf8++/TopHeader.h
@ -0,0 +1,23 @@
 /* libutf8++/src/lib/TopHeader.h
 *
 *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
 *  COPYING for more information / terms of license.
 */
 #ifndef HEADER_libutf8pp
 #define HEADER_libutf8pp
 // standard includes, or includes needed for type declarations
 #include <string>
 #include <stdexcept>
 #include <utf8.h>
 /*! \brief UTF-8 handling routines.
 The library's UTF-8 handling routines are all made available through this namespace.
 */
 namespace utf8 {
--- a/src/libutf8++/TopSource.cpp
+++ b/src/libutf8++/TopSource.cpp
@ -0,0 +1,12 @@
 /* libutf8++/src/lib/TopSource.cpp
 *
 *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
 *  COPYING for more information / terms of license.
 */
 #include "utf8"
 // Below are all the includes used throughout the library.
 #include <sstream>
 #include <iomanip>
--- a/src/libutf8++/build.default
+++ b/src/libutf8++/build.default
@ -0,0 +1 @@
 source src/libutf8++/build.lib
--- a/src/libutf8++/build.install
+++ b/src/libutf8++/build.install
@ -0,0 +1 @@
 source src/libutf8++/build.install-lib
--- a/src/libutf8++/build.install-lib
+++ b/src/libutf8++/build.install-lib
@ -0,0 +1,36 @@
 # Build fragment (sourced): installs the shared library, its version
 # symlinks, the public header and the pkg-config file.
 build_target libutf8++
 # make paths (this is for Gentoo in particular)
 build_dir_tree "${LIBDIR}" || return 1
 build_dir_tree "${PKGCONFDIR}" || return 1
 build_dir_tree "${INCLUDEDIR}" || return 1
 # install library
 echo "Installing libraries into '${LIBDIR}'"
 install_file "${libutf8pp}" "${LIBDIR}" 0755 || return 1
 # Names of the symlink chain: BASE -> MAJOR -> MINOR -> MICRO (the real file)
 BASE="${libutf8pp_BASE}.so"
 MAJOR="${BASE}.${SOMAJOR}"
 MINOR="${MAJOR}.${SOMINOR}"
 MICRO="${MINOR}.${SOMICRO}"
 # A failed symlink leaves an unusable installation, so abort like every
 # other install step does.
 install_symlink "${MINOR}" "${MICRO}" "${LIBDIR}" || return 1
 install_symlink "${MAJOR}" "${MINOR}" "${LIBDIR}" || return 1
 install_symlink "${BASE}" "${MAJOR}" "${LIBDIR}" || return 1
 # install header
 echo "Installing header file '${libutf8pp_HEADER}' into ${INCLUDEDIR}"
 install_header "${libutf8pp_HEADER}" "${INCLUDEDIR}" 0644 || return 1
 # install pkgconfig file
 echo "Installing package config file into ${PKGCONFDIR}"
 PKGCONFFILE="${PKGCONFDIR}/libutf8pp.pc"
 # removal of a stale file is best-effort; the redirect below reports real
 # problems with the destination
 do_cmd rm -f "${PKGCONFFILE}"
 # fill in the @VERSION@, @LIBDIR@ and @INCLUDEDIR@ placeholders
 do_cmd_redir "${PKGCONFFILE}" sed \
    -e "s,@VERSION@,${VERSION}," \
    -e "s,@LIBDIR@,${FINALLIBDIR}," \
    -e "s,@INCLUDEDIR@,${FINALINCLUDEDIR}," \
    src/libutf8++/pkgconf.in || return 1
 do_cmd chmod 0644 "${PKGCONFFILE}" || return 1
 print_success "Done"
 # kate: replace-trailing-space-save true; space-indent true; tab-width 4;
 # vim: expandtab:ts=4:sw=4
--- a/src/libutf8++/build.lib
+++ b/src/libutf8++/build.lib
@ -0,0 +1,51 @@
 # Build fragment (sourced): compiles the monolithic source into the shared
 # library.
 #
 # These are external variables, and shouldn't clash with anything else
 #  libutf8pp
 #  libutf8pp_BUILT
 #  libutf8pp_HEADER
 #  libutf8pp_BASE
 # libutf8pp_BUILT guards against building twice in one invocation.
 if [ -z "${libutf8pp_BUILT}" ]
 then
    libutf8pp_BASE=libutf8++
    source src/libutf8++/soversion
    libutf8pp="obj/${libutf8pp_BASE}.so.${SOMAJOR}.${SOMINOR}.${SOMICRO}"
    SO_EXTRA="$(pkg-config libutf8 --libs --cflags) -lstdc++ -lc"
    echo "Building library ${libutf8pp}..."
    # sets SRC, HDR and MONOLITHIC_TESTS
    do_cmd source src/libutf8++/build.monolithic || return 1
    # Rebuild only if a build script or a monolithic file is newer than the
    # library (the lists are space-separated, hence left unquoted).
    MODIFIED=0
    for test in ${MONOLITHIC_TESTS} ${HDR} ${SRC}
    do
        if [ "${test}" -nt "${libutf8pp}" ]
        then
            MODIFIED=1
            break
        fi
    done
    if [ "${MODIFIED}" -ne 0 ]
    then
        echo " Compiling"
        SONAME="${libutf8pp_BASE}.so.${SOMAJOR}.${SOMINOR}"
        # CFLAGS and SO_EXTRA hold several options each and must word-split
        do_cmd ${CXX} ${CFLAGS} -shared -fpic -o "${libutf8pp}" \
            -Wl,-soname,"${SONAME}" \
            ${SRC} ${SO_EXTRA} || return 1
        # make tests work
        do_cmd ln -sf "$(basename "${libutf8pp}")" "obj/${SONAME}" || return 1
        print_success "Library built"
    else
        print_success "Library up to date"
    fi
    libutf8pp_BUILT=1
    libutf8pp_HEADER=${HDR}
 fi
 # kate: replace-trailing-space-save true; space-indent true; tab-width 4;
 # vim: expandtab:ts=4:sw=4
--- a/src/libutf8++/build.monolithic
+++ b/src/libutf8++/build.monolithic
@ -0,0 +1,21 @@
 # Build fragment (sourced): concatenates the individual headers and sources
 # into one monolithic header (HDR) and one monolithic source file (SRC).
 #
 # These are external variables, and shouldn't clash with anything else
 #  libutf8pp_MONOLITHIC
 SRC="obj/libutf8++.cpp"
 HDR="obj/utf8"
 MONOLITHIC_TESTS="src/libutf8++/build.lib src/libutf8++/build.monolithic"
 if [ -z "${libutf8pp_MONOLITHIC}" ]
 then
    # header parts, in concatenation order
    MONOLITHIC_SOURCE="src/libutf8++/TopHeader.h src/libutf8++/ForwardDeclare.h"
    MONOLITHIC_SOURCE="${MONOLITHIC_SOURCE} src/libutf8++/exception.h src/libutf8++/string.h"
    MONOLITHIC_SOURCE="${MONOLITHIC_SOURCE} src/libutf8++/encoder.h src/libutf8++/decoder.h"
    MONOLITHIC_SOURCE="${MONOLITHIC_SOURCE} src/libutf8++/BottomHeader.h"
    make_monolithic ${HDR} C || return 1
    # source parts, in concatenation order
    MONOLITHIC_SOURCE="src/libutf8++/TopSource.cpp src/libutf8++/exception.cpp"
    MONOLITHIC_SOURCE="${MONOLITHIC_SOURCE} src/libutf8++/string.cpp"
    MONOLITHIC_SOURCE="${MONOLITHIC_SOURCE} src/libutf8++/encoder.cpp src/libutf8++/decoder.cpp"
    make_monolithic ${SRC} C || return 1
    libutf8pp_MONOLITHIC=1
    # the generated header is also an input for the documentation build
    MONOLITHIC_DOC="${MONOLITHIC_DOC} ${HDR}"
 fi
 # kate: replace-trailing-space-save true; space-indent true; tab-width 4;
 # vim: expandtab:ts=4:sw=4
--- a/src/libutf8++/decoder.cpp
+++ b/src/libutf8++/decoder.cpp
@ -0,0 +1,159 @@
 /* libutf8++/src/lib/decoder.cpp
 *
 *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
 *  COPYING for more information / terms of license.
 */
 namespace utf8 {
 /* Sets up a zeroed decode context with an output buffer of at least two wide
    characters and selects exception mode for error handling. */
 Decoder::Decoder(size_t hint)
    : replaceChar(0xFFFD) // documented default of replaceOnError()
 {
    memset(&ctx, 0, sizeof(ctx));
    ctx.wr_size = (hint < 2) ? 2 : hint;
    ctx.wr = new wchar_t[ctx.wr_size];
    ctx.error_callback = _exceptionOnError;
    // the static callbacks recover the object through this pointer
    ctx.data = this;
 }
 /* Releases the output buffer owned by the context. */
 Decoder::~Decoder()
 {
    delete [] ctx.wr;
 }
 /* Convenience overload: decodes the bytes held in a std::string. */
 void Decoder::decode(const std::string& str)
 {
    decode(str.data(), str.size());
 }
 /* Decodes amt bytes starting at str (a negative amt means null-terminated
    input) and appends the result to decoded. The output buffer is drained
    after every decoder pass and doubled whenever input is left over. */
 void Decoder::decode(const char* str, ssize_t amt)
 {
    ctx.rd = str;
    ctx.rd_remain = amt;
    while(ctx.rd_remain) {
        utf8_decoder(&ctx);
        decoded.append(ctx.wr, ctx.written);
        // null-terminated input: stop once the terminator has been reached
        if(ctx.rd_remain < 0 && !*(ctx.rd)) break;
        if(ctx.rd_remain) {
            // Allocate the larger buffer before releasing the old one, and
            // only then update the context: if new throws, ctx still
            // describes a valid buffer and the destructor stays safe.
            const size_t grown_size = ctx.wr_size * 2;
            wchar_t* grown = new wchar_t[grown_size];
            delete [] ctx.wr;
            ctx.wr = grown;
            ctx.wr_size = grown_size;
        }
    }
 }
 /* Reports the decoder's own "complete" flag. */
 bool Decoder::complete() const
 {
    return ctx.complete;
 }
 /* Clears the decoding state and the accumulated output, while keeping the
    output buffer and the selected error handling mode. */
 void Decoder::reset()
 {
    size_t old_wr_size = ctx.wr_size;
    wchar_t* old_wr = ctx.wr;
    utf8_decode_error_callback old_error_callback = ctx.error_callback;
    memset(&ctx, 0, sizeof(ctx));
    ctx.wr_size = old_wr_size;
    ctx.wr = old_wr;
    ctx.error_callback = old_error_callback;
    ctx.data = this;
    decoded.clear();
 }
 /* Selects skip mode. */
 void Decoder::skipOnError()
 {
    ctx.error_callback = _skipOnError;
 }
 /* Selects replace mode, remembering the replacement character. */
 void Decoder::replaceOnError(wchar_t ch)
 {
    replaceChar = ch;
    ctx.error_callback = _replaceOnError;
 }
 /* Selects exception mode. */
 void Decoder::exceptionOnError()
 {
    ctx.error_callback = _exceptionOnError;
 }
 /* Error callback for skip mode: always asks the decoder to skip. */
 enum utf8_decode_error_action Decoder::_skipOnError
    (const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch)
 {
    (void)ctx;
    (void)error;
    (void)newch;
    return utf8_decode_error_action_skip;
 }
 /* Error callback for replace mode: hands back the replacement character of
    the Decoder stored in ctx->data. */
 enum utf8_decode_error_action Decoder::_replaceOnError
    (const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch)
 {
    (void)error;
    Decoder* self = (utf8::Decoder*)(ctx->data);
    *newch = self->replaceChar;
    return utf8_decode_error_action_replace;
 }
 /* Error callback for exception mode: translates the error code into a
    description and throws BadUTF8Sequence. Never returns normally. */
 enum utf8_decode_error_action Decoder::_exceptionOnError
    (const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch)
 {
    (void)newch;
    // used for any error code not listed below
    const char* desc = "unknown";
    switch(error) {
    case utf8_decode_error_lone_cchar:
        desc = "An invalid continuation byte was encountered while expecting a character.";
        break;
    case utf8_decode_error_not_cchar:
        desc = "A multi-byte sequence contained an invalid byte.";
        break;
    case utf8_decode_error_not_schar:
        desc = "An invalid byte was encountered while expecting a character.";
        break;
    case utf8_decode_error_overlong:
        desc = "An overlong encoding of a character was encountered.";
        break;
    case utf8_decode_error_illegal_cp:
        desc = "An illegal code point (a UTF-16 surrogate perhaps?) was encountered.";
        break;
    }
    throw BadUTF8Sequence(desc, ctx);
 }
 }
 /* options for text editors
 kate: replace-trailing-space-save true; space-indent true; tab-width 4;
 */
--- a/src/libutf8++/decoder.h
+++ b/src/libutf8++/decoder.h
@ -0,0 +1,128 @@
 /* libutf8++/src/lib/decoder.h
 *
 *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
 *  COPYING for more information / terms of license.
 */
 /*! \brief Stateful UTF-8 decoder object.
 This object is used for stateful decoding of a UTF-8 byte stream. It can be fed the data in
 arbitrary chunks, even split on non-character boundaries. It writes its output into a wide character
 string.
 A variety of error handling modes are available. The default is to throw a BadUTF8Sequence
 exception, but you can change this with skipOnError() or replaceOnError().
 */
 class Decoder {
 public:
    /*! \brief Constructor.
    \param hint Hint at number of characters to allocate space for in decoder buffer.
    The constructor sets up the UTF-8 decoder. You can provide a hint as to the size of your input
    stream chunks. This hint is the number of characters to allocate in the output buffer (at least
    two are always allocated). If, during a single decode operation, this buffer is filled, then it
    is doubled in size.
     */
    Decoder(size_t hint = 25);
    /// Destructor; releases the decoder buffer.
    ~Decoder();
    /// Result of decoding operations (appended to).
    std::wstring decoded;
    /*! \brief UTF-8 decoder.
    \param str Pointer to source data.
    \param amt Number of bytes in source data (-1 for null terminated strings).
    \throws BadUTF8Sequence (in \e exception mode only).
    This function will decode a chunk of UTF-8 data. The decoded data will be appended to whatever
    is contained in the string decoded. You can check if the decoder ended on a character boundary
    or not by calling complete().
    */
    void decode(const char* str, ssize_t amt);
    /// Decode data stored in a std::string.
    void decode(const std::string& str);
    /// Returns \a true if the last call to \a decode() ended on a character boundary.
    bool complete() const;
    /*! \brief Resets the parser for a new UTF-8 stream.
    This function will clear the internal state of the decoder so that it is ready for data from a
    new source. This can be used if you have opened a new file, accepted a new connection, recovered
    from an error, etc. It will also clear \a decoded. The decoder buffer and the selected error
    handling mode are kept.
    */
    void reset();
    /*! \brief Set error handling to \e skip mode.
    This function will set the error handling into \e skip mode. In this mode, any invalid UTF-8
    byte sequences will simply be skipped altogether, and will not have any effect on the output in
    \a decoded.
    */
    void skipOnError();
    /*! \brief Set error handling to \e replace mode.
    \param ch The replacement character that will appear in the output.
    This function will set the error handling into \e replace mode. In this mode, any invalid UTF-8
    byte sequences will be skipped, and a replacement character \a ch will be placed onto the output
    in \a decoded. The default parameter is the Unicode replacement character (U+FFFD), which is
    usually displayed as a question mark inside a diamond.
    */
    void replaceOnError(wchar_t ch = 0xFFFD);
    /*! \brief Set error handling to \e exception mode (default).
    This function will set the error handling to \e exception mode. In this mode, any invalid
    UTF-8 byte sequences will cause a BadUTF8Sequence exception to be thrown. This is the default
    mode.
    */
    void exceptionOnError();
 private:
    // Not copyable: the object owns the buffer in ctx.wr (released by the destructor) and ctx.data
    // points back at the object, so a member-wise copy would free the buffer twice. Both functions
    // are declared but intentionally never defined.
    Decoder(const Decoder&);
    Decoder& operator=(const Decoder&);
    // libutf8 decoder state; ctx.data always points at this object
    struct utf8_decode_state ctx;
    // character handed back by _replaceOnError()
    wchar_t replaceChar;
    // error callbacks installed into ctx.error_callback, one per error handling mode
    static enum utf8_decode_error_action _skipOnError
        (const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch);
    static enum utf8_decode_error_action _replaceOnError
        (const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch);
    static enum utf8_decode_error_action _exceptionOnError
        (const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch);
 };
 /* options for text editors
 kate: replace-trailing-space-save true; space-indent true; tab-width 4;
 */
--- a/src/libutf8++/encoder.cpp
+++ b/src/libutf8++/encoder.cpp
@ -0,0 +1,124 @@
 /* libutf8++/src/lib/encoder.cpp
 *
 *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
 *  COPYING for more information / terms of license.
 */
 namespace utf8 {
 /* Sets up a zeroed encode context with an output buffer of at least seven
    bytes and selects exception mode for error handling. */
 Encoder::Encoder(size_t hint)
    : replaceChar(0xFFFD) // documented default of replaceOnError()
 {
    memset(&ctx, 0, sizeof(ctx));
    ctx.wr_size = (hint < 7) ? 7 : hint;
    ctx.wr = new char[ctx.wr_size];
    ctx.error_callback = _exceptionOnError;
    // the static callbacks recover the object through this pointer
    ctx.data = this;
 }
 /* Releases the output buffer owned by the context. */
 Encoder::~Encoder()
 {
    delete [] ctx.wr;
 }
 /* Clears the encoding state while keeping the output buffer and the selected
    error handling mode. The contents of encoded are left untouched. */
 void Encoder::reset()
 {
    char* wr = ctx.wr;
    size_t wr_size = ctx.wr_size;
    utf8_encode_error_callback cb = ctx.error_callback;
    memset(&ctx, 0, sizeof(ctx));
    ctx.wr = wr;
    ctx.wr_size = wr_size;
    ctx.error_callback = cb;
    ctx.data = this;
 }
 /* Convenience overload: encodes the characters held in a std::wstring. */
 void Encoder::encode(const std::wstring& str)
 {
    encode(str.data(), str.size());
 }
 /* Encodes amt characters starting at str and appends the UTF-8 output to
    encoded. The output buffer is drained after every encoder pass and doubled
    whenever input is left over. Throws BadUnicodeChar if the encoder reports
    failure. */
 void Encoder::encode(const wchar_t* str, ssize_t amt)
 {
    ctx.rd = str;
    ctx.rd_remain = amt;
    while(ctx.rd_remain) {
        if(!utf8_encoder(&ctx)) throw BadUnicodeChar(&ctx);
        encoded.append(ctx.wr, ctx.written);
        // negative count: stop once a null character has been reached
        if(ctx.rd_remain < 0 && !*(ctx.rd)) break;
        if(ctx.rd_remain) {
            // Allocate the larger buffer before releasing the old one, and
            // only then update the context: if new throws, ctx still
            // describes a valid buffer and the destructor stays safe.
            const size_t grown_size = ctx.wr_size * 2;
            char* grown = new char[grown_size];
            delete [] ctx.wr;
            ctx.wr = grown;
            ctx.wr_size = grown_size;
        }
    }
 }
 /* Selects skip mode. */
 void Encoder::skipOnError()
 {
    ctx.error_callback = _skipOnError;
 }
 /* Selects replace mode, remembering the replacement character. */
 void Encoder::replaceOnError(wchar_t ch)
 {
    replaceChar = ch;
    ctx.error_callback = _replaceOnError;
 }
 /* Selects exception mode. */
 void Encoder::exceptionOnError()
 {
    ctx.error_callback = _exceptionOnError;
 }
 /* Error callback for skip mode: always asks the encoder to skip. */
 enum utf8_encode_error_action Encoder::_skipOnError
    (const struct utf8_encode_state *ctx, wchar_t *newch)
 {
    (void)ctx;
    (void)newch;
    return utf8_encode_error_action_skip;
 }
 /* Error callback for replace mode: hands back the replacement character of
    the Encoder stored in ctx->data. */
 enum utf8_encode_error_action Encoder::_replaceOnError
    (const struct utf8_encode_state *ctx, wchar_t *newch)
 {
    Encoder* self = (utf8::Encoder*)(ctx->data);
    *newch = self->replaceChar;
    return utf8_encode_error_action_replace;
 }
 /* Error callback for exception mode: throws BadUnicodeChar. Never returns
    normally. */
 enum utf8_encode_error_action Encoder::_exceptionOnError
    (const struct utf8_encode_state *ctx, wchar_t *newch)
 {
    (void)newch;
    throw BadUnicodeChar(ctx);
 }
 }
 /* options for text editors
 kate: replace-trailing-space-save true; space-indent true; tab-width 4;
 */
--- a/src/libutf8++/encoder.h
+++ b/src/libutf8++/encoder.h
@ -0,0 +1,108 @@
 /* libutf8++/src/lib/encoder.h
 *
 *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
 *  COPYING for more information / terms of license.
 */
 /*! \brief UTF-8 encoder object.
 This object is used to encode Unicode wide characters into UTF-8. It can be fed chunks of characters
 which it then encodes, appending the result to an internal buffer.
 */
 class Encoder {
 public:
    /*! \brief Constructor.
    \param hint Number of bytes to allocate for the encoding buffer.
    The constructor sets up the encoder and allocates some space for an internal buffer (at least
    seven bytes). You can hint at how large you expect the chunks to be encoded will be. If an
    encoding operation fills the buffer without consuming all the input data, the buffer will be
    doubled in size for the next round.
    */
    Encoder(size_t hint = 100);
    /// Destructor; releases the encoding buffer.
    virtual ~Encoder();
    /// UTF-8 output data is appended to this string.
    std::string encoded;
    /*! \brief Encode some data into UTF-8.
    \param str Pointer to the character array to encode.
    \param amt Number of characters to encode. With a negative value, encoding stops once a null
        character is reached.
    \throws BadUnicodeChar.
    This function performs an encoding of some Unicode characters into UTF-8. It appends the result
    onto \a encoded.
    */
    void encode(const wchar_t* str, ssize_t amt);
    /// Encode a std::wstring.
    void encode(const std::wstring& str);
    /*! \brief Reset the encoder for a new character stream.
    The encoding buffer and the selected error handling mode are kept. Unlike Decoder::reset(), the
    accumulated output in \a encoded is \e not cleared.
    */
    void reset();
    /*! \brief Set error handling to \e skip mode.
    This function will set the error handling into \e skip mode. In this mode, any invalid Unicode
    characters will simply be skipped altogether, and will not have any effect on the output in
    \a encoded.
    */
    void skipOnError();
    /*! \brief Set error handling to \e replace mode.
        \param ch The replacement character that will appear in the output.
    This function will set the error handling into \e replace mode. In this mode, any invalid
    Unicode characters will be replaced by \a ch, which is then encoded onto the output in
    \a encoded. The default parameter is the Unicode replacement character (U+FFFD), which is
    usually displayed as a question mark inside a diamond.
    */
    void replaceOnError(wchar_t ch = 0xFFFD);
    /*! \brief Set error handling to \e exception mode (default).
    This function will set the error handling to \e exception mode. In this mode, any invalid
    Unicode characters will cause a BadUnicodeChar exception to be thrown. This is the default
    mode.
    */
    void exceptionOnError();
 private:
    // Not copyable: the object owns the buffer in ctx.wr (released by the destructor) and ctx.data
    // points back at the object, so a member-wise copy would free the buffer twice. Both functions
    // are declared but intentionally never defined.
    Encoder(const Encoder&);
    Encoder& operator=(const Encoder&);
    // libutf8 encoder state; ctx.data always points at this object
    struct utf8_encode_state ctx;
    // character handed back by _replaceOnError()
    wchar_t replaceChar;
    // error callbacks installed into ctx.error_callback, one per error handling mode
    static enum utf8_encode_error_action _skipOnError
        (const struct utf8_encode_state *ctx, wchar_t *newch);
    static enum utf8_encode_error_action _replaceOnError
        (const struct utf8_encode_state *ctx, wchar_t *newch);
    static enum utf8_encode_error_action _exceptionOnError
        (const struct utf8_encode_state *ctx, wchar_t *newch);
 };
 /* options for text editors
 kate: replace-trailing-space-save true; space-indent true; tab-width 4;
 */
--- a/src/libutf8++/exception.cpp
+++ b/src/libutf8++/exception.cpp
@ -0,0 +1,81 @@
 /* libutf8++/src/lib/exception.cpp
 *
 *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
 *  COPYING for more information / terms of license.
 */
 namespace utf8 {
 /* Stores a copy of the human-readable reason. */
 Error::Error(const std::string& reason)
    : reason(reason)
 {
 }
 /* Returns the stored reason as a C string (valid while the object lives and
    reason is not modified). */
 const char* Error::what()
 {
    return reason.c_str();
 }
 /* Captures the offending character and its position from the encoder state.
    line and col are stored 1-based, as their member documentation promises and
    as format() and BadUTF8Sequence already do; the context counts from 0. */
 BadUnicodeChar::BadUnicodeChar(const struct utf8_encode_state* ctx)
    : Error(format(ctx)), badChar(*ctx->rd), line(ctx->line + 1), col(ctx->col + 1),
    char_offset(ctx->char_offset)
 {
 }
 /* Builds the multi-line message for an invalid code point: position
    (1-based), stream offset and the character value in hexadecimal. */
 std::string BadUnicodeChar::format(const struct utf8_encode_state* ctx)
 {
    std::ostringstream str;
    str << "Invalid Unicode code point encountered."
           "\n  Position       : line "
        << ctx->line + 1
        << ", column "
        << ctx->col + 1
        << "\n  Stream offset  : "
        << ctx->char_offset
        << " characters\n  Character value: 0x"
        << std::hex
        << *(ctx->rd);
    return str.str();
 }
 /* Captures the description and the position (1-based line and column, plus
    character and byte offsets) from the decoder state. */
 BadUTF8Sequence::BadUTF8Sequence(const std::string& description,
                                 const struct utf8_decode_state* ctx)
    : Error(format(description, ctx)), description(description), line(ctx->line + 1),
    col(ctx->col + 1), char_offset(ctx->char_offset), byte_offset(ctx->byte_offset)
 {
 }
 /* Builds the multi-line message for a bad byte sequence: reason, position
    (1-based) and the character/byte offsets. */
 std::string BadUTF8Sequence::format(const std::string& description,
                                    const struct utf8_decode_state* ctx)
 {
    std::ostringstream str;
    str << "Bad byte sequence in UTF-8 data.\n"
        "  Reason  : " << description
        << "\n  Position: line " << ctx->line + 1
        << ", column " << ctx->col + 1
        << "\n  Offset  : " << ctx->char_offset << " chars, " << ctx->byte_offset << " bytes";
    return str.str();
 }
 }
 /* options for text editors
 kate: replace-trailing-space-save true; space-indent true; tab-width 4;
 */
--- a/src/libutf8++/exception.h
+++ b/src/libutf8++/exception.h
@ -0,0 +1,96 @@
 /* libutf8++/src/lib/exception.h
 *
 *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
 *  COPYING for more information / terms of license.
 */
 /*! \brief Exception base class.
 This is the base class for all libutf8 exceptions. It contains one member, \a reason, which allows
 you to print a human-readable description of the error. To recover the actual type, you can refer
 to the more specific derived classes.
 */
 class Error : public std::exception {
 public:
    /// Human-readable reason for error.
    std::string reason;
    /// Constructor; stores a copy of \a reason.
    Error(const std::string& reason);
    /// Destructor.
    virtual ~Error() throw()
    { }
    /// Find what caused the error (non-const overload, kept for existing callers).
    virtual const char* what();
    /*! \brief Find what caused the error.
    This overload has the same signature as std::exception::what() and therefore overrides it, so
    that code catching a \c std::exception reference also sees \a reason rather than the generic
    message of the base class. The returned pointer stays valid while the exception object lives
    and \a reason is not modified.
    */
    virtual const char* what() const throw()
    { return reason.c_str(); }
 };
 /*! \brief Invalid Unicode character exception.
 This exception is thrown when encoding Unicode into UTF-8 and an invalid character is encountered.
 */
 class BadUnicodeChar : public Error {
 public:
    /// A copy of the invalid character.
    wchar_t badChar;
    /// Line of input data at which error occurred (starts at 1).
    int line;
    /// Column of input data at which error occurred (starts at 1).
    int col;
    /// Character offset of input data at which error occurred.
    int char_offset;
    /*! \brief Constructor.
    \param ctx Encoder state at the point of failure. Its \c rd pointer is dereferenced to obtain
        \a badChar, so it must point at the offending character.
    The human-readable message held by the Error base class is generated from \a ctx as well.
    */
    BadUnicodeChar(const struct utf8_encode_state* ctx);
 private:
    /// Builds the human-readable message that is passed to the Error base class.
    static std::string format(const struct utf8_encode_state* ctx);
 };
 /*! \brief Invalid UTF-8 sequence exception.
 This exception is thrown when decoding UTF-8 and an invalid sequence is encountered. This could be
 a nonsensical sequence, a redundantly-encoded character or truncated source data. It contains some
 variables for allowing detailed diagnostics.
 */
 class BadUTF8Sequence : public Error {
 public:
    /// Description of the error, for human diagnostics.
    std::string description;
    /// Line of input data at which error occurred (starts at 1).
    int line;
    /// Column of input data at which error occurred (starts at 1).
    int col;
    /// Character offset of input data at which error occurred.
    int char_offset;
    /// Byte offset of input data at which error occurred.
    int byte_offset;
    /*! \brief Constructor.
    \param description Short explanation of what is wrong with the byte sequence.
    \param ctx Decoder state at the point of failure; the position members are taken from it.
    */
    BadUTF8Sequence(const std::string& description, const struct utf8_decode_state* ctx);
    /// Destructor.
    ~BadUTF8Sequence() throw() { }
 private:
    // Builds the message passed to the Error base class. It is static (like BadUnicodeChar::format)
    // because the constructor calls it while initialising the base class, i.e. before any part of
    // this object exists.
    static std::string format(const std::string& description, const struct utf8_decode_state* ctx);
 };
 /* options for text editors
 kate: replace-trailing-space-save true; space-indent true; tab-width 4;
 */
--- a/src/libutf8++/pkgconf.in
+++ b/src/libutf8++/pkgconf.in
@ -0,0 +1,21 @@
 # libutf8++/src/lib/libutf8++/pkgconf.in
 #
 #  Metadata file for pkg-config
 #  ( http://www.freedesktop.org/software/pkgconfig/ )
 #
 #  (c)2006, Laurence Withers, <l@lwithers.me.uk>.
 #  Released under the GNU GPLv2. See file COPYING or
 #  http://www.gnu.org/copyleft/gpl.html for details.
 #
 # Name, description
 Name: libutf8++
 Description: C++ wrapper around libutf8 (library for handling UTF-8)
 Version: @VERSION@
 # Requirements
 # (the installed header includes <utf8.h>, so users need libutf8's flags too)
 Requires: libutf8
 # Compilation information
 Libs: -L@LIBDIR@ -lutf8++
 Cflags: -I@INCLUDEDIR@
--- a/src/libutf8++/soversion
+++ b/src/libutf8++/soversion
@ -0,0 +1,17 @@
 # libutf8++/src/libutf8++/soversion
 #
 #  (c)2006, Laurence Withers, <l@lwithers.me.uk>.
 #  Released under the GNU GPLv2. See file COPYING or
 #  http://www.gnu.org/copyleft/gpl.html for details.
 #
 # SOMAJOR and SOMINOR are included in the library's soname. They need to
 # be bumped on a binary-incompatible release. They are both single
 # integers.
 SOMAJOR=0
 SOMINOR=0
 # SOMICRO is bumped every time there is a binary-compatible release.
 SOMICRO=0
--- a/src/libutf8++/string.cpp
+++ b/src/libutf8++/string.cpp
@ -0,0 +1,126 @@
 /* libutf8++/src/libutf8++/string.cpp
 *
 *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
 *  COPYING for more information / terms of license.
 */
 namespace utf8 {
 /// Decoder error hook used in force mode: substitutes the caller's replacement character.
 ///
 /// \a ctx->data is expected to point at the wchar_t chosen by decode(); that character is stored
 /// in \a newch and the decoder is told to replace the offending input with it. The specific
 /// \a error is deliberately ignored.
 static enum utf8_decode_error_action decode_replace_callback(const struct utf8_decode_state* ctx,
    enum utf8_decode_error error, wchar_t* newch)
 {
    (void)error;
    const wchar_t* substitute = static_cast<const wchar_t*>(ctx->data);
    *newch = *substitute;
    return utf8_decode_error_action_replace;
 }
 static enum utf8_decode_error_action decode_error_callback(const struct utf8_decode_state* ctx,
    enum utf8_decode_error error, wchar_t* newch)
 {
    (void)newch;
    const char* desc = "unknown";
    switch(error) {
    case utf8_decode_error_lone_cchar:
        desc = "An invalid continuation byte was encountered while expecting a character.";
        break;
    case utf8_decode_error_not_cchar:
        desc = "A multi-byte sequence contained an invalid byte.";
        break;
    case utf8_decode_error_not_schar:
        desc = "An invalid byte was encountered while expecting a character.";
        break;
    case utf8_decode_error_overlong:
        desc = "An overlong encoding of a character was encountered.";
        break;
    case utf8_decode_error_illegal_cp:
        desc = "An illegal code point (a UTF-16 surrogate perhaps?) was encountered.";
        break;
    }
    throw BadUTF8Sequence(desc, ctx);
 }
 std::wstring decode(const std::string& utf8, bool force, wchar_t replace)
 {
    wchar_t buffer[128];
    struct utf8_decode_state ctx;
    memset(&ctx, 0, sizeof(ctx));
    ctx.rd = utf8.data();
    ctx.rd_remain = utf8.size();
    ctx.wr = buffer;
    ctx.wr_size = 128;
    if(force) {
        ctx.error_callback = decode_replace_callback;
        ctx.data = &replace;
    } else {
        ctx.error_callback = decode_error_callback;
    }
    std::wstring ret;
    while(ctx.rd_remain) {
        utf8_decoder(&ctx);
        ret.append(buffer, ctx.written);
    }
    return ret;
 }
 /// Encoder error hook used in force mode: substitutes the caller's replacement character.
 ///
 /// \a ctx->data is expected to point at the wchar_t chosen by encode(); that character is stored
 /// in \a newch and the encoder is told to use it in place of the offending input.
 static enum utf8_encode_error_action encode_replace_callback(const struct utf8_encode_state* ctx,
    wchar_t* newch)
 {
    const wchar_t* substitute = static_cast<const wchar_t*>(ctx->data);
    *newch = *substitute;
    return utf8_encode_error_action_replace;
 }
 std::string encode(const std::wstring& ustr, bool force, wchar_t replace)
 {
    char buffer[512];
    struct utf8_encode_state ctx;
    memset(&ctx, 0, sizeof(ctx));
    ctx.rd = ustr.data();
    ctx.rd_remain = ustr.size();
    ctx.wr = buffer;
    ctx.wr_size = 512;
    if(force) {
        ctx.error_callback = encode_replace_callback;
        ctx.data = &replace;
    }
    std::string ret;
    while(ctx.rd_remain) {
        if(!utf8_encoder(&ctx)) {
            throw BadUnicodeChar(&ctx);
        }
        ret.append(buffer, ctx.written);
    }
    return ret;
 }
 }
 /* options for text editors
 kate: replace-trailing-space-save true; space-indent true; tab-width 4;
 */
--- a/src/libutf8++/string.h
+++ b/src/libutf8++/string.h
@ -0,0 +1,37 @@
 /* libutf8++/src/libutf8++/string.h
 *
 *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
 *  COPYING for more information / terms of license.
 */
 /*! \brief Decode UTF-8.
 \param utf8 The UTF-8 encoded data.
 \param force If set to \a true, errors will be inhibited: invalid sequences are substituted
    rather than reported.
 \param replace If \a force is \a true, then invalid UTF-8 sequences will be replaced by this
    character (the default, 0xFFFD, is U+FFFD REPLACEMENT CHARACTER). Not used if \a force is
    \a false.
 \returns The Unicode wide-character string representation.
 \throws BadUTF8Sequence if \a force is \a false and there is an invalid byte sequence in the
    UTF-8 source data.
 This function will decode a UTF-8 source string into a Unicode wide-character string. It has a force
 mode whereby any errors will be inhibited and a best-effort attempt will be made.
 */
 std::wstring decode(const std::string& utf8, bool force = false, wchar_t replace = 0xFFFD);
 /*! \brief Encode UTF-8.
 \param ustr The Unicode wide-character string.
 \param force If set to \a true, errors will be inhibited: invalid characters are substituted
    rather than reported.
 \param replace If \a force is \a true, then invalid characters in \a ustr will be replaced by
    this character (the default, 0xFFFD, is U+FFFD REPLACEMENT CHARACTER). Not used if \a force is
    \a false.
 \returns The UTF-8 transformed representation of \a ustr.
 \throws BadUnicodeChar if the encoder reports a failure, e.g. on invalid characters in the source
    data when \a force is \a false.
 This function will encode a Unicode wide-character string into a UTF-8 transformed representation.
 It has a force mode whereby any errors will be inhibited and a best-effort attempt will be made.
 */
 std::string encode(const std::wstring& ustr, bool force = false, wchar_t replace = 0xFFFD);
--- a/src/tests/.params
+++ b/src/tests/.params
@ -0,0 +1 @@
 c++ tests tests libutf8++
--- a/src/tests/build.default
+++ b/src/tests/build.default
@ -0,0 +1,3 @@
 # Pull in the test-program build script; this file adds no logic of its own.
 source src/tests/build.tests
 # kate: replace-trailing-space-save true; space-indent true; tab-width 4;
 # vim: expandtab:ts=4:sw=4
--- a/src/tests/build.tests
+++ b/src/tests/build.tests
@ -0,0 +1,43 @@
 # Builds every src/tests/*.cpp into a program under obj/tests, linking against libutf8++.
 # A test is rebuilt only if the library, its own source or this script is newer than the binary.
 #
 # These are external variables, and shouldn't clash with anything else
 #  tests_BUILT
 #
 build_target libutf8++ || return 1
 # Quoted so that the test is a real two-argument "-z STRING" even when the variable is unset.
 if [ -z "${tests_BUILT}" ]
 then
    LIBS="${libutf8pp} "
    EXTRAS=""
    echo "Building test programs..."
    do_cmd mkdir -p obj/tests || return 1
    for SRC in src/tests/*.cpp
    do
        # An unmatched glob is left as the literal pattern; skip it rather than compile it.
        [ -e "${SRC}" ] || continue
        TEST="obj/tests/$(basename "${SRC}" .cpp)"
        MODIFIED=0
        # LIBS is intentionally unquoted: it is a whitespace-separated list.
        for file in ${LIBS} "${SRC}" src/tests/build.tests
        do
            if [ "${file}" -nt "${TEST}" ]
            then
                MODIFIED=1
                break
            fi
        done
        if [ "${MODIFIED}" -ne 0 ]
        then
            # CXX, CFLAGS, LIBS and EXTRAS may each hold several words, so they stay unquoted.
            do_cmd ${CXX} -Iobj ${CFLAGS} -o "${TEST}" "${SRC}" ${LIBS} ${EXTRAS} || return 1
            print_success "Built ${TEST}"
        else
            print_success "${TEST} is up to date"
        fi
    done
    print_success "All tests built"
    tests_BUILT=1
 fi
 # kate: replace-trailing-space-save true; space-indent true; tab-width 4;
 # vim: expandtab:ts=4:sw=4
--- a/src/tests/objects.cpp
+++ b/src/tests/objects.cpp
@ -0,0 +1,85 @@
 /* libutf8++/src/tests/objects.cpp
 *
 *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
 *  COPYING for more information / terms of license.
 */
 #include "utf8"
 #include <iostream>
 #include <iomanip>
 #include <fcntl.h>
 #include <unistd.h>
 /// Fill \a buf with \a ch random wide characters read from /dev/urandom.
 ///
 /// Every character is masked down to its low 31 bits. If the device cannot be opened or read, a
 /// diagnostic is printed with perror() and the integer 1 is thrown; the descriptor is closed on
 /// every path.
 /// NOTE(review): the masked values cover the whole 31-bit range, so they may include code points
 /// the encoder rejects -- confirm that this is what the tests intend.
 void make_random(wchar_t* buf, int ch)
 {
    int fd = open("/dev/urandom", O_RDONLY);
    if(fd < 0) {
        perror("open(\"/dev/urandom\")");
        throw 1;
    }
    // read() may return fewer bytes than requested, so keep reading until the buffer is full
    char* wr = reinterpret_cast<char*>(buf);
    size_t remain = static_cast<size_t>(ch) * sizeof(wchar_t);
    while(remain) {
        ssize_t amt = read(fd, wr, remain);
        if(amt <= 0) {
            // report before close(), which could overwrite errno
            perror("read(\"/dev/urandom\")");
            close(fd);
            throw 1;
        }
        wr += amt;
        remain -= static_cast<size_t>(amt);
    }
    close(fd);
    for(int i = 0; i < ch; ++i) {
        buf[i] &= 0x7FFFFFFF;
    }
 }
 int main(int argc, char* argv[])
 {
    if(argc == 2 && !strcmp(argv[1], "--print-summary")) {
        std::cout << "Performs some tests on the Encoder and Decoder objects.\n";
        return 0;
    }
    int ret = 0;
    try {
        wchar_t wch[1024];
        make_random(wch, 1024);
        std::wstring ustr;
        ustr.assign(wch, 1024);
        utf8::Encoder encoder;
        utf8::Decoder decoder;
        encoder.encode(ustr);
        decoder.decode(encoder.encoded);
        if(ustr != decoder.decoded) {
            std::cerr << "Decoded string does not match original.\n";
            for(size_t i = 0, end = std::min(ustr.size(), decoder.decoded.size()); i != end; ++i) {
                if(ustr[i] != decoder.decoded[i]) {
                    std::cerr << std::dec << std::setfill(' ') << std::setw(4) << i
                            << std::setfill('0') << std::hex << ": 0x"
                            << std::setw(8) << ustr[i] << " != "
                            << std::setw(8) << decoder.decoded[i] << "\n";
                }
            }
            std::cerr << "Original size " << std::dec << ustr.size()
                    << ", decoded size " << decoder.decoded.size() << std::endl;
            return 1;
        }
        std::cout << "Success.\n";
    }
    catch(utf8::Error& e) {
        std::cerr << e.reason << std::endl;
        ret = 1;
    }
    return ret;
 }
 /* options for text editors
 kate: replace-trailing-space-save true; space-indent true; tab-width 4;
 */
--- a/src/tests/strings.cpp
+++ b/src/tests/strings.cpp
@ -0,0 +1,82 @@
 /* libutf8++/src/tests/strings.cpp
 *
 *  (c)2006, Laurence Withers. Released under the GNU GPL. See file
 *  COPYING for more information / terms of license.
 */
 #include "utf8"
 #include <iostream>
 #include <iomanip>
 #include <fcntl.h>
 #include <unistd.h>
 /// Fill \a buf with \a ch random wide characters read from /dev/urandom.
 ///
 /// Every character is masked down to its low 31 bits. If the device cannot be opened or read, a
 /// diagnostic is printed with perror() and the integer 1 is thrown; the descriptor is closed on
 /// every path.
 /// NOTE(review): the masked values cover the whole 31-bit range, so they may include code points
 /// the encoder rejects -- confirm that this is what the tests intend.
 void make_random(wchar_t* buf, int ch)
 {
    int fd = open("/dev/urandom", O_RDONLY);
    if(fd < 0) {
        perror("open(\"/dev/urandom\")");
        throw 1;
    }
    // read() may return fewer bytes than requested, so keep reading until the buffer is full
    char* wr = reinterpret_cast<char*>(buf);
    size_t remain = static_cast<size_t>(ch) * sizeof(wchar_t);
    while(remain) {
        ssize_t amt = read(fd, wr, remain);
        if(amt <= 0) {
            // report before close(), which could overwrite errno
            perror("read(\"/dev/urandom\")");
            close(fd);
            throw 1;
        }
        wr += amt;
        remain -= static_cast<size_t>(amt);
    }
    close(fd);
    for(int i = 0; i < ch; ++i) {
        buf[i] &= 0x7FFFFFFF;
    }
 }
 int main(int argc, char* argv[])
 {
    if(argc == 2 && !strcmp(argv[1], "--print-summary")) {
        std::cout << "Performs some tests on the string encode/decode routines.\n";
        return 0;
    }
    int ret = 0;
    try {
        wchar_t wch[1024];
        make_random(wch, 1024);
        std::wstring ustr1, ustr2;
        std::string utf8;
        ustr1.assign(wch, 1024);
        utf8 = utf8::encode(ustr1);
        ustr2 = utf8::decode(utf8);
        if(ustr1 != ustr2) {
            std::cerr << "Decoded string does not match original.\n";
            for(size_t i = 0, end = std::min(ustr1.size(), ustr2.size()); i != end; ++i) {
                if(ustr1[i] != ustr2[i]) {
                    std::cerr << std::dec << std::setfill(' ') << std::setw(4) << i
                            << std::setfill('0') << std::hex << ": 0x"
                            << std::setw(8) << ustr1[i] << " != "
                            << std::setw(8) << ustr2[i] << "\n";
                }
            }
            std::cerr << "Original size " << std::dec << ustr1.size()
                    << ", decoded size " << ustr2.size() << std::endl;
            return 1;
        }
        std::cout << "Success.\n";
    }
    catch(utf8::Error& e) {
        std::cerr << e.reason << std::endl;
        ret = 1;
    }
    return ret;
 }
 /* options for text editors
 kate: replace-trailing-space-save true; space-indent true; tab-width 4;
 */
--- a/src/tests/template
+++ b/src/tests/template
@ -0,0 +1,44 @@
 /* libutf8++/src/tests/???.cpp
 *
 *  (c)2006, Laurence Withers, <l@lwithers.me.uk>.
 *  Released under the GNU GPLv2. See file COPYING or
 *  http://www.gnu.org/copyleft/gpl.html for details.
 */
 #include "utf8"
 #include <iostream>
 /// Skeleton entry point for a new test program.
 ///
 /// With the single argument --print-summary, prints a one-line description and returns 0.
 /// Otherwise runs the test body and returns 0 on success, or 1 if any exception escapes it.
 int main(int argc, char* argv[])
 {
    const bool summary_requested = (argc == 2) && (strcmp(argv[1], "--print-summary") == 0);
    if(summary_requested) {
        std::cout << "One line summary.\n";
        return 0;
    }
    if(argc == 1) {
        // empty argument list
    }
    int status = 0;
    try {
        // TODO
    }
    catch(std::exception& err) {
        status = 1;
        std::cerr << err.what() << std::endl;
    }
    catch(...) {
        status = 1;
        std::cerr << "Unknown exception caught." << std::endl;
    }
    return status;
 }
 /* options for text editors
 kate: replace-trailing-space-save true; space-indent true; tab-width 4;
 vim: expandtab:ts=4:sw=4
 */