Copy from svn repository.
This commit is contained in:
		
							parent
							
								
									73d6e6fbd0
								
							
						
					
					
						commit
						ac22dabfe6
					
				
							
								
								
									
										11
									
								
								README
								
								
								
								
							
							
						
						
									
										11
									
								
								README
								
								
								
								
							|  | @ -10,5 +10,14 @@ Really Quick Instructions | |||
| To build: ./make.sh | ||||
| To install: ./make.sh install | ||||
|     (you might want to set PREFIX, by default it's /usr/local) | ||||
| Documentation is automatically built using doxygen. | ||||
| 
 | ||||
| @TODO@ | ||||
| Dependencies | ||||
| ------------ | ||||
| 
 | ||||
| libutf8, http://www.lwithers.me.uk/projects/libutf8/ | ||||
| 
 | ||||
| Project Homepage | ||||
| ---------------- | ||||
| 
 | ||||
| http://www.lwithers.me.uk/projects/libutf8++/ | ||||
|  |  | |||
|  | @ -0,0 +1 @@ | |||
| doxygen docs docs | ||||
|  | @ -0,0 +1,146 @@ | |||
| # libutf8++/src/docs/Doxyfile.in | ||||
| # | ||||
| #  (c)2006, Laurence Withers, <l@lwithers.me.uk>. | ||||
| #  Released under the GNU GPLv2. See file COPYING or | ||||
| #  http://www.gnu.org/copyleft/gpl.html for details. | ||||
| # | ||||
| 
 | ||||
| PROJECT_NAME           = libutf8++ | ||||
| OUTPUT_DIRECTORY       = | ||||
| CREATE_SUBDIRS         = NO | ||||
| OUTPUT_LANGUAGE        = English | ||||
| USE_WINDOWS_ENCODING   = NO | ||||
| BRIEF_MEMBER_DESC      = YES | ||||
| REPEAT_BRIEF           = YES | ||||
| ABBREVIATE_BRIEF       = | ||||
| ALWAYS_DETAILED_SEC    = NO | ||||
| INLINE_INHERITED_MEMB  = YES | ||||
| FULL_PATH_NAMES        = NO | ||||
| STRIP_FROM_PATH        = | ||||
| STRIP_FROM_INC_PATH    = | ||||
| SHORT_NAMES            = NO | ||||
| JAVADOC_AUTOBRIEF      = NO | ||||
| MULTILINE_CPP_IS_BRIEF = YES | ||||
| DETAILS_AT_TOP         = YES | ||||
| INHERIT_DOCS           = YES | ||||
| DISTRIBUTE_GROUP_DOC   = NO | ||||
| TAB_SIZE               = 4 | ||||
| ALIASES                = | ||||
| OPTIMIZE_OUTPUT_FOR_C  = NO | ||||
| OPTIMIZE_OUTPUT_JAVA   = NO | ||||
| SUBGROUPING            = YES | ||||
| EXTRACT_ALL            = NO | ||||
| EXTRACT_PRIVATE        = NO | ||||
| EXTRACT_STATIC         = NO | ||||
| EXTRACT_LOCAL_CLASSES  = NO | ||||
| EXTRACT_LOCAL_METHODS  = NO | ||||
| HIDE_UNDOC_MEMBERS     = NO | ||||
| HIDE_UNDOC_CLASSES     = NO | ||||
| HIDE_FRIEND_COMPOUNDS  = YES | ||||
| HIDE_IN_BODY_DOCS      = NO | ||||
| INTERNAL_DOCS          = NO | ||||
| CASE_SENSE_NAMES       = YES | ||||
| HIDE_SCOPE_NAMES       = NO | ||||
| SHOW_INCLUDE_FILES     = NO | ||||
| INLINE_INFO            = YES | ||||
| SORT_MEMBER_DOCS       = YES | ||||
| SORT_BRIEF_DOCS        = NO | ||||
| SORT_BY_SCOPE_NAME     = NO | ||||
| GENERATE_TODOLIST      = YES | ||||
| GENERATE_TESTLIST      = YES | ||||
| GENERATE_BUGLIST       = YES | ||||
| GENERATE_DEPRECATEDLIST= YES | ||||
| ENABLED_SECTIONS       = | ||||
| MAX_INITIALIZER_LINES  = 30 | ||||
| SHOW_USED_FILES        = NO | ||||
| SHOW_DIRECTORIES       = NO | ||||
| FILE_VERSION_FILTER    = | ||||
| QUIET                  = YES | ||||
| WARNINGS               = YES | ||||
| WARN_IF_UNDOCUMENTED   = YES | ||||
| WARN_IF_DOC_ERROR      = YES | ||||
| WARN_NO_PARAMDOC       = YES | ||||
| WARN_FORMAT            = "$file:$line: $text" | ||||
| WARN_LOGFILE           = | ||||
| FILE_PATTERNS          = | ||||
| RECURSIVE              = NO | ||||
| EXCLUDE                = | ||||
| EXCLUDE_SYMLINKS       = NO | ||||
| EXCLUDE_PATTERNS       = | ||||
| EXAMPLE_PATH           = | ||||
| EXAMPLE_PATTERNS       = | ||||
| EXAMPLE_RECURSIVE      = NO | ||||
| IMAGE_PATH             = src/docs | ||||
| INPUT_FILTER           = | ||||
| FILTER_PATTERNS        = | ||||
| FILTER_SOURCE_FILES    = NO | ||||
| SOURCE_BROWSER         = NO | ||||
| INLINE_SOURCES         = NO | ||||
| STRIP_CODE_COMMENTS    = YES | ||||
| REFERENCED_BY_RELATION = YES | ||||
| REFERENCES_RELATION    = YES | ||||
| VERBATIM_HEADERS       = NO | ||||
| ALPHABETICAL_INDEX     = YES | ||||
| COLS_IN_ALPHA_INDEX    = 5 | ||||
| IGNORE_PREFIX          = | ||||
| GENERATE_HTML          = YES | ||||
| HTML_OUTPUT            = html | ||||
| HTML_FILE_EXTENSION    = .html | ||||
| HTML_HEADER            = | ||||
| HTML_FOOTER            = | ||||
| HTML_STYLESHEET        = | ||||
| HTML_ALIGN_MEMBERS     = YES | ||||
| GENERATE_HTMLHELP      = NO | ||||
| CHM_FILE               = | ||||
| HHC_LOCATION           = | ||||
| GENERATE_CHI           = NO | ||||
| BINARY_TOC             = NO | ||||
| TOC_EXPAND             = NO | ||||
| DISABLE_INDEX          = NO | ||||
| ENUM_VALUES_PER_LINE   = 4 | ||||
| GENERATE_TREEVIEW      = NO | ||||
| TREEVIEW_WIDTH         = 250 | ||||
| GENERATE_LATEX         = NO | ||||
| GENERATE_RTF           = NO | ||||
| GENERATE_MAN           = NO | ||||
| GENERATE_XML           = NO | ||||
| GENERATE_AUTOGEN_DEF   = NO | ||||
| GENERATE_PERLMOD       = NO | ||||
| ENABLE_PREPROCESSING   = YES | ||||
| MACRO_EXPANSION        = NO | ||||
| EXPAND_ONLY_PREDEF     = NO | ||||
| SEARCH_INCLUDES        = YES | ||||
| INCLUDE_PATH           = | ||||
| INCLUDE_FILE_PATTERNS  = | ||||
| PREDEFINED             = DOXYGEN | ||||
| EXPAND_AS_DEFINED      = | ||||
| SKIP_FUNCTION_MACROS   = YES | ||||
| TAGFILES               = | ||||
| GENERATE_TAGFILE       = | ||||
| ALLEXTERNALS           = NO | ||||
| EXTERNAL_GROUPS        = YES | ||||
| PERL_PATH              = /usr/bin/perl | ||||
| CLASS_DIAGRAMS         = YES | ||||
| HIDE_UNDOC_RELATIONS   = YES | ||||
| HAVE_DOT               = YES | ||||
| CLASS_GRAPH            = YES | ||||
| COLLABORATION_GRAPH    = YES | ||||
| GROUP_GRAPHS           = NO | ||||
| UML_LOOK               = NO | ||||
| TEMPLATE_RELATIONS     = NO | ||||
| INCLUDE_GRAPH          = NO | ||||
| INCLUDED_BY_GRAPH      = NO | ||||
| CALL_GRAPH             = NO | ||||
| GRAPHICAL_HIERARCHY    = YES | ||||
| DIRECTORY_GRAPH        = NO | ||||
| DOT_IMAGE_FORMAT       = png | ||||
| DOT_PATH               = | ||||
| DOTFILE_DIRS           = | ||||
| MAX_DOT_GRAPH_WIDTH    = 1024 | ||||
| MAX_DOT_GRAPH_HEIGHT   = 1024 | ||||
| MAX_DOT_GRAPH_DEPTH    = 0 | ||||
| DOT_TRANSPARENT        = YES | ||||
| DOT_MULTI_TARGETS      = YES | ||||
| GENERATE_LEGEND        = YES | ||||
| DOT_CLEANUP            = YES | ||||
| SEARCHENGINE           = NO | ||||
|  | @ -0,0 +1,15 @@ | |||
| /* libutf8++/src/docs/MainPage.dox | ||||
|  * | ||||
|  *  (c)2006, Laurence Withers, <l@lwithers.me.uk>. | ||||
|  *  Released under the GNU GPLv2. See file COPYING or | ||||
|  *  http://www.gnu.org/copyleft/gpl.html for details. | ||||
| */ | ||||
| 
 | ||||
| /*! \mainpage | ||||
| 
 | ||||
| */ | ||||
| 
 | ||||
| /* options for text editors | ||||
| kate: replace-trailing-space-save true; space-indent true; tab-width 4; | ||||
| vim: expandtab:ts=4:sw=4 | ||||
| */ | ||||
|  | @ -0,0 +1 @@ | |||
| source src/docs/build.docs | ||||
|  | @ -0,0 +1,43 @@ | |||
| # Builds the HTML documentation with Doxygen. | ||||
| # | ||||
| # These are external variables, and shouldn't clash with anything else | ||||
| #  docs_BUILT | ||||
| # | ||||
| 
 | ||||
| # Add the .dox pages to the documentation inputs before invoking the | ||||
| # 'monolithic' build target. | ||||
| MONOLITHIC_DOC="${MONOLITHIC_DOC} $(echo src/docs/*.dox)" | ||||
| build_target monolithic | ||||
| 
 | ||||
| if [ -z "${docs_BUILT}" ] | ||||
| then | ||||
|     echo "Building documentation with Doxygen..." | ||||
| 
 | ||||
|     DOXYFILE=obj/Doxyfile.docs | ||||
| 
 | ||||
|     # Generate the Doxyfile only if it does not exist yet: the template plus | ||||
|     # the input list and the project version. | ||||
|     if [ ! -e "${DOXYFILE}" ] | ||||
|     then | ||||
|         do_cmd cp src/docs/Doxyfile.in "${DOXYFILE}" || return 1 | ||||
|         echo "INPUT = ${MONOLITHIC_DOC}" >> "${DOXYFILE}" | ||||
|         echo "PROJECT_NUMBER = ${VERSION}" >> "${DOXYFILE}" | ||||
|     fi | ||||
| 
 | ||||
|     # Rebuild only if some input is newer than the generated index page. | ||||
|     # MONOLITHIC_DOC is deliberately left unquoted: it is a space-separated list. | ||||
|     MODIFIED=0 | ||||
|     for file in ${MONOLITHIC_DOC} | ||||
|     do | ||||
|         if [ "${file}" -nt html/index.html ] | ||||
|         then | ||||
|             MODIFIED=1 | ||||
|             break | ||||
|         fi | ||||
|     done | ||||
| 
 | ||||
|     if [ "${MODIFIED}" -ne 0 ] | ||||
|     then | ||||
|         do_cmd doxygen "${DOXYFILE}" || return 1 | ||||
|         print_success "Documentation built" | ||||
|     else | ||||
|         print_success "Documentation is up to date" | ||||
|     fi | ||||
| 
 | ||||
|     docs_BUILT=1 | ||||
| fi | ||||
| 
 | ||||
| # kate: replace-trailing-space-save true; space-indent true; tab-width 4; | ||||
| # vim: expandtab:ts=4:sw=4 | ||||
|  | @ -0,0 +1 @@ | |||
| source src/docs/build.install-docs | ||||
|  | @ -0,0 +1,21 @@ | |||
| # Installs the Doxygen output plus COPYING and README into ${DOCSDIR}. | ||||
| build_target docs | ||||
| 
 | ||||
| # create documentation directories | ||||
| echo "Installing documentation into ${DOCSDIR}" | ||||
| build_dir_tree "${DOCSDIR}/html" || return 1 | ||||
| 
 | ||||
| # copy across the Doxygen-generated documentation | ||||
| # (paths are quoted so a DOCSDIR containing spaces is passed as one argument) | ||||
| for file in html/* | ||||
| do | ||||
|     install_file "${file}" "${DOCSDIR}/html" 0644 || return 1 | ||||
| done | ||||
| 
 | ||||
| # copy across the generic files | ||||
| for file in COPYING README | ||||
| do | ||||
|     install_file "${file}" "${DOCSDIR}" 0644 || return 1 | ||||
| done | ||||
| 
 | ||||
| print_success "Documentation installed" | ||||
| # kate: replace-trailing-space-save true; space-indent true; tab-width 4; | ||||
| # vim: expandtab:ts=4:sw=4 | ||||
|  | @ -0,0 +1 @@ | |||
| c++ lib libutf8++ utf8 | ||||
|  | @ -0,0 +1,9 @@ | |||
| /* libutf8++/src/lib/BottomHeader.h
 | ||||
|  * | ||||
|  *  (c)2006, Laurence Withers. Released under the GNU GPL. See file | ||||
|  *  COPYING for more information / terms of license. | ||||
| */ | ||||
| 
 | ||||
| } | ||||
| 
 | ||||
| #endif | ||||
|  | @ -0,0 +1,9 @@ | |||
| /* libutf8++/src/lib/ForwardDeclare.h
 | ||||
|  * | ||||
|  *  (c)2006, Laurence Withers. Released under the GNU GPL. See file | ||||
|  *  COPYING for more information / terms of license. | ||||
| */ | ||||
| 
 | ||||
| // This file simply contains forward declarations of all libutf8++
 | ||||
| // classes, to facilitate header ordering, etc.
 | ||||
| 
 | ||||
|  | @ -0,0 +1,23 @@ | |||
| /* libutf8++/src/lib/TopHeader.h
 | ||||
|  * | ||||
|  *  (c)2006, Laurence Withers. Released under the GNU GPL. See file | ||||
|  *  COPYING for more information / terms of license. | ||||
| */ | ||||
| 
 | ||||
| #ifndef HEADER_libutf8pp | ||||
| #define HEADER_libutf8pp | ||||
| 
 | ||||
| // standard includes, or includes needed for type declarations
 | ||||
| 
 | ||||
| #include <string> | ||||
| #include <stdexcept> | ||||
| #include <utf8.h> | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| /*! \brief UTF-8 handling routines.
 | ||||
| 
 | ||||
| The library's UTF-8 handling routines are all made available through this namespace. | ||||
| 
 | ||||
| */ | ||||
| namespace utf8 { | ||||
|  | @ -0,0 +1,12 @@ | |||
| /* libutf8++/src/lib/TopSource.cpp
 | ||||
|  * | ||||
|  *  (c)2006, Laurence Withers. Released under the GNU GPL. See file | ||||
|  *  COPYING for more information / terms of license. | ||||
| */ | ||||
| 
 | ||||
| #include "utf8" | ||||
| 
 | ||||
| // Below are all the includes used throughout the library.
 | ||||
| 
 | ||||
| #include <sstream> | ||||
| #include <iomanip> | ||||
|  | @ -0,0 +1 @@ | |||
| source src/libutf8++/build.lib | ||||
|  | @ -0,0 +1 @@ | |||
| source src/libutf8++/build.install-lib | ||||
|  | @ -0,0 +1,36 @@ | |||
| # Installs the shared library, its version symlinks, the header and the | ||||
| # pkg-config file. Every step aborts the install on failure. | ||||
| build_target libutf8++ | ||||
| 
 | ||||
| # make paths (this is for Gentoo in particular) | ||||
| build_dir_tree "${LIBDIR}" || return 1 | ||||
| build_dir_tree "${PKGCONFDIR}" || return 1 | ||||
| build_dir_tree "${INCLUDEDIR}" || return 1 | ||||
| 
 | ||||
| # install library | ||||
| echo "Installing libraries into '${LIBDIR}'" | ||||
| install_file "${libutf8pp}" "${LIBDIR}" 0755 || return 1 | ||||
| BASE="${libutf8pp_BASE}.so" | ||||
| MAJOR="${BASE}.${SOMAJOR}" | ||||
| MINOR="${MAJOR}.${SOMINOR}" | ||||
| MICRO="${MINOR}.${SOMICRO}" | ||||
| # a missing symlink leaves the library unusable, so do not ignore failures here | ||||
| install_symlink "${MINOR}" "${MICRO}" "${LIBDIR}" || return 1 | ||||
| install_symlink "${MAJOR}" "${MINOR}" "${LIBDIR}" || return 1 | ||||
| install_symlink "${BASE}" "${MAJOR}" "${LIBDIR}" || return 1 | ||||
| 
 | ||||
| # install header | ||||
| echo "Installing header file '${libutf8pp_HEADER}' into ${INCLUDEDIR}" | ||||
| install_header "${libutf8pp_HEADER}" "${INCLUDEDIR}" 0644 || return 1 | ||||
| 
 | ||||
| # install pkgconfig file (generated from the template by substituting the | ||||
| # @VERSION@, @LIBDIR@ and @INCLUDEDIR@ placeholders) | ||||
| echo "Installing package config file into ${PKGCONFDIR}" | ||||
| PKGCONFFILE="${PKGCONFDIR}/libutf8pp.pc" | ||||
| do_cmd rm -f "${PKGCONFFILE}" || return 1 | ||||
| do_cmd_redir "${PKGCONFFILE}" sed \ | ||||
|     -e "s,@VERSION@,${VERSION}," \ | ||||
|     -e "s,@LIBDIR@,${FINALLIBDIR}," \ | ||||
|     -e "s,@INCLUDEDIR@,${FINALINCLUDEDIR}," \ | ||||
|     src/libutf8++/pkgconf.in || return 1 | ||||
| do_cmd chmod 0644 "${PKGCONFFILE}" || return 1 | ||||
| print_success "Done" | ||||
| 
 | ||||
| # kate: replace-trailing-space-save true; space-indent true; tab-width 4; | ||||
| # vim: expandtab:ts=4:sw=4 | ||||
|  | @ -0,0 +1,51 @@ | |||
| # Builds the libutf8++ shared library from the monolithic source file. | ||||
| # | ||||
| # These are external variables, and shouldn't clash with anything else | ||||
| #  libutf8pp | ||||
| #  libutf8pp_BUILT | ||||
| #  libutf8pp_HEADER | ||||
| #  libutf8pp_BASE | ||||
| 
 | ||||
| if [ -z "${libutf8pp_BUILT}" ] | ||||
| then | ||||
|     libutf8pp_BASE=libutf8++ | ||||
|     source src/libutf8++/soversion | ||||
| 
 | ||||
|     libutf8pp="obj/${libutf8pp_BASE}.so.${SOMAJOR}.${SOMINOR}.${SOMICRO}" | ||||
|     SO_EXTRA="$(pkg-config libutf8 --libs --cflags) -lstdc++ -lc" | ||||
| 
 | ||||
|     echo "Building library ${libutf8pp}..." | ||||
| 
 | ||||
|     do_cmd source src/libutf8++/build.monolithic || return 1 | ||||
| 
 | ||||
|     # Rebuild if any build script or monolithic input is newer than the library. | ||||
|     # The list itself is deliberately unquoted: it is space-separated. | ||||
|     MODIFIED=0 | ||||
|     for test in ${MONOLITHIC_TESTS} ${HDR} ${SRC} | ||||
|     do | ||||
|         if [ "${test}" -nt "${libutf8pp}" ] | ||||
|         then | ||||
|             MODIFIED=1 | ||||
|             break | ||||
|         fi | ||||
|     done | ||||
| 
 | ||||
|     if [ "${MODIFIED}" -ne 0 ] | ||||
|     then | ||||
|         echo " Compiling" | ||||
| 
 | ||||
|         # CFLAGS and SO_EXTRA hold several flags and must be word-split. | ||||
|         SONAME="${libutf8pp_BASE}.so.${SOMAJOR}.${SOMINOR}" | ||||
|         do_cmd ${CXX} ${CFLAGS} -shared -fpic -o "${libutf8pp}" \ | ||||
|             -Wl,-soname,${SONAME} \ | ||||
|             ${SRC} ${SO_EXTRA} || return 1 | ||||
| 
 | ||||
|         # make tests work | ||||
|         do_cmd ln -sf "$(basename "${libutf8pp}")" "obj/${SONAME}" || return 1 | ||||
| 
 | ||||
|         print_success "Library built" | ||||
|     else | ||||
|         print_success "Library up to date" | ||||
|     fi | ||||
| 
 | ||||
|     libutf8pp_BUILT=1 | ||||
|     libutf8pp_HEADER=${HDR} | ||||
| 
 | ||||
| fi | ||||
| # kate: replace-trailing-space-save true; space-indent true; tab-width 4; | ||||
| # vim: expandtab:ts=4:sw=4 | ||||
|  | @ -0,0 +1,21 @@ | |||
| # Assembles the monolithic header and source file for the library. | ||||
| # | ||||
| # These are external variables, and shouldn't clash with anything else | ||||
| #  libutf8pp_MONOLITHIC | ||||
| 
 | ||||
| HDR="obj/utf8" | ||||
| SRC="obj/libutf8++.cpp" | ||||
| 
 | ||||
| MONOLITHIC_TESTS="src/libutf8++/build.lib src/libutf8++/build.monolithic" | ||||
| 
 | ||||
| if ! [ -n "${libutf8pp_MONOLITHIC}" ] | ||||
| then | ||||
|     # Header: parts listed in concatenation order. | ||||
|     MONOLITHIC_SOURCE="src/libutf8++/TopHeader.h src/libutf8++/ForwardDeclare.h" | ||||
|     MONOLITHIC_SOURCE="${MONOLITHIC_SOURCE} src/libutf8++/exception.h src/libutf8++/string.h" | ||||
|     MONOLITHIC_SOURCE="${MONOLITHIC_SOURCE} src/libutf8++/encoder.h src/libutf8++/decoder.h" | ||||
|     MONOLITHIC_SOURCE="${MONOLITHIC_SOURCE} src/libutf8++/BottomHeader.h" | ||||
|     make_monolithic ${HDR} C || return 1 | ||||
| 
 | ||||
|     # Source: parts listed in concatenation order. | ||||
|     MONOLITHIC_SOURCE="src/libutf8++/TopSource.cpp src/libutf8++/exception.cpp" | ||||
|     MONOLITHIC_SOURCE="${MONOLITHIC_SOURCE} src/libutf8++/string.cpp" | ||||
|     MONOLITHIC_SOURCE="${MONOLITHIC_SOURCE} src/libutf8++/encoder.cpp src/libutf8++/decoder.cpp" | ||||
|     make_monolithic ${SRC} C || return 1 | ||||
| 
 | ||||
|     # The generated header is also an input for the documentation. | ||||
|     MONOLITHIC_DOC="${MONOLITHIC_DOC} ${HDR}" | ||||
|     libutf8pp_MONOLITHIC=1 | ||||
| fi | ||||
| # kate: replace-trailing-space-save true; space-indent true; tab-width 4; | ||||
| # vim: expandtab:ts=4:sw=4 | ||||
|  | @ -0,0 +1,159 @@ | |||
| /* libutf8++/src/lib/decoder.cpp
 | ||||
|  * | ||||
|  *  (c)2006, Laurence Withers. Released under the GNU GPL. See file | ||||
|  *  COPYING for more information / terms of license. | ||||
| */ | ||||
| 
 | ||||
| namespace utf8 { | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| // Zeroes the decoder state, allocates an output buffer of at least two wide
 | ||||
| // characters and installs the exception-throwing error handler.
 | ||||
| Decoder::Decoder(size_t hint) | ||||
| { | ||||
|     memset(&ctx, 0, sizeof(ctx)); | ||||
|     ctx.wr_size = (hint < 2) ? 2 : hint; | ||||
|     ctx.wr = new wchar_t[ctx.wr_size]; | ||||
|     ctx.error_callback = _exceptionOnError; | ||||
|     ctx.data = this; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| // Releases the output buffer.
 | ||||
| Decoder::~Decoder() | ||||
| { | ||||
|     delete [] ctx.wr; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| // Decodes the bytes held in \a str (explicit length, so embedded zero bytes are passed on).
 | ||||
| void Decoder::decode(const std::string& str) | ||||
| { | ||||
|     decode(str.data(), str.size()); | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| // Decodes \a amt bytes starting at \a str, appending the result to \a decoded.
 | ||||
| // A negative \a amt means the input is null-terminated.
 | ||||
| void Decoder::decode(const char* str, ssize_t amt) | ||||
| { | ||||
|     ctx.rd = str; | ||||
|     ctx.rd_remain = amt; | ||||
|     while(ctx.rd_remain) { | ||||
|         utf8_decoder(&ctx); | ||||
|         decoded.append(ctx.wr, ctx.written); | ||||
| 
 | ||||
|         // null-terminated input: stop once the terminator has been reached
 | ||||
|         if(ctx.rd_remain < 0 && !*(ctx.rd)) break; | ||||
|         if(ctx.rd_remain) { | ||||
|             // Input is left over, so double the output buffer for the next round.
 | ||||
|             // Allocate before releasing the old buffer: if the allocation throws,
 | ||||
|             // ctx.wr and ctx.wr_size still describe a valid buffer and the
 | ||||
|             // destructor will not delete it twice.
 | ||||
|             const size_t new_size = ctx.wr_size * 2; | ||||
|             wchar_t* new_wr = new wchar_t[new_size]; | ||||
|             delete [] ctx.wr; | ||||
|             ctx.wr = new_wr; | ||||
|             ctx.wr_size = new_size; | ||||
|         } | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| // Reports the decoder state's \a complete flag.
 | ||||
| bool Decoder::complete() const | ||||
| { | ||||
|     return ctx.complete; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| // Clears the decoder state and \a decoded, keeping the output buffer and the
 | ||||
| // currently selected error handler.
 | ||||
| void Decoder::reset() | ||||
| { | ||||
|     size_t old_wr_size = ctx.wr_size; | ||||
|     wchar_t* old_wr = ctx.wr; | ||||
|     utf8_decode_error_callback old_error_callback = ctx.error_callback; | ||||
| 
 | ||||
|     memset(&ctx, 0, sizeof(ctx)); | ||||
|     ctx.wr_size = old_wr_size; | ||||
|     ctx.wr = old_wr; | ||||
|     ctx.error_callback = old_error_callback; | ||||
|     ctx.data = this; | ||||
|     decoded.clear(); | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| // Selects the handler that skips invalid input.
 | ||||
| void Decoder::skipOnError() | ||||
| { | ||||
|     ctx.error_callback = _skipOnError; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| // Selects the handler that substitutes \a ch for invalid input.
 | ||||
| void Decoder::replaceOnError(wchar_t ch) | ||||
| { | ||||
|     replaceChar = ch; | ||||
|     ctx.error_callback = _replaceOnError; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| // Selects the handler that throws BadUTF8Sequence.
 | ||||
| void Decoder::exceptionOnError() | ||||
| { | ||||
|     ctx.error_callback = _exceptionOnError; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| // Error callback: always asks the decoder to skip; all arguments are unused.
 | ||||
| enum utf8_decode_error_action Decoder::_skipOnError | ||||
|     (const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch) | ||||
| { | ||||
|     (void)ctx; | ||||
|     (void)error; | ||||
|     (void)newch; | ||||
|     return utf8_decode_error_action_skip; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| // Error callback: writes the owning Decoder's replacement character to
 | ||||
| // \a newch. The owner is recovered from ctx->data, which the constructor and
 | ||||
| // reset() set to \a this.
 | ||||
| enum utf8_decode_error_action Decoder::_replaceOnError | ||||
|     (const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch) | ||||
| { | ||||
|     (void)error; | ||||
|     Decoder* self = (utf8::Decoder*)(ctx->data); | ||||
|     *newch = self->replaceChar; | ||||
|     return utf8_decode_error_action_replace; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| // Error callback: translates \a error into a description and throws
 | ||||
| // BadUTF8Sequence. Error codes not listed below keep the text "unknown".
 | ||||
| enum utf8_decode_error_action Decoder::_exceptionOnError | ||||
|     (const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch) | ||||
| { | ||||
|     (void)newch; | ||||
|     const char* desc = "unknown"; | ||||
| 
 | ||||
|     switch(error) { | ||||
|     case utf8_decode_error_lone_cchar: | ||||
|         desc = "An invalid continuation byte was encountered while expecting a character."; | ||||
|         break; | ||||
| 
 | ||||
|     case utf8_decode_error_not_cchar: | ||||
|         desc = "A multi-byte sequence contained an invalid byte."; | ||||
|         break; | ||||
| 
 | ||||
|     case utf8_decode_error_not_schar: | ||||
|         desc = "An invalid byte was encountered while expecting a character."; | ||||
|         break; | ||||
| 
 | ||||
|     case utf8_decode_error_overlong: | ||||
|         desc = "An overlong encoding of a character was encountered."; | ||||
|         break; | ||||
| 
 | ||||
|     case utf8_decode_error_illegal_cp: | ||||
|         desc = "An illegal code point (a UTF-16 surrogate perhaps?) was encountered."; | ||||
|         break; | ||||
|     } | ||||
| 
 | ||||
|     throw BadUTF8Sequence(desc, ctx); | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| } | ||||
| 
 | ||||
| /* options for text editors
 | ||||
| kate: replace-trailing-space-save true; space-indent true; tab-width 4; | ||||
| */ | ||||
|  | @ -0,0 +1,128 @@ | |||
| /* libutf8++/src/lib/decoder.h
 | ||||
|  * | ||||
|  *  (c)2006, Laurence Withers. Released under the GNU GPL. See file | ||||
|  *  COPYING for more information / terms of license. | ||||
| */ | ||||
| 
 | ||||
| /*! \brief Stateful UTF-8 decoder object.
 | ||||
| 
 | ||||
| This object is used for stateful decoding of a UTF-8 byte stream. It can be fed the data in | ||||
| arbitrary chunks, even split on non-character boundaries. It writes its output into a wide character | ||||
| string. | ||||
| 
 | ||||
| A variety of error handling modes are available. The default is to throw a BadUTF8Sequence | ||||
| exception, but you can change this with skipOnError() or replaceOnError(). | ||||
| 
 | ||||
| A Decoder owns its internal buffer and cannot be copied. | ||||
| 
 | ||||
| */ | ||||
| class Decoder { | ||||
| public: | ||||
|     /*! \brief Constructor.
 | ||||
| 
 | ||||
|     \param hint Hint at number of characters to allocate space for in decoder buffer. | ||||
| 
 | ||||
|     The constructor sets up the UTF-8 decoder. You can provide a hint as to the size of your input | ||||
|     stream chunks. This hint is the number of characters to allocate in the output buffer; values | ||||
|     below 2 are raised to 2. If, during a single decode operation, this buffer is filled, then it | ||||
|     is doubled in size. | ||||
| 
 | ||||
|      */ | ||||
|     Decoder(size_t hint = 25); | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|     /// Destructor.
 | ||||
|     ~Decoder(); | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|     /// Result of decoding operations (appended to).
 | ||||
|     std::wstring decoded; | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|     /*! \brief UTF-8 decoder.
 | ||||
| 
 | ||||
|     \param str Pointer to source data. | ||||
|     \param amt Number of bytes in source data (-1 for null terminated strings). | ||||
|     \throws BadUTF8Sequence if invalid input is met while in \e exception mode. | ||||
| 
 | ||||
|     This function will decode a chunk of UTF-8 data. The decoded data will be appended to whatever | ||||
|     is contained in the string \a decoded. You can check if the decoder ended on a character | ||||
|     boundary or not by calling complete(). | ||||
| 
 | ||||
|     */ | ||||
|     void decode(const char* str, ssize_t amt); | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|     /// Decode data stored in a std::string.
 | ||||
|     void decode(const std::string& str); | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|     /// Returns \a true if the last call to \a decode() ended on a character boundary.
 | ||||
|     bool complete() const; | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|     /*! \brief Resets the parser for a new UTF-8 stream.
 | ||||
| 
 | ||||
|     This function will clear the internal state of the decoder so that it is ready for data from a | ||||
|     new source. This can be used if you have opened a new file, accepted a new connection, recovered | ||||
|     from an error, etc. It will also clear \a decoded. The selected error handling mode is kept. | ||||
| 
 | ||||
|     */ | ||||
|     void reset(); | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|     /*! \brief Set error handling to \e skip mode.
 | ||||
| 
 | ||||
|     This function will set the error handling into \e skip mode. In this mode, any invalid UTF-8 | ||||
|     byte sequences will simply be skipped altogether, and will not have any effect on the output in | ||||
|     \a decoded. | ||||
| 
 | ||||
|     */ | ||||
|     void skipOnError(); | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|     /*! \brief Set error handling to \e replace mode.
 | ||||
| 
 | ||||
|     \param ch The replacement character that will appear in the output. | ||||
| 
 | ||||
|     This function will set the error handling into \e replace mode. In this mode, any invalid UTF-8 | ||||
|     byte sequences will be skipped, and a replacement character \a ch will be placed onto the output | ||||
|     in \a decoded. The default parameter is the Unicode replacement character (U+FFFD), which is | ||||
|     typically rendered as a question mark inside a diamond. | ||||
| 
 | ||||
|     */ | ||||
|     void replaceOnError(wchar_t ch = 0xFFFD); | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|     /*! \brief Set error handling to \e exception mode (default).
 | ||||
| 
 | ||||
|     This function will set the error handling to \e exception mode. In this mode, any invalid | ||||
|     UTF-8 byte sequences will cause a BadUTF8Sequence exception to be thrown. This is the default | ||||
|     mode. | ||||
| 
 | ||||
|     */ | ||||
|     void exceptionOnError(); | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| private: | ||||
|     // Copying is disabled (declared, never defined). A memberwise copy would
 | ||||
|     // share the output buffer, which both destructors would then delete, and
 | ||||
|     // would leave the state's data pointer referring to the other object.
 | ||||
|     Decoder(const Decoder&); | ||||
|     Decoder& operator=(const Decoder&); | ||||
| 
 | ||||
|     struct utf8_decode_state ctx; | ||||
|     wchar_t replaceChar; | ||||
| 
 | ||||
|     static enum utf8_decode_error_action _skipOnError | ||||
|         (const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch); | ||||
|     static enum utf8_decode_error_action _replaceOnError | ||||
|         (const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch); | ||||
|     static enum utf8_decode_error_action _exceptionOnError | ||||
|         (const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch); | ||||
| }; | ||||
| 
 | ||||
| /* options for text editors
 | ||||
| kate: replace-trailing-space-save true; space-indent true; tab-width 4; | ||||
| */ | ||||
|  | @ -0,0 +1,124 @@ | |||
| /* libutf8++/src/lib/encoder.cpp
 | ||||
|  * | ||||
|  *  (c)2006, Laurence Withers. Released under the GNU GPL. See file | ||||
|  *  COPYING for more information / terms of license. | ||||
| */ | ||||
| 
 | ||||
| namespace utf8 { | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| // Zeroes the encoder state, allocates an output buffer of at least seven
 | ||||
| // bytes and installs the exception-throwing error handler.
 | ||||
| Encoder::Encoder(size_t hint) | ||||
| { | ||||
|     memset(&ctx, 0, sizeof(ctx)); | ||||
|     ctx.wr_size = (hint < 7) ? 7 : hint; | ||||
|     ctx.wr = new char[ctx.wr_size]; | ||||
|     ctx.error_callback = _exceptionOnError; | ||||
|     ctx.data = this; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| // Releases the output buffer.
 | ||||
| Encoder::~Encoder() | ||||
| { | ||||
|     delete [] ctx.wr; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| // Clears the encoder state, keeping the output buffer and the currently
 | ||||
| // selected error handler. The contents of \a encoded are left untouched.
 | ||||
| void Encoder::reset() | ||||
| { | ||||
|     char* wr = ctx.wr; | ||||
|     size_t wr_size = ctx.wr_size; | ||||
|     utf8_encode_error_callback cb = ctx.error_callback; | ||||
|     memset(&ctx, 0, sizeof(ctx)); | ||||
|     ctx.wr = wr; | ||||
|     ctx.wr_size = wr_size; | ||||
|     ctx.error_callback = cb; | ||||
|     ctx.data = this; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| // Encodes the characters held in \a str (explicit length, so embedded zero characters are passed on).
 | ||||
| void Encoder::encode(const std::wstring& str) | ||||
| { | ||||
|     encode(str.data(), str.size()); | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| // Encodes \a amt characters starting at \a str, appending the result to
 | ||||
| // \a encoded. A negative \a amt means the input is null-terminated.
 | ||||
| void Encoder::encode(const wchar_t* str, ssize_t amt) | ||||
| { | ||||
|     ctx.rd = str; | ||||
|     ctx.rd_remain = amt; | ||||
|     while(ctx.rd_remain) { | ||||
|         if(!utf8_encoder(&ctx)) throw BadUnicodeChar(&ctx); | ||||
|         encoded.append(ctx.wr, ctx.written); | ||||
| 
 | ||||
|         // null-terminated input: stop once the terminator has been reached
 | ||||
|         if(ctx.rd_remain < 0 && !*(ctx.rd)) break; | ||||
|         if(ctx.rd_remain) { | ||||
|             // Input is left over, so double the output buffer for the next round.
 | ||||
|             // Allocate before releasing the old buffer: if the allocation throws,
 | ||||
|             // ctx.wr and ctx.wr_size still describe a valid buffer and the
 | ||||
|             // destructor will not delete it twice.
 | ||||
|             const size_t new_size = ctx.wr_size * 2; | ||||
|             char* new_wr = new char[new_size]; | ||||
|             delete [] ctx.wr; | ||||
|             ctx.wr = new_wr; | ||||
|             ctx.wr_size = new_size; | ||||
|         } | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| // Selects the handler that skips invalid characters.
 | ||||
| void Encoder::skipOnError() | ||||
| { | ||||
|     ctx.error_callback = _skipOnError; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| // Selects the handler that substitutes \a ch for invalid characters.
 | ||||
| void Encoder::replaceOnError(wchar_t ch) | ||||
| { | ||||
|     replaceChar = ch; | ||||
|     ctx.error_callback = _replaceOnError; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| // Selects the handler that throws BadUnicodeChar.
 | ||||
| void Encoder::exceptionOnError() | ||||
| { | ||||
|     ctx.error_callback = _exceptionOnError; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| // Error callback: always asks the encoder to skip; both arguments are unused.
 | ||||
| enum utf8_encode_error_action Encoder::_skipOnError | ||||
|     (const struct utf8_encode_state *ctx, wchar_t *newch) | ||||
| { | ||||
|     (void)ctx; | ||||
|     (void)newch; | ||||
|     return utf8_encode_error_action_skip; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| // Error callback: writes the owning Encoder's replacement character to
 | ||||
| // \a newch. The owner is recovered from ctx->data, which the constructor and
 | ||||
| // reset() set to \a this.
 | ||||
| enum utf8_encode_error_action Encoder::_replaceOnError | ||||
|     (const struct utf8_encode_state *ctx, wchar_t *newch) | ||||
| { | ||||
|     Encoder* self = (utf8::Encoder*)(ctx->data); | ||||
|     *newch = self->replaceChar; | ||||
|     return utf8_encode_error_action_replace; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| // Error callback: throws BadUnicodeChar built from the encoder state.
 | ||||
| enum utf8_encode_error_action Encoder::_exceptionOnError | ||||
|     (const struct utf8_encode_state *ctx, wchar_t *newch) | ||||
| { | ||||
|     (void)newch; | ||||
|     throw BadUnicodeChar(ctx); | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| } | ||||
| 
 | ||||
| /* options for text editors
 | ||||
| kate: replace-trailing-space-save true; space-indent true; tab-width 4; | ||||
| */ | ||||
|  | @ -0,0 +1,108 @@ | |||
| /* libutf8++/src/lib/encoder.h
 | ||||
|  * | ||||
|  *  (c)2006, Laurence Withers. Released under the GNU GPL. See file | ||||
|  *  COPYING for more information / terms of license. | ||||
| */ | ||||
| 
 | ||||
| /*! \brief UTF-8 encoder object.
 | ||||
| 
 | ||||
| This object is used to encode Unicode wide characters into UTF-8. It can be fed chunks of characters | ||||
| which it then encodes, appending the result to an internal buffer. | ||||
| 
 | ||||
| */ | ||||
| class Encoder { | ||||
| public: | ||||
|     /*! \brief Constructor.
 | ||||
| 
 | ||||
|     \param hint Number of bytes to allocate for the encoding buffer. | ||||
| 
 | ||||
|     The constructor sets up the encoder and allocates some space for an internal buffer. You can | ||||
|     hint at how large you expect the chunks to be encoded will be. If an encoding operation fills | ||||
|     the buffer without consuming all the input data, the buffer will be doubled in size for the | ||||
|     next round. | ||||
| 
 | ||||
|     */ | ||||
|     Encoder(size_t hint = 100); | ||||
| 
 | ||||
|     /// Destructor.
 | ||||
|     virtual ~Encoder(); | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|     /// UTF-8 output data is appended to this string.
 | ||||
|     std::string encoded; | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|     /*! \brief Encode some data into UTF-8.
 | ||||
| 
 | ||||
|     \param str Pointer to the character array to encode. | ||||
|     \param amt Number of characters to encode. | ||||
| 
 | ||||
|     This function performs an encoding of some Unicode characters into UTF-8. It appends the result | ||||
|     onto \a encoded. | ||||
| 
 | ||||
|     */ | ||||
|     void encode(const wchar_t* str, ssize_t amt); | ||||
| 
 | ||||
|     /// Encode a std::wstring.
 | ||||
|     void encode(const std::wstring& str); | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|     /// Reset the encoder for a new character stream.
 | ||||
|     void reset(); | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|     /*! \brief Set error handling to \e skip mode.
 | ||||
| 
 | ||||
|     This function will set the error handling into \e skip mode. In this mode, any invalid UTF-8 | ||||
|     byte sequences will simply be skipped altogether, and will not have any effect on the output in | ||||
|     \a decoded. | ||||
| 
 | ||||
|     */ | ||||
|     void skipOnError(); | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|     /*! \brief Set error handling to \e replace mode.
 | ||||
| 
 | ||||
|         \param ch The replacement character that will appear in the output. | ||||
| 
 | ||||
|     This function will set the error handling into \e replace mode. In this mode, any invalid UTF-8 | ||||
|     byte sequences will be skipped, and a replacement character \a ch will be placed onto the output | ||||
|     in \a decoded. The default parameter is the unicode replacement character, which should look | ||||
|     like an upside-down question mark. | ||||
| 
 | ||||
|     */ | ||||
|     void replaceOnError(wchar_t ch = 0xFFFD); | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|     /*! \brief Set error handling to \e exception mode (default).
 | ||||
| 
 | ||||
|     This function will set the error handling to \e exception mode. In this mode, any invalid | ||||
|     Unicode characters will cause a BadUnicodeChar exception to be thrown. This is the default | ||||
|     mode. | ||||
| 
 | ||||
|     */ | ||||
|     void exceptionOnError(); | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| private: | ||||
|     struct utf8_encode_state ctx; | ||||
|     wchar_t replaceChar; | ||||
| 
 | ||||
|     static enum utf8_encode_error_action _skipOnError | ||||
|         (const struct utf8_encode_state *ctx, wchar_t *newch); | ||||
|     static enum utf8_encode_error_action _replaceOnError | ||||
|         (const struct utf8_encode_state *ctx, wchar_t *newch); | ||||
|     static enum utf8_encode_error_action _exceptionOnError | ||||
|         (const struct utf8_encode_state *ctx, wchar_t *newch); | ||||
| }; | ||||
| 
 | ||||
| /* options for text editors
 | ||||
| kate: replace-trailing-space-save true; space-indent true; tab-width 4; | ||||
| */ | ||||
|  | @ -0,0 +1,81 @@ | |||
| /* libutf8++/src/lib/exception.cpp
 | ||||
|  * | ||||
|  *  (c)2006, Laurence Withers. Released under the GNU GPL. See file | ||||
|  *  COPYING for more information / terms of license. | ||||
| */ | ||||
| 
 | ||||
| namespace utf8 { | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| Error::Error(const std::string& reason) | ||||
|     : reason(reason) | ||||
| { | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| const char* Error::what() | ||||
| { | ||||
|     return reason.c_str(); | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| BadUnicodeChar::BadUnicodeChar(const struct utf8_encode_state* ctx) | ||||
|     : Error(format(ctx)), badChar(*ctx->rd), line(ctx->line), col(ctx->col), char_offset(ctx->char_offset) | ||||
| { | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| std::string BadUnicodeChar::format(const struct utf8_encode_state* ctx) | ||||
| { | ||||
|     std::ostringstream str; | ||||
| 
 | ||||
|     str << "Invalid Unicode code point encountered." | ||||
|            "\n  Position       : line " | ||||
|         << ctx->line + 1 | ||||
|         << ", column " | ||||
|         << ctx->col + 1 | ||||
|         << "\n  Stream offset  : " | ||||
|         << ctx->char_offset | ||||
|         << " characters\n  Character value: 0x" | ||||
|         << std::hex | ||||
|         << *(ctx->rd); | ||||
| 
 | ||||
|     return str.str(); | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| BadUTF8Sequence::BadUTF8Sequence(const std::string& description, | ||||
|                                  const struct utf8_decode_state* ctx) | ||||
|     : Error(format(description, ctx)), description(description), line(ctx->line + 1), | ||||
|     col(ctx->col + 1), char_offset(ctx->char_offset), byte_offset(ctx->byte_offset) | ||||
| { | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| std::string BadUTF8Sequence::format(const std::string& description, | ||||
|                                     const struct utf8_decode_state* ctx) | ||||
| { | ||||
|     std::ostringstream str; | ||||
| 
 | ||||
|     str << "Bad byte sequence in UTF-8 data.\n" | ||||
|         "  Reason  : " << description | ||||
|         << "\n  Position: line " << ctx->line + 1 | ||||
|         << ", column " << ctx->col + 1 | ||||
|         << "\n  Offset  : " << ctx->char_offset << " chars, " << ctx->byte_offset << " bytes"; | ||||
| 
 | ||||
|     return str.str(); | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| } | ||||
| 
 | ||||
| /* options for text editors
 | ||||
| kate: replace-trailing-space-save true; space-indent true; tab-width 4; | ||||
| */ | ||||
|  | @ -0,0 +1,96 @@ | |||
| /* libutf8++/src/lib/exception.h
 | ||||
|  * | ||||
|  *  (c)2006, Laurence Withers. Released under the GNU GPL. See file | ||||
|  *  COPYING for more information / terms of license. | ||||
| */ | ||||
| 
 | ||||
| /*! \brief Exception base class.
 | ||||
| 
 | ||||
| This is the base class for all libutf8 exceptions. It contains one member, \a reason, which allows | ||||
| you to print a human-readable description of the error. To recover the actual type, you can refer | ||||
| to the more specific derived classes. | ||||
| 
 | ||||
| */ | ||||
class Error : public std::exception {
public:
    /// Human-readable reason for error.
    std::string reason;

    /// Constructor; keeps a copy of \a reason.
    Error(const std::string& reason);

    /// Destructor.
    virtual ~Error() throw()
    { }

    /// Find what caused the error (non-const overload, retained for existing callers).
    virtual const char* what();

    /*! \brief Find what caused the error.

    This is the overload that actually overrides std::exception::what(), so the reason is also
    reported when the exception is caught as a std::exception reference.

    */
    virtual const char* what() const throw()
    {
        return reason.c_str();
    }
};
| 
 | ||||
| 
 | ||||
| 
 | ||||
| /*! \brief Invalid Unicode character exception.
 | ||||
| 
 | ||||
| This exception is thrown when encoding Unicode into UTF-8 and an invalid character is encountered. | ||||
| 
 | ||||
| */ | ||||
| class BadUnicodeChar : public Error { | ||||
| public: | ||||
|     /// A copy of the invalid character.
 | ||||
|     wchar_t badChar; | ||||
| 
 | ||||
|     /// Line of input data at which error occurred (starts at 1).
 | ||||
|     int line; | ||||
| 
 | ||||
|     /// Column of input data at which error occurred (starts at 1).
 | ||||
|     int col; | ||||
| 
 | ||||
|     /// Character offset of input data at which error occurred.
 | ||||
|     int char_offset; | ||||
| 
 | ||||
|     /// Constructor.
 | ||||
|     BadUnicodeChar(const struct utf8_encode_state* ctx); | ||||
| 
 | ||||
| private: | ||||
|     static std::string format(const struct utf8_encode_state* ctx); | ||||
| }; | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| /*! \brief Invalid UTF-8 sequence exception.
 | ||||
| 
 | ||||
| This exception is thrown when decoding UTF-8 and an invalid sequence is encountered. This could be | ||||
| a nonsensical sequence, a redundantly-encoded character or truncated source data. It contains some | ||||
| variables for allowing detailed diagnostics. | ||||
| 
 | ||||
| */ | ||||
| class BadUTF8Sequence : public Error { | ||||
| public: | ||||
|     /// Description of the error, for human diagnostics.
 | ||||
|     std::string description; | ||||
| 
 | ||||
|     /// Line of input data at which error occurred (starts at 1).
 | ||||
|     int line; | ||||
| 
 | ||||
|     /// Column of input data at which error occurred (starts at 1).
 | ||||
|     int col; | ||||
| 
 | ||||
|     /// Character offset of input data at which error occurred.
 | ||||
|     int char_offset; | ||||
| 
 | ||||
|     /// Byte offset of input data at which error occurred.
 | ||||
|     int byte_offset; | ||||
| 
 | ||||
|     /// Constructor.
 | ||||
|     BadUTF8Sequence(const std::string& description, const struct utf8_decode_state* ctx); | ||||
| 
 | ||||
|     /// Destructor.
 | ||||
|     ~BadUTF8Sequence() throw() { } | ||||
| 
 | ||||
| private: | ||||
|     std::string format(const std::string& description, const struct utf8_decode_state* ctx); | ||||
| }; | ||||
| 
 | ||||
| /* options for text editors
 | ||||
| kate: replace-trailing-space-save true; space-indent true; tab-width 4; | ||||
| */ | ||||
|  | @ -0,0 +1,21 @@ | |||
| # libutf8++/src/lib/libutf8++/pkgconf.in | ||||
| # | ||||
| #  Metadata file for pkg-config | ||||
| #  ( http://www.freedesktop.org/software/pkgconfig/ ) | ||||
| # | ||||
| #  (c)2006, Laurence Withers, <l@lwithers.me.uk>. | ||||
| #  Released under the GNU GPLv2. See file COPYING or | ||||
| #  http://www.gnu.org/copyleft/gpl.html for details. | ||||
| # | ||||
| 
 | ||||
| # Name, description | ||||
| Name: libutf8++ | ||||
| Description: C++ wrapper around libutf8 (library for handling UTF-8) | ||||
| Version: @VERSION@ | ||||
| 
 | ||||
| # Requirements | ||||
| Requires: | ||||
| 
 | ||||
| # Compilation information | ||||
| Libs: -L@LIBDIR@ -lutf8++ | ||||
| Cflags: -I@INCLUDEDIR@ | ||||
|  | @ -0,0 +1,17 @@ | |||
| # libutf8++/src/libutf8++/soversion | ||||
| # | ||||
| #  (c)2006, Laurence Withers, <l@lwithers.me.uk>. | ||||
| #  Released under the GNU GPLv2. See file COPYING or | ||||
| #  http://www.gnu.org/copyleft/gpl.html for details. | ||||
| # | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| # SOMAJOR and SOMINOR are included in the library's soname. They need to | ||||
| # be bumped on a binary-incompatible release. They are both single | ||||
| # integers. | ||||
| SOMAJOR=0 | ||||
| SOMINOR=0 | ||||
| 
 | ||||
| # SOMICRO is bumped every time there is a binary-compatible release. | ||||
| SOMICRO=0 | ||||
|  | @ -0,0 +1,126 @@ | |||
| /* libutf8++/src/lib/string.cpp
 | ||||
|  * | ||||
|  *  (c)2006, Laurence Withers. Released under the GNU GPL. See file | ||||
|  *  COPYING for more information / terms of license. | ||||
| */ | ||||
| 
 | ||||
| namespace utf8 { | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| static enum utf8_decode_error_action decode_replace_callback(const struct utf8_decode_state* ctx, | ||||
|     enum utf8_decode_error error, wchar_t* newch) | ||||
| { | ||||
|     (void)error; | ||||
|     *newch = *(wchar_t*)(ctx->data); | ||||
|     return utf8_decode_error_action_replace; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| static enum utf8_decode_error_action decode_error_callback(const struct utf8_decode_state* ctx, | ||||
|     enum utf8_decode_error error, wchar_t* newch) | ||||
| { | ||||
|     (void)newch; | ||||
|     const char* desc = "unknown"; | ||||
| 
 | ||||
|     switch(error) { | ||||
|     case utf8_decode_error_lone_cchar: | ||||
|         desc = "An invalid continuation byte was encountered while expecting a character."; | ||||
|         break; | ||||
| 
 | ||||
|     case utf8_decode_error_not_cchar: | ||||
|         desc = "A multi-byte sequence contained an invalid byte."; | ||||
|         break; | ||||
| 
 | ||||
|     case utf8_decode_error_not_schar: | ||||
|         desc = "An invalid byte was encountered while expecting a character."; | ||||
|         break; | ||||
| 
 | ||||
|     case utf8_decode_error_overlong: | ||||
|         desc = "An overlong encoding of a character was encountered."; | ||||
|         break; | ||||
| 
 | ||||
|     case utf8_decode_error_illegal_cp: | ||||
|         desc = "An illegal code point (a UTF-16 surrogate perhaps?) was encountered."; | ||||
|         break; | ||||
|     } | ||||
| 
 | ||||
|     throw BadUTF8Sequence(desc, ctx); | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| std::wstring decode(const std::string& utf8, bool force, wchar_t replace) | ||||
| { | ||||
|     wchar_t buffer[128]; | ||||
|     struct utf8_decode_state ctx; | ||||
|     memset(&ctx, 0, sizeof(ctx)); | ||||
| 
 | ||||
|     ctx.rd = utf8.data(); | ||||
|     ctx.rd_remain = utf8.size(); | ||||
|     ctx.wr = buffer; | ||||
|     ctx.wr_size = 128; | ||||
|     if(force) { | ||||
|         ctx.error_callback = decode_replace_callback; | ||||
|         ctx.data = &replace; | ||||
|     } else { | ||||
|         ctx.error_callback = decode_error_callback; | ||||
|     } | ||||
| 
 | ||||
|     std::wstring ret; | ||||
| 
 | ||||
|     while(ctx.rd_remain) { | ||||
|         utf8_decoder(&ctx); | ||||
|         ret.append(buffer, ctx.written); | ||||
|     } | ||||
| 
 | ||||
|     return ret; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| static enum utf8_encode_error_action encode_replace_callback(const struct utf8_encode_state* ctx, | ||||
|     wchar_t* newch) | ||||
| { | ||||
|     *newch = *(wchar_t*)(ctx->data); | ||||
|     return utf8_encode_error_action_replace; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| std::string encode(const std::wstring& ustr, bool force, wchar_t replace) | ||||
| { | ||||
|     char buffer[512]; | ||||
|     struct utf8_encode_state ctx; | ||||
|     memset(&ctx, 0, sizeof(ctx)); | ||||
| 
 | ||||
|     ctx.rd = ustr.data(); | ||||
|     ctx.rd_remain = ustr.size(); | ||||
|     ctx.wr = buffer; | ||||
|     ctx.wr_size = 512; | ||||
|     if(force) { | ||||
|         ctx.error_callback = encode_replace_callback; | ||||
|         ctx.data = &replace; | ||||
|     } | ||||
| 
 | ||||
|     std::string ret; | ||||
| 
 | ||||
|     while(ctx.rd_remain) { | ||||
|         if(!utf8_encoder(&ctx)) { | ||||
|             throw BadUnicodeChar(&ctx); | ||||
|         } | ||||
| 
 | ||||
|         ret.append(buffer, ctx.written); | ||||
|     } | ||||
| 
 | ||||
|     return ret; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| } | ||||
| 
 | ||||
| /* options for text editors
 | ||||
| kate: replace-trailing-space-save true; space-indent true; tab-width 4; | ||||
| */ | ||||
|  | @ -0,0 +1,37 @@ | |||
| /* libutf8++/src/lib/string.h
 | ||||
|  * | ||||
|  *  (c)2006, Laurence Withers. Released under the GNU GPL. See file | ||||
|  *  COPYING for more information / terms of license. | ||||
| */ | ||||
| 
 | ||||
| /*! \brief Decode UTF-8.
 | ||||
| 
 | ||||
| \param utf8 The UTF-8 encoded data. | ||||
| \param force If set to \a true, errors will be inhibited. | ||||
| \param replace If \a force is \a true, then invalid UTF-8 sequences will be replaced by this | ||||
|     character. | ||||
| \returns The Unicode wide-character string representation. | ||||
| \throws BadUTF8Sequence if there is an invalid byte sequence in the UTF-8 source data. | ||||
| 
 | ||||
| This function will decode a UTF-8 source string into a Unicode wide-character string. It has a force | ||||
| mode whereby any errors will be inhibited and a best-effort attempt will be made. | ||||
| 
 | ||||
| */ | ||||
| std::wstring decode(const std::string& utf8, bool force = false, wchar_t replace = 0xFFFD); | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| /*! \brief Encode UTF-8.
 | ||||
| 
 | ||||
| \param ustr The Unicode wide-character string. | ||||
| \param force If set to \a true, errors will be inhibited (invalid chars will be replaced). | ||||
| \param replace If \a force is \a true, then invalid Unicode characters will be replaced by this | ||||
|     character. | ||||
| \returns The UTF-8 transformed representation of \a ustr. | ||||
| \throws BadUnicodeChar on invalid characters in the source data. | ||||
| 
 | ||||
| This function will encode a Unicode wide-character string into a UTF-8 transformed representation. | ||||
| It has a force mode whereby any errors will be inhibited and a best-effort attempt will be made. | ||||
| 
 | ||||
| */ | ||||
| std::string encode(const std::wstring& ustr, bool force = false, wchar_t replace = 0xFFFD); | ||||
|  | @ -0,0 +1 @@ | |||
| c++ tests tests libutf8++ | ||||
|  | @ -0,0 +1,3 @@ | |||
| source src/tests/build.tests | ||||
| # kate: replace-trailing-space-save true; space-indent true; tab-width 4; | ||||
| # vim: expandtab:ts=4:sw=4 | ||||
|  | @ -0,0 +1,43 @@ | |||
| # These are external variables, and shouldn't clash with anything else | ||||
| #  tests_BUILT | ||||
| # | ||||
| 
 | ||||
# The test programs link against the library, so make sure it is built first.
build_target libutf8++ || return 1

# tests_BUILT guards against building the tests twice in one run.
if [ -z "${tests_BUILT}" ]
then
    LIBS="${libutf8pp} "
    EXTRAS=""

    echo "Building test programs..."
    do_cmd mkdir -p obj/tests || return 1

    for SRC in src/tests/*.cpp
    do
        # Output name: source file name without directory and without the literal ".cpp" suffix.
        TEST="obj/tests/$(basename "${SRC}" .cpp)"

        # Rebuild if the library, the source or this script is newer than the test binary.
        MODIFIED=0
        for file in ${LIBS} ${SRC} src/tests/build.tests
        do
            if [ "${file}" -nt "${TEST}" ]
            then
                MODIFIED=1
                break
            fi
        done

        if [ "${MODIFIED}" -ne 0 ]
        then
            do_cmd ${CXX} -Iobj ${CFLAGS} -o ${TEST} ${SRC} ${LIBS} ${EXTRAS} || return 1
            print_success "Built ${TEST}"
        else
            print_success "${TEST} is up to date"
        fi
    done

    print_success "All tests built"

    tests_BUILT=1
fi
| 
 | ||||
| # kate: replace-trailing-space-save true; space-indent true; tab-width 4; | ||||
| # vim: expandtab:ts=4:sw=4 | ||||
|  | @ -0,0 +1,85 @@ | |||
| /* libutf8++/src/tests/objects.cpp
 | ||||
|  * | ||||
|  *  (c)2006, Laurence Withers. Released under the GNU GPL. See file | ||||
|  *  COPYING for more information / terms of license. | ||||
| */ | ||||
| 
 | ||||
| #include "utf8" | ||||
| #include <iostream> | ||||
| #include <iomanip> | ||||
| #include <fcntl.h> | ||||
| #include <unistd.h> | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
/*! \brief Fill a buffer with random wide characters.

\param buf Destination array with room for at least \a ch elements.
\param ch Number of wchar_t elements to generate.
\throws int The value 1 if /dev/urandom cannot be opened or does not deliver the full amount.

Raw bytes are read from /dev/urandom, then every element is masked with 0x7FFFFFFF. The file
descriptor is closed on every path once it has been opened.

*/
void make_random(wchar_t* buf, int ch)
{
    int fd = open("/dev/urandom", O_RDONLY);
    if(fd < 0) {
        perror("open(\"/dev/urandom\")");
        throw 1;
    }

    const ssize_t wanted = static_cast<ssize_t>(ch * sizeof(wchar_t));
    if(read(fd, buf, wanted) != wanted) {
        // Report first: close() may overwrite errno.
        perror("read(\"/dev/urandom\")");
        close(fd);
        throw 1;
    }
    close(fd);

    for(int i = 0; i < ch; ++i) {
        buf[i] &= 0x7FFFFFFF;
    }
}
| 
 | ||||
| 
 | ||||
| 
 | ||||
| int main(int argc, char* argv[]) | ||||
| { | ||||
|     if(argc == 2 && !strcmp(argv[1], "--print-summary")) { | ||||
|         std::cout << "Performs some tests on the Encoder and Decoder objects.\n"; | ||||
|         return 0; | ||||
|     } | ||||
| 
 | ||||
|     int ret = 0; | ||||
|     try { | ||||
|         wchar_t wch[1024]; | ||||
|         make_random(wch, 1024); | ||||
| 
 | ||||
|         std::wstring ustr; | ||||
|         ustr.assign(wch, 1024); | ||||
| 
 | ||||
|         utf8::Encoder encoder; | ||||
|         utf8::Decoder decoder; | ||||
| 
 | ||||
|         encoder.encode(ustr); | ||||
|         decoder.decode(encoder.encoded); | ||||
| 
 | ||||
|         if(ustr != decoder.decoded) { | ||||
|             std::cerr << "Decoded string does not match original.\n"; | ||||
|             for(size_t i = 0, end = std::min(ustr.size(), decoder.decoded.size()); i != end; ++i) { | ||||
|                 if(ustr[i] != decoder.decoded[i]) { | ||||
|                     std::cerr << std::dec << std::setfill(' ') << std::setw(4) << i | ||||
|                             << std::setfill('0') << std::hex << ": 0x" | ||||
|                             << std::setw(8) << ustr[i] << " != " | ||||
|                             << std::setw(8) << decoder.decoded[i] << "\n"; | ||||
|                 } | ||||
|             } | ||||
|             std::cerr << "Original size " << std::dec << ustr.size() | ||||
|                     << ", decoded size " << decoder.decoded.size() << std::endl; | ||||
|             return 1; | ||||
|         } | ||||
| 
 | ||||
|         std::cout << "Success.\n"; | ||||
|     } | ||||
|     catch(utf8::Error& e) { | ||||
|         std::cerr << e.reason << std::endl; | ||||
|         ret = 1; | ||||
|     } | ||||
| 
 | ||||
|     return ret; | ||||
| } | ||||
| 
 | ||||
| /* options for text editors
 | ||||
| kate: replace-trailing-space-save true; space-indent true; tab-width 4; | ||||
| */ | ||||
|  | @ -0,0 +1,82 @@ | |||
| /* libutf8++/src/tests/strings.cpp
 | ||||
|  * | ||||
|  *  (c)2006, Laurence Withers. Released under the GNU GPL. See file | ||||
|  *  COPYING for more information / terms of license. | ||||
| */ | ||||
| 
 | ||||
| #include "utf8" | ||||
| #include <iostream> | ||||
| #include <iomanip> | ||||
| #include <fcntl.h> | ||||
| #include <unistd.h> | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
/*! \brief Fill a buffer with random wide characters.

\param buf Destination array with room for at least \a ch elements.
\param ch Number of wchar_t elements to generate.
\throws int The value 1 if /dev/urandom cannot be opened or does not deliver the full amount.

Raw bytes are read from /dev/urandom, then every element is masked with 0x7FFFFFFF. The file
descriptor is closed on every path once it has been opened.

*/
void make_random(wchar_t* buf, int ch)
{
    int fd = open("/dev/urandom", O_RDONLY);
    if(fd < 0) {
        perror("open(\"/dev/urandom\")");
        throw 1;
    }

    const ssize_t wanted = static_cast<ssize_t>(ch * sizeof(wchar_t));
    if(read(fd, buf, wanted) != wanted) {
        // Report first: close() may overwrite errno.
        perror("read(\"/dev/urandom\")");
        close(fd);
        throw 1;
    }
    close(fd);

    for(int i = 0; i < ch; ++i) {
        buf[i] &= 0x7FFFFFFF;
    }
}
| 
 | ||||
| 
 | ||||
| 
 | ||||
| int main(int argc, char* argv[]) | ||||
| { | ||||
|     if(argc == 2 && !strcmp(argv[1], "--print-summary")) { | ||||
|         std::cout << "Performs some tests on the string encode/decode routines.\n"; | ||||
|         return 0; | ||||
|     } | ||||
| 
 | ||||
|     int ret = 0; | ||||
|     try { | ||||
|         wchar_t wch[1024]; | ||||
|         make_random(wch, 1024); | ||||
| 
 | ||||
|         std::wstring ustr1, ustr2; | ||||
|         std::string utf8; | ||||
|         ustr1.assign(wch, 1024); | ||||
|         utf8 = utf8::encode(ustr1); | ||||
|         ustr2 = utf8::decode(utf8); | ||||
| 
 | ||||
|         if(ustr1 != ustr2) { | ||||
|             std::cerr << "Decoded string does not match original.\n"; | ||||
|             for(size_t i = 0, end = std::min(ustr1.size(), ustr2.size()); i != end; ++i) { | ||||
|                 if(ustr1[i] != ustr2[i]) { | ||||
|                     std::cerr << std::dec << std::setfill(' ') << std::setw(4) << i | ||||
|                             << std::setfill('0') << std::hex << ": 0x" | ||||
|                             << std::setw(8) << ustr1[i] << " != " | ||||
|                             << std::setw(8) << ustr2[i] << "\n"; | ||||
|                 } | ||||
|             } | ||||
|             std::cerr << "Original size " << std::dec << ustr1.size() | ||||
|                     << ", decoded size " << ustr2.size() << std::endl; | ||||
|             return 1; | ||||
|         } | ||||
| 
 | ||||
|         std::cout << "Success.\n"; | ||||
|     } | ||||
|     catch(utf8::Error& e) { | ||||
|         std::cerr << e.reason << std::endl; | ||||
|         ret = 1; | ||||
|     } | ||||
| 
 | ||||
|     return ret; | ||||
| } | ||||
| 
 | ||||
| /* options for text editors
 | ||||
| kate: replace-trailing-space-save true; space-indent true; tab-width 4; | ||||
| */ | ||||
|  | @ -0,0 +1,44 @@ | |||
| /* libutf8++/src/tests/???.cpp | ||||
|  * | ||||
|  *  (c)2006, Laurence Withers, <l@lwithers.me.uk>. | ||||
|  *  Released under the GNU GPLv2. See file COPYING or | ||||
|  *  http://www.gnu.org/copyleft/gpl.html for details. | ||||
| */ | ||||
| 
 | ||||
| #include "utf8" | ||||
| 
 | ||||
| #include <iostream> | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
/*! \brief Skeleton entry point for a new test program.

Prints a one-line summary when invoked with the single argument "--print-summary"; otherwise runs
the (still empty) test body and maps any exception onto a failing exit status.

\returns 0 on success, 1 if an exception was caught.

*/
int main(int argc, char* argv[])
{
    const bool summary_requested = (argc == 2) && (strcmp(argv[1], "--print-summary") == 0);
    if(summary_requested) {
        std::cout << "One line summary.\n";
        return 0;
    }

    if(argc == 1) {
        // empty argument list
    }

    int status = 0;
    try {
        // TODO
    }
    catch(std::exception& err) {
        std::cerr << err.what() << std::endl;
        status = 1;
    }
    catch(...) {
        std::cerr << "Unknown exception caught." << std::endl;
        status = 1;
    }

    return status;
}
| 
 | ||||
| /* options for text editors | ||||
| kate: replace-trailing-space-save true; space-indent true; tab-width 4; | ||||
| vim: expandtab:ts=4:sw=4 | ||||
| */ | ||||
		Loading…
	
		Reference in New Issue
	
	 Laurence Withers
						Laurence Withers