Copy from svn repository.
This commit is contained in:
parent
73d6e6fbd0
commit
ac22dabfe6
11
README
11
README
|
@ -10,5 +10,14 @@ Really Quick Instructions
|
||||||
To build: ./make.sh
|
To build: ./make.sh
|
||||||
To install: ./make.sh install
|
To install: ./make.sh install
|
||||||
(you might want to set PREFIX, by default it's /usr/local)
|
(you might want to set PREFIX, by default it's /usr/local)
|
||||||
|
Documentation is automatically built using doxygen.
|
||||||
|
|
||||||
@TODO@
|
Dependencies
|
||||||
|
------------
|
||||||
|
|
||||||
|
libutf8, http://www.lwithers.me.uk/projects/libutf8/
|
||||||
|
|
||||||
|
Project Homepage
|
||||||
|
----------------
|
||||||
|
|
||||||
|
http://www.lwithers.me.uk/projects/libutf8++/
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
doxygen docs docs
|
|
@ -0,0 +1,146 @@
|
||||||
|
# libutf8++/src/docs/Doxyfile.in
|
||||||
|
#
|
||||||
|
# (c)2006, Laurence Withers, <l@lwithers.me.uk>.
|
||||||
|
# Released under the GNU GPLv2. See file COPYING or
|
||||||
|
# http://www.gnu.org/copyleft/gpl.html for details.
|
||||||
|
#
|
||||||
|
|
||||||
|
PROJECT_NAME = libutf8++
|
||||||
|
OUTPUT_DIRECTORY =
|
||||||
|
CREATE_SUBDIRS = NO
|
||||||
|
OUTPUT_LANGUAGE = English
|
||||||
|
USE_WINDOWS_ENCODING = NO
|
||||||
|
BRIEF_MEMBER_DESC = YES
|
||||||
|
REPEAT_BRIEF = YES
|
||||||
|
ABBREVIATE_BRIEF =
|
||||||
|
ALWAYS_DETAILED_SEC = NO
|
||||||
|
INLINE_INHERITED_MEMB = YES
|
||||||
|
FULL_PATH_NAMES = NO
|
||||||
|
STRIP_FROM_PATH =
|
||||||
|
STRIP_FROM_INC_PATH =
|
||||||
|
SHORT_NAMES = NO
|
||||||
|
JAVADOC_AUTOBRIEF = NO
|
||||||
|
MULTILINE_CPP_IS_BRIEF = YES
|
||||||
|
DETAILS_AT_TOP = YES
|
||||||
|
INHERIT_DOCS = YES
|
||||||
|
DISTRIBUTE_GROUP_DOC = NO
|
||||||
|
TAB_SIZE = 4
|
||||||
|
ALIASES =
|
||||||
|
OPTIMIZE_OUTPUT_FOR_C = NO
|
||||||
|
OPTIMIZE_OUTPUT_JAVA = NO
|
||||||
|
SUBGROUPING = YES
|
||||||
|
EXTRACT_ALL = NO
|
||||||
|
EXTRACT_PRIVATE = NO
|
||||||
|
EXTRACT_STATIC = NO
|
||||||
|
EXTRACT_LOCAL_CLASSES = NO
|
||||||
|
EXTRACT_LOCAL_METHODS = NO
|
||||||
|
HIDE_UNDOC_MEMBERS = NO
|
||||||
|
HIDE_UNDOC_CLASSES = NO
|
||||||
|
HIDE_FRIEND_COMPOUNDS = YES
|
||||||
|
HIDE_IN_BODY_DOCS = NO
|
||||||
|
INTERNAL_DOCS = NO
|
||||||
|
CASE_SENSE_NAMES = YES
|
||||||
|
HIDE_SCOPE_NAMES = NO
|
||||||
|
SHOW_INCLUDE_FILES = NO
|
||||||
|
INLINE_INFO = YES
|
||||||
|
SORT_MEMBER_DOCS = YES
|
||||||
|
SORT_BRIEF_DOCS = NO
|
||||||
|
SORT_BY_SCOPE_NAME = NO
|
||||||
|
GENERATE_TODOLIST = YES
|
||||||
|
GENERATE_TESTLIST = YES
|
||||||
|
GENERATE_BUGLIST = YES
|
||||||
|
GENERATE_DEPRECATEDLIST= YES
|
||||||
|
ENABLED_SECTIONS =
|
||||||
|
MAX_INITIALIZER_LINES = 30
|
||||||
|
SHOW_USED_FILES = NO
|
||||||
|
SHOW_DIRECTORIES = NO
|
||||||
|
FILE_VERSION_FILTER =
|
||||||
|
QUIET = YES
|
||||||
|
WARNINGS = YES
|
||||||
|
WARN_IF_UNDOCUMENTED = YES
|
||||||
|
WARN_IF_DOC_ERROR = YES
|
||||||
|
WARN_NO_PARAMDOC = YES
|
||||||
|
WARN_FORMAT = "$file:$line: $text"
|
||||||
|
WARN_LOGFILE =
|
||||||
|
FILE_PATTERNS =
|
||||||
|
RECURSIVE = NO
|
||||||
|
EXCLUDE =
|
||||||
|
EXCLUDE_SYMLINKS = NO
|
||||||
|
EXCLUDE_PATTERNS =
|
||||||
|
EXAMPLE_PATH =
|
||||||
|
EXAMPLE_PATTERNS =
|
||||||
|
EXAMPLE_RECURSIVE = NO
|
||||||
|
IMAGE_PATH = src/docs
|
||||||
|
INPUT_FILTER =
|
||||||
|
FILTER_PATTERNS =
|
||||||
|
FILTER_SOURCE_FILES = NO
|
||||||
|
SOURCE_BROWSER = NO
|
||||||
|
INLINE_SOURCES = NO
|
||||||
|
STRIP_CODE_COMMENTS = YES
|
||||||
|
REFERENCED_BY_RELATION = YES
|
||||||
|
REFERENCES_RELATION = YES
|
||||||
|
VERBATIM_HEADERS = NO
|
||||||
|
ALPHABETICAL_INDEX = YES
|
||||||
|
COLS_IN_ALPHA_INDEX = 5
|
||||||
|
IGNORE_PREFIX =
|
||||||
|
GENERATE_HTML = YES
|
||||||
|
HTML_OUTPUT = html
|
||||||
|
HTML_FILE_EXTENSION = .html
|
||||||
|
HTML_HEADER =
|
||||||
|
HTML_FOOTER =
|
||||||
|
HTML_STYLESHEET =
|
||||||
|
HTML_ALIGN_MEMBERS = YES
|
||||||
|
GENERATE_HTMLHELP = NO
|
||||||
|
CHM_FILE =
|
||||||
|
HHC_LOCATION =
|
||||||
|
GENERATE_CHI = NO
|
||||||
|
BINARY_TOC = NO
|
||||||
|
TOC_EXPAND = NO
|
||||||
|
DISABLE_INDEX = NO
|
||||||
|
ENUM_VALUES_PER_LINE = 4
|
||||||
|
GENERATE_TREEVIEW = NO
|
||||||
|
TREEVIEW_WIDTH = 250
|
||||||
|
GENERATE_LATEX = NO
|
||||||
|
GENERATE_RTF = NO
|
||||||
|
GENERATE_MAN = NO
|
||||||
|
GENERATE_XML = NO
|
||||||
|
GENERATE_AUTOGEN_DEF = NO
|
||||||
|
GENERATE_PERLMOD = NO
|
||||||
|
ENABLE_PREPROCESSING = YES
|
||||||
|
MACRO_EXPANSION = NO
|
||||||
|
EXPAND_ONLY_PREDEF = NO
|
||||||
|
SEARCH_INCLUDES = YES
|
||||||
|
INCLUDE_PATH =
|
||||||
|
INCLUDE_FILE_PATTERNS =
|
||||||
|
PREDEFINED = DOXYGEN
|
||||||
|
EXPAND_AS_DEFINED =
|
||||||
|
SKIP_FUNCTION_MACROS = YES
|
||||||
|
TAGFILES =
|
||||||
|
GENERATE_TAGFILE =
|
||||||
|
ALLEXTERNALS = NO
|
||||||
|
EXTERNAL_GROUPS = YES
|
||||||
|
PERL_PATH = /usr/bin/perl
|
||||||
|
CLASS_DIAGRAMS = YES
|
||||||
|
HIDE_UNDOC_RELATIONS = YES
|
||||||
|
HAVE_DOT = YES
|
||||||
|
CLASS_GRAPH = YES
|
||||||
|
COLLABORATION_GRAPH = YES
|
||||||
|
GROUP_GRAPHS = NO
|
||||||
|
UML_LOOK = NO
|
||||||
|
TEMPLATE_RELATIONS = NO
|
||||||
|
INCLUDE_GRAPH = NO
|
||||||
|
INCLUDED_BY_GRAPH = NO
|
||||||
|
CALL_GRAPH = NO
|
||||||
|
GRAPHICAL_HIERARCHY = YES
|
||||||
|
DIRECTORY_GRAPH = NO
|
||||||
|
DOT_IMAGE_FORMAT = png
|
||||||
|
DOT_PATH =
|
||||||
|
DOTFILE_DIRS =
|
||||||
|
MAX_DOT_GRAPH_WIDTH = 1024
|
||||||
|
MAX_DOT_GRAPH_HEIGHT = 1024
|
||||||
|
MAX_DOT_GRAPH_DEPTH = 0
|
||||||
|
DOT_TRANSPARENT = YES
|
||||||
|
DOT_MULTI_TARGETS = YES
|
||||||
|
GENERATE_LEGEND = YES
|
||||||
|
DOT_CLEANUP = YES
|
||||||
|
SEARCHENGINE = NO
|
|
@ -0,0 +1,15 @@
|
||||||
|
/* libutf8++/src/docs/MainPage.dox
|
||||||
|
*
|
||||||
|
* (c)2006, Laurence Withers, <l@lwithers.me.uk>.
|
||||||
|
* Released under the GNU GPLv2. See file COPYING or
|
||||||
|
* http://www.gnu.org/copyleft/gpl.html for details.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*! \mainpage
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* options for text editors
|
||||||
|
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||||
|
vim: expandtab:ts=4:sw=4
|
||||||
|
*/
|
|
@ -0,0 +1 @@
|
||||||
|
source src/docs/build.docs
|
|
@ -0,0 +1,43 @@
|
||||||
|
# Builds the Doxygen HTML documentation (sourced by the build system).
#
# These are external variables, and shouldn't clash with anything else
#  docs_BUILT
#
# Relies on helpers provided by the sourcing script: build_target, do_cmd, print_success.

MONOLITHIC_DOC="${MONOLITHIC_DOC} $(echo src/docs/*.dox)"
build_target monolithic || return 1

# Quoted so the guard is a well-formed test whether or not docs_BUILT is set.
if [ -z "${docs_BUILT}" ]
then
    echo "Building documentation with Doxygen..."

    DOXYFILE=obj/Doxyfile.docs

    # The Doxyfile is generated once from the template; INPUT and PROJECT_NUMBER are appended.
    if [ ! -e "${DOXYFILE}" ]
    then
        do_cmd cp src/docs/Doxyfile.in "${DOXYFILE}" || return 1
        echo "INPUT = ${MONOLITHIC_DOC}" >> "${DOXYFILE}" || return 1
        echo "PROJECT_NUMBER = ${VERSION}" >> "${DOXYFILE}" || return 1
    fi

    # Rebuild only if some input is newer than the generated index page.
    # MONOLITHIC_DOC is a space-separated list, so it is deliberately left unquoted here.
    MODIFIED=0
    for file in ${MONOLITHIC_DOC}
    do
        if [ "${file}" -nt html/index.html ]
        then
            MODIFIED=1
            break
        fi
    done

    if [ "${MODIFIED}" -ne 0 ]
    then
        do_cmd doxygen "${DOXYFILE}" || return 1
        print_success "Documentation built"
    else
        print_success "Documentation is up to date"
    fi

    docs_BUILT=1
fi
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
# vim: expandtab:ts=4:sw=4
|
|
@ -0,0 +1 @@
|
||||||
|
source src/docs/build.install-docs
|
|
@ -0,0 +1,21 @@
|
||||||
|
# Installs the generated HTML documentation plus COPYING and README into ${DOCSDIR}.
# Relies on helpers provided by the sourcing script: build_target, build_dir_tree,
# install_file, print_success.

# Do not install anything if the documentation target itself failed.
build_target docs || return 1

# create documentation directories
echo "Installing documentation into ${DOCSDIR}"
build_dir_tree "${DOCSDIR}/html" || return 1

# copy across the Doxygen-generated documentation
# (paths are quoted, matching the build_dir_tree call above)
for file in html/*
do
    install_file "${file}" "${DOCSDIR}/html" 0644 || return 1
done

# copy across the generic files
for file in COPYING README
do
    install_file "${file}" "${DOCSDIR}" 0644 || return 1
done

print_success "Documentation installed"
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
# vim: expandtab:ts=4:sw=4
|
|
@ -0,0 +1 @@
|
||||||
|
c++ lib libutf8++ utf8
|
|
@ -0,0 +1,9 @@
|
||||||
|
/* libutf8++/src/lib/BottomHeader.h
|
||||||
|
*
|
||||||
|
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||||
|
* COPYING for more information / terms of license.
|
||||||
|
*/
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
|
@ -0,0 +1,9 @@
|
||||||
|
/* libutf8++/src/lib/ForwardDeclare.h
|
||||||
|
*
|
||||||
|
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||||
|
* COPYING for more information / terms of license.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// This file simply contains forward declarations of all libutf8++
|
||||||
|
// classes, to facilitate header ordering, etc.
|
||||||
|
|
|
@ -0,0 +1,23 @@
|
||||||
|
/* libutf8++/src/lib/TopHeader.h
|
||||||
|
*
|
||||||
|
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||||
|
* COPYING for more information / terms of license.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef HEADER_libutf8pp
|
||||||
|
#define HEADER_libutf8pp
|
||||||
|
|
||||||
|
// standard includes, or includes needed for type declarations
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <stdexcept>
|
||||||
|
#include <utf8.h>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*! \brief UTF-8 handling routines.
|
||||||
|
|
||||||
|
The library's UTF-8 handling routines are all made available through this namespace.
|
||||||
|
|
||||||
|
*/
|
||||||
|
namespace utf8 {
|
|
@ -0,0 +1,12 @@
|
||||||
|
/* libutf8++/src/lib/TopSource.cpp
|
||||||
|
*
|
||||||
|
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||||
|
* COPYING for more information / terms of license.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "utf8"
|
||||||
|
|
||||||
|
// Below are all the includes used throughout the library.
|
||||||
|
|
||||||
|
#include <cstring>
#include <iomanip>
#include <sstream>
|
|
@ -0,0 +1 @@
|
||||||
|
source src/libutf8++/build.lib
|
|
@ -0,0 +1 @@
|
||||||
|
source src/libutf8++/build.install-lib
|
|
@ -0,0 +1,36 @@
|
||||||
|
# Installs the shared library, its version symlinks, the header and the pkg-config file.
# Relies on helpers provided by the sourcing script: build_target, build_dir_tree,
# install_file, install_symlink, install_header, do_cmd, do_cmd_redir, print_success.

# Do not install anything if the library target itself failed.
build_target libutf8++ || return 1

# make paths (this is for Gentoo in particular)
build_dir_tree "${LIBDIR}" || return 1
build_dir_tree "${PKGCONFDIR}" || return 1
build_dir_tree "${INCLUDEDIR}" || return 1

# install library
echo "Installing libraries into '${LIBDIR}'"
install_file "${libutf8pp}" "${LIBDIR}" 0755 || return 1
BASE="${libutf8pp_BASE}.so"
MAJOR="${BASE}.${SOMAJOR}"
MINOR="${MAJOR}.${SOMINOR}"
MICRO="${MINOR}.${SOMICRO}"
# Symlink chain: .so -> .so.MAJOR -> .so.MAJOR.MINOR -> .so.MAJOR.MINOR.MICRO
install_symlink "${MINOR}" "${MICRO}" "${LIBDIR}" || return 1
install_symlink "${MAJOR}" "${MINOR}" "${LIBDIR}" || return 1
install_symlink "${BASE}" "${MAJOR}" "${LIBDIR}" || return 1

# install header
echo "Installing header file '${libutf8pp_HEADER}' into ${INCLUDEDIR}"
install_header "${libutf8pp_HEADER}" "${INCLUDEDIR}" 0644 || return 1

# install pkgconfig file
echo "Installing package config file into ${PKGCONFDIR}"
PKGCONFFILE="${PKGCONFDIR}/libutf8pp.pc"
do_cmd rm -f "${PKGCONFFILE}" || return 1
# Fill in the template placeholders; abort rather than report success on failure.
do_cmd_redir "${PKGCONFFILE}" sed \
    -e "s,@VERSION@,${VERSION}," \
    -e "s,@LIBDIR@,${FINALLIBDIR}," \
    -e "s,@INCLUDEDIR@,${FINALINCLUDEDIR}," \
    src/libutf8++/pkgconf.in || return 1
do_cmd chmod 0644 "${PKGCONFFILE}" || return 1
print_success "Done"

# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
# vim: expandtab:ts=4:sw=4
|
|
@ -0,0 +1,51 @@
|
||||||
|
# Builds the libutf8++ shared library from the monolithic source (sourced by the build system).
#
# These are external variables, and shouldn't clash with anything else
#  libutf8pp
#  libutf8pp_BUILT
#  libutf8pp_HEADER
#  libutf8pp_BASE
#
# Relies on helpers provided by the sourcing script: do_cmd, print_success.

# Quoted so the guard is a well-formed test whether or not libutf8pp_BUILT is set.
if [ -z "${libutf8pp_BUILT}" ]
then
    libutf8pp_BASE=libutf8++
    # Provides SOMAJOR / SOMINOR / SOMICRO; without them the names below are meaningless.
    source src/libutf8++/soversion || return 1

    libutf8pp="obj/${libutf8pp_BASE}.so.${SOMAJOR}.${SOMINOR}.${SOMICRO}"
    SO_EXTRA="$(pkg-config libutf8 --libs --cflags) -lstdc++ -lc"

    echo "Building library ${libutf8pp}..."

    do_cmd source src/libutf8++/build.monolithic || return 1

    # Rebuild if any build script or monolithic file is newer than the library.
    # The list variables are space-separated, so they are deliberately left unquoted here.
    MODIFIED=0
    for test in ${MONOLITHIC_TESTS} ${HDR} ${SRC}
    do
        if [ "${test}" -nt "${libutf8pp}" ]
        then
            MODIFIED=1
            break
        fi
    done

    if [ "${MODIFIED}" -ne 0 ]
    then
        echo " Compiling"

        SONAME="${libutf8pp_BASE}.so.${SOMAJOR}.${SOMINOR}"
        # CXX, CFLAGS, SRC and SO_EXTRA may each hold several words and must be split.
        do_cmd ${CXX} ${CFLAGS} -shared -fpic -o "${libutf8pp}" \
                 "-Wl,-soname,${SONAME}" \
                 ${SRC} ${SO_EXTRA} || return 1

        # make tests work
        do_cmd ln -sf "$(basename "${libutf8pp}")" "obj/${SONAME}" || return 1

        print_success "Library built"
    else
        print_success "Library up to date"
    fi

    libutf8pp_BUILT=1
    libutf8pp_HEADER=${HDR}

fi
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
# vim: expandtab:ts=4:sw=4
|
|
@ -0,0 +1,21 @@
|
||||||
|
# Assembles the monolithic header (obj/utf8) and source (obj/libutf8++.cpp).
#
# These are external variables, and shouldn't clash with anything else
#  libutf8pp_MONOLITHIC

SRC="obj/libutf8++.cpp"
HDR="obj/utf8"

MONOLITHIC_TESTS="src/libutf8++/build.lib src/libutf8++/build.monolithic"

if [ -z "${libutf8pp_MONOLITHIC}" ]
then
    # Header fragments, in concatenation order.
    MONOLITHIC_SOURCE="src/libutf8++/TopHeader.h src/libutf8++/ForwardDeclare.h src/libutf8++/exception.h src/libutf8++/string.h src/libutf8++/encoder.h src/libutf8++/decoder.h src/libutf8++/BottomHeader.h"
    make_monolithic ${HDR} C || return 1

    # Source fragments, in concatenation order.
    MONOLITHIC_SOURCE="src/libutf8++/TopSource.cpp src/libutf8++/exception.cpp src/libutf8++/string.cpp src/libutf8++/encoder.cpp src/libutf8++/decoder.cpp"
    make_monolithic ${SRC} C || return 1

    libutf8pp_MONOLITHIC=1
    # The generated header is also an input for the documentation build.
    MONOLITHIC_DOC="${MONOLITHIC_DOC} ${HDR}"
fi
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
# vim: expandtab:ts=4:sw=4
|
|
@ -0,0 +1,159 @@
|
||||||
|
/* libutf8++/src/lib/decoder.cpp
|
||||||
|
*
|
||||||
|
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||||
|
* COPYING for more information / terms of license.
|
||||||
|
*/
|
||||||
|
|
||||||
|
namespace utf8 {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/// Zero the decode context, allocate its output buffer and select the exception error handler.
Decoder::Decoder(size_t hint)
{
    // The buffer always holds at least two wide characters, whatever the caller hinted.
    const size_t bufferChars = (hint < 2) ? 2 : hint;

    memset(&ctx, 0, sizeof(ctx));
    ctx.wr_size = bufferChars;
    ctx.wr = new wchar_t[bufferChars];

    // The static callbacks receive the context only; ctx.data leads them back to this object.
    ctx.data = this;
    ctx.error_callback = _exceptionOnError;
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/// Free the output buffer held in the decode context.
Decoder::~Decoder()
{
    delete[] ctx.wr;
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
void Decoder::decode(const std::string& str)
|
||||||
|
{
|
||||||
|
decode(str.data(), str.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
void Decoder::decode(const char* str, ssize_t amt)
|
||||||
|
{
|
||||||
|
ctx.rd = str;
|
||||||
|
ctx.rd_remain = amt;
|
||||||
|
while(ctx.rd_remain) {
|
||||||
|
utf8_decoder(&ctx);
|
||||||
|
decoded.append(ctx.wr, ctx.written);
|
||||||
|
|
||||||
|
if(ctx.rd_remain < 0 && !*(ctx.rd)) break;
|
||||||
|
if(ctx.rd_remain) {
|
||||||
|
ctx.wr_size *= 2;
|
||||||
|
delete [] ctx.wr;
|
||||||
|
ctx.wr = new wchar_t[ctx.wr_size];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
bool Decoder::complete() const
|
||||||
|
{
|
||||||
|
return ctx.complete;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
void Decoder::reset()
|
||||||
|
{
|
||||||
|
size_t old_wr_size = ctx.wr_size;
|
||||||
|
wchar_t* old_wr = ctx.wr;
|
||||||
|
utf8_decode_error_callback old_error_callback = ctx.error_callback;
|
||||||
|
|
||||||
|
memset(&ctx, 0, sizeof(ctx));
|
||||||
|
ctx.wr_size = old_wr_size;
|
||||||
|
ctx.wr = old_wr;
|
||||||
|
ctx.error_callback = old_error_callback;
|
||||||
|
ctx.data = this;
|
||||||
|
decoded.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
void Decoder::skipOnError()
|
||||||
|
{
|
||||||
|
ctx.error_callback = _skipOnError;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
void Decoder::replaceOnError(wchar_t ch)
|
||||||
|
{
|
||||||
|
replaceChar = ch;
|
||||||
|
ctx.error_callback = _replaceOnError;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
void Decoder::exceptionOnError()
|
||||||
|
{
|
||||||
|
ctx.error_callback = _exceptionOnError;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
enum utf8_decode_error_action Decoder::_skipOnError
|
||||||
|
(const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch)
|
||||||
|
{
|
||||||
|
(void)ctx;
|
||||||
|
(void)error;
|
||||||
|
(void)newch;
|
||||||
|
return utf8_decode_error_action_skip;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
enum utf8_decode_error_action Decoder::_replaceOnError
|
||||||
|
(const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch)
|
||||||
|
{
|
||||||
|
(void)error;
|
||||||
|
Decoder* self = (utf8::Decoder*)(ctx->data);
|
||||||
|
*newch = self->replaceChar;
|
||||||
|
return utf8_decode_error_action_replace;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
enum utf8_decode_error_action Decoder::_exceptionOnError
|
||||||
|
(const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch)
|
||||||
|
{
|
||||||
|
(void)newch;
|
||||||
|
const char* desc = "unknown";
|
||||||
|
|
||||||
|
switch(error) {
|
||||||
|
case utf8_decode_error_lone_cchar:
|
||||||
|
desc = "An invalid continuation byte was encountered while expecting a character.";
|
||||||
|
break;
|
||||||
|
|
||||||
|
case utf8_decode_error_not_cchar:
|
||||||
|
desc = "A multi-byte sequence contained an invalid byte.";
|
||||||
|
break;
|
||||||
|
|
||||||
|
case utf8_decode_error_not_schar:
|
||||||
|
desc = "An invalid byte was encountered while expecting a character.";
|
||||||
|
break;
|
||||||
|
|
||||||
|
case utf8_decode_error_overlong:
|
||||||
|
desc = "An overlong encoding of a character was encountered.";
|
||||||
|
break;
|
||||||
|
|
||||||
|
case utf8_decode_error_illegal_cp:
|
||||||
|
desc = "An illegal code point (a UTF-16 surrogate perhaps?) was encountered.";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
throw BadUTF8Sequence(desc, ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/* options for text editors
|
||||||
|
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||||
|
*/
|
|
@ -0,0 +1,128 @@
|
||||||
|
/* libutf8++/src/lib/decoder.h
|
||||||
|
*
|
||||||
|
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||||
|
* COPYING for more information / terms of license.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*! \brief Stateful UTF-8 decoder object.
|
||||||
|
|
||||||
|
This object is used for stateful decoding of a UTF-8 byte stream. It can be fed the data in
|
||||||
|
arbitrary chunks, even split on non-character boundaries. It writes its output into a wide character
|
||||||
|
string.
|
||||||
|
|
||||||
|
A variety of error handling modes are available. The default is to throw a BadUTF8Sequence
|
||||||
|
exception, but you can change this with skipOnError() or replaceOnError().
|
||||||
|
|
||||||
|
*/
|
||||||
|
class Decoder {
|
||||||
|
public:
|
||||||
|
/*! \brief Constructor.
|
||||||
|
|
||||||
|
\param hint Hint at number of characters to allocate space for in decoder buffer.
|
||||||
|
|
||||||
|
The constructor sets up the UTF-8 decoder. You can provide a hint as to the size of your input
|
||||||
|
stream chunks. This hint is the number of characters to allocate in the output buffer. If,
|
||||||
|
during a single decode operation, this buffer is filled, then it is doubled in size.
|
||||||
|
|
||||||
|
*/
|
||||||
|
Decoder(size_t hint = 25);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/// Destructor.
|
||||||
|
~Decoder();
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/// Result of decoding operations (appended to).
|
||||||
|
std::wstring decoded;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*! \brief UTF-8 decoder.
|
||||||
|
|
||||||
|
\param str Pointer to source data.
|
||||||
|
\param amt Number of bytes in source data (-1 for null terminated strings).
|
||||||
|
\throws BadUTF8Sequence.
|
||||||
|
|
||||||
|
This function will decode a chunk of UTF-8 data. The decoded data will be appended to whatever
|
||||||
|
is contained in the string decoded. You can check if the decoder ended on a character boundary
|
||||||
|
or not by calling complete().
|
||||||
|
|
||||||
|
*/
|
||||||
|
void decode(const char* str, ssize_t amt);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/// Decode data stored in a std::string.
|
||||||
|
void decode(const std::string& str);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/// Returns \a true if the last call to \a decode() ended on a character boundary.
|
||||||
|
bool complete() const;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*! \brief Resets the parser for a new UTF-8 stream.
|
||||||
|
|
||||||
|
This function will clear the internal state of the decoder so that it is ready for data from a
|
||||||
|
new source. This can be used if you have opened a new file, accepted a new connection, recovered
|
||||||
|
from an error, etc. It will also clear \a decoded.
|
||||||
|
|
||||||
|
*/
|
||||||
|
void reset();
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*! \brief Set error handling to \e skip mode.
|
||||||
|
|
||||||
|
This function will set the error handling into \e skip mode. In this mode, any invalid UTF-8
|
||||||
|
byte sequences will simply be skipped altogether, and will not have any effect on the output in
|
||||||
|
\a decoded.
|
||||||
|
|
||||||
|
*/
|
||||||
|
void skipOnError();
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*! \brief Set error handling to \e replace mode.
|
||||||
|
|
||||||
|
\param ch The replacement character that will appear in the output.
|
||||||
|
|
||||||
|
This function will set the error handling into \e replace mode. In this mode, any invalid UTF-8
|
||||||
|
byte sequences will be skipped, and a replacement character \a ch will be placed onto the output
|
||||||
|
in \a decoded. The default parameter is the unicode replacement character, which should look
|
||||||
|
like an upside-down question mark.
|
||||||
|
|
||||||
|
*/
|
||||||
|
void replaceOnError(wchar_t ch = 0xFFFD);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*! \brief Set error handling to \e exception mode (default).
|
||||||
|
|
||||||
|
This function will set the error handling to \e exception mode. In this mode, any invalid
|
||||||
|
UTF-8 byte sequences will cause a BadUTF8Sequence exception to be thrown. This is the default
|
||||||
|
mode.
|
||||||
|
|
||||||
|
*/
|
||||||
|
void exceptionOnError();
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
private:
|
||||||
|
struct utf8_decode_state ctx;
|
||||||
|
wchar_t replaceChar;
|
||||||
|
|
||||||
|
static enum utf8_decode_error_action _skipOnError
|
||||||
|
(const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch);
|
||||||
|
static enum utf8_decode_error_action _replaceOnError
|
||||||
|
(const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch);
|
||||||
|
static enum utf8_decode_error_action _exceptionOnError
|
||||||
|
(const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch);
|
||||||
|
};
|
||||||
|
|
||||||
|
/* options for text editors
|
||||||
|
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||||
|
*/
|
|
@ -0,0 +1,124 @@
|
||||||
|
/* libutf8++/src/lib/encoder.cpp
|
||||||
|
*
|
||||||
|
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||||
|
* COPYING for more information / terms of license.
|
||||||
|
*/
|
||||||
|
|
||||||
|
namespace utf8 {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/// Zero the encode context, allocate its output buffer and select the exception error handler.
Encoder::Encoder(size_t hint)
{
    // The buffer always holds at least seven bytes, whatever the caller hinted.
    const size_t bufferBytes = (hint < 7) ? 7 : hint;

    memset(&ctx, 0, sizeof(ctx));
    ctx.wr_size = bufferBytes;
    ctx.wr = new char[bufferBytes];

    // The static callbacks receive the context only; ctx.data leads them back to this object.
    ctx.data = this;
    ctx.error_callback = _exceptionOnError;
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/// Free the output buffer held in the encode context.
Encoder::~Encoder()
{
    delete[] ctx.wr;
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
void Encoder::reset()
|
||||||
|
{
|
||||||
|
char* wr = ctx.wr;
|
||||||
|
size_t wr_size = ctx.wr_size;
|
||||||
|
utf8_encode_error_callback cb = ctx.error_callback;
|
||||||
|
memset(&ctx, 0, sizeof(ctx));
|
||||||
|
ctx.wr = wr;
|
||||||
|
ctx.wr_size = wr_size;
|
||||||
|
ctx.error_callback = cb;
|
||||||
|
ctx.data = this;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
void Encoder::encode(const std::wstring& str)
|
||||||
|
{
|
||||||
|
encode(str.data(), str.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
void Encoder::encode(const wchar_t* str, ssize_t amt)
|
||||||
|
{
|
||||||
|
ctx.rd = str;
|
||||||
|
ctx.rd_remain = amt;
|
||||||
|
while(ctx.rd_remain) {
|
||||||
|
if(!utf8_encoder(&ctx)) throw BadUnicodeChar(&ctx);
|
||||||
|
encoded.append(ctx.wr, ctx.written);
|
||||||
|
|
||||||
|
if(ctx.rd_remain < 0 && !*(ctx.rd)) break;
|
||||||
|
if(ctx.rd_remain) {
|
||||||
|
ctx.wr_size *= 2;
|
||||||
|
delete [] ctx.wr;
|
||||||
|
ctx.wr = new char[ctx.wr_size];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
void Encoder::skipOnError()
|
||||||
|
{
|
||||||
|
ctx.error_callback = _skipOnError;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
void Encoder::replaceOnError(wchar_t ch)
|
||||||
|
{
|
||||||
|
replaceChar = ch;
|
||||||
|
ctx.error_callback = _replaceOnError;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
void Encoder::exceptionOnError()
|
||||||
|
{
|
||||||
|
ctx.error_callback = _exceptionOnError;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
enum utf8_encode_error_action Encoder::_skipOnError
|
||||||
|
(const struct utf8_encode_state *ctx, wchar_t *newch)
|
||||||
|
{
|
||||||
|
(void)ctx;
|
||||||
|
(void)newch;
|
||||||
|
return utf8_encode_error_action_skip;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
enum utf8_encode_error_action Encoder::_replaceOnError
|
||||||
|
(const struct utf8_encode_state *ctx, wchar_t *newch)
|
||||||
|
{
|
||||||
|
Encoder* self = (utf8::Encoder*)(ctx->data);
|
||||||
|
*newch = self->replaceChar;
|
||||||
|
return utf8_encode_error_action_replace;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
enum utf8_encode_error_action Encoder::_exceptionOnError
|
||||||
|
(const struct utf8_encode_state *ctx, wchar_t *newch)
|
||||||
|
{
|
||||||
|
(void)newch;
|
||||||
|
throw BadUnicodeChar(ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/* options for text editors
|
||||||
|
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||||
|
*/
|
|
@ -0,0 +1,108 @@
|
||||||
|
/* libutf8++/src/lib/encoder.h
|
||||||
|
*
|
||||||
|
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||||
|
* COPYING for more information / terms of license.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*! \brief UTF-8 encoder object.
|
||||||
|
|
||||||
|
This object is used to encode Unicode wide characters into UTF-8. It can be fed chunks of characters
|
||||||
|
which it then encodes, appending the result to an internal buffer.
|
||||||
|
|
||||||
|
*/
|
||||||
|
class Encoder {
|
||||||
|
public:
|
||||||
|
/*! \brief Constructor.
|
||||||
|
|
||||||
|
\param hint Number of bytes to allocate for the encoding buffer.
|
||||||
|
|
||||||
|
The constructor sets up the encoder and allocates some space for an internal buffer. You can
|
||||||
|
hint at how large you expect the chunks to be encoded will be. If an encoding operation fills
|
||||||
|
the buffer without consuming all the input data, the buffer will be doubled in size for the
|
||||||
|
next round.
|
||||||
|
|
||||||
|
*/
|
||||||
|
Encoder(size_t hint = 100);
|
||||||
|
|
||||||
|
/// Destructor.
|
||||||
|
virtual ~Encoder();
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/// UTF-8 output data is appended to this string.
|
||||||
|
std::string encoded;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*! \brief Encode some data into UTF-8.
|
||||||
|
|
||||||
|
\param str Pointer to the character array to encode.
|
||||||
|
\param amt Number of characters to encode.
|
||||||
|
|
||||||
|
This function performs an encoding of some Unicode characters into UTF-8. It appends the result
|
||||||
|
onto \a encoded.
|
||||||
|
|
||||||
|
*/
|
||||||
|
void encode(const wchar_t* str, ssize_t amt);
|
||||||
|
|
||||||
|
/// Encode a std::wstring.
|
||||||
|
void encode(const std::wstring& str);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/// Reset the encoder for a new character stream.
|
||||||
|
void reset();
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*! \brief Set error handling to \e skip mode.
|
||||||
|
|
||||||
|
This function will set the error handling into \e skip mode. In this mode, any invalid UTF-8
|
||||||
|
byte sequences will simply be skipped altogether, and will not have any effect on the output in
|
||||||
|
\a decoded.
|
||||||
|
|
||||||
|
*/
|
||||||
|
void skipOnError();
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*! \brief Set error handling to \e replace mode.
|
||||||
|
|
||||||
|
\param ch The replacement character that will appear in the output.
|
||||||
|
|
||||||
|
This function will set the error handling into \e replace mode. In this mode, any invalid UTF-8
|
||||||
|
byte sequences will be skipped, and a replacement character \a ch will be placed onto the output
|
||||||
|
in \a decoded. The default parameter is the unicode replacement character, which should look
|
||||||
|
like an upside-down question mark.
|
||||||
|
|
||||||
|
*/
|
||||||
|
void replaceOnError(wchar_t ch = 0xFFFD);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*! \brief Set error handling to \e exception mode (default).
|
||||||
|
|
||||||
|
This function will set the error handling to \e exception mode. In this mode, any invalid
|
||||||
|
UTF-8 byte sequences will cause a BadUTF8Sequence exception to be thrown. This is the default
|
||||||
|
mode.
|
||||||
|
|
||||||
|
*/
|
||||||
|
void exceptionOnError();
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
private:
|
||||||
|
struct utf8_encode_state ctx;
|
||||||
|
wchar_t replaceChar;
|
||||||
|
|
||||||
|
static enum utf8_encode_error_action _skipOnError
|
||||||
|
(const struct utf8_encode_state *ctx, wchar_t *newch);
|
||||||
|
static enum utf8_encode_error_action _replaceOnError
|
||||||
|
(const struct utf8_encode_state *ctx, wchar_t *newch);
|
||||||
|
static enum utf8_encode_error_action _exceptionOnError
|
||||||
|
(const struct utf8_encode_state *ctx, wchar_t *newch);
|
||||||
|
};
|
||||||
|
|
||||||
|
/* options for text editors
|
||||||
|
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||||
|
*/
|
|
@ -0,0 +1,81 @@
|
||||||
|
/* libutf8++/src/lib/exception.cpp
|
||||||
|
*
|
||||||
|
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||||
|
* COPYING for more information / terms of license.
|
||||||
|
*/
|
||||||
|
|
||||||
|
namespace utf8 {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/// Keep a copy of \a reason; it is what what() later returns.
Error::Error(const std::string& reason)
    : reason(reason)
{
    // nothing further to initialise
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
const char* Error::what()
|
||||||
|
{
|
||||||
|
return reason.c_str();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// Capture the details of an encoding failure from the encoder state `ctx'.
//
// The base class receives the text produced by format(). `badChar' is the character ctx->rd
// currently points at. `line' and `col' are stored one-based (context value plus one), as the
// header documents ("starts at 1") and as format() and BadUTF8Sequence already do; the
// character offset is copied unchanged.
BadUnicodeChar::BadUnicodeChar(const struct utf8_encode_state* ctx)
    : Error(format(ctx)),
      badChar(*ctx->rd),
      line(ctx->line + 1),
      col(ctx->col + 1),
      char_offset(ctx->char_offset)
{
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
std::string BadUnicodeChar::format(const struct utf8_encode_state* ctx)
|
||||||
|
{
|
||||||
|
std::ostringstream str;
|
||||||
|
|
||||||
|
str << "Invalid Unicode code point encountered."
|
||||||
|
"\n Position : line "
|
||||||
|
<< ctx->line + 1
|
||||||
|
<< ", column "
|
||||||
|
<< ctx->col + 1
|
||||||
|
<< "\n Stream offset : "
|
||||||
|
<< ctx->char_offset
|
||||||
|
<< " characters\n Character value: 0x"
|
||||||
|
<< std::hex
|
||||||
|
<< *(ctx->rd);
|
||||||
|
|
||||||
|
return str.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// Build the exception from the caller's description and the decoder state at the point of
// failure. Line and column are stored one-based (context value plus one); the character and
// byte offsets are copied unchanged.
BadUTF8Sequence::BadUTF8Sequence(const std::string& description,
    const struct utf8_decode_state* ctx)
    : Error(format(description, ctx)),
      description(description),
      line(ctx->line + 1),
      col(ctx->col + 1),
      char_offset(ctx->char_offset),
      byte_offset(ctx->byte_offset)
{
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
std::string BadUTF8Sequence::format(const std::string& description,
|
||||||
|
const struct utf8_decode_state* ctx)
|
||||||
|
{
|
||||||
|
std::ostringstream str;
|
||||||
|
|
||||||
|
str << "Bad byte sequence in UTF-8 data.\n"
|
||||||
|
" Reason : " << description
|
||||||
|
<< "\n Position: line " << ctx->line + 1
|
||||||
|
<< ", column " << ctx->col + 1
|
||||||
|
<< "\n Offset : " << ctx->char_offset << " chars, " << ctx->byte_offset << " bytes";
|
||||||
|
|
||||||
|
return str.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/* options for text editors
|
||||||
|
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||||
|
*/
|
|
@ -0,0 +1,96 @@
|
||||||
|
/* libutf8++/src/lib/exception.h
|
||||||
|
*
|
||||||
|
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||||
|
* COPYING for more information / terms of license.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*! \brief Exception base class.
|
||||||
|
|
||||||
|
This is the base class for all libutf8 exceptions. It contains one member, \a reason, which allows
|
||||||
|
you to print a human-readable description of the error. To recover the actual type, you can refer
|
||||||
|
to the more specific derived classes.
|
||||||
|
|
||||||
|
*/
|
||||||
|
class Error : public std::exception {
public:
    /// Human-readable reason for error.
    std::string reason;

    /// Constructor. \a reason is the human-readable explanation stored in the member of the
    /// same name.
    Error(const std::string& reason);

    /// Destructor.
    virtual ~Error() throw()
    { }

    /// Find what caused the error. This non-const overload does \e not override
    /// std::exception::what(); it is retained so that existing callers and the out-of-line
    /// definition keep working.
    virtual const char* what();

    /*! \brief Find what caused the error (std::exception override).

    \returns \a reason as a C string.

    std::exception::what() is a const, non-throwing member function. Without an overload
    carrying that exact signature, code that catches std::exception& and calls what() would
    reach the base class implementation and never see \a reason.

    */
    virtual const char* what() const throw()
    {
        return reason.c_str();
    }
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*! \brief Invalid Unicode character exception.
|
||||||
|
|
||||||
|
This exception is thrown when encoding Unicode into UTF-8 and an invalid character is encountered.
|
||||||
|
|
||||||
|
*/
|
||||||
|
class BadUnicodeChar : public Error {
public:
    /// The offending character, copied from the encoder's input.
    wchar_t badChar;

    /// Line of the input data on which the error occurred (starts at 1).
    int line;

    /// Column of the input data at which the error occurred (starts at 1).
    int col;

    /// Offset, in characters, into the input data at which the error occurred.
    int char_offset;

    /// Constructor. Takes its details from the encoder state \a ctx.
    BadUnicodeChar(const struct utf8_encode_state* ctx);

private:
    /// Builds the human-readable reason passed to the Error base class.
    static std::string format(const struct utf8_encode_state* ctx);
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*! \brief Invalid UTF-8 sequence exception.
|
||||||
|
|
||||||
|
This exception is thrown when decoding UTF-8 and an invalid sequence is encountered. This could be
|
||||||
|
a nonsensical sequence, a redundantly-encoded character or truncated source data. It contains some
|
||||||
|
variables for allowing detailed diagnostics.
|
||||||
|
|
||||||
|
*/
|
||||||
|
class BadUTF8Sequence : public Error {
public:
    /// Description of the error, for human diagnostics.
    std::string description;

    /// Line of input data at which error occurred (starts at 1).
    int line;

    /// Column of input data at which error occurred (starts at 1).
    int col;

    /// Character offset of input data at which error occurred.
    int char_offset;

    /// Byte offset of input data at which error occurred.
    int byte_offset;

    /// Constructor. \a description explains the failure; \a ctx is the decoder state at the
    /// point of failure, from which the position members are taken.
    BadUTF8Sequence(const std::string& description, const struct utf8_decode_state* ctx);

    /// Destructor.
    ~BadUTF8Sequence() throw() { }

private:
    /// Builds the human-readable reason passed to the Error base class. This is static because
    /// the constructor calls it from the base-class initializer, i.e. before any part of this
    /// object has been constructed; calling a non-static member at that point is undefined
    /// behaviour.
    static std::string format(const std::string& description, const struct utf8_decode_state* ctx);
};
|
||||||
|
|
||||||
|
/* options for text editors
|
||||||
|
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||||
|
*/
|
|
@ -0,0 +1,21 @@
|
||||||
|
# libutf8++/src/lib/libutf8++/pkgconf.in
|
||||||
|
#
|
||||||
|
# Metadata file for pkg-config
|
||||||
|
# ( http://www.freedesktop.org/software/pkgconfig/ )
|
||||||
|
#
|
||||||
|
# (c)2006, Laurence Withers, <l@lwithers.me.uk>.
|
||||||
|
# Released under the GNU GPLv2. See file COPYING or
|
||||||
|
# http://www.gnu.org/copyleft/gpl.html for details.
|
||||||
|
#
|
||||||
|
|
||||||
|
# Name, description
|
||||||
|
Name: libutf8++
|
||||||
|
Description: C++ wrapper around libutf8 (library for handling UTF-8)
|
||||||
|
Version: @VERSION@
|
||||||
|
|
||||||
|
# Requirements
|
||||||
|
Requires:
|
||||||
|
|
||||||
|
# Compilation information
|
||||||
|
Libs: -L@LIBDIR@ -lutf8++
|
||||||
|
Cflags: -I@INCLUDEDIR@
|
|
@ -0,0 +1,17 @@
|
||||||
|
# libutf8++/src/libutf8++/soversion
|
||||||
|
#
|
||||||
|
# (c)2006, Laurence Withers, <l@lwithers.me.uk>.
|
||||||
|
# Released under the GNU GPLv2. See file COPYING or
|
||||||
|
# http://www.gnu.org/copyleft/gpl.html for details.
|
||||||
|
#
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# SOMAJOR and SOMINOR are included in the library's soname. They need to
|
||||||
|
# be bumped on a binary-incompatible release. They are both single
|
||||||
|
# integers.
|
||||||
|
SOMAJOR=0
|
||||||
|
SOMINOR=0
|
||||||
|
|
||||||
|
# SOMICRO is bumped every time there is a binary-compatible release.
|
||||||
|
SOMICRO=0
|
|
@ -0,0 +1,126 @@
|
||||||
|
/* libutf8++/src/lib/string.cpp
|
||||||
|
*
|
||||||
|
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||||
|
* COPYING for more information / terms of license.
|
||||||
|
*/
|
||||||
|
|
||||||
|
namespace utf8 {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
static enum utf8_decode_error_action decode_replace_callback(const struct utf8_decode_state* ctx,
|
||||||
|
enum utf8_decode_error error, wchar_t* newch)
|
||||||
|
{
|
||||||
|
(void)error;
|
||||||
|
*newch = *(wchar_t*)(ctx->data);
|
||||||
|
return utf8_decode_error_action_replace;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
static enum utf8_decode_error_action decode_error_callback(const struct utf8_decode_state* ctx,
|
||||||
|
enum utf8_decode_error error, wchar_t* newch)
|
||||||
|
{
|
||||||
|
(void)newch;
|
||||||
|
const char* desc = "unknown";
|
||||||
|
|
||||||
|
switch(error) {
|
||||||
|
case utf8_decode_error_lone_cchar:
|
||||||
|
desc = "An invalid continuation byte was encountered while expecting a character.";
|
||||||
|
break;
|
||||||
|
|
||||||
|
case utf8_decode_error_not_cchar:
|
||||||
|
desc = "A multi-byte sequence contained an invalid byte.";
|
||||||
|
break;
|
||||||
|
|
||||||
|
case utf8_decode_error_not_schar:
|
||||||
|
desc = "An invalid byte was encountered while expecting a character.";
|
||||||
|
break;
|
||||||
|
|
||||||
|
case utf8_decode_error_overlong:
|
||||||
|
desc = "An overlong encoding of a character was encountered.";
|
||||||
|
break;
|
||||||
|
|
||||||
|
case utf8_decode_error_illegal_cp:
|
||||||
|
desc = "An illegal code point (a UTF-16 surrogate perhaps?) was encountered.";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
throw BadUTF8Sequence(desc, ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
std::wstring decode(const std::string& utf8, bool force, wchar_t replace)
|
||||||
|
{
|
||||||
|
wchar_t buffer[128];
|
||||||
|
struct utf8_decode_state ctx;
|
||||||
|
memset(&ctx, 0, sizeof(ctx));
|
||||||
|
|
||||||
|
ctx.rd = utf8.data();
|
||||||
|
ctx.rd_remain = utf8.size();
|
||||||
|
ctx.wr = buffer;
|
||||||
|
ctx.wr_size = 128;
|
||||||
|
if(force) {
|
||||||
|
ctx.error_callback = decode_replace_callback;
|
||||||
|
ctx.data = &replace;
|
||||||
|
} else {
|
||||||
|
ctx.error_callback = decode_error_callback;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::wstring ret;
|
||||||
|
|
||||||
|
while(ctx.rd_remain) {
|
||||||
|
utf8_decoder(&ctx);
|
||||||
|
ret.append(buffer, ctx.written);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
static enum utf8_encode_error_action encode_replace_callback(const struct utf8_encode_state* ctx,
|
||||||
|
wchar_t* newch)
|
||||||
|
{
|
||||||
|
*newch = *(wchar_t*)(ctx->data);
|
||||||
|
return utf8_encode_error_action_replace;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
std::string encode(const std::wstring& ustr, bool force, wchar_t replace)
|
||||||
|
{
|
||||||
|
char buffer[512];
|
||||||
|
struct utf8_encode_state ctx;
|
||||||
|
memset(&ctx, 0, sizeof(ctx));
|
||||||
|
|
||||||
|
ctx.rd = ustr.data();
|
||||||
|
ctx.rd_remain = ustr.size();
|
||||||
|
ctx.wr = buffer;
|
||||||
|
ctx.wr_size = 512;
|
||||||
|
if(force) {
|
||||||
|
ctx.error_callback = encode_replace_callback;
|
||||||
|
ctx.data = &replace;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string ret;
|
||||||
|
|
||||||
|
while(ctx.rd_remain) {
|
||||||
|
if(!utf8_encoder(&ctx)) {
|
||||||
|
throw BadUnicodeChar(&ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
ret.append(buffer, ctx.written);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/* options for text editors
|
||||||
|
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||||
|
*/
|
|
@ -0,0 +1,37 @@
|
||||||
|
/* libutf8++/src/lib/string.h
|
||||||
|
*
|
||||||
|
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||||
|
* COPYING for more information / terms of license.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*! \brief Decode UTF-8.
|
||||||
|
|
||||||
|
\param utf8 The UTF-8 encoded data.
|
||||||
|
\param force If set to \a true, errors will be inhibited.
|
||||||
|
\param replace If \a force is \a true, then invalid UTF-8 sequences will be replaced by this
|
||||||
|
character.
|
||||||
|
\returns The Unicode wide-character string representation.
|
||||||
|
\throws BadUTF8Sequence if there is an invalid byte sequence in the UTF-8 source data.
|
||||||
|
|
||||||
|
This function will decode a UTF-8 source string into a Unicode wide-character string. It has a force
|
||||||
|
mode whereby any errors will be inhibited and a best-effort attempt will be made.
|
||||||
|
|
||||||
|
*/
|
||||||
|
std::wstring decode(const std::string& utf8, bool force = false, wchar_t replace = 0xFFFD);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*! \brief Encode UTF-8.
|
||||||
|
|
||||||
|
\param ustr The Unicode wide-character string.
|
||||||
|
\param force If set to \a true, errors will be inhibited (invalid chars will be replaced by \a replace).
|
||||||
|
\param replace If \a force is \a true, then invalid Unicode characters will be replaced by this
|
||||||
|
character.
|
||||||
|
\returns The UTF-8 transformed representation of \a ustr.
|
||||||
|
\throws BadUnicodeChar on invalid characters in the source data.
|
||||||
|
|
||||||
|
This function will encode a Unicode wide-character string into a UTF-8 transformed representation.
|
||||||
|
It has a force mode whereby any errors will be inhibited and a best-effort attempt will be made.
|
||||||
|
|
||||||
|
*/
|
||||||
|
std::string encode(const std::wstring& ustr, bool force = false, wchar_t replace = 0xFFFD);
|
|
@ -0,0 +1 @@
|
||||||
|
c++ tests tests libutf8++
|
|
@ -0,0 +1,3 @@
|
||||||
|
source src/tests/build.tests
|
||||||
|
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||||
|
# vim: expandtab:ts=4:sw=4
|
|
@ -0,0 +1,43 @@
|
||||||
|
# These are external variables, and shouldn't clash with anything else
|
||||||
|
# tests_BUILT
|
||||||
|
#
|
||||||
|
|
||||||
|
# Build every src/tests/*.cpp into obj/tests/<name>, once per build run.
# The library target is built first, since the tests link against it.
build_target libutf8++ || return 1

# tests_BUILT guards against building the tests twice when this file is sourced repeatedly.
if [ -z "${tests_BUILT}" ]
then
    # presumably libutf8pp is set by the libutf8++ target to the library to link — verify
    LIBS="${libutf8pp} "
    EXTRAS=""

    echo "Building test programs..."
    do_cmd mkdir -p obj/tests || return 1

    for SRC in src/tests/*.cpp
    do
        # basename strips the literal ".cpp" suffix (no regex involved)
        TEST="obj/tests/$(basename "${SRC}" .cpp)"

        # Rebuild when the library, the source or this build script is newer than the binary.
        # LIBS is left unquoted on purpose: it is a whitespace-separated list.
        MODIFIED=0
        for file in ${LIBS} "${SRC}" src/tests/build.tests
        do
            if [ "${file}" -nt "${TEST}" ]
            then
                MODIFIED=1
                break
            fi
        done

        if [ "${MODIFIED}" -ne 0 ]
        then
            # CXX, CFLAGS, LIBS and EXTRAS are unquoted so that they split into separate words
            do_cmd ${CXX} -Iobj ${CFLAGS} -o "${TEST}" "${SRC}" ${LIBS} ${EXTRAS} || return 1
            print_success "Built ${TEST}"
        else
            print_success "${TEST} is up to date"
        fi
    done

    print_success "All tests built"

    tests_BUILT=1
fi
|
||||||
|
|
||||||
|
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||||
|
# vim: expandtab:ts=4:sw=4
|
|
@ -0,0 +1,85 @@
|
||||||
|
/* libutf8++/src/tests/objects.cpp
|
||||||
|
*
|
||||||
|
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||||
|
* COPYING for more information / terms of license.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "utf8"
|
||||||
|
#include <iostream>
|
||||||
|
#include <iomanip>
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/* Fill buf with ch random wide characters.
 *
 * The raw bytes come from /dev/urandom; every character then has its top bit cleared so that
 * it fits in 31 bits. On failure to open or read the device, the error is reported with
 * perror() and the int 1 is thrown. The descriptor is closed on every path.
 */
void make_random(wchar_t* buf, int ch)
{
    int fd = open("/dev/urandom", O_RDONLY);
    if(fd < 0) {
        perror("open(\"/dev/urandom\")");
        throw 1;
    }

    // read() may return fewer bytes than requested, so keep going until the buffer is full
    char* dst = (char*)buf;
    size_t remaining = ch * sizeof(wchar_t);
    while(remaining) {
        ssize_t got = read(fd, dst, remaining);
        if(got <= 0) {
            // report before close(), which could overwrite errno
            perror("read(\"/dev/urandom\")");
            close(fd);
            throw 1;
        }
        dst += got;
        remaining -= got;
    }
    close(fd);

    for(int i = 0; i < ch; ++i) {
        buf[i] &= 0x7FFFFFFF;
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
int main(int argc, char* argv[])
|
||||||
|
{
|
||||||
|
if(argc == 2 && !strcmp(argv[1], "--print-summary")) {
|
||||||
|
std::cout << "Performs some tests on the Encoder and Decoder objects.\n";
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int ret = 0;
|
||||||
|
try {
|
||||||
|
wchar_t wch[1024];
|
||||||
|
make_random(wch, 1024);
|
||||||
|
|
||||||
|
std::wstring ustr;
|
||||||
|
ustr.assign(wch, 1024);
|
||||||
|
|
||||||
|
utf8::Encoder encoder;
|
||||||
|
utf8::Decoder decoder;
|
||||||
|
|
||||||
|
encoder.encode(ustr);
|
||||||
|
decoder.decode(encoder.encoded);
|
||||||
|
|
||||||
|
if(ustr != decoder.decoded) {
|
||||||
|
std::cerr << "Decoded string does not match original.\n";
|
||||||
|
for(size_t i = 0, end = std::min(ustr.size(), decoder.decoded.size()); i != end; ++i) {
|
||||||
|
if(ustr[i] != decoder.decoded[i]) {
|
||||||
|
std::cerr << std::dec << std::setfill(' ') << std::setw(4) << i
|
||||||
|
<< std::setfill('0') << std::hex << ": 0x"
|
||||||
|
<< std::setw(8) << ustr[i] << " != "
|
||||||
|
<< std::setw(8) << decoder.decoded[i] << "\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
std::cerr << "Original size " << std::dec << ustr.size()
|
||||||
|
<< ", decoded size " << decoder.decoded.size() << std::endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << "Success.\n";
|
||||||
|
}
|
||||||
|
catch(utf8::Error& e) {
|
||||||
|
std::cerr << e.reason << std::endl;
|
||||||
|
ret = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* options for text editors
|
||||||
|
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||||
|
*/
|
|
@ -0,0 +1,82 @@
|
||||||
|
/* libutf8++/src/tests/strings.cpp
|
||||||
|
*
|
||||||
|
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||||
|
* COPYING for more information / terms of license.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "utf8"
|
||||||
|
#include <iostream>
|
||||||
|
#include <iomanip>
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/* Fill buf with ch random wide characters.
 *
 * The raw bytes come from /dev/urandom; every character then has its top bit cleared so that
 * it fits in 31 bits. On failure to open or read the device, the error is reported with
 * perror() and the int 1 is thrown. The descriptor is closed on every path.
 */
void make_random(wchar_t* buf, int ch)
{
    int fd = open("/dev/urandom", O_RDONLY);
    if(fd < 0) {
        perror("open(\"/dev/urandom\")");
        throw 1;
    }

    // read() may return fewer bytes than requested, so keep going until the buffer is full
    char* dst = (char*)buf;
    size_t remaining = ch * sizeof(wchar_t);
    while(remaining) {
        ssize_t got = read(fd, dst, remaining);
        if(got <= 0) {
            // report before close(), which could overwrite errno
            perror("read(\"/dev/urandom\")");
            close(fd);
            throw 1;
        }
        dst += got;
        remaining -= got;
    }
    close(fd);

    for(int i = 0; i < ch; ++i) {
        buf[i] &= 0x7FFFFFFF;
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
int main(int argc, char* argv[])
|
||||||
|
{
|
||||||
|
if(argc == 2 && !strcmp(argv[1], "--print-summary")) {
|
||||||
|
std::cout << "Performs some tests on the string encode/decode routines.\n";
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int ret = 0;
|
||||||
|
try {
|
||||||
|
wchar_t wch[1024];
|
||||||
|
make_random(wch, 1024);
|
||||||
|
|
||||||
|
std::wstring ustr1, ustr2;
|
||||||
|
std::string utf8;
|
||||||
|
ustr1.assign(wch, 1024);
|
||||||
|
utf8 = utf8::encode(ustr1);
|
||||||
|
ustr2 = utf8::decode(utf8);
|
||||||
|
|
||||||
|
if(ustr1 != ustr2) {
|
||||||
|
std::cerr << "Decoded string does not match original.\n";
|
||||||
|
for(size_t i = 0, end = std::min(ustr1.size(), ustr2.size()); i != end; ++i) {
|
||||||
|
if(ustr1[i] != ustr2[i]) {
|
||||||
|
std::cerr << std::dec << std::setfill(' ') << std::setw(4) << i
|
||||||
|
<< std::setfill('0') << std::hex << ": 0x"
|
||||||
|
<< std::setw(8) << ustr1[i] << " != "
|
||||||
|
<< std::setw(8) << ustr2[i] << "\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
std::cerr << "Original size " << std::dec << ustr1.size()
|
||||||
|
<< ", decoded size " << ustr2.size() << std::endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << "Success.\n";
|
||||||
|
}
|
||||||
|
catch(utf8::Error& e) {
|
||||||
|
std::cerr << e.reason << std::endl;
|
||||||
|
ret = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* options for text editors
|
||||||
|
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||||
|
*/
|
|
@ -0,0 +1,44 @@
|
||||||
|
/* libutf8++/src/tests/???.cpp
|
||||||
|
*
|
||||||
|
* (c)2006, Laurence Withers, <l@lwithers.me.uk>.
|
||||||
|
* Released under the GNU GPLv2. See file COPYING or
|
||||||
|
* http://www.gnu.org/copyleft/gpl.html for details.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "utf8"
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/* Skeleton for a new test program.
 *
 * With the single argument "--print-summary" it prints a one-line description and returns 0.
 * Otherwise it runs the (still empty) test body and returns 0, or 1 if an exception escapes.
 */
int main(int argc, char* argv[])
{
    const bool wantSummary = (argc == 2) && (strcmp(argv[1], "--print-summary") == 0);
    if(wantSummary) {
        std::cout << "One line summary.\n";
        return 0;
    }

    if(argc == 1) {
        // empty argument list
    }

    int status = 0;
    try {
        // TODO
    }
    catch(std::exception& e) {
        std::cerr << e.what() << std::endl;
        status = 1;
    }
    catch(...) {
        std::cerr << "Unknown exception caught." << std::endl;
        status = 1;
    }

    return status;
}
|
||||||
|
|
||||||
|
/* options for text editors
|
||||||
|
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||||
|
vim: expandtab:ts=4:sw=4
|
||||||
|
*/
|
Loading…
Reference in New Issue