Copy from svn repository.
This commit is contained in:
parent
73d6e6fbd0
commit
ac22dabfe6
11
README
11
README
|
@ -10,5 +10,14 @@ Really Quick Instructions
|
|||
To build: ./make.sh
|
||||
To install: ./make.sh install
|
||||
(you might want to set PREFIX, by default it's /usr/local)
|
||||
Documentation is automatically built using doxygen.
|
||||
|
||||
@TODO@
|
||||
Dependencies
|
||||
------------
|
||||
|
||||
libutf8, http://www.lwithers.me.uk/projects/libutf8/
|
||||
|
||||
Project Homepage
|
||||
----------------
|
||||
|
||||
http://www.lwithers.me.uk/projects/libutf8++/
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
doxygen docs docs
|
|
@ -0,0 +1,146 @@
|
|||
# libutf8++/src/docs/Doxyfile.in
|
||||
#
|
||||
# (c)2006, Laurence Withers, <l@lwithers.me.uk>.
|
||||
# Released under the GNU GPLv2. See file COPYING or
|
||||
# http://www.gnu.org/copyleft/gpl.html for details.
|
||||
#
|
||||
|
||||
PROJECT_NAME = libutf8++
|
||||
OUTPUT_DIRECTORY =
|
||||
CREATE_SUBDIRS = NO
|
||||
OUTPUT_LANGUAGE = English
|
||||
USE_WINDOWS_ENCODING = NO
|
||||
BRIEF_MEMBER_DESC = YES
|
||||
REPEAT_BRIEF = YES
|
||||
ABBREVIATE_BRIEF =
|
||||
ALWAYS_DETAILED_SEC = NO
|
||||
INLINE_INHERITED_MEMB = YES
|
||||
FULL_PATH_NAMES = NO
|
||||
STRIP_FROM_PATH =
|
||||
STRIP_FROM_INC_PATH =
|
||||
SHORT_NAMES = NO
|
||||
JAVADOC_AUTOBRIEF = NO
|
||||
MULTILINE_CPP_IS_BRIEF = YES
|
||||
DETAILS_AT_TOP = YES
|
||||
INHERIT_DOCS = YES
|
||||
DISTRIBUTE_GROUP_DOC = NO
|
||||
TAB_SIZE = 4
|
||||
ALIASES =
|
||||
OPTIMIZE_OUTPUT_FOR_C = NO
|
||||
OPTIMIZE_OUTPUT_JAVA = NO
|
||||
SUBGROUPING = YES
|
||||
EXTRACT_ALL = NO
|
||||
EXTRACT_PRIVATE = NO
|
||||
EXTRACT_STATIC = NO
|
||||
EXTRACT_LOCAL_CLASSES = NO
|
||||
EXTRACT_LOCAL_METHODS = NO
|
||||
HIDE_UNDOC_MEMBERS = NO
|
||||
HIDE_UNDOC_CLASSES = NO
|
||||
HIDE_FRIEND_COMPOUNDS = YES
|
||||
HIDE_IN_BODY_DOCS = NO
|
||||
INTERNAL_DOCS = NO
|
||||
CASE_SENSE_NAMES = YES
|
||||
HIDE_SCOPE_NAMES = NO
|
||||
SHOW_INCLUDE_FILES = NO
|
||||
INLINE_INFO = YES
|
||||
SORT_MEMBER_DOCS = YES
|
||||
SORT_BRIEF_DOCS = NO
|
||||
SORT_BY_SCOPE_NAME = NO
|
||||
GENERATE_TODOLIST = YES
|
||||
GENERATE_TESTLIST = YES
|
||||
GENERATE_BUGLIST = YES
|
||||
GENERATE_DEPRECATEDLIST= YES
|
||||
ENABLED_SECTIONS =
|
||||
MAX_INITIALIZER_LINES = 30
|
||||
SHOW_USED_FILES = NO
|
||||
SHOW_DIRECTORIES = NO
|
||||
FILE_VERSION_FILTER =
|
||||
QUIET = YES
|
||||
WARNINGS = YES
|
||||
WARN_IF_UNDOCUMENTED = YES
|
||||
WARN_IF_DOC_ERROR = YES
|
||||
WARN_NO_PARAMDOC = YES
|
||||
WARN_FORMAT = "$file:$line: $text"
|
||||
WARN_LOGFILE =
|
||||
FILE_PATTERNS =
|
||||
RECURSIVE = NO
|
||||
EXCLUDE =
|
||||
EXCLUDE_SYMLINKS = NO
|
||||
EXCLUDE_PATTERNS =
|
||||
EXAMPLE_PATH =
|
||||
EXAMPLE_PATTERNS =
|
||||
EXAMPLE_RECURSIVE = NO
|
||||
IMAGE_PATH = src/docs
|
||||
INPUT_FILTER =
|
||||
FILTER_PATTERNS =
|
||||
FILTER_SOURCE_FILES = NO
|
||||
SOURCE_BROWSER = NO
|
||||
INLINE_SOURCES = NO
|
||||
STRIP_CODE_COMMENTS = YES
|
||||
REFERENCED_BY_RELATION = YES
|
||||
REFERENCES_RELATION = YES
|
||||
VERBATIM_HEADERS = NO
|
||||
ALPHABETICAL_INDEX = YES
|
||||
COLS_IN_ALPHA_INDEX = 5
|
||||
IGNORE_PREFIX =
|
||||
GENERATE_HTML = YES
|
||||
HTML_OUTPUT = html
|
||||
HTML_FILE_EXTENSION = .html
|
||||
HTML_HEADER =
|
||||
HTML_FOOTER =
|
||||
HTML_STYLESHEET =
|
||||
HTML_ALIGN_MEMBERS = YES
|
||||
GENERATE_HTMLHELP = NO
|
||||
CHM_FILE =
|
||||
HHC_LOCATION =
|
||||
GENERATE_CHI = NO
|
||||
BINARY_TOC = NO
|
||||
TOC_EXPAND = NO
|
||||
DISABLE_INDEX = NO
|
||||
ENUM_VALUES_PER_LINE = 4
|
||||
GENERATE_TREEVIEW = NO
|
||||
TREEVIEW_WIDTH = 250
|
||||
GENERATE_LATEX = NO
|
||||
GENERATE_RTF = NO
|
||||
GENERATE_MAN = NO
|
||||
GENERATE_XML = NO
|
||||
GENERATE_AUTOGEN_DEF = NO
|
||||
GENERATE_PERLMOD = NO
|
||||
ENABLE_PREPROCESSING = YES
|
||||
MACRO_EXPANSION = NO
|
||||
EXPAND_ONLY_PREDEF = NO
|
||||
SEARCH_INCLUDES = YES
|
||||
INCLUDE_PATH =
|
||||
INCLUDE_FILE_PATTERNS =
|
||||
PREDEFINED = DOXYGEN
|
||||
EXPAND_AS_DEFINED =
|
||||
SKIP_FUNCTION_MACROS = YES
|
||||
TAGFILES =
|
||||
GENERATE_TAGFILE =
|
||||
ALLEXTERNALS = NO
|
||||
EXTERNAL_GROUPS = YES
|
||||
PERL_PATH = /usr/bin/perl
|
||||
CLASS_DIAGRAMS = YES
|
||||
HIDE_UNDOC_RELATIONS = YES
|
||||
HAVE_DOT = YES
|
||||
CLASS_GRAPH = YES
|
||||
COLLABORATION_GRAPH = YES
|
||||
GROUP_GRAPHS = NO
|
||||
UML_LOOK = NO
|
||||
TEMPLATE_RELATIONS = NO
|
||||
INCLUDE_GRAPH = NO
|
||||
INCLUDED_BY_GRAPH = NO
|
||||
CALL_GRAPH = NO
|
||||
GRAPHICAL_HIERARCHY = YES
|
||||
DIRECTORY_GRAPH = NO
|
||||
DOT_IMAGE_FORMAT = png
|
||||
DOT_PATH =
|
||||
DOTFILE_DIRS =
|
||||
MAX_DOT_GRAPH_WIDTH = 1024
|
||||
MAX_DOT_GRAPH_HEIGHT = 1024
|
||||
MAX_DOT_GRAPH_DEPTH = 0
|
||||
DOT_TRANSPARENT = YES
|
||||
DOT_MULTI_TARGETS = YES
|
||||
GENERATE_LEGEND = YES
|
||||
DOT_CLEANUP = YES
|
||||
SEARCHENGINE = NO
|
|
@ -0,0 +1,15 @@
|
|||
/* libutf8++/src/docs/MainPage.dox
|
||||
*
|
||||
* (c)2006, Laurence Withers, <l@lwithers.me.uk>.
|
||||
* Released under the GNU GPLv2. See file COPYING or
|
||||
* http://www.gnu.org/copyleft/gpl.html for details.
|
||||
*/
|
||||
|
||||
/*! \mainpage
|
||||
|
||||
*/
|
||||
|
||||
/* options for text editors
|
||||
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
vim: expandtab:ts=4:sw=4
|
||||
*/
|
|
@ -0,0 +1 @@
|
|||
source src/docs/build.docs
|
|
@ -0,0 +1,43 @@
|
|||
# These are external variables, and shouldn't clash with anything else
|
||||
# docs_BUILT
|
||||
#
|
||||
|
||||
# Append the hand-written Doxygen pages to the list of documentation inputs,
# then request the 'monolithic' target (build_target is provided by the
# surrounding build system).
MONOLITHIC_DOC="${MONOLITHIC_DOC} $(echo src/docs/*.dox)"
build_target monolithic

# docs_BUILT guards against running this fragment twice in one invocation.
if [ -z "${docs_BUILT}" ]
then
    echo "Building documentation with Doxygen..."

    DOXYFILE=obj/Doxyfile.docs

    # The working Doxyfile is created once from the template; the input list
    # and version number are appended to it.
    if [ ! -e "${DOXYFILE}" ]
    then
        do_cmd cp src/docs/Doxyfile.in "${DOXYFILE}" || return 1
        echo "INPUT = ${MONOLITHIC_DOC}" >> "${DOXYFILE}"
        echo "PROJECT_NUMBER = ${VERSION}" >> "${DOXYFILE}"
    fi

    # Rebuild only if some input is newer than the generated index page.
    # MONOLITHIC_DOC is intentionally unquoted here: it is a space-separated list.
    MODIFIED=0
    for file in ${MONOLITHIC_DOC}
    do
        if [ "${file}" -nt html/index.html ]
        then
            MODIFIED=1
            break
        fi
    done

    if [ "${MODIFIED}" -ne 0 ]
    then
        do_cmd doxygen "${DOXYFILE}" || return 1
        print_success "Documentation built"
    else
        print_success "Documentation is up to date"
    fi

    docs_BUILT=1
fi
|
||||
|
||||
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
# vim: expandtab:ts=4:sw=4
|
|
@ -0,0 +1 @@
|
|||
source src/docs/build.install-docs
|
|
@ -0,0 +1,21 @@
|
|||
# The documentation must have been generated before it can be installed.
build_target docs

# create documentation directories
echo "Installing documentation into ${DOCSDIR}"
build_dir_tree "${DOCSDIR}/html" || return 1

# copy across the Doxygen-generated documentation
# (paths are quoted so that a DOCSDIR containing spaces is passed as one word)
for file in html/*
do
    install_file "${file}" "${DOCSDIR}/html" 0644 || return 1
done

# copy across the generic files
for file in COPYING README
do
    install_file "${file}" "${DOCSDIR}" 0644 || return 1
done

print_success "Documentation installed"
|
||||
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
# vim: expandtab:ts=4:sw=4
|
|
@ -0,0 +1 @@
|
|||
c++ lib libutf8++ utf8
|
|
@ -0,0 +1,9 @@
|
|||
/* libutf8++/src/lib/BottomHeader.h
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
}
|
||||
|
||||
#endif
|
|
@ -0,0 +1,9 @@
|
|||
/* libutf8++/src/lib/ForwardDeclare.h
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
// This file simply contains forward declarations of all libutf8++
|
||||
// classes, to facilitate header ordering, etc.
|
||||
|
|
@ -0,0 +1,23 @@
|
|||
/* libutf8++/src/lib/TopHeader.h
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
#ifndef HEADER_libutf8pp
|
||||
#define HEADER_libutf8pp
|
||||
|
||||
// standard includes, or includes needed for type declarations
|
||||
|
||||
#include <string>
|
||||
#include <stdexcept>
|
||||
#include <utf8.h>
|
||||
|
||||
|
||||
|
||||
/*! \brief UTF-8 handling routines.
|
||||
|
||||
The library's UTF-8 handling routines are all made available through this namespace.
|
||||
|
||||
*/
|
||||
namespace utf8 {
|
|
@ -0,0 +1,12 @@
|
|||
/* libutf8++/src/lib/TopSource.cpp
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
#include "utf8"
|
||||
|
||||
// Below are all the includes used throughout the library.
|
||||
|
||||
#include <sstream>
|
||||
#include <iomanip>
|
|
@ -0,0 +1 @@
|
|||
source src/libutf8++/build.lib
|
|
@ -0,0 +1 @@
|
|||
source src/libutf8++/build.install-lib
|
|
@ -0,0 +1,36 @@
|
|||
# The library must be built before it can be installed.
build_target libutf8++

# make paths (this is for Gentoo in particular)
build_dir_tree "${LIBDIR}" || return 1
build_dir_tree "${PKGCONFDIR}" || return 1
build_dir_tree "${INCLUDEDIR}" || return 1

# install library
echo "Installing libraries into '${LIBDIR}'"
install_file "${libutf8pp}" "${LIBDIR}" 0755 || return 1

# Symlink chain: base.so -> base.so.MAJOR -> base.so.MAJOR.MINOR -> full name.
BASE="${libutf8pp_BASE}.so"
MAJOR="${BASE}.${SOMAJOR}"
MINOR="${MAJOR}.${SOMINOR}"
MICRO="${MINOR}.${SOMICRO}"
# NOTE(review): install_symlink results are not checked; confirm whether the
# helper reports failure through its exit status before adding '|| return 1'.
install_symlink "${MINOR}" "${MICRO}" "${LIBDIR}"
install_symlink "${MAJOR}" "${MINOR}" "${LIBDIR}"
install_symlink "${BASE}" "${MAJOR}" "${LIBDIR}"

# install header
echo "Installing header file '${libutf8pp_HEADER}' into ${INCLUDEDIR}"
install_header "${libutf8pp_HEADER}" "${INCLUDEDIR}" 0644 || return 1

# install pkgconfig file
echo "Installing package config file into ${PKGCONFDIR}"
PKGCONFFILE="${PKGCONFDIR}/libutf8pp.pc"
do_cmd rm -f "${PKGCONFFILE}" || return 1
# Fill in the template placeholders with the final (post-install) locations.
# NOTE(review): do_cmd_redir's exit status is not checked; verify its contract.
do_cmd_redir "${PKGCONFFILE}" sed \
    -e "s,@VERSION@,${VERSION}," \
    -e "s,@LIBDIR@,${FINALLIBDIR}," \
    -e "s,@INCLUDEDIR@,${FINALINCLUDEDIR}," \
    src/libutf8++/pkgconf.in
do_cmd chmod 0644 "${PKGCONFFILE}" || return 1
print_success "Done"
|
||||
|
||||
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
# vim: expandtab:ts=4:sw=4
|
|
@ -0,0 +1,51 @@
|
|||
# These are external variables, and shouldn't clash with anything else
|
||||
# libutf8pp
|
||||
# libutf8pp_BUILT
|
||||
# libutf8pp_HEADER
|
||||
# libutf8pp_BASE
|
||||
|
||||
# libutf8pp_BUILT guards against building the library twice in one invocation.
if [ -z "${libutf8pp_BUILT}" ]
then
    libutf8pp_BASE=libutf8++
    # Provides SOMAJOR, SOMINOR and SOMICRO.
    source src/libutf8++/soversion

    libutf8pp="obj/${libutf8pp_BASE}.so.${SOMAJOR}.${SOMINOR}.${SOMICRO}"
    SO_EXTRA="$(pkg-config libutf8 --libs --cflags) -lstdc++ -lc"

    echo "Building library ${libutf8pp}..."

    # Generates the monolithic header/source and sets HDR, SRC, MONOLITHIC_TESTS.
    do_cmd source src/libutf8++/build.monolithic || return 1

    # Rebuild if any build script or generated file is newer than the library.
    # The list variables are intentionally unquoted (space-separated lists).
    MODIFIED=0
    for test in ${MONOLITHIC_TESTS} ${HDR} ${SRC}
    do
        if [ "${test}" -nt "${libutf8pp}" ]
        then
            MODIFIED=1
            break
        fi
    done

    if [ "${MODIFIED}" -ne 0 ]
    then
        echo " Compiling"

        SONAME="${libutf8pp_BASE}.so.${SOMAJOR}.${SOMINOR}"
        # CXX, CFLAGS, SRC and SO_EXTRA are flag/word lists and must be split.
        do_cmd ${CXX} ${CFLAGS} -shared -fpic -o "${libutf8pp}" \
            -Wl,-soname,${SONAME} \
            ${SRC} ${SO_EXTRA} || return 1

        # make tests work
        do_cmd ln -sf "$(basename "${libutf8pp}")" "obj/${SONAME}" || return 1

        print_success "Library built"
    else
        print_success "Library up to date"
    fi

    libutf8pp_BUILT=1
    libutf8pp_HEADER=${HDR}

fi
|
||||
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
# vim: expandtab:ts=4:sw=4
|
|
@ -0,0 +1,21 @@
|
|||
# These are external variables, and shouldn't clash with anything else
|
||||
# libutf8pp_MONOLITHIC
|
||||
|
||||
# Locations of the generated single-file header and source.
HDR="obj/utf8"
SRC="obj/libutf8++.cpp"

# Build scripts whose modification should trigger a rebuild of the library.
MONOLITHIC_TESTS="src/libutf8++/build.lib src/libutf8++/build.monolithic"

# Generate the monolithic files only once per invocation.
if [ -z "${libutf8pp_MONOLITHIC}" ]; then
    # Header: the order of the pieces matters (TopHeader first, BottomHeader last).
    MONOLITHIC_SOURCE="$(echo src/libutf8++/{TopHeader,ForwardDeclare,exception,string,{en,de}coder,BottomHeader}.h)"
    make_monolithic ${HDR} C || return 1

    # Source: TopSource carries the includes and must come first.
    MONOLITHIC_SOURCE="$(echo src/libutf8++/{TopSource,exception,string,{en,de}coder}.cpp)"
    make_monolithic ${SRC} C || return 1

    libutf8pp_MONOLITHIC=1
    # The generated header is also an input for the documentation build.
    MONOLITHIC_DOC="${MONOLITHIC_DOC} ${HDR}"
fi
|
||||
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
# vim: expandtab:ts=4:sw=4
|
|
@ -0,0 +1,159 @@
|
|||
/* libutf8++/src/lib/decoder.cpp
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
namespace utf8 {
|
||||
|
||||
|
||||
|
||||
/* Construct a decoder.
 *
 * Zeroes the libutf8 decoder state, allocates the output buffer (never fewer
 * than two characters, whatever the hint), installs the throwing error handler
 * as the default and stores a back-pointer to this object for the callbacks.
 * replaceChar is given a defined value (U+FFFD, the same default that
 * replaceOnError() uses) so the member is never read uninitialised.
 */
Decoder::Decoder(size_t hint)
    : replaceChar(0xFFFD)
{
    memset(&ctx, 0, sizeof(ctx));
    ctx.wr_size = (hint < 2) ? 2 : hint;
    ctx.wr = new wchar_t[ctx.wr_size];
    ctx.error_callback = _exceptionOnError;
    ctx.data = this;
}
|
||||
|
||||
|
||||
|
||||
// Frees the output buffer owned by the decoder state.
Decoder::~Decoder()
{
    delete [] ctx.wr;
}
|
||||
|
||||
|
||||
|
||||
void Decoder::decode(const std::string& str)
|
||||
{
|
||||
decode(str.data(), str.size());
|
||||
}
|
||||
|
||||
|
||||
|
||||
void Decoder::decode(const char* str, ssize_t amt)
|
||||
{
|
||||
ctx.rd = str;
|
||||
ctx.rd_remain = amt;
|
||||
while(ctx.rd_remain) {
|
||||
utf8_decoder(&ctx);
|
||||
decoded.append(ctx.wr, ctx.written);
|
||||
|
||||
if(ctx.rd_remain < 0 && !*(ctx.rd)) break;
|
||||
if(ctx.rd_remain) {
|
||||
ctx.wr_size *= 2;
|
||||
delete [] ctx.wr;
|
||||
ctx.wr = new wchar_t[ctx.wr_size];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
bool Decoder::complete() const
|
||||
{
|
||||
return ctx.complete;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void Decoder::reset()
|
||||
{
|
||||
size_t old_wr_size = ctx.wr_size;
|
||||
wchar_t* old_wr = ctx.wr;
|
||||
utf8_decode_error_callback old_error_callback = ctx.error_callback;
|
||||
|
||||
memset(&ctx, 0, sizeof(ctx));
|
||||
ctx.wr_size = old_wr_size;
|
||||
ctx.wr = old_wr;
|
||||
ctx.error_callback = old_error_callback;
|
||||
ctx.data = this;
|
||||
decoded.clear();
|
||||
}
|
||||
|
||||
|
||||
|
||||
void Decoder::skipOnError()
|
||||
{
|
||||
ctx.error_callback = _skipOnError;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void Decoder::replaceOnError(wchar_t ch)
|
||||
{
|
||||
replaceChar = ch;
|
||||
ctx.error_callback = _replaceOnError;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void Decoder::exceptionOnError()
|
||||
{
|
||||
ctx.error_callback = _exceptionOnError;
|
||||
}
|
||||
|
||||
|
||||
|
||||
enum utf8_decode_error_action Decoder::_skipOnError
|
||||
(const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch)
|
||||
{
|
||||
(void)ctx;
|
||||
(void)error;
|
||||
(void)newch;
|
||||
return utf8_decode_error_action_skip;
|
||||
}
|
||||
|
||||
|
||||
|
||||
enum utf8_decode_error_action Decoder::_replaceOnError
|
||||
(const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch)
|
||||
{
|
||||
(void)error;
|
||||
Decoder* self = (utf8::Decoder*)(ctx->data);
|
||||
*newch = self->replaceChar;
|
||||
return utf8_decode_error_action_replace;
|
||||
}
|
||||
|
||||
|
||||
|
||||
enum utf8_decode_error_action Decoder::_exceptionOnError
|
||||
(const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch)
|
||||
{
|
||||
(void)newch;
|
||||
const char* desc = "unknown";
|
||||
|
||||
switch(error) {
|
||||
case utf8_decode_error_lone_cchar:
|
||||
desc = "An invalid continuation byte was encountered while expecting a character.";
|
||||
break;
|
||||
|
||||
case utf8_decode_error_not_cchar:
|
||||
desc = "A multi-byte sequence contained an invalid byte.";
|
||||
break;
|
||||
|
||||
case utf8_decode_error_not_schar:
|
||||
desc = "An invalid byte was encountered while expecting a character.";
|
||||
break;
|
||||
|
||||
case utf8_decode_error_overlong:
|
||||
desc = "An overlong encoding of a character was encountered.";
|
||||
break;
|
||||
|
||||
case utf8_decode_error_illegal_cp:
|
||||
desc = "An illegal code point (a UTF-16 surrogate perhaps?) was encountered.";
|
||||
break;
|
||||
}
|
||||
|
||||
throw BadUTF8Sequence(desc, ctx);
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
/* options for text editors
|
||||
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
*/
|
|
@ -0,0 +1,128 @@
|
|||
/* libutf8++/src/lib/decoder.h
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
/*! \brief Stateful UTF-8 decoder object.
|
||||
|
||||
This object is used for stateful decoding of a UTF-8 byte stream. It can be fed the data in
|
||||
arbitrary chunks, even split on non-character boundaries. It writes its output into a wide character
|
||||
string.
|
||||
|
||||
A variety of error handling modes are available. The default is to throw a BadUTF8Sequence
|
||||
exception, but you can change this with skipOnError() or replaceOnError().
|
||||
|
||||
*/
|
||||
class Decoder {
|
||||
public:
|
||||
/*! \brief Constructor.
|
||||
|
||||
\param hint Hint at number of characters to allocate space for in decoder buffer.
|
||||
|
||||
The constructor sets up the UTF-8 decoder. You can provide a hint as to the size of your input
|
||||
stream chunks. This hint is the number of characters to allocate in the output buffer. If,
|
||||
during a single decode operation, this buffer is filled, then it is doubled in size.
|
||||
|
||||
*/
|
||||
Decoder(size_t hint = 25);
|
||||
|
||||
|
||||
|
||||
/// Destructor.
|
||||
~Decoder();
|
||||
|
||||
|
||||
|
||||
/// Result of decoding operations (appended to).
|
||||
std::wstring decoded;
|
||||
|
||||
|
||||
|
||||
/*! \brief UTF-8 decoder.
|
||||
|
||||
\param str Pointer to source data.
|
||||
\param amt Number of bytes in source data (-1 for null terminated strings).
|
||||
\throws BadUTF8Sequence.
|
||||
|
||||
This function will decode a chunk of UTF-8 data. The decoded data will be appended to whatever
|
||||
is contained in the string decoded. You can check if the decoder ended on a character boundary
|
||||
or not by calling complete().
|
||||
|
||||
*/
|
||||
void decode(const char* str, ssize_t amt);
|
||||
|
||||
|
||||
|
||||
/// Decode data stored in a std::string.
|
||||
void decode(const std::string& str);
|
||||
|
||||
|
||||
|
||||
/// Returns \a true if the last call to \a decode() ended on a character boundary.
|
||||
bool complete() const;
|
||||
|
||||
|
||||
|
||||
/*! \brief Resets the parser for a new UTF-8 stream.
|
||||
|
||||
This function will clear the internal state of the decoder so that it is ready for data from a
|
||||
new source. This can be used if you have opened a new file, accepted a new connection, recovered
|
||||
from an error, etc. It will also clear \a decoded.
|
||||
|
||||
*/
|
||||
void reset();
|
||||
|
||||
|
||||
|
||||
/*! \brief Set error handling to \e skip mode.
|
||||
|
||||
This function will set the error handling into \e skip mode. In this mode, any invalid UTF-8
|
||||
byte sequences will simply be skipped altogether, and will not have any effect on the output in
|
||||
\a decoded.
|
||||
|
||||
*/
|
||||
void skipOnError();
|
||||
|
||||
|
||||
|
||||
/*! \brief Set error handling to \e replace mode.
|
||||
|
||||
\param ch The replacement character that will appear in the output.
|
||||
|
||||
This function will set the error handling into \e replace mode. In this mode, any invalid UTF-8
|
||||
byte sequences will be skipped, and a replacement character \a ch will be placed onto the output
|
||||
in \a decoded. The default parameter is the unicode replacement character, which should look
|
||||
like an upside-down question mark.
|
||||
|
||||
*/
|
||||
void replaceOnError(wchar_t ch = 0xFFFD);
|
||||
|
||||
|
||||
|
||||
/*! \brief Set error handling to \e exception mode (default).
|
||||
|
||||
This function will set the error handling to \e exception mode. In this mode, any invalid
|
||||
UTF-8 byte sequences will cause a BadUTF8Sequence exception to be thrown. This is the default
|
||||
mode.
|
||||
|
||||
*/
|
||||
void exceptionOnError();
|
||||
|
||||
|
||||
|
||||
private:
|
||||
struct utf8_decode_state ctx;
|
||||
wchar_t replaceChar;
|
||||
|
||||
static enum utf8_decode_error_action _skipOnError
|
||||
(const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch);
|
||||
static enum utf8_decode_error_action _replaceOnError
|
||||
(const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch);
|
||||
static enum utf8_decode_error_action _exceptionOnError
|
||||
(const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch);
|
||||
};
|
||||
|
||||
/* options for text editors
|
||||
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
*/
|
|
@ -0,0 +1,124 @@
|
|||
/* libutf8++/src/lib/encoder.cpp
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
namespace utf8 {
|
||||
|
||||
|
||||
|
||||
/* Construct an encoder.
 *
 * Zeroes the libutf8 encoder state, allocates the output buffer (never fewer
 * than seven bytes, whatever the hint), installs the throwing error handler as
 * the default and stores a back-pointer to this object for the callbacks.
 * replaceChar is given a defined value (U+FFFD, the same default that
 * replaceOnError() uses) so the member is never read uninitialised.
 */
Encoder::Encoder(size_t hint)
    : replaceChar(0xFFFD)
{
    memset(&ctx, 0, sizeof(ctx));
    ctx.wr_size = (hint < 7) ? 7 : hint;
    ctx.wr = new char[ctx.wr_size];
    ctx.error_callback = _exceptionOnError;
    ctx.data = this;
}
|
||||
|
||||
|
||||
|
||||
// Frees the output buffer owned by the encoder state.
Encoder::~Encoder()
{
    delete [] ctx.wr;
}
|
||||
|
||||
|
||||
|
||||
void Encoder::reset()
|
||||
{
|
||||
char* wr = ctx.wr;
|
||||
size_t wr_size = ctx.wr_size;
|
||||
utf8_encode_error_callback cb = ctx.error_callback;
|
||||
memset(&ctx, 0, sizeof(ctx));
|
||||
ctx.wr = wr;
|
||||
ctx.wr_size = wr_size;
|
||||
ctx.error_callback = cb;
|
||||
ctx.data = this;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void Encoder::encode(const std::wstring& str)
|
||||
{
|
||||
encode(str.data(), str.size());
|
||||
}
|
||||
|
||||
|
||||
|
||||
void Encoder::encode(const wchar_t* str, ssize_t amt)
|
||||
{
|
||||
ctx.rd = str;
|
||||
ctx.rd_remain = amt;
|
||||
while(ctx.rd_remain) {
|
||||
if(!utf8_encoder(&ctx)) throw BadUnicodeChar(&ctx);
|
||||
encoded.append(ctx.wr, ctx.written);
|
||||
|
||||
if(ctx.rd_remain < 0 && !*(ctx.rd)) break;
|
||||
if(ctx.rd_remain) {
|
||||
ctx.wr_size *= 2;
|
||||
delete [] ctx.wr;
|
||||
ctx.wr = new char[ctx.wr_size];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
void Encoder::skipOnError()
|
||||
{
|
||||
ctx.error_callback = _skipOnError;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void Encoder::replaceOnError(wchar_t ch)
|
||||
{
|
||||
replaceChar = ch;
|
||||
ctx.error_callback = _replaceOnError;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void Encoder::exceptionOnError()
|
||||
{
|
||||
ctx.error_callback = _exceptionOnError;
|
||||
}
|
||||
|
||||
|
||||
|
||||
enum utf8_encode_error_action Encoder::_skipOnError
|
||||
(const struct utf8_encode_state *ctx, wchar_t *newch)
|
||||
{
|
||||
(void)ctx;
|
||||
(void)newch;
|
||||
return utf8_encode_error_action_skip;
|
||||
}
|
||||
|
||||
|
||||
|
||||
enum utf8_encode_error_action Encoder::_replaceOnError
|
||||
(const struct utf8_encode_state *ctx, wchar_t *newch)
|
||||
{
|
||||
Encoder* self = (utf8::Encoder*)(ctx->data);
|
||||
*newch = self->replaceChar;
|
||||
return utf8_encode_error_action_replace;
|
||||
}
|
||||
|
||||
|
||||
|
||||
enum utf8_encode_error_action Encoder::_exceptionOnError
|
||||
(const struct utf8_encode_state *ctx, wchar_t *newch)
|
||||
{
|
||||
(void)newch;
|
||||
throw BadUnicodeChar(ctx);
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
/* options for text editors
|
||||
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
*/
|
|
@ -0,0 +1,108 @@
|
|||
/* libutf8++/src/lib/encoder.h
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
/*! \brief UTF-8 encoder object.
|
||||
|
||||
This object is used to encode Unicode wide characters into UTF-8. It can be fed chunks of characters
|
||||
which it then encodes, appending the result to an internal buffer.
|
||||
|
||||
*/
|
||||
class Encoder {
|
||||
public:
|
||||
/*! \brief Constructor.
|
||||
|
||||
\param hint Number of bytes to allocate for the encoding buffer.
|
||||
|
||||
The constructor sets up the encoder and allocates some space for an internal buffer. You can
|
||||
hint at how large you expect the chunks to be encoded will be. If an encoding operation fills
|
||||
the buffer without consuming all the input data, the buffer will be doubled in size for the
|
||||
next round.
|
||||
|
||||
*/
|
||||
Encoder(size_t hint = 100);
|
||||
|
||||
/// Destructor.
|
||||
virtual ~Encoder();
|
||||
|
||||
|
||||
|
||||
/// UTF-8 output data is appended to this string.
|
||||
std::string encoded;
|
||||
|
||||
|
||||
|
||||
/*! \brief Encode some data into UTF-8.
|
||||
|
||||
\param str Pointer to the character array to encode.
|
||||
\param amt Number of characters to encode.
|
||||
|
||||
This function performs an encoding of some Unicode characters into UTF-8. It appends the result
|
||||
onto \a encoded.
|
||||
|
||||
*/
|
||||
void encode(const wchar_t* str, ssize_t amt);
|
||||
|
||||
/// Encode a std::wstring.
|
||||
void encode(const std::wstring& str);
|
||||
|
||||
|
||||
|
||||
/// Reset the encoder for a new character stream.
|
||||
void reset();
|
||||
|
||||
|
||||
|
||||
/*! \brief Set error handling to \e skip mode.
|
||||
|
||||
This function will set the error handling into \e skip mode. In this mode, any invalid UTF-8
|
||||
byte sequences will simply be skipped altogether, and will not have any effect on the output in
|
||||
\a decoded.
|
||||
|
||||
*/
|
||||
void skipOnError();
|
||||
|
||||
|
||||
|
||||
/*! \brief Set error handling to \e replace mode.
|
||||
|
||||
\param ch The replacement character that will appear in the output.
|
||||
|
||||
This function will set the error handling into \e replace mode. In this mode, any invalid UTF-8
|
||||
byte sequences will be skipped, and a replacement character \a ch will be placed onto the output
|
||||
in \a decoded. The default parameter is the unicode replacement character, which should look
|
||||
like an upside-down question mark.
|
||||
|
||||
*/
|
||||
void replaceOnError(wchar_t ch = 0xFFFD);
|
||||
|
||||
|
||||
|
||||
/*! \brief Set error handling to \e exception mode (default).
|
||||
|
||||
This function will set the error handling to \e exception mode. In this mode, any invalid
|
||||
UTF-8 byte sequences will cause a BadUTF8Sequence exception to be thrown. This is the default
|
||||
mode.
|
||||
|
||||
*/
|
||||
void exceptionOnError();
|
||||
|
||||
|
||||
|
||||
private:
|
||||
struct utf8_encode_state ctx;
|
||||
wchar_t replaceChar;
|
||||
|
||||
static enum utf8_encode_error_action _skipOnError
|
||||
(const struct utf8_encode_state *ctx, wchar_t *newch);
|
||||
static enum utf8_encode_error_action _replaceOnError
|
||||
(const struct utf8_encode_state *ctx, wchar_t *newch);
|
||||
static enum utf8_encode_error_action _exceptionOnError
|
||||
(const struct utf8_encode_state *ctx, wchar_t *newch);
|
||||
};
|
||||
|
||||
/* options for text editors
|
||||
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
*/
|
|
@ -0,0 +1,81 @@
|
|||
/* libutf8++/src/lib/exception.cpp
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
namespace utf8 {
|
||||
|
||||
|
||||
|
||||
// Stores a copy of the human-readable explanation in the `reason` member.
Error::Error(const std::string& reason)
    : reason(reason)
{ }
|
||||
|
||||
|
||||
|
||||
const char* Error::what()
|
||||
{
|
||||
return reason.c_str();
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* Build the exception from the encoder state at the point of failure.
 *
 * The message is produced by format(). The offending character and its
 * position are copied out of the state. `line` and `col` are stored with 1
 * added, matching the "+ 1" that format() applies when printing them, the
 * "starts at 1" wording on the members, and what BadUTF8Sequence stores.
 */
BadUnicodeChar::BadUnicodeChar(const struct utf8_encode_state* ctx)
    : Error(format(ctx)),
      badChar(*ctx->rd),
      line(ctx->line + 1),
      col(ctx->col + 1),
      char_offset(ctx->char_offset)
{
}
|
||||
|
||||
|
||||
|
||||
std::string BadUnicodeChar::format(const struct utf8_encode_state* ctx)
|
||||
{
|
||||
std::ostringstream str;
|
||||
|
||||
str << "Invalid Unicode code point encountered."
|
||||
"\n Position : line "
|
||||
<< ctx->line + 1
|
||||
<< ", column "
|
||||
<< ctx->col + 1
|
||||
<< "\n Stream offset : "
|
||||
<< ctx->char_offset
|
||||
<< " characters\n Character value: 0x"
|
||||
<< std::hex
|
||||
<< *(ctx->rd);
|
||||
|
||||
return str.str();
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Builds the exception from a description and the decoder state at the point
// of failure. Line and column are stored with 1 added; the offsets are copied
// from the state unchanged.
BadUTF8Sequence::BadUTF8Sequence(const std::string& description,
    const struct utf8_decode_state* ctx)
    : Error(format(description, ctx)),
      description(description),
      line(ctx->line + 1),
      col(ctx->col + 1),
      char_offset(ctx->char_offset),
      byte_offset(ctx->byte_offset)
{ }
|
||||
|
||||
|
||||
|
||||
std::string BadUTF8Sequence::format(const std::string& description,
|
||||
const struct utf8_decode_state* ctx)
|
||||
{
|
||||
std::ostringstream str;
|
||||
|
||||
str << "Bad byte sequence in UTF-8 data.\n"
|
||||
" Reason : " << description
|
||||
<< "\n Position: line " << ctx->line + 1
|
||||
<< ", column " << ctx->col + 1
|
||||
<< "\n Offset : " << ctx->char_offset << " chars, " << ctx->byte_offset << " bytes";
|
||||
|
||||
return str.str();
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
/* options for text editors
|
||||
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
*/
|
|
@ -0,0 +1,96 @@
|
|||
/* libutf8++/src/lib/exception.h
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
/*! \brief Exception base class.

This is the base class for all libutf8 exceptions. It contains one member, \a reason, which allows
you to print a human-readable description of the error. To recover the actual type, you can refer
to the more specific derived classes.

*/
class Error : public std::exception {
public:
    /// Human-readable reason for error.
    std::string reason;

    /// Constructor.
    Error(const std::string& reason);

    /// Destructor.
    virtual ~Error() throw()
    { }

    /// Find what caused the error.
    virtual const char* what();

    /*! \brief Find what caused the error (std::exception interface).

    \returns Pointer to the text held in \a reason; valid while this object and \a reason are
        unchanged.

    The non-const what() above has a different signature from std::exception::what() and so does
    not override it. This const overload is the actual override, so the reason is also reported
    when the exception is caught as a <code>const std::exception&</code>.

    */
    virtual const char* what() const throw()
    { return reason.c_str(); }
};
|
||||
|
||||
|
||||
|
||||
/*! \brief Invalid Unicode character exception.
|
||||
|
||||
This exception is thrown when encoding Unicode into UTF-8 and an invalid character is encountered.
|
||||
|
||||
*/
|
||||
class BadUnicodeChar : public Error {
|
||||
public:
|
||||
/// A copy of the invalid character.
|
||||
wchar_t badChar;
|
||||
|
||||
/// Line of input data at which error occurred (starts at 1).
|
||||
int line;
|
||||
|
||||
/// Column of input data at which error occurred (starts at 1).
|
||||
int col;
|
||||
|
||||
/// Character offset of input data at which error occurred.
|
||||
int char_offset;
|
||||
|
||||
/// Constructor.
|
||||
BadUnicodeChar(const struct utf8_encode_state* ctx);
|
||||
|
||||
private:
|
||||
static std::string format(const struct utf8_encode_state* ctx);
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*! \brief Invalid UTF-8 sequence exception.
|
||||
|
||||
This exception is thrown when decoding UTF-8 and an invalid sequence is encountered. This could be
|
||||
a nonsensical sequence, a redundantly-encounded character or truncated source data. It contains some
|
||||
variables for allowing detailed diagnostics.
|
||||
|
||||
*/
|
||||
class BadUTF8Sequence : public Error {
|
||||
public:
|
||||
/// Description of the error, for human diagnostics.
|
||||
std::string description;
|
||||
|
||||
/// Line of input data at which error occurred (starts at 1).
|
||||
int line;
|
||||
|
||||
/// Column of input data at which error occurred (starts at 1).
|
||||
int col;
|
||||
|
||||
/// Character offset of input data at which error occurred.
|
||||
int char_offset;
|
||||
|
||||
/// Byte offset of input data at which error occurred.
|
||||
int byte_offset;
|
||||
|
||||
/// Constructor.
|
||||
BadUTF8Sequence(const std::string& description, const struct utf8_decode_state* ctx);
|
||||
|
||||
/// Destructor.
|
||||
~BadUTF8Sequence() throw() { }
|
||||
|
||||
private:
|
||||
std::string format(const std::string& description, const struct utf8_decode_state* ctx);
|
||||
};
|
||||
|
||||
/* options for text editors
|
||||
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
*/
|
|
@ -0,0 +1,21 @@
|
|||
# libutf8++/src/lib/libutf8++/pkgconf.in
|
||||
#
|
||||
# Metadata file for pkg-config
|
||||
# ( http://www.freedesktop.org/software/pkgconfig/ )
|
||||
#
|
||||
# (c)2006, Laurence Withers, <l@lwithers.me.uk>.
|
||||
# Released under the GNU GPLv2. See file COPYING or
|
||||
# http://www.gnu.org/copyleft/gpl.html for details.
|
||||
#
|
||||
|
||||
# Name, description
|
||||
Name: libutf8++
|
||||
Description: C++ wrapper around libutf8 (library for handling UTF-8)
|
||||
Version: @VERSION@
|
||||
|
||||
# Requirements
|
||||
Requires:
|
||||
|
||||
# Compilation information
|
||||
Libs: -L@LIBDIR@ -lutf8++
|
||||
Cflags: -I@INCLUDEDIR@
|
|
@ -0,0 +1,17 @@
|
|||
# libutf8++/src/libutf8++/soversion
|
||||
#
|
||||
# (c)2006, Laurence Withers, <l@lwithers.me.uk>.
|
||||
# Released under the GNU GPLv2. See file COPYING or
|
||||
# http://www.gnu.org/copyleft/gpl.html for details.
|
||||
#
|
||||
|
||||
|
||||
|
||||
# SOMAJOR and SOMINOR are included in the library's soname. They need to
|
||||
# be bumped on a binary-incompatible release. They are both single
|
||||
# integers.
|
||||
SOMAJOR=0
|
||||
SOMINOR=0
|
||||
|
||||
# SOMICRO is bumped every time there is a binary-compatible release.
|
||||
SOMICRO=0
|
|
@ -0,0 +1,126 @@
|
|||
/* libutf8++/src/lib/string.cpp
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
namespace utf8 {
|
||||
|
||||
|
||||
|
||||
static enum utf8_decode_error_action decode_replace_callback(const struct utf8_decode_state* ctx,
|
||||
enum utf8_decode_error error, wchar_t* newch)
|
||||
{
|
||||
(void)error;
|
||||
*newch = *(wchar_t*)(ctx->data);
|
||||
return utf8_decode_error_action_replace;
|
||||
}
|
||||
|
||||
|
||||
|
||||
static enum utf8_decode_error_action decode_error_callback(const struct utf8_decode_state* ctx,
|
||||
enum utf8_decode_error error, wchar_t* newch)
|
||||
{
|
||||
(void)newch;
|
||||
const char* desc = "unknown";
|
||||
|
||||
switch(error) {
|
||||
case utf8_decode_error_lone_cchar:
|
||||
desc = "An invalid continuation byte was encountered while expecting a character.";
|
||||
break;
|
||||
|
||||
case utf8_decode_error_not_cchar:
|
||||
desc = "A multi-byte sequence contained an invalid byte.";
|
||||
break;
|
||||
|
||||
case utf8_decode_error_not_schar:
|
||||
desc = "An invalid byte was encountered while expecting a character.";
|
||||
break;
|
||||
|
||||
case utf8_decode_error_overlong:
|
||||
desc = "An overlong encoding of a character was encountered.";
|
||||
break;
|
||||
|
||||
case utf8_decode_error_illegal_cp:
|
||||
desc = "An illegal code point (a UTF-16 surrogate perhaps?) was encountered.";
|
||||
break;
|
||||
}
|
||||
|
||||
throw BadUTF8Sequence(desc, ctx);
|
||||
}
|
||||
|
||||
|
||||
|
||||
std::wstring decode(const std::string& utf8, bool force, wchar_t replace)
|
||||
{
|
||||
wchar_t buffer[128];
|
||||
struct utf8_decode_state ctx;
|
||||
memset(&ctx, 0, sizeof(ctx));
|
||||
|
||||
ctx.rd = utf8.data();
|
||||
ctx.rd_remain = utf8.size();
|
||||
ctx.wr = buffer;
|
||||
ctx.wr_size = 128;
|
||||
if(force) {
|
||||
ctx.error_callback = decode_replace_callback;
|
||||
ctx.data = &replace;
|
||||
} else {
|
||||
ctx.error_callback = decode_error_callback;
|
||||
}
|
||||
|
||||
std::wstring ret;
|
||||
|
||||
while(ctx.rd_remain) {
|
||||
utf8_decoder(&ctx);
|
||||
ret.append(buffer, ctx.written);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
|
||||
static enum utf8_encode_error_action encode_replace_callback(const struct utf8_encode_state* ctx,
|
||||
wchar_t* newch)
|
||||
{
|
||||
*newch = *(wchar_t*)(ctx->data);
|
||||
return utf8_encode_error_action_replace;
|
||||
}
|
||||
|
||||
|
||||
|
||||
std::string encode(const std::wstring& ustr, bool force, wchar_t replace)
|
||||
{
|
||||
char buffer[512];
|
||||
struct utf8_encode_state ctx;
|
||||
memset(&ctx, 0, sizeof(ctx));
|
||||
|
||||
ctx.rd = ustr.data();
|
||||
ctx.rd_remain = ustr.size();
|
||||
ctx.wr = buffer;
|
||||
ctx.wr_size = 512;
|
||||
if(force) {
|
||||
ctx.error_callback = encode_replace_callback;
|
||||
ctx.data = &replace;
|
||||
}
|
||||
|
||||
std::string ret;
|
||||
|
||||
while(ctx.rd_remain) {
|
||||
if(!utf8_encoder(&ctx)) {
|
||||
throw BadUnicodeChar(&ctx);
|
||||
}
|
||||
|
||||
ret.append(buffer, ctx.written);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
/* options for text editors
|
||||
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
*/
|
|
@ -0,0 +1,37 @@
|
|||
/* libutf8++/src/lib/string.h
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
/*! \brief Decode UTF-8.
|
||||
|
||||
\param utf8 The UTF-8 encoded data.
|
||||
\param force If set to \a true, errors will be inhibited.
|
||||
\param replace If \a force is \a true, then invalid UTF-8 sequences will be replaced by this
|
||||
character.
|
||||
\returns The Unicode wide-character string representation.
|
||||
\throws BadUTF8Sequence if there is an invalid byte sequence in the UTF-8 source data.
|
||||
|
||||
This function will decode a UTF-8 source string into a Unicode wide-character string. It has a force
|
||||
mode whereby any errors will be inhibited and a best-effort attempt will be made.
|
||||
|
||||
*/
|
||||
std::wstring decode(const std::string& utf8, bool force = false, wchar_t replace = 0xFFFD);
|
||||
|
||||
|
||||
|
||||
/*! \brief Encode UTF-8.
|
||||
|
||||
\param ustr The Unicode wide-character string.
|
||||
\param force If set to \a true, errors will be inhibited (invalid characters will be replaced by \a replace).
|
||||
\param replace If \a force is \a true, then invalid Unicode characters will be replaced by this
|
||||
character.
|
||||
\returns The UTF-8 transformed representation of \a ustr.
|
||||
\throws BadUnicodeChar on invalid characters in the source data.
|
||||
|
||||
This function will encode a Unicode wide-character string into a UTF-8 transformed representation.
|
||||
It has a force mode whereby any errors will be inhibited and a best-effort attempt will be made.
|
||||
|
||||
*/
|
||||
std::string encode(const std::wstring& ustr, bool force = false, wchar_t replace = 0xFFFD);
|
|
@ -0,0 +1 @@
|
|||
c++ tests tests libutf8++
|
|
@ -0,0 +1,3 @@
|
|||
source src/tests/build.tests
|
||||
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
# vim: expandtab:ts=4:sw=4
|
|
@ -0,0 +1,43 @@
|
|||
# These are external variables, and shouldn't clash with anything else
|
||||
# tests_BUILT
|
||||
#
|
||||
|
||||
# The tests link against the library, so make sure it has been built first.
build_target libutf8++ || return 1

# tests_BUILT guards against building the tests twice in one run.
if [ -z "${tests_BUILT}" ]
then
    # LIBS is deliberately expanded unquoted below so that it splits into words.
    LIBS="${libutf8pp} "
    EXTRAS=""

    echo "Building test programs..."
    do_cmd mkdir -p obj/tests || return 1

    for SRC in src/tests/*.cpp
    do
        # One executable per source file: obj/tests/<name> for src/tests/<name>.cpp
        TEST="obj/tests/$(basename "${SRC}" .cpp)"

        # Rebuild if the library, the source or this script is newer than the executable.
        MODIFIED=0
        for file in ${LIBS} "${SRC}" src/tests/build.tests
        do
            if [ "${file}" -nt "${TEST}" ]
            then
                MODIFIED=1
                break
            fi
        done

        if [ "${MODIFIED}" -ne 0 ]
        then
            do_cmd ${CXX} -Iobj ${CFLAGS} -o ${TEST} ${SRC} ${LIBS} ${EXTRAS} || return 1
            print_success "Built ${TEST}"
        else
            print_success "${TEST} is up to date"
        fi
    done

    print_success "All tests built"

    tests_BUILT=1
fi
|
||||
|
||||
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
# vim: expandtab:ts=4:sw=4
|
|
@ -0,0 +1,85 @@
|
|||
/* libutf8++/src/tests/objects.cpp
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
#include "utf8"
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
|
||||
|
||||
|
||||
/* Fill buf[0 .. ch-1] with random wide characters read from /dev/urandom.
 *
 * Every character is masked with 0x7FFFFFFF after reading. On failure a message is printed with
 * perror() and the integer 1 is thrown. The descriptor is closed on every path once it has been
 * opened.
 */
void make_random(wchar_t* buf, int ch)
{
    const int fd = open("/dev/urandom", O_RDONLY);
    if(fd < 0) {
        perror("open(\"/dev/urandom\")");
        throw 1;
    }

    // read() may return fewer bytes than requested, so keep reading until the buffer is full.
    char* dest = reinterpret_cast<char*>(buf);
    size_t remain = (ch > 0) ? static_cast<size_t>(ch) * sizeof(wchar_t) : 0;
    while(remain) {
        const ssize_t got = read(fd, dest, remain);
        if(got <= 0) {
            perror("read(\"/dev/urandom\")"); // report before close() can disturb errno
            close(fd); // do not leak the descriptor on the error path
            throw 1;
        }
        dest += got;
        remain -= static_cast<size_t>(got);
    }
    close(fd);

    for(int i = 0; i < ch; ++i) {
        buf[i] &= 0x7FFFFFFF;
    }
}
|
||||
|
||||
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
if(argc == 2 && !strcmp(argv[1], "--print-summary")) {
|
||||
std::cout << "Performs some tests on the Encoder and Decoder objects.\n";
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ret = 0;
|
||||
try {
|
||||
wchar_t wch[1024];
|
||||
make_random(wch, 1024);
|
||||
|
||||
std::wstring ustr;
|
||||
ustr.assign(wch, 1024);
|
||||
|
||||
utf8::Encoder encoder;
|
||||
utf8::Decoder decoder;
|
||||
|
||||
encoder.encode(ustr);
|
||||
decoder.decode(encoder.encoded);
|
||||
|
||||
if(ustr != decoder.decoded) {
|
||||
std::cerr << "Decoded string does not match original.\n";
|
||||
for(size_t i = 0, end = std::min(ustr.size(), decoder.decoded.size()); i != end; ++i) {
|
||||
if(ustr[i] != decoder.decoded[i]) {
|
||||
std::cerr << std::dec << std::setfill(' ') << std::setw(4) << i
|
||||
<< std::setfill('0') << std::hex << ": 0x"
|
||||
<< std::setw(8) << ustr[i] << " != "
|
||||
<< std::setw(8) << decoder.decoded[i] << "\n";
|
||||
}
|
||||
}
|
||||
std::cerr << "Original size " << std::dec << ustr.size()
|
||||
<< ", decoded size " << decoder.decoded.size() << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::cout << "Success.\n";
|
||||
}
|
||||
catch(utf8::Error& e) {
|
||||
std::cerr << e.reason << std::endl;
|
||||
ret = 1;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* options for text editors
|
||||
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
*/
|
|
@ -0,0 +1,82 @@
|
|||
/* libutf8++/src/tests/strings.cpp
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
#include "utf8"
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
|
||||
|
||||
|
||||
/* Fill buf[0 .. ch-1] with random wide characters read from /dev/urandom.
 *
 * Every character is masked with 0x7FFFFFFF after reading. On failure a message is printed with
 * perror() and the integer 1 is thrown. The descriptor is closed on every path once it has been
 * opened.
 */
void make_random(wchar_t* buf, int ch)
{
    const int fd = open("/dev/urandom", O_RDONLY);
    if(fd < 0) {
        perror("open(\"/dev/urandom\")");
        throw 1;
    }

    // read() may return fewer bytes than requested, so keep reading until the buffer is full.
    char* dest = reinterpret_cast<char*>(buf);
    size_t remain = (ch > 0) ? static_cast<size_t>(ch) * sizeof(wchar_t) : 0;
    while(remain) {
        const ssize_t got = read(fd, dest, remain);
        if(got <= 0) {
            perror("read(\"/dev/urandom\")"); // report before close() can disturb errno
            close(fd); // do not leak the descriptor on the error path
            throw 1;
        }
        dest += got;
        remain -= static_cast<size_t>(got);
    }
    close(fd);

    for(int i = 0; i < ch; ++i) {
        buf[i] &= 0x7FFFFFFF;
    }
}
|
||||
|
||||
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
if(argc == 2 && !strcmp(argv[1], "--print-summary")) {
|
||||
std::cout << "Performs some tests on the string encode/decode routines.\n";
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ret = 0;
|
||||
try {
|
||||
wchar_t wch[1024];
|
||||
make_random(wch, 1024);
|
||||
|
||||
std::wstring ustr1, ustr2;
|
||||
std::string utf8;
|
||||
ustr1.assign(wch, 1024);
|
||||
utf8 = utf8::encode(ustr1);
|
||||
ustr2 = utf8::decode(utf8);
|
||||
|
||||
if(ustr1 != ustr2) {
|
||||
std::cerr << "Decoded string does not match original.\n";
|
||||
for(size_t i = 0, end = std::min(ustr1.size(), ustr2.size()); i != end; ++i) {
|
||||
if(ustr1[i] != ustr2[i]) {
|
||||
std::cerr << std::dec << std::setfill(' ') << std::setw(4) << i
|
||||
<< std::setfill('0') << std::hex << ": 0x"
|
||||
<< std::setw(8) << ustr1[i] << " != "
|
||||
<< std::setw(8) << ustr2[i] << "\n";
|
||||
}
|
||||
}
|
||||
std::cerr << "Original size " << std::dec << ustr1.size()
|
||||
<< ", decoded size " << ustr2.size() << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::cout << "Success.\n";
|
||||
}
|
||||
catch(utf8::Error& e) {
|
||||
std::cerr << e.reason << std::endl;
|
||||
ret = 1;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* options for text editors
|
||||
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
*/
|
|
@ -0,0 +1,44 @@
|
|||
/* libutf8++/src/tests/???.cpp
|
||||
*
|
||||
* (c)2006, Laurence Withers, <l@lwithers.me.uk>.
|
||||
* Released under the GNU GPLv2. See file COPYING or
|
||||
* http://www.gnu.org/copyleft/gpl.html for details.
|
||||
*/
|
||||
|
||||
#include "utf8"
|
||||
|
||||
#include <iostream>
|
||||
|
||||
|
||||
|
||||
/* Skeleton entry point for a new test program.
 *
 * With the single argument "--print-summary" a one-line description is printed and 0 is
 * returned. Otherwise the test body runs; any exception is reported on stderr and turns the exit
 * status into 1.
 */
int main(int argc, char* argv[])
{
    const bool want_summary = (argc == 2) && (strcmp(argv[1], "--print-summary") == 0);
    if(want_summary) {
        std::cout << "One line summary.\n";
        return 0;
    }

    if(argc == 1) {
        // empty argument list
    }

    try {
        // TODO
    }
    catch(std::exception& e) {
        std::cerr << e.what() << std::endl;
        return 1;
    }
    catch(...) {
        std::cerr << "Unknown exception caught." << std::endl;
        return 1;
    }

    return 0;
}
|
||||
|
||||
/* options for text editors
|
||||
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
vim: expandtab:ts=4:sw=4
|
||||
*/
|
Loading…
Reference in New Issue