Copy from svn repository.

This commit is contained in:
Laurence Withers 2006-07-31 15:40:56 +01:00
parent 73d6e6fbd0
commit ac22dabfe6
34 changed files with 1557 additions and 1 deletions

11
README
View File

@ -10,5 +10,14 @@ Really Quick Instructions
To build: ./make.sh
To install: ./make.sh install
(you might want to set PREFIX, by default it's /usr/local)
Documentation is automatically built using doxygen.
@TODO@
Dependencies
------------
libutf8, http://www.lwithers.me.uk/projects/libutf8/
Project Homepage
----------------
http://www.lwithers.me.uk/projects/libutf8++/

1
src/docs/.params Normal file
View File

@ -0,0 +1 @@
doxygen docs docs

146
src/docs/Doxyfile.in Normal file
View File

@ -0,0 +1,146 @@
# libutf8++/src/docs/Doxyfile.in
#
# (c)2006, Laurence Withers, <l@lwithers.me.uk>.
# Released under the GNU GPLv2. See file COPYING or
# http://www.gnu.org/copyleft/gpl.html for details.
#
PROJECT_NAME = libutf8++
OUTPUT_DIRECTORY =
CREATE_SUBDIRS = NO
OUTPUT_LANGUAGE = English
USE_WINDOWS_ENCODING = NO
BRIEF_MEMBER_DESC = YES
REPEAT_BRIEF = YES
ABBREVIATE_BRIEF =
ALWAYS_DETAILED_SEC = NO
INLINE_INHERITED_MEMB = YES
FULL_PATH_NAMES = NO
STRIP_FROM_PATH =
STRIP_FROM_INC_PATH =
SHORT_NAMES = NO
JAVADOC_AUTOBRIEF = NO
MULTILINE_CPP_IS_BRIEF = YES
DETAILS_AT_TOP = YES
INHERIT_DOCS = YES
DISTRIBUTE_GROUP_DOC = NO
TAB_SIZE = 4
ALIASES =
OPTIMIZE_OUTPUT_FOR_C = NO
OPTIMIZE_OUTPUT_JAVA = NO
SUBGROUPING = YES
EXTRACT_ALL = NO
EXTRACT_PRIVATE = NO
EXTRACT_STATIC = NO
EXTRACT_LOCAL_CLASSES = NO
EXTRACT_LOCAL_METHODS = NO
HIDE_UNDOC_MEMBERS = NO
HIDE_UNDOC_CLASSES = NO
HIDE_FRIEND_COMPOUNDS = YES
HIDE_IN_BODY_DOCS = NO
INTERNAL_DOCS = NO
CASE_SENSE_NAMES = YES
HIDE_SCOPE_NAMES = NO
SHOW_INCLUDE_FILES = NO
INLINE_INFO = YES
SORT_MEMBER_DOCS = YES
SORT_BRIEF_DOCS = NO
SORT_BY_SCOPE_NAME = NO
GENERATE_TODOLIST = YES
GENERATE_TESTLIST = YES
GENERATE_BUGLIST = YES
GENERATE_DEPRECATEDLIST= YES
ENABLED_SECTIONS =
MAX_INITIALIZER_LINES = 30
SHOW_USED_FILES = NO
SHOW_DIRECTORIES = NO
FILE_VERSION_FILTER =
QUIET = YES
WARNINGS = YES
WARN_IF_UNDOCUMENTED = YES
WARN_IF_DOC_ERROR = YES
WARN_NO_PARAMDOC = YES
WARN_FORMAT = "$file:$line: $text"
WARN_LOGFILE =
FILE_PATTERNS =
RECURSIVE = NO
EXCLUDE =
EXCLUDE_SYMLINKS = NO
EXCLUDE_PATTERNS =
EXAMPLE_PATH =
EXAMPLE_PATTERNS =
EXAMPLE_RECURSIVE = NO
IMAGE_PATH = src/docs
INPUT_FILTER =
FILTER_PATTERNS =
FILTER_SOURCE_FILES = NO
SOURCE_BROWSER = NO
INLINE_SOURCES = NO
STRIP_CODE_COMMENTS = YES
REFERENCED_BY_RELATION = YES
REFERENCES_RELATION = YES
VERBATIM_HEADERS = NO
ALPHABETICAL_INDEX = YES
COLS_IN_ALPHA_INDEX = 5
IGNORE_PREFIX =
GENERATE_HTML = YES
HTML_OUTPUT = html
HTML_FILE_EXTENSION = .html
HTML_HEADER =
HTML_FOOTER =
HTML_STYLESHEET =
HTML_ALIGN_MEMBERS = YES
GENERATE_HTMLHELP = NO
CHM_FILE =
HHC_LOCATION =
GENERATE_CHI = NO
BINARY_TOC = NO
TOC_EXPAND = NO
DISABLE_INDEX = NO
ENUM_VALUES_PER_LINE = 4
GENERATE_TREEVIEW = NO
TREEVIEW_WIDTH = 250
GENERATE_LATEX = NO
GENERATE_RTF = NO
GENERATE_MAN = NO
GENERATE_XML = NO
GENERATE_AUTOGEN_DEF = NO
GENERATE_PERLMOD = NO
ENABLE_PREPROCESSING = YES
MACRO_EXPANSION = NO
EXPAND_ONLY_PREDEF = NO
SEARCH_INCLUDES = YES
INCLUDE_PATH =
INCLUDE_FILE_PATTERNS =
PREDEFINED = DOXYGEN
EXPAND_AS_DEFINED =
SKIP_FUNCTION_MACROS = YES
TAGFILES =
GENERATE_TAGFILE =
ALLEXTERNALS = NO
EXTERNAL_GROUPS = YES
PERL_PATH = /usr/bin/perl
CLASS_DIAGRAMS = YES
HIDE_UNDOC_RELATIONS = YES
HAVE_DOT = YES
CLASS_GRAPH = YES
COLLABORATION_GRAPH = YES
GROUP_GRAPHS = NO
UML_LOOK = NO
TEMPLATE_RELATIONS = NO
INCLUDE_GRAPH = NO
INCLUDED_BY_GRAPH = NO
CALL_GRAPH = NO
GRAPHICAL_HIERARCHY = YES
DIRECTORY_GRAPH = NO
DOT_IMAGE_FORMAT = png
DOT_PATH =
DOTFILE_DIRS =
MAX_DOT_GRAPH_WIDTH = 1024
MAX_DOT_GRAPH_HEIGHT = 1024
MAX_DOT_GRAPH_DEPTH = 0
DOT_TRANSPARENT = YES
DOT_MULTI_TARGETS = YES
GENERATE_LEGEND = YES
DOT_CLEANUP = YES
SEARCHENGINE = NO

15
src/docs/MainPage.dox Normal file
View File

@ -0,0 +1,15 @@
/* libutf8++/src/docs/MainPage.dox
*
* (c)2006, Laurence Withers, <l@lwithers.me.uk>.
* Released under the GNU GPLv2. See file COPYING or
* http://www.gnu.org/copyleft/gpl.html for details.
*/
/*! \mainpage
*/
/* options for text editors
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
vim: expandtab:ts=4:sw=4
*/

1
src/docs/build.default Normal file
View File

@ -0,0 +1 @@
source src/docs/build.docs

43
src/docs/build.docs Normal file
View File

@ -0,0 +1,43 @@
# These are external variables, and shouldn't clash with anything else
# docs_BUILT
#
# Build fragment (sourced by the build system) that generates the HTML
# documentation with Doxygen. Relies on the helpers build_target, do_cmd and
# print_success, and on VERSION, all of which are provided by the caller.
# Add the hand-written .dox pages to the list of Doxygen inputs.
MONOLITHIC_DOC="${MONOLITHIC_DOC} $(echo src/docs/*.dox)"
build_target monolithic
# docs_BUILT guards against running this fragment twice in one invocation.
if [ -z ${docs_BUILT} ]
then
echo "Building documentation with Doxygen..."
DOXYFILE=obj/Doxyfile.docs
# The Doxyfile is generated only once: the template is copied and the input
# list and version number are appended. It is not regenerated if it exists.
if [ ! -e ${DOXYFILE} ]
then
do_cmd cp src/docs/Doxyfile.in ${DOXYFILE} || return 1
echo "INPUT = ${MONOLITHIC_DOC}" >> ${DOXYFILE}
echo "PROJECT_NUMBER = ${VERSION}" >> ${DOXYFILE}
fi
# Rebuild only if any input is newer than the generated index page.
MODIFIED=0
for file in ${MONOLITHIC_DOC}
do
if [ ${file} -nt html/index.html ]
then
MODIFIED=1
break
fi
done
if [ ${MODIFIED} -ne 0 ]
then
do_cmd doxygen ${DOXYFILE} || return 1
print_success "Documentation built"
else
print_success "Documentation is up to date"
fi
docs_BUILT=1
fi
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
# vim: expandtab:ts=4:sw=4

1
src/docs/build.install Normal file
View File

@ -0,0 +1 @@
source src/docs/build.install-docs

View File

@ -0,0 +1,21 @@
# Build fragment (sourced by the build system) that installs the generated
# documentation under ${DOCSDIR}. Relies on the helpers build_target,
# build_dir_tree, install_file and print_success provided by the caller.
# Make sure the documentation has been generated first.
build_target docs
# create documentation directories
echo "Installing documentation into ${DOCSDIR}"
build_dir_tree "${DOCSDIR}/html" || return 1
# copy across the Doxygen-generated documentation
for file in html/*
do
install_file ${file} ${DOCSDIR}/html 0644 || return 1
done
# copy across the generic files
for file in COPYING README
do
install_file ${file} ${DOCSDIR} 0644 || return 1
done
print_success "Documentation installed"
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
# vim: expandtab:ts=4:sw=4

1
src/libutf8++/.params Normal file
View File

@ -0,0 +1 @@
c++ lib libutf8++ utf8

View File

@ -0,0 +1,9 @@
/* libutf8++/src/lib/BottomHeader.h
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
}
#endif

View File

@ -0,0 +1,9 @@
/* libutf8++/src/lib/ForwardDeclare.h
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
// This file simply contains forward declarations of all libutf8++
// classes, to facilitate header ordering, etc.

23
src/libutf8++/TopHeader.h Normal file
View File

@ -0,0 +1,23 @@
/* libutf8++/src/lib/TopHeader.h
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
#ifndef HEADER_libutf8pp
#define HEADER_libutf8pp
// standard includes, or includes needed for type declarations
#include <string>
#include <stdexcept>
#include <utf8.h>
/*! \brief UTF-8 handling routines.
The library's UTF-8 handling routines are all made available through this namespace.
*/
namespace utf8 {

View File

@ -0,0 +1,12 @@
/* libutf8++/src/lib/TopSource.cpp
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
#include "utf8"
// Below are all the includes used throughout the library.
#include <cstring>
#include <sstream>
#include <iomanip>

View File

@ -0,0 +1 @@
source src/libutf8++/build.lib

View File

@ -0,0 +1 @@
source src/libutf8++/build.install-lib

View File

@ -0,0 +1,36 @@
# Build fragment (sourced by the build system) that installs the shared
# library, its version symlinks, the public header and the pkg-config file.
# Every step aborts the install on failure so that "Done" is only printed
# when the whole installation succeeded.
build_target libutf8++ || return 1

# make paths (this is for Gentoo in particular)
build_dir_tree "${LIBDIR}" || return 1
build_dir_tree "${PKGCONFDIR}" || return 1
build_dir_tree "${INCLUDEDIR}" || return 1

# install library
echo "Installing libraries into '${LIBDIR}'"
install_file ${libutf8pp} ${LIBDIR} 0755 || return 1

# Symlink chain: libutf8++.so -> .so.MAJOR -> .so.MAJOR.MINOR -> .so.MAJOR.MINOR.MICRO
BASE="${libutf8pp_BASE}.so"
MAJOR="${BASE}.${SOMAJOR}"
MINOR="${MAJOR}.${SOMINOR}"
MICRO="${MINOR}.${SOMICRO}"
install_symlink "${MINOR}" "${MICRO}" "${LIBDIR}" || return 1
install_symlink "${MAJOR}" "${MINOR}" "${LIBDIR}" || return 1
install_symlink "${BASE}" "${MAJOR}" "${LIBDIR}" || return 1

# install header
echo "Installing header file '${libutf8pp_HEADER}' into ${INCLUDEDIR}"
install_header ${libutf8pp_HEADER} ${INCLUDEDIR} 0644 || return 1

# install pkgconfig file
echo "Installing package config file into ${PKGCONFDIR}"
PKGCONFFILE=${PKGCONFDIR}/libutf8pp.pc
do_cmd rm -f ${PKGCONFFILE} || return 1
# Fill in the template placeholders with the final (post-install) paths.
do_cmd_redir ${PKGCONFFILE} sed \
    -e "s,@VERSION@,${VERSION}," \
    -e "s,@LIBDIR@,${FINALLIBDIR}," \
    -e "s,@INCLUDEDIR@,${FINALINCLUDEDIR}," \
    src/libutf8++/pkgconf.in || return 1
do_cmd chmod 0644 ${PKGCONFFILE} || return 1

print_success "Done"
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
# vim: expandtab:ts=4:sw=4

51
src/libutf8++/build.lib Normal file
View File

@ -0,0 +1,51 @@
# These are external variables, and shouldn't clash with anything else
# libutf8pp
# libutf8pp_BUILT
# libutf8pp_HEADER
# libutf8pp_BASE
#
# Build fragment (sourced by the build system) that compiles the monolithic
# source into the versioned shared library under obj/. Relies on the helpers
# do_cmd and print_success and on CXX/CFLAGS provided by the caller.
# libutf8pp_BUILT guards against building twice in one invocation.
if [ -z ${libutf8pp_BUILT} ]
then
libutf8pp_BASE=libutf8++
# soversion defines SOMAJOR, SOMINOR and SOMICRO.
source src/libutf8++/soversion
libutf8pp="obj/${libutf8pp_BASE}.so.${SOMAJOR}.${SOMINOR}.${SOMICRO}"
# Flags for the libutf8 dependency come from pkg-config.
SO_EXTRA="$(pkg-config libutf8 --libs --cflags) -lstdc++ -lc"
echo "Building library ${libutf8pp}..."
# build.monolithic sets SRC, HDR and MONOLITHIC_TESTS.
do_cmd source src/libutf8++/build.monolithic || return 1
# Rebuild only if a build script, the header or the source is newer than
# the existing library.
MODIFIED=0
for test in ${MONOLITHIC_TESTS} ${HDR} ${SRC}
do
if [ ${test} -nt ${libutf8pp} ]
then
MODIFIED=1
break
fi
done
if [ ${MODIFIED} -ne 0 ]
then
echo " Compiling"
# The soname carries the major and minor version numbers only.
SONAME="${libutf8pp_BASE}.so.${SOMAJOR}.${SOMINOR}"
do_cmd ${CXX} ${CFLAGS} -shared -fpic -o "${libutf8pp}" \
-Wl,-soname,${SONAME} \
${SRC} ${SO_EXTRA} || return 1
# make tests work
do_cmd ln -sf $(basename ${libutf8pp}) obj/${SONAME} || return 1
print_success "Library built"
else
print_success "Library up to date"
fi
libutf8pp_BUILT=1
libutf8pp_HEADER=${HDR}
fi
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
# vim: expandtab:ts=4:sw=4

View File

@ -0,0 +1,21 @@
# These are external variables, and shouldn't clash with anything else
# libutf8pp_MONOLITHIC
#
# Build fragment (sourced by build.lib) that concatenates the individual
# header and source fragments into one public header (obj/utf8) and one
# source file (obj/libutf8++.cpp) via the make_monolithic helper.
SRC="obj/libutf8++.cpp"
HDR="obj/utf8"
# Build scripts whose modification should trigger a library rebuild.
MONOLITHIC_TESTS="src/libutf8++/build.lib src/libutf8++/build.monolithic"
# libutf8pp_MONOLITHIC guards against generating the files twice.
if [ -z "${libutf8pp_MONOLITHIC}" ]
then
# Order matters: TopHeader opens the include guard and namespace that
# BottomHeader closes.
MONOLITHIC_SOURCE="$(echo src/libutf8++/{TopHeader,ForwardDeclare,exception,string,{en,de}coder,BottomHeader}.h)"
# NOTE(review): the second argument "C" is interpreted by make_monolithic,
# which is defined outside this file - confirm its meaning there.
make_monolithic ${HDR} C || return 1
MONOLITHIC_SOURCE="$(echo src/libutf8++/{TopSource,exception,string,{en,de}coder}.cpp)"
make_monolithic ${SRC} C || return 1
libutf8pp_MONOLITHIC=1
# The generated header is also an input for the Doxygen documentation.
MONOLITHIC_DOC="${MONOLITHIC_DOC} ${HDR}"
fi
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
# vim: expandtab:ts=4:sw=4

159
src/libutf8++/decoder.cpp Normal file
View File

@ -0,0 +1,159 @@
/* libutf8++/src/lib/decoder.cpp
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
namespace utf8 {
Decoder::Decoder(size_t hint)
{
memset(&ctx, 0, sizeof(ctx));
ctx.wr_size = (hint < 2) ? 2 : hint;
ctx.wr = new wchar_t[ctx.wr_size];
ctx.error_callback = _exceptionOnError;
ctx.data = this;
}
Decoder::~Decoder()
{
delete [] ctx.wr;
}
void Decoder::decode(const std::string& str)
{
decode(str.data(), str.size());
}
void Decoder::decode(const char* str, ssize_t amt)
{
ctx.rd = str;
ctx.rd_remain = amt;
while(ctx.rd_remain) {
utf8_decoder(&ctx);
decoded.append(ctx.wr, ctx.written);
if(ctx.rd_remain < 0 && !*(ctx.rd)) break;
if(ctx.rd_remain) {
ctx.wr_size *= 2;
delete [] ctx.wr;
ctx.wr = new wchar_t[ctx.wr_size];
}
}
}
bool Decoder::complete() const
{
return ctx.complete;
}
void Decoder::reset()
{
size_t old_wr_size = ctx.wr_size;
wchar_t* old_wr = ctx.wr;
utf8_decode_error_callback old_error_callback = ctx.error_callback;
memset(&ctx, 0, sizeof(ctx));
ctx.wr_size = old_wr_size;
ctx.wr = old_wr;
ctx.error_callback = old_error_callback;
ctx.data = this;
decoded.clear();
}
void Decoder::skipOnError()
{
ctx.error_callback = _skipOnError;
}
void Decoder::replaceOnError(wchar_t ch)
{
replaceChar = ch;
ctx.error_callback = _replaceOnError;
}
void Decoder::exceptionOnError()
{
ctx.error_callback = _exceptionOnError;
}
enum utf8_decode_error_action Decoder::_skipOnError
(const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch)
{
(void)ctx;
(void)error;
(void)newch;
return utf8_decode_error_action_skip;
}
enum utf8_decode_error_action Decoder::_replaceOnError
(const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch)
{
(void)error;
Decoder* self = (utf8::Decoder*)(ctx->data);
*newch = self->replaceChar;
return utf8_decode_error_action_replace;
}
enum utf8_decode_error_action Decoder::_exceptionOnError
(const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch)
{
(void)newch;
const char* desc = "unknown";
switch(error) {
case utf8_decode_error_lone_cchar:
desc = "An invalid continuation byte was encountered while expecting a character.";
break;
case utf8_decode_error_not_cchar:
desc = "A multi-byte sequence contained an invalid byte.";
break;
case utf8_decode_error_not_schar:
desc = "An invalid byte was encountered while expecting a character.";
break;
case utf8_decode_error_overlong:
desc = "An overlong encoding of a character was encountered.";
break;
case utf8_decode_error_illegal_cp:
desc = "An illegal code point (a UTF-16 surrogate perhaps?) was encountered.";
break;
}
throw BadUTF8Sequence(desc, ctx);
}
}
/* options for text editors
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
*/

128
src/libutf8++/decoder.h Normal file
View File

@ -0,0 +1,128 @@
/* libutf8++/src/lib/decoder.h
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
/*! \brief Stateful UTF-8 decoder object.

This object is used for stateful decoding of a UTF-8 byte stream. It can be fed the data in
arbitrary chunks, even split on non-character boundaries. It writes its output into a wide character
string.

A variety of error handling modes are available. The default is to throw a BadUTF8Sequence
exception, but you can change this with skipOnError() or replaceOnError().

Decoder objects own an internal buffer and cannot be copied.

*/
class Decoder {
public:
    /*! \brief Constructor.

    \param hint Hint at number of characters to allocate space for in decoder buffer.

    The constructor sets up the UTF-8 decoder. You can provide a hint as to the size of your input
    stream chunks. This hint is the number of characters to allocate in the output buffer. If,
    during a single decode operation, this buffer is filled, then it is doubled in size.

    */
    Decoder(size_t hint = 25);

    /// Destructor.
    ~Decoder();

    /// Result of decoding operations (appended to).
    std::wstring decoded;

    /*! \brief UTF-8 decoder.

    \param str Pointer to source data.
    \param amt Number of bytes in source data (-1 for null terminated strings).
    \throws BadUTF8Sequence in exception mode, if invalid data is encountered.

    This function will decode a chunk of UTF-8 data. The decoded data will be appended to whatever
    is contained in the string decoded. You can check if the decoder ended on a character boundary
    or not by calling complete().

    */
    void decode(const char* str, ssize_t amt);

    /// Decode data stored in a std::string.
    void decode(const std::string& str);

    /// Returns \a true if the last call to \a decode() ended on a character boundary.
    bool complete() const;

    /*! \brief Resets the parser for a new UTF-8 stream.

    This function will clear the internal state of the decoder so that it is ready for data from a
    new source. This can be used if you have opened a new file, accepted a new connection, recovered
    from an error, etc. It will also clear \a decoded. The error handling mode is kept.

    */
    void reset();

    /*! \brief Set error handling to \e skip mode.

    This function will set the error handling into \e skip mode. In this mode, any invalid UTF-8
    byte sequences will simply be skipped altogether, and will not have any effect on the output in
    \a decoded.

    */
    void skipOnError();

    /*! \brief Set error handling to \e replace mode.

    \param ch The replacement character that will appear in the output.

    This function will set the error handling into \e replace mode. In this mode, any invalid UTF-8
    byte sequences will be skipped, and a replacement character \a ch will be placed onto the output
    in \a decoded. The default parameter is U+FFFD, the Unicode replacement character, which is
    usually rendered as a question mark inside a diamond.

    */
    void replaceOnError(wchar_t ch = 0xFFFD);

    /*! \brief Set error handling to \e exception mode (default).

    This function will set the error handling to \e exception mode. In this mode, any invalid
    UTF-8 byte sequences will cause a BadUTF8Sequence exception to be thrown. This is the default
    mode.

    */
    void exceptionOnError();

private:
    // The decoder owns the buffer in ctx.wr and registers itself in ctx.data, so a memberwise
    // copy would delete the buffer twice and leave callbacks pointing at the wrong object.
    // Copying is therefore disabled: these are declared but intentionally never defined.
    Decoder(const Decoder&);
    Decoder& operator=(const Decoder&);

    struct utf8_decode_state ctx;
    wchar_t replaceChar;

    static enum utf8_decode_error_action _skipOnError
        (const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch);
    static enum utf8_decode_error_action _replaceOnError
        (const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch);
    static enum utf8_decode_error_action _exceptionOnError
        (const struct utf8_decode_state *ctx, enum utf8_decode_error error, wchar_t *newch);
};
/* options for text editors
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
*/

124
src/libutf8++/encoder.cpp Normal file
View File

@ -0,0 +1,124 @@
/* libutf8++/src/lib/encoder.cpp
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
namespace utf8 {
Encoder::Encoder(size_t hint)
{
memset(&ctx, 0, sizeof(ctx));
ctx.wr_size = (hint < 7) ? 7 : hint;
ctx.wr = new char[ctx.wr_size];
ctx.error_callback = _exceptionOnError;
ctx.data = this;
}
Encoder::~Encoder()
{
delete [] ctx.wr;
}
void Encoder::reset()
{
char* wr = ctx.wr;
size_t wr_size = ctx.wr_size;
utf8_encode_error_callback cb = ctx.error_callback;
memset(&ctx, 0, sizeof(ctx));
ctx.wr = wr;
ctx.wr_size = wr_size;
ctx.error_callback = cb;
ctx.data = this;
}
void Encoder::encode(const std::wstring& str)
{
encode(str.data(), str.size());
}
void Encoder::encode(const wchar_t* str, ssize_t amt)
{
ctx.rd = str;
ctx.rd_remain = amt;
while(ctx.rd_remain) {
if(!utf8_encoder(&ctx)) throw BadUnicodeChar(&ctx);
encoded.append(ctx.wr, ctx.written);
if(ctx.rd_remain < 0 && !*(ctx.rd)) break;
if(ctx.rd_remain) {
ctx.wr_size *= 2;
delete [] ctx.wr;
ctx.wr = new char[ctx.wr_size];
}
}
}
void Encoder::skipOnError()
{
ctx.error_callback = _skipOnError;
}
void Encoder::replaceOnError(wchar_t ch)
{
replaceChar = ch;
ctx.error_callback = _replaceOnError;
}
void Encoder::exceptionOnError()
{
ctx.error_callback = _exceptionOnError;
}
enum utf8_encode_error_action Encoder::_skipOnError
(const struct utf8_encode_state *ctx, wchar_t *newch)
{
(void)ctx;
(void)newch;
return utf8_encode_error_action_skip;
}
enum utf8_encode_error_action Encoder::_replaceOnError
(const struct utf8_encode_state *ctx, wchar_t *newch)
{
Encoder* self = (utf8::Encoder*)(ctx->data);
*newch = self->replaceChar;
return utf8_encode_error_action_replace;
}
enum utf8_encode_error_action Encoder::_exceptionOnError
(const struct utf8_encode_state *ctx, wchar_t *newch)
{
(void)newch;
throw BadUnicodeChar(ctx);
}
}
/* options for text editors
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
*/

108
src/libutf8++/encoder.h Normal file
View File

@ -0,0 +1,108 @@
/* libutf8++/src/lib/encoder.h
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
/*! \brief UTF-8 encoder object.

This object is used to encode Unicode wide characters into UTF-8. It can be fed chunks of characters
which it then encodes, appending the result to an internal buffer.

A variety of error handling modes are available. The default is to throw a BadUnicodeChar
exception, but you can change this with skipOnError() or replaceOnError().

Encoder objects own an internal buffer and cannot be copied.

*/
class Encoder {
public:
    /*! \brief Constructor.

    \param hint Number of bytes to allocate for the encoding buffer.

    The constructor sets up the encoder and allocates some space for an internal buffer. You can
    hint at how large you expect the chunks to be encoded will be. If an encoding operation fills
    the buffer without consuming all the input data, the buffer will be doubled in size for the
    next round.

    */
    Encoder(size_t hint = 100);

    /// Destructor.
    virtual ~Encoder();

    /// UTF-8 output data is appended to this string.
    std::string encoded;

    /*! \brief Encode some data into UTF-8.

    \param str Pointer to the character array to encode.
    \param amt Number of characters to encode (a negative value is treated as a null terminated
        string, as in Decoder::decode()).
    \throws BadUnicodeChar in exception mode, if an invalid character is encountered.

    This function performs an encoding of some Unicode characters into UTF-8. It appends the result
    onto \a encoded.

    */
    void encode(const wchar_t* str, ssize_t amt);

    /// Encode a std::wstring.
    void encode(const std::wstring& str);

    /// Reset the encoder for a new character stream (\a encoded is not cleared).
    void reset();

    /*! \brief Set error handling to \e skip mode.

    This function will set the error handling into \e skip mode. In this mode, any invalid Unicode
    characters will simply be skipped altogether, and will not have any effect on the output in
    \a encoded.

    */
    void skipOnError();

    /*! \brief Set error handling to \e replace mode.

    \param ch The replacement character that will appear in the output.

    This function will set the error handling into \e replace mode. In this mode, any invalid
    Unicode characters will be replaced by the character \a ch, which is encoded onto the output
    in \a encoded instead. The default parameter is U+FFFD, the Unicode replacement character,
    which is usually rendered as a question mark inside a diamond.

    */
    void replaceOnError(wchar_t ch = 0xFFFD);

    /*! \brief Set error handling to \e exception mode (default).

    This function will set the error handling to \e exception mode. In this mode, any invalid
    Unicode characters will cause a BadUnicodeChar exception to be thrown. This is the default
    mode.

    */
    void exceptionOnError();

private:
    // The encoder owns the buffer in ctx.wr and registers itself in ctx.data, so a memberwise
    // copy would delete the buffer twice and leave callbacks pointing at the wrong object.
    // Copying is therefore disabled: these are declared but intentionally never defined.
    Encoder(const Encoder&);
    Encoder& operator=(const Encoder&);

    struct utf8_encode_state ctx;
    wchar_t replaceChar;

    static enum utf8_encode_error_action _skipOnError
        (const struct utf8_encode_state *ctx, wchar_t *newch);
    static enum utf8_encode_error_action _replaceOnError
        (const struct utf8_encode_state *ctx, wchar_t *newch);
    static enum utf8_encode_error_action _exceptionOnError
        (const struct utf8_encode_state *ctx, wchar_t *newch);
};
/* options for text editors
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
*/

View File

@ -0,0 +1,81 @@
/* libutf8++/src/lib/exception.cpp
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
namespace utf8 {

// Stores the human-readable reason for later retrieval through what().
Error::Error(const std::string& reason)
    : reason(reason)
{
}

// Returns the stored reason as a C string (valid while the object lives and
// `reason` is not modified).
const char* Error::what()
{
    return reason.c_str();
}

// Captures the offending character and its position from the encoder state.
// Line and column are stored 1-based, matching their documentation, the
// formatted message below and BadUTF8Sequence.
BadUnicodeChar::BadUnicodeChar(const struct utf8_encode_state* ctx)
    : Error(format(ctx)), badChar(*ctx->rd), line(ctx->line + 1), col(ctx->col + 1),
      char_offset(ctx->char_offset)
{
}

// Builds the multi-line diagnostic message for an invalid code point.
std::string BadUnicodeChar::format(const struct utf8_encode_state* ctx)
{
    std::ostringstream str;
    str << "Invalid Unicode code point encountered."
        "\n Position : line "
        << ctx->line + 1
        << ", column "
        << ctx->col + 1
        << "\n Stream offset : "
        << ctx->char_offset
        << " characters\n Character value: 0x"
        << std::hex
        // Print the numeric value explicitly: inserting a wchar_t into a
        // narrow stream relies on an implicit promotion and is a deleted
        // overload from C++20 onwards.
        << static_cast<unsigned int>(*(ctx->rd));
    return str.str();
}

// Captures the description and the position (1-based line and column, plus
// character and byte offsets) from the decoder state.
BadUTF8Sequence::BadUTF8Sequence(const std::string& description,
    const struct utf8_decode_state* ctx)
    : Error(format(description, ctx)), description(description), line(ctx->line + 1),
      col(ctx->col + 1), char_offset(ctx->char_offset), byte_offset(ctx->byte_offset)
{
}

// Builds the multi-line diagnostic message for an invalid byte sequence.
std::string BadUTF8Sequence::format(const std::string& description,
    const struct utf8_decode_state* ctx)
{
    std::ostringstream str;
    str << "Bad byte sequence in UTF-8 data.\n"
        " Reason : " << description
        << "\n Position: line " << ctx->line + 1
        << ", column " << ctx->col + 1
        << "\n Offset : " << ctx->char_offset << " chars, " << ctx->byte_offset << " bytes";
    return str.str();
}

}
/* options for text editors
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
*/

96
src/libutf8++/exception.h Normal file
View File

@ -0,0 +1,96 @@
/* libutf8++/src/lib/exception.h
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
/*! \brief Exception base class.

This is the base class for all libutf8++ exceptions. It contains one member, \a reason, which allows
you to print a human-readable description of the error. To recover the actual type, you can refer
to the more specific derived classes.

*/
class Error : public std::exception {
public:
    /// Human-readable reason for error.
    std::string reason;

    /// Constructor.
    Error(const std::string& reason);

    /// Destructor.
    virtual ~Error() throw()
    { }

    /// Find what caused the error (non-const overload, kept for backward compatibility).
    virtual const char* what();

    /*! \brief Find what caused the error.

    This overrides std::exception::what(), so the reason is also available when the exception is
    caught as a \c std::exception reference or through a const reference.

    */
    virtual const char* what() const throw()
    { return reason.c_str(); }
};
/*! \brief Invalid Unicode character exception.
This exception is thrown when encoding Unicode into UTF-8 and an invalid character is encountered.
The human-readable message (see Error::reason) is built by the constructor from the encoder state.
*/
class BadUnicodeChar : public Error {
public:
/// A copy of the invalid character.
wchar_t badChar;
/// Line of input data at which error occurred (starts at 1).
int line;
/// Column of input data at which error occurred (starts at 1).
int col;
/// Character offset of input data at which error occurred.
int char_offset;
/*! \brief Constructor.
\param ctx Encoder state at the point of failure. The character at its read position and its
position counters are copied into this exception, so \a ctx need not outlive it.
*/
BadUnicodeChar(const struct utf8_encode_state* ctx);
private:
// Builds the diagnostic message passed to the Error base class.
static std::string format(const struct utf8_encode_state* ctx);
};
/*! \brief Invalid UTF-8 sequence exception.

This exception is thrown when decoding UTF-8 and an invalid sequence is encountered. This could be
a nonsensical sequence, a redundantly-encoded character or truncated source data. It contains some
variables for allowing detailed diagnostics.

*/
class BadUTF8Sequence : public Error {
public:
    /// Description of the error, for human diagnostics.
    std::string description;

    /// Line of input data at which error occurred (starts at 1).
    int line;

    /// Column of input data at which error occurred (starts at 1).
    int col;

    /// Character offset of input data at which error occurred.
    int char_offset;

    /// Byte offset of input data at which error occurred.
    int byte_offset;

    /*! \brief Constructor.

    \param description Human-readable description of what was wrong with the byte sequence.
    \param ctx Decoder state at the point of failure; its position counters are copied into this
        exception.

    */
    BadUTF8Sequence(const std::string& description, const struct utf8_decode_state* ctx);

    /// Destructor.
    ~BadUTF8Sequence() throw() { }

private:
    // Builds the diagnostic message passed to the Error base class. Static because it is called
    // from the constructor's initialiser list, before any part of the object has been constructed.
    static std::string format(const std::string& description, const struct utf8_decode_state* ctx);
};
/* options for text editors
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
*/

21
src/libutf8++/pkgconf.in Normal file
View File

@ -0,0 +1,21 @@
# libutf8++/src/lib/libutf8++/pkgconf.in
#
# Metadata file for pkg-config
# ( http://www.freedesktop.org/software/pkgconfig/ )
#
# (c)2006, Laurence Withers, <l@lwithers.me.uk>.
# Released under the GNU GPLv2. See file COPYING or
# http://www.gnu.org/copyleft/gpl.html for details.
#
# The @...@ placeholders are substituted at install time.
# Name, description
Name: libutf8++
Description: C++ wrapper around libutf8 (library for handling UTF-8)
Version: @VERSION@
# Requirements
# The installed header includes <utf8.h>, so users need libutf8's flags too.
Requires: libutf8
# Compilation information
Libs: -L@LIBDIR@ -lutf8++
Cflags: -I@INCLUDEDIR@

17
src/libutf8++/soversion Normal file
View File

@ -0,0 +1,17 @@
# libutf8++/src/libutf8++/soversion
#
# (c)2006, Laurence Withers, <l@lwithers.me.uk>.
# Released under the GNU GPLv2. See file COPYING or
# http://www.gnu.org/copyleft/gpl.html for details.
#
# SOMAJOR and SOMINOR are included in the library's soname. They need to
# be bumped on a binary-incompatible release. They are both single
# integers.
SOMAJOR=0
SOMINOR=0
# SOMICRO is bumped every time there is a binary-compatible release.
SOMICRO=0

126
src/libutf8++/string.cpp Normal file
View File

@ -0,0 +1,126 @@
/* libutf8++/src/lib/string.cpp
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
namespace utf8 {
static enum utf8_decode_error_action decode_replace_callback(const struct utf8_decode_state* ctx,
enum utf8_decode_error error, wchar_t* newch)
{
(void)error;
*newch = *(wchar_t*)(ctx->data);
return utf8_decode_error_action_replace;
}
static enum utf8_decode_error_action decode_error_callback(const struct utf8_decode_state* ctx,
enum utf8_decode_error error, wchar_t* newch)
{
(void)newch;
const char* desc = "unknown";
switch(error) {
case utf8_decode_error_lone_cchar:
desc = "An invalid continuation byte was encountered while expecting a character.";
break;
case utf8_decode_error_not_cchar:
desc = "A multi-byte sequence contained an invalid byte.";
break;
case utf8_decode_error_not_schar:
desc = "An invalid byte was encountered while expecting a character.";
break;
case utf8_decode_error_overlong:
desc = "An overlong encoding of a character was encountered.";
break;
case utf8_decode_error_illegal_cp:
desc = "An illegal code point (a UTF-16 surrogate perhaps?) was encountered.";
break;
}
throw BadUTF8Sequence(desc, ctx);
}
std::wstring decode(const std::string& utf8, bool force, wchar_t replace)
{
wchar_t buffer[128];
struct utf8_decode_state ctx;
memset(&ctx, 0, sizeof(ctx));
ctx.rd = utf8.data();
ctx.rd_remain = utf8.size();
ctx.wr = buffer;
ctx.wr_size = 128;
if(force) {
ctx.error_callback = decode_replace_callback;
ctx.data = &replace;
} else {
ctx.error_callback = decode_error_callback;
}
std::wstring ret;
while(ctx.rd_remain) {
utf8_decoder(&ctx);
ret.append(buffer, ctx.written);
}
return ret;
}
static enum utf8_encode_error_action encode_replace_callback(const struct utf8_encode_state* ctx,
wchar_t* newch)
{
*newch = *(wchar_t*)(ctx->data);
return utf8_encode_error_action_replace;
}
std::string encode(const std::wstring& ustr, bool force, wchar_t replace)
{
char buffer[512];
struct utf8_encode_state ctx;
memset(&ctx, 0, sizeof(ctx));
ctx.rd = ustr.data();
ctx.rd_remain = ustr.size();
ctx.wr = buffer;
ctx.wr_size = 512;
if(force) {
ctx.error_callback = encode_replace_callback;
ctx.data = &replace;
}
std::string ret;
while(ctx.rd_remain) {
if(!utf8_encoder(&ctx)) {
throw BadUnicodeChar(&ctx);
}
ret.append(buffer, ctx.written);
}
return ret;
}
}
/* options for text editors
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
*/

37
src/libutf8++/string.h Normal file
View File

@ -0,0 +1,37 @@
/* libutf8++/src/lib/string.h
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
/*! \brief Decode UTF-8.
\param utf8 The UTF-8 encoded data.
\param force If set to \a true, errors will be inhibited.
\param replace If \a force is \a true, then invalid UTF-8 sequences will be replaced by this
character.
\returns The Unicode wide-character string representation.
\throws BadUTF8Sequence if \a force is \a false and there is an invalid byte sequence in the UTF-8
source data.
This function will decode a UTF-8 source string into a Unicode wide-character string. It has a force
mode whereby any errors will be inhibited and a best-effort attempt will be made.
*/
std::wstring decode(const std::string& utf8, bool force = false, wchar_t replace = 0xFFFD);
/*! \brief Encode UTF-8.
\param ustr The Unicode wide-character string.
\param force If set to \a true, errors will be inhibited (invalid chars will be replaced).
\param replace If \a force is \a true, then invalid Unicode characters will be replaced by this
character.
\returns The UTF-8 transformed representation of \a ustr.
\throws BadUnicodeChar on invalid characters in the source data when \a force is \a false.
This function will encode a Unicode wide-character string into a UTF-8 transformed representation.
It has a force mode whereby any errors will be inhibited and a best-effort attempt will be made.
*/
std::string encode(const std::wstring& ustr, bool force = false, wchar_t replace = 0xFFFD);

1
src/tests/.params Normal file
View File

@ -0,0 +1 @@
c++ tests tests libutf8++

3
src/tests/build.default Normal file
View File

@ -0,0 +1,3 @@
source src/tests/build.tests
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
# vim: expandtab:ts=4:sw=4

43
src/tests/build.tests Normal file
View File

@ -0,0 +1,43 @@
# These are external variables, and shouldn't clash with anything else
#  tests_BUILT
#
# Builds each src/tests/*.cpp into a program of the same name under obj/tests,
# rebuilding only when the program is older than the library, its own source,
# or this script. This file is meant to be sourced (it uses `return').
#
build_target libutf8++ || return 1

if [ -z "${tests_BUILT}" ]
then
    # LIBS, CFLAGS and EXTRAS are deliberately expanded unquoted further down
    # so that they word-split into separate arguments.
    # NOTE(review): libutf8pp is presumably set by the libutf8++ build target.
    LIBS="${libutf8pp} "
    EXTRAS=""

    echo "Building test programs..."
    do_cmd mkdir -p obj/tests || return 1

    for SRC in src/tests/*.cpp
    do
        # An unmatched glob is left as the literal pattern; skip it rather
        # than handing a non-existent file to the compiler.
        [ -e "${SRC}" ] || continue

        TEST="obj/tests/$(basename "${SRC}" .cpp)"
        MODIFIED=0
        for file in ${LIBS} "${SRC}" src/tests/build.tests
        do
            # -nt is also true when ${TEST} does not exist yet
            if [ "${file}" -nt "${TEST}" ]
            then
                MODIFIED=1
                break
            fi
        done

        if [ "${MODIFIED}" -ne 0 ]
        then
            do_cmd ${CXX} -Iobj ${CFLAGS} -o "${TEST}" "${SRC}" ${LIBS} ${EXTRAS} || return 1
            print_success "Built ${TEST}"
        else
            print_success "${TEST} is up to date"
        fi
    done

    print_success "All tests built"
    tests_BUILT=1
fi
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
# vim: expandtab:ts=4:sw=4

85
src/tests/objects.cpp Normal file
View File

@ -0,0 +1,85 @@
/* libutf8++/src/tests/objects.cpp
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
#include "utf8"
#include <iostream>
#include <iomanip>
#include <fcntl.h>
#include <unistd.h>
/* make_random()
 *  Fills `buf' with `ch' wide characters read from /dev/urandom, then clears
 *  every bit above bit 30 so each value lies in the range 0 .. 0x7FFFFFFF.
 *  A count of zero (or less) leaves `buf' untouched.
 *
 *  On failure a diagnostic is printed with perror() and the integer 1 is
 *  thrown; the descriptor is closed on every path.
 */
void make_random(wchar_t* buf, int ch)
{
    int fd = open("/dev/urandom", O_RDONLY);
    if(fd < 0) {
        perror("open(\"/dev/urandom\")");
        throw 1;
    }

    // read() is allowed to return fewer bytes than requested, so keep reading
    // until the whole buffer has been filled.
    char* pos = reinterpret_cast<char*>(buf);
    size_t remaining = (ch > 0) ? static_cast<size_t>(ch) * sizeof(wchar_t) : 0;
    while(remaining) {
        ssize_t got = read(fd, pos, remaining);
        if(got <= 0) {
            // report first: close() could overwrite errno
            perror("read(\"/dev/urandom\")");
            close(fd);
            throw 1;
        }
        pos += got;
        remaining -= static_cast<size_t>(got);
    }
    close(fd);

    for(int i = 0; i < ch; ++i) {
        buf[i] &= 0x7FFFFFFF;
    }
}
int main(int argc, char* argv[])
{
if(argc == 2 && !strcmp(argv[1], "--print-summary")) {
std::cout << "Performs some tests on the Encoder and Decoder objects.\n";
return 0;
}
int ret = 0;
try {
wchar_t wch[1024];
make_random(wch, 1024);
std::wstring ustr;
ustr.assign(wch, 1024);
utf8::Encoder encoder;
utf8::Decoder decoder;
encoder.encode(ustr);
decoder.decode(encoder.encoded);
if(ustr != decoder.decoded) {
std::cerr << "Decoded string does not match original.\n";
for(size_t i = 0, end = std::min(ustr.size(), decoder.decoded.size()); i != end; ++i) {
if(ustr[i] != decoder.decoded[i]) {
std::cerr << std::dec << std::setfill(' ') << std::setw(4) << i
<< std::setfill('0') << std::hex << ": 0x"
<< std::setw(8) << ustr[i] << " != "
<< std::setw(8) << decoder.decoded[i] << "\n";
}
}
std::cerr << "Original size " << std::dec << ustr.size()
<< ", decoded size " << decoder.decoded.size() << std::endl;
return 1;
}
std::cout << "Success.\n";
}
catch(utf8::Error& e) {
std::cerr << e.reason << std::endl;
ret = 1;
}
return ret;
}
/* options for text editors
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
*/

82
src/tests/strings.cpp Normal file
View File

@ -0,0 +1,82 @@
/* libutf8++/src/tests/strings.cpp
*
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
* COPYING for more information / terms of license.
*/
#include "utf8"
#include <iostream>
#include <iomanip>
#include <fcntl.h>
#include <unistd.h>
/* make_random()
 *  Fills `buf' with `ch' wide characters read from /dev/urandom, then clears
 *  every bit above bit 30 so each value lies in the range 0 .. 0x7FFFFFFF.
 *  A count of zero (or less) leaves `buf' untouched.
 *
 *  On failure a diagnostic is printed with perror() and the integer 1 is
 *  thrown; the descriptor is closed on every path.
 */
void make_random(wchar_t* buf, int ch)
{
    int fd = open("/dev/urandom", O_RDONLY);
    if(fd < 0) {
        perror("open(\"/dev/urandom\")");
        throw 1;
    }

    // read() is allowed to return fewer bytes than requested, so keep reading
    // until the whole buffer has been filled.
    char* pos = reinterpret_cast<char*>(buf);
    size_t remaining = (ch > 0) ? static_cast<size_t>(ch) * sizeof(wchar_t) : 0;
    while(remaining) {
        ssize_t got = read(fd, pos, remaining);
        if(got <= 0) {
            // report first: close() could overwrite errno
            perror("read(\"/dev/urandom\")");
            close(fd);
            throw 1;
        }
        pos += got;
        remaining -= static_cast<size_t>(got);
    }
    close(fd);

    for(int i = 0; i < ch; ++i) {
        buf[i] &= 0x7FFFFFFF;
    }
}
int main(int argc, char* argv[])
{
if(argc == 2 && !strcmp(argv[1], "--print-summary")) {
std::cout << "Performs some tests on the string encode/decode routines.\n";
return 0;
}
int ret = 0;
try {
wchar_t wch[1024];
make_random(wch, 1024);
std::wstring ustr1, ustr2;
std::string utf8;
ustr1.assign(wch, 1024);
utf8 = utf8::encode(ustr1);
ustr2 = utf8::decode(utf8);
if(ustr1 != ustr2) {
std::cerr << "Decoded string does not match original.\n";
for(size_t i = 0, end = std::min(ustr1.size(), ustr2.size()); i != end; ++i) {
if(ustr1[i] != ustr2[i]) {
std::cerr << std::dec << std::setfill(' ') << std::setw(4) << i
<< std::setfill('0') << std::hex << ": 0x"
<< std::setw(8) << ustr1[i] << " != "
<< std::setw(8) << ustr2[i] << "\n";
}
}
std::cerr << "Original size " << std::dec << ustr1.size()
<< ", decoded size " << ustr2.size() << std::endl;
return 1;
}
std::cout << "Success.\n";
}
catch(utf8::Error& e) {
std::cerr << e.reason << std::endl;
ret = 1;
}
return ret;
}
/* options for text editors
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
*/

44
src/tests/template Normal file
View File

@ -0,0 +1,44 @@
/* libutf8++/src/tests/???.cpp
*
* (c)2006, Laurence Withers, <l@lwithers.me.uk>.
* Released under the GNU GPLv2. See file COPYING or
* http://www.gnu.org/copyleft/gpl.html for details.
*/
#include "utf8"
#include <iostream>
/* main()
 *  Skeleton for a new test program. Prints a one-line summary when invoked
 *  with the single argument "--print-summary"; otherwise runs the test body
 *  and returns 0 on success or 1 if any exception was caught.
 */
int main(int argc, char* argv[])
{
    const bool wantSummary = (argc == 2) && (strcmp(argv[1], "--print-summary") == 0);
    if(wantSummary) {
        std::cout << "One line summary.\n";
        return 0;
    }

    if(argc == 1) {
        // empty argument list
    }

    int status = 0;

    try {
        // TODO
    }

    catch(std::exception& err) {
        std::cerr << err.what() << std::endl;
        status = 1;
    }

    catch(...) {
        std::cerr << "Unknown exception caught." << std::endl;
        status = 1;
    }

    return status;
}
/* options for text editors
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
vim: expandtab:ts=4:sw=4
*/