Import library from old svn repository.
This commit is contained in:
parent
f9c0e57470
commit
ea1c53e43f
6
README
6
README
|
@ -10,5 +10,9 @@ Really Quick Instructions
|
|||
To build: ./make.sh
|
||||
To install: ./make.sh install
|
||||
(you might want to set PREFIX, by default it's /usr/local)
|
||||
Documentation is automatically built using doxygen.
|
||||
|
||||
@TODO@
|
||||
Project Homepage
|
||||
----------------
|
||||
|
||||
http://www.lwithers.me.uk/projects/libutf8/
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
doxygen docs docs
|
|
@ -0,0 +1,146 @@
|
|||
# libutf8/src/docs/Doxyfile.in
|
||||
#
|
||||
# (c)2006, Laurence Withers, <l@lwithers.me.uk>.
|
||||
# Released under the GNU GPLv2. See file COPYING or
|
||||
# http://www.gnu.org/copyleft/gpl.html for details.
|
||||
#
|
||||
|
||||
PROJECT_NAME = libutf8
|
||||
OUTPUT_DIRECTORY =
|
||||
CREATE_SUBDIRS = NO
|
||||
OUTPUT_LANGUAGE = English
|
||||
USE_WINDOWS_ENCODING = NO
|
||||
BRIEF_MEMBER_DESC = YES
|
||||
REPEAT_BRIEF = YES
|
||||
ABBREVIATE_BRIEF =
|
||||
ALWAYS_DETAILED_SEC = NO
|
||||
INLINE_INHERITED_MEMB = YES
|
||||
FULL_PATH_NAMES = NO
|
||||
STRIP_FROM_PATH =
|
||||
STRIP_FROM_INC_PATH =
|
||||
SHORT_NAMES = NO
|
||||
JAVADOC_AUTOBRIEF = NO
|
||||
MULTILINE_CPP_IS_BRIEF = YES
|
||||
DETAILS_AT_TOP = YES
|
||||
INHERIT_DOCS = YES
|
||||
DISTRIBUTE_GROUP_DOC = NO
|
||||
TAB_SIZE = 4
|
||||
ALIASES =
|
||||
OPTIMIZE_OUTPUT_FOR_C = NO
|
||||
OPTIMIZE_OUTPUT_JAVA = NO
|
||||
SUBGROUPING = YES
|
||||
EXTRACT_ALL = NO
|
||||
EXTRACT_PRIVATE = NO
|
||||
EXTRACT_STATIC = NO
|
||||
EXTRACT_LOCAL_CLASSES = NO
|
||||
EXTRACT_LOCAL_METHODS = NO
|
||||
HIDE_UNDOC_MEMBERS = NO
|
||||
HIDE_UNDOC_CLASSES = NO
|
||||
HIDE_FRIEND_COMPOUNDS = YES
|
||||
HIDE_IN_BODY_DOCS = NO
|
||||
INTERNAL_DOCS = NO
|
||||
CASE_SENSE_NAMES = YES
|
||||
HIDE_SCOPE_NAMES = NO
|
||||
SHOW_INCLUDE_FILES = NO
|
||||
INLINE_INFO = YES
|
||||
SORT_MEMBER_DOCS = YES
|
||||
SORT_BRIEF_DOCS = NO
|
||||
SORT_BY_SCOPE_NAME = NO
|
||||
GENERATE_TODOLIST = YES
|
||||
GENERATE_TESTLIST = YES
|
||||
GENERATE_BUGLIST = YES
|
||||
GENERATE_DEPRECATEDLIST= YES
|
||||
ENABLED_SECTIONS =
|
||||
MAX_INITIALIZER_LINES = 30
|
||||
SHOW_USED_FILES = NO
|
||||
SHOW_DIRECTORIES = NO
|
||||
FILE_VERSION_FILTER =
|
||||
QUIET = YES
|
||||
WARNINGS = YES
|
||||
WARN_IF_UNDOCUMENTED = YES
|
||||
WARN_IF_DOC_ERROR = YES
|
||||
WARN_NO_PARAMDOC = YES
|
||||
WARN_FORMAT = "$file:$line: $text"
|
||||
WARN_LOGFILE =
|
||||
FILE_PATTERNS =
|
||||
RECURSIVE = NO
|
||||
EXCLUDE =
|
||||
EXCLUDE_SYMLINKS = NO
|
||||
EXCLUDE_PATTERNS =
|
||||
EXAMPLE_PATH =
|
||||
EXAMPLE_PATTERNS =
|
||||
EXAMPLE_RECURSIVE = NO
|
||||
IMAGE_PATH = src/docs
|
||||
INPUT_FILTER =
|
||||
FILTER_PATTERNS =
|
||||
FILTER_SOURCE_FILES = NO
|
||||
SOURCE_BROWSER = NO
|
||||
INLINE_SOURCES = NO
|
||||
STRIP_CODE_COMMENTS = YES
|
||||
REFERENCED_BY_RELATION = YES
|
||||
REFERENCES_RELATION = YES
|
||||
VERBATIM_HEADERS = NO
|
||||
ALPHABETICAL_INDEX = YES
|
||||
COLS_IN_ALPHA_INDEX = 5
|
||||
IGNORE_PREFIX =
|
||||
GENERATE_HTML = YES
|
||||
HTML_OUTPUT = html
|
||||
HTML_FILE_EXTENSION = .html
|
||||
HTML_HEADER =
|
||||
HTML_FOOTER =
|
||||
HTML_STYLESHEET =
|
||||
HTML_ALIGN_MEMBERS = YES
|
||||
GENERATE_HTMLHELP = NO
|
||||
CHM_FILE =
|
||||
HHC_LOCATION =
|
||||
GENERATE_CHI = NO
|
||||
BINARY_TOC = NO
|
||||
TOC_EXPAND = NO
|
||||
DISABLE_INDEX = NO
|
||||
ENUM_VALUES_PER_LINE = 4
|
||||
GENERATE_TREEVIEW = NO
|
||||
TREEVIEW_WIDTH = 250
|
||||
GENERATE_LATEX = NO
|
||||
GENERATE_RTF = NO
|
||||
GENERATE_MAN = NO
|
||||
GENERATE_XML = NO
|
||||
GENERATE_AUTOGEN_DEF = NO
|
||||
GENERATE_PERLMOD = NO
|
||||
ENABLE_PREPROCESSING = YES
|
||||
MACRO_EXPANSION = NO
|
||||
EXPAND_ONLY_PREDEF = NO
|
||||
SEARCH_INCLUDES = YES
|
||||
INCLUDE_PATH =
|
||||
INCLUDE_FILE_PATTERNS =
|
||||
PREDEFINED = DOXYGEN
|
||||
EXPAND_AS_DEFINED =
|
||||
SKIP_FUNCTION_MACROS = YES
|
||||
TAGFILES =
|
||||
GENERATE_TAGFILE =
|
||||
ALLEXTERNALS = NO
|
||||
EXTERNAL_GROUPS = YES
|
||||
PERL_PATH = /usr/bin/perl
|
||||
CLASS_DIAGRAMS = YES
|
||||
HIDE_UNDOC_RELATIONS = YES
|
||||
HAVE_DOT = YES
|
||||
CLASS_GRAPH = YES
|
||||
COLLABORATION_GRAPH = YES
|
||||
GROUP_GRAPHS = NO
|
||||
UML_LOOK = NO
|
||||
TEMPLATE_RELATIONS = NO
|
||||
INCLUDE_GRAPH = NO
|
||||
INCLUDED_BY_GRAPH = NO
|
||||
CALL_GRAPH = NO
|
||||
GRAPHICAL_HIERARCHY = YES
|
||||
DIRECTORY_GRAPH = NO
|
||||
DOT_IMAGE_FORMAT = png
|
||||
DOT_PATH =
|
||||
DOTFILE_DIRS =
|
||||
MAX_DOT_GRAPH_WIDTH = 1024
|
||||
MAX_DOT_GRAPH_HEIGHT = 1024
|
||||
MAX_DOT_GRAPH_DEPTH = 0
|
||||
DOT_TRANSPARENT = YES
|
||||
DOT_MULTI_TARGETS = YES
|
||||
GENERATE_LEGEND = YES
|
||||
DOT_CLEANUP = YES
|
||||
SEARCHENGINE = NO
|
|
@ -0,0 +1,19 @@
|
|||
/* libutf8/src/docs/MainPage.dox
|
||||
*
|
||||
* (c)2006, Laurence Withers, <l@lwithers.me.uk>.
|
||||
* Released under the GNU GPLv2. See file COPYING or
|
||||
* http://www.gnu.org/copyleft/gpl.html for details.
|
||||
*/
|
||||
|
||||
/*! \mainpage
|
||||
|
||||
\c libutf8 provides a C API for encoding and decoding UTF-8. It uses the C type \c wchar_t as its
|
||||
internal character representation. \c libutf8 is a "safe" decoder — it will not accept
|
||||
overlong byte sequences.
|
||||
|
||||
*/
|
||||
|
||||
/* options for text editors
|
||||
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
vim: expandtab:ts=4:sw=4
|
||||
*/
|
|
@ -0,0 +1 @@
|
|||
source src/docs/build.docs
|
|
@ -0,0 +1,43 @@
|
|||
# These are external variables, and shouldn't clash with anything else
|
||||
# docs_BUILT
|
||||
#
|
||||
|
||||
# Add the hand-written .dox pages to the space-separated list of Doxygen
# inputs. NOTE(review): build_target presumably sources the "monolithic"
# target, which appends the generated header to the same list -- confirm
# against make.sh.
MONOLITHIC_DOC="${MONOLITHIC_DOC} $(echo src/docs/*.dox)"
build_target monolithic

# docs_BUILT is set at the end of this block so the body runs only once.
if [ -z "${docs_BUILT}" ]
then
    echo "Building documentation with Doxygen..."

    DOXYFILE=obj/Doxyfile.docs

    # The Doxyfile is the template plus the input list and version number.
    # It is only generated when missing.
    if [ ! -e "${DOXYFILE}" ]
    then
        do_cmd cp src/docs/Doxyfile.in "${DOXYFILE}" || return 1
        echo "INPUT = ${MONOLITHIC_DOC}" >> "${DOXYFILE}"
        echo "PROJECT_NUMBER = ${VERSION}" >> "${DOXYFILE}"
    fi

    # Rebuild only when an input is newer than the generated index page.
    # MONOLITHIC_DOC is left unquoted on purpose: it is a space-separated list.
    MODIFIED=0
    for file in ${MONOLITHIC_DOC}
    do
        if [ "${file}" -nt html/index.html ]
        then
            MODIFIED=1
            break
        fi
    done

    if [ "${MODIFIED}" -ne 0 ]
    then
        do_cmd doxygen "${DOXYFILE}" || return 1
        print_success "Documentation built"
    else
        print_success "Documentation is up to date"
    fi

    docs_BUILT=1
fi
|
||||
|
||||
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
# vim: expandtab:ts=4:sw=4
|
|
@ -0,0 +1 @@
|
|||
source src/docs/build.install-docs
|
|
@ -0,0 +1,21 @@
|
|||
# Installation needs the documentation to have been built first.
build_target docs

# create documentation directories
echo "Installing documentation into ${DOCSDIR}"
build_dir_tree "${DOCSDIR}/html" || return 1

# copy across the Doxygen-generated documentation
# (paths are quoted so that a DOCSDIR containing spaces still works, matching
# the quoting already used for build_dir_tree above)
for file in html/*
do
    install_file "${file}" "${DOCSDIR}/html" 0644 || return 1
done

# copy across the generic files
for file in COPYING README
do
    install_file "${file}" "${DOCSDIR}" 0644 || return 1
done

print_success "Documentation installed"
|
||||
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
# vim: expandtab:ts=4:sw=4
|
|
@ -0,0 +1 @@
|
|||
c lib libutf8 utf8.h
|
|
@ -0,0 +1,11 @@
|
|||
/* libutf8/src/lib/BottomHeader.h
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
|
@ -0,0 +1,14 @@
|
|||
/* libutf8/src/lib/ForwardDeclare.h
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
// This file simply contains forward declarations of all libutf8
|
||||
// classes, to facilitate header ordering, etc.
|
||||
|
||||
// encode_state.h
|
||||
struct utf8_encode_state;
|
||||
|
||||
// decode_state.h
|
||||
struct utf8_decode_state;
|
|
@ -0,0 +1,16 @@
|
|||
/* libutf8/src/lib/TopHeader.h
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
#ifndef HEADER_LIBUTF8
|
||||
#define HEADER_LIBUTF8
|
||||
|
||||
// standard includes, or includes needed for type declarations
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
|
@ -0,0 +1,13 @@
|
|||
/* libutf8/src/lib/TopSource.cpp
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
#include "utf8.h"
|
||||
|
||||
// Below are all the includes used throughout the library.
|
||||
|
||||
#include <errno.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
|
@ -0,0 +1 @@
|
|||
source src/libutf8/build.lib
|
|
@ -0,0 +1 @@
|
|||
source src/libutf8/build.install-lib
|
|
@ -0,0 +1,36 @@
|
|||
# Installation needs the shared library to have been built first.
build_target libutf8

# make paths (this is for Gentoo in particular)
build_dir_tree "${LIBDIR}" || return 1
build_dir_tree "${PKGCONFDIR}" || return 1
build_dir_tree "${INCLUDEDIR}" || return 1

# install library
echo "Installing libraries into '${LIBDIR}'"
install_file "${libutf8}" "${LIBDIR}" 0755 || return 1

# Symlink chain: libutf8.so -> .so.MAJOR -> .so.MAJOR.MINOR -> full version.
BASE="${libutf8_BASE}.so"
MAJOR="${BASE}.${SOMAJOR}"
MINOR="${MAJOR}.${SOMINOR}"
MICRO="${MINOR}.${SOMICRO}"
install_symlink "${MINOR}" "${MICRO}" "${LIBDIR}"
install_symlink "${MAJOR}" "${MINOR}" "${LIBDIR}"
install_symlink "${BASE}" "${MAJOR}" "${LIBDIR}"

# install header
echo "Installing header file '${libutf8_HEADER}' into ${INCLUDEDIR}"
install_header "${libutf8_HEADER}" "${INCLUDEDIR}" 0644 || return 1

# install pkgconfig file, generated from the template by substituting the
# version and the final install paths
echo "Installing package config file into ${PKGCONFDIR}"
PKGCONFFILE="${PKGCONFDIR}/libutf8.pc"
# Each step is checked so that a failure is not reported as "Done".
# NOTE(review): assumes do_cmd_redir, like do_cmd, returns the command's exit
# status -- confirm against make.sh.
do_cmd rm -f "${PKGCONFFILE}" || return 1
do_cmd_redir "${PKGCONFFILE}" sed \
    -e "s,@VERSION@,${VERSION}," \
    -e "s,@LIBDIR@,${FINALLIBDIR}," \
    -e "s,@INCLUDEDIR@,${FINALINCLUDEDIR}," \
    src/libutf8/pkgconf.in || return 1
do_cmd chmod 0644 "${PKGCONFFILE}" || return 1
print_success "Done"
|
||||
|
||||
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
# vim: expandtab:ts=4:sw=4
|
|
@ -0,0 +1,51 @@
|
|||
# These are external variables, and shouldn't clash with anything else
|
||||
# libutf8
|
||||
# libutf8_BUILT
|
||||
# libutf8_HEADER
|
||||
# libutf8_BASE
|
||||
|
||||
# libutf8_BUILT is set at the end of this block so the body runs only once.
if [ -z "${libutf8_BUILT}" ]
then
    libutf8_BASE=libutf8
    # soversion supplies SOMAJOR, SOMINOR and SOMICRO (used below)
    source src/libutf8/soversion

    libutf8="obj/${libutf8_BASE}.so.${SOMAJOR}.${SOMINOR}.${SOMICRO}"
    SO_EXTRA="-lc"

    echo "Building library ${libutf8}..."

    # generates the monolithic source/header and sets SRC, HDR and
    # MONOLITHIC_TESTS
    do_cmd source src/libutf8/build.monolithic || return 1

    # Rebuild when any build script or generated file is newer than the library.
    # The list variables are left unquoted on purpose (space-separated lists).
    MODIFIED=0
    for test in ${MONOLITHIC_TESTS} ${HDR} ${SRC}
    do
        if [ "${test}" -nt "${libutf8}" ]
        then
            MODIFIED=1
            break
        fi
    done

    if [ "${MODIFIED}" -ne 0 ]
    then
        echo " Compiling"

        SONAME="${libutf8_BASE}.so.${SOMAJOR}.${SOMINOR}"
        # CC, CFLAGS, SRC and SO_EXTRA may each hold several words: keep unquoted
        do_cmd ${CC} ${CFLAGS} -shared -fpic -o "${libutf8}" \
            -Wl,-soname,${SONAME} \
            ${SRC} ${SO_EXTRA} || return 1

        # make tests work: the soname must resolve inside obj/
        do_cmd ln -sf "$(basename "${libutf8}")" "obj/${SONAME}" || return 1

        print_success "Library built"
    else
        print_success "Library up to date"
    fi

    libutf8_BUILT=1
    libutf8_HEADER=${HDR}

fi
|
||||
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
# vim: expandtab:ts=4:sw=4
|
|
@ -0,0 +1,21 @@
|
|||
# These are external variables, and shouldn't clash with anything else
|
||||
# libutf8_MONOLITHIC
|
||||
|
||||
# Generated monolithic files: every fragment below is concatenated into these.
SRC="obj/libutf8.c"
HDR="obj/utf8.h"

# Build scripts whose modification should trigger a rebuild of the library.
MONOLITHIC_TESTS="src/libutf8/build.lib src/libutf8/build.monolithic"

# libutf8_MONOLITHIC is set at the end of this block so the body runs only once.
if [ -z "${libutf8_MONOLITHIC}" ]
then
    # Header fragments, in concatenation order.
    MONOLITHIC_SOURCE="$(echo \
        src/libutf8/TopHeader.h \
        src/libutf8/ForwardDeclare.h \
        src/libutf8/ctype.h \
        src/libutf8/decode.h \
        src/libutf8/decode_state.h \
        src/libutf8/encode.h \
        src/libutf8/encode_state.h \
        src/libutf8/BottomHeader.h)"
    make_monolithic ${HDR} C || return 1

    # Source fragments, in concatenation order.
    MONOLITHIC_SOURCE="$(echo \
        src/libutf8/TopSource.c \
        src/libutf8/ctype.c \
        src/libutf8/decode.c \
        src/libutf8/decode_state.c \
        src/libutf8/encode.c \
        src/libutf8/encode_state.c)"
    make_monolithic ${SRC} C || return 1

    libutf8_MONOLITHIC=1
    # the generated header is also an input for the documentation build
    MONOLITHIC_DOC="${MONOLITHIC_DOC} ${HDR}"
fi
|
||||
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
# vim: expandtab:ts=4:sw=4
|
|
@ -0,0 +1,55 @@
|
|||
/* libutf8/src/lib/ctype.c
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
/* Returns true when ch has no bits set above the low seven, i.e. it lies in
 * the ASCII range 0x00..0x7F.
 */
bool utf8_isascii(wchar_t ch)
{
    return (ch & ~0x7F) == 0;
}
|
||||
|
||||
|
||||
|
||||
/* From PropList-4.1.0.txt (http://www.unicode.org/Public/UNIDATA/)
|
||||
|
||||
0009..000D ; White_Space # Cc [5] <control-0009>..<control-000D>
|
||||
0020 ; White_Space # Zs SPACE
|
||||
0085 ; White_Space # Cc <control-0085>
|
||||
00A0 ; White_Space # Zs NO-BREAK SPACE
|
||||
1680 ; White_Space # Zs OGHAM SPACE MARK
|
||||
180E ; White_Space # Zs MONGOLIAN VOWEL SEPARATOR
|
||||
2000..200A ; White_Space # Zs [11] EN QUAD..HAIR SPACE
|
||||
2028 ; White_Space # Zl LINE SEPARATOR
|
||||
2029 ; White_Space # Zp PARAGRAPH SEPARATOR
|
||||
202F ; White_Space # Zs NARROW NO-BREAK SPACE
|
||||
205F ; White_Space # Zs MEDIUM MATHEMATICAL SPACE
|
||||
3000 ; White_Space # Zs IDEOGRAPHIC SPACE
|
||||
*/
|
||||
|
||||
/* Returns true when ch is one of the code points carrying the Unicode
 * White_Space property (per the PropList-4.1.0 extract quoted in this file).
 */
bool utf8_isspace(wchar_t ch)
{
    // the two contiguous ranges
    if(ch >= 0x0009 && ch <= 0x000D) return true;
    if(ch >= 0x2000 && ch <= 0x200A) return true;

    // the isolated code points
    switch(ch) {
    case 0x0020:
    case 0x0085:
    case 0x00A0:
    case 0x1680:
    case 0x180E:
    case 0x2028:
    case 0x2029:
    case 0x202F:
    case 0x205F:
    case 0x3000:
        return true;

    default:
        return false;
    }
}
|
||||
|
||||
|
||||
|
||||
/* Returns true when ch is a code point this library accepts: it has no bits
 * set outside 0x7FFFFFFF, it is not a UTF-16 surrogate (0xD800..0xDFFF), and
 * it is neither 0xFFFE nor 0xFFFF.
 */
bool utf8_isvalid(wchar_t ch)
{
    // anything with bits outside the 31-bit range is rejected
    if(ch & (~((wchar_t)0x7FFFFFFF))) return false;

    // UTF-16 surrogate code points
    if(ch >= 0xD800 && ch <= 0xDFFF) return false;

    // the two explicitly excluded code points
    if(ch == 0xFFFE || ch == 0xFFFF) return false;

    return true;
}
|
||||
|
||||
/* options for text editors
|
||||
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
*/
|
|
@ -0,0 +1,46 @@
|
|||
/* libutf8/src/lib/ctype.h
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
/*! \defgroup ctype Character classification
|
||||
|
||||
This module contains functions for character classification. These are basically an extension of the
|
||||
\c is* functions defined in \c <ctype.h>.
|
||||
|
||||
\todo There are many char classification functions that haven't been implemented yet. These won't be
|
||||
implemented until they can be done in a proper, Unicode-safe fashion.
|
||||
|
||||
*/
|
||||
/*!@{*/
|
||||
|
||||
|
||||
|
||||
/// Returns \c true if \a ch can be represented in ASCII.
|
||||
bool utf8_isascii(wchar_t ch);
|
||||
|
||||
/// Returns \c true if \a ch is whitespace.
|
||||
bool utf8_isspace(wchar_t ch);
|
||||
|
||||
/*! \brief Returns \c true if \a ch is a valid UCS-4 character.
|
||||
|
||||
\param ch The character to classify.
|
||||
\retval true If \a ch is a valid UCS-4 character.
|
||||
\retval false If \a ch is not a valid UCS-4 character.
|
||||
|
||||
This function will examine a \c wchar_t value and determine whether or not it is a valid UCS-4
|
||||
character. Valid characters lie in the range 0–0x7FFFFFFF but exclude:
|
||||
\li the UTF-16 surrogate code points (U+D800–U+DFFF, inclusive)
|
||||
\li the invalid code points U+FFFE and U+FFFF
|
||||
|
||||
*/
|
||||
bool utf8_isvalid(wchar_t ch);
|
||||
|
||||
|
||||
|
||||
/*!@}*/
|
||||
|
||||
/* options for text editors
|
||||
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
*/
|
|
@ -0,0 +1,234 @@
|
|||
/* libutf8/src/lib/decode.c
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
/* Decode one character from src without a caller-supplied buffer size.
 *
 * Delegates to utf8_decode_char2() with a size of 6 bytes, which is the
 * longest sequence that function recognises. The caller must therefore be
 * sure that decoding cannot run past the end of its buffer.
 */
wchar_t utf8_decode_char(const char* src, size_t* used)
{
    const size_t longest_sequence = 6;

    return utf8_decode_char2(src, longest_sequence, used);
}
|
||||
|
||||
|
||||
|
||||
/* Decode a single UTF-8 sequence from at most size bytes starting at src.
 *
 * src   Start of the encoded data; must not be null.
 * size  Number of readable bytes at src; must not be 0.
 * used  If not null, receives the length in bytes of the sequence announced
 *       by the lead byte (1 for ASCII). It is set before the continuation
 *       bytes are validated, so it is also written when decoding fails after
 *       the lead byte has been classified.
 *
 * Returns the decoded character. On failure returns 0 and sets errno:
 *   EINVAL  src is null or size is 0
 *   EILSEQ  invalid lead byte, missing or invalid continuation byte, or an
 *           overlong encoding
 *
 * NOTE(review): the Doxygen comments in decode.h document the error return
 * as (wchar_t)(-1), but this function has always returned 0. The value is
 * kept so existing callers are unaffected; reconcile code and docs.
 */
wchar_t utf8_decode_char2(const char* src, size_t size, size_t* used)
{
    uint8_t ch;
    wchar_t ret, min;
    int remain;

    if(!src || !size) {
        errno = EINVAL;
        return 0;
    }
    if(used) *used = 1;
    ch = *src++;

    // single-byte (ASCII) character: the byte is the code point
    if(!(ch & 0x80)) return ch;

    // Classify the lead byte. `remain` is the number of continuation bytes
    // that must follow, `min` is the smallest code point that legitimately
    // needs a sequence of this length, and `ret` starts with the payload bits
    // of the lead byte.
    if((ch & 0xE0) == 0xC0) {
        min = 0x80;
        remain = 1;
        ret = ch & 0x1F;
    } else if((ch & 0xF0) == 0xE0) {
        min = 0x800;
        remain = 2;
        ret = ch & 0x0F;
    } else if((ch & 0xF8) == 0xF0) {
        min = 0x10000;
        remain = 3;
        ret = ch & 0x07;
    } else if((ch & 0xFC) == 0xF8) {
        min = 0x200000;
        remain = 4;
        ret = ch & 0x03;
    } else if((ch & 0xFE) == 0xFC) {
        min = 0x4000000;
        remain = 5;
        ret = ch & 0x01;
    } else {
        // a continuation byte, or 0xFE / 0xFF, cannot start a sequence
        errno = EILSEQ;
        return 0;
    }
    if(used) *used = (size_t)remain + 1;

    while(remain--) {
        // `size` still counts the byte consumed last, hence pre-decrement
        if(!--size) {
            errno = EILSEQ;
            return 0;
        }
        ch = *src++;
        if((ch & 0xC0) != 0x80) {
            errno = EILSEQ;
            return 0;
        }
        ret <<= 6;
        ret |= ch & 0x3F;
    }

    // Reject overlong encodings. The comparison must use the decoded value;
    // comparing the last continuation byte (as the code used to) accepted
    // overlong 2-byte forms and rejected every longer sequence.
    if(ret < min) {
        errno = EILSEQ;
        return 0;
    }

    return ret;
}
|
||||
|
||||
|
||||
|
||||
/* Decode one character from src without a caller-supplied buffer size,
 * substituting ilseq for illegal sequences.
 *
 * Delegates to utf8_decode_char2_force() with a size of 6 bytes, so the
 * caller must be sure that decoding cannot run past the end of its buffer.
 */
wchar_t utf8_decode_char_force(const char* src, size_t* used, wchar_t ilseq)
{
    const size_t assumed_size = 6;

    return utf8_decode_char2_force(src, assumed_size, used, ilseq);
}
|
||||
|
||||
|
||||
|
||||
/* Decode a single UTF-8 sequence from at most size bytes starting at src,
 * returning ilseq instead of failing when the sequence is illegal.
 *
 * src    Start of the encoded data; must not be null.
 * size   Number of readable bytes at src; must not be 0.
 * used   If not null, receives the number of bytes consumed.
 * ilseq  Value returned for an illegal sequence.
 *
 * Returns the decoded character, or ilseq for an invalid lead byte, a
 * truncated sequence or an overlong encoding. In the ilseq case the bytes
 * consumed are the offending bytes plus any continuation bytes that follow
 * them, so the next call starts at a byte that can begin a character. No
 * byte at or beyond src + size is ever read.
 *
 * If src is null or size is 0, returns 0 and sets errno to EINVAL.
 *
 * NOTE(review): the Doxygen comments in decode.h document the EINVAL return
 * as (wchar_t)(-1), but this function has always returned 0. The value is
 * kept so existing callers are unaffected; reconcile code and docs.
 */
wchar_t utf8_decode_char2_force(const char* src, size_t size, size_t* used, wchar_t ilseq)
{
    const uint8_t* in = (const uint8_t*)src;
    size_t pos;
    uint8_t ch;
    wchar_t ret = 0, min = 0;
    int remain;

    if(!src || !size) {
        errno = EINVAL;
        return 0;
    }

    ch = in[0];
    pos = 1;

    // single-byte (ASCII) character: the byte is the code point
    if(!(ch & 0x80)) {
        if(used) *used = 1;
        return ch;
    }

    // Classify the lead byte: `remain` continuation bytes must follow and the
    // result must be at least `min`, otherwise the encoding is overlong.
    if((ch & 0xE0) == 0xC0) {
        min = 0x80;
        remain = 1;
        ret = ch & 0x1F;
    } else if((ch & 0xF0) == 0xE0) {
        min = 0x800;
        remain = 2;
        ret = ch & 0x0F;
    } else if((ch & 0xF8) == 0xF0) {
        min = 0x10000;
        remain = 3;
        ret = ch & 0x07;
    } else if((ch & 0xFC) == 0xF8) {
        min = 0x200000;
        remain = 4;
        ret = ch & 0x03;
    } else if((ch & 0xFE) == 0xFC) {
        min = 0x4000000;
        remain = 5;
        ret = ch & 0x01;
    } else {
        // a continuation byte, or 0xFE / 0xFF, cannot start a sequence
        remain = -1;
    }

    if(remain > 0) {
        // Gather continuation bytes. A byte that is not a continuation byte
        // is left unconsumed: it may be the start of the next character.
        while(remain && pos < size && (in[pos] & 0xC0) == 0x80) {
            ret <<= 6;
            ret |= in[pos] & 0x3F;
            ++pos;
            --remain;
        }

        // complete and not overlong (compare the decoded value, not a byte)
        if(!remain && ret >= min) {
            if(used) *used = pos;
            return ret;
        }
    }

    // Illegal sequence: skip trailing continuation bytes so that the caller
    // resumes on a byte that can start a character. The bounds check comes
    // first so that nothing past the buffer is read.
    while(pos < size && (in[pos] & 0xC0) == 0x80) ++pos;

    if(used) *used = pos;
    return ilseq;
}
|
||||
|
||||
|
||||
|
||||
wchar_t* utf8_decode(wchar_t* dest, size_t size, const char* src)
|
||||
{
|
||||
struct utf8_decode_state ctx;
|
||||
memset(&ctx, 0, sizeof(ctx));
|
||||
ctx.rd = src;
|
||||
ctx.rd_remain = -1;
|
||||
ctx.wr = dest;
|
||||
ctx.wr_size = size;
|
||||
|
||||
if(!utf8_decoder(&ctx)) return 0;
|
||||
if(*ctx.rd) {
|
||||
errno = ENOMEM;
|
||||
return 0;
|
||||
}
|
||||
|
||||
return dest;
|
||||
}
|
||||
|
||||
|
||||
|
||||
wchar_t* utf8_decode2(wchar_t* dest, size_t size, size_t* written, const char* src, size_t amt)
|
||||
{
|
||||
struct utf8_decode_state ctx;
|
||||
memset(&ctx, 0, sizeof(ctx));
|
||||
ctx.rd = src;
|
||||
ctx.rd_remain = amt;
|
||||
ctx.wr = dest;
|
||||
ctx.wr_size = size;
|
||||
|
||||
if(!utf8_decoder(&ctx)) return 0;
|
||||
if(ctx.rd_remain || !ctx.complete) {
|
||||
errno = ENOMEM;
|
||||
return 0;
|
||||
}
|
||||
if(written) *written = ctx.written;
|
||||
|
||||
return dest;
|
||||
}
|
||||
|
||||
|
||||
|
||||
wchar_t* utf8_decode_force(wchar_t* dest, size_t size, const char* src)
|
||||
{
|
||||
struct utf8_decode_state ctx;
|
||||
memset(&ctx, 0, sizeof(ctx));
|
||||
ctx.rd = src;
|
||||
ctx.rd_remain = -1;
|
||||
ctx.wr = dest;
|
||||
ctx.wr_size = size;
|
||||
ctx.error_callback = utf8_decode_error_callback_replace;
|
||||
|
||||
if(!utf8_decoder(&ctx)) return 0;
|
||||
if(*ctx.rd) {
|
||||
errno = ENOMEM;
|
||||
return 0;
|
||||
}
|
||||
|
||||
return dest;
|
||||
}
|
||||
|
||||
|
||||
|
||||
wchar_t* utf8_decode_force2(wchar_t* dest, size_t size, size_t* written, const char* src, size_t amt)
|
||||
{
|
||||
struct utf8_decode_state ctx;
|
||||
memset(&ctx, 0, sizeof(ctx));
|
||||
ctx.rd = src;
|
||||
ctx.rd_remain = amt;
|
||||
ctx.wr = dest;
|
||||
ctx.wr_size = size;
|
||||
ctx.error_callback = utf8_decode_error_callback_replace;
|
||||
|
||||
if(!utf8_decoder(&ctx)) return 0;
|
||||
if(written) *written = ctx.written;
|
||||
return dest;
|
||||
}
|
|
@ -0,0 +1,187 @@
|
|||
/* libutf8/src/lib/decode.h
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
/*! \defgroup decode UTF-8 decoding routines.
|
||||
|
||||
These routines decode UTF-8 data into C's wide character type \c wchar_t. Errors are reported
|
||||
through \c errno, with the following errors being of particular interest:
|
||||
|
||||
\li \c EINVAL - invalid argument to function
|
||||
\li \c EILSEQ - illegal encoding (i.e. not UTF-8 or encoding error)
|
||||
\li \c ENOMEM - not enough space in destination buffer
|
||||
|
||||
As a special case, functions which return a character may return the \c wchar_t representation of
|
||||
-1 to signify an error. This wording is used to take into account the fact that the \c wchar_t type
|
||||
could be unsigned.
|
||||
|
||||
*/
|
||||
/*!@{*/
|
||||
|
||||
|
||||
|
||||
/*! \brief Decode a character.
|
||||
|
||||
\param src Pointer to start of source data.
|
||||
\param used If not null, set to the number of bytes used.
|
||||
\retval (wchar_t)(-1) on error (see \c errno).
|
||||
\returns Decoded character.
|
||||
|
||||
Decodes a single character, returning the \c wchar_t representation of -1 on error and setting
|
||||
\c errno appropriately. If \a used is not NULL, it is set to the number of bytes used. Passing
|
||||
a null pointer for \a src will result in \c EINVAL. If the UTF-8 byte sequence is corrupt, \c errno
|
||||
will be set to \c EILSEQ.
|
||||
|
||||
\warning Only use this function if you are sure it cannot read past the end of your buffer. See
|
||||
utf8_decode_char2() for a safe version.
|
||||
|
||||
*/
|
||||
wchar_t utf8_decode_char(const char* src, size_t* used);
|
||||
|
||||
|
||||
|
||||
/*! \brief Decode a character, discarding illegal sequences.
|
||||
|
||||
\param src Pointer to start of source data.
|
||||
\param used If not null, set to the number of bytes used.
|
||||
\param ilseq This value is returned if the UTF-8 byte sequence is invalid. Recommended is the
|
||||
Unicode replacement character, \c 0xFFFD.
|
||||
\retval (wchar_t)(-1) on error (see \c errno).
|
||||
\retval ilseq If an illegal sequence is encountered.
|
||||
\returns Decoded character.
|
||||
\post \a *used will be set to the number of bytes consumed.
|
||||
|
||||
Decodes a single character, returning the \c wchar_t representation of -1 on error and setting
|
||||
\c errno appropriately. If \a used is not NULL, it is set to the number of bytes used. Passing
|
||||
a null pointer for \a src will result in \c EINVAL. If the UTF-8 byte sequence is corrupt, \a ilseq
|
||||
will be returned and the buffer advanced to the next valid character. This means the function can
|
||||
only fail if you pass it an invalid \a src pointer.
|
||||
|
||||
\warning Only use this function if you are sure it cannot read past the end of your buffer. See
|
||||
utf8_decode_char2_force() for a safe version.
|
||||
|
||||
*/
|
||||
wchar_t utf8_decode_char_force(const char* src, size_t* used, wchar_t ilseq);
|
||||
|
||||
|
||||
|
||||
/*! \brief Decode a character, given source buffer size.
|
||||
|
||||
\param src Pointer to start of source data.
|
||||
\param size Size of source data in bytes.
|
||||
\param used If not null, set to the number of bytes used.
|
||||
\retval (wchar_t)(-1) on error (see \c errno).
|
||||
\returns Decoded character.
|
||||
|
||||
Decodes a single character, returning the \c wchar_t representation of -1 on error and setting
|
||||
\c errno appropriately. If \a used is not NULL, it is set to the number of bytes used.
|
||||
|
||||
*/
|
||||
wchar_t utf8_decode_char2(const char* src, size_t size, size_t* used);
|
||||
|
||||
|
||||
|
||||
/*! \brief Decode a character, discarding illegal sequences and given source buffer size.
|
||||
|
||||
\param src Pointer to start of source data.
|
||||
\param size Size of source data in bytes.
|
||||
\param used If not null, set to the number of bytes used.
|
||||
\param ilseq This value is returned if the UTF-8 byte sequence is invalid. Recommended is the
|
||||
Unicode replacement character, \c 0xFFFD.
|
||||
\retval (wchar_t)(-1) on error (see \c errno).
|
||||
\retval ilseq If an illegal sequence is encountered.
|
||||
\returns Decoded character.
|
||||
\post \a *used will be set to the number of bytes consumed.
|
||||
|
||||
Decodes a single character, returning the \c wchar_t representation of -1 on error and setting
|
||||
\c errno appropriately. If \a used is not NULL, it is set to the number of bytes used. Passing
|
||||
a null pointer for \a src will result in \c EINVAL. If the UTF-8 byte sequence is corrupt, \a ilseq
|
||||
will be returned and the buffer advanced to the next valid character. This means the function can
|
||||
only fail if you pass it an invalid \a src pointer, or a \a size of 0.
|
||||
|
||||
*/
|
||||
wchar_t utf8_decode_char2_force(const char* src, size_t size, size_t* used, wchar_t ilseq);
|
||||
|
||||
|
||||
|
||||
/*! \brief Decode a null-terminated string.
|
||||
|
||||
\param dest The output destination.
|
||||
\param size The number of characters that can be stored in \a dest.
|
||||
\param src Pointer to the null-terminated source data.
|
||||
\returns Pointer to the output destination.
|
||||
\retval 0 on error (see \c errno).
|
||||
|
||||
This function will attempt to decode a null-terminated UTF-8 string. It returns 0 on error and sets
|
||||
\c errno appropriately.
|
||||
|
||||
*/
|
||||
wchar_t* utf8_decode(wchar_t* dest, size_t size, const char* src);
|
||||
|
||||
|
||||
|
||||
/*! \brief Decode a fixed-size string.
|
||||
|
||||
\param dest The output destination.
|
||||
\param size The number of characters that can be stored in \a dest.
|
||||
\param written If not null, set to the number of characters written (excluding NUL).
|
||||
\param src Pointer to the source data (need not be null-terminated).
|
||||
\param amt Number of bytes to decode.
|
||||
\returns Pointer to the output destination.
|
||||
\retval 0 on error (see \c errno).
|
||||
|
||||
This function will attempt to decode a fixed-size UTF-8 string. It returns 0 on error and sets
|
||||
\c errno appropriately. It will happily transcode ASCII NUL characters. If \a written is not null,
|
||||
it is set to the number of characters written excluding the terminating NUL. This function always
|
||||
produces null-terminated strings.
|
||||
|
||||
*/
|
||||
wchar_t* utf8_decode2(wchar_t* dest, size_t size, size_t* written, const char* src, size_t amt);
|
||||
|
||||
|
||||
|
||||
/*! \brief Decode a null-terminated string, ignoring errors.
|
||||
|
||||
\param dest The output destination.
|
||||
\param size The number of characters that can be stored in \a dest.
|
||||
\param src Pointer to the null-terminated source data.
|
||||
\returns Pointer to the output destination.
|
||||
\retval 0 on error (see \c errno).
|
||||
|
||||
This function will attempt to decode a null-terminated UTF-8 string. It returns 0 on error and sets
|
||||
\c errno appropriately.
|
||||
|
||||
This function will truncate the output if there is not enough space and will skip characters it
|
||||
cannot decode. It can only fail if you pass it invalid parameters.
|
||||
|
||||
*/
|
||||
wchar_t* utf8_decode_force(wchar_t* dest, size_t size, const char* src);
|
||||
|
||||
|
||||
|
||||
/*! \brief Decode a fixed-size string, ignoring errors.
|
||||
|
||||
\param dest The output destination.
|
||||
\param size The number of characters that can be stored in \a dest.
|
||||
\param written If not null, set to the number of characters written (excluding NUL).
|
||||
\param src Pointer to the source data (need not be null-terminated).
|
||||
\param amt Number of bytes to decode.
|
||||
\returns Pointer to the output destination.
|
||||
\retval 0 on error (see \c errno).
|
||||
|
||||
This function will attempt to decode a fixed-size UTF-8 string. It returns 0 on error and sets
|
||||
\c errno appropriately. It will happily transcode ASCII NUL characters. If \a written is not null,
|
||||
it is set to the number of characters written excluding the terminating NUL. This function always
|
||||
produces null-terminated strings.
|
||||
|
||||
This function will truncate the output if there is not enough space and will skip characters it
|
||||
cannot decode. It can only fail if you pass it invalid parameters.
|
||||
|
||||
*/
|
||||
wchar_t* utf8_decode_force2(wchar_t* dest, size_t size, size_t* written, const char* src, size_t amt);
|
||||
|
||||
|
||||
|
||||
/*!@}*/
|
|
@ -0,0 +1,204 @@
|
|||
/* libutf8/src/lib/decode_ctx.c
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
/* Internal states of utf8_decoder().
 *
 * The numeric values matter. The decoder decrements the state after each
 * continuation byte and treats reaching 0 as "character complete", so
 * utf8_state_none must be 0 and utf8_state_multibyteN must equal N, the
 * number of continuation bytes still expected.
 */
enum utf8_decoder_state {
    utf8_state_none = 0,        // between characters
    utf8_state_multibyte1 = 1,  // 1 continuation byte still expected
    utf8_state_multibyte2 = 2,  // 2 continuation bytes still expected
    utf8_state_multibyte3 = 3,  // 3 continuation bytes still expected
    utf8_state_multibyte4 = 4,  // 4 continuation bytes still expected
    utf8_state_multibyte5 = 5,  // 5 continuation bytes still expected
    utf8_state_error = 6,       // utf8_decoder() refuses a context in this state
    utf8_state_skip = 7         // resynchronising after an error was handled
};
|
||||
|
||||
|
||||
|
||||
struct utf8_decode_state* utf8_decoder(struct utf8_decode_state* ctx)
|
||||
{
|
||||
wchar_t* wr;
|
||||
size_t avail;
|
||||
enum utf8_decode_error error_type;
|
||||
|
||||
if(!ctx || !ctx->rd || !ctx->wr || ctx->wr_size < 2 || ctx->state == utf8_state_error) {
|
||||
errno = EINVAL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
wr = ctx->wr;
|
||||
ctx->written = 0;
|
||||
avail = ctx->wr_size;
|
||||
|
||||
loop:
|
||||
while(ctx->rd_remain) {
|
||||
uint8_t in = *ctx->rd;
|
||||
|
||||
switch(ctx->state) {
|
||||
case utf8_state_skip:
|
||||
case utf8_state_none:
|
||||
if(!in && ctx->rd_remain < 0) {
|
||||
*wr = 0;
|
||||
ctx->complete = true;
|
||||
++ctx->byte_offset;
|
||||
return ctx;
|
||||
}
|
||||
if(!(in & 0x80)) {
|
||||
*wr++ = in;
|
||||
++ctx->written;
|
||||
--avail;
|
||||
++ctx->char_offset;
|
||||
ctx->complete = true;
|
||||
if(in == 0x0A) {
|
||||
++ctx->line;
|
||||
ctx->col = 0;
|
||||
} else {
|
||||
++ctx->col;
|
||||
}
|
||||
ctx->state = utf8_state_none;
|
||||
break;
|
||||
}
|
||||
ctx->complete = false;
|
||||
if((in & 0xE0) == 0xC0) {
|
||||
ctx->minch = 0x80;
|
||||
ctx->state = utf8_state_multibyte1;
|
||||
ctx->statech = in & 0x1F;
|
||||
} else if((in & 0xF0) == 0xE0) {
|
||||
ctx->minch = 0x800;
|
||||
ctx->state = utf8_state_multibyte2;
|
||||
ctx->statech = in & 0x0F;
|
||||
} else if((in & 0xF8) == 0xF0) {
|
||||
ctx->minch = 0x10000;
|
||||
ctx->state = utf8_state_multibyte3;
|
||||
ctx->statech = in & 0x07;
|
||||
} else if((in & 0xFC) == 0xF8) {
|
||||
ctx->minch = 0x200000;
|
||||
ctx->state = utf8_state_multibyte4;
|
||||
ctx->statech = in & 0x03;
|
||||
} else if((in & 0xFE) == 0xFC) {
|
||||
ctx->minch = 0x4000000;
|
||||
ctx->state = utf8_state_multibyte5;
|
||||
ctx->statech = in & 0x01;
|
||||
} else if(ctx->state != utf8_state_none) {
|
||||
ctx->state = utf8_state_none;
|
||||
} else {
|
||||
error_type = ((in & 0xC0) == 0x80) ? utf8_decode_error_lone_cchar
|
||||
: utf8_decode_error_not_schar;
|
||||
goto error;
|
||||
}
|
||||
break;
|
||||
|
||||
case utf8_state_multibyte1:
|
||||
case utf8_state_multibyte2:
|
||||
case utf8_state_multibyte3:
|
||||
case utf8_state_multibyte4:
|
||||
case utf8_state_multibyte5:
|
||||
if((in & 0xC0) != 0x80) {
|
||||
error_type = utf8_decode_error_not_cchar;
|
||||
goto error;
|
||||
}
|
||||
ctx->statech <<= 6;
|
||||
ctx->statech |= in & 0x3F;
|
||||
if(!--ctx->state) {
|
||||
if(ctx->statech < ctx->minch) {
|
||||
error_type = utf8_decode_error_overlong;
|
||||
goto error;
|
||||
} else {
|
||||
// validate codepoint
|
||||
if(!utf8_isvalid(ctx->statech)) {
|
||||
error_type = utf8_decode_error_illegal_cp;
|
||||
goto error;
|
||||
}
|
||||
|
||||
// add to output string
|
||||
*wr++ = ctx->statech;
|
||||
++ctx->written;
|
||||
--avail;
|
||||
++ctx->char_offset;
|
||||
ctx->complete = true;
|
||||
if(ctx->statech == 0x0A || ctx->statech == 0x2028) {
|
||||
++ctx->line;
|
||||
ctx->col = 0;
|
||||
} else {
|
||||
++ctx->col;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
errno = EINVAL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
++ctx->byte_offset;
|
||||
++ctx->rd;
|
||||
if(ctx->rd_remain > 0) --ctx->rd_remain;
|
||||
if(avail == 1) break;
|
||||
}
|
||||
*wr = 0;
|
||||
return ctx;
|
||||
|
||||
error:
|
||||
if(!ctx->error_callback) {
|
||||
errno = EILSEQ;
|
||||
return 0;
|
||||
}
|
||||
switch(ctx->error_callback(ctx, error_type, wr)) {
|
||||
case utf8_decode_error_action_abort:
|
||||
errno = EILSEQ;
|
||||
return 0;
|
||||
|
||||
case utf8_decode_error_action_skip:
|
||||
ctx->state = utf8_state_skip;
|
||||
goto loop;
|
||||
|
||||
case utf8_decode_error_action_replace:
|
||||
ctx->state = utf8_state_skip;
|
||||
++ctx->written;
|
||||
if(*wr == 0x0A || *wr == 0x2028) {
|
||||
++ctx->line;
|
||||
ctx->col = 0;
|
||||
} else {
|
||||
++ctx->col;
|
||||
}
|
||||
++wr;
|
||||
if(--avail == 1) {
|
||||
*wr = 0;
|
||||
return ctx;
|
||||
}
|
||||
goto loop;
|
||||
}
|
||||
|
||||
// shouldn't reach here
|
||||
errno = EILSEQ;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
enum utf8_decode_error_action utf8_decode_error_callback_replace(
|
||||
const struct utf8_decode_state* ctx, enum utf8_decode_error error, wchar_t* newch)
|
||||
{
|
||||
(void)ctx;
|
||||
(void)error;
|
||||
*newch = 0xFFFD;
|
||||
return utf8_decode_error_action_replace;
|
||||
}
|
||||
|
||||
|
||||
|
||||
enum utf8_decode_error_action utf8_decode_error_callback_skip(
|
||||
const struct utf8_decode_state* ctx, enum utf8_decode_error error, wchar_t* newch)
|
||||
{
|
||||
(void)ctx;
|
||||
(void)error;
|
||||
(void)newch;
|
||||
return utf8_decode_error_action_skip;
|
||||
}
|
||||
|
||||
/* options for text editors
|
||||
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
*/
|
|
@ -0,0 +1,197 @@
|
|||
/* libutf8/src/lib/decode_ctx.h
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
/*! \defgroup decode_ctx UTF-8 stateful decoder.
|
||||
|
||||
This UTF-8 decoder uses a structure to maintain state information between calls. This means that
|
||||
you can feed it a stream of data as it comes in without needing to store the entire document in a
|
||||
buffer. It correctly copes with the currently-available data ending on a non-character boundary.
|
||||
|
||||
Errors are handled by providing a callback function (several of which are provided by the library).
|
||||
The callback function has the option of aborting the conversion, substituting a replacement
|
||||
character, or simply skipping the illegal byte sequence.
|
||||
|
||||
*/
|
||||
/*!@{*/
|
||||
|
||||
|
||||
|
||||
/*! \brief Types of decoder error.
|
||||
|
||||
These are the types of error that can be encountered by the decoder. This allows slightly more
|
||||
information than is provided by setting \a errno to \c EILSEQ. The type of error will be passed
|
||||
to the callback function.
|
||||
|
||||
*/
|
||||
enum utf8_decode_error {
    /// A continuation byte turned up where a start byte was expected.
    utf8_decode_error_lone_cchar = 0,

    /// A byte inside a multibyte sequence was not a continuation byte.
    utf8_decode_error_not_cchar = 1,

    /// A byte that is neither ASCII nor a valid start byte was found.
    utf8_decode_error_not_schar = 2,

    /// The sequence used more bytes than its value requires (overlong form).
    utf8_decode_error_overlong = 3,

    /// The decoded value is an illegal code point (UTF-16 surrogates or 0xFFFE, 0xFFFF).
    utf8_decode_error_illegal_cp = 4
};
|
||||
|
||||
|
||||
|
||||
/*! \brief Action to be taken after error callback.
|
||||
|
||||
These are the possible actions that can be undertaken after a stateful decode has encountered an
|
||||
error. These actions are specified by the error callback function's return value.
|
||||
|
||||
*/
|
||||
enum utf8_decode_error_action {
    /// Give up: the conversion stops and EILSEQ is reported.
    utf8_decode_error_action_abort = 0,

    /// Drop the illegal byte sequence and carry on.
    utf8_decode_error_action_skip = 1,

    /// Drop the illegal byte sequence and emit the callback's replacement character instead.
    utf8_decode_error_action_replace = 2
};
|
||||
|
||||
|
||||
|
||||
/*! \brief Error callback type.
|
||||
|
||||
\param state The state-storage structure.
|
||||
\param error The error type.
|
||||
\param[out] newch If utf8_decode_error_action_replace is returned, then set this to the value of
|
||||
the character you wish to replace with (\c 0xFFFD is recommended).
|
||||
\returns A value specifying what action to undertake as a result of the callback.
|
||||
|
||||
This callback determines the action of the UTF-8 stateful decoder on encountering an illegal byte
|
||||
sequence. It can choose to abort the conversion, skip the illegal sequence, or replace the illegal
|
||||
sequence with an arbitrary character.
|
||||
|
||||
*/
|
||||
typedef enum utf8_decode_error_action(*utf8_decode_error_callback)(
|
||||
const struct utf8_decode_state* state, enum utf8_decode_error error, wchar_t* newch);
|
||||
|
||||
|
||||
|
||||
/*! \brief State structure used to decode UTF-8 into Unicode.
|
||||
|
||||
This structure is used to decode arbitrary chunks of UTF-8 data into Unicode. It can deal with
|
||||
partial data streams (even if they are cut-off mid-character).
|
||||
|
||||
Before calling utf8_decoder, you must set up the object appropriately. The first step is to use
|
||||
\a memset to initialise everything to 0. Then you need to fill out the read and write pointers, and
|
||||
possibly set up the error callback.
|
||||
|
||||
To use it, you set \a rd to point to your input data and \a rd_remain to the amount you have. If
|
||||
\a rd_remain is negative, the input data is assumed to be null-terminated; otherwise, it is taken
|
||||
as the number of bytes remaining at the input. These are updated after each call, so simply check
|
||||
if \a rd_remain is 0 (or \a *rd is 0 in the case of a null-terminated string).
|
||||
|
||||
You must also set \a wr (pointer to destination buffer) and \a wr_size (number of characters that
|
||||
can be written there), and \a written is set for you (it is the number of characters written per
|
||||
call but excluding the terminating NUL). This implies that the buffer must have space for at least
|
||||
two characters. You can change \a wr and \a wr_size at any time, but if you leave them the same the
|
||||
data will be overwritten on each call.
|
||||
|
||||
If you wish to do error recovery, set \a error_callback and possibly \a data.
|
||||
|
||||
You can examine the \a line and \a col variables to get the line / column of the input data at which
|
||||
the decoder is currently operating. \a char_offset and \a byte_offset represent the offset, in
|
||||
complete characters or bytes, from the start of the stream. With the exception of \a byte_offset,
|
||||
these variables aren't perfect, as they can be affected by errors and limitations (only 0x0A and
|
||||
0x2028 are recognised as line end chars, and the effect of tabs is ignored).
|
||||
|
||||
*/
|
||||
struct utf8_decode_state {
|
||||
/// \c false if we are part-way through a multi-byte character.
|
||||
bool complete;
|
||||
|
||||
/// Data to read (current read position).
|
||||
const char* rd;
|
||||
|
||||
/// Number of bytes remaining (current).
|
||||
int rd_remain;
|
||||
|
||||
/// Internal state; initialise to 0, don't change.
|
||||
int state;
|
||||
|
||||
/// Error callback (may be 0).
|
||||
utf8_decode_error_callback error_callback;
|
||||
|
||||
/// Pointer to output buffer.
|
||||
wchar_t* wr;
|
||||
|
||||
/// Number of characters that can be written.
|
||||
size_t wr_size;
|
||||
|
||||
/// Number of characters written on last call.
|
||||
size_t written;
|
||||
|
||||
/// Arbitrary data pointer for \a error_callback.
|
||||
void* data;
|
||||
|
||||
/// Current line (starting from 0).
|
||||
int line;
|
||||
|
||||
/// Current column (starting from 0).
|
||||
int col;
|
||||
|
||||
/// Character offset from start of data (starting from 0).
|
||||
int char_offset;
|
||||
|
||||
/// Byte offset from start of data (starting from 0).
|
||||
int byte_offset;
|
||||
|
||||
/// Don't use this.
|
||||
wchar_t statech;
|
||||
/// Don't use this.
|
||||
wchar_t minch;
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*! \brief Decode an arbitrary chunk of a UTF-8 byte stream.
|
||||
|
||||
\param state The state-storage structure.
|
||||
\retval state on success.
|
||||
\retval 0 on error (see \a errno).
|
||||
|
||||
This function is used to do multi-pass decoding of arbitrary UTF-8 byte streams. Each call will
|
||||
update \a state.rd, \a state.rd_remain and \a state.written. \a state.complete is \c true if, on consumption
|
||||
of all the data, we are not inside a multibyte character.
|
||||
|
||||
Should an error occur, \a state.error_callback is called (if it is not 0). If it is 0, or it returns
|
||||
utf8_decode_error_action_abort, then the conversion will be aborted and the object set into
|
||||
an error state. \a errno will be set to \c EILSEQ. Once the object is in an error state, there is
|
||||
no way to recover short of completely clearing it and starting with fresh data. Continuing to call
|
||||
this function with an invalid object will result in \c EINVAL.
|
||||
|
||||
*/
|
||||
struct utf8_decode_state* utf8_decoder(struct utf8_decode_state* state);
|
||||
|
||||
|
||||
|
||||
/// Standard error callback: use replacement char 0xFFFD.
|
||||
enum utf8_decode_error_action utf8_decode_error_callback_replace(
|
||||
const struct utf8_decode_state* ctx, enum utf8_decode_error error, wchar_t* newch);
|
||||
|
||||
/// Standard error callback: skip invalid chars.
|
||||
enum utf8_decode_error_action utf8_decode_error_callback_skip(
|
||||
const struct utf8_decode_state* ctx, enum utf8_decode_error error, wchar_t* newch);
|
||||
|
||||
|
||||
|
||||
/*!@}*/
|
||||
|
||||
/* options for text editors
|
||||
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
*/
|
|
@ -0,0 +1,141 @@
|
|||
/* libutf8/src/lib/encode.c
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
/* Encode one character as UTF-8 at dest, which has room for amt bytes.
 *
 * Returns the address just past the last byte written. On failure returns 0
 * with errno set: EINVAL for a null dest or zero amt, EILSEQ when
 * utf8_isvalid() rejects ch, ENOMEM when the sequence needs more than amt
 * bytes. The checks are made in that order.
 */
char* utf8_encode_char(char* dest, size_t amt, wchar_t ch)
{
    size_t len;     /* number of bytes in the encoded sequence */
    int lead;       /* length-marker bits of the first byte */
    int shift;      /* bit position of the group currently being emitted */

    if(!dest || !amt) {
        errno = EINVAL;
        return 0;
    }
    if(!utf8_isvalid(ch)) {
        errno = EILSEQ;
        return 0;
    }

    /* values below 0x80 are stored verbatim; amt >= 1 is already known */
    if(ch < 0x80) {
        *dest = ch;
        return dest + 1;
    }

    /* pick sequence length and first-byte marker from the value range */
    if(ch < 0x800) {
        len = 2;
        lead = 0xC0;
    } else if(ch < 0x10000) {
        len = 3;
        lead = 0xE0;
    } else if(ch < 0x200000) {
        len = 4;
        lead = 0xF0;
    } else if(ch < 0x4000000) {
        len = 5;
        lead = 0xF8;
    } else {
        len = 6;
        lead = 0xFC;
    }

    if(amt < len) {
        errno = ENOMEM;
        return 0;
    }

    /* the first byte carries the top (7 - len) payload bits ... */
    shift = 6 * ((int)len - 1);
    *dest++ = lead | ((ch >> shift) & (0x7F >> len));

    /* ... and each continuation byte carries the next six */
    while(shift > 0) {
        shift -= 6;
        *dest++ = 0x80 | ((ch >> shift) & 0x3F);
    }

    return dest;
}
|
||||
|
||||
|
||||
|
||||
/* Encode ch as UTF-8, falling back to ilseq when ch is not a valid character.
 *
 * ilseq itself must pass utf8_isvalid(); otherwise the call fails with EILSEQ
 * before ch is looked at. All other outcomes are those of utf8_encode_char().
 */
char* utf8_encode_char_force(char* dest, size_t amt, wchar_t ch, wchar_t ilseq)
{
    wchar_t out;

    if(!utf8_isvalid(ilseq)) {
        errno = EILSEQ;
        return 0;
    }

    out = ch;
    if(!utf8_isvalid(ch)) out = ilseq;

    return utf8_encode_char(dest, amt, out);
}
|
||||
|
||||
|
||||
|
||||
/* Encode a null-terminated string: utf8_encode2() with no byte-count output
 * and an input length of -1 (meaning "read up to the terminating null").
 */
char* utf8_encode(char* dest, size_t amt, const wchar_t* src)
{
    const size_t until_nul = (size_t)-1;

    return utf8_encode2(dest, amt, 0, src, until_nul);
}
|
||||
|
||||
|
||||
|
||||
char* utf8_encode2(char* dest, size_t amt, size_t* written, const wchar_t* src, size_t inamt)
|
||||
{
|
||||
struct utf8_encode_state ctx;
|
||||
memset(&ctx, 0, sizeof(ctx));
|
||||
ctx.rd = src;
|
||||
ctx.rd_remain = inamt;
|
||||
ctx.wr = dest;
|
||||
ctx.wr_size = amt;
|
||||
|
||||
if(!utf8_encoder(&ctx)) return 0;
|
||||
if(ctx.rd_remain > 0 || (ctx.rd_remain < 0 && *ctx.rd)) {
|
||||
errno = ENOMEM;
|
||||
return 0;
|
||||
}
|
||||
if(written) *written = ctx.written;
|
||||
return dest;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* Best-effort encode of a null-terminated string: utf8_encode_force2() with no
 * byte-count output and an input length of -1 (read up to the terminating null).
 */
char* utf8_encode_force(char* dest, size_t amt, const wchar_t* src)
{
    const size_t until_nul = (size_t)-1;

    return utf8_encode_force2(dest, amt, 0, src, until_nul);
}
|
||||
|
||||
|
||||
|
||||
char* utf8_encode_force2(char* dest, size_t amt, size_t* written, const wchar_t* src, size_t inamt)
|
||||
{
|
||||
struct utf8_encode_state ctx;
|
||||
memset(&ctx, 0, sizeof(ctx));
|
||||
ctx.rd = src;
|
||||
ctx.rd_remain = inamt;
|
||||
ctx.wr = dest;
|
||||
ctx.wr_size = amt;
|
||||
ctx.error_callback = utf8_encode_error_callback_replace;
|
||||
|
||||
if(!utf8_encoder(&ctx)) return 0;
|
||||
if(written) *written = ctx.written;
|
||||
return dest;
|
||||
}
|
||||
|
||||
/* options for text editors
|
||||
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
*/
|
|
@ -0,0 +1,144 @@
|
|||
/* libutf8/src/lib/encode.h
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
/*! \defgroup encode UTF-8 encoding routines.
|
||||
|
||||
The functions in this module allow encoding of UTF-8 characters. Errors are reported through
|
||||
\c errno, with the following errors being of particular interest:
|
||||
|
||||
\li \c EINVAL - invalid argument to function
|
||||
\li \c EILSEQ - illegal source character (see utf8_isvalid())
|
||||
\li \c ENOMEM - not enough space in destination buffer
|
||||
|
||||
*/
|
||||
/*!@{*/
|
||||
|
||||
|
||||
|
||||
/*! \brief Encode a single character into UTF-8.
|
||||
|
||||
\param dest The destination buffer.
|
||||
\param amt Number of bytes in destination buffer.
|
||||
\param ch Character to encode.
|
||||
\returns Pointer to next byte of buffer to use.
|
||||
\retval 0 on error (see \c errno).
|
||||
|
||||
This function will encode a single character into UTF-8. It returns a pointer to the end of the
|
||||
character (i.e. the next position in the buffer you want to write to).
|
||||
|
||||
On error, it sets \c errno (to \c EINVAL, if \a dest is null or \a amt is less than 1; \c EILSEQ
|
||||
if \a ch is not valid; or \c ENOMEM if the result would not fit into
|
||||
\a amt bytes) and returns 0.
|
||||
|
||||
*/
|
||||
char* utf8_encode_char(char* dest, size_t amt, wchar_t ch);
|
||||
|
||||
|
||||
|
||||
/*! \brief Encode a single character into UTF-8, forcing replacement of invalid characters.
|
||||
|
||||
\param dest The destination buffer.
|
||||
\param amt Number of bytes in destination buffer.
|
||||
\param ch Character to encode.
|
||||
\param ilseq If \a ch is not a legal character, then this is encoded instead.
|
||||
\returns Pointer to next byte of buffer to use.
|
||||
\retval 0 on error (see \c errno).
|
||||
|
||||
This function will encode a single character into UTF-8. It returns a pointer to the end of the
|
||||
character (i.e. the next position in the buffer you want to write to). If the source character \a ch
|
||||
is not a valid code point, it will instead encode the character \a ilseq.
|
||||
|
||||
On error, it sets \c errno (to \c EINVAL, if \a dest is null or \a amt is less than 1; \c EILSEQ
|
||||
if \a ilseq is not valid; or \c ENOMEM if the result would not fit into
|
||||
\a amt bytes) and returns 0.
|
||||
|
||||
*/
|
||||
char* utf8_encode_char_force(char* dest, size_t amt, wchar_t ch, wchar_t ilseq);
|
||||
|
||||
|
||||
|
||||
/*! \brief Encode a null-terminated string into UTF-8.
|
||||
|
||||
\param dest The destination buffer.
|
||||
\param amt Number of bytes in the destination buffer.
|
||||
\param src Null-terminated source string.
|
||||
\returns Pointer to destination buffer.
|
||||
\retval 0 on error (see \c errno).
|
||||
|
||||
This function encodes a null-terminated Unicode string into the destination buffer. It returns a
|
||||
pointer to the destination buffer on success, and 0 on error. If there is not enough space in the
|
||||
buffer, or an illegal character is encountered somewhere in the sequence, it will fail.
|
||||
|
||||
*/
|
||||
char* utf8_encode(char* dest, size_t amt, const wchar_t* src);
|
||||
|
||||
|
||||
|
||||
/*! \brief Encode a fixed-size string into UTF-8.
|
||||
|
||||
\param dest The destination buffer.
|
||||
\param amt Number of bytes in the destination buffer.
|
||||
\param written Set to number of bytes written on success (excluding NUL).
|
||||
\param src Pointer to source string.
|
||||
\param inamt Number of characters to encode.
|
||||
\returns Pointer to destination buffer.
|
||||
\retval 0 on error (see \c errno).
|
||||
|
||||
This function encodes a Unicode string (possibly containing ASCII NUL) into the destination buffer.
|
||||
It returns a pointer to the destination buffer on success, and 0 on error. If there is not enough
|
||||
space in the buffer, or an illegal character is encountered somewhere in the sequence, it will fail.
|
||||
The destination will be null-terminated.
|
||||
|
||||
*/
|
||||
char* utf8_encode2(char* dest, size_t amt, size_t* written, const wchar_t* src, size_t inamt);
|
||||
|
||||
|
||||
|
||||
/*! \brief Encode a null-terminated string into UTF-8, ignoring errors.
|
||||
|
||||
\param dest The destination buffer.
|
||||
\param amt Number of bytes in the destination buffer.
|
||||
\param src Null-terminated source string.
|
||||
\returns Pointer to destination buffer.
|
||||
\retval 0 if arguments are invalid.
|
||||
|
||||
This function will encode a null-terminated Unicode string into the destination buffer, making a
|
||||
best-effort in the case of failures. If there is not enough memory, the destination string will be
|
||||
truncated (but still null-terminated). If an illegal source character is encountered, it is replaced
|
||||
with the Unicode replacement character U+FFFD. The function can only fail if one of the arguments is
|
||||
invalid.
|
||||
|
||||
*/
|
||||
char* utf8_encode_force(char* dest, size_t amt, const wchar_t* src);
|
||||
|
||||
|
||||
|
||||
/*! \brief Encode a fixed-size string into UTF-8, ignoring errors.
|
||||
|
||||
\param dest The destination buffer.
|
||||
\param amt Number of bytes in the destination buffer.
|
||||
\param written Set to number of bytes written on success (excluding NUL).
|
||||
\param src Pointer to source string.
|
||||
\param inamt Number of characters to encode.
|
||||
\returns Pointer to destination buffer.
|
||||
\retval 0 if arguments are invalid.
|
||||
|
||||
This function will encode a Unicode string (possibly containing ASCII NUL) into the destination
|
||||
buffer, making a best-effort in the case of failures. If there is not enough memory, the destination
|
||||
string will be truncated (but still null-terminated). If an illegal source character is encountered,
|
||||
it is replaced with the Unicode replacement character U+FFFD. The function can only fail if one of
|
||||
the arguments is invalid.
|
||||
|
||||
*/
|
||||
char* utf8_encode_force2(char* dest, size_t amt, size_t* written, const wchar_t* src, size_t inamt);
|
||||
|
||||
|
||||
|
||||
/*!@}*/
|
||||
|
||||
/* options for text editors
|
||||
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
*/
|
|
@ -0,0 +1,88 @@
|
|||
/* libutf8/src/lib/encode_state.c
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
struct utf8_encode_state* utf8_encoder(struct utf8_encode_state* state)
|
||||
{
|
||||
char* wr = state->wr, * ret;
|
||||
char* endp = wr + state->wr_size - 1;
|
||||
wchar_t ch;
|
||||
enum utf8_encode_error_action error_action;
|
||||
bool reencoding;
|
||||
|
||||
if(!state || !state->rd || !state->wr || state->wr_size < 7) {
|
||||
errno = EINVAL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
state->written = 0;
|
||||
while(state->rd_remain) {
|
||||
ch = *state->rd;
|
||||
if(!ch && state->rd_remain < 0) break;
|
||||
|
||||
reencoding = false;
|
||||
reencode:
|
||||
ret = utf8_encode_char(wr, endp - wr, ch);
|
||||
if(!ret) {
|
||||
if(errno == ENOMEM) break;
|
||||
if(!state->error_callback || reencoding) {
|
||||
errno = EILSEQ;
|
||||
return 0;
|
||||
}
|
||||
error_action = state->error_callback(state, &ch);
|
||||
switch(error_action) {
|
||||
case utf8_encode_error_action_abort:
|
||||
errno = EILSEQ;
|
||||
return 0;
|
||||
|
||||
case utf8_encode_error_action_replace:
|
||||
reencoding = true;
|
||||
goto reencode;
|
||||
|
||||
case utf8_encode_error_action_skip:
|
||||
ret = wr;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(state->rd_remain > 0) state->rd_remain--;
|
||||
++state->rd;
|
||||
++state->char_offset;
|
||||
if(ch == 0x0A || ch == 0x2028) {
|
||||
++state->line;
|
||||
state->col = 0;
|
||||
} else {
|
||||
++state->col;
|
||||
}
|
||||
state->written += ret - wr;
|
||||
wr = ret;
|
||||
if(wr == endp) break;
|
||||
}
|
||||
*wr = 0;
|
||||
return state;
|
||||
}
|
||||
|
||||
|
||||
|
||||
enum utf8_encode_error_action utf8_encode_error_callback_replace(
|
||||
const struct utf8_encode_state* state, wchar_t* newch)
|
||||
{
|
||||
(void)state;
|
||||
*newch = 0xFFFD;
|
||||
return utf8_encode_error_action_replace;
|
||||
}
|
||||
|
||||
|
||||
|
||||
enum utf8_encode_error_action utf8_encode_error_callback_skip(
|
||||
const struct utf8_encode_state* state, wchar_t* newch)
|
||||
{
|
||||
(void)state;
|
||||
(void)newch;
|
||||
return utf8_encode_error_action_skip;
|
||||
}
|
||||
|
||||
/* options for text editors
|
||||
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
*/
|
|
@ -0,0 +1,158 @@
|
|||
/* libutf8/src/lib/encode_state.h
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
/*! \defgroup encode_state UTF-8 stateful encoder.
|
||||
|
||||
This UTF-8 encoder uses a structure to maintain state information between calls. This means that
|
||||
you can feed it a stream of data as it comes in without needing to store the entire source in a
|
||||
buffer.
|
||||
|
||||
Errors (i.e. illegal source chars; see utf8_isvalid()) are handled by providing a callback function
|
||||
(several of which are provided by the library). The callback function has the option of aborting
|
||||
the conversion, substituting a replacement character, or simply skipping the illegal source
|
||||
character.
|
||||
|
||||
*/
|
||||
/*!@{*/
|
||||
|
||||
|
||||
|
||||
/*! \brief Action to be taken after error callback.
|
||||
|
||||
These are the possible actions that can be undertaken after a stateful encoding operation has
|
||||
encountered an error (illegal source char). These actions are specified by the error callback
|
||||
function's return value.
|
||||
|
||||
*/
|
||||
enum utf8_encode_error_action {
    /// Give up: the conversion stops and EILSEQ is reported.
    utf8_encode_error_action_abort = 0,

    /// Drop the illegal source character and carry on.
    utf8_encode_error_action_skip = 1,

    /// Encode the callback's replacement character in place of the illegal source character.
    utf8_encode_error_action_replace = 2
};
|
||||
|
||||
|
||||
|
||||
|
||||
/*! \brief Error callback type.
|
||||
|
||||
\param state The encoder state information.
|
||||
\param[out] newch If \a utf8_encode_error_action_replace is returned, this is set to the
|
||||
character that should be substituted instead of the illegal source character.
|
||||
|
||||
This function is called whenever an error occurs. It can examine \a state (and specifically
|
||||
\a *state.rd) to determine the illegal source character. It can choose to skip the character, replace
|
||||
it with something else, or abort the conversion entirely.
|
||||
|
||||
*/
|
||||
typedef enum utf8_encode_error_action (*utf8_encode_error_callback)(
|
||||
const struct utf8_encode_state* state, wchar_t* newch);
|
||||
|
||||
/// Standard error callback: use replacement char 0xFFFD.
|
||||
enum utf8_encode_error_action utf8_encode_error_callback_replace(
|
||||
const struct utf8_encode_state* state, wchar_t* newch);
|
||||
|
||||
/// Standard error callback: skip invalid chars.
|
||||
enum utf8_encode_error_action utf8_encode_error_callback_skip(
|
||||
const struct utf8_encode_state* state, wchar_t* newch);
|
||||
|
||||
|
||||
|
||||
/*! \brief State structure used to encode Unicode into UTF-8.
|
||||
|
||||
This structure is used to encode an arbitrary Unicode string into UTF-8. To set it up, first call
|
||||
\a memset to clear the structure to zero. You will then
|
||||
want to set \a rd to point to your input string, with \a rd_remain the number of characters to encode
|
||||
(you can set it to a negative number if \a rd is null-terminated and you want to encode the whole
|
||||
thing). You will also want to tell it where to write to (\a wr) and how much space there is in that
|
||||
buffer (\a wr_size).
|
||||
|
||||
To deal with errors (illegal input chars), you can provide a callback function \a error_callback.
|
||||
An arbitrary \a data pointer is provided in case you wish to associate some object with the encode
|
||||
operation. Passing a null pointer for \a error_callback is a valid way of indicating you do not
|
||||
wish to attempt to correct errors.
|
||||
|
||||
You can examine the \a line and \a col variables to get the line / column of the input data at which
|
||||
the encoder is currently operating. These variables aren't perfect, as they can be
|
||||
affected by errors and limitations (only 0x0A and 0x2028 are recognised as line end chars, and the
|
||||
effect of tabs is ignored). \a char_offset represents the offset, in complete characters, from the
|
||||
start of the stream, and should always be accurate.
|
||||
|
||||
*/
|
||||
struct utf8_encode_state {
|
||||
/// Current read position.
|
||||
const wchar_t* rd;
|
||||
|
||||
/// Number of chars remaining (-ve means to scan for null char).
|
||||
int rd_remain;
|
||||
|
||||
/// Callback function used to handle illegal source characters.
|
||||
utf8_encode_error_callback error_callback;
|
||||
|
||||
/// Output buffer.
|
||||
char* wr;
|
||||
|
||||
/// Output buffer size.
|
||||
size_t wr_size;
|
||||
|
||||
/// Number of bytes written during last call.
|
||||
size_t written;
|
||||
|
||||
/// Arbitrary pointer (useful for \a error_callback).
|
||||
void* data;
|
||||
|
||||
/// Current line (starting from 0).
|
||||
int line;
|
||||
|
||||
/// Current column (starting from 0).
|
||||
int col;
|
||||
|
||||
/// Character offset from start of data (starting from 0).
|
||||
int char_offset;
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*! \brief Encode an arbitrary Unicode string.
|
||||
|
||||
\param state The encoder state information.
|
||||
\retval state on success.
|
||||
\retval 0 on error (see \c errno).
|
||||
|
||||
This function is used to encode some arbitrary Unicode string into UTF-8. It uses a state-storage
|
||||
structure which allows you to perform the encoding in multiple passes (e.g. if you are encoding
|
||||
an arbitrary string and outputting it, you will want to use a fixed size buffer and this might
|
||||
be smaller than required).
|
||||
|
||||
In each pass of the function, \a rd and \a rd_remain will be updated to record the current reading
|
||||
position and the number of characters left to encode. If the function completes this pass, \a rd_remain
|
||||
will be zero (but if you are converting a null-terminated string, you will need to check for \a *rd
|
||||
to be zero instead).
|
||||
|
||||
After each call, \a wr will be unchanged but \a written will contain the number of bytes written
|
||||
(excluding a terminating null, which is always written). If you do not want to overwrite this data
|
||||
on the next call, you will have to update \a wr and \a wr_size.
|
||||
|
||||
If \a state is null, or not filled out properly (no source data or destination buffer not at least 7
|
||||
bytes large), then no conversion will be performed and \a EINVAL will be stored in \a errno. If an
|
||||
illegal source character is encountered, and the error callback is 0, aborts the process or tries
|
||||
to replace the char with another illegal code point, then \a EILSEQ will be stored in \a errno. On
|
||||
error, 0 will be returned.
|
||||
|
||||
*/
|
||||
struct utf8_encode_state* utf8_encoder(struct utf8_encode_state* state);
|
||||
|
||||
|
||||
|
||||
/*!@}*/
|
||||
|
||||
/* options for text editors
|
||||
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
*/
|
|
@ -0,0 +1,21 @@
|
|||
# libutf8/src/lib/clib/pkgconf.in
|
||||
#
|
||||
# Metadata file for pkg-config
|
||||
# ( http://www.freedesktop.org/software/pkgconfig/ )
|
||||
#
|
||||
# (c)2006, Laurence Withers, <l@lwithers.me.uk>.
|
||||
# Released under the GNU GPLv2. See file COPYING or
|
||||
# http://www.gnu.org/copyleft/gpl.html for details.
|
||||
#
|
||||
|
||||
# Name, description
|
||||
Name: libutf8
|
||||
Description: Library for encoding and decoding UTF-8
|
||||
Version: @VERSION@
|
||||
|
||||
# Requirements
|
||||
Requires:
|
||||
|
||||
# Compilation information
|
||||
Libs: -L@LIBDIR@ -lutf8
|
||||
Cflags: -I@INCLUDEDIR@
|
|
@ -0,0 +1,17 @@
|
|||
# libutf8/src/libutf8/soversion
|
||||
#
|
||||
# (c)2006, Laurence Withers, <l@lwithers.me.uk>.
|
||||
# Released under the GNU GPLv2. See file COPYING or
|
||||
# http://www.gnu.org/copyleft/gpl.html for details.
|
||||
#
|
||||
|
||||
|
||||
|
||||
# SOMAJOR and SOMINOR are included in the library's soname. They need to
|
||||
# be bumped on a binary-incompatible release. They are both single
|
||||
# integers.
|
||||
SOMAJOR=0
|
||||
SOMINOR=0
|
||||
|
||||
# SOMICRO is bumped every time there is a binary-compatible release.
|
||||
SOMICRO=0
|
|
@ -0,0 +1 @@
|
|||
c tests tests libutf8
|
|
@ -0,0 +1,3 @@
|
|||
# Run the script src/tests/build.tests in the current shell.
source src/tests/build.tests
|
||||
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
# vim: expandtab:ts=4:sw=4
|
|
@ -0,0 +1,43 @@
|
|||
# These are external variables, and shouldn't clash with anything else
|
||||
# tests_BUILT
|
||||
#
|
||||
|
||||
# The test programs are compiled against the library, so build that first.
build_target libutf8 || return 1

# tests_BUILT is set at the end of this block so the work is done only once.
# The expansion is quoted so the test is a real "is it empty?" check.
if [ -z "${tests_BUILT}" ]
then
    LIBS="${libutf8} "
    EXTRAS=""

    echo "Building test programs..."
    do_cmd mkdir -p obj/tests || return 1

    for SRC in src/tests/*.c
    do
        # Program name: source file name without directory or ".c" suffix.
        TEST="obj/tests/$(basename "${SRC}" .c)"

        # Rebuild when the library, the source or this script is newer than
        # the existing binary. LIBS is left unquoted on purpose: it is a
        # space-separated list.
        MODIFIED=0
        for file in ${LIBS} "${SRC}" src/tests/build.tests
        do
            if [ "${file}" -nt "${TEST}" ]
            then
                MODIFIED=1
                break
            fi
        done

        if [ "${MODIFIED}" -ne 0 ]
        then
            do_cmd ${CC} -Iobj ${CFLAGS} -o "${TEST}" "${SRC}" ${LIBS} ${EXTRAS} || return 1
            print_success "Built ${TEST}"
        else
            print_success "${TEST} is up to date"
        fi
    done

    print_success "All tests built"

    tests_BUILT=1
fi
|
||||
|
||||
# kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
# vim: expandtab:ts=4:sw=4
|
|
@ -0,0 +1,107 @@
|
|||
/* libutf8/src/tests/decode.c
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
#include "utf8.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
|
||||
|
||||
/* Writes `amt' wide characters starting at `x' to stdout as raw wchar_t
 * values. A count of zero or less writes nothing; a short write is reported
 * on stderr.
 */
void writeout(const wchar_t* x, int amt)
{
    size_t count;

    // a negative count would turn into a huge size_t inside fwrite()
    if(amt <= 0) return;

    count = (size_t)amt;
    if(fwrite(x, sizeof(wchar_t), count, stdout) != count) {
        perror("fwrite");
    }
}
|
||||
|
||||
|
||||
|
||||
enum utf8_decode_error_action error_callback(
|
||||
const struct utf8_decode_state* ctx, enum utf8_decode_error error, wchar_t* newch)
|
||||
{
|
||||
fprintf(stderr, "Line %d, col %d (char %d, byte %d): ",
|
||||
ctx->line + 1, ctx->col + 1, ctx->char_offset, ctx->byte_offset);
|
||||
switch(error) {
|
||||
case utf8_decode_error_lone_cchar:
|
||||
fprintf(stderr, "a lone continuation char was encountered.\n");
|
||||
break;
|
||||
|
||||
case utf8_decode_error_not_cchar:
|
||||
fprintf(stderr, "a continuation char was expected, but not encountered.\n");
|
||||
break;
|
||||
|
||||
case utf8_decode_error_not_schar:
|
||||
fprintf(stderr, "an invalid character was encountered (not start char).\n");
|
||||
break;
|
||||
|
||||
case utf8_decode_error_overlong:
|
||||
fprintf(stderr, "an overlong character sequence was encountered.\n");
|
||||
break;
|
||||
|
||||
case utf8_decode_error_illegal_cp:
|
||||
fprintf(stderr, "an illegal code point was encountered.\n");
|
||||
break;
|
||||
}
|
||||
|
||||
*newch = 0xFFFD;
|
||||
return utf8_decode_error_action_replace;
|
||||
}
|
||||
|
||||
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
char inbuf[1024];
|
||||
wchar_t outbuf[1024];
|
||||
struct utf8_decode_state ctx;
|
||||
|
||||
if(argc == 2 && !strcmp(argv[1], "--print-summary")) {
|
||||
printf("Decodes UTF-8 on stdin to UCS-4 on stdout.\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
if(argc != 1) {
|
||||
fprintf(stderr, "No parameters expected. This program decodes UTF-8 presented on stdin\n"
|
||||
"and transforms it to UCS-4 on stdout.\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
// set up ctx structure
|
||||
memset(&ctx, 0, sizeof(ctx));
|
||||
ctx.wr = outbuf;
|
||||
ctx.wr_size = sizeof(outbuf) / sizeof(wchar_t);
|
||||
ctx.error_callback = error_callback;
|
||||
|
||||
// loop over input
|
||||
while(!feof(stdin)) {
|
||||
// read input
|
||||
ctx.rd_remain = fread(inbuf, 1, sizeof(inbuf), stdin);
|
||||
ctx.rd = inbuf;
|
||||
|
||||
// decode it
|
||||
while(ctx.rd_remain) {
|
||||
if(!utf8_decoder(&ctx)) {
|
||||
perror("utf8_decoder");
|
||||
fprintf(stderr, "(at line %d, col %d, char %d, byte %d)\n",
|
||||
ctx.line + 1, ctx.col + 1, ctx.char_offset, ctx.byte_offset);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// write output
|
||||
writeout(outbuf, ctx.written);
|
||||
}
|
||||
}
|
||||
|
||||
if(!ctx.complete) {
|
||||
fprintf(stderr, "Input did not end on a character boundary.\n");
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/* options for text editors
|
||||
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
*/
|
|
@ -0,0 +1,165 @@
|
|||
/* libutf8/src/tests/random.c
|
||||
*
|
||||
* (c)2006, Laurence Withers. Released under the GNU GPL. See file
|
||||
* COPYING for more information / terms of license.
|
||||
*/
|
||||
|
||||
#include "utf8.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
|
||||
|
||||
/* Fills buf[0..ch-1] with random values read from /dev/urandom, each masked
 * to its low 31 bits. Does nothing when ch is zero or negative. Exits the
 * program with status 1 if the device cannot be opened or read.
 */
void make_rand(wchar_t* buf, int ch)
{
    size_t want, have = 0;
    ssize_t ret;
    int fd, i;

    // a negative count would become a huge byte count below
    if(ch <= 0) return;

    fd = open("/dev/urandom", O_RDONLY);
    if(fd < 0) {
        perror("open(\"/dev/urandom\")");
        exit(1);
    }

    // read() may return fewer bytes than asked for, so keep reading until
    // the whole buffer has been filled
    want = (size_t)ch * sizeof(wchar_t);
    while(have < want) {
        ret = read(fd, (char*)buf + have, want - have);
        if(ret < 0) {
            perror("read(\"/dev/urandom\")");
            exit(1);
        }
        if(ret == 0) {
            // errno is not meaningful here, so perror() would mislead
            fprintf(stderr, "read(\"/dev/urandom\"): unexpected end of file\n");
            exit(1);
        }
        have += (size_t)ret;
    }
    close(fd);

    // clear everything above bit 30
    for(i = 0; i < ch; ++i) {
        buf[i] &= 0x7FFFFFFF;
    }
}
|
||||
|
||||
|
||||
|
||||
int do_encode(char* dest, size_t size, wchar_t* src, size_t amt)
|
||||
{
|
||||
struct utf8_encode_state ctx;
|
||||
memset(&ctx, 0, sizeof(ctx));
|
||||
|
||||
ctx.rd = src;
|
||||
ctx.rd_remain = amt;
|
||||
ctx.wr = dest;
|
||||
ctx.wr_size = 20;
|
||||
|
||||
while(ctx.rd_remain) {
|
||||
if(!utf8_encoder(&ctx)) {
|
||||
perror("utf8_encoder");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
ctx.wr += ctx.written;
|
||||
if(ctx.wr + ctx.wr_size > dest + size) {
|
||||
fprintf(stderr, "do_encode: we're going to run out of memory\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
return ctx.wr - dest;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* Returns the smaller of x and y. */
int MIN(int x, int y)
{
    if(y <= x) {
        return y;
    }
    return x;
}
|
||||
|
||||
|
||||
|
||||
void do_decode_easy(wchar_t* dest, size_t size, const char* src, size_t amt)
|
||||
{
|
||||
struct utf8_decode_state ctx;
|
||||
memset(&ctx, 0, sizeof(ctx));
|
||||
|
||||
ctx.rd = src;
|
||||
ctx.rd_remain = amt;
|
||||
ctx.wr = dest;
|
||||
ctx.wr_size = size;
|
||||
|
||||
if(!utf8_decoder(&ctx)) {
|
||||
perror("[easy] utf8_decoder");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if(ctx.rd_remain) {
|
||||
fprintf(stderr, "do_decode_easy: %d bytes left in buffer\n", ctx.rd_remain);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if(!ctx.complete) {
|
||||
fprintf(stderr, "do_decode_easy: incomplete character at end of data\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
void do_decode(wchar_t* dest, size_t size, const char* src, size_t amt)
|
||||
{
|
||||
struct utf8_decode_state ctx;
|
||||
memset(&ctx, 0, sizeof(ctx));
|
||||
|
||||
ctx.rd = src;
|
||||
ctx.rd_remain = MIN(20, amt);
|
||||
amt -= ctx.rd_remain;
|
||||
ctx.wr = dest;
|
||||
ctx.wr_size = 20;
|
||||
|
||||
while(ctx.rd_remain) {
|
||||
if(!utf8_decoder(&ctx)) {
|
||||
perror("utf8_decoder");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if(!ctx.rd_remain) {
|
||||
ctx.rd_remain = MIN(20, amt);
|
||||
amt -= ctx.rd_remain;
|
||||
}
|
||||
|
||||
ctx.wr += ctx.written;
|
||||
if(ctx.wr + ctx.wr_size > dest + size) {
|
||||
ctx.wr_size = ctx.wr - dest - size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* Compares `count' decoded characters in `got' with the originals in
 * `expect'. On a mismatch, lists every differing index on stderr, tagged with
 * `label'. Returns 0 when the buffers match and 1 otherwise.
 */
static int check_roundtrip(const char* label, const wchar_t* expect, const wchar_t* got, size_t count)
{
    size_t i;

    if(!memcmp(expect, got, count * sizeof(wchar_t))) {
        return 0;
    }

    fprintf(stderr, "%s: Output doesn't match input!\n", label);
    for(i = 0; i < count; ++i) {
        if(expect[i] != got[i]) {
            // wchar_t's underlying type varies by platform; %X needs unsigned int
            fprintf(stderr, "%4d: %08X != %08X\n", (int)i,
                (unsigned int)expect[i], (unsigned int)got[i]);
        }
    }
    return 1;
}



/* Round-trip test: encodes random characters to UTF-8, then decodes them with
 * both do_decode_easy() and do_decode(), checking each result against the
 * input. Returns 0 on success and 1 on a mismatch.
 */
int main(int argc, char* argv[])
{
    wchar_t wbuf[1024], wbuf2[1025];
    char cbuf[8192];
    const size_t nchars = sizeof(wbuf) / sizeof(wbuf[0]);
    const size_t nout = sizeof(wbuf2) / sizeof(wbuf2[0]);
    int amt;

    if(argc == 2 && !strcmp(argv[1], "--print-summary")) {
        printf("Encodes and decodes random well-formed strings.\n");
        return 0;
    }

    make_rand(wbuf, (int)nchars);
    amt = do_encode(cbuf, sizeof(cbuf), wbuf, nchars);

    // Each pass starts from a poisoned buffer and is checked on its own, so a
    // decoder that writes nothing cannot pass on leftovers from the other
    // pass. make_rand() masks its output to 31 bits, so with a 32-bit wchar_t
    // the all-ones fill never equals an input character.
    memset(wbuf2, 0xFF, sizeof(wbuf2));
    do_decode_easy(wbuf2, nout, cbuf, amt);
    if(check_roundtrip("do_decode_easy", wbuf, wbuf2, nchars)) {
        return 1;
    }

    memset(wbuf2, 0xFF, sizeof(wbuf2));
    do_decode(wbuf2, nout, cbuf, amt);
    if(check_roundtrip("do_decode", wbuf, wbuf2, nchars)) {
        return 1;
    }

    printf("Success.\n");
    return 0;
}
|
||||
|
||||
|
||||
/* options for text editors
|
||||
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
*/
|
|
@ -0,0 +1,35 @@
|
|||
/* libutf8/src/tests/???.c
|
||||
*
|
||||
* (c)2006, Laurence Withers, <l@lwithers.me.uk>.
|
||||
* Released under the GNU GPLv2. See file COPYING or
|
||||
* http://www.gnu.org/copyleft/gpl.html for details.
|
||||
*/
|
||||
|
||||
#include "utf8.h"
|
||||
|
||||
#include <stdio.h>
#include <string.h>
|
||||
|
||||
|
||||
|
||||
/* Skeleton for a new test program. When run with the single argument
 * `--print-summary' it prints a one-line description and exits with 0;
 * otherwise it returns `ret', which the test body (see TODO) should set.
 */
int main(int argc, char* argv[])
{
    int ret = 0;

    if(argc == 2 && strcmp(argv[1], "--print-summary") == 0) {
        printf("One line summary.\n");
        return 0;
    }

    if(argc == 1) {
        // empty argument list
    }

    // TODO

    return ret;
}
|
||||
|
||||
/* options for text editors
|
||||
kate: replace-trailing-space-save true; space-indent true; tab-width 4;
|
||||
vim: expandtab:ts=4:sw=4
|
||||
*/
|
Loading…
Reference in New Issue