241 lines
7.6 KiB
C
241 lines
7.6 KiB
C
/*
|
|
* Asterisk -- An open source telephony toolkit.
|
|
*
|
|
* Copyright (C) 2020, Sean Bright
|
|
*
|
|
* Sean Bright <sean.bright@gmail.com>
|
|
*
|
|
* See http://www.asterisk.org for more information about
|
|
* the Asterisk project. Please do not directly contact
|
|
* any of the maintainers of this project for assistance;
|
|
* the project provides a web site, mailing lists and IRC
|
|
* channels for your use.
|
|
*
|
|
* This program is free software, distributed under the terms of
|
|
* the GNU General Public License Version 2. See the LICENSE file
|
|
* at the top of the source tree.
|
|
*/
|
|
|
|
/*! \file
|
|
*
|
|
* \brief UTF-8 information and validation functions
|
|
*/
|
|
|
|
#ifndef ASTERISK_UTF8_H
|
|
#define ASTERISK_UTF8_H
|
|
|
|
/*!
|
|
* \brief Check if a zero-terminated string is valid UTF-8
|
|
* \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
|
|
*
|
|
* \param str The zero-terminated string to check
|
|
*
|
|
* \retval 0 if the string is not valid UTF-8
|
|
* \retval Non-zero if the string is valid UTF-8
|
|
*/
|
|
int ast_utf8_is_valid(const char *str);
|
|
|
|
/*!
|
|
* \brief Check if the first \a size bytes of a string are valid UTF-8
|
|
* \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
|
|
*
|
|
* Similar to \a ast_utf8_is_valid() but checks the first \a size bytes or until
|
|
* a zero byte is reached, whichever comes first.
|
|
*
|
|
* \param str The string to check
|
|
* \param size The number of bytes to evaluate
|
|
*
|
|
* \retval 0 if the string is not valid UTF-8
|
|
* \retval Non-zero if the string is valid UTF-8
|
|
*/
|
|
int ast_utf8_is_validn(const char *str, size_t size);
|
|
|
|
/*!
|
|
* \brief Copy a string safely ensuring valid UTF-8
|
|
* \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
|
|
*
|
|
* This is similar to \ref ast_copy_string, but it will only copy valid UTF-8
|
|
* sequences from the source string into the destination buffer. If an invalid
|
|
* UTF-8 sequence is encountered, or the available space in the destination
|
|
* buffer is exhausted in the middle of an otherwise valid UTF-8 sequence, the
|
|
* destination buffer will be truncated to ensure that it only contains valid
|
|
* UTF-8.
|
|
*
|
|
* \param dst The destination buffer.
|
|
* \param src The source string
|
|
* \param size The size of the destination buffer
|
|
*/
|
|
void ast_utf8_copy_string(char *dst, const char *src, size_t size);
|
|
|
|
enum ast_utf8_replace_result {
|
|
/*! \brief Source contained fully valid UTF-8
|
|
*
|
|
* The entire string was valid UTF-8 and no replacement
|
|
* was required.
|
|
*/
|
|
AST_UTF8_REPLACE_VALID,
|
|
|
|
/*! \brief Source contained at least 1 invalid UTF-8 sequence
|
|
*
|
|
* Parts of the string contained invalid UTF-8 sequences
|
|
* but those were successfully replaced with the U+FFFD
|
|
* replacement sequence.
|
|
*/
|
|
AST_UTF8_REPLACE_INVALID,
|
|
|
|
/*! \brief Not enough space to copy entire source
|
|
*
|
|
* The destination buffer wasn't large enough to copy
|
|
* all of the source characters. As many of the source
|
|
* characters that could be copied/replaced were done so
|
|
* and a final NULL terminator added.
|
|
*/
|
|
AST_UTF8_REPLACE_OVERRUN,
|
|
};
|
|
|
|
/*!
|
|
* \brief Copy a string safely replacing any invalid UTF-8 sequences
|
|
*
|
|
* This is similar to \ref ast_copy_string, but it will only copy valid UTF-8
|
|
* sequences from the source string into the destination buffer.
|
|
* If an invalid sequence is encountered, it's replaced with the \uFFFD
|
|
* sequence which is the valid UTF-8 sequence that represents an unknown,
|
|
* unrecognized, or unrepresentable character. Since \uFFFD is actually a
|
|
* 3 byte sequence, the destination buffer will need to be larger than
|
|
* the corresponding source string if it contains invalid sequences.
|
|
* You can pass NULL as the destination buffer pointer to get the actual
|
|
* size required, then call the function again with the properly sized
|
|
* buffer.
|
|
*
|
|
* \param dst Pointer to the destination buffer. If NULL,
|
|
* dst_size will be set to the size of the
|
|
* buffer required to fully process the
|
|
* source string.
|
|
* \param dst_size A pointer to the size of the dst buffer
|
|
* \param src The source string
|
|
* \param src_len The number of bytes to copy
|
|
*
|
|
* \return \ref ast_utf8_replace_result
|
|
*/
|
|
enum ast_utf8_replace_result ast_utf8_replace_invalid_chars(char *dst,
|
|
size_t *dst_size, const char *src, size_t src_len);
|
|
|
|
enum ast_utf8_validation_result {
|
|
/*! \brief The consumed sequence is valid UTF-8
|
|
*
|
|
* The bytes consumed thus far by the validator represent a valid sequence of
|
|
* UTF-8 bytes. If additional bytes are fed into the validator, it can
|
|
* transition into either \a AST_UTF8_INVALID or \a AST_UTF8_UNKNOWN
|
|
*/
|
|
AST_UTF8_VALID,
|
|
|
|
/*! \brief The consumed sequence is invalid UTF-8
|
|
*
|
|
* The bytes consumed thus far by the validator represent an invalid sequence
|
|
* of UTF-8 bytes. Feeding additional bytes into the validator will not
|
|
* change its state.
|
|
*/
|
|
AST_UTF8_INVALID,
|
|
|
|
/*! \brief The validator is in an intermediate state
|
|
*
|
|
* The validator is in the process of validating a multibyte UTF-8 sequence
|
|
* and requires additional data to be fed into it to determine validity. If
|
|
* additional bytes are fed into the validator, it can transition into either
|
|
* \a AST_UTF8_VALID or \a AST_UTF8_INVALID. If you have no additional data
|
|
* to feed into the validator the UTF-8 sequence is invalid.
|
|
*/
|
|
AST_UTF8_UNKNOWN,
|
|
};
|
|
|
|
/*!
|
|
* \brief Opaque type for UTF-8 validator state.
|
|
* \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
|
|
*/
|
|
struct ast_utf8_validator;
|
|
|
|
/*!
|
|
* \brief Create a new UTF-8 validator
|
|
* \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
|
|
*
|
|
* \param[out] validator The validator instance
|
|
*
|
|
* \retval 0 on success
|
|
* \retval -1 on failure
|
|
*/
|
|
int ast_utf8_validator_new(struct ast_utf8_validator **validator);
|
|
|
|
/*!
|
|
* \brief Feed a zero-terminated string into the UTF-8 validator
|
|
* \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
|
|
*
|
|
* \param validator The validator instance
|
|
* \param data The zero-terminated string to feed into the validator
|
|
*
|
|
* \return The \ref ast_utf8_validation_result indicating the current state of
|
|
* the validator.
|
|
*/
|
|
enum ast_utf8_validation_result ast_utf8_validator_feed(
|
|
struct ast_utf8_validator *validator, const char *data);
|
|
|
|
/*!
|
|
* \brief Feed a string into the UTF-8 validator
|
|
* \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
|
|
*
|
|
* Similar to \a ast_utf8_validator_feed but will stop feeding in data if a zero
|
|
* byte is encountered or \a size bytes have been read.
|
|
*
|
|
* \param validator The validator instance
|
|
* \param data The string to feed into the validator
|
|
* \param size The number of bytes to feed into the validator
|
|
*
|
|
* \return The \ref ast_utf8_validation_result indicating the current state of
|
|
* the validator.
|
|
*/
|
|
enum ast_utf8_validation_result ast_utf8_validator_feedn(
|
|
struct ast_utf8_validator *validator, const char *data, size_t size);
|
|
|
|
/*!
|
|
* \brief Get the current UTF-8 validator state
|
|
* \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
|
|
*
|
|
* \param validator The validator instance
|
|
*
|
|
* \return The \ref ast_utf8_validation_result indicating the current state of
|
|
* the validator.
|
|
*/
|
|
enum ast_utf8_validation_result ast_utf8_validator_state(
|
|
struct ast_utf8_validator *validator);
|
|
|
|
/*!
|
|
* \brief Reset the state of a UTF-8 validator
|
|
* \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
|
|
*
|
|
* Resets the provided UTF-8 validator to its initial state so that it can be
|
|
* reused.
|
|
*
|
|
* \param validator The validator instance to reset
|
|
*/
|
|
void ast_utf8_validator_reset(
|
|
struct ast_utf8_validator *validator);
|
|
|
|
/*!
|
|
* \brief Destroy a UTF-8 validator
|
|
* \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
|
|
*
|
|
* \param validator The validator instance to destroy
|
|
*/
|
|
void ast_utf8_validator_destroy(struct ast_utf8_validator *validator);
|
|
|
|
/*!
|
|
* \brief Register UTF-8 tests
|
|
* \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
|
|
*
|
|
* Does nothing unless TEST_FRAMEWORK is defined.
|
|
*
|
|
* \retval 0 Always
|
|
*/
|
|
int ast_utf8_init(void);
|
|
|
|
#endif /* ASTERISK_UTF8_H */
|