blob: 936dddeafa391b28c64ef4e2252fc3e481ff271d [file] [log] [blame]
/*
******************************************************************************
* Copyright (C) 2006-2008 Apple Inc. All Rights Reserved.
******************************************************************************
*/
#ifndef URBTOK_H
#define URBTOK_H
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/utext.h"
#include "unicode/ubrk.h"
#include "unicode/parseerr.h"
/**
 * A (location, length) pair describing one token. urbtok_tokenize() fills in
 * an array of these. The unit of measurement is not stated in this header.
 */
typedef struct RuleBasedTokenRange {
    long location;  /* start of the range */
    long length;    /* extent of the range */
} RuleBasedTokenRange;
/**
 * Create a UBreakIterator that tokenizes text according to caller-supplied
 * breaking rules, which are parsed by this call.
 * The rule syntax is not yet documented here (TBD).
 *
 * @param rules        The rules specifying the text breaking conventions.
 * @param rulesLength  Count of characters in rules, or -1 when rules is
 *                     null-terminated.
 * @param parseErr     Filled in with position and context information for
 *                     any syntax error detected while parsing the rules.
 * @param status       A UErrorCode to receive any errors.
 * @return A UBreakIterator for the specified rules.
 * @see ubrk_open
 * @internal
 */
U_INTERNAL UBreakIterator* U_EXPORT2
urbtok_openRules(const UChar *rules, int32_t rulesLength,
                 UParseError *parseErr, UErrorCode *status);
/**
 * Create a UBreakIterator for tokenizing text from binary breaking rules,
 * copying the rules.
 *
 * Because this variant copies the rules, the caller is free to close or
 * release the buffer passed in as soon as this function has been called. The
 * copy is freed when ubrk_close() is called on the returned UBreakIterator*.
 *
 * @param rules   The binary rules specifying the text breaking conventions.
 *                They must be at least 32-bit aligned.
 * @param status  A UErrorCode to receive any errors.
 * @return A UBreakIterator for the specified rules.
 * @see ubrk_open
 * @internal
 */
U_INTERNAL UBreakIterator* U_EXPORT2
urbtok_openBinaryRules(const uint8_t *rules, UErrorCode *status);
/**
 * Create a UBreakIterator for tokenizing text from binary breaking rules,
 * WITHOUT copying the rules.
 *
 * Because no copy is made, the caller must not close or release the rules
 * buffer until it is finished with this UBreakIterator* (and with any others
 * created using the same rules) and has called ubrk_close() on each of them.
 *
 * @param rules   The binary rules specifying the text breaking conventions.
 *                They must be at least 32-bit aligned.
 * @param status  A UErrorCode to receive any errors.
 * @return A UBreakIterator for the specified rules.
 * @see ubrk_open
 * @internal
 */
U_INTERNAL UBreakIterator* U_EXPORT2
urbtok_openBinaryRulesNoCopy(const uint8_t *rules, UErrorCode *status);
/**
 * Retrieve the binary break rules of a tokenizer, in native-endian form.
 *
 * @param bi        The tokenizer to query.
 * @param buffer    Output buffer that receives the rules. Pass 0 to obtain
 *                  the required size.
 * @param buffSize  Size of the output buffer.
 * @param status    A UErrorCode to receive any errors.
 * @return The actual size of the binary rules, whether or not they fit in
 *         the buffer.
 * @internal
 */
U_INTERNAL uint32_t U_EXPORT2
urbtok_getBinaryRules(UBreakIterator *bi, uint8_t *buffer,
                      uint32_t buffSize, UErrorCode *status);
/**
 * Run a rule-based tokenizer over text and collect the resulting tokens.
 *
 * @param bi             The tokenizer to use.
 * @param maxTokens      Upper limit on the number of tokens to return.
 * @param outTokens      Array of RuleBasedTokenRange to fill in with the
 *                       tokens.
 * @param outTokenFlags  Optional array of unsigned long to fill in with
 *                       token flags. Note that the element type is
 *                       unsigned long, as declared below, not a fixed-width
 *                       32-bit type.
 * @return The number of tokens returned, 0 if done.
 * @internal
 */
U_INTERNAL int32_t U_EXPORT2
urbtok_tokenize(UBreakIterator *bi, int32_t maxTokens,
                RuleBasedTokenRange *outTokens, unsigned long *outTokenFlags);
/**
 * Convert a set of binary break rules from one endianness to another.
 *
 * @param rules           The rules that need swapping.
 * @param buffer          Output buffer for the swapped rules; it must be the
 *                        same size as the input rules buffer.
 * @param inIsBigEndian   UBool indicating whether the input is big-endian.
 * @param outIsBigEndian  UBool indicating whether the output should be
 *                        big-endian.
 * @param status          A UErrorCode to receive any errors.
 * @internal
 */
U_INTERNAL void U_EXPORT2
urbtok_swapBinaryRules(const uint8_t *rules, uint8_t *buffer,
                       UBool inIsBigEndian, UBool outIsBigEndian,
                       UErrorCode *status);
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
#endif