source/tools/toolutil/uparse.h

   1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 *******************************************************************************
   5 *
   6 *   Copyright (C) 2000-2010, International Business Machines
   7 *   Corporation and others.  All Rights Reserved.
   8 *
   9 *******************************************************************************
  10 *   file name:  uparse.h
  11 *   encoding:   US-ASCII
  12 *   tab size:   8 (not used)
  13 *   indentation:4
  14 *
  15 *   created on: 2000apr18
  16 *   created by: Markus W. Scherer
  17 *
  18 *   This file provides a parser for files that are delimited by one single
  19 *   character like ';' or TAB. Example: the Unicode Character Properties files
  20 *   like UnicodeData.txt are semicolon-delimited.
  21 */
  22
  23 #ifndef __UPARSE_H__
  24 #define __UPARSE_H__
  25
  26 #include "unicode/utypes.h"
  27
  28 /**
  29  * Is c an invariant-character whitespace?
      * Matches exactly four characters: space ' ', TAB '\t', CR '\r' and LF '\n'.
      *
  30  * @param c invariant character; c appears four times in the expansion and
      *          may be evaluated more than once, so pass an expression without
      *          side effects
      * @return true (nonzero) if c is one of the four characters above,
      *         otherwise false (0)
  31  */
  32 #define U_IS_INV_WHITESPACE(c) ((c)==' ' || (c)=='\t' || (c)=='\r' || (c)=='\n')
  33
  34 U_CDECL_BEGIN
  35
  36 /**
  37  * Skip space ' ' and TAB '\t' characters.
  38  *
  39  * @param s Pointer to characters.
  40  * @return Pointer to first character at or after s that is not a space or TAB.
      *         The result is a const pointer, like the input.
      *
      * NOTE(review): U_IS_INV_WHITESPACE() in this header also accepts '\r' and
      * '\n'. Confirm against the implementation that this function really skips
      * only space and TAB, as documented here.
  41  */
  42 U_CAPI const char * U_EXPORT2
  43 u_skipWhitespace(const char *s);
  44
  45 /**
  46  * Trim whitespace (including line endings) from the end of the string.
  47  *
  48  * @param s Pointer to the string.
      *          NOTE(review): s is not const, so the string is presumably
      *          shortened in place by writing a new terminator -- confirm.
  49  * @return Pointer to the new end of the string.
  50  */
  51 U_CAPI char * U_EXPORT2
  52 u_rtrim(char *s);
  53
  54 /**
      * Function type for the line function of u_parseDelimitedFile().
      * The parser calls it for each line after segmenting the line into fields.
      *
      * @param context The same context pointer that was passed to
      *                u_parseDelimitedFile().
      * @param fields Array of start/limit pointer pairs: fields[i][0] points to
      *               the beginning of field i and fields[i][1] points behind it,
      *               i.e., to the delimiter or the line end. The function may
      *               modify the field contents including the limit characters.
      * @param fieldCount Presumably the number of entries in fields -- TODO
      *                   confirm against the implementation.
      * @param pErrorCode ICU error code. If the function sets an error code here,
      *                   the parser returns with *pErrorCode set to an error code.
      */
  55 typedef void U_CALLCONV
  56 UParseLineFn(void *context,
  57               char *fields[][2],
  58               int32_t fieldCount,
  59               UErrorCode *pErrorCode);
  60
  61 /**
  62  * Parser for files that are similar to UnicodeData.txt:
  63  * This function opens the file and reads it line by line. It skips empty lines
  64  * and comment lines that start with a '#'.
  65  * All other lines are separated into fields with one delimiter character
  66  * (semicolon for Unicode Properties files) between two fields. The last field in
  67  * a line does not need to be terminated with a delimiter.
  68  *
  69  * For each line, after segmenting it, a line function is called.
  70  * It gets passed the array of field start and limit pointers that is
  71  * passed into this parser and filled by it for each line.
  72  * For each field i of the line, the start pointer in fields[i][0]
  73  * points to the beginning of the field, while the limit pointer in fields[i][1]
  74  * points behind the field, i.e., to the delimiter or the line end.
  75  *
  76  * The context parameter of the line function is
  77  * the same as the one for the parse function.
  78  *
  79  * The line function may modify the contents of the fields including the
  80  * limit characters.
  81  *
  82  * If the file cannot be opened, or there is a parsing error or the line function
  83  * sets *pErrorCode, then the parser returns with *pErrorCode set to an error code.
      *
      * @param filename Name of the file to open and parse.
      * @param delimiter The single character that separates two fields in a line.
      * @param fields Caller-provided array of start/limit pointer pairs; the
      *               parser fills it for each line and passes it to lineFn.
      * @param fieldCount Presumably the number of entries in fields, i.e., the
      *                   number of fields handled per line -- TODO confirm.
      * @param lineFn Line function that is called for each segmented line.
      * @param context Pointer that is handed to lineFn as its context parameter.
      * @param pErrorCode ICU error code; set to an error code on failure.
  84  */
  85 U_CAPI void U_EXPORT2
  86 u_parseDelimitedFile(const char *filename, char delimiter,
  87                      char *fields[][2], int32_t fieldCount,
  88                      UParseLineFn *lineFn, void *context,
  89                      UErrorCode *pErrorCode);
  90
  91 /**
  92  * Parse a string of code points like 0061 0308 0300.
  93  * s must end with either ';' or NUL.
  94  *
      * @param s Input char * string.
      * @param dest Output buffer that receives the parsed code points.
      * @param destCapacity Capacity of dest, presumably counted in code points
      *                     (uint32_t elements) -- TODO confirm.
      * @param pErrorCode ICU error code.
  95  * @return Number of code points.
  96  */
  97 U_CAPI int32_t U_EXPORT2
  98 u_parseCodePoints(const char *s,
  99                   uint32_t *dest, int32_t destCapacity,
 100                   UErrorCode *pErrorCode);
 101
 102 /**
 103  * Parse a list of code points like 0061 0308 0300
 104  * into a UChar * string.
 105  * s must end with either ';' or NUL.
 106  *
 107  * Optionally also store the first code point in *pFirst.
 108  *
 109  * @param s Input char * string.
 110  * @param dest Output string buffer.
 111  * @param destCapacity Capacity of dest in numbers of UChars.
 112  * @param pFirst If pFirst!=NULL then *pFirst will be set to the first
 113  *               code point in the string.
 114  * @param pErrorCode ICU error code.
 115  * @return The length of the string in numbers of UChars.
 116  */
 117 U_CAPI int32_t U_EXPORT2
 118 u_parseString(const char *s,
 119               UChar *dest, int32_t destCapacity,
 120               uint32_t *pFirst,
 121               UErrorCode *pErrorCode);
 122
 123 /**
 124  * Parse a code point range like
 125  * 0085 or
 126  * 4E00..9FA5.
 127  *
 128  * s must contain such a range and end with either ';' or NUL.
 129  *
      * @param s Input char * string.
      * @param pStart Output: receives the start code point of the range.
      * @param pEnd Output: receives the end code point of the range; the end is
      *             inclusive, as the returned length is end-start+1.
      *             For a single code point like 0085 the end presumably equals
      *             the start -- TODO confirm.
      * @param pErrorCode ICU error code.
 130  * @return Length of code point range, end-start+1
 131  */
 132 U_CAPI int32_t U_EXPORT2
 133 u_parseCodePointRange(const char *s,
 134                       uint32_t *pStart, uint32_t *pEnd,
 135                       UErrorCode *pErrorCode);
 136
 137 /**
 138  * Same as u_parseCodePointRange() but the range may be terminated by
 139  * any character. The position of the terminating character is returned via
 140  * the *terminator output parameter.
      *
      * @param s Input char * string.
      * @param pStart Output: receives the start code point of the range.
      * @param pEnd Output: receives the end code point of the range.
      * @param terminator Output: *terminator receives the position of the
      *                   character that terminates the range.
      * @param pErrorCode ICU error code.
      * @return Length of the code point range, as for u_parseCodePointRange().
 141  */
 142 U_CAPI int32_t U_EXPORT2
 143 u_parseCodePointRangeAnyTerminator(const char *s,
 144                                    uint32_t *pStart, uint32_t *pEnd,
 145                                    const char **terminator,
 146                                    UErrorCode *pErrorCode);
 147
     /**
      * NOTE(review): this function has no documentation in this header.
      * Judging only by its name and signature, it presumably parses the text in
      * source (sLen chars) and writes UTF-8 into dest (capacity destCapacity),
      * returning a length and reporting errors via *status -- TODO confirm
      * against the implementation before relying on any of this.
      */
 148 U_CAPI int32_t U_EXPORT2
 149 u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status);
 150
 151 U_CDECL_END
 152
 153 #endif