/**********************************************************************
  utf_8.c -  Oniguruma (regular expression library)
**********************************************************************/
/*-
 * Copyright (c) 2002-2007  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
|
---|
| 29 |
|
---|
| 30 | #include "regenc.h"
|
---|
[331] | 31 | #ifdef RUBY
|
---|
| 32 | # include "encindex.h"
|
---|
| 33 | #endif
|
---|
[279] | 34 |
|
---|
/*
 * Encoding index used in the encoding definition at the bottom of this
 * file.  Falls back to 0 when no definition is in scope ("encindex.h"
 * is only included when RUBY is defined; presumably it supplies the
 * real value there).
 */
#if !defined(ENCINDEX_UTF_8)
#define ENCINDEX_UTF_8 0
#endif

/* Map the bytes 0xfe / 0xff to dedicated "virtual" code points. */
#define USE_INVALID_CODE_SCHEME

#if defined(USE_INVALID_CODE_SCHEME)
/* virtual codepoint values for invalid encoding byte 0xfe and 0xff */
#define INVALID_CODE_FE 0xfffffffe
#define INVALID_CODE_FF 0xffffffff
#endif

/* Largest code point that is encoded as a regular (4-byte) sequence. */
#define VALID_CODE_LIMIT 0x0010ffff

/*
 * Non-zero unless the byte has the bit pattern 10xxxxxx, i.e. unless it
 * is a UTF-8 continuation (trail) byte.
 */
#define utf8_islead(c) (((UChar )((c) & 0xc0)) != 0x80)
|
---|
| 49 |
|
---|
/*
 * Nominal sequence length, in bytes, indexed by the first byte of a
 * character.  Bytes that cannot start a multi-byte sequence (0x80-0xbf
 * and 0xf5-0xff) are listed as length 1.  Note that 0xc0 and 0xc1 are
 * listed as length 2 here; this table only gives the nominal length,
 * it does not validate the sequence.
 */
static const int EncLen_UTF8[] = {
  /* 0x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 1x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 2x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 3x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 4x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 5x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 6x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 7x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 8x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 9x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* ax */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* bx */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* cx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* dx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* ex */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* fx */ 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
|
---|
| 68 |
|
---|
/*
 * States of the UTF-8 validating automaton.  The two negative values are
 * terminal; S0..S7 (0..7) double as row indices into trans[].
 */
typedef enum {
  FAILURE = -2,
  ACCEPT,
  S0, S1, S2, S3,
  S4, S5, S6, S7
} state_t;

/*
 * trans[state][byte] is the state reached after reading `byte` in
 * `state`.  Each row below is written as sixteen 16-entry groups, one
 * per high nibble of the byte.
 */
#define A ACCEPT
#define F FAILURE
#define ROW16(v) v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v
static const signed char trans[][0x100] = {
  { /* S0: first byte of a character */
    /* 00-7f */ ROW16(A), ROW16(A), ROW16(A), ROW16(A),
                ROW16(A), ROW16(A), ROW16(A), ROW16(A),
    /* 80-bf */ ROW16(F), ROW16(F), ROW16(F), ROW16(F),
    /*            0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* c0-cf */   F, F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* d0-df */ ROW16(1),
    /* e0-ef */   2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
    /* f0-ff */   5, 6, 6, 6, 7, F, F, F, F, F, F, F, F, F, F, F
  },
  { /* S1: last trail byte, 80-bf completes the character */
    /* 00-7f */ ROW16(F), ROW16(F), ROW16(F), ROW16(F),
                ROW16(F), ROW16(F), ROW16(F), ROW16(F),
    /* 80-bf */ ROW16(A), ROW16(A), ROW16(A), ROW16(A),
    /* c0-ff */ ROW16(F), ROW16(F), ROW16(F), ROW16(F)
  },
  { /* S2: second byte after e0, only a0-bf continues (to S1) */
    /* 00-7f */ ROW16(F), ROW16(F), ROW16(F), ROW16(F),
                ROW16(F), ROW16(F), ROW16(F), ROW16(F),
    /* 80-9f */ ROW16(F), ROW16(F),
    /* a0-bf */ ROW16(1), ROW16(1),
    /* c0-ff */ ROW16(F), ROW16(F), ROW16(F), ROW16(F)
  },
  { /* S3: trail byte with one more to follow, 80-bf continues (to S1) */
    /* 00-7f */ ROW16(F), ROW16(F), ROW16(F), ROW16(F),
                ROW16(F), ROW16(F), ROW16(F), ROW16(F),
    /* 80-bf */ ROW16(1), ROW16(1), ROW16(1), ROW16(1),
    /* c0-ff */ ROW16(F), ROW16(F), ROW16(F), ROW16(F)
  },
  { /* S4: second byte after ed, only 80-9f continues (to S1) */
    /* 00-7f */ ROW16(F), ROW16(F), ROW16(F), ROW16(F),
                ROW16(F), ROW16(F), ROW16(F), ROW16(F),
    /* 80-9f */ ROW16(1), ROW16(1),
    /* a0-bf */ ROW16(F), ROW16(F),
    /* c0-ff */ ROW16(F), ROW16(F), ROW16(F), ROW16(F)
  },
  { /* S5: second byte after f0, only 90-bf continues (to S3) */
    /* 00-7f */ ROW16(F), ROW16(F), ROW16(F), ROW16(F),
                ROW16(F), ROW16(F), ROW16(F), ROW16(F),
    /* 80-8f */ ROW16(F),
    /* 90-bf */ ROW16(3), ROW16(3), ROW16(3),
    /* c0-ff */ ROW16(F), ROW16(F), ROW16(F), ROW16(F)
  },
  { /* S6: second byte after f1-f3, 80-bf continues (to S3) */
    /* 00-7f */ ROW16(F), ROW16(F), ROW16(F), ROW16(F),
                ROW16(F), ROW16(F), ROW16(F), ROW16(F),
    /* 80-bf */ ROW16(3), ROW16(3), ROW16(3), ROW16(3),
    /* c0-ff */ ROW16(F), ROW16(F), ROW16(F), ROW16(F)
  },
  { /* S7: second byte after f4, only 80-8f continues (to S3) */
    /* 00-7f */ ROW16(F), ROW16(F), ROW16(F), ROW16(F),
                ROW16(F), ROW16(F), ROW16(F), ROW16(F),
    /* 80-8f */ ROW16(3),
    /* 90-bf */ ROW16(F), ROW16(F), ROW16(F),
    /* c0-ff */ ROW16(F), ROW16(F), ROW16(F), ROW16(F)
  },
};
#undef ROW16
#undef A
#undef F
|
---|
| 225 |
|
---|
[279] | 226 | static int
|
---|
[331] | 227 | mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
|
---|
[279] | 228 | {
|
---|
[331] | 229 | int firstbyte = *p++;
|
---|
| 230 | state_t s;
|
---|
| 231 | s = trans[0][firstbyte];
|
---|
| 232 | if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
|
---|
| 233 | ONIGENC_CONSTRUCT_MBCLEN_INVALID();
|
---|
| 234 |
|
---|
| 235 | if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-1);
|
---|
| 236 | s = trans[s][*p++];
|
---|
| 237 | if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
|
---|
| 238 | ONIGENC_CONSTRUCT_MBCLEN_INVALID();
|
---|
| 239 |
|
---|
| 240 | if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-2);
|
---|
| 241 | s = trans[s][*p++];
|
---|
| 242 | if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) :
|
---|
| 243 | ONIGENC_CONSTRUCT_MBCLEN_INVALID();
|
---|
| 244 |
|
---|
| 245 | if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-3);
|
---|
| 246 | s = trans[s][*p++];
|
---|
| 247 | return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4) :
|
---|
| 248 | ONIGENC_CONSTRUCT_MBCLEN_INVALID();
|
---|
[279] | 249 | }
|
---|
| 250 |
|
---|
| 251 | static int
|
---|
[331] | 252 | is_mbc_newline(const UChar* p, const UChar* end, OnigEncoding enc)
|
---|
[279] | 253 | {
|
---|
| 254 | if (p < end) {
|
---|
| 255 | if (*p == 0x0a) return 1;
|
---|
| 256 |
|
---|
| 257 | #ifdef USE_UNICODE_ALL_LINE_TERMINATORS
|
---|
| 258 | if (*p == 0x0b || *p == 0x0c || *p == 0x0d) return 1;
|
---|
| 259 | if (p + 1 < end) {
|
---|
| 260 | if (*(p+1) == 0x85 && *p == 0xc2) /* U+0085 */
|
---|
| 261 | return 1;
|
---|
| 262 | if (p + 2 < end) {
|
---|
| 263 | if ((*(p+2) == 0xa8 || *(p+2) == 0xa9)
|
---|
| 264 | && *(p+1) == 0x80 && *p == 0xe2) /* U+2028, U+2029 */
|
---|
| 265 | return 1;
|
---|
| 266 | }
|
---|
| 267 | }
|
---|
| 268 | #endif
|
---|
| 269 | }
|
---|
| 270 |
|
---|
| 271 | return 0;
|
---|
| 272 | }
|
---|
| 273 |
|
---|
| 274 | static OnigCodePoint
|
---|
[331] | 275 | mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
|
---|
[279] | 276 | {
|
---|
| 277 | int c, len;
|
---|
| 278 | OnigCodePoint n;
|
---|
| 279 |
|
---|
[331] | 280 | len = mbc_enc_len(p, end, enc);
|
---|
[279] | 281 | c = *p++;
|
---|
| 282 | if (len > 1) {
|
---|
| 283 | len--;
|
---|
| 284 | n = c & ((1 << (6 - len)) - 1);
|
---|
| 285 | while (len--) {
|
---|
| 286 | c = *p++;
|
---|
| 287 | n = (n << 6) | (c & ((1 << 6) - 1));
|
---|
| 288 | }
|
---|
| 289 | return n;
|
---|
| 290 | }
|
---|
| 291 | else {
|
---|
| 292 | #ifdef USE_INVALID_CODE_SCHEME
|
---|
| 293 | if (c > 0xfd) {
|
---|
| 294 | return ((c == 0xfe) ? INVALID_CODE_FE : INVALID_CODE_FF);
|
---|
| 295 | }
|
---|
| 296 | #endif
|
---|
| 297 | return (OnigCodePoint )c;
|
---|
| 298 | }
|
---|
| 299 | }
|
---|
| 300 |
|
---|
| 301 | static int
|
---|
[331] | 302 | code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
|
---|
[279] | 303 | {
|
---|
| 304 | if ((code & 0xffffff80) == 0) return 1;
|
---|
| 305 | else if ((code & 0xfffff800) == 0) return 2;
|
---|
| 306 | else if ((code & 0xffff0000) == 0) return 3;
|
---|
[331] | 307 | else if (code <= VALID_CODE_LIMIT) return 4;
|
---|
[279] | 308 | #ifdef USE_INVALID_CODE_SCHEME
|
---|
| 309 | else if (code == INVALID_CODE_FE) return 1;
|
---|
| 310 | else if (code == INVALID_CODE_FF) return 1;
|
---|
| 311 | #endif
|
---|
| 312 | else
|
---|
[331] | 313 | return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
|
---|
[279] | 314 | }
|
---|
| 315 |
|
---|
| 316 | static int
|
---|
[331] | 317 | code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc ARG_UNUSED)
|
---|
[279] | 318 | {
|
---|
| 319 | #define UTF8_TRAILS(code, shift) (UChar )((((code) >> (shift)) & 0x3f) | 0x80)
|
---|
| 320 | #define UTF8_TRAIL0(code) (UChar )(((code) & 0x3f) | 0x80)
|
---|
| 321 |
|
---|
| 322 | if ((code & 0xffffff80) == 0) {
|
---|
| 323 | *buf = (UChar )code;
|
---|
| 324 | return 1;
|
---|
| 325 | }
|
---|
| 326 | else {
|
---|
| 327 | UChar *p = buf;
|
---|
| 328 |
|
---|
| 329 | if ((code & 0xfffff800) == 0) {
|
---|
| 330 | *p++ = (UChar )(((code>>6)& 0x1f) | 0xc0);
|
---|
| 331 | }
|
---|
| 332 | else if ((code & 0xffff0000) == 0) {
|
---|
| 333 | *p++ = (UChar )(((code>>12) & 0x0f) | 0xe0);
|
---|
| 334 | *p++ = UTF8_TRAILS(code, 6);
|
---|
| 335 | }
|
---|
[331] | 336 | else if (code <= VALID_CODE_LIMIT) {
|
---|
[279] | 337 | *p++ = (UChar )(((code>>18) & 0x07) | 0xf0);
|
---|
| 338 | *p++ = UTF8_TRAILS(code, 12);
|
---|
| 339 | *p++ = UTF8_TRAILS(code, 6);
|
---|
| 340 | }
|
---|
| 341 | #ifdef USE_INVALID_CODE_SCHEME
|
---|
| 342 | else if (code == INVALID_CODE_FE) {
|
---|
| 343 | *p = 0xfe;
|
---|
| 344 | return 1;
|
---|
| 345 | }
|
---|
| 346 | else if (code == INVALID_CODE_FF) {
|
---|
| 347 | *p = 0xff;
|
---|
| 348 | return 1;
|
---|
| 349 | }
|
---|
| 350 | #endif
|
---|
| 351 | else {
|
---|
| 352 | return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
|
---|
| 353 | }
|
---|
| 354 |
|
---|
| 355 | *p++ = UTF8_TRAIL0(code);
|
---|
| 356 | return (int )(p - buf);
|
---|
| 357 | }
|
---|
| 358 | }
|
---|
| 359 |
|
---|
| 360 | static int
|
---|
| 361 | mbc_case_fold(OnigCaseFoldType flag, const UChar** pp,
|
---|
[331] | 362 | const UChar* end, UChar* fold, OnigEncoding enc)
|
---|
[279] | 363 | {
|
---|
| 364 | const UChar* p = *pp;
|
---|
| 365 |
|
---|
| 366 | if (ONIGENC_IS_MBC_ASCII(p)) {
|
---|
| 367 | #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
|
---|
| 368 | if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
|
---|
| 369 | if (*p == 0x49) {
|
---|
| 370 | *fold++ = 0xc4;
|
---|
| 371 | *fold = 0xb1;
|
---|
| 372 | (*pp)++;
|
---|
| 373 | return 2;
|
---|
| 374 | }
|
---|
| 375 | }
|
---|
| 376 | #endif
|
---|
| 377 |
|
---|
| 378 | *fold = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
|
---|
| 379 | (*pp)++;
|
---|
| 380 | return 1; /* return byte length of converted char to lower */
|
---|
| 381 | }
|
---|
| 382 | else {
|
---|
[331] | 383 | return onigenc_unicode_mbc_case_fold(enc, flag, pp, end, fold);
|
---|
[279] | 384 | }
|
---|
| 385 | }
|
---|
| 386 |
|
---|
| 387 |
|
---|
| 388 | static int
|
---|
| 389 | get_ctype_code_range(OnigCtype ctype, OnigCodePoint *sb_out,
|
---|
[331] | 390 | const OnigCodePoint* ranges[], OnigEncoding enc ARG_UNUSED)
|
---|
[279] | 391 | {
|
---|
| 392 | *sb_out = 0x80;
|
---|
| 393 | return onigenc_unicode_ctype_code_range(ctype, ranges);
|
---|
| 394 | }
|
---|
| 395 |
|
---|
| 396 |
|
---|
| 397 | static UChar*
|
---|
[331] | 398 | left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc ARG_UNUSED)
|
---|
[279] | 399 | {
|
---|
| 400 | const UChar *p;
|
---|
| 401 |
|
---|
| 402 | if (s <= start) return (UChar* )s;
|
---|
| 403 | p = s;
|
---|
| 404 |
|
---|
| 405 | while (!utf8_islead(*p) && p > start) p--;
|
---|
| 406 | return (UChar* )p;
|
---|
| 407 | }
|
---|
| 408 |
|
---|
| 409 | static int
|
---|
| 410 | get_case_fold_codes_by_str(OnigCaseFoldType flag,
|
---|
[331] | 411 | const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[],
|
---|
| 412 | OnigEncoding enc)
|
---|
[279] | 413 | {
|
---|
[331] | 414 | return onigenc_unicode_get_case_fold_codes_by_str(enc, flag, p, end, items);
|
---|
[279] | 415 | }
|
---|
| 416 |
|
---|
[331] | 417 | OnigEncodingDefine(utf_8, UTF_8) = {
|
---|
[279] | 418 | mbc_enc_len,
|
---|
| 419 | "UTF-8", /* name */
|
---|
[331] | 420 | 4, /* max byte length */
|
---|
[279] | 421 | 1, /* min byte length */
|
---|
| 422 | is_mbc_newline,
|
---|
| 423 | mbc_to_code,
|
---|
| 424 | code_to_mbclen,
|
---|
| 425 | code_to_mbc,
|
---|
| 426 | mbc_case_fold,
|
---|
| 427 | onigenc_unicode_apply_all_case_fold,
|
---|
| 428 | get_case_fold_codes_by_str,
|
---|
| 429 | onigenc_unicode_property_name_to_ctype,
|
---|
| 430 | onigenc_unicode_is_code_ctype,
|
---|
| 431 | get_ctype_code_range,
|
---|
| 432 | left_adjust_char_head,
|
---|
| 433 | onigenc_always_true_is_allowed_reverse_match,
|
---|
[331] | 434 | onigenc_unicode_case_map,
|
---|
| 435 | ENCINDEX_UTF_8,
|
---|
[279] | 436 | ONIGENC_FLAG_UNICODE,
|
---|
| 437 | };
|
---|
[331] | 438 | ENC_ALIAS("CP65001", "UTF-8")
|
---|
| 439 |
|
---|
| 440 | /*
|
---|
| 441 | * Name: UTF8-MAC
|
---|
| 442 | * Link: http://developer.apple.com/documentation/MacOSX/Conceptual/BPFileSystem/BPFileSystem.html
|
---|
| 443 | * Link: http://developer.apple.com/qa/qa2001/qa1235.html
|
---|
| 444 | * Link: http://developer.apple.com/jp/qa/qa2001/qa1235.html
|
---|
| 445 | * Link: http://www.gnu.org/software/emacs/NEWS.23.2
|
---|
| 446 | */
|
---|
| 447 | ENC_REPLICATE("UTF8-MAC", "UTF-8")
|
---|
| 448 | ENC_ALIAS("UTF-8-MAC", "UTF8-MAC")
|
---|
| 449 | ENC_ALIAS("UTF-8-HFS", "UTF8-MAC") /* Emacs 23.2 */
|
---|