source: EcnlProtoTool/trunk/onigmo-6.1.3/src/enc/utf_8.c@ 331

Last change on this file since 331 was 331, checked in by coas-nagasima, 6 years ago

prototoolに関連するプロジェクトをnewlibからmuslを使うよう変更・更新
ntshellをnewlibの下位の実装から、muslのsyscallの実装に変更・更新
以下のOSSをアップデート
・mruby-1.3.0
・musl-1.1.18
・onigmo-6.1.3
・tcc-0.9.27
以下のOSSを追加
・openssl-1.1.0e
・curl-7.57.0
・zlib-1.2.11
以下のmrbgemsを追加
・iij/mruby-digest
・iij/mruby-env
・iij/mruby-errno
・iij/mruby-iijson
・iij/mruby-ipaddr
・iij/mruby-mock
・iij/mruby-require
・iij/mruby-tls-openssl

  • Property svn:eol-style set to native
  • Property svn:mime-type set to text/x-csrc;charset=UTF-8
File size: 16.7 KB
Line 
/**********************************************************************
  utf_8.c -  Oniguruma (regular expression library)
**********************************************************************/
/*-
 * Copyright (c) 2002-2007  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
29
30#include "regenc.h"
31#ifdef RUBY
32# include "encindex.h"
33#endif
34
/* Fallback encoding index, used when no header has defined one already. */
#ifndef ENCINDEX_UTF_8
# define ENCINDEX_UTF_8 0
#endif

/* Enables the virtual code points below for the bytes 0xfe and 0xff. */
#define USE_INVALID_CODE_SCHEME

#ifdef USE_INVALID_CODE_SCHEME
/* virtual codepoint values for invalid encoding byte 0xfe and 0xff */
# define INVALID_CODE_FE   0xfffffffe
# define INVALID_CODE_FF   0xffffffff
#endif
/* Largest code point that is encoded as a regular (4-byte) sequence. */
#define VALID_CODE_LIMIT    0x0010ffff

/* True unless c has the bit pattern 10xxxxxx, i.e. is a UTF-8 trail byte. */
#define utf8_islead(c)     ((UChar )((c) & 0xc0) != 0x80)
49
/*
 * Nominal sequence length indexed by the first byte of a character.
 * 0x00-0xbf -> 1, 0xc0-0xdf -> 2, 0xe0-0xef -> 3, 0xf0-0xf4 -> 4,
 * 0xf5-0xff -> 1.  The table does not validate anything; it only tells
 * how many bytes a lead byte announces.
 */
static const int EncLen_UTF8[] = {
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
68
/*
 * States of the UTF-8 validation automaton.  FAILURE and ACCEPT are the two
 * negative terminal results; S0..S7 (0..7) index the rows of trans[] below.
 */
typedef enum {
  FAILURE = -2,
  ACCEPT,
  S0, S1, S2, S3,
  S4, S5, S6, S7
} state_t;
#define A ACCEPT
#define F FAILURE
/*
 * trans[state][byte] is the state reached after reading `byte` in `state`.
 * Plain numbers in the table are state_t values (1 == S1, ..., 7 == S7).
 *
 *   S0  initial state, looking at the lead byte
 *   S1  one trail byte 0x80-0xbf left
 *   S2  after 0xe0: next byte must be 0xa0-0xbf, then S1
 *   S3  two trail bytes 0x80-0xbf left
 *   S4  after 0xed: next byte must be 0x80-0x9f, then S1
 *   S5  after 0xf0: next byte must be 0x90-0xbf, then S3
 *   S6  after 0xf1-0xf3: next byte must be 0x80-0xbf, then S3
 *   S7  after 0xf4: next byte must be 0x80-0x8f, then S3
 */
static const signed char trans[][0x100] = {
  { /* S0   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* c */ F, F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* e */ 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
    /* f */ 5, 6, 6, 6, 7, F, F, F, F, F, F, F, F, F, F, F
  },
  { /* S1   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 9 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* a */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  },
  { /* S2   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  },
  { /* S3   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  },
  { /* S4   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  },
  { /* S5   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 9 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* a */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* b */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  },
  { /* S6   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 9 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* a */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* b */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  },
  { /* S7   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
  },
};
#undef A
#undef F
225
226static int
227mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
228{
229 int firstbyte = *p++;
230 state_t s;
231 s = trans[0][firstbyte];
232 if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
233 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
234
235 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-1);
236 s = trans[s][*p++];
237 if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
238 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
239
240 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-2);
241 s = trans[s][*p++];
242 if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) :
243 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
244
245 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-3);
246 s = trans[s][*p++];
247 return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4) :
248 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
249}
250
251static int
252is_mbc_newline(const UChar* p, const UChar* end, OnigEncoding enc)
253{
254 if (p < end) {
255 if (*p == 0x0a) return 1;
256
257#ifdef USE_UNICODE_ALL_LINE_TERMINATORS
258 if (*p == 0x0b || *p == 0x0c || *p == 0x0d) return 1;
259 if (p + 1 < end) {
260 if (*(p+1) == 0x85 && *p == 0xc2) /* U+0085 */
261 return 1;
262 if (p + 2 < end) {
263 if ((*(p+2) == 0xa8 || *(p+2) == 0xa9)
264 && *(p+1) == 0x80 && *p == 0xe2) /* U+2028, U+2029 */
265 return 1;
266 }
267 }
268#endif
269 }
270
271 return 0;
272}
273
274static OnigCodePoint
275mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
276{
277 int c, len;
278 OnigCodePoint n;
279
280 len = mbc_enc_len(p, end, enc);
281 c = *p++;
282 if (len > 1) {
283 len--;
284 n = c & ((1 << (6 - len)) - 1);
285 while (len--) {
286 c = *p++;
287 n = (n << 6) | (c & ((1 << 6) - 1));
288 }
289 return n;
290 }
291 else {
292#ifdef USE_INVALID_CODE_SCHEME
293 if (c > 0xfd) {
294 return ((c == 0xfe) ? INVALID_CODE_FE : INVALID_CODE_FF);
295 }
296#endif
297 return (OnigCodePoint )c;
298 }
299}
300
301static int
302code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
303{
304 if ((code & 0xffffff80) == 0) return 1;
305 else if ((code & 0xfffff800) == 0) return 2;
306 else if ((code & 0xffff0000) == 0) return 3;
307 else if (code <= VALID_CODE_LIMIT) return 4;
308#ifdef USE_INVALID_CODE_SCHEME
309 else if (code == INVALID_CODE_FE) return 1;
310 else if (code == INVALID_CODE_FF) return 1;
311#endif
312 else
313 return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
314}
315
316static int
317code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc ARG_UNUSED)
318{
319#define UTF8_TRAILS(code, shift) (UChar )((((code) >> (shift)) & 0x3f) | 0x80)
320#define UTF8_TRAIL0(code) (UChar )(((code) & 0x3f) | 0x80)
321
322 if ((code & 0xffffff80) == 0) {
323 *buf = (UChar )code;
324 return 1;
325 }
326 else {
327 UChar *p = buf;
328
329 if ((code & 0xfffff800) == 0) {
330 *p++ = (UChar )(((code>>6)& 0x1f) | 0xc0);
331 }
332 else if ((code & 0xffff0000) == 0) {
333 *p++ = (UChar )(((code>>12) & 0x0f) | 0xe0);
334 *p++ = UTF8_TRAILS(code, 6);
335 }
336 else if (code <= VALID_CODE_LIMIT) {
337 *p++ = (UChar )(((code>>18) & 0x07) | 0xf0);
338 *p++ = UTF8_TRAILS(code, 12);
339 *p++ = UTF8_TRAILS(code, 6);
340 }
341#ifdef USE_INVALID_CODE_SCHEME
342 else if (code == INVALID_CODE_FE) {
343 *p = 0xfe;
344 return 1;
345 }
346 else if (code == INVALID_CODE_FF) {
347 *p = 0xff;
348 return 1;
349 }
350#endif
351 else {
352 return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
353 }
354
355 *p++ = UTF8_TRAIL0(code);
356 return (int )(p - buf);
357 }
358}
359
360static int
361mbc_case_fold(OnigCaseFoldType flag, const UChar** pp,
362 const UChar* end, UChar* fold, OnigEncoding enc)
363{
364 const UChar* p = *pp;
365
366 if (ONIGENC_IS_MBC_ASCII(p)) {
367#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
368 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
369 if (*p == 0x49) {
370 *fold++ = 0xc4;
371 *fold = 0xb1;
372 (*pp)++;
373 return 2;
374 }
375 }
376#endif
377
378 *fold = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
379 (*pp)++;
380 return 1; /* return byte length of converted char to lower */
381 }
382 else {
383 return onigenc_unicode_mbc_case_fold(enc, flag, pp, end, fold);
384 }
385}
386
387
388static int
389get_ctype_code_range(OnigCtype ctype, OnigCodePoint *sb_out,
390 const OnigCodePoint* ranges[], OnigEncoding enc ARG_UNUSED)
391{
392 *sb_out = 0x80;
393 return onigenc_unicode_ctype_code_range(ctype, ranges);
394}
395
396
397static UChar*
398left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc ARG_UNUSED)
399{
400 const UChar *p;
401
402 if (s <= start) return (UChar* )s;
403 p = s;
404
405 while (!utf8_islead(*p) && p > start) p--;
406 return (UChar* )p;
407}
408
409static int
410get_case_fold_codes_by_str(OnigCaseFoldType flag,
411 const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[],
412 OnigEncoding enc)
413{
414 return onigenc_unicode_get_case_fold_codes_by_str(enc, flag, p, end, items);
415}
416
417OnigEncodingDefine(utf_8, UTF_8) = {
418 mbc_enc_len,
419 "UTF-8", /* name */
420 4, /* max byte length */
421 1, /* min byte length */
422 is_mbc_newline,
423 mbc_to_code,
424 code_to_mbclen,
425 code_to_mbc,
426 mbc_case_fold,
427 onigenc_unicode_apply_all_case_fold,
428 get_case_fold_codes_by_str,
429 onigenc_unicode_property_name_to_ctype,
430 onigenc_unicode_is_code_ctype,
431 get_ctype_code_range,
432 left_adjust_char_head,
433 onigenc_always_true_is_allowed_reverse_match,
434 onigenc_unicode_case_map,
435 ENCINDEX_UTF_8,
436 ONIGENC_FLAG_UNICODE,
437};
438ENC_ALIAS("CP65001", "UTF-8")
439
440/*
441 * Name: UTF8-MAC
442 * Link: http://developer.apple.com/documentation/MacOSX/Conceptual/BPFileSystem/BPFileSystem.html
443 * Link: http://developer.apple.com/qa/qa2001/qa1235.html
444 * Link: http://developer.apple.com/jp/qa/qa2001/qa1235.html
445 * Link: http://www.gnu.org/software/emacs/NEWS.23.2
446 */
447ENC_REPLICATE("UTF8-MAC", "UTF-8")
448ENC_ALIAS("UTF-8-MAC", "UTF8-MAC")
449ENC_ALIAS("UTF-8-HFS", "UTF8-MAC") /* Emacs 23.2 */
Note: See TracBrowser for help on using the repository browser.