Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: EcnlProtoTool/trunk/onigmo-5.15.0/src/enc/utf8.c@ 279

Last change on this file since 279 was 279, checked in by coas-nagasima, 7 years ago
ファイルを追加、更新。
Property svn:eol-style set to `native` Property svn:keywords set to `Id` Property svn:mime-type set to `text/x-csrc`
File size: 8.1 KB

Rev	Line
[279]	1	/**********************************************************************
	2	utf8.c - Oniguruma (regular expression library)
	3	**********************************************************************/
	4	/*-
	5	* Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
	6	* All rights reserved.
	7	*
	8	* Redistribution and use in source and binary forms, with or without
	9	* modification, are permitted provided that the following conditions
	10	* are met:
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in the
	15	* documentation and/or other materials provided with the distribution.
	16	*
	17	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
	18	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	19	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	20	* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
	21	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	22	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	23	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	24	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	25	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	26	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	27	* SUCH DAMAGE.
	28	*/
	29
	30	#include "regenc.h"
	31
	/*
	 * When defined, the bytes 0xfe and 0xff (which have no multi-byte length in
	 * the table below) are mapped to dedicated "virtual" code points so that
	 * they can round-trip through mbc_to_code() / code_to_mbc().
	 */
	#define USE_INVALID_CODE_SCHEME

	#ifdef USE_INVALID_CODE_SCHEME
	/* Virtual code point values for the invalid encoding bytes 0xfe and 0xff. */
	#define INVALID_CODE_FE   0xfffffffe
	#define INVALID_CODE_FF   0xffffffff
	/* Upper bound for ordinary code points; not referenced in the code shown here. */
	#define VALID_CODE_LIMIT  0x7fffffff
	#endif

	/* Non-zero when byte c is NOT of the continuation form 10xxxxxx. */
	#define utf8_islead(c)    ((UChar )((c) & 0xc0) != 0x80)
	42
	/*
	 * Sequence length indexed by the first byte of a character.
	 * Bytes that cannot start a multi-byte sequence (0x80-0xbf, 0xfe, 0xff)
	 * are given length 1 so that scanning always advances.
	 */
	static const int EncLen_UTF8[256] = {
	  /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	  /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	  /* 0x20 - 0x2f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	  /* 0x30 - 0x3f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	  /* 0x40 - 0x4f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	  /* 0x50 - 0x5f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	  /* 0x60 - 0x6f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	  /* 0x70 - 0x7f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	  /* 0x80 - 0x8f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	  /* 0x90 - 0x9f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	  /* 0xa0 - 0xaf */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	  /* 0xb0 - 0xbf */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	  /* 0xc0 - 0xcf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	  /* 0xd0 - 0xdf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	  /* 0xe0 - 0xef */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	  /* 0xf0 - 0xff */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
	};
	61
	62	static int
	63	mbc_enc_len(const UChar* p)
	64	{
	65	return EncLen_UTF8[*p];
	66	}
	67
	68	static int
	69	is_mbc_newline(const UChar* p, const UChar* end)
	70	{
	71	if (p < end) {
	72	if (*p == 0x0a) return 1;
	73
	74	#ifdef USE_UNICODE_ALL_LINE_TERMINATORS
	75	if (p == 0x0b \|\| p == 0x0c \|\| *p == 0x0d) return 1;
	76	if (p + 1 < end) {
	77	if ((p+1) == 0x85 && p == 0xc2) /* U+0085 */
	78	return 1;
	79	if (p + 2 < end) {
	80	if (((p+2) == 0xa8 \|\| (p+2) == 0xa9)
	81	&& (p+1) == 0x80 && p == 0xe2) /* U+2028, U+2029 */
	82	return 1;
	83	}
	84	}
	85	#endif
	86	}
	87
	88	return 0;
	89	}
	90
	91	static OnigCodePoint
	92	mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED)
	93	{
	94	int c, len;
	95	OnigCodePoint n;
	96
	97	len = mbc_enc_len(p);
	98	c = *p++;
	99	if (len > 1) {
	100	len--;
	101	n = c & ((1 << (6 - len)) - 1);
	102	while (len--) {
	103	c = *p++;
	104	n = (n << 6) \| (c & ((1 << 6) - 1));
	105	}
	106	return n;
	107	}
	108	else {
	109	#ifdef USE_INVALID_CODE_SCHEME
	110	if (c > 0xfd) {
	111	return ((c == 0xfe) ? INVALID_CODE_FE : INVALID_CODE_FF);
	112	}
	113	#endif
	114	return (OnigCodePoint )c;
	115	}
	116	}
	117
	118	static int
	119	code_to_mbclen(OnigCodePoint code)
	120	{
	121	if ((code & 0xffffff80) == 0) return 1;
	122	else if ((code & 0xfffff800) == 0) return 2;
	123	else if ((code & 0xffff0000) == 0) return 3;
	124	else if ((code & 0xffe00000) == 0) return 4;
	125	else if ((code & 0xfc000000) == 0) return 5;
	126	else if ((code & 0x80000000) == 0) return 6;
	127	#ifdef USE_INVALID_CODE_SCHEME
	128	else if (code == INVALID_CODE_FE) return 1;
	129	else if (code == INVALID_CODE_FF) return 1;
	130	#endif
	131	else
	132	return ONIGERR_INVALID_CODE_POINT_VALUE;
	133	}
	134
	135	static int
	136	code_to_mbc(OnigCodePoint code, UChar *buf)
	137	{
	138	#define UTF8_TRAILS(code, shift) (UChar )((((code) >> (shift)) & 0x3f) \| 0x80)
	139	#define UTF8_TRAIL0(code) (UChar )(((code) & 0x3f) \| 0x80)
	140
	141	if ((code & 0xffffff80) == 0) {
	142	*buf = (UChar )code;
	143	return 1;
	144	}
	145	else {
	146	UChar *p = buf;
	147
	148	if ((code & 0xfffff800) == 0) {
	149	*p++ = (UChar )(((code>>6)& 0x1f) \| 0xc0);
	150	}
	151	else if ((code & 0xffff0000) == 0) {
	152	*p++ = (UChar )(((code>>12) & 0x0f) \| 0xe0);
	153	*p++ = UTF8_TRAILS(code, 6);
	154	}
	155	else if ((code & 0xffe00000) == 0) {
	156	*p++ = (UChar )(((code>>18) & 0x07) \| 0xf0);
	157	*p++ = UTF8_TRAILS(code, 12);
	158	*p++ = UTF8_TRAILS(code, 6);
	159	}
	160	else if ((code & 0xfc000000) == 0) {
	161	*p++ = (UChar )(((code>>24) & 0x03) \| 0xf8);
	162	*p++ = UTF8_TRAILS(code, 18);
	163	*p++ = UTF8_TRAILS(code, 12);
	164	*p++ = UTF8_TRAILS(code, 6);
	165	}
	166	else if ((code & 0x80000000) == 0) {
	167	*p++ = (UChar )(((code>>30) & 0x01) \| 0xfc);
	168	*p++ = UTF8_TRAILS(code, 24);
	169	*p++ = UTF8_TRAILS(code, 18);
	170	*p++ = UTF8_TRAILS(code, 12);
	171	*p++ = UTF8_TRAILS(code, 6);
	172	}
	173	#ifdef USE_INVALID_CODE_SCHEME
	174	else if (code == INVALID_CODE_FE) {
	175	*p = 0xfe;
	176	return 1;
	177	}
	178	else if (code == INVALID_CODE_FF) {
	179	*p = 0xff;
	180	return 1;
	181	}
	182	#endif
	183	else {
	184	return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
	185	}
	186
	187	*p++ = UTF8_TRAIL0(code);
	188	return (int )(p - buf);
	189	}
	190	}
	191
	192	static int
	193	mbc_case_fold(OnigCaseFoldType flag, const UChar** pp,
	194	const UChar* end, UChar* fold)
	195	{
	196	const UChar* p = *pp;
	197
	198	if (ONIGENC_IS_MBC_ASCII(p)) {
	199	#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
	200	if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
	201	if (*p == 0x49) {
	202	*fold++ = 0xc4;
	203	*fold = 0xb1;
	204	(*pp)++;
	205	return 2;
	206	}
	207	}
	208	#endif
	209
	210	fold = ONIGENC_ASCII_CODE_TO_LOWER_CASE(p);
	211	(*pp)++;
	212	return 1; /* return byte length of converted char to lower */
	213	}
	214	else {
	215	return onigenc_unicode_mbc_case_fold(ONIG_ENCODING_UTF8, flag,
	216	pp, end, fold);
	217	}
	218	}
	219
	#if 0
	/*
	 * Disabled code, kept for reference: report whether the character at *pp
	 * is case-ambiguous, advancing *pp past it.
	 *
	 * Besides ASCII, only two-byte sequences led by 0xc3 are examined:
	 *   trail 0x80-0x9e  -> TRUE, except 0x97 (U+00D7)
	 *   trail 0xa0-0xbe  -> TRUE, except 0xb7 (U+00F7)
	 *   trail 0x9f       -> TRUE only with INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR
	 * Note that end is not consulted.
	 */
	static int
	is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end)
	{
	  const UChar* head = *pp;
	  int trail;

	  if (ONIGENC_IS_MBC_ASCII(head)) {
	    (*pp)++;
	    return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*head);
	  }

	  (*pp) += mbc_enc_len(head);

	  if (*head != 0xc3) return FALSE;

	  trail = *(head + 1);
	  if (trail < 0x80) return FALSE;

	  if (trail <= (UChar )0x9e) { /* upper */
	    return (trail == (UChar )0x97) ? FALSE : TRUE;
	  }
	  if (trail >= (UChar )0xa0 && trail <= (UChar )0xbe) { /* lower */
	    return (trail == (UChar )0xb7) ? FALSE : TRUE;
	  }
	  if (trail == (UChar )0x9f &&
	      (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
	    return TRUE;
	  }

	  return FALSE;
	}
	#endif
	255
	256
	257	static int
	258	get_ctype_code_range(OnigCtype ctype, OnigCodePoint *sb_out,
	259	const OnigCodePoint* ranges[])
	260	{
	261	*sb_out = 0x80;
	262	return onigenc_unicode_ctype_code_range(ctype, ranges);
	263	}
	264
	265
	266	static UChar*
	267	left_adjust_char_head(const UChar* start, const UChar* s)
	268	{
	269	const UChar *p;
	270
	271	if (s <= start) return (UChar* )s;
	272	p = s;
	273
	274	while (!utf8_islead(*p) && p > start) p--;
	275	return (UChar* )p;
	276	}
	277
	278	static int
	279	get_case_fold_codes_by_str(OnigCaseFoldType flag,
	280	const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
	281	{
	282	return onigenc_unicode_get_case_fold_codes_by_str(ONIG_ENCODING_UTF8,
	283	flag, p, end, items);
	284	}
	285
	286	OnigEncodingType OnigEncodingUTF8 = {
	287	mbc_enc_len,
	288	"UTF-8", /* name */
	289	6, /* max byte length */
	290	1, /* min byte length */
	291	is_mbc_newline,
	292	mbc_to_code,
	293	code_to_mbclen,
	294	code_to_mbc,
	295	mbc_case_fold,
	296	onigenc_unicode_apply_all_case_fold,
	297	get_case_fold_codes_by_str,
	298	onigenc_unicode_property_name_to_ctype,
	299	onigenc_unicode_is_code_ctype,
	300	get_ctype_code_range,
	301	left_adjust_char_head,
	302	onigenc_always_true_is_allowed_reverse_match,
	303	ONIGENC_FLAG_UNICODE,
	304	};

Note: See TracBrowser for help on using the repository browser.

Download in other formats: