source: EcnlProtoTool/trunk/mruby-2.1.1/mrbgems/mruby-string-ext/src/string.c@ 439

Last change on this file since 439 was 439, checked in by coas-nagasima, 4 years ago

mrubyを2.1.1に更新

  • Property svn:eol-style set to native
  • Property svn:mime-type set to text/x-csrc;charset=UTF-8
File size: 31.0 KB
Line 
1#include <string.h>
2#include <mruby.h>
3#include <mruby/array.h>
4#include <mruby/class.h>
5#include <mruby/string.h>
6#include <mruby/range.h>
7
/* Encoding names that are compared against with ENC_COMP_P(). */
#define ENC_ASCII_8BIT "ASCII-8BIT"
#define ENC_BINARY     "BINARY"
#define ENC_UTF8       "UTF-8"

/*
 * Case-insensitive comparison of the string object `enc` with `enc_lit`.
 * `enc_lit` must be a string literal: it is concatenated with "" so that
 * sizeof yields its length (minus the terminating NUL).
 */
#define ENC_COMP_P(enc, enc_lit) \
  str_casecmp_p(RSTRING_PTR(enc), RSTRING_LEN(enc), enc_lit, sizeof(enc_lit "") - 1)

/* When MRB_WITHOUT_FLOAT is defined, mrb_float_p() always yields FALSE. */
#ifdef MRB_WITHOUT_FLOAT
# define mrb_float_p(o) FALSE
#endif
18
19static mrb_bool
20str_casecmp_p(const char *s1, mrb_int len1, const char *s2, mrb_int len2)
21{
22 const char *e1, *e2;
23
24 if (len1 != len2) return FALSE;
25 e1 = s1 + len1;
26 e2 = s2 + len2;
27 while (s1 < e1 && s2 < e2) {
28 if (*s1 != *s2 && TOUPPER(*s1) != TOUPPER(*s2)) return FALSE;
29 ++s1;
30 ++s2;
31 }
32 return TRUE;
33}
34
35static mrb_value
36int_chr_binary(mrb_state *mrb, mrb_value num)
37{
38 mrb_int cp = mrb_int(mrb, num);
39 char c;
40 mrb_value str;
41
42 if (cp < 0 || 0xff < cp) {
43 mrb_raisef(mrb, E_RANGE_ERROR, "%v out of char range", num);
44 }
45 c = (char)cp;
46 str = mrb_str_new(mrb, &c, 1);
47 RSTR_SET_ASCII_FLAG(mrb_str_ptr(str));
48 return str;
49}
50
51#ifdef MRB_UTF8_STRING
52static mrb_value
53int_chr_utf8(mrb_state *mrb, mrb_value num)
54{
55 mrb_int cp = mrb_int(mrb, num);
56 char utf8[4];
57 mrb_int len;
58 mrb_value str;
59 uint32_t ascii_flag = 0;
60
61 if (cp < 0 || 0x10FFFF < cp) {
62 mrb_raisef(mrb, E_RANGE_ERROR, "%v out of char range", num);
63 }
64 if (cp < 0x80) {
65 utf8[0] = (char)cp;
66 len = 1;
67 ascii_flag = MRB_STR_ASCII;
68 }
69 else if (cp < 0x800) {
70 utf8[0] = (char)(0xC0 | (cp >> 6));
71 utf8[1] = (char)(0x80 | (cp & 0x3F));
72 len = 2;
73 }
74 else if (cp < 0x10000) {
75 utf8[0] = (char)(0xE0 | (cp >> 12));
76 utf8[1] = (char)(0x80 | ((cp >> 6) & 0x3F));
77 utf8[2] = (char)(0x80 | ( cp & 0x3F));
78 len = 3;
79 }
80 else {
81 utf8[0] = (char)(0xF0 | (cp >> 18));
82 utf8[1] = (char)(0x80 | ((cp >> 12) & 0x3F));
83 utf8[2] = (char)(0x80 | ((cp >> 6) & 0x3F));
84 utf8[3] = (char)(0x80 | ( cp & 0x3F));
85 len = 4;
86 }
87 str = mrb_str_new(mrb, utf8, len);
88 mrb_str_ptr(str)->flags |= ascii_flag;
89 return str;
90}
91#endif
92
93/*
94 * call-seq:
95 * str.swapcase! -> str or nil
96 *
97 * Equivalent to <code>String#swapcase</code>, but modifies the receiver in
98 * place, returning <i>str</i>, or <code>nil</code> if no changes were made.
99 * Note: case conversion is effective only in ASCII region.
100 */
101static mrb_value
102mrb_str_swapcase_bang(mrb_state *mrb, mrb_value str)
103{
104 char *p, *pend;
105 int modify = 0;
106 struct RString *s = mrb_str_ptr(str);
107
108 mrb_str_modify(mrb, s);
109 p = RSTRING_PTR(str);
110 pend = p + RSTRING_LEN(str);
111 while (p < pend) {
112 if (ISUPPER(*p)) {
113 *p = TOLOWER(*p);
114 modify = 1;
115 }
116 else if (ISLOWER(*p)) {
117 *p = TOUPPER(*p);
118 modify = 1;
119 }
120 p++;
121 }
122
123 if (modify) return str;
124 return mrb_nil_value();
125}
126
127/*
128 * call-seq:
129 * str.swapcase -> new_str
130 *
131 * Returns a copy of <i>str</i> with uppercase alphabetic characters converted
132 * to lowercase and lowercase characters converted to uppercase.
133 * Note: case conversion is effective only in ASCII region.
134 *
135 * "Hello".swapcase #=> "hELLO"
136 * "cYbEr_PuNk11".swapcase #=> "CyBeR_pUnK11"
137 */
138static mrb_value
139mrb_str_swapcase(mrb_state *mrb, mrb_value self)
140{
141 mrb_value str;
142
143 str = mrb_str_dup(mrb, self);
144 mrb_str_swapcase_bang(mrb, str);
145 return str;
146}
147
148/*
149 * call-seq:
150 * str << integer -> str
151 * str.concat(integer) -> str
152 * str << obj -> str
153 * str.concat(obj) -> str
154 *
155 * Append---Concatenates the given object to <i>str</i>. If the object is a
156 * <code>Integer</code>, it is considered as a codepoint, and is converted
157 * to a character before concatenation
158 * (equivalent to <code>str.concat(integer.chr(__ENCODING__))</code>).
159 *
160 * a = "hello "
161 * a << "world" #=> "hello world"
162 * a.concat(33) #=> "hello world!"
163 */
164static mrb_value
165mrb_str_concat_m(mrb_state *mrb, mrb_value self)
166{
167 mrb_value str;
168
169 mrb_get_args(mrb, "o", &str);
170 if (mrb_fixnum_p(str) || mrb_float_p(str))
171#ifdef MRB_UTF8_STRING
172 str = int_chr_utf8(mrb, str);
173#else
174 str = int_chr_binary(mrb, str);
175#endif
176 else
177 mrb_ensure_string_type(mrb, str);
178 mrb_str_cat_str(mrb, self, str);
179 return self;
180}
181
182/*
183 * call-seq:
184 * str.start_with?([prefixes]+) -> true or false
185 *
186 * Returns true if +str+ starts with one of the +prefixes+ given.
187 *
188 * "hello".start_with?("hell") #=> true
189 *
190 * # returns true if one of the prefixes matches.
191 * "hello".start_with?("heaven", "hell") #=> true
192 * "hello".start_with?("heaven", "paradise") #=> false
193 * "h".start_with?("heaven", "hell") #=> false
194 */
195static mrb_value
196mrb_str_start_with(mrb_state *mrb, mrb_value self)
197{
198 mrb_value *argv, sub;
199 mrb_int argc, i;
200 mrb_get_args(mrb, "*", &argv, &argc);
201
202 for (i = 0; i < argc; i++) {
203 size_t len_l, len_r;
204 int ai = mrb_gc_arena_save(mrb);
205 sub = mrb_ensure_string_type(mrb, argv[i]);
206 mrb_gc_arena_restore(mrb, ai);
207 len_l = RSTRING_LEN(self);
208 len_r = RSTRING_LEN(sub);
209 if (len_l >= len_r) {
210 if (memcmp(RSTRING_PTR(self), RSTRING_PTR(sub), len_r) == 0) {
211 return mrb_true_value();
212 }
213 }
214 }
215 return mrb_false_value();
216}
217
218/*
219 * call-seq:
220 * str.end_with?([suffixes]+) -> true or false
221 *
222 * Returns true if +str+ ends with one of the +suffixes+ given.
223 */
224static mrb_value
225mrb_str_end_with(mrb_state *mrb, mrb_value self)
226{
227 mrb_value *argv, sub;
228 mrb_int argc, i;
229 mrb_get_args(mrb, "*", &argv, &argc);
230
231 for (i = 0; i < argc; i++) {
232 size_t len_l, len_r;
233 int ai = mrb_gc_arena_save(mrb);
234 sub = mrb_ensure_string_type(mrb, argv[i]);
235 mrb_gc_arena_restore(mrb, ai);
236 len_l = RSTRING_LEN(self);
237 len_r = RSTRING_LEN(sub);
238 if (len_l >= len_r) {
239 if (memcmp(RSTRING_PTR(self) + (len_l - len_r),
240 RSTRING_PTR(sub),
241 len_r) == 0) {
242 return mrb_true_value();
243 }
244 }
245 }
246 return mrb_false_value();
247}
248
/* Kind of one node in a parsed tr pattern list. */
enum tr_pattern_type {
  TR_UNINITIALIZED,   /* 0: node has not been filled in yet */
  TR_IN_ORDER,        /* 1: run of characters taken literally */
  TR_RANGE            /* 2: <ch> '-' <ch> range */
};
254
255/*
256 #tr Pattern syntax
257
258 <syntax> ::= (<pattern>)* | '^' (<pattern>)*
259 <pattern> ::= <in order> | <range>
260 <in order> ::= (<ch>)+
261 <range> ::= <ch> '-' <ch>
262*/
263struct tr_pattern {
264 uint8_t type; // 1:in-order, 2:range
265 mrb_bool flag_reverse : 1;
266 mrb_bool flag_on_heap : 1;
267 uint16_t n;
268 union {
269 uint16_t start_pos;
270 char ch[2];
271 } val;
272 struct tr_pattern *next;
273};
274
275#define STATIC_TR_PATTERN { 0 }
276
277static inline void
278tr_free_pattern(mrb_state *mrb, struct tr_pattern *pat)
279{
280 while (pat) {
281 struct tr_pattern *p = pat->next;
282 if (pat->flag_on_heap) {
283 mrb_free(mrb, pat);
284 }
285 pat = p;
286 }
287}
288
289static struct tr_pattern*
290tr_parse_pattern(mrb_state *mrb, struct tr_pattern *ret, const mrb_value v_pattern, mrb_bool flag_reverse_enable)
291{
292 const char *pattern = RSTRING_PTR(v_pattern);
293 mrb_int pattern_length = RSTRING_LEN(v_pattern);
294 mrb_bool flag_reverse = FALSE;
295 struct tr_pattern *pat1;
296 mrb_int i = 0;
297
298 if(flag_reverse_enable && pattern_length >= 2 && pattern[0] == '^') {
299 flag_reverse = TRUE;
300 i++;
301 }
302
303 while (i < pattern_length) {
304 /* is range pattern ? */
305 mrb_bool const ret_uninit = (ret->type == TR_UNINITIALIZED);
306 pat1 = ret_uninit
307 ? ret
308 : (struct tr_pattern*)mrb_malloc_simple(mrb, sizeof(struct tr_pattern));
309 if ((i+2) < pattern_length && pattern[i] != '\\' && pattern[i+1] == '-') {
310 if (pat1 == NULL && ret) {
311 nomem:
312 tr_free_pattern(mrb, ret);
313 mrb_exc_raise(mrb, mrb_obj_value(mrb->nomem_err));
314 return NULL; /* not reached */
315 }
316 pat1->type = TR_RANGE;
317 pat1->flag_reverse = flag_reverse;
318 pat1->flag_on_heap = !ret_uninit;
319 pat1->n = pattern[i+2] - pattern[i] + 1;
320 pat1->next = NULL;
321 pat1->val.ch[0] = pattern[i];
322 pat1->val.ch[1] = pattern[i+2];
323 i += 3;
324 }
325 else {
326 /* in order pattern. */
327 mrb_int start_pos = i++;
328 mrb_int len;
329
330 while (i < pattern_length) {
331 if ((i+2) < pattern_length && pattern[i] != '\\' && pattern[i+1] == '-')
332 break;
333 i++;
334 }
335
336 len = i - start_pos;
337 if (len > UINT16_MAX) {
338 mrb_raise(mrb, E_ARGUMENT_ERROR, "tr pattern too long (max 65536)");
339 }
340 if (pat1 == NULL && ret) {
341 goto nomem;
342 }
343 pat1->type = TR_IN_ORDER;
344 pat1->flag_reverse = flag_reverse;
345 pat1->flag_on_heap = !ret_uninit;
346 pat1->n = len;
347 pat1->next = NULL;
348 pat1->val.start_pos = start_pos;
349 }
350
351 if (ret == NULL || ret_uninit) {
352 ret = pat1;
353 }
354 else {
355 struct tr_pattern *p = ret;
356 while (p->next != NULL) {
357 p = p->next;
358 }
359 p->next = pat1;
360 }
361 }
362
363 return ret;
364}
365
366static inline mrb_int
367tr_find_character(const struct tr_pattern *pat, const char *pat_str, int ch)
368{
369 mrb_int ret = -1;
370 mrb_int n_sum = 0;
371 mrb_int flag_reverse = pat ? pat->flag_reverse : 0;
372
373 while (pat != NULL) {
374 if (pat->type == TR_IN_ORDER) {
375 int i;
376 for (i = 0; i < pat->n; i++) {
377 if (pat_str[pat->val.start_pos + i] == ch) ret = n_sum + i;
378 }
379 }
380 else if (pat->type == TR_RANGE) {
381 if (pat->val.ch[0] <= ch && ch <= pat->val.ch[1])
382 ret = n_sum + ch - pat->val.ch[0];
383 }
384 else {
385 mrb_assert(pat->type == TR_UNINITIALIZED);
386 }
387 n_sum += pat->n;
388 pat = pat->next;
389 }
390
391 if (flag_reverse) {
392 return (ret < 0) ? MRB_INT_MAX : -1;
393 }
394 return ret;
395}
396
397static inline mrb_int
398tr_get_character(const struct tr_pattern *pat, const char *pat_str, mrb_int n_th)
399{
400 mrb_int n_sum = 0;
401
402 while (pat != NULL) {
403 if (n_th < (n_sum + pat->n)) {
404 mrb_int i = (n_th - n_sum);
405
406 switch (pat->type) {
407 case TR_IN_ORDER:
408 return pat_str[pat->val.start_pos + i];
409 case TR_RANGE:
410 return pat->val.ch[0]+i;
411 case TR_UNINITIALIZED:
412 return -1;
413 }
414 }
415 if (pat->next == NULL) {
416 switch (pat->type) {
417 case TR_IN_ORDER:
418 return pat_str[pat->val.start_pos + pat->n - 1];
419 case TR_RANGE:
420 return pat->val.ch[1];
421 case TR_UNINITIALIZED:
422 return -1;
423 }
424 }
425 n_sum += pat->n;
426 pat = pat->next;
427 }
428
429 return -1;
430}
431
/* Mark byte value `ch` as a member of the 256-bit set `bitmap`. */
static inline void
tr_bitmap_set(uint8_t bitmap[32], uint8_t ch)
{
  /* upper five bits select the byte, lower three bits select the bit */
  bitmap[ch >> 3] |= (uint8_t)(1u << (ch & 7));
}
439
440static inline mrb_bool
441tr_bitmap_detect(uint8_t bitmap[32], uint8_t ch)
442{
443 uint8_t idx1 = ch / 8;
444 uint8_t idx2 = ch % 8;
445 if (bitmap[idx1] & (1<<idx2))
446 return TRUE;
447 return FALSE;
448}
449
450/* compile patter to bitmap */
451static void
452tr_compile_pattern(const struct tr_pattern *pat, mrb_value pstr, uint8_t bitmap[32])
453{
454 const char *pattern = RSTRING_PTR(pstr);
455 mrb_int flag_reverse = pat ? pat->flag_reverse : 0;
456 int i;
457
458 for (i=0; i<32; i++) {
459 bitmap[i] = 0;
460 }
461 while (pat != NULL) {
462 if (pat->type == TR_IN_ORDER) {
463 for (i = 0; i < pat->n; i++) {
464 tr_bitmap_set(bitmap, pattern[pat->val.start_pos + i]);
465 }
466 }
467 else if (pat->type == TR_RANGE) {
468 for (i = pat->val.ch[0]; i < pat->val.ch[1]; i++) {
469 tr_bitmap_set(bitmap, i);
470 }
471 }
472 else {
473 mrb_assert(pat->type == TR_UNINITIALIZED);
474 }
475 pat = pat->next;
476 }
477
478 if (flag_reverse) {
479 for (i=0; i<32; i++) {
480 bitmap[i] ^= 0xff;
481 }
482 }
483}
484
485static mrb_bool
486str_tr(mrb_state *mrb, mrb_value str, mrb_value p1, mrb_value p2, mrb_bool squeeze)
487{
488 struct tr_pattern pat = STATIC_TR_PATTERN;
489 struct tr_pattern rep_storage = STATIC_TR_PATTERN;
490 char *s;
491 mrb_int len;
492 mrb_int i;
493 mrb_int j;
494 mrb_bool flag_changed = FALSE;
495 mrb_int lastch = -1;
496 struct tr_pattern *rep;
497
498 mrb_str_modify(mrb, mrb_str_ptr(str));
499 tr_parse_pattern(mrb, &pat, p1, TRUE);
500 rep = tr_parse_pattern(mrb, &rep_storage, p2, FALSE);
501 s = RSTRING_PTR(str);
502 len = RSTRING_LEN(str);
503
504 for (i=j=0; i<len; i++,j++) {
505 mrb_int n = tr_find_character(&pat, RSTRING_PTR(p1), s[i]);
506
507 if (i>j) s[j] = s[i];
508 if (n >= 0) {
509 flag_changed = TRUE;
510 if (rep == NULL) {
511 j--;
512 }
513 else {
514 mrb_int c = tr_get_character(rep, RSTRING_PTR(p2), n);
515
516 if (c < 0 || (squeeze && c == lastch)) {
517 j--;
518 continue;
519 }
520 if (c > 0x80) {
521 mrb_raisef(mrb, E_ARGUMENT_ERROR, "character (%i) out of range", c);
522 }
523 lastch = c;
524 s[i] = (char)c;
525 }
526 }
527 }
528
529 tr_free_pattern(mrb, &pat);
530 tr_free_pattern(mrb, rep);
531
532 if (flag_changed) {
533 RSTR_SET_LEN(RSTRING(str), j);
534 RSTRING_PTR(str)[j] = 0;
535 }
536 return flag_changed;
537}
538
539/*
540 * call-seq:
541 * str.tr(from_str, to_str) => new_str
542 *
543 * Returns a copy of str with the characters in from_str replaced by the
544 * corresponding characters in to_str. If to_str is shorter than from_str,
545 * it is padded with its last character in order to maintain the
546 * correspondence.
547 *
548 * "hello".tr('el', 'ip') #=> "hippo"
549 * "hello".tr('aeiou', '*') #=> "h*ll*"
550 * "hello".tr('aeiou', 'AA*') #=> "hAll*"
551 *
552 * Both strings may use the c1-c2 notation to denote ranges of characters,
553 * and from_str may start with a ^, which denotes all characters except
554 * those listed.
555 *
556 * "hello".tr('a-y', 'b-z') #=> "ifmmp"
557 * "hello".tr('^aeiou', '*') #=> "*e**o"
558 *
559 * The backslash character \ can be used to escape ^ or - and is otherwise
560 * ignored unless it appears at the end of a range or the end of the
561 * from_str or to_str:
562 *
563 *
564 * "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld"
565 * "hello-world".tr("a\\-eo", "*") #=> "h*ll**w*rld"
566 *
567 * "hello\r\nworld".tr("\r", "") #=> "hello\nworld"
568 * "hello\r\nworld".tr("\\r", "") #=> "hello\r\nwold"
569 * "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld"
570 *
571 * "X['\\b']".tr("X\\", "") #=> "['b']"
572 * "X['\\b']".tr("X-\\]", "") #=> "'b'"
573 *
574 * Note: conversion is effective only in ASCII region.
575 */
576static mrb_value
577mrb_str_tr(mrb_state *mrb, mrb_value str)
578{
579 mrb_value dup;
580 mrb_value p1, p2;
581
582 mrb_get_args(mrb, "SS", &p1, &p2);
583 dup = mrb_str_dup(mrb, str);
584 str_tr(mrb, dup, p1, p2, FALSE);
585 return dup;
586}
587
588/*
589 * call-seq:
590 * str.tr!(from_str, to_str) -> str or nil
591 *
592 * Translates str in place, using the same rules as String#tr.
593 * Returns str, or nil if no changes were made.
594 */
595static mrb_value
596mrb_str_tr_bang(mrb_state *mrb, mrb_value str)
597{
598 mrb_value p1, p2;
599
600 mrb_get_args(mrb, "SS", &p1, &p2);
601 if (str_tr(mrb, str, p1, p2, FALSE)) {
602 return str;
603 }
604 return mrb_nil_value();
605}
606
607/*
608 * call-seq:
609 * str.tr_s(from_str, to_str) -> new_str
610 *
611 * Processes a copy of str as described under String#tr, then removes
612 * duplicate characters in regions that were affected by the translation.
613 *
614 * "hello".tr_s('l', 'r') #=> "hero"
615 * "hello".tr_s('el', '*') #=> "h*o"
616 * "hello".tr_s('el', 'hx') #=> "hhxo"
617 */
618static mrb_value
619mrb_str_tr_s(mrb_state *mrb, mrb_value str)
620{
621 mrb_value dup;
622 mrb_value p1, p2;
623
624 mrb_get_args(mrb, "SS", &p1, &p2);
625 dup = mrb_str_dup(mrb, str);
626 str_tr(mrb, dup, p1, p2, TRUE);
627 return dup;
628}
629
630/*
631 * call-seq:
632 * str.tr_s!(from_str, to_str) -> str or nil
633 *
634 * Performs String#tr_s processing on str in place, returning
635 * str, or nil if no changes were made.
636 */
637static mrb_value
638mrb_str_tr_s_bang(mrb_state *mrb, mrb_value str)
639{
640 mrb_value p1, p2;
641
642 mrb_get_args(mrb, "SS", &p1, &p2);
643 if (str_tr(mrb, str, p1, p2, TRUE)) {
644 return str;
645 }
646 return mrb_nil_value();
647}
648
649static mrb_bool
650str_squeeze(mrb_state *mrb, mrb_value str, mrb_value v_pat)
651{
652 struct tr_pattern pat_storage = STATIC_TR_PATTERN;
653 struct tr_pattern *pat = NULL;
654 mrb_int i, j;
655 char *s;
656 mrb_int len;
657 mrb_bool flag_changed = FALSE;
658 mrb_int lastch = -1;
659 uint8_t bitmap[32];
660
661 mrb_str_modify(mrb, mrb_str_ptr(str));
662 if (!mrb_nil_p(v_pat)) {
663 pat = tr_parse_pattern(mrb, &pat_storage, v_pat, TRUE);
664 tr_compile_pattern(pat, v_pat, bitmap);
665 tr_free_pattern(mrb, pat);
666 }
667 s = RSTRING_PTR(str);
668 len = RSTRING_LEN(str);
669
670 if (pat) {
671 for (i=j=0; i<len; i++,j++) {
672 if (i>j) s[j] = s[i];
673 if (tr_bitmap_detect(bitmap, s[i]) && s[i] == lastch) {
674 flag_changed = TRUE;
675 j--;
676 }
677 lastch = s[i];
678 }
679 }
680 else {
681 for (i=j=0; i<len; i++,j++) {
682 if (i>j) s[j] = s[i];
683 if (s[i] >= 0 && s[i] == lastch) {
684 flag_changed = TRUE;
685 j--;
686 }
687 lastch = s[i];
688 }
689 }
690
691 if (flag_changed) {
692 RSTR_SET_LEN(RSTRING(str), j);
693 RSTRING_PTR(str)[j] = 0;
694 }
695 return flag_changed;
696}
697
698/*
699 * call-seq:
700 * str.squeeze([other_str]) -> new_str
701 *
702 * Builds a set of characters from the other_str
703 * parameter(s) using the procedure described for String#count. Returns a
704 * new string where runs of the same character that occur in this set are
705 * replaced by a single character. If no arguments are given, all runs of
706 * identical characters are replaced by a single character.
707 *
708 * "yellow moon".squeeze #=> "yelow mon"
709 * " now is the".squeeze(" ") #=> " now is the"
710 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
711 */
712static mrb_value
713mrb_str_squeeze(mrb_state *mrb, mrb_value str)
714{
715 mrb_value pat = mrb_nil_value();
716 mrb_value dup;
717
718 mrb_get_args(mrb, "|S", &pat);
719 dup = mrb_str_dup(mrb, str);
720 str_squeeze(mrb, dup, pat);
721 return dup;
722}
723
724/*
725 * call-seq:
726 * str.squeeze!([other_str]) -> str or nil
727 *
728 * Squeezes str in place, returning either str, or nil if no
729 * changes were made.
730 */
731static mrb_value
732mrb_str_squeeze_bang(mrb_state *mrb, mrb_value str)
733{
734 mrb_value pat = mrb_nil_value();
735
736 mrb_get_args(mrb, "|S", &pat);
737 if (str_squeeze(mrb, str, pat)) {
738 return str;
739 }
740 return mrb_nil_value();
741}
742
743static mrb_bool
744str_delete(mrb_state *mrb, mrb_value str, mrb_value v_pat)
745{
746 struct tr_pattern pat = STATIC_TR_PATTERN;
747 mrb_int i, j;
748 char *s;
749 mrb_int len;
750 mrb_bool flag_changed = FALSE;
751 uint8_t bitmap[32];
752
753 mrb_str_modify(mrb, mrb_str_ptr(str));
754 tr_parse_pattern(mrb, &pat, v_pat, TRUE);
755 tr_compile_pattern(&pat, v_pat, bitmap);
756 tr_free_pattern(mrb, &pat);
757
758 s = RSTRING_PTR(str);
759 len = RSTRING_LEN(str);
760
761 for (i=j=0; i<len; i++,j++) {
762 if (i>j) s[j] = s[i];
763 if (tr_bitmap_detect(bitmap, s[i])) {
764 flag_changed = TRUE;
765 j--;
766 }
767 }
768 if (flag_changed) {
769 RSTR_SET_LEN(RSTRING(str), j);
770 RSTRING_PTR(str)[j] = 0;
771 }
772 return flag_changed;
773}
774
775static mrb_value
776mrb_str_delete(mrb_state *mrb, mrb_value str)
777{
778 mrb_value pat;
779 mrb_value dup;
780
781 mrb_get_args(mrb, "S", &pat);
782 dup = mrb_str_dup(mrb, str);
783 str_delete(mrb, dup, pat);
784 return dup;
785}
786
787static mrb_value
788mrb_str_delete_bang(mrb_state *mrb, mrb_value str)
789{
790 mrb_value pat;
791
792 mrb_get_args(mrb, "S", &pat);
793 if (str_delete(mrb, str, pat)) {
794 return str;
795 }
796 return mrb_nil_value();
797}
798
799/*
800 * call_seq:
801 * str.count([other_str]) -> integer
802 *
803 * Each other_str parameter defines a set of characters to count. The
804 * intersection of these sets defines the characters to count in str. Any
805 * other_str that starts with a caret ^ is negated. The sequence c1-c2
806 * means all characters between c1 and c2. The backslash character \ can
807 * be used to escape ^ or - and is otherwise ignored unless it appears at
808 * the end of a sequence or the end of a other_str.
809 */
810static mrb_value
811mrb_str_count(mrb_state *mrb, mrb_value str)
812{
813 mrb_value v_pat = mrb_nil_value();
814 mrb_int i;
815 char *s;
816 mrb_int len;
817 mrb_int count = 0;
818 struct tr_pattern pat = STATIC_TR_PATTERN;
819 uint8_t bitmap[32];
820
821 mrb_get_args(mrb, "S", &v_pat);
822 tr_parse_pattern(mrb, &pat, v_pat, TRUE);
823 tr_compile_pattern(&pat, v_pat, bitmap);
824 tr_free_pattern(mrb, &pat);
825
826 s = RSTRING_PTR(str);
827 len = RSTRING_LEN(str);
828 for (i = 0; i < len; i++) {
829 if (tr_bitmap_detect(bitmap, s[i])) count++;
830 }
831 return mrb_fixnum_value(count);
832}
833
834static mrb_value
835mrb_str_hex(mrb_state *mrb, mrb_value self)
836{
837 return mrb_str_to_inum(mrb, self, 16, FALSE);
838}
839
840static mrb_value
841mrb_str_oct(mrb_state *mrb, mrb_value self)
842{
843 return mrb_str_to_inum(mrb, self, 8, FALSE);
844}
845
846/*
847 * call-seq:
848 * string.chr -> string
849 *
850 * Returns a one-character string at the beginning of the string.
851 *
852 * a = "abcde"
853 * a.chr #=> "a"
854 */
855static mrb_value
856mrb_str_chr(mrb_state *mrb, mrb_value self)
857{
858 return mrb_str_substr(mrb, self, 0, 1);
859}
860
861/*
862 * call-seq:
863 * int.chr([encoding]) -> string
864 *
865 * Returns a string containing the character represented by the +int+'s value
866 * according to +encoding+. +"ASCII-8BIT"+ (+"BINARY"+) and +"UTF-8"+ (only
867 * with +MRB_UTF8_STRING+) can be specified as +encoding+ (default is
868 * +"ASCII-8BIT"+).
869 *
870 * 65.chr #=> "A"
871 * 230.chr #=> "\xE6"
872 * 230.chr("ASCII-8BIT") #=> "\xE6"
873 * 230.chr("UTF-8") #=> "\u00E6"
874 */
875static mrb_value
876mrb_int_chr(mrb_state *mrb, mrb_value num)
877{
878 mrb_value enc;
879 mrb_bool enc_given;
880
881 mrb_get_args(mrb, "|S?", &enc, &enc_given);
882 if (!enc_given ||
883 ENC_COMP_P(enc, ENC_ASCII_8BIT) ||
884 ENC_COMP_P(enc, ENC_BINARY)) {
885 return int_chr_binary(mrb, num);
886 }
887#ifdef MRB_UTF8_STRING
888 else if (ENC_COMP_P(enc, ENC_UTF8)) {
889 return int_chr_utf8(mrb, num);
890 }
891#endif
892 else {
893 mrb_raisef(mrb, E_ARGUMENT_ERROR, "unknown encoding name - %v", enc);
894 }
895 /* not reached */
896 return mrb_nil_value();
897}
898
899/*
900 * call-seq:
901 * string.succ -> string
902 *
903 * Returns next sequence of the string;
904 *
905 * a = "abc"
906 * a.succ #=> "abd"
907 */
908static mrb_value
909mrb_str_succ_bang(mrb_state *mrb, mrb_value self)
910{
911 mrb_value result;
912 unsigned char *p, *e, *b, *t;
913 const char *prepend;
914 struct RString *s = mrb_str_ptr(self);
915 mrb_int l;
916
917 if (RSTRING_LEN(self) == 0)
918 return self;
919
920 mrb_str_modify(mrb, s);
921 l = RSTRING_LEN(self);
922 b = p = (unsigned char*) RSTRING_PTR(self);
923 t = e = p + l;
924 *(e--) = 0;
925
926 // find trailing ascii/number
927 while (e >= b) {
928 if (ISALNUM(*e))
929 break;
930 e--;
931 }
932 if (e < b) {
933 e = p + l - 1;
934 result = mrb_str_new_lit(mrb, "");
935 }
936 else {
937 // find leading letter of the ascii/number
938 b = e;
939 while (b > p) {
940 if (!ISALNUM(*b) || (ISALNUM(*b) && *b != '9' && *b != 'z' && *b != 'Z'))
941 break;
942 b--;
943 }
944 if (!ISALNUM(*b))
945 b++;
946 result = mrb_str_new(mrb, (char*) p, b - p);
947 }
948
949 while (e >= b) {
950 if (!ISALNUM(*e)) {
951 if (*e == 0xff) {
952 mrb_str_cat_lit(mrb, result, "\x01");
953 (*e) = 0;
954 }
955 else
956 (*e)++;
957 break;
958 }
959 prepend = NULL;
960 if (*e == '9') {
961 if (e == b) prepend = "1";
962 *e = '0';
963 }
964 else if (*e == 'z') {
965 if (e == b) prepend = "a";
966 *e = 'a';
967 }
968 else if (*e == 'Z') {
969 if (e == b) prepend = "A";
970 *e = 'A';
971 }
972 else {
973 (*e)++;
974 break;
975 }
976 if (prepend) mrb_str_cat_cstr(mrb, result, prepend);
977 e--;
978 }
979 result = mrb_str_cat(mrb, result, (char*) b, t - b);
980 l = RSTRING_LEN(result);
981 mrb_str_resize(mrb, self, l);
982 memcpy(RSTRING_PTR(self), RSTRING_PTR(result), l);
983 return self;
984}
985
986static mrb_value
987mrb_str_succ(mrb_state *mrb, mrb_value self)
988{
989 mrb_value str;
990
991 str = mrb_str_dup(mrb, self);
992 mrb_str_succ_bang(mrb, str);
993 return str;
994}
995
996#ifdef MRB_UTF8_STRING
/*
 * Sequence length looked up by the first byte of a sequence.
 * 0 is stored for bytes 0x80-0xBF and 0xF5-0xFF.
 */
static const char utf8len_codepage_zero[256] = {
  /* 0x00-0x1F */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  /* 0x20-0x3F */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  /* 0x40-0x5F */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  /* 0x60-0x7F */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  /* 0x80-0x9F */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0xA0-0xBF */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0xC0-0xDF */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0xE0-0xFF */ 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,0,0,0, 0,0,0,0,0,0,0,0
};
1008
1009static mrb_int
1010utf8code(unsigned char* p)
1011{
1012 mrb_int len;
1013
1014 if (p[0] < 0x80)
1015 return p[0];
1016
1017 len = utf8len_codepage_zero[p[0]];
1018 if (len > 1 && (p[1] & 0xc0) == 0x80) {
1019 if (len == 2)
1020 return ((p[0] & 0x1f) << 6) + (p[1] & 0x3f);
1021 if ((p[2] & 0xc0) == 0x80) {
1022 if (len == 3)
1023 return ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6)
1024 + (p[2] & 0x3f);
1025 if ((p[3] & 0xc0) == 0x80) {
1026 if (len == 4)
1027 return ((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
1028 + ((p[2] & 0x3f) << 6) + (p[3] & 0x3f);
1029 if ((p[4] & 0xc0) == 0x80) {
1030 if (len == 5)
1031 return ((p[0] & 0x03) << 24) + ((p[1] & 0x3f) << 18)
1032 + ((p[2] & 0x3f) << 12) + ((p[3] & 0x3f) << 6)
1033 + (p[4] & 0x3f);
1034 if ((p[5] & 0xc0) == 0x80 && len == 6)
1035 return ((p[0] & 0x01) << 30) + ((p[1] & 0x3f) << 24)
1036 + ((p[2] & 0x3f) << 18) + ((p[3] & 0x3f) << 12)
1037 + ((p[4] & 0x3f) << 6) + (p[5] & 0x3f);
1038 }
1039 }
1040 }
1041 }
1042 return p[0];
1043}
1044
1045static mrb_value
1046mrb_str_ord(mrb_state* mrb, mrb_value str)
1047{
1048 if (RSTRING_LEN(str) == 0)
1049 mrb_raise(mrb, E_ARGUMENT_ERROR, "empty string");
1050 return mrb_fixnum_value(utf8code((unsigned char*) RSTRING_PTR(str)));
1051}
1052#else
1053static mrb_value
1054mrb_str_ord(mrb_state* mrb, mrb_value str)
1055{
1056 if (RSTRING_LEN(str) == 0)
1057 mrb_raise(mrb, E_ARGUMENT_ERROR, "empty string");
1058 return mrb_fixnum_value((unsigned char)RSTRING_PTR(str)[0]);
1059}
1060#endif
1061
1062/*
1063 * call-seq:
1064 * str.delete_prefix!(prefix) -> self or nil
1065 *
1066 * Deletes leading <code>prefix</code> from <i>str</i>, returning
1067 * <code>nil</code> if no change was made.
1068 *
1069 * "hello".delete_prefix!("hel") #=> "lo"
1070 * "hello".delete_prefix!("llo") #=> nil
1071 */
1072static mrb_value
1073mrb_str_del_prefix_bang(mrb_state *mrb, mrb_value self)
1074{
1075 mrb_int plen, slen;
1076 char *ptr, *s;
1077 struct RString *str = RSTRING(self);
1078
1079 mrb_get_args(mrb, "s", &ptr, &plen);
1080 slen = RSTR_LEN(str);
1081 if (plen > slen) return mrb_nil_value();
1082 s = RSTR_PTR(str);
1083 if (memcmp(s, ptr, plen) != 0) return mrb_nil_value();
1084 if (!mrb_frozen_p(str) && (RSTR_SHARED_P(str) || RSTR_FSHARED_P(str))) {
1085 str->as.heap.ptr += plen;
1086 }
1087 else {
1088 mrb_str_modify(mrb, str);
1089 s = RSTR_PTR(str);
1090 memmove(s, s+plen, slen-plen);
1091 }
1092 RSTR_SET_LEN(str, slen-plen);
1093 return self;
1094}
1095
1096/*
1097 * call-seq:
1098 * str.delete_prefix(prefix) -> new_str
1099 *
1100 * Returns a copy of <i>str</i> with leading <code>prefix</code> deleted.
1101 *
1102 * "hello".delete_prefix("hel") #=> "lo"
1103 * "hello".delete_prefix("llo") #=> "hello"
1104 */
1105static mrb_value
1106mrb_str_del_prefix(mrb_state *mrb, mrb_value self)
1107{
1108 mrb_int plen, slen;
1109 char *ptr;
1110
1111 mrb_get_args(mrb, "s", &ptr, &plen);
1112 slen = RSTRING_LEN(self);
1113 if (plen > slen) return mrb_str_dup(mrb, self);
1114 if (memcmp(RSTRING_PTR(self), ptr, plen) != 0)
1115 return mrb_str_dup(mrb, self);
1116 return mrb_str_substr(mrb, self, plen, slen-plen);
1117}
1118
1119/*
1120 * call-seq:
1121 * str.delete_suffix!(suffix) -> self or nil
1122 *
1123 * Deletes trailing <code>suffix</code> from <i>str</i>, returning
1124 * <code>nil</code> if no change was made.
1125 *
1126 * "hello".delete_suffix!("llo") #=> "he"
1127 * "hello".delete_suffix!("hel") #=> nil
1128 */
1129static mrb_value
1130mrb_str_del_suffix_bang(mrb_state *mrb, mrb_value self)
1131{
1132 mrb_int plen, slen;
1133 char *ptr, *s;
1134 struct RString *str = RSTRING(self);
1135
1136 mrb_get_args(mrb, "s", &ptr, &plen);
1137 slen = RSTR_LEN(str);
1138 if (plen > slen) return mrb_nil_value();
1139 s = RSTR_PTR(str);
1140 if (memcmp(s+slen-plen, ptr, plen) != 0) return mrb_nil_value();
1141 if (!mrb_frozen_p(str) && (RSTR_SHARED_P(str) || RSTR_FSHARED_P(str))) {
1142 /* no need to modify string */
1143 }
1144 else {
1145 mrb_str_modify(mrb, str);
1146 }
1147 RSTR_SET_LEN(str, slen-plen);
1148 return self;
1149}
1150
1151/*
1152 * call-seq:
1153 * str.delete_suffix(suffix) -> new_str
1154 *
1155 * Returns a copy of <i>str</i> with leading <code>suffix</code> deleted.
1156 *
1157 * "hello".delete_suffix("hel") #=> "lo"
1158 * "hello".delete_suffix("llo") #=> "hello"
1159 */
1160static mrb_value
1161mrb_str_del_suffix(mrb_state *mrb, mrb_value self)
1162{
1163 mrb_int plen, slen;
1164 char *ptr;
1165
1166 mrb_get_args(mrb, "s", &ptr, &plen);
1167 slen = RSTRING_LEN(self);
1168 if (plen > slen) return mrb_str_dup(mrb, self);
1169 if (memcmp(RSTRING_PTR(self)+slen-plen, ptr, plen) != 0)
1170 return mrb_str_dup(mrb, self);
1171 return mrb_str_substr(mrb, self, 0, slen-plen);
1172}
1173
1174static mrb_value
1175mrb_str_lines(mrb_state *mrb, mrb_value self)
1176{
1177 mrb_value result;
1178 int ai;
1179 mrb_int len;
1180 char *b = RSTRING_PTR(self);
1181 char *p = b, *t;
1182 char *e = b + RSTRING_LEN(self);
1183
1184 result = mrb_ary_new(mrb);
1185 ai = mrb_gc_arena_save(mrb);
1186 while (p < e) {
1187 t = p;
1188 while (p < e && *p != '\n') p++;
1189 if (*p == '\n') p++;
1190 len = (mrb_int) (p - t);
1191 mrb_ary_push(mrb, result, mrb_str_new(mrb, t, len));
1192 mrb_gc_arena_restore(mrb, ai);
1193 }
1194 return result;
1195}
1196
1197void
1198mrb_mruby_string_ext_gem_init(mrb_state* mrb)
1199{
1200 struct RClass * s = mrb->string_class;
1201
1202 mrb_define_method(mrb, s, "dump", mrb_str_dump, MRB_ARGS_NONE());
1203 mrb_define_method(mrb, s, "swapcase!", mrb_str_swapcase_bang, MRB_ARGS_NONE());
1204 mrb_define_method(mrb, s, "swapcase", mrb_str_swapcase, MRB_ARGS_NONE());
1205 mrb_define_method(mrb, s, "concat", mrb_str_concat_m, MRB_ARGS_REQ(1));
1206 mrb_define_method(mrb, s, "<<", mrb_str_concat_m, MRB_ARGS_REQ(1));
1207 mrb_define_method(mrb, s, "count", mrb_str_count, MRB_ARGS_REQ(1));
1208 mrb_define_method(mrb, s, "tr", mrb_str_tr, MRB_ARGS_REQ(2));
1209 mrb_define_method(mrb, s, "tr!", mrb_str_tr_bang, MRB_ARGS_REQ(2));
1210 mrb_define_method(mrb, s, "tr_s", mrb_str_tr_s, MRB_ARGS_REQ(2));
1211 mrb_define_method(mrb, s, "tr_s!", mrb_str_tr_s_bang, MRB_ARGS_REQ(2));
1212 mrb_define_method(mrb, s, "squeeze", mrb_str_squeeze, MRB_ARGS_OPT(1));
1213 mrb_define_method(mrb, s, "squeeze!", mrb_str_squeeze_bang, MRB_ARGS_OPT(1));
1214 mrb_define_method(mrb, s, "delete", mrb_str_delete, MRB_ARGS_REQ(1));
1215 mrb_define_method(mrb, s, "delete!", mrb_str_delete_bang, MRB_ARGS_REQ(1));
1216 mrb_define_method(mrb, s, "start_with?", mrb_str_start_with, MRB_ARGS_REST());
1217 mrb_define_method(mrb, s, "end_with?", mrb_str_end_with, MRB_ARGS_REST());
1218 mrb_define_method(mrb, s, "hex", mrb_str_hex, MRB_ARGS_NONE());
1219 mrb_define_method(mrb, s, "oct", mrb_str_oct, MRB_ARGS_NONE());
1220 mrb_define_method(mrb, s, "chr", mrb_str_chr, MRB_ARGS_NONE());
1221 mrb_define_method(mrb, s, "succ", mrb_str_succ, MRB_ARGS_NONE());
1222 mrb_define_method(mrb, s, "succ!", mrb_str_succ_bang, MRB_ARGS_NONE());
1223 mrb_define_method(mrb, s, "next", mrb_str_succ, MRB_ARGS_NONE());
1224 mrb_define_method(mrb, s, "next!", mrb_str_succ_bang, MRB_ARGS_NONE());
1225 mrb_define_method(mrb, s, "ord", mrb_str_ord, MRB_ARGS_NONE());
1226 mrb_define_method(mrb, s, "delete_prefix!", mrb_str_del_prefix_bang, MRB_ARGS_REQ(1));
1227 mrb_define_method(mrb, s, "delete_prefix", mrb_str_del_prefix, MRB_ARGS_REQ(1));
1228 mrb_define_method(mrb, s, "delete_suffix!", mrb_str_del_suffix_bang, MRB_ARGS_REQ(1));
1229 mrb_define_method(mrb, s, "delete_suffix", mrb_str_del_suffix, MRB_ARGS_REQ(1));
1230
1231 mrb_define_method(mrb, s, "__lines", mrb_str_lines, MRB_ARGS_NONE());
1232
1233 mrb_define_method(mrb, mrb_module_get(mrb, "Integral"), "chr", mrb_int_chr, MRB_ARGS_OPT(1));
1234}
1235
1236void
1237mrb_mruby_string_ext_gem_final(mrb_state* mrb)
1238{
1239}
Note: See TracBrowser for help on using the repository browser.