source: EcnlProtoTool/trunk/mruby-2.1.1/src/string.c@ 439

Last change on this file since 439 was 439, checked in by coas-nagasima, 4 years ago

mrubyを2.1.1に更新

  • Property svn:eol-style set to native
  • Property svn:mime-type set to text/x-csrc;charset=UTF-8
File size: 80.4 KB
Line 
1/*
2** string.c - String class
3**
4** See Copyright Notice in mruby.h
5*/
6
7#ifdef _MSC_VER
8# define _CRT_NONSTDC_NO_DEPRECATE
9#endif
10
11#ifndef MRB_WITHOUT_FLOAT
12#include <float.h>
13#include <math.h>
14#endif
15#include <limits.h>
16#include <stddef.h>
17#include <stdlib.h>
18#include <string.h>
19#include <mruby.h>
20#include <mruby/array.h>
21#include <mruby/class.h>
22#include <mruby/range.h>
23#include <mruby/string.h>
24#include <mruby/numeric.h>
25
26typedef struct mrb_shared_string {
27 int refcnt;
28 mrb_ssize capa;
29 char *ptr;
30} mrb_shared_string;
31
/* Digit characters for radix conversion: index i holds the glyph for value i (0..35). */
const char mrb_digitmap[] =
  "0123456789"
  "abcdefghijklmnopqrstuvwxyz";
33
/* Allocate a new MRB_TT_STRING object whose class is mrb->string_class. */
#define mrb_obj_alloc_string(mrb) \
  ((struct RString*)mrb_obj_alloc((mrb), MRB_TT_STRING, (mrb)->string_class))
35
36static struct RString*
37str_init_normal_capa(mrb_state *mrb, struct RString *s,
38 const char *p, size_t len, size_t capa)
39{
40 char *dst = (char *)mrb_malloc(mrb, capa + 1);
41 if (p) memcpy(dst, p, len);
42 dst[len] = '\0';
43 s->as.heap.ptr = dst;
44 s->as.heap.len = (mrb_ssize)len;
45 s->as.heap.aux.capa = (mrb_ssize)capa;
46 RSTR_UNSET_TYPE_FLAG(s);
47 return s;
48}
49
50static struct RString*
51str_init_normal(mrb_state *mrb, struct RString *s, const char *p, size_t len)
52{
53 return str_init_normal_capa(mrb, s, p, len, len);
54}
55
56static struct RString*
57str_init_embed(struct RString *s, const char *p, size_t len)
58{
59 if (p) memcpy(RSTR_EMBED_PTR(s), p, len);
60 RSTR_EMBED_PTR(s)[len] = '\0';
61 RSTR_SET_TYPE_FLAG(s, EMBED);
62 RSTR_SET_EMBED_LEN(s, len);
63 return s;
64}
65
66static struct RString*
67str_init_nofree(struct RString *s, const char *p, size_t len)
68{
69 s->as.heap.ptr = (char *)p;
70 s->as.heap.len = (mrb_ssize)len;
71 s->as.heap.aux.capa = 0; /* nofree */
72 RSTR_SET_TYPE_FLAG(s, NOFREE);
73 return s;
74}
75
76static struct RString*
77str_init_shared(mrb_state *mrb, const struct RString *orig, struct RString *s, mrb_shared_string *shared)
78{
79 if (shared) {
80 shared->refcnt++;
81 }
82 else {
83 shared = (mrb_shared_string *)mrb_malloc(mrb, sizeof(mrb_shared_string));
84 shared->refcnt = 1;
85 shared->ptr = orig->as.heap.ptr;
86 shared->capa = orig->as.heap.aux.capa;
87 }
88 s->as.heap.ptr = orig->as.heap.ptr;
89 s->as.heap.len = orig->as.heap.len;
90 s->as.heap.aux.shared = shared;
91 RSTR_SET_TYPE_FLAG(s, SHARED);
92 return s;
93}
94
95static struct RString*
96str_init_fshared(const struct RString *orig, struct RString *s, struct RString *fshared)
97{
98 s->as.heap.ptr = orig->as.heap.ptr;
99 s->as.heap.len = orig->as.heap.len;
100 s->as.heap.aux.fshared = fshared;
101 RSTR_SET_TYPE_FLAG(s, FSHARED);
102 return s;
103}
104
105static struct RString*
106str_init_modifiable(mrb_state *mrb, struct RString *s, const char *p, size_t len)
107{
108 if (RSTR_EMBEDDABLE_P(len)) {
109 return str_init_embed(s, p, len);
110 }
111 else {
112 return str_init_normal(mrb, s, p, len);
113 }
114}
115
116static struct RString*
117str_new_static(mrb_state *mrb, const char *p, size_t len)
118{
119 if (RSTR_EMBEDDABLE_P(len)) {
120 return str_init_embed(mrb_obj_alloc_string(mrb), p, len);
121 }
122 if (len >= MRB_SSIZE_MAX) {
123 mrb_raise(mrb, E_ARGUMENT_ERROR, "string size too big");
124 }
125 return str_init_nofree(mrb_obj_alloc_string(mrb), p, len);
126}
127
128static struct RString*
129str_new(mrb_state *mrb, const char *p, size_t len)
130{
131 if (RSTR_EMBEDDABLE_P(len)) {
132 return str_init_embed(mrb_obj_alloc_string(mrb), p, len);
133 }
134 if (len >= MRB_SSIZE_MAX) {
135 mrb_raise(mrb, E_ARGUMENT_ERROR, "string size too big");
136 }
137 if (p && mrb_ro_data_p(p)) {
138 return str_init_nofree(mrb_obj_alloc_string(mrb), p, len);
139 }
140 return str_init_normal(mrb, mrb_obj_alloc_string(mrb), p, len);
141}
142
143static inline void
144str_with_class(struct RString *s, mrb_value obj)
145{
146 s->c = mrb_str_ptr(obj)->c;
147}
148
149static mrb_value
150mrb_str_new_empty(mrb_state *mrb, mrb_value str)
151{
152 struct RString *s = str_new(mrb, 0, 0);
153
154 str_with_class(s, str);
155 return mrb_obj_value(s);
156}
157
158MRB_API mrb_value
159mrb_str_new_capa(mrb_state *mrb, size_t capa)
160{
161 struct RString *s;
162
163 if (RSTR_EMBEDDABLE_P(capa)) {
164 s = str_init_embed(mrb_obj_alloc_string(mrb), NULL, 0);
165 }
166 else if (capa >= MRB_SSIZE_MAX) {
167 mrb_raise(mrb, E_ARGUMENT_ERROR, "string capacity size too big");
168 /* not reached */
169 s = NULL;
170 }
171 else {
172 s = str_init_normal_capa(mrb, mrb_obj_alloc_string(mrb), NULL, 0, capa);
173 }
174
175 return mrb_obj_value(s);
176}
177
/* Smallest capacity mrb_str_buf_new() asks for; may be predefined by the build. */
#if !defined(MRB_STR_BUF_MIN_SIZE)
# define MRB_STR_BUF_MIN_SIZE 128
#endif
181
182MRB_API mrb_value
183mrb_str_buf_new(mrb_state *mrb, size_t capa)
184{
185 if (capa < MRB_STR_BUF_MIN_SIZE) {
186 capa = MRB_STR_BUF_MIN_SIZE;
187 }
188 return mrb_str_new_capa(mrb, capa);
189}
190
191static void
192resize_capa(mrb_state *mrb, struct RString *s, size_t capacity)
193{
194#if SIZE_MAX > MRB_SSIZE_MAX
195 mrb_assert(capacity < MRB_SSIZE_MAX);
196#endif
197 if (RSTR_EMBED_P(s)) {
198 if (!RSTR_EMBEDDABLE_P(capacity)) {
199 str_init_normal_capa(mrb, s, RSTR_EMBED_PTR(s), RSTR_EMBED_LEN(s), capacity);
200 }
201 }
202 else {
203 s->as.heap.ptr = (char*)mrb_realloc(mrb, RSTR_PTR(s), capacity+1);
204 s->as.heap.aux.capa = (mrb_ssize)capacity;
205 }
206}
207
208MRB_API mrb_value
209mrb_str_new(mrb_state *mrb, const char *p, size_t len)
210{
211 return mrb_obj_value(str_new(mrb, p, len));
212}
213
214MRB_API mrb_value
215mrb_str_new_cstr(mrb_state *mrb, const char *p)
216{
217 struct RString *s;
218 size_t len;
219
220 if (p) {
221 len = strlen(p);
222 }
223 else {
224 len = 0;
225 }
226
227 s = str_new(mrb, p, len);
228
229 return mrb_obj_value(s);
230}
231
232MRB_API mrb_value
233mrb_str_new_static(mrb_state *mrb, const char *p, size_t len)
234{
235 struct RString *s = str_new_static(mrb, p, len);
236 return mrb_obj_value(s);
237}
238
239static void
240str_decref(mrb_state *mrb, mrb_shared_string *shared)
241{
242 shared->refcnt--;
243 if (shared->refcnt == 0) {
244 mrb_free(mrb, shared->ptr);
245 mrb_free(mrb, shared);
246 }
247}
248
249static void
250str_modify_keep_ascii(mrb_state *mrb, struct RString *s)
251{
252 if (RSTR_SHARED_P(s)) {
253 mrb_shared_string *shared = s->as.heap.aux.shared;
254
255 if (shared->refcnt == 1 && s->as.heap.ptr == shared->ptr) {
256 s->as.heap.aux.capa = shared->capa;
257 s->as.heap.ptr[s->as.heap.len] = '\0';
258 RSTR_UNSET_SHARED_FLAG(s);
259 mrb_free(mrb, shared);
260 }
261 else {
262 str_init_modifiable(mrb, s, s->as.heap.ptr, (size_t)s->as.heap.len);
263 str_decref(mrb, shared);
264 }
265 }
266 else if (RSTR_NOFREE_P(s) || RSTR_FSHARED_P(s)) {
267 str_init_modifiable(mrb, s, s->as.heap.ptr, (size_t)s->as.heap.len);
268 }
269}
270
271static void
272check_null_byte(mrb_state *mrb, mrb_value str)
273{
274 mrb_to_str(mrb, str);
275 if (memchr(RSTRING_PTR(str), '\0', RSTRING_LEN(str))) {
276 mrb_raise(mrb, E_ARGUMENT_ERROR, "string contains null byte");
277 }
278}
279
280void
281mrb_gc_free_str(mrb_state *mrb, struct RString *str)
282{
283 if (RSTR_EMBED_P(str))
284 /* no code */;
285 else if (RSTR_SHARED_P(str))
286 str_decref(mrb, str->as.heap.aux.shared);
287 else if (!RSTR_NOFREE_P(str) && !RSTR_FSHARED_P(str))
288 mrb_free(mrb, str->as.heap.ptr);
289}
290
291#ifdef MRB_UTF8_STRING
/*
 * Sequence length implied by each possible first byte.  Bytes that cannot
 * start a multi-byte sequence in this table (including continuation bytes
 * 0x80-0xBF and 0xF5-0xFF) map to 1.
 */
static const char utf8len_codepage[256] =
{
  /* 0x00-0x1F */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0x20-0x3F */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0x40-0x5F */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0x60-0x7F */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0x80-0x9F */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0xA0-0xBF */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0xC0-0xDF: two-byte lead bytes */
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  /* 0xE0-0xEF: three bytes; 0xF0-0xF4: four bytes; 0xF5-0xFF: 1 */
  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1,
};
303
304mrb_int
305mrb_utf8len(const char* p, const char* e)
306{
307 mrb_int len;
308 mrb_int i;
309
310 if ((unsigned char)*p < 0x80) return 1;
311 len = utf8len_codepage[(unsigned char)*p];
312 if (len == 1) return 1;
313 if (len > e - p) return 1;
314 for (i = 1; i < len; ++i)
315 if ((p[i] & 0xc0) != 0x80)
316 return 1;
317 return len;
318}
319
320mrb_int
321mrb_utf8_strlen(const char *str, mrb_int byte_len)
322{
323 mrb_int total = 0;
324 const char *p = str;
325 const char *e = p + byte_len;
326
327 while (p < e) {
328 p += mrb_utf8len(p, e);
329 total++;
330 }
331 return total;
332}
333
334static mrb_int
335utf8_strlen(mrb_value str)
336{
337 struct RString *s = mrb_str_ptr(str);
338 mrb_int byte_len = RSTR_LEN(s);
339
340 if (RSTR_ASCII_P(s)) {
341 return byte_len;
342 }
343 else {
344 mrb_int utf8_len = mrb_utf8_strlen(RSTR_PTR(s), byte_len);
345 if (byte_len == utf8_len) RSTR_SET_ASCII_FLAG(s);
346 return utf8_len;
347 }
348}
349
350#define RSTRING_CHAR_LEN(s) utf8_strlen(s)
351
352/* map character index to byte offset index */
353static mrb_int
354chars2bytes(mrb_value s, mrb_int off, mrb_int idx)
355{
356 if (RSTR_ASCII_P(mrb_str_ptr(s))) {
357 return idx;
358 }
359 else {
360 mrb_int i, b, n;
361 const char *p = RSTRING_PTR(s) + off;
362 const char *e = RSTRING_END(s);
363
364 for (b=i=0; p<e && i<idx; i++) {
365 n = mrb_utf8len(p, e);
366 b += n;
367 p += n;
368 }
369 return b;
370 }
371}
372
373/* map byte offset to character index */
374static mrb_int
375bytes2chars(char *p, mrb_int len, mrb_int bi)
376{
377 const char *e = p + (size_t)len;
378 const char *pivot = p + bi;
379 mrb_int i;
380
381 for (i = 0; p < pivot; i ++) {
382 p += mrb_utf8len(p, e);
383 }
384 if (p != pivot) return -1;
385 return i;
386}
387
/*
 * Move `ptr` back to the start of the character that contains it.
 *
 * When `ptr` lies strictly inside [beg, end) and points at a UTF-8
 * continuation byte (10xxxxxx), up to three preceding bytes are examined for
 * a non-continuation byte.  If the sequence starting there (as measured by
 * mrb_utf8len()) extends past `ptr`, that start is returned.  In every other
 * case `ptr` is returned unchanged.
 *
 * The bounds test uses `&&`: the previous `ptr > beg || ptr < end` was true
 * for every pointer, so `*ptr` was read even when `ptr == end`.
 */
static const char *
char_adjust(const char *beg, const char *end, const char *ptr)
{
  if (ptr > beg && ptr < end && (*ptr & 0xc0) == 0x80) {
    const int utf8_adjust_max = 3;
    const char *p;

    /* never look further back than a lead byte could be */
    if (ptr - beg > utf8_adjust_max) {
      beg = ptr - utf8_adjust_max;
    }

    p = ptr;
    while (p > beg) {
      p --;
      if ((*p & 0xc0) != 0x80) {
        int clen = mrb_utf8len(p, end);
        if (clen > ptr - p) return p;
        break;
      }
    }
  }

  return ptr;
}
412
413static const char *
414char_backtrack(const char *ptr, const char *end)
415{
416 if (ptr < end) {
417 const int utf8_bytelen_max = 4;
418 const char *p;
419
420 if (end - ptr > utf8_bytelen_max) {
421 ptr = end - utf8_bytelen_max;
422 }
423
424 p = end;
425 while (p > ptr) {
426 p --;
427 if ((*p & 0xc0) != 0x80) {
428 int clen = utf8len_codepage[(unsigned char)*p];
429 if (clen == end - p) { return p; }
430 break;
431 }
432 }
433 }
434
435 return end - 1;
436}
437
438static mrb_int
439str_index_str_by_char_search(mrb_state *mrb, const char *p, const char *pend, const char *s, const mrb_int slen, mrb_int off)
440{
441 /* Based on Quick Search algorithm (Boyer-Moore-Horspool algorithm) */
442
443 ptrdiff_t qstable[1 << CHAR_BIT];
444
445 /* Preprocessing */
446 {
447 mrb_int i;
448
449 for (i = 0; i < 1 << CHAR_BIT; i ++) {
450 qstable[i] = slen;
451 }
452 for (i = 0; i < slen; i ++) {
453 qstable[(unsigned char)s[i]] = slen - (i + 1);
454 }
455 }
456
457 /* Searching */
458 while (p < pend && pend - p >= slen) {
459 const char *pivot;
460
461 if (memcmp(p, s, slen) == 0) {
462 return off;
463 }
464
465 pivot = p + qstable[(unsigned char)p[slen - 1]];
466 if (pivot >= pend || pivot < p /* overflowed */) { return -1; }
467
468 do {
469 p += mrb_utf8len(p, pend);
470 off ++;
471 } while (p < pivot);
472 }
473
474 return -1;
475}
476
477static mrb_int
478str_index_str_by_char(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos)
479{
480 const char *p = RSTRING_PTR(str);
481 const char *pend = p + RSTRING_LEN(str);
482 const char *s = RSTRING_PTR(sub);
483 const mrb_int slen = RSTRING_LEN(sub);
484 mrb_int off = pos;
485
486 for (; pos > 0; pos --) {
487 if (pend - p < 1) { return -1; }
488 p += mrb_utf8len(p, pend);
489 }
490
491 if (slen < 1) { return off; }
492
493 return str_index_str_by_char_search(mrb, p, pend, s, slen, off);
494}
495
/*
 * Return nil from the enclosing function when `pos` is negative.
 * The argument is parenthesized so that expressions containing operators of
 * lower precedence than `<` are compared as a whole.
 */
#define BYTES_ALIGN_CHECK(pos) if ((pos) < 0) return mrb_nil_value();
497#else
/*
 * Byte-oriented stand-ins for the character-aware helpers: characters and
 * bytes are treated as the same unit, so the conversions pass their index
 * argument through and no alignment check is needed.
 */
#define RSTRING_CHAR_LEN(s)                        RSTRING_LEN(s)
#define chars2bytes(p, off, ci)                    (ci)
#define bytes2chars(p, end, bi)                    (bi)
#define char_adjust(beg, end, ptr)                 (ptr)
#define char_backtrack(ptr, end)                   ((end) - 1)
#define BYTES_ALIGN_CHECK(pos)
#define str_index_str_by_char(mrb, str, sub, pos)  str_index_str(mrb, str, sub, pos)
505#endif
506
/*
 * mrb_memsearch_qs() uses its memchr()-based scan when pattern length plus
 * text length is below this value; may be predefined by the build.
 */
#if !defined(MRB_QS_SHORT_STRING_LENGTH)
# define MRB_QS_SHORT_STRING_LENGTH 2048
#endif
510
511static inline mrb_int
512mrb_memsearch_qs(const unsigned char *xs, mrb_int m, const unsigned char *ys, mrb_int n)
513{
514 if (n + m < MRB_QS_SHORT_STRING_LENGTH) {
515 const unsigned char *y = ys;
516 const unsigned char *ye = ys+n-m+1;
517
518 for (;;) {
519 y = (const unsigned char*)memchr(y, xs[0], (size_t)(ye-y));
520 if (y == NULL) return -1;
521 if (memcmp(xs, y, m) == 0) {
522 return (mrb_int)(y - ys);
523 }
524 y++;
525 }
526 return -1;
527 }
528 else {
529 const unsigned char *x = xs, *xe = xs + m;
530 const unsigned char *y = ys;
531 int i;
532 ptrdiff_t qstable[256];
533
534 /* Preprocessing */
535 for (i = 0; i < 256; ++i)
536 qstable[i] = m + 1;
537 for (; x < xe; ++x)
538 qstable[*x] = xe - x;
539 /* Searching */
540 for (; y + m <= ys + n; y += *(qstable + y[m])) {
541 if (*xs == *y && memcmp(xs, y, m) == 0)
542 return (mrb_int)(y - ys);
543 }
544 return -1;
545 }
546}
547
548static mrb_int
549mrb_memsearch(const void *x0, mrb_int m, const void *y0, mrb_int n)
550{
551 const unsigned char *x = (const unsigned char *)x0, *y = (const unsigned char *)y0;
552
553 if (m > n) return -1;
554 else if (m == n) {
555 return memcmp(x0, y0, m) == 0 ? 0 : -1;
556 }
557 else if (m < 1) {
558 return 0;
559 }
560 else if (m == 1) {
561 const unsigned char *ys = (const unsigned char *)memchr(y, *x, n);
562
563 if (ys)
564 return (mrb_int)(ys - y);
565 else
566 return -1;
567 }
568 return mrb_memsearch_qs((const unsigned char *)x0, m, (const unsigned char *)y0, n);
569}
570
571static void
572str_share(mrb_state *mrb, struct RString *orig, struct RString *s)
573{
574 size_t len = (size_t)orig->as.heap.len;
575
576 mrb_assert(!RSTR_EMBED_P(orig));
577 if (RSTR_NOFREE_P(orig)) {
578 str_init_nofree(s, orig->as.heap.ptr, len);
579 }
580 else if (RSTR_SHARED_P(orig)) {
581 str_init_shared(mrb, orig, s, orig->as.heap.aux.shared);
582 }
583 else if (RSTR_FSHARED_P(orig)) {
584 str_init_fshared(orig, s, orig->as.heap.aux.fshared);
585 }
586 else if (mrb_frozen_p(orig) && !RSTR_POOL_P(orig)) {
587 str_init_fshared(orig, s, orig);
588 }
589 else {
590 if (orig->as.heap.aux.capa > orig->as.heap.len) {
591 orig->as.heap.ptr = (char *)mrb_realloc(mrb, orig->as.heap.ptr, len+1);
592 orig->as.heap.aux.capa = (mrb_ssize)len;
593 }
594 str_init_shared(mrb, orig, s, NULL);
595 str_init_shared(mrb, orig, orig, s->as.heap.aux.shared);
596 }
597}
598
599mrb_value
600mrb_str_pool(mrb_state *mrb, const char *p, mrb_int len, mrb_bool nofree)
601{
602 struct RString *s = (struct RString *)mrb_malloc(mrb, sizeof(struct RString));
603
604 s->tt = MRB_TT_STRING;
605 s->c = mrb->string_class;
606 s->flags = 0;
607
608 if (RSTR_EMBEDDABLE_P(len)) {
609 str_init_embed(s, p, len);
610 }
611 else if (nofree) {
612 str_init_nofree(s, p, len);
613 }
614 else {
615 str_init_normal(mrb, s, p, len);
616 }
617 RSTR_SET_POOL_FLAG(s);
618 MRB_SET_FROZEN_FLAG(s);
619 return mrb_obj_value(s);
620}
621
622mrb_value
623mrb_str_byte_subseq(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len)
624{
625 struct RString *orig, *s;
626
627 orig = mrb_str_ptr(str);
628 s = mrb_obj_alloc_string(mrb);
629 if (RSTR_EMBEDDABLE_P(len)) {
630 str_init_embed(s, RSTR_PTR(orig)+beg, len);
631 }
632 else {
633 str_share(mrb, orig, s);
634 s->as.heap.ptr += (mrb_ssize)beg;
635 s->as.heap.len = (mrb_ssize)len;
636 }
637 RSTR_COPY_ASCII_FLAG(s, orig);
638 return mrb_obj_value(s);
639}
640
641static void
642str_range_to_bytes(mrb_value str, mrb_int *pos, mrb_int *len)
643{
644 *pos = chars2bytes(str, 0, *pos);
645 *len = chars2bytes(str, *pos, *len);
646}
#ifdef MRB_UTF8_STRING
/*
 * Substring by character position and count: the range is converted to
 * bytes and passed to mrb_str_byte_subseq().
 */
static inline mrb_value
str_subseq(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len)
{
  mrb_int byte_beg = beg;
  mrb_int byte_len = len;

  str_range_to_bytes(str, &byte_beg, &byte_len);
  return mrb_str_byte_subseq(mrb, str, byte_beg, byte_len);
}
#else
/* without MRB_UTF8_STRING the range is passed straight through as bytes */
#define str_subseq(mrb, str, beg, len) mrb_str_byte_subseq(mrb, str, beg, len)
#endif
657
658mrb_bool
659mrb_str_beg_len(mrb_int str_len, mrb_int *begp, mrb_int *lenp)
660{
661 if (str_len < *begp || *lenp < 0) return FALSE;
662 if (*begp < 0) {
663 *begp += str_len;
664 if (*begp < 0) return FALSE;
665 }
666 if (*lenp > str_len - *begp)
667 *lenp = str_len - *begp;
668 if (*lenp <= 0) {
669 *lenp = 0;
670 }
671 return TRUE;
672}
673
674static mrb_value
675str_substr(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len)
676{
677 return mrb_str_beg_len(RSTRING_CHAR_LEN(str), &beg, &len) ?
678 str_subseq(mrb, str, beg, len) : mrb_nil_value();
679}
680
681MRB_API mrb_int
682mrb_str_index(mrb_state *mrb, mrb_value str, const char *sptr, mrb_int slen, mrb_int offset)
683{
684 mrb_int pos;
685 char *s;
686 mrb_int len;
687
688 len = RSTRING_LEN(str);
689 if (offset < 0) {
690 offset += len;
691 if (offset < 0) return -1;
692 }
693 if (len - offset < slen) return -1;
694 s = RSTRING_PTR(str);
695 if (offset) {
696 s += offset;
697 }
698 if (slen == 0) return offset;
699 /* need proceed one character at a time */
700 len = RSTRING_LEN(str) - offset;
701 pos = mrb_memsearch(sptr, slen, s, len);
702 if (pos < 0) return pos;
703 return pos + offset;
704}
705
706static mrb_int
707str_index_str(mrb_state *mrb, mrb_value str, mrb_value str2, mrb_int offset)
708{
709 const char *ptr;
710 mrb_int len;
711
712 ptr = RSTRING_PTR(str2);
713 len = RSTRING_LEN(str2);
714
715 return mrb_str_index(mrb, str, ptr, len, offset);
716}
717
718static mrb_value
719str_replace(mrb_state *mrb, struct RString *s1, struct RString *s2)
720{
721 size_t len;
722
723 mrb_check_frozen(mrb, s1);
724 if (s1 == s2) return mrb_obj_value(s1);
725 RSTR_COPY_ASCII_FLAG(s1, s2);
726 if (RSTR_SHARED_P(s1)) {
727 str_decref(mrb, s1->as.heap.aux.shared);
728 }
729 else if (!RSTR_EMBED_P(s1) && !RSTR_NOFREE_P(s1) && !RSTR_FSHARED_P(s1)
730 && s1->as.heap.ptr) {
731 mrb_free(mrb, s1->as.heap.ptr);
732 }
733
734 len = (size_t)RSTR_LEN(s2);
735 if (RSTR_EMBEDDABLE_P(len)) {
736 str_init_embed(s1, RSTR_PTR(s2), len);
737 }
738 else {
739 str_share(mrb, s2, s1);
740 }
741
742 return mrb_obj_value(s1);
743}
744
745static mrb_int
746str_rindex(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos)
747{
748 const char *s, *sbeg, *t;
749 struct RString *ps = mrb_str_ptr(str);
750 mrb_int len = RSTRING_LEN(sub);
751
752 /* substring longer than string */
753 if (RSTR_LEN(ps) < len) return -1;
754 if (RSTR_LEN(ps) - pos < len) {
755 pos = RSTR_LEN(ps) - len;
756 }
757 sbeg = RSTR_PTR(ps);
758 s = RSTR_PTR(ps) + pos;
759 t = RSTRING_PTR(sub);
760 if (len) {
761 s = char_adjust(sbeg, sbeg + RSTR_LEN(ps), s);
762 while (sbeg <= s) {
763 if (memcmp(s, t, len) == 0) {
764 return (mrb_int)(s - RSTR_PTR(ps));
765 }
766 s = char_backtrack(sbeg, s);
767 }
768 return -1;
769 }
770 else {
771 return pos;
772 }
773}
774
775MRB_API mrb_int
776mrb_str_strlen(mrb_state *mrb, struct RString *s)
777{
778 mrb_int i, max = RSTR_LEN(s);
779 char *p = RSTR_PTR(s);
780
781 if (!p) return 0;
782 for (i=0; i<max; i++) {
783 if (p[i] == '\0') {
784 mrb_raise(mrb, E_ARGUMENT_ERROR, "string contains null byte");
785 }
786 }
787 return max;
788}
789
790#ifdef _WIN32
791#include <windows.h>
792
793char*
794mrb_utf8_from_locale(const char *str, int len)
795{
796 wchar_t* wcsp;
797 char* mbsp;
798 int mbssize, wcssize;
799
800 if (len == 0)
801 return strdup("");
802 if (len == -1)
803 len = (int)strlen(str);
804 wcssize = MultiByteToWideChar(GetACP(), 0, str, len, NULL, 0);
805 wcsp = (wchar_t*) malloc((wcssize + 1) * sizeof(wchar_t));
806 if (!wcsp)
807 return NULL;
808 wcssize = MultiByteToWideChar(GetACP(), 0, str, len, wcsp, wcssize + 1);
809 wcsp[wcssize] = 0;
810
811 mbssize = WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR) wcsp, -1, NULL, 0, NULL, NULL);
812 mbsp = (char*) malloc((mbssize + 1));
813 if (!mbsp) {
814 free(wcsp);
815 return NULL;
816 }
817 mbssize = WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR) wcsp, -1, mbsp, mbssize, NULL, NULL);
818 mbsp[mbssize] = 0;
819 free(wcsp);
820 return mbsp;
821}
822
823char*
824mrb_locale_from_utf8(const char *utf8, int len)
825{
826 wchar_t* wcsp;
827 char* mbsp;
828 int mbssize, wcssize;
829
830 if (len == 0)
831 return strdup("");
832 if (len == -1)
833 len = (int)strlen(utf8);
834 wcssize = MultiByteToWideChar(CP_UTF8, 0, utf8, len, NULL, 0);
835 wcsp = (wchar_t*) malloc((wcssize + 1) * sizeof(wchar_t));
836 if (!wcsp)
837 return NULL;
838 wcssize = MultiByteToWideChar(CP_UTF8, 0, utf8, len, wcsp, wcssize + 1);
839 wcsp[wcssize] = 0;
840 mbssize = WideCharToMultiByte(GetACP(), 0, (LPCWSTR) wcsp, -1, NULL, 0, NULL, NULL);
841 mbsp = (char*) malloc((mbssize + 1));
842 if (!mbsp) {
843 free(wcsp);
844 return NULL;
845 }
846 mbssize = WideCharToMultiByte(GetACP(), 0, (LPCWSTR) wcsp, -1, mbsp, mbssize, NULL, NULL);
847 mbsp[mbssize] = 0;
848 free(wcsp);
849 return mbsp;
850}
851#endif
852
853MRB_API void
854mrb_str_modify_keep_ascii(mrb_state *mrb, struct RString *s)
855{
856 mrb_check_frozen(mrb, s);
857 str_modify_keep_ascii(mrb, s);
858}
859
860MRB_API void
861mrb_str_modify(mrb_state *mrb, struct RString *s)
862{
863 mrb_str_modify_keep_ascii(mrb, s);
864 RSTR_UNSET_ASCII_FLAG(s);
865}
866
867MRB_API mrb_value
868mrb_str_resize(mrb_state *mrb, mrb_value str, mrb_int len)
869{
870 mrb_int slen;
871 struct RString *s = mrb_str_ptr(str);
872
873 if (len < 0) {
874 mrb_raise(mrb, E_ARGUMENT_ERROR, "negative (or overflowed) string size");
875 }
876 mrb_str_modify(mrb, s);
877 slen = RSTR_LEN(s);
878 if (len != slen) {
879 if (slen < len || slen - len > 256) {
880 resize_capa(mrb, s, len);
881 }
882 RSTR_SET_LEN(s, len);
883 RSTR_PTR(s)[len] = '\0'; /* sentinel */
884 }
885 return str;
886}
887
888MRB_API char*
889mrb_str_to_cstr(mrb_state *mrb, mrb_value str0)
890{
891 struct RString *s;
892
893 check_null_byte(mrb, str0);
894 s = str_new(mrb, RSTRING_PTR(str0), RSTRING_LEN(str0));
895 return RSTR_PTR(s);
896}
897
898MRB_API void
899mrb_str_concat(mrb_state *mrb, mrb_value self, mrb_value other)
900{
901 other = mrb_str_to_str(mrb, other);
902 mrb_str_cat_str(mrb, self, other);
903}
904
905MRB_API mrb_value
906mrb_str_plus(mrb_state *mrb, mrb_value a, mrb_value b)
907{
908 struct RString *s = mrb_str_ptr(a);
909 struct RString *s2 = mrb_str_ptr(b);
910 struct RString *t;
911
912 t = str_new(mrb, 0, RSTR_LEN(s) + RSTR_LEN(s2));
913 memcpy(RSTR_PTR(t), RSTR_PTR(s), RSTR_LEN(s));
914 memcpy(RSTR_PTR(t) + RSTR_LEN(s), RSTR_PTR(s2), RSTR_LEN(s2));
915
916 return mrb_obj_value(t);
917}
918
919/* 15.2.10.5.2 */
920
921/*
922 * call-seq:
923 * str + other_str -> new_str
924 *
925 * Concatenation---Returns a new <code>String</code> containing
926 * <i>other_str</i> concatenated to <i>str</i>.
927 *
928 * "Hello from " + self.to_s #=> "Hello from main"
929 */
930static mrb_value
931mrb_str_plus_m(mrb_state *mrb, mrb_value self)
932{
933 mrb_value str;
934
935 mrb_get_args(mrb, "S", &str);
936 return mrb_str_plus(mrb, self, str);
937}
938
939/* 15.2.10.5.26 */
940/* 15.2.10.5.33 */
941/*
942 * call-seq:
943 * "abcd".size => int
944 *
945 * Returns the length of string.
946 */
947static mrb_value
948mrb_str_size(mrb_state *mrb, mrb_value self)
949{
950 mrb_int len = RSTRING_CHAR_LEN(self);
951 return mrb_fixnum_value(len);
952}
953
954static mrb_value
955mrb_str_bytesize(mrb_state *mrb, mrb_value self)
956{
957 mrb_int len = RSTRING_LEN(self);
958 return mrb_fixnum_value(len);
959}
960
961/* 15.2.10.5.1 */
962/*
963 * call-seq:
964 * str * integer => new_str
965 *
966 * Copy---Returns a new <code>String</code> containing <i>integer</i> copies of
967 * the receiver.
968 *
969 * "Ho! " * 3 #=> "Ho! Ho! Ho! "
970 */
971static mrb_value
972mrb_str_times(mrb_state *mrb, mrb_value self)
973{
974 mrb_int n,len,times;
975 struct RString *str2;
976 char *p;
977
978 mrb_get_args(mrb, "i", &times);
979 if (times < 0) {
980 mrb_raise(mrb, E_ARGUMENT_ERROR, "negative argument");
981 }
982 if (times && MRB_SSIZE_MAX / times < RSTRING_LEN(self)) {
983 mrb_raise(mrb, E_ARGUMENT_ERROR, "argument too big");
984 }
985
986 len = RSTRING_LEN(self)*times;
987 str2 = str_new(mrb, 0, len);
988 str_with_class(str2, self);
989 p = RSTR_PTR(str2);
990 if (len > 0) {
991 n = RSTRING_LEN(self);
992 memcpy(p, RSTRING_PTR(self), n);
993 while (n <= len/2) {
994 memcpy(p + n, p, n);
995 n *= 2;
996 }
997 memcpy(p + n, p, len-n);
998 }
999 p[RSTR_LEN(str2)] = '\0';
1000 RSTR_COPY_ASCII_FLAG(str2, mrb_str_ptr(self));
1001
1002 return mrb_obj_value(str2);
1003}
1004/* -------------------------------------------------------------- */
1005
/* Smaller of two values; note that each argument may be evaluated twice. */
#define lesser(a, b)  (((a) > (b)) ? (b) : (a))
1007
1008/* ---------------------------*/
1009/*
1010 * call-seq:
1011 * mrb_value str1 <=> mrb_value str2 => int
1012 * > 1
1013 * = 0
1014 * < -1
1015 */
1016MRB_API int
1017mrb_str_cmp(mrb_state *mrb, mrb_value str1, mrb_value str2)
1018{
1019 mrb_int len;
1020 mrb_int retval;
1021 struct RString *s1 = mrb_str_ptr(str1);
1022 struct RString *s2 = mrb_str_ptr(str2);
1023
1024 len = lesser(RSTR_LEN(s1), RSTR_LEN(s2));
1025 retval = memcmp(RSTR_PTR(s1), RSTR_PTR(s2), len);
1026 if (retval == 0) {
1027 if (RSTR_LEN(s1) == RSTR_LEN(s2)) return 0;
1028 if (RSTR_LEN(s1) > RSTR_LEN(s2)) return 1;
1029 return -1;
1030 }
1031 if (retval > 0) return 1;
1032 return -1;
1033}
1034
1035/* 15.2.10.5.3 */
1036
1037/*
1038 * call-seq:
1039 * str <=> other_str => -1, 0, +1
1040 *
 * Comparison---Returns -1 if <i>str</i> is less than, 0 if
 * <i>str</i> is equal to, and +1 if <i>str</i> is greater than
 * <i>other_str</i>. If the strings are of different lengths, and the strings are
1044 * equal when compared up to the shortest length, then the longer string is
1045 * considered greater than the shorter one. If the variable <code>$=</code> is
1046 * <code>false</code>, the comparison is based on comparing the binary values
1047 * of each character in the string. In older versions of Ruby, setting
1048 * <code>$=</code> allowed case-insensitive comparisons; this is now deprecated
1049 * in favor of using <code>String#casecmp</code>.
1050 *
1051 * <code><=></code> is the basis for the methods <code><</code>,
1052 * <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>,
1053 * included from module <code>Comparable</code>. The method
1054 * <code>String#==</code> does not use <code>Comparable#==</code>.
1055 *
1056 * "abcdef" <=> "abcde" #=> 1
1057 * "abcdef" <=> "abcdef" #=> 0
1058 * "abcdef" <=> "abcdefg" #=> -1
1059 * "abcdef" <=> "ABCDEF" #=> 1
1060 */
1061static mrb_value
1062mrb_str_cmp_m(mrb_state *mrb, mrb_value str1)
1063{
1064 mrb_value str2;
1065 mrb_int result;
1066
1067 mrb_get_args(mrb, "o", &str2);
1068 if (!mrb_string_p(str2)) {
1069 return mrb_nil_value();
1070 }
1071 else {
1072 result = mrb_str_cmp(mrb, str1, str2);
1073 }
1074 return mrb_fixnum_value(result);
1075}
1076
1077static mrb_bool
1078str_eql(mrb_state *mrb, const mrb_value str1, const mrb_value str2)
1079{
1080 const mrb_int len = RSTRING_LEN(str1);
1081
1082 if (len != RSTRING_LEN(str2)) return FALSE;
1083 if (memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), (size_t)len) == 0)
1084 return TRUE;
1085 return FALSE;
1086}
1087
1088MRB_API mrb_bool
1089mrb_str_equal(mrb_state *mrb, mrb_value str1, mrb_value str2)
1090{
1091 if (!mrb_string_p(str2)) return FALSE;
1092 return str_eql(mrb, str1, str2);
1093}
1094
1095/* 15.2.10.5.4 */
1096/*
1097 * call-seq:
1098 * str == obj => true or false
1099 *
1100 * Equality---
1101 * If <i>obj</i> is not a <code>String</code>, returns <code>false</code>.
1102 * Otherwise, returns <code>false</code> or <code>true</code>
1103 *
1104 * caution:if <i>str</i> <code><=></code> <i>obj</i> returns zero.
1105 */
1106static mrb_value
1107mrb_str_equal_m(mrb_state *mrb, mrb_value str1)
1108{
1109 mrb_value str2;
1110
1111 mrb_get_args(mrb, "o", &str2);
1112
1113 return mrb_bool_value(mrb_str_equal(mrb, str1, str2));
1114}
1115/* ---------------------------------- */
1116
1117MRB_API mrb_value
1118mrb_str_to_str(mrb_state *mrb, mrb_value str)
1119{
1120 switch (mrb_type(str)) {
1121 case MRB_TT_STRING:
1122 return str;
1123 case MRB_TT_SYMBOL:
1124 return mrb_sym_str(mrb, mrb_symbol(str));
1125 case MRB_TT_FIXNUM:
1126 return mrb_fixnum_to_str(mrb, str, 10);
1127 case MRB_TT_CLASS:
1128 case MRB_TT_MODULE:
1129 return mrb_mod_to_s(mrb, str);
1130 default:
1131 return mrb_convert_type(mrb, str, MRB_TT_STRING, "String", "to_s");
1132 }
1133}
1134
1135/* obslete: use RSTRING_PTR() */
1136MRB_API const char*
1137mrb_string_value_ptr(mrb_state *mrb, mrb_value str)
1138{
1139 str = mrb_str_to_str(mrb, str);
1140 return RSTRING_PTR(str);
1141}
1142
1143/* obslete: use RSTRING_LEN() */
1144MRB_API mrb_int
1145mrb_string_value_len(mrb_state *mrb, mrb_value ptr)
1146{
1147 mrb_to_str(mrb, ptr);
1148 return RSTRING_LEN(ptr);
1149}
1150
1151MRB_API mrb_value
1152mrb_str_dup(mrb_state *mrb, mrb_value str)
1153{
1154 struct RString *s = mrb_str_ptr(str);
1155 struct RString *dup = str_new(mrb, 0, 0);
1156
1157 str_with_class(dup, str);
1158 return str_replace(mrb, dup, s);
1159}
1160
/* How str_convert_range() interpreted an index argument. */
enum str_convert_range {
  STR_BYTE_RANGE_CORRECTED = 1,  /* `beg` and `len` are byte unit in `0 ... str.bytesize` */
  STR_CHAR_RANGE = 2,            /* `beg` and `len` are char unit in any range */
  STR_CHAR_RANGE_CORRECTED = 3,  /* `beg` and `len` are char unit in `0 ... str.size` */
  STR_OUT_OF_RANGE = -1          /* `beg` is out of range */
};
1174
1175static enum str_convert_range
1176str_convert_range(mrb_state *mrb, mrb_value str, mrb_value indx, mrb_value alen, mrb_int *beg, mrb_int *len)
1177{
1178 if (!mrb_undef_p(alen)) {
1179 *beg = mrb_int(mrb, indx);
1180 *len = mrb_int(mrb, alen);
1181 return STR_CHAR_RANGE;
1182 }
1183 else {
1184 switch (mrb_type(indx)) {
1185 case MRB_TT_FIXNUM:
1186 *beg = mrb_fixnum(indx);
1187 *len = 1;
1188 return STR_CHAR_RANGE;
1189
1190 case MRB_TT_STRING:
1191 *beg = str_index_str(mrb, str, indx, 0);
1192 if (*beg < 0) { break; }
1193 *len = RSTRING_LEN(indx);
1194 return STR_BYTE_RANGE_CORRECTED;
1195
1196 case MRB_TT_RANGE:
1197 goto range_arg;
1198
1199 default:
1200 indx = mrb_to_int(mrb, indx);
1201 if (mrb_fixnum_p(indx)) {
1202 *beg = mrb_fixnum(indx);
1203 *len = 1;
1204 return STR_CHAR_RANGE;
1205 }
1206range_arg:
1207 *len = RSTRING_CHAR_LEN(str);
1208 switch (mrb_range_beg_len(mrb, indx, beg, len, *len, TRUE)) {
1209 case MRB_RANGE_OK:
1210 return STR_CHAR_RANGE_CORRECTED;
1211 case MRB_RANGE_OUT:
1212 return STR_OUT_OF_RANGE;
1213 default:
1214 break;
1215 }
1216
1217 mrb_raise(mrb, E_TYPE_ERROR, "can't convert to Fixnum");
1218 }
1219 }
1220 return STR_OUT_OF_RANGE;
1221}
1222
1223static mrb_value
1224mrb_str_aref(mrb_state *mrb, mrb_value str, mrb_value indx, mrb_value alen)
1225{
1226 mrb_int beg, len;
1227
1228 switch (str_convert_range(mrb, str, indx, alen, &beg, &len)) {
1229 case STR_CHAR_RANGE_CORRECTED:
1230 return str_subseq(mrb, str, beg, len);
1231 case STR_CHAR_RANGE:
1232 str = str_substr(mrb, str, beg, len);
1233 if (mrb_undef_p(alen) && !mrb_nil_p(str) && RSTRING_LEN(str) == 0) return mrb_nil_value();
1234 return str;
1235 case STR_BYTE_RANGE_CORRECTED:
1236 if (mrb_string_p(indx)) {
1237 return mrb_str_dup(mrb, indx);
1238 }
1239 else {
1240 return mrb_str_byte_subseq(mrb, str, beg, len);
1241 }
1242 case STR_OUT_OF_RANGE:
1243 default:
1244 return mrb_nil_value();
1245 }
1246}
1247
1248/* 15.2.10.5.6 */
1249/* 15.2.10.5.34 */
1250/*
1251 * call-seq:
1252 * str[fixnum] => fixnum or nil
1253 * str[fixnum, fixnum] => new_str or nil
1254 * str[range] => new_str or nil
1255 * str[other_str] => new_str or nil
1256 * str.slice(fixnum) => fixnum or nil
1257 * str.slice(fixnum, fixnum) => new_str or nil
1258 * str.slice(range) => new_str or nil
1259 * str.slice(other_str) => new_str or nil
1260 *
1261 * Element Reference---If passed a single <code>Fixnum</code>, returns the code
1262 * of the character at that position. If passed two <code>Fixnum</code>
1263 * objects, returns a substring starting at the offset given by the first, and
1264 * a length given by the second. If given a range, a substring containing
1265 * characters at offsets given by the range is returned. In all three cases, if
1266 * an offset is negative, it is counted from the end of <i>str</i>. Returns
1267 * <code>nil</code> if the initial offset falls outside the string, the length
1268 * is negative, or the beginning of the range is greater than the end.
1269 *
1270 * If a <code>String</code> is given, that string is returned if it occurs in
1271 * <i>str</i>. In both cases, <code>nil</code> is returned if there is no
1272 * match.
1273 *
1274 * a = "hello there"
1275 * a[1] #=> 101(1.8.7) "e"(1.9.2)
1276 * a[1.1] #=> "e"(1.9.2)
1277 * a[1,3] #=> "ell"
1278 * a[1..3] #=> "ell"
1279 * a[-3,2] #=> "er"
1280 * a[-4..-2] #=> "her"
1281 * a[12..-1] #=> nil
1282 * a[-2..-4] #=> ""
1283 * a["lo"] #=> "lo"
1284 * a["bye"] #=> nil
1285 */
1286static mrb_value
1287mrb_str_aref_m(mrb_state *mrb, mrb_value str)
1288{
1289 mrb_value a1, a2;
1290
1291 if (mrb_get_args(mrb, "o|o", &a1, &a2) == 1) {
1292 a2 = mrb_undef_value();
1293 }
1294
1295 return mrb_str_aref(mrb, str, a1, a2);
1296}
1297
/*
 * Raise IndexError reporting that `index` lies outside the string.
 * Declared mrb_noreturn: control does not come back to the caller.
 */
static mrb_noreturn void
str_out_of_index(mrb_state *mrb, mrb_value index)
{
  mrb_raisef(mrb, E_INDEX_ERROR, "index %v out of string", index);
}
1303
1304static mrb_value
1305str_replace_partial(mrb_state *mrb, mrb_value src, mrb_int pos, mrb_int end, mrb_value rep)
1306{
1307 const mrb_int shrink_threshold = 256;
1308 struct RString *str = mrb_str_ptr(src);
1309 mrb_int len = RSTR_LEN(str);
1310 mrb_int replen, newlen;
1311 char *strp;
1312
1313 if (end > len) { end = len; }
1314
1315 if (pos < 0 || pos > len) {
1316 str_out_of_index(mrb, mrb_fixnum_value(pos));
1317 }
1318
1319 replen = (mrb_nil_p(rep) ? 0 : RSTRING_LEN(rep));
1320 newlen = replen + len - (end - pos);
1321
1322 if (newlen >= MRB_SSIZE_MAX || newlen < replen /* overflowed */) {
1323 mrb_raise(mrb, E_RUNTIME_ERROR, "string size too big");
1324 }
1325
1326 mrb_str_modify(mrb, str);
1327
1328 if (len < newlen) {
1329 resize_capa(mrb, str, newlen);
1330 }
1331
1332 strp = RSTR_PTR(str);
1333
1334 memmove(strp + newlen - (len - end), strp + end, len - end);
1335 if (!mrb_nil_p(rep)) {
1336 memmove(strp + pos, RSTRING_PTR(rep), replen);
1337 }
1338 RSTR_SET_LEN(str, newlen);
1339 strp[newlen] = '\0';
1340
1341 if (len - newlen >= shrink_threshold) {
1342 resize_capa(mrb, str, newlen);
1343 }
1344
1345 return src;
1346}
1347
1348#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
1349
1350static mrb_value
1351str_escape(mrb_state *mrb, mrb_value str, mrb_bool inspect)
1352{
1353 const char *p, *pend;
1354 char buf[4]; /* `\x??` or UTF-8 character */
1355 mrb_value result = mrb_str_new_lit(mrb, "\"");
1356#ifdef MRB_UTF8_STRING
1357 uint32_t ascii_flag = MRB_STR_ASCII;
1358#endif
1359
1360 p = RSTRING_PTR(str); pend = RSTRING_END(str);
1361 for (;p < pend; p++) {
1362 unsigned char c, cc;
1363#ifdef MRB_UTF8_STRING
1364 if (inspect) {
1365 mrb_int clen = mrb_utf8len(p, pend);
1366 if (clen > 1) {
1367 mrb_int i;
1368
1369 for (i=0; i<clen; i++) {
1370 buf[i] = p[i];
1371 }
1372 mrb_str_cat(mrb, result, buf, clen);
1373 p += clen-1;
1374 ascii_flag = 0;
1375 continue;
1376 }
1377 }
1378#endif
1379 c = *p;
1380 if (c == '"'|| c == '\\' || (c == '#' && IS_EVSTR(p+1, pend))) {
1381 buf[0] = '\\'; buf[1] = c;
1382 mrb_str_cat(mrb, result, buf, 2);
1383 continue;
1384 }
1385 if (ISPRINT(c)) {
1386 buf[0] = c;
1387 mrb_str_cat(mrb, result, buf, 1);
1388 continue;
1389 }
1390 switch (c) {
1391 case '\n': cc = 'n'; break;
1392 case '\r': cc = 'r'; break;
1393 case '\t': cc = 't'; break;
1394 case '\f': cc = 'f'; break;
1395 case '\013': cc = 'v'; break;
1396 case '\010': cc = 'b'; break;
1397 case '\007': cc = 'a'; break;
1398 case 033: cc = 'e'; break;
1399 default: cc = 0; break;
1400 }
1401 if (cc) {
1402 buf[0] = '\\';
1403 buf[1] = (char)cc;
1404 mrb_str_cat(mrb, result, buf, 2);
1405 continue;
1406 }
1407 else {
1408 buf[0] = '\\';
1409 buf[1] = 'x';
1410 buf[3] = mrb_digitmap[c % 16]; c /= 16;
1411 buf[2] = mrb_digitmap[c % 16];
1412 mrb_str_cat(mrb, result, buf, 4);
1413 continue;
1414 }
1415 }
1416 mrb_str_cat_lit(mrb, result, "\"");
1417#ifdef MRB_UTF8_STRING
1418 if (inspect) {
1419 mrb_str_ptr(str)->flags |= ascii_flag;
1420 mrb_str_ptr(result)->flags |= ascii_flag;
1421 }
1422 else {
1423 RSTR_SET_ASCII_FLAG(mrb_str_ptr(result));
1424 }
1425#endif
1426
1427 return result;
1428}
1429
1430static void
1431mrb_str_aset(mrb_state *mrb, mrb_value str, mrb_value indx, mrb_value alen, mrb_value replace)
1432{
1433 mrb_int beg, len, charlen;
1434
1435 mrb_to_str(mrb, replace);
1436
1437 switch (str_convert_range(mrb, str, indx, alen, &beg, &len)) {
1438 case STR_OUT_OF_RANGE:
1439 default:
1440 mrb_raise(mrb, E_INDEX_ERROR, "string not matched");
1441 case STR_CHAR_RANGE:
1442 if (len < 0) {
1443 mrb_raisef(mrb, E_INDEX_ERROR, "negative length %v", alen);
1444 }
1445 charlen = RSTRING_CHAR_LEN(str);
1446 if (beg < 0) { beg += charlen; }
1447 if (beg < 0 || beg > charlen) { str_out_of_index(mrb, indx); }
1448 /* fall through */
1449 case STR_CHAR_RANGE_CORRECTED:
1450 str_range_to_bytes(str, &beg, &len);
1451 /* fall through */
1452 case STR_BYTE_RANGE_CORRECTED:
1453 str_replace_partial(mrb, str, beg, beg + len, replace);
1454 }
1455}
1456
1457/*
1458 * call-seq:
1459 * str[fixnum] = replace
1460 * str[fixnum, fixnum] = replace
1461 * str[range] = replace
1462 * str[other_str] = replace
1463 *
1464 * Modify +self+ by replacing the content of +self+.
1465 * The portion of the string affected is determined using the same criteria as +String#[]+.
1466 */
1467static mrb_value
1468mrb_str_aset_m(mrb_state *mrb, mrb_value str)
1469{
1470 mrb_value indx, alen, replace;
1471
1472 switch (mrb_get_args(mrb, "oo|S!", &indx, &alen, &replace)) {
1473 case 2:
1474 replace = alen;
1475 alen = mrb_undef_value();
1476 break;
1477 case 3:
1478 break;
1479 }
1480 mrb_str_aset(mrb, str, indx, alen, replace);
1481 return str;
1482}
1483
1484/* 15.2.10.5.8 */
1485/*
1486 * call-seq:
1487 * str.capitalize! => str or nil
1488 *
1489 * Modifies <i>str</i> by converting the first character to uppercase and the
1490 * remainder to lowercase. Returns <code>nil</code> if no changes are made.
1491 *
1492 * a = "hello"
1493 * a.capitalize! #=> "Hello"
1494 * a #=> "Hello"
1495 * a.capitalize! #=> nil
1496 */
1497static mrb_value
1498mrb_str_capitalize_bang(mrb_state *mrb, mrb_value str)
1499{
1500 char *p, *pend;
1501 mrb_bool modify = FALSE;
1502 struct RString *s = mrb_str_ptr(str);
1503
1504 mrb_str_modify_keep_ascii(mrb, s);
1505 if (RSTR_LEN(s) == 0 || !RSTR_PTR(s)) return mrb_nil_value();
1506 p = RSTR_PTR(s); pend = RSTR_PTR(s) + RSTR_LEN(s);
1507 if (ISLOWER(*p)) {
1508 *p = TOUPPER(*p);
1509 modify = TRUE;
1510 }
1511 while (++p < pend) {
1512 if (ISUPPER(*p)) {
1513 *p = TOLOWER(*p);
1514 modify = TRUE;
1515 }
1516 }
1517 if (modify) return str;
1518 return mrb_nil_value();
1519}
1520
1521/* 15.2.10.5.7 */
1522/*
1523 * call-seq:
1524 * str.capitalize => new_str
1525 *
1526 * Returns a copy of <i>str</i> with the first character converted to uppercase
1527 * and the remainder to lowercase.
1528 *
1529 * "hello".capitalize #=> "Hello"
1530 * "HELLO".capitalize #=> "Hello"
1531 * "123ABC".capitalize #=> "123abc"
1532 */
1533static mrb_value
1534mrb_str_capitalize(mrb_state *mrb, mrb_value self)
1535{
1536 mrb_value str;
1537
1538 str = mrb_str_dup(mrb, self);
1539 mrb_str_capitalize_bang(mrb, str);
1540 return str;
1541}
1542
1543/* 15.2.10.5.10 */
1544/*
1545 * call-seq:
1546 * str.chomp!(separator="\n") => str or nil
1547 *
1548 * Modifies <i>str</i> in place as described for <code>String#chomp</code>,
1549 * returning <i>str</i>, or <code>nil</code> if no modifications were made.
1550 */
1551static mrb_value
1552mrb_str_chomp_bang(mrb_state *mrb, mrb_value str)
1553{
1554 mrb_value rs;
1555 mrb_int newline;
1556 char *p, *pp;
1557 mrb_int rslen;
1558 mrb_int len;
1559 mrb_int argc;
1560 struct RString *s = mrb_str_ptr(str);
1561
1562 argc = mrb_get_args(mrb, "|S", &rs);
1563 mrb_str_modify_keep_ascii(mrb, s);
1564 len = RSTR_LEN(s);
1565 if (argc == 0) {
1566 if (len == 0) return mrb_nil_value();
1567 smart_chomp:
1568 if (RSTR_PTR(s)[len-1] == '\n') {
1569 RSTR_SET_LEN(s, RSTR_LEN(s) - 1);
1570 if (RSTR_LEN(s) > 0 &&
1571 RSTR_PTR(s)[RSTR_LEN(s)-1] == '\r') {
1572 RSTR_SET_LEN(s, RSTR_LEN(s) - 1);
1573 }
1574 }
1575 else if (RSTR_PTR(s)[len-1] == '\r') {
1576 RSTR_SET_LEN(s, RSTR_LEN(s) - 1);
1577 }
1578 else {
1579 return mrb_nil_value();
1580 }
1581 RSTR_PTR(s)[RSTR_LEN(s)] = '\0';
1582 return str;
1583 }
1584
1585 if (len == 0 || mrb_nil_p(rs)) return mrb_nil_value();
1586 p = RSTR_PTR(s);
1587 rslen = RSTRING_LEN(rs);
1588 if (rslen == 0) {
1589 while (len>0 && p[len-1] == '\n') {
1590 len--;
1591 if (len>0 && p[len-1] == '\r')
1592 len--;
1593 }
1594 if (len < RSTR_LEN(s)) {
1595 RSTR_SET_LEN(s, len);
1596 p[len] = '\0';
1597 return str;
1598 }
1599 return mrb_nil_value();
1600 }
1601 if (rslen > len) return mrb_nil_value();
1602 newline = RSTRING_PTR(rs)[rslen-1];
1603 if (rslen == 1 && newline == '\n')
1604 newline = RSTRING_PTR(rs)[rslen-1];
1605 if (rslen == 1 && newline == '\n')
1606 goto smart_chomp;
1607
1608 pp = p + len - rslen;
1609 if (p[len-1] == newline &&
1610 (rslen <= 1 ||
1611 memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
1612 RSTR_SET_LEN(s, len - rslen);
1613 p[RSTR_LEN(s)] = '\0';
1614 return str;
1615 }
1616 return mrb_nil_value();
1617}
1618
1619/* 15.2.10.5.9 */
1620/*
1621 * call-seq:
1622 * str.chomp(separator="\n") => new_str
1623 *
1624 * Returns a new <code>String</code> with the given record separator removed
1625 * from the end of <i>str</i> (if present). <code>chomp</code> also removes
1626 * carriage return characters (that is it will remove <code>\n</code>,
1627 * <code>\r</code>, and <code>\r\n</code>).
1628 *
1629 * "hello".chomp #=> "hello"
1630 * "hello\n".chomp #=> "hello"
1631 * "hello\r\n".chomp #=> "hello"
1632 * "hello\n\r".chomp #=> "hello\n"
1633 * "hello\r".chomp #=> "hello"
1634 * "hello \n there".chomp #=> "hello \n there"
1635 * "hello".chomp("llo") #=> "he"
1636 */
1637static mrb_value
1638mrb_str_chomp(mrb_state *mrb, mrb_value self)
1639{
1640 mrb_value str;
1641
1642 str = mrb_str_dup(mrb, self);
1643 mrb_str_chomp_bang(mrb, str);
1644 return str;
1645}
1646
1647/* 15.2.10.5.12 */
1648/*
1649 * call-seq:
1650 * str.chop! => str or nil
1651 *
1652 * Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
1653 * or <code>nil</code> if <i>str</i> is the empty string. See also
1654 * <code>String#chomp!</code>.
1655 */
1656static mrb_value
1657mrb_str_chop_bang(mrb_state *mrb, mrb_value str)
1658{
1659 struct RString *s = mrb_str_ptr(str);
1660
1661 mrb_str_modify_keep_ascii(mrb, s);
1662 if (RSTR_LEN(s) > 0) {
1663 mrb_int len;
1664#ifdef MRB_UTF8_STRING
1665 const char* t = RSTR_PTR(s), *p = t;
1666 const char* e = p + RSTR_LEN(s);
1667 while (p<e) {
1668 mrb_int clen = mrb_utf8len(p, e);
1669 if (p + clen>=e) break;
1670 p += clen;
1671 }
1672 len = p - t;
1673#else
1674 len = RSTR_LEN(s) - 1;
1675#endif
1676 if (RSTR_PTR(s)[len] == '\n') {
1677 if (len > 0 &&
1678 RSTR_PTR(s)[len-1] == '\r') {
1679 len--;
1680 }
1681 }
1682 RSTR_SET_LEN(s, len);
1683 RSTR_PTR(s)[len] = '\0';
1684 return str;
1685 }
1686 return mrb_nil_value();
1687}
1688
1689/* 15.2.10.5.11 */
1690/*
1691 * call-seq:
1692 * str.chop => new_str
1693 *
1694 * Returns a new <code>String</code> with the last character removed. If the
1695 * string ends with <code>\r\n</code>, both characters are removed. Applying
1696 * <code>chop</code> to an empty string returns an empty
1697 * string. <code>String#chomp</code> is often a safer alternative, as it leaves
1698 * the string unchanged if it doesn't end in a record separator.
1699 *
1700 * "string\r\n".chop #=> "string"
1701 * "string\n\r".chop #=> "string\n"
1702 * "string\n".chop #=> "string"
1703 * "string".chop #=> "strin"
1704 * "x".chop #=> ""
1705 */
1706static mrb_value
1707mrb_str_chop(mrb_state *mrb, mrb_value self)
1708{
1709 mrb_value str;
1710 str = mrb_str_dup(mrb, self);
1711 mrb_str_chop_bang(mrb, str);
1712 return str;
1713}
1714
1715/* 15.2.10.5.14 */
1716/*
1717 * call-seq:
1718 * str.downcase! => str or nil
1719 *
1720 * Downcases the contents of <i>str</i>, returning <code>nil</code> if no
1721 * changes were made.
1722 */
1723static mrb_value
1724mrb_str_downcase_bang(mrb_state *mrb, mrb_value str)
1725{
1726 char *p, *pend;
1727 mrb_bool modify = FALSE;
1728 struct RString *s = mrb_str_ptr(str);
1729
1730 mrb_str_modify_keep_ascii(mrb, s);
1731 p = RSTR_PTR(s);
1732 pend = RSTR_PTR(s) + RSTR_LEN(s);
1733 while (p < pend) {
1734 if (ISUPPER(*p)) {
1735 *p = TOLOWER(*p);
1736 modify = TRUE;
1737 }
1738 p++;
1739 }
1740
1741 if (modify) return str;
1742 return mrb_nil_value();
1743}
1744
1745/* 15.2.10.5.13 */
1746/*
1747 * call-seq:
1748 * str.downcase => new_str
1749 *
1750 * Returns a copy of <i>str</i> with all uppercase letters replaced with their
1751 * lowercase counterparts. The operation is locale insensitive---only
1752 * characters 'A' to 'Z' are affected.
1753 *
1754 * "hEllO".downcase #=> "hello"
1755 */
1756static mrb_value
1757mrb_str_downcase(mrb_state *mrb, mrb_value self)
1758{
1759 mrb_value str;
1760
1761 str = mrb_str_dup(mrb, self);
1762 mrb_str_downcase_bang(mrb, str);
1763 return str;
1764}
1765
1766/* 15.2.10.5.16 */
1767/*
1768 * call-seq:
1769 * str.empty? => true or false
1770 *
1771 * Returns <code>true</code> if <i>str</i> has a length of zero.
1772 *
1773 * "hello".empty? #=> false
1774 * "".empty? #=> true
1775 */
1776static mrb_value
1777mrb_str_empty_p(mrb_state *mrb, mrb_value self)
1778{
1779 struct RString *s = mrb_str_ptr(self);
1780
1781 return mrb_bool_value(RSTR_LEN(s) == 0);
1782}
1783
1784/* 15.2.10.5.17 */
1785/*
1786 * call-seq:
1787 * str.eql?(other) => true or false
1788 *
1789 * Two strings are equal if the have the same length and content.
1790 */
1791static mrb_value
1792mrb_str_eql(mrb_state *mrb, mrb_value self)
1793{
1794 mrb_value str2;
1795 mrb_bool eql_p;
1796
1797 mrb_get_args(mrb, "o", &str2);
1798 eql_p = (mrb_string_p(str2)) && str_eql(mrb, self, str2);
1799
1800 return mrb_bool_value(eql_p);
1801}
1802
/*
 * Public entry point for the file-local str_substr(): forwards `str`, `beg`
 * and `len` unchanged and returns whatever str_substr() returns.
 */
MRB_API mrb_value
mrb_str_substr(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len)
{
  return str_substr(mrb, str, beg, len);
}
1808
1809uint32_t
1810mrb_str_hash(mrb_state *mrb, mrb_value str)
1811{
1812 /* 1-8-7 */
1813 struct RString *s = mrb_str_ptr(str);
1814 mrb_int len = RSTR_LEN(s);
1815 char *p = RSTR_PTR(s);
1816 uint64_t key = 0;
1817
1818 while (len--) {
1819 key = key*65599 + *p;
1820 p++;
1821 }
1822 return (uint32_t)(key + (key>>5));
1823}
1824
/* 15.2.10.5.20 */
/*
 * call-seq:
 *    str.hash   => fixnum
 *
 * Return a hash based on the string's content, as computed by
 * mrb_str_hash().
 */
static mrb_value
mrb_str_hash_m(mrb_state *mrb, mrb_value self)
{
  /* the unsigned 32-bit hash is converted to mrb_int before boxing */
  mrb_int key = mrb_str_hash(mrb, self);
  return mrb_fixnum_value(key);
}
1838
1839/* 15.2.10.5.21 */
1840/*
1841 * call-seq:
1842 * str.include? other_str => true or false
1843 * str.include? fixnum => true or false
1844 *
1845 * Returns <code>true</code> if <i>str</i> contains the given string or
1846 * character.
1847 *
1848 * "hello".include? "lo" #=> true
1849 * "hello".include? "ol" #=> false
1850 * "hello".include? ?h #=> true
1851 */
1852static mrb_value
1853mrb_str_include(mrb_state *mrb, mrb_value self)
1854{
1855 mrb_value str2;
1856
1857 mrb_get_args(mrb, "S", &str2);
1858 if (str_index_str(mrb, self, str2, 0) < 0)
1859 return mrb_bool_value(FALSE);
1860 return mrb_bool_value(TRUE);
1861}
1862
1863/* 15.2.10.5.22 */
1864/*
1865 * call-seq:
1866 * str.index(substring [, offset]) => fixnum or nil
1867 *
1868 * Returns the index of the first occurrence of the given
1869 * <i>substring</i>. Returns <code>nil</code> if not found.
1870 * If the second parameter is present, it
1871 * specifies the position in the string to begin the search.
1872 *
1873 * "hello".index('l') #=> 2
1874 * "hello".index('lo') #=> 3
1875 * "hello".index('a') #=> nil
1876 * "hello".index('l', -2) #=> 3
1877 */
1878static mrb_value
1879mrb_str_index_m(mrb_state *mrb, mrb_value str)
1880{
1881 mrb_value sub;
1882 mrb_int pos;
1883
1884 if (mrb_get_args(mrb, "S|i", &sub, &pos) == 1) {
1885 pos = 0;
1886 }
1887 else if (pos < 0) {
1888 mrb_int clen = RSTRING_CHAR_LEN(str);
1889 pos += clen;
1890 if (pos < 0) {
1891 return mrb_nil_value();
1892 }
1893 }
1894 pos = str_index_str_by_char(mrb, str, sub, pos);
1895
1896 if (pos == -1) return mrb_nil_value();
1897 BYTES_ALIGN_CHECK(pos);
1898 return mrb_fixnum_value(pos);
1899}
1900
/* 15.2.10.5.24 */
/* 15.2.10.5.28 */
/*
 *  call-seq:
 *     str.replace(other_str)   => str
 *
 *  Replaces the contents of <i>str</i> with those of <i>other_str</i>
 *  (which must be a String) by delegating to str_replace().
 *
 *     s = "hello"         #=> "hello"
 *     s.replace "world"   #=> "world"
 */
static mrb_value
mrb_str_replace(mrb_state *mrb, mrb_value str)
{
  mrb_value str2;

  mrb_get_args(mrb, "S", &str2);
  return str_replace(mrb, mrb_str_ptr(str), mrb_str_ptr(str2));
}
1918
1919/* 15.2.10.5.23 */
1920/*
1921 * call-seq:
1922 * String.new(str="") => new_str
1923 *
1924 * Returns a new string object containing a copy of <i>str</i>.
1925 */
1926static mrb_value
1927mrb_str_init(mrb_state *mrb, mrb_value self)
1928{
1929 mrb_value str2;
1930
1931 if (mrb_get_args(mrb, "|S", &str2) == 0) {
1932 struct RString *s = str_new(mrb, 0, 0);
1933 str2 = mrb_obj_value(s);
1934 }
1935 str_replace(mrb, mrb_str_ptr(self), mrb_str_ptr(str2));
1936 return self;
1937}
1938
/* 15.2.10.5.25 */
/* 15.2.10.5.41 */
/*
 *  call-seq:
 *     str.intern   => symbol
 *     str.to_sym   => symbol
 *
 *  Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the
 *  symbol if it did not previously exist. See <code>Symbol#id2name</code>.
 *
 *     "Koala".intern         #=> :Koala
 *     s = 'cat'.to_sym       #=> :cat
 *     s == :cat              #=> true
 *     s = '@cat'.to_sym      #=> :@cat
 *     s == :@cat             #=> true
 *
 *  This can also be used to create symbols that cannot be represented using the
 *  <code>:xxx</code> notation.
 *
 *     'cat and dog'.to_sym   #=> :"cat and dog"
 *
 *  Implemented as mrb_intern_str() wrapped with mrb_symbol_value().
 */
MRB_API mrb_value
mrb_str_intern(mrb_state *mrb, mrb_value self)
{
  return mrb_symbol_value(mrb_intern_str(mrb, self));
}
1965/* ---------------------------------- */
1966MRB_API mrb_value
1967mrb_obj_as_string(mrb_state *mrb, mrb_value obj)
1968{
1969 if (mrb_string_p(obj)) {
1970 return obj;
1971 }
1972 return mrb_str_to_str(mrb, obj);
1973}
1974
1975MRB_API mrb_value
1976mrb_ptr_to_str(mrb_state *mrb, void *p)
1977{
1978 struct RString *p_str;
1979 char *p1;
1980 char *p2;
1981 uintptr_t n = (uintptr_t)p;
1982
1983 p_str = str_new(mrb, NULL, 2 + sizeof(uintptr_t) * CHAR_BIT / 4);
1984 p1 = RSTR_PTR(p_str);
1985 *p1++ = '0';
1986 *p1++ = 'x';
1987 p2 = p1;
1988
1989 do {
1990 *p2++ = mrb_digitmap[n % 16];
1991 n /= 16;
1992 } while (n > 0);
1993 *p2 = '\0';
1994 RSTR_SET_LEN(p_str, (mrb_int)(p2 - RSTR_PTR(p_str)));
1995
1996 while (p1 < p2) {
1997 const char c = *p1;
1998 *p1++ = *--p2;
1999 *p2 = c;
2000 }
2001
2002 return mrb_obj_value(p_str);
2003}
2004
/*
 * Reverse the bytes in the inclusive range [p, e] in place.
 * Nothing happens when the range holds fewer than two bytes (p >= e).
 */
static inline void
str_reverse(char *p, char *e)
{
  for (; p < e; p++, e--) {
    char tmp = *e;

    *e = *p;
    *p = tmp;
  }
}
2016
2017/* 15.2.10.5.30 */
2018/*
2019 * call-seq:
2020 * str.reverse! => str
2021 *
2022 * Reverses <i>str</i> in place.
2023 */
2024static mrb_value
2025mrb_str_reverse_bang(mrb_state *mrb, mrb_value str)
2026{
2027 struct RString *s = mrb_str_ptr(str);
2028 char *p, *e;
2029
2030#ifdef MRB_UTF8_STRING
2031 mrb_int utf8_len = RSTRING_CHAR_LEN(str);
2032 mrb_int len = RSTR_LEN(s);
2033
2034 if (utf8_len < 2) return str;
2035 if (utf8_len < len) {
2036 mrb_str_modify(mrb, s);
2037 p = RSTR_PTR(s);
2038 e = p + RSTR_LEN(s);
2039 while (p<e) {
2040 mrb_int clen = mrb_utf8len(p, e);
2041 str_reverse(p, p + clen - 1);
2042 p += clen;
2043 }
2044 goto bytes;
2045 }
2046#endif
2047
2048 if (RSTR_LEN(s) > 1) {
2049 mrb_str_modify(mrb, s);
2050 goto bytes;
2051 }
2052 return str;
2053
2054 bytes:
2055 p = RSTR_PTR(s);
2056 e = p + RSTR_LEN(s) - 1;
2057 str_reverse(p, e);
2058 return str;
2059}
2060
2061/* ---------------------------------- */
2062/* 15.2.10.5.29 */
2063/*
2064 * call-seq:
2065 * str.reverse => new_str
2066 *
2067 * Returns a new string with the characters from <i>str</i> in reverse order.
2068 *
2069 * "stressed".reverse #=> "desserts"
2070 */
2071static mrb_value
2072mrb_str_reverse(mrb_state *mrb, mrb_value str)
2073{
2074 mrb_value str2 = mrb_str_dup(mrb, str);
2075 mrb_str_reverse_bang(mrb, str2);
2076 return str2;
2077}
2078
2079/* 15.2.10.5.31 */
2080/*
2081 * call-seq:
2082 * str.rindex(substring [, offset]) => fixnum or nil
2083 *
2084 * Returns the index of the last occurrence of the given <i>substring</i>.
2085 * Returns <code>nil</code> if not found. If the second parameter is
2086 * present, it specifies the position in the string to end the
2087 * search---characters beyond this point will not be considered.
2088 *
2089 * "hello".rindex('e') #=> 1
2090 * "hello".rindex('l') #=> 3
2091 * "hello".rindex('a') #=> nil
2092 * "hello".rindex('l', 2) #=> 2
2093 */
2094static mrb_value
2095mrb_str_rindex(mrb_state *mrb, mrb_value str)
2096{
2097 mrb_value sub;
2098 mrb_int pos, len = RSTRING_CHAR_LEN(str);
2099
2100 if (mrb_get_args(mrb, "S|i", &sub, &pos) == 1) {
2101 pos = len;
2102 }
2103 else {
2104 if (pos < 0) {
2105 pos += len;
2106 if (pos < 0) {
2107 return mrb_nil_value();
2108 }
2109 }
2110 if (pos > len) pos = len;
2111 }
2112 pos = chars2bytes(str, 0, pos);
2113 pos = str_rindex(mrb, str, sub, pos);
2114 if (pos >= 0) {
2115 pos = bytes2chars(RSTRING_PTR(str), RSTRING_LEN(str), pos);
2116 BYTES_ALIGN_CHECK(pos);
2117 return mrb_fixnum_value(pos);
2118 }
2119 return mrb_nil_value();
2120}
2121
/* 15.2.10.5.35 */

/*
 *  call-seq:
 *     str.split(separator=nil, [limit])   => anArray
 *
 *  Divides <i>str</i> into substrings based on a delimiter, returning an array
 *  of these substrings.
 *
 *  If <i>separator</i> is a <code>String</code>, then its contents are used as
 *  the delimiter when splitting <i>str</i>. If <i>separator</i> is a single
 *  space, <i>str</i> is split on whitespace, with leading whitespace and runs
 *  of contiguous whitespace characters ignored.
 *
 *  If <i>separator</i> is omitted or <code>nil</code> (which is the default),
 *  <i>str</i> is split on whitespace as if ' ' were specified.
 *
 *  If the <i>limit</i> parameter is omitted, trailing null fields are
 *  suppressed. If <i>limit</i> is a positive number, at most that number of
 *  fields will be returned (if <i>limit</i> is <code>1</code>, the entire
 *  string is returned as the only entry in an array). If negative, there is no
 *  limit to the number of fields returned, and trailing null fields are not
 *  suppressed.
 *
 *  A separator that is neither nil nor a String raises TypeError.
 *
 *     " now's  the time".split        #=> ["now's", "the", "time"]
 *     " now's  the time".split(' ')   #=> ["now's", "the", "time"]
 *
 *     "mellow yellow".split("ello")   #=> ["m", "w y", "w"]
 *     "1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
 *     "1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
 *     "1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]
 */

static mrb_value
mrb_str_split_m(mrb_state *mrb, mrb_value str)
{
  mrb_int argc;
  mrb_value spat = mrb_nil_value();
  enum {awk, string} split_type = string;
  mrb_int i = 0;                /* number of fields counted against `lim` */
  mrb_int beg;                  /* byte offset where the pending field starts */
  mrb_int end;
  mrb_int lim = 0;
  mrb_bool lim_p;               /* TRUE only for an explicit positive limit */
  mrb_value result, tmp;

  argc = mrb_get_args(mrb, "|oi", &spat, &lim);
  lim_p = (lim > 0 && argc == 2);
  if (argc == 2) {
    if (lim == 1) {
      /* limit 1: the whole string is the only field (none for "") */
      if (RSTRING_LEN(str) == 0)
        return mrb_ary_new_capa(mrb, 0);
      return mrb_ary_new_from_values(mrb, 1, &str);
    }
    i = 1;
  }

  /* choose the mode: nil / omitted / " " => whitespace ("awk") splitting */
  if (argc == 0 || mrb_nil_p(spat)) {
    split_type = awk;
  }
  else if (!mrb_string_p(spat)) {
    mrb_raise(mrb, E_TYPE_ERROR, "expected String");
  }
  else if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' ') {
    split_type = awk;
  }

  result = mrb_ary_new(mrb);
  beg = 0;
  if (split_type == awk) {
    mrb_bool skip = TRUE;       /* TRUE while consuming whitespace between fields */
    mrb_int idx = 0;
    mrb_int str_len = RSTRING_LEN(str);
    unsigned int c;
    int ai = mrb_gc_arena_save(mrb);

    idx = end = beg;
    while (idx < str_len) {
      c = (unsigned char)RSTRING_PTR(str)[idx++];
      if (skip) {
        if (ISSPACE(c)) {
          beg = idx;            /* still in the gap: the field starts later */
        }
        else {
          end = idx;
          skip = FALSE;
          /* limit reached: the rest of the string becomes the last field */
          if (lim_p && lim <= i) break;
        }
      }
      else if (ISSPACE(c)) {
        /* whitespace ends the current field [beg, end) */
        mrb_ary_push(mrb, result, mrb_str_byte_subseq(mrb, str, beg, end-beg));
        mrb_gc_arena_restore(mrb, ai);
        skip = TRUE;
        beg = idx;
        if (lim_p) ++i;
      }
      else {
        end = idx;
      }
    }
  }
  else {                        /* split_type == string */
    mrb_int str_len = RSTRING_LEN(str);
    mrb_int pat_len = RSTRING_LEN(spat);
    mrb_int idx = 0;
    int ai = mrb_gc_arena_save(mrb);

    while (idx < str_len) {
      if (pat_len > 0) {
        /* the search starts at str+idx, so `end` is used as a length relative to idx */
        end = mrb_memsearch(RSTRING_PTR(spat), pat_len, RSTRING_PTR(str)+idx, str_len - idx);
        if (end < 0) break;
      }
      else {
        /* empty separator: split per character
           (presumably chars2bytes() yields the byte length of one char at idx -- confirm) */
        end = chars2bytes(str, idx, 1);
      }
      mrb_ary_push(mrb, result, mrb_str_byte_subseq(mrb, str, idx, end));
      mrb_gc_arena_restore(mrb, ai);
      idx += end + pat_len;
      if (lim_p && lim <= ++i) break;
    }
    beg = idx;
  }
  /* append whatever is left after the last separator as the final field */
  if (RSTRING_LEN(str) > 0 && (lim_p || RSTRING_LEN(str) > beg || lim < 0)) {
    if (RSTRING_LEN(str) == beg) {
      tmp = mrb_str_new_empty(mrb, str);
    }
    else {
      tmp = mrb_str_byte_subseq(mrb, str, beg, RSTRING_LEN(str)-beg);
    }
    mrb_ary_push(mrb, result, tmp);
  }
  /* limit omitted or zero: drop trailing empty fields */
  if (!lim_p && lim == 0) {
    mrb_int len;
    while ((len = RARRAY_LEN(result)) > 0 &&
           (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
      mrb_ary_pop(mrb, result);
  }

  return result;
}
2262
/*
 * Parse the `len` bytes at `str` as an integer in the given `base`.
 *
 *  - Leading whitespace and one optional '+' / '-' sign are accepted.
 *  - base <= 0 requests prefix detection: a leading "0" followed by
 *    x/b/o/d selects 16/2/8/10, any other leading "0" selects 8; otherwise
 *    a base below -1 is negated and everything else means 10.
 *  - For bases 2, 8, 10 and 16 a matching "0b"/"0o"/"0d"/"0x" prefix is
 *    skipped. Bases outside 2..36 raise ArgumentError.
 *  - Single underscores between digits are skipped.
 *
 * With `badcheck` set, malformed input (no digits, doubled or trailing
 * underscore, trailing garbage, embedded NUL) raises ArgumentError; without
 * it parsing stops at the first unusable character and the value collected
 * so far (possibly 0) is returned.
 *
 * When the value does not fit in mrb_int, base 10 input is returned as a
 * Float (unless MRB_WITHOUT_FLOAT is defined); any other base raises
 * RangeError.
 *
 * NOTE(review): several reads below (`p[0]`, `p[1]`) are not checked against
 * `pend`; this looks like it relies on the buffer being NUL-terminated at
 * `str[len]` -- confirm for all callers.
 */
mrb_value
mrb_str_len_to_inum(mrb_state *mrb, const char *str, mrb_int len, mrb_int base, int badcheck)
{
  const char *p = str;
  const char *pend = str + len;
  char sign = 1;                /* 1: non-negative, 0: negative */
  int c;
  uint64_t n = 0;               /* magnitude accumulated so far */
  mrb_int val;

/* digit value of character c in bases up to 36, or -1 if it is not a digit */
#define conv_digit(c) \
    (ISDIGIT(c) ? ((c) - '0') : \
     ISLOWER(c) ? ((c) - 'a' + 10) : \
     ISUPPER(c) ? ((c) - 'A' + 10) : \
     -1)

  if (!p) {
    if (badcheck) goto bad;
    return mrb_fixnum_value(0);
  }
  while (p<pend && ISSPACE(*p))
    p++;

  if (p[0] == '+') {
    p++;
  }
  else if (p[0] == '-') {
    p++;
    sign = 0;
  }
  if (base <= 0) {
    /* automatic base selection from the literal prefix */
    if (p[0] == '0') {
      switch (p[1]) {
        case 'x': case 'X':
          base = 16;
          break;
        case 'b': case 'B':
          base = 2;
          break;
        case 'o': case 'O':
          base = 8;
          break;
        case 'd': case 'D':
          base = 10;
          break;
        default:
          base = 8;
          break;
      }
    }
    else if (base < -1) {
      base = -base;
    }
    else {
      base = 10;
    }
  }
  /* skip the optional prefix that belongs to the chosen base; reject bad radixes */
  switch (base) {
    case 2:
      if (p[0] == '0' && (p[1] == 'b'||p[1] == 'B')) {
        p += 2;
      }
      break;
    case 3:
      break;
    case 8:
      if (p[0] == '0' && (p[1] == 'o'||p[1] == 'O')) {
        p += 2;
      }
      /* fall through */
    case 4: case 5: case 6: case 7:
      break;
    case 10:
      if (p[0] == '0' && (p[1] == 'd'||p[1] == 'D')) {
        p += 2;
      }
      /* fall through */
    case 9: case 11: case 12: case 13: case 14: case 15:
      break;
    case 16:
      if (p[0] == '0' && (p[1] == 'x'||p[1] == 'X')) {
        p += 2;
      }
      break;
    default:
      if (base < 2 || 36 < base) {
        mrb_raisef(mrb, E_ARGUMENT_ERROR, "illegal radix %i", base);
      }
      break;
  } /* end of switch (base) { */
  if (p>=pend) {
    /* nothing left after sign/prefix */
    if (badcheck) goto bad;
    return mrb_fixnum_value(0);
  }
  if (*p == '0') {    /* squeeze preceding 0s */
    p++;
    while (p<pend) {
      c = *p++;
      if (c == '_') {
        if (p<pend && *p == '_') {
          /* two underscores in a row */
          if (badcheck) goto bad;
          break;
        }
        continue;
      }
      if (c != '0') {
        p--;
        break;
      }
    }
    /* step back onto the last '0' so that it is parsed as a digit below */
    if (*(p - 1) == '0')
      p--;
  }
  if (p == pend || *p == '_') {
    if (badcheck) goto bad;
    return mrb_fixnum_value(0);
  }
  /* main digit loop */
  for ( ;p<pend;p++) {
    if (*p == '_') {
      p++;
      if (p==pend) {
        if (badcheck) goto bad;
        continue;
      }
      if (*p == '_') {
        if (badcheck) goto bad;
        break;
      }
    }
    if (badcheck && *p == '\0') {
      goto nullbyte;
    }
    c = conv_digit(*p);
    if (c < 0 || c >= base) {
      /* first character that is not a digit of this base ends the number */
      break;
    }
    n *= base;
    n += c;
    /* magnitude limit: MRB_INT_MAX, or one more when the number is negative */
    if (n > (uint64_t)MRB_INT_MAX + (sign ? 0 : 1)) {
#ifndef MRB_WITHOUT_FLOAT
      if (base == 10) {
        /* too large for an integer: re-parse the whole input as a float */
        return mrb_float_value(mrb, mrb_str_to_dbl(mrb, mrb_str_new(mrb, str, len), badcheck));
      }
      else
#endif
      {
        mrb_raisef(mrb, E_RANGE_ERROR, "string (%l) too big for integer", str, pend-str);
      }
    }
  }
  val = (mrb_int)n;
  if (badcheck) {
    if (p == str) goto bad;             /* no number */
    if (*(p - 1) == '_') goto bad;      /* trailing '_' */
    while (p<pend && ISSPACE(*p)) p++;
    if (p<pend) goto bad;               /* trailing garbage */
  }

  return mrb_fixnum_value(sign ? val : -val);
 nullbyte:
  mrb_raise(mrb, E_ARGUMENT_ERROR, "string contains null byte");
  /* not reached */
 bad:
  mrb_raisef(mrb, E_ARGUMENT_ERROR, "invalid string for number(%!l)", str, pend-str);
  /* not reached */
  return mrb_fixnum_value(0);
}
2428
/*
 * Convenience wrapper around mrb_str_len_to_inum() for NUL-terminated
 * input: the length is taken with strlen(), so `str` must not be NULL.
 */
MRB_API mrb_value
mrb_cstr_to_inum(mrb_state *mrb, const char *str, mrb_int base, mrb_bool badcheck)
{
  return mrb_str_len_to_inum(mrb, str, strlen(str), base, badcheck);
}
2434
2435/* obslete: use RSTRING_CSTR() or mrb_string_cstr() */
2436MRB_API const char*
2437mrb_string_value_cstr(mrb_state *mrb, mrb_value *ptr)
2438{
2439 struct RString *ps;
2440 const char *p;
2441 mrb_int len;
2442
2443 check_null_byte(mrb, *ptr);
2444 ps = mrb_str_ptr(*ptr);
2445 p = RSTR_PTR(ps);
2446 len = RSTR_LEN(ps);
2447 if (p[len] == '\0') {
2448 return p;
2449 }
2450
2451 /*
2452 * Even after str_modify_keep_ascii(), NULL termination is not ensured if
2453 * RSTR_SET_LEN() is used explicitly (e.g. String#delete_suffix!).
2454 */
2455 str_modify_keep_ascii(mrb, ps);
2456 RSTR_PTR(ps)[len] = '\0';
2457 return RSTR_PTR(ps);
2458}
2459
/*
 * By-value variant of mrb_string_value_cstr(): returns the contents of
 * `str` as a NUL-terminated C string.
 */
MRB_API const char*
mrb_string_cstr(mrb_state *mrb, mrb_value str)
{
  return mrb_string_value_cstr(mrb, &str);
}
2465
2466MRB_API mrb_value
2467mrb_str_to_inum(mrb_state *mrb, mrb_value str, mrb_int base, mrb_bool badcheck)
2468{
2469 const char *s;
2470 mrb_int len;
2471
2472 mrb_to_str(mrb, str);
2473 s = RSTRING_PTR(str);
2474 len = RSTRING_LEN(str);
2475 return mrb_str_len_to_inum(mrb, s, len, base, badcheck);
2476}
2477
2478/* 15.2.10.5.38 */
2479/*
2480 * call-seq:
2481 * str.to_i(base=10) => integer
2482 *
2483 * Returns the result of interpreting leading characters in <i>str</i> as an
2484 * integer base <i>base</i> (between 2 and 36). Extraneous characters past the
2485 * end of a valid number are ignored. If there is not a valid number at the
2486 * start of <i>str</i>, <code>0</code> is returned. This method never raises an
2487 * exception.
2488 *
2489 * "12345".to_i #=> 12345
2490 * "99 red balloons".to_i #=> 99
2491 * "0a".to_i #=> 0
2492 * "0a".to_i(16) #=> 10
2493 * "hello".to_i #=> 0
2494 * "1100101".to_i(2) #=> 101
2495 * "1100101".to_i(8) #=> 294977
2496 * "1100101".to_i(10) #=> 1100101
2497 * "1100101".to_i(16) #=> 17826049
2498 */
2499static mrb_value
2500mrb_str_to_i(mrb_state *mrb, mrb_value self)
2501{
2502 mrb_int base = 10;
2503
2504 mrb_get_args(mrb, "|i", &base);
2505 if (base < 0) {
2506 mrb_raisef(mrb, E_ARGUMENT_ERROR, "illegal radix %i", base);
2507 }
2508 return mrb_str_to_inum(mrb, self, base, FALSE);
2509}
2510
2511#ifndef MRB_WITHOUT_FLOAT
2512double
2513mrb_str_len_to_dbl(mrb_state *mrb, const char *s, size_t len, mrb_bool badcheck)
2514{
2515 char buf[DBL_DIG * 4 + 20];
2516 const char *p = s, *p2;
2517 const char *pend = p + len;
2518 char *end;
2519 char *n;
2520 char prev = 0;
2521 double d;
2522 mrb_bool dot = FALSE;
2523
2524 if (!p) return 0.0;
2525 while (p<pend && ISSPACE(*p)) p++;
2526 p2 = p;
2527
2528 if (pend - p > 2 && p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
2529 mrb_value x;
2530
2531 if (!badcheck) return 0.0;
2532 x = mrb_str_len_to_inum(mrb, p, pend-p, 0, badcheck);
2533 if (mrb_fixnum_p(x))
2534 d = (double)mrb_fixnum(x);
2535 else /* if (mrb_float_p(x)) */
2536 d = mrb_float(x);
2537 return d;
2538 }
2539 while (p < pend) {
2540 if (!*p) {
2541 if (badcheck) {
2542 mrb_raise(mrb, E_ARGUMENT_ERROR, "string for Float contains null byte");
2543 /* not reached */
2544 }
2545 pend = p;
2546 p = p2;
2547 goto nocopy;
2548 }
2549 if (!badcheck && *p == ' ') {
2550 pend = p;
2551 p = p2;
2552 goto nocopy;
2553 }
2554 if (*p == '_') break;
2555 p++;
2556 }
2557 p = p2;
2558 n = buf;
2559 while (p < pend) {
2560 char c = *p++;
2561 if (c == '.') dot = TRUE;
2562 if (c == '_') {
2563 /* remove an underscore between digits */
2564 if (n == buf || !ISDIGIT(prev) || p == pend) {
2565 if (badcheck) goto bad;
2566 break;
2567 }
2568 }
2569 else if (badcheck && prev == '_' && !ISDIGIT(c)) goto bad;
2570 else {
2571 const char *bend = buf+sizeof(buf)-1;
2572 if (n==bend) { /* buffer overflow */
2573 if (dot) break; /* cut off remaining fractions */
2574 return INFINITY;
2575 }
2576 *n++ = c;
2577 }
2578 prev = c;
2579 }
2580 *n = '\0';
2581 p = buf;
2582 pend = n;
2583nocopy:
2584 d = mrb_float_read(p, &end);
2585 if (p == end) {
2586 if (badcheck) {
2587bad:
2588 mrb_raisef(mrb, E_ARGUMENT_ERROR, "invalid string for float(%!s)", s);
2589 /* not reached */
2590 }
2591 return d;
2592 }
2593 if (badcheck) {
2594 if (!end || p == end) goto bad;
2595 while (end<pend && ISSPACE(*end)) end++;
2596 if (end<pend) goto bad;
2597 }
2598 return d;
2599}
2600
2601MRB_API double
2602mrb_cstr_to_dbl(mrb_state *mrb, const char *s, mrb_bool badcheck)
2603{
2604 return mrb_str_len_to_dbl(mrb, s, strlen(s), badcheck);
2605}
2606
2607MRB_API double
2608mrb_str_to_dbl(mrb_state *mrb, mrb_value str, mrb_bool badcheck)
2609{
2610 return mrb_str_len_to_dbl(mrb, RSTRING_PTR(str), RSTRING_LEN(str), badcheck);
2611}
2612
2613/* 15.2.10.5.39 */
2614/*
2615 * call-seq:
2616 * str.to_f => float
2617 *
2618 * Returns the result of interpreting leading characters in <i>str</i> as a
2619 * floating point number. Extraneous characters past the end of a valid number
2620 * are ignored. If there is not a valid number at the start of <i>str</i>,
2621 * <code>0.0</code> is returned. This method never raises an exception.
2622 *
2623 * "123.45e1".to_f #=> 1234.5
2624 * "45.67 degrees".to_f #=> 45.67
2625 * "thx1138".to_f #=> 0.0
2626 */
2627static mrb_value
2628mrb_str_to_f(mrb_state *mrb, mrb_value self)
2629{
2630 return mrb_float_value(mrb, mrb_str_to_dbl(mrb, self, FALSE));
2631}
2632#endif
2633
2634/* 15.2.10.5.40 */
2635/*
2636 * call-seq:
2637 * str.to_s => str
2638 *
2639 * Returns the receiver.
2640 */
2641static mrb_value
2642mrb_str_to_s(mrb_state *mrb, mrb_value self)
2643{
2644 if (mrb_obj_class(mrb, self) != mrb->string_class) {
2645 return mrb_str_dup(mrb, self);
2646 }
2647 return self;
2648}
2649
2650/* 15.2.10.5.43 */
2651/*
2652 * call-seq:
2653 * str.upcase! => str or nil
2654 *
2655 * Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
2656 * were made.
2657 */
2658static mrb_value
2659mrb_str_upcase_bang(mrb_state *mrb, mrb_value str)
2660{
2661 struct RString *s = mrb_str_ptr(str);
2662 char *p, *pend;
2663 mrb_bool modify = FALSE;
2664
2665 mrb_str_modify_keep_ascii(mrb, s);
2666 p = RSTRING_PTR(str);
2667 pend = RSTRING_END(str);
2668 while (p < pend) {
2669 if (ISLOWER(*p)) {
2670 *p = TOUPPER(*p);
2671 modify = TRUE;
2672 }
2673 p++;
2674 }
2675
2676 if (modify) return str;
2677 return mrb_nil_value();
2678}
2679
2680/* 15.2.10.5.42 */
2681/*
2682 * call-seq:
2683 * str.upcase => new_str
2684 *
2685 * Returns a copy of <i>str</i> with all lowercase letters replaced with their
2686 * uppercase counterparts. The operation is locale insensitive---only
2687 * characters 'a' to 'z' are affected.
2688 *
2689 * "hEllO".upcase #=> "HELLO"
2690 */
2691static mrb_value
2692mrb_str_upcase(mrb_state *mrb, mrb_value self)
2693{
2694 mrb_value str;
2695
2696 str = mrb_str_dup(mrb, self);
2697 mrb_str_upcase_bang(mrb, str);
2698 return str;
2699}
2700
2701/*
2702 * call-seq:
2703 * str.dump -> new_str
2704 *
2705 * Produces a version of <i>str</i> with all nonprinting characters replaced by
2706 * <code>\nnn</code> notation and all special characters escaped.
2707 */
2708mrb_value
2709mrb_str_dump(mrb_state *mrb, mrb_value str)
2710{
2711 return str_escape(mrb, str, FALSE);
2712}
2713
2714MRB_API mrb_value
2715mrb_str_cat(mrb_state *mrb, mrb_value str, const char *ptr, size_t len)
2716{
2717 struct RString *s = mrb_str_ptr(str);
2718 size_t capa;
2719 size_t total;
2720 ptrdiff_t off = -1;
2721
2722 if (len == 0) return str;
2723 mrb_str_modify(mrb, s);
2724 if (ptr >= RSTR_PTR(s) && ptr <= RSTR_PTR(s) + (size_t)RSTR_LEN(s)) {
2725 off = ptr - RSTR_PTR(s);
2726 }
2727
2728 capa = RSTR_CAPA(s);
2729 total = RSTR_LEN(s)+len;
2730 if (total >= MRB_SSIZE_MAX) {
2731 size_error:
2732 mrb_raise(mrb, E_ARGUMENT_ERROR, "string size too big");
2733 }
2734 if (capa <= total) {
2735 if (capa == 0) capa = 1;
2736 while (capa <= total) {
2737 if (capa <= MRB_SSIZE_MAX / 2) {
2738 capa *= 2;
2739 }
2740 else {
2741 capa = total+1;
2742 }
2743 }
2744 if (capa <= total || capa > MRB_SSIZE_MAX) {
2745 goto size_error;
2746 }
2747 resize_capa(mrb, s, capa);
2748 }
2749 if (off != -1) {
2750 ptr = RSTR_PTR(s) + off;
2751 }
2752 memcpy(RSTR_PTR(s) + RSTR_LEN(s), ptr, len);
2753 mrb_assert_int_fit(size_t, total, mrb_ssize, MRB_SSIZE_MAX);
2754 RSTR_SET_LEN(s, total);
2755 RSTR_PTR(s)[total] = '\0'; /* sentinel */
2756 return str;
2757}
2758
2759MRB_API mrb_value
2760mrb_str_cat_cstr(mrb_state *mrb, mrb_value str, const char *ptr)
2761{
2762 return mrb_str_cat(mrb, str, ptr, strlen(ptr));
2763}
2764
2765MRB_API mrb_value
2766mrb_str_cat_str(mrb_state *mrb, mrb_value str, mrb_value str2)
2767{
2768 if (mrb_str_ptr(str) == mrb_str_ptr(str2)) {
2769 mrb_str_modify(mrb, mrb_str_ptr(str));
2770 }
2771 return mrb_str_cat(mrb, str, RSTRING_PTR(str2), RSTRING_LEN(str2));
2772}
2773
2774MRB_API mrb_value
2775mrb_str_append(mrb_state *mrb, mrb_value str1, mrb_value str2)
2776{
2777 mrb_to_str(mrb, str2);
2778 return mrb_str_cat_str(mrb, str1, str2);
2779}
2780
2781/*
2782 * call-seq:
2783 * str.inspect -> string
2784 *
2785 * Returns a printable version of _str_, surrounded by quote marks,
2786 * with special characters escaped.
2787 *
2788 * str = "hello"
2789 * str[3] = "\b"
2790 * str.inspect #=> "\"hel\\bo\""
2791 */
2792mrb_value
2793mrb_str_inspect(mrb_state *mrb, mrb_value str)
2794{
2795 return str_escape(mrb, str, TRUE);
2796}
2797
2798/*
2799 * call-seq:
2800 * str.bytes -> array of fixnums
2801 *
2802 * Returns an array of bytes in _str_.
2803 *
2804 * str = "hello"
2805 * str.bytes #=> [104, 101, 108, 108, 111]
2806 */
2807static mrb_value
2808mrb_str_bytes(mrb_state *mrb, mrb_value str)
2809{
2810 struct RString *s = mrb_str_ptr(str);
2811 mrb_value a = mrb_ary_new_capa(mrb, RSTR_LEN(s));
2812 unsigned char *p = (unsigned char *)(RSTR_PTR(s)), *pend = p + RSTR_LEN(s);
2813
2814 while (p < pend) {
2815 mrb_ary_push(mrb, a, mrb_fixnum_value(p[0]));
2816 p++;
2817 }
2818 return a;
2819}
2820
2821/*
2822 * call-seq:
2823 * str.getbyte(index) -> 0 .. 255
2824 *
2825 * returns the <i>index</i>th byte as an integer.
2826 */
2827static mrb_value
2828mrb_str_getbyte(mrb_state *mrb, mrb_value str)
2829{
2830 mrb_int pos;
2831 mrb_get_args(mrb, "i", &pos);
2832
2833 if (pos < 0)
2834 pos += RSTRING_LEN(str);
2835 if (pos < 0 || RSTRING_LEN(str) <= pos)
2836 return mrb_nil_value();
2837
2838 return mrb_fixnum_value((unsigned char)RSTRING_PTR(str)[pos]);
2839}
2840
2841/*
2842 * call-seq:
2843 * str.setbyte(index, integer) -> integer
2844 *
2845 * modifies the <i>index</i>th byte as <i>integer</i>.
2846 */
2847static mrb_value
2848mrb_str_setbyte(mrb_state *mrb, mrb_value str)
2849{
2850 mrb_int pos, byte;
2851 mrb_int len;
2852
2853 mrb_get_args(mrb, "ii", &pos, &byte);
2854
2855 len = RSTRING_LEN(str);
2856 if (pos < -len || len <= pos)
2857 mrb_raisef(mrb, E_INDEX_ERROR, "index %i out of string", pos);
2858 if (pos < 0)
2859 pos += len;
2860
2861 mrb_str_modify(mrb, mrb_str_ptr(str));
2862 byte &= 0xff;
2863 RSTRING_PTR(str)[pos] = (unsigned char)byte;
2864 return mrb_fixnum_value((unsigned char)byte);
2865}
2866
2867/*
2868 * call-seq:
2869 * str.byteslice(integer) -> new_str or nil
2870 * str.byteslice(integer, integer) -> new_str or nil
2871 * str.byteslice(range) -> new_str or nil
2872 *
2873 * Byte Reference---If passed a single Integer, returns a
2874 * substring of one byte at that position. If passed two Integer
2875 * objects, returns a substring starting at the offset given by the first, and
2876 * a length given by the second. If given a Range, a substring containing
2877 * bytes at offsets given by the range is returned. In all three cases, if
2878 * an offset is negative, it is counted from the end of <i>str</i>. Returns
2879 * <code>nil</code> if the initial offset falls outside the string, the length
2880 * is negative, or the beginning of the range is greater than the end.
2881 * The encoding of the resulted string keeps original encoding.
2882 *
2883 * "hello".byteslice(1) #=> "e"
2884 * "hello".byteslice(-1) #=> "o"
2885 * "hello".byteslice(1, 2) #=> "el"
2886 * "\x80\u3042".byteslice(1, 3) #=> "\u3042"
2887 * "\x03\u3042\xff".byteslice(1..3) #=> "\u3042"
2888 */
2889static mrb_value
2890mrb_str_byteslice(mrb_state *mrb, mrb_value str)
2891{
2892 mrb_value a1, a2;
2893 mrb_int str_len = RSTRING_LEN(str), beg, len;
2894 mrb_bool empty = TRUE;
2895
2896 if (mrb_get_args(mrb, "o|o", &a1, &a2) == 2) {
2897 beg = mrb_fixnum(mrb_to_int(mrb, a1));
2898 len = mrb_fixnum(mrb_to_int(mrb, a2));
2899 }
2900 else if (mrb_range_p(a1)) {
2901 if (mrb_range_beg_len(mrb, a1, &beg, &len, str_len, TRUE) != MRB_RANGE_OK) {
2902 return mrb_nil_value();
2903 }
2904 }
2905 else {
2906 beg = mrb_fixnum(mrb_to_int(mrb, a1));
2907 len = 1;
2908 empty = FALSE;
2909 }
2910
2911 if (mrb_str_beg_len(str_len, &beg, &len) && (empty || len != 0)) {
2912 return mrb_str_byte_subseq(mrb, str, beg, len);
2913 }
2914 else {
2915 return mrb_nil_value();
2916 }
2917}
2918
/* ---------------------------*/
/*
 * Create the String class and register its core methods.
 *
 * Defines "String" as a subclass of mrb->object_class, stores it in
 * mrb->string_class, sets its instance type tag to MRB_TT_STRING, and then
 * binds each Ruby-level method name to its C implementation. The trailing
 * "15.2.10.5.x" comments are the section numbers carried over from the
 * original source; entries without a number have none in the original.
 */
void
mrb_init_string(mrb_state *mrb)
{
  struct RClass *s;

  /* compile-time check that the embedded-length bit field can hold
   * RSTRING_EMBED_LEN_MAX */
  mrb_static_assert(RSTRING_EMBED_LEN_MAX < (1 << MRB_STR_EMBED_LEN_BIT),
                    "pointer size too big for embedded string");

  mrb->string_class = s = mrb_define_class(mrb, "String", mrb->object_class);             /* 15.2.10 */
  MRB_SET_INSTANCE_TT(s, MRB_TT_STRING);

  mrb_define_method(mrb, s, "bytesize",        mrb_str_bytesize,        MRB_ARGS_NONE());

  mrb_define_method(mrb, s, "<=>",             mrb_str_cmp_m,           MRB_ARGS_REQ(1)); /* 15.2.10.5.1  */
  mrb_define_method(mrb, s, "==",              mrb_str_equal_m,         MRB_ARGS_REQ(1)); /* 15.2.10.5.2  */
  mrb_define_method(mrb, s, "+",               mrb_str_plus_m,          MRB_ARGS_REQ(1)); /* 15.2.10.5.4  */
  mrb_define_method(mrb, s, "*",               mrb_str_times,           MRB_ARGS_REQ(1)); /* 15.2.10.5.5  */
  mrb_define_method(mrb, s, "[]",              mrb_str_aref_m,          MRB_ARGS_ANY());  /* 15.2.10.5.6  */
  mrb_define_method(mrb, s, "[]=",             mrb_str_aset_m,          MRB_ARGS_ANY());
  mrb_define_method(mrb, s, "capitalize",      mrb_str_capitalize,      MRB_ARGS_NONE()); /* 15.2.10.5.7  */
  mrb_define_method(mrb, s, "capitalize!",     mrb_str_capitalize_bang, MRB_ARGS_NONE()); /* 15.2.10.5.8  */
  mrb_define_method(mrb, s, "chomp",           mrb_str_chomp,           MRB_ARGS_ANY());  /* 15.2.10.5.9  */
  mrb_define_method(mrb, s, "chomp!",          mrb_str_chomp_bang,      MRB_ARGS_ANY());  /* 15.2.10.5.10 */
  mrb_define_method(mrb, s, "chop",            mrb_str_chop,            MRB_ARGS_NONE()); /* 15.2.10.5.11 */
  mrb_define_method(mrb, s, "chop!",           mrb_str_chop_bang,       MRB_ARGS_NONE()); /* 15.2.10.5.12 */
  mrb_define_method(mrb, s, "downcase",        mrb_str_downcase,        MRB_ARGS_NONE()); /* 15.2.10.5.13 */
  mrb_define_method(mrb, s, "downcase!",       mrb_str_downcase_bang,   MRB_ARGS_NONE()); /* 15.2.10.5.14 */
  mrb_define_method(mrb, s, "empty?",          mrb_str_empty_p,         MRB_ARGS_NONE()); /* 15.2.10.5.16 */
  mrb_define_method(mrb, s, "eql?",            mrb_str_eql,             MRB_ARGS_REQ(1)); /* 15.2.10.5.17 */

  mrb_define_method(mrb, s, "hash",            mrb_str_hash_m,          MRB_ARGS_NONE()); /* 15.2.10.5.20 */
  mrb_define_method(mrb, s, "include?",        mrb_str_include,         MRB_ARGS_REQ(1)); /* 15.2.10.5.21 */
  mrb_define_method(mrb, s, "index",           mrb_str_index_m,         MRB_ARGS_ARG(1,1));  /* 15.2.10.5.22 */
  mrb_define_method(mrb, s, "initialize",      mrb_str_init,            MRB_ARGS_REQ(1)); /* 15.2.10.5.23 */
  /* initialize_copy shares its implementation with "replace" below */
  mrb_define_method(mrb, s, "initialize_copy", mrb_str_replace,         MRB_ARGS_REQ(1)); /* 15.2.10.5.24 */
  mrb_define_method(mrb, s, "intern",          mrb_str_intern,          MRB_ARGS_NONE()); /* 15.2.10.5.25 */
  /* "length" and "size" are bound to the same function */
  mrb_define_method(mrb, s, "length",          mrb_str_size,            MRB_ARGS_NONE()); /* 15.2.10.5.26 */
  mrb_define_method(mrb, s, "replace",         mrb_str_replace,         MRB_ARGS_REQ(1)); /* 15.2.10.5.28 */
  mrb_define_method(mrb, s, "reverse",         mrb_str_reverse,         MRB_ARGS_NONE()); /* 15.2.10.5.29 */
  mrb_define_method(mrb, s, "reverse!",        mrb_str_reverse_bang,    MRB_ARGS_NONE()); /* 15.2.10.5.30 */
  mrb_define_method(mrb, s, "rindex",          mrb_str_rindex,          MRB_ARGS_ANY());  /* 15.2.10.5.31 */
  mrb_define_method(mrb, s, "size",            mrb_str_size,            MRB_ARGS_NONE()); /* 15.2.10.5.33 */
  /* "slice" is bound to the same function as "[]" */
  mrb_define_method(mrb, s, "slice",           mrb_str_aref_m,          MRB_ARGS_ANY());  /* 15.2.10.5.34 */
  mrb_define_method(mrb, s, "split",           mrb_str_split_m,         MRB_ARGS_ANY());  /* 15.2.10.5.35 */

  /* to_f is only registered when float support is compiled in */
#ifndef MRB_WITHOUT_FLOAT
  mrb_define_method(mrb, s, "to_f",            mrb_str_to_f,            MRB_ARGS_NONE()); /* 15.2.10.5.38 */
#endif
  mrb_define_method(mrb, s, "to_i",            mrb_str_to_i,            MRB_ARGS_ANY());  /* 15.2.10.5.39 */
  /* "to_s" and "to_str" are bound to the same function */
  mrb_define_method(mrb, s, "to_s",            mrb_str_to_s,            MRB_ARGS_NONE()); /* 15.2.10.5.40 */
  mrb_define_method(mrb, s, "to_str",          mrb_str_to_s,            MRB_ARGS_NONE());
  /* "to_sym" is bound to the same function as "intern" */
  mrb_define_method(mrb, s, "to_sym",          mrb_str_intern,          MRB_ARGS_NONE()); /* 15.2.10.5.41 */
  mrb_define_method(mrb, s, "upcase",          mrb_str_upcase,          MRB_ARGS_NONE()); /* 15.2.10.5.42 */
  mrb_define_method(mrb, s, "upcase!",         mrb_str_upcase_bang,     MRB_ARGS_NONE()); /* 15.2.10.5.43 */
  mrb_define_method(mrb, s, "inspect",         mrb_str_inspect,         MRB_ARGS_NONE()); /* 15.2.10.5.46(x) */
  mrb_define_method(mrb, s, "bytes",           mrb_str_bytes,           MRB_ARGS_NONE());

  /* byte-level accessors */
  mrb_define_method(mrb, s, "getbyte",         mrb_str_getbyte,         MRB_ARGS_REQ(1));
  mrb_define_method(mrb, s, "setbyte",         mrb_str_setbyte,         MRB_ARGS_REQ(2));
  mrb_define_method(mrb, s, "byteslice",       mrb_str_byteslice,       MRB_ARGS_ARG(1,1));
}
2981
2982#ifndef MRB_WITHOUT_FLOAT
2983/*
2984 * Source code for the "strtod" library procedure.
2985 *
2986 * Copyright (c) 1988-1993 The Regents of the University of California.
2987 * Copyright (c) 1994 Sun Microsystems, Inc.
2988 *
2989 * Permission to use, copy, modify, and distribute this
2990 * software and its documentation for any purpose and without
2991 * fee is hereby granted, provided that the above copyright
2992 * notice appear in all copies. The University of California
2993 * makes no representations about the suitability of this
2994 * software for any purpose. It is provided "as is" without
2995 * express or implied warranty.
2996 *
2997 * RCS: @(#) $Id$
2998 */
2999
3000#include <ctype.h>
3001#include <errno.h>
3002
/*
 * Largest base-10 exponent magnitude that is processed. Anything larger
 * already produces underflow or overflow, so additional exponent digits
 * need not be considered.
 */
static const int maxExponent = 511;

/*
 * Binary powers of ten: entry i holds 10^(2^i). Used to turn a decimal
 * exponent into a floating-point scale factor one bit at a time. Nine
 * entries cover every exponent up to maxExponent (511 = 2^9 - 1).
 */
static const double powersOf10[] = {
  1.0e1,
  1.0e2,
  1.0e4,
  1.0e8,
  1.0e16,
  1.0e32,
  1.0e64,
  1.0e128,
  1.0e256
};
3019
/*
 * Parse a decimal floating-point number from a NUL-terminated string.
 *
 *   string  A decimal ASCII floating-point number, optionally preceded by
 *           white space. Must have the form "-I.FE-X", where I is the
 *           integer part of the mantissa, F is the fractional part of the
 *           mantissa, and X is the exponent. Either of the signs may be
 *           "+", "-", or omitted. Either I or F may be omitted, or both.
 *           The decimal point isn't necessary unless F is present. The "E"
 *           may actually be an "e". E and X may both be omitted (but not
 *           just one).
 *   endPtr  If non-NULL, receives the address of the character at which
 *           parsing stopped. When no mantissa digits are found this is
 *           `string` itself.
 *
 * Returns the parsed value, negated if a leading '-' was seen; zero when
 * no mantissa digits are found. errno is set to ERANGE when the magnitude
 * of the combined exponent exceeds maxExponent (the exponent is then
 * clamped to maxExponent).
 */
MRB_API double
mrb_float_read(const char *string, char **endPtr)
{
  int sign, expSign = FALSE;
  double fraction, dblExp;
  const double *d;
  const char *p;
  int c;
  int exp = 0;                  /* Exponent read from "EX" field. */
  int fracExp = 0;              /* Exponent that derives from the fractional
                                 * part.  Under normal circumstances, it is
                                 * the negative of the number of digits in F.
                                 * However, if I is very long, the last digits
                                 * of I get dropped (otherwise a long I with a
                                 * large negative exponent could cause an
                                 * unnecessary overflow on I alone).  In this
                                 * case, fracExp is incremented one for each
                                 * dropped digit. */
  int mantSize;                 /* Number of digits in mantissa. */
  int decPt;                    /* Number of mantissa digits BEFORE decimal
                                 * point. */
  const char *pExp;             /* Temporarily holds location of exponent
                                 * in string. */

  /*
   * Strip off leading blanks and check for a sign.
   */

  p = string;
  while (ISSPACE(*p)) {
    p += 1;
  }
  if (*p == '-') {
    sign = TRUE;
    p += 1;
  }
  else {
    if (*p == '+') {
      p += 1;
    }
    sign = FALSE;
  }

  /*
   * Count the number of digits in the mantissa (including the decimal
   * point), and also locate the decimal point.
   */

  decPt = -1;
  for (mantSize = 0; ; mantSize += 1)
  {
    c = *p;
    if (!ISDIGIT(c)) {
      /* stop at anything that is not a digit or the first '.' */
      if ((c != '.') || (decPt >= 0)) {
        break;
      }
      decPt = mantSize;
    }
    p += 1;
  }

  /*
   * Now suck up the digits in the mantissa.  Use two integers to
   * collect 9 digits each (this is faster than using floating-point).
   * If the mantissa has more than 18 digits, ignore the extras, since
   * they can't affect the value anyway.
   */

  pExp  = p;                    /* exponent (if any) starts after the mantissa */
  p -= mantSize;                /* rewind to the first mantissa character */
  if (decPt < 0) {
    decPt = mantSize;
  }
  else {
    mantSize -= 1;              /* One of the digits was the point. */
  }
  if (mantSize > 18) {
    /* digits beyond the 18th are dropped; account for them in fracExp,
     * capped at 29999 */
    if (decPt - 18 > 29999) {
      fracExp = 29999;
    }
    else {
      fracExp = decPt - 18;
    }
    mantSize = 18;
  }
  else {
    fracExp = decPt - mantSize;
  }
  if (mantSize == 0) {
    /* no digits at all: report that nothing was consumed */
    fraction = 0.0;
    p = string;
    goto done;
  }
  else {
    int frac1, frac2;
    /* frac1 collects the leading digits beyond the last nine */
    frac1 = 0;
    for ( ; mantSize > 9; mantSize -= 1)
    {
      c = *p;
      p += 1;
      if (c == '.') {
        /* skip over the decimal point */
        c = *p;
        p += 1;
      }
      frac1 = 10*frac1 + (c - '0');
    }
    /* frac2 collects the remaining (up to nine) digits */
    frac2 = 0;
    for (; mantSize > 0; mantSize -= 1)
    {
      c = *p;
      p += 1;
      if (c == '.') {
        c = *p;
        p += 1;
      }
      frac2 = 10*frac2 + (c - '0');
    }
    fraction = (1.0e9 * frac1) + frac2;
  }

  /*
   * Skim off the exponent.
   */

  p = pExp;
  if ((*p == 'E') || (*p == 'e')) {
    p += 1;
    if (*p == '-') {
      expSign = TRUE;
      p += 1;
    }
    else {
      if (*p == '+') {
        p += 1;
      }
      expSign = FALSE;
    }
    while (ISDIGIT(*p)) {
      exp = exp * 10 + (*p - '0');
      /* clamp the accumulated exponent at 19999 (presumably to keep the
       * int from overflowing on very long digit runs) */
      if (exp > 19999) {
        exp = 19999;
      }
      p += 1;
    }
  }
  /* combine the explicit exponent with the one implied by the mantissa */
  if (expSign) {
    exp = fracExp - exp;
  }
  else {
    exp = fracExp + exp;
  }

  /*
   * Generate a floating-point number that represents the exponent.
   * Do this by processing the exponent one bit at a time to combine
   * many powers of 2 of 10. Then combine the exponent with the
   * fraction.
   */

  if (exp < 0) {
    expSign = TRUE;
    exp = -exp;
  }
  else {
    expSign = FALSE;
  }
  if (exp > maxExponent) {
    exp = maxExponent;
    errno = ERANGE;
  }
  dblExp = 1.0;
  /* each set bit i of exp contributes a factor of powersOf10[i] */
  for (d = powersOf10; exp != 0; exp >>= 1, d += 1) {
    if (exp & 01) {
      dblExp *= *d;
    }
  }
  if (expSign) {
    fraction /= dblExp;
  }
  else {
    fraction *= dblExp;
  }

done:
  if (endPtr != NULL) {
    *endPtr = (char *) p;
  }

  if (sign) {
    return -fraction;
  }
  return fraction;
}
3227#endif
Note: See TracBrowser for help on using the repository browser.