Ruby 3.2.4p170 (2024-04-23 revision af471c0e0127eea0cafa6f308c0425bbfab0acf5)
string.c
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "gc.h"
27#include "id.h"
28#include "internal.h"
29#include "internal/array.h"
30#include "internal/compar.h"
31#include "internal/compilers.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/numeric.h"
36#include "internal/object.h"
37#include "internal/proc.h"
38#include "internal/re.h"
39#include "internal/sanitizers.h"
40#include "internal/string.h"
41#include "internal/transcode.h"
42#include "probes.h"
43#include "ruby/encoding.h"
44#include "ruby/re.h"
45#include "ruby/util.h"
46#include "ruby_assert.h"
47#include "vm_sync.h"
48
49#if defined HAVE_CRYPT_R
50# if defined HAVE_CRYPT_H
51# include <crypt.h>
52# endif
53#elif !defined HAVE_CRYPT
54# include "missing/crypt.h"
55# define HAVE_CRYPT_R 1
56#endif
57
/* Shorthand for the begin/end offset of capture group `no`.  Both macros
 * expand to an expression that needs a variable named `regs` (with `beg`
 * and `end` arrays) to be in scope where they are used. */
#define BEG(no) (regs->beg[(no)])
#define END(no) (regs->end[(no)])
60
/* Drop any macro definitions of these names that previously included
 * headers may have installed, so that the identifiers can be used as plain
 * function names in this file (e.g. rb_str_new is defined as a function
 * further down). */
#undef rb_str_new
#undef rb_usascii_str_new
#undef rb_utf8_str_new
#undef rb_enc_str_new
#undef rb_str_new_cstr
#undef rb_usascii_str_new_cstr
#undef rb_utf8_str_new_cstr
#undef rb_enc_str_new_cstr
#undef rb_external_str_new_cstr
#undef rb_locale_str_new_cstr
#undef rb_str_dup_frozen
#undef rb_str_buf_new_cstr
#undef rb_str_buf_cat
#undef rb_str_buf_cat2
#undef rb_str_cat2
#undef rb_str_cat_cstr
#undef rb_fstring_cstr
78
81
82/* FLAGS of RString
83 *
84 * 1: RSTRING_NOEMBED
85 * 2: STR_SHARED (== ELTS_SHARED)
86 * 2-6: RSTRING_EMBED_LEN (5 bits == 32)
87 * 5: STR_SHARED_ROOT (RSTRING_NOEMBED==1 && STR_SHARED == 0, there may be
88 * other strings that rely on this string's buffer)
89 * 6: STR_BORROWED (when RSTRING_NOEMBED==1 && klass==0, unsafe to recycle
90 * early, specific to rb_str_tmp_frozen_{acquire,release})
91 * 7: STR_TMPLOCK (set when a pointer to the buffer is passed to syscall
92 * such as read(2). Any modification and realloc is prohibited)
93 *
94 * 8-9: ENC_CODERANGE (2 bits)
95 * 10-16: ENCODING (7 bits == 128)
96 * 17: RSTRING_FSTR
97 * 18: STR_NOFREE (do not free this string's buffer when a String is freed.
98 * used for a string object based on C string literal)
99 * 19: STR_FAKESTR (when RVALUE is not managed by GC. Typically, the string
100 * object header is temporarily allocated on C stack)
101 */
102
/* NOTE(review): presumably an upper bound on the byte length of one
 * character in any supported encoding -- confirm against its users. */
#define RUBY_MAX_CHAR_LEN 16
/* String-specific meanings of the generic FL_USERn flag bits; see the
 * "FLAGS of RString" table for what each bit stands for. */
#define STR_SHARED_ROOT FL_USER5
#define STR_BORROWED FL_USER6
#define STR_TMPLOCK FL_USER7
#define STR_NOFREE FL_USER18
#define STR_FAKESTR FL_USER19
109
/* Mark `str` as a heap (non-embedded) string.  With USE_RVARGC the
 * sharing-related flags are cleared as well; otherwise the embedded length
 * bits kept in the flags word are reset to zero. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    if (USE_RVARGC) {\
        FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
    }\
    else {\
        STR_SET_EMBED_LEN((str), 0);\
    }\
} while (0)
/* Mark `str` as embedded by clearing STR_NOEMBED and STR_NOFREE. */
#define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE))
/* Store `n` as the length of the embedded string `str`.
 * - USE_RVARGC: the length lives in as.embed.len; the assertion checks that
 *   `n` is strictly smaller than the embedded capacity of the slot.
 * - otherwise: the length is packed into the flags word using
 *   RSTRING_EMBED_LEN_MASK / RSTRING_EMBED_LEN_SHIFT. */
#if USE_RVARGC
# define STR_SET_EMBED_LEN(str, n) do { \
    assert(str_embed_capa(str) > (n));\
    RSTRING(str)->as.embed.len = (n);\
} while (0)
#else
# define STR_SET_EMBED_LEN(str, n) do { \
    long tmp_n = (n);\
    RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
    RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
} while (0)
#endif
132
/* Set the byte length of `str` to `n`, writing to whichever representation
 * (embedded or heap) the string currently uses. */
#define STR_SET_LEN(str, n) do { \
    if (STR_EMBED_P(str)) {\
        STR_SET_EMBED_LEN((str), (n));\
    }\
    else {\
        RSTRING(str)->as.heap.len = (n);\
    }\
} while (0)

/* Decrease the byte length of `str` by one, for either representation. */
#define STR_DEC_LEN(str) do {\
    if (STR_EMBED_P(str)) {\
        long n = RSTRING_LEN(str);\
        n--;\
        STR_SET_EMBED_LEN((str), n);\
    }\
    else {\
        RSTRING(str)->as.heap.len--;\
    }\
} while (0)
152
153static inline bool
154str_enc_fastpath(VALUE str)
155{
156 // The overwhelming majority of strings are in one of these 3 encodings.
157 switch (ENCODING_GET_INLINED(str)) {
158 case ENCINDEX_ASCII_8BIT:
159 case ENCINDEX_UTF_8:
160 case ENCINDEX_US_ASCII:
161 return true;
162 default:
163 return false;
164 }
165}
166
/* Width in bytes of the terminator of `str`: 1 for the fast-path encodings,
 * otherwise the minimum character length of the string's encoding. */
#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/* Write `termlen` zero bytes at `ptr`.  The first byte is always stored
 * directly; memset is only called for terminators wider than one byte.
 * Both arguments are evaluated exactly once. */
#define TERM_FILL(ptr, termlen) do {\
    char *const term_fill_ptr = (ptr);\
    const int term_fill_len = (termlen);\
    *term_fill_ptr = '\0';\
    if (UNLIKELY(term_fill_len > 1))\
        memset(term_fill_ptr, 0, term_fill_len);\
} while (0)
175
/* Resize the buffer of `str` to hold `capacity` bytes plus the terminator
 * implied by the string's encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
/* Resize the buffer of `str` to hold `capacity` bytes plus `termlen`
 * terminator bytes.
 * - Embedded string: nothing happens while the slot is large enough;
 *   otherwise a heap buffer is allocated, the current bytes are copied and
 *   the string is switched to the heap representation.
 * - Heap string: the buffer is reallocated in place; the string must not be
 *   shared (asserted).
 * `capacity` and `termlen` are parenthesized at every use so that compound
 * argument expressions keep their meaning. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->as.heap.len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        assert(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
199
/* Make `str` share the buffer of `shared_str`.  Nothing is done for fake
 * strings (STR_FAKESTR).  Otherwise the assertions check that the pointer
 * of `str` lies within the buffer of `shared_str`, the reference is stored
 * through the write barrier, `str` is flagged STR_SHARED and `shared_str`
 * STR_SHARED_ROOT.  A root without a class is additionally flagged
 * STR_BORROWED. */
#define STR_SET_SHARED(str, shared_str) do { \
    if (!FL_TEST(str, STR_FAKESTR)) { \
        assert(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
        assert(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
        RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
        FL_SET((str), STR_SHARED); \
        FL_SET((shared_str), STR_SHARED_ROOT); \
        if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
            FL_SET_RAW((shared_str), STR_BORROWED); \
    } \
} while (0)
211
/* Raw heap buffer pointer of `str` (only meaningful for heap strings). */
#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Allocated size of the heap buffer: capacity plus terminator width. */
#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
/* TODO: include the terminator size in capa. */

/* Encoding object of `str`. */
#define STR_ENC_GET(str) get_encoding(str)

/* SHARABLE_SUBSTRING_P(beg, len, end): unless SHARABLE_MIDDLE_SUBSTRING is
 * enabled, a substring qualifies only when it runs up to `end`, i.e.
 * beg + len == end; with the option enabled every substring qualifies. */
#if !defined SHARABLE_MIDDLE_SUBSTRING
# define SHARABLE_MIDDLE_SUBSTRING 0
#endif
#if !SHARABLE_MIDDLE_SUBSTRING
#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
#else
#define SHARABLE_SUBSTRING_P(beg, len, end) 1
#endif
226
227
228static inline long
229str_embed_capa(VALUE str)
230{
231#if USE_RVARGC
232 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
233#else
234 return RSTRING_EMBED_LEN_MAX + 1;
235#endif
236}
237
238bool
239rb_str_reembeddable_p(VALUE str)
240{
241 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
242}
243
244static inline size_t
245rb_str_embed_size(long capa)
246{
247 return offsetof(struct RString, as.embed.ary) + capa;
248}
249
250size_t
251rb_str_size_as_embedded(VALUE str)
252{
253 size_t real_size;
254#if USE_RVARGC
255 if (STR_EMBED_P(str)) {
256 real_size = rb_str_embed_size(RSTRING(str)->as.embed.len) + TERM_LEN(str);
257 }
258 /* if the string is not currently embedded, but it can be embedded, how
259 * much space would it require */
260 else if (rb_str_reembeddable_p(str)) {
261 real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
262 }
263 else {
264#endif
265 real_size = sizeof(struct RString);
266#if USE_RVARGC
267 }
268#endif
269 return real_size;
270}
271
272static inline bool
273STR_EMBEDDABLE_P(long len, long termlen)
274{
275#if USE_RVARGC
276 return rb_gc_size_allocatable_p(rb_str_embed_size(len + termlen));
277#else
278 return len <= RSTRING_EMBED_LEN_MAX + 1 - termlen;
279#endif
280}
281
282static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
283static VALUE str_new_frozen(VALUE klass, VALUE orig);
284static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
285static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
286static VALUE str_new(VALUE klass, const char *ptr, long len);
287static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
288static inline void str_modifiable(VALUE str);
289static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
290
291static inline void
292str_make_independent(VALUE str)
293{
294 long len = RSTRING_LEN(str);
295 int termlen = TERM_LEN(str);
296 str_make_independent_expand((str), len, 0L, termlen);
297}
298
299static inline int str_dependent_p(VALUE str);
300
301void
302rb_str_make_independent(VALUE str)
303{
304 if (str_dependent_p(str)) {
305 str_make_independent(str);
306 }
307}
308
309void
310rb_str_make_embedded(VALUE str)
311{
312 RUBY_ASSERT(rb_str_reembeddable_p(str));
313 RUBY_ASSERT(!STR_EMBED_P(str));
314
315 char *buf = RSTRING(str)->as.heap.ptr;
316 long len = RSTRING(str)->as.heap.len;
317
318 STR_SET_EMBED(str);
319 STR_SET_EMBED_LEN(str, len);
320
321 if (len > 0) {
322 memcpy(RSTRING_PTR(str), buf, len);
323 ruby_xfree(buf);
324 }
325
326 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
327}
328
329void
330rb_str_update_shared_ary(VALUE str, VALUE old_root, VALUE new_root)
331{
332 // if the root location hasn't changed, we don't need to update
333 if (new_root == old_root) {
334 return;
335 }
336
337 // if the root string isn't embedded, we don't need to touch the ponter.
338 // it already points to the shame shared buffer
339 if (!STR_EMBED_P(new_root)) {
340 return;
341 }
342
343 size_t offset = (size_t)((uintptr_t)RSTRING(str)->as.heap.ptr - (uintptr_t)RSTRING(old_root)->as.embed.ary);
344
345 RUBY_ASSERT(RSTRING(str)->as.heap.ptr >= RSTRING(old_root)->as.embed.ary);
346 RSTRING(str)->as.heap.ptr = RSTRING(new_root)->as.embed.ary + offset;
347}
348
/*
 * Debugging aid: print a warning on stderr that the function named func is
 * about to return a NULL string pointer.
 */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr,
            "%s is returning NULL!! SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, and look at the passed string.\n",
            func);
}
358
359/* symbols for [up|down|swap]case/capitalize options */
360static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
361
362static rb_encoding *
363get_encoding(VALUE str)
364{
365 return rb_enc_from_index(ENCODING_GET(str));
366}
367
368static void
369mustnot_broken(VALUE str)
370{
371 if (is_broken_string(str)) {
372 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
373 }
374}
375
376static void
377mustnot_wchar(VALUE str)
378{
379 rb_encoding *enc = STR_ENC_GET(str);
380 if (rb_enc_mbminlen(enc) > 1) {
381 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
382 }
383}
384
385static int fstring_cmp(VALUE a, VALUE b);
386
387static VALUE register_fstring(VALUE str, bool copy);
388
389const struct st_hash_type rb_fstring_hash_type = {
390 fstring_cmp,
392};
393
/* True when `str` has no FL_EXIVAR flag and its class is exactly String. */
#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
395
397 VALUE fstr;
398 bool copy;
399};
400
401static int
402fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
403{
404
405 struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
406 VALUE str = (VALUE)*key;
407
408 if (existing) {
409 /* because of lazy sweep, str may be unmarked already and swept
410 * at next time */
411
412 if (rb_objspace_garbage_object_p(str)) {
413 arg->fstr = Qundef;
414 return ST_DELETE;
415 }
416
417 arg->fstr = str;
418 return ST_STOP;
419 }
420 else {
421 if (FL_TEST_RAW(str, STR_FAKESTR)) {
422 if (arg->copy) {
423 VALUE new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->as.heap.len);
424 rb_enc_copy(new_str, str);
425 str = new_str;
426 }
427 else {
428 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
429 RSTRING(str)->as.heap.len,
430 ENCODING_GET(str));
431 }
432 OBJ_FREEZE_RAW(str);
433 }
434 else {
435 if (!OBJ_FROZEN(str))
436 str = str_new_frozen(rb_cString, str);
437 if (STR_SHARED_P(str)) { /* str should not be shared */
438 /* shared substring */
439 str_make_independent(str);
440 assert(OBJ_FROZEN(str));
441 }
442 if (!BARE_STRING_P(str)) {
443 str = str_new_frozen(rb_cString, str);
444 }
445 }
446 RBASIC(str)->flags |= RSTRING_FSTR;
447
448 *key = *value = arg->fstr = str;
449 return ST_CONTINUE;
450 }
451}
452
453RUBY_FUNC_EXPORTED
454VALUE
455rb_fstring(VALUE str)
456{
457 VALUE fstr;
458 int bare;
459
460 Check_Type(str, T_STRING);
461
462 if (FL_TEST(str, RSTRING_FSTR))
463 return str;
464
465 bare = BARE_STRING_P(str);
466 if (!bare) {
467 if (STR_EMBED_P(str)) {
468 OBJ_FREEZE_RAW(str);
469 return str;
470 }
471 if (FL_TEST_RAW(str, STR_NOEMBED|STR_SHARED_ROOT|STR_SHARED) == (STR_NOEMBED|STR_SHARED_ROOT)) {
472 assert(OBJ_FROZEN(str));
473 return str;
474 }
475 }
476
477 if (!OBJ_FROZEN(str))
478 rb_str_resize(str, RSTRING_LEN(str));
479
480 fstr = register_fstring(str, FALSE);
481
482 if (!bare) {
483 str_replace_shared_without_enc(str, fstr);
484 OBJ_FREEZE_RAW(str);
485 return str;
486 }
487 return fstr;
488}
489
490static VALUE
491register_fstring(VALUE str, bool copy)
492{
493 struct fstr_update_arg args;
494 args.copy = copy;
495
496 RB_VM_LOCK_ENTER();
497 {
498 st_table *frozen_strings = rb_vm_fstring_table();
499 do {
500 args.fstr = str;
501 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
502 } while (UNDEF_P(args.fstr));
503 }
504 RB_VM_LOCK_LEAVE();
505
506 assert(OBJ_FROZEN(args.fstr));
507 assert(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
508 assert(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
509 assert(RBASIC_CLASS(args.fstr) == rb_cString);
510 return args.fstr;
511}
512
513static VALUE
514setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
515{
516 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
517 /* SHARED to be allocated by the callback */
518
519 if (!name) {
520 RUBY_ASSERT_ALWAYS(len == 0);
521 name = "";
522 }
523
524 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
525
526 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
527 fake_str->as.heap.len = len;
528 fake_str->as.heap.ptr = (char *)name;
529 fake_str->as.heap.aux.capa = len;
530 return (VALUE)fake_str;
531}
532
533/*
534 * set up a fake string which refers a static string literal.
535 */
536VALUE
537rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
538{
539 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
540}
541
542/*
543 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
544 * shared string which refers a static string literal. `ptr` must
545 * point a constant string.
546 */
547MJIT_FUNC_EXPORTED VALUE
548rb_fstring_new(const char *ptr, long len)
549{
550 struct RString fake_str;
551 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), FALSE);
552}
553
554VALUE
555rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
556{
557 struct RString fake_str;
558 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), FALSE);
559}
560
561VALUE
562rb_fstring_cstr(const char *ptr)
563{
564 return rb_fstring_new(ptr, strlen(ptr));
565}
566
567static int
568fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
569{
570 RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
571 return ST_CONTINUE;
572}
573
574static int
575fstring_cmp(VALUE a, VALUE b)
576{
577 long alen, blen;
578 const char *aptr, *bptr;
579 RSTRING_GETMEM(a, aptr, alen);
580 RSTRING_GETMEM(b, bptr, blen);
581 return (alen != blen ||
582 ENCODING_GET(a) != ENCODING_GET(b) ||
583 memcmp(aptr, bptr, alen) != 0);
584}
585
586static inline int
587single_byte_optimizable(VALUE str)
588{
589 rb_encoding *enc;
590
591 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
593 return 1;
594
595 enc = STR_ENC_GET(str);
596 if (rb_enc_mbmaxlen(enc) == 1)
597 return 1;
598
599 /* Conservative. Possibly single byte.
600 * "\xa1" in Shift_JIS for example. */
601 return 0;
602}
603
605
/*
 * Find the first byte with the high bit (0x80) set in the range [p, e).
 * Returns a pointer to that byte, or NULL when there is none.
 *
 * The bulk of the range is examined one uintptr_t word at a time by
 * masking with NONASCII_MASK (0x80 repeated in every byte); the bytes
 * before the first aligned word and after the last full word are checked
 * individually.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
    const uintptr_t *s, *t;

    /* NONASCII_MASK: a word with 0x80 in each of its bytes. */
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
# if SIZEOF_UINTPTR_T == 8
#  define NONASCII_MASK UINT64_C(0x8080808080808080)
# elif SIZEOF_UINTPTR_T == 4
#  define NONASCII_MASK UINT32_C(0x80808080)
# else
#  error "don't know what to do."
# endif
#else
# if SIZEOF_UINTPTR_T == 8
#  define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
# elif SIZEOF_UINTPTR_T == 4
#  define NONASCII_MASK 0x80808080UL /* or...? */
# else
#  error "don't know what to do."
# endif
#endif

    /* Word-wise scan; without unaligned access it is only attempted when
     * at least one pointer-sized chunk is available. */
    if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
#if !UNALIGNED_WORD_ACCESS
        if ((uintptr_t)p % SIZEOF_VOIDP) {
            /* l = number of bytes up to the next aligned address.  p is
             * advanced first; the skipped bytes are then inspected through
             * negative offsets, falling through from case l down to 1. */
            int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
            p += l;
            switch (l) {
              default: UNREACHABLE;
#if SIZEOF_VOIDP > 4
              case 7: if (p[-7]&0x80) return p-7;
              case 6: if (p[-6]&0x80) return p-6;
              case 5: if (p[-5]&0x80) return p-5;
              case 4: if (p[-4]&0x80) return p-4;
#endif
              case 3: if (p[-3]&0x80) return p-3;
              case 2: if (p[-2]&0x80) return p-2;
              case 1: if (p[-1]&0x80) return p-1;
              case 0: break;
            }
        }
#endif
#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
#define aligned_ptr(value) \
        __builtin_assume_aligned((value), sizeof(uintptr_t))
#else
#define aligned_ptr(value) (uintptr_t *)(value)
#endif
        s = aligned_ptr(p);
        /* t bounds the loop so that a whole word starting at s always ends
         * at or before e. */
        t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
#undef aligned_ptr
        for (;s < t; s++) {
            if (*s & NONASCII_MASK) {
                /* Turn the position of the first set mask bit into a byte
                 * offset within the word (>>3 divides the bit index by 8);
                 * which end counts as "first" depends on byte order. */
#ifdef WORDS_BIGENDIAN
                return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
#else
                return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
#endif
            }
        }
        p = (const char *)s;
    }

    /* Remaining tail of fewer than SIZEOF_VOIDP bytes, checked front to
     * back through fall-through on offsets relative to e. */
    switch (e - p) {
      default: UNREACHABLE;
#if SIZEOF_VOIDP > 4
      case 7: if (e[-7]&0x80) return e-7;
      case 6: if (e[-6]&0x80) return e-6;
      case 5: if (e[-5]&0x80) return e-5;
      case 4: if (e[-4]&0x80) return e-4;
#endif
      case 3: if (e[-3]&0x80) return e-3;
      case 2: if (e[-2]&0x80) return e-2;
      case 1: if (e[-1]&0x80) return e-1;
      case 0: return NULL;
    }
}
684
685static int
686coderange_scan(const char *p, long len, rb_encoding *enc)
687{
688 const char *e = p + len;
689
690 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
691 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
692 p = search_nonascii(p, e);
694 }
695
696 if (rb_enc_asciicompat(enc)) {
697 p = search_nonascii(p, e);
698 if (!p) return ENC_CODERANGE_7BIT;
699 for (;;) {
700 int ret = rb_enc_precise_mbclen(p, e, enc);
702 p += MBCLEN_CHARFOUND_LEN(ret);
703 if (p == e) break;
704 p = search_nonascii(p, e);
705 if (!p) break;
706 }
707 }
708 else {
709 while (p < e) {
710 int ret = rb_enc_precise_mbclen(p, e, enc);
712 p += MBCLEN_CHARFOUND_LEN(ret);
713 }
714 }
715 return ENC_CODERANGE_VALID;
716}
717
718long
719rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
720{
721 const char *p = s;
722
723 if (*cr == ENC_CODERANGE_BROKEN)
724 return e - s;
725
726 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
727 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
728 if (*cr == ENC_CODERANGE_VALID) return e - s;
729 p = search_nonascii(p, e);
731 return e - s;
732 }
733 else if (rb_enc_asciicompat(enc)) {
734 p = search_nonascii(p, e);
735 if (!p) {
736 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
737 return e - s;
738 }
739 for (;;) {
740 int ret = rb_enc_precise_mbclen(p, e, enc);
741 if (!MBCLEN_CHARFOUND_P(ret)) {
743 return p - s;
744 }
745 p += MBCLEN_CHARFOUND_LEN(ret);
746 if (p == e) break;
747 p = search_nonascii(p, e);
748 if (!p) break;
749 }
750 }
751 else {
752 while (p < e) {
753 int ret = rb_enc_precise_mbclen(p, e, enc);
754 if (!MBCLEN_CHARFOUND_P(ret)) {
756 return p - s;
757 }
758 p += MBCLEN_CHARFOUND_LEN(ret);
759 }
760 }
762 return e - s;
763}
764
765static inline void
766str_enc_copy(VALUE str1, VALUE str2)
767{
768 rb_enc_set_index(str1, ENCODING_GET(str2));
769}
770
771static void
772rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
773{
774 /* this function is designed for copying encoding and coderange
775 * from src to new string "dest" which is made from the part of src.
776 */
777 str_enc_copy(dest, src);
778 if (RSTRING_LEN(dest) == 0) {
779 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
781 else
783 return;
784 }
785 switch (ENC_CODERANGE(src)) {
788 break;
790 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
791 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
793 else
795 break;
796 default:
797 break;
798 }
799}
800
801static void
802rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
803{
804 str_enc_copy(dest, src);
806}
807
808static int
809enc_coderange_scan(VALUE str, rb_encoding *enc)
810{
811 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
812}
813
814int
815rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
816{
817 return enc_coderange_scan(str, enc);
818}
819
820int
822{
823 int cr = ENC_CODERANGE(str);
824
825 if (cr == ENC_CODERANGE_UNKNOWN) {
826 cr = enc_coderange_scan(str, get_encoding(str));
827 ENC_CODERANGE_SET(str, cr);
828 }
829 return cr;
830}
831
832int
834{
835 rb_encoding *enc = STR_ENC_GET(str);
836
837 if (!rb_enc_asciicompat(enc))
838 return FALSE;
839 else if (is_ascii_string(str))
840 return TRUE;
841 return FALSE;
842}
843
844static inline void
845str_mod_check(VALUE s, const char *p, long len)
846{
847 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
848 rb_raise(rb_eRuntimeError, "string modified");
849 }
850}
851
852static size_t
853str_capacity(VALUE str, const int termlen)
854{
855 if (STR_EMBED_P(str)) {
856#if USE_RVARGC
857 return str_embed_capa(str) - termlen;
858#else
859 return (RSTRING_EMBED_LEN_MAX + 1 - termlen);
860#endif
861 }
862 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
863 return RSTRING(str)->as.heap.len;
864 }
865 else {
866 return RSTRING(str)->as.heap.aux.capa;
867 }
868}
869
870size_t
872{
873 return str_capacity(str, TERM_LEN(str));
874}
875
876static inline void
877must_not_null(const char *ptr)
878{
879 if (!ptr) {
880 rb_raise(rb_eArgError, "NULL pointer given");
881 }
882}
883
884static inline VALUE
885str_alloc_embed(VALUE klass, size_t capa)
886{
887 size_t size = rb_str_embed_size(capa);
888 assert(size > 0);
889 assert(rb_gc_size_allocatable_p(size));
890#if !USE_RVARGC
891 assert(size <= sizeof(struct RString));
892#endif
893
894 RVARGC_NEWOBJ_OF(str, struct RString, klass,
896
897 return (VALUE)str;
898}
899
/*
 * Allocate a String object of class klass in heap (STR_NOEMBED) form.  The
 * object has the size of struct RString; the buffer fields are not set
 * here.  FL_WB_PROTECTED is added when RGENGC_WB_PROTECTED_STRING is
 * enabled.
 */
static inline VALUE
str_alloc_heap(VALUE klass)
{
    RVARGC_NEWOBJ_OF(str, struct RString, klass,
                     T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString));

    return (VALUE)str;
}
908
909static inline VALUE
910empty_str_alloc(VALUE klass)
911{
912 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
913 VALUE str = str_alloc_embed(klass, 0);
914 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
915 return str;
916}
917
918static VALUE
919str_new0(VALUE klass, const char *ptr, long len, int termlen)
920{
921 VALUE str;
922
923 if (len < 0) {
924 rb_raise(rb_eArgError, "negative string size (or size too big)");
925 }
926
927 RUBY_DTRACE_CREATE_HOOK(STRING, len);
928
929 if (STR_EMBEDDABLE_P(len, termlen)) {
930 str = str_alloc_embed(klass, len + termlen);
931 if (len == 0) {
933 }
934 }
935 else {
936 str = str_alloc_heap(klass);
937 RSTRING(str)->as.heap.aux.capa = len;
938 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
939 * integer overflow. If we can STATIC_ASSERT that, the following
940 * mul_add_mul can be reverted to a simple ALLOC_N. */
941 RSTRING(str)->as.heap.ptr =
942 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
943 }
944 if (ptr) {
945 memcpy(RSTRING_PTR(str), ptr, len);
946 }
947 STR_SET_LEN(str, len);
948 TERM_FILL(RSTRING_PTR(str) + len, termlen);
949 return str;
950}
951
952static VALUE
953str_new(VALUE klass, const char *ptr, long len)
954{
955 return str_new0(klass, ptr, len, 1);
956}
957
958VALUE
959rb_str_new(const char *ptr, long len)
960{
961 return str_new(rb_cString, ptr, len);
962}
963
964VALUE
965rb_usascii_str_new(const char *ptr, long len)
966{
967 VALUE str = rb_str_new(ptr, len);
968 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
969 return str;
970}
971
972VALUE
973rb_utf8_str_new(const char *ptr, long len)
974{
975 VALUE str = str_new(rb_cString, ptr, len);
976 rb_enc_associate_index(str, rb_utf8_encindex());
977 return str;
978}
979
980VALUE
981rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
982{
983 VALUE str;
984
985 if (!enc) return rb_str_new(ptr, len);
986
987 str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
988 rb_enc_associate(str, enc);
989 return str;
990}
991
992VALUE
994{
995 must_not_null(ptr);
996 /* rb_str_new_cstr() can take pointer from non-malloc-generated
997 * memory regions, and that cannot be detected by the MSAN. Just
998 * trust the programmer that the argument passed here is a sane C
999 * string. */
1000 __msan_unpoison_string(ptr);
1001 return rb_str_new(ptr, strlen(ptr));
1002}
1003
1004VALUE
1006{
1007 VALUE str = rb_str_new_cstr(ptr);
1008 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
1009 return str;
1010}
1011
1012VALUE
1014{
1015 VALUE str = rb_str_new_cstr(ptr);
1016 rb_enc_associate_index(str, rb_utf8_encindex());
1017 return str;
1018}
1019
1020VALUE
1022{
1023 must_not_null(ptr);
1024 if (rb_enc_mbminlen(enc) != 1) {
1025 rb_raise(rb_eArgError, "wchar encoding given");
1026 }
1027 return rb_enc_str_new(ptr, strlen(ptr), enc);
1028}
1029
1030static VALUE
1031str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1032{
1033 VALUE str;
1034
1035 if (len < 0) {
1036 rb_raise(rb_eArgError, "negative string size (or size too big)");
1037 }
1038
1039 if (!ptr) {
1040 rb_encoding *enc = rb_enc_get_from_index(encindex);
1041 str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
1042 }
1043 else {
1044 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1045 str = str_alloc_heap(klass);
1046 RSTRING(str)->as.heap.len = len;
1047 RSTRING(str)->as.heap.ptr = (char *)ptr;
1048 RSTRING(str)->as.heap.aux.capa = len;
1049 RBASIC(str)->flags |= STR_NOFREE;
1050 }
1051 rb_enc_associate_index(str, encindex);
1052 return str;
1053}
1054
1055VALUE
1056rb_str_new_static(const char *ptr, long len)
1057{
1058 return str_new_static(rb_cString, ptr, len, 0);
1059}
1060
1061VALUE
1063{
1064 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1065}
1066
1067VALUE
1069{
1070 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1071}
1072
1073VALUE
1075{
1076 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1077}
1078
1079static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1080 rb_encoding *from, rb_encoding *to,
1081 int ecflags, VALUE ecopts);
1082
1083static inline bool
1084is_enc_ascii_string(VALUE str, rb_encoding *enc)
1085{
1086 int encidx = rb_enc_to_index(enc);
1087 if (rb_enc_get_index(str) == encidx)
1088 return is_ascii_string(str);
1089 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1090}
1091
1092VALUE
1093rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1094{
1095 long len;
1096 const char *ptr;
1097 VALUE newstr;
1098
1099 if (!to) return str;
1100 if (!from) from = rb_enc_get(str);
1101 if (from == to) return str;
1102 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1103 rb_is_ascii8bit_enc(to)) {
1104 if (STR_ENC_GET(str) != to) {
1105 str = rb_str_dup(str);
1106 rb_enc_associate(str, to);
1107 }
1108 return str;
1109 }
1110
1111 RSTRING_GETMEM(str, ptr, len);
1112 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1113 from, to, ecflags, ecopts);
1114 if (NIL_P(newstr)) {
1115 /* some error, return original */
1116 return str;
1117 }
1118 return newstr;
1119}
1120
1121VALUE
1122rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1123 rb_encoding *from, int ecflags, VALUE ecopts)
1124{
1125 long olen;
1126
1127 olen = RSTRING_LEN(newstr);
1128 if (ofs < -olen || olen < ofs)
1129 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1130 if (ofs < 0) ofs += olen;
1131 if (!from) {
1132 STR_SET_LEN(newstr, ofs);
1133 return rb_str_cat(newstr, ptr, len);
1134 }
1135
1136 rb_str_modify(newstr);
1137 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1138 rb_enc_get(newstr),
1139 ecflags, ecopts);
1140}
1141
1142VALUE
1143rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1144{
1145 STR_SET_LEN(str, 0);
1146 rb_enc_associate(str, enc);
1147 rb_str_cat(str, ptr, len);
1148 return str;
1149}
1150
1151static VALUE
1152str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1153 rb_encoding *from, rb_encoding *to,
1154 int ecflags, VALUE ecopts)
1155{
1156 rb_econv_t *ec;
1158 long olen;
1159 VALUE econv_wrapper;
1160 const unsigned char *start, *sp;
1161 unsigned char *dest, *dp;
1162 size_t converted_output = (size_t)ofs;
1163
1164 olen = rb_str_capacity(newstr);
1165
1166 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1167 RBASIC_CLEAR_CLASS(econv_wrapper);
1168 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1169 if (!ec) return Qnil;
1170 DATA_PTR(econv_wrapper) = ec;
1171
1172 sp = (unsigned char*)ptr;
1173 start = sp;
1174 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1175 (dp = dest + converted_output),
1176 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1178 /* destination buffer short */
1179 size_t converted_input = sp - start;
1180 size_t rest = len - converted_input;
1181 converted_output = dp - dest;
1182 rb_str_set_len(newstr, converted_output);
1183 if (converted_input && converted_output &&
1184 rest < (LONG_MAX / converted_output)) {
1185 rest = (rest * converted_output) / converted_input;
1186 }
1187 else {
1188 rest = olen;
1189 }
1190 olen += rest < 2 ? 2 : rest;
1191 rb_str_resize(newstr, olen);
1192 }
1193 DATA_PTR(econv_wrapper) = 0;
1194 rb_econv_close(ec);
1195 switch (ret) {
1196 case econv_finished:
1197 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1198 rb_str_set_len(newstr, len);
1199 rb_enc_associate(newstr, to);
1200 return newstr;
1201
1202 default:
1203 return Qnil;
1204 }
1205}
1206
1207VALUE
1209{
1210 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1211}
1212
1213VALUE
1215{
1216 rb_encoding *ienc;
1217 VALUE str;
1218 const int eidx = rb_enc_to_index(eenc);
1219
1220 if (!ptr) {
1221 return rb_enc_str_new(ptr, len, eenc);
1222 }
1223
1224 /* ASCII-8BIT case, no conversion */
1225 if ((eidx == rb_ascii8bit_encindex()) ||
1226 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1227 return rb_str_new(ptr, len);
1228 }
1229 /* no default_internal or same encoding, no conversion */
1230 ienc = rb_default_internal_encoding();
1231 if (!ienc || eenc == ienc) {
1232 return rb_enc_str_new(ptr, len, eenc);
1233 }
1234 /* ASCII compatible, and ASCII only string, no conversion in
1235 * default_internal */
1236 if ((eidx == rb_ascii8bit_encindex()) ||
1237 (eidx == rb_usascii_encindex()) ||
1238 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1239 return rb_enc_str_new(ptr, len, ienc);
1240 }
1241 /* convert from the given encoding to default_internal */
1242 str = rb_enc_str_new(NULL, 0, ienc);
1243 /* when the conversion failed for some reason, just ignore the
1244 * default_internal and result in the given encoding as-is. */
1245 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1246 rb_str_initialize(str, ptr, len, eenc);
1247 }
1248 return str;
1249}
1250
1251VALUE
1252rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1253{
1254 int eidx = rb_enc_to_index(eenc);
1255 if (eidx == rb_usascii_encindex() &&
1256 !is_ascii_string(str)) {
1257 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1258 return str;
1259 }
1260 rb_enc_associate_index(str, eidx);
1261 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1262}
1263
1264VALUE
1265rb_external_str_new(const char *ptr, long len)
1266{
1267 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1268}
1269
1270VALUE
1272{
1273 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1274}
1275
1276VALUE
1277rb_locale_str_new(const char *ptr, long len)
1278{
1279 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1280}
1281
1282VALUE
1284{
1285 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1286}
1287
1288VALUE
1290{
1291 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1292}
1293
1294VALUE
1296{
1297 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1298}
1299
1300VALUE
1302{
1303 return rb_str_export_to_enc(str, rb_default_external_encoding());
1304}
1305
1306VALUE
1308{
1309 return rb_str_export_to_enc(str, rb_locale_encoding());
1310}
1311
1312VALUE
1314{
1315 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1316}
1317
1318static VALUE
1319str_replace_shared_without_enc(VALUE str2, VALUE str)
1320{
1321 const int termlen = TERM_LEN(str);
1322 char *ptr;
1323 long len;
1324
1325 RSTRING_GETMEM(str, ptr, len);
1326 if (str_embed_capa(str2) >= len + termlen) {
1327 char *ptr2 = RSTRING(str2)->as.embed.ary;
1328 STR_SET_EMBED(str2);
1329 memcpy(ptr2, RSTRING_PTR(str), len);
1330 STR_SET_EMBED_LEN(str2, len);
1331 TERM_FILL(ptr2+len, termlen);
1332 }
1333 else {
1334 VALUE root;
1335 if (STR_SHARED_P(str)) {
1336 root = RSTRING(str)->as.heap.aux.shared;
1337 RSTRING_GETMEM(str, ptr, len);
1338 }
1339 else {
1340 root = rb_str_new_frozen(str);
1341 RSTRING_GETMEM(root, ptr, len);
1342 }
1343 assert(OBJ_FROZEN(root));
1344 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1345 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1346 rb_fatal("about to free a possible shared root");
1347 }
1348 char *ptr2 = STR_HEAP_PTR(str2);
1349 if (ptr2 != ptr) {
1350 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1351 }
1352 }
1353 FL_SET(str2, STR_NOEMBED);
1354 RSTRING(str2)->as.heap.len = len;
1355 RSTRING(str2)->as.heap.ptr = ptr;
1356 STR_SET_SHARED(str2, root);
1357 }
1358 return str2;
1359}
1360
1361static VALUE
1362str_replace_shared(VALUE str2, VALUE str)
1363{
1364 str_replace_shared_without_enc(str2, str);
1365 rb_enc_cr_str_exact_copy(str2, str);
1366 return str2;
1367}
1368
1369static VALUE
1370str_new_shared(VALUE klass, VALUE str)
1371{
1372 return str_replace_shared(str_alloc_heap(klass), str);
1373}
1374
1375VALUE
1377{
1378 return str_new_shared(rb_obj_class(str), str);
1379}
1380
1381VALUE
1383{
1384 if (OBJ_FROZEN(orig)) return orig;
1385 return str_new_frozen(rb_obj_class(orig), orig);
1386}
1387
1388static VALUE
1389rb_str_new_frozen_String(VALUE orig)
1390{
1391 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1392 return str_new_frozen(rb_cString, orig);
1393}
1394
1395VALUE
1396rb_str_tmp_frozen_acquire(VALUE orig)
1397{
1398 if (OBJ_FROZEN_RAW(orig)) return orig;
1399 return str_new_frozen_buffer(0, orig, FALSE);
1400}
1401
1402void
1403rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1404{
1405 if (RBASIC_CLASS(tmp) != 0)
1406 return;
1407
1408 if (STR_EMBED_P(tmp)) {
1409 assert(OBJ_FROZEN_RAW(tmp));
1410 }
1411 else if (FL_TEST_RAW(orig, STR_SHARED) &&
1412 !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
1413 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1414
1415 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1416 assert(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1417 assert(RSTRING(orig)->as.heap.len == RSTRING(tmp)->as.heap.len);
1418
1419 /* Unshare orig since the root (tmp) only has this one child. */
1420 FL_UNSET_RAW(orig, STR_SHARED);
1421 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1422 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1423 assert(OBJ_FROZEN_RAW(tmp));
1424
1425 /* Make tmp embedded and empty so it is safe for sweeping. */
1426 STR_SET_EMBED(tmp);
1427 STR_SET_EMBED_LEN(tmp, 0);
1428 }
1429 }
1430}
1431
1432static VALUE
1433str_new_frozen(VALUE klass, VALUE orig)
1434{
1435 return str_new_frozen_buffer(klass, orig, TRUE);
1436}
1437
1438static VALUE
1439heap_str_make_shared(VALUE klass, VALUE orig)
1440{
1441 assert(!STR_EMBED_P(orig));
1442 assert(!STR_SHARED_P(orig));
1443
1444 VALUE str = str_alloc_heap(klass);
1445 RSTRING(str)->as.heap.len = RSTRING_LEN(orig);
1446 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1447 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1448 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1449 RBASIC(orig)->flags &= ~STR_NOFREE;
1450 STR_SET_SHARED(orig, str);
1451 if (klass == 0)
1452 FL_UNSET_RAW(str, STR_BORROWED);
1453 return str;
1454}
1455
1456static VALUE
1457str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1458{
1459 VALUE str;
1460
1461 long len = RSTRING_LEN(orig);
1462 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1463
1464 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1465 str = str_new0(klass, RSTRING_PTR(orig), len, termlen);
1466 assert(STR_EMBED_P(str));
1467 }
1468 else {
1469 if (FL_TEST_RAW(orig, STR_SHARED)) {
1470 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1471 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1472 long rest = RSTRING_LEN(shared) - ofs - RSTRING(orig)->as.heap.len;
1473 assert(ofs >= 0);
1474 assert(rest >= 0);
1475 assert(ofs + rest <= RSTRING_LEN(shared));
1476#if !USE_RVARGC
1477 assert(!STR_EMBED_P(shared));
1478#endif
1479 assert(OBJ_FROZEN(shared));
1480
1481 if ((ofs > 0) || (rest > 0) ||
1482 (klass != RBASIC(shared)->klass) ||
1483 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1484 str = str_new_shared(klass, shared);
1485 assert(!STR_EMBED_P(str));
1486 RSTRING(str)->as.heap.ptr += ofs;
1487 RSTRING(str)->as.heap.len -= ofs + rest;
1488 }
1489 else {
1490 if (RBASIC_CLASS(shared) == 0)
1491 FL_SET_RAW(shared, STR_BORROWED);
1492 return shared;
1493 }
1494 }
1495 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1496 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1497 STR_SET_EMBED(str);
1498 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1499 STR_SET_EMBED_LEN(str, RSTRING_LEN(orig));
1500 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1501 }
1502 else {
1503 str = heap_str_make_shared(klass, orig);
1504 }
1505 }
1506
1507 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1508 OBJ_FREEZE(str);
1509 return str;
1510}
1511
1512VALUE
1513rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1514{
1515 return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
1516}
1517
1518static VALUE
1519str_new_empty_String(VALUE str)
1520{
1521 VALUE v = rb_str_new(0, 0);
1522 rb_enc_copy(v, str);
1523 return v;
1524}
1525
/* Lower bound applied to requested buffer capacities; without USE_RVARGC
 * it is statically checked to exceed RSTRING_EMBED_LEN_MAX. */
1526#define STR_BUF_MIN_SIZE 63
1527#if !USE_RVARGC
1528STATIC_ASSERT(STR_BUF_MIN_SIZE, STR_BUF_MIN_SIZE > RSTRING_EMBED_LEN_MAX);
1529#endif
1530
1531VALUE
1533{
1534 if (STR_EMBEDDABLE_P(capa, 1)) {
1535 return str_alloc_embed(rb_cString, capa + 1);
1536 }
1537
1538 VALUE str = str_alloc_heap(rb_cString);
1539
1540#if !USE_RVARGC
1541 if (capa < STR_BUF_MIN_SIZE) {
1542 capa = STR_BUF_MIN_SIZE;
1543 }
1544#endif
1545 RSTRING(str)->as.heap.aux.capa = capa;
1546 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1547 RSTRING(str)->as.heap.ptr[0] = '\0';
1548
1549 return str;
1550}
1551
1552VALUE
1554{
1555 VALUE str;
1556 long len = strlen(ptr);
1557
1558 str = rb_str_buf_new(len);
1559 rb_str_buf_cat(str, ptr, len);
1560
1561 return str;
1562}
1563
1564VALUE
1566{
1567 return str_new(0, 0, len);
1568}
1569
1570void
1572{
1573 if (FL_TEST(str, RSTRING_FSTR)) {
1574 st_data_t fstr = (st_data_t)str;
1575
1576 RB_VM_LOCK_ENTER();
1577 {
1578 st_delete(rb_vm_fstring_table(), &fstr, NULL);
1579 RB_DEBUG_COUNTER_INC(obj_str_fstr);
1580 }
1581 RB_VM_LOCK_LEAVE();
1582 }
1583
1584 if (STR_EMBED_P(str)) {
1585 RB_DEBUG_COUNTER_INC(obj_str_embed);
1586 }
1587 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1588 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1589 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1590 }
1591 else {
1592 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1593 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1594 }
1595}
1596
1597RUBY_FUNC_EXPORTED size_t
1598rb_str_memsize(VALUE str)
1599{
1600 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1601 return STR_HEAP_SIZE(str);
1602 }
1603 else {
1604 return 0;
1605 }
1606}
1607
1608VALUE
1610{
1611 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1612}
1613
/* Forward declarations: both functions are defined later in this file. */
1614static inline void str_discard(VALUE str);
1615static void str_shared_replace(VALUE str, VALUE str2);
1616
1617void
1619{
1620 if (str != str2) str_shared_replace(str, str2);
1621}
1622
1623static void
1624str_shared_replace(VALUE str, VALUE str2)
1625{
1626 rb_encoding *enc;
1627 int cr;
1628 int termlen;
1629
1630 RUBY_ASSERT(str2 != str);
1631 enc = STR_ENC_GET(str2);
1632 cr = ENC_CODERANGE(str2);
1633 str_discard(str);
1634 termlen = rb_enc_mbminlen(enc);
1635
1636 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1637 STR_SET_EMBED(str);
1638 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1639 STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
1640 rb_enc_associate(str, enc);
1641 ENC_CODERANGE_SET(str, cr);
1642 }
1643 else {
1644#if USE_RVARGC
1645 if (STR_EMBED_P(str2)) {
1646 assert(!FL_TEST(str2, STR_SHARED));
1647 long len = RSTRING(str2)->as.embed.len;
1648 assert(len + termlen <= str_embed_capa(str2));
1649
1650 char *new_ptr = ALLOC_N(char, len + termlen);
1651 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1652 RSTRING(str2)->as.heap.ptr = new_ptr;
1653 RSTRING(str2)->as.heap.len = len;
1654 RSTRING(str2)->as.heap.aux.capa = len;
1655 STR_SET_NOEMBED(str2);
1656 }
1657#endif
1658
1659 STR_SET_NOEMBED(str);
1660 FL_UNSET(str, STR_SHARED);
1661 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1662 RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
1663
1664 if (FL_TEST(str2, STR_SHARED)) {
1665 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1666 STR_SET_SHARED(str, shared);
1667 }
1668 else {
1669 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1670 }
1671
1672 /* abandon str2 */
1673 STR_SET_EMBED(str2);
1674 RSTRING_PTR(str2)[0] = 0;
1675 STR_SET_EMBED_LEN(str2, 0);
1676 rb_enc_associate(str, enc);
1677 ENC_CODERANGE_SET(str, cr);
1678 }
1679}
1680
1681VALUE
1683{
1684 VALUE str;
1685
1686 if (RB_TYPE_P(obj, T_STRING)) {
1687 return obj;
1688 }
1689 str = rb_funcall(obj, idTo_s, 0);
1690 return rb_obj_as_string_result(str, obj);
1691}
1692
1693MJIT_FUNC_EXPORTED VALUE
1694rb_obj_as_string_result(VALUE str, VALUE obj)
1695{
1696 if (!RB_TYPE_P(str, T_STRING))
1697 return rb_any_to_s(obj);
1698 return str;
1699}
1700
1701static VALUE
1702str_replace(VALUE str, VALUE str2)
1703{
1704 long len;
1705
1706 len = RSTRING_LEN(str2);
1707 if (STR_SHARED_P(str2)) {
1708 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1709 assert(OBJ_FROZEN(shared));
1710 STR_SET_NOEMBED(str);
1711 RSTRING(str)->as.heap.len = len;
1712 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1713 STR_SET_SHARED(str, shared);
1714 rb_enc_cr_str_exact_copy(str, str2);
1715 }
1716 else {
1717 str_replace_shared(str, str2);
1718 }
1719
1720 return str;
1721}
1722
1723static inline VALUE
1724ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1725{
1726 size_t size = rb_str_embed_size(capa);
1727 assert(size > 0);
1728 assert(rb_gc_size_allocatable_p(size));
1729#if !USE_RVARGC
1730 assert(size <= sizeof(struct RString));
1731#endif
1732
1733 RB_RVARGC_EC_NEWOBJ_OF(ec, str, struct RString, klass,
1735
1736 return (VALUE)str;
1737}
1738
1739static inline VALUE
1740ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1741{
1742 RB_RVARGC_EC_NEWOBJ_OF(ec, str, struct RString, klass,
1743 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString));
1744
1745 return (VALUE)str;
1746}
1747
1748static inline VALUE
1749str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1750{
1751 const VALUE flag_mask =
1752#if !USE_RVARGC
1753 RSTRING_NOEMBED | RSTRING_EMBED_LEN_MASK |
1754#endif
1756 FL_FREEZE
1757 ;
1758 VALUE flags = FL_TEST_RAW(str, flag_mask);
1759 int encidx = 0;
1760 if (STR_EMBED_P(str)) {
1761 long len = RSTRING_EMBED_LEN(str);
1762
1763 assert(STR_EMBED_P(dup));
1764 assert(str_embed_capa(dup) >= len + 1);
1765 STR_SET_EMBED_LEN(dup, len);
1766 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1767 }
1768 else {
1769 VALUE root = str;
1770 if (FL_TEST_RAW(str, STR_SHARED)) {
1771 root = RSTRING(str)->as.heap.aux.shared;
1772 }
1773 else if (UNLIKELY(!(flags & FL_FREEZE))) {
1774 root = str = str_new_frozen(klass, str);
1775 flags = FL_TEST_RAW(str, flag_mask);
1776 }
1777 assert(!STR_SHARED_P(root));
1778 assert(RB_OBJ_FROZEN_RAW(root));
1779 if (0) {}
1780#if !USE_RVARGC
1781 else if (STR_EMBED_P(root)) {
1782 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(root)->as.embed.ary,
1783 char, RSTRING_EMBED_LEN_MAX + 1);
1784 FL_UNSET(dup, STR_NOEMBED);
1785 }
1786#endif
1787 else {
1788 RSTRING(dup)->as.heap.len = RSTRING_LEN(str);
1789 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1790 FL_SET(root, STR_SHARED_ROOT);
1791 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1792 flags |= RSTRING_NOEMBED | STR_SHARED;
1793 }
1794 }
1795
1796 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1797 encidx = rb_enc_get_index(str);
1798 flags &= ~ENCODING_MASK;
1799 }
1800 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1801 if (encidx) rb_enc_associate_index(dup, encidx);
1802 return dup;
1803}
1804
1805static inline VALUE
1806ec_str_duplicate(struct rb_execution_context_struct *ec, VALUE klass, VALUE str)
1807{
1808 VALUE dup;
1809 if (FL_TEST(str, STR_NOEMBED)) {
1810 dup = ec_str_alloc_heap(ec, klass);
1811 }
1812 else {
1813 dup = ec_str_alloc_embed(ec, klass, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
1814 }
1815
1816 return str_duplicate_setup(klass, str, dup);
1817}
1818
1819static inline VALUE
1820str_duplicate(VALUE klass, VALUE str)
1821{
1822 VALUE dup;
1823 if (FL_TEST(str, STR_NOEMBED)) {
1824 dup = str_alloc_heap(klass);
1825 }
1826 else {
1827 dup = str_alloc_embed(klass, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
1828 }
1829
1830 return str_duplicate_setup(klass, str, dup);
1831}
1832
1833VALUE
1835{
1836 return str_duplicate(rb_obj_class(str), str);
1837}
1838
1839VALUE
1841{
1842 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1843 return str_duplicate(rb_cString, str);
1844}
1845
1846VALUE
1847rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str)
1848{
1849 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1850 return ec_str_duplicate(ec, rb_cString, str);
1851}
1852
1853/*
1854 *
1855 * call-seq:
1856 * String.new(string = '', **opts) -> new_string
1857 *
1858 * :include: doc/string/new.rdoc
1859 *
1860 */
1861
1862static VALUE
1863rb_str_init(int argc, VALUE *argv, VALUE str)
1864{
1865 static ID keyword_ids[2];
1866 VALUE orig, opt, venc, vcapa;
1867 VALUE kwargs[2];
1868 rb_encoding *enc = 0;
1869 int n;
1870
1871 if (!keyword_ids[0]) {
1872 keyword_ids[0] = rb_id_encoding();
1873 CONST_ID(keyword_ids[1], "capacity");
1874 }
1875
1876 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1877 if (!NIL_P(opt)) {
1878 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1879 venc = kwargs[0];
1880 vcapa = kwargs[1];
1881 if (!UNDEF_P(venc) && !NIL_P(venc)) {
1882 enc = rb_to_encoding(venc);
1883 }
1884 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
1885 long capa = NUM2LONG(vcapa);
1886 long len = 0;
1887 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
1888
1889 if (capa < STR_BUF_MIN_SIZE) {
1890 capa = STR_BUF_MIN_SIZE;
1891 }
1892 if (n == 1) {
1893 StringValue(orig);
1894 len = RSTRING_LEN(orig);
1895 if (capa < len) {
1896 capa = len;
1897 }
1898 if (orig == str) n = 0;
1899 }
1900 str_modifiable(str);
1901 if (STR_EMBED_P(str)) { /* make noembed always */
1902 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1903#if USE_RVARGC
1904 assert(RSTRING(str)->as.embed.len + 1 <= str_embed_capa(str));
1905 memcpy(new_ptr, RSTRING(str)->as.embed.ary, RSTRING(str)->as.embed.len + 1);
1906#else
1907 memcpy(new_ptr, RSTRING(str)->as.embed.ary, RSTRING_EMBED_LEN_MAX + 1);
1908#endif
1909 RSTRING(str)->as.heap.ptr = new_ptr;
1910 }
1911 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
1912 const size_t size = (size_t)capa + termlen;
1913 const char *const old_ptr = RSTRING_PTR(str);
1914 const size_t osize = RSTRING(str)->as.heap.len + TERM_LEN(str);
1915 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1916 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
1917 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
1918 RSTRING(str)->as.heap.ptr = new_ptr;
1919 }
1920 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
1921 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
1922 (size_t)capa + termlen, STR_HEAP_SIZE(str));
1923 }
1924 RSTRING(str)->as.heap.len = len;
1925 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
1926 if (n == 1) {
1927 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
1928 rb_enc_cr_str_exact_copy(str, orig);
1929 }
1930 FL_SET(str, STR_NOEMBED);
1931 RSTRING(str)->as.heap.aux.capa = capa;
1932 }
1933 else if (n == 1) {
1934 rb_str_replace(str, orig);
1935 }
1936 if (enc) {
1937 rb_enc_associate(str, enc);
1939 }
1940 }
1941 else if (n == 1) {
1942 rb_str_replace(str, orig);
1943 }
1944 return str;
1945}
1946
#ifdef NONASCII_MASK
/* True unless the byte has the 10xxxxxx form. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * Counts the UTF-8 leading bytes contained in the machine word at +s+.
 *
 * A leading byte looks like 0xxxxxxx or 11xxxxxx
 * (see https://en.wikipedia.org/wiki/UTF-8), so per byte:
 *
 *   if (!(byte & 0x80))
 *       byte |= 0x40;          // turn on bit6
 *   return ((byte>>6) & 1);    // bit6 tells whether the byte is leading
 *
 * The code applies this to every byte of the word at once and then sums
 * the per-byte results.
 */
static inline uintptr_t
count_utf8_lead_bytes_with_word(const uintptr_t *s)
{
    uintptr_t bits = *s;

    /* After this, bit0 of each byte says whether that byte is leading. */
    bits = (bits >> 6) | (~bits >> 7);
    bits &= NONASCII_MASK >> 7;

    /* Sum the per-byte flags. */
#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
    /* use only if it can use POPCNT */
    return rb_popcount_intptr(bits);
#else
    bits += (bits >> 8);
    bits += (bits >> 16);
# if SIZEOF_VOIDP == 8
    bits += (bits >> 32);
# endif
    return (bits & 0xF);
#endif
}
#endif
1986
1987static inline long
1988enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
1989{
1990 long c;
1991 const char *q;
1992
1993 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1994 long diff = (long)(e - p);
1995 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
1996 }
1997#ifdef NONASCII_MASK
1998 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
1999 uintptr_t len = 0;
2000 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2001 const uintptr_t *s, *t;
2002 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2003 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2004 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2005 while (p < (const char *)s) {
2006 if (is_utf8_lead_byte(*p)) len++;
2007 p++;
2008 }
2009 while (s < t) {
2010 len += count_utf8_lead_bytes_with_word(s);
2011 s++;
2012 }
2013 p = (const char *)s;
2014 }
2015 while (p < e) {
2016 if (is_utf8_lead_byte(*p)) len++;
2017 p++;
2018 }
2019 return (long)len;
2020 }
2021#endif
2022 else if (rb_enc_asciicompat(enc)) {
2023 c = 0;
2024 if (ENC_CODERANGE_CLEAN_P(cr)) {
2025 while (p < e) {
2026 if (ISASCII(*p)) {
2027 q = search_nonascii(p, e);
2028 if (!q)
2029 return c + (e - p);
2030 c += q - p;
2031 p = q;
2032 }
2033 p += rb_enc_fast_mbclen(p, e, enc);
2034 c++;
2035 }
2036 }
2037 else {
2038 while (p < e) {
2039 if (ISASCII(*p)) {
2040 q = search_nonascii(p, e);
2041 if (!q)
2042 return c + (e - p);
2043 c += q - p;
2044 p = q;
2045 }
2046 p += rb_enc_mbclen(p, e, enc);
2047 c++;
2048 }
2049 }
2050 return c;
2051 }
2052
2053 for (c=0; p<e; c++) {
2054 p += rb_enc_mbclen(p, e, enc);
2055 }
2056 return c;
2057}
2058
2059long
2060rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2061{
2062 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2063}
2064
2065/* To get strlen with cr
2066 * Note that given cr is not used.
2067 */
2068long
2069rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2070{
2071 long c;
2072 const char *q;
2073 int ret;
2074
2075 *cr = 0;
2076 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2077 long diff = (long)(e - p);
2078 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2079 }
2080 else if (rb_enc_asciicompat(enc)) {
2081 c = 0;
2082 while (p < e) {
2083 if (ISASCII(*p)) {
2084 q = search_nonascii(p, e);
2085 if (!q) {
2086 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2087 return c + (e - p);
2088 }
2089 c += q - p;
2090 p = q;
2091 }
2092 ret = rb_enc_precise_mbclen(p, e, enc);
2093 if (MBCLEN_CHARFOUND_P(ret)) {
2094 *cr |= ENC_CODERANGE_VALID;
2095 p += MBCLEN_CHARFOUND_LEN(ret);
2096 }
2097 else {
2099 p++;
2100 }
2101 c++;
2102 }
2103 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2104 return c;
2105 }
2106
2107 for (c=0; p<e; c++) {
2108 ret = rb_enc_precise_mbclen(p, e, enc);
2109 if (MBCLEN_CHARFOUND_P(ret)) {
2110 *cr |= ENC_CODERANGE_VALID;
2111 p += MBCLEN_CHARFOUND_LEN(ret);
2112 }
2113 else {
2115 if (p + rb_enc_mbminlen(enc) <= e)
2116 p += rb_enc_mbminlen(enc);
2117 else
2118 p = e;
2119 }
2120 }
2121 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2122 return c;
2123}
2124
2125/* enc must be str's enc or rb_enc_check(str, str2) */
2126static long
2127str_strlen(VALUE str, rb_encoding *enc)
2128{
2129 const char *p, *e;
2130 int cr;
2131
2132 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2133 if (!enc) enc = STR_ENC_GET(str);
2134 p = RSTRING_PTR(str);
2135 e = RSTRING_END(str);
2136 cr = ENC_CODERANGE(str);
2137
2138 if (cr == ENC_CODERANGE_UNKNOWN) {
2139 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2140 if (cr) ENC_CODERANGE_SET(str, cr);
2141 return n;
2142 }
2143 else {
2144 return enc_strlen(p, e, enc, cr);
2145 }
2146}
2147
2148long
2150{
2151 return str_strlen(str, NULL);
2152}
2153
2154/*
2155 * call-seq:
2156 * length -> integer
2157 *
2158 * :include: doc/string/length.rdoc
2159 *
2160 */
2161
2162VALUE
2164{
2165 return LONG2NUM(str_strlen(str, NULL));
2166}
2167
2168/*
2169 * call-seq:
2170 * bytesize -> integer
2171 *
2172 * :include: doc/string/bytesize.rdoc
2173 *
2174 */
2175
2176static VALUE
2177rb_str_bytesize(VALUE str)
2178{
2179 return LONG2NUM(RSTRING_LEN(str));
2180}
2181
2182/*
2183 * call-seq:
2184 * empty? -> true or false
2185 *
2186 * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2187 *
2188 * "hello".empty? # => false
2189 * " ".empty? # => false
2190 * "".empty? # => true
2191 *
2192 */
2193
2194static VALUE
2195rb_str_empty(VALUE str)
2196{
2197 return RBOOL(RSTRING_LEN(str) == 0);
2198}
2199
2200/*
2201 * call-seq:
2202 * string + other_string -> new_string
2203 *
2204 * Returns a new \String containing +other_string+ concatenated to +self+:
2205 *
2206 * "Hello from " + self.to_s # => "Hello from main"
2207 *
2208 */
2209
2210VALUE
2212{
2213 VALUE str3;
2214 rb_encoding *enc;
2215 char *ptr1, *ptr2, *ptr3;
2216 long len1, len2;
2217 int termlen;
2218
2219 StringValue(str2);
2220 enc = rb_enc_check_str(str1, str2);
2221 RSTRING_GETMEM(str1, ptr1, len1);
2222 RSTRING_GETMEM(str2, ptr2, len2);
2223 termlen = rb_enc_mbminlen(enc);
2224 if (len1 > LONG_MAX - len2) {
2225 rb_raise(rb_eArgError, "string size too big");
2226 }
2227 str3 = str_new0(rb_cString, 0, len1+len2, termlen);
2228 ptr3 = RSTRING_PTR(str3);
2229 memcpy(ptr3, ptr1, len1);
2230 memcpy(ptr3+len1, ptr2, len2);
2231 TERM_FILL(&ptr3[len1+len2], termlen);
2232
2233 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2235 RB_GC_GUARD(str1);
2236 RB_GC_GUARD(str2);
2237 return str3;
2238}
2239
2240/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2241MJIT_FUNC_EXPORTED VALUE
2242rb_str_opt_plus(VALUE str1, VALUE str2)
2243{
2244 assert(RBASIC_CLASS(str1) == rb_cString);
2245 assert(RBASIC_CLASS(str2) == rb_cString);
2246 long len1, len2;
2247 MAYBE_UNUSED(char) *ptr1, *ptr2;
2248 RSTRING_GETMEM(str1, ptr1, len1);
2249 RSTRING_GETMEM(str2, ptr2, len2);
2250 int enc1 = rb_enc_get_index(str1);
2251 int enc2 = rb_enc_get_index(str2);
2252
2253 if (enc1 < 0) {
2254 return Qundef;
2255 }
2256 else if (enc2 < 0) {
2257 return Qundef;
2258 }
2259 else if (enc1 != enc2) {
2260 return Qundef;
2261 }
2262 else if (len1 > LONG_MAX - len2) {
2263 return Qundef;
2264 }
2265 else {
2266 return rb_str_plus(str1, str2);
2267 }
2268
2269}
2270
2271/*
2272 * call-seq:
2273 * string * integer -> new_string
2274 *
2275 * Returns a new \String containing +integer+ copies of +self+:
2276 *
2277 * "Ho! " * 3 # => "Ho! Ho! Ho! "
2278 * "Ho! " * 0 # => ""
2279 *
2280 */
2281
2282VALUE
2284{
2285 VALUE str2;
2286 long n, len;
2287 char *ptr2;
2288 int termlen;
2289
2290 if (times == INT2FIX(1)) {
2291 return str_duplicate(rb_cString, str);
2292 }
2293 if (times == INT2FIX(0)) {
2294 str2 = str_alloc_embed(rb_cString, 0);
2295 rb_enc_copy(str2, str);
2296 return str2;
2297 }
2298 len = NUM2LONG(times);
2299 if (len < 0) {
2300 rb_raise(rb_eArgError, "negative argument");
2301 }
2302 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2303 if (STR_EMBEDDABLE_P(len, 1)) {
2304 str2 = str_alloc_embed(rb_cString, len + 1);
2305 memset(RSTRING_PTR(str2), 0, len + 1);
2306 }
2307 else {
2308 str2 = str_alloc_heap(rb_cString);
2309 RSTRING(str2)->as.heap.aux.capa = len;
2310 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2311 }
2312 STR_SET_LEN(str2, len);
2313 rb_enc_copy(str2, str);
2314 return str2;
2315 }
2316 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2317 rb_raise(rb_eArgError, "argument too big");
2318 }
2319
2320 len *= RSTRING_LEN(str);
2321 termlen = TERM_LEN(str);
2322 str2 = str_new0(rb_cString, 0, len, termlen);
2323 ptr2 = RSTRING_PTR(str2);
2324 if (len) {
2325 n = RSTRING_LEN(str);
2326 memcpy(ptr2, RSTRING_PTR(str), n);
2327 while (n <= len/2) {
2328 memcpy(ptr2 + n, ptr2, n);
2329 n *= 2;
2330 }
2331 memcpy(ptr2 + n, ptr2, len-n);
2332 }
2333 STR_SET_LEN(str2, len);
2334 TERM_FILL(&ptr2[len], termlen);
2335 rb_enc_cr_str_copy_for_substr(str2, str);
2336
2337 return str2;
2338}
2339
2340/*
2341 * call-seq:
2342 * string % object -> new_string
2343 *
2344 * Returns the result of formatting +object+ into the format specification +self+
2345 * (see Kernel#sprintf for formatting details):
2346 *
2347 * "%05d" % 123 # => "00123"
2348 *
2349 * If +self+ contains multiple substitutions, +object+ must be
2350 * an \Array or \Hash containing the values to be substituted:
2351 *
2352 * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168"
2353 * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2354 * "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2355 *
2356 */
2357
2358static VALUE
2359rb_str_format_m(VALUE str, VALUE arg)
2360{
2361 VALUE tmp = rb_check_array_type(arg);
2362
2363 if (!NIL_P(tmp)) {
2364 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2365 }
2366 return rb_str_format(1, &arg, str);
2367}
2368
2369static inline void
2370rb_check_lockedtmp(VALUE str)
2371{
2372 if (FL_TEST(str, STR_TMPLOCK)) {
2373 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2374 }
2375}
2376
2377static inline void
2378str_modifiable(VALUE str)
2379{
2380 rb_check_lockedtmp(str);
2381 rb_check_frozen(str);
2382}
2383
2384static inline int
2385str_dependent_p(VALUE str)
2386{
2387 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2388 return 0;
2389 }
2390 else {
2391 return 1;
2392 }
2393}
2394
2395static inline int
2396str_independent(VALUE str)
2397{
2398 str_modifiable(str);
2399 return !str_dependent_p(str);
2400}
2401
2402static void
2403str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2404{
2405 char *ptr;
2406 char *oldptr;
2407 long capa = len + expand;
2408
2409 if (len > capa) len = capa;
2410
2411 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2412 ptr = RSTRING(str)->as.heap.ptr;
2413 STR_SET_EMBED(str);
2414 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2415 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2416 STR_SET_EMBED_LEN(str, len);
2417 return;
2418 }
2419
2420 ptr = ALLOC_N(char, (size_t)capa + termlen);
2421 oldptr = RSTRING_PTR(str);
2422 if (oldptr) {
2423 memcpy(ptr, oldptr, len);
2424 }
2425 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2426 xfree(oldptr);
2427 }
2428 STR_SET_NOEMBED(str);
2429 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2430 TERM_FILL(ptr + len, termlen);
2431 RSTRING(str)->as.heap.ptr = ptr;
2432 RSTRING(str)->as.heap.len = len;
2433 RSTRING(str)->as.heap.aux.capa = capa;
2434}
2435
2436void
2438{
2439 if (!str_independent(str))
2440 str_make_independent(str);
2442}
2443
2444void
2446{
2447 int termlen = TERM_LEN(str);
2448 long len = RSTRING_LEN(str);
2449
2450 if (expand < 0) {
2451 rb_raise(rb_eArgError, "negative expanding string size");
2452 }
2453 if (expand >= LONG_MAX - len) {
2454 rb_raise(rb_eArgError, "string size too big");
2455 }
2456
2457 if (!str_independent(str)) {
2458 str_make_independent_expand(str, len, expand, termlen);
2459 }
2460 else if (expand > 0) {
2461 RESIZE_CAPA_TERM(str, len + expand, termlen);
2462 }
2464}
2465
2466/* As rb_str_modify(), but don't clear coderange */
2467static void
2468str_modify_keep_cr(VALUE str)
2469{
2470 if (!str_independent(str))
2471 str_make_independent(str);
2473 /* Force re-scan later */
2475}
2476
2477static inline void
2478str_discard(VALUE str)
2479{
2480 str_modifiable(str);
2481 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2482 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2483 RSTRING(str)->as.heap.ptr = 0;
2484 RSTRING(str)->as.heap.len = 0;
2485 }
2486}
2487
2488void
2490{
2491 rb_encoding *enc = rb_enc_get(str);
2492 if (!enc) {
2493 rb_raise(rb_eTypeError, "not encoding capable object");
2494 }
2495 if (!rb_enc_asciicompat(enc)) {
2496 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2497 }
2498}
2499
2500VALUE
2502{
2503 VALUE s = *ptr;
2504 if (!RB_TYPE_P(s, T_STRING)) {
2505 s = rb_str_to_str(s);
2506 *ptr = s;
2507 }
2508 return s;
2509}
2510
2511char *
2513{
2514 VALUE str = rb_string_value(ptr);
2515 return RSTRING_PTR(str);
2516}
2517
/*
 * Returns 1 when the first +n+ bytes at +s+ are all zero (also when
 * n <= 0, in which case nothing is read), and 0 otherwise.
 */
static int
zero_filled(const char *s, int n)
{
    while (n > 0) {
        if (*s != 0) {
            return 0;
        }
        s++;
        n--;
    }
    return 1;
}
2526
2527static const char *
2528str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2529{
2530 const char *e = s + len;
2531
2532 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2533 if (zero_filled(s, minlen)) return s;
2534 }
2535 return 0;
2536}
2537
2538static char *
2539str_fill_term(VALUE str, char *s, long len, int termlen)
2540{
2541 /* This function assumes that (capa + termlen) bytes of memory
2542 * is allocated, like many other functions in this file.
2543 */
2544 if (str_dependent_p(str)) {
2545 if (!zero_filled(s + len, termlen))
2546 str_make_independent_expand(str, len, 0L, termlen);
2547 }
2548 else {
2549 TERM_FILL(s + len, termlen);
2550 return s;
2551 }
2552 return RSTRING_PTR(str);
2553}
2554
2555void
2556rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2557{
2558 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2559 long len = RSTRING_LEN(str);
2560
2561 assert(capa >= len);
2562 if (capa - len < termlen) {
2563 rb_check_lockedtmp(str);
2564 str_make_independent_expand(str, len, 0L, termlen);
2565 }
2566 else if (str_dependent_p(str)) {
2567 if (termlen > oldtermlen)
2568 str_make_independent_expand(str, len, 0L, termlen);
2569 }
2570 else {
2571 if (!STR_EMBED_P(str)) {
2572 /* modify capa instead of realloc */
2573 assert(!FL_TEST((str), STR_SHARED));
2574 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2575 }
2576 if (termlen > oldtermlen) {
2577 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2578 }
2579 }
2580
2581 return;
2582}
2583
2584static char *
2585str_null_check(VALUE str, int *w)
2586{
2587 char *s = RSTRING_PTR(str);
2588 long len = RSTRING_LEN(str);
2589 rb_encoding *enc = rb_enc_get(str);
2590 const int minlen = rb_enc_mbminlen(enc);
2591
2592 if (minlen > 1) {
2593 *w = 1;
2594 if (str_null_char(s, len, minlen, enc)) {
2595 return NULL;
2596 }
2597 return str_fill_term(str, s, len, minlen);
2598 }
2599 *w = 0;
2600 if (!s || memchr(s, 0, len)) {
2601 return NULL;
2602 }
2603 if (s[len]) {
2604 s = str_fill_term(str, s, len, minlen);
2605 }
2606 return s;
2607}
2608
2609char *
2610rb_str_to_cstr(VALUE str)
2611{
2612 int w;
2613 return str_null_check(str, &w);
2614}
2615
2616char *
2618{
2619 VALUE str = rb_string_value(ptr);
2620 int w;
2621 char *s = str_null_check(str, &w);
2622 if (!s) {
2623 if (w) {
2624 rb_raise(rb_eArgError, "string contains null char");
2625 }
2626 rb_raise(rb_eArgError, "string contains null byte");
2627 }
2628 return s;
2629}
2630
2631char *
2632rb_str_fill_terminator(VALUE str, const int newminlen)
2633{
2634 char *s = RSTRING_PTR(str);
2635 long len = RSTRING_LEN(str);
2636 return str_fill_term(str, s, len, newminlen);
2637}
2638
2639VALUE
2641{
2642 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2643 return str;
2644}
2645
2646/*
2647 * call-seq:
2648 * String.try_convert(object) -> object, new_string, or nil
2649 *
2650 * If +object+ is a \String object, returns +object+.
2651 *
2652 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2653 * calls <tt>object.to_str</tt> and returns the result.
2654 *
2655 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2656 *
2657 * Raises an exception unless <tt>object.to_str</tt> returns a \String object.
2658 */
2659static VALUE
2660rb_str_s_try_convert(VALUE dummy, VALUE str)
2661{
2662 return rb_check_string_type(str);
2663}
2664
/*
 * Advances from p by *nthp characters without going past e.
 *
 * Returns the resulting position (never beyond e) and stores in *nthp the
 * count left over as tracked by the branch taken:
 *   - in the two fixed-width branches the count is stored back unchanged;
 *   - in the ASCII-compatible branch it is the number of characters that
 *     could not be skipped (0 on success);
 *   - in the generic branch the loop condition post-decrements, so the
 *     stored value is -1 when the requested count was fully consumed.
 */
2665 static char*
2666 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2667 {
2668 long nth = *nthp;
/* One byte per character: plain pointer arithmetic. */
2669 if (rb_enc_mbmaxlen(enc) == 1) {
2670 p += nth;
2671 }
/* Fixed-width multibyte encoding: scale by the character width. */
2672 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2673 p += nth * rb_enc_mbmaxlen(enc);
2674 }
2675 else if (rb_enc_asciicompat(enc)) {
2676 const char *p2, *e2;
2677 int n;
2678
2679 while (p < e && 0 < nth) {
/* Each character takes at least one byte, so fewer than nth remaining bytes means the target is out of range. */
2680 e2 = p + nth;
2681 if (e < e2) {
2682 *nthp = nth;
2683 return (char *)e;
2684 }
2685 if (ISASCII(*p)) {
/* Skip the ASCII run up to e2 in one step; a null result means the whole window was consumed by the run. */
2686 p2 = search_nonascii(p, e2);
2687 if (!p2) {
2688 nth -= e2 - p;
2689 *nthp = nth;
2690 return (char *)e2;
2691 }
2692 nth -= p2 - p;
2693 p = p2;
2694 }
/* Step over one character using the encoding's length function. */
2695 n = rb_enc_mbclen(p, e, enc);
2696 p += n;
2697 nth--;
2698 }
2699 *nthp = nth;
2700 if (nth != 0) {
2701 return (char *)e;
2702 }
2703 return (char *)p;
2704 }
2705 else {
/* Generic variable-width encoding: step one character at a time. */
2706 while (p < e && nth--) {
2707 p += rb_enc_mbclen(p, e, enc);
2708 }
2709 }
/* Clamp: the arithmetic branches above may have run past the end. */
2710 if (p > e) p = e;
2711 *nthp = nth;
2712 return (char*)p;
2713 }
2714
2715char*
2716rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2717{
2718 return str_nth_len(p, e, &nth, enc);
2719}
2720
2721static char*
2722str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2723{
2724 if (singlebyte)
2725 p += nth;
2726 else {
2727 p = str_nth_len(p, e, &nth, enc);
2728 }
2729 if (!p) return 0;
2730 if (p > e) p = e;
2731 return (char *)p;
2732}
2733
2734/* char offset to byte offset */
2735static long
2736str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2737{
2738 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2739 if (!pp) return e - p;
2740 return pp - p;
2741}
2742
2743long
2744rb_str_offset(VALUE str, long pos)
2745{
2746 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2747 STR_ENC_GET(str), single_byte_optimizable(str));
2748}
2749
2750#ifdef NONASCII_MASK
/*
 * UTF-8 fast path for locating the *nthp-th character in [p, e).
 *
 * Characters are counted by their lead bytes.  Returns the position of the
 * requested character's lead byte, or e when the range ends first, and
 * stores the number of characters still to be skipped in *nthp (0 when the
 * character was found).
 */
2751 static char *
2752 str_utf8_nth(const char *p, const char *e, long *nthp)
2753 {
2754 long nth = *nthp;
/* The word-at-a-time path is only taken when both the range and the count exceed two machine words. */
2755 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
2756 const uintptr_t *s, *t;
2757 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
/* s: p rounded up to a word boundary; t: e rounded down to a word boundary. */
2758 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2759 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
/* Count the unaligned head byte by byte. */
2760 while (p < (const char *)s) {
2761 if (is_utf8_lead_byte(*p)) nth--;
2762 p++;
2763 }
/* Consume whole words while at least SIZEOF_VOIDP characters remain to skip (presumably so that one word cannot overshoot the target -- helper defined elsewhere). */
2764 do {
2765 nth -= count_utf8_lead_bytes_with_word(s);
2766 s++;
2767 } while (s < t && (int)SIZEOF_VOIDP <= nth);
2768 p = (char *)s;
2769 }
/* Finish byte by byte, stopping on the lead byte of the requested character. */
2770 while (p < e) {
2771 if (is_utf8_lead_byte(*p)) {
2772 if (nth == 0) break;
2773 nth--;
2774 }
2775 p++;
2776 }
2777 *nthp = nth;
2778 return (char *)p;
2779 }
2780
/*
 * Byte length of the first +nth+ characters of the UTF-8 range [p, e),
 * as located by str_utf8_nth().
 */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *const target = str_utf8_nth(p, e, &remaining);

    return target - p;
}
2787#endif
2788
2789/* byte offset to char offset */
2790long
2791rb_str_sublen(VALUE str, long pos)
2792{
2793 if (single_byte_optimizable(str) || pos < 0)
2794 return pos;
2795 else {
2796 char *p = RSTRING_PTR(str);
2797 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
2798 }
2799}
2800
2801static VALUE
2802str_subseq(VALUE str, long beg, long len)
2803{
2804 VALUE str2;
2805
2806 const long rstring_embed_capa_max = ((sizeof(struct RString) - offsetof(struct RString, as.embed.ary)) / sizeof(char)) - 1;
2807
2808 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str)) ||
2809 len <= rstring_embed_capa_max) {
2810 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
2811 RB_GC_GUARD(str);
2812 }
2813 else {
2814 str2 = str_new_shared(rb_cString, str);
2815 ENC_CODERANGE_CLEAR(str2);
2816 RSTRING(str2)->as.heap.ptr += beg;
2817 if (RSTRING(str2)->as.heap.len > len) {
2818 RSTRING(str2)->as.heap.len = len;
2819 }
2820 }
2821
2822 return str2;
2823}
2824
2825VALUE
2826rb_str_subseq(VALUE str, long beg, long len)
2827{
2828 VALUE str2 = str_subseq(str, beg, len);
2829 rb_enc_cr_str_copy_for_substr(str2, str);
2830 return str2;
2831}
2832
/*
 * Resolves a character-indexed range into a byte range.
 *
 * beg  - starting character index; negative values count from the end.
 * lenp - in: requested length in characters; out: resulting length in bytes.
 *
 * Returns a pointer to the first byte of the range inside str, or 0 when
 * the length is negative or the start lies outside the string.
 */
2833 char *
2834 rb_str_subpos(VALUE str, long beg, long *lenp)
2835 {
2836 long len = *lenp;
2837 long slen = -1L;
2838 long blen = RSTRING_LEN(str);
2839 rb_encoding *enc = STR_ENC_GET(str);
2840 char *p, *s = RSTRING_PTR(str), *e = s + blen;
2841
2842 if (len < 0) return 0;
/* An empty string can only yield an empty range. */
2843 if (!blen) {
2844 len = 0;
2845 }
/* Single-byte case: character indexes are byte indexes. */
2846 if (single_byte_optimizable(str)) {
2847 if (beg > blen) return 0;
2848 if (beg < 0) {
2849 beg += blen;
2850 if (beg < 0) return 0;
2851 }
2852 if (len > blen - beg)
2853 len = blen - beg;
2854 if (len < 0) return 0;
2855 p = s + beg;
2856 goto end;
2857 }
2858 if (beg < 0) {
/* A range starting -beg characters from the end cannot be longer than -beg. */
2859 if (len > -beg) len = -beg;
/* Heuristic: when the start is close to the end, walk backwards from e instead of counting the whole string. */
2860 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
2861 beg = -beg;
/* First move e back to the end of the range, then move p back len more characters to its start. */
2862 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
2863 p = e;
2864 if (!p) return 0;
2865 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
2866 if (!p) return 0;
2867 len = e - p;
2868 goto end;
2869 }
2870 else {
/* Otherwise turn the negative index into a non-negative character index. */
2871 slen = str_strlen(str, enc);
2872 beg += slen;
2873 if (beg < 0) return 0;
2874 p = s + beg;
2875 if (len == 0) goto end;
2876 }
2877 }
/* A character index beyond the byte length is certainly out of range. */
2878 else if (beg > 0 && beg > RSTRING_LEN(str)) {
2879 return 0;
2880 }
2881 if (len == 0) {
2882 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
/* NOTE(review): beg is a character index here but is added as a byte offset; the resulting range is empty either way -- confirm callers do not rely on the exact position. */
2883 p = s + beg;
2884 }
2885 #ifdef NONASCII_MASK
/* Valid UTF-8: use the lead-byte counting fast path. */
2886 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
2887 enc == rb_utf8_encoding()) {
2888 p = str_utf8_nth(s, e, &beg);
/* Characters left to skip means the start is past the end. */
2889 if (beg > 0) return 0;
2890 len = str_utf8_offset(p, e, len);
2891 }
2892 #endif
/* Fixed-width encoding: scale indexes by the character size. */
2893 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2894 int char_sz = rb_enc_mbmaxlen(enc);
2895
2896 p = s + beg * char_sz;
2897 if (p > e) {
2898 return 0;
2899 }
2900 else if (len * char_sz > e - p)
2901 len = e - p;
2902 else
2903 len *= char_sz;
2904 }
/* Generic variable-width encoding. */
2905 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
2906 if (beg > 0) return 0;
2907 len = 0;
2908 }
2909 else {
2910 len = str_offset(p, e, len, enc, 0);
2911 }
2912 end:
2913 *lenp = len;
2914 RB_GC_GUARD(str);
2915 return p;
2916 }
2917
2918static VALUE str_substr(VALUE str, long beg, long len, int empty);
2919
2920VALUE
2921rb_str_substr(VALUE str, long beg, long len)
2922{
2923 return str_substr(str, beg, len, TRUE);
2924}
2925
2926static VALUE
2927str_substr(VALUE str, long beg, long len, int empty)
2928{
2929 char *p = rb_str_subpos(str, beg, &len);
2930
2931 if (!p) return Qnil;
2932 if (!len && !empty) return Qnil;
2933
2934 beg = p - RSTRING_PTR(str);
2935
2936 VALUE str2 = str_subseq(str, beg, len);
2937 rb_enc_cr_str_copy_for_substr(str2, str);
2938 return str2;
2939}
2940
2941VALUE
2943{
2944 if (OBJ_FROZEN(str)) return str;
2945 rb_str_resize(str, RSTRING_LEN(str));
2946 return rb_obj_freeze(str);
2947}
2948
2949
2950/*
2951 * call-seq:
2952 * +string -> new_string or self
2953 *
2954 * Returns +self+ if +self+ is not frozen.
2955 *
2956 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
2957 */
2958static VALUE
2959str_uplus(VALUE str)
2960{
2961 if (OBJ_FROZEN(str)) {
2962 return rb_str_dup(str);
2963 }
2964 else {
2965 return str;
2966 }
2967}
2968
2969/*
2970 * call-seq:
2971 * -string -> frozen_string
2972 *
2973 * Returns a frozen, possibly pre-existing copy of the string.
2974 *
2975 * The returned \String will be deduplicated as long as it does not have
2976 * any instance variables set on it and is not a String subclass.
2977 *
2978 * String#dedup is an alias for String#-@.
2979 */
2980static VALUE
2981str_uminus(VALUE str)
2982{
2983 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
2984 str = rb_str_dup(str);
2985 }
2986 return rb_fstring(str);
2987}
2988
2989RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
2990#define rb_str_dup_frozen rb_str_new_frozen
2991
2992VALUE
2994{
2995 if (FL_TEST(str, STR_TMPLOCK)) {
2996 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
2997 }
2998 FL_SET(str, STR_TMPLOCK);
2999 return str;
3000}
3001
3002VALUE
3004{
3005 if (!FL_TEST(str, STR_TMPLOCK)) {
3006 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3007 }
3008 FL_UNSET(str, STR_TMPLOCK);
3009 return str;
3010}
3011
3012RUBY_FUNC_EXPORTED VALUE
3013rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3014{
3015 rb_str_locktmp(str);
3016 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3017}
3018
3019void
3021{
3022 long capa;
3023 const int termlen = TERM_LEN(str);
3024
3025 str_modifiable(str);
3026 if (STR_SHARED_P(str)) {
3027 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3028 }
3029 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3030 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3031 }
3032
3033 int cr = ENC_CODERANGE(str);
3034 if (cr == ENC_CODERANGE_UNKNOWN) {
3035 /* Leave unknown. */
3036 }
3037 else if (len > RSTRING_LEN(str)) {
3038 if (ENC_CODERANGE_CLEAN_P(cr)) {
3039 /* Update the coderange regarding the extended part. */
3040 const char *const prev_end = RSTRING_END(str);
3041 const char *const new_end = RSTRING_PTR(str) + len;
3042 rb_encoding *enc = rb_enc_get(str);
3043 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3044 ENC_CODERANGE_SET(str, cr);
3045 }
3046 else if (cr == ENC_CODERANGE_BROKEN) {
3047 /* May be valid now, by appended part. */
3049 }
3050 }
3051 else if (len < RSTRING_LEN(str)) {
3052 if (cr != ENC_CODERANGE_7BIT) {
3053 /* ASCII-only string is keeping after truncated. Valid
3054 * and broken may be invalid or valid, leave unknown. */
3056 }
3057 }
3058
3059 STR_SET_LEN(str, len);
3060 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3061}
3062
3063VALUE
3065{
3066 if (len < 0) {
3067 rb_raise(rb_eArgError, "negative string size (or size too big)");
3068 }
3069
3070 int independent = str_independent(str);
3071 long slen = RSTRING_LEN(str);
3072
3073 if (slen > len && ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
3075 }
3076
3077 {
3078 long capa;
3079 const int termlen = TERM_LEN(str);
3080 if (STR_EMBED_P(str)) {
3081 if (len == slen) return str;
3082 if (str_embed_capa(str) >= len + termlen) {
3083 STR_SET_EMBED_LEN(str, len);
3084 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3085 return str;
3086 }
3087 str_make_independent_expand(str, slen, len - slen, termlen);
3088 }
3089 else if (str_embed_capa(str) >= len + termlen) {
3090 char *ptr = STR_HEAP_PTR(str);
3091 STR_SET_EMBED(str);
3092 if (slen > len) slen = len;
3093 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3094 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3095 STR_SET_EMBED_LEN(str, len);
3096 if (independent) ruby_xfree(ptr);
3097 return str;
3098 }
3099 else if (!independent) {
3100 if (len == slen) return str;
3101 str_make_independent_expand(str, slen, len - slen, termlen);
3102 }
3103 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3104 (capa - len) > (len < 1024 ? len : 1024)) {
3105 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3106 (size_t)len + termlen, STR_HEAP_SIZE(str));
3107 RSTRING(str)->as.heap.aux.capa = len;
3108 }
3109 else if (len == slen) return str;
3110 RSTRING(str)->as.heap.len = len;
3111 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3112 }
3113 return str;
3114}
3115
3116static VALUE
3117str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3118{
3119 if (keep_cr) {
3120 str_modify_keep_cr(str);
3121 }
3122 else {
3123 rb_str_modify(str);
3124 }
3125 if (len == 0) return 0;
3126
3127 long capa, total, olen, off = -1;
3128 char *sptr;
3129 const int termlen = TERM_LEN(str);
3130#if !USE_RVARGC
3131 assert(termlen < RSTRING_EMBED_LEN_MAX + 1); /* < (LONG_MAX/2) */
3132#endif
3133
3134 RSTRING_GETMEM(str, sptr, olen);
3135 if (ptr >= sptr && ptr <= sptr + olen) {
3136 off = ptr - sptr;
3137 }
3138
3139 if (STR_EMBED_P(str)) {
3140 capa = str_embed_capa(str) - termlen;
3141 sptr = RSTRING(str)->as.embed.ary;
3142 olen = RSTRING_EMBED_LEN(str);
3143 }
3144 else {
3145 capa = RSTRING(str)->as.heap.aux.capa;
3146 sptr = RSTRING(str)->as.heap.ptr;
3147 olen = RSTRING(str)->as.heap.len;
3148 }
3149 if (olen > LONG_MAX - len) {
3150 rb_raise(rb_eArgError, "string sizes too big");
3151 }
3152 total = olen + len;
3153 if (capa < total) {
3154 if (total >= LONG_MAX / 2) {
3155 capa = total;
3156 }
3157 while (total > capa) {
3158 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3159 }
3160 RESIZE_CAPA_TERM(str, capa, termlen);
3161 sptr = RSTRING_PTR(str);
3162 }
3163 if (off != -1) {
3164 ptr = sptr + off;
3165 }
3166 memcpy(sptr + olen, ptr, len);
3167 STR_SET_LEN(str, total);
3168 TERM_FILL(sptr + total, termlen); /* sentinel */
3169
3170 return str;
3171}
3172
3173#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3174#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3175
3176VALUE
3177rb_str_cat(VALUE str, const char *ptr, long len)
3178{
3179 if (len == 0) return str;
3180 if (len < 0) {
3181 rb_raise(rb_eArgError, "negative string size (or size too big)");
3182 }
3183 return str_buf_cat(str, ptr, len);
3184}
3185
3186VALUE
3187rb_str_cat_cstr(VALUE str, const char *ptr)
3188{
3189 must_not_null(ptr);
3190 return rb_str_buf_cat(str, ptr, strlen(ptr));
3191}
3192
3193RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3194RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3195RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3196
/*
 * Appends len bytes at ptr, tagged with encoding index ptr_encindex and
 * coderange ptr_cr, to str, and updates str's encoding and coderange.
 *
 * ptr_cr_ret - when non-null, receives the coderange of the appended bytes
 *              as known after any scan performed here.
 *
 * Raises Encoding::CompatibilityError when the two encodings cannot be
 * combined, and ArgumentError when len is negative.  Returns str.
 */
3197 static VALUE
3198 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3199 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3200 {
3201 int str_encindex = ENCODING_GET(str);
3202 int res_encindex;
3203 int str_cr, res_cr;
3204 rb_encoding *str_enc, *ptr_enc;
3205
/* An empty receiver is treated as 7bit regardless of its cached coderange. */
3206 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3207
3208 if (str_encindex == ptr_encindex) {
/* Same encoding: scan the new bytes only if the receiver's coderange is known. */
3209 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3210 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3211 }
3212 }
3213 else {
3214 str_enc = rb_enc_from_index(str_encindex);
3215 ptr_enc = rb_enc_from_index(ptr_encindex);
/* With an ASCII-incompatible encoding on either side, only an empty operand can be combined. */
3216 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3217 if (len == 0)
3218 return str;
3219 if (RSTRING_LEN(str) == 0) {
/* Empty receiver: take the bytes and adopt the source's encoding, coderange and terminator width. */
3220 rb_str_buf_cat(str, ptr, len);
3221 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3222 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3223 return str;
3224 }
3225 goto incompatible;
3226 }
3227 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3228 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3229 }
3230 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3231 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3232 str_cr = rb_enc_str_coderange(str);
3233 }
3234 }
3235 }
3236 if (ptr_cr_ret)
3237 *ptr_cr_ret = ptr_cr;
3238
/* Different encodings can be mixed only when at least one side is 7bit. */
3239 if (str_encindex != ptr_encindex &&
3240 str_cr != ENC_CODERANGE_7BIT &&
3241 ptr_cr != ENC_CODERANGE_7BIT) {
3242 str_enc = rb_enc_from_index(str_encindex);
3243 ptr_enc = rb_enc_from_index(ptr_encindex);
3244 goto incompatible;
3245 }
3246
/* Pick the resulting encoding index and coderange from the two operand coderanges. */
3247 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3248 res_encindex = str_encindex;
3249 res_cr = ENC_CODERANGE_UNKNOWN;
3250 }
3251 else if (str_cr == ENC_CODERANGE_7BIT) {
3252 if (ptr_cr == ENC_CODERANGE_7BIT) {
3253 res_encindex = str_encindex;
3254 res_cr = ENC_CODERANGE_7BIT;
3255 }
3256 else {
/* 7bit receiver plus non-7bit data: the result takes the data's encoding. */
3257 res_encindex = ptr_encindex;
3258 res_cr = ptr_cr;
3259 }
3260 }
3261 else if (str_cr == ENC_CODERANGE_VALID) {
3262 res_encindex = str_encindex;
3263 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3264 res_cr = str_cr;
3265 else
3266 res_cr = ptr_cr;
3267 }
3268 else { /* str_cr == ENC_CODERANGE_BROKEN */
3269 res_encindex = str_encindex;
3270 res_cr = str_cr;
/* Appended bytes may complete a previously broken sequence, so force a re-scan. */
3271 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3272 }
3273
3274 if (len < 0) {
3275 rb_raise(rb_eArgError, "negative string size (or size too big)");
3276 }
3277 str_buf_cat(str, ptr, len);
3278 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3279 return str;
3280
3281 incompatible:
3282 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3283 rb_enc_name(str_enc), rb_enc_name(ptr_enc));
3285 }
3286
3287VALUE
3288rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3289{
3290 return rb_enc_cr_str_buf_cat(str, ptr, len,
3291 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3292}
3293
3294VALUE
3296{
3297 /* ptr must reference NUL terminated ASCII string. */
3298 int encindex = ENCODING_GET(str);
3299 rb_encoding *enc = rb_enc_from_index(encindex);
3300 if (rb_enc_asciicompat(enc)) {
3301 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3302 encindex, ENC_CODERANGE_7BIT, 0);
3303 }
3304 else {
3305 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3306 while (*ptr) {
3307 unsigned int c = (unsigned char)*ptr;
3308 int len = rb_enc_codelen(c, enc);
3309 rb_enc_mbcput(c, buf, enc);
3310 rb_enc_cr_str_buf_cat(str, buf, len,
3311 encindex, ENC_CODERANGE_VALID, 0);
3312 ptr++;
3313 }
3314 return str;
3315 }
3316}
3317
3318VALUE
3320{
3321 int str2_cr = rb_enc_str_coderange(str2);
3322
3323 if (str_enc_fastpath(str)) {
3324 switch (str2_cr) {
3325 case ENC_CODERANGE_7BIT:
3326 // If RHS is 7bit we can do simple concatenation
3327 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3328 RB_GC_GUARD(str2);
3329 return str;
3331 // If RHS is valid, we can do simple concatenation if encodings are the same
3332 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3333 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3334 int str_cr = ENC_CODERANGE(str);
3335 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3336 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3337 }
3338 RB_GC_GUARD(str2);
3339 return str;
3340 }
3341 }
3342 }
3343
3344 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3345 ENCODING_GET(str2), str2_cr, &str2_cr);
3346
3347 ENC_CODERANGE_SET(str2, str2_cr);
3348
3349 return str;
3350}
3351
3352VALUE
3354{
3355 StringValue(str2);
3356 return rb_str_buf_append(str, str2);
3357}
3358
3359#define MIN_PRE_ALLOC_SIZE 48
3360
3361MJIT_FUNC_EXPORTED VALUE
3362rb_str_concat_literals(size_t num, const VALUE *strary)
3363{
3364 VALUE str;
3365 size_t i, s;
3366 long len = 1;
3367
3368 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3369 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3370
3371 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3372 if (LIKELY(len < MIN_PRE_ALLOC_SIZE)) {
3373 str = rb_str_resurrect(strary[0]);
3374 s = 1;
3375 }
3376 else {
3377 str = rb_str_buf_new(len);
3378 rb_enc_copy(str, strary[0]);
3379 s = 0;
3380 }
3381
3382 for (i = s; i < num; ++i) {
3383 const VALUE v = strary[i];
3384 int encidx = ENCODING_GET(v);
3385
3386 rb_str_buf_append(str, v);
3387 if (encidx != ENCINDEX_US_ASCII) {
3388 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3389 rb_enc_set_index(str, encidx);
3390 }
3391 }
3392 return str;
3393}
3394
3395/*
3396 * call-seq:
3397 * concat(*objects) -> string
3398 *
3399 * Concatenates each object in +objects+ to +self+ and returns +self+:
3400 *
3401 * s = 'foo'
3402 * s.concat('bar', 'baz') # => "foobarbaz"
3403 * s # => "foobarbaz"
3404 *
3405 * For each given object +object+ that is an \Integer,
3406 * the value is considered a codepoint and converted to a character before concatenation:
3407 *
3408 * s = 'foo'
3409 * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3410 *
3411 * Related: String#<<, which takes a single argument.
3412 */
3413static VALUE
3414rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3415{
3416 str_modifiable(str);
3417
3418 if (argc == 1) {
3419 return rb_str_concat(str, argv[0]);
3420 }
3421 else if (argc > 1) {
3422 int i;
3423 VALUE arg_str = rb_str_tmp_new(0);
3424 rb_enc_copy(arg_str, str);
3425 for (i = 0; i < argc; i++) {
3426 rb_str_concat(arg_str, argv[i]);
3427 }
3428 rb_str_buf_append(str, arg_str);
3429 }
3430
3431 return str;
3432}
3433
3434/*
3435 * call-seq:
3436 * string << object -> string
3437 *
3438 * Concatenates +object+ to +self+ and returns +self+:
3439 *
3440 * s = 'foo'
3441 * s << 'bar' # => "foobar"
3442 * s # => "foobar"
3443 *
3444 * If +object+ is an \Integer,
3445 * the value is considered a codepoint and converted to a character before concatenation:
3446 *
3447 * s = 'foo'
3448 * s << 33 # => "foo!"
3449 *
3450 * Related: String#concat, which takes multiple arguments.
3451 */
3452VALUE
3454{
3455 unsigned int code;
3456 rb_encoding *enc = STR_ENC_GET(str1);
3457 int encidx;
3458
3459 if (RB_INTEGER_TYPE_P(str2)) {
3460 if (rb_num_to_uint(str2, &code) == 0) {
3461 }
3462 else if (FIXNUM_P(str2)) {
3463 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3464 }
3465 else {
3466 rb_raise(rb_eRangeError, "bignum out of char range");
3467 }
3468 }
3469 else {
3470 return rb_str_append(str1, str2);
3471 }
3472
3473 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
3474 if (encidx >= 0) {
3475 char buf[1];
3476 buf[0] = (char)code;
3477 rb_str_cat(str1, buf, 1);
3478 if (encidx != rb_enc_to_index(enc)) {
3479 rb_enc_associate_index(str1, encidx);
3481 }
3482 }
3483 else {
3484 long pos = RSTRING_LEN(str1);
3485 int cr = ENC_CODERANGE(str1);
3486 int len;
3487 char *buf;
3488
3489 switch (len = rb_enc_codelen(code, enc)) {
3490 case ONIGERR_INVALID_CODE_POINT_VALUE:
3491 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3492 break;
3493 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3494 case 0:
3495 rb_raise(rb_eRangeError, "%u out of char range", code);
3496 break;
3497 }
3498 buf = ALLOCA_N(char, len + 1);
3499 rb_enc_mbcput(code, buf, enc);
3500 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3501 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3502 }
3503 rb_str_resize(str1, pos+len);
3504 memcpy(RSTRING_PTR(str1) + pos, buf, len);
3505 if (cr == ENC_CODERANGE_7BIT && code > 127) {
3507 }
3508 else if (cr == ENC_CODERANGE_BROKEN) {
3510 }
3511 ENC_CODERANGE_SET(str1, cr);
3512 }
3513 return str1;
3514}
3515
3516int
3517rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
3518{
3519 int encidx = rb_enc_to_index(enc);
3520
3521 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
3522 /* US-ASCII automatically extended to ASCII-8BIT */
3523 if (code > 0xFF) {
3524 rb_raise(rb_eRangeError, "%u out of char range", code);
3525 }
3526 if (encidx == ENCINDEX_US_ASCII && code > 127) {
3527 return ENCINDEX_ASCII_8BIT;
3528 }
3529 return encidx;
3530 }
3531 else {
3532 return -1;
3533 }
3534}
3535
3536/*
3537 * call-seq:
3538 * prepend(*other_strings) -> string
3539 *
3540 * Prepends each string in +other_strings+ to +self+ and returns +self+:
3541 *
3542 * s = 'foo'
3543 * s.prepend('bar', 'baz') # => "barbazfoo"
3544 * s # => "barbazfoo"
3545 *
3546 * Related: String#concat.
3547 */
3548
3549static VALUE
3550rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
3551{
3552 str_modifiable(str);
3553
3554 if (argc == 1) {
3555 rb_str_update(str, 0L, 0L, argv[0]);
3556 }
3557 else if (argc > 1) {
3558 int i;
3559 VALUE arg_str = rb_str_tmp_new(0);
3560 rb_enc_copy(arg_str, str);
3561 for (i = 0; i < argc; i++) {
3562 rb_str_append(arg_str, argv[i]);
3563 }
3564 rb_str_update(str, 0L, 0L, arg_str);
3565 }
3566
3567 return str;
3568}
3569
3570st_index_t
3572{
3573 int e = ENCODING_GET(str);
3574 if (e && is_ascii_string(str)) {
3575 e = 0;
3576 }
3577 return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
3578}
3579
3580int
3582{
3583 long len1, len2;
3584 const char *ptr1, *ptr2;
3585 RSTRING_GETMEM(str1, ptr1, len1);
3586 RSTRING_GETMEM(str2, ptr2, len2);
3587 return (len1 != len2 ||
3588 !rb_str_comparable(str1, str2) ||
3589 memcmp(ptr1, ptr2, len1) != 0);
3590}
3591
3592/*
3593 * call-seq:
3594 * hash -> integer
3595 *
3596 * Returns the integer hash value for +self+.
3597 * The value is based on the length, content and encoding of +self+.
3598 *
3599 * Related: Object#hash.
3600 */
3601
3602static VALUE
3603rb_str_hash_m(VALUE str)
3604{
3605 st_index_t hval = rb_str_hash(str);
3606 return ST2FIX(hval);
3607}
3608
3609#define lesser(a,b) (((a)>(b))?(b):(a))
3610
3611int
3613{
3614 int idx1, idx2;
3615 int rc1, rc2;
3616
3617 if (RSTRING_LEN(str1) == 0) return TRUE;
3618 if (RSTRING_LEN(str2) == 0) return TRUE;
3619 idx1 = ENCODING_GET(str1);
3620 idx2 = ENCODING_GET(str2);
3621 if (idx1 == idx2) return TRUE;
3622 rc1 = rb_enc_str_coderange(str1);
3623 rc2 = rb_enc_str_coderange(str2);
3624 if (rc1 == ENC_CODERANGE_7BIT) {
3625 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
3626 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
3627 return TRUE;
3628 }
3629 if (rc2 == ENC_CODERANGE_7BIT) {
3630 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
3631 return TRUE;
3632 }
3633 return FALSE;
3634}
3635
3636int
3638{
3639 long len1, len2;
3640 const char *ptr1, *ptr2;
3641 int retval;
3642
3643 if (str1 == str2) return 0;
3644 RSTRING_GETMEM(str1, ptr1, len1);
3645 RSTRING_GETMEM(str2, ptr2, len2);
3646 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
3647 if (len1 == len2) {
3648 if (!rb_str_comparable(str1, str2)) {
3649 if (ENCODING_GET(str1) > ENCODING_GET(str2))
3650 return 1;
3651 return -1;
3652 }
3653 return 0;
3654 }
3655 if (len1 > len2) return 1;
3656 return -1;
3657 }
3658 if (retval > 0) return 1;
3659 return -1;
3660}
3661
3662/*
3663 * call-seq:
3664 * string == object -> true or false
3665 * string === object -> true or false
3666 *
3667 * Returns +true+ if +object+ has the same length and content
3668 * as +self+; +false+ otherwise:
3669 *
3670 * s = 'foo'
3671 * s == 'foo' # => true
3672 * s == 'food' # => false
3673 * s == 'FOO' # => false
3674 *
3675 * Returns +false+ if the two strings' encodings are not compatible:
3676 * "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
3677 *
3678 * If +object+ is not an instance of \String but responds to +to_str+, then the
3679 * two strings are compared using <code>object.==</code>.
3680 */
3681
3682VALUE
3684{
3685 if (str1 == str2) return Qtrue;
3686 if (!RB_TYPE_P(str2, T_STRING)) {
3687 if (!rb_respond_to(str2, idTo_str)) {
3688 return Qfalse;
3689 }
3690 return rb_equal(str2, str1);
3691 }
3692 return rb_str_eql_internal(str1, str2);
3693}
3694
3695/*
3696 * call-seq:
3697 * eql?(object) -> true or false
3698 *
3699 * Returns +true+ if +object+ has the same length and content
3700 * as +self+; +false+ otherwise:
3701 *
3702 * s = 'foo'
3703 * s.eql?('foo') # => true
3704 * s.eql?('food') # => false
3705 * s.eql?('FOO') # => false
3706 *
3707 * Returns +false+ if the two strings' encodings are not compatible:
3708 *
3709 * "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
3710 *
3711 */
3712
3713MJIT_FUNC_EXPORTED VALUE
3714rb_str_eql(VALUE str1, VALUE str2)
3715{
3716 if (str1 == str2) return Qtrue;
3717 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
3718 return rb_str_eql_internal(str1, str2);
3719}
3720
3721/*
3722 * call-seq:
3723 * string <=> other_string -> -1, 0, 1, or nil
3724 *
3725 * Compares +self+ and +other_string+, returning:
3726 *
3727 * - -1 if +other_string+ is larger.
3728 * - 0 if the two are equal.
3729 * - 1 if +other_string+ is smaller.
3730 * - +nil+ if the two are incomparable.
3731 *
3732 * Examples:
3733 *
3734 * 'foo' <=> 'foo' # => 0
3735 * 'foo' <=> 'food' # => -1
3736 * 'food' <=> 'foo' # => 1
3737 * 'FOO' <=> 'foo' # => -1
3738 * 'foo' <=> 'FOO' # => 1
3739 * 'foo' <=> 1 # => nil
3740 *
3741 */
3742
3743static VALUE
3744rb_str_cmp_m(VALUE str1, VALUE str2)
3745{
3746 int result;
3747 VALUE s = rb_check_string_type(str2);
3748 if (NIL_P(s)) {
3749 return rb_invcmp(str1, str2);
3750 }
3751 result = rb_str_cmp(str1, s);
3752 return INT2FIX(result);
3753}
3754
3755static VALUE str_casecmp(VALUE str1, VALUE str2);
3756static VALUE str_casecmp_p(VALUE str1, VALUE str2);
3757
3758/*
3759 * call-seq:
3760 * casecmp(other_string) -> -1, 0, 1, or nil
3761 *
3762 * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
3763 *
3764 * - -1 if <tt>other_string.downcase</tt> is larger.
3765 * - 0 if the two are equal.
3766 * - 1 if <tt>other_string.downcase</tt> is smaller.
3767 * - +nil+ if the two are incomparable.
3768 *
3769 * Examples:
3770 *
3771 * 'foo'.casecmp('foo') # => 0
3772 * 'foo'.casecmp('food') # => -1
3773 * 'food'.casecmp('foo') # => 1
3774 * 'FOO'.casecmp('foo') # => 0
3775 * 'foo'.casecmp('FOO') # => 0
3776 * 'foo'.casecmp(1) # => nil
3777 *
3778 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
3779 *
3780 * Related: String#casecmp?.
3781 *
3782 */
3783
3784static VALUE
3785rb_str_casecmp(VALUE str1, VALUE str2)
3786{
3787 VALUE s = rb_check_string_type(str2);
3788 if (NIL_P(s)) {
3789 return Qnil;
3790 }
3791 return str_casecmp(str1, s);
3792}
3793
3794static VALUE
3795str_casecmp(VALUE str1, VALUE str2)
3796{
3797 long len;
3798 rb_encoding *enc;
3799 const char *p1, *p1end, *p2, *p2end;
3800
3801 enc = rb_enc_compatible(str1, str2);
3802 if (!enc) {
3803 return Qnil;
3804 }
3805
3806 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
3807 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
3808 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
3809 while (p1 < p1end && p2 < p2end) {
3810 if (*p1 != *p2) {
3811 unsigned int c1 = TOLOWER(*p1 & 0xff);
3812 unsigned int c2 = TOLOWER(*p2 & 0xff);
3813 if (c1 != c2)
3814 return INT2FIX(c1 < c2 ? -1 : 1);
3815 }
3816 p1++;
3817 p2++;
3818 }
3819 }
3820 else {
3821 while (p1 < p1end && p2 < p2end) {
3822 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
3823 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
3824
3825 if (0 <= c1 && 0 <= c2) {
3826 c1 = TOLOWER(c1);
3827 c2 = TOLOWER(c2);
3828 if (c1 != c2)
3829 return INT2FIX(c1 < c2 ? -1 : 1);
3830 }
3831 else {
3832 int r;
3833 l1 = rb_enc_mbclen(p1, p1end, enc);
3834 l2 = rb_enc_mbclen(p2, p2end, enc);
3835 len = l1 < l2 ? l1 : l2;
3836 r = memcmp(p1, p2, len);
3837 if (r != 0)
3838 return INT2FIX(r < 0 ? -1 : 1);
3839 if (l1 != l2)
3840 return INT2FIX(l1 < l2 ? -1 : 1);
3841 }
3842 p1 += l1;
3843 p2 += l2;
3844 }
3845 }
3846 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
3847 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
3848 return INT2FIX(-1);
3849}
3850
3851/*
3852 * call-seq:
3853 * casecmp?(other_string) -> true, false, or nil
3854 *
3855 * Returns +true+ if +self+ and +other_string+ are equal after
3856 * Unicode case folding, otherwise +false+:
3857 *
3858 * 'foo'.casecmp?('foo') # => true
3859 * 'foo'.casecmp?('food') # => false
3860 * 'food'.casecmp?('foo') # => false
3861 * 'FOO'.casecmp?('foo') # => true
3862 * 'foo'.casecmp?('FOO') # => true
3863 *
3864 * Returns +nil+ if the two values are incomparable:
3865 *
3866 * 'foo'.casecmp?(1) # => nil
3867 *
3868 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
3869 *
3870 * Related: String#casecmp.
3871 *
3872 */
3873
3874static VALUE
3875rb_str_casecmp_p(VALUE str1, VALUE str2)
3876{
3877 VALUE s = rb_check_string_type(str2);
3878 if (NIL_P(s)) {
3879 return Qnil;
3880 }
3881 return str_casecmp_p(str1, s);
3882}
3883
3884static VALUE
3885str_casecmp_p(VALUE str1, VALUE str2)
3886{
3887 rb_encoding *enc;
3888 VALUE folded_str1, folded_str2;
3889 VALUE fold_opt = sym_fold;
3890
3891 enc = rb_enc_compatible(str1, str2);
3892 if (!enc) {
3893 return Qnil;
3894 }
3895
3896 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
3897 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
3898
3899 return rb_str_eql(folded_str1, folded_str2);
3900}
3901
3902static long
3903strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
3904 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
3905{
3906 const char *search_start = str_ptr;
3907 long pos, search_len = str_len - offset;
3908
3909 for (;;) {
3910 const char *t;
3911 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3912 if (pos < 0) return pos;
3913 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
3914 if (t == search_start + pos) break;
3915 search_len -= t - search_start;
3916 if (search_len <= 0) return -1;
3917 offset += t - search_start;
3918 search_start = t;
3919 }
3920 return pos + offset;
3921}
3922
3923#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
3924
3925static long
3926rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
3927{
3928 const char *str_ptr, *str_ptr_end, *sub_ptr;
3929 long str_len, sub_len;
3930 rb_encoding *enc;
3931
3932 enc = rb_enc_check(str, sub);
3933 if (is_broken_string(sub)) return -1;
3934
3935 str_ptr = RSTRING_PTR(str);
3936 str_ptr_end = RSTRING_END(str);
3937 str_len = RSTRING_LEN(str);
3938 sub_ptr = RSTRING_PTR(sub);
3939 sub_len = RSTRING_LEN(sub);
3940
3941 if (str_len < sub_len) return -1;
3942
3943 if (offset != 0) {
3944 long str_len_char, sub_len_char;
3945 int single_byte = single_byte_optimizable(str);
3946 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
3947 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
3948 if (offset < 0) {
3949 offset += str_len_char;
3950 if (offset < 0) return -1;
3951 }
3952 if (str_len_char - offset < sub_len_char) return -1;
3953 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
3954 str_ptr += offset;
3955 }
3956 if (sub_len == 0) return offset;
3957
3958 /* need proceed one character at a time */
3959 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
3960}
3961
3962
3963/*
3964 * call-seq:
3965 * index(substring, offset = 0) -> integer or nil
3966 * index(regexp, offset = 0) -> integer or nil
3967 *
3968 * :include: doc/string/index.rdoc
3969 *
3970 */
3971
3972static VALUE
3973rb_str_index_m(int argc, VALUE *argv, VALUE str)
3974{
3975 VALUE sub;
3976 VALUE initpos;
3977 long pos;
3978
3979 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
3980 pos = NUM2LONG(initpos);
3981 }
3982 else {
3983 pos = 0;
3984 }
3985 if (pos < 0) {
3986 pos += str_strlen(str, NULL);
3987 if (pos < 0) {
3988 if (RB_TYPE_P(sub, T_REGEXP)) {
3990 }
3991 return Qnil;
3992 }
3993 }
3994
3995 if (RB_TYPE_P(sub, T_REGEXP)) {
3996 if (pos > str_strlen(str, NULL))
3997 return Qnil;
3998 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3999 rb_enc_check(str, sub), single_byte_optimizable(str));
4000
4001 if (rb_reg_search(sub, str, pos, 0) < 0) {
4002 return Qnil;
4003 }
4004 else {
4005 VALUE match = rb_backref_get();
4006 struct re_registers *regs = RMATCH_REGS(match);
4007 pos = rb_str_sublen(str, BEG(0));
4008 return LONG2NUM(pos);
4009 }
4010 }
4011 else {
4012 StringValue(sub);
4013 pos = rb_str_index(str, sub, pos);
4014 pos = rb_str_sublen(str, pos);
4015 }
4016
4017 if (pos == -1) return Qnil;
4018 return LONG2NUM(pos);
4019}
4020
4021/* whether given pos is valid character boundary or not
4022 * Note that in this function, "character" means a code point
4023 * (Unicode scalar value), not a grapheme cluster.
4024 */
4025static bool
4026str_check_byte_pos(VALUE str, long pos)
4027{
4028 const char *s = RSTRING_PTR(str);
4029 const char *e = RSTRING_END(str);
4030 const char *p = s + pos;
4031 const char *pp = rb_enc_left_char_head(s, p, e, rb_enc_get(str));
4032 return p == pp;
4033}
4034
4035/*
4036 * call-seq:
4037 * byteindex(substring, offset = 0) -> integer or nil
4038 * byteindex(regexp, offset = 0) -> integer or nil
4039 *
4040 * Returns the \Integer byte-based index of the first occurrence of the given +substring+,
4041 * or +nil+ if none found:
4042 *
4043 * 'foo'.byteindex('f') # => 0
4044 * 'foo'.byteindex('o') # => 1
4045 * 'foo'.byteindex('oo') # => 1
4046 * 'foo'.byteindex('ooo') # => nil
4047 *
4048 * Returns the \Integer byte-based index of the first match for the given \Regexp +regexp+,
4049 * or +nil+ if none found:
4050 *
4051 * 'foo'.byteindex(/f/) # => 0
4052 * 'foo'.byteindex(/o/) # => 1
4053 * 'foo'.byteindex(/oo/) # => 1
4054 * 'foo'.byteindex(/ooo/) # => nil
4055 *
4056 * \Integer argument +offset+, if given, specifies the byte-based position in the
4057 * string to begin the search:
4058 *
4059 * 'foo'.byteindex('o', 1) # => 1
4060 * 'foo'.byteindex('o', 2) # => 2
4061 * 'foo'.byteindex('o', 3) # => nil
4062 *
4063 * If +offset+ is negative, counts backward from the end of +self+:
4064 *
4065 * 'foo'.byteindex('o', -1) # => 2
4066 * 'foo'.byteindex('o', -2) # => 1
4067 * 'foo'.byteindex('o', -3) # => 1
4068 * 'foo'.byteindex('o', -4) # => nil
4069 *
4070 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4071 * raised.
4072 *
4073 * Related: String#index, String#byterindex.
4074 */
4075
4076static VALUE
4077rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4078{
4079 VALUE sub;
4080 VALUE initpos;
4081 long pos;
4082
4083 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4084 long slen = RSTRING_LEN(str);
4085 pos = NUM2LONG(initpos);
4086 if (pos < 0) {
4087 pos += slen;
4088 }
4089 if (pos < 0 || pos > slen) {
4090 if (RB_TYPE_P(sub, T_REGEXP)) {
4092 }
4093 return Qnil;
4094 }
4095 }
4096 else {
4097 pos = 0;
4098 }
4099
4100 if (!str_check_byte_pos(str, pos)) {
4102 "offset %ld does not land on character boundary", pos);
4103 }
4104
4105 if (RB_TYPE_P(sub, T_REGEXP)) {
4106 if (rb_reg_search(sub, str, pos, 0) < 0) {
4107 return Qnil;
4108 }
4109 else {
4110 VALUE match = rb_backref_get();
4111 struct re_registers *regs = RMATCH_REGS(match);
4112 pos = BEG(0);
4113 return LONG2NUM(pos);
4114 }
4115 }
4116 else {
4117 StringValue(sub);
4118 pos = rb_strseq_index(str, sub, pos, 1);
4119 }
4120
4121 if (pos == -1) return Qnil;
4122 return LONG2NUM(pos);
4123}
4124
4125#ifdef HAVE_MEMRCHR
4126static long
4127str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4128{
4129 char *hit, *adjusted;
4130 int c;
4131 long slen, searchlen;
4132 char *sbeg, *e, *t;
4133
4134 sbeg = RSTRING_PTR(str);
4135 slen = RSTRING_LEN(sub);
4136 if (slen == 0) return s - sbeg;
4137 e = RSTRING_END(str);
4138 t = RSTRING_PTR(sub);
4139 c = *t & 0xff;
4140 searchlen = s - sbeg + 1;
4141
4142 do {
4143 hit = memrchr(sbeg, c, searchlen);
4144 if (!hit) break;
4145 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4146 if (hit != adjusted) {
4147 searchlen = adjusted - sbeg;
4148 continue;
4149 }
4150 if (memcmp(hit, t, slen) == 0)
4151 return hit - sbeg;
4152 searchlen = adjusted - sbeg;
4153 } while (searchlen > 0);
4154
4155 return -1;
4156}
4157#else
4158static long
4159str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4160{
4161 long slen;
4162 char *sbeg, *e, *t;
4163
4164 sbeg = RSTRING_PTR(str);
4165 e = RSTRING_END(str);
4166 t = RSTRING_PTR(sub);
4167 slen = RSTRING_LEN(sub);
4168
4169 while (s) {
4170 if (memcmp(s, t, slen) == 0) {
4171 return s - sbeg;
4172 }
4173 if (s <= sbeg) break;
4174 s = rb_enc_prev_char(sbeg, s, e, enc);
4175 }
4176
4177 return -1;
4178}
4179#endif
4180
4181static long
4182rb_str_rindex(VALUE str, VALUE sub, long pos)
4183{
4184 long len, slen;
4185 char *sbeg, *s;
4186 rb_encoding *enc;
4187 int singlebyte;
4188
4189 enc = rb_enc_check(str, sub);
4190 if (is_broken_string(sub)) return -1;
4191 singlebyte = single_byte_optimizable(str);
4192 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4193 slen = str_strlen(sub, enc); /* rb_enc_check */
4194
4195 /* substring longer than string */
4196 if (len < slen) return -1;
4197 if (len - pos < slen) pos = len - slen;
4198 if (len == 0) return pos;
4199
4200 sbeg = RSTRING_PTR(str);
4201
4202 if (pos == 0) {
4203 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4204 return 0;
4205 else
4206 return -1;
4207 }
4208
4209 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4210 return rb_str_sublen(str, str_rindex(str, sub, s, enc));
4211}
4212
4213/*
4214 * call-seq:
4215 * rindex(substring, offset = self.length) -> integer or nil
4216 * rindex(regexp, offset = self.length) -> integer or nil
4217 *
4218 * Returns the \Integer index of the _last_ occurrence of the given +substring+,
4219 * or +nil+ if none found:
4220 *
4221 * 'foo'.rindex('f') # => 0
4222 * 'foo'.rindex('o') # => 2
4223 * 'foo'.rindex('oo') # => 1
4224 * 'foo'.rindex('ooo') # => nil
4225 *
4226 * Returns the \Integer index of the _last_ match for the given \Regexp +regexp+,
4227 * or +nil+ if none found:
4228 *
4229 * 'foo'.rindex(/f/) # => 0
4230 * 'foo'.rindex(/o/) # => 2
4231 * 'foo'.rindex(/oo/) # => 1
4232 * 'foo'.rindex(/ooo/) # => nil
4233 *
4234 * The _last_ match means starting at the possible last position, not
4235 * the last of longest matches.
4236 *
4237 * 'foo'.rindex(/o+/) # => 2
4238 * $~ #=> #<MatchData "o">
4239 *
4240 * To get the last longest match, needs to combine with negative
4241 * lookbehind.
4242 *
4243 * 'foo'.rindex(/(?<!o)o+/) # => 1
4244 * $~ #=> #<MatchData "oo">
4245 *
4246 * Or String#index with negative lookforward.
4247 *
4248 * 'foo'.index(/o+(?!.*o)/) # => 1
4249 * $~ #=> #<MatchData "oo">
4250 *
4251 * \Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4252 * string to _end_ the search:
4253 *
4254 * 'foo'.rindex('o', 0) # => nil
4255 * 'foo'.rindex('o', 1) # => 1
4256 * 'foo'.rindex('o', 2) # => 2
4257 * 'foo'.rindex('o', 3) # => 2
4258 *
4259 * If +offset+ is a negative \Integer, the maximum starting position in the
4260 * string to _end_ the search is the sum of the string's length and +offset+:
4261 *
4262 * 'foo'.rindex('o', -1) # => 2
4263 * 'foo'.rindex('o', -2) # => 1
4264 * 'foo'.rindex('o', -3) # => nil
4265 * 'foo'.rindex('o', -4) # => nil
4266 *
4267 * Related: String#index.
4268 */
4269
4270static VALUE
4271rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4272{
4273 VALUE sub;
4274 VALUE vpos;
4275 rb_encoding *enc = STR_ENC_GET(str);
4276 long pos, len = str_strlen(str, enc); /* str's enc */
4277
4278 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
4279 pos = NUM2LONG(vpos);
4280 if (pos < 0) {
4281 pos += len;
4282 if (pos < 0) {
4283 if (RB_TYPE_P(sub, T_REGEXP)) {
4285 }
4286 return Qnil;
4287 }
4288 }
4289 if (pos > len) pos = len;
4290 }
4291 else {
4292 pos = len;
4293 }
4294
4295 if (RB_TYPE_P(sub, T_REGEXP)) {
4296 /* enc = rb_get_check(str, sub); */
4297 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4298 enc, single_byte_optimizable(str));
4299
4300 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4301 VALUE match = rb_backref_get();
4302 struct re_registers *regs = RMATCH_REGS(match);
4303 pos = rb_str_sublen(str, BEG(0));
4304 return LONG2NUM(pos);
4305 }
4306 }
4307 else {
4308 StringValue(sub);
4309 pos = rb_str_rindex(str, sub, pos);
4310 if (pos >= 0) return LONG2NUM(pos);
4311 }
4312 return Qnil;
4313}
4314
4315static long
4316rb_str_byterindex(VALUE str, VALUE sub, long pos)
4317{
4318 long len, slen;
4319 char *sbeg, *s;
4320 rb_encoding *enc;
4321
4322 enc = rb_enc_check(str, sub);
4323 if (is_broken_string(sub)) return -1;
4324 len = RSTRING_LEN(str);
4325 slen = RSTRING_LEN(sub);
4326
4327 /* substring longer than string */
4328 if (len < slen) return -1;
4329 if (len - pos < slen) pos = len - slen;
4330 if (len == 0) return pos;
4331
4332 sbeg = RSTRING_PTR(str);
4333
4334 if (pos == 0) {
4335 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4336 return 0;
4337 else
4338 return -1;
4339 }
4340
4341 s = sbeg + pos;
4342 return str_rindex(str, sub, s, enc);
4343}
4344
4345
4346/*
4347 * call-seq:
4348 * byterindex(substring, offset = self.bytesize) -> integer or nil
4349 * byterindex(regexp, offset = self.bytesize) -> integer or nil
4350 *
4351 * Returns the \Integer byte-based index of the _last_ occurrence of the given +substring+,
4352 * or +nil+ if none found:
4353 *
4354 * 'foo'.byterindex('f') # => 0
4355 * 'foo'.byterindex('o') # => 2
4356 * 'foo'.byterindex('oo') # => 1
4357 * 'foo'.byterindex('ooo') # => nil
4358 *
4359 * Returns the \Integer byte-based index of the _last_ match for the given \Regexp +regexp+,
4360 * or +nil+ if none found:
4361 *
4362 * 'foo'.byterindex(/f/) # => 0
4363 * 'foo'.byterindex(/o/) # => 2
4364 * 'foo'.byterindex(/oo/) # => 1
4365 * 'foo'.byterindex(/ooo/) # => nil
4366 *
4367 * The _last_ match means starting at the possible last position, not
4368 * the last of longest matches.
4369 *
4370 * 'foo'.byterindex(/o+/) # => 2
4371 * $~ #=> #<MatchData "o">
4372 *
4373 * To get the last longest match, needs to combine with negative
4374 * lookbehind.
4375 *
4376 * 'foo'.byterindex(/(?<!o)o+/) # => 1
4377 * $~ #=> #<MatchData "oo">
4378 *
 * Or String#byteindex with negative lookahead.
4380 *
4381 * 'foo'.byteindex(/o+(?!.*o)/) # => 1
4382 * $~ #=> #<MatchData "oo">
4383 *
4384 * \Integer argument +offset+, if given and non-negative, specifies the maximum starting byte-based position in the
4385 * string to _end_ the search:
4386 *
4387 * 'foo'.byterindex('o', 0) # => nil
4388 * 'foo'.byterindex('o', 1) # => 1
4389 * 'foo'.byterindex('o', 2) # => 2
4390 * 'foo'.byterindex('o', 3) # => 2
4391 *
4392 * If +offset+ is a negative \Integer, the maximum starting position in the
4393 * string to _end_ the search is the sum of the string's length and +offset+:
4394 *
4395 * 'foo'.byterindex('o', -1) # => 2
4396 * 'foo'.byterindex('o', -2) # => 1
4397 * 'foo'.byterindex('o', -3) # => nil
4398 * 'foo'.byterindex('o', -4) # => nil
4399 *
4400 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4401 * raised.
4402 *
4403 * Related: String#byteindex.
4404 */
4405
4406static VALUE
4407rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4408{
4409 VALUE sub;
4410 VALUE vpos;
4411 long pos, len = RSTRING_LEN(str);
4412
4413 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
4414 pos = NUM2LONG(vpos);
4415 if (pos < 0) {
4416 pos += len;
4417 if (pos < 0) {
4418 if (RB_TYPE_P(sub, T_REGEXP)) {
4420 }
4421 return Qnil;
4422 }
4423 }
4424 if (pos > len) pos = len;
4425 }
4426 else {
4427 pos = len;
4428 }
4429
4430 if (!str_check_byte_pos(str, pos)) {
4432 "offset %ld does not land on character boundary", pos);
4433 }
4434
4435 if (RB_TYPE_P(sub, T_REGEXP)) {
4436 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4437 VALUE match = rb_backref_get();
4438 struct re_registers *regs = RMATCH_REGS(match);
4439 pos = BEG(0);
4440 return LONG2NUM(pos);
4441 }
4442 }
4443 else {
4444 StringValue(sub);
4445 pos = rb_str_byterindex(str, sub, pos);
4446 if (pos >= 0) return LONG2NUM(pos);
4447 }
4448 return Qnil;
4449}
4450
4451/*
4452 * call-seq:
4453 * string =~ regexp -> integer or nil
4454 * string =~ object -> integer or nil
4455 *
4456 * Returns the \Integer index of the first substring that matches
4457 * the given +regexp+, or +nil+ if no match found:
4458 *
4459 * 'foo' =~ /f/ # => 0
4460 * 'foo' =~ /o/ # => 1
4461 * 'foo' =~ /x/ # => nil
4462 *
4463 * Note: also updates Regexp@Special+global+variables.
4464 *
4465 * If the given +object+ is not a \Regexp, returns the value
4466 * returned by <tt>object =~ self</tt>.
4467 *
4468 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4469 * (see Regexp#=~):
4470 *
 *   number = nil
4472 * "no. 9" =~ /(?<number>\d+)/
4473 * number # => nil (not assigned)
4474 * /(?<number>\d+)/ =~ "no. 9"
4475 * number #=> "9"
4476 *
4477 */
4478
4479static VALUE
4480rb_str_match(VALUE x, VALUE y)
4481{
4482 switch (OBJ_BUILTIN_TYPE(y)) {
4483 case T_STRING:
4484 rb_raise(rb_eTypeError, "type mismatch: String given");
4485
4486 case T_REGEXP:
4487 return rb_reg_match(y, x);
4488
4489 default:
4490 return rb_funcall(y, idEqTilde, 1, x);
4491 }
4492}
4493
4494
4495static VALUE get_pat(VALUE);
4496
4497
4498/*
4499 * call-seq:
4500 * match(pattern, offset = 0) -> matchdata or nil
4501 * match(pattern, offset = 0) {|matchdata| ... } -> object
4502 *
4503 * Returns a \MatchData object (or +nil+) based on +self+ and the given +pattern+.
4504 *
4505 * Note: also updates Regexp@Special+global+variables.
4506 *
4507 * - Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4508 * regexp = Regexp.new(pattern)
4509 * - Computes +matchdata+, which will be either a \MatchData object or +nil+
4510 * (see Regexp#match):
 *     matchdata = regexp.match(self)
4512 *
4513 * With no block given, returns the computed +matchdata+:
4514 *
4515 * 'foo'.match('f') # => #<MatchData "f">
4516 * 'foo'.match('o') # => #<MatchData "o">
4517 * 'foo'.match('x') # => nil
4518 *
4519 * If \Integer argument +offset+ is given, the search begins at index +offset+:
4520 *
4521 * 'foo'.match('f', 1) # => nil
4522 * 'foo'.match('o', 1) # => #<MatchData "o">
4523 *
4524 * With a block given, calls the block with the computed +matchdata+
4525 * and returns the block's return value:
4526 *
4527 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4528 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
4529 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4530 *
4531 */
4532
4533static VALUE
4534rb_str_match_m(int argc, VALUE *argv, VALUE str)
4535{
4536 VALUE re, result;
4537 if (argc < 1)
4538 rb_check_arity(argc, 1, 2);
4539 re = argv[0];
4540 argv[0] = str;
4541 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
4542 if (!NIL_P(result) && rb_block_given_p()) {
4543 return rb_yield(result);
4544 }
4545 return result;
4546}
4547
4548/*
4549 * call-seq:
4550 * match?(pattern, offset = 0) -> true or false
4551 *
4552 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
4553 *
4554 * Note: does not update Regexp@Special+global+variables.
4555 *
4556 * Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4557 * regexp = Regexp.new(pattern)
4558 *
 * Returns +true+ if <tt>self.match(regexp)</tt> returns a \MatchData object,
4560 * +false+ otherwise:
4561 *
4562 * 'foo'.match?(/o/) # => true
4563 * 'foo'.match?('o') # => true
4564 * 'foo'.match?(/x/) # => false
4565 *
4566 * If \Integer argument +offset+ is given, the search begins at index +offset+:
4567 * 'foo'.match?('f', 1) # => false
4568 * 'foo'.match?('o', 1) # => true
4569 *
4570 */
4571
4572static VALUE
4573rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
4574{
4575 VALUE re;
4576 rb_check_arity(argc, 1, 2);
4577 re = get_pat(argv[0]);
4578 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
4579}
4580
/* Outcome of stepping a single character to its successor/predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR,  /* input or result is not a valid character */
    NEIGHBOR_FOUND,     /* the neighbor was written in place */
    NEIGHBOR_WRAPPED    /* stepping ran off the end of the range */
};
4586
4587static enum neighbor_char
4588enc_succ_char(char *p, long len, rb_encoding *enc)
4589{
4590 long i;
4591 int l;
4592
4593 if (rb_enc_mbminlen(enc) > 1) {
4594 /* wchar, trivial case */
4595 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4596 if (!MBCLEN_CHARFOUND_P(r)) {
4597 return NEIGHBOR_NOT_CHAR;
4598 }
4599 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
4600 l = rb_enc_code_to_mbclen(c, enc);
4601 if (!l) return NEIGHBOR_NOT_CHAR;
4602 if (l != len) return NEIGHBOR_WRAPPED;
4603 rb_enc_mbcput(c, p, enc);
4604 r = rb_enc_precise_mbclen(p, p + len, enc);
4605 if (!MBCLEN_CHARFOUND_P(r)) {
4606 return NEIGHBOR_NOT_CHAR;
4607 }
4608 return NEIGHBOR_FOUND;
4609 }
4610 while (1) {
4611 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
4612 p[i] = '\0';
4613 if (i < 0)
4614 return NEIGHBOR_WRAPPED;
4615 ++((unsigned char*)p)[i];
4616 l = rb_enc_precise_mbclen(p, p+len, enc);
4617 if (MBCLEN_CHARFOUND_P(l)) {
4618 l = MBCLEN_CHARFOUND_LEN(l);
4619 if (l == len) {
4620 return NEIGHBOR_FOUND;
4621 }
4622 else {
4623 memset(p+l, 0xff, len-l);
4624 }
4625 }
4626 if (MBCLEN_INVALID_P(l) && i < len-1) {
4627 long len2;
4628 int l2;
4629 for (len2 = len-1; 0 < len2; len2--) {
4630 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4631 if (!MBCLEN_INVALID_P(l2))
4632 break;
4633 }
4634 memset(p+len2+1, 0xff, len-(len2+1));
4635 }
4636 }
4637}
4638
4639static enum neighbor_char
4640enc_pred_char(char *p, long len, rb_encoding *enc)
4641{
4642 long i;
4643 int l;
4644 if (rb_enc_mbminlen(enc) > 1) {
4645 /* wchar, trivial case */
4646 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4647 if (!MBCLEN_CHARFOUND_P(r)) {
4648 return NEIGHBOR_NOT_CHAR;
4649 }
4650 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
4651 if (!c) return NEIGHBOR_NOT_CHAR;
4652 --c;
4653 l = rb_enc_code_to_mbclen(c, enc);
4654 if (!l) return NEIGHBOR_NOT_CHAR;
4655 if (l != len) return NEIGHBOR_WRAPPED;
4656 rb_enc_mbcput(c, p, enc);
4657 r = rb_enc_precise_mbclen(p, p + len, enc);
4658 if (!MBCLEN_CHARFOUND_P(r)) {
4659 return NEIGHBOR_NOT_CHAR;
4660 }
4661 return NEIGHBOR_FOUND;
4662 }
4663 while (1) {
4664 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
4665 p[i] = '\xff';
4666 if (i < 0)
4667 return NEIGHBOR_WRAPPED;
4668 --((unsigned char*)p)[i];
4669 l = rb_enc_precise_mbclen(p, p+len, enc);
4670 if (MBCLEN_CHARFOUND_P(l)) {
4671 l = MBCLEN_CHARFOUND_LEN(l);
4672 if (l == len) {
4673 return NEIGHBOR_FOUND;
4674 }
4675 else {
4676 memset(p+l, 0, len-l);
4677 }
4678 }
4679 if (MBCLEN_INVALID_P(l) && i < len-1) {
4680 long len2;
4681 int l2;
4682 for (len2 = len-1; 0 < len2; len2--) {
4683 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4684 if (!MBCLEN_INVALID_P(l2))
4685 break;
4686 }
4687 memset(p+len2+1, 0, len-(len2+1));
4688 }
4689 }
4690}
4691
4692/*
4693 overwrite +p+ by succeeding letter in +enc+ and returns
4694 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
4695 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
4696 assuming each ranges are successive, and mbclen
4697 never change in each ranges.
4698 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
4699 character.
4700 */
4701static enum neighbor_char
4702enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
4703{
4704 enum neighbor_char ret;
4705 unsigned int c;
4706 int ctype;
4707 int range;
4708 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
4709
4710 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
4711 int try;
4712 const int max_gaps = 1;
4713
4714 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4715 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
4716 ctype = ONIGENC_CTYPE_DIGIT;
4717 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
4718 ctype = ONIGENC_CTYPE_ALPHA;
4719 else
4720 return NEIGHBOR_NOT_CHAR;
4721
4722 MEMCPY(save, p, char, len);
4723 for (try = 0; try <= max_gaps; ++try) {
4724 ret = enc_succ_char(p, len, enc);
4725 if (ret == NEIGHBOR_FOUND) {
4726 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4727 if (rb_enc_isctype(c, ctype, enc))
4728 return NEIGHBOR_FOUND;
4729 }
4730 }
4731 MEMCPY(p, save, char, len);
4732 range = 1;
4733 while (1) {
4734 MEMCPY(save, p, char, len);
4735 ret = enc_pred_char(p, len, enc);
4736 if (ret == NEIGHBOR_FOUND) {
4737 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4738 if (!rb_enc_isctype(c, ctype, enc)) {
4739 MEMCPY(p, save, char, len);
4740 break;
4741 }
4742 }
4743 else {
4744 MEMCPY(p, save, char, len);
4745 break;
4746 }
4747 range++;
4748 }
4749 if (range == 1) {
4750 return NEIGHBOR_NOT_CHAR;
4751 }
4752
4753 if (ctype != ONIGENC_CTYPE_DIGIT) {
4754 MEMCPY(carry, p, char, len);
4755 return NEIGHBOR_WRAPPED;
4756 }
4757
4758 MEMCPY(carry, p, char, len);
4759 enc_succ_char(carry, len, enc);
4760 return NEIGHBOR_WRAPPED;
4761}
4762
4763
4764static VALUE str_succ(VALUE str);
4765
4766/*
4767 * call-seq:
4768 * succ -> new_str
4769 *
4770 * Returns the successor to +self+. The successor is calculated by
4771 * incrementing characters.
4772 *
 * The first character to be incremented is the rightmost alphanumeric,
 * or, if there are no alphanumerics, the rightmost character:
4775 *
4776 * 'THX1138'.succ # => "THX1139"
4777 * '<<koala>>'.succ # => "<<koalb>>"
4778 * '***'.succ # => '**+'
4779 *
4780 * The successor to a digit is another digit, "carrying" to the next-left
4781 * character for a "rollover" from 9 to 0, and prepending another digit
4782 * if necessary:
4783 *
4784 * '00'.succ # => "01"
4785 * '09'.succ # => "10"
4786 * '99'.succ # => "100"
4787 *
4788 * The successor to a letter is another letter of the same case,
4789 * carrying to the next-left character for a rollover,
4790 * and prepending another same-case letter if necessary:
4791 *
4792 * 'aa'.succ # => "ab"
4793 * 'az'.succ # => "ba"
4794 * 'zz'.succ # => "aaa"
4795 * 'AA'.succ # => "AB"
4796 * 'AZ'.succ # => "BA"
4797 * 'ZZ'.succ # => "AAA"
4798 *
4799 * The successor to a non-alphanumeric character is the next character
4800 * in the underlying character set's collating sequence,
4801 * carrying to the next-left character for a rollover,
4802 * and prepending another character if necessary:
4803 *
4804 * s = 0.chr * 3
4805 * s # => "\x00\x00\x00"
4806 * s.succ # => "\x00\x00\x01"
4807 * s = 255.chr * 3
4808 * s # => "\xFF\xFF\xFF"
4809 * s.succ # => "\x01\x00\x00\x00"
4810 *
4811 * Carrying can occur between and among mixtures of alphanumeric characters:
4812 *
4813 * s = 'zz99zz99'
4814 * s.succ # => "aaa00aa00"
4815 * s = '99zz99zz'
4816 * s.succ # => "100aa00aa"
4817 *
4818 * The successor to an empty \String is a new empty \String:
4819 *
4820 * ''.succ # => ""
4821 *
4822 * String#next is an alias for String#succ.
4823 */
4824
4825VALUE
4827{
4828 VALUE str;
4829 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
4830 rb_enc_cr_str_copy_for_substr(str, orig);
4831 return str_succ(str);
4832}
4833
4834static VALUE
4835str_succ(VALUE str)
4836{
4837 rb_encoding *enc;
4838 char *sbeg, *s, *e, *last_alnum = 0;
4839 int found_alnum = 0;
4840 long l, slen;
4841 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
4842 long carry_pos = 0, carry_len = 1;
4843 enum neighbor_char neighbor = NEIGHBOR_FOUND;
4844
4845 slen = RSTRING_LEN(str);
4846 if (slen == 0) return str;
4847
4848 enc = STR_ENC_GET(str);
4849 sbeg = RSTRING_PTR(str);
4850 s = e = sbeg + slen;
4851
4852 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4853 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
4854 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
4855 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
4856 break;
4857 }
4858 }
4859 l = rb_enc_precise_mbclen(s, e, enc);
4860 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4861 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4862 neighbor = enc_succ_alnum_char(s, l, enc, carry);
4863 switch (neighbor) {
4864 case NEIGHBOR_NOT_CHAR:
4865 continue;
4866 case NEIGHBOR_FOUND:
4867 return str;
4868 case NEIGHBOR_WRAPPED:
4869 last_alnum = s;
4870 break;
4871 }
4872 found_alnum = 1;
4873 carry_pos = s - sbeg;
4874 carry_len = l;
4875 }
4876 if (!found_alnum) { /* str contains no alnum */
4877 s = e;
4878 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4879 enum neighbor_char neighbor;
4880 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
4881 l = rb_enc_precise_mbclen(s, e, enc);
4882 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4883 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4884 MEMCPY(tmp, s, char, l);
4885 neighbor = enc_succ_char(tmp, l, enc);
4886 switch (neighbor) {
4887 case NEIGHBOR_FOUND:
4888 MEMCPY(s, tmp, char, l);
4889 return str;
4890 break;
4891 case NEIGHBOR_WRAPPED:
4892 MEMCPY(s, tmp, char, l);
4893 break;
4894 case NEIGHBOR_NOT_CHAR:
4895 break;
4896 }
4897 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
4898 /* wrapped to \0...\0. search next valid char. */
4899 enc_succ_char(s, l, enc);
4900 }
4901 if (!rb_enc_asciicompat(enc)) {
4902 MEMCPY(carry, s, char, l);
4903 carry_len = l;
4904 }
4905 carry_pos = s - sbeg;
4906 }
4908 }
4909 RESIZE_CAPA(str, slen + carry_len);
4910 sbeg = RSTRING_PTR(str);
4911 s = sbeg + carry_pos;
4912 memmove(s + carry_len, s, slen - carry_pos);
4913 memmove(s, carry, carry_len);
4914 slen += carry_len;
4915 STR_SET_LEN(str, slen);
4916 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
4918 return str;
4919}
4920
4921
4922/*
4923 * call-seq:
4924 * succ! -> self
4925 *
4926 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
4927 *
4928 * String#next! is an alias for String#succ!.
4929 */
4930
4931static VALUE
4932rb_str_succ_bang(VALUE str)
4933{
4934 rb_str_modify(str);
4935 str_succ(str);
4936 return str;
4937}
4938
/* Returns 1 if each of the +len+ bytes at +s+ satisfies ISDIGIT (also
 * when +len+ is not positive), 0 otherwise. */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
4948
4949static int
4950str_upto_i(VALUE str, VALUE arg)
4951{
4952 rb_yield(str);
4953 return 0;
4954}
4955
4956/*
4957 * call-seq:
4958 * upto(other_string, exclusive = false) {|string| ... } -> self
4959 * upto(other_string, exclusive = false) -> new_enumerator
4960 *
4961 * With a block given, calls the block with each \String value
4962 * returned by successive calls to String#succ;
4963 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
4964 * the sequence terminates when value +other_string+ is reached;
4965 * returns +self+:
4966 *
4967 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
4968 * Output:
4969 *
4970 * a8 a9 b0 b1 b2 b3 b4 b5 b6
4971 *
4972 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
4973 *
4974 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
4975 *
4976 * Output:
4977 *
4978 * a8 a9 b0 b1 b2 b3 b4 b5
4979 *
4980 * If +other_string+ would not be reached, does not call the block:
4981 *
4982 * '25'.upto('5') {|s| fail s }
4983 * 'aa'.upto('a') {|s| fail s }
4984 *
4985 * With no block given, returns a new \Enumerator:
4986 *
4987 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
4988 *
4989 */
4990
4991static VALUE
4992rb_str_upto(int argc, VALUE *argv, VALUE beg)
4993{
4994 VALUE end, exclusive;
4995
4996 rb_scan_args(argc, argv, "11", &end, &exclusive);
4997 RETURN_ENUMERATOR(beg, argc, argv);
4998 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
4999}
5000
5001VALUE
5002rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5003{
5004 VALUE current, after_end;
5005 ID succ;
5006 int n, ascii;
5007 rb_encoding *enc;
5008
5009 CONST_ID(succ, "succ");
5010 StringValue(end);
5011 enc = rb_enc_check(beg, end);
5012 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5013 /* single character */
5014 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5015 char c = RSTRING_PTR(beg)[0];
5016 char e = RSTRING_PTR(end)[0];
5017
5018 if (c > e || (excl && c == e)) return beg;
5019 for (;;) {
5020 if ((*each)(rb_enc_str_new(&c, 1, enc), arg)) break;
5021 if (!excl && c == e) break;
5022 c++;
5023 if (excl && c == e) break;
5024 }
5025 return beg;
5026 }
5027 /* both edges are all digits */
5028 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5029 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5030 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5031 VALUE b, e;
5032 int width;
5033
5034 width = RSTRING_LENINT(beg);
5035 b = rb_str_to_inum(beg, 10, FALSE);
5036 e = rb_str_to_inum(end, 10, FALSE);
5037 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5038 long bi = FIX2LONG(b);
5039 long ei = FIX2LONG(e);
5040 rb_encoding *usascii = rb_usascii_encoding();
5041
5042 while (bi <= ei) {
5043 if (excl && bi == ei) break;
5044 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5045 bi++;
5046 }
5047 }
5048 else {
5049 ID op = excl ? '<' : idLE;
5050 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5051
5052 args[0] = INT2FIX(width);
5053 while (rb_funcall(b, op, 1, e)) {
5054 args[1] = b;
5055 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5056 b = rb_funcallv(b, succ, 0, 0);
5057 }
5058 }
5059 return beg;
5060 }
5061 /* normal case */
5062 n = rb_str_cmp(beg, end);
5063 if (n > 0 || (excl && n == 0)) return beg;
5064
5065 after_end = rb_funcallv(end, succ, 0, 0);
5066 current = str_duplicate(rb_cString, beg);
5067 while (!rb_str_equal(current, after_end)) {
5068 VALUE next = Qnil;
5069 if (excl || !rb_str_equal(current, end))
5070 next = rb_funcallv(current, succ, 0, 0);
5071 if ((*each)(current, arg)) break;
5072 if (NIL_P(next)) break;
5073 current = next;
5074 StringValue(current);
5075 if (excl && rb_str_equal(current, end)) break;
5076 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5077 break;
5078 }
5079
5080 return beg;
5081}
5082
5083VALUE
5084rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5085{
5086 VALUE current;
5087 ID succ;
5088
5089 CONST_ID(succ, "succ");
5090 /* both edges are all digits */
5091 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5092 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5093 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5094 int width = RSTRING_LENINT(beg);
5095 b = rb_str_to_inum(beg, 10, FALSE);
5096 if (FIXNUM_P(b)) {
5097 long bi = FIX2LONG(b);
5098 rb_encoding *usascii = rb_usascii_encoding();
5099
5100 while (FIXABLE(bi)) {
5101 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5102 bi++;
5103 }
5104 b = LONG2NUM(bi);
5105 }
5106 args[0] = INT2FIX(width);
5107 while (1) {
5108 args[1] = b;
5109 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5110 b = rb_funcallv(b, succ, 0, 0);
5111 }
5112 }
5113 /* normal case */
5114 current = str_duplicate(rb_cString, beg);
5115 while (1) {
5116 VALUE next = rb_funcallv(current, succ, 0, 0);
5117 if ((*each)(current, arg)) break;
5118 current = next;
5119 StringValue(current);
5120 if (RSTRING_LEN(current) == 0)
5121 break;
5122 }
5123
5124 return beg;
5125}
5126
5127static int
5128include_range_i(VALUE str, VALUE arg)
5129{
5130 VALUE *argp = (VALUE *)arg;
5131 if (!rb_equal(str, *argp)) return 0;
5132 *argp = Qnil;
5133 return 1;
5134}
5135
5136VALUE
5137rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5138{
5139 beg = rb_str_new_frozen(beg);
5140 StringValue(end);
5141 end = rb_str_new_frozen(end);
5142 if (NIL_P(val)) return Qfalse;
5143 val = rb_check_string_type(val);
5144 if (NIL_P(val)) return Qfalse;
5145 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5146 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5147 rb_enc_asciicompat(STR_ENC_GET(val))) {
5148 const char *bp = RSTRING_PTR(beg);
5149 const char *ep = RSTRING_PTR(end);
5150 const char *vp = RSTRING_PTR(val);
5151 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5152 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5153 return Qfalse;
5154 else {
5155 char b = *bp;
5156 char e = *ep;
5157 char v = *vp;
5158
5159 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5160 if (b <= v && v < e) return Qtrue;
5161 return RBOOL(!RTEST(exclusive) && v == e);
5162 }
5163 }
5164 }
5165#if 0
5166 /* both edges are all digits */
5167 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5168 all_digits_p(bp, RSTRING_LEN(beg)) &&
5169 all_digits_p(ep, RSTRING_LEN(end))) {
5170 /* TODO */
5171 }
5172#endif
5173 }
5174 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5175
5176 return RBOOL(NIL_P(val));
5177}
5178
5179static VALUE
5180rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5181{
5182 if (rb_reg_search(re, str, 0, 0) >= 0) {
5183 VALUE match = rb_backref_get();
5184 int nth = rb_reg_backref_number(match, backref);
5185 return rb_reg_nth_match(nth, match);
5186 }
5187 return Qnil;
5188}
5189
5190static VALUE
5191rb_str_aref(VALUE str, VALUE indx)
5192{
5193 long idx;
5194
5195 if (FIXNUM_P(indx)) {
5196 idx = FIX2LONG(indx);
5197 }
5198 else if (RB_TYPE_P(indx, T_REGEXP)) {
5199 return rb_str_subpat(str, indx, INT2FIX(0));
5200 }
5201 else if (RB_TYPE_P(indx, T_STRING)) {
5202 if (rb_str_index(str, indx, 0) != -1)
5203 return str_duplicate(rb_cString, indx);
5204 return Qnil;
5205 }
5206 else {
5207 /* check if indx is Range */
5208 long beg, len = str_strlen(str, NULL);
5209 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5210 case Qfalse:
5211 break;
5212 case Qnil:
5213 return Qnil;
5214 default:
5215 return rb_str_substr(str, beg, len);
5216 }
5217 idx = NUM2LONG(indx);
5218 }
5219
5220 return str_substr(str, idx, 1, FALSE);
5221}
5222
5223
5224/*
5225 * call-seq:
5226 * string[index] -> new_string or nil
5227 * string[start, length] -> new_string or nil
5228 * string[range] -> new_string or nil
5229 * string[regexp, capture = 0] -> new_string or nil
5230 * string[substring] -> new_string or nil
5231 *
5232 * Returns the substring of +self+ specified by the arguments.
5233 * See examples at {String Slices}[rdoc-ref:String@String+Slices].
5234 *
5235 *
5236 */
5237
5238static VALUE
5239rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5240{
5241 if (argc == 2) {
5242 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5243 return rb_str_subpat(str, argv[0], argv[1]);
5244 }
5245 else {
5246 long beg = NUM2LONG(argv[0]);
5247 long len = NUM2LONG(argv[1]);
5248 return rb_str_substr(str, beg, len);
5249 }
5250 }
5251 rb_check_arity(argc, 1, 2);
5252 return rb_str_aref(str, argv[0]);
5253}
5254
5255VALUE
5257{
5258 char *ptr = RSTRING_PTR(str);
5259 long olen = RSTRING_LEN(str), nlen;
5260
5261 str_modifiable(str);
5262 if (len > olen) len = olen;
5263 nlen = olen - len;
5264 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5265 char *oldptr = ptr;
5266 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5267 STR_SET_EMBED(str);
5268 STR_SET_EMBED_LEN(str, nlen);
5269 ptr = RSTRING(str)->as.embed.ary;
5270 memmove(ptr, oldptr + len, nlen);
5271 if (fl == STR_NOEMBED) xfree(oldptr);
5272 }
5273 else {
5274 if (!STR_SHARED_P(str)) {
5275 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5276 rb_enc_cr_str_exact_copy(shared, str);
5277 OBJ_FREEZE(shared);
5278 }
5279 ptr = RSTRING(str)->as.heap.ptr += len;
5280 RSTRING(str)->as.heap.len = nlen;
5281 }
5282 ptr[nlen] = 0;
5284 return str;
5285}
5286
5287static void
5288rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
5289{
5290 char *sptr;
5291 long slen, vlen = RSTRING_LEN(val);
5292 int cr;
5293
5294 if (beg == 0 && vlen == 0) {
5295 rb_str_drop_bytes(str, len);
5296 return;
5297 }
5298
5299 str_modify_keep_cr(str);
5300 RSTRING_GETMEM(str, sptr, slen);
5301 if (len < vlen) {
5302 /* expand string */
5303 RESIZE_CAPA(str, slen + vlen - len);
5304 sptr = RSTRING_PTR(str);
5305 }
5306
5308 cr = rb_enc_str_coderange(val);
5309 else
5311
5312 if (vlen != len) {
5313 memmove(sptr + beg + vlen,
5314 sptr + beg + len,
5315 slen - (beg + len));
5316 }
5317 if (vlen < beg && len < 0) {
5318 MEMZERO(sptr + slen, char, -len);
5319 }
5320 if (vlen > 0) {
5321 memmove(sptr + beg, RSTRING_PTR(val), vlen);
5322 }
5323 slen += vlen - len;
5324 STR_SET_LEN(str, slen);
5325 TERM_FILL(&sptr[slen], TERM_LEN(str));
5326 ENC_CODERANGE_SET(str, cr);
5327}
5328
5329void
5330rb_str_update(VALUE str, long beg, long len, VALUE val)
5331{
5332 long slen;
5333 char *p, *e;
5334 rb_encoding *enc;
5335 int singlebyte = single_byte_optimizable(str);
5336 int cr;
5337
5338 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5339
5340 StringValue(val);
5341 enc = rb_enc_check(str, val);
5342 slen = str_strlen(str, enc); /* rb_enc_check */
5343
5344 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5345 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5346 }
5347 if (beg < 0) {
5348 beg += slen;
5349 }
5350 assert(beg >= 0);
5351 assert(beg <= slen);
5352 if (len > slen - beg) {
5353 len = slen - beg;
5354 }
5355 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5356 if (!p) p = RSTRING_END(str);
5357 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5358 if (!e) e = RSTRING_END(str);
5359 /* error check */
5360 beg = p - RSTRING_PTR(str); /* physical position */
5361 len = e - p; /* physical length */
5362 rb_str_splice_0(str, beg, len, val);
5363 rb_enc_associate(str, enc);
5365 if (cr != ENC_CODERANGE_BROKEN)
5366 ENC_CODERANGE_SET(str, cr);
5367}
5368
5369#define rb_str_splice(str, beg, len, val) rb_str_update(str, beg, len, val)
5370
5371static void
5372rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5373{
5374 int nth;
5375 VALUE match;
5376 long start, end, len;
5377 rb_encoding *enc;
5378 struct re_registers *regs;
5379
5380 if (rb_reg_search(re, str, 0, 0) < 0) {
5381 rb_raise(rb_eIndexError, "regexp not matched");
5382 }
5383 match = rb_backref_get();
5384 nth = rb_reg_backref_number(match, backref);
5385 regs = RMATCH_REGS(match);
5386 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5387 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5388 }
5389 if (nth < 0) {
5390 nth += regs->num_regs;
5391 }
5392
5393 start = BEG(nth);
5394 if (start == -1) {
5395 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5396 }
5397 end = END(nth);
5398 len = end - start;
5399 StringValue(val);
5400 enc = rb_enc_check_str(str, val);
5401 rb_str_splice_0(str, start, len, val);
5402 rb_enc_associate(str, enc);
5403}
5404
5405static VALUE
5406rb_str_aset(VALUE str, VALUE indx, VALUE val)
5407{
5408 long idx, beg;
5409
5410 switch (TYPE(indx)) {
5411 case T_REGEXP:
5412 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5413 return val;
5414
5415 case T_STRING:
5416 beg = rb_str_index(str, indx, 0);
5417 if (beg < 0) {
5418 rb_raise(rb_eIndexError, "string not matched");
5419 }
5420 beg = rb_str_sublen(str, beg);
5421 rb_str_splice(str, beg, str_strlen(indx, NULL), val);
5422 return val;
5423
5424 default:
5425 /* check if indx is Range */
5426 {
5427 long beg, len;
5428 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5429 rb_str_splice(str, beg, len, val);
5430 return val;
5431 }
5432 }
5433 /* FALLTHROUGH */
5434
5435 case T_FIXNUM:
5436 idx = NUM2LONG(indx);
5437 rb_str_splice(str, idx, 1, val);
5438 return val;
5439 }
5440}
5441
5442/*
5443 * call-seq:
5444 * string[index] = new_string
5445 * string[start, length] = new_string
5446 * string[range] = new_string
5447 * string[regexp, capture = 0] = new_string
5448 * string[substring] = new_string
5449 *
5450 * Replaces all, some, or none of the contents of +self+; returns +new_string+.
5451 * See {String Slices}[rdoc-ref:String@String+Slices].
5452 *
5453 * A few examples:
5454 *
5455 * s = 'foo'
5456 * s[2] = 'rtune' # => "rtune"
5457 * s # => "fortune"
5458 * s[1, 5] = 'init' # => "init"
5459 * s # => "finite"
5460 * s[3..4] = 'al' # => "al"
5461 * s # => "finale"
5462 * s[/e$/] = 'ly' # => "ly"
5463 * s # => "finally"
5464 * s['lly'] = 'ncial' # => "ncial"
5465 * s # => "financial"
5466 *
5467 * String#slice is an alias for String#[].
5468 *
5469 */
5470
5471static VALUE
5472rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5473{
5474 if (argc == 3) {
5475 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5476 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5477 }
5478 else {
5479 rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5480 }
5481 return argv[2];
5482 }
5483 rb_check_arity(argc, 2, 3);
5484 return rb_str_aset(str, argv[0], argv[1]);
5485}
5486
5487/*
5488 * call-seq:
5489 * insert(index, other_string) -> self
5490 *
5491 * Inserts the given +other_string+ into +self+; returns +self+.
5492 *
5493 * If the \Integer +index+ is positive, inserts +other_string+ at offset +index+:
5494 *
5495 * 'foo'.insert(1, 'bar') # => "fbaroo"
5496 *
5497 * If the \Integer +index+ is negative, counts backward from the end of +self+
5498 * and inserts +other_string+ at offset <tt>index+1</tt>
5499 * (that is, _after_ <tt>self[index]</tt>):
5500 *
5501 * 'foo'.insert(-2, 'bar') # => "fobaro"
5502 *
5503 */
5504
5505static VALUE
5506rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5507{
5508 long pos = NUM2LONG(idx);
5509
5510 if (pos == -1) {
5511 return rb_str_append(str, str2);
5512 }
5513 else if (pos < 0) {
5514 pos++;
5515 }
5516 rb_str_splice(str, pos, 0, str2);
5517 return str;
5518}
5519
5520
5521/*
5522 * call-seq:
5523 * slice!(index) -> new_string or nil
5524 * slice!(start, length) -> new_string or nil
5525 * slice!(range) -> new_string or nil
5526 * slice!(regexp, capture = 0) -> new_string or nil
5527 * slice!(substring) -> new_string or nil
5528 *
5529 * Removes and returns the substring of +self+ specified by the arguments.
5530 * See {String Slices}[rdoc-ref:String@String+Slices].
5531 *
5532 * A few examples:
5533 *
5534 * string = "This is a string"
5535 * string.slice!(2) #=> "i"
5536 * string.slice!(3..6) #=> " is "
5537 * string.slice!(/s.*t/) #=> "sa st"
5538 * string.slice!("r") #=> "r"
5539 * string #=> "Thing"
5540 *
5541 */
5542
5543static VALUE
5544rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
5545{
5546 VALUE result = Qnil;
5547 VALUE indx;
5548 long beg, len = 1;
5549 char *p;
5550
5551 rb_check_arity(argc, 1, 2);
5552 str_modify_keep_cr(str);
5553 indx = argv[0];
5554 if (RB_TYPE_P(indx, T_REGEXP)) {
5555 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
5556 VALUE match = rb_backref_get();
5557 struct re_registers *regs = RMATCH_REGS(match);
5558 int nth = 0;
5559 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
5560 if ((nth += regs->num_regs) <= 0) return Qnil;
5561 }
5562 else if (nth >= regs->num_regs) return Qnil;
5563 beg = BEG(nth);
5564 len = END(nth) - beg;
5565 goto subseq;
5566 }
5567 else if (argc == 2) {
5568 beg = NUM2LONG(indx);
5569 len = NUM2LONG(argv[1]);
5570 goto num_index;
5571 }
5572 else if (FIXNUM_P(indx)) {
5573 beg = FIX2LONG(indx);
5574 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5575 if (!len) return Qnil;
5576 beg = p - RSTRING_PTR(str);
5577 goto subseq;
5578 }
5579 else if (RB_TYPE_P(indx, T_STRING)) {
5580 beg = rb_str_index(str, indx, 0);
5581 if (beg == -1) return Qnil;
5582 len = RSTRING_LEN(indx);
5583 result = str_duplicate(rb_cString, indx);
5584 goto squash;
5585 }
5586 else {
5587 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
5588 case Qnil:
5589 return Qnil;
5590 case Qfalse:
5591 beg = NUM2LONG(indx);
5592 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5593 if (!len) return Qnil;
5594 beg = p - RSTRING_PTR(str);
5595 goto subseq;
5596 default:
5597 goto num_index;
5598 }
5599 }
5600
5601 num_index:
5602 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5603 beg = p - RSTRING_PTR(str);
5604
5605 subseq:
5606 result = rb_str_new(RSTRING_PTR(str)+beg, len);
5607 rb_enc_cr_str_copy_for_substr(result, str);
5608
5609 squash:
5610 if (len > 0) {
5611 if (beg == 0) {
5612 rb_str_drop_bytes(str, len);
5613 }
5614 else {
5615 char *sptr = RSTRING_PTR(str);
5616 long slen = RSTRING_LEN(str);
5617 if (beg + len > slen) /* pathological check */
5618 len = slen - beg;
5619 memmove(sptr + beg,
5620 sptr + beg + len,
5621 slen - (beg + len));
5622 slen -= len;
5623 STR_SET_LEN(str, slen);
5624 TERM_FILL(&sptr[slen], TERM_LEN(str));
5625 }
5626 }
5627 return result;
5628}
5629
5630static VALUE
5631get_pat(VALUE pat)
5632{
5633 VALUE val;
5634
5635 switch (OBJ_BUILTIN_TYPE(pat)) {
5636 case T_REGEXP:
5637 return pat;
5638
5639 case T_STRING:
5640 break;
5641
5642 default:
5643 val = rb_check_string_type(pat);
5644 if (NIL_P(val)) {
5645 Check_Type(pat, T_REGEXP);
5646 }
5647 pat = val;
5648 }
5649
5650 return rb_reg_regcomp(pat);
5651}
5652
5653static VALUE
5654get_pat_quoted(VALUE pat, int check)
5655{
5656 VALUE val;
5657
5658 switch (OBJ_BUILTIN_TYPE(pat)) {
5659 case T_REGEXP:
5660 return pat;
5661
5662 case T_STRING:
5663 break;
5664
5665 default:
5666 val = rb_check_string_type(pat);
5667 if (NIL_P(val)) {
5668 Check_Type(pat, T_REGEXP);
5669 }
5670 pat = val;
5671 }
5672 if (check && is_broken_string(pat)) {
5673 rb_exc_raise(rb_reg_check_preprocess(pat));
5674 }
5675 return pat;
5676}
5677
5678static long
5679rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
5680{
5681 if (BUILTIN_TYPE(pat) == T_STRING) {
5682 pos = rb_strseq_index(str, pat, pos, 1);
5683 if (set_backref_str) {
5684 if (pos >= 0) {
5685 str = rb_str_new_frozen_String(str);
5686 rb_backref_set_string(str, pos, RSTRING_LEN(pat));
5687 }
5688 else {
5690 }
5691 }
5692 return pos;
5693 }
5694 else {
5695 return rb_reg_search0(pat, str, pos, 0, set_backref_str);
5696 }
5697}
5698
5699
5700/*
5701 * call-seq:
5702 * sub!(pattern, replacement) -> self or nil
5703 * sub!(pattern) {|match| ... } -> self or nil
5704 *
5705 * Returns +self+ with only the first occurrence
5706 * (not all occurrences) of the given +pattern+ replaced.
5707 *
5708 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5709 *
5710 * Related: String#sub, String#gsub, String#gsub!.
5711 *
5712 */
5713
5714static VALUE
5715rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
5716{
5717 VALUE pat, repl, hash = Qnil;
5718 int iter = 0;
5719 long plen;
5720 int min_arity = rb_block_given_p() ? 1 : 2;
5721 long beg;
5722
5723 rb_check_arity(argc, min_arity, 2);
5724 if (argc == 1) {
5725 iter = 1;
5726 }
5727 else {
5728 repl = argv[1];
5729 hash = rb_check_hash_type(argv[1]);
5730 if (NIL_P(hash)) {
5731 StringValue(repl);
5732 }
5733 }
5734
5735 pat = get_pat_quoted(argv[0], 1);
5736
5737 str_modifiable(str);
5738 beg = rb_pat_search(pat, str, 0, 1);
5739 if (beg >= 0) {
5740 rb_encoding *enc;
5741 int cr = ENC_CODERANGE(str);
5742 long beg0, end0;
5743 VALUE match, match0 = Qnil;
5744 struct re_registers *regs;
5745 char *p, *rp;
5746 long len, rlen;
5747
5748 match = rb_backref_get();
5749 regs = RMATCH_REGS(match);
5750 if (RB_TYPE_P(pat, T_STRING)) {
5751 beg0 = beg;
5752 end0 = beg0 + RSTRING_LEN(pat);
5753 match0 = pat;
5754 }
5755 else {
5756 beg0 = BEG(0);
5757 end0 = END(0);
5758 if (iter) match0 = rb_reg_nth_match(0, match);
5759 }
5760
5761 if (iter || !NIL_P(hash)) {
5762 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5763
5764 if (iter) {
5765 repl = rb_obj_as_string(rb_yield(match0));
5766 }
5767 else {
5768 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5769 repl = rb_obj_as_string(repl);
5770 }
5771 str_mod_check(str, p, len);
5772 rb_check_frozen(str);
5773 }
5774 else {
5775 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5776 }
5777
5778 enc = rb_enc_compatible(str, repl);
5779 if (!enc) {
5780 rb_encoding *str_enc = STR_ENC_GET(str);
5781 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5782 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
5783 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
5784 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
5785 rb_enc_name(str_enc),
5786 rb_enc_name(STR_ENC_GET(repl)));
5787 }
5788 enc = STR_ENC_GET(repl);
5789 }
5790 rb_str_modify(str);
5791 rb_enc_associate(str, enc);
5793 int cr2 = ENC_CODERANGE(repl);
5794 if (cr2 == ENC_CODERANGE_BROKEN ||
5795 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
5797 else
5798 cr = cr2;
5799 }
5800 plen = end0 - beg0;
5801 rlen = RSTRING_LEN(repl);
5802 len = RSTRING_LEN(str);
5803 if (rlen > plen) {
5804 RESIZE_CAPA(str, len + rlen - plen);
5805 }
5806 p = RSTRING_PTR(str);
5807 if (rlen != plen) {
5808 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
5809 }
5810 rp = RSTRING_PTR(repl);
5811 memmove(p + beg0, rp, rlen);
5812 len += rlen - plen;
5813 STR_SET_LEN(str, len);
5814 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
5815 ENC_CODERANGE_SET(str, cr);
5816
5817 return str;
5818 }
5819 return Qnil;
5820}
5821
5822
5823/*
5824 * call-seq:
5825 * sub(pattern, replacement) -> new_string
5826 * sub(pattern) {|match| ... } -> new_string
5827 *
5828 * Returns a copy of +self+ with only the first occurrence
5829 * (not all occurrences) of the given +pattern+ replaced.
5830 *
5831 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5832 *
5833 * Related: String#sub!, String#gsub, String#gsub!.
5834 *
5835 */
5836
5837static VALUE
5838rb_str_sub(int argc, VALUE *argv, VALUE str)
5839{
5840 str = str_duplicate(rb_cString, str);
5841 rb_str_sub_bang(argc, argv, str);
5842 return str;
5843}
5844
5845static VALUE
5846str_gsub(int argc, VALUE *argv, VALUE str, int bang)
5847{
5848 VALUE pat, val = Qnil, repl, match, match0 = Qnil, dest, hash = Qnil;
5849 struct re_registers *regs;
5850 long beg, beg0, end0;
5851 long offset, blen, slen, len, last;
5852 enum {STR, ITER, MAP} mode = STR;
5853 char *sp, *cp;
5854 int need_backref = -1;
5855 rb_encoding *str_enc;
5856
5857 switch (argc) {
5858 case 1:
5859 RETURN_ENUMERATOR(str, argc, argv);
5860 mode = ITER;
5861 break;
5862 case 2:
5863 repl = argv[1];
5864 hash = rb_check_hash_type(argv[1]);
5865 if (NIL_P(hash)) {
5866 StringValue(repl);
5867 }
5868 else {
5869 mode = MAP;
5870 }
5871 break;
5872 default:
5873 rb_error_arity(argc, 1, 2);
5874 }
5875
5876 pat = get_pat_quoted(argv[0], 1);
5877 beg = rb_pat_search(pat, str, 0, need_backref);
5878 if (beg < 0) {
5879 if (bang) return Qnil; /* no match, no substitution */
5880 return str_duplicate(rb_cString, str);
5881 }
5882
5883 offset = 0;
5884 blen = RSTRING_LEN(str) + 30; /* len + margin */
5885 dest = rb_str_buf_new(blen);
5886 sp = RSTRING_PTR(str);
5887 slen = RSTRING_LEN(str);
5888 cp = sp;
5889 str_enc = STR_ENC_GET(str);
5890 rb_enc_associate(dest, str_enc);
5892
5893 do {
5894 match = rb_backref_get();
5895 regs = RMATCH_REGS(match);
5896 if (RB_TYPE_P(pat, T_STRING)) {
5897 beg0 = beg;
5898 end0 = beg0 + RSTRING_LEN(pat);
5899 match0 = pat;
5900 }
5901 else {
5902 beg0 = BEG(0);
5903 end0 = END(0);
5904 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
5905 }
5906
5907 if (mode) {
5908 if (mode == ITER) {
5909 val = rb_obj_as_string(rb_yield(match0));
5910 }
5911 else {
5912 val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5913 val = rb_obj_as_string(val);
5914 }
5915 str_mod_check(str, sp, slen);
5916 if (val == dest) { /* paranoid check [ruby-dev:24827] */
5917 rb_raise(rb_eRuntimeError, "block should not cheat");
5918 }
5919 }
5920 else if (need_backref) {
5921 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5922 if (need_backref < 0) {
5923 need_backref = val != repl;
5924 }
5925 }
5926 else {
5927 val = repl;
5928 }
5929
5930 len = beg0 - offset; /* copy pre-match substr */
5931 if (len) {
5932 rb_enc_str_buf_cat(dest, cp, len, str_enc);
5933 }
5934
5935 rb_str_buf_append(dest, val);
5936
5937 last = offset;
5938 offset = end0;
5939 if (beg0 == end0) {
5940 /*
5941 * Always consume at least one character of the input string
5942 * in order to prevent infinite loops.
5943 */
5944 if (RSTRING_LEN(str) <= end0) break;
5945 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
5946 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
5947 offset = end0 + len;
5948 }
5949 cp = RSTRING_PTR(str) + offset;
5950 if (offset > RSTRING_LEN(str)) break;
5951 beg = rb_pat_search(pat, str, offset, need_backref);
5952 } while (beg >= 0);
5953 if (RSTRING_LEN(str) > offset) {
5954 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
5955 }
5956 rb_pat_search(pat, str, last, 1);
5957 if (bang) {
5958 str_shared_replace(str, dest);
5959 }
5960 else {
5961 str = dest;
5962 }
5963
5964 return str;
5965}
5966
5967
5968/*
5969 * call-seq:
5970 * gsub!(pattern, replacement) -> self or nil
5971 * gsub!(pattern) {|match| ... } -> self or nil
5972 * gsub!(pattern) -> an_enumerator
5973 *
5974 * Performs the specified substring replacement(s) on +self+;
5975 * returns +self+ if any replacement occurred, +nil+ otherwise.
5976 *
5977 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5978 *
5979 * Returns an Enumerator if no +replacement+ and no block given.
5980 *
5981 * Related: String#sub, String#gsub, String#sub!.
5982 *
5983 */
5984
5985static VALUE
5986rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
5987{
5988 str_modify_keep_cr(str);
5989 return str_gsub(argc, argv, str, 1);
5990}
5991
5992
5993/*
5994 * call-seq:
5995 * gsub(pattern, replacement) -> new_string
5996 * gsub(pattern) {|match| ... } -> new_string
5997 * gsub(pattern) -> enumerator
5998 *
5999 * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
6000 *
6001 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6002 *
6003 * Returns an Enumerator if no +replacement+ and no block given.
6004 *
6005 * Related: String#sub, String#sub!, String#gsub!.
6006 *
6007 */
6008
6009static VALUE
6010rb_str_gsub(int argc, VALUE *argv, VALUE str)
6011{
6012 return str_gsub(argc, argv, str, 0);
6013}
6014
6015
6016/*
6017 * call-seq:
6018 * replace(other_string) -> self
6019 *
6020 * Replaces the contents of +self+ with the contents of +other_string+:
6021 *
6022 * s = 'foo' # => "foo"
6023 * s.replace('bar') # => "bar"
6024 *
6025 */
6026
6027VALUE
6029{
6030 str_modifiable(str);
6031 if (str == str2) return str;
6032
6033 StringValue(str2);
6034 str_discard(str);
6035 return str_replace(str, str2);
6036}
6037
6038/*
6039 * call-seq:
6040 * clear -> self
6041 *
6042 * Removes the contents of +self+:
6043 *
6044 * s = 'foo' # => "foo"
6045 * s.clear # => ""
6046 *
6047 */
6048
6049static VALUE
6050rb_str_clear(VALUE str)
6051{
6052 str_discard(str);
6053 STR_SET_EMBED(str);
6054 STR_SET_EMBED_LEN(str, 0);
6055 RSTRING_PTR(str)[0] = 0;
6056 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6058 else
6060 return str;
6061}
6062
6063/*
6064 * call-seq:
6065 * chr -> string
6066 *
6067 * Returns a string containing the first character of +self+:
6068 *
6069 * s = 'foo' # => "foo"
6070 * s.chr # => "f"
6071 *
6072 */
6073
6074static VALUE
6075rb_str_chr(VALUE str)
6076{
6077 return rb_str_substr(str, 0, 1);
6078}
6079
6080/*
6081 * call-seq:
6082 * getbyte(index) -> integer or nil
6083 *
6084 * Returns the byte at zero-based +index+ as an integer, or +nil+ if +index+ is out of range:
6085 *
6086 * s = 'abcde' # => "abcde"
6087 * s.getbyte(0) # => 97
6088 * s.getbyte(-1) # => 101
6089 * s.getbyte(5) # => nil
6090 *
6091 * Related: String#setbyte.
6092 */
6093static VALUE
6094rb_str_getbyte(VALUE str, VALUE index)
6095{
6096 long pos = NUM2LONG(index);
6097
6098 if (pos < 0)
6099 pos += RSTRING_LEN(str);
6100 if (pos < 0 || RSTRING_LEN(str) <= pos)
6101 return Qnil;
6102
6103 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6104}
6105
6106/*
6107 * call-seq:
6108 * setbyte(index, integer) -> integer
6109 *
6110 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
6111 *
6112 * s = 'abcde' # => "abcde"
6113 * s.setbyte(0, 98) # => 98
6114 * s # => "bbcde"
6115 *
6116 * Related: String#getbyte.
6117 */
6118static VALUE
6119rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6120{
6121 long pos = NUM2LONG(index);
6122 long len = RSTRING_LEN(str);
6123 char *ptr, *head, *left = 0;
6124 rb_encoding *enc;
6125 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6126
6127 if (pos < -len || len <= pos)
6128 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6129 if (pos < 0)
6130 pos += len;
6131
6132 VALUE v = rb_to_int(value);
6133 VALUE w = rb_int_and(v, INT2FIX(0xff));
6134 char byte = (char)(NUM2INT(w) & 0xFF);
6135
6136 if (!str_independent(str))
6137 str_make_independent(str);
6138 enc = STR_ENC_GET(str);
6139 head = RSTRING_PTR(str);
6140 ptr = &head[pos];
6141 if (!STR_EMBED_P(str)) {
6142 cr = ENC_CODERANGE(str);
6143 switch (cr) {
6144 case ENC_CODERANGE_7BIT:
6145 left = ptr;
6146 *ptr = byte;
6147 if (ISASCII(byte)) goto end;
6148 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6149 if (!MBCLEN_CHARFOUND_P(nlen))
6151 else
6153 goto end;
6155 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6156 width = rb_enc_precise_mbclen(left, head+len, enc);
6157 *ptr = byte;
6158 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6159 if (!MBCLEN_CHARFOUND_P(nlen))
6161 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6163 goto end;
6164 }
6165 }
6167 *ptr = byte;
6168
6169 end:
6170 return value;
6171}
6172
6173static VALUE
6174str_byte_substr(VALUE str, long beg, long len, int empty)
6175{
6176 long n = RSTRING_LEN(str);
6177
6178 if (beg > n || len < 0) return Qnil;
6179 if (beg < 0) {
6180 beg += n;
6181 if (beg < 0) return Qnil;
6182 }
6183 if (len > n - beg)
6184 len = n - beg;
6185 if (len <= 0) {
6186 if (!empty) return Qnil;
6187 len = 0;
6188 }
6189
6190 VALUE str2 = str_subseq(str, beg, len);
6191
6192 str_enc_copy(str2, str);
6193
6194 if (RSTRING_LEN(str2) == 0) {
6195 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6197 else
6199 }
6200 else {
6201 switch (ENC_CODERANGE(str)) {
6202 case ENC_CODERANGE_7BIT:
6204 break;
6205 default:
6207 break;
6208 }
6209 }
6210
6211 return str2;
6212}
6213
6214static VALUE
6215str_byte_aref(VALUE str, VALUE indx)
6216{
6217 long idx;
6218 if (FIXNUM_P(indx)) {
6219 idx = FIX2LONG(indx);
6220 }
6221 else {
6222 /* check if indx is Range */
6223 long beg, len = RSTRING_LEN(str);
6224
6225 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6226 case Qfalse:
6227 break;
6228 case Qnil:
6229 return Qnil;
6230 default:
6231 return str_byte_substr(str, beg, len, TRUE);
6232 }
6233
6234 idx = NUM2LONG(indx);
6235 }
6236 return str_byte_substr(str, idx, 1, FALSE);
6237}
6238
6239/*
6240 * call-seq:
6241 * byteslice(index, length = 1) -> string or nil
6242 * byteslice(range) -> string or nil
6243 *
6244 * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6245 *
6246 * With integer arguments +index+ and +length+ given,
6247 * returns the substring beginning at the given +index+
6248 * of the given +length+ (if possible),
6249 * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6250 *
6251 * s = '0123456789' # => "0123456789"
6252 * s.byteslice(2) # => "2"
6253 * s.byteslice(200) # => nil
6254 * s.byteslice(4, 3) # => "456"
6255 * s.byteslice(4, 30) # => "456789"
6256 * s.byteslice(4, -1) # => nil
6257 * s.byteslice(40, 2) # => nil
6258 *
6259 * In either case above, counts backwards from the end of +self+
6260 * if +index+ is negative:
6261 *
6262 * s = '0123456789' # => "0123456789"
6263 * s.byteslice(-4) # => "6"
6264 * s.byteslice(-4, 3) # => "678"
6265 *
6266 * With Range argument +range+ given, returns
6267 * <tt>byteslice(range.begin, range.size)</tt>:
6268 *
6269 * s = '0123456789' # => "0123456789"
6270 * s.byteslice(4..6) # => "456"
6271 * s.byteslice(-6..-4) # => "456"
6272 * s.byteslice(5..2) # => "" # range.size is zero.
6273 * s.byteslice(40..42) # => nil
6274 *
6275 * In all cases, a returned string has the same encoding as +self+:
6276 *
6277 * s.encoding # => #<Encoding:UTF-8>
6278 * s.byteslice(4).encoding # => #<Encoding:UTF-8>
6279 *
6280 */
6281
6282static VALUE
6283rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6284{
6285 if (argc == 2) {
6286 long beg = NUM2LONG(argv[0]);
6287 long len = NUM2LONG(argv[1]);
6288 return str_byte_substr(str, beg, len, TRUE);
6289 }
6290 rb_check_arity(argc, 1, 2);
6291 return str_byte_aref(str, argv[0]);
6292}
6293
6294/*
6295 * call-seq:
6296 * bytesplice(index, length, str) -> string
6297 * bytesplice(range, str) -> string
6298 *
6299 * Replaces some or all of the content of +self+ with +str+, and returns +self+.
6300 * The portion of the string affected is determined using
6301 * the same criteria as String#byteslice, except that +length+ cannot be omitted.
6302 * If the replacement string is not the same length as the text it is replacing,
6303 * the string will be adjusted accordingly.
6304 * The form that takes an Integer will raise an IndexError if the value is out
6305 * of range; the Range form will raise a RangeError.
6306 * If the beginning or ending offset does not land on a character (codepoint)
6307 * boundary, an IndexError will be raised.
6308 */
6309
6310static VALUE
6311rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6312{
6313 long beg, end, len, slen;
6314 VALUE val;
6315 rb_encoding *enc;
6316 int cr;
6317
6318 rb_check_arity(argc, 2, 3);
6319 if (argc == 2) {
6320 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6321 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6322 rb_builtin_class_name(argv[0]));
6323 }
6324 val = argv[1];
6325 }
6326 else {
6327 beg = NUM2LONG(argv[0]);
6328 len = NUM2LONG(argv[1]);
6329 val = argv[2];
6330 }
6331 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
6332 slen = RSTRING_LEN(str);
6333 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
6334 rb_raise(rb_eIndexError, "index %ld out of string", beg);
6335 }
6336 if (beg < 0) {
6337 beg += slen;
6338 }
6339 assert(beg >= 0);
6340 assert(beg <= slen);
6341 if (len > slen - beg) {
6342 len = slen - beg;
6343 }
6344 end = beg + len;
6345 if (!str_check_byte_pos(str, beg)) {
6347 "offset %ld does not land on character boundary", beg);
6348 }
6349 if (!str_check_byte_pos(str, end)) {
6351 "offset %ld does not land on character boundary", end);
6352 }
6353 StringValue(val);
6354 enc = rb_enc_check(str, val);
6355 str_modify_keep_cr(str);
6356 rb_str_splice_0(str, beg, len, val);
6357 rb_enc_associate(str, enc);
6359 if (cr != ENC_CODERANGE_BROKEN)
6360 ENC_CODERANGE_SET(str, cr);
6361 return str;
6362}
6363
6364/*
6365 * call-seq:
6366 * reverse -> string
6367 *
6368 * Returns a new string with the characters from +self+ in reverse order.
6369 *
6370 * 'stressed'.reverse # => "desserts"
6371 *
6372 */
6373
6374static VALUE
6375rb_str_reverse(VALUE str)
6376{
6377 rb_encoding *enc;
6378 VALUE rev;
6379 char *s, *e, *p;
6380 int cr;
6381
6382 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6383 enc = STR_ENC_GET(str);
6384 rev = rb_str_new(0, RSTRING_LEN(str));
6385 s = RSTRING_PTR(str); e = RSTRING_END(str);
6386 p = RSTRING_END(rev);
6387 cr = ENC_CODERANGE(str);
6388
6389 if (RSTRING_LEN(str) > 1) {
6390 if (single_byte_optimizable(str)) {
6391 while (s < e) {
6392 *--p = *s++;
6393 }
6394 }
6395 else if (cr == ENC_CODERANGE_VALID) {
6396 while (s < e) {
6397 int clen = rb_enc_fast_mbclen(s, e, enc);
6398
6399 p -= clen;
6400 memcpy(p, s, clen);
6401 s += clen;
6402 }
6403 }
6404 else {
6405 cr = rb_enc_asciicompat(enc) ?
6407 while (s < e) {
6408 int clen = rb_enc_mbclen(s, e, enc);
6409
6410 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6411 p -= clen;
6412 memcpy(p, s, clen);
6413 s += clen;
6414 }
6415 }
6416 }
6417 STR_SET_LEN(rev, RSTRING_LEN(str));
6418 str_enc_copy(rev, str);
6419 ENC_CODERANGE_SET(rev, cr);
6420
6421 return rev;
6422}
6423
6424
6425/*
6426 * call-seq:
6427 * reverse! -> self
6428 *
6429 * Returns +self+ with its characters reversed:
6430 *
6431 * s = 'stressed'
6432 * s.reverse! # => "desserts"
6433 * s # => "desserts"
6434 *
6435 */
6436
6437static VALUE
6438rb_str_reverse_bang(VALUE str)
6439{
6440 if (RSTRING_LEN(str) > 1) {
6441 if (single_byte_optimizable(str)) {
6442 char *s, *e, c;
6443
6444 str_modify_keep_cr(str);
6445 s = RSTRING_PTR(str);
6446 e = RSTRING_END(str) - 1;
6447 while (s < e) {
6448 c = *s;
6449 *s++ = *e;
6450 *e-- = c;
6451 }
6452 }
6453 else {
6454 str_shared_replace(str, rb_str_reverse(str));
6455 }
6456 }
6457 else {
6458 str_modify_keep_cr(str);
6459 }
6460 return str;
6461}
6462
6463
6464/*
6465 * call-seq:
6466 * include? other_string -> true or false
6467 *
6468 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6469 *
6470 * s = 'foo'
6471 * s.include?('f') # => true
6472 * s.include?('fo') # => true
6473 * s.include?('food') # => false
6474 *
6475 */
6476
6477VALUE
6478rb_str_include(VALUE str, VALUE arg)
6479{
6480 long i;
6481
6482 StringValue(arg);
6483 i = rb_str_index(str, arg, 0);
6484
6485 return RBOOL(i != -1);
6486}
6487
6488
6489/*
6490 * call-seq:
6491 * to_i(base = 10) -> integer
6492 *
6493 * Returns the result of interpreting leading characters in +self+
6494 * as an integer in the given +base+ (which must be in (0, 2..36)):
6495 *
6496 * '123456'.to_i # => 123456
6497 * '123def'.to_i(16) # => 1195503
6498 *
6499 * With +base+ zero, string +object+ may contain leading characters
6500 * to specify the actual base:
6501 *
6502 * '123def'.to_i(0) # => 123
6503 * '0123def'.to_i(0) # => 83
6504 * '0b123def'.to_i(0) # => 1
6505 * '0o123def'.to_i(0) # => 83
6506 * '0d123def'.to_i(0) # => 123
6507 * '0x123def'.to_i(0) # => 1195503
6508 *
6509 * Characters past a leading valid number (in the given +base+) are ignored:
6510 *
6511 * '12.345'.to_i # => 12
6512 * '12345'.to_i(2) # => 1
6513 *
6514 * Returns zero if there is no leading valid number:
6515 *
6516 * 'abcdef'.to_i # => 0
6517 * '2'.to_i(2) # => 0
6518 *
6519 */
6520
6521static VALUE
6522rb_str_to_i(int argc, VALUE *argv, VALUE str)
6523{
6524 int base = 10;
6525
6526 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
6527 rb_raise(rb_eArgError, "invalid radix %d", base);
6528 }
6529 return rb_str_to_inum(str, base, FALSE);
6530}
6531
6532
6533/*
6534 * call-seq:
6535 * to_f -> float
6536 *
6537 * Returns the result of interpreting leading characters in +self+ as a Float:
6538 *
6539 * '3.14159'.to_f # => 3.14159
6540 * '1.234e-2'.to_f # => 0.01234
6541 *
6542 * Characters past a leading valid number are ignored:
6543 *
6544 * '3.14 (pi to two places)'.to_f # => 3.14
6545 *
6546 * Returns zero if there is no leading valid number:
6547 *
6548 * 'abcdef'.to_f # => 0.0
6549 *
6550 */
6551
6552static VALUE
6553rb_str_to_f(VALUE str)
6554{
6555 return DBL2NUM(rb_str_to_dbl(str, FALSE));
6556}
6557
6558
6559/*
6560 * call-seq:
6561 * to_s -> self or string
6562 *
6563 * Returns +self+ if +self+ is a \String,
6564 * or +self+ converted to a \String if +self+ is a subclass of \String.
6565 *
6566 * String#to_str is an alias for String#to_s.
6567 *
6568 */
6569
6570static VALUE
6571rb_str_to_s(VALUE str)
6572{
6573 if (rb_obj_class(str) != rb_cString) {
6574 return str_duplicate(rb_cString, str);
6575 }
6576 return str;
6577}
6578
#if 0
/* Disabled helper: encodes codepoint c in enc and appends it to str. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
6590
6591#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
6592
6593int
6594rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
6595{
6596 char buf[CHAR_ESC_LEN + 1];
6597 int l;
6598
6599#if SIZEOF_INT > 4
6600 c &= 0xffffffff;
6601#endif
6602 if (unicode_p) {
6603 if (c < 0x7F && ISPRINT(c)) {
6604 snprintf(buf, CHAR_ESC_LEN, "%c", c);
6605 }
6606 else if (c < 0x10000) {
6607 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
6608 }
6609 else {
6610 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
6611 }
6612 }
6613 else {
6614 if (c < 0x100) {
6615 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
6616 }
6617 else {
6618 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
6619 }
6620 }
6621 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
6622 rb_str_buf_cat(result, buf, l);
6623 return l;
6624}
6625
/*
 * Returns the backslash-escape spelling for control character +c+
 * (for example "\\n" for a newline), or NULL when +c+ has no entry.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int ch;
        const char *spelling;
    } escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    size_t i;

    for (i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
        if (escapes[i].ch == c) {
            return escapes[i].spelling;
        }
    }
    return NULL;
}
6643
6644VALUE
6645rb_str_escape(VALUE str)
6646{
6647 int encidx = ENCODING_GET(str);
6648 rb_encoding *enc = rb_enc_from_index(encidx);
6649 const char *p = RSTRING_PTR(str);
6650 const char *pend = RSTRING_END(str);
6651 const char *prev = p;
6652 char buf[CHAR_ESC_LEN + 1];
6653 VALUE result = rb_str_buf_new(0);
6654 int unicode_p = rb_enc_unicode_p(enc);
6655 int asciicompat = rb_enc_asciicompat(enc);
6656
6657 while (p < pend) {
6658 unsigned int c;
6659 const char *cc;
6660 int n = rb_enc_precise_mbclen(p, pend, enc);
6661 if (!MBCLEN_CHARFOUND_P(n)) {
6662 if (p > prev) str_buf_cat(result, prev, p - prev);
6663 n = rb_enc_mbminlen(enc);
6664 if (pend < p + n)
6665 n = (int)(pend - p);
6666 while (n--) {
6667 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6668 str_buf_cat(result, buf, strlen(buf));
6669 prev = ++p;
6670 }
6671 continue;
6672 }
6673 n = MBCLEN_CHARFOUND_LEN(n);
6674 c = rb_enc_mbc_to_codepoint(p, pend, enc);
6675 p += n;
6676 cc = ruby_escaped_char(c);
6677 if (cc) {
6678 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6679 str_buf_cat(result, cc, strlen(cc));
6680 prev = p;
6681 }
6682 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
6683 }
6684 else {
6685 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6686 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6687 prev = p;
6688 }
6689 }
6690 if (p > prev) str_buf_cat(result, prev, p - prev);
6691 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
6692
6693 return result;
6694}
6695
6696/*
6697 * call-seq:
6698 * inspect -> string
6699 *
6700 * Returns a printable version of +self+, enclosed in double-quotes,
6701 * and with special characters escaped:
6702 *
6703 * s = "foo\tbar\tbaz\n"
6704 * s.inspect
6705 * # => "\"foo\\tbar\\tbaz\\n\""
6706 *
6707 */
6708
6709VALUE
6711{
6712 int encidx = ENCODING_GET(str);
6713 rb_encoding *enc = rb_enc_from_index(encidx);
6714 const char *p, *pend, *prev;
6715 char buf[CHAR_ESC_LEN + 1];
6716 VALUE result = rb_str_buf_new(0);
6717 rb_encoding *resenc = rb_default_internal_encoding();
6718 int unicode_p = rb_enc_unicode_p(enc);
6719 int asciicompat = rb_enc_asciicompat(enc);
6720
6721 if (resenc == NULL) resenc = rb_default_external_encoding();
6722 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
6723 rb_enc_associate(result, resenc);
6724 str_buf_cat2(result, "\"");
6725
6726 p = RSTRING_PTR(str); pend = RSTRING_END(str);
6727 prev = p;
6728 while (p < pend) {
6729 unsigned int c, cc;
6730 int n;
6731
6732 n = rb_enc_precise_mbclen(p, pend, enc);
6733 if (!MBCLEN_CHARFOUND_P(n)) {
6734 if (p > prev) str_buf_cat(result, prev, p - prev);
6735 n = rb_enc_mbminlen(enc);
6736 if (pend < p + n)
6737 n = (int)(pend - p);
6738 while (n--) {
6739 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6740 str_buf_cat(result, buf, strlen(buf));
6741 prev = ++p;
6742 }
6743 continue;
6744 }
6745 n = MBCLEN_CHARFOUND_LEN(n);
6746 c = rb_enc_mbc_to_codepoint(p, pend, enc);
6747 p += n;
6748 if ((asciicompat || unicode_p) &&
6749 (c == '"'|| c == '\\' ||
6750 (c == '#' &&
6751 p < pend &&
6752 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
6753 (cc = rb_enc_codepoint(p,pend,enc),
6754 (cc == '$' || cc == '@' || cc == '{'))))) {
6755 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6756 str_buf_cat2(result, "\\");
6757 if (asciicompat || enc == resenc) {
6758 prev = p - n;
6759 continue;
6760 }
6761 }
6762 switch (c) {
6763 case '\n': cc = 'n'; break;
6764 case '\r': cc = 'r'; break;
6765 case '\t': cc = 't'; break;
6766 case '\f': cc = 'f'; break;
6767 case '\013': cc = 'v'; break;
6768 case '\010': cc = 'b'; break;
6769 case '\007': cc = 'a'; break;
6770 case 033: cc = 'e'; break;
6771 default: cc = 0; break;
6772 }
6773 if (cc) {
6774 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6775 buf[0] = '\\';
6776 buf[1] = (char)cc;
6777 str_buf_cat(result, buf, 2);
6778 prev = p;
6779 continue;
6780 }
6781 /* The special casing of 0x85 (NEXT_LINE) here is because
6782 * Oniguruma historically treats it as printable, but it
6783 * doesn't match the print POSIX bracket class or character
6784 * property in regexps.
6785 *
6786 * See Ruby Bug #16842 for details:
6787 * https://bugs.ruby-lang.org/issues/16842
6788 */
6789 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
6790 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
6791 continue;
6792 }
6793 else {
6794 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6795 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6796 prev = p;
6797 continue;
6798 }
6799 }
6800 if (p > prev) str_buf_cat(result, prev, p - prev);
6801 str_buf_cat2(result, "\"");
6802
6803 return result;
6804}
6805
/* True when p lies before e and points at '$', '@' or '{'.
 * Note that p is evaluated more than once. */
#define IS_EVSTR(p,e) \
    ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6807
6808/*
6809 * call-seq:
6810 * dump -> string
6811 *
6812 * Returns a printable version of +self+, enclosed in double-quotes,
6813 * with special characters escaped, and with non-printing characters
6814 * replaced by hexadecimal notation:
6815 *
6816 * "hello \n ''".dump # => "\"hello \\n ''\""
6817 * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6818 *
6819 * Related: String#undump (inverse of String#dump).
6820 *
6821 */
6822
6823VALUE
6825{
6826 int encidx = rb_enc_get_index(str);
6827 rb_encoding *enc = rb_enc_from_index(encidx);
6828 long len;
6829 const char *p, *pend;
6830 char *q, *qend;
6831 VALUE result;
6832 int u8 = (encidx == rb_utf8_encindex());
6833 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
6834
6835 len = 2; /* "" */
6836 if (!rb_enc_asciicompat(enc)) {
6837 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
6838 len += strlen(enc->name);
6839 }
6840
6841 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6842 while (p < pend) {
6843 int clen;
6844 unsigned char c = *p++;
6845
6846 switch (c) {
6847 case '"': case '\\':
6848 case '\n': case '\r':
6849 case '\t': case '\f':
6850 case '\013': case '\010': case '\007': case '\033':
6851 clen = 2;
6852 break;
6853
6854 case '#':
6855 clen = IS_EVSTR(p, pend) ? 2 : 1;
6856 break;
6857
6858 default:
6859 if (ISPRINT(c)) {
6860 clen = 1;
6861 }
6862 else {
6863 if (u8 && c > 0x7F) { /* \u notation */
6864 int n = rb_enc_precise_mbclen(p-1, pend, enc);
6865 if (MBCLEN_CHARFOUND_P(n)) {
6866 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6867 if (cc <= 0xFFFF)
6868 clen = 6; /* \uXXXX */
6869 else if (cc <= 0xFFFFF)
6870 clen = 9; /* \u{XXXXX} */
6871 else
6872 clen = 10; /* \u{XXXXXX} */
6873 p += MBCLEN_CHARFOUND_LEN(n)-1;
6874 break;
6875 }
6876 }
6877 clen = 4; /* \xNN */
6878 }
6879 break;
6880 }
6881
6882 if (clen > LONG_MAX - len) {
6883 rb_raise(rb_eRuntimeError, "string size too big");
6884 }
6885 len += clen;
6886 }
6887
6888 result = rb_str_new(0, len);
6889 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6890 q = RSTRING_PTR(result); qend = q + len + 1;
6891
6892 *q++ = '"';
6893 while (p < pend) {
6894 unsigned char c = *p++;
6895
6896 if (c == '"' || c == '\\') {
6897 *q++ = '\\';
6898 *q++ = c;
6899 }
6900 else if (c == '#') {
6901 if (IS_EVSTR(p, pend)) *q++ = '\\';
6902 *q++ = '#';
6903 }
6904 else if (c == '\n') {
6905 *q++ = '\\';
6906 *q++ = 'n';
6907 }
6908 else if (c == '\r') {
6909 *q++ = '\\';
6910 *q++ = 'r';
6911 }
6912 else if (c == '\t') {
6913 *q++ = '\\';
6914 *q++ = 't';
6915 }
6916 else if (c == '\f') {
6917 *q++ = '\\';
6918 *q++ = 'f';
6919 }
6920 else if (c == '\013') {
6921 *q++ = '\\';
6922 *q++ = 'v';
6923 }
6924 else if (c == '\010') {
6925 *q++ = '\\';
6926 *q++ = 'b';
6927 }
6928 else if (c == '\007') {
6929 *q++ = '\\';
6930 *q++ = 'a';
6931 }
6932 else if (c == '\033') {
6933 *q++ = '\\';
6934 *q++ = 'e';
6935 }
6936 else if (ISPRINT(c)) {
6937 *q++ = c;
6938 }
6939 else {
6940 *q++ = '\\';
6941 if (u8) {
6942 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
6943 if (MBCLEN_CHARFOUND_P(n)) {
6944 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6945 p += n;
6946 if (cc <= 0xFFFF)
6947 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
6948 else
6949 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
6950 q += strlen(q);
6951 continue;
6952 }
6953 }
6954 snprintf(q, qend-q, "x%02X", c);
6955 q += 3;
6956 }
6957 }
6958 *q++ = '"';
6959 *q = '\0';
6960 if (!rb_enc_asciicompat(enc)) {
6961 snprintf(q, qend-q, nonascii_suffix, enc->name);
6962 encidx = rb_ascii8bit_encindex();
6963 }
6964 /* result from dump is ASCII */
6965 rb_enc_associate_index(result, encidx);
6967 return result;
6968}
6969
/*
 * Maps the letter that follows a backslash in a dumped string to the
 * control character it stands for ('n' -> newline, 'e' -> ESC, ...).
 * Returns -1 when +c+ is not one of the recognised escape letters, so
 * the function never falls off its end without a value.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
    }
    return -1;
}
6993
6994static void
6995undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
6996{
6997 const char *s = *ss;
6998 unsigned int c;
6999 int codelen;
7000 size_t hexlen;
7001 unsigned char buf[6];
7002 static rb_encoding *enc_utf8 = NULL;
7003
7004 switch (*s) {
7005 case '\\':
7006 case '"':
7007 case '#':
7008 rb_str_cat(undumped, s, 1); /* cat itself */
7009 s++;
7010 break;
7011 case 'n':
7012 case 'r':
7013 case 't':
7014 case 'f':
7015 case 'v':
7016 case 'b':
7017 case 'a':
7018 case 'e':
7019 *buf = unescape_ascii(*s);
7020 rb_str_cat(undumped, (char *)buf, 1);
7021 s++;
7022 break;
7023 case 'u':
7024 if (*binary) {
7025 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7026 }
7027 *utf8 = true;
7028 if (++s >= s_end) {
7029 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7030 }
7031 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7032 if (*penc != enc_utf8) {
7033 *penc = enc_utf8;
7034 rb_enc_associate(undumped, enc_utf8);
7035 }
7036 if (*s == '{') { /* handle \u{...} form */
7037 s++;
7038 for (;;) {
7039 if (s >= s_end) {
7040 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7041 }
7042 if (*s == '}') {
7043 s++;
7044 break;
7045 }
7046 if (ISSPACE(*s)) {
7047 s++;
7048 continue;
7049 }
7050 c = scan_hex(s, s_end-s, &hexlen);
7051 if (hexlen == 0 || hexlen > 6) {
7052 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7053 }
7054 if (c > 0x10ffff) {
7055 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7056 }
7057 if (0xd800 <= c && c <= 0xdfff) {
7058 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7059 }
7060 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7061 rb_str_cat(undumped, (char *)buf, codelen);
7062 s += hexlen;
7063 }
7064 }
7065 else { /* handle \uXXXX form */
7066 c = scan_hex(s, 4, &hexlen);
7067 if (hexlen != 4) {
7068 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7069 }
7070 if (0xd800 <= c && c <= 0xdfff) {
7071 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7072 }
7073 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7074 rb_str_cat(undumped, (char *)buf, codelen);
7075 s += hexlen;
7076 }
7077 break;
7078 case 'x':
7079 if (*utf8) {
7080 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7081 }
7082 *binary = true;
7083 if (++s >= s_end) {
7084 rb_raise(rb_eRuntimeError, "invalid hex escape");
7085 }
7086 *buf = scan_hex(s, 2, &hexlen);
7087 if (hexlen != 2) {
7088 rb_raise(rb_eRuntimeError, "invalid hex escape");
7089 }
7090 rb_str_cat(undumped, (char *)buf, 1);
7091 s += hexlen;
7092 break;
7093 default:
7094 rb_str_cat(undumped, s-1, 2);
7095 s++;
7096 }
7097
7098 *ss = s;
7099}
7100
7101static VALUE rb_str_is_ascii_only_p(VALUE str);
7102
7103/*
7104 * call-seq:
7105 * undump -> string
7106 *
7107 * Returns an unescaped version of +self+:
7108 *
7109 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
7110 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7111 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
7112 * s_undumped == s_orig # => true
7113 *
7114 * Related: String#dump (inverse of String#undump).
7115 *
7116 */
7117
7118static VALUE
7119str_undump(VALUE str)
7120{
7121 const char *s = RSTRING_PTR(str);
7122 const char *s_end = RSTRING_END(str);
7123 rb_encoding *enc = rb_enc_get(str);
7124 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7125 bool utf8 = false;
7126 bool binary = false;
7127 int w;
7128
7130 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7131 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7132 }
7133 if (!str_null_check(str, &w)) {
7134 rb_raise(rb_eRuntimeError, "string contains null byte");
7135 }
7136 if (RSTRING_LEN(str) < 2) goto invalid_format;
7137 if (*s != '"') goto invalid_format;
7138
7139 /* strip '"' at the start */
7140 s++;
7141
7142 for (;;) {
7143 if (s >= s_end) {
7144 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7145 }
7146
7147 if (*s == '"') {
7148 /* epilogue */
7149 s++;
7150 if (s == s_end) {
7151 /* ascii compatible dumped string */
7152 break;
7153 }
7154 else {
7155 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7156 static const char dup_suffix[] = ".dup";
7157 const char *encname;
7158 int encidx;
7159 ptrdiff_t size;
7160
7161 /* check separately for strings dumped by older versions */
7162 size = sizeof(dup_suffix) - 1;
7163 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7164
7165 size = sizeof(force_encoding_suffix) - 1;
7166 if (s_end - s <= size) goto invalid_format;
7167 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7168 s += size;
7169
7170 if (utf8) {
7171 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7172 }
7173
7174 encname = s;
7175 s = memchr(s, '"', s_end-s);
7176 size = s - encname;
7177 if (!s) goto invalid_format;
7178 if (s_end - s != 2) goto invalid_format;
7179 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7180
7181 encidx = rb_enc_find_index2(encname, (long)size);
7182 if (encidx < 0) {
7183 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7184 }
7185 rb_enc_associate_index(undumped, encidx);
7186 }
7187 break;
7188 }
7189
7190 if (*s == '\\') {
7191 s++;
7192 if (s >= s_end) {
7193 rb_raise(rb_eRuntimeError, "invalid escape");
7194 }
7195 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7196 }
7197 else {
7198 rb_str_cat(undumped, s++, 1);
7199 }
7200 }
7201
7202 return undumped;
7203invalid_format:
7204 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7205}
7206
7207static void
7208rb_str_check_dummy_enc(rb_encoding *enc)
7209{
7210 if (rb_enc_dummy_p(enc)) {
7211 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7212 rb_enc_name(enc));
7213 }
7214}
7215
7216static rb_encoding *
7217str_true_enc(VALUE str)
7218{
7219 rb_encoding *enc = STR_ENC_GET(str);
7220 rb_str_check_dummy_enc(enc);
7221 return enc;
7222}
7223
7224static OnigCaseFoldType
7225check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7226{
7227 if (argc==0)
7228 return flags;
7229 if (argc>2)
7230 rb_raise(rb_eArgError, "too many options");
7231 if (argv[0]==sym_turkic) {
7232 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7233 if (argc==2) {
7234 if (argv[1]==sym_lithuanian)
7235 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7236 else
7237 rb_raise(rb_eArgError, "invalid second option");
7238 }
7239 }
7240 else if (argv[0]==sym_lithuanian) {
7241 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7242 if (argc==2) {
7243 if (argv[1]==sym_turkic)
7244 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7245 else
7246 rb_raise(rb_eArgError, "invalid second option");
7247 }
7248 }
7249 else if (argc>1)
7250 rb_raise(rb_eArgError, "too many options");
7251 else if (argv[0]==sym_ascii)
7252 flags |= ONIGENC_CASE_ASCII_ONLY;
7253 else if (argv[0]==sym_fold) {
7254 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7255 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7256 else
7257 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7258 }
7259 else
7260 rb_raise(rb_eArgError, "invalid option");
7261 return flags;
7262}
7263
7264static inline bool
7265case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7266{
7267 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7268 return true;
7269 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7270}
7271
7272/* 16 should be long enough to absorb any kind of single character length increase */
7273#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7274#ifndef CASEMAP_DEBUG
7275# define CASEMAP_DEBUG 0
7276#endif
7277
7278struct mapping_buffer;
7279typedef struct mapping_buffer {
7280 size_t capa;
7281 size_t used;
7282 struct mapping_buffer *next;
7283 OnigUChar space[FLEX_ARY_LEN];
7285
7286static void
7287mapping_buffer_free(void *p)
7288{
7289 mapping_buffer *previous_buffer;
7290 mapping_buffer *current_buffer = p;
7291 while (current_buffer) {
7292 previous_buffer = current_buffer;
7293 current_buffer = current_buffer->next;
7294 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7295 }
7296}
7297
7298static const rb_data_type_t mapping_buffer_type = {
7299 "mapping_buffer",
7300 {0, mapping_buffer_free,}
7301};
7302
7303static VALUE
7304rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7305{
7306 VALUE target;
7307
7308 const OnigUChar *source_current, *source_end;
7309 int target_length = 0;
7310 VALUE buffer_anchor;
7311 mapping_buffer *current_buffer = 0;
7312 mapping_buffer **pre_buffer;
7313 size_t buffer_count = 0;
7314 int buffer_length_or_invalid;
7315
7316 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7317
7318 source_current = (OnigUChar*)RSTRING_PTR(source);
7319 source_end = (OnigUChar*)RSTRING_END(source);
7320
7321 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7322 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7323 while (source_current < source_end) {
7324 /* increase multiplier using buffer count to converge quickly */
7325 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7326 if (CASEMAP_DEBUG) {
7327 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7328 }
7329 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7330 *pre_buffer = current_buffer;
7331 pre_buffer = &current_buffer->next;
7332 current_buffer->next = NULL;
7333 current_buffer->capa = capa;
7334 buffer_length_or_invalid = enc->case_map(flags,
7335 &source_current, source_end,
7336 current_buffer->space,
7337 current_buffer->space+current_buffer->capa,
7338 enc);
7339 if (buffer_length_or_invalid < 0) {
7340 current_buffer = DATA_PTR(buffer_anchor);
7341 DATA_PTR(buffer_anchor) = 0;
7342 mapping_buffer_free(current_buffer);
7343 rb_raise(rb_eArgError, "input string invalid");
7344 }
7345 target_length += current_buffer->used = buffer_length_or_invalid;
7346 }
7347 if (CASEMAP_DEBUG) {
7348 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7349 }
7350
7351 if (buffer_count==1) {
7352 target = rb_str_new((const char*)current_buffer->space, target_length);
7353 }
7354 else {
7355 char *target_current;
7356
7357 target = rb_str_new(0, target_length);
7358 target_current = RSTRING_PTR(target);
7359 current_buffer = DATA_PTR(buffer_anchor);
7360 while (current_buffer) {
7361 memcpy(target_current, current_buffer->space, current_buffer->used);
7362 target_current += current_buffer->used;
7363 current_buffer = current_buffer->next;
7364 }
7365 }
7366 current_buffer = DATA_PTR(buffer_anchor);
7367 DATA_PTR(buffer_anchor) = 0;
7368 mapping_buffer_free(current_buffer);
7369
7370 RB_GC_GUARD(buffer_anchor);
7371
7372 /* TODO: check about string terminator character */
7373 str_enc_copy(target, source);
7374 /*ENC_CODERANGE_SET(mapped, cr);*/
7375
7376 return target;
7377}
7378
7379static VALUE
7380rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7381{
7382 const OnigUChar *source_current, *source_end;
7383 OnigUChar *target_current, *target_end;
7384 long old_length = RSTRING_LEN(source);
7385 int length_or_invalid;
7386
7387 if (old_length == 0) return Qnil;
7388
7389 source_current = (OnigUChar*)RSTRING_PTR(source);
7390 source_end = (OnigUChar*)RSTRING_END(source);
7391 if (source == target) {
7392 target_current = (OnigUChar*)source_current;
7393 target_end = (OnigUChar*)source_end;
7394 }
7395 else {
7396 target_current = (OnigUChar*)RSTRING_PTR(target);
7397 target_end = (OnigUChar*)RSTRING_END(target);
7398 }
7399
7400 length_or_invalid = onigenc_ascii_only_case_map(flags,
7401 &source_current, source_end,
7402 target_current, target_end, enc);
7403 if (length_or_invalid < 0)
7404 rb_raise(rb_eArgError, "input string invalid");
7405 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7406 fprintf(stderr, "problem with rb_str_ascii_casemap"
7407 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7408 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7409 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7410 }
7411
7412 str_enc_copy(target, source);
7413
7414 return target;
7415}
7416
7417static bool
7418upcase_single(VALUE str)
7419{
7420 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7421 bool modified = false;
7422
7423 while (s < send) {
7424 unsigned int c = *(unsigned char*)s;
7425
7426 if ('a' <= c && c <= 'z') {
7427 *s = 'A' + (c - 'a');
7428 modified = true;
7429 }
7430 s++;
7431 }
7432 return modified;
7433}
7434
7435/*
7436 * call-seq:
7437 * upcase!(*options) -> self or nil
7438 *
7439 * Upcases the characters in +self+;
7440 * returns +self+ if any changes were made, +nil+ otherwise:
7441 *
7442 * s = 'Hello World!' # => "Hello World!"
7443 * s.upcase! # => "HELLO WORLD!"
7444 * s # => "HELLO WORLD!"
7445 * s.upcase! # => nil
7446 *
7447 * The casing may be affected by the given +options+;
7448 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7449 *
7450 * Related: String#upcase, String#downcase, String#downcase!.
7451 *
7452 */
7453
7454static VALUE
7455rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7456{
7457 rb_encoding *enc;
7458 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7459
7460 flags = check_case_options(argc, argv, flags);
7461 str_modify_keep_cr(str);
7462 enc = str_true_enc(str);
7463 if (case_option_single_p(flags, enc, str)) {
7464 if (upcase_single(str))
7465 flags |= ONIGENC_CASE_MODIFIED;
7466 }
7467 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7468 rb_str_ascii_casemap(str, str, &flags, enc);
7469 else
7470 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7471
7472 if (ONIGENC_CASE_MODIFIED&flags) return str;
7473 return Qnil;
7474}
7475
7476
7477/*
7478 * call-seq:
7479 * upcase(*options) -> string
7480 *
7481 * Returns a string containing the upcased characters in +self+:
7482 *
7483 * s = 'Hello World!' # => "Hello World!"
7484 * s.upcase # => "HELLO WORLD!"
7485 *
7486 * The casing may be affected by the given +options+;
7487 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7488 *
7489 * Related: String#upcase!, String#downcase, String#downcase!.
7490 *
7491 */
7492
7493static VALUE
7494rb_str_upcase(int argc, VALUE *argv, VALUE str)
7495{
7496 rb_encoding *enc;
7497 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7498 VALUE ret;
7499
7500 flags = check_case_options(argc, argv, flags);
7501 enc = str_true_enc(str);
7502 if (case_option_single_p(flags, enc, str)) {
7503 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7504 str_enc_copy(ret, str);
7505 upcase_single(ret);
7506 }
7507 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7508 ret = rb_str_new(0, RSTRING_LEN(str));
7509 rb_str_ascii_casemap(str, ret, &flags, enc);
7510 }
7511 else {
7512 ret = rb_str_casemap(str, &flags, enc);
7513 }
7514
7515 return ret;
7516}
7517
7518static bool
7519downcase_single(VALUE str)
7520{
7521 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7522 bool modified = false;
7523
7524 while (s < send) {
7525 unsigned int c = *(unsigned char*)s;
7526
7527 if ('A' <= c && c <= 'Z') {
7528 *s = 'a' + (c - 'A');
7529 modified = true;
7530 }
7531 s++;
7532 }
7533
7534 return modified;
7535}
7536
7537/*
7538 * call-seq:
7539 * downcase!(*options) -> self or nil
7540 *
7541 * Downcases the characters in +self+;
7542 * returns +self+ if any changes were made, +nil+ otherwise:
7543 *
7544 * s = 'Hello World!' # => "Hello World!"
7545 * s.downcase! # => "hello world!"
7546 * s # => "hello world!"
7547 * s.downcase! # => nil
7548 *
7549 * The casing may be affected by the given +options+;
7550 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7551 *
7552 * Related: String#downcase, String#upcase, String#upcase!.
7553 *
7554 */
7555
7556static VALUE
7557rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
7558{
7559 rb_encoding *enc;
7560 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7561
7562 flags = check_case_options(argc, argv, flags);
7563 str_modify_keep_cr(str);
7564 enc = str_true_enc(str);
7565 if (case_option_single_p(flags, enc, str)) {
7566 if (downcase_single(str))
7567 flags |= ONIGENC_CASE_MODIFIED;
7568 }
7569 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7570 rb_str_ascii_casemap(str, str, &flags, enc);
7571 else
7572 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7573
7574 if (ONIGENC_CASE_MODIFIED&flags) return str;
7575 return Qnil;
7576}
7577
7578
7579/*
7580 * call-seq:
7581 * downcase(*options) -> string
7582 *
7583 * Returns a string containing the downcased characters in +self+:
7584 *
7585 * s = 'Hello World!' # => "Hello World!"
7586 * s.downcase # => "hello world!"
7587 *
7588 * The casing may be affected by the given +options+;
7589 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7590 *
7591 * Related: String#downcase!, String#upcase, String#upcase!.
7592 *
7593 */
7594
7595static VALUE
7596rb_str_downcase(int argc, VALUE *argv, VALUE str)
7597{
7598 rb_encoding *enc;
7599 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7600 VALUE ret;
7601
7602 flags = check_case_options(argc, argv, flags);
7603 enc = str_true_enc(str);
7604 if (case_option_single_p(flags, enc, str)) {
7605 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7606 str_enc_copy(ret, str);
7607 downcase_single(ret);
7608 }
7609 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7610 ret = rb_str_new(0, RSTRING_LEN(str));
7611 rb_str_ascii_casemap(str, ret, &flags, enc);
7612 }
7613 else {
7614 ret = rb_str_casemap(str, &flags, enc);
7615 }
7616
7617 return ret;
7618}
7619
7620
7621/*
7622 * call-seq:
7623 * capitalize!(*options) -> self or nil
7624 *
7625 * Upcases the first character in +self+;
7626 * downcases the remaining characters;
7627 * returns +self+ if any changes were made, +nil+ otherwise:
7628 *
7629 * s = 'hello World!' # => "hello World!"
7630 * s.capitalize! # => "Hello world!"
7631 * s # => "Hello world!"
7632 * s.capitalize! # => nil
7633 *
7634 * The casing may be affected by the given +options+;
7635 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7636 *
7637 * Related: String#capitalize.
7638 *
7639 */
7640
7641static VALUE
7642rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
7643{
7644 rb_encoding *enc;
7645 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7646
7647 flags = check_case_options(argc, argv, flags);
7648 str_modify_keep_cr(str);
7649 enc = str_true_enc(str);
7650 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7651 if (flags&ONIGENC_CASE_ASCII_ONLY)
7652 rb_str_ascii_casemap(str, str, &flags, enc);
7653 else
7654 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7655
7656 if (ONIGENC_CASE_MODIFIED&flags) return str;
7657 return Qnil;
7658}
7659
7660
7661/*
7662 * call-seq:
7663 * capitalize(*options) -> string
7664 *
7665 * Returns a string containing the characters in +self+;
7666 * the first character is upcased;
7667 * the remaining characters are downcased:
7668 *
7669 * s = 'hello World!' # => "hello World!"
7670 * s.capitalize # => "Hello world!"
7671 *
7672 * The casing may be affected by the given +options+;
7673 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7674 *
7675 * Related: String#capitalize!.
7676 *
7677 */
7678
7679static VALUE
7680rb_str_capitalize(int argc, VALUE *argv, VALUE str)
7681{
7682 rb_encoding *enc;
7683 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7684 VALUE ret;
7685
7686 flags = check_case_options(argc, argv, flags);
7687 enc = str_true_enc(str);
7688 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
7689 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7690 ret = rb_str_new(0, RSTRING_LEN(str));
7691 rb_str_ascii_casemap(str, ret, &flags, enc);
7692 }
7693 else {
7694 ret = rb_str_casemap(str, &flags, enc);
7695 }
7696 return ret;
7697}
7698
7699
7700/*
7701 * call-seq:
7702 * swapcase!(*options) -> self or nil
7703 *
7704 * Upcases each lowercase character in +self+;
 *  downcases each uppercase character;
7706 * returns +self+ if any changes were made, +nil+ otherwise:
7707 *
7708 * s = 'Hello World!' # => "Hello World!"
7709 * s.swapcase! # => "hELLO wORLD!"
7710 * s # => "hELLO wORLD!"
7711 * ''.swapcase! # => nil
7712 *
7713 * The casing may be affected by the given +options+;
7714 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7715 *
7716 * Related: String#swapcase.
7717 *
7718 */
7719
7720static VALUE
7721rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
7722{
7723 rb_encoding *enc;
7724 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7725
7726 flags = check_case_options(argc, argv, flags);
7727 str_modify_keep_cr(str);
7728 enc = str_true_enc(str);
7729 if (flags&ONIGENC_CASE_ASCII_ONLY)
7730 rb_str_ascii_casemap(str, str, &flags, enc);
7731 else
7732 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7733
7734 if (ONIGENC_CASE_MODIFIED&flags) return str;
7735 return Qnil;
7736}
7737
7738
7739/*
7740 * call-seq:
7741 * swapcase(*options) -> string
7742 *
7743 * Returns a string containing the characters in +self+, with cases reversed;
7744 * each uppercase character is downcased;
7745 * each lowercase character is upcased:
7746 *
7747 * s = 'Hello World!' # => "Hello World!"
7748 * s.swapcase # => "hELLO wORLD!"
7749 *
7750 * The casing may be affected by the given +options+;
7751 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7752 *
7753 * Related: String#swapcase!.
7754 *
7755 */
7756
7757static VALUE
7758rb_str_swapcase(int argc, VALUE *argv, VALUE str)
7759{
7760 rb_encoding *enc;
7761 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7762 VALUE ret;
7763
7764 flags = check_case_options(argc, argv, flags);
7765 enc = str_true_enc(str);
7766 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
7767 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7768 ret = rb_str_new(0, RSTRING_LEN(str));
7769 rb_str_ascii_casemap(str, ret, &flags, enc);
7770 }
7771 else {
7772 ret = rb_str_casemap(str, &flags, enc);
7773 }
7774 return ret;
7775}
7776
typedef unsigned char *USTR;

/* Cursor over a tr-style character selector; advanced by trnext(). */
struct tr {
    int gen;            /* non-zero while a "lo-hi" range is being expanded */
    unsigned int now;   /* code point most recently produced */
    unsigned int max;   /* upper end of the range being expanded */
    char *p;            /* next unread byte of the selector */
    char *pend;         /* end of the selector */
};
7784
7785static unsigned int
7786trnext(struct tr *t, rb_encoding *enc)
7787{
7788 int n;
7789
7790 for (;;) {
7791 nextpart:
7792 if (!t->gen) {
7793 if (t->p == t->pend) return -1;
7794 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
7795 t->p += n;
7796 }
7797 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7798 t->p += n;
7799 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
7800 t->p += n;
7801 if (t->p < t->pend) {
7802 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7803 t->p += n;
7804 if (t->now > c) {
7805 if (t->now < 0x80 && c < 0x80) {
7807 "invalid range \"%c-%c\" in string transliteration",
7808 t->now, c);
7809 }
7810 else {
7811 rb_raise(rb_eArgError, "invalid range in string transliteration");
7812 }
7813 continue; /* not reached */
7814 }
7815 t->gen = 1;
7816 t->max = c;
7817 }
7818 }
7819 return t->now;
7820 }
7821 else {
7822 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
7823 if (t->now == t->max) {
7824 t->gen = 0;
7825 goto nextpart;
7826 }
7827 }
7828 if (t->now < t->max) {
7829 return t->now;
7830 }
7831 else {
7832 t->gen = 0;
7833 return t->max;
7834 }
7835 }
7836 }
7837}
7838
7839static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
7840
7841static VALUE
7842tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
7843{
7844 const unsigned int errc = -1;
7845 unsigned int trans[256];
7846 rb_encoding *enc, *e1, *e2;
7847 struct tr trsrc, trrepl;
7848 int cflag = 0;
7849 unsigned int c, c0, last = 0;
7850 int modify = 0, i, l;
7851 unsigned char *s, *send;
7852 VALUE hash = 0;
7853 int singlebyte = single_byte_optimizable(str);
7854 int termlen;
7855 int cr;
7856
7857#define CHECK_IF_ASCII(c) \
7858 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
7859 (cr = ENC_CODERANGE_VALID) : 0)
7860
7861 StringValue(src);
7862 StringValue(repl);
7863 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7864 if (RSTRING_LEN(repl) == 0) {
7865 return rb_str_delete_bang(1, &src, str);
7866 }
7867
7868 cr = ENC_CODERANGE(str);
7869 e1 = rb_enc_check(str, src);
7870 e2 = rb_enc_check(str, repl);
7871 if (e1 == e2) {
7872 enc = e1;
7873 }
7874 else {
7875 enc = rb_enc_check(src, repl);
7876 }
7877 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
7878 if (RSTRING_LEN(src) > 1 &&
7879 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
7880 trsrc.p + l < trsrc.pend) {
7881 cflag = 1;
7882 trsrc.p += l;
7883 }
7884 trrepl.p = RSTRING_PTR(repl);
7885 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
7886 trsrc.gen = trrepl.gen = 0;
7887 trsrc.now = trrepl.now = 0;
7888 trsrc.max = trrepl.max = 0;
7889
7890 if (cflag) {
7891 for (i=0; i<256; i++) {
7892 trans[i] = 1;
7893 }
7894 while ((c = trnext(&trsrc, enc)) != errc) {
7895 if (c < 256) {
7896 trans[c] = errc;
7897 }
7898 else {
7899 if (!hash) hash = rb_hash_new();
7900 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
7901 }
7902 }
7903 while ((c = trnext(&trrepl, enc)) != errc)
7904 /* retrieve last replacer */;
7905 last = trrepl.now;
7906 for (i=0; i<256; i++) {
7907 if (trans[i] != errc) {
7908 trans[i] = last;
7909 }
7910 }
7911 }
7912 else {
7913 unsigned int r;
7914
7915 for (i=0; i<256; i++) {
7916 trans[i] = errc;
7917 }
7918 while ((c = trnext(&trsrc, enc)) != errc) {
7919 r = trnext(&trrepl, enc);
7920 if (r == errc) r = trrepl.now;
7921 if (c < 256) {
7922 trans[c] = r;
7923 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
7924 }
7925 else {
7926 if (!hash) hash = rb_hash_new();
7927 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
7928 }
7929 }
7930 }
7931
7932 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
7933 cr = ENC_CODERANGE_7BIT;
7934 str_modify_keep_cr(str);
7935 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
7936 termlen = rb_enc_mbminlen(enc);
7937 if (sflag) {
7938 int clen, tlen;
7939 long offset, max = RSTRING_LEN(str);
7940 unsigned int save = -1;
7941 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7942
7943 while (s < send) {
7944 int may_modify = 0;
7945
7946 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7947 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7948
7949 s += clen;
7950 if (c < 256) {
7951 c = trans[c];
7952 }
7953 else if (hash) {
7954 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7955 if (NIL_P(tmp)) {
7956 if (cflag) c = last;
7957 else c = errc;
7958 }
7959 else if (cflag) c = errc;
7960 else c = NUM2INT(tmp);
7961 }
7962 else {
7963 c = errc;
7964 }
7965 if (c != (unsigned int)-1) {
7966 if (save == c) {
7967 CHECK_IF_ASCII(c);
7968 continue;
7969 }
7970 save = c;
7971 tlen = rb_enc_codelen(c, enc);
7972 modify = 1;
7973 }
7974 else {
7975 save = -1;
7976 c = c0;
7977 if (enc != e1) may_modify = 1;
7978 }
7979 if ((offset = t - buf) + tlen > max) {
7980 size_t MAYBE_UNUSED(old) = max + termlen;
7981 max = offset + tlen + (send - s);
7982 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7983 t = buf + offset;
7984 }
7985 rb_enc_mbcput(c, t, enc);
7986 if (may_modify && memcmp(s, t, tlen) != 0) {
7987 modify = 1;
7988 }
7989 CHECK_IF_ASCII(c);
7990 t += tlen;
7991 }
7992 if (!STR_EMBED_P(str)) {
7993 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
7994 }
7995 TERM_FILL((char *)t, termlen);
7996 RSTRING(str)->as.heap.ptr = (char *)buf;
7997 RSTRING(str)->as.heap.len = t - buf;
7998 STR_SET_NOEMBED(str);
7999 RSTRING(str)->as.heap.aux.capa = max;
8000 }
8001 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8002 while (s < send) {
8003 c = (unsigned char)*s;
8004 if (trans[c] != errc) {
8005 if (!cflag) {
8006 c = trans[c];
8007 *s = c;
8008 modify = 1;
8009 }
8010 else {
8011 *s = last;
8012 modify = 1;
8013 }
8014 }
8015 CHECK_IF_ASCII(c);
8016 s++;
8017 }
8018 }
8019 else {
8020 int clen, tlen;
8021 long offset, max = (long)((send - s) * 1.2);
8022 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8023
8024 while (s < send) {
8025 int may_modify = 0;
8026 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
8027 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8028
8029 if (c < 256) {
8030 c = trans[c];
8031 }
8032 else if (hash) {
8033 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8034 if (NIL_P(tmp)) {
8035 if (cflag) c = last;
8036 else c = errc;
8037 }
8038 else if (cflag) c = errc;
8039 else c = NUM2INT(tmp);
8040 }
8041 else {
8042 c = cflag ? last : errc;
8043 }
8044 if (c != errc) {
8045 tlen = rb_enc_codelen(c, enc);
8046 modify = 1;
8047 }
8048 else {
8049 c = c0;
8050 if (enc != e1) may_modify = 1;
8051 }
8052 if ((offset = t - buf) + tlen > max) {
8053 size_t MAYBE_UNUSED(old) = max + termlen;
8054 max = offset + tlen + (long)((send - s) * 1.2);
8055 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8056 t = buf + offset;
8057 }
8058 if (s != t) {
8059 rb_enc_mbcput(c, t, enc);
8060 if (may_modify && memcmp(s, t, tlen) != 0) {
8061 modify = 1;
8062 }
8063 }
8064 CHECK_IF_ASCII(c);
8065 s += clen;
8066 t += tlen;
8067 }
8068 if (!STR_EMBED_P(str)) {
8069 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8070 }
8071 TERM_FILL((char *)t, termlen);
8072 RSTRING(str)->as.heap.ptr = (char *)buf;
8073 RSTRING(str)->as.heap.len = t - buf;
8074 STR_SET_NOEMBED(str);
8075 RSTRING(str)->as.heap.aux.capa = max;
8076 }
8077
8078 if (modify) {
8079 if (cr != ENC_CODERANGE_BROKEN)
8080 ENC_CODERANGE_SET(str, cr);
8081 rb_enc_associate(str, enc);
8082 return str;
8083 }
8084 return Qnil;
8085}
8086
8087
8088/*
8089 * call-seq:
8090 * tr!(selector, replacements) -> self or nil
8091 *
8092 * Like String#tr, but modifies +self+ in place.
8093 * Returns +self+ if any changes were made, +nil+ otherwise.
8094 *
8095 */
8096
8097static VALUE
8098rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8099{
8100 return tr_trans(str, src, repl, 0);
8101}
8102
8103
8104/*
8105 * call-seq:
8106 * tr(selector, replacements) -> new_string
8107 *
8108 * Returns a copy of +self+ with each character specified by string +selector+
8109 * translated to the corresponding character in string +replacements+.
8110 * The correspondence is _positional_:
8111 *
8112 * - Each occurrence of the first character specified by +selector+
8113 * is translated to the first character in +replacements+.
8114 * - Each occurrence of the second character specified by +selector+
8115 * is translated to the second character in +replacements+.
8116 * - And so on.
8117 *
8118 * Example:
8119 *
8120 * 'hello'.tr('el', 'ip') #=> "hippo"
8121 *
8122 * If +replacements+ is shorter than +selector+,
8123 * it is implicitly padded with its own last character:
8124 *
8125 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8126 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8127 *
8128 * Arguments +selector+ and +replacements+ must be valid character selectors
8129 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8130 * and may use any of its valid forms, including negation, ranges, and escaping:
8131 *
8132 * # Negation.
8133 * 'hello'.tr('^aeiou', '-') # => "-e--o"
8134 * # Ranges.
8135 * 'ibm'.tr('b-z', 'a-z') # => "hal"
8136 * # Escapes.
8137 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8138 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8139 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8140 *
8141 */
8142
8143static VALUE
8144rb_str_tr(VALUE str, VALUE src, VALUE repl)
8145{
8146 str = str_duplicate(rb_cString, str);
8147 tr_trans(str, src, repl, 0);
8148 return str;
8149}
8150
8151#define TR_TABLE_MAX (UCHAR_MAX+1)
8152#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8153static void
8154tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8155 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8156{
8157 const unsigned int errc = -1;
8158 char buf[TR_TABLE_MAX];
8159 struct tr tr;
8160 unsigned int c;
8161 VALUE table = 0, ptable = 0;
8162 int i, l, cflag = 0;
8163
8164 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8165 tr.gen = tr.now = tr.max = 0;
8166
8167 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8168 cflag = 1;
8169 tr.p += l;
8170 }
8171 if (first) {
8172 for (i=0; i<TR_TABLE_MAX; i++) {
8173 stable[i] = 1;
8174 }
8175 stable[TR_TABLE_MAX] = cflag;
8176 }
8177 else if (stable[TR_TABLE_MAX] && !cflag) {
8178 stable[TR_TABLE_MAX] = 0;
8179 }
8180 for (i=0; i<TR_TABLE_MAX; i++) {
8181 buf[i] = cflag;
8182 }
8183
8184 while ((c = trnext(&tr, enc)) != errc) {
8185 if (c < TR_TABLE_MAX) {
8186 buf[(unsigned char)c] = !cflag;
8187 }
8188 else {
8189 VALUE key = UINT2NUM(c);
8190
8191 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8192 if (cflag) {
8193 ptable = *ctablep;
8194 table = ptable ? ptable : rb_hash_new();
8195 *ctablep = table;
8196 }
8197 else {
8198 table = rb_hash_new();
8199 ptable = *tablep;
8200 *tablep = table;
8201 }
8202 }
8203 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8204 rb_hash_aset(table, key, Qtrue);
8205 }
8206 }
8207 }
8208 for (i=0; i<TR_TABLE_MAX; i++) {
8209 stable[i] = stable[i] && buf[i];
8210 }
8211 if (!table && !cflag) {
8212 *tablep = 0;
8213 }
8214}
8215
8216
8217static int
8218tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8219{
8220 if (c < TR_TABLE_MAX) {
8221 return table[c] != 0;
8222 }
8223 else {
8224 VALUE v = UINT2NUM(c);
8225
8226 if (del) {
8227 if (!NIL_P(rb_hash_lookup(del, v)) &&
8228 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8229 return TRUE;
8230 }
8231 }
8232 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8233 return FALSE;
8234 }
8235 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8236 }
8237}
8238
8239/*
8240 * call-seq:
8241 * delete!(*selectors) -> self or nil
8242 *
8243 * Like String#delete, but modifies +self+ in place.
8244 * Returns +self+ if any changes were made, +nil+ otherwise.
8245 *
8246 */
8247
8248static VALUE
8249rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8250{
8251 char squeez[TR_TABLE_SIZE];
8252 rb_encoding *enc = 0;
8253 char *s, *send, *t;
8254 VALUE del = 0, nodel = 0;
8255 int modify = 0;
8256 int i, ascompat, cr;
8257
8258 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8260 for (i=0; i<argc; i++) {
8261 VALUE s = argv[i];
8262
8263 StringValue(s);
8264 enc = rb_enc_check(str, s);
8265 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8266 }
8267
8268 str_modify_keep_cr(str);
8269 ascompat = rb_enc_asciicompat(enc);
8270 s = t = RSTRING_PTR(str);
8271 send = RSTRING_END(str);
8272 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8273 while (s < send) {
8274 unsigned int c;
8275 int clen;
8276
8277 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8278 if (squeez[c]) {
8279 modify = 1;
8280 }
8281 else {
8282 if (t != s) *t = c;
8283 t++;
8284 }
8285 s++;
8286 }
8287 else {
8288 c = rb_enc_codepoint_len(s, send, &clen, enc);
8289
8290 if (tr_find(c, squeez, del, nodel)) {
8291 modify = 1;
8292 }
8293 else {
8294 if (t != s) rb_enc_mbcput(c, t, enc);
8295 t += clen;
8297 }
8298 s += clen;
8299 }
8300 }
8301 TERM_FILL(t, TERM_LEN(str));
8302 STR_SET_LEN(str, t - RSTRING_PTR(str));
8303 ENC_CODERANGE_SET(str, cr);
8304
8305 if (modify) return str;
8306 return Qnil;
8307}
8308
8309
8310/*
8311 * call-seq:
8312 * delete(*selectors) -> new_string
8313 *
8314 * Returns a copy of +self+ with characters specified by +selectors+ removed
8315 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8316 *
8317 * "hello".delete "l","lo" #=> "heo"
8318 * "hello".delete "lo" #=> "he"
8319 * "hello".delete "aeiou", "^e" #=> "hell"
8320 * "hello".delete "ej-m" #=> "ho"
8321 *
8322 */
8323
8324static VALUE
8325rb_str_delete(int argc, VALUE *argv, VALUE str)
8326{
8327 str = str_duplicate(rb_cString, str);
8328 rb_str_delete_bang(argc, argv, str);
8329 return str;
8330}
8331
8332
8333/*
8334 * call-seq:
8335 * squeeze!(*selectors) -> self or nil
8336 *
8337 * Like String#squeeze, but modifies +self+ in place.
8338 * Returns +self+ if any changes were made, +nil+ otherwise.
8339 */
8340
8341static VALUE
8342rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8343{
8344 char squeez[TR_TABLE_SIZE];
8345 rb_encoding *enc = 0;
8346 VALUE del = 0, nodel = 0;
8347 unsigned char *s, *send, *t;
8348 int i, modify = 0;
8349 int ascompat, singlebyte = single_byte_optimizable(str);
8350 unsigned int save;
8351
8352 if (argc == 0) {
8353 enc = STR_ENC_GET(str);
8354 }
8355 else {
8356 for (i=0; i<argc; i++) {
8357 VALUE s = argv[i];
8358
8359 StringValue(s);
8360 enc = rb_enc_check(str, s);
8361 if (singlebyte && !single_byte_optimizable(s))
8362 singlebyte = 0;
8363 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8364 }
8365 }
8366
8367 str_modify_keep_cr(str);
8368 s = t = (unsigned char *)RSTRING_PTR(str);
8369 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8370 send = (unsigned char *)RSTRING_END(str);
8371 save = -1;
8372 ascompat = rb_enc_asciicompat(enc);
8373
8374 if (singlebyte) {
8375 while (s < send) {
8376 unsigned int c = *s++;
8377 if (c != save || (argc > 0 && !squeez[c])) {
8378 *t++ = save = c;
8379 }
8380 }
8381 }
8382 else {
8383 while (s < send) {
8384 unsigned int c;
8385 int clen;
8386
8387 if (ascompat && (c = *s) < 0x80) {
8388 if (c != save || (argc > 0 && !squeez[c])) {
8389 *t++ = save = c;
8390 }
8391 s++;
8392 }
8393 else {
8394 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8395
8396 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8397 if (t != s) rb_enc_mbcput(c, t, enc);
8398 save = c;
8399 t += clen;
8400 }
8401 s += clen;
8402 }
8403 }
8404 }
8405
8406 TERM_FILL((char *)t, TERM_LEN(str));
8407 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8408 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8409 modify = 1;
8410 }
8411
8412 if (modify) return str;
8413 return Qnil;
8414}
8415
8416
8417/*
8418 * call-seq:
8419 * squeeze(*selectors) -> new_string
8420 *
8421 * Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
8422 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8423 *
8424 * "Squeezed" means that each multiple-character run of a selected character
8425 * is squeezed down to a single character;
8426 * with no arguments given, squeezes all characters:
8427 *
8428 * "yellow moon".squeeze #=> "yelow mon"
8429 * " now is the".squeeze(" ") #=> " now is the"
8430 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
8431 *
8432 */
8433
8434static VALUE
8435rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8436{
8437 str = str_duplicate(rb_cString, str);
8438 rb_str_squeeze_bang(argc, argv, str);
8439 return str;
8440}
8441
8442
8443/*
8444 * call-seq:
8445 * tr_s!(selector, replacements) -> self or nil
8446 *
8447 * Like String#tr_s, but modifies +self+ in place.
8448 * Returns +self+ if any changes were made, +nil+ otherwise.
8449 *
8450 * Related: String#squeeze!.
8451 */
8452
8453static VALUE
8454rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8455{
8456 return tr_trans(str, src, repl, 1);
8457}
8458
8459
8460/*
8461 * call-seq:
8462 * tr_s(selector, replacements) -> string
8463 *
8464 * Like String#tr, but also squeezes the modified portions of the translated string;
8465 * returns a new string (translated and squeezed).
8466 *
8467 * 'hello'.tr_s('l', 'r') #=> "hero"
8468 * 'hello'.tr_s('el', '-') #=> "h-o"
8469 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8470 *
8471 * Related: String#squeeze.
8472 *
8473 */
8474
8475static VALUE
8476rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8477{
8478 str = str_duplicate(rb_cString, str);
8479 tr_trans(str, src, repl, 1);
8480 return str;
8481}
8482
8483
8484/*
8485 * call-seq:
8486 * count(*selectors) -> integer
8487 *
8488 * Returns the total number of characters in +self+
8489 * that are specified by the given +selectors+
8490 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8491 *
8492 * a = "hello world"
8493 * a.count "lo" #=> 5
8494 * a.count "lo", "o" #=> 2
8495 * a.count "hello", "^l" #=> 4
8496 * a.count "ej-m" #=> 4
8497 *
8498 * "hello^world".count "\\^aeiou" #=> 4
8499 * "hello-world".count "a\\-eo" #=> 4
8500 *
8501 * c = "hello world\\r\\n"
8502 * c.count "\\" #=> 2
8503 * c.count "\\A" #=> 0
8504 * c.count "X-\\w" #=> 3
8505 */
8506
8507static VALUE
8508rb_str_count(int argc, VALUE *argv, VALUE str)
8509{
8510 char table[TR_TABLE_SIZE];
8511 rb_encoding *enc = 0;
8512 VALUE del = 0, nodel = 0, tstr;
8513 char *s, *send;
8514 int i;
8515 int ascompat;
8516 size_t n = 0;
8517
8519
8520 tstr = argv[0];
8521 StringValue(tstr);
8522 enc = rb_enc_check(str, tstr);
8523 if (argc == 1) {
8524 const char *ptstr;
8525 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8526 (ptstr = RSTRING_PTR(tstr),
8527 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
8528 !is_broken_string(str)) {
8529 int clen;
8530 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8531
8532 s = RSTRING_PTR(str);
8533 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8534 send = RSTRING_END(str);
8535 while (s < send) {
8536 if (*(unsigned char*)s++ == c) n++;
8537 }
8538 return SIZET2NUM(n);
8539 }
8540 }
8541
8542 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8543 for (i=1; i<argc; i++) {
8544 tstr = argv[i];
8545 StringValue(tstr);
8546 enc = rb_enc_check(str, tstr);
8547 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8548 }
8549
8550 s = RSTRING_PTR(str);
8551 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8552 send = RSTRING_END(str);
8553 ascompat = rb_enc_asciicompat(enc);
8554 while (s < send) {
8555 unsigned int c;
8556
8557 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8558 if (table[c]) {
8559 n++;
8560 }
8561 s++;
8562 }
8563 else {
8564 int clen;
8565 c = rb_enc_codepoint_len(s, send, &clen, enc);
8566 if (tr_find(c, table, del, nodel)) {
8567 n++;
8568 }
8569 s += clen;
8570 }
8571 }
8572
8573 return SIZET2NUM(n);
8574}
8575
8576static VALUE
8577rb_fs_check(VALUE val)
8578{
8579 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
8580 val = rb_check_string_type(val);
8581 if (NIL_P(val)) return 0;
8582 }
8583 return val;
8584}
8585
/*
 * Byte-indexed whitespace table: 1 for bytes 0x09..0x0d and 0x20,
 * 0 for every other byte value (unlisted entries are zero-initialised).
 */
static const char isspacetable[256] = {
    [0x09] = 1,
    [0x0a] = 1,
    [0x0b] = 1,
    [0x0c] = 1,
    [0x0d] = 1,
    [0x20] = 1,
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
8606
8607static long
8608split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
8609{
8610 if (empty_count >= 0 && len == 0) {
8611 return empty_count + 1;
8612 }
8613 if (empty_count > 0) {
8614 /* make different substrings */
8615 if (result) {
8616 do {
8617 rb_ary_push(result, str_new_empty_String(str));
8618 } while (--empty_count > 0);
8619 }
8620 else {
8621 do {
8622 rb_yield(str_new_empty_String(str));
8623 } while (--empty_count > 0);
8624 }
8625 }
8626 str = rb_str_subseq(str, beg, len);
8627 if (result) {
8628 rb_ary_push(result, str);
8629 }
8630 else {
8631 rb_yield(str);
8632 }
8633 return empty_count;
8634}
8635
/* Splitting strategy chosen for a split pattern. */
typedef enum {
    SPLIT_TYPE_AWK,     /* chosen for a pattern that is exactly one space */
    SPLIT_TYPE_STRING,
    SPLIT_TYPE_REGEXP,
    SPLIT_TYPE_CHARS    /* empty pattern: split into chars */
} split_type_t;
8640static split_type_t
8641literal_split_pattern(VALUE spat, split_type_t default_type)
8642{
8643 rb_encoding *enc = STR_ENC_GET(spat);
8644 const char *ptr;
8645 long len;
8646 RSTRING_GETMEM(spat, ptr, len);
8647 if (len == 0) {
8648 /* Special case - split into chars */
8649 return SPLIT_TYPE_CHARS;
8650 }
8651 else if (rb_enc_asciicompat(enc)) {
8652 if (len == 1 && ptr[0] == ' ') {
8653 return SPLIT_TYPE_AWK;
8654 }
8655 }
8656 else {
8657 int l;
8658 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
8659 return SPLIT_TYPE_AWK;
8660 }
8661 }
8662 return default_type;
8663}
8664
8665/*
8666 * call-seq:
8667 * split(field_sep = $;, limit = nil) -> array
8668 * split(field_sep = $;, limit = nil) {|substring| ... } -> self
8669 *
8670 * :include: doc/string/split.rdoc
8671 *
8672 */
8673
8674static VALUE
8675rb_str_split_m(int argc, VALUE *argv, VALUE str)
8676{
8677 rb_encoding *enc;
8678 VALUE spat;
8679 VALUE limit;
8680 split_type_t split_type;
8681 long beg, end, i = 0, empty_count = -1;
8682 int lim = 0;
8683 VALUE result, tmp;
8684
8685 result = rb_block_given_p() ? Qfalse : Qnil;
8686 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
8687 lim = NUM2INT(limit);
8688 if (lim <= 0) limit = Qnil;
8689 else if (lim == 1) {
8690 if (RSTRING_LEN(str) == 0)
8691 return result ? rb_ary_new2(0) : str;
8692 tmp = str_duplicate(rb_cString, str);
8693 if (!result) {
8694 rb_yield(tmp);
8695 return str;
8696 }
8697 return rb_ary_new3(1, tmp);
8698 }
8699 i = 1;
8700 }
8701 if (NIL_P(limit) && !lim) empty_count = 0;
8702
8703 enc = STR_ENC_GET(str);
8704 split_type = SPLIT_TYPE_REGEXP;
8705 if (!NIL_P(spat)) {
8706 spat = get_pat_quoted(spat, 0);
8707 }
8708 else if (NIL_P(spat = rb_fs)) {
8709 split_type = SPLIT_TYPE_AWK;
8710 }
8711 else if (!(spat = rb_fs_check(spat))) {
8712 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
8713 }
8714 else {
8715 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
8716 }
8717 if (split_type != SPLIT_TYPE_AWK) {
8718 switch (BUILTIN_TYPE(spat)) {
8719 case T_REGEXP:
8720 rb_reg_options(spat); /* check if uninitialized */
8721 tmp = RREGEXP_SRC(spat);
8722 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
8723 if (split_type == SPLIT_TYPE_AWK) {
8724 spat = tmp;
8725 split_type = SPLIT_TYPE_STRING;
8726 }
8727 break;
8728
8729 case T_STRING:
8730 mustnot_broken(spat);
8731 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
8732 break;
8733
8734 default:
8736 }
8737 }
8738
8739#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
8740
8741 if (result) result = rb_ary_new();
8742 beg = 0;
8743 char *ptr = RSTRING_PTR(str);
8744 char *eptr = RSTRING_END(str);
8745 if (split_type == SPLIT_TYPE_AWK) {
8746 char *bptr = ptr;
8747 int skip = 1;
8748 unsigned int c;
8749
8750 end = beg;
8751 if (is_ascii_string(str)) {
8752 while (ptr < eptr) {
8753 c = (unsigned char)*ptr++;
8754 if (skip) {
8755 if (ascii_isspace(c)) {
8756 beg = ptr - bptr;
8757 }
8758 else {
8759 end = ptr - bptr;
8760 skip = 0;
8761 if (!NIL_P(limit) && lim <= i) break;
8762 }
8763 }
8764 else if (ascii_isspace(c)) {
8765 SPLIT_STR(beg, end-beg);
8766 skip = 1;
8767 beg = ptr - bptr;
8768 if (!NIL_P(limit)) ++i;
8769 }
8770 else {
8771 end = ptr - bptr;
8772 }
8773 }
8774 }
8775 else {
8776 while (ptr < eptr) {
8777 int n;
8778
8779 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
8780 ptr += n;
8781 if (skip) {
8782 if (rb_isspace(c)) {
8783 beg = ptr - bptr;
8784 }
8785 else {
8786 end = ptr - bptr;
8787 skip = 0;
8788 if (!NIL_P(limit) && lim <= i) break;
8789 }
8790 }
8791 else if (rb_isspace(c)) {
8792 SPLIT_STR(beg, end-beg);
8793 skip = 1;
8794 beg = ptr - bptr;
8795 if (!NIL_P(limit)) ++i;
8796 }
8797 else {
8798 end = ptr - bptr;
8799 }
8800 }
8801 }
8802 }
8803 else if (split_type == SPLIT_TYPE_STRING) {
8804 char *str_start = ptr;
8805 char *substr_start = ptr;
8806 char *sptr = RSTRING_PTR(spat);
8807 long slen = RSTRING_LEN(spat);
8808
8809 mustnot_broken(str);
8810 enc = rb_enc_check(str, spat);
8811 while (ptr < eptr &&
8812 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
8813 /* Check we are at the start of a char */
8814 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
8815 if (t != ptr + end) {
8816 ptr = t;
8817 continue;
8818 }
8819 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
8820 ptr += end + slen;
8821 substr_start = ptr;
8822 if (!NIL_P(limit) && lim <= ++i) break;
8823 }
8824 beg = ptr - str_start;
8825 }
8826 else if (split_type == SPLIT_TYPE_CHARS) {
8827 char *str_start = ptr;
8828 int n;
8829
8830 mustnot_broken(str);
8831 enc = rb_enc_get(str);
8832 while (ptr < eptr &&
8833 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
8834 SPLIT_STR(ptr - str_start, n);
8835 ptr += n;
8836 if (!NIL_P(limit) && lim <= ++i) break;
8837 }
8838 beg = ptr - str_start;
8839 }
8840 else {
8841 long len = RSTRING_LEN(str);
8842 long start = beg;
8843 long idx;
8844 int last_null = 0;
8845 struct re_registers *regs;
8846 VALUE match = 0;
8847
8848 for (; rb_reg_search(spat, str, start, 0) >= 0;
8849 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
8850 match = rb_backref_get();
8851 if (!result) rb_match_busy(match);
8852 regs = RMATCH_REGS(match);
8853 end = BEG(0);
8854 if (start == end && BEG(0) == END(0)) {
8855 if (!ptr) {
8856 SPLIT_STR(0, 0);
8857 break;
8858 }
8859 else if (last_null == 1) {
8860 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
8861 beg = start;
8862 }
8863 else {
8864 if (start == len)
8865 start++;
8866 else
8867 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
8868 last_null = 1;
8869 continue;
8870 }
8871 }
8872 else {
8873 SPLIT_STR(beg, end-beg);
8874 beg = start = END(0);
8875 }
8876 last_null = 0;
8877
8878 for (idx=1; idx < regs->num_regs; idx++) {
8879 if (BEG(idx) == -1) continue;
8880 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
8881 }
8882 if (!NIL_P(limit) && lim <= ++i) break;
8883 }
8884 if (match) rb_match_unbusy(match);
8885 }
8886 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
8887 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
8888 }
8889
8890 return result ? result : str;
8891}
8892
8893VALUE
8894rb_str_split(VALUE str, const char *sep0)
8895{
8896 VALUE sep;
8897
8898 StringValue(str);
8899 sep = rb_str_new_cstr(sep0);
8900 return rb_str_split_m(1, &sep, str);
8901}
8902
/* A new array with capacity +size+ when no block is given, 0 otherwise (+m+ is unused). */
#define WANTARRAY(m, size) (rb_block_given_p() ? 0 : rb_ary_new_capa(size))
8904
8905static inline int
8906enumerator_element(VALUE ary, VALUE e)
8907{
8908 if (ary) {
8909 rb_ary_push(ary, e);
8910 return 0;
8911 }
8912 else {
8913 rb_yield(e);
8914 return 1;
8915 }
8916}
8917
8918#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
8919
8920static const char *
8921chomp_newline(const char *p, const char *e, rb_encoding *enc)
8922{
8923 const char *prev = rb_enc_prev_char(p, e, e, enc);
8924 if (rb_enc_is_newline(prev, e, enc)) {
8925 e = prev;
8926 prev = rb_enc_prev_char(p, e, e, enc);
8927 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
8928 e = prev;
8929 }
8930 return e;
8931}
8932
8933static VALUE
8934get_rs(void)
8935{
8936 VALUE rs = rb_rs;
8937 if (!NIL_P(rs) &&
8938 (!RB_TYPE_P(rs, T_STRING) ||
8939 RSTRING_LEN(rs) != 1 ||
8940 RSTRING_PTR(rs)[0] != '\n')) {
8941 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
8942 }
8943 return rs;
8944}
8945
8946#define rb_rs get_rs()
8947
8948static VALUE
8949rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
8950{
8951 rb_encoding *enc;
8952 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
8953 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
8954 long pos, len, rslen;
8955 int rsnewline = 0;
8956
8957 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
8958 rs = rb_rs;
8959 if (!NIL_P(opts)) {
8960 static ID keywords[1];
8961 if (!keywords[0]) {
8962 keywords[0] = rb_intern_const("chomp");
8963 }
8964 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
8965 chomp = (!UNDEF_P(chomp) && RTEST(chomp));
8966 }
8967
8968 if (NIL_P(rs)) {
8969 if (!ENUM_ELEM(ary, str)) {
8970 return ary;
8971 }
8972 else {
8973 return orig;
8974 }
8975 }
8976
8977 if (!RSTRING_LEN(str)) goto end;
8978 str = rb_str_new_frozen(str);
8979 ptr = subptr = RSTRING_PTR(str);
8980 pend = RSTRING_END(str);
8981 len = RSTRING_LEN(str);
8982 StringValue(rs);
8983 rslen = RSTRING_LEN(rs);
8984
8985 if (rs == rb_default_rs)
8986 enc = rb_enc_get(str);
8987 else
8988 enc = rb_enc_check(str, rs);
8989
8990 if (rslen == 0) {
8991 /* paragraph mode */
8992 int n;
8993 const char *eol = NULL;
8994 subend = subptr;
8995 while (subend < pend) {
8996 long chomp_rslen = 0;
8997 do {
8998 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
8999 n = 0;
9000 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9001 if (rb_enc_is_newline(subend + n, pend, enc)) {
9002 if (eol == subend) break;
9003 subend += rslen;
9004 if (subptr) {
9005 eol = subend;
9006 chomp_rslen = -rslen;
9007 }
9008 }
9009 else {
9010 if (!subptr) subptr = subend;
9011 subend += rslen;
9012 }
9013 rslen = 0;
9014 } while (subend < pend);
9015 if (!subptr) break;
9016 if (rslen == 0) chomp_rslen = 0;
9017 line = rb_str_subseq(str, subptr - ptr,
9018 subend - subptr + (chomp ? chomp_rslen : rslen));
9019 if (ENUM_ELEM(ary, line)) {
9020 str_mod_check(str, ptr, len);
9021 }
9022 subptr = eol = NULL;
9023 }
9024 goto end;
9025 }
9026 else {
9027 rsptr = RSTRING_PTR(rs);
9028 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9029 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
9030 rsnewline = 1;
9031 }
9032 }
9033
9034 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9035 rs = rb_str_new(rsptr, rslen);
9036 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9037 rsptr = RSTRING_PTR(rs);
9038 rslen = RSTRING_LEN(rs);
9039 }
9040
9041 while (subptr < pend) {
9042 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9043 if (pos < 0) break;
9044 hit = subptr + pos;
9045 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9046 if (hit != adjusted) {
9047 subptr = adjusted;
9048 continue;
9049 }
9050 subend = hit += rslen;
9051 if (chomp) {
9052 if (rsnewline) {
9053 subend = chomp_newline(subptr, subend, enc);
9054 }
9055 else {
9056 subend -= rslen;
9057 }
9058 }
9059 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9060 if (ENUM_ELEM(ary, line)) {
9061 str_mod_check(str, ptr, len);
9062 }
9063 subptr = hit;
9064 }
9065
9066 if (subptr != pend) {
9067 if (chomp) {
9068 if (rsnewline) {
9069 pend = chomp_newline(subptr, pend, enc);
9070 }
9071 else if (pend - subptr >= rslen &&
9072 memcmp(pend - rslen, rsptr, rslen) == 0) {
9073 pend -= rslen;
9074 }
9075 }
9076 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9077 ENUM_ELEM(ary, line);
9078 RB_GC_GUARD(str);
9079 }
9080
9081 end:
9082 if (ary)
9083 return ary;
9084 else
9085 return orig;
9086}
9087
9088/*
9089 * call-seq:
9090 * each_line(line_sep = $/, chomp: false) {|substring| ... } -> self
9091 * each_line(line_sep = $/, chomp: false) -> enumerator
9092 *
9093 * :include: doc/string/each_line.rdoc
9094 *
9095 */
9096
9097static VALUE
9098rb_str_each_line(int argc, VALUE *argv, VALUE str)
9099{
9100 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9101 return rb_str_enumerate_lines(argc, argv, str, 0);
9102}
9103
9104/*
9105 * call-seq:
9106 * lines(Line_sep = $/, chomp: false) -> array_of_strings
9107 *
9108 * Forms substrings ("lines") of +self+ according to the given arguments
9109 * (see String#each_line for details); returns the lines in an array.
9110 *
9111 */
9112
9113static VALUE
9114rb_str_lines(int argc, VALUE *argv, VALUE str)
9115{
9116 VALUE ary = WANTARRAY("lines", 0);
9117 return rb_str_enumerate_lines(argc, argv, str, ary);
9118}
9119
9120static VALUE
9121rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9122{
9123 return LONG2FIX(RSTRING_LEN(str));
9124}
9125
9126static VALUE
9127rb_str_enumerate_bytes(VALUE str, VALUE ary)
9128{
9129 long i;
9130
9131 for (i=0; i<RSTRING_LEN(str); i++) {
9132 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9133 }
9134 if (ary)
9135 return ary;
9136 else
9137 return str;
9138}
9139
9140/*
9141 * call-seq:
9142 * each_byte {|byte| ... } -> self
9143 * each_byte -> enumerator
9144 *
9145 * :include: doc/string/each_byte.rdoc
9146 *
9147 */
9148
9149static VALUE
9150rb_str_each_byte(VALUE str)
9151{
9152 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9153 return rb_str_enumerate_bytes(str, 0);
9154}
9155
9156/*
9157 * call-seq:
9158 * bytes -> array_of_bytes
9159 *
9160 * :include: doc/string/bytes.rdoc
9161 *
9162 */
9163
9164static VALUE
9165rb_str_bytes(VALUE str)
9166{
9167 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9168 return rb_str_enumerate_bytes(str, ary);
9169}
9170
9171static VALUE
9172rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9173{
9174 return rb_str_length(str);
9175}
9176
9177static VALUE
9178rb_str_enumerate_chars(VALUE str, VALUE ary)
9179{
9180 VALUE orig = str;
9181 long i, len, n;
9182 const char *ptr;
9183 rb_encoding *enc;
9184
9185 str = rb_str_new_frozen(str);
9186 ptr = RSTRING_PTR(str);
9187 len = RSTRING_LEN(str);
9188 enc = rb_enc_get(str);
9189
9191 for (i = 0; i < len; i += n) {
9192 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9193 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9194 }
9195 }
9196 else {
9197 for (i = 0; i < len; i += n) {
9198 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9199 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9200 }
9201 }
9202 RB_GC_GUARD(str);
9203 if (ary)
9204 return ary;
9205 else
9206 return orig;
9207}
9208
9209/*
9210 * call-seq:
9211 * each_char {|c| ... } -> self
9212 * each_char -> enumerator
9213 *
9214 * :include: doc/string/each_char.rdoc
9215 *
9216 */
9217
9218static VALUE
9219rb_str_each_char(VALUE str)
9220{
9221 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9222 return rb_str_enumerate_chars(str, 0);
9223}
9224
9225/*
9226 * call-seq:
9227 * chars -> array_of_characters
9228 *
9229 * :include: doc/string/chars.rdoc
9230 *
9231 */
9232
9233static VALUE
9234rb_str_chars(VALUE str)
9235{
9236 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9237 return rb_str_enumerate_chars(str, ary);
9238}
9239
9240static VALUE
9241rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9242{
9243 VALUE orig = str;
9244 int n;
9245 unsigned int c;
9246 const char *ptr, *end;
9247 rb_encoding *enc;
9248
9249 if (single_byte_optimizable(str))
9250 return rb_str_enumerate_bytes(str, ary);
9251
9252 str = rb_str_new_frozen(str);
9253 ptr = RSTRING_PTR(str);
9254 end = RSTRING_END(str);
9255 enc = STR_ENC_GET(str);
9256
9257 while (ptr < end) {
9258 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9259 ENUM_ELEM(ary, UINT2NUM(c));
9260 ptr += n;
9261 }
9262 RB_GC_GUARD(str);
9263 if (ary)
9264 return ary;
9265 else
9266 return orig;
9267}
9268
9269/*
9270 * call-seq:
9271 * each_codepoint {|integer| ... } -> self
9272 * each_codepoint -> enumerator
9273 *
9274 * :include: doc/string/each_codepoint.rdoc
9275 *
9276 */
9277
9278static VALUE
9279rb_str_each_codepoint(VALUE str)
9280{
9281 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9282 return rb_str_enumerate_codepoints(str, 0);
9283}
9284
9285/*
9286 * call-seq:
9287 * codepoints -> array_of_integers
9288 *
9289 * :include: doc/string/codepoints.rdoc
9290 *
9291 */
9292
9293static VALUE
9294rb_str_codepoints(VALUE str)
9295{
9296 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9297 return rb_str_enumerate_codepoints(str, ary);
9298}
9299
9300static regex_t *
9301get_reg_grapheme_cluster(rb_encoding *enc)
9302{
9303 int encidx = rb_enc_to_index(enc);
9304
9305 const OnigUChar source_ascii[] = "\\X";
9306 const OnigUChar *source = source_ascii;
9307 size_t source_len = sizeof(source_ascii) - 1;
9308
9309 switch (encidx) {
9310#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9311#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9312#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9313#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9314#define CASE_UTF(e) \
9315 case ENCINDEX_UTF_##e: { \
9316 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9317 source = source_UTF_##e; \
9318 source_len = sizeof(source_UTF_##e); \
9319 break; \
9320 }
9321 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9322#undef CASE_UTF
9323#undef CHARS_16BE
9324#undef CHARS_16LE
9325#undef CHARS_32BE
9326#undef CHARS_32LE
9327 }
9328
9329 regex_t *reg_grapheme_cluster;
9330 OnigErrorInfo einfo;
9331 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9332 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9333 if (r) {
9334 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9335 onig_error_code_to_str(message, r, &einfo);
9336 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9337 }
9338
9339 return reg_grapheme_cluster;
9340}
9341
9342static regex_t *
9343get_cached_reg_grapheme_cluster(rb_encoding *enc)
9344{
9345 int encidx = rb_enc_to_index(enc);
9346 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9347
9348 if (encidx == rb_utf8_encindex()) {
9349 if (!reg_grapheme_cluster_utf8) {
9350 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9351 }
9352
9353 return reg_grapheme_cluster_utf8;
9354 }
9355
9356 return NULL;
9357}
9358
9359static VALUE
9360rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9361{
9362 size_t grapheme_cluster_count = 0;
9363 rb_encoding *enc = get_encoding(str);
9364 const char *ptr, *end;
9365
9366 if (!rb_enc_unicode_p(enc)) {
9367 return rb_str_length(str);
9368 }
9369
9370 bool cached_reg_grapheme_cluster = true;
9371 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9372 if (!reg_grapheme_cluster) {
9373 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9374 cached_reg_grapheme_cluster = false;
9375 }
9376
9377 ptr = RSTRING_PTR(str);
9378 end = RSTRING_END(str);
9379
9380 while (ptr < end) {
9381 OnigPosition len = onig_match(reg_grapheme_cluster,
9382 (const OnigUChar *)ptr, (const OnigUChar *)end,
9383 (const OnigUChar *)ptr, NULL, 0);
9384 if (len <= 0) break;
9385 grapheme_cluster_count++;
9386 ptr += len;
9387 }
9388
9389 if (!cached_reg_grapheme_cluster) {
9390 onig_free(reg_grapheme_cluster);
9391 }
9392
9393 return SIZET2NUM(grapheme_cluster_count);
9394}
9395
9396static VALUE
9397rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9398{
9399 VALUE orig = str;
9400 rb_encoding *enc = get_encoding(str);
9401 const char *ptr0, *ptr, *end;
9402
9403 if (!rb_enc_unicode_p(enc)) {
9404 return rb_str_enumerate_chars(str, ary);
9405 }
9406
9407 if (!ary) str = rb_str_new_frozen(str);
9408
9409 bool cached_reg_grapheme_cluster = true;
9410 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9411 if (!reg_grapheme_cluster) {
9412 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9413 cached_reg_grapheme_cluster = false;
9414 }
9415
9416 ptr0 = ptr = RSTRING_PTR(str);
9417 end = RSTRING_END(str);
9418
9419 while (ptr < end) {
9420 OnigPosition len = onig_match(reg_grapheme_cluster,
9421 (const OnigUChar *)ptr, (const OnigUChar *)end,
9422 (const OnigUChar *)ptr, NULL, 0);
9423 if (len <= 0) break;
9424 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9425 ptr += len;
9426 }
9427
9428 if (!cached_reg_grapheme_cluster) {
9429 onig_free(reg_grapheme_cluster);
9430 }
9431
9432 RB_GC_GUARD(str);
9433 if (ary)
9434 return ary;
9435 else
9436 return orig;
9437}
9438
9439/*
9440 * call-seq:
9441 * each_grapheme_cluster {|gc| ... } -> self
9442 * each_grapheme_cluster -> enumerator
9443 *
9444 * :include: doc/string/each_grapheme_cluster.rdoc
9445 *
9446 */
9447
9448static VALUE
9449rb_str_each_grapheme_cluster(VALUE str)
9450{
9451 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9452 return rb_str_enumerate_grapheme_clusters(str, 0);
9453}
9454
9455/*
9456 * call-seq:
9457 * grapheme_clusters -> array_of_grapheme_clusters
9458 *
9459 * :include: doc/string/grapheme_clusters.rdoc
9460 *
9461 */
9462
9463static VALUE
9464rb_str_grapheme_clusters(VALUE str)
9465{
9466 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9467 return rb_str_enumerate_grapheme_clusters(str, ary);
9468}
9469
9470static long
9471chopped_length(VALUE str)
9472{
9473 rb_encoding *enc = STR_ENC_GET(str);
9474 const char *p, *p2, *beg, *end;
9475
9476 beg = RSTRING_PTR(str);
9477 end = beg + RSTRING_LEN(str);
9478 if (beg >= end) return 0;
9479 p = rb_enc_prev_char(beg, end, end, enc);
9480 if (!p) return 0;
9481 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9482 p2 = rb_enc_prev_char(beg, p, end, enc);
9483 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9484 }
9485 return p - beg;
9486}
9487
9488/*
9489 * call-seq:
9490 * chop! -> self or nil
9491 *
9492 * Like String#chop, but modifies +self+ in place;
9493 * returns +nil+ if +self+ is empty, +self+ otherwise.
9494 *
9495 * Related: String#chomp!.
9496 */
9497
9498static VALUE
9499rb_str_chop_bang(VALUE str)
9500{
9501 str_modify_keep_cr(str);
9502 if (RSTRING_LEN(str) > 0) {
9503 long len;
9504 len = chopped_length(str);
9505 STR_SET_LEN(str, len);
9506 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9507 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9509 }
9510 return str;
9511 }
9512 return Qnil;
9513}
9514
9515
9516/*
9517 * call-seq:
9518 * chop -> new_string
9519 *
9520 * :include: doc/string/chop.rdoc
9521 *
9522 */
9523
9524static VALUE
9525rb_str_chop(VALUE str)
9526{
9527 return rb_str_subseq(str, 0, chopped_length(str));
9528}
9529
9530static long
9531smart_chomp(VALUE str, const char *e, const char *p)
9532{
9533 rb_encoding *enc = rb_enc_get(str);
9534 if (rb_enc_mbminlen(enc) > 1) {
9535 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9536 if (rb_enc_is_newline(pp, e, enc)) {
9537 e = pp;
9538 }
9539 pp = e - rb_enc_mbminlen(enc);
9540 if (pp >= p) {
9541 pp = rb_enc_left_char_head(p, pp, e, enc);
9542 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9543 e = pp;
9544 }
9545 }
9546 }
9547 else {
9548 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
9549 case '\n':
9550 if (--e > p && *(e-1) == '\r') {
9551 --e;
9552 }
9553 break;
9554 case '\r':
9555 --e;
9556 break;
9557 }
9558 }
9559 return e - p;
9560}
9561
9562static long
9563chompped_length(VALUE str, VALUE rs)
9564{
9565 rb_encoding *enc;
9566 int newline;
9567 char *pp, *e, *rsptr;
9568 long rslen;
9569 char *const p = RSTRING_PTR(str);
9570 long len = RSTRING_LEN(str);
9571
9572 if (len == 0) return 0;
9573 e = p + len;
9574 if (rs == rb_default_rs) {
9575 return smart_chomp(str, e, p);
9576 }
9577
9578 enc = rb_enc_get(str);
9579 RSTRING_GETMEM(rs, rsptr, rslen);
9580 if (rslen == 0) {
9581 if (rb_enc_mbminlen(enc) > 1) {
9582 while (e > p) {
9583 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9584 if (!rb_enc_is_newline(pp, e, enc)) break;
9585 e = pp;
9586 pp -= rb_enc_mbminlen(enc);
9587 if (pp >= p) {
9588 pp = rb_enc_left_char_head(p, pp, e, enc);
9589 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9590 e = pp;
9591 }
9592 }
9593 }
9594 }
9595 else {
9596 while (e > p && *(e-1) == '\n') {
9597 --e;
9598 if (e > p && *(e-1) == '\r')
9599 --e;
9600 }
9601 }
9602 return e - p;
9603 }
9604 if (rslen > len) return len;
9605
9606 enc = rb_enc_get(rs);
9607 newline = rsptr[rslen-1];
9608 if (rslen == rb_enc_mbminlen(enc)) {
9609 if (rslen == 1) {
9610 if (newline == '\n')
9611 return smart_chomp(str, e, p);
9612 }
9613 else {
9614 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
9615 return smart_chomp(str, e, p);
9616 }
9617 }
9618
9619 enc = rb_enc_check(str, rs);
9620 if (is_broken_string(rs)) {
9621 return len;
9622 }
9623 pp = e - rslen;
9624 if (p[len-1] == newline &&
9625 (rslen <= 1 ||
9626 memcmp(rsptr, pp, rslen) == 0)) {
9627 if (rb_enc_left_char_head(p, pp, e, enc) == pp)
9628 return len - rslen;
9629 RB_GC_GUARD(rs);
9630 }
9631 return len;
9632}
9633
9639static VALUE
9640chomp_rs(int argc, const VALUE *argv)
9641{
9642 rb_check_arity(argc, 0, 1);
9643 if (argc > 0) {
9644 VALUE rs = argv[0];
9645 if (!NIL_P(rs)) StringValue(rs);
9646 return rs;
9647 }
9648 else {
9649 return rb_rs;
9650 }
9651}
9652
9653VALUE
9654rb_str_chomp_string(VALUE str, VALUE rs)
9655{
9656 long olen = RSTRING_LEN(str);
9657 long len = chompped_length(str, rs);
9658 if (len >= olen) return Qnil;
9659 str_modify_keep_cr(str);
9660 STR_SET_LEN(str, len);
9661 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9662 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9664 }
9665 return str;
9666}
9667
9668/*
9669 * call-seq:
9670 * chomp!(line_sep = $/) -> self or nil
9671 *
9672 * Like String#chomp, but modifies +self+ in place;
9673 * returns +nil+ if no modification made, +self+ otherwise.
9674 *
9675 */
9676
9677static VALUE
9678rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
9679{
9680 VALUE rs;
9681 str_modifiable(str);
9682 if (RSTRING_LEN(str) == 0) return Qnil;
9683 rs = chomp_rs(argc, argv);
9684 if (NIL_P(rs)) return Qnil;
9685 return rb_str_chomp_string(str, rs);
9686}
9687
9688
9689/*
9690 * call-seq:
9691 * chomp(line_sep = $/) -> new_string
9692 *
9693 * :include: doc/string/chomp.rdoc
9694 *
9695 */
9696
9697static VALUE
9698rb_str_chomp(int argc, VALUE *argv, VALUE str)
9699{
9700 VALUE rs = chomp_rs(argc, argv);
9701 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
9702 return rb_str_subseq(str, 0, chompped_length(str, rs));
9703}
9704
9705static long
9706lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9707{
9708 const char *const start = s;
9709
9710 if (!s || s >= e) return 0;
9711
9712 /* remove spaces at head */
9713 if (single_byte_optimizable(str)) {
9714 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
9715 }
9716 else {
9717 while (s < e) {
9718 int n;
9719 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
9720
9721 if (cc && !rb_isspace(cc)) break;
9722 s += n;
9723 }
9724 }
9725 return s - start;
9726}
9727
9728/*
9729 * call-seq:
9730 * lstrip! -> self or nil
9731 *
9732 * Like String#lstrip, except that any modifications are made in +self+;
9733 * returns +self+ if any modification are made, +nil+ otherwise.
9734 *
9735 * Related: String#rstrip!, String#strip!.
9736 */
9737
9738static VALUE
9739rb_str_lstrip_bang(VALUE str)
9740{
9741 rb_encoding *enc;
9742 char *start, *s;
9743 long olen, loffset;
9744
9745 str_modify_keep_cr(str);
9746 enc = STR_ENC_GET(str);
9747 RSTRING_GETMEM(str, start, olen);
9748 loffset = lstrip_offset(str, start, start+olen, enc);
9749 if (loffset > 0) {
9750 long len = olen-loffset;
9751 s = start + loffset;
9752 memmove(start, s, len);
9753 STR_SET_LEN(str, len);
9754 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9755 return str;
9756 }
9757 return Qnil;
9758}
9759
9760
9761/*
9762 * call-seq:
9763 * lstrip -> new_string
9764 *
9765 * Returns a copy of +self+ with leading whitespace removed;
9766 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9767 *
9768 * whitespace = "\x00\t\n\v\f\r "
9769 * s = whitespace + 'abc' + whitespace
9770 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
9771 * s.lstrip # => "abc\u0000\t\n\v\f\r "
9772 *
9773 * Related: String#rstrip, String#strip.
9774 */
9775
9776static VALUE
9777rb_str_lstrip(VALUE str)
9778{
9779 char *start;
9780 long len, loffset;
9781 RSTRING_GETMEM(str, start, len);
9782 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
9783 if (loffset <= 0) return str_duplicate(rb_cString, str);
9784 return rb_str_subseq(str, loffset, len - loffset);
9785}
9786
9787static long
9788rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9789{
9790 const char *t;
9791
9792 rb_str_check_dummy_enc(enc);
9794 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
9795 }
9796 if (!s || s >= e) return 0;
9797 t = e;
9798
9799 /* remove trailing spaces or '\0's */
9800 if (single_byte_optimizable(str)) {
9801 unsigned char c;
9802 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
9803 }
9804 else {
9805 char *tp;
9806
9807 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
9808 unsigned int c = rb_enc_codepoint(tp, e, enc);
9809 if (c && !rb_isspace(c)) break;
9810 t = tp;
9811 }
9812 }
9813 return e - t;
9814}
9815
9816/*
9817 * call-seq:
9818 * rstrip! -> self or nil
9819 *
9820 * Like String#rstrip, except that any modifications are made in +self+;
9821 * returns +self+ if any modification are made, +nil+ otherwise.
9822 *
9823 * Related: String#lstrip!, String#strip!.
9824 */
9825
9826static VALUE
9827rb_str_rstrip_bang(VALUE str)
9828{
9829 rb_encoding *enc;
9830 char *start;
9831 long olen, roffset;
9832
9833 str_modify_keep_cr(str);
9834 enc = STR_ENC_GET(str);
9835 RSTRING_GETMEM(str, start, olen);
9836 roffset = rstrip_offset(str, start, start+olen, enc);
9837 if (roffset > 0) {
9838 long len = olen - roffset;
9839
9840 STR_SET_LEN(str, len);
9841 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9842 return str;
9843 }
9844 return Qnil;
9845}
9846
9847
9848/*
9849 * call-seq:
9850 * rstrip -> new_string
9851 *
9852 * Returns a copy of the receiver with trailing whitespace removed;
9853 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9854 *
9855 * whitespace = "\x00\t\n\v\f\r "
9856 * s = whitespace + 'abc' + whitespace
9857 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
9858 * s.rstrip # => "\u0000\t\n\v\f\r abc"
9859 *
9860 * Related: String#lstrip, String#strip.
9861 */
9862
9863static VALUE
9864rb_str_rstrip(VALUE str)
9865{
9866 rb_encoding *enc;
9867 char *start;
9868 long olen, roffset;
9869
9870 enc = STR_ENC_GET(str);
9871 RSTRING_GETMEM(str, start, olen);
9872 roffset = rstrip_offset(str, start, start+olen, enc);
9873
9874 if (roffset <= 0) return str_duplicate(rb_cString, str);
9875 return rb_str_subseq(str, 0, olen-roffset);
9876}
9877
9878
9879/*
9880 * call-seq:
9881 * strip! -> self or nil
9882 *
9883 * Like String#strip, except that any modifications are made in +self+;
9884 * returns +self+ if any modification are made, +nil+ otherwise.
9885 *
9886 * Related: String#lstrip!, String#strip!.
9887 */
9888
9889static VALUE
9890rb_str_strip_bang(VALUE str)
9891{
9892 char *start;
9893 long olen, loffset, roffset;
9894 rb_encoding *enc;
9895
9896 str_modify_keep_cr(str);
9897 enc = STR_ENC_GET(str);
9898 RSTRING_GETMEM(str, start, olen);
9899 loffset = lstrip_offset(str, start, start+olen, enc);
9900 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9901
9902 if (loffset > 0 || roffset > 0) {
9903 long len = olen-roffset;
9904 if (loffset > 0) {
9905 len -= loffset;
9906 memmove(start, start + loffset, len);
9907 }
9908 STR_SET_LEN(str, len);
9909 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9910 return str;
9911 }
9912 return Qnil;
9913}
9914
9915
9916/*
9917 * call-seq:
9918 * strip -> new_string
9919 *
9920 * Returns a copy of the receiver with leading and trailing whitespace removed;
9921 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9922 *
9923 * whitespace = "\x00\t\n\v\f\r "
9924 * s = whitespace + 'abc' + whitespace
9925 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
9926 * s.strip # => "abc"
9927 *
9928 * Related: String#lstrip, String#rstrip.
9929 */
9930
9931static VALUE
9932rb_str_strip(VALUE str)
9933{
9934 char *start;
9935 long olen, loffset, roffset;
9936 rb_encoding *enc = STR_ENC_GET(str);
9937
9938 RSTRING_GETMEM(str, start, olen);
9939 loffset = lstrip_offset(str, start, start+olen, enc);
9940 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9941
9942 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
9943 return rb_str_subseq(str, loffset, olen-loffset-roffset);
9944}
9945
9946static VALUE
9947scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
9948{
9949 VALUE result, match;
9950 struct re_registers *regs;
9951 int i;
9952 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
9953 if (pos >= 0) {
9954 if (BUILTIN_TYPE(pat) == T_STRING) {
9955 regs = NULL;
9956 end = pos + RSTRING_LEN(pat);
9957 }
9958 else {
9959 match = rb_backref_get();
9960 regs = RMATCH_REGS(match);
9961 pos = BEG(0);
9962 end = END(0);
9963 }
9964 if (pos == end) {
9965 rb_encoding *enc = STR_ENC_GET(str);
9966 /*
9967 * Always consume at least one character of the input string
9968 */
9969 if (RSTRING_LEN(str) > end)
9970 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
9971 RSTRING_END(str), enc);
9972 else
9973 *start = end + 1;
9974 }
9975 else {
9976 *start = end;
9977 }
9978 if (!regs || regs->num_regs == 1) {
9979 result = rb_str_subseq(str, pos, end - pos);
9980 return result;
9981 }
9982 result = rb_ary_new2(regs->num_regs);
9983 for (i=1; i < regs->num_regs; i++) {
9984 VALUE s = Qnil;
9985 if (BEG(i) >= 0) {
9986 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
9987 }
9988 rb_ary_push(result, s);
9989 }
9990
9991 return result;
9992 }
9993 return Qnil;
9994}
9995
9996
9997/*
9998 * call-seq:
9999 * scan(string_or_regexp) -> array
10000 * scan(string_or_regexp) {|matches| ... } -> self
10001 *
10002 * Matches a pattern against +self+; the pattern is:
10003 *
10004 * - +string_or_regexp+ itself, if it is a Regexp.
10005 * - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
10006 *
10007 * Iterates through +self+, generating a collection of matching results:
10008 *
10009 * - If the pattern contains no groups, each result is the
10010 * matched string, <code>$&</code>.
10011 * - If the pattern contains groups, each result is an array
10012 * containing one entry per group.
10013 *
10014 * With no block given, returns an array of the results:
10015 *
10016 * s = 'cruel world'
10017 * s.scan(/\w+/) # => ["cruel", "world"]
10018 * s.scan(/.../) # => ["cru", "el ", "wor"]
10019 * s.scan(/(...)/) # => [["cru"], ["el "], ["wor"]]
10020 * s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
10021 *
10022 * With a block given, calls the block with each result; returns +self+:
10023 *
10024 * s.scan(/\w+/) {|w| print "<<#{w}>> " }
10025 * print "\n"
10026 * s.scan(/(.)(.)/) {|x,y| print y, x }
10027 * print "\n"
10028 *
10029 * Output:
10030 *
10031 * <<cruel>> <<world>>
10032 * rceu lowlr
10033 *
10034 */
10035
10036static VALUE
10037rb_str_scan(VALUE str, VALUE pat)
10038{
10039 VALUE result;
10040 long start = 0;
10041 long last = -1, prev = 0;
10042 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10043
10044 pat = get_pat_quoted(pat, 1);
10045 mustnot_broken(str);
10046 if (!rb_block_given_p()) {
10047 VALUE ary = rb_ary_new();
10048
10049 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10050 last = prev;
10051 prev = start;
10052 rb_ary_push(ary, result);
10053 }
10054 if (last >= 0) rb_pat_search(pat, str, last, 1);
10055 else rb_backref_set(Qnil);
10056 return ary;
10057 }
10058
10059 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10060 last = prev;
10061 prev = start;
10062 rb_yield(result);
10063 str_mod_check(str, p, len);
10064 }
10065 if (last >= 0) rb_pat_search(pat, str, last, 1);
10066 return str;
10067}
10068
10069
10070/*
10071 * call-seq:
10072 * hex -> integer
10073 *
10074 * Interprets the leading substring of +self+ as a string of hexadecimal digits
10075 * (with an optional sign and an optional <code>0x</code>) and returns the
10076 * corresponding number;
10077 * returns zero if there is no such leading substring:
10078 *
10079 * '0x0a'.hex # => 10
10080 * '-1234'.hex # => -4660
10081 * '0'.hex # => 0
10082 * 'non-numeric'.hex # => 0
10083 *
10084 * Related: String#oct.
10085 *
10086 */
10087
10088static VALUE
10089rb_str_hex(VALUE str)
10090{
10091 return rb_str_to_inum(str, 16, FALSE);
10092}
10093
10094
10095/*
10096 * call-seq:
10097 * oct -> integer
10098 *
10099 * Interprets the leading substring of +self+ as a string of octal digits
10100 * (with an optional sign) and returns the corresponding number;
10101 * returns zero if there is no such leading substring:
10102 *
10103 * '123'.oct # => 83
10104 * '-377'.oct # => -255
10105 * '0377non-numeric'.oct # => 255
10106 * 'non-numeric'.oct # => 0
10107 *
10108 * If +self+ starts with <tt>0</tt>, radix indicators are honored;
10109 * see Kernel#Integer.
10110 *
10111 * Related: String#hex.
10112 *
10113 */
10114
10115static VALUE
10116rb_str_oct(VALUE str)
10117{
10118 return rb_str_to_inum(str, -8, FALSE);
10119}
10120
10121#ifndef HAVE_CRYPT_R
10122# include "ruby/thread_native.h"
10123# include "ruby/atomic.h"
10124
10125static struct {
10126 rb_nativethread_lock_t lock;
10127} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10128
10129static void
10130crypt_mutex_initialize(void)
10131{
10132}
10133#endif
10134
10135/*
10136 * call-seq:
10137 * crypt(salt_str) -> new_string
10138 *
10139 * Returns the string generated by calling <code>crypt(3)</code>
10140 * standard library function with <code>str</code> and
10141 * <code>salt_str</code>, in this order, as its arguments. Please do
10142 * not use this method any longer. It is legacy; provided only for
10143 * backward compatibility with ruby scripts in earlier days. It is
10144 * bad to use in contemporary programs for several reasons:
10145 *
10146 * * Behaviour of C's <code>crypt(3)</code> depends on the OS on which
10147 * it is run. The generated string lacks data portability.
10148 *
10149 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10150 * (i.e. silently ends up in unexpected results).
10151 *
10152 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10153 * thread safe.
10154 *
10155 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10156 * very very weak. According to its manpage, Linux's traditional
10157 * <code>crypt(3)</code> output has only 2**56 variations; too
10158 * easy to brute force today. And this is the default behaviour.
10159 *
10160 * * In order to make things robust some OSes implement so-called
10161 * "modular" usage. To go through, you have to do a complex
10162 * build-up of the <code>salt_str</code> parameter, by hand.
10163 * Failure in generation of a proper salt string tends not to
10164 * yield any errors; typos in parameters are normally not
10165 * detectable.
10166 *
10167 * * For instance, in the following example, the second invocation
10168 * of String#crypt is wrong; it has a typo in "round=" (lacks
10169 * "s"). However the call does not fail and something unexpected
10170 * is generated.
10171 *
10172 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10173 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10174 *
10175 * * Even in the "modular" mode, some hash functions are considered
10176 * archaic and no longer recommended at all; for instance module
10177 * <code>$1$</code> is officially abandoned by its author: see
10178 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10179 * instance module <code>$3$</code> is considered completely
10180 * broken: see the manpage of FreeBSD.
10181 *
10182 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10183 * written above, <code>crypt(3)</code> on Mac OS never fails.
10184 * This means even if you build up a proper salt string it
10185 * generates a traditional DES hash anyways, and there is no way
10186 * for you to be aware of it.
10187 *
10188 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10189 *
10190 * If for some reason you cannot migrate to other secure contemporary
10191 * password hashing algorithms, install the string-crypt gem and
10192 * <code>require 'string/crypt'</code> to continue using it.
10193 */
10194
10195static VALUE
10196rb_str_crypt(VALUE str, VALUE salt)
10197{
10198#ifdef HAVE_CRYPT_R
10199 VALUE databuf;
10200 struct crypt_data *data;
10201# define CRYPT_END() ALLOCV_END(databuf)
10202#else
10203 extern char *crypt(const char *, const char *);
10204# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10205#endif
10206 VALUE result;
10207 const char *s, *saltp;
10208 char *res;
10209#ifdef BROKEN_CRYPT
10210 char salt_8bit_clean[3];
10211#endif
10212
10213 StringValue(salt);
10214 mustnot_wchar(str);
10215 mustnot_wchar(salt);
10216 s = StringValueCStr(str);
10217 saltp = RSTRING_PTR(salt);
10218 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10219 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10220 }
10221
10222#ifdef BROKEN_CRYPT
10223 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10224 salt_8bit_clean[0] = saltp[0] & 0x7f;
10225 salt_8bit_clean[1] = saltp[1] & 0x7f;
10226 salt_8bit_clean[2] = '\0';
10227 saltp = salt_8bit_clean;
10228 }
10229#endif
10230#ifdef HAVE_CRYPT_R
10231 data = ALLOCV(databuf, sizeof(struct crypt_data));
10232# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10233 data->initialized = 0;
10234# endif
10235 res = crypt_r(s, saltp, data);
10236#else
10237 crypt_mutex_initialize();
10238 rb_nativethread_lock_lock(&crypt_mutex.lock);
10239 res = crypt(s, saltp);
10240#endif
10241 if (!res) {
10242 int err = errno;
10243 CRYPT_END();
10244 rb_syserr_fail(err, "crypt");
10245 }
10246 result = rb_str_new_cstr(res);
10247 CRYPT_END();
10248 return result;
10249}
10250
10251
10252/*
10253 * call-seq:
10254 * ord -> integer
10255 *
10256 * :include: doc/string/ord.rdoc
10257 *
10258 */
10259
10260static VALUE
10261rb_str_ord(VALUE s)
10262{
10263 unsigned int c;
10264
10265 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10266 return UINT2NUM(c);
10267}
10268/*
10269 * call-seq:
10270 * sum(n = 16) -> integer
10271 *
10272 * :include: doc/string/sum.rdoc
10273 *
10274 */
10275
10276static VALUE
10277rb_str_sum(int argc, VALUE *argv, VALUE str)
10278{
10279 int bits = 16;
10280 char *ptr, *p, *pend;
10281 long len;
10282 VALUE sum = INT2FIX(0);
10283 unsigned long sum0 = 0;
10284
10285 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10286 bits = 0;
10287 }
10288 ptr = p = RSTRING_PTR(str);
10289 len = RSTRING_LEN(str);
10290 pend = p + len;
10291
10292 while (p < pend) {
10293 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10294 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10295 str_mod_check(str, ptr, len);
10296 sum0 = 0;
10297 }
10298 sum0 += (unsigned char)*p;
10299 p++;
10300 }
10301
10302 if (bits == 0) {
10303 if (sum0) {
10304 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10305 }
10306 }
10307 else {
10308 if (sum == INT2FIX(0)) {
10309 if (bits < (int)sizeof(long)*CHAR_BIT) {
10310 sum0 &= (((unsigned long)1)<<bits)-1;
10311 }
10312 sum = LONG2FIX(sum0);
10313 }
10314 else {
10315 VALUE mod;
10316
10317 if (sum0) {
10318 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10319 }
10320
10321 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10322 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10323 sum = rb_funcall(sum, '&', 1, mod);
10324 }
10325 }
10326 return sum;
10327}
10328
10329static VALUE
10330rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10331{
10332 rb_encoding *enc;
10333 VALUE w;
10334 long width, len, flen = 1, fclen = 1;
10335 VALUE res;
10336 char *p;
10337 const char *f = " ";
10338 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10339 VALUE pad;
10340 int singlebyte = 1, cr;
10341 int termlen;
10342
10343 rb_scan_args(argc, argv, "11", &w, &pad);
10344 enc = STR_ENC_GET(str);
10345 termlen = rb_enc_mbminlen(enc);
10346 width = NUM2LONG(w);
10347 if (argc == 2) {
10348 StringValue(pad);
10349 enc = rb_enc_check(str, pad);
10350 f = RSTRING_PTR(pad);
10351 flen = RSTRING_LEN(pad);
10352 fclen = str_strlen(pad, enc); /* rb_enc_check */
10353 singlebyte = single_byte_optimizable(pad);
10354 if (flen == 0 || fclen == 0) {
10355 rb_raise(rb_eArgError, "zero width padding");
10356 }
10357 }
10358 len = str_strlen(str, enc); /* rb_enc_check */
10359 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
10360 n = width - len;
10361 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10362 rlen = n - llen;
10363 cr = ENC_CODERANGE(str);
10364 if (flen > 1) {
10365 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10366 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10367 }
10368 size = RSTRING_LEN(str);
10369 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10370 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10371 (len += llen2 + rlen2) >= LONG_MAX - size) {
10372 rb_raise(rb_eArgError, "argument too big");
10373 }
10374 len += size;
10375 res = str_new0(rb_cString, 0, len, termlen);
10376 p = RSTRING_PTR(res);
10377 if (flen <= 1) {
10378 memset(p, *f, llen);
10379 p += llen;
10380 }
10381 else {
10382 while (llen >= fclen) {
10383 memcpy(p,f,flen);
10384 p += flen;
10385 llen -= fclen;
10386 }
10387 if (llen > 0) {
10388 memcpy(p, f, llen2);
10389 p += llen2;
10390 }
10391 }
10392 memcpy(p, RSTRING_PTR(str), size);
10393 p += size;
10394 if (flen <= 1) {
10395 memset(p, *f, rlen);
10396 p += rlen;
10397 }
10398 else {
10399 while (rlen >= fclen) {
10400 memcpy(p,f,flen);
10401 p += flen;
10402 rlen -= fclen;
10403 }
10404 if (rlen > 0) {
10405 memcpy(p, f, rlen2);
10406 p += rlen2;
10407 }
10408 }
10409 TERM_FILL(p, termlen);
10410 STR_SET_LEN(res, p-RSTRING_PTR(res));
10411 rb_enc_associate(res, enc);
10412 if (argc == 2)
10413 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
10414 if (cr != ENC_CODERANGE_BROKEN)
10415 ENC_CODERANGE_SET(res, cr);
10416
10417 RB_GC_GUARD(pad);
10418 return res;
10419}
10420
10421
10422/*
10423 * call-seq:
10424 * ljust(size, pad_string = ' ') -> new_string
10425 *
10426 * :include: doc/string/ljust.rdoc
10427 *
10428 * Related: String#rjust, String#center.
10429 *
10430 */
10431
10432static VALUE
10433rb_str_ljust(int argc, VALUE *argv, VALUE str)
10434{
10435 return rb_str_justify(argc, argv, str, 'l');
10436}
10437
10438/*
10439 * call-seq:
10440 * rjust(size, pad_string = ' ') -> new_string
10441 *
10442 * :include: doc/string/rjust.rdoc
10443 *
10444 * Related: String#ljust, String#center.
10445 *
10446 */
10447
10448static VALUE
10449rb_str_rjust(int argc, VALUE *argv, VALUE str)
10450{
10451 return rb_str_justify(argc, argv, str, 'r');
10452}
10453
10454
10455/*
10456 * call-seq:
10457 * center(size, pad_string = ' ') -> new_string
10458 *
10459 * :include: doc/string/center.rdoc
10460 *
10461 * Related: String#ljust, String#rjust.
10462 *
10463 */
10464
10465static VALUE
10466rb_str_center(int argc, VALUE *argv, VALUE str)
10467{
10468 return rb_str_justify(argc, argv, str, 'c');
10469}
10470
10471/*
10472 * call-seq:
10473 * partition(string_or_regexp) -> [head, match, tail]
10474 *
10475 * :include: doc/string/partition.rdoc
10476 *
10477 */
10478
10479static VALUE
10480rb_str_partition(VALUE str, VALUE sep)
10481{
10482 long pos;
10483
10484 sep = get_pat_quoted(sep, 0);
10485 if (RB_TYPE_P(sep, T_REGEXP)) {
10486 if (rb_reg_search(sep, str, 0, 0) < 0) {
10487 goto failed;
10488 }
10489 VALUE match = rb_backref_get();
10490 struct re_registers *regs = RMATCH_REGS(match);
10491
10492 pos = BEG(0);
10493 sep = rb_str_subseq(str, pos, END(0) - pos);
10494 }
10495 else {
10496 pos = rb_str_index(str, sep, 0);
10497 if (pos < 0) goto failed;
10498 }
10499 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10500 sep,
10501 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10502 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10503
10504 failed:
10505 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
10506}
10507
10508/*
10509 * call-seq:
10510 * rpartition(sep) -> [head, match, tail]
10511 *
10512 * :include: doc/string/rpartition.rdoc
10513 *
10514 */
10515
10516static VALUE
10517rb_str_rpartition(VALUE str, VALUE sep)
10518{
10519 long pos = RSTRING_LEN(str);
10520
10521 sep = get_pat_quoted(sep, 0);
10522 if (RB_TYPE_P(sep, T_REGEXP)) {
10523 if (rb_reg_search(sep, str, pos, 1) < 0) {
10524 goto failed;
10525 }
10526 VALUE match = rb_backref_get();
10527 struct re_registers *regs = RMATCH_REGS(match);
10528
10529 pos = BEG(0);
10530 sep = rb_str_subseq(str, pos, END(0) - pos);
10531 }
10532 else {
10533 pos = rb_str_sublen(str, pos);
10534 pos = rb_str_rindex(str, sep, pos);
10535 if (pos < 0) {
10536 goto failed;
10537 }
10538 pos = rb_str_offset(str, pos);
10539 }
10540
10541 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10542 sep,
10543 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10544 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10545 failed:
10546 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
10547}
10548
10549/*
10550 * call-seq:
10551 * start_with?(*string_or_regexp) -> true or false
10552 *
10553 * :include: doc/string/start_with_p.rdoc
10554 *
10555 */
10556
10557static VALUE
10558rb_str_start_with(int argc, VALUE *argv, VALUE str)
10559{
10560 int i;
10561
10562 for (i=0; i<argc; i++) {
10563 VALUE tmp = argv[i];
10564 if (RB_TYPE_P(tmp, T_REGEXP)) {
10565 if (rb_reg_start_with_p(tmp, str))
10566 return Qtrue;
10567 }
10568 else {
10569 StringValue(tmp);
10570 rb_enc_check(str, tmp);
10571 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
10572 if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
10573 return Qtrue;
10574 }
10575 }
10576 return Qfalse;
10577}
10578
10579/*
10580 * call-seq:
10581 * end_with?(*strings) -> true or false
10582 *
10583 * :include: doc/string/end_with_p.rdoc
10584 *
10585 */
10586
10587static VALUE
10588rb_str_end_with(int argc, VALUE *argv, VALUE str)
10589{
10590 int i;
10591 char *p, *s, *e;
10592 rb_encoding *enc;
10593
10594 for (i=0; i<argc; i++) {
10595 VALUE tmp = argv[i];
10596 long slen, tlen;
10597 StringValue(tmp);
10598 enc = rb_enc_check(str, tmp);
10599 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
10600 if ((slen = RSTRING_LEN(str)) < tlen) continue;
10601 p = RSTRING_PTR(str);
10602 e = p + slen;
10603 s = e - tlen;
10604 if (rb_enc_left_char_head(p, s, e, enc) != s)
10605 continue;
10606 if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
10607 return Qtrue;
10608 }
10609 return Qfalse;
10610}
10611
10621static long
10622deleted_prefix_length(VALUE str, VALUE prefix)
10623{
10624 char *strptr, *prefixptr;
10625 long olen, prefixlen;
10626
10627 StringValue(prefix);
10628 if (is_broken_string(prefix)) return 0;
10629 rb_enc_check(str, prefix);
10630
10631 /* return 0 if not start with prefix */
10632 prefixlen = RSTRING_LEN(prefix);
10633 if (prefixlen <= 0) return 0;
10634 olen = RSTRING_LEN(str);
10635 if (olen < prefixlen) return 0;
10636 strptr = RSTRING_PTR(str);
10637 prefixptr = RSTRING_PTR(prefix);
10638 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
10639
10640 return prefixlen;
10641}
10642
10643/*
10644 * call-seq:
10645 * delete_prefix!(prefix) -> self or nil
10646 *
10647 * Like String#delete_prefix, except that +self+ is modified in place.
10648 * Returns +self+ if the prefix is removed, +nil+ otherwise.
10649 *
10650 */
10651
10652static VALUE
10653rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
10654{
10655 long prefixlen;
10656 str_modify_keep_cr(str);
10657
10658 prefixlen = deleted_prefix_length(str, prefix);
10659 if (prefixlen <= 0) return Qnil;
10660
10661 return rb_str_drop_bytes(str, prefixlen);
10662}
10663
10664/*
10665 * call-seq:
10666 * delete_prefix(prefix) -> new_string
10667 *
10668 * :include: doc/string/delete_prefix.rdoc
10669 *
10670 */
10671
10672static VALUE
10673rb_str_delete_prefix(VALUE str, VALUE prefix)
10674{
10675 long prefixlen;
10676
10677 prefixlen = deleted_prefix_length(str, prefix);
10678 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
10679
10680 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
10681}
10682
10692static long
10693deleted_suffix_length(VALUE str, VALUE suffix)
10694{
10695 char *strptr, *suffixptr, *s;
10696 long olen, suffixlen;
10697 rb_encoding *enc;
10698
10699 StringValue(suffix);
10700 if (is_broken_string(suffix)) return 0;
10701 enc = rb_enc_check(str, suffix);
10702
10703 /* return 0 if not start with suffix */
10704 suffixlen = RSTRING_LEN(suffix);
10705 if (suffixlen <= 0) return 0;
10706 olen = RSTRING_LEN(str);
10707 if (olen < suffixlen) return 0;
10708 strptr = RSTRING_PTR(str);
10709 suffixptr = RSTRING_PTR(suffix);
10710 s = strptr + olen - suffixlen;
10711 if (memcmp(s, suffixptr, suffixlen) != 0) return 0;
10712 if (rb_enc_left_char_head(strptr, s, strptr + olen, enc) != s) return 0;
10713
10714 return suffixlen;
10715}
10716
10717/*
10718 * call-seq:
10719 * delete_suffix!(suffix) -> self or nil
10720 *
10721 * Like String#delete_suffix, except that +self+ is modified in place.
10722 * Returns +self+ if the suffix is removed, +nil+ otherwise.
10723 *
10724 */
10725
10726static VALUE
10727rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
10728{
10729 long olen, suffixlen, len;
10730 str_modifiable(str);
10731
10732 suffixlen = deleted_suffix_length(str, suffix);
10733 if (suffixlen <= 0) return Qnil;
10734
10735 olen = RSTRING_LEN(str);
10736 str_modify_keep_cr(str);
10737 len = olen - suffixlen;
10738 STR_SET_LEN(str, len);
10739 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10740 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10742 }
10743 return str;
10744}
10745
10746/*
10747 * call-seq:
10748 * delete_suffix(suffix) -> new_string
10749 *
10750 * :include: doc/string/delete_suffix.rdoc
10751 *
10752 */
10753
10754static VALUE
10755rb_str_delete_suffix(VALUE str, VALUE suffix)
10756{
10757 long suffixlen;
10758
10759 suffixlen = deleted_suffix_length(str, suffix);
10760 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
10761
10762 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
10763}
10764
10765void
10766rb_str_setter(VALUE val, ID id, VALUE *var)
10767{
10768 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
10769 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
10770 }
10771 *var = val;
10772}
10773
10774static void
10775rb_fs_setter(VALUE val, ID id, VALUE *var)
10776{
10777 val = rb_fs_check(val);
10778 if (!val) {
10780 "value of %"PRIsVALUE" must be String or Regexp",
10781 rb_id2str(id));
10782 }
10783 if (!NIL_P(val)) {
10784 rb_warn_deprecated("`$;'", NULL);
10785 }
10786 *var = val;
10787}
10788
10789
10790/*
10791 * call-seq:
10792 * force_encoding(encoding) -> self
10793 *
10794 * :include: doc/string/force_encoding.rdoc
10795 *
10796 */
10797
10798static VALUE
10799rb_str_force_encoding(VALUE str, VALUE enc)
10800{
10801 str_modifiable(str);
10802 rb_enc_associate(str, rb_to_encoding(enc));
10804 return str;
10805}
10806
10807/*
10808 * call-seq:
10809 * b -> string
10810 *
10811 * :include: doc/string/b.rdoc
10812 *
10813 */
10814
10815static VALUE
10816rb_str_b(VALUE str)
10817{
10818 VALUE str2;
10819 if (FL_TEST(str, STR_NOEMBED)) {
10820 str2 = str_alloc_heap(rb_cString);
10821 }
10822 else {
10823 str2 = str_alloc_embed(rb_cString, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
10824 }
10825 str_replace_shared_without_enc(str2, str);
10826
10827 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
10828 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
10829 // If we know the receiver's code range then we know the result's code range.
10830 int cr = ENC_CODERANGE(str);
10831 switch (cr) {
10832 case ENC_CODERANGE_7BIT:
10834 break;
10838 break;
10839 default:
10840 ENC_CODERANGE_CLEAR(str2);
10841 break;
10842 }
10843 }
10844
10845 return str2;
10846}
10847
10848/*
10849 * call-seq:
10850 * valid_encoding? -> true or false
10851 *
10852 * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
10853 *
10854 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? # => true
10855 * "\xc2".force_encoding("UTF-8").valid_encoding? # => false
10856 * "\x80".force_encoding("UTF-8").valid_encoding? # => false
10857 */
10858
10859static VALUE
10860rb_str_valid_encoding_p(VALUE str)
10861{
10862 int cr = rb_enc_str_coderange(str);
10863
10864 return RBOOL(cr != ENC_CODERANGE_BROKEN);
10865}
10866
10867/*
10868 * call-seq:
10869 * ascii_only? -> true or false
10870 *
10871 * Returns +true+ if +self+ contains only ASCII characters,
10872 * +false+ otherwise:
10873 *
10874 * 'abc'.ascii_only? # => true
10875 * "abc\u{6666}".ascii_only? # => false
10876 *
10877 */
10878
10879static VALUE
10880rb_str_is_ascii_only_p(VALUE str)
10881{
10882 int cr = rb_enc_str_coderange(str);
10883
10884 return RBOOL(cr == ENC_CODERANGE_7BIT);
10885}
10886
10887VALUE
10889{
10890 static const char ellipsis[] = "...";
10891 const long ellipsislen = sizeof(ellipsis) - 1;
10892 rb_encoding *const enc = rb_enc_get(str);
10893 const long blen = RSTRING_LEN(str);
10894 const char *const p = RSTRING_PTR(str), *e = p + blen;
10895 VALUE estr, ret = 0;
10896
10897 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
10898 if (len * rb_enc_mbminlen(enc) >= blen ||
10899 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
10900 ret = str;
10901 }
10902 else if (len <= ellipsislen ||
10903 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
10904 if (rb_enc_asciicompat(enc)) {
10905 ret = rb_str_new(ellipsis, len);
10906 rb_enc_associate(ret, enc);
10907 }
10908 else {
10909 estr = rb_usascii_str_new(ellipsis, len);
10910 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
10911 }
10912 }
10913 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
10914 rb_str_cat(ret, ellipsis, ellipsislen);
10915 }
10916 else {
10917 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
10918 rb_enc_from_encoding(enc), 0, Qnil);
10919 rb_str_append(ret, estr);
10920 }
10921 return ret;
10922}
10923
10924static VALUE
10925str_compat_and_valid(VALUE str, rb_encoding *enc)
10926{
10927 int cr;
10928 str = StringValue(str);
10929 cr = rb_enc_str_coderange(str);
10930 if (cr == ENC_CODERANGE_BROKEN) {
10931 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
10932 }
10933 else {
10934 rb_encoding *e = STR_ENC_GET(str);
10935 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
10936 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
10937 rb_enc_name(enc), rb_enc_name(e));
10938 }
10939 }
10940 return str;
10941}
10942
10943static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
10944
10945VALUE
10947{
10948 rb_encoding *enc = STR_ENC_GET(str);
10949 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
10950}
10951
10952VALUE
10953rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
10954{
10955 int cr = ENC_CODERANGE_UNKNOWN;
10956 if (enc == STR_ENC_GET(str)) {
10957 /* cached coderange makes sense only when enc equals the
10958 * actual encoding of str */
10959 cr = ENC_CODERANGE(str);
10960 }
10961 return enc_str_scrub(enc, str, repl, cr);
10962}
10963
10964static VALUE
10965enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
10966{
10967 int encidx;
10968 VALUE buf = Qnil;
10969 const char *rep, *p, *e, *p1, *sp;
10970 long replen = -1;
10971 long slen;
10972
10973 if (rb_block_given_p()) {
10974 if (!NIL_P(repl))
10975 rb_raise(rb_eArgError, "both of block and replacement given");
10976 replen = 0;
10977 }
10978
10979 if (ENC_CODERANGE_CLEAN_P(cr))
10980 return Qnil;
10981
10982 if (!NIL_P(repl)) {
10983 repl = str_compat_and_valid(repl, enc);
10984 }
10985
10986 if (rb_enc_dummy_p(enc)) {
10987 return Qnil;
10988 }
10989 encidx = rb_enc_to_index(enc);
10990
10991#define DEFAULT_REPLACE_CHAR(str) do { \
10992 static const char replace[sizeof(str)-1] = str; \
10993 rep = replace; replen = (int)sizeof(replace); \
10994 } while (0)
10995
10996 slen = RSTRING_LEN(str);
10997 p = RSTRING_PTR(str);
10998 e = RSTRING_END(str);
10999 p1 = p;
11000 sp = p;
11001
11002 if (rb_enc_asciicompat(enc)) {
11003 int rep7bit_p;
11004 if (!replen) {
11005 rep = NULL;
11006 rep7bit_p = FALSE;
11007 }
11008 else if (!NIL_P(repl)) {
11009 rep = RSTRING_PTR(repl);
11010 replen = RSTRING_LEN(repl);
11011 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11012 }
11013 else if (encidx == rb_utf8_encindex()) {
11014 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11015 rep7bit_p = FALSE;
11016 }
11017 else {
11018 DEFAULT_REPLACE_CHAR("?");
11019 rep7bit_p = TRUE;
11020 }
11021 cr = ENC_CODERANGE_7BIT;
11022
11023 p = search_nonascii(p, e);
11024 if (!p) {
11025 p = e;
11026 }
11027 while (p < e) {
11028 int ret = rb_enc_precise_mbclen(p, e, enc);
11029 if (MBCLEN_NEEDMORE_P(ret)) {
11030 break;
11031 }
11032 else if (MBCLEN_CHARFOUND_P(ret)) {
11034 p += MBCLEN_CHARFOUND_LEN(ret);
11035 }
11036 else if (MBCLEN_INVALID_P(ret)) {
11037 /*
11038 * p1~p: valid ascii/multibyte chars
11039 * p ~e: invalid bytes + unknown bytes
11040 */
11041 long clen = rb_enc_mbmaxlen(enc);
11042 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11043 if (p > p1) {
11044 rb_str_buf_cat(buf, p1, p - p1);
11045 }
11046
11047 if (e - p < clen) clen = e - p;
11048 if (clen <= 2) {
11049 clen = 1;
11050 }
11051 else {
11052 const char *q = p;
11053 clen--;
11054 for (; clen > 1; clen--) {
11055 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11056 if (MBCLEN_NEEDMORE_P(ret)) break;
11057 if (MBCLEN_INVALID_P(ret)) continue;
11059 }
11060 }
11061 if (rep) {
11062 rb_str_buf_cat(buf, rep, replen);
11063 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11064 }
11065 else {
11066 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11067 str_mod_check(str, sp, slen);
11068 repl = str_compat_and_valid(repl, enc);
11069 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11072 }
11073 p += clen;
11074 p1 = p;
11075 p = search_nonascii(p, e);
11076 if (!p) {
11077 p = e;
11078 break;
11079 }
11080 }
11081 else {
11083 }
11084 }
11085 if (NIL_P(buf)) {
11086 if (p == e) {
11087 ENC_CODERANGE_SET(str, cr);
11088 return Qnil;
11089 }
11090 buf = rb_str_buf_new(RSTRING_LEN(str));
11091 }
11092 if (p1 < p) {
11093 rb_str_buf_cat(buf, p1, p - p1);
11094 }
11095 if (p < e) {
11096 if (rep) {
11097 rb_str_buf_cat(buf, rep, replen);
11098 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11099 }
11100 else {
11101 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11102 str_mod_check(str, sp, slen);
11103 repl = str_compat_and_valid(repl, enc);
11104 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11107 }
11108 }
11109 }
11110 else {
11111 /* ASCII incompatible */
11112 long mbminlen = rb_enc_mbminlen(enc);
11113 if (!replen) {
11114 rep = NULL;
11115 }
11116 else if (!NIL_P(repl)) {
11117 rep = RSTRING_PTR(repl);
11118 replen = RSTRING_LEN(repl);
11119 }
11120 else if (encidx == ENCINDEX_UTF_16BE) {
11121 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11122 }
11123 else if (encidx == ENCINDEX_UTF_16LE) {
11124 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11125 }
11126 else if (encidx == ENCINDEX_UTF_32BE) {
11127 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11128 }
11129 else if (encidx == ENCINDEX_UTF_32LE) {
11130 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11131 }
11132 else {
11133 DEFAULT_REPLACE_CHAR("?");
11134 }
11135
11136 while (p < e) {
11137 int ret = rb_enc_precise_mbclen(p, e, enc);
11138 if (MBCLEN_NEEDMORE_P(ret)) {
11139 break;
11140 }
11141 else if (MBCLEN_CHARFOUND_P(ret)) {
11142 p += MBCLEN_CHARFOUND_LEN(ret);
11143 }
11144 else if (MBCLEN_INVALID_P(ret)) {
11145 const char *q = p;
11146 long clen = rb_enc_mbmaxlen(enc);
11147 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11148 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11149
11150 if (e - p < clen) clen = e - p;
11151 if (clen <= mbminlen * 2) {
11152 clen = mbminlen;
11153 }
11154 else {
11155 clen -= mbminlen;
11156 for (; clen > mbminlen; clen-=mbminlen) {
11157 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11158 if (MBCLEN_NEEDMORE_P(ret)) break;
11159 if (MBCLEN_INVALID_P(ret)) continue;
11161 }
11162 }
11163 if (rep) {
11164 rb_str_buf_cat(buf, rep, replen);
11165 }
11166 else {
11167 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11168 str_mod_check(str, sp, slen);
11169 repl = str_compat_and_valid(repl, enc);
11170 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11171 }
11172 p += clen;
11173 p1 = p;
11174 }
11175 else {
11177 }
11178 }
11179 if (NIL_P(buf)) {
11180 if (p == e) {
11182 return Qnil;
11183 }
11184 buf = rb_str_buf_new(RSTRING_LEN(str));
11185 }
11186 if (p1 < p) {
11187 rb_str_buf_cat(buf, p1, p - p1);
11188 }
11189 if (p < e) {
11190 if (rep) {
11191 rb_str_buf_cat(buf, rep, replen);
11192 }
11193 else {
11194 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11195 str_mod_check(str, sp, slen);
11196 repl = str_compat_and_valid(repl, enc);
11197 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11198 }
11199 }
11201 }
11202 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11203 return buf;
11204}
11205
11206/*
11207 * call-seq:
11208 * scrub(replacement_string = default_replacement) -> new_string
11209 * scrub{|bytes| ... } -> new_string
11210 *
11211 * :include: doc/string/scrub.rdoc
11212 *
11213 */
11214static VALUE
11215str_scrub(int argc, VALUE *argv, VALUE str)
11216{
11217 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11218 VALUE new = rb_str_scrub(str, repl);
11219 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11220}
11221
11222/*
11223 * call-seq:
11224 * scrub! -> self
11225 * scrub!(replacement_string = default_replacement) -> self
11226 * scrub!{|bytes| ... } -> self
11227 *
11228 * Like String#scrub, except that any replacements are made in +self+.
11229 *
11230 */
11231static VALUE
11232str_scrub_bang(int argc, VALUE *argv, VALUE str)
11233{
11234 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11235 VALUE new = rb_str_scrub(str, repl);
11236 if (!NIL_P(new)) rb_str_replace(str, new);
11237 return str;
11238}
11239
/* Method IDs ("normalize", "normalized?") and the UnicodeNormalize module
 * handle used by unicode_normalize_common(); all three are assigned in
 * Init_String(). */
11240 static ID id_normalize;
11241 static ID id_normalized_p;
11242 static VALUE mUnicodeNormalize;
11243
11244static VALUE
11245unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11246{
11247 static int UnicodeNormalizeRequired = 0;
11248 VALUE argv2[2];
11249
11250 if (!UnicodeNormalizeRequired) {
11251 rb_require("unicode_normalize/normalize.rb");
11252 UnicodeNormalizeRequired = 1;
11253 }
11254 argv2[0] = str;
11255 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11256 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11257}
11258
11259/*
11260 * call-seq:
11261 * unicode_normalize(form = :nfc) -> string
11262 *
11263 * Returns a copy of +self+ with
11264 * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
11265 *
11266 * Argument +form+ must be one of the following symbols
11267 * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
11268 *
11269 * - +:nfc+: Canonical decomposition, followed by canonical composition.
11270 * - +:nfd+: Canonical decomposition.
11271 * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
11272 * - +:nfkd+: Compatibility decomposition.
11273 *
11274 * The encoding of +self+ must be one of:
11275 *
11276 * - Encoding::UTF_8
11277 * - Encoding::UTF_16BE
11278 * - Encoding::UTF_16LE
11279 * - Encoding::UTF_32BE
11280 * - Encoding::UTF_32LE
11281 * - Encoding::GB18030
11282 * - Encoding::UCS_2BE
11283 * - Encoding::UCS_4BE
11284 *
11285 * Examples:
11286 *
11287 * "a\u0300".unicode_normalize # => "\u00E0" (single precomposed character)
11288 * "\u00E0".unicode_normalize(:nfd) # => "a\u0300" (base letter plus combining grave accent)
11289 *
11290 * Related: String#unicode_normalize!, String#unicode_normalized?.
11291 */
11292static VALUE
11293rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11294{
11295 return unicode_normalize_common(argc, argv, str, id_normalize);
11296}
11297
11298/*
11299 * call-seq:
11300 * unicode_normalize!(form = :nfc) -> self
11301 *
11302 * Like String#unicode_normalize, except that the normalization
11303 * is performed on +self+.
11304 *
11305 * Related: String#unicode_normalize, String#unicode_normalized?.
11306 *
11307 */
11308static VALUE
11309rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11310{
11311 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11312}
11313
11314/* call-seq:
11315 * unicode_normalized?(form = :nfc) -> true or false
11316 *
11317 * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
11318 * +false+ otherwise.
11319 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11320 *
11321 * Examples:
11322 *
11323 * "a\u0300".unicode_normalized? # => false
11324 * "a\u0300".unicode_normalized?(:nfd) # => true
11325 * "\u00E0".unicode_normalized? # => true
11326 * "\u00E0".unicode_normalized?(:nfd) # => false
11327 *
11328 *
11329 * Raises an exception if +self+ is not in a Unicode encoding:
11330 *
11331 * s = "\xE0".force_encoding('ISO-8859-1')
11332 * s.unicode_normalized? # Raises Encoding::CompatibilityError.
11333 *
11334 * Related: String#unicode_normalize, String#unicode_normalize!.
11335 *
11336 */
11337static VALUE
11338rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11339{
11340 return unicode_normalize_common(argc, argv, str, id_normalized_p);
11341}
11342
11343/**********************************************************************
11344 * Document-class: Symbol
11345 *
11346 * Symbol objects represent named identifiers inside the Ruby interpreter.
11347 *
11348 * You can create a \Symbol object explicitly with:
11349 *
11350 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
11351 *
11352 * The same Symbol object will be
11353 * created for a given name or string for the duration of a program's
11354 * execution, regardless of the context or meaning of that name. Thus
11355 * if <code>Fred</code> is a constant in one context, a method in
11356 * another, and a class in a third, the Symbol <code>:Fred</code>
11357 * will be the same object in all three contexts.
11358 *
11359 * module One
11360 * class Fred
11361 * end
11362 * $f1 = :Fred
11363 * end
11364 * module Two
11365 * Fred = 1
11366 * $f2 = :Fred
11367 * end
11368 * def Fred()
11369 * end
11370 * $f3 = :Fred
11371 * $f1.object_id #=> 2514190
11372 * $f2.object_id #=> 2514190
11373 * $f3.object_id #=> 2514190
11374 *
11375 * Constant, method, and variable names are returned as symbols:
11376 *
11377 * module One
11378 * Two = 2
11379 * def three; 3 end
11380 * @four = 4
11381 * @@five = 5
11382 * $six = 6
11383 * end
11384 * seven = 7
11385 *
11386 * One.constants
11387 * # => [:Two]
11388 * One.instance_methods(true)
11389 * # => [:three]
11390 * One.instance_variables
11391 * # => [:@four]
11392 * One.class_variables
11393 * # => [:@@five]
11394 * global_variables.grep(/six/)
11395 * # => [:$six]
11396 * local_variables
11397 * # => [:seven]
11398 *
11399 * Symbol objects are different from String objects in that
11400 * Symbol objects represent identifiers, while String objects
11401 * represent text or data.
11402 *
11403 * == What's Here
11404 *
11405 * First, what's elsewhere. \Class \Symbol:
11406 *
11407 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
11408 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
11409 *
11410 * Here, class \Symbol provides methods that are useful for:
11411 *
11412 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
11413 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
11414 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
11415 *
11416 * === Methods for Querying
11417 *
11418 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
11419 * - #=~: Returns the index of the first substring in symbol that matches a
11420 * given Regexp or other object; returns +nil+ if no match is found.
11421 * - #[], #slice : Returns a substring of symbol
11422 * determined by a given index, start/length, or range, or string.
11423 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
11424 * - #encoding: Returns the Encoding object that represents the encoding
11425 * of symbol.
11426 * - #end_with?: Returns +true+ if symbol ends with
11427 * any of the given strings.
11428 * - #match: Returns a MatchData object if symbol
11429 * matches a given Regexp; +nil+ otherwise.
11430 * - #match?: Returns +true+ if symbol
11431 * matches a given Regexp; +false+ otherwise.
11432 * - #length, #size: Returns the number of characters in symbol.
11433 * - #start_with?: Returns +true+ if symbol starts with
11434 * any of the given strings.
11435 *
11436 * === Methods for Comparing
11437 *
11438 * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
11439 * or larger than symbol.
11440 * - #==, #===: Returns +true+ if a given symbol has the same content and
11441 * encoding.
11442 * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
11443 * symbol is smaller than, equal to, or larger than symbol.
11444 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
11445 * after Unicode case folding; +false+ otherwise.
11446 *
11447 * === Methods for Converting
11448 *
11449 * - #capitalize: Returns symbol with the first character upcased
11450 * and all other characters downcased.
11451 * - #downcase: Returns symbol with all characters downcased.
11452 * - #inspect: Returns the string representation of +self+ as a symbol literal.
11453 * - #name: Returns the frozen string corresponding to symbol.
11454 * - #succ, #next: Returns the symbol that is the successor to symbol.
11455 * - #swapcase: Returns symbol with all upcase characters downcased
11456 * and all downcase characters upcased.
11457 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
11458 * - #to_s, #id2name: Returns the string corresponding to +self+.
11459 * - #to_sym, #intern: Returns +self+.
11460 * - #upcase: Returns symbol with all characters upcased.
11461 *
11462 */
11463
11464
11465/*
11466 * call-seq:
11467 * symbol == object -> true or false
11468 *
11469 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
11470 *
11471 * Symbol#=== is an alias for Symbol#==.
11472 *
11473 */
11474
11475 #define sym_equal rb_obj_equal
/* sym_equal is what Init_String() registers for Symbol#== and Symbol#===,
 * so both methods run rb_obj_equal directly. */
11476
11477static int
11478sym_printable(const char *s, const char *send, rb_encoding *enc)
11479{
11480 while (s < send) {
11481 int n;
11482 int c = rb_enc_precise_mbclen(s, send, enc);
11483
11484 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
11485 n = MBCLEN_CHARFOUND_LEN(c);
11486 c = rb_enc_mbc_to_codepoint(s, send, enc);
11487 if (!rb_enc_isprint(c, enc)) return FALSE;
11488 s += n;
11489 }
11490 return TRUE;
11491}
11492
11493int
11494rb_str_symname_p(VALUE sym)
11495{
11496 rb_encoding *enc;
11497 const char *ptr;
11498 long len;
11499 rb_encoding *resenc = rb_default_internal_encoding();
11500
11501 if (resenc == NULL) resenc = rb_default_external_encoding();
11502 enc = STR_ENC_GET(sym);
11503 ptr = RSTRING_PTR(sym);
11504 len = RSTRING_LEN(sym);
11505 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
11506 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
11507 return FALSE;
11508 }
11509 return TRUE;
11510}
11511
11512VALUE
11513rb_str_quote_unprintable(VALUE str)
11514{
11515 rb_encoding *enc;
11516 const char *ptr;
11517 long len;
11518 rb_encoding *resenc;
11519
11520 Check_Type(str, T_STRING);
11521 resenc = rb_default_internal_encoding();
11522 if (resenc == NULL) resenc = rb_default_external_encoding();
11523 enc = STR_ENC_GET(str);
11524 ptr = RSTRING_PTR(str);
11525 len = RSTRING_LEN(str);
11526 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
11527 !sym_printable(ptr, ptr + len, enc)) {
11528 return rb_str_escape(str);
11529 }
11530 return str;
11531}
11532
11533MJIT_FUNC_EXPORTED VALUE
11534rb_id_quote_unprintable(ID id)
11535{
11536 VALUE str = rb_id2str(id);
11537 if (!rb_str_symname_p(str)) {
11538 return rb_str_escape(str);
11539 }
11540 return str;
11541}
11542
11543/*
11544 * call-seq:
11545 * inspect -> string
11546 *
11547 * Returns a string representation of +self+ (including the leading colon):
11548 *
11549 * :foo.inspect # => ":foo"
11550 *
11551 * Related: Symbol#to_s, Symbol#name.
11552 *
11553 */
11554
11555static VALUE
11556sym_inspect(VALUE sym)
11557{
11558 VALUE str = rb_sym2str(sym);
11559 const char *ptr;
11560 long len;
11561 char *dest;
11562
11563 if (!rb_str_symname_p(str)) {
11564 str = rb_str_inspect(str);
11565 len = RSTRING_LEN(str);
11566 rb_str_resize(str, len + 1);
11567 dest = RSTRING_PTR(str);
11568 memmove(dest + 1, dest, len);
11569 }
11570 else {
11571 rb_encoding *enc = STR_ENC_GET(str);
11572 RSTRING_GETMEM(str, ptr, len);
11573 str = rb_enc_str_new(0, len + 1, enc);
11574 dest = RSTRING_PTR(str);
11575 memcpy(dest + 1, ptr, len);
11576 }
11577 dest[0] = ':';
11578 return str;
11579}
11580
11581/*
11582 * call-seq:
11583 * to_s -> string
11584 *
11585 * Returns a string representation of +self+ (not including the leading colon):
11586 *
11587 * :foo.to_s # => "foo"
11588 *
11589 * Symbol#id2name is an alias for Symbol#to_s.
11590 *
11591 * Related: Symbol#inspect, Symbol#name.
11592 */
11593
11594VALUE
11596{
11597 return str_new_shared(rb_cString, rb_sym2str(sym));
11598}
11599
11600MJIT_FUNC_EXPORTED VALUE
11601rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
11602{
11603 VALUE obj;
11604
11605 if (argc < 1) {
11606 rb_raise(rb_eArgError, "no receiver given");
11607 }
11608 obj = argv[0];
11609 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
11610}
11611
11612/*
11613 * call-seq:
11614 * succ
11615 *
11616 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
11617 *
11618 * :foo.succ # => :fop
11619 *
11620 * Symbol#next is an alias for Symbol#succ.
11621 *
11622 * Related: String#succ.
11623 */
11624
11625static VALUE
11626sym_succ(VALUE sym)
11627{
11628 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
11629}
11630
11631/*
11632 * call-seq:
11633 * symbol <=> object -> -1, 0, +1, or nil
11634 *
11635 * If +object+ is a symbol,
11636 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
11637 *
11638 * :bar <=> :foo # => -1
11639 * :foo <=> :foo # => 0
11640 * :foo <=> :bar # => 1
11641 *
11642 * Otherwise, returns +nil+:
11643 *
11644 * :foo <=> 'bar' # => nil
11645 *
11646 * Related: String#<=>.
11647 */
11648
11649static VALUE
11650sym_cmp(VALUE sym, VALUE other)
11651{
11652 if (!SYMBOL_P(other)) {
11653 return Qnil;
11654 }
11655 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
11656}
11657
11658/*
11659 * call-seq:
11660 * casecmp(object) -> -1, 0, 1, or nil
11661 *
11662 * :include: doc/symbol/casecmp.rdoc
11663 *
11664 */
11665
11666static VALUE
11667sym_casecmp(VALUE sym, VALUE other)
11668{
11669 if (!SYMBOL_P(other)) {
11670 return Qnil;
11671 }
11672 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
11673}
11674
11675/*
11676 * call-seq:
11677 * casecmp?(object) -> true, false, or nil
11678 *
11679 * :include: doc/symbol/casecmp_p.rdoc
11680 *
11681 */
11682
11683static VALUE
11684sym_casecmp_p(VALUE sym, VALUE other)
11685{
11686 if (!SYMBOL_P(other)) {
11687 return Qnil;
11688 }
11689 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
11690}
11691
11692/*
11693 * call-seq:
11694 * symbol =~ object -> integer or nil
11695 *
11696 * Equivalent to <tt>symbol.to_s =~ object</tt>,
11697 * including possible updates to global variables;
11698 * see String#=~.
11699 *
11700 */
11701
11702static VALUE
11703sym_match(VALUE sym, VALUE other)
11704{
11705 return rb_str_match(rb_sym2str(sym), other);
11706}
11707
11708/*
11709 * call-seq:
11710 * match(pattern, offset = 0) -> matchdata or nil
11711 * match(pattern, offset = 0) {|matchdata| } -> object
11712 *
11713 * Equivalent to <tt>self.to_s.match</tt>,
11714 * including possible updates to global variables;
11715 * see String#match.
11716 *
11717 */
11718
11719static VALUE
11720sym_match_m(int argc, VALUE *argv, VALUE sym)
11721{
11722 return rb_str_match_m(argc, argv, rb_sym2str(sym));
11723}
11724
11725/*
11726 * call-seq:
11727 * match?(pattern, offset = 0) -> true or false
11728 *
11729 * Equivalent to <tt>self.to_s.match?</tt>;
11730 * see String#match?.
11731 *
11732 */
11733
11734static VALUE
11735sym_match_m_p(int argc, VALUE *argv, VALUE sym)
11736{
11737 return rb_str_match_m_p(argc, argv, sym);
11738}
11739
11740/*
11741 * call-seq:
11742 * symbol[index] -> string or nil
11743 * symbol[start, length] -> string or nil
11744 * symbol[range] -> string or nil
11745 * symbol[regexp, capture = 0] -> string or nil
11746 * symbol[substring] -> string or nil
11747 *
11748 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
11749 *
11750 */
11751
11752static VALUE
11753sym_aref(int argc, VALUE *argv, VALUE sym)
11754{
11755 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
11756}
11757
11758/*
11759 * call-seq:
11760 * length -> integer
11761 *
11762 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
11763 *
11764 * Symbol#size is an alias for Symbol#length.
11765 *
11766 */
11767
11768static VALUE
11769sym_length(VALUE sym)
11770{
11771 return rb_str_length(rb_sym2str(sym));
11772}
11773
11774/*
11775 * call-seq:
11776 * empty? -> true or false
11777 *
11778 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
11779 *
11780 */
11781
11782static VALUE
11783sym_empty(VALUE sym)
11784{
11785 return rb_str_empty(rb_sym2str(sym));
11786}
11787
11788/*
11789 * call-seq:
11790 * upcase(*options) -> symbol
11791 *
11792 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
11793 *
11794 * See String#upcase.
11795 *
11796 */
11797
11798static VALUE
11799sym_upcase(int argc, VALUE *argv, VALUE sym)
11800{
11801 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
11802}
11803
11804/*
11805 * call-seq:
11806 * downcase(*options) -> symbol
11807 *
11808 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
11809 *
11810 * See String#downcase.
11811 *
11812 * Related: Symbol#upcase.
11813 *
11814 */
11815
11816static VALUE
11817sym_downcase(int argc, VALUE *argv, VALUE sym)
11818{
11819 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
11820}
11821
11822/*
11823 * call-seq:
11824 * capitalize(*options) -> symbol
11825 *
11826 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
11827 *
11828 * See String#capitalize.
11829 *
11830 */
11831
11832static VALUE
11833sym_capitalize(int argc, VALUE *argv, VALUE sym)
11834{
11835 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
11836}
11837
11838/*
11839 * call-seq:
11840 * swapcase(*options) -> symbol
11841 *
11842 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
11843 *
11844 * See String#swapcase.
11845 *
11846 */
11847
11848static VALUE
11849sym_swapcase(int argc, VALUE *argv, VALUE sym)
11850{
11851 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
11852}
11853
11854/*
11855 * call-seq:
11856 * start_with?(*string_or_regexp) -> true or false
11857 *
11858 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
11859 *
11860 */
11861
11862static VALUE
11863sym_start_with(int argc, VALUE *argv, VALUE sym)
11864{
11865 return rb_str_start_with(argc, argv, rb_sym2str(sym));
11866}
11867
11868/*
11869 * call-seq:
11870 * end_with?(*string_or_regexp) -> true or false
11871 *
11872 *
11873 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
11874 *
11875 */
11876
11877static VALUE
11878sym_end_with(int argc, VALUE *argv, VALUE sym)
11879{
11880 return rb_str_end_with(argc, argv, rb_sym2str(sym));
11881}
11882
11883/*
11884 * call-seq:
11885 * encoding -> encoding
11886 *
11887 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
11888 *
11889 */
11890
11891static VALUE
11892sym_encoding(VALUE sym)
11893{
11894 return rb_obj_encoding(rb_sym2str(sym));
11895}
11896
11897static VALUE
11898string_for_symbol(VALUE name)
11899{
11900 if (!RB_TYPE_P(name, T_STRING)) {
11901 VALUE tmp = rb_check_string_type(name);
11902 if (NIL_P(tmp)) {
11903 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol",
11904 name);
11905 }
11906 name = tmp;
11907 }
11908 return name;
11909}
11910
11911ID
11913{
11914 if (SYMBOL_P(name)) {
11915 return SYM2ID(name);
11916 }
11917 name = string_for_symbol(name);
11918 return rb_intern_str(name);
11919}
11920
11921VALUE
11923{
11924 if (SYMBOL_P(name)) {
11925 return name;
11926 }
11927 name = string_for_symbol(name);
11928 return rb_str_intern(name);
11929}
11930
11931/*
11932 * call-seq:
11933 * Symbol.all_symbols -> array_of_symbols
11934 *
11935 * Returns an array of all symbols currently in Ruby's symbol table:
11936 *
11937 * Symbol.all_symbols.size # => 9334
11938 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
11939 *
11940 */
11941
11942static VALUE
11943sym_all_symbols(VALUE _)
11944{
11945 return rb_sym_all_symbols();
11946}
11947
11948VALUE
11950{
11951 return rb_fstring(str);
11952}
11953
11954VALUE
11955rb_interned_str(const char *ptr, long len)
11956{
11957 struct RString fake_str;
11958 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), TRUE);
11959}
11960
11961VALUE
11963{
11964 return rb_interned_str(ptr, strlen(ptr));
11965}
11966
11967VALUE
11968rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
11969{
11970 if (UNLIKELY(rb_enc_autoload_p(enc))) {
11971 rb_enc_autoload(enc);
11972 }
11973
11974 struct RString fake_str;
11975 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), TRUE);
11976}
11977
11978VALUE
11980{
11981 return rb_enc_interned_str(ptr, strlen(ptr), enc);
11982}
11983
11984void
11985Init_String(void)
11986{
11987 rb_cString = rb_define_class("String", rb_cObject);
11988 assert(rb_vm_fstring_table());
11989 st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
11991 rb_define_alloc_func(rb_cString, empty_str_alloc);
11992 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
11993 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
11994 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
11995 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
11998 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
11999 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12000 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12001 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12004 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12005 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12006 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12007 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12010 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12011 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12012 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12013 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12014 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12016 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12018 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12019 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12020 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12021 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12022 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12023 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12025 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12026 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12027 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12028 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12029 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12030 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12031 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12032 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12034 rb_define_method(rb_cString, "+@", str_uplus, 0);
12035 rb_define_method(rb_cString, "-@", str_uminus, 0);
12036 rb_define_alias(rb_cString, "dedup", "-@");
12037
12038 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12039 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12040 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12041 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12044 rb_define_method(rb_cString, "undump", str_undump, 0);
12045
12046 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12047 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12048 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12049 sym_fold = ID2SYM(rb_intern_const("fold"));
12050
12051 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12052 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12053 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12054 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12055
12056 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12057 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12058 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12059 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12060
12061 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12062 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12063 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12064 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12065 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12066 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12067 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12068 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12069 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12070 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12071 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12073 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12074 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12075 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12076 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12077 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12078
12079 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12080 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12081 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12082
12083 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12084
12085 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12086 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12087 rb_define_method(rb_cString, "center", rb_str_center, -1);
12088
12089 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12090 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12091 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12092 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12093 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12094 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12095 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12096 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12097 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12098
12099 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12100 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12101 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12102 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12103 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12104 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12105 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12106 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12107 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12108
12109 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12110 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12111 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12112 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12113 rb_define_method(rb_cString, "count", rb_str_count, -1);
12114
12115 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12116 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12117 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12118 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12119
12120 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12121 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12122 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12123 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12124 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12125
12126 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12127
12128 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12129 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12130
12131 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12132 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12133
12134 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12135 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12136 rb_define_method(rb_cString, "b", rb_str_b, 0);
12137 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12138 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12139
12140 /* define UnicodeNormalize module here so that we don't have to look it up */
12141 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12142 id_normalize = rb_intern_const("normalize");
12143 id_normalized_p = rb_intern_const("normalized?");
12144
12145 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12146 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12147 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12148
12149 rb_fs = Qnil;
12150 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12151 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12152 rb_gc_register_address(&rb_fs);
12153
12154 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12158 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12159
12160 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12161 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12162 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12164 rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
12165 rb_define_method(rb_cSymbol, "name", rb_sym2str, 0); /* in symbol.c */
12166 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12167 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12168 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12169
12170 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12171 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12172 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12173 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12174
12175 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12176 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12177 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12178 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12179 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12180 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12181 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12182
12183 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12184 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12185 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12186 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12187
12188 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12189 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12190
12191 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12192}
#define RUBY_ASSERT(expr)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:177
#define RUBY_ASSERT_ALWAYS(expr)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:167
Atomic operations.
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_isascii(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isascii(), except it additionally takes an encoding.
Definition ctype.h:82
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1200
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:906
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
Definition fl_type.h:356
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1125
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:923
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1033
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2284
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2108
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves arguments from argc and argv into the given VALUE references according to the format string.
Definition class.c:2574
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:868
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2363
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:107
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:105
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:142
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
Definition fl_type.h:67
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:398
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:145
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define OBJ_FREEZE_RAW
Old name of RB_OBJ_FREEZE_RAW.
Definition fl_type.h:144
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:143
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:203
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:108
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:395
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:393
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:533
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:140
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:137
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:652
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:66
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:534
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:535
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:97
#define NIL_P
Old name of RB_NIL_P.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:532
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:67
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:139
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:68
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:107
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:141
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:109
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:651
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:138
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:146
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:68
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it always reports regardless of the runtime -W flag.
Definition error.c:421
void rb_raise(VALUE exc, const char *fmt,...)
Exception entry point.
Definition error.c:3150
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:688
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3262
void rb_bug(const char *fmt,...)
Interpreter panic switch.
Definition error.c:794
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1095
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1091
void rb_fatal(const char *fmt,...)
Raises the unsung "fatal" exception.
Definition error.c:3201
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1098
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1089
VALUE rb_eArgError
ArgumentError exception.
Definition error.c:1092
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1093
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:589
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:1939
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1194
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby's string instead of C's.
Definition object.c:3416
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:190
VALUE rb_cSymbol
Symbol class.
Definition string.c:80
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:122
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1182
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:79
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3026
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition rgengc.h:220
Encoding-related APIs.
static const char * rb_enc_name(rb_encoding *enc)
Queries the (canonical) name of the passed encoding.
Definition encoding.h:433
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:699
static char * rb_enc_prev_char(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the previous (left) character.
Definition encoding.h:678
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:720
static bool rb_enc_asciicompat(rb_encoding *enc)
Queries if the passed encoding is in some sense compatible with ASCII.
Definition encoding.h:784
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:587
static int rb_enc_mbcput(unsigned int c, void *buf, rb_encoding *enc)
Identical to rb_enc_uint_chr(), except it writes back to the passed buffer instead of allocating one.
Definition encoding.h:659
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:463
static OnigCodePoint rb_enc_mbc_to_codepoint(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_codepoint(), except it assumes the passed character is not broken.
Definition encoding.h:607
static int rb_enc_mbminlen(rb_encoding *enc)
Queries the minimum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:448
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:635
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:742
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1208
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:821
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1074
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:2716
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1093
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:11968
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:249
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2060
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3288
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1021
VALUE rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_new(), except it additionally takes an encoding.
Definition string.c:981
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1313
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1214
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:833
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:11979
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:719
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:411
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1453
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2630
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2884
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1709
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1102
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the given array.
Definition vm_eval.c:1189
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
#define rb_check_frozen
Just another name of rb_check_frozen_inline.
Definition error.h:264
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:280
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:604
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition io.c:200
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:1662
VALUE rb_sym_all_symbols(void)
Collects every single bit of symbols that have ever been interned in the entire history of the current process.
Definition symbol.c:1010
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:1668
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1578
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1229
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4114
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3597
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1435
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1861
VALUE rb_str_to_interned_str(VALUE str)
Identical to rb_interned_str(), except it takes a Ruby's string instead of C's.
Definition string.c:11949
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1571
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby's string instead of C's.
Definition string.c:1376
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2211
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1583
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3353
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1289
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:11595
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2283
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1265
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1565
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:2744
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:4826
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:3581
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character offsets.
Definition string.c:2826
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:10888
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1741
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1498
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1618
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1681
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1056
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1532
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:871
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1382
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1834
void rb_str_modify(VALUE str)
Declares that the string is about to be modified.
Definition string.c:2437
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:3571
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3177
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2149
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
Definition string.c:1840
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1639
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1567
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6028
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
Definition string.c:2834
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1146
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
Definition string.c:11962
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1295
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1604
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition string.c:3319
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:2791
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:3683
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3020
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:6710
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2489
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:11955
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:3637
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:3453
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable with each other or not.
Definition string.c:3612
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1692
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string to be a NUL-terminated ASCII string.
Definition string.c:3295
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:2942
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5330
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:10946
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "default external".
Definition string.h:1625
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1513
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:631
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method, if any.
Definition string.c:2640
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:2921
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1656
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3003
VALUE rb_str_resize(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3064
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary".
Definition string.c:1068
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1549
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2445
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:6824
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1277
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1532
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2163
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1514
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5256
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:8894
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "binary".
Definition string.c:1062
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:851
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:1682
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:2805
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1142
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:276
ID rb_intern(const char *name)
Finds or creates a symbol of the given name.
Definition symbol.c:796
VALUE rb_sym2str(VALUE id)
Identical to rb_id2str(), except it takes an instance of rb_cSymbol rather than an ID.
Definition symbol.c:943
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:11922
ID rb_to_id(VALUE str)
Identical to rb_intern(), except it takes an instance of rb_cString.
Definition string.c:11912
ID rb_intern_str(VALUE str)
Identical to rb_intern(), except it takes an instance of rb_cString.
Definition symbol.c:802
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1765
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3376
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4358
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:214
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1357
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:366
#define ALLOCA_N(type, n)
Definition memory.h:286
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:354
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:161
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:343
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:69
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:152
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:71
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition rgengc.h:107
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:139
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
@ RSTRING_EMBED_LEN_MAX
Max possible number of characters that can be embedded.
Definition rstring.h:215
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:72
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1307
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a C string.
Definition string.c:2617
static long RSTRING_EMBED_LEN(VALUE str)
Queries the length of the string.
Definition rstring.h:423
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except that the return type differs.
Definition rstring.h:554
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:528
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:574
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2501
static long RSTRING_LEN(VALUE str)
Queries the length of the string.
Definition rstring.h:484
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encoding.
Definition string.c:1301
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2512
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1609
static char * RSTRING_PTR(VALUE str)
Queries the contents pointer of the string.
Definition rstring.h:498
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C string.
Definition rstring.h:95
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:441
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1329
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:77
Ruby's String.
Definition rstring.h:231
union RString::@50 as
String's specific fields.
struct RString::@50::@51 heap
Strings that use a separate memory region for their contents use this pattern.
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:234
long capa
Capacity of *ptr.
Definition rstring.h:268
struct RString::@50::@52 embed
Embedded contents.
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:250
char ary[RSTRING_EMBED_LEN_MAX+1]
When a string is short enough, it uses this area to store the contents themselves.
Definition rstring.h:298
union RString::@50::@51::@53 aux
Auxiliary info.
VALUE shared
Parent of the string.
Definition rstring.h:276
char * ptr
Pointer to the contents of the string.
Definition rstring.h:258
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:190
Definition st.h:79
Definition string.c:7779
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:299
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
Definition value_type.h:432
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:375