14#include "ruby/internal/config.h"
24#include "debug_counter.h"
29#include "internal/array.h"
30#include "internal/compar.h"
31#include "internal/compilers.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/numeric.h"
36#include "internal/object.h"
37#include "internal/proc.h"
38#include "internal/re.h"
39#include "internal/sanitizers.h"
40#include "internal/string.h"
41#include "internal/transcode.h"
46#include "ruby_assert.h"
49#if defined HAVE_CRYPT_R
50# if defined HAVE_CRYPT_H
53#elif !defined HAVE_CRYPT
54# include "missing/crypt.h"
55# define HAVE_CRYPT_R 1
/*
 * Shorthand for indexing the beg[] and end[] arrays of a `regs` object.
 * Both macros expand to an expression on a variable literally named `regs`,
 * so one must be in scope wherever they are used.
 * NOTE(review): presumably `regs` holds regex match registers -- confirm at
 * the call sites.
 */
58#define BEG(no) (regs->beg[(no)])
59#define END(no) (regs->end[(no)])
62#undef rb_usascii_str_new
66#undef rb_usascii_str_new_cstr
67#undef rb_utf8_str_new_cstr
68#undef rb_enc_str_new_cstr
69#undef rb_external_str_new_cstr
70#undef rb_locale_str_new_cstr
71#undef rb_str_dup_frozen
72#undef rb_str_buf_new_cstr
/* NOTE(review): presumably the largest byte length of a single encoded
 * character; no use is visible in this excerpt -- confirm. */
103#define RUBY_MAX_CHAR_LEN 16
/* String-specific names for generic FL_USER* flag bits. */
/* Set by STR_SET_SHARED on the string that another string shares from. */
104#define STR_SHARED_ROOT FL_USER5
/* Set by STR_SET_SHARED on a shared root whose RBASIC_CLASS is 0. */
105#define STR_BORROWED FL_USER6
/* Tested by rb_check_lockedtmp(), which str_modifiable() calls. */
106#define STR_TMPLOCK FL_USER7
/* Set by str_new_static(); the visible code that frees a heap buffer
 * (e.g. str_discard) first checks that this flag is clear. */
107#define STR_NOFREE FL_USER18
/* STR_SET_SHARED guards its body on this flag being clear. */
108#define STR_FAKESTR FL_USER19
110#define STR_SET_NOEMBED(str) do {\
111 FL_SET((str), STR_NOEMBED);\
113 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
116 STR_SET_EMBED_LEN((str), 0);\
/* Clears both STR_NOEMBED and STR_NOFREE on `str`. It only touches flags;
 * callers in this file follow it with STR_SET_EMBED_LEN to set the length. */
119#define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE))
121# define STR_SET_EMBED_LEN(str, n) do { \
122 assert(str_embed_capa(str) > (n));\
123 RSTRING(str)->as.embed.len = (n);\
126# define STR_SET_EMBED_LEN(str, n) do { \
128 RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
129 RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
133#define STR_SET_LEN(str, n) do { \
134 if (STR_EMBED_P(str)) {\
135 STR_SET_EMBED_LEN((str), (n));\
138 RSTRING(str)->as.heap.len = (n);\
142#define STR_DEC_LEN(str) do {\
143 if (STR_EMBED_P(str)) {\
144 long n = RSTRING_LEN(str);\
146 STR_SET_EMBED_LEN((str), n);\
149 RSTRING(str)->as.heap.len--;\
154str_enc_fastpath(
VALUE str)
158 case ENCINDEX_ASCII_8BIT:
160 case ENCINDEX_US_ASCII:
/*
 * Terminator length for `str`: 1 when str_enc_fastpath(str) is true,
 * otherwise rb_enc_mbminlen() of the encoding looked up from
 * ENCODING_GET(str). The result is used as a byte count (TERM_FILL writes
 * that many zero bytes). `str` appears twice in the expansion, so pass an
 * expression without side effects.
 */
167#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
168#define TERM_FILL(ptr, termlen) do {\
169 char *const term_fill_ptr = (ptr);\
170 const int term_fill_len = (termlen);\
171 *term_fill_ptr = '\0';\
172 if (UNLIKELY(term_fill_len > 1))\
173 memset(term_fill_ptr, 0, term_fill_len);\
176#define RESIZE_CAPA(str,capacity) do {\
177 const int termlen = TERM_LEN(str);\
178 RESIZE_CAPA_TERM(str,capacity,termlen);\
180#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
181 if (STR_EMBED_P(str)) {\
182 if (str_embed_capa(str) < capacity + termlen) {\
183 char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
184 const long tlen = RSTRING_LEN(str);\
185 memcpy(tmp, RSTRING_PTR(str), tlen);\
186 RSTRING(str)->as.heap.ptr = tmp;\
187 RSTRING(str)->as.heap.len = tlen;\
188 STR_SET_NOEMBED(str);\
189 RSTRING(str)->as.heap.aux.capa = (capacity);\
193 assert(!FL_TEST((str), STR_SHARED)); \
194 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
195 (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
196 RSTRING(str)->as.heap.aux.capa = (capacity);\
200#define STR_SET_SHARED(str, shared_str) do { \
201 if (!FL_TEST(str, STR_FAKESTR)) { \
202 assert(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
203 assert(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
204 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
205 FL_SET((str), STR_SHARED); \
206 FL_SET((shared_str), STR_SHARED_ROOT); \
207 if (RBASIC_CLASS((shared_str)) == 0) \
208 FL_SET_RAW((shared_str), STR_BORROWED); \
/* Heap-buffer pointer field of `str` (reads as.heap.ptr; the macro itself
 * does not check that the string is non-embedded). */
212#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* as.heap.aux.capa plus TERM_LEN(str), as a size_t. Used in this file as the
 * old-size argument to SIZED_REALLOC_N and ruby_sized_xfree. */
213#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
/* Thin alias for get_encoding(str). */
216#define STR_ENC_GET(str) get_encoding(str)
218#if !defined SHARABLE_MIDDLE_SUBSTRING
219# define SHARABLE_MIDDLE_SUBSTRING 0
221#if !SHARABLE_MIDDLE_SUBSTRING
222#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
224#define SHARABLE_SUBSTRING_P(beg, len, end) 1
229str_embed_capa(
VALUE str)
232 return rb_gc_obj_slot_size(str) - offsetof(
struct RString, as.
embed.
ary);
239rb_str_reembeddable_p(
VALUE str)
241 return !
FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
245rb_str_embed_size(
long capa)
251rb_str_size_as_embedded(
VALUE str)
255 if (STR_EMBED_P(str)) {
256 real_size = rb_str_embed_size(
RSTRING(str)->as.embed.len) + TERM_LEN(str);
260 else if (rb_str_reembeddable_p(str)) {
261 real_size = rb_str_embed_size(
RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
265 real_size =
sizeof(
struct RString);
273STR_EMBEDDABLE_P(
long len,
long termlen)
276 return rb_gc_size_allocatable_p(rb_str_embed_size(
len + termlen));
/*
 * Forward declarations of file-local helpers whose definitions appear later
 * in this file, so that earlier code can call them.
 */
284static VALUE str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding);
285static VALUE str_new_static(
VALUE klass,
const char *
ptr,
long len,
int encindex);
287static void str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen);
288static inline void str_modifiable(
VALUE str);
292str_make_independent(
VALUE str)
295 int termlen = TERM_LEN(str);
296 str_make_independent_expand((str),
len, 0L, termlen);
299static inline int str_dependent_p(
VALUE str);
302rb_str_make_independent(
VALUE str)
304 if (str_dependent_p(str)) {
305 str_make_independent(str);
310rb_str_make_embedded(
VALUE str)
315 char *buf =
RSTRING(str)->as.heap.ptr;
319 STR_SET_EMBED_LEN(str,
len);
326 TERM_FILL(
RSTRING(str)->
as.embed.ary +
len, TERM_LEN(str));
333 if (new_root == old_root) {
339 if (!STR_EMBED_P(new_root)) {
343 size_t offset = (size_t)((uintptr_t)
RSTRING(str)->as.heap.ptr - (uintptr_t)
RSTRING(old_root)->as.embed.ary);
346 RSTRING(str)->as.heap.ptr =
RSTRING(new_root)->as.embed.ary + offset;
350rb_debug_rstring_null_ptr(
const char *func)
352 fprintf(stderr,
"%s is returning NULL!! "
353 "SIGSEGV is highly expected to follow immediately.\n"
354 "If you could reproduce, attach your debugger here, "
355 "and look at the passed string.\n",
360static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
363get_encoding(
VALUE str)
369mustnot_broken(
VALUE str)
371 if (is_broken_string(str)) {
377mustnot_wchar(
VALUE str)
387static VALUE register_fstring(
VALUE str,
bool copy);
/* True when `str` does not have the FL_EXIVAR flag set and its class is
 * exactly rb_cString. `str` is expanded twice and is not parenthesized, so
 * pass a plain variable. */
394#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
402fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data,
int existing)
412 if (rb_objspace_garbage_object_p(str)) {
424 rb_enc_copy(new_str, str);
437 if (STR_SHARED_P(str)) {
439 str_make_independent(str);
442 if (!BARE_STRING_P(str)) {
446 RBASIC(str)->flags |= RSTRING_FSTR;
448 *key = *value = arg->fstr = str;
462 if (
FL_TEST(str, RSTRING_FSTR))
465 bare = BARE_STRING_P(str);
467 if (STR_EMBED_P(str)) {
471 if (
FL_TEST_RAW(str, STR_NOEMBED|STR_SHARED_ROOT|STR_SHARED) == (STR_NOEMBED|STR_SHARED_ROOT)) {
480 fstr = register_fstring(str, FALSE);
483 str_replace_shared_without_enc(str, fstr);
491register_fstring(
VALUE str,
bool copy)
498 st_table *frozen_strings = rb_vm_fstring_table();
501 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
502 }
while (UNDEF_P(args.fstr));
514setup_fake_str(
struct RString *fake_str,
const char *name,
long len,
int encidx)
530 return (
VALUE)fake_str;
537rb_setup_fake_str(
struct RString *fake_str,
const char *name,
long len,
rb_encoding *enc)
539 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
547MJIT_FUNC_EXPORTED
VALUE
548rb_fstring_new(
const char *ptr,
long len)
551 return register_fstring(setup_fake_str(&fake_str,
ptr,
len, ENCINDEX_US_ASCII), FALSE);
558 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc), FALSE);
562rb_fstring_cstr(
const char *
ptr)
564 return rb_fstring_new(
ptr, strlen(
ptr));
568fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
578 const char *aptr, *bptr;
581 return (alen != blen ||
583 memcmp(aptr, bptr, alen) != 0);
587single_byte_optimizable(
VALUE str)
595 enc = STR_ENC_GET(str);
606static inline const char *
607search_nonascii(
const char *p,
const char *e)
609 const uintptr_t *s, *t;
611#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
612# if SIZEOF_UINTPTR_T == 8
613# define NONASCII_MASK UINT64_C(0x8080808080808080)
614# elif SIZEOF_UINTPTR_T == 4
615# define NONASCII_MASK UINT32_C(0x80808080)
617# error "don't know what to do."
620# if SIZEOF_UINTPTR_T == 8
621# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
622# elif SIZEOF_UINTPTR_T == 4
623# define NONASCII_MASK 0x80808080UL
625# error "don't know what to do."
629 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
630#if !UNALIGNED_WORD_ACCESS
631 if ((uintptr_t)p % SIZEOF_VOIDP) {
632 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
637 case 7:
if (p[-7]&0x80)
return p-7;
638 case 6:
if (p[-6]&0x80)
return p-6;
639 case 5:
if (p[-5]&0x80)
return p-5;
640 case 4:
if (p[-4]&0x80)
return p-4;
642 case 3:
if (p[-3]&0x80)
return p-3;
643 case 2:
if (p[-2]&0x80)
return p-2;
644 case 1:
if (p[-1]&0x80)
return p-1;
649#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
650#define aligned_ptr(value) \
651 __builtin_assume_aligned((value), sizeof(uintptr_t))
653#define aligned_ptr(value) (uintptr_t *)(value)
656 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
659 if (*s & NONASCII_MASK) {
660#ifdef WORDS_BIGENDIAN
661 return (
const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
663 return (
const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
673 case 7:
if (e[-7]&0x80)
return e-7;
674 case 6:
if (e[-6]&0x80)
return e-6;
675 case 5:
if (e[-5]&0x80)
return e-5;
676 case 4:
if (e[-4]&0x80)
return e-4;
678 case 3:
if (e[-3]&0x80)
return e-3;
679 case 2:
if (e[-2]&0x80)
return e-2;
680 case 1:
if (e[-1]&0x80)
return e-1;
688 const char *e = p +
len;
690 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
692 p = search_nonascii(p, e);
697 p = search_nonascii(p, e);
700 int ret = rb_enc_precise_mbclen(p, e, enc);
704 p = search_nonascii(p, e);
710 int ret = rb_enc_precise_mbclen(p, e, enc);
726 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
729 p = search_nonascii(p, e);
734 p = search_nonascii(p, e);
740 int ret = rb_enc_precise_mbclen(p, e, enc);
747 p = search_nonascii(p, e);
753 int ret = rb_enc_precise_mbclen(p, e, enc);
772rb_enc_cr_str_copy_for_substr(
VALUE dest,
VALUE src)
777 str_enc_copy(dest, src);
802rb_enc_cr_str_exact_copy(
VALUE dest,
VALUE src)
804 str_enc_copy(dest, src);
817 return enc_coderange_scan(str, enc);
826 cr = enc_coderange_scan(str, get_encoding(str));
839 else if (is_ascii_string(str))
845str_mod_check(
VALUE s,
const char *p,
long len)
853str_capacity(
VALUE str,
const int termlen)
855 if (STR_EMBED_P(str)) {
857 return str_embed_capa(str) - termlen;
862 else if (
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
863 return RSTRING(str)->as.heap.len;
866 return RSTRING(str)->as.heap.aux.capa;
873 return str_capacity(str, TERM_LEN(str));
877must_not_null(
const char *
ptr)
887 size_t size = rb_str_embed_size(
capa);
889 assert(rb_gc_size_allocatable_p(size));
891 assert(size <=
sizeof(
struct RString));
894 RVARGC_NEWOBJ_OF(str,
struct RString, klass,
901str_alloc_heap(
VALUE klass)
903 RVARGC_NEWOBJ_OF(str,
struct RString, klass,
910empty_str_alloc(
VALUE klass)
912 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
913 VALUE str = str_alloc_embed(klass, 0);
914 memset(
RSTRING(str)->
as.embed.ary, 0, str_embed_capa(str));
919str_new0(
VALUE klass,
const char *
ptr,
long len,
int termlen)
927 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
929 if (STR_EMBEDDABLE_P(
len, termlen)) {
930 str = str_alloc_embed(klass,
len + termlen);
936 str = str_alloc_heap(klass);
942 rb_xmalloc_mul_add_mul(
sizeof(
char),
len,
sizeof(
char), termlen);
947 STR_SET_LEN(str,
len);
955 return str_new0(klass,
ptr,
len, 1);
976 rb_enc_associate_index(str, rb_utf8_encindex());
988 rb_enc_associate(str, enc);
1000 __msan_unpoison_string(
ptr);
1016 rb_enc_associate_index(str, rb_utf8_encindex());
1031str_new_static(
VALUE klass,
const char *
ptr,
long len,
int encindex)
1040 rb_encoding *enc = rb_enc_get_from_index(encindex);
1044 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
1045 str = str_alloc_heap(klass);
1049 RBASIC(str)->flags |= STR_NOFREE;
1051 rb_enc_associate_index(str, encindex);
1079static VALUE str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1081 int ecflags,
VALUE ecopts);
1086 int encidx = rb_enc_to_index(enc);
1087 if (rb_enc_get_index(str) == encidx)
1088 return is_ascii_string(str);
1099 if (!to)
return str;
1100 if (!from) from = rb_enc_get(str);
1101 if (from == to)
return str;
1103 rb_is_ascii8bit_enc(to)) {
1104 if (STR_ENC_GET(str) != to) {
1106 rb_enc_associate(str, to);
1113 from, to, ecflags, ecopts);
1114 if (
NIL_P(newstr)) {
1122rb_str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1128 if (ofs < -olen || olen < ofs)
1130 if (ofs < 0) ofs += olen;
1132 STR_SET_LEN(newstr, ofs);
1137 return str_cat_conv_enc_opts(newstr, ofs,
ptr,
len, from,
1145 STR_SET_LEN(str, 0);
1146 rb_enc_associate(str, enc);
1152str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1154 int ecflags,
VALUE ecopts)
1159 VALUE econv_wrapper;
1160 const unsigned char *start, *sp;
1161 unsigned char *dest, *dp;
1162 size_t converted_output = (size_t)ofs;
1167 RBASIC_CLEAR_CLASS(econv_wrapper);
1169 if (!ec)
return Qnil;
1172 sp = (
unsigned char*)
ptr;
1174 while ((dest = (
unsigned char*)
RSTRING_PTR(newstr)),
1175 (dp = dest + converted_output),
1179 size_t converted_input = sp - start;
1180 size_t rest =
len - converted_input;
1181 converted_output = dp - dest;
1183 if (converted_input && converted_output &&
1184 rest < (LONG_MAX / converted_output)) {
1185 rest = (rest * converted_output) / converted_input;
1190 olen += rest < 2 ? 2 : rest;
1199 rb_enc_associate(newstr, to);
1218 const int eidx = rb_enc_to_index(eenc);
1225 if ((eidx == rb_ascii8bit_encindex()) ||
1226 (eidx == rb_usascii_encindex() && search_nonascii(
ptr,
ptr +
len))) {
1230 ienc = rb_default_internal_encoding();
1231 if (!ienc || eenc == ienc) {
1236 if ((eidx == rb_ascii8bit_encindex()) ||
1237 (eidx == rb_usascii_encindex()) ||
1245 if (
NIL_P(rb_str_cat_conv_enc_opts(str, 0,
ptr,
len, eenc, 0,
Qnil))) {
1246 rb_str_initialize(str,
ptr,
len, eenc);
1254 int eidx = rb_enc_to_index(eenc);
1255 if (eidx == rb_usascii_encindex() &&
1256 !is_ascii_string(str)) {
1257 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1260 rb_enc_associate_index(str, eidx);
1319str_replace_shared_without_enc(
VALUE str2,
VALUE str)
1321 const int termlen = TERM_LEN(str);
1326 if (str_embed_capa(str2) >=
len + termlen) {
1327 char *ptr2 =
RSTRING(str2)->as.embed.ary;
1328 STR_SET_EMBED(str2);
1330 STR_SET_EMBED_LEN(str2,
len);
1331 TERM_FILL(ptr2+
len, termlen);
1335 if (STR_SHARED_P(str)) {
1336 root =
RSTRING(str)->as.heap.aux.shared;
1344 if (!STR_EMBED_P(str2) && !
FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1346 rb_fatal(
"about to free a possible shared root");
1348 char *ptr2 = STR_HEAP_PTR(str2);
1350 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1353 FL_SET(str2, STR_NOEMBED);
1356 STR_SET_SHARED(str2, root);
1364 str_replace_shared_without_enc(str2, str);
1365 rb_enc_cr_str_exact_copy(str2, str);
1372 return str_replace_shared(str_alloc_heap(klass), str);
1389rb_str_new_frozen_String(
VALUE orig)
1396rb_str_tmp_frozen_acquire(
VALUE orig)
1399 return str_new_frozen_buffer(0, orig, FALSE);
1403rb_str_tmp_frozen_release(
VALUE orig,
VALUE tmp)
1408 if (STR_EMBED_P(tmp)) {
1421 RSTRING(orig)->as.heap.aux.capa =
RSTRING(tmp)->as.heap.aux.capa;
1422 RBASIC(orig)->flags |=
RBASIC(tmp)->flags & STR_NOFREE;
1427 STR_SET_EMBED_LEN(tmp, 0);
1435 return str_new_frozen_buffer(klass, orig, TRUE);
1441 assert(!STR_EMBED_P(orig));
1442 assert(!STR_SHARED_P(orig));
1444 VALUE str = str_alloc_heap(klass);
1447 RSTRING(str)->as.heap.aux.capa =
RSTRING(orig)->as.heap.aux.capa;
1448 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1449 RBASIC(orig)->flags &= ~STR_NOFREE;
1450 STR_SET_SHARED(orig, str);
1457str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding)
1462 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1464 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(
len, termlen)) {
1466 assert(STR_EMBED_P(str));
1477 assert(!STR_EMBED_P(
shared));
1481 if ((ofs > 0) || (rest > 0) ||
1484 str = str_new_shared(klass,
shared);
1485 assert(!STR_EMBED_P(str));
1486 RSTRING(str)->as.heap.ptr += ofs;
1487 RSTRING(str)->as.heap.len -= ofs + rest;
1495 else if (STR_EMBEDDABLE_P(
RSTRING_LEN(orig), TERM_LEN(orig))) {
1496 str = str_alloc_embed(klass,
RSTRING_LEN(orig) + TERM_LEN(orig));
1503 str = heap_str_make_shared(klass, orig);
1507 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1519str_new_empty_String(
VALUE str)
1522 rb_enc_copy(v, str);
/* Floor for requested buffer capacity: the visible users raise any `capa`
 * smaller than this up to this value.
 * NOTE(review): the reason for 63 specifically is not shown here -- confirm. */
1526#define STR_BUF_MIN_SIZE 63
1534 if (STR_EMBEDDABLE_P(
capa, 1)) {
1541 if (
capa < STR_BUF_MIN_SIZE) {
1542 capa = STR_BUF_MIN_SIZE;
1547 RSTRING(str)->as.heap.ptr[0] =
'\0';
1567 return str_new(0, 0,
len);
1573 if (
FL_TEST(str, RSTRING_FSTR)) {
1574 st_data_t fstr = (st_data_t)str;
1578 st_delete(rb_vm_fstring_table(), &fstr, NULL);
1579 RB_DEBUG_COUNTER_INC(obj_str_fstr);
1584 if (STR_EMBED_P(str)) {
1585 RB_DEBUG_COUNTER_INC(obj_str_embed);
1587 else if (
FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1588 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_SHARED));
1589 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_NOFREE));
1592 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1593 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1597RUBY_FUNC_EXPORTED
size_t
1598rb_str_memsize(
VALUE str)
1600 if (
FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1601 return STR_HEAP_SIZE(str);
1611 return rb_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
1614static inline void str_discard(
VALUE str);
1615static void str_shared_replace(
VALUE str,
VALUE str2);
1620 if (str != str2) str_shared_replace(str, str2);
1631 enc = STR_ENC_GET(str2);
1636 if (str_embed_capa(str) >=
RSTRING_LEN(str2) + termlen) {
1640 rb_enc_associate(str, enc);
1645 if (STR_EMBED_P(str2)) {
1646 assert(!
FL_TEST(str2, STR_SHARED));
1648 assert(
len + termlen <= str_embed_capa(str2));
1650 char *new_ptr =
ALLOC_N(
char,
len + termlen);
1651 memcpy(new_ptr,
RSTRING(str2)->
as.embed.ary,
len + termlen);
1652 RSTRING(str2)->as.heap.ptr = new_ptr;
1655 STR_SET_NOEMBED(str2);
1659 STR_SET_NOEMBED(str);
1664 if (
FL_TEST(str2, STR_SHARED)) {
1666 STR_SET_SHARED(str,
shared);
1669 RSTRING(str)->as.heap.aux.capa =
RSTRING(str2)->as.heap.aux.capa;
1673 STR_SET_EMBED(str2);
1675 STR_SET_EMBED_LEN(str2, 0);
1676 rb_enc_associate(str, enc);
1690 return rb_obj_as_string_result(str, obj);
1693MJIT_FUNC_EXPORTED
VALUE
1707 if (STR_SHARED_P(str2)) {
1710 STR_SET_NOEMBED(str);
1713 STR_SET_SHARED(str,
shared);
1714 rb_enc_cr_str_exact_copy(str, str2);
1717 str_replace_shared(str, str2);
1726 size_t size = rb_str_embed_size(
capa);
1728 assert(rb_gc_size_allocatable_p(size));
1730 assert(size <=
sizeof(
struct RString));
1733 RB_RVARGC_EC_NEWOBJ_OF(ec, str,
struct RString, klass,
1742 RB_RVARGC_EC_NEWOBJ_OF(ec, str,
struct RString, klass,
1751 const VALUE flag_mask =
1753 RSTRING_NOEMBED | RSTRING_EMBED_LEN_MASK |
1760 if (STR_EMBED_P(str)) {
1763 assert(STR_EMBED_P(dup));
1764 assert(str_embed_capa(dup) >=
len + 1);
1765 STR_SET_EMBED_LEN(dup,
len);
1771 root =
RSTRING(str)->as.heap.aux.shared;
1773 else if (UNLIKELY(!(flags &
FL_FREEZE))) {
1774 root = str = str_new_frozen(klass, str);
1777 assert(!STR_SHARED_P(root));
1781 else if (STR_EMBED_P(root)) {
1790 FL_SET(root, STR_SHARED_ROOT);
1792 flags |= RSTRING_NOEMBED | STR_SHARED;
1797 encidx = rb_enc_get_index(str);
1798 flags &= ~ENCODING_MASK;
1801 if (encidx) rb_enc_associate_index(dup, encidx);
1809 if (
FL_TEST(str, STR_NOEMBED)) {
1810 dup = ec_str_alloc_heap(ec, klass);
1816 return str_duplicate_setup(klass, str, dup);
1823 if (
FL_TEST(str, STR_NOEMBED)) {
1824 dup = str_alloc_heap(klass);
1830 return str_duplicate_setup(klass, str, dup);
1842 RUBY_DTRACE_CREATE_HOOK(STRING,
RSTRING_LEN(str));
1849 RUBY_DTRACE_CREATE_HOOK(STRING,
RSTRING_LEN(str));
1850 return ec_str_duplicate(ec,
rb_cString, str);
1865 static ID keyword_ids[2];
1866 VALUE orig, opt, venc, vcapa;
1871 if (!keyword_ids[0]) {
1872 keyword_ids[0] = rb_id_encoding();
1873 CONST_ID(keyword_ids[1],
"capacity");
1881 if (!UNDEF_P(venc) && !
NIL_P(venc)) {
1882 enc = rb_to_encoding(venc);
1884 if (!UNDEF_P(vcapa) && !
NIL_P(vcapa)) {
1889 if (
capa < STR_BUF_MIN_SIZE) {
1890 capa = STR_BUF_MIN_SIZE;
1898 if (orig == str) n = 0;
1900 str_modifiable(str);
1901 if (STR_EMBED_P(str)) {
1902 char *new_ptr =
ALLOC_N(
char, (
size_t)
capa + termlen);
1904 assert(
RSTRING(str)->
as.embed.len + 1 <= str_embed_capa(str));
1909 RSTRING(str)->as.heap.ptr = new_ptr;
1911 else if (
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
1912 const size_t size = (size_t)
capa + termlen;
1914 const size_t osize =
RSTRING(str)->as.heap.len + TERM_LEN(str);
1915 char *new_ptr =
ALLOC_N(
char, (
size_t)
capa + termlen);
1916 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
1918 RSTRING(str)->as.heap.ptr = new_ptr;
1920 else if (STR_HEAP_SIZE(str) != (
size_t)
capa + termlen) {
1921 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
1922 (
size_t)
capa + termlen, STR_HEAP_SIZE(str));
1928 rb_enc_cr_str_exact_copy(str, orig);
1930 FL_SET(str, STR_NOEMBED);
1937 rb_enc_associate(str, enc);
/* True for any byte whose top two bits are not binary 10, i.e. any byte that
 * is not a UTF-8 continuation byte. Plain ASCII bytes therefore count as
 * lead bytes too. */
1948#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
1963static inline uintptr_t
1964count_utf8_lead_bytes_with_word(
const uintptr_t *s)
1969 d = (d>>6) | (~d>>7);
1970 d &= NONASCII_MASK >> 7;
1973#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
1975 return rb_popcount_intptr(d);
1979# if SIZEOF_VOIDP == 8
1988enc_strlen(
const char *p,
const char *e,
rb_encoding *enc,
int cr)
1994 long diff = (long)(e - p);
2000 if ((
int)
sizeof(uintptr_t) * 2 < e - p) {
2001 const uintptr_t *s, *t;
2002 const uintptr_t lowbits =
sizeof(uintptr_t) - 1;
2003 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2004 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
2005 while (p < (
const char *)s) {
2006 if (is_utf8_lead_byte(*p))
len++;
2010 len += count_utf8_lead_bytes_with_word(s);
2013 p = (
const char *)s;
2016 if (is_utf8_lead_byte(*p))
len++;
2027 q = search_nonascii(p, e);
2033 p += rb_enc_fast_mbclen(p, e, enc);
2040 q = search_nonascii(p, e);
2046 p += rb_enc_mbclen(p, e, enc);
2053 for (c=0; p<e; c++) {
2054 p += rb_enc_mbclen(p, e, enc);
2069rb_enc_strlen_cr(
const char *p,
const char *e,
rb_encoding *enc,
int *cr)
2077 long diff = (long)(e - p);
2084 q = search_nonascii(p, e);
2092 ret = rb_enc_precise_mbclen(p, e, enc);
2107 for (c=0; p<e; c++) {
2108 ret = rb_enc_precise_mbclen(p, e, enc);
2132 if (single_byte_optimizable(str))
return RSTRING_LEN(str);
2133 if (!enc) enc = STR_ENC_GET(str);
2139 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2144 return enc_strlen(p, e, enc, cr);
2151 return str_strlen(str, NULL);
2165 return LONG2NUM(str_strlen(str, NULL));
2177rb_str_bytesize(
VALUE str)
2195rb_str_empty(
VALUE str)
2215 char *ptr1, *ptr2, *ptr3;
2220 enc = rb_enc_check_str(str1, str2);
2224 if (len1 > LONG_MAX - len2) {
2227 str3 = str_new0(
rb_cString, 0, len1+len2, termlen);
2229 memcpy(ptr3, ptr1, len1);
2230 memcpy(ptr3+len1, ptr2, len2);
2231 TERM_FILL(&ptr3[len1+len2], termlen);
2241MJIT_FUNC_EXPORTED
VALUE
2247 MAYBE_UNUSED(
char) *ptr1, *ptr2;
2250 int enc1 = rb_enc_get_index(str1);
2251 int enc2 = rb_enc_get_index(str2);
2256 else if (enc2 < 0) {
2259 else if (enc1 != enc2) {
2262 else if (len1 > LONG_MAX - len2) {
2295 rb_enc_copy(str2, str);
2303 if (STR_EMBEDDABLE_P(
len, 1)) {
2312 STR_SET_LEN(str2,
len);
2313 rb_enc_copy(str2, str);
2321 termlen = TERM_LEN(str);
2327 while (n <=
len/2) {
2328 memcpy(ptr2 + n, ptr2, n);
2331 memcpy(ptr2 + n, ptr2,
len-n);
2333 STR_SET_LEN(str2,
len);
2334 TERM_FILL(&ptr2[
len], termlen);
2335 rb_enc_cr_str_copy_for_substr(str2, str);
2361 VALUE tmp = rb_check_array_type(arg);
2370rb_check_lockedtmp(
VALUE str)
2372 if (
FL_TEST(str, STR_TMPLOCK)) {
2378str_modifiable(
VALUE str)
2380 rb_check_lockedtmp(str);
2385str_dependent_p(
VALUE str)
2387 if (STR_EMBED_P(str) || !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2396str_independent(
VALUE str)
2398 str_modifiable(str);
2399 return !str_dependent_p(str);
2403str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen)
2411 if (!STR_EMBED_P(str) && str_embed_capa(str) >=
capa + termlen) {
2416 STR_SET_EMBED_LEN(str,
len);
2423 memcpy(
ptr, oldptr,
len);
2425 if (
FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2428 STR_SET_NOEMBED(str);
2429 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2430 TERM_FILL(
ptr +
len, termlen);
2439 if (!str_independent(str))
2440 str_make_independent(str);
2447 int termlen = TERM_LEN(str);
2453 if (expand >= LONG_MAX -
len) {
2457 if (!str_independent(str)) {
2458 str_make_independent_expand(str,
len, expand, termlen);
2460 else if (expand > 0) {
2461 RESIZE_CAPA_TERM(str,
len + expand, termlen);
2468str_modify_keep_cr(
VALUE str)
2470 if (!str_independent(str))
2471 str_make_independent(str);
2478str_discard(
VALUE str)
2480 str_modifiable(str);
2481 if (!STR_EMBED_P(str) && !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2482 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2483 RSTRING(str)->as.heap.ptr = 0;
2484 RSTRING(str)->as.heap.len = 0;
2519zero_filled(
const char *s,
int n)
2521 for (; n > 0; --n) {
2528str_null_char(
const char *s,
long len,
const int minlen,
rb_encoding *enc)
2530 const char *e = s +
len;
2532 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2533 if (zero_filled(s, minlen))
return s;
2539str_fill_term(
VALUE str,
char *s,
long len,
int termlen)
2544 if (str_dependent_p(str)) {
2545 if (!zero_filled(s +
len, termlen))
2546 str_make_independent_expand(str,
len, 0L, termlen);
2549 TERM_FILL(s +
len, termlen);
2556rb_str_change_terminator_length(
VALUE str,
const int oldtermlen,
const int termlen)
2558 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2563 rb_check_lockedtmp(str);
2564 str_make_independent_expand(str,
len, 0L, termlen);
2566 else if (str_dependent_p(str)) {
2567 if (termlen > oldtermlen)
2568 str_make_independent_expand(str,
len, 0L, termlen);
2571 if (!STR_EMBED_P(str)) {
2573 assert(!
FL_TEST((str), STR_SHARED));
2576 if (termlen > oldtermlen) {
2585str_null_check(
VALUE str,
int *w)
2594 if (str_null_char(s,
len, minlen, enc)) {
2597 return str_fill_term(str, s,
len, minlen);
2600 if (!s || memchr(s, 0,
len)) {
2604 s = str_fill_term(str, s,
len, minlen);
2610rb_str_to_cstr(
VALUE str)
2613 return str_null_check(str, &w);
2621 char *s = str_null_check(str, &w);
2632rb_str_fill_terminator(
VALUE str,
const int newminlen)
2636 return str_fill_term(str, s,
len, newminlen);
2642 str = rb_check_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
2666str_nth_len(
const char *p,
const char *e,
long *nthp,
rb_encoding *enc)
2676 const char *p2, *e2;
2679 while (p < e && 0 < nth) {
2686 p2 = search_nonascii(p, e2);
2695 n = rb_enc_mbclen(p, e, enc);
2706 while (p < e && nth--) {
2707 p += rb_enc_mbclen(p, e, enc);
2718 return str_nth_len(p, e, &nth, enc);
2722str_nth(
const char *p,
const char *e,
long nth,
rb_encoding *enc,
int singlebyte)
2727 p = str_nth_len(p, e, &nth, enc);
2736str_offset(
const char *p,
const char *e,
long nth,
rb_encoding *enc,
int singlebyte)
2738 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2739 if (!pp)
return e - p;
2747 STR_ENC_GET(str), single_byte_optimizable(str));
2752str_utf8_nth(
const char *p,
const char *e,
long *nthp)
2755 if ((
int)SIZEOF_VOIDP * 2 < e - p && (
int)SIZEOF_VOIDP * 2 < nth) {
2756 const uintptr_t *s, *t;
2757 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
2758 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2759 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
2760 while (p < (
const char *)s) {
2761 if (is_utf8_lead_byte(*p)) nth--;
2765 nth -= count_utf8_lead_bytes_with_word(s);
2767 }
while (s < t && (
int)SIZEOF_VOIDP <= nth);
2771 if (is_utf8_lead_byte(*p)) {
2772 if (nth == 0)
break;
2782str_utf8_offset(
const char *p,
const char *e,
long nth)
2784 const char *pp = str_utf8_nth(p, e, &nth);
2793 if (single_byte_optimizable(str) || pos < 0)
2797 return enc_strlen(p, p + pos, STR_ENC_GET(str),
ENC_CODERANGE(str));
2802str_subseq(
VALUE str,
long beg,
long len)
2806 const long rstring_embed_capa_max = ((
sizeof(
struct RString) - offsetof(struct
RString,
as.
embed.
ary)) / sizeof(char)) - 1;
2809 len <= rstring_embed_capa_max) {
2816 RSTRING(str2)->as.heap.ptr += beg;
2828 VALUE str2 = str_subseq(str, beg,
len);
2829 rb_enc_cr_str_copy_for_substr(str2, str);
2842 if (
len < 0)
return 0;
2846 if (single_byte_optimizable(str)) {
2847 if (beg > blen)
return 0;
2850 if (beg < 0)
return 0;
2852 if (
len > blen - beg)
2854 if (
len < 0)
return 0;
2859 if (
len > -beg)
len = -beg;
2871 slen = str_strlen(str, enc);
2873 if (beg < 0)
return 0;
2875 if (
len == 0)
goto end;
2882 if (beg > str_strlen(str, enc))
return 0;
2887 enc == rb_utf8_encoding()) {
2888 p = str_utf8_nth(s, e, &beg);
2889 if (beg > 0)
return 0;
2890 len = str_utf8_offset(p, e,
len);
2896 p = s + beg * char_sz;
2900 else if (
len * char_sz > e - p)
2905 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
2906 if (beg > 0)
return 0;
2910 len = str_offset(p, e,
len, enc, 0);
2918static VALUE str_substr(
VALUE str,
long beg,
long len,
int empty);
2923 return str_substr(str, beg,
len, TRUE);
2927str_substr(
VALUE str,
long beg,
long len,
int empty)
2931 if (!p)
return Qnil;
2932 if (!
len && !empty)
return Qnil;
2936 VALUE str2 = str_subseq(str, beg,
len);
2937 rb_enc_cr_str_copy_for_substr(str2, str);
2981str_uminus(
VALUE str)
2986 return rb_fstring(str);
/* From this point on, the name rb_str_dup_frozen expands to
 * rb_str_new_frozen (an earlier #undef in this file removed any previous
 * macro of the same name). */
2990#define rb_str_dup_frozen rb_str_new_frozen
2995 if (
FL_TEST(str, STR_TMPLOCK)) {
2998 FL_SET(str, STR_TMPLOCK);
3005 if (!
FL_TEST(str, STR_TMPLOCK)) {
3012RUBY_FUNC_EXPORTED
VALUE
3023 const int termlen = TERM_LEN(str);
3025 str_modifiable(str);
3026 if (STR_SHARED_P(str)) {
3029 if (
len > (
capa = (
long)str_capacity(str, termlen)) ||
len < 0) {
3059 STR_SET_LEN(str,
len);
3070 int independent = str_independent(str);
3079 const int termlen = TERM_LEN(str);
3080 if (STR_EMBED_P(str)) {
3081 if (
len == slen)
return str;
3082 if (str_embed_capa(str) >=
len + termlen) {
3083 STR_SET_EMBED_LEN(str,
len);
3087 str_make_independent_expand(str, slen,
len - slen, termlen);
3089 else if (str_embed_capa(str) >=
len + termlen) {
3090 char *
ptr = STR_HEAP_PTR(str);
3092 if (slen >
len) slen =
len;
3095 STR_SET_EMBED_LEN(str,
len);
3096 if (independent) ruby_xfree(
ptr);
3099 else if (!independent) {
3100 if (
len == slen)
return str;
3101 str_make_independent_expand(str, slen,
len - slen, termlen);
3105 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
3106 (
size_t)
len + termlen, STR_HEAP_SIZE(str));
3109 else if (
len == slen)
return str;
3117str_buf_cat4(
VALUE str,
const char *
ptr,
long len,
bool keep_cr)
3120 str_modify_keep_cr(str);
3125 if (
len == 0)
return 0;
3127 long capa, total, olen, off = -1;
3129 const int termlen = TERM_LEN(str);
3135 if (
ptr >= sptr &&
ptr <= sptr + olen) {
3139 if (STR_EMBED_P(str)) {
3140 capa = str_embed_capa(str) - termlen;
3141 sptr =
RSTRING(str)->as.embed.ary;
3146 sptr =
RSTRING(str)->as.heap.ptr;
3147 olen =
RSTRING(str)->as.heap.len;
3149 if (olen > LONG_MAX -
len) {
3154 if (total >= LONG_MAX / 2) {
3157 while (total >
capa) {
3160 RESIZE_CAPA_TERM(str,
capa, termlen);
3166 memcpy(sptr + olen,
ptr,
len);
3167 STR_SET_LEN(str, total);
3168 TERM_FILL(sptr + total, termlen);
/* str_buf_cat4() with keep_cr fixed to false. Note that `len` is pasted
 * into the expansion without parentheses. */
3173#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
/* Same, with the length taken from rb_strlen_lit(ptr).
 * NOTE(review): rb_strlen_lit presumably requires `ptr` to be a string
 * literal -- confirm before passing a pointer variable. */
3174#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3179 if (
len == 0)
return str;
3183 return str_buf_cat(str,
ptr,
len);
3198rb_enc_cr_str_buf_cat(
VALUE str,
const char *
ptr,
long len,
3199 int ptr_encindex,
int ptr_cr,
int *ptr_cr_ret)
3208 if (str_encindex == ptr_encindex) {
3210 ptr_cr = coderange_scan(
ptr,
len, rb_enc_from_index(ptr_encindex));
3214 str_enc = rb_enc_from_index(str_encindex);
3215 ptr_enc = rb_enc_from_index(ptr_encindex);
3228 ptr_cr = coderange_scan(
ptr,
len, ptr_enc);
3237 *ptr_cr_ret = ptr_cr;
3239 if (str_encindex != ptr_encindex &&
3242 str_enc = rb_enc_from_index(str_encindex);
3243 ptr_enc = rb_enc_from_index(ptr_encindex);
3248 res_encindex = str_encindex;
3253 res_encindex = str_encindex;
3257 res_encindex = ptr_encindex;
3262 res_encindex = str_encindex;
3269 res_encindex = str_encindex;
3277 str_buf_cat(str,
ptr,
len);
3290 return rb_enc_cr_str_buf_cat(str,
ptr,
len,
3301 return rb_enc_cr_str_buf_cat(str,
ptr, strlen(
ptr),
3307 unsigned int c = (
unsigned char)*
ptr;
3308 int len = rb_enc_codelen(c, enc);
3310 rb_enc_cr_str_buf_cat(str, buf,
len,
3323 if (str_enc_fastpath(str)) {
/* Length threshold compared against len in rb_str_concat_literals();
 * lengths below it take the LIKELY branch there. */
3359#define MIN_PRE_ALLOC_SIZE 48
3361MJIT_FUNC_EXPORTED
VALUE
3362rb_str_concat_literals(
size_t num,
const VALUE *strary)
3372 if (LIKELY(
len < MIN_PRE_ALLOC_SIZE)) {
3378 rb_enc_copy(str, strary[0]);
3382 for (i = s; i < num; ++i) {
3383 const VALUE v = strary[i];
3387 if (encidx != ENCINDEX_US_ASCII) {
3389 rb_enc_set_index(str, encidx);
3414rb_str_concat_multi(
int argc,
VALUE *argv,
VALUE str)
3416 str_modifiable(str);
3421 else if (argc > 1) {
3424 rb_enc_copy(arg_str, str);
3425 for (i = 0; i < argc; i++) {
3460 if (rb_num_to_uint(str2, &code) == 0) {
3473 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
3476 buf[0] = (char)code;
3478 if (encidx != rb_enc_to_index(enc)) {
3479 rb_enc_associate_index(str1, encidx);
3489 switch (
len = rb_enc_codelen(code, enc)) {
3490 case ONIGERR_INVALID_CODE_POINT_VALUE:
3493 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3500 if (rb_enc_precise_mbclen(buf, buf +
len + 1, enc) !=
len) {
3517rb_ascii8bit_appendable_encoding_index(
rb_encoding *enc,
unsigned int code)
3519 int encidx = rb_enc_to_index(enc);
3521 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
3526 if (encidx == ENCINDEX_US_ASCII && code > 127) {
3527 return ENCINDEX_ASCII_8BIT;
3550rb_str_prepend_multi(
int argc,
VALUE *argv,
VALUE str)
3552 str_modifiable(str);
3557 else if (argc > 1) {
3560 rb_enc_copy(arg_str, str);
3561 for (i = 0; i < argc; i++) {
3574 if (e && is_ascii_string(str)) {
3584 const char *ptr1, *ptr2;
3587 return (len1 != len2 ||
3589 memcmp(ptr1, ptr2, len1) != 0);
3603rb_str_hash_m(
VALUE str)
/* Yields the smaller of a and b (a when they compare equal).
 * Function-like macro: each argument may be evaluated twice, so do not
 * pass expressions with side effects. */
3609#define lesser(a,b) (((a)>(b))?(b):(a))
3621 if (idx1 == idx2)
return TRUE;
3640 const char *ptr1, *ptr2;
3643 if (str1 == str2)
return 0;
3646 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
3655 if (len1 > len2)
return 1;
3658 if (retval > 0)
return 1;
3685 if (str1 == str2)
return Qtrue;
3692 return rb_str_eql_internal(str1, str2);
3713MJIT_FUNC_EXPORTED
VALUE
3716 if (str1 == str2)
return Qtrue;
3718 return rb_str_eql_internal(str1, str2);
3749 return rb_invcmp(str1, str2);
3791 return str_casecmp(str1, s);
3799 const char *p1, *p1end, *p2, *p2end;
3801 enc = rb_enc_compatible(str1, str2);
3808 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
3809 while (p1 < p1end && p2 < p2end) {
3811 unsigned int c1 =
TOLOWER(*p1 & 0xff);
3812 unsigned int c2 =
TOLOWER(*p2 & 0xff);
3814 return INT2FIX(c1 < c2 ? -1 : 1);
3821 while (p1 < p1end && p2 < p2end) {
3822 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
3823 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
3825 if (0 <= c1 && 0 <= c2) {
3829 return INT2FIX(c1 < c2 ? -1 : 1);
3833 l1 = rb_enc_mbclen(p1, p1end, enc);
3834 l2 = rb_enc_mbclen(p2, p2end, enc);
3835 len = l1 < l2 ? l1 : l2;
3836 r = memcmp(p1, p2,
len);
3838 return INT2FIX(r < 0 ? -1 : 1);
3840 return INT2FIX(l1 < l2 ? -1 : 1);
3881 return str_casecmp_p(str1, s);
3888 VALUE folded_str1, folded_str2;
3889 VALUE fold_opt = sym_fold;
3891 enc = rb_enc_compatible(str1, str2);
3896 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
3897 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
3899 return rb_str_eql(folded_str1, folded_str2);
3903strseq_core(
const char *str_ptr,
const char *str_ptr_end,
long str_len,
3904 const char *sub_ptr,
long sub_len,
long offset,
rb_encoding *enc)
3906 const char *search_start = str_ptr;
3907 long pos, search_len = str_len - offset;
3911 pos =
rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3912 if (pos < 0)
return pos;
3914 if (t == search_start + pos)
break;
3915 search_len -= t - search_start;
3916 if (search_len <= 0)
return -1;
3917 offset += t - search_start;
3920 return pos + offset;
/* Shorthand for rb_strseq_index() with its in_byte argument fixed to 0. */
3923#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
3926rb_strseq_index(
VALUE str,
VALUE sub,
long offset,
int in_byte)
3928 const char *str_ptr, *str_ptr_end, *sub_ptr;
3929 long str_len, sub_len;
3932 enc = rb_enc_check(str, sub);
3933 if (is_broken_string(sub))
return -1;
3941 if (str_len < sub_len)
return -1;
3944 long str_len_char, sub_len_char;
3945 int single_byte = single_byte_optimizable(str);
3946 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
3947 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
3949 offset += str_len_char;
3950 if (offset < 0)
return -1;
3952 if (str_len_char - offset < sub_len_char)
return -1;
3953 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
3956 if (sub_len == 0)
return offset;
3959 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
3973rb_str_index_m(
int argc,
VALUE *argv,
VALUE str)
3979 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
3986 pos += str_strlen(str, NULL);
3996 if (pos > str_strlen(str, NULL))
3999 rb_enc_check(str, sub), single_byte_optimizable(str));
4013 pos = rb_str_index(str, sub, pos);
4017 if (pos == -1)
return Qnil;
4026str_check_byte_pos(
VALUE str,
long pos)
4030 const char *p = s + pos;
4077rb_str_byteindex_m(
int argc,
VALUE *argv,
VALUE str)
4083 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4089 if (pos < 0 || pos > slen) {
4100 if (!str_check_byte_pos(str, pos)) {
4102 "offset %ld does not land on character boundary", pos);
4118 pos = rb_strseq_index(str, sub, pos, 1);
4121 if (pos == -1)
return Qnil;
4129 char *hit, *adjusted;
4131 long slen, searchlen;
4136 if (slen == 0)
return s - sbeg;
4140 searchlen = s - sbeg + 1;
4143 hit = memrchr(sbeg, c, searchlen);
4146 if (hit != adjusted) {
4147 searchlen = adjusted - sbeg;
4150 if (memcmp(hit, t, slen) == 0)
4152 searchlen = adjusted - sbeg;
4153 }
while (searchlen > 0);
4170 if (memcmp(s, t, slen) == 0) {
4173 if (s <= sbeg)
break;
4189 enc = rb_enc_check(str, sub);
4190 if (is_broken_string(sub))
return -1;
4191 singlebyte = single_byte_optimizable(str);
4192 len = singlebyte ?
RSTRING_LEN(str) : str_strlen(str, enc);
4193 slen = str_strlen(sub, enc);
4196 if (len < slen)
return -1;
4197 if (len - pos < slen) pos = len - slen;
4198 if (len == 0)
return pos;
4209 s = str_nth(sbeg,
RSTRING_END(str), pos, enc, singlebyte);
4271rb_str_rindex_m(
int argc,
VALUE *argv,
VALUE str)
4276 long pos, len = str_strlen(str, enc);
4278 if (
rb_scan_args(argc, argv,
"11", &sub, &vpos) == 2) {
4289 if (pos > len) pos = len;
4298 enc, single_byte_optimizable(str));
4309 pos = rb_str_rindex(str, sub, pos);
4310 if (pos >= 0)
return LONG2NUM(pos);
4316rb_str_byterindex(
VALUE str,
VALUE sub,
long pos)
4322 enc = rb_enc_check(str, sub);
4323 if (is_broken_string(sub))
return -1;
4328 if (len < slen)
return -1;
4329 if (len - pos < slen) pos = len - slen;
4330 if (len == 0)
return pos;
4342 return str_rindex(str, sub, s, enc);
4407rb_str_byterindex_m(
int argc,
VALUE *argv,
VALUE str)
4413 if (
rb_scan_args(argc, argv,
"11", &sub, &vpos) == 2) {
4424 if (pos > len) pos = len;
4430 if (!str_check_byte_pos(str, pos)) {
4432 "offset %ld does not land on character boundary", pos);
4445 pos = rb_str_byterindex(str, sub, pos);
4446 if (pos >= 0)
return LONG2NUM(pos);
4482 switch (OBJ_BUILTIN_TYPE(y)) {
4534rb_str_match_m(
int argc,
VALUE *argv,
VALUE str)
4541 result = rb_funcallv(get_pat(re),
rb_intern(
"match"), argc, argv);
4573rb_str_match_m_p(
int argc,
VALUE *argv,
VALUE str)
4577 re = get_pat(argv[0]);
4578 return rb_reg_match_p(re, str, argc > 1 ?
NUM2LONG(argv[1]) : 0);
4587static enum neighbor_char
4595 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4597 return NEIGHBOR_NOT_CHAR;
4601 if (!l)
return NEIGHBOR_NOT_CHAR;
4602 if (l != len)
return NEIGHBOR_WRAPPED;
4604 r = rb_enc_precise_mbclen(p, p + len, enc);
4606 return NEIGHBOR_NOT_CHAR;
4608 return NEIGHBOR_FOUND;
4611 for (i = len-1; 0 <= i && (
unsigned char)p[i] == 0xff; i--)
4614 return NEIGHBOR_WRAPPED;
4615 ++((
unsigned char*)p)[i];
4616 l = rb_enc_precise_mbclen(p, p+len, enc);
4620 return NEIGHBOR_FOUND;
4623 memset(p+l, 0xff, len-l);
4629 for (len2 = len-1; 0 < len2; len2--) {
4630 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4634 memset(p+len2+1, 0xff, len-(len2+1));
4639static enum neighbor_char
4646 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4648 return NEIGHBOR_NOT_CHAR;
4651 if (!c)
return NEIGHBOR_NOT_CHAR;
4654 if (!l)
return NEIGHBOR_NOT_CHAR;
4655 if (l != len)
return NEIGHBOR_WRAPPED;
4657 r = rb_enc_precise_mbclen(p, p + len, enc);
4659 return NEIGHBOR_NOT_CHAR;
4661 return NEIGHBOR_FOUND;
4664 for (i = len-1; 0 <= i && (
unsigned char)p[i] == 0; i--)
4667 return NEIGHBOR_WRAPPED;
4668 --((
unsigned char*)p)[i];
4669 l = rb_enc_precise_mbclen(p, p+len, enc);
4673 return NEIGHBOR_FOUND;
4676 memset(p+l, 0, len-l);
4682 for (len2 = len-1; 0 < len2; len2--) {
4683 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4687 memset(p+len2+1, 0, len-(len2+1));
4701static enum neighbor_char
4702enc_succ_alnum_char(
char *p,
long len,
rb_encoding *enc,
char *carry)
4704 enum neighbor_char ret;
4708 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
4712 const int max_gaps = 1;
4716 ctype = ONIGENC_CTYPE_DIGIT;
4718 ctype = ONIGENC_CTYPE_ALPHA;
4720 return NEIGHBOR_NOT_CHAR;
4722 MEMCPY(save, p,
char, len);
4723 for (
try = 0;
try <= max_gaps; ++
try) {
4724 ret = enc_succ_char(p, len, enc);
4725 if (ret == NEIGHBOR_FOUND) {
4728 return NEIGHBOR_FOUND;
4731 MEMCPY(p, save,
char, len);
4734 MEMCPY(save, p,
char, len);
4735 ret = enc_pred_char(p, len, enc);
4736 if (ret == NEIGHBOR_FOUND) {
4739 MEMCPY(p, save,
char, len);
4744 MEMCPY(p, save,
char, len);
4750 return NEIGHBOR_NOT_CHAR;
4753 if (ctype != ONIGENC_CTYPE_DIGIT) {
4754 MEMCPY(carry, p,
char, len);
4755 return NEIGHBOR_WRAPPED;
4758 MEMCPY(carry, p,
char, len);
4759 enc_succ_char(carry, len, enc);
4760 return NEIGHBOR_WRAPPED;
4830 rb_enc_cr_str_copy_for_substr(str, orig);
4831 return str_succ(str);
4838 char *sbeg, *s, *e, *last_alnum = 0;
4839 int found_alnum = 0;
4841 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] =
"\1";
4842 long carry_pos = 0, carry_len = 1;
4843 enum neighbor_char neighbor = NEIGHBOR_FOUND;
4846 if (slen == 0)
return str;
4848 enc = STR_ENC_GET(str);
4850 s = e = sbeg + slen;
4853 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
4859 l = rb_enc_precise_mbclen(s, e, enc);
4860 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
4861 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4862 neighbor = enc_succ_alnum_char(s, l, enc, carry);
4864 case NEIGHBOR_NOT_CHAR:
4866 case NEIGHBOR_FOUND:
4868 case NEIGHBOR_WRAPPED:
4873 carry_pos = s - sbeg;
4879 enum neighbor_char neighbor;
4880 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
4881 l = rb_enc_precise_mbclen(s, e, enc);
4882 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
4883 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4885 neighbor = enc_succ_char(tmp, l, enc);
4887 case NEIGHBOR_FOUND:
4891 case NEIGHBOR_WRAPPED:
4894 case NEIGHBOR_NOT_CHAR:
4897 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
4899 enc_succ_char(s, l, enc);
4902 MEMCPY(carry, s,
char, l);
4905 carry_pos = s - sbeg;
4909 RESIZE_CAPA(str, slen + carry_len);
4911 s = sbeg + carry_pos;
4912 memmove(s + carry_len, s, slen - carry_pos);
4913 memmove(s, carry, carry_len);
4915 STR_SET_LEN(str, slen);
4932rb_str_succ_bang(
VALUE str)
4940all_digits_p(
const char *s,
long len)
4994 VALUE end, exclusive;
4998 return rb_str_upto_each(beg, end,
RTEST(exclusive), str_upto_i,
Qnil);
5004 VALUE current, after_end;
5011 enc = rb_enc_check(beg, end);
5012 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5018 if (c > e || (excl && c == e))
return beg;
5021 if (!excl && c == e)
break;
5023 if (excl && c == e)
break;
5035 b = rb_str_to_inum(beg, 10, FALSE);
5036 e = rb_str_to_inum(end, 10, FALSE);
5043 if (excl && bi == ei)
break;
5044 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5049 ID op = excl ?
'<' : idLE;
5050 VALUE args[2], fmt = rb_fstring_lit(
"%.*d");
5055 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5056 b = rb_funcallv(b, succ, 0, 0);
5063 if (n > 0 || (excl && n == 0))
return beg;
5065 after_end = rb_funcallv(end, succ, 0, 0);
5070 next = rb_funcallv(current, succ, 0, 0);
5071 if ((*each)(current, arg))
break;
5072 if (
NIL_P(next))
break;
5093 VALUE b, args[2], fmt = rb_fstring_lit(
"%.*d");
5095 b = rb_str_to_inum(beg, 10, FALSE);
5101 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5109 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5110 b = rb_funcallv(b, succ, 0, 0);
5116 VALUE next = rb_funcallv(current, succ, 0, 0);
5117 if ((*each)(current, arg))
break;
5131 if (!
rb_equal(str, *argp))
return 0;
5160 if (b <= v && v < e)
return Qtrue;
5161 return RBOOL(!
RTEST(exclusive) && v == e);
5174 rb_str_upto_each(beg, end,
RTEST(exclusive), include_range_i, (
VALUE)&val);
5176 return RBOOL(
NIL_P(val));
5199 return rb_str_subpat(str, indx,
INT2FIX(0));
5202 if (rb_str_index(str, indx, 0) != -1)
5208 long beg, len = str_strlen(str, NULL);
5220 return str_substr(str, idx, 1, FALSE);
5239rb_str_aref_m(
int argc,
VALUE *argv,
VALUE str)
5243 return rb_str_subpat(str, argv[0], argv[1]);
5252 return rb_str_aref(str, argv[0]);
5261 str_modifiable(str);
5262 if (len > olen) len = olen;
5264 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5266 int fl = (int)(
RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5268 STR_SET_EMBED_LEN(str, nlen);
5269 ptr =
RSTRING(str)->as.embed.ary;
5270 memmove(ptr, oldptr + len, nlen);
5271 if (fl == STR_NOEMBED)
xfree(oldptr);
5274 if (!STR_SHARED_P(str)) {
5276 rb_enc_cr_str_exact_copy(shared, str);
5279 ptr =
RSTRING(str)->as.heap.ptr += len;
5280 RSTRING(str)->as.heap.len = nlen;
5288rb_str_splice_0(
VALUE str,
long beg,
long len,
VALUE val)
5294 if (beg == 0 && vlen == 0) {
5299 str_modify_keep_cr(str);
5303 RESIZE_CAPA(str, slen + vlen - len);
5313 memmove(sptr + beg + vlen,
5315 slen - (beg + len));
5317 if (vlen < beg && len < 0) {
5318 MEMZERO(sptr + slen,
char, -len);
5324 STR_SET_LEN(str, slen);
5325 TERM_FILL(&sptr[slen], TERM_LEN(str));
5335 int singlebyte = single_byte_optimizable(str);
5341 enc = rb_enc_check(str, val);
5342 slen = str_strlen(str, enc);
5344 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5351 assert(beg <= slen);
5352 if (len > slen - beg) {
5357 e = str_nth(p,
RSTRING_END(str), len, enc, singlebyte);
5362 rb_str_splice_0(str, beg, len, val);
5363 rb_enc_associate(str, enc);
/* File-local alias: rb_str_splice(...) forwards all four arguments,
 * unchanged, to rb_str_update(...). */
5369#define rb_str_splice(str, beg, len, val) rb_str_update(str, beg, len, val)
5376 long start, end, len;
5386 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5390 nth += regs->num_regs;
5400 enc = rb_enc_check_str(str, val);
5401 rb_str_splice_0(str, start, len, val);
5402 rb_enc_associate(str, enc);
5410 switch (
TYPE(indx)) {
5412 rb_str_subpat_set(str, indx,
INT2FIX(0), val);
5416 beg = rb_str_index(str, indx, 0);
5421 rb_str_splice(str, beg, str_strlen(indx, NULL), val);
5429 rb_str_splice(str, beg, len, val);
5437 rb_str_splice(str, idx, 1, val);
5472rb_str_aset_m(
int argc,
VALUE *argv,
VALUE str)
5476 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5484 return rb_str_aset(str, argv[0], argv[1]);
5516 rb_str_splice(str, pos, 0, str2);
5544rb_str_slice_bang(
int argc,
VALUE *argv,
VALUE str)
5552 str_modify_keep_cr(str);
5560 if ((nth += regs->num_regs) <= 0)
return Qnil;
5562 else if (nth >= regs->num_regs)
return Qnil;
5564 len = END(nth) - beg;
5567 else if (argc == 2) {
5575 if (!len)
return Qnil;
5580 beg = rb_str_index(str, indx, 0);
5581 if (beg == -1)
return Qnil;
5593 if (!len)
return Qnil;
5607 rb_enc_cr_str_copy_for_substr(result, str);
5617 if (beg + len > slen)
5621 slen - (beg + len));
5623 STR_SET_LEN(str, slen);
5624 TERM_FILL(&sptr[slen], TERM_LEN(str));
5635 switch (OBJ_BUILTIN_TYPE(pat)) {
5654get_pat_quoted(
VALUE pat,
int check)
5658 switch (OBJ_BUILTIN_TYPE(pat)) {
5672 if (check && is_broken_string(pat)) {
5679rb_pat_search(
VALUE pat,
VALUE str,
long pos,
int set_backref_str)
5682 pos = rb_strseq_index(str, pat, pos, 1);
5683 if (set_backref_str) {
5685 str = rb_str_new_frozen_String(str);
5686 rb_backref_set_string(str, pos,
RSTRING_LEN(pat));
5695 return rb_reg_search0(pat, str, pos, 0, set_backref_str);
5715rb_str_sub_bang(
int argc,
VALUE *argv,
VALUE str)
5729 hash = rb_check_hash_type(argv[1]);
5735 pat = get_pat_quoted(argv[0], 1);
5737 str_modifiable(str);
5738 beg = rb_pat_search(pat, str, 0, 1);
5761 if (iter || !
NIL_P(hash)) {
5768 repl = rb_hash_aref(hash,
rb_str_subseq(str, beg0, end0 - beg0));
5771 str_mod_check(str, p, len);
5778 enc = rb_enc_compatible(str, repl);
5788 enc = STR_ENC_GET(repl);
5791 rb_enc_associate(str, enc);
5804 RESIZE_CAPA(str, len + rlen - plen);
5808 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
5811 memmove(p + beg0, rp, rlen);
5813 STR_SET_LEN(str, len);
5841 rb_str_sub_bang(argc, argv, str);
5846str_gsub(
int argc,
VALUE *argv,
VALUE str,
int bang)
5850 long beg, beg0, end0;
5851 long offset, blen, slen, len, last;
5852 enum {STR, ITER, MAP} mode = STR;
5854 int need_backref = -1;
5864 hash = rb_check_hash_type(argv[1]);
5873 rb_error_arity(argc, 1, 2);
5876 pat = get_pat_quoted(argv[0], 1);
5877 beg = rb_pat_search(pat, str, 0, need_backref);
5879 if (bang)
return Qnil;
5889 str_enc = STR_ENC_GET(str);
5890 rb_enc_associate(dest, str_enc);
5912 val = rb_hash_aref(hash,
rb_str_subseq(str, beg0, end0 - beg0));
5915 str_mod_check(str, sp, slen);
5920 else if (need_backref) {
5922 if (need_backref < 0) {
5923 need_backref = val != repl;
5930 len = beg0 - offset;
5947 offset = end0 + len;
5951 beg = rb_pat_search(pat, str, offset, need_backref);
5956 rb_pat_search(pat, str, last, 1);
5958 str_shared_replace(str, dest);
5986rb_str_gsub_bang(
int argc,
VALUE *argv,
VALUE str)
5988 str_modify_keep_cr(str);
5989 return str_gsub(argc, argv, str, 1);
6012 return str_gsub(argc, argv, str, 0);
6030 str_modifiable(str);
6031 if (str == str2)
return str;
6035 return str_replace(str, str2);
6050rb_str_clear(
VALUE str)
6054 STR_SET_EMBED_LEN(str, 0);
6075rb_str_chr(
VALUE str)
6123 char *ptr, *head, *left = 0;
6127 if (pos < -len || len <= pos)
6134 char byte = (char)(
NUM2INT(w) & 0xFF);
6136 if (!str_independent(str))
6137 str_make_independent(str);
6138 enc = STR_ENC_GET(str);
6141 if (!STR_EMBED_P(str)) {
6148 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6156 width = rb_enc_precise_mbclen(left, head+len, enc);
6158 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6174str_byte_substr(
VALUE str,
long beg,
long len,
int empty)
6178 if (beg > n || len < 0)
return Qnil;
6181 if (beg < 0)
return Qnil;
6186 if (!empty)
return Qnil;
6190 VALUE str2 = str_subseq(str, beg, len);
6192 str_enc_copy(str2, str);
6231 return str_byte_substr(str, beg, len, TRUE);
6236 return str_byte_substr(str, idx, 1, FALSE);
6283rb_str_byteslice(
int argc,
VALUE *argv,
VALUE str)
6288 return str_byte_substr(str, beg, len, TRUE);
6291 return str_byte_aref(str, argv[0]);
6311rb_str_bytesplice(
int argc,
VALUE *argv,
VALUE str)
6313 long beg, end, len, slen;
6322 rb_builtin_class_name(argv[0]));
6333 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
6340 assert(beg <= slen);
6341 if (len > slen - beg) {
6345 if (!str_check_byte_pos(str, beg)) {
6347 "offset %ld does not land on character boundary", beg);
6349 if (!str_check_byte_pos(str, end)) {
6351 "offset %ld does not land on character boundary", end);
6354 enc = rb_enc_check(str, val);
6355 str_modify_keep_cr(str);
6356 rb_str_splice_0(str, beg, len, val);
6357 rb_enc_associate(str, enc);
6375rb_str_reverse(
VALUE str)
6383 enc = STR_ENC_GET(str);
6390 if (single_byte_optimizable(str)) {
6397 int clen = rb_enc_fast_mbclen(s, e, enc);
6408 int clen = rb_enc_mbclen(s, e, enc);
6418 str_enc_copy(rev, str);
6438rb_str_reverse_bang(
VALUE str)
6441 if (single_byte_optimizable(str)) {
6444 str_modify_keep_cr(str);
6454 str_shared_replace(str, rb_str_reverse(str));
6458 str_modify_keep_cr(str);
6483 i = rb_str_index(str, arg, 0);
6485 return RBOOL(i != -1);
6529 return rb_str_to_inum(str, base, FALSE);
6553rb_str_to_f(
VALUE str)
6571rb_str_to_s(
VALUE str)
6583 char s[RUBY_MAX_CHAR_LEN];
6584 int n = rb_enc_codelen(c, enc);
/* Size limit passed to snprintf() when formatting an escaped character;
 * the destination buffers in this file are declared with
 * CHAR_ESC_LEN + 1 bytes. */
6591#define CHAR_ESC_LEN 13
6594rb_str_buf_cat_escaped_char(
VALUE result,
unsigned int c,
int unicode_p)
6596 char buf[CHAR_ESC_LEN + 1];
6604 snprintf(buf, CHAR_ESC_LEN,
"%c", c);
6606 else if (c < 0x10000) {
6607 snprintf(buf, CHAR_ESC_LEN,
"\\u%04X", c);
6610 snprintf(buf, CHAR_ESC_LEN,
"\\u{%X}", c);
6615 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", c);
6618 snprintf(buf, CHAR_ESC_LEN,
"\\x{%X}", c);
6621 l = (int)strlen(buf);
6627ruby_escaped_char(
int c)
6630 case '\0':
return "\\0";
6631 case '\n':
return "\\n";
6632 case '\r':
return "\\r";
6633 case '\t':
return "\\t";
6634 case '\f':
return "\\f";
6635 case '\013':
return "\\v";
6636 case '\010':
return "\\b";
6637 case '\007':
return "\\a";
6638 case '\033':
return "\\e";
6639 case '\x7f':
return "\\c?";
6645rb_str_escape(
VALUE str)
6651 const char *prev = p;
6652 char buf[CHAR_ESC_LEN + 1];
6654 int unicode_p = rb_enc_unicode_p(enc);
6660 int n = rb_enc_precise_mbclen(p, pend, enc);
6662 if (p > prev) str_buf_cat(result, prev, p - prev);
6665 n = (int)(pend - p);
6667 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
6668 str_buf_cat(result, buf, strlen(buf));
6676 cc = ruby_escaped_char(c);
6678 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6679 str_buf_cat(result, cc, strlen(cc));
6685 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6686 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6690 if (p > prev) str_buf_cat(result, prev, p - prev);
6714 const char *p, *pend, *prev;
6715 char buf[CHAR_ESC_LEN + 1];
6717 rb_encoding *resenc = rb_default_internal_encoding();
6718 int unicode_p = rb_enc_unicode_p(enc);
6721 if (resenc == NULL) resenc = rb_default_external_encoding();
6723 rb_enc_associate(result, resenc);
6724 str_buf_cat2(result,
"\"");
6732 n = rb_enc_precise_mbclen(p, pend, enc);
6734 if (p > prev) str_buf_cat(result, prev, p - prev);
6737 n = (int)(pend - p);
6739 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
6740 str_buf_cat(result, buf, strlen(buf));
6748 if ((asciicompat || unicode_p) &&
6749 (c ==
'"'|| c ==
'\\' ||
6754 (cc ==
'$' || cc ==
'@' || cc ==
'{'))))) {
6755 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6756 str_buf_cat2(result,
"\\");
6757 if (asciicompat || enc == resenc) {
6763 case '\n': cc =
'n';
break;
6764 case '\r': cc =
'r';
break;
6765 case '\t': cc =
't';
break;
6766 case '\f': cc =
'f';
break;
6767 case '\013': cc =
'v';
break;
6768 case '\010': cc =
'b';
break;
6769 case '\007': cc =
'a';
break;
6770 case 033: cc =
'e';
break;
6771 default: cc = 0;
break;
6774 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6777 str_buf_cat(result, buf, 2);
6794 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6795 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6800 if (p > prev) str_buf_cat(result, prev, p - prev);
6801 str_buf_cat2(result,
"\"");
/* True when p is still before e and the byte at p is a dollar sign, an at
 * sign, or an opening brace. p is expanded several times, so it must not
 * have side effects. */
6806#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6826 int encidx = rb_enc_get_index(str);
6829 const char *p, *pend;
6832 int u8 = (encidx == rb_utf8_encindex());
6833 static const char nonascii_suffix[] =
".dup.force_encoding(\"%s\")";
6838 len += strlen(enc->name);
6844 unsigned char c = *p++;
6847 case '"':
case '\\':
6848 case '\n':
case '\r':
6849 case '\t':
case '\f':
6850 case '\013':
case '\010':
case '\007':
case '\033':
6855 clen = IS_EVSTR(p, pend) ? 2 : 1;
6863 if (u8 && c > 0x7F) {
6864 int n = rb_enc_precise_mbclen(p-1, pend, enc);
6869 else if (cc <= 0xFFFFF)
6882 if (clen > LONG_MAX - len) {
6894 unsigned char c = *p++;
6896 if (c ==
'"' || c ==
'\\') {
6900 else if (c ==
'#') {
6901 if (IS_EVSTR(p, pend)) *q++ =
'\\';
6904 else if (c ==
'\n') {
6908 else if (c ==
'\r') {
6912 else if (c ==
'\t') {
6916 else if (c ==
'\f') {
6920 else if (c ==
'\013') {
6924 else if (c ==
'\010') {
6928 else if (c ==
'\007') {
6932 else if (c ==
'\033') {
6942 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
6947 snprintf(q, qend-q,
"u%04X", cc);
6949 snprintf(q, qend-q,
"u{%X}", cc);
6954 snprintf(q, qend-q,
"x%02X", c);
6961 snprintf(q, qend-q, nonascii_suffix, enc->name);
6962 encidx = rb_ascii8bit_encindex();
6965 rb_enc_associate_index(result, encidx);
6971unescape_ascii(
unsigned int c)
6995undump_after_backslash(
VALUE undumped,
const char **ss,
const char *s_end,
rb_encoding **penc,
bool *utf8,
bool *binary)
6997 const char *s = *ss;
7001 unsigned char buf[6];
7019 *buf = unescape_ascii(*s);
7031 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7032 if (*penc != enc_utf8) {
7034 rb_enc_associate(undumped, enc_utf8);
7051 if (hexlen == 0 || hexlen > 6) {
7057 if (0xd800 <= c && c <= 0xdfff) {
7070 if (0xd800 <= c && c <= 0xdfff) {
7101static VALUE rb_str_is_ascii_only_p(
VALUE str);
7119str_undump(
VALUE str)
7126 bool binary =
false;
7130 if (rb_str_is_ascii_only_p(str) ==
Qfalse) {
7133 if (!str_null_check(str, &w)) {
7137 if (*s !=
'"')
goto invalid_format;
7155 static const char force_encoding_suffix[] =
".force_encoding(\"";
7156 static const char dup_suffix[] =
".dup";
7157 const char *encname;
7162 size =
sizeof(dup_suffix) - 1;
7163 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7165 size =
sizeof(force_encoding_suffix) - 1;
7166 if (s_end - s <= size)
goto invalid_format;
7167 if (memcmp(s, force_encoding_suffix, size) != 0)
goto invalid_format;
7175 s = memchr(s,
'"', s_end-s);
7177 if (!s)
goto invalid_format;
7178 if (s_end - s != 2)
goto invalid_format;
7179 if (s[0] !=
'"' || s[1] !=
')')
goto invalid_format;
7181 encidx = rb_enc_find_index2(encname, (
long)size);
7185 rb_enc_associate_index(undumped, encidx);
7195 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7204 rb_raise(
rb_eRuntimeError,
"invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7210 if (rb_enc_dummy_p(enc)) {
7217str_true_enc(
VALUE str)
7220 rb_str_check_dummy_enc(enc);
7224static OnigCaseFoldType
7225check_case_options(
int argc,
VALUE *argv, OnigCaseFoldType flags)
7231 if (argv[0]==sym_turkic) {
7232 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7234 if (argv[1]==sym_lithuanian)
7235 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7240 else if (argv[0]==sym_lithuanian) {
7241 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7243 if (argv[1]==sym_turkic)
7244 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7251 else if (argv[0]==sym_ascii)
7252 flags |= ONIGENC_CASE_ASCII_ONLY;
7253 else if (argv[0]==sym_fold) {
7254 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7255 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7267 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() ||
rb_enc_mbmaxlen(enc) == 1))
/* Fixed number of extra bytes added to the computed capacity of each
 * case-mapping buffer (see the capa calculation in the case-mapping loop). */
7273#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7274#ifndef CASEMAP_DEBUG
7275# define CASEMAP_DEBUG 0
7283 OnigUChar space[FLEX_ARY_LEN];
7287mapping_buffer_free(
void *p)
7291 while (current_buffer) {
7292 previous_buffer = current_buffer;
7293 current_buffer = current_buffer->next;
7294 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7300 {0, mapping_buffer_free,}
7308 const OnigUChar *source_current, *source_end;
7309 int target_length = 0;
7310 VALUE buffer_anchor;
7313 size_t buffer_count = 0;
7314 int buffer_length_or_invalid;
7323 while (source_current < source_end) {
7325 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7326 if (CASEMAP_DEBUG) {
7327 fprintf(stderr,
"Buffer allocation, capa is %"PRIuSIZE
"\n", capa);
7330 *pre_buffer = current_buffer;
7331 pre_buffer = ¤t_buffer->next;
7332 current_buffer->next = NULL;
7333 current_buffer->capa = capa;
7334 buffer_length_or_invalid = enc->case_map(flags,
7335 &source_current, source_end,
7336 current_buffer->space,
7337 current_buffer->space+current_buffer->capa,
7339 if (buffer_length_or_invalid < 0) {
7340 current_buffer =
DATA_PTR(buffer_anchor);
7342 mapping_buffer_free(current_buffer);
7345 target_length += current_buffer->used = buffer_length_or_invalid;
7347 if (CASEMAP_DEBUG) {
7348 fprintf(stderr,
"Buffer count is %"PRIuSIZE
"\n", buffer_count);
7351 if (buffer_count==1) {
7352 target =
rb_str_new((
const char*)current_buffer->space, target_length);
7355 char *target_current;
7359 current_buffer =
DATA_PTR(buffer_anchor);
7360 while (current_buffer) {
7361 memcpy(target_current, current_buffer->space, current_buffer->used);
7362 target_current += current_buffer->used;
7363 current_buffer = current_buffer->next;
7366 current_buffer =
DATA_PTR(buffer_anchor);
7368 mapping_buffer_free(current_buffer);
7373 str_enc_copy(target, source);
7382 const OnigUChar *source_current, *source_end;
7383 OnigUChar *target_current, *target_end;
7385 int length_or_invalid;
7387 if (old_length == 0)
return Qnil;
7391 if (source == target) {
7392 target_current = (OnigUChar*)source_current;
7393 target_end = (OnigUChar*)source_end;
7400 length_or_invalid = onigenc_ascii_only_case_map(flags,
7401 &source_current, source_end,
7402 target_current, target_end, enc);
7403 if (length_or_invalid < 0)
7405 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7406 fprintf(stderr,
"problem with rb_str_ascii_casemap"
7407 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7409 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7412 str_enc_copy(target, source);
7418upcase_single(
VALUE str)
7421 bool modified =
false;
7424 unsigned int c = *(
unsigned char*)s;
7426 if (
'a' <= c && c <=
'z') {
7427 *s =
'A' + (c -
'a');
7455rb_str_upcase_bang(
int argc,
VALUE *argv,
VALUE str)
7458 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7460 flags = check_case_options(argc, argv, flags);
7461 str_modify_keep_cr(str);
7462 enc = str_true_enc(str);
7463 if (case_option_single_p(flags, enc, str)) {
7464 if (upcase_single(str))
7465 flags |= ONIGENC_CASE_MODIFIED;
7467 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7468 rb_str_ascii_casemap(str, str, &flags, enc);
7470 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7472 if (ONIGENC_CASE_MODIFIED&flags)
return str;
7494rb_str_upcase(
int argc,
VALUE *argv,
VALUE str)
7497 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7500 flags = check_case_options(argc, argv, flags);
7501 enc = str_true_enc(str);
7502 if (case_option_single_p(flags, enc, str)) {
7504 str_enc_copy(ret, str);
7507 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7509 rb_str_ascii_casemap(str, ret, &flags, enc);
7512 ret = rb_str_casemap(str, &flags, enc);
7519downcase_single(
VALUE str)
7522 bool modified =
false;
7525 unsigned int c = *(
unsigned char*)s;
7527 if (
'A' <= c && c <=
'Z') {
7528 *s =
'a' + (c -
'A');
7557rb_str_downcase_bang(
int argc,
VALUE *argv,
VALUE str)
7560 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7562 flags = check_case_options(argc, argv, flags);
7563 str_modify_keep_cr(str);
7564 enc = str_true_enc(str);
7565 if (case_option_single_p(flags, enc, str)) {
7566 if (downcase_single(str))
7567 flags |= ONIGENC_CASE_MODIFIED;
7569 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7570 rb_str_ascii_casemap(str, str, &flags, enc);
7572 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7574 if (ONIGENC_CASE_MODIFIED&flags)
return str;
/*
 * Downcase variant that produces its result in a separate value `ret`.
 * Flags start as ONIGENC_CASE_DOWNCASE and are adjusted by
 * check_case_options().
 * NOTE(review): the lines that create `ret` and return it are not part of
 * this excerpt.
 */
7596rb_str_downcase(
int argc,
VALUE *argv,
VALUE str)
7599 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7602 flags = check_case_options(argc, argv, flags);
7603 enc = str_true_enc(str);
/* single-byte case: copy the encoding onto ret, then downcase ret byte-wise */
7604 if (case_option_single_p(flags, enc, str)) {
7606 str_enc_copy(ret, str);
7607 downcase_single(ret);
/* ASCII-only option: map from str into ret */
7609 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7611 rb_str_ascii_casemap(str, ret, &flags, enc);
/* general case: rb_str_casemap() yields the result string */
7614 ret = rb_str_casemap(str, &flags, enc);
/*
 * In-place capitalize.  The flags start as
 * ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE and are adjusted by
 * check_case_options().  Unlike the upcase/downcase variants, no single-byte
 * fast path appears in the visible lines.
 * Returns str when ONIGENC_CASE_MODIFIED is set in flags afterwards; the
 * value returned otherwise is not visible in this excerpt.
 */
7642rb_str_capitalize_bang(
int argc,
VALUE *argv,
VALUE str)
7645 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7647 flags = check_case_options(argc, argv, flags);
7648 str_modify_keep_cr(str);
7649 enc = str_true_enc(str);
/* ASCII-only option maps in place; otherwise the mapped string replaces str */
7651 if (flags&ONIGENC_CASE_ASCII_ONLY)
7652 rb_str_ascii_casemap(str, str, &flags, enc);
7654 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7656 if (ONIGENC_CASE_MODIFIED&flags)
return str;
/*
 * Capitalize variant that produces its result in a separate value `ret`.
 * Flags start as ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE and are
 * adjusted by check_case_options().
 * NOTE(review): creation and return of `ret` are not part of this excerpt.
 */
7680rb_str_capitalize(
int argc,
VALUE *argv,
VALUE str)
7683 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7686 flags = check_case_options(argc, argv, flags);
7687 enc = str_true_enc(str);
/* ASCII-only option maps from str into ret; otherwise rb_str_casemap() yields ret */
7689 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7691 rb_str_ascii_casemap(str, ret, &flags, enc);
7694 ret = rb_str_casemap(str, &flags, enc);
/*
 * In-place swapcase.  Both ONIGENC_CASE_UPCASE and ONIGENC_CASE_DOWNCASE are
 * set in the initial flags, which are then adjusted by check_case_options().
 * Returns str when ONIGENC_CASE_MODIFIED is set in flags afterwards; the
 * value returned otherwise is not visible in this excerpt.
 */
7721rb_str_swapcase_bang(
int argc,
VALUE *argv,
VALUE str)
7724 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7726 flags = check_case_options(argc, argv, flags);
7727 str_modify_keep_cr(str);
7728 enc = str_true_enc(str);
/* ASCII-only option maps in place; otherwise the mapped string replaces str */
7729 if (flags&ONIGENC_CASE_ASCII_ONLY)
7730 rb_str_ascii_casemap(str, str, &flags, enc);
7732 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7734 if (ONIGENC_CASE_MODIFIED&flags)
return str;
/*
 * Swapcase variant that produces its result in a separate value `ret`.
 * Flags start as ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE and are adjusted
 * by check_case_options().
 * NOTE(review): creation and return of `ret` are not part of this excerpt.
 */
7758rb_str_swapcase(
int argc,
VALUE *argv,
VALUE str)
7761 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7764 flags = check_case_options(argc, argv, flags);
7765 enc = str_true_enc(str);
/* ASCII-only option maps from str into ret; otherwise rb_str_casemap() yields ret */
7767 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7769 rb_str_ascii_casemap(str, ret, &flags, enc);
7772 ret = rb_str_casemap(str, &flags, enc);
7777typedef unsigned char *USTR;
7781 unsigned int now, max;
7793 if (t->p == t->pend)
return -1;
7794 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'\\' && t->p + n < t->pend) {
7797 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7799 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'-' && t->p + n < t->pend) {
7801 if (t->p < t->pend) {
7802 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7805 if (t->now < 0x80 && c < 0x80) {
7807 "invalid range \"%c-%c\" in string transliteration",
7822 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
7823 if (t->now == t->max) {
7828 if (t->now < t->max) {
7844 const unsigned int errc = -1;
7845 unsigned int trans[256];
7847 struct tr trsrc, trrepl;
7849 unsigned int c, c0, last = 0;
7850 int modify = 0, i, l;
7851 unsigned char *s, *send;
7853 int singlebyte = single_byte_optimizable(str);
7857#define CHECK_IF_ASCII(c) \
7858 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
7859 (cr = ENC_CODERANGE_VALID) : 0)
7865 return rb_str_delete_bang(1, &src, str);
7869 e1 = rb_enc_check(str, src);
7870 e2 = rb_enc_check(str, repl);
7875 enc = rb_enc_check(src, repl);
7879 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) ==
'^' &&
7880 trsrc.p + l < trsrc.pend) {
7886 trsrc.gen = trrepl.gen = 0;
7887 trsrc.now = trrepl.now = 0;
7888 trsrc.max = trrepl.max = 0;
7891 for (i=0; i<256; i++) {
7894 while ((c = trnext(&trsrc, enc)) != errc) {
7899 if (!hash) hash = rb_hash_new();
7903 while ((c = trnext(&trrepl, enc)) != errc)
7906 for (i=0; i<256; i++) {
7907 if (trans[i] != errc) {
7915 for (i=0; i<256; i++) {
7918 while ((c = trnext(&trsrc, enc)) != errc) {
7919 r = trnext(&trrepl, enc);
7920 if (r == errc) r = trrepl.now;
7923 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
7926 if (!hash) hash = rb_hash_new();
7934 str_modify_keep_cr(str);
7940 unsigned int save = -1;
7941 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
7946 c0 = c = rb_enc_codepoint_len((
char *)s, (
char *)send, &clen, e1);
7947 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7956 if (cflag) c = last;
7959 else if (cflag) c = errc;
7965 if (c != (
unsigned int)-1) {
7971 tlen = rb_enc_codelen(c, enc);
7977 if (enc != e1) may_modify = 1;
7979 if ((offset = t - buf) + tlen > max) {
7980 size_t MAYBE_UNUSED(old) = max + termlen;
7981 max = offset + tlen + (send - s);
7982 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
7986 if (may_modify && memcmp(s, t, tlen) != 0) {
7992 if (!STR_EMBED_P(str)) {
7993 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
7995 TERM_FILL((
char *)t, termlen);
7996 RSTRING(str)->as.heap.ptr = (
char *)buf;
7997 RSTRING(str)->as.heap.len = t - buf;
7998 STR_SET_NOEMBED(str);
7999 RSTRING(str)->as.heap.aux.capa = max;
8003 c = (
unsigned char)*s;
8004 if (trans[c] != errc) {
8021 long offset, max = (long)((send - s) * 1.2);
8022 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8026 c0 = c = rb_enc_codepoint_len((
char *)s, (
char *)send, &clen, e1);
8027 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8035 if (cflag) c = last;
8038 else if (cflag) c = errc;
8042 c = cflag ? last : errc;
8045 tlen = rb_enc_codelen(c, enc);
8050 if (enc != e1) may_modify = 1;
8052 if ((offset = t - buf) + tlen > max) {
8053 size_t MAYBE_UNUSED(old) = max + termlen;
8054 max = offset + tlen + (long)((send - s) * 1.2);
8055 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8060 if (may_modify && memcmp(s, t, tlen) != 0) {
8068 if (!STR_EMBED_P(str)) {
8069 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8071 TERM_FILL((
char *)t, termlen);
8072 RSTRING(str)->as.heap.ptr = (
char *)buf;
8073 RSTRING(str)->as.heap.len = t - buf;
8074 STR_SET_NOEMBED(str);
8075 RSTRING(str)->as.heap.aux.capa = max;
8081 rb_enc_associate(str, enc);
8100 return tr_trans(str, src, repl, 0);
8147 tr_trans(str, src, repl, 0);
/* Number of distinct unsigned char values; code points below this are looked up directly in the table. */
8151#define TR_TABLE_MAX (UCHAR_MAX+1)
/* Table length: one slot per byte value plus one extra slot at index TR_TABLE_MAX (used by tr_setup_table to store its cflag). */
8152#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
/*
 * Fills or narrows the character-membership table `stable` from one set
 * specification string `str`.
 *
 *  - stable[0 .. TR_TABLE_MAX-1] holds one flag per byte-sized code point;
 *    stable[TR_TABLE_MAX] holds the complement flag (cflag).
 *  - `first` is non-zero for the first specification of a call (callers pass
 *    i==0 / TRUE / FALSE).
 *  - code points that do not fit the byte table are recorded in Ruby hashes
 *    (`table` / `ptable`).
 *
 * NOTE(review): the remaining parameters (the hash out-pointers and the
 * encoding) and several statements are not part of this excerpt.
 */
8154tr_setup_table(
VALUE str,
char stable[TR_TABLE_SIZE],
int first,
8157 const unsigned int errc = -1;
8158 char buf[TR_TABLE_MAX];
8161 VALUE table = 0, ptable = 0;
8162 int i, l, cflag = 0;
8165 tr.gen =
tr.now =
tr.max = 0;
/* a leading '^' is only special when the specification is longer than one byte */
8167 if (
RSTRING_LEN(str) > 1 && rb_enc_ascget(
tr.p,
tr.pend, &l, enc) ==
'^') {
8172 for (i=0; i<TR_TABLE_MAX; i++) {
8175 stable[TR_TABLE_MAX] = cflag;
/* a previously complemented table combined with a non-complemented spec clears the stored flag */
8177 else if (stable[TR_TABLE_MAX] && !cflag) {
8178 stable[TR_TABLE_MAX] = 0;
8180 for (i=0; i<TR_TABLE_MAX; i++) {
/* walk every code point produced by the specification until trnext() reports errc */
8184 while ((c = trnext(&
tr, enc)) != errc) {
8185 if (c < TR_TABLE_MAX) {
8186 buf[(
unsigned char)c] = !cflag;
8191 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8194 table = ptable ? ptable : rb_hash_new();
8198 table = rb_hash_new();
8203 if (table && (!ptable || (cflag ^ !
NIL_P(rb_hash_aref(ptable, key))))) {
8204 rb_hash_aset(table, key,
Qtrue);
/* a byte stays selected only if it was selected before AND by this specification */
8208 for (i=0; i<TR_TABLE_MAX; i++) {
8209 stable[i] = stable[i] && buf[i];
8211 if (!table && !cflag) {
/*
 * Membership test for code point `c` against a table prepared by
 * tr_setup_table().
 *
 *  - c below TR_TABLE_MAX: answered directly from table[c].
 *  - larger c: the `del` and `nodel` hashes are consulted; if neither
 *    visible branch decides, the result falls back to the flag stored in
 *    table[TR_TABLE_MAX].
 *
 * NOTE(review): the values returned inside the two hash branches are not
 * part of this excerpt.
 */
8218tr_find(
unsigned int c,
const char table[TR_TABLE_SIZE],
VALUE del,
VALUE nodel)
8220 if (c < TR_TABLE_MAX) {
8221 return table[c] != 0;
/* present in `del` and (no `nodel` hash, or absent from it) */
8227 if (!
NIL_P(rb_hash_lookup(del, v)) &&
8228 (!nodel ||
NIL_P(rb_hash_lookup(nodel, v)))) {
/* present in `nodel` */
8232 else if (nodel && !
NIL_P(rb_hash_lookup(nodel, v))) {
8235 return table[TR_TABLE_MAX] ? TRUE : FALSE;
/*
 * In-place delete: every argument is a set specification; the sets are
 * combined into `squeez` (plus the del/nodel hashes) by repeated
 * tr_setup_table() calls, and the string is then scanned character by
 * character with tr_find() deciding membership.
 * Returns str when `modify` is set; the value returned otherwise is not
 * visible in this excerpt.
 */
8249rb_str_delete_bang(
int argc,
VALUE *argv,
VALUE str)
8251 char squeez[TR_TABLE_SIZE];
8254 VALUE del = 0, nodel = 0;
8256 int i, ascompat, cr;
8260 for (i=0; i<argc; i++) {
/* each specification must be encoding-compatible with str; only the first call passes first != 0 */
8264 enc = rb_enc_check(str, s);
8265 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8268 str_modify_keep_cr(str);
/* bytes below 0x80 in an ASCII-compatible encoding are handled without decoding */
8277 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
8288 c = rb_enc_codepoint_len(s, send, &clen, enc);
8290 if (tr_find(c, squeez, del, nodel)) {
/* re-terminate the string at the write position t */
8301 TERM_FILL(t, TERM_LEN(str));
8305 if (modify)
return str;
/*
 * Delegates to rb_str_delete_bang() with the same arguments.
 * NOTE(review): presumably str is rebound to a duplicate on a line that is
 * not part of this excerpt, so that the receiver is left unchanged -- confirm.
 */
8325rb_str_delete(
int argc,
VALUE *argv,
VALUE str)
8328 rb_str_delete_bang(argc, argv, str);
/*
 * In-place squeeze.  Optional arguments are set specifications combined via
 * tr_setup_table().  In every scanning variant below a character passes the
 * visible test when it differs from the previously seen one (`save`), or
 * when arguments were given and the character is not in the selected set.
 * Returns str when `modify` is set; the value returned otherwise is not
 * visible in this excerpt.
 */
8342rb_str_squeeze_bang(
int argc,
VALUE *argv,
VALUE str)
8344 char squeez[TR_TABLE_SIZE];
8346 VALUE del = 0, nodel = 0;
8347 unsigned char *s, *send, *t;
8349 int ascompat, singlebyte = single_byte_optimizable(str);
8353 enc = STR_ENC_GET(str);
8356 for (i=0; i<argc; i++) {
8360 enc = rb_enc_check(str, s);
8361 if (singlebyte && !single_byte_optimizable(s))
8363 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8367 str_modify_keep_cr(str);
/* byte-at-a-time scan: the byte value indexes squeez directly */
8376 unsigned int c = *s++;
8377 if (c != save || (argc > 0 && !squeez[c])) {
/* multibyte scan: ASCII bytes still use the direct table lookup */
8387 if (ascompat && (c = *s) < 0x80) {
8388 if (c != save || (argc > 0 && !squeez[c])) {
/* other characters are decoded and checked through tr_find() */
8394 c = rb_enc_codepoint_len((
char *)s, (
char *)send, &clen, enc);
8396 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8406 TERM_FILL((
char *)t, TERM_LEN(str));
8412 if (modify)
return str;
/*
 * Delegates to rb_str_squeeze_bang() with the same arguments.
 * NOTE(review): presumably str is rebound to a duplicate on a line that is
 * not part of this excerpt, so that the receiver is left unchanged -- confirm.
 */
8435rb_str_squeeze(
int argc,
VALUE *argv,
VALUE str)
8438 rb_str_squeeze_bang(argc, argv, str);
8456 return tr_trans(str, src, repl, 1);
8479 tr_trans(str, src, repl, 1);
/*
 * Counts characters of str that belong to the intersection of the set
 * specifications given in argv.
 *
 * Two paths are visible:
 *  - a shortcut that compares raw bytes of str against a single byte `c`
 *    taken from the specification (guarded, among other conditions not shown
 *    here, by ONIGENC_IS_ALLOWED_REVERSE_MATCH and !is_broken_string(str));
 *  - the general path that builds `table` with tr_setup_table() for every
 *    argument and tests each character with tr_find().
 */
8508rb_str_count(
int argc,
VALUE *argv,
VALUE str)
8510 char table[TR_TABLE_SIZE];
8512 VALUE del = 0, nodel = 0, tstr;
8522 enc = rb_enc_check(str, tstr);
8527 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (
const unsigned char *)ptstr, (
const unsigned char *)ptstr+1)) &&
8528 !is_broken_string(str)) {
8530 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
/* shortcut: plain byte comparison, n counts the hits */
8536 if (*(
unsigned char*)s++ == c) n++;
/* general path: first specification initialises the table, later ones narrow it */
8542 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8543 for (i=1; i<argc; i++) {
8546 enc = rb_enc_check(str, tstr);
8547 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
/* bytes below 0x80 in an ASCII-compatible encoding are handled without decoding */
8557 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
8565 c = rb_enc_codepoint_len(s, send, &clen, enc);
8566 if (tr_find(c, table, del, nodel)) {
/*
 * Validates a field-separator value.  A nil `val` yields 0.
 * NOTE(review): handling of non-nil values is not part of this excerpt;
 * callers in this file treat a zero result as "not acceptable".
 */
8577rb_fs_check(
VALUE val)
8581 if (
NIL_P(val))
return 0;
/*
 * Lookup table indexed by byte value: the entry is 1 for bytes 9..13
 * (the ASCII control characters TAB, LF, VT, FF, CR) and for byte 32 (space),
 * and 0 for every other value.  Each row below covers 16 byte values.
 */
8586static const char isspacetable[256] = {
8587 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
8588 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8589 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8590 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8591 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8592 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8593 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8594 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8595 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8596 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8597 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8598 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8599 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8600 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8601 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8602 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
/* Table-driven whitespace test; the cast to unsigned char keeps the index within 0..255 even for negative char values. */
8605#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
/*
 * Emits one split piece (the range beg/len of str) and returns the updated
 * count of pending empty pieces.
 *
 *  - With empty_count >= 0 an empty piece (len == 0) is not emitted right
 *    away; the counter is incremented and returned instead.
 *  - When a later piece arrives while empty_count > 0, the pending empty
 *    strings are emitted first, either pushed onto `result` or passed to
 *    rb_yield().
 *
 * NOTE(review): the condition choosing between pushing and yielding, the
 * construction of the substring and the final return are not part of this
 * excerpt.
 */
8608split_string(
VALUE result,
VALUE str,
long beg,
long len,
long empty_count)
8610 if (empty_count >= 0 && len == 0) {
8611 return empty_count + 1;
8613 if (empty_count > 0) {
/* flush pending empties into the result array */
8617 rb_ary_push(result, str_new_empty_String(str));
8618 }
while (--empty_count > 0);
/* flush pending empties to the block */
8622 rb_yield(str_new_empty_String(str));
8623 }
while (--empty_count > 0);
8628 rb_ary_push(result, str);
8637 SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
/*
 * Classifies a literal split pattern `spat`.
 *
 *  - SPLIT_TYPE_AWK when the pattern is exactly one space character: checked
 *    byte-wise (len == 1 && ptr[0] == ' ') in one branch and via
 *    rb_enc_ascget() with a full-length match (len == l) in the other.
 *  - default_type when no special case applies.
 *
 * NOTE(review): the condition guarding the SPLIT_TYPE_CHARS result is not
 * part of this excerpt (presumably an empty pattern -- confirm).
 */
8641literal_split_pattern(
VALUE spat, split_type_t default_type)
8649 return SPLIT_TYPE_CHARS;
8652 if (len == 1 && ptr[0] ==
' ') {
8653 return SPLIT_TYPE_AWK;
8658 if (rb_enc_ascget(ptr, ptr + len, &l, enc) ==
' ' && len == l) {
8659 return SPLIT_TYPE_AWK;
8662 return default_type;
/*
 * Implementation of split with an optional pattern and an optional limit
 * (rb_scan_args format "02").
 *
 * Outline of the visible logic:
 *  1. Limit handling: a limit <= 0 is turned into nil; with a nil limit and
 *     lim == 0, empty_count is set to 0, which enables the deferred handling
 *     of empty pieces in split_string().
 *  2. The pattern decides split_type: awk-style whitespace splitting, literal
 *     string, single characters, or regexp.
 *  3. Each strategy emits pieces through SPLIT_STR and stops once the limit
 *     is reached (lim <= i).
 *  4. Returns `result` when it is truthy, otherwise str.
 *
 * NOTE(review): many statements of this function are not part of this
 * excerpt; the comments below describe only what the visible lines show.
 */
8675rb_str_split_m(
int argc,
VALUE *argv,
VALUE str)
8680 split_type_t split_type;
8681 long beg, end, i = 0, empty_count = -1;
8686 if (
rb_scan_args(argc, argv,
"02", &spat, &limit) == 2) {
8688 if (lim <= 0) limit =
Qnil;
8689 else if (lim == 1) {
8701 if (
NIL_P(limit) && !lim) empty_count = 0;
8703 enc = STR_ENC_GET(str);
8704 split_type = SPLIT_TYPE_REGEXP;
8706 spat = get_pat_quoted(spat, 0);
/* no explicit pattern: fall back to the global field separator rb_fs */
8708 else if (
NIL_P(spat = rb_fs)) {
8709 split_type = SPLIT_TYPE_AWK;
8711 else if (!(spat = rb_fs_check(spat))) {
8717 if (split_type != SPLIT_TYPE_AWK) {
8722 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
8723 if (split_type == SPLIT_TYPE_AWK) {
8725 split_type = SPLIT_TYPE_STRING;
8730 mustnot_broken(spat);
8731 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
/* SPLIT_STR emits one piece and keeps empty_count up to date */
8739#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
8741 if (result) result = rb_ary_new();
8745 if (split_type == SPLIT_TYPE_AWK) {
/* ASCII-only strings are scanned byte-wise with the ascii_isspace() table */
8751 if (is_ascii_string(str)) {
8752 while (ptr < eptr) {
8753 c = (
unsigned char)*ptr++;
8755 if (ascii_isspace(c)) {
8761 if (!
NIL_P(limit) && lim <= i)
break;
8764 else if (ascii_isspace(c)) {
8765 SPLIT_STR(beg, end-beg);
8768 if (!
NIL_P(limit)) ++i;
/* other strings are decoded one code point at a time */
8776 while (ptr < eptr) {
8779 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
8788 if (!
NIL_P(limit) && lim <= i)
break;
8792 SPLIT_STR(beg, end-beg);
8795 if (!
NIL_P(limit)) ++i;
8803 else if (split_type == SPLIT_TYPE_STRING) {
8804 char *str_start = ptr;
8805 char *substr_start = ptr;
8809 mustnot_broken(str);
8810 enc = rb_enc_check(str, spat);
/* literal separator: repeated rb_memsearch() from the current position */
8811 while (ptr < eptr &&
8812 (end =
rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
8815 if (t != ptr + end) {
8819 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
8822 if (!
NIL_P(limit) && lim <= ++i)
break;
8824 beg = ptr - str_start;
8826 else if (split_type == SPLIT_TYPE_CHARS) {
8827 char *str_start = ptr;
8830 mustnot_broken(str);
8831 enc = rb_enc_get(str);
/* one piece per character, n being the character's byte length */
8832 while (ptr < eptr &&
8833 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
8834 SPLIT_STR(ptr - str_start, n);
8836 if (!
NIL_P(limit) && lim <= ++i)
break;
8838 beg = ptr - str_start;
8849 (match ? (rb_match_unbusy(match),
rb_backref_set(match)) : (void)0)) {
/* start == end with BEG(0) == END(0): special handling for a zero-width match at the current position */
8854 if (start == end && BEG(0) == END(0)) {
8859 else if (last_null == 1) {
8860 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
8867 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
8873 SPLIT_STR(beg, end-beg);
8874 beg = start = END(0);
/* capture groups that took part in the match are emitted as pieces too */
8878 for (idx=1; idx < regs->num_regs; idx++) {
8879 if (BEG(idx) == -1)
continue;
8880 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
8882 if (!
NIL_P(limit) && lim <= ++i)
break;
8884 if (match) rb_match_unbusy(match);
8890 return result ? result : str;
8900 return rb_str_split_m(1, &sep, str);
/* Evaluates to a new array with capacity `size` when no block is given, otherwise to 0; the `m` argument is not used by the expansion. */
8903#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
8909 rb_ary_push(ary, e);
8918#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
/*
 * Adjusts the end pointer `e` of the range starting at `p` for a trailing
 * line terminator in encoding `enc`.  The visible line checks whether the
 * character at `prev` is a carriage return.
 * NOTE(review): how `prev` is obtained and what is returned is not part of
 * this excerpt; callers assign the result to their end pointer.
 */
8921chomp_newline(
const char *p,
const char *e,
rb_encoding *enc)
8927 if (prev && rb_enc_ascget(prev, e, NULL, enc) ==
'\r')
8946#define rb_rs get_rs()
8953 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
8954 long pos, len, rslen;
8960 static ID keywords[1];
8965 chomp = (!UNDEF_P(chomp) &&
RTEST(chomp));
8969 if (!ENUM_ELEM(ary, str)) {
8986 enc = rb_enc_get(str);
8988 enc = rb_enc_check(str, rs);
8993 const char *eol = NULL;
8995 while (subend < pend) {
8996 long chomp_rslen = 0;
8998 if (rb_enc_ascget(subend, pend, &n, enc) !=
'\r')
9000 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9002 if (eol == subend)
break;
9006 chomp_rslen = -rslen;
9010 if (!subptr) subptr = subend;
9014 }
while (subend < pend);
9016 if (rslen == 0) chomp_rslen = 0;
9018 subend - subptr + (chomp ? chomp_rslen : rslen));
9019 if (ENUM_ELEM(ary, line)) {
9020 str_mod_check(str, ptr, len);
9022 subptr = eol = NULL;
9041 while (subptr < pend) {
9042 pos =
rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9046 if (hit != adjusted) {
9050 subend = hit += rslen;
9053 subend = chomp_newline(subptr, subend, enc);
9060 if (ENUM_ELEM(ary, line)) {
9061 str_mod_check(str, ptr, len);
9066 if (subptr != pend) {
9069 pend = chomp_newline(subptr, pend, enc);
9071 else if (pend - subptr >= rslen &&
9072 memcmp(pend - rslen, rsptr, rslen) == 0) {
9077 ENUM_ELEM(ary, line);
/* Line iteration entry point: forwards to rb_str_enumerate_lines() with 0 as the array argument (no result array). */
9098rb_str_each_line(
int argc,
VALUE *argv,
VALUE str)
9101 return rb_str_enumerate_lines(argc, argv, str, 0);
/*
 * Line collection entry point: `ary` comes from WANTARRAY (a fresh array
 * when no block is given, 0 otherwise) and is handed to
 * rb_str_enumerate_lines() together with the caller's arguments.
 */
9114rb_str_lines(
int argc,
VALUE *argv,
VALUE str)
9116 VALUE ary = WANTARRAY(
"lines", 0);
9117 return rb_str_enumerate_lines(argc, argv, str, ary);
/* Byte iteration entry point: forwards to rb_str_enumerate_bytes() with 0 as the array argument (no result array). */
9150rb_str_each_byte(
VALUE str)
9153 return rb_str_enumerate_bytes(str, 0);
/*
 * Byte collection entry point: forwards to rb_str_enumerate_bytes() with
 * `ary`.  NOTE(review): the declaration of `ary` is not part of this excerpt.
 */
9165rb_str_bytes(
VALUE str)
9168 return rb_str_enumerate_bytes(str, ary);
9188 enc = rb_enc_get(str);
9191 for (i = 0; i < len; i += n) {
9192 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9197 for (i = 0; i < len; i += n) {
9198 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
/* Character iteration entry point: forwards to rb_str_enumerate_chars() with 0 as the array argument (no result array). */
9219rb_str_each_char(
VALUE str)
9222 return rb_str_enumerate_chars(str, 0);
/*
 * Character collection entry point: forwards to rb_str_enumerate_chars() with
 * `ary`.  NOTE(review): the declaration of `ary` is not part of this excerpt.
 */
9234rb_str_chars(
VALUE str)
9237 return rb_str_enumerate_chars(str, ary);
/*
 * Enumerates the code points of str, using `ary` as the optional result
 * array.  Strings for which single_byte_optimizable() holds are handed to the
 * byte enumerator instead; otherwise code points are decoded with
 * rb_enc_codepoint_len(), which also reports the byte length in n.
 * NOTE(review): the loop and the emission of each code point are not part of
 * this excerpt.
 */
9241rb_str_enumerate_codepoints(
VALUE str,
VALUE ary)
9246 const char *ptr, *end;
9249 if (single_byte_optimizable(str))
9250 return rb_str_enumerate_bytes(str, ary);
9255 enc = STR_ENC_GET(str);
9258 c = rb_enc_codepoint_len(ptr, end, &n, enc);
/* Code point iteration entry point: forwards to rb_str_enumerate_codepoints() with 0 as the array argument (no result array). */
9279rb_str_each_codepoint(
VALUE str)
9282 return rb_str_enumerate_codepoints(str, 0);
/*
 * Code point collection entry point: forwards to
 * rb_str_enumerate_codepoints() with `ary`.
 * NOTE(review): the declaration of `ary` is not part of this excerpt.
 */
9294rb_str_codepoints(
VALUE str)
9297 return rb_str_enumerate_codepoints(str, ary);
/*
 * Body of the grapheme-cluster regexp builder (the signature is outside this
 * excerpt; callers invoke it as get_reg_grapheme_cluster(enc)).
 *
 * Compiles the pattern \X for encoding `enc` and returns the compiled regex.
 * The pattern bytes default to the ASCII spelling; for the UTF-16/UTF-32
 * encodings the two characters are spelled out in the matching code-unit
 * width and byte order through the CHARS_* helper macros.
 * A compile failure is reported through rb_fatal().
 */
9303 int encidx = rb_enc_to_index(enc);
9305 const OnigUChar source_ascii[] =
"\\X";
9306 const OnigUChar *source = source_ascii;
/* sizeof - 1: the terminating NUL of the literal is not part of the pattern */
9307 size_t source_len =
sizeof(source_ascii) - 1;
/* byte sequences of one character in 16/32-bit big- and little-endian form */
9310#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9311#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9312#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9313#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9314#define CASE_UTF(e) \
9315 case ENCINDEX_UTF_##e: { \
9316 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9317 source = source_UTF_##e; \
9318 source_len = sizeof(source_UTF_##e); \
9321 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9329 regex_t *reg_grapheme_cluster;
/* onig_new() stores the compiled regex through its first argument, so the
 * address of the local pointer is passed (the excerpt had "&reg" collapsed
 * into a single (R) sign by an HTML-entity decoding accident). */
9331 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9332 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9334 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9335 onig_error_code_to_str(message, r, &einfo);
9336 rb_fatal(
"cannot compile grapheme cluster regexp: %s", (
char *)message);
9339 return reg_grapheme_cluster;
9345 int encidx = rb_enc_to_index(enc);
9346 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9348 if (encidx == rb_utf8_encindex()) {
9349 if (!reg_grapheme_cluster_utf8) {
9350 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9353 return reg_grapheme_cluster_utf8;
9362 size_t grapheme_cluster_count = 0;
9364 const char *ptr, *end;
9366 if (!rb_enc_unicode_p(enc)) {
9370 bool cached_reg_grapheme_cluster =
true;
9371 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9372 if (!reg_grapheme_cluster) {
9373 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9374 cached_reg_grapheme_cluster =
false;
9381 OnigPosition len = onig_match(reg_grapheme_cluster,
9382 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
9383 (
const OnigUChar *)ptr, NULL, 0);
9384 if (len <= 0)
break;
9385 grapheme_cluster_count++;
9389 if (!cached_reg_grapheme_cluster) {
9390 onig_free(reg_grapheme_cluster);
9393 return SIZET2NUM(grapheme_cluster_count);
/*
 * Enumerates the grapheme clusters of str, using `ary` as the optional
 * result array.
 *
 *  - For encodings that are not Unicode (rb_enc_unicode_p() false) the
 *    character enumerator is used instead.
 *  - Otherwise a compiled regex is obtained: the cached one if
 *    get_cached_reg_grapheme_cluster() returns non-NULL, else a freshly
 *    compiled one that is released with onig_free() at the end.
 *  - onig_match() is applied at the current position; a length <= 0 ends the
 *    iteration.
 *
 * NOTE(review): the loop header, the emission of each cluster and the return
 * are not part of this excerpt.
 */
9397rb_str_enumerate_grapheme_clusters(
VALUE str,
VALUE ary)
9401 const char *ptr0, *ptr, *end;
9403 if (!rb_enc_unicode_p(enc)) {
9404 return rb_str_enumerate_chars(str, ary);
/* remember whether the regex is shared (cached) or owned by this call */
9409 bool cached_reg_grapheme_cluster =
true;
9410 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9411 if (!reg_grapheme_cluster) {
9412 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9413 cached_reg_grapheme_cluster =
false;
9420 OnigPosition len = onig_match(reg_grapheme_cluster,
9421 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
9422 (
const OnigUChar *)ptr, NULL, 0);
9423 if (len <= 0)
break;
/* only a regex compiled by this call is freed */
9428 if (!cached_reg_grapheme_cluster) {
9429 onig_free(reg_grapheme_cluster);
/* Grapheme cluster iteration entry point: forwards to rb_str_enumerate_grapheme_clusters() with 0 as the array argument (no result array). */
9449rb_str_each_grapheme_cluster(
VALUE str)
9452 return rb_str_enumerate_grapheme_clusters(str, 0);
/*
 * Grapheme cluster collection entry point: forwards to
 * rb_str_enumerate_grapheme_clusters() with `ary`.
 * NOTE(review): the declaration of `ary` is not part of this excerpt.
 */
9464rb_str_grapheme_clusters(
VALUE str)
9467 return rb_str_enumerate_grapheme_clusters(str, ary);
/*
 * Computes the byte length str would have after removing its last character.
 * An empty range (beg >= end) yields 0.  When the last character is a
 * newline and the character before it is a carriage return, the position is
 * moved back to the carriage return so that both are dropped together.
 * NOTE(review): how p and p2 are located and the final return are not part
 * of this excerpt.
 */
9471chopped_length(
VALUE str)
9474 const char *p, *p2, *beg, *end;
9478 if (beg >= end)
return 0;
9481 if (p > beg && rb_enc_ascget(p, end, 0, enc) ==
'\n') {
9483 if (p2 && rb_enc_ascget(p2, end, 0, enc) ==
'\r') p = p2;
/*
 * In-place chop: prepares str for modification, asks chopped_length() for
 * the new length and applies it with STR_SET_LEN.
 * NOTE(review): any guard around these statements, re-termination and the
 * return value are not part of this excerpt.
 */
9499rb_str_chop_bang(
VALUE str)
9501 str_modify_keep_cr(str);
9504 len = chopped_length(str);
9505 STR_SET_LEN(str, len);
9525rb_str_chop(
VALUE str)
/*
 * Default chomp behaviour for the range p..e of str: the visible lines look
 * for a carriage return, once through rb_enc_ascget() on `pp` and once by a
 * plain byte comparison on the byte before the already decremented `e`.
 * NOTE(review): the surrounding newline checks and the returned length are
 * not part of this excerpt.
 */
9531smart_chomp(
VALUE str,
const char *e,
const char *p)
9542 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
9550 if (--e > p && *(e-1) ==
'\r') {
9567 char *pp, *e, *rsptr;
9572 if (len == 0)
return 0;
9575 return smart_chomp(str, e, p);
9578 enc = rb_enc_get(str);
9589 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
9596 while (e > p && *(e-1) ==
'\n') {
9598 if (e > p && *(e-1) ==
'\r')
9604 if (rslen > len)
return len;
9606 enc = rb_enc_get(rs);
9607 newline = rsptr[rslen-1];
9610 if (newline ==
'\n')
9611 return smart_chomp(str, e, p);
9615 return smart_chomp(str, e, p);
9619 enc = rb_enc_check(str, rs);
9620 if (is_broken_string(rs)) {
9624 if (p[len-1] == newline &&
9626 memcmp(rsptr, pp, rslen) == 0)) {
9640chomp_rs(
int argc,
const VALUE *argv)
9657 long len = chompped_length(str, rs);
9658 if (len >= olen)
return Qnil;
9659 str_modify_keep_cr(str);
9660 STR_SET_LEN(str, len);
/*
 * In-place chomp: checks that str may be modified, resolves the record
 * separator from the arguments with chomp_rs() and delegates to
 * rb_str_chomp_string().
 * NOTE(review): statements between these lines (if any) are not part of this
 * excerpt.
 */
9678rb_str_chomp_bang(
int argc,
VALUE *argv,
VALUE str)
9681 str_modifiable(str);
9683 rs = chomp_rs(argc, argv);
9685 return rb_str_chomp_string(str, rs);
/*
 * Non-destructive chomp: the record separator is resolved from the arguments
 * with chomp_rs().
 * NOTE(review): construction of the result is not part of this excerpt.
 */
9698rb_str_chomp(
int argc,
VALUE *argv,
VALUE str)
9700 VALUE rs = chomp_rs(argc, argv);
9708 const char *
const start = s;
9710 if (!s || s >= e)
return 0;
9713 if (single_byte_optimizable(str)) {
9714 while (s < e && (*s ==
'\0' || ascii_isspace(*s))) s++;
9719 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
/*
 * In-place left strip: lstrip_offset() reports how many leading bytes to
 * drop; the remaining bytes are moved to the front of the buffer and the
 * length is reduced accordingly.
 * NOTE(review): the guard on loffset, re-termination and the return values
 * are not part of this excerpt.
 */
9739rb_str_lstrip_bang(
VALUE str)
9745 str_modify_keep_cr(str);
9746 enc = STR_ENC_GET(str);
9748 loffset = lstrip_offset(str, start, start+olen, enc);
9750 long len = olen-loffset;
9751 s = start + loffset;
/* memmove, not memcpy: source and destination overlap within the same buffer */
9752 memmove(start, s, len);
9753 STR_SET_LEN(str, len);
/*
 * Non-destructive left strip.  When lstrip_offset() reports nothing to drop
 * (loffset <= 0) a plain duplicate of str is returned.
 * NOTE(review): construction of the stripped result is not part of this
 * excerpt.
 */
9777rb_str_lstrip(
VALUE str)
9782 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
9783 if (loffset <= 0)
return str_duplicate(
rb_cString, str);
9792 rb_str_check_dummy_enc(enc);
9796 if (!s || s >= e)
return 0;
9800 if (single_byte_optimizable(str)) {
9802 while (s < t && ((c = *(t-1)) ==
'\0' || ascii_isspace(c))) t--;
/*
 * In-place right strip: rstrip_offset() reports how many trailing bytes to
 * drop and the length is reduced by that amount.
 * NOTE(review): the guard on roffset, re-termination and the return values
 * are not part of this excerpt.
 */
9827rb_str_rstrip_bang(
VALUE str)
9833 str_modify_keep_cr(str);
9834 enc = STR_ENC_GET(str);
9836 roffset = rstrip_offset(str, start, start+olen, enc);
9838 long len = olen - roffset;
9840 STR_SET_LEN(str, len);
/*
 * Non-destructive right strip.  When rstrip_offset() reports nothing to drop
 * (roffset <= 0) a plain duplicate of str is returned.
 * NOTE(review): construction of the stripped result is not part of this
 * excerpt.
 */
9864rb_str_rstrip(
VALUE str)
9870 enc = STR_ENC_GET(str);
9872 roffset = rstrip_offset(str, start, start+olen, enc);
9874 if (roffset <= 0)
return str_duplicate(
rb_cString, str);
/*
 * In-place strip on both ends.  The leading offset is computed first; the
 * trailing offset is then computed on the range that starts after the
 * leading part.  If either offset is positive the kept bytes are moved to
 * the front and the length is updated.
 * NOTE(review): adjustments of `len` between the visible lines, the guard
 * around memmove and the return values are not part of this excerpt.
 */
9890rb_str_strip_bang(
VALUE str)
9893 long olen, loffset, roffset;
9896 str_modify_keep_cr(str);
9897 enc = STR_ENC_GET(str);
9899 loffset = lstrip_offset(str, start, start+olen, enc);
9900 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9902 if (loffset > 0 || roffset > 0) {
9903 long len = olen-roffset;
/* memmove, not memcpy: source and destination overlap within the same buffer */
9906 memmove(start, start + loffset, len);
9908 STR_SET_LEN(str, len);
/*
 * Non-destructive strip on both ends.  When neither offset is positive a
 * plain duplicate of str is returned.
 * NOTE(review): construction of the stripped result is not part of this
 * excerpt.
 */
9932rb_str_strip(
VALUE str)
9935 long olen, loffset, roffset;
9939 loffset = lstrip_offset(str, start, start+olen, enc);
9940 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9942 if (loffset <= 0 && roffset <= 0)
return str_duplicate(
rb_cString, str);
/*
 * Performs one scan step: searches `pat` in `str` from *start via
 * rb_pat_search() (set_backref_str is passed through unchanged) and stores
 * the next search position in *start.
 *
 *  - One visible update of *start adds the byte length of the character at
 *    `end` to `end`.
 *  - A match without capture groups (no regs, or num_regs == 1) is handled
 *    separately from a match with groups, for which an array `result` is
 *    filled with one element per group.
 *
 * NOTE(review): the handling of a failed search, the construction of each
 * element and the return statements are not part of this excerpt.
 */
9947scan_once(
VALUE str,
VALUE pat,
long *start,
int set_backref_str)
9949 VALUE result, match;
9952 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
9970 *start = end + rb_enc_fast_mbclen(
RSTRING_PTR(str) + end,
9978 if (!regs || regs->num_regs == 1) {
9983 for (i=1; i < regs->num_regs; i++) {
9988 rb_ary_push(result, s);
10041 long last = -1, prev = 0;
10044 pat = get_pat_quoted(pat, 1);
10045 mustnot_broken(str);
10047 VALUE ary = rb_ary_new();
10049 while (!
NIL_P(result = scan_once(str, pat, &start, 0))) {
10052 rb_ary_push(ary, result);
10054 if (last >= 0) rb_pat_search(pat, str, last, 1);
10059 while (!
NIL_P(result = scan_once(str, pat, &start, 1))) {
10063 str_mod_check(str, p, len);
10065 if (last >= 0) rb_pat_search(pat, str, last, 1);
/* Converts str to an integer via rb_str_to_inum() with base 16 and the third argument FALSE. */
10089rb_str_hex(
VALUE str)
10091 return rb_str_to_inum(str, 16, FALSE);
/*
 * Converts str to an integer via rb_str_to_inum() with base argument -8 and
 * the third argument FALSE.
 * NOTE(review): the negative base presumably means "default to octal but
 * honour an explicit radix prefix" -- confirm against rb_str_to_inum().
 */
10116rb_str_oct(
VALUE str)
10118 return rb_str_to_inum(str, -8, FALSE);
10121#ifndef HAVE_CRYPT_R
10126 rb_nativethread_lock_t lock;
10127} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10130crypt_mutex_initialize(
void)
10201# define CRYPT_END() ALLOCV_END(databuf)
10203 extern char *crypt(
const char *,
const char *);
10204# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10207 const char *s, *saltp;
10210 char salt_8bit_clean[3];
10214 mustnot_wchar(str);
10215 mustnot_wchar(salt);
10218 if (
RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10223 if (!
ISASCII((
unsigned char)saltp[0]) || !
ISASCII((
unsigned char)saltp[1])) {
10224 salt_8bit_clean[0] = saltp[0] & 0x7f;
10225 salt_8bit_clean[1] = saltp[1] & 0x7f;
10226 salt_8bit_clean[2] =
'\0';
10227 saltp = salt_8bit_clean;
10232# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10233 data->initialized = 0;
10235 res = crypt_r(s, saltp, data);
10237 crypt_mutex_initialize();
10239 res = crypt(s, saltp);
10280 char *ptr, *p, *pend;
10283 unsigned long sum0 = 0;
10295 str_mod_check(str, ptr, len);
10298 sum0 += (
unsigned char)*p;
10309 if (bits < (
int)
sizeof(
long)*CHAR_BIT) {
10310 sum0 &= (((
unsigned long)1)<<bits)-1;
/*
 * Shared implementation of ljust/rjust/center.  `jflag` selects the mode:
 * 'l' puts all padding on the right (left padding 0), 'r' puts all padding
 * on the left, any other value splits it (left part n/2).
 *
 *  - The pad string defaults to a single space (f = " ", flen = fclen = 1);
 *    a custom pad must be encoding-compatible with str.
 *  - If width is negative or not larger than the character length of str, a
 *    duplicate of str is returned.
 *  - llen2/rlen2 are the byte lengths of the partial pad repetitions on the
 *    left and right side.
 *  - The total byte length is checked step by step against LONG_MAX before
 *    the result string is allocated.
 *
 * NOTE(review): argument parsing, the error raised on overflow or on a
 * zero-length pad, and the copy of str itself are not part of this excerpt.
 */
10330rb_str_justify(
int argc,
VALUE *argv,
VALUE str,
char jflag)
10334 long width, len, flen = 1, fclen = 1;
10337 const char *f =
" ";
10338 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10340 int singlebyte = 1, cr;
10344 enc = STR_ENC_GET(str);
10349 enc = rb_enc_check(str, pad);
10352 fclen = str_strlen(pad, enc);
10353 singlebyte = single_byte_optimizable(pad);
10354 if (flen == 0 || fclen == 0) {
10358 len = str_strlen(str, enc);
10359 if (width < 0 || len >= width)
return str_duplicate(
rb_cString, str);
10361 llen = (jflag ==
'l') ? 0 : ((jflag ==
'r') ? n : n/2);
/* byte offsets of the trailing partial pad on each side */
10365 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10366 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
/* overflow guard: every step is compared with LONG_MAX before it is applied */
10369 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10370 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10371 (len += llen2 + rlen2) >= LONG_MAX - size) {
10375 res = str_new0(
rb_cString, 0, len, termlen);
/* left padding: memset for the single-byte case, otherwise whole pads plus a partial one */
10378 memset(p, *f, llen);
10382 while (llen >= fclen) {
10388 memcpy(p, f, llen2);
/* right padding, same scheme */
10395 memset(p, *f, rlen);
10399 while (rlen >= fclen) {
10405 memcpy(p, f, rlen2);
10409 TERM_FILL(p, termlen);
10411 rb_enc_associate(res, enc);
/* Left-justify: forwards to rb_str_justify() with mode 'l'. */
10433rb_str_ljust(
int argc,
VALUE *argv,
VALUE str)
10435 return rb_str_justify(argc, argv, str,
'l');
/* Right-justify: forwards to rb_str_justify() with mode 'r'. */
10449rb_str_rjust(
int argc,
VALUE *argv,
VALUE str)
10451 return rb_str_justify(argc, argv, str,
'r');
/* Center: forwards to rb_str_justify() with mode 'c' (neither 'l' nor 'r', so the padding is split between both sides). */
10466rb_str_center(
int argc,
VALUE *argv,
VALUE str)
10468 return rb_str_justify(argc, argv, str,
'c');
10484 sep = get_pat_quoted(sep, 0);
10496 pos = rb_str_index(str, sep, 0);
10497 if (pos < 0)
goto failed;
10505 return rb_ary_new3(3, str_duplicate(
rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
10521 sep = get_pat_quoted(sep, 0);
10534 pos = rb_str_rindex(str, sep, pos);
10546 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(
rb_cString, str));
/*
 * Tests str against every candidate prefix in argv.  Two kinds of candidate
 * are visible: one checked with rb_reg_start_with_p() (a regexp), and one
 * whose encoding compatibility with str is verified by rb_enc_check().
 * NOTE(review): the type dispatch, the byte comparison and the return values
 * are not part of this excerpt.
 */
10558rb_str_start_with(
int argc,
VALUE *argv,
VALUE str)
10562 for (i=0; i<argc; i++) {
10563 VALUE tmp = argv[i];
10565 if (rb_reg_start_with_p(tmp, str))
10570 rb_enc_check(str, tmp);
/*
 * Tests str against every candidate suffix in argv; each candidate must be
 * encoding-compatible with str (rb_enc_check()).
 * NOTE(review): the comparison itself and the return values are not part of
 * this excerpt.
 */
10588rb_str_end_with(
int argc,
VALUE *argv,
VALUE str)
10594 for (i=0; i<argc; i++) {
10595 VALUE tmp = argv[i];
10598 enc = rb_enc_check(str, tmp);
/*
 * Returns the number of leading bytes of str to remove for `prefix`, or 0
 * when nothing should be removed.  0 is returned when:
 *  - prefix is a broken string,
 *  - prefix is empty (prefixlen <= 0),
 *  - str is shorter than prefix,
 *  - the leading bytes of str differ from prefix (memcmp).
 * Incompatible encodings are rejected by rb_enc_check().
 * NOTE(review): the successful return and any further validity checks are
 * not part of this excerpt.
 */
10622deleted_prefix_length(
VALUE str,
VALUE prefix)
10624 char *strptr, *prefixptr;
10625 long olen, prefixlen;
10628 if (is_broken_string(prefix))
return 0;
10629 rb_enc_check(str, prefix);
10633 if (prefixlen <= 0)
return 0;
10635 if (olen < prefixlen)
return 0;
10638 if (memcmp(strptr, prefixptr, prefixlen) != 0)
return 0;
/*
 * In-place prefix removal.  Returns nil when deleted_prefix_length() reports
 * nothing to remove.
 * NOTE(review): the actual removal and the successful return are not part of
 * this excerpt.
 */
10653rb_str_delete_prefix_bang(
VALUE str,
VALUE prefix)
10656 str_modify_keep_cr(str);
10658 prefixlen = deleted_prefix_length(str, prefix);
10659 if (prefixlen <= 0)
return Qnil;
/*
 * Non-destructive prefix removal.  Returns a duplicate of str when
 * deleted_prefix_length() reports nothing to remove.
 * NOTE(review): construction of the shortened result is not part of this
 * excerpt.
 */
10673rb_str_delete_prefix(
VALUE str,
VALUE prefix)
10677 prefixlen = deleted_prefix_length(str, prefix);
10678 if (prefixlen <= 0)
return str_duplicate(
rb_cString, str);
/*
 * Returns the number of trailing bytes of str to remove for `suffix`, or 0
 * when nothing should be removed.  0 is returned when:
 *  - suffix is a broken string,
 *  - suffix is empty (suffixlen <= 0),
 *  - str is shorter than suffix,
 *  - the trailing bytes of str differ from suffix (memcmp at
 *    strptr + olen - suffixlen).
 * Incompatible encodings are rejected by rb_enc_check().
 * NOTE(review): the successful return and any character-boundary check are
 * not part of this excerpt.
 */
10693deleted_suffix_length(
VALUE str,
VALUE suffix)
10695 char *strptr, *suffixptr, *s;
10696 long olen, suffixlen;
10700 if (is_broken_string(suffix))
return 0;
10701 enc = rb_enc_check(str, suffix);
10705 if (suffixlen <= 0)
return 0;
10707 if (olen < suffixlen)
return 0;
10710 s = strptr + olen - suffixlen;
10711 if (memcmp(s, suffixptr, suffixlen) != 0)
return 0;
/*
 * In-place suffix removal.  Returns nil when deleted_suffix_length() reports
 * nothing to remove; otherwise the length is reduced by the suffix length
 * and the string is re-terminated at the new end.
 * str_modifiable() is checked up front, while str_modify_keep_cr() is only
 * called once it is known that a change will happen.
 * NOTE(review): the successful return is not part of this excerpt.
 */
10727rb_str_delete_suffix_bang(
VALUE str,
VALUE suffix)
10729 long olen, suffixlen, len;
10730 str_modifiable(str);
10732 suffixlen = deleted_suffix_length(str, suffix);
10733 if (suffixlen <= 0)
return Qnil;
10736 str_modify_keep_cr(str);
10737 len = olen - suffixlen;
10738 STR_SET_LEN(str, len);
10739 TERM_FILL(&
RSTRING_PTR(str)[len], TERM_LEN(str));
/*
 * Non-destructive suffix removal.  Returns a duplicate of str when
 * deleted_suffix_length() reports nothing to remove.
 * NOTE(review): construction of the shortened result is not part of this
 * excerpt.
 */
10755rb_str_delete_suffix(
VALUE str,
VALUE suffix)
10759 suffixlen = deleted_suffix_length(str, suffix);
10760 if (suffixlen <= 0)
return str_duplicate(
rb_cString, str);
10777 val = rb_fs_check(val);
10780 "value of %"PRIsVALUE
" must be String or Regexp",
10784 rb_warn_deprecated(
"`$;'", NULL);
10801 str_modifiable(str);
10802 rb_enc_associate(str, rb_to_encoding(enc));
10819 if (
FL_TEST(str, STR_NOEMBED)) {
10825 str_replace_shared_without_enc(str2, str);
10860rb_str_valid_encoding_p(
VALUE str)
10880rb_str_is_ascii_only_p(
VALUE str)
10890 static const char ellipsis[] =
"...";
10891 const long ellipsislen =
sizeof(ellipsis) - 1;
10894 const char *
const p =
RSTRING_PTR(str), *e = p + blen;
10895 VALUE estr, ret = 0;
10899 (e =
rb_enc_nth(p, e, len, enc)) - p == blen) {
10902 else if (len <= ellipsislen ||
10906 rb_enc_associate(ret, enc);
10918 rb_enc_from_encoding(enc), 0,
Qnil);
10956 if (enc == STR_ENC_GET(str)) {
10961 return enc_str_scrub(enc, str, repl, cr);
10969 const char *rep, *p, *e, *p1, *sp;
10982 if (!
NIL_P(repl)) {
10983 repl = str_compat_and_valid(repl, enc);
10986 if (rb_enc_dummy_p(enc)) {
10989 encidx = rb_enc_to_index(enc);
10991#define DEFAULT_REPLACE_CHAR(str) do { \
10992 static const char replace[sizeof(str)-1] = str; \
10993 rep = replace; replen = (int)sizeof(replace); \
11008 else if (!
NIL_P(repl)) {
11013 else if (encidx == rb_utf8_encindex()) {
11014 DEFAULT_REPLACE_CHAR(
"\xEF\xBF\xBD");
11018 DEFAULT_REPLACE_CHAR(
"?");
11023 p = search_nonascii(p, e);
11028 int ret = rb_enc_precise_mbclen(p, e, enc);
11047 if (e - p < clen) clen = e - p;
11054 for (; clen > 1; clen--) {
11055 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11067 str_mod_check(str, sp, slen);
11068 repl = str_compat_and_valid(repl, enc);
11075 p = search_nonascii(p, e);
11102 str_mod_check(str, sp, slen);
11103 repl = str_compat_and_valid(repl, enc);
11116 else if (!
NIL_P(repl)) {
11120 else if (encidx == ENCINDEX_UTF_16BE) {
11121 DEFAULT_REPLACE_CHAR(
"\xFF\xFD");
11123 else if (encidx == ENCINDEX_UTF_16LE) {
11124 DEFAULT_REPLACE_CHAR(
"\xFD\xFF");
11126 else if (encidx == ENCINDEX_UTF_32BE) {
11127 DEFAULT_REPLACE_CHAR(
"\x00\x00\xFF\xFD");
11129 else if (encidx == ENCINDEX_UTF_32LE) {
11130 DEFAULT_REPLACE_CHAR(
"\xFD\xFF\x00\x00");
11133 DEFAULT_REPLACE_CHAR(
"?");
11137 int ret = rb_enc_precise_mbclen(p, e, enc);
11150 if (e - p < clen) clen = e - p;
11151 if (clen <= mbminlen * 2) {
11156 for (; clen > mbminlen; clen-=mbminlen) {
11157 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11168 str_mod_check(str, sp, slen);
11169 repl = str_compat_and_valid(repl, enc);
11195 str_mod_check(str, sp, slen);
11196 repl = str_compat_and_valid(repl, enc);
11232str_scrub_bang(
int argc,
VALUE *argv,
VALUE str)
11240static ID id_normalize;
11241static ID id_normalized_p;
11242static VALUE mUnicodeNormalize;
/*
 * Common dispatcher for the Unicode normalization methods: calls method `id`
 * on mUnicodeNormalize with argc+1 arguments taken from `argv2`.
 * The Ruby library "unicode_normalize/normalize.rb" is required lazily on
 * the first call; a function-local static flag records that this happened.
 * NOTE(review): `argv2` is built on lines that are not part of this excerpt
 * (presumably str followed by the caller's arguments -- confirm).
 */
11245unicode_normalize_common(
int argc,
VALUE *argv,
VALUE str,
ID id)
11247 static int UnicodeNormalizeRequired = 0;
11250 if (!UnicodeNormalizeRequired) {
11251 rb_require(
"unicode_normalize/normalize.rb");
11252 UnicodeNormalizeRequired = 1;
11256 return rb_funcallv(mUnicodeNormalize,
id, argc+1, argv2);
11293rb_str_unicode_normalize(
int argc,
VALUE *argv,
VALUE str)
11295 return unicode_normalize_common(argc, argv, str, id_normalize);
11309rb_str_unicode_normalize_bang(
int argc,
VALUE *argv,
VALUE str)
11311 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11338rb_str_unicode_normalized_p(
int argc,
VALUE *argv,
VALUE str)
11340 return unicode_normalize_common(argc, argv, str, id_normalized_p);
11475#define sym_equal rb_obj_equal
11478sym_printable(
const char *s,
const char *send,
rb_encoding *enc)
11482 int c = rb_enc_precise_mbclen(s, send, enc);
11494rb_str_symname_p(
VALUE sym)
11499 rb_encoding *resenc = rb_default_internal_encoding();
11501 if (resenc == NULL) resenc = rb_default_external_encoding();
11502 enc = STR_ENC_GET(sym);
11505 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (
long)strlen(ptr) ||
11513rb_str_quote_unprintable(
VALUE str)
11521 resenc = rb_default_internal_encoding();
11522 if (resenc == NULL) resenc = rb_default_external_encoding();
11523 enc = STR_ENC_GET(str);
11526 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
11527 !sym_printable(ptr, ptr + len, enc)) {
11528 return rb_str_escape(str);
11533MJIT_FUNC_EXPORTED
VALUE
11534rb_id_quote_unprintable(
ID id)
11536 VALUE str = rb_id2str(
id);
11537 if (!rb_str_symname_p(str)) {
11538 return rb_str_escape(str);
11556sym_inspect(
VALUE sym)
11563 if (!rb_str_symname_p(str)) {
11568 memmove(dest + 1, dest, len);
11575 memcpy(dest + 1, ptr, len);
11600MJIT_FUNC_EXPORTED
VALUE
11601rb_sym_proc_call(
ID mid,
int argc,
const VALUE *argv,
int kw_splat,
VALUE passed_proc)
11705 return rb_str_match(
rb_sym2str(sym), other);
11720sym_match_m(
int argc,
VALUE *argv,
VALUE sym)
11722 return rb_str_match_m(argc, argv,
rb_sym2str(sym));
11735sym_match_m_p(
int argc,
VALUE *argv,
VALUE sym)
11737 return rb_str_match_m_p(argc, argv, sym);
11755 return rb_str_aref_m(argc, argv,
rb_sym2str(sym));
11769sym_length(
VALUE sym)
11783sym_empty(
VALUE sym)
11817sym_downcase(
int argc,
VALUE *argv,
VALUE sym)
11833sym_capitalize(
int argc,
VALUE *argv,
VALUE sym)
11849sym_swapcase(
int argc,
VALUE *argv,
VALUE sym)
11863sym_start_with(
int argc,
VALUE *argv,
VALUE sym)
11865 return rb_str_start_with(argc, argv,
rb_sym2str(sym));
11878sym_end_with(
int argc,
VALUE *argv,
VALUE sym)
11880 return rb_str_end_with(argc, argv,
rb_sym2str(sym));
11892sym_encoding(
VALUE sym)
11898string_for_symbol(
VALUE name)
11917 name = string_for_symbol(name);
11927 name = string_for_symbol(name);
11951 return rb_fstring(str);
11958 return register_fstring(setup_fake_str(&fake_str,
ptr,
len, ENCINDEX_US_ASCII), TRUE);
11970 if (UNLIKELY(rb_enc_autoload_p(enc))) {
11971 rb_enc_autoload(enc);
11975 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc), TRUE);
11988 assert(rb_vm_fstring_table());
11989 st_foreach(rb_vm_fstring_table(), fstring_set_class_i,
rb_cString);
12152 rb_gc_register_address(&
rb_fs);
#define RUBY_ASSERT(expr)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
#define RUBY_ASSERT_ALWAYS(expr)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_isascii(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isascii(), except it additionally takes an encoding.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
VALUE rb_define_module(const char *name)
Defines a top-level module.
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
int rb_block_given_p(void)
Determines if the current method is given a block.
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
#define TYPE(_)
Old name of rb_type.
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
#define ALLOCV
Old name of RB_ALLOCV.
#define ISSPACE
Old name of rb_isspace.
#define T_STRING
Old name of RUBY_T_STRING.
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
#define xfree
Old name of ruby_xfree.
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
#define rb_str_cat2
Old name of rb_str_cat_cstr.
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
#define ID2SYM
Old name of RB_ID2SYM.
#define OBJ_FREEZE_RAW
Old name of RB_OBJ_FREEZE_RAW.
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
#define SYM2ID
Old name of RB_SYM2ID.
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
#define CLASS_OF
Old name of rb_class_of.
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
#define SIZET2NUM
Old name of RB_SIZE2NUM.
#define FIXABLE
Old name of RB_FIXABLE.
#define xmalloc
Old name of ruby_xmalloc.
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
#define LONG2FIX
Old name of RB_INT2FIX.
#define ISDIGIT
Old name of rb_isdigit.
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
#define ZALLOC_N
Old name of RB_ZALLOC_N.
#define ALLOC_N
Old name of RB_ALLOC_N.
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
#define FL_SET
Old name of RB_FL_SET.
#define rb_ary_new3
Old name of rb_ary_new_from_args.
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
#define LONG2NUM
Old name of RB_LONG2NUM.
#define ISALPHA
Old name of rb_isalpha.
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
#define ISASCII
Old name of rb_isascii.
#define TOLOWER
Old name of rb_tolower.
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
#define NUM2INT
Old name of RB_NUM2INT.
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
#define NIL_P
Old name of RB_NIL_P.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
#define DBL2NUM
Old name of rb_float_new.
#define ISPRINT
Old name of rb_isprint.
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
#define FL_TEST
Old name of RB_FL_TEST.
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
#define NUM2LONG
Old name of RB_NUM2LONG.
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
#define FL_UNSET
Old name of RB_FL_UNSET.
#define UINT2NUM
Old name of RB_UINT2NUM.
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
#define rb_ary_new2
Old name of rb_ary_new_capa.
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
#define SYMBOL_P
Old name of RB_SYMBOL_P.
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
#define T_REGEXP
Old name of RUBY_T_REGEXP.
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it always reports, regardless of the runtime -W flag.
void rb_raise(VALUE exc, const char *fmt,...)
Exception entry point.
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
void rb_bug(const char *fmt,...)
Interpreter panic switch.
VALUE rb_eRangeError
RangeError exception.
VALUE rb_eTypeError
TypeError exception.
void rb_fatal(const char *fmt,...)
Raises the unsung "fatal" exception.
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
VALUE rb_eRuntimeError
RuntimeError exception.
VALUE rb_eArgError
ArgumentError exception.
VALUE rb_eIndexError
IndexError exception.
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby's string instead of C's.
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
VALUE rb_cSymbol
Symbol class.
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
VALUE rb_mComparable
Comparable module.
VALUE rb_cString
String class.
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
static const char * rb_enc_name(rb_encoding *enc)
Queries the (canonical) name of the passed encoding.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
static char * rb_enc_prev_char(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the previous (left) character.
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
static bool rb_enc_asciicompat(rb_encoding *enc)
Queries if the passed encoding is in some sense compatible with ASCII.
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
static int rb_enc_mbcput(unsigned int c, void *buf, rb_encoding *enc)
Identical to rb_enc_uint_chr(), except it writes back to the passed buffer instead of allocating one.
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
static OnigCodePoint rb_enc_mbc_to_codepoint(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_codepoint(), except it assumes the passed character is not broken.
static int rb_enc_mbminlen(rb_encoding *enc)
Queries the minimum number of bytes that the passed encoding needs to represent a character.
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_new(), except it additionally takes an encoding.
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
rb_econv_result_t
return value of rb_econv_convert()
@ econv_finished
The conversion stopped after converting everything.
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
#define rb_check_frozen
Just another name of rb_check_frozen_inline.
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
VALUE rb_fs
The field separator character for inputs, or the $;.
VALUE rb_default_rs
This is the default value of rb_rs, i.e.
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
VALUE rb_sym_all_symbols(void)
Collects every single one of the symbols that have ever been interned in the entire history of the current pr...
void rb_backref_set(VALUE md)
Updates $~.
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
VALUE rb_str_to_interned_str(VALUE str)
Identical to rb_interned_str(), except it takes a Ruby's string instead of C's.
void rb_str_free(VALUE str)
Destroys the given string for no reason.
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby's string instead of C's.
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
#define rb_str_buf_cat
Just another name of rb_str_cat.
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
VALUE rb_str_dup(VALUE str)
Duplicates a string.
void rb_str_modify(VALUE str)
Declares that the string is about to be modified.
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable each other or not.
#define rb_strlen_lit(str)
Length of a string literal.
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
VALUE rb_str_resize(VALUE str, long len)
Overwrites the length of the string.
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
ID rb_intern(const char *name)
Finds or creates a symbol of the given name.
VALUE rb_sym2str(VALUE id)
Identical to rb_id2str(), except it takes an instance of rb_cSymbol rather than an ID.
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
ID rb_to_id(VALUE str)
Identical to rb_intern(), except it takes an instance of rb_cString.
ID rb_intern_str(VALUE str)
Identical to rb_intern(), except it takes an instance of rb_cString.
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
VALUE rb_yield(VALUE val)
Yields the block.
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
#define ALLOCA_N(type, n)
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
#define RBASIC(obj)
Convenient casting macro.
#define DATA_PTR(obj)
Convenient getter macro.
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
@ RSTRING_EMBED_LEN_MAX
Max possible number of characters that can be embedded.
#define StringValue(v)
Ensures that the parameter object is a String.
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
static long RSTRING_EMBED_LEN(VALUE str)
Queries the length of the string.
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
static long RSTRING_LEN(VALUE str)
Queries the length of the string.
#define RSTRING(obj)
Convenient casting macro.
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
static char * RSTRING_PTR(VALUE str)
Queries the contents pointer of the string.
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
VALUE flags
Per-object flags.
union RString::@50 as
String's specific fields.
struct RString::@50::@51 heap
Strings that use separated memory region for contents use this pattern.
struct RBasic basic
Basic part, including flags and class.
long capa
Capacity of *ptr.
struct RString::@50::@52 embed
Embedded contents.
long len
Length of the string, not including terminating NUL character.
char ary[RSTRING_EMBED_LEN_MAX+1]
When a string is short enough, it uses this area to store the contents themselves.
union RString::@50::@51::@53 aux
Auxiliary info.
VALUE shared
Parent of the string.
char * ptr
Pointer to the contents of the string.
This is the struct that holds necessary info for a struct.
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
uintptr_t VALUE
Type that represents a Ruby object.
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.