[uim-commit] r2280 - branches/r5rs/sigscheme
yamaken at freedesktop.org
yamaken at freedesktop.org
Tue Nov 29 18:00:27 PST 2005
Author: yamaken
Date: 2005-11-29 18:00:22 -0800 (Tue, 29 Nov 2005)
New Revision: 2280
Modified:
branches/r5rs/sigscheme/TODO
branches/r5rs/sigscheme/debug.c
branches/r5rs/sigscheme/encoding.c
branches/r5rs/sigscheme/encoding.h
branches/r5rs/sigscheme/io.c
branches/r5rs/sigscheme/operations.c
branches/r5rs/sigscheme/read.c
branches/r5rs/sigscheme/sigscheme.h
branches/r5rs/sigscheme/sigschemetype-compact.h
branches/r5rs/sigscheme/sigschemetype.h
branches/r5rs/sigscheme/storage-gc.c
branches/r5rs/sigscheme/storage.c
branches/r5rs/sigscheme/test-compact.c
Log:
* This commit changes string-based charcter object to int-based
* sigscheme/sigscheme.h
- (Scm_NewChar): Adapt to int-based char
* sigscheme/sigschemetype.h
- (SCM_CHARCELL_SIZE): Removed
- (struct ScmCell_, SCM_CHAR_VALUE, SCM_CHAR_SET_VALUE): Adapt to
int-based char
* sigscheme/sigschemetype-compact.h
- (SCM_CHAR_VALUE, SCM_CHAR_SET_VALUE): Adapt to int-based char
* sigscheme/test-compact.c
- (Scm_CheckChar): Add FIXME comment
* sigscheme/storage.c
- Remove SCM_CHARCELL_SIZE handling
- (Scm_NewChar): Adapt to int-based char
* sigscheme/storage-gc.c
- (sweep_obj): Adapt to int-based char
* sigscheme/encoding.h
- (Scm_mb_scan_char): Removed
* sigscheme/encoding.c
- (Scm_mb_scan_char): Removed
- (Scm_mb_strlen, Scm_mb_substring): Replace Scm_mb_scan_char() with
SCM_CHARCODEC_SCAN_CHAR()
- (IS_ASCII): Replace the cast (uchar) with (uint) to multibyte char
- (eucjp_str2int): Fix broken 3byte char handling
- (LEADING_VAL, TRAILING_VAL): Fix broken expression
- (utf8_int2str): Follow the change of TRAILING_VAL()
* sigscheme/read.c
- (CHAR_LITERAL_LEN_MAX, INITIAL_STRING_BUF_SIZE): New macro
- (read_char):
* Support multibyte char
* Adapt to int-based char
- (read_string):
* Ditto
* Fix buffer overflow
* Simplify
- (read_char_sequence):
* Fix fixed-size large buffer on stack
* Fix buffer overflow
* Simplify
* Add non-ASCII char rejection
* sigscheme/io.c
- (ScmOp_read_char, ScmOp_peek_char):
* Adapt to int-based char
* Support multibyte char
* sigscheme/debug.c
- (print_char):
* Support multibyte char
* Adapt to int-based char
* Simplify
* sigscheme/operations.c
- (ScmOp_charequalp, ScmOp_char_alphabeticp, ScmOp_char_numericp,
ScmOp_char_whitespacep, ScmOp_char_upper_casep,
ScmOp_char_lower_casep, ScmOp_char2integer, ScmOp_integer2char,
ScmOp_char_upcase, ScmOp_char_downcase, ScmOp_make_string,
ScmOp_string_ref, ScmOp_string_setd, ScmOp_string2list,
ScmOp_list2string, ScmOp_string_filld): Adapt to int-based char
* sigscheme/TODO
- Update
Modified: branches/r5rs/sigscheme/TODO
===================================================================
--- branches/r5rs/sigscheme/TODO 2005-11-30 01:15:42 UTC (rev 2279)
+++ branches/r5rs/sigscheme/TODO 2005-11-30 02:00:22 UTC (rev 2280)
@@ -66,7 +66,6 @@
* Add integer->char and char->integer procedures with multibyte support
- Write test
- - Make character object int-based
* Dynamic encoding switching for a conversion between string and char list,
based on a SRFI or de facto standard API
Modified: branches/r5rs/sigscheme/debug.c
===================================================================
--- branches/r5rs/sigscheme/debug.c 2005-11-30 01:15:42 UTC (rev 2279)
+++ branches/r5rs/sigscheme/debug.c 2005-11-30 02:00:22 UTC (rev 2280)
@@ -308,51 +308,33 @@
static void print_char(ScmObj port, ScmObj obj, enum OutputType otype)
{
- const ScmSpecialCharInfo *info = NULL;
- char *lex_rep = NULL;
+ const ScmSpecialCharInfo *info;
+ int c;
- /* sanity check */
- if (SCM_CHAR_VALUE(obj) == NULL)
- return;
-
+ c = SCM_CHAR_VALUE(obj);
switch (otype) {
case AS_WRITE:
- lex_rep = SCM_CHAR_VALUE(obj);
- /* single byte */
- if (strlen(lex_rep) < 2) {
- /* search in the Scm_special_char_table */
- for (info = Scm_special_char_table; info->esc_seq; info++) {
- if (SCM_CHAR_VALUE(obj)[0] == info->code) {
- SigScm_PortPrintf(port, "#\\%s", info->lex_rep);
- return;
- }
+ SCM_PORT_PUTS(port, "#\\");
+ /* special chars */
+ for (info = Scm_special_char_table; info->esc_seq; info++) {
+ if (c == (int)info->code) {
+ SCM_PORT_PUTS(port, info->lex_rep);
+ return;
}
- /* not found in the table*/
- if (isprint(SCM_CHAR_VALUE(obj)[0])) {
- SigScm_PortPrintf(port, "#\\%s", SCM_CHAR_VALUE(obj));
- } else {
- /* convert to hexadecimal format */
- lex_rep = (char*)malloc(sizeof(char) * 4);
- snprintf(lex_rep, 4, "x%02x", (int)SCM_CHAR_VALUE(obj)[0]);
- SigScm_PortPrintf(port, "#\\%s", lex_rep);
- free(lex_rep);
- }
- return;
}
- /* multi byte */
- SigScm_PortPrintf(port, "#\\%s", lex_rep);
- break;
+ /* other control chars are printed in hexadecimal form */
+ if (isascii(c) && iscntrl(c)) {
+ SigScm_PortPrintf(port, "x%02x", c);
+ return;
+ }
+ /* FALLTHROUGH */
case AS_DISPLAY:
- /*
- * in display, character objects appear in the reqpresentation as
- * if writen by write-char instead of by write.
- */
- SCM_PORT_PRINT(port, SCM_CHAR_VALUE(obj));
+ SCM_PORT_PUT_CHAR(port, c);
break;
default:
- ERR("print_char : unknown output type");
+ ERR("print_char: unknown output type");
break;
}
}
Modified: branches/r5rs/sigscheme/encoding.c
===================================================================
--- branches/r5rs/sigscheme/encoding.c 2005-11-30 01:15:42 UTC (rev 2279)
+++ branches/r5rs/sigscheme/encoding.c 2005-11-30 02:00:22 UTC (rev 2280)
@@ -206,22 +206,6 @@
/*=======================================
Global Variables
=======================================*/
-/* FIXME: merge into Scm_current_char_codec */
-ScmMultibyteCharInfo (*Scm_mb_scan_char)(ScmMultibyteString mbs)
-#if SCM_USE_UTF8_AS_DEFAULT
- = utf8_scan_char;
-#elif SCM_USE_EUCCN_AS_DEFAULT
- = euccn_scan_char;
-#elif SCM_USE_EUCJP_AS_DEFAULT
- = eucjp_scan_char;
-#elif SCM_USE_EUCKR_AS_DEFAULT
- = eucjp_scan_char;
-#elif SCM_USE_SJIS_AS_DEFAULT
- = sjis_scan_char;
-#else
- = unibyte_scan_char;
-#endif
-
/* temporary solution */
ScmCharCodec *Scm_current_char_codec
#if SCM_USE_UTF8_AS_DEFAULT
@@ -251,7 +235,7 @@
SCM_MBS_GET_SIZE(mbs), SCM_MBS_GET_STR(mbs)));
while (SCM_MBS_GET_SIZE(mbs)) {
- c = Scm_mb_scan_char(mbs);
+ c = SCM_CHARCODEC_SCAN_CHAR(Scm_current_char_codec, mbs);
CDBG((SCM_DBG_ENCODING, "%d, %d;", SCM_MBCINFO_GET_SIZE(c), c.flag));
SCM_MBS_SKIP_CHAR(mbs, c);
len++;
@@ -280,14 +264,14 @@
ret = mbs;
while (i--) {
- c = Scm_mb_scan_char(ret);
+ c = SCM_CHARCODEC_SCAN_CHAR(Scm_current_char_codec, ret);
SCM_MBS_SKIP_CHAR(ret, c);
}
end = ret;
while (len--) {
- c = Scm_mb_scan_char(end);
+ c = SCM_CHARCODEC_SCAN_CHAR(Scm_current_char_codec, end);
SCM_MBS_SKIP_CHAR(end, c);
}
@@ -341,7 +325,7 @@
#define IN_GR94(c) (0xA1 <= (uchar)(c) && (uchar)(c) <= 0xFE)
#define IN_GR96(c) (0xA0 <= (uchar)(c) && (uchar)(c) <= 0xFF)
-#define IS_ASCII(c) ((uchar)(c) <= 0x7F)
+#define IS_ASCII(c) ((uint)(c) <= 0x7F)
#define IS_GR_SPC_OR_DEL(c) ((uchar)(c) == 0xA0 || (uchar)(c) == 0xFF)
#define CHAR_BITS 8
@@ -424,8 +408,8 @@
break;
case 3:
- ch = src[0] << CHAR_BITS;
- ch |= src[1] << CHAR_BITS * 2;
+ ch = src[0] << CHAR_BITS * 2;
+ ch |= src[1] << CHAR_BITS;
ch |= src[2];
break;
@@ -621,10 +605,8 @@
#define TRAILING_CODE_BITS LEN_CODE_BITS(1)
#define TRAILING_VAL_BITS (CHAR_BITS - TRAILING_CODE_BITS)
#define LEADING_VAL_BITS(n) (CHAR_BITS - LEN_CODE_BITS(n))
-#define LEADING_VAL(u, n) \
- ((u) >> (LEN_CODE_BITS(n) + TRAILING_VAL_BITS * ((n) - 1)))
-#define TRAILING_VAL(u, n, i) \
- (~MASK(1) & ((u) >> (TRAILING_VAL_BITS * ((n) - 2 - (i)))))
+#define LEADING_VAL(u, n) ((u) >> TRAILING_VAL_BITS * ((n) - 1))
+#define TRAILING_VAL(u, i) (~MASK(1) & ((u) >> TRAILING_VAL_BITS * (i)))
static const char *utf8_encoding(void)
{
@@ -707,16 +689,16 @@
*dst++ = ch;
} else if (IN_OCT_BMP(ch)) {
*dst++ = LEN_CODE(2) | LEADING_VAL(ch, 2);
- *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 2, 0);
+ *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 0);
} else if (IN_BMP(ch)) {
*dst++ = LEN_CODE(3) | LEADING_VAL(ch, 3);
- *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 3, 0);
- *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 3, 1);
+ *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 1);
+ *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 0);
} else if (IN_SMP(ch)) {
*dst++ = LEN_CODE(4) | LEADING_VAL(ch, 4);
- *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 4, 0);
- *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 4, 1);
- *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 4, 2);
+ *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 2);
+ *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 1);
+ *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 0);
} else {
return NULL;
}
Modified: branches/r5rs/sigscheme/encoding.h
===================================================================
--- branches/r5rs/sigscheme/encoding.h 2005-11-30 01:15:42 UTC (rev 2279)
+++ branches/r5rs/sigscheme/encoding.h 2005-11-30 02:00:22 UTC (rev 2280)
@@ -153,7 +153,6 @@
/*=======================================
Variable Declarations
=======================================*/
-extern ScmMultibyteCharInfo (*Scm_mb_scan_char)(ScmMultibyteString mbs);
extern ScmCharCodec *Scm_current_char_codec;
/*=======================================
Modified: branches/r5rs/sigscheme/io.c
===================================================================
--- branches/r5rs/sigscheme/io.c 2005-11-30 01:15:42 UTC (rev 2279)
+++ branches/r5rs/sigscheme/io.c 2005-11-30 02:00:22 UTC (rev 2280)
@@ -373,9 +373,7 @@
ScmObj ScmOp_read_char(ScmObj args)
{
ScmObj port = SCM_INVALID;
- /* FIXME: use int as char */
int ch;
- char buf[2];
DECLARE_FUNCTION("read-char", ProcedureVariadic0);
PREPARE_PORT(port, args, scm_current_input_port);
@@ -384,18 +382,13 @@
if (ch == EOF)
return SCM_EOF;
- buf[0] = ch;
- buf[1] = '\0';
-
- return Scm_NewChar(strdup(buf));
+ return Scm_NewChar(ch);
}
ScmObj ScmOp_peek_char(ScmObj args)
{
ScmObj port = SCM_INVALID;
- /* FIXME: use int as char */
int ch;
- char buf[2];
DECLARE_FUNCTION("peek-char", ProcedureVariadic0);
PREPARE_PORT(port, args, scm_current_input_port);
@@ -404,9 +397,7 @@
if (ch == EOF)
return SCM_EOF;
- buf[0] = ch;
- buf[1] = '\0';
- return Scm_NewChar(strdup(buf));
+ return Scm_NewChar(ch);
}
ScmObj ScmOp_eof_objectp(ScmObj obj)
Modified: branches/r5rs/sigscheme/operations.c
===================================================================
--- branches/r5rs/sigscheme/operations.c 2005-11-30 01:15:42 UTC (rev 2279)
+++ branches/r5rs/sigscheme/operations.c 2005-11-30 02:00:22 UTC (rev 2280)
@@ -986,109 +986,80 @@
ScmObj ScmOp_charequalp(ScmObj ch1, ScmObj ch2)
{
DECLARE_FUNCTION("char=?", ProcedureFixed2);
+
ASSERT_CHARP(ch1);
ASSERT_CHARP(ch2);
- if (strcmp(SCM_CHAR_VALUE(ch1), SCM_CHAR_VALUE(ch2)) == 0)
- return SCM_TRUE;
-
- return SCM_FALSE;
+ return (SCM_CHAR_VALUE(ch1) == SCM_CHAR_VALUE(ch2)) ? SCM_TRUE : SCM_FALSE;
}
ScmObj ScmOp_char_alphabeticp(ScmObj obj)
{
+ int ch;
DECLARE_FUNCTION("char-alphabetic?", ProcedureFixed1);
+
ASSERT_CHARP(obj);
- /* check multibyte */
- if (strlen(SCM_CHAR_VALUE(obj)) != 1)
- return SCM_FALSE;
+ ch = SCM_CHAR_VALUE(obj);
- /* check alphabet */
- if (isalpha(SCM_CHAR_VALUE(obj)[0]) != 0)
- return SCM_TRUE;
-
- return SCM_FALSE;
+ return (isascii(ch) && isalpha(ch)) ? SCM_TRUE : SCM_FALSE;
}
ScmObj ScmOp_char_numericp(ScmObj obj)
{
+ int ch;
DECLARE_FUNCTION("char-numeric?", ProcedureFixed1);
+
ASSERT_CHARP(obj);
- /* check multibyte */
- if (strlen(SCM_CHAR_VALUE(obj)) != 1)
- return SCM_FALSE;
+ ch = SCM_CHAR_VALUE(obj);
- /* check digit */
- if (isdigit(SCM_CHAR_VALUE(obj)[0]) != 0)
- return SCM_TRUE;
-
- return SCM_FALSE;
+ return (isascii(ch) && isdigit(ch)) ? SCM_TRUE : SCM_FALSE;
}
ScmObj ScmOp_char_whitespacep(ScmObj obj)
{
+ int ch;
DECLARE_FUNCTION("char-whitespace?", ProcedureFixed1);
+
ASSERT_CHARP(obj);
- /* check multibyte */
- if (strlen(SCM_CHAR_VALUE(obj)) != 1)
- return SCM_FALSE;
+ ch = SCM_CHAR_VALUE(obj);
- /* check space */
- if (isspace(SCM_CHAR_VALUE(obj)[0]) != 0)
- return SCM_TRUE;
-
- return SCM_FALSE;
+ return (isascii(ch) && isspace(ch)) ? SCM_TRUE : SCM_FALSE;
}
ScmObj ScmOp_char_upper_casep(ScmObj obj)
{
+ int ch;
DECLARE_FUNCTION("char-upper-case?", ProcedureFixed1);
+
ASSERT_CHARP(obj);
- /* check multibyte */
- if (strlen(SCM_CHAR_VALUE(obj)) != 1)
- return SCM_FALSE;
+ ch = SCM_CHAR_VALUE(obj);
- /* check uppercase */
- if (isupper(SCM_CHAR_VALUE(obj)[0]) != 0)
- return SCM_TRUE;
-
- return SCM_FALSE;
+ return (isascii(ch) && isupper(ch)) ? SCM_TRUE : SCM_FALSE;
}
ScmObj ScmOp_char_lower_casep(ScmObj obj)
{
+ int ch;
DECLARE_FUNCTION("char-lower-case?", ProcedureFixed1);
+
ASSERT_CHARP(obj);
- /* check multibyte */
- if (strlen(SCM_CHAR_VALUE(obj)) != 1)
- return SCM_FALSE;
+ ch = SCM_CHAR_VALUE(obj);
- /* check lowercase */
- if (islower(SCM_CHAR_VALUE(obj)[0]) != 0)
- return SCM_TRUE;
-
- return SCM_FALSE;
+ return (isascii(ch) && islower(ch)) ? SCM_TRUE : SCM_FALSE;
}
ScmObj ScmOp_char2integer(ScmObj obj)
{
- int val;
- const char *str;
DECLARE_FUNCTION("char->integer", ProcedureFixed1);
ASSERT_CHARP(obj);
- str = SCM_CHAR_VALUE(obj);
- val = SCM_CHARCODEC_STR2INT(Scm_current_char_codec, str, strlen(str),
- SCM_MB_STATELESS);
- if (val == EOF)
- ERR_OBJ("invalid char value", obj);
- return Scm_NewInt(val);
+ return Scm_NewInt(SCM_CHAR_VALUE(obj));
}
ScmObj ScmOp_integer2char(ScmObj obj)
@@ -1102,36 +1073,34 @@
val = SCM_INT_VALUE(obj);
if (!SCM_CHARCODEC_INT2STR(Scm_current_char_codec, buf, val, SCM_MB_STATELESS))
ERR_OBJ("invalid char value", obj);
- return Scm_NewChar(strdup(buf));
+ return Scm_NewChar(val);
}
ScmObj ScmOp_char_upcase(ScmObj obj)
{
+ int ch;
DECLARE_FUNCTION("char-upcase", ProcedureFixed1);
+
ASSERT_CHARP(obj);
- /* check multibyte */
- if (strlen(SCM_CHAR_VALUE(obj)) != 1)
- return obj;
+ ch = SCM_CHAR_VALUE(obj);
+ if (isascii(ch))
+ SCM_CHAR_SET_VALUE(obj, toupper(ch));
- /* to upcase */
- SCM_CHAR_VALUE(obj)[0] = toupper(SCM_CHAR_VALUE(obj)[0]);
-
return obj;
}
ScmObj ScmOp_char_downcase(ScmObj obj)
{
+ int ch;
DECLARE_FUNCTION("char-downcase", ProcedureFixed1);
+
ASSERT_CHARP(obj);
- /* check multibyte */
- if (strlen(SCM_CHAR_VALUE(obj)) != 1)
- return obj;
+ ch = SCM_CHAR_VALUE(obj);
+ if (isascii(ch))
+ SCM_CHAR_SET_VALUE(obj, tolower(ch));
- /* to upcase */
- SCM_CHAR_VALUE(obj)[0] = tolower(SCM_CHAR_VALUE(obj)[0]);
-
return obj;
}
@@ -1146,13 +1115,9 @@
ScmObj ScmOp_make_string(ScmObj length, ScmObj args)
{
- int len = 0;
- int i;
- int fillstr_size = 0;
+ int filler_val, len, i;
ScmObj filler = SCM_FALSE;
- const char *fillstr = NULL;
- char *new_str = NULL;
- char *p = NULL;
+ ScmObj sport = SCM_FALSE;
DECLARE_FUNCTION("make-string", ProcedureVariadic1);
ASSERT_INTP(length);
@@ -1162,26 +1127,24 @@
if (len < 0)
ERR_OBJ("length must be a positive integer", length);
- /* extract fillstr */
+ /* extract filler */
if (NO_MORE_ARG(args)) {
- fillstr = " ";
+ filler_val = ' ';
} else {
filler = POP_ARG(args);
ASSERT_NO_MORE_ARG(args);
ASSERT_CHARP(filler);
- fillstr = SCM_CHAR_VALUE(filler);
+ filler_val = SCM_CHAR_VALUE(filler);
}
- /* fill string */
- fillstr_size = strlen(fillstr);
- new_str = (char*)malloc(sizeof(char) * fillstr_size * len + 1);
- for (i = 0, p = new_str; i < len; i++) {
- strcpy(p, fillstr);
- p += fillstr_size;
+ /* TODO: make efficient */
+ /* fill string (multibyte-ready) */
+ sport = ScmOp_SRFI6_open_output_string();
+ for (i = 0; i < len; i++) {
+ SCM_PORT_PUT_CHAR(sport, filler_val);
}
- new_str[fillstr_size * len] = '\0';
- return Scm_NewMutableString(new_str);
+ return ScmOp_SRFI6_get_output_string(sport);
}
ScmObj ScmOp_string(ScmObj args)
@@ -1200,30 +1163,29 @@
ScmObj ScmOp_string_ref(ScmObj str, ScmObj k)
{
int c_index = 0;
- char *new_ch = NULL;
+ int ch;
ScmMultibyteString mbs;
DECLARE_FUNCTION("string-ref", ProcedureFixed2);
ASSERT_STRINGP(str);
ASSERT_INTP(k);
- SCM_MBS_INIT(mbs);
- /* get start_ptr and end_ptr */
c_index = SCM_INT_VALUE(k);
if (c_index < 0 || SCM_STRING_LEN(str) <= c_index)
ERR_OBJ("index out of range", k);
- SCM_MBS_SET_STR(mbs, SCM_STRING_STR(str));
- /* FIXME: This strlen() can be eliminated. */
+ SCM_MBS_INIT(mbs);
+ SCM_MBS_SET_STR(mbs, SCM_STRING_STR(str));
SCM_MBS_SET_SIZE(mbs, strlen(SCM_STRING_STR(str)));
mbs = Scm_mb_strref(mbs, c_index);
- /* copy from start_ptr to end_ptr */
- new_ch = (char*)malloc(SCM_MBS_GET_SIZE(mbs) + 1);
- memcpy(new_ch, SCM_MBS_GET_STR(mbs), SCM_MBS_GET_SIZE(mbs));
- new_ch[SCM_MBS_GET_SIZE(mbs)] = '\0';
+ /* FIXME: support stateful encoding */
+ ch = SCM_CHARCODEC_STR2INT(Scm_current_char_codec, SCM_MBS_GET_STR(mbs),
+ SCM_MBS_GET_SIZE(mbs), SCM_MB_STATELESS);
+ if (ch == EOF)
+ ERR("string-ref: invalid char sequence");
- return Scm_NewChar(new_ch);
+ return Scm_NewChar(ch);
}
ScmObj ScmOp_string_setd(ScmObj str, ScmObj k, ScmObj ch)
@@ -1236,6 +1198,8 @@
char *new_str = NULL;
ScmMultibyteString mbs;
const char *string_str = NULL;
+ char new_ch_str[SCM_MB_MAX_LEN + sizeof((char)'\0')];
+ const char *next;
DECLARE_FUNCTION("string-set!", ProcedureFixed3);
ASSERT_STRINGP(str);
@@ -1251,22 +1215,27 @@
/* FIXME: can string_str be NULL at this point or not? */
if (!string_str) string_str = "";
- /* FIXME: strlen() can be eliminiated. */
SCM_MBS_INIT(mbs);
SCM_MBS_SET_STR(mbs, string_str);
SCM_MBS_SET_SIZE(mbs, strlen(string_str));
mbs = Scm_mb_strref(mbs, c_start_index);
+ /* FIXME: support stateful encoding */
+ next = SCM_CHARCODEC_INT2STR(Scm_current_char_codec, new_ch_str,
+ SCM_CHAR_VALUE(ch), SCM_MB_STATELESS);
+ if (!next)
+ ERR_OBJ("invalid char in", str);
+
/* calculate total size */
prefix_size = SCM_MBS_GET_STR(mbs) - string_str;
- newch_size = strlen(SCM_CHAR_VALUE(ch));
+ newch_size = next - new_ch_str;
postfix_size = strlen(SCM_MBS_GET_STR(mbs) + SCM_MBS_GET_SIZE(mbs));
total_size = prefix_size + newch_size + postfix_size;
/* copy each part */
new_str = (char*)malloc(total_size + 1);
memcpy(new_str, string_str, prefix_size);
- memcpy(new_str+prefix_size, SCM_CHAR_VALUE(ch), newch_size);
+ memcpy(new_str+prefix_size, new_ch_str, newch_size);
memcpy(new_str+prefix_size+newch_size,
SCM_MBS_GET_STR(mbs)+SCM_MBS_GET_SIZE(mbs), postfix_size);
@@ -1376,8 +1345,9 @@
ScmObj tail = SCM_NULL;
ScmObj next = SCM_NULL;
ScmMultibyteString mbs;
- ScmMultibyteCharInfo ch;
- char *buf;
+ ScmMultibyteCharInfo mbc;
+ ScmMultibyteState state;
+ int ch;
DECLARE_FUNCTION("string->list", ProcedureFixed1);
ASSERT_STRINGP(str);
@@ -1387,12 +1357,19 @@
SCM_MBS_SET_SIZE(mbs, strlen(SCM_STRING_STR(str)));
while (SCM_MBS_GET_SIZE(mbs)) {
- ch = Scm_mb_scan_char(mbs);
- buf = malloc(SCM_MBCINFO_GET_SIZE(ch)+1);
- memcpy(buf, SCM_MBS_GET_STR(mbs), SCM_MBCINFO_GET_SIZE(ch));
- buf[SCM_MBCINFO_GET_SIZE(ch)] = 0;
- next = LIST_1(Scm_NewChar(buf));
+ state = SCM_MBS_GET_STATE(mbs);
+ mbc = SCM_CHARCODEC_SCAN_CHAR(Scm_current_char_codec, mbs);
+ if (SCM_MBCINFO_ERRORP(mbc) || SCM_MBCINFO_INCOMPLETEP(mbc))
+ ERR("string->list: invalid char sequence");
+ ch = SCM_CHARCODEC_STR2INT(Scm_current_char_codec,
+ SCM_MBS_GET_STR(mbs),
+ SCM_MBCINFO_GET_SIZE(mbc),
+ state);
+ if (ch == EOF)
+ ERR("string->list: invalid char sequence");
+ next = LIST_1(Scm_NewChar(ch));
+
if (NULLP(tail))
head = tail = next;
else {
@@ -1400,7 +1377,7 @@
tail = CDR(tail);
}
- SCM_MBS_SKIP_CHAR(mbs, ch);
+ SCM_MBS_SKIP_CHAR(mbs, mbc);
}
return head;
@@ -1408,44 +1385,25 @@
ScmObj ScmOp_list2string(ScmObj lst)
{
- int len = 0;
- int total_size = 0;
- ScmObj chars = SCM_NULL;
- ScmObj obj = SCM_NULL;
- char *new_str = NULL;
- char *ch = NULL;
- char *p = NULL;
+ ScmObj rest, ch, sport;
DECLARE_FUNCTION("list->string", ProcedureFixed1);
- if (FALSEP(ScmOp_listp(lst)))
- ERR_OBJ("list required but got", lst);
+ ASSERT_LISTP(lst);
if (NULLP(lst))
return Scm_NewMutableStringCopying("");
- /* count total size of the string */
- for (chars = lst; !NULLP(chars); chars = CDR(chars)) {
- obj = CAR(chars);
- ASSERT_CHARP(obj);
-
- total_size += strlen(SCM_CHAR_VALUE(obj));
+ /* TODO: make efficient */
+ sport = ScmOp_SRFI6_open_output_string();
+ for (rest = lst; CONSP(rest); rest = CDR(rest)) {
+ ch = CAR(rest);
+ ASSERT_CHARP(ch);
+ SCM_PORT_PUT_CHAR(sport, SCM_CHAR_VALUE(ch));
}
+ if (!NULLP(rest))
+ ERR_OBJ("invalid char list", lst);
- /* allocate new string */
- new_str = (char*)malloc(sizeof(char) * total_size + 1);
-
- /* copy char by char */
- p = new_str;
- for (chars = lst; !NULLP(chars); chars = CDR(chars)) {
- obj = CAR(chars);
- ch = SCM_CHAR_VALUE(obj);
- len = strlen(SCM_CHAR_VALUE(obj));
-
- strcpy(p, ch);
- p += len;
- }
-
- return Scm_NewMutableString(new_str);
+ return ScmOp_SRFI6_get_output_string(sport);
}
ScmObj ScmOp_string_copy(ScmObj str)
@@ -1461,23 +1419,26 @@
int str_len = 0;
char *new_str = NULL;
char *p = NULL;
- int i = 0;
+ char ch_str[SCM_MB_MAX_LEN + sizeof((char)'\0')];
+ const char *next;
DECLARE_FUNCTION("string-fill!", ProcedureFixed2);
ASSERT_STRINGP(str);
ASSERT_MUTABLEP(str);
ASSERT_CHARP(ch);
+ /* FIXME: support stateful encoding */
+ next = SCM_CHARCODEC_INT2STR(Scm_current_char_codec, ch_str,
+ SCM_CHAR_VALUE(ch), SCM_MB_STATELESS);
+ if (!next)
+ ERR_OBJ("invalid char in", str);
+
/* create new str */
- char_size = strlen(SCM_CHAR_VALUE(ch));
+ char_size = next - ch_str;
str_len = SCM_STRING_LEN(str);
- new_str = (char*)realloc(SCM_STRING_STR(str),
- sizeof(char) * str_len * char_size + 1);
- for (i = 0, p = new_str; i < char_size * str_len;) {
- strcpy(p, SCM_CHAR_VALUE(ch));
-
- p += char_size;
- i += char_size;
+ new_str = realloc(SCM_STRING_STR(str), str_len * char_size + 1);
+ for (p = new_str; p < &new_str[char_size * str_len]; p += char_size) {
+ strcpy(p, ch_str);
}
SCM_STRING_SET_STR(str, new_str);
Modified: branches/r5rs/sigscheme/read.c
===================================================================
--- branches/r5rs/sigscheme/read.c 2005-11-30 01:15:42 UTC (rev 2279)
+++ branches/r5rs/sigscheme/read.c 2005-11-30 02:00:22 UTC (rev 2280)
@@ -77,6 +77,11 @@
/*=======================================
File Local Macro Declarations
=======================================*/
+/* can accept "backspace" of R5RS and "U0010FFFF" of SRFI-75 */
+#define CHAR_LITERAL_LEN_MAX (sizeof("backspace") - sizeof((char)'\0'))
+
+#define INITIAL_STRING_BUF_SIZE 1024
+
/* Compatible with isspace(3). Use this to prevent incorrect space handlings */
#define CASE_ISSPACE \
case ' ': case '\t': case '\n': case '\r': case '\v': case '\f'
@@ -329,10 +334,30 @@
static ScmObj read_char(ScmObj port)
{
- char *ch = read_char_sequence(port);
- const ScmSpecialCharInfo *info = Scm_special_char_table;
+ int c;
+ char *ch;
+ const ScmSpecialCharInfo *info;
char *first_nondigit = NULL;
+ /* TODO: reorganize with read_char_sequence() */
+ /* non-ascii char (multibyte-ready) */
+ c = SCM_PORT_PEEK_CHAR(port);
+ if (!isascii(c)) {
+ DISCARD_LOOKAHEAD(port);
+ switch (SCM_PORT_PEEK_CHAR(port)) {
+ case '(': case ')': case '\"': case '\'': case ';':
+ CASE_ISSPACE:
+ case EOF:
+ /* properly delimited */
+ return Scm_NewChar(c);
+
+ default:
+ ERR("invalid character literal");
+ }
+ }
+
+ ch = read_char_sequence(port);
+
CDBG((SCM_DBG_PARSER, "read_char : ch = %s", ch));
/* check #\x<x><x> style character where <x> is a hexadecimal
@@ -341,48 +366,53 @@
if (ch && ch[0] == 'x' && 1 < strlen(ch)) {
if (strlen(ch) != 3)
SigScm_Error("invalid hexadecimal character form. should be #\\x<x><x>\n");
- ch[0] = (char)strtol(ch + 1, &first_nondigit, 16);
+ c = strtol(ch + 1, &first_nondigit, 16);
if (*first_nondigit)
SigScm_Error("invalid hexadecimal character form. should be #\\x<x><x>\n");
- ch[1] = '\0';
} else {
/* check special sequence */
- for (; info->esc_seq; info++) {
+ for (info = Scm_special_char_table; info->esc_seq; info++) {
if (strcmp(ch, info->lex_rep) == 0) {
- ch[0] = info->code;
- ch[1] = '\0';
+ c = info->code;
break;
}
}
}
-
- return Scm_NewChar(ch);
+ free(ch);
+ return Scm_NewChar(c);
}
+/* FIXME: extend buffer on demand */
static ScmObj read_string(ScmObj port)
{
- char stringbuf[1024]; /* FIXME! */
- int stringlen = 0;
- int c = 0;
- const ScmSpecialCharInfo *info = NULL;
- int found = 0;
+ ScmObj obj;
+ const ScmSpecialCharInfo *info;
+ int c;
+ size_t bufsize;
+ char *p, *buf;
+ char autobuf[INITIAL_STRING_BUF_SIZE];
CDBG((SCM_DBG_PARSER, "read_string"));
- while (1) {
- SCM_PORT_GETC(port, c);
+ buf = autobuf;
+ bufsize = sizeof(autobuf);
+ for (p = buf; p < &buf[bufsize];) {
+ c = SCM_PORT_GET_CHAR(port);
CDBG((SCM_DBG_PARSER, "read_string c = %c", c));
switch (c) {
case EOF:
- stringbuf[stringlen] = '\0';
- SigScm_Error("EOF in the string : str = %s", stringbuf);
+ *p = '\0';
+ ERR("EOF in string: \"%s<eof>", buf);
break;
case '\"':
- stringbuf[stringlen] = '\0';
- return Scm_NewImmutableStringCopying(stringbuf);
+ *p = '\0';
+ obj = Scm_NewImmutableStringCopying(buf);
+ if (buf != autobuf)
+ free(buf);
+ return obj;
case '\\':
/*
@@ -390,25 +420,29 @@
* A double quote can be written inside a string only by
* escaping it with a backslash (\).
*/
- SCM_PORT_GETC(port, c);
- found = 0;
+ c = SCM_PORT_GET_CHAR(port);
for (info = Scm_special_char_table; info->esc_seq; info++) {
if (strlen(info->esc_seq) == 2 && c == info->esc_seq[1]) {
- stringbuf[stringlen++] = info->code;
- found = 1;
- break;
+ *p++ = info->code;
+ goto found;
}
}
- if (found == 0)
- SigScm_Error("\\%c in a string causes invalid escape sequence", c);
+ ERR("invalid escape sequence in string: \\%c", c);
+ found:
break;
default:
- stringbuf[stringlen] = c;
- stringlen++;
+ /* FIXME: support stateful encoding */
+ p = SCM_CHARCODEC_INT2STR(Scm_current_char_codec,
+ p, c, SCM_MB_STATELESS);
+ if (!p)
+ ERR("invalid char in string: 0x%x", c);
break;
}
}
+ buf[bufsize - 1] = '\0';
+ ERR("too long string: \"%s\"", buf);
+ /* NOTREACHED */
}
static ScmObj read_symbol(ScmObj port)
@@ -485,42 +519,40 @@
static char *read_char_sequence(ScmObj port)
{
- char stringbuf[1024]; /* FIXME! */
- int stringlen = 0;
- int c = 0;
- char *dst = NULL;
+ int c;
+ size_t len;
+ char buf[CHAR_LITERAL_LEN_MAX + sizeof((char)'\0')];
+ DECLARE_INTERNAL_FUNCTION("read_char_sequence");
- while (1) {
+ for (len = 0; len <= CHAR_LITERAL_LEN_MAX; len++) {
c = SCM_PORT_PEEK_CHAR(port);
+ if (!isascii(c))
+ ERR("non-ASCII char in char sequence: 0x%x", c);
CDBG((SCM_DBG_PARSER, "c = %c", c));
switch (c) {
case EOF:
- stringbuf[stringlen] = '\0';
- SigScm_Error("EOF in the char sequence : char = %s", stringbuf);
- break;
+ buf[len] = '\0';
+ ERR("EOF in char sequence: %s", buf);
+ /* NOTREACHED */
case '(': case ')': case '\"': case '\'': case ';':
CASE_ISSPACE:
- /* pass through first char */
- if (stringlen == 0) {
- DISCARD_LOOKAHEAD(port);
- stringbuf[stringlen++] = (char)c;
- break;
+ if (len) {
+ /* appeared as delimiter */
+ buf[len] = '\0';
+ return strdup(buf);
}
- /* return buf */
- SCM_PORT_UNGETC(port, c);
- stringbuf[stringlen] = '\0';
- dst = strdup(stringbuf);
- return dst;
-
+ /* FALLTHROUGH */
default:
DISCARD_LOOKAHEAD(port);
- stringbuf[stringlen++] = (char)c;
+ buf[len] = (char)c;
break;
}
}
+ ERR("invalid char sequence");
+ /* NOTREACHED */
}
static ScmObj read_quote(ScmObj port, ScmObj quoter)
Modified: branches/r5rs/sigscheme/sigscheme.h
===================================================================
--- branches/r5rs/sigscheme/sigscheme.h 2005-11-30 01:15:42 UTC (rev 2279)
+++ branches/r5rs/sigscheme/sigscheme.h 2005-11-30 02:00:22 UTC (rev 2280)
@@ -382,7 +382,7 @@
ScmObj Scm_NewCons(ScmObj a, ScmObj b);
ScmObj Scm_NewInt(int val);
ScmObj Scm_NewSymbol(char *name, ScmObj v_cell);
-ScmObj Scm_NewChar(char *ch);
+ScmObj Scm_NewChar(int val);
ScmObj Scm_NewString(char *str, int is_immutable);
ScmObj Scm_NewImmutableString(char *str);
ScmObj Scm_NewImmutableStringCopying(const char *str);
Modified: branches/r5rs/sigscheme/sigschemetype-compact.h
===================================================================
--- branches/r5rs/sigscheme/sigschemetype-compact.h 2005-11-30 01:15:42 UTC (rev 2279)
+++ branches/r5rs/sigscheme/sigschemetype-compact.h 2005-11-30 02:00:22 UTC (rev 2280)
@@ -530,8 +530,8 @@
#define SCM_C_FUNCPOINTER_VALUE(a) (SCM_WORD_CAST(ScmCFunc, SCM_CAR_GET_VALUE_AS_PTR(a)))
#define SCM_C_FUNCPOINTER_SET_VALUE(a, val) (SCM_CAR_SET_VALUE_AS_PTR((a), SCM_WORD_CAST(ScmObj, (val))))
-#define SCM_CHAR_VALUE(a) (SCM_PRIMARY_GET_VALUE_AS_STR((a), ~SCM_TAG_IMM_MASK_CHAR))
-#define SCM_CHAR_SET_VALUE(a, ch) (SCM_PRIMARY_SET_VALUE_AS_STR((a), (ch), SCM_TAG_IMM_CHAR))
+#define SCM_CHAR_VALUE(a) (SCM_PRIMARY_GET_VALUE_AS_INT((a), ~SCM_TAG_IMM_MASK_CHAR))
+#define SCM_CHAR_SET_VALUE(a, ch) (SCM_PRIMARY_SET_VALUE_AS_INT((a), (ch), SCM_TAG_IMM_CHAR))
/*
* Integer need to preserve 'singed' or 'unsigned', so need special accessor.
Modified: branches/r5rs/sigscheme/sigschemetype.h
===================================================================
--- branches/r5rs/sigscheme/sigschemetype.h 2005-11-30 01:15:42 UTC (rev 2279)
+++ branches/r5rs/sigscheme/sigschemetype.h 2005-11-30 02:00:22 UTC (rev 2280)
@@ -160,8 +160,6 @@
SCM_STR_MUTABLE = 1
};
-#define SCM_CHARCELL_SIZE 8
-
/* Scheme Object */
struct ScmCell_ {
enum ScmObjType type;
@@ -183,12 +181,7 @@
} symbol;
struct {
-#if 0
- /* placeholdler for future optimization */
- unsigned char seq[SCM_CHARCELL_SIZE];
-#else
- char *ch;
-#endif
+ int value;
} ch;
struct {
@@ -291,8 +284,8 @@
#define SCM_CHARP(a) (SCM_TYPE(a) == ScmChar)
#define SCM_ENTYPE_CHAR(a) (SCM_ENTYPE((a), ScmChar))
-#define SCM_CHAR_VALUE(a) (SCM_AS_CHAR(a)->obj.ch.ch)
-#define SCM_CHAR_SET_VALUE(a, chr) (SCM_CHAR_VALUE(a) = (chr))
+#define SCM_CHAR_VALUE(a) (SCM_AS_CHAR(a)->obj.ch.value)
+#define SCM_CHAR_SET_VALUE(a, val) (SCM_CHAR_VALUE(a) = (val))
/* String Object uses tagged pointer for packing mutation type.
* LSB of ScmCell.obj.string.str is used to represent mutation type
Modified: branches/r5rs/sigscheme/storage-gc.c
===================================================================
--- branches/r5rs/sigscheme/storage-gc.c 2005-11-30 01:15:42 UTC (rev 2279)
+++ branches/r5rs/sigscheme/storage-gc.c 2005-11-30 02:00:22 UTC (rev 2280)
@@ -548,16 +548,12 @@
#else /* SCM_OBJ_COMPACT */
/* if the type has the pointer to free, then free it! */
switch (SCM_TYPE(obj)) {
- case ScmInt:
case ScmCons:
+ case ScmInt:
+ case ScmChar:
case ScmClosure:
break;
- case ScmChar:
- if (SCM_CHAR_VALUE(obj))
- free(SCM_CHAR_VALUE(obj));
- break;
-
case ScmString:
if (SCM_STRING_STR(obj))
free(SCM_STRING_STR(obj));
Modified: branches/r5rs/sigscheme/storage.c
===================================================================
--- branches/r5rs/sigscheme/storage.c 2005-11-30 01:15:42 UTC (rev 2279)
+++ branches/r5rs/sigscheme/storage.c 2005-11-30 02:00:22 UTC (rev 2280)
@@ -45,13 +45,6 @@
#include "sigschemeinternal.h"
#include "encoding.h"
-#if !SCM_OBJ_COMPACT
-#if (SCM_CHARCELL_SIZE <= SCM_MB_MAX_LEN)
-#error
-#error "SCM_MB_MAX_LEN is exceeded design limit"
-#endif
-#endif /* !SCM_OBJ_COMPACT */
-
/*=======================================
File Local Struct Declarations
=======================================*/
@@ -185,19 +178,12 @@
return obj;
}
-ScmObj Scm_NewChar(char *ch)
+ScmObj Scm_NewChar(int val)
{
ScmObj obj = SigScm_NewObjFromHeap();
- int len;
- len = Scm_mb_bare_c_strlen(ch);
- if (len > SCM_MB_MAX_LEN) {
- SigScm_Error("Scm_NewChar : invalid character ch = [%s], len = %d",
- ch, len);
- }
-
SCM_ENTYPE_CHAR(obj);
- SCM_CHAR_SET_VALUE(obj, ch);
+ SCM_CHAR_SET_VALUE(obj, val);
return obj;
}
Modified: branches/r5rs/sigscheme/test-compact.c
===================================================================
--- branches/r5rs/sigscheme/test-compact.c 2005-11-30 01:15:42 UTC (rev 2279)
+++ branches/r5rs/sigscheme/test-compact.c 2005-11-30 02:00:22 UTC (rev 2280)
@@ -296,6 +296,7 @@
return obj;
}
+/* FIXME: follow int-based char */
ScmObj Scm_CheckChar(char *ch)
{
#define SCM_MB_MAX_LEN 4
More information about the uim-commit
mailing list