[uim-commit] r2280 - branches/r5rs/sigscheme

yamaken at freedesktop.org yamaken at freedesktop.org
Tue Nov 29 18:00:27 PST 2005


Author: yamaken
Date: 2005-11-29 18:00:22 -0800 (Tue, 29 Nov 2005)
New Revision: 2280

Modified:
   branches/r5rs/sigscheme/TODO
   branches/r5rs/sigscheme/debug.c
   branches/r5rs/sigscheme/encoding.c
   branches/r5rs/sigscheme/encoding.h
   branches/r5rs/sigscheme/io.c
   branches/r5rs/sigscheme/operations.c
   branches/r5rs/sigscheme/read.c
   branches/r5rs/sigscheme/sigscheme.h
   branches/r5rs/sigscheme/sigschemetype-compact.h
   branches/r5rs/sigscheme/sigschemetype.h
   branches/r5rs/sigscheme/storage-gc.c
   branches/r5rs/sigscheme/storage.c
   branches/r5rs/sigscheme/test-compact.c
Log:
* This commit changes string-based charcter object to int-based

* sigscheme/sigscheme.h
  - (Scm_NewChar): Adapt to int-based char
* sigscheme/sigschemetype.h
  - (SCM_CHARCELL_SIZE): Removed
  - (struct ScmCell_, SCM_CHAR_VALUE, SCM_CHAR_SET_VALUE): Adapt to
    int-based char
* sigscheme/sigschemetype-compact.h
  - (SCM_CHAR_VALUE, SCM_CHAR_SET_VALUE): Adapt to int-based char
* sigscheme/test-compact.c
  - (Scm_CheckChar): Add FIXME comment
* sigscheme/storage.c
  - Remove SCM_CHARCELL_SIZE handling
  - (Scm_NewChar): Adapt to int-based char
* sigscheme/storage-gc.c
  - (sweep_obj): Adapt to int-based char
* sigscheme/encoding.h
  - (Scm_mb_scan_char): Removed
* sigscheme/encoding.c
  - (Scm_mb_scan_char): Removed
  - (Scm_mb_strlen, Scm_mb_substring): Replace Scm_mb_scan_char() with
    SCM_CHARCODEC_SCAN_CHAR()
  - (IS_ASCII): Replace the cast (uchar) with (uint) to multibyte char
  - (eucjp_str2int): Fix broken 3byte char handling
  - (LEADING_VAL, TRAILING_VAL): Fix broken expression
  - (utf8_int2str): Follow the change of TRAILING_VAL()
* sigscheme/read.c
  - (CHAR_LITERAL_LEN_MAX, INITIAL_STRING_BUF_SIZE): New macro
  - (read_char):
    * Support multibyte char
    * Adapt to int-based char
  - (read_string):
    * Ditto
    * Fix buffer overflow
    * Simplify
  - (read_char_sequence):
    * Fix fixed-size large buffer on stack
    * Fix buffer overflow
    * Simplify
    * Add non-ASCII char rejection
* sigscheme/io.c
  - (ScmOp_read_char, ScmOp_peek_char):
    * Adapt to int-based char
    * Support multibyte char
* sigscheme/debug.c
  - (print_char):
    * Support multibyte char
    * Adapt to int-based char
    * Simplify
* sigscheme/operations.c
  - (ScmOp_charequalp, ScmOp_char_alphabeticp, ScmOp_char_numericp,
    ScmOp_char_whitespacep, ScmOp_char_upper_casep,
    ScmOp_char_lower_casep, ScmOp_char2integer, ScmOp_integer2char,
    ScmOp_char_upcase, ScmOp_char_downcase, ScmOp_make_string,
    ScmOp_string_ref, ScmOp_string_setd, ScmOp_string2list,
    ScmOp_list2string, ScmOp_string_filld): Adapt to int-based char
* sigscheme/TODO
  - Update


Modified: branches/r5rs/sigscheme/TODO
===================================================================
--- branches/r5rs/sigscheme/TODO	2005-11-30 01:15:42 UTC (rev 2279)
+++ branches/r5rs/sigscheme/TODO	2005-11-30 02:00:22 UTC (rev 2280)
@@ -66,7 +66,6 @@
 
 * Add integer->char and char->integer procedures with multibyte support
   - Write test
-  - Make character object int-based
 
 * Dynamic encoding switching for a conversion between string and char list,
   based on a SRFI or de facto standard API

Modified: branches/r5rs/sigscheme/debug.c
===================================================================
--- branches/r5rs/sigscheme/debug.c	2005-11-30 01:15:42 UTC (rev 2279)
+++ branches/r5rs/sigscheme/debug.c	2005-11-30 02:00:22 UTC (rev 2280)
@@ -308,51 +308,33 @@
 
 static void print_char(ScmObj port, ScmObj obj, enum OutputType otype)
 {
-    const ScmSpecialCharInfo *info = NULL;
-    char *lex_rep = NULL;
+    const ScmSpecialCharInfo *info;
+    int c;
 
-    /* sanity check */
-    if (SCM_CHAR_VALUE(obj) == NULL)
-        return;
-
+    c = SCM_CHAR_VALUE(obj);
     switch (otype) {
     case AS_WRITE:
-        lex_rep = SCM_CHAR_VALUE(obj);
-        /* single byte */
-        if (strlen(lex_rep) < 2) {
-            /* search in the Scm_special_char_table */
-            for (info = Scm_special_char_table; info->esc_seq; info++) {
-                if (SCM_CHAR_VALUE(obj)[0] == info->code) {
-                    SigScm_PortPrintf(port, "#\\%s", info->lex_rep);
-                    return;
-                }
+        SCM_PORT_PUTS(port, "#\\");
+        /* special chars */
+        for (info = Scm_special_char_table; info->esc_seq; info++) {
+            if (c == (int)info->code) {
+                SCM_PORT_PUTS(port, info->lex_rep);
+                return;
             }
-            /* not found in the table*/
-            if (isprint(SCM_CHAR_VALUE(obj)[0])) {
-                SigScm_PortPrintf(port, "#\\%s", SCM_CHAR_VALUE(obj));
-            } else {
-                /* convert to hexadecimal format */ 
-                lex_rep = (char*)malloc(sizeof(char) * 4);
-                snprintf(lex_rep, 4, "x%02x", (int)SCM_CHAR_VALUE(obj)[0]);
-                SigScm_PortPrintf(port, "#\\%s", lex_rep);
-                free(lex_rep);
-            }
-            return;
         }
-        /* multi byte */
-        SigScm_PortPrintf(port, "#\\%s", lex_rep);
-        break;
 
+        /* other control chars are printed in hexadecimal form */ 
+        if (isascii(c) && iscntrl(c)) {
+            SigScm_PortPrintf(port, "x%02x", c);
+            return;
+        }
+        /* FALLTHROUGH */
     case AS_DISPLAY:
-        /*
-         * in display, character objects appear in the reqpresentation as
-         * if writen by write-char instead of by write.
-         */
-        SCM_PORT_PRINT(port, SCM_CHAR_VALUE(obj));
+        SCM_PORT_PUT_CHAR(port, c);
         break;
 
     default:
-        ERR("print_char : unknown output type");
+        ERR("print_char: unknown output type");
         break;
     }
 }

Modified: branches/r5rs/sigscheme/encoding.c
===================================================================
--- branches/r5rs/sigscheme/encoding.c	2005-11-30 01:15:42 UTC (rev 2279)
+++ branches/r5rs/sigscheme/encoding.c	2005-11-30 02:00:22 UTC (rev 2280)
@@ -206,22 +206,6 @@
 /*=======================================
   Global Variables
 =======================================*/
-/* FIXME: merge into Scm_current_char_codec */
-ScmMultibyteCharInfo (*Scm_mb_scan_char)(ScmMultibyteString mbs)
-#if SCM_USE_UTF8_AS_DEFAULT
-    = utf8_scan_char;
-#elif SCM_USE_EUCCN_AS_DEFAULT
-    = euccn_scan_char;
-#elif SCM_USE_EUCJP_AS_DEFAULT
-    = eucjp_scan_char;
-#elif SCM_USE_EUCKR_AS_DEFAULT
-    = eucjp_scan_char;
-#elif SCM_USE_SJIS_AS_DEFAULT
-    = sjis_scan_char;
-#else
-    = unibyte_scan_char;
-#endif
-
 /* temporary solution */
 ScmCharCodec *Scm_current_char_codec
 #if SCM_USE_UTF8_AS_DEFAULT
@@ -251,7 +235,7 @@
           SCM_MBS_GET_SIZE(mbs), SCM_MBS_GET_STR(mbs)));
 
     while (SCM_MBS_GET_SIZE(mbs)) {
-        c = Scm_mb_scan_char(mbs);
+        c = SCM_CHARCODEC_SCAN_CHAR(Scm_current_char_codec, mbs);
         CDBG((SCM_DBG_ENCODING, "%d, %d;", SCM_MBCINFO_GET_SIZE(c), c.flag));
         SCM_MBS_SKIP_CHAR(mbs, c);
         len++;
@@ -280,14 +264,14 @@
     ret = mbs;
 
     while (i--) {
-        c = Scm_mb_scan_char(ret);
+        c = SCM_CHARCODEC_SCAN_CHAR(Scm_current_char_codec, ret);
         SCM_MBS_SKIP_CHAR(ret, c);
     }
 
     end = ret;
 
     while (len--) {
-        c = Scm_mb_scan_char(end);
+        c = SCM_CHARCODEC_SCAN_CHAR(Scm_current_char_codec, end);
         SCM_MBS_SKIP_CHAR(end, c);
     }
 
@@ -341,7 +325,7 @@
 #define IN_GR94(c) (0xA1 <= (uchar)(c) && (uchar)(c) <= 0xFE)
 #define IN_GR96(c) (0xA0 <= (uchar)(c) && (uchar)(c) <= 0xFF)
 
-#define IS_ASCII(c) ((uchar)(c) <= 0x7F)
+#define IS_ASCII(c) ((uint)(c) <= 0x7F)
 #define IS_GR_SPC_OR_DEL(c)  ((uchar)(c) == 0xA0 || (uchar)(c) == 0xFF)
 
 #define CHAR_BITS    8
@@ -424,8 +408,8 @@
         break;
 
     case 3:
-        ch  = src[0] << CHAR_BITS;
-        ch |= src[1] << CHAR_BITS * 2;
+        ch  = src[0] << CHAR_BITS * 2;
+        ch |= src[1] << CHAR_BITS;
         ch |= src[2];
         break;
 
@@ -621,10 +605,8 @@
 #define TRAILING_CODE_BITS  LEN_CODE_BITS(1)
 #define TRAILING_VAL_BITS   (CHAR_BITS - TRAILING_CODE_BITS)
 #define LEADING_VAL_BITS(n) (CHAR_BITS - LEN_CODE_BITS(n))
-#define LEADING_VAL(u, n)                                                    \
-    ((u) >> (LEN_CODE_BITS(n) + TRAILING_VAL_BITS * ((n) - 1)))
-#define TRAILING_VAL(u, n, i)                                                \
-    (~MASK(1) & ((u) >> (TRAILING_VAL_BITS * ((n) - 2 - (i)))))
+#define LEADING_VAL(u, n)   ((u) >> TRAILING_VAL_BITS * ((n) - 1))
+#define TRAILING_VAL(u, i)  (~MASK(1) & ((u) >> TRAILING_VAL_BITS * (i)))
 
 static const char *utf8_encoding(void)
 {
@@ -707,16 +689,16 @@
         *dst++ = ch;
     } else if (IN_OCT_BMP(ch)) {
         *dst++ = LEN_CODE(2) | LEADING_VAL(ch, 2);
-        *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 2, 0);
+        *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 0);
     } else if (IN_BMP(ch)) {
         *dst++ = LEN_CODE(3) | LEADING_VAL(ch, 3);
-        *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 3, 0);
-        *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 3, 1);
+        *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 1);
+        *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 0);
     } else if (IN_SMP(ch)) {
         *dst++ = LEN_CODE(4) | LEADING_VAL(ch, 4);
-        *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 4, 0);
-        *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 4, 1);
-        *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 4, 2);
+        *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 2);
+        *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 1);
+        *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 0);
     } else {
         return NULL;
     }

Modified: branches/r5rs/sigscheme/encoding.h
===================================================================
--- branches/r5rs/sigscheme/encoding.h	2005-11-30 01:15:42 UTC (rev 2279)
+++ branches/r5rs/sigscheme/encoding.h	2005-11-30 02:00:22 UTC (rev 2280)
@@ -153,7 +153,6 @@
 /*=======================================
    Variable Declarations
 =======================================*/
-extern ScmMultibyteCharInfo (*Scm_mb_scan_char)(ScmMultibyteString mbs);
 extern ScmCharCodec *Scm_current_char_codec;
 
 /*=======================================

Modified: branches/r5rs/sigscheme/io.c
===================================================================
--- branches/r5rs/sigscheme/io.c	2005-11-30 01:15:42 UTC (rev 2279)
+++ branches/r5rs/sigscheme/io.c	2005-11-30 02:00:22 UTC (rev 2280)
@@ -373,9 +373,7 @@
 ScmObj ScmOp_read_char(ScmObj args)
 {
     ScmObj port = SCM_INVALID;
-    /* FIXME: use int as char */
     int    ch;
-    char   buf[2];
     DECLARE_FUNCTION("read-char", ProcedureVariadic0);
 
     PREPARE_PORT(port, args, scm_current_input_port);
@@ -384,18 +382,13 @@
     if (ch == EOF)
         return SCM_EOF;
 
-    buf[0] = ch;
-    buf[1] = '\0';
-
-    return Scm_NewChar(strdup(buf));
+    return Scm_NewChar(ch);
 }
 
 ScmObj ScmOp_peek_char(ScmObj args)
 {
     ScmObj port = SCM_INVALID;
-    /* FIXME: use int as char */
     int    ch;
-    char   buf[2];
     DECLARE_FUNCTION("peek-char", ProcedureVariadic0);
 
     PREPARE_PORT(port, args, scm_current_input_port);
@@ -404,9 +397,7 @@
     if (ch == EOF)
         return SCM_EOF;
 
-    buf[0] = ch;
-    buf[1] = '\0';
-    return Scm_NewChar(strdup(buf));
+    return Scm_NewChar(ch);
 }
 
 ScmObj ScmOp_eof_objectp(ScmObj obj)

Modified: branches/r5rs/sigscheme/operations.c
===================================================================
--- branches/r5rs/sigscheme/operations.c	2005-11-30 01:15:42 UTC (rev 2279)
+++ branches/r5rs/sigscheme/operations.c	2005-11-30 02:00:22 UTC (rev 2280)
@@ -986,109 +986,80 @@
 ScmObj ScmOp_charequalp(ScmObj ch1, ScmObj ch2)
 {
     DECLARE_FUNCTION("char=?", ProcedureFixed2);
+
     ASSERT_CHARP(ch1);
     ASSERT_CHARP(ch2);
 
-    if (strcmp(SCM_CHAR_VALUE(ch1), SCM_CHAR_VALUE(ch2)) == 0)
-        return SCM_TRUE;
-
-    return SCM_FALSE;
+    return (SCM_CHAR_VALUE(ch1) == SCM_CHAR_VALUE(ch2)) ? SCM_TRUE : SCM_FALSE;
 }
 
 ScmObj ScmOp_char_alphabeticp(ScmObj obj)
 {
+    int ch;
     DECLARE_FUNCTION("char-alphabetic?", ProcedureFixed1);
+
     ASSERT_CHARP(obj);
 
-    /* check multibyte */
-    if (strlen(SCM_CHAR_VALUE(obj)) != 1)
-        return SCM_FALSE;
+    ch = SCM_CHAR_VALUE(obj);
 
-    /* check alphabet */
-    if (isalpha(SCM_CHAR_VALUE(obj)[0]) != 0)
-        return SCM_TRUE;
-
-    return SCM_FALSE;
+    return (isascii(ch) && isalpha(ch)) ? SCM_TRUE : SCM_FALSE;
 }
 
 ScmObj ScmOp_char_numericp(ScmObj obj)
 {
+    int ch;
     DECLARE_FUNCTION("char-numeric?", ProcedureFixed1);
+
     ASSERT_CHARP(obj);
 
-    /* check multibyte */
-    if (strlen(SCM_CHAR_VALUE(obj)) != 1)
-        return SCM_FALSE;
+    ch = SCM_CHAR_VALUE(obj);
 
-    /* check digit */
-    if (isdigit(SCM_CHAR_VALUE(obj)[0]) != 0)
-        return SCM_TRUE;
-
-    return SCM_FALSE;
+    return (isascii(ch) && isdigit(ch)) ? SCM_TRUE : SCM_FALSE;
 }
 
 ScmObj ScmOp_char_whitespacep(ScmObj obj)
 {
+    int ch;
     DECLARE_FUNCTION("char-whitespace?", ProcedureFixed1);
+
     ASSERT_CHARP(obj);
 
-    /* check multibyte */
-    if (strlen(SCM_CHAR_VALUE(obj)) != 1)
-        return SCM_FALSE;
+    ch = SCM_CHAR_VALUE(obj);
 
-    /* check space */
-    if (isspace(SCM_CHAR_VALUE(obj)[0]) != 0)
-        return SCM_TRUE;
-
-    return SCM_FALSE;
+    return (isascii(ch) && isspace(ch)) ? SCM_TRUE : SCM_FALSE;
 }
 
 ScmObj ScmOp_char_upper_casep(ScmObj obj)
 {
+    int ch;
     DECLARE_FUNCTION("char-upper-case?", ProcedureFixed1);
+
     ASSERT_CHARP(obj);
 
-    /* check multibyte */
-    if (strlen(SCM_CHAR_VALUE(obj)) != 1)
-        return SCM_FALSE;
+    ch = SCM_CHAR_VALUE(obj);
 
-    /* check uppercase */
-    if (isupper(SCM_CHAR_VALUE(obj)[0]) != 0)
-        return SCM_TRUE;
-
-    return SCM_FALSE;
+    return (isascii(ch) && isupper(ch)) ? SCM_TRUE : SCM_FALSE;
 }
 
 ScmObj ScmOp_char_lower_casep(ScmObj obj)
 {
+    int ch;
     DECLARE_FUNCTION("char-lower-case?", ProcedureFixed1);
+
     ASSERT_CHARP(obj);
 
-    /* check multibyte */
-    if (strlen(SCM_CHAR_VALUE(obj)) != 1)
-        return SCM_FALSE;
+    ch = SCM_CHAR_VALUE(obj);
 
-    /* check lowercase */
-    if (islower(SCM_CHAR_VALUE(obj)[0]) != 0)
-        return SCM_TRUE;
-
-    return SCM_FALSE;
+    return (isascii(ch) && islower(ch)) ? SCM_TRUE : SCM_FALSE;
 }
 
 ScmObj ScmOp_char2integer(ScmObj obj)
 {
-    int val;
-    const char *str;
     DECLARE_FUNCTION("char->integer", ProcedureFixed1);
 
     ASSERT_CHARP(obj);
 
-    str = SCM_CHAR_VALUE(obj);
-    val = SCM_CHARCODEC_STR2INT(Scm_current_char_codec, str, strlen(str),
-                                SCM_MB_STATELESS);
-    if (val == EOF)
-        ERR_OBJ("invalid char value", obj);
-    return Scm_NewInt(val);
+    return Scm_NewInt(SCM_CHAR_VALUE(obj));
 }
 
 ScmObj ScmOp_integer2char(ScmObj obj)
@@ -1102,36 +1073,34 @@
     val = SCM_INT_VALUE(obj);
     if (!SCM_CHARCODEC_INT2STR(Scm_current_char_codec, buf, val, SCM_MB_STATELESS))
         ERR_OBJ("invalid char value", obj);
-    return Scm_NewChar(strdup(buf));
+    return Scm_NewChar(val);
 }
 
 ScmObj ScmOp_char_upcase(ScmObj obj)
 {
+    int ch;
     DECLARE_FUNCTION("char-upcase", ProcedureFixed1);
+
     ASSERT_CHARP(obj);
 
-    /* check multibyte */
-    if (strlen(SCM_CHAR_VALUE(obj)) != 1)
-        return obj;
+    ch = SCM_CHAR_VALUE(obj);
+    if (isascii(ch))
+        SCM_CHAR_SET_VALUE(obj, toupper(ch));
 
-    /* to upcase */
-    SCM_CHAR_VALUE(obj)[0] = toupper(SCM_CHAR_VALUE(obj)[0]);
-
     return obj;
 }
 
 ScmObj ScmOp_char_downcase(ScmObj obj)
 {
+    int ch;
     DECLARE_FUNCTION("char-downcase", ProcedureFixed1);
+
     ASSERT_CHARP(obj);
 
-    /* check multibyte */
-    if (strlen(SCM_CHAR_VALUE(obj)) != 1)
-        return obj;
+    ch = SCM_CHAR_VALUE(obj);
+    if (isascii(ch))
+        SCM_CHAR_SET_VALUE(obj, tolower(ch));
 
-    /* to upcase */
-    SCM_CHAR_VALUE(obj)[0] = tolower(SCM_CHAR_VALUE(obj)[0]);
-
     return obj;
 }
 
@@ -1146,13 +1115,9 @@
 
 ScmObj ScmOp_make_string(ScmObj length, ScmObj args)
 {
-    int len = 0;
-    int i;
-    int fillstr_size = 0;
+    int filler_val, len, i;
     ScmObj filler = SCM_FALSE;
-    const char *fillstr = NULL;
-    char *new_str = NULL;
-    char *p = NULL;
+    ScmObj sport  = SCM_FALSE;
     DECLARE_FUNCTION("make-string", ProcedureVariadic1);
 
     ASSERT_INTP(length);
@@ -1162,26 +1127,24 @@
     if (len < 0)
         ERR_OBJ("length must be a positive integer", length);
 
-    /* extract fillstr */
+    /* extract filler */
     if (NO_MORE_ARG(args)) {
-        fillstr = " ";
+        filler_val = ' ';
     } else {
         filler = POP_ARG(args);
         ASSERT_NO_MORE_ARG(args);
         ASSERT_CHARP(filler);
-        fillstr = SCM_CHAR_VALUE(filler);
+        filler_val = SCM_CHAR_VALUE(filler);
     }
 
-    /* fill string */
-    fillstr_size = strlen(fillstr);
-    new_str = (char*)malloc(sizeof(char) * fillstr_size * len + 1);
-    for (i = 0, p = new_str; i < len; i++) {
-        strcpy(p, fillstr);
-        p += fillstr_size;
+    /* TODO: make efficient */
+    /* fill string (multibyte-ready) */
+    sport = ScmOp_SRFI6_open_output_string();
+    for (i = 0; i < len; i++) {
+        SCM_PORT_PUT_CHAR(sport, filler_val);
     }
-    new_str[fillstr_size * len] = '\0';
 
-    return Scm_NewMutableString(new_str);
+    return ScmOp_SRFI6_get_output_string(sport);
 }
 
 ScmObj ScmOp_string(ScmObj args)
@@ -1200,30 +1163,29 @@
 ScmObj ScmOp_string_ref(ScmObj str, ScmObj k)
 {
     int   c_index = 0;
-    char *new_ch  = NULL;
+    int   ch;
     ScmMultibyteString mbs;
     DECLARE_FUNCTION("string-ref", ProcedureFixed2);
 
     ASSERT_STRINGP(str);
     ASSERT_INTP(k);
 
-    SCM_MBS_INIT(mbs);
-    /* get start_ptr and end_ptr */
     c_index = SCM_INT_VALUE(k);
     if (c_index < 0 || SCM_STRING_LEN(str) <= c_index)
         ERR_OBJ("index out of range", k);
-    SCM_MBS_SET_STR(mbs, SCM_STRING_STR(str));
 
-    /* FIXME: This strlen() can be eliminated. */
+    SCM_MBS_INIT(mbs);
+    SCM_MBS_SET_STR(mbs, SCM_STRING_STR(str));
     SCM_MBS_SET_SIZE(mbs, strlen(SCM_STRING_STR(str)));
     mbs = Scm_mb_strref(mbs, c_index);
 
-    /* copy from start_ptr to end_ptr */
-    new_ch = (char*)malloc(SCM_MBS_GET_SIZE(mbs) + 1);
-    memcpy(new_ch, SCM_MBS_GET_STR(mbs), SCM_MBS_GET_SIZE(mbs));
-    new_ch[SCM_MBS_GET_SIZE(mbs)] = '\0';
+    /* FIXME: support stateful encoding */
+    ch = SCM_CHARCODEC_STR2INT(Scm_current_char_codec, SCM_MBS_GET_STR(mbs),
+                               SCM_MBS_GET_SIZE(mbs), SCM_MB_STATELESS);
+    if (ch == EOF)
+        ERR("string-ref: invalid char sequence");
 
-    return Scm_NewChar(new_ch);
+    return Scm_NewChar(ch);
 }
 
 ScmObj ScmOp_string_setd(ScmObj str, ScmObj k, ScmObj ch)
@@ -1236,6 +1198,8 @@
     char *new_str  = NULL;
     ScmMultibyteString mbs;
     const char *string_str   = NULL;
+    char new_ch_str[SCM_MB_MAX_LEN + sizeof((char)'\0')];
+    const char *next;
     DECLARE_FUNCTION("string-set!", ProcedureFixed3);
 
     ASSERT_STRINGP(str);
@@ -1251,22 +1215,27 @@
     /* FIXME: can string_str be NULL at this point or not? */
     if (!string_str) string_str = "";
 
-    /* FIXME: strlen() can be eliminiated. */
     SCM_MBS_INIT(mbs);
     SCM_MBS_SET_STR(mbs, string_str);
     SCM_MBS_SET_SIZE(mbs, strlen(string_str));
     mbs = Scm_mb_strref(mbs, c_start_index);
 
+    /* FIXME: support stateful encoding */
+    next = SCM_CHARCODEC_INT2STR(Scm_current_char_codec, new_ch_str,
+                                 SCM_CHAR_VALUE(ch), SCM_MB_STATELESS);
+    if (!next)
+        ERR_OBJ("invalid char in", str);
+
     /* calculate total size */
     prefix_size = SCM_MBS_GET_STR(mbs) - string_str;
-    newch_size  = strlen(SCM_CHAR_VALUE(ch));
+    newch_size  = next - new_ch_str;
     postfix_size  = strlen(SCM_MBS_GET_STR(mbs) + SCM_MBS_GET_SIZE(mbs));
     total_size = prefix_size + newch_size + postfix_size;
 
     /* copy each part */
     new_str = (char*)malloc(total_size + 1);
     memcpy(new_str, string_str, prefix_size);
-    memcpy(new_str+prefix_size, SCM_CHAR_VALUE(ch), newch_size);
+    memcpy(new_str+prefix_size, new_ch_str, newch_size);
     memcpy(new_str+prefix_size+newch_size,
            SCM_MBS_GET_STR(mbs)+SCM_MBS_GET_SIZE(mbs), postfix_size);
 
@@ -1376,8 +1345,9 @@
     ScmObj tail = SCM_NULL;
     ScmObj next = SCM_NULL;
     ScmMultibyteString mbs;
-    ScmMultibyteCharInfo ch;
-    char *buf;
+    ScmMultibyteCharInfo mbc;
+    ScmMultibyteState state;
+    int ch;
     DECLARE_FUNCTION("string->list", ProcedureFixed1);
 
     ASSERT_STRINGP(str);
@@ -1387,12 +1357,19 @@
     SCM_MBS_SET_SIZE(mbs, strlen(SCM_STRING_STR(str)));
 
     while (SCM_MBS_GET_SIZE(mbs)) {
-        ch = Scm_mb_scan_char(mbs);
-        buf = malloc(SCM_MBCINFO_GET_SIZE(ch)+1);
-        memcpy(buf, SCM_MBS_GET_STR(mbs), SCM_MBCINFO_GET_SIZE(ch));
-        buf[SCM_MBCINFO_GET_SIZE(ch)] = 0;
-        next = LIST_1(Scm_NewChar(buf));
+        state = SCM_MBS_GET_STATE(mbs);
+        mbc = SCM_CHARCODEC_SCAN_CHAR(Scm_current_char_codec, mbs);
+        if (SCM_MBCINFO_ERRORP(mbc) || SCM_MBCINFO_INCOMPLETEP(mbc))
+            ERR("string->list: invalid char sequence");
+        ch = SCM_CHARCODEC_STR2INT(Scm_current_char_codec,
+                                   SCM_MBS_GET_STR(mbs),
+                                   SCM_MBCINFO_GET_SIZE(mbc),
+                                   state);
+        if (ch == EOF)
+            ERR("string->list: invalid char sequence");
 
+        next = LIST_1(Scm_NewChar(ch));
+
         if (NULLP(tail))
             head = tail = next;
         else {
@@ -1400,7 +1377,7 @@
             tail = CDR(tail);
         }
 
-        SCM_MBS_SKIP_CHAR(mbs, ch);
+        SCM_MBS_SKIP_CHAR(mbs, mbc);
     }
 
     return head;
@@ -1408,44 +1385,25 @@
 
 ScmObj ScmOp_list2string(ScmObj lst)
 {
-    int len = 0;
-    int total_size = 0;
-    ScmObj chars   = SCM_NULL;
-    ScmObj obj     = SCM_NULL;
-    char  *new_str = NULL;
-    char  *ch      = NULL;
-    char  *p       = NULL;
+    ScmObj rest, ch, sport;
     DECLARE_FUNCTION("list->string", ProcedureFixed1);
 
-    if (FALSEP(ScmOp_listp(lst)))
-        ERR_OBJ("list required but got", lst);
+    ASSERT_LISTP(lst);
 
     if (NULLP(lst))
         return Scm_NewMutableStringCopying("");
 
-    /* count total size of the string */
-    for (chars = lst; !NULLP(chars); chars = CDR(chars)) {
-        obj = CAR(chars);
-        ASSERT_CHARP(obj);
-
-        total_size += strlen(SCM_CHAR_VALUE(obj));
+    /* TODO: make efficient */
+    sport = ScmOp_SRFI6_open_output_string();
+    for (rest = lst; CONSP(rest); rest = CDR(rest)) {
+        ch = CAR(rest);
+        ASSERT_CHARP(ch);
+        SCM_PORT_PUT_CHAR(sport, SCM_CHAR_VALUE(ch));
     }
+    if (!NULLP(rest))
+        ERR_OBJ("invalid char list", lst);
 
-    /* allocate new string */
-    new_str = (char*)malloc(sizeof(char) * total_size + 1);
-
-    /* copy char by char */
-    p = new_str;
-    for (chars = lst; !NULLP(chars); chars = CDR(chars)) {
-        obj = CAR(chars);
-        ch  = SCM_CHAR_VALUE(obj);
-        len = strlen(SCM_CHAR_VALUE(obj));
-
-        strcpy(p, ch);
-        p += len;
-    }
-
-    return Scm_NewMutableString(new_str);
+    return ScmOp_SRFI6_get_output_string(sport);
 }
 
 ScmObj ScmOp_string_copy(ScmObj str)
@@ -1461,23 +1419,26 @@
     int  str_len   = 0;
     char *new_str  = NULL;
     char *p        = NULL;
-    int   i        = 0;
+    char ch_str[SCM_MB_MAX_LEN + sizeof((char)'\0')];
+    const char *next;
     DECLARE_FUNCTION("string-fill!", ProcedureFixed2);
 
     ASSERT_STRINGP(str);
     ASSERT_MUTABLEP(str);
     ASSERT_CHARP(ch);
 
+    /* FIXME: support stateful encoding */
+    next = SCM_CHARCODEC_INT2STR(Scm_current_char_codec, ch_str,
+                                 SCM_CHAR_VALUE(ch), SCM_MB_STATELESS);
+    if (!next)
+        ERR_OBJ("invalid char in", str);
+
     /* create new str */
-    char_size = strlen(SCM_CHAR_VALUE(ch));
+    char_size = next - ch_str;
     str_len   = SCM_STRING_LEN(str);
-    new_str   = (char*)realloc(SCM_STRING_STR(str),
-                               sizeof(char) * str_len * char_size + 1);
-    for (i = 0, p = new_str; i < char_size * str_len;) {
-        strcpy(p, SCM_CHAR_VALUE(ch));
-
-        p += char_size;
-        i += char_size;
+    new_str   = realloc(SCM_STRING_STR(str), str_len * char_size + 1);
+    for (p = new_str; p < &new_str[char_size * str_len]; p += char_size) {
+        strcpy(p, ch_str);
     }
 
     SCM_STRING_SET_STR(str, new_str);

Modified: branches/r5rs/sigscheme/read.c
===================================================================
--- branches/r5rs/sigscheme/read.c	2005-11-30 01:15:42 UTC (rev 2279)
+++ branches/r5rs/sigscheme/read.c	2005-11-30 02:00:22 UTC (rev 2280)
@@ -77,6 +77,11 @@
 /*=======================================
   File Local Macro Declarations
 =======================================*/
+/* can accept "backspace" of R5RS and "U0010FFFF" of SRFI-75 */
+#define CHAR_LITERAL_LEN_MAX (sizeof("backspace") - sizeof((char)'\0'))
+
+#define INITIAL_STRING_BUF_SIZE 1024
+
 /* Compatible with isspace(3). Use this to prevent incorrect space handlings */
 #define CASE_ISSPACE                                                         \
     case ' ': case '\t': case '\n': case '\r': case '\v': case '\f'
@@ -329,10 +334,30 @@
 
 static ScmObj read_char(ScmObj port)
 {
-    char *ch = read_char_sequence(port);
-    const ScmSpecialCharInfo *info = Scm_special_char_table;
+    int c;
+    char *ch;
+    const ScmSpecialCharInfo *info;
     char *first_nondigit = NULL;
 
+    /* TODO: reorganize with read_char_sequence() */
+    /* non-ascii char (multibyte-ready) */
+    c = SCM_PORT_PEEK_CHAR(port);
+    if (!isascii(c)) {
+        DISCARD_LOOKAHEAD(port);
+        switch (SCM_PORT_PEEK_CHAR(port)) {
+        case '(': case ')': case '\"': case '\'': case ';':
+        CASE_ISSPACE:
+        case EOF:
+            /* properly delimited */
+            return Scm_NewChar(c);
+
+        default:
+            ERR("invalid character literal");
+        }
+    }
+
+    ch = read_char_sequence(port);
+
     CDBG((SCM_DBG_PARSER, "read_char : ch = %s", ch));
 
     /* check #\x<x><x> style character where <x> is a hexadecimal
@@ -341,48 +366,53 @@
     if (ch && ch[0] == 'x' && 1 < strlen(ch)) {
         if (strlen(ch) != 3)
             SigScm_Error("invalid hexadecimal character form. should be #\\x<x><x>\n");
-        ch[0] = (char)strtol(ch + 1, &first_nondigit, 16);
+        c = strtol(ch + 1, &first_nondigit, 16);
         if (*first_nondigit)
             SigScm_Error("invalid hexadecimal character form. should be #\\x<x><x>\n");
-        ch[1] = '\0';
     } else {
         /* check special sequence */
-        for (; info->esc_seq; info++) {
+        for (info = Scm_special_char_table; info->esc_seq; info++) {
             if (strcmp(ch, info->lex_rep) == 0) {
-                ch[0] = info->code;
-                ch[1] = '\0';
+                c = info->code;
                 break;
             }
         }
     }
-
-    return Scm_NewChar(ch);
+    free(ch);
+    return Scm_NewChar(c);
 }
 
+/* FIXME: extend buffer on demand */
 static ScmObj read_string(ScmObj port)
 {
-    char  stringbuf[1024]; /* FIXME! */
-    int   stringlen = 0;
-    int   c = 0;
-    const ScmSpecialCharInfo *info = NULL;
-    int found = 0;
+    ScmObj obj;
+    const ScmSpecialCharInfo *info;
+    int c;
+    size_t bufsize;
+    char *p, *buf;
+    char autobuf[INITIAL_STRING_BUF_SIZE];
 
     CDBG((SCM_DBG_PARSER, "read_string"));
 
-    while (1) {
-        SCM_PORT_GETC(port, c);
+    buf = autobuf;
+    bufsize = sizeof(autobuf);
+    for (p = buf; p < &buf[bufsize];) {
+        c = SCM_PORT_GET_CHAR(port);
 
         CDBG((SCM_DBG_PARSER, "read_string c = %c", c));
 
         switch (c) {
         case EOF:
-            stringbuf[stringlen] = '\0';
-            SigScm_Error("EOF in the string : str = %s", stringbuf);
+            *p = '\0';
+            ERR("EOF in string: \"%s<eof>", buf);
             break;
 
         case '\"':
-            stringbuf[stringlen] = '\0';
-            return Scm_NewImmutableStringCopying(stringbuf);
+            *p = '\0';
+            obj = Scm_NewImmutableStringCopying(buf);
+            if (buf != autobuf)
+                free(buf);
+            return obj;
 
         case '\\':
             /*
@@ -390,25 +420,29 @@
              * A double quote can be written inside a string only by
              * escaping it with a backslash (\).
              */
-            SCM_PORT_GETC(port, c);
-            found = 0;
+            c = SCM_PORT_GET_CHAR(port);
             for (info = Scm_special_char_table; info->esc_seq; info++) {
                 if (strlen(info->esc_seq) == 2 && c == info->esc_seq[1]) {
-                    stringbuf[stringlen++] = info->code;
-                    found = 1;
-                    break;
+                    *p++ = info->code;
+                    goto found;
                 }
             }
-            if (found == 0)
-                SigScm_Error("\\%c in a string causes invalid escape sequence", c);
+            ERR("invalid escape sequence in string: \\%c", c);
+        found:
             break;
 
         default:
-            stringbuf[stringlen] = c;
-            stringlen++;
+            /* FIXME: support stateful encoding */
+            p = SCM_CHARCODEC_INT2STR(Scm_current_char_codec,
+                                      p, c, SCM_MB_STATELESS);
+            if (!p)
+                ERR("invalid char in string: 0x%x", c);
             break;
         }
     }
+    buf[bufsize - 1] = '\0';
+    ERR("too long string: \"%s\"", buf);
+    /* NOTREACHED */
 }
 
 static ScmObj read_symbol(ScmObj port)
@@ -485,42 +519,40 @@
 
 static char *read_char_sequence(ScmObj port)
 {
-    char  stringbuf[1024];  /* FIXME! */
-    int   stringlen = 0;
-    int   c = 0;
-    char *dst = NULL;
+    int c;
+    size_t len;
+    char buf[CHAR_LITERAL_LEN_MAX + sizeof((char)'\0')];
+    DECLARE_INTERNAL_FUNCTION("read_char_sequence");
 
-    while (1) {
+    for (len = 0; len <= CHAR_LITERAL_LEN_MAX; len++) {
         c = SCM_PORT_PEEK_CHAR(port);
+        if (!isascii(c))
+            ERR("non-ASCII char in char sequence: 0x%x", c);
 
         CDBG((SCM_DBG_PARSER, "c = %c", c));
 
         switch (c) {
         case EOF:
-            stringbuf[stringlen] = '\0';
-            SigScm_Error("EOF in the char sequence : char = %s", stringbuf);
-            break;
+            buf[len] = '\0';
+            ERR("EOF in char sequence: %s", buf);
+            /* NOTREACHED */
 
         case '(': case ')': case '\"': case '\'': case ';':
         CASE_ISSPACE:
-            /* pass through first char */
-            if (stringlen == 0) {
-                DISCARD_LOOKAHEAD(port);
-                stringbuf[stringlen++] = (char)c;
-                break;
+            if (len) {
+                /* appeared as delimiter */
+                buf[len] = '\0';
+                return strdup(buf);
             }
-            /* return buf */
-            SCM_PORT_UNGETC(port, c);
-            stringbuf[stringlen] = '\0';
-            dst = strdup(stringbuf);
-            return dst;
-
+            /* FALLTHROUGH */
         default:
             DISCARD_LOOKAHEAD(port);
-            stringbuf[stringlen++] = (char)c;
+            buf[len] = (char)c;
             break;
         }
     }
+    ERR("invalid char sequence");
+    /* NOTREACHED */
 }
 
 static ScmObj read_quote(ScmObj port, ScmObj quoter)

Modified: branches/r5rs/sigscheme/sigscheme.h
===================================================================
--- branches/r5rs/sigscheme/sigscheme.h	2005-11-30 01:15:42 UTC (rev 2279)
+++ branches/r5rs/sigscheme/sigscheme.h	2005-11-30 02:00:22 UTC (rev 2280)
@@ -382,7 +382,7 @@
 ScmObj Scm_NewCons(ScmObj a, ScmObj b);
 ScmObj Scm_NewInt(int val);
 ScmObj Scm_NewSymbol(char *name, ScmObj v_cell);
-ScmObj Scm_NewChar(char *ch);
+ScmObj Scm_NewChar(int val);
 ScmObj Scm_NewString(char *str, int is_immutable);
 ScmObj Scm_NewImmutableString(char *str);
 ScmObj Scm_NewImmutableStringCopying(const char *str);

Modified: branches/r5rs/sigscheme/sigschemetype-compact.h
===================================================================
--- branches/r5rs/sigscheme/sigschemetype-compact.h	2005-11-30 01:15:42 UTC (rev 2279)
+++ branches/r5rs/sigscheme/sigschemetype-compact.h	2005-11-30 02:00:22 UTC (rev 2280)
@@ -530,8 +530,8 @@
 #define SCM_C_FUNCPOINTER_VALUE(a)          (SCM_WORD_CAST(ScmCFunc, SCM_CAR_GET_VALUE_AS_PTR(a)))
 #define SCM_C_FUNCPOINTER_SET_VALUE(a, val) (SCM_CAR_SET_VALUE_AS_PTR((a), SCM_WORD_CAST(ScmObj, (val))))
 
-#define SCM_CHAR_VALUE(a)         (SCM_PRIMARY_GET_VALUE_AS_STR((a), ~SCM_TAG_IMM_MASK_CHAR))
-#define SCM_CHAR_SET_VALUE(a, ch) (SCM_PRIMARY_SET_VALUE_AS_STR((a), (ch), SCM_TAG_IMM_CHAR))
+#define SCM_CHAR_VALUE(a)         (SCM_PRIMARY_GET_VALUE_AS_INT((a), ~SCM_TAG_IMM_MASK_CHAR))
+#define SCM_CHAR_SET_VALUE(a, ch) (SCM_PRIMARY_SET_VALUE_AS_INT((a), (ch), SCM_TAG_IMM_CHAR))
 
 /*
  * Integer need to preserve 'singed' or 'unsigned', so need special accessor.

Modified: branches/r5rs/sigscheme/sigschemetype.h
===================================================================
--- branches/r5rs/sigscheme/sigschemetype.h	2005-11-30 01:15:42 UTC (rev 2279)
+++ branches/r5rs/sigscheme/sigschemetype.h	2005-11-30 02:00:22 UTC (rev 2280)
@@ -160,8 +160,6 @@
     SCM_STR_MUTABLE             = 1
 };
 
-#define SCM_CHARCELL_SIZE 8
-
 /* Scheme Object */
 struct ScmCell_ {
     enum ScmObjType type;
@@ -183,12 +181,7 @@
         } symbol;
 
         struct {
-#if 0
-            /* placeholdler for future optimization */
-            unsigned char seq[SCM_CHARCELL_SIZE];
-#else
-            char *ch;
-#endif
+            int value;
         } ch;
 
         struct {
@@ -291,8 +284,8 @@
 
 #define SCM_CHARP(a) (SCM_TYPE(a) == ScmChar)
 #define SCM_ENTYPE_CHAR(a) (SCM_ENTYPE((a), ScmChar))
-#define SCM_CHAR_VALUE(a) (SCM_AS_CHAR(a)->obj.ch.ch)
-#define SCM_CHAR_SET_VALUE(a, chr) (SCM_CHAR_VALUE(a) = (chr))
+#define SCM_CHAR_VALUE(a) (SCM_AS_CHAR(a)->obj.ch.value)
+#define SCM_CHAR_SET_VALUE(a, val) (SCM_CHAR_VALUE(a) = (val))
 
 /* String Object uses tagged pointer for packing mutation type.
  * LSB of ScmCell.obj.string.str is used to represent mutation type

Modified: branches/r5rs/sigscheme/storage-gc.c
===================================================================
--- branches/r5rs/sigscheme/storage-gc.c	2005-11-30 01:15:42 UTC (rev 2279)
+++ branches/r5rs/sigscheme/storage-gc.c	2005-11-30 02:00:22 UTC (rev 2280)
@@ -548,16 +548,12 @@
 #else /* SCM_OBJ_COMPACT */
     /* if the type has the pointer to free, then free it! */
     switch (SCM_TYPE(obj)) {
-    case ScmInt:
     case ScmCons:
+    case ScmInt:
+    case ScmChar:
     case ScmClosure:
         break;
 
-    case ScmChar:
-        if (SCM_CHAR_VALUE(obj))
-            free(SCM_CHAR_VALUE(obj));
-        break;
-
     case ScmString:
         if (SCM_STRING_STR(obj))
             free(SCM_STRING_STR(obj));

Modified: branches/r5rs/sigscheme/storage.c
===================================================================
--- branches/r5rs/sigscheme/storage.c	2005-11-30 01:15:42 UTC (rev 2279)
+++ branches/r5rs/sigscheme/storage.c	2005-11-30 02:00:22 UTC (rev 2280)
@@ -45,13 +45,6 @@
 #include "sigschemeinternal.h"
 #include "encoding.h"
 
-#if !SCM_OBJ_COMPACT
-#if (SCM_CHARCELL_SIZE <= SCM_MB_MAX_LEN)
-#error
-#error "SCM_MB_MAX_LEN is exceeded design limit"
-#endif
-#endif /* !SCM_OBJ_COMPACT */
-
 /*=======================================
   File Local Struct Declarations
 =======================================*/
@@ -185,19 +178,12 @@
     return obj;
 }
 
-ScmObj Scm_NewChar(char *ch)
+ScmObj Scm_NewChar(int val)
 {
     ScmObj obj = SigScm_NewObjFromHeap();
-    int len;
 
-    len = Scm_mb_bare_c_strlen(ch);
-    if (len > SCM_MB_MAX_LEN) {
-        SigScm_Error("Scm_NewChar : invalid character ch = [%s], len = %d",
-                     ch, len);
-    }
-
     SCM_ENTYPE_CHAR(obj);
-    SCM_CHAR_SET_VALUE(obj, ch);
+    SCM_CHAR_SET_VALUE(obj, val);
 
     return obj;
 }

Modified: branches/r5rs/sigscheme/test-compact.c
===================================================================
--- branches/r5rs/sigscheme/test-compact.c	2005-11-30 01:15:42 UTC (rev 2279)
+++ branches/r5rs/sigscheme/test-compact.c	2005-11-30 02:00:22 UTC (rev 2280)
@@ -296,6 +296,7 @@
     return obj;
 }
 
+/* FIXME: follow int-based char */
 ScmObj Scm_CheckChar(char *ch)
 {
 #define SCM_MB_MAX_LEN 4



More information about the uim-commit mailing list