[uim-commit] r1858 - branches/r5rs/sigscheme

jun0 at freedesktop.org jun0 at freedesktop.org
Mon Oct 17 23:45:25 PDT 2005


Author: jun0
Date: 2005-10-17 23:45:18 -0700 (Mon, 17 Oct 2005)
New Revision: 1858

Modified:
   branches/r5rs/sigscheme/config.h
   branches/r5rs/sigscheme/datas.c
   branches/r5rs/sigscheme/debug.c
   branches/r5rs/sigscheme/encoding.c
   branches/r5rs/sigscheme/operations-siod.c
   branches/r5rs/sigscheme/operations.c
   branches/r5rs/sigscheme/sigscheme.h
   branches/r5rs/sigscheme/sigschemeinternal.h
   branches/r5rs/sigscheme/sigschemetype.h
Log:
Adds proper multibyte encoding support.

* sigscheme/config.h
  - (SCM_USE_EUCJP, SCM_USE_SJIS, SCM_USE_UTF8, SCM_USE_EUCCN,
    SCM_USE_EUCKR): New macros.
  - (SCM_STRICT_ENCODING_CHECK, SCM_DEBUG_ENCODING): New macro.

* sigscheme/encoding.c
  - sigschemeinternal.h: New include.
  - (eucjp_scan_char, sjis_scan_char, euccn_scan_char,
    euckr_scan_char, utf8_scan_char, unibyte_scan_char): New
    functions.
  - (uchar): New type.
  - (Scm_mb_scan_char): New global variable.
  - (eucjp_strlen, eucjp_str_startpos, eucjp_str_endpos): Remove.
  - (SigScm_default_encoding_strlen): Rename to Scm_mb_strlen.
  - (Scm_mb_bare_c_strlen): New function.  Better name wanted.
  - (Scm_mb_substring): New function.
  - (ENTER, RETURN, RETURN_ERROR, RETURN_INCOMPLETE, SAVE_STATE,
    EXPECT_SIZE, IN_CL, IN_CR, IN_GL94, IN_GL96, IN_GR94, IN_GR96,
    IS_ASCII, ESC, SO, SI, SS2, SS3): New macros.
  - (iso2022kr_scan_char, iso2022kr_scan_input_char,
    iso2022jp_scan_char, iso2022jp_scan_input_char): Declared, but not
    implemented.

* sigscheme/operations.c
  - (Scm_mb_bare_c_strlen): Follow renaming in encoding.c.
  - (ScmOp_string_ref, ScmOp_string_set, ScmOp_string_substring,
    ScmOp_string2list): Use the new multibyte stuff.

* sigscheme/datas.c
  - (Scm_NewChar, Scm_NewString, Scm_NewStringCopying): Use the new
    multibyte stuff.

* sigscheme/sigscheme.h
  - (SCM_DBG_ENCODING): New enum constant.
  - (SigScm_default_encoding_strlen,
    SigScm_default_encoding_str_startpos,
    SigScm_default_encoding_str_endpos): Remove prototypes.
  - (Scm_mb_strlen, Scm_mb_bare_c_strlen, Scm_mb_scan_char): New declarations.
  - (Scm_mb_strref): New macro. (wrapper for Scm_mb_substring)

* sigscheme/sigschemetype.h
  - (ScmMultibyteState, ScmMultibyteCharInfo, ScmMultibyteString): New types.
  - (SCM_MBS_SET_STR, SCM_MBS_GET_STR, SCM_MBS_SET_SIZE,
    SCM_MBS_GET_SIZE, SCM_MBCINFO_SET_SIZE, SCM_MBCINFO_GET_SIZE,
    SCM_MBCINFO_CLEAR_STATE, SCM_MBCINFO_SET_STATE, SCM_MBCINFO_GET_STATE,
    SCM_MBCINFO_CLEAR_FLAG, SCM_MBCINFO_SET_ERROR,
    SCM_MBCINFO_SET_INCOMPLETE, SCM_MBCINFO_ERRORP,
    SCM_MBCINFO_INCOMPLETEP, SCM_MBCINFO_INIT, SCM_MBS_GET_STATE,
    SCM_MBS_SET_STATE, SCM_MBS_CLEAR_STATE, SCM_MBS_GET_STATE,
    SCM_MBS_SET_STATE, SCM_MBS_CLEAR_STATE, SCM_MBS_INIT,
    SCM_MBS_SKIP_CHAR): New macros.

* sigscheme/debug.c
  - (SigScm_PredefinedDebugCategories): Add SCM_DBG_ENCODING.
  - (SigScm_WriteToPort, SigScm_DisplayToPort): Untabify.

* sigscheme/operations-siod.c
  - (ScmOp_closure_code, SigScm_SetVerboseLevel): Untabify.

* sigscheme/sigschemeinternal.h
  - (ASSERT_NO_MORE_ARG): Correct error reporting macro.

Modified: branches/r5rs/sigscheme/config.h
===================================================================
--- branches/r5rs/sigscheme/config.h	2005-10-17 22:27:54 UTC (rev 1857)
+++ branches/r5rs/sigscheme/config.h	2005-10-18 06:45:18 UTC (rev 1858)
@@ -56,8 +56,17 @@
 /*===========================================================================
   Character Encoding Handlers
 ===========================================================================*/
-#define SCM_USE_EUCJP           1  /* use EUC-JP as internal encoding */
+/* Support for each encoding will be compiled in if the corresponding
+ * macro is defined as nonzero. */
+#define SCM_USE_EUCJP           0
+#define SCM_USE_SJIS            0
+#define SCM_USE_UTF8            1
+#define SCM_USE_EUCCN           0
+#define SCM_USE_EUCKR           0
 
+/* For now, edit encoding.c and change the initialization of
+ * Scm_mb_scan_char to change the default encoding. */
+
 /* "which encodings are enabled" and "which encoding is the default" will be
  * separated in future
  */
@@ -67,6 +76,7 @@
 ===========================================================================*/
 #define SCM_STRICT_R5RS         0  /* use strict R5RS check */
 #define SCM_STRICT_ARGCHECK     1  /* enable strict argument check */
+#define SCM_STRICT_ENCODING_CHECK 1 /* do all feasible encoding error checks */
 #define SCM_ACCESSOR_ASSERT     0  /* enable strict type check with accessor */
 #define SCM_USE_VALUECONS       1  /* use experimental values passing */
 #define SCM_VOLATILE_OUTPUT     0  /* always flush files on write */
@@ -80,6 +90,7 @@
 #define SCM_DEBUG               1  /* enable debugging features */
 #define SCM_DEBUG_GC            0  /* enable GC debugging */
 #define SCM_DEBUG_PARSER        0  /* enable parser debugging */
+#define SCM_DEBUG_ENCODING      0  /* debug encoding-related functions */
 #define SCM_DEBUG_BACKTRACE_SEP 1  /* enable frame-separator on backtrace */
 #define SCM_DEBUG_BACKTRACE_VAL 1  /* enable values printing on backtrace */
 

Modified: branches/r5rs/sigscheme/datas.c
===================================================================
--- branches/r5rs/sigscheme/datas.c	2005-10-17 22:27:54 UTC (rev 1857)
+++ branches/r5rs/sigscheme/datas.c	2005-10-18 06:45:18 UTC (rev 1858)
@@ -721,11 +721,13 @@
 ScmObj Scm_NewChar(char *ch)
 {
     ScmObj obj = SCM_FALSE;
+    int len;
 
-    /* check length */
-    if (SigScm_default_encoding_strlen(ch) != 1) {
+    /* assert length == 1 */
+    len = Scm_mb_bare_c_strlen(ch);
+    if (len != 1) {
         SigScm_Error("Scm_NewChar : invalid character ch = [%s], len = %d",
-                     ch, SigScm_default_encoding_strlen(ch));
+                     ch, len);
     }
 
     SCM_NEW_OBJ_INTERNAL(obj);
@@ -744,7 +746,7 @@
 
     SCM_ENTYPE_STRING(obj);
     SCM_STRING_SET_STR(obj, str);
-    SCM_STRING_SET_LEN(obj, SigScm_default_encoding_strlen(str));
+    SCM_STRING_SET_LEN(obj, str ? Scm_mb_bare_c_strlen(str) : 0);
 
     return obj;
 }
@@ -758,7 +760,7 @@
 
     SCM_ENTYPE_STRING(obj);
     SCM_STRING_SET_STR(obj, strdup(str));
-    SCM_STRING_SET_LEN(obj, SigScm_default_encoding_strlen(str));
+    SCM_STRING_SET_LEN(obj, Scm_mb_bare_c_strlen(str));
 
     return obj;
 }

Modified: branches/r5rs/sigscheme/debug.c
===================================================================
--- branches/r5rs/sigscheme/debug.c	2005-10-17 22:27:54 UTC (rev 1857)
+++ branches/r5rs/sigscheme/debug.c	2005-10-18 06:45:18 UTC (rev 1858)
@@ -142,6 +142,9 @@
 #if SCM_DEBUG_GC
             | SCM_DBG_GC
 #endif
+#if SCM_DEBUG_ENCODING
+            | SCM_DBG_ENCODING
+#endif
             );
 #else /* SCM_DEBUG */
     return SCM_DBG_NONE;
@@ -182,7 +185,7 @@
     DECLARE_INTERNAL_FUNCTION("SigScm_WriteToPort");
 
     if (FALSEP(port))
-	return;
+        return;
 
     ASSERT_PORTP(port);
     if (SCM_PORT_PORTDIRECTION(port) != PORT_OUTPUT)
@@ -201,7 +204,7 @@
     DECLARE_INTERNAL_FUNCTION("SigScm_DisplayToPort");
 
     if (FALSEP(port))
-	return;
+        return;
 
     ASSERT_PORTP(port);
     if (SCM_PORT_PORTDIRECTION(port) != PORT_OUTPUT)

Modified: branches/r5rs/sigscheme/encoding.c
===================================================================
--- branches/r5rs/sigscheme/encoding.c	2005-10-17 22:27:54 UTC (rev 1857)
+++ branches/r5rs/sigscheme/encoding.c	2005-10-18 06:45:18 UTC (rev 1858)
@@ -31,6 +31,12 @@
  *  OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  *  SUCH DAMAGE.
 ===========================================================================*/
+
+/* Acknowledgement: much information was gained from the
+ * i18n-introduction of the debian project.  Many thanks to its
+ * authors, Tomohiro KUBOTA, et al. */
+
+
 /*=======================================
   System Include
 =======================================*/
@@ -39,107 +45,379 @@
   Local Include
 =======================================*/
 #include "sigscheme.h"
+#include "sigschemeinternal.h"
 
 /*=======================================
-  File Local Struct Declarations
+  File Local Functions
 =======================================*/
+#if SCM_USE_EUCJP
+static ScmMultibyteCharInfo eucjp_scan_char(ScmMultibyteString mbs);
+#endif
 
-/*=======================================
-  File Local Macro Declarations
-=======================================*/
+#if SCM_USE_ISO2022KR
+static ScmMultibyteCharInfo iso2022kr_scan_char(ScmMultibyteString mbs);
+static ScmMultibyteCharInfo iso2022kr_scan_input_char(ScmMultibyteString mbs);
+#endif
 
+#if SCM_USE_ISO2022JP
+static ScmMultibyteCharInfo iso2022jp_scan_char(ScmMultibyteString mbs);
+static ScmMultibyteCharInfo iso2022jp_scan_input_char(ScmMultibyteString mbs);
+#endif
+
+#if SCM_USE_SJIS
+static ScmMultibyteCharInfo sjis_scan_char(ScmMultibyteString mbs);
+#endif
+
+#if SCM_USE_EUCCN
+static ScmMultibyteCharInfo euccn_scan_char(ScmMultibyteString mbs);
+#endif
+
+#if SCM_USE_EUCKR
+static ScmMultibyteCharInfo euckr_scan_char(ScmMultibyteString mbs);
+#endif
+
+#if SCM_USE_UTF8
+static ScmMultibyteCharInfo utf8_scan_char(ScmMultibyteString mbs);
+#endif
+
+static ScmMultibyteCharInfo unibyte_scan_char(ScmMultibyteString mbs);
+
+typedef unsigned char uchar;
+
 /*=======================================
-  Variable Declarations
+  Global Variables
 =======================================*/
+/* TODO: add some mechanism to dynamically switch between encodings. */
+ScmMultibyteCharInfo (*Scm_mb_scan_char)(ScmMultibyteString mbs)
+    = utf8_scan_char;
 
 /*=======================================
-  File Local Function Declarations
+  Public API
 =======================================*/
-static int eucjp_strlen(const char *p);
-static const char* eucjp_str_startpos(const char *p, int k);
-static const char* eucjp_str_endpos(const char *p, int k);
 
-/*=======================================
-  Function Implementations
-=======================================*/
-int SigScm_default_encoding_strlen(const char *str)
+int Scm_mb_strlen(ScmMultibyteString mbs)
 {
-#if SCM_USE_EUCJP
-    return eucjp_strlen(str);
-#endif
+    int len = 0;
+    ScmMultibyteCharInfo c;
+
+    CDBG((SCM_DBG_ENCODING, "mb_strlen: size = %d; str = %s;",
+          SCM_MBS_GET_SIZE(mbs), SCM_MBS_GET_STR(mbs)));
+
+    while (SCM_MBS_GET_SIZE(mbs)) {
+        c = Scm_mb_scan_char(mbs);
+        CDBG((SCM_DBG_ENCODING, "%d, %d;", SCM_MBCINFO_GET_SIZE(c), c.flag));
+        SCM_MBS_SKIP_CHAR(mbs, c);
+        len++;
+    }
+
+    CDBG((SCM_DBG_ENCODING, "len=%d\n", len));
+    return len;
 }
 
-const char* SigScm_default_encoding_str_startpos(const char *str, int k)
+/* FIXME: pick a better name. */
+int Scm_mb_bare_c_strlen(const char *s)
 {
-#if SCM_USE_EUCJP
-    return eucjp_str_startpos(str, k);
-#endif    
+    ScmMultibyteString mbs;
+    SCM_MBS_INIT(mbs);
+    SCM_MBS_SET_STR(mbs, s);
+    SCM_MBS_SET_SIZE(mbs, strlen(s));
+    return Scm_mb_strlen(mbs);
 }
 
-const char* SigScm_default_encoding_str_endpos(const char *str, int k)
+ScmMultibyteString Scm_mb_substring(ScmMultibyteString mbs, int i, int len)
 {
+    ScmMultibyteString ret;
+    ScmMultibyteString end;
+    ScmMultibyteCharInfo c;
+
+    ret = mbs;
+
+    while (i--) {
+        c = Scm_mb_scan_char(ret);
+        SCM_MBS_SKIP_CHAR(ret, c);
+    }
+
+    end = ret;
+
+    while (len--) {
+        c = Scm_mb_scan_char(end);
+        SCM_MBS_SKIP_CHAR(end, c);
+    }
+
+    SCM_MBS_SET_SIZE(ret, SCM_MBS_GET_STR(end) - SCM_MBS_GET_STR(ret));
+    return ret;
+}
+
+
+/*=======================================
+  Encoding-specific functions
+=======================================*/
+
+/* Every encoding implements the <encoding name>_scan_char()
+ * primitive.  Its job is to determine the length of the first
+ * character in the given string.  Stateful encodings should save
+ * their state *at exit*, that is, the state right after reading the
+ * first character (so don't omit it).  */
+
+/* Convenience macros.  Start with ENTER and return with RETURN*.
+ * EXPECT_SIZE() declares the expected length of the character.  We'll
+ * use it to return information on how many octets are missing.  It
+ * also serves as documentation.  */
+#define ENTER   ScmMultibyteCharInfo _ret;  SCM_MBCINFO_INIT(_ret)
+#define RETURN(n)  do { SCM_MBCINFO_SET_SIZE(_ret, n); return _ret; } while (0)
+#define RETURN_ERROR() do { SCM_MBCINFO_SET_ERROR(_ret); RETURN(1); } while (0)
+#define RETURN_INCOMPLETE(n) do { SCM_MBCINFO_SET_INCOMPLETE(_ret); RETURN(n); } while (0)
+#define SAVE_STATE(stat) (SCM_MBCINFO_SET_STATE(_ret, (stat)))
+#define EXPECT_SIZE(size) /* Currently ignored. */
+
+/* Encodings based on ISO/IEC 2022. */
+
+/* Control regions. */
+#define IN_CL(c)   ((uchar)(c) < 0x20)
+#define IN_CR(c)   (0x80 <= (uchar)(c) && (uchar)(c) <= 0x9F)
+
+/* General purpose regions. */
+#define IN_GL94(c) (0x21 <= (uchar)(c) && (uchar)(c) <= 0x7E)
+#define IN_GL96(c) (0x20 <= (uchar)(c) && (uchar)(c) <= 0x7F)
+#define IN_GR94(c) (0xA1 <= (uchar)(c) && (uchar)(c) <= 0xFE)
+#define IN_GR96(c) (0xA0 <= (uchar)(c) && (uchar)(c) <= 0xFF)
+
+#define IS_ASCII(c) ((uchar)(c) <= 0x7F)
+
+#define ESC 0x1B
+#define SO  0x0E
+#define SI  0x0F
+#define SS2 0x8E
+#define SS3 0x8F
+
+
 #if SCM_USE_EUCJP
-    return eucjp_str_endpos(str, k);
-#endif    
+/* G0 <- (96) ASCII (or was it JIS X 0201 Roman?)
+ * G1 <- (94x94) JIS X 0208 kanji/kana
+ * G2 <- (94) JIS X 0201 Katakana ("half-width katakana")
+ * G3 <- (94x94) JIS X 0212 kanji, or JIS X 0213 kanji plane 2
+ *
+ * GL <- G0 (ASCII)
+ * GR <- G1 (JIS X 0208)
+ * CL <- JIS X 0211 C0
+ * CR <- JIS X 0211 C1 */
+static ScmMultibyteCharInfo eucjp_scan_char(ScmMultibyteString mbs)
+{
+    const char *str = SCM_MBS_GET_STR(mbs);
+    const int size  = SCM_MBS_GET_SIZE(mbs);
+    ENTER;
+
+    if (!size)
+        RETURN(0);
+
+    if (IN_CL(str[0]) || IN_GL96(str[0]))
+        RETURN(1);
+    else if (IN_GR94(str[0]) || (uchar)str[0] == SS2) {
+        EXPECT_SIZE(2);
+        if (size < 2)         RETURN_INCOMPLETE(1);
+#if SCM_STRICT_ENCODING_CHECK
+        if (!IN_GR94(str[1])) RETURN_ERROR();
+#endif
+        RETURN(2);
+    } else if ((uchar)str[0] == SS3) {
+        EXPECT_SIZE(3);
+#if SCM_STRICT_ENCODING_CHECK
+        if (size < 2)         RETURN_INCOMPLETE(size);
+        if (!IN_GR94(str[1])) RETURN_ERROR();
+        if (size < 3)         RETURN_INCOMPLETE(size);
+        if (!IN_GR94(str[2])) RETURN_ERROR();
+        RETURN(3);
+#else  /* not SCM_STRICT_ENCODING_CHECK */
+        if (size < 3)
+            RETURN_INCOMPLETE(size);
+        RETURN(3);
+#endif /* not SCM_STRICT_ENCODING_CHECK */
+    }
+
+    RETURN_ERROR();
 }
+#endif /* SCM_USE_EUCJP */
 
-static int eucjp_strlen(const char *str)
+#if SCM_USE_EUCCN
+/* FIXME: NOT TESTED!
+ * 
+ * G0 <- ASCII (or GB 1988?)
+ * G1 <- GB2312
+ *
+ * GL <- G0 (ASCII)
+ * GR <- G1 (GB2312) */
+static ScmMultibyteCharInfo euccn_scan_char(ScmMultibyteString mbs)
 {
-    int len = 0;
-    const unsigned char *cur = (const unsigned char *)str;
-    while (*cur) {
-        if (*cur > 127) {
-            /* 2 bytes */
-            cur++;
-        }
+    /* TODO: maybe we can make this an alias of eucjp_scan_char()? */
+    const char *str = SCM_MBS_GET_STR(mbs);
+    const int size  = SCM_MBS_GET_SIZE(mbs);
+    ENTER;
 
-        cur++;
-        len++;
+    if (!size)
+        RETURN(0);
+    if (IS_ASCII(str[0]))
+        RETURN(1);
+    if (IN_GR94(str[0])) {
+        EXPECT_SIZE(2);
+        if (size < 2)
+            RETURN_INCOMPLETE(size);
+#if SCM_STRICT_ENCODING_CHECK
+        if (!IN_GR94(str[1]))
+            RETURN_ERROR();
+#endif
+        RETURN(2);
     }
+    RETURN_ERROR();
+}
+#endif
 
-    return len;
+#if SCM_USE_EUCKR
+/* FIXME: NOT TESTED!  I'm not sure about this encoding.  There's also
+ * a Microsoft variant called CP949, which is not supported (yet).
+ * RFC 1557 says KS X 1001 is 94x94.
+ *
+ * G0 <- ASCII
+ * G1 <- KS X 1001 (aka KSC 5601)
+ *
+ * GL <- G0
+ * GR <- G1 */
+static ScmMultibyteCharInfo euckr_scan_char(ScmMultibyteString mbs)
+{
+    const char *str = SCM_MBS_GET_STR(mbs);
+    const int size  = SCM_MBS_GET_SIZE(mbs);
+    ENTER;
+
+    if (!size)
+        RETURN(0);
+    if (IS_ASCII(str[0]))
+        RETURN(1);
+    if (IN_GR94(str[0])) {
+        EXPECT_SIZE(2);
+        if (size < 2)
+            RETURN_INCOMPLETE(size);
+#if SCM_STRICT_ENCODING_CHECK
+        if (!IN_GR94(str[1]))
+            RETURN_ERROR();
+#endif
+        RETURN(2);
+    }
+    RETURN_ERROR();
 }
+#endif /* SCM_USE_EUCKR */
 
-static const char* eucjp_str_startpos(const char *str, int k)
+/*==== Encodings for Unicode ====*/
+#if SCM_USE_UTF8
+/* RFC 3629 */
+#define MASK(n)        ((LEN_CODE(n) >> 1) | 0x80)
+#define LEN_CODE(n)    (((1 << (n))-1) << (8-n))
+#define IS_LEN(c, n)   ((MASK(n) & (c)) == LEN_CODE(n))
+#define IS_TRAILING(c) (IS_LEN((c), 1))
+
+static ScmMultibyteCharInfo utf8_scan_char(ScmMultibyteString mbs)
 {
-    int len = 0;
-    const unsigned char *cur = (const unsigned char *)str;
-    while (*cur) {
-        if (len == k)
-            return (const char *)cur;
+    const char *str = SCM_MBS_GET_STR(mbs);
+    const int size  = SCM_MBS_GET_SIZE(mbs);
+    int len;
+    ENTER;
 
-        if (*cur > 127) {
-            /* 2 bytes */
-            cur++;
+    if (!size)
+        RETURN(0);
+    if (IS_ASCII(str[0]))
+        RETURN(1);
+
+    if (IS_LEN(str[0], 2))       len = 2;
+    else if (IS_LEN(str[0], 3))  len = 3;
+    else if (IS_LEN(str[0], 4))  len = 4;
+    else                         RETURN_ERROR();
+
+#if SCM_STRICT_ENCODING_CHECK
+    {
+        int i;
+        for (i=1; i < len; i++) {
+            if (size <= i)
+                RETURN_INCOMPLETE(size);
+            if (!IS_TRAILING(str[i]))
+                RETURN_ERROR();
         }
-
-        cur++;
-        len++;
     }
+#else  /* not SCM_STRICT_ENCODING_CHECK */
+    if (size < len)
+        RETURN_INCOMPLETE(size);
+#endif /* not SCM_STRICT_ENCODING_CHECK */
 
-    return (const char*)cur;
+    RETURN(len);
+
 }
 
-static const char* eucjp_str_endpos(const char *str, int k)
+#undef MASK
+#undef LEN_CODE
+#undef IS_LEN
+#undef IS_TRAILING
+#endif /* SCM_USE_UTF8 */
+
+/*==== Other encodings ====*/
+
+#if SCM_USE_SJIS
+/* The cwazy Japanese encoding.  This function implements the JIS X
+ * 0213 variant.
+ *
+ * 0 .. 0x7F: ASCII
+ * 0x80: undefined
+ * 0x81 .. 0x9F: lead byte of 2-byte char
+ * 0xA0: undefined
+ * 0xA1 .. 0xDF: JIS X 0201 katakana (1 byte)
+ * 0xE0 .. 0xEF: lead byte of 2-byte char
+ * 0xF0 .. 0xFC: lead byte of 2-byte char if JIS X 0213 is used
+ * 0xFD .. 0xFF: undefined
+ * 
+ * 0x40 .. 0x7E: trailing byte of 2-byte char
+ * 0x80 .. 0xFC: trailing byte of 2-byte char
+ */
+static ScmMultibyteCharInfo sjis_scan_char(ScmMultibyteString mbs)
 {
-    int len = 0;
-    const unsigned char *cur = (const unsigned char *)str;
-    while (*cur) {
-        if (*cur > 127) {
-            /* 2 bytes */
-            cur++;
-        }
+#define IS_KANA(c) (0xA1 <= (uchar)(c) && (uchar)(c) <= 0xDF)
+#define IS_LEAD(c)        \
+    (0x81 <= (uchar)(c)   \
+    && !IS_KANA(c)        \
+    && (uchar)(c) <= 0xFC \
+    && (uchar)(c) != 0xA0)
+#define IS_TRAIL(c) (0x40 <= (uchar)(c) && (uchar)(c) <= 0xFC && (c) != 0x7E)
 
-        cur++;
-        len++;
-
-        if (len == k + 1)
-            return (const char *)cur;
+    const char *str = SCM_MBS_GET_STR(mbs);
+    const int  size = SCM_MBS_GET_SIZE(mbs);
+    ENTER;
+    if (!size)
+        RETURN(0);
+    if (IS_LEAD(str[0])) {
+        EXPECT_SIZE(2);
+        if (size < 2)
+            RETURN_INCOMPLETE(size);
+#if SCM_STRICT_ENCODING_CHECK
+        if (!IS_TRAIL(str[1]))
+            RETURN_ERROR();
+#endif
+        RETURN(2);
     }
-    
-    if (len == k + 1)
-        return (const char *)cur;
+    RETURN(1);
 
-    SigScm_Error("eucjp_str_startpos : unreachable point");
-    return NULL;
+#undef IS_KANA
+#undef IS_LEAD
+#undef IS_TRAIL
 }
+#endif /* SCM_USE_SJIS */
+
+/* Single-byte encodings.  Please add any that you know are missing.
+ * Sorted alphabetically.
+ * 
+ * ASCII
+ * ISO 646
+ * ISO-8859-*
+ * VISCII
+ */
+static ScmMultibyteCharInfo unibyte_scan_char(ScmMultibyteString mbs)
+{
+    ENTER;
+    if (SCM_MBS_GET_SIZE(mbs))
+        RETURN(1);
+    RETURN(0);
+}

Modified: branches/r5rs/sigscheme/operations-siod.c
===================================================================
--- branches/r5rs/sigscheme/operations-siod.c	2005-10-17 22:27:54 UTC (rev 1857)
+++ branches/r5rs/sigscheme/operations-siod.c	2005-10-18 06:45:18 UTC (rev 1858)
@@ -180,9 +180,9 @@
 
     exp = SCM_CLOSURE_EXP(closure);
     if (NULLP(CDDR(exp)))
-	body = CADR(exp);
+        body = CADR(exp);
     else
-	body = CONS(Scm_Intern("begin"), CDR(exp));
+        body = CONS(Scm_Intern("begin"), CDR(exp));
     
     return CONS(CAR(exp), body);
 }
@@ -211,7 +211,7 @@
         SigScm_Error("SigScm_SetVerboseLevel : negative value has been given");
 
     if (sscm_verbose_level == level)
-	return;
+        return;
 
     sscm_verbose_level = level;
 

Modified: branches/r5rs/sigscheme/operations.c
===================================================================
--- branches/r5rs/sigscheme/operations.c	2005-10-17 22:27:54 UTC (rev 1857)
+++ branches/r5rs/sigscheme/operations.c	2005-10-18 06:45:18 UTC (rev 1858)
@@ -1258,31 +1258,32 @@
 {
     DECLARE_FUNCTION("string-length", ProcedureFixed1);
     ASSERT_STRINGP(str);
-    return Scm_NewInt(SigScm_default_encoding_strlen(SCM_STRING_STR(str)));
+    return Scm_NewInt(Scm_mb_bare_c_strlen(SCM_STRING_STR(str)));
 }
 
 ScmObj ScmOp_string_ref(ScmObj str, ScmObj k)
 {
     int   c_index = 0;
     char *new_ch  = NULL;
-    const char *string_str   = NULL;
-    const char *ch_start_ptr = NULL;
-    const char *ch_end_ptr   = NULL;
+    ScmMultibyteString mbs;
     DECLARE_FUNCTION("string-ref", ProcedureFixed2);
 
     ASSERT_STRINGP(str);
     ASSERT_INTP(k);
 
+    SCM_MBS_INIT(mbs);
     /* get start_ptr and end_ptr */
     c_index = SCM_INT_VALUE(k);
-    string_str   = SCM_STRING_STR(str);
-    ch_start_ptr = SigScm_default_encoding_str_startpos(string_str, c_index);
-    ch_end_ptr   = SigScm_default_encoding_str_endpos(string_str, c_index);
+    SCM_MBS_SET_STR(mbs, SCM_STRING_STR(str));
 
+    /* FIXME: This strlen() can be eliminated. */
+    SCM_MBS_SET_SIZE(mbs, strlen(SCM_STRING_STR(str)));
+    mbs = Scm_mb_strref(mbs, c_index);
+
     /* copy from start_ptr to end_ptr */
-    new_ch = (char*)malloc(sizeof(char) * (ch_end_ptr - ch_start_ptr) + 1);
-    memset(new_ch, 0, sizeof(char) * (ch_end_ptr - ch_start_ptr) + 1);
-    strncpy(new_ch, ch_start_ptr, (ch_end_ptr - ch_start_ptr));
+    new_ch = (char*)malloc(SCM_MBS_GET_SIZE(mbs) + 1);
+    memcpy(new_ch, SCM_MBS_GET_STR(mbs), SCM_MBS_GET_SIZE(mbs));
+    new_ch[SCM_MBS_GET_SIZE(mbs)] = 0;
 
     return Scm_NewChar(new_ch);
 }
@@ -1290,14 +1291,13 @@
 ScmObj ScmOp_string_set(ScmObj str, ScmObj k, ScmObj ch)
 {
     int   c_start_index = 0;
-    int   front_size = 0;
+    int   prefix_size = 0;
     int   newch_size = 0;
-    int   back_size  = 0;
+    int   postfix_size  = 0;
     int   total_size = 0;
     char *new_str  = NULL;
+    ScmMultibyteString mbs;
     const char *string_str   = NULL;
-    const char *ch_start_ptr = NULL;
-    const char *ch_end_ptr   = NULL;
     DECLARE_FUNCTION("string-set!", ProcedureFixed3);
 
     ASSERT_STRINGP(str);
@@ -1307,23 +1307,28 @@
     /* get indexes */
     c_start_index = SCM_INT_VALUE(k);
     string_str    = SCM_STRING_STR(str);
-    ch_start_ptr  = SigScm_default_encoding_str_startpos(string_str, c_start_index);
-    ch_end_ptr    = SigScm_default_encoding_str_endpos(string_str, c_start_index);
+    /* FIXME: can string_str be NULL at this point or not? */
+    if (!string_str) string_str = "";
 
+    /* FIXME: strlen() can be eliminiated. */
+    SCM_MBS_INIT(mbs);
+    SCM_MBS_SET_STR(mbs, string_str);
+    SCM_MBS_SET_SIZE(mbs, strlen(string_str));
+    mbs = Scm_mb_strref(mbs, c_start_index);
+
     /* calculate total size */
-    front_size = strlen(string_str) - strlen(ch_start_ptr);
-    newch_size = strlen(SCM_CHAR_VALUE(ch));
-    back_size  = strlen(ch_end_ptr);
-    total_size = front_size + newch_size + back_size;
+    prefix_size = SCM_MBS_GET_STR(mbs) - string_str;
+    newch_size  = strlen(SCM_CHAR_VALUE(ch));
+    postfix_size  = strlen(SCM_MBS_GET_STR(mbs) + SCM_MBS_GET_SIZE(mbs));
+    total_size = prefix_size + newch_size + postfix_size;
 
-    /* copy each parts */
+    /* copy each part */
     new_str = (char*)malloc(total_size + 1);
-    memset(new_str, 0, total_size + 1);
-    strncpy(new_str                           , string_str      , front_size);
-    strncpy(new_str + front_size              , SCM_CHAR_VALUE(ch) , newch_size);
-    strncpy(new_str + front_size + newch_size , ch_end_ptr      , back_size);
+    memcpy(new_str, string_str, prefix_size);
+    memcpy(new_str+prefix_size, SCM_CHAR_VALUE(ch), newch_size);
+    memcpy(new_str+prefix_size+newch_size,
+           SCM_MBS_GET_STR(mbs)+SCM_MBS_GET_SIZE(mbs), postfix_size);
 
-    /* set */
     if (SCM_STRING_STR(str))
         free(SCM_STRING_STR(str));
 
@@ -1349,10 +1354,9 @@
 {
     int   c_start_index = 0;
     int   c_end_index   = 0;
-    char *new_str  = NULL;
+    char *new_str = NULL;
+    ScmMultibyteString mbs;
     const char *string_str   = NULL;
-    const char *ch_start_ptr = NULL;
-    const char *ch_end_ptr   = NULL;
     DECLARE_FUNCTION("substring", ProcedureFixed3);
 
     ASSERT_STRINGP(str);
@@ -1364,18 +1368,22 @@
     c_end_index   = SCM_INT_VALUE(end);
 
     /* sanity check */
-    if (c_start_index == c_end_index)
-        return Scm_NewStringCopying("");
+    if (c_start_index > c_end_index)
+        ERR("substring: start index is greater than end index.");
+    if (c_end_index > SCM_STRING_LEN(str))
+        ERR_OBJ("index out of range", end);
 
-    /* get str */
-    string_str    = SCM_STRING_STR(str);
-    ch_start_ptr  = SigScm_default_encoding_str_startpos(string_str, c_start_index);
-    ch_end_ptr    = SigScm_default_encoding_str_startpos(string_str, c_end_index);
+    /* FIXME: strlen() can be eliminated. */
+    string_str = SCM_STRING_STR(str);
+    SCM_MBS_INIT(mbs);
+    SCM_MBS_SET_STR(mbs, string_str);
+    SCM_MBS_SET_SIZE(mbs, strlen(string_str));
+    mbs = Scm_mb_substring(mbs, c_start_index, c_end_index - c_start_index);
 
     /* copy from start_ptr to end_ptr */
-    new_str = (char*)malloc(sizeof(char) * (ch_end_ptr - ch_start_ptr) + 1);
-    memset(new_str, 0, sizeof(char) * (ch_end_ptr - ch_start_ptr) + 1);
-    strncpy(new_str, ch_start_ptr, sizeof(char) * (ch_end_ptr - ch_start_ptr));
+    new_str = (char*)malloc(SCM_MBS_GET_SIZE(mbs) + 1);
+    memcpy(new_str, SCM_MBS_GET_STR(mbs), SCM_MBS_GET_SIZE(mbs));
+    new_str[SCM_MBS_GET_SIZE(mbs)] = 0;
 
     return Scm_NewString(new_str);
 }
@@ -1420,39 +1428,35 @@
 
 ScmObj ScmOp_string2list(ScmObj string)
 {
-    char *string_str = NULL;
-    int   str_len    = 0;
     ScmObj head = SCM_NULL;
-    ScmObj prev = NULL;
-    ScmObj next = NULL;
-    int i = 0;
-    const char *ch_start_ptr = NULL;
-    const char *ch_end_ptr   = NULL;
-    char *new_ch = NULL;
+    ScmObj tail = SCM_NULL;
+    ScmObj next = SCM_NULL;
+    ScmMultibyteString mbs;
+    ScmMultibyteCharInfo ch;
+    char *buf;
     DECLARE_FUNCTION("string->list", string);
 
     ASSERT_STRINGP(string);
 
-    string_str = SCM_STRING_STR(string);
-    str_len    = SCM_STRING_LEN(string);
-    if (str_len == 0)
-        return SCM_NULL;
+    SCM_MBS_INIT(mbs);
+    SCM_MBS_SET_STR(mbs, SCM_STRING_STR(string));
+    SCM_MBS_SET_SIZE(mbs, strlen(SCM_STRING_STR(string)));
 
-    for (i = 0; i < str_len; i++) {
-        ch_start_ptr = SigScm_default_encoding_str_startpos(string_str, i);
-        ch_end_ptr   = SigScm_default_encoding_str_endpos(string_str, i);
+    while (SCM_MBS_GET_SIZE(mbs)) {
+        ch = Scm_mb_scan_char(mbs);
+        buf = malloc(SCM_MBCINFO_GET_SIZE(ch)+1);
+        memcpy(buf, SCM_MBS_GET_STR(mbs), SCM_MBCINFO_GET_SIZE(ch));
+        buf[SCM_MBCINFO_GET_SIZE(ch)] = 0;
+        next = LIST_1(Scm_NewChar(buf));
 
-        new_ch = (char*)malloc(sizeof(char) * (ch_end_ptr - ch_start_ptr + 1));
-        memset(new_ch, 0, sizeof(char) * (ch_end_ptr - ch_start_ptr + 1));
-        strncpy(new_ch, ch_start_ptr, (sizeof(char) * (ch_end_ptr - ch_start_ptr)));
+        if (NULLP(tail))
+            head = tail = next;
+        else {
+            SET_CDR(tail, next);
+            tail = CDR(tail);
+        }
 
-        next = CONS(Scm_NewChar(new_ch), SCM_NULL);
-        if (prev)
-            SET_CDR(prev, next);
-        else
-            head = next;
-
-        prev = next;
+        SCM_MBS_SKIP_CHAR(mbs, ch);
     }
 
     return head;

Modified: branches/r5rs/sigscheme/sigscheme.h
===================================================================
--- branches/r5rs/sigscheme/sigscheme.h	2005-10-17 22:27:54 UTC (rev 1857)
+++ branches/r5rs/sigscheme/sigscheme.h	2005-10-18 06:45:18 UTC (rev 1858)
@@ -161,6 +161,7 @@
     SCM_DBG_EXPERIMENTAL = 1 << 11,  /* developed but experimental features */
     SCM_DBG_DEVEL        = 1 << 12,  /* under development */
     SCM_DBG_COMPAT       = 1 << 13,  /* warns compatibility-sensitive code */
+    SCM_DBG_ENCODING     = 1 << 14,  /* multibyte handling */
     SCM_DBG_OTHER        = 1 << 30   /* all other messages */
 };
 
@@ -565,9 +566,11 @@
 #endif
 
 /* encoding.c */
-int SigScm_default_encoding_strlen(const char *str);
-const char* SigScm_default_encoding_str_startpos(const char *str, int k);
-const char* SigScm_default_encoding_str_endpos(const char *str, int k);
+int Scm_mb_strlen(ScmMultibyteString mbs);
+int Scm_mb_bare_c_strlen(const char *str);
+ScmMultibyteString Scm_mb_substring(ScmMultibyteString str, int i, int len);
+#define Scm_mb_strref(str, i) (Scm_mb_substring((str), (i), 1))
+extern ScmMultibyteCharInfo (*Scm_mb_scan_char)(ScmMultibyteString mbs);
 
 /* read.c */
 ScmObj SigScm_Read(ScmObj port);

Modified: branches/r5rs/sigscheme/sigschemeinternal.h
===================================================================
--- branches/r5rs/sigscheme/sigschemeinternal.h	2005-10-17 22:27:54 UTC (rev 1857)
+++ branches/r5rs/sigscheme/sigschemeinternal.h	2005-10-18 06:45:18 UTC (rev 1858)
@@ -252,7 +252,7 @@
      && (NULLP(args) \
          || (ERR_OBJ("improper argument list terminator", (args)), 1)))
 #define ASSERT_NO_MORE_ARG(args) \
-    (NO_MORE_ARG(args) || (ERR("superfluous argument(s)", (args)), 1))
+    (NO_MORE_ARG(args) || (ERR_OBJ("superfluous argument(s)", (args)), 1))
 #define ASSERT_PROPER_ARG_LIST(args) \
     (ScmOp_c_length(args) >= 0 \
      || (ERR_OBJ("bad argument list", (args)), 1))

Modified: branches/r5rs/sigscheme/sigschemetype.h
===================================================================
--- branches/r5rs/sigscheme/sigschemetype.h	2005-10-17 22:27:54 UTC (rev 1857)
+++ branches/r5rs/sigscheme/sigschemetype.h	2005-10-18 06:45:18 UTC (rev 1858)
@@ -248,6 +248,79 @@
 };
 
 /*=======================================
+   Multibyte encoding support
+=======================================*/
+
+/* This type will actually contain some encoding-dependent enum value.
+ * It might as well be defined as mbstate_t if we're using libc. */
+typedef int ScmMultibyteState;
+
+/* Metadata of a multibyte character.  These are usually allocated on
+   stack or register, so we'll make liberal use of space. */
+typedef struct {
+    const char *start;
+    int flag;
+    int size;
+
+#if SCM_USE_STATEFUL_ENCODING
+    /* Shift state at the *end* of the described character. */
+    ScmMultibyteState state;
+#endif
+} ScmMultibyteCharInfo;
+
+typedef struct {
+    const char *str;
+
+    /* Only the size is stored because ScmObj caches the length, and
+     * we'll have to traverse from the beginning all the time
+     * anyway. */
+    int size;
+#if SCM_USE_STATEFUL_ENCODING
+    ScmMultibyteState state;
+#endif
+} ScmMultibyteString;
+
+#define SCM_MBS_SET_STR(mbs, s)         ((mbs).str = (s))
+#define SCM_MBS_GET_STR(mbs)            ((mbs).str)
+#define SCM_MBS_SET_SIZE(mbs, siz)      ((mbs).size = (siz))
+#define SCM_MBS_GET_SIZE(mbs)           ((mbs).size)
+
+#define SCM_MBCINFO_SET_SIZE SCM_MBS_SET_SIZE
+#define SCM_MBCINFO_GET_SIZE SCM_MBS_GET_SIZE
+#define SCM_MBCINFO_CLEAR_STATE SCM_MBS_CLEAR_STATE
+#define SCM_MBCINFO_SET_STATE SCM_MBS_SET_STATE
+#define SCM_MBCINFO_GET_STATE SCM_MBS_GET_STATE
+#define SCM_MBCINFO_CLEAR_FLAG(inf)     ((inf).flag = 0)
+#define SCM_MBCINFO_SET_ERROR(inf)      ((inf).flag |= 1)
+#define SCM_MBCINFO_SET_INCOMPLETE(inf) ((inf).flag |= 2)
+#define SCM_MBCINFO_ERRORP(inf)         ((inf).flag & 1)
+#define SCM_MBCINFO_INCOMPLETEP(inf)    ((inf).flag & 2)
+#define SCM_MBCINFO_INIT(inf)  (SCM_MBCINFO_SET_SIZE((inf), 0),  \
+                                SCM_MBCINFO_CLEAR_STATE(inf),    \
+                                SCM_MBCINFO_CLEAR_FLAG(inf))
+
+
+#if SCM_USE_STATEFUL_ENCODING
+#define SCM_MBS_GET_STATE(mbs)        ((mbs).state)
+#define SCM_MBS_SET_STATE(mbs, stat)  ((mbs).state = (stat))
+#define SCM_MBS_CLEAR_STATE(mbs)      ((mbs).state = 0)
+#else
+#define SCM_MBS_GET_STATE(mbs)        0
+#define SCM_MBS_SET_STATE(mbs, stat)  0
+#define SCM_MBS_CLEAR_STATE(mbs)      0
+#endif
+#define SCM_MBS_INIT(mbs)  (SCM_MBS_SET_STR((mbs), NULL), \
+                            SCM_MBS_SET_SIZE((mbs), 0),   \
+                            SCM_MBS_CLEAR_STATE(mbs))
+#define SCM_MBS_SKIP_CHAR(mbs, inf)                                           \
+    (SCM_MBS_SET_STR((mbs), SCM_MBS_GET_STR(mbs) + SCM_MBCINFO_GET_SIZE(inf)),\
+     SCM_MBS_SET_SIZE((mbs),                                                  \
+                      SCM_MBS_GET_SIZE(mbs) - SCM_MBCINFO_GET_SIZE(inf)),     \
+     SCM_MBS_SET_STATE((mbs), SCM_MBCINFO_GET_STATE(inf)))
+
+
+
+/*=======================================
    Accessors For Scheme Objects
 =======================================*/
 /* ScmObj Global Attribute */



More information about the uim-commit mailing list