[uim-commit] r2235 - branches/r5rs/sigscheme

yamaken at freedesktop.org yamaken at freedesktop.org
Tue Nov 22 17:21:31 PST 2005


Author: yamaken
Date: 2005-11-22 17:21:27 -0800 (Tue, 22 Nov 2005)
New Revision: 2235

Modified:
   branches/r5rs/sigscheme/TODO
   branches/r5rs/sigscheme/config.h
   branches/r5rs/sigscheme/encoding.c
   branches/r5rs/sigscheme/encoding.h
   branches/r5rs/sigscheme/operations.c
   branches/r5rs/sigscheme/sigscheme.h
Log:
* This commit adds character codec feature, and implement
  char<->integer conversion procedures by using it

* sigscheme/config.h
  - (SCM_USE_EUCCN, SCM_USE_EUCJP, SCM_USE_EUCKR, SCM_USE_SJIS):
    Enable by default
  - (SCM_USE_UTF8_AS_DEFAULT, SCM_USE_EUCCN_AS_DEFAULT,
    SCM_USE_EUCJP_AS_DEFAULT, SCM_USE_EUCKR_AS_DEFAULT,
    SCM_USE_SJIS_AS_DEFAULT, SCM_USE_MULTIBYTE_CHAR): New macro
* sigscheme/encoding.h
  - (SCM_CHARCODEC_STR2INT, SCM_CHARCODEC_INT2STR): Add argument
    'state' and rename others
  - (SCM_MB_STATELESS): New macro
  - (ScmCharCodec): Add const to the typedef
  - (ScmCharCodecMethod_str2int, ScmCharCodecMethod_int2str): Add
    argument 'state' and rename others
  - (Scm_current_char_codec): New variable decl
* sigscheme/encoding.c
  - (uint): New type
  - (eucjp_encoding, eucjp_str2int, eucjp_int2str, sjis_encoding,
    sjis_int2str, dbc_str2int, euc_int2str, euccn_encoding,
    euckr_encoding, utf8_encoding, utf8_str2int, utf8_int2str,
    unibyte_encoding, unibyte_str2int, unibyte_int2str): New static
    function
  - (utf8_codec_vtbl, euccn_codec_vtbl, eucjp_codec_vtbl,
    euckr_codec_vtbl, sjis_codec_vtbl, unibyte_codec_vtbl): New static
    variable
  - (utf8_codec, euccn_codec, eucjp_codec, euckr_codec, sjis_codec,
    unibyte_codec): New macro
  - (Scm_mb_scan_char): Make default-encoding sensitive
  - (Scm_current_char_codec): New variable
  - (CHAR_BITS, BYTE_MASK, IS_1BYTE, IS_2BYTES, IS_3BYTES, IN_OCT_BMP,
    IN_BMP, IN_SMP, LEN_CODE_BITS, TRAILING_CODE_BITS,
    TRAILING_VAL_BITS, LEADING_VAL_BITS, LEADING_VAL, TRAILING_VAL):
    New macro
* sigscheme/sigscheme.h
  - (ScmOp_char2integer): New function decl
* sigscheme/operations.c
  - (ScmOp_char2integer): New function
  - (ScmOp_integer2char): Support multibyte char
* sigscheme/TODO
  - Update


Modified: branches/r5rs/sigscheme/TODO
===================================================================
--- branches/r5rs/sigscheme/TODO	2005-11-23 00:50:37 UTC (rev 2234)
+++ branches/r5rs/sigscheme/TODO	2005-11-23 01:21:27 UTC (rev 2235)
@@ -13,10 +13,6 @@
 
 * [uim] link libsscm into libuim statically
 
-* Dynamic encoding switching for a conversion between string and char list,
-  based on a SRFI or de facto standard API
-  - Evaluate ces API of Gauche
-
 * Fix large fixed-size buffer on stack without limit checking in read.c
 
 * grep "FIXME" and fix them
@@ -69,7 +65,13 @@
 Assigned to YamaKen:
 
 * Add integer->char and char->integer procedures with multibyte support
+  - Write test
+  - Make character object int-based
 
+* Dynamic encoding switching for a conversion between string and char list,
+  based on a SRFI or de facto standard API
+  - Evaluate ces API of Gauche
+
 * GCC4-optimization-proof stack protection
   - Update the document
 

Modified: branches/r5rs/sigscheme/config.h
===================================================================
--- branches/r5rs/sigscheme/config.h	2005-11-23 00:50:37 UTC (rev 2234)
+++ branches/r5rs/sigscheme/config.h	2005-11-23 01:21:27 UTC (rev 2235)
@@ -58,19 +58,19 @@
 ===========================================================================*/
 /* Support for each encoding will be compiled in if the corresponding
  * macro is defined as nonzero. */
-#define SCM_USE_EUCJP           0
-#define SCM_USE_SJIS            0
 #define SCM_USE_UTF8            1
-#define SCM_USE_EUCCN           0
-#define SCM_USE_EUCKR           0
+#define SCM_USE_EUCCN           1
+#define SCM_USE_EUCJP           1
+#define SCM_USE_EUCKR           1
+#define SCM_USE_SJIS            1
 
-/* For now, edit encoding.c and change the initialization of
- * Scm_mb_scan_char to change the default encoding. */
+/* choose exclusively. fallbacks to the unibyte encoding if nothing chosen. */
+#define SCM_USE_UTF8_AS_DEFAULT  1
+#define SCM_USE_EUCCN_AS_DEFAULT 0
+#define SCM_USE_EUCJP_AS_DEFAULT 0
+#define SCM_USE_EUCKR_AS_DEFAULT 0
+#define SCM_USE_SJIS_AS_DEFAULT  0
 
-/* "which encodings are enabled" and "which encoding is the default" will be
- * separated in future
- */
-
 /*===========================================================================
   Internal Behaviors
 ===========================================================================*/
@@ -139,6 +139,20 @@
 #define SCM_USE_VALUECONS       0
 #endif /* SCM_OBJ_COMPACT */
 
+#if (SCM_USE_UTF8 || SCM_USE_EUCCN || SCM_USE_EUCJP || SCM_USE_EUCKR || SCM_USE_SJIS)
+#define SCM_USE_MULTIBYTE_CHAR  1
+#else
+#define SCM_USE_MULTIBYTE_CHAR  0
+#endif
+
+#if (   SCM_USE_UTF8_AS_DEFAULT  && !SCM_USE_UTF8                            \
+     || SCM_USE_EUCCN_AS_DEFAULT && !SCM_USE_EUCCN                           \
+     || SCM_USE_EUCJP_AS_DEFAULT && !SCM_USE_EUCJP                           \
+     || SCM_USE_EUCKR_AS_DEFAULT && !SCM_USE_EUCKR                           \
+     || SCM_USE_SJIS_AS_DEFAULT  && !SCM_USE_SJIS)
+#error "disabled character encoding is chosen as default"
+#endif
+
 /* for Scm_eval_c_string_internal() */
 #undef SCM_USE_SRFI6
 #define SCM_USE_SRFI6           1

Modified: branches/r5rs/sigscheme/encoding.c
===================================================================
--- branches/r5rs/sigscheme/encoding.c	2005-11-23 00:50:37 UTC (rev 2234)
+++ branches/r5rs/sigscheme/encoding.c	2005-11-23 01:21:27 UTC (rev 2235)
@@ -43,6 +43,7 @@
   System Include
 =======================================*/
 #include <stdlib.h>
+#include <stdio.h>  /* for EOF */
 #include <string.h>
 
 /*=======================================
@@ -55,10 +56,21 @@
 #endif
 
 /*=======================================
+  File Local Type Definitions
+=======================================*/
+/* FIXME: replace with C99-independent stdint */
+typedef unsigned char uchar;
+typedef unsigned int  uint;
+
+/*=======================================
   File Local Functions
 =======================================*/
 #if SCM_USE_EUCJP
+static const char *eucjp_encoding(void);
 static ScmMultibyteCharInfo eucjp_scan_char(ScmMultibyteString mbs);
+static int eucjp_str2int(const uchar *src, size_t len,
+                         ScmMultibyteState state);
+static uchar *eucjp_int2str(uchar *dst, int ch, ScmMultibyteState state);
 #endif
 
 #if SCM_USE_ISO2022KR
@@ -72,36 +84,140 @@
 #endif
 
 #if SCM_USE_SJIS
+static const char *sjis_encoding(void);
 static ScmMultibyteCharInfo sjis_scan_char(ScmMultibyteString mbs);
+static uchar *sjis_int2str(uchar *dst, int ch);
 #endif
 
+#if (SCM_USE_EUCCN || SCM_USE_EUCKR || SCM_USE_SJIS)
+/* generic double-byte char */
+static int dbc_str2int(const uchar *src, size_t len, ScmMultibyteState state);
+#endif
+
+#if (SCM_USE_EUCCN || SCM_USE_EUCKR)
+/* shared by EUCCN and EUCKR */
+static uchar *euc_int2str(uchar *dst, int ch, ScmMultibyteState state);
+#endif
+
 #if SCM_USE_EUCCN
+static const char *euccn_encoding(void);
 static ScmMultibyteCharInfo euccn_scan_char(ScmMultibyteString mbs);
 #endif
 
 #if SCM_USE_EUCKR
+static const char *euckr_encoding(void);
 static ScmMultibyteCharInfo euckr_scan_char(ScmMultibyteString mbs);
 #endif
 
 #if SCM_USE_UTF8
+static const char *utf8_encoding(void);
 static ScmMultibyteCharInfo utf8_scan_char(ScmMultibyteString mbs);
+static int utf8_str2int(const uchar *src, size_t len, ScmMultibyteState state);
+static uchar *utf8_int2str(uchar *dst, int ch, ScmMultibyteState state);
 #endif
 
+static const char *unibyte_encoding(void);
 static ScmMultibyteCharInfo unibyte_scan_char(ScmMultibyteString mbs);
+static int unibyte_str2int(const uchar *src, size_t len,
+                           ScmMultibyteState state);
+static uchar *unibyte_int2str(uchar *dst, int ch, ScmMultibyteState state);
 
-typedef unsigned char uchar;
+/*=======================================
+  Local Variables
+=======================================*/
+#if SCM_USE_UTF8
+static const ScmCharCodecVTbl utf8_codec_vtbl = {
+    &utf8_encoding,
+    &utf8_scan_char,
+    (ScmCharCodecMethod_str2int)&utf8_str2int,
+    (ScmCharCodecMethod_int2str)&utf8_int2str
+};
+#define utf8_codec (&utf8_codec_vtbl)
+#endif
 
+#if SCM_USE_EUCCN
+static const ScmCharCodecVTbl euccn_codec_vtbl = {
+    &euccn_encoding,
+    &euccn_scan_char,
+    (ScmCharCodecMethod_str2int)&dbc_str2int,
+    (ScmCharCodecMethod_int2str)&euc_int2str
+};
+#define euccn_codec (&euccn_codec_vtbl)
+#endif
+
+#if SCM_USE_EUCJP
+static const ScmCharCodecVTbl eucjp_codec_vtbl = {
+    &eucjp_encoding,
+    &eucjp_scan_char,
+    (ScmCharCodecMethod_str2int)&eucjp_str2int,
+    (ScmCharCodecMethod_int2str)&eucjp_int2str
+};
+#define eucjp_codec (&eucjp_codec_vtbl)
+#endif
+
+#if SCM_USE_EUCKR
+static const ScmCharCodecVTbl euckr_codec_vtbl = {
+    &euckr_encoding,
+    &euckr_scan_char,
+    (ScmCharCodecMethod_str2int)&dbc_str2int,
+    (ScmCharCodecMethod_int2str)&euc_int2str
+};
+#define euckr_codec (&euckr_codec_vtbl)
+#endif
+
+#if SCM_USE_SJIS
+static const ScmCharCodecVTbl sjis_codec_vtbl = {
+    &sjis_encoding,
+    &sjis_scan_char,
+    (ScmCharCodecMethod_str2int)&dbc_str2int,
+    (ScmCharCodecMethod_int2str)&sjis_int2str
+};
+#define sjis_codec (&sjis_codec_vtbl)
+#endif
+
+static const ScmCharCodecVTbl unibyte_codec_vtbl = {
+    &unibyte_encoding,
+    &unibyte_scan_char,
+    (ScmCharCodecMethod_str2int)&unibyte_str2int,
+    (ScmCharCodecMethod_int2str)&unibyte_int2str
+};
+#define unibyte_codec (&unibyte_codec_vtbl)
+
 /*=======================================
   Global Variables
 =======================================*/
-/* TODO: add some mechanism to dynamically switch between encodings. */
+/* FIXME: merge into Scm_current_char_codec */
 ScmMultibyteCharInfo (*Scm_mb_scan_char)(ScmMultibyteString mbs)
-#if SCM_USE_UTF8
+#if SCM_USE_UTF8_AS_DEFAULT
     = utf8_scan_char;
+#elif SCM_USE_EUCCN_AS_DEFAULT
+    = euccn_scan_char;
+#elif SCM_USE_EUCJP_AS_DEFAULT
+    = eucjp_scan_char;
+#elif SCM_USE_EUCKR_AS_DEFAULT
+    = eucjp_scan_char;
+#elif SCM_USE_SJIS_AS_DEFAULT
+    = sjis_scan_char;
 #else
     = unibyte_scan_char;
 #endif
 
+/* temporary solution */
+ScmCharCodec *Scm_current_char_codec
+#if SCM_USE_UTF8_AS_DEFAULT
+    = utf8_codec;
+#elif SCM_USE_EUCJP_AS_DEFAULT
+    = eucjp_codec;
+#elif SCM_USE_EUCCN_AS_DEFAULT
+    = euccn_codec;
+#elif SCM_USE_EUCKR_AS_DEFAULT
+    = euckr_codec;
+#elif SCM_USE_SJIS_AS_DEFAULT
+    = sjis_codec;
+#else
+    = unibyte_codec;
+#endif
+
 /*=======================================
   Public API
 =======================================*/
@@ -196,14 +312,24 @@
 #define IS_ASCII(c) ((uchar)(c) <= 0x7F)
 #define IS_GR_SPC_OR_DEL(c)  ((uchar)(c) == 0xA0 || (uchar)(c) == 0xFF)
 
+#define CHAR_BITS    8
+#define BYTE_MASK    0xFF
+#define IS_1BYTE(e)  ((uint)(e) <= 0x7F)
+#define IS_2BYTES(e) ((uint)(e) <= 0xFFFF)
+#define IS_3BYTES(e) ((uint)(e) <= ((SS3 << CHAR_BITS * 2) | 0xFFFF))
+
 #define ESC 0x1B
 #define SO  0x0E
 #define SI  0x0F
 #define SS2 0x8E
 #define SS3 0x8F
 
+#if SCM_USE_EUCJP
+static const char *eucjp_encoding(void)
+{
+    return "EUC-JP";
+}
 
-#if SCM_USE_EUCJP
 /* G0 <- (96) ASCII (or was it JIS X 0201 Roman?)
  * G1 <- (94x94) JIS X 0208 kanji/kana
  * G2 <- (94) JIS X 0201 Katakana ("half-width katakana")
@@ -250,9 +376,132 @@
 
     RETURN_ERROR();
 }
+
+static int eucjp_str2int(const uchar *src, size_t len, ScmMultibyteState state)
+{
+    int ch;
+
+    switch (len) {
+    case 1:
+        ch = src[0];
+        break;
+
+    case 2:
+        ch  = src[0] << CHAR_BITS;
+        ch |= src[1];
+        break;
+
+    case 3:
+        ch  = src[0] << CHAR_BITS;
+        ch |= src[1] << CHAR_BITS * 2;
+        ch |= src[2];
+        break;
+
+    default:
+        return EOF;
+    }
+
+    return ch;
+}
+
+/* TODO: migrate to a canonical form shared with ISO-2022 variants that contain
+   absolute character set identifier instead of raw encoding-dependent
+   shifts */
+static uchar *eucjp_int2str(uchar *dst, int ch, ScmMultibyteState state)
+{
+#if SCM_STRICT_ENCODING_CHECK
+    uchar seq[3];
+#endif
+
+    if (IS_1BYTE(ch)) {
+        *dst++ = ch;
+    } else if (IS_2BYTES(ch)) {
+#if SCM_STRICT_ENCODING_CHECK
+        seq[0] = ch >> CHAR_BITS;
+        seq[1] = ch & BYTE_MASK;
+        if ((!IN_GR94(seq[0]) && seq[0] != SS2)
+            || !IN_GR96(seq[1]))
+            return NULL;
+#endif
+        *dst++ = ch >> CHAR_BITS;
+        *dst++ = ch & BYTE_MASK;
+    } else if (IS_3BYTES(ch)) {
+#if SCM_STRICT_ENCODING_CHECK
+        seq[0] = ch >> CHAR_BITS * 2;
+        seq[1] = (ch >> CHAR_BITS) & BYTE_MASK;
+        seq[2] = ch & BYTE_MASK;
+        if (seq[0] != SS3 || !IN_GR94(seq[1]) || !IN_GR94(seq[2]))
+            return NULL;
+#endif
+        *dst++ = ch >> CHAR_BITS * 2;
+        *dst++ = (ch >> CHAR_BITS) & BYTE_MASK;
+        *dst++ = ch & BYTE_MASK;
+    } else {
+        return NULL;
+    }
+    *dst = '\0';
+
+    return dst;
+}
 #endif /* SCM_USE_EUCJP */
 
+#if (SCM_USE_EUCCN || SCM_USE_EUCKR || SCM_USE_SJIS)
+/* generic double-byte char */
+static int dbc_str2int(const uchar *src, size_t len, ScmMultibyteState state)
+{
+    int ch;
+
+    switch (len) {
+    case 1:
+        ch = src[0];
+        break;
+
+    case 2:
+        ch  = src[0] << CHAR_BITS;
+        ch |= src[1];
+        break;
+
+    default:
+        return EOF;
+    }
+
+    return ch;
+}
+#endif /* (SCM_USE_EUCCN || SCM_USE_EUCKR || SCM_USE_SJIS) */
+
+#if (SCM_USE_EUCCN || SCM_USE_EUCKR)
+static uchar *euc_int2str(uchar *dst, int ch, ScmMultibyteState state)
+{
+#if SCM_STRICT_ENCODING_CHECK
+    uchar seq[2];
+#endif
+
+    if (IS_1BYTE(ch)) {
+        *dst++ = ch;
+    } else if (IS_2BYTES(ch)) {
+#if SCM_STRICT_ENCODING_CHECK
+        seq[0] = ch >> CHAR_BITS;
+        seq[1] = ch & BYTE_MASK;
+        if (!IN_GR94(seq[0]) || !IN_GR96(seq[1]))
+            return NULL;
+#endif
+        *dst++ = ch >> CHAR_BITS;
+        *dst++ = ch & BYTE_MASK;
+    } else {
+        return NULL;
+    }
+    *dst = '\0';
+
+    return dst;
+}
+#endif /* (SCM_USE_EUCCN || SCM_USE_EUCKR) */
+
 #if SCM_USE_EUCCN
+static const char *euccn_encoding(void)
+{
+    return "EUC-CN";
+}
+
 /* FIXME: NOT TESTED!
  * 
  * G0 <- ASCII (or GB 1988?)
@@ -286,6 +535,11 @@
 #endif
 
 #if SCM_USE_EUCKR
+static const char *euckr_encoding(void)
+{
+    return "EUC-KR";
+}
+
 /* FIXME: NOT TESTED!  I'm not sure about this encoding.  There's also
  * a Microsoft variant called CP949, which is not supported (yet).
  * RFC 1557 says KS X 1001 is 94x94.
@@ -320,6 +574,10 @@
 #endif /* SCM_USE_EUCKR */
 
 /*==== Encodings for Unicode ====*/
+#define IN_OCT_BMP(u)  ((uint)(u) <= 0x7ff)
+#define IN_BMP(u)      ((uint)(u) <= 0xffff)
+#define IN_SMP(u)      ((uint)(u) <= 0x10ffff && !IN_BMP(u))
+
 #if SCM_USE_UTF8
 /* RFC 3629 */
 #define MASK(n)        ((LEN_CODE(n) >> 1) | 0x80)
@@ -327,6 +585,20 @@
 #define IS_LEN(c, n)   ((MASK(n) & (c)) == LEN_CODE(n))
 #define IS_TRAILING(c) (IS_LEN((c), 1))
 
+#define LEN_CODE_BITS(n)    (n + 1)
+#define TRAILING_CODE_BITS  LEN_CODE_BITS(1)
+#define TRAILING_VAL_BITS   (CHAR_BITS - TRAILING_CODE_BITS)
+#define LEADING_VAL_BITS(n) (CHAR_BITS - LEN_CODE_BITS(n))
+#define LEADING_VAL(u, n)                                                    \
+    ((u) >> (LEN_CODE_BITS(n) + TRAILING_VAL_BITS * ((n) - 1)))
+#define TRAILING_VAL(u, n, i)                                                \
+    (~MASK(1) & ((u) >> (TRAILING_VAL_BITS * ((n) - 2 - (i)))))
+
+static const char *utf8_encoding(void)
+{
+    return "UTF-8";
+}
+
 static ScmMultibyteCharInfo utf8_scan_char(ScmMultibyteString mbs)
 {
     const char *str = SCM_MBS_GET_STR(mbs);
@@ -363,10 +635,73 @@
 
 }
 
+static int utf8_str2int(const uchar *src, size_t len, ScmMultibyteState state)
+{
+    int ch;
+
+    switch (len) {
+    case 1:
+        ch = src[0];
+        break;
+
+    case 2:
+        ch  = (~MASK(2) & src[0]) << TRAILING_VAL_BITS;
+        ch |= (~MASK(1) & src[1]);
+        break;
+
+    case 3:
+        ch  = (~MASK(3) & src[0]) << TRAILING_VAL_BITS * 2;
+        ch |= (~MASK(1) & src[1]) << TRAILING_VAL_BITS;
+        ch |= (~MASK(1) & src[2]);
+        break;
+
+    case 4:
+        ch  = (~MASK(4) & src[0]) << TRAILING_VAL_BITS * 3;
+        ch |= (~MASK(1) & src[1]) << TRAILING_VAL_BITS * 2;
+        ch |= (~MASK(1) & src[2]) << TRAILING_VAL_BITS;
+        ch |= (~MASK(1) & src[3]);
+        break;
+
+    default:
+        return EOF;
+    }
+
+    return ch;
+}
+
+static uchar *utf8_int2str(uchar *dst, int ch, ScmMultibyteState state)
+{
+    if (IS_ASCII(ch)) {
+        *dst++ = ch;
+    } else if (IN_OCT_BMP(ch)) {
+        *dst++ = LEN_CODE(2) | LEADING_VAL(ch, 2);
+        *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 2, 0);
+    } else if (IN_BMP(ch)) {
+        *dst++ = LEN_CODE(3) | LEADING_VAL(ch, 3);
+        *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 3, 0);
+        *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 3, 1);
+    } else if (IN_SMP(ch)) {
+        *dst++ = LEN_CODE(4) | LEADING_VAL(ch, 4);
+        *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 4, 0);
+        *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 4, 1);
+        *dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 4, 2);
+    } else {
+        return NULL;
+    }
+    *dst = '\0';
+
+    return dst;
+}
 #undef MASK
 #undef LEN_CODE
 #undef IS_LEN
 #undef IS_TRAILING
+#undef LEN_CODE_BITS
+#undef TRAILING_CODE_BITS
+#undef TRAILING_VAL_BITS
+#undef LEADING_VAL_BITS
+#undef LEADING_VAL
+#undef TRAILING_VAL
 #endif /* SCM_USE_UTF8 */
 
 /*==== Other encodings ====*/
@@ -387,8 +722,6 @@
  * 0x40 .. 0x7E: trailing byte of 2-byte char
  * 0x80 .. 0xFC: trailing byte of 2-byte char
  */
-static ScmMultibyteCharInfo sjis_scan_char(ScmMultibyteString mbs)
-{
 #define IS_KANA(c) (0xA1 <= (uchar)(c) && (uchar)(c) <= 0xDF)
 #define IS_LEAD(c)        \
     (0x81 <= (uchar)(c)   \
@@ -397,6 +730,13 @@
     && (uchar)(c) != 0xA0)
 #define IS_TRAIL(c) (0x40 <= (uchar)(c) && (uchar)(c) <= 0xFC && (c) != 0x7E)
 
+static const char *sjis_encoding(void)
+{
+    return "SHIFT_JIS";
+}
+
+static ScmMultibyteCharInfo sjis_scan_char(ScmMultibyteString mbs)
+{
     const char *str = SCM_MBS_GET_STR(mbs);
     const int  size = SCM_MBS_GET_SIZE(mbs);
     ENTER;
@@ -413,11 +753,34 @@
         RETURN(2);
     }
     RETURN(1);
+}
 
+static uchar *sjis_int2str(uchar *dst, int ch)
+{
+    uchar seq[2];
+
+#if SCM_STRICT_ENCODING_CHECK
+    if (ch >> CHAR_BITS * 2)
+        return NULL;
+#endif
+    seq[0] = ch >> CHAR_BITS;
+    seq[1] = ch & BYTE_MASK;
+
+    *dst++ = seq[0];
+    if (IS_LEAD(seq[0])) {
+#if SCM_STRICT_ENCODING_CHECK
+        if (!IS_TRAIL(seq[1]))
+            return NULL;
+#endif
+        *dst++ = seq[1];
+    }
+    *dst = '\0';
+
+    return dst;
+}
 #undef IS_KANA
 #undef IS_LEAD
 #undef IS_TRAIL
-}
 #endif /* SCM_USE_SJIS */
 
 /* Single-byte encodings.  Please add any that you know are missing.
@@ -428,6 +791,12 @@
  * ISO-8859-*
  * VISCII
  */
+static const char *unibyte_encoding(void)
+{
+    /* conventional assumption */
+    return "ISO-8859-1";
+}
+
 static ScmMultibyteCharInfo unibyte_scan_char(ScmMultibyteString mbs)
 {
     ENTER;
@@ -435,3 +804,23 @@
         RETURN(1);
     RETURN(0);
 }
+
+static int unibyte_str2int(const uchar *src, size_t len,
+                           ScmMultibyteState state)
+{
+#if SCM_STRICT_ENCODING_CHECK
+    if (len != 1)
+        return EOF;
+#endif
+    return src[0];
+}
+
+static uchar *unibyte_int2str(uchar *dst, int ch, ScmMultibyteState state)
+{
+#if SCM_STRICT_ENCODING_CHECK
+    if (ch & ~BYTE_MASK)
+        return NULL;
+#endif
+    *dst++ = ch;
+    return dst;
+}

Modified: branches/r5rs/sigscheme/encoding.h
===================================================================
--- branches/r5rs/sigscheme/encoding.h	2005-11-23 00:50:37 UTC (rev 2234)
+++ branches/r5rs/sigscheme/encoding.h	2005-11-23 01:21:27 UTC (rev 2235)
@@ -92,10 +92,10 @@
 
 #define SCM_CHARCODEC_ENCODING(codec)           ((*codec->encoding)())
 #define SCM_CHARCODEC_SCAN_CHAR(codec, mbs)     ((*codec->scan_char)(mbs))
-#define SCM_CHARCODEC_STR2INT(codec, start, nbytes)                          \
-    ((*codec->str2int)((start), (nbytes)))
-#define SCM_CHARCODEC_INT2STR(codec, start, ch)                              \
-    ((*codec->int2str)((start), (ch)))
+#define SCM_CHARCODEC_STR2INT(codec, src, len, state)                        \
+    ((*codec->str2int)((src), (len), (state)))
+#define SCM_CHARCODEC_INT2STR(codec, dst, ch, state)                         \
+    ((*codec->int2str)((dst), (ch), (state)))
 
 /*=======================================
   Type Definitions
@@ -104,6 +104,8 @@
  * It might as well be defined as mbstate_t if we're using libc. */
 typedef int ScmMultibyteState;
 
+#define SCM_MB_STATELESS 0
+
 /* Metadata of a multibyte character.  These are usually allocated on
    stack or register, so we'll make liberal use of space. */
 typedef struct {
@@ -130,12 +132,16 @@
 } ScmMultibyteString;
 
 typedef struct ScmCharCodecVTbl_ ScmCharCodecVTbl;
-typedef ScmCharCodecVTbl ScmCharCodec;
+typedef const ScmCharCodecVTbl ScmCharCodec;
 
+/* FIXME: replace (char *) with (uchar *) once C99-independent stdint is
+   introduced */
 typedef const char *(*ScmCharCodecMethod_encoding)(void);
 typedef ScmMultibyteCharInfo (*ScmCharCodecMethod_scan_char)(ScmMultibyteString mbs);
-typedef int (*ScmCharCodecMethod_str2int)(const char *start, size_t nbytes);
-typedef char *(*ScmCharCodecMethod_int2str)(char *start, int ch);
+typedef int (*ScmCharCodecMethod_str2int)(const char *src, size_t len,
+                                          ScmMultibyteState state);
+typedef char *(*ScmCharCodecMethod_int2str)(char *dst, int ch,
+                                            ScmMultibyteState state);
 
 struct ScmCharCodecVTbl_ {
     ScmCharCodecMethod_encoding  encoding;
@@ -148,6 +154,7 @@
    Variable Declarations
 =======================================*/
 extern ScmMultibyteCharInfo (*Scm_mb_scan_char)(ScmMultibyteString mbs);
+extern ScmCharCodec *Scm_current_char_codec;
 
 /*=======================================
    Function Declarations

Modified: branches/r5rs/sigscheme/operations.c
===================================================================
--- branches/r5rs/sigscheme/operations.c	2005-11-23 00:50:37 UTC (rev 2234)
+++ branches/r5rs/sigscheme/operations.c	2005-11-23 01:21:27 UTC (rev 2235)
@@ -1075,25 +1075,34 @@
     return SCM_FALSE;
 }
 
+ScmObj ScmOp_char2integer(ScmObj obj)
+{
+    int val;
+    const char *str;
+    DECLARE_FUNCTION("char->integer", ProcedureFixed1);
+
+    ASSERT_CHARP(obj);
+
+    str = SCM_CHAR_VALUE(obj);
+    val = SCM_CHARCODEC_STR2INT(Scm_current_char_codec, str, strlen(str),
+                                SCM_MB_STATELESS);
+    if (val == EOF)
+        ERR_OBJ("invalid char value", obj);
+    return Scm_NewInt(val);
+}
+
 ScmObj ScmOp_integer2char(ScmObj obj)
 {
     int val;
-    char *buf;
+    char buf[SCM_MB_MAX_LEN + sizeof((char)'\0')];
     DECLARE_FUNCTION("integer->char", ProcedureFixed1);
 
     ASSERT_INTP(obj);
 
-    /* FIXME: only supports ASCII */
     val = SCM_INT_VALUE(obj);
-    if (isascii(val)) {
-        buf = malloc(sizeof(char) + sizeof((char)'\0'));
-        buf[0] = val;
-        buf[1] = '\0';
-        return Scm_NewChar(buf);
-    } else {
-        ERR_OBJ("current implementation only supports ASCII", obj);
-        /* NOTREACHED */
-    }
+    if (!SCM_CHARCODEC_INT2STR(Scm_current_char_codec, buf, val, SCM_MB_STATELESS))
+        ERR_OBJ("invalid char value", obj);
+    return Scm_NewChar(strdup(buf));
 }
 
 ScmObj ScmOp_char_upcase(ScmObj obj)

Modified: branches/r5rs/sigscheme/sigscheme.h
===================================================================
--- branches/r5rs/sigscheme/sigscheme.h	2005-11-23 00:50:37 UTC (rev 2234)
+++ branches/r5rs/sigscheme/sigscheme.h	2005-11-23 01:21:27 UTC (rev 2235)
@@ -513,6 +513,7 @@
 ScmObj ScmOp_char_whitespacep(ScmObj obj);
 ScmObj ScmOp_char_upper_casep(ScmObj obj);
 ScmObj ScmOp_char_lower_casep(ScmObj obj);
+ScmObj ScmOp_char2integer(ScmObj obj);
 ScmObj ScmOp_integer2char(ScmObj obj);
 ScmObj ScmOp_char_upcase(ScmObj obj);
 ScmObj ScmOp_char_downcase(ScmObj obj);



More information about the uim-commit mailing list