[uim-commit] r1858 - branches/r5rs/sigscheme
jun0 at freedesktop.org
jun0 at freedesktop.org
Mon Oct 17 23:45:25 PDT 2005
Author: jun0
Date: 2005-10-17 23:45:18 -0700 (Mon, 17 Oct 2005)
New Revision: 1858
Modified:
branches/r5rs/sigscheme/config.h
branches/r5rs/sigscheme/datas.c
branches/r5rs/sigscheme/debug.c
branches/r5rs/sigscheme/encoding.c
branches/r5rs/sigscheme/operations-siod.c
branches/r5rs/sigscheme/operations.c
branches/r5rs/sigscheme/sigscheme.h
branches/r5rs/sigscheme/sigschemeinternal.h
branches/r5rs/sigscheme/sigschemetype.h
Log:
Adds proper multibyte encoding support.
* sigscheme/config.h
- (SCM_USE_EUCJP, SCM_USE_SJIS, SCM_USE_UTF8, SCM_USE_EUCCN,
SCM_USE_EUCKR): New macros.
- (SCM_STRICT_ENCODING_CHECK, SCM_DEBUG_ENCODING): New macro.
* sigscheme/encoding.c
- sigschemeinternal.h: New include.
- (eucjp_scan_char, sjis_scan_char, euccn_scan_char,
euckr_scan_char, utf8_scan_char, unibyte_scan_char): New
functions.
- (uchar): New type.
- (Scm_mb_scan_char): New global variable.
- (eucjp_strlen, eucjp_str_startpos, eucjp_str_endpos): Remove.
- (SigScm_default_encoding_strlen): Rename to Scm_mb_strlen.
- (Scm_mb_bare_c_strlen): New function. Better name wanted.
- (Scm_mb_substring): New function.
- (ENTER, RETURN, RETURN_ERROR, RETURN_INCOMPLETE, SAVE_STATE,
EXPECT_SIZE, IN_CL, IN_CR, IN_GL94, IN_GL96, IN_GR94, IN_GR96,
IS_ASCII, ESC, SO, SI, SS2, SS3): New macros.
- (iso2022kr_scan_char, iso2022kr_scan_input_char,
iso2022jp_scan_char, iso2022jp_scan_input_char): Declared, but not
implemented.
* sigscheme/operations.c
- (Scm_mb_bare_c_strlen): Follow renaming in encoding.c.
- (ScmOp_string_ref, ScmOp_string_set, ScmOp_string_substring,
ScmOp_string2list): Use the new multibyte stuff.
* sigscheme/datas.c
- (Scm_NewChar, Scm_NewString, Scm_NewStringCopying): Use the new
multibyte stuff.
* sigscheme/sigscheme.h
- (SCM_DBG_ENCODING): New enum constant.
- (SigScm_default_encoding_strlen,
SigScm_default_encoding_str_startpos,
SigScm_default_encoding_str_endpos): Remove prototypes.
- (Scm_mb_strlen, Scm_mb_bare_c_strlen, Scm_mb_scan_char): New declarations.
- (Scm_mb_strref): New macro. (wrapper for Scm_mb_substring)
* sigscheme/sigschemetype.h
- (ScmMultibyteState, ScmMultibyteCharInfo, ScmMultibyteString): New types.
- (SCM_MBS_SET_STR, SCM_MBS_GET_STR, SCM_MBS_SET_SIZE,
SCM_MBS_GET_SIZE, SCM_MBCINFO_SET_SIZE, SCM_MBCINFO_GET_SIZE,
SCM_MBCINFO_CLEAR_STATE, SCM_MBCINFO_SET_STATE, SCM_MBCINFO_GET_STATE,
SCM_MBCINFO_CLEAR_FLAG, SCM_MBCINFO_SET_ERROR,
SCM_MBCINFO_SET_INCOMPLETE, SCM_MBCINFO_ERRORP,
SCM_MBCINFO_INCOMPLETEP, SCM_MBCINFO_INIT, SCM_MBS_GET_STATE,
SCM_MBS_SET_STATE, SCM_MBS_CLEAR_STATE, SCM_MBS_GET_STATE,
SCM_MBS_SET_STATE, SCM_MBS_CLEAR_STATE, SCM_MBS_INIT,
SCM_MBS_SKIP_CHAR): New macros.
* sigscheme/debug.c
- (SigScm_PredefinedDebugCategories): Add SCM_DBG_ENCODING.
- (SigScm_WriteToPort, SigScm_DisplayToPort): Untabify.
* sigscheme/operations-siod.c
- (ScmOp_closure_code, SigScm_SetVerboseLevel): Untabify.
* sigscheme/sigschemeinternal.h
- (ASSERT_NO_MORE_ARG): Correct error reporting macro.
Modified: branches/r5rs/sigscheme/config.h
===================================================================
--- branches/r5rs/sigscheme/config.h 2005-10-17 22:27:54 UTC (rev 1857)
+++ branches/r5rs/sigscheme/config.h 2005-10-18 06:45:18 UTC (rev 1858)
@@ -56,8 +56,17 @@
/*===========================================================================
Character Encoding Handlers
===========================================================================*/
-#define SCM_USE_EUCJP 1 /* use EUC-JP as internal encoding */
+/* Support for each encoding will be compiled in if the corresponding
+ * macro is defined as nonzero. */
+#define SCM_USE_EUCJP 0
+#define SCM_USE_SJIS 0
+#define SCM_USE_UTF8 1
+#define SCM_USE_EUCCN 0
+#define SCM_USE_EUCKR 0
+/* For now, edit encoding.c and change the initialization of
+ * Scm_mb_scan_char to change the default encoding. */
+
/* "which encodings are enabled" and "which encoding is the default" will be
* separated in future
*/
@@ -67,6 +76,7 @@
===========================================================================*/
#define SCM_STRICT_R5RS 0 /* use strict R5RS check */
#define SCM_STRICT_ARGCHECK 1 /* enable strict argument check */
+#define SCM_STRICT_ENCODING_CHECK 1 /* do all feasible encoding error checks */
#define SCM_ACCESSOR_ASSERT 0 /* enable strict type check with accessor */
#define SCM_USE_VALUECONS 1 /* use experimental values passing */
#define SCM_VOLATILE_OUTPUT 0 /* always flush files on write */
@@ -80,6 +90,7 @@
#define SCM_DEBUG 1 /* enable debugging features */
#define SCM_DEBUG_GC 0 /* enable GC debugging */
#define SCM_DEBUG_PARSER 0 /* enable parser debugging */
+#define SCM_DEBUG_ENCODING 0 /* debug encoding-related functions */
#define SCM_DEBUG_BACKTRACE_SEP 1 /* enable frame-separator on backtrace */
#define SCM_DEBUG_BACKTRACE_VAL 1 /* enable values printing on backtrace */
Modified: branches/r5rs/sigscheme/datas.c
===================================================================
--- branches/r5rs/sigscheme/datas.c 2005-10-17 22:27:54 UTC (rev 1857)
+++ branches/r5rs/sigscheme/datas.c 2005-10-18 06:45:18 UTC (rev 1858)
@@ -721,11 +721,13 @@
ScmObj Scm_NewChar(char *ch)
{
ScmObj obj = SCM_FALSE;
+ int len;
- /* check length */
- if (SigScm_default_encoding_strlen(ch) != 1) {
+ /* assert length == 1 */
+ len = Scm_mb_bare_c_strlen(ch);
+ if (len != 1) {
SigScm_Error("Scm_NewChar : invalid character ch = [%s], len = %d",
- ch, SigScm_default_encoding_strlen(ch));
+ ch, len);
}
SCM_NEW_OBJ_INTERNAL(obj);
@@ -744,7 +746,7 @@
SCM_ENTYPE_STRING(obj);
SCM_STRING_SET_STR(obj, str);
- SCM_STRING_SET_LEN(obj, SigScm_default_encoding_strlen(str));
+ SCM_STRING_SET_LEN(obj, str ? Scm_mb_bare_c_strlen(str) : 0);
return obj;
}
@@ -758,7 +760,7 @@
SCM_ENTYPE_STRING(obj);
SCM_STRING_SET_STR(obj, strdup(str));
- SCM_STRING_SET_LEN(obj, SigScm_default_encoding_strlen(str));
+ SCM_STRING_SET_LEN(obj, Scm_mb_bare_c_strlen(str));
return obj;
}
Modified: branches/r5rs/sigscheme/debug.c
===================================================================
--- branches/r5rs/sigscheme/debug.c 2005-10-17 22:27:54 UTC (rev 1857)
+++ branches/r5rs/sigscheme/debug.c 2005-10-18 06:45:18 UTC (rev 1858)
@@ -142,6 +142,9 @@
#if SCM_DEBUG_GC
| SCM_DBG_GC
#endif
+#if SCM_DEBUG_ENCODING
+ | SCM_DBG_ENCODING
+#endif
);
#else /* SCM_DEBUG */
return SCM_DBG_NONE;
@@ -182,7 +185,7 @@
DECLARE_INTERNAL_FUNCTION("SigScm_WriteToPort");
if (FALSEP(port))
- return;
+ return;
ASSERT_PORTP(port);
if (SCM_PORT_PORTDIRECTION(port) != PORT_OUTPUT)
@@ -201,7 +204,7 @@
DECLARE_INTERNAL_FUNCTION("SigScm_DisplayToPort");
if (FALSEP(port))
- return;
+ return;
ASSERT_PORTP(port);
if (SCM_PORT_PORTDIRECTION(port) != PORT_OUTPUT)
Modified: branches/r5rs/sigscheme/encoding.c
===================================================================
--- branches/r5rs/sigscheme/encoding.c 2005-10-17 22:27:54 UTC (rev 1857)
+++ branches/r5rs/sigscheme/encoding.c 2005-10-18 06:45:18 UTC (rev 1858)
@@ -31,6 +31,12 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
===========================================================================*/
+
+/* Acknowledgement: much information was gained from the
+ * i18n-introduction of the debian project. Many thanks to its
+ * authors, Tomohiro KUBOTA, et al. */
+
+
/*=======================================
System Include
=======================================*/
@@ -39,107 +45,379 @@
Local Include
=======================================*/
#include "sigscheme.h"
+#include "sigschemeinternal.h"
/*=======================================
- File Local Struct Declarations
+ File Local Functions
=======================================*/
+#if SCM_USE_EUCJP
+static ScmMultibyteCharInfo eucjp_scan_char(ScmMultibyteString mbs);
+#endif
-/*=======================================
- File Local Macro Declarations
-=======================================*/
+#if SCM_USE_ISO2022KR
+static ScmMultibyteCharInfo iso2022kr_scan_char(ScmMultibyteString mbs);
+static ScmMultibyteCharInfo iso2022kr_scan_input_char(ScmMultibyteString mbs);
+#endif
+#if SCM_USE_ISO2022JP
+static ScmMultibyteCharInfo iso2022jp_scan_char(ScmMultibyteString mbs);
+static ScmMultibyteCharInfo iso2022jp_scan_input_char(ScmMultibyteString mbs);
+#endif
+
+#if SCM_USE_SJIS
+static ScmMultibyteCharInfo sjis_scan_char(ScmMultibyteString mbs);
+#endif
+
+#if SCM_USE_EUCCN
+static ScmMultibyteCharInfo euccn_scan_char(ScmMultibyteString mbs);
+#endif
+
+#if SCM_USE_EUCKR
+static ScmMultibyteCharInfo euckr_scan_char(ScmMultibyteString mbs);
+#endif
+
+#if SCM_USE_UTF8
+static ScmMultibyteCharInfo utf8_scan_char(ScmMultibyteString mbs);
+#endif
+
+static ScmMultibyteCharInfo unibyte_scan_char(ScmMultibyteString mbs);
+
+typedef unsigned char uchar;
+
/*=======================================
- Variable Declarations
+ Global Variables
=======================================*/
+/* TODO: add some mechanism to dynamically switch between encodings. */
+ScmMultibyteCharInfo (*Scm_mb_scan_char)(ScmMultibyteString mbs)
+ = utf8_scan_char;
/*=======================================
- File Local Function Declarations
+ Public API
=======================================*/
-static int eucjp_strlen(const char *p);
-static const char* eucjp_str_startpos(const char *p, int k);
-static const char* eucjp_str_endpos(const char *p, int k);
-/*=======================================
- Function Implementations
-=======================================*/
-int SigScm_default_encoding_strlen(const char *str)
+int Scm_mb_strlen(ScmMultibyteString mbs)
{
-#if SCM_USE_EUCJP
- return eucjp_strlen(str);
-#endif
+ int len = 0;
+ ScmMultibyteCharInfo c;
+
+ CDBG((SCM_DBG_ENCODING, "mb_strlen: size = %d; str = %s;",
+ SCM_MBS_GET_SIZE(mbs), SCM_MBS_GET_STR(mbs)));
+
+ while (SCM_MBS_GET_SIZE(mbs)) {
+ c = Scm_mb_scan_char(mbs);
+ CDBG((SCM_DBG_ENCODING, "%d, %d;", SCM_MBCINFO_GET_SIZE(c), c.flag));
+ SCM_MBS_SKIP_CHAR(mbs, c);
+ len++;
+ }
+
+ CDBG((SCM_DBG_ENCODING, "len=%d\n", len));
+ return len;
}
-const char* SigScm_default_encoding_str_startpos(const char *str, int k)
+/* FIXME: pick a better name. */
+int Scm_mb_bare_c_strlen(const char *s)
{
-#if SCM_USE_EUCJP
- return eucjp_str_startpos(str, k);
-#endif
+ ScmMultibyteString mbs;
+ SCM_MBS_INIT(mbs);
+ SCM_MBS_SET_STR(mbs, s);
+ SCM_MBS_SET_SIZE(mbs, strlen(s));
+ return Scm_mb_strlen(mbs);
}
-const char* SigScm_default_encoding_str_endpos(const char *str, int k)
+ScmMultibyteString Scm_mb_substring(ScmMultibyteString mbs, int i, int len)
{
+ ScmMultibyteString ret;
+ ScmMultibyteString end;
+ ScmMultibyteCharInfo c;
+
+ ret = mbs;
+
+ while (i--) {
+ c = Scm_mb_scan_char(ret);
+ SCM_MBS_SKIP_CHAR(ret, c);
+ }
+
+ end = ret;
+
+ while (len--) {
+ c = Scm_mb_scan_char(end);
+ SCM_MBS_SKIP_CHAR(end, c);
+ }
+
+ SCM_MBS_SET_SIZE(ret, SCM_MBS_GET_STR(end) - SCM_MBS_GET_STR(ret));
+ return ret;
+}
+
+
+/*=======================================
+ Encoding-specific functions
+=======================================*/
+
+/* Every encoding implements the <encoding name>_scan_char()
+ * primitive. Its job is to determine the length of the first
+ * character in the given string. Stateful encodings should save
+ * their state *at exit*, that is, the state right after reading the
+ * first character (so don't omit it). */
+
+/* Convenience macros. Start with ENTER and return with RETURN*.
+ * EXPECT_SIZE() declares the expected length of the character. We'll
+ * use it to return information on how many octets are missing. It
+ * also serves as documentation. */
+#define ENTER ScmMultibyteCharInfo _ret; SCM_MBCINFO_INIT(_ret)
+#define RETURN(n) do { SCM_MBCINFO_SET_SIZE(_ret, n); return _ret; } while (0)
+#define RETURN_ERROR() do { SCM_MBCINFO_SET_ERROR(_ret); RETURN(1); } while (0)
+#define RETURN_INCOMPLETE(n) do { SCM_MBCINFO_SET_INCOMPLETE(_ret); RETURN(n); } while (0)
+#define SAVE_STATE(stat) (SCM_MBCINFO_SET_STATE(_ret, (stat)))
+#define EXPECT_SIZE(size) /* Currently ignored. */
+
+/* Encodings based on ISO/IEC 2022. */
+
+/* Control regions. */
+#define IN_CL(c) ((uchar)(c) < 0x20)
+#define IN_CR(c) (0x80 <= (uchar)(c) && (uchar)(c) <= 0x9F)
+
+/* General purpose regions. */
+#define IN_GL94(c) (0x21 <= (uchar)(c) && (uchar)(c) <= 0x7E)
+#define IN_GL96(c) (0x20 <= (uchar)(c) && (uchar)(c) <= 0x7F)
+#define IN_GR94(c) (0xA1 <= (uchar)(c) && (uchar)(c) <= 0xFE)
+#define IN_GR96(c) (0xA0 <= (uchar)(c) && (uchar)(c) <= 0xFF)
+
+#define IS_ASCII(c) ((uchar)(c) <= 0x7F)
+
+#define ESC 0x1B
+#define SO 0x0E
+#define SI 0x0F
+#define SS2 0x8E
+#define SS3 0x8F
+
+
#if SCM_USE_EUCJP
- return eucjp_str_endpos(str, k);
-#endif
+/* G0 <- (96) ASCII (or was it JIS X 0201 Roman?)
+ * G1 <- (94x94) JIS X 0208 kanji/kana
+ * G2 <- (94) JIS X 0201 Katakana ("half-width katakana")
+ * G3 <- (94x94) JIS X 0212 kanji, or JIS X 0213 kanji plane 2
+ *
+ * GL <- G0 (ASCII)
+ * GR <- G1 (JIS X 0208)
+ * CL <- JIS X 0211 C0
+ * CR <- JIS X 0211 C1 */
+static ScmMultibyteCharInfo eucjp_scan_char(ScmMultibyteString mbs)
+{
+ const char *str = SCM_MBS_GET_STR(mbs);
+ const int size = SCM_MBS_GET_SIZE(mbs);
+ ENTER;
+
+ if (!size)
+ RETURN(0);
+
+ if (IN_CL(str[0]) || IN_GL96(str[0]))
+ RETURN(1);
+ else if (IN_GR94(str[0]) || (uchar)str[0] == SS2) {
+ EXPECT_SIZE(2);
+ if (size < 2) RETURN_INCOMPLETE(1);
+#if SCM_STRICT_ENCODING_CHECK
+ if (!IN_GR94(str[1])) RETURN_ERROR();
+#endif
+ RETURN(2);
+ } else if ((uchar)str[0] == SS3) {
+ EXPECT_SIZE(3);
+#if SCM_STRICT_ENCODING_CHECK
+ if (size < 2) RETURN_INCOMPLETE(size);
+ if (!IN_GR94(str[1])) RETURN_ERROR();
+ if (size < 3) RETURN_INCOMPLETE(size);
+ if (!IN_GR94(str[2])) RETURN_ERROR();
+ RETURN(3);
+#else /* not SCM_STRICT_ENCODING_CHECK */
+ if (size < 3)
+ RETURN_INCOMPLETE(size);
+ RETURN(3);
+#endif /* not SCM_STRICT_ENCODING_CHECK */
+ }
+
+ RETURN_ERROR();
}
+#endif /* SCM_USE_EUCJP */
-static int eucjp_strlen(const char *str)
+#if SCM_USE_EUCCN
+/* FIXME: NOT TESTED!
+ *
+ * G0 <- ASCII (or GB 1988?)
+ * G1 <- GB2312
+ *
+ * GL <- G0 (ASCII)
+ * GR <- G1 (GB2312) */
+static ScmMultibyteCharInfo euccn_scan_char(ScmMultibyteString mbs)
{
- int len = 0;
- const unsigned char *cur = (const unsigned char *)str;
- while (*cur) {
- if (*cur > 127) {
- /* 2 bytes */
- cur++;
- }
+ /* TODO: maybe we can make this an alias of eucjp_scan_char()? */
+ const char *str = SCM_MBS_GET_STR(mbs);
+ const int size = SCM_MBS_GET_SIZE(mbs);
+ ENTER;
- cur++;
- len++;
+ if (!size)
+ RETURN(0);
+ if (IS_ASCII(str[0]))
+ RETURN(1);
+ if (IN_GR94(str[0])) {
+ EXPECT_SIZE(2);
+ if (size < 2)
+ RETURN_INCOMPLETE(size);
+#if SCM_STRICT_ENCODING_CHECK
+ if (!IN_GR94(str[1]))
+ RETURN_ERROR();
+#endif
+ RETURN(2);
}
+ RETURN_ERROR();
+}
+#endif
- return len;
+#if SCM_USE_EUCKR
+/* FIXME: NOT TESTED! I'm not sure about this encoding. There's also
+ * a Microsoft variant called CP949, which is not supported (yet).
+ * RFC 1557 says KS X 1001 is 94x94.
+ *
+ * G0 <- ASCII
+ * G1 <- KS X 1001 (aka KSC 5601)
+ *
+ * GL <- G0
+ * GR <- G1 */
+static ScmMultibyteCharInfo euckr_scan_char(ScmMultibyteString mbs)
+{
+ const char *str = SCM_MBS_GET_STR(mbs);
+ const int size = SCM_MBS_GET_SIZE(mbs);
+ ENTER;
+
+ if (!size)
+ RETURN(0);
+ if (IS_ASCII(str[0]))
+ RETURN(1);
+ if (IN_GR94(str[0])) {
+ EXPECT_SIZE(2);
+ if (size < 2)
+ RETURN_INCOMPLETE(size);
+#if SCM_STRICT_ENCODING_CHECK
+ if (!IN_GR94(str[1]))
+ RETURN_ERROR();
+#endif
+ RETURN(2);
+ }
+ RETURN_ERROR();
}
+#endif /* SCM_USE_EUCKR */
-static const char* eucjp_str_startpos(const char *str, int k)
+/*==== Encodings for Unicode ====*/
+#if SCM_USE_UTF8
+/* RFC 3629 */
+#define MASK(n) ((LEN_CODE(n) >> 1) | 0x80)
+#define LEN_CODE(n) (((1 << (n))-1) << (8-n))
+#define IS_LEN(c, n) ((MASK(n) & (c)) == LEN_CODE(n))
+#define IS_TRAILING(c) (IS_LEN((c), 1))
+
+static ScmMultibyteCharInfo utf8_scan_char(ScmMultibyteString mbs)
{
- int len = 0;
- const unsigned char *cur = (const unsigned char *)str;
- while (*cur) {
- if (len == k)
- return (const char *)cur;
+ const char *str = SCM_MBS_GET_STR(mbs);
+ const int size = SCM_MBS_GET_SIZE(mbs);
+ int len;
+ ENTER;
- if (*cur > 127) {
- /* 2 bytes */
- cur++;
+ if (!size)
+ RETURN(0);
+ if (IS_ASCII(str[0]))
+ RETURN(1);
+
+ if (IS_LEN(str[0], 2)) len = 2;
+ else if (IS_LEN(str[0], 3)) len = 3;
+ else if (IS_LEN(str[0], 4)) len = 4;
+ else RETURN_ERROR();
+
+#if SCM_STRICT_ENCODING_CHECK
+ {
+ int i;
+ for (i=1; i < len; i++) {
+ if (size <= i)
+ RETURN_INCOMPLETE(size);
+ if (!IS_TRAILING(str[i]))
+ RETURN_ERROR();
}
-
- cur++;
- len++;
}
+#else /* not SCM_STRICT_ENCODING_CHECK */
+ if (size < len)
+ RETURN_INCOMPLETE(size);
+#endif /* not SCM_STRICT_ENCODING_CHECK */
- return (const char*)cur;
+ RETURN(len);
+
}
-static const char* eucjp_str_endpos(const char *str, int k)
+#undef MASK
+#undef LEN_CODE
+#undef IS_LEN
+#undef IS_TRAILING
+#endif /* SCM_USE_UTF8 */
+
+/*==== Other encodings ====*/
+
+#if SCM_USE_SJIS
+/* The cwazy Japanese encoding. This function implements the JIS X
+ * 0213 variant.
+ *
+ * 0 .. 0x7F: ASCII
+ * 0x80: undefined
+ * 0x81 .. 0x9F: lead byte of 2-byte char
+ * 0xA0: undefined
+ * 0xA1 .. 0xDF: JIS X 0201 katakana (1 byte)
+ * 0xE0 .. 0xEF: lead byte of 2-byte char
+ * 0xF0 .. 0xFC: lead byte of 2-byte char if JIS X 0213 is used
+ * 0xFD .. 0xFF: undefined
+ *
+ * 0x40 .. 0x7E: trailing byte of 2-byte char
+ * 0x80 .. 0xFC: trailing byte of 2-byte char
+ */
+static ScmMultibyteCharInfo sjis_scan_char(ScmMultibyteString mbs)
{
- int len = 0;
- const unsigned char *cur = (const unsigned char *)str;
- while (*cur) {
- if (*cur > 127) {
- /* 2 bytes */
- cur++;
- }
+#define IS_KANA(c) (0xA1 <= (uchar)(c) && (uchar)(c) <= 0xDF)
+#define IS_LEAD(c) \
+ (0x81 <= (uchar)(c) \
+ && !IS_KANA(c) \
+ && (uchar)(c) <= 0xFC \
+ && (uchar)(c) != 0xA0)
+#define IS_TRAIL(c) (0x40 <= (uchar)(c) && (uchar)(c) <= 0xFC && (c) != 0x7E)
- cur++;
- len++;
-
- if (len == k + 1)
- return (const char *)cur;
+ const char *str = SCM_MBS_GET_STR(mbs);
+ const int size = SCM_MBS_GET_SIZE(mbs);
+ ENTER;
+ if (!size)
+ RETURN(0);
+ if (IS_LEAD(str[0])) {
+ EXPECT_SIZE(2);
+ if (size < 2)
+ RETURN_INCOMPLETE(size);
+#if SCM_STRICT_ENCODING_CHECK
+ if (!IS_TRAIL(str[1]))
+ RETURN_ERROR();
+#endif
+ RETURN(2);
}
-
- if (len == k + 1)
- return (const char *)cur;
+ RETURN(1);
- SigScm_Error("eucjp_str_startpos : unreachable point");
- return NULL;
+#undef IS_KANA
+#undef IS_LEAD
+#undef IS_TRAIL
}
+#endif /* SCM_USE_SJIS */
+
+/* Single-byte encodings. Please add any that you know are missing.
+ * Sorted alphabetically.
+ *
+ * ASCII
+ * ISO 646
+ * ISO-8859-*
+ * VISCII
+ */
+static ScmMultibyteCharInfo unibyte_scan_char(ScmMultibyteString mbs)
+{
+ ENTER;
+ if (SCM_MBS_GET_SIZE(mbs))
+ RETURN(1);
+ RETURN(0);
+}
Modified: branches/r5rs/sigscheme/operations-siod.c
===================================================================
--- branches/r5rs/sigscheme/operations-siod.c 2005-10-17 22:27:54 UTC (rev 1857)
+++ branches/r5rs/sigscheme/operations-siod.c 2005-10-18 06:45:18 UTC (rev 1858)
@@ -180,9 +180,9 @@
exp = SCM_CLOSURE_EXP(closure);
if (NULLP(CDDR(exp)))
- body = CADR(exp);
+ body = CADR(exp);
else
- body = CONS(Scm_Intern("begin"), CDR(exp));
+ body = CONS(Scm_Intern("begin"), CDR(exp));
return CONS(CAR(exp), body);
}
@@ -211,7 +211,7 @@
SigScm_Error("SigScm_SetVerboseLevel : negative value has been given");
if (sscm_verbose_level == level)
- return;
+ return;
sscm_verbose_level = level;
Modified: branches/r5rs/sigscheme/operations.c
===================================================================
--- branches/r5rs/sigscheme/operations.c 2005-10-17 22:27:54 UTC (rev 1857)
+++ branches/r5rs/sigscheme/operations.c 2005-10-18 06:45:18 UTC (rev 1858)
@@ -1258,31 +1258,32 @@
{
DECLARE_FUNCTION("string-length", ProcedureFixed1);
ASSERT_STRINGP(str);
- return Scm_NewInt(SigScm_default_encoding_strlen(SCM_STRING_STR(str)));
+ return Scm_NewInt(Scm_mb_bare_c_strlen(SCM_STRING_STR(str)));
}
ScmObj ScmOp_string_ref(ScmObj str, ScmObj k)
{
int c_index = 0;
char *new_ch = NULL;
- const char *string_str = NULL;
- const char *ch_start_ptr = NULL;
- const char *ch_end_ptr = NULL;
+ ScmMultibyteString mbs;
DECLARE_FUNCTION("string-ref", ProcedureFixed2);
ASSERT_STRINGP(str);
ASSERT_INTP(k);
+ SCM_MBS_INIT(mbs);
/* get start_ptr and end_ptr */
c_index = SCM_INT_VALUE(k);
- string_str = SCM_STRING_STR(str);
- ch_start_ptr = SigScm_default_encoding_str_startpos(string_str, c_index);
- ch_end_ptr = SigScm_default_encoding_str_endpos(string_str, c_index);
+ SCM_MBS_SET_STR(mbs, SCM_STRING_STR(str));
+ /* FIXME: This strlen() can be eliminated. */
+ SCM_MBS_SET_SIZE(mbs, strlen(SCM_STRING_STR(str)));
+ mbs = Scm_mb_strref(mbs, c_index);
+
/* copy from start_ptr to end_ptr */
- new_ch = (char*)malloc(sizeof(char) * (ch_end_ptr - ch_start_ptr) + 1);
- memset(new_ch, 0, sizeof(char) * (ch_end_ptr - ch_start_ptr) + 1);
- strncpy(new_ch, ch_start_ptr, (ch_end_ptr - ch_start_ptr));
+ new_ch = (char*)malloc(SCM_MBS_GET_SIZE(mbs) + 1);
+ memcpy(new_ch, SCM_MBS_GET_STR(mbs), SCM_MBS_GET_SIZE(mbs));
+ new_ch[SCM_MBS_GET_SIZE(mbs)] = 0;
return Scm_NewChar(new_ch);
}
@@ -1290,14 +1291,13 @@
ScmObj ScmOp_string_set(ScmObj str, ScmObj k, ScmObj ch)
{
int c_start_index = 0;
- int front_size = 0;
+ int prefix_size = 0;
int newch_size = 0;
- int back_size = 0;
+ int postfix_size = 0;
int total_size = 0;
char *new_str = NULL;
+ ScmMultibyteString mbs;
const char *string_str = NULL;
- const char *ch_start_ptr = NULL;
- const char *ch_end_ptr = NULL;
DECLARE_FUNCTION("string-set!", ProcedureFixed3);
ASSERT_STRINGP(str);
@@ -1307,23 +1307,28 @@
/* get indexes */
c_start_index = SCM_INT_VALUE(k);
string_str = SCM_STRING_STR(str);
- ch_start_ptr = SigScm_default_encoding_str_startpos(string_str, c_start_index);
- ch_end_ptr = SigScm_default_encoding_str_endpos(string_str, c_start_index);
+ /* FIXME: can string_str be NULL at this point or not? */
+ if (!string_str) string_str = "";
+ /* FIXME: strlen() can be eliminiated. */
+ SCM_MBS_INIT(mbs);
+ SCM_MBS_SET_STR(mbs, string_str);
+ SCM_MBS_SET_SIZE(mbs, strlen(string_str));
+ mbs = Scm_mb_strref(mbs, c_start_index);
+
/* calculate total size */
- front_size = strlen(string_str) - strlen(ch_start_ptr);
- newch_size = strlen(SCM_CHAR_VALUE(ch));
- back_size = strlen(ch_end_ptr);
- total_size = front_size + newch_size + back_size;
+ prefix_size = SCM_MBS_GET_STR(mbs) - string_str;
+ newch_size = strlen(SCM_CHAR_VALUE(ch));
+ postfix_size = strlen(SCM_MBS_GET_STR(mbs) + SCM_MBS_GET_SIZE(mbs));
+ total_size = prefix_size + newch_size + postfix_size;
- /* copy each parts */
+ /* copy each part */
new_str = (char*)malloc(total_size + 1);
- memset(new_str, 0, total_size + 1);
- strncpy(new_str , string_str , front_size);
- strncpy(new_str + front_size , SCM_CHAR_VALUE(ch) , newch_size);
- strncpy(new_str + front_size + newch_size , ch_end_ptr , back_size);
+ memcpy(new_str, string_str, prefix_size);
+ memcpy(new_str+prefix_size, SCM_CHAR_VALUE(ch), newch_size);
+ memcpy(new_str+prefix_size+newch_size,
+ SCM_MBS_GET_STR(mbs)+SCM_MBS_GET_SIZE(mbs), postfix_size);
- /* set */
if (SCM_STRING_STR(str))
free(SCM_STRING_STR(str));
@@ -1349,10 +1354,9 @@
{
int c_start_index = 0;
int c_end_index = 0;
- char *new_str = NULL;
+ char *new_str = NULL;
+ ScmMultibyteString mbs;
const char *string_str = NULL;
- const char *ch_start_ptr = NULL;
- const char *ch_end_ptr = NULL;
DECLARE_FUNCTION("substring", ProcedureFixed3);
ASSERT_STRINGP(str);
@@ -1364,18 +1368,22 @@
c_end_index = SCM_INT_VALUE(end);
/* sanity check */
- if (c_start_index == c_end_index)
- return Scm_NewStringCopying("");
+ if (c_start_index > c_end_index)
+ ERR("substring: start index is greater than end index.");
+ if (c_end_index > SCM_STRING_LEN(str))
+ ERR_OBJ("index out of range", end);
- /* get str */
- string_str = SCM_STRING_STR(str);
- ch_start_ptr = SigScm_default_encoding_str_startpos(string_str, c_start_index);
- ch_end_ptr = SigScm_default_encoding_str_startpos(string_str, c_end_index);
+ /* FIXME: strlen() can be eliminated. */
+ string_str = SCM_STRING_STR(str);
+ SCM_MBS_INIT(mbs);
+ SCM_MBS_SET_STR(mbs, string_str);
+ SCM_MBS_SET_SIZE(mbs, strlen(string_str));
+ mbs = Scm_mb_substring(mbs, c_start_index, c_end_index - c_start_index);
/* copy from start_ptr to end_ptr */
- new_str = (char*)malloc(sizeof(char) * (ch_end_ptr - ch_start_ptr) + 1);
- memset(new_str, 0, sizeof(char) * (ch_end_ptr - ch_start_ptr) + 1);
- strncpy(new_str, ch_start_ptr, sizeof(char) * (ch_end_ptr - ch_start_ptr));
+ new_str = (char*)malloc(SCM_MBS_GET_SIZE(mbs) + 1);
+ memcpy(new_str, SCM_MBS_GET_STR(mbs), SCM_MBS_GET_SIZE(mbs));
+ new_str[SCM_MBS_GET_SIZE(mbs)] = 0;
return Scm_NewString(new_str);
}
@@ -1420,39 +1428,35 @@
ScmObj ScmOp_string2list(ScmObj string)
{
- char *string_str = NULL;
- int str_len = 0;
ScmObj head = SCM_NULL;
- ScmObj prev = NULL;
- ScmObj next = NULL;
- int i = 0;
- const char *ch_start_ptr = NULL;
- const char *ch_end_ptr = NULL;
- char *new_ch = NULL;
+ ScmObj tail = SCM_NULL;
+ ScmObj next = SCM_NULL;
+ ScmMultibyteString mbs;
+ ScmMultibyteCharInfo ch;
+ char *buf;
DECLARE_FUNCTION("string->list", string);
ASSERT_STRINGP(string);
- string_str = SCM_STRING_STR(string);
- str_len = SCM_STRING_LEN(string);
- if (str_len == 0)
- return SCM_NULL;
+ SCM_MBS_INIT(mbs);
+ SCM_MBS_SET_STR(mbs, SCM_STRING_STR(string));
+ SCM_MBS_SET_SIZE(mbs, strlen(SCM_STRING_STR(string)));
- for (i = 0; i < str_len; i++) {
- ch_start_ptr = SigScm_default_encoding_str_startpos(string_str, i);
- ch_end_ptr = SigScm_default_encoding_str_endpos(string_str, i);
+ while (SCM_MBS_GET_SIZE(mbs)) {
+ ch = Scm_mb_scan_char(mbs);
+ buf = malloc(SCM_MBCINFO_GET_SIZE(ch)+1);
+ memcpy(buf, SCM_MBS_GET_STR(mbs), SCM_MBCINFO_GET_SIZE(ch));
+ buf[SCM_MBCINFO_GET_SIZE(ch)] = 0;
+ next = LIST_1(Scm_NewChar(buf));
- new_ch = (char*)malloc(sizeof(char) * (ch_end_ptr - ch_start_ptr + 1));
- memset(new_ch, 0, sizeof(char) * (ch_end_ptr - ch_start_ptr + 1));
- strncpy(new_ch, ch_start_ptr, (sizeof(char) * (ch_end_ptr - ch_start_ptr)));
+ if (NULLP(tail))
+ head = tail = next;
+ else {
+ SET_CDR(tail, next);
+ tail = CDR(tail);
+ }
- next = CONS(Scm_NewChar(new_ch), SCM_NULL);
- if (prev)
- SET_CDR(prev, next);
- else
- head = next;
-
- prev = next;
+ SCM_MBS_SKIP_CHAR(mbs, ch);
}
return head;
Modified: branches/r5rs/sigscheme/sigscheme.h
===================================================================
--- branches/r5rs/sigscheme/sigscheme.h 2005-10-17 22:27:54 UTC (rev 1857)
+++ branches/r5rs/sigscheme/sigscheme.h 2005-10-18 06:45:18 UTC (rev 1858)
@@ -161,6 +161,7 @@
SCM_DBG_EXPERIMENTAL = 1 << 11, /* developed but experimental features */
SCM_DBG_DEVEL = 1 << 12, /* under development */
SCM_DBG_COMPAT = 1 << 13, /* warns compatibility-sensitive code */
+ SCM_DBG_ENCODING = 1 << 14, /* multibyte handling */
SCM_DBG_OTHER = 1 << 30 /* all other messages */
};
@@ -565,9 +566,11 @@
#endif
/* encoding.c */
-int SigScm_default_encoding_strlen(const char *str);
-const char* SigScm_default_encoding_str_startpos(const char *str, int k);
-const char* SigScm_default_encoding_str_endpos(const char *str, int k);
+int Scm_mb_strlen(ScmMultibyteString mbs);
+int Scm_mb_bare_c_strlen(const char *str);
+ScmMultibyteString Scm_mb_substring(ScmMultibyteString str, int i, int len);
+#define Scm_mb_strref(str, i) (Scm_mb_substring((str), (i), 1))
+extern ScmMultibyteCharInfo (*Scm_mb_scan_char)(ScmMultibyteString mbs);
/* read.c */
ScmObj SigScm_Read(ScmObj port);
Modified: branches/r5rs/sigscheme/sigschemeinternal.h
===================================================================
--- branches/r5rs/sigscheme/sigschemeinternal.h 2005-10-17 22:27:54 UTC (rev 1857)
+++ branches/r5rs/sigscheme/sigschemeinternal.h 2005-10-18 06:45:18 UTC (rev 1858)
@@ -252,7 +252,7 @@
&& (NULLP(args) \
|| (ERR_OBJ("improper argument list terminator", (args)), 1)))
#define ASSERT_NO_MORE_ARG(args) \
- (NO_MORE_ARG(args) || (ERR("superfluous argument(s)", (args)), 1))
+ (NO_MORE_ARG(args) || (ERR_OBJ("superfluous argument(s)", (args)), 1))
#define ASSERT_PROPER_ARG_LIST(args) \
(ScmOp_c_length(args) >= 0 \
|| (ERR_OBJ("bad argument list", (args)), 1))
Modified: branches/r5rs/sigscheme/sigschemetype.h
===================================================================
--- branches/r5rs/sigscheme/sigschemetype.h 2005-10-17 22:27:54 UTC (rev 1857)
+++ branches/r5rs/sigscheme/sigschemetype.h 2005-10-18 06:45:18 UTC (rev 1858)
@@ -248,6 +248,79 @@
};
/*=======================================
+ Multibyte encoding support
+=======================================*/
+
+/* This type will actually contain some encoding-dependent enum value.
+ * It might as well be defined as mbstate_t if we're using libc. */
+typedef int ScmMultibyteState;
+
+/* Metadata of a multibyte character. These are usually allocated on
+ stack or register, so we'll make liberal use of space. */
+typedef struct {
+ const char *start;
+ int flag;
+ int size;
+
+#if SCM_USE_STATEFUL_ENCODING
+ /* Shift state at the *end* of the described character. */
+ ScmMultibyteState state;
+#endif
+} ScmMultibyteCharInfo;
+
+typedef struct {
+ const char *str;
+
+ /* Only the size is stored because ScmObj caches the length, and
+ * we'll have to traverse from the beginning all the time
+ * anyway. */
+ int size;
+#if SCM_USE_STATEFUL_ENCODING
+ ScmMultibyteState state;
+#endif
+} ScmMultibyteString;
+
+#define SCM_MBS_SET_STR(mbs, s) ((mbs).str = (s))
+#define SCM_MBS_GET_STR(mbs) ((mbs).str)
+#define SCM_MBS_SET_SIZE(mbs, siz) ((mbs).size = (siz))
+#define SCM_MBS_GET_SIZE(mbs) ((mbs).size)
+
+#define SCM_MBCINFO_SET_SIZE SCM_MBS_SET_SIZE
+#define SCM_MBCINFO_GET_SIZE SCM_MBS_GET_SIZE
+#define SCM_MBCINFO_CLEAR_STATE SCM_MBS_CLEAR_STATE
+#define SCM_MBCINFO_SET_STATE SCM_MBS_SET_STATE
+#define SCM_MBCINFO_GET_STATE SCM_MBS_GET_STATE
+#define SCM_MBCINFO_CLEAR_FLAG(inf) ((inf).flag = 0)
+#define SCM_MBCINFO_SET_ERROR(inf) ((inf).flag |= 1)
+#define SCM_MBCINFO_SET_INCOMPLETE(inf) ((inf).flag |= 2)
+#define SCM_MBCINFO_ERRORP(inf) ((inf).flag & 1)
+#define SCM_MBCINFO_INCOMPLETEP(inf) ((inf).flag & 2)
+#define SCM_MBCINFO_INIT(inf) (SCM_MBCINFO_SET_SIZE((inf), 0), \
+ SCM_MBCINFO_CLEAR_STATE(inf), \
+ SCM_MBCINFO_CLEAR_FLAG(inf))
+
+
+#if SCM_USE_STATEFUL_ENCODING
+#define SCM_MBS_GET_STATE(mbs) ((mbs).state)
+#define SCM_MBS_SET_STATE(mbs, stat) ((mbs).state = (stat))
+#define SCM_MBS_CLEAR_STATE(mbs) ((mbs).state = 0)
+#else
+#define SCM_MBS_GET_STATE(mbs) 0
+#define SCM_MBS_SET_STATE(mbs, stat) 0
+#define SCM_MBS_CLEAR_STATE(mbs) 0
+#endif
+#define SCM_MBS_INIT(mbs) (SCM_MBS_SET_STR((mbs), NULL), \
+ SCM_MBS_SET_SIZE((mbs), 0), \
+ SCM_MBS_CLEAR_STATE(mbs))
+#define SCM_MBS_SKIP_CHAR(mbs, inf) \
+ (SCM_MBS_SET_STR((mbs), SCM_MBS_GET_STR(mbs) + SCM_MBCINFO_GET_SIZE(inf)),\
+ SCM_MBS_SET_SIZE((mbs), \
+ SCM_MBS_GET_SIZE(mbs) - SCM_MBCINFO_GET_SIZE(inf)), \
+ SCM_MBS_SET_STATE((mbs), SCM_MBCINFO_GET_STATE(inf)))
+
+
+
+/*=======================================
Accessors For Scheme Objects
=======================================*/
/* ScmObj Global Attribute */
More information about the uim-commit
mailing list