[uim-commit] r2293 - in branches/r5rs/sigscheme: . test
yamaken at freedesktop.org
yamaken at freedesktop.org
Wed Nov 30 19:28:35 PST 2005
Author: yamaken
Date: 2005-11-30 19:28:26 -0800 (Wed, 30 Nov 2005)
New Revision: 2293
Modified:
branches/r5rs/sigscheme/TODO
branches/r5rs/sigscheme/config.h
branches/r5rs/sigscheme/io.c
branches/r5rs/sigscheme/read.c
branches/r5rs/sigscheme/test/test-string.scm
Log:
* This commit adds some SRFI-75 features
* sigscheme/config.h
- (SCM_USE_SRFI75): New macro
* sigscheme/read.c
- (read_sequence, read_unicode_sequence):
New static function
- (parse_unicode_sequence):
* New static function
* Port hexadecimal char parser from read_char()
* Fix invalid signed hexadecimal char acceptance such as #\x-0
- (read_char):
* Support Unicode sequence
* Move hexadecimal char support to parse_unicode_sequence() of
SRFI-75
- (read_string): Support Unicode sequence
* sigscheme/io.c
- (Scm_special_char_table): Add "\|" for SRFI-75
* sigscheme/test/test-string.scm
- Uncomment SRFI-75 tests
* sigscheme/TODO
- Update
Modified: branches/r5rs/sigscheme/TODO
===================================================================
--- branches/r5rs/sigscheme/TODO 2005-11-30 15:59:59 UTC (rev 2292)
+++ branches/r5rs/sigscheme/TODO 2005-12-01 03:28:26 UTC (rev 2293)
@@ -9,8 +9,12 @@
* Fix character and escape sequence related issues. grep Scm_special_char_table
to find the issues, and make all tests in test-char.scm and test-string.scm
passed
+ - Write tests for SRFI-75
* Autoconfiscate the SigScheme package (don't rely on uim's configure)
+ - Prepare replace functions (asprintf and so on)
+ - Introduce C99-independent stdint.h
+ http://autoconf-archive.cryp.to/ax_create_stdint_h.html
- [uim] configure sigscheme subdir by top-level configure like GCC
* Add tests for proper tail recursion with 'apply' and 'guard' to
@@ -32,8 +36,6 @@
- Evaluate ces API of Gauche
- Consider dynamic environment
-* Support SRFI-75 character literals such as #\u3042 and #\U0010FFFF
-
* Add GB18030 to encoding.c
* Add Big5 to encoding.c
Modified: branches/r5rs/sigscheme/config.h
===================================================================
--- branches/r5rs/sigscheme/config.h 2005-11-30 15:59:59 UTC (rev 2292)
+++ branches/r5rs/sigscheme/config.h 2005-12-01 03:28:26 UTC (rev 2293)
@@ -50,6 +50,7 @@
#define SCM_USE_SRFI38 1 /* use SRFI-38 'write-with-shared-structure' */
#define SCM_USE_SRFI60 1 /* use SRFI-60 integers as bits */
#define SCM_USE_SRFI75_NAMED_CHARS 1 /* use named characters of SRFI-75 R6RS unicode data */
+#define SCM_USE_SRFI75 1 /* use SRFI-75 R6RS unicode data */
#define SCM_COMPAT_SIOD 1 /* use SIOD compatible features */
#define SCM_COMPAT_SIOD_BUGS 1 /* emulate the buggy behaviors of SIOD */
@@ -130,6 +131,10 @@
#define SCM_USE_SRFI23 1
#endif /* SCM_USE_SRFI34 */
+#if SCM_USE_SRFI75
+#define SCM_USE_SRFI75_NAMED_CHARS 1
+#endif
+
#if SCM_DEBUG
#undef SCM_VOLATILE_OUTPUT
#define SCM_VOLATILE_OUTPUT 1
Modified: branches/r5rs/sigscheme/io.c
===================================================================
--- branches/r5rs/sigscheme/io.c 2005-11-30 15:59:59 UTC (rev 2292)
+++ branches/r5rs/sigscheme/io.c 2005-12-01 03:28:26 UTC (rev 2293)
@@ -82,6 +82,9 @@
/* to avoid portability problem, we should not support #\Space and so on */
{' ', " ", "Space"},
#endif
+#if SCM_USE_SRFI75
+ {'|', "\\|", "|"},
+#endif
/* control characters */
{'\n', "\\n", "newline"}, /* 10, R5RS */
Modified: branches/r5rs/sigscheme/read.c
===================================================================
--- branches/r5rs/sigscheme/read.c 2005-11-30 15:59:59 UTC (rev 2292)
+++ branches/r5rs/sigscheme/read.c 2005-12-01 03:28:26 UTC (rev 2293)
@@ -97,8 +97,13 @@
File Local Function Declarations
=======================================*/
static int skip_comment_and_space(ScmObj port);
+static void read_sequence(ScmObj port, char *buf, int len);
static char* read_word(ScmObj port);
static char* read_char_sequence(ScmObj port);
+#if SCM_USE_SRFI75
+static int parse_unicode_sequence(const char *seq);
+static int read_unicode_sequence(ScmObj port, char prefix);
+#endif
static ScmObj read_sexpression(ScmObj port);
static ScmObj read_list(ScmObj port, int closeParen);
@@ -168,6 +173,22 @@
}
}
+static void read_sequence(ScmObj port, char *buf, int len)
+{
+ int c;
+ char *p;
+
+ for (p = buf; p < &buf[len]; p++) {
+ c = SCM_PORT_GET_CHAR(port);
+ if (c == EOF)
+ ERR("unexpected EOF");
+ if (!isascii(c))
+ ERR("unexpected non-ASCII char");
+ *p = c;
+ }
+ buf[len] = '\0';
+}
+
static ScmObj read_sexpression(ScmObj port)
{
int c = 0;
@@ -333,12 +354,72 @@
}
}
+#if SCM_USE_SRFI75
+static int parse_unicode_sequence(const char *seq)
+{
+ int c;
+ size_t len;
+ char *first_nondigit;
+
+ len = strlen(seq);
+
+ /* reject ordinary char literal and invalid signed hexadecimal */
+ if (len < 3 || !isxdigit(seq[1]))
+ return -1;
+
+ switch (seq[0]) {
+ case 'x':
+ /* #\x<x><x> : <x> = a hexadecimal digit (ignore case) */
+ if (len != 3)
+ ERR("invalid hexadecimal character sequence. conform \\x<x><x>");
+ break;
+
+ case 'u':
+ /* #\u<x><x><x><x> : Unicode char of BMP */
+ if (len != 5 || (0xd800 <= c && c <= 0xdfff))
+ ERR("invalid Unicode sequence. conform \\u<x><x><x><x>");
+ break;
+
+ case 'U':
+ /* #\U<x><x><x><x><x><x><x><x> : Unicode char of BMP or SMP */
+ if (len != 8 || (0xd800 <= c && c <= 0xdfff) || 0x10ffff < c)
+ ERR("invalid Unicode sequence. conform \\U<x><x><x><x><x><x><x><x>");
+ break;
+
+ default:
+ return -1;
+ }
+ c = strtol(&seq[1], &first_nondigit, 16);
+ return (*first_nondigit) ? -1 : c;
+}
+
+static int read_unicode_sequence(ScmObj port, char prefix)
+{
+ int len;
+ char seq[sizeof("U0010ffff")];
+
+ switch (prefix) {
+ case 'x': len = 2; break;
+ case 'u': len = 4; break;
+ case 'U': len = 8; break;
+ default:
+ /* FIXME: add fatal error handling */
+ break;
+ }
+ seq[0] = prefix;
+ read_sequence(port, &seq[1], len);
+ return parse_unicode_sequence(seq);
+}
+#endif /* SCM_USE_SRFI75 */
+
static ScmObj read_char(ScmObj port)
{
int c;
+#if SCM_USE_SRFI75
+ int unicode;
+#endif
char *ch;
const ScmSpecialCharInfo *info;
- char *first_nondigit = NULL;
/* TODO: reorganize with read_char_sequence() */
/* non-ascii char (multibyte-ready) */
@@ -358,20 +439,19 @@
}
ch = read_char_sequence(port);
+ if (!ch)
+ ERR("memory exhausted");
CDBG((SCM_DBG_PARSER, "read_char : ch = %s", ch));
- /* check #\x<x><x> style character where <x> is a hexadecimal
- * digit and the sequence of two <x>s forms a hexadecimal
- * number between 0 and #xFF(defined in R6RS) */
- if (ch && ch[0] == 'x' && 1 < strlen(ch)) {
- if (strlen(ch) != 3)
- SigScm_Error("invalid hexadecimal character form. should be #\\x<x><x>\n");
- c = strtol(ch + 1, &first_nondigit, 16);
- if (*first_nondigit)
- SigScm_Error("invalid hexadecimal character form. should be #\\x<x><x>\n");
- } else {
- /* check special sequence */
+#if SCM_USE_SRFI75
+ unicode = parse_unicode_sequence(ch);
+ if (0 <= unicode) {
+ c = unicode;
+ } else
+#endif
+ {
+ /* named chars */
for (info = Scm_special_char_table; info->esc_seq; info++) {
if (strcmp(ch, info->lex_rep) == 0) {
c = info->code;
@@ -416,16 +496,25 @@
return obj;
case '\\':
- /*
- * (R5RS) 6.3.5 String
- * A double quote can be written inside a string only by
- * escaping it with a backslash (\).
- */
c = SCM_PORT_GET_CHAR(port);
- for (info = Scm_special_char_table; info->esc_seq; info++) {
- if (strlen(info->esc_seq) == 2 && c == info->esc_seq[1]) {
- *p++ = info->code;
- goto found;
+#if SCM_USE_SRFI75
+ if (strchr("xuU", c)) {
+ c = read_unicode_sequence(port, c);
+ /* FIXME: check Unicode capability of Scm_current_char_codec */
+ p = SCM_CHARCODEC_INT2STR(Scm_current_char_codec,
+ p, c, SCM_MB_STATELESS);
+ if (!p)
+ ERR("invalid Unicode sequence in string: 0x%x", c);
+ goto found;
+ } else
+#endif
+ {
+ /* escape sequences */
+ for (info = Scm_special_char_table; info->esc_seq; info++) {
+ if (strlen(info->esc_seq) == 2 && c == info->esc_seq[1]) {
+ *p++ = info->code;
+ goto found;
+ }
}
}
ERR("invalid escape sequence in string: \\%c", c);
Modified: branches/r5rs/sigscheme/test/test-string.scm
===================================================================
--- branches/r5rs/sigscheme/test/test-string.scm 2005-11-30 15:59:59 UTC (rev 2292)
+++ branches/r5rs/sigscheme/test/test-string.scm 2005-12-01 03:28:26 UTC (rev 2293)
@@ -198,14 +198,9 @@
(assert-equal? "R5RS escape sequence" '(#\newline) (string->list "\n")) ;; 110
;; R6RS(SRFI-75) compliant
-;
-; 2005/11/23 Kazuki Ohta <mover at hct.zaq.ne.jp>
-; temporally commented out, because we cannot handle "\x<x><x>" style escape
-; sequence yet.
-;
-;(assert-equal? "R6RS escape sequence" (integer->string 0) "\x00") ;; 0
-;(assert-equal? "R6RS escape sequence" (list->string '(#\nul)) "\x00") ;; 0
-;(assert-equal? "R6RS escape sequence" '(#\nul) (string->list "\x00")) ;; 0
+(assert-equal? "R6RS escape sequence" (integer->string 0) "\x00") ;; 0
+(assert-equal? "R6RS escape sequence" (list->string '(#\nul)) "\x00") ;; 0
+(assert-equal? "R6RS escape sequence" '(#\nul) (string->list "\x00")) ;; 0
(assert-equal? "R6RS escape sequence" (integer->string 7) "\a") ;; 97
(assert-equal? "R6RS escape sequence" (list->string '(#\alarm)) "\a") ;; 97
(assert-equal? "R6RS escape sequence" '(#\alarm) (string->list "\a")) ;; 97
@@ -224,12 +219,8 @@
(assert-equal? "R6RS escape sequence" (integer->string 11) "\v") ;; 118
(assert-equal? "R6RS escape sequence" (list->string '(#\vtab)) "\v") ;; 118
(assert-equal? "R6RS escape sequence" '(#\vtab) (string->list "\v")) ;; 118
+(assert-equal? "R6RS escape sequence" (integer->string 124) "\|") ;; 124
-; 2005/11/22 Kazuki Ohta <mover at hct.zaq.ne.jp>
-; temporally commented out
-;
-; (assert-equal? "R6RS escape sequence" (integer->string 124) "\|") ;; 124
-
;; All these conventional escape sequences should cause parse error as defined
;; in SRFI-75: "Any other character in a string after a backslash is an
;; error".
More information about the uim-commit
mailing list