[uim-commit] r2293 - in branches/r5rs/sigscheme: . test

Wed Nov 30 19:28:35 PST 2005

Author: yamaken
Date: 2005-11-30 19:28:26 -0800 (Wed, 30 Nov 2005)
New Revision: 2293

Modified:
   branches/r5rs/sigscheme/TODO
   branches/r5rs/sigscheme/config.h
   branches/r5rs/sigscheme/io.c
   branches/r5rs/sigscheme/read.c
   branches/r5rs/sigscheme/test/test-string.scm
Log:
* This commit adds some SRFI-75 features

* sigscheme/config.h
  - (SCM_USE_SRFI75): New macro
* sigscheme/read.c
  - (read_sequence, read_unicode_sequence):
    New static function
  - (parse_unicode_sequence):
    * New static function
    * Port hexadecimal char parser from read_char()
    * Fix invalid signed hexadecimal char acceptance such as #\x-0
  - (read_char):
    * Support Unicode sequence
    * Move hexadecimal char support to parse_unicode_sequence() of
      SRFI-75
  - (read_string): Support Unicode sequence
* sigscheme/io.c
  - (Scm_special_char_table): Add "\|" for SRFI-75
* sigscheme/test/test-string.scm
  - Uncomment SRFI-75 tests
* sigscheme/TODO
  - Update


Modified: branches/r5rs/sigscheme/TODO
===================================================================

--- branches/r5rs/sigscheme/TODO	2005-11-30 15:59:59 UTC (rev 2292)
+++ branches/r5rs/sigscheme/TODO	2005-12-01 03:28:26 UTC (rev 2293)
@@ -9,8 +9,12 @@
 * Fix character and escape sequence related issues. grep Scm_special_char_table
   to find the issues, and make all tests in test-char.scm and test-string.scm
   passed
+  - Write tests for SRFI-75
 
 * Autoconfiscate the SigScheme package (don't rely on uim's configure)
+  - Prepare replace functions (asprintf and so on)
+  - Introduce C99-independent stdint.h
+    http://autoconf-archive.cryp.to/ax_create_stdint_h.html
   - [uim] configure sigscheme subdir by top-level configure like GCC
 
 * Add tests for proper tail recursion with 'apply' and 'guard' to
@@ -32,8 +36,6 @@
   - Evaluate ces API of Gauche
   - Consider dynamic environment
 
-* Support SRFI-75 character literals such as #\u3042 and #\U0010FFFF
-
 * Add GB18030 to encoding.c
 
 * Add Big5 to encoding.c

Modified: branches/r5rs/sigscheme/config.h
===================================================================
--- branches/r5rs/sigscheme/config.h	2005-11-30 15:59:59 UTC (rev 2292)
+++ branches/r5rs/sigscheme/config.h	2005-12-01 03:28:26 UTC (rev 2293)
@@ -50,6 +50,7 @@
 #define SCM_USE_SRFI38          1  /* use SRFI-38 'write-with-shared-structure' */
 #define SCM_USE_SRFI60          1  /* use SRFI-60 integers as bits */
 #define SCM_USE_SRFI75_NAMED_CHARS 1  /* use named characters of SRFI-75 R6RS unicode data */
+#define SCM_USE_SRFI75          1  /* use SRFI-75 R6RS unicode data */
 
 #define SCM_COMPAT_SIOD         1  /* use SIOD compatible features */
 #define SCM_COMPAT_SIOD_BUGS    1  /* emulate the buggy behaviors of SIOD */
@@ -130,6 +131,10 @@
 #define SCM_USE_SRFI23          1
 #endif /* SCM_USE_SRFI34 */
 
+#if SCM_USE_SRFI75
+#define SCM_USE_SRFI75_NAMED_CHARS 1
+#endif
+
 #if SCM_DEBUG
 #undef SCM_VOLATILE_OUTPUT
 #define SCM_VOLATILE_OUTPUT     1

Modified: branches/r5rs/sigscheme/io.c
===================================================================
--- branches/r5rs/sigscheme/io.c	2005-11-30 15:59:59 UTC (rev 2292)
+++ branches/r5rs/sigscheme/io.c	2005-12-01 03:28:26 UTC (rev 2293)
@@ -82,6 +82,9 @@
     /* to avoid portability problem, we should not support #\Space and so on */
     {' ',    " ",     "Space"},
 #endif
+#if SCM_USE_SRFI75
+    {'|',    "\\|",   "|"},
+#endif
 
     /* control characters */
     {'\n',   "\\n",   "newline"},    /*  10, R5RS */

Modified: branches/r5rs/sigscheme/read.c
===================================================================
--- branches/r5rs/sigscheme/read.c	2005-11-30 15:59:59 UTC (rev 2292)
+++ branches/r5rs/sigscheme/read.c	2005-12-01 03:28:26 UTC (rev 2293)
@@ -97,8 +97,13 @@
   File Local Function Declarations
 =======================================*/
 static int    skip_comment_and_space(ScmObj port);
+static void   read_sequence(ScmObj port, char *buf, int len);
 static char*  read_word(ScmObj port);
 static char*  read_char_sequence(ScmObj port);
+#if SCM_USE_SRFI75
+static int    parse_unicode_sequence(const char *seq);
+static int    read_unicode_sequence(ScmObj port, char prefix);
+#endif
 
 static ScmObj read_sexpression(ScmObj port);
 static ScmObj read_list(ScmObj port, int closeParen);
@@ -168,6 +173,22 @@
     }
 }
 
+static void read_sequence(ScmObj port, char *buf, int len)
+{
+    int c;
+    char *p;
+
+    for (p = buf; p < &buf[len]; p++) {
+        c = SCM_PORT_GET_CHAR(port);
+        if (c == EOF)
+            ERR("unexpected EOF");
+        if (!isascii(c))
+            ERR("unexpected non-ASCII char");
+        *p = c;
+    }
+    buf[len] = '\0';
+}
+
 static ScmObj read_sexpression(ScmObj port)
 {
     int c  = 0;
@@ -333,12 +354,72 @@
     }
 }
 
+#if SCM_USE_SRFI75
+static int parse_unicode_sequence(const char *seq)
+{
+    int c;
+    size_t len;
+    char *first_nondigit;
+
+    len = strlen(seq);
+
+    /* reject ordinary char literal and invalid signed hexadecimal */
+    if (len < 3 || !isxdigit(seq[1]))
+        return -1;
+
+    switch (seq[0]) {
+    case 'x':
+        /* #\x<x><x> : <x> = a hexadecimal digit (ignore case) */
+        if (len != 3)
+            ERR("invalid hexadecimal character sequence. conform \\x<x><x>");
+        break;
+
+    case 'u':
+        /* #\u<x><x><x><x> : Unicode char of BMP */
+        if (len != 5 || (0xd800 <= c && c <= 0xdfff))
+            ERR("invalid Unicode sequence. conform \\u<x><x><x><x>");
+        break;
+
+    case 'U':
+        /* #\U<x><x><x><x><x><x><x><x> : Unicode char of BMP or SMP */
+        if (len != 8 || (0xd800 <= c && c <= 0xdfff) || 0x10ffff < c)
+            ERR("invalid Unicode sequence. conform \\U<x><x><x><x><x><x><x><x>");
+        break;
+
+    default:
+        return -1;
+    }
+    c = strtol(&seq[1], &first_nondigit, 16);
+    return (*first_nondigit) ? -1 : c;
+}
+
+static int read_unicode_sequence(ScmObj port, char prefix)
+{
+    int len;
+    char seq[sizeof("U0010ffff")];
+
+    switch (prefix) {
+    case 'x': len = 2; break;
+    case 'u': len = 4; break;
+    case 'U': len = 8; break;
+    default:
+        /* FIXME: add fatal error handling */
+        break;
+    }
+    seq[0] = prefix;
+    read_sequence(port, &seq[1], len);
+    return parse_unicode_sequence(seq);
+}
+#endif /* SCM_USE_SRFI75 */
+
 static ScmObj read_char(ScmObj port)
 {
     int c;
+#if SCM_USE_SRFI75
+    int unicode;
+#endif
     char *ch;
     const ScmSpecialCharInfo *info;
-    char *first_nondigit = NULL;
 
     /* TODO: reorganize with read_char_sequence() */
     /* non-ascii char (multibyte-ready) */
@@ -358,20 +439,19 @@
     }
 
     ch = read_char_sequence(port);
+    if (!ch)
+        ERR("memory exhausted");
 
     CDBG((SCM_DBG_PARSER, "read_char : ch = %s", ch));
 
-    /* check #\x<x><x> style character where <x> is a hexadecimal
-     * digit and the sequence of two <x>s forms a hexadecimal
-     * number between 0 and #xFF(defined in R6RS) */
-    if (ch && ch[0] == 'x' && 1 < strlen(ch)) {
-        if (strlen(ch) != 3)
-            SigScm_Error("invalid hexadecimal character form. should be #\\x<x><x>\n");
-        c = strtol(ch + 1, &first_nondigit, 16);
-        if (*first_nondigit)
-            SigScm_Error("invalid hexadecimal character form. should be #\\x<x><x>\n");
-    } else {
-        /* check special sequence */
+#if SCM_USE_SRFI75
+    unicode = parse_unicode_sequence(ch);
+    if (0 <= unicode) {
+        c = unicode;
+    } else
+#endif
+    {
+        /* named chars */
         for (info = Scm_special_char_table; info->esc_seq; info++) {
             if (strcmp(ch, info->lex_rep) == 0) {
                 c = info->code;
@@ -416,16 +496,25 @@
             return obj;
 
         case '\\':
-            /*
-             * (R5RS) 6.3.5 String
-             * A double quote can be written inside a string only by
-             * escaping it with a backslash (\).
-             */
             c = SCM_PORT_GET_CHAR(port);
-            for (info = Scm_special_char_table; info->esc_seq; info++) {
-                if (strlen(info->esc_seq) == 2 && c == info->esc_seq[1]) {
-                    *p++ = info->code;
-                    goto found;
+#if SCM_USE_SRFI75
+            if (strchr("xuU", c)) {
+                c = read_unicode_sequence(port, c);
+                /* FIXME: check Unicode capability of Scm_current_char_codec */
+                p = SCM_CHARCODEC_INT2STR(Scm_current_char_codec,
+                                          p, c, SCM_MB_STATELESS);
+                if (!p)
+                    ERR("invalid Unicode sequence in string: 0x%x", c);
+                goto found;
+            } else
+#endif
+            {
+                /* escape sequences */
+                for (info = Scm_special_char_table; info->esc_seq; info++) {
+                    if (strlen(info->esc_seq) == 2 && c == info->esc_seq[1]) {
+                        *p++ = info->code;
+                        goto found;
+                    }
                 }
             }
             ERR("invalid escape sequence in string: \\%c", c);

Modified: branches/r5rs/sigscheme/test/test-string.scm
===================================================================
--- branches/r5rs/sigscheme/test/test-string.scm	2005-11-30 15:59:59 UTC (rev 2292)
+++ branches/r5rs/sigscheme/test/test-string.scm	2005-12-01 03:28:26 UTC (rev 2293)
@@ -198,14 +198,9 @@
 (assert-equal? "R5RS escape sequence" '(#\newline) (string->list "\n"))  ;; 110
 
 ;; R6RS(SRFI-75) compliant
-;
-; 2005/11/23 Kazuki Ohta <mover at hct.zaq.ne.jp>
-; temporally commented out, because we cannot handle "\x<x><x>" style escape
-; sequence yet.
-;
-;(assert-equal? "R6RS escape sequence" (integer->string 0)      "\x00")  ;; 0
-;(assert-equal? "R6RS escape sequence" (list->string '(#\nul))  "\x00")  ;; 0
-;(assert-equal? "R6RS escape sequence" '(#\nul)  (string->list "\x00"))  ;; 0
+(assert-equal? "R6RS escape sequence" (integer->string 0)      "\x00")  ;; 0
+(assert-equal? "R6RS escape sequence" (list->string '(#\nul))  "\x00")  ;; 0
+(assert-equal? "R6RS escape sequence" '(#\nul)  (string->list "\x00"))  ;; 0
 (assert-equal? "R6RS escape sequence" (integer->string 7)        "\a")  ;; 97
 (assert-equal? "R6RS escape sequence" (list->string '(#\alarm))  "\a")  ;; 97
 (assert-equal? "R6RS escape sequence" '(#\alarm)  (string->list "\a"))  ;; 97
@@ -224,12 +219,8 @@
 (assert-equal? "R6RS escape sequence" (integer->string 11)       "\v")  ;; 118
 (assert-equal? "R6RS escape sequence" (list->string '(#\vtab))   "\v")  ;; 118
 (assert-equal? "R6RS escape sequence" '(#\vtab)   (string->list "\v"))  ;; 118
+(assert-equal? "R6RS escape sequence" (integer->string 124)      "\|")  ;; 124
 
-; 2005/11/22 Kazuki Ohta <mover at hct.zaq.ne.jp>
-; temporally commented out
-;
-; (assert-equal? "R6RS escape sequence" (integer->string 124)      "\|")  ;; 124
-
 ;; All these conventional escape sequences should cause parse error as defined
 ;; in SRFI-75: "Any other character in a string after a backslash is an
 ;; error".