[uim-commit] r3083 - branches/r5rs/sigscheme/src

yamaken at freedesktop.org yamaken at freedesktop.org
Thu Feb 2 08:06:12 PST 2006


Author: yamaken
Date: 2006-02-02 08:05:56 -0800 (Thu, 02 Feb 2006)
New Revision: 3083

Modified:
   branches/r5rs/sigscheme/src/char.c
   branches/r5rs/sigscheme/src/read.c
   branches/r5rs/sigscheme/src/sigschemeinternal.h
Log:
* sigscheme/src/char.c
  - (scm_char_class_table): Comment out SCM_CH_TOKEN_INITIAL from '.'
* sigscheme/src/sigschemeinternal.h
  - (enum ScmCharClass):
    * Modify comment about SCM_CH_TOKEN_INITIAL
    * Add new member 'SCM_CH_DELIMITER'
  - (ICHAR_ASCIIP, ICHAR_CLASS): Make EOF-acceptable
  - (ICHAR_HEXA_NUMERICP): New macro
* sigscheme/src/read.c
  - Exclude ctype.h
  - (WHITESPACE_CHARS, DELIMITER_CHARS): Removed
  - (skip_comment_and_space, read_sequence, read_sexpression,
    read_list, read_list, parse_unicode_sequence, read_symbol,
    read_number_or_symbol, read_number): Simplify with new ichar
    macros
  - (read_token):
    * Ditto
    * Change type for arg 'delim' to enum ScmCharClass


Modified: branches/r5rs/sigscheme/src/char.c
===================================================================
--- branches/r5rs/sigscheme/src/char.c	2006-02-02 16:00:59 UTC (rev 3082)
+++ branches/r5rs/sigscheme/src/char.c	2006-02-02 16:05:56 UTC (rev 3083)
@@ -180,7 +180,7 @@
     SCM_CH_SPECIAL_SUBSEQUENT, /*  43  +         */
     SCM_CH_TOKEN_INITIAL,      /*  44  ,         */
     SCM_CH_SPECIAL_SUBSEQUENT, /*  45  -         */
-    SCM_CH_SPECIAL_SUBSEQUENT | SCM_CH_TOKEN_INITIAL, /*  46  .         */
+    SCM_CH_SPECIAL_SUBSEQUENT /* | SCM_CH_TOKEN_INITIAL */, /*  46  .        */
     SCM_CH_SPECIAL_INITIAL,    /*  47  /         */
     SCM_CH_DIGIT,              /*  48  0         */
     SCM_CH_DIGIT,              /*  49  1         */

Modified: branches/r5rs/sigscheme/src/read.c
===================================================================
--- branches/r5rs/sigscheme/src/read.c	2006-02-02 16:00:59 UTC (rev 3082)
+++ branches/r5rs/sigscheme/src/read.c	2006-02-02 16:05:56 UTC (rev 3083)
@@ -33,13 +33,14 @@
  *  ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ===========================================================================*/
 
+/* TODO: replace with character class sequence expression-based tokenizer */
+
 #include "config.h"
 
 /*=======================================
   System Include
 =======================================*/
 #include <limits.h>
-#include <ctype.h>
 #include <stdlib.h>
 #include <string.h>
 
@@ -71,9 +72,6 @@
 /* #b-010101... */
 #define INT_LITERAL_LEN_MAX (sizeof((char)'-') + SCM_INT_BITS)
 
-#define WHITESPACE_CHARS " \t\n\r\v\f"
-#define DELIMITER_CHARS  "()\";" WHITESPACE_CHARS
-
 #define DISCARD_LOOKAHEAD(port) (scm_port_get_char(port))
 
 /*=======================================
@@ -86,7 +84,7 @@
 static scm_ichar_t skip_comment_and_space(ScmObj port);
 static void   read_sequence(ScmObj port, char *buf, int len);
 static size_t read_token(ScmObj port, int *err,
-                         char *buf, size_t buf_size, const char *delim);
+                         char *buf, size_t buf_size, enum ScmCharClass delim);
 
 static ScmObj read_sexpression(ScmObj port);
 static ScmObj read_list(ScmObj port, scm_ichar_t closeParen);
@@ -149,7 +147,7 @@
         case LEX_ST_NORMAL:
             if (c == ';')
                 state = LEX_ST_COMMENT;
-            else if (!isascii(c) || !isspace(c) || c == EOF)
+            else if (!ICHAR_WHITESPACEP(c) || c == EOF)
                 return c;  /* peeked */
             break;
 
@@ -175,7 +173,7 @@
         c = scm_port_get_char(port);
         if (c == EOF)
             ERR("unexpected EOF");
-        if (!isascii(c))
+        if (!ICHAR_ASCIIP(c))
             ERR("unexpected non-ASCII char");
         *p = c;
     }
@@ -183,10 +181,11 @@
 }
 
 static size_t
-read_token(ScmObj port,
-           int *err, char *buf, size_t buf_size, const char *delim)
+read_token(ScmObj port, int *err,
+           char *buf, size_t buf_size, enum ScmCharClass delim)
 {
     ScmCharCodec *codec;
+    enum ScmCharClass ch_class;
     scm_ichar_t c;
     size_t len;
     char *p;
@@ -194,25 +193,20 @@
 
     for (p = buf;;) {
         c = scm_port_peek_char(port);
+        ch_class = ICHAR_CLASS(c);
         CDBG((SCM_DBG_PARSER, "c = %c", (int)c));
 
         if (p == buf) {
             if (c == EOF)
                 ERR("unexpected EOF at a token");
         } else {
-            if (strchr(delim, c) || c == EOF) {
+            if (ch_class & delim || c == EOF) {
                 *err = OK;
                 break;
             }
         }
 
-        if (isascii(c)) {
-            if (p == &buf[buf_size - sizeof("")]) {
-                *err = TOKEN_BUF_EXCEEDED;
-                break;
-            }
-            *p++ = c;
-        } else {
+        if (ch_class & SCM_CH_NONASCII) {
 #if SCM_USE_SRFI75
             if (&buf[buf_size] <= p + SCM_MB_MAX_LEN) {
                 *err = TOKEN_BUF_EXCEEDED;
@@ -228,6 +222,12 @@
 #else
             ERR("non-ASCII char in token: 0x%x", (int)c);
 #endif
+        } else {
+            if (p == &buf[buf_size - sizeof("")]) {
+                *err = TOKEN_BUF_EXCEEDED;
+                break;
+            }
+            *p++ = c;
         }
         DISCARD_LOOKAHEAD(port);
     }
@@ -241,6 +241,7 @@
 read_sexpression(ScmObj port)
 {
     ScmObj ret;
+    enum ScmCharClass ch_class;
     scm_ichar_t c;
     DECLARE_INTERNAL_FUNCTION("read");
 
@@ -251,22 +252,26 @@
 
         CDBG((SCM_DBG_PARSER, "read_sexpression c = %c", (int)c));
 
+        ch_class = ICHAR_CLASS(c);
+        if (ch_class & (SCM_CH_INITIAL | SCM_CH_NONASCII))
+            return read_symbol(port);
+
+        if (ch_class & (SCM_CH_DIGIT | SCM_CH_PECULIAR_IDENTIFIER_CAND))
+            return read_number_or_symbol(port);
+
         /* case labels are ordered by appearance rate and penalty cost */
+        DISCARD_LOOKAHEAD(port);
         switch (c) {
         case '(':
-            DISCARD_LOOKAHEAD(port);
             return read_list(port, ')');
 
         case '\"':
-            DISCARD_LOOKAHEAD(port);
             return read_string(port);
 
         case '\'':
-            DISCARD_LOOKAHEAD(port);
             return read_quote(port, SYM_QUOTE);
 
         case '#':
-            DISCARD_LOOKAHEAD(port);
             c = scm_port_get_char(port);
             switch (c) {
             case 't':
@@ -291,11 +296,9 @@
             break;
 
         case '`':
-            DISCARD_LOOKAHEAD(port);
             return read_quote(port, SYM_QUASIQUOTE);
 
         case ',':
-            DISCARD_LOOKAHEAD(port);
             c = scm_port_peek_char(port);
             switch (c) {
             case EOF:
@@ -310,12 +313,6 @@
                 return read_quote(port, SYM_UNQUOTE);
             }
 
-        case '.': case '+': case '-':
-        case '1': case '2': case '3': case '4': case '5':
-        case '6': case '7': case '8': case '9': case '0':
-        case '@':
-            return read_number_or_symbol(port);
-
         case ')':
             ERR("invalid close parenthesis");
             /* NOTREACHED */
@@ -324,7 +321,7 @@
             return SCM_EOF;
 
         default:
-            return read_symbol(port);
+            SCM_ASSERT(scm_false);
         }
     }
 }
@@ -374,7 +371,7 @@
              * '...' and numbers in R5RS (See "7.1.1 Lexical structure"), fixed
              * size buffer can safely buffer them.
              */
-            read_token(port, &err, dot_buf, sizeof(dot_buf), DELIMITER_CHARS);
+            read_token(port, &err, dot_buf, sizeof(dot_buf), SCM_CH_DELIMITER);
 
             if (dot_buf[1] == '\0') {
 #if !SCM_STRICT_R5RS
@@ -386,7 +383,7 @@
                  * require explicit whitespace around the dot.
                  */
                 c = scm_port_peek_char(port);
-                if (!strchr(WHITESPACE_CHARS, c))
+                if (!ICHAR_WHITESPACEP(c))
                     ERR("implicit dot delimitation is disabled to avoid compatibility problem");
 #endif
                 if (NULLP(lst))
@@ -420,7 +417,7 @@
     DECLARE_INTERNAL_FUNCTION("read");
 
     /* reject ordinary char literal and invalid signed hexadecimal */
-    if (len < 3 || !isxdigit(seq[1]))
+    if (len < 3 || !ICHAR_HEXA_NUMERICP(seq[1]))
         return -1;
 
     c = strtol(&seq[1], &end, 16);
@@ -492,15 +489,15 @@
     /* plain char (multibyte-ready) */
     c = scm_port_get_char(port);
     next = scm_port_peek_char(port);
-    if (strchr(DELIMITER_CHARS, next) || next == EOF)
+    if (ICHAR_ASCII_CLASS(next) & SCM_CH_DELIMITER || next == EOF)
         return MAKE_CHAR(c);
 #if SCM_USE_SRFI75
-    else if (!isascii(c))
+    else if (!ICHAR_ASCIIP(c))
         ERR("invalid character literal");
 #endif
 
     buf[0] = c;
-    len = read_token(port, &err, &buf[1], sizeof(buf) - 1, DELIMITER_CHARS);
+    len = read_token(port, &err, &buf[1], sizeof(buf) - 1, SCM_CH_DELIMITER);
     if (err == TOKEN_BUF_EXCEEDED)
         ERR("invalid character literal");
 
@@ -635,7 +632,7 @@
         tail_len = read_token(port, &err,
                               &LBUF_BUF(lbuf)[offset],
                               LBUF_SIZE(lbuf) - offset,
-                              DELIMITER_CHARS);
+                              SCM_CH_DELIMITER);
         if (err != TOKEN_BUF_EXCEEDED)
             break;
         offset += tail_len;
@@ -660,42 +657,43 @@
     CDBG((SCM_DBG_PARSER, "read"));
 
     c = scm_port_peek_char(port);
+    SCM_ASSERT(ICHAR_ASCII_CLASS(c)
+               & (SCM_CH_DIGIT | SCM_CH_PECULIAR_IDENTIFIER_CAND));
 
-    if (isascii(c)) {
-        if (isdigit(c))
-            return read_number(port, 'd');
+    if (ICHAR_NUMERICP(c))
+        return read_number(port, 'd');
 
-        if (c == '+' || c == '-') {
-            len = read_token(port, &err, buf, sizeof(buf), DELIMITER_CHARS);
-            if (err == TOKEN_BUF_EXCEEDED)
-                ERR("invalid number literal");
+    if (c == '+' || c == '-') {
+        len = read_token(port, &err, buf, sizeof(buf), SCM_CH_DELIMITER);
+        if (err == TOKEN_BUF_EXCEEDED)
+            ERR("invalid number literal");
 
             
-            if (!buf[1]                           /* '+' or '-' */
+        if (!buf[1]                           /* '+' or '-' */
 #if !SCM_STRICT_R5RS
-                || (c == '-' && isalpha(buf[1]))  /* '-sym' */
+            /* FIXME: Obsolete with SRFI-75 style '|-sym| */
+            || (c == '-' && ICHAR_ALPHABETICP(buf[1]))  /* '-sym' */
 #endif
-                )
-            {
-                return scm_intern(buf);
-            }
-
-            return parse_number(port, buf, sizeof(buf), 'd');
+            )
+        {
+            return scm_intern(buf);
         }
 
-        if (c == '.') {
-            read_token(port, &err, buf, sizeof(buf), DELIMITER_CHARS);
-            if (strcmp(buf, "...") == 0)
-                return scm_intern(buf);
-            /* TODO: support numeric expressions when the numeric tower is
-               implemented */
-            ERR("invalid identifier: %s", buf);
-        }
+        return parse_number(port, buf, sizeof(buf), 'd');
+    }
 
-        if (c == '@')
-            ERR("invalid identifier: %s", buf);
+    if (c == '.') {
+        read_token(port, &err, buf, sizeof(buf), SCM_CH_DELIMITER);
+        if (strcmp(buf, "...") == 0)
+            return scm_intern(buf);
+        /* TODO: support numeric expressions when the numeric tower is
+           implemented */
+        ERR("invalid identifier: %s", buf);
     }
 
+    if (c == '@')
+        ERR("invalid identifier: %s", buf);
+
     return read_symbol(port);
 }
 
@@ -735,7 +733,7 @@
     char buf[INT_LITERAL_LEN_MAX + sizeof("")];
     DECLARE_INTERNAL_FUNCTION("read");
 
-    len = read_token(port, &err, buf, sizeof(buf), DELIMITER_CHARS);
+    len = read_token(port, &err, buf, sizeof(buf), SCM_CH_DELIMITER);
     if (err == TOKEN_BUF_EXCEEDED)
         ERR("invalid number literal");
 

Modified: branches/r5rs/sigscheme/src/sigschemeinternal.h
===================================================================
--- branches/r5rs/sigscheme/src/sigschemeinternal.h	2006-02-02 16:00:59 UTC (rev 3082)
+++ branches/r5rs/sigscheme/src/sigschemeinternal.h	2006-02-02 16:05:56 UTC (rev 3083)
@@ -437,13 +437,16 @@
     SCM_CH_NONHEX_LETTER      = 1 << 4, /* [g-zG-Z] */
     SCM_CH_SPECIAL_INITIAL    = 1 << 5, /* [!$%&*\/:<=>?^_~] */
     SCM_CH_SPECIAL_SUBSEQUENT = 1 << 6, /* [-+\.@] */
-    SCM_CH_TOKEN_INITIAL      = 1 << 7, /* [()#'`,\.\"\|\{\}\[\]] */
+    /* currently '.' is not included in SCM_CH_TOKEN_INITIAL */
+    SCM_CH_TOKEN_INITIAL      = 1 << 7, /* [()#'`,\"\|\{\}\[\]] */
 
     SCM_CH_LETTER     = SCM_CH_HEX_LETTER | SCM_CH_NONHEX_LETTER,
     SCM_CH_HEX_DIGIT  = SCM_CH_DIGIT | SCM_CH_HEX_LETTER,
     SCM_CH_INITIAL    = SCM_CH_LETTER | SCM_CH_SPECIAL_INITIAL,
     SCM_CH_SUBSEQUENT = SCM_CH_INITIAL | SCM_CH_DIGIT,
     SCM_CH_PECULIAR_IDENTIFIER_CAND = SCM_CH_SPECIAL_SUBSEQUENT,
+    SCM_CH_DELIMITER
+        = SCM_CH_CONTROL | SCM_CH_WHITESPACE | SCM_CH_TOKEN_INITIAL,
 
     /* beyond ASCII */
     SCM_CH_ASCII              = 0 << 8,
@@ -455,15 +458,20 @@
 
 extern const unsigned char scm_char_class_table[];
 
-#define ICHAR_ASCIIP(c)      (SCM_ASSERT(0 <= (c)), (c) <= 127)
+/* accepts EOF */
+#define ICHAR_ASCIIP(c)      (0 <= (c) && (c) <= 127)
 #define ICHAR_ASCII_CLASS(c)                                                 \
     (ICHAR_ASCIIP(c) ? scm_char_class_table[c] : SCM_CH_INVALID)
 #define ICHAR_CLASS(c)                                                       \
-    (ICHAR_ASCIIP(c) ? scm_char_class_table[c] : SCM_CH_NONASCII)
+    ((127 < (c)) ? SCM_CH_NONASCII                                           \
+                 : (((c) < 0) ? SCM_CH_INVALID : scm_char_class_table[c]))
 
 #define ICHAR_CONTROLP(c)    ((0 <= (c) && (c) <= 31) || (c) == 127)
 #define ICHAR_WHITESPACEP(c) ((c) == ' ' || ('\t' <= (c) && (c) <= '\r'))
 #define ICHAR_NUMERICP(c)    ('0' <= (c) && (c) <= '9')
+#define ICHAR_HEXA_NUMERICP(c) (ICHAR_NUMERICP(c)                            \
+                                || ('a' <= (c) && (c) <= 'f')                \
+                                || ('A' <= (c) && (c) <= 'F'))
 #define ICHAR_ALPHABETICP(c) (ICHAR_UPPER_CASEP(c) || ICHAR_LOWER_CASEP(c))
 #define ICHAR_UPPER_CASEP(c) ('A' <= (c) && (c) <= 'Z')
 #define ICHAR_LOWER_CASEP(c) ('a' <= (c) && (c) <= 'z')



More information about the uim-commit mailing list