[uim-commit] r3083 - branches/r5rs/sigscheme/src
yamaken at freedesktop.org
yamaken at freedesktop.org
Thu Feb 2 08:06:12 PST 2006
Author: yamaken
Date: 2006-02-02 08:05:56 -0800 (Thu, 02 Feb 2006)
New Revision: 3083
Modified:
branches/r5rs/sigscheme/src/char.c
branches/r5rs/sigscheme/src/read.c
branches/r5rs/sigscheme/src/sigschemeinternal.h
Log:
* sigscheme/src/char.c
- (scm_char_class_table): Comment out SCM_CH_TOKEN_INITIAL from '.'
* sigscheme/src/sigschemeinternal.h
- (enum ScmCharClass):
* Modify comment about SCM_CH_TOKEN_INITIAL
* Add new member 'SCM_CH_DELIMITER'
- (ICHAR_ASCIIP, ICHAR_CLASS): Make EOF-acceptable
- (ICHAR_HEXA_NUMERICP): New macro
* sigscheme/src/read.c
- Exclude ctype.h
- (WHITESPACE_CHARS, DELIMITER_CHARS): Removed
- (skip_comment_and_space, read_sequence, read_sexpression,
read_list, read_list, parse_unicode_sequence, read_symbol,
read_number_or_symbol, read_number): Simplify with new ichar
macros
- (read_token):
* Ditto
* Change type for arg 'delim' to enum ScmCharClass
Modified: branches/r5rs/sigscheme/src/char.c
===================================================================
--- branches/r5rs/sigscheme/src/char.c 2006-02-02 16:00:59 UTC (rev 3082)
+++ branches/r5rs/sigscheme/src/char.c 2006-02-02 16:05:56 UTC (rev 3083)
@@ -180,7 +180,7 @@
SCM_CH_SPECIAL_SUBSEQUENT, /* 43 + */
SCM_CH_TOKEN_INITIAL, /* 44 , */
SCM_CH_SPECIAL_SUBSEQUENT, /* 45 - */
- SCM_CH_SPECIAL_SUBSEQUENT | SCM_CH_TOKEN_INITIAL, /* 46 . */
+ SCM_CH_SPECIAL_SUBSEQUENT /* | SCM_CH_TOKEN_INITIAL */, /* 46 . */
SCM_CH_SPECIAL_INITIAL, /* 47 / */
SCM_CH_DIGIT, /* 48 0 */
SCM_CH_DIGIT, /* 49 1 */
Modified: branches/r5rs/sigscheme/src/read.c
===================================================================
--- branches/r5rs/sigscheme/src/read.c 2006-02-02 16:00:59 UTC (rev 3082)
+++ branches/r5rs/sigscheme/src/read.c 2006-02-02 16:05:56 UTC (rev 3083)
@@ -33,13 +33,14 @@
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
===========================================================================*/
+/* TODO: replace with character class sequence expression-based tokenizer */
+
#include "config.h"
/*=======================================
System Include
=======================================*/
#include <limits.h>
-#include <ctype.h>
#include <stdlib.h>
#include <string.h>
@@ -71,9 +72,6 @@
/* #b-010101... */
#define INT_LITERAL_LEN_MAX (sizeof((char)'-') + SCM_INT_BITS)
-#define WHITESPACE_CHARS " \t\n\r\v\f"
-#define DELIMITER_CHARS "()\";" WHITESPACE_CHARS
-
#define DISCARD_LOOKAHEAD(port) (scm_port_get_char(port))
/*=======================================
@@ -86,7 +84,7 @@
static scm_ichar_t skip_comment_and_space(ScmObj port);
static void read_sequence(ScmObj port, char *buf, int len);
static size_t read_token(ScmObj port, int *err,
- char *buf, size_t buf_size, const char *delim);
+ char *buf, size_t buf_size, enum ScmCharClass delim);
static ScmObj read_sexpression(ScmObj port);
static ScmObj read_list(ScmObj port, scm_ichar_t closeParen);
@@ -149,7 +147,7 @@
case LEX_ST_NORMAL:
if (c == ';')
state = LEX_ST_COMMENT;
- else if (!isascii(c) || !isspace(c) || c == EOF)
+ else if (!ICHAR_WHITESPACEP(c) || c == EOF)
return c; /* peeked */
break;
@@ -175,7 +173,7 @@
c = scm_port_get_char(port);
if (c == EOF)
ERR("unexpected EOF");
- if (!isascii(c))
+ if (!ICHAR_ASCIIP(c))
ERR("unexpected non-ASCII char");
*p = c;
}
@@ -183,10 +181,11 @@
}
static size_t
-read_token(ScmObj port,
- int *err, char *buf, size_t buf_size, const char *delim)
+read_token(ScmObj port, int *err,
+ char *buf, size_t buf_size, enum ScmCharClass delim)
{
ScmCharCodec *codec;
+ enum ScmCharClass ch_class;
scm_ichar_t c;
size_t len;
char *p;
@@ -194,25 +193,20 @@
for (p = buf;;) {
c = scm_port_peek_char(port);
+ ch_class = ICHAR_CLASS(c);
CDBG((SCM_DBG_PARSER, "c = %c", (int)c));
if (p == buf) {
if (c == EOF)
ERR("unexpected EOF at a token");
} else {
- if (strchr(delim, c) || c == EOF) {
+ if (ch_class & delim || c == EOF) {
*err = OK;
break;
}
}
- if (isascii(c)) {
- if (p == &buf[buf_size - sizeof("")]) {
- *err = TOKEN_BUF_EXCEEDED;
- break;
- }
- *p++ = c;
- } else {
+ if (ch_class & SCM_CH_NONASCII) {
#if SCM_USE_SRFI75
if (&buf[buf_size] <= p + SCM_MB_MAX_LEN) {
*err = TOKEN_BUF_EXCEEDED;
@@ -228,6 +222,12 @@
#else
ERR("non-ASCII char in token: 0x%x", (int)c);
#endif
+ } else {
+ if (p == &buf[buf_size - sizeof("")]) {
+ *err = TOKEN_BUF_EXCEEDED;
+ break;
+ }
+ *p++ = c;
}
DISCARD_LOOKAHEAD(port);
}
@@ -241,6 +241,7 @@
read_sexpression(ScmObj port)
{
ScmObj ret;
+ enum ScmCharClass ch_class;
scm_ichar_t c;
DECLARE_INTERNAL_FUNCTION("read");
@@ -251,22 +252,26 @@
CDBG((SCM_DBG_PARSER, "read_sexpression c = %c", (int)c));
+ ch_class = ICHAR_CLASS(c);
+ if (ch_class & (SCM_CH_INITIAL | SCM_CH_NONASCII))
+ return read_symbol(port);
+
+ if (ch_class & (SCM_CH_DIGIT | SCM_CH_PECULIAR_IDENTIFIER_CAND))
+ return read_number_or_symbol(port);
+
/* case labels are ordered by appearance rate and penalty cost */
+ DISCARD_LOOKAHEAD(port);
switch (c) {
case '(':
- DISCARD_LOOKAHEAD(port);
return read_list(port, ')');
case '\"':
- DISCARD_LOOKAHEAD(port);
return read_string(port);
case '\'':
- DISCARD_LOOKAHEAD(port);
return read_quote(port, SYM_QUOTE);
case '#':
- DISCARD_LOOKAHEAD(port);
c = scm_port_get_char(port);
switch (c) {
case 't':
@@ -291,11 +296,9 @@
break;
case '`':
- DISCARD_LOOKAHEAD(port);
return read_quote(port, SYM_QUASIQUOTE);
case ',':
- DISCARD_LOOKAHEAD(port);
c = scm_port_peek_char(port);
switch (c) {
case EOF:
@@ -310,12 +313,6 @@
return read_quote(port, SYM_UNQUOTE);
}
- case '.': case '+': case '-':
- case '1': case '2': case '3': case '4': case '5':
- case '6': case '7': case '8': case '9': case '0':
- case '@':
- return read_number_or_symbol(port);
-
case ')':
ERR("invalid close parenthesis");
/* NOTREACHED */
@@ -324,7 +321,7 @@
return SCM_EOF;
default:
- return read_symbol(port);
+ SCM_ASSERT(scm_false);
}
}
}
@@ -374,7 +371,7 @@
* '...' and numbers in R5RS (See "7.1.1 Lexical structure"), fixed
* size buffer can safely buffer them.
*/
- read_token(port, &err, dot_buf, sizeof(dot_buf), DELIMITER_CHARS);
+ read_token(port, &err, dot_buf, sizeof(dot_buf), SCM_CH_DELIMITER);
if (dot_buf[1] == '\0') {
#if !SCM_STRICT_R5RS
@@ -386,7 +383,7 @@
* require explicit whitespace around the dot.
*/
c = scm_port_peek_char(port);
- if (!strchr(WHITESPACE_CHARS, c))
+ if (!ICHAR_WHITESPACEP(c))
ERR("implicit dot delimitation is disabled to avoid compatibility problem");
#endif
if (NULLP(lst))
@@ -420,7 +417,7 @@
DECLARE_INTERNAL_FUNCTION("read");
/* reject ordinary char literal and invalid signed hexadecimal */
- if (len < 3 || !isxdigit(seq[1]))
+ if (len < 3 || !ICHAR_HEXA_NUMERICP(seq[1]))
return -1;
c = strtol(&seq[1], &end, 16);
@@ -492,15 +489,15 @@
/* plain char (multibyte-ready) */
c = scm_port_get_char(port);
next = scm_port_peek_char(port);
- if (strchr(DELIMITER_CHARS, next) || next == EOF)
+ if (ICHAR_ASCII_CLASS(next) & SCM_CH_DELIMITER || next == EOF)
return MAKE_CHAR(c);
#if SCM_USE_SRFI75
- else if (!isascii(c))
+ else if (!ICHAR_ASCIIP(c))
ERR("invalid character literal");
#endif
buf[0] = c;
- len = read_token(port, &err, &buf[1], sizeof(buf) - 1, DELIMITER_CHARS);
+ len = read_token(port, &err, &buf[1], sizeof(buf) - 1, SCM_CH_DELIMITER);
if (err == TOKEN_BUF_EXCEEDED)
ERR("invalid character literal");
@@ -635,7 +632,7 @@
tail_len = read_token(port, &err,
&LBUF_BUF(lbuf)[offset],
LBUF_SIZE(lbuf) - offset,
- DELIMITER_CHARS);
+ SCM_CH_DELIMITER);
if (err != TOKEN_BUF_EXCEEDED)
break;
offset += tail_len;
@@ -660,42 +657,43 @@
CDBG((SCM_DBG_PARSER, "read"));
c = scm_port_peek_char(port);
+ SCM_ASSERT(ICHAR_ASCII_CLASS(c)
+ & (SCM_CH_DIGIT | SCM_CH_PECULIAR_IDENTIFIER_CAND));
- if (isascii(c)) {
- if (isdigit(c))
- return read_number(port, 'd');
+ if (ICHAR_NUMERICP(c))
+ return read_number(port, 'd');
- if (c == '+' || c == '-') {
- len = read_token(port, &err, buf, sizeof(buf), DELIMITER_CHARS);
- if (err == TOKEN_BUF_EXCEEDED)
- ERR("invalid number literal");
+ if (c == '+' || c == '-') {
+ len = read_token(port, &err, buf, sizeof(buf), SCM_CH_DELIMITER);
+ if (err == TOKEN_BUF_EXCEEDED)
+ ERR("invalid number literal");
- if (!buf[1] /* '+' or '-' */
+ if (!buf[1] /* '+' or '-' */
#if !SCM_STRICT_R5RS
- || (c == '-' && isalpha(buf[1])) /* '-sym' */
+ /* FIXME: Obsolete with SRFI-75 style '|-sym| */
+ || (c == '-' && ICHAR_ALPHABETICP(buf[1])) /* '-sym' */
#endif
- )
- {
- return scm_intern(buf);
- }
-
- return parse_number(port, buf, sizeof(buf), 'd');
+ )
+ {
+ return scm_intern(buf);
}
- if (c == '.') {
- read_token(port, &err, buf, sizeof(buf), DELIMITER_CHARS);
- if (strcmp(buf, "...") == 0)
- return scm_intern(buf);
- /* TODO: support numeric expressions when the numeric tower is
- implemented */
- ERR("invalid identifier: %s", buf);
- }
+ return parse_number(port, buf, sizeof(buf), 'd');
+ }
- if (c == '@')
- ERR("invalid identifier: %s", buf);
+ if (c == '.') {
+ read_token(port, &err, buf, sizeof(buf), SCM_CH_DELIMITER);
+ if (strcmp(buf, "...") == 0)
+ return scm_intern(buf);
+ /* TODO: support numeric expressions when the numeric tower is
+ implemented */
+ ERR("invalid identifier: %s", buf);
}
+ if (c == '@')
+ ERR("invalid identifier: %s", buf);
+
return read_symbol(port);
}
@@ -735,7 +733,7 @@
char buf[INT_LITERAL_LEN_MAX + sizeof("")];
DECLARE_INTERNAL_FUNCTION("read");
- len = read_token(port, &err, buf, sizeof(buf), DELIMITER_CHARS);
+ len = read_token(port, &err, buf, sizeof(buf), SCM_CH_DELIMITER);
if (err == TOKEN_BUF_EXCEEDED)
ERR("invalid number literal");
Modified: branches/r5rs/sigscheme/src/sigschemeinternal.h
===================================================================
--- branches/r5rs/sigscheme/src/sigschemeinternal.h 2006-02-02 16:00:59 UTC (rev 3082)
+++ branches/r5rs/sigscheme/src/sigschemeinternal.h 2006-02-02 16:05:56 UTC (rev 3083)
@@ -437,13 +437,16 @@
SCM_CH_NONHEX_LETTER = 1 << 4, /* [g-zG-Z] */
SCM_CH_SPECIAL_INITIAL = 1 << 5, /* [!$%&*\/:<=>?^_~] */
SCM_CH_SPECIAL_SUBSEQUENT = 1 << 6, /* [-+\.@] */
- SCM_CH_TOKEN_INITIAL = 1 << 7, /* [()#'`,\.\"\|\{\}\[\]] */
+ /* currently '.' is not included in SCM_CH_TOKEN_INITIAL */
+ SCM_CH_TOKEN_INITIAL = 1 << 7, /* [()#'`,\"\|\{\}\[\]] */
SCM_CH_LETTER = SCM_CH_HEX_LETTER | SCM_CH_NONHEX_LETTER,
SCM_CH_HEX_DIGIT = SCM_CH_DIGIT | SCM_CH_HEX_LETTER,
SCM_CH_INITIAL = SCM_CH_LETTER | SCM_CH_SPECIAL_INITIAL,
SCM_CH_SUBSEQUENT = SCM_CH_INITIAL | SCM_CH_DIGIT,
SCM_CH_PECULIAR_IDENTIFIER_CAND = SCM_CH_SPECIAL_SUBSEQUENT,
+ SCM_CH_DELIMITER
+ = SCM_CH_CONTROL | SCM_CH_WHITESPACE | SCM_CH_TOKEN_INITIAL,
/* beyond ASCII */
SCM_CH_ASCII = 0 << 8,
@@ -455,15 +458,20 @@
extern const unsigned char scm_char_class_table[];
-#define ICHAR_ASCIIP(c) (SCM_ASSERT(0 <= (c)), (c) <= 127)
+/* accepts EOF */
+#define ICHAR_ASCIIP(c) (0 <= (c) && (c) <= 127)
#define ICHAR_ASCII_CLASS(c) \
(ICHAR_ASCIIP(c) ? scm_char_class_table[c] : SCM_CH_INVALID)
#define ICHAR_CLASS(c) \
- (ICHAR_ASCIIP(c) ? scm_char_class_table[c] : SCM_CH_NONASCII)
+ ((127 < (c)) ? SCM_CH_NONASCII \
+ : (((c) < 0) ? SCM_CH_INVALID : scm_char_class_table[c]))
#define ICHAR_CONTROLP(c) ((0 <= (c) && (c) <= 31) || (c) == 127)
#define ICHAR_WHITESPACEP(c) ((c) == ' ' || ('\t' <= (c) && (c) <= '\r'))
#define ICHAR_NUMERICP(c) ('0' <= (c) && (c) <= '9')
+#define ICHAR_HEXA_NUMERICP(c) (ICHAR_NUMERICP(c) \
+ || ('a' <= (c) && (c) <= 'f') \
+ || ('A' <= (c) && (c) <= 'F'))
#define ICHAR_ALPHABETICP(c) (ICHAR_UPPER_CASEP(c) || ICHAR_LOWER_CASEP(c))
#define ICHAR_UPPER_CASEP(c) ('A' <= (c) && (c) <= 'Z')
#define ICHAR_LOWER_CASEP(c) ('a' <= (c) && (c) <= 'z')
More information about the uim-commit
mailing list