[uim-commit] r3079 - branches/r5rs/sigscheme/src

yamaken at freedesktop.org yamaken at freedesktop.org
Thu Feb 2 06:02:06 PST 2006


Author: yamaken
Date: 2006-02-02 06:02:00 -0800 (Thu, 02 Feb 2006)
New Revision: 3079

Modified:
   branches/r5rs/sigscheme/src/char.c
   branches/r5rs/sigscheme/src/sigschemeinternal.h
Log:
* sigscheme/src/sigschemeinternal.h
  - Exclude ctype.h
  - (enum ScmCharClass): New type
  - (scm_char_class_table): New variable decl
  - (ICHAR_ASCIIP, ICHAR_ASCII_CLASS, ICHAR_CLASS, ICHAR_ALPHABETICP,
    ICHAR_NUMERICP, ICHAR_WHITESPACEP, ICHAR_UPPER_CASEP,
    ICHAR_LOWER_CASEP): New macro
  - (ICHAR_DOWNCASE, ICHAR_UPCASE): Make efficient
* sigscheme/src/char.c
  - Exclude ctype.h and stdlib.h
  - Cite "7.1.1 Lexical structure" of R5RS
  - (scm_char_class_table): New variable
  - (scm_p_char_alphabeticp, scm_p_char_numericp,
    scm_p_char_whitespacep, scm_p_char_upper_casep,
    scm_p_char_lower_casep, scm_p_integer2char): Simplify with the new
    macros


Modified: branches/r5rs/sigscheme/src/char.c
===================================================================
--- branches/r5rs/sigscheme/src/char.c	2006-02-02 11:30:13 UTC (rev 3078)
+++ branches/r5rs/sigscheme/src/char.c	2006-02-02 14:02:00 UTC (rev 3079)
@@ -38,8 +38,6 @@
 /*=======================================
   System Include
 =======================================*/
-#include <ctype.h>
-#include <stdlib.h>
 
 /*=======================================
   Local Include
@@ -58,7 +56,214 @@
 /*=======================================
   Variable Declarations
 =======================================*/
+/*
+ * R5RS: 7.1.1 Lexical structure
+ *
+ * <token> --> <identifier> | <boolean> | <number> | <character> | <string>
+ *      | ( | ) | #( | ' | ` | , | ,@ | .
+ * <delimiter> --> <whitespace> | ( | ) | " | ;
+ * <whitespace> --> <space or newline>
+ * <comment> --> ;  <all subsequent characters up to a
+ *                  line break>
+ * <atmosphere> --> <whitespace> | <comment>
+ * <intertoken space> --> <atmosphere>*
+ * 
+ * <identifier> --> <initial> <subsequent>* | <peculiar identifier>
+ * <initial> --> <letter> | <special initial>
+ * <letter> --> a | b | c | ... | z
+ * 
+ * <special initial> --> ! | $ | % | & | * | / | : | < | = | > | ? | ^ | _ | ~
+ * <subsequent> --> <initial> | <digit> | <special subsequent>
+ * <digit> --> 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9
+ * <special subsequent> --> + | - | . | @
+ * <peculiar identifier> --> + | - | ...
+ * <syntactic keyword> --> <expression keyword>
+ *      | else | => | define 
+ *      | unquote | unquote-splicing
+ * <expression keyword> --> quote | lambda | if
+ *      | set! | begin | cond | and | or | case
+ *      | let | let* | letrec | do | delay
+ *      | quasiquote
+ * 
+ * `<variable> => <'any <identifier> that isn't
+ *                 also a <syntactic keyword>>
+ * 
+ * <boolean> --> #t | #f
+ * <character> --> #\ <any character>
+ *      | #\ <character name>
+ * <character name> --> space | newline
+ * 
+ * <string> --> " <string element>* "
+ * <string element> --> <any character other than " or \>
+ *      | \" | \\ 
+ * 
+ * <number> --> <num 2>| <num 8>
+ *      | <num 10>| <num 16>
+ * 
+ * 
+ * <num R> --> <prefix R> <complex R>
+ * <complex R> --> <real R> | <real R> @ <real R>
+ *     | <real R> + <ureal R> i | <real R> - <ureal R> i
+ *     | <real R> + i | <real R> - i
+ *     | + <ureal R> i | - <ureal R> i | + i | - i
+ * <real R> --> <sign> <ureal R>
+ * <ureal R> --> <uinteger R>
+ *     | <uinteger R> / <uinteger R>
+ *     | <decimal R>
+ * <decimal 10> --> <uinteger 10> <suffix>
+ *     | . <digit 10>+ #* <suffix>
+ *     | <digit 10>+ . <digit 10>* #* <suffix>
+ *     | <digit 10>+ #+ . #* <suffix>
+ * <uinteger R> --> <digit R>+ #*
+ * <prefix R> --> <radix R> <exactness>
+ *     | <exactness> <radix R>
+ * 
+ * <suffix> --> <empty> 
+ *     | <exponent marker> <sign> <digit 10>+
+ * <exponent marker> --> e | s | f | d | l
+ * <sign> --> <empty>  | + |  -
+ * <exactness> --> <empty> | #i | #e
+ * <radix 2> --> #b
+ * <radix 8> --> #o
+ * <radix 10> --> <empty> | #d
+ * <radix 16> --> #x
+ * <digit 2> --> 0 | 1
+ * <digit 8> --> 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7
+ * <digit 10> --> <digit>
+ * <digit 16> --> <digit 10> | a | b | c | d | e | f 
+ */
 
+const unsigned char scm_char_class_table[] = {
+    SCM_CH_CONTROL,            /*   0  nul       */
+    SCM_CH_CONTROL,            /*   1  x01       */
+    SCM_CH_CONTROL,            /*   2  x02       */
+    SCM_CH_CONTROL,            /*   3  x03       */
+    SCM_CH_CONTROL,            /*   4  x04       */
+    SCM_CH_CONTROL,            /*   5  x05       */
+    SCM_CH_CONTROL,            /*   6  x06       */
+    SCM_CH_CONTROL,            /*   7  alarm     */
+    SCM_CH_CONTROL,            /*   8  backspace */
+    SCM_CH_CONTROL | SCM_CH_WHITESPACE, /*   9  tab       */
+    SCM_CH_CONTROL | SCM_CH_WHITESPACE, /*  10  newline   */
+    SCM_CH_CONTROL | SCM_CH_WHITESPACE, /*  11  vtab      */
+    SCM_CH_CONTROL | SCM_CH_WHITESPACE, /*  12  page      */
+    SCM_CH_CONTROL | SCM_CH_WHITESPACE, /*  13  return    */
+    SCM_CH_CONTROL,            /*  14  x0e       */
+    SCM_CH_CONTROL,            /*  15  x0f       */
+    SCM_CH_CONTROL,            /*  16  x10       */
+    SCM_CH_CONTROL,            /*  17  x11       */
+    SCM_CH_CONTROL,            /*  18  x12       */
+    SCM_CH_CONTROL,            /*  19  x13       */
+    SCM_CH_CONTROL,            /*  20  x14       */
+    SCM_CH_CONTROL,            /*  21  x15       */
+    SCM_CH_CONTROL,            /*  22  x16       */
+    SCM_CH_CONTROL,            /*  23  x17       */
+    SCM_CH_CONTROL,            /*  24  x18       */
+    SCM_CH_CONTROL,            /*  25  x19       */
+    SCM_CH_CONTROL,            /*  26  x1a       */
+    SCM_CH_CONTROL,            /*  27  esc       */
+    SCM_CH_CONTROL,            /*  28  x1c       */
+    SCM_CH_CONTROL,            /*  29  x1d       */
+    SCM_CH_CONTROL,            /*  30  x1e       */
+    SCM_CH_CONTROL,            /*  31  x1f       */
+    SCM_CH_WHITESPACE,         /*  32  space     */
+    SCM_CH_SPECIAL_INITIAL,    /*  33  !         */
+    SCM_CH_TOKEN_INITIAL,      /*  34  "         */
+    SCM_CH_TOKEN_INITIAL,      /*  35  #         */
+    SCM_CH_SPECIAL_INITIAL,    /*  36  $         */
+    SCM_CH_SPECIAL_INITIAL,    /*  37  %         */
+    SCM_CH_SPECIAL_INITIAL,    /*  38  &         */
+    SCM_CH_TOKEN_INITIAL,      /*  39  '         */
+    SCM_CH_TOKEN_INITIAL,      /*  40  (         */
+    SCM_CH_TOKEN_INITIAL,      /*  41  )         */
+    SCM_CH_SPECIAL_INITIAL,    /*  42  *         */
+    SCM_CH_SPECIAL_SUBSEQUENT, /*  43  +         */
+    SCM_CH_TOKEN_INITIAL,      /*  44  ,         */
+    SCM_CH_SPECIAL_SUBSEQUENT, /*  45  -         */
+    SCM_CH_SPECIAL_SUBSEQUENT | SCM_CH_TOKEN_INITIAL, /*  46  .         */
+    SCM_CH_SPECIAL_INITIAL,    /*  47  /         */
+    SCM_CH_DIGIT,              /*  48  0         */
+    SCM_CH_DIGIT,              /*  49  1         */
+    SCM_CH_DIGIT,              /*  50  2         */
+    SCM_CH_DIGIT,              /*  51  3         */
+    SCM_CH_DIGIT,              /*  52  4         */
+    SCM_CH_DIGIT,              /*  53  5         */
+    SCM_CH_DIGIT,              /*  54  6         */
+    SCM_CH_DIGIT,              /*  55  7         */
+    SCM_CH_DIGIT,              /*  56  8         */
+    SCM_CH_DIGIT,              /*  57  9         */
+    SCM_CH_SPECIAL_INITIAL,    /*  58  :         */
+    SCM_CH_TOKEN_INITIAL,      /*  59  ;         */
+    SCM_CH_SPECIAL_INITIAL,    /*  60  <         */
+    SCM_CH_SPECIAL_INITIAL,    /*  61  =         */
+    SCM_CH_SPECIAL_INITIAL,    /*  62  >         */
+    SCM_CH_SPECIAL_INITIAL,    /*  63  ?         */
+    SCM_CH_SPECIAL_SUBSEQUENT, /*  64  @         */
+    SCM_CH_HEX_LETTER,         /*  65  A         */
+    SCM_CH_HEX_LETTER,         /*  66  B         */
+    SCM_CH_HEX_LETTER,         /*  67  C         */
+    SCM_CH_HEX_LETTER,         /*  68  D         */
+    SCM_CH_HEX_LETTER,         /*  69  E         */
+    SCM_CH_HEX_LETTER,         /*  70  F         */
+    SCM_CH_NONHEX_LETTER,      /*  71  G         */
+    SCM_CH_NONHEX_LETTER,      /*  72  H         */
+    SCM_CH_NONHEX_LETTER,      /*  73  I         */
+    SCM_CH_NONHEX_LETTER,      /*  74  J         */
+    SCM_CH_NONHEX_LETTER,      /*  75  K         */
+    SCM_CH_NONHEX_LETTER,      /*  76  L         */
+    SCM_CH_NONHEX_LETTER,      /*  77  M         */
+    SCM_CH_NONHEX_LETTER,      /*  78  N         */
+    SCM_CH_NONHEX_LETTER,      /*  79  O         */
+    SCM_CH_NONHEX_LETTER,      /*  80  P         */
+    SCM_CH_NONHEX_LETTER,      /*  81  Q         */
+    SCM_CH_NONHEX_LETTER,      /*  82  R         */
+    SCM_CH_NONHEX_LETTER,      /*  83  S         */
+    SCM_CH_NONHEX_LETTER,      /*  84  T         */
+    SCM_CH_NONHEX_LETTER,      /*  85  U         */
+    SCM_CH_NONHEX_LETTER,      /*  86  V         */
+    SCM_CH_NONHEX_LETTER,      /*  87  W         */
+    SCM_CH_NONHEX_LETTER,      /*  88  X         */
+    SCM_CH_NONHEX_LETTER,      /*  89  Y         */
+    SCM_CH_NONHEX_LETTER,      /*  90  Z         */
+    SCM_CH_TOKEN_INITIAL,      /*  91  [         */
+    SCM_CH_CONTROL,            /*  92  \\        */
+    SCM_CH_TOKEN_INITIAL,      /*  93  ]         */
+    SCM_CH_SPECIAL_INITIAL,    /*  94  ^         */
+    SCM_CH_SPECIAL_INITIAL,    /*  95  _         */
+    SCM_CH_TOKEN_INITIAL,      /*  96  `         */
+    SCM_CH_HEX_LETTER,         /*  97  a         */
+    SCM_CH_HEX_LETTER,         /*  98  b         */
+    SCM_CH_HEX_LETTER,         /*  99  c         */
+    SCM_CH_HEX_LETTER,         /* 100  d         */
+    SCM_CH_HEX_LETTER,         /* 101  e         */
+    SCM_CH_HEX_LETTER,         /* 102  f         */
+    SCM_CH_NONHEX_LETTER,      /* 103  g         */
+    SCM_CH_NONHEX_LETTER,      /* 104  h         */
+    SCM_CH_NONHEX_LETTER,      /* 105  i         */
+    SCM_CH_NONHEX_LETTER,      /* 106  j         */
+    SCM_CH_NONHEX_LETTER,      /* 107  k         */
+    SCM_CH_NONHEX_LETTER,      /* 108  l         */
+    SCM_CH_NONHEX_LETTER,      /* 109  m         */
+    SCM_CH_NONHEX_LETTER,      /* 110  n         */
+    SCM_CH_NONHEX_LETTER,      /* 111  o         */
+    SCM_CH_NONHEX_LETTER,      /* 112  p         */
+    SCM_CH_NONHEX_LETTER,      /* 113  q         */
+    SCM_CH_NONHEX_LETTER,      /* 114  r         */
+    SCM_CH_NONHEX_LETTER,      /* 115  s         */
+    SCM_CH_NONHEX_LETTER,      /* 116  t         */
+    SCM_CH_NONHEX_LETTER,      /* 117  u         */
+    SCM_CH_NONHEX_LETTER,      /* 118  v         */
+    SCM_CH_NONHEX_LETTER,      /* 119  w         */
+    SCM_CH_NONHEX_LETTER,      /* 120  x         */
+    SCM_CH_NONHEX_LETTER,      /* 121  y         */
+    SCM_CH_NONHEX_LETTER,      /* 122  z         */
+    SCM_CH_TOKEN_INITIAL,      /* 123  {         */
+    SCM_CH_TOKEN_INITIAL,      /* 124  |         */
+    SCM_CH_TOKEN_INITIAL,      /* 125  }         */
+    SCM_CH_SPECIAL_INITIAL,    /* 126  ~         */
+    SCM_CH_CONTROL,            /* 127  delete    */
+};
+
 /*=======================================
   File Local Function Declarations
 =======================================*/
@@ -198,7 +403,7 @@
 
     val = SCM_CHAR_VALUE(ch);
 
-    return MAKE_BOOL(isascii(val) && isalpha(val));
+    return MAKE_BOOL(ICHAR_ALPHABETICP(val));
 }
 
 ScmObj
@@ -211,7 +416,7 @@
 
     val = SCM_CHAR_VALUE(ch);
 
-    return MAKE_BOOL(isascii(val) && isdigit(val));
+    return MAKE_BOOL(ICHAR_NUMERICP(val));
 }
 
 ScmObj
@@ -224,7 +429,7 @@
 
     val = SCM_CHAR_VALUE(ch);
 
-    return MAKE_BOOL(isascii(val) && isspace(val));
+    return MAKE_BOOL(ICHAR_WHITESPACEP(val));
 }
 
 ScmObj
@@ -237,7 +442,7 @@
 
     val = SCM_CHAR_VALUE(ch);
 
-    return MAKE_BOOL(isascii(val) && isupper(val));
+    return MAKE_BOOL(ICHAR_UPPER_CASEP(val));
 }
 
 ScmObj
@@ -250,7 +455,7 @@
 
     val = SCM_CHAR_VALUE(ch);
 
-    return MAKE_BOOL(isascii(val) && islower(val));
+    return MAKE_BOOL(ICHAR_LOWER_CASEP(val));
 }
 
 ScmObj
@@ -275,7 +480,7 @@
 #if SCM_USE_MULTIBYTE_CHAR
     if (!SCM_CHARCODEC_CHAR_LEN(scm_current_char_codec, val))
 #else
-    if (!isascii(val))
+    if (!ICHAR_ASCIIP(val))
 #endif
         ERR_OBJ("invalid char value", n);
 

Modified: branches/r5rs/sigscheme/src/sigschemeinternal.h
===================================================================
--- branches/r5rs/sigscheme/src/sigschemeinternal.h	2006-02-02 11:30:13 UTC (rev 3078)
+++ branches/r5rs/sigscheme/src/sigschemeinternal.h	2006-02-02 14:02:00 UTC (rev 3079)
@@ -39,7 +39,6 @@
 =======================================*/
 #include <stddef.h>
 #include <string.h>
-#include <ctype.h> /* for char macros */
 
 /*=======================================
    Local Include
@@ -428,10 +427,50 @@
 /*=======================================
    Characters
 =======================================*/
+enum ScmCharClass {
+    /* ASCII */
+    SCM_CH_INVALID            = 0,
+    SCM_CH_CONTROL            = 1 << 0, /* iscntrl(3) + backslash */
+    SCM_CH_WHITESPACE         = 1 << 1, /* [ \t\n\r\v\f] */
+    SCM_CH_DIGIT              = 1 << 2, /* [0-9] */
+    SCM_CH_HEX_LETTER         = 1 << 3, /* [a-fA-F] */
+    SCM_CH_NONHEX_LETTER      = 1 << 4, /* [g-zG-Z] */
+    SCM_CH_SPECIAL_INITIAL    = 1 << 5, /* [!$%&*\/:<=>?^_~] */
+    SCM_CH_SPECIAL_SUBSEQUENT = 1 << 6, /* [-+\.@] */
+    SCM_CH_TOKEN_INITIAL      = 1 << 7, /* [()#'`,\.\"\|\{\}\[\]] */
+
+    SCM_CH_LETTER     = SCM_CH_HEX_LETTER | SCM_CH_NONHEX_LETTER,
+    SCM_CH_HEX_DIGIT  = SCM_CH_DIGIT | SCM_CH_HEX_LETTER,
+    SCM_CH_INITIAL    = SCM_CH_LETTER | SCM_CH_SPECIAL_INITIAL,
+    SCM_CH_SUBSEQUENT = SCM_CH_INITIAL | SCM_CH_DIGIT,
+    SCM_CH_PECULIAR_IDENTIFIER_CAND = SCM_CH_SPECIAL_SUBSEQUENT,
+
+    /* beyond ASCII */
+    SCM_CH_ASCII              = 0 << 8,
+    SCM_CH_8BIT               = 1 << 8,
+    SCM_CH_MULTIBYTE          = 1 << 9,
+
+    SCM_CH_NONASCII           = SCM_CH_8BIT | SCM_CH_MULTIBYTE
+};
+
+extern const unsigned char scm_char_class_table[];
+
+#define ICHAR_ASCIIP(c)      (SCM_ASSERT(0 <= (c)), (c) <= 127)
+#define ICHAR_ASCII_CLASS(c)                                                 \
+    (ICHAR_ASCIIP(c) ? scm_char_class_table[c] : SCM_CH_INVALID)
+#define ICHAR_CLASS(c)                                                       \
+    (ICHAR_ASCIIP(c) ? scm_char_class_table[c] : SCM_CH_NONASCII)
+
+#define ICHAR_ALPHABETICP(c) (ICHAR_UPPER_CASEP(c) || ICHAR_LOWER_CASEP(c))
+#define ICHAR_NUMERICP(c)    ('0' <= (c) && (c) <= '9')
+#define ICHAR_WHITESPACEP(c) ((c) == ' ' || ('\t' <= (c) && (c) <= '\r'))
+#define ICHAR_UPPER_CASEP(c) ('A' <= (c) && (c) <= 'Z')
+#define ICHAR_LOWER_CASEP(c) ('a' <= (c) && (c) <= 'z')
+
 /*
- * SigScheme's case-insensitive comparison conforms to the foldcase'ed
- * comparison described in SRFI-75 and SRFI-13, although R5RS does not specify
- * comparison between alphabetic and non-alphabetic char.
+ * SigScheme's case-insensitive character comparison conforms to the
+ * foldcase'ed comparison described in SRFI-75 and SRFI-13, although R5RS does
+ * not define comparison between alphabetic and non-alphabetic char.
  *
  * This specification is needed to produce natural result on sort functions
  * with these case-insensitive predicates as comparator.
@@ -447,8 +486,8 @@
  *   - "Case mapping and case-folding" and "Comparison" section of SRFI-13
  */
 /* FIXME: support SRFI-75 */
-#define ICHAR_DOWNCASE(c) ((isascii((int)(c))) ? tolower((int)(c)) : (c))
-#define ICHAR_UPCASE(c)   ((isascii((int)(c))) ? toupper((int)(c)) : (c))
+#define ICHAR_DOWNCASE(c) (ICHAR_UPPER_CASEP(c) ? (c) + ('a' - 'A') : (c))
+#define ICHAR_UPCASE(c)   (ICHAR_LOWER_CASEP(c) ? (c) - ('a' - 'A') : (c))
 /* foldcase for case-insensitive character comparison is done by downcase as
  * described in SRFI-75. Although SRFI-13 expects (char-downcase (char-upcase
  * c)), this implementation is sufficient for ASCII range. */



More information about the uim-commit mailing list