[Mesa-dev] [PATCH 5/6 v2.] glsl/glcpp: Emit an error for any illegal GLSL character.

Thu Aug 7 15:20:38 PDT 2014

The GLSL Language Specification (version 4.30.6) is quite clear about the GLSL
character set and the expected behavior for other characters:

    Section 3.1 Character Set

    The source character set used for the OpenGL shading languages, outside of
    comments, is a subset of UTF-8. It includes the following characters:

        The letters a-z, A-Z, and the underscore ( _ ).

        The numbers 0-9.

        The symbols period (.), plus (+), dash (-), slash (/), asterisk (*),
        percent (%), angled brackets (< and >), square brackets ( [ and ] ),
        parentheses ( ( and ) ), braces ( { and } ), caret (^), vertical bar
        (|), ampersand (&), tilde (~), equals (=), exclamation point (!),
        colon (:), semicolon (;), comma (,), and question mark (?).

        The number sign (#) for preprocessor use.

        The backslash (\) as the line-continuation character when used as the
        last character of a line, just before a new line.

        White space: the space character, horizontal tab, vertical tab, form
        feed, carriage-return, and line-feed.

    A compile-time error will be given if any other character is used outside
    a comment.

By taking the set of all possible 8-bit characters, and subtracting the above,
we have the set of illegal characters:

    0x00 - 0x08 (^@ - ^H)
    0x0E - 0x1F (^N - ^Z, ^[, ^\, ^], ^^, ^_)
    0x22 (")
    0x24 ($)
    0x27 (')
    0x40 (@)
    0x60 (`)
    0x7F (DEL or ^?)
    0x80 - 0xFF (non-ASCII)

In addition, (#) is illegal outside of the uses defined by the preprocessor
(that is, when it neither starts a directive nor forms part of a legal paste
operator in a replacement list), and (\) is illegal anywhere but at the end
of a line.

So instead of the previous whitelist we had for "OTHER" characters, we now
add a blacklist for "ILLEGAL" characters based on the above, and then use a
simple regular expression of "." to catch any characters that get past the
blacklist.

This approach also means the internal-error rule with "." can no longer be
matched, so it goes away now.

v2: Instead of emitting the error as soon as the illegal character is lexed,
we now emit an ILLEGAL token to the parser. This lets the parser accept the
character as part of the replacement list of a macro (since replacement lists
are specified to allow any character). However, if such a macro is actually
instantiated, the parser will emit an error when it goes to print the illegal
character as part of the preprocessed output.
---
 src/glsl/glcpp/glcpp-lex.l   | 32 +++++++++++---------------------
 src/glsl/glcpp/glcpp-parse.y | 25 ++++++++++++++++++-------
 2 files changed, 29 insertions(+), 28 deletions(-)

diff --git a/src/glsl/glcpp/glcpp-lex.l b/src/glsl/glcpp/glcpp-lex.l
index 0dbdab0..0482c4e 100644
--- a/src/glsl/glcpp/glcpp-lex.l
+++ b/src/glsl/glcpp/glcpp-lex.l
@@ -175,15 +175,7 @@ HASH		#
 IDENTIFIER	[_a-zA-Z][_a-zA-Z0-9]*
 PP_NUMBER	[.]?[0-9]([._a-zA-Z0-9]|[eEpP][-+])*
 PUNCTUATION	[][(){}.&*~!/%<>^|;,=+-]
-
-/* The OTHER class is simply a catch-all for things that the CPP
-parser just doesn't care about. Since flex regular expressions that
-match longer strings take priority over those matching shorter
-strings, we have to be careful to avoid OTHER matching and hiding
-something that CPP does care about. So we simply exclude all
-characters that appear in any other expressions. */
-
-OTHER		[^][_#[:space:]#a-zA-Z0-9(){}.&*~!/%<>^|;,=+-]
+ILLEGAL		[\x00-\x08\x0E-\x1F"$'@`\x7F\x80-\xFF\\]
 
 DIGITS			[0-9][0-9]*
 DECIMAL_INTEGER		[1-9][0-9]*[uU]?
@@ -276,9 +268,10 @@ HEXADECIMAL_INTEGER	0[xX][0-9a-fA-F]+[uU]?
          * token. */
 	if (parser->first_non_space_token_this_line) {
 		BEGIN HASH;
+		RETURN_TOKEN_NEVER_SKIP (HASH_TOKEN);
+	} else {
+		RETURN_STRING_TOKEN (ILLEGAL);
 	}
-
-	RETURN_TOKEN_NEVER_SKIP (HASH_TOKEN);
 }
 
 <HASH>version{HSPACE}+ {
@@ -505,8 +498,8 @@ HEXADECIMAL_INTEGER	0[xX][0-9a-fA-F]+[uU]?
 	RETURN_TOKEN (yytext[0]);
 }
 
-{OTHER}+ {
-	RETURN_STRING_TOKEN (OTHER);
+{ILLEGAL} {
+	RETURN_STRING_TOKEN (ILLEGAL);
 }
 
 {HSPACE} {
@@ -539,14 +532,7 @@ HEXADECIMAL_INTEGER	0[xX][0-9a-fA-F]+[uU]?
 		RETURN_TOKEN (NEWLINE);
 }
 
-	/* This is a catch-all to avoid the annoying default flex action which
-	 * matches any character and prints it. If any input ever matches this
-	 * rule, then we have made a mistake above and need to fix one or more
-	 * of the preceding patterns to match that input. */
-
-<*>. {
-	glcpp_error(yylloc, yyextra, "Internal compiler error: Unexpected character: %s", yytext);
-
+<UNREACHABLE>. {
 	/* We don't actually use the UNREACHABLE start condition. We
 	only have this block here so that we can pretend to call some
 	generated functions, (to avoid "defined but not used"
@@ -557,6 +543,10 @@ HEXADECIMAL_INTEGER	0[xX][0-9a-fA-F]+[uU]?
 	}
 }
 
+<*>. {
+	RETURN_STRING_TOKEN (OTHER);
+}
+
 %%
 
 void
diff --git a/src/glsl/glcpp/glcpp-parse.y b/src/glsl/glcpp/glcpp-parse.y
index e2e8aca..afca990 100644
--- a/src/glsl/glcpp/glcpp-parse.y
+++ b/src/glsl/glcpp/glcpp-parse.y
@@ -171,11 +171,11 @@ add_builtin_define(glcpp_parser_t *parser, const char *name, int value);
 	/* We use HASH_TOKEN, DEFINE_TOKEN and VERSION_TOKEN (as opposed to
          * HASH, DEFINE, and VERSION) to avoid conflicts with other symbols,
          * (such as the <HASH> and <DEFINE> start conditions in the lexer). */
-%token COMMA_FINAL DEFINED ELIF_EXPANDED HASH_TOKEN DEFINE_TOKEN FUNC_IDENTIFIER OBJ_IDENTIFIER ELIF ELSE ENDIF ERROR_TOKEN IF IFDEF IFNDEF LINE PRAGMA UNDEF VERSION_TOKEN GARBAGE IDENTIFIER IF_EXPANDED INTEGER INTEGER_STRING LINE_EXPANDED NEWLINE OTHER PLACEHOLDER SPACE PLUS_PLUS MINUS_MINUS
+%token COMMA_FINAL DEFINED ELIF_EXPANDED HASH_TOKEN DEFINE_TOKEN FUNC_IDENTIFIER OBJ_IDENTIFIER ELIF ELSE ENDIF ERROR_TOKEN IF IFDEF IFNDEF LINE PRAGMA UNDEF VERSION_TOKEN GARBAGE IDENTIFIER IF_EXPANDED ILLEGAL INTEGER INTEGER_STRING LINE_EXPANDED NEWLINE OTHER PLACEHOLDER SPACE PLUS_PLUS MINUS_MINUS
 %token PASTE
 %type <ival> INTEGER operator SPACE integer_constant
 %type <expression_value> expression
-%type <str> IDENTIFIER FUNC_IDENTIFIER OBJ_IDENTIFIER INTEGER_STRING OTHER ERROR_TOKEN PRAGMA
+%type <str> IDENTIFIER FUNC_IDENTIFIER OBJ_IDENTIFIER ILLEGAL INTEGER_STRING OTHER ERROR_TOKEN PRAGMA
 %type <string_list> identifier_list
 %type <token> preprocessing_token conditional_token
 %type <token_list> pp_tokens replacement_list text_line conditional_tokens
@@ -692,6 +692,12 @@ preprocessing_token:
 		$$ = _token_create_str (parser, IDENTIFIER, $1);
 		$$->location = yylloc;
 	}
+	/* We allow illegal characters at this point, but gripe
+	 * if these ever actually get printed in the output. */
+|	ILLEGAL {
+		$$ = _token_create_str (parser, ILLEGAL, $1);
+		$$->location = yylloc;
+	}
 |	INTEGER_STRING {
 		$$ = _token_create_str (parser, INTEGER_STRING, $1);
 		$$->location = yylloc;
@@ -1118,7 +1124,7 @@ _token_list_equal_ignoring_space (token_list_t *a, token_list_t *b)
 }
 
 static void
-_token_print (char **out, size_t *len, token_t *token)
+_token_print (glcpp_parser_t *parser, char **out, size_t *len, token_t *token)
 {
 	if (token->type < 256) {
 		ralloc_asprintf_rewrite_tail (out, len, "%c", token->type);
@@ -1176,6 +1182,9 @@ _token_print (char **out, size_t *len, token_t *token)
 	case PLACEHOLDER:
 		/* Nothing to print. */
 		break;
+        case ILLEGAL:
+		glcpp_error (&token->location, parser, "Illegal character '%s'", token->value.str);
+                break;
 	default:
 		assert(!"Error: Don't know how to print token.");
 		break;
@@ -1303,9 +1312,9 @@ _token_paste (glcpp_parser_t *parser, token_t *token, token_t *other)
     FAIL:
 	glcpp_error (&token->location, parser, "");
 	ralloc_asprintf_rewrite_tail (&parser->info_log, &parser->info_log_length, "Pasting \"");
-	_token_print (&parser->info_log, &parser->info_log_length, token);
+	_token_print (parser, &parser->info_log, &parser->info_log_length, token);
 	ralloc_asprintf_rewrite_tail (&parser->info_log, &parser->info_log_length, "\" and \"");
-	_token_print (&parser->info_log, &parser->info_log_length, other);
+	_token_print (parser, &parser->info_log, &parser->info_log_length, other);
 	ralloc_asprintf_rewrite_tail (&parser->info_log, &parser->info_log_length, "\" does not give a valid preprocessing token.\n");
 
 	return token;
@@ -1319,8 +1328,10 @@ _token_list_print (glcpp_parser_t *parser, token_list_t *list)
 	if (list == NULL)
 		return;
 
-	for (node = list->head; node; node = node->next)
-		_token_print (&parser->output, &parser->output_length, node->token);
+	for (node = list->head; node; node = node->next) {
+		_token_print (parser, &parser->output,
+			      &parser->output_length, node->token);
+	}
 }
 
 void
-- 
2.0.0