Fix mixed encoding in heredoc

Heredocs are parsed line-by-line, so we need to keep track of the temporary encoding of the string. Previously, a heredoc would only detect mixed encoding errors if they were on the same line, this changes things so they will be caught on different lines. Fixes [Bug #15839]
2019-05-14 22:04:46 -07:00 · 2019-05-14 22:04:46 -07:00 · c05eaa9325
--- a/parse.y
+++ b/parse.y
@ -5233,7 +5233,7 @@ none		: /* none */
 # define yylval  (*p->lval)

 static int regx_options(struct parser_params*);
-static int tokadd_string(struct parser_params*,int,int,int,long*,rb_encoding**);
+static int tokadd_string(struct parser_params*,int,int,int,long*,rb_encoding**,rb_encoding**);
 static void tokaddmbc(struct parser_params *p, int c, rb_encoding *enc);
 static enum yytokentype parse_string(struct parser_params*,rb_strterm_literal_t*);
 static enum yytokentype here_document(struct parser_params*,rb_strterm_heredoc_t*);
@ -6510,10 +6510,9 @@ parser_mixed_escape(struct parser_params *p, const char *beg, rb_encoding *enc1,
 static int
 tokadd_string(struct parser_params *p,
 	      int func, int term, int paren, long *nest,
-	      rb_encoding **encp)
+	      rb_encoding **encp, rb_encoding **enc)
 {
    int c;
-    rb_encoding *enc = 0;
    bool erred = false;

 #define mixed_error(enc1, enc2) \
@ -6569,7 +6568,7 @@ tokadd_string(struct parser_params *p,
 		    tokadd(p, '\\');
 		    break;
 		}
-		if (!parser_tokadd_utf8(p, &enc, term,
+		if (!parser_tokadd_utf8(p, enc, term,
 					func & STR_FUNC_SYMBOL,
 					func & STR_FUNC_REGEXP)) {
 		    continue;
@ -6588,17 +6587,17 @@ tokadd_string(struct parser_params *p,
 			continue;
 		    }
 		    pushback(p, c);
-		    if ((c = tokadd_escape(p, &enc)) < 0)
+		    if ((c = tokadd_escape(p, enc)) < 0)
 			return -1;
-		    if (enc && enc != *encp) {
-			mixed_escape(p->lex.ptok+2, enc, *encp);
+		    if (*enc && *enc != *encp) {
+			mixed_escape(p->lex.ptok+2, *enc, *encp);
 		    }
 		    continue;
 		}
 		else if (func & STR_FUNC_EXPAND) {
 		    pushback(p, c);
 		    if (func & STR_FUNC_ESCAPE) tokadd(p, '\\');
-		    c = read_escape(p, 0, &enc);
+		    c = read_escape(p, 0, enc);
 		}
 		else if ((func & STR_FUNC_QWORDS) && ISSPACE(c)) {
 		    /* ignore backslashed spaces in %w */
@ -6612,11 +6611,11 @@ tokadd_string(struct parser_params *p,
 	}
 	else if (!parser_isascii(p)) {
 	  non_ascii:
-	    if (!enc) {
-		enc = *encp;
+	    if (!*enc) {
+		*enc = *encp;
 	    }
-	    else if (enc != *encp) {
-		mixed_error(enc, *encp);
+	    else if (*enc != *encp) {
+		mixed_error(*enc, *encp);
 		continue;
 	    }
 	    if (tokadd_mbchar(p, c) == -1) return -1;
@ -6627,18 +6626,18 @@ tokadd_string(struct parser_params *p,
 	    break;
 	}
        if (c & 0x80) {
-	    if (!enc) {
-		enc = *encp;
+	    if (!*enc) {
+		*enc = *encp;
 	    }
-	    else if (enc != *encp) {
-		mixed_error(enc, *encp);
+	    else if (*enc != *encp) {
+		mixed_error(*enc, *encp);
 		continue;
 	    }
        }
 	tokadd(p, c);
    }
  terminate:
-    if (enc) *encp = enc;
+    if (*enc) *encp = *enc;
    return c;
 }

@ -6774,6 +6773,7 @@ parse_string(struct parser_params *p, rb_strterm_literal_t *quote)
    int paren = (int)quote->u2.paren;
    int c, space = 0;
    rb_encoding *enc = p->enc;
+    rb_encoding *base_enc = 0;
    VALUE lit;

    if (func & STR_FUNC_TERM) {
@ -6814,7 +6814,7 @@ parse_string(struct parser_params *p, rb_strterm_literal_t *quote)
    }
    pushback(p, c);
    if (tokadd_string(p, func, term, paren, &quote->u0.nest,
-		      &enc) == -1) {
+		      &enc, &base_enc) == -1) {
 	if (p->eofp) {
 #ifndef RIPPER
 # define unterminated_literal(mesg) yyerror0(mesg)
@ -7171,6 +7171,7 @@ here_document(struct parser_params *p, rb_strterm_heredoc_t *here)
    long len;
    VALUE str = 0;
    rb_encoding *enc = p->enc;
+    rb_encoding *base_enc = 0;
    int bol;

    eos = RSTRING_PTR(here->lastline) + here->offset;
@ -7282,7 +7283,8 @@ here_document(struct parser_params *p, rb_strterm_heredoc_t *here)
 	}
 	do {
 	    pushback(p, c);
-	    if ((c = tokadd_string(p, func, '\n', 0, NULL, &enc)) == -1) {
+	    enc = p->enc;
+	    if ((c = tokadd_string(p, func, '\n', 0, NULL, &enc, &base_enc)) == -1) {
 		if (p->eofp) goto error;
 		goto restore;
 	    }
--- a/test/ruby/test_syntax.rb
+++ b/test/ruby/test_syntax.rb
@ -774,6 +774,35 @@ eom
    assert_equal("\n0\n1", eval("<<~0 '1'\n \n0\#{}\n0"))
  end

+  def test_heredoc_mixed_encoding
+    assert_syntax_error(<<-'HEREDOC', 'UTF-8 mixed within Windows-31J source')
+      #encoding: cp932
+      <<-TEXT
+      \xe9\x9d\u1234
+      TEXT
+    HEREDOC
+    assert_syntax_error(<<-'HEREDOC', 'UTF-8 mixed within Windows-31J source')
+      #encoding: cp932
+      <<-TEXT
+      \xe9\x9d
+      \u1234
+      TEXT
+    HEREDOC
+    assert_syntax_error(<<-'HEREDOC', 'UTF-8 mixed within Windows-31J source')
+      #encoding: cp932
+      <<-TEXT
+      \u1234\xe9\x9d
+      TEXT
+    HEREDOC
+    assert_syntax_error(<<-'HEREDOC', 'UTF-8 mixed within Windows-31J source')
+      #encoding: cp932
+      <<-TEXT
+      \u1234
+      \xe9\x9d
+      TEXT
+    HEREDOC
+  end
+
  def test_lineno_operation_brace_block
    expected = __LINE__ + 1
    actual = caller_lineno\