Accept supplementary characters

scala · Dec 16, 2021 · 87e8373 · 87e8373
1 parent 0857285
commit 87e8373
Show file tree

Hide file tree

Showing 8 changed files with 201 additions and 104 deletions.
diff --git a/compiler/src/dotty/tools/dotc/parsing/Scanners.scala b/compiler/src/dotty/tools/dotc/parsing/Scanners.scala
@@ -5,7 +5,6 @@ package parsing
 import core.Names._, core.Contexts._, core.Decorators._, util.Spans._
 import core.StdNames._, core.Comments._
 import util.SourceFile
-import java.lang.Character.isDigit
 import util.Chars._
 import util.{SourcePosition, CharBuffer}
 import util.Spans.Span
@@ -705,6 +704,44 @@ object Scanners {
       recur(lastOffset, false)
     }
 
+    import Character.{isHighSurrogate, isLowSurrogate, isUnicodeIdentifierPart, isUnicodeIdentifierStart, isValidCodePoint, toCodePoint}
+
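+    // True if the given char (ch) is a high surrogate followed by a low surrogate and the
+    // resulting code point passes the predicate; the surrogate chars were then put to the buffer.
+    // With strict = false (string and char literals) a lone high surrogate is accepted too;
+    // with strict = true a missing low surrogate is reported as an error.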
+    // Input is consumed only when the result is true. The low surrogate is inspected with
+    // lookaheadChar(), so that on a false result `ch` is still the high surrogate and the
+    // caller's fallback branch reports/skips the right character instead of the one after the pair.
+    private def isSupplementary(high: Char, test: Int => Boolean, strict: Boolean = true): Boolean =
+      isHighSurrogate(high) && {
+        var res = false
+        // peek, do not advance: we do not yet know whether the pair will be accepted
+        val low = lookaheadChar()
+        if isLowSurrogate(low) then
+          val codepoint = toCodePoint(high, low)
+          if isValidCodePoint(codepoint) then
+            // a well-formed pair rejected by `test` is left untouched for the caller to handle
+            if test(codepoint) then
+              putChar(high)
+              putChar(low)
+              nextChar()
+              nextChar()
+              res = true
+          else
+            error(f"illegal character '\u${high.toInt}%04x\u${low.toInt}%04x'")
+            //error(f"illegal character '\\u$high%04x\\u$low%04x'")
+        else if !strict then
+          // lenient mode: keep the orphan high surrogate as an ordinary char
+          putChar(high)
+          nextChar()
+          res = true
+        else
+          error(f"illegal character '\u${high.toInt}%04x' missing low surrogate")
+          //error(f"illegal character '\\u$high%04x' missing low surrogate")
+        res
+      }
+    // Non-consuming check: is `ch` a high surrogate whose lookahead char is a low surrogate,
+    // such that the pair forms a valid code point accepted by `f`?
+    private def atSupplementary(ch: Char, f: Int => Boolean): Boolean =
+      if !isHighSurrogate(ch) then false
+      else
+        // only peek at the following char once we know `ch` starts a surrogate pair
+        val following = lookaheadChar()
+        if !isLowSurrogate(following) then false
+        else
+          val cp = toCodePoint(ch, following)
+          isValidCodePoint(cp) && f(cp)
+
     /** read next token, filling TokenData fields of Scanner.
      */
     protected final def fetchToken(): Unit = {
@@ -831,11 +868,12 @@ object Scanners {
             else ch match {
               case '{' | '[' | ' ' | '\t' if lookaheadChar() != '\'' =>
                 token = QUOTE
-              case _ if !isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape) =>
+              case _ if !isAtEnd && ch != SU && ch != CR && ch != LF =>
                 val isEmptyCharLit = (ch == '\'')
                 getLitChar()
                 if ch == '\'' then
                   if isEmptyCharLit then error("empty character literal (use '\\'' for single quote)")
+                  else if litBuf.length != 1 then error("illegal codepoint in Char constant: " + litBuf.toString.map(c => f"\u${c.toInt}%04x").mkString("'", "", "'"))  // FIXME format
                   else finishCharLit()
                 else if isEmptyCharLit then error("empty character literal")
                 else error("unclosed character literal")
@@ -878,9 +916,11 @@ object Scanners {
           def fetchOther() =
             if (ch == '\u21D2') {
               nextChar(); token = ARROW
+              report.deprecationWarning("The unicode arrow `⇒` is deprecated, use `=>` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", sourcePos(offset))
             }
             else if (ch == '\u2190') {
               nextChar(); token = LARROW
+              report.deprecationWarning("The unicode arrow `←` is deprecated, use `<-` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", sourcePos(offset))
             }
             else if (Character.isUnicodeIdentifierStart(ch)) {
               putChar(ch)
@@ -892,9 +932,12 @@ object Scanners {
               nextChar()
               getOperatorRest()
             }
+            else if isSupplementary(ch, isUnicodeIdentifierStart) then
+              getIdentRest()
             else {
-              // FIXME: Dotty deviation: f"" interpolator is not supported (#1814)
-              error("illegal character '\\u%04x'".format(ch: Int))
+              // FIXME: Dotty deviation: f"" interpolator doesn't handle char or escaped backslash
+              //error(f"illegal character '\\u$ch%04x'")
+              error(f"illegal character '\u${ch.toInt}%04x'")
               nextChar()
             }
           fetchOther()
@@ -1033,11 +1076,12 @@ object Scanners {
       case SU => // strangely enough, Character.isUnicodeIdentifierPart(SU) returns true!
         finishNamed()
       case _ =>
-        if (Character.isUnicodeIdentifierPart(ch)) {
+        if isUnicodeIdentifierPart(ch) then
           putChar(ch)
           nextChar()
           getIdentRest()
-        }
+        else if isSupplementary(ch, isUnicodeIdentifierPart) then
+          getIdentRest()
         else
           finishNamed()
     }
@@ -1120,7 +1164,7 @@ object Scanners {
       }
 
     // for interpolated strings
-    @annotation.tailrec private def getStringPart(multiLine: Boolean): Unit =
+    @tailrec private def getStringPart(multiLine: Boolean): Unit =
       if (ch == '"')
         if (multiLine) {
           nextRawChar()
@@ -1145,6 +1189,28 @@ object Scanners {
         getStringPart(multiLine)
       }
       else if (ch == '$') {
+        // Scan the identifier of a `$ident` splice. The string text collected so far becomes a
+        // STRINGPART token and the identifier is delivered as the `next` token.
+        // `hasSupplement` says the identifier starts with a surrogate pair (two chars to copy).
+        def getInterpolatedIdentRest(hasSupplement: Boolean): Unit =
+          // copy the current raw char into the buffer and advance
+          def take(): Unit =
+            putChar(ch)
+            nextRawChar()
+          // number of chars forming the identifier part at `ch`: 1 (BMP), 2 (surrogate pair), 0 (none)
+          def partWidth: Int =
+            if ch != SU && isUnicodeIdentifierPart(ch) then 1
+            else if atSupplementary(ch, isUnicodeIdentifierPart) then 2
+            else 0
+          setStrVal()
+          token = STRINGPART
+          val identStart = charOffset - 1
+          next.lastOffset = identStart
+          next.offset = identStart
+          take()
+          if hasSupplement then take()
+          var width = partWidth
+          while width > 0 do
+            take()
+            if width == 2 then take()
+            width = partWidth
+          finishNamedToken(IDENTIFIER, target = next)
+        end getInterpolatedIdentRest
+
         nextRawChar()
         if (ch == '$' || ch == '"') {
           putChar(ch)
@@ -1155,18 +1221,10 @@ object Scanners {
           setStrVal()
           token = STRINGPART
         }
-        else if (Character.isUnicodeIdentifierStart(ch) || ch == '_') {
-          setStrVal()
-          token = STRINGPART
-          next.lastOffset = charOffset - 1
-          next.offset = charOffset - 1
-          while
-            putChar(ch)
-            nextRawChar()
-            ch != SU && Character.isUnicodeIdentifierPart(ch)
-          do ()
-          finishNamedToken(IDENTIFIER, target = next)
-        }
+        else if isUnicodeIdentifierStart(ch) || ch == '_' then
+          getInterpolatedIdentRest(hasSupplement = false)
+        else if atSupplementary(ch, isUnicodeIdentifierStart) then
+          getInterpolatedIdentRest(hasSupplement = true)
         else
           error("invalid string interpolation: `$$`, `$\"`, `$`ident or `$`BlockExpr expected")
       }
@@ -1212,76 +1270,76 @@ object Scanners {
         false
       }
 
-    /** copy current character into litBuf, interpreting any escape sequences,
-     *  and advance to next character.
+    /** Copy current character into litBuf, interpreting any escape sequences,
+     *  and advance to next character. Surrogate pairs are consumed (see check
+     *  at fetchSingleQuote), but an orphan surrogate is allowed.
      */
     protected def getLitChar(): Unit =
-      def invalidUnicodeEscape() = {
-        error("invalid character in unicode escape sequence", charOffset - 1)
-        putChar(ch)
-      }
-      def putUnicode(): Unit = {
-        while ch == 'u' || ch == 'U' do nextChar()
-        var i = 0
-        var cp = 0
-        while (i < 4) {
-          val shift = (3 - i) * 4
-          val d = digit2int(ch, 16)
-          if(d < 0) {
-            return invalidUnicodeEscape()
-          }
-          cp += (d << shift)
-          nextChar()
-          i += 1
-        }
-        putChar(cp.asInstanceOf[Char])
-      }
-      if (ch == '\\') {
+      if ch == '\\' then
         nextChar()
-        if ('0' <= ch && ch <= '7') {
-          val start = charOffset - 2
-          val leadch: Char = ch
-          var oct: Int = digit2int(ch, 8)
-          nextChar()
-          if ('0' <= ch && ch <= '7') {
-            oct = oct * 8 + digit2int(ch, 8)
-            nextChar()
-            if (leadch <= '3' && '0' <= ch && ch <= '7') {
-              oct = oct * 8 + digit2int(ch, 8)
-              nextChar()
-            }
-          }
-          val alt = if oct == LF then raw"\n" else f"\u$oct%04x"
-          error(s"octal escape literals are unsupported: use $alt instead", start)
-          putChar(oct.toChar)
-        }
-        else if (ch == 'u' || ch == 'U') {
-          putUnicode()
-        }
-        else {
-          ch match {
-            case 'b'  => putChar('\b')
-            case 't'  => putChar('\t')
-            case 'n'  => putChar('\n')
-            case 'f'  => putChar('\f')
-            case 'r'  => putChar('\r')
-            case '\"' => putChar('\"')
-            case '\'' => putChar('\'')
-            case '\\' => putChar('\\')
-            case _    => invalidEscape()
-          }
-          nextChar()
-        }
-      }
-      else {
+        charEscape()
+      else if !isSupplementary(ch, _ => true, strict = false) then
         putChar(ch)
         nextChar()
-      }
 
-    protected def invalidEscape(): Unit = {
+    // Handle the char following a backslash: put the escaped character into the buffer.
+    // The unicode and octal helpers report whether the scanner still has to advance past `ch`.
+    private def charEscape(): Unit =
+      // put a simple one-char escape; the escape letter itself still has to be skipped
+      def simple(c: Char): Boolean =
+        putChar(c)
+        true
+      val advance = ch match
+        case 'b'  => simple('\b')
+        case 't'  => simple('\t')
+        case 'n'  => simple('\n')
+        case 'f'  => simple('\f')
+        case 'r'  => simple('\r')
+        case '\"' => simple('\"')
+        case '\'' => simple('\'')
+        case '\\' => simple('\\')
+        case 'u' | 'U' => uEscape()
+        case digit if '0' <= digit && digit <= '7' => octalEscape()
+        case _ =>
+          invalidEscape()
+          true
+      if advance then nextChar()
+    end charEscape
+
+    // Read a unicode escape: any run of 'u'/'U' followed by exactly four hex digits, and put
+    // the resulting char. On a non-hex char an error is reported and that char is put instead.
+    // Always returns false: the caller must not advance any further.
+    private def uEscape(): Boolean =
+      while ch == 'u' || ch == 'U' do nextChar()
+      var remaining = 4
+      var codeUnit = 0
+      var valid = true
+      while valid && remaining > 0 do
+        val hex = digit2int(ch, 16)
+        if hex < 0 then
+          error("invalid character in unicode escape sequence", charOffset - 1)
+          putChar(ch)
+          valid = false
+        else
+          remaining -= 1
+          // most significant digit first: shifts are 12, 8, 4, 0
+          codeUnit += hex << (remaining * 4)
+          nextChar()
+      end while
+      if valid then putChar(codeUnit.toChar)
+      false
+    end uEscape
+
+    // Consume an octal escape (one to three octal digits, `ch` being the first digit).
+    // Octal escapes are always reported as an error, with a suggested replacement, but the
+    // denoted char is still put into the buffer.
+    // Always returns false: all digits were consumed here, so the caller must not advance again.
+    private def octalEscape(): Boolean =
+      // error position; presumably the offset of the backslash -- TODO confirm
+      val start = charOffset - 2
+      val leadch: Char = ch
+      var oct: Int = digit2int(ch, 8)
+      nextChar()
+      if '0' <= ch && ch <= '7' then
+        oct = oct * 8 + digit2int(ch, 8)
+        nextChar()
+        // a third digit is only taken when the leading digit is at most '3'
+        if leadch <= '3' && '0' <= ch && ch <= '7' then
+          oct = oct * 8 + digit2int(ch, 8)
+          nextChar()
+      //val alt = if (oct == LF) "\\n" else f"\\u$oct%04x"
+      val alt = if oct == LF then raw"\n" else f"\u$oct%04x"
+      error(s"octal escape literals are unsupported: use $alt instead", start)
+      putChar(oct.toChar)
+      false
+    end octalEscape
+
+    protected def invalidEscape(): Unit =
       error("invalid escape character", charOffset - 1)
       putChar(ch)
-    }
 
     private def getLitChars(delimiter: Char) =
       while (ch != delimiter && !isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape))
@@ -1364,25 +1422,22 @@ object Scanners {
       setStrVal()
     }
 
-    private def finishCharLit(): Unit = {
+    private def finishCharLit(): Unit =
       nextChar()
       token = CHARLIT
       setStrVal()
-    }
 
     /** Parse character literal if current character is followed by \',
      *  or follow with given op and return a symbol literal token
      */
-    def charLitOr(op: => Token): Unit = {
+    def charLitOr(op: => Token): Unit =
       putChar(ch)
       nextChar()
-      if (ch == '\'') finishCharLit()
-      else {
+      if ch == '\'' then finishCharLit()
+      else
         token = op
         strVal = if (name != null) name.toString else null
         litBuf.clear()
-      }
-    }
 
     override def toString: String =
       showTokenDetailed(token) + {

diff --git a/compiler/src/dotty/tools/dotc/transform/Pickler.scala b/compiler/src/dotty/tools/dotc/transform/Pickler.scala
@@ -137,11 +137,14 @@ class Pickler extends Phase {
   }
 
   private def testSame(unpickled: String, previous: String, cls: ClassSymbol)(using Context) =
-    if (previous != unpickled) {
+    import java.nio.charset.StandardCharsets.UTF_8
+    def normal(s: String) = new String(s.getBytes(UTF_8), UTF_8)
+    val unequal = unpickled.length() != previous.length() || normal(unpickled) != normal(previous)
+    if unequal then
       output("before-pickling.txt", previous)
       output("after-pickling.txt", unpickled)
       report.error(s"""pickling difference for $cls in ${cls.source}, for details:
                    |
                    |  diff before-pickling.txt after-pickling.txt""".stripMargin)
-    }
+  end testSame
 }
diff --git a/scaladoc/src/dotty/tools/scaladoc/util/JSON.scala b/scaladoc/src/dotty/tools/scaladoc/util/JSON.scala
@@ -31,7 +31,7 @@ def jsonString(s: String): JSON =
 
     sb.append('"')
     firstToBeEncoded() match
-      case -1 ⇒ sb.append(s)
+      case -1 => sb.append(s)
       case first =>
         // sb.append(s, 0, first) for "abc", 0, 2 produce "(abc,0,2)" rather than "ab" as in Java
         sb.append(s.substring(0, first))

diff --git a/tests/neg-custom-args/deprecation/old-syntax.scala b/tests/neg-custom-args/deprecation/old-syntax.scala
@@ -0,0 +1,4 @@
+
+val f = (x: Int) ⇒ x + 1  // error
+
+val list = for (n ← List(42)) yield n + 1  // error
diff --git a/tests/neg/surrogates.scala b/tests/neg/surrogates.scala
@@ -0,0 +1,4 @@
+
+class C {
+  def `too wide for Char` = '𐐀' // error
+}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,4 @@

		val f = (x: Int) ⇒ x + 1 // error

		val list = for (n ← List(42)) yield n + 1 // error