Skip to content

Commit

Permalink
Accept supplementary characters
Browse files Browse the repository at this point in the history
  • Loading branch information
som-snytt committed Mar 7, 2022
1 parent 3c5dbc3 commit cf29787
Show file tree
Hide file tree
Showing 8 changed files with 193 additions and 96 deletions.
215 changes: 135 additions & 80 deletions compiler/src/dotty/tools/dotc/parsing/Scanners.scala
Original file line number Diff line number Diff line change
Expand Up @@ -696,6 +696,45 @@ object Scanners {
recur(lastOffset, false)
}

import Character.{isHighSurrogate, isLowSurrogate, isUnicodeIdentifierPart, isUnicodeIdentifierStart, isValidCodePoint, toCodePoint}

// Renders a UTF-16 code unit as a `\uXXXX` escape with four lowercase hex digits,
// i.e. the same text as f"\\u$c%04x" or f"${"\\"}u$c%04x".
private def toUnicode(c: Char): String =
  val hex = Integer.toHexString(c.toInt)
  // a Char has at most four hex digits, so keeping the last four of "0000" + hex left-pads with zeros
  "\\u" + ("0000" + hex).takeRight(4)

/** Tries to consume a supplementary character whose first code unit is `high`.
 *
 *  Returns false without touching the scanner if `high` is not a high surrogate.
 *  Otherwise the scanner is advanced with `nextChar()` and the current char is
 *  taken as the candidate low surrogate:
 *   - if it is a low surrogate, it is consumed too; when the combined code point
 *     is valid and passes `test`, both code units are put to the buffer and the
 *     result is true, otherwise an "illegal character" error is reported;
 *   - if it is not a low surrogate and `strict` is false (as used for literals),
 *     the orphan `high` alone is put to the buffer and the result is true;
 *   - if it is not a low surrogate and `strict` is true, an error is reported.
 *
 *  @param high   the char to examine, normally the scanner's current `ch`
 *  @param test   predicate applied to the combined code point
 *  @param strict whether a low surrogate is required after `high`
 *  @return true iff characters were put to the buffer
 */
private def isSupplementary(high: Char, test: Int => Boolean, strict: Boolean = true): Boolean =
  if !isHighSurrogate(high) then false
  else
    nextChar()
    val low = ch
    if isLowSurrogate(low) then
      nextChar()
      val codepoint = toCodePoint(high, low)
      val accepted = isValidCodePoint(codepoint) && test(codepoint)
      if accepted then
        putChar(high)
        putChar(low)
      else
        error(s"illegal character '${toUnicode(high)}${toUnicode(low)}'")
      accepted
    else if !strict then
      // orphan high surrogate is tolerated by non-strict callers
      putChar(high)
      true
    else
      error(s"illegal character '${toUnicode(high)}' missing low surrogate")
      false
/** Tests, without consuming anything, whether `ch` followed by the lookahead char
 *  forms a surrogate pair whose code point is valid and satisfies `f`.
 *  The lookahead is only requested when `ch` is a high surrogate.
 */
private def atSupplementary(ch: Char, f: Int => Boolean): Boolean =
  if !isHighSurrogate(ch) then false
  else
    val following = lookaheadChar()
    if !isLowSurrogate(following) then false
    else
      val codepoint = toCodePoint(ch, following)
      isValidCodePoint(codepoint) && f(codepoint)

/** read next token, filling TokenData fields of Scanner.
*/
protected final def fetchToken(): Unit = {
Expand Down Expand Up @@ -822,11 +861,12 @@ object Scanners {
else ch match {
case '{' | '[' | ' ' | '\t' if lookaheadChar() != '\'' =>
token = QUOTE
case _ if !isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape) =>
case _ if !isAtEnd && ch != SU && ch != CR && ch != LF =>
val isEmptyCharLit = (ch == '\'')
getLitChar()
if ch == '\'' then
if isEmptyCharLit then error("empty character literal (use '\\'' for single quote)")
else if litBuf.length != 1 then error("illegal codepoint in Char constant: " + litBuf.toString.map(toUnicode).mkString("'", "", "'"))
else finishCharLit()
else if isEmptyCharLit then error("empty character literal")
else error("unclosed character literal")
Expand Down Expand Up @@ -869,9 +909,11 @@ object Scanners {
def fetchOther() =
if (ch == '\u21D2') {
nextChar(); token = ARROW
report.deprecationWarning("The unicode arrow `⇒` is deprecated, use `=>` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", sourcePos(offset))
}
else if (ch == '\u2190') {
nextChar(); token = LARROW
report.deprecationWarning("The unicode arrow `←` is deprecated, use `<-` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", sourcePos(offset))
}
else if (Character.isUnicodeIdentifierStart(ch)) {
putChar(ch)
Expand All @@ -883,9 +925,10 @@ object Scanners {
nextChar()
getOperatorRest()
}
else if isSupplementary(ch, isUnicodeIdentifierStart) then
getIdentRest()
else {
// FIXME: Dotty deviation: f"" interpolator is not supported (#1814)
error("illegal character '\\u%04x'".format(ch: Int))
error(s"illegal character '${toUnicode(ch)}'")
nextChar()
}
fetchOther()
Expand Down Expand Up @@ -1024,11 +1067,12 @@ object Scanners {
case SU => // strangely enough, Character.isUnicodeIdentifierPart(SU) returns true!
finishNamed()
case _ =>
if (Character.isUnicodeIdentifierPart(ch)) {
if isUnicodeIdentifierPart(ch) then
putChar(ch)
nextChar()
getIdentRest()
}
else if isSupplementary(ch, isUnicodeIdentifierPart) then
getIdentRest()
else
finishNamed()
}
Expand Down Expand Up @@ -1111,7 +1155,7 @@ object Scanners {
}

// for interpolated strings
@annotation.tailrec private def getStringPart(multiLine: Boolean): Unit =
@tailrec private def getStringPart(multiLine: Boolean): Unit =
if (ch == '"')
if (multiLine) {
nextRawChar()
Expand All @@ -1136,6 +1180,28 @@ object Scanners {
getStringPart(multiLine)
}
else if (ch == '$') {
// Finishes the current string part as STRINGPART and scans the identifier that
// follows `$` into `next`. `hasSupplement` says the identifier starts with a
// surrogate pair, so two code units are taken before the rest is looped over.
def getInterpolatedIdentRest(hasSupplement: Boolean): Unit =
  // put the current char to the buffer and step to the next raw char
  def take(): Unit =
    putChar(ch)
    nextRawChar()
  @tailrec def loopRest(): Unit =
    if ch != SU && isUnicodeIdentifierPart(ch) then
      take()
      loopRest()
    else if atSupplementary(ch, isUnicodeIdentifierPart) then
      // both halves of the surrogate pair belong to the identifier
      take()
      take()
      loopRest()
    else
      finishNamedToken(IDENTIFIER, target = next)
  end loopRest
  setStrVal()
  token = STRINGPART
  next.lastOffset = charOffset - 1
  next.offset = charOffset - 1
  take()
  if hasSupplement then take()
  loopRest()
end getInterpolatedIdentRest

nextRawChar()
if (ch == '$' || ch == '"') {
putChar(ch)
Expand All @@ -1146,18 +1212,10 @@ object Scanners {
setStrVal()
token = STRINGPART
}
else if (Character.isUnicodeIdentifierStart(ch) || ch == '_') {
setStrVal()
token = STRINGPART
next.lastOffset = charOffset - 1
next.offset = charOffset - 1
while
putChar(ch)
nextRawChar()
ch != SU && Character.isUnicodeIdentifierPart(ch)
do ()
finishNamedToken(IDENTIFIER, target = next)
}
else if isUnicodeIdentifierStart(ch) || ch == '_' then
getInterpolatedIdentRest(hasSupplement = false)
else if atSupplementary(ch, isUnicodeIdentifierStart) then
getInterpolatedIdentRest(hasSupplement = true)
else
error("invalid string interpolation: `$$`, `$\"`, `$`ident or `$`BlockExpr expected", off = charOffset - 2)
putChar('$')
Expand Down Expand Up @@ -1205,76 +1263,73 @@ object Scanners {
false
}

/** copy current character into litBuf, interpreting any escape sequences,
* and advance to next character.
/** Copy current character into litBuf, interpreting any escape sequences,
* and advance to next character. Surrogate pairs are consumed (see check
* at fetchSingleQuote), but an orphan surrogate is allowed.
*/
protected def getLitChar(): Unit =
def invalidUnicodeEscape() = {
error("invalid character in unicode escape sequence", charOffset - 1)
putChar(ch)
}
def putUnicode(): Unit = {
while ch == 'u' || ch == 'U' do nextChar()
var i = 0
var cp = 0
while (i < 4) {
val shift = (3 - i) * 4
val d = digit2int(ch, 16)
if(d < 0) {
return invalidUnicodeEscape()
}
cp += (d << shift)
nextChar()
i += 1
}
putChar(cp.asInstanceOf[Char])
}
if (ch == '\\') {
if ch == '\\' then
nextChar()
if ('0' <= ch && ch <= '7') {
val start = charOffset - 2
val leadch: Char = ch
var oct: Int = digit2int(ch, 8)
nextChar()
if ('0' <= ch && ch <= '7') {
oct = oct * 8 + digit2int(ch, 8)
nextChar()
if (leadch <= '3' && '0' <= ch && ch <= '7') {
oct = oct * 8 + digit2int(ch, 8)
nextChar()
}
}
val alt = if oct == LF then raw"\n" else f"${"\\"}u$oct%04x"
error(s"octal escape literals are unsupported: use $alt instead", start)
putChar(oct.toChar)
}
else if (ch == 'u' || ch == 'U') {
putUnicode()
}
else {
ch match {
case 'b' => putChar('\b')
case 't' => putChar('\t')
case 'n' => putChar('\n')
case 'f' => putChar('\f')
case 'r' => putChar('\r')
case '\"' => putChar('\"')
case '\'' => putChar('\'')
case '\\' => putChar('\\')
case _ => invalidEscape()
}
nextChar()
}
}
else {
charEscape()
else if !isSupplementary(ch, _ => true, strict = false) then
putChar(ch)
nextChar()
}

protected def invalidEscape(): Unit = {
/** Handles the character after a backslash (the current `ch`).
 *  Unicode and octal escapes are delegated to `uEscape` / `octalEscape`, which
 *  do their own advancing; every other case puts a single char (or reports an
 *  invalid escape) and then advances once with `nextChar()`.
 */
private def charEscape(): Unit =
  ch match
    case 'u' | 'U' => uEscape()
    case digit if '0' <= digit && digit <= '7' => octalEscape()
    case simple =>
      simple match
        case 'b' => putChar('\b')
        case 't' => putChar('\t')
        case 'n' => putChar('\n')
        case 'f' => putChar('\f')
        case 'r' => putChar('\r')
        case '\"' => putChar('\"')
        case '\'' => putChar('\'')
        case '\\' => putChar('\\')
        case _ => invalidEscape()
      nextChar()
end charEscape

/** Reads a unicode escape starting at the `u`/`U` marker(s): skips every leading
 *  `u` or `U`, then expects exactly four hex digits and puts the resulting char.
 *  On the first non-hex char an error is reported at `charOffset - 1`, that char
 *  is put as-is (without advancing) and reading stops.
 */
private def uEscape(): Unit =
  while ch == 'u' || ch == 'U' do nextChar()
  // acc holds the digits read so far, most significant first
  @tailrec def readDigits(remaining: Int, acc: Int): Unit =
    if remaining == 0 then putChar(acc.toChar)
    else
      val digit = digit2int(ch, 16)
      if digit < 0 then
        error("invalid character in unicode escape sequence", charOffset - 1)
        putChar(ch)
      else
        nextChar()
        readDigits(remaining - 1, acc * 16 + digit)
  readDigits(4, 0)
end uEscape

/** Consumes an octal escape of one to three digits (a third digit is only taken
 *  when the first is at most '3'), reports that octal escapes are unsupported,
 *  suggesting `\n` or the `\uXXXX` form, and still puts the denoted char.
 *  The error is positioned at `charOffset - 2` as read on entry.
 */
private def octalEscape(): Unit =
  def isOctalDigit(c: Char): Boolean = '0' <= c && c <= '7'
  val start = charOffset - 2
  val leadch: Char = ch
  // append the current digit to `acc` and advance past it
  def shiftIn(acc: Int): Int =
    val widened = acc * 8 + digit2int(ch, 8)
    nextChar()
    widened
  val one = shiftIn(0)
  val oct: Int =
    if !isOctalDigit(ch) then one
    else
      val two = shiftIn(one)
      if leadch <= '3' && isOctalDigit(ch) then shiftIn(two) else two
  val alt = if oct == LF then raw"\n" else toUnicode(oct.toChar)
  error(s"octal escape literals are unsupported: use $alt instead", start)
  putChar(oct.toChar)
end octalEscape

protected def invalidEscape(): Unit =
error("invalid escape character", charOffset - 1)
putChar(ch)
}

private def getLitChars(delimiter: Char) =
while (ch != delimiter && !isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape))
Expand Down
7 changes: 5 additions & 2 deletions compiler/src/dotty/tools/dotc/transform/Pickler.scala
Original file line number Diff line number Diff line change
Expand Up @@ -140,11 +140,14 @@ class Pickler extends Phase {
}

private def testSame(unpickled: String, previous: String, cls: ClassSymbol)(using Context) =
if (previous != unpickled) {
import java.nio.charset.StandardCharsets.UTF_8
def normal(s: String) = new String(s.getBytes(UTF_8), UTF_8)
val unequal = unpickled.length() != previous.length() || normal(unpickled) != normal(previous)
if unequal then
output("before-pickling.txt", previous)
output("after-pickling.txt", unpickled)
report.error(s"""pickling difference for $cls in ${cls.source}, for details:
|
| diff before-pickling.txt after-pickling.txt""".stripMargin)
}
end testSame
}
2 changes: 1 addition & 1 deletion scaladoc/src/dotty/tools/scaladoc/util/JSON.scala
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def jsonString(s: String): JSON =

sb.append('"')
firstToBeEncoded() match
case -1 ⇒ sb.append(s)
case -1 => sb.append(s)
case first =>
// sb.append(s, 0, first) for "abc", 0, 2 produce "(abc,0,2)" rather then "ab" as in Java
sb.append(s.substring(0, first))
Expand Down
4 changes: 4 additions & 0 deletions tests/neg-custom-args/deprecation/old-syntax.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@

val f = (x: Int) ⇒ x + 1 // error

val list = for (n ← List(42)) yield n + 1 // error
4 changes: 4 additions & 0 deletions tests/neg/surrogates.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@

// Negative test: the literal below holds a supplementary character, which takes
// two UTF-16 code units and therefore does not fit in a single Char constant.
class C {
def `too wide for Char` = '𐐀' // error
}
14 changes: 7 additions & 7 deletions tests/patmat/t11620.scala
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,20 @@ object B {
}

def foo[T](b: B[T]) = b match {
case B(A1(t)) ⇒ t
case B(A2(t, _)) ⇒ t
case B(A1(t)) => t
case B(A2(t, _)) => t
}

def foo2[_A[+U] <: A[U], T](b: B.Aux[_A, T]) = b match {
case B.Aux(a @ A1(_ )) ⇒ a.t
case B.Aux(a @ A2(_, _)) ⇒ a.t1 // 👎 (false-positive): unreachable code
case B.Aux(a @ A1(_ )) => a.t
case B.Aux(a @ A2(_, _)) => a.t1 // 👎 (false-positive): unreachable code
}

def foo3[_A[+U] <: A[U], T](b: B.Aux[_A, T]) = b match {
case B.Aux(a: A1[T]) ⇒ a.t
case B.Aux(a: A2[T]) ⇒ a.t1 // 👎 (false-positive): unreachable code
case B.Aux(a: A1[T]) => a.t
case B.Aux(a: A2[T]) => a.t1 // 👎 (false-positive): unreachable code
}

def foo4[T](b: B[T]) = b match {
case B(A1(t)) ⇒ t // 👎 (false-negative): incomplete match
case B(A1(t)) => t // 👎 (false-negative): incomplete match
}
28 changes: 28 additions & 0 deletions tests/pos/surrogates.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@

// allow supplementary chars in identifiers

class 𐐀 {
def 𐐀 = 42

// regression check: anything goes in strings
def x = "𐐀"
def y = s"$𐐀"
def w = s" 𐐀"
}

case class 𐐀𐐀(n: Int) {
def 𐐀𐐀 = n
def `𐐀𐐀1` = n + n
}

// uncontroversially, orphan surrogates may be introduced
// via unicode escape.
class Construction {
def hi = '\ud801'
def lo = '\udc00'
def endhi = "abc\ud801"
def startlo = "\udc00xyz"
def reversed = "xyz\udc00\ud801abc"
}

// was: error: illegal character '\ud801', '\udc00'
Loading

0 comments on commit cf29787

Please sign in to comment.