Skip to content

Commit

Permalink
Accept supplementary characters
Browse files Browse the repository at this point in the history
  • Loading branch information
som-snytt committed Dec 16, 2021
1 parent 0857285 commit 87e8373
Show file tree
Hide file tree
Showing 8 changed files with 201 additions and 104 deletions.
231 changes: 143 additions & 88 deletions compiler/src/dotty/tools/dotc/parsing/Scanners.scala
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ package parsing
import core.Names._, core.Contexts._, core.Decorators._, util.Spans._
import core.StdNames._, core.Comments._
import util.SourceFile
import java.lang.Character.isDigit
import util.Chars._
import util.{SourcePosition, CharBuffer}
import util.Spans.Span
Expand Down Expand Up @@ -705,6 +704,44 @@ object Scanners {
recur(lastOffset, false)
}

import Character.{isHighSurrogate, isLowSurrogate, isUnicodeIdentifierPart, isUnicodeIdentifierStart, isValidCodePoint, toCodePoint}

/** Tries to consume a supplementary character that starts with the surrogate `high`.
 *
 *  Callers pass the current char as `high`. When it is a high surrogate the scanner
 *  steps past it before looking at the following char, so input has been consumed
 *  even on the paths that report an error and return `false`.
 *
 *  @param high   candidate high surrogate
 *  @param test   predicate the combined code point must satisfy
 *  @param strict when `true` a missing low surrogate is an error; when `false`
 *                (literals) the orphan high surrogate is put to the buffer as is
 *  @return `true` if one or two chars were put to the buffer
 */
private def isSupplementary(high: Char, test: Int => Boolean, strict: Boolean = true): Boolean =
  if !isHighSurrogate(high) then false
  else
    nextChar()
    val low = ch
    if isLowSurrogate(low) then
      // A well-formed pair: both halves are consumed whether or not the code point is accepted.
      nextChar()
      val codepoint = toCodePoint(high, low)
      val accepted = isValidCodePoint(codepoint) && test(codepoint)
      if accepted then
        putChar(high)
        putChar(low)
      else
        error(f"illegal character '\u${high.toInt}%04x\u${low.toInt}%04x'")
      accepted
    else if strict then
      error(f"illegal character '\u${high.toInt}%04x' missing low surrogate")
      false
    else
      // Lenient mode: keep the lone high surrogate.
      putChar(high)
      true
/** Tests, without consuming input, whether `ch` and the lookahead char form a
 *  surrogate pair whose code point is valid and satisfies `f`.
 *
 *  @param ch candidate high surrogate
 *  @param f  predicate applied to the combined code point
 */
private def atSupplementary(ch: Char, f: Int => Boolean): Boolean =
  if !isHighSurrogate(ch) then false
  else
    val following = lookaheadChar()
    if !isLowSurrogate(following) then false
    else
      val cp = toCodePoint(ch, following)
      isValidCodePoint(cp) && f(cp)

/** read next token, filling TokenData fields of Scanner.
*/
protected final def fetchToken(): Unit = {
Expand Down Expand Up @@ -831,11 +868,12 @@ object Scanners {
else ch match {
case '{' | '[' | ' ' | '\t' if lookaheadChar() != '\'' =>
token = QUOTE
case _ if !isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape) =>
case _ if !isAtEnd && ch != SU && ch != CR && ch != LF =>
val isEmptyCharLit = (ch == '\'')
getLitChar()
if ch == '\'' then
if isEmptyCharLit then error("empty character literal (use '\\'' for single quote)")
else if litBuf.length != 1 then error("illegal codepoint in Char constant: " + litBuf.toString.map(c => f"\u${c.toInt}%04x").mkString("'", "", "'")) // FIXME format
else finishCharLit()
else if isEmptyCharLit then error("empty character literal")
else error("unclosed character literal")
Expand Down Expand Up @@ -878,9 +916,11 @@ object Scanners {
def fetchOther() =
if (ch == '\u21D2') {
nextChar(); token = ARROW
report.deprecationWarning("The unicode arrow `⇒` is deprecated, use `=>` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", sourcePos(offset))
}
else if (ch == '\u2190') {
nextChar(); token = LARROW
report.deprecationWarning("The unicode arrow `←` is deprecated, use `<-` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", sourcePos(offset))
}
else if (Character.isUnicodeIdentifierStart(ch)) {
putChar(ch)
Expand All @@ -892,9 +932,12 @@ object Scanners {
nextChar()
getOperatorRest()
}
else if isSupplementary(ch, isUnicodeIdentifierStart) then
getIdentRest()
else {
// FIXME: Dotty deviation: f"" interpolator is not supported (#1814)
error("illegal character '\\u%04x'".format(ch: Int))
// FIXME: Dotty deviation: f"" interpolator doesn't handle char or escaped backslash
//error(f"illegal character '\\u$ch%04x'")
error(f"illegal character '\u${ch.toInt}%04x'")
nextChar()
}
fetchOther()
Expand Down Expand Up @@ -1033,11 +1076,12 @@ object Scanners {
case SU => // strangely enough, Character.isUnicodeIdentifierPart(SU) returns true!
finishNamed()
case _ =>
if (Character.isUnicodeIdentifierPart(ch)) {
if isUnicodeIdentifierPart(ch) then
putChar(ch)
nextChar()
getIdentRest()
}
else if isSupplementary(ch, isUnicodeIdentifierPart) then
getIdentRest()
else
finishNamed()
}
Expand Down Expand Up @@ -1120,7 +1164,7 @@ object Scanners {
}

// for interpolated strings
@annotation.tailrec private def getStringPart(multiLine: Boolean): Unit =
@tailrec private def getStringPart(multiLine: Boolean): Unit =
if (ch == '"')
if (multiLine) {
nextRawChar()
Expand All @@ -1145,6 +1189,28 @@ object Scanners {
getStringPart(multiLine)
}
else if (ch == '$') {
/** Scans the identifier of a `$ident` splice inside an interpolated string.
 *
 *  Closes the string part collected so far as a STRINGPART token, then reads
 *  the identifier into `next`.
 *
 *  @param hasSupplement whether the identifier starts with a surrogate pair,
 *                       in which case two chars are taken for its first character
 */
def getInterpolatedIdentRest(hasSupplement: Boolean): Unit =
  // Copies the current char to the buffer and steps to the next raw char.
  def takeRaw(): Unit =
    putChar(ch)
    nextRawChar()
  // Takes identifier-part chars (single chars or surrogate pairs) until none is left.
  def scanRest(): Unit =
    var more = true
    while more do
      if ch != SU && isUnicodeIdentifierPart(ch) then
        takeRaw()
      else if atSupplementary(ch, isUnicodeIdentifierPart) then
        takeRaw()
        takeRaw()
      else
        more = false
    finishNamedToken(IDENTIFIER, target = next)
  end scanRest
  setStrVal()
  token = STRINGPART
  next.lastOffset = charOffset - 1
  next.offset = charOffset - 1
  takeRaw()
  if hasSupplement then takeRaw()
  scanRest()
end getInterpolatedIdentRest

nextRawChar()
if (ch == '$' || ch == '"') {
putChar(ch)
Expand All @@ -1155,18 +1221,10 @@ object Scanners {
setStrVal()
token = STRINGPART
}
else if (Character.isUnicodeIdentifierStart(ch) || ch == '_') {
setStrVal()
token = STRINGPART
next.lastOffset = charOffset - 1
next.offset = charOffset - 1
while
putChar(ch)
nextRawChar()
ch != SU && Character.isUnicodeIdentifierPart(ch)
do ()
finishNamedToken(IDENTIFIER, target = next)
}
else if isUnicodeIdentifierStart(ch) || ch == '_' then
getInterpolatedIdentRest(hasSupplement = false)
else if atSupplementary(ch, isUnicodeIdentifierStart) then
getInterpolatedIdentRest(hasSupplement = true)
else
error("invalid string interpolation: `$$`, `$\"`, `$`ident or `$`BlockExpr expected")
}
Expand Down Expand Up @@ -1212,76 +1270,76 @@ object Scanners {
false
}

/** copy current character into litBuf, interpreting any escape sequences,
* and advance to next character.
/** Copy current character into litBuf, interpreting any escape sequences,
* and advance to next character. Surrogate pairs are consumed (see check
* at fetchSingleQuote), but orphan surrogate is allowed.
*/
protected def getLitChar(): Unit =
def invalidUnicodeEscape() = {
error("invalid character in unicode escape sequence", charOffset - 1)
putChar(ch)
}
def putUnicode(): Unit = {
while ch == 'u' || ch == 'U' do nextChar()
var i = 0
var cp = 0
while (i < 4) {
val shift = (3 - i) * 4
val d = digit2int(ch, 16)
if(d < 0) {
return invalidUnicodeEscape()
}
cp += (d << shift)
nextChar()
i += 1
}
putChar(cp.asInstanceOf[Char])
}
if (ch == '\\') {
if ch == '\\' then
nextChar()
if ('0' <= ch && ch <= '7') {
val start = charOffset - 2
val leadch: Char = ch
var oct: Int = digit2int(ch, 8)
nextChar()
if ('0' <= ch && ch <= '7') {
oct = oct * 8 + digit2int(ch, 8)
nextChar()
if (leadch <= '3' && '0' <= ch && ch <= '7') {
oct = oct * 8 + digit2int(ch, 8)
nextChar()
}
}
val alt = if oct == LF then raw"\n" else f"\u$oct%04x"
error(s"octal escape literals are unsupported: use $alt instead", start)
putChar(oct.toChar)
}
else if (ch == 'u' || ch == 'U') {
putUnicode()
}
else {
ch match {
case 'b' => putChar('\b')
case 't' => putChar('\t')
case 'n' => putChar('\n')
case 'f' => putChar('\f')
case 'r' => putChar('\r')
case '\"' => putChar('\"')
case '\'' => putChar('\'')
case '\\' => putChar('\\')
case _ => invalidEscape()
}
nextChar()
}
}
else {
charEscape()
else if !isSupplementary(ch, _ => true, strict = false) then
putChar(ch)
nextChar()
}

protected def invalidEscape(): Unit = {
/** Interprets the escape whose selector is the current char, i.e. the char
 *  following a backslash, and puts the denoted character to the buffer.
 *
 *  Single-letter escapes leave the selector to be skipped here; the unicode
 *  and octal helpers do their own advancing and say so by returning `false`.
 */
private def charEscape(): Unit =
  // Puts the char denoted by a single-letter escape; the selector still has to be skipped.
  def literal(c: Char): Boolean =
    putChar(c)
    true
  val advance = ch match
    case 'b'  => literal('\b')
    case 't'  => literal('\t')
    case 'n'  => literal('\n')
    case 'f'  => literal('\f')
    case 'r'  => literal('\r')
    case '\"' => literal('\"')
    case '\'' => literal('\'')
    case '\\' => literal('\\')
    case 'u' | 'U' => uEscape()
    case d if '0' <= d && d <= '7' => octalEscape()
    case _ =>
      invalidEscape()
      true
  if advance then nextChar()
end charEscape

/** Reads a unicode escape: any run of `u`/`U` followed by four hex digits,
 *  and puts the resulting char to the buffer.
 *
 *  On a non-hex char an error is reported and that char is put to the buffer
 *  instead, without being consumed.
 *
 *  @return always `false`: the caller must not advance any further
 */
private def uEscape(): Boolean =
  while ch == 'u' || ch == 'U' do nextChar()
  var cp = 0
  var remaining = 4
  var wellFormed = true
  while wellFormed && remaining > 0 do
    val digit = digit2int(ch, 16)
    if digit < 0 then
      error("invalid character in unicode escape sequence", charOffset - 1)
      putChar(ch)
      wellFormed = false
    else
      // Digits arrive most significant first: shifts are 12, 8, 4, 0.
      remaining -= 1
      cp += digit << (remaining * 4)
      nextChar()
  end while
  if wellFormed then putChar(cp.toChar)
  false
end uEscape

/** Reads an octal escape of one to three digits (a third digit only when the
 *  first is at most `3`), reports that octal escapes are unsupported together
 *  with a suggested replacement, and still puts the denoted char to the buffer.
 *
 *  @return always `false`: every digit has already been consumed
 */
private def octalEscape(): Boolean =
  def atOctalDigit: Boolean = '0' <= ch && ch <= '7'
  // Consumes the current digit and appends it to the value accumulated so far.
  def accumulate(acc: Int): Int =
    val widened = acc * 8 + digit2int(ch, 8)
    nextChar()
    widened
  // Error position; presumably the offset of the backslash - verify against the caller.
  val start = charOffset - 2
  val lead = ch
  var oct = accumulate(0)
  if atOctalDigit then
    oct = accumulate(oct)
    if lead <= '3' && atOctalDigit then oct = accumulate(oct)
  val alt = if oct == LF then raw"\n" else f"\u$oct%04x"
  error(s"octal escape literals are unsupported: use $alt instead", start)
  putChar(oct.toChar)
  false
end octalEscape

protected def invalidEscape(): Unit =
error("invalid escape character", charOffset - 1)
putChar(ch)
}

private def getLitChars(delimiter: Char) =
while (ch != delimiter && !isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape))
Expand Down Expand Up @@ -1364,25 +1422,22 @@ object Scanners {
setStrVal()
}

private def finishCharLit(): Unit = {
private def finishCharLit(): Unit =
nextChar()
token = CHARLIT
setStrVal()
}

/** Parse character literal if current character is followed by \',
* or follow with given op and return a symbol literal token
*/
def charLitOr(op: => Token): Unit = {
def charLitOr(op: => Token): Unit =
putChar(ch)
nextChar()
if (ch == '\'') finishCharLit()
else {
if ch == '\'' then finishCharLit()
else
token = op
strVal = if (name != null) name.toString else null
litBuf.clear()
}
}

override def toString: String =
showTokenDetailed(token) + {
Expand Down
7 changes: 5 additions & 2 deletions compiler/src/dotty/tools/dotc/transform/Pickler.scala
Original file line number Diff line number Diff line change
Expand Up @@ -137,11 +137,14 @@ class Pickler extends Phase {
}

private def testSame(unpickled: String, previous: String, cls: ClassSymbol)(using Context) =
if (previous != unpickled) {
import java.nio.charset.StandardCharsets.UTF_8
def normal(s: String) = new String(s.getBytes(UTF_8), UTF_8)
val unequal = unpickled.length() != previous.length() || normal(unpickled) != normal(previous)
if unequal then
output("before-pickling.txt", previous)
output("after-pickling.txt", unpickled)
report.error(s"""pickling difference for $cls in ${cls.source}, for details:
|
| diff before-pickling.txt after-pickling.txt""".stripMargin)
}
end testSame
}
2 changes: 1 addition & 1 deletion scaladoc/src/dotty/tools/scaladoc/util/JSON.scala
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def jsonString(s: String): JSON =

sb.append('"')
firstToBeEncoded() match
case -1 sb.append(s)
case -1 => sb.append(s)
case first =>
// sb.append(s, 0, first) for "abc", 0, 2 produce "(abc,0,2)" rather then "ab" as in Java
sb.append(s.substring(0, first))
Expand Down
4 changes: 4 additions & 0 deletions tests/neg-custom-args/deprecation/old-syntax.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@

// Function literal written with the deprecated unicode arrow `⇒` instead of `=>`.
val f = (x: Int) ⇒ x + 1 // error

// Generator written with the deprecated unicode arrow `←` instead of `<-`.
val list = for (n ← List(42)) yield n + 1 // error
4 changes: 4 additions & 0 deletions tests/neg/surrogates.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@

// Negative test: the quoted character is written as a single symbol but is
// expected to be rejected as a Char literal (see the `// error` marker).
class C {
// NOTE(review): presumably the literal needs a surrogate pair and so cannot fit one Char - confirm.
def `too wide for Char` = '𐐀' // error
}
Loading

0 comments on commit 87e8373

Please sign in to comment.