Skip to content

Commit

Permalink
feat: Unicode strings (--unicode-char) (#3016)
Browse files Browse the repository at this point in the history
Implementation of the design from
dafny-lang/rfcs#13.

Resolves #413. Fixes #2928. Fixes #818. Fixes #3058. Fixes #1293. Fixes
#3001.

Depends on #2976 to be fixed for the tests to pass consistently across
platforms.

I've documented some of the less obvious compilation strategy decisions
in `docs/Compilation/StringsAndChars.md`.

<small>By submitting this pull request, I confirm that my contribution
is made under the terms of the [MIT
license](https://github.com/dafny-lang/dafny/blob/master/LICENSE.txt).</small>
  • Loading branch information
robin-aws authored Nov 24, 2022
1 parent 312147b commit f65c111
Show file tree
Hide file tree
Showing 71 changed files with 2,212 additions and 291 deletions.
1 change: 1 addition & 0 deletions Source/DafnyCore/AST/Types.cs
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,7 @@ public Type UseInternalSynonym() {
public bool IsRealType { get { return NormalizeExpand() is RealType; } }
public bool IsBigOrdinalType { get { return NormalizeExpand() is BigOrdinalType; } }
public bool IsBitVectorType { get { return AsBitVectorType != null; } }
public bool IsStringType { get { return AsSeqType?.Arg.IsCharType == true; } }
public BitvectorType AsBitVectorType { get { return NormalizeExpand() as BitvectorType; } }
public bool IsNumericBased() {
var t = NormalizeExpand();
Expand Down
13 changes: 12 additions & 1 deletion Source/DafnyCore/Coco/Scanner.frame
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ public class Buffer {
// b) part of stream in buffer
// 2) non seekable stream (network, console)

public const int EOF = 65535 + 1; // char.MaxValue + 1;
public const int EOF = 0x11_0000; // Maximum Unicode codepoint + 1;
const int MIN_BUFFER_LENGTH = 1024; // 1KB
const int MAX_BUFFER_LENGTH = MIN_BUFFER_LENGTH * 64; // 64KB
byte[]/*!*/ buf; // input buffer
Expand Down Expand Up @@ -205,10 +205,16 @@ public class Buffer {
// UTF8Buffer
//-----------------------------------------------------------------------------------
public class UTF8Buffer: Buffer {
private int pendingLowSurrogate = -1;
public UTF8Buffer(Buffer/*!*/ b): base(b) {Contract.Requires(b != null);}

public override int Read() {
int ch;
if (pendingLowSurrogate != -1) {
ch = pendingLowSurrogate;
pendingLowSurrogate = -1;
return ch;
}
do {
ch = base.Read();
// until we find a utf8 start (0xxxxxxx or 11xxxxxx)
Expand All @@ -235,6 +241,11 @@ public class UTF8Buffer: Buffer {
int c2 = ch & 0x3F;
ch = (c1 << 6) | c2;
}
if (0x10000 <= ch && ch < 0x11_0000) {
ch = ch - 0x10000;
pendingLowSurrogate = 0xDC00 + (ch & 0x3FF);
ch = 0xD800 + ((ch >> 10) & 0x3FF);
}
return ch;
}
}
Expand Down
91 changes: 70 additions & 21 deletions Source/DafnyCore/Compilers/Compiler-Csharp.cs
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ protected override ConcreteSyntaxTree CreateStaticMain(IClassWriter cw, string a
var wr = ((ClassWriter)cw).StaticMemberWriter;
// See EmitCallToMain() - this is named differently because otherwise C# tries
// to resolve the reference to the instance-level Main method
return wr.NewBlock($"public static void _StaticMain(Dafny.ISequence<Dafny.ISequence<char>> {argsParameterName})");
return wr.NewBlock($"public static void _StaticMain(Dafny.ISequence<Dafny.ISequence<{CharTypeName}>> {argsParameterName})");
}

protected override ConcreteSyntaxTree CreateModule(string moduleName, bool isDefault, bool isExtern, string /*?*/ libraryName, ConcreteSyntaxTree wr) {
Expand Down Expand Up @@ -981,7 +981,12 @@ void DatatypeFieldsAndConstructor(DatatypeCtor ctor, int constructorIndex, Concr
w.WriteLine($"{tempVar} += \", \";");
}

w.WriteLine($"{tempVar} += {DafnyHelpersClass}.ToString(this.{FieldName(arg, i)});");
if (arg.Type.IsStringType && UnicodeCharEnabled) {
w.WriteLine($"{tempVar} += this.{FieldName(arg, i)}.ToVerbatimString(true);");
} else {
w.WriteLine($"{tempVar} += {DafnyHelpersClass}.ToString(this.{FieldName(arg, i)});");
}

i++;
}
}
Expand Down Expand Up @@ -1388,7 +1393,7 @@ internal override string TypeName(Type type, ConcreteSyntaxTree wr, IToken tok,
if (xType is BoolType) {
return "bool";
} else if (xType is CharType) {
return "char";
return CharTypeName;
} else if (xType is IntType || xType is BigOrdinalType) {
return "BigInteger";
} else if (xType is RealType) {
Expand Down Expand Up @@ -1441,6 +1446,20 @@ internal override string TypeName(Type type, ConcreteSyntaxTree wr, IToken tok,
}
}

// To improve readability
private static bool CharIsRune => UnicodeCharEnabled;
private static string CharTypeName => UnicodeCharEnabled ? "Dafny.Rune" : "char";

private void ConvertFromChar(Expression e, ConcreteSyntaxTree wr, bool inLetExprBody, ConcreteSyntaxTree wStmts) {
if (e.Type.IsCharType && UnicodeCharEnabled) {
wr.Write("(");
TrParenExpr(e, wr, inLetExprBody, wStmts);
wr.Write(".Value)");
} else {
TrParenExpr(e, wr, inLetExprBody, wStmts);
}
}

public string TypeHelperName(Type type, ConcreteSyntaxTree wr, IToken tok, Type/*?*/ otherType = null) {
var xType = type.NormalizeExpand();
if (xType is SeqType seqType) {
Expand Down Expand Up @@ -1809,7 +1828,8 @@ protected override string GenerateLhsDecl(string target, Type/*?*/ type, Concret
protected override void EmitPrintStmt(ConcreteSyntaxTree wr, Expression arg) {
var wStmts = wr.Fork();
var typeArgs = arg.Type.AsArrowType == null ? "" : $"<{TypeName(arg.Type, wr, null, null)}>";
wr.WriteLine($"{DafnyHelpersClass}.Print{typeArgs}({Expr(arg, false, wStmts)});");
var suffix = arg.Type.IsStringType && UnicodeCharEnabled ? ".ToVerbatimString(false)" : "";
wr.WriteLine($"{DafnyHelpersClass}.Print{typeArgs}(({Expr(arg, false, wStmts)}){suffix});");
}

protected override void EmitReturn(List<Formal> outParams, ConcreteSyntaxTree wr) {
Expand Down Expand Up @@ -1856,6 +1876,10 @@ protected override void EmitHalt(IToken tok, Expression/*?*/ messageExpr, Concre
}

TrExpr(messageExpr, wr, false, wStmts);
if (UnicodeCharEnabled && messageExpr.Type.IsStringType) {
wr.Write(".ToVerbatimString(false)");
}

wr.WriteLine(");");
}

Expand Down Expand Up @@ -2044,9 +2068,21 @@ protected override void EmitLiteralExpr(ConcreteSyntaxTree wr, LiteralExpr e) {
} else if (e.Value is bool) {
wr.Write((bool)e.Value ? "true" : "false");
} else if (e is CharLiteralExpr) {
wr.Write("'{0}'", (string)e.Value);
var v = (string)e.Value;
if (UnicodeCharEnabled) {
var codePoint = Util.UnescapedCharacters(v, false).Single();
if (codePoint > char.MaxValue) {
// C# supports \U, but doesn't allow values that require two UTF-16 code units in character literals.
// For such values we construct the Rune value directly from the unescaped codepoint.
wr.Write($"new Dafny.Rune(0x{codePoint:x})");
} else {
wr.Write($"new Dafny.Rune('{Util.ExpandUnicodeEscapes(v, false)}')");
}
} else {
wr.Write($"'{v}'");
}
} else if (e is StringLiteralExpr str) {
wr.Format($"{DafnySeqClass}<char>.FromString({StringLiteral(str)})");
wr.Format($"{DafnySeqClass}<{CharTypeName}>.{CharMethodQualifier}FromString({StringLiteral(str)})");
} else if (AsNativeType(e.Type) != null) {
string nativeName = null, literalSuffix = null;
bool needsCastAfterArithmetic = false;
Expand Down Expand Up @@ -2093,7 +2129,7 @@ void EmitIntegerLiteral(BigInteger i, ConcreteSyntaxTree wr) {
}

protected override void EmitStringLiteral(string str, bool isVerbatim, ConcreteSyntaxTree wr) {
wr.Write($"{(isVerbatim ? "@" : "")}\"{str}\"");
wr.Write($"{(isVerbatim ? "@" : "")}\"{Util.ExpandUnicodeEscapes(str, false)}\"");
}

protected override ConcreteSyntaxTree EmitBitvectorTruncation(BitvectorType bvType, bool surroundByUnchecked, ConcreteSyntaxTree wr) {
Expand Down Expand Up @@ -2856,17 +2892,29 @@ protected override void CompileBinOp(BinaryExpr.ResolvedOpcode op,
case BinaryExpr.ResolvedOpcode.RightShift:
opString = ">>"; convertE1_to_int = true; break;
case BinaryExpr.ResolvedOpcode.Add:
opString = "+"; truncateResult = true;
if (resultType.IsCharType) {
preOpString = "(char)(";
postOpString = ")";
if (CharIsRune) {
staticCallString = $"{DafnyHelpersClass}.AddRunes";
} else {
opString = "+"; truncateResult = true;
preOpString = $"(char)(";
postOpString = ")";
}
} else {
opString = "+"; truncateResult = true;
}
break;
case BinaryExpr.ResolvedOpcode.Sub:
opString = "-"; truncateResult = true;
if (resultType.IsCharType) {
preOpString = "(char)(";
postOpString = ")";
if (CharIsRune) {
staticCallString = $"{DafnyHelpersClass}.SubtractRunes";
} else {
opString = "-"; truncateResult = true;
preOpString = $"(char)(";
postOpString = ")";
}
} else {
opString = "-"; truncateResult = true;
}
break;
case BinaryExpr.ResolvedOpcode.Mul:
Expand Down Expand Up @@ -2953,10 +3001,11 @@ protected override void EmitConversionExpr(ConversionExpr e, bool inLetExprBody,
if (AsNativeType(e.E.Type) != null) {
wr.Write("new BigInteger");
}
TrParenExpr(e.E, wr, inLetExprBody, wStmts);
ConvertFromChar(e.E, wr, inLetExprBody, wStmts);
wr.Write(", BigInteger.One)");
} else if (e.ToType.IsCharType) {
wr.Format($"(char)({Expr(e.E, inLetExprBody, wStmts)})");
wr.Write($"({CharTypeName})");
TrParenExpr(e.E, wr, inLetExprBody, wStmts);
} else {
// (int or bv or char) -> (int or bv or ORDINAL)
var fromNative = AsNativeType(e.E.Type);
Expand All @@ -2965,7 +3014,7 @@ protected override void EmitConversionExpr(ConversionExpr e, bool inLetExprBody,
if (e.E.Type.IsCharType) {
// char -> big-integer (int or bv or ORDINAL)
wr.Write("new BigInteger");
TrParenExpr(e.E, wr, inLetExprBody, wStmts);
ConvertFromChar(e.E, wr, inLetExprBody, wStmts);
} else {
// big-integer (int or bv) -> big-integer (int or bv or ORDINAL), so identity will do
TrExpr(e.E, wr, inLetExprBody, wStmts);
Expand Down Expand Up @@ -3005,7 +3054,7 @@ protected override void EmitConversionExpr(ConversionExpr e, bool inLetExprBody,
}
} else {
// no optimization applies; use the standard translation
TrParenExpr(e.E, wr, inLetExprBody, wStmts);
ConvertFromChar(e.E, wr, inLetExprBody, wStmts);
}
}
}
Expand All @@ -3018,7 +3067,7 @@ protected override void EmitConversionExpr(ConversionExpr e, bool inLetExprBody,
} else {
// real -> (int or bv or char or ordinal)
if (e.ToType.IsCharType) {
wr.Write("(char)");
wr.Write($"({CharTypeName})");
} else if (AsNativeType(e.ToType) != null) {
wr.Write("({0})", GetNativeTypeName(AsNativeType(e.ToType)));
}
Expand All @@ -3029,7 +3078,7 @@ protected override void EmitConversionExpr(ConversionExpr e, bool inLetExprBody,
if (e.ToType.IsNumericBased(Type.NumericPersuasion.Int) || e.ToType.IsBigOrdinalType) {
TrExpr(e.E, wr, inLetExprBody, wStmts);
} else if (e.ToType.IsCharType) {
wr.Write("(char)");
wr.Write($"({CharTypeName})");
TrParenExpr(e.E, wr, inLetExprBody, wStmts);
} else if (e.ToType.IsNumericBased(Type.NumericPersuasion.Real)) {
wr.Write("new Dafny.BigRational(");
Expand Down Expand Up @@ -3341,15 +3390,15 @@ public override void EmitCallToMain(Method mainMethod, string baseName, Concrete
var idName = IssueCreateStaticMain(mainMethod) ? "_StaticMain" : IdName(mainMethod);

Coverage.EmitSetup(wBody);
wBody.WriteLine($"{GetHelperModuleName()}.WithHaltHandling(() => {companion}.{idName}(Dafny.Sequence<Dafny.ISequence<char>>.FromMainArguments(args)));");
wBody.WriteLine($"{GetHelperModuleName()}.WithHaltHandling(() => {companion}.{idName}(Dafny.Sequence<Dafny.ISequence<{CharTypeName}>>.{CharMethodQualifier}FromMainArguments(args)));");
Coverage.EmitTearDown(wBody);
}

protected override void EmitHaltRecoveryStmt(Statement body, string haltMessageVarName, Statement recoveryBody, ConcreteSyntaxTree wr) {
var tryBlock = wr.NewBlock("try");
TrStmt(body, tryBlock);
var catchBlock = wr.NewBlock("catch (Dafny.HaltException e)");
catchBlock.WriteLine($"var {haltMessageVarName} = Dafny.Sequence<char>.FromString(e.Message);");
catchBlock.WriteLine($"var {haltMessageVarName} = Dafny.Sequence<{CharTypeName}>.{CharMethodQualifier}FromString(e.Message);");
TrStmt(recoveryBody, catchBlock);
}

Expand Down
54 changes: 39 additions & 15 deletions Source/DafnyCore/Compilers/Compiler-cpp.cs
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,9 @@ public override void OnPreCompile(ErrorReporter reporter, ReadOnlyCollection<str
Feature.MapComprehensions,
Feature.ExactBoundedPool,
Feature.RunAllTests,
Feature.MethodSynthesis
Feature.MethodSynthesis,
Feature.UnicodeChars,
Feature.ConvertingValuesToStrings
};

public override string TargetLanguage => "C++";
Expand Down Expand Up @@ -96,12 +98,20 @@ public override void OnPreCompile(ErrorReporter reporter, ReadOnlyCollection<str
protected override string ClassAccessor => "->";

protected override void EmitHeader(Program program, ConcreteSyntaxTree wr) {
// This seems to be a good place to check for unsupported options
if (UnicodeCharEnabled) {
throw new UnsupportedFeatureException(program.GetFirstTopLevelToken(), Feature.UnicodeChars);
}

wr.WriteLine("// Dafny program {0} compiled into Cpp", program.Name);
wr.WriteLine("#include \"DafnyRuntime.h\"");
foreach (var header in this.headers) {
wr.WriteLine("#include \"{0}\"", header);
}

// For "..."s string literals, to avoid interpreting /0 as the C end of the string, cstring-style
wr.WriteLine("using namespace std::literals;");

var filenameNoExtension = program.Name.Substring(0, program.Name.Length - 4);
var headerFileName = String.Format("{0}.h", filenameNoExtension);
wr.WriteLine("#include \"{0}\"", headerFileName);
Expand Down Expand Up @@ -208,7 +218,7 @@ private string InstantiateTemplate(List<Type> typeArgs) {
if (typeArgs != null) {
var targs = "";
if (typeArgs.Count > 0) {
targs = String.Format("<{0}>", Util.Comma(typeArgs, ta => TypeName(ta, null, null)));
targs = String.Format("<{0}>", Util.Comma(typeArgs, ta => TypeName(ta, null, Token.NoToken)));
}

return targs;
Expand Down Expand Up @@ -1108,7 +1118,7 @@ protected override string TypeInitializationValue(Type type, ConcreteSyntaxTree

private string ActualTypeArgs(List<Type> typeArgs) {
return typeArgs.Count > 0
? String.Format(" <{0}> ", Util.Comma(typeArgs, tp => TypeName(tp, null, null))) : "";
? String.Format(" <{0}> ", Util.Comma(typeArgs, tp => TypeName(tp, null, Token.NoToken))) : "";
}

protected override string TypeName_UDT(string fullCompileName, List<TypeParameter.TPVariance> variance, List<Type> typeArgs,
Expand Down Expand Up @@ -1319,7 +1329,14 @@ protected override void EmitHalt(IToken tok, Expression messageExpr, ConcreteSyn
wr.Write("\"" + Dafny.ErrorReporter.TokenToString(tok) + ": \" + ");
}

TrExpr(messageExpr, wr, false, wStmts);
if (messageExpr.Type.IsStringType) {
wr.Write("ToVerbatimString(");
TrExpr(messageExpr, wr, false, wStmts);
wr.Write(")");
} else {
throw new UnsupportedFeatureException(tok, Feature.ConvertingValuesToStrings);
}

wr.WriteLine(");");
}

Expand Down Expand Up @@ -1447,8 +1464,9 @@ protected override void EmitLiteralExpr(ConcreteSyntaxTree wr, LiteralExpr e) {
wr.Write("'{0}'", v);
} else if (e is StringLiteralExpr) {
var str = (StringLiteralExpr)e;
// TODO: the string should be converted to a Dafny seq<char>
wr.Write("DafnySequenceFromString(");
TrStringLiteral(str, wr);
wr.Write(")");
} else if (AsNativeType(e.Type) is NativeType nt) {
wr.Write("({0}){1}", GetNativeTypeName(nt), (BigInteger)e.Value);
if ((BigInteger)e.Value > 9223372036854775807) {
Expand All @@ -1470,9 +1488,8 @@ void EmitIntegerLiteral(BigInteger i, ConcreteSyntaxTree wr) {

protected override void EmitStringLiteral(string str, bool isVerbatim, ConcreteSyntaxTree wr) {
var n = str.Length;
wr.Write("DafnySequenceFromString(");
if (!isVerbatim) {
wr.Write("\"{0}\"", str);
wr.Write($"\"{TranslateEscapes(str)}\"");
} else {
wr.Write("\"");
for (var i = 0; i < n; i++) {
Expand All @@ -1491,7 +1508,18 @@ protected override void EmitStringLiteral(string str, bool isVerbatim, ConcreteS
}
wr.Write("\"");
}
wr.Write(")");

// Use the postfix "..."s operator (operator""s) to convert to std::string values
// without interpreting /0 as a terminator:
// https://en.cppreference.com/w/cpp/string/basic_string/operator%22%22s
wr.Write("s");
}

private static string TranslateEscapes(string s) {
s = Util.ReplaceNullEscapesWithCharacterEscapes(s);
// TODO: Other cases, once we address the fact that we shouldn't be
// using the C++ char as the Dafny 16-bit char in the first place.
return s;
}

protected override ConcreteSyntaxTree EmitBitvectorTruncation(BitvectorType bvType, bool surroundByUnchecked, ConcreteSyntaxTree wr) {
Expand Down Expand Up @@ -2279,9 +2307,8 @@ protected override void EmitConversionExpr(ConversionExpr e, bool inLetExprBody,
if (e.ToType.IsNumericBased(Type.NumericPersuasion.Real)) {
throw new UnsupportedFeatureException(e.tok, Feature.RealNumbers);
} else if (e.ToType.IsCharType) {
wr.Write("_dafny.Char(");
wr.Write("(char)");
TrParenExpr(e.E, wr, inLetExprBody, wStmts);
wr.Write(".Int32())");
} else {
// (int or bv or char) -> (int or bv or ORDINAL)
var fromNative = AsNativeType(e.E.Type);
Expand All @@ -2293,13 +2320,10 @@ protected override void EmitConversionExpr(ConversionExpr e, bool inLetExprBody,
} else if (e.E.Type.IsCharType) {
Contract.Assert(fromNative == null);
if (toNative == null) {
// char -> big-integer (int or bv or ORDINAL)
wr.Write("_dafny.IntOfInt32(rune(");
TrExpr(e.E, wr, inLetExprBody, wStmts);
wr.Write("))");
throw new UnsupportedFeatureException(e.tok, Feature.UnboundedIntegers);
} else {
// char -> native
wr.Write(GetNativeTypeName(toNative));
wr.Write($"({GetNativeTypeName(toNative)})");
TrParenExpr(e.E, wr, inLetExprBody, wStmts);
}
} else if (fromNative == null && toNative == null) {
Expand Down
Loading

0 comments on commit f65c111

Please sign in to comment.