diff --git a/Benchmark/FastParserBenchmark.cs b/Benchmark/FastParserBenchmark.cs index bcf5cfa..e67b50b 100644 --- a/Benchmark/FastParserBenchmark.cs +++ b/Benchmark/FastParserBenchmark.cs @@ -23,6 +23,7 @@ namespace csFastFloat.Benchmark public class FFBencmark { private string[] _lines; + private byte[][] _linesUtf8; private class Config : ManualConfig { @@ -34,6 +35,33 @@ public Config() } } + [Benchmark(Description = "Utf8Parser")] + public double Utf8Parser() + { + double max = double.MinValue; + + foreach (byte[] l in _linesUtf8) + { + if (!System.Buffers.Text.Utf8Parser.TryParse(l, out double d, out int consumed) || consumed != l.Length) + throw new InvalidOperationException(); + + max = d > max ? d : max; + } + return max; + } + + [Benchmark(Description = "FastFloat.ParseDouble() - UTF8")] + public double FastParserUtf8_() + { + double max = double.MinValue; + + foreach (byte[] l in _linesUtf8) + { + double d = FastDoubleParser.ParseDouble(l); + max = d > max ? d : max; + } + return max; + } [Benchmark(Description = "FastFloat.ParseDouble()")] public double FastParser_() @@ -98,6 +126,7 @@ public void Setup() { Console.WriteLine("reading data"); _lines = System.IO.File.ReadAllLines(FileName); + _linesUtf8 = Array.ConvertAll(_lines, System.Text.Encoding.UTF8.GetBytes); } } diff --git a/README.md b/README.md index 53a0de8..113e1da 100644 --- a/README.md +++ b/README.md @@ -82,9 +82,9 @@ Job=.NET Core 5.0 Runtime=.NET Core 5.0 # Usage -Two functions are available. ParseDouble and ParseFloat. +Two functions are available: `FastDoubleParser.ParseDouble` and `FastFloatParser.ParseFloat`. -String, char * and ReadOnlySpan are supported inputs. +`String` and `ReadOnlySpan<char>` are supported inputs. ```C# using csFastFloat; @@ -100,6 +100,10 @@ foreach (string l in lines) } ``` +Input strings are expected to be valid UTF-16. + +For UTF-8 or ASCII inputs, you may pass a `ReadOnlySpan<byte>` argument.
+ # Testing The set of unit tests in /TestcsFastFloat project combines unit tests from many libraries. It includes tests used by the Go Team. diff --git a/TestcsFastFloat/Basic/BasicTests.cs b/TestcsFastFloat/Basic/BasicTests.cs index 0afea7b..9c772a6 100644 --- a/TestcsFastFloat/Basic/BasicTests.cs +++ b/TestcsFastFloat/Basic/BasicTests.cs @@ -229,8 +229,12 @@ private void TestInfinity_Double(string sut, double expected_value) [InlineData("1.7976931348623158e308", 1.7976931348623157e+308)] // 0x1.fffffffffffffp + 1023)] [InlineData("9007199254740993.0", 9007199254740992.0)] // 0x1p53)] [Theory] - private void TestGeneral_Double(string sut, double expected_value) => Assert.Equal(expected_value, FastDoubleParser.ParseDouble(sut)); - + private void TestGeneral_Double(string sut, double expected_value) + { + Assert.Equal(expected_value, FastDoubleParser.ParseDouble(sut)); + Assert.Equal(expected_value, FastDoubleParser.ParseDouble(sut.AsSpan())); + Assert.Equal(expected_value, FastDoubleParser.ParseDouble(System.Text.Encoding.UTF8.GetBytes(sut))); + } [Trait("Category", "Smoke Test")] [InlineData("1.1754941406275178592461758986628081843312458647327962400313859427181746759860647699724722770042717456817626953125", 655, "", 1.17549419)] [InlineData("1.1754941406275178592461758986628081843312458647327962400313859427181746759860647699724722770042717456817626953125", 656, "", 1.17549419)] @@ -240,7 +244,6 @@ private void TestInfinity_Double(string sut, double expected_value) [InlineData("1.1754941406275178592461758986628081843312458647327962400313859427181746759860647699724722770042717456817626953125", 1000, "e-38", 1.1754941E-38)] [Theory] private void TestGeneral_Float_appendZeros(string sut, int zeros, string exp, float expected_value) => Assert.Equal(expected_value, FastFloatParser.ParseFloat(sut.PadRight(zeros, '0') + exp)); - // //verify32(1.00000006e+09f)] ////verify32(1.4012984643e-45f)] ////verify32(1.1754942107e-38f)] @@ -311,6 +314,7 @@ private void 
TestGeneral_Float(string sut, float expected_value)
     {
       Assert.Equal(expected_value, FastFloatParser.ParseFloat(sut));
       Assert.Equal(expected_value, FastFloatParser.ParseFloat(sut.AsSpan()));
+      Assert.Equal(expected_value, FastFloatParser.ParseFloat(System.Text.Encoding.UTF8.GetBytes(sut)));
     }
   }
 }
\ No newline at end of file
diff --git a/TestcsFastFloat/suppl_tests/SupplFilesTest.cs b/TestcsFastFloat/suppl_tests/SupplFilesTest.cs
index c4b202f..659a193 100644
--- a/TestcsFastFloat/suppl_tests/SupplFilesTest.cs
+++ b/TestcsFastFloat/suppl_tests/SupplFilesTest.cs
@@ -66,6 +66,18 @@ private static void VerifyFile(string fileName)
             Assert.True(_f == f);
             double d = FastDoubleParser.ParseDouble(sut[3]);
             Assert.True(_d == d);
+
+            // parse and assert equality
+            float f_span = FastFloatParser.ParseFloat(sut[3].AsSpan());
+            Assert.True(_f == f_span);
+            double d_span = FastDoubleParser.ParseDouble(sut[3].AsSpan());
+            Assert.True(_d == d_span);
+
+            // parse and assert equality
+            float f_utf8 = FastFloatParser.ParseFloat(System.Text.Encoding.UTF8.GetBytes(sut[3]));
+            Assert.True(_f == f_utf8);
+            double d_utf8 = FastDoubleParser.ParseDouble(System.Text.Encoding.UTF8.GetBytes(sut[3]));
+            Assert.True(_d == d_utf8);
           }
           catch (Exception ex)
           {
diff --git a/csFastFloat/FastDoubleParser.cs b/csFastFloat/FastDoubleParser.cs
index 6861a3a..e7be21b 100644
--- a/csFastFloat/FastDoubleParser.cs
+++ b/csFastFloat/FastDoubleParser.cs
@@ -115,7 +115,49 @@ unsafe static internal double ParseNumber(char* first, char* last, chars_format
       return ToFloat(pns.negative, am);
     }
+    unsafe static internal Double ParseNumber (byte* first, byte* last, chars_format expectedFormat = chars_format.is_general, byte decimal_separator = (byte)'.')
+    {
+      while ((first != last) && Utils.is_space(*first)) // skip leading whitespace
+      {
+        first++;
+      }
+      if (first == last) // nothing left to parse
+      {
+        ThrowArgumentException();
+      }
+      ParsedNumberString pns = ParsedNumberString.ParseNumberString(first, last, expectedFormat, decimal_separator); // forward the caller's separator; it was previously only used by the long-mantissa fallback
+      if (!pns.valid)
+      {
+        return 
HandleInvalidInput(first, last);
+      }
+
+      // Next is Clinger's fast path.
+      if (DoubleBinaryConstants.min_exponent_fast_path <= pns.exponent && pns.exponent <= DoubleBinaryConstants.max_exponent_fast_path && pns.mantissa <= DoubleBinaryConstants.max_mantissa_fast_path && !pns.too_many_digits)
+      {
+        return FastPath(pns);
+      }
+
+      AdjustedMantissa am = ComputeFloat(pns.exponent, pns.mantissa);
+      if (pns.too_many_digits)
+      {
+        if (am != ComputeFloat(pns.exponent, pns.mantissa + 1))
+        {
+          am.power2 = -1; // value is invalid.
+        }
+      }
+      // If we called ComputeFloat(pns.exponent, pns.mantissa) and we have an invalid power (am.power2 < 0),
+      // then we need to go the long way around again. This is very uncommon.
+      if (am.power2 < 0) { am = ParseLongMantissa(first, last, decimal_separator); }
+      return ToFloat(pns.negative, am);
+    }
+    public static unsafe double ParseDouble(ReadOnlySpan<byte> s, chars_format expectedFormat = chars_format.is_general, byte decimal_separator = (byte)'.')
+    { // UTF-8/ASCII entry point: pin the span and parse the raw bytes in place
+      fixed(byte* pStart = s)
+      {
+        return ParseNumber(pStart, pStart + s.Length, expectedFormat, decimal_separator);
+      }
+    }
     ///
     ///
@@ -363,12 +405,19 @@ internal static AdjustedMantissa ComputeFloat(DecimalInfo d)
       return answer;
     }
+    // UTF-16 inputs
     unsafe static internal AdjustedMantissa ParseLongMantissa(char* first, char* last, char decimal_separator)
     {
       DecimalInfo d = DecimalInfo.parse_decimal(first, last, decimal_separator);
       return ComputeFloat(d);
     }
+    // UTF-8/ASCII inputs
+    unsafe static internal AdjustedMantissa ParseLongMantissa(byte* first, byte* last, byte decimal_separator)
+    {
+      DecimalInfo d = DecimalInfo.parse_decimal(first, last, decimal_separator);
+      return ComputeFloat(d);
+    }
     unsafe static internal double HandleInvalidInput(char* first, char* last)
@@ -404,6 +453,55 @@ unsafe static internal double HandleInvalidInput(char* first, char* last)
     }
+    unsafe static internal double HandleInvalidInput(byte* first, byte* last)
+    {
+      // C# does not (yet) allow literal ASCII strings (it 
uses UTF-16), so
+      // we need to use byte arrays.
+      // "infinity" string in ASCII, e.g., 105 = i
+      ReadOnlySpan<byte> infinity_string = new byte[]{105, 110, 102, 105, 110, 105, 116, 121};
+      // "inf" string in ASCII
+      ReadOnlySpan<byte> inf_string = new byte[]{105, 110, 102};
+      // "+inf" string in ASCII (43 = '+')
+      ReadOnlySpan<byte> pinf_string = new byte[]{43, 105, 110, 102};
+      // "-inf" string in ASCII (45 = '-'; the previous value 5 could never match a minus sign)
+      ReadOnlySpan<byte> minf_string = new byte[]{45, 105, 110, 102};
+      // "nan" string in ASCII
+      ReadOnlySpan<byte> nan_string = new byte[]{110, 97, 110};
+      // "-nan" string in ASCII
+      ReadOnlySpan<byte> mnan_string = new byte[]{45, 110, 97, 110};
+      // "+nan" string in ASCII
+      ReadOnlySpan<byte> pnan_string = new byte[]{43, 110, 97, 110};
+
+      if (last - first >= 3)
+      {
+        if (Utils.strncasecmp(first, nan_string, 3))
+        {
+          return DoubleBinaryConstants.NaN;
+        }
+        if (Utils.strncasecmp(first, inf_string, 3))
+        {
+          if ((last - first >= 8) && Utils.strncasecmp(first, infinity_string, 8))
+            return DoubleBinaryConstants.PositiveInfinity;
+          return DoubleBinaryConstants.PositiveInfinity;
+        }
+        if (last - first >= 4)
+        {
+          if (Utils.strncasecmp(first, pnan_string, 4) || Utils.strncasecmp(first, mnan_string, 4))
+          {
+            return DoubleBinaryConstants.NaN;
+          }
+          if (Utils.strncasecmp(first, pinf_string, 4) ||
+              Utils.strncasecmp(first, minf_string, 4) ||
+              ((last - first >= 9) && Utils.strncasecmp(first + 1, infinity_string, 8))) // compares first[1..8], so 9 bytes must be available
+          {
+            return (first[0] == '-') ? 
DoubleBinaryConstants.NegativeInfinity : DoubleBinaryConstants.PositiveInfinity;
+          }
+        }
+      }
+      ThrowArgumentException();
+      return 0d;
+    }
+
diff --git a/csFastFloat/FastFloatParser.cs b/csFastFloat/FastFloatParser.cs
index a87c6e6..02e9c38 100644
--- a/csFastFloat/FastFloatParser.cs
+++ b/csFastFloat/FastFloatParser.cs
@@ -114,6 +114,48 @@ unsafe static internal float ParseNumber(char* first, char* last, chars_format e
     }
+    unsafe static internal float ParseNumber(byte* first, byte* last, chars_format expectedFormat = chars_format.is_general, byte decimal_separator = (byte)'.')
+    {
+      while ((first != last) && Utils.is_space(*first)) // skip leading whitespace
+      {
+        first++;
+      }
+      if (first == last) // nothing left to parse
+      {
+        ThrowArgumentException();
+      }
+      ParsedNumberString pns = ParseNumberString(first, last, expectedFormat, (char)decimal_separator); // forward the caller's separator (this overload takes it as char); it was previously only used by the long-mantissa fallback
+      if (!pns.valid)
+      {
+        return HandleInvalidInput(first, last);
+      }
+
+      // Next is Clinger's fast path.
+      if (FloatBinaryConstants.min_exponent_fast_path <= pns.exponent && pns.exponent <= FloatBinaryConstants.max_exponent_fast_path && pns.mantissa <= FloatBinaryConstants.max_mantissa_fast_path && !pns.too_many_digits)
+      {
+        return FastPath(pns);
+      }
+
+      AdjustedMantissa am = ComputeFloat(pns.exponent, pns.mantissa);
+      if (pns.too_many_digits)
+      {
+        if (am != ComputeFloat(pns.exponent, pns.mantissa + 1))
+        {
+          am.power2 = -1; // value is invalid.
+        }
+      }
+      // If we called ComputeFloat(pns.exponent, pns.mantissa) and we have an invalid power (am.power2 < 0),
+      // then we need to go the long way around again. This is very uncommon.
+      if (am.power2 < 0) { am = ParseLongMantissa(first, last, (byte)decimal_separator); }
+      return ToFloat(pns.negative, am);
+    }
+    public static unsafe float ParseFloat(ReadOnlySpan<byte> s, chars_format expectedFormat = chars_format.is_general, byte decimal_separator = (byte)'.')
+    { // UTF-8/ASCII entry point: pin the span and parse the raw bytes in place
+      fixed(byte* pStart = s)
+      {
+        return ParseNumber(pStart, pStart + s.Length, expectedFormat, decimal_separator);
+      }
+    }
     ///
     ///
@@ -368,12 +410,19 @@ internal static AdjustedMantissa ComputeFloat(DecimalInfo d)
       return answer;
     }
+    // UTF-16 inputs
     unsafe static internal AdjustedMantissa ParseLongMantissa(char* first, char* last, char decimal_separator)
     {
       DecimalInfo d = DecimalInfo.parse_decimal(first, last, decimal_separator);
       return ComputeFloat(d);
     }
+    // UTF-8/ASCII inputs
+    unsafe static internal AdjustedMantissa ParseLongMantissa(byte* first, byte* last, byte decimal_separator)
+    {
+      DecimalInfo d = DecimalInfo.parse_decimal(first, last, decimal_separator);
+      return ComputeFloat(d);
+    }
@@ -410,6 +459,54 @@ unsafe static internal float HandleInvalidInput(char* first, char* last)
       return 0f;
     }
+    unsafe static internal float HandleInvalidInput(byte* first, byte* last)
+    {
+      // C# does not (yet) allow literal ASCII strings (it uses UTF-16), so
+      // we need to use byte arrays.
+      // "infinity" string in ASCII, e.g., 105 = i
+      ReadOnlySpan<byte> infinity_string = new byte[]{105, 110, 102, 105, 110, 105, 116, 121};
+      // "inf" string in ASCII
+      ReadOnlySpan<byte> inf_string = new byte[]{105, 110, 102};
+      // "+inf" string in ASCII (43 = '+')
+      ReadOnlySpan<byte> pinf_string = new byte[]{43, 105, 110, 102};
+      // "-inf" string in ASCII (45 = '-'; the previous value 5 could never match a minus sign)
+      ReadOnlySpan<byte> minf_string = new byte[]{45, 105, 110, 102};
+      // "nan" string in ASCII
+      ReadOnlySpan<byte> nan_string = new byte[]{110, 97, 110};
+      // "-nan" string in ASCII
+      ReadOnlySpan<byte> mnan_string = new byte[]{45, 110, 97, 110};
+      // "+nan" string in ASCII
+      ReadOnlySpan<byte> pnan_string = new byte[]{43, 110, 97, 110};
+
+      if (last - first >= 3)
+      {
+        if (Utils.strncasecmp(first, nan_string, 3))
+        {
+          return FloatBinaryConstants.NaN;
+        }
+        if (Utils.strncasecmp(first, inf_string, 3))
+        {
+          if ((last - first >= 8) && Utils.strncasecmp(first, infinity_string, 8))
+            return FloatBinaryConstants.PositiveInfinity;
+          return FloatBinaryConstants.PositiveInfinity;
+        }
+        if (last - first >= 4)
+        {
+          if (Utils.strncasecmp(first, pnan_string, 4) || Utils.strncasecmp(first, mnan_string, 4))
+          {
+            return FloatBinaryConstants.NaN;
+          }
+          if (Utils.strncasecmp(first, pinf_string, 4) ||
+              Utils.strncasecmp(first, minf_string, 4) ||
+              ((last - first >= 9) && Utils.strncasecmp(first + 1, infinity_string, 8))) // compares first[1..8], so 9 bytes must be available
+          {
+            return (first[0] == '-') ? 
FloatBinaryConstants.NegativeInfinity : FloatBinaryConstants.PositiveInfinity; + } + } + } + ThrowArgumentException(); + return 0f; + } @@ -564,6 +661,164 @@ unsafe static internal ParsedNumberString ParseNumberString(char* p, char* pend, return answer; } + + unsafe static internal ParsedNumberString ParseNumberString(byte* p, byte* pend, chars_format expectedFormat = chars_format.is_general, char decimal_separator = '.') + { + ParsedNumberString answer = new ParsedNumberString(); + + answer.valid = false; + answer.too_many_digits = false; + answer.negative = (*p == '-'); + if ((*p == '-') || (*p == '+')) + { + ++p; + if (p == pend) + { + return answer; + } + if (!Utils.is_integer(*p, out uint digit) && (*p != decimal_separator)) // culture info ? + { // a sign must be followed by an integer or the dot + return answer; + } + } + byte* start_digits = p; + + ulong i = 0; // an unsigned int avoids signed overflows (which are bad) + + while ((p != pend) && Utils.is_integer(*p, out uint digit)) + { + // a multiplication by 10 is cheaper than an arbitrary integer + // multiplication + i = 10 * i + digit; // might overflow, we will handle the overflow later + ++p; + } + byte* end_of_integer_part = p; + long digit_count = (long)(end_of_integer_part - start_digits); + long exponent = 0; + if ((p != pend) && (*p == decimal_separator)) + { + ++p; + if ((p + 8 <= pend) && Utils.is_made_of_eight_digits_fast(p)) + { + i = i * 100000000 + Utils.parse_eight_digits_unrolled(p); + p += 8; + if ((p + 8 <= pend) && Utils.is_made_of_eight_digits_fast(p)) { + i = i * 100000000 + Utils.parse_eight_digits_unrolled(p); + p += 8; + } + } + while ((p != pend) && Utils.is_integer(*p, out uint digit)) + { + ++p; + i = i * 10 + digit; // in rare cases, this will overflow, but that's ok + } + exponent = end_of_integer_part + 1 - p; + digit_count -= exponent; + } + // we must have encountered at least one integer! 
+ if (digit_count == 0) + { + return answer; + } + long exp_number = 0; // explicit exponential part + if (expectedFormat.HasFlag(chars_format.is_scientific) && (p != pend) && (('e' == *p) || ('E' == *p))) + { + byte* location_of_e = p; + ++p; + bool neg_exp = false; + if ((p != pend) && ('-' == *p)) + { + neg_exp = true; + ++p; + } + else if ((p != pend) && ('+' == *p)) + { + ++p; + } + if ((p == pend) || !Utils.is_integer(*p, out uint digit)) + { + if (expectedFormat != chars_format.is_fixed) + { + // We are in error. + return answer; + } + // Otherwise, we will be ignoring the 'e'. + p = location_of_e; + } + else + { + while ((p != pend) && Utils.is_integer(*p, out uint cdigit)) + { + if (exp_number < 0x10000) + { + exp_number = 10 * exp_number + cdigit; + } + ++p; + } + if (neg_exp) { exp_number = -exp_number; } + exponent += exp_number; + } + } + else + { + // If it scientific and not fixed, we have to bail out. + if ((expectedFormat.HasFlag(chars_format.is_scientific)) && !(expectedFormat.HasFlag(chars_format.is_fixed))) { return answer; } + } + //answer.lastmatch = p; + answer.valid = true; + + // If we frequently had to deal with long strings of digits, + // we could extend our code by using a 128-bit integer instead + // of a 64-bit integer. However, this is uncommon. + // + // We can deal with up to 19 digits. + if (digit_count > 19) + { // this is uncommon + // It is possible that the integer had an overflow. + // We have to handle the case where we have 0.0000somenumber. + // We need to be mindful of the case where we only have zeroes... + // E.g., 0.000000000...000. + byte* start = start_digits; + while ((start != pend) && (*start == '0' || *start == decimal_separator)) + { + if (*start == '0') { digit_count--; } + start++; + } + if (digit_count > 19) + { + answer.too_many_digits = true; + // Let us start again, this time, avoiding overflows. 
+ i = 0; + p = start_digits; + const ulong minimal_nineteen_digit_integer = 1000000000000000000; + while ((i < minimal_nineteen_digit_integer) && (p != pend) && Utils.is_integer(*p, out uint digit)) + { + i = i * 10 + digit; + ++p; + } + if (i >= minimal_nineteen_digit_integer) + { // We have a big integers + exponent = end_of_integer_part - p + exp_number; + } + else + { // We have a value with a fractional component. + p++; // skip the '.' + byte* first_after_period = p; + while ((i < minimal_nineteen_digit_integer) && (p != pend) && Utils.is_integer(*p, out uint digit)) + { + i = i * 10 + digit; + ++p; + } + exponent = first_after_period - p + exp_number; + } + // We have now corrected both exponent and i, to a truncated value + } + } + answer.exponent = exponent; + answer.mantissa = i; + return answer; + } + // This should always succeed since it follows a call to parse_number_string // This function could be optimized. In particular, we could stop after 19 digits // and try to bail out. 
Furthermore, we should be able to recover the computed diff --git a/csFastFloat/Structures/DecimalInfo.cs b/csFastFloat/Structures/DecimalInfo.cs index 1de78a2..b1a5167 100644 --- a/csFastFloat/Structures/DecimalInfo.cs +++ b/csFastFloat/Structures/DecimalInfo.cs @@ -310,6 +310,7 @@ internal void decimal_right_shift(int shift) trim(); } + // UTF-16 inputs unsafe internal static DecimalInfo parse_decimal(char* p, char* pend, char decimal_separator) { DecimalInfo answer = new DecimalInfo() { negative = (*p == '-') }; @@ -345,21 +346,6 @@ unsafe internal static DecimalInfo parse_decimal(char* p, char* pend, char decim ++p; } } - //#if FASTFLOAT_IS_BIG_ENDIAN == 0 - // // We expect that this loop will often take the bulk of the running time - // // because when a value has lots of digits, these digits often - // while ((p + 8 <= pend) && (answer.num_digits + 8 < Constants.max_digits)) - // { - // ulong val; - // ::memcpy(&val, p, sizeof(ulong)); - // if (!is_made_of_eight_digits_fast(val)) { break; } - // // We have eight digits, process them in one go! 
- // val -= 0x3030303030303030; - // ::memcpy(answer.digits + answer.num_digits, &val, sizeof(ulong)); - // answer.num_digits += 8; - // p += 8; - // } - //#endif while ((p != pend) && Utils.is_integer(*p, out uint cMinus0)) { if (answer.num_digits < Constants.max_digits) @@ -425,5 +411,106 @@ unsafe internal static DecimalInfo parse_decimal(char* p, char* pend, char decim return answer; } + + // UTF-8/ASCII inputs + unsafe internal static DecimalInfo parse_decimal(byte* p, byte* pend, byte decimal_separator) + { + DecimalInfo answer = new DecimalInfo() { negative = (*p == '-') }; + + if ((*p == '-') || (*p == '+')) + { + ++p; + } + // skip leading zeroes + while ((p != pend) && (*p == '0')) + { + ++p; + } + while ((p != pend) && Utils.is_integer(*p, out uint digit)) + { + if (answer.num_digits < Constants.max_digits) + { + answer.digits[answer.num_digits] = (byte)digit; + } + answer.num_digits++; + ++p; + } + if ((p != pend) && (*p == decimal_separator)) + { + ++p; + byte* first_after_period = p; + // if we have not yet encountered a zero, we have to skip it as well + if (answer.num_digits == 0) + { + // skip zeros + while ((p != pend) && (*p == '0')) + { + ++p; + } + } + while ((p != pend) && Utils.is_integer(*p, out uint digit)) + { + if (answer.num_digits < Constants.max_digits) + { + answer.digits[answer.num_digits] = (byte)digit; + } + answer.num_digits++; + ++p; + } + answer.decimal_point = (int)(first_after_period - p); + } + // We want num_digits to be the number of significant digits, excluding + // leading *and* trailing zeros! Otherwise the truncated flag later is + // going to be misleading. + if (answer.num_digits > 0) + { + // We potentially need the answer.num_digits > 0 guard because we + // prune leading zeros. So with answer.num_digits > 0, we know that + // we have at least one non-zero digit. 
+ byte* preverse = p - 1; + int trailing_zeros = 0; + while ((*preverse == '0') || (*preverse == decimal_separator)) + { + if (*preverse == '0') { trailing_zeros++; }; + --preverse; + } + answer.decimal_point += (int)(answer.num_digits); + answer.num_digits -= (uint)(trailing_zeros); + } + if (answer.num_digits > Constants.max_digits) + { + answer.truncated = true; + answer.num_digits = Constants.max_digits; + } + if ((p != pend) && (('e' == *p) || ('E' == *p))) + { + ++p; + bool neg_exp = false; + if ((p != pend) && ('-' == *p)) + { + neg_exp = true; + ++p; + } + else if ((p != pend) && ('+' == *p)) + { + ++p; + } + int exp_number = 0; // exponential part + while ((p != pend) && Utils.is_integer(*p, out uint digit)) + { + if (exp_number < 0x10000) + { + exp_number = 10 * exp_number + (int)digit; + } + ++p; + } + answer.decimal_point += (neg_exp ? -exp_number : exp_number); + } + // In very rare cases, we may have fewer than 19 digits, we want to be able to reliably + // assume that all digits up to max_digit_without_overflow have been initialized. + for (uint i = answer.num_digits; i < Constants.max_digit_without_overflow; i++) { answer.digits[i] = 0; } + + return answer; + } } } \ No newline at end of file diff --git a/csFastFloat/Structures/ParsedNumberString.cs b/csFastFloat/Structures/ParsedNumberString.cs index bf1cf94..1a163f2 100644 --- a/csFastFloat/Structures/ParsedNumberString.cs +++ b/csFastFloat/Structures/ParsedNumberString.cs @@ -11,7 +11,7 @@ public unsafe struct ParsedNumberString internal bool valid; internal bool too_many_digits; - + // UTF-16 inputs. unsafe static internal ParsedNumberString ParseNumberString(char* p, char* pend, chars_format expectedFormat = chars_format.is_general, char decimal_separator = '.') { ParsedNumberString answer = new ParsedNumberString(); @@ -162,6 +162,163 @@ unsafe static internal ParsedNumberString ParseNumberString(char* p, char* pend, return answer; } + // UTF-8 / ASCII inputs. 
+ unsafe static internal ParsedNumberString ParseNumberString(byte* p, byte* pend, chars_format expectedFormat = chars_format.is_general, byte decimal_separator = (byte)'.') + { + ParsedNumberString answer = new ParsedNumberString(); + + answer.valid = false; + answer.too_many_digits = false; + answer.negative = (*p == '-'); + if ((*p == '-') || (*p == '+')) + { + ++p; + if (p == pend) + { + return answer; + } + if (!Utils.is_integer(*p, out uint digit) && (*p != decimal_separator)) + { // a sign must be followed by an integer or the dot + return answer; + } + } + byte* start_digits = p; + + ulong i = 0; // an unsigned int avoids signed overflows (which are bad) + + while ((p != pend) && Utils.is_integer(*p, out uint digit)) + { + // a multiplication by 10 is cheaper than an arbitrary integer + // multiplication + i = 10 * i + digit; // might overflow, we will handle the overflow later + ++p; + } + byte* end_of_integer_part = p; + long digit_count = (long)(end_of_integer_part - start_digits); + long exponent = 0; + if ((p != pend) && (*p == decimal_separator)) + { + ++p; + if ((p + 8 <= pend) && Utils.is_made_of_eight_digits_fast(p)) + { + i = i * 100000000 + Utils.parse_eight_digits_unrolled(p); + p += 8; + if ((p + 8 <= pend) && Utils.is_made_of_eight_digits_fast(p)) { + i = i * 100000000 + Utils.parse_eight_digits_unrolled(p); + p += 8; + } + } + while ((p != pend) && Utils.is_integer(*p, out uint digit)) + { + ++p; + i = i * 10 + digit; // in rare cases, this will overflow, but that's ok + } + exponent = end_of_integer_part + 1 - p; + digit_count -= exponent; + } + // we must have encountered at least one integer! 
+ if (digit_count == 0) + { + return answer; + } + long exp_number = 0; // explicit exponential part + if (expectedFormat.HasFlag(chars_format.is_scientific) && (p != pend) && (('e' == *p) || ('E' == *p))) + { + byte* location_of_e = p; + ++p; + bool neg_exp = false; + if ((p != pend) && ('-' == *p)) + { + neg_exp = true; + ++p; + } + else if ((p != pend) && ('+' == *p)) + { + ++p; + } + if ((p == pend) || !Utils.is_integer(*p, out uint digit)) + { + if (expectedFormat != chars_format.is_fixed) + { + // We are in error. + return answer; + } + // Otherwise, we will be ignoring the 'e'. + p = location_of_e; + } + else + { + while ((p != pend) && Utils.is_integer(*p, out uint cdigit)) + { + if (exp_number < 0x10000) + { + exp_number = 10 * exp_number + cdigit; + } + ++p; + } + if (neg_exp) { exp_number = -exp_number; } + exponent += exp_number; + } + } + else + { + // If it scientific and not fixed, we have to bail out. + if ((expectedFormat.HasFlag(chars_format.is_scientific)) && !(expectedFormat.HasFlag(chars_format.is_fixed))) { return answer; } + } + //answer.lastmatch = p; + answer.valid = true; + + // If we frequently had to deal with long strings of digits, + // we could extend our code by using a 128-bit integer instead + // of a 64-bit integer. However, this is uncommon. + // + // We can deal with up to 19 digits. + if (digit_count > 19) + { // this is uncommon + // It is possible that the integer had an overflow. + // We have to handle the case where we have 0.0000somenumber. + // We need to be mindful of the case where we only have zeroes... + // E.g., 0.000000000...000. + byte* start = start_digits; + while ((start != pend) && (*start == '0' || *start == decimal_separator)) + { + if (*start == '0') { digit_count--; } + start++; + } + if (digit_count > 19) + { + answer.too_many_digits = true; + // Let us start again, this time, avoiding overflows. 
+ i = 0; + p = start_digits; + const ulong minimal_nineteen_digit_integer = 1000000000000000000; + while ((i < minimal_nineteen_digit_integer) && (p != pend) && Utils.is_integer(*p, out uint digit)) + { + i = i * 10 + digit; + ++p; + } + if (i >= minimal_nineteen_digit_integer) + { // We have a big integers + exponent = end_of_integer_part - p + exp_number; + } + else + { // We have a value with a fractional component. + p++; // skip the '.' + byte* first_after_period = p; + while ((i < minimal_nineteen_digit_integer) && (p != pend) && Utils.is_integer(*p, out uint digit)) + { + i = i * 10 + digit; + ++p; + } + exponent = first_after_period - p + exp_number; + } + // We have now corrected both exponent and i, to a truncated value + } + } + answer.exponent = exponent; + answer.mantissa = i; + return answer; + } }; } \ No newline at end of file diff --git a/csFastFloat/Utils/Utils.cs b/csFastFloat/Utils/Utils.cs index 103409a..ff37be0 100644 --- a/csFastFloat/Utils/Utils.cs +++ b/csFastFloat/Utils/Utils.cs @@ -22,8 +22,42 @@ public value128(ulong h, ulong l) : this() public static class Utils { - // Next function can be micro-optimized, but compilers are entirely - // able to optimize it well. 
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    internal static uint parse_eight_digits_unrolled(ulong val)
+    {
+      const ulong mask = 0x000000FF000000FF;
+      const ulong mul1 = 0x000F424000000064; // 100 + (1000000ULL << 32)
+      const ulong mul2 = 0x0000271000000001; // 1 + (10000ULL << 32)
+      val -= 0x3030303030303030; // subtract ASCII '0' (0x30) from each of the eight bytes
+      val = (val * 10) + (val >> 8); // val = (val * 2561) >> 8;
+      val = (((val & mask) * mul1) + (((val >> 16) & mask) * mul2)) >> 32;
+      return (uint)val;
+    }
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    unsafe internal static uint parse_eight_digits_unrolled(byte* chars)
+    {
+      ulong val = Unsafe.ReadUnaligned<ulong>(chars); // explicit T: it cannot be inferred from a pointer argument
+      return parse_eight_digits_unrolled(val);
+    }
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    unsafe internal static bool is_made_of_eight_digits_fast(ulong val)
+    {
+      // We only enable paths depending on this function on little endian
+      // platforms (it happens to be effectively nearly everywhere).
+      return BitConverter.IsLittleEndian && (((val & 0xF0F0F0F0F0F0F0F0) |
+             (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) ==
+             0x3333333333333333);
+    }
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    unsafe internal static bool is_made_of_eight_digits_fast(byte* chars)
+    {
+      ulong val = Unsafe.ReadUnaligned<ulong>(chars); // explicit T: it cannot be inferred from a pointer argument
+      return is_made_of_eight_digits_fast(val);
+    }
+
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     internal static bool is_integer(char c, out uint cMinus0)
     {
@@ -32,7 +66,16 @@ internal static bool is_integer(char c, out uint cMinus0)
       cMinus0 = cc;
       return res;
     }
-
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    internal static bool is_integer(byte c, out uint cMinus0)
+    {
+      uint cc = (uint)(c - (byte)'0');
+      bool res = cc <= '9' - '0';
+      cMinus0 = cc;
+      return res;
+    }
+
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     internal static value128 compute_product_approximation(int bitPrecision, long q, ulong w)
     {
@@ -147,5 +190,27 @@ internal unsafe static bool strncasecmp(char* 
input1, char* input2, int length)
       }
       return (running_diff == 0) || (running_diff == 32);
     }
+    internal unsafe static bool strncasecmp(byte* input1, ReadOnlySpan<byte> input2, int length)
+    {
+      int running_diff = 0;
+
+      for (int i = 0; i < length; i++)
+      {
+        running_diff = running_diff | (input1[i] ^ input2[i]); // OR together the XOR of every byte pair
+      }
+      return (running_diff == 0) || (running_diff == 32); // 0: identical; 32 (0x20): the only differing bit is the ASCII case bit
+    }
+
+    internal unsafe static bool strncasecmp(byte* input1, byte* input2, int length)
+    {
+      int running_diff = 0;
+
+      for (int i = 0; i < length; i++)
+      {
+        running_diff = running_diff | (input1[i] ^ input2[i]); // OR together the XOR of every byte pair
+      }
+      return (running_diff == 0) || (running_diff == 32); // 0: identical; 32 (0x20): the only differing bit is the ASCII case bit
+    }
+
   }
 }