Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-41047: [C#] Address performance issue of reading from StringArray #41048

Merged
merged 3 commits into from
Apr 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 59 additions & 1 deletion csharp/src/Apache.Arrow/Arrays/StringArray.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,21 @@
// See the License for the specific language governing permissions and
// limitations under the License.

using Apache.Arrow.Types;
using System;
using System.Collections;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Text;
using Apache.Arrow.Types;

namespace Apache.Arrow
{
public class StringArray: BinaryArray, IReadOnlyList<string>
{
public static readonly Encoding DefaultEncoding = Encoding.UTF8;

private Dictionary<Encoding, string[]> materializedStringStore;

public new class Builder : BuilderBase<StringArray, Builder>
{
public Builder() : base(StringType.Default) { }
Expand Down Expand Up @@ -71,16 +73,28 @@ public StringArray(int length,

public override void Accept(IArrowArrayVisitor visitor) => Accept(this, visitor);

/// <summary>
/// Get the string value at the given index
/// </summary>
/// <param name="index">Input index</param>
/// <param name="encoding">Optional: the string encoding, default is UTF8</param>
/// <returns>The string object at the given index</returns>
public string GetString(int index, Encoding encoding = default)
{
encoding ??= DefaultEncoding;

if (materializedStringStore != null && materializedStringStore.TryGetValue(encoding, out string[] materializedStrings))
{
return materializedStrings[index];
}

ReadOnlySpan<byte> bytes = GetBytes(index, out bool isNull);

if (isNull)
{
return null;
}

if (bytes.Length == 0)
{
return string.Empty;
Expand All @@ -93,6 +107,50 @@ public string GetString(int index, Encoding encoding = default)
}
}

/// <summary>
/// Decodes every element once with the given encoding and caches the resulting
/// strings, so later <see cref="GetString(int, Encoding)"/> calls for that encoding
/// are served from the cache.
/// </summary>
/// <param name="encoding">Optional: the encoding used to decode the values; UTF8 when omitted</param>
/// <remarks>
/// Calling this more than once for the same encoding is a no-op.
/// This method is not thread safe when it is called in parallel with <see cref="GetString(int, Encoding)"/> or <see cref="Materialize(Encoding)"/>.
/// </remarks>
public void Materialize(Encoding encoding = default)
{
    encoding ??= DefaultEncoding;

    if (!IsMaterialized(encoding))
    {
        // Create the per-encoding cache lazily, on first materialization.
        materializedStringStore ??= new Dictionary<Encoding, string[]>();

        // Fill a local array first; it is published to the cache only after
        // every element has been decoded.
        string[] decoded = new string[Length];
        for (int position = 0; position < Length; position++)
        {
            decoded[position] = GetString(position, encoding);
        }

        materializedStringStore[encoding] = decoded;
    }
}

/// <summary>
/// Reports whether the strings of this array have already been cached for the given encoding
/// </summary>
/// <param name="encoding">Optional: the string encoding, default is UTF8</param>
/// <returns>True if a cached string table exists for the encoding; otherwise false</returns>
public bool IsMaterialized(Encoding encoding = default)
{
    // No cache has been created yet, so nothing can be materialized.
    return materializedStringStore != null
        && materializedStringStore.ContainsKey(encoding ?? DefaultEncoding);
}

int IReadOnlyCollection<string>.Count => Length;

string IReadOnlyList<string>.this[int index] => GetString(index);
Expand Down
31 changes: 31 additions & 0 deletions csharp/test/Apache.Arrow.Tests/StringArrayTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,37 @@ public void ReturnsAppendedValue(string firstValue, string secondValue)
// Assert
Assert.Equal(firstValue, retrievedValue);
}

[Theory]
[InlineData(null, null)]
[InlineData(null, "")]
[InlineData(null, "value")]
[InlineData("", null)]
[InlineData("", "")]
[InlineData("", "value")]
[InlineData("value", null)]
[InlineData("value", "")]
[InlineData("value", "value")]
public void ReturnsAppendedValueMaterialize(string firstValue, string secondValue)
{
    // Arrange
    // Build a two-element array. Whether the second element is null, empty,
    // or non-empty may change how the underlying BinaryArray lays out its
    // storage, so an empty first element could come back either as an empty
    // span or as a 0-length span backed by storage.
    var builder = new StringArray.Builder()
        .Append(firstValue)
        .Append(secondValue);
    var stringArray = builder.Build();

    // Act
    // Materialize first so the read below goes through the cached strings.
    stringArray.Materialize();
    var actual = stringArray.GetString(0);

    // Assert
    Assert.True(stringArray.IsMaterialized());
    Assert.Equal(firstValue, actual);
}
}
}
}
Loading