Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-41047: [C#] Address performance issue of reading from StringArray #41048

Merged
merged 3 commits into from
Apr 8, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions csharp/src/Apache.Arrow/Arrays/StringArray.cs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ public class StringArray: BinaryArray, IReadOnlyList<string>
{
public static readonly Encoding DefaultEncoding = Encoding.UTF8;

// Per-encoding cache of fully decoded strings: filled by Materialize and consulted by GetString.
// readonly because the dictionary instance is never replaced; only entries are added to it.
private readonly Dictionary<Encoding, List<string>> materializedStringStore = new Dictionary<Encoding, List<string>>();

public new class Builder : BuilderBase<StringArray, Builder>
{
public Builder() : base(StringType.Default) { }
Expand Down Expand Up @@ -75,6 +77,11 @@ public string GetString(int index, Encoding encoding = default)
{
encoding ??= DefaultEncoding;

if (materializedStringStore.TryGetValue(encoding, out List<string> materializedStrings))
{
return materializedStrings[index];
}

ReadOnlySpan<byte> bytes = GetBytes(index, out bool isNull);

if (isNull)
Expand All @@ -93,6 +100,30 @@ public string GetString(int index, Encoding encoding = default)
}
}

/// <summary>
/// Decodes every element of this array with the given encoding and stores the
/// results, so that later <see cref="GetString(int, Encoding)"/> calls for the
/// same encoding return the stored string instead of decoding again.
/// Does nothing when the encoding has already been materialized.
/// </summary>
/// <param name="encoding">Encoding used to decode the values; <see cref="DefaultEncoding"/> is used when null.</param>
public void Materialize(Encoding encoding = default)
{
    Encoding effectiveEncoding = encoding ?? DefaultEncoding;

    if (!IsMaterialized(effectiveEncoding))
    {
        // Build the complete list first and publish it only once it is full;
        // while the loop runs, GetString still takes its decode path because
        // no entry for this encoding exists in the store yet.
        var decodedValues = new List<string>(Length);
        for (int position = 0; position < Length; position++)
        {
            string decoded = GetString(position, effectiveEncoding);
            decodedValues.Add(decoded);
        }

        materializedStringStore[effectiveEncoding] = decodedValues;
    }
}

/// <summary>
/// Reports whether the strings of this array have been materialized for the given encoding.
/// </summary>
/// <param name="encoding">Encoding to look up; <see cref="DefaultEncoding"/> is used when null.</param>
/// <returns><c>true</c> when the materialized store holds an entry for the encoding; otherwise <c>false</c>.</returns>
public bool IsMaterialized(Encoding encoding = default) =>
    materializedStringStore.ContainsKey(encoding ?? DefaultEncoding);

// Explicit IReadOnlyCollection<string> implementation: the element count is this array's Length.
int IReadOnlyCollection<string>.Count => Length;

// Explicit IReadOnlyList<string> indexer: forwards to GetString without passing an encoding.
string IReadOnlyList<string>.this[int index] => GetString(index);
Expand Down
31 changes: 31 additions & 0 deletions csharp/test/Apache.Arrow.Tests/StringArrayTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,37 @@ public void ReturnsAppendedValue(string firstValue, string secondValue)
// Assert
Assert.Equal(firstValue, retrievedValue);
}

// Verifies that values read back after Materialize() match what was appended,
// for every combination of null, empty and non-empty first/second elements.
[Theory]
[InlineData(null, null)]
[InlineData(null, "")]
[InlineData(null, "value")]
[InlineData("", null)]
[InlineData("", "")]
[InlineData("", "value")]
[InlineData("value", null)]
[InlineData("value", "")]
[InlineData("value", "value")]
public void ReturnsAppendedValueMaterialize(string firstValue, string secondValue)
{
    // Arrange
    // Create an array with two elements. The second element being null,
    // empty, or non-empty may influence the underlying BinaryArray
    // storage such that retrieving an empty first element could result
    // in an empty span or a 0-length span backed by storage.
    var array = new StringArray.Builder()
        .Append(firstValue)
        .Append(secondValue)
        .Build();

    // A freshly built array must not report itself as materialized; without
    // this check the IsMaterialized assertion below could pass trivially.
    Assert.False(array.IsMaterialized());

    // Act
    array.Materialize();
    var retrievedFirst = array.GetString(0);
    var retrievedSecond = array.GetString(1);

    // Assert
    Assert.True(array.IsMaterialized());
    Assert.Equal(firstValue, retrievedFirst);
    // Also read the second element so every cached entry is checked, not only index 0.
    Assert.Equal(secondValue, retrievedSecond);
}
}
}
}
Loading