Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

stream/field name validation/filtering #1004

Merged
merged 8 commits into from
Nov 18, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion airbyte-protocol/models/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ plugins {
}

dependencies {
implementation group: 'javax.validation', name: 'validation-api', version: '1.1.0.Final'
implementation 'javax.validation:validation-api:1.1.0.Final'
implementation 'org.apache.commons:commons-lang3:3.11'
}

jsonSchema2Pojo {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,21 @@
package io.airbyte.protocol.models;

import com.fasterxml.jackson.databind.JsonNode;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;
import io.airbyte.commons.json.Jsons;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;

public class CatalogHelpers {

Expand Down Expand Up @@ -77,4 +84,62 @@ public static Set<String> getTopLevelFieldNames(final AirbyteStream stream) {
return object.keySet();
}

/**
 * Collects the key of every object found anywhere inside the given node. Nested objects are
 * visited recursively, including objects that are elements of an array (directly or through
 * further nesting), so that no key within the node is missed.
 *
 * @param node any json node
 * @return a set of all keys for all objects within the node; empty when the node holds no objects
 */
@VisibleForTesting
protected static Set<String> getAllFieldNames(JsonNode node) {
  Set<String> allFieldNames = new HashSet<>();

  if (node.isArray()) {
    // an array has no keys of its own, but its elements may be (or may contain) objects
    for (JsonNode element : node) {
      allFieldNames.addAll(getAllFieldNames(element));
    }
    return allFieldNames;
  }

  Iterator<String> fieldNames = node.fieldNames();
  while (fieldNames.hasNext()) {
    String fieldName = fieldNames.next();
    allFieldNames.add(fieldName);
    JsonNode fieldValue = node.get(fieldName);
    // descend into both kinds of container; scalar values cannot hold further keys
    if (fieldValue.isObject() || fieldValue.isArray()) {
      allFieldNames.addAll(getAllFieldNames(fieldValue));
    }
  }

  return allFieldNames;
}

/**
 * Checks whether a stream or field name consists solely of letters, digits and underscores.
 * Letters and digits are recognised with {@link Character#isLetterOrDigit(char)}, so non-ASCII
 * letters are accepted. The name must contain at least one letter or digit: a null, empty or
 * underscore-only identifier is reported as invalid.
 *
 * @param identifier stream name or field name; may be null
 * @return if the identifier matches the alphanumeric+underscore requirement for identifiers
 */
public static boolean isValidIdentifier(String identifier) {
  if (identifier == null) {
    // a missing name can never be a usable identifier; report it rather than throwing
    return false;
  }

  boolean hasLetterOrDigit = false;
  for (int i = 0; i < identifier.length(); i++) {
    final char current = identifier.charAt(i);
    if (current == '_') {
      // underscores are allowed anywhere but do not count as content
      continue;
    }
    if (!Character.isLetterOrDigit(current)) {
      return false;
    }
    hasLetterOrDigit = true;
  }

  return hasLetterOrDigit;
}

/**
 * Lists the names of the catalog's streams that fail identifier validation, in the order the
 * streams appear in the catalog.
 *
 * @param catalog airbyte catalog
 * @return list of stream names in the catalog that are invalid
 */
public static List<String> getInvalidStreamNames(AirbyteCatalog catalog) {
  return catalog.getStreams().stream()
      .filter(stream -> !isValidIdentifier(stream.getName()))
      .map(stream -> stream.getName())
      .collect(Collectors.toList());
}

/**
 * Finds, for every stream in the catalog, the field names in its json schema that fail
 * identifier validation. Streams without any invalid field name do not appear in the result.
 *
 * @param catalog airbyte catalog
 * @return multimap of stream names to all invalid field names in that stream
 */
public static Multimap<String, String> getInvalidFieldNames(AirbyteCatalog catalog) {
  final Multimap<String, String> invalidFieldsByStream = Multimaps.newSetMultimap(new HashMap<>(), HashSet::new);

  for (final AirbyteStream stream : catalog.getStreams()) {
    for (final String fieldName : getAllFieldNames(stream.getJsonSchema())) {
      if (!isValidIdentifier(fieldName)) {
        invalidFieldsByStream.put(stream.getName(), fieldName);
      }
    }
  }

  return invalidFieldsByStream;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,21 @@
package io.airbyte.protocol.models;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertIterableEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

import com.fasterxml.jackson.databind.JsonNode;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import io.airbyte.commons.json.Jsons;
import io.airbyte.commons.resources.MoreResources;
import io.airbyte.protocol.models.Field.JsonSchemaPrimitive;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import org.junit.jupiter.api.Test;

Expand All @@ -51,4 +61,76 @@ void testGetTopLevelFieldNames() {
assertEquals(Sets.newHashSet("name"), actualFieldNames);
}

@Test
void testValidIdentifiers() {
  // every entry uses only letters (ASCII or not), digits and underscores
  final List<String> validIdentifiers = List.of(
      "identifier_name",
      "iDenTiFieR_name",
      "__identifier_name",
      "IDENTIFIER_NAME",
      "123identifier_name",
      "i0d0e0n0t0i0f0i0e0r0n0a0m0e",
      "identifiêr",
      "a_unicode_name_文",
      "identifier__name__");

  for (final String identifier : validIdentifiers) {
    assertTrue(CatalogHelpers.isValidIdentifier(identifier));
  }
}

@Test
void testInvalidIdentifiers() {
  // every entry contains at least one character that is not a letter, digit or underscore
  final List<String> invalidIdentifiers = List.of(
      "invalid-identifier",
      "\"identifier name",
      "$identifier",
      "identifier name",
      "identifier%",
      "`identifier`",
      "'identifier'");

  for (final String identifier : invalidIdentifiers) {
    assertFalse(CatalogHelpers.isValidIdentifier(identifier));
  }
}

@Test
void testGetInvalidStreamNames() {
  // a stream whose name passes validation; it must not be reported
  final AirbyteStream validStream = new AirbyteStream();
  validStream.setName("Valid_Stream");

  // a stream whose name contains a space; it is the only one expected in the result
  final String invalidStreamName = "invalid stream";
  final AirbyteStream invalidStream = new AirbyteStream();
  invalidStream.setName(invalidStreamName);

  final AirbyteCatalog catalog = new AirbyteCatalog();
  catalog.setStreams(List.of(validStream, invalidStream));

  assertIterableEquals(
      Collections.singleton(invalidStreamName),
      CatalogHelpers.getInvalidStreamNames(catalog));
}

@Test
void testGetFieldNames() throws IOException {
  // keys expected from valid_schema.json, nested objects included
  final Set<String> expectedFieldNames =
      Sets.newHashSet("type", "properties", "format", "date", "CAD", "HKD", "ISK", "PHP", "DKK", "HUF", "文");

  final JsonNode schema = Jsons.deserialize(MoreResources.readResource("valid_schema.json"));

  assertEquals(expectedFieldNames, CatalogHelpers.getAllFieldNames(schema));
}

@Test
void testGetInvalidFieldNames() throws IOException {
  // stream with a valid name and a schema that only has valid field names
  final String validStreamName = "Valid_Stream";
  final AirbyteStream validStream = new AirbyteStream();
  validStream.setName(validStreamName);
  JsonNode validSchema = Jsons.deserialize(MoreResources.readResource("valid_schema.json"));
  validStream.setJsonSchema(validSchema);

  // stream whose schema contains the invalid keys "C A D" and "\"type"
  final String invalidStreamName = "invalid stream";
  AirbyteStream invalidStream = new AirbyteStream();
  invalidStream.setName(invalidStreamName);
  JsonNode invalidSchema = Jsons.deserialize(MoreResources.readResource("invalid_schema.json"));
  invalidStream.setJsonSchema(invalidSchema);

  AirbyteCatalog catalog = new AirbyteCatalog();
  catalog.setStreams(List.of(validStream, invalidStream));

  Multimap<String, String> streamNameToInvalidFieldNames = CatalogHelpers.getInvalidFieldNames(catalog);
  assertIterableEquals(Collections.singleton(invalidStreamName), streamNameToInvalidFieldNames.keySet());
  // the invalid names come back in no guaranteed order, so compare them as sets
  assertEquals(ImmutableSet.of("C A D", "\"type"), Sets.newHashSet(streamNameToInvalidFieldNames.get(invalidStreamName)));
}

}
13 changes: 13 additions & 0 deletions airbyte-protocol/models/src/test/resources/invalid_schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"type": "object",
"properties": {
"date": { "type": "string", "format": "date-time" },
"C A D": { "type": ["null", "number"] },
"HKD": { "type": ["null", "number"] },
"ISK": { "type": ["null", "number"] },
"PHP": { "type": ["null", "number"] },
"DKK": { "type": ["null", "number"] },
"HUF": { "\"type": ["null", "number"] },
"CZK": { "type": ["null", "number"] }
}
}
13 changes: 13 additions & 0 deletions airbyte-protocol/models/src/test/resources/valid_schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should have a test with Unicode

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added

"type": "object",
"properties": {
"date": { "type": "string", "format": "date-time" },
"CAD": { "type": ["null", "number"] },
"HKD": { "type": ["null", "number"] },
"ISK": { "type": ["null", "number"] },
"PHP": { "type": ["null", "number"] },
"DKK": { "type": ["null", "number"] },
"HUF": { "type": ["null", "number"] },
"文": { "type": ["null", "number"] }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ private WorkerRun createSyncWorker(JobSyncConfig config, Path jobRoot) {
jobRoot,
syncInput,
new JobOutputSyncWorker(
new DefaultSyncWorker<>(
new DefaultSyncWorker(
new DefaultAirbyteSource(sourceLauncher),
new DefaultAirbyteDestination(destinationLauncher),
new AirbyteMessageTracker(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
package io.airbyte.workers;

import com.fasterxml.jackson.databind.JsonNode;
import com.google.common.collect.Multimap;
import io.airbyte.commons.io.IOs;
import io.airbyte.commons.io.LineGobbler;
import io.airbyte.commons.json.Jsons;
Expand All @@ -33,11 +34,13 @@
import io.airbyte.protocol.models.AirbyteCatalog;
import io.airbyte.protocol.models.AirbyteMessage;
import io.airbyte.protocol.models.AirbyteMessage.Type;
import io.airbyte.protocol.models.CatalogHelpers;
import io.airbyte.workers.process.IntegrationLauncher;
import io.airbyte.workers.protocols.airbyte.AirbyteStreamFactory;
import io.airbyte.workers.protocols.airbyte.DefaultAirbyteStreamFactory;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import org.slf4j.Logger;
Expand Down Expand Up @@ -87,7 +90,21 @@ public OutputAndStatus<StandardDiscoverCatalogOutput> run(final StandardDiscover
int exitCode = process.exitValue();
if (exitCode == 0) {
if (catalog.isEmpty()) {
LOGGER.error("integration failed to output a catalog struct.");
LOGGER.error("Integration failed to output a catalog struct.");
return new OutputAndStatus<>(JobStatus.FAILED);
}

List<String> invalidStreamNames = CatalogHelpers.getInvalidStreamNames(catalog.get());

if (!invalidStreamNames.isEmpty()) {
invalidStreamNames.forEach(streamName -> LOGGER.error("Cannot sync invalid stream name: " + streamName));
return new OutputAndStatus<>(JobStatus.FAILED);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we also return a message describing why discovery failed, so that we can show it in the UI?

Copy link
Contributor Author

@jrhizor jrhizor Nov 18, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OutputAndStatus doesn't allow for error messages unfortunately. Adding messages outside of logs seems a bit out of scope for this PR. Added #1011

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand that, but I'm worried that in its current state there is no way for the user to know what's wrong. It'll just look like Airbyte is broken if, e.g., they sync a Google Sheet with a space in its name, and all they'll see in the UI is "an error happened".

}

Multimap<String, String> streamNameToInvalidFieldNames = CatalogHelpers.getInvalidFieldNames(catalog.get());
if (!streamNameToInvalidFieldNames.isEmpty()) {
streamNameToInvalidFieldNames
.forEach((streamName, fieldNames) -> LOGGER.error("Cannot sync invalid field names for stream " + streamName + ": " + fieldNames));
return new OutputAndStatus<>(JobStatus.FAILED);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,37 +24,46 @@

package io.airbyte.workers;

import com.google.common.collect.Sets;
import io.airbyte.config.StandardSyncInput;
import io.airbyte.config.StandardSyncOutput;
import io.airbyte.config.StandardSyncSummary;
import io.airbyte.config.StandardTapConfig;
import io.airbyte.config.StandardTargetConfig;
import io.airbyte.config.State;
import io.airbyte.protocol.models.AirbyteCatalog;
import io.airbyte.protocol.models.AirbyteMessage;
import io.airbyte.protocol.models.AirbyteStream;
import io.airbyte.protocol.models.CatalogHelpers;
import io.airbyte.workers.normalization.NormalizationRunner;
import io.airbyte.workers.protocols.Destination;
import io.airbyte.workers.protocols.MessageTracker;
import io.airbyte.workers.protocols.Source;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.stream.Collectors;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class DefaultSyncWorker<T> implements SyncWorker {
public class DefaultSyncWorker implements SyncWorker {

private static final Logger LOGGER = LoggerFactory.getLogger(DefaultSyncWorker.class);

private final Source<T> source;
private final Destination<T> destination;
private final MessageTracker<T> messageTracker;
private final Source<AirbyteMessage> source;
private final Destination<AirbyteMessage> destination;
private final MessageTracker<AirbyteMessage> messageTracker;
private final NormalizationRunner normalizationRunner;

private final AtomicBoolean cancelled;

public DefaultSyncWorker(final Source<T> source,
final Destination<T> destination,
final MessageTracker<T> messageTracker,
public DefaultSyncWorker(final Source<AirbyteMessage> source,
final Destination<AirbyteMessage> destination,
final MessageTracker<AirbyteMessage> messageTracker,
final NormalizationRunner normalizationRunner) {
this.source = source;
this.destination = destination;
Expand All @@ -68,6 +77,9 @@ public DefaultSyncWorker(final Source<T> source,
public OutputAndStatus<StandardSyncOutput> run(StandardSyncInput syncInput, Path jobRoot) {
long startTime = System.currentTimeMillis();

// clean catalog object
removeInvalidStreams(syncInput.getCatalog());

final StandardTapConfig tapConfig = WorkerUtils.syncToTapConfig(syncInput);
final StandardTargetConfig targetConfig = WorkerUtils.syncToTargetConfig(syncInput);

Expand All @@ -77,11 +89,16 @@ public OutputAndStatus<StandardSyncOutput> run(StandardSyncInput syncInput, Path
source.start(tapConfig, jobRoot);

while (!cancelled.get() && !source.isFinished()) {
final Optional<T> maybeMessage = source.attemptRead();
final Optional<AirbyteMessage> maybeMessage = source.attemptRead();
if (maybeMessage.isPresent()) {
final T message = maybeMessage.get();
messageTracker.accept(message);
destination.accept(message);
final AirbyteMessage message = maybeMessage.get();

if (message.getType().equals(AirbyteMessage.Type.RECORD) && !CatalogHelpers.isValidIdentifier(message.getRecord().getStream())) {
LOGGER.error("Filtered out record for invalid stream: " + message.getRecord().getStream());
} else {
messageTracker.accept(message);
destination.accept(message);
}
}
}

Expand Down Expand Up @@ -122,4 +139,16 @@ public void cancel() {
cancelled.set(true);
}

/**
 * Replaces the catalog's stream list with one that omits every stream that has an invalid name
 * or at least one invalid field name. The catalog passed in is modified in place.
 *
 * @param catalog catalog to clean
 */
private void removeInvalidStreams(AirbyteCatalog catalog) {
  // names of streams rejected for their own name or for one of their field names
  final Set<String> namesToDrop = new HashSet<>(CatalogHelpers.getInvalidStreamNames(catalog));
  namesToDrop.addAll(CatalogHelpers.getInvalidFieldNames(catalog).keySet());

  final List<AirbyteStream> keptStreams = catalog.getStreams().stream()
      .filter(stream -> !namesToDrop.contains(stream.getName()))
      .collect(Collectors.toList());

  catalog.setStreams(keptStreams);
}

}
Loading