Skip to content

Commit

Permalink
fix(validation): additional URN validation adjustments (datahub-proje…
Browse files Browse the repository at this point in the history
  • Loading branch information
david-leifker authored and sleeperdeep committed Dec 17, 2024
1 parent 1cde28f commit d9de671
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 20 deletions.
16 changes: 11 additions & 5 deletions docs/what/urn.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,17 @@ urn:li:dataset:(urn:li:dataPlatform:hdfs,PageViewEvent,EI)

## Restrictions

There are a few restrictions when creating an urn:
There are a few restrictions when creating an URN:

1. Commas are reserved character in URN fields: `,`
2. Parentheses are reserved characters in URN fields: `(` or `)`
3. Colons are reserved characters in URN fields: `:`
4. Urn separator UTF-8 character ``
The following characters are not allowed anywhere in the URN

1. Parentheses are reserved characters in URN fields: `(` or `)`
2. The "unit separator" unicode character `` (U+241F)

The following characters are not allowed within an URN tuple.

1. Commas are reserved characters in URN tuples: `,`

Example: `urn:li:dashboard:(looker,dashboards.thelook)` is a valid urn, but `urn:li:dashboard:(looker,dashboards.the,look)` is invalid.

Please do not use these characters when creating or generating urns. One approach is to use URL encoding for the characters.
4 changes: 4 additions & 0 deletions metadata-io/metadata-io-api/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,7 @@ dependencies {
testImplementation externalDependency.lombok
testAnnotationProcessor externalDependency.lombok
}

test {
environment 'STRICT_URN_VALIDATION_ENABLED', 'true'
}
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ public class ValidationApiUtils {
// Related to BrowsePathv2
public static final String URN_DELIMITER_SEPARATOR = "␟";
// https://datahubproject.io/docs/what/urn/#restrictions
public static final Set<String> ILLEGAL_URN_COMPONENT_CHARACTERS = Set.of(":", "(", ")", ",");
public static final Set<String> ILLEGAL_URN_COMPONENT_CHARACTERS = Set.of("(", ")");
public static final Set<String> ILLEGAL_URN_TUPLE_CHARACTERS = Set.of(",");

/**
* Validates a {@link RecordTemplate} and throws {@link ValidationException} if validation fails.
Expand Down Expand Up @@ -86,11 +87,10 @@ public static void validateUrn(
"Error: URN cannot contain " + URN_DELIMITER_SEPARATOR + " character");
}

int totalParts = urn.getEntityKey().getParts().size();
List<String> illegalComponents =
urn.getEntityKey().getParts().stream()
.flatMap(ValidationApiUtils::processUrnPartRecursively)
.filter(
urnPart -> ILLEGAL_URN_COMPONENT_CHARACTERS.stream().anyMatch(urnPart::contains))
.flatMap(part -> processUrnPartRecursively(part, totalParts))
.collect(Collectors.toList());

if (!illegalComponents.isEmpty()) {
Expand All @@ -114,15 +114,25 @@ public static void validateUrn(
}

/** Recursively process URN parts with URL decoding */
private static Stream<String> processUrnPartRecursively(String urnPart) {
private static Stream<String> processUrnPartRecursively(String urnPart, int totalParts) {
String decodedPart =
URLDecoder.decode(URLEncodingFixer.fixURLEncoding(urnPart), StandardCharsets.UTF_8);
if (decodedPart.startsWith("urn:li:")) {
// Recursively process nested URN after decoding
int nestedParts = UrnUtils.getUrn(decodedPart).getEntityKey().getParts().size();
return UrnUtils.getUrn(decodedPart).getEntityKey().getParts().stream()
.flatMap(ValidationApiUtils::processUrnPartRecursively);
.flatMap(part -> processUrnPartRecursively(part, nestedParts));
}
return Stream.of(decodedPart);
if (totalParts > 1) {
if (ILLEGAL_URN_TUPLE_CHARACTERS.stream().anyMatch(c -> urnPart.contains(c))) {
return Stream.of(urnPart);
}
}
if (ILLEGAL_URN_COMPONENT_CHARACTERS.stream().anyMatch(c -> urnPart.contains(c))) {
return Stream.of(urnPart);
}

return Stream.empty();
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,36 @@ public void testValidateDatasetUrn() {
// If no exception is thrown, test passes
}

@Test(expectedExceptions = IllegalArgumentException.class)
@Test
public void testSimpleUrnColon() {
Urn invalidUrn = UrnUtils.getUrn("urn:li:corpuser:foo:bar");
ValidationApiUtils.validateUrn(entityRegistry, invalidUrn, true);
ValidationApiUtils.validateUrn(
entityRegistry, UrnUtils.getUrn("urn:li:corpuser:foo:bar"), true);
ValidationApiUtils.validateUrn(
entityRegistry, UrnUtils.getUrn("urn:li:dataPlatform:abc:def"), true);
ValidationApiUtils.validateUrn(
entityRegistry, UrnUtils.getUrn("urn:li:corpuser:foo:[email protected]"), true);
// If no exception is thrown, test passes
}

@Test
public void testSimpleUrnComma() {
ValidationApiUtils.validateUrn(entityRegistry, UrnUtils.getUrn("urn:li:corpuser:,"), true);
// If no exception is thrown, test passes
}

@Test(expectedExceptions = IllegalArgumentException.class)
public void testTupleUrnComma() {
ValidationApiUtils.validateUrn(
entityRegistry, UrnUtils.getUrn("urn:li:dashboard:(looker,dashboards,thelook)"), true);
}

@Test(expectedExceptions = IllegalArgumentException.class)
public void testFabricTypeCasing() {
// prod != PROD
ValidationApiUtils.validateUrn(
entityRegistry,
UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:abc:def,table_name,prod)"),
true);
}

@Test
Expand All @@ -34,7 +60,7 @@ public void testComplexUrnColon() throws URISyntaxException {
}

@Test(expectedExceptions = IllegalArgumentException.class)
public void testUrnFabricType() {
public void testFabricTypeParen() {
Urn invalidUrn = UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:hdfs,/path/to/data,())");
ValidationApiUtils.validateUrn(entityRegistry, invalidUrn, true);
}
Expand Down Expand Up @@ -83,20 +109,20 @@ public void testValidComplexUrn() {
UrnUtils.getUrn(
"urn:li:dataset:(urn:li:dataPlatform:bigquery,myproject.dataset.table,PROD)");

ValidationApiUtils.validateUrn(entityRegistry, validUrn);
ValidationApiUtils.validateUrn(entityRegistry, validUrn, true);
// If no exception is thrown, test passes
}

@Test(expectedExceptions = NullPointerException.class)
public void testUrnNull() {
ValidationApiUtils.validateUrn(entityRegistry, null);
ValidationApiUtils.validateUrn(entityRegistry, null, true);
}

@Test
public void testValidPartialUrlEncode() {
Urn validUrn = UrnUtils.getUrn("urn:li:assertion:123=-%28__% weekly__%29");

ValidationApiUtils.validateUrn(entityRegistry, validUrn);
ValidationApiUtils.validateUrn(entityRegistry, validUrn, true);
// If no exception is thrown, test passes
}

Expand All @@ -106,7 +132,23 @@ public void testValidPartialUrlEncode2() {
UrnUtils.getUrn(
"urn:li:dataset:(urn:li:dataPlatform:s3,urn:li:dataset:%28urn:li:dataPlatform:s3%2Ctest-datalake-concepts%prog_maintenance%2CPROD%29,PROD)");

ValidationApiUtils.validateUrn(entityRegistry, validUrn);
ValidationApiUtils.validateUrn(entityRegistry, validUrn, true);
// If no exception is thrown, test passes
}

@Test
public void testValidColon() {
Urn validUrn =
UrnUtils.getUrn("urn:li:dashboard:(looker,dashboards.thelook::cohort_data_tool)");

ValidationApiUtils.validateUrn(entityRegistry, validUrn, true);
// If no exception is thrown, test passes
}

@Test
public void testNoTupleComma() {
Urn invalidUrn = UrnUtils.getUrn("urn:li:corpuser:,");
ValidationApiUtils.validateUrn(entityRegistry, invalidUrn, true);
// If no exception is thrown, test passes
}
}

0 comments on commit d9de671

Please sign in to comment.