diff --git a/docs/what/urn.md b/docs/what/urn.md index 2f4dffb985653c..c7fb0555cd992f 100644 --- a/docs/what/urn.md +++ b/docs/what/urn.md @@ -35,11 +35,17 @@ urn:li:dataset:(urn:li:dataPlatform:hdfs,PageViewEvent,EI) ## Restrictions -There are a few restrictions when creating an urn: +There are a few restrictions when creating an URN: -1. Commas are reserved character in URN fields: `,` -2. Parentheses are reserved characters in URN fields: `(` or `)` -3. Colons are reserved characters in URN fields: `:` -4. Urn separator UTF-8 character `␟` +The following characters are not allowed anywhere in the URN + +1. Parentheses are reserved characters in URN fields: `(` or `)` +2. The "unit separator" unicode character `␟` (U+241F) + +The following characters are not allowed within an URN tuple. + +1. Commas are reserved characters in URN tuples: `,` + +Example: `urn:li:dashboard:(looker,dashboards.thelook)` is a valid urn, but `urn:li:dashboard:(looker,dashboards.the,look)` is invalid. Please do not use these characters when creating or generating urns. One approach is to use URL encoding for the characters. diff --git a/metadata-io/metadata-io-api/build.gradle b/metadata-io/metadata-io-api/build.gradle index b8028fad07bb65..5273177b752819 100644 --- a/metadata-io/metadata-io-api/build.gradle +++ b/metadata-io/metadata-io-api/build.gradle @@ -16,3 +16,7 @@ dependencies { testImplementation externalDependency.lombok testAnnotationProcessor externalDependency.lombok } + +test { + environment 'STRICT_URN_VALIDATION_ENABLED', 'true' +} \ No newline at end of file diff --git a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/validation/ValidationApiUtils.java b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/validation/ValidationApiUtils.java index c2e1c47eca1fdb..5e1f09fcc64393 100644 --- a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/validation/ValidationApiUtils.java +++ b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/validation/ValidationApiUtils.java @@ -30,7 +30,8 @@ public class ValidationApiUtils { // Related to BrowsePathv2 public static final String URN_DELIMITER_SEPARATOR = "␟"; // https://datahubproject.io/docs/what/urn/#restrictions - public static final Set ILLEGAL_URN_COMPONENT_CHARACTERS = Set.of(":", "(", ")", ","); + public static final Set ILLEGAL_URN_COMPONENT_CHARACTERS = Set.of("(", ")"); + public static final Set ILLEGAL_URN_TUPLE_CHARACTERS = Set.of(","); /** * Validates a {@link RecordTemplate} and throws {@link ValidationException} if validation fails. @@ -86,11 +87,10 @@ public static void validateUrn( "Error: URN cannot contain " + URN_DELIMITER_SEPARATOR + " character"); } + int totalParts = urn.getEntityKey().getParts().size(); List illegalComponents = urn.getEntityKey().getParts().stream() - .flatMap(ValidationApiUtils::processUrnPartRecursively) - .filter( - urnPart -> ILLEGAL_URN_COMPONENT_CHARACTERS.stream().anyMatch(urnPart::contains)) + .flatMap(part -> processUrnPartRecursively(part, totalParts)) .collect(Collectors.toList()); if (!illegalComponents.isEmpty()) { @@ -114,15 +114,25 @@ public static void validateUrn( } /** Recursively process URN parts with URL decoding */ - private static Stream processUrnPartRecursively(String urnPart) { + private static Stream processUrnPartRecursively(String urnPart, int totalParts) { String decodedPart = URLDecoder.decode(URLEncodingFixer.fixURLEncoding(urnPart), StandardCharsets.UTF_8); if (decodedPart.startsWith("urn:li:")) { // Recursively process nested URN after decoding + int nestedParts = UrnUtils.getUrn(decodedPart).getEntityKey().getParts().size(); return UrnUtils.getUrn(decodedPart).getEntityKey().getParts().stream() - .flatMap(ValidationApiUtils::processUrnPartRecursively); + .flatMap(part -> processUrnPartRecursively(part, nestedParts)); } - return Stream.of(decodedPart); + if (totalParts > 1) { + if (ILLEGAL_URN_TUPLE_CHARACTERS.stream().anyMatch(c -> urnPart.contains(c))) { + return Stream.of(urnPart); + } + } + if (ILLEGAL_URN_COMPONENT_CHARACTERS.stream().anyMatch(c -> urnPart.contains(c))) { + return Stream.of(urnPart); + } + + return Stream.empty(); } /** diff --git a/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/validation/ValidationApiUtilsTest.java b/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/validation/ValidationApiUtilsTest.java index e683e594d8766c..a2c9a15d92f90a 100644 --- a/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/validation/ValidationApiUtilsTest.java +++ b/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/validation/ValidationApiUtilsTest.java @@ -18,10 +18,36 @@ public void testValidateDatasetUrn() { // If no exception is thrown, test passes } - @Test(expectedExceptions = IllegalArgumentException.class) + @Test public void testSimpleUrnColon() { - Urn invalidUrn = UrnUtils.getUrn("urn:li:corpuser:foo:bar"); - ValidationApiUtils.validateUrn(entityRegistry, invalidUrn, true); + ValidationApiUtils.validateUrn( + entityRegistry, UrnUtils.getUrn("urn:li:corpuser:foo:bar"), true); + ValidationApiUtils.validateUrn( + entityRegistry, UrnUtils.getUrn("urn:li:dataPlatform:abc:def"), true); + ValidationApiUtils.validateUrn( + entityRegistry, UrnUtils.getUrn("urn:li:corpuser:foo:bar@example.com"), true); + // If no exception is thrown, test passes + } + + @Test + public void testSimpleUrnComma() { + ValidationApiUtils.validateUrn(entityRegistry, UrnUtils.getUrn("urn:li:corpuser:,"), true); + // If no exception is thrown, test passes + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testTupleUrnComma() { + ValidationApiUtils.validateUrn( + entityRegistry, UrnUtils.getUrn("urn:li:dashboard:(looker,dashboards,thelook)"), true); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testFabricTypeCasing() { + // prod != PROD + ValidationApiUtils.validateUrn( + entityRegistry, + UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:abc:def,table_name,prod)"), + true); } @Test @@ -34,7 +60,7 @@ public void testComplexUrnColon() throws URISyntaxException { } @Test(expectedExceptions = IllegalArgumentException.class) - public void testUrnFabricType() { + public void testFabricTypeParen() { Urn invalidUrn = UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:hdfs,/path/to/data,())"); ValidationApiUtils.validateUrn(entityRegistry, invalidUrn, true); } @@ -83,20 +109,20 @@ public void testValidComplexUrn() { UrnUtils.getUrn( "urn:li:dataset:(urn:li:dataPlatform:bigquery,myproject.dataset.table,PROD)"); - ValidationApiUtils.validateUrn(entityRegistry, validUrn); + ValidationApiUtils.validateUrn(entityRegistry, validUrn, true); // If no exception is thrown, test passes } @Test(expectedExceptions = NullPointerException.class) public void testUrnNull() { - ValidationApiUtils.validateUrn(entityRegistry, null); + ValidationApiUtils.validateUrn(entityRegistry, null, true); } @Test public void testValidPartialUrlEncode() { Urn validUrn = UrnUtils.getUrn("urn:li:assertion:123=-%28__% weekly__%29"); - ValidationApiUtils.validateUrn(entityRegistry, validUrn); + ValidationApiUtils.validateUrn(entityRegistry, validUrn, true); // If no exception is thrown, test passes } @@ -106,7 +132,23 @@ public void testValidPartialUrlEncode2() { UrnUtils.getUrn( "urn:li:dataset:(urn:li:dataPlatform:s3,urn:li:dataset:%28urn:li:dataPlatform:s3%2Ctest-datalake-concepts%prog_maintenance%2CPROD%29,PROD)"); - ValidationApiUtils.validateUrn(entityRegistry, validUrn); + ValidationApiUtils.validateUrn(entityRegistry, validUrn, true); + // If no exception is thrown, test passes + } + + @Test + public void testValidColon() { + Urn validUrn = + UrnUtils.getUrn("urn:li:dashboard:(looker,dashboards.thelook::cohort_data_tool)"); + + ValidationApiUtils.validateUrn(entityRegistry, validUrn, true); + // If no exception is thrown, test passes + } + + @Test + public void testNoTupleComma() { + Urn invalidUrn = UrnUtils.getUrn("urn:li:corpuser:,"); + ValidationApiUtils.validateUrn(entityRegistry, invalidUrn, true); // If no exception is thrown, test passes } }