diff --git a/packages/@aws-cdk/aws-glue/README.md b/packages/@aws-cdk/aws-glue/README.md index 639198c67ed9a..88cf57faaa10b 100644 --- a/packages/@aws-cdk/aws-glue/README.md +++ b/packages/@aws-cdk/aws-glue/README.md @@ -264,6 +264,31 @@ myTable.addPartitionIndex({ }); ``` +### Partition Filtering + +If you have a table with a large number of partitions that grows over time, consider using AWS Glue partition indexing and filtering. + +```ts +declare const myDatabase: glue.Database; +new glue.Table(this, 'MyTable', { + database: myDatabase, + tableName: 'my_table', + columns: [{ + name: 'col1', + type: glue.Schema.STRING, + }], + partitionKeys: [{ + name: 'year', + type: glue.Schema.SMALL_INT, + }, { + name: 'month', + type: glue.Schema.SMALL_INT, + }], + dataFormat: glue.DataFormat.JSON, + enablePartitionFiltering: true, +}); +``` + ## [Encryption](https://docs.aws.amazon.com/athena/latest/ug/encryption.html) You can enable encryption on a Table's data: diff --git a/packages/@aws-cdk/aws-glue/lib/table.ts b/packages/@aws-cdk/aws-glue/lib/table.ts index 1b17da32e5454..ea958879d816b 100644 --- a/packages/@aws-cdk/aws-glue/lib/table.ts +++ b/packages/@aws-cdk/aws-glue/lib/table.ts @@ -172,6 +172,15 @@ export interface TableProps { * @default false */ readonly storedAsSubDirectories?: boolean; + + /** + * Enables partition filtering. 
+ * + * @see https://docs.aws.amazon.com/athena/latest/ug/glue-best-practices.html#glue-best-practices-partition-index + * + * @default - The parameter is not defined + */ + readonly enablePartitionFiltering?: boolean; } /** @@ -302,8 +311,9 @@ export class Table extends Resource implements ITable { partitionKeys: renderColumns(props.partitionKeys), parameters: { - classification: props.dataFormat.classificationString?.value, - has_encrypted_data: this.encryption !== TableEncryption.UNENCRYPTED, + 'classification': props.dataFormat.classificationString?.value, + 'has_encrypted_data': this.encryption !== TableEncryption.UNENCRYPTED, + 'partition_filtering.enabled': props.enablePartitionFiltering, }, storageDescriptor: { location: `s3://${this.bucket.bucketName}/${this.s3Prefix}`, diff --git a/packages/@aws-cdk/aws-glue/test/integ.table.ts b/packages/@aws-cdk/aws-glue/test/integ.table.ts index e9d54d659921e..d9d543a124d16 100644 --- a/packages/@aws-cdk/aws-glue/test/integ.table.ts +++ b/packages/@aws-cdk/aws-glue/test/integ.table.ts @@ -87,6 +87,14 @@ const encryptedTable = new glue.Table(stack, 'MyEncryptedTable', { encryptionKey: new kms.Key(stack, 'MyKey'), }); +new glue.Table(stack, 'MyPartitionFilteredTable', { + database, + tableName: 'partition_filtered_table', + columns, + dataFormat: glue.DataFormat.JSON, + enablePartitionFiltering: true, +}); + const user = new iam.User(stack, 'MyUser'); csvTable.grantReadWrite(user); encryptedTable.grantReadWrite(user); diff --git a/packages/@aws-cdk/aws-glue/test/table.integ.snapshot/aws-cdk-glue.assets.json b/packages/@aws-cdk/aws-glue/test/table.integ.snapshot/aws-cdk-glue.assets.json index 772ec2685ec29..0cd673b0f5be4 100644 --- a/packages/@aws-cdk/aws-glue/test/table.integ.snapshot/aws-cdk-glue.assets.json +++ b/packages/@aws-cdk/aws-glue/test/table.integ.snapshot/aws-cdk-glue.assets.json @@ -1,7 +1,7 @@ { - "version": "17.0.0", + "version": "20.0.0", "files": { - 
"92638b7a8efe38efd7c845883423f3767018a9e5bd3d67d8d638332f054d0d0f": { + "419b39f03d496de4fb02e795181e9a2ab218fb90bf7a5c9354cf93baa6fea2cf": { "source": { "path": "aws-cdk-glue.template.json", "packaging": "file" @@ -9,7 +9,7 @@ "destinations": { "current_account-current_region": { "bucketName": "cdk-hnb659fds-assets-${AWS::AccountId}-${AWS::Region}", - "objectKey": "92638b7a8efe38efd7c845883423f3767018a9e5bd3d67d8d638332f054d0d0f.json", + "objectKey": "419b39f03d496de4fb02e795181e9a2ab218fb90bf7a5c9354cf93baa6fea2cf.json", "assumeRoleArn": "arn:${AWS::Partition}:iam::${AWS::AccountId}:role/cdk-hnb659fds-file-publishing-role-${AWS::AccountId}-${AWS::Region}" } } diff --git a/packages/@aws-cdk/aws-glue/test/table.integ.snapshot/aws-cdk-glue.template.json b/packages/@aws-cdk/aws-glue/test/table.integ.snapshot/aws-cdk-glue.template.json index 7f5f3e286f0ef..e64e4fe3b8a20 100644 --- a/packages/@aws-cdk/aws-glue/test/table.integ.snapshot/aws-cdk-glue.template.json +++ b/packages/@aws-cdk/aws-glue/test/table.integ.snapshot/aws-cdk-glue.template.json @@ -423,6 +423,76 @@ } } }, + "MyPartitionFilteredTableBucket6ACAA137": { + "Type": "AWS::S3::Bucket", + "UpdateReplacePolicy": "Retain", + "DeletionPolicy": "Retain" + }, + "MyPartitionFilteredTable324BA27A": { + "Type": "AWS::Glue::Table", + "Properties": { + "CatalogId": { + "Ref": "AWS::AccountId" + }, + "DatabaseName": { + "Ref": "MyDatabase1E2517DB" + }, + "TableInput": { + "Description": "partition_filtered_table generated by CDK", + "Name": "partition_filtered_table", + "Parameters": { + "classification": "json", + "has_encrypted_data": false, + "partition_filtering.enabled": true + }, + "StorageDescriptor": { + "Columns": [ + { + "Name": "col1", + "Type": "string" + }, + { + "Comment": "col2 comment", + "Name": "col2", + "Type": "string" + }, + { + "Name": "col3", + "Type": "array<string>" + }, + { + "Name": "col4", + "Type": "map<string,string>" + }, + { + "Name": "col5", + "Type": "struct<col1:string>" + } + ], + "Compressed": false, + "InputFormat": 
"org.apache.hadoop.mapred.TextInputFormat", + "Location": { + "Fn::Join": [ + "", + [ + "s3://", + { + "Ref": "MyPartitionFilteredTableBucket6ACAA137" + }, + "/" + ] + ] + }, + "OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "SerdeInfo": { + "SerializationLibrary": "org.openx.data.jsonserde.JsonSerDe" + }, + "StoredAsSubDirectories": false + }, + "TableType": "EXTERNAL_TABLE" + } + } + }, "MyUserDC45028B": { "Type": "AWS::IAM::User" }, diff --git a/packages/@aws-cdk/aws-glue/test/table.integ.snapshot/cdk.out b/packages/@aws-cdk/aws-glue/test/table.integ.snapshot/cdk.out index 90bef2e09ad39..588d7b269d34f 100644 --- a/packages/@aws-cdk/aws-glue/test/table.integ.snapshot/cdk.out +++ b/packages/@aws-cdk/aws-glue/test/table.integ.snapshot/cdk.out @@ -1 +1 @@ -{"version":"17.0.0"} \ No newline at end of file +{"version":"20.0.0"} \ No newline at end of file diff --git a/packages/@aws-cdk/aws-glue/test/table.integ.snapshot/integ.json b/packages/@aws-cdk/aws-glue/test/table.integ.snapshot/integ.json index 85b6fe8295b26..1f604630bc610 100644 --- a/packages/@aws-cdk/aws-glue/test/table.integ.snapshot/integ.json +++ b/packages/@aws-cdk/aws-glue/test/table.integ.snapshot/integ.json @@ -1,7 +1,7 @@ { - "version": "18.0.0", + "version": "20.0.0", "testCases": { - "aws-glue/test/integ.table": { + "integ.table": { "stacks": [ "aws-cdk-glue" ], diff --git a/packages/@aws-cdk/aws-glue/test/table.integ.snapshot/manifest.json b/packages/@aws-cdk/aws-glue/test/table.integ.snapshot/manifest.json index 221b7524100fa..aed4259921a6f 100644 --- a/packages/@aws-cdk/aws-glue/test/table.integ.snapshot/manifest.json +++ b/packages/@aws-cdk/aws-glue/test/table.integ.snapshot/manifest.json @@ -1,5 +1,5 @@ { - "version": "17.0.0", + "version": "20.0.0", "artifacts": { "Tree": { "type": "cdk:tree", @@ -69,6 +69,18 @@ "data": "MyEncryptedTable981A88C6" } ], + "/aws-cdk-glue/MyPartitionFilteredTable/Bucket/Resource": [ + { + "type": "aws:cdk:logicalId", + "data": 
"MyPartitionFilteredTableBucket6ACAA137" + } + ], + "/aws-cdk-glue/MyPartitionFilteredTable/Table": [ + { + "type": "aws:cdk:logicalId", + "data": "MyPartitionFilteredTable324BA27A" + } + ], "/aws-cdk-glue/MyUser/Resource": [ { "type": "aws:cdk:logicalId", diff --git a/packages/@aws-cdk/aws-glue/test/table.integ.snapshot/tree.json b/packages/@aws-cdk/aws-glue/test/table.integ.snapshot/tree.json index 5ec24f621772d..189e9f749a2d6 100644 --- a/packages/@aws-cdk/aws-glue/test/table.integ.snapshot/tree.json +++ b/packages/@aws-cdk/aws-glue/test/table.integ.snapshot/tree.json @@ -9,7 +9,7 @@ "path": "Tree", "constructInfo": { "fqn": "constructs.Construct", - "version": "10.0.9" + "version": "10.1.33" } }, "aws-cdk-glue": { @@ -596,6 +596,111 @@ "version": "0.0.0" } }, + "MyPartitionFilteredTable": { + "id": "MyPartitionFilteredTable", + "path": "aws-cdk-glue/MyPartitionFilteredTable", + "children": { + "Bucket": { + "id": "Bucket", + "path": "aws-cdk-glue/MyPartitionFilteredTable/Bucket", + "children": { + "Resource": { + "id": "Resource", + "path": "aws-cdk-glue/MyPartitionFilteredTable/Bucket/Resource", + "attributes": { + "aws:cdk:cloudformation:type": "AWS::S3::Bucket", + "aws:cdk:cloudformation:props": {} + }, + "constructInfo": { + "fqn": "@aws-cdk/aws-s3.CfnBucket", + "version": "0.0.0" + } + } + }, + "constructInfo": { + "fqn": "@aws-cdk/aws-s3.Bucket", + "version": "0.0.0" + } + }, + "Table": { + "id": "Table", + "path": "aws-cdk-glue/MyPartitionFilteredTable/Table", + "attributes": { + "aws:cdk:cloudformation:type": "AWS::Glue::Table", + "aws:cdk:cloudformation:props": { + "catalogId": { + "Ref": "AWS::AccountId" + }, + "databaseName": { + "Ref": "MyDatabase1E2517DB" + }, + "tableInput": { + "name": "partition_filtered_table", + "description": "partition_filtered_table generated by CDK", + "parameters": { + "classification": "json", + "has_encrypted_data": false, + "partition_filtering.enabled": true + }, + "storageDescriptor": { + "location": { + "Fn::Join": 
[ + "", + [ + "s3://", + { + "Ref": "MyPartitionFilteredTableBucket6ACAA137" + }, + "/" + ] + ] + }, + "compressed": false, + "storedAsSubDirectories": false, + "columns": [ + { + "name": "col1", + "type": "string" + }, + { + "name": "col2", + "type": "string", + "comment": "col2 comment" + }, + { + "name": "col3", + "type": "array<string>" + }, + { + "name": "col4", + "type": "map<string,string>" + }, + { + "name": "col5", + "type": "struct<col1:string>" + } + ], + "inputFormat": "org.apache.hadoop.mapred.TextInputFormat", + "outputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "serdeInfo": { + "serializationLibrary": "org.openx.data.jsonserde.JsonSerDe" + } + }, + "tableType": "EXTERNAL_TABLE" + } + } + }, + "constructInfo": { + "fqn": "@aws-cdk/aws-glue.CfnTable", + "version": "0.0.0" + } + } + }, + "constructInfo": { + "fqn": "@aws-cdk/aws-glue.Table", + "version": "0.0.0" + } + }, "MyUser": { "id": "MyUser", "path": "aws-cdk-glue/MyUser", diff --git a/packages/@aws-cdk/aws-glue/test/table.test.ts b/packages/@aws-cdk/aws-glue/test/table.test.ts index e3f5df9bb6a3f..79920d73e1a81 100644 --- a/packages/@aws-cdk/aws-glue/test/table.test.ts +++ b/packages/@aws-cdk/aws-glue/test/table.test.ts @@ -1,4 +1,4 @@ -import { Template } from '@aws-cdk/assertions'; +import { Template, Match } from '@aws-cdk/assertions'; import * as iam from '@aws-cdk/aws-iam'; import * as kms from '@aws-cdk/aws-kms'; import * as s3 from '@aws-cdk/aws-s3'; @@ -1567,6 +1567,98 @@ test('Table.fromTableArn', () => { expect(table.tableName).toEqual('tbl1'); }); +test.each([ + ['enabled', true], + ['disabled', false], +])('Partition filtering on table %s', (_, enabled) => { + const app = new cdk.App(); + const dbStack = new cdk.Stack(app, 'db'); + const database = new glue.Database(dbStack, 'Database', { + databaseName: 'database', + }); + + const tableStack = new cdk.Stack(app, 'table'); + new glue.Table(tableStack, 'Table', { + database, + tableName: 'table', + columns: [{ + name: 'col', + type: 
glue.Schema.STRING, + }], + partitionKeys: [{ + name: 'year', + type: glue.Schema.SMALL_INT, + }], + dataFormat: glue.DataFormat.JSON, + enablePartitionFiltering: enabled, + }); + + Template.fromStack(tableStack).hasResourceProperties('AWS::Glue::Table', { + CatalogId: { + Ref: 'AWS::AccountId', + }, + DatabaseName: { + 'Fn::ImportValue': 'db:ExportsOutputRefDatabaseB269D8BB88F4B1C4', + }, + TableInput: { + Name: 'table', + Description: 'table generated by CDK', + Parameters: { + 'classification': 'json', + 'has_encrypted_data': false, + 'partition_filtering.enabled': enabled, + }, + PartitionKeys: Match.anyValue(), + StorageDescriptor: Match.anyValue(), + TableType: Match.anyValue(), + }, + }); +}); + +test('Partition filtering on table is not defined (default behavior)', () => { + const app = new cdk.App(); + const dbStack = new cdk.Stack(app, 'db'); + const database = new glue.Database(dbStack, 'Database', { + databaseName: 'database', + }); + + const tableStack = new cdk.Stack(app, 'table'); + new glue.Table(tableStack, 'Table', { + database, + tableName: 'table', + columns: [{ + name: 'col', + type: glue.Schema.STRING, + }], + partitionKeys: [{ + name: 'year', + type: glue.Schema.SMALL_INT, + }], + dataFormat: glue.DataFormat.JSON, + enablePartitionFiltering: undefined, + }); + + Template.fromStack(tableStack).hasResourceProperties('AWS::Glue::Table', { + CatalogId: { + Ref: 'AWS::AccountId', + }, + DatabaseName: { + 'Fn::ImportValue': 'db:ExportsOutputRefDatabaseB269D8BB88F4B1C4', + }, + TableInput: { + Name: 'table', + Description: 'table generated by CDK', + Parameters: { + classification: 'json', + has_encrypted_data: false, + }, + PartitionKeys: Match.anyValue(), + StorageDescriptor: Match.anyValue(), + TableType: Match.anyValue(), + }, + }); +}); + function createTable(props: Pick<glue.TableProps, Exclude<keyof glue.TableProps, 'database' | 'dataFormat'>>): void { const stack = new cdk.Stack(); new glue.Table(stack, 'table', {