From f37b5598691920f560de8a3d797fddd0404283b2 Mon Sep 17 00:00:00 2001 From: Nick Phura Date: Wed, 8 Feb 2023 15:28:25 -0800 Subject: [PATCH] BHBC-2140: Add functions for safely lowercasing/trimming unknown values (#939) --- api/src/utils/media/csv/csv-file.ts | 5 +- .../csv/validation/csv-header-validator.ts | 9 +- .../media/csv/validation/csv-row-validator.ts | 21 +-- .../file-type-and-content-validator.ts | 7 +- .../media/xlsx/validation/xlsx-validation.ts | 5 +- api/src/utils/media/xlsx/xlsx-utils.ts | 5 +- api/src/utils/string-utils.test.ts | 120 ++++++++++++++++++ api/src/utils/string-utils.ts | 37 ++++++ .../src/seeds/02_dwc_spatial_transform.ts | 86 +++++++------ 9 files changed, 235 insertions(+), 60 deletions(-) create mode 100644 api/src/utils/string-utils.test.ts create mode 100644 api/src/utils/string-utils.ts diff --git a/api/src/utils/media/csv/csv-file.ts b/api/src/utils/media/csv/csv-file.ts index bc60c2ee15..2270a55f67 100644 --- a/api/src/utils/media/csv/csv-file.ts +++ b/api/src/utils/media/csv/csv-file.ts @@ -1,5 +1,6 @@ import xlsx from 'xlsx'; import { SUBMISSION_MESSAGE_TYPE } from '../../../constants/status'; +import { safeToLowerCase, safeTrim } from '../../string-utils'; import { IMediaState, MediaValidation } from '../media-file'; import { getCellValue, getWorksheetRange, replaceCellDates, trimCellWhitespace } from '../xlsx/xlsx-utils'; @@ -101,7 +102,7 @@ export class CSVWorksheet { if (aoaHeaders.length > 0) { // Parse the headers array from the array of arrays produced by calling `xlsx.utils.sheet_to_json` - this._headers = aoaHeaders[0].map((item) => item?.trim()); + this._headers = aoaHeaders[0].map(safeTrim); } } @@ -110,7 +111,7 @@ export class CSVWorksheet { getHeadersLowerCase(): string[] { if (!this._headersLowerCase.length) { - this._headersLowerCase = this.getHeaders().map((item) => item?.toLowerCase()); + this._headersLowerCase = this.getHeaders().map(safeToLowerCase); } return this._headersLowerCase; diff --git 
a/api/src/utils/media/csv/validation/csv-header-validator.ts b/api/src/utils/media/csv/validation/csv-header-validator.ts index 9a85a8dda2..504977d211 100644 --- a/api/src/utils/media/csv/validation/csv-header-validator.ts +++ b/api/src/utils/media/csv/validation/csv-header-validator.ts @@ -1,4 +1,5 @@ import { SUBMISSION_MESSAGE_TYPE } from '../../../../constants/status'; +import { safeToLowerCase, safeTrim } from '../../../string-utils'; import { CSVValidator } from '../csv-file'; /** @@ -62,7 +63,7 @@ export const hasRequiredHeadersValidator = (config?: FileRequiredHeaderValidator const headersLowerCase = csvWorksheet.getHeadersLowerCase(); for (const requiredHeader of config.file_required_columns_validator.required_columns) { - if (!headersLowerCase.includes(requiredHeader.toLowerCase())) { + if (!headersLowerCase.includes(safeToLowerCase(requiredHeader))) { csvWorksheet.csvValidation.addHeaderErrors([ { errorCode: SUBMISSION_MESSAGE_TYPE.MISSING_REQUIRED_HEADER, @@ -118,7 +119,7 @@ export const hasRecommendedHeadersValidator = (config?: FileRecommendedHeaderVal } for (const recommendedHeader of config.file_recommended_columns_validator.recommended_columns) { - if (!headersLowerCase.includes(recommendedHeader.toLowerCase())) { + if (!headersLowerCase.includes(safeToLowerCase(recommendedHeader))) { csvWorksheet.csvValidation.addHeaderWarnings([ { errorCode: SUBMISSION_MESSAGE_TYPE.MISSING_RECOMMENDED_HEADER, @@ -162,8 +163,8 @@ export const getValidHeadersValidator = (config?: FileValidHeadersValidatorConfi for (const header of headers) { if ( !config.file_valid_columns_validator.valid_columns - .map((item) => item.toLowerCase()) - .includes(header.trim().toLowerCase()) + .map(safeToLowerCase) + .includes(safeToLowerCase(safeTrim(header))) ) { csvWorksheet.csvValidation.addHeaderWarnings([ { diff --git a/api/src/utils/media/csv/validation/csv-row-validator.ts b/api/src/utils/media/csv/validation/csv-row-validator.ts index 521e2bd5b8..286347f199 100644 --- 
a/api/src/utils/media/csv/validation/csv-row-validator.ts +++ b/api/src/utils/media/csv/validation/csv-row-validator.ts @@ -1,4 +1,5 @@ import { SUBMISSION_MESSAGE_TYPE } from '../../../../constants/status'; +import { safeToLowerCase } from '../../../string-utils'; import { CSVValidator } from '../csv-file'; export type RequiredFieldsValidatorConfig = { @@ -21,7 +22,7 @@ export const getRequiredFieldsValidator = (config?: RequiredFieldsValidatorConfi const headersLowerCase = csvWorksheet.getHeadersLowerCase(); rows.forEach((row, rowIndex) => { - const columnIndex = headersLowerCase.indexOf(config.columnName.toLowerCase()); + const columnIndex = headersLowerCase.indexOf(safeToLowerCase(config.columnName)); // if column does not exist, return if (columnIndex < 0) { @@ -80,7 +81,7 @@ export const getCodeValueFieldsValidator = (config?: ColumnCodeValidatorConfig): const headersLowerCase = csvWorksheet.getHeadersLowerCase(); rows.forEach((row, rowIndex) => { - const columnIndex = headersLowerCase.indexOf(config.columnName.toLowerCase()); + const columnIndex = headersLowerCase.indexOf(safeToLowerCase(config.columnName)); // if column does not exist, return if (columnIndex < 0) { @@ -95,14 +96,14 @@ export const getCodeValueFieldsValidator = (config?: ColumnCodeValidatorConfig): } // compare allowed code values as lowercase strings - const allowedCodeValuesLowerCase: string[] = []; + const allowedCodeValuesLowerCase: (string | number)[] = []; const allowedCodeValues = config.column_code_validator.allowed_code_values.map((allowedCode) => { - allowedCodeValuesLowerCase.push(allowedCode.name?.toString().toLowerCase()); + allowedCodeValuesLowerCase.push(safeToLowerCase(allowedCode.name)); return allowedCode.name; }); // Add an error if the cell value is not one of the elements in the codeValues array - if (!allowedCodeValuesLowerCase.includes(rowValueForColumn?.toLowerCase())) { + if (!allowedCodeValuesLowerCase.includes(safeToLowerCase(rowValueForColumn))) { 
csvWorksheet.csvValidation.addRowErrors([ { errorCode: SUBMISSION_MESSAGE_TYPE.INVALID_VALUE, @@ -147,7 +148,7 @@ export const getValidRangeFieldsValidator = (config?: ColumnRangeValidatorConfig const headersLowerCase = csvWorksheet.getHeadersLowerCase(); rows.forEach((row, rowIndex) => { - const columnIndex = headersLowerCase.indexOf(config.columnName.toLowerCase()); + const columnIndex = headersLowerCase.indexOf(safeToLowerCase(config.columnName)); // if column does not exist, return if (columnIndex < 0) { @@ -248,7 +249,7 @@ export const getNumericFieldsValidator = (config?: ColumnNumericValidatorConfig) const headersLowerCase = csvWorksheet.getHeadersLowerCase(); rows.forEach((row, rowIndex) => { - const columnIndex = headersLowerCase.indexOf(config.columnName.toLowerCase()); + const columnIndex = headersLowerCase.indexOf(safeToLowerCase(config.columnName)); // if column does not exist, return if (columnIndex < 0) { @@ -311,7 +312,7 @@ export const getValidFormatFieldsValidator = (config?: ColumnFormatValidatorConf const headersLowerCase = csvWorksheet.getHeadersLowerCase(); rows.forEach((row, rowIndex) => { - const columnIndex = headersLowerCase.indexOf(config.columnName.toLowerCase()); + const columnIndex = headersLowerCase.indexOf(safeToLowerCase(config.columnName)); // if column does not exist, return if (columnIndex < 0) { @@ -367,7 +368,7 @@ export const getUniqueColumnsValidator = (config?: FileColumnUniqueValidatorConf // find the indices of all provided column names in the worksheet const columnIndices = config.file_column_unique_validator.column_names.map((column) => - lowercaseHeaders.indexOf(column.toLocaleLowerCase()) + lowercaseHeaders.indexOf(safeToLowerCase(column)) ); // checks list of column indices if any are missing (-1) and returns early @@ -377,7 +378,7 @@ export const getUniqueColumnsValidator = (config?: FileColumnUniqueValidatorConf rows.forEach((row, rowIndex) => { const key = config.file_column_unique_validator.column_names - 
.map((columnIndex) => `${row[columnIndex] || ''}`.trim().toLocaleLowerCase()) + .map((columnIndex) => `${row[columnIndex] || ''}`.trim().toLowerCase()) .join(', '); // check if key exists already if (!keySet.has(key)) { diff --git a/api/src/utils/media/validation/file-type-and-content-validator.ts b/api/src/utils/media/validation/file-type-and-content-validator.ts index d697e50415..cbceca2f9d 100644 --- a/api/src/utils/media/validation/file-type-and-content-validator.ts +++ b/api/src/utils/media/validation/file-type-and-content-validator.ts @@ -1,3 +1,4 @@ +import { safeToLowerCase } from '../../string-utils'; import { DWCArchive, DWCArchiveValidator } from '../dwc/dwc-archive-file'; import { MediaValidator } from '../media-file'; import { XLSXCSV, XLSXCSVValidator } from '../xlsx/xlsx-file'; @@ -93,7 +94,7 @@ const checkRequiredFieldsInDWCArchive = (dwcArchive: DWCArchive, config: Submiss const fileNames = dwcArchive.rawFile.mediaFiles.map((mediaFile) => mediaFile.name); config.submission_required_files_validator.required_files.forEach((requiredFile) => { - if (!fileNames.includes(requiredFile.toLowerCase())) { + if (!fileNames.includes(safeToLowerCase(requiredFile))) { dwcArchive.mediaValidation.addFileErrors([`Missing required file: ${requiredFile}`]); } }); @@ -112,10 +113,10 @@ const checkRequiredFieldsInXLSXCSV = (xlsxCsv: XLSXCSV, config: SubmissionRequir return xlsxCsv; } - const worksheetNames = Object.keys(xlsxCsv.workbook.worksheets).map((item) => item.toLowerCase()); + const worksheetNames = Object.keys(xlsxCsv.workbook.worksheets).map(safeToLowerCase); config.submission_required_files_validator.required_files.forEach((requiredFile) => { - if (!worksheetNames.includes(requiredFile.toLowerCase())) { + if (!worksheetNames.includes(safeToLowerCase(requiredFile))) { xlsxCsv.mediaValidation.addFileErrors([`Missing required sheet: ${requiredFile}`]); } }); diff --git a/api/src/utils/media/xlsx/validation/xlsx-validation.ts 
b/api/src/utils/media/xlsx/validation/xlsx-validation.ts index ca89210b9a..2cd9c186b6 100644 --- a/api/src/utils/media/xlsx/validation/xlsx-validation.ts +++ b/api/src/utils/media/xlsx/validation/xlsx-validation.ts @@ -1,4 +1,5 @@ import { SUBMISSION_MESSAGE_TYPE } from '../../../../constants/status'; +import { safeTrim } from '../../../string-utils'; import { CSVWorkBook, WorkBookValidator } from '../../csv/csv-file'; export type ParentChildKeyMatchValidatorConfig = { @@ -48,7 +49,7 @@ export const getParentChildKeyMatchValidator = (config?: ParentChildKeyMatchVali } // Filter column names to only check key violation on columns included in the child sheet - const filteredColumnNames = column_names.filter((columnName: string) => Boolean(childRowObjects[0][columnName])); + const filteredColumnNames = column_names.filter((columnName) => Boolean(childRowObjects[0][columnName])); /** * Encodes the column values for a worksheet at a given row into a string, which is used for comparison with another worksheet @@ -65,7 +66,7 @@ export const getParentChildKeyMatchValidator = (config?: ParentChildKeyMatchVali .filter(Boolean) // Trim whitespace - .map((columnValue: string) => columnValue.trim()) + .map(safeTrim) // Deliminate column values .join('|') diff --git a/api/src/utils/media/xlsx/xlsx-utils.ts b/api/src/utils/media/xlsx/xlsx-utils.ts index 6b4140809e..8def99bf7e 100644 --- a/api/src/utils/media/xlsx/xlsx-utils.ts +++ b/api/src/utils/media/xlsx/xlsx-utils.ts @@ -1,4 +1,5 @@ import xlsx, { CellObject } from 'xlsx'; +import { safeTrim } from '../../string-utils'; /** * Get a worksheet by name. 
@@ -63,12 +64,12 @@ export function prepareWorksheetCells(worksheet: xlsx.WorkSheet) { export function trimCellWhitespace(cell: CellObject) { // check and clean raw strings if (cell.t === 's') { - cell.v = (cell.v as string).trim(); + cell.v = safeTrim(cell.v); } // check and clean formatted strings if (cell.w) { - cell.w = cell.w.trim(); + cell.w = safeTrim(cell.w); } return cell; diff --git a/api/src/utils/string-utils.test.ts b/api/src/utils/string-utils.test.ts new file mode 100644 index 0000000000..80d4555b3f --- /dev/null +++ b/api/src/utils/string-utils.test.ts @@ -0,0 +1,120 @@ +import { expect } from 'chai'; +import { safeToLowerCase, safeTrim } from './string-utils'; + +describe('safeToLowerCase', () => { + describe('returns value lowercase', () => { + it('when value is a lowercase string', () => { + expect(safeToLowerCase('string')).to.equal('string'); + }); + + it('when value is an uppercase string', () => { + expect(safeToLowerCase('STRING')).to.equal('string'); + }); + + it('when value is a mixed case string', () => { + expect(safeToLowerCase('sTRiNG')).to.equal('string'); + }); + }); + + describe('returns value unaltered', () => { + it('when value is a negative number', () => { + expect(safeToLowerCase(-123)).to.equal(-123); + }); + + it('when value is a zero', () => { + expect(safeToLowerCase(0)).to.equal(0); + }); + + it('when value is a positive number', () => { + expect(safeToLowerCase(123)).to.equal(123); + }); + + it('when value is `false`', () => { + expect(safeToLowerCase(false)).to.equal(false); + }); + + it('when value is `true`', () => { + expect(safeToLowerCase(true)).to.equal(true); + }); + + it('when value is an empty object', () => { + expect(safeToLowerCase({})).to.eql({}); + }); + + it('when value is an empty array', () => { + expect(safeToLowerCase([])).to.eql([]); + }); + + it('when value is a non-empty array of numbers', () => { + expect(safeToLowerCase([1, 2, 3])).to.eql([1, 2, 3]); + }); + + it('when value is a non-empty array 
of strings', () => { + expect(safeToLowerCase(['1', 'string', 'false'])).to.eql(['1', 'string', 'false']); + }); + + it('when value is a function', () => { + const fn = (a: number, b: number) => a * b; + expect(safeToLowerCase(fn)).to.equal(fn); + }); + }); +}); + +describe('safeTrim', () => { + describe('returns value trimmed', () => { + it('when value is a lowercase string', () => { + expect(safeTrim(' string ')).to.equal('string'); + }); + + it('when value is an uppercase string', () => { + expect(safeTrim(' STRING ')).to.equal('STRING'); + }); + + it('when value is a mixed case string', () => { + expect(safeTrim(' sTRiNG ')).to.equal('sTRiNG'); + }); + }); + + describe('returns value unaltered', () => { + it('when value is a negative number', () => { + expect(safeTrim(-123)).to.equal(-123); + }); + + it('when value is a zero', () => { + expect(safeTrim(0)).to.equal(0); + }); + + it('when value is a positive number', () => { + expect(safeTrim(123)).to.equal(123); + }); + + it('when value is `false`', () => { + expect(safeTrim(false)).to.equal(false); + }); + + it('when value is `true`', () => { + expect(safeTrim(true)).to.equal(true); + }); + + it('when value is an empty object', () => { + expect(safeTrim({})).to.eql({}); + }); + + it('when value is an empty array', () => { + expect(safeTrim([])).to.eql([]); + }); + + it('when value is a non-empty array of numbers', () => { + expect(safeTrim([1, 2, 3])).to.eql([1, 2, 3]); + }); + + it('when value is a non-empty array of strings', () => { + expect(safeTrim([' 1 ', ' string ', ' false '])).to.eql([' 1 ', ' string ', ' false ']); + }); + + it('when value is a function', () => { + const fn = (a: number, b: number) => a * b; + expect(safeTrim(fn)).to.equal(fn); + }); + }); +}); diff --git a/api/src/utils/string-utils.ts b/api/src/utils/string-utils.ts new file mode 100644 index 0000000000..e0e9fa4619 --- /dev/null +++ b/api/src/utils/string-utils.ts @@ -0,0 +1,37 @@ +import { isString } from 'lodash'; + +/** + * 
Safely apply `.toLowerCase()` to a value of unknown type. + * + * If the value is not a string, then the original unaltered value will be returned. + * + * @export + * @template T + * @param {T} value + * @return {*} {T} + */ +export function safeToLowerCase<T>(value: T): T { + if (isString(value)) { + return (value.toLowerCase() as unknown) as T; + } + + return value; +} + +/** + * Safely apply `.trim()` to a value of unknown type. + * + * If the value is not a string, then the original unaltered value will be returned. + * + * @export + * @template T + * @param {T} value + * @return {*} {T} + */ +export function safeTrim<T>(value: T): T { + if (isString(value)) { + return (value.trim() as unknown) as T; + } + + return value; +} diff --git a/database/src/seeds/02_dwc_spatial_transform.ts b/database/src/seeds/02_dwc_spatial_transform.ts index fc71cf1db0..269e754bc5 100644 --- a/database/src/seeds/02_dwc_spatial_transform.ts +++ b/database/src/seeds/02_dwc_spatial_transform.ts @@ -24,6 +24,10 @@ export async function seed(knex: Knex): Promise { await knex.raw(` ${insertSpatialTransform()} `); + } else { + await knex.raw(` + ${updateSpatialTransform()} + `); } } @@ -44,42 +48,50 @@ const insertSpatialTransform = () => ` INSERT into spatial_transform (name, description, record_effective_date, transform) VALUES ( - 'DwC Occurrences', 'Extracts occurrences and properties from DwC JSON source.', now(), - $transform$ - with submission as (select * - from occurrence_submission - where occurrence_submission_id = ?) 
- , occurrences as (select occurrence_submission_id, occs - from submission, jsonb_path_query(darwin_core_source, '$.occurrence') occs) - , occurrence as (select occurrence_submission_id, jsonb_array_elements(occs) occ - from occurrences) - , events as (select evns - from submission, jsonb_path_query(darwin_core_source, '$.event') evns) - , event as (select jsonb_array_elements(evns) evn - from events) - , locations as (select locs - from submission, jsonb_path_query(darwin_core_source, '$.location') locs) - , location as (select jsonb_array_elements(locs) loc - from locations) - , event_coord as (select st_x(pt) x, st_y(pt) y, loc - from location, ST_SetSRID(ST_MakePoint((loc->>'decimalLongitude')::float, (loc->>'decimalLatitude')::float), 4326) pt) - , normal as (select distinct o.occurrence_submission_id, o.occ, ec.*, e.evn - from occurrence o - left outer join event_coord ec on - (ec.loc->'eventID' = o.occ->'eventID') - left outer join event e on - (e.evn->'eventID' = o.occ->'eventID')) - - select jsonb_build_object('type', 'FeatureCollection' - , 'features', jsonb_build_array(jsonb_build_object('type', 'Feature' - , 'geometry', jsonb_build_object('type', 'Point', 'coordinates', json_build_array(n.x, n.y)) - , 'properties', jsonb_build_object('type', 'Occurrence', 'dwc', jsonb_build_object( - 'type', 'PhysicalObject', 'basisOfRecord', 'Occurrence', 'datasetID', n.occurrence_submission_id, 'occurrenceID', n.occ->'occurrenceID' - , 'sex', n.occ->'sex', 'lifeStage', n.occ->'lifeStage', 'taxonID', n.occ->'taxonID', 'vernacularName', n.occ->'vernacularName', 'individualCount', n.occ->'individualCount' - , 'eventDate', n.evn->'eventDate', 'verbatimSRS', n.evn->'verbatimSRS', 'verbatimCoordinates', n.evn->'verbatimCoordinates' - )))) - )result_data - from normal n; - $transform$ + 'DwC Occurrences', 'Extracts occurrences and properties from DwC JSON source.', now(),${transformString} ); `; + +const updateSpatialTransform = () => ` +UPDATE + spatial_transform SET 
transform = ${transformString} +WHERE +name = 'DwC Occurrences'; +`; + +const transformString = ` +$transform$ +with submission as (select * + from occurrence_submission + where occurrence_submission_id = ?) + , occurrences as (select occurrence_submission_id, occs + from submission, jsonb_path_query(darwin_core_source, '$.occurrence') occs) + , occurrence as (select occurrence_submission_id, jsonb_array_elements(occs) occ + from occurrences) + , events as (select evns + from submission, jsonb_path_query(darwin_core_source, '$.event') evns) + , event as (select jsonb_array_elements(evns) evn + from events) + , locations as (select locs + from submission, jsonb_path_query(darwin_core_source, '$.location') locs) + , location as (select jsonb_array_elements(locs) loc + from locations) + , event_coord as (select st_x(pt) x, st_y(pt) y, loc + from location, ST_SetSRID(ST_MakePoint((nullif(loc->>'decimalLongitude', ''))::float, (nullif(loc->>'decimalLatitude', ''))::float), 4326) pt) + , normal as (select distinct o.occurrence_submission_id, o.occ, ec.*, e.evn + from occurrence o + left outer join event_coord ec on + (ec.loc->'eventID' = o.occ->'eventID') + left outer join event e on + (e.evn->'eventID' = o.occ->'eventID')) + select jsonb_build_object('type', 'FeatureCollection' + , 'features', jsonb_build_array(jsonb_build_object('type', 'Feature' + , 'geometry', jsonb_build_object('type', 'Point', 'coordinates', json_build_array(n.x, n.y)) + , 'properties', jsonb_build_object('type', 'Occurrence', 'dwc', jsonb_build_object( + 'type', 'PhysicalObject', 'basisOfRecord', 'Occurrence', 'datasetID', n.occurrence_submission_id, 'occurrenceID', n.occ->'occurrenceID' + , 'sex', n.occ->'sex', 'lifeStage', n.occ->'lifeStage', 'taxonID', n.occ->'taxonID', 'vernacularName', n.occ->'vernacularName', 'individualCount', n.occ->'individualCount' + , 'eventDate', n.evn->'eventDate', 'verbatimSRS', n.evn->'verbatimSRS', 'verbatimCoordinates', n.evn->'verbatimCoordinates' + )))) + 
)result_data + from normal n; +$transform$`;