diff --git a/src/cmd/validate.rs b/src/cmd/validate.rs index 889a3165e..de628151d 100644 --- a/src/cmd/validate.rs +++ b/src/cmd/validate.rs @@ -34,44 +34,53 @@ qsv supports a custom format - `currency`. This format will only accept a valid qsv also supports a custom keyword - `dynamicEnum`. It allows for dynamic validation against a CSV. This is useful for validating against a set of values unknown at the time of schema creation or -when the set of valid values is dynamic or too large to hardcode into the schema. +when the set of valid values is dynamic or too large to hardcode into the JSON Schema. `dynamicEnum` can be used to validate against a CSV file on the local filesystem or a URL (http/https, dathere and ckan schemes supported). The "dynamicEnum" value has the form: - // qsvlite binary variant only supports URIs which can be files on the local filesystem - // or remote files (http and https schemes supported) - dynamicEnum = "URI" + // qsvlite binary variant only supports URIs which can be files on the local filesystem + // or remote files (http and https schemes supported) + dynamicEnum = "URI|colname" where colname is the column name or column index (0-based) - // on other qsv binary variants, dynamicEnum has expanded functionality - dynamicEnum = "[cache_name;cache_age]|URI" where cache_name and cache_age are optional + // use data.csv from the current working directory; use the 1st column for validation + dynamicEnum = "data.csv" - // get data.csv; cache it as data.csv with a default cache age of 3600 seconds - // i.e. 
the cached data.csv expires after 1 hour - dynamicEnum = "https://example.com/data.csv" + // use data.csv in the /lookupdir directory; use the column "Agency" for validation + dynamicEnum = "/lookupdir/data.csv|Agency" - // get data.csv; cache it as custom_name.csv, cache age 600 seconds - dynamicEnum = "custom_name;600|https://example.com/data.csv" + // get data.csv; use the 3rd column for validation (2 as the col index is 0-based) + dynamicEnum = "https://example.com/data.csv|2" - // get data.csv; cache it as data.csv, cache age 800 seconds - dynamicEnum = ";800|https://example.com/data.csv" + // on other qsv binary variants, dynamicEnum has expanded caching functionality + dynamicEnum = "[cache_name;cache_age]|URI|colname" where cache_name and cache_age are optional - // get the top matching result for nyc_neighborhoods (signaled by trailing ?), - // cache it as nyc_neighborhood_data.csv (NOTE: cache name is required when using CKAN scheme) - // with a default cache age of 3600 seconds - // be sure to set --ckan-api, otherwise it will default to datHere's CKAN (data.dathere.com) - dynamicEnum = "nyc_neighborhood_data|ckan:://nyc_neighborhoods?" + // use data.csv from current working directory; cache it as data with a default + // cache age of 3600 seconds i.e. 
the cached data.csv expires after 1 hour + dynamicEnum = "data.csv" - // get CKAN resource with id 1234567, cache it as resname, 3600 secs cache age - // note that if the resource is a private resource, you'll need to set --ckan-token - dynamicEnum = "resname|ckan:://1234567" + // get data.csv; cache it as custom_name, cache age 600 seconds + dynamicEnum = "custom_name;600|https://example.com/data.csv" - // same as above but with a cache age of 100 seconds - dynamicEnum = "resname;100|ckan:://1234567 + // get data.csv; cache it as data, cache age 800 seconds + dynamicEnum = ";800|https://example.com/data.csv" - // get us_states.csv from datHere lookup tables - dynamicEnum = "dathere://us_states.csv" + // get the top matching result for nyc_neighborhoods (signaled by trailing ?), + // cache it as nyc_neighborhood_data.csv (NOTE: cache name is required when using CKAN scheme) + // with a default cache age of 3600 seconds + // be sure to set --ckan-api, otherwise it will default to datHere's CKAN (data.dathere.com) + dynamicEnum = "nyc_neighborhood_data|ckan://nyc_neighborhoods?" -Only the first column of the CSV file is read and used for validation. + // get CKAN resource with id 1234567, cache it as resname, 3600 secs cache age + // note that if the resource is a private resource, you'll need to set --ckan-token + dynamicEnum = "resname|ckan://1234567" + + // same as above but with a cache age of 100 seconds; use the borough column for validation + dynamicEnum = "resname;100|ckan://1234567|borough" + + // get us_states.csv from datHere lookup tables + dynamicEnum = "dathere://us_states.csv" + +If colname is not specified, the first column of the CSV file is read and used for validation. You can create a JSON Schema file from a reference CSV file using the `qsv schema` command. 
/// Parse a `dynamicEnum` specification into its cache name, lookup URI, cache age and
/// optional column.
///
/// Segments are separated by `|`. The accepted shapes are:
///
/// | spec                                  | meaning                                   |
/// |---------------------------------------|-------------------------------------------|
/// | `URI`                                 | lookup URI only                           |
/// | `URI\|column`                         | lookup URI and column                     |
/// | `cache_name\|scheme://URI`            | explicit cache name, URI with a scheme    |
/// | `[cache_name];[cache_age]\|URI`       | cache config and URI                      |
/// | `[cache_name][;cache_age]\|URI\|column` | cache config, URI and column            |
///
/// # Arguments
/// * `uri` - The dynamicEnum specification string to parse
///
/// # Parsing examples
/// * `lookup.csv`
///   - cache_name: lookup, final_uri: lookup.csv, cache_age: 3600, column: None
/// * `lookup.csv|name`
///   - cache_name: lookup, final_uri: lookup.csv, cache_age: 3600, column: Some(name)
/// * `lookup_name;600|lookup.csv`
///   - cache_name: lookup_name, final_uri: lookup.csv, cache_age: 600, column: None
/// * `remote_lookup|https://example.com/remote.csv|col1`
///   - cache_name: remote_lookup, final_uri: https://example.com/remote.csv,
///     cache_age: 3600, column: Some(col1)
/// * `https://example.com/remote.csv`
///   - cache_name: remote, final_uri: https://example.com/remote.csv, cache_age: 3600,
///     column: None
///
/// # Returns
/// `(String, String, i64, Option<String>)` - Tuple containing:
/// - cache_name: Name to use for caching the lookup table. When derived from a URI or a
///   bare name it is the lowercased last path segment with a trailing `.csv` removed.
/// - final_uri: The actual URI/URL to load the lookup table from
/// - cache_age: How long to cache the lookup table in seconds (3600 when absent or not a
///   valid integer)
/// - column: Optional column name/index to use from the lookup table
#[cfg(not(feature = "lite"))]
fn parse_dynenum_uri(uri: &str) -> (String, String, i64, Option<String>) {
    const DEFAULT_CACHE_AGE_SECS: i64 = 3600; // 1 hour

    // Derive a cache name from a URI or local path: drop the scheme (if any), keep the
    // last path segment, lowercase it and remove the `.csv` suffix.
    fn get_cache_name(uri: &str) -> String {
        let path = if uri.contains("://") {
            uri.split("://").nth(1).unwrap_or(uri)
        } else {
            uri
        };
        path.split('/')
            .next_back()
            .unwrap_or(path)
            .to_lowercase()
            .trim_end_matches(".csv")
            .to_string()
    }

    // No separators at all: the whole string is the URI.
    if !uri.contains('|') {
        return (
            get_cache_name(uri),
            uri.to_string(),
            DEFAULT_CACHE_AGE_SECS,
            None,
        );
    }

    // `uri` contains at least one '|', so there are always at least two segments.
    let parts: Vec<&str> = uri.split('|').collect();
    let (first, second) = (parts[0], parts[1]);
    let third = parts.get(2).copied();

    // "name;age|uri[|column]" - the segment after the config is always the URI, so a
    // column can only come from a third segment.
    if first.contains(';') {
        let mut config = first.split(';');
        let name = config.next().unwrap_or("");
        let cache_age = config
            .next()
            .and_then(|s| s.parse::<i64>().ok())
            .unwrap_or(DEFAULT_CACHE_AGE_SECS);
        let cache_name = if name.is_empty() {
            get_cache_name(second)
        } else {
            name.trim_end_matches(".csv").to_string()
        };
        return (
            cache_name,
            second.to_string(),
            cache_age,
            third.map(String::from),
        );
    }

    // "name|uri|column" or "name|scheme://uri" - the first segment is the cache name.
    if third.is_some() || second.contains("://") {
        return (
            get_cache_name(first),
            second.to_string(),
            DEFAULT_CACHE_AGE_SECS,
            third.map(String::from),
        );
    }

    // "uri|column" - a second segment that itself looks like a CSV file name is not
    // accepted as a column.
    let column = if second.to_lowercase().ends_with(".csv") {
        None
    } else {
        Some(second.to_string())
    };
    (
        get_cache_name(first),
        first.to_string(),
        DEFAULT_CACHE_AGE_SECS,
        column,
    )
}

#[cfg(not(feature = "lite"))]
#[test]
fn test_parse_dynenum_uri() {
    // (input, cache_name, final_uri, cache_age, column)
    let cases: &[(&str, &str, &str, i64, Option<&str>)] = &[
        // simple URL with no pipe separators
        (
            "https://example.com/data.csv",
            "data",
            "https://example.com/data.csv",
            3600,
            None,
        ),
        // custom cache name and age
        (
            "custom_name;600|https://example.com/data.csv",
            "custom_name",
            "https://example.com/data.csv",
            600,
            None,
        ),
        // column name
        ("lookup.csv|name", "lookup", "lookup.csv", 3600, Some("name")),
        // cache config and column
        (
            "mycache;1800|lookup.csv|code",
            "mycache",
            "lookup.csv",
            1800,
            Some("code"),
        ),
        (";1800|lookup.csv|code", "lookup", "lookup.csv", 1800, Some("code")),
        (";1800|lookup.csv", "lookup", "lookup.csv", 1800, None),
        ("lookup.csv", "lookup", "lookup.csv", 3600, None),
        ("LookUp.csv", "lookup", "LookUp.csv", 3600, None),
        (
            "NYC_neighborhood_data|ckan://nyc_neighborhoods?",
            "nyc_neighborhood_data",
            "ckan://nyc_neighborhoods?",
            3600,
            None,
        ),
        (
            "dathere://us_states.csv",
            "us_states",
            "dathere://us_states.csv",
            3600,
            None,
        ),
        (
            "dathere://us_states.csv|state_col",
            "us_states",
            "dathere://us_states.csv",
            3600,
            Some("state_col"),
        ),
        (
            "remote_lookup|https://example.com/remote.csv|col1",
            "remote_lookup",
            "https://example.com/remote.csv",
            3600,
            Some("col1"),
        ),
        // regression: the URI after a cache config must never be reported as the column
        ("mycache;600|lookup.tsv", "mycache", "lookup.tsv", 600, None),
        // regression: with three segments the middle one is the URI even without a scheme
        (
            "mycache|lookup.csv|code",
            "mycache",
            "lookup.csv",
            3600,
            Some("code"),
        ),
    ];

    for &(input, cache_name, final_uri, cache_age, column) in cases.iter() {
        let expected = (
            cache_name.to_string(),
            final_uri.to_string(),
            cache_age,
            column.map(String::from),
        );
        assert_eq!(parse_dynenum_uri(input), expected, "input: {}", input);
    }
}
DELIMITER.get().copied().flatten(), ckan_api_url: CKAN_API.get().cloned(), - #[allow(clippy::redundant_closure_for_method_calls)] - ckan_token: CKAN_TOKEN.get().and_then(|t| t.clone()), + ckan_token: CKAN_TOKEN.get().and_then(std::clone::Clone::clone), timeout_secs: TIMEOUT_SECS.load(Ordering::Relaxed), }; @@ -457,7 +615,7 @@ fn dyn_enum_validator_factory<'a>( Err(e) => return fail_validation_error!("Error loading dynamicEnum lookup table: {}", e), }; - // Read the first column into a HashSet + // Read the specified column into a HashSet let mut enum_set = HashSet::with_capacity(lookup_result.headers.len()); let rconfig = Config::new(Some(lookup_result.filepath).as_ref()); let mut rdr = match rconfig @@ -470,10 +628,37 @@ fn dyn_enum_validator_factory<'a>( Err(e) => return fail_validation_error!("Error opening dynamicEnum file: {e}"), }; + // Get column index based on name or default to first column + let column_idx = if let Some(col_name) = column { + // Try parsing as index first + if let Ok(idx) = col_name.parse::() { + idx + } else { + // Try finding column by name + match rdr.headers() { + Ok(headers) => { + let idx = headers.iter().position(|h| h == col_name); + match idx { + Some(i) => i, + None => { + return fail_validation_error!( + "Column '{}' not found in lookup table", + col_name + ) + }, + } + }, + Err(e) => return fail_validation_error!("Error reading headers: {e}"), + } + } + } else { + 0 + }; + for result in rdr.records() { match result { Ok(record) => { - if let Some(value) = record.get(0) { + if let Some(value) = record.get(column_idx) { enum_set.insert(value.to_owned()); } }, @@ -496,8 +681,13 @@ fn dyn_enum_validator_factory<'a>( Err(e) => return fail_validation_error!("Failed to create temporary file: {}", e), }; - let dynenum_path = if uri.starts_with("http") { - let valid_url = reqwest::Url::parse(uri).map_err(|e| { + // Split URI to get column specification + let parts: Vec<&str> = uri.split('|').collect(); + let base_uri = parts[0]; + let 
column = parts.get(1).map(std::string::ToString::to_string); + + let dynenum_path = if base_uri.starts_with("http") { + let valid_url = reqwest::Url::parse(base_uri).map_err(|e| { ValidationError::custom( Location::default(), location, @@ -530,15 +720,15 @@ fn dyn_enum_validator_factory<'a>( temp_download.path().to_str().unwrap().to_string() } else { // its a local file - let uri_path = std::path::Path::new(uri); + let uri_path = std::path::Path::new(base_uri); let uri_exists = uri_path.exists(); if !uri_exists { - return fail_validation_error!("dynamicEnum file not found - {uri}"); + return fail_validation_error!("dynamicEnum file not found - {base_uri}"); } uri_path.to_str().unwrap().to_string() }; - // read the first column into a HashSet + // read the specified column into a HashSet let mut enum_set = HashSet::with_capacity(50); let rconfig = Config::new(Some(dynenum_path).as_ref()); let mut rdr = match rconfig @@ -550,10 +740,27 @@ fn dyn_enum_validator_factory<'a>( Ok(reader) => reader, Err(e) => return fail_validation_error!("Error opening dynamicEnum file: {e}"), }; + + // Get column index based on name or default to first column + let column_idx = if let Some(col_name) = column { + // Try parsing as index first + if let Ok(idx) = col_name.parse::() { + idx + } else { + // Try finding column by name + match rdr.headers() { + Ok(headers) => headers.iter().position(|h| h == col_name).unwrap_or(0), + Err(_) => 0, + } + } + } else { + 0 + }; + for result in rdr.records() { match result { Ok(record) => { - if let Some(value) = record.get(0) { + if let Some(value) = record.get(column_idx) { enum_set.insert(value.to_owned()); } }, @@ -874,7 +1081,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { // parse and compile supplied JSON Schema let (schema_json, schema_compiled): (Value, Validator) = // safety: we know the schema is_some() because we checked above - match load_json(&args.arg_json_schema.unwrap()) { + match load_json(&args.arg_json_schema.clone().unwrap()) 
{ Ok(s) => { // parse JSON string let mut s_slice = s.as_bytes().to_vec(); @@ -890,13 +1097,13 @@ pub fn run(argv: &[&str]) -> CliResult<()> { Ok(schema) => (json, schema), Err(e) => { return fail_clierror!(r#"Cannot compile JSONschema. error: {e} -Try running `qsv validate --validate-schema` to check the JSON Schema file."#); +Try running `qsv validate schema {}` to check the JSON Schema file."#, args.arg_json_schema.unwrap()); }, } }, Err(e) => { return fail_clierror!(r#"Unable to parse JSONschema. error: {e} -Try running `qsv validate --validate-schema` to check the JSON Schema file."#); +Try running `qsv validate schema {}` to check the JSON Schema file."#, args.arg_json_schema.unwrap()); }, } }, diff --git a/tests/test_validate.rs b/tests/test_validate.rs index a1a9b48dc..6cd25533e 100644 --- a/tests/test_validate.rs +++ b/tests/test_validate.rs @@ -374,3 +374,259 @@ fn validate_adur_public_toilets_dataset_with_json_schema_url() { assert_eq!(adur_errors(), validation_error_output); wrk.assert_err(&mut cmd); } + +#[test] +fn validate_dynenum_with_column() { + let wrk = Workdir::new("validate_dynenum_with_column").flexible(true); + + // Create lookup file first + wrk.create( + "lookup.csv", + vec![ + svec!["code", "name", "category"], + svec!["A1", "Apple", "fruit"], + svec!["B2", "Banana", "fruit"], + svec!["C3", "Carrot", "vegetable"], + ], + ); + + // Create test data + wrk.create( + "data.csv", + vec![ + svec!["id", "product", "type"], + svec!["1", "Apple", "fruit"], + svec!["2", "Banana", "fruit"], + svec!["3", "Orange", "fruit"], // Invalid - not in lookup + svec!["4", "Grape", "fruit"], // Invalid - not in lookup + ], + ); + + // Create schema using dynamicEnum with column specification + wrk.create_from_string( + "schema.json", + r#"{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "id": { "type": "string" }, + "product": { + "type": "string", + "dynamicEnum": "lookup.csv|name" + }, + "type": { "type": 
"string" } + } + }"#, + ); + + // Run validate command + let mut cmd = wrk.command("validate"); + cmd.arg("data.csv").arg("schema.json"); + wrk.output(&mut cmd); + + wrk.assert_err(&mut cmd); + + // Check validation-errors.tsv + let validation_errors: String = wrk.from_str(&wrk.path("data.csv.validation-errors.tsv")); + + let expected_errors = "row_number\tfield\terror\n3\tproduct\t\"Orange\" is not a valid \ + dynamicEnum value\n4\tproduct\t\"Grape\" is not a valid dynamicEnum \ + value\n"; + assert_eq!(validation_errors, expected_errors); + + // Check valid records + let valid_records: Vec> = wrk.read_csv("data.csv.valid"); + let expected_valid = vec![svec!["1", "Apple", "fruit"], svec!["2", "Banana", "fruit"]]; + assert_eq!(valid_records, expected_valid); + + // Check invalid records + let invalid_records: Vec> = wrk.read_csv("data.csv.invalid"); + let expected_invalid = vec![svec!["3", "Orange", "fruit"], svec!["4", "Grape", "fruit"]]; + assert_eq!(invalid_records, expected_invalid); + + wrk.assert_err(&mut cmd); +} + +#[test] +fn validate_dynenum_with_column_index() { + let wrk = Workdir::new("validate_dynenum_with_column_index").flexible(true); + + // Create a sample CSV file with multiple columns + wrk.create( + "lookup.csv", + vec![ + svec!["code", "name", "category"], + svec!["A1", "Apple", "fruit"], + svec!["B2", "Banana", "fruit"], + svec!["C3", "Carrot", "vegetable"], + ], + ); + + // Create test data + wrk.create( + "data.csv", + vec![ + svec!["id", "category", "code"], + svec!["1", "fruit", "A1"], + svec!["2", "vegetable", "D4"], // Invalid - code not in lookup + svec!["3", "fruit", "B2"], + svec!["4", "fruit", "X9"], // Invalid - code not in lookup + ], + ); + + // Create schema using dynamicEnum with column index + wrk.create_from_string( + "schema.json", + r#"{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "id": { "type": "string" }, + "category": { "type": "string" }, + "code": { + "type": 
"string", + "dynamicEnum": "lookup.csv|0" + } + } + }"#, + ); + + // Run validate command + let mut cmd = wrk.command("validate"); + cmd.arg("data.csv").arg("schema.json"); + wrk.output(&mut cmd); + + wrk.assert_err(&mut cmd); + + // Check validation-errors.tsv + let validation_errors = wrk + .read_to_string("data.csv.validation-errors.tsv") + .unwrap(); + let expected_errors = "row_number\tfield\terror\n2\tcode\t\"D4\" is not a valid dynamicEnum \ + value\n4\tcode\t\"X9\" is not a valid dynamicEnum value\n"; + assert_eq!(validation_errors, expected_errors); + + // Check valid records + let valid_records: Vec> = wrk.read_csv("data.csv.valid"); + let expected_valid = vec![svec!["1", "fruit", "A1"], svec!["3", "fruit", "B2"]]; + assert_eq!(valid_records, expected_valid); + + // Check invalid records + let invalid_records: Vec> = wrk.read_csv("data.csv.invalid"); + let expected_invalid = vec![svec!["2", "vegetable", "D4"], svec!["4", "fruit", "X9"]]; + assert_eq!(invalid_records, expected_invalid); + + wrk.assert_err(&mut cmd); +} + +#[test] +fn validate_dynenum_with_invalid_column() { + let wrk = Workdir::new("validate_dynenum_with_invalid_column").flexible(true); + + // Create lookup file first + wrk.create( + "lookup.csv", + vec![ + svec!["code", "name"], + svec!["A1", "Apple"], + svec!["B2", "Banana"], + ], + ); + + // Create test data + wrk.create("data.csv", vec![svec!["id", "name"], svec!["1", "Apple"]]); + + // Create schema using dynamicEnum with non-existent column + wrk.create_from_string( + "schema.json", + r#"{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "id": { "type": "string" }, + "name": { + "type": "string", + "dynamicEnum": "lookup.csv|nonexistent_column" + } + } + }"#, + ); + + // Run validate command + let mut cmd = wrk.command("validate"); + cmd.arg("data.csv").arg("schema.json"); + + // Check error output + let got = wrk.output_stderr(&mut cmd); + #[cfg(feature = "lite")] + assert_eq!(got, 
"1 out of 1 records invalid.\n"); + #[cfg(not(feature = "lite"))] + assert_eq!( + got, + "Cannot compile JSONschema. error: Column 'nonexistent_column' not found in lookup \ + table\nTry running `qsv validate schema schema.json` to check the JSON Schema file.\n" + ); + + wrk.assert_err(&mut cmd); +} + +#[test] +fn validate_dynenum_with_remote_csv() { + let wrk = Workdir::new("validate_dynenum_with_remote_csv").flexible(true); + + // Create test data + wrk.create( + "data.csv", + vec![ + svec!["id", "fruit"], + svec!["1", "banana"], + svec!["2", "mango"], // Invalid - not in fruits.csv + svec!["3", "apple"], + svec!["4", "dragonfruit"], // Invalid - not in fruits.csv + ], + ); + + // Create schema using dynamicEnum with remote CSV + wrk.create_from_string( + "schema.json", + r#"{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "id": { "type": "string" }, + "fruit": { + "type": "string", + "dynamicEnum": "https://raw.githubusercontent.com/dathere/qsv/refs/heads/master/resources/test/fruits.csv" + } + } + }"#, + ); + + // Run validate command + let mut cmd = wrk.command("validate"); + cmd.arg("data.csv").arg("schema.json"); + wrk.output(&mut cmd); + + wrk.assert_err(&mut cmd); + + // Check validation-errors.tsv + let validation_errors = wrk + .read_to_string("data.csv.validation-errors.tsv") + .unwrap(); + let expected_errors = "row_number\tfield\terror\n2\tfruit\t\"mango\" is not a valid \ + dynamicEnum value\n4\tfruit\t\"dragonfruit\" is not a valid \ + dynamicEnum value\n"; + assert_eq!(validation_errors, expected_errors); + + // Check valid records + let valid_records: Vec> = wrk.read_csv("data.csv.valid"); + let expected_valid = vec![svec!["1", "banana"], svec!["3", "apple"]]; + assert_eq!(valid_records, expected_valid); + + // Check invalid records + let invalid_records: Vec> = wrk.read_csv("data.csv.invalid"); + let expected_invalid = vec![svec!["2", "mango"], svec!["4", "dragonfruit"]]; + 
assert_eq!(invalid_records, expected_invalid); + + wrk.assert_err(&mut cmd); +} diff --git a/tests/workdir.rs b/tests/workdir.rs index 676e16d8a..360eb707e 100644 --- a/tests/workdir.rs +++ b/tests/workdir.rs @@ -267,6 +267,19 @@ impl Workdir { path.push(name); create_dir_all(path) } + + /// Read a CSV file and parse it into Vec> + /// Note that this does not return the header row + pub fn read_csv(&self, name: &str) -> Vec> { + let path = self.path(name); + let mut rdr = csv::ReaderBuilder::new() + .flexible(self.flexible) + .from_path(&path) + .unwrap(); + rdr.records() + .map(|r| r.unwrap().iter().map(|s| s.to_string()).collect()) + .collect() + } } impl fmt::Debug for Workdir {