From 1b5da58bf57dacae17e9cc70a696648b911500dc Mon Sep 17 00:00:00 2001 From: VG Date: Tue, 16 Apr 2024 13:14:32 +0800 Subject: [PATCH 1/4] chore: remove submodule --- .gitmodules | 3 --- dozer-sink-aerospike/aerospike-client-sys/aerospike-client-c | 1 - 2 files changed, 4 deletions(-) delete mode 100644 .gitmodules delete mode 160000 dozer-sink-aerospike/aerospike-client-sys/aerospike-client-c diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index a179f0e620..0000000000 --- a/.gitmodules +++ /dev/null @@ -1,3 +0,0 @@ -[submodule "dozer-sink-aerospike/aerospike-client-sys/aerospike-client-c"] - path = dozer-sink-aerospike/aerospike-client-sys/aerospike-client-c - url = https://github.com/aerospike/aerospike-client-c diff --git a/dozer-sink-aerospike/aerospike-client-sys/aerospike-client-c b/dozer-sink-aerospike/aerospike-client-sys/aerospike-client-c deleted file mode 160000 index 029db7ac63..0000000000 --- a/dozer-sink-aerospike/aerospike-client-sys/aerospike-client-c +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 029db7ac63ba3533150c359e0dec5a51e54914ab From aefdd46847f86c683b91536a0f5cccfe736988d9 Mon Sep 17 00:00:00 2001 From: VG Date: Tue, 16 Apr 2024 13:13:30 +0800 Subject: [PATCH 2/4] chore: remove unsupported features --- Cargo.lock | 79 - Cargo.toml | 2 - dozer-cli/Cargo.toml | 3 - dozer-cli/src/errors.rs | 2 + dozer-cli/src/pipeline/builder.rs | 56 +- dozer-ingestion/Cargo.toml | 2 - dozer-ingestion/aerospike/Cargo.toml | 13 - dozer-ingestion/aerospike/src/connector.rs | 984 ---------- dozer-ingestion/aerospike/src/lib.rs | 4 - dozer-ingestion/aerospike/src/tests.rs | 78 - dozer-ingestion/oracle/Cargo.toml | 15 - dozer-ingestion/oracle/src/connector/join.rs | 60 - .../oracle/src/connector/listing.rs | 132 -- .../oracle/src/connector/mapping.rs | 242 --- dozer-ingestion/oracle/src/connector/mod.rs | 548 ------ .../src/connector/replicate/log/listing.rs | 152 -- .../src/connector/replicate/log/merge.rs | 52 - 
.../oracle/src/connector/replicate/log/mod.rs | 176 -- .../connector/replicate/log/redo/log_miner.rs | 148 -- .../src/connector/replicate/log/redo/mod.rs | 26 - .../oracle/src/connector/replicate/mod.rs | 5 - .../replicate/transaction/aggregate/commit.rs | 39 - .../replicate/transaction/aggregate/forest.rs | 111 -- .../replicate/transaction/aggregate/mod.rs | 144 -- .../replicate/transaction/aggregate/op.rs | 27 - .../connector/replicate/transaction/csf.rs | 48 - .../connector/replicate/transaction/map.rs | 161 -- .../connector/replicate/transaction/mod.rs | 53 - .../replicate/transaction/parse/delete.rs | 62 - .../replicate/transaction/parse/insert.rs | 63 - .../replicate/transaction/parse/mod.rs | 141 -- .../replicate/transaction/parse/row.rs | 36 - .../replicate/transaction/parse/update.rs | 100 - dozer-ingestion/oracle/src/lib.rs | 215 --- dozer-ingestion/src/errors.rs | 3 + dozer-ingestion/src/lib.rs | 19 +- dozer-sink-aerospike/Cargo.toml | 14 - .../aerospike-client-sys/Cargo.lock | 7 - .../aerospike-client-sys/Cargo.toml | 15 - .../aerospike-client-sys/aerospike_client.h | 16 - .../aerospike-client-sys/build.rs | 74 - .../aerospike-client-sys/src/lib.rs | 174 -- dozer-sink-aerospike/src/aerospike.rs | 1252 ------------- dozer-sink-aerospike/src/denorm_dag.rs | 1632 ----------------- dozer-sink-aerospike/src/lib.rs | 730 -------- dozer-sink-oracle/Cargo.toml | 12 - dozer-sink-oracle/src/lib.rs | 989 ---------- dozer-types/src/models/sink.rs | 17 +- 48 files changed, 27 insertions(+), 8906 deletions(-) delete mode 100644 dozer-ingestion/aerospike/Cargo.toml delete mode 100644 dozer-ingestion/aerospike/src/connector.rs delete mode 100644 dozer-ingestion/aerospike/src/lib.rs delete mode 100644 dozer-ingestion/aerospike/src/tests.rs delete mode 100644 dozer-ingestion/oracle/Cargo.toml delete mode 100644 dozer-ingestion/oracle/src/connector/join.rs delete mode 100644 dozer-ingestion/oracle/src/connector/listing.rs delete mode 100644 
dozer-ingestion/oracle/src/connector/mapping.rs delete mode 100644 dozer-ingestion/oracle/src/connector/mod.rs delete mode 100644 dozer-ingestion/oracle/src/connector/replicate/log/listing.rs delete mode 100644 dozer-ingestion/oracle/src/connector/replicate/log/merge.rs delete mode 100644 dozer-ingestion/oracle/src/connector/replicate/log/mod.rs delete mode 100644 dozer-ingestion/oracle/src/connector/replicate/log/redo/log_miner.rs delete mode 100644 dozer-ingestion/oracle/src/connector/replicate/log/redo/mod.rs delete mode 100644 dozer-ingestion/oracle/src/connector/replicate/mod.rs delete mode 100644 dozer-ingestion/oracle/src/connector/replicate/transaction/aggregate/commit.rs delete mode 100644 dozer-ingestion/oracle/src/connector/replicate/transaction/aggregate/forest.rs delete mode 100644 dozer-ingestion/oracle/src/connector/replicate/transaction/aggregate/mod.rs delete mode 100644 dozer-ingestion/oracle/src/connector/replicate/transaction/aggregate/op.rs delete mode 100644 dozer-ingestion/oracle/src/connector/replicate/transaction/csf.rs delete mode 100644 dozer-ingestion/oracle/src/connector/replicate/transaction/map.rs delete mode 100644 dozer-ingestion/oracle/src/connector/replicate/transaction/mod.rs delete mode 100644 dozer-ingestion/oracle/src/connector/replicate/transaction/parse/delete.rs delete mode 100644 dozer-ingestion/oracle/src/connector/replicate/transaction/parse/insert.rs delete mode 100644 dozer-ingestion/oracle/src/connector/replicate/transaction/parse/mod.rs delete mode 100644 dozer-ingestion/oracle/src/connector/replicate/transaction/parse/row.rs delete mode 100644 dozer-ingestion/oracle/src/connector/replicate/transaction/parse/update.rs delete mode 100644 dozer-ingestion/oracle/src/lib.rs delete mode 100644 dozer-sink-aerospike/Cargo.toml delete mode 100644 dozer-sink-aerospike/aerospike-client-sys/Cargo.lock delete mode 100644 dozer-sink-aerospike/aerospike-client-sys/Cargo.toml delete mode 100644 
dozer-sink-aerospike/aerospike-client-sys/aerospike_client.h delete mode 100644 dozer-sink-aerospike/aerospike-client-sys/build.rs delete mode 100644 dozer-sink-aerospike/aerospike-client-sys/src/lib.rs delete mode 100644 dozer-sink-aerospike/src/aerospike.rs delete mode 100644 dozer-sink-aerospike/src/denorm_dag.rs delete mode 100644 dozer-sink-aerospike/src/lib.rs delete mode 100644 dozer-sink-oracle/Cargo.toml delete mode 100644 dozer-sink-oracle/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 90ea1fa764..55a96a2c33 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -245,13 +245,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "aerospike-client-sys" -version = "0.1.0" -dependencies = [ - "bindgen", -] - [[package]] name = "aes" version = "0.8.3" @@ -976,15 +969,12 @@ dependencies = [ "itertools 0.12.1", "lazy_static", "lazycell", - "log", - "prettyplease", "proc-macro2", "quote", "regex", "rustc-hash", "shlex", "syn 2.0.53", - "which 4.4.2", ] [[package]] @@ -2789,9 +2779,7 @@ dependencies = [ "clap", "dozer-core", "dozer-ingestion", - "dozer-sink-aerospike", "dozer-sink-clickhouse", - "dozer-sink-oracle", "dozer-sql", "dozer-tracing", "dozer-types", @@ -2872,7 +2860,6 @@ dependencies = [ "bytes", "chrono", "criterion", - "dozer-ingestion-aerospike", "dozer-ingestion-connector", "dozer-ingestion-deltalake", "dozer-ingestion-ethereum", @@ -2882,7 +2869,6 @@ dependencies = [ "dozer-ingestion-mongodb", "dozer-ingestion-mysql", "dozer-ingestion-object-store", - "dozer-ingestion-oracle", "dozer-ingestion-postgres", "dozer-ingestion-snowflake", "dozer-ingestion-webhook", @@ -2900,16 +2886,6 @@ dependencies = [ "url", ] -[[package]] -name = "dozer-ingestion-aerospike" -version = "0.4.0" -dependencies = [ - "actix-web", - "base64 0.21.7", - "dozer-ingestion-connector", - "dozer-sink-aerospike", -] - [[package]] name = "dozer-ingestion-connector" version = "0.4.0" @@ -3001,16 +2977,6 @@ dependencies = [ "url", ] -[[package]] -name = "dozer-ingestion-oracle" 
-version = "0.1.0" -dependencies = [ - "dozer-ingestion-connector", - "env_logger 0.11.2", - "oracle", - "regex", -] - [[package]] name = "dozer-ingestion-postgres" version = "0.4.0" @@ -3053,17 +3019,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "dozer-sink-aerospike" -version = "0.1.0" -dependencies = [ - "aerospike-client-sys", - "dozer-core", - "dozer-types", - "itertools 0.12.1", - "smallvec", -] - [[package]] name = "dozer-sink-clickhouse" version = "0.1.0" @@ -3076,15 +3031,6 @@ dependencies = [ "serde", ] -[[package]] -name = "dozer-sink-oracle" -version = "0.1.0" -dependencies = [ - "dozer-core", - "dozer-types", - "oracle", -] - [[package]] name = "dozer-sql" version = "0.4.0" @@ -5948,31 +5894,6 @@ dependencies = [ "tokio-stream", ] -[[package]] -name = "oracle" -version = "0.5.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfe80334af1fbaea016fbef0af77f5fa32452362e29a039389b8c93737585003" -dependencies = [ - "cc", - "chrono", - "lazy_static", - "oracle_procmacro", - "paste", -] - -[[package]] -name = "oracle_procmacro" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad247f3421d57de56a0d0408d3249d4b1048a522be2013656d92f022c3d8af27" -dependencies = [ - "darling 0.13.4", - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "ordered-float" version = "2.10.1" diff --git a/Cargo.toml b/Cargo.toml index eadc28e819..b47925d2f1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,9 +8,7 @@ members = [ "dozer-tracing", "dozer-tests", "dozer-utils", - "dozer-sink-aerospike", "dozer-sink-clickhouse", - "dozer-sink-oracle", ] resolver = "2" diff --git a/dozer-cli/Cargo.toml b/dozer-cli/Cargo.toml index 838c73dff8..cb0519af7e 100644 --- a/dozer-cli/Cargo.toml +++ b/dozer-cli/Cargo.toml @@ -16,10 +16,7 @@ dozer-core = { path = "../dozer-core" } dozer-sql = { path = "../dozer-sql" } dozer-types = { path = "../dozer-types" } dozer-tracing = { path = "../dozer-tracing" } 
-dozer-sink-aerospike = { path = "../dozer-sink-aerospike" } dozer-sink-clickhouse = { path = "../dozer-sink-clickhouse" } -dozer-sink-oracle = { path = "../dozer-sink-oracle" } - actix-web = "4.4.0" async-trait = "0.1.74" uuid = { version = "1.6.1", features = ["v4", "serde"] } diff --git a/dozer-cli/src/errors.rs b/dozer-cli/src/errors.rs index 0f24c67fe8..ecca284d63 100644 --- a/dozer-cli/src/errors.rs +++ b/dozer-cli/src/errors.rs @@ -88,6 +88,8 @@ pub enum OrchestrationError { LockedNoLockFile, #[error("Command was aborted")] Aborted, + #[error("This feature is only supported in enterprise: {0}")] + UnsupportedFeature(String), } #[derive(Error, Debug)] diff --git a/dozer-cli/src/pipeline/builder.rs b/dozer-cli/src/pipeline/builder.rs index 9371d54e42..45e755a58e 100644 --- a/dozer-cli/src/pipeline/builder.rs +++ b/dozer-cli/src/pipeline/builder.rs @@ -24,9 +24,7 @@ use std::hash::Hash; use tokio::runtime::Runtime; use crate::pipeline::dummy_sink::DummySinkFactory; -use dozer_sink_aerospike::AerospikeSinkFactory; use dozer_sink_clickhouse::ClickhouseSinkFactory; -use dozer_sink_oracle::OracleSinkFactory; use super::source_builder::SourceBuilder; use crate::errors::OrchestrationError; @@ -246,35 +244,7 @@ impl<'a> PipelineBuilder<'a> { id, vec![(get_table_info(&config.table_name)?, DEFAULT_PORT_HANDLE)], ), - SinkConfig::Aerospike(config) => { - let connection = self - .connections - .iter() - .find_map(|conn| match conn { - Connection { - config: ConnectionConfig::Aerospike(conn_config), - name, - } if name == &config.connection => Some(conn_config), - _ => None, - }) - .ok_or_else(|| { - OrchestrationError::ConnectionNotFound(config.connection.clone()) - })?; - let sink_factory = Box::new(AerospikeSinkFactory::new( - connection.clone(), - config.clone(), - )); - let table_infos = config - .tables - .iter() - .enumerate() - .map(|(port, table)| { - let table_info = get_table_info(&table.source_table_name)?; - Ok((table_info, port as PortHandle)) - }) - 
.collect::, OrchestrationError>>()?; - add_sink_to_pipeline(&mut pipeline, sink_factory, id, table_infos); - } + SinkConfig::Clickhouse(config) => { let sink = Box::new(ClickhouseSinkFactory::new(config.clone(), runtime.clone())); @@ -286,28 +256,8 @@ impl<'a> PipelineBuilder<'a> { vec![(table_info, DEFAULT_PORT_HANDLE)], ); } - SinkConfig::Oracle(config) => { - let connection = self - .connections - .iter() - .find_map(|conn| match conn { - Connection { - config: ConnectionConfig::Oracle(conn_config), - name, - } if name == &config.connection => Some(conn_config), - _ => None, - }) - .ok_or_else(|| { - OrchestrationError::ConnectionNotFound(config.connection.clone()) - })?; - let sink = Box::new(OracleSinkFactory::new(connection.clone(), config.clone())); - let table_info = get_table_info(&config.table_name)?; - add_sink_to_pipeline( - &mut pipeline, - sink, - id, - vec![(table_info, DEFAULT_PORT_HANDLE)], - ); + x => { + return Err(OrchestrationError::UnsupportedFeature(x.name())); } } } diff --git a/dozer-ingestion/Cargo.toml b/dozer-ingestion/Cargo.toml index 4b9fcfe2dd..1525d8cc5f 100644 --- a/dozer-ingestion/Cargo.toml +++ b/dozer-ingestion/Cargo.toml @@ -19,9 +19,7 @@ dozer-ingestion-mysql = { path = "./mysql" } dozer-ingestion-object-store = { path = "./object-store", optional = true } dozer-ingestion-postgres = { path = "./postgres" } dozer-ingestion-snowflake = { path = "./snowflake", optional = true } -dozer-ingestion-aerospike = { path = "./aerospike" } dozer-ingestion-webhook = { path = "./webhook" } -dozer-ingestion-oracle = { path = "./oracle" } tokio = { version = "1", features = ["full"] } futures = "0.3.28" diff --git a/dozer-ingestion/aerospike/Cargo.toml b/dozer-ingestion/aerospike/Cargo.toml deleted file mode 100644 index 5b44eb6818..0000000000 --- a/dozer-ingestion/aerospike/Cargo.toml +++ /dev/null @@ -1,13 +0,0 @@ -[package] -name = "dozer-ingestion-aerospike" -version = "0.4.0" -edition = "2021" -license = "AGPL-3.0-or-later" - -# See more 
keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -dozer-ingestion-connector = { path = "../connector" } -actix-web = "4.5.1" -base64 = "0.21.7" -dozer-sink-aerospike = { path = "../../dozer-sink-aerospike" } diff --git a/dozer-ingestion/aerospike/src/connector.rs b/dozer-ingestion/aerospike/src/connector.rs deleted file mode 100644 index c87fdef157..0000000000 --- a/dozer-ingestion/aerospike/src/connector.rs +++ /dev/null @@ -1,984 +0,0 @@ -use dozer_ingestion_connector::dozer_types::epoch::SourceTime; -use dozer_ingestion_connector::dozer_types::errors::internal::BoxedError; -use dozer_ingestion_connector::dozer_types::errors::types::DeserializationError; -use dozer_ingestion_connector::dozer_types::event::Event; -use dozer_ingestion_connector::dozer_types::json_types::serde_json_to_json_value; -use dozer_ingestion_connector::dozer_types::log::{debug, error, info, trace, warn}; -use dozer_ingestion_connector::dozer_types::models::connection::AerospikeConnection; -use dozer_ingestion_connector::dozer_types::models::ingestion_types::{ - IngestionMessage, TransactionInfo, -}; -use dozer_ingestion_connector::dozer_types::node::{NodeHandle, OpIdentifier, SourceState}; -use dozer_ingestion_connector::dozer_types::types::Operation::Insert; -use dozer_ingestion_connector::dozer_types::types::{Field, FieldDefinition, FieldType, Schema}; -use dozer_ingestion_connector::tokio::sync::broadcast::error::RecvError; -use dozer_ingestion_connector::tokio::sync::broadcast::Receiver; -use dozer_ingestion_connector::tokio::sync::{mpsc, oneshot}; -use dozer_ingestion_connector::{ - async_trait, dozer_types, tokio, Connector, Ingestor, SourceSchema, SourceSchemaResult, - TableIdentifier, TableInfo, -}; -use std::collections::HashMap; -use std::ffi::{CStr, CString}; -use std::num::TryFromIntError; - -use std::time::Duration; - -use dozer_ingestion_connector::dozer_types::serde::Deserialize; - -use actix_web::dev::Server; -use 
actix_web::post; -use actix_web::web; -use actix_web::App; -use actix_web::HttpRequest; -use actix_web::HttpServer; -use actix_web::{get, HttpResponse}; - -use dozer_ingestion_connector::dozer_types::rust_decimal::Decimal; -use dozer_ingestion_connector::dozer_types::serde_json; -use dozer_ingestion_connector::dozer_types::serde_json::Value; - -use base64::prelude::*; -use dozer_ingestion_connector::dozer_types::chrono::{DateTime, FixedOffset, NaiveDateTime, Utc}; - -use dozer_ingestion_connector::dozer_types::thiserror::{self, Error}; -use dozer_ingestion_connector::schema_parser::SchemaParser; - -use dozer_sink_aerospike::Client; - -#[derive(Debug, Error)] -pub enum AerospikeConnectorError { - #[error("Cannot start server: {0}")] - CannotStartServer(#[from] std::io::Error), - - #[error("Set name is none. Key: {0:?}, {1:?}, {2:?}")] - SetNameIsNone( - Option, - Option, - Option, - ), - - #[error("PK is none: {0:?}, {1:?}, {2:?}")] - PkIsNone(Option, String, Option), - - #[error("Invalid key value: {0:?}. Key is supposed to have 4 elements.")] - InvalidKeyValue(Vec>), - - #[error("Unsupported type. 
Bin type {bin_type:?}, field type: {field_type:?}")] - UnsupportedTypeForFieldType { - bin_type: String, - field_type: FieldType, - }, - - #[error("Unsupported type: {0}")] - UnsupportedType(FieldType), - - #[error("Invalid timestamp: {0}")] - InvalidTimestamp(i64), - - #[error("Invalid days: {0}")] - InvalidDate(i64), - - #[error("Error decoding base64: {0}")] - BytesDecodingError(#[from] base64::DecodeError), - - #[error("Error parsing float: {0}")] - FloatParsingError(#[from] std::num::ParseFloatError), - - #[error("Error parsing int: {0}")] - IntParsingError(#[from] std::num::ParseIntError), - - #[error("Error casting int: {0}")] - IntCastError(#[from] TryFromIntError), - - #[error("Failed days number parsing")] - ParsingDaysError, - - #[error("Failed timestamp parsing")] - ParsingTimestampFailed, - - #[error("Failed point parsing")] - ParsingPointFailed, - - #[error("Failed int parsing")] - ParsingIntFailed, - - #[error("Failed uint parsing")] - ParsingUIntFailed, - - #[error("Failed float parsing")] - ParsingFloatFailed, - - #[error("Failed decimal parsing")] - ParsingDecimalFailed(#[from] dozer_types::rust_decimal::Error), - - #[error("Schema not found: {0}")] - SchemaNotFound(String), - - #[error("Failed parsing timestamp: {0}")] - TimestampParsingError(#[from] dozer_ingestion_connector::dozer_types::chrono::ParseError), - - #[error("Key is neither string or int")] - KeyNotSupported(Value), - - #[error("Failed to parse json")] - JsonParsingFailed(#[from] DeserializationError), - - #[error("Failed to parse duration")] - ParsingDurationFailed, -} - -#[derive(Deserialize, Debug)] -#[serde(crate = "dozer_types::serde")] -pub struct AerospikeEvent { - msg: String, - key: Vec>, - // gen: u32, - // exp: u32, - lut: u64, - bins: Vec, -} - -#[derive(Deserialize, Debug)] -#[serde(crate = "dozer_types::serde")] -pub struct Bin { - name: String, - value: Option, - r#type: String, -} - -#[derive(Debug)] -pub struct AerospikeConnector { - pub config: AerospikeConnection, 
- node_handle: NodeHandle, - event_receiver: Receiver, -} - -impl AerospikeConnector { - pub fn new( - config: AerospikeConnection, - node_handle: NodeHandle, - event_receiver: Receiver, - ) -> Self { - Self { - config, - node_handle, - event_receiver, - } - } - - fn start_server(&self, server_state: ServerState) -> Result { - let address = format!( - "{}:{}", - self.config.replication.server_address, self.config.replication.server_port - ); - - info!("Starting aerospike replication server on {}", address); - - Ok(HttpServer::new(move || { - App::new() - .app_data(web::JsonConfig::default().error_handler(|err, _req| { - error!("Error parsing json: {:?}", err); - actix_web::error::InternalError::from_response( - "", - HttpResponse::BadRequest() - .content_type("application/json") - .body(format!(r#"{{"error":"{}"}}"#, err)), - ) - .into() - })) - .app_data(web::Data::new(server_state.clone())) - .service(healthcheck) - .service(healthcheck_batch) - .service(event_request_handler) - .service(batch_event_request_handler) - }) - .bind(address)? - .run()) - } - - async fn rewind( - &self, - client: &Client, - dc_name: &str, - namespace: &str, - ) -> Result { - unsafe { - let request = CString::new(format!( - "set-config:context=xdr;dc={dc_name};namespace={namespace};action=add;rewind=all" - ))?; - - // Wait until the replication configuration is set. - // It may take some time, so retrying until rewind returns ok. 
- let mut response: *mut i8 = std::ptr::null_mut(); - client.info(&request, &mut response).map_err(Box::new)?; - - let string = CStr::from_ptr(response); - - let parts: Vec<&str> = string.to_str()?.trim().split('\t').collect(); - - if let Some(status) = parts.get(1) { - Ok(status.replace('\n', "") == *"ok") - } else { - Ok(false) - } - } - } -} - -#[derive(Debug)] -struct PendingMessage { - source_time: SourceTime, - messages: Vec, - sender: oneshot::Sender<()>, -} - -#[derive(Debug)] -struct PendingOperationId { - operation_id: u64, - sender: oneshot::Sender<()>, -} - -/// This loop assigns an operation id to each request and sends it to the ingestor. -async fn ingestor_loop( - mut message_receiver: mpsc::UnboundedReceiver, - ingestor: Ingestor, - operation_id_sender: mpsc::UnboundedSender, -) { - let mut operation_id = 0; - while let Some(message) = message_receiver.recv().await { - let pending_operation_id = PendingOperationId { - operation_id, - sender: message.sender, - }; - - // Propagate panic in the pipeline event processor loop. - operation_id_sender.send(pending_operation_id).unwrap(); - - // Ignore the error, because the server can be down. - for message in message.messages { - let _ = ingestor.handle_message(message).await; - } - let _ = ingestor - .handle_message(IngestionMessage::TransactionInfo(TransactionInfo::Commit { - id: Some(OpIdentifier::new(0, operation_id)), - source_time: Some(message.source_time), - })) - .await; - - operation_id += 1; - } -} - -/// This loop triggers the pending operation id that's before the event's payload. 
-async fn pipeline_event_processor( - node_handle: NodeHandle, - mut operation_id_receiver: mpsc::UnboundedReceiver, - mut event_receiver: Receiver, -) { - let mut operation_id_from_pipeline = None; - let mut pending_operation_id: Option = None; - loop { - if operation_id_from_pipeline - < pending_operation_id - .as_ref() - .map(|operation_id| operation_id.operation_id) - { - // We have pending operation id, wait for pipeline event. - let event = match event_receiver.recv().await { - Ok(event) => event, - Err(RecvError::Closed) => { - // Pipeline is down. - return; - } - Err(RecvError::Lagged(_)) => { - // Ignore lagged events. - continue; - } - }; - if let Some(operation_id) = get_operation_id_from_event(&event, &node_handle) { - operation_id_from_pipeline = Some(operation_id); - } - } else if let Some(pending) = pending_operation_id.take() { - // This operation id is already confirmed by the pipeline. - let _ = pending.sender.send(()); - } else { - // Wait for the next operation id. - let Some(pending) = operation_id_receiver.recv().await else { - // Ingestor is down. - return; - }; - pending_operation_id = Some(pending); - } - } -} - -fn get_operation_id_from_event(event: &Event, node_handle: &NodeHandle) -> Option { - match event { - Event::SinkFlushed { epoch, .. 
} => epoch - .common_info - .source_states - .get(node_handle) - .and_then(|state| match state { - SourceState::Restartable(id) => Some(id.seq_in_tx), - _ => None, - }), - } -} - -fn map_error(error: AerospikeConnectorError) -> HttpResponse { - error!("Aerospike ingestion error: {:?}", error); - HttpResponse::InternalServerError().finish() -} - -#[get("/")] -async fn healthcheck(_req: HttpRequest) -> HttpResponse { - HttpResponse::Ok().finish() -} - -#[get("/batch")] -async fn healthcheck_batch(_req: HttpRequest) -> HttpResponse { - HttpResponse::Ok().finish() -} - -#[post("/")] -async fn event_request_handler( - json: web::Json, - data: web::Data, -) -> HttpResponse { - let event = json.into_inner(); - let state = data.into_inner(); - - trace!(target: "aerospike_http_server", "Event data: {:?}", event); - // TODO: Handle delete - if event.msg != "write" { - return HttpResponse::Ok().finish(); - } - - let source_time = SourceTime::new(event.lut, 1); - let message = map_record(event, &state.tables_index_map); - - trace!(target: "aerospike_http_server", "Mapped message {:?}", message); - match message { - Ok(None) => HttpResponse::Ok().finish(), - Ok(Some(message)) => { - let (sender, receiver) = oneshot::channel::<()>(); - if let Err(e) = state.sender.send(PendingMessage { - source_time, - messages: vec![message], - sender, - }) { - error!("Ingestor is down: {:?}", e); - return HttpResponse::InternalServerError().finish(); - } - if let Err(e) = receiver.await { - error!("Pipeline event processor is down: {:?}", e); - HttpResponse::InternalServerError().finish() - } else { - HttpResponse::Ok().finish() - } - } - Err(e) => map_error(e), - } -} - -#[post("/batch")] -async fn batch_event_request_handler( - json: web::Json>, - data: web::Data, -) -> HttpResponse { - let events = json.into_inner(); - let state = data.into_inner(); - - debug!(target: "aerospike_http_server", "Aerospike events count {:?}", events.len()); - trace!(target: "aerospike_http_server", "Aerospike 
events {:?}", events); - - let mut min_lut = u64::MAX; - let messages = match events - .into_iter() - .filter_map(|e| { - let lut = e.lut; - let msg = map_record(e, &state.tables_index_map).transpose()?; - min_lut = min_lut.min(lut); - Some(msg) - }) - .collect::, AerospikeConnectorError>>() - { - Ok(msgs) => msgs, - Err(e) => return map_error(e), - }; - - debug!(target: "aerospike_http_server", "Mapped {:?} messages", messages.len()); - trace!(target: "aerospike_http_server", "Mapped messages {:?}", messages); - - if !messages.is_empty() { - let (sender, receiver) = oneshot::channel::<()>(); - if let Err(e) = state.sender.send(PendingMessage { - messages, - sender, - source_time: SourceTime::new(min_lut, 1), - }) { - error!("Ingestor is down: {:?}", e); - return HttpResponse::InternalServerError().finish(); - } - - if let Err(e) = receiver.await { - error!("Pipeline event processor is down: {:?}", e); - return HttpResponse::InternalServerError().finish(); - } - } - - HttpResponse::Ok().finish() -} - -#[derive(Clone, Debug)] -struct TableIndexMap { - table_index: usize, - columns_map: HashMap, -} - -#[derive(Clone)] -struct ServerState { - tables_index_map: HashMap, - sender: mpsc::UnboundedSender, -} - -#[async_trait] -impl Connector for AerospikeConnector { - fn types_mapping() -> Vec<(String, Option)> - where - Self: Sized, - { - vec![ - ("str".into(), Some(FieldType::Decimal)), - ("bool".into(), Some(FieldType::Boolean)), - ("int".into(), Some(FieldType::Int)), - ("float".into(), Some(FieldType::Float)), - ("blob".into(), Some(FieldType::Boolean)), - ("list".into(), None), - ("map".into(), None), - ("geojson".into(), None), - ] - } - - async fn validate_connection(&mut self) -> Result<(), BoxedError> { - Ok(()) - } - - async fn list_tables(&mut self) -> Result, BoxedError> { - Ok(self - .config - .sets - .iter() - .map(|set| TableIdentifier { - schema: Some(self.config.namespace.clone()), - name: set.to_string(), - }) - .collect()) - } - - async fn 
validate_tables(&mut self, _tables: &[TableIdentifier]) -> Result<(), BoxedError> { - Ok(()) - } - - async fn list_columns( - &mut self, - _tables: Vec, - ) -> Result, BoxedError> { - Ok(vec![]) - } - - async fn get_schemas( - &mut self, - table_infos: &[TableInfo], - ) -> Result, BoxedError> { - let schemas: HashMap = match self.config.schemas.clone() { - Some(schemas) => { - let schema = SchemaParser::parse_config(&schemas)?; - serde_json::from_str(&schema)? - } - None => table_infos - .iter() - .map(|table_info| { - let table_name = table_info.name.clone(); - let primary_index = table_info - .column_names - .iter() - .position(|n| n == "PK") - .map_or(vec![], |i| vec![i]); - - ( - table_name, - SourceSchema { - schema: Schema { - fields: table_info - .column_names - .iter() - .map(|name| FieldDefinition { - name: name.clone(), - typ: if name == "inserted_at" { - FieldType::Timestamp - } else if name == "PK" { - FieldType::UInt - } else { - FieldType::String - }, - nullable: name != "PK", - source: Default::default(), - description: None, - }) - .collect(), - primary_index, - }, - cdc_type: Default::default(), - }, - ) - }) - .collect(), - }; - - Ok(table_infos - .iter() - .map(|table_info| { - let table_name = table_info.name.clone(); - let schema = schemas - .get(&table_name) - .cloned() - .ok_or(AerospikeConnectorError::SchemaNotFound(table_name.clone()))?; - - let filtered_schema = if table_info.column_names.is_empty() { - schema - } else { - let primary_key_field_names: Vec = schema - .schema - .primary_index - .iter() - .map(|idx| { - schema - .schema - .fields - .get(*idx) - .map(|field| field.name.clone()) - .expect("Field should be present") - }) - .collect(); - - let filtered_fields: Vec = schema - .schema - .fields - .into_iter() - .filter(|field| table_info.column_names.contains(&field.name)) - .collect(); - - let new_primary_index = filtered_fields - .iter() - .enumerate() - .filter_map(|(i, field)| { - if 
primary_key_field_names.contains(&field.name) { - Some(i) - } else { - None - } - }) - .collect(); - - SourceSchema { - schema: Schema { - fields: filtered_fields, - primary_index: new_primary_index, - }, - cdc_type: Default::default(), - } - }; - - Ok(filtered_schema) - }) - .collect()) - } - - async fn serialize_state(&self) -> Result, BoxedError> { - Ok(vec![]) - } - - async fn start( - &mut self, - ingestor: &Ingestor, - tables: Vec, - last_checkpoint: Option, - ) -> Result<(), BoxedError> { - let hosts = CString::new(self.config.hosts.as_str())?; - let client = Client::new(&hosts).map_err(Box::new)?; - - if last_checkpoint.is_none() { - let dc_name = self.config.replication.datacenter.clone(); - let namespace = self.config.namespace.clone(); - - // To read data snapshot we need to rewind xdr stream. - // Before rewinding we need to remove xdr configuration and then add it again. - unsafe { - let request = CString::new(format!( - "set-config:context=xdr;dc={dc_name};namespace={namespace};action=remove" - ))?; - let mut response: *mut i8 = std::ptr::null_mut(); - client.info(&request, &mut response).map_err(Box::new)?; - } - - loop { - if self.rewind(&client, &dc_name, &namespace).await? 
{ - info!("Aerospike replication configuration set successfully"); - break; - } else { - warn!("Aerospike replication configuration set failed"); - tokio::time::sleep(Duration::from_secs(3)).await; - } - } - } - - let mapped_schema = self.get_schemas(&tables).await?; - ingestor - .handle_message(IngestionMessage::TransactionInfo( - TransactionInfo::SnapshottingStarted, - )) - .await?; - ingestor - .handle_message(IngestionMessage::TransactionInfo( - TransactionInfo::SnapshottingDone { id: None }, - )) - .await?; - - let tables_index_map: HashMap = mapped_schema - .into_iter() - .enumerate() - .map(|(table_index, schema)| { - let columns_map: HashMap = schema - .expect("Schema should be present") - .schema - .fields - .iter() - .enumerate() - .map(|(i, field)| (field.name.clone(), (i, field.typ))) - .collect(); - - ( - tables[table_index].name.clone(), - TableIndexMap { - table_index, - columns_map, - }, - ) - }) - .collect(); - - let (message_sender, message_receiver) = mpsc::unbounded_channel(); - let (operation_id_sender, operation_id_receiver) = mpsc::unbounded_channel(); - let ingestor = ingestor.clone(); - tokio::spawn(async move { - ingestor_loop(message_receiver, ingestor, operation_id_sender).await - }); - let node_handle = self.node_handle.clone(); - let event_receiver = self.event_receiver.resubscribe(); - tokio::spawn(async move { - pipeline_event_processor(node_handle, operation_id_receiver, event_receiver).await - }); - let server_state = ServerState { - tables_index_map: tables_index_map.clone(), - sender: message_sender, - }; - - let _server = self.start_server(server_state)?.await; - - Ok(()) - } -} - -fn map_record( - event: AerospikeEvent, - tables_map: &HashMap, -) -> Result, AerospikeConnectorError> { - let key: [Option; 4] = match event.key.try_into() { - Ok(key) => key, - Err(key) => return Err(AerospikeConnectorError::InvalidKeyValue(key)), - }; - let [key0, set_name, key2, pk_in_key] = key; - let Some(set_name) = set_name else { - return 
Err(AerospikeConnectorError::SetNameIsNone( - key0, key2, pk_in_key, - )); - }; - - let table_name = match set_name { - serde_json::Value::String(s) => s.clone(), - _ => { - return Err(AerospikeConnectorError::SetNameIsNone( - key0, key2, pk_in_key, - )) - } - }; - - let Some(TableIndexMap { - columns_map, - table_index, - }) = tables_map.get(&table_name) - else { - return Ok(None); - }; - - let mut fields = vec![Field::Null; columns_map.len()]; - if let Some((pk, _)) = columns_map.get("PK") { - if let Some(pk_in_key) = pk_in_key { - match pk_in_key { - serde_json::Value::String(s) => { - fields[*pk] = Field::String(s.clone()); - } - serde_json::Value::Number(n) => { - fields[*pk] = Field::UInt( - n.as_u64() - .ok_or(AerospikeConnectorError::ParsingUIntFailed)?, - ); - } - v => return Err(AerospikeConnectorError::KeyNotSupported(v)), - } - } else { - return Err(AerospikeConnectorError::PkIsNone(key0, table_name, key2)); - } - } - - if let Some((index, _)) = columns_map.get("inserted_at") { - // Create a NaiveDateTime from the timestamp - let naive = NaiveDateTime::from_timestamp_millis(event.lut as i64) - .ok_or(AerospikeConnectorError::InvalidTimestamp(event.lut as i64))?; - - // Create a normal DateTime from the NaiveDateTime - let datetime: DateTime = - DateTime::::from_naive_utc_and_offset(naive, Utc).fixed_offset(); - - fields[*index] = Field::Timestamp(datetime); - } - - for bin in event.bins { - if let Some((i, typ)) = columns_map.get(bin.name.as_str()) { - fields[*i] = match bin.value { - Some(value) => map_value_to_field(bin.r#type.as_str(), value, *typ)?, - None => Field::Null, - }; - } - } - - Ok(Some(IngestionMessage::OperationEvent { - table_index: *table_index, - op: Insert { - new: dozer_types::types::Record::new(fields), - }, - id: Some(OpIdentifier::new(event.lut, 0)), - })) -} - -pub(crate) fn map_value_to_field( - bin_type: &str, - mut value: Value, - typ: FieldType, -) -> Result { - if value.is_null() { - return Ok(Field::Null); - } - let 
unsupported_type = || AerospikeConnectorError::UnsupportedTypeForFieldType { - bin_type: bin_type.to_owned(), - field_type: typ, - }; - let check_type = |wanted_typ| { - if bin_type == wanted_typ { - Ok(()) - } else { - Err(unsupported_type()) - } - }; - match typ { - FieldType::UInt => { - check_type("int")?; - let number = value.as_number().ok_or_else(unsupported_type)?; - Ok(Field::UInt(number.as_u64().ok_or_else(|| { - AerospikeConnectorError::ParsingUIntFailed - })?)) - } - FieldType::Int => { - check_type("int")?; - let number = value.as_number().ok_or_else(unsupported_type)?; - Ok(Field::Int(number.as_i64().ok_or_else(|| { - AerospikeConnectorError::ParsingIntFailed - })?)) - } - FieldType::Int8 => { - check_type("int8")?; - let string = value.as_str().ok_or_else(unsupported_type)?; - Ok(Field::Int8(string.parse()?)) - } - FieldType::U128 => { - check_type("str")?; - let string = value.as_str().ok_or_else(unsupported_type)?; - Ok(Field::U128(string.parse()?)) - } - FieldType::I128 => { - check_type("str")?; - let string = value.as_str().ok_or_else(unsupported_type)?; - Ok(Field::I128(string.parse()?)) - } - FieldType::Float => { - check_type("float")?; - let number = value.as_number().ok_or_else(unsupported_type)?; - Ok(Field::Float( - number - .as_f64() - .ok_or(AerospikeConnectorError::ParsingFloatFailed)? 
- .into(), - )) - } - FieldType::Boolean => { - check_type("bool")?; - Ok(Field::Boolean( - value.as_bool().ok_or_else(unsupported_type)?, - )) - } - FieldType::String => { - check_type("str")?; - Ok(Field::String( - value.as_str().ok_or_else(unsupported_type)?.to_owned(), - )) - } - FieldType::Text => { - check_type("str")?; - Ok(Field::Text( - value.as_str().ok_or_else(unsupported_type)?.to_owned(), - )) - } - FieldType::Binary => { - check_type("blob")?; - if bin_type != "blob" { - return Err(unsupported_type()); - } - let string = value.as_str().ok_or_else(unsupported_type)?; - Ok(Field::Binary(BASE64_STANDARD.decode(string)?)) - } - FieldType::Decimal => { - check_type("str")?; - let string = value.as_str().ok_or_else(unsupported_type)?; - Ok(Field::Decimal(string.parse()?)) - } - FieldType::Timestamp => { - check_type("str")?; - let string = value.as_str().ok_or_else(unsupported_type)?; - Ok(Field::Timestamp(DateTime::parse_from_rfc3339(string)?)) - } - FieldType::Date => { - check_type("str")?; - let string = value.as_str().ok_or_else(unsupported_type)?; - Ok(Field::Date(string.parse()?)) - } - FieldType::Json => Ok(Field::Json(serde_json_to_json_value(value)?)), - FieldType::Point => { - check_type("geojson")?; - let json = value.as_object_mut().ok_or_else(unsupported_type)?; - if !json.get("type").is_some_and(|type_| type_ == "Point") { - return Err(AerospikeConnectorError::ParsingPointFailed); - } - let Some(Value::Array(coords)) = json.remove("coordinates") else { - return Err(AerospikeConnectorError::ParsingPointFailed); - }; - let p: [Value; 2] = coords - .try_into() - .map_err(|_| AerospikeConnectorError::ParsingPointFailed)?; - if let (Some(x), Some(y)) = (p[0].as_f64(), p[1].as_f64()) { - Ok(Field::Point((x, y).into())) - } else { - Err(AerospikeConnectorError::ParsingPointFailed) - } - } - FieldType::Duration => { - check_type("str")?; - let string = value.as_str().ok_or_else(unsupported_type)?; - let duration = parse_duration(string)?; - 
Ok(Field::Duration(dozer_types::types::DozerDuration( - duration, - dozer_types::types::TimeUnit::Nanoseconds, - ))) - } - } -} - -fn parse_duration(string: &str) -> Result { - let err = |_| AerospikeConnectorError::ParsingDurationFailed; - if !string.get(0..2).is_some_and(|chars| chars == "PT") { - return Err(AerospikeConnectorError::ParsingDurationFailed); - } - let string = &string[2..]; - let to_duration = |scale, number: &Decimal| -> Result { - let as_secs: Decimal = number * Decimal::new(scale, 0); - let secs = as_secs.try_into().map_err(err)?; - let frac = as_secs.fract() * Decimal::new(1_000_000_000, 0); - Ok(Duration::new(secs, frac.try_into().map_err(err)?)) - }; - let (hours, string) = parse_duration_part(string, 'H')?; - let mut duration = to_duration(3600, &hours)?; - if hours.is_integer() { - let (mins, string) = parse_duration_part(string, 'M')?; - duration += to_duration(60, &mins)?; - if mins.is_integer() { - let (secs, string) = parse_duration_part(string, 'S')?; - duration += to_duration(1, &secs)?; - if !string.is_empty() { - return Err(AerospikeConnectorError::ParsingDurationFailed); - } - } else if !string.is_empty() { - return Err(AerospikeConnectorError::ParsingDurationFailed); - } - } else if !string.is_empty() { - return Err(AerospikeConnectorError::ParsingDurationFailed); - } - Ok(duration) -} - -fn parse_duration_part( - string: &str, - delim: char, -) -> Result<(Decimal, &str), AerospikeConnectorError> { - let idx = string.find(delim); - let value = idx - .map_or(Ok(Decimal::ZERO), |idx| string[..idx].parse()) - .map_err(|_| AerospikeConnectorError::ParsingDurationFailed)?; - if let Some(idx) = idx { - Ok((value, &string[idx + 1..])) - } else { - Ok((value, string)) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_parse_duration() { - assert_eq!(parse_duration("PT3H").unwrap(), Duration::new(3600 * 3, 0)); - assert_eq!(parse_duration("PT3M").unwrap(), Duration::new(60 * 3, 0)); - 
assert_eq!(parse_duration("PT3S").unwrap(), Duration::new(3, 0)); - - assert_eq!( - parse_duration("PT3H3S").unwrap(), - Duration::new(3600 * 3 + 3, 0) - ); - - assert_eq!( - parse_duration("PT3.2H").unwrap(), - Duration::new(3600 * 3 + 12 * 60, 0) - ); - - assert_eq!( - parse_duration("PT3.2H").unwrap(), - Duration::new(3600 * 3 + 12 * 60, 0) - ); - assert!(parse_duration("PT3.2H2M").is_err()); - assert_eq!( - parse_duration("PT0.000123S").unwrap(), - Duration::new(0, 123_000) - ); - } -} diff --git a/dozer-ingestion/aerospike/src/lib.rs b/dozer-ingestion/aerospike/src/lib.rs deleted file mode 100644 index 4e1dde9fda..0000000000 --- a/dozer-ingestion/aerospike/src/lib.rs +++ /dev/null @@ -1,4 +0,0 @@ -pub mod connector; - -#[cfg(test)] -mod tests; diff --git a/dozer-ingestion/aerospike/src/tests.rs b/dozer-ingestion/aerospike/src/tests.rs deleted file mode 100644 index 04b1717871..0000000000 --- a/dozer-ingestion/aerospike/src/tests.rs +++ /dev/null @@ -1,78 +0,0 @@ -use std::time::Duration; - -use crate::connector::map_value_to_field; -use base64::prelude::BASE64_STANDARD; -use base64::Engine; -use dozer_ingestion_connector::dozer_types::ordered_float::OrderedFloat; -use dozer_ingestion_connector::dozer_types::rust_decimal::Decimal; -use dozer_ingestion_connector::dozer_types::serde_json::{json, Value}; -use dozer_ingestion_connector::dozer_types::types::{Field, FieldType}; - -#[test] -pub fn test_type_conversion() { - assert_eq!( - map_value_to_field("str", Value::Null, FieldType::UInt).unwrap(), - Field::Null - ); - - assert_eq!( - map_value_to_field("bool", Value::Bool(true), FieldType::Boolean).unwrap(), - Field::Boolean(true) - ); - assert!(map_value_to_field("str", Value::String("hello".into()), FieldType::Boolean).is_err()); - - assert_eq!( - map_value_to_field("int", json!(30), FieldType::UInt).unwrap(), - (Field::UInt(30)) - ); - assert_eq!( - map_value_to_field("int", json!(30), FieldType::Int).unwrap(), - (Field::Int(30)) - ); - 
assert!(map_value_to_field("float", json!(30), FieldType::UInt).is_err()); - - assert_eq!( - map_value_to_field("float", json!(34.35), FieldType::Float).unwrap(), - (Field::Float(OrderedFloat(34.35))) - ); - - assert_eq!( - map_value_to_field("float", json!(30), FieldType::Float).unwrap(), - (Field::Float(OrderedFloat(30.))) - ); - assert!(map_value_to_field("int", json!(1), FieldType::Float).is_err()); - - assert_eq!( - map_value_to_field("str", json!("47"), FieldType::String).unwrap(), - Field::String("47".to_string()) - ); - assert_eq!( - map_value_to_field("str", json!("48"), FieldType::Text).unwrap(), - Field::Text("48".to_string()) - ); - - assert_eq!( - map_value_to_field( - "blob", - json!(BASE64_STANDARD.encode(vec![52, 57])), - FieldType::Binary - ) - .unwrap(), - Field::Binary(vec![52, 57]) - ); - - assert_eq!( - map_value_to_field("str", json!("30.42"), FieldType::Decimal).unwrap(), - Field::Decimal(Decimal::new(3042, 2)) - ); - - assert_eq!( - map_value_to_field("str", json!("PT3.0012S"), FieldType::Duration).unwrap(), - Field::Duration( - dozer_ingestion_connector::dozer_types::types::DozerDuration( - Duration::new(3, 1_200_000), - dozer_ingestion_connector::dozer_types::types::TimeUnit::Nanoseconds - ) - ) - ); -} diff --git a/dozer-ingestion/oracle/Cargo.toml b/dozer-ingestion/oracle/Cargo.toml deleted file mode 100644 index 2532eab1ad..0000000000 --- a/dozer-ingestion/oracle/Cargo.toml +++ /dev/null @@ -1,15 +0,0 @@ -[package] -name = "dozer-ingestion-oracle" -version = "0.1.0" -edition = "2021" -license = "AGPL-3.0-or-later" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -dozer-ingestion-connector = { path = "../connector" } -oracle = { version = "0.5.7", features = ["chrono", "stmt_without_lifetime"] } -regex = "1.10.3" - -[dev-dependencies] -env_logger = "0.11.1" diff --git a/dozer-ingestion/oracle/src/connector/join.rs b/dozer-ingestion/oracle/src/connector/join.rs deleted 
file mode 100644 index 0073463449..0000000000 --- a/dozer-ingestion/oracle/src/connector/join.rs +++ /dev/null @@ -1,60 +0,0 @@ -use std::collections::{HashMap, HashSet}; - -use super::listing::{Constraint, ConstraintColumn, TableColumn}; - -#[derive(Debug)] -pub struct Column { - pub name: String, - pub data_type: Option, - pub nullable: Option, - pub is_primary_key: bool, - pub precision: Option, - pub scale: Option, -} - -pub fn join_columns_constraints( - table_columns: Vec, - constraint_columns: Vec, - constraints: Vec, -) -> HashMap<(String, String), Vec> { - let constraints = constraints.into_iter().collect::>(); - let mut all_primary_key_columns = HashSet::<(String, String, String)>::new(); - for constraint_column in constraint_columns { - let Some(column_name) = constraint_column.column_name else { - continue; - }; - let constraint = Constraint { - owner: Some(constraint_column.owner.clone()), - constraint_name: Some(constraint_column.constraint_name), - }; - if constraints.contains(&constraint) { - all_primary_key_columns.insert(( - constraint_column.owner, - constraint_column.table_name, - column_name, - )); - } - } - - let mut table_to_columns = HashMap::<(String, String), Vec>::new(); - for table_column in table_columns { - let column_triple = ( - table_column.owner, - table_column.table_name, - table_column.column_name, - ); - let is_primary_key = all_primary_key_columns.contains(&column_triple); - let column = Column { - name: column_triple.2, - data_type: table_column.data_type, - nullable: table_column.nullable, - is_primary_key, - precision: table_column.precision, - scale: table_column.scale, - }; - let table_pair = (column_triple.0, column_triple.1); - table_to_columns.entry(table_pair).or_default().push(column); - } - - table_to_columns -} diff --git a/dozer-ingestion/oracle/src/connector/listing.rs b/dozer-ingestion/oracle/src/connector/listing.rs deleted file mode 100644 index 33abd6f2bc..0000000000 --- 
a/dozer-ingestion/oracle/src/connector/listing.rs +++ /dev/null @@ -1,132 +0,0 @@ -use dozer_ingestion_connector::dozer_types::log::debug; -use oracle::Connection; - -use super::Error; - -#[derive(Debug, Clone)] -pub struct TableColumn { - pub owner: String, - pub table_name: String, - pub column_name: String, - pub data_type: Option, - pub nullable: Option, - pub precision: Option, - pub scale: Option, -} - -impl TableColumn { - pub fn list(connection: &Connection, schemas: &[String]) -> Result, Error> { - assert!(!schemas.is_empty()); - let sql = " - SELECT OWNER, TABLE_NAME, COLUMN_NAME, DATA_TYPE, NULLABLE, DATA_PRECISION, DATA_SCALE - FROM ALL_TAB_COLUMNS - WHERE OWNER IN (SELECT COLUMN_VALUE FROM TABLE(:2)) - "; - let schemas = super::string_collection(connection, schemas)?; - debug!("{}, {}", sql, schemas); - let rows = connection.query_as::<( - String, - String, - String, - Option, - Option, - Option, - Option, - )>(sql, &[&schemas])?; - - let mut columns = Vec::new(); - for row in rows { - let (owner, table_name, column_name, data_type, nullable, precision, scale) = row?; - let column = TableColumn { - owner, - table_name, - column_name, - data_type, - nullable, - precision, - scale, - }; - columns.push(column); - } - Ok(columns) - } -} - -#[derive(Debug, Clone)] -pub struct ConstraintColumn { - pub owner: String, - pub constraint_name: String, - pub table_name: String, - pub column_name: Option, -} - -impl ConstraintColumn { - pub fn list( - connection: &Connection, - schemas: &[String], - ) -> Result, Error> { - assert!(!schemas.is_empty()); - let sql = " - SELECT - OWNER, - CONSTRAINT_NAME, - TABLE_NAME, - COLUMN_NAME - FROM ALL_CONS_COLUMNS - WHERE OWNER IN (SELECT COLUMN_VALUE FROM TABLE(:2)) - "; - let schemas = super::string_collection(connection, schemas)?; - debug!("{}, {}", sql, schemas); - let rows = - connection.query_as::<(String, String, String, Option)>(sql, &[&schemas])?; - - let mut columns = Vec::new(); - for row in rows { - let (owner, 
constraint_name, table_name, column_name) = row?; - let column = ConstraintColumn { - owner, - constraint_name, - table_name, - column_name, - }; - columns.push(column); - } - Ok(columns) - } -} - -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct Constraint { - pub owner: Option, - pub constraint_name: Option, -} - -impl Constraint { - pub fn list(connection: &Connection, schemas: &[String]) -> Result, Error> { - assert!(!schemas.is_empty()); - let sql = " - SELECT - OWNER, - CONSTRAINT_NAME - FROM ALL_CONSTRAINTS - WHERE - OWNER IN (SELECT COLUMN_VALUE FROM TABLE(:2)) - AND - CONSTRAINT_TYPE = 'P' - "; - let schemas = super::string_collection(connection, schemas)?; - debug!("{}, {}", sql, schemas); - let rows = connection.query_as::<(Option, Option)>(sql, &[&schemas])?; - - let mut constraints = Vec::new(); - for row in rows { - let (owner, constraint_name) = row?; - let constraint = Constraint { - owner, - constraint_name, - }; - constraints.push(constraint); - } - Ok(constraints) - } -} diff --git a/dozer-ingestion/oracle/src/connector/mapping.rs b/dozer-ingestion/oracle/src/connector/mapping.rs deleted file mode 100644 index e109cbbbfa..0000000000 --- a/dozer-ingestion/oracle/src/connector/mapping.rs +++ /dev/null @@ -1,242 +0,0 @@ -use std::{collections::HashMap, str::FromStr}; - -use dozer_ingestion_connector::{ - dozer_types::{ - chrono::{DateTime, NaiveDate, Utc}, - log::warn, - ordered_float::OrderedFloat, - rust_decimal::Decimal, - thiserror, - types::{Field, FieldDefinition, FieldType, Record, Schema, SourceDefinition}, - }, - CdcType, SourceSchema, -}; -use oracle::Row; - -use super::{join::Column, Error}; - -#[derive(Debug, Clone, Copy)] -pub struct MappedColumn { - pub typ: FieldType, - pub nullable: bool, -} - -#[derive(Debug, Clone, thiserror::Error)] -pub enum DataTypeError { - #[error("unsupported data type: {0}")] - UnsupportedDataType(String), - #[error("column {schema}.{table_name}.{column_name} has null data type")] - 
ColumnDataTypeIsNull { - schema: String, - table_name: String, - column_name: String, - }, -} - -fn map_data_type( - schema: &str, - table_name: &str, - column_name: &str, - data_type: Option<&str>, - nullable: Option<&str>, - precision: Option, - scale: Option, -) -> Result { - let data_type = data_type.ok_or_else(|| DataTypeError::ColumnDataTypeIsNull { - schema: schema.to_string(), - table_name: table_name.to_string(), - column_name: column_name.to_string(), - })?; - let typ = if data_type.starts_with("TIMESTAMP") { - FieldType::Timestamp - } else { - match data_type { - "VARCHAR2" => Ok(FieldType::String), - "NVARCHAR2" => Ok(FieldType::String), - "INTEGER" => Ok(FieldType::I128), - "NUMBER" => match (precision, scale) { - (Some(precision), Some(0)) if precision <= 19 => Ok(FieldType::Int), - (_, Some(0)) => Ok(FieldType::I128), - _ => Ok(FieldType::Decimal), - }, - "FLOAT" => Ok(FieldType::Float), - "DATE" => Ok(FieldType::Date), - "BINARY_FLOAT" => Ok(FieldType::Float), - "BINARY_DOUBLE" => Ok(FieldType::Float), - "RAW" => Ok(FieldType::Binary), - "ROWID" => Ok(FieldType::String), - "CHAR" => Ok(FieldType::String), - "NCHAR" => Ok(FieldType::String), - "CLOB" => Ok(FieldType::String), - "NCLOB" => Ok(FieldType::String), - "BLOB" => Ok(FieldType::Binary), - other => Err(DataTypeError::UnsupportedDataType(other.to_string())), - }? 
- }; - let nullable = nullable != Some("N"); - Ok(MappedColumn { typ, nullable }) -} - -pub fn map_row(schema: &Schema, row: Row) -> Result { - if schema.fields.len() != row.sql_values().len() { - return Err(Error::ColumnCountMismatch { - expected: schema.fields.len(), - actual: row.sql_values().len(), - }); - } - - let values = schema - .fields - .iter() - .enumerate() - .map(|(index, field)| map_field(index, field, &row)) - .collect::, _>>()?; - Ok(Record::new(values)) -} - -fn map_field(index: usize, field: &FieldDefinition, row: &Row) -> Result { - Ok(match (field.typ, field.nullable) { - (FieldType::Int, true) => row - .get::<_, Option>(index)? - .map_or(Field::Null, Field::Int), - (FieldType::Int, false) => Field::Int(row.get(index)?), - (FieldType::UInt, true) => row - .get::<_, Option>(index)? - .map_or(Field::Null, Field::UInt), - (FieldType::UInt, false) => Field::UInt(row.get(index)?), - (FieldType::Float, true) => row - .get::<_, Option>(index)? - .map_or(Field::Null, |value| Field::Float(OrderedFloat(value))), - (FieldType::Float, false) => Field::Float(OrderedFloat(row.get(index)?)), - (FieldType::Decimal, true) => match row.get::<_, Option>(index)? { - Some(decimal) => Field::Decimal(Decimal::from_str(&decimal)?), - None => Field::Null, - }, - (FieldType::Decimal, false) => { - Field::Decimal(Decimal::from_str(&row.get::<_, String>(index)?)?) - } - (FieldType::String, true) => row - .get::<_, Option>(index)? - .map_or(Field::Null, Field::String), - (FieldType::String, false) => Field::String(row.get(index)?), - (FieldType::Binary, true) => row - .get::<_, Option>>(index)? - .map_or(Field::Null, Field::Binary), - (FieldType::Binary, false) => Field::Binary(row.get(index)?), - (FieldType::Date, true) => row - .get::<_, Option>(index)? - .map_or(Field::Null, Field::Date), - (FieldType::Date, false) => Field::Date(row.get(index)?), - (FieldType::Timestamp, true) => row - .get::<_, Option>>(index)? 
- .map_or(Field::Null, |value| Field::Timestamp(value.fixed_offset())), - (FieldType::Timestamp, false) => { - Field::Timestamp(row.get::<_, DateTime>(index)?.fixed_offset()) - } - _ => unreachable!(), - }) -} - -#[derive(Debug, Clone)] -pub struct MappedColumnResult { - pub is_primary_key: bool, - pub is_used: bool, - pub map_result: Result, -} - -pub type ColumnMap = HashMap; - -pub fn map_tables( - tables: HashMap<(String, String), Vec>, -) -> HashMap<(String, String), ColumnMap> { - tables - .into_iter() - .map(|((schema, table_name), columns)| { - let column_map = map_columns(&schema, &table_name, columns); - ((schema, table_name), column_map) - }) - .collect() -} - -fn map_columns(schema: &str, table_name: &str, columns: Vec) -> ColumnMap { - columns - .into_iter() - .map(|column| { - let map_result = map_data_type( - schema, - table_name, - &column.name, - column.data_type.as_deref(), - column.nullable.as_deref(), - column.precision, - column.scale, - ); - ( - column.name, - MappedColumnResult { - is_primary_key: column.is_primary_key, - is_used: false, - map_result, - }, - ) - }) - .collect() -} - -pub fn decide_schema( - connection: &str, - schema: Option, - table_name: String, - column_names: &[String], - mut columns: ColumnMap, -) -> Result { - let mut fields = vec![]; - let mut primary_index = vec![]; - for column_name in column_names { - let Some(column) = columns.get_mut(column_name) else { - return Err(Error::ColumnNotFound { - schema, - table_name, - column_name: column_name.clone(), - }); - }; - - column.is_used = true; - if column.is_primary_key { - primary_index.push(fields.len()); - } - - match &column.map_result { - Ok(column) => fields.push(FieldDefinition { - name: column_name.clone(), - typ: column.typ, - nullable: column.nullable, - source: SourceDefinition::Table { - connection: connection.to_string(), - name: table_name.clone(), - }, - description: None, - }), - Err(err) => return Err(Error::DataType(err.clone())), - } - } - - if let 
Some((column_name, _)) = columns - .iter() - .find(|(_, column)| !column.is_used && column.is_primary_key) - { - warn!( - "Primary key column {} of table {} in connection {} is not used. Dropping primary key.", - column_name, table_name, connection - ); - primary_index.clear(); - } - - Ok(SourceSchema { - schema: Schema { - fields, - primary_index, - }, - cdc_type: CdcType::OnlyPK, // Doesn't matter - }) -} diff --git a/dozer-ingestion/oracle/src/connector/mod.rs b/dozer-ingestion/oracle/src/connector/mod.rs deleted file mode 100644 index 70ea0b617f..0000000000 --- a/dozer-ingestion/oracle/src/connector/mod.rs +++ /dev/null @@ -1,548 +0,0 @@ -use std::{ - collections::{HashMap, HashSet}, - num::ParseFloatError, - sync::Arc, - time::Duration, -}; - -use dozer_ingestion_connector::{ - dozer_types::{ - chrono, - epoch::SourceTime, - log::{debug, error}, - models::ingestion_types::{IngestionMessage, OracleReplicator, TransactionInfo}, - node::OpIdentifier, - rust_decimal::{self, Decimal}, - thiserror, - types::{FieldType, Operation, Schema}, - }, - Ingestor, SourceSchema, TableIdentifier, TableInfo, -}; -use oracle::{ - sql_type::{Collection, ObjectType}, - Connection, -}; - -#[derive(Debug, Clone)] -pub struct Connector { - connection_name: String, - connection: Arc, - username: String, - batch_size: usize, - replicator: OracleReplicator, -} - -#[derive(Debug, thiserror::Error)] -pub enum Error { - #[error("oracle error: {0:?}")] - Oracle(#[from] oracle::Error), - #[error("pdb not found: {0}")] - PdbNotFound(String), - #[error("table not found: {0:?}")] - TableNotFound(TableIdentifier), - #[error("data type: {0}")] - DataType(#[from] mapping::DataTypeError), - #[error("column {schema:?}.{table_name}.{column_name} not found")] - ColumnNotFound { - schema: Option, - table_name: String, - column_name: String, - }, - #[error("column count mismatch: expected {expected}, actual {actual}")] - ColumnCountMismatch { expected: usize, actual: usize }, - #[error("cannot convert 
Oracle number to decimal: {0}")] - NumberToDecimal(#[from] rust_decimal::Error), - #[error("insert failed to match: {0}")] - InsertFailedToMatch(String), - #[error("delete failed to match: {0}")] - DeleteFailedToMatch(String), - #[error("update failed to match: {0}")] - UpdateFailedToMatch(String), - #[error("field {0} not found")] - FieldNotFound(String), - #[error("null value for non-nullable field {0}")] - NullValue(String), - #[error("cannot parse float: {0}")] - ParseFloat(#[from] ParseFloatError), - #[error("cannot parse date time from {1}: {0}")] - ParseDateTime(#[source] chrono::ParseError, String), - #[error("got overflow float number {0}")] - FloatOverflow(Decimal), - #[error("got error when parsing uint {0}")] - ParseUIntFailed(Decimal), - #[error("got error when parsing int {0}")] - ParseIntFailed(Decimal), - #[error("type mismatch for {field}, expected {expected:?}, actual {actual:?}")] - TypeMismatch { - field: String, - expected: FieldType, - actual: FieldType, - }, -} - -/// `oracle`'s `ToSql` implementation for `&str` uses `NVARCHAR2` type, which Oracle expects to be UTF16 encoded by default. -/// Here we use `VARCHAR2` type instead, which Oracle expects to be UTF8 encoded by default. -/// This is a macro because it references a temporary `OracleType`. -macro_rules! 
str_to_sql { - ($s:expr) => { - // `s.len()` is the upper bound of `s.chars().count()` - ( - &$s, - &::oracle::sql_type::OracleType::Varchar2($s.len() as u32), - ) - }; -} - -pub type Scn = u64; - -impl Connector { - pub fn new( - connection_name: String, - username: String, - password: &str, - connect_string: &str, - batch_size: usize, - replicator: OracleReplicator, - ) -> Result { - let connection = Connection::connect(&username, password, connect_string)?; - - Ok(Self { - connection_name, - connection: Arc::new(connection), - username, - batch_size, - replicator, - }) - } - - pub fn get_con_id(&mut self, pdb: &str) -> Result { - let sql = "SELECT CON_NAME_TO_ID(:1) FROM DUAL"; - let con_id = self - .connection - .query_row_as::>(sql, &[&str_to_sql!(pdb)])? - .ok_or_else(|| Error::PdbNotFound(pdb.to_string())); - self.connection.commit()?; - con_id - } - - pub fn list_tables(&mut self, schemas: &[String]) -> Result, Error> { - let rows = if schemas.is_empty() { - let sql = "SELECT OWNER, TABLE_NAME FROM ALL_TABLES"; - debug!("{}", sql); - self.connection.query_as::<(String, String)>(sql, &[])? - } else { - let sql = " - SELECT OWNER, TABLE_NAME - FROM ALL_TABLES - WHERE OWNER IN (SELECT COLUMN_VALUE FROM TABLE(:2)) - "; - let owners = string_collection(&self.connection, schemas)?; - debug!("{}, {}", sql, owners); - self.connection - .query_as::<(String, String)>(sql, &[&owners])? - }; - - let tables = rows - .map(|row| { - row.map(|(owner, table_name)| TableIdentifier { - schema: Some(owner), - name: table_name, - }) - .map_err(Into::into) - }) - .collect(); - self.connection.commit()?; - tables - } - - pub fn list_columns(&mut self, tables: Vec) -> Result, Error> { - // List all tables and columns. 
- let schemas = tables - .iter() - .map(|table| { - table - .schema - .clone() - .unwrap_or_else(|| self.username.clone()) - }) - .collect::>(); - let table_columns = - listing::TableColumn::list(&self.connection, &schemas.into_iter().collect::>())?; - let mut table_to_columns = HashMap::<(String, String), Vec>::new(); - for table_column in table_columns { - let table_pair = (table_column.owner, table_column.table_name); - table_to_columns - .entry(table_pair) - .or_default() - .push(table_column.column_name); - } - - // Collect columns for requested tables. - let mut result = vec![]; - for table in tables { - let schema = table - .schema - .clone() - .unwrap_or_else(|| self.username.clone()); - let table_pair = (schema, table.name.clone()); - let column_names = table_to_columns - .remove(&table_pair) - .ok_or_else(|| Error::TableNotFound(table.clone()))?; - result.push(TableInfo { - schema: table.schema, - name: table.name, - column_names, - }); - } - self.connection.commit()?; - Ok(result) - } - - pub fn get_schemas( - &mut self, - table_infos: &[TableInfo], - ) -> Result>, Error> { - // Collect all tables and columns. - let schemas = table_infos - .iter() - .map(|table| { - table - .schema - .clone() - .unwrap_or_else(|| self.username.clone()) - }) - .collect::>() - .into_iter() - .collect::>(); - let table_columns = listing::TableColumn::list(&self.connection, &schemas)?; - let constraint_columns = - listing::ConstraintColumn::list(&self.connection, &schemas).unwrap(); - let constraints = listing::Constraint::list(&self.connection, &schemas).unwrap(); - let table_columns = - join::join_columns_constraints(table_columns, constraint_columns, constraints); - - // Map all the columns. 
- let mut table_columns = mapping::map_tables(table_columns); - - // Decide `SourceSchemaResult` for each `table_info` - let mut result = vec![]; - for table_info in table_infos { - let schema = table_info - .schema - .clone() - .unwrap_or_else(|| self.username.clone()); - let table_pair = (schema, table_info.name.clone()); - let columns = table_columns.remove(&table_pair).ok_or_else(|| { - Error::TableNotFound(TableIdentifier { - schema: table_info.schema.clone(), - name: table_info.name.clone(), - }) - })?; - result.push(mapping::decide_schema( - &self.connection_name, - table_info.schema.clone(), - table_pair.1, - &table_info.column_names, - columns, - )); - } - self.connection.commit()?; - - Ok(result) - } - - pub fn snapshot(&mut self, ingestor: &Ingestor, tables: Vec) -> Result { - let schemas = self - .get_schemas(&tables)? - .into_iter() - .collect::, _>>()?; - - let sql = "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE"; - debug!("{}", sql); - self.connection.execute(sql, &[])?; - - for (table_index, (table, schema)) in tables.into_iter().zip(schemas).enumerate() { - let columns = table.column_names.join(", "); - let owner = table.schema.unwrap_or_else(|| self.username.clone()); - let sql = format!("SELECT {} FROM {}.{}", columns, owner, table.name); - debug!("{}", sql); - let rows = self.connection.query(&sql, &[])?; - - let mut batch = Vec::with_capacity(self.batch_size); - for row in rows { - batch.push(mapping::map_row(&schema.schema, row?)?); - if batch.len() >= self.batch_size - && ingestor - .blocking_handle_message(IngestionMessage::OperationEvent { - table_index, - op: Operation::BatchInsert { - new: std::mem::take(&mut batch), - }, - id: None, - }) - .is_err() - { - return self.get_scn_and_commit(); - } - } - - if !batch.is_empty() - && ingestor - .blocking_handle_message(IngestionMessage::OperationEvent { - table_index, - op: Operation::BatchInsert { new: batch }, - id: None, - }) - .is_err() - { - return self.get_scn_and_commit(); - } - } - - 
self.get_scn_and_commit() - } - - fn get_scn_and_commit(&mut self) -> Result { - let sql = "SELECT DBMS_FLASHBACK.GET_SYSTEM_CHANGE_NUMBER() FROM DUAL"; - let scn = self.connection.query_row_as::(sql, &[])?; - self.connection.commit()?; - Ok(scn) - } - - pub fn replicate( - &mut self, - ingestor: &Ingestor, - tables: Vec, - schemas: Vec, - checkpoint: Scn, - con_id: Option, - ) { - match self.replicator { - OracleReplicator::LogMiner { - poll_interval_in_milliseconds, - } => self.replicate_log_miner( - ingestor, - tables, - schemas, - checkpoint, - con_id, - Duration::from_millis(poll_interval_in_milliseconds), - ), - OracleReplicator::DozerLogReader => unimplemented!("dozer log reader"), - } - } - - fn replicate_log_miner( - &mut self, - ingestor: &Ingestor, - tables: Vec, - schemas: Vec, - checkpoint: Scn, - con_id: Option, - poll_interval: Duration, - ) { - let start_scn = checkpoint + 1; - let table_pair_to_index = tables - .into_iter() - .enumerate() - .map(|(index, table)| { - let schema = table.schema.unwrap_or_else(|| self.username.clone()); - ((schema, table.name), index) - }) - .collect::>(); - let processor = replicate::Processor::new(start_scn, table_pair_to_index, schemas); - - let (sender, receiver) = std::sync::mpsc::sync_channel(100); - let handle = { - let connection = self.connection.clone(); - let ingestor = ingestor.clone(); - std::thread::spawn(move || { - replicate::log_miner_loop( - &connection, - start_scn, - con_id, - poll_interval, - sender, - &ingestor, - ) - }) - }; - - for transaction in processor.process(receiver) { - let transaction = match transaction { - Ok(transaction) => transaction, - Err(e) => { - error!("Error during transaction processing: {e}"); - continue; - } - }; - - for (seq, (table_index, op)) in transaction.operations.into_iter().enumerate() { - if ingestor - .blocking_handle_message(IngestionMessage::OperationEvent { - table_index, - op, - id: Some(OpIdentifier::new(transaction.commit_scn, seq as u64)), - }) - 
.is_err() - { - return; - }; - } - - if ingestor - .blocking_handle_message(IngestionMessage::TransactionInfo( - TransactionInfo::Commit { - id: Some(OpIdentifier::new(transaction.commit_scn, 0)), - source_time: Some(SourceTime::from_chrono( - &transaction.commit_timestamp, - 1000, - )), - }, - )) - .is_err() - { - return; - } - } - - handle.join().unwrap(); - } -} - -mod join; -mod listing; -mod mapping; -mod replicate; - -const TEMP_DOZER_TYPE_NAME: &str = "TEMP_DOZER_TYPE"; - -fn temp_varray_of_vchar2( - connection: &Connection, - num_strings: usize, - max_num_chars: usize, -) -> Result { - let sql = format!( - "CREATE OR REPLACE TYPE {} AS VARRAY({}) OF VARCHAR2({})", - TEMP_DOZER_TYPE_NAME, num_strings, max_num_chars - ); - debug!("{}", sql); - connection.execute(&sql, &[])?; - connection - .object_type(TEMP_DOZER_TYPE_NAME) - .map_err(Into::into) -} - -fn string_collection(connection: &Connection, strings: &[String]) -> Result { - let temp_type = temp_varray_of_vchar2( - connection, - strings.len(), - strings.iter().map(|s| s.len()).max().unwrap(), - )?; - let mut collection = temp_type.new_collection()?; - for string in strings { - collection.push(&str_to_sql!(*string))?; - } - Ok(collection) -} - -mod tests { - #[test] - #[ignore] - fn test_connector() { - use dozer_ingestion_connector::{ - dozer_types::models::ingestion_types::OracleReplicator, IngestionConfig, Ingestor, - }; - use dozer_ingestion_connector::{ - dozer_types::{models::ingestion_types::IngestionMessage, types::Operation}, - IngestionIterator, - }; - use std::time::Instant; - - fn row_count(message: &IngestionMessage) -> usize { - match message { - IngestionMessage::OperationEvent { op, .. } => match op { - Operation::BatchInsert { new } => new.len(), - Operation::Insert { .. } => 1, - Operation::Delete { .. } => 1, - Operation::Update { .. 
} => 1, - }, - _ => 0, - } - } - - fn estimate_throughput(iterator: IngestionIterator) { - let mut tic = None; - let mut count = 0; - let print_count_interval = 10_000; - let mut count_mod_interval = 0; - for message in iterator { - if tic.is_none() { - tic = Some(Instant::now()); - } - - count += row_count(&message); - let new_count_mod_interval = count / print_count_interval; - if new_count_mod_interval > count_mod_interval { - count_mod_interval = new_count_mod_interval; - println!("{} rows in {:?}", count, tic.unwrap().elapsed()); - } - } - println!("{} rows in {:?}", count, tic.unwrap().elapsed()); - println!( - "Throughput: {} rows/s", - count as f64 / tic.unwrap().elapsed().as_secs_f64() - ); - } - - env_logger::init(); - - let replicate_user = "DOZER"; - let data_user = "DOZER"; - let host = "database-1.cxtwfj9nkwtu.ap-southeast-1.rds.amazonaws.com"; - let sid = "ORCL"; - - let mut connector = super::Connector::new( - "oracle".into(), - replicate_user.into(), - "123", - &format!("{}:{}/{}", host, 1521, sid), - 100_000, - OracleReplicator::DozerLogReader, - ) - .unwrap(); - let tables = connector.list_tables(&[data_user.into()]).unwrap(); - let tables = connector.list_columns(tables).unwrap(); - let schemas = connector.get_schemas(&tables).unwrap(); - let schemas = schemas.into_iter().map(Result::unwrap).collect::>(); - dbg!(&schemas); - let (ingestor, iterator) = Ingestor::initialize_channel(IngestionConfig::default()); - let handle = { - let tables = tables.clone(); - std::thread::spawn(move || connector.snapshot(&ingestor, tables)) - }; - - estimate_throughput(iterator); - let checkpoint = handle.join().unwrap().unwrap(); - - let mut connector = super::Connector::new( - "oracle".into(), - replicate_user.into(), - "123", - &format!("{}:{}/{}", host, 1521, sid), - 1, - OracleReplicator::LogMiner { - poll_interval_in_milliseconds: 1000, - }, - ) - .unwrap(); - let (ingestor, iterator) = Ingestor::initialize_channel(IngestionConfig::default()); - let schemas 
= schemas.into_iter().map(|schema| schema.schema).collect(); - let handle = std::thread::spawn(move || { - connector.replicate(&ingestor, tables, schemas, checkpoint, None) - }); - - estimate_throughput(iterator); - handle.join().unwrap(); - } -} diff --git a/dozer-ingestion/oracle/src/connector/replicate/log/listing.rs b/dozer-ingestion/oracle/src/connector/replicate/log/listing.rs deleted file mode 100644 index 533747130a..0000000000 --- a/dozer-ingestion/oracle/src/connector/replicate/log/listing.rs +++ /dev/null @@ -1,152 +0,0 @@ -use dozer_ingestion_connector::dozer_types::log::{debug, warn}; -use oracle::Connection; - -use crate::connector::{Error, Scn}; - -#[derive(Debug, Clone)] -pub struct ArchivedLog { - pub name: String, - pub sequence: u32, - pub first_change: Scn, - pub next_change: Scn, -} - -impl ArchivedLog { - pub fn list(connection: &Connection, start_scn: Scn) -> Result, Error> { - let sql = "SELECT NAME, SEQUENCE#, FIRST_CHANGE#, NEXT_CHANGE# FROM V$ARCHIVED_LOG WHERE NEXT_CHANGE# > :start_scn AND STATUS = 'A' ORDER BY SEQUENCE# ASC"; - debug!("{}, {}", sql, start_scn); - let rows = connection - .query_as::<(String, u32, Scn, Scn)>(sql, &[&start_scn]) - .unwrap(); - - let mut result = vec![]; - for row in rows { - let (name, sequence, first_change, next_change) = row?; - let log = ArchivedLog { - name, - sequence, - first_change, - next_change, - }; - if is_continuous(result.last(), &log) { - result.push(log); - } - } - - Ok(result) - } -} - -#[derive(Debug, Clone, Copy)] -pub struct Log { - pub group: u32, - pub sequence: u32, - pub first_change: Scn, - pub next_change: Scn, -} - -impl Log { - pub fn list(connection: &Connection, start_scn: Scn) -> Result, Error> { - let sql = "SELECT GROUP#, SEQUENCE#, FIRST_CHANGE#, NEXT_CHANGE# FROM V$LOG WHERE NEXT_CHANGE# > :start_scn ORDER BY SEQUENCE# ASC"; - debug!("{}, {}", sql, start_scn); - let rows = connection - .query_as::<(u32, u32, Scn, Scn)>(sql, &[&start_scn]) - .unwrap(); - - let mut result = 
vec![]; - for row in rows { - let (group, sequence, first_change, next_change) = row?; - let log = Log { - group, - sequence, - first_change, - next_change, - }; - if is_continuous(result.last(), &log) { - result.push(log); - } - } - - Ok(result) - } -} - -#[derive(Debug, Clone)] -pub struct LogFile { - pub group: u32, - pub member: String, -} - -impl LogFile { - pub fn list(connection: &Connection) -> Result, Error> { - let sql = "SELECT GROUP#, MEMBER FROM V$LOGFILE WHERE STATUS IS NULL"; - debug!("{}", sql); - let rows = connection.query_as::<(u32, String)>(sql, &[]).unwrap(); - - let mut result = vec![]; - for row in rows { - let (group, member) = row?; - let log_file = LogFile { group, member }; - result.push(log_file); - } - - Ok(result) - } -} - -pub trait HasLogIdentifier { - fn sequence(&self) -> u32; - fn first_change(&self) -> Scn; - fn next_change(&self) -> Scn; -} - -impl HasLogIdentifier for ArchivedLog { - fn sequence(&self) -> u32 { - self.sequence - } - - fn first_change(&self) -> Scn { - self.first_change - } - - fn next_change(&self) -> Scn { - self.next_change - } -} - -impl HasLogIdentifier for Log { - fn sequence(&self) -> u32 { - self.sequence - } - - fn first_change(&self) -> Scn { - self.first_change - } - - fn next_change(&self) -> Scn { - self.next_change - } -} - -pub fn is_continuous( - last_log: Option<&impl HasLogIdentifier>, - current_log: &impl HasLogIdentifier, -) -> bool { - let Some(last_log) = last_log else { - return true; - }; - - let sequence_is_continuous = last_log.sequence() + 1 == current_log.sequence(); - let scn_is_continuous = last_log.next_change() == current_log.first_change(); - - if sequence_is_continuous != scn_is_continuous { - warn!( - "Log {} has next change {}, but log {} has first change {}", - last_log.sequence(), - last_log.next_change(), - current_log.sequence(), - current_log.first_change() - ); - } - sequence_is_continuous && scn_is_continuous -} diff --git 
a/dozer-ingestion/oracle/src/connector/replicate/log/merge.rs b/dozer-ingestion/oracle/src/connector/replicate/log/merge.rs deleted file mode 100644 index c51f092bfc..0000000000 --- a/dozer-ingestion/oracle/src/connector/replicate/log/merge.rs +++ /dev/null @@ -1,52 +0,0 @@ -use std::collections::HashMap; - -use oracle::Connection; - -use crate::connector::{Error, Scn}; - -use super::listing::{is_continuous, ArchivedLog, Log, LogFile}; - -pub fn list_and_join_online_log( - connection: &Connection, - start_scn: Scn, -) -> Result, Error> { - let logs = Log::list(connection, start_scn)?; - let log_files = LogFile::list(connection)?; - let mut log_files = log_files - .into_iter() - .map(|log_file| (log_file.group, log_file.member)) - .collect::>(); - - let mut result = vec![]; - for log in logs { - if let Some(name) = log_files.remove(&log.group) { - let archived_log = ArchivedLog { - name, - sequence: log.sequence, - first_change: log.first_change, - next_change: log.next_change, - }; - result.push(archived_log); - } else { - // We only want continuous logs - break; - } - } - - Ok(result) -} - -pub fn list_and_merge_archived_log( - connection: &Connection, - start_scn: Scn, - mut online_logs: Vec, -) -> Result, Error> { - let mut archived_logs = ArchivedLog::list(connection, start_scn)?; - let first_continuous_online_log_index = online_logs - .iter() - .position(|log| is_continuous(archived_logs.last(), log)); - if let Some(index) = first_continuous_online_log_index { - archived_logs.extend(online_logs.drain(index..)); - } - Ok(archived_logs) -} diff --git a/dozer-ingestion/oracle/src/connector/replicate/log/mod.rs b/dozer-ingestion/oracle/src/connector/replicate/log/mod.rs deleted file mode 100644 index aa0a15a098..0000000000 --- a/dozer-ingestion/oracle/src/connector/replicate/log/mod.rs +++ /dev/null @@ -1,176 +0,0 @@ -use std::{sync::mpsc::SyncSender, time::Duration}; - -use dozer_ingestion_connector::dozer_types::log::debug; -use dozer_ingestion_connector::{ - 
dozer_types::{ - chrono::{DateTime, Utc}, - log::{error, info}, - }, - Ingestor, -}; -use oracle::Connection; - -use crate::connector::{Error, Scn}; - -mod listing; -mod merge; -mod redo; - -pub type TransactionId = [u8; 8]; - -#[derive(Debug, Clone)] -/// This is a raw row from V$LOGMNR_CONTENTS -pub struct LogManagerContent { - pub scn: Scn, - pub timestamp: DateTime, - pub xid: TransactionId, - pub pxid: TransactionId, - pub operation_code: u8, - pub seg_owner: Option, - pub table_name: Option, - pub rbasqn: u32, - pub rbablk: u32, - pub rbabyte: u16, - pub sql_redo: Option, - pub csf: u8, -} - -/// `ingestor` is only used for checking if ingestion has ended so we can break the loop. -pub fn log_miner_loop( - connection: &Connection, - start_scn: Scn, - con_id: Option, - poll_interval: Duration, - sender: SyncSender, - ingestor: &Ingestor, -) { - log_reader_loop( - connection, - start_scn, - con_id, - poll_interval, - redo::LogMiner, - sender, - ingestor, - ) -} - -fn log_reader_loop( - connection: &Connection, - mut start_scn: Scn, - con_id: Option, - poll_interval: Duration, - reader: impl redo::RedoReader, - sender: SyncSender, - ingestor: &Ingestor, -) { - #[derive(Debug, Clone, Copy)] - struct LastRba { - sqn: u32, - blk: u32, - byte: u16, - } - let mut last_rba: Option = None; - - loop { - debug!(target: "oracle_replication", "Listing logs starting from SCN {}", start_scn); - let mut logs = match list_logs(connection, start_scn) { - Ok(logs) => logs, - Err(e) => { - if ingestor.is_closed() { - return; - } - error!("Error listing logs: {}. 
Retrying.", e); - continue; - } - }; - - if logs.is_empty() { - if ingestor.is_closed() { - return; - } - info!("No logs found, retrying after {:?}", poll_interval); - std::thread::sleep(poll_interval); - continue; - } - - 'replicate_logs: while !logs.is_empty() { - let log = logs.remove(0); - debug!(target: "oracle_replication", - "Reading log {} ({}) ({}, {}), starting from {:?}", - log.name, log.sequence, log.first_change, log.next_change, last_rba - ); - - let iterator = { - let last_rba = last_rba.and_then(|last_rba| { - if log.sequence == last_rba.sqn { - Some((last_rba.blk, last_rba.byte)) - } else { - None - } - }); - match reader.read(connection, &log.name, last_rba, con_id) { - Ok(iterator) => iterator, - Err(e) => { - if ingestor.is_closed() { - return; - } - error!("Error reading log {}: {}. Retrying.", log.name, e); - break 'replicate_logs; - } - } - }; - - for content in iterator { - let content = match content { - Ok(content) => content, - Err(e) => { - if ingestor.is_closed() { - return; - } - error!("Error reading log {}: {}. Retrying.", log.name, e); - break 'replicate_logs; - } - }; - last_rba = Some(LastRba { - sqn: content.rbasqn, - blk: content.rbablk, - byte: content.rbabyte, - }); - if sender.send(content).is_err() { - return; - } - } - - if logs.is_empty() { - if ingestor.is_closed() { - return; - } - debug!(target: "oracle_replication", "Read all logs, retrying after {:?}", poll_interval); - std::thread::sleep(poll_interval); - } else { - // If there are more logs, we need to start from the next log's first change. 
- start_scn = log.next_change; - } - } - } -} - -fn list_logs(connection: &Connection, start_scn: Scn) -> Result, Error> { - let logs = merge::list_and_join_online_log(connection, start_scn)?; - if !log_contains_scn(logs.first(), start_scn) { - info!( - "Online log is empty or doesn't contain start scn {}, listing and merging archived logs", - start_scn - ); - merge::list_and_merge_archived_log(connection, start_scn, logs) - } else { - Ok(logs) - } -} - -fn log_contains_scn(log: Option<&listing::ArchivedLog>, scn: Scn) -> bool { - log.map_or(false, |log| { - log.first_change <= scn && log.next_change > scn - }) -} diff --git a/dozer-ingestion/oracle/src/connector/replicate/log/redo/log_miner.rs b/dozer-ingestion/oracle/src/connector/replicate/log/redo/log_miner.rs deleted file mode 100644 index 675db2ba58..0000000000 --- a/dozer-ingestion/oracle/src/connector/replicate/log/redo/log_miner.rs +++ /dev/null @@ -1,148 +0,0 @@ -use dozer_ingestion_connector::dozer_types::{ - chrono::{DateTime, Utc}, - log::{error, trace}, -}; -use oracle::{Connection, ResultSet, RowValue}; - -use crate::connector::{Error, Scn}; - -use super::{LogManagerContent, RedoReader}; - -#[derive(Debug, Clone, Copy)] -pub struct LogMiner; - -#[derive(Debug)] -pub struct LogMinerIter<'a> { - result_set: ResultSet<'a, LogManagerContent>, - connection: &'a Connection, -} - -impl<'a> Drop for LogMinerIter<'a> { - fn drop(&mut self) { - let sql = "BEGIN DBMS_LOGMNR.END_LOGMNR; END;"; - trace!("{}", sql); - if let Err(e) = self.connection.execute(sql, &[]) { - error!("Failed to end log miner: {}", e); - } - } -} - -impl<'a> Iterator for LogMinerIter<'a> { - type Item = Result; - - fn next(&mut self) -> Option { - self.result_set.next().map(|row| row.map_err(Into::into)) - } -} - -impl RedoReader for LogMiner { - type Iterator<'a> = LogMinerIter<'a>; - - fn read<'a>( - &self, - connection: &'a Connection, - log_file_name: &str, - last_rba: Option<(u32, u16)>, - con_id: Option, - ) -> Result, Error> { - 
let sql = - "BEGIN DBMS_LOGMNR.ADD_LOGFILE(LOGFILENAME => :name, OPTIONS => DBMS_LOGMNR.NEW); END;"; - trace!("{}, {}", sql, log_file_name); - connection.execute(sql, &[&str_to_sql!(log_file_name)])?; - - let sql = " - BEGIN - DBMS_LOGMNR.START_LOGMNR( - OPTIONS => - DBMS_LOGMNR.DICT_FROM_ONLINE_CATALOG + - DBMS_LOGMNR.PRINT_PRETTY_SQL + - DBMS_LOGMNR.NO_ROWID_IN_STMT - ); - END;"; - trace!("{}", sql); - connection.execute(sql, &[])?; - - let base_sql = "SELECT SCN, TIMESTAMP, XID, PXID, OPERATION_CODE, SEG_OWNER, TABLE_NAME, RBASQN, RBABLK, RBABYTE, SQL_REDO, CSF FROM V$LOGMNR_CONTENTS"; - let rba_filter = "(RBABLK > :last_blk OR (RBABLK = :last_blk AND RBABYTE > :last_byte))"; - let con_id_filter = "SRC_CON_ID = :con_id"; - let result_set = match (last_rba, con_id) { - (Some((last_blk, last_byte)), Some(con_id)) => { - let sql = format!("{} WHERE {} AND {}", base_sql, rba_filter, con_id_filter); - trace!("{}, {}, {}, {}", sql, last_blk, last_byte, con_id); - connection.query_as_named( - &sql, - &[ - ("last_blk", &last_blk), - ("last_byte", &last_byte), - ("con_id", &con_id), - ], - ) - } - (Some((last_blk, last_byte)), None) => { - let sql = format!("{} WHERE {}", base_sql, rba_filter); - trace!("{}, {}, {}", sql, last_blk, last_byte); - connection - .query_as_named(&sql, &[("last_blk", &last_blk), ("last_byte", &last_byte)]) - } - (None, Some(con_id)) => { - let sql = format!("{} WHERE {}", base_sql, con_id_filter); - trace!("{}, {}", sql, con_id); - connection.query_as_named(&sql, &[("con_id", &con_id)]) - } - (None, None) => { - trace!("{}", base_sql); - connection.query_as(base_sql, &[]) - } - }?; - Ok(LogMinerIter { - result_set, - connection, - }) - } -} - -impl RowValue for LogManagerContent { - fn get(row: &oracle::Row) -> oracle::Result { - let ( - scn, - timestamp, - xid, - pxid, - operation_code, - seg_owner, - table_name, - rbasqn, - rbablk, - rbabyte, - sql_redo, - csf, - ) = <( - Scn, - DateTime, - Vec, - Vec, - u8, - Option, - Option, - u32, - u32, 
- u16, - Option, - u8, - ) as RowValue>::get(row)?; - Ok(LogManagerContent { - scn, - timestamp, - xid: xid.try_into().expect("xid must be 8 bytes"), - pxid: pxid.try_into().expect("pxid must be 8 bytes"), - operation_code, - seg_owner, - table_name, - rbasqn, - rbablk, - rbabyte, - sql_redo, - csf, - }) - } -} diff --git a/dozer-ingestion/oracle/src/connector/replicate/log/redo/mod.rs b/dozer-ingestion/oracle/src/connector/replicate/log/redo/mod.rs deleted file mode 100644 index 7d011dfdab..0000000000 --- a/dozer-ingestion/oracle/src/connector/replicate/log/redo/mod.rs +++ /dev/null @@ -1,26 +0,0 @@ -use oracle::Connection; - -use crate::connector::Error; - -/// Given a log file name, a redo reader emits `LogManagerContent` rows -pub trait RedoReader { - type Iterator<'a>: Iterator>; - - /// Reads the `LogManagerContent` rows that have: - /// - /// - scn >= start_scn - /// - rba > last_rba.0 || (rba == last_rba.0 && rbabyte > last_rba.1) - fn read<'a>( - &self, - connection: &'a Connection, - log_file_name: &str, - last_rba: Option<(u32, u16)>, - con_id: Option, - ) -> Result, Error>; -} - -mod log_miner; - -pub use log_miner::LogMiner; - -use super::LogManagerContent; diff --git a/dozer-ingestion/oracle/src/connector/replicate/mod.rs b/dozer-ingestion/oracle/src/connector/replicate/mod.rs deleted file mode 100644 index 284bfee75b..0000000000 --- a/dozer-ingestion/oracle/src/connector/replicate/mod.rs +++ /dev/null @@ -1,5 +0,0 @@ -mod log; -mod transaction; - -pub use log::log_miner_loop; -pub use transaction::Processor; diff --git a/dozer-ingestion/oracle/src/connector/replicate/transaction/aggregate/commit.rs b/dozer-ingestion/oracle/src/connector/replicate/transaction/aggregate/commit.rs deleted file mode 100644 index ea06f2c3db..0000000000 --- a/dozer-ingestion/oracle/src/connector/replicate/transaction/aggregate/commit.rs +++ /dev/null @@ -1,39 +0,0 @@ -use dozer_ingestion_connector::dozer_types::{ - chrono::{DateTime, Utc}, - log::warn, -}; - -use 
crate::connector::{replicate::log::TransactionId, Scn}; - -use super::{Transaction, TransactionForest}; - -pub fn commit( - xid: TransactionId, - pxid: TransactionId, - scn: Scn, - timestamp: DateTime, - transaction_forest: &mut TransactionForest, -) -> Option { - let mut operations = vec![]; - transaction_forest.remove_subtree(xid, |_, ops| operations.extend(ops)); - - if xid == pxid { - // This is a top level transaction - Some(Transaction { - commit_scn: scn, - commit_timestamp: timestamp, - operations, - }) - } else { - // This is a sub transaction. - let Some(parent_operations) = transaction_forest.get_mut(&pxid) else { - warn!( - "Parent transaction {:02X?} not found for sub transaction {:02X?}", - pxid, xid - ); - return None; - }; - parent_operations.extend(operations); - None - } -} diff --git a/dozer-ingestion/oracle/src/connector/replicate/transaction/aggregate/forest.rs b/dozer-ingestion/oracle/src/connector/replicate/transaction/aggregate/forest.rs deleted file mode 100644 index 97de993ed9..0000000000 --- a/dozer-ingestion/oracle/src/connector/replicate/transaction/aggregate/forest.rs +++ /dev/null @@ -1,111 +0,0 @@ -use std::{ - collections::{hash_map::Entry, HashMap}, - hash::Hash, -}; - -#[derive(Debug, Clone)] -struct Node { - data: T, - parent: Option, - children: Vec, -} - -impl Default for Node { - fn default() -> Self { - Self { - data: T::default(), - parent: None, - children: vec![], - } - } -} - -#[derive(Debug, Clone, Default)] -pub struct Forest { - nodes: HashMap>, -} - -impl Forest { - pub fn remove_subtree(&mut self, id: Id, mut f: impl FnMut(Id, T)) -> bool { - let Some(node) = self.nodes.remove(&id) else { - return false; - }; - if let Some(parent) = node.parent.as_ref() { - self.nodes - .get_mut(parent) - .unwrap() - .children - .retain(|child| child != &id); - } - let mut stack = vec![(id, node)]; - while let Some((id, node)) = stack.pop() { - f(id, node.data); - for child in node.children { - let node = 
self.nodes.remove(&child).unwrap(); - stack.push((child, node)); - } - } - true - } - - pub fn get_mut(&mut self, id: &Id) -> Option<&mut T> { - self.nodes.get_mut(id).map(|node| &mut node.data) - } -} - -impl Forest { - pub fn insert_or_get_root(&mut self, id: Id) -> &mut T { - &mut self.nodes.entry(id).or_default().data - } -} - -impl Forest { - pub fn insert_or_get_child(&mut self, parent: Id, child: Id) -> Option<&mut T> { - if !self.nodes.contains_key(&parent) { - return None; - } - - let is_new_child = if let Entry::Vacant(entry) = self.nodes.entry(child.clone()) { - entry.insert(Node { - data: T::default(), - parent: Some(parent.clone()), - children: vec![], - }); - true - } else { - false - }; - - if is_new_child { - self.nodes - .get_mut(&parent) - .unwrap() - .children - .push(child.clone()); - } - - Some(&mut self.nodes.get_mut(&child).unwrap().data) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_transaction_forest() { - let mut forest = Forest::>::default(); - let node1 = forest.insert_or_get_root(1); - assert_eq!(node1, &vec![]); - node1.push(()); - assert_eq!(forest.insert_or_get_root(2), &vec![]); - assert_eq!(forest.insert_or_get_child(0, 3), None); - let node3 = forest.insert_or_get_child(1, 3).unwrap(); - assert_eq!(node3, &vec![]); - node3.extend([(), ()]); - let mut collected = vec![]; - forest.remove_subtree(1, |_, data| collected.extend(data)); - assert_eq!(collected.len(), 3); - assert_eq!(forest.insert_or_get_root(1), &vec![]); - } -} diff --git a/dozer-ingestion/oracle/src/connector/replicate/transaction/aggregate/mod.rs b/dozer-ingestion/oracle/src/connector/replicate/transaction/aggregate/mod.rs deleted file mode 100644 index bf54d3e1ef..0000000000 --- a/dozer-ingestion/oracle/src/connector/replicate/transaction/aggregate/mod.rs +++ /dev/null @@ -1,144 +0,0 @@ -use dozer_ingestion_connector::dozer_types::{ - chrono::{DateTime, Utc}, - log::{trace, warn}, -}; - -use crate::connector::{ - 
replicate::log::{LogManagerContent, TransactionId}, - Scn, -}; - -#[derive(Debug, Clone)] -pub struct Transaction { - pub commit_scn: Scn, - pub commit_timestamp: DateTime, - pub operations: Vec, -} - -#[derive(Debug, Clone)] -pub struct Operation { - pub seg_owner: String, - pub table_name: String, - pub kind: OperationKind, - pub sql_redo: String, -} - -#[derive(Debug, Clone, Copy)] -pub enum OperationKind { - Insert, - Delete, - Update, -} - -#[derive(Debug, Clone)] -pub struct Aggregator { - start_scn: Scn, -} - -impl Aggregator { - pub fn new(start_scn: Scn) -> Self { - Self { start_scn } - } - - pub fn process( - &self, - iterator: impl Iterator, - ) -> impl Iterator { - Processor { - iterator, - start_scn: self.start_scn, - transaction_forest: Default::default(), - } - } -} - -type TransactionForest = forest::Forest>; - -#[derive(Debug)] -struct Processor> { - iterator: I, - start_scn: Scn, - transaction_forest: TransactionForest, -} - -impl> Iterator for Processor { - type Item = Transaction; - - fn next(&mut self) -> Option { - loop { - let content = self.iterator.next()?; - - if content.operation_code == OP_CODE_COMMIT { - if let Some(transaction) = commit::commit( - content.xid, - content.pxid, - content.scn, - content.timestamp, - &mut self.transaction_forest, - ) { - if transaction.commit_scn >= self.start_scn { - return Some(transaction); - } - } - continue; - } - - if content.operation_code == OP_CODE_ROLLBACK { - self.transaction_forest - .remove_subtree(content.xid, |_, _| ()); - continue; - } - - let Some(seg_owner) = content.seg_owner else { - continue; - }; - let Some(table_name) = content.table_name else { - continue; - }; - let (kind, sql_redo) = match content.operation_code { - OP_CODE_INSERT => ( - OperationKind::Insert, - content.sql_redo.expect("insert must have redo"), - ), - OP_CODE_DELETE => ( - OperationKind::Delete, - content.sql_redo.expect("delete must have redo"), - ), - OP_CODE_UPDATE => ( - OperationKind::Update, - 
content.sql_redo.expect("update must have redo"), - ), - OP_CODE_DDL => { - warn!("Ignoring DDL operation: {:?}", content.sql_redo); - continue; - } - _ => { - trace!("Ignoring operation: {:?}", content.sql_redo); - continue; - } - }; - op::process_operation( - content.xid, - content.pxid, - Operation { - seg_owner, - table_name, - kind, - sql_redo, - }, - &mut self.transaction_forest, - ); - } - } -} - -mod commit; -mod forest; -mod op; - -const OP_CODE_INSERT: u8 = 1; -const OP_CODE_DELETE: u8 = 2; -const OP_CODE_UPDATE: u8 = 3; -const OP_CODE_DDL: u8 = 5; -const OP_CODE_COMMIT: u8 = 7; -const OP_CODE_ROLLBACK: u8 = 36; diff --git a/dozer-ingestion/oracle/src/connector/replicate/transaction/aggregate/op.rs b/dozer-ingestion/oracle/src/connector/replicate/transaction/aggregate/op.rs deleted file mode 100644 index ad393b4729..0000000000 --- a/dozer-ingestion/oracle/src/connector/replicate/transaction/aggregate/op.rs +++ /dev/null @@ -1,27 +0,0 @@ -use dozer_ingestion_connector::dozer_types::log::warn; - -use crate::connector::replicate::log::TransactionId; - -use super::{Operation, TransactionForest}; - -pub fn process_operation( - xid: TransactionId, - pxid: TransactionId, - operation: Operation, - transaction_forest: &mut TransactionForest, -) { - if xid == pxid { - // This is a top level transaction - transaction_forest.insert_or_get_root(xid).push(operation); - } else { - // This is a sub transaction. - let Some(operations) = transaction_forest.insert_or_get_child(pxid, xid) else { - warn!( - "Parent transaction {:02X?} not found for sub transaction {:02X?}", - pxid, xid - ); - return; - }; - operations.push(operation); - } -} diff --git a/dozer-ingestion/oracle/src/connector/replicate/transaction/csf.rs b/dozer-ingestion/oracle/src/connector/replicate/transaction/csf.rs deleted file mode 100644 index 3fd5dec366..0000000000 --- a/dozer-ingestion/oracle/src/connector/replicate/transaction/csf.rs +++ /dev/null @@ -1,48 +0,0 @@ -//! 
Handles the Continuation SQL flag in V$LOGMNR_CONTENTS. - -use crate::connector::replicate::log::LogManagerContent; - -/// Output items is guaranteed to have CSF = 0. -pub fn process( - iterator: impl Iterator, -) -> impl Iterator { - Processor { - iterator, - pending: None, - } -} - -struct Processor> { - iterator: I, - pending: Option, -} - -impl> Iterator for Processor { - type Item = LogManagerContent; - - fn next(&mut self) -> Option { - loop { - let content = self.iterator.next()?; - - if let Some(mut previous_content) = self.pending.take() { - previous_content.sql_redo = match (previous_content.sql_redo, content.sql_redo) { - (Some(mut previous), Some(current)) => { - previous.push_str(¤t); - Some(previous) - } - (previous, current) => previous.or(current), - }; - if content.csf == 0 { - previous_content.csf = 0; - return Some(previous_content); - } else { - self.pending = Some(previous_content); - } - } else if content.csf == 0 { - return Some(content); - } else { - self.pending = Some(content); - } - } - } -} diff --git a/dozer-ingestion/oracle/src/connector/replicate/transaction/map.rs b/dozer-ingestion/oracle/src/connector/replicate/transaction/map.rs deleted file mode 100644 index 122ce3b5f9..0000000000 --- a/dozer-ingestion/oracle/src/connector/replicate/transaction/map.rs +++ /dev/null @@ -1,161 +0,0 @@ -use dozer_ingestion_connector::dozer_types::{ - chrono::{DateTime, FixedOffset, NaiveDate, NaiveDateTime, ParseError, Utc}, - ordered_float::OrderedFloat, - rust_decimal::prelude::ToPrimitive, - types::{Field, FieldType, Operation, Record, Schema}, -}; - -use crate::connector::Error; - -use super::{ - parse::{ParsedOperation, ParsedOperationKind, ParsedRow, ParsedTransaction, ParsedValue}, - Transaction, -}; - -#[derive(Debug, Clone)] -pub struct Mapper { - schemas: Vec, -} - -impl Mapper { - pub fn new(schemas: Vec) -> Self { - Self { schemas } - } - - pub fn process<'a>( - &'a self, - iterator: impl Iterator> + 'a, - ) -> impl Iterator> + 'a { - 
Processor { - iterator, - mapper: self, - } - } - - fn map(&self, operation: ParsedOperation) -> Result<(usize, Operation), Error> { - let schema = &self.schemas[operation.table_index]; - Ok(( - operation.table_index, - match operation.kind { - ParsedOperationKind::Insert(row) => Operation::Insert { - new: map_row(row, schema)?, - }, - ParsedOperationKind::Delete(row) => Operation::Delete { - old: map_row(row, schema)?, - }, - ParsedOperationKind::Update { old, new } => Operation::Update { - old: map_row(old, schema)?, - new: map_row(new, schema)?, - }, - }, - )) - } -} - -#[derive(Debug)] -struct Processor<'a, I: Iterator>> { - iterator: I, - mapper: &'a Mapper, -} - -impl<'a, I: Iterator>> Iterator for Processor<'a, I> { - type Item = Result; - - fn next(&mut self) -> Option { - let transaction = match self.iterator.next()? { - Ok(transaction) => transaction, - Err(err) => return Some(Err(err)), - }; - - let mut operations = vec![]; - for operation in transaction.operations { - match self.mapper.map(operation) { - Ok(operation) => operations.push(operation), - Err(err) => return Some(Err(err)), - } - } - - Some(Ok(Transaction { - commit_scn: transaction.commit_scn, - commit_timestamp: transaction.commit_timestamp, - operations, - })) - } -} - -fn map_row(mut row: ParsedRow, schema: &Schema) -> Result { - let mut values = vec![]; - for field in &schema.fields { - let value = row - .remove(&field.name) - .ok_or_else(|| Error::FieldNotFound(field.name.clone()))?; - values.push(map_value(value, field.typ, field.nullable, &field.name)?); - } - - Ok(Record::new(values)) -} - -fn map_value( - value: ParsedValue, - typ: FieldType, - nullable: bool, - name: &str, -) -> Result { - match (value, typ, nullable) { - (ParsedValue::Null, _, false) => Err(Error::NullValue(name.to_string())), - (ParsedValue::Null, _, true) => Ok(Field::Null), - (ParsedValue::String(string), FieldType::Float, _) => { - Ok(Field::Float(OrderedFloat(string.parse()?))) - } - 
(ParsedValue::Number(number), FieldType::Float, _) => Ok(Field::Float(OrderedFloat( - number - .to_f64() - .ok_or_else(|| Error::FloatOverflow(number))?, - ))), - (ParsedValue::String(string), FieldType::Decimal, _) => Ok(Field::Decimal(string.parse()?)), - (ParsedValue::Number(number), FieldType::Decimal, _) => Ok(Field::Decimal(number)), - (ParsedValue::Number(number), FieldType::Int, _) => Ok(Field::Int( - number - .to_i64() - .ok_or_else(|| Error::ParseIntFailed(number))?, - )), - (ParsedValue::Number(number), FieldType::UInt, _) => Ok(Field::UInt( - number - .to_u64() - .ok_or_else(|| Error::ParseUIntFailed(number))?, - )), - (ParsedValue::String(string), FieldType::String, _) => Ok(Field::String(string)), - (ParsedValue::Number(_), FieldType::String, _) => Err(Error::TypeMismatch { - field: name.to_string(), - expected: FieldType::String, - actual: FieldType::Decimal, - }), - (_, FieldType::Binary, _) => unimplemented!("parse binary from redo sql"), - (ParsedValue::String(string), FieldType::Date, _) => Ok(Field::Date( - parse_date(&string).map_err(|e| Error::ParseDateTime(e, string))?, - )), - (ParsedValue::Number(_), FieldType::Date, _) => Err(Error::TypeMismatch { - field: name.to_string(), - expected: FieldType::Date, - actual: FieldType::Decimal, - }), - (ParsedValue::String(string), FieldType::Timestamp, _) => Ok(Field::Timestamp( - parse_date_time(&string).map_err(|e| Error::ParseDateTime(e, string))?, - )), - (ParsedValue::Number(_), FieldType::Timestamp, _) => Err(Error::TypeMismatch { - field: name.to_string(), - expected: FieldType::Timestamp, - actual: FieldType::Decimal, - }), - _ => unreachable!(), - } -} - -fn parse_date(string: &str) -> Result { - NaiveDate::parse_from_str(string, "%d-%b-%y") -} - -fn parse_date_time(string: &str) -> Result, ParseError> { - let date_time = NaiveDateTime::parse_from_str(string, "%d-%b-%y %I.%M.%S%.6f %p")?; - Ok(Ok(DateTime::::from_naive_utc_and_offset(date_time, Utc))?.fixed_offset()) -} diff --git 
a/dozer-ingestion/oracle/src/connector/replicate/transaction/mod.rs b/dozer-ingestion/oracle/src/connector/replicate/transaction/mod.rs deleted file mode 100644 index d12afc6c81..0000000000 --- a/dozer-ingestion/oracle/src/connector/replicate/transaction/mod.rs +++ /dev/null @@ -1,53 +0,0 @@ -use std::collections::HashMap; - -use dozer_ingestion_connector::dozer_types::{ - chrono::{DateTime, Utc}, - types::{Operation, Schema}, -}; - -use crate::connector::{Error, Scn}; - -use super::log::LogManagerContent; - -#[derive(Debug, Clone)] -pub struct Transaction { - pub commit_scn: Scn, - pub commit_timestamp: DateTime, - pub operations: Vec<(usize, Operation)>, -} - -#[derive(Debug, Clone)] -pub struct Processor { - aggregator: aggregate::Aggregator, - parser: parse::Parser, - mapper: map::Mapper, -} - -impl Processor { - pub fn new( - start_scn: Scn, - table_pair_to_index: HashMap<(String, String), usize>, - schemas: Vec, - ) -> Self { - Self { - aggregator: aggregate::Aggregator::new(start_scn), - parser: parse::Parser::new(table_pair_to_index), - mapper: map::Mapper::new(schemas), - } - } - - pub fn process<'a>( - &'a self, - iterator: impl IntoIterator + 'a, - ) -> impl Iterator> + 'a { - let csf = csf::process(iterator.into_iter()); - let transaction = self.aggregator.process(csf); - let parse = self.parser.process(transaction); - self.mapper.process(parse) - } -} - -mod aggregate; -mod csf; -mod map; -mod parse; diff --git a/dozer-ingestion/oracle/src/connector/replicate/transaction/parse/delete.rs b/dozer-ingestion/oracle/src/connector/replicate/transaction/parse/delete.rs deleted file mode 100644 index 8a90dc8f8f..0000000000 --- a/dozer-ingestion/oracle/src/connector/replicate/transaction/parse/delete.rs +++ /dev/null @@ -1,62 +0,0 @@ -use dozer_ingestion_connector::dozer_types::log::warn; -use regex::Regex; - -use crate::connector::Error; - -use super::{row, ParsedRow}; - -#[derive(Debug, Clone)] -pub struct Parser { - regex: Regex, - row_parser: row::Parser, 
-} - -impl Parser { - pub fn new() -> Self { - let regex = - Regex::new(r#"^delete from "((?:C##)?\w+)"\."(\w+)"\n *where\n(?s)(.+)$"#).unwrap(); - Self { - regex, - row_parser: row::Parser::new(" and", ";"), - } - } - - pub fn parse(&self, sql_redo: &str, table_pair: &(String, String)) -> Result { - let captures = self - .regex - .captures(sql_redo) - .ok_or_else(|| Error::DeleteFailedToMatch(sql_redo.to_string()))?; - let owner = captures.get(1).unwrap().as_str(); - let table_name = captures.get(2).unwrap().as_str(); - if owner != table_pair.0 || table_name != table_pair.1 { - warn!( - "Table name {}.{} doesn't match {}.{} in log content", - owner, table_name, table_pair.0, table_pair.1 - ); - } - - self.row_parser.parse(captures.get(3).unwrap().as_str()) - } -} - -#[test] -fn test_parse() { - let parser = Parser::new(); - let sql_redo = r#"delete from "HR"."EMPLOYEES" - where - "EMPLOYEE_ID" = 306 and - "FIRST_NAME" = 'Nandini' and - "LAST_NAME" = 'Shastry' and - "EMAIL" = 'NSHASTRY' and - "PHONE_NUMBER" = '1234567890' and - "JOB_ID" = 'HR_REP' and - "SALARY" = 120000 and - "COMMISSION_PCT" = .05 and - "MANAGER_ID" = 105 and - "DEPARTMENT_ID" = 10; - "#; - let parsed = parser - .parse(sql_redo, &("HR".to_string(), "EMPLOYEES".to_string())) - .unwrap(); - assert_eq!(parsed.len(), 10); -} diff --git a/dozer-ingestion/oracle/src/connector/replicate/transaction/parse/insert.rs b/dozer-ingestion/oracle/src/connector/replicate/transaction/parse/insert.rs deleted file mode 100644 index a5ab56daf2..0000000000 --- a/dozer-ingestion/oracle/src/connector/replicate/transaction/parse/insert.rs +++ /dev/null @@ -1,63 +0,0 @@ -use dozer_ingestion_connector::dozer_types::log::warn; -use regex::Regex; - -use crate::connector::Error; - -use super::{row, ParsedRow}; - -#[derive(Debug, Clone)] -pub struct Parser { - regex: Regex, - row_parser: row::Parser, -} - -impl Parser { - pub fn new() -> Self { - let regex = - Regex::new(r#"^insert into "((?:C##)?\w+)"\."(\w+)"\n 
*values\n(?s)(.+)$"#).unwrap(); - Self { - regex, - row_parser: row::Parser::new(",", ";"), - } - } - - pub fn parse(&self, sql_redo: &str, table_pair: &(String, String)) -> Result { - let captures = self - .regex - .captures(sql_redo) - .ok_or_else(|| Error::InsertFailedToMatch(sql_redo.to_string()))?; - let owner = captures.get(1).unwrap().as_str(); - let table_name = captures.get(2).unwrap().as_str(); - if owner != table_pair.0 || table_name != table_pair.1 { - warn!( - "Table name {}.{} doesn't match {}.{} in log content", - owner, table_name, table_pair.0, table_pair.1 - ); - } - - self.row_parser.parse(captures.get(3).unwrap().as_str()) - } -} - -#[test] -fn test_parse() { - let parser = Parser::new(); - let sql_redo = r#"insert into "HR"."EMPLOYEES" - values - "EMPLOYEE_ID" = 306, - "FIRST_NAME" = 'Nandini', - "LAST_NAME" = 'Shastry', - "EMAIL" = 'NSHASTRY', - "PHONE_NUMBER" = '1234567890', - "JOB_ID" = 'HR_REP', - "SALARY" = 120000, - "COMMISSION_PCT" = .05, - "MANAGER_ID" = 105, - "NULL_FIELD" IS NULL, - "DEPARTMENT_ID" = 10; - "#; - let parsed = parser - .parse(sql_redo, &("HR".to_string(), "EMPLOYEES".to_string())) - .unwrap(); - assert_eq!(parsed.len(), 11); -} diff --git a/dozer-ingestion/oracle/src/connector/replicate/transaction/parse/mod.rs b/dozer-ingestion/oracle/src/connector/replicate/transaction/parse/mod.rs deleted file mode 100644 index 1bbb371ab8..0000000000 --- a/dozer-ingestion/oracle/src/connector/replicate/transaction/parse/mod.rs +++ /dev/null @@ -1,141 +0,0 @@ -use std::{collections::HashMap, str::FromStr}; - -use dozer_ingestion_connector::dozer_types::{ - chrono::{DateTime, Utc}, - log::trace, - rust_decimal::Decimal, -}; - -use crate::connector::{Error, Scn}; - -use super::aggregate::{Operation, OperationKind, Transaction}; - -#[derive(Debug, Clone)] -pub struct ParsedTransaction { - pub commit_scn: Scn, - pub commit_timestamp: DateTime, - pub operations: Vec, -} - -#[derive(Debug, Clone)] -pub struct ParsedOperation { - pub 
table_index: usize, - pub kind: ParsedOperationKind, -} - -pub type ParsedRow = HashMap; - -#[derive(Debug, Clone)] -pub enum ParsedOperationKind { - Insert(ParsedRow), - Delete(ParsedRow), - Update { old: ParsedRow, new: ParsedRow }, -} - -#[derive(Debug, Clone, PartialEq)] -pub enum ParsedValue { - String(String), - Number(Decimal), - Null, -} - -#[derive(Debug, Clone)] -pub struct Parser { - insert_parser: insert::Parser, - delete_parser: delete::Parser, - update_parser: update::Parser, - table_pair_to_index: HashMap<(String, String), usize>, -} - -impl Parser { - pub fn new(table_pair_to_index: HashMap<(String, String), usize>) -> Self { - Self { - insert_parser: insert::Parser::new(), - delete_parser: delete::Parser::new(), - update_parser: update::Parser::new(), - table_pair_to_index, - } - } - - pub fn process<'a>( - &'a self, - iterator: impl Iterator + 'a, - ) -> impl Iterator> + 'a { - Processor { - iterator, - parser: self, - } - } - - fn parse(&self, operation: Operation) -> Result, Error> { - let table_pair = (operation.seg_owner, operation.table_name); - let Some(&table_index) = self.table_pair_to_index.get(&table_pair) else { - trace!( - "Ignoring operation on table {}.{}", - table_pair.0, - table_pair.1 - ); - return Ok(None); - }; - - let kind = match operation.kind { - OperationKind::Insert => ParsedOperationKind::Insert( - self.insert_parser.parse(&operation.sql_redo, &table_pair)?, - ), - OperationKind::Delete => ParsedOperationKind::Delete( - self.delete_parser.parse(&operation.sql_redo, &table_pair)?, - ), - OperationKind::Update => { - let (old, new) = self.update_parser.parse(&operation.sql_redo, &table_pair)?; - ParsedOperationKind::Update { old, new } - } - }; - Ok(Some(ParsedOperation { table_index, kind })) - } -} - -#[derive(Debug)] -struct Processor<'a, I: Iterator> { - iterator: I, - parser: &'a Parser, -} - -impl<'a, I: Iterator> Iterator for Processor<'a, I> { - type Item = Result; - - fn next(&mut self) -> Option { - let 
transaction = self.iterator.next()?; - - let mut operations = vec![]; - for operation in transaction.operations { - match self.parser.parse(operation) { - Ok(Some(operation)) => operations.push(operation), - Ok(None) => continue, - Err(err) => return Some(Err(err)), - } - } - - Some(Ok(ParsedTransaction { - commit_scn: transaction.commit_scn, - commit_timestamp: transaction.commit_timestamp, - operations, - })) - } -} - -impl FromStr for ParsedValue { - type Err = Error; - - fn from_str(s: &str) -> Result { - if s.starts_with('\'') { - Ok(ParsedValue::String(s[1..s.len() - 1].to_string())) - } else { - Ok(ParsedValue::Number(s.parse()?)) - } - } -} - -mod delete; -mod insert; -mod row; -mod update; diff --git a/dozer-ingestion/oracle/src/connector/replicate/transaction/parse/row.rs b/dozer-ingestion/oracle/src/connector/replicate/transaction/parse/row.rs deleted file mode 100644 index fcafe6a5ed..0000000000 --- a/dozer-ingestion/oracle/src/connector/replicate/transaction/parse/row.rs +++ /dev/null @@ -1,36 +0,0 @@ -use std::collections::HashMap; - -use regex::Regex; - -use crate::connector::Error; - -use super::{ParsedRow, ParsedValue}; - -#[derive(Debug, Clone)] -pub struct Parser { - regex: Regex, -} - -impl Parser { - pub fn new(delimiter: &str, end: &str) -> Self { - let regex = Regex::new(&format!( - "\"(\\w+)\" (= (.+)|IS NULL)({} *\\n|{})", - delimiter, end - )) - .unwrap(); - Self { regex } - } - - pub fn parse(&self, values: &str) -> Result { - let mut result = HashMap::new(); - for cap in self.regex.captures_iter(values) { - let column = cap.get(1).unwrap().as_str(); - let value = match cap.get(3) { - Some(value) => value.as_str().parse()?, - None => ParsedValue::Null, - }; - result.insert(column.to_string(), value); - } - Ok(result) - } -} diff --git a/dozer-ingestion/oracle/src/connector/replicate/transaction/parse/update.rs b/dozer-ingestion/oracle/src/connector/replicate/transaction/parse/update.rs deleted file mode 100644 index e963938010..0000000000 
--- a/dozer-ingestion/oracle/src/connector/replicate/transaction/parse/update.rs +++ /dev/null @@ -1,100 +0,0 @@ -use dozer_ingestion_connector::dozer_types::log::warn; -use regex::Regex; - -use crate::connector::Error; - -use super::{row, ParsedRow}; - -#[derive(Debug, Clone)] -pub struct Parser { - regex: Regex, - new_row_parser: row::Parser, - old_row_parser: row::Parser, -} - -impl Parser { - pub fn new() -> Self { - let regex = Regex::new( - r#"^update "((?:C##)?\w+)"\."(\w+)"\n *set *\n *(?s)(.+) *where *\n(?s)(.+)$"#, - ) - .unwrap(); - Self { - regex, - new_row_parser: row::Parser::new(",", "\n"), - old_row_parser: row::Parser::new(" and", ";"), - } - } - - pub fn parse( - &self, - sql_redo: &str, - table_pair: &(String, String), - ) -> Result<(ParsedRow, ParsedRow), Error> { - let captures = self - .regex - .captures(sql_redo) - .ok_or_else(|| Error::UpdateFailedToMatch(sql_redo.to_string()))?; - let owner = captures.get(1).unwrap().as_str(); - let table_name = captures.get(2).unwrap().as_str(); - if owner != table_pair.0 || table_name != table_pair.1 { - warn!( - "Table name {}.{} doesn't match {}.{} in log content", - owner, table_name, table_pair.0, table_pair.1 - ); - } - - let mut new_row = self - .new_row_parser - .parse(captures.get(3).unwrap().as_str())?; - let old_row = self - .old_row_parser - .parse(captures.get(4).unwrap().as_str())?; - for (column, old_value) in old_row.iter() { - if !new_row.contains_key(column) { - new_row.insert(column.clone(), old_value.clone()); - } - } - Ok((old_row, new_row)) - } -} - -#[test] -fn test_parse() { - use super::ParsedValue; - - let parser = Parser::new(); - let sql_redo = r#"update "DOZER"."TRANSACTIONS" - set - "TYPE" = 'REBATE' - where - "TRANSACTION_ID" = 12001 and - "CUSTOMER_ID" = 63147 and - "TYPE" = 'Withdrawal' and - "AMOUNT" = 9691.34 and - "CURRENCY" = 'USD' and - "TRANSACTION_DATE" = '28-JAN-24' and - "STATUS" = 'Completed' and - "DESCRIPTION" = 'Yeah become language inside purpose.'; - "#; - 
let (old, new) = parser - .parse(sql_redo, &("HR".to_string(), "EMPLOYEES".to_string())) - .unwrap(); - assert_eq!(old.len(), 8); - assert_eq!(new.len(), 8); - assert_eq!( - old.get("TRANSACTION_ID").unwrap(), - &ParsedValue::Number("12001".parse().unwrap()) - ); - assert_eq!( - new.get("TRANSACTION_ID").unwrap(), - &ParsedValue::Number("12001".parse().unwrap()) - ); - assert_eq!( - old.get("TYPE").unwrap(), - &ParsedValue::String("Withdrawal".to_string()) - ); - assert_eq!( - new.get("TYPE").unwrap(), - &ParsedValue::String("REBATE".to_string()) - ); -} diff --git a/dozer-ingestion/oracle/src/lib.rs b/dozer-ingestion/oracle/src/lib.rs deleted file mode 100644 index cf398cb097..0000000000 --- a/dozer-ingestion/oracle/src/lib.rs +++ /dev/null @@ -1,215 +0,0 @@ -use dozer_ingestion_connector::{ - async_trait, - dozer_types::{ - errors::internal::BoxedError, - log::info, - models::ingestion_types::{IngestionMessage, OracleConfig, TransactionInfo}, - node::OpIdentifier, - types::FieldType, - }, - tokio, Connector, Ingestor, SourceSchemaResult, TableIdentifier, TableInfo, -}; - -#[derive(Debug)] -pub struct OracleConnector { - connection_name: String, - config: OracleConfig, - connectors: Option, -} - -#[derive(Debug, Clone)] -struct Connectors { - root_connector: connector::Connector, - pdb_connector: connector::Connector, - con_id: Option, -} - -const DEFAULT_BATCH_SIZE: usize = 100_000; - -impl OracleConnector { - pub fn new(connection_name: String, config: OracleConfig) -> Self { - Self { - connection_name, - config, - connectors: None, - } - } - - async fn ensure_connection( - &mut self, - force_reconnect: bool, - ) -> Result { - if self.connectors.is_none() || force_reconnect { - let connection_name = self.connection_name.clone(); - let config = self.config.clone(); - let pdb = self.config.pdb.clone(); - self.connectors = Some( - tokio::task::spawn_blocking(move || { - let root_connect_string = - format!("{}:{}/{}", config.host, config.port, config.sid); - let 
batch_size = config.batch_size.unwrap_or(DEFAULT_BATCH_SIZE); - let mut root_connector = connector::Connector::new( - connection_name.clone(), - config.user.clone(), - &config.password, - &root_connect_string, - batch_size, - config.replicator, - )?; - - let (pdb_connector, con_id) = if let Some(pdb) = pdb { - let pdb_connect_string = format!("{}:{}/{}", config.host, config.port, pdb); - let pdb_connector = connector::Connector::new( - connection_name, - config.user.clone(), - &config.password, - &pdb_connect_string, - batch_size, - config.replicator, - )?; - let con_id = root_connector.get_con_id(&pdb)?; - (pdb_connector, Some(con_id)) - } else { - (root_connector.clone(), None) - }; - - Ok::<_, connector::Error>(Connectors { - root_connector, - pdb_connector, - con_id, - }) - }) - .await - .unwrap()?, - ); - } - Ok(self.connectors.as_ref().unwrap().clone()) - } -} - -#[async_trait] -impl Connector for OracleConnector { - fn types_mapping() -> Vec<(String, Option)> - where - Self: Sized, - { - todo!() - } - - async fn validate_connection(&mut self) -> Result<(), BoxedError> { - self.ensure_connection(false).await?; - Ok(()) - } - - async fn list_tables(&mut self) -> Result, BoxedError> { - let mut connectors = self.ensure_connection(false).await?; - let schemas = self.config.schemas.clone(); - tokio::task::spawn_blocking(move || connectors.pdb_connector.list_tables(&schemas)) - .await - .unwrap() - .map_err(Into::into) - } - - async fn validate_tables(&mut self, tables: &[TableIdentifier]) -> Result<(), BoxedError> { - self.list_columns(tables.to_vec()).await?; - Ok(()) - } - - async fn list_columns( - &mut self, - tables: Vec, - ) -> Result, BoxedError> { - let mut connectors = self.ensure_connection(false).await?; - tokio::task::spawn_blocking(move || connectors.pdb_connector.list_columns(tables)) - .await - .unwrap() - .map_err(Into::into) - } - - async fn get_schemas( - &mut self, - table_infos: &[TableInfo], - ) -> Result, BoxedError> { - let mut connectors = 
self.ensure_connection(false).await?; - let table_infos = table_infos.to_vec(); - Ok( - tokio::task::spawn_blocking(move || connectors.pdb_connector.get_schemas(&table_infos)) - .await - .unwrap()? - .into_iter() - .map(|result| result.map_err(Into::into)) - .collect(), - ) - } - - async fn serialize_state(&self) -> Result, BoxedError> { - Ok(vec![]) - } - - async fn start( - &mut self, - ingestor: &Ingestor, - tables: Vec, - last_checkpoint: Option, - ) -> Result<(), BoxedError> { - let checkpoint = if let Some(last_checkpoint) = last_checkpoint { - last_checkpoint.txid - } else { - info!("No checkpoint passed, starting snapshotting"); - - let ingestor_clone = ingestor.clone(); - let tables = tables.clone(); - let mut connectors = self.ensure_connection(false).await?; - - if ingestor - .handle_message(IngestionMessage::TransactionInfo( - TransactionInfo::SnapshottingStarted, - )) - .await - .is_err() - { - return Ok(()); - } - let scn = tokio::task::spawn_blocking(move || { - connectors.pdb_connector.snapshot(&ingestor_clone, tables) - }) - .await - .unwrap()?; - ingestor - .handle_message(IngestionMessage::TransactionInfo( - TransactionInfo::SnapshottingDone { - id: Some(OpIdentifier { - txid: scn, - seq_in_tx: 0, - }), - }, - )) - .await?; - scn - }; - - info!("Replicating from checkpoint: {}", checkpoint); - let ingestor = ingestor.clone(); - let schemas = self.get_schemas(&tables).await?; - let schemas = schemas - .into_iter() - .map(|schema| schema.map(|schema| schema.schema)) - .collect::, _>>()?; - let mut connectors = self.ensure_connection(false).await?; - tokio::task::spawn_blocking(move || { - connectors.root_connector.replicate( - &ingestor, - tables, - schemas, - checkpoint, - connectors.con_id, - ) - }) - .await - .unwrap(); - - Ok(()) - } -} - -mod connector; diff --git a/dozer-ingestion/src/errors.rs b/dozer-ingestion/src/errors.rs index 5f6646c4e2..8668bc6e20 100644 --- a/dozer-ingestion/src/errors.rs +++ b/dozer-ingestion/src/errors.rs @@ -33,6 
+33,9 @@ pub enum ConnectorError { #[error("javascript feature is not enabled")] JavascrtiptFeatureNotEnabled, + #[error("{0}: This feature is only available in enterprise. Please contact us.")] + FeatureNotEnabled(String), + #[error("{0} is not supported as a source connector")] Unsupported(String), } diff --git a/dozer-ingestion/src/lib.rs b/dozer-ingestion/src/lib.rs index 72090e531a..934429f0d3 100644 --- a/dozer-ingestion/src/lib.rs +++ b/dozer-ingestion/src/lib.rs @@ -1,6 +1,3 @@ -use std::sync::Arc; - -use dozer_ingestion_aerospike::connector::AerospikeConnector; #[cfg(feature = "ethereum")] use dozer_ingestion_connector::dozer_types::models::ingestion_types::EthProviderConfig; use dozer_ingestion_connector::dozer_types::{ @@ -10,7 +7,6 @@ use dozer_ingestion_connector::dozer_types::{ connection::{Connection, ConnectionConfig}, ingestion_types::default_grpc_adapter, }, - node::NodeHandle, prettytable::Table, }; #[cfg(feature = "datafusion")] @@ -27,7 +23,6 @@ use dozer_ingestion_mongodb::MongodbConnector; use dozer_ingestion_mysql::connector::{mysql_connection_opts_from_url, MySQLConnector}; #[cfg(feature = "datafusion")] use dozer_ingestion_object_store::connector::ObjectStoreConnector; -use dozer_ingestion_oracle::OracleConnector; use dozer_ingestion_postgres::{ connection::helper::map_connection_config, connector::{PostgresConfig, PostgresConnector}, @@ -36,6 +31,7 @@ use dozer_ingestion_postgres::{ use dozer_ingestion_snowflake::connector::SnowflakeConnector; use dozer_ingestion_webhook::connector::WebhookConnector; use errors::ConnectorError; +use std::sync::Arc; use tokio::runtime::Runtime; pub mod errors; @@ -157,15 +153,10 @@ pub fn get_connector( runtime, javascript_config, ))), - ConnectionConfig::Aerospike(config) => Ok(Box::new(AerospikeConnector::new( - config, - NodeHandle::new(None, connection.name), - event_hub.receiver, - ))), - ConnectionConfig::Oracle(oracle_config) => Ok(Box::new(OracleConnector::new( - connection.name, - oracle_config, - 
))), + ConnectionConfig::Aerospike(_) => { + Err(ConnectorError::FeatureNotEnabled("Aerospike".to_string())) + } + ConnectionConfig::Oracle(_) => Err(ConnectorError::FeatureNotEnabled("Oracle".to_string())), } } diff --git a/dozer-sink-aerospike/Cargo.toml b/dozer-sink-aerospike/Cargo.toml deleted file mode 100644 index 2f711eaf79..0000000000 --- a/dozer-sink-aerospike/Cargo.toml +++ /dev/null @@ -1,14 +0,0 @@ -[package] -name = "dozer-sink-aerospike" -version = "0.1.0" -edition = "2021" -license = "AGPL-3.0-or-later" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -dozer-core = { path = "../dozer-core" } -dozer-types = { path = "../dozer-types" } -aerospike-client-sys = { path = "./aerospike-client-sys" } -itertools = "0.12" -smallvec = "1.13.1" diff --git a/dozer-sink-aerospike/aerospike-client-sys/Cargo.lock b/dozer-sink-aerospike/aerospike-client-sys/Cargo.lock deleted file mode 100644 index 7ae1b029d4..0000000000 --- a/dozer-sink-aerospike/aerospike-client-sys/Cargo.lock +++ /dev/null @@ -1,7 +0,0 @@ -# This file is automatically @generated by Cargo. -# It is not intended for manual editing. 
-version = 3 - -[[package]] -name = "aerospike-client-sys" -version = "0.1.0" diff --git a/dozer-sink-aerospike/aerospike-client-sys/Cargo.toml b/dozer-sink-aerospike/aerospike-client-sys/Cargo.toml deleted file mode 100644 index ec99384aae..0000000000 --- a/dozer-sink-aerospike/aerospike-client-sys/Cargo.toml +++ /dev/null @@ -1,15 +0,0 @@ -[package] -name = "aerospike-client-sys" -version = "0.1.0" -edition = "2021" -license = "AGPL-3.0-or-later" - -[lib] -doctest = false - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] - -[build-dependencies] -bindgen = "0.69.4" diff --git a/dozer-sink-aerospike/aerospike-client-sys/aerospike_client.h b/dozer-sink-aerospike/aerospike-client-sys/aerospike_client.h deleted file mode 100644 index c7716020e1..0000000000 --- a/dozer-sink-aerospike/aerospike-client-sys/aerospike_client.h +++ /dev/null @@ -1,16 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include diff --git a/dozer-sink-aerospike/aerospike-client-sys/build.rs b/dozer-sink-aerospike/aerospike-client-sys/build.rs deleted file mode 100644 index cd6143c8fd..0000000000 --- a/dozer-sink-aerospike/aerospike-client-sys/build.rs +++ /dev/null @@ -1,74 +0,0 @@ -use core::panic; -use std::{ - env, fs, - path::{Path, PathBuf}, - process::Command, -}; - -fn cp_r(dir: &Path, dest: &Path) { - for entry in fs::read_dir(dir).unwrap() { - let entry = entry.unwrap(); - let path = entry.path(); - let dst = dest.join(path.file_name().expect("Failed to get filename of path")); - if fs::metadata(&path).unwrap().is_file() { - fs::copy(path, dst).unwrap(); - } else { - fs::create_dir_all(&dst).unwrap(); - cp_r(&path, &dst); - } - } -} -fn main() { - let out_dir = PathBuf::from(std::env::var("OUT_DIR").unwrap()); - let build_dir = out_dir.join("build"); - fs::create_dir_all(&build_dir).unwrap(); - let output_dir = 
build_dir.join("out"); - let lib_dir = output_dir.join("lib"); - let include_dir = output_dir.join("include"); - let make_flags = vec!["TARGET_BASE=out"]; - - let current_dir = env::current_dir().unwrap(); - let source_dir = current_dir.join("aerospike-client-c"); - cp_r(&source_dir, &build_dir); - - let mut make = Command::new("make"); - make.args(make_flags) - .env("MAKEFLAGS", std::env::var("CARGO_MAKEFLAGS").unwrap()) - // The Makefile checks whether DEBUG is defined and cargo always sets it - // (it's either DEBUG=false or DEBUG=true, but always defined). When DEBUG, - // it tries to link against gcov, which we don't want - .env_remove("DEBUG") - .current_dir(build_dir); - let out = make.output().unwrap(); - if !out.status.success() { - panic!( - "Building aerospike client failed with exit code {}.\nstout: {}\nstderr: {}", - out.status.code().unwrap(), - String::from_utf8_lossy(&out.stdout), - String::from_utf8_lossy(&out.stderr), - ); - } - println!("cargo:rustc-link-search=native={}", lib_dir.display()); - println!("cargo:rustc-link-lib=static=aerospike"); - println!("cargo:rustc-link-lib=ssl"); - println!("cargo:rustc-link-lib=crypto"); - println!("cargo:rustc-link-lib=m"); - println!("cargo:rustc-link-lib=z"); - println!("cargo:rustc-link-lib=pthread"); - - println!("cargo:rerun-if-changed=aerospike_client.h"); - println!("cargo:rerun-if-changed=aerospike-client-c"); - let bindings = bindgen::Builder::default() - .header("aerospike_client.h") - .allowlist_type("(as|aerospike)_.*") - .allowlist_type("aerospike") - .allowlist_function("(as|aerospike)_.*") - .allowlist_var("(as|AS)_.*") - .clang_arg(format!("-I{}", include_dir.to_str().unwrap())) - .generate() - .expect("Unable to generate bindings"); - - bindings - .write_to_file(out_dir.join("generated.rs")) - .expect("Failed to write bindings"); -} diff --git a/dozer-sink-aerospike/aerospike-client-sys/src/lib.rs b/dozer-sink-aerospike/aerospike-client-sys/src/lib.rs deleted file mode 100644 index 
e67d2da84b..0000000000 --- a/dozer-sink-aerospike/aerospike-client-sys/src/lib.rs +++ /dev/null @@ -1,174 +0,0 @@ -#![allow(clippy::all)] -#![allow(non_camel_case_types)] -#![allow(non_upper_case_globals)] -#![allow(non_snake_case)] - -include!(concat!(env!("OUT_DIR"), "/generated.rs")); - -#[macro_export] -macro_rules! as_exp_build { - ($func:ident $args:tt ) => {{ - let mut v = Vec::new(); - $crate::as_exp_build_inner!(v, $func $args); - $crate::as_exp_compile(v.as_mut_ptr(), v.len() as u32) - }} -} - -#[macro_export] -macro_rules! as_exp_build_inner { - ($v:expr, as_exp_bin_int($bin_name:expr $(,)?)) => {{ - let bin_name: *const i8 = $bin_name; - $v.push($crate::as_exp_entry { - op: $crate::as_exp_ops__AS_EXP_CODE_BIN, - count: 3, - sz: 0, - prev_va_args: 0, - v: std::mem::zeroed(), - }); - $crate::as_exp_build_inner!($v, as_exp_int($crate::as_exp_type_AS_EXP_TYPE_INT as i64)); - $v.push($crate::as_exp_entry { - op: $crate::as_exp_ops__AS_EXP_CODE_VAL_RAWSTR, - v: $crate::as_exp_entry__bindgen_ty_1 { str_val: bin_name }, - count: 0, - sz: 0, - prev_va_args: 0, - }); - }}; - ($v:expr, as_exp_int($val:expr)) => { - $v.push($crate::as_exp_entry { - op: $crate::as_exp_ops__AS_EXP_CODE_VAL_INT, - v: $crate::as_exp_entry__bindgen_ty_1 { int_val: $val }, - count: 0, - sz: 0, - prev_va_args: 0, - }) - }; - ($v:expr, as_exp_uint($val:expr)) => { - $v.push($crate::as_exp_entry { - op: $crate::as_exp_ops__AS_EXP_CODE_VAL_UINT, - v: $crate::as_exp_entry__bindgen_ty_1 { uint_val: $val }, - count: 0, - sz: 0, - prev_va_args: 0, - }) - }; - ($v:expr, as_exp_cmp_eq($left_name:ident $left_args:tt, $right_name:ident $right_args:tt $(,)?)) => {{ - $v.push($crate::as_exp_entry { - op: $crate::as_exp_ops__AS_EXP_CODE_CMP_EQ, - count: 3, - v: std::mem::zeroed(), - sz: 0, - prev_va_args: 0, - }); - $crate::as_exp_build_inner!($v, $left_name $left_args); - $crate::as_exp_build_inner!($v, $right_name $right_args); - }}; - ($v:expr, as_exp_cmp_gt($left_name:ident $left_args:tt, 
$right_name:ident $right_args:tt $(,)?)) => {{ - $v.push($crate::as_exp_entry { - op: $crate::as_exp_ops__AS_EXP_CODE_CMP_GT, - count: 3, - v: std::mem::zeroed(), - sz: 0, - prev_va_args: 0, - }); - $crate::as_exp_build_inner!($v, $left_name $left_args); - $crate::as_exp_build_inner!($v, $right_name $right_args); - }}; - ($v:expr, as_exp_cmp_ge($left_name:ident $left_args:tt, $right_name:ident $right_args:tt $(,)?)) => {{ - $v.push($crate::as_exp_entry { - op: $crate::as_exp_ops__AS_EXP_CODE_CMP_GE, - count: 3, - v: std::mem::zeroed(), - sz: 0, - prev_va_args: 0, - }); - $crate::as_exp_build_inner!($v, $left); - $crate::as_exp_build_inner!($v, $right); - }}; - ($v:expr, as_exp_cmp_lt($left_name:ident $left_args:tt, $right_name:ident $right_args:tt $(,)?)) => {{ - $v.push($crate::as_exp_entry { - op: $crate::as_exp_ops__AS_EXP_CODE_CMP_LT, - count: 3, - v: std::mem::zeroed(), - sz: 0, - prev_va_args: 0, - }); - $crate::as_exp_build_inner!($v, $left_name $left_args); - $crate::as_exp_build_inner!($v, $right_name $right_args); - }}; - ($v:expr, as_exp_cmp_le($left_name:ident $left_args:tt, $right_name:ident $right_args:tt $(,)?)) => {{ - $v.push($crate::as_exp_entry { - op: $crate::as_exp_ops__AS_EXP_CODE_CMP_LE, - count: 3, - v: std::mem::zeroed(), - sz: 0, - prev_va_args: 0, - }); - $crate::as_exp_build_inner!($v, $left_name $left_args); - $crate::as_exp_build_inner!($v, $right_name $right_args); - }}; - ($v:expr, as_exp_and($($arg_name:ident $arg_args:tt),*)) => {{ - $v.push($crate::as_exp_entry { - op: $crate::as_exp_ops__AS_EXP_CODE_AND, - count: 0, - v: std::mem::zeroed(), - sz: 0, - prev_va_args: 0, - }); - $($crate::as_exp_build_inner!($v, $arg_name $arg_args));*; - $v.push($crate::as_exp_entry { - op: $crate::as_exp_ops__AS_EXP_CODE_END_OF_VA_ARGS, - count: 0, - v: std::mem::zeroed(), - sz: 0, - prev_va_args: 0, - }); - }}; -($v:expr, as_exp_or($($arg_name:ident $arg_args:tt),*)) => {{ - $v.push($crate::as_exp_entry { - op: $crate::as_exp_ops__AS_EXP_CODE_OR, 
- count: 0, - v: std::mem::zeroed(), - sz: 0, - prev_va_args: 0, - }); - $($crate::as_exp_build_inner!($v, $arg_name $arg_args));*; - $v.push($crate::as_exp_entry { - op: $crate::as_exp_ops__AS_EXP_CODE_END_OF_VA_ARGS, - count: 0, - v: std::mem::zeroed(), - sz: 0, - prev_va_args: 0, - }); - }}; -} - -#[cfg(test)] -mod tests { - use std::ffi::CString; - - use super::*; - - #[test] - fn test_as_exp_build() { - // Tested that this results in the same compiled expression as when - // using the macros from the C library - let bin_name = CString::new("bin_name").unwrap(); - unsafe { - let exp = as_exp_build! { - as_exp_and( - as_exp_cmp_gt( - as_exp_bin_int(bin_name.as_ptr()), - as_exp_int(3) - ), - as_exp_cmp_lt( - as_exp_bin_int(bin_name.as_ptr()), - as_exp_int(8) - ) - ) - }; - assert!(!exp.is_null()); - as_exp_destroy(exp); - } - } -} diff --git a/dozer-sink-aerospike/src/aerospike.rs b/dozer-sink-aerospike/src/aerospike.rs deleted file mode 100644 index d6d59d9a1e..0000000000 --- a/dozer-sink-aerospike/src/aerospike.rs +++ /dev/null @@ -1,1252 +0,0 @@ -use std::time::Instant; -use std::{ - alloc::{handle_alloc_error, Layout}, - ffi::{c_char, c_void, CStr, CString, NulError}, - fmt::Display, - mem::MaybeUninit, - ptr::{addr_of, addr_of_mut, NonNull}, - slice, -}; - -use itertools::Itertools; - -use aerospike_client_sys::*; -use dozer_types::log::debug; -use dozer_types::{ - chrono::{DateTime, NaiveDate}, - geo::{Coord, Point}, - json_types::{DestructuredJsonRef, JsonValue}, - ordered_float::OrderedFloat, - rust_decimal::prelude::*, - thiserror, - types::{DozerDuration, DozerPoint, Field, Schema}, -}; - -use crate::{denorm_dag::Error, AerospikeSinkError}; - -#[derive(Debug)] -pub struct BinNames { - storage: Vec, - _ptrs: Vec<*mut i8>, -} - -unsafe impl Send for BinNames {} - -impl Clone for BinNames { - fn clone(&self) -> Self { - let storage = self.storage.clone(); - let ptrs = Self::make_ptrs(&storage); - Self { - storage, - _ptrs: ptrs, - } - } -} - -impl BinNames 
{ - fn make_ptrs(storage: &[CString]) -> Vec<*mut i8> { - storage - .iter() - .map(|name| name.as_ptr() as *mut i8) - .collect() - } - - pub(crate) fn _len(&self) -> usize { - self.storage.len() - } - - pub(crate) unsafe fn _ptrs(&mut self) -> *mut *mut i8 { - self._ptrs.as_mut_ptr() - } - - pub(crate) fn names(&self) -> &[CString] { - &self.storage - } - - pub(crate) fn new<'a, I: IntoIterator>(names: I) -> Result { - let storage: Vec = names - .into_iter() - .map(CString::new) - .collect::>()?; - let ptrs = Self::make_ptrs(&storage); - Ok(Self { - storage, - _ptrs: ptrs, - }) - } -} - -#[derive(Debug, thiserror::Error)] -pub struct AerospikeError { - pub(crate) code: i32, - pub(crate) message: String, -} - -impl AerospikeError { - pub(crate) fn from_code(value: as_status) -> Self { - let message = unsafe { as_error_string(value) }; - - let message = unsafe { CStr::from_ptr(message) }; - // The message is ASCII (I think?), so this should not fail - Self { - code: value, - message: message.to_str().unwrap().to_owned(), - } - } -} - -impl From for AerospikeError { - fn from(value: as_error) -> Self { - let code = value.code; - let message = unsafe { - let message = CStr::from_ptr(value.message.as_ptr()); - // The message is ASCII (I think?), so this should not fail - message.to_str().unwrap() - }; - Self { - code, - message: message.to_owned(), - } - } -} - -impl std::fmt::Display for AerospikeError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{} - {}", self.code, self.message) - } -} - -// Client should never be `Clone`, because of the custom Drop impl -#[derive(Debug)] -pub struct Client { - inner: NonNull, -} - -// The aerospike client API is thread-safe. 
-unsafe impl Send for Client {} -unsafe impl Sync for Client {} - -#[inline(always)] -pub(crate) unsafe fn check_alloc(ptr: *mut T) -> *mut T { - if ptr.is_null() { - handle_alloc_error(Layout::new::()) - } - ptr -} -#[inline(always)] -unsafe fn as_try(f: impl FnOnce(*mut as_error) -> as_status) -> Result<(), AerospikeError> { - let mut err = MaybeUninit::uninit(); - if f(err.as_mut_ptr()) == as_status_e_AEROSPIKE_OK { - Ok(()) - } else { - Err(AerospikeError::from(err.assume_init())) - } -} - -impl Client { - pub fn new(hosts: &CStr) -> Result { - let mut config = unsafe { - let mut config = MaybeUninit::uninit(); - as_config_init(config.as_mut_ptr()); - config.assume_init() - }; - - config.policies.batch.base.total_timeout = 30000; - config.policies.batch.base.socket_timeout = 30000; - config.policies.write.key = as_policy_key_e_AS_POLICY_KEY_SEND; - config.policies.batch_write.key = as_policy_key_e_AS_POLICY_KEY_SEND; - unsafe { - // The hosts string will be copied, so pass it as `as_ptr` so the original - // gets deallocated at the end of this block - as_config_add_hosts(&mut config as *mut as_config, hosts.as_ptr(), 3000); - } - // Allocate a new client instance. 
Our `Drop` implementation will make - // sure it is destroyed - let this = unsafe { - let inner = aerospike_new(&mut config as *mut as_config); - if inner.is_null() { - handle_alloc_error(Layout::new::()) - } - let this = Self { - inner: NonNull::new_unchecked(inner), - }; - this.connect()?; - this - }; - Ok(this) - } - - fn connect(&self) -> Result<(), AerospikeError> { - unsafe { as_try(|err| aerospike_connect(self.inner.as_ptr(), err)) } - } - - unsafe fn put( - &self, - key: *const as_key, - record: *mut as_record, - mut policy: as_policy_write, - filter: Option>, - ) -> Result<(), AerospikeError> { - if let Some(filter) = filter { - policy.base.filter_exp = filter.as_ptr(); - } - - as_try(|err| { - aerospike_key_put( - self.inner.as_ptr(), - err, - &policy as *const as_policy_write, - key, - record, - ) - }) - } - - pub(crate) unsafe fn _insert( - &self, - key: *const as_key, - new: *mut as_record, - filter: Option>, - ) -> Result<(), AerospikeError> { - let mut policy = self.inner.as_ref().config.policies.write; - policy.exists = as_policy_exists_e_AS_POLICY_EXISTS_CREATE; - - self.put(key, new, policy, filter) - } - - pub(crate) unsafe fn _update( - &self, - key: *const as_key, - new: *mut as_record, - filter: Option>, - ) -> Result<(), AerospikeError> { - let mut policy = self.inner.as_ref().config.policies.write; - policy.exists = as_policy_exists_e_AS_POLICY_EXISTS_UPDATE; - self.put(key, new, policy, filter) - } - - pub(crate) unsafe fn upsert( - &self, - key: *const as_key, - new: *mut as_record, - filter: Option>, - ) -> Result<(), AerospikeError> { - let mut policy = self.inner.as_ref().config.policies.write; - policy.exists = as_policy_exists_e_AS_POLICY_EXISTS_CREATE_OR_REPLACE; - self.put(key, new, policy, filter) - } - - pub(crate) unsafe fn _delete( - &self, - key: *const as_key, - filter: Option>, - ) -> Result<(), AerospikeError> { - let mut policy = self.inner.as_ref().config.policies.remove; - if let Some(filter) = filter { - 
policy.base.filter_exp = filter.as_ptr(); - } - as_try(|err| { - aerospike_key_remove( - self.inner.as_ptr(), - err, - &policy as *const as_policy_remove, - key, - ) - }) - } - - pub(crate) fn config(&self) -> &as_config { - unsafe { &(*self.inner.as_ptr()).config } - } - - pub(crate) unsafe fn write_batch( - &self, - batch: *mut as_batch_records, - policy: Option<*const as_policy_batch>, - ) -> Result<(), AerospikeError> { - debug!(target: "aerospike_sink", "Writing batch of size {}", batch.as_ref().unwrap().list.size); - - let started = Instant::now(); - as_try(|err| { - aerospike_batch_write( - self.inner.as_ptr(), - err, - policy.unwrap_or(std::ptr::null()), - batch, - ) - })?; - debug!(target: "aerospike_sink", "Batch write took {:?}", started.elapsed()); - Ok(()) - } - - pub(crate) unsafe fn _select( - &self, - key: *const as_key, - bins: &[*const c_char], - record: &mut *mut as_record, - ) -> Result<(), AerospikeError> { - as_try(|err| { - aerospike_key_select( - self.inner.as_ptr(), - err, - std::ptr::null(), - key, - // This won't write to the mut ptr - bins.as_ptr() as *mut *const c_char, - record as *mut *mut as_record, - ) - }) - } - pub(crate) unsafe fn get( - &self, - key: *const as_key, - record: &mut *mut as_record, - ) -> Result<(), AerospikeError> { - as_try(|err| { - aerospike_key_get( - self.inner.as_ptr(), - err, - std::ptr::null(), - key, - record as *mut *mut as_record, - ) - }) - } - - pub(crate) unsafe fn batch_get( - &self, - batch: *mut as_batch_records, - ) -> Result<(), AerospikeError> { - as_try(|err| aerospike_batch_read(self.inner.as_ptr(), err, std::ptr::null(), batch)) - } - - /// # Safety - /// The caller is responsible for cleaning up the response - /// - /// This function sends a raw info request to the aerospike server - pub unsafe fn info( - &self, - request: &CStr, - response: &mut *mut i8, - ) -> Result<(), AerospikeError> { - as_try(|err| { - aerospike_info_any( - self.inner.as_ptr(), - err, - std::ptr::null(), - 
request.as_ptr(), - response as *mut *mut i8, - ) - }) - } -} - -impl Drop for Client { - fn drop(&mut self) { - unsafe { - aerospike_destroy(self.inner.as_ptr()); - } - } -} - -pub(crate) fn convert_json(value: &JsonValue) -> Result<*mut as_bin_value, AerospikeSinkError> { - unsafe { - Ok(match value.destructure_ref() { - // as_nil is a static, so we can't directly create a mutable pointer to it. We cast - // through a const pointer instead. This location will never be written to, - // because `free = false` - DestructuredJsonRef::Null => addr_of!(as_nil) as *mut as_val as *mut as_bin_value, - DestructuredJsonRef::Bool(value) => { - check_alloc(as_boolean_new(value)) as *mut as_bin_value - } - DestructuredJsonRef::Number(value) => { - if let Some(float) = value.to_f64() { - check_alloc(as_double_new(float)) as *mut as_bin_value - } else if let Some(integer) = value.to_i64() { - check_alloc(as_integer_new(integer)) as *mut as_bin_value - } else { - // If we can't represent as i64, we have a u64 that's larger than i64::MAX - return Err(AerospikeSinkError::IntegerOutOfRange( - value.to_u64().unwrap(), - )); - } - } - DestructuredJsonRef::String(value) => { - let bytes = check_alloc(as_bytes_new(value.len() as u32)); - as_bytes_set(bytes, 0, value.as_ptr(), value.len() as u32); - (*bytes).type_ = as_bytes_type_e_AS_BYTES_STRING; - bytes as *mut as_bin_value - } - DestructuredJsonRef::Array(value) => { - let list = check_alloc(as_arraylist_new(value.len() as u32, value.len() as u32)); - for v in value.iter() { - let as_value = convert_json(v)?; - if as_arraylist_append(list, as_value as *mut as_val) - != as_status_e_AEROSPIKE_OK - { - as_arraylist_destroy(list); - return Err(AerospikeSinkError::CreateRecordError); - } - } - list as *mut as_bin_value - } - DestructuredJsonRef::Object(value) => { - let map = check_alloc(as_orderedmap_new(value.len() as u32)); - struct Map(*mut as_orderedmap); - impl Drop for Map { - fn drop(&mut self) { - unsafe { - 
as_orderedmap_destroy(self.0); - } - } - } - // Make sure the map is deallocated if we encounter any error... - let _map_guard = Map(map); - for (k, v) in value.iter() { - let as_value = convert_json(v)?; - let key = { - let bytes = check_alloc(as_bytes_new(k.len() as u32)); - debug_assert!(as_bytes_set(bytes, 0, k.as_ptr(), k.len() as u32)); - (*bytes).type_ = as_bytes_type_e_AS_BYTES_STRING; - bytes as *mut as_val - }; - if as_orderedmap_set(map, key, as_value as *mut as_val) != 0 { - return Err(AerospikeSinkError::CreateRecordError); - }; - } - // ...but don't deallocate if we succeed - std::mem::forget(_map_guard); - map as *mut as_bin_value - } - }) - } -} - -#[inline] -fn set_str_key( - key: *mut as_key, - namespace: &CStr, - set: &CStr, - mut string: String, - allocated_strings: &mut Vec, -) { - unsafe { - let bytes = as_bytes_new_wrap(string.as_mut_ptr(), string.len() as u32, false); - (*bytes).type_ = as_bytes_type_e_AS_BYTES_STRING; - allocated_strings.push(string); - as_key_init_value( - key, - namespace.as_ptr(), - set.as_ptr(), - bytes as *const _ as *const as_key_value, - ); - } -} - -pub(crate) unsafe fn init_key( - key: *mut as_key, - namespace: &CStr, - set: &CStr, - key_fields: &[Field], - allocated_strings: &mut Vec, -) -> Result<(), AerospikeSinkError> { - assert!(!key_fields.is_empty()); - // Fast option - if key_fields.len() == 1 { - return init_key_single(key, namespace, set, &key_fields[0], allocated_strings); - } - - let key_string = key_fields.iter().join("_"); - set_str_key(key, namespace, set, key_string, allocated_strings); - - Ok(()) -} - -unsafe fn init_key_single( - key: *mut as_key, - namespace: &CStr, - set: &CStr, - key_field: &Field, - allocated_strings: &mut Vec, -) -> Result<(), AerospikeSinkError> { - unsafe { - match key_field { - Field::UInt(v) => { - as_key_init_int64(key, namespace.as_ptr(), set.as_ptr(), *v as i64); - } - Field::Int(v) => { - as_key_init_int64(key, namespace.as_ptr(), set.as_ptr(), *v); - } - 
Field::Int8(v) => { - as_key_init_int64(key, namespace.as_ptr(), set.as_ptr(), (*v).into()); - } - Field::U128(v) => set_str_key(key, namespace, set, v.to_string(), allocated_strings), - Field::I128(v) => set_str_key(key, namespace, set, v.to_string(), allocated_strings), - Field::Decimal(v) => set_str_key(key, namespace, set, v.to_string(), allocated_strings), - Field::Text(string) | Field::String(string) => { - set_str_key(key, namespace, set, string.clone(), allocated_strings); - } - Field::Binary(v) => { - as_key_init_rawp( - key, - namespace.as_ptr(), - set.as_ptr(), - v.as_ptr(), - v.len() as u32, - false, - ); - } - - Field::Timestamp(v) => { - set_str_key(key, namespace, set, v.to_rfc3339(), allocated_strings) - } - // Date's display implementation is RFC3339 compatible - Field::Date(v) => set_str_key(key, namespace, set, v.to_string(), allocated_strings), - // We can ignore the time unit, as we always output a - // full-resolution duration - Field::Duration(DozerDuration(duration, _)) => set_str_key( - key, - namespace, - set, - format!("PT{},{:09}S", duration.as_secs(), duration.subsec_nanos()), - allocated_strings, - ), - Field::Null => unreachable!("Primary key cannot be null"), - Field::Boolean(_) | Field::Json(_) | Field::Point(_) | Field::Float(_) => { - unreachable!("Unsupported primary key type. 
If this is reached, it means this record does not conform to the schema.") - } - }; - } - Ok(()) -} - -unsafe fn map_set_str( - map: *mut as_orderedmap, - key: *const as_val, - string: impl Display, - allocated_strings: &mut Vec, -) { - let string = format!("{string}\0"); - - let cstr = CStr::from_bytes_with_nul(string.as_bytes()).unwrap(); - let val = - as_string_new_wlen(cstr.as_ptr() as *mut c_char, string.len(), false) as *const as_val; - as_orderedmap_set(map, key, val); - allocated_strings.push(string); -} - -pub(crate) unsafe fn new_record_map( - dozer_record: &[Field], - bin_names: &[CString], - allocated_strings: &mut Vec, -) -> Result<*mut as_orderedmap, AerospikeSinkError> { - let map = check_alloc(as_orderedmap_new(bin_names.len().try_into().unwrap())); - for (def, field) in bin_names.iter().zip(dozer_record) { - let key = check_alloc(as_string_new_strdup(def.as_ptr())) as *const as_val; - match field { - Field::UInt(v) => { - as_orderedmap_set( - map, - key, - check_alloc(as_integer_new((*v).try_into().unwrap())) as *const as_val, - ); - } - Field::U128(v) => { - map_set_str(map, key, v, allocated_strings); - } - Field::Int(v) => { - as_orderedmap_set(map, key, check_alloc(as_integer_new(*v)) as *const as_val); - } - Field::Int8(v) => { - as_orderedmap_set( - map, - key, - check_alloc(as_integer_new((*v).into())) as *const as_val, - ); - } - Field::I128(v) => { - map_set_str(map, key, v, allocated_strings); - } - Field::Float(OrderedFloat(v)) => { - as_orderedmap_set(map, key, check_alloc(as_double_new(*v)) as *const as_val); - } - Field::Boolean(v) => { - as_orderedmap_set(map, key, check_alloc(as_boolean_new(*v)) as *const as_val); - } - Field::String(v) | Field::Text(v) => { - map_set_str(map, key, v, allocated_strings); - } - Field::Binary(v) => { - let bytes = check_alloc(as_bytes_new(v.len().try_into().unwrap())); - as_bytes_set(bytes, 0, v.as_ptr(), v.len().try_into().unwrap()); - as_orderedmap_set(map, key, bytes as *const as_val); - } - 
Field::Decimal(v) => { - map_set_str(map, key, v, allocated_strings); - } - Field::Timestamp(v) => { - map_set_str(map, key, v.to_rfc3339(), allocated_strings); - } - // Date's display implementation is RFC3339 compatible - Field::Date(v) => { - map_set_str(map, key, v, allocated_strings); - } - Field::Null => { - as_orderedmap_set(map, key, addr_of!(as_nil) as *const as_val); - } - // XXX: Geojson points have to have coordinates <90. Dozer points can - // be arbitrary locations. - Field::Point(DozerPoint(Point(Coord { x, y }))) => { - // Using our string-as-bytes trick does not work, as BYTES_GEOJSON is not - // a plain string format. Instead, we just make sure we include a nul-byte - // in our regular string, as that is easiest to integration with the other - // string allocations. - map_set_str( - map, - key, - format_args!(r#"{{"type": "Point", "coordinates": [{}, {}]}}"#, x.0, y.0), - allocated_strings, - ); - // Parsing is unimplemented and it's better to fail early - unimplemented!(); - } - Field::Json(v) => { - let val = convert_json(v)? as *const as_val; - as_orderedmap_set(map, key, val); - // Parsing is unimplemented and it's better to fail early - unimplemented!(); - } - Field::Duration(DozerDuration(duration, _)) => { - map_set_str( - map, - key, - format_args!("PT{},{:09}S", duration.as_secs(), duration.subsec_nanos()), - allocated_strings, - ); - // Parsing is unimplemented and it's better to fail early - unimplemented!(); - } - } - } - Ok(map) -} - -unsafe fn set_operation_str( - ops: *mut as_operations, - name: *const c_char, - mut string: String, - allocated_strings: &mut Vec, -) { - let ptr = string.as_mut_ptr(); - let len = string.len(); - allocated_strings.push(string); - // Unfortunately we need to do an allocation here for the bytes container. - // This is because as_operations does not allow setting a bytes type in - // its operations api. 
TODO: Add a raw_typep api like `as_record_set_raw_typep` - // for as_operations - let bytes = as_bytes_new_wrap(ptr, len as u32, false); - (*bytes).type_ = as_bytes_type_e_AS_BYTES_STRING; - as_operations_add_write(ops, name, bytes as *mut as_bin_value); -} - -pub(crate) unsafe fn init_batch_write_operations( - ops: *mut as_operations, - dozer_record: &[Field], - bin_names: &[CString], - allocated_strings: &mut Vec, -) -> Result<(), AerospikeSinkError> { - for (def, field) in bin_names.iter().zip(dozer_record) { - let name = def.as_ptr(); - // This is almost the same as the implementation for keys, - // the key difference being that we don't have to allocate a new - // string, because we can use `as_record_set_raw_typep` to set - // rust strings directly without intermediate allocations - // TODO: Unify the implementations - match field { - Field::UInt(v) => { - as_operations_add_write_int64(ops, name, *v as i64); - } - Field::U128(v) => { - set_operation_str(ops, name, v.to_string(), allocated_strings); - } - Field::Int(v) => { - as_operations_add_write_int64(ops, name, *v); - } - Field::Int8(v) => { - as_operations_add_write_int64(ops, name, (*v).into()); - } - Field::I128(v) => { - set_operation_str(ops, name, v.to_string(), allocated_strings); - } - Field::Float(v) => { - as_operations_add_write_double(ops, name, v.0); - } - Field::Boolean(v) => { - as_operations_add_write_bool(ops, name, *v); - } - Field::String(string) | Field::Text(string) => { - set_operation_str(ops, name, string.to_owned(), allocated_strings); - } - Field::Binary(v) => { - as_operations_add_write_rawp(ops, name, v.as_ptr(), v.len() as u32, false); - } - Field::Decimal(v) => { - set_operation_str(ops, name, v.to_string(), allocated_strings); - } - Field::Timestamp(v) => { - set_operation_str(ops, name, v.to_rfc3339(), allocated_strings); - } - // Date's display implementation is RFC3339 compatible - Field::Date(v) => { - set_operation_str(ops, name, v.to_string(), allocated_strings); - } - 
Field::Duration(DozerDuration(duration, _)) => { - set_operation_str( - ops, - name, - format!("PT{},{:09}S", duration.as_secs(), duration.subsec_nanos()), - allocated_strings, - ); - } - Field::Null => { - // as_bin_value is a union, with nil being an as_val. It is therefore - // valid to just cast a pointer to the as_nil constant (of type as_val), - // as its location is static - as_operations_add_write(ops, name, addr_of!(as_nil) as *mut as_bin_value); - } - Field::Point(DozerPoint(Point(Coord { x, y }))) => { - // Using our string-as-bytes trick does not work, as BYTES_GEOJSON is not - // a plain string format. Instead, we just make sure we include a nul-byte - // in our regular string, as that is easiest to integration with the other - // string allocations being `String` and not `CString`. We know we whttps://docs.oracle.com/en/database/oracle/oracle-database/19/ladbi/running-oracle-universal-installer-to-install-oracle-database.html#GUID-DD4800E9-C651-4B08-A6AC-E5ECCC6512B9on't - // have any intermediate nul-bytes, as we control the string - let string = format!( - r#"{{"type": "Point", "coordinates": [{}, {}]}}{}"#, - x.0, y.0, '\0' - ); - as_operations_add_write_geojson_strp(ops, name, string.as_ptr().cast(), false); - allocated_strings.push(string); - } - Field::Json(v) => { - as_operations_add_write(ops, name, convert_json(v)?); - } - } - } - Ok(()) -} - -#[inline(always)] -fn map(val: *mut as_val, typ: as_val_type_e, f: impl FnOnce(&T) -> Option) -> Field { - as_util_fromval(val, typ) - .map(|val| unsafe { val.as_ref() }) - .and_then(f) - .unwrap_or(Field::Null) -} - -pub(crate) fn parse_record_many( - record: &as_record, - schema: &Schema, - list_bin: &CStr, - bin_names: &BinNames, -) -> Result>, Error> { - unsafe { - let list = as_record_get_list(record, list_bin.as_ptr()); - let n_recs = as_list_size(list); - - let mut result = Vec::with_capacity(n_recs as usize); - for elem in (0..n_recs).map(|i| as_list_get_map(list, i)) { - if elem.is_null() { - 
continue; - } - - let mut values = Vec::with_capacity(schema.fields.len()); - for (field, name) in schema.fields.iter().zip(bin_names.names()) { - let mut string = MaybeUninit::uninit(); - let key = as_string_init(string.as_mut_ptr(), name.as_ptr() as *mut c_char, false); - let val = as_map_get(elem, key as *const as_val); - as_string_destroy(&mut string.assume_init() as *mut as_string); - let v = parse_val(val, field)?; - values.push(v); - } - result.push(values); - } - Ok(result) - } -} - -#[inline(always)] -unsafe fn as_string_destroy(string: *mut as_string_s) { - as_val_val_destroy(string as *mut as_val); -} - -pub(crate) fn parse_record( - record: &as_record, - schema: &Schema, - bin_names: &BinNames, -) -> Result, Error> { - let record = record as *const as_record; - let mut values = Vec::with_capacity(schema.fields.len()); - for (field, name) in schema.fields.iter().zip(bin_names.names()) { - let val = unsafe { as_record_get(record, name.as_ptr()) as *mut as_val }; - let v = parse_val(val, field)?; - values.push(v); - } - Ok(values) -} - -fn parse_val( - val: *mut as_val_s, - field: &dozer_types::types::FieldDefinition, -) -> Result { - let v = if val.is_null() { - Field::Null - } else { - match field.typ { - dozer_types::types::FieldType::UInt => { - map(val, as_val_type_e_AS_INTEGER, |v: &as_integer| { - Some(Field::UInt(v.value.to_u64()?)) - }) - } - dozer_types::types::FieldType::U128 => { - map(val, as_val_type_e_AS_STRING, |v: &as_string| { - Some(Field::U128(unsafe { - CStr::from_ptr(v.value).to_str().ok()?.parse().ok()? 
- })) - }) - } - dozer_types::types::FieldType::Int => { - map(val, as_val_type_e_AS_INTEGER, |v: &as_integer| { - Some(Field::Int(v.value)) - }) - } - dozer_types::types::FieldType::Int8 => { - map(val, as_val_type_e_AS_INTEGER, |v: &as_integer| { - Some(Field::Int8(v.value as i8)) - }) - } - dozer_types::types::FieldType::I128 => { - map(val, as_val_type_e_AS_STRING, |v: &as_string| { - Some(Field::I128(unsafe { - CStr::from_ptr(v.value).to_str().ok()?.parse().ok()? - })) - }) - } - dozer_types::types::FieldType::Float => { - map(val, as_val_type_e_AS_DOUBLE, |v: &as_double| { - Some(Field::Float(OrderedFloat(v.value))) - }) - } - dozer_types::types::FieldType::Boolean => { - map(val, as_val_type_e_AS_BOOLEAN, |v: &as_boolean| { - Some(Field::Boolean(v.value)) - }) - } - dozer_types::types::FieldType::String => { - map(val, as_val_type_e_AS_STRING, |v: &as_string| { - Some(Field::String( - unsafe { CStr::from_ptr(v.value) }.to_str().ok()?.to_owned(), - )) - }) - } - dozer_types::types::FieldType::Text => { - map(val, as_val_type_e_AS_STRING, |v: &as_string| { - Some(Field::Text( - unsafe { CStr::from_ptr(v.value) }.to_str().ok()?.to_owned(), - )) - }) - } - dozer_types::types::FieldType::Binary => { - map(val, as_val_type_e_AS_BYTES, |v: &as_bytes| { - Some(Field::Binary(unsafe { - slice::from_raw_parts(v.value, v.size as usize).to_vec() - })) - }) - } - dozer_types::types::FieldType::Decimal => { - map(val, as_val_type_e_AS_STRING, |v: &as_string| { - Some(Field::Decimal(unsafe { - CStr::from_ptr(v.value).to_str().ok()?.parse().ok()? - })) - }) - } - dozer_types::types::FieldType::Timestamp => { - map(val, as_val_type_e_AS_STRING, |v: &as_string| { - Some(Field::Timestamp(unsafe { - DateTime::parse_from_rfc3339(CStr::from_ptr(v.value).to_str().ok()?).ok()? - })) - }) - } - - dozer_types::types::FieldType::Date => { - map(val, as_val_type_e_AS_STRING, |v: &as_string| { - Some(Field::Date(unsafe { - NaiveDate::from_str(CStr::from_ptr(v.value).to_str().ok()?).ok()? 
- })) - }) - } - dozer_types::types::FieldType::Point => unimplemented!(), - dozer_types::types::FieldType::Duration => unimplemented!(), - dozer_types::types::FieldType::Json => unimplemented!(), - } - }; - if !field.nullable && v == Field::Null { - return Err(Error::NotNullNotFound); - } - Ok(v) -} - -#[inline(always)] -fn as_util_fromval(v: *mut as_val, typ: as_val_type_e) -> Option> { - unsafe { - let v = NonNull::new(v)?; - if v.as_ref().type_ != typ as u8 { - return None; - } - Some(v.cast()) - } -} - -#[inline(always)] -unsafe fn as_vector_reserve(vector: *mut as_vector) -> *mut c_void { - if (*vector).size >= (*vector).capacity { - as_vector_increase_capacity(vector); - check_alloc((*vector).list); - } - let item = (*vector) - .list - .byte_add((*vector).size as usize * (*vector).item_size as usize); - (item as *mut u8).write_bytes(0, (*vector).item_size as usize); - (*vector).size += 1; - item -} - -#[inline(always)] -pub(crate) unsafe fn as_vector_get(vector: *const as_vector, index: usize) -> *const c_void { - debug_assert!(index < (*vector).size as usize); - (*vector) - .list - .byte_add((*vector).item_size as usize * index) -} - -#[inline(always)] -pub(crate) unsafe fn as_batch_write_reserve( - records: *mut as_batch_records, -) -> *mut as_batch_write_record { - let r = as_vector_reserve(&mut (*records).list as *mut as_vector) as *mut as_batch_write_record; - (*r).type_ = AS_BATCH_WRITE as u8; - (*r).has_write = true; - r -} - -#[inline(always)] -pub(crate) unsafe fn as_batch_remove_reserve( - records: *mut as_batch_records, -) -> *mut as_batch_remove_record { - let r = - as_vector_reserve(&mut (*records).list as *mut as_vector) as *mut as_batch_remove_record; - (*r).type_ = AS_BATCH_REMOVE as u8; - (*r).has_write = true; - r -} - -#[inline(always)] -pub(crate) unsafe fn as_batch_read_reserve( - records: *mut as_batch_records, -) -> *mut as_batch_read_record { - let r = as_vector_reserve(&mut (*records).list as *mut as_vector) as *mut 
as_batch_read_record; - (*r).type_ = AS_BATCH_READ as u8; - r -} - -#[inline(always)] -pub(crate) unsafe fn as_batch_records_create(capacity: u32) -> *mut as_batch_records { - as_vector_create(std::mem::size_of::() as u32, capacity) - as *mut as_batch_records -} - -pub(crate) struct AsOperations(*mut as_operations); -impl AsOperations { - pub(crate) fn new(capacity: u16) -> Self { - unsafe { Self(check_alloc(as_operations_new(capacity))) } - } - - pub(crate) fn as_mut_ptr(&mut self) -> *mut as_operations { - self.0 - } -} - -impl Drop for AsOperations { - fn drop(&mut self) { - unsafe { as_operations_destroy(self.0) } - } -} - -macro_rules! as_util_hook { - ($hook:tt, $default:expr, $object:expr $(,$($arg:tt),*)?) => {{ - if !$object.is_null() && !(*$object).hooks.is_null() && (*(*$object).hooks).$hook.is_some() { - (*(*$object).hooks).$hook.unwrap()($object, $($($arg)*)?) - } else { - $default - } - }}; -} - -#[inline(always)] -unsafe fn as_list_size(list: *const as_list) -> u32 { - as_util_hook!(size, 0, list) -} - -#[inline(always)] -unsafe fn as_list_get(list: *const as_list, i: u32) -> *const as_val { - as_util_hook!(get, std::ptr::null(), list, i) -} - -#[inline(always)] -unsafe fn as_list_get_map(list: *const as_list, i: u32) -> *const as_map { - let val = as_list_get(list, i); - if !val.is_null() && (*val).type_ as u32 == as_val_type_e_AS_MAP { - val as *const as_map - } else { - std::ptr::null() - } -} - -#[inline(always)] -unsafe fn as_map_get(map: *const as_map, key: *const as_val) -> *mut as_val { - as_util_hook!(get, std::ptr::null_mut(), map, key) -} - -pub(crate) struct ReadBatchResults { - recs: AsBatchRecords, -} - -impl ReadBatchResults { - fn vector(&self) -> *const as_vector { - &self.recs.as_ref().list - } - - pub(crate) fn get(&self, idx: usize) -> Result, AerospikeError> { - let rec = unsafe { - assert!(idx < (*self.vector()).size as usize); - let rec = as_vector_get(self.vector(), idx) as *const as_batch_read_record; - rec.as_ref().unwrap() 
- }; - - #[allow(non_upper_case_globals)] - match rec.result { - as_status_e_AEROSPIKE_OK => Ok(Some(&rec.record)), - as_status_e_AEROSPIKE_ERR_RECORD_NOT_FOUND => Ok(None), - other => Err(AerospikeError::from_code(other)), - } - } -} - -pub(crate) struct ReadBatch<'a> { - client: &'a Client, - inner: Option, - allocated_strings: Vec, - read_ops: usize, -} - -impl<'a> ReadBatch<'a> { - fn reserve_read(&mut self) -> *mut as_batch_read_record { - unsafe { check_alloc(as_batch_read_reserve(self.inner.as_mut().unwrap().as_ptr())) } - } - - pub(crate) fn add_read_all( - &mut self, - namespace: &CStr, - set: &CStr, - key: &[Field], - ) -> Result { - let idx = self.read_ops; - let read_rec = self.reserve_read(); - unsafe { - init_key( - addr_of_mut!((*read_rec).key), - namespace, - set, - key, - &mut self.allocated_strings, - )?; - (*read_rec).read_all_bins = true; - } - self.read_ops += 1; - Ok(idx) - } - - pub(crate) fn execute(mut self) -> Result { - unsafe { self.client.batch_get(self.inner.as_mut().unwrap().as_ptr()) }?; - - Ok(ReadBatchResults { - recs: self.inner.take().unwrap(), - }) - } - - pub(crate) fn new( - client: &'a Client, - capacity: u32, - allocated_strings: Option>, - ) -> Self { - Self { - client, - inner: Some(AsBatchRecords::new(capacity)), - allocated_strings: allocated_strings.unwrap_or_default(), - read_ops: 0, - } - } -} - -struct AsBatchRecords(NonNull); - -impl AsBatchRecords { - fn new(capacity: u32) -> Self { - // Capacity needs to be at least 1, otherwise growing the vector will fail - // because it uses naive doubling of the capacity. We use rustc's heuristic - // for the minimum size of the vector (4 if the size of the element <= 1024) - // to save some re-allocations for small vectors. 
- let capacity = capacity.max(4); - unsafe { Self(NonNull::new(as_batch_records_create(capacity)).unwrap()) } - } - - fn as_ref(&self) -> &as_batch_records { - unsafe { self.0.as_ref() } - } - - fn as_ptr(&mut self) -> *mut as_batch_records { - self.0.as_ptr() - } -} - -pub(crate) struct WriteBatch<'a> { - client: &'a Client, - inner: Option, - allocated_strings: Vec, - operations: Vec, -} - -impl<'a> WriteBatch<'a> { - pub(crate) fn new( - client: &'a Client, - capacity: u32, - allocated_strings: Option>, - ) -> Self { - Self { - client, - inner: Some(AsBatchRecords::new(capacity)), - allocated_strings: allocated_strings.unwrap_or_default(), - operations: Vec::with_capacity(capacity as usize), - } - } - - fn batch_ptr(&mut self) -> *mut as_batch_records { - self.inner.as_mut().unwrap().as_ptr() - } - - pub(crate) fn reserve_write(&mut self) -> *mut as_batch_write_record { - unsafe { check_alloc(as_batch_write_reserve(self.batch_ptr())) } - } - - pub(crate) fn reserve_remove(&mut self) -> *mut as_batch_remove_record { - unsafe { check_alloc(as_batch_remove_reserve(self.batch_ptr())) } - } - - pub(crate) fn add_write( - &mut self, - namespace: &CStr, - set: &CStr, - bin_names: &[CString], - key: &[Field], - values: &[Field], - ) -> Result<(), AerospikeSinkError> { - let write_rec = self.reserve_write(); - unsafe { - init_key( - addr_of_mut!((*write_rec).key), - namespace, - set, - key, - &mut self.allocated_strings, - )?; - let mut ops = AsOperations::new(values.len().try_into().unwrap()); - init_batch_write_operations( - ops.as_mut_ptr(), - values, - bin_names, - &mut self.allocated_strings, - )?; - (*write_rec).ops = ops.as_mut_ptr(); - self.operations.push(ops); - } - Ok(()) - } - - pub(crate) fn add_write_list( - &mut self, - namespace: &CStr, - set: &CStr, - bin: &CStr, - key: &[Field], - bin_names: &[CString], - values: &[Vec], - ) -> Result<(), AerospikeSinkError> { - let write_rec = self.reserve_write(); - unsafe { - init_key( - 
addr_of_mut!((*write_rec).key), - namespace, - set, - key, - &mut self.allocated_strings, - )?; - let mut ops = AsOperations::new(1); - let list = as_arraylist_new(values.len().try_into().unwrap(), 0); - for record in values { - let map = new_record_map(record, bin_names, &mut self.allocated_strings)?; - as_arraylist_append(list, map as *mut as_val); - } - as_operations_add_write(ops.as_mut_ptr(), bin.as_ptr(), list as *mut as_bin_value); - (*write_rec).ops = ops.as_mut_ptr(); - self.operations.push(ops); - } - Ok(()) - } - - pub(crate) fn add_remove( - &mut self, - namespace: &CStr, - set: &CStr, - key: &[Field], - ) -> Result<(), AerospikeSinkError> { - let remove_rec = self.reserve_remove(); - unsafe { - init_key( - addr_of_mut!((*remove_rec).key), - namespace, - set, - key, - &mut self.allocated_strings, - )?; - } - Ok(()) - } - - pub(crate) fn execute(mut self) -> Result<(), AerospikeError> { - let config = self.client.config(); - let mut policy = config.policies.batch; - policy.base.max_retries = 2; - policy.base.sleep_between_retries = 1000; - unsafe { - self.client.write_batch( - self.inner.take().unwrap().as_ptr(), - Some((&policy) as *const as_policy_batch), - ) - } - } -} - -impl Drop for AsBatchRecords { - fn drop(&mut self) { - unsafe { as_batch_records_destroy(self.0.as_ptr()) } - } -} diff --git a/dozer-sink-aerospike/src/denorm_dag.rs b/dozer-sink-aerospike/src/denorm_dag.rs deleted file mode 100644 index a40d2ba372..0000000000 --- a/dozer-sink-aerospike/src/denorm_dag.rs +++ /dev/null @@ -1,1632 +0,0 @@ -use std::collections::HashMap; -use std::ffi::{CStr, CString, NulError}; - -use dozer_core::daggy::petgraph::Direction; -use dozer_core::daggy::{self, EdgeIndex, NodeIndex}; -use dozer_core::petgraph::visit::{ - EdgeRef, IntoEdgesDirected, IntoNeighborsDirected, IntoNodeReferences, -}; -use dozer_types::indexmap::IndexMap; - -use dozer_types::models::sink::{AerospikeSet, AerospikeSinkTable}; -use dozer_types::thiserror; -use 
dozer_types::types::{Field, Record, Schema, TableOperation}; -use itertools::{Either, Itertools}; -use smallvec::SmallVec; - -use crate::aerospike::{ - parse_record, parse_record_many, BinNames, Client, ReadBatch, ReadBatchResults, WriteBatch, -}; -use crate::AerospikeSinkError; - -const MANY_LIST_BIN: &CStr = unsafe { CStr::from_bytes_with_nul_unchecked("data\0".as_bytes()) }; - -#[derive(Debug, Clone)] -struct CachedRecord { - dirty: bool, - version: usize, - record: Option>, -} - -#[derive(Debug, Clone, Default)] -struct OneToOneBatch(IndexMap, SmallVec<[CachedRecord; 2]>>); - -#[derive(Debug, Clone)] -enum ManyOp { - Add(Vec), - Remove(Vec), -} - -#[derive(Debug, Clone)] -struct ManyRecord { - version: usize, - ops: Vec, -} - -#[derive(Debug, Clone, Default)] -struct OneToManyEntry { - base: Option>>, - ops: Vec, -} - -#[derive(Debug, Clone, Default)] -struct OneToManyBatch(IndexMap, OneToManyEntry>); - -impl OneToManyBatch { - fn insert_point( - &mut self, - key: Vec, - version: usize, - ) -> (&mut OneToManyEntry, usize, usize) { - let entry = self.0.entry(key); - let idx = entry.index(); - let entry = entry.or_default(); - let insert_point = entry - .ops - .iter() - .position(|rec| rec.version >= version) - .unwrap_or(entry.ops.len()); - (entry, idx, insert_point) - } - - fn insert_local(&mut self, key: Vec, value: Vec, version: usize) -> usize { - let (entry, idx, insert_point) = self.insert_point(key, version); - match entry.ops.get_mut(insert_point) { - Some(entry) if entry.version == version => { - entry.ops.push(ManyOp::Add(value)); - } - _ => { - entry.ops.insert( - insert_point, - ManyRecord { - version, - ops: vec![ManyOp::Add(value)], - }, - ); - } - } - idx - } - - fn remove_local(&mut self, key: Vec, old_value: &[Field], version: usize) -> usize { - let (entry, idx, insert_point) = self.insert_point(key, version); - match entry.ops.get_mut(insert_point) { - Some(entry) if entry.version == version => { - if let Some(added) = entry - .ops - .iter() - 
.position(|entry| matches!(entry, ManyOp::Add(value) if value == old_value)) - { - let _ = entry.ops.swap_remove(added); - } else { - entry.ops.push(ManyOp::Remove(old_value.to_vec())); - } - } - _ => entry.ops.insert( - insert_point, - ManyRecord { - version, - ops: vec![ManyOp::Remove(old_value.to_vec())], - }, - ), - }; - idx - } - - fn replace_local( - &mut self, - key: Vec, - old_value: Vec, - new_value: Vec, - version: usize, - ) -> usize { - let (entry, idx, insert_point) = self.insert_point(key, version); - match entry.ops.get_mut(insert_point) { - Some(entry) if entry.version == version => { - if let Some(added) = entry - .ops - .iter_mut() - .find(|entry| matches!(entry, ManyOp::Add(value) if value == &old_value)) - { - *added = ManyOp::Add(new_value); - } else { - entry.ops.push(ManyOp::Remove(old_value)); - entry.ops.push(ManyOp::Add(new_value)); - } - } - _ => entry.ops.insert( - insert_point, - ManyRecord { - version, - ops: vec![ManyOp::Remove(old_value), ManyOp::Add(new_value)], - }, - ), - }; - idx - } - - fn insert_remote(&mut self, index: usize, value: Vec>) { - let (_, record) = self.0.get_index_mut(index).unwrap(); - record.base = Some(value); - } - - fn get(&self, key: &[Field], version: usize) -> Option>> { - let entry = self.0.get(key)?; - - Self::get_inner(entry, version) - } - fn get_index(&self, index: usize, version: usize) -> Option>> { - let (_, entry) = self.0.get_index(index)?; - - Self::get_inner(entry, version) - } - - fn get_inner(entry: &OneToManyEntry, version: usize) -> Option>> { - let mut recs = entry.base.clone()?; - for version in entry.ops.iter().take_while(|ops| ops.version <= version) { - for op in &version.ops { - match op { - ManyOp::Add(rec) => recs.push(rec.clone()), - ManyOp::Remove(to_remove) => { - if let Some(to_remove) = recs.iter().position(|rec| rec == to_remove) { - recs.swap_remove(to_remove); - } - } - } - } - } - if recs.is_empty() { - None - } else { - Some(recs.into_iter()) - } - } - - fn write( - &mut 
self, - record_batch: &mut WriteBatch, - schema: &AerospikeSchema, - ) -> Result<(), AerospikeSinkError> { - for (k, v) in self.0.drain(..) { - // We should always have a base, otherwise we can't do idempotent writes - let mut record = v.base.unwrap(); - // Apply ops - for version in v.ops { - for op in version.ops { - match op { - ManyOp::Add(rec) => { - record.push(rec); - } - ManyOp::Remove(rec) => { - if let Some(pos) = record.iter().position(|r| r == &rec) { - record.swap_remove(pos); - } - } - } - } - } - record_batch.add_write_list( - &schema.namespace, - &schema.set, - MANY_LIST_BIN, - &k, - schema.bins.names(), - &record, - )?; - } - Ok(()) - } -} - -#[derive(Debug, Clone)] -enum CachedBatch { - One(OneToOneBatch), - Many(OneToManyBatch), -} -struct DirtyRecord<'a> { - idx: usize, - key: &'a [Field], - version: usize, -} - -impl CachedBatch { - fn iter_dirty(&self) -> impl Iterator { - match self { - Self::One(batch) => batch - .0 - .iter() - .enumerate() - .filter_map(|(i, (k, v))| Some((i, k, v.last()?))) - .filter(|(_, _, v)| v.dirty) - .map(|(i, k, v)| DirtyRecord { - idx: i, - key: k, - version: v.version, - }), - Self::Many(_) => unimplemented!(), - } - } - - fn remove_local(&mut self, key: Vec, old_value: &[Field], version: usize) -> usize { - match self { - Self::One(batch) => batch.insert_local(key, None, version), - Self::Many(batch) => batch.remove_local(key, old_value, version), - } - } - - fn insert_local(&mut self, key: Vec, value: Vec, version: usize) -> usize { - match self { - Self::One(batch) => batch.insert_local(key, Some(value), version), - Self::Many(batch) => batch.insert_local(key, value, version), - } - } - - fn replace_local( - &mut self, - key: Vec, - old_value: Vec, - new_value: Vec, - version: usize, - ) -> usize { - match self { - Self::One(batch) => batch.insert_impl(key, Some(new_value), version, true, true), - Self::Many(batch) => batch.replace_local(key, old_value, new_value, version), - } - } - - fn clear(&mut self) { - 
match self { - Self::One(batch) => batch.clear(), - Self::Many(batch) => batch.0.clear(), - } - } - - fn len(&self) -> usize { - match self { - Self::One(batch) => batch.len(), - Self::Many(batch) => batch.0.len(), - } - } - - fn write( - &mut self, - record_batch: &mut WriteBatch, - schema: &AerospikeSchema, - ) -> Result<(), AerospikeSinkError> { - match self { - Self::One(batch) => batch.write(record_batch, schema), - Self::Many(batch) => batch.write(record_batch, schema), - } - } - - fn get<'a>( - &'a self, - key: &[Field], - version: usize, - ) -> Option> + 'a> { - match self { - Self::One(batch) => { - let record = batch.get(key, version)?.record.clone()?; - Some(Either::Left(std::iter::once(record))) - } - Self::Many(batch) => Some(Either::Right(batch.get(key, version)?)), - } - } - - fn get_index( - &self, - index: usize, - version: usize, - ) -> Option> + '_> { - match self { - Self::One(batch) => { - let record = batch.get_index(index, version)?.record.clone()?; - Some(Either::Left(std::iter::once(record))) - } - Self::Many(batch) => Some(Either::Right(batch.get_index(index, version)?)), - } - } - - fn should_update_at(&mut self, key: Vec, version: usize) -> (bool, usize) { - match self { - Self::One(batch) => { - let (index, exists) = batch.index_or_default(key, version); - (!exists, index) - } - // For a many batch, we always need the base from the remote - Self::Many(batch) => { - let entry = batch.0.entry(key); - let idx = entry.index(); - (entry.or_default().base.is_none(), idx) - } - } - } -} - -impl OneToOneBatch { - fn insert_local( - &mut self, - key: Vec, - value: Option>, - version: usize, - ) -> usize { - self.insert_impl(key, value, version, true, true) - } - - fn insert_impl( - &mut self, - key: Vec, - value: Option>, - version: usize, - replace: bool, - dirty: bool, - ) -> usize { - let entry = self.0.entry(key); - let idx = entry.index(); - let versions = entry.or_default(); - let record = CachedRecord { - dirty, - version, - record: 
value, - }; - // This is basically partition_by, but that does a binary search, while - // a linear search should in general be a better bet here - let insert_point = versions - .iter() - .position(|cur| cur.version >= version) - .unwrap_or(versions.len()); - // If the version already exists, replace it - if versions - .get(insert_point) - .is_some_and(|rec| rec.version == version) - { - if replace { - versions[insert_point] = record; - } - } else { - versions.insert(insert_point, record); - } - idx - } - - fn insert_remote(&mut self, index: usize, value: Option>) { - let (_, versions) = self.0.get_index_mut(index).unwrap(); - versions.insert( - 0, - CachedRecord { - dirty: false, - version: 0, - record: value, - }, - ); - } - - fn get<'a>(&'a self, key: &[Field], version: usize) -> Option<&'a CachedRecord> { - let versions = self.0.get(key)?; - // Find the last version thats <= version - versions.iter().take_while(|v| v.version <= version).last() - } - - fn get_index(&self, index: usize, version: usize) -> Option<&CachedRecord> { - let (_, versions) = self.0.get_index(index)?; - // Find the last version thats <= version - versions.iter().take_while(|v| v.version <= version).last() - } - - /// Returns the index at which the entry for the given key exists, - /// or was created and whether it existed - fn index_or_default(&mut self, key: Vec, version: usize) -> (usize, bool) { - let entry = self.0.entry(key); - let idx = entry.index(); - let versions = entry.or_default(); - (idx, versions.first().is_some_and(|v| v.version <= version)) - } - - fn clear(&mut self) { - self.0.clear() - } - - fn len(&self) -> usize { - self.0.len() - } - - fn write( - &mut self, - batch: &mut WriteBatch, - schema: &AerospikeSchema, - ) -> Result<(), AerospikeSinkError> { - for (key, dirty_record) in self.0.drain(..).filter_map(|(key, mut rec)| { - let last_version = rec.pop()?; - last_version.dirty.then_some((key, last_version.record)) - }) { - if let Some(dirty_record) = dirty_record { 
- batch.add_write( - &schema.namespace, - &schema.set, - schema.bins.names(), - &key, - &dirty_record, - )?; - } else { - batch.add_remove(&schema.namespace, &schema.set, &key)?; - } - } - Ok(()) - } -} - -#[derive(Debug, Clone)] -struct AerospikeSchema { - namespace: CString, - set: CString, - bins: BinNames, -} - -#[derive(Debug, Clone)] -struct Node { - schema: Schema, - batch: CachedBatch, - as_schema: AerospikeSchema, - denormalize_to: Option<(CString, CString, Vec)>, -} - -#[derive(Debug, Clone, PartialEq, Hash, Eq)] -struct LookupSource { - index: usize, - version: usize, -} - -#[derive(Debug, Clone)] -struct Edge { - bins: BinNames, - key_fields: Vec, - field_indices: Vec, -} - -#[derive(thiserror::Error, Debug)] -pub(crate) enum Error { - #[error("Duplicate sink set definition: {namespace}.{set}")] - DuplicateSinkTable { namespace: String, set: String }, - #[error("Set referenced in denormalization not found: {namespace}.{set}")] - SetNotFound { namespace: String, set: String }, - #[error("Adding denormalizing lookup on set {namespace}.{set} from set {from_namespace}.{from_set} would create a cycle")] - Cycle { - namespace: String, - set: String, - from_namespace: String, - from_set: String, - }, - #[error("Field not found")] - FieldNotFound(String), - #[error("Invalid name")] - InvalidName(#[from] NulError), - #[error("Non-nullible lookup value not found")] - NotNullNotFound, - #[error("The primary key for lookup set \"{lookup_namespace}\".\"{lookup_set}\" does not match the denormalization key specified by the denormalizing set \"{denorm_namespace}\".\"{denorm_set}\"")] - MismatchedKeys { - lookup_namespace: String, - lookup_set: String, - denorm_namespace: String, - denorm_set: String, - }, -} - -#[derive(Debug)] -pub(crate) struct DenormalizationState { - dag: DenormDag, - current_transaction: Option, - base_tables: Vec<(NodeIndex, Vec, Vec)>, - transaction_counter: usize, -} - -#[derive(Debug, PartialEq)] -pub(crate) struct DenormalizedTable { - 
pub(crate) bin_names: Vec, - pub(crate) namespace: CString, - pub(crate) set: CString, - pub(crate) records: Vec>, - pub(crate) pk: Vec, -} -type DenormDag = daggy::Dag; - -fn bin_names_recursive(dag: &DenormDag, nid: NodeIndex, bins: &mut Vec) { - for edge in dag.edges_directed(nid, Direction::Outgoing) { - bins.extend_from_slice(edge.weight().bins.names()); - bin_names_recursive(dag, edge.target(), bins); - } -} - -impl DenormalizationState { - fn node(&self, index: NodeIndex) -> &Node { - self.dag.node_weight(index).unwrap() - } - - fn edge(&self, index: EdgeIndex) -> &Edge { - self.dag.edge_weight(index).unwrap() - } -} - -impl DenormalizationState { - pub(crate) fn new(tables: &[(AerospikeSinkTable, Schema)]) -> Result { - assert!(!tables.is_empty()); - let dag = Self::build_dag(tables)?; - let base_tables: Vec<_> = dag - .node_references() - // Filter out non-base-tables - .filter_map(|(i, node)| node.denormalize_to.as_ref().map(|(_, _, pk)| (i, pk))) - // Find all added bin names using a depth-first search - .map(|(id, pk)| -> Result<_, Error> { - let mut bin_names = dag.node_weight(id).unwrap().as_schema.bins.names().to_vec(); - bin_names_recursive(&dag, id, &mut bin_names); - let mut primary_key = Vec::new(); - for key in pk { - let idx = bin_names - .iter() - .position(|bin| bin.to_str().is_ok_and(|bin| bin == key)) - .ok_or_else(|| Error::FieldNotFound(key.clone()))?; - primary_key.push(idx); - } - Ok((id, bin_names, primary_key)) - }) - .try_collect()?; - Ok(Self { - dag, - current_transaction: None, - base_tables, - transaction_counter: 0, - }) - } - - fn build_dag(tables: &[(AerospikeSinkTable, Schema)]) -> Result { - let mut dag: daggy::Dag = daggy::Dag::new(); - let mut node_by_name = HashMap::new(); - for (table, schema) in tables.iter() { - let bin_names = BinNames::new(schema.fields.iter().map(|field| field.name.as_str()))?; - let denormalize_to = table - .write_denormalized_to - .as_ref() - .map(|to| -> Result<_, Error> { - let AerospikeSet { - 
namespace, - set, - primary_key, - } = to; - Ok(( - CString::new(namespace.as_str())?, - CString::new(set.as_str())?, - primary_key.clone(), - )) - }) - .transpose()?; - let idx = dag.add_node(Node { - as_schema: AerospikeSchema { - namespace: CString::new(table.namespace.as_str())?, - set: CString::new(table.set_name.as_str())?, - bins: bin_names, - }, - schema: schema.clone(), - batch: if table.aggregate_by_pk { - CachedBatch::Many(OneToManyBatch::default()) - } else { - CachedBatch::One(OneToOneBatch::default()) - }, - denormalize_to, - }); - - if node_by_name - .insert((table.namespace.clone(), table.set_name.clone()), idx) - .is_some() - { - return Err(Error::DuplicateSinkTable { - namespace: table.namespace.clone(), - set: table.set_name.clone(), - }); - } - } - for (table, schema) in tables { - let to_idx = node_by_name[&(table.namespace.clone(), table.set_name.clone())]; - - for denorm in &table.denormalize { - let from_idx = node_by_name - .get(&(denorm.from_namespace.clone(), denorm.from_set.clone())) - .copied() - .ok_or_else(|| Error::SetNotFound { - namespace: denorm.from_namespace.clone(), - set: denorm.from_set.clone(), - })?; - - let from_schema = &dag.node_weight(from_idx).unwrap().schema; - let key_idx = match &denorm.key { - dozer_types::models::sink::DenormKey::Simple(name) => { - vec![ - schema - .get_field_index(name) - .map_err(|_| Error::FieldNotFound(name.to_owned()))? 
- .0, - ] - } - dozer_types::models::sink::DenormKey::Composite(names) => names - .iter() - .map(|name| { - schema - .get_field_index(name) - .map(|(i, _)| i) - .map_err(|_| Error::FieldNotFound(name.to_owned())) - }) - .collect::, _>>()?, - }; - let mismatch_err = || Error::MismatchedKeys { - lookup_namespace: denorm.from_namespace.clone(), - lookup_set: denorm.from_set.clone(), - denorm_namespace: table.namespace.clone(), - denorm_set: table.set_name.clone(), - }; - if key_idx.len() != from_schema.primary_index.len() { - return Err(mismatch_err()); - } - for (denorm_idx, lookup_idx) in key_idx.iter().zip(&from_schema.primary_index) { - let denorm_field = &schema.fields[*denorm_idx]; - let lookup_field = &from_schema.fields[*lookup_idx]; - if denorm_field.typ != lookup_field.typ { - return Err(mismatch_err()); - } - } - - let bin_names = BinNames::new(denorm.columns.iter().map(|col| { - let (_, dst) = col.to_src_dst(); - dst - }))?; - - let bin_indices: Vec<_> = denorm - .columns - .iter() - .map(|col| -> Result<_, Error> { - let (src, _) = col.to_src_dst(); - let (id, _) = from_schema - .get_field_index(src) - .map_err(|_| Error::FieldNotFound(src.to_owned()))?; - Ok(id) - }) - .try_collect()?; - - dag.add_edge( - to_idx, - from_idx, - Edge { - key_fields: key_idx, - bins: bin_names, - field_indices: bin_indices, - }, - ) - .map_err(|_| Error::Cycle { - namespace: table.namespace.clone(), - set: table.set_name.clone(), - from_namespace: denorm.from_namespace.clone(), - from_set: denorm.from_set.clone(), - })?; - } - } - - Ok(dag) - } -} - -#[derive(Clone)] -struct BatchLookup { - node: NodeIndex, - nodebatch_idx: usize, - version: usize, - readbatch_idx: Option, - follow: bool, -} - -impl DenormalizationState { - fn do_insert(&mut self, node_id: NodeIndex, new: Record) { - let node = self.dag.node_weight_mut(node_id).unwrap(); - let idx = new.get_key_fields(&node.schema); - - node.batch - .insert_local(idx, new.values, self.transaction_counter); - } - - 
pub(crate) fn process(&mut self, op: TableOperation) -> Result<(), AerospikeSinkError> { - self.current_transaction = op.id.map(|id| id.txid); - let node_id: NodeIndex = (op.port as u32).into(); - match op.op { - dozer_types::types::Operation::Delete { old } => { - let node = self.dag.node_weight_mut(node_id).unwrap(); - let schema = &node.schema; - let idx = old.get_key_fields(schema); - node.batch - .remove_local(idx, &old.values, self.transaction_counter); - } - dozer_types::types::Operation::Insert { new } => { - self.do_insert(node_id, new); - } - dozer_types::types::Operation::Update { old, new } => { - let node = self.dag.node_weight_mut(node_id).unwrap(); - let schema = &node.schema; - let old_pk = old.get_key_fields(schema); - let new_pk = new.get_key_fields(schema); - if old_pk != new_pk { - return Err(AerospikeSinkError::PrimaryKeyChanged { - old: old_pk.clone(), - new: new_pk.clone(), - }); - } - node.batch - .replace_local(new_pk, old.values, new.values, self.transaction_counter); - } - dozer_types::types::Operation::BatchInsert { new } => { - for value in new { - self.do_insert(node_id, value); - } - } - } - Ok(()) - } - - pub(crate) fn clear(&mut self) { - for node in self.dag.node_weights_mut() { - node.batch.clear(); - } - } - pub(crate) fn persist(&mut self, client: &Client) -> Result<(), AerospikeSinkError> { - let mut read_batch = ReadBatch::new(client, 0, None); - let mut lookups = Vec::new(); - self.add_manynode_base_lookups(&mut read_batch, &mut lookups)?; - let read_results = read_batch.execute()?; - for lookup in lookups { - self.update_from_lookup( - lookup.readbatch_idx.unwrap(), - lookup.node, - &read_results, - lookup.nodebatch_idx, - )?; - } - let batch_size_upper_bound: usize = self - .dag - .node_references() - .map(|(_, node)| node.batch.len()) - .sum(); - - let batch_size: u32 = batch_size_upper_bound.try_into().unwrap(); - let mut write_batch = WriteBatch::new(client, batch_size, None); - - for node in self.dag.node_weights_mut() 
{ - // Only write if the last version is dirty (the newest version was changed by this - // batch) - node.batch.write(&mut write_batch, &node.as_schema)?; - } - - write_batch.execute()?; - self.transaction_counter = 0; - Ok(()) - } - - pub(crate) fn perform_denorm( - &mut self, - client: &Client, - ) -> Result, AerospikeSinkError> { - let mut lookups = Vec::new(); - for (nid, _, _) in &self.base_tables { - let node = self.node(*nid); - let node_keys = node.batch.iter_dirty().map( - |DirtyRecord { - idx, - key: _, - version, - }| BatchLookup { - version, - node: *nid, - nodebatch_idx: idx, - readbatch_idx: None, - follow: true, - }, - ); - lookups.extend(node_keys); - } - - let mut n_lookups = 0; - let mut batch = ReadBatch::new(client, 0, None); - while !lookups.is_empty() { - let batch_results = batch.execute()?; - let mut new_lookups = Vec::with_capacity(lookups.len()); - let mut new_batch = ReadBatch::new(client, lookups.len().try_into().unwrap(), None); - - // For persisting, we need all many-node baselines, so put them in the - // first batch - if n_lookups == 0 { - self.add_manynode_base_lookups(&mut new_batch, &mut new_lookups)?; - } - for BatchLookup { - node: nid, - nodebatch_idx, - version, - readbatch_idx, - follow, - } in lookups - { - // Update the node's local batch - if let Some(readbatch_idx) = readbatch_idx { - self.update_from_lookup(readbatch_idx, nid, &batch_results, nodebatch_idx)?; - } - if !follow { - continue; - } - let Some(values) = self.node(nid).batch.get_index(nodebatch_idx, version) else { - continue; - }; - let values = values.collect_vec(); - let mut edges = self - .dag - .neighbors_directed(nid, Direction::Outgoing) - .detach(); - while let Some((edge, target)) = edges.next(self.dag.graph()) { - for value in &values { - let key = self - .edge(edge) - .key_fields - .iter() - .copied() - .map(|i| value[i].clone()) - .collect_vec(); - let (should_update, batch_idx) = self - .dag - .node_weight_mut(target) - .unwrap() - .batch - 
.should_update_at(key.clone(), version); - let target_schema = &self.node(target).as_schema; - let batch_read_index = if should_update { - Some(new_batch.add_read_all( - &target_schema.namespace, - &target_schema.set, - &key, - )?) - } else { - None - }; - - new_lookups.push(BatchLookup { - node: target, - nodebatch_idx: batch_idx, - version, - readbatch_idx: batch_read_index, - follow: true, - }) - } - } - } - lookups = new_lookups; - batch = new_batch; - n_lookups += 1; - } - - let mut res = Vec::new(); - // Recursively collect results - for (nid, bin_names, pk) in &self.base_tables { - let mut results = Vec::new(); - let node = self.node(*nid); - for DirtyRecord { - idx: _, - key, - version, - } in node.batch.iter_dirty() - { - let field_indices = (0..node.schema.fields.len()).collect_vec(); - results.append(&mut self.recurse_lookup(&field_indices, *nid, key, version)) - } - let (namespace, set, _) = node.denormalize_to.clone().unwrap(); - res.push(DenormalizedTable { - bin_names: bin_names.clone(), - namespace, - set, - records: results, - pk: pk.clone(), - }) - } - Ok(res) - } - - fn add_manynode_base_lookups( - &mut self, - read_batch: &mut ReadBatch<'_>, - lookups: &mut Vec, - ) -> Result<(), AerospikeSinkError> { - for (i, node) in self.dag.node_references() { - if let CachedBatch::Many(node_batch) = &node.batch { - for (batch_idx, key) in node_batch - .0 - .iter() - .enumerate() - .filter_map(|(i, (key, entry))| entry.base.is_none().then_some((i, key))) - { - let batch_read_index = read_batch.add_read_all( - &node.as_schema.namespace, - &node.as_schema.set, - key, - )?; - lookups.push(BatchLookup { - node: i, - nodebatch_idx: batch_idx, - version: 0, - readbatch_idx: Some(batch_read_index), - follow: false, - }); - } - } - } - Ok(()) - } - - fn update_from_lookup( - &mut self, - readbatch_idx: usize, - nid: NodeIndex, - batch_results: &ReadBatchResults, - nodebatch_idx: usize, - ) -> Result<(), AerospikeSinkError> { - let node = 
self.dag.node_weight_mut(nid).unwrap(); - let rec = batch_results.get(readbatch_idx)?; - match &mut node.batch { - CachedBatch::One(batch) => batch.insert_remote( - nodebatch_idx, - rec.map(|rec| -> Result<_, Error> { - parse_record(rec, &node.schema, &node.as_schema.bins) - }) - .transpose()?, - ), - CachedBatch::Many(batch) => batch.insert_remote( - nodebatch_idx, - rec.map(|rec| -> Result<_, Error> { - parse_record_many(rec, &node.schema, MANY_LIST_BIN, &node.as_schema.bins) - }) - .transpose()? - .unwrap_or_default(), - ), - } - Ok(()) - } - - fn recurse_lookup( - &self, - field_indices: &[usize], - node_id: NodeIndex, - key: &[Field], - version: usize, - ) -> Vec> { - let node = self.node(node_id); - let records = { - match node.batch.get(key, version) { - Some(t) => Either::Right(t), - None => Either::Left(std::iter::once(vec![Field::Null; node.schema.fields.len()])), - } - }; - - let mut result = Vec::new(); - for record in records { - let mut results_per_edge = Vec::new(); - for edge in self.dag.edges_directed(node_id, Direction::Outgoing) { - let key = edge - .weight() - .key_fields - .iter() - .map(|i| record[*i].clone()) - .collect_vec(); - let edge_results = - self.recurse_lookup(&edge.weight().field_indices, edge.target(), &key, version); - results_per_edge.push(edge_results); - } - - let mut record_result = vec![field_indices - .iter() - .map(|i| record[*i].clone()) - .collect_vec()]; - for edge_result in results_per_edge { - record_result = record_result - .into_iter() - .cartesian_product(edge_result) - .map(|(mut old, mut new)| { - old.append(&mut new); - old - }) - .collect_vec(); - } - result.append(&mut record_result); - } - result - } - - pub(crate) fn commit(&mut self) { - self.transaction_counter += 1; - } -} - -#[cfg(test)] -mod tests { - use std::ffi::CString; - - use dozer_types::{ - models::sink::AerospikeSinkTable, - types::{ - Field, FieldDefinition, FieldType, Operation, Record, Schema, SourceDefinition, - TableOperation, - }, - }; - - 
use crate::{aerospike::Client, denorm_dag::DenormalizedTable}; - - use super::DenormalizationState; - - macro_rules! schema_row { - ($schema:expr, $f:literal: $t:ident PRIMARY_KEY) => { - $schema.field( - FieldDefinition::new($f.into(), FieldType::$t, true, SourceDefinition::Dynamic), - true, - ); - }; - - ($schema:expr, $f:literal: $t:ident) => { - $schema.field( - FieldDefinition::new($f.into(), FieldType::$t, true, SourceDefinition::Dynamic), - false, - ); - }; - } - macro_rules! schema { - ($($f:literal: $t:ident $($pk:ident)?),+$(,)?) => {{ - let mut schema = Schema::new(); - $(schema_row!(schema, $f: $t $($pk)?));+; - schema - }}; - } - - trait Table { - fn schema() -> Schema; - fn to_row(&self) -> Vec; - fn to_record(&self) -> Record { - Record::new(self.to_row()) - } - } - struct Customer { - id: &'static str, - phone_number: &'static str, - } - - impl Table for Customer { - fn schema() -> Schema { - schema! { - "id": String PRIMARY_KEY, - "phone_number": String - } - } - fn to_row(&self) -> Vec { - vec![ - Field::String(self.id.to_owned()), - Field::String(self.phone_number.to_owned()), - ] - } - } - - struct AccountOwner { - account_id: u64, - customer_id: &'static str, - transaction_limit: Option, - } - - impl Table for AccountOwner { - fn schema() -> Schema { - schema! { - "account_id": UInt PRIMARY_KEY, - "customer_id": String, - "transaction_limit": UInt - } - } - - fn to_row(&self) -> Vec { - vec![ - Field::UInt(self.account_id), - Field::String(self.customer_id.to_owned()), - self.transaction_limit.map_or(Field::Null, Field::UInt), - ] - } - } - - struct Transaction { - id: u64, - account_id: u64, - amount: &'static str, - } - - impl Table for Transaction { - fn schema() -> Schema { - schema! 
{ - "id": UInt PRIMARY_KEY, - "account_id": UInt, - "amount": Decimal - } - } - - fn to_row(&self) -> Vec { - vec![ - Field::UInt(self.id), - Field::UInt(self.account_id), - Field::Decimal(self.amount.try_into().unwrap()), - ] - } - } - - #[derive(Debug)] - struct DenormResult { - id: u64, - account_id: u64, - amount: &'static str, - customer_id: Option<&'static str>, - transaction_limit: Option, - phone_number: Option<&'static str>, - } - - impl Table for DenormResult { - fn schema() -> Schema { - schema! { - "id": UInt PRIMARY_KEY, - "account_id": UInt, - "amount": Decimal, - "customer_id": String PRIMARY_KEY, - "transaction_limit": UInt, - "phone_number": String - } - } - - fn to_row(&self) -> Vec { - vec![ - Field::UInt(self.id), - Field::UInt(self.account_id), - Field::Decimal(self.amount.try_into().unwrap()), - self.customer_id - .map_or(Field::Null, |s| Field::String(s.to_owned())), - self.transaction_limit.map_or(Field::Null, Field::UInt), - self.phone_number - .map_or(Field::Null, |s| Field::String(s.to_owned())), - ] - } - } - - impl PartialEq> for DenormalizedTable { - fn eq(&self, other: &Vec) -> bool { - other.eq(self) - } - } - - impl PartialEq for Vec { - fn eq(&self, other: &DenormalizedTable) -> bool { - let DenormalizedTable { - bin_names, - namespace: _, - set: _, - records, - pk, - } = other; - bin_names - .iter() - .map(|name| name.to_str().unwrap()) - .eq(DenormResult::schema() - .fields - .iter() - .map(|field| field.name.as_str())) - && records - .iter() - .cloned() - .eq(self.iter().map(|rec| rec.to_row())) - && pk == &DenormResult::schema().primary_index - } - } - - fn client() -> Client { - let client = Client::new(&CString::new("localhost:3000").unwrap()).unwrap(); - let mut response = std::ptr::null_mut(); - let request = "truncate-namespace:namespace=test"; - let request = CString::new(request).unwrap(); - unsafe { - client.info(&request, &mut response).unwrap(); - } - client - } - - fn lookup_table(name: &str) -> (AerospikeSinkTable, 
Schema) { - let mut schema = Schema::new(); - schema - .field( - FieldDefinition { - name: "id".into(), - typ: FieldType::UInt, - nullable: false, - source: SourceDefinition::Dynamic, - description: None, - }, - true, - ) - .field( - FieldDefinition::new( - format!("{name}_value"), - FieldType::UInt, - false, - SourceDefinition::Dynamic, - ), - false, - ); - ( - dozer_types::serde_yaml::from_str(&format!( - r#" - source_table_name: - namespace: test - set_name: {name} - primary_key: - - id - "#, - )) - .unwrap(), - schema, - ) - } - - #[test] - #[ignore] - fn test_denorm_order() { - let tables = vec![ - ( - dozer_types::serde_yaml::from_str( - r#" - source_table_name: - namespace: test - set_name: base - primary_key: - - id - denormalize: - - from_namespace: test - from_set: lookup_0 - key: lookup_0_id - columns: [lookup_0_value] - - from_namespace: test - from_set: lookup_1 - key: lookup_1_id - columns: [lookup_1_value] - write_denormalized_to: - primary_key: [id] - namespace: test - set: denorm - "#, - ) - .unwrap(), - schema! 
{ - "id": UInt PRIMARY_KEY, - "base_value": UInt, - "lookup_0_id": UInt, - "lookup_1_id": UInt, - }, - ), - lookup_table("lookup_0"), - lookup_table("lookup_1"), - ]; - - let mut state = DenormalizationState::new(&tables).unwrap(); - state - .process(TableOperation { - id: None, - op: Operation::Insert { - new: Record::new(vec![ - Field::UInt(1), - Field::UInt(1), - Field::UInt(100), - Field::UInt(200), - ]), - }, - port: 0, - }) - .unwrap(); - state - .process(TableOperation { - id: None, - op: Operation::Insert { - new: Record::new(vec![Field::UInt(100), Field::UInt(1000)]), - }, - port: 1, - }) - .unwrap(); - state - .process(TableOperation { - id: None, - op: Operation::Insert { - new: Record::new(vec![Field::UInt(200), Field::UInt(2000)]), - }, - port: 2, - }) - .unwrap(); - - let client = &client(); - assert_eq!( - state.perform_denorm(client).unwrap(), - vec![DenormalizedTable { - bin_names: vec![ - CString::new("id").unwrap(), - CString::new("base_value").unwrap(), - CString::new("lookup_0_id").unwrap(), - CString::new("lookup_1_id").unwrap(), - CString::new("lookup_1_value").unwrap(), - CString::new("lookup_0_value").unwrap(), - ], - namespace: CString::new("test").unwrap(), - set: CString::new("denorm").unwrap(), - records: vec![vec![ - Field::UInt(1), - Field::UInt(1), - Field::UInt(100), - Field::UInt(200), - Field::UInt(2000), - Field::UInt(1000), - ]], - pk: vec![0], - }] - ); - } - - #[test] - #[ignore] - fn test_denorm_missing() { - let mut state = state(); - let client = client(); - - state - .process(TableOperation { - id: None, - op: Operation::Insert { - new: Transaction { - id: 0, - account_id: 100, - amount: "10.01", - } - .to_record(), - }, - port: 2, - }) - .unwrap(); - - assert_eq!( - state.perform_denorm(&client).unwrap(), - vec![vec![DenormResult { - id: 0, - account_id: 100, - amount: "10.01", - customer_id: None, - transaction_limit: None, - phone_number: None, - }]] - ) - } - - #[test] - #[ignore] - fn test_denorm() { - let mut state = 
state(); - let client = client(); - // Customers - state - .process(TableOperation { - id: None, - op: Operation::Insert { - new: Customer { - id: "1001", - phone_number: "+1234567", - } - .to_record(), - }, - port: 0, - }) - .unwrap(); - // Accounts - state - .process(TableOperation { - id: None, - op: Operation::Insert { - new: AccountOwner { - account_id: 101, - customer_id: "1001", - transaction_limit: None, - } - .to_record(), - }, - port: 1, - }) - .unwrap(); - state.persist(&client).unwrap(); - assert_eq!(state.perform_denorm(&client).unwrap(), vec![vec![]]); - // Transactions - state - .process(TableOperation { - id: None, - op: Operation::Insert { - new: Transaction { - id: 1, - account_id: 101, - amount: "1.23", - } - .to_record(), - }, - port: 2, - }) - .unwrap(); - let res = state.perform_denorm(&client).unwrap(); - assert_eq!( - res, - vec![vec![DenormResult { - id: 1, - account_id: 101, - amount: "1.23", - customer_id: Some("1001"), - transaction_limit: None, - phone_number: Some("+1234567"), - }]] - ); - state.commit(); - state.persist(&client).unwrap(); - state - .process(TableOperation { - id: None, - op: Operation::Insert { - new: Transaction { - id: 2, - account_id: 101, - amount: "3.21", - } - .to_record(), - }, - port: 2, - }) - .unwrap(); - state.commit(); - state - .process(TableOperation { - id: None, - op: Operation::Update { - old: Customer { - id: "1001", - phone_number: "+1234567", - } - .to_record(), - new: Customer { - id: "1001", - phone_number: "+7654321", - } - .to_record(), - }, - port: 0, - }) - .unwrap(); - state - .process(TableOperation { - id: None, - op: Operation::Insert { - new: Transaction { - id: 3, - account_id: 101, - amount: "1.23", - } - .to_record(), - }, - port: 2, - }) - .unwrap(); - state - .process(TableOperation { - id: None, - op: Operation::Insert { - new: Customer { - id: "1001", - phone_number: "+2 123", - } - .to_record(), - }, - port: 0, - }) - .unwrap(); - state.commit(); - let res = 
state.perform_denorm(&client).unwrap(); - assert_eq!( - res, - vec![vec![ - DenormResult { - id: 2, - account_id: 101, - amount: "3.21", - customer_id: Some("1001"), - transaction_limit: None, - phone_number: Some("+1234567") - }, - DenormResult { - id: 3, - account_id: 101, - amount: "1.23", - customer_id: Some("1001"), - transaction_limit: None, - phone_number: Some("+7654321"), - }, - DenormResult { - id: 3, - account_id: 101, - amount: "1.23", - customer_id: Some("1001"), - transaction_limit: None, - phone_number: Some("+2 123"), - }, - ],] - ); - state.persist(&client).unwrap(); - } - - fn state() -> DenormalizationState { - let tables = vec![ - ( - dozer_types::serde_yaml::from_str( - r#" - source_table_name: - namespace: test - set_name: customers - primary_key: - - id - aggregate_by_pk: true - "#, - ) - .unwrap(), - Customer::schema(), - ), - ( - dozer_types::serde_yaml::from_str( - r#" - source_table_name: - namespace: test - set_name: accounts - primary_key: - - account_id - denormalize: - - from_namespace: test - from_set: customers - key: customer_id - columns: - - phone_number - "#, - ) - .unwrap(), - AccountOwner::schema(), - ), - ( - dozer_types::serde_yaml::from_str( - r#" - source_table_name: - namespace: test - set_name: transactions - primary_key: - - id - denormalize: - - from_namespace: test - from_set: accounts - key: account_id - columns: - - customer_id - - transaction_limit - write_denormalized_to: - namespace: test - set: transactions_denorm - primary_key: - - id - - customer_id - "#, - ) - .unwrap(), - Transaction::schema(), - ), - ]; - - DenormalizationState::new(&tables).unwrap() - } -} diff --git a/dozer-sink-aerospike/src/lib.rs b/dozer-sink-aerospike/src/lib.rs deleted file mode 100644 index 1a3c20675f..0000000000 --- a/dozer-sink-aerospike/src/lib.rs +++ /dev/null @@ -1,730 +0,0 @@ -pub use crate::aerospike::Client; - -use aerospike_client_sys::*; -use denorm_dag::DenormalizationState; -use dozer_core::event::EventHub; -use 
dozer_types::log::error; -use dozer_types::models::connection::AerospikeConnection; -use dozer_types::node::OpIdentifier; -use dozer_types::thiserror; -use itertools::Itertools; - -use std::collections::HashMap; -use std::ffi::{CStr, CString, NulError}; -use std::mem::MaybeUninit; -use std::ptr::NonNull; -use std::sync::Arc; - -use crate::aerospike::{AerospikeError, WriteBatch}; - -mod aerospike; -mod denorm_dag; - -use dozer_core::node::{PortHandle, Sink, SinkFactory}; - -use dozer_types::errors::internal::BoxedError; -use dozer_types::tonic::async_trait; -use dozer_types::{ - errors::types::TypeError, - log::warn, - models::sink::AerospikeSinkConfig, - types::{Field, FieldType, Schema, TableOperation}, -}; - -mod constants { - use std::ffi::CStr; - - // TODO: Replace with cstring literals when they're stablized, - // currently planned for Rust 1.77 - const fn cstr(value: &'static [u8]) -> &'static CStr { - // Check that the supplied value is valid (ends with nul byte) - assert!(CStr::from_bytes_with_nul(value).is_ok()); - // Do the conversion again - unsafe { CStr::from_bytes_with_nul_unchecked(value) } - } - - pub(super) const META_KEY: &CStr = cstr(b"metadata\0"); - pub(super) const META_BASE_TXN_ID_BIN: &CStr = cstr(b"txn_id\0"); - pub(super) const META_LOOKUP_TXN_ID_BIN: &CStr = cstr(b"txn_id\0"); -} - -#[derive(thiserror::Error, Debug)] -enum AerospikeSinkError { - #[error("Aerospike client error: {0}")] - Aerospike(#[from] AerospikeError), - #[error("No primary key found. 
Aerospike requires records to have a primary key")] - NoPrimaryKey, - #[error("Unsupported type for primary key: {0}")] - UnsupportedPrimaryKeyType(FieldType), - #[error("Type error: {0}")] - TypeError(#[from] TypeError), - #[error("String with internal NUL byte")] - NulError(#[from] NulError), - #[error("Could not create record")] - CreateRecordError, - #[error("Column name \"{}\" exceeds aerospike's maximum bin name length ({})", .0, AS_BIN_NAME_MAX_LEN)] - BinNameTooLong(String), - #[error("Integer out of range. The supplied usigned integer was larger than the maximum representable value for an aerospike integer")] - IntegerOutOfRange(u64), - #[error("Changing the value of a primary key is not supported for Aerospike sink. Old: {old:?}, new: {new:?}")] - PrimaryKeyChanged { old: Vec, new: Vec }, - #[error("Denormalization error: {0}")] - DenormError(#[from] denorm_dag::Error), - #[error("Inconsistent txid. Denormalized: {denorm:?}, lookup {lookup:?}")] - InconsistentTxids { - denorm: Option, - lookup: Option, - }, -} - -#[derive(Debug)] -pub struct AerospikeSinkFactory { - connection_config: AerospikeConnection, - config: AerospikeSinkConfig, -} - -impl AerospikeSinkFactory { - pub fn new(connection_config: AerospikeConnection, config: AerospikeSinkConfig) -> Self { - Self { - connection_config, - config, - } - } -} - -#[async_trait] -impl SinkFactory for AerospikeSinkFactory { - fn get_input_ports(&self) -> Vec { - (0..self.config.tables.len() as PortHandle).collect() - } - - fn get_input_port_name(&self, port: &PortHandle) -> String { - self.config.tables[*port as usize].source_table_name.clone() - } - - fn prepare(&self, input_schemas: HashMap) -> Result<(), BoxedError> { - debug_assert!(input_schemas.len() == self.config.tables.len()); - Ok(()) - } - - async fn build( - &self, - mut input_schemas: HashMap, - _event_hub: EventHub, - ) -> Result, BoxedError> { - let hosts = CString::new(self.connection_config.hosts.as_str())?; - let client = 
Client::new(&hosts).map_err(AerospikeSinkError::from)?; - - let tables: Vec<_> = self - .config - .tables - .iter() - .cloned() - .enumerate() - .map(|(i, table)| -> Result<_, TypeError> { - let mut schema = input_schemas.remove(&(i as PortHandle)).unwrap(); - if !table.primary_key.is_empty() { - let fields = table - .primary_key - .iter() - .map(|key| schema.get_field_index(key)) - .map_ok(|(i, _)| i) - .try_collect()?; - schema.primary_index = fields; - } - Ok((table, schema)) - }) - .try_collect()?; - // Validate schemas - for (_, schema) in tables.iter() { - if schema.primary_index.is_empty() { - return Err(AerospikeSinkError::NoPrimaryKey.into()); - }; - for idx in schema.primary_index.iter() { - match schema.fields[*idx].typ { - // These are definitely OK as the primary key - dozer_types::types::FieldType::UInt - | dozer_types::types::FieldType::U128 - | dozer_types::types::FieldType::Int - | dozer_types::types::FieldType::Int8 - | dozer_types::types::FieldType::I128 - | dozer_types::types::FieldType::String - | dozer_types::types::FieldType::Text - | dozer_types::types::FieldType::Duration - | dozer_types::types::FieldType::Binary => {} - - // These are OK because we convert them to strings, so warn about - // them to make sure the user is aware - typ @ (dozer_types::types::FieldType::Decimal | - dozer_types::types::FieldType::Timestamp | - dozer_types::types::FieldType::Date) => warn!("Using a {typ} column as a primary key for Aerospike sink. This is only allowed because this type is converted to a String. 
Cast to a type supported by aerospike to silence this warning."), - - // These are not OK as keys, so error out - typ @ (dozer_types::types::FieldType::Float| - dozer_types::types::FieldType::Boolean | - dozer_types::types::FieldType::Json | - dozer_types::types::FieldType::Point ) => { - return Err(Box::new(AerospikeSinkError::UnsupportedPrimaryKeyType(typ))); - } - } - for field in &schema.fields { - if field.name.len() > AS_BIN_NAME_MAX_LEN as usize { - return Err( - AerospikeSinkError::BinNameTooLong(field.name.to_owned()).into() - ); - } - } - } - } - let denorm_state = DenormalizationState::new(&tables)?; - - let metadata_namespace = CString::new(self.config.metadata_namespace.clone())?; - let metadata_set = CString::new( - self.config - .metadata_set - .to_owned() - .unwrap_or("__replication_metadata".to_owned()), - )?; - Ok(Box::new(AerospikeSink::new( - self.config.clone(), - client, - denorm_state, - metadata_namespace, - metadata_set, - )?)) - } - - fn type_name(&self) -> String { - "aerospike".to_string() - } -} - -// A wrapper type responsible for cleaning up a key. This doesn't own an as_key -// instance, as that would involve moving it, while an initialized as_key might -// be self-referential -struct Key<'a>(&'a mut as_key); - -impl Key<'_> { - fn as_ptr(&self) -> *const as_key { - (&*self.0) as *const as_key - } -} - -impl Drop for Key<'_> { - fn drop(&mut self) { - let ptr = self.0 as *mut as_key; - unsafe { as_key_destroy(ptr) } - } -} - -// A wrapper type responsible for cleaning up a record. 
This doesn't own an as_record -// instance, as that would involve moving it, while an initialized as_record might -// be self-referential -struct AsRecord<'a>(&'a mut as_record); - -impl AsRecord<'_> { - fn as_ptr(&self) -> *const as_record { - &*self.0 as *const as_record - } -} - -impl Drop for AsRecord<'_> { - fn drop(&mut self) { - let ptr = self.0 as *mut as_record; - unsafe { as_record_destroy(ptr) } - } -} - -#[derive(Debug)] -struct AerospikeSink { - config: AerospikeSinkConfig, - replication_worker: AerospikeSinkWorker, - metadata_namespace: CString, - metadata_set: CString, - client: Arc, -} - -type TxnId = u64; - -#[derive(Debug)] -struct AerospikeMetadata { - client: Arc, - key: NonNull, - record: NonNull, - last_denorm_transaction: Option, - last_lookup_transaction: Option, -} - -// NonNull doesn't impl Send -unsafe impl Send for AerospikeMetadata {} - -impl AerospikeMetadata { - fn new(client: Arc, namespace: CString, set: CString) -> Result { - unsafe { - let key = NonNull::new(as_key_new( - namespace.as_ptr(), - set.as_ptr(), - constants::META_KEY.as_ptr(), - )) - .unwrap(); - let mut record = std::ptr::null_mut(); - #[allow(non_upper_case_globals)] - let (base, lookup) = match client.get(key.as_ptr(), &mut record) { - Ok(()) => { - let lookup = - as_record_get_integer(record, constants::META_LOOKUP_TXN_ID_BIN.as_ptr()); - let base = - as_record_get_integer(record, constants::META_BASE_TXN_ID_BIN.as_ptr()); - let base = if base.is_null() { - None - } else { - Some((*base).value.try_into().unwrap()) - }; - let lookup = if lookup.is_null() { - None - } else { - Some((*lookup).value.try_into().unwrap()) - }; - (base, lookup) - } - Err(AerospikeError { - code: as_status_e_AEROSPIKE_ERR_RECORD_NOT_FOUND, - message: _, - }) => (None, None), - Err(e) => return Err(e), - }; - // Not found, so allocate a new record - if record.is_null() { - record = as_record_new(2); - } - Ok(Self { - client, - key, - record: NonNull::new(record).unwrap(), - 
last_denorm_transaction: base, - last_lookup_transaction: lookup, - }) - } - } - - fn write(&mut self, txid: TxnId, bin: &CStr) -> Result<(), AerospikeSinkError> { - unsafe { - as_record_set_int64(self.record.as_ptr(), bin.as_ptr(), txid as i64); - self.client - .upsert(self.key.as_ptr(), self.record.as_ptr(), None)?; - } - Ok(()) - } - - fn write_denorm(&mut self, txid: TxnId) -> Result<(), AerospikeSinkError> { - self.last_denorm_transaction = Some(txid); - self.write(txid, constants::META_BASE_TXN_ID_BIN)?; - Ok(()) - } - - fn write_lookup(&mut self, txid: TxnId) -> Result<(), AerospikeSinkError> { - self.last_lookup_transaction = Some(txid); - self.write(txid, constants::META_LOOKUP_TXN_ID_BIN)?; - Ok(()) - } -} - -impl Drop for AerospikeMetadata { - fn drop(&mut self) { - unsafe { - as_record_destroy(self.record.as_ptr()); - as_key_destroy(self.key.as_ptr()); - } - } -} - -impl AerospikeSink { - fn new( - config: AerospikeSinkConfig, - client: Client, - state: DenormalizationState, - metadata_namespace: CString, - metadata_set: CString, - ) -> Result { - let client = Arc::new(client); - - let metadata_writer = AerospikeMetadata::new( - client.clone(), - metadata_namespace.clone(), - metadata_set.clone(), - )?; - - let worker_instance = AerospikeSinkWorker { - client: client.clone(), - state, - metadata_writer, - last_committed_transaction: None, - }; - - Ok(Self { - config, - replication_worker: worker_instance, - metadata_namespace, - metadata_set, - client, - }) - } -} - -#[derive(Debug)] -struct AerospikeSinkWorker { - client: Arc, - state: DenormalizationState, - last_committed_transaction: Option, - metadata_writer: AerospikeMetadata, -} - -impl AerospikeSinkWorker { - fn process(&mut self, op: TableOperation) -> Result<(), AerospikeSinkError> { - self.state.process(op)?; - Ok(()) - } - - fn commit(&mut self, txid: Option) -> Result<(), AerospikeSinkError> { - match ( - txid, - self.metadata_writer.last_denorm_transaction, - 
self.metadata_writer.last_lookup_transaction, - ) { - (Some(current), Some(last_denorm), Some(last_lookup)) => { - if current <= last_lookup { - // We're not caught up so just clear state - self.state.clear(); - return Ok(()); - } - if current <= last_denorm { - // Catching up between lookup and denorm. Only need to write lookup. - self.state.persist(&self.client)?; - self.metadata_writer.write_lookup(current)?; - return Ok(()); - } - // Else, we're in the normal state and we do the full denorm - - }, - (None, Some(_), None) => { - // We are re-snapshotting, because we went down between writing - // the base table and writing the lookup tables during the first - // transaction after initial snapshotting. Only write the lookup - // tables - self.state.persist(&self.client)?; - return Ok(()); - } - // First transaction. No need to do anything special - (Some(_) | None, None, None) => {} - // Base should always be ahead of lookup - (_, denorm @ None, lookup @ Some(_)) | - // If lookup is None, we should be snapshotting and thus have no txid - (Some(_), denorm @ Some(_), lookup @ None)| - // If we previously had txid's we should always continue to have txid's - ( None, denorm @ Some(_), lookup @ Some(_)) => { - return Err(AerospikeSinkError::InconsistentTxids { denorm, lookup }) - } - } - self.state.commit(); - self.last_committed_transaction = txid; - Ok(()) - } - - fn flush_batch(&mut self) -> Result<(), AerospikeSinkError> { - let txid = self.last_committed_transaction.take(); - let denormalized_tables = self.state.perform_denorm(&self.client)?; - let batch_size_est: usize = denormalized_tables - .iter() - .map(|table| table.records.len()) - .sum(); - // Write denormed tables - let mut batch = WriteBatch::new(&self.client, batch_size_est as u32, None); - for table in denormalized_tables { - for record in table.records { - let key = table.pk.iter().map(|i| record[*i].clone()).collect_vec(); - batch.add_write( - &table.namespace, - &table.set, - &table.bin_names, - 
&key, - &record, - )?; - } - } - - batch.execute()?; - - // Write denormed txid - if let Some(txid) = txid { - self.metadata_writer.write_denorm(txid)?; - } - - self.state.persist(&self.client)?; - - if let Some(txid) = txid { - self.metadata_writer.write_lookup(txid)?; - } - Ok(()) - } -} - -impl Sink for AerospikeSink { - fn supports_batching(&self) -> bool { - true - } - - fn flush_batch(&mut self) -> Result<(), BoxedError> { - self.replication_worker.flush_batch()?; - Ok(()) - } - - fn commit(&mut self, epoch_details: &dozer_core::epoch::Epoch) -> Result<(), BoxedError> { - debug_assert_eq!(epoch_details.common_info.source_states.len(), 1); - let txid = epoch_details - .common_info - .source_states - .iter() - .next() - .and_then(|(_, state)| state.op_id()) - .map(|op_id| op_id.txid); - - self.replication_worker.commit(txid)?; - Ok(()) - } - - fn process(&mut self, op: TableOperation) -> Result<(), BoxedError> { - self.replication_worker.process(op)?; - Ok(()) - } - - fn on_source_snapshotting_started( - &mut self, - _connection_name: String, - ) -> Result<(), BoxedError> { - Ok(()) - } - - fn on_source_snapshotting_done( - &mut self, - _connection_name: String, - _id: Option, - ) -> Result<(), BoxedError> { - Ok(()) - } - - fn set_source_state(&mut self, _source_state: &[u8]) -> Result<(), BoxedError> { - Ok(()) - } - - fn get_source_state(&mut self) -> Result>, BoxedError> { - Ok(None) - } - - fn get_latest_op_id(&mut self) -> Result, BoxedError> { - let mut _k = MaybeUninit::uninit(); - let mut _r = std::ptr::null_mut(); - unsafe { - as_key_init_strp( - _k.as_mut_ptr(), - self.metadata_namespace.as_ptr(), - self.metadata_set.as_ptr(), - constants::META_KEY.as_ptr(), - false, - ); - let key = Key(_k.assume_init_mut()); - #[allow(non_upper_case_globals)] - match self.client.get(key.as_ptr(), &mut _r) { - Ok(_) => {} - Err(AerospikeError { - code: as_status_e_AEROSPIKE_ERR_RECORD_NOT_FOUND, - message: _, - }) => return Ok(None), - Err(e) => return 
Err(e.into()), - } - let record = AsRecord(_r.as_mut().unwrap()); - let txid = as_record_get_int64( - record.as_ptr(), - constants::META_LOOKUP_TXN_ID_BIN.as_ptr(), - -1, - ); - if txid > 0 { - Ok(Some(OpIdentifier { - txid: txid as u64, - seq_in_tx: 0, - })) - } else { - Ok(None) - } - } - } - - fn max_batch_duration_ms(&self) -> Option { - self.config.max_batch_duration_ms - } - - fn preferred_batch_size(&self) -> Option { - self.config.preferred_batch_size - } -} - -#[cfg(test)] -mod tests { - - use dozer_core::{tokio, DEFAULT_PORT_HANDLE}; - use std::time::Duration; - - use dozer_types::{ - chrono::{DateTime, NaiveDate}, - geo::Point, - models::sink::AerospikeSinkTable, - ordered_float::OrderedFloat, - rust_decimal::Decimal, - types::{DozerDuration, DozerPoint, FieldDefinition, Operation, Record}, - }; - - use super::*; - - fn f(name: &str, typ: FieldType) -> FieldDefinition { - FieldDefinition { - name: name.to_owned(), - typ, - nullable: false, - source: dozer_types::types::SourceDefinition::Dynamic, - description: None, - } - } - - const N_RECORDS: usize = 1000; - const BATCH_SIZE: usize = 1000; - - #[tokio::test] - #[ignore] - async fn test_inserts() { - let mut sink = sink("inserts").await; - for i in 0..N_RECORDS { - sink.process(TableOperation::without_id( - Operation::Insert { - new: record(i as u64), - }, - DEFAULT_PORT_HANDLE, - )) - .unwrap(); - } - } - - #[tokio::test] - #[ignore] - async fn test_inserts_batch() { - let mut batches = Vec::with_capacity(N_RECORDS / BATCH_SIZE); - for i in 0..N_RECORDS / BATCH_SIZE { - let mut batch = Vec::with_capacity(BATCH_SIZE); - for j in (i * BATCH_SIZE)..((i + 1) * BATCH_SIZE) { - batch.push(record(j as u64)); - } - batches.push(batch); - } - let mut sink = sink("inserts_batch").await; - for batch in batches { - sink.process(TableOperation::without_id( - Operation::BatchInsert { new: batch }, - DEFAULT_PORT_HANDLE, - )) - .unwrap() - } - } - - async fn sink(set: &str) -> Box { - let mut schema = Schema::new(); 
- schema - .field(f("uint", FieldType::UInt), true) - .field(f("int", FieldType::Int), false) - .field(f("float", FieldType::Float), false) - .field(f("boolean", FieldType::Boolean), false) - .field(f("string", FieldType::String), false) - .field(f("text", FieldType::Text), false) - .field(f("binary", FieldType::Binary), false) - .field(f("u128", FieldType::U128), false) - .field(f("i128", FieldType::I128), false) - .field(f("decimal", FieldType::Decimal), false) - .field(f("timestamp", FieldType::Timestamp), false) - .field(f("date", FieldType::Date), false) - .field(f("point", FieldType::Point), false) - .field(f("duration", FieldType::Duration), false) - .field( - FieldDefinition { - name: "nil".into(), - typ: FieldType::UInt, - nullable: true, - source: dozer_types::types::SourceDefinition::Dynamic, - description: None, - }, - false, - ) - .field(f("json", FieldType::Json), false); - let connection_config = AerospikeConnection { - hosts: "localhost:3000".into(), - namespace: "test".into(), - sets: vec![set.to_owned()], - batching: false, - ..Default::default() - }; - let factory = AerospikeSinkFactory::new( - connection_config, - AerospikeSinkConfig { - connection: "".to_owned(), - n_threads: Some(1.try_into().unwrap()), - tables: vec![AerospikeSinkTable { - source_table_name: "test".into(), - namespace: "test".into(), - set_name: set.to_owned(), - denormalize: vec![], - write_denormalized_to: None, - primary_key: vec![], - aggregate_by_pk: false, - }], - max_batch_duration_ms: None, - preferred_batch_size: None, - metadata_namespace: "test".into(), - metadata_set: None, - }, - ); - factory - .build([(DEFAULT_PORT_HANDLE, schema)].into(), EventHub::new(1)) - .await - .unwrap() - } - - fn record(i: u64) -> Record { - Record::new(vec![ - Field::UInt(i), - Field::Int(i as _), - Field::Float(OrderedFloat(i as _)), - Field::Boolean(i % 2 == 0), - Field::String(i.to_string()), - Field::Text(i.to_string()), - Field::Binary(vec![(i % 256) as u8; 1]), - Field::U128(i as 
_), - Field::I128(i as _), - Field::Decimal(Decimal::new(i as _, 1)), - Field::Timestamp(DateTime::from_timestamp(i as _, i as _).unwrap().into()), - Field::Date(NaiveDate::from_num_days_from_ce_opt(i as _).unwrap()), - Field::Point(DozerPoint(Point::new( - OrderedFloat((i % 90) as f64), - OrderedFloat((i % 90) as f64), - ))), - Field::Duration(DozerDuration( - Duration::from_secs(i), - dozer_types::types::TimeUnit::Seconds, - )), - Field::Null, - Field::Json(dozer_types::json_types::json!({ - i.to_string(): i, - i.to_string(): i as f64, - "array": vec![i; 5], - "object": { - "haha": i - } - })), - ]) - } -} diff --git a/dozer-sink-oracle/Cargo.toml b/dozer-sink-oracle/Cargo.toml deleted file mode 100644 index af9533b9c5..0000000000 --- a/dozer-sink-oracle/Cargo.toml +++ /dev/null @@ -1,12 +0,0 @@ -[package] -name = "dozer-sink-oracle" -version = "0.1.0" -edition = "2021" -license = "AGPL-3.0-or-later" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -dozer-core = { path = "../dozer-core" } -dozer-types = { path = "../dozer-types" } -oracle = { version = "0.5.7", features = ["chrono"] } diff --git a/dozer-sink-oracle/src/lib.rs b/dozer-sink-oracle/src/lib.rs deleted file mode 100644 index 56e288a765..0000000000 --- a/dozer-sink-oracle/src/lib.rs +++ /dev/null @@ -1,989 +0,0 @@ -use dozer_types::{ - log::warn, - models::sink::OracleSinkConfig, - thiserror, - types::{FieldDefinition, Operation, SourceDefinition, TableOperation}, -}; -use std::collections::HashMap; - -use dozer_core::{ - event::EventHub, - node::{PortHandle, Sink, SinkFactory}, - DEFAULT_PORT_HANDLE, -}; -use dozer_types::{ - chrono::{self, DateTime, NaiveDate, Utc}, - errors::internal::BoxedError, - log::{debug, info}, - models::ingestion_types::OracleConfig, - node::OpIdentifier, - thiserror::Error, - tonic::async_trait, - types::{Field, FieldType, Record, Schema}, -}; -use oracle::{ - sql_type::{OracleType, ToSql}, - Connection, 
-}; - -const TXN_ID_COL: &str = "__txn_id"; -const TXN_SEQ_COL: &str = "__txn_seq"; -const OPKIND_COL: &str = "DOZER_OPKIND"; -const METADATA_TABLE: &str = "__replication_metadata"; -const META_TXN_ID_COL: &str = "txn_id"; -const META_TABLE_COL: &str = "table"; - -fn format_null(nullable: bool) -> &'static str { - if nullable { - "NULL" - } else { - "NOT NULL" - } -} - -#[derive(Error, Debug)] -enum SchemaValidationError { - #[error("Missing column: {0}")] - MissingColumn(String), - #[error("Extra column found: {0:?}")] - ExtraColumns(Vec), - #[error("Incompatible type for field {field}. Internal type: {dozer_type}, sink type: {remote_type}")] - IncompatibleType { - field: String, - dozer_type: FieldType, - remote_type: OracleType, - }, - #[error("Unsupported type in sink table: {0}")] - UnsupportedType(String), - #[error("Incompatibly mismatched nullability. Source: {}, sink: {}", format_null(*.src), format_null(*.sink))] - MismatchedNullability { src: bool, sink: bool }, -} - -#[derive(Error, Debug)] -enum Error { - #[error("Updating a primary key is not supported. Old: {old:?}, new: {new:?}")] - UpdatedPrimaryKey { old: Vec, new: Vec }, - #[error("Destination table {table} has incompatible schema. 
{inner}")] - IncompatibleSchema { - table: Table, - inner: SchemaValidationError, - }, - #[error("Oracle database error: {0}")] - Oracle(oracle::Error), -} - -impl From for Error { - fn from(value: oracle::Error) -> Self { - Error::Oracle(value) - } -} - -#[derive(Debug)] -struct BatchedOperation { - op_id: Option, - op_kind: OpKind, - params: Record, -} - -#[derive(Debug)] -struct OracleSink { - conn: Connection, - insert_append: String, - pk: Vec, - field_types: Vec, - merge_statement: String, - batch_params: Vec, - batch_size: usize, - insert_metadata: String, - update_metadata: String, - select_metadata: String, - latest_txid: Option, - insert_statement: String, - delete_statement: String, -} - -#[derive(Debug)] -pub struct OracleSinkFactory { - connection_config: OracleConfig, - table: Table, -} - -impl std::fmt::Display for Table { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "\"{}\".\"{}\"", &self.owner, &self.name) - } -} - -impl OracleSinkFactory { - pub fn new(connection_config: OracleConfig, config: OracleSinkConfig) -> Self { - let owner = config - .owner - .unwrap_or_else(|| connection_config.user.clone()); - Self { - connection_config, - table: Table { - owner, - name: config.table_name, - unique_key: config.unique_key, - }, - } - } -} - -fn parse_oracle_type( - name: &str, - length: u32, - precision: Option, - scale: Option, -) -> Option { - let typ = match name { - "VARCHAR2" => OracleType::Varchar2(length), - "NVARCHAR2" => OracleType::NVarchar2(length), - "CHAR" => OracleType::Char(length), - "NCHAR" => OracleType::NChar(length), - "ROWID" => OracleType::Rowid, - "RAW" => OracleType::Raw(length), - "BINARY_FLOAT" => OracleType::BinaryFloat, - "BINARY_DOUBLE" => OracleType::BinaryDouble, - "NUMBER" => OracleType::Number(precision.unwrap_or(38), scale.unwrap_or(0)), - "FLOAT" => OracleType::Float(precision.unwrap_or(126)), - "DATE" => OracleType::Date, - "JSON" => OracleType::Json, - _ if 
name.starts_with("TIMESTAMP") => { - let fracp_start = name.find('(').unwrap(); - let fracp_end = name.find(')').unwrap(); - let fracp = name[fracp_start + 1..fracp_end].parse().unwrap(); - - match &name[fracp_end + 1..] { - " WITH LOCAL TIME ZONE" => OracleType::TimestampLTZ(fracp), - " WITH TIME ZONE" => OracleType::TimestampTZ(fracp), - "" => OracleType::Timestamp(fracp), - _ => return None, - } - } - _ if name.starts_with("INTERVAL") => { - if name[9..].starts_with("DAY") { - let dayp_start = name.find('(').unwrap(); - let dayp_end = name.find(')').unwrap(); - let dayp = name[dayp_start + 1..dayp_end].parse().unwrap(); - - let secondp_start = name[dayp_end + 1..].find('(').unwrap(); - let secondp_end = name[dayp_end + 1..].find(')').unwrap(); - let secondp = name[dayp_end + 1..][secondp_start + 1..secondp_end] - .parse() - .unwrap(); - OracleType::IntervalDS(dayp, secondp) - } else if name[9..].starts_with("YEAR") { - let yearp_start = name.find('(').unwrap(); - let yearp_end = name.find(')').unwrap(); - let yearp = name[yearp_start + 1..yearp_end].parse().unwrap(); - OracleType::IntervalYM(yearp) - } else { - return None; - } - } - _ => { - return None; - } - }; - Some(typ) -} - -impl OracleSinkFactory { - fn validate_table( - &self, - connection: &Connection, - table: &Table, - schema: &Schema, - ) -> Result { - let err = |e| Error::IncompatibleSchema { - table: table.clone(), - inner: e, - }; - - let results = connection.query_as::<(String, String, u32, Option, Option, String)>( - "SELECT COLUMN_NAME, DATA_TYPE, DATA_LENGTH, DATA_PRECISION, DATA_SCALE, NULLABLE FROM ALL_TAB_COLS WHERE table_name = :1 AND owner = :2", - &[&table.name, &table.owner], - )?; - - let mut cols = HashMap::new(); - for col in results { - let col = col?; - cols.insert(col.0.clone(), col); - } - - // The table does not exist - if cols.is_empty() { - return Ok(false); - } - - for field in &schema.fields { - let definition = cols - .remove(&field.name) - .ok_or_else(|| 
err(SchemaValidationError::MissingColumn(field.name.clone())))?; - let (_, type_name, length, precision, scale, nullable) = definition; - let Some(typ) = parse_oracle_type(&type_name, length, precision, scale) else { - return Err(err(SchemaValidationError::UnsupportedType( - type_name.clone(), - ))); - }; - match (field.typ, typ) { - ( - FieldType::String | FieldType::Text, - OracleType::Varchar2(_) | OracleType::NVarchar2(_), - ) => {} - (FieldType::U128 | FieldType::I128, OracleType::Number(precision, 0)) - if precision >= 39 => {} - (FieldType::UInt | FieldType::Int, OracleType::Number(precision, 0)) - if precision >= 20 => {} - (FieldType::Float, OracleType::Number(38, 0) | OracleType::BinaryDouble) => {} - (FieldType::Boolean, OracleType::Number(_, 0)) => {} - (FieldType::Binary, OracleType::Raw(_)) => {} - (FieldType::Timestamp, OracleType::Timestamp(_) | OracleType::TimestampTZ(_)) => {} - (FieldType::Date, OracleType::Date) => {} - (FieldType::Decimal, OracleType::Number(_, _)) => {} - (dozer_type, remote_type) => { - return Err(err(SchemaValidationError::IncompatibleType { - field: field.name.clone(), - dozer_type, - remote_type, - })) - } - } - if (field.nullable, nullable.as_str()) == (true, "N") { - return Err(err(SchemaValidationError::MismatchedNullability { - src: field.nullable, - sink: false, - })); - } - } - - if !cols.is_empty() { - return Err(err(SchemaValidationError::ExtraColumns( - cols.keys().cloned().collect(), - ))); - } - Ok(true) - } - - fn validate_or_create_table( - &self, - connection: &Connection, - table: &Table, - temp_table: Option<&Table>, - schema: &Schema, - ) -> Result<(), Error> { - let mut column_defs = Vec::with_capacity(schema.fields.len()); - for field in &schema.fields { - let name = &field.name; - let col_type = match field.typ { - FieldType::UInt => "NUMBER(20)", - FieldType::U128 => unimplemented!(), - FieldType::Int => "NUMBER(20)", - FieldType::Int8 => unimplemented!(), - FieldType::I128 => unimplemented!(), - // 
Should this be BINARY_DOUBLE? - FieldType::Float => "NUMBER", - FieldType::Boolean => "NUMBER", - FieldType::String => "VARCHAR2(2000)", - FieldType::Text => "VARCHAR2(2000)", - FieldType::Binary => "RAW(1000)", - FieldType::Decimal => "NUMBER(29, 10)", - FieldType::Timestamp => "TIMESTAMP(9) WITH TIME ZONE", - FieldType::Date => "TIMESTAMP(0)", - FieldType::Json => unimplemented!(), - FieldType::Point => unimplemented!("Oracle Point"), - FieldType::Duration => unimplemented!(), - }; - column_defs.push(format!( - "\"{name}\" {col_type}{}", - if field.nullable { "" } else { " NOT NULL" } - )); - } - - if !(self.validate_table(connection, table, schema)?) { - let table_query = format!("CREATE TABLE {table} ({})", column_defs.join(",\n")); - info!("### CREATE TABLE ####\n{}", table_query); - connection.execute(&table_query, &[])?; - } - - if let Some(temp_table) = temp_table { - let temp_table_query = format!("CREATE PRIVATE TEMPORARY TABLE {temp_table} ({},\n {OPKIND_COL} NUMBER(1)) ON COMMIT PRESERVE DEFINITION", column_defs.join(",\n")).replace("NOT NULL", ""); - info!("### CREATE TEMPORARY TABLE ####\n{}", temp_table_query); - connection.execute(&temp_table_query, &[])?; - } - - Ok(()) - } - - fn create_index( - &self, - connection: &Connection, - table: &Table, - schema: &Schema, - ) -> Result<(), Error> { - let mut columns = schema - .primary_index - .iter() - .map(|ix| schema.fields[*ix].name.clone()) - .collect::>(); - - let index_name = format!( - "{}_{}_{}_TXN_ID_TXN_SEQ_INDEX", - table - .owner - .to_ascii_uppercase() - .strip_prefix("C##") - .unwrap_or(&table.owner), - table.name.to_ascii_uppercase(), - columns - .iter() - .map(|col| col - .chars() - .filter_map(|c| c.is_ascii_alphabetic().then_some(c.to_ascii_uppercase())) - .collect::()) - .collect::>() - .join("_") - ); - - columns.push(TXN_ID_COL.to_owned()); - columns.push(TXN_SEQ_COL.to_owned()); - - let query = "SELECT index_name FROM all_indexes WHERE table_name = :1 AND owner = :2"; - info!("Index 
check query {query}"); - - let mut index_exist = connection.query(query, &[&table.name, &table.owner])?; - if index_exist.next().is_some() { - info!("Index {index_name} already exist"); - } else { - let query = format!( - "CREATE INDEX {index_name} ON {table} ({})", - columns - .into_iter() - .map(|col| format!("\"{col}\"")) - .collect::>() - .join(", ") - ); - dbg!(&query); - info!("### CREATE INDEX #### \n: {index_name}. Query: {query}"); - connection.execute(&query, &[])?; - } - - Ok(()) - } -} - -fn generate_merge_statement(table: &Table, temp_table: &Table, schema: &Schema) -> String { - let field_names = schema - .fields - .iter() - .map(|field| field.name.as_str()) - .chain([TXN_ID_COL, TXN_SEQ_COL]); - - let destination_columns = field_names - .clone() - .map(|name| format!("D.\"{name}\"")) - .collect::>() - .join(", "); - - let source_values = field_names - .clone() - .map(|name| format!("S.\"{name}\"")) - .collect::>() - .join(", "); - - let destination_assign = field_names - .clone() - .enumerate() - .filter(|(i, _)| !schema.primary_index.contains(i)) - .map(|(_, name)| format!("D.\"{name}\" = S.\"{name}\"")) - .collect::>() - .join(", "); - - let unique_fields = if !table.unique_key.is_empty() { - table.unique_key.clone() - } else { - schema - .primary_index - .iter() - .map(|ix| &schema.fields[*ix].name) - .map(|name| format!("D.\"{name}\" = S.\"{name}\"")) - .collect::>() - }; - let pk_select = if !unique_fields.is_empty() { - unique_fields.join(" AND ") - } else { - warn!("No unique key defined for oracle sink table {table}. Table will be append-only"); - "1 = 0".to_owned() - }; - - let opid_select = format!( - r#"(D."{TXN_ID_COL}" IS NULL - OR S."{TXN_ID_COL}" > D."{TXN_ID_COL}" - OR (S."{TXN_ID_COL}" = D."{TXN_ID_COL}" AND S."{TXN_SEQ_COL}" > D."{TXN_SEQ_COL}"))"# - ); - - // Match on PK and txn_id. 
- // If the record does not exist and the op is INSERT, do the INSERT - // If the record exists, but the txid is higher than the operation's txid, - // do nothing (if the op is INSERT, - format!( - r#"MERGE INTO {table} D - USING {temp_table} S - ON ({pk_select}) - WHEN NOT MATCHED THEN INSERT ({destination_columns}) VALUES ({source_values}) WHERE S.DOZER_OPKIND = 0 - WHEN MATCHED THEN UPDATE SET {destination_assign} WHERE S.DOZER_OPKIND = 1 AND {opid_select} - DELETE WHERE S.DOZER_OPKIND = 2 AND {opid_select} - "# - ) -} - -fn generate_insert_statement(table: &Table, schema: &Schema) -> String { - let field_names = schema - .fields - .iter() - .map(|field| field.name.as_str()) - .chain([TXN_ID_COL, TXN_SEQ_COL, OPKIND_COL]); - - let mut parameter_index = 1usize..; - let input_fields = field_names - .clone() - .zip(&mut parameter_index) - .map(|(name, i)| format!(":{i} \"{name}\"")) - .collect::>() - .join(", "); - - // Match on PK and txn_id. - // If the record does not exist and the op is INSERT, do the INSERT - // If the record exists, but the txid is higher than the operation's txid, - // do nothing (if the op is INSERT, - format!( - r#"INSERT INTO {table} - SELECT * - FROM - (SELECT {input_fields} FROM DUAL) - "# - ) -} -fn generate_delete_statement(table: &Table) -> String { - format!(r#"DELETE FROM {table}"#) -} - -#[derive(Debug, Clone)] -struct Table { - owner: String, - name: String, - unique_key: Vec, -} - -#[async_trait] -impl SinkFactory for OracleSinkFactory { - fn type_name(&self) -> String { - "oracle".to_string() - } - - fn get_input_ports(&self) -> Vec { - vec![DEFAULT_PORT_HANDLE] - } - - fn get_input_port_name(&self, _port: &PortHandle) -> String { - self.table.name.clone() - } - - fn prepare(&self, _input_schemas: HashMap) -> Result<(), BoxedError> { - Ok(()) - } - - async fn build( - &self, - mut input_schemas: HashMap, - _event_hub: EventHub, - ) -> Result, BoxedError> { - let config = &self.connection_config; - let root_connect_string = 
format!( - "{}:{}/{}", - config.host, - config.port, - config.pdb.as_ref().unwrap_or(&config.sid) - ); - let connection = Connection::connect(&config.user, &config.password, root_connect_string)?; - - let schema = input_schemas.remove(&DEFAULT_PORT_HANDLE).unwrap(); - - let mut amended_schema = schema.clone(); - amended_schema.field( - dozer_types::types::FieldDefinition { - name: TXN_ID_COL.to_owned(), - typ: FieldType::UInt, - nullable: true, - source: dozer_types::types::SourceDefinition::Dynamic, - description: None, - }, - false, - ); - amended_schema.field( - dozer_types::types::FieldDefinition { - name: TXN_SEQ_COL.to_owned(), - typ: FieldType::UInt, - nullable: true, - source: dozer_types::types::SourceDefinition::Dynamic, - description: None, - }, - false, - ); - - let temp_table = Table { - owner: self.table.owner.clone(), - name: format!("ORA$PTT_{}", &self.table.name), - unique_key: vec![], - }; - - self.validate_or_create_table( - &connection, - &self.table, - Some(&temp_table), - &amended_schema, - )?; - self.create_index(&connection, &self.table, &amended_schema)?; - let meta_table = Table { - owner: self.table.owner.clone(), - name: METADATA_TABLE.to_owned(), - unique_key: vec![], - }; - self.validate_or_create_table( - &connection, - &meta_table, - None, - Schema::new() - .field( - FieldDefinition { - name: META_TABLE_COL.to_owned(), - typ: FieldType::String, - nullable: false, - source: SourceDefinition::Dynamic, - description: None, - }, - true, - ) - .field( - FieldDefinition { - name: META_TXN_ID_COL.to_owned(), - typ: FieldType::UInt, - nullable: false, - source: SourceDefinition::Dynamic, - description: None, - }, - false, - ), - )?; - - let insert_append = format!( - //"INSERT /*+ APPEND */ INTO \"{table_name}\" VALUES ({})", - "INSERT INTO {} VALUES ({})", - &self.table, - (1..=amended_schema.fields.len()) - .map(|i| format!(":{i}")) - .collect::>() - .join(", ") - ); - - let field_types = schema.fields.iter().map(|field| 
field.typ).collect(); - - let merge_statement = generate_merge_statement(&self.table, &temp_table, &schema); - info!(target: "oracle_sink", "Merge statement {}", merge_statement); - - let insert_statement = generate_insert_statement(&temp_table, &schema); - info!(target: "oracle_sink", "Insert statement {}", insert_statement); - - let delete_statement = generate_delete_statement(&temp_table); - info!(target: "oracle_sink", "Delete statement {}", delete_statement); - - Ok(Box::new(OracleSink { - conn: connection, - insert_append, - merge_statement, - insert_statement, - delete_statement, - field_types, - pk: schema.primary_index, - batch_params: Vec::new(), - //TODO: make this configurable - batch_size: 10000, - insert_metadata: format!("INSERT INTO \"{METADATA_TABLE}\" (\"{META_TABLE_COL}\", \"{META_TXN_ID_COL}\") VALUES (q'\"{}_{}\"', :1)", &self.table.owner, &self.table.name), - update_metadata: format!("UPDATE \"{METADATA_TABLE}\" SET \"{META_TXN_ID_COL}\" = :1 WHERE \"{META_TABLE_COL}\" = q'\"{}_{}\"'", &self.table.owner, &self.table.name) , - select_metadata: format!("SELECT \"{META_TXN_ID_COL}\" FROM \"{METADATA_TABLE}\" WHERE \"{META_TABLE_COL}\" = q'\"{}_{}\"'", &self.table.owner, &self.table.name), - latest_txid: None, - })) - } -} - -#[derive(Debug)] -struct OraField(Field, FieldType); - -impl ToSql for OraField { - fn oratype(&self, conn: &Connection) -> oracle::Result { - match &self.0 { - Field::UInt(v) => v.oratype(conn), - Field::Int(v) => v.oratype(conn), - Field::Float(v) => v.oratype(conn), - Field::Boolean(_) => Ok(OracleType::Number(1, 0)), - Field::String(v) | Field::Text(v) => v.oratype(conn), - Field::Binary(v) => v.oratype(conn), - Field::Decimal(_) => Ok(OracleType::Number(29, 10)), - Field::Timestamp(v) => v.oratype(conn), - Field::Date(v) => v.oratype(conn), - Field::Duration(_) => Ok(OracleType::IntervalDS(9, 9)), - Field::Null => match self.1 { - FieldType::UInt => 0u64.oratype(conn), - FieldType::Int => 0i64.oratype(conn), - 
FieldType::Float => 0f64.oratype(conn), - FieldType::Boolean => Ok(OracleType::Number(1, 0)), - FieldType::String | FieldType::Text => "".oratype(conn), - FieldType::Binary => Vec::::new().oratype(conn), - FieldType::Decimal => Ok(OracleType::Number(29, 10)), - FieldType::Timestamp => DateTime::::MAX_UTC.oratype(conn), - FieldType::Date => NaiveDate::MAX.oratype(conn), - FieldType::Duration => Ok(OracleType::IntervalDS(9, 9)), - _ => unimplemented!(), - }, - _ => unimplemented!(), - } - } - - fn to_sql(&self, val: &mut oracle::SqlValue) -> oracle::Result<()> { - match &self.0 { - Field::UInt(v) => v.to_sql(val), - Field::Int(v) => v.to_sql(val), - Field::Float(v) => v.to_sql(val), - Field::Boolean(_) => 1.to_sql(val), - Field::String(v) | Field::Text(v) => v.to_sql(val), - Field::Binary(v) => v.to_sql(val), - Field::Timestamp(v) => v.to_sql(val), - Field::Decimal(v) => v.to_string().to_sql(val), - Field::Date(v) => v.to_sql(val), - Field::Duration(d) => chrono::Duration::from_std(d.0) - .map_err(|e| oracle::Error::OutOfRange(e.to_string())) - .and_then(|v| v.to_sql(val)), - Field::Null => val.set_null(), - _ => unimplemented!(), - } - } -} - -#[derive(Debug)] -enum OpKind { - Insert = 0, - Update = 1, - Delete = 2, -} - -impl OracleSink { - fn exec_batch(&mut self) -> oracle::Result<()> { - debug!(target: "oracle_sink", "Executing batch of size {}", self.batch_params.len()); - let started = std::time::Instant::now(); - - let mut batch = self - .conn - .batch(&self.insert_statement, self.batch_params.len()) - .build()?; - for params in self.batch_params.drain(..) 
{ - let mut bind_idx = 1..; - for ((field, typ), i) in params - .params - .values - .into_iter() - .zip(&self.field_types) - .zip(&mut bind_idx) - { - batch.set(i, &OraField(field, *typ))?; - } - let (txid, seq_in_tx) = params.op_id.map(|opid| (opid.txid, opid.seq_in_tx)).unzip(); - batch.set(bind_idx.next().unwrap(), &txid)?; - batch.set(bind_idx.next().unwrap(), &seq_in_tx)?; - batch.set(bind_idx.next().unwrap(), &(params.op_kind as u64))?; - batch.append_row(&[])?; - } - batch.execute()?; - - self.conn.execute(&self.merge_statement, &[])?; - - self.conn.execute(&self.delete_statement, &[])?; - - debug!(target: "oracle_sink", "Execution took {:?}", started.elapsed()); - Ok(()) - } - - fn batch( - &mut self, - op_id: Option, - kind: OpKind, - record: Record, - ) -> oracle::Result<()> { - self.batch_params.push(BatchedOperation { - op_id, - op_kind: kind, - params: record, - }); - if self.batch_params.len() >= self.batch_size { - self.exec_batch()?; - } - Ok(()) - } -} - -impl Sink for OracleSink { - fn commit( - &mut self, - _epoch_details: &dozer_core::epoch::Epoch, - ) -> Result<(), dozer_types::errors::internal::BoxedError> { - // Ok(self.conn.commit()?) - Ok(()) - } - - fn supports_batching(&self) -> bool { - true - } - - fn flush_batch(&mut self) -> Result<(), BoxedError> { - self.exec_batch()?; - if let Some(txid) = self.latest_txid { - // If the row_count == 0, we need to insert instead. - if self - .conn - .execute(&self.update_metadata, &[&txid])? - .row_count()? 
- == 0 - { - self.conn.execute(&self.insert_metadata, &[&txid])?; - } - } - self.conn.commit()?; - Ok(()) - } - - fn process( - &mut self, - op: TableOperation, - ) -> Result<(), dozer_types::errors::internal::BoxedError> { - self.latest_txid = op.id.map(|id| id.txid); - match op.op { - Operation::Delete { old } => { - self.batch(op.id, OpKind::Delete, old)?; - } - Operation::Insert { new } => { - self.batch(op.id, OpKind::Insert, new)?; - } - Operation::Update { old, new } => { - let old_index = old.get_fields_by_indexes(&self.pk); - let new_index = new.get_fields_by_indexes(&self.pk); - if old_index != new_index { - return Err(Box::new(Error::UpdatedPrimaryKey { - old: old_index, - new: new_index, - })); - } - - self.batch(op.id, OpKind::Update, new)?; - } - Operation::BatchInsert { mut new } => { - let mut batch = self - .conn - .batch(&self.insert_append, self.batch_size) - .build()?; - for record in new.drain(..) { - let mut bind_idx = 1..; - for ((field, typ), i) in record - .values - .into_iter() - .zip(&self.field_types) - .zip(&mut bind_idx) - { - batch.set(i, &OraField(field, *typ))?; - } - let (txid, seq_in_tx) = op.id.map(|id| (id.txid, id.seq_in_tx)).unzip(); - batch.set(bind_idx.next().unwrap(), &txid)?; - batch.set(bind_idx.next().unwrap(), &seq_in_tx)?; - - batch.append_row(&[])?; - } - batch.execute()?; - } - } - Ok(()) - } - - fn on_source_snapshotting_started( - &mut self, - _connection_name: String, - ) -> Result<(), dozer_types::errors::internal::BoxedError> { - Ok(()) - } - - fn on_source_snapshotting_done( - &mut self, - _connection_name: String, - id: Option, - ) -> Result<(), dozer_types::errors::internal::BoxedError> { - self.latest_txid = id.map(|opid| opid.txid); - self.flush_batch()?; - Ok(()) - } - - fn set_source_state( - &mut self, - _source_state: &[u8], - ) -> Result<(), dozer_types::errors::internal::BoxedError> { - Ok(()) - } - - fn get_source_state( - &mut self, - ) -> Result>, dozer_types::errors::internal::BoxedError> { - 
Ok(None) - } - - fn get_latest_op_id( - &mut self, - ) -> Result, dozer_types::errors::internal::BoxedError> - { - match self.conn.query_row_as::(&self.select_metadata, &[]) { - Ok(txid) => Ok(Some(OpIdentifier { txid, seq_in_tx: 0 })), - Err(oracle::Error::NoDataFound) => Ok(None), - Err(e) => Err(e.into()), - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use dozer_core::tokio; - - fn trim_str(s: impl AsRef) -> String { - s.as_ref() - .lines() - .map(|line| line.trim()) - .filter(|line| !line.is_empty()) - .collect::>() - .join(" ") - } - - #[test] - fn test_generate_merge_stmt() { - let mut schema = Schema::new(); - schema - .field(f("id"), true) - .field(f("name"), true) - .field(f("content"), false); - - let table = Table { - owner: "owner".to_owned(), - name: "tablename".to_owned(), - unique_key: vec![], - }; - - let temp_table = Table { - owner: "owner".to_owned(), - name: "tablename_temp".to_owned(), - unique_key: vec![], - }; - - let stmt = generate_merge_statement(&table, &temp_table, &schema); - assert_eq!( - trim_str(stmt), - trim_str( - r#" - MERGE INTO "owner"."tablename" D - USING "owner"."tablename_temp" S - ON (D."id" = S."id" AND D."name" = S."name") - WHEN NOT MATCHED THEN INSERT (D."id", D."name", D."content", D."__txn_id", D."__txn_seq") VALUES (S."id", S."name", S."content", S."__txn_id", S."__txn_seq") WHERE S.DOZER_OPKIND = 0 - WHEN MATCHED THEN UPDATE SET D."content" = S."content", D."__txn_id" = S."__txn_id", D."__txn_seq" = S."__txn_seq" - WHERE S.DOZER_OPKIND = 1 AND (D."__txn_id" IS NULL - OR S."__txn_id" > D."__txn_id" - OR (S."__txn_id" = D."__txn_id" AND S."__txn_seq" > D."__txn_seq")) - DELETE WHERE S.DOZER_OPKIND = 2 AND (D."__txn_id" IS NULL - OR S."__txn_id" > D."__txn_id" - OR (S."__txn_id" = D."__txn_id" AND S."__txn_seq" > D."__txn_seq")) -"# - ) - ) - } - - #[tokio::test] - #[ignore = "Needs oracle database"] - async fn test_insert_composite() { - let factory = OracleSinkFactory::new( - OracleConfig { - user: 
"C##DOZER".into(), - password: "123".into(), - host: "localhost".into(), - port: 1521, - sid: "ORCLCDB".into(), - pdb: Some("ORCLPDB1".into()), - schemas: vec![], - batch_size: None, - replicator: dozer_types::models::ingestion_types::OracleReplicator::LogMiner { - poll_interval_in_milliseconds: 0, - }, - }, - OracleSinkConfig { - connection: "".into(), - table_name: "test".into(), - owner: None, - unique_key: vec![], - }, - ); - let mut schema = Schema::new(); - schema.field( - FieldDefinition { - name: "ida".into(), - typ: FieldType::UInt, - nullable: false, - source: SourceDefinition::Dynamic, - description: None, - }, - true, - ); - schema.field( - FieldDefinition { - name: "idb".into(), - typ: FieldType::UInt, - nullable: false, - source: SourceDefinition::Dynamic, - description: None, - }, - true, - ); - let schemas = HashMap::from_iter([(DEFAULT_PORT_HANDLE, schema)]); - let mut sink = factory.build(schemas, EventHub::new(100)).await.unwrap(); - for id0 in 0..2 { - for id1 in 0..2 { - sink.process(TableOperation { - id: None, - op: Operation::Insert { - new: Record::new(vec![Field::UInt(id0), Field::UInt(id1)]), - }, - port: 0, - }) - .unwrap(); - } - } - sink.flush_batch().unwrap(); - let conn = Connection::connect("C##DOZER", "123", "localhost/ORCLPDB1").unwrap(); - assert_eq!( - conn.query_row_as::("SELECT COUNT(*) FROM \"test\"", &[]) - .unwrap(), - 4 - ) - } - - fn f(name: &str) -> FieldDefinition { - FieldDefinition { - name: name.to_owned(), - typ: FieldType::String, - nullable: false, - source: SourceDefinition::Dynamic, - description: None, - } - } -} diff --git a/dozer-types/src/models/sink.rs b/dozer-types/src/models/sink.rs index 10c1f468d5..ac86af10bb 100644 --- a/dozer-types/src/models/sink.rs +++ b/dozer-types/src/models/sink.rs @@ -1,9 +1,9 @@ -use std::num::NonZeroUsize; +use std::{num::NonZeroUsize, path::Display}; +use super::equal_default; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; - -use super::equal_default; +use 
std::fmt; #[derive(Debug, Serialize, Deserialize, JsonSchema, Default, Eq, PartialEq, Clone)] #[serde(deny_unknown_fields)] @@ -110,6 +110,17 @@ pub enum SinkConfig { Clickhouse(ClickhouseSinkConfig), Oracle(OracleSinkConfig), } +impl SinkConfig { + pub fn name(&self) -> String { + let name = match self { + SinkConfig::Dummy(_) => "dummy", + SinkConfig::Aerospike(_) => "aerospike", + SinkConfig::Clickhouse(_) => "clickhouse", + SinkConfig::Oracle(_) => "oracle", + }; + return name.to_string(); + } +} #[derive(Debug, Serialize, Deserialize, JsonSchema, Clone, PartialEq, Eq)] #[serde(deny_unknown_fields)] From bdca79522ea10f5406d82539442dd2c94105640d Mon Sep 17 00:00:00 2001 From: VG Date: Tue, 16 Apr 2024 13:16:11 +0800 Subject: [PATCH 3/4] chore: remove git notifications --- .github/labeler.yml | 7 - .github/workflows/general.yaml | 23 -- .github/workflows/integration.yaml | 54 ---- .github/workflows/labeler.yml | 17 -- .github/workflows/pulls.yaml | 33 --- .github/workflows/release.yaml | 413 ----------------------------- .github/workflows/unit.yaml | 93 ------- 7 files changed, 640 deletions(-) delete mode 100644 .github/labeler.yml delete mode 100644 .github/workflows/general.yaml delete mode 100644 .github/workflows/integration.yaml delete mode 100644 .github/workflows/labeler.yml delete mode 100644 .github/workflows/pulls.yaml delete mode 100644 .github/workflows/release.yaml delete mode 100644 .github/workflows/unit.yaml diff --git a/.github/labeler.yml b/.github/labeler.yml deleted file mode 100644 index b2712fa177..0000000000 --- a/.github/labeler.yml +++ /dev/null @@ -1,7 +0,0 @@ -# Note that any updates to this files will not be applied in CI -# until this file is merged into main. This is due to oddities of the labeller Github Action. 
-'doc-update-needed': - - dozer-types/src/models/* - - dozer-cli/src/cli/types.rs - - dozer-types/protos/* - - dozer-api/src/generator/protoc/generator/template/proto.tmpl diff --git a/.github/workflows/general.yaml b/.github/workflows/general.yaml deleted file mode 100644 index aaa1365106..0000000000 --- a/.github/workflows/general.yaml +++ /dev/null @@ -1,23 +0,0 @@ -name: Dozer General - -on: - issues: - types: [opened, edited, milestoned] - issue_comment: - types: [created, deleted, edited] - discussion: - types: [created, edited] - discussion_comment: - types: [created, deleted, edited] -concurrency: - group: general - -jobs: - notify: - name: Discord General - runs-on: ubuntu-latest - steps: - - name: Discord notification - env: - DISCORD_WEBHOOK: ${{ secrets.DISCORD_DISCUSSIONS_WEBHOOK }} - uses: Ilshidur/action-discord@master diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml deleted file mode 100644 index ac38c4949a..0000000000 --- a/.github/workflows/integration.yaml +++ /dev/null @@ -1,54 +0,0 @@ -name: Dozer Integration Test - -on: - workflow_dispatch: - inputs: - dozer-version: - description: Expected Dozer version number. Leave blank to skip verifying the version. 
- -env: - CARGO_TERM_COLOR: always - DOZER_VERSION: ${{ github.event.inputs.dozer-version }} - -concurrency: - group: integration/${{ github.head_ref }} - cancel-in-progress: true - -jobs: - integration-linux: - timeout-minutes: 60 - strategy: - matrix: - labels: [ubuntu-latest, ubuntu-20.04] - fail-fast: false - runs-on: - labels: ${{ matrix.labels }} - steps: - - uses: actions/checkout@v3 - - - name: Install Dozer - run: sudo sh .github/workflows/integration/dockerfiles/install-dozer-ubuntu-amd64.sh - - - name: Install Protoc Ubuntu 22.04 - if: matrix.labels == 'ubuntu-latest' - run: sudo sh .github/workflows/integration/dockerfiles/install-protoc-ubuntu-22.sh - - - name: Install Protoc Ubuntu 20.04 - if: matrix.labels == 'ubuntu-20.04' - run: sudo sh .github/workflows/integration/dockerfiles/install-protoc-ubuntu-20-amd64.sh - - - name: Run test - run: sudo sh .github/workflows/integration/test-dozer-ubuntu.sh - - integration-macos: - timeout-minutes: 60 - runs-on: - labels: macos-12 - steps: - - uses: actions/checkout@v3 - - - name: Install Dozer - run: brew tap getdozer/dozer && brew install dozer - - - name: Run test - run: sh .github/workflows/integration/test-dozer.sh diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml deleted file mode 100644 index d0755a7167..0000000000 --- a/.github/workflows/labeler.yml +++ /dev/null @@ -1,17 +0,0 @@ -name: 'Pull Request Labeler' -on: - - pull_request_target - -jobs: - triage: - permissions: - contents: read - pull-requests: write - runs-on: ubuntu-latest - steps: - - name: Check out code - uses: actions/checkout@v3 - - name: Run labeler - uses: actions/labeler@v4 - with: - repo-token: '${{ secrets.GITHUB_TOKEN }}' diff --git a/.github/workflows/pulls.yaml b/.github/workflows/pulls.yaml deleted file mode 100644 index b241d8d7a8..0000000000 --- a/.github/workflows/pulls.yaml +++ /dev/null @@ -1,33 +0,0 @@ -name: Dozer Pulls -on: - pull_request_target: - branches: [main, pull-yaml-dev] - types: 
[opened] - pull_request_review: - types: [submitted] - pull_request_review_comment: - types: [created, deleted] - -concurrency: - group: pull - -jobs: - notify: - name: Discord Pull - runs-on: ubuntu-latest - steps: - - name: Pull Request - if: ${{ github.event_name == 'pull_request_target' || github.event_name == 'pull_request' }} - env: - DISCORD_WEBHOOK: ${{ secrets.DISCORD_GITHUB_WEBOOK }} - DISCORD_EMBEDS: '[ { - "title": " Pull request #${{ github.event.pull_request.number }} opened by ${{ github.actor }}", - "author": { "icon_url": "https://avatars.githubusercontent.com/${{ github.actor }}", "name": "${{ github.actor }}", "url": "https://github.com/${{ github.actor }}" }, - "fields": [ - { "name": "Pull Request", "value": "[${{ github.event.pull_request.title }}](${{ github.event.pull_request.html_url }})" }, - { "name": "Repository", "value": "[getdozer/dozer](https://github.com/getdozer/dozer)" }, - { "name": "Message", "value": ${{ toJSON(github.event.pull_request.body || github.event.pull_request.title) }}} - ], - "color": 990099 - }]' - uses: Ilshidur/action-discord@master diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml deleted file mode 100644 index 53390587fe..0000000000 --- a/.github/workflows/release.yaml +++ /dev/null @@ -1,413 +0,0 @@ -name: Release -on: - workflow_dispatch: - push: - branches: [release, release-dev, release-test, main] - tags: - - "v*.*.*" -env: - CARGO_TERM_COLOR: always - BUCKET_NAME: "dozer-releases" - ECR_REGISTRY: public.ecr.aws/k7k6x1d4 - ECR_REPOSITORY: dozer - DOCKERHUB_REGISTRY: getdozer - DOCKERHUB_REPOSITORY: dozer - -permissions: - id-token: write # This is required for requesting the JWT - contents: write # This is required for actions/checkout - -jobs: - # https://github.com/orhun/git-cliff/blob/main/.github/workflows/cd.yml - prepare: - name: Prepare - runs-on: ubuntu-20.04 - timeout-minutes: 60 - outputs: - release_body: ${{ steps.release.outputs.release_body }} - version: ${{ 
steps.version.outputs.version }} - prerelease: ${{ steps.version.outputs.prerelease }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - fetch-depth: 0 - submodules: 'recursive' - - name: Generate a changelog - uses: orhun/git-cliff-action@v1 - id: git-cliff - with: - config: .github/config/cliff.toml - args: -vv --latest --strip header - env: - OUTPUT: CHANGES.md - - - name: Set the release body - id: release - shell: bash - run: | - r=$(cat ${{ steps.git-cliff.outputs.changelog }}) - r="$(printf "$r" | tail -n +3)" - r="${r//'%'/'%25'}" - r="${r//$'\n'/'%0A'}" - r="${r//$'\r'/'%0D'}" - echo "::set-output name=release_body::$r" - - - name: Set release version - id: version - run: | - tag=$(printf "%q" ${{ github.ref_name }}) - - if [[ $tag =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then - echo "::set-output name=version::$tag" - echo "::set-output name=prerelease::false" - else - echo "::set-output name=version::dev" - echo "::set-output name=prerelease::true" - fi - - release-linux-aarch64: - name: Release Linux binary for aarch64 - runs-on: ubuntu-20.04 - needs: prepare - env: - CARGO_TARGET: aarch64-unknown-linux-gnu - DEB_NAME: dozer-linux-aarch64 - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - submodules: 'recursive' - - - name: Rust cache - uses: swatinem/rust-cache@v2 - - - name: Install toolchain - uses: dtolnay/rust-toolchain@master - with: - toolchain: stable - target: ${{ env.CARGO_TARGET }} - - - name: Install cross - uses: baptiste0928/cargo-install@v1 - with: - crate: cross - cache-key: '${{ env.CARGO_TARGET }}' - - - name: Build dozer - run: cross build --package=dozer-cli --profile=release --target ${{ env.CARGO_TARGET }} --bin dozer - - - name: Install cargo-deb - uses: baptiste0928/cargo-install@v1 - with: - crate: cargo-deb - cache-key: '${{ env.CARGO_TARGET }}' - - - name: Compile deb file - run: cargo deb -p dozer-cli --target ${{ env.CARGO_TARGET }} --no-build --no-strip --output ./deb/${{ env.DEB_NAME 
}}.deb - - - name: Prepare release assets - shell: bash - run: | - mkdir -p release - cp {LICENSE,README.md,CHANGELOG.md} release/ 2> /dev/null || echo "Copy Failed...Ignoring.." - cp target/${{ env.CARGO_TARGET }}/release/dozer release/ - - mv release/ ${{ env.DEB_NAME }}/ - - tar -czvf ${{ env.DEB_NAME }}.tar.gz ${{ env.DEB_NAME }}/ - - cp deb/${{ env.DEB_NAME }}.deb ./ - - ls -l ${{ env.DEB_NAME }}* - - - name: Upload the release - uses: svenstaro/upload-release-action@v2 - with: - repo_token: ${{ secrets.GITHUB_TOKEN }} - file: ${{ env.DEB_NAME }}* - file_glob: true - overwrite: true - tag: ${{ needs.prepare.outputs.version }} - release_name: "Development Release - ${{ needs.prepare.outputs.version }}" - prerelease: ${{ needs.prepare.outputs.prerelease }} - body: "${{ needs.prepare.outputs.release_body }}" - - - name: Set env variables - env: - VERSION: ${{ needs.prepare.outputs.version }} - RELEASE_NAME: ${{ env.DEB_NAME }}.tar.gz - run: | - echo "RELEASE_NAME=${{env.RELEASE_NAME}}" >> $GITHUB_ENV - echo "DEB_NAME=${{ env.DEB_NAME }}.deb" >> $GITHUB_ENV - echo "VERSION=${{env.VERSION}}" >> $GITHUB_ENV - echo "ARTIFACT_URL=https://${{ env.BUCKET_NAME }}.s3.ap-southeast-1.amazonaws.com/${{ env.VERSION }}/${{ env.RELEASE_NAME }}" >> $GITHUB_ENV - - - name: List deb output files - run: ls -lR ./deb - - - name: configure aws credentials - uses: aws-actions/configure-aws-credentials@v1 - with: - role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }} - role-session-name: deployer - aws-region: ap-southeast-1 - - - name: Upload release to S3 - id: upload_s3 - run: | - aws s3 cp $RELEASE_NAME s3://${{ env.BUCKET_NAME }}/$VERSION/$RELEASE_NAME - - - name: Upload release deb to S3 - id: upload_s3_deb - run: | - aws s3 cp deb/$DEB_NAME s3://${{ env.BUCKET_NAME }}/$VERSION/$DEB_NAME - - release-macos-apple-silicon: - name: Release binary for macOS silicon - runs-on: ${{ matrix.os }} - needs: prepare - strategy: - fail-fast: false - matrix: - include: - - os: macos-12 - target: 
aarch64-apple-darwin - file_name: dozer - asset_name: dozer-macos-aarch64 - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - submodules: 'recursive' - - name: Installing Rust toolchain - uses: actions-rs/toolchain@v1 - with: - toolchain: stable - profile: minimal - target: ${{ matrix.target }} - override: true - - name: Install Protoc - uses: arduino/setup-protoc@v1 - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} - - - name: Rust cache - uses: swatinem/rust-cache@v2 - - - name: Cargo build - uses: actions-rs/cargo@v1 - with: - command: build - args: --release --target ${{ matrix.target }} --bin ${{ matrix.file_name }} - - - name: List target output files - run: ls -lR ./target - - - name: Prepare release assets - shell: bash - run: | - mkdir -p release - cp {LICENSE,README.md,CHANGELOG.md} release/ 2> /dev/null || echo "Copy Failed...Ignoring.." - cp target/${{ matrix.target }}/release/${{matrix.file_name}} release/ - - mv release/ ${{matrix.asset_name}}/ - - tar -czvf ${{matrix.asset_name}}.tar.gz ${{matrix.asset_name}}/ - - - name: Upload the release - uses: svenstaro/upload-release-action@v2 - with: - repo_token: ${{ secrets.GITHUB_TOKEN }} - file: ${{matrix.asset_name}}* - file_glob: true - overwrite: true - tag: ${{ needs.prepare.outputs.version }} - release_name: "Development Release - ${{ needs.prepare.outputs.version }}" - prerelease: ${{ needs.prepare.outputs.prerelease }} - body: "${{ needs.prepare.outputs.release_body }}" - - - name: Set env variables - env: - VERSION: ${{ needs.prepare.outputs.version }} - RELEASE_NAME: ${{matrix.asset_name}}.tar.gz - run: | - echo "RELEASE_NAME=${{env.RELEASE_NAME}}" >> $GITHUB_ENV - echo "VERSION=${{env.VERSION}}" >> $GITHUB_ENV - echo "ARTIFACT_URL=https://${{ env.BUCKET_NAME }}.s3.ap-southeast-1.amazonaws.com/${{ env.VERSION }}/${{ env.RELEASE_NAME }}" >> $GITHUB_ENV - - - name: configure aws credentials - uses: aws-actions/configure-aws-credentials@v1 - with: - role-to-assume: ${{ 
secrets.AWS_ROLE_TO_ASSUME }} - role-session-name: deployer - aws-region: ap-southeast-1 - - - name: Upload release to S3 - id: upload_s3 - run: | - aws s3 cp $RELEASE_NAME s3://${{ env.BUCKET_NAME }}/$VERSION/$RELEASE_NAME - - release: - name: Release - runs-on: - labels: ${{ matrix.os }} - needs: prepare - strategy: - matrix: - os: [ubuntu-20.04] - include: - - os: ubuntu-20.04 - file_name: dozer - target: x86_64-unknown-linux-gnu - asset_name: dozer-linux-amd64 - - os: macos-12 - file_name: dozer - target: x86_64-apple-darwin - asset_name: dozer-macos-amd64 - steps: - - uses: actions/checkout@v3 - with: - submodules: 'recursive' - - name: Install minimal stable with clippy and rustfmt - uses: actions-rs/toolchain@v1 - with: - profile: minimal - toolchain: stable - target: ${{ matrix.target }} - components: rustfmt, clippy - - name: Install Protoc - uses: arduino/setup-protoc@v1 - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} - - - name: Rust cache - uses: swatinem/rust-cache@v2 - - - name: Install cargo-deb - if: matrix.os == 'ubuntu-20.04' - run: cargo install cargo-deb - - - name: Compile deb file - if: matrix.os == 'ubuntu-20.04' - run: cargo-deb -p dozer-cli --output ./deb/${{matrix.asset_name}}.deb - - - name: Build package - if: matrix.os != 'ubuntu-20.04' - run: cargo build --release --bin ${{ matrix.file_name }} - - - name: Build package for ubuntu (with kafka & snowflake) - if: matrix.os == 'ubuntu-20.04' - run: cargo build --release --bin ${{ matrix.file_name }} --features "kafka snowflake" - - - name: Prepare release assets - shell: bash - run: | - mkdir -p release - cp {LICENSE,README.md,CHANGELOG.md} release/ 2> /dev/null || echo "Copy Failed...Ignoring.." 
- cp target/release/${{matrix.file_name}} release/ - - mv release/ ${{matrix.asset_name}}/ - - tar -czvf ${{matrix.asset_name}}.tar.gz \ - ${{matrix.asset_name}}/ - - cp deb/${{matrix.asset_name}}.deb ./ 2>/dev/null || : - - ls -l ${{matrix.asset_name}}* - - - name: Upload the release - uses: svenstaro/upload-release-action@v2 - with: - repo_token: ${{ secrets.GITHUB_TOKEN }} - file: ${{matrix.asset_name}}* - file_glob: true - overwrite: true - tag: ${{ needs.prepare.outputs.version }} - release_name: "Development Release - ${{ needs.prepare.outputs.version }}" - prerelease: ${{ needs.prepare.outputs.prerelease }} - body: "${{ needs.prepare.outputs.release_body }}" - - - name: Set env variables - env: - VERSION: ${{ needs.prepare.outputs.version }} - RELEASE_NAME: ${{matrix.asset_name}}.tar.gz - run: | - echo "RELEASE_NAME=${{env.RELEASE_NAME}}" >> $GITHUB_ENV - echo "DEB_NAME=${{matrix.asset_name}}.deb" >> $GITHUB_ENV - echo "VERSION=${{env.VERSION}}" >> $GITHUB_ENV - echo "ARTIFACT_URL=https://${{ env.BUCKET_NAME }}.s3.ap-southeast-1.amazonaws.com/${{ env.VERSION }}/${{ env.RELEASE_NAME }}" >> $GITHUB_ENV - - - name: configure aws credentials - if: matrix.os == 'ubuntu-20.04' - uses: aws-actions/configure-aws-credentials@v1 - with: - role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }} - role-session-name: deployer - aws-region: ap-southeast-1 - - - name: Upload release to S3 - id: upload_s3 - if: matrix.os == 'ubuntu-20.04' - run: | - aws s3 cp $RELEASE_NAME s3://${{ env.BUCKET_NAME }}/$VERSION/$RELEASE_NAME - - - name: Upload release deb to S3 - id: upload_s3_deb - if: matrix.os == 'ubuntu-20.04' - run: | - aws s3 cp deb/$DEB_NAME s3://${{ env.BUCKET_NAME }}/$VERSION/$DEB_NAME - - - name: Build, tag, and push image to Amazon ECR - id: build_push_ecr - if: matrix.os == 'ubuntu-20.04' - env: - IMAGE_TAG: ${{ needs.prepare.outputs.version }} - run: | - aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $ECR_REGISTRY - 
docker build -f ci/Dockerfile -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG -t $ECR_REGISTRY/$ECR_REPOSITORY:$GITHUB_SHA . - docker push $ECR_REGISTRY/$ECR_REPOSITORY --all-tags - - - name: Update latest image if releasing - if: (needs.prepare.outputs.prerelease == 'false') && (matrix.os == 'ubuntu-20.04') - env: - IMAGE_TAG: ${{ needs.prepare.outputs.version }} - run: | - docker tag $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG $ECR_REGISTRY/$ECR_REPOSITORY:latest - docker push $ECR_REGISTRY/$ECR_REPOSITORY:latest - - - name: Log in to Docker Hub - if: (github.event_name == 'release') && (needs.prepare.outputs.prerelease == 'false') && (matrix.os == 'ubuntu-20.04') - uses: docker/login-action@v1 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - - name: Build and push Docker image to Docker Hub - if: (github.event_name == 'release') && (needs.prepare.outputs.prerelease == 'false') && (matrix.os == 'ubuntu-20.04') - env: - IMAGE_TAG: ${{ needs.prepare.outputs.version }} - uses: docker/build-push-action@v2 - with: - context: . 
- file: ./ci/Dockerfile - push: true - tags: ${{ env.DOCKERHUB_REGISTRY }}/${{ env.DOCKERHUB_REPOSITORY }}:latest,${{ env.DOCKERHUB_REGISTRY }}/${{ env.DOCKERHUB_REPOSITORY }}:${{ env.IMAGE_TAG }} - - - name: Release notification - if: ${{ env.VERSION != 'dev' && matrix.os == 'ubuntu-20.04'}} - env: - DISCORD_WEBHOOK: ${{ secrets.DISCORD_RELEASE_HOOK }} - DISCORD_EMBEDS: '[ { - "title": "New version `${{env.VERSION}}` released", - "author": { "icon_url": "https://avatars.githubusercontent.com/${{ github.actor }}", "name": "${{ github.actor }}", "url": "https://github.com/${{github.actor}}" }, - "fields": [ - { "name": "Repository", "value": "[getdozer/dozer](https://github.com/getdozer/dozer)", "inline": true }, - { "name": "Binary", "value": "[${{ env.RELEASE_NAME }}](${{ env.ARTIFACT_URL }})", "inline": true }, - { "name": "Using Binary", "value": "`dozer -h`"}, - { "name": "Release Notes", "value": "Release notes can be found [here](https://github.com/getdozer/dozer/releases/tag/${{env.VERSION}})"} - ], - "color": 990099 - }]' - uses: Ilshidur/action-discord@master diff --git a/.github/workflows/unit.yaml b/.github/workflows/unit.yaml deleted file mode 100644 index 9737477e96..0000000000 --- a/.github/workflows/unit.yaml +++ /dev/null @@ -1,93 +0,0 @@ -name: Unit Tests - -on: - workflow_dispatch: - pull_request_target: - branches: [main] - merge_group: - -env: - CARGO_TERM_COLOR: always - -concurrency: - group: unit/${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -permissions: - id-token: write # This is required for requesting the JWT - contents: write # This is required for actions/checkout - -jobs: - # Run unit tests - unit: - timeout-minutes: 60 - runs-on: ubuntu-latest - services: - postgres: - image: debezium/postgres:13 - ports: - - 5434:5432 - env: - POSTGRES_DB: dozer_test - POSTGRES_USER: postgres - POSTGRES_PASSWORD: postgres - ALLOW_IP_RANGE: 0.0.0.0/0 - # command: postgres -c hba_file=/var/lib/stock-sample/pg_hba.conf - options: 
>- - --health-cmd pg_isready - --health-interval 10s - --health-timeout 5s - --health-retries 5 - - steps: - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v2 - with: - role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }} - role-session-name: dozer-coverage - aws-region: us-east-2 - - - if: github.event_name == 'pull_request_target' - uses: actions/checkout@v3 - with: - ref: ${{ github.event.pull_request.head.sha }} - submodules: 'recursive' - - - if: github.event_name != 'pull_request_target' - uses: actions/checkout@v3 - with: - submodules: 'recursive' - - - name: Install Protoc - uses: arduino/setup-protoc@v1 - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} - - - name: Rust cache - uses: swatinem/rust-cache@v2 - - - uses: ./.github/workflows/setup-snowflake-and-kafka - - - uses: ./.github/workflows/setup-mysql-and-mariadb - - - name: Run connectors tests - env: - SN_SERVER: ${{ secrets.SN_SERVER }} - SN_USER: ${{ secrets.SN_USER }} - SN_PASSWORD: ${{ secrets.SN_PASSWORD }} - SN_DATABASE: ${{ secrets.SN_DATABASE }} - SN_WAREHOUSE: ${{ secrets.SN_WAREHOUSE }} - SN_DRIVER: ${{ secrets.SN_DRIVER }} - shell: bash - run: | - cargo test \ - -p dozer-ingestion-postgres \ - -p dozer-ingestion-kafka \ - -p dozer-ingestion-mysql \ - --lib --no-fail-fast -- --ignored - - name: Run tests - shell: bash - run: | - source ./dozer-tests/python_udf/virtualenv.sh - cargo test --features snowflake,ethereum,kafka,python,mongodb --no-fail-fast - From 03bbcb7187bac7ab46cdb3d02f71502c499ce0c6 Mon Sep 17 00:00:00 2001 From: VG Date: Tue, 16 Apr 2024 13:36:47 +0800 Subject: [PATCH 4/4] chore: update readme --- README.md | 131 ++++++++++++++++++++---------------------------------- 1 file changed, 48 insertions(+), 83 deletions(-) diff --git a/README.md b/README.md index ff37b8fdcf..c971899975 100644 --- a/README.md +++ b/README.md @@ -1,88 +1,53 @@ - - -

- CI - Coverage Status - Docs - Join on Discord - License -

- ## Overview -Dozer is a **data platform for building, deploying and maintaining real-time data products.** - -It is ideal for companies with multiple databases, data warehouses and data lakes that are in need of combining, aggregating and transforming data in real time, and create customer facing or internal data applications. - -*Put it simply, Dozer empowers a single developer go from data sources to ready-made APIs in just a few minutes. All with just a with a simple configuration file.* +Dozer is a **real-time data movement tool leveraging CDC from various sources to multiple sinks.** -## How it works -Dozer pulls data from various sources like databases, data lakes, and data warehouses using Change Data Capture (CDC) and periodic polling mechanisms. This ensures up-to-date data ingestion in real-time or near-real-time. - -After capturing data, Dozer offers the possibility of combining, transforming and aggregating it -using its own internal real-time transformation engine. It supports Streaming SQL, WebAssembly (coming soon) and TypeScript (coming soon), as well as ONNX for performing AI predictions in real-time. - -After processing, data is stored and indexed in a low-latency datastore (based on [LMDB](https://github.com/LMDB/lmdb)), queryable using REST and gRPC. +Dozer is orders of magnitude faster than Debezium+Kafka and natively supports stateless transformations. +It is primarily used for moving data into warehouses. In our own application, we move data to **Clickhouse** and build data APIs and integration with LLMs. ## How to use it - -### ① Build -A Dozer application consists of a YAML file that can be run locally using the Dozer Live UI or Dozer CLI. As YAML is edited, -changes are immediately reflected on Dozer Live UI. - -![Screenshot](./images/dozer_live_screen1.png) - -### ② Test -Dozer can run the entire infrastructure locally. You can inspect data flowing in in real time or use the built-it API explorer to query data through REST and gRPC. 
Dozer Live explorer also provides ready-made samples to integrate results into your front-end applications. - -![Screenshot](./images/dozer_live_screen2.png) - -### ③ Deploy -Dozer applications can be self-hosted or deployed in the cloud with a single command. Dozer Cloud (coming soon) provides self-healing and monitoring capabilities, making sure your APIs are always available. - - -## Supported Sources and Tranformation Engines -Dozer currently supports a variety of source databases, data warehouses and object stores. Whenever possible, Dozer leverages Change Data Capture (CDC) to keep data always fresh. For sources that do not support CDC, periodic polling is used. - -Dozer transformations can be executed using Dozer's highly cutomizable streaming SQL engine, which provides UDF supports in WASM (coming soon), TypeScript (coming soon) and ONNX. - -Here is an overview of all supported source types and transformation engines: - -![Screenshot](./images/supported_sources.png) - - -## Why Dozer ? -As teams embark on the journey of implementing real-time data products, they invariably come across a host of challenges that can make the task seem daunting: - -1. **Integration with Various Systems**: Integrating with various data sources can present numerous technical hurdles and interoperability issues. - -2. **Managing Latency**: Ensuring low-latency data access, especially for customer-facing applications, can be a significant challenge. - -3. **Real-Time Data Transformation**: Managing real-time data transformations, especially when dealing with complex queries or large volumes of data, can be difficult and resource-intensive. - -4. **Maintaining Data Freshness**: Keeping the data up-to-date in real-time, particularly when it's sourced from multiple locations like databases, data lakes, or warehouses, can be a daunting task. - -4. 
**Scalability and High Availability**: Building a data application that can efficiently handle high-volume operations and remain reliable under heavy loads requires advanced architecture design and robust infrastructure. - -To address all the above issues, teams often find themselves stitching together multiple technologies and a significant amount of custom code. This could involve integrating diverse systems like Kafka for real-time data streaming, Redis for low-latency data access and caching, and Spark or Flink for processing and analyzing streaming data. - -![Complex Tools Setup](./images/tools.png) - -The complexity of such a setup can become overwhelming. Ensuring that these different technologies communicate effectively, maintaining them, and handling potential failure points requires extensive effort and expertise. - -This is where Dozer steps in, aiming to dramatically simplify this process. Dozer is designed as an all-in-one backend solution that integrates the capabilities of these disparate technologies into a single, streamlined tool. By doing so, Dozer offers the capacity to build an end-to-end real-time data product without the need to manage multiple technologies and extensive custom code. - -Dozer's goal is to empower a single engineer or a small team of engineers to fully manage the entire lifecycle of a Data Product! 
- -## Getting Started - -Follow the links below to get started with Dozer: - -- [Installation](https://getdozer.io/docs/installation) -- [Build a sample application using NY Taxi dataset](https://getdozer.io/docs/getting_started) - -For a more comprehensive list of samples check out our [GitHub Samples repo](https://github.com/getdozer/dozer-samples) +Dozer runs with a single configuration file like the following: +```yaml +app_name: dozer-bench +version: 1 +connections: + - name: pg_1 + config: !Postgres + user: user + password: postgres + host: localhost + port: 5432 + database: customers +sinks: + - name: customers + config: !Dummy + table_name: customers +``` + +Full documentation can be found [here](https://github.com/getdozer/dozer/blob/main/dozer-types/src/models/config.rs#L15) + + +## Supported Sources + +| Connector | Extraction | Resuming | Enterprise | +| -------------------- | ---------- | -------- | ------------------- | +| Postgres | ✅ | ✅ | ✅ | +| MySQL | ✅ | ✅ | ✅ | +| Snowflake | ✅ | ✅ | ✅ | +| Kafka | ✅ | 🚧 | ✅ | +| MongoDB | ✅ | 🎯 | ✅ | +| Amazon S3 | ✅ | 🎯 | ✅ | +| Google Cloud Storage | ✅ | 🎯 | ✅ | +| Oracle | ✅ | ✅ | **Enterprise Only** | +| Aerospike | ✅ | ✅ | **Enterprise Only** | + + +## Supported Sinks +| Database | Connectivity | Enterprise | +| ---------- | ------------ | ------------------- | +| Clickhouse | ✅ | | +| Postgres | ✅ | | +| MySQL | ✅ | | +| BigQuery | ✅ | | +| Oracle | ✅ | **Enterprise Only** | +| Aerospike | ✅ | **Enterprise Only** | \ No newline at end of file