## Learn more
diff --git a/docs/en/concept/config.md b/docs/en/concept/config.md
index cec01dc0d8f..d1fb0f07cda 100644
--- a/docs/en/concept/config.md
+++ b/docs/en/concept/config.md
@@ -19,6 +19,12 @@ config directory.
The config file is similar to the below one:
+:::warn
+
+The old configuration names `source_table_name`/`result_table_name` are deprecated; please migrate to the new names `plugin_input`/`plugin_output` as soon as possible.
+
+:::
+
### hocon
```hocon
@@ -28,7 +34,7 @@ env {
source {
FakeSource {
- result_table_name = "fake"
+ plugin_output = "fake"
row.num = 100
schema = {
fields {
@@ -42,8 +48,8 @@ source {
transform {
Filter {
- source_table_name = "fake"
- result_table_name = "fake1"
+ plugin_input = "fake"
+ plugin_output = "fake1"
fields = [name, card]
}
}
@@ -56,7 +62,7 @@ sink {
fields = ["name", "card"]
username = "default"
password = ""
- source_table_name = "fake1"
+ plugin_input = "fake1"
}
}
```
@@ -80,7 +86,7 @@ Source is used to define where SeaTunnel needs to fetch data, and use the fetche
Multiple sources can be defined at the same time. The supported source can be found
in [Source of SeaTunnel](../connector-v2/source). Each source has its own specific parameters to define how to
fetch data, and SeaTunnel also extracts the parameters that each source will use, such as
-the `result_table_name` parameter, which is used to specify the name of the data generated by the current
+the `plugin_output` parameter, which is used to specify the name of the data generated by the current
source, which is convenient for follow-up used by other modules.
### transform
@@ -96,7 +102,7 @@ env {
source {
FakeSource {
- result_table_name = "fake"
+ plugin_output = "fake"
row.num = 100
schema = {
fields {
@@ -116,7 +122,7 @@ sink {
fields = ["name", "age", "card"]
username = "default"
password = ""
- source_table_name = "fake"
+ plugin_input = "fake"
}
}
```
@@ -134,11 +140,11 @@ and efficiently. Sink and source are very similar, but the difference is reading
### Other Information
You will find that when multiple sources and multiple sinks are defined, which data is read by each sink, and
-which is the data read by each transform? We introduce two key configurations called `result_table_name` and
-`source_table_name`. Each source module will be configured with a `result_table_name` to indicate the name of the
-data source generated by the data source, and other transform and sink modules can use `source_table_name` to
+which is the data read by each transform? We introduce two key configurations called `plugin_output` and
+`plugin_input`. Each source module will be configured with a `plugin_output` to indicate the name of the
+data source generated by the data source, and other transform and sink modules can use `plugin_input` to
refer to the corresponding data source name, indicating that I want to read the data for processing. Then
-transform, as an intermediate processing module, can use both `result_table_name` and `source_table_name`
+transform, as an intermediate processing module, can use both `plugin_output` and `plugin_input`
configurations at the same time. But you will find that in the above example config, not every module is
configured with these two parameters, because in SeaTunnel, there is a default convention, if these two
parameters are not configured, then the generated data from the last module of the previous node will be used.
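+
+For illustration, a minimal single-pipeline sketch of this default convention (adapted from the example above; all `plugin_input`/`plugin_output` settings are omitted, so each module simply consumes the previous module's output):
+
+```hocon
+env {
+  parallelism = 1
+  job.mode = "BATCH"
+}
+
+source {
+  FakeSource {
+    # no plugin_output: the generated data is passed to the next node by default
+    row.num = 100
+    schema = {
+      fields {
+        name = "string"
+        card = "int"
+      }
+    }
+  }
+}
+
+transform {
+  Filter {
+    # no plugin_input/plugin_output: reads the source above, feeds the sink below
+    fields = [name, card]
+  }
+}
+
+sink {
+  Console {
+    # no plugin_input: consumes the output of the last transform
+  }
+}
+```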
@@ -170,7 +176,7 @@ Before writing the config file, please make sure that the name of the config fil
"source": [
{
"plugin_name": "FakeSource",
- "result_table_name": "fake",
+ "plugin_output": "fake",
"row.num": 100,
"schema": {
"fields": {
@@ -184,8 +190,8 @@ Before writing the config file, please make sure that the name of the config fil
"transform": [
{
"plugin_name": "Filter",
- "source_table_name": "fake",
- "result_table_name": "fake1",
+ "plugin_input": "fake",
+ "plugin_output": "fake1",
"fields": ["name", "card"]
}
],
@@ -198,7 +204,7 @@ Before writing the config file, please make sure that the name of the config fil
"fields": ["name", "card"],
"username": "default",
"password": "",
- "source_table_name": "fake1"
+ "plugin_input": "fake1"
}
]
}
@@ -234,7 +240,7 @@ env {
source {
FakeSource {
- result_table_name = "${resName:fake_test}_table"
+ plugin_output = "${resName:fake_test}_table"
row.num = "${rowNum:50}"
string.template = ${strTemplate}
int.template = [20, 21]
@@ -249,8 +255,8 @@ source {
transform {
sql {
- source_table_name = "${resName:fake_test}_table"
- result_table_name = "sql"
+ plugin_input = "${resName:fake_test}_table"
+ plugin_output = "sql"
query = "select * from ${resName:fake_test}_table where name = '${nameVal}' "
}
@@ -258,7 +264,7 @@ transform {
sink {
Console {
- source_table_name = "sql"
+ plugin_input = "sql"
username = ${username}
password = ${password}
}
@@ -291,7 +297,7 @@ env {
source {
FakeSource {
- result_table_name = "fake_test_table"
+ plugin_output = "fake_test_table"
row.num = 50
string.template = ['abc','d~f','hi']
int.template = [20, 21]
@@ -306,8 +312,8 @@ source {
transform {
sql {
- source_table_name = "fake_test_table"
- result_table_name = "sql"
+ plugin_input = "fake_test_table"
+ plugin_output = "sql"
query = "select * from fake_test_table where name = 'abc' "
}
@@ -315,7 +321,7 @@ transform {
sink {
Console {
- source_table_name = "sql"
+ plugin_input = "sql"
username = "seatunnel=2.3.1"
password = "$a^b%c.d~e0*9("
}
diff --git a/docs/en/concept/schema-evolution.md b/docs/en/concept/schema-evolution.md
index 067bfc7b1c9..f28c10f5f54 100644
--- a/docs/en/concept/schema-evolution.md
+++ b/docs/en/concept/schema-evolution.md
@@ -1,19 +1,25 @@
# Schema evolution
Schema Evolution means that the schema of a data table can be changed and the data synchronization task can automatically adapt to the changes of the new table structure without any other operations.
-Now we only support the operation about `add column`、`drop column`、`rename column` and `modify column` of the table in CDC source. This feature is only support zeta engine at now.
+Now we only support `add column`, `drop column`, `rename column` and `modify column` operations on tables in CDC sources. This feature only supports the Zeta engine at the moment.
+
## Supported connectors
### Source
[Mysql-CDC](https://github.com/apache/seatunnel/blob/dev/docs/en/connector-v2/source/MySQL-CDC.md)
+[Oracle-CDC](https://github.com/apache/seatunnel/blob/dev/docs/en/connector-v2/source/Oracle-CDC.md)
### Sink
[Jdbc-Mysql](https://github.com/apache/seatunnel/blob/dev/docs/en/connector-v2/sink/Jdbc.md)
+[Jdbc-Oracle](https://github.com/apache/seatunnel/blob/dev/docs/en/connector-v2/sink/Jdbc.md)
+
+Note: Schema evolution does not support transforms at the moment. Schema evolution across different database types (e.g. Oracle-CDC -> Jdbc-Mysql) currently does not support column default values in DDL.
-Note: The schema evolution is not support the transform at now.
+When you use Oracle-CDC, you cannot use a username of `SYS` or `SYSTEM` to modify the table schema, otherwise the DDL events will be filtered out, which can cause schema evolution to stop working.
+In addition, if your table name starts with `ORA_TEMP_`, it will have the same problem.
## Enable schema evolution
-Schema evolution is disabled by default in CDC source. You need configure `debezium.include.schema.changes = true` which is only supported in MySQL-CDC to enable it.
+Schema evolution is disabled by default in CDC sources. To enable it, you need to configure `debezium.include.schema.changes = true`, which is only supported in CDC sources. When you use Oracle-CDC with schema evolution enabled, you must specify `redo_log_catalog` as the `log.mining.strategy` in the `debezium` attribute.
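+
+For example, the relevant fragment of an Oracle-CDC source definition (taken from the full examples below) looks like this:
+
+```
+Oracle-CDC {
+  # ... connection options omitted ...
+  debezium {
+    include.schema.changes = true
+    log.mining.strategy = redo_log_catalog
+  }
+}
+```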
## Examples
@@ -56,3 +62,92 @@ sink {
}
}
```
+
+### Oracle-cdc -> Jdbc-Oracle
+```
+env {
+ # You can set engine configuration here
+ parallelism = 1
+ job.mode = "STREAMING"
+ checkpoint.interval = 5000
+}
+
+source {
+ # This is a example source plugin **only for test and demonstrate the feature source plugin**
+ Oracle-CDC {
+ plugin_output = "customers"
+ username = "dbzuser"
+ password = "dbz"
+ database-names = ["ORCLCDB"]
+ schema-names = ["DEBEZIUM"]
+ table-names = ["ORCLCDB.DEBEZIUM.FULL_TYPES"]
+ base-url = "jdbc:oracle:thin:@oracle-host:1521/ORCLCDB"
+ source.reader.close.timeout = 120000
+ connection.pool.size = 1
+ debezium {
+ include.schema.changes = true
+ log.mining.strategy = redo_log_catalog
+ }
+ }
+}
+
+sink {
+ Jdbc {
+ plugin_input = "customers"
+ driver = "oracle.jdbc.driver.OracleDriver"
+ url = "jdbc:oracle:thin:@oracle-host:1521/ORCLCDB"
+ user = "dbzuser"
+ password = "dbz"
+ generate_sink_sql = true
+ database = "ORCLCDB"
+ table = "DEBEZIUM.FULL_TYPES_SINK"
+ batch_size = 1
+ primary_keys = ["ID"]
+ connection.pool.size = 1
+ }
+}
+```
+
+### Oracle-cdc -> Jdbc-Mysql
+```
+env {
+ # You can set engine configuration here
+ parallelism = 1
+ job.mode = "STREAMING"
+ checkpoint.interval = 5000
+}
+
+source {
+ # This is a example source plugin **only for test and demonstrate the feature source plugin**
+ Oracle-CDC {
+ plugin_output = "customers"
+ username = "dbzuser"
+ password = "dbz"
+ database-names = ["ORCLCDB"]
+ schema-names = ["DEBEZIUM"]
+ table-names = ["ORCLCDB.DEBEZIUM.FULL_TYPES"]
+ base-url = "jdbc:oracle:thin:@oracle-host:1521/ORCLCDB"
+ source.reader.close.timeout = 120000
+ connection.pool.size = 1
+ debezium {
+ include.schema.changes = true
+ log.mining.strategy = redo_log_catalog
+ }
+ }
+}
+
+sink {
+ jdbc {
+ plugin_input = "customers"
+ url = "jdbc:mysql://oracle-host:3306/oracle_sink"
+ driver = "com.mysql.cj.jdbc.Driver"
+ user = "st_user_sink"
+ password = "mysqlpw"
+ generate_sink_sql = true
+ # You need to configure both database and table
+ database = oracle_sink
+ table = oracle_cdc_2_mysql_sink_table
+ primary_keys = ["ID"]
+ }
+}
+```
diff --git a/docs/en/concept/schema-feature.md b/docs/en/concept/schema-feature.md
index feb94cc640a..3a4e83e06e5 100644
--- a/docs/en/concept/schema-feature.md
+++ b/docs/en/concept/schema-feature.md
@@ -172,6 +172,46 @@ constraintKeys = [
| INDEX_KEY | key |
| UNIQUE_KEY | unique key |
+## Multi table schemas
+
+If your source contains multiple tables, you can declare a schema for each of them through `tables_configs`:
+
+```
+tables_configs = [
+ {
+ schema {
+ table = "database.schema.table1"
+ schema_first = false
+ comment = "comment"
+ columns = [
+ ...
+ ]
+ primaryKey {
+ ...
+ }
+ constraintKeys {
+ ...
+ }
+ }
+ },
+ {
+ schema = {
+ table = "database.schema.table2"
+ schema_first = false
+ comment = "comment"
+ columns = [
+ ...
+ ]
+ primaryKey {
+ ...
+ }
+ constraintKeys {
+ ...
+ }
+ }
+ }
+]
+
+```
+
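+A filled-in sketch with two small tables (illustrative names; the column, primaryKey and constraintKeys entries use the same format documented above and can be extended as needed):
+
+```
+tables_configs = [
+  {
+    schema = {
+      table = "database.schema.table1"
+      columns = [
+        {
+          name = id
+          type = bigint
+          nullable = false
+          comment = "primary key id"
+        },
+        {
+          name = name
+          type = string
+          nullable = true
+          comment = "user name"
+        }
+      ]
+      primaryKey {
+        name = "id"
+        columnNames = [id]
+      }
+    }
+  },
+  {
+    schema = {
+      table = "database.schema.table2"
+      columns = [
+        {
+          name = id
+          type = bigint
+          nullable = false
+        },
+        {
+          name = score
+          type = int
+          nullable = true
+        }
+      ]
+    }
+  }
+]
+```
+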
## How to use schema
### Recommended
@@ -180,7 +220,7 @@ constraintKeys = [
source {
FakeSource {
parallelism = 2
- result_table_name = "fake"
+ plugin_output = "fake"
row.num = 16
schema {
table = "FakeDatabase.FakeTable"
@@ -234,7 +274,7 @@ If you only need to define the column, you can use fields to define the column,
source {
FakeSource {
parallelism = 2
- result_table_name = "fake"
+ plugin_output = "fake"
row.num = 16
schema = {
fields {
diff --git a/docs/en/connector-v2/Config-Encryption-Decryption.md b/docs/en/connector-v2/Config-Encryption-Decryption.md
index a5ff8e3439a..341fbf11a1b 100644
--- a/docs/en/connector-v2/Config-Encryption-Decryption.md
+++ b/docs/en/connector-v2/Config-Encryption-Decryption.md
@@ -42,7 +42,7 @@ Next, I'll show how to quickly use SeaTunnel's own `base64` encryption:
source {
MySQL-CDC {
- result_table_name = "fake"
+ plugin_output = "fake"
parallelism = 1
server-id = 5656
port = 56725
@@ -96,7 +96,7 @@ Next, I'll show how to quickly use SeaTunnel's own `base64` encryption:
"port" : 56725,
"database-name" : "inventory_vwyw0n",
"parallelism" : 1,
- "result_table_name" : "fake",
+ "plugin_output" : "fake",
"table-name" : "products",
"plugin_name" : "MySQL-CDC",
"server-id" : 5656,
diff --git a/docs/en/connector-v2/formats/avro.md b/docs/en/connector-v2/formats/avro.md
index 8fef411fb58..62cb19b95da 100644
--- a/docs/en/connector-v2/formats/avro.md
+++ b/docs/en/connector-v2/formats/avro.md
@@ -51,7 +51,7 @@ source {
}
}
}
- result_table_name = "fake"
+ plugin_output = "fake"
}
}
@@ -76,7 +76,7 @@ source {
Kafka {
bootstrap.servers = "kafkaCluster:9092"
topic = "test_avro_topic"
- result_table_name = "kafka_table"
+ plugin_output = "kafka_table"
start_mode = "earliest"
format = avro
format_error_handle_way = skip
@@ -104,7 +104,7 @@ source {
sink {
Console {
- source_table_name = "kafka_table"
+ plugin_input = "kafka_table"
}
}
```
diff --git a/docs/en/connector-v2/formats/canal-json.md b/docs/en/connector-v2/formats/canal-json.md
index 6e133a9a82a..cb8aa3d5edb 100644
--- a/docs/en/connector-v2/formats/canal-json.md
+++ b/docs/en/connector-v2/formats/canal-json.md
@@ -85,7 +85,7 @@ source {
Kafka {
bootstrap.servers = "kafkaCluster:9092"
topic = "products_binlog"
- result_table_name = "kafka_name"
+ plugin_output = "kafka_name"
start_mode = earliest
schema = {
fields {
diff --git a/docs/en/connector-v2/formats/cdc-compatible-debezium-json.md b/docs/en/connector-v2/formats/cdc-compatible-debezium-json.md
index b35501a62a7..564eb2356ce 100644
--- a/docs/en/connector-v2/formats/cdc-compatible-debezium-json.md
+++ b/docs/en/connector-v2/formats/cdc-compatible-debezium-json.md
@@ -17,7 +17,7 @@ env {
source {
MySQL-CDC {
- result_table_name = "table1"
+ plugin_output = "table1"
base-url="jdbc:mysql://localhost:3306/test"
"startup.mode"=INITIAL
@@ -43,9 +43,10 @@ source {
sink {
Kafka {
- source_table_name = "table1"
+ plugin_input = "table1"
bootstrap.servers = "localhost:9092"
+ topic = "${topic}"
# compatible_debezium_json options
format = compatible_debezium_json
diff --git a/docs/en/connector-v2/formats/debezium-json.md b/docs/en/connector-v2/formats/debezium-json.md
index 5f71e14f09d..e296d2404e1 100644
--- a/docs/en/connector-v2/formats/debezium-json.md
+++ b/docs/en/connector-v2/formats/debezium-json.md
@@ -84,7 +84,7 @@ source {
Kafka {
bootstrap.servers = "kafkaCluster:9092"
topic = "products_binlog"
- result_table_name = "kafka_name"
+ plugin_output = "kafka_name"
start_mode = earliest
schema = {
fields {
diff --git a/docs/en/connector-v2/formats/kafka-compatible-kafkaconnect-json.md b/docs/en/connector-v2/formats/kafka-compatible-kafkaconnect-json.md
index def638367ca..32ad5808c1c 100644
--- a/docs/en/connector-v2/formats/kafka-compatible-kafkaconnect-json.md
+++ b/docs/en/connector-v2/formats/kafka-compatible-kafkaconnect-json.md
@@ -16,7 +16,7 @@ source {
Kafka {
bootstrap.servers = "localhost:9092"
topic = "jdbc_source_record"
- result_table_name = "kafka_table"
+ plugin_output = "kafka_table"
start_mode = earliest
schema = {
fields {
diff --git a/docs/en/connector-v2/formats/maxwell-json.md b/docs/en/connector-v2/formats/maxwell-json.md
index 5e1c851d9e9..d271d71624a 100644
--- a/docs/en/connector-v2/formats/maxwell-json.md
+++ b/docs/en/connector-v2/formats/maxwell-json.md
@@ -62,7 +62,7 @@ source {
Kafka {
bootstrap.servers = "kafkaCluster:9092"
topic = "products_binlog"
- result_table_name = "kafka_name"
+ plugin_output = "kafka_name"
start_mode = earliest
schema = {
fields {
diff --git a/docs/en/connector-v2/formats/ogg-json.md b/docs/en/connector-v2/formats/ogg-json.md
index 3faeb33c4f0..fb14802aaa4 100644
--- a/docs/en/connector-v2/formats/ogg-json.md
+++ b/docs/en/connector-v2/formats/ogg-json.md
@@ -66,7 +66,7 @@ source {
Kafka {
bootstrap.servers = "127.0.0.1:9092"
topic = "ogg"
- result_table_name = "kafka_name"
+ plugin_output = "kafka_name"
start_mode = earliest
schema = {
fields {
diff --git a/docs/en/connector-v2/formats/protobuf.md b/docs/en/connector-v2/formats/protobuf.md
new file mode 100644
index 00000000000..916da551b76
--- /dev/null
+++ b/docs/en/connector-v2/formats/protobuf.md
@@ -0,0 +1,163 @@
+# Protobuf Format
+
+Protobuf (Protocol Buffers) is a language-neutral, platform-independent data serialization format developed by Google. It provides an efficient way to encode structured data and supports multiple programming languages and platforms.
+
+Currently, Protobuf format can be used with Kafka.
+
+## Kafka Usage Example
+
+- Example of simulating a randomly generated data source and writing it to Kafka in Protobuf format
+
+```hocon
+env {
+ parallelism = 1
+ job.mode = "BATCH"
+}
+
+source {
+ FakeSource {
+ parallelism = 1
+ plugin_output = "fake"
+ row.num = 16
+ schema = {
+ fields {
+ c_int32 = int
+ c_int64 = long
+ c_float = float
+ c_double = double
+ c_bool = boolean
+ c_string = string
+ c_bytes = bytes
+
+ Address {
+ city = string
+ state = string
+ street = string
+ }
+ attributes = "map"
+ phone_numbers = "array"
+ }
+ }
+ }
+}
+
+sink {
+ kafka {
+ topic = "test_protobuf_topic_fake_source"
+ bootstrap.servers = "kafkaCluster:9092"
+ format = protobuf
+ kafka.request.timeout.ms = 60000
+ kafka.config = {
+ acks = "all"
+ request.timeout.ms = 60000
+ buffer.memory = 33554432
+ }
+ protobuf_message_name = Person
+ protobuf_schema = """
+ syntax = "proto3";
+
+ package org.apache.seatunnel.format.protobuf;
+
+ option java_outer_classname = "ProtobufE2E";
+
+ message Person {
+ int32 c_int32 = 1;
+ int64 c_int64 = 2;
+ float c_float = 3;
+ double c_double = 4;
+ bool c_bool = 5;
+ string c_string = 6;
+ bytes c_bytes = 7;
+
+ message Address {
+ string street = 1;
+ string city = 2;
+ string state = 3;
+ string zip = 4;
+ }
+
+ Address address = 8;
+
+ map attributes = 9;
+
+ repeated string phone_numbers = 10;
+ }
+ """
+ }
+}
+```
+
+- Example of reading data from Kafka in Protobuf format and printing it to the console
+
+```hocon
+env {
+ parallelism = 1
+ job.mode = "BATCH"
+}
+
+source {
+ Kafka {
+ topic = "test_protobuf_topic_fake_source"
+ format = protobuf
+ protobuf_message_name = Person
+ protobuf_schema = """
+ syntax = "proto3";
+
+ package org.apache.seatunnel.format.protobuf;
+
+ option java_outer_classname = "ProtobufE2E";
+
+ message Person {
+ int32 c_int32 = 1;
+ int64 c_int64 = 2;
+ float c_float = 3;
+ double c_double = 4;
+ bool c_bool = 5;
+ string c_string = 6;
+ bytes c_bytes = 7;
+
+ message Address {
+ string street = 1;
+ string city = 2;
+ string state = 3;
+ string zip = 4;
+ }
+
+ Address address = 8;
+
+ map attributes = 9;
+
+ repeated string phone_numbers = 10;
+ }
+ """
+ schema = {
+ fields {
+ c_int32 = int
+ c_int64 = long
+ c_float = float
+ c_double = double
+ c_bool = boolean
+ c_string = string
+ c_bytes = bytes
+
+ Address {
+ city = string
+ state = string
+ street = string
+ }
+ attributes = "map"
+ phone_numbers = "array"
+ }
+ }
+ bootstrap.servers = "kafkaCluster:9092"
+ start_mode = "earliest"
+ plugin_output = "kafka_table"
+ }
+}
+
+sink {
+ Console {
+ plugin_input = "kafka_table"
+ }
+}
+```
\ No newline at end of file
diff --git a/docs/en/connector-v2/sink-common-options.md b/docs/en/connector-v2/sink-common-options.md
index 20ceda3dfe7..c452adc801e 100644
--- a/docs/en/connector-v2/sink-common-options.md
+++ b/docs/en/connector-v2/sink-common-options.md
@@ -6,13 +6,19 @@ sidebar_position: 4
> Common parameters of sink connectors
-| Name | Type | Required | Default | Description |
-|-------------------|--------|----------|---------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| source_table_name | String | No | - | When `source_table_name` is not specified, the current plug-in processes the data set `dataset` output by the previous plugin in the configuration file When `source_table_name` is specified, the current plug-in is processing the data set corresponding to this parameter. |
+:::warn
+
+The old configuration name `source_table_name` is deprecated; please migrate to the new name `plugin_input` as soon as possible.
+
+:::
+
+| Name | Type | Required | Default | Description |
+|--------------|--------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| plugin_input | String | No | - | When `plugin_input` is not specified, the current plug-in processes the data set `dataset` output by the previous plugin in the configuration file. When `plugin_input` is specified, the current plug-in processes the data set corresponding to this parameter. |
# Important note
-When the job configuration `source_table_name` you must set the `result_table_name` parameter
+When the job configures `plugin_input`, you must also set the `plugin_output` parameter
## Task Example
@@ -24,34 +30,34 @@ When the job configuration `source_table_name` you must set the `result_table_na
source {
FakeSourceStream {
parallelism = 2
- result_table_name = "fake"
+ plugin_output = "fake"
field_name = "name,age"
}
}
transform {
Filter {
- source_table_name = "fake"
+ plugin_input = "fake"
fields = [name]
- result_table_name = "fake_name"
+ plugin_output = "fake_name"
}
Filter {
- source_table_name = "fake"
+ plugin_input = "fake"
fields = [age]
- result_table_name = "fake_age"
+ plugin_output = "fake_age"
}
}
sink {
Console {
- source_table_name = "fake_name"
+ plugin_input = "fake_name"
}
Console {
- source_table_name = "fake_age"
+ plugin_input = "fake_age"
}
}
```
-> If the job only have one source and one(or zero) transform and one sink, You do not need to specify `source_table_name` and `result_table_name` for connector.
-> If the number of any operator in source, transform and sink is greater than 1, you must specify the `source_table_name` and `result_table_name` for each connector in the job.
+> If the job only has one source, one (or zero) transform and one sink, you do not need to specify `plugin_input` and `plugin_output` for the connectors.
+> If any of source, transform or sink contains more than one plugin, you must specify `plugin_input` and `plugin_output` for each connector in the job.
diff --git a/docs/en/connector-v2/sink/AmazonSqs.md b/docs/en/connector-v2/sink/AmazonSqs.md
index 8efabfa395b..4a43349b388 100644
--- a/docs/en/connector-v2/sink/AmazonSqs.md
+++ b/docs/en/connector-v2/sink/AmazonSqs.md
@@ -70,7 +70,7 @@ source {
}
}
}
- result_table_name = "fake"
+ plugin_output = "fake"
}
}
diff --git a/docs/en/connector-v2/sink/Assert.md b/docs/en/connector-v2/sink/Assert.md
index bc5b4c1bf32..026adddfae3 100644
--- a/docs/en/connector-v2/sink/Assert.md
+++ b/docs/en/connector-v2/sink/Assert.md
@@ -267,13 +267,13 @@ source {
]
}
]
- result_table_name = "fake"
+ plugin_output = "fake"
}
}
sink{
Assert {
- source_table_name = "fake"
+ plugin_input = "fake"
rules =
{
row_rules = [
diff --git a/docs/en/connector-v2/sink/Console.md b/docs/en/connector-v2/sink/Console.md
index a1c3b570baf..3493915d029 100644
--- a/docs/en/connector-v2/sink/Console.md
+++ b/docs/en/connector-v2/sink/Console.md
@@ -44,7 +44,7 @@ env {
source {
FakeSource {
- result_table_name = "fake"
+ plugin_output = "fake"
schema = {
fields {
name = "string"
@@ -56,7 +56,7 @@ source {
sink {
Console {
- source_table_name = "fake"
+ plugin_input = "fake"
}
}
```
@@ -73,7 +73,7 @@ env {
source {
FakeSource {
- result_table_name = "fake1"
+ plugin_output = "fake1"
schema = {
fields {
id = "int"
@@ -84,7 +84,7 @@ source {
}
}
FakeSource {
- result_table_name = "fake2"
+ plugin_output = "fake2"
schema = {
fields {
name = "string"
@@ -96,10 +96,10 @@ source {
sink {
Console {
- source_table_name = "fake1"
+ plugin_input = "fake1"
}
Console {
- source_table_name = "fake2"
+ plugin_input = "fake2"
}
}
```
diff --git a/docs/en/connector-v2/sink/DB2.md b/docs/en/connector-v2/sink/DB2.md
index 92df20bd63d..7902c31f08c 100644
--- a/docs/en/connector-v2/sink/DB2.md
+++ b/docs/en/connector-v2/sink/DB2.md
@@ -101,7 +101,7 @@ source {
# This is a example source plugin **only for test and demonstrate the feature source plugin**
FakeSource {
parallelism = 1
- result_table_name = "fake"
+ plugin_output = "fake"
row.num = 16
schema = {
fields {
diff --git a/docs/en/connector-v2/sink/HdfsFile.md b/docs/en/connector-v2/sink/HdfsFile.md
index 9c2aec0c54b..3060e8ac8b2 100644
--- a/docs/en/connector-v2/sink/HdfsFile.md
+++ b/docs/en/connector-v2/sink/HdfsFile.md
@@ -93,7 +93,7 @@ source {
# This is a example source plugin **only for test and demonstrate the feature source plugin**
FakeSource {
parallelism = 1
- result_table_name = "fake"
+ plugin_output = "fake"
row.num = 16
schema = {
fields {
diff --git a/docs/en/connector-v2/sink/Hive.md b/docs/en/connector-v2/sink/Hive.md
index 147fd766a9f..20f3d22cb86 100644
--- a/docs/en/connector-v2/sink/Hive.md
+++ b/docs/en/connector-v2/sink/Hive.md
@@ -8,7 +8,7 @@ Write data to Hive.
:::tip
-In order to use this connector, You must ensure your spark/flink cluster already integrated hive. The tested hive version is 2.3.9.
+In order to use this connector, you must ensure your Spark/Flink cluster has already integrated Hive. The tested Hive versions are 2.3.9 and 3.1.3.
If you use SeaTunnel Engine, You need put seatunnel-hadoop3-3.1.4-uber.jar and hive-exec-3.1.3.jar and libfb303-0.9.3.jar in $SEATUNNEL_HOME/lib/ dir.
:::
@@ -182,6 +182,78 @@ sink {
}
```
+### example2: Kerberos
+
+```bash
+sink {
+ Hive {
+ table_name = "default.test_hive_sink_on_hdfs_with_kerberos"
+ metastore_uri = "thrift://metastore:9083"
+ hive_site_path = "/tmp/hive-site.xml"
+ kerberos_principal = "hive/metastore.seatunnel@EXAMPLE.COM"
+ kerberos_keytab_path = "/tmp/hive.keytab"
+ krb5_path = "/tmp/krb5.conf"
+ }
+}
+```
+
+Description:
+
+- `hive_site_path`: The path to the `hive-site.xml` file.
+- `kerberos_principal`: The principal for Kerberos authentication.
+- `kerberos_keytab_path`: The keytab file path for Kerberos authentication.
+- `krb5_path`: The path to the `krb5.conf` file used for Kerberos authentication.
+
+Run the case:
+
+```bash
+env {
+ parallelism = 1
+ job.mode = "BATCH"
+}
+
+source {
+ FakeSource {
+ schema = {
+ fields {
+ pk_id = bigint
+ name = string
+ score = int
+ }
+ primaryKey {
+ name = "pk_id"
+ columnNames = [pk_id]
+ }
+ }
+ rows = [
+ {
+ kind = INSERT
+ fields = [1, "A", 100]
+ },
+ {
+ kind = INSERT
+ fields = [2, "B", 100]
+ },
+ {
+ kind = INSERT
+ fields = [3, "C", 100]
+ }
+ ]
+ }
+}
+
+sink {
+ Hive {
+ table_name = "default.test_hive_sink_on_hdfs_with_kerberos"
+ metastore_uri = "thrift://metastore:9083"
+ hive_site_path = "/tmp/hive-site.xml"
+ kerberos_principal = "hive/metastore.seatunnel@EXAMPLE.COM"
+ kerberos_keytab_path = "/tmp/hive.keytab"
+ krb5_path = "/tmp/krb5.conf"
+ }
+}
+```
+
## Hive on s3
### Step 1
@@ -395,26 +467,3 @@ sink {
}
}
```
-
-## Changelog
-
-### 2.2.0-beta 2022-09-26
-
-- Add Hive Sink Connector
-
-### 2.3.0-beta 2022-10-20
-
-- [Improve] Hive Sink supports automatic partition repair ([3133](https://github.com/apache/seatunnel/pull/3133))
-
-### 2.3.0 2022-12-30
-
-- [BugFix] Fixed the following bugs that failed to write data to files ([3258](https://github.com/apache/seatunnel/pull/3258))
- - When field from upstream is null it will throw NullPointerException
- - Sink columns mapping failed
- - When restore writer from states getting transaction directly failed
-
-### Next version
-
-- [Improve] Support kerberos authentication ([3840](https://github.com/apache/seatunnel/pull/3840))
-- [Improve] Added partition_dir_expression validation logic ([3886](https://github.com/apache/seatunnel/pull/3886))
-
diff --git a/docs/en/connector-v2/sink/Hudi.md b/docs/en/connector-v2/sink/Hudi.md
index 6c424fde15e..ea4c066d2f8 100644
--- a/docs/en/connector-v2/sink/Hudi.md
+++ b/docs/en/connector-v2/sink/Hudi.md
@@ -8,7 +8,7 @@ Used to write data to Hudi.
## Key features
-- [x] [exactly-once](../../concept/connector-v2-features.md)
+- [ ] [exactly-once](../../concept/connector-v2-features.md)
- [x] [cdc](../../concept/connector-v2-features.md)
- [x] [support multiple table write](../../concept/connector-v2-features.md)
@@ -21,7 +21,6 @@ Base configuration:
| table_dfs_path | string | yes | - |
| conf_files_path | string | no | - |
| table_list | Array | no | - |
-| auto_commit | boolean | no | true |
| schema_save_mode | enum | no | CREATE_SCHEMA_WHEN_NOT_EXIST|
| common-options | Config | no | - |
@@ -44,6 +43,7 @@ Table list configuration:
| index_type | enum | no | BLOOM |
| index_class_name | string | no | - |
| record_byte_size | Int | no | 1024 |
+| cdc_enabled | boolean | no | false |
Note: When this configuration corresponds to a single table, you can flatten the configuration items in table_list to the outer layer.
@@ -115,9 +115,9 @@ Note: When this configuration corresponds to a single table, you can flatten the
`max_commits_to_keep` The max commits to keep of hudi table.
-### auto_commit [boolean]
+### cdc_enabled [boolean]
-`auto_commit` Automatic transaction commit is enabled by default.
+`cdc_enabled` Whether to persist the CDC change log. When enabled, the change data is persisted when necessary and the table can be queried in CDC query mode.
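+
+A hypothetical single-table sketch that persists the change log (`table_dfs_path`, `database` and `table_name` values are illustrative; `cdc_enabled` is the option being demonstrated):
+
+```hocon
+sink {
+  Hudi {
+    table_dfs_path = "hdfs://nameservice/tmp/hudi"
+    database = "st"
+    table_name = "test_table"
+    cdc_enabled = true
+  }
+}
+```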
### schema_save_mode [Enum]
diff --git a/docs/en/connector-v2/sink/Iceberg.md b/docs/en/connector-v2/sink/Iceberg.md
index 721c5ea7c08..54c46b849bf 100644
--- a/docs/en/connector-v2/sink/Iceberg.md
+++ b/docs/en/connector-v2/sink/Iceberg.md
@@ -59,7 +59,7 @@ libfb303-xxx.jar
## Sink Options
-| Name | Type | Required | Default | Description |
+| Name | Type | Required | Default | Description |
|----------------------------------------|---------|----------|------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| catalog_name | string | yes | default | User-specified catalog name. default is `default` |
| namespace | string | yes | default | The iceberg database name in the backend catalog. default is `default` |
@@ -76,6 +76,7 @@ libfb303-xxx.jar
| iceberg.table.upsert-mode-enabled | boolean | no | false | Set to `true` to enable upsert mode, default is `false` |
| schema_save_mode | Enum | no | CREATE_SCHEMA_WHEN_NOT_EXIST | the schema save mode, please refer to `schema_save_mode` below |
| data_save_mode | Enum | no | APPEND_DATA | the data save mode, please refer to `data_save_mode` below |
+| custom_sql | string | no | - | Custom `delete` data sql for data save mode. e.g: `delete from ... where ...` |
| iceberg.table.commit-branch | string | no | - | Default branch for commits |
## Task Example
@@ -91,7 +92,7 @@ env {
source {
MySQL-CDC {
- result_table_name = "customers_mysql_cdc_iceberg"
+ plugin_output = "customers_mysql_cdc_iceberg"
server-id = 5652
username = "st_user"
password = "seatunnel"
diff --git a/docs/en/connector-v2/sink/Jdbc.md b/docs/en/connector-v2/sink/Jdbc.md
index 1ddbdd507d9..9b86a27721d 100644
--- a/docs/en/connector-v2/sink/Jdbc.md
+++ b/docs/en/connector-v2/sink/Jdbc.md
@@ -226,7 +226,7 @@ In the case of is_exactly_once = "true", Xa transactions are used. This requires
there are some reference value for params above.
-| datasource | driver | url | xa_data_source_class_name | maven |
+| datasource | driver | url | xa_data_source_class_name | maven |
|-------------------|----------------------------------------------|--------------------------------------------------------------------|----------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------|
| MySQL | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | com.mysql.cj.jdbc.MysqlXADataSource | https://mvnrepository.com/artifact/mysql/mysql-connector-java |
| PostgreSQL | org.postgresql.Driver | jdbc:postgresql://localhost:5432/postgres | org.postgresql.xa.PGXADataSource | https://mvnrepository.com/artifact/org.postgresql/postgresql |
@@ -235,7 +235,7 @@ there are some reference value for params above.
| SQL Server | com.microsoft.sqlserver.jdbc.SQLServerDriver | jdbc:sqlserver://localhost:1433 | com.microsoft.sqlserver.jdbc.SQLServerXADataSource | https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc |
| Oracle | oracle.jdbc.OracleDriver | jdbc:oracle:thin:@localhost:1521/xepdb1 | oracle.jdbc.xa.OracleXADataSource | https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8 |
| sqlite | org.sqlite.JDBC | jdbc:sqlite:test.db | / | https://mvnrepository.com/artifact/org.xerial/sqlite-jdbc |
-| GBase8a | com.gbase.jdbc.Driver | jdbc:gbase://e2e_gbase8aDb:5258/test | / | https://cdn.gbase.cn/products/30/p5CiVwXBKQYIUGN8ecHvk/gbase-connector-java-9.5.0.7-build1-bin.jar |
+| GBase8a | com.gbase.jdbc.Driver | jdbc:gbase://e2e_gbase8aDb:5258/test | / | https://cdn.gbase.cn/products/30/p5CiVwXBKQYIUGN8ecHvk/gbase-connector-java-9.5.0.7-build1-bin.jar |
| StarRocks | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | / | https://mvnrepository.com/artifact/mysql/mysql-connector-java |
| db2 | com.ibm.db2.jcc.DB2Driver | jdbc:db2://localhost:50000/testdb | com.ibm.db2.jcc.DB2XADataSource | https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc/db2jcc4 |
| saphana | com.sap.db.jdbc.Driver | jdbc:sap://localhost:39015 | / | https://mvnrepository.com/artifact/com.sap.cloud.db.jdbc/ngdbc |
@@ -245,9 +245,10 @@ there are some reference value for params above.
| Snowflake | net.snowflake.client.jdbc.SnowflakeDriver | jdbc:snowflake://.snowflakecomputing.com | / | https://mvnrepository.com/artifact/net.snowflake/snowflake-jdbc |
| Vertica | com.vertica.jdbc.Driver | jdbc:vertica://localhost:5433 | / | https://repo1.maven.org/maven2/com/vertica/jdbc/vertica-jdbc/12.0.3-0/vertica-jdbc-12.0.3-0.jar |
| Kingbase | com.kingbase8.Driver | jdbc:kingbase8://localhost:54321/db_test | / | https://repo1.maven.org/maven2/cn/com/kingbase/kingbase8/8.6.0/kingbase8-8.6.0.jar |
-| OceanBase | com.oceanbase.jdbc.Driver | jdbc:oceanbase://localhost:2881 | / | https://repo1.maven.org/maven2/com/oceanbase/oceanbase-client/2.4.11/oceanbase-client-2.4.11.jar |
+| OceanBase | com.oceanbase.jdbc.Driver | jdbc:oceanbase://localhost:2881 | / | https://repo1.maven.org/maven2/com/oceanbase/oceanbase-client/2.4.12/oceanbase-client-2.4.12.jar |
| xugu | com.xugu.cloudjdbc.Driver | jdbc:xugu://localhost:5138 | / | https://repo1.maven.org/maven2/com/xugudb/xugu-jdbc/12.2.0/xugu-jdbc-12.2.0.jar |
| InterSystems IRIS | com.intersystems.jdbc.IRISDriver | jdbc:IRIS://localhost:1972/%SYS | / | https://raw.githubusercontent.com/intersystems-community/iris-driver-distribution/main/JDBC/JDK18/intersystems-jdbc-3.8.4.jar |
+| opengauss | org.opengauss.Driver | jdbc:opengauss://localhost:5432/postgres | / | https://repo1.maven.org/maven2/org/opengauss/opengauss-jdbc/5.1.0-og/opengauss-jdbc-5.1.0-og.jar |
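+
+For instance, a minimal illustrative sink block for the newly listed opengauss driver (credentials and SQL are placeholders) could look like:
+
+```hocon
+sink {
+    jdbc {
+        url = "jdbc:opengauss://localhost:5432/postgres"
+        driver = "org.opengauss.Driver"
+        user = "root"
+        password = "123456"
+        query = "insert into test_table(name, age) values(?, ?)"
+    }
+}
+```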
## Example
diff --git a/docs/en/connector-v2/sink/Kafka.md b/docs/en/connector-v2/sink/Kafka.md
index 9868e44f602..d201582e38b 100644
--- a/docs/en/connector-v2/sink/Kafka.md
+++ b/docs/en/connector-v2/sink/Kafka.md
@@ -111,7 +111,7 @@ env {
source {
FakeSource {
parallelism = 1
- result_table_name = "fake"
+ plugin_output = "fake"
row.num = 16
schema = {
fields {
diff --git a/docs/en/connector-v2/sink/Kingbase.md b/docs/en/connector-v2/sink/Kingbase.md
index d985517f9ca..d4a5b8b56d5 100644
--- a/docs/en/connector-v2/sink/Kingbase.md
+++ b/docs/en/connector-v2/sink/Kingbase.md
@@ -105,7 +105,7 @@ source {
# This is a example source plugin **only for test and demonstrate the feature source plugin**
FakeSource {
parallelism = 1
- result_table_name = "fake"
+ plugin_output = "fake"
row.num = 16
schema = {
fields {
diff --git a/docs/en/connector-v2/sink/Kudu.md b/docs/en/connector-v2/sink/Kudu.md
index c67b79385fd..d95501ebd17 100644
--- a/docs/en/connector-v2/sink/Kudu.md
+++ b/docs/en/connector-v2/sink/Kudu.md
@@ -67,7 +67,7 @@ env {
}
source {
FakeSource {
- result_table_name = "kudu"
+ plugin_output = "kudu"
schema = {
fields {
id = int
@@ -114,7 +114,7 @@ env {
sink {
kudu{
- source_table_name = "kudu"
+ plugin_input = "kudu"
kudu_masters = "kudu-master-cdc:7051"
table_name = "kudu_sink_table"
enable_kerberos = true
diff --git a/docs/en/connector-v2/sink/Mivlus.md b/docs/en/connector-v2/sink/Milvus.md
similarity index 79%
rename from docs/en/connector-v2/sink/Mivlus.md
rename to docs/en/connector-v2/sink/Milvus.md
index 081f427a5df..6b6598fae30 100644
--- a/docs/en/connector-v2/sink/Mivlus.md
+++ b/docs/en/connector-v2/sink/Milvus.md
@@ -4,8 +4,11 @@
## Description
-Write data to Milvus or Zilliz Cloud
-
+This Milvus sink connector writes data to Milvus or Zilliz Cloud. It has the following features:
+- supports reading and writing data by partition
+- supports writing dynamic schema data from the Metadata Column
+- converts JSON data to a JSON string and sinks it as JSON as well
+- retries automatically to bypass rate limits and gRPC limits
## Key Features
- [x] [batch](../../concept/connector-v2-features.md)
@@ -34,7 +37,7 @@ Write data to Milvus or Zilliz Cloud
## Sink Options
-| Name | Type | Required | Default | Description |
+| Name | Type | Required | Default | Description |
|----------------------|---------|----------|------------------------------|-----------------------------------------------------------|
| url | String | Yes | - | The URL to connect to Milvus or Zilliz Cloud. |
| token | String | Yes | - | User:password |
@@ -44,6 +47,7 @@ Write data to Milvus or Zilliz Cloud
| enable_upsert | boolean | No | false | Upsert data not insert. |
| enable_dynamic_field | boolean | No | true | Enable create table with dynamic field. |
| batch_size | int | No | 1000 | Write batch size. |
+| partition_key | String | No | | Milvus partition key field |
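+
+For example, a hypothetical sink sketch using the new `partition_key` option (the connection values are placeholders):
+
+```hocon
+sink {
+  Milvus {
+    url = "http://127.0.0.1:19530"
+    token = "username:password"
+    partition_key = "region"
+  }
+}
+```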
## Task Example
diff --git a/docs/en/connector-v2/sink/Mysql.md b/docs/en/connector-v2/sink/Mysql.md
index 6151394b809..78c2e342fd9 100644
--- a/docs/en/connector-v2/sink/Mysql.md
+++ b/docs/en/connector-v2/sink/Mysql.md
@@ -112,7 +112,7 @@ source {
# This is a example source plugin **only for test and demonstrate the feature source plugin**
FakeSource {
parallelism = 1
- result_table_name = "fake"
+ plugin_output = "fake"
row.num = 16
schema = {
fields {
diff --git a/docs/en/connector-v2/sink/OceanBase.md b/docs/en/connector-v2/sink/OceanBase.md
index 6ebe101b188..accbbd72cd4 100644
--- a/docs/en/connector-v2/sink/OceanBase.md
+++ b/docs/en/connector-v2/sink/OceanBase.md
@@ -111,7 +111,7 @@ source {
# This is a example source plugin **only for test and demonstrate the feature source plugin**
FakeSource {
parallelism = 1
- result_table_name = "fake"
+ plugin_output = "fake"
row.num = 16
schema = {
fields {
diff --git a/docs/en/connector-v2/sink/Oracle.md b/docs/en/connector-v2/sink/Oracle.md
index fefc31e4e18..d42e3b00fb4 100644
--- a/docs/en/connector-v2/sink/Oracle.md
+++ b/docs/en/connector-v2/sink/Oracle.md
@@ -110,7 +110,7 @@ env {
source {
FakeSource {
parallelism = 1
- result_table_name = "fake"
+ plugin_output = "fake"
row.num = 16
schema = {
fields {
diff --git a/docs/en/connector-v2/sink/Paimon.md b/docs/en/connector-v2/sink/Paimon.md
index 8133b6e8360..68c0755cfd3 100644
--- a/docs/en/connector-v2/sink/Paimon.md
+++ b/docs/en/connector-v2/sink/Paimon.md
@@ -31,7 +31,7 @@ libfb303-xxx.jar
## Options
-| name | type | required | default value | Description |
+| name | type | required | default value | Description |
|-----------------------------|--------|----------|------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| warehouse | String | Yes | - | Paimon warehouse path |
| catalog_type | String | No | filesystem | Catalog type of Paimon, support filesystem and hive |
@@ -43,7 +43,7 @@ libfb303-xxx.jar
| data_save_mode | Enum | No | APPEND_DATA | The data save mode |
| paimon.table.primary-keys | String | No | - | Default comma-separated list of columns (primary key) that identify a row in tables.(Notice: The partition field needs to be included in the primary key fields) |
| paimon.table.partition-keys | String | No | - | Default comma-separated list of partition fields to use when creating tables. |
-| paimon.table.write-props | Map | No | - | Properties passed through to paimon table initialization, [reference](https://paimon.apache.org/docs/master/maintenance/configurations/#coreoptions). |
+| paimon.table.write-props | Map | No | - | Properties passed through to paimon table initialization, [reference](https://paimon.apache.org/docs/master/maintenance/configurations/#coreoptions). |
| paimon.hadoop.conf | Map | No | - | Properties in hadoop conf |
| paimon.hadoop.conf-path | String | No | - | The specified loading path for the 'core-site.xml', 'hdfs-site.xml', 'hive-site.xml' files |
@@ -52,9 +52,21 @@ You must configure the `changelog-producer=input` option to enable the changelog
The changelog producer mode of the paimon table has [four mode](https://paimon.apache.org/docs/master/primary-key-table/changelog-producer/) which is `none`、`input`、`lookup` and `full-compaction`.
-Currently, we only support the `none` and `input` mode. The default is `none` which will not output the changelog file. The `input` mode will output the changelog file in paimon table.
+All `changelog-producer` modes are currently supported. The default is `none`.
+
+* [`none`](https://paimon.apache.org/docs/master/primary-key-table/changelog-producer/#none)
+* [`input`](https://paimon.apache.org/docs/master/primary-key-table/changelog-producer/#input)
+* [`lookup`](https://paimon.apache.org/docs/master/primary-key-table/changelog-producer/#lookup)
+* [`full-compaction`](https://paimon.apache.org/docs/master/primary-key-table/changelog-producer/#full-compaction)
+> Note:
+> When you read a paimon table in streaming mode, different modes will produce [different results](https://github.com/apache/seatunnel/blob/dev/docs/en/connector-v2/source/Paimon.md#changelog).
+
+## Filesystems
+The Paimon connector supports writing data to multiple file systems. Currently, the supported file systems are hdfs and s3.
+If you use the s3 filesystem, you can configure the `fs.s3a.access-key`, `fs.s3a.secret-key`, `fs.s3a.endpoint`, `fs.s3a.path.style.access` and `fs.s3a.aws.credentials.provider` properties in the `paimon.hadoop.conf` option.
+Besides, the warehouse path should start with `s3a://`.
+
-When you use a streaming mode to read paimon table, these two mode will produce [different results](https://github.com/apache/seatunnel/blob/dev/docs/en/connector-v2/source/Paimon.md#changelog).
## Examples
@@ -89,6 +101,53 @@ sink {
}
```
+### Single table with s3 filesystem
+
+```hocon
+env {
+ execution.parallelism = 1
+ job.mode = "BATCH"
+}
+
+source {
+ FakeSource {
+ schema = {
+ fields {
+ c_map = "map"
+ c_array = "array"
+ c_string = string
+ c_boolean = boolean
+ c_tinyint = tinyint
+ c_smallint = smallint
+ c_int = int
+ c_bigint = bigint
+ c_float = float
+ c_double = double
+ c_bytes = bytes
+ c_date = date
+ c_decimal = "decimal(38, 18)"
+ c_timestamp = timestamp
+ }
+ }
+ }
+}
+
+sink {
+ Paimon {
+ warehouse = "s3a://test/"
+ database = "seatunnel_namespace11"
+ table = "st_test"
+ paimon.hadoop.conf = {
+ fs.s3a.access-key=G52pnxg67819khOZ9ezX
+ fs.s3a.secret-key=SHJuAQqHsLrgZWikvMa3lJf5T0NfM5LMFliJh9HF
+ fs.s3a.endpoint="http://minio4:9000"
+ fs.s3a.path.style.access=true
+ fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider
+ }
+ }
+}
+```
+
### Single table(Specify hadoop HA config and kerberos config)
```hocon
@@ -250,6 +309,38 @@ sink {
}
```
+#### Write with the `changelog-producer` attribute
+
+```hocon
+env {
+ parallelism = 1
+ job.mode = "STREAMING"
+ checkpoint.interval = 5000
+}
+
+source {
+ Mysql-CDC {
+ base-url = "jdbc:mysql://127.0.0.1:3306/seatunnel"
+ username = "root"
+ password = "******"
+ table-names = ["seatunnel.role"]
+ }
+}
+
+sink {
+ Paimon {
+ catalog_name = "seatunnel_test"
+ warehouse = "file:///tmp/seatunnel/paimon/hadoop-sink/"
+ database = "seatunnel"
+ table = "role"
+ paimon.table.write-props = {
+ changelog-producer = full-compaction
+ changelog-tmp-path = /tmp/paimon/changelog
+ }
+ }
+}
+```
+
### Write to dynamic bucket table
Single dynamic bucket table with write props of paimon,operates on the primary key table and bucket is -1.
diff --git a/docs/en/connector-v2/sink/PostgreSql.md b/docs/en/connector-v2/sink/PostgreSql.md
index cde299f6734..cf4bc2e3ada 100644
--- a/docs/en/connector-v2/sink/PostgreSql.md
+++ b/docs/en/connector-v2/sink/PostgreSql.md
@@ -154,7 +154,7 @@ env {
source {
FakeSource {
parallelism = 1
- result_table_name = "fake"
+ plugin_output = "fake"
row.num = 16
schema = {
fields {
diff --git a/docs/en/connector-v2/sink/Prometheus.md b/docs/en/connector-v2/sink/Prometheus.md
new file mode 100644
index 00000000000..7852a87d5b5
--- /dev/null
+++ b/docs/en/connector-v2/sink/Prometheus.md
@@ -0,0 +1,103 @@
+# Prometheus
+
+> Prometheus sink connector
+
+## Support Those Engines
+
+> Spark
+> Flink
+> SeaTunnel Zeta
+
+## Key Features
+
+- [ ] [exactly-once](../../concept/connector-v2-features.md)
+- [ ] [cdc](../../concept/connector-v2-features.md)
+- [x] [support multiple table write](../../concept/connector-v2-features.md)
+
+## Description
+
+Used to write data to Prometheus using HTTP requests (web hooks).
+
+> For example, if the data from upstream is [`label: {"__name__": "test1"}, value: 1.23, time: 2024-08-15T17:00:00`], the body content is the following: `{"label":{"__name__": "test1"}, "value":"1.23","time":"2024-08-15T17:00:00"}`
+
+**Tips: The Prometheus sink only supports `post json` webhooks, and the data from the source will be treated as the body content of the web hook. It does not support passing past data.**
+
+## Supported DataSource Info
+
+In order to use the Prometheus connector, the following dependencies are required.
+They can be downloaded via install-plugin.sh or from the Maven central repository.
+
+| Datasource | Supported Versions | Dependency |
+|------------|--------------------|------------------------------------------------------------------------------------------------------------------|
+| Http | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2/connector-prometheus) |
+
+## Sink Options
+
+| Name | Type | Required | Default | Description |
+|-----------------------------|--------|----------|---------|-------------------------------------------------------------------------------------------------------------|
+| url | String | Yes | - | Http request url |
+| headers | Map | No | - | Http headers |
+| retry                       | Int    | No       | -       | The max retry times if the http request returns an `IOException`                                             |
+| retry_backoff_multiplier_ms | Int    | No       | 100     | The retry-backoff time (millis) multiplier if the http request failed                                        |
+| retry_backoff_max_ms        | Int    | No       | 10000   | The maximum retry-backoff time (millis) if the http request failed                                           |
+| connect_timeout_ms          | Int    | No       | 12000   | Connection timeout setting, default 12s.                                                                     |
+| socket_timeout_ms           | Int    | No       | 60000   | Socket timeout setting, default 60s.                                                                         |
+| key_timestamp               | Int    | No       | -       | Prometheus timestamp key.                                                                                    |
+| key_label                   | String | Yes      | -       | Prometheus label key                                                                                         |
+| key_value                   | Double | Yes      | -       | Prometheus value                                                                                             |
+| batch_size                  | Int    | No       | 1024    | Prometheus batch write size                                                                                  |
+| flush_interval              | Long   | No       | 300000L | Prometheus flush commit interval                                                                             |
+| common-options | | No | - | Sink plugin common parameters, please refer to [Sink Common Options](../sink-common-options.md) for details |
+
+## Example
+
+simple:
+
+```hocon
+env {
+ parallelism = 1
+ job.mode = "BATCH"
+}
+
+source {
+ FakeSource {
+ schema = {
+ fields {
+ c_map = "map"
+ c_double = double
+ c_timestamp = timestamp
+ }
+ }
+ plugin_output = "fake"
+ rows = [
+ {
+ kind = INSERT
+ fields = [{"__name__": "test1"}, 1.23, "2024-08-15T17:00:00"]
+ },
+ {
+ kind = INSERT
+ fields = [{"__name__": "test2"}, 1.23, "2024-08-15T17:00:00"]
+ }
+ ]
+ }
+}
+
+
+sink {
+ Prometheus {
+ url = "http://prometheus:9090/api/v1/write"
+ key_label = "c_map"
+ key_value = "c_double"
+ key_timestamp = "c_timestamp"
+ batch_size = 1
+ }
+}
+
+```
+
+## Changelog
+
+### 2.3.8-beta 2024-08-22
+
+- Add Prometheus Sink Connector
+
diff --git a/docs/en/connector-v2/sink/Pulsar.md b/docs/en/connector-v2/sink/Pulsar.md
index a0fc5bd092a..3e29eabbea5 100644
--- a/docs/en/connector-v2/sink/Pulsar.md
+++ b/docs/en/connector-v2/sink/Pulsar.md
@@ -145,7 +145,7 @@ env {
source {
FakeSource {
parallelism = 1
- result_table_name = "fake"
+ plugin_output = "fake"
row.num = 16
schema = {
fields {
@@ -161,7 +161,7 @@ sink {
topic = "example"
client.service-url = "localhost:pulsar://localhost:6650"
admin.service-url = "http://my-broker.example.com:8080"
- result_table_name = "test"
+ plugin_output = "test"
pulsar.config = {
sendTimeoutMs = 30000
}
diff --git a/docs/en/connector-v2/sink/Redis.md b/docs/en/connector-v2/sink/Redis.md
index b5f444bb117..5b37720891b 100644
--- a/docs/en/connector-v2/sink/Redis.md
+++ b/docs/en/connector-v2/sink/Redis.md
@@ -12,21 +12,25 @@ Used to write data to Redis.
## Options
-| name | type | required | default value |
-|----------------|--------|-----------------------|---------------|
-| host | string | yes | - |
-| port | int | yes | - |
-| key | string | yes | - |
-| data_type | string | yes | - |
-| batch_size | int | no | 10 |
-| user | string | no | - |
-| auth | string | no | - |
-| db_num | int | no | 0 |
-| mode | string | no | single |
-| nodes | list | yes when mode=cluster | - |
-| format | string | no | json |
-| expire | long | no | -1 |
-| common-options | | no | - |
+| name | type | required | default value |
+|--------------------|---------|-----------------------|---------------|
+| host | string | yes | - |
+| port | int | yes | - |
+| key | string | yes | - |
+| data_type | string | yes | - |
+| batch_size | int | no | 10 |
+| user | string | no | - |
+| auth | string | no | - |
+| db_num | int | no | 0 |
+| mode | string | no | single |
+| nodes | list | yes when mode=cluster | - |
+| format | string | no | json |
+| expire | long | no | -1 |
+| support_custom_key | boolean | no | false |
+| value_field | string | no | - |
+| hash_key_field | string | no | - |
+| hash_value_field | string | no | - |
+| common-options | | no | - |
### host [string]
@@ -50,12 +54,12 @@ Upstream data is the following:
| 500 | internal error | false |
If you assign field name to `code` and data_type to `key`, two data will be written to redis:
-1. `200 -> {code: 200, message: true, data: get success}`
-2. `500 -> {code: 500, message: false, data: internal error}`
+1. `200 -> {code: 200, data: get success, success: true}`
+2. `500 -> {code: 500, data: internal error, success: false}`
If you assign field name to `value` and data_type to `key`, only one data will be written to redis because `value` is not existed in upstream data's fields:
-1. `value -> {code: 500, message: false, data: internal error}`
+1. `value -> {code: 500, data: internal error, success: false}`
Please see the data_type section for specific writing rules.
@@ -85,7 +89,7 @@ Redis data types, support `key` `hash` `list` `set` `zset`
> Each data from upstream will be added to the configured zset key with a weight of 1. So the order of data in zset is based on the order of data consumption.
>
- ### batch_size [int]
+### batch_size [int]
ensure the batch write size in single-machine mode; no guarantees in cluster mode.
@@ -135,6 +139,61 @@ Connector will generate data as the following and write it to redis:
Set redis expiration time, the unit is second. The default value is -1, keys do not automatically expire by default.
+### support_custom_key [boolean]
+
+If true, the key can be customized by a field value in the upstream data.
+
+Upstream data is the following:
+
+| code | data | success |
+|------|----------------|---------|
+| 200 | get success | true |
+| 500 | internal error | false |
+
+You can customize the Redis key using '{' and '}', and the field name in '{}' will be parsed and replaced by the field value in the upstream data. For example, if you assign the field name to `{code}` and data_type to `key`, two records will be written to redis:
+1. `200 -> {code: 200, data: get success, success: true}`
+2. `500 -> {code: 500, data: internal error, success: false}`
+
+The Redis key can be composed of fixed and variable parts, connected by ':'. For example, if you assign the field name to `code:{code}` and data_type to `key`, two records will be written to redis:
+1. `code:200 -> {code: 200, data: get success, success: true}`
+2. `code:500 -> {code: 500, data: internal error, success: false}`
+
+### value_field [string]
+
+The field of the value you want to write to redis; `data_type` supports `key`, `list`, `set` and `zset`.
+
+For example, when you assign the field name to `value`, set `value_field` to `data` and `data_type` to `key`:
+
+Upstream data is the following:
+
+| code | data | success |
+|------|-------------|---------|
+| 200 | get success | true |
+
+The following data will be written to redis:
+1. `value -> get success`
+
+### hash_key_field [string]
+
+The field of the hash key you want to write to redis; `data_type` supports `hash`.
+
+### hash_value_field [string]
+
+The field of the hash value you want to write to redis; `data_type` supports `hash`.
+
+For example, when you assign the field name to `value`, set `hash_key_field` to `data`, `hash_value_field` to `success` and `data_type` to `hash`:
+
+Upstream data is the following:
+
+| code | data | success |
+|------|-------------|---------|
+| 200 | get success | true |
+
+The following data will be written to redis:
+1. `value -> get success | true`
+
### common options
Sink plugin common parameters, please refer to [Sink Common Options](../sink-common-options.md) for details
@@ -152,6 +211,43 @@ Redis {
}
```
+custom key:
+
+```hocon
+Redis {
+ host = localhost
+ port = 6379
+ key = "name:{name}"
+ support_custom_key = true
+ data_type = key
+}
+```
+
+custom value:
+
+```hocon
+Redis {
+ host = localhost
+ port = 6379
+ key = person
+ value_field = "name"
+ data_type = key
+}
+```
+
+custom HashKey and HashValue:
+
+```hocon
+Redis {
+ host = localhost
+ port = 6379
+ key = person
+ hash_key_field = "name"
+ hash_value_field = "age"
+ data_type = hash
+}
+```
+
## Changelog
### 2.2.0-beta 2022-09-26
diff --git a/docs/en/connector-v2/sink/RocketMQ.md b/docs/en/connector-v2/sink/RocketMQ.md
index f0672a3c7f6..f1a7fd86234 100644
--- a/docs/en/connector-v2/sink/RocketMQ.md
+++ b/docs/en/connector-v2/sink/RocketMQ.md
@@ -32,6 +32,7 @@ Write Rows to a Apache RocketMQ topic.
| access.key | String | no | | When ACL_ENABLED is true, access key cannot be empty |
| secret.key | String | no | | When ACL_ENABLED is true, secret key cannot be empty |
| producer.group | String | no | SeaTunnel-producer-Group | SeaTunnel-producer-Group |
+| tag | String | no | - | `RocketMQ` message tag. |
| partition.key.fields | array | no | - | - |
| format | String | no | json | Data format. The default format is json. Optional text format. The default field separator is ",".If you customize the delimiter, add the "field_delimiter" option. |
| field.delimiter | String | no | , | Customize the field delimiter for data format. |
@@ -114,7 +115,7 @@ source {
Rocketmq {
name.srv.addr = "localhost:9876"
topics = "test_topic"
- result_table_name = "rocketmq_table"
+ plugin_output = "rocketmq_table"
schema = {
fields {
c_map = "map"
@@ -160,7 +161,7 @@ source {
Rocketmq {
name.srv.addr = "localhost:9876"
topics = "test_topic"
- result_table_name = "rocketmq_table"
+ plugin_output = "rocketmq_table"
start.mode = "CONSUME_FROM_FIRST_OFFSET"
batch.size = "400"
consumer.group = "test_topic_group"
diff --git a/docs/en/connector-v2/sink/S3File.md b/docs/en/connector-v2/sink/S3File.md
index 007c9395f7d..4251fe7f532 100644
--- a/docs/en/connector-v2/sink/S3File.md
+++ b/docs/en/connector-v2/sink/S3File.md
@@ -315,7 +315,7 @@ source {
# This is a example source plugin **only for test and demonstrate the feature source plugin**
FakeSource {
parallelism = 1
- result_table_name = "fake"
+ plugin_output = "fake"
row.num = 16
schema = {
fields {
diff --git a/docs/en/connector-v2/sink/SftpFile.md b/docs/en/connector-v2/sink/SftpFile.md
index a383cc72da5..4509baa7cff 100644
--- a/docs/en/connector-v2/sink/SftpFile.md
+++ b/docs/en/connector-v2/sink/SftpFile.md
@@ -63,6 +63,8 @@ By default, we use 2PC commit to ensure `exactly-once`
| parquet_avro_write_timestamp_as_int96 | boolean | no | false | Only used when file_format is parquet. |
| parquet_avro_write_fixed_as_int96 | array | no | - | Only used when file_format is parquet. |
| encoding | string | no | "UTF-8" | Only used when file_format_type is json,text,csv,xml. |
+| schema_save_mode | string | no | CREATE_SCHEMA_WHEN_NOT_EXIST | Existing dir processing method |
+| data_save_mode | string | no | APPEND_DATA | Existing data processing method |
### host [string]
@@ -220,6 +222,19 @@ Support writing Parquet INT96 from a 12-byte field, only valid for parquet files
Only used when file_format_type is json,text,csv,xml.
The encoding of the file to write. This param will be parsed by `Charset.forName(encoding)`.
+### schema_save_mode [string]
+Existing dir processing method.
+- RECREATE_SCHEMA: create the dir when it does not exist; delete and recreate it when it exists
+- CREATE_SCHEMA_WHEN_NOT_EXIST: create the dir when it does not exist; skip when it exists
+- ERROR_WHEN_SCHEMA_NOT_EXIST: report an error when the dir does not exist
+- IGNORE: ignore the table
+
+### data_save_mode [string]
+Existing data processing method.
+- DROP_DATA: preserve dir and delete data files
+- APPEND_DATA: preserve dir, preserve data files
+- ERROR_WHEN_DATA_EXISTS: when there is data files, an error is reported
+
## Example
For text file format with `have_partition` and `custom_filename` and `sink_columns`
@@ -247,6 +262,35 @@ SftpFile {
is_enable_transaction = true
}
+```
+
+When the source has multiple tables and you want each table written to a different directory, you can configure it this way:
+
+```hocon
+SftpFile {
+ host = "xxx.xxx.xxx.xxx"
+ port = 22
+ user = "username"
+ password = "password"
+ path = "/data/sftp/seatunnel/job1/${table_name}"
+ tmp_path = "/data/sftp/seatunnel/tmp"
+ file_format_type = "text"
+ field_delimiter = "\t"
+ row_delimiter = "\n"
+ have_partition = true
+ partition_by = ["age"]
+ partition_dir_expression = "${k0}=${v0}"
+ is_partition_field_write_in_file = true
+ custom_filename = true
+ file_name_expression = "${transactionId}_${now}"
+ filename_time_format = "yyyy.MM.dd"
+ sink_columns = ["name","age"]
+ is_enable_transaction = true
+ schema_save_mode=RECREATE_SCHEMA
+ data_save_mode=DROP_DATA
+}
+
+
```
## Changelog
diff --git a/docs/en/connector-v2/sink/Sls.md b/docs/en/connector-v2/sink/Sls.md
new file mode 100644
index 00000000000..487786548d0
--- /dev/null
+++ b/docs/en/connector-v2/sink/Sls.md
@@ -0,0 +1,84 @@
+# Sls
+
+> Sls sink connector
+
+## Support Those Engines
+
+> Spark
+> Flink
+> SeaTunnel Zeta
+
+## Key Features
+
+- [ ] [exactly-once](../../concept/connector-v2-features.md)
+- [ ] [cdc](../../concept/connector-v2-features.md)
+
+## Description
+
+Sink connector for Aliyun Sls.
+
+## Supported DataSource Info
+
+In order to use the Sls connector, the following dependencies are required.
+They can be downloaded via install-plugin.sh or from the Maven central repository.
+
+| Datasource | Supported Versions | Maven |
+|------------|--------------------|-----------------------------------------------------------------------------------|
+| Sls | Universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/connector-sls) |
+
+## Sink Options
+
+| Name | Type | Required | Default | Description |
+|-------------------------------------|---------|----------|------------------|--------------------------------------------------------------------------------------------------------------------------------------------------|
+| project | String | Yes | - | [Aliyun Sls Project](https://help.aliyun.com/zh/sls/user-guide/manage-a-project?spm=a2c4g.11186623.0.0.6f9755ebyfaYSl) |
+| logstore | String | Yes | - | [Aliyun Sls Logstore](https://help.aliyun.com/zh/sls/user-guide/manage-a-logstore?spm=a2c4g.11186623.0.0.13137c08nfuiBC) |
+| endpoint | String | Yes | - | [Aliyun Access Endpoint](https://help.aliyun.com/zh/sls/developer-reference/api-sls-2020-12-30-endpoint?spm=a2c4g.11186623.0.0.548945a8UyJULa) |
+| access_key_id | String | Yes | - | [Aliyun AccessKey ID](https://help.aliyun.com/zh/ram/user-guide/create-an-accesskey-pair?spm=a2c4g.11186623.0.0.4a6e4e554CKhSc#task-2245479) |
+| access_key_secret | String | Yes | - | [Aliyun AccessKey Secret](https://help.aliyun.com/zh/ram/user-guide/create-an-accesskey-pair?spm=a2c4g.11186623.0.0.4a6e4e554CKhSc#task-2245479) |
+| source | String | No | SeaTunnel-Source | Data source mark written to SLS |
+| topic | String | No | SeaTunnel-Topic | Data topic mark written to SLS |
+
+## Task Example
+
+### Simple
+
+> This example writes data to SLS's logstore1. If you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) to install and deploy SeaTunnel. Then follow the instructions in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job.
+
+[Create RAM user and authorization](https://help.aliyun.com/zh/sls/create-a-ram-user-and-authorize-the-ram-user-to-access-log-service?spm=a2c4g.11186623.0.i4). Please ensure the RAM user has sufficient permissions to perform the operations; see [RAM Custom Authorization Example](https://help.aliyun.com/zh/sls/use-custom-policies-to-grant-permissions-to-a-ram-user?spm=a2c4g.11186623.0.0.4a6e4e554CKhSc#reference-s3z-m1l-z2b).
+
+```hocon
+# Defining the runtime environment
+env {
+ parallelism = 2
+ job.mode = "STREAMING"
+ checkpoint.interval = 30000
+}
+source {
+ FakeSource {
+ row.num = 10
+ map.size = 10
+ array.size = 10
+ bytes.length = 10
+ string.length = 10
+ schema = {
+ fields = {
+ id = "int"
+ name = "string"
+ description = "string"
+ weight = "string"
+ }
+ }
+ }
+}
+
+sink {
+ Sls {
+ endpoint = "cn-hangzhou-intranet.log.aliyuncs.com"
+ project = "project1"
+ logstore = "logstore1"
+ access_key_id = "xxxxxxxxxxxxxxxxxxxxxxxx"
+ access_key_secret = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+ }
+}
+```
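+
+The optional `source` and `topic` marks from the options table can also be set on the sink. A minimal sketch (the endpoint, project, and credentials are placeholders, and the mark values are illustrative):
+
+```hocon
+sink {
+  Sls {
+    endpoint = "cn-hangzhou-intranet.log.aliyuncs.com"
+    project = "project1"
+    logstore = "logstore1"
+    access_key_id = "xxxxxxxxxxxxxxxxxxxxxxxx"
+    access_key_secret = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+    # optional marks recorded on the written logs (see the options table above)
+    source = "seatunnel-host-01"
+    topic = "app-log"
+  }
+}
+```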
+
diff --git a/docs/en/connector-v2/sink/Snowflake.md b/docs/en/connector-v2/sink/Snowflake.md
index f40cb2b675d..dd84bcc2cef 100644
--- a/docs/en/connector-v2/sink/Snowflake.md
+++ b/docs/en/connector-v2/sink/Snowflake.md
@@ -89,7 +89,7 @@ source {
# This is a example source plugin **only for test and demonstrate the feature source plugin**
FakeSource {
parallelism = 1
- result_table_name = "fake"
+ plugin_output = "fake"
row.num = 16
schema = {
fields {
diff --git a/docs/en/connector-v2/sink/Socket.md b/docs/en/connector-v2/sink/Socket.md
index 581a1a5caab..cdd5b7b034d 100644
--- a/docs/en/connector-v2/sink/Socket.md
+++ b/docs/en/connector-v2/sink/Socket.md
@@ -39,7 +39,7 @@ env {
source {
FakeSource {
- result_table_name = "fake"
+ plugin_output = "fake"
schema = {
fields {
name = "string"
diff --git a/docs/en/connector-v2/sink/SqlServer.md b/docs/en/connector-v2/sink/SqlServer.md
index a9f6bdd8955..3a03d3a2df8 100644
--- a/docs/en/connector-v2/sink/SqlServer.md
+++ b/docs/en/connector-v2/sink/SqlServer.md
@@ -147,7 +147,7 @@ sink {
```
Jdbc {
- source_table_name = "customers"
+ plugin_input = "customers"
driver = com.microsoft.sqlserver.jdbc.SQLServerDriver
url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test"
user = SA
diff --git a/docs/en/connector-v2/sink/Typesense.md b/docs/en/connector-v2/sink/Typesense.md
index 8700d68dc77..f3c78af1617 100644
--- a/docs/en/connector-v2/sink/Typesense.md
+++ b/docs/en/connector-v2/sink/Typesense.md
@@ -77,7 +77,7 @@ Simple example:
```bash
sink {
Typesense {
- source_table_name = "typesense_test_table"
+ plugin_input = "typesense_test_table"
hosts = ["localhost:8108"]
collection = "typesense_to_typesense_sink_with_query"
max_retry_count = 3
diff --git a/docs/en/connector-v2/sink/Vertica.md b/docs/en/connector-v2/sink/Vertica.md
index ef303b59453..04aa77f0e6f 100644
--- a/docs/en/connector-v2/sink/Vertica.md
+++ b/docs/en/connector-v2/sink/Vertica.md
@@ -109,7 +109,7 @@ source {
# This is a example source plugin **only for test and demonstrate the feature source plugin**
FakeSource {
parallelism = 1
- result_table_name = "fake"
+ plugin_output = "fake"
row.num = 16
schema = {
fields {
diff --git a/docs/en/connector-v2/source-common-options.md b/docs/en/connector-v2/source-common-options.md
index a66eb34a44c..1c40f287796 100644
--- a/docs/en/connector-v2/source-common-options.md
+++ b/docs/en/connector-v2/source-common-options.md
@@ -6,14 +6,20 @@ sidebar_position: 3
> Common parameters of source connectors
-| Name | Type | Required | Default | Description |
-|-------------------|--------|----------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| result_table_name | String | No | - | When `result_table_name` is not specified, the data processed by this plugin will not be registered as a data set `(dataStream/dataset)` that can be directly accessed by other plugins, or called a temporary table `(table)` When `result_table_name` is specified, the data processed by this plugin will be registered as a data set `(dataStream/dataset)` that can be directly accessed by other plugins, or called a temporary table `(table)` . The data set `(dataStream/dataset)` registered here can be directly accessed by other plugins by specifying `source_table_name` . |
-| parallelism | Int | No | - | When `parallelism` is not specified, the `parallelism` in env is used by default. When parallelism is specified, it will override the parallelism in env. |
+:::warn
+
+The old configuration name `result_table_name` is deprecated, please migrate to the new name `plugin_output` as soon as possible.
+
+:::
+
+| Name | Type | Required | Default | Description |
+|---------------|--------|----------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| plugin_output | String | No | - | When `plugin_output` is not specified, the data processed by this plugin will not be registered as a data set `(dataStream/dataset)` that can be directly accessed by other plugins, also called a temporary table `(table)`. When `plugin_output` is specified, the data processed by this plugin will be registered as such a data set. The registered data set can be directly accessed by other plugins by specifying `plugin_input`. |
+| parallelism | Int | No | - | When `parallelism` is not specified, the `parallelism` in env is used by default. When parallelism is specified, it will override the parallelism in env. |
# Important note
-When the job configuration `result_table_name` you must set the `source_table_name` parameter
+When `plugin_output` is configured in the job, you must also set the `plugin_input` parameter
## Task Example
@@ -24,7 +30,7 @@ When the job configuration `result_table_name` you must set the `source_table_na
```bash
source {
FakeSourceStream {
- result_table_name = "fake_table"
+ plugin_output = "fake_table"
}
}
```
@@ -40,7 +46,7 @@ env {
source {
FakeSource {
- result_table_name = "fake"
+ plugin_output = "fake"
row.num = 100
schema = {
fields {
@@ -64,9 +70,9 @@ source {
transform {
Sql {
- source_table_name = "fake"
- result_table_name = "fake1"
- # the query table name must same as field 'source_table_name'
+ plugin_input = "fake"
+ plugin_output = "fake1"
+ # the query table name must same as field 'plugin_input'
query = "select id, regexp_replace(name, '.+', 'b') as name, age+1 as age, pi() as pi, c_timestamp, c_date, c_map, c_array, c_decimal, c_row from fake"
}
# The SQL transform support base function and criteria operation
@@ -75,10 +81,10 @@ transform {
sink {
Console {
- source_table_name = "fake1"
+ plugin_input = "fake1"
}
Console {
- source_table_name = "fake"
+ plugin_input = "fake"
}
}
```
diff --git a/docs/en/connector-v2/source/Cassandra.md b/docs/en/connector-v2/source/Cassandra.md
index d4d4e97088a..32966de1e36 100644
--- a/docs/en/connector-v2/source/Cassandra.md
+++ b/docs/en/connector-v2/source/Cassandra.md
@@ -67,7 +67,7 @@ source {
datacenter = "datacenter1"
keyspace = "test"
cql = "select * from source_table"
- result_table_name = "source_table"
+ plugin_output = "source_table"
}
}
```
diff --git a/docs/en/connector-v2/source/Clickhouse.md b/docs/en/connector-v2/source/Clickhouse.md
index 47907bd3025..e3048894ff7 100644
--- a/docs/en/connector-v2/source/Clickhouse.md
+++ b/docs/en/connector-v2/source/Clickhouse.md
@@ -80,7 +80,7 @@ source {
username = "xxxxx"
password = "xxxxx"
server_time_zone = "UTC"
- result_table_name = "test"
+ plugin_output = "test"
clickhouse.config = {
"socket_timeout": "300000"
}
diff --git a/docs/en/connector-v2/source/CosFile.md b/docs/en/connector-v2/source/CosFile.md
index 702439c3062..1cbda880139 100644
--- a/docs/en/connector-v2/source/CosFile.md
+++ b/docs/en/connector-v2/source/CosFile.md
@@ -45,7 +45,7 @@ To use this connector you need put hadoop-cos-{hadoop.version}-{version}.jar and
## Options
-| name | type | required | default value |
+| name | type | required | default value |
|---------------------------|---------|----------|---------------------|
| path | string | yes | - |
| file_format_type | string | yes | - |
@@ -64,7 +64,7 @@ To use this connector you need put hadoop-cos-{hadoop.version}-{version}.jar and
| sheet_name | string | no | - |
| xml_row_tag | string | no | - |
| xml_use_attr_format | boolean | no | - |
-| file_filter_pattern | string | no | - |
+| file_filter_pattern | string | no | |
| compress_codec | string | no | none |
| archive_compress_codec | string | no | none |
| encoding | string | no | UTF-8 |
@@ -275,6 +275,55 @@ Specifies Whether to process data using the tag attribute format.
Filter pattern, which used for filtering files.
+The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression.
+There are some examples.
+
+File Structure Example:
+```
+/data/seatunnel/20241001/report.txt
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+/data/seatunnel/20241012/logo.png
+```
+Matching Rules Example:
+
+**Example 1**: *Match all .txt files*, Regular Expression:
+```
+/data/seatunnel/20241001/.*\.txt
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241001/report.txt
+```
+**Example 2**: *Match all files starting with abc*, Regular Expression:
+```
+/data/seatunnel/20241002/abc.*
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+```
+**Example 3**: *Match all files starting with abc whose fourth character is either h or g*, Regular Expression:
+```
+/data/seatunnel/20241007/abc[h,g].*
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241007/abch202410.csv
+```
+**Example 4**: *Match third-level folders starting with 202410 and files ending with .csv*, Regular Expression:
+```
+/data/seatunnel/202410\d*/.*\.csv
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+```
+
### compress_codec [string]
The compress codec of files and the details that supported as the following shown:
@@ -294,6 +343,7 @@ The compress codec of archive files and the details that supported as the follow
| ZIP | txt,json,excel,xml | .zip |
| TAR | txt,json,excel,xml | .tar |
| TAR_GZ | txt,json,excel,xml | .tar.gz |
+| GZ | txt,json,xml | .gz |
| NONE | all | .* |
### encoding [string]
@@ -372,6 +422,33 @@ sink {
```
+### Filter File
+
+```hocon
+env {
+ parallelism = 1
+ job.mode = "BATCH"
+}
+
+source {
+ CosFile {
+ bucket = "cosn://seatunnel-test-1259587829"
+ secret_id = "xxxxxxxxxxxxxxxxxxx"
+ secret_key = "xxxxxxxxxxxxxxxxxxx"
+ region = "ap-chengdu"
+ path = "/seatunnel/read/binary/"
+ file_format_type = "binary"
+ // file example abcD2024.csv
+ file_filter_pattern = "abc[DX]*.*"
+ }
+}
+
+sink {
+ Console {
+ }
+}
+```
+
## Changelog
### next version
diff --git a/docs/en/connector-v2/source/Doris.md b/docs/en/connector-v2/source/Doris.md
index c67444b58c8..373b84f8fdd 100644
--- a/docs/en/connector-v2/source/Doris.md
+++ b/docs/en/connector-v2/source/Doris.md
@@ -13,15 +13,14 @@
- [x] [batch](../../concept/connector-v2-features.md)
- [ ] [stream](../../concept/connector-v2-features.md)
- [ ] [exactly-once](../../concept/connector-v2-features.md)
-- [x] [schema projection](../../concept/connector-v2-features.md)
+- [x] [column projection](../../concept/connector-v2-features.md)
- [x] [parallelism](../../concept/connector-v2-features.md)
- [x] [support user-defined split](../../concept/connector-v2-features.md)
+- [x] [support multiple table read](../../concept/connector-v2-features.md)
## Description
-Used to read data from Doris.
-Doris Source will send a SQL to FE, FE will parse it into an execution plan, send it to BE, and BE will
-directly return the data
+Used to read data from Apache Doris.
## Supported DataSource Info
@@ -29,11 +28,6 @@ directly return the data
|------------|--------------------------------------|--------|-----|-------|
| Doris | Only Doris2.0 or later is supported. | - | - | - |
-## Database Dependency
-
-> Please download the support list corresponding to 'Maven' and copy it to the '$SEATNUNNEL_HOME/plugins/jdbc/lib/'
-> working directory
-
## Data Type Mapping
| Doris Data type | SeaTunnel Data type |
@@ -54,29 +48,40 @@ directly return the data
## Source Options
+Base configuration:
+
| Name | Type | Required | Default | Description |
|----------------------------------|--------|----------|------------|-----------------------------------------------------------------------------------------------------|
| fenodes | string | yes | - | FE address, the format is `"fe_host:fe_http_port"` |
| username | string | yes | - | User username |
| password | string | yes | - | User password |
+| doris.request.retries | int | no | 3 | Number of retries to send requests to Doris FE. |
+| doris.request.read.timeout.ms    | int    | no       | 30000      | Read timeout (in milliseconds) for requests sent to Doris FE.                                         |
+| doris.request.connect.timeout.ms | int    | no       | 30000      | Connect timeout (in milliseconds) for requests sent to Doris FE.                                      |
+| query-port | string | no | 9030 | Doris QueryPort |
+| doris.request.query.timeout.s | int | no | 3600 | Timeout period of Doris scan data, expressed in seconds. |
+| table_list                       | string | no       | -          | The list of tables to be read; see the table list configuration below.                               |
+
+Table list configuration:
+
+| Name | Type | Required | Default | Description |
+|----------------------------------|--------|----------|------------|-----------------------------------------------------------------------------------------------------|
| database | string | yes | - | The name of Doris database |
| table | string | yes | - | The name of Doris table |
| doris.read.field | string | no | - | Use the 'doris.read.field' parameter to select the doris table columns to read |
-| query-port | string | no | 9030 | Doris QueryPort |
| doris.filter.query | string | no | - | Data filtering in doris. the format is "field = value",example : doris.filter.query = "F_ID > 2" |
| doris.batch.size | int | no | 1024 | The maximum value that can be obtained by reading Doris BE once. |
-| doris.request.query.timeout.s | int | no | 3600 | Timeout period of Doris scan data, expressed in seconds. |
| doris.exec.mem.limit | long | no | 2147483648 | Maximum memory that can be used by a single be scan request. The default memory is 2G (2147483648). |
-| doris.request.retries | int | no | 3 | Number of retries to send requests to Doris FE. |
-| doris.request.read.timeout.ms | int | no | 30000 | |
-| doris.request.connect.timeout.ms | int | no | 30000 | |
+
+Note: When reading a single table, the configuration items in `table_list` can be flattened into the outer layer, as in the sketch below.
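+
+A minimal sketch of a flattened single-table read (the connection values are placeholders taken from the examples in this page):
+
+```hocon
+Doris {
+  fenodes = "xxxx:8030"
+  username = root
+  password = ""
+  # table options flattened from table_list for a single table
+  database = "st_source_0"
+  table = "doris_table_0"
+  doris.read.field = "F_ID,F_INT,F_BIGINT"
+  doris.filter.query = "F_ID >= 50"
+}
+```
+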
### Tips
> It is not recommended to modify advanced parameters at will
-## Task Example
+## Example
+### Single table
> This is an example of reading a Doris table and writing to Console.
```
@@ -159,4 +164,49 @@ sink {
Console {}
}
```
+### Multiple tables
+```
+env{
+ parallelism = 1
+ job.mode = "BATCH"
+}
+source{
+ Doris {
+ fenodes = "xxxx:8030"
+ username = root
+ password = ""
+ table_list = [
+ {
+ database = "st_source_0"
+ table = "doris_table_0"
+ doris.read.field = "F_ID,F_INT,F_BIGINT,F_TINYINT"
+ doris.filter.query = "F_ID >= 50"
+ },
+ {
+ database = "st_source_1"
+ table = "doris_table_1"
+ }
+ ]
+ }
+}
+
+transform {}
+
+sink{
+ Doris {
+ fenodes = "xxxx:8030"
+ schema_save_mode = "RECREATE_SCHEMA"
+ username = root
+ password = ""
+ database = "st_sink"
+ table = "${table_name}"
+ sink.enable-2pc = "true"
+ sink.label-prefix = "test_json"
+ doris.config = {
+ format="json"
+ read_json_by_line="true"
+ }
+ }
+}
+```
diff --git a/docs/en/connector-v2/source/FakeSource.md b/docs/en/connector-v2/source/FakeSource.md
index 6f6b259736b..48333e244c0 100644
--- a/docs/en/connector-v2/source/FakeSource.md
+++ b/docs/en/connector-v2/source/FakeSource.md
@@ -142,7 +142,7 @@ source {
c_timestamp = timestamp
}
}
- result_table_name = "fake"
+ plugin_output = "fake"
}
}
```
diff --git a/docs/en/connector-v2/source/FtpFile.md b/docs/en/connector-v2/source/FtpFile.md
index 656f7a00422..f65255bfd77 100644
--- a/docs/en/connector-v2/source/FtpFile.md
+++ b/docs/en/connector-v2/source/FtpFile.md
@@ -38,7 +38,7 @@ If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you
## Options
-| name | type | required | default value |
+| name | type | required | default value |
|---------------------------|---------|----------|---------------------|
| host | string | yes | - |
| port | int | yes | - |
@@ -62,6 +62,7 @@ If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you
| compress_codec | string | no | none |
| archive_compress_codec | string | no | none |
| encoding | string | no | UTF-8 |
+| null_format | string | no | - |
| common-options | | no | - |
### host [string]
@@ -84,6 +85,59 @@ The target ftp password is required
The source file path.
+### file_filter_pattern [string]
+
+Filter pattern, which is used for filtering files.
+
+The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression.
+There are some examples.
+
+File Structure Example:
+```
+/data/seatunnel/20241001/report.txt
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+/data/seatunnel/20241012/logo.png
+```
+Matching Rules Example:
+
+**Example 1**: *Match all .txt files*, Regular Expression:
+```
+/data/seatunnel/20241001/.*\.txt
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241001/report.txt
+```
+**Example 2**: *Match all files starting with abc*, Regular Expression:
+```
+/data/seatunnel/20241002/abc.*
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+```
+**Example 3**: *Match all files starting with abc whose fourth character is either h or g*, Regular Expression:
+```
+/data/seatunnel/20241007/abc[h,g].*
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241007/abch202410.csv
+```
+**Example 4**: *Match third-level folders starting with 202410 and files ending with .csv*, Regular Expression:
+```
+/data/seatunnel/202410\d*/.*\.csv
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+```
+
### file_format_type [string]
File type, supported as the following file types:
@@ -275,6 +329,7 @@ The compress codec of archive files and the details that supported as the follow
| ZIP | txt,json,excel,xml | .zip |
| TAR | txt,json,excel,xml | .tar |
| TAR_GZ | txt,json,excel,xml | .tar.gz |
+| GZ | txt,json,xml | .gz |
| NONE | all | .* |
### encoding [string]
@@ -282,6 +337,13 @@ The compress codec of archive files and the details that supported as the follow
Only used when file_format_type is json,text,csv,xml.
The encoding of the file to read. This param will be parsed by `Charset.forName(encoding)`.
+### null_format [string]
+
+Only used when file_format_type is text.
+`null_format` defines which strings can be represented as null.
+
+e.g: `\N`
+
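+A minimal sketch of reading text files that use a null marker (the connection values and path are placeholders, and `NULL` is an assumed marker chosen for illustration):
+
+```hocon
+FtpFile {
+  host = "192.168.31.48"
+  port = 21
+  user = tyrantlucifer
+  password = tianchao
+  path = "/data/text"
+  file_format_type = "text"
+  # column values equal to the literal string NULL are read as null
+  null_format = "NULL"
+}
+```
+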
### common options
Source plugin common parameters, please refer to [Source Common Options](../source-common-options.md) for details.
@@ -306,6 +368,67 @@ Source plugin common parameters, please refer to [Source Common Options](../sour
```
+### Multiple Table
+
+```hocon
+
+FtpFile {
+ tables_configs = [
+ {
+ schema {
+ table = "student"
+ }
+ path = "/tmp/seatunnel/sink/text"
+ host = "192.168.31.48"
+ port = 21
+ user = tyrantlucifer
+ password = tianchao
+ file_format_type = "parquet"
+ },
+ {
+ schema {
+ table = "teacher"
+ }
+ path = "/tmp/seatunnel/sink/text"
+ host = "192.168.31.48"
+ port = 21
+ user = tyrantlucifer
+ password = tianchao
+ file_format_type = "parquet"
+ }
+ ]
+}
+
+```
+
+```hocon
+
+FtpFile {
+ tables_configs = [
+ {
+ schema {
+ fields {
+ name = string
+ age = int
+ }
+ }
+ path = "/apps/hive/demo/student"
+ file_format_type = "json"
+ },
+ {
+ schema {
+ fields {
+ name = string
+ age = int
+ }
+ }
+ path = "/apps/hive/demo/teacher"
+ file_format_type = "json"
+ }
+  ]
+}
+
+```
+
### Transfer Binary File
```hocon
@@ -339,6 +462,33 @@ sink {
```
+### Filter File
+
+```hocon
+env {
+ parallelism = 1
+ job.mode = "BATCH"
+}
+
+source {
+ FtpFile {
+ host = "192.168.31.48"
+ port = 21
+ user = tyrantlucifer
+ password = tianchao
+ path = "/seatunnel/read/binary/"
+ file_format_type = "binary"
+ // file example abcD2024.csv
+ file_filter_pattern = "abc[DX]*.*"
+ }
+}
+
+sink {
+ Console {
+ }
+}
+```
+
## Changelog
### 2.2.0-beta 2022-09-26
diff --git a/docs/en/connector-v2/source/HdfsFile.md b/docs/en/connector-v2/source/HdfsFile.md
index 7413c0428b8..caaf9972a06 100644
--- a/docs/en/connector-v2/source/HdfsFile.md
+++ b/docs/en/connector-v2/source/HdfsFile.md
@@ -41,7 +41,7 @@ Read data from hdfs file system.
## Source Options
-| Name | Type | Required | Default | Description |
+| Name | Type | Required | Default | Description |
|---------------------------|---------|----------|---------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| path | string | yes | - | The source file path. |
| file_format_type | string | yes | - | We supported as the following file types:`text` `csv` `parquet` `orc` `json` `excel` `xml` `binary`.Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`. |
@@ -62,15 +62,70 @@ Read data from hdfs file system.
| sheet_name | string | no | - | Reader the sheet of the workbook,Only used when file_format is excel. |
| xml_row_tag | string | no | - | Specifies the tag name of the data rows within the XML file, only used when file_format is xml. |
| xml_use_attr_format | boolean | no | - | Specifies whether to process data using the tag attribute format, only used when file_format is xml. |
+| file_filter_pattern | string | no | | Filter pattern, which is used for filtering files. |
| compress_codec | string | no | none | The compress codec of files |
| archive_compress_codec | string | no | none |
| encoding | string | no | UTF-8 | |
+| null_format | string | no | - | Only used when file_format_type is text. Defines which strings can be represented as null, e.g. `\N`. |
| common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](../source-common-options.md) for details. |
### delimiter/field_delimiter [string]
**delimiter** parameter will deprecate after version 2.3.5, please use **field_delimiter** instead.
+### file_filter_pattern [string]
+
+Filter pattern, which is used for filtering files.
+
+The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression.
+There are some examples.
+
+File Structure Example:
+```
+/data/seatunnel/20241001/report.txt
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+/data/seatunnel/20241012/logo.png
+```
+Matching Rules Example:
+
+**Example 1**: *Match all .txt files*, Regular Expression:
+```
+/data/seatunnel/20241001/.*\.txt
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241001/report.txt
+```
+**Example 2**: *Match all files starting with abc*, Regular Expression:
+```
+/data/seatunnel/20241002/abc.*
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+```
+**Example 3**: *Match all files starting with abc whose fourth character is either h or g*, Regular Expression:
+```
+/data/seatunnel/20241007/abc[h,g].*
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241007/abch202410.csv
+```
+**Example 4**: *Match third-level folders starting with 202410 and files ending with .csv*, Regular Expression:
+```
+/data/seatunnel/202410\d*/.*\.csv
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+```
+
### compress_codec [string]
The compress codec of files and the details that supported as the following shown:
@@ -90,6 +145,7 @@ The compress codec of archive files and the details that supported as the follow
| ZIP | txt,json,excel,xml | .zip |
| TAR | txt,json,excel,xml | .tar |
| TAR_GZ | txt,json,excel,xml | .tar.gz |
+| GZ | txt,json,xml | .gz |
| NONE | all | .* |
### encoding [string]
@@ -146,3 +202,26 @@ sink {
}
```
+### Filter File
+
+```hocon
+env {
+ parallelism = 1
+ job.mode = "BATCH"
+}
+
+source {
+ HdfsFile {
+ path = "/apps/hive/demo/student"
+ file_format_type = "json"
+ fs.defaultFS = "hdfs://namenode001"
+ // file example abcD2024.csv
+ file_filter_pattern = "abc[DX]*.*"
+ }
+}
+
+sink {
+ Console {
+ }
+}
+```
diff --git a/docs/en/connector-v2/source/Hive.md b/docs/en/connector-v2/source/Hive.md
index 5669906c3b9..d87739f1034 100644
--- a/docs/en/connector-v2/source/Hive.md
+++ b/docs/en/connector-v2/source/Hive.md
@@ -8,7 +8,7 @@ Read data from Hive.
:::tip
-In order to use this connector, You must ensure your spark/flink cluster already integrated hive. The tested hive version is 2.3.9.
+In order to use this connector, you must ensure your Spark/Flink cluster has already integrated Hive. The tested Hive versions are 2.3.9 and 3.1.3.
If you use SeaTunnel Engine, You need put seatunnel-hadoop3-3.1.4-uber.jar and hive-exec-3.1.3.jar and libfb303-0.9.3.jar in $SEATUNNEL_HOME/lib/ dir.
:::
@@ -120,6 +120,24 @@ Source plugin common parameters, please refer to [Source Common Options](../sour
```
### Example 2: Multiple tables
+> Note: Hive is a structured data source and should use `table_list`; `tables_configs` will be removed in the future.
+
+```bash
+
+ Hive {
+ table_list = [
+ {
+ table_name = "default.seatunnel_orc_1"
+ metastore_uri = "thrift://namenode001:9083"
+ },
+ {
+ table_name = "default.seatunnel_orc_2"
+ metastore_uri = "thrift://namenode001:9083"
+ }
+ ]
+ }
+
+```
```bash
@@ -138,6 +156,95 @@ Source plugin common parameters, please refer to [Source Common Options](../sour
```
+### Example 3: Kerberos
+
+```bash
+source {
+ Hive {
+ table_name = "default.test_hive_sink_on_hdfs_with_kerberos"
+ metastore_uri = "thrift://metastore:9083"
+ hive.hadoop.conf-path = "/tmp/hadoop"
+    plugin_output = hive_source
+ hive_site_path = "/tmp/hive-site.xml"
+ kerberos_principal = "hive/metastore.seatunnel@EXAMPLE.COM"
+ kerberos_keytab_path = "/tmp/hive.keytab"
+ krb5_path = "/tmp/krb5.conf"
+ }
+}
+```
+
+Description:
+
+- `hive_site_path`: The path to the `hive-site.xml` file.
+- `kerberos_principal`: The principal for Kerberos authentication.
+- `kerberos_keytab_path`: The keytab file path for Kerberos authentication.
+- `krb5_path`: The path to the `krb5.conf` file used for Kerberos authentication.
+
+Run the case:
+
+```bash
+env {
+ parallelism = 1
+ job.mode = "BATCH"
+}
+
+source {
+ Hive {
+ table_name = "default.test_hive_sink_on_hdfs_with_kerberos"
+ metastore_uri = "thrift://metastore:9083"
+ hive.hadoop.conf-path = "/tmp/hadoop"
+    plugin_output = hive_source
+ hive_site_path = "/tmp/hive-site.xml"
+ kerberos_principal = "hive/metastore.seatunnel@EXAMPLE.COM"
+ kerberos_keytab_path = "/tmp/hive.keytab"
+ krb5_path = "/tmp/krb5.conf"
+ }
+}
+
+sink {
+ Assert {
+    plugin_input = hive_source
+ rules {
+ row_rules = [
+ {
+ rule_type = MAX_ROW
+ rule_value = 3
+ }
+ ],
+ field_rules = [
+ {
+ field_name = pk_id
+ field_type = bigint
+ field_value = [
+ {
+ rule_type = NOT_NULL
+ }
+ ]
+ },
+ {
+ field_name = name
+ field_type = string
+ field_value = [
+ {
+ rule_type = NOT_NULL
+ }
+ ]
+ },
+ {
+ field_name = score
+ field_type = int
+ field_value = [
+ {
+ rule_type = NOT_NULL
+ }
+ ]
+ }
+ ]
+ }
+ }
+}
+```
+
## Hive on s3
### Step 1
@@ -265,15 +372,3 @@ sink {
}
}
```
-
-## Changelog
-
-### 2.2.0-beta 2022-09-26
-
-- Add Hive Source Connector
-
-### Next version
-
-- [Improve] Support kerberos authentication ([3840](https://github.com/apache/seatunnel/pull/3840))
-- Support user-defined partitions ([3842](https://github.com/apache/seatunnel/pull/3842))
-
diff --git a/docs/en/connector-v2/source/HiveJdbc.md b/docs/en/connector-v2/source/HiveJdbc.md
index 19619d924c1..23227aa306f 100644
--- a/docs/en/connector-v2/source/HiveJdbc.md
+++ b/docs/en/connector-v2/source/HiveJdbc.md
@@ -72,7 +72,7 @@ Read external data source data through JDBC.
| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism |
| fetch_size | Int | No | 0 | For queries that return a large number of objects,you can configure the row fetch size used in the query toimprove performance by reducing the number database hits required to satisfy the selection criteria. Zero means use jdbc default value. |
| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](../source-common-options.md) for details |
-| useKerberos | Boolean | No | no | Whether to enable Kerberos, default is false |
+| use_kerberos | Boolean | No | no | Whether to enable Kerberos, default is false |
| kerberos_principal | String | No | - | When use kerberos, we should set kerberos principal such as 'test_user@xxx'. |
| kerberos_keytab_path | String | No | - | When use kerberos, we should set kerberos principal file path such as '/home/test/test_user.keytab' . |
| krb5_path | String | No | /etc/krb5.conf | When use kerberos, we should set krb5 path file path such as '/seatunnel/krb5.conf' or use the default path '/etc/krb5.conf '. |
diff --git a/docs/en/connector-v2/source/Http.md b/docs/en/connector-v2/source/Http.md
index 9c60b4c9aa4..511ba04132d 100644
--- a/docs/en/connector-v2/source/Http.md
+++ b/docs/en/connector-v2/source/Http.md
@@ -78,7 +78,7 @@ env {
source {
Http {
- result_table_name = "http"
+ plugin_output = "http"
url = "http://mockserver:1080/example/http"
method = "GET"
format = "json"
diff --git a/docs/en/connector-v2/source/Iceberg.md b/docs/en/connector-v2/source/Iceberg.md
index 4203c85bb87..8bb21eb7b63 100644
--- a/docs/en/connector-v2/source/Iceberg.md
+++ b/docs/en/connector-v2/source/Iceberg.md
@@ -127,7 +127,7 @@ source {
}
namespace = "database1"
table = "source"
- result_table_name = "iceberg"
+ plugin_output = "iceberg"
}
}
@@ -136,7 +136,7 @@ transform {
sink {
Console {
- source_table_name = "iceberg"
+ plugin_input = "iceberg"
}
}
```
@@ -160,7 +160,7 @@ source {
}
namespace = "your_iceberg_database"
table = "your_iceberg_table"
- result_table_name = "iceberg_test"
+ plugin_output = "iceberg_test"
}
}
```
diff --git a/docs/en/connector-v2/source/Jdbc.md b/docs/en/connector-v2/source/Jdbc.md
index 27b3d875580..2b5897cbaea 100644
--- a/docs/en/connector-v2/source/Jdbc.md
+++ b/docs/en/connector-v2/source/Jdbc.md
@@ -113,7 +113,7 @@ The JDBC Source connector supports parallel reading of data from tables. SeaTunn
there are some reference value for params above.
-| datasource | driver | url | maven |
+| datasource | driver | url | maven |
|-------------------|-----------------------------------------------------|------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------|
| mysql | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | https://mvnrepository.com/artifact/mysql/mysql-connector-java |
| postgresql | org.postgresql.Driver | jdbc:postgresql://localhost:5432/postgres | https://mvnrepository.com/artifact/org.postgresql/postgresql |
@@ -122,7 +122,7 @@ there are some reference value for params above.
| sqlserver | com.microsoft.sqlserver.jdbc.SQLServerDriver | jdbc:sqlserver://localhost:1433 | https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc |
| oracle | oracle.jdbc.OracleDriver | jdbc:oracle:thin:@localhost:1521/xepdb1 | https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8 |
| sqlite | org.sqlite.JDBC | jdbc:sqlite:test.db | https://mvnrepository.com/artifact/org.xerial/sqlite-jdbc |
-| gbase8a | com.gbase.jdbc.Driver | jdbc:gbase://e2e_gbase8aDb:5258/test | https://cdn.gbase.cn/products/30/p5CiVwXBKQYIUGN8ecHvk/gbase-connector-java-9.5.0.7-build1-bin.jar |
+| gbase8a | com.gbase.jdbc.Driver | jdbc:gbase://e2e_gbase8aDb:5258/test | https://cdn.gbase.cn/products/30/p5CiVwXBKQYIUGN8ecHvk/gbase-connector-java-9.5.0.7-build1-bin.jar |
| starrocks | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | https://mvnrepository.com/artifact/mysql/mysql-connector-java |
| db2 | com.ibm.db2.jcc.DB2Driver | jdbc:db2://localhost:50000/testdb | https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc/db2jcc4 |
| tablestore | com.alicloud.openservices.tablestore.jdbc.OTSDriver | "jdbc:ots:http s://myinstance.cn-hangzhou.ots.aliyuncs.com/myinstance" | https://mvnrepository.com/artifact/com.aliyun.openservices/tablestore-jdbc |
@@ -133,10 +133,11 @@ there are some reference value for params above.
| Redshift | com.amazon.redshift.jdbc42.Driver | jdbc:redshift://localhost:5439/testdb?defaultRowFetchSize=1000 | https://mvnrepository.com/artifact/com.amazon.redshift/redshift-jdbc42 |
| Vertica | com.vertica.jdbc.Driver | jdbc:vertica://localhost:5433 | https://repo1.maven.org/maven2/com/vertica/jdbc/vertica-jdbc/12.0.3-0/vertica-jdbc-12.0.3-0.jar |
| Kingbase | com.kingbase8.Driver | jdbc:kingbase8://localhost:54321/db_test | https://repo1.maven.org/maven2/cn/com/kingbase/kingbase8/8.6.0/kingbase8-8.6.0.jar |
-| OceanBase | com.oceanbase.jdbc.Driver | jdbc:oceanbase://localhost:2881 | https://repo1.maven.org/maven2/com/oceanbase/oceanbase-client/2.4.11/oceanbase-client-2.4.11.jar |
+| OceanBase | com.oceanbase.jdbc.Driver | jdbc:oceanbase://localhost:2881 | https://repo1.maven.org/maven2/com/oceanbase/oceanbase-client/2.4.12/oceanbase-client-2.4.12.jar |
| Hive | org.apache.hive.jdbc.HiveDriver | jdbc:hive2://localhost:10000 | https://repo1.maven.org/maven2/org/apache/hive/hive-jdbc/3.1.3/hive-jdbc-3.1.3-standalone.jar |
| xugu | com.xugu.cloudjdbc.Driver | jdbc:xugu://localhost:5138 | https://repo1.maven.org/maven2/com/xugudb/xugu-jdbc/12.2.0/xugu-jdbc-12.2.0.jar |
| InterSystems IRIS | com.intersystems.jdbc.IRISDriver | jdbc:IRIS://localhost:1972/%SYS | https://raw.githubusercontent.com/intersystems-community/iris-driver-distribution/main/JDBC/JDK18/intersystems-jdbc-3.8.4.jar |
+| opengauss | org.opengauss.Driver | jdbc:opengauss://localhost:5432/postgres | https://repo1.maven.org/maven2/org/opengauss/opengauss-jdbc/5.1.0-og/opengauss-jdbc-5.1.0-og.jar |
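+
+For instance, a minimal sketch of a Jdbc source reading from opengauss with the driver and URL listed above (the credentials, database, and query are placeholders):
+
+```hocon
+source {
+  Jdbc {
+    driver = "org.opengauss.Driver"
+    url = "jdbc:opengauss://localhost:5432/postgres"
+    user = "gaussdb"
+    password = "password"
+    query = "select * from public.my_table"
+  }
+}
+```
+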
## Example
diff --git a/docs/en/connector-v2/source/kafka.md b/docs/en/connector-v2/source/Kafka.md
similarity index 94%
rename from docs/en/connector-v2/source/kafka.md
rename to docs/en/connector-v2/source/Kafka.md
index 90c183c2c13..dfc23a7572f 100644
--- a/docs/en/connector-v2/source/kafka.md
+++ b/docs/en/connector-v2/source/Kafka.md
@@ -59,6 +59,7 @@ They can be downloaded via install-plugin.sh or from the Maven central repositor
### Simple
> This example reads the data of kafka's topic_1, topic_2, topic_3 and prints it to the client.And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in Install SeaTunnel to install and deploy SeaTunnel. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job.
+> In batch mode, when the enumerator creates splits it fetches the latest offset of each partition and uses it as the stopping point.
```hocon
# Defining the runtime environment
@@ -188,6 +189,65 @@ source {
> This is written to the same pg table according to different formats and topics of parsing kafka Perform upsert operations based on the id
+> Note: Kafka is an unstructured data source and should use `tables_configs`; `table_list` will be removed in the future.
+
+```hocon
+
+env {
+ execution.parallelism = 1
+ job.mode = "BATCH"
+}
+
+source {
+ Kafka {
+ bootstrap.servers = "kafka_e2e:9092"
+ tables_configs = [
+ {
+ topic = "^test-ogg-sou.*"
+ pattern = "true"
+ consumer.group = "ogg_multi_group"
+ start_mode = earliest
+ schema = {
+ fields {
+ id = "int"
+ name = "string"
+ description = "string"
+ weight = "string"
+ }
+ },
+ format = ogg_json
+ },
+ {
+ topic = "test-cdc_mds"
+ start_mode = earliest
+ schema = {
+ fields {
+ id = "int"
+ name = "string"
+ description = "string"
+ weight = "string"
+ }
+ },
+ format = canal_json
+ }
+ ]
+ }
+}
+
+sink {
+ Jdbc {
+ driver = org.postgresql.Driver
+ url = "jdbc:postgresql://postgresql:5432/test?loggerLevel=OFF"
+ user = test
+ password = test
+ generate_sink_sql = true
+ database = test
+ table = public.sink
+ primary_keys = ["id"]
+ }
+}
+```
+
```hocon
env {
@@ -289,7 +349,7 @@ source {
"""
bootstrap.servers = "kafkaCluster:9092"
start_mode = "earliest"
- result_table_name = "kafka_table"
+ plugin_output = "kafka_table"
}
}
```
diff --git a/docs/en/connector-v2/source/Klaviyo.md b/docs/en/connector-v2/source/Klaviyo.md
index 10b4ed42e9e..848fe38ef8f 100644
--- a/docs/en/connector-v2/source/Klaviyo.md
+++ b/docs/en/connector-v2/source/Klaviyo.md
@@ -45,7 +45,7 @@ http request url
API private key for login, you can get more detail at this link:
-https://developers.klaviyo.com/en/docs/retrieve_api_credentials
+https://developers.klaviyo.com/en/docs/authenticate_#private-key-authentication
### revision [String]
diff --git a/docs/en/connector-v2/source/Kudu.md b/docs/en/connector-v2/source/Kudu.md
index ccd63e090b3..a6fee76f12c 100644
--- a/docs/en/connector-v2/source/Kudu.md
+++ b/docs/en/connector-v2/source/Kudu.md
@@ -78,7 +78,7 @@ source {
kudu {
kudu_masters = "kudu-master:7051"
table_name = "kudu_source_table"
- result_table_name = "kudu"
+ plugin_output = "kudu"
enable_kerberos = true
kerberos_principal = "xx@xx.COM"
kerberos_keytab = "xx.keytab"
@@ -90,11 +90,11 @@ transform {
sink {
console {
- source_table_name = "kudu"
+ plugin_input = "kudu"
}
kudu {
- source_table_name = "kudu"
+ plugin_input = "kudu"
kudu_masters = "kudu-master:7051"
table_name = "kudu_sink_table"
enable_kerberos = true
@@ -125,7 +125,7 @@ source {
table_name = "kudu_source_table_2"
}
]
- result_table_name = "kudu"
+ plugin_output = "kudu"
}
}
diff --git a/docs/en/connector-v2/source/LocalFile.md b/docs/en/connector-v2/source/LocalFile.md
index 6d11b992e3a..477a4d41399 100644
--- a/docs/en/connector-v2/source/LocalFile.md
+++ b/docs/en/connector-v2/source/LocalFile.md
@@ -43,7 +43,7 @@ If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you
## Options
-| name | type | required | default value |
+| name | type | required | default value |
|---------------------------|---------|----------|--------------------------------------|
| path | string | yes | - |
| file_format_type | string | yes | - |
@@ -58,10 +58,11 @@ If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you
| sheet_name | string | no | - |
| xml_row_tag | string | no | - |
| xml_use_attr_format | boolean | no | - |
-| file_filter_pattern | string | no | - |
+| file_filter_pattern | string | no | |
| compress_codec | string | no | none |
| archive_compress_codec | string | no | none |
| encoding | string | no | UTF-8 |
+| null_format | string | no | - |
| common-options | | no | - |
| tables_configs | list | no | used to define a multiple table task |
@@ -254,6 +255,55 @@ Specifies Whether to process data using the tag attribute format.
Filter pattern, which used for filtering files.
+The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression.
+There are some examples.
+
+File Structure Example:
+```
+/data/seatunnel/20241001/report.txt
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+/data/seatunnel/20241012/logo.png
+```
+Matching Rules Example:
+
+**Example 1**: *Match all .txt files*, Regular Expression:
+```
+/data/seatunnel/20241001/.*\.txt
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241001/report.txt
+```
+**Example 2**: *Match all files starting with abc*, Regular Expression:
+```
+/data/seatunnel/20241002/abc.*
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+```
+**Example 3**: *Match all files starting with abc whose fourth character is either h or g*, Regular Expression:
+```
+/data/seatunnel/20241007/abc[h,g].*
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241007/abch202410.csv
+```
+**Example 4**: *Match third-level folders starting with 202410 and files ending with .csv*, Regular Expression:
+```
+/data/seatunnel/202410\d*/.*\.csv
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+```
+
### compress_codec [string]
The compress codec of files and the details that supported as the following shown:
@@ -273,6 +323,7 @@ The compress codec of archive files and the details that supported as the follow
| ZIP | txt,json,excel,xml | .zip |
| TAR | txt,json,excel,xml | .tar |
| TAR_GZ | txt,json,excel,xml | .tar.gz |
+| GZ | txt,json,xml | .gz |
| NONE | all | .* |
### encoding [string]
@@ -280,6 +331,13 @@ The compress codec of archive files and the details that supported as the follow
Only used when file_format_type is json,text,csv,xml.
The encoding of the file to read. This param will be parsed by `Charset.forName(encoding)`.
+### null_format [string]
+
+Only used when file_format_type is text.
+`null_format` defines which strings can be represented as null.
+
+e.g: `\N`
+
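+A minimal sketch of reading local text files that use a null marker (the path is a placeholder, and `NULL` is an assumed marker chosen for illustration):
+
+```hocon
+LocalFile {
+  path = "/data/seatunnel/text"
+  file_format_type = "text"
+  # column values equal to the literal string NULL are read as null
+  null_format = "NULL"
+}
+```
+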
### common options
Source plugin common parameters, please refer to [Source Common Options](../source-common-options.md) for details
@@ -406,6 +464,30 @@ sink {
```
+### Filter File
+
+```hocon
+env {
+ parallelism = 1
+ job.mode = "BATCH"
+}
+
+source {
+ LocalFile {
+ path = "/data/seatunnel/"
+ file_format_type = "csv"
+ skip_header_row_number = 1
+ // file example abcD2024.csv
+ file_filter_pattern = "abc[DX]*.*"
+ }
+}
+
+sink {
+ Console {
+ }
+}
+```
+
## Changelog
### 2.2.0-beta 2022-09-26
@@ -417,4 +499,6 @@ sink {
- [BugFix] Fix the bug of incorrect path in windows environment ([2980](https://github.com/apache/seatunnel/pull/2980))
- [Improve] Support extract partition from SeaTunnelRow fields ([3085](https://github.com/apache/seatunnel/pull/3085))
- [Improve] Support parse field from file path ([2985](https://github.com/apache/seatunnel/pull/2985))
+### 2.3.9-beta 2024-11-12
+- [Improve] Support parse field from file path ([8019](https://github.com/apache/seatunnel/issues/8019))
diff --git a/docs/en/connector-v2/source/Mivlus.md b/docs/en/connector-v2/source/Milvus.md
similarity index 85%
rename from docs/en/connector-v2/source/Mivlus.md
rename to docs/en/connector-v2/source/Milvus.md
index a56df4c5fe7..e9560489762 100644
--- a/docs/en/connector-v2/source/Mivlus.md
+++ b/docs/en/connector-v2/source/Milvus.md
@@ -4,7 +4,11 @@
## Description
-Read data from Milvus or Zilliz Cloud
+This Milvus source connector reads data from Milvus or Zilliz Cloud. It has the following features:
+- supports reading and writing data by partition
+- supports reading dynamic schema data into the metadata column
+- JSON data is converted to a JSON string and written to the sink as JSON
+- retries automatically to bypass the rate limit and gRPC limit
## Key Features
@@ -53,3 +57,5 @@ source {
}
```
+## Changelog
+
diff --git a/docs/en/connector-v2/source/MongoDB-CDC.md b/docs/en/connector-v2/source/MongoDB-CDC.md
index 301d7075738..d7e6c7e440f 100644
--- a/docs/en/connector-v2/source/MongoDB-CDC.md
+++ b/docs/en/connector-v2/source/MongoDB-CDC.md
@@ -105,13 +105,14 @@ For specific types in MongoDB, we use Extended JSON format to map them to Seatun
## Source Options
-| Name | Type | Required | Default | Description |
+| Name | Type | Required | Default | Description |
|------------------------------------|--------|----------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| hosts | String | Yes | - | The comma-separated list of hostname and port pairs of the MongoDB servers. eg. `localhost:27017,localhost:27018` |
| username | String | No | - | Name of the database user to be used when connecting to MongoDB. |
| password | String | No | - | Password to be used when connecting to MongoDB. |
| database | List | Yes | - | Name of the database to watch for changes. If not set then all databases will be captured. The database also supports regular expressions to monitor multiple databases matching the regular expression. eg. `db1,db2`. |
| collection | List | Yes | - | Name of the collection in the database to watch for changes. If not set then all collections will be captured. The collection also supports regular expressions to monitor multiple collections matching fully-qualified collection identifiers. eg. `db1.coll1,db2.coll2`. |
+| schema | | yes | - | The structure of the data, including field names and field types. |
| connection.options | String | No | - | The ampersand-separated connection options of MongoDB. eg. `replicaSet=test&connectTimeoutMS=300000`. |
| batch.size | Long | No | 1024 | The cursor batch size. |
| poll.max.batch.size | Enum | No | 1024 | Maximum number of change stream documents to include in a single batch when polling for new data. |
@@ -185,6 +186,14 @@ source {
collection = ["inventory.products"]
username = stuser
password = stpw
+ schema = {
+ fields {
+ "_id" : string,
+ "name" : string,
+ "description" : string,
+ "weight" : string
+ }
+ }
}
}
@@ -204,76 +213,6 @@ sink {
}
```
-## Multi-table Synchronization
-
-The following example demonstrates how to create a data synchronization job that read the cdc data of multiple library tables mongodb and prints it on the local client:
-
-```hocon
-env {
- # You can set engine configuration here
- parallelism = 1
- job.mode = "STREAMING"
- checkpoint.interval = 5000
-}
-
-source {
- MongoDB-CDC {
- hosts = "mongo0:27017"
- database = ["inventory","crm"]
- collection = ["inventory.products","crm.test"]
- username = stuser
- password = stpw
- }
-}
-
-# Console printing of the read Mongodb data
-sink {
- Console {
- parallelism = 1
- }
-}
-```
-
-### Tips:
-
-> 1.The cdc synchronization of multiple library tables cannot specify the schema, and can only output json data downstream.
-> This is because MongoDB does not provide metadata information for querying, so if you want to support multiple tables, all tables can only be read as one structure.
-
-## Regular Expression Matching for Multiple Tables
-
-The following example demonstrates how to create a data synchronization job that through regular expression read the data of multiple library tables mongodb and prints it on the local client:
-
-| Matching example | Expressions | | Describe |
-|------------------|-------------|---|----------------------------------------------------------------------------------------|
-| Prefix matching | ^(test).* | | Match the database name or table name with the prefix test, such as test1, test2, etc. |
-| Suffix matching | .*[p$] | | Match the database name or table name with the suffix p, such as cdcp, edcp, etc. |
-
-```hocon
-env {
- # You can set engine configuration here
- parallelism = 1
- job.mode = "STREAMING"
- checkpoint.interval = 5000
-}
-
-source {
- MongoDB-CDC {
- hosts = "mongo0:27017"
- # So this example is used (^(test).*|^(tpc).*|txc|.*[p$]|t{2}).(t[5-8]|tt),matching txc.tt、test2.test5.
- database = ["(^(test).*|^(tpc).*|txc|.*[p$]|t{2})"]
- collection = ["(t[5-8]|tt)"]
- username = stuser
- password = stpw
- }
-}
-
-# Console printing of the read Mongodb data
-sink {
- Console {
- parallelism = 1
- }
-}
-```
## Format of real-time streaming data
@@ -309,4 +248,3 @@ sink {
}
}
```
-
diff --git a/docs/en/connector-v2/source/MySQL-CDC.md b/docs/en/connector-v2/source/MySQL-CDC.md
index fc2ea4d8ff0..cc58ec44596 100644
--- a/docs/en/connector-v2/source/MySQL-CDC.md
+++ b/docs/en/connector-v2/source/MySQL-CDC.md
@@ -169,14 +169,14 @@ When an initial consistent snapshot is made for large databases, your establishe
## Source Options
-| Name | Type | Required | Default | Description |
+| Name | Type | Required | Default | Description |
|------------------------------------------------|----------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| base-url | String | Yes | - | The URL of the JDBC connection. Refer to a case: `jdbc:mysql://localhost:3306:3306/test`. |
| username | String | Yes | - | Name of the database to use when connecting to the database server. |
| password | String | Yes | - | Password to use when connecting to the database server. |
| database-names | List | No | - | Database name of the database to monitor. |
| table-names | List | Yes | - | Table name of the database to monitor. The table name needs to include the database name, for example: `database_name.table_name` |
-| table-names-config | List | No | - | Table config list. for example: [{"table": "db1.schema1.table1","primaryKeys":["key1"]}] |
+| table-names-config | List | No | - | Table config list. for example: [{"table": "db1.schema1.table1","primaryKeys": ["key1"],"snapshotSplitColumn": "key2"}] |
| startup.mode | Enum | No | INITIAL | Optional startup mode for MySQL CDC consumer, valid enumerations are `initial`, `earliest`, `latest` and `specific`. `initial`: Synchronize historical data at startup, and then synchronize incremental data. `earliest`: Startup from the earliest offset possible. `latest`: Startup from the latest offset. `specific`: Startup from user-supplied specific offsets. |
| startup.specific-offset.file | String | No | - | Start from the specified binlog file name. **Note, This option is required when the `startup.mode` option used `specific`.** |
| startup.specific-offset.pos | Long | No | - | Start from the specified binlog file position. **Note, This option is required when the `startup.mode` option used `specific`.** |
diff --git a/docs/en/connector-v2/source/Opengauss-CDC.md b/docs/en/connector-v2/source/Opengauss-CDC.md
index 81691ea1ff4..26825202963 100644
--- a/docs/en/connector-v2/source/Opengauss-CDC.md
+++ b/docs/en/connector-v2/source/Opengauss-CDC.md
@@ -64,31 +64,31 @@ select 'ALTER TABLE ' || schemaname || '.' || tablename || ' REPLICA IDENTITY FU
## Source Options
-| Name | Type | Required | Default | Description |
-|------------------------------------------------|----------|----------|----------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| base-url | String | Yes | - | The URL of the JDBC connection. Refer to a case: `jdbc:postgresql://localhost:5432/postgres_cdc?loggerLevel=OFF`. |
-| username | String | Yes | - | Username of the database to use when connecting to the database server. |
-| password | String | Yes | - | Password to use when connecting to the database server. |
-| database-names | List | No | - | Database name of the database to monitor. |
-| table-names | List | Yes | - | Table name of the database to monitor. The table name needs to include the database name, for example: `database_name.table_name` |
-| table-names-config | List | No | - | Table config list. for example: [{"table": "db1.schema1.table1","primaryKeys":["key1"]}] |
-| startup.mode | Enum | No | INITIAL | Optional startup mode for Opengauss CDC consumer, valid enumerations are `initial`, `earliest`, `latest` and `specific`. `initial`: Synchronize historical data at startup, and then synchronize incremental data. `earliest`: Startup from the earliest offset possible. `latest`: Startup from the latest offset. `specific`: Startup from user-supplied specific offsets. |
-| snapshot.split.size | Integer | No | 8096 | The split size (number of rows) of table snapshot, captured tables are split into multiple splits when read the snapshot of table. |
-| snapshot.fetch.size | Integer | No | 1024 | The maximum fetch size for per poll when read table snapshot. |
-| slot.name | String | No | - | The name of the Opengauss logical decoding slot that was created for streaming changes from a particular plug-in for a particular database/schema. The server uses this slot to stream events to the connector that you are configuring. Default is seatunnel. |
-| decoding.plugin.name | String | No | pgoutput | The name of the Postgres logical decoding plug-in installed on the server,Supported values are decoderbufs, wal2json, wal2json_rds, wal2json_streaming,wal2json_rds_streaming and pgoutput. |
-| server-time-zone | String | No | UTC | The session time zone in database server. If not set, then ZoneId.systemDefault() is used to determine the server time zone. |
-| connect.timeout.ms | Duration | No | 30000 | The maximum time that the connector should wait after trying to connect to the database server before timing out. |
-| connect.max-retries | Integer | No | 3 | The max retry times that the connector should retry to build database server connection. |
-| connection.pool.size | Integer | No | 20 | The jdbc connection pool size. |
+| Name | Type | Required | Default | Description |
+|------------------------------------------------|----------|----------|----------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| base-url | String | Yes | - | The URL of the JDBC connection. Refer to a case: `jdbc:postgresql://localhost:5432/postgres_cdc?loggerLevel=OFF`. |
+| username | String | Yes | - | Username of the database to use when connecting to the database server. |
+| password | String | Yes | - | Password to use when connecting to the database server. |
+| database-names | List | No | - | Database name of the database to monitor. |
+| table-names | List | Yes | - | Table name of the database to monitor. The table name needs to include the database name, for example: `database_name.table_name` |
+| table-names-config | List | No | - | Table config list. for example: [{"table": "db1.schema1.table1","primaryKeys":["key1"]}] |
+| startup.mode | Enum | No | INITIAL | Optional startup mode for Opengauss CDC consumer, valid enumerations are `initial`, `earliest`, `latest`. `initial`: Synchronize historical data at startup, and then synchronize incremental data. `earliest`: Startup from the earliest offset possible. `latest`: Startup from the latest offset. |
+| snapshot.split.size | Integer | No | 8096 | The split size (number of rows) of table snapshot, captured tables are split into multiple splits when read the snapshot of table. |
+| snapshot.fetch.size | Integer | No | 1024 | The maximum fetch size for per poll when read table snapshot. |
+| slot.name | String | No | - | The name of the Opengauss logical decoding slot that was created for streaming changes from a particular plug-in for a particular database/schema. The server uses this slot to stream events to the connector that you are configuring. Default is seatunnel. |
+| decoding.plugin.name                            | String   | No       | pgoutput | The name of the Postgres logical decoding plug-in installed on the server. Supported values are decoderbufs, wal2json, wal2json_rds, wal2json_streaming, wal2json_rds_streaming and pgoutput. |
+| server-time-zone | String | No | UTC | The session time zone in database server. If not set, then ZoneId.systemDefault() is used to determine the server time zone. |
+| connect.timeout.ms | Duration | No | 30000 | The maximum time that the connector should wait after trying to connect to the database server before timing out. |
+| connect.max-retries | Integer | No | 3 | The max retry times that the connector should retry to build database server connection. |
+| connection.pool.size | Integer | No | 20 | The jdbc connection pool size. |
| chunk-key.even-distribution.factor.upper-bound | Double | No | 100 | The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 100.0. |
-| chunk-key.even-distribution.factor.lower-bound | Double | No | 0.05 | The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 0.05. |
-| sample-sharding.threshold | Integer | No | 1000 | This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and `chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards. |
-| inverse-sampling.rate | Integer | No | 1000 | The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. The default value is 1000. |
-| exactly_once | Boolean | No | false | Enable exactly once semantic. |
-| format | Enum | No | DEFAULT | Optional output format for Opengauss CDC, valid enumerations are `DEFAULT`, `COMPATIBLE_DEBEZIUM_JSON`. |
-| debezium | Config | No | - | Pass-through [Debezium's properties](https://github.com/debezium/debezium/blob/v1.9.8.Final/documentation/modules/ROOT/pages/connectors/postgresql.adoc#connector-configuration-properties) to Debezium Embedded Engine which is used to capture data changes from Opengauss server. |
-| common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](../source-common-options.md) for details |
+| chunk-key.even-distribution.factor.lower-bound | Double | No | 0.05 | The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 0.05. |
+| sample-sharding.threshold | Integer | No | 1000 | This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and `chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards. |
+| inverse-sampling.rate | Integer | No | 1000 | The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. The default value is 1000. |
+| exactly_once | Boolean | No | false | Enable exactly once semantic. |
+| format | Enum | No | DEFAULT | Optional output format for Opengauss CDC, valid enumerations are `DEFAULT`, `COMPATIBLE_DEBEZIUM_JSON`. |
+| debezium | Config | No | - | Pass-through [Debezium's properties](https://github.com/debezium/debezium/blob/v1.9.8.Final/documentation/modules/ROOT/pages/connectors/postgresql.adoc#connector-configuration-properties) to Debezium Embedded Engine which is used to capture data changes from Opengauss server. |
+| common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](../source-common-options.md) for details |
## Task Example
@@ -109,7 +109,7 @@ env {
source {
Opengauss-CDC {
- result_table_name = "customers_opengauss_cdc"
+ plugin_output = "customers_opengauss_cdc"
username = "gaussdb"
password = "openGauss@123"
database-names = ["opengauss_cdc"]
@@ -126,7 +126,7 @@ transform {
sink {
jdbc {
- source_table_name = "customers_opengauss_cdc"
+ plugin_input = "customers_opengauss_cdc"
url = "jdbc:postgresql://opengauss_cdc_e2e:5432/opengauss_cdc"
driver = "org.postgresql.Driver"
user = "dailai"
@@ -149,7 +149,7 @@ sink {
```
source {
Opengauss-CDC {
- result_table_name = "customers_opengauss_cdc"
+ plugin_output = "customers_opengauss_cdc"
username = "gaussdb"
password = "openGauss@123"
database-names = ["opengauss_cdc"]
diff --git a/docs/en/connector-v2/source/Oracle-CDC.md b/docs/en/connector-v2/source/Oracle-CDC.md
index feef58a0d2e..dad52faa0b1 100644
--- a/docs/en/connector-v2/source/Oracle-CDC.md
+++ b/docs/en/connector-v2/source/Oracle-CDC.md
@@ -220,7 +220,7 @@ exit;
## Source Options
-| Name | Type | Required | Default | Description |
+| Name | Type | Required | Default | Description |
|------------------------------------------------|----------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| base-url | String | Yes | - | The URL of the JDBC connection. Refer to a case: `idbc:oracle:thin:datasource01:1523:xe`. |
| username | String | Yes | - | Name of the database to use when connecting to the database server. |
@@ -228,7 +228,7 @@ exit;
| database-names | List | No | - | Database name of the database to monitor. |
| schema-names | List | No | - | Schema name of the database to monitor. |
| table-names | List | Yes | - | Table name of the database to monitor. The table name needs to include the database name, for example: `database_name.table_name` |
-| table-names-config | List | No | - | Table config list. for example: [{"table": "db1.schema1.table1","primaryKeys":["key1"]}] |
+| table-names-config | List | No | - | Table config list. for example: [{"table": "db1.schema1.table1","primaryKeys": ["key1"],"snapshotSplitColumn": "key2"}] |
| startup.mode | Enum | No | INITIAL | Optional startup mode for Oracle CDC consumer, valid enumerations are `initial`, `earliest`, `latest` and `specific`. `initial`: Synchronize historical data at startup, and then synchronize incremental data. `earliest`: Startup from the earliest offset possible. `latest`: Startup from the latest offset. `specific`: Startup from user-supplied specific offsets. |
| startup.specific-offset.file | String | No | - | Start from the specified binlog file name. **Note, This option is required when the `startup.mode` option used `specific`.** |
| startup.specific-offset.pos | Long | No | - | Start from the specified binlog file position. **Note, This option is required when the `startup.mode` option used `specific`.** |
@@ -262,7 +262,7 @@ exit;
source {
# This is a example source plugin **only for test and demonstrate the feature source plugin**
Oracle-CDC {
- result_table_name = "customers"
+ plugin_output = "customers"
username = "system"
password = "oracle"
database-names = ["XE"]
@@ -280,7 +280,7 @@ source {
> source {
> # This is a example source plugin **only for test and demonstrate the feature source plugin**
> Oracle-CDC {
-> result_table_name = "customers"
+> plugin_output = "customers"
> use_select_count = true
> username = "system"
> password = "oracle"
@@ -299,7 +299,7 @@ source {
> source {
> # This is a example source plugin **only for test and demonstrate the feature source plugin**
> Oracle-CDC {
-> result_table_name = "customers"
+> plugin_output = "customers"
> skip_analyze = true
> username = "system"
> password = "oracle"
@@ -318,7 +318,7 @@ source {
source {
Oracle-CDC {
- result_table_name = "customers"
+ plugin_output = "customers"
base-url = "jdbc:oracle:thin:system/oracle@oracle-host:1521:xe"
source.reader.close.timeout = 120000
username = "system"
diff --git a/docs/en/connector-v2/source/OssFile.md b/docs/en/connector-v2/source/OssFile.md
index d5326cb86a4..42163a9d13e 100644
--- a/docs/en/connector-v2/source/OssFile.md
+++ b/docs/en/connector-v2/source/OssFile.md
@@ -190,7 +190,7 @@ If you assign file type to `parquet` `orc`, schema option not required, connecto
## Options
-| name | type | required | default value | Description |
+| name | type | required | default value | Description |
|---------------------------|---------|----------|---------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| path | string | yes | - | The Oss path that needs to be read can have sub paths, but the sub paths need to meet certain format requirements. Specific requirements can be referred to "parse_partition_from_path" option |
| file_format_type | string | yes | - | File type, supported as the following file types: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` |
@@ -211,7 +211,8 @@ If you assign file type to `parquet` `orc`, schema option not required, connecto
| xml_use_attr_format | boolean | no | - | Specifies whether to process data using the tag attribute format, only used when file_format is xml. |
| compress_codec | string | no | none | Which compress codec the files used. |
| encoding | string | no | UTF-8 |
-| file_filter_pattern | string | no | | `*.txt` means you only need read the files end with `.txt` |
+| null_format               | string  | no       | -                   | Only used when file_format_type is text. Defines which strings can be represented as null. e.g: `\N` |
+| file_filter_pattern       | string  | no       |                     | Filter pattern, used for filtering files. |
| common-options | config | no | - | Source plugin common parameters, please refer to [Source Common Options](../source-common-options.md) for details. |
### compress_codec [string]
@@ -233,6 +234,55 @@ The encoding of the file to read. This param will be parsed by `Charset.forName(
Filter pattern, which used for filtering files.
+The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression.
+Here are some examples:
+
+File Structure Example:
+```
+/data/seatunnel/20241001/report.txt
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+/data/seatunnel/20241012/logo.png
+```
+Matching Rules Example:
+
+**Example 1**: *Match all .txt files*, Regular Expression:
+```
+/data/seatunnel/20241001/.*\.txt
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241001/report.txt
+```
+**Example 2**: *Match all files starting with abc*, Regular Expression:
+```
+/data/seatunnel/20241002/abc.*
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+```
+**Example 3**: *Match all files starting with abc whose fourth character is either h or g*, Regular Expression:
+```
+/data/seatunnel/20241007/abc[hg].*
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241007/abch202410.csv
+```
+**Example 4**: *Match third-level folders starting with 202410 and files ending with .csv*, Regular Expression:
+```
+/data/seatunnel/202410\d*/.*\.csv
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+```
+
### schema [config]
Only need to be configured when the file_format_type are text, json, excel, xml or csv ( Or other format we can't read the schema from metadata).
@@ -344,7 +394,7 @@ source {
file_format_type = "orc"
}
]
- result_table_name = "fake"
+ plugin_output = "fake"
}
}
@@ -461,7 +511,7 @@ source {
}
}
]
- result_table_name = "fake"
+ plugin_output = "fake"
}
}
@@ -474,6 +524,33 @@ sink {
}
```
+### Filter File
+
+```hocon
+env {
+ parallelism = 1
+ job.mode = "BATCH"
+}
+
+source {
+ OssFile {
+ path = "/seatunnel/orc"
+ bucket = "oss://tyrantlucifer-image-bed"
+ access_key = "xxxxxxxxxxxxxxxxx"
+ access_secret = "xxxxxxxxxxxxxxxxxxxxxx"
+ endpoint = "oss-cn-beijing.aliyuncs.com"
+ file_format_type = "orc"
+ // file example abcD2024.csv
+ file_filter_pattern = "abc[DX]*.*"
+ }
+}
+
+sink {
+ Console {
+ }
+}
+```
+
## Changelog
### 2.2.0-beta 2022-09-26
diff --git a/docs/en/connector-v2/source/OssJindoFile.md b/docs/en/connector-v2/source/OssJindoFile.md
index d5bd6d14fa3..9b83a0b0501 100644
--- a/docs/en/connector-v2/source/OssJindoFile.md
+++ b/docs/en/connector-v2/source/OssJindoFile.md
@@ -49,7 +49,7 @@ It only supports hadoop version **2.9.X+**.
## Options
-| name | type | required | default value |
+| name | type | required | default value |
|---------------------------|---------|----------|---------------------|
| path | string | yes | - |
| file_format_type | string | yes | - |
@@ -68,10 +68,11 @@ It only supports hadoop version **2.9.X+**.
| sheet_name | string | no | - |
| xml_row_tag | string | no | - |
| xml_use_attr_format | boolean | no | - |
-| file_filter_pattern | string | no | - |
+| file_filter_pattern | string | no | |
| compress_codec | string | no | none |
| archive_compress_codec | string | no | none |
| encoding | string | no | UTF-8 |
+| null_format | string | no | - |
| common-options | | no | - |
### path [string]
@@ -267,6 +268,55 @@ Reader the sheet of the workbook.
Filter pattern, which used for filtering files.
+The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression.
+Here are some examples:
+
+File Structure Example:
+```
+/data/seatunnel/20241001/report.txt
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+/data/seatunnel/20241012/logo.png
+```
+Matching Rules Example:
+
+**Example 1**: *Match all .txt files*, Regular Expression:
+```
+/data/seatunnel/20241001/.*\.txt
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241001/report.txt
+```
+**Example 2**: *Match all files starting with abc*, Regular Expression:
+```
+/data/seatunnel/20241002/abc.*
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+```
+**Example 3**: *Match all files starting with abc whose fourth character is either h or g*, Regular Expression:
+```
+/data/seatunnel/20241007/abc[hg].*
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241007/abch202410.csv
+```
+**Example 4**: *Match third-level folders starting with 202410 and files ending with .csv*, Regular Expression:
+```
+/data/seatunnel/202410\d*/.*\.csv
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+```
+
### compress_codec [string]
The compress codec of files and the details that supported as the following shown:
@@ -286,6 +336,7 @@ The compress codec of archive files and the details that supported as the follow
| ZIP | txt,json,excel,xml | .zip |
| TAR | txt,json,excel,xml | .tar |
| TAR_GZ | txt,json,excel,xml | .tar.gz |
+| GZ | txt,json,xml | .gz |
| NONE | all | .* |
### encoding [string]
@@ -293,6 +344,13 @@ The compress codec of archive files and the details that supported as the follow
Only used when file_format_type is json,text,csv,xml.
The encoding of the file to read. This param will be parsed by `Charset.forName(encoding)`.
+### null_format [string]
+
+Only used when file_format_type is text.
+Defines which strings can be represented as null.
+
+e.g: `\N`
+
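+Below is a minimal sketch of how `null_format` might be combined with a text read; the bucket, credentials and schema are placeholders rather than values taken from this documentation:
+
+```hocon
+source {
+  OssJindoFile {
+    bucket = "oss://example-bucket"
+    access_key = "xxxxxxxxxxxxxxxxx"
+    access_secret = "xxxxxxxxxxxxxxxxxxxxxx"
+    endpoint = "oss-cn-beijing.aliyuncs.com"
+    path = "/seatunnel/read/text"
+    file_format_type = "text"
+    # "\\N" is HOCON escaping for the two-character string \N; fields containing it are read as null
+    null_format = "\\N"
+    schema = {
+      fields {
+        name = string
+        age = int
+      }
+    }
+  }
+}
+```
+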
### common options
Source plugin common parameters, please refer to [Source Common Options](../source-common-options.md) for details.
@@ -364,6 +422,33 @@ sink {
```
+### Filter File
+
+```hocon
+env {
+ parallelism = 1
+ job.mode = "BATCH"
+}
+
+source {
+ OssJindoFile {
+ bucket = "oss://tyrantlucifer-image-bed"
+ access_key = "xxxxxxxxxxxxxxxxx"
+ access_secret = "xxxxxxxxxxxxxxxxxxxxxx"
+ endpoint = "oss-cn-beijing.aliyuncs.com"
+ path = "/seatunnel/read/binary/"
+ file_format_type = "binary"
+ // file example abcD2024.csv
+ file_filter_pattern = "abc[DX]*.*"
+ }
+}
+
+sink {
+ Console {
+ }
+}
+```
+
## Changelog
### next version
diff --git a/docs/en/connector-v2/source/Paimon.md b/docs/en/connector-v2/source/Paimon.md
index e586a4fd9d8..cbe3b592f8b 100644
--- a/docs/en/connector-v2/source/Paimon.md
+++ b/docs/en/connector-v2/source/Paimon.md
@@ -82,6 +82,11 @@ Properties in hadoop conf
The specified loading path for the 'core-site.xml', 'hdfs-site.xml', 'hive-site.xml' files
+## Filesystems
+The Paimon connector supports multiple file systems. Currently, the supported file systems are hdfs and s3.
+If you use the s3 filesystem, you can configure the `fs.s3a.access-key`, `fs.s3a.secret-key`, `fs.s3a.endpoint`, `fs.s3a.path.style.access` and `fs.s3a.aws.credentials.provider` properties in the `paimon.hadoop.conf` option.
+In addition, the warehouse path should start with `s3a://`.
+
## Examples
### Simple example
@@ -109,6 +114,33 @@ source {
}
```
+### S3 example
+```hocon
+env {
+ execution.parallelism = 1
+ job.mode = "BATCH"
+}
+
+source {
+ Paimon {
+ warehouse = "s3a://test/"
+ database = "seatunnel_namespace11"
+ table = "st_test"
+ paimon.hadoop.conf = {
+ fs.s3a.access-key=G52pnxg67819khOZ9ezX
+ fs.s3a.secret-key=SHJuAQqHsLrgZWikvMa3lJf5T0NfM5LMFliJh9HF
+ fs.s3a.endpoint="http://minio4:9000"
+ fs.s3a.path.style.access=true
+ fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider
+ }
+ }
+}
+
+sink {
+ Console{}
+}
+```
+
### Hadoop conf example
```hocon
diff --git a/docs/en/connector-v2/source/PostgreSQL-CDC.md b/docs/en/connector-v2/source/PostgreSQL-CDC.md
index be87d03edd4..21afa42f701 100644
--- a/docs/en/connector-v2/source/PostgreSQL-CDC.md
+++ b/docs/en/connector-v2/source/PostgreSQL-CDC.md
@@ -86,15 +86,15 @@ ALTER TABLE your_table_name REPLICA IDENTITY FULL;
## Source Options
-| Name | Type | Required | Default | Description |
+| Name | Type | Required | Default | Description |
|------------------------------------------------|----------|----------|----------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| base-url | String | Yes | - | The URL of the JDBC connection. Refer to a case: `jdbc:postgresql://localhost:5432/postgres_cdc?loggerLevel=OFF`. |
| username | String | Yes | - | Name of the database to use when connecting to the database server. |
| password | String | Yes | - | Password to use when connecting to the database server. |
| database-names | List | No | - | Database name of the database to monitor. |
| table-names | List | Yes | - | Table name of the database to monitor. The table name needs to include the database name, for example: `database_name.table_name` |
-| table-names-config | List | No | - | Table config list. for example: [{"table": "db1.schema1.table1","primaryKeys":["key1"]}] |
-| startup.mode | Enum | No | INITIAL | Optional startup mode for PostgreSQL CDC consumer, valid enumerations are `initial`, `earliest`, `latest` and `specific`. `initial`: Synchronize historical data at startup, and then synchronize incremental data. `earliest`: Startup from the earliest offset possible. `latest`: Startup from the latest offset. `specific`: Startup from user-supplied specific offsets. |
+| table-names-config | List | No | - | Table config list. for example: [{"table": "db1.schema1.table1","primaryKeys": ["key1"],"snapshotSplitColumn": "key2"}] |
+| startup.mode | Enum | No | INITIAL | Optional startup mode for PostgreSQL CDC consumer, valid enumerations are `initial`, `earliest` and `latest`. `initial`: Synchronize historical data at startup, and then synchronize incremental data. `earliest`: Startup from the earliest offset possible. `latest`: Startup from the latest offset. |
| snapshot.split.size | Integer | No | 8096 | The split size (number of rows) of table snapshot, captured tables are split into multiple splits when read the snapshot of table. |
| snapshot.fetch.size | Integer | No | 1024 | The maximum fetch size for per poll when read table snapshot. |
| slot.name | String | No | - | The name of the PostgreSQL logical decoding slot that was created for streaming changes from a particular plug-in for a particular database/schema. The server uses this slot to stream events to the connector that you are configuring. Default is seatunnel. |
@@ -132,7 +132,7 @@ env {
source {
Postgres-CDC {
- result_table_name = "customers_Postgre_cdc"
+ plugin_output = "customers_Postgre_cdc"
username = "postgres"
password = "postgres"
database-names = ["postgres_cdc"]
@@ -148,7 +148,7 @@ transform {
sink {
jdbc {
- source_table_name = "customers_Postgre_cdc"
+ plugin_input = "customers_Postgre_cdc"
url = "jdbc:postgresql://postgres_cdc_e2e:5432/postgres_cdc?loggerLevel=OFF"
driver = "org.postgresql.Driver"
user = "postgres"
@@ -169,7 +169,7 @@ sink {
```
source {
Postgres-CDC {
- result_table_name = "customers_mysql_cdc"
+ plugin_output = "customers_mysql_cdc"
username = "postgres"
password = "postgres"
database-names = ["postgres_cdc"]
diff --git a/docs/en/connector-v2/source/PostgreSQL.md b/docs/en/connector-v2/source/PostgreSQL.md
index 101902d3618..d383b113c2e 100644
--- a/docs/en/connector-v2/source/PostgreSQL.md
+++ b/docs/en/connector-v2/source/PostgreSQL.md
@@ -261,7 +261,7 @@ source{
partition_column= "id"
# The name of the table returned
- result_table_name = "jdbc"
+ plugin_output = "jdbc"
partition_lower_bound = 1
partition_upper_bound = 50
partition_num = 5
diff --git a/docs/en/connector-v2/source/Prometheus.md b/docs/en/connector-v2/source/Prometheus.md
new file mode 100644
index 00000000000..ba8979f023e
--- /dev/null
+++ b/docs/en/connector-v2/source/Prometheus.md
@@ -0,0 +1,152 @@
+# Prometheus
+
+> Prometheus source connector
+
+## Description
+
+Used to read data from Prometheus.
+
+## Key features
+
+- [x] [batch](../../concept/connector-v2-features.md)
+- [ ] [stream](../../concept/connector-v2-features.md)
+- [ ] [parallelism](../../concept/connector-v2-features.md)
+
+## Options
+
+| name | type | required | default value |
+|-----------------------------|---------|----------|-----------------|
+| url | String | Yes | - |
+| query | String | Yes | - |
+| query_type | String | Yes | Instant |
+| content_field | String | Yes | $.data.result.* |
+| schema.fields | Config | Yes | - |
+| format | String | No | json |
+| params | Map | Yes | - |
+| poll_interval_millis | int | No | - |
+| retry | int | No | - |
+| retry_backoff_multiplier_ms | int | No | 100 |
+| retry_backoff_max_ms | int | No | 10000 |
+| enable_multi_lines | boolean | No | false |
+| common-options | config | No | - |
+
+### url [String]
+
+http request url
+
+### query [String]
+
+Prometheus expression query string
+
+### query_type [String]
+
+Instant/Range
+
+1. Instant : Evaluates an instant query at a single point in time.
+2. Range : Evaluates an expression query over a range of time.
+
+For more details, see https://prometheus.io/docs/prometheus/latest/querying/api/
+
+### params [Map]
+
+http request params
+
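+As an illustration only, an instant query could pass the evaluation timestamp defined by the Prometheus HTTP API through this map; whether the connector forwards arbitrary keys verbatim is an assumption here, not something stated on this page:
+
+```hocon
+params = {
+  # "time" is the evaluation timestamp parameter of the Prometheus instant-query API
+  time = "2024-07-22T20:10:30.781Z"
+}
+```
+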
+### poll_interval_millis [int]
+
+The interval (in milliseconds) between HTTP API requests in stream mode.
+
+### retry [int]
+
+The maximum number of retries when the HTTP request throws an `IOException`.
+
+### retry_backoff_multiplier_ms [int]
+
+The multiplier applied to the retry backoff time (in milliseconds) when the HTTP request fails.
+
+### retry_backoff_max_ms [int]
+
+The maximum retry backoff time (in milliseconds) when the HTTP request fails.
+
+### format [String]
+
+The format of the upstream data, default `json`.
+
+### schema [Config]
+
+The schema of the upstream data. Fill in the fixed fields as shown below:
+
+```hocon
+ schema = {
+ fields {
+ metric = "map"
+ value = double
+ time = long
+ }
+ }
+
+```
+
+#### fields [Config]
+
+The schema fields of the upstream data.
+
+### common options
+
+Source plugin common parameters, please refer to [Source Common Options](../source-common-options.md) for details
+
+## Example
+
+### Instant
+
+```hocon
+source {
+ Prometheus {
+ plugin_output = "http"
+ url = "http://mockserver:1080"
+ query = "up"
+ query_type = "Instant"
+ content_field = "$.data.result.*"
+ format = "json"
+ schema = {
+ fields {
+ metric = "map"
+ value = double
+ time = long
+ }
+ }
+ }
+}
+```
+
+### Range
+
+```hocon
+source {
+ Prometheus {
+ plugin_output = "http"
+ url = "http://mockserver:1080"
+ query = "up"
+ query_type = "Range"
+ content_field = "$.data.result.*"
+ format = "json"
+ start = "2024-07-22T20:10:30.781Z"
+ end = "2024-07-22T20:11:00.781Z"
+ step = "15s"
+ schema = {
+ fields {
+ metric = "map"
+ value = double
+ time = long
+ }
+ }
+ }
+ }
+```
+
+## Changelog
+
+### next version
+
+- Add Prometheus Source Connector
+- Reduce configuration items
+
diff --git a/docs/en/connector-v2/source/Pulsar.md b/docs/en/connector-v2/source/Pulsar.md
index 73496180626..77d9938008b 100644
--- a/docs/en/connector-v2/source/Pulsar.md
+++ b/docs/en/connector-v2/source/Pulsar.md
@@ -147,7 +147,7 @@ source {
subscription.name = "seatunnel"
client.service-url = "pulsar://localhost:6650"
admin.service-url = "http://my-broker.example.com:8080"
- result_table_name = "test"
+ plugin_output = "test"
}
}
```
diff --git a/docs/en/connector-v2/source/RocketMQ.md b/docs/en/connector-v2/source/RocketMQ.md
index 744f4c94ae8..eb8edc1c806 100644
--- a/docs/en/connector-v2/source/RocketMQ.md
+++ b/docs/en/connector-v2/source/RocketMQ.md
@@ -76,7 +76,7 @@ source {
Rocketmq {
name.srv.addr = "rocketmq-e2e:9876"
topics = "test_topic_json"
- result_table_name = "rocketmq_table"
+ plugin_output = "rocketmq_table"
schema = {
fields {
id = bigint
@@ -124,7 +124,7 @@ source {
Rocketmq {
name.srv.addr = "localhost:9876"
topics = "test_topic"
- result_table_name = "rocketmq_table"
+ plugin_output = "rocketmq_table"
start.mode = "CONSUME_FROM_FIRST_OFFSET"
batch.size = "400"
consumer.group = "test_topic_group"
diff --git a/docs/en/connector-v2/source/S3File.md b/docs/en/connector-v2/source/S3File.md
index d280d6dc7f2..b0e69cd1e36 100644
--- a/docs/en/connector-v2/source/S3File.md
+++ b/docs/en/connector-v2/source/S3File.md
@@ -196,7 +196,7 @@ If you assign file type to `parquet` `orc`, schema option not required, connecto
## Options
-| name | type | required | default value | Description |
+| name | type | required | default value | Description |
|---------------------------------|---------|----------|-------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| path | string | yes | - | The s3 path that needs to be read can have sub paths, but the sub paths need to meet certain format requirements. Specific requirements can be referred to "parse_partition_from_path" option |
| file_format_type | string | yes | - | File type, supported as the following file types: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` |
@@ -220,12 +220,67 @@ If you assign file type to `parquet` `orc`, schema option not required, connecto
| compress_codec | string | no | none | |
| archive_compress_codec | string | no | none | |
| encoding | string | no | UTF-8 | |
+| null_format                     | string  | no       | -                                                     | Only used when file_format_type is text. Defines which strings can be represented as null. e.g: `\N` |
+| file_filter_pattern             | string  | no       |                                                       | Filter pattern, used for filtering files. |
| common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](../source-common-options.md) for details. |
### delimiter/field_delimiter [string]
**delimiter** parameter will deprecate after version 2.3.5, please use **field_delimiter** instead.
+### file_filter_pattern [string]
+
+Filter pattern, used for filtering files.
+
+The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression.
+Here are some examples:
+
+File Structure Example:
+```
+/data/seatunnel/20241001/report.txt
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+/data/seatunnel/20241012/logo.png
+```
+Matching Rules Example:
+
+**Example 1**: *Match all .txt files*, Regular Expression:
+```
+/data/seatunnel/20241001/.*\.txt
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241001/report.txt
+```
+**Example 2**: *Match all files starting with abc*, Regular Expression:
+```
+/data/seatunnel/20241002/abc.*
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+```
+**Example 3**: *Match all files starting with abc whose fourth character is either h or g*, Regular Expression:
+```
+/data/seatunnel/20241007/abc[hg].*
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241007/abch202410.csv
+```
+**Example 4**: *Match third-level folders starting with 202410 and files ending with .csv*, Regular Expression:
+```
+/data/seatunnel/202410\d*/.*\.csv
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+```
+
### compress_codec [string]
The compress codec of files and the details that supported as the following shown:
@@ -245,6 +300,7 @@ The compress codec of archive files and the details that supported as the follow
| ZIP | txt,json,excel,xml | .zip |
| TAR | txt,json,excel,xml | .tar |
| TAR_GZ | txt,json,excel,xml | .tar.gz |
+| GZ | txt,json,xml | .gz |
| NONE | all | .* |
### encoding [string]
@@ -349,6 +405,33 @@ sink {
}
```
+### Filter File
+
+```hocon
+env {
+ parallelism = 1
+ job.mode = "BATCH"
+}
+
+source {
+ S3File {
+ path = "/seatunnel/json"
+ bucket = "s3a://seatunnel-test"
+ fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn"
+ fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider"
+ file_format_type = "json"
+ read_columns = ["id", "name"]
+ // file example abcD2024.csv
+ file_filter_pattern = "abc[DX]*.*"
+ }
+}
+
+sink {
+ Console {
+ }
+}
+```
+
## Changelog
### 2.3.0-beta 2022-10-20
diff --git a/docs/en/connector-v2/source/SftpFile.md b/docs/en/connector-v2/source/SftpFile.md
index 6d6ec5ea8db..f5b76ce3055 100644
--- a/docs/en/connector-v2/source/SftpFile.md
+++ b/docs/en/connector-v2/source/SftpFile.md
@@ -71,7 +71,7 @@ The File does not have a specific type list, and we can indicate which SeaTunnel
## Source Options
-| Name | Type | Required | default value | Description |
+| Name | Type | Required | default value | Description |
|---------------------------|---------|----------|---------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| host | String | Yes | - | The target sftp host is required |
| port | Int | Yes | - | The target sftp port is required |
@@ -94,8 +94,62 @@ The File does not have a specific type list, and we can indicate which SeaTunnel
| compress_codec | String | No | None | The compress codec of files and the details that supported as the following shown: - txt: `lzo` `None` - json: `lzo` `None` - csv: `lzo` `None` - orc: `lzo` `snappy` `lz4` `zlib` `None` - parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `None` Tips: excel type does Not support any compression format |
| archive_compress_codec | string | no | none |
| encoding | string | no | UTF-8 |
+| null_format               | string  | no       | -                   | Only used when file_format_type is text. Defines which strings can be represented as null. e.g: `\N` |
| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](../source-common-options.md) for details. |
+### file_filter_pattern [string]
+
+Filter pattern, used for filtering files.
+
+The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression.
+Here are some examples:
+
+File Structure Example:
+```
+/data/seatunnel/20241001/report.txt
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+/data/seatunnel/20241012/logo.png
+```
+Matching Rules Example:
+
+**Example 1**: *Match all .txt files*, Regular Expression:
+```
+/data/seatunnel/20241001/.*\.txt
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241001/report.txt
+```
+**Example 2**: *Match all files starting with abc*, Regular Expression:
+```
+/data/seatunnel/20241002/abc.*
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+```
+**Example 3**: *Match all files starting with abc whose fourth character is either h or g*, Regular Expression:
+```
+/data/seatunnel/20241007/abc[hg].*
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241007/abch202410.csv
+```
+**Example 4**: *Match third-level folders starting with 202410 and files ending with .csv*, Regular Expression:
+```
+/data/seatunnel/202410\d*/.*\.csv
+```
+The result of this example matching is:
+```
+/data/seatunnel/20241007/abch202410.csv
+/data/seatunnel/20241002/abcg202410.csv
+/data/seatunnel/20241005/old_data.csv
+```
+
### file_format_type [string]
File type, supported as the following file types:
@@ -182,11 +236,12 @@ The compress codec of files and the details that supported as the following show
The compress codec of archive files and the details that supported as the following shown:
| archive_compress_codec | file_format | archive_compress_suffix |
-|------------------------|--------------------|-------------------------|
-| ZIP | txt,json,excel,xml | .zip |
-| TAR | txt,json,excel,xml | .tar |
-| TAR_GZ | txt,json,excel,xml | .tar.gz |
-| NONE | all | .* |
+|--------------------|--------------------|---------------------|
+| ZIP | txt,json,excel,xml | .zip |
+| TAR | txt,json,excel,xml | .tar |
+| TAR_GZ | txt,json,excel,xml | .tar.gz |
+| GZ | txt,json,xml | .gz |
+| NONE | all | .* |
### encoding [string]
@@ -219,7 +274,7 @@ source {
password = pass
path = "tmp/seatunnel/read/json"
file_format_type = "json"
- result_table_name = "sftp"
+ plugin_output = "sftp"
schema = {
fields {
c_map = "map"
@@ -264,4 +319,71 @@ sink {
}
}
```
+### Multiple Table
+
+```hocon
+SftpFile {
+ tables_configs = [
+ {
+ schema {
+ table = "student"
+ fields {
+ name = string
+ age = int
+ }
+ }
+ path = "/tmp/seatunnel/sink/text"
+ host = "192.168.31.48"
+      port = 22
+ user = tyrantlucifer
+ password = tianchao
+ file_format_type = "parquet"
+ },
+ {
+ schema {
+ table = "teacher"
+ fields {
+ name = string
+ age = int
+ }
+ }
+ path = "/tmp/seatunnel/sink/text"
+ host = "192.168.31.48"
+      port = 22
+ user = tyrantlucifer
+ password = tianchao
+ file_format_type = "parquet"
+ }
+ ]
+}
+
+```
+
+### Filter File
+
+```hocon
+env {
+ parallelism = 1
+ job.mode = "BATCH"
+}
+
+source {
+ SftpFile {
+ host = "sftp"
+ port = 22
+ user = seatunnel
+ password = pass
+ path = "tmp/seatunnel/read/json"
+ file_format_type = "json"
+ plugin_output = "sftp"
+ // file example abcD2024.csv
+ file_filter_pattern = "abc[DX]*.*"
+ }
+}
+
+sink {
+ Console {
+ }
+}
+```
diff --git a/docs/en/connector-v2/source/SqlServer-CDC.md b/docs/en/connector-v2/source/SqlServer-CDC.md
index a64b3abfa88..8a3d8423748 100644
--- a/docs/en/connector-v2/source/SqlServer-CDC.md
+++ b/docs/en/connector-v2/source/SqlServer-CDC.md
@@ -63,13 +63,13 @@ describes how to setup the Sql Server CDC connector to run SQL queries against S
## Source Options
-| Name | Type | Required | Default | Description |
+| Name | Type | Required | Default | Description |
|------------------------------------------------|----------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| username | String | Yes | - | Name of the database to use when connecting to the database server. |
| password | String | Yes | - | Password to use when connecting to the database server. |
| database-names | List | Yes | - | Database name of the database to monitor. |
| table-names | List | Yes | - | Table name is a combination of schema name and table name (databaseName.schemaName.tableName). |
-| table-names-config | List | No | - | Table config list. for example: [{"table": "db1.schema1.table1","primaryKeys":["key1"]}] |
+| table-names-config | List | No | - | Table config list. for example: [{"table": "db1.schema1.table1","primaryKeys": ["key1"],"snapshotSplitColumn": "key2"}] |
| base-url | String | Yes | - | URL has to be with database, like "jdbc:sqlserver://localhost:1433;databaseName=test". |
| startup.mode | Enum | No | INITIAL | Optional startup mode for SqlServer CDC consumer, valid enumerations are "initial", "earliest", "latest" and "specific". |
| startup.timestamp | Long | No | - | Start from the specified epoch timestamp (in milliseconds). **Note, This option is required when** the **"startup.mode" option used `'timestamp'`.** |
@@ -141,7 +141,7 @@ env {
source {
# This is a example source plugin **only for test and demonstrate the feature source plugin**
SqlServer-CDC {
- result_table_name = "customers"
+ plugin_output = "customers"
username = "sa"
password = "Y.sa123456"
startup.mode="initial"
@@ -156,7 +156,7 @@ transform {
sink {
console {
- source_table_name = "customers"
+ plugin_input = "customers"
}
```
@@ -177,7 +177,7 @@ source {
SqlServer-CDC {
# Set up accurate one read
exactly_once=true
- result_table_name = "customers"
+ plugin_output = "customers"
username = "sa"
password = "Y.sa123456"
startup.mode="latest"
@@ -192,7 +192,7 @@ transform {
sink {
console {
- source_table_name = "customers"
+ plugin_input = "customers"
}
```
diff --git a/docs/en/connector-v2/source/StarRocks.md b/docs/en/connector-v2/source/StarRocks.md
index d46105cc9af..1c1a109480a 100644
--- a/docs/en/connector-v2/source/StarRocks.md
+++ b/docs/en/connector-v2/source/StarRocks.md
@@ -19,25 +19,25 @@ delivers the query plan as a parameter to BE nodes, and then obtains data result
## Options
-| name | type | required | default value |
-|-------------------------|--------|----------|-------------------|
-| node_urls | list | yes | - |
-| username | string | yes | - |
-| password | string | yes | - |
-| database | string | yes | - |
-| table | string | yes | - |
-| scan_filter | string | no | - |
-| schema | config | yes | - |
-| request_tablet_size | int | no | Integer.MAX_VALUE |
-| scan_connect_timeout_ms | int | no | 30000 |
-| scan_query_timeout_sec | int | no | 3600 |
-| scan_keep_alive_min | int | no | 10 |
-| scan_batch_rows | int | no | 1024 |
-| scan_mem_limit | long | no | 2147483648 |
-| max_retries | int | no | 3 |
-| scan.params.* | string | no | - |
-
-### node_urls [list]
+| name | type | required | default value |
+|--------------------------|--------|----------|-------------------|
+| nodeUrls | list | yes | - |
+| username | string | yes | - |
+| password | string | yes | - |
+| database | string | yes | - |
+| table | string | yes | - |
+| scan_filter | string | no | - |
+| schema | config | yes | - |
+| request_tablet_size | int | no | Integer.MAX_VALUE |
+| scan_connect_timeout_ms | int | no | 30000 |
+| scan_query_timeout_sec | int | no | 3600 |
+| scan_keep_alive_min | int | no | 10 |
+| scan_batch_rows | int | no | 1024 |
+| scan_mem_limit | long | no | 2147483648 |
+| max_retries | int | no | 3 |
+| scan.params.* | string | no | - |
+
+### nodeUrls [list]
`StarRocks` cluster address, the format is `["fe_ip:fe_http_port", ...]`
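+
+A minimal sketch of how `nodeUrls` might look in a source block; the host names, port and credentials below are placeholders:
+
+```hocon
+source {
+  StarRocks {
+    nodeUrls = ["starrocks-fe-1:8030", "starrocks-fe-2:8030"]
+    username = "root"
+    password = ""
+    database = "test_db"
+    table = "test_table"
+    schema = {
+      fields {
+        id = bigint
+        name = string
+      }
+    }
+  }
+}
+```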
diff --git a/docs/en/connector-v2/source/TDengine.md b/docs/en/connector-v2/source/TDengine.md
index a24744d5c17..26480c12354 100644
--- a/docs/en/connector-v2/source/TDengine.md
+++ b/docs/en/connector-v2/source/TDengine.md
@@ -78,7 +78,7 @@ source {
stable : "meters"
lower_bound : "2018-10-03 14:38:05.000"
upper_bound : "2018-10-03 14:38:16.800"
- result_table_name = "tdengine_result"
+ plugin_output = "tdengine_result"
}
}
```
diff --git a/docs/en/connector-v2/source/TiDB-CDC.md b/docs/en/connector-v2/source/TiDB-CDC.md
index ffa307f8bc1..1cce8ec3ac2 100644
--- a/docs/en/connector-v2/source/TiDB-CDC.md
+++ b/docs/en/connector-v2/source/TiDB-CDC.md
@@ -91,7 +91,7 @@ env {
source {
# This is a example source plugin **only for test and demonstrate the feature source plugin**
TiDB-CDC {
- result_table_name = "products_tidb_cdc"
+ plugin_output = "products_tidb_cdc"
base-url = "jdbc:mysql://tidb0:4000/inventory"
driver = "com.mysql.cj.jdbc.Driver"
tikv.grpc.timeout_in_ms = 20000
@@ -108,7 +108,7 @@ transform {
sink {
jdbc {
- source_table_name = "products_tidb_cdc"
+ plugin_input = "products_tidb_cdc"
url = "jdbc:mysql://tidb0:4000/inventory"
driver = "com.mysql.cj.jdbc.Driver"
user = "root"
diff --git a/docs/en/contribution/how-to-create-your-connector.md b/docs/en/contribution/how-to-create-your-connector.md
new file mode 100644
index 00000000000..b99bc85d999
--- /dev/null
+++ b/docs/en/contribution/how-to-create-your-connector.md
@@ -0,0 +1,3 @@
+# Develop Your Own Connector
+
+If you want to develop your own connector for the new SeaTunnel connector API (Connector V2), please check [here](https://github.com/apache/seatunnel/blob/dev/seatunnel-connectors-v2/README.md).
\ No newline at end of file
diff --git a/docs/en/contribution/setup.md b/docs/en/contribution/setup.md
index b2579e1ee1e..8fd632a24b0 100644
--- a/docs/en/contribution/setup.md
+++ b/docs/en/contribution/setup.md
@@ -80,7 +80,7 @@ After all the above things are done, you just finish the environment setup and c
of box. All examples are in module `seatunnel-examples`, you could pick one you are interested in, [Running Or Debugging
It In IDEA](https://www.jetbrains.com/help/idea/run-debug-configuration.html) as you wish.
-Here we use `seatunnel-examples/seatunnel-engine-examples/src/main/java/org/apache/seatunnel/example/engine/SeaTunnelEngineExample.java`
+Here we use `seatunnel-examples/seatunnel-engine-examples/src/main/java/org/apache/seatunnel/example/engine/SeaTunnelEngineLocalExample.java`
as an example, when you run it successfully you can see the output as below:
```log
diff --git a/docs/en/faq.md b/docs/en/faq.md
index 02c125ad4fd..6a4e838eaed 100644
--- a/docs/en/faq.md
+++ b/docs/en/faq.md
@@ -1,332 +1,116 @@
-# FAQs
+# FAQ
-## Why should I install a computing engine like Spark or Flink?
+## What data sources and destinations does SeaTunnel support?
+SeaTunnel supports various data sources and destinations. You can find the detailed lists below:
+- Supported data sources (Source): [Source List](https://seatunnel.apache.org/docs/connector-v2/source)
+- Supported data destinations (Sink): [Sink List](https://seatunnel.apache.org/docs/connector-v2/sink)
-SeaTunnel now uses computing engines such as Spark and Flink to complete resource scheduling and node communication, so we can focus on the ease of use of data synchronization and the development of high-performance components. But this is only temporary.
+## Does SeaTunnel support batch and streaming processing?
+SeaTunnel supports both batch and streaming processing modes. You can select the appropriate mode based on your specific business scenarios and needs. Batch processing is suitable for scheduled data integration tasks, while streaming processing is ideal for real-time integration and Change Data Capture (CDC).
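+
+As a minimal sketch, the mode is chosen in the `env` block of a job config; the two values shown in the comment are the supported modes:
+
+```hocon
+env {
+  parallelism = 1
+  # Use "BATCH" for scheduled one-off integration, "STREAMING" for real-time / CDC jobs
+  job.mode = "STREAMING"
+  # Checkpointing interval, mainly relevant for streaming jobs
+  checkpoint.interval = 5000
+}
+```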
-## I have a question, and I cannot solve it by myself
+## Is it necessary to install engines like Spark or Flink when using SeaTunnel?
+Spark and Flink are not mandatory. SeaTunnel supports Zeta, Spark, and Flink as integration engines, allowing you to choose one based on your needs. The community highly recommends Zeta, a new generation high-performance integration engine specifically designed for integration scenarios. Zeta is affectionately called "Ultraman Zeta" by community users! The community offers extensive support for Zeta, making it the most feature-rich option.
-I have encountered a problem when using SeaTunnel and I cannot solve it by myself. What should I do? First, search in [Issue List](https://github.com/apache/seatunnel/issues) or [Mailing List](https://lists.apache.org/list.html?dev@seatunnel.apache.org) to see if someone has already asked the same question and got an answer. If you cannot find an answer to your question, you can contact community members for help in [These Ways](https://github.com/apache/seatunnel#contact-us).
+## What data transformation functions does SeaTunnel provide?
+SeaTunnel supports multiple data transformation functions, including field mapping, data filtering, data format conversion, and more. You can implement data transformations through the `transform` module in the configuration file. For more details, refer to the SeaTunnel [Transform Documentation](https://seatunnel.apache.org/docs/transform-v2).
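+
+For instance, here is a minimal sketch of a `transform` block using the SQL transform; the table names and the query itself are illustrative placeholders:
+
+```hocon
+transform {
+  Sql {
+    plugin_input = "source_table"
+    plugin_output = "cleaned_table"
+    # Keep only adult users and normalize the name field
+    query = "select id, upper(name) as name, age from source_table where age >= 18"
+  }
+}
+```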
-## How do I declare a variable?
+## Can SeaTunnel support custom data cleansing rules?
+Yes, SeaTunnel supports custom data cleansing rules. You can configure custom rules in the `transform` module, such as cleaning up dirty data, removing invalid records, or converting fields.
-Do you want to know how to declare a variable in SeaTunnel's configuration, and then dynamically replace the value of the variable at runtime?
+## Does SeaTunnel support real-time incremental integration?
+SeaTunnel supports incremental data integration. For example, the CDC connector allows real-time capture of data changes, which is ideal for scenarios requiring real-time data integration.
-Since `v1.2.4`, SeaTunnel supports variable substitution in the configuration. This feature is often used for timing or non-timing offline processing to replace variables such as time and date. The usage is as follows:
+## What CDC data sources are currently supported by SeaTunnel?
+SeaTunnel currently supports MongoDB CDC, MySQL CDC, OpenGauss CDC, Oracle CDC, PostgreSQL CDC, SQL Server CDC, TiDB CDC, and more. For more details, refer to the [Source List](https://seatunnel.apache.org/docs/connector-v2/source).
-Configure the variable name in the configuration. Here is an example of sql transform (actually, anywhere in the configuration file the value in `'key = value'` can use the variable substitution):
+## How do I enable permissions required for SeaTunnel CDC integration?
+Please refer to the official SeaTunnel documentation for the necessary steps to enable permissions for each connector’s CDC functionality.
-```
-...
-transform {
- sql {
- query = "select * from user_view where city ='"${city}"' and dt = '"${date}"'"
- }
-}
-...
-```
+## Does SeaTunnel support CDC from MySQL replicas? How are logs pulled?
+Yes, SeaTunnel supports CDC from MySQL replicas by subscribing to the binlog, which is then parsed on the SeaTunnel server.
-Taking Spark Local mode as an example, the startup command is as follows:
+## Does SeaTunnel support CDC integration for tables without primary keys?
+SeaTunnel does not support CDC integration for tables without primary keys. The reason is that if two identical records exist in the upstream and one is deleted or modified, the downstream cannot determine which record to delete or modify, leading to potential issues. Primary keys are essential to ensure data uniqueness.
-```bash
-./bin/start-seatunnel-spark.sh \
--c ./config/your_app.conf \
--e client \
--m local[2] \
--i city=shanghai \
--i date=20190319
-```
+## Does SeaTunnel support automatic table creation?
+Before starting an integration task, you can select different handling schemes for existing table structures on the target side, controlled via the `schema_save_mode` parameter. Available options include:
+- **`RECREATE_SCHEMA`**: Creates the table if it does not exist; if the table exists, it is deleted and recreated.
+- **`CREATE_SCHEMA_WHEN_NOT_EXIST`**: Creates the table if it does not exist; skips creation if the table already exists.
+- **`ERROR_WHEN_SCHEMA_NOT_EXIST`**: Throws an error if the table does not exist.
+- **`IGNORE`**: Ignores table handling.
+ Many connectors currently support automatic table creation. Refer to the specific connector documentation, such as [Jdbc sink](https://seatunnel.apache.org/docs/2.3.8/connector-v2/sink/Jdbc#schema_save_mode-enum), for more information.
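+
+For illustration, here is a minimal, hedged sketch (the connection options are placeholders, not authoritative values) that writes a job config whose Jdbc sink recreates the target table before loading:
+
+```bash
+# Hypothetical sketch: generate a config snippet whose Jdbc sink drops and
+# recreates the target table before data is written.
+cat > /tmp/recreate_schema_sink.conf <<'EOF'
+sink {
+  Jdbc {
+    url = "jdbc:mysql://localhost:3306/test"
+    driver = "com.mysql.cj.jdbc.Driver"
+    user = "root"
+    password = "123456"
+    generate_sink_sql = true
+    database = "test"
+    table = "sink_table"
+    schema_save_mode = "RECREATE_SCHEMA"
+  }
+}
+EOF
+```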
-You can use the parameter `-i` or `--variable` followed by `key=value` to specify the value of the variable, where the key needs to be same as the variable name in the configuration.
+## Does SeaTunnel support handling existing data before starting a data integration task?
+Yes, you can specify different processing schemes for existing data on the target side before starting an integration task, controlled via the `data_save_mode` parameter. Available options include:
+- **`DROP_DATA`**: Retains the database structure but deletes the data.
+- **`APPEND_DATA`**: Retains both the database structure and data.
+- **`CUSTOM_PROCESSING`**: User-defined processing.
+- **`ERROR_WHEN_DATA_EXISTS`**: Throws an error if data already exists.
+ Many connectors support handling existing data; please refer to the respective connector documentation, such as [Jdbc sink](https://seatunnel.apache.org/docs/connector-v2/sink/Jdbc#data_save_mode-enum).
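+
+Similarly, a hedged sketch combining the table-structure and existing-data policies (all values are illustrative only): the sink below keeps an existing table but clears its rows before the new load.
+
+```bash
+# Hypothetical sketch: keep the table structure, delete old rows before loading.
+cat > /tmp/drop_data_sink.conf <<'EOF'
+sink {
+  Jdbc {
+    url = "jdbc:mysql://localhost:3306/test"
+    driver = "com.mysql.cj.jdbc.Driver"
+    user = "root"
+    password = "123456"
+    generate_sink_sql = true
+    database = "test"
+    table = "sink_table"
+    schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST"
+    data_save_mode = "DROP_DATA"
+  }
+}
+EOF
+```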
-## How do I write a configuration item in multi-line text in the configuration file?
+## Does SeaTunnel support exactly-once consistency?
+SeaTunnel supports exactly-once consistency for some data sources, such as MySQL and PostgreSQL, ensuring data consistency during integration. Note that exactly-once consistency depends on the capabilities of the underlying database.
-When a configured text is very long and you want to wrap it, you can use three double quotes to indicate its start and end:
+## Can SeaTunnel execute scheduled tasks?
+You can use Linux cron jobs to achieve periodic data integration, or leverage scheduling tools like Apache DolphinScheduler or Apache Airflow to manage complex scheduled tasks.
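+
+For example, here is a hedged crontab sketch (the paths and the `date` variable name are assumptions, adjust them to your deployment) that runs a batch job every night at 01:00 and passes the execution date into the config:
+
+```bash
+# Hypothetical crontab entry: run a SeaTunnel batch job daily at 01:00 and
+# pass the current date to the job config as the variable `date`.
+# Note: percent signs must be escaped as \% inside crontab entries.
+0 1 * * * /opt/seatunnel/bin/seatunnel.sh --config /opt/seatunnel/config/daily_sync.conf -m local -i date=$(date +\%Y\%m\%d) >> /opt/seatunnel/logs/daily_sync_cron.log 2>&1
+```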
-```
-var = """
- whatever you want
-"""
-```
+## I encountered an issue with SeaTunnel that I cannot resolve. What should I do?
+If you encounter issues with SeaTunnel, here are a few ways to get help:
+1. Search the [Issue List](https://github.com/apache/seatunnel/issues) or [Mailing List](https://lists.apache.org/list.html?dev@seatunnel.apache.org) to see if someone else has faced a similar issue.
+2. If you cannot find an answer, reach out to the community through [these methods](https://github.com/apache/seatunnel#contact-us).
-## How do I implement variable substitution for multi-line text?
+## How do I declare variables?
+Would you like to declare a variable in SeaTunnel's configuration and dynamically replace it at runtime? This feature is commonly used in both scheduled and ad-hoc offline processing to replace time, date, or other variables. Here's an example:
-It is a little troublesome to do variable substitution in multi-line text, because the variable cannot be included in three double quotation marks:
-
-```
-var = """
-your string 1
-"""${you_var}""" your string 2"""
-```
-
-Refer to: [lightbend/config#456](https://github.com/lightbend/config/issues/456).
-
-## Is SeaTunnel supported in Azkaban, Oozie, DolphinScheduler?
-
-Of course! See the screenshot below:
-
-![workflow.png](../images/workflow.png)
-
-![azkaban.png](../images/azkaban.png)
-
-## Does SeaTunnel have a case for configuring multiple sources, such as configuring elasticsearch and hdfs in source at the same time?
-
-```
-env {
- ...
-}
-
-source {
- hdfs { ... }
- elasticsearch { ... }
- jdbc {...}
-}
+Define the variable in the configuration. For example, in an SQL transformation (the value in any "key = value" pair in the configuration file can be replaced with variables):
+```plaintext
+...
transform {
- ...
-}
-
-sink {
- elasticsearch { ... }
-}
-```
-
-## Are there any HBase plugins?
-
-There is a HBase input plugin. You can download it from here: https://github.com/garyelephant/waterdrop-input-hbase .
-
-## How can I use SeaTunnel to write data to Hive?
-
-```
-env {
- spark.sql.catalogImplementation = "hive"
- spark.hadoop.hive.exec.dynamic.partition = "true"
- spark.hadoop.hive.exec.dynamic.partition.mode = "nonstrict"
-}
-
-source {
- sql = "insert into ..."
-}
-
-sink {
- // The data has been written to hive through the sql source. This is just a placeholder, it does not actually work.
- stdout {
- limit = 1
- }
-}
-```
-
-In addition, SeaTunnel has implemented a `Hive` output plugin after version `1.5.7` in `1.x` branch; in `2.x` branch. The Hive plugin for the Spark engine has been supported from version `2.0.5`: https://github.com/apache/seatunnel/issues/910.
-
-## How does SeaTunnel write multiple instances of ClickHouse to achieve load balancing?
-
-1. Write distributed tables directly (not recommended)
-
-2. Add a proxy or domain name (DNS) in front of multiple instances of ClickHouse:
-
- ```
- {
- output {
- clickhouse {
- host = "ck-proxy.xx.xx:8123"
- # Local table
- table = "table_name"
- }
- }
- }
- ```
-3. Configure multiple instances in the configuration:
-
- ```
- {
- output {
- clickhouse {
- host = "ck1:8123,ck2:8123,ck3:8123"
- # Local table
- table = "table_name"
- }
- }
- }
- ```
-4. Use cluster mode:
-
- ```
- {
- output {
- clickhouse {
- # Configure only one host
- host = "ck1:8123"
- cluster = "clickhouse_cluster_name"
- # Local table
- table = "table_name"
- }
- }
- }
- ```
-
-## How can I solve OOM when SeaTunnel consumes Kafka?
-
-In most cases, OOM is caused by not having a rate limit for consumption. The solution is as follows:
-
-For the current limit of Spark consumption of Kafka:
-
-1. Suppose the number of partitions of Kafka `Topic 1` you consume with KafkaStream = N.
-
-2. Assuming that the production speed of the message producer (Producer) of `Topic 1` is K messages/second, the speed of write messages to the partition must be uniform.
-
-3. Suppose that, after testing, it is found that the processing capacity of Spark Executor per core per second is M.
-
-The following conclusions can be drawn:
-
-1. If you want to make Spark's consumption of `Topic 1` keep up with its production speed, then you need `spark.executor.cores` * `spark.executor.instances` >= K / M
-
-2. When a data delay occurs, if you want the consumption speed not to be too fast, resulting in spark executor OOM, then you need to configure `spark.streaming.kafka.maxRatePerPartition` <= (`spark.executor.cores` * `spark.executor.instances`) * M / N
-
-3. In general, both M and N are determined, and the conclusion can be drawn from 2: The size of `spark.streaming.kafka.maxRatePerPartition` is positively correlated with the size of `spark.executor.cores` * `spark.executor.instances`, and it can be increased while increasing the resource `maxRatePerPartition` to speed up consumption.
-
-![Kafka](../images/kafka.png)
-
-## How can I solve the Error `Exception in thread "main" java.lang.NoSuchFieldError: INSTANCE`?
-
-The reason is that the version of httpclient.jar that comes with the CDH version of Spark is lower, and The httpclient version that ClickHouse JDBC is based on is 4.5.2, and the package versions conflict. The solution is to replace the jar package that comes with CDH with the httpclient-4.5.2 version.
-
-## The default JDK of my Spark cluster is JDK7. After I install JDK8, how can I specify that SeaTunnel starts with JDK8?
-
-In SeaTunnel's config file, specify the following configuration:
-
-```shell
-spark {
- ...
- spark.executorEnv.JAVA_HOME="/your/java_8_home/directory"
- spark.yarn.appMasterEnv.JAVA_HOME="/your/java_8_home/directory"
- ...
+ Sql {
+ query = "select * from user_view where city ='${city}' and dt = '${date}'"
+ }
}
+...
```
-## What should I do if OOM always appears when running SeaTunnel in Spark local[*] mode?
-
-If you run in local mode, you need to modify the `start-seatunnel.sh` startup script. After `spark-submit`, add a parameter `--driver-memory 4g` . Under normal circumstances, local mode is not used in the production environment. Therefore, this parameter generally does not need to be set during On YARN. See: [Application Properties](https://spark.apache.org/docs/latest/configuration.html#application-properties) for details.
-
-## Where can I place self-written plugins or third-party jdbc.jars to be loaded by SeaTunnel?
-
-Place the Jar package under the specified structure of the plugins directory:
+To start SeaTunnel in Zeta Local mode with variables:
```bash
-cd SeaTunnel
-mkdir -p plugins/my_plugins/lib
-cp third-part.jar plugins/my_plugins/lib
+$SEATUNNEL_HOME/bin/seatunnel.sh \
+-c $SEATUNNEL_HOME/config/your_app.conf \
+-m local[2] \
+-i city=Singapore \
+-i date=20231110
```
-`my_plugins` can be any string.
-
-## How do I configure logging-related parameters in SeaTunnel-V1(Spark)?
-
-There are three ways to configure logging-related parameters (such as Log Level):
-
-- [Not recommended] Change the default `$SPARK_HOME/conf/log4j.properties`.
- - This will affect all programs submitted via `$SPARK_HOME/bin/spark-submit`.
-- [Not recommended] Modify logging related parameters directly in the Spark code of SeaTunnel.
- - This is equivalent to hardcoding, and each change needs to be recompiled.
-- [Recommended] Use the following methods to change the logging configuration in the SeaTunnel configuration file (The change only takes effect if SeaTunnel >= 1.5.5 ):
-
- ```
- env {
- spark.driver.extraJavaOptions = "-Dlog4j.configuration=file:/log4j.properties"
- spark.executor.extraJavaOptions = "-Dlog4j.configuration=file:/log4j.properties"
- }
- source {
- ...
- }
- transform {
- ...
- }
- sink {
- ...
- }
- ```
-
-The contents of the log4j configuration file for reference are as follows:
-
-```
-$ cat log4j.properties
-log4j.rootLogger=ERROR, console
+Use the `-i` or `--variable` parameter with `key=value` to specify the variable's value, where `key` matches the variable name in the configuration. For details, see: [SeaTunnel Variable Configuration](https://seatunnel.apache.org/docs/concept/config)
-# set the log level for these components
-log4j.logger.org=ERROR
-log4j.logger.org.apache.spark=ERROR
-log4j.logger.org.spark-project=ERROR
-log4j.logger.org.apache.hadoop=ERROR
-log4j.logger.io.netty=ERROR
-log4j.logger.org.apache.zookeeper=ERROR
+## How can I write multi-line text in the configuration file?
+If the text is long and needs to be wrapped, you can use triple quotes to indicate the beginning and end:
-# add a ConsoleAppender to the logger stdout to write to the console
-log4j.appender.console=org.apache.log4j.ConsoleAppender
-log4j.appender.console.layout=org.apache.log4j.PatternLayout
-# use a simple message format
-log4j.appender.console.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
+```plaintext
+var = """
+Apache SeaTunnel is a
+next-generation high-performance,
+distributed, massive data integration tool.
+"""
```
-## How do I configure logging related parameters in SeaTunnel-V2(Spark, Flink)?
-
-Currently, they cannot be set directly. you need to modify the SeaTunnel startup script. The relevant parameters are specified in the task submission command. For specific parameters, please refer to the official documents:
-
-- Spark official documentation: http://spark.apache.org/docs/latest/configuration.html#configuring-logging
-- Flink official documentation: https://ci.apache.org/projects/flink/flink-docs-stable/monitoring/logging.html
-
-Reference:
-
-https://stackoverflow.com/questions/27781187/how-to-stop-info-messages-displaying-on-spark-console
-
-http://spark.apache.org/docs/latest/configuration.html#configuring-logging
-
-https://medium.com/@iacomini.riccardo/spark-logging-configuration-in-yarn-faf5ba5fdb01
-
-## How do I configure logging related parameters of SeaTunnel-E2E Test?
-
-The log4j configuration file of `seatunnel-e2e` existed in `seatunnel-e2e/seatunnel-e2e-common/src/test/resources/log4j2.properties`. You can modify logging related parameters directly in the configuration file.
-
-For example, if you want to output more detailed logs of E2E Test, just downgrade `rootLogger.level` in the configuration file.
-
-## Error when writing to ClickHouse: ClassCastException
-
-In SeaTunnel, the data type will not be actively converted. After the Input reads the data, the corresponding
-Schema. When writing ClickHouse, the field type needs to be strictly matched, and the mismatch needs to be resolved.
+## How do I perform variable substitution in multi-line text?
+Performing variable substitution in multi-line text can be tricky because variables cannot be enclosed within triple quotes:
-Data conversion can be achieved through the following two plugins:
-
-1. Filter Convert plugin
-2. Filter Sql plugin
-
-Detailed data type conversion reference: [ClickHouse Data Type Check List](https://interestinglab.github.io/seatunnel-docs/#/en/configuration/output-plugins/Clickhouse?id=clickhouse-data-type-check-list)
-
-Refer to issue:[#488](https://github.com/apache/seatunnel/issues/488) [#382](https://github.com/apache/seatunnel/issues/382).
-
-## How does SeaTunnel access kerberos-authenticated HDFS, YARN, Hive and other resources?
-
-Please refer to: [#590](https://github.com/apache/seatunnel/issues/590).
-
-## How do I troubleshoot NoClassDefFoundError, ClassNotFoundException and other issues?
-
-There is a high probability that there are multiple different versions of the corresponding Jar package class loaded in the Java classpath, because of the conflict of the load order, not because the Jar is really missing. Modify this SeaTunnel startup command, adding the following parameters to the spark-submit submission section, and debug in detail through the output log.
-
-```
-spark-submit --verbose
- ...
- --conf 'spark.driver.extraJavaOptions=-verbose:class'
- --conf 'spark.executor.extraJavaOptions=-verbose:class'
- ...
+```plaintext
+var = """
+your string 1
+"""${your_var}""" your string 2"""
```
-## I want to learn the source code of SeaTunnel. Where should I start?
-
-SeaTunnel has a completely abstract and structured code implementation, and many people have chosen SeaTunnel As a way to learn Spark. You can learn the source code from the main program entry: SeaTunnel.java
-
-## When SeaTunnel developers develop their own plugins, do they need to understand the SeaTunnel code? Should these plugins be integrated into the SeaTunnel project?
-
-The plugin developed by the developer has nothing to do with the SeaTunnel project and does not need to include your plugin code.
+For more details, see: [lightbend/config#456](https://github.com/lightbend/config/issues/456).
-The plugin can be completely independent from SeaTunnel project, so you can write it using Java, Scala, Maven, sbt, Gradle, or whatever you want. This is also the way we recommend developers to develop plugins.
-## When I import a project, the compiler has the exception "class not found `org.apache.seatunnel.shade.com.typesafe.config.Config`"
+## Where should I start if I want to learn SeaTunnel source code?
+SeaTunnel features a highly abstracted and well-structured architecture, making it an excellent choice for learning big data architecture. You can start by exploring and debugging the `seatunnel-examples` module: `SeaTunnelEngineLocalExample.java`. For more details, refer to the [SeaTunnel Contribution Guide](https://seatunnel.apache.org/docs/contribution/setup).
-Run `mvn install` first. In the `seatunnel-config/seatunnel-config-base` subproject, the package `com.typesafe.config` has been relocated to `org.apache.seatunnel.shade.com.typesafe.config` and installed to the maven local repository in the subproject `seatunnel-config/seatunnel-config-shade`.
+## Do I need to understand all of SeaTunnel’s source code if I want to develop my own source, sink, or transform?
+No, you only need to focus on the interfaces for source, sink, and transform. If you want to develop your own connector (Connector V2) for the SeaTunnel API, refer to the **[Connector Development Guide](https://github.com/apache/seatunnel/blob/dev/seatunnel-connectors-v2/README.md)**.
diff --git a/docs/en/other-engine/flink.md b/docs/en/other-engine/flink.md
index 8a77fbfc241..b6e7d6af77e 100644
--- a/docs/en/other-engine/flink.md
+++ b/docs/en/other-engine/flink.md
@@ -37,7 +37,7 @@ env {
source {
FakeSource {
row.num = 16
- result_table_name = "fake_table"
+ plugin_output = "fake_table"
schema = {
fields {
c_map = "map"
diff --git a/docs/en/seatunnel-engine/checkpoint-storage.md b/docs/en/seatunnel-engine/checkpoint-storage.md
index 7027f8067fb..19c617e0154 100644
--- a/docs/en/seatunnel-engine/checkpoint-storage.md
+++ b/docs/en/seatunnel-engine/checkpoint-storage.md
@@ -14,7 +14,7 @@ Checkpoint Storage is a storage mechanism for storing checkpoint data.
SeaTunnel Engine supports the following checkpoint storage types:
-- HDFS (OSS,S3,HDFS,LocalFile)
+- HDFS (OSS,COS,S3,HDFS,LocalFile)
- LocalFile (native), (it's deprecated: use Hdfs(LocalFile) instead.
We use the microkernel design pattern to separate the checkpoint storage module from the engine. This allows users to implement their own checkpoint storage modules.
@@ -73,6 +73,42 @@ For additional reading on the Hadoop Credential Provider API, you can see: [Cred
For Aliyun OSS Credential Provider implements, you can see: [Auth Credential Providers](https://github.com/aliyun/aliyun-oss-java-sdk/tree/master/src/main/java/com/aliyun/oss/common/auth)
+#### COS
+
+For Tencent COS based hdfs-file storage, you can refer to the [Hadoop COS Docs](https://hadoop.apache.org/docs/stable/hadoop-cos/cloud-storage/) to configure COS.
+
+Except when interacting with public COS buckets, the COS client needs the credentials required to interact with the buckets.
+The client supports multiple authentication mechanisms, and you can configure which mechanisms to use and their order of use. Custom implementations of com.qcloud.cos.auth.COSCredentialsProvider may also be used.
+If you use SimpleCredentialsProvider (the keys can be obtained from the Tencent Cloud API Key Management console), the credentials consist of an access key and a secret key.
+You can configure it like this:
+
+```yaml
+seatunnel:
+ engine:
+ checkpoint:
+ interval: 6000
+ timeout: 7000
+ storage:
+ type: hdfs
+ max-retained: 3
+ plugin-config:
+ storage.type: cos
+ cos.bucket: cosn://your-bucket
+ fs.cosn.credentials.provider: org.apache.hadoop.fs.cosn.auth.SimpleCredentialsProvider
+ fs.cosn.userinfo.secretId: your-secretId
+ fs.cosn.userinfo.secretKey: your-secretKey
+ fs.cosn.bucket.region: your-region
+```
+
+For additional reading on the Hadoop Credential Provider API, you can see: [Credential Provider API](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/CredentialProviderAPI.html).
+
+For additional COS configuration, you can see: [Tencent Hadoop-COS Docs](https://doc.fincloud.tencent.cn/tcloud/Storage/COS/846365/hadoop)
+
+Please add the following jars to the lib directory:
+- [hadoop-cos-3.4.1.jar](https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-cos/3.4.1)
+- [cos_api-bundle-5.6.69.jar](https://mvnrepository.com/artifact/com.qcloud/cos_api-bundle/5.6.69)
+- [hadoop-shaded-guava-1.1.1.jar](https://mvnrepository.com/artifact/org.apache.hadoop.thirdparty/hadoop-shaded-guava/1.1.1)
+
#### S3
S3 based hdfs-file you can refer [hadoop s3 docs](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) to config s3.
diff --git a/docs/en/seatunnel-engine/download-seatunnel.md b/docs/en/seatunnel-engine/download-seatunnel.md
index 48b5ed63a54..12b169e482c 100644
--- a/docs/en/seatunnel-engine/download-seatunnel.md
+++ b/docs/en/seatunnel-engine/download-seatunnel.md
@@ -20,7 +20,7 @@ Go to the [Seatunnel Download Page](https://seatunnel.apache.org/download) to do
Or you can also download it through the terminal.
```shell
-export version="2.3.8"
+export version="2.3.9"
wget "https://archive.apache.org/dist/seatunnel/${version}/apache-seatunnel-${version}-bin.tar.gz"
tar -xzvf "apache-seatunnel-${version}-bin.tar.gz"
```
@@ -33,10 +33,10 @@ Starting from the 2.2.0-beta version, the binary package no longer provides the
sh bin/install-plugin.sh
```
-If you need a specific connector version, taking 2.3.8 as an example, you need to execute the following command.
+If you need a specific connector version, taking 2.3.9 as an example, you need to execute the following command.
```bash
-sh bin/install-plugin.sh 2.3.8
+sh bin/install-plugin.sh 2.3.9
```
Usually you don't need all the connector plugins, so you can specify the plugins you need through configuring `config/plugin_config`, for example, if you only need the `connector-console` plugin, then you can modify the plugin.properties configuration file as follows.
diff --git a/docs/en/seatunnel-engine/hybrid-cluster-deployment.md b/docs/en/seatunnel-engine/hybrid-cluster-deployment.md
index ebada38957f..ac072c494df 100644
--- a/docs/en/seatunnel-engine/hybrid-cluster-deployment.md
+++ b/docs/en/seatunnel-engine/hybrid-cluster-deployment.md
@@ -43,7 +43,7 @@ Therefore, the SeaTunnel Engine can implement cluster HA without using other ser
`backup count` is a parameter that defines the number of synchronous backups. For example, if it is set to 1, the backup of the partition will be placed on one other member. If it is set to 2, it will be placed on two other members.
-We recommend that the value of `backup count` be `min(1, max(5, N/2))`. `N` is the number of cluster nodes.
+We recommend that the value of `backup count` be `max(1, min(5, N/2))`. `N` is the number of cluster nodes.
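+For example, in a 6-node cluster this gives `max(1, min(5, 6/2)) = 3` synchronous backups.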
```yaml
seatunnel:
@@ -127,7 +127,7 @@ seatunnel:
This configuration primarily addresses the issue of resource leakage caused by constantly creating and attempting to destroy the class loader.
If you encounter exceptions related to metaspace overflow, you can try enabling this configuration.
To reduce the frequency of class loader creation, after enabling this configuration, SeaTunnel will not attempt to release the corresponding class loader when a job is completed, allowing it to be used by subsequent jobs. This is more effective when the number of Source/Sink connectors used in the running job is not excessive.
-The default value is false.
+The default value is true.
Example
```yaml
@@ -136,6 +136,24 @@ seatunnel:
classloader-cache-mode: true
```
+### 4.6 Job Scheduling Strategy
+
+When resources are insufficient, the job scheduling strategy can be configured in the following two modes:
+
+1. `WAIT`: Wait for resources to be available.
+
+2. `REJECT`: Reject the job (default value).
+
+Example
+
+```yaml
+seatunnel:
+ engine:
+ job-schedule-strategy: WAIT
+```
+
+When `dynamic-slot: true` is used, the `job-schedule-strategy: WAIT` configuration becomes invalid and is forcibly changed to `job-schedule-strategy: REJECT`, because this parameter is meaningless when dynamic slots are enabled.
+
## 5. Configure The SeaTunnel Engine Network Service
All SeaTunnel Engine network-related configurations are in the `hazelcast.yaml` file.
@@ -319,4 +337,4 @@ Now that the cluster is deployed, you can complete the submission and management
### 8.2 Submit Jobs With The REST API
-The SeaTunnel Engine provides a REST API for submitting and managing jobs. For more information, please refer to [REST API](rest-api.md)
\ No newline at end of file
+The SeaTunnel Engine provides a REST API for submitting and managing jobs. For more information, please refer to [REST API V2](rest-api-v2.md)
diff --git a/docs/en/seatunnel-engine/local-mode-deployment.md b/docs/en/seatunnel-engine/local-mode-deployment.md
index 92df4220b68..5418477c523 100644
--- a/docs/en/seatunnel-engine/local-mode-deployment.md
+++ b/docs/en/seatunnel-engine/local-mode-deployment.md
@@ -27,6 +27,16 @@ In this mode, you only need to copy the downloaded and created installation pack
$SEATUNNEL_HOME/bin/seatunnel.sh --config $SEATUNNEL_HOME/config/v2.batch.config.template -m local
```
+### Configure The JVM Options For Local Mode
+
+Local Mode supports two methods for setting JVM options:
+
+1. Add the JVM options to `$SEATUNNEL_HOME/config/jvm_client_options`.
+
+ Modify the JVM parameters in the `$SEATUNNEL_HOME/config/jvm_client_options` file. Please note that the JVM parameters in this file will be applied to all jobs submitted using `seatunnel.sh`, including Local Mode and Cluster Mode.
+
+2. Add JVM options when starting the Local Mode. For example, `$SEATUNNEL_HOME/bin/seatunnel.sh --config $SEATUNNEL_HOME/config/v2.batch.config.template -m local -DJvmOption="-Xms2G -Xmx2G"`
+
## Job Operations
Jobs submitted in local mode will run in the process that submitted the job, and the process will exit when the job is completed. If you want to abort the job, you only need to exit the process that submitted the job. The job's runtime logs will be output to the standard output of the process that submitted the job.
diff --git a/docs/en/seatunnel-engine/logging.md b/docs/en/seatunnel-engine/logging.md
index 7c827887b82..be0bc12f0a2 100644
--- a/docs/en/seatunnel-engine/logging.md
+++ b/docs/en/seatunnel-engine/logging.md
@@ -30,7 +30,7 @@ The MDC is propagated by slf4j to the logging backend which usually adds it to t
Log4j 2 is controlled using property files.
-The SeaTunnel Engine distribution ships with the following log4j properties files in the `confing` directory, which are used automatically if Log4j 2 is enabled:
+The SeaTunnel Engine distribution ships with the following log4j properties files in the `config` directory, which are used automatically if Log4j 2 is enabled:
- `log4j2_client.properties`: used by the command line client (e.g., `seatunnel.sh`)
- `log4j2.properties`: used for SeaTunnel Engine server processes (e.g., `seatunnel-cluster.sh`)
@@ -80,6 +80,36 @@ appender.file.layout.pattern = [%X{ST-JID}] %d{yyyy-MM-dd HH:mm:ss,SSS} %-5p [%-
SeaTunnel Engine automatically integrates Log framework bridge, allowing existing applications that work against Log4j1/Logback classes to continue working.
+### Query Logs via REST API
+
+SeaTunnel provides an API for querying logs.
+
+**Usage examples:**
+- Retrieve logs for all nodes with `jobId` of `733584788375666689`: `http://localhost:8080/logs/733584788375666689`
+- Retrieve the log list for all nodes: `http://localhost:8080/logs`
+- Retrieve the log list for all nodes in JSON format: `http://localhost:8080/logs?format=json`
+- Retrieve log file content: `http://localhost:8080/logs/job-898380162133917698.log`
+
+For more details, please refer to the [REST-API](rest-api-v2.md).
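+
+As a quick, hedged example (assuming the REST service listens on port 8080, as in the URLs above), you can fetch the log list with curl:
+
+```bash
+# Hypothetical invocation: list the log files of all nodes in JSON format.
+curl "http://localhost:8080/logs?format=json"
+```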
+
+## SeaTunnel Log Configuration
+
+### Scheduled deletion of old logs
+
+SeaTunnel supports scheduled deletion of old log files to prevent disk space exhaustion. You can add the following configuration in the `seatunnel.yml` file:
+
+```yaml
+seatunnel:
+ engine:
+ history-job-expire-minutes: 1440
+ telemetry:
+ logs:
+ scheduled-deletion-enable: true
+```
+
+- `history-job-expire-minutes`: Sets the retention time for historical job data and logs (in minutes). The system will automatically clear expired job information and log files after the specified period.
+- `scheduled-deletion-enable`: Enables scheduled cleanup, with a default value of `true`. The system automatically deletes the relevant log files once the job expiration time defined by `history-job-expire-minutes` is reached. If this feature is disabled, logs remain on disk permanently and must be managed manually, which may affect disk space usage. It is recommended to configure this setting based on your needs.
+
## Best practices for developers
You can create an SLF4J logger by calling `org.slf4j.LoggerFactory#LoggerFactory.getLogger` with the Class of your class as an argument.
diff --git a/docs/en/seatunnel-engine/resource-isolation.md b/docs/en/seatunnel-engine/resource-isolation.md
index 5b9e1ff0ba0..4b68401ee15 100644
--- a/docs/en/seatunnel-engine/resource-isolation.md
+++ b/docs/en/seatunnel-engine/resource-isolation.md
@@ -4,46 +4,46 @@ sidebar_position: 9
# Resource Isolation
-After version 2.3.6. SeaTunnel can add `tag` to each worker node, when you submit job you can use `tag_filter` to filter the node you want run this job.
+SeaTunnel can add a `tag` to each worker node. When you submit a job, you can use `tag_filter` to select the nodes on which the job will run.
-## How To Archive This:
+## Configuration
1. update the config in `hazelcast.yaml`,
-```yaml
-hazelcast:
- cluster-name: seatunnel
- network:
- rest-api:
- enabled: true
- endpoint-groups:
- CLUSTER_WRITE:
+ ```yaml
+ hazelcast:
+ cluster-name: seatunnel
+ network:
+ rest-api:
enabled: true
- DATA:
- enabled: true
- join:
- tcp-ip:
- enabled: true
- member-list:
- - localhost
- port:
- auto-increment: false
- port: 5801
- properties:
- hazelcast.invocation.max.retry.count: 20
- hazelcast.tcp.join.port.try.count: 30
- hazelcast.logging.type: log4j2
- hazelcast.operation.generic.thread.count: 50
- member-attributes:
- group:
- type: string
- value: platform
- team:
- type: string
- value: team1
-```
-
-In this config, we specify the tag by `member-attributes`, the node has `group=platform, team=team1` tags.
+ endpoint-groups:
+ CLUSTER_WRITE:
+ enabled: true
+ DATA:
+ enabled: true
+ join:
+ tcp-ip:
+ enabled: true
+ member-list:
+ - localhost
+ port:
+ auto-increment: false
+ port: 5801
+ properties:
+ hazelcast.invocation.max.retry.count: 20
+ hazelcast.tcp.join.port.try.count: 30
+ hazelcast.logging.type: log4j2
+ hazelcast.operation.generic.thread.count: 50
+ member-attributes:
+ group:
+ type: string
+ value: platform
+ team:
+ type: string
+ value: team1
+ ```
+
+ In this config, we specify the tags via `member-attributes`; the node has the `group=platform` and `team=team1` tags.
2. add `tag_filter` to your job config
@@ -58,7 +58,7 @@ env {
}
source {
FakeSource {
- result_table_name = "fake"
+ plugin_output = "fake"
parallelism = 1
schema = {
fields {
@@ -71,18 +71,18 @@ transform {
}
sink {
console {
- source_table_name="fake"
+ plugin_input="fake"
}
}
```
-**Notice:**
-- If not set `tag_filter` in job config, it will random choose the node in all active nodes.
-- When you add multiple tag in `tag_filter`, it need all key exist and value match. if all node not match, you will get `NoEnoughResourceException` exception.
+ **Notice:**
+ - If `tag_filter` is not set in the job config, a node will be chosen at random from all active nodes.
+ - When you add multiple tags in `tag_filter`, all keys must exist and their values must match. If no node matches, you will get a `NoEnoughResourceException`.
-![img.png](../../images/resource-isolation.png)
+ ![img.png](../../images/resource-isolation.png)
3. update running node tags by rest api (optional)
-for more information, please refer to [Update the tags of running node](https://seatunnel.apache.org/docs/seatunnel-engine/rest-api/)
+ For more information, please refer to [Update the tags of running node](rest-api-v2.md).
diff --git a/docs/en/seatunnel-engine/rest-api.md b/docs/en/seatunnel-engine/rest-api-v1.md
similarity index 81%
rename from docs/en/seatunnel-engine/rest-api.md
rename to docs/en/seatunnel-engine/rest-api-v1.md
index a1ddc76e539..8859faa32f1 100644
--- a/docs/en/seatunnel-engine/rest-api.md
+++ b/docs/en/seatunnel-engine/rest-api-v1.md
@@ -2,7 +2,13 @@
sidebar_position: 11
---
-# RESTful API
+# RESTful API V1
+
+:::caution warn
+
+It is recommended to use the v2 version of the Rest API. The v1 version is deprecated and will be removed in the future.
+
+:::
SeaTunnel has a monitoring API that can be used to query status and statistics of running jobs, as well as recent
completed jobs. The monitoring API is a RESTful API that accepts HTTP requests and responds with JSON data.
@@ -115,10 +121,19 @@ network:
},
"createTime": "",
"jobDag": {
- "vertices": [
+ "jobId": "",
+ "envOptions": [],
+ "vertexInfoMap": [
+ {
+ "vertexId": 1,
+ "type": "",
+ "vertexName": "",
+ "tablePaths": [
+ ""
+ ]
+ }
],
- "edges": [
- ]
+ "pipelineEdges": {}
},
"pluginJarsUrls": [
],
@@ -155,10 +170,19 @@ network:
"jobStatus": "",
"createTime": "",
"jobDag": {
- "vertices": [
+ "jobId": "",
+ "envOptions": [],
+ "vertexInfoMap": [
+ {
+ "vertexId": 1,
+ "type": "",
+ "vertexName": "",
+ "tablePaths": [
+ ""
+ ]
+ }
],
- "edges": [
- ]
+ "pipelineEdges": {}
},
"metrics": {
"sourceReceivedCount": "",
@@ -212,10 +236,19 @@ This API has been deprecated, please use /hazelcast/rest/maps/job-info/:jobId in
"jobStatus": "",
"createTime": "",
"jobDag": {
- "vertices": [
+ "jobId": "",
+ "envOptions": [],
+ "vertexInfoMap": [
+ {
+ "vertexId": 1,
+ "type": "",
+ "vertexName": "",
+ "tablePaths": [
+ ""
+ ]
+ }
],
- "edges": [
- ]
+ "pipelineEdges": {}
},
"metrics": {
"SourceReceivedCount": "",
@@ -283,7 +316,21 @@ When we can't get the job info, the response will be:
"errorMsg": null,
"createTime": "",
"finishTime": "",
- "jobDag": "",
+ "jobDag": {
+ "jobId": "",
+ "envOptions": [],
+ "vertexInfoMap": [
+ {
+ "vertexId": 1,
+ "type": "",
+ "vertexName": "",
+ "tablePaths": [
+ ""
+ ]
+ }
+ ],
+ "pipelineEdges": {}
+ },
"metrics": ""
}
]
@@ -384,7 +431,7 @@ When we can't get the job info, the response will be:
"source": [
{
"plugin_name": "FakeSource",
- "result_table_name": "fake",
+ "plugin_output": "fake",
"row.num": 100,
"schema": {
"fields": {
@@ -400,7 +447,7 @@ When we can't get the job info, the response will be:
"sink": [
{
"plugin_name": "Console",
- "source_table_name": ["fake"]
+ "plugin_input": ["fake"]
}
]
}
@@ -447,7 +494,7 @@ When we can't get the job info, the response will be:
"source": [
{
"plugin_name": "FakeSource",
- "result_table_name": "fake",
+ "plugin_output": "fake",
"row.num": 1000,
"schema": {
"fields": {
@@ -463,7 +510,7 @@ When we can't get the job info, the response will be:
"sink": [
{
"plugin_name": "Console",
- "source_table_name": ["fake"]
+ "plugin_input": ["fake"]
}
]
},
@@ -478,7 +525,7 @@ When we can't get the job info, the response will be:
"source": [
{
"plugin_name": "FakeSource",
- "result_table_name": "fake",
+ "plugin_output": "fake",
"row.num": 1000,
"schema": {
"fields": {
@@ -494,7 +541,7 @@ When we can't get the job info, the response will be:
"sink": [
{
"plugin_name": "Console",
- "source_table_name": ["fake"]
+ "plugin_input": ["fake"]
}
]
}
@@ -603,7 +650,7 @@ For more information about customize encryption, please refer to the documentati
"age": "int"
}
},
- "result_table_name": "fake",
+ "plugin_output": "fake",
"parallelism": 1,
"hostname": "127.0.0.1",
"username": "seatunnel",
@@ -643,7 +690,7 @@ For more information about customize encryption, please refer to the documentati
"age": "int"
}
},
- "result_table_name": "fake",
+ "plugin_output": "fake",
"parallelism": 1,
"hostname": "127.0.0.1",
"username": "c2VhdHVubmVs",
@@ -729,3 +776,70 @@ If the parameter is an empty `Map` object, it means that the tags of the current
```
+------------------------------------------------------------------------------------------
+
+### Get All Node Log Content
+
+
+ GET/hazelcast/rest/maps/logs/:jobId(Returns a list of logs.)
+
+#### Request Parameters
+
+#### Parameters (Add in the `params` field of the request body)
+
+> | Parameter Name | Required | Type | Description |
+> |----------------------|------------|---------|---------------------------------|
+> | jobId | optional | string | job id |
+
+When `jobId` is empty, it returns log information for all nodes; otherwise, it returns the log list of the specified `jobId` across all nodes.
+
+#### Response
+
+Returns a list of logs and content from the requested nodes.
+
+#### Get All Log Files List
+
+If you'd like to view the log list first, you can use a `GET` request to retrieve the log list:
+`http://localhost:5801/hazelcast/rest/maps/logs?format=json`
+
+```json
+[
+ {
+ "node": "localhost:5801",
+ "logLink": "http://localhost:5801/hazelcast/rest/maps/logs/job-899485770241277953.log",
+ "logName": "job-899485770241277953.log"
+ },
+ {
+ "node": "localhost:5801",
+ "logLink": "http://localhost:5801/hazelcast/rest/maps/logs/job-899470314109468673.log",
+ "logName": "job-899470314109468673.log"
+ }
+]
+```
+
+The supported formats are `json` and `html`, with `html` as the default.
+
+#### Examples
+
+- Retrieve logs for all nodes with the `jobId` of `733584788375666689`: `http://localhost:5801/hazelcast/rest/maps/logs/733584788375666689`
+- Retrieve the log list for all nodes: `http://localhost:5801/hazelcast/rest/maps/logs`
+- Retrieve the log list for all nodes in JSON format: `http://localhost:5801/hazelcast/rest/maps/logs?format=json`
+- Retrieve log file content: `http://localhost:5801/hazelcast/rest/maps/logs/job-898380162133917698.log`
+
+
+
+### Get Log Content from a Single Node
+
+
+ GET/hazelcast/rest/maps/log(Returns a list of logs.)
+
+#### Response
+
+Returns a list of logs from the requested node.
+
+#### Examples
+
+- To get a list of logs from the current node: `http://localhost:5801/hazelcast/rest/maps/log`
+- To get the content of a log file: `http://localhost:5801/hazelcast/rest/maps/log/job-898380162133917698.log`
+
+
\ No newline at end of file
diff --git a/docs/en/seatunnel-engine/rest-api-v2.md b/docs/en/seatunnel-engine/rest-api-v2.md
new file mode 100644
index 00000000000..8a5e3a8d7d3
--- /dev/null
+++ b/docs/en/seatunnel-engine/rest-api-v2.md
@@ -0,0 +1,847 @@
+---
+sidebar_position: 12
+---
+
+# RESTful API V2
+
+SeaTunnel has a monitoring API that can be used to query status and statistics of running jobs, as well as recent
+completed jobs. The monitoring API is a RESTful API that accepts HTTP requests and responds with JSON data.
+
+## Overview
+
+The v2 version of the API is implemented with Jetty. It follows the same interface specification as the v1 version.
+You can specify the port and context-path by modifying the configuration items in `seatunnel.yaml`.
+You can set `enable-dynamic-port` to enable dynamic ports (ports are assigned incrementally starting from `port`); it is disabled by default.
+If `enable-dynamic-port` is `true`, an unused port between `port` and `port` + `port-range` will be used. The default range is 100.
+
+```yaml
+
+seatunnel:
+ engine:
+ http:
+ enable-http: true
+ port: 8080
+ enable-dynamic-port: false
+ port-range: 100
+```
+
+Context-path can also be configured as follows:
+
+```yaml
+
+seatunnel:
+ engine:
+ http:
+ enable-http: true
+ port: 8080
+ context-path: /seatunnel
+```
+
+## API reference
+
+### Returns an overview over the Zeta engine cluster.
+
+
+ GET/overview?tag1=value1&tag2=value2(Returns an overview over the Zeta engine cluster.)
+
+#### Parameters
+
+> | name | type | data type | description |
+> |----------|----------|-----------|------------------------------------------------------------------------------------------------------|
+> | tag_name | optional | string | the tags filter, you can add tag filter to get those matched worker count, and slot on those workers |
+
+#### Responses
+
+```json
+{
+ "projectVersion":"2.3.5-SNAPSHOT",
+ "gitCommitAbbrev":"DeadD0d0",
+ "totalSlot":"0",
+ "unassignedSlot":"0",
+ "works":"1",
+ "runningJobs":"0",
+ "finishedJobs":"0",
+ "failedJobs":"0",
+ "cancelledJobs":"0"
+}
+```
+
+**Notes:**
+- If you use `dynamic-slot`, `totalSlot` and `unassignedSlot` will always be `0`. When you set a fixed slot number, the correct total and unassigned slot numbers will be returned.
+- If the URL has a tag filter, `works`, `totalSlot` and `unassignedSlot` will return the results for the matched workers, but the job-related metrics will always return cluster-level information.
+
+
+
+------------------------------------------------------------------------------------------
+
+### Returns An Overview And State Of All Jobs
+
+
+ GET/running-jobs(Returns an overview over all jobs and their current state.)
+
+#### Parameters
+
+#### Responses
+
+```json
+[
+ {
+ "jobId": "",
+ "jobName": "",
+ "jobStatus": "",
+ "envOptions": {
+ },
+ "createTime": "",
+ "jobDag": {
+ "jobId": "",
+ "envOptions": [],
+ "vertexInfoMap": [
+ {
+ "vertexId": 1,
+ "type": "",
+ "vertexName": "",
+ "tablePaths": [
+ ""
+ ]
+ }
+ ],
+ "pipelineEdges": {}
+ },
+ "pluginJarsUrls": [
+ ],
+ "isStartWithSavePoint": false,
+ "metrics": {
+ "sourceReceivedCount": "",
+ "sinkWriteCount": ""
+ }
+ }
+]
+```
+
+
+
+------------------------------------------------------------------------------------------
+
+### Return Details Of A Job
+
+
+ GET/job-info/:jobId(Return details of a job. )
+
+#### Parameters
+
+> | name | type | data type | description |
+> |-------|----------|-----------|-------------|
+> | jobId | required | long | job id |
+
+#### Responses
+
+```json
+{
+ "jobId": "",
+ "jobName": "",
+ "jobStatus": "",
+ "createTime": "",
+ "jobDag": {
+ "jobId": "",
+ "envOptions": [],
+ "vertexInfoMap": [
+ {
+ "vertexId": 1,
+ "type": "",
+ "vertexName": "",
+ "tablePaths": [
+ ""
+ ]
+ }
+ ],
+ "pipelineEdges": {}
+ },
+ "metrics": {
+ "sourceReceivedCount": "",
+ "sinkWriteCount": ""
+ },
+ "finishedTime": "",
+ "errorMsg": null,
+ "envOptions": {
+ },
+ "pluginJarsUrls": [
+ ],
+ "isStartWithSavePoint": false
+}
+```
+
+`jobId`, `jobName`, `jobStatus`, `createTime`, `jobDag`, `metrics` are always returned.
+`envOptions`, `pluginJarsUrls`, `isStartWithSavePoint` are returned while the job is running.
+`finishedTime`, `errorMsg` are returned when the job is finished.
+
+When we can't get the job info, the response will be:
+
+```json
+{
+ "jobId" : ""
+}
+```
+
+
+
+------------------------------------------------------------------------------------------
+
+### Return Details Of A Job
+
+This API has been deprecated, please use /job-info/:jobId instead
+
+
+ GET/running-job/:jobId(Return details of a job. )
+
+#### Parameters
+
+> | name | type | data type | description |
+> |-------|----------|-----------|-------------|
+> | jobId | required | long | job id |
+
+#### Responses
+
+```json
+{
+ "jobId": "",
+ "jobName": "",
+ "jobStatus": "",
+ "createTime": "",
+ "jobDag": {
+ "jobId": "",
+ "envOptions": [],
+ "vertexInfoMap": [
+ {
+ "vertexId": 1,
+ "type": "",
+ "vertexName": "",
+ "tablePaths": [
+ ""
+ ]
+ }
+ ],
+ "pipelineEdges": {}
+ },
+ "metrics": {
+ "SourceReceivedCount": "",
+ "SourceReceivedQPS": "",
+ "SourceReceivedBytes": "",
+ "SourceReceivedBytesPerSeconds": "",
+ "SinkWriteCount": "",
+ "SinkWriteQPS": "",
+ "SinkWriteBytes": "",
+ "SinkWriteBytesPerSeconds": "",
+ "TableSourceReceivedCount": {},
+ "TableSourceReceivedBytes": {},
+ "TableSourceReceivedBytesPerSeconds": {},
+ "TableSourceReceivedQPS": {},
+ "TableSinkWriteCount": {},
+ "TableSinkWriteQPS": {},
+ "TableSinkWriteBytes": {},
+ "TableSinkWriteBytesPerSeconds": {}
+ },
+ "finishedTime": "",
+ "errorMsg": null,
+ "envOptions": {
+ },
+ "pluginJarsUrls": [
+ ],
+ "isStartWithSavePoint": false
+}
+```
+
+`jobId`, `jobName`, `jobStatus`, `createTime`, `jobDag`, `metrics` are always returned.
+`envOptions`, `pluginJarsUrls`, `isStartWithSavePoint` are returned while the job is running.
+`finishedTime`, `errorMsg` are returned when the job is finished.
+
+When we can't get the job info, the response will be:
+
+```json
+{
+ "jobId" : ""
+}
+```
+
+
+
+------------------------------------------------------------------------------------------
+
+### Return All Finished Jobs Info
+
+
+ GET/finished-jobs/:state(Return all finished Jobs Info.)
+
+#### Parameters
+
+> | name | type | data type | description |
+> |-------|----------|-----------|------------------------------------------------------------------|
+> | state | optional | string | finished job status. `FINISHED`,`CANCELED`,`FAILED`,`UNKNOWABLE` |
+
+#### Responses
+
+```json
+[
+ {
+ "jobId": "",
+ "jobName": "",
+ "jobStatus": "",
+ "errorMsg": null,
+ "createTime": "",
+ "finishTime": "",
+ "jobDag": {
+ "jobId": "",
+ "envOptions": [],
+ "vertexInfoMap": [
+ {
+ "vertexId": 1,
+ "type": "",
+ "vertexName": "",
+ "tablePaths": [
+ ""
+ ]
+ }
+ ],
+ "pipelineEdges": {}
+ },
+ "metrics": ""
+ }
+]
+```
+
+
+
+------------------------------------------------------------------------------------------
+
+### Returns System Monitoring Information
+
+
+ GET/system-monitoring-information(Returns system monitoring information.)
+
+#### Parameters
+
+#### Responses
+
+```json
+[
+ {
+ "processors":"8",
+ "physical.memory.total":"16.0G",
+ "physical.memory.free":"16.3M",
+ "swap.space.total":"0",
+ "swap.space.free":"0",
+ "heap.memory.used":"135.7M",
+ "heap.memory.free":"440.8M",
+ "heap.memory.total":"576.5M",
+ "heap.memory.max":"3.6G",
+ "heap.memory.used/total":"23.54%",
+ "heap.memory.used/max":"3.73%",
+ "minor.gc.count":"6",
+ "minor.gc.time":"110ms",
+ "major.gc.count":"2",
+ "major.gc.time":"73ms",
+ "load.process":"24.78%",
+ "load.system":"60.00%",
+ "load.systemAverage":"2.07",
+ "thread.count":"117",
+ "thread.peakCount":"118",
+ "cluster.timeDiff":"0",
+ "event.q.size":"0",
+ "executor.q.async.size":"0",
+ "executor.q.client.size":"0",
+ "executor.q.client.query.size":"0",
+ "executor.q.client.blocking.size":"0",
+ "executor.q.query.size":"0",
+ "executor.q.scheduled.size":"0",
+ "executor.q.io.size":"0",
+ "executor.q.system.size":"0",
+ "executor.q.operations.size":"0",
+ "executor.q.priorityOperation.size":"0",
+ "operations.completed.count":"10",
+ "executor.q.mapLoad.size":"0",
+ "executor.q.mapLoadAllKeys.size":"0",
+ "executor.q.cluster.size":"0",
+ "executor.q.response.size":"0",
+ "operations.running.count":"0",
+ "operations.pending.invocations.percentage":"0.00%",
+ "operations.pending.invocations.count":"0",
+ "proxy.count":"8",
+ "clientEndpoint.count":"0",
+ "connection.active.count":"2",
+ "client.connection.count":"0",
+ "connection.count":"0"
+ }
+]
+```
+
+
+
+------------------------------------------------------------------------------------------
+
+### Submit A Job
+
+
+POST/submit-job(Returns jobId and jobName if job submitted successfully.)
+
+#### Parameters
+
+> | name | type | data type | description |
+> |----------------------|----------|-----------|-----------------------------------|
+> | jobId | optional | string | job id |
+> | jobName | optional | string | job name |
+> | isStartWithSavePoint | optional | string | if job is started with save point |
+> | format | optional | string | config format, support json and hocon, default json |
+
+#### Body
+
+You can choose either JSON or HOCON for the request body.
+The json format example:
+``` json
+{
+ "env": {
+ "job.mode": "batch"
+ },
+ "source": [
+ {
+ "plugin_name": "FakeSource",
+ "plugin_output": "fake",
+ "row.num": 100,
+ "schema": {
+ "fields": {
+ "name": "string",
+ "age": "int",
+ "card": "int"
+ }
+ }
+ }
+ ],
+ "transform": [
+ ],
+ "sink": [
+ {
+ "plugin_name": "Console",
+ "plugin_input": ["fake"]
+ }
+ ]
+}
+```
+The hocon format example:
+``` hocon
+env {
+ job.mode = "batch"
+}
+
+source {
+ FakeSource {
+    plugin_output = "fake"
+ row.num = 100
+ schema = {
+ fields {
+ name = "string"
+ age = "int"
+ card = "int"
+ }
+ }
+ }
+}
+
+transform {
+}
+
+sink {
+ Console {
+    plugin_input = "fake"
+ }
+}
+
+```
+
+
+#### Responses
+
+```json
+{
+ "jobId": 733584788375666689,
+ "jobName": "rest_api_test"
+}
+```
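+
+A hedged invocation sketch (the host and port follow the configuration examples above; `job.json` is a hypothetical file containing one of the request bodies shown earlier):
+
+```bash
+# Hypothetical example: submit the job definition in job.json to a local
+# SeaTunnel Engine whose REST service listens on port 8080.
+curl -X POST \
+  -H "Content-Type: application/json" \
+  --data @job.json \
+  "http://localhost:8080/submit-job"
+```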
+
+
+
+------------------------------------------------------------------------------------------
+
+### Batch Submit Jobs
+
+
+POST/submit-jobs(Returns jobId and jobName if the job is successfully submitted.)
+
+#### Parameters (add in the `params` field in the request body)
+
+> | Parameter Name | Required | Type | Description |
+> |----------------------|--------------|---------|---------------------------------------|
+> | jobId | optional | string | job id |
+> | jobName | optional | string | job name |
+> | isStartWithSavePoint | optional | string | if the job is started with save point |
+
+#### Request Body
+
+```json
+[
+ {
+ "params":{
+ "jobId":"123456",
+ "jobName":"SeaTunnel-01"
+ },
+ "env": {
+ "job.mode": "batch"
+ },
+ "source": [
+ {
+ "plugin_name": "FakeSource",
+ "plugin_output": "fake",
+ "row.num": 1000,
+ "schema": {
+ "fields": {
+ "name": "string",
+ "age": "int",
+ "card": "int"
+ }
+ }
+ }
+ ],
+ "transform": [
+ ],
+ "sink": [
+ {
+ "plugin_name": "Console",
+ "plugin_input": ["fake"]
+ }
+ ]
+ },
+ {
+ "params":{
+ "jobId":"1234567",
+ "jobName":"SeaTunnel-02"
+ },
+ "env": {
+ "job.mode": "batch"
+ },
+ "source": [
+ {
+ "plugin_name": "FakeSource",
+ "plugin_output": "fake",
+ "row.num": 1000,
+ "schema": {
+ "fields": {
+ "name": "string",
+ "age": "int",
+ "card": "int"
+ }
+ }
+ }
+ ],
+ "transform": [
+ ],
+ "sink": [
+ {
+ "plugin_name": "Console",
+ "plugin_input": ["fake"]
+ }
+ ]
+ }
+]
+```
+
+#### Response
+
+```json
+[
+ {
+ "jobId": "123456",
+ "jobName": "SeaTunnel-01"
+ },{
+ "jobId": "1234567",
+ "jobName": "SeaTunnel-02"
+ }
+]
+```
+
+
+
+------------------------------------------------------------------------------------------
+
+### Stop A Job
+
+
+POST/stop-job(Returns jobId if job stopped successfully.)
+
+#### Body
+
+```json
+{
+ "jobId": 733584788375666689,
+ "isStopWithSavePoint": false # if job is stopped with save point
+}
+```
+
+#### Responses
+
+```json
+{
+"jobId": 733584788375666689
+}
+```
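+
+A hedged invocation sketch (the host and port follow the configuration examples above):
+
+```bash
+# Hypothetical example: stop the job above without taking a savepoint.
+curl -X POST \
+  -H "Content-Type: application/json" \
+  --data '{"jobId": 733584788375666689, "isStopWithSavePoint": false}' \
+  "http://localhost:8080/stop-job"
+```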
+
+
+
+------------------------------------------------------------------------------------------
+### Batch Stop Jobs
+
+
+POST/stop-jobs(Returns jobId if the job is successfully stopped.)
+
+#### Request Body
+
+```json
+[
+ {
+ "jobId": 881432421482889220,
+ "isStopWithSavePoint": false
+ },
+ {
+ "jobId": 881432456517910529,
+ "isStopWithSavePoint": false
+ }
+]
+```
+
+#### Response
+
+```json
+[
+ {
+ "jobId": 881432421482889220
+ },
+ {
+ "jobId": 881432456517910529
+ }
+]
+```
+
+
+
+------------------------------------------------------------------------------------------
+### Encrypt Config
+
+
+POST/encrypt-config(Returns the encrypted config if config is encrypted successfully.)
+For more information about customize encryption, please refer to the documentation [config-encryption-decryption](../connector-v2/Config-Encryption-Decryption.md).
+
+#### Body
+
+```json
+{
+ "env": {
+ "parallelism": 1,
+ "shade.identifier":"base64"
+ },
+ "source": [
+ {
+ "plugin_name": "MySQL-CDC",
+ "schema" : {
+ "fields": {
+ "name": "string",
+ "age": "int"
+ }
+ },
+ "plugin_output": "fake",
+ "parallelism": 1,
+ "hostname": "127.0.0.1",
+ "username": "seatunnel",
+ "password": "seatunnel_password",
+ "table-name": "inventory_vwyw0n"
+ }
+ ],
+ "transform": [
+ ],
+ "sink": [
+ {
+ "plugin_name": "Clickhouse",
+ "host": "localhost:8123",
+ "database": "default",
+ "table": "fake_all",
+ "username": "seatunnel",
+ "password": "seatunnel_password"
+ }
+ ]
+}
+```
+
+#### Responses
+
+```json
+{
+ "env": {
+ "parallelism": 1,
+ "shade.identifier": "base64"
+ },
+ "source": [
+ {
+ "plugin_name": "MySQL-CDC",
+ "schema": {
+ "fields": {
+ "name": "string",
+ "age": "int"
+ }
+ },
+ "plugin_output": "fake",
+ "parallelism": 1,
+ "hostname": "127.0.0.1",
+ "username": "c2VhdHVubmVs",
+ "password": "c2VhdHVubmVsX3Bhc3N3b3Jk",
+ "table-name": "inventory_vwyw0n"
+ }
+ ],
+ "transform": [],
+ "sink": [
+ {
+ "plugin_name": "Clickhouse",
+ "host": "localhost:8123",
+ "database": "default",
+ "table": "fake_all",
+ "username": "c2VhdHVubmVs",
+ "password": "c2VhdHVubmVsX3Bhc3N3b3Jk"
+ }
+ ]
+}
+```
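+
+A hedged invocation sketch (the host and port follow the configuration examples above; `config.json` is a hypothetical file containing the body shown above):
+
+```bash
+# Hypothetical example: ask the engine to return the config with encrypted fields.
+curl -X POST \
+  -H "Content-Type: application/json" \
+  --data @config.json \
+  "http://localhost:8080/encrypt-config"
+```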
+
+
+
+
+------------------------------------------------------------------------------------------
+
+### Update the tags of running node
+
+POST/update-tags(Because the update can only target a specific node, the current node's `ip:port` needs to be used for the update. If the update is successful, a success message is returned.)
+
+
+#### update node tags
+##### Body
+If the request parameter is a `Map` object, it indicates that the tags of the current node need to be updated
+```json
+{
+ "tag1": "dev_1",
+ "tag2": "dev_2"
+}
+```
+##### Responses
+
+```json
+{
+ "status": "success",
+ "message": "update node tags done."
+}
+```
+#### remove node tags
+##### Body
+If the parameter is an empty `Map` object, it means that the tags of the current node need to be cleared
+```json
+{}
+```
+##### Responses
+
+```json
+{
+ "status": "success",
+ "message": "update node tags done."
+}
+```
+
+#### Request parameter exception
+- If the parameter body is empty
+
+##### Responses
+
+```json
+{
+ "status": "fail",
+ "message": "Request body is empty."
+}
+```
+- If the parameter is not a `Map` object
+##### Responses
+
+```json
+{
+ "status": "fail",
+ "message": "Invalid JSON format in request body."
+}
+```
+
+
+------------------------------------------------------------------------------------------
+
+### Get Logs from All Nodes
+
+
+ GET/logs/:jobId(Returns a list of logs.)
+
+#### Request Parameters
+
+#### Parameters (to be added in the `params` field of the request body)
+
+> | Parameter Name | Required | Type | Description |
+> |-----------------------|--------------|---------|------------------------------------|
+> | jobId | optional | string | job id |
+
+If `jobId` is empty, the request will return logs from all nodes. Otherwise, it will return the list of logs for the specified `jobId` from all nodes.
+
+#### Response
+
+Returns a list of logs from the requested nodes along with their content.
+
+#### Return List of All Log Files
+
+If you want to view the log list first, you can retrieve it via a `GET` request: `http://localhost:8080/logs?format=json`
+
+```json
+[
+ {
+ "node": "localhost:8080",
+ "logLink": "http://localhost:8080/logs/job-899485770241277953.log",
+ "logName": "job-899485770241277953.log"
+ },
+ {
+ "node": "localhost:8080",
+ "logLink": "http://localhost:8080/logs/job-899470314109468673.log",
+ "logName": "job-899470314109468673.log"
+ }
+]
+```
+
+Supported formats are `json` and `html`, with `html` as the default.
+
+#### Examples
+
+- Retrieve logs for `jobId` `733584788375666689` across all nodes: `http://localhost:8080/logs/733584788375666689`
+- Retrieve the list of logs from all nodes: `http://localhost:8080/logs`
+- Retrieve the list of logs in JSON format: `http://localhost:8080/logs?format=json`
+- Retrieve the content of a specific log file: `http://localhost:8080/logs/job-898380162133917698.log`
+
+
+
+### Get Log Content from a Single Node
+
+
+ GET/log(Returns a list of logs.)
+
+#### Response
+
+Returns a list of logs from the requested node.
+
+#### Examples
+
+- To get a list of logs from the current node: `http://localhost:8080/log`
+- To get the content of a log file: `http://localhost:8080/log/job-898380162133917698.log`
+
+
diff --git a/docs/en/seatunnel-engine/separated-cluster-deployment.md b/docs/en/seatunnel-engine/separated-cluster-deployment.md
index dcfe58d2f55..91215eb459a 100644
--- a/docs/en/seatunnel-engine/separated-cluster-deployment.md
+++ b/docs/en/seatunnel-engine/separated-cluster-deployment.md
@@ -71,7 +71,7 @@ SeaTunnel Engine implements cluster management based on [Hazelcast IMDG](https:/
The `backup count` is a parameter that defines the number of synchronous backups. For example, if it is set to 1, the backup of the partition will be placed on one other member. If it is set to 2, it will be placed on two other members.
-We recommend that the value of `backup-count` be `min(1, max(5, N/2))`. `N` is the number of cluster nodes.
+We recommend that the value of `backup-count` be `max(1, min(5, N/2))`. `N` is the number of cluster nodes.
```yaml
seatunnel:
@@ -173,7 +173,7 @@ seatunnel:
This configuration mainly solves the problem of resource leakage caused by continuously creating and attempting to destroy class loaders.
If you encounter an exception related to metaspace space overflow, you can try to enable this configuration.
In order to reduce the frequency of creating class loaders, after enabling this configuration, SeaTunnel will not try to release the corresponding class loader when the job is completed, so that it can be used by subsequent jobs, that is to say, when not too many types of Source/Sink connector are used in the running job, it is more effective.
-The default value is false.
+The default value is true.
Example
```yaml
@@ -280,6 +280,23 @@ netty-common-4.1.89.Final.jar
seatunnel-hadoop3-3.1.4-uber.jar
```
+### 4.7 Job Scheduling Strategy
+
+When resources are insufficient, the job scheduling strategy can be configured in the following two modes:
+
+1. `WAIT`: Wait for resources to be available.
+
+2. `REJECT`: Reject the job. This is the default value.
+
+Example
+
+```yaml
+seatunnel:
+ engine:
+ job-schedule-strategy: WAIT
+```
+When `dynamic-slot: true` is used, the `job-schedule-strategy: WAIT` configuration is invalid and is forcibly changed to `job-schedule-strategy: REJECT`, because this strategy is meaningless when dynamic slots are enabled.
+
## 5. Configuring SeaTunnel Engine Network Services
All network-related configurations of the SeaTunnel Engine are in the `hazelcast-master.yaml` and `hazelcast-worker.yaml` files.
@@ -431,4 +448,4 @@ Now that the cluster has been deployed, you can complete the job submission and
### 8.2 Submit Jobs With The REST API
-The SeaTunnel Engine provides a REST API for submitting and managing jobs. For more information, please refer to [REST API](rest-api.md)
\ No newline at end of file
+The SeaTunnel Engine provides a REST API for submitting and managing jobs. For more information, please refer to [REST API V2](rest-api-v2.md)
diff --git a/docs/en/seatunnel-engine/telemetry.md b/docs/en/seatunnel-engine/telemetry.md
index 1febb3f08e6..530385392a3 100644
--- a/docs/en/seatunnel-engine/telemetry.md
+++ b/docs/en/seatunnel-engine/telemetry.md
@@ -1,5 +1,5 @@
---
-sidebar_position: 13
+sidebar_position: 14
---
# Telemetry
@@ -48,8 +48,8 @@ Note: All metrics both have the same labelName `cluster`, that's value is the co
| hazelcast_executor_queueSize | Gauge | **type**, the type of executor, including: "async" "client" "clientBlocking" "clientQuery" "io" "offloadable" "scheduled" "system" | The hazelcast executor queueSize of seatunnel cluster node |
| hazelcast_partition_partitionCount | Gauge | - | The partitionCount of seatunnel cluster node |
| hazelcast_partition_activePartition | Gauge | - | The activePartition of seatunnel cluster node |
-| hazelcast_partition_isClusterSafe | Gauge | - | Weather is cluster safe of partition |
-| hazelcast_partition_isLocalMemberSafe | Gauge | - | Weather is local member safe of partition |
+| hazelcast_partition_isClusterSafe | Gauge | - | Whether is cluster safe of partition |
+| hazelcast_partition_isLocalMemberSafe | Gauge | - | Whether is local member safe of partition |
### Thread Pool Status
diff --git a/docs/en/seatunnel-engine/user-command.md b/docs/en/seatunnel-engine/user-command.md
index 2504198b2b1..f8957a98276 100644
--- a/docs/en/seatunnel-engine/user-command.md
+++ b/docs/en/seatunnel-engine/user-command.md
@@ -1,5 +1,5 @@
---
-sidebar_position: 12
+sidebar_position: 13
---
# Command Line Tool
@@ -122,3 +122,13 @@ This command will cancel the specified job. After canceling the job, the job wil
Supports batch cancellation of jobs, and can cancel multiple jobs at one time.
All breakpoint information of the canceled job will be deleted and cannot be resumed by seatunnel.sh -r <jobId>.
+
+## Configure The JVM Options
+
+We can configure the JVM options for the SeaTunnel Engine client in the following ways:
+
+1. Add the JVM options to `$SEATUNNEL_HOME/config/jvm_client_options`.
+
+   Modify the JVM parameters in the `$SEATUNNEL_HOME/config/jvm_client_options` file. Please note that the JVM parameters in this file will be applied to all jobs submitted using `seatunnel.sh`, including Local Mode and Cluster Mode. A small sketch of editing this file is shown after this list.
+
+2. Add JVM options when submitting jobs. For example, `sh bin/seatunnel.sh --config $SEATUNNEL_HOME/config/v2.batch.config.template -DJvmOption="-Xms2G -Xmx2G"`
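+
+For the first approach, a minimal sketch is shown below; it assumes the `jvm_client_options` file lists one JVM flag per line, and the heap values here are illustrative only.
+
+```shell
+# Append illustrative client heap settings to the JVM options file (one flag per line)
+echo "-Xms1G" >> "$SEATUNNEL_HOME/config/jvm_client_options"
+echo "-Xmx2G" >> "$SEATUNNEL_HOME/config/jvm_client_options"
+```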
diff --git a/docs/en/seatunnel-engine/web-ui.md b/docs/en/seatunnel-engine/web-ui.md
new file mode 100644
index 00000000000..84c73d2fae8
--- /dev/null
+++ b/docs/en/seatunnel-engine/web-ui.md
@@ -0,0 +1,48 @@
+# Web UI
+
+## Access
+
+Before accessing the Web UI, you need to enable the HTTP REST API. First, configure it in the `seatunnel.yaml` configuration file:
+
+```yaml
+seatunnel:
+  engine:
+    http:
+      enable-http: true
+      port: 8080
+```
+
+Then visit `http://ip:8080/#/overview`
+
+## Overview
+
+The Web UI of Apache SeaTunnel offers a user-friendly interface for monitoring and managing SeaTunnel jobs. Through the Web UI, users can view real-time information on currently running jobs, finished jobs, and the status of worker and master nodes within the cluster. The main functional modules include Jobs, Workers, and Master, each providing detailed status information and operational options to help users efficiently manage and optimize their data processing workflows.
+![overview.png](../../images/ui/overview.png)
+
+## Jobs
+
+### Running Jobs
+
+The "Running Jobs" section lists all SeaTunnel jobs that are currently in execution. Users can view basic information for each job, including Job ID, submission time, status, execution time, and more. By clicking on a specific job, users can access detailed information such as task distribution, resource utilization, and log outputs, allowing for real-time monitoring of job progress and timely handling of potential issues.
+![running.png](../../images/ui/running.png)
+![detail.png](../../images/ui/detail.png)
+
+### Finished Jobs
+
+The "Finished Jobs" section displays all SeaTunnel jobs that have either successfully completed or failed. This section provides execution results, completion times, durations, and failure reasons (if any) for each job. Users can review past job records through this module to analyze job performance, troubleshoot issues, or rerun specific jobs as needed.
+![finished.png](../../images/ui/finished.png)
+
+## Workers
+
+### Workers Information
+
+The "Workers" section displays detailed information about all worker nodes in the cluster, including each worker's address, running status, CPU and memory usage, number of tasks being executed, and more. Through this module, users can monitor the health of each worker node, promptly identify and address resource bottlenecks or node failures, ensuring the stable operation of the SeaTunnel cluster.
+![workers.png](../../images/ui/workers.png)
+
+## Master
+
+### Master Information
+
+The "Master" section provides the status and configuration information of the master node in the SeaTunnel cluster. Users can view the master's address, running status, job scheduling responsibilities, and overall resource allocation within the cluster. This module helps users gain a comprehensive understanding of the cluster's core management components, facilitating cluster configuration optimization and troubleshooting.
+![master.png](../../images/ui/master.png)
diff --git a/docs/en/start-v2/docker/docker.md b/docs/en/start-v2/docker/docker.md
index 3dfe3ec8c85..2c2c7824f4f 100644
--- a/docs/en/start-v2/docker/docker.md
+++ b/docs/en/start-v2/docker/docker.md
@@ -40,7 +40,7 @@ You can download the source code from the [download page](https://seatunnel.apac
```shell
cd seatunnel
# Use already sett maven profile
-sh ./mvnw -B clean install -Dmaven.test.skip=true -Dmaven.javadoc.skip=true -Dlicense.skipAddThirdParty=true -D"docker.build.skip"=false -D"docker.verify.skip"=false -D"docker.push.skip"=true -D"docker.tag"=2.3.8 -Dmaven.deploy.skip --no-snapshot-updates -Pdocker,seatunnel
+sh ./mvnw -B clean install -Dmaven.test.skip=true -Dmaven.javadoc.skip=true -Dlicense.skipAddThirdParty=true -D"docker.build.skip"=false -D"docker.verify.skip"=false -D"docker.push.skip"=true -D"docker.tag"=2.3.9 -Dmaven.deploy.skip -D"skip.spotless"=true --no-snapshot-updates -Pdocker,seatunnel
# Check the docker image
docker images | grep apache/seatunnel
@@ -53,10 +53,10 @@ sh ./mvnw clean package -DskipTests -Dskip.spotless=true
# Build docker image
cd seatunnel-dist
-docker build -f src/main/docker/Dockerfile --build-arg VERSION=2.3.8 -t apache/seatunnel:2.3.8 .
+docker build -f src/main/docker/Dockerfile --build-arg VERSION=2.3.9 -t apache/seatunnel:2.3.9 .
# If you build from dev branch, you should add SNAPSHOT suffix to the version
-docker build -f src/main/docker/Dockerfile --build-arg VERSION=2.3.8-SNAPSHOT -t apache/seatunnel:2.3.8-SNAPSHOT .
+docker build -f src/main/docker/Dockerfile --build-arg VERSION=2.3.9-SNAPSHOT -t apache/seatunnel:2.3.9-SNAPSHOT .
# Check the docker image
docker images | grep apache/seatunnel
@@ -167,24 +167,26 @@ docker run -d --name seatunnel_master \
- get created container ip
```shell
-docker inspect master-1
+docker inspect seatunnel_master
```
run this command to get the pod ip.
- start worker node
```shell
+# you need to set `ST_DOCKER_MEMBER_LIST` to your master container's ip
docker run -d --name seatunnel_worker_1 \
--network seatunnel-network \
--rm \
- -e ST_DOCKER_MEMBER_LIST=172.18.0.2:5801 \ # set master container ip to here
+ -e ST_DOCKER_MEMBER_LIST=172.18.0.2:5801 \
apache/seatunnel \
./bin/seatunnel-cluster.sh -r worker
## start worker2
-docker run -d --name seatunnel_worker_2 \
+# you need to set `ST_DOCKER_MEMBER_LIST` to your master container's ip
+docker run -d --name seatunnel_worker_2 \
--network seatunnel-network \
--rm \
- -e ST_DOCKER_MEMBER_LIST=172.18.0.2:5801 \ # set master container ip to here
+ -e ST_DOCKER_MEMBER_LIST=172.18.0.2:5801 \
apache/seatunnel \
./bin/seatunnel-cluster.sh -r worker
@@ -194,20 +196,22 @@ docker run -d --name seatunnel_worker_2 \
run this command to start master node.
```shell
+# you need to set `ST_DOCKER_MEMBER_LIST` to the existing master container's ip
docker run -d --name seatunnel_master \
--network seatunnel-network \
--rm \
- -e ST_DOCKER_MEMBER_LIST=172.18.0.2:5801 \ # set exist master container ip to here
+ -e ST_DOCKER_MEMBER_LIST=172.18.0.2:5801 \
apache/seatunnel \
./bin/seatunnel-cluster.sh -r master
```
run this command to start worker node.
```shell
+# you need to set `ST_DOCKER_MEMBER_LIST` to your master container's ip
docker run -d --name seatunnel_worker_1 \
--network seatunnel-network \
--rm \
- -e ST_DOCKER_MEMBER_LIST=172.18.0.2:5801 \ # set master container ip to here
+ -e ST_DOCKER_MEMBER_LIST=172.18.0.2:5801 \
apache/seatunnel \
./bin/seatunnel-cluster.sh -r worker
```
@@ -371,21 +375,23 @@ and run `docker-compose up -d` command, the new worker node will start, and the
#### use docker as a client
- submit job :
```shell
+# you need to set `ST_DOCKER_MEMBER_LIST` to your master container's ip
docker run --name seatunnel_client \
--network seatunnel-network \
+ -e ST_DOCKER_MEMBER_LIST=172.18.0.2:5801 \
--rm \
apache/seatunnel \
- -e ST_DOCKER_MEMBER_LIST=172.18.0.2:5801 \ # set it as master node container ip
- ./bin/seatunnel.sh -c config/v2.batch.config.template # this is an default config, if you need submit your self config, you can mount config file.
+ ./bin/seatunnel.sh -c config/v2.batch.config.template
```
- list job
```shell
+# you need to set `ST_DOCKER_MEMBER_LIST` to your master container's ip
docker run --name seatunnel_client \
--network seatunnel-network \
+ -e ST_DOCKER_MEMBER_LIST=172.18.0.2:5801 \
--rm \
apache/seatunnel \
- -e ST_DOCKER_MEMBER_LIST=172.18.0.2:5801 \ # set it as master node container ip
./bin/seatunnel.sh -l
```
@@ -395,5 +401,5 @@ more command please refer [user-command](../../seatunnel-engine/user-command.md)
#### use rest api
-please refer [Submit A Job](../../seatunnel-engine/rest-api.md#submit-a-job)
+please refer [Submit A Job](../../seatunnel-engine/rest-api-v2.md#submit-a-job)
diff --git a/docs/en/start-v2/kubernetes/kubernetes.mdx b/docs/en/start-v2/kubernetes/kubernetes.mdx
index eb231850514..f3cc9e6b0d5 100644
--- a/docs/en/start-v2/kubernetes/kubernetes.mdx
+++ b/docs/en/start-v2/kubernetes/kubernetes.mdx
@@ -44,7 +44,7 @@ To run the image with SeaTunnel, first create a `Dockerfile`:
```Dockerfile
FROM flink:1.13
-ENV SEATUNNEL_VERSION="2.3.8"
+ENV SEATUNNEL_VERSION="2.3.9"
ENV SEATUNNEL_HOME="/opt/seatunnel"
RUN wget https://dlcdn.apache.org/seatunnel/${SEATUNNEL_VERSION}/apache-seatunnel-${SEATUNNEL_VERSION}-bin.tar.gz
@@ -56,13 +56,13 @@ RUN cd ${SEATUNNEL_HOME} && sh bin/install-plugin.sh ${SEATUNNEL_VERSION}
Then run the following commands to build the image:
```bash
-docker build -t seatunnel:2.3.8-flink-1.13 -f Dockerfile .
+docker build -t seatunnel:2.3.9-flink-1.13 -f Dockerfile .
```
-Image `seatunnel:2.3.8-flink-1.13` needs to be present in the host (minikube) so that the deployment can take place.
+Image `seatunnel:2.3.9-flink-1.13` needs to be present in the host (minikube) so that the deployment can take place.
Load image to minikube via:
```bash
-minikube image load seatunnel:2.3.8-flink-1.13
+minikube image load seatunnel:2.3.9-flink-1.13
```
@@ -72,7 +72,7 @@ minikube image load seatunnel:2.3.8-flink-1.13
```Dockerfile
FROM openjdk:8
-ENV SEATUNNEL_VERSION="2.3.8"
+ENV SEATUNNEL_VERSION="2.3.9"
ENV SEATUNNEL_HOME="/opt/seatunnel"
RUN wget https://dlcdn.apache.org/seatunnel/${SEATUNNEL_VERSION}/apache-seatunnel-${SEATUNNEL_VERSION}-bin.tar.gz
@@ -84,13 +84,13 @@ RUN cd ${SEATUNNEL_HOME} && sh bin/install-plugin.sh ${SEATUNNEL_VERSION}
Then run the following commands to build the image:
```bash
-docker build -t seatunnel:2.3.8 -f Dockerfile .
+docker build -t seatunnel:2.3.9 -f Dockerfile .
```
-Image `seatunnel:2.3.8` need to be present in the host (minikube) so that the deployment can take place.
+Image `seatunnel:2.3.9` need to be present in the host (minikube) so that the deployment can take place.
Load image to minikube via:
```bash
-minikube image load seatunnel:2.3.8
+minikube image load seatunnel:2.3.9
```
@@ -100,7 +100,7 @@ minikube image load seatunnel:2.3.8
```Dockerfile
FROM openjdk:8
-ENV SEATUNNEL_VERSION="2.3.8"
+ENV SEATUNNEL_VERSION="2.3.9"
ENV SEATUNNEL_HOME="/opt/seatunnel"
RUN wget https://dlcdn.apache.org/seatunnel/${SEATUNNEL_VERSION}/apache-seatunnel-${SEATUNNEL_VERSION}-bin.tar.gz
@@ -112,13 +112,13 @@ RUN cd ${SEATUNNEL_HOME} && sh bin/install-plugin.sh ${SEATUNNEL_VERSION}
Then run the following commands to build the image:
```bash
-docker build -t seatunnel:2.3.8 -f Dockerfile .
+docker build -t seatunnel:2.3.9 -f Dockerfile .
```
-Image `seatunnel:2.3.8` needs to be present in the host (minikube) so that the deployment can take place.
+Image `seatunnel:2.3.9` needs to be present in the host (minikube) so that the deployment can take place.
Load image to minikube via:
```bash
-minikube image load seatunnel:2.3.8
+minikube image load seatunnel:2.3.9
```
@@ -191,7 +191,7 @@ none
]}>
-In this guide we will use [seatunnel.streaming.conf](https://github.com/apache/seatunnel/blob/2.3.8-release/config/v2.streaming.conf.template):
+In this guide we will use [seatunnel.streaming.conf](https://github.com/apache/seatunnel/blob/2.3.9-release/config/v2.streaming.conf.template):
```conf
env {
@@ -202,7 +202,7 @@ env {
source {
FakeSource {
- result_table_name = "fake"
+ plugin_output = "fake"
row.num = 160000
schema = {
fields {
@@ -215,8 +215,8 @@ source {
transform {
FieldMapper {
- source_table_name = "fake"
- result_table_name = "fake1"
+ plugin_input = "fake"
+ plugin_output = "fake1"
field_mapper = {
age = age
name = new_name
@@ -226,7 +226,7 @@ transform {
sink {
Console {
- source_table_name = "fake1"
+ plugin_input = "fake1"
}
}
```
@@ -245,7 +245,7 @@ kind: FlinkDeployment
metadata:
name: seatunnel-flink-streaming-example
spec:
- image: seatunnel:2.3.8-flink-1.13
+ image: seatunnel:2.3.9-flink-1.13
flinkVersion: v1_13
flinkConfiguration:
taskmanager.numberOfTaskSlots: "2"
@@ -291,7 +291,7 @@ kubectl apply -f seatunnel-flink.yaml
-In this guide we will use [seatunnel.streaming.conf](https://github.com/apache/seatunnel/blob/2.3.8-release/config/v2.streaming.conf.template):
+In this guide we will use [seatunnel.streaming.conf](https://github.com/apache/seatunnel/blob/2.3.9-release/config/v2.streaming.conf.template):
```conf
env {
@@ -303,7 +303,7 @@ env {
source {
FakeSource {
parallelism = 2
- result_table_name = "fake"
+ plugin_output = "fake"
row.num = 16
schema = {
fields {
@@ -334,7 +334,7 @@ metadata:
spec:
containers:
- name: seatunnel
- image: seatunnel:2.3.8
+ image: seatunnel:2.3.9
command: ["/bin/sh","-c","/opt/seatunnel/bin/seatunnel.sh --config /data/seatunnel.streaming.conf -e local"]
resources:
limits:
@@ -366,7 +366,7 @@ kubectl apply -f seatunnel.yaml
-In this guide we will use [seatunnel.streaming.conf](https://github.com/apache/seatunnel/blob/2.3.8-release/config/v2.streaming.conf.template):
+In this guide we will use [seatunnel.streaming.conf](https://github.com/apache/seatunnel/blob/2.3.9-release/config/v2.streaming.conf.template):
```conf
env {
@@ -378,7 +378,7 @@ env {
source {
FakeSource {
parallelism = 2
- result_table_name = "fake"
+ plugin_output = "fake"
row.num = 16
schema = {
fields {
@@ -524,7 +524,7 @@ spec:
spec:
containers:
- name: seatunnel
- image: seatunnel:2.3.8
+ image: seatunnel:2.3.9
imagePullPolicy: IfNotPresent
ports:
- containerPort: 5801
diff --git a/docs/en/start-v2/locally/deployment.md b/docs/en/start-v2/locally/deployment.md
index 8555c097f36..4684871acb0 100644
--- a/docs/en/start-v2/locally/deployment.md
+++ b/docs/en/start-v2/locally/deployment.md
@@ -22,7 +22,7 @@ Visit the [SeaTunnel Download Page](https://seatunnel.apache.org/download) to do
Or you can also download it through the terminal:
```shell
-export version="2.3.8"
+export version="2.3.9"
wget "https://archive.apache.org/dist/seatunnel/${version}/apache-seatunnel-${version}-bin.tar.gz"
tar -xzvf "apache-seatunnel-${version}-bin.tar.gz"
```
@@ -35,10 +35,10 @@ Starting from version 2.2.0-beta, the binary package no longer provides connecto
sh bin/install-plugin.sh
```
-If you need a specific connector version, taking 2.3.8 as an example, you need to execute the following command:
+If you need a specific connector version, taking 2.3.9 as an example, you need to execute the following command:
```bash
-sh bin/install-plugin.sh 2.3.8
+sh bin/install-plugin.sh 2.3.9
```
Typically, you do not need all the connector plugins. You can specify the required plugins by configuring `config/plugin_config`. For example, if you want the sample application to work properly, you will need the `connector-console` and `connector-fake` plugins. You can modify the `plugin_config` configuration file as follows:
@@ -71,7 +71,7 @@ You can download the source code from the [download page](https://seatunnel.apac
cd seatunnel
sh ./mvnw clean install -DskipTests -Dskip.spotless=true
# get the binary package
-cp seatunnel-dist/target/apache-seatunnel-2.3.8-bin.tar.gz /The-Path-You-Want-To-Copy
+cp seatunnel-dist/target/apache-seatunnel-2.3.9-bin.tar.gz /The-Path-You-Want-To-Copy
cd /The-Path-You-Want-To-Copy
tar -xzvf "apache-seatunnel-${version}-bin.tar.gz"
diff --git a/docs/en/start-v2/locally/quick-start-flink.md b/docs/en/start-v2/locally/quick-start-flink.md
index 244dfd8c9e6..fbfc945fc7c 100644
--- a/docs/en/start-v2/locally/quick-start-flink.md
+++ b/docs/en/start-v2/locally/quick-start-flink.md
@@ -27,7 +27,7 @@ env {
source {
FakeSource {
- result_table_name = "fake"
+ plugin_output = "fake"
row.num = 16
schema = {
fields {
@@ -40,8 +40,8 @@ source {
transform {
FieldMapper {
- source_table_name = "fake"
- result_table_name = "fake1"
+ plugin_input = "fake"
+ plugin_output = "fake1"
field_mapper = {
age = age
name = new_name
@@ -51,7 +51,7 @@ transform {
sink {
Console {
- source_table_name = "fake1"
+ plugin_input = "fake1"
}
}
diff --git a/docs/en/start-v2/locally/quick-start-seatunnel-engine.md b/docs/en/start-v2/locally/quick-start-seatunnel-engine.md
index d5b48b27247..fe9d8ee7983 100644
--- a/docs/en/start-v2/locally/quick-start-seatunnel-engine.md
+++ b/docs/en/start-v2/locally/quick-start-seatunnel-engine.md
@@ -21,7 +21,7 @@ env {
source {
FakeSource {
- result_table_name = "fake"
+ plugin_output = "fake"
row.num = 16
schema = {
fields {
@@ -34,8 +34,8 @@ source {
transform {
FieldMapper {
- source_table_name = "fake"
- result_table_name = "fake1"
+ plugin_input = "fake"
+ plugin_output = "fake1"
field_mapper = {
age = age
name = new_name
@@ -45,7 +45,7 @@ transform {
sink {
Console {
- source_table_name = "fake1"
+ plugin_input = "fake1"
}
}
diff --git a/docs/en/start-v2/locally/quick-start-spark.md b/docs/en/start-v2/locally/quick-start-spark.md
index d5dd82725bd..e490f238b3d 100644
--- a/docs/en/start-v2/locally/quick-start-spark.md
+++ b/docs/en/start-v2/locally/quick-start-spark.md
@@ -28,7 +28,7 @@ env {
source {
FakeSource {
- result_table_name = "fake"
+ plugin_output = "fake"
row.num = 16
schema = {
fields {
@@ -41,8 +41,8 @@ source {
transform {
FieldMapper {
- source_table_name = "fake"
- result_table_name = "fake1"
+ plugin_input = "fake"
+ plugin_output = "fake1"
field_mapper = {
age = age
name = new_name
@@ -52,7 +52,7 @@ transform {
sink {
Console {
- source_table_name = "fake1"
+ plugin_input = "fake1"
}
}
diff --git a/docs/en/transform-v2/common-options.md b/docs/en/transform-v2/common-options.md
index 5bd0e6e2591..32e91bf8243 100644
--- a/docs/en/transform-v2/common-options.md
+++ b/docs/en/transform-v2/common-options.md
@@ -6,10 +6,16 @@ sidebar_position: 1
> This is a process of intermediate conversion between the source and sink terminals,You can use sql statements to smoothly complete the conversion process
-| Name | Type | Required | Default | Description |
-|-------------------|--------|----------|---------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| result_table_name | String | No | - | When `source_table_name` is not specified, the current plugin processes the data set `(dataset)` output by the previous plugin in the configuration file; When `source_table_name` is specified, the current plugin is processing the data set corresponding to this parameter. |
-| source_table_name | String | No | - | When `result_table_name` is not specified, the data processed by this plugin will not be registered as a data set that can be directly accessed by other plugins, or called a temporary table `(table)`; When `result_table_name` is specified, the data processed by this plugin will be registered as a data set `(dataset)` that can be directly accessed by other plugins, or called a temporary table `(table)` . The dataset registered here can be directly accessed by other plugins by specifying `source_table_name` . |
+:::warn
+
+The old configuration name `source_table_name`/`result_table_name` is deprecated, please migrate to the new name `plugin_input`/`plugin_output` as soon as possible.
+
+:::
+
+| Name | Type | Required | Default | Description |
+|---------------|--------|----------|---------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| plugin_output | String | No | - | When `plugin_input` is not specified, the current plugin processes the data set `(dataset)` output by the previous plugin in the configuration file; When `plugin_input` is specified, the current plugin is processing the data set corresponding to this parameter. |
+| plugin_input | String | No | - | When `plugin_output` is not specified, the data processed by this plugin will not be registered as a data set that can be directly accessed by other plugins, or called a temporary table `(table)`; When `plugin_output` is specified, the data processed by this plugin will be registered as a data set `(dataset)` that can be directly accessed by other plugins, or called a temporary table `(table)` . The dataset registered here can be directly accessed by other plugins by specifying `plugin_input` . |
## Task Example
@@ -24,7 +30,7 @@ env {
source {
FakeSource {
- result_table_name = "fake"
+ plugin_output = "fake"
row.num = 100
schema = {
fields {
@@ -48,9 +54,9 @@ source {
transform {
Sql {
- source_table_name = "fake"
- result_table_name = "fake1"
- # the query table name must same as field 'source_table_name'
+ plugin_input = "fake"
+ plugin_output = "fake1"
+ # the query table name must same as field 'plugin_input'
query = "select id, regexp_replace(name, '.+', 'b') as name, age+1 as age, pi() as pi, c_timestamp, c_date, c_map, c_array, c_decimal, c_row from fake"
}
# The SQL transform support base function and criteria operation
@@ -59,10 +65,10 @@ transform {
sink {
Console {
- source_table_name = "fake1"
+ plugin_input = "fake1"
}
Console {
- source_table_name = "fake"
+ plugin_input = "fake"
}
}
```
diff --git a/docs/en/transform-v2/copy.md b/docs/en/transform-v2/copy.md
index 7a0e73f44be..eede3f7d077 100644
--- a/docs/en/transform-v2/copy.md
+++ b/docs/en/transform-v2/copy.md
@@ -36,8 +36,8 @@ We want copy fields `name`、`age` to a new fields `name1`、`name2`、`age1`, w
```
transform {
Copy {
- source_table_name = "fake"
- result_table_name = "fake1"
+ plugin_input = "fake"
+ plugin_output = "fake1"
fields {
name1 = name
name2 = name
diff --git a/docs/en/transform-v2/dynamic-compile.md b/docs/en/transform-v2/dynamic-compile.md
index fb5500880ac..d5f21f2708d 100644
--- a/docs/en/transform-v2/dynamic-compile.md
+++ b/docs/en/transform-v2/dynamic-compile.md
@@ -82,13 +82,13 @@ Use this DynamicCompile to add a new column `compile_language`, and update the `
```hacon
transform {
DynamicCompile {
- source_table_name = "fake"
- result_table_name = "groovy_out"
+ plugin_input = "fake"
+ plugin_output = "groovy_out"
compile_language="GROOVY"
compile_pattern="SOURCE_CODE"
source_code="""
import org.apache.seatunnel.api.table.catalog.Column
- import org.apache.seatunnel.transform.common.SeaTunnelRowAccessor
+ import org.apache.seatunnel.api.table.type.SeaTunnelRowAccessor
import org.apache.seatunnel.api.table.catalog.CatalogTable
import org.apache.seatunnel.api.table.catalog.PhysicalColumn;
import org.apache.seatunnel.api.table.type.*;
@@ -140,13 +140,13 @@ transform {
```hacon
transform {
DynamicCompile {
- source_table_name = "fake"
- result_table_name = "java_out"
+ plugin_input = "fake"
+ plugin_output = "java_out"
compile_language="JAVA"
compile_pattern="SOURCE_CODE"
source_code="""
import org.apache.seatunnel.api.table.catalog.Column;
- import org.apache.seatunnel.transform.common.SeaTunnelRowAccessor;
+ import org.apache.seatunnel.api.table.type.SeaTunnelRowAccessor;
import org.apache.seatunnel.api.table.catalog.*;
import org.apache.seatunnel.api.table.type.*;
import java.util.ArrayList;
@@ -195,8 +195,8 @@ transform {
```hacon
transform {
DynamicCompile {
- source_table_name = "fake"
- result_table_name = "groovy_out"
+ plugin_input = "fake"
+ plugin_output = "groovy_out"
compile_language="GROOVY"
compile_pattern="ABSOLUTE_PATH"
absolute_path="""/tmp/GroovyFile"""
diff --git a/docs/en/transform-v2/embedding.md b/docs/en/transform-v2/embedding.md
index 046f72789ac..350a23fc555 100644
--- a/docs/en/transform-v2/embedding.md
+++ b/docs/en/transform-v2/embedding.md
@@ -166,13 +166,13 @@ source {
"Herman Melville (1819–1891) was an American novelist, short story writer, and poet of the American Renaissance period. Born in New York City, Melville gained initial fame with novels such as Typee and Omoo, but it was Moby-Dick, published in 1851, that would later be recognized as his masterpiece. Melville’s work is known for its complexity, symbolism, and exploration of themes such as man’s place in the universe, the nature of evil, and the quest for meaning. Despite facing financial difficulties and critical neglect during his lifetime, Melville’s reputation soared posthumously, and he is now considered one of the great American authors."
], kind = INSERT}
]
- result_table_name = "fake"
+ plugin_output = "fake"
}
}
transform {
Embedding {
- source_table_name = "fake"
+ plugin_input = "fake"
embedding_model_provider = QIANFAN
model = bge_large_en
api_key = xxxxxxxxxx
@@ -182,13 +182,13 @@ transform {
book_intro_vector = book_intro
author_biography_vector = author_biography
}
- result_table_name = "embedding_output"
+ plugin_output = "embedding_output"
}
}
sink {
Assert {
- source_table_name = "embedding_output"
+ plugin_input = "embedding_output"
rules =
{
field_rules = [
@@ -293,13 +293,13 @@ source {
"Herman Melville (1819–1891) was an American novelist, short story writer, and poet of the American Renaissance period. Born in New York City, Melville gained initial fame with novels such as Typee and Omoo, but it was Moby-Dick, published in 1851, that would later be recognized as his masterpiece. Melville’s work is known for its complexity, symbolism, and exploration of themes such as man’s place in the universe, the nature of evil, and the quest for meaning. Despite facing financial difficulties and critical neglect during his lifetime, Melville’s reputation soared posthumously, and he is now considered one of the great American authors."
], kind = INSERT}
]
- result_table_name = "fake"
+ plugin_output = "fake"
}
}
transform {
Embedding {
- source_table_name = "fake"
+ plugin_input = "fake"
model_provider = CUSTOM
model = text-embedding-3-small
api_key = xxxxxxxx
@@ -320,13 +320,13 @@ transform {
inputx = ["${input}"]
}
}
- result_table_name = "embedding_output_1"
+ plugin_output = "embedding_output_1"
}
}
sink {
Assert {
- source_table_name = "embedding_output_1"
+ plugin_input = "embedding_output_1"
rules =
{
field_rules = [
diff --git a/docs/en/transform-v2/field-mapper.md b/docs/en/transform-v2/field-mapper.md
index e0bd32e1492..fa54ced741e 100644
--- a/docs/en/transform-v2/field-mapper.md
+++ b/docs/en/transform-v2/field-mapper.md
@@ -36,8 +36,8 @@ We want to delete `age` field and update the filed order to `id`, `card`, `name`
```
transform {
FieldMapper {
- source_table_name = "fake"
- result_table_name = "fake1"
+ plugin_input = "fake"
+ plugin_output = "fake1"
field_mapper = {
id = id
card = card
diff --git a/docs/en/transform-v2/filter-rowkind.md b/docs/en/transform-v2/filter-rowkind.md
index e6ef5ba98cd..68aab44b973 100644
--- a/docs/en/transform-v2/filter-rowkind.md
+++ b/docs/en/transform-v2/filter-rowkind.md
@@ -39,7 +39,7 @@ env {
source {
FakeSource {
- result_table_name = "fake"
+ plugin_output = "fake"
row.num = 100
schema = {
fields {
@@ -53,15 +53,15 @@ source {
transform {
FilterRowKind {
- source_table_name = "fake"
- result_table_name = "fake1"
+ plugin_input = "fake"
+ plugin_output = "fake1"
exclude_kinds = ["INSERT"]
}
}
sink {
Console {
- source_table_name = "fake1"
+ plugin_input = "fake1"
}
}
```
diff --git a/docs/en/transform-v2/filter.md b/docs/en/transform-v2/filter.md
index f9f28b8398a..748934e621a 100644
--- a/docs/en/transform-v2/filter.md
+++ b/docs/en/transform-v2/filter.md
@@ -43,8 +43,8 @@ we want to keep the field named `name`, `card`, we can add a `Filter` Transform
```
transform {
Filter {
- source_table_name = "fake"
- result_table_name = "fake1"
+ plugin_input = "fake"
+ plugin_output = "fake1"
include_fields = [name, card]
}
}
@@ -55,8 +55,8 @@ Or we can delete the field named `age` by adding a `Filter` Transform with `excl
```
transform {
Filter {
- source_table_name = "fake"
- result_table_name = "fake1"
+ plugin_input = "fake"
+ plugin_output = "fake1"
exclude_fields = [age]
}
}
diff --git a/docs/en/transform-v2/jsonpath.md b/docs/en/transform-v2/jsonpath.md
index 1948f5ca694..f787487069e 100644
--- a/docs/en/transform-v2/jsonpath.md
+++ b/docs/en/transform-v2/jsonpath.md
@@ -93,8 +93,8 @@ Assuming we want to use JsonPath to extract properties.
```json
transform {
JsonPath {
- source_table_name = "fake"
- result_table_name = "fake1"
+ plugin_input = "fake"
+ plugin_output = "fake1"
columns = [
{
"src_field" = "data"
@@ -175,8 +175,8 @@ The JsonPath transform converts the values of seatunnel into an array,
```hocon
transform {
JsonPath {
- source_table_name = "fake"
- result_table_name = "fake1"
+ plugin_input = "fake"
+ plugin_output = "fake1"
row_error_handle_way = FAIL
columns = [
diff --git a/docs/en/transform-v2/llm.md b/docs/en/transform-v2/llm.md
index 8ee5a36a9ab..c1c9798abe3 100644
--- a/docs/en/transform-v2/llm.md
+++ b/docs/en/transform-v2/llm.md
@@ -11,7 +11,7 @@ more.
## Options
| name | type | required | default value |
-|------------------------| ------ | -------- |---------------|
+|------------------------|--------|----------|---------------|
| model_provider | enum | yes | |
| output_data_type | enum | no | String |
| output_column_name | string | no | llm_output |
@@ -28,7 +28,9 @@ more.
### model_provider
The model provider to use. The available options are:
-OPENAI, DOUBAO, KIMIAI, CUSTOM
+OPENAI, DOUBAO, KIMIAI, MICROSOFT, CUSTOM
+
+> tips: If you use Microsoft, please make sure api_path cannot be empty
### output_data_type
@@ -254,6 +256,7 @@ sink {
}
}
```
+
### Customize the LLM model
```hocon
@@ -277,13 +280,13 @@ source {
{fields = [4, "Eric"], kind = INSERT}
{fields = [5, "Guangdong Liu"], kind = INSERT}
]
- result_table_name = "fake"
+ plugin_output = "fake"
}
}
transform {
LLM {
- source_table_name = "fake"
+ plugin_input = "fake"
model_provider = CUSTOM
model = gpt-4o-mini
api_key = sk-xxx
@@ -308,13 +311,13 @@ transform {
}]
}
}
- result_table_name = "llm_output"
+ plugin_output = "llm_output"
}
}
sink {
Assert {
- source_table_name = "llm_output"
+ plugin_input = "llm_output"
rules =
{
field_rules = [
diff --git a/docs/en/transform-v2/metadata.md b/docs/en/transform-v2/metadata.md
new file mode 100644
index 00000000000..abae10e4483
--- /dev/null
+++ b/docs/en/transform-v2/metadata.md
@@ -0,0 +1,85 @@
+# Metadata
+
+> Metadata transform plugin
+
+## Description
+Metadata transform plugin for adding metadata fields to the data.
+
+## Available Metadata
+
+|    Key    | DataType | Description                                                                                    |
+|:---------:|:--------:|:-----------------------------------------------------------------------------------------------|
+| Database  |  string  | Name of the database that contains the row.                                                    |
+| Table     |  string  | Name of the table that contains the row.                                                       |
+| RowKind   |  string  | The type of operation.                                                                         |
+| EventTime |   Long   | The time at which the connector processed the event.                                           |
+| Delay     |   Long   | The difference between the data extraction time and the database change time.                  |
+| Partition |  string  | The partition fields of the table that contains the row; multiple values are joined with `,`.  |
+
+### Note
+
+`Delay` and `Partition` are currently only supported by the CDC series of connectors, except TiDB-CDC.
+
+## Options
+
+| name            | type | required | default value | Description                                                               |
+|:---------------:|------|----------|---------------|---------------------------------------------------------------------------|
+| metadata_fields | map  | yes      |               | A mapping between metadata fields and their corresponding output fields. |
+
+### metadata_fields [map]
+
+A mapping between metadata fields and their respective output fields.
+
+```hocon
+metadata_fields {
+ Database = c_database
+ Table = c_table
+ RowKind = c_rowKind
+ EventTime = c_ts_ms
+ Delay = c_delay
+}
+```
+
+## Examples
+
+```hocon
+
+env {
+ parallelism = 1
+ job.mode = "STREAMING"
+ checkpoint.interval = 5000
+ read_limit.bytes_per_second = 7000000
+ read_limit.rows_per_second = 400
+}
+
+source {
+ MySQL-CDC {
+ plugin_output = "customers_mysql_cdc"
+ server-id = 5652
+ username = "root"
+ password = "zdyk_Dev@2024"
+ table-names = ["source.user"]
+ base-url = "jdbc:mysql://172.16.17.123:3306/source"
+ }
+}
+
+transform {
+ Metadata {
+ metadata_fields {
+ Database = database
+ Table = table
+ RowKind = rowKind
+ EventTime = ts_ms
+ Delay = delay
+ }
+ plugin_output = "trans_result"
+ }
+}
+
+sink {
+ Console {
+    plugin_input = "trans_result"
+ }
+}
+
+```
+
diff --git a/docs/en/transform-v2/replace.md b/docs/en/transform-v2/replace.md
index 1cc99c0ace7..ebb15a9c8ba 100644
--- a/docs/en/transform-v2/replace.md
+++ b/docs/en/transform-v2/replace.md
@@ -56,8 +56,8 @@ We want to replace the char ` ` to `_` at the `name` field. Then we can add a `R
```
transform {
Replace {
- source_table_name = "fake"
- result_table_name = "fake1"
+ plugin_input = "fake"
+ plugin_output = "fake1"
replace_field = "name"
pattern = " "
replacement = "_"
@@ -84,7 +84,7 @@ env {
source {
FakeSource {
- result_table_name = "fake"
+ plugin_output = "fake"
row.num = 100
schema = {
fields {
@@ -97,8 +97,8 @@ source {
transform {
Replace {
- source_table_name = "fake"
- result_table_name = "fake1"
+ plugin_input = "fake"
+ plugin_output = "fake1"
replace_field = "name"
pattern = ".+"
replacement = "b"
@@ -108,7 +108,7 @@ transform {
sink {
Console {
- source_table_name = "fake1"
+ plugin_input = "fake1"
}
}
```
diff --git a/docs/en/transform-v2/rowkind-extractor.md b/docs/en/transform-v2/rowkind-extractor.md
new file mode 100644
index 00000000000..a2ee384c347
--- /dev/null
+++ b/docs/en/transform-v2/rowkind-extractor.md
@@ -0,0 +1,113 @@
+# RowKindExtractor
+
+> RowKindExtractor transform plugin
+
+## Description
+
+Transform a CDC row into an append-only row that carries the CDC RowKind in an additional field.
+
+Example:
+
+CDC row: `-D 1, test1, test2`
+
+Transformed row: `+I 1, test1, test2, DELETE`
+
+## Options
+
+| name | type | required | default value |
+|-------------------|--------|----------|---------------|
+| custom_field_name | string | yes | row_kind |
+| transform_type | enum | yes | SHORT |
+
+### custom_field_name [string]
+
+Custom field name of the RowKind field
+
+### transform_type [enum]
+
+The format of the RowKind field value. The options are `SHORT` and `FULL`:
+
+- `SHORT`: +I, -U, +U, -D
+- `FULL`: INSERT, UPDATE_BEFORE, UPDATE_AFTER, DELETE
+
+## Examples
+
+
+```hocon
+
+env {
+ parallelism = 1
+ job.mode = "BATCH"
+}
+
+source {
+ FakeSource {
+ schema = {
+ fields {
+ pk_id = bigint
+ name = string
+ score = int
+ }
+ primaryKey {
+ name = "pk_id"
+ columnNames = [pk_id]
+ }
+ }
+ rows = [
+ {
+ kind = INSERT
+ fields = [1, "A", 100]
+ },
+ {
+ kind = INSERT
+ fields = [2, "B", 100]
+ },
+ {
+ kind = INSERT
+ fields = [3, "C", 100]
+ },
+ {
+ kind = INSERT
+ fields = [4, "D", 100]
+ },
+ {
+ kind = UPDATE_BEFORE
+ fields = [1, "A", 100]
+ },
+ {
+ kind = UPDATE_AFTER
+ fields = [1, "F", 100]
+ }
+ {
+ kind = UPDATE_BEFORE
+ fields = [2, "B", 100]
+ },
+ {
+ kind = UPDATE_AFTER
+ fields = [2, "G", 100]
+ },
+ {
+ kind = DELETE
+ fields = [3, "C", 100]
+ },
+ {
+ kind = DELETE
+ fields = [4, "D", 100]
+ }
+ ]
+ }
+}
+
+transform {
+ RowKindExtractor {
+ custom_field_name = "custom_name"
+ transform_type = FULL
+ plugin_output = "trans_result"
+ }
+}
+
+sink {
+ Console {
+    plugin_input = "trans_result"
+ }
+}
+
+```
+
diff --git a/docs/en/transform-v2/split.md b/docs/en/transform-v2/split.md
index ecfe94c854b..0df9afbdef2 100644
--- a/docs/en/transform-v2/split.md
+++ b/docs/en/transform-v2/split.md
@@ -46,8 +46,8 @@ We want split `name` field to `first_name` and `second name`, we can add `Split`
```
transform {
Split {
- source_table_name = "fake"
- result_table_name = "fake1"
+ plugin_input = "fake"
+ plugin_output = "fake1"
separator = " "
split_field = "name"
output_fields = [first_name, second_name]
diff --git a/docs/en/transform-v2/sql-functions.md b/docs/en/transform-v2/sql-functions.md
index 3438a24de9c..31a33989375 100644
--- a/docs/en/transform-v2/sql-functions.md
+++ b/docs/en/transform-v2/sql-functions.md
@@ -302,6 +302,14 @@ Example:
REPLACE(NAME, ' ')
+### SPLIT
+
+Split a string into an array.
+
+Example:
+
+select SPLIT(test,';') as arrays
+
### SOUNDEX
```SOUNDEX(string)```
@@ -973,3 +981,37 @@ It is used to determine whether the condition is valid and return different valu
Example:
case when c_string in ('c_string') then 1 else 0 end
+
+### UUID
+
+```UUID()```
+
+Generate a uuid through java function.
+
+Example:
+
+select UUID() as seatunnel_uuid
+
+### ARRAY
+
+Generate an array.
+
+Example:
+
+select ARRAY('test1','test2','test3') as arrays
+
+
+### LATERAL VIEW
+#### EXPLODE
+
+Explode an array column into rows.
+
+`OUTER EXPLODE` returns NULL when the array is NULL or empty.
+
+`EXPLODE(SPLIT(FIELD_NAME, separator))` is used to split a string column; the first parameter of the SPLIT function is the field name and the second parameter is the separator.
+
+`EXPLODE(ARRAY(value1, value2))` is used to build a custom array.
+```
+SELECT * FROM fake
+ LATERAL VIEW EXPLODE ( SPLIT ( NAME, ',' ) ) AS NAME
+ LATERAL VIEW EXPLODE ( SPLIT ( pk_id, ';' ) ) AS pk_id
+ LATERAL VIEW OUTER EXPLODE ( age ) AS age
+ LATERAL VIEW OUTER EXPLODE ( ARRAY(1,1) ) AS num
+```
diff --git a/docs/en/transform-v2/sql-udf.md b/docs/en/transform-v2/sql-udf.md
index df5d3b93fe5..a857fe4c51f 100644
--- a/docs/en/transform-v2/sql-udf.md
+++ b/docs/en/transform-v2/sql-udf.md
@@ -110,8 +110,8 @@ We use UDF of SQL query to transform the source data like this:
```
transform {
Sql {
- source_table_name = "fake"
- result_table_name = "fake1"
+ plugin_input = "fake"
+ plugin_output = "fake1"
query = "select id, example(name) as name, age from fake"
}
}
diff --git a/docs/en/transform-v2/sql.md b/docs/en/transform-v2/sql.md
index a3bdb9bbfc1..a8f12568d53 100644
--- a/docs/en/transform-v2/sql.md
+++ b/docs/en/transform-v2/sql.md
@@ -12,11 +12,11 @@ SQL transform use memory SQL engine, we can via SQL functions and ability of SQL
| name | type | required | default value |
|-------------------|--------|----------|---------------|
-| source_table_name | string | yes | - |
-| result_table_name | string | yes | - |
+| plugin_input | string | yes | - |
+| plugin_output | string | yes | - |
| query | string | yes | - |
-### source_table_name [string]
+### plugin_input [string]
The source table name, the query SQL table name must match this field.
@@ -43,8 +43,8 @@ We use SQL query to transform the source data like this:
```
transform {
Sql {
- source_table_name = "fake"
- result_table_name = "fake1"
+ plugin_input = "fake"
+ plugin_output = "fake1"
query = "select id, concat(name, '_') as name, age+1 as age from fake where id>0"
}
}
@@ -66,7 +66,7 @@ if your upstream data schema is like this:
```hacon
source {
FakeSource {
- result_table_name = "fake"
+ plugin_output = "fake"
row.num = 100
string.template = ["innerQuery"]
schema = {
@@ -123,7 +123,7 @@ env {
source {
FakeSource {
- result_table_name = "fake"
+ plugin_output = "fake"
row.num = 100
schema = {
fields {
@@ -137,15 +137,15 @@ source {
transform {
Sql {
- source_table_name = "fake"
- result_table_name = "fake1"
+ plugin_input = "fake"
+ plugin_output = "fake1"
query = "select id, concat(name, '_') as name, age+1 as age from fake where id>0"
}
}
sink {
Console {
- source_table_name = "fake1"
+ plugin_input = "fake1"
}
}
```
diff --git a/docs/en/transform-v2/transform-multi-table.md b/docs/en/transform-v2/transform-multi-table.md
new file mode 100644
index 00000000000..e642ec9cd2d
--- /dev/null
+++ b/docs/en/transform-v2/transform-multi-table.md
@@ -0,0 +1,128 @@
+---
+sidebar_position: 2
+---
+
+# Multi-Table Transform in SeaTunnel
+
+SeaTunnel’s transform feature supports multi-table transformations, which is especially useful when the upstream plugin outputs multiple tables. This allows you to complete all necessary transformation operations within a single transform configuration. Currently, many connectors in SeaTunnel support multi-table outputs, such as `JDBCSource` and `MySQL-CDC`. All transforms can be configured for multi-table transform as described below.
+
+:::tip
+
+Multi-table Transform has no limitations on Transform capabilities; any Transform configuration can be used in a multi-table Transform. The purpose of multi-table Transform is to handle multiple tables in the data stream individually and merge the Transform configurations of multiple tables into one Transform for easier management.
+
+:::
+
+## Properties
+
+| Name | Type | Required | Default | Description |
+|----------------------------|--------|----------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| table_match_regex          | String | No       | .*      | A regular expression to match the tables that require transformation. By default, it matches all tables. Note that this table name refers to the actual upstream table name, not `plugin_output`.                                                              |
+| table_transform | List | No | - | You can use a list in `table_transform` to specify rules for individual tables. If a transformation rule is configured for a specific table in `table_transform`, the outer rules will not apply to that table. The rules in `table_transform` take precedence. |
+| table_transform.table_path | String | No | - | When configuring a transformation rule for a table in `table_transform`, you need to specify the table path using the `table_path` field. The table path should include `databaseName[.schemaName].tableName`. |
+
+## Matching Logic
+
+Suppose we read five tables from upstream: `test.abc`, `test.abcd`, `test.xyz`, `test.xyzxyz`, and `test.www`. They share the same structure, each having three fields: `id`, `name`, and `age`.
+
+| id | name | age |
+
+Now, let's say we want to copy the data from these five tables using the Copy transform with the following specific requirements:
+- For tables `test.abc` and `test.abcd`, we need to copy the `name` field to a new field `name1`.
+- For `test.xyz`, we want to copy the `name` field to `name2`.
+- For `test.xyzxyz`, we want to copy the `name` field to `name3`.
+- For `test.www`, no changes are needed.
+
+We can configure this as follows:
+
+```hocon
+transform {
+ Copy {
+    plugin_input = "fake"   // Optional dataset name to read from
+    plugin_output = "fake1" // Optional dataset name for output
+
+ table_match_regex = "test.a.*" // 1. Matches tables needing transformation, here matching `test.abc` and `test.abcd`
+ src_field = "name" // Source field
+ dest_field = "name1" // Destination field
+
+ table_transform = [{
+ table_path = "test.xyz" // 2. Specifies the table name for transformation
+ src_field = "name" // Source field
+ dest_field = "name2" // Destination field
+ }, {
+ table_path = "test.xyzxyz"
+ src_field = "name"
+ dest_field = "name3"
+ }]
+ }
+}
+```
+
+### Explanation
+
+1. With the regular expression and corresponding Copy transform options, we match tables `test.abc` and `test.abcd` and copy the `name` field to `name1`.
+2. Using the `table_transform` configuration, we specify that for table `test.xyz`, the `name` field should be copied to `name2`.
+
+This allows us to handle transformations for multiple tables within a single transform configuration.
+
+For each table, the priority of configuration is: `table_transform` > `table_match_regex`. If no rules match a table, no transformation will be applied.
+
+Below are the transform configurations for each table:
+
+- **test.abc** and **test.abcd**
+
+```hocon
+transform {
+ Copy {
+ src_field = "name"
+ dest_field = "name1"
+ }
+}
+```
+
+Output structure:
+
+| id | name | age | name1 |
+
+- **test.xyz**
+
+```hocon
+transform {
+ Copy {
+ src_field = "name"
+ dest_field = "name2"
+ }
+}
+```
+
+Output structure:
+
+| id | name | age | name2 |
+
+- **test.xyzxyz**
+
+```hocon
+transform {
+ Copy {
+ src_field = "name"
+ dest_field = "name3"
+ }
+}
+```
+
+Output structure:
+
+| id | name | age | name3 |
+
+- **test.www**
+
+```hocon
+transform {
+ // No transformation needed
+}
+```
+
+Output structure:
+
+| id | name | age |
+
+In this example, we used the Copy transform, but all transforms in SeaTunnel support multi-table transformations, and you can configure them similarly within the corresponding transform block.
\ No newline at end of file
diff --git a/docs/images/icons/Apache Iceberg.svg b/docs/images/icons/Apache Iceberg.svg
new file mode 100644
index 00000000000..d04e866a0f6
--- /dev/null
+++ b/docs/images/icons/Apache Iceberg.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/Doris.svg b/docs/images/icons/Doris.svg
new file mode 100644
index 00000000000..2729c9a6985
--- /dev/null
+++ b/docs/images/icons/Doris.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/FtpFile.svg b/docs/images/icons/FtpFile.svg
new file mode 100644
index 00000000000..4cf14476e97
--- /dev/null
+++ b/docs/images/icons/FtpFile.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/Greenplum.svg b/docs/images/icons/Greenplum.svg
new file mode 100644
index 00000000000..ead7dc6bfeb
--- /dev/null
+++ b/docs/images/icons/Greenplum.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git "a/docs/images/icons/Hdfs\346\226\207\344\273\266.svg" "b/docs/images/icons/Hdfs\346\226\207\344\273\266.svg"
new file mode 100644
index 00000000000..7bc4a938f74
--- /dev/null
+++ "b/docs/images/icons/Hdfs\346\226\207\344\273\266.svg"
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/Hive.svg b/docs/images/icons/Hive.svg
new file mode 100644
index 00000000000..70859e23b97
--- /dev/null
+++ b/docs/images/icons/Hive.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/HiveJdbc.svg b/docs/images/icons/HiveJdbc.svg
new file mode 100644
index 00000000000..70859e23b97
--- /dev/null
+++ b/docs/images/icons/HiveJdbc.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/Http.svg b/docs/images/icons/Http.svg
new file mode 100644
index 00000000000..e9fcaf50aca
--- /dev/null
+++ b/docs/images/icons/Http.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/InfluxDB.svg b/docs/images/icons/InfluxDB.svg
new file mode 100644
index 00000000000..a0bd1c639b6
--- /dev/null
+++ b/docs/images/icons/InfluxDB.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/IoTDB.svg b/docs/images/icons/IoTDB.svg
new file mode 100644
index 00000000000..1aad0988b75
--- /dev/null
+++ b/docs/images/icons/IoTDB.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/JDBC.svg b/docs/images/icons/JDBC.svg
new file mode 100644
index 00000000000..00365006920
--- /dev/null
+++ b/docs/images/icons/JDBC.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/Jira.svg b/docs/images/icons/Jira.svg
new file mode 100644
index 00000000000..e49c6d768f9
--- /dev/null
+++ b/docs/images/icons/Jira.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/Kafka.png b/docs/images/icons/Kafka.png
deleted file mode 100644
index a4b5359b866..00000000000
Binary files a/docs/images/icons/Kafka.png and /dev/null differ
diff --git a/docs/images/icons/Kafka.svg b/docs/images/icons/Kafka.svg
new file mode 100644
index 00000000000..094d598c4c2
--- /dev/null
+++ b/docs/images/icons/Kafka.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/Kingbase.svg b/docs/images/icons/Kingbase.svg
new file mode 100644
index 00000000000..65a72ff2122
--- /dev/null
+++ b/docs/images/icons/Kingbase.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/Klaviyo.svg b/docs/images/icons/Klaviyo.svg
new file mode 100644
index 00000000000..77f75c139fa
--- /dev/null
+++ b/docs/images/icons/Klaviyo.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/LocalFile.svg b/docs/images/icons/LocalFile.svg
new file mode 100644
index 00000000000..414c3dde3b9
--- /dev/null
+++ b/docs/images/icons/LocalFile.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/Maxcompute.svg b/docs/images/icons/Maxcompute.svg
new file mode 100644
index 00000000000..dca95d03c36
--- /dev/null
+++ b/docs/images/icons/Maxcompute.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/Milvus.svg b/docs/images/icons/Milvus.svg
new file mode 100644
index 00000000000..a057c16e418
--- /dev/null
+++ b/docs/images/icons/Milvus.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/MySQL CDC.svg b/docs/images/icons/MySQL CDC.svg
new file mode 100644
index 00000000000..92cca4e38d0
--- /dev/null
+++ b/docs/images/icons/MySQL CDC.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/Notion.svg b/docs/images/icons/Notion.svg
new file mode 100644
index 00000000000..3c6e3b0f72f
--- /dev/null
+++ b/docs/images/icons/Notion.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/ObsFile.png b/docs/images/icons/ObsFile.png
new file mode 100644
index 00000000000..be943c607ac
Binary files /dev/null and b/docs/images/icons/ObsFile.png differ
diff --git a/docs/images/icons/OceanBase.svg b/docs/images/icons/OceanBase.svg
new file mode 100644
index 00000000000..e4589987ea6
--- /dev/null
+++ b/docs/images/icons/OceanBase.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/OneSignal.svg b/docs/images/icons/OneSignal.svg
new file mode 100644
index 00000000000..8f0c26700da
--- /dev/null
+++ b/docs/images/icons/OneSignal.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/OpenMldb.png b/docs/images/icons/OpenMldb.png
new file mode 100644
index 00000000000..b66e8dedef4
Binary files /dev/null and b/docs/images/icons/OpenMldb.png differ
diff --git a/docs/images/icons/Oracle CDC.svg b/docs/images/icons/Oracle CDC.svg
new file mode 100644
index 00000000000..9f739d77862
--- /dev/null
+++ b/docs/images/icons/Oracle CDC.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/Oracle.svg b/docs/images/icons/Oracle.svg
new file mode 100644
index 00000000000..c4865624c3e
--- /dev/null
+++ b/docs/images/icons/Oracle.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/Paimon.svg b/docs/images/icons/Paimon.svg
new file mode 100644
index 00000000000..9dac157fdb6
--- /dev/null
+++ b/docs/images/icons/Paimon.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/Persistiq.svg b/docs/images/icons/Persistiq.svg
new file mode 100644
index 00000000000..2ab14f08a78
--- /dev/null
+++ b/docs/images/icons/Persistiq.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/Phoenix.svg b/docs/images/icons/Phoenix.svg
new file mode 100644
index 00000000000..6fa6e48a403
--- /dev/null
+++ b/docs/images/icons/Phoenix.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/PostgreSQL CDC.svg b/docs/images/icons/PostgreSQL CDC.svg
new file mode 100644
index 00000000000..38547f16078
--- /dev/null
+++ b/docs/images/icons/PostgreSQL CDC.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/PostgreSQL.svg b/docs/images/icons/PostgreSQL.svg
new file mode 100644
index 00000000000..38547f16078
--- /dev/null
+++ b/docs/images/icons/PostgreSQL.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/Pulsar.svg b/docs/images/icons/Pulsar.svg
new file mode 100644
index 00000000000..cabedf1e022
--- /dev/null
+++ b/docs/images/icons/Pulsar.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/Qdrant.svg b/docs/images/icons/Qdrant.svg
new file mode 100644
index 00000000000..b431d111a6a
--- /dev/null
+++ b/docs/images/icons/Qdrant.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/Rabbitmq.svg b/docs/images/icons/Rabbitmq.svg
new file mode 100644
index 00000000000..a4ecbc6cfbf
--- /dev/null
+++ b/docs/images/icons/Rabbitmq.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/Redis.svg b/docs/images/icons/Redis.svg
new file mode 100644
index 00000000000..4cbd41cada9
--- /dev/null
+++ b/docs/images/icons/Redis.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/RocketMQ.svg b/docs/images/icons/RocketMQ.svg
new file mode 100644
index 00000000000..3fd2c1adba9
--- /dev/null
+++ b/docs/images/icons/RocketMQ.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/S3File.svg b/docs/images/icons/S3File.svg
new file mode 100644
index 00000000000..ddd50aeff00
--- /dev/null
+++ b/docs/images/icons/S3File.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/SQL Server.svg b/docs/images/icons/SQL Server.svg
new file mode 100644
index 00000000000..db4b76ca740
--- /dev/null
+++ b/docs/images/icons/SQL Server.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/Sftp.svg b/docs/images/icons/Sftp.svg
new file mode 100644
index 00000000000..2a8015eb504
--- /dev/null
+++ b/docs/images/icons/Sftp.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/Snowflake.svg b/docs/images/icons/Snowflake.svg
new file mode 100644
index 00000000000..fb4c2868fba
--- /dev/null
+++ b/docs/images/icons/Snowflake.svg
@@ -0,0 +1,3 @@
+
diff --git a/docs/images/icons/StarRocks.svg b/docs/images/icons/StarRocks.svg
new file mode 100644
index 00000000000..10a52bbf355
--- /dev/null
+++ b/docs/images/icons/StarRocks.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/TDengine.svg b/docs/images/icons/TDengine.svg
new file mode 100644
index 00000000000..588347b3727
--- /dev/null
+++ b/docs/images/icons/TDengine.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/Tablestore.svg b/docs/images/icons/Tablestore.svg
new file mode 100644
index 00000000000..24526c988b9
--- /dev/null
+++ b/docs/images/icons/Tablestore.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/images/icons/Typesense.png b/docs/images/icons/Typesense.png
new file mode 100644
index 00000000000..f25cc7e9e71
Binary files /dev/null and b/docs/images/icons/Typesense.png differ
diff --git a/docs/images/icons/Web3j.png b/docs/images/icons/Web3j.png
new file mode 100644
index 00000000000..ec031cb3280
Binary files /dev/null and b/docs/images/icons/Web3j.png differ
diff --git a/docs/images/ui/detail.png b/docs/images/ui/detail.png
new file mode 100644
index 00000000000..a376b6e4880
Binary files /dev/null and b/docs/images/ui/detail.png differ
diff --git a/docs/images/ui/finished.png b/docs/images/ui/finished.png
new file mode 100644
index 00000000000..fa800bd6029
Binary files /dev/null and b/docs/images/ui/finished.png differ
diff --git a/docs/images/ui/master.png b/docs/images/ui/master.png
new file mode 100644
index 00000000000..5e42d2854ee
Binary files /dev/null and b/docs/images/ui/master.png differ
diff --git a/docs/images/ui/overview.png b/docs/images/ui/overview.png
new file mode 100644
index 00000000000..67123532499
Binary files /dev/null and b/docs/images/ui/overview.png differ
diff --git a/docs/images/ui/running.png b/docs/images/ui/running.png
new file mode 100644
index 00000000000..889edb303b1
Binary files /dev/null and b/docs/images/ui/running.png differ
diff --git a/docs/images/ui/workers.png b/docs/images/ui/workers.png
new file mode 100644
index 00000000000..a2bf39ec218
Binary files /dev/null and b/docs/images/ui/workers.png differ
diff --git a/docs/sidebars.js b/docs/sidebars.js
index e3bb42f9e3f..3257181b11a 100644
--- a/docs/sidebars.js
+++ b/docs/sidebars.js
@@ -202,10 +202,12 @@ const sidebars = {
"seatunnel-engine/engine-jar-storage-mode",
"seatunnel-engine/tcp",
"seatunnel-engine/resource-isolation",
- "seatunnel-engine/rest-api",
+ "seatunnel-engine/rest-api-v1",
+ "seatunnel-engine/rest-api-v2",
"seatunnel-engine/user-command",
"seatunnel-engine/logging",
- "seatunnel-engine/telemetry"
+ "seatunnel-engine/telemetry",
+ "seatunnel-engine/web-ui"
]
},
{
@@ -224,6 +226,7 @@ const sidebars = {
'contribution/new-license',
'contribution/coding-guide',
'contribution/contribute-transform-v2-guide',
+ 'contribution/how-to-create-your-connector'
],
},
"faq"
diff --git a/docs/zh/about.md b/docs/zh/about.md
index c938cc7b62e..244b27af1ae 100644
--- a/docs/zh/about.md
+++ b/docs/zh/about.md
@@ -62,7 +62,7 @@ SeaTunnel 拥有大量用户。 您可以在[用户](https://seatunnel.apache.or