From 087563ed65573331035b390b73bccbc7076a0dad Mon Sep 17 00:00:00 2001 From: cooper-lzy <78672629+cooper-lzy@users.noreply.github.com> Date: Wed, 1 Sep 2021 10:08:43 +0800 Subject: [PATCH] add importer doc (#658) --- .../nebula-importer/config-with-header.md | 183 +++++++++++ .../nebula-importer/config-without-header.md | 211 ++++++++++++ docs-2.0/nebula-importer/nebula-importer.md | 15 - docs-2.0/nebula-importer/use-importer.md | 301 ++++++++++++++++++ mkdocs.yml | 7 +- 5 files changed, 701 insertions(+), 16 deletions(-) create mode 100644 docs-2.0/nebula-importer/config-with-header.md create mode 100644 docs-2.0/nebula-importer/config-without-header.md delete mode 100644 docs-2.0/nebula-importer/nebula-importer.md create mode 100644 docs-2.0/nebula-importer/use-importer.md diff --git a/docs-2.0/nebula-importer/config-with-header.md b/docs-2.0/nebula-importer/config-with-header.md new file mode 100644 index 00000000000..9c4f86723b7 --- /dev/null +++ b/docs-2.0/nebula-importer/config-with-header.md @@ -0,0 +1,183 @@ +# Configuration with Header + +For a CSV file with header, you need to set `withHeader` to `true` in the configuration file, indicating that the first behavior in the CSV file is the header. The header content has special meanings. + +!!! caution + + If the CSV file contains headers, the Importer will parse the Schema of each row of data according to the headers and ignore the vertex or edge settings in the YAML file. + +## Sample files + +The following is an example of a CSV file with header: + +- sample of vertex + + Example data for `student_with_header.csv`: + + ```csv + :VID(string),student.name:string,student.age:int,student.gender:string + student100,Monica,16,female + student101,Mike,18,male + student102,Jane,17,female + ``` + + The first column is the vertex ID, followed by the properties `name`, `age`, and `gender`. 
 

- sample of edge

  Example data for `follow_with_header.csv`:

  ```csv
  :SRC_VID(string),:DST_VID(string),:RANK,follow.degree:double
  student100,student101,0,92.5
  student101,student100,1,85.6
  student101,student102,2,93.2
  student100,student102,1,96.2
  ```

  The first two columns are the start vertex ID and destination vertex ID, respectively. The third column is rank, and the fourth column is property `degree`.

## Header format description

The header defines the start vertex, the destination vertex, the rank, and some special functions by keywords as follows:

- `:VID`(mandatory): Vertex ID. Need to use `:VID(type)` form to set data type, for example `:VID(string)` or `:VID(int)`.

- `:SRC_VID`(mandatory): The start vertex ID of the edge. The data type needs to be set in the form `:SRC_VID(type)`.

- `:DST_VID`(mandatory): The destination vertex ID of the edge. The data type needs to be set in the form `:DST_VID(type)`.

- `:RANK`(optional): The rank value of the edge.

- `:IGNORE`(optional): Ignore this column when inserting data.

- `:LABEL`(optional): Insert (+) or delete (-) the row. Must be column 1. For example:

  ```csv
  :LABEL,
  +,
  -,
  ```

!!! note
    All columns except the `:LABEL` column can be sorted in any order, so for larger CSV files, the user has the flexibility to set the header to select the desired column.

For Tag or Edge type properties, the format is `<tag_name/edge_name>.<prop_name>:<prop_type>`, described as follows:

- `<tag_name/edge_name>`: Tag or Edge type name.

- `<prop_name>`: property name.

- `<prop_type>`: property type. Supports `bool`, `int`, `float`, `double`, `timestamp` and `string`. The default type is `string`.

For example, `student.name:string` and `follow.degree:double`.

## Sample configuration

```yaml
# Connected to the Nebula Graph version, set to v2 when connected to 2.x.
version: v2

description: example

# Whether to delete temporarily generated logs and error data files. 
+removeTempFiles: false + +clientSettings: + + # Retry times of nGQL statement execution failures. + retry: 3 + + # Number of Nebula Graph client concurrency. + concurrency: 10 + + # Cache queue size per Nebula Graph client. + channelBufferSize: 128 + + # Specifies the Nebula Graph space to import the data into. + space: student + + # Connection information. + connection: + user: root + password: nebula + address: 192.168.*.13:9669 + + postStart: + # Configure some of the operations to perform after connecting to the Nebula Graph server, and before inserting data. + commands: | + DROP SPACE IF EXISTS student; + CREATE SPACE IF NOT EXISTS student(partition_num=5, replica_factor=1, vid_type=FIXED_STRING(20)); + USE student; + CREATE TAG student(name string, age int,gender string); + CREATE EDGE follow(degree int); + + # The interval between the execution of the above command and the execution of the insert data command. + afterPeriod: 15s + + preStop: + # Configure some of the actions you performed before disconnecting from the Nebula Graph server. + commands: | + +# Path of the error log file. +logPath: ./err/test.log + +# CSV file Settings. +files: + + # Path for storing data files. If a relative path is used, the path is merged with the current configuration file directory. The first data file in this example is vertex data. + - path: ./student_with_header.csv + + # Insert the failed data file storage path, so that data can be written later. + failDataPath: ./err/studenterr.csv + + # The number of statements inserting data in a batch. + batchSize: 10 + + # Limit on the number of rows of read data. + limit: 10 + + # Whether to insert rows in the file in order. If the value is set to false, the import rate decreases due to data skew. + inOrder: true + + # File type. Currently, only CSV files are supported. + type: csv + + csv: + # Whether there is a header. + withHeader: true + + # Whether there is a LABEL. 
+ withLabel: false + + # Specifies the delimiter for the CSV file. A string delimiter that supports only one character. + delimiter: "," + + schema: + # Schema type. Possible values are vertex and edge. + type: vertex + + # The second data file in this example is edge data. + - path: ./follow_with_header.csv + failDataPath: ./err/followerr.csv + batchSize: 10 + limit: 10 + inOrder: true + type: csv + csv: + withHeader: true + withLabel: false + schema: + # The type of Schema is edge. + type: edge + edge: + # Edge type name. + name: follow + + # Whether to include rank. + withRanking: true +``` + +!!! Note + + The data type of the vertex ID must be the same as the data type of the statement in `clientSettings.postStart.commands` that creates the graph space. \ No newline at end of file diff --git a/docs-2.0/nebula-importer/config-without-header.md b/docs-2.0/nebula-importer/config-without-header.md new file mode 100644 index 00000000000..98b7627ffbe --- /dev/null +++ b/docs-2.0/nebula-importer/config-without-header.md @@ -0,0 +1,211 @@ +# Configuration without Header + +For CSV files without header, you need to set `withHeader` to `false` in the configuration file, indicating that the CSV file contains only data (excluding the header of the first row). You may also need to set the data type and corresponding columns. + +## Sample files + +The following is an example of a CSV file without header: + +- sample of vertex + + Example data for `student_without_header.csv`: + + ```csv + student100,Monica,16,female + student101,Mike,18,male + student102,Jane,17,female + ``` + + The first column is the vertex ID, followed by the properties `name`, `age`, and `gender`. + +- sample of edge + + Example data for `follow_without_header.csv`: + + ```csv + student100,student101,0,92.5 + student101,student100,1,85.6 + student101,student102,2,93.2 + student100,student102,1,96.2 + ``` + + The first two columns are the start vertex ID and destination vertex ID, respectively. 
The third column is rank, and the fourth column is property `degree`. + +## Sample configuration + +```yaml +# Connected to the Nebula Graph version, set to v2 when connected to 2.x. +version: v2 + +description: example + +# Whether to delete temporarily generated logs and error data files. +removeTempFiles: false + +clientSettings: + + # Retry times of nGQL statement execution failures. + retry: 3 + + # Number of Nebula Graph client concurrency. + concurrency: 10 + + # Cache queue size per Nebula Graph client. + channelBufferSize: 128 + + # Specifies the Nebula Graph space to import the data into. + space: student + + # Connection information. + connection: + user: root + password: nebula + address: 192.168.*.13:9669 + + postStart: + # Configure some of the operations to perform after connecting to the Nebula Graph server, and before inserting data. + commands: | + DROP SPACE IF EXISTS student; + CREATE SPACE IF NOT EXISTS student(partition_num=5, replica_factor=1, vid_type=FIXED_STRING(20)); + USE student; + CREATE TAG student(name string, age int,gender string); + CREATE EDGE follow(degree int); + + # The interval between the execution of the above command and the execution of the insert data command. + afterPeriod: 15s + + preStop: + # Configure some of the actions you performed before disconnecting from the Nebula Graph server. + commands: | + +# Path of the error log file. +logPath: ./err/test.log + +# CSV file Settings. +files: + + # Path for storing data files. If a relative path is used, the path is merged with the current configuration file directory. The first data file in this example is vertex data. + - path: ./student_without_header.csv + + # Insert the failed data file storage path, so that data can be written later. + failDataPath: ./err/studenterr.csv + + # The number of statements inserting data in a batch. + batchSize: 10 + + # Limit on the number of rows of read data. + limit: 10 + + # Whether to insert rows in the file in order. 
If the value is set to false, the import rate decreases due to data skew. + inOrder: true + + # File type. Currently, only CSV files are supported. + type: csv + + csv: + # Whether there is a header. + withHeader: false + + # Whether there is a LABEL. + withLabel: false + + # Specifies the delimiter for the CSV file. A string delimiter that supports only one character. + delimiter: "," + + schema: + # Schema type. Possible values are vertex and edge. + type: vertex + + vertex: + + # Vertex ID Settings. + vid: + # The vertex ID corresponds to the column number in the CSV file. Columns in the CSV file are numbered from 0. + index: 0 + + # The data type of the vertex ID. The optional values are int and string, corresponding to INT64 and FIXED_STRING in the Nebula Graph, respectively. + type: string + + # Tag Settings. + # Tag name. + - name: student + + # property Settings in the Tag. + props: + # property name. + - name: name + + # Property data type. + type: string + + # Property corresponds to the sequence number of the column in the CSV file. + index: 1 + + - name: age + type: int + index: 2 + - name: gender + type: string + index: 3 + + # The second data file in this example is edge data. + - path: ./follow_without_header.csv + failDataPath: ./err/followerr.csv + batchSize: 10 + limit: 10 + inOrder: true + type: csv + csv: + withHeader: false + withLabel: false + schema: + # The type of Schema is edge. + type: edge + edge: + # Edge type name. + name: follow + + # Whether to include rank. + withRanking: true + + # Start vertex ID setting. + srcVID: + # Data type. + type: string + + # The start vertex ID corresponds to the sequence number of a column in the CSV file. + index: 0 + + # Destination vertex ID. + dstVID: + type: string + index: 1 + + # rank setting. + rank: + # Rank Indicates the rank number of a column in the CSV file. If index is not set, be sure to set the rank value in the third column. Subsequent columns set each property in turn. 
+ index: 2 + + # Edge Type property Settings. + props: + # property name. + - name: degree + + # Data type. + type: double + + # Property corresponds to the sequence number of the column in the CSV file. + index: 3 +``` + +!!! Note + + - The sequence numbers of the columns in the CSV file start from 0, that is, the sequence numbers of the first column are 0, and the sequence numbers of the second column are 1. + + - The data type of the vertex ID must be the same as the data type of the statement in `clientSettings.postStart.commands` that creates the graph space. + + - If the index field is not specified, the CSV file must comply with the following rules: + + + In the vertex data file, the first column must be the vertex ID, followed by the properties, and must correspond to the order in the configuration file. + + + In the side data file, the first column must be the start vertex ID, the second column must be the destination vertex ID, if `withRanking` is `true`, the third column must be the rank value, and the following columns must be properties, and must correspond to the order in the configuration file. diff --git a/docs-2.0/nebula-importer/nebula-importer.md b/docs-2.0/nebula-importer/nebula-importer.md deleted file mode 100644 index 518d4cc4594..00000000000 --- a/docs-2.0/nebula-importer/nebula-importer.md +++ /dev/null @@ -1,15 +0,0 @@ -# Nebula Importer - -Nebula Importer(hereinafter referred to as Importer)is a tool of [Nebula Graph](https://github.com/vesoft-inc/nebula) for importing CSV (comma-separated values) files. Importer supports reading CSV data source and importing data to the Nebula Graph database. - -For more information, see [Nebula Importer](https://github.com/vesoft-inc/nebula-importer/blob/release-v2.0.0-ga/README.md). - -## Use cases - -Importer is suitable for importing local CSV files into Nebula Graph. - -## Benefits - -* Lightweight and fast. Importer supports use in a simple environment and quickly importing data. 
- -* High flexibility. Importer supports flexible CSV data filtering through its configuration. \ No newline at end of file diff --git a/docs-2.0/nebula-importer/use-importer.md b/docs-2.0/nebula-importer/use-importer.md new file mode 100644 index 00000000000..815b2880486 --- /dev/null +++ b/docs-2.0/nebula-importer/use-importer.md @@ -0,0 +1,301 @@ +# Nebula Importer + +Nebula Importer (Importer) is a standalone import tool for CSV files with [Nebula Graph](https://github.com/vesoft-inc/nebula). Importer can read the local CSV file and then import the data into the Nebula Graph database. + +## Scenario + +Importer is used to import the contents of a local CSV file into the Nebula Graph. + +## Advantage + +- Lightweight and fast: no complex environment can be used, fast data import. + +- Flexible filtering: You can flexibly filter CSV data through configuration files. + +## Prerequisites + +Before using Nebula Importer, make sure: + +- Nebula Graph service has been deployed. There are currently three deployment modes: + + - [Deploy Nebula Graph with Docker Compose](../4.deployment-and-installation/2.compile-and-install-nebula-graph/3.deploy-nebula-graph-with-docker-compose.md) + + - [Install Nebula Graph with RPM or DEB package](../4.deployment-and-installation/2.compile-and-install-nebula-graph/2.install-nebula-graph-by-rpm-or-deb.md) + + - [Install Nebula Graph by compiling the source code](../4.deployment-and-installation/2.compile-and-install-nebula-graph/1.install-nebula-graph-by-compiling-the-source-code.md) + +- Schema is created in Nebula Graph, including space, Tag and Edge type, or set by parameter `clientSettings.postStart.commands`. + +- Golang environment has been deployed on the machine running the Importer. For details, see [Build Go environment](https://github.com/vesoft-inc/nebula-importer/blob/release-v2.0.0-ga/docs/golang-install-en.md). 
+ +## Steps + +Configure the YAML file and prepare the CSV file to be imported to use the tool to batch write data to Nebula Graph. + +### Source code compile and run + +1. Clone repository. + + ```bash + $ git clone -b {{importer.release}} https://github.com/vesoft-inc/nebula-importer.git + ``` + + !!! note + + Use the correct branch. + Nebula Graph 1.x and 2.x have different RPC protocols, so: + + - The Nebula Importer V1 branch can only connect to Nebula Graph 1.x. + - The Nebula Importer Master branch and v2 branch can connect to Nebula Graph 2.x. + +2. Access the directory `nebula-importer`. + + ```bash + $ cd nebula-importer + ``` + +3. Compile the source code. + + ```bash + $ make build + ``` + +4. Start the service. + + ```bash + $ ./nebula-importer --config + ``` + + !!! note + For details about the YAML configuration file, see configuration file description at the end of topic. + +#### No network compilation mode + +If the server cannot be connected to the Internet, it is recommended to upload the source code and various dependency packages to the corresponding server for compilation on the machine that can be connected to the Internet. The operation steps are as follows: + +1. Clone repository. + + ```bash + $ git clone -b {{importer.release}} https://github.com/vesoft-inc/nebula-importer.git + ``` + +2. Use the following command to download and package the dependent source code. + + ```bash + $ cd nebula-importer + $ go mod vendor + $ cd .. && tar -zcvf nebula-importer.tar.gz nebula-importer + ``` + +3. Upload the compressed package to a server that cannot be connected to the Internet. + +4. Unzip and compile. 
 

   ```bash
   $ tar -zxvf nebula-importer.tar.gz 
   $ cd nebula-importer 
   $ go build -mod vendor cmd/importer.go
   ```

### Run in Docker mode

Instead of installing the Go environment locally, you can use Docker to pull the [image](https://hub.docker.com/r/vesoft/nebula-importer) of the Nebula Importer and mount the local configuration file and CSV data file into the container. The command is as follows:

```bash
$ docker run --rm -ti \
    --network=host \
    -v <config_file>:<config_file> \
    -v <csv_data_dir>:<csv_data_dir> \
    vesoft/nebula-importer:<version> \
    --config <config_file>
```

- `<config_file>`: The absolute path to the local YAML configuration file.
- `<csv_data_dir>`: The absolute path to the local CSV data file.
- `<version>`: For Nebula Graph 2.x, fill in `v2`.

!!! note
    A relative path is recommended. If you use a local absolute path, check that the path maps to the path in the Docker container.

## Configuration File Description

Nebula Importer uses configuration files (`nebula-importer/examples/v2/example.yaml`) to describe information about the files to be imported, the Nebula Graph server, and more. You can refer to the example configuration file: [Configuration without Header](config-without-header.md)/[Configuration with Header](config-with-header.md). This section describes the fields in the configuration file by category.

### Basic configuration

The example configuration is as follows:

```yaml
version: v2
description: example
removeTempFiles: false
```

|Parameter|Default value|Required|Description|
|:---|:---|:---|:---|
|`version`|v2|Yes|Target version of Nebula Graph.|
|`description`|example|No|Description of the configuration file.|
|`removeTempFiles`|false|No|Whether to delete temporarily generated logs and error data files.|

### Client configuration

The client configuration stores the configurations associated with Nebula Graph. 
+ +The example configuration is as follows: + +```yaml +clientSettings: + retry: 3 + concurrency: 10 + channelBufferSize: 128 + space: test + connection: + user: user + password: password + address: 192.168.*.13:9669,192.168.*.14:9669 + postStart: + commands: | + UPDATE CONFIGS storage:wal_ttl=3600; + UPDATE CONFIGS storage:rocksdb_column_family_options = { disable_auto_compactions = true }; + afterPeriod: 8s + preStop: + commands: | + UPDATE CONFIGS storage:wal_ttl=86400; + UPDATE CONFIGS storage:rocksdb_column_family_options = { disable_auto_compactions = false }; +``` + +|Parameter|Default value|Required|Description| +|:---|:---|:---|:---| +|`clientSettings.retry`|3|No|Retry times of nGQL statement execution failures.| +|`clientSettings.concurrency`|10|No|Number of Nebula Graph client concurrency.| +|`clientSettings.channelBufferSize`|128|No|Cache queue size per Nebula Graph client.| +|`clientSettings.space`|-|Yes|Specifies the Nebula Graph space to import the data into. Do not import multiple spaces at the same time to avoid performance impact.| +|`clientSettings.connection.user`|-|Yes|Nebula Graph user name.| +|`clientSettings.connection.password`|-|Yes|The password for the Nebula Graph user name.| +|`clientSettings.connection.address`|-|Yes|Addresses and ports for all Graph services.| +|`clientSettings.postStart.commands`|-|No|Configure some of the operations to perform after connecting to the Nebula Graph server, and before inserting data.| +|`clientSettings.postStart.afterPeriod`|-|No|The interval, between executing the above `commands` and executing the insert data command, such as `8s`.| +|`clientSettings.preStop.commands`|-|No|Configure some of the actions you performed before disconnecting from the Nebula Graph server.| + +### File configuration + +File configuration Stores the configuration of data files and logs, and details about the Schema. 
+ +#### File and log configuration + +The example configuration is as follows: + +```yaml +logPath: ./err/test.log +files: + - path: ./student_without_header.csv + failDataPath: ./err/studenterr.csv + batchSize: 128 + limit: 10 + inOrder: false + type: csv + csv: + withHeader: false + withLabel: false + delimiter: "," +``` + +|Parameter|Default value|Required|Description| +|:---|:---|:---|:---| +|`logPath`|-|No|Path for exporting log information, such as errors during import.| +|`files.path`|-|Yes|Path for storing data files. If a relative path is used, the path is merged with the current configuration file directory. You can use an asterisk (\*) for fuzzy matching to import multiple files with similar names, but the files need to be the same structure.| +|`files.failDataPath`|-|Yes|Insert the failed data file storage path, so that data can be written later.| +|`files.batchSize`|128|No|The number of statements inserting data in a batch.| +|`files.limit`|-|No|Limit on the number of rows of read data.| +|`files.inOrder`|-|No|Whether to insert rows in the file in order. If the value is set to `false`, the import rate decreases due to data skew.| +|`files.type`|-|Yes|The file type.| +|`files.csv.withHeader`|`false`|Yes|Whether there is a header.| +|`files.csv.withLabel`|`false`|Yes|Whether there is a label.| +|`files.csv.delimiter`|`","`|Yes|Specifies the delimiter for the CSV file. A string delimiter that supports only one character.| + +#### Schema configuration + +Schema configuration describes the Meta information of the current data file. Schema types are vertex and edge. Multiple vertexes or edges can be configured at the same time. 
+ +- vertex configuration + +The example configuration is as follows: + +```yaml +schema: + type: vertex + vertex: + vid: + type: string + index: 0 + tags: + - name: student + props: + - name: name + type: string + index: 1 + - name: age + type: int + index: 2 + - name: gender + type: string + index: 3 +``` + +|Parameter|Default value|Required|Description| +|:---|:---|:---|:---| +|`files.schema.type`|-|Yes|Schema type. Possible values are `vertex` and `edge`.| +|`files.schema.vertex.vid.type`|-|No|The data type of the vertex ID. Possible values are `int` and `string`.| +|`files.schema.vertex.vid.index`|-|No|The vertex ID corresponds to the column number in the CSV file.| +|`files.schema.vertex.tags.name`|-|Yes|Tag name.| +|`files.schema.vertex.tags.props.name`|-|Yes|Tag property name, which must match the Tag property in the Nebula Graph.| +|`files.schema.vertex.tags.props.type`|-|No|Property data type, supporting `bool`, `int`, `float`, `double`, `timestamp` and `string`.| +|`files.schema.vertex.tags.props.index`|-|No|Property corresponds to the sequence number of the column in the CSV file.| + +!!! note + The sequence numbers of the columns in the CSV file start from 0, that is, the sequence numbers of the first column are 0, and the sequence numbers of the second column are 1. + +- edge configuration + +The example configuration is as follows: + +```yaml +schema: + type: edge + edge: + name: follow + withRanking: true + srcVID: + type: string + index: 0 + dstVID: + type: string + index: 1 + rank: + index: 2 + props: + - name: degree + type: double + index: 3 +``` + +|Parameter|Default value|Required|Description| +|:---|:---|:---|:---| +|`files.schema.type`|-|Yes|Schema type. 
Possible values are `vertex` and `edge`.|
|`files.schema.edge.name`|-|Yes|Edge type name.|
|`files.schema.edge.srcVID.type`|-|No|The data type of the starting vertex ID of the edge.|
|`files.schema.edge.srcVID.index`|-|No|The starting vertex ID of the edge corresponds to the column number in the CSV file.|
|`files.schema.edge.dstVID.type`|-|No|The data type of the destination vertex ID of the edge.|
|`files.schema.edge.dstVID.index`|-|No|The destination vertex ID of the edge corresponds to the column number in the CSV file.|
|`files.schema.edge.rank.index`|-|No|The rank value of the edge corresponds to the column number in the CSV file.|
|`files.schema.edge.props.name`|-|Yes|The Edge Type property name must match the Edge Type property in the Nebula Graph.|
|`files.schema.edge.props.type`|-|No|Property data type, supporting `bool`, `int`, `float`, `double`, `timestamp` and `string`.|
|`files.schema.edge.props.index`|-|No|Property corresponds to the sequence number of the column in the CSV file.|

## About the CSV file header

According to whether the CSV file has a header or not, the Importer needs different settings in the configuration file. For relevant examples and explanations, see:

- [Configuration without Header](config-without-header.md)

- [Configuration with Header](config-with-header.md) diff --git a/mkdocs.yml b/mkdocs.yml index f06429a6700..af5967d6517 100755 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -310,8 +310,13 @@ nav: - Use Dashboard: nebula-dashboard/4.use-dashboard.md - Monitor Parameter: nebula-dashboard/6.monitor-parameter.md - - Nebula Algorithm: nebula-algorithm.md + - Nebula Importer: + - Use Nebula Importer: nebula-importer/use-importer.md + - Configuration with Header: nebula-importer/config-with-header.md + - Configuration without Header: nebula-importer/config-without-header.md + - Nebula Algorithm: nebula-algorithm.md + - Nebula Spark Connector: nebula-spark-connector.md - Nebula Flink Connector: nebula-flink-connector.md