Skip to content

Commit

Permalink
[ML] Process delimited files like semi-structured text (#56038)
Browse files Browse the repository at this point in the history
Changes the file upload functionality to process delimited
files by splitting them into to messages, then sending
these to the ingest pipeline as a single field for further
processing in Elasticsearch.

The csv_importer has been removed and the old sst_importer
replaced with a similar message_importer that has been
enhanced to cover the edge cases required by delimited
file processing.

Previously the file upload functionality parsed CSV in the
browser, but by parsing CSV in the ingest pipeline it
makes the Kibana file upload functionality more easily
interchangable with Filebeat such that the configurations
it creates can more easily be used to import data with the
same structure repeatedly in production.

Companion to elastic/elasticsearch#51492
  • Loading branch information
droberts195 authored Jan 28, 2020
1 parent 05ed2d6 commit 9fcbeb3
Show file tree
Hide file tree
Showing 6 changed files with 102 additions and 178 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ export class ImportView extends Component {
if (success) {
const importer = importerFactory(format, results, indexCreationSettings);
if (importer !== undefined) {
const readResp = await importer.read(fileContents, this.setReadProgress);
const readResp = importer.read(fileContents, this.setReadProgress);
success = readResp.success;
this.setState({
readStatus: success ? IMPORT_STATUS.COMPLETE : IMPORT_STATUS.FAILED,
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,18 @@
* you may not use this file except in compliance with the Elastic License.
*/

import { CsvImporter } from './csv_importer';
import { SstImporter } from './sst_importer';
import { MessageImporter } from './message_importer';
import { NdjsonImporter } from './ndjson_importer';

export function importerFactory(format, results, settings) {
switch (format) {
// delimited and semi-structured text are both handled by splitting the
// file into messages, then sending these to ES for further processing
// in an ingest pipeline in documents containing a single "message"
// field (like Filebeat does)
case 'delimited':
return new CsvImporter(results, settings);
case 'semi_structured_text':
return new SstImporter(results, settings);
return new MessageImporter(results, settings);
case 'ndjson':
return new NdjsonImporter(results, settings);
default:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/

import { Importer } from './importer';

export class MessageImporter extends Importer {
constructor(results, settings) {
super(settings);

this.excludeLinesRegex =
results.exclude_lines_pattern === undefined
? null
: new RegExp(results.exclude_lines_pattern);
this.multilineStartRegex =
results.multiline_start_pattern === undefined
? null
: new RegExp(results.multiline_start_pattern);
}

// split the text into an array of lines by looking for newlines.
// any lines that match the exclude_lines_pattern regex are ignored.
// if a newline is found, check the next line to see if it starts with the
// multiline_start_pattern regex
// if it does, it is a legitimate end of line and can be pushed into the list,
// if not, it must be a newline char inside a field value, so keep looking.
read(text) {
try {
const data = [];

let message = '';
let line = '';
for (let i = 0; i < text.length; i++) {
const char = text[i];
if (char === '\n') {
message = this.processLine(data, message, line);
line = '';
} else {
line += char;
}
}

// the last line may have been missing a newline ending
if (line !== '') {
message = this.processLine(data, message, line);
}

// add the last message to the list if not already done
if (message !== '') {
this.addMessage(data, message);
}

// remove first line if it is blank
if (data[0] && data[0].message === '') {
data.shift();
}

this.data = data;
this.docArray = this.data;

return {
success: true,
};
} catch (error) {
console.error(error);
return {
success: false,
error,
};
}
}

processLine(data, message, line) {
if (this.excludeLinesRegex === null || line.match(this.excludeLinesRegex) === null) {
if (this.multilineStartRegex === null || line.match(this.multilineStartRegex) !== null) {
this.addMessage(data, message);
message = '';
} else {
message += '\n';
}
message += line;
}
return message;
}

addMessage(data, message) {
// if the message ended \r\n (Windows line endings)
// then omit the \r as well as the \n for consistency
message = message.replace(/\r$/, '');
data.push({ message });
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ export class NdjsonImporter extends Importer {
super(settings);
}

async read(json) {
read(json) {
try {
const splitJson = json.split(/}\s*\n/);

Expand Down

This file was deleted.

0 comments on commit 9fcbeb3

Please sign in to comment.