From 523fb5e400b4271f68a2026170a6f715142e1f8f Mon Sep 17 00:00:00 2001 From: ruflin Date: Tue, 23 Aug 2016 17:08:33 +0200 Subject: [PATCH] Strip bom from beginning of a message Reading a file with a bom included the bom with the first event. This change removes the bom part from the first event in case it exists. * Tests for utf-8 and utf-16 added Closes #1349 --- CHANGELOG.asciidoc | 3 +- filebeat/harvester/log.go | 7 ++ filebeat/harvester/reader/strip_newline.go | 1 + filebeat/tests/files/logs/bom8.log | 7 ++ filebeat/tests/system/test_harvester.py | 77 ++++++++++++++++++++++ 5 files changed, 94 insertions(+), 1 deletion(-) create mode 100644 filebeat/tests/files/logs/bom8.log diff --git a/CHANGELOG.asciidoc b/CHANGELOG.asciidoc index aca007871bf..1c6446d5660 100644 --- a/CHANGELOG.asciidoc +++ b/CHANGELOG.asciidoc @@ -95,7 +95,8 @@ https://github.com/elastic/beats/compare/v5.0.0-alpha5...master[Check the HEAD d *Topbeat* *Filebeat* -- Introduce close_timeout harvester options {issue}1600[1600] +- Introduce close_timeout harvester options {issue}1926[1926] +- Strip BOM from first message in case of BOM files {issue}2351[2351] - Add harvester_limit option {pull}2417[2417] diff --git a/filebeat/harvester/log.go b/filebeat/harvester/log.go index 18a8bb98471..c9d6b7abe70 100644 --- a/filebeat/harvester/log.go +++ b/filebeat/harvester/log.go @@ -1,6 +1,7 @@ package harvester import ( + "bytes" "errors" "expvar" "io" @@ -87,6 +88,12 @@ func (h *Harvester) Harvest() { return } + // Strip UTF-8 BOM if beginning of file + // As all BOMS are converted to UTF-8 it is enough to only remove this one + if h.state.Offset == 0 { + message.Content = bytes.Trim(message.Content, "\xef\xbb\xbf") + } + // Update offset h.state.Offset += int64(message.Bytes) diff --git a/filebeat/harvester/reader/strip_newline.go b/filebeat/harvester/reader/strip_newline.go index 7c6ebf5ab89..80286fa9468 100644 --- a/filebeat/harvester/reader/strip_newline.go +++ 
b/filebeat/harvester/reader/strip_newline.go @@ -20,6 +20,7 @@ func (p *StripNewline) Next() (Message, error) { L := message.Content message.Content = L[:len(L)-lineEndingChars(L)] + return message, err } diff --git a/filebeat/tests/files/logs/bom8.log b/filebeat/tests/files/logs/bom8.log new file mode 100644 index 00000000000..6c14a1d0729 --- /dev/null +++ b/filebeat/tests/files/logs/bom8.log @@ -0,0 +1,7 @@ +#Software: Microsoft Exchange Server +#Version: 14.0.0.0 +#Log-type: Message Tracking Log +#Date: 2016-04-05T00:00:02.052Z +#Fields: date-time,client-ip,client-hostname,server-ip,server-hostname,source-context,connector-id,source,event-id,internal-message-id,message-id,recipient-address,recipient-status,total-bytes,recipient-count,related-recipient-address,reference,message-subject,sender-address,return-path,message-info,directionality,tenant-id,original-client-ip,original-server-ip,custom-data +2016-04-05T00:00:02.052Z,,,,,"MDB:61914740-3f1b-4ddb-94e0-557196870cfa, Mailbox:279f077c-216f-4323-a9ee-48e50ffd3cad, Event:269492708, MessageClass:IPM.Note.StorageQuotaWarning.Warning, CreationTime:2016-04-05T00:00:01.022Z, ClientType:System",,STOREDRIVER,NOTIFYMAPI,,,,,,,,,,,,,,,,,S:ItemEntryId=00-00-00-00-37-DB-F9-F9-B5-F2-42-4F-86-62-E6-5D-FC-0C-A1-41-07-00-0E-D6-03-16-80-DC-8C-44-9D-30-07-23-ED-71-B7-F7-00-00-1F-D4-B5-0E-00-00-2E-EF-F2-59-0E-E8-2D-46-BC-31-02-85-0D-67-98-43-00-00-37-4A-A3-B3-00-00 +2016-04-05T00:00:02.145Z,,,,,"MDB:61914740-3f1b-4ddb-94e0-557196870cfa, Mailbox:49cb09c6-5b76-415d-a085-da0ad9079682, Event:269492711, MessageClass:IPM.Note.StorageQuotaWarning.Warning, CreationTime:2016-04-05T00:00:01.038Z, ClientType:System",,STOREDRIVER,NOTIFYMAPI,,,,,,,,,,,,,,,,,S:ItemEntryId=00-00-00-00-97-8F-07-43-51-44-61-4A-AD-BD-29-D4-97-4E-20-A0-07-00-0E-D6-03-16-80-DC-8C-44-9D-30-07-23-ED-71-B7-F7-00-8E-8F-BD-EB-57-00-00-3D-FB-CE-26-A4-8D-46-4C-A4-35-0F-A7-9B-FA-D7-B9-00-00-37-44-2F-CA-00-00 diff --git a/filebeat/tests/system/test_harvester.py 
b/filebeat/tests/system/test_harvester.py index b4439c039b7..e0b5a6991bb 100644 --- a/filebeat/tests/system/test_harvester.py +++ b/filebeat/tests/system/test_harvester.py @@ -1,5 +1,6 @@ from filebeat import BaseTest import os +import codecs import time """ @@ -407,3 +408,79 @@ def test_close_timeout(self): assert self.output_lines() < 1000 assert self.output_lines() > 0 + + def test_bom_utf8(self): + """ + Test utf8 log file with bom + Additional test with a static file in case the bom generation in python is not correct + """ + self.render_config_template( + path=os.path.abspath(self.working_dir) + "/log/*", + ) + + os.mkdir(self.working_dir + "/log/") + self.copy_files(["logs/bom8.log"], + source_dir="../files", + target_dir="log") + + filebeat = self.start_beat() + self.wait_until( + lambda: self.output_has(lines=7), + max_timeout=10) + + # Check that output does not contain bom + output = self.read_output_json() + assert output[0]["message"] == "#Software: Microsoft Exchange Server" + + filebeat.check_kill_and_wait() + + def test_boms(self): + + """ + Test bom log files if bom is removed properly + """ + + os.mkdir(self.working_dir + "/log/") + os.mkdir(self.working_dir + "/output/") + + message = "Hello World" + + # Config array contains: + # filebeat encoding, python encoding name, bom + configs = [ + ("utf-8", "utf-8", codecs.BOM_UTF8), + ("utf-16be-bom", "utf-16-be", codecs.BOM_UTF16_BE), + ("utf-16le-bom", "utf-16-le", codecs.BOM_UTF16_LE), + ] + + for config in configs: + + # Render config with specific encoding + self.render_config_template( + path=os.path.abspath(self.working_dir) + "/log/*", + encoding=config[0], + output_file_filename=config[0], + ) + + logfile = self.working_dir + "/log/" + config[0] + "test.log" + + # Write bom to file + with codecs.open(logfile, 'wb') as file: + file.write(config[2]) + + # Write hello world to file + with codecs.open(logfile, 'a', config[1]) as file: + content = message + '\n' + file.write(content) + + filebeat = 
self.start_beat() + + self.wait_until( + lambda: self.output_has(lines=1, output_file="output/" + config[0]), + max_timeout=10) + + # Verify that output does not contain bom + output = self.read_output_json(output_file="output/" + config[0]) + assert output[0]["message"] == message + + filebeat.kill_and_wait()