Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Strip bom from message #2351

Merged
merged 1 commit into from
Sep 2, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,8 @@ https://github.com/elastic/beats/compare/v5.0.0-alpha5...master[Check the HEAD d
*Topbeat*

*Filebeat*
- Introduce close_timeout harvester options {issue}1600[1600]
- Introduce close_timeout harvester options {issue}1926[1926]
- Strip BOM from first message in case of BOM files {issue}2351[2351]


- Add harvester_limit option {pull}2417[2417]
Expand Down
7 changes: 7 additions & 0 deletions filebeat/harvester/log.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package harvester

import (
"bytes"
"errors"
"expvar"
"io"
Expand Down Expand Up @@ -87,6 +88,12 @@ func (h *Harvester) Harvest() {
return
}

// Strip the UTF-8 BOM if this is the beginning of the file.
// As all BOMs are converted to UTF-8, it is enough to remove only this one.
if h.state.Offset == 0 {
message.Content = bytes.Trim(message.Content, "\xef\xbb\xbf")
}

// Update offset
h.state.Offset += int64(message.Bytes)

Expand Down
1 change: 1 addition & 0 deletions filebeat/harvester/reader/strip_newline.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ func (p *StripNewline) Next() (Message, error) {

L := message.Content
message.Content = L[:len(L)-lineEndingChars(L)]

return message, err
}

Expand Down
7 changes: 7 additions & 0 deletions filebeat/tests/files/logs/bom8.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#Software: Microsoft Exchange Server
#Version: 14.0.0.0
#Log-type: Message Tracking Log
#Date: 2016-04-05T00:00:02.052Z
#Fields: date-time,client-ip,client-hostname,server-ip,server-hostname,source-context,connector-id,source,event-id,internal-message-id,message-id,recipient-address,recipient-status,total-bytes,recipient-count,related-recipient-address,reference,message-subject,sender-address,return-path,message-info,directionality,tenant-id,original-client-ip,original-server-ip,custom-data
2016-04-05T00:00:02.052Z,,,,,"MDB:61914740-3f1b-4ddb-94e0-557196870cfa, Mailbox:279f077c-216f-4323-a9ee-48e50ffd3cad, Event:269492708, MessageClass:IPM.Note.StorageQuotaWarning.Warning, CreationTime:2016-04-05T00:00:01.022Z, ClientType:System",,STOREDRIVER,NOTIFYMAPI,,,,,,,,,,,,,,,,,S:ItemEntryId=00-00-00-00-37-DB-F9-F9-B5-F2-42-4F-86-62-E6-5D-FC-0C-A1-41-07-00-0E-D6-03-16-80-DC-8C-44-9D-30-07-23-ED-71-B7-F7-00-00-1F-D4-B5-0E-00-00-2E-EF-F2-59-0E-E8-2D-46-BC-31-02-85-0D-67-98-43-00-00-37-4A-A3-B3-00-00
2016-04-05T00:00:02.145Z,,,,,"MDB:61914740-3f1b-4ddb-94e0-557196870cfa, Mailbox:49cb09c6-5b76-415d-a085-da0ad9079682, Event:269492711, MessageClass:IPM.Note.StorageQuotaWarning.Warning, CreationTime:2016-04-05T00:00:01.038Z, ClientType:System",,STOREDRIVER,NOTIFYMAPI,,,,,,,,,,,,,,,,,S:ItemEntryId=00-00-00-00-97-8F-07-43-51-44-61-4A-AD-BD-29-D4-97-4E-20-A0-07-00-0E-D6-03-16-80-DC-8C-44-9D-30-07-23-ED-71-B7-F7-00-8E-8F-BD-EB-57-00-00-3D-FB-CE-26-A4-8D-46-4C-A4-35-0F-A7-9B-FA-D7-B9-00-00-37-44-2F-CA-00-00
77 changes: 77 additions & 0 deletions filebeat/tests/system/test_harvester.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from filebeat import BaseTest
import os
import codecs
import time

"""
Expand Down Expand Up @@ -407,3 +408,79 @@ def test_close_timeout(self):
assert self.output_lines() < 1000
assert self.output_lines() > 0


def test_bom_utf8(self):
    """
    Harvest the checked-in UTF-8 log fixture that carries a BOM and verify
    that the first published message does not contain the BOM.

    This uses a static fixture file rather than a file generated in Python,
    so the behavior is still covered in case that generation is not correct.
    """
    log_glob = os.path.abspath(self.working_dir) + "/log/*"
    self.render_config_template(path=log_glob)

    os.mkdir(self.working_dir + "/log/")
    self.copy_files(
        ["logs/bom8.log"],
        source_dir="../files",
        target_dir="log",
    )

    proc = self.start_beat()

    def all_lines_published():
        # The fixture consists of 7 lines; wait until all were published.
        return self.output_has(lines=7)

    self.wait_until(all_lines_published, max_timeout=10)

    # The first message must match the fixture's first line exactly,
    # i.e. without a leading BOM.
    events = self.read_output_json()
    first_message = events[0]["message"]
    assert first_message == "#Software: Microsoft Exchange Server"

    proc.check_kill_and_wait()

def test_boms(self):
    """
    For each encoding under test, write a one-line log file that starts
    with the matching BOM and verify that the published message equals the
    plain text, i.e. the BOM was removed.
    """
    os.mkdir(self.working_dir + "/log/")
    os.mkdir(self.working_dir + "/output/")

    message = "Hello World"
    line = message + '\n'
    log_glob = os.path.abspath(self.working_dir) + "/log/*"

    # Each case is: (filebeat encoding, python encoding name, bom)
    cases = [
        ("utf-8", "utf-8", codecs.BOM_UTF8),
        ("utf-16be-bom", "utf-16-be", codecs.BOM_UTF16_BE),
        ("utf-16le-bom", "utf-16-le", codecs.BOM_UTF16_LE),
    ]

    for beat_encoding, py_encoding, bom in cases:
        output_file = "output/" + beat_encoding

        # Render a config for this encoding, with its own output file
        self.render_config_template(
            path=log_glob,
            encoding=beat_encoding,
            output_file_filename=beat_encoding,
        )

        logfile = self.working_dir + "/log/" + beat_encoding + "test.log"

        # First write the raw BOM bytes ...
        with codecs.open(logfile, 'wb') as raw:
            raw.write(bom)

        # ... then append the message encoded with the python codec
        with codecs.open(logfile, 'a', py_encoding) as encoded:
            encoded.write(line)

        proc = self.start_beat()

        self.wait_until(
            lambda: self.output_has(lines=1, output_file=output_file),
            max_timeout=10)

        # The published message must not contain the BOM
        events = self.read_output_json(output_file=output_file)
        assert events[0]["message"] == message

        proc.kill_and_wait()