From f17152dfd250e501659aa4239a30f4ee1c152d8d Mon Sep 17 00:00:00 2001
From: Laurent Savaete
Date: Wed, 10 Nov 2021 12:33:14 +0000
Subject: [PATCH] Clean control chars from issue body field (#44)

---
 tap_github/streams.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tap_github/streams.py b/tap_github/streams.py
index 62f1c31e..c150ec46 100644
--- a/tap_github/streams.py
+++ b/tap_github/streams.py
@@ -310,6 +310,12 @@ def http_headers(self) -> dict:
 
     def post_process(self, row: dict, context: Optional[dict] = None) -> dict:
         row["type"] = "pull_request" if "pull_request" in row else "issue"
+        if row.get("body") is not None:
+            # Some issue bodies include NUL characters (\x00) that some
+            # targets (such as postgresql) choke on. Strip them but keep the
+            # body a str: str.encode() would return bytes and leave the NUL
+            # bytes in place.
+            row["body"] = row["body"].replace("\x00", "")
         return row
 
     schema = th.PropertiesList(