From d064fce28e5c80a4472ccb69de1cb17795a663f8 Mon Sep 17 00:00:00 2001 From: Phil Budne Date: Sun, 22 Dec 2024 16:01:22 -0500 Subject: [PATCH] qutil improvements fix https://github.com/mediacloud/story-indexer/issues/355 by only dumping x-mc- headers (as noted in comments) Add --really-queue and --ignore-domain options --- indexer/scripts/qutil.py | 76 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 70 insertions(+), 6 deletions(-) diff --git a/indexer/scripts/qutil.py b/indexer/scripts/qutil.py index c9928a18..03165891 100644 --- a/indexer/scripts/qutil.py +++ b/indexer/scripts/qutil.py @@ -14,6 +14,7 @@ from logging import getLogger from types import FrameType from typing import Callable, List, Optional, cast +from urllib.parse import urlparse # PyPI from pika import BasicProperties @@ -39,10 +40,40 @@ def command(func: CommandMethod) -> CommandMethod: return func +def check_domains(url: Optional[str], domains: List[str]) -> bool: + """ + return True, if, and only if url hostname is in one of domains + """ + if not url: + return False # not a reason to eliminate + + try: + u = urlparse(url) + if not u.hostname: + return False + except ValueError: + return False + + for domain in domains: + if u.hostname == domain or u.hostname.endswith("." + domain): + return True + return False + + class QUtil(QApp): def define_options(self, ap: argparse.ArgumentParser) -> None: super().define_options(ap) ap.add_argument("--max", type=int, help="max items to process") + ap.add_argument( + "--ignore-domain", help="domain names to NOT load", action="append" + ) + # XXX add --only-domain (allow multiple) for filtering?? mutually exclusive with --ignore-domain!! + ap.add_argument( + "--really-queue", + help="actually queue stories when loading", + action="store_true", + default=False, + ) ap.add_argument( "command", @@ -96,8 +127,11 @@ def writer(body: bytes, tag: int, properties: BasicProperties) -> None: work_dir=".", ) - # XXX just save headers that start with "x-mc-"? - extras = {"rabbitmq_headers": properties.headers} + # only save headers that start with "x-mc-" + # (broker x-death contains list of dicts which contains a datetime value) + headers = properties.headers or {} + mc_headers = {k: v for k, v in headers.items() if k.startswith("x-mc-")} + extras = {"rabbitmq_headers": mc_headers} aw.write_story(story, extra_metadata=extras, raise_errors=False) stories += 1 @@ -159,6 +193,27 @@ def help(self) -> None: descr = self.get_command_func(cmd).__doc__ print(f"{cmd:16.16} {descr}") + def check_url_domains(self, story: BaseStory) -> bool: + """ + look for forbidden domains in all the places they might be hiding + + """ + assert self.args + + # XXX Maybe also implement "only_domains" check?? + + # discrete statements (not single expression with and) for debug: + if check_domains(story.rss_entry().link, self.args.ignore_domain): + return False + + if check_domains(story.http_metadata().final_url, self.args.ignore_domain): + return False + + if check_domains(story.content_metadata().url, self.args.ignore_domain): + return False + + return True + @command def load_archives(self) -> None: """load archive files into queue""" @@ -179,15 +234,24 @@ def load_archives(self) -> None: logger.error("need input files") return + if not self.args.really_queue: + logger.warning("NOTE! give --queue option to actually queue stories!") + for fname in input_files: logger.info("reading archive %s", fname) with open(fname, "rb") as f: reader = StoryArchiveReader(f) - count = 0 + read = queued = 0 for story in reader.read_stories(): - sender.send_story(story, exchange, routing_key) - count += 1 - logger.info("read %d stories", count) + read += 1 + if self.args.really_queue: + if self.args.ignore_domain and not self.check_url_domains( + story + ): + continue + sender.send_story(story, exchange, routing_key) + queued += 1 + logger.info("read %d stories, queued %d", read, queued) @command def purge(self) -> None: