From 0c2006360f76d783b59b013c580ed2995e27131d Mon Sep 17 00:00:00 2001 From: Vivek Agrawal Date: Tue, 20 Dec 2022 19:08:47 +0530 Subject: [PATCH 1/2] =?UTF-8?q?Optimized=20tsvectors=20insertion=20?= =?UTF-8?q?=F0=9F=9A=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../commands/set_contentnode_tsvectors.py | 52 +++++++++++-------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/contentcuration/search/management/commands/set_contentnode_tsvectors.py b/contentcuration/search/management/commands/set_contentnode_tsvectors.py index 4e5673d9ec..4a901a055c 100644 --- a/contentcuration/search/management/commands/set_contentnode_tsvectors.py +++ b/contentcuration/search/management/commands/set_contentnode_tsvectors.py @@ -8,15 +8,17 @@ from django.core.management.base import BaseCommand from django.db.models import Exists from django.db.models import OuterRef +from django.db.models import Value from search.constants import CONTENTNODE_AUTHOR_TSVECTOR from search.constants import CONTENTNODE_KEYWORDS_TSVECTOR from search.models import ContentNodeFullTextSearch +from contentcuration.models import Channel from contentcuration.models import ContentNode logmodule.basicConfig(level=logmodule.INFO) -logging = logmodule.getLogger("command") +logging = logmodule.getLogger(__name__) CHUNKSIZE = 10000 @@ -26,34 +28,40 @@ class Command(BaseCommand): def handle(self, *args, **options): start = time.time() - tsvector_not_already_inserted_query = ~Exists(ContentNodeFullTextSearch.objects.filter(contentnode_id=OuterRef("id"))) + all_published_channels = list(Channel.objects.filter(main_tree__published=True, deleted=False).values("id", "main_tree__tree_id")) - tsvector_node_query = (ContentNode._annotate_channel_id(ContentNode.objects) - .annotate(contentnode_tags=StringAgg("tags__tag_name", delimiter=" "), - keywords_tsvector=CONTENTNODE_KEYWORDS_TSVECTOR, - author_tsvector=CONTENTNODE_AUTHOR_TSVECTOR) - .filter(tsvector_not_already_inserted_query, published=True, channel_id__isnull=False) - .values("id", "channel_id", "keywords_tsvector", "author_tsvector").order_by()) - - insertable_nodes_tsvector = list(tsvector_node_query[:CHUNKSIZE]) total_tsvectors_inserted = 0 - while insertable_nodes_tsvector: - logging.info("Inserting contentnode tsvectors.") + for published_channel in all_published_channels: + tsvector_not_already_inserted_query = ~Exists(ContentNodeFullTextSearch.objects.filter(contentnode_id=OuterRef("id"))) + tsvector_nodes_query = (ContentNode.objects + .annotate(channel_id=Value(published_channel["id"]), + contentnode_tags=StringAgg("tags__tag_name", delimiter=" "), + keywords_tsvector=CONTENTNODE_KEYWORDS_TSVECTOR, + author_tsvector=CONTENTNODE_AUTHOR_TSVECTOR) + .filter(tsvector_not_already_inserted_query, tree_id=published_channel["main_tree__tree_id"]) + .values("id", "channel_id", "keywords_tsvector", "author_tsvector") + .order_by()) + + insertable_nodes_tsvector = list(tsvector_nodes_query[:CHUNKSIZE]) + logging.info("Inserting contentnode tsvectors of channel {}.".format(published_channel["id"])) + + while insertable_nodes_tsvector: + insert_objs = list() + for node in insertable_nodes_tsvector: + obj = ContentNodeFullTextSearch(contentnode_id=node["id"], channel_id=node["channel_id"], + keywords_tsvector=node["keywords_tsvector"], author_tsvector=node["author_tsvector"]) + insert_objs.append(obj) - insert_objs = list() - for node in insertable_nodes_tsvector: - obj = ContentNodeFullTextSearch(contentnode_id=node["id"], channel_id=node["channel_id"], - keywords_tsvector=node["keywords_tsvector"], author_tsvector=node["author_tsvector"]) - insert_objs.append(obj) + inserted_objs_list = ContentNodeFullTextSearch.objects.bulk_create(insert_objs) - inserted_objs_list = ContentNodeFullTextSearch.objects.bulk_create(insert_objs) + current_inserts_count = len(inserted_objs_list) + total_tsvectors_inserted = total_tsvectors_inserted + current_inserts_count - current_inserts_count = len(inserted_objs_list) - total_tsvectors_inserted = total_tsvectors_inserted + current_inserts_count + logging.info("Inserted {} contentnode tsvectors of channel {}.".format(current_inserts_count, published_channel["id"])) - logging.info("Inserted {} contentnode tsvectors.".format(current_inserts_count)) + insertable_nodes_tsvector = list(tsvector_nodes_query[:CHUNKSIZE]) - insertable_nodes_tsvector = list(tsvector_node_query[:CHUNKSIZE]) + logging.info("Insertion complete for channel {}.".format(published_channel["id"])) logging.info("Completed! Successfully inserted total of {} contentnode tsvectors in {} seconds.".format(total_tsvectors_inserted, time.time() - start)) From ec99d4765c7805d6a6af8493595b39333a0f11e3 Mon Sep 17 00:00:00 2001 From: Vivek Agrawal Date: Tue, 27 Dec 2022 21:03:17 +0530 Subject: [PATCH 2/2] Don't create tsvectors for incomplete and unpublished nodes --- .../search/management/commands/set_contentnode_tsvectors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contentcuration/search/management/commands/set_contentnode_tsvectors.py b/contentcuration/search/management/commands/set_contentnode_tsvectors.py index 4a901a055c..fcf0eb55a2 100644 --- a/contentcuration/search/management/commands/set_contentnode_tsvectors.py +++ b/contentcuration/search/management/commands/set_contentnode_tsvectors.py @@ -39,7 +39,7 @@ def handle(self, *args, **options): contentnode_tags=StringAgg("tags__tag_name", delimiter=" "), keywords_tsvector=CONTENTNODE_KEYWORDS_TSVECTOR, author_tsvector=CONTENTNODE_AUTHOR_TSVECTOR) - .filter(tsvector_not_already_inserted_query, tree_id=published_channel["main_tree__tree_id"]) + .filter(tsvector_not_already_inserted_query, tree_id=published_channel["main_tree__tree_id"], published=True, complete=True) .values("id", "channel_id", "keywords_tsvector", "author_tsvector") .order_by())